1 /*
2  *  gretl -- Gnu Regression, Econometrics and Time-series Library
3  *  Copyright (C) 2001 Allin Cottrell and Riccardo "Jack" Lucchetti
4  *
5  *  This program is free software: you can redistribute it and/or modify
6  *  it under the terms of the GNU General Public License as published by
7  *  the Free Software Foundation, either version 3 of the License, or
8  *  (at your option) any later version.
9  *
10  *  This program is distributed in the hope that it will be useful,
11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  *  GNU General Public License for more details.
14  *
15  *  You should have received a copy of the GNU General Public License
16  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
17  *
18  */
19 
20 #include "libgretl.h"
21 #include "gretl_string_table.h"
22 #include "libset.h"
23 #include "usermat.h"
24 #include "uservar.h"
25 #include "genparse.h"
26 #include "gretl_xml.h"
27 #include "gretl_midas.h"
28 #include "matrix_extra.h"
29 #include "gretl_www.h"
30 #include "gretl_join.h"
31 #include "join_priv.h"
32 #include "csvdata.h"
33 
34 #ifdef WIN32
35 # include "gretl_win32.h"
36 #endif
37 
38 #include <errno.h>
39 
/* set non-zero for debugging output on CSV reading in general */
#define CDEBUG 0

/* size of the @str buffer in struct csvdata_; fixed-format
   field widths are truncated so as to fit within it */
#define CSVSTRLEN 128
43 
/* Bit flags stored in the @flags member of struct csvdata_;
   each occupies a distinct bit so that they can be OR'd
   together. They are tested and set via the csv_*() macros.
*/
enum {
    CSV_HAVEDATA = 0x00001, /* caller's dataset already holds data */
    CSV_GOTDELIM = 0x00002,
    CSV_GOTTAB   = 0x00004,
    CSV_GOTSEMI  = 0x00008,
    CSV_BLANK1   = 0x00010,
    CSV_OBS1     = 0x00020,
    CSV_TRAIL    = 0x00040,
    CSV_AUTONAME = 0x00080,
    CSV_REVERSED = 0x00100,
    CSV_DOTSUB   = 0x00200,
    CSV_ALLCOLS  = 0x00400,
    CSV_BOM      = 0x00800,
    CSV_VERBOSE  = 0x01000,
    CSV_THOUSEP  = 0x02000,
    CSV_NOHEADER = 0x04000,
    CSV_QUOTES   = 0x08000, /* the initial state set by csvdata_new() */
    CSV_AS_MAT   = 0x10000
};
63 
64 struct csvprobe_ {
65     DATASET *dset; /* more info might be wanted */
66 };
67 
68 typedef struct csvprobe_ csvprobe;
69 
70 struct csvdata_ {
71     int flags;
72     char delim;
73     char decpoint;
74     char thousep;
75     char qchar;
76     int markerpd;
77     int maxlinelen;
78     int real_n;
79     char *line;
80     DATASET *dset;
81     int ncols, nrows;
82     long datapos;
83     char str[CSVSTRLEN];
84     char skipstr[8];
85     int *codelist;
86     char *descrip;
87     const char *user_na;
88     gretl_string_table *st;
89     int *cols_list;
90     int *width_list;
91     const gretl_matrix *rowmask;
92     int masklen;
93     joinspec *jspec; /* info used for "join" command */
94     csvprobe *probe; /* used in connection with "join" */
95 };
96 
/* Flag tests: each yields non-zero if the corresponding CSV_*
   bit is set in the @flags member of the csvdata pointed to by
   @c. The argument is parenthesized so that any pointer-valued
   expression (e.g. "p + 1" or "&obj") can be passed safely.
*/
#define csv_has_trailing_comma(c) ((c)->flags & CSV_TRAIL)
#define csv_has_obs_column(c)     ((c)->flags & CSV_OBS1)
#define csv_has_blank_column(c)   ((c)->flags & CSV_BLANK1)
#define csv_got_tab(c)            ((c)->flags & CSV_GOTTAB)
#define csv_got_semi(c)           ((c)->flags & CSV_GOTSEMI)
#define csv_got_delim(c)          ((c)->flags & CSV_GOTDELIM)
#define csv_autoname(c)           ((c)->flags & CSV_AUTONAME)
#define csv_skip_col_1(c)         ((c)->flags & (CSV_OBS1 | CSV_BLANK1))
#define csv_have_data(c)          ((c)->flags & CSV_HAVEDATA)
#define csv_data_reversed(c)      ((c)->flags & CSV_REVERSED)
#define csv_do_dotsub(c)          ((c)->flags & CSV_DOTSUB)
#define csv_all_cols(c)           ((c)->flags & CSV_ALLCOLS)
#define csv_has_bom(c)            ((c)->flags & CSV_BOM)
#define csv_is_verbose(c)         ((c)->flags & CSV_VERBOSE)
#define csv_scrub_thousep(c)      ((c)->flags & CSV_THOUSEP)
#define csv_no_header(c)          ((c)->flags & CSV_NOHEADER)
#define csv_keep_quotes(c)        ((c)->flags & CSV_QUOTES)
#define csv_as_matrix(c)          ((c)->flags & CSV_AS_MAT)
115 
/* Flag setters/unsetters: switch the corresponding CSV_* bit
   on or off in the @flags member of the csvdata pointed to by
   @c. The argument is parenthesized so that any pointer-valued
   expression can be passed safely.
*/
#define csv_set_trailing_comma(c)   ((c)->flags |= CSV_TRAIL)
#define csv_unset_trailing_comma(c) ((c)->flags &= ~CSV_TRAIL)
#define csv_set_obs_column(c)       ((c)->flags |= CSV_OBS1)
#define csv_set_blank_column(c)     ((c)->flags |= CSV_BLANK1)
#define csv_set_got_tab(c)          ((c)->flags |= CSV_GOTTAB)
#define csv_set_got_semi(c)         ((c)->flags |= CSV_GOTSEMI)
#define csv_set_got_delim(c)        ((c)->flags |= CSV_GOTDELIM)
#define csv_set_autoname(c)         ((c)->flags |= CSV_AUTONAME)
#define csv_set_data_reversed(c)    ((c)->flags |= CSV_REVERSED)
#define csv_set_dotsub(c)           ((c)->flags |= CSV_DOTSUB)
#define csv_set_all_cols(c)         ((c)->flags |= CSV_ALLCOLS)
#define csv_set_has_bom(c)          ((c)->flags |= CSV_BOM)
#define csv_set_verbose(c)          ((c)->flags |= CSV_VERBOSE)
#define csv_set_scrub_thousep(c)    ((c)->flags |= CSV_THOUSEP)
#define csv_set_no_header(c)        ((c)->flags |= CSV_NOHEADER)
#define csv_unset_keep_quotes(c)    ((c)->flags &= ~CSV_QUOTES)
#define csv_set_as_matrix(c)        ((c)->flags |= CSV_AS_MAT)
133 
/* State tests on the csvdata pointed to by @c, based on members
   other than @flags. The argument is parenthesized so that any
   pointer-valued expression can be passed safely.
*/

/* a non-empty @skipstr is in place */
#define csv_skip_bad(c)        (*(c)->skipstr != '\0')
/* a string table has been attached */
#define csv_has_non_numeric(c) ((c)->st != NULL)

/* both start columns and widths given: fixed format */
#define fixed_format(c) ((c)->cols_list != NULL && (c)->width_list != NULL)
/* columns list without widths: subset of delimited columns */
#define cols_subset(c) ((c)->cols_list != NULL && (c)->width_list == NULL)
#define rows_subset(c) ((c)->rowmask != NULL)

#define joining(c) ((c)->jspec != NULL)
#define probing(c) ((c)->probe != NULL)
143 
144 static int
145 time_series_label_check (DATASET *dset, int reversed, char *skipstr,
146                          int convert_pd, PRN *prn);
147 
/* file-scope global: csvdata_new() compares this against
   "default", and if it differs uses it as the "user_na" string */
static char import_na[8] = "";
150 
151 
152 /* for use in gretl_join.c */
153 
csvdata_get_dataset(csvdata * c)154 DATASET *csvdata_get_dataset (csvdata *c)
155 {
156     return c->dset;
157 }
158 
159 /* shared with gretl_join.c */
160 
csvdata_free(csvdata * c)161 void csvdata_free (csvdata *c)
162 {
163     if (c == NULL) {
164         return;
165     }
166 
167     if (c->descrip != NULL) {
168         free(c->descrip);
169     }
170 
171     if (c->st != NULL) {
172         gretl_string_table_destroy(c->st);
173     }
174 
175     if (c->codelist != NULL) {
176         free(c->codelist);
177     }
178 
179     if (c->line != NULL) {
180         free(c->line);
181     }
182 
183     if (c->cols_list != NULL) {
184         free(c->cols_list);
185         free(c->width_list);
186     }
187 
188     destroy_dataset(c->dset);
189 
190     free(c);
191 }
192 
csvdata_new(DATASET * dset)193 static csvdata *csvdata_new (DATASET *dset)
194 {
195     csvdata *c = malloc(sizeof *c);
196 
197     if (c == NULL) {
198         return NULL;
199     }
200 
201     c->flags = CSV_QUOTES;
202     c->delim = '\t';
203     c->thousep = 0;
204     c->qchar = 0;
205     c->markerpd = -1;
206     c->maxlinelen = 0;
207     c->real_n = 0;
208     c->line = NULL;
209     c->dset = NULL;
210     c->ncols = 0;
211     c->nrows = 0;
212     c->datapos = 0;
213     *c->str = '\0';
214     *c->skipstr = '\0';
215     c->codelist = NULL;
216     c->descrip = NULL;
217     c->user_na = NULL;
218     c->st = NULL;
219     c->cols_list = NULL;
220     c->width_list = NULL;
221     c->rowmask = NULL;
222     c->masklen = 0;
223 
224     if (strcmp(import_na, "default")) {
225         c->user_na = import_na;
226     }
227 
228     c->jspec = NULL;
229     c->probe = NULL;
230 
231     c->dset = datainfo_new();
232 
233     if (c->dset == NULL) {
234         free(c);
235         c = NULL;
236     } else {
237         c->delim = get_data_export_delimiter();
238         c->decpoint = get_data_export_decpoint();
239         if (dset != NULL && dset->Z != NULL) {
240             c->flags |= CSV_HAVEDATA;
241         }
242 #if CDEBUG
243         fprintf(stderr, "csvdata_new: c->delim = '%c', c->decpoint = '%c'\n",
244                 c->delim, c->decpoint);
245 #endif
246     }
247 
248     return c;
249 }
250 
cols_list_from_matrix(const char * s,int * err)251 static int *cols_list_from_matrix (const char *s, int *err)
252 {
253     gretl_matrix *m = get_matrix_by_name(s);
254     int i, n = gretl_vector_get_length(m);
255     int *list = NULL;
256 
257     if (n == 0) {
258         *err = E_DATA;
259     } else {
260         list = gretl_list_new(n);
261         if (list == NULL) {
262             *err = E_ALLOC;
263         } else {
264             for (i=0; i<n; i++) {
265                 list[i+1] = gretl_vector_get(m, i);
266             }
267         }
268     }
269 
270     return list;
271 }
272 
273 /* The interpretation of the "cols" specification depends on
274    @opt: if this includes OPT_L then it should provide a 1-based
275    list of columns to be read; but if @opt includes OPT_F it
276    should provide a fixed-format spec, consisting of pairs
277    (start column, width).
278 */
279 
csvdata_add_cols_list(csvdata * c,const char * s,gretlopt opt)280 static int csvdata_add_cols_list (csvdata *c, const char *s,
281                                   gretlopt opt)
282 {
283     int delimited = (opt & OPT_L);
284     int *list, *clist = NULL, *wlist = NULL;
285     int i, n, m = 0;
286     int err = 0;
287 
288     if (get_matrix_by_name(s)) {
289         list = cols_list_from_matrix(s, &err);
290     } else {
291         list = gretl_list_from_string(s, &err);
292     }
293 
294     if (!err) {
295         n = list[0];
296         if (n == 0) {
297             err = E_DATA;
298         } else if (delimited) {
299             m = n;
300             clist = list;
301         } else {
302             /* fixed format: we need two lists */
303             if (n % 2 != 0) {
304                 err = E_DATA;
305             } else {
306                 m = n / 2;
307                 clist = gretl_list_new(m);
308                 wlist = gretl_list_new(m);
309                 if (clist == NULL || wlist == NULL) {
310                     err = E_ALLOC;
311                 } else {
312                     int j = 1;
313 
314                     for (i=1; i<=n; i+=2, j++) {
315                         clist[j] = list[i];
316                         wlist[j] = list[i+1];
317                     }
318                 }
319             }
320         }
321     }
322 
323     /* clist = column (start) list: must be a set of increasing
324        positive integers; and wlist = respective column widths,
325        must all be positive, if present
326     */
327 
328     for (i=1; i<=m && !err; i++) {
329         if (clist[i] <= 0 || (i > 1 && clist[i] <= clist[i-1])) {
330             err = E_DATA;
331         } else if (wlist != NULL && wlist[i] <= 0) {
332             err = E_DATA;
333         } else if (wlist != NULL && wlist[i] >= CSVSTRLEN) {
334             fprintf(stderr, "Warning: field %d too wide (%d), truncating\n",
335                     i, wlist[i]);
336             wlist[i] = CSVSTRLEN - 1;
337         }
338     }
339 
340     if (list != clist) {
341         free(list);
342     }
343 
344     if (!err) {
345         c->cols_list = clist;
346         c->width_list = wlist;
347     } else {
348         free(clist);
349         free(wlist);
350         if (err == E_DATA) {
351             gretl_errmsg_set(_("Invalid column specification"));
352         }
353     }
354 
355     return err;
356 }
357 
/* Look up the matrix named @s and attach it to @c as row mask,
   recording its vector length. Returns E_DATA if there's no
   such matrix, E_NONCONF if its vector length is zero,
   otherwise 0.
*/

static int csvdata_add_row_mask (csvdata *c, const char *s)
{
    c->rowmask = get_matrix_by_name(s);

    if (c->rowmask == NULL) {
        gretl_errmsg_sprintf(_("'%s': no such matrix"), s);
        return E_DATA;
    }

    c->masklen = gretl_vector_get_length(c->rowmask);

    return (c->masklen == 0)? E_NONCONF : 0;
}
375 
/* Count the non-zero elements of the row mask, looking no
   further than the mask length and index @nrows (inclusive) */

static int n_from_row_mask (csvdata *c)
{
    int count = 0;
    int k = 0;

    while (k < c->masklen && k <= c->nrows) {
        if (gretl_vector_get(c->rowmask, k) != 0) {
            count++;
        }
        k++;
    }

    return count;
}
388 
/* Extend the observation-markers array of @dset to hold @n
   strings, allocating the new last marker (OBSLEN bytes) and
   initializing it to "NA". Returns 0 on success or E_ALLOC.
*/

static int add_obs_marker (DATASET *dset, int n)
{
    char **S = realloc(dset->S, n * sizeof *S);
    char *marker;

    if (S == NULL) {
        /* the existing array is left as it was */
        return E_ALLOC;
    }

    dset->S = S;

    marker = malloc(OBSLEN);
    dset->S[n-1] = marker;
    if (marker == NULL) {
        return E_ALLOC;
    }

    strcpy(marker, "NA");

    return 0;
}
408 
/* Append one observation to @dset: every series is enlarged by
   one element, the new value being 1.0 for series 0 and NADBL
   for the rest. If observation markers are present a marker
   is added too. Returns 0 on success or E_ALLOC.
*/

static int add_single_obs (DATASET *dset)
{
    int newn = dset->n + 1;
    int i;

    for (i=0; i<dset->v; i++) {
        double *x = realloc(dset->Z[i], newn * sizeof *x);

        if (x == NULL) {
            return E_ALLOC;
        }
        dset->Z[i] = x;
    }

    dset->n = newn;

    dset->Z[0][newn - 1] = 1.0;
    for (i=1; i<dset->v; i++) {
        dset->Z[i][newn - 1] = NADBL;
    }

    if (dset->S != NULL) {
        return add_obs_marker(dset, newn);
    }

    return 0;
}
436 
/* Add @add observations to @dset and then open up the gaps
   implied by the date markers: wherever the calendar position
   of observation t (net of gaps already handled) exceeds t,
   the data from t onward are shifted up by the size of the
   gap and the vacated slots are filled with NADBL. Progress
   is reported on stderr. Returns 0 on success, or the error
   code from dataset_add_observations().
*/

static int pad_weekly_data (DATASET *dset, int add)
{
    int n0 = dset->n;  /* the original number of observations */
    int shift = 0;     /* cumulated size of the gaps found */
    int gap, dest, nmove;
    int i, k, t, tcal;
    int err;

    err = dataset_add_observations(dset, add, OPT_A);
    if (err) {
        return err;
    }

    for (t=0; t<n0; t++) {
        tcal = calendar_obs_number(dset->S[t], dset) - shift;
        if (tcal == t) {
            continue;
        }

        gap = tcal - t;
        fprintf(stderr, "Gap of size %d at original t = %d\n", gap, t);
        shift += gap;

        /* work downward from the last observation in use */
        dest = n0 - 1 + shift;
        nmove = n0 - t;

        for (k=0; k<nmove+gap; k++, dest--) {
            for (i=1; i<dset->v; i++) {
                if (k < nmove) {
                    if (k == 0 || k == nmove - 1) {
                        fprintf(stderr, "shifting obs %d to obs %d\n",
                                dest-gap, dest);
                    }
                    dset->Z[i][dest] = dset->Z[i][dest - gap];
                } else {
                    fprintf(stderr, "inserting NA at obs %d\n", dest);
                    dset->Z[i][dest] = NADBL;
                }
            }
        }
    }

    return 0;
}
474 
475 /* FIXME the following needs to be made more flexible? */
476 
csv_weekly_data(DATASET * dset)477 static int csv_weekly_data (DATASET *dset)
478 {
479     char *lbl2 = dset->S[dset->n - 1];
480     int ret = 1;
481     int misscount = 0;
482     int t, tc;
483 
484     for (t=0; t<dset->n; t++) {
485         tc = calendar_obs_number(dset->S[t], dset) - misscount;
486         if (tc != t) {
487             misscount += tc - t;
488         }
489     }
490 
491     if (misscount > 0) {
492         double missfrac = (double) misscount / dset->n;
493 
494         fprintf(stderr, "nobs = %d, misscount = %d (%.2f%%)\n",
495                 dset->n, misscount, 100.0 * missfrac);
496         if (missfrac > 0.05) {
497             ret = 0;
498         } else {
499             int Tc = calendar_obs_number(lbl2, dset) + 1;
500             int altmiss = Tc - dset->n;
501 
502             fprintf(stderr, "check: Tc = %d, missing = %d\n", Tc, altmiss);
503             if (altmiss != misscount) {
504                 ret = 0;
505             } else if (dset->Z != NULL) {
506                 int err;
507 
508                 fprintf(stderr, "OK, consistent\n");
509                 err = pad_weekly_data(dset, misscount);
510                 if (err) ret = 0;
511             }
512         }
513     }
514 
515     return ret;
516 }
517 
518 #define DAY_DEBUG 1
519 
check_daily_dates(DATASET * dset,int * pd,int * reversed,PRN * prn)520 static int check_daily_dates (DATASET *dset, int *pd,
521                               int *reversed, PRN *prn)
522 {
523     int T = dset->n;
524     char *lbl1 = dset->S[0];
525     char *lbl2 = dset->S[T - 1];
526     int fulln = 0, n, t, nbak;
527     int alt_pd = 0;
528     int oldpd = dset->pd;
529     double oldsd0 = dset->sd0;
530     guint32 ed1, ed2;
531     int nmiss = 0, err = 0;
532 
533     *pd = 0;
534 
535     ed1 = get_epoch_day(lbl1);
536     ed2 = get_epoch_day(lbl2);
537     if (ed1 <= 0 || ed2 <= 0) {
538         err = 1;
539     }
540 
541 #if DAY_DEBUG
542     fprintf(stderr, "check_daily_dates: '%s' -> %d, '%s' -> %d\n",
543             lbl1, (int) ed1, lbl2, (int) ed2);
544 #endif
545 
546     dset->pd = guess_daily_pd(dset);
547     dset->structure = TIME_SERIES;
548 
549 #if DAY_DEBUG
550     fprintf(stderr, "guessed at daily pd = %d\n", dset->pd);
551 #endif
552 
553     if (!err) {
554         if (ed2 < ed1) {
555 #if DAY_DEBUG
556             fprintf(stderr, "check_daily_dates: data are reversed?\n");
557 #endif
558             dset->sd0 = ed2;
559             *reversed = 1;
560         } else {
561             dset->sd0 = ed1;
562         }
563     }
564 
565  recompute:
566 
567     alt_pd = 0;
568     nbak = 0;
569 
570     if (!err) {
571         guint32 n1 = (*reversed)? ed2 : ed1;
572         guint32 n2 = (*reversed)? ed1 : ed2;
573 
574         fulln = n2 - n1 + 1;
575 
576         if (T > fulln) {
577             err = 1;
578         } else {
579             nmiss = fulln - T;
580             pprintf(prn, _("Observations: %d; days in sample: %d\n"),
581                     T, fulln);
582             if (nmiss > 300 * T) {
583                 pprintf(prn, _("Probably annual data\n"));
584                 *pd = 1;
585             } else if (nmiss > 50 * T) {
586                 pprintf(prn, _("Probably quarterly data\n"));
587                 *pd = 4;
588             } else if (nmiss > 20 * T) {
589                 pprintf(prn, _("Probably monthly data\n"));
590                 *pd = 12;
591             } else if (nmiss > 3 * T) {
592                 pprintf(prn, _("Probably weekly data\n"));
593                 *pd = dset->pd = 52;
594             } else {
595                 pprintf(prn, _("Missing daily rows: %d\n"), nmiss);
596             }
597         }
598     }
599 
600     nbak = 0;
601 
602     for (t=0; t<dset->n && !err; t++) {
603         int wd, s = (*reversed)? (dset->n - 1 - t) : t;
604 
605         wd = weekday_from_date(dset->S[s]);
606 
607         if (dset->pd == 5 && (wd == 6 || wd == 0)) {
608             /* Got Sat or Sun, can't be 5-day daily? */
609             alt_pd = (wd == 6)? 6 : 7;
610             pprintf(prn, "Found a Saturday (%s): re-trying with pd = %d\n",
611                     dset->S[s], alt_pd);
612             break;
613         } else if (dset->pd == 6 && wd == 0) {
614             /* Got Sun, can't be 6-day daily? */
615             alt_pd = 7;
616             pprintf(prn, "Found a Sunday (%s): re-trying with pd = %d\n",
617                     dset->S[s], alt_pd);
618             break;
619         }
620 
621         n = calendar_obs_number(dset->S[s], dset);
622         if (n < t) {
623             pprintf(prn, "Daily dates error at t = %d:\n"
624                     "  calendar_obs_number() for '%s' = %d but t = %d\n",
625                     t, dset->S[s], n, t);
626             err = 1;
627         } else if (n > fulln - 1) {
628             pprintf(prn, "Error: date '%s' out of bounds\n", dset->S[s]);
629             err = 1;
630         } else if (nbak > 0 && n == nbak) {
631             pprintf(prn, "Error: date '%s' is repeated\n", dset->S[s]);
632             err = 1;
633         }
634         nbak = n;
635     }
636 
637     if (alt_pd > 0) {
638         dset->pd = alt_pd;
639         goto recompute;
640     }
641 
642     if (err) {
643         dset->pd = oldpd;
644         dset->sd0 = oldsd0;
645         dset->structure = CROSS_SECTION;
646     } else {
647         strcpy(dset->stobs, (*reversed)? lbl2 : lbl1);
648         strcpy(dset->endobs, (*reversed)? lbl1 : lbl2);
649         dset->t2 = dset->n - 1;
650         if (nmiss > 0 && *pd == 0) {
651             dset->markers = DAILY_DATE_STRINGS;
652         }
653     }
654 
655 #if DAY_DEBUG
656     fprintf(stderr, "check_daily_dates: daily pd = %d, reversed = %d, err = %d\n",
657             dset->pd, *reversed, err);
658 #endif
659 
660     return (err)? -1 : dset->pd;
661 }
662 
663 /* convert from daily date label to a lower frequency --
664    annual, monthly or quarterly -- if @pd indicates this
665    is required
666 */
667 
convert_daily_label(char * targ,const char * src,int pd)668 static void convert_daily_label (char *targ, const char *src,
669                                  int pd)
670 {
671     int y, m, d;
672 
673     sscanf(src, YMD_READ_FMT, &y, &m, &d);
674 
675     if (pd == 1) {
676         sprintf(targ, "%d", y);
677     } else if (pd == 12) {
678         sprintf(targ, "%d:%02d", y, m);
679     } else if (pd == 4) {
680         sprintf(targ, "%d:%d", y, m / 3 + (m % 3 != 0));
681     }
682 }
683 
/* There's a special case (ugh!) where observation strings are
   given as in monthly data, but the frequency is in fact
   quarterly, as in:

   1947.06
   1947.09
   1947.12
   1948.03

   We'll make a brave attempt to handle this: fakequarter()
   tests whether "month" @m is one that ends a quarter. The
   argument is parenthesized so that a compound expression
   can be passed safely.
*/

#define fakequarter(m) ((m) == 3 || (m) == 6 || (m) == 9 || (m) == 12)
697 
/* Check whether the observation labels in @dset form a
   consistent sequence of (year, sub-period) dates, as read
   using @fmt, at the frequency given in @ppd. Labels are
   taken in reverse order if @reversed is non-zero, and are
   first passed through convert_daily_label() if @convert_pd
   is non-zero.

   Three anomalies trigger a restart of the check:
   - a sub-period one greater than the frequency, where period
     1 was expected: retry with the frequency augmented by
     one, and on success write the offending suffix into
     @skipstr (this is tried only if @skipstr is non-NULL);
   - "months" advancing in steps of 3 and falling on quarter
     ends: on success the content of @ppd is set to 4;
   - apparently monthly labels that run 4 -> 1 across a year
     boundary: @extra_zero is flagged and the check is redone
     as quarterly, with @ppd set to 4.

   Returns 1 if the labels are consistent, 0 otherwise.
*/

static int consistent_qm_labels (DATASET *dset, int reversed,
                                 int convert_pd, char *skipstr,
                                 int *ppd, const char *fmt,
                                 int *extra_zero, PRN *prn)
{
    char oddlbl[16], skiptmp[8];
    char label[OBSLEN];
    int want_yr;  /* expected year */
    int want_per; /* expected sub-period */
    int yr, per;
    int step = 1; /* expected increment in sub-period */
    int pd, pd0;
    int t, s;
    int ok = 1;

    pd = pd0 = *ppd;

 restart:

    s = reversed ? (dset->n - 1) : 0;

    if (convert_pd) {
        convert_daily_label(label, dset->S[s], pd);
    } else {
        strcpy(label, dset->S[s]);
    }

    if (sscanf(label, fmt, &yr, &per) != 2) {
        return 0;
    }

    for (t=1; t<dset->n; t++) {
        s = reversed ? (dset->n - 1 - t) : t;

        /* what should follow the previous observation? */
        if (per == pd) {
            want_yr = yr + 1;
            want_per = step;
        } else {
            want_yr = yr;
            want_per = per + step;
        }

        if (convert_pd) {
            convert_daily_label(label, dset->S[s], pd);
        } else {
            strcpy(label, dset->S[s]);
        }

        if (sscanf(label, fmt, &yr, &per) != 2) {
            ok = 0;
        } else if (want_per == 1 && pd == pd0 && per == pd + 1
                   && skipstr != NULL) {
            /* an extra sub-period beyond the nominal frequency */
            *skiptmp = *oddlbl = '\0';
            strncat(skiptmp, label + 4, 7);
            strncat(oddlbl, label, OBSLEN-1);
            pd = pd0 + 1;
            goto restart;
        } else if (per == want_per + 2 && step == 1 && fakequarter(per)) {
            /* advancing by three "months" at a time */
            *oddlbl = '\0';
            strncat(oddlbl, label, OBSLEN-1);
            step = 3;
            goto restart;
        } else if (pd == 12 && want_per == 5 && per == 1 && yr == want_yr + 1) {
            /* apparently monthly but really quarterly? */
            pprintf(prn, "   \"%s\": quarterly date with spurious zero?\n", label);
            *extra_zero = 1;
            *ppd = pd0 = pd = 4;
            goto restart;
        } else if (yr != want_yr || per != want_per) {
            ok = 0;
        }

        if (!ok) {
            pprintf(prn, "   %s: not a consistent date\n", label);
            break;
        }
    }

    if (ok) {
        if (step == 3) {
            pprintf(prn, "   \"%s\": quarterly data pretending to be monthly?\n",
                    oddlbl);
            *ppd = 4;
        } else if (pd == pd0 + 1) {
            pprintf(prn, "   \"%s\": BLS-type nonsense? Trying again\n",
                    oddlbl);
            strcpy(skipstr, skiptmp);
        }
    }

    return ok;
}
783 
consistent_year_labels(const DATASET * dset,int reversed,int convert_pd)784 static int consistent_year_labels (const DATASET *dset,
785                                    int reversed,
786                                    int convert_pd)
787 {
788     char label[OBSLEN];
789     int s, t, yr, yprev;
790     int ret = 1;
791 
792     s = (reversed)? (dset->n - 1) : 0;
793     yprev = atoi(dset->S[s]);
794 
795     for (t=1; t<dset->n; t++) {
796         s = reversed ? (dset->n - 1 - t) : t;
797         if (convert_pd) {
798             convert_daily_label(label, dset->S[s], 1);
799             yr = atoi(label);
800         } else {
801             yr = atoi(dset->S[s]);
802         }
803         if (yr != yprev + 1) {
804             ret = 0;
805             break;
806         }
807         yprev = yr;
808     }
809 
810     return ret;
811 }
812 
813 /* check for all 1s in first column of dates: this may
814    indicate start-of-period dates, day first */
815 
all_day_ones(DATASET * dset)816 static int all_day_ones (DATASET *dset)
817 {
818     int t;
819 
820     for (t=1; t<dset->n; t++) {
821         if (atoi(dset->S[t]) != 1) {
822             return 0;
823         } else if (t > 31) {
824             /* "1" can't mean January */
825             return 1;
826         }
827     }
828 
829     return 0;
830 }
831 
/* the orderings of year, month and day that we try */
enum date_orders {
    YYYYMMDD = 1,
    MMDDYYYY = 2,
    DDMMYYYY = 3
};
837 
/* Guess the order of the date components, given the first
   numeric field of the first (@f0) and last (@fn) labels
   in @dset */

static int get_date_order (int f0, int fn, DATASET *dset)
{
    int fmax = (f0 > fn)? f0 : fn;

    if (fmax > 31) {
        /* first field must be year */
        return YYYYMMDD;
    }

    if (fmax > 12) {
        /* first field must be day */
        return DDMMYYYY;
    }

    if (f0 == 1 && fn == 1 && all_day_ones(dset)) {
        /* start-of-period dates, day first? */
        return DDMMYYYY;
    }

    /* could be wrong here */
    return MMDDYYYY;
}
854 
/* We apparently guessed wrongly at MMDDYYYY, so put the dates
   back as they were for another try, at DDMMYYYY: each label
   is read as year plus two further fields, and rewritten in
   place with those two fields first and the year last.
*/

static void retransform_daily_dates (DATASET *dset)
{
    int yr, f1, f2;
    int t;

    for (t=0; t<dset->n; t++) {
        char *label = dset->S[t];

        sscanf(label, YMD_READ_FMT, &yr, &f1, &f2);
        sprintf(label, YMD_WRITE_FMT, f1, f2, yr);
    }
}
869 
/* Rewrite the observation labels of @dset in place using
   YMD_WRITE_Y2_FMT, given that the date components are
   currently in the order @dorder and are separated by @sep
   (or, if @sep is not positive, are given as a fixed-width
   "4-2-2" digit string). Returns 0 on success, or 1 as
   soon as a label fails to yield three numeric fields.
*/

static int transform_daily_dates (DATASET *dset, int dorder,
                                  char sep)
{
    char fmt[16];
    int yr, mon, day;
    int t, nf;

    if (sep > 0) {
        sprintf(fmt, "%%d%c%%d%c%%d", sep, sep);
    } else {
        strcpy(fmt, "%4d%2d%2d");
    }

    for (t=0; t<dset->n; t++) {
        char *label = dset->S[t];

        switch (dorder) {
        case YYYYMMDD:
            nf = sscanf(label, fmt, &yr, &mon, &day);
            break;
        case DDMMYYYY:
            nf = sscanf(label, fmt, &day, &mon, &yr);
            break;
        default:
            /* MMDDYYYY */
            nf = sscanf(label, fmt, &mon, &day, &yr);
            break;
        }

        if (nf != 3) {
            return 1;
        }

        sprintf(label, YMD_WRITE_Y2_FMT, yr, mon, day);
    }

    return 0;
}
901 
/* Reverse the order of the observations in @dset, for all
   series other than series 0 and also for the observation
   markers, if present. A message is printed to @prn.
*/

void reverse_data (DATASET *dset, PRN *prn)
{
    char tmp[OBSLEN];
    int lo, hi, i;

    pprintf(prn, _("reversing the data!\n"));

    /* swap pairs, working inward from the two ends */
    for (lo=0, hi=dset->n-1; lo<hi; lo++, hi--) {
        for (i=1; i<dset->v; i++) {
            double x = dset->Z[i][lo];

            dset->Z[i][lo] = dset->Z[i][hi];
            dset->Z[i][hi] = x;
        }
        if (dset->S != NULL) {
            strcpy(tmp, dset->S[lo]);
            strcpy(dset->S[lo], dset->S[hi]);
            strcpy(dset->S[hi], tmp);
        }
    }
}
925 
/* Examine the first and last observation labels of @dset to
   see if they can be read as calendar dates: either three
   numbers with a common punctuation separator, or an
   unseparated "4-2-2" digit string. If so, guess at the order
   of the components, convert the labels to canonical form if
   necessary, and pass the dataset on for further checking. If
   an initial guess of MMDDYYYY doesn't work out, DDMMYYYY is
   tried instead.

   @reversed: set by check_daily_dates() if the dates run
   backwards.
   @skipstr: passed on to time_series_label_check().
   @prn: for printing progress messages.

   Returns -1 if the labels can't be handled as dates. Otherwise
   returns 52 for weekly data, or the value obtained from
   check_daily_dates() or time_series_label_check().
*/

static int csv_daily_date_check (DATASET *dset, int *reversed,
                                 char *skipstr, PRN *prn)
{
    int d1[3], d2[3];
    char s1 = 0, s2 = 0;
    char *lbl1 = dset->S[0];
    char *lbl2 = dset->S[dset->n - 1];
    int dorder = 0;

    /* note: the argument to ispunct() must be representable as
       unsigned char, hence the cast of the (plain char) separator
    */
    if ((sscanf(lbl1, "%d%c%d%c%d", &d1[0], &s1, &d1[1], &s2, &d1[2]) == 5 &&
         sscanf(lbl2, "%d%c%d%c%d", &d2[0], &s1, &d2[1], &s2, &d2[2]) == 5 &&
         s1 == s2 && ispunct((unsigned char) s1)) ||
        (sscanf(lbl1, "%4d%2d%2d", &d1[0], &d1[1], &d1[2]) == 3 &&
         sscanf(lbl2, "%4d%2d%2d", &d2[0], &d2[1], &d2[2]) == 3)) {
        int mon1, day1;
        int mon2, day2;
        int pd, ret = 0;

        dorder = get_date_order(d1[0], d2[0], dset);

    tryagain:

        /* pick out month and day as per the presumed order */
        if (dorder == YYYYMMDD) {
            pputs(prn, _("Trying date order YYYYMMDD\n"));
            mon1 = d1[1];
            day1 = d1[2];
            mon2 = d2[1];
            day2 = d2[2];
        } else if (dorder == DDMMYYYY) {
            pputs(prn, _("Trying date order DDMMYYYY\n"));
            day1 = d1[0];
            mon1 = d1[1];
            day2 = d2[0];
            mon2 = d2[1];
        } else {
            pputs(prn, _("Trying date order MMDDYYYY\n"));
            mon1 = d1[0];
            day1 = d1[1];
            mon2 = d2[0];
            day2 = d2[1];
        }

        if (mon1 > 0 && mon1 < 13 &&
            mon2 > 0 && mon2 < 13 &&
            day1 > 0 && day1 < 32 &&
            day2 > 0 && day2 < 32) {
            /* looks promising for calendar dates, but check
               further if we don't have the canonical order
               or separator
            */
            if (dorder != YYYYMMDD || s1 != '-') {
                if (transform_daily_dates(dset, dorder, s1)) {
                    return -1;
                }
                /* the labels now use the canonical separator */
                s1 = '-';
            }
            pprintf(prn, _("Could be %s - %s\n"), lbl1, lbl2);
            ret = check_daily_dates(dset, &pd, reversed, prn);
            if (ret >= 0 && pd > 0) {
                /* the data appear to be of lower frequency */
                if (pd == 52) {
                    if (csv_weekly_data(dset)) {
                        ret = 52;
                    } else if (dorder == MMDDYYYY) {
                        /* maybe we guessed wrong */
                        retransform_daily_dates(dset);
                        dorder = DDMMYYYY;
                        goto tryagain;
                    } else {
                        ret = -1;
                    }
                } else {
                    int convert_pd = 0;

                    if (pd == 1 || pd == 4 || pd == 12) {
                        convert_pd = pd;
                    }
                    ret = time_series_label_check(dset,
                                                  *reversed,
                                                  skipstr,
                                                  convert_pd,
                                                  prn);
                    if (ret < 0 && dorder == MMDDYYYY) {
                        /* maybe we guessed wrong */
                        retransform_daily_dates(dset);
                        dorder = DDMMYYYY;
                        goto tryagain;
                    }
                }
            }
            return ret;
        }
    } else {
        pprintf(prn, _("'%s' and '%s': couldn't get dates\n"), lbl1, lbl2);
    }

    return -1;
}
1022 
/* Try to infer a data frequency from a single observation label
   that is expected to begin with a four-digit year.

   @lbl: the label to examine.
   @year: the first (up to) four characters of @lbl are appended
   to this string via strncat(), so it must hold a valid (normally
   empty) string on entry and have room for 4 characters plus NUL.
   @subp: receives the sub-period digits (at most 2 characters plus
   NUL) when a recognized separator follows the year.
   @format: if non-NULL, receives a scanf-style format such as
   "%d:%d" or "%d-Q%d" when the result is 4 or 12.
   @prn: printer for diagnostic messages.

   Returns 1 (annual), 4 (quarterly) or 12 (monthly), or -1 if no
   frequency could be determined.
*/

static int pd_from_date_label (const char *lbl, char *year, char *subp,
                               char *format, PRN *prn)
{
    const char *subchars = ".:QqMmPp-";
    int len = strlen(lbl);
    int try, pd = -1;

    strncat(year, lbl, 4);
    try = atoi(year);

    if (try > 0 && try < 3000) {
        pprintf(prn, _("   %s: probably a year... "), year);
    } else {
        pprintf(prn, _("   %s: probably not a year\n"), year);
    }

    if (len == 5) {
        pputs(prn, _("   but I can't make sense of the extra bit\n"));
    } else if (len == 4) {
        pputs(prn, _("and just a year\n"));
        pd = 1;
    } else if (len > 5) {
        /* only look at lbl[4] and lbl + 5 when they are known to lie
           within the string: for labels shorter than 4 characters
           those positions are beyond the terminating NUL
        */
        char sep = lbl[4];
        char sub[3], *s = NULL;
        int dashQ = 0;
        int p;

        if (strchr(subchars, sep)) {
            *sub = '\0';
            strncat(sub, lbl + 5, 2);
            s = sub;
            if (len == 6 || (len == 7 && (sep == 'q' || sep == 'Q'))) {
                /* for YYYYQnn skip the first sub-period character */
                if (len == 7) s++;
                p = atoi(s);
                if (p > 0 && p < 5) {
                    pprintf(prn, _("quarter %s?\n"), s);
                    pd = 4;
                } else {
                    pprintf(prn, "quarter %d: not possible\n", p);
                }
            } else if (len == 7) {
                if (*s == 'Q') {
                    /* YYYY-Qn? This is supported by SDMX */
                    dashQ = 1;
                    s++;
                }
                p = atoi(s);
                if (dashQ) {
                    if (p > 0 && p < 5) {
                        pprintf(prn, _("quarter %d?\n"), p);
                        pd = 4;
                    } else {
                        pprintf(prn, "quarter %d: not possible\n", p);
                    }
                } else {
                    if (p > 0 && p < 13) {
                        pprintf(prn, _("month %s?\n"), s);
                        pd = 12;
                    } else {
                        pprintf(prn, "month %d: not possible\n", p);
                    }
                }
            }
            strcpy(subp, s);
            if (format != NULL && (pd == 4 || pd == 12)) {
                if (dashQ) {
                    sprintf(format, "%%d%cQ%%d", sep);
                } else {
                    sprintf(format, "%%d%c%%d", sep);
                }
            }
        }
    }

    return pd;
}
1099 
/* Examine the starting observation label of @dset (the last label
   if @reversed is non-zero, otherwise the first) and, if the full
   set of labels is consistent with annual, quarterly or monthly
   data, record the time-series information in @dset.

   If @convert_pd is non-zero the label is first passed through
   convert_daily_label() with that value.

   Returns the periodicity found (1, 4 or 12), or -1 if the labels
   are not complete and consistent; other values coming back from
   pd_from_date_label() are returned unchanged.
*/

static int time_series_label_check (DATASET *dset, int reversed,
                                    char *skipstr, int convert_pd,
                                    PRN *prn)
{
    char yr[5] = "";
    char subper[3] = "";
    char fmt[8] = {0};
    char *first = dset->S[0];
    char *last = dset->S[dset->n - 1];
    char *start = reversed ? last : first;
    int pd;

    if (convert_pd) {
        char altobs[OBSLEN];

        convert_daily_label(altobs, start, convert_pd);
        pd = pd_from_date_label(altobs, yr, subper, fmt, prn);
    } else {
        pd = pd_from_date_label(start, yr, subper, fmt, prn);
    }

    if (pd == 1) {
        /* annual data */
        if (!consistent_year_labels(dset, reversed, convert_pd)) {
            pputs(prn, _("   but the dates are not complete and consistent\n"));
            return -1;
        }
        dset->pd = pd;
        strcpy(dset->stobs, yr);
        dset->sd0 = atof(dset->stobs);
        strcpy(dset->endobs, last);
        dset->structure = TIME_SERIES;
        return pd;
    }

    if (pd == 4 || pd == 12) {
        /* quarterly or monthly data */
        int guessed_pd = pd;
        int extra_zero = 0;

        if (!consistent_qm_labels(dset, reversed, convert_pd,
                                  skipstr, &pd, fmt,
                                  &extra_zero, prn)) {
            pputs(prn, _("   but the dates are not complete and consistent\n"));
            return -1;
        }

        dset->pd = pd;

        if (guessed_pd == 12 && pd == 4) {
            /* the interpretation was switched from monthly to
               quarterly: either 1960Q1 was written as 1960:01
               (extra zero) or as 1960:03 (month number)
            */
            int q = extra_zero ? atoi(subper + 1) : atoi(subper) / 3;

            sprintf(dset->stobs, "%s:%d", yr, q);
        } else {
            sprintf(dset->stobs, "%s:%s", yr, subper);
        }

        dset->sd0 = obs_str_to_double(dset->stobs);
        ntolabel(dset->endobs, dset->n - 1, dset);
    }

    return pd;
}
1168 
/* Compare the leading (up to) four characters of @s1 and @s2,
   read as integers. Returns 1, after printing a note to @prn,
   if the first value exceeds the second; otherwise returns 0.
*/

static int dates_maybe_reversed (const char *s1,
                                 const char *s2,
                                 PRN *prn)
{
    char head1[5] = {0};
    char head2[5] = {0};
    int reversed;

    strncat(head1, s1, 4);
    strncat(head2, s2, 4);

    reversed = atoi(head1) > atoi(head2);

    if (reversed) {
        pputs(prn, _("   dates are reversed?\n"));
    }

    return reversed;
}
1189 
1190 /* e.g. "M1 1957", "M12 2009" */
1191 
/* Check whether all the observation labels in @dset have the form
   "M<month> <year>" or "Q<quarter> <year>", with sub-periods that
   either increase by one or restart at 1. If so, rewrite every
   label in place as "YYYY:MM" (monthly) or "YYYY:Q" (quarterly).

   Returns 1 if the labels were rewritten, 0 if they were left
   untouched.
*/

static int fix_IFS_data_labels (DATASET *dset)
{
    char *s1 = dset->S[0];
    char *s2 = dset->S[dset->n - 1];
    int ret = 0;

    if ((*s1 == 'M' || *s1 == 'Q') && *s2 == *s1) {
        int n1 = strlen(s1);
        int n2 = strlen(s2);

        /* the casts keep isdigit() well-defined when plain char
           is signed and the label holds a byte >= 0x80
        */
        if ((n1 == 7 || n1 == 8) && (n2 == 7 || n2 == 8) &&
            isdigit((unsigned char) s1[1]) &&
            isdigit((unsigned char) s2[1])) {
            int pmax = (*s1 == 'M')? 12 : 4;
            char c, tmp[8], *s;
            int y, p, pbak = 0;
            int i, n, doit = 1;

            /* first pass: validate every label, changing nothing */
            for (i=0; i<dset->n; i++) {
                s = dset->S[i];
                n = strlen(s);
                if (n != 7 && n != 8) {
                    doit = 0;
                    break;
                }
                n = sscanf(s, "%c%d %d", &c, &p, &y);
                if (n != 3 || c != *s1) {
                    doit = 0;
                    break;
                }
                if (y < 1800 || y > 2500 || p <= 0 || p > pmax) {
                    doit = 0;
                    break;
                }
                if (i > 0 && p != pbak + 1 && p != 1) {
                    doit = 0;
                    break;
                }
                pbak = p;
            }

            if (doit) {
                /* second pass: rewrite. Given the range checks above
                   the result is at most 7 characters, so it fits in
                   @tmp.
                */
                for (i=0; i<dset->n; i++) {
                    s = dset->S[i];
                    sscanf(s, "%c%d %d", &c, &p, &y);
                    if (pmax == 12) {
                        snprintf(tmp, sizeof tmp, "%d:%02d", y, p);
                    } else {
                        snprintf(tmp, sizeof tmp, "%d:%d", y, p);
                    }
                    if (strlen(tmp) > strlen(s)) {
                        free(s);
                        dset->S[i] = gretl_strdup(tmp);
                    } else {
                        strcpy(s, tmp);
                    }
                }
                ret = 1;
            }
        }
    }

    return ret;
}
1255 
/* Convert @s to lower case in place (via gretl_lower) and look it
   up among the three-letter English month abbreviations. Returns
   the month number, 1 to 12, or 0 if there is no match.
*/

static int month_number (char *s)
{
    static const char *const abbrev[12] = {
        "jan", "feb", "mar", "apr",
        "may", "jun", "jul", "aug",
        "sep", "oct", "nov", "dec"
    };
    int m = 0;

    gretl_lower(s);

    while (m < 12 && strcmp(s, abbrev[m]) != 0) {
        m++;
    }

    return (m < 12)? m + 1 : 0;
}
1275 
1276 /* e.g. "Jan-1980", for monthly or quarterly data */
1277 
/* Check whether all the observation labels in @dset have the form
   "Mon-YYYY" (three-letter month, dash, four-digit year), with
   month steps of 1 or 3 or a restart at January. If so, rewrite
   every label in place as "YYYY:MM" (monthly) or "YYYY:Q"
   (quarterly).

   Returns 1 if the labels were rewritten, otherwise 0.
*/

static int fix_mon_year_labels (DATASET *dset)
{
    char *s1 = dset->S[0];
    char *s2 = dset->S[dset->n - 1];
    char m1[4] = {0};
    char m2[4] = {0};
    int yr1 = 0, yr2 = 0;
    int ret = 0;

    /* quick screen on the first and last labels */
    if (strlen(s1) == 8 && strlen(s2) == 8 &&
        s1[3] == '-' && s2[3] == '-') {
        yr1 = atoi(s1 + 4);
        yr2 = atoi(s2 + 4);
        strncat(m1, s1, 3);
        strncat(m2, s2, 3);
    }

    if (yr1 > 999 && yr1 < 3000 && yr2 > 999 && yr2 < 3000 &&
        month_number(m1) && month_number(m2)) {
        int i, p, pbak = 0;
        int dt, pd = 0;
        char *s;

        /* first pass: validate all labels and infer the frequency
           from the first positive month step
        */
        for (i=0; i<dset->n; i++) {
            s = dset->S[i];
            if (strlen(s) != 8 || s[3] != '-') {
                pd = 0;
                break;
            }
            yr1 = atoi(s + 4);
            *m1 = '\0';
            strncat(m1, s, 3);
            if (yr1 < 1000 || yr1 >= 3000 ||
                (p = month_number(m1)) < 1) {
                pd = 0;
                break;
            }
            if (i > 0) {
                dt = p - pbak;
                if (dt != 1 && dt != 3 && p != 1) {
                    pd = 0;
                    break;
                }
                if (pd == 0 && dt > 0) {
                    pd = (dt == 1)? 12 : 4;
                }
            }
            pbak = p;
        }

        if (pd > 0) {
            /* second pass: rewrite; the result is at most 7
               characters so it fits in the 8-character original
            */
            for (i=0; i<dset->n; i++) {
                s = dset->S[i];
                yr1 = atoi(s + 4);
                *m1 = '\0';
                strncat(m1, s, 3);
                p = month_number(m1);
                if (pd == 12) {
                    sprintf(dset->S[i], "%d:%02d", yr1, p);
                } else {
                    /* quarter containing month p: integer arithmetic
                       gives 1..4 for any month, not only for the
                       first month of each quarter
                    */
                    sprintf(dset->S[i], "%d:%d", yr1, (p + 2) / 3);
                }
            }
            ret = 1;
        }
    }

    return ret;
}
1347 
1348 /* Attempt to parse CSV row labels as dates.  Return -1 if this
1349    doesn't work out, or 0 if the labels seem to be just integer
1350    observation numbers, else return the inferred data frequency.
1351 */
1352 
/* @dset: dataset whose observation labels (dset->S) are examined.
   @reversed: location to receive 1 if the dates appear to run in
   reverse order; reset to 0 if no time-series interpretation works.
   @skipstr: if non-NULL and non-empty, the labels are passed
   directly to time_series_label_check().
   @prn: printer for diagnostic messages.
*/

int test_markers_for_dates (DATASET *dset, int *reversed,
                            char *skipstr, PRN *prn)
{
    char endobs[OBSLEN];
    int n = dset->n;
    char *lbl1 = dset->S[0];
    char *lbl2 = dset->S[n - 1];
    int len1 = strlen(lbl1);
    int len2 = strlen(lbl2);
    int pd = -1;

    if (skipstr != NULL && *skipstr != '\0') {
        return time_series_label_check(dset, *reversed, skipstr, 0, prn);
    }

    pprintf(prn, _("   first row label \"%s\", last label \"%s\"\n"),
            lbl1, lbl2);

    /* are the labels (probably) just 1, 2, 3 etc.? */
    sprintf(endobs, "%d", n);
    if (!strcmp(lbl1, "1") && !strcmp(lbl2, endobs)) {
        return 0;
    }

    if (fix_IFS_data_labels(dset) || fix_mon_year_labels(dset)) {
        /* the labels were rewritten (and possibly reallocated), so
           refresh both the pointers and both lengths: a stale
           length for the last label would misdirect the checks
           below
        */
        lbl1 = dset->S[0];
        lbl2 = dset->S[n - 1];
        len1 = strlen(lbl1);
        len2 = strlen(lbl2);
    }

    /* labels are of different lengths? */
    if (len1 != len2) {
        if (abs(len1 - len2) > 1) {
            return -1;
        } else if (len2 > len1) {
            len1 = len2;
        }
    }

    pputs(prn, _("trying to parse row labels as dates...\n"));

    if (len1 == 8 || len1 == 10) {
        /* daily data? */
        pd = csv_daily_date_check(dset, reversed, skipstr, prn);
    } else if (len1 >= 4) {
        /* annual, quarterly, monthly? */
        if (isdigit((unsigned char) lbl1[0]) &&
            isdigit((unsigned char) lbl1[1]) &&
            isdigit((unsigned char) lbl1[2]) &&
            isdigit((unsigned char) lbl1[3])) {
            *reversed = dates_maybe_reversed(lbl1, lbl2, prn);
            pd = time_series_label_check(dset, *reversed, skipstr, 0, prn);
        } else {
            pputs(prn, _("   definitely not a four-digit year\n"));
        }
    }

    if (pd <= 0 && *reversed) {
        /* give up the "reversed" notion if we didn't get
           a workable time-series interpretation */
        *reversed = 0;
    }

    return pd;
}
1418 
/* Called when a byte that is neither printable nor whitespace has
   been read from @fp, with @pos the number of characters already
   counted on the current line. Re-reads the line from its start
   up to 8 bytes beyond the offending byte and checks whether that
   text is valid UTF-8 or, failing that, convertible from
   ISO-8859-15 to UTF-8.

   Returns 1 if the text is acceptable, 0 if not (or if memory
   could not be allocated). The read position of @fp is restored
   before returning.
*/

static int utf8_ok (gzFile fp, int pos)
{
    long mark = gztell(fp);
    int len = pos + 9;
    char *test = malloc(len + 1);
    int i, c, ret = 0;

    if (test == NULL) {
        /* nothing has been read or repositioned at this point */
        return 0;
    }

    gzseek(fp, mark - pos - 1, SEEK_SET);

    for (i=0; i<len; i++) {
        c = gzgetc(fp);
        if (c == EOF) {
            /* don't store the EOF marker as if it were data */
            break;
        }
        test[i] = c;
    }
    test[i] = '\0';

    if (g_utf8_validate(test, -1, NULL)) {
        ret = 1;
    } else {
        GError *gerr = NULL;
        gsize wrote = 0;
        gchar *tr;

        /* try for iso-8859? */
        tr = g_convert(test, -1, "UTF-8", "ISO-8859-15",
                       NULL, &wrote, &gerr);
        if (gerr != NULL) {
            g_error_free(gerr);
        } else {
            g_free(tr);
            ret = 1;
        }
    }

    free(test);

    gzseek(fp, mark, SEEK_SET);

    return ret;
}
1457 
/* Codes for the unicode encoding indicated by a Byte Order Mark:
   csv_unicode_check() assigns one of these to its @ucode value
   and csv_recode_input() selects the source encoding based on it.
   All are non-zero, so 0 can serve as "nothing to recode".
*/
enum {
    UTF_8 = 1,
    UTF_16,
    UTF_32
};
1463 
1464 /* If we got a UTF-16 or UTF-32 BOM, try recoding to
1465    UTF-8 before parsing data. We write the recoded text
1466    to a temporary file in the user's "dotdir" (and
1467    then delete that file once we're done).
1468 */
1469 
/* @fpp: pointer to the currently open stream, which is closed;
   on success it is replaced by a stream on the recoded file, on
   failure it is left as NULL.
   @fname: name of the original input file.
   @pfname: on success, receives the newly allocated name of the
   temporary recoded file.
   @ucode: UTF_32 selects "UTF-32" as source encoding, any other
   value selects "UTF-16".
   @prn: printer for messages.

   Returns 0 on success, non-zero error code otherwise.
*/

static int csv_recode_input (gzFile *fpp,
                             const char *fname,
                             gchar **pfname,
                             int ucode,
                             PRN *prn)
{
    const gchar *src_enc = "UTF-16";
    gchar *tmpname;
    int err;

    if (ucode == UTF_32) {
        src_enc = "UTF-32";
    }

    /* the current stream can't be used as it stands */
    gzclose(*fpp);
    *fpp = NULL;

    /* recode to a temporary file in dotdir */
    tmpname = g_strdup_printf("%srecode_tmp.u8", gretl_dotdir());
    err = gretl_recode_file(fname, tmpname, src_enc, "UTF-8", prn);

    if (err) {
        g_free(tmpname);
        return err;
    }

    /* try attaching the stream to the recoded file */
    *fpp = gretl_gzopen(tmpname, "rb");

    if (*fpp == NULL) {
        gretl_remove(tmpname);
        g_free(tmpname);
        return E_FOPEN;
    }

    pputs(prn, "switched to recoded input\n");
    *pfname = tmpname; /* ownership passes to the caller */

    return 0;
}
1511 
1512 /* Check the first 4 bytes of "CSV" input for a Byte Order
1513    Mark. If we find the UTF-8 BOM (typically written by
1514    Microsoft tools), simply record the fact so that we can
1515    skip it on reading. But if we find a BOM indicating a
1516    16-bit or 32-bit unicode encoding, flag this by returning
1517    a non-zero @ucode value; in that case we'll attempt a
   full recoding of the input (via GLib) before we start
1519    reading data.
1520 */
1521 
/* @fp: stream to examine; on return it is positioned just after
   a UTF-8 BOM if one was found, otherwise at the start.
   @c: CSV reader state, flagged via csv_set_has_bom() if a UTF-8
   BOM is present.
   @prn: printer for messages.

   Returns UTF_16 or UTF_32 if a BOM for one of those encodings
   was found, otherwise 0.
*/

static int csv_unicode_check (gzFile fp, csvdata *c, PRN *prn)
{
    unsigned char b[4];
    int n = gzread(fp, b, 4);
    int ucode = 0;

    if (n == 4) {
        if (b[0] == 0xEF && b[1] == 0xBB && b[2] == 0xBF) {
            pputs(prn, "got UTF-8 BOM\n");
            ucode = UTF_8;
        } else if (b[0] == 0xFE && b[1] == 0xFF) {
            pputs(prn, "got UTF-16BE, will try recoding\n");
            ucode = UTF_16;
        } else if (b[0] == 0xFF && b[1] == 0xFE) {
            /* FF FE 00 00 is UTF-32LE, FF FE otherwise UTF-16LE */
            if (b[2] == 0 && b[3] == 0) {
                pputs(prn, "got UTF-32LE, will try recoding\n");
                ucode = UTF_32;
            } else {
                pputs(prn, "got UTF-16LE, will try recoding\n");
                ucode = UTF_16;
            }
        } else if (b[0] == 0 && b[1] == 0 &&
                   b[2] == 0xFE && b[3] == 0xFF) {
            /* the UTF-32BE BOM is 00 00 FE FF: the last two bytes
               must be tested, not the first two again */
            pputs(prn, "got UTF-32BE, will try recoding\n");
            ucode = UTF_32;
        }
    }

    if (ucode == UTF_8) {
        /* no recoding needed: just skip the 3-byte BOM */
        csv_set_has_bom(c);
        gzseek(fp, 3, SEEK_SET);
        ucode = 0;
    } else {
        gzrewind(fp);
    }

    return ucode;
}
1560 
1561 /* The function below checks for the maximum line length in the given
1562    file.  It also checks for extraneous binary data (the file is
1563    supposed to be plain text), and checks whether the 'delim'
1564    character is present in the file, on a non-comment line (where
1565    a comment line is one that starts with '#').
1566 
1567    In addition, we check whether the file has a trailing comma on every
1568    line, and for the numbers of double- and single-quote characters
1569    to try to determine which, if either, is used to indicate quoted
1570    fields in the input.
1571 */
1572 
/* @fp: stream to scan, read one character at a time until EOF.
   @cdata: CSV reader state; the trailing-comma, got-tab, got-semi
   and got-delim flags and the @qchar member may be set here.
   @prn: printer for messages.

   Returns -1 if apparently binary data are found; otherwise the
   length of the longest newline-terminated line, plus 2 (and plus
   1 more if a CR+LF pair was seen), or 0 if no such line has
   positive length.
*/
static int csv_max_line_length (gzFile fp, csvdata *cdata, PRN *prn)
{
    /* c: current character; c1: look-ahead character; cbak: most
       recent non-newline character; cc: character count on the
       current line */
    int c, c1, cbak = 0, cc = 0;
    int comment = 0, maxlinelen = 0;
    /* per-line quote counts: largest seen, and a smaller non-zero
       count if one was seen after the largest so far */
    int max_ldquo = 0, max_lsquo = 0;
    int min_ldquo = 0, min_lsquo = 0;
    /* quote counts for the current line */
    int ldquo = 0, lsquo = 0;
    /* quote counts for the whole file */
    int ndquo = 0, nsquo = 0;
    int crlf = 0, lines = 0;

    csv_set_trailing_comma(cdata); /* just provisionally */

    while ((c = gzgetc(fp)) != EOF) {
        if (c == 0x0d) {
            /* CR */
            c1 = gzgetc(fp);
            if (c1 == EOF) {
                break;
            } else if (c1 == 0x0a) {
                /* CR + LF -> LF */
                crlf = 1;
                c = c1;
            } else {
                /* Mac-style: CR not followed by LF */
                c = 0x0a;
                gzungetc(c1, fp);
            }
        }
        if (c == 0x0a) {
            /* end of line: update the per-line statistics.
               NOTE(review): maxlinelen is updated only here, so a
               final line lacking a newline is not measured.
            */
            if (cc > maxlinelen) {
                maxlinelen = cc;
            }
            cc = 0;
            /* a line ending in anything but a comma rules out the
               "trailing comma on every line" hypothesis */
            if (cbak != 0 && cbak != ',') {
                csv_unset_trailing_comma(cdata);
            }
            lines++;
            if (ldquo > max_ldquo) {
                max_ldquo = ldquo;
            } else if (ldquo > 0 && ldquo < max_ldquo) {
                min_ldquo = ldquo;
            }
            if (lsquo > max_lsquo) {
                max_lsquo = lsquo;
            } else if (lsquo > 0 && lsquo < max_lsquo) {
                min_lsquo = lsquo;
            }
            ldquo = lsquo = 0;
            continue;
        }
        cbak = c;
        /* reject a byte that's neither whitespace, printable nor
           CTRLZ unless utf8_ok() accepts the text around it */
        if (!isspace((unsigned char) c) && !isprint((unsigned char) c) &&
            !(c == CTRLZ) && !utf8_ok(fp, cc)) {
            pprintf(prn, _("Binary data (%d) encountered (line %d:%d): "
                           "this is not a valid text file\n"),
                    c, lines + 1, cc + 1);
            return -1;
        }
        if (cc == 0) {
            /* a line is a comment if its first character is '#' */
            comment = (c == '#');
        }
        if (!comment) {
            if (c == '\t') {
                /* let's ignore trailing tabs in this heuristic */
                c1 = gzgetc(fp);
                if (c1 != 0x0d && c1 != 0x0a) {
                    csv_set_got_tab(cdata);
                }
                gzungetc(c1, fp);
            }
            if (c == ';') {
                csv_set_got_semi(cdata);
            }
            if (c == cdata->delim) {
                csv_set_got_delim(cdata);
            } else if (c == '"') {
                ldquo++;
                ndquo++;
            } else if (c == '\'') {
                lsquo++;
                nsquo++;
            }
        }
        cc++;
    }

    if (maxlinelen == 0) {
        pputs(prn, _("Data file is empty\n"));
    } else if (csv_has_trailing_comma(cdata)) {
        pputs(prn, _("Data file has trailing commas\n"));
    }

    if (ndquo > 0 || nsquo > 0) {
        /* candidates for quotation character? */
        /* cands[0]: double-quote, cands[1]: single-quote */
        int cands[2] = {0};

        if (ndquo > 0) {
            pprintf(prn, _("Found %d double-quotes, max %d per line\n"),
                    ndquo, max_ldquo);
        }
        if (nsquo > 0) {
            pprintf(prn, _("Found %d single-quotes, max %d per line\n"),
                    nsquo, max_lsquo);
        }
        /* a quote character qualifies if its maximum per-line
           count is even and no odd smaller count was recorded */
        if (max_ldquo > 0 && max_ldquo % 2 == 0) {
            /* double-quote is a candidate? */
            if (min_ldquo > 0 && min_ldquo % 2) {
                ; /* nope */
            } else {
                cands[0] = 1;
            }
        }
        if (max_lsquo > 0 && max_lsquo % 2 == 0) {
            /* single-quote is a candidate? */
            if (min_lsquo > 0 && min_lsquo % 2) {
                ; /* nope */
            } else {
                cands[1] = 1;
            }
        }
        if (cands[0] && cands[1]) {
            /* hmm, rule one out: prefer the more numerous */
            if (nsquo > ndquo) {
                cands[0] = 0;
            } else {
                cands[1] = 0;
            }
        }
        if (cands[0]) {
            pputs(prn, _("Assuming double-quote is the relevant "
                         "quotation character\n"));
            cdata->qchar = '"';
        } else if (cands[1]) {
            pputs(prn, _("Assuming single-quote is the relevant "
                         "quotation character\n"));
            cdata->qchar = '\'';
        }
    }

    if (maxlinelen > 0) {
        /* allow for newline and null terminator */
        maxlinelen += 2 + crlf;
    }

    return maxlinelen;
}
1719 
/* Note: despite its name, this is true when @d is anything other
   than comma, semicolon or tab, which includes the plain space
   delimiter. The argument is evaluated more than once.
*/
#define nonspace_delim(d) ((d) != ',' && (d) != ';' && (d) != '\t')
1721 
/* Count the fields on the current line, c->line, as one more than
   the number of delimiters found outside of quotes (quotes being
   tracked only if csv_keep_quotes() is true). A single leading
   space is skipped when the delimiter is space.
*/

static int count_csv_fields (csvdata *c)
{
    const char *p = c->line;
    const char *start;
    int quoted = 0;
    int count = 1;

    if (*p == ' ' && c->delim == ' ') {
        p++;
    }

    start = p;

    for ( ; *p != '\0'; p++) {
        if (csv_keep_quotes(c) && *p == c->qchar) {
            quoted = !quoted;
        } else if (!quoted && *p == c->delim) {
            count++;
        }
    }

    /* Problem: (when) should a trailing delimiter be read as an
       implicit NA? The count is reduced by one for a trailing
       delimiter that is not comma, semicolon or tab.
    */
    if (p > start && p[-1] == c->delim && nonspace_delim(c->delim)) {
        count--;
    }

    return count;
}
1751 
/* Replace, in place, every comma that lies between a pair of
   double-quote characters in @s with a space.
*/

static void purge_quoted_commas (char *s)
{
    int quoted = 0;
    char *p;

    for (p = s; *p != '\0'; p++) {
        switch (*p) {
        case '"':
            quoted = !quoted;
            break;
        case ',':
            if (quoted) {
                *p = ' ';
            }
            break;
        default:
            break;
        }
    }
}
1765 
/* Remove, in place, every space in @s that does not lie between
   a pair of double-quote characters.

   The string is compacted in a single pass with separate read and
   write positions, so that every character is examined exactly
   once: consecutive spaces are all removed, and a quote that
   directly follows a removed space is still recognized.
*/

static void purge_unquoted_spaces (char *s)
{
    const char *src = s;
    char *dst = s;
    int inquote = 0;

    while (*src) {
        if (*src == '"') {
            inquote = !inquote;
        }
        if (inquote || *src != ' ') {
            *dst++ = *src;
        }
        src++;
    }

    *dst = '\0';
}
1779 
/* Tidy up the current input line, c->line, in place: strip a
   trailing LF and/or CR; if quotes are not being kept and the
   delimiter is comma, blank out commas inside double quotes; deal
   with spaces (remove unquoted spaces if @nospace is non-zero and
   the delimiter is not space, or call compress_spaces() if the
   delimiter is space); delete double quotes unless they are
   being kept; and chop the final character if the file has been
   flagged as having trailing commas.
*/

static void compress_csv_line (csvdata *c, int nospace)
{
    int n = strlen(c->line);

    /* the n > 0 guards ensure we never look at the byte before
       the start of the buffer, for an empty line or one that
       holds only a newline
    */
    if (n > 0 && c->line[n-1] == 0x0a) {
        c->line[--n] = '\0';
    }

    if (n > 0 && c->line[n-1] == 0x0d) {
        c->line[--n] = '\0';
    }

    if (!csv_keep_quotes(c) && c->delim == ',') {
        purge_quoted_commas(c->line);
    }

    if (c->delim != ' ') {
        if (nospace) {
            purge_unquoted_spaces(c->line);
        }
    } else {
        compress_spaces(c->line);
    }

    if (!csv_keep_quotes(c)) {
        gretl_delchar('"', c->line);
    }

    if (csv_has_trailing_comma(c)) {
        /* chop trailing comma */
        n = strlen(c->line);
        if (n > 0) {
            c->line[n-1] = '\0';
        }
    }
}
1818 
import_obs_label(const char * s)1819 int import_obs_label (const char *s)
1820 {
1821     char tmp[VNAMELEN];
1822 
1823     if (s == NULL) {
1824         return 1;
1825     }
1826 
1827     if (!strcmp(s, "\"\"") || !strcmp(s, "''")) {
1828         return 1;
1829     }
1830 
1831     if (*s == '"' || *s == '\'') s++;
1832 
1833     if (*s == '\0') {
1834         return 1;
1835     }
1836 
1837     if (strlen(s) > VNAMELEN - 1) {
1838         return 0;
1839     }
1840 
1841     *tmp = '\0';
1842     strncat(tmp, s, VNAMELEN - 1);
1843     gretl_lower(tmp);
1844 
1845     return (!strcmp(tmp, "obs") ||
1846             !strcmp(tmp, "date") ||
1847             !strcmp(tmp, "year") ||
1848             !strcmp(tmp, "period") ||
1849             !strcmp(tmp, "observation") ||
1850             !strcmp(tmp, "observation_date"));
1851 }
1852 
/* Returns 1 if the non-empty string @s matches one of the
   (non-NULL) column names held in c->jspec, otherwise 0.
*/

static int join_wants_col_zero (csvdata *c, const char *s)
{
    int wanted = 0;
    int k = 0;

    if (*s == '\0') {
        return 0;
    }

    while (!wanted && k < c->jspec->ncols) {
        const char *name = c->jspec->colnames[k++];

        if (name != NULL && strcmp(s, name) == 0) {
            wanted = 1;
        }
    }

    return wanted;
}
1871 
/* Examine the first field on @line. If the line starts with a
   (non-space) delimiter, flag a blank first column. Otherwise
   extract the first field and, unless the join specification
   wants that column or all columns are wanted as data, flag an
   observation-label column if import_obs_label() accepts the
   field. If a tab is met inside the first field, the delimiter
   is switched to tab and the check is restarted.
*/

static void check_first_field (const char *line, csvdata *c, PRN *prn)
{
    const char *s;

 tryagain:
    s = line;

    if (c->delim != ' ' && *s == c->delim) {
        csv_set_blank_column(c);
    } else {
        char field1[OBSLEN];
        size_t i = 0;

        if (c->delim == ' ' && *s == ' ') {
            s++;
        }

        /* leave room for the terminating NUL: a long first field
           is truncated rather than overrunning the buffer */
        while (*s && i < sizeof field1 - 1) {
            if (*s == c->delim) {
                break;
            } else if (*s == '\t') {
                /* presence of a tab must indicate tab-separation? */
                c->delim = '\t';
                goto tryagain;
            }
            field1[i++] = *s++;
        }

        field1[i] = '\0';
        iso_to_ascii(field1);

        if (joining(c) && join_wants_col_zero(c, field1)) {
            return;
        } else if (csv_all_cols(c)) {
            /* open/append wants all columns as data */
            return;
        }

        pprintf(prn, _("   first field: '%s'\n"), field1);

        if (import_obs_label(field1)) {
            pputs(prn, _("   seems to be observation label\n"));
            csv_set_obs_column(c);
        }
    }
}
1918 
import_na_init(void)1919 void import_na_init (void)
1920 {
1921     const char *s = get_csv_na_read_string();
1922 
1923     strcpy(import_na, s);
1924 }
1925 
/* Returns 1 if the string @s should be counted as representing
1927    an NA or missing value, 0 otherwise. If there is a user-set
1928    "csv_read_na" value this is used for comparison, otherwise
1929    a set of default values is consulted.
1930 */
1931 
import_na_string(const char * s)1932 int import_na_string (const char *s)
1933 {
1934     if (*import_na != '\0' && strcmp(import_na, "default")) {
1935         /* the user has set a specific "NA" string, so
1936            respect it */
1937         return !strcmp(s, import_na);
1938     } else {
1939         /* consult a list of common representations of NA */
1940         const char *defaults[] = {
1941             "NA",
1942             "N.A.",
1943             "n.a.",
1944             "na",
1945             "n/a",
1946             "N/A",
1947             "#N/A",
1948             "NaN",
1949             ".NaN",
1950             ".",
1951             "..",
1952             "-999",
1953             "-9999",
1954             "-",
1955             NULL
1956         };
1957         int i;
1958 
1959         for (i=0; defaults[i] != NULL; i++) {
1960             if (!strcmp(s, defaults[i])) {
1961                 return 1;
1962             }
1963         }
1964     }
1965 
1966     return 0;
1967 }
1968 
/* Returns 1 if @str, the cell for variable @i at observation @t,
   should be read as a missing value: it is empty, is an empty
   pair of double quotes, or is recognized by import_na_string().
   Otherwise returns 0.

   If @miss_shown is non-NULL a message is printed to @prn, and
   the counter incremented, provided that @t < 80 or the counter
   is still below @i.
*/

static int csv_missval (const char *str, int i, int t,
                        int *miss_shown, PRN *prn)
{
    /* 2021-03-03: let '""' indicate missing */
    int is_empty = (str[0] == '\0' || strcmp(str, "\"\"") == 0);
    int is_na;

    if (is_empty && miss_shown != NULL &&
        (t < 80 || *miss_shown < i)) {
        pprintf(prn, _("   the cell for variable %d, obs %d "
                       "is empty: treating as missing value\n"),
                i, t);
        *miss_shown += 1;
    }

    is_na = import_na_string(str);

    if (is_na && miss_shown != NULL &&
        (t < 80 || *miss_shown < i)) {
        pprintf(prn, _("   warning: missing value for variable "
                       "%d, obs %d\n"), i, t);
        *miss_shown += 1;
    }

    return is_empty || is_na;
}
2000 
2001 /* In the case where we think we've found thousands
2002    separators in numerical input, provisionally mark
2003    all "non-numeric" values as NAs; we do this prior
2004    to a second pass through the data.
2005 */
2006 
revise_non_numeric_values(csvdata * c)2007 static void revise_non_numeric_values (csvdata *c)
2008 {
2009     int i, t;
2010 
2011     for (i=1; i<c->dset->v; i++) {
2012         for (t=0; t<c->dset->n; t++) {
2013             if (c->dset->Z[i][t] == NON_NUMERIC) {
2014                 c->dset->Z[i][t] = NADBL;
2015             }
2016         }
2017     }
2018 }
2019 
non_numeric_check(DATASET * dset,int ** plist,gretl_string_table ** pst,PRN * prn)2020 int non_numeric_check (DATASET *dset, int **plist,
2021                        gretl_string_table **pst,
2022                        PRN *prn)
2023 {
2024     int *list = NULL;
2025     int i, j, t, nn = 0;
2026     int err = 0;
2027 
2028 #if CDEBUG > 1
2029     fprintf(stderr, "non_numeric_check: testing %d series, pst = %p\n",
2030             dset->v - 1, (void *) pst);
2031 #endif
2032 
2033     if (pst == NULL) {
2034         /* not interested in string-valued series/columns */
2035         for (i=1; i<dset->v; i++) {
2036             for (t=0; t<dset->n; t++) {
2037                 if (dset->Z[i][t] == NON_NUMERIC) {
2038                     dset->Z[i][t] = NADBL;
2039                 }
2040             }
2041         }
2042         return 0;
2043     }
2044 
2045     for (i=1; i<dset->v; i++) {
2046         for (t=0; t<dset->n; t++) {
2047             if (dset->Z[i][t] == NON_NUMERIC) {
2048                 nn++;
2049                 break;
2050             }
2051         }
2052     }
2053 
2054 #if CDEBUG > 1
2055     fprintf(stderr, " found %d candidate series\n", nn);
2056 #endif
2057 
2058     if (nn == 0) {
2059         return 0; /* nothing to be done */
2060     }
2061 
2062     list = gretl_list_new(nn);
2063     if (list == NULL) {
2064         return E_ALLOC;
2065     }
2066 
2067     j = 1;
2068     for (i=1; i<dset->v; i++) {
2069         for (t=0; t<dset->n; t++) {
2070             if (dset->Z[i][t] == NON_NUMERIC) {
2071                 list[j++] = i;
2072                 break;
2073             }
2074         }
2075     }
2076 
2077 #if CDEBUG > 1
2078     printlist(list, "non-numeric vars list");
2079 #endif
2080 
2081     for (i=1; i<=list[0]; i++) {
2082         /* check each member of @list */
2083         double nnfrac;
2084         int nnon = 0;
2085         int tnon = -1;
2086         int nok = 0;
2087         int v = list[i];
2088 
2089         series_set_flag(dset, v, VAR_DISCRETE);
2090 
2091         for (t=0; t<dset->n; t++) {
2092             if (dset->Z[v][t] == NON_NUMERIC) {
2093                 if (tnon < 0) {
2094                     /* record the first non-numeric obs */
2095                     tnon = t + 1;
2096                 }
2097                 nnon++;
2098             } else if (!na(dset->Z[v][t])) {
2099                 nok++;
2100             }
2101         }
2102 
2103         nnfrac = (nok == 0)? 1.0 : nnon / (double) (nnon + nok);
2104         pprintf(prn, _("variable %d (%s): non-numeric values = %d "
2105                        "(%.2f percent)\n"), v, dset->varname[v],
2106                 nnon, 100 * nnfrac);
2107         if ((nnon < 2 && dset->n > 2) || nnfrac < 0.05) {
2108             /* if we got just a few non-numeric values, we'll assume
2109                that the data file is broken
2110             */
2111             pprintf(prn, _("ERROR: variable %d (%s), observation %d, "
2112                            "expected numeric value\n"),
2113                     v, dset->varname[v], tnon);
2114             err = E_DATA;
2115             break;
2116         }
2117     }
2118 
2119     if (!err) {
2120         pputs(prn, _("allocating string table\n"));
2121         *pst = gretl_string_table_new(list);
2122         if (*pst == NULL) {
2123             err = E_ALLOC;
2124         }
2125     }
2126 
2127     if (err) {
2128         free(list);
2129     } else {
2130         *plist = list;
2131     }
2132 
2133     return err;
2134 }
2135 
/* Wrapper for non_numeric_check() on the dataset attached to @c.
   No string table is requested when csv_as_matrix() holds for @c.
   On success the resulting list and string table (either of which
   may be NULL) are stored in @c; on failure @c is left alone and
   the error code is returned.
*/

static int csv_non_numeric_check (csvdata *c, PRN *prn)
{
    gretl_string_table *st = NULL;
    gretl_string_table **pst = &st;
    int *nlist = NULL;
    int err;

    if (csv_as_matrix(c)) {
        pst = NULL;
    }

    err = non_numeric_check(c->dset, &nlist, pst, prn);
    if (err) {
        return err;
    }

    c->codelist = nlist;
    c->st = st;

    return 0;
}
2155 
2156 /* Handle the case in "join" where the user specified some time
2157    columns for conversion to numeric and also gave a specific format
2158    for the conversion.
2159 */
2160 
special_time_val(const char * s,const char * fmt,int m_means_q)2161 static double special_time_val (const char *s, const char *fmt,
2162                                 int m_means_q)
2163 {
2164     struct tm t = {0};
2165     char *test;
2166 
2167     test = strptime(s, fmt, &t);
2168 
2169     if (test == NULL || *test != '\0') {
2170         /* conversion didn't work right */
2171         return NADBL;
2172     } else {
2173         int y, m, d;
2174 
2175         y = t.tm_year + 1900;
2176         m = t.tm_mon + 1;
2177         d = t.tm_mday;
2178 
2179         if (m_means_q) {
2180             /* convert to 1st month of quarter */
2181             if (m == 2) m = 4;
2182             else if (m == 3) m = 7;
2183             else if (m == 4) m = 10;
2184             else if (m != 1) {
2185                 return NADBL;
2186             }
2187         }
2188 
2189         if (d == 0) d = 1;
2190 
2191         return 10000*y + 100*m + d;
2192     }
2193 }
2194 
/* Returns the number of occurrences of the character @c in the
   NUL-terminated string @s.
*/

static int char_count (char c, const char *s)
{
    const char *p;
    int hits = 0;

    for (p = s; *p != '\0'; p++) {
        hits += (*p == c);
    }

    return hits;
}
2206 
2207 /* Follow-up check for the case where we think we might
2208    have found a thousands separator: each occurrence of
2209    the putative separator must be followed by exactly 3
2210    digits: we set c->thousep to an invalid value if this
2211    is not the case.
2212 */
2213 
validate_thousep(csvdata * c,const char * s)2214 static void validate_thousep (csvdata *c, const char *s)
2215 {
2216     int nd;
2217 
2218     while (*s) {
2219         if (*s == c->thousep) {
2220             nd = 0;
2221             s++;
2222             while (*s) {
2223                 if (isdigit(*s)) {
2224                     nd++;
2225                     s++;
2226                 } else {
2227                     break;
2228                 }
2229             }
2230             if (nd != 3) {
2231                 /* nope! */
2232 #if CDEBUG
2233                 fprintf(stderr, "validate_thousep: no: '%c' is followed by %d digits\n",
2234                         c->thousep, nd);
2235 #endif
2236                 c->thousep = -1;
2237                 break;
2238             }
2239         } else {
2240             s++;
2241         }
2242     }
2243 }
2244 
2245 /* Initial heuristic for detecting a thousands separator,
2246    where the string @s has been determined to contain
2247    nothing but digits, dot and comma (allowing for a leading
2248    minus).
2249 
2250    1) If the string contains both comma and dot, whichever
2251    character appears to the left cannot be the decimal
2252    separator and may be a thousands separator.
2253 
2254    2) If more than one comma appears in the string, comma
2255    cannot be the decimal character and might be a thousands
2256    separator; mutatis mutandis for dot.
2257 */
2258 
test_for_thousands_sep(csvdata * c,const char * s)2259 static void test_for_thousands_sep (csvdata *c, const char *s)
2260 {
2261     const char *p1 = strrchr(s, '.');
2262     const char *p2 = strrchr(s, ',');
2263     char thousep = 0;
2264 
2265     if (p1 != NULL && p2 != NULL) {
2266         thousep = (p2 - p1 > 0)? '.' : ',';
2267     } else if (p1 != NULL && char_count('.', s) > 0) {
2268         thousep = '.';
2269     } else if (p2 != NULL && char_count(',', s) > 0) {
2270         thousep = ',';
2271     }
2272 
2273     if (c->thousep > 0) {
2274         if (thousep != 0 && thousep != c->thousep) {
2275             /* no consistent interpretation exists */
2276             c->thousep = -1; /* invalid */
2277         }
2278     } else if (thousep != 0) {
2279         /* we have a candidate for testing */
2280         char *test, tmp[CSVSTRLEN];
2281 
2282         strcpy(tmp, s);
2283         gretl_delchar(thousep, tmp);
2284         if (thousep == '.' && get_local_decpoint() == '.') {
2285             gretl_charsub(tmp, ',', '.');
2286         }
2287         errno = 0;
2288         strtod(tmp, &test);
2289         if (*test == '\0' && errno == 0) {
2290             c->thousep = thousep;
2291         }
2292     }
2293 
2294     if (c->thousep && thousep != 0) {
2295         validate_thousep(c, s);
2296     }
2297 }
2298 
/* Returns 1 if @s, after skipping one optional leading minus sign,
   contains nothing but decimal digits, dot and comma; otherwise 0.
   Note that an empty remainder satisfies the test.
*/

static int all_digits_and_seps (const char *s)
{
    static const char allowed[] = "0123456789.,";
    const char *body = (*s == '-') ? s + 1 : s;

    /* the span of allowed characters must reach the terminator */
    return body[strspn(body, allowed)] == '\0';
}
2307 
/* Fallback evaluation of the field @s for series @i when plain
   numeric conversion has not succeeded.

   If the series carries the VAR_TIMECOL flag the field is read as a
   date: via special_time_val() if timecol_get_format() reports a
   format, otherwise by scanning three integers separated by '-'
   and forming 10000*y + 100*m + d. A failed scan yields NADBL.

   Otherwise the return value is NON_NUMERIC, but as a side effect
   test_for_thousands_sep() may be called to update c->thousep.
*/

static double eval_non_numeric (csvdata *c, int i, const char *s)
{
    if (series_get_flags(c->dset, i) & VAR_TIMECOL) {
        char *fmt = NULL;
        int mq = 0;
        int y, m, d;

        if (timecol_get_format(c->dset, i, &fmt, &mq)) {
            /* the user gave a specific format for this */
            return special_time_val(s, fmt, mq);
        }

        /* default: ISO 8601 extended */
        if (sscanf(s, "%d-%d-%d", &y, &m, &d) == 3) {
            return 10000*y + 100*m + d;
        }

        return NADBL;
    }

    /* Here we consider the possibility that although @s does not
       validate as numeric according to the C library, it is by
       intent numeric but includes one or more thousands
       separators.

       The condition c->thousep >= 0 requires that we haven't
       already ruled out this interpretation due to inconsistency,
       and !csv_scrub_thousep(c) requires that we're not on a
       second pass through the data.
    */
    if (c->thousep >= 0 && !csv_scrub_thousep(c) &&
        all_digits_and_seps(s)) {
        test_for_thousands_sep(c, s);
    }

    return NON_NUMERIC;
}
2348 
/* Judge the outcome of a strtod() call: @s is the source string
   (used only for error reporting via perror), @test is the end
   pointer set by strtod, and @x the converted value. The caller
   must have zeroed errno before the conversion.

   Returns 1 if all input was consumed and either errno is clear
   or we got ERANGE with 0 < |x| < 0.001; otherwise returns 0.
*/

static int converted_ok (const char *s, char *test, double x)
{
    if (*test != '\0') {
        /* unconverted trailing characters: definitely not OK */
        if (errno) {
            perror(s);
        }
        return 0;
    }

    if (errno == 0) {
        return 1;
    }

    if (errno == ERANGE && fabs(x) > 0 && fabs(x) < 0.001) {
        return 1; /* subnormal, but we'll let that pass */
    }

    perror(s);

    return 0;
}
2363 
/* If @s both begins and ends with a double-quote character (and is
   at least two characters long), remove that outer pair in place.
   In all other cases @s is left unmodified. Returns @s.
*/

static char *csv_unquote (char *s)
{
    size_t len;

    if (s[0] != '"') {
        return s;
    }

    len = strlen(s);

    if (len > 1 && s[len-1] == '"') {
        /* slide the interior one place to the left */
        memmove(s, s + 1, len - 2);
        s[len-2] = '\0';
    }

    return s;
}
2378 
/* Convert the current field, c->str, to a double on behalf of
   series @i. Conversion is attempted via strtod(), possibly after
   deleting thousands separators and/or substituting dot for comma.
   If no attempt passes converted_ok() the (unquoted) field is
   handed to eval_non_numeric() and its verdict is returned.
*/

static double csv_atof (csvdata *c, int i)
{
    char subst[CSVSTRLEN], scrubbed[CSVSTRLEN];
    const char *src = c->str;
    double val = NON_NUMERIC;
    char *endp;

    if (csv_scrub_thousep(c) && strchr(src, c->thousep) &&
        all_digits_and_seps(src)) {
        /* second pass through the data: pre-process fields
           that we reckon include thousands separators
        */
        strcpy(scrubbed, src);
        gretl_delchar(c->thousep, scrubbed);
        src = scrubbed;
    }

    if (c->decpoint == '.' || !csv_do_dotsub(c) || strchr(src, ',') == NULL) {
        /* either we're currently set to the correct locale,
           or there's no problematic decimal point in @src
        */
        errno = 0;
        val = strtod(src, &endp);
        if (converted_ok(src, endp, val)) {
            return val; /* handled */
        }
    } else if (csv_do_dotsub(c)) {
        /* in C numeric locale: substitute dot for comma */
        strcpy(subst, src);
        gretl_charsub(subst, ',', '.');
        errno = 0;
        val = strtod(subst, &endp);
        if (converted_ok(src, endp, val)) {
            return val; /* handled */
        }
    }

    if (c->decpoint == '.' && strchr(src, ',') != NULL) {
        /* try remediation for decimal comma? */
        strcpy(subst, src);
        gretl_charsub(subst, ',', '.');
        errno = 0;
        val = strtod(subst, &endp);
        if (converted_ok(src, endp, val)) {
            return val; /* handled */
        }
    }

    /* fallback */
    /* revised 2020-02-13 to use csv_unquote */
    return eval_non_numeric(c, i, csv_unquote(c->str));
}
2431 
/* Process the current field, c->str, as the value of series @i at
   (0-based) observation @t.

   When no string table is attached to @c, the field is either
   recorded as NA (per csv_missval) or stripped and converted via
   csv_atof(). When a string table is present only series listed in
   c->codelist are touched: the field is looked up in the table and
   its index stored as the series value.

   Returns 0, or E_DATA if the string table lookup does not yield a
   positive index.
*/

static int process_csv_obs (csvdata *c, int i, int t, int *miss_shown,
                            PRN *prn)
{
    if (c->st == NULL) {
        if (csv_missval(c->str, i, t+1, miss_shown, prn)) {
            c->dset->Z[i][t] = NADBL;
        } else {
            gretl_strstrip(c->str);
            c->dset->Z[i][t] = csv_atof(c, i);
        }
        return 0;
    }

    /* second round, handling string-valued variables */
    if (in_gretl_list(c->codelist, i)) {
        double zit = c->dset->Z[i][t];

        if (na(zit) && *c->str != '\0' && c->user_na == NULL) {
            /* by default (no user_na) only blanks count as NAs */
            zit = NON_NUMERIC;
        }

        if (!na(zit)) {
            int ix = gretl_string_table_index(c->st, c->str, i, 0, prn);

            if (ix <= 0) {
                return E_DATA;
            }
            c->dset->Z[i][t] = (double) ix;
        }
    }

    return 0;
}
2465 
2466 /* Emulation of fgets(), designed to handle any sort of line
2467    termination (unix, DOS, Mac or even an unholy mixture).
2468    Line-endings are converted to LF (0x0a).
2469 */
2470 
csv_fgets(csvdata * cdata,gzFile fp)2471 static char *csv_fgets (csvdata *cdata, gzFile fp)
2472 {
2473     char *s = cdata->line;
2474     int n = cdata->maxlinelen;
2475     int i, c1, c = 0;
2476 
2477     for (i=0; i<n-1 && c!=0x0a; i++) {
2478         c = gzgetc(fp);
2479         if (c == EOF) {
2480             if (i == 0) {
2481                 /* signal end of read */
2482                 return NULL;
2483             } else {
2484                 break;
2485             }
2486         } else if (c == 0x0d) {
2487             /* CR: convert to LF and peek at next char: if it's
2488                LF swallow it, otherwise put it back */
2489             c = 0x0a;
2490             c1 = gzgetc(fp);
2491             if (c1 != 0x0a) {
2492                 gzungetc(c1, fp);
2493             }
2494         }
2495         s[i] = c;
2496     }
2497 
2498     s[i] = '\0';
2499 
2500     return s;
2501 }
2502 
2503 /* pick up any comments following the data block in a CSV file */
2504 
get_csv_descrip(csvdata * c,gzFile fp)2505 static char *get_csv_descrip (csvdata *c, gzFile fp)
2506 {
2507     char *line = c->line;
2508     char *desc = NULL;
2509     size_t llen, totlen;
2510 
2511     while (csv_fgets(c, fp)) {
2512         tailstrip(line);
2513         llen = strlen(line);
2514         if (desc == NULL) {
2515             totlen = llen + 4;
2516             desc = malloc(totlen);
2517             if (desc == NULL) {
2518                 return NULL;
2519             }
2520             sprintf(desc, "%s\n", line);
2521         } else {
2522             char *tmp;
2523 
2524             totlen = strlen(desc) + llen + 4;
2525             tmp = realloc(desc, totlen);
2526             if (tmp == NULL) {
2527                 free(desc);
2528                 return NULL;
2529             }
2530             desc = tmp;
2531             strcat(desc, line);
2532             strcat(desc, "\n");
2533         }
2534     }
2535 
2536     if (desc != NULL && string_is_blank(desc)) {
2537         free(desc);
2538         desc = NULL;
2539     }
2540 
2541     return desc;
2542 }
2543 
/* Advisory text describing the layout expected of a CSV file:
   variable names in the first row, an optional first column of
   date strings or other markers, and an otherwise rectangular
   array of data. The string is wrapped in N_() -- presumably
   marking it for translation at the point of use; confirm
   against the macro's definition.
*/

static const char *
csv_msg = N_("\nPlease note:\n"
             "- The first row of the CSV file should contain the "
             "names of the variables.\n"
             "- The first column may optionally contain date "
             "strings or other 'markers':\n  in that case its row 1 entry "
             "should be blank, or should say 'obs' or 'date'.\n"
             "- The remainder of the file must be a rectangular "
             "array of data.\n");
2553 
/* Here we check whether we get a consistent reading on
   the number of fields per line in the CSV file.

   On entry c->ncols and c->nrows are zeroed; c->nrows then counts
   every line that is neither a comment (leading '#') nor blank.
   In the delimited case c->ncols is set from the first such line
   and every later line must agree with it; in the fixed-format
   case each line need only be long enough to reach the last entry
   in c->cols_list, and c->ncols is finally set to the number of
   entries in that list.

   Returns 0 on success or E_DATA on inconsistency.
*/

static int csv_fields_check (gzFile fp, csvdata *c, PRN *prn)
{
    int gotdata = 0; /* set once a first non-blank, non-comment line is seen */
    int chkcols = 0; /* field count (or line length) for the current line */
    int err = 0;

    c->ncols = c->nrows = 0;

    if (csv_has_bom(c)) {
        /* start reading 3 bytes into the file */
        gzseek(fp, 3, SEEK_SET);
    }

    while (csv_fgets(c, fp) && !err) {

        /* skip comment lines */
        if (*c->line == '#') {
            continue;
        }

        /* skip blank lines -- but finish if the blank comes after data */
        if (string_is_blank(c->line)) {
            if (gotdata) {
                if (!csv_have_data(c)) {
                    /* the rest of the file is read as a description */
                    c->descrip = get_csv_descrip(c, fp);
                }
                break;
            } else {
                continue;
            }
        }

        c->nrows += 1;

        if (fixed_format(c)) {
            /* fixed format: just check that the line is long enough
               to reach the last entry in cols_list */
            tailstrip(c->line);
            gotdata = 1;
            chkcols = strlen(c->line);
            if (chkcols < c->cols_list[c->cols_list[0]]) {
                gretl_errmsg_set(_("Invalid column specification"));
                err = E_DATA;
                break;
            } else {
                continue;
            }
        }

        compress_csv_line(c, 1);

        if (!gotdata) {
            /* scrutinize the first "real" line */
            check_first_field(c->line, c, prn);
            gotdata = 1;
        }

        chkcols = count_csv_fields(c);
        if (c->ncols == 0) {
            /* first counted line: this sets the reference count */
            c->ncols = chkcols;
            pprintf(prn, _("   number of columns = %d\n"), c->ncols);
        } else if (chkcols != c->ncols) {
            pprintf(prn, _("   ...but row %d has %d fields: aborting\n"),
                    c->nrows, chkcols);
            err = E_DATA;
        } else if (cols_subset(c)) {
            /* the last entry in cols_list must not exceed the number
               of data columns (which excludes column 1 if that is
               being skipped) */
            int datacols = csv_skip_col_1(c) ? (c->ncols - 1) : c->ncols;

            if (c->cols_list[c->cols_list[0]] > datacols) {
                gretl_errmsg_set(_("Invalid column specification"));
                err = E_DATA;
            }
        }
    }

    if (!err && fixed_format(c)) {
        /* one column per entry in the fixed-format specification */
        c->ncols = c->cols_list[0];
    }

    return err;
}
2636 
/* Remove from @s, in place, every character that is neither
   alphanumeric (per isalnum) nor an underscore. The retained
   characters keep their relative order.

   The compaction is done within @s itself -- the write position
   never overtakes the read position -- so no scratch buffer is
   required and input of any length is handled safely.
*/

static void strip_illegals (char *s)
{
    char *dst = s;

    for (; *s != '\0'; s++) {
        /* cast: <ctype.h> functions require an unsigned char value */
        if (isalnum((unsigned char) *s) || *s == '_') {
            *dst++ = *s;
        }
    }

    *dst = '\0';
}
2651 
/* Returns 1 if @s is exactly three characters long and, after
   being passed through gretl_lower(), compares equal to "nan";
   otherwise 0.
*/

static int intercept_nan_as_name (const char *s)
{
    char screen[4];

    if (strlen(s) != 3) {
        return 0;
    }

    /* work on a copy since @s is read-only */
    strcpy(screen, s);
    gretl_lower(screen);

    return strcmp(screen, "nan") == 0;
}
2666 
/* Returns the verdict of numeric_string() on @s. If c->decpoint is
   not '.', the test is applied to a copy of @s in which each comma
   has been replaced by a dot. If that copy cannot be allocated the
   string is reported as non-numeric (0) rather than dereferencing
   a NULL pointer.
*/

static int csv_is_numeric (const char *s, csvdata *c)
{
    int ret = 0;

    if (c->decpoint == '.') {
        ret = numeric_string(s);
    } else {
        /* decimal comma in force */
        char *tmp = gretl_strdup(s);

        if (tmp != NULL) {
            gretl_charsub(tmp, ',', '.');
            ret = numeric_string(tmp);
            free(tmp);
        }
    }

    return ret;
}
2684 
/* Derive a name for series @j from the current header field,
   c->str, writing it into c->dset->varname[j].

   - A field that reads as "nan" (per intercept_nan_as_name) is
     rejected with E_DATA.
   - An empty field gets the automatic name "v<j>".
   - A field that tests as numeric leaves the name empty and
     increments @numcount.
   - Otherwise any leading non-alphabetic characters are skipped,
     at most VNAMELEN - 1 characters are copied, and the result is
     passed through iso_to_ascii() and strip_illegals(). A reserved
     word has an underscore appended (or substituted for its last
     character if there's no room). The final name must pass
     check_varname(), else E_DATA.

   Returns 0 on success or E_DATA on failure.
*/

static int process_csv_varname (csvdata *c, int j, int *numcount,
                                PRN *prn)
{
    char *vname = c->dset->varname[j];
    char *src = c->str;
    int err = 0;

    *vname = '\0';

    if (intercept_nan_as_name(src)) {
        gretl_errmsg_sprintf(_("If '%s' is intended as the name of a variable, "
                               "please change it --\nstrings of this sort usually "
                               "mean 'not a number'."), src);
        err = E_DATA;
    } else if (*src == '\0') {
        fprintf(stderr, "variable name %d is missing\n", j);
        sprintf(vname, "v%d", j);
    } else if (csv_is_numeric(src, c)) {
        *numcount += 1;
    } else {
        const char *s = src;

        /* skip to the first alphabetic character; the cast keeps
           isalpha() well-defined for bytes with the high bit set
        */
        while (*s && !isalpha((unsigned char) *s)) s++;
        if (*s == '\0') {
            fprintf(stderr, "variable name %d (%s) is garbage\n", j, src);
            sprintf(vname, "v%d", j);
        } else {
            /* @vname is empty at this point, so this is a bounded copy */
            strncat(vname, s, VNAMELEN - 1);
        }
        iso_to_ascii(vname);
        strip_illegals(vname);
        if (gretl_reserved_word(vname)) {
            /* try a fix for this */
            int n = strlen(vname);

            if (n < VNAMELEN-1) {
                strcat(vname, "_");
            } else {
                vname[n-1] = '_';
            }
        }
        if (check_varname(vname)) {
            errmsg(1, prn);
            err = E_DATA;
        }
    }

    return err;
}
2734 
/* Allocate observation markers for @dset and, if that succeeds,
   drop its last variable. Returns 0 on success or the error code
   from whichever step failed.
*/

static int csv_reconfigure_for_markers (DATASET *dset)
{
    int err = dataset_allocate_obs_markers(dset);

    if (err) {
        return err;
    }

    return dataset_drop_last_variables(dset, 1);
}
2745 
/* Returns 1 if the column at 0-based file position @k is not
   selected by c->cols_list, otherwise 0. The position is looked
   up as @k if column 1 is being skipped, else as @k + 1.
*/

static int skip_data_column (csvdata *c, int k)
{
    int col = k;

    if (!csv_skip_col_1(c)) {
        col++;
    }

    return !in_gretl_list(c->cols_list, col);
}
2756 
/* Append the column index @k to c->cols_list. Returns 0 on
   success, or E_ALLOC if gretl_list_append_term() returns NULL.
*/

static int update_join_cols_list (csvdata *c, int k)
{
    int err = 0;

    if (gretl_list_append_term(&c->cols_list, k) == NULL) {
        err = E_ALLOC;
    }

#if CDEBUG
    printlist(c->cols_list, "c->cols_list for join");
#endif

    return err;
}
2773 
2774 /* handle_join_varname: the index @k contains the column number
2775    relative to the entire CSV file, while @pj points to j, the column
2776    number relative to the reduced dataset that will be constructed by
2777    selection of columns from the file.
2778 
2779    Here we're examining a column heading read from file (c->str) to
2780    see whether it matches any of the column-names required for an
2781    ongoing join operation (held in c->jspec->colnames). If so, we
2782    write the index j into the appropriate slot in c->jspec->colnums
2783    (which starts off filled with zeros), so the joiner will know where
2784    to find the required data. (The j value is bound to be at least 1
2785    since column 0 is reserved to the constant.)
2786 
2787    In some cases a given named column may perform more than one role in
2788    a join operation -- for example, it may serve as an element in a
2789    filter and also as the auxiliary variable in an "aggregation"
2790    method. To allow for this we don't stop scanning at the first match
2791    of c->str with a required column name.
2792 
2793    The call to update_join_cols_list() uses the index @k to record the
2794    overall column position of "wanted data", for use by the CSV
2795    reader.
2796 */
2797 
handle_join_varname(csvdata * c,int k,int * pj)2798 static int handle_join_varname (csvdata *c, int k, int *pj)
2799 {
2800     const char *colname;
2801     char okname[VNAMELEN];
2802     int matched = 0;
2803     int i, j = *pj;
2804 
2805     if (!csv_skip_col_1(c)) {
2806         k++;
2807     }
2808 
2809     if (csv_no_header(c)) {
2810         sprintf(okname, "col%d", k);
2811     } else {
2812         /* convert to valid gretl identifier */
2813         gretl_normalize_varname(okname, c->str, 0, k);
2814     }
2815 
2816 #if CDEBUG
2817     fprintf(stderr, "handle_join_varname: looking at '%s' (%s)\n", c->str, okname);
2818 #endif
2819 
2820     for (i=0; i<c->jspec->ncols; i++) {
2821         /* find "wanted name" i */
2822         colname = c->jspec->colnames[i];
2823         if (colname == NULL || c->jspec->colnums[i] > 0) {
2824             /* name not wanted, or already found */
2825             continue;
2826         }
2827         if (!strcmp(okname, colname)) {
2828 #if CDEBUG
2829             fprintf(stderr, " target %d matched at CSV col %d, j=%d\n", i, k, j);
2830 #endif
2831             c->jspec->colnums[i] = j;
2832             if (!matched) {
2833                 matched = 1;
2834                 strcpy(c->dset->varname[j], okname);
2835                 update_join_cols_list(c, k);
2836                 *pj += 1;
2837                 if (in_gretl_list(c->jspec->timecols, i)) {
2838                     series_set_flag(c->dset, j, VAR_TIMECOL);
2839                 }
2840             }
2841         }
2842     }
2843 
2844     return 0;
2845 }
2846 
/* True if the character @c could begin a numeric field: a digit,
   a sign or a dot. Note that @c is evaluated more than once, so
   arguments with side effects should be avoided.
*/
#define starts_number(c) (isdigit((unsigned char) (c)) || (c) == '-' || \
                          (c) == '+' || (c) == '.')

/* True if there is no observations column (@o is zero), the
   dataset pointed to by @c has v > 3, and the count @n equals
   v - 2. Arguments are parenthesized so that compound
   expressions bind as intended.
*/
#define obs_labels_no_varnames(o,c,n)  (!(o) && (c)->v > 3 && (n) == (c)->v - 2)
2851 
/* Scan the first line of @fp that is neither a comment nor blank
   for column headings, and set the series names in c->dset
   accordingly. The file position following that line is recorded
   in c->datapos.

   Each of the c->ncols fields on the line is extracted into c->str
   (truncated to CSVSTRLEN - 1 bytes) and, unless the column is to
   be skipped, is handled by one of: handle_join_varname() when
   joining; automatic "col<j>" naming when probing without a
   header; or process_csv_varname().

   If it turns out there are no variable names (no header was
   declared, or all the "names" are numeric, or the
   obs_labels_no_varnames() condition holds) we arrange for
   automatic naming and for data to be read from the top of the
   file. If only some of the names are numeric, E_DATA is flagged.

   Progress messages go to @mprn, diagnostics to @prn. Returns 0
   on success or a non-zero error code.
*/

static int csv_varname_scan (csvdata *c, gzFile fp, PRN *prn, PRN *mprn)
{
    char *p;
    int obscol = csv_has_obs_column(c);
    int i, j, k, numcount;
    int err = 0;

    if (!csv_no_header(c)) {
        pputs(mprn, _("scanning for variable names...\n"));
    }

    if (csv_has_bom(c)) {
        /* start reading 3 bytes into the file */
        gzseek(fp, 3, SEEK_SET);
    }

    /* find the first line that's not a comment and not blank */
    while (csv_fgets(c, fp)) {
        if (*c->line == '#' || string_is_blank(c->line)) {
            continue;
        } else {
            break;
        }
    }

    c->datapos = gztell(fp);

    compress_csv_line(c, 1);

    p = c->line;
    if (c->delim == ' ' && *p == ' ') p++;
    iso_to_ascii(p);

    if (strlen(p) > 118) {
        pprintf(mprn, _("   line: %.115s...\n"), p);
    } else {
        pprintf(mprn, _("   line: %s\n"), p);
    }

    numcount = 0;
    j = 1; /* for the constant */

    for (k=0; k<c->ncols && !err; k++) {
        /* extract field k into c->str, truncating if need be */
        i = 0;
        while (*p && *p != c->delim) {
            if (i < CSVSTRLEN - 1) {
                c->str[i++] = *p;
            }
            p++;
        }
        c->str[i] = '\0';
        if (*p == c->delim) p++;

        if (k == 0 && csv_skip_col_1(c)) {
            ; /* no-op */
        } else if (!joining(c) && cols_subset(c) && skip_data_column(c, k)) {
            ; /* no-op */
        } else {
            if (joining(c)) {
                /* don't discard a failure to record the column */
                err = handle_join_varname(c, k, &j);
            } else if (probing(c) && csv_no_header(c)) {
                sprintf(c->dset->varname[j], "col%d", j);
                j++;
            } else {
                err = process_csv_varname(c, j, &numcount, prn);
                j++;
            }
        }
        if (j == c->dset->v) {
            /* all the series in the dataset have been handled */
#if CDEBUG
            fprintf(stderr, "breaking on j = %d (k = %d)\n", j, k);
#endif
            break;
        }
    }

    if (!err && joining(c) && c->cols_list == NULL) {
        /* no relevant columns were found */
        gretl_errmsg_set("No relevant columns were found");
        err = E_UNKVAR;
    }

    if (err) {
        return err;
    }

    if (csv_no_header(c) || numcount == c->dset->v - 1 ||
        obs_labels_no_varnames(obscol, c->dset, numcount)) {
        if (!csv_no_header(c)) {
            pputs(prn, _("it seems there are no variable names\n"));
            /* then we undercounted the observations by one? */
            if (!rows_subset(c)) {
                err = add_single_obs(c->dset);
            }
        }
        if (!err) {
            /* set up to handle the "no varnames" case */
            csv_set_autoname(c);
            c->datapos = csv_has_bom(c) ? 3 : 0;
            if (!csv_all_cols(c)) {
                if (obs_labels_no_varnames(obscol, c->dset, numcount)) {
                    err = csv_reconfigure_for_markers(c->dset);
                    if (!err) {
                        csv_set_obs_column(c);
                    }
                }
            }
        }
    } else if (numcount > 0) {
        /* some but not all of the "names" are numeric: report the
           first invalid name, then fail regardless */
        for (i=1; i<c->dset->v; i++) {
            if (check_varname(c->dset->varname[i])) {
                errmsg(1, prn);
                break;
            }
        }
        fprintf(stderr, "numcount = %d\n", numcount);
        err = E_DATA;
    }

    return err;
}
2971 
/* Returns 1 if row @t is excluded by the row mask attached to @c,
   that is, if @t lies at or beyond c->masklen or the mask holds
   zero at position @t. Returns 0 otherwise, including the case
   where no mask is present.
*/

static int row_not_wanted (csvdata *c, int t)
{
    if (c->rowmask == NULL) {
        return 0;
    }

    /* the length test guards the lookup in the mask */
    return (t >= c->masklen || gretl_vector_get(c->rowmask, t) == 0);
}
2984 
2985 /* read numerical data when we've been given a fixed column-reading
2986    specification */
2987 
fixed_format_read(csvdata * c,gzFile fp,PRN * prn)2988 static int fixed_format_read (csvdata *c, gzFile fp, PRN *prn)
2989 {
2990     char *p;
2991     int miss_shown = 0;
2992     int *missp = NULL;
2993     int t = 0, s = 0;
2994     int i, k, n, m;
2995     int err = 0;
2996 
2997     c->real_n = c->dset->n;
2998 
2999     if (csv_has_bom(c)) {
3000         gzseek(fp, 3, SEEK_SET);
3001     }
3002 
3003     if (csv_is_verbose(c)) {
3004         missp = &miss_shown;
3005     }
3006 
3007     while (csv_fgets(c, fp) && !err) {
3008         tailstrip(c->line);
3009         if (*c->line == '#' || string_is_blank(c->line)) {
3010             continue;
3011         }
3012         if (row_not_wanted(c, s)) {
3013             s++;
3014             continue;
3015         }
3016         m = strlen(c->line);
3017         for (i=1; i<=c->ncols && !err; i++) {
3018             k = c->cols_list[i];
3019             n = c->width_list[i];
3020             if (k + n - 1 > m) {
3021                 /* attempting to read out of bounds */
3022                 fprintf(stderr, "row %d, column %d: start=%d, width=%d, "
3023                         "but line length = %d\n", t+1, i, k, n, m);
3024                 err = E_DATA;
3025                 break;
3026             }
3027             p = c->line + k - 1;
3028             *c->str = '\0';
3029             strncat(c->str, p, n);
3030             /* Added 2016-11-16: allow trailing blanks in a field
3031                of specified width. This is required for handling
3032                US CPS data.
3033             */
3034             tailstrip(c->str);
3035             if (csv_missval(c->str, i, t+1, missp, prn)) {
3036                 c->dset->Z[i][t] = NADBL;
3037             } else {
3038                 c->dset->Z[i][t] = csv_atof(c, i);
3039                 if (c->dset->Z[i][t] == NON_NUMERIC) {
3040                     gretl_errmsg_sprintf(_("At row %d, column %d:\n"), t+1, k);
3041                     gretl_errmsg_sprintf(_("'%s' -- no numeric conversion performed!"),
3042                                          c->str);
3043                     err = E_DATA;
3044                 }
3045             }
3046         }
3047         s++;
3048         if (++t == c->dset->n) {
3049             break;
3050         }
3051     }
3052 
3053     if (err == E_DATA) {
3054         gretl_errmsg_set(_("Invalid column specification"));
3055     }
3056 
3057     return err;
3058 }
3059 
3060 #define XML1_OK(u) ((u>=0x0020 && u<=0xD7FF) || \
3061                     (u>=0xE000 && u<=0xFFFD))
3062 
3063 /* Check that an observation label contains only
3064    valid UTF-8, and moreover that every character
3065    is valid in XML 1.0. If not, try recoding from
3066    ISO 8859.
3067 */
3068 
maybe_fix_csv_string(gchar * s)3069 static int maybe_fix_csv_string (gchar *s)
3070 {
3071     int err = 0;
3072 
3073     if (!g_utf8_validate(s, -1, NULL)) {
3074         GError *gerr = NULL;
3075         gsize wrote = 0;
3076         gchar *tr;
3077 
3078         /* try for iso-8859? */
3079         tr = g_convert(s, -1, "UTF-8", "ISO-8859-15",
3080                        NULL, &wrote, &gerr);
3081         if (gerr != NULL) {
3082             gretl_errmsg_set(gerr->message);
3083             g_error_free(gerr);
3084             err = E_DATA;
3085         } else {
3086             *s = '\0';
3087             gretl_utf8_strncat(s, tr, CSVSTRLEN-1);
3088             g_free(tr);
3089         }
3090     }
3091 
3092     if (!err) {
3093         int i, n = g_utf8_strlen(s, -1);
3094         gunichar u;
3095 
3096         for (i=0; i<n; i++) {
3097             u = g_utf8_get_char(s);
3098             if (!XML1_OK(u)) {
3099                 return 0;
3100             }
3101             s = g_utf8_next_char(s);
3102         }
3103     }
3104 
3105     return err;
3106 }
3107 
/* Copy the text currently held in c->str into the observation
   marker for observation @t, after removing quotation: a leading
   double or single quote is skipped (along with a matching
   trailing quote, if there is one), as is a single quote that
   immediately follows the leading quote. At most OBSLEN - 1
   bytes are passed to gretl_utf8_strncat() for copying.
*/

static void transcribe_obs_label (csvdata *c, int t)
{
    char *label = c->str;
    const char quote = label[0];
    int len = strlen(label);

    if (quote == '"' || quote == '\'') {
        /* drop the closing quote if it matches the opening one */
        if (label[len-1] == quote) {
            label[--len] = '\0';
        }
        /* step past the opening quote */
        label += 1;
        len -= 1;
        /* and past a single quote nested right inside it */
        if (*label == '\'') {
            label += 1;
            len -= 1;
        }
    }

    if (len >= OBSLEN) {
        len = OBSLEN - 1;
    }

    *c->dset->S[t] = '\0';
    gretl_utf8_strncat(c->dset->S[t], label, len);
}
3139 
/* Reader for delimited (non-fixed-format) input: reads lines from
   @fp starting at its current position and, for each wanted row,
   splits the line into up to c->ncols fields on c->delim. The first
   field is stored as an observation label when the first column is
   being skipped and obs markers are allocated; fields in unwanted
   columns are ignored; all others are handed to process_csv_obs().

   Lines that are blank or start with '#' are ignored; lines that
   contain c->skipstr are dropped, reducing c->real_n; rows excluded
   by the row mask are skipped. If c->real_n ends up smaller than
   c->dset->n the surplus observations are dropped from the dataset.

   Returns 0 on success, non-zero on error.
*/

static int real_read_labels_and_data (csvdata *c, gzFile fp, PRN *prn)
{
    char *p;
    int miss_shown = 0;
    int *missp = NULL;
    int truncated = 0;
    int t = 0, s = 0; /* t: observation being filled; s: row index for the mask */
    int i, j, k;
    int err = 0;

    /* missp is passed on to process_csv_obs() only in verbose mode */
    if (csv_is_verbose(c)) {
        missp = &miss_shown;
    }

    c->real_n = c->dset->n;

    while (csv_fgets(c, fp) && !err) {
        int inquote = 0;

        if (*c->line == '#' || string_is_blank(c->line)) {
            continue;
        } else if (*c->skipstr != '\0' && strstr(c->line, c->skipstr)) {
            /* a line to be skipped: that's one observation fewer */
            c->real_n -= 1;
            continue;
        } else if (row_not_wanted(c, s)) {
            s++;
            continue;
        }

        compress_csv_line(c, 0);
        p = c->line;

        /* with space as delimiter skip at most one leading space,
           otherwise skip all leading spaces */
        if (c->delim == ' ') {
            if (*p == ' ') p++;
        } else {
            p += strspn(p, " ");
        }

        /* j is the index passed to process_csv_obs() for the next
           field that's actually read (presumably the series ID) */
        j = 1;
        for (k=0; k<c->ncols && !err; k++) {
            i = 0;
            /* copy the current field into c->str: a delimiter ends
               the field unless we're inside quotes */
            while (*p) {
                if (csv_keep_quotes(c) && *p == c->qchar) {
                    inquote = !inquote;
                } else if (!inquote && *p == c->delim) {
                    break;
                }
                if (i < CSVSTRLEN - 1) {
                    c->str[i++] = *p;
                } else {
                    /* note: this counts every character that's
                       dropped, not every field that's cut short */
                    truncated++;
                }
                p++;
            }
            c->str[i] = '\0';
            err = maybe_fix_csv_string(c->str);
            if (!err) {
                if (k == 0 && csv_skip_col_1(c) && c->dset->S != NULL) {
                    transcribe_obs_label(c, t);
                } else if (cols_subset(c) && skip_data_column(c, k)) {
                    ; /* no-op */
                } else {
                    err = process_csv_obs(c, j++, t, missp, prn);
                }
            }
            if (!err) {
                /* prep for next column */
                if (*p == c->delim) {
                    p++;
                }
                if (c->delim != ' ') {
                    p += strspn(p, " ");
                }
            }
        }

        s++;
        /* stop once all the allocated observations are filled */
        if (++t == c->dset->n) {
            break;
        }
    }

    if (truncated) {
        pprintf(prn, _("warning: %d labels were truncated.\n"), truncated);
    }

    /* shrink the dataset if some lines were dropped via skipstr */
    if (!err && c->real_n < c->dset->n) {
        int drop = c->dset->n - c->real_n;

        err = dataset_drop_observations(c->dset, drop);
    }

    return err;
}
3234 
3235 /* When reading a CSV file, should we attempt to parse observation
3236    strings as dates (and impose time-series structure on the data
3237    if this is successful)? In general, yes, but maybe not if we're
3238    reading the data in the context of a "join" operation, since
3239    in this case automatic detection may collide with time-key
3240    information supplied by the user. Current status: we'll skip
3241    the auto-dating stuff when joining unless (a) it's a MIDAS
3242    join (mixed frequencies) and the user has _not_ supplied any
3243    time key specification.
3244 */
3245 
csv_skip_dates(csvdata * c)3246 static int csv_skip_dates (csvdata *c)
3247 {
3248     if (c->jspec != NULL) {
3249         /* with --aggr=spread (MIDAS) we'll need dates info,
3250            unless the user have a time key spec
3251         */
3252         return c->jspec->auto_midas == 0;
3253     } else {
3254         return 0;
3255     }
3256 }
3257 
/* Driver for reading the data section of the file: seek to the
   recorded start of data (c->datapos), read the row labels and
   data, then -- provided the first column was treated as labels,
   no row mask is in force and date detection is not disabled --
   test the labels for date content, recording the result in
   c->markerpd and flagging the data as reversed if so indicated.
   A progress message goes to @mprn if that is non-NULL.

   Returns 0 on success, non-zero on error.
*/

static int csv_read_data (csvdata *c, gzFile fp, PRN *prn, PRN *mprn)
{
    int reversed = csv_data_reversed(c);
    int err;

    if (mprn != NULL) {
        pputs(mprn, csv_all_cols(c) ?
              _("scanning for data...\n") :
              _("scanning for row labels and data...\n"));
    }

    gzseek(fp, c->datapos, SEEK_SET);

    err = real_read_labels_and_data(c, fp, prn);
    if (err) {
        return err;
    }

    if (csv_skip_col_1(c) && !rows_subset(c) && !csv_skip_dates(c)) {
        c->markerpd = test_markers_for_dates(c->dset, &reversed,
                                             c->skipstr, prn);
        if (reversed) {
            csv_set_data_reversed(c);
        }
    }

    return 0;
}
3285 
/* Print a "parsing <filename>..." message to @prn. The output
   should be UTF-8, so if @fname is not valid UTF-8 we first try
   converting it from the locale encoding; if that conversion
   fails (g_locale_to_utf8() returns NULL) we fall back on
   g_filename_display_name() rather than handing a NULL pointer
   to the "%s" conversion.
*/

static void print_csv_parsing_header (const char *fname, PRN *prn)
{
    if (g_utf8_validate(fname, -1, NULL)) {
        pprintf(prn, "%s %s...\n", _("parsing"), fname);
    } else {
        gchar *trfname = g_locale_to_utf8(fname, -1, NULL, NULL, NULL);

        if (trfname == NULL) {
            /* yields a displayable UTF-8 rendition of the name */
            trfname = g_filename_display_name(fname);
        }

        pprintf(prn, "%s %s...\n", _("parsing"), trfname);
        g_free(trfname);
    }
}
3297 
/* Count the distinct non-NULL column names in the join
   specification attached to @c. A name is counted at its first
   occurrence only; later duplicates are recognized by comparison
   with the names that precede them, so no scratch storage (and
   hence no allocation that could fail) is required.

   Returns the number of unique column names.
*/

static int join_unique_columns (csvdata *c)
{
    const char **cnames = c->jspec->colnames;
    int n = c->jspec->ncols;
    int i, j, ncols = 0;

    for (i=0; i<n; i++) {
        int seen = 0;

        if (cnames[i] == NULL) {
            continue;
        }

        /* has this name already appeared at a lower index? */
        for (j=0; j<i && !seen; j++) {
            if (cnames[j] != NULL && !strcmp(cnames[i], cnames[j])) {
                seen = 1;
            }
        }

        if (!seen) {
#if CDEBUG
            fprintf(stderr, "join_unique_columns: '%s'\n", cnames[i]);
#endif
            ncols++;
        }
    }

    return ncols;
}
3326 
/* Set the dimensions of the dataset that will receive the CSV
   data: the number of observations, c->dset->n, and the number
   of series, c->dset->v (one more than the number of data
   columns wanted, to allow for the constant).

   The observations count comes from the row mask if one is in
   force; otherwise, if not already set, it's the number of rows
   found, less one for the header row unless we're reading fixed
   format or the file has no header. When probing, n is forced
   to 1.

   Returns 0 on success, or E_DATA if more columns are wanted
   than are present in the file.
*/

static int csv_set_dataset_dimensions (csvdata *c)
{
    int err = 0;

    c->dset->v = 0;

    if (rows_subset(c)) {
        c->dset->n = n_from_row_mask(c);
    }

    if (c->dset->n == 0) {
        /* allow for varnames row, where applicable */
        int header_rows = (fixed_format(c) || csv_no_header(c)) ? 0 : 1;

        c->dset->n = c->nrows - header_rows;
    }

    if (fixed_format(c)) {
        c->dset->v = c->ncols + 1;
    } else {
        int present = c->ncols - (csv_skip_col_1(c) ? 1 : 0);
        int wanted = present;

        if (joining(c)) {
            wanted = join_unique_columns(c);
        } else if (cols_subset(c)) {
            wanted = c->cols_list[0];
        }

        if (wanted <= present) {
            /* allow for the constant */
            c->dset->v = wanted + 1;
        } else {
            gretl_errmsg_set(_("Invalid column specification"));
            err = E_DATA;
        }
    }

    if (probing(c)) {
        /* don't allocate tons of space for data that
           we won't read right now */
        c->dset->n = 1;
    }

#if CDEBUG
    if (joining(c)) {
        fprintf(stderr, "csv dataset dimensions: v=%d, n=%d\n",
                c->dset->v, c->dset->n);
    }
#endif

    return err;
}
3388 
/*
 * real_import_csv:
 * @fname: name of CSV file.
 * @dset: dataset struct.
 * @cols: column specification.
 * @rows: row specification.
 * @join: specification pertaining to "join" command.
 * @probe: also pertains to "join" (via GUI).
 * @pm: location of matrix to accept the data or NULL.
 * @opt: use OPT_N to force interpretation of data columns containing
 * strings as coded (non-numeric) values and not errors; use OPT_H
 * to indicate absence of a header row; use OPT_A to indicate that
 * all columns should be read as data series (i.e. do not try to
 * interpret the first column as observation labels); for use of
 * OPT_T see the help text for the "append" command. OPT_V selects
 * verbose operation and OPT_I turns off the keeping of quotes.
 * Note that OPT_A and OPT_V, and the use of @pm, are respected
 * only if both @join and @probe are NULL.
 * @prn: gretl printing struct (or NULL).
 *
 * Open a Comma-Separated Values data file and read the data into
 * the current work space. Shared with gretl_join.c.
 *
 * On successful completion: if @join is non-NULL the internal
 * csvdata struct is attached to it (as @join->c) rather than
 * being freed; if @probe is non-NULL it takes ownership of the
 * dataset that was read; and if @pm is in use the data are
 * written to a new matrix at that location.
 *
 * Returns: 0 on successful completion, non-zero otherwise.
 */

int real_import_csv (const char *fname,
		     DATASET *dset,
		     const char *cols,
		     const char *rows,
		     joinspec *join,
		     void *probe,
		     gretl_matrix **pm,
		     gretlopt opt,
		     PRN *prn)
{
    csvdata *c = NULL;
    gzFile fp = NULL;
    PRN *mprn = NULL;
    gchar *altname = NULL;
    int recode = 0;
    int popit = 0;
    int i, err = 0;

    import_na_init();

    /* mprn is for informational messages: it stays NULL (and
       so these are suppressed) unless messages are turned on */
    if (gretl_messages_on()) {
        mprn = prn;
    }

    fp = gretl_gzopen(fname, "rb");
    if (fp == NULL) {
        pprintf(prn, _("Couldn't open %s\n"), fname);
        err = E_FOPEN;
        goto csv_bailout;
    }

    c = csvdata_new(dset);
    if (c == NULL) {
        err = E_ALLOC;
        goto csv_bailout;
    }

    /* if recoding is needed, fp may be replaced and altname set:
       altname, if set, is removed and freed on exit */
    recode = csv_unicode_check(fp, c, prn);
    if (recode) {
        err = csv_recode_input(&fp, fname, &altname, recode, prn);
        if (err) {
            goto csv_bailout;
        }
    }

    if (cols != NULL) {
        err = csvdata_add_cols_list(c, cols, opt);
        if (err) {
            goto csv_bailout;
        } else if (fixed_format(c)) {
            pprintf(mprn, _("using fixed column format\n"));
        }
    }

    if (rows != NULL) {
        err = csvdata_add_row_mask(c, rows);
        if (err) {
            goto csv_bailout;
        }
    }

    if (opt & OPT_H) {
        csv_set_no_header(c);
    }

    /* the matrix, all-columns and verbose settings apply only
       if we're neither joining nor probing */
    if (join != NULL) {
        c->jspec = join;
        c->flags |= CSV_HAVEDATA;
    } else if (probe != NULL) {
        c->probe = probe;
        c->flags |= CSV_HAVEDATA;
    } else {
        if (pm != NULL) {
            csv_set_as_matrix(c);
        }
        if (opt & OPT_A) {
            csv_set_all_cols(c);
        }
        if (opt & OPT_V) {
            csv_set_verbose(c);
        }
    }

    if (opt & OPT_I) {
        csv_unset_keep_quotes(c);
    }

    if (mprn != NULL) {
        print_csv_parsing_header(fname, mprn);
    }

    /* get line length, also check for binary data, etc. */
    c->maxlinelen = csv_max_line_length(fp, c, prn);
    if (c->maxlinelen <= 0) {
        err = E_DATA;
        goto csv_bailout;
    }

    /* When reading as matrix, a semicolon if present is taken to
       be the delimiter, and a comma that was found while comma was
       the delimiter is then taken to be the decimal character.
    */
    if (csv_as_matrix(c) && csv_got_semi(c)) {
        if (c->delim == ',' && csv_got_delim(c)) {
            c->decpoint = ',';
        }
        c->delim = ';';
    } else if (!fixed_format(c) && !csv_got_delim(c)) {
        /* set default delimiter */
        if (csv_got_tab(c)) {
            c->delim = '\t';
        } else if (csv_got_semi(c)) {
            c->delim = ';';
        } else {
            c->delim = ' ';
        }
    }

#if CDEBUG
    fprintf(stderr, "fixed_format? %s; got_delim (%c)? %s; got_tab? %s; ",
            fixed_format(c) ? "yes" : "no", c->delim,
            csv_got_delim(c) ? "yes" : "no",
            csv_got_tab(c)? "yes" : "no");
    fprintf(stderr, "decpoint '%c'\n", c->decpoint);
#endif

    /* buffer to hold lines */
    c->line = malloc(c->maxlinelen);
    if (c->line == NULL) {
        err = E_ALLOC;
        goto csv_bailout;
    }

    /* we come back here if the fields check fails and we
       switch to semicolon as delimiter */
 alt_delim:

    if (mprn != NULL) {
        if (!fixed_format(c)) {
            pprintf(mprn, _("using delimiter '%c'\n"), c->delim);
        }
        pprintf(mprn, _("   longest line: %d characters\n"), c->maxlinelen - 1);
    }

    if (csv_has_trailing_comma(c) && c->delim != ',') {
        csv_unset_trailing_comma(c);
    }

    gzrewind(fp);

    /* read lines, check for consistency in number of fields */
    err = csv_fields_check(fp, c, mprn);
    if (err && !fixed_format(c)) {
        if (c->delim != ';' && csv_got_semi(c)) {
            c->delim = ';';
            err = 0;
            goto alt_delim;
        }
        pputs(prn, _(csv_msg));
        goto csv_bailout;
    }

    /* NOTE(review): in the fixed-format case an error from
       csv_fields_check() is overwritten at this point -- confirm
       that this is intended */
    err = csv_set_dataset_dimensions(c);
    if (err) {
        err = E_DATA;
        goto csv_bailout;
    }

    pprintf(mprn, _("   number of variables: %d\n"), c->dset->v - 1);
    pprintf(mprn, _("   number of non-blank lines: %d\n"), c->nrows);

    if (c->dset->n == 0) {
        pputs(prn, _("Invalid data file\n"));
        err = E_DATA;
        goto csv_bailout;
    }

    /* initialize CSV dataset */
    err = start_new_Z(c->dset, 0);
    if (!err && csv_skip_col_1(c)) {
        err = dataset_allocate_obs_markers(c->dset);
    }

    if (err) {
        goto csv_bailout;
    }

    /* second pass */

    gzrewind(fp);

    /* fixed format: no varnames scan, and the series get
       automatic names */
    if (fixed_format(c)) {
        err = fixed_format_read(c, fp, prn);
        if (err) {
            goto csv_bailout;
        } else {
            csv_set_autoname(c);
            goto csv_continue;
        }
    }

    /* when probing we're done once the varnames are scanned */
    err = csv_varname_scan(c, fp, prn, mprn);
    if (err || probing(c)) {
        goto csv_bailout;
    }

    if (c->decpoint == '.' && get_local_decpoint() == ',') {
        /* we're in a locale that uses decimal comma:
           switch to the C locale */
        gretl_push_c_numeric_locale();
        popit = 1;
    } else if (c->decpoint == ',' && get_local_decpoint() == '.') {
        /* dotsub: define this if we're in a '.' locale and
           we've figured that the decimal character is ',' in
           the file we're reading
        */
        csv_set_dotsub(c);
    }

    err = csv_read_data(c, fp, prn, mprn);

    if (!err) {
        /* try again, under certain conditions */
        if (csv_skip_bad(c)) {
            err = csv_read_data(c, fp, prn, NULL);
        } else if (c->thousep > 0) {
            pprintf(mprn, _("WARNING: it seems '%c' is being used "
                            "as thousands separator\n"), c->thousep);
            /* the decimal character must be whichever of '.' and
               ',' is not serving as thousands separator */
            c->decpoint = (c->thousep == '.')? ',' : '.';
            if (c->decpoint == ',') {
                if (get_local_decpoint() == '.') {
                    csv_set_dotsub(c);
                } else if (popit) {
                    gretl_pop_c_numeric_locale();
                    popit = 0;
                }
            }
            revise_non_numeric_values(c);
            csv_set_scrub_thousep(c);
            err = csv_read_data(c, fp, prn, NULL);
        }
    }

    if (!err && !probing(c)) {
        err = csv_non_numeric_check(c, prn);
        if (!err && csv_has_non_numeric(c)) {
            /* try once more */
            err = csv_read_data(c, fp, prn, NULL);
        }
    }

    /* restore the numeric locale if we switched it above */
    if (popit) {
        gretl_pop_c_numeric_locale();
    }

    if (err) {
        goto csv_bailout;
    }

    if (csv_data_reversed(c)) {
        reverse_data(c->dset, mprn);
    }

    /* the fixed-format and delimited paths rejoin here */
 csv_continue:

    c->dset->t1 = 0;
    c->dset->t2 = c->dset->n - 1;

    if (c->markerpd > 0) {
        pputs(mprn, _("taking date information from row labels\n\n"));
        if (csv_skip_bad(c)) {
            pprintf(prn, "WARNING: Check your data! gretl has stripped out "
                    "what appear to be\nextraneous lines in a %s dataset: "
                    "this may not be right.\n\n",
                    (c->dset->pd == 4)? "quarterly" : "monthly");
        }
    } else {
        pputs(mprn, _("treating these as undated data\n\n"));
        dataset_obs_info_default(c->dset);
    }

    if (c->dset->pd != 1 || strcmp(c->dset->stobs, "1")) {
        c->dset->structure = TIME_SERIES;
    }

    /* handle the string table, if one was created: when joining
       it's saved with the dataset, otherwise it's printed */
    if (c->st != NULL) {
        err = gretl_string_table_validate(c->st, OPT_NONE);
        if (err) {
            pputs(prn, _("Failed to interpret the data as numeric\n"));
            goto csv_bailout;
        } else if (joining(c)) {
            gretl_string_table_save(c->st, c->dset);
        } else {
            gretl_string_table_print(c->st, c->dset, fname, prn);
        }
    }

    /* matrix mode: convert the data to a matrix at @pm and
       we're done, with no merge into @dset */
    if (csv_as_matrix(c)) {
        /* FIXME placement of this */
        if (csv_autoname(c)) {
            strings_array_free(c->dset->varname, c->dset->v);
            c->dset->varname = NULL;
        }
        *pm = gretl_matrix_data_subset(NULL, c->dset, -1, -1,
                                       M_MISSING_OK, &err);
        goto csv_bailout;
    }

    /* If there were observation labels and they were not interpretable
       as dates, and they weren't simply "1, 2, 3, ...", then they
       should probably be preserved; otherwise discard them.
    */
    if (c->dset->S != NULL && c->markerpd >= 0 &&
        c->dset->markers != DAILY_DATE_STRINGS) {
        dataset_destroy_obs_markers(c->dset);
    }

    if (csv_autoname(c)) {
        /* no variable names were found */
        for (i=1; i<c->dset->v; i++) {
            sprintf(c->dset->varname[i], "v%d", i);
        }
    } else {
#if CDEBUG
        int ii;

        for (ii=0; ii<c->dset->v; ii++) {
            fprintf(stderr, " c->dset->varname[%d] = '%s'\n", ii, c->dset->varname[ii]);
        }
#endif
        if (fix_varname_duplicates(c->dset)) {
            pputs(prn, _("warning: some variable names were duplicated\n"));
        }
    }

    if (!joining(c) && !probing(c)) {
        /* if the target dataset has no data as yet, we're
           creating a dataset rather than adding to one */
        int newdata = (dset->Z == NULL);

        /* not doing a special "join" operation */
        err = merge_or_replace_data(dset, &c->dset, get_merge_opts(opt), prn);

        if (!err && newdata && c->descrip != NULL) {
            /* hand over the description string to @dset */
            dset->descrip = c->descrip;
            c->descrip = NULL;
        }

        if (!err && newdata) {
            dataset_add_import_info(dset, fname, GRETL_CSV);
        }
    }

    /* common exit point, on both success and failure */
 csv_bailout:

    if (fp != NULL) {
        gzclose(fp);
    }

    /* Dispose of the csvdata struct: on a successful join it's
       handed to the join spec; on a successful probe its dataset
       is handed to the probe struct and the rest is freed; in all
       other cases it's freed.
       NOTE(review): c can be NULL here, but only with err set, in
       which case we get csvdata_free(NULL) -- this assumes that
       csvdata_free() accepts a NULL argument; confirm.
    */
    if (!err && c->jspec != NULL) {
        c->jspec->c = c;
    } else if (!err && c->probe != NULL) {
        c->probe->dset = c->dset;
        c->dset = NULL;
        csvdata_free(c);
    } else {
        csvdata_free(c);
    }

    /* clean up the recoded copy of the input, if any */
    if (altname != NULL) {
        gretl_remove(altname);
        g_free(altname);
    }

    if (err == E_ALLOC) {
        pputs(prn, _("Out of memory\n"));
    }

    return err;
}
3784 
3785 /**
3786  * import_csv:
3787  * @fname: name of CSV file.
3788  * @dset: dataset struct.
3789  * @opt: use OPT_N to force interpretation of data colums containing
3790  * strings as coded (non-numeric) values and not errors; for use of
3791  * OPT_T see the help for "append".
3792  * @prn: gretl printing struct (or NULL).
3793  *
3794  * Open a Comma-Separated Values data file and read the data into
3795  * the current work space.
3796  *
3797  * Returns: 0 on successful completion, non-zero otherwise.
3798  */
3799 
import_csv(const char * fname,DATASET * dset,gretlopt opt,PRN * prn)3800 int import_csv (const char *fname, DATASET *dset,
3801                 gretlopt opt, PRN *prn)
3802 {
3803     const char *cols = NULL;
3804     const char *rows = NULL;
3805     int ci, err;
3806 
3807     err = incompatible_options(opt, OPT_F | OPT_L);
3808     if (err) {
3809         /* --cols and --fixed-cols */
3810         return err;
3811     }
3812 
3813     ci = (dset != NULL && dset->v > 0)? APPEND : OPEN;
3814 
3815     if (opt & OPT_F) {
3816         /* we should have a "--fixed-cols=XXX" specification */
3817         cols = get_optval_string(ci, OPT_F);
3818         if (cols == NULL || *cols == '\0') {
3819             return E_PARSE;
3820         }
3821     } else if (opt & OPT_L) {
3822         /* should have a "--cols=XXX" specification */
3823         cols = get_optval_string(ci, OPT_L);
3824         if (cols == NULL || *cols == '\0') {
3825             return E_PARSE;
3826         }
3827     }
3828 
3829     if (opt & OPT_M) {
3830         /* we should have a "--rowmask=XXX" specification */
3831         rows = get_optval_string(ci, OPT_M);
3832         if (rows == NULL || *rows == '\0') {
3833             return E_PARSE;
3834         }
3835     }
3836 
3837     return real_import_csv(fname, dset, cols, rows,
3838                            NULL, NULL, NULL, opt, prn);
3839 }
3840 
import_csv_as_matrix(const char * fname,int * err)3841 gretl_matrix *import_csv_as_matrix (const char *fname, int *err)
3842 {
3843 #if CDEBUG
3844     PRN *prn = gretl_print_new(GRETL_PRINT_STDERR, NULL);
3845 #else
3846     PRN *prn = NULL;
3847 #endif
3848     gretl_matrix *m = NULL;
3849     char csvname[MAXLEN] = {0};
3850     gretlopt opt = OPT_A; /* --all-cols */
3851     int http = 0;
3852 
3853     *err = try_http(fname, csvname, &http);
3854 
3855     if (!*err && http) {
3856         *err = real_import_csv(csvname, NULL, NULL, NULL,
3857                                NULL, NULL, &m, opt, prn);
3858     } else if (!*err) {
3859         char fullname[FILENAME_MAX];
3860 
3861         strcpy(fullname, fname);
3862         gretl_maybe_prepend_dir(fullname);
3863         *err = real_import_csv(fullname, NULL, NULL, NULL,
3864                                NULL, NULL, &m, opt, prn);
3865     }
3866 
3867     gretl_print_destroy(prn);
3868 
3869     return m;
3870 }
3871 
/* Check the result of a CSV "probe": do all the series in @dset,
   from index 1 onward, have non-empty names? If not, and OPT_H
   (no header) is already included in @opt, that's an error;
   otherwise we set @rerun to 1 to signal that the probe should
   be tried again.

   Returns 0 if the names are complete or a rerun is called for,
   E_DATA on error.
*/

static int probe_varnames_check (DATASET *dset, gretlopt opt,
                                 int *rerun)
{
    int i;

    /* stop at the first empty name, if any */
    for (i=1; i<dset->v; i++) {
        if (dset->varname[i][0] == '\0') {
            break;
        }
    }

    if (i >= dset->v) {
        /* nothing is missing */
        return 0;
    }

    if (opt & OPT_H) {
        gretl_errmsg_set("Couldn't find all variable names");
        return E_DATA;
    }

    *rerun = 1;

    return 0;
}
3896 
3897 /**
3898  * probe_csv:
3899  * @fname: name of CSV file.
3900  * @varnames: location to receive variable names.
3901  * @nvars: location to receive number of variables (columns).
3902  * @opt: on input, may contain any extra options to pass to
3903  * real_import_csv(); on return, OPT_H (indicating that the
3904  * CSV file has no header) may be added if it seems to be
3905  * required (no header).
3906  *
3907  * Open a Comma-Separated Values data file and read enough to
3908  * determine the variable names.
3909  *
3910  * Returns: 0 on successful completion, non-zero otherwise.
3911  */
3912 
probe_csv(const char * fname,char *** varnames,int * nvars,gretlopt * opt)3913 int probe_csv (const char *fname, char ***varnames,
3914                int *nvars, gretlopt *opt)
3915 {
3916     csvprobe probe = {0};
3917     int err;
3918 
3919     err = real_import_csv(fname, NULL, NULL, NULL, NULL,
3920                           &probe, NULL, *opt, NULL);
3921 
3922     if (!err) {
3923         int rerun = 0;
3924 
3925         err = probe_varnames_check(probe.dset, *opt, &rerun);
3926 
3927         if (err || rerun) {
3928             destroy_dataset(probe.dset);
3929             probe.dset = NULL;
3930         }
3931 
3932         if (!err && rerun) {
3933             /* try again with --no-header flag */
3934             *opt |= OPT_H;
3935             err = real_import_csv(fname, NULL, NULL, NULL, NULL,
3936                                   &probe, NULL, *opt, NULL);
3937         }
3938 
3939         if (!err) {
3940             /* steal the varname array */
3941             *varnames = probe.dset->varname;
3942             *nvars = probe.dset->v;
3943             probe.dset->varname = NULL;
3944         }
3945 
3946         destroy_dataset(probe.dset);
3947     }
3948 
3949     return err;
3950 }
3951 
/* Determine whether opening a CSV file with options @opt
   depends on a named matrix. Returns 1 if OPT_M (--rowmask)
   is given, or if OPT_F (--fixed-cols) is given and its
   parameter is the name of an existing matrix; otherwise
   returns 0.
*/

int csv_open_needs_matrix (gretlopt opt)
{
    if (opt & OPT_M) {
        /* --rowmask=matrix */
        return 1;
    }

    if (opt & OPT_F) {
        /* --fixed-cols=whatever */
        const char *s = get_optval_string(OPEN, OPT_F);

        return get_matrix_by_name(s) != NULL;
    }

    return 0;
}
3968