1 /*
2 * gretl -- Gnu Regression, Econometrics and Time-series Library
3 * Copyright (C) 2001 Allin Cottrell and Riccardo "Jack" Lucchetti
4 *
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 *
18 */
19
20 #include "libgretl.h"
21 #include "gretl_string_table.h"
22 #include "libset.h"
23 #include "usermat.h"
24 #include "uservar.h"
25 #include "genparse.h"
26 #include "gretl_xml.h"
27 #include "gretl_midas.h"
28 #include "matrix_extra.h"
29 #include "gretl_www.h"
30 #include "gretl_join.h"
31 #include "join_priv.h"
32 #include "csvdata.h"
33
34 #ifdef WIN32
35 # include "gretl_win32.h"
36 #endif
37
38 #include <errno.h>
39
40 #define CDEBUG 0 /* CSV reading in general */
41
42 #define CSVSTRLEN 128
43
/* Bit flags held in the @flags member of struct csvdata_. They
   are tested and modified via the csv_*() macros that follow
   the struct definition.
*/
enum {
    CSV_HAVEDATA = 1 << 0, /* set by csvdata_new() when the caller's
                              dataset already has a data array */
    CSV_GOTDELIM = 1 << 1,
    CSV_GOTTAB = 1 << 2,
    CSV_GOTSEMI = 1 << 3,
    CSV_BLANK1 = 1 << 4,   /* with CSV_OBS1, makes csv_skip_col_1() true */
    CSV_OBS1 = 1 << 5,
    CSV_TRAIL = 1 << 6,
    CSV_AUTONAME = 1 << 7,
    CSV_REVERSED = 1 << 8,
    CSV_DOTSUB = 1 << 9,
    CSV_ALLCOLS = 1 << 10,
    CSV_BOM = 1 << 11,
    CSV_VERBOSE = 1 << 12,
    CSV_THOUSEP = 1 << 13,
    CSV_NOHEADER = 1 << 14,
    CSV_QUOTES = 1 << 15,  /* the only flag set by default in csvdata_new() */
    CSV_AS_MAT = 1 << 16
};
63
/* Holder for a dataset pointer; a csvdata struct refers to one of
   these via its @probe member (see the probing() macro)
*/
struct csvprobe_ {
    DATASET *dset; /* more info might be wanted */
};

typedef struct csvprobe_ csvprobe;
69
/* State carried through the reading of a delimited or
   fixed-format text data file. Allocated by csvdata_new() and
   released by csvdata_free().
*/
struct csvdata_ {
    int flags;         /* bitwise OR of the CSV_* flags */
    char delim;        /* csvdata_new() takes this from get_data_export_delimiter() */
    char decpoint;     /* csvdata_new() takes this from get_data_export_decpoint() */
    char thousep;      /* initialized to 0 */
    char qchar;        /* initialized to 0 */
    int markerpd;      /* initialized to -1 */
    int maxlinelen;
    int real_n;
    char *line;        /* freed by csvdata_free() */
    DATASET *dset;     /* created via datainfo_new(), destroyed by csvdata_free() */
    int ncols, nrows;
    long datapos;
    char str[CSVSTRLEN];
    char skipstr[8];   /* non-empty string makes csv_skip_bad() true */
    int *codelist;     /* freed by csvdata_free() */
    char *descrip;     /* freed by csvdata_free() */
    const char *user_na; /* points at import_na unless that holds "default" */
    gretl_string_table *st; /* non-NULL makes csv_has_non_numeric() true */
    int *cols_list;    /* set by csvdata_add_cols_list() */
    int *width_list;   /* ditto; non-NULL only for a fixed-format spec */
    const gretl_matrix *rowmask; /* named matrix looked up by csvdata_add_row_mask();
                                    not freed by csvdata_free() */
    int masklen;       /* vector length of @rowmask */
    joinspec *jspec; /* info used for "join" command */
    csvprobe *probe; /* used in connection with "join" */
};
96
/* Accessor macros for a pointer @c to struct csvdata_.

   The macro parameter is parenthesized at every use so that any
   pointer-valued expression (e.g. "arr + 1" or "&obj") expands
   correctly, not just a plain identifier.
*/

/* flag tests: non-zero if the bit in question is set */
#define csv_has_trailing_comma(c) ((c)->flags & CSV_TRAIL)
#define csv_has_obs_column(c) ((c)->flags & CSV_OBS1)
#define csv_has_blank_column(c) ((c)->flags & CSV_BLANK1)
#define csv_got_tab(c) ((c)->flags & CSV_GOTTAB)
#define csv_got_semi(c) ((c)->flags & CSV_GOTSEMI)
#define csv_got_delim(c) ((c)->flags & CSV_GOTDELIM)
#define csv_autoname(c) ((c)->flags & CSV_AUTONAME)
#define csv_skip_col_1(c) ((c)->flags & (CSV_OBS1 | CSV_BLANK1))
#define csv_have_data(c) ((c)->flags & CSV_HAVEDATA)
#define csv_data_reversed(c) ((c)->flags & CSV_REVERSED)
#define csv_do_dotsub(c) ((c)->flags & CSV_DOTSUB)
#define csv_all_cols(c) ((c)->flags & CSV_ALLCOLS)
#define csv_has_bom(c) ((c)->flags & CSV_BOM)
#define csv_is_verbose(c) ((c)->flags & CSV_VERBOSE)
#define csv_scrub_thousep(c) ((c)->flags & CSV_THOUSEP)
#define csv_no_header(c) ((c)->flags & CSV_NOHEADER)
#define csv_keep_quotes(c) ((c)->flags & CSV_QUOTES)
#define csv_as_matrix(c) ((c)->flags & CSV_AS_MAT)

/* flag setters and unsetters */
#define csv_set_trailing_comma(c) ((c)->flags |= CSV_TRAIL)
#define csv_unset_trailing_comma(c) ((c)->flags &= ~CSV_TRAIL)
#define csv_set_obs_column(c) ((c)->flags |= CSV_OBS1)
#define csv_set_blank_column(c) ((c)->flags |= CSV_BLANK1)
#define csv_set_got_tab(c) ((c)->flags |= CSV_GOTTAB)
#define csv_set_got_semi(c) ((c)->flags |= CSV_GOTSEMI)
#define csv_set_got_delim(c) ((c)->flags |= CSV_GOTDELIM)
#define csv_set_autoname(c) ((c)->flags |= CSV_AUTONAME)
#define csv_set_data_reversed(c) ((c)->flags |= CSV_REVERSED)
#define csv_set_dotsub(c) ((c)->flags |= CSV_DOTSUB)
#define csv_set_all_cols(c) ((c)->flags |= CSV_ALLCOLS)
#define csv_set_has_bom(c) ((c)->flags |= CSV_BOM)
#define csv_set_verbose(c) ((c)->flags |= CSV_VERBOSE)
#define csv_set_scrub_thousep(c) ((c)->flags |= CSV_THOUSEP)
#define csv_set_no_header(c) ((c)->flags |= CSV_NOHEADER)
#define csv_unset_keep_quotes(c) ((c)->flags &= ~CSV_QUOTES)
#define csv_set_as_matrix(c) ((c)->flags |= CSV_AS_MAT)

/* tests on members other than @flags */
#define csv_skip_bad(c) (*(c)->skipstr != '\0')
#define csv_has_non_numeric(c) ((c)->st != NULL)

/* both lists present: fixed format; columns list only: a subset
   of the columns of a delimited file */
#define fixed_format(c) ((c)->cols_list != NULL && (c)->width_list != NULL)
#define cols_subset(c) ((c)->cols_list != NULL && (c)->width_list == NULL)
#define rows_subset(c) ((c)->rowmask != NULL)

#define joining(c) ((c)->jspec != NULL)
#define probing(c) ((c)->probe != NULL)
143
/* forward declaration: the definition comes later in this file,
   but csv_daily_date_check() needs to call it */
static int
time_series_label_check (DATASET *dset, int reversed, char *skipstr,
                         int convert_pd, PRN *prn);

/* file-scope global: csvdata_new() uses this buffer as the
   user-specified "NA" string unless it holds "default" */
static char import_na[8];
150
151
152 /* for use in gretl_join.c */
153
csvdata_get_dataset(csvdata * c)154 DATASET *csvdata_get_dataset (csvdata *c)
155 {
156 return c->dset;
157 }
158
159 /* shared with gretl_join.c */
160
/* Release @c along with everything it owns: the description,
   string table, code list, line buffer, column/width lists and
   the dataset. A NULL @c is accepted and ignored. The row mask
   is not freed here.
*/
void csvdata_free (csvdata *c)
{
    if (c != NULL) {
        /* free() is a no-op on NULL, so no guards are needed */
        free(c->descrip);

        if (c->st != NULL) {
            gretl_string_table_destroy(c->st);
        }

        free(c->codelist);
        free(c->line);

        /* the width list is released only along with the columns list */
        if (c->cols_list != NULL) {
            free(c->cols_list);
            free(c->width_list);
        }

        destroy_dataset(c->dset);
        free(c);
    }
}
192
csvdata_new(DATASET * dset)193 static csvdata *csvdata_new (DATASET *dset)
194 {
195 csvdata *c = malloc(sizeof *c);
196
197 if (c == NULL) {
198 return NULL;
199 }
200
201 c->flags = CSV_QUOTES;
202 c->delim = '\t';
203 c->thousep = 0;
204 c->qchar = 0;
205 c->markerpd = -1;
206 c->maxlinelen = 0;
207 c->real_n = 0;
208 c->line = NULL;
209 c->dset = NULL;
210 c->ncols = 0;
211 c->nrows = 0;
212 c->datapos = 0;
213 *c->str = '\0';
214 *c->skipstr = '\0';
215 c->codelist = NULL;
216 c->descrip = NULL;
217 c->user_na = NULL;
218 c->st = NULL;
219 c->cols_list = NULL;
220 c->width_list = NULL;
221 c->rowmask = NULL;
222 c->masklen = 0;
223
224 if (strcmp(import_na, "default")) {
225 c->user_na = import_na;
226 }
227
228 c->jspec = NULL;
229 c->probe = NULL;
230
231 c->dset = datainfo_new();
232
233 if (c->dset == NULL) {
234 free(c);
235 c = NULL;
236 } else {
237 c->delim = get_data_export_delimiter();
238 c->decpoint = get_data_export_decpoint();
239 if (dset != NULL && dset->Z != NULL) {
240 c->flags |= CSV_HAVEDATA;
241 }
242 #if CDEBUG
243 fprintf(stderr, "csvdata_new: c->delim = '%c', c->decpoint = '%c'\n",
244 c->delim, c->decpoint);
245 #endif
246 }
247
248 return c;
249 }
250
cols_list_from_matrix(const char * s,int * err)251 static int *cols_list_from_matrix (const char *s, int *err)
252 {
253 gretl_matrix *m = get_matrix_by_name(s);
254 int i, n = gretl_vector_get_length(m);
255 int *list = NULL;
256
257 if (n == 0) {
258 *err = E_DATA;
259 } else {
260 list = gretl_list_new(n);
261 if (list == NULL) {
262 *err = E_ALLOC;
263 } else {
264 for (i=0; i<n; i++) {
265 list[i+1] = gretl_vector_get(m, i);
266 }
267 }
268 }
269
270 return list;
271 }
272
273 /* The interpretation of the "cols" specification depends on
274 @opt: if this includes OPT_L then it should provide a 1-based
275 list of columns to be read; but if @opt includes OPT_F it
276 should provide a fixed-format spec, consisting of pairs
277 (start column, width).
278 */
279
csvdata_add_cols_list(csvdata * c,const char * s,gretlopt opt)280 static int csvdata_add_cols_list (csvdata *c, const char *s,
281 gretlopt opt)
282 {
283 int delimited = (opt & OPT_L);
284 int *list, *clist = NULL, *wlist = NULL;
285 int i, n, m = 0;
286 int err = 0;
287
288 if (get_matrix_by_name(s)) {
289 list = cols_list_from_matrix(s, &err);
290 } else {
291 list = gretl_list_from_string(s, &err);
292 }
293
294 if (!err) {
295 n = list[0];
296 if (n == 0) {
297 err = E_DATA;
298 } else if (delimited) {
299 m = n;
300 clist = list;
301 } else {
302 /* fixed format: we need two lists */
303 if (n % 2 != 0) {
304 err = E_DATA;
305 } else {
306 m = n / 2;
307 clist = gretl_list_new(m);
308 wlist = gretl_list_new(m);
309 if (clist == NULL || wlist == NULL) {
310 err = E_ALLOC;
311 } else {
312 int j = 1;
313
314 for (i=1; i<=n; i+=2, j++) {
315 clist[j] = list[i];
316 wlist[j] = list[i+1];
317 }
318 }
319 }
320 }
321 }
322
323 /* clist = column (start) list: must be a set of increasing
324 positive integers; and wlist = respective column widths,
325 must all be positive, if present
326 */
327
328 for (i=1; i<=m && !err; i++) {
329 if (clist[i] <= 0 || (i > 1 && clist[i] <= clist[i-1])) {
330 err = E_DATA;
331 } else if (wlist != NULL && wlist[i] <= 0) {
332 err = E_DATA;
333 } else if (wlist != NULL && wlist[i] >= CSVSTRLEN) {
334 fprintf(stderr, "Warning: field %d too wide (%d), truncating\n",
335 i, wlist[i]);
336 wlist[i] = CSVSTRLEN - 1;
337 }
338 }
339
340 if (list != clist) {
341 free(list);
342 }
343
344 if (!err) {
345 c->cols_list = clist;
346 c->width_list = wlist;
347 } else {
348 free(clist);
349 free(wlist);
350 if (err == E_DATA) {
351 gretl_errmsg_set(_("Invalid column specification"));
352 }
353 }
354
355 return err;
356 }
357
/* Look up the matrix named @s and record it in @c as the row
   mask, along with its vector length. Returns E_DATA if there
   is no such matrix, E_NONCONF if its vector length is zero,
   otherwise 0.
*/
static int csvdata_add_row_mask (csvdata *c, const char *s)
{
    const gretl_matrix *mask = get_matrix_by_name(s);

    c->rowmask = mask;

    if (mask == NULL) {
        gretl_errmsg_sprintf(_("'%s': no such matrix"), s);
        return E_DATA;
    }

    c->masklen = gretl_vector_get_length(mask);

    return (c->masklen == 0)? E_NONCONF : 0;
}
375
/* Count the non-zero entries of the row mask attached to @c,
   looking at elements 0 to min(masklen - 1, nrows) inclusive.
*/
static int n_from_row_mask (csvdata *c)
{
    int selected = 0;
    int r = 0;

    while (r < c->masklen && r <= c->nrows) {
        if (gretl_vector_get(c->rowmask, r) != 0) {
            selected++;
        }
        r++;
    }

    return selected;
}
388
/* Resize the observation-marker array of @dset to hold @n
   entries and give the last of them a newly allocated OBSLEN
   buffer containing "NA". Returns 0 on success, E_ALLOC on
   failure.
*/
static int add_obs_marker (DATASET *dset, int n)
{
    char **grown = realloc(dset->S, n * sizeof *grown);
    char *marker;

    if (grown == NULL) {
        /* the original array is still in place */
        return E_ALLOC;
    }

    dset->S = grown;
    marker = malloc(OBSLEN);
    dset->S[n-1] = marker;

    if (marker == NULL) {
        return E_ALLOC;
    }

    strcpy(marker, "NA");

    return 0;
}
408
/* Extend every series in @dset by one observation. The new
   value is 1.0 for series 0 and NADBL for all others; if
   observation markers are present an "NA" marker is appended
   too. Returns 0 on success or E_ALLOC; on a failure to extend
   the series the observation count is left unchanged.
*/
static int add_single_obs (DATASET *dset)
{
    int newn = dset->n + 1;
    int i;

    for (i=0; i<dset->v; i++) {
        double *grown = realloc(dset->Z[i], newn * sizeof *grown);

        if (grown == NULL) {
            return E_ALLOC;
        }
        dset->Z[i] = grown;
    }

    dset->n = newn;

    dset->Z[0][newn - 1] = 1.0;
    for (i=1; i<dset->v; i++) {
        dset->Z[i][newn - 1] = NADBL;
    }

    if (dset->S != NULL) {
        return add_obs_marker(dset, newn);
    }

    return 0;
}
436
/* Expand @dset by @add observations and open up gaps in the
   series so that each original observation ends up at the
   position given by calendar_obs_number() for its marker,
   with NADBL written into the gaps. Only the series with
   index >= 1 are moved; this function does not itself touch
   the markers in dset->S. Progress is reported on stderr.
   Returns the error code from dataset_add_observations().
*/
static int pad_weekly_data (DATASET *dset, int add)
{
    int oldn = dset->n;
    int ttarg, offset = 0, skip = 0;
    int i, s, t, tc, err;

    err = dataset_add_observations(dset, add, OPT_A);

    if (!err) {
        for (t=0; t<oldn; t++) {
            /* calendar position of original obs t, net of the
               gaps already opened up */
            tc = calendar_obs_number(dset->S[t], dset) - offset;
            if (tc != t) {
                skip = tc - t;
                fprintf(stderr, "Gap of size %d at original t = %d\n", skip, t);
                offset += skip;
                /* work backwards from the last position in use
                   once this gap is counted */
                ttarg = oldn - 1 + offset;
                for (s=0; s<oldn-t+skip; s++) {
                    for (i=1; i<dset->v; i++) {
                        if (s < oldn - t) {
                            /* the first (oldn - t) steps move the
                               tail of the data along by @skip */
                            if (s == 0 || s == oldn-t-1) {
                                fprintf(stderr, "shifting obs %d to obs %d\n",
                                        ttarg-skip, ttarg);
                            }
                            dset->Z[i][ttarg] = dset->Z[i][ttarg - skip];
                        } else {
                            /* the remaining @skip steps fill the gap */
                            fprintf(stderr, "inserting NA at obs %d\n", ttarg);
                            dset->Z[i][ttarg] = NADBL;
                        }
                    }
                    ttarg--;
                }
            }
        }
    }

    return err;
}
474
475 /* FIXME the following needs to be made more flexible? */
476
/* Check whether the markers in @dset are usable as weekly
   dates. Missing weeks are tolerated provided they amount to no
   more than 5 percent of the observations and the count agrees
   with the calendar position of the last marker; in that case,
   if data are present, the series are padded with NAs via
   pad_weekly_data(). Returns 1 if the data are acceptable as
   weekly, 0 if not.
*/
static int csv_weekly_data (DATASET *dset)
{
    char *lastlbl = dset->S[dset->n - 1];
    double missfrac;
    int gaps = 0;
    int Tc, altmiss;
    int t;

    /* accumulate the total shortfall relative to the calendar */
    for (t=0; t<dset->n; t++) {
        int pos = calendar_obs_number(dset->S[t], dset) - gaps;

        if (pos != t) {
            gaps += pos - t;
        }
    }

    if (gaps <= 0) {
        /* nothing missing */
        return 1;
    }

    missfrac = (double) gaps / dset->n;
    fprintf(stderr, "nobs = %d, misscount = %d (%.2f%%)\n",
            dset->n, gaps, 100.0 * missfrac);

    if (missfrac > 0.05) {
        return 0;
    }

    /* cross-check against the position of the last marker */
    Tc = calendar_obs_number(lastlbl, dset) + 1;
    altmiss = Tc - dset->n;
    fprintf(stderr, "check: Tc = %d, missing = %d\n", Tc, altmiss);

    if (altmiss != gaps) {
        return 0;
    }

    if (dset->Z != NULL) {
        fprintf(stderr, "OK, consistent\n");
        if (pad_weekly_data(dset, gaps)) {
            return 0;
        }
    }

    return 1;
}
517
518 #define DAY_DEBUG 1
519
check_daily_dates(DATASET * dset,int * pd,int * reversed,PRN * prn)520 static int check_daily_dates (DATASET *dset, int *pd,
521 int *reversed, PRN *prn)
522 {
523 int T = dset->n;
524 char *lbl1 = dset->S[0];
525 char *lbl2 = dset->S[T - 1];
526 int fulln = 0, n, t, nbak;
527 int alt_pd = 0;
528 int oldpd = dset->pd;
529 double oldsd0 = dset->sd0;
530 guint32 ed1, ed2;
531 int nmiss = 0, err = 0;
532
533 *pd = 0;
534
535 ed1 = get_epoch_day(lbl1);
536 ed2 = get_epoch_day(lbl2);
537 if (ed1 <= 0 || ed2 <= 0) {
538 err = 1;
539 }
540
541 #if DAY_DEBUG
542 fprintf(stderr, "check_daily_dates: '%s' -> %d, '%s' -> %d\n",
543 lbl1, (int) ed1, lbl2, (int) ed2);
544 #endif
545
546 dset->pd = guess_daily_pd(dset);
547 dset->structure = TIME_SERIES;
548
549 #if DAY_DEBUG
550 fprintf(stderr, "guessed at daily pd = %d\n", dset->pd);
551 #endif
552
553 if (!err) {
554 if (ed2 < ed1) {
555 #if DAY_DEBUG
556 fprintf(stderr, "check_daily_dates: data are reversed?\n");
557 #endif
558 dset->sd0 = ed2;
559 *reversed = 1;
560 } else {
561 dset->sd0 = ed1;
562 }
563 }
564
565 recompute:
566
567 alt_pd = 0;
568 nbak = 0;
569
570 if (!err) {
571 guint32 n1 = (*reversed)? ed2 : ed1;
572 guint32 n2 = (*reversed)? ed1 : ed2;
573
574 fulln = n2 - n1 + 1;
575
576 if (T > fulln) {
577 err = 1;
578 } else {
579 nmiss = fulln - T;
580 pprintf(prn, _("Observations: %d; days in sample: %d\n"),
581 T, fulln);
582 if (nmiss > 300 * T) {
583 pprintf(prn, _("Probably annual data\n"));
584 *pd = 1;
585 } else if (nmiss > 50 * T) {
586 pprintf(prn, _("Probably quarterly data\n"));
587 *pd = 4;
588 } else if (nmiss > 20 * T) {
589 pprintf(prn, _("Probably monthly data\n"));
590 *pd = 12;
591 } else if (nmiss > 3 * T) {
592 pprintf(prn, _("Probably weekly data\n"));
593 *pd = dset->pd = 52;
594 } else {
595 pprintf(prn, _("Missing daily rows: %d\n"), nmiss);
596 }
597 }
598 }
599
600 nbak = 0;
601
602 for (t=0; t<dset->n && !err; t++) {
603 int wd, s = (*reversed)? (dset->n - 1 - t) : t;
604
605 wd = weekday_from_date(dset->S[s]);
606
607 if (dset->pd == 5 && (wd == 6 || wd == 0)) {
608 /* Got Sat or Sun, can't be 5-day daily? */
609 alt_pd = (wd == 6)? 6 : 7;
610 pprintf(prn, "Found a Saturday (%s): re-trying with pd = %d\n",
611 dset->S[s], alt_pd);
612 break;
613 } else if (dset->pd == 6 && wd == 0) {
614 /* Got Sun, can't be 6-day daily? */
615 alt_pd = 7;
616 pprintf(prn, "Found a Sunday (%s): re-trying with pd = %d\n",
617 dset->S[s], alt_pd);
618 break;
619 }
620
621 n = calendar_obs_number(dset->S[s], dset);
622 if (n < t) {
623 pprintf(prn, "Daily dates error at t = %d:\n"
624 " calendar_obs_number() for '%s' = %d but t = %d\n",
625 t, dset->S[s], n, t);
626 err = 1;
627 } else if (n > fulln - 1) {
628 pprintf(prn, "Error: date '%s' out of bounds\n", dset->S[s]);
629 err = 1;
630 } else if (nbak > 0 && n == nbak) {
631 pprintf(prn, "Error: date '%s' is repeated\n", dset->S[s]);
632 err = 1;
633 }
634 nbak = n;
635 }
636
637 if (alt_pd > 0) {
638 dset->pd = alt_pd;
639 goto recompute;
640 }
641
642 if (err) {
643 dset->pd = oldpd;
644 dset->sd0 = oldsd0;
645 dset->structure = CROSS_SECTION;
646 } else {
647 strcpy(dset->stobs, (*reversed)? lbl2 : lbl1);
648 strcpy(dset->endobs, (*reversed)? lbl1 : lbl2);
649 dset->t2 = dset->n - 1;
650 if (nmiss > 0 && *pd == 0) {
651 dset->markers = DAILY_DATE_STRINGS;
652 }
653 }
654
655 #if DAY_DEBUG
656 fprintf(stderr, "check_daily_dates: daily pd = %d, reversed = %d, err = %d\n",
657 dset->pd, *reversed, err);
658 #endif
659
660 return (err)? -1 : dset->pd;
661 }
662
663 /* convert from daily date label to a lower frequency --
664 annual, monthly or quarterly -- if @pd indicates this
665 is required
666 */
667
convert_daily_label(char * targ,const char * src,int pd)668 static void convert_daily_label (char *targ, const char *src,
669 int pd)
670 {
671 int y, m, d;
672
673 sscanf(src, YMD_READ_FMT, &y, &m, &d);
674
675 if (pd == 1) {
676 sprintf(targ, "%d", y);
677 } else if (pd == 12) {
678 sprintf(targ, "%d:%02d", y, m);
679 } else if (pd == 4) {
680 sprintf(targ, "%d:%d", y, m / 3 + (m % 3 != 0));
681 }
682 }
683
/* There's a special case (ugh!) where observation strings are
   given as in monthly data, but the frequency is in fact
   quarterly, as in:

   1947.06
   1947.09
   1947.12
   1948.03

   we'll make a brave attempt to handle this.
*/

/* true if month @m is the last month of a quarter */
#define fakequarter(m) (m==3 || m==6 || m==9 || m==12)

/* Check that the observation markers in @dset form an unbroken
   sequence of (year, sub-period) pairs at the frequency given
   by *@ppd. The markers are read using the sscanf format @fmt,
   after conversion from daily dates if @convert_pd is non-zero,
   and are traversed last-to-first if @reversed is non-zero.

   Three irregularities are handled by restarting the scan under
   a revised interpretation:
   - an extra sub-period, numbered pd + 1, following the last
     regular one (tried only if @skipstr is non-NULL): on success
     the tail of the offending label, from offset 4, is copied
     into @skipstr;
   - monthly-style labels advancing in steps of 3 months: on
     success *@ppd is set to 4;
   - apparently monthly labels in which period 4 is followed by
     period 1 of the next year: *@extra_zero is set to 1 and
     *@ppd to 4.

   Returns 1 if the labels are consistent, otherwise 0.

   NOTE(review): "label + 4" presumes a 4-character year at the
   start of the label -- confirm that callers guarantee this.
*/
static int consistent_qm_labels (DATASET *dset, int reversed,
                                 int convert_pd, char *skipstr,
                                 int *ppd, const char *fmt,
                                 int *extra_zero, PRN *prn)
{
    char bad[16], skip[8];
    char label[OBSLEN];
    int Ey; /* expected year */
    int Ep; /* expected sub-period */
    int t, s, yr, per;
    int pmin = 1; /* step between successive sub-periods */
    int pd, pd0;
    int ret = 1;

    pd = pd0 = *ppd;

 restart:

    /* start from the chronologically first observation */
    s = reversed ? (dset->n - 1) : 0;

    if (convert_pd) {
        convert_daily_label(label, dset->S[s], pd);
    } else {
        strcpy(label, dset->S[s]);
    }

    if (sscanf(label, fmt, &yr, &per) != 2) {
        return 0;
    }

    for (t=1; t<dset->n; t++) {
        s = (reversed)? (dset->n - 1 - t) : t;
        /* work out what should follow the previous (yr, per) */
        Ey = (per == pd)? yr + 1 : yr;
        Ep = (per == pd)? pmin : per + pmin;

        if (convert_pd) {
            convert_daily_label(label, dset->S[s], pd);
        } else {
            strcpy(label, dset->S[s]);
        }

        if (sscanf(label, fmt, &yr, &per) != 2) {
            ret = 0;
        } else if (Ep == 1 && pd == pd0 && per == pd + 1
                   && skipstr != NULL) {
            /* found an extra sub-period: record it and retry
               with the frequency bumped up by one */
            *skip = *bad = '\0';
            strncat(skip, label + 4, 7);
            strncat(bad, label, OBSLEN-1);
            pd = pd0 + 1;
            goto restart;
        } else if (per == Ep + 2 && pmin == 1 && fakequarter(per)) {
            /* a step of 3 months: retry on that basis */
            *bad = '\0';
            strncat(bad, label, OBSLEN-1);
            pmin = 3;
            goto restart;
        } else if (pd == 12 && Ep == 5 && per == 1 && yr == Ey + 1) {
            /* apparently monthly but really quarterly? */
            pprintf(prn, " \"%s\": quarterly date with spurious zero?\n", label);
            *extra_zero = 1;
            *ppd = pd0 = pd = 4;
            goto restart;
        } else if (yr != Ey || per != Ep) {
            ret = 0;
        }

        if (!ret) {
            pprintf(prn, " %s: not a consistent date\n", label);
            break;
        }
    }

    if (ret) {
        /* report on, and record, any reinterpretation that worked */
        if (pmin == 3) {
            pprintf(prn, " \"%s\": quarterly data pretending to be monthly?\n",
                    bad);
            *ppd = 4;
        } else if (pd == pd0 + 1) {
            pprintf(prn, " \"%s\": BLS-type nonsense? Trying again\n",
                    bad);
            strcpy(skipstr, skip);
        }
    }

    return ret;
}
783
consistent_year_labels(const DATASET * dset,int reversed,int convert_pd)784 static int consistent_year_labels (const DATASET *dset,
785 int reversed,
786 int convert_pd)
787 {
788 char label[OBSLEN];
789 int s, t, yr, yprev;
790 int ret = 1;
791
792 s = (reversed)? (dset->n - 1) : 0;
793 yprev = atoi(dset->S[s]);
794
795 for (t=1; t<dset->n; t++) {
796 s = reversed ? (dset->n - 1 - t) : t;
797 if (convert_pd) {
798 convert_daily_label(label, dset->S[s], 1);
799 yr = atoi(label);
800 } else {
801 yr = atoi(dset->S[s]);
802 }
803 if (yr != yprev + 1) {
804 ret = 0;
805 break;
806 }
807 yprev = yr;
808 }
809
810 return ret;
811 }
812
813 /* check for all 1s in first column of dates: this may
814 indicate start-of-period dates, day first */
815
all_day_ones(DATASET * dset)816 static int all_day_ones (DATASET *dset)
817 {
818 int t;
819
820 for (t=1; t<dset->n; t++) {
821 if (atoi(dset->S[t]) != 1) {
822 return 0;
823 } else if (t > 31) {
824 /* "1" can't mean January */
825 return 1;
826 }
827 }
828
829 return 0;
830 }
831
/* Possible orderings of the year, month and day fields in a
   date label; the values start at 1, so 0 can stand for "not
   determined" */
enum date_orders {
    YYYYMMDD = 1,
    MMDDYYYY,
    DDMMYYYY
};
837
/* Guess the ordering of date fields, given the first numeric
   field of the first date label (@f0) and of the last (@fn).
   A value above 31 must be a year and a value above 12 must be
   a day. If both are 1 and all_day_ones() agrees we take it
   that the day comes first; failing all that we fall back on
   month-first. Returns one of the date_orders values.
*/
static int get_date_order (int f0, int fn, DATASET *dset)
{
    int biggest = (f0 > fn)? f0 : fn;
    int order = MMDDYYYY; /* the fallback: could be wrong here */

    if (biggest > 31) {
        /* first field must be year */
        order = YYYYMMDD;
    } else if (biggest > 12) {
        /* first field must be day */
        order = DDMMYYYY;
    } else if (f0 == 1 && fn == 1 && all_day_ones(dset)) {
        /* start-of-period dates, day first? */
        order = DDMMYYYY;
    }

    return order;
}
854
/* We apparently guessed wrongly at MMDDYYYY, so put the date
   labels back as they were for another try, at DDMMYYYY: each
   label is read with YMD_READ_FMT and rewritten with its three
   fields restored to their original order.

   A label that can't be parsed is left as it is, rather than
   being overwritten using unset values.
*/
static void retransform_daily_dates (DATASET *dset)
{
    int t;

    for (t=0; t<dset->n; t++) {
        int y = 0, m = 0, d = 0;

        /* the second field is read as the day and the third as
           the month: this undoes the earlier month-first reading */
        if (sscanf(dset->S[t], YMD_READ_FMT, &y, &d, &m) == 3) {
            sprintf(dset->S[t], YMD_WRITE_FMT, d, m, y);
        }
    }
}
869
/* Rewrite, in place, each date marker in @dset using
   YMD_WRITE_Y2_FMT with the fields in year, month, day order.
   @dorder (one of the date_orders values) says how the three
   fields are currently arranged; @sep is the separator
   character, or 0 if the fields are run together as 4 + 2 + 2
   digits. Returns 0 on success, or 1 on meeting a marker that
   doesn't yield three fields (the markers before it will
   already have been rewritten).
*/
static int transform_daily_dates (DATASET *dset, int dorder,
                                  char sep)
{
    char fmt[16];
    int yr, mon, day;
    int *f1, *f2, *f3;
    int t;

    if (sep > 0) {
        sprintf(fmt, "%%d%c%%d%c%%d", sep, sep);
    } else {
        strcpy(fmt, "%4d%2d%2d");
    }

    /* decide which variable receives each successive field */
    switch (dorder) {
    case YYYYMMDD:
        f1 = &yr; f2 = &mon; f3 = &day;
        break;
    case DDMMYYYY:
        f1 = &day; f2 = &mon; f3 = &yr;
        break;
    default:
        /* MMDDYYYY */
        f1 = &mon; f2 = &day; f3 = &yr;
        break;
    }

    for (t=0; t<dset->n; t++) {
        char *label = dset->S[t];

        if (sscanf(label, fmt, f1, f2, f3) != 3) {
            return 1;
        }
        sprintf(label, YMD_WRITE_Y2_FMT, yr, mon, day);
    }

    return 0;
}
901
/* Reverse the order of the observations in @dset, in place:
   the values of all series with index >= 1 are swapped end for
   end, as are the observation markers if present. A message to
   this effect is printed to @prn.
*/
void reverse_data (DATASET *dset, PRN *prn)
{
    char held[OBSLEN];
    int lo = 0;
    int hi = dset->n - 1;
    int i;

    pprintf(prn, _("reversing the data!\n"));

    /* swap from the two ends, working inwards */
    while (lo < hi) {
        for (i=1; i<dset->v; i++) {
            double x = dset->Z[i][lo];

            dset->Z[i][lo] = dset->Z[i][hi];
            dset->Z[i][hi] = x;
        }
        if (dset->S != NULL) {
            strcpy(held, dset->S[lo]);
            strcpy(dset->S[lo], dset->S[hi]);
            strcpy(dset->S[hi], held);
        }
        lo++;
        hi--;
    }
}
925
/* Try interpreting the observation markers in @dset as calendar
   dates. The first and last markers must have the form of three
   integers with a common punctuation character as separator, or
   of 4 + 2 + 2 digits run together. The order of the fields is
   guessed via get_date_order(); if a guess of month-first does
   not work out the markers are restored and day-first is tried
   instead.

   @reversed: may be set by check_daily_dates().
   @skipstr: passed on to time_series_label_check().
   @prn: receives informative messages.

   Returns -1 on failure. Otherwise returns the value obtained
   from check_daily_dates(), or 52 if the dates turn out to be
   weekly, or the value from time_series_label_check() if they
   seem to be of a lower frequency. Note that the markers may be
   rewritten in place via transform_daily_dates().
*/
static int csv_daily_date_check (DATASET *dset, int *reversed,
                                 char *skipstr, PRN *prn)
{
    int d1[3], d2[3];
    char s1 = 0, s2 = 0;
    char *lbl1 = dset->S[0];
    char *lbl2 = dset->S[dset->n - 1];
    int dorder = 0;

    if ((sscanf(lbl1, "%d%c%d%c%d", &d1[0], &s1, &d1[1], &s2, &d1[2]) == 5 &&
         sscanf(lbl2, "%d%c%d%c%d", &d2[0], &s1, &d2[1], &s2, &d2[2]) == 5 &&
         s1 == s2 && ispunct(s1)) ||
        (sscanf(lbl1, "%4d%2d%2d", &d1[0], &d1[1], &d1[2]) == 3 &&
         sscanf(lbl2, "%4d%2d%2d", &d2[0], &d2[1], &d2[2]) == 3)) {
        int mon1, day1;
        int mon2, day2;
        int pd, ret = 0;

        dorder = get_date_order(d1[0], d2[0], dset);

    tryagain:

        /* pick out month and day from the fields parsed above,
           in accordance with the current guess at the order */
        if (dorder == YYYYMMDD) {
            pputs(prn, _("Trying date order YYYYMMDD\n"));
            mon1 = d1[1];
            day1 = d1[2];
            mon2 = d2[1];
            day2 = d2[2];
        } else if (dorder == DDMMYYYY) {
            pputs(prn, _("Trying date order DDMMYYYY\n"));
            day1 = d1[0];
            mon1 = d1[1];
            day2 = d2[0];
            mon2 = d2[1];
        } else {
            pputs(prn, _("Trying date order MMDDYYYY\n"));
            mon1 = d1[0];
            day1 = d1[1];
            mon2 = d2[0];
            day2 = d2[1];
        }

        if (mon1 > 0 && mon1 < 13 &&
            mon2 > 0 && mon2 < 13 &&
            day1 > 0 && day1 < 32 &&
            day2 > 0 && day2 < 32) {
            /* looks promising for calendar dates, but check
               further if we don't have the canonical order
               or separator
            */
            if (dorder != YYYYMMDD || s1 != '-') {
                if (transform_daily_dates(dset, dorder, s1)) {
                    return -1;
                }
                /* the markers now use '-' as separator */
                s1 = '-';
            }
            pprintf(prn, _("Could be %s - %s\n"), lbl1, lbl2);
            ret = check_daily_dates(dset, &pd, reversed, prn);
            if (ret >= 0 && pd > 0) {
                /* the dates seem to be of a lower frequency */
                if (pd == 52) {
                    if (csv_weekly_data(dset)) {
                        ret = 52;
                    } else if (dorder == MMDDYYYY) {
                        /* maybe we guessed wrong */
                        retransform_daily_dates(dset);
                        dorder = DDMMYYYY;
                        goto tryagain;
                    } else {
                        ret = -1;
                    }
                } else {
                    int convert_pd = 0;

                    if (pd == 1 || pd == 4 || pd == 12) {
                        convert_pd = pd;
                    }
                    ret = time_series_label_check(dset,
                                                  *reversed,
                                                  skipstr,
                                                  convert_pd,
                                                  prn);
                    if (ret < 0 && dorder == MMDDYYYY) {
                        /* as above: try the other ordering */
                        retransform_daily_dates(dset);
                        dorder = DDMMYYYY;
                        goto tryagain;
                    }
                }
            }
            return ret;
        }
    } else {
        pprintf(prn, _("'%s' and '%s': couldn't get dates\n"), lbl1, lbl2);
    }

    return -1;
}
1022
/* Try to infer a data frequency from the single date label
   @lbl, whose first 4 characters are taken to be a year.

   @year: must hold an empty string on entry; up to 4 characters
   of @lbl are appended to it.
   @subp: receives the sub-period portion of the label when a
   recognized separator follows the year.
   @format: if non-NULL, receives an sscanf format for reading
   (year, sub-period) when the frequency is found to be 4 or 12.
   @prn: receives informative messages.

   Returns 1, 4 or 12 for annual, quarterly or monthly, or -1 if
   no frequency can be determined. A label of fewer than 4
   characters yields -1 without any attempt to read a separator
   or sub-period from beyond the end of the string.
*/
static int pd_from_date_label (const char *lbl, char *year, char *subp,
                               char *format, PRN *prn)
{
    const char *subchars = ".:QqMmPp-";
    int len = strlen(lbl);
    int try, pd = -1;

    strncat(year, lbl, 4);
    try = atoi(year);

    if (try > 0 && try < 3000) {
        pprintf(prn, _(" %s: probably a year... "), year);
    } else {
        pprintf(prn, _(" %s: probably not a year\n"), year);
    }

    if (len == 5) {
        pputs(prn, _(" but I can't make sense of the extra bit\n"));
    } else if (len == 4) {
        pputs(prn, _("and just a year\n"));
        pd = 1;
    } else if (len > 5) {
        /* there's room for a separator plus a sub-period */
        char sep = lbl[4];
        char sub[3], *s = NULL;
        int dashQ = 0;
        int p;

        if (strchr(subchars, sep)) {
            /* take at most two characters after the separator */
            *sub = '\0';
            strncat(sub, lbl + 5, 2);
            s = sub;
            if (len == 6 || (len == 7 && (sep == 'q' || sep == 'Q'))) {
                /* one digit, or 'q'/'Q' plus two characters of
                   which the first is skipped */
                if (len == 7) s++;
                p = atoi(s);
                if (p > 0 && p < 5) {
                    pprintf(prn, _("quarter %s?\n"), s);
                    pd = 4;
                } else {
                    pprintf(prn, "quarter %d: not possible\n", p);
                }
            } else if (len == 7) {
                if (*s == 'Q') {
                    /* YYYY-Qn? This is supported by SDMX */
                    dashQ = 1;
                    s++;
                }
                p = atoi(s);
                if (dashQ) {
                    if (p > 0 && p < 5) {
                        pprintf(prn, _("quarter %d?\n"), p);
                        pd = 4;
                    } else {
                        pprintf(prn, "quarter %d: not possible\n", p);
                    }
                } else {
                    if (p > 0 && p < 13) {
                        pprintf(prn, _("month %s?\n"), s);
                        pd = 12;
                    } else {
                        pprintf(prn, "month %d: not possible\n", p);
                    }
                }
            }
            strcpy(subp, s);
            if (format != NULL && (pd == 4 || pd == 12)) {
                if (dashQ) {
                    sprintf(format, "%%d%cQ%%d", sep);
                } else {
                    sprintf(format, "%%d%c%%d", sep);
                }
            }
        }
    }

    return pd;
}
1099
/* Determine whether the observation markers in @dset describe
   annual, quarterly or monthly time series, and if so record
   the frequency and starting observation in @dset.

   @reversed: non-zero if the markers run from last to first.
   @skipstr: passed on to consistent_qm_labels().
   @convert_pd: if non-zero, the markers are daily dates that
   should be converted to this frequency before examination.
   @prn: receives informative messages.

   Returns the frequency found by pd_from_date_label() if the
   labels are consistent with it (possibly revised from 12 to 4
   by consistent_qm_labels()), or -1 if they are not.
*/
static int time_series_label_check (DATASET *dset, int reversed,
                                    char *skipstr, int convert_pd,
                                    PRN *prn)
{
    char year[5] = {0};
    char sub[3] = {0};
    char format[8] = {0};
    char altobs[OBSLEN];
    char *first = dset->S[0];
    char *last = dset->S[dset->n - 1];
    const char *start = reversed ? last : first;
    int pd;

    if (convert_pd) {
        convert_daily_label(altobs, start, convert_pd);
        start = altobs;
    }

    pd = pd_from_date_label(start, year, sub, format, prn);

    if (pd == 1) {
        if (!consistent_year_labels(dset, reversed, convert_pd)) {
            pputs(prn, _(" but the dates are not complete and consistent\n"));
            return -1;
        }
        dset->pd = pd;
        strcpy(dset->stobs, year);
        dset->sd0 = atof(dset->stobs);
        strcpy(dset->endobs, last);
        dset->structure = TIME_SERIES;
    } else if (pd == 4 || pd == 12) {
        int guess = pd;
        int extra_zero = 0;

        if (!consistent_qm_labels(dset, reversed, convert_pd,
                                  skipstr, &pd, format,
                                  &extra_zero, prn)) {
            pputs(prn, _(" but the dates are not complete and consistent\n"));
            return -1;
        }

        dset->pd = pd;
        if (guess == 12 && pd == 4) {
            /* we switched the interpretation from monthly to
               quarterly: the sub-period is either a quarter
               with a leading zero (1960Q1 written as 1960:01)
               or an end-of-quarter month (1960Q1 written as
               1960:03)
            */
            int q = extra_zero ? atoi(sub + 1) : atoi(sub) / 3;

            sprintf(dset->stobs, "%s:%d", year, q);
        } else {
            sprintf(dset->stobs, "%s:%s", year, sub);
        }
        dset->sd0 = obs_str_to_double(dset->stobs);
        ntolabel(dset->endobs, dset->n - 1, dset);
    }

    return pd;
}
1168
/* Compare the integers given by the first four characters of
   @s1 and of @s2. Returns 1, after printing a remark to @prn,
   if the first is greater than the second, otherwise 0.
*/
static int dates_maybe_reversed (const char *s1,
                                 const char *s2,
                                 PRN *prn)
{
    char head1[5] = {0};
    char head2[5] = {0};

    strncat(head1, s1, 4);
    strncat(head2, s2, 4);

    if (atoi(head1) > atoi(head2)) {
        pputs(prn, _(" dates are reversed?\n"));
        return 1;
    }

    return 0;
}
1189
1190 /* e.g. "M1 1957", "M12 2009" */
1191
fix_IFS_data_labels(DATASET * dset)1192 static int fix_IFS_data_labels (DATASET *dset)
1193 {
1194 char *s1 = dset->S[0];
1195 char *s2 = dset->S[dset->n - 1];
1196 int ret = 0;
1197
1198 if ((*s1 == 'M' || *s1 == 'Q') && *s2 == *s1) {
1199 int n1 = strlen(s1);
1200 int n2 = strlen(s2);
1201
1202 if ((n1 == 7 || n1 == 8) && (n2 == 7 || n2 == 8) &&
1203 isdigit(s1[1]) && isdigit(s2[1])) {
1204 int pmax = (*s1 == 'M')? 12 : 4;
1205 char c, tmp[8], *s;
1206 int y, p, pbak = 0;
1207 int i, n, doit = 1;
1208
1209 for (i=0; i<dset->n; i++) {
1210 s = dset->S[i];
1211 n = strlen(s);
1212 if (n != 7 && n != 8) {
1213 doit = 0;
1214 break;
1215 }
1216 n = sscanf(s, "%c%d %d", &c, &p, &y);
1217 if (n != 3 || c != *s1) {
1218 doit = 0;
1219 break;
1220 }
1221 if (y < 1800 || y > 2500 || p <= 0 || p > pmax) {
1222 doit = 0;
1223 break;
1224 }
1225 if (i > 0 && p != pbak + 1 && p != 1) {
1226 doit = 0;
1227 break;
1228 }
1229 pbak = p;
1230 }
1231
1232 if (doit) {
1233 for (i=0; i<dset->n; i++) {
1234 s = dset->S[i];
1235 sscanf(s, "%c%d %d", &c, &p, &y);
1236 if (pmax == 12) {
1237 sprintf(tmp, "%d:%02d", y, p);
1238 } else {
1239 sprintf(tmp, "%d:%d", y, p);
1240 }
1241 if (strlen(tmp) > strlen(s)) {
1242 free(s);
1243 dset->S[i] = gretl_strdup(tmp);
1244 } else {
1245 strcpy(s, tmp);
1246 }
1247 }
1248 ret = 1;
1249 }
1250 }
1251 }
1252
1253 return ret;
1254 }
1255
/* Return the 1-based number of the month whose three-letter
   English abbreviation matches @s, or 0 if there is no match.
   Note that @s is passed to gretl_lower() before the comparison
   is made.
*/
static int month_number (char *s)
{
    const char *abbrev[] = {
        "jan", "feb", "mar", "apr",
        "may", "jun", "jul", "aug",
        "sep", "oct", "nov", "dec"
    };
    int m = 12;

    gretl_lower(s);

    /* count down: we end with m = 0 if nothing matches */
    while (m > 0 && strcmp(s, abbrev[m-1]) != 0) {
        m--;
    }

    return m;
}
1275
1276 /* e.g. "Jan-1980", for monthly or quarterly data */
1277
fix_mon_year_labels(DATASET * dset)1278 static int fix_mon_year_labels (DATASET *dset)
1279 {
1280 char *s1 = dset->S[0];
1281 char *s2 = dset->S[dset->n - 1];
1282 char m1[4] = {0};
1283 char m2[4] = {0};
1284 int yr1 = 0, yr2 = 0;
1285 int ret = 0;
1286
1287 if (strlen(s1) == 8 && strlen(s2) == 8 &&
1288 s1[3] == '-' && s2[3] == '-') {
1289 yr1 = atoi(s1 + 4);
1290 yr2 = atoi(s2 + 4);
1291 strncat(m1, s1, 3);
1292 strncat(m2, s2, 3);
1293 }
1294
1295 if (yr1 > 999 && yr1 < 3000 && yr2 > 999 && yr2 < 3000 &&
1296 month_number(m1) && month_number(m2)) {
1297 int i, p, pbak = 0;
1298 int dt, pd = 0;
1299 char *s;
1300
1301 for (i=0; i<dset->n; i++) {
1302 s = dset->S[i];
1303 if (strlen(s) != 8 || s[3] != '-') {
1304 pd = 0;
1305 break;
1306 }
1307 yr1 = atoi(s + 4);
1308 *m1 = '\0';
1309 strncat(m1, s, 3);
1310 if (yr1 < 1000 || yr1 >= 3000 ||
1311 (p = month_number(m1)) < 1) {
1312 pd = 0;
1313 break;
1314 }
1315 if (i > 0) {
1316 dt = p - pbak;
1317 if (dt != 1 && dt != 3 && p != 1) {
1318 pd = 0;
1319 break;
1320 }
1321 if (pd == 0 && dt > 0) {
1322 pd = (dt == 1)? 12 : 4;
1323 }
1324 }
1325 pbak = p;
1326 }
1327
1328 if (pd > 0) {
1329 for (i=0; i<dset->n; i++) {
1330 s = dset->S[i];
1331 yr1 = atoi(s + 4);
1332 *m1 = '\0';
1333 strncat(m1, s, 3);
1334 p = month_number(m1);
1335 if (pd == 12) {
1336 sprintf(dset->S[i], "%d:%02d", yr1, p);
1337 } else {
1338 sprintf(dset->S[i], "%d:%g", yr1, ceil((3+p)/4.0));
1339 }
1340 }
1341 ret = 1;
1342 }
1343 }
1344
1345 return ret;
1346 }
1347
1348 /* Attempt to parse CSV row labels as dates. Return -1 if this
1349 doesn't work out, or 0 if the labels seem to be just integer
1350 observation numbers, else return the inferred data frequency.
1351 */
1352
test_markers_for_dates(DATASET * dset,int * reversed,char * skipstr,PRN * prn)1353 int test_markers_for_dates (DATASET *dset, int *reversed,
1354 char *skipstr, PRN *prn)
1355 {
1356 char endobs[OBSLEN];
1357 int n = dset->n;
1358 char *lbl1 = dset->S[0];
1359 char *lbl2 = dset->S[n - 1];
1360 int len1 = strlen(lbl1);
1361 int len2 = strlen(lbl2);
1362 int pd = -1;
1363
1364 if (skipstr != NULL && *skipstr != '\0') {
1365 return time_series_label_check(dset, *reversed, skipstr, 0, prn);
1366 }
1367
1368 pprintf(prn, _(" first row label \"%s\", last label \"%s\"\n"),
1369 lbl1, lbl2);
1370
1371 /* are the labels (probably) just 1, 2, 3 etc.? */
1372 sprintf(endobs, "%d", n);
1373 if (!strcmp(lbl1, "1") && !strcmp(lbl2, endobs)) {
1374 return 0;
1375 }
1376
1377 if (fix_IFS_data_labels(dset) || fix_mon_year_labels(dset)) {
1378 lbl1 = dset->S[0];
1379 lbl2 = dset->S[n - 1];
1380 len1 = strlen(lbl1);
1381 }
1382
1383 /* labels are of different lengths? */
1384 if (len1 != len2) {
1385 if (abs(len1 - len2) > 1) {
1386 return -1;
1387 } else if (len2 > len1) {
1388 len1 = len2;
1389 }
1390 }
1391
1392 pputs(prn, _("trying to parse row labels as dates...\n"));
1393
1394 if (len1 == 8 || len1 == 10) {
1395 /* daily data? */
1396 pd = csv_daily_date_check(dset, reversed, skipstr, prn);
1397 } else if (len1 >= 4) {
1398 /* annual, quarterly, monthly? */
1399 if (isdigit((unsigned char) lbl1[0]) &&
1400 isdigit((unsigned char) lbl1[1]) &&
1401 isdigit((unsigned char) lbl1[2]) &&
1402 isdigit((unsigned char) lbl1[3])) {
1403 *reversed = dates_maybe_reversed(lbl1, lbl2, prn);
1404 pd = time_series_label_check(dset, *reversed, skipstr, 0, prn);
1405 } else {
1406 pputs(prn, _(" definitely not a four-digit year\n"));
1407 }
1408 }
1409
1410 if (pd <= 0 && *reversed) {
1411 /* give up the "reversed" notion if we didn't get
1412 a workable time-series interpretation */
1413 *reversed = 0;
1414 }
1415
1416 return pd;
1417 }
1418
/* Called when a byte that is neither printable nor whitespace has
   been read from @fp. Re-reads the current line from its start
   (@pos is the 0-based offset within the line of the byte just
   read) plus a few bytes of lookahead, and checks whether that
   text validates as UTF-8 or, failing that, can be converted from
   ISO-8859-15. The stream position is restored before returning.

   Returns 1 if the text seems acceptable, 0 if not (or if the
   scratch buffer cannot be allocated).
*/

static int utf8_ok (gzFile fp, int pos)
{
    long mark = gztell(fp);
    int len = pos + 9;
    char *test;
    int i, ch, ret = 0;

    test = malloc(len + 1);
    if (test == NULL) {
        /* the stream has not been moved yet, so just bail out */
        return 0;
    }

    gzseek(fp, mark - pos - 1, SEEK_SET);

    for (i=0; i<len; i++) {
        ch = gzgetc(fp);
        if (ch == EOF) {
            /* don't store the EOF indicator as if it were data */
            break;
        }
        test[i] = ch;
    }
    test[i] = '\0';

    if (g_utf8_validate(test, -1, NULL)) {
        ret = 1;
    } else {
        GError *gerr = NULL;
        gsize wrote = 0;
        gchar *tr;

        /* try for iso-8859? */
        tr = g_convert(test, -1, "UTF-8", "ISO-8859-15",
                       NULL, &wrote, &gerr);
        if (gerr != NULL) {
            g_error_free(gerr);
        } else {
            g_free(tr);
            ret = 1;
        }
    }

    free(test);

    gzseek(fp, mark, SEEK_SET);

    return ret;
}
1457
/* codes for the Unicode encodings distinguished when checking
   for a Byte Order Mark; 0 is left to mean "none found"
*/
enum {
    UTF_8  = 1,
    UTF_16 = 2,
    UTF_32 = 3
};
1463
1464 /* If we got a UTF-16 or UTF-32 BOM, try recoding to
1465 UTF-8 before parsing data. We write the recoded text
1466 to a temporary file in the user's "dotdir" (and
1467 then delete that file once we're done).
1468 */
1469
csv_recode_input(gzFile * fpp,const char * fname,gchar ** pfname,int ucode,PRN * prn)1470 static int csv_recode_input (gzFile *fpp,
1471 const char *fname,
1472 gchar **pfname,
1473 int ucode,
1474 PRN *prn)
1475 {
1476 const gchar *from_set =
1477 (ucode == UTF_32)? "UTF-32" : "UTF-16";
1478 gchar *altname = NULL;
1479 int err = 0;
1480
1481 /* the current stream is not useable as is,
1482 so shut it down
1483 */
1484 gzclose(*fpp);
1485 *fpp = NULL;
1486
1487 /* we'll recode to a temp file in dotdir */
1488 altname = g_strdup_printf("%srecode_tmp.u8", gretl_dotdir());
1489
1490 err = gretl_recode_file(fname, altname,
1491 from_set, "UTF-8",
1492 prn);
1493
1494 if (!err) {
1495 /* try reattaching the stream */
1496 *fpp = gretl_gzopen(altname, "rb");
1497 if (*fpp == NULL) {
1498 gretl_remove(altname);
1499 err = E_FOPEN;
1500 } else {
1501 pputs(prn, "switched to recoded input\n");
1502 *pfname = altname;
1503 altname = NULL;
1504 }
1505 }
1506
1507 g_free(altname);
1508
1509 return err;
1510 }
1511
1512 /* Check the first 4 bytes of "CSV" input for a Byte Order
1513 Mark. If we find the UTF-8 BOM (typically written by
1514 Microsoft tools), simply record the fact so that we can
1515 skip it on reading. But if we find a BOM indicating a
1516 16-bit or 32-bit unicode encoding, flag this by returning
1517 a non-zero @ucode value; in that case we'll attempt a
1518 full recording of the input (via GLib) before we start
1519 reading data.
1520 */
1521
csv_unicode_check(gzFile fp,csvdata * c,PRN * prn)1522 static int csv_unicode_check (gzFile fp, csvdata *c, PRN *prn)
1523 {
1524 unsigned char b[4];
1525 int n = gzread(fp, b, 4);
1526 int ucode = 0;
1527
1528 if (n == 4) {
1529 if (b[0] == 0xEF && b[1] == 0xBB && b[2] == 0xBF) {
1530 pputs(prn, "got UTF-8 BOM\n");
1531 ucode = UTF_8;
1532 } else if (b[0] == 0xFE && b[1] == 0xFF) {
1533 pputs(prn, "got UTF-16BE, will try recoding\n");
1534 ucode = UTF_16;
1535 } else if (b[0] == 0xFF && b[1] == 0xFE) {
1536 if (b[2] == 0 && b[3] == 0) {
1537 pputs(prn, "got UTF-32LE, will try recoding\n");
1538 ucode = UTF_32;
1539 } else {
1540 pputs(prn, "got UTF-16LE, will try recoding\n");
1541 ucode = UTF_16;
1542 }
1543 } else if (b[0] == 0 && b[1] == 0 &&
1544 b[0] == 0xFE && b[1] == 0xFF) {
1545 pputs(prn, "got UTF-32BE, will try recoding\n");
1546 ucode = UTF_32;
1547 }
1548 }
1549
1550 if (ucode == UTF_8) {
1551 csv_set_has_bom(c);
1552 gzseek(fp, 3, SEEK_SET);
1553 ucode = 0;
1554 } else {
1555 gzrewind(fp);
1556 }
1557
1558 return ucode;
1559 }
1560
1561 /* The function below checks for the maximum line length in the given
1562 file. It also checks for extraneous binary data (the file is
1563 supposed to be plain text), and checks whether the 'delim'
1564 character is present in the file, on a non-comment line (where
1565 a comment line is one that starts with '#').
1566
1567 In addition, we check whether the file has a trailing comma on every
1568 line, and for the numbers of double- and single-quote characters
1569 to try to determine which, if either, is used to indicate quoted
1570 fields in the input.
1571 */
1572
csv_max_line_length(gzFile fp,csvdata * cdata,PRN * prn)1573 static int csv_max_line_length (gzFile fp, csvdata *cdata, PRN *prn)
1574 {
1575 int c, c1, cbak = 0, cc = 0;
1576 int comment = 0, maxlinelen = 0;
1577 int max_ldquo = 0, max_lsquo = 0;
1578 int min_ldquo = 0, min_lsquo = 0;
1579 int ldquo = 0, lsquo = 0;
1580 int ndquo = 0, nsquo = 0;
1581 int crlf = 0, lines = 0;
1582
1583 csv_set_trailing_comma(cdata); /* just provisionally */
1584
1585 while ((c = gzgetc(fp)) != EOF) {
1586 if (c == 0x0d) {
1587 /* CR */
1588 c1 = gzgetc(fp);
1589 if (c1 == EOF) {
1590 break;
1591 } else if (c1 == 0x0a) {
1592 /* CR + LF -> LF */
1593 crlf = 1;
1594 c = c1;
1595 } else {
1596 /* Mac-style: CR not followed by LF */
1597 c = 0x0a;
1598 gzungetc(c1, fp);
1599 }
1600 }
1601 if (c == 0x0a) {
1602 if (cc > maxlinelen) {
1603 maxlinelen = cc;
1604 }
1605 cc = 0;
1606 if (cbak != 0 && cbak != ',') {
1607 csv_unset_trailing_comma(cdata);
1608 }
1609 lines++;
1610 if (ldquo > max_ldquo) {
1611 max_ldquo = ldquo;
1612 } else if (ldquo > 0 && ldquo < max_ldquo) {
1613 min_ldquo = ldquo;
1614 }
1615 if (lsquo > max_lsquo) {
1616 max_lsquo = lsquo;
1617 } else if (lsquo > 0 && lsquo < max_lsquo) {
1618 min_lsquo = lsquo;
1619 }
1620 ldquo = lsquo = 0;
1621 continue;
1622 }
1623 cbak = c;
1624 if (!isspace((unsigned char) c) && !isprint((unsigned char) c) &&
1625 !(c == CTRLZ) && !utf8_ok(fp, cc)) {
1626 pprintf(prn, _("Binary data (%d) encountered (line %d:%d): "
1627 "this is not a valid text file\n"),
1628 c, lines + 1, cc + 1);
1629 return -1;
1630 }
1631 if (cc == 0) {
1632 comment = (c == '#');
1633 }
1634 if (!comment) {
1635 if (c == '\t') {
1636 /* let's ignore trailing tabs in this heuristic */
1637 c1 = gzgetc(fp);
1638 if (c1 != 0x0d && c1 != 0x0a) {
1639 csv_set_got_tab(cdata);
1640 }
1641 gzungetc(c1, fp);
1642 }
1643 if (c == ';') {
1644 csv_set_got_semi(cdata);
1645 }
1646 if (c == cdata->delim) {
1647 csv_set_got_delim(cdata);
1648 } else if (c == '"') {
1649 ldquo++;
1650 ndquo++;
1651 } else if (c == '\'') {
1652 lsquo++;
1653 nsquo++;
1654 }
1655 }
1656 cc++;
1657 }
1658
1659 if (maxlinelen == 0) {
1660 pputs(prn, _("Data file is empty\n"));
1661 } else if (csv_has_trailing_comma(cdata)) {
1662 pputs(prn, _("Data file has trailing commas\n"));
1663 }
1664
1665 if (ndquo > 0 || nsquo > 0) {
1666 /* candidates for quotation character? */
1667 int cands[2] = {0};
1668
1669 if (ndquo > 0) {
1670 pprintf(prn, _("Found %d double-quotes, max %d per line\n"),
1671 ndquo, max_ldquo);
1672 }
1673 if (nsquo > 0) {
1674 pprintf(prn, _("Found %d single-quotes, max %d per line\n"),
1675 nsquo, max_lsquo);
1676 }
1677 if (max_ldquo > 0 && max_ldquo % 2 == 0) {
1678 /* double-quote is a candidate? */
1679 if (min_ldquo > 0 && min_ldquo % 2) {
1680 ; /* nope */
1681 } else {
1682 cands[0] = 1;
1683 }
1684 }
1685 if (max_lsquo > 0 && max_lsquo % 2 == 0) {
1686 /* single-quote is a candidate? */
1687 if (min_lsquo > 0 && min_lsquo % 2) {
1688 ; /* nope */
1689 } else {
1690 cands[1] = 1;
1691 }
1692 }
1693 if (cands[0] && cands[1]) {
1694 /* hmm, rule one out: prefer the more numerous */
1695 if (nsquo > ndquo) {
1696 cands[0] = 0;
1697 } else {
1698 cands[1] = 0;
1699 }
1700 }
1701 if (cands[0]) {
1702 pputs(prn, _("Assuming double-quote is the relevant "
1703 "quotation character\n"));
1704 cdata->qchar = '"';
1705 } else if (cands[1]) {
1706 pputs(prn, _("Assuming single-quote is the relevant "
1707 "quotation character\n"));
1708 cdata->qchar = '\'';
1709 }
1710 }
1711
1712 if (maxlinelen > 0) {
1713 /* allow for newline and null terminator */
1714 maxlinelen += 2 + crlf;
1715 }
1716
1717 return maxlinelen;
1718 }
1719
/* Evaluates to non-zero if @d is none of comma, semicolon or tab.
   NOTE(review): despite the name, this is therefore true for a
   plain-space delimiter. The argument is evaluated more than
   once, so it should be free of side effects.
*/
#define nonspace_delim(d) ((d) != ',' && (d) != ';' && (d) != '\t')
1721
/* Count the delimited fields on the current line, c->line. When
   quotes are being preserved, delimiters occurring between
   quotation characters (c->qchar) are not counted. A single
   leading space is skipped when the delimiter is itself a space.
   Returns the number of fields, which is the number of delimiters
   counted plus one.
*/

static int count_csv_fields (csvdata *c)
{
    const char *p = c->line;
    int quoted = 0;
    int ndelim = 0;
    int prev;

    if (*p == c->delim && *p == ' ') {
        p++;
    }

    while (*p != '\0') {
        if (csv_keep_quotes(c) && *p == c->qchar) {
            quoted = !quoted;
        } else if (!quoted && *p == c->delim) {
            ndelim++;
        }
        prev = *p++;
        /* Problem: (when) should a trailing delimiter be read as an
           implicit NA? We count it as starting a further (empty)
           field only if it is comma, semicolon or tab; otherwise
           it is discounted here.
        */
        if (*p == '\0' && prev == c->delim && nonspace_delim(c->delim)) {
            ndelim--;
        }
    }

    return ndelim + 1;
}
1751
/* Replace with a space any comma in @s that falls between
   double-quote characters; the quotes themselves and any commas
   outside of quotes are left unchanged.
*/

static void purge_quoted_commas (char *s)
{
    int quoted = 0;
    char *p;

    for (p = s; *p != '\0'; p++) {
        switch (*p) {
        case '"':
            quoted = !quoted;
            break;
        case ',':
            if (quoted) {
                *p = ' ';
            }
            break;
        default:
            break;
        }
    }
}
1765
/* Remove from @s, in place, every space character that does not
   fall between double-quote characters. Spaces inside a quoted
   stretch, and the quotes themselves, are preserved.

   This is done in a single pass with separate read and write
   positions, so that the character following a removed space is
   always examined: runs of spaces are fully removed, and a quote
   that directly follows a space is correctly registered.
*/

static void purge_unquoted_spaces (char *s)
{
    const char *src = s;
    char *dst = s;
    int inquote = 0;

    while (*src != '\0') {
        if (*src == '"') {
            inquote = !inquote;
        }
        if (inquote || *src != ' ') {
            *dst++ = *src;
        }
        src++;
    }

    *dst = '\0';
}
1779
/* Tidy up the current input line, c->line, in place prior to
   parsing:

   - strip a trailing LF and/or CR;
   - unless quotes are being kept, and if the delimiter is comma,
     neutralize commas that occur inside double quotes;
   - if the delimiter is not a space and @nospace is non-zero,
     remove unquoted spaces; if the delimiter is a space, call
     compress_spaces() on the line instead;
   - unless quotes are being kept, delete double-quote characters;
   - if the file was flagged as having trailing commas, chop a
     trailing comma.

   An empty line is handled safely, and the final chop is applied
   only if the last character actually is a comma (a last line
   without a terminator is not covered by the trailing-comma
   detection, so it may lack one).
*/

static void compress_csv_line (csvdata *c, int nospace)
{
    size_t n = strlen(c->line);

    if (n > 0 && c->line[n-1] == 0x0a) {
        c->line[--n] = '\0';
    }

    if (n > 0 && c->line[n-1] == 0x0d) {
        c->line[--n] = '\0';
    }

    if (!csv_keep_quotes(c) && c->delim == ',') {
        purge_quoted_commas(c->line);
    }

    if (c->delim != ' ') {
        if (nospace) {
            purge_unquoted_spaces(c->line);
        }
    } else {
        compress_spaces(c->line);
    }

    if (!csv_keep_quotes(c)) {
        gretl_delchar('"', c->line);
    }

    if (csv_has_trailing_comma(c)) {
        /* chop trailing comma */
        n = strlen(c->line);
        if (n > 0 && c->line[n-1] == ',') {
            c->line[n-1] = '\0';
        }
    }
}
1818
import_obs_label(const char * s)1819 int import_obs_label (const char *s)
1820 {
1821 char tmp[VNAMELEN];
1822
1823 if (s == NULL) {
1824 return 1;
1825 }
1826
1827 if (!strcmp(s, "\"\"") || !strcmp(s, "''")) {
1828 return 1;
1829 }
1830
1831 if (*s == '"' || *s == '\'') s++;
1832
1833 if (*s == '\0') {
1834 return 1;
1835 }
1836
1837 if (strlen(s) > VNAMELEN - 1) {
1838 return 0;
1839 }
1840
1841 *tmp = '\0';
1842 strncat(tmp, s, VNAMELEN - 1);
1843 gretl_lower(tmp);
1844
1845 return (!strcmp(tmp, "obs") ||
1846 !strcmp(tmp, "date") ||
1847 !strcmp(tmp, "year") ||
1848 !strcmp(tmp, "period") ||
1849 !strcmp(tmp, "observation") ||
1850 !strcmp(tmp, "observation_date"));
1851 }
1852
/* In the context of a "join" operation: returns 1 if @s, the
   heading of the first column, matches one of the column names
   recorded in c->jspec, otherwise 0. An empty @s never matches.
*/

static int join_wants_col_zero (csvdata *c, const char *s)
{
    int k = 0;

    if (*s == '\0') {
        return 0;
    }

    while (k < c->jspec->ncols) {
        const char *wanted = c->jspec->colnames[k++];

        if (wanted != NULL && strcmp(s, wanted) == 0) {
            return 1;
        }
    }

    return 0;
}
1871
/* Examine the first field on @line, the first "real" line of the
   input, to determine the status of the first column.

   If the line starts with a (non-space) delimiter the first
   column is flagged as blank. Otherwise the first field is
   extracted (truncated if need be to fit the local buffer) and,
   unless a join operation wants this column or all columns are
   wanted as data, it is tested via import_obs_label(); if that
   succeeds the column is flagged as holding observation labels.

   Side effect: if a tab is found before the first delimiter,
   c->delim is switched to tab and the check is restarted.
*/

static void check_first_field (const char *line, csvdata *c, PRN *prn)
{
    const char *s;

 tryagain:
    s = line;

    if (c->delim != ' ' && *s == c->delim) {
        csv_set_blank_column(c);
    } else {
        char field1[OBSLEN];
        int i = 0;

        if (c->delim == ' ' && *s == ' ') {
            s++;
        }

        /* stop one short of the buffer size, leaving room
           for the terminating NUL */
        while (*s && i < (int) sizeof field1 - 1) {
            if (*s == c->delim) {
                break;
            } else if (*s == '\t') {
                /* presence of a tab must indicate tab-separation? */
                c->delim = '\t';
                goto tryagain;
            }
            field1[i++] = *s++;
        }

        field1[i] = '\0';
        iso_to_ascii(field1);

        if (joining(c) && join_wants_col_zero(c, field1)) {
            return;
        } else if (csv_all_cols(c)) {
            /* open/append wants all columns as data */
            return;
        }

        pprintf(prn, _("   first field: '%s'\n"), field1);

        if (import_obs_label(field1)) {
            pputs(prn, _("   seems to be observation label\n"));
            csv_set_obs_column(c);
        }
    }
}
1918
import_na_init(void)1919 void import_na_init (void)
1920 {
1921 const char *s = get_csv_na_read_string();
1922
1923 strcpy(import_na, s);
1924 }
1925
1926 /* Returns 1 if the string @s should be counted representing
1927 an NA or missing value, 0 otherwise. If there is a user-set
1928 "csv_read_na" value this is used for comparison, otherwise
1929 a set of default values is consulted.
1930 */
1931
import_na_string(const char * s)1932 int import_na_string (const char *s)
1933 {
1934 if (*import_na != '\0' && strcmp(import_na, "default")) {
1935 /* the user has set a specific "NA" string, so
1936 respect it */
1937 return !strcmp(s, import_na);
1938 } else {
1939 /* consult a list of common representations of NA */
1940 const char *defaults[] = {
1941 "NA",
1942 "N.A.",
1943 "n.a.",
1944 "na",
1945 "n/a",
1946 "N/A",
1947 "#N/A",
1948 "NaN",
1949 ".NaN",
1950 ".",
1951 "..",
1952 "-999",
1953 "-9999",
1954 "-",
1955 NULL
1956 };
1957 int i;
1958
1959 for (i=0; defaults[i] != NULL; i++) {
1960 if (!strcmp(s, defaults[i])) {
1961 return 1;
1962 }
1963 }
1964 }
1965
1966 return 0;
1967 }
1968
/* Determine whether the field @str, for variable @i at (1-based)
   observation @t, should be taken as a missing value. This is so
   if the field is empty or consists of just two double quotes
   (2021-03-03: let '""' indicate missing), or if it is accepted
   by import_na_string().

   If @miss_shown is non-NULL a message is printed to @prn and the
   counter incremented, but only while @t is less than 80 or the
   counter is less than @i, which limits the volume of output.

   Returns 1 if the value is missing, otherwise 0.
*/

static int csv_missval (const char *str, int i, int t,
                        int *miss_shown, PRN *prn)
{
    int blank = (*str == '\0' || strcmp(str, "\"\"") == 0);
    int miss = blank;

    if (blank && miss_shown != NULL &&
        (t < 80 || *miss_shown < i)) {
        pprintf(prn, _("   the cell for variable %d, obs %d "
                       "is empty: treating as missing value\n"),
                i, t);
        *miss_shown += 1;
    }

    if (import_na_string(str)) {
        if (miss_shown != NULL &&
            (t < 80 || *miss_shown < i)) {
            pprintf(prn, _("   warning: missing value for variable "
                           "%d, obs %d\n"), i, t);
            *miss_shown += 1;
        }
        miss = 1;
    }

    return miss;
}
2000
2001 /* In the case where we think we've found thousands
2002 separators in numerical input, provisionally mark
2003 all "non-numeric" values as NAs; we do this prior
2004 to a second pass through the data.
2005 */
2006
revise_non_numeric_values(csvdata * c)2007 static void revise_non_numeric_values (csvdata *c)
2008 {
2009 int i, t;
2010
2011 for (i=1; i<c->dset->v; i++) {
2012 for (t=0; t<c->dset->n; t++) {
2013 if (c->dset->Z[i][t] == NON_NUMERIC) {
2014 c->dset->Z[i][t] = NADBL;
2015 }
2016 }
2017 }
2018 }
2019
non_numeric_check(DATASET * dset,int ** plist,gretl_string_table ** pst,PRN * prn)2020 int non_numeric_check (DATASET *dset, int **plist,
2021 gretl_string_table **pst,
2022 PRN *prn)
2023 {
2024 int *list = NULL;
2025 int i, j, t, nn = 0;
2026 int err = 0;
2027
2028 #if CDEBUG > 1
2029 fprintf(stderr, "non_numeric_check: testing %d series, pst = %p\n",
2030 dset->v - 1, (void *) pst);
2031 #endif
2032
2033 if (pst == NULL) {
2034 /* not interested in string-valued series/columns */
2035 for (i=1; i<dset->v; i++) {
2036 for (t=0; t<dset->n; t++) {
2037 if (dset->Z[i][t] == NON_NUMERIC) {
2038 dset->Z[i][t] = NADBL;
2039 }
2040 }
2041 }
2042 return 0;
2043 }
2044
2045 for (i=1; i<dset->v; i++) {
2046 for (t=0; t<dset->n; t++) {
2047 if (dset->Z[i][t] == NON_NUMERIC) {
2048 nn++;
2049 break;
2050 }
2051 }
2052 }
2053
2054 #if CDEBUG > 1
2055 fprintf(stderr, " found %d candidate series\n", nn);
2056 #endif
2057
2058 if (nn == 0) {
2059 return 0; /* nothing to be done */
2060 }
2061
2062 list = gretl_list_new(nn);
2063 if (list == NULL) {
2064 return E_ALLOC;
2065 }
2066
2067 j = 1;
2068 for (i=1; i<dset->v; i++) {
2069 for (t=0; t<dset->n; t++) {
2070 if (dset->Z[i][t] == NON_NUMERIC) {
2071 list[j++] = i;
2072 break;
2073 }
2074 }
2075 }
2076
2077 #if CDEBUG > 1
2078 printlist(list, "non-numeric vars list");
2079 #endif
2080
2081 for (i=1; i<=list[0]; i++) {
2082 /* check each member of @list */
2083 double nnfrac;
2084 int nnon = 0;
2085 int tnon = -1;
2086 int nok = 0;
2087 int v = list[i];
2088
2089 series_set_flag(dset, v, VAR_DISCRETE);
2090
2091 for (t=0; t<dset->n; t++) {
2092 if (dset->Z[v][t] == NON_NUMERIC) {
2093 if (tnon < 0) {
2094 /* record the first non-numeric obs */
2095 tnon = t + 1;
2096 }
2097 nnon++;
2098 } else if (!na(dset->Z[v][t])) {
2099 nok++;
2100 }
2101 }
2102
2103 nnfrac = (nok == 0)? 1.0 : nnon / (double) (nnon + nok);
2104 pprintf(prn, _("variable %d (%s): non-numeric values = %d "
2105 "(%.2f percent)\n"), v, dset->varname[v],
2106 nnon, 100 * nnfrac);
2107 if ((nnon < 2 && dset->n > 2) || nnfrac < 0.05) {
2108 /* if we got just a few non-numeric values, we'll assume
2109 that the data file is broken
2110 */
2111 pprintf(prn, _("ERROR: variable %d (%s), observation %d, "
2112 "expected numeric value\n"),
2113 v, dset->varname[v], tnon);
2114 err = E_DATA;
2115 break;
2116 }
2117 }
2118
2119 if (!err) {
2120 pputs(prn, _("allocating string table\n"));
2121 *pst = gretl_string_table_new(list);
2122 if (*pst == NULL) {
2123 err = E_ALLOC;
2124 }
2125 }
2126
2127 if (err) {
2128 free(list);
2129 } else {
2130 *plist = list;
2131 }
2132
2133 return err;
2134 }
2135
/* Wrapper for non_numeric_check() in the CSV context. When the
   data are wanted as a matrix no string table is requested
   (NULL is passed in its place). On success the resulting list
   and string table, either of which may be NULL, are attached
   to @c as c->codelist and c->st. Returns 0 on success or the
   error code from non_numeric_check().
*/

static int csv_non_numeric_check (csvdata *c, PRN *prn)
{
    gretl_string_table *st = NULL;
    gretl_string_table **pst;
    int *nlist = NULL;
    int err;

    pst = csv_as_matrix(c) ? NULL : &st;
    err = non_numeric_check(c->dset, &nlist, pst, prn);

    if (err) {
        return err;
    }

    c->codelist = nlist;
    c->st = st;

    return 0;
}
2155
2156 /* Handle the case in "join" where the user specified some time
2157 columns for conversion to numeric and also gave a specific format
2158 for the conversion.
2159 */
2160
special_time_val(const char * s,const char * fmt,int m_means_q)2161 static double special_time_val (const char *s, const char *fmt,
2162 int m_means_q)
2163 {
2164 struct tm t = {0};
2165 char *test;
2166
2167 test = strptime(s, fmt, &t);
2168
2169 if (test == NULL || *test != '\0') {
2170 /* conversion didn't work right */
2171 return NADBL;
2172 } else {
2173 int y, m, d;
2174
2175 y = t.tm_year + 1900;
2176 m = t.tm_mon + 1;
2177 d = t.tm_mday;
2178
2179 if (m_means_q) {
2180 /* convert to 1st month of quarter */
2181 if (m == 2) m = 4;
2182 else if (m == 3) m = 7;
2183 else if (m == 4) m = 10;
2184 else if (m != 1) {
2185 return NADBL;
2186 }
2187 }
2188
2189 if (d == 0) d = 1;
2190
2191 return 10000*y + 100*m + d;
2192 }
2193 }
2194
/* Returns the number of occurrences of the character @c in
   the string @s (the terminating NUL is never counted).
*/

static int char_count (char c, const char *s)
{
    const char *p;
    int hits = 0;

    for (p = s; *p != '\0'; p++) {
        hits += (*p == c);
    }

    return hits;
}
2206
2207 /* Follow-up check for the case where we think we might
2208 have found a thousands separator: each occurrence of
2209 the putative separator must be followed by exactly 3
2210 digits: we set c->thousep to an invalid value if this
2211 is not the case.
2212 */
2213
validate_thousep(csvdata * c,const char * s)2214 static void validate_thousep (csvdata *c, const char *s)
2215 {
2216 int nd;
2217
2218 while (*s) {
2219 if (*s == c->thousep) {
2220 nd = 0;
2221 s++;
2222 while (*s) {
2223 if (isdigit(*s)) {
2224 nd++;
2225 s++;
2226 } else {
2227 break;
2228 }
2229 }
2230 if (nd != 3) {
2231 /* nope! */
2232 #if CDEBUG
2233 fprintf(stderr, "validate_thousep: no: '%c' is followed by %d digits\n",
2234 c->thousep, nd);
2235 #endif
2236 c->thousep = -1;
2237 break;
2238 }
2239 } else {
2240 s++;
2241 }
2242 }
2243 }
2244
2245 /* Initial heuristic for detecting a thousands separator,
2246 where the string @s has been determined to contain
2247 nothing but digits, dot and comma (allowing for a leading
2248 minus).
2249
2250 1) If the string contains both comma and dot, whichever
2251 character appears to the left cannot be the decimal
2252 separator and may be a thousands separator.
2253
2254 2) If more than one comma appears in the string, comma
2255 cannot be the decimal character and might be a thousands
2256 separator; mutatis mutandis for dot.
2257 */
2258
test_for_thousands_sep(csvdata * c,const char * s)2259 static void test_for_thousands_sep (csvdata *c, const char *s)
2260 {
2261 const char *p1 = strrchr(s, '.');
2262 const char *p2 = strrchr(s, ',');
2263 char thousep = 0;
2264
2265 if (p1 != NULL && p2 != NULL) {
2266 thousep = (p2 - p1 > 0)? '.' : ',';
2267 } else if (p1 != NULL && char_count('.', s) > 0) {
2268 thousep = '.';
2269 } else if (p2 != NULL && char_count(',', s) > 0) {
2270 thousep = ',';
2271 }
2272
2273 if (c->thousep > 0) {
2274 if (thousep != 0 && thousep != c->thousep) {
2275 /* no consistent interpretation exists */
2276 c->thousep = -1; /* invalid */
2277 }
2278 } else if (thousep != 0) {
2279 /* we have a candidate for testing */
2280 char *test, tmp[CSVSTRLEN];
2281
2282 strcpy(tmp, s);
2283 gretl_delchar(thousep, tmp);
2284 if (thousep == '.' && get_local_decpoint() == '.') {
2285 gretl_charsub(tmp, ',', '.');
2286 }
2287 errno = 0;
2288 strtod(tmp, &test);
2289 if (*test == '\0' && errno == 0) {
2290 c->thousep = thousep;
2291 }
2292 }
2293
2294 if (c->thousep && thousep != 0) {
2295 validate_thousep(c, s);
2296 }
2297 }
2298
/* Returns 1 if @s, after skipping one leading minus sign if
   present, is composed entirely of decimal digits, dots and
   commas (which includes the case where nothing remains),
   otherwise 0.
*/

static int all_digits_and_seps (const char *s)
{
    static const char allowed[] = "0123456789.,";

    if (*s == '-') {
        s++;
    }

    /* the span of allowed characters must reach the terminator */
    return s[strspn(s, allowed)] == '\0';
}
2307
/* Evaluate the string @s, for series @i, given that it did not
   convert as a plain number.

   If the series is flagged as a time column the string is
   converted to a numeric date: via special_time_val() if a
   specific format is on record, otherwise on the assumption of
   ISO 8601 extended format (YYYY-MM-DD), giving the number
   YYYYMMDD; NADBL is returned if this conversion fails.

   Otherwise NON_NUMERIC is returned, but as a side effect the
   string may be scanned for thousands separators (see below),
   which can modify c->thousep.
*/

static double eval_non_numeric (csvdata *c, int i, const char *s)
{
    if (series_get_flags(c->dset, i) & VAR_TIMECOL) {
        char *fmt = NULL;
        int mq = 0;
        int y, m, d;

        if (timecol_get_format(c->dset, i, &fmt, &mq)) {
            /* the user gave a specific format for this */
            return special_time_val(s, fmt, mq);
        }

        /* default: ISO 8601 extended */
        if (sscanf(s, "%d-%d-%d", &y, &m, &d) == 3) {
            return 10000*y + 100*m + d;
        }

        return NADBL;
    }

    /* Here we consider the possibility that although @s does not
       validate as numeric according to the C library, it is by
       intent numeric but includes one or more thousands
       separators.

       The condition c->thousep >= 0 requires that we haven't
       already ruled out this interpretation due to inconsistency,
       and !csv_scrub_thousep(c) requires that we're not on a
       second pass through the data.
    */
    if (c->thousep >= 0 && !csv_scrub_thousep(c) &&
        all_digits_and_seps(s)) {
        test_for_thousands_sep(c, s);
    }

    return NON_NUMERIC;
}
2348
/* Assess the result of a strtod() conversion, where @test is
   the end-pointer set by strtod(), @x is the value it returned
   and errno holds its error status. @s is the original string,
   used only for error reporting.

   Returns 1 if the whole string was consumed and there was no
   error, or if the only error was ERANGE on a non-zero result
   smaller than 0.001 in absolute value (subnormal, but we let
   that pass). Otherwise returns 0, after calling perror() if
   errno is set.
*/

static int converted_ok (const char *s, char *test, double x)
{
    int ok;

    if (*test != '\0') {
        /* unconverted trailing matter: definitely not OK */
        ok = 0;
    } else if (errno == 0) {
        ok = 1;
    } else {
        ok = (errno == ERANGE && fabs(x) > 0 && fabs(x) < 0.001);
    }

    if (!ok && errno) {
        perror(s);
    }

    return ok;
}
2363
/* If @s both starts and ends with a double quote (and is at
   least two characters long), strip that pair of quotes in
   place; otherwise leave @s unmodified. Returns @s.
*/

static char *csv_unquote (char *s)
{
    int len;

    if (s[0] != '"') {
        return s;
    }

    len = strlen(s);

    if (len > 1 && s[len-1] == '"') {
        /* slide the interior down over the opening quote */
        memmove(s, s + 1, len - 2);
        s[len-2] = '\0';
    }

    return s;
}
2378
csv_atof(csvdata * c,int i)2379 static double csv_atof (csvdata *c, int i)
2380 {
2381 char tmp[CSVSTRLEN], clean[CSVSTRLEN];
2382 double x = NON_NUMERIC;
2383 const char *s = c->str;
2384 char *test;
2385
2386 if (csv_scrub_thousep(c) && strchr(s, c->thousep) &&
2387 all_digits_and_seps(s)) {
2388 /* second pass through the data: pre-process fields
2389 that we reckon include thousands separators
2390 */
2391 strcpy(clean, s);
2392 gretl_delchar(c->thousep, clean);
2393 s = clean;
2394 }
2395
2396 if (c->decpoint == '.' || !csv_do_dotsub(c) || strchr(s, ',') == NULL) {
2397 /* either we're currently set to the correct locale,
2398 or there's no problematic decimal point in @s
2399 */
2400 errno = 0;
2401 x = strtod(s, &test);
2402 if (converted_ok(s, test, x)) {
2403 return x; /* handled */
2404 }
2405 } else if (csv_do_dotsub(c)) {
2406 /* in C numeric locale: substitute dot for comma */
2407 strcpy(tmp, s);
2408 gretl_charsub(tmp, ',', '.');
2409 errno = 0;
2410 x = strtod(tmp, &test);
2411 if (converted_ok(s, test, x)) {
2412 return x; /* handled */
2413 }
2414 }
2415
2416 if (c->decpoint == '.' && strchr(s, ',') != NULL) {
2417 /* try remediation for decimal comma? */
2418 strcpy(tmp, s);
2419 gretl_charsub(tmp, ',', '.');
2420 errno = 0;
2421 x = strtod(tmp, &test);
2422 if (converted_ok(s, test, x)) {
2423 return x; /* handled */
2424 }
2425 }
2426
2427 /* fallback */
2428 /* revised 2020-02-13 to use csv_unquote */
2429 return eval_non_numeric(c, i, csv_unquote(c->str));
2430 }
2431
/* Process the current field, c->str, as the value of series @i
   at (0-based) observation @t.

   If no string table is attached to @c: the value is set to
   NADBL if csv_missval() accepts the field, otherwise the field
   is stripped and converted via csv_atof().

   If a string table is attached (second round, handling
   string-valued variables) only series in c->codelist are
   touched: the field is looked up in the string table and the
   resulting index becomes the value. A value currently NA stays
   that way, except that when c->user_na is NULL a non-empty
   field is nonetheless treated as a string.

   Returns 0 on success, or E_DATA if the string table does not
   yield a positive index.
*/

static int process_csv_obs (csvdata *c, int i, int t, int *miss_shown,
                            PRN *prn)
{
    double zit;
    int ix;

    if (c->st == NULL) {
        /* regular numeric handling */
        if (csv_missval(c->str, i, t+1, miss_shown, prn)) {
            c->dset->Z[i][t] = NADBL;
        } else {
            gretl_strstrip(c->str);
            c->dset->Z[i][t] = csv_atof(c, i);
        }
        return 0;
    }

    /* second round, handling string-valued variables */
    if (!in_gretl_list(c->codelist, i)) {
        return 0;
    }

    zit = c->dset->Z[i][t];

    if (na(zit) && *c->str != '\0' && c->user_na == NULL) {
        /* by default (no user_na) only blanks count as NAs */
        zit = NON_NUMERIC;
    }

    if (na(zit)) {
        return 0;
    }

    ix = gretl_string_table_index(c->st, c->str, i, 0, prn);
    if (ix <= 0) {
        return E_DATA;
    }

    c->dset->Z[i][t] = (double) ix;

    return 0;
}
2465
2466 /* Emulation of fgets(), designed to handle any sort of line
2467 termination (unix, DOS, Mac or even an unholy mixture).
2468 Line-endings are converted to LF (0x0a).
2469 */
2470
csv_fgets(csvdata * cdata,gzFile fp)2471 static char *csv_fgets (csvdata *cdata, gzFile fp)
2472 {
2473 char *s = cdata->line;
2474 int n = cdata->maxlinelen;
2475 int i, c1, c = 0;
2476
2477 for (i=0; i<n-1 && c!=0x0a; i++) {
2478 c = gzgetc(fp);
2479 if (c == EOF) {
2480 if (i == 0) {
2481 /* signal end of read */
2482 return NULL;
2483 } else {
2484 break;
2485 }
2486 } else if (c == 0x0d) {
2487 /* CR: convert to LF and peek at next char: if it's
2488 LF swallow it, otherwise put it back */
2489 c = 0x0a;
2490 c1 = gzgetc(fp);
2491 if (c1 != 0x0a) {
2492 gzungetc(c1, fp);
2493 }
2494 }
2495 s[i] = c;
2496 }
2497
2498 s[i] = '\0';
2499
2500 return s;
2501 }
2502
2503 /* pick up any comments following the data block in a CSV file */
2504
get_csv_descrip(csvdata * c,gzFile fp)2505 static char *get_csv_descrip (csvdata *c, gzFile fp)
2506 {
2507 char *line = c->line;
2508 char *desc = NULL;
2509 size_t llen, totlen;
2510
2511 while (csv_fgets(c, fp)) {
2512 tailstrip(line);
2513 llen = strlen(line);
2514 if (desc == NULL) {
2515 totlen = llen + 4;
2516 desc = malloc(totlen);
2517 if (desc == NULL) {
2518 return NULL;
2519 }
2520 sprintf(desc, "%s\n", line);
2521 } else {
2522 char *tmp;
2523
2524 totlen = strlen(desc) + llen + 4;
2525 tmp = realloc(desc, totlen);
2526 if (tmp == NULL) {
2527 free(desc);
2528 return NULL;
2529 }
2530 desc = tmp;
2531 strcat(desc, line);
2532 strcat(desc, "\n");
2533 }
2534 }
2535
2536 if (desc != NULL && string_is_blank(desc)) {
2537 free(desc);
2538 desc = NULL;
2539 }
2540
2541 return desc;
2542 }
2543
/* Advice on the layout expected of CSV input. The text is wrapped
   in N_() -- presumably the gettext marker for deferred
   translation, so it would need to be passed through _() at the
   point of use; confirm at the call site.
*/

static const char *
csv_msg = N_("\nPlease note:\n"
             "- The first row of the CSV file should contain the "
             "names of the variables.\n"
             "- The first column may optionally contain date "
             "strings or other 'markers':\n  in that case its row 1 entry "
             "should be blank, or should say 'obs' or 'date'.\n"
             "- The remainder of the file must be a rectangular "
             "array of data.\n");
2553
/* Here we check whether we get a consistent reading on
   the number of fields per line in the CSV file.

   Reads @fp line by line via csv_fgets(), skipping comment
   lines (starting with '#') and blank lines that precede the
   data. On completion c->nrows holds the number of non-blank,
   non-comment lines read and c->ncols the number of columns.
   If a blank line follows the data, any remaining text may be
   collected as a description (c->descrip) and reading stops.

   Returns 0 on success, or E_DATA if the rows differ in their
   number of fields or a column specification is invalid.
*/

static int csv_fields_check (gzFile fp, csvdata *c, PRN *prn)
{
    int gotdata = 0;
    int chkcols = 0;
    int err = 0;

    c->ncols = c->nrows = 0;

    if (csv_has_bom(c)) {
        /* skip the 3-byte UTF-8 Byte Order Mark */
        gzseek(fp, 3, SEEK_SET);
    }

    while (csv_fgets(c, fp) && !err) {

        /* skip comment lines */
        if (*c->line == '#') {
            continue;
        }

        /* skip blank lines -- but finish if the blank comes after data */
        if (string_is_blank(c->line)) {
            if (gotdata) {
                if (!csv_have_data(c)) {
                    c->descrip = get_csv_descrip(c, fp);
                }
                break;
            } else {
                continue;
            }
        }

        c->nrows += 1;

        if (fixed_format(c)) {
            /* fixed-width columns: just check that the line is long
               enough to reach the last entry in the columns list */
            tailstrip(c->line);
            gotdata = 1;
            chkcols = strlen(c->line);
            if (chkcols < c->cols_list[c->cols_list[0]]) {
                gretl_errmsg_set(_("Invalid column specification"));
                err = E_DATA;
                break;
            } else {
                continue;
            }
        }

        compress_csv_line(c, 1);

        if (!gotdata) {
            /* scrutinize the first "real" line */
            check_first_field(c->line, c, prn);
            gotdata = 1;
        }

        chkcols = count_csv_fields(c);
        if (c->ncols == 0) {
            /* first row counted: this sets the standard */
            c->ncols = chkcols;
            pprintf(prn, _("   number of columns = %d\n"), c->ncols);
        } else if (chkcols != c->ncols) {
            pprintf(prn, _("   ...but row %d has %d fields: aborting\n"),
                    c->nrows, chkcols);
            err = E_DATA;
        } else if (cols_subset(c)) {
            /* the last entry in the columns list must not exceed
               the number of data columns available */
            int datacols = csv_skip_col_1(c) ? (c->ncols - 1) : c->ncols;

            if (c->cols_list[c->cols_list[0]] > datacols) {
                gretl_errmsg_set(_("Invalid column specification"));
                err = E_DATA;
            }
        }
    }

    if (!err && fixed_format(c)) {
        /* in the fixed-format case the number of columns is
           given by the length of the columns list */
        c->ncols = c->cols_list[0];
    }

    return err;
}
2636
/* Remove from @s, in place, every character that is neither
   alphanumeric nor an underscore. Working in place means no
   scratch buffer is needed, so an input of any length is safe.
   Characters are cast to unsigned char before being passed to
   isalnum(), since passing a negative char value is undefined.
*/

static void strip_illegals (char *s)
{
    char *dst = s;

    for (; *s != '\0'; s++) {
        if (isalnum((unsigned char) *s) || *s == '_') {
            *dst++ = *s;
        }
    }

    *dst = '\0';
}
2651
/* Returns 1 if @s is a three-character string that equals "nan"
   after being passed through gretl_lower(), otherwise 0. */

static int intercept_nan_as_name (const char *s)
{
    char lc[4];

    if (strlen(s) != 3) {
        return 0;
    }

    /* work on a copy, since @s is read-only */
    strcpy(lc, s);
    gretl_lower(lc);

    return strcmp(lc, "nan") == 0;
}
2666
/* Returns the result of numeric_string() applied to @s. When the
   decimal character in force for @c is not '.', the test is run
   on a copy of @s in which each ',' has been replaced by '.'.
   If that copy cannot be allocated we return 0, i.e. the string
   is treated as non-numeric rather than dereferencing NULL.
*/

static int csv_is_numeric (const char *s, csvdata *c)
{
    char *tmp;
    int ret;

    if (c->decpoint == '.') {
        return numeric_string(s);
    }

    /* decimal comma in force */
    tmp = gretl_strdup(s);
    if (tmp == NULL) {
        return 0;
    }

    gretl_charsub(tmp, ',', '.');
    ret = numeric_string(tmp);
    free(tmp);

    return ret;
}
2684
/* Examine the column heading held in c->str and, if it is usable,
   write a series name into c->dset->varname[@j].

   - A heading flagged by intercept_nan_as_name() is rejected with
     E_DATA.
   - An empty heading gets the automatic name "v<j>".
   - A heading that looks numeric leaves the name empty and
     increments *@numcount (the caller uses this count to judge
     whether the file has a header row at all).
   - Otherwise leading non-alphabetic characters are skipped, the
     remainder is truncated to VNAMELEN - 1 bytes, stripped of
     illegal characters and, if it collides with a reserved word,
     given a trailing underscore; the result is then validated
     via check_varname().

   Returns 0 on success or E_DATA on failure.
*/

static int process_csv_varname (csvdata *c, int j, int *numcount,
                                PRN *prn)
{
    char *vname = c->dset->varname[j];
    char *src = c->str;
    int err = 0;

    *vname = '\0';

    if (intercept_nan_as_name(src)) {
        gretl_errmsg_sprintf(_("If '%s' is intended as the name of a variable, "
                               "please change it --\nstrings of this sort usually "
                               "mean 'not a number'."), src);
        err = E_DATA;
    } else if (*src == '\0') {
        fprintf(stderr, "variable name %d is missing\n", j);
        sprintf(vname, "v%d", j);
    } else if (csv_is_numeric(src, c)) {
        *numcount += 1;
    } else {
        const char *s = src;

        /* skip to the first alphabetic character; the cast avoids
           undefined behavior in isalpha() for negative char values */
        while (*s && !isalpha((unsigned char) *s)) s++;
        if (*s == '\0') {
            fprintf(stderr, "variable name %d (%s) is garbage\n", j, src);
            sprintf(vname, "v%d", j);
        } else {
            /* vname is empty at this point, so this is a bounded copy */
            strncat(vname, s, VNAMELEN - 1);
        }
        iso_to_ascii(vname);
        strip_illegals(vname);
        if (gretl_reserved_word(vname)) {
            /* try a fix for this */
            int n = strlen(vname);

            if (n < VNAMELEN-1) {
                strcat(vname, "_");
            } else {
                vname[n-1] = '_';
            }
        }
        if (check_varname(vname)) {
            errmsg(1, prn);
            err = E_DATA;
        }
    }

    return err;
}
2734
/* Allocate observation markers for @dset and, if that succeeds,
   drop its last variable. Returns the first non-zero error code
   encountered, or 0.
*/

static int csv_reconfigure_for_markers (DATASET *dset)
{
    int err = dataset_allocate_obs_markers(dset);

    if (err) {
        return err;
    }

    return dataset_drop_last_variables(dset, 1);
}
2745
/* Returns 1 if the file column with 0-based index @k is absent
   from c->cols_list, 0 if it is present. The list is searched
   for @k itself when csv_skip_col_1() holds, otherwise for
   @k + 1.
*/

static int skip_data_column (csvdata *c, int k)
{
    int col = k;

    if (!csv_skip_col_1(c)) {
        col += 1;
    }

    return in_gretl_list(c->cols_list, col) ? 0 : 1;
}
2756
/* Append the column index @k to c->cols_list. Returns 0 on
   success, or E_ALLOC if gretl_list_append_term() returns NULL.
*/

static int update_join_cols_list (csvdata *c, int k)
{
    int err = 0;

    if (gretl_list_append_term(&c->cols_list, k) == NULL) {
        err = E_ALLOC;
    }

#if CDEBUG
    printlist(c->cols_list, "c->cols_list for join");
#endif

    return err;
}
2773
2774 /* handle_join_varname: the index @k contains the column number
2775 relative to the entire CSV file, while @pj points to j, the column
2776 number relative to the reduced dataset that will be constructed by
2777 selection of columns from the file.
2778
2779 Here we're examining a column heading read from file (c->str) to
2780 see whether it matches any of the column-names required for an
2781 ongoing join operation (held in c->jspec->colnames). If so, we
2782 write the index j into the appropriate slot in c->jspec->colnums
2783 (which starts off filled with zeros), so the joiner will know where
2784 to find the required data. (The j value is bound to be at least 1
2785 since column 0 is reserved to the constant.)
2786
2787 In some cases a given named column may perform more than one role in
2788 a join operation -- for example, it may serve as an element in a
2789 filter and also as the auxiliary variable in an "aggregation"
2790 method. To allow for this we don't stop scanning at the first match
2791 of c->str with a required column name.
2792
2793 The call to update_join_cols_list() uses the index @k to record the
2794 overall column position of "wanted data", for use by the CSV
2795 reader.
2796 */
2797
handle_join_varname(csvdata * c,int k,int * pj)2798 static int handle_join_varname (csvdata *c, int k, int *pj)
2799 {
2800 const char *colname;
2801 char okname[VNAMELEN];
2802 int matched = 0;
2803 int i, j = *pj;
2804
2805 if (!csv_skip_col_1(c)) {
2806 k++;
2807 }
2808
2809 if (csv_no_header(c)) {
2810 sprintf(okname, "col%d", k);
2811 } else {
2812 /* convert to valid gretl identifier */
2813 gretl_normalize_varname(okname, c->str, 0, k);
2814 }
2815
2816 #if CDEBUG
2817 fprintf(stderr, "handle_join_varname: looking at '%s' (%s)\n", c->str, okname);
2818 #endif
2819
2820 for (i=0; i<c->jspec->ncols; i++) {
2821 /* find "wanted name" i */
2822 colname = c->jspec->colnames[i];
2823 if (colname == NULL || c->jspec->colnums[i] > 0) {
2824 /* name not wanted, or already found */
2825 continue;
2826 }
2827 if (!strcmp(okname, colname)) {
2828 #if CDEBUG
2829 fprintf(stderr, " target %d matched at CSV col %d, j=%d\n", i, k, j);
2830 #endif
2831 c->jspec->colnums[i] = j;
2832 if (!matched) {
2833 matched = 1;
2834 strcpy(c->dset->varname[j], okname);
2835 update_join_cols_list(c, k);
2836 *pj += 1;
2837 if (in_gretl_list(c->jspec->timecols, i)) {
2838 series_set_flag(c->dset, j, VAR_TIMECOL);
2839 }
2840 }
2841 }
2842 }
2843
2844 return 0;
2845 }
2846
/* true if character @c could begin a numeric string; note that
   @c is evaluated more than once */
#define starts_number(c) (isdigit((unsigned char) (c)) || (c) == '-' || \
                          (c) == '+' || (c) == '.')

/* true if flag @o is zero, the dataset pointed to by @c has more
   than 3 in its v member, and the count @n equals v - 2; the
   arguments are parenthesized so that compound expressions can
   be passed safely */
#define obs_labels_no_varnames(o,c,n) (!(o) && (c)->v > 3 && (n) == (c)->v - 2)
2851
/* Read the first line of the file that is neither a comment nor
   blank and treat its fields as candidate column headings.

   The file position just after that line is stored in c->datapos.
   Each field is copied (truncated to CSVSTRLEN - 1 bytes) into
   c->str and dispatched: to handle_join_varname() when joining,
   to automatic "col<j>" naming when probing a header-less file,
   and otherwise to process_csv_varname(). Errors from any of
   these stop the scan.

   If it then appears that the file has no header row (explicitly
   flagged, or all headings numeric), the reader is set up for
   automatic names and c->datapos is reset to the start of the
   file. If only some headings are numeric, E_DATA is returned.

   @prn receives error and diagnostic output, @mprn progress
   messages. Returns 0 on success, non-zero error code otherwise.
*/

static int csv_varname_scan (csvdata *c, gzFile fp, PRN *prn, PRN *mprn)
{
    char *p;
    int obscol = csv_has_obs_column(c);
    int i, j, k, numcount;
    int err = 0;

    if (!csv_no_header(c)) {
        pputs(mprn, _("scanning for variable names...\n"));
    }

    if (csv_has_bom(c)) {
        gzseek(fp, 3, SEEK_SET);
    }

    /* find the first line that's not a comment and not blank */
    while (csv_fgets(c, fp)) {
        if (*c->line == '#' || string_is_blank(c->line)) {
            continue;
        } else {
            break;
        }
    }

    c->datapos = gztell(fp);

    compress_csv_line(c, 1);

    p = c->line;
    if (c->delim == ' ' && *p == ' ') p++;
    iso_to_ascii(p);

    if (strlen(p) > 118) {
        pprintf(mprn, _(" line: %.115s...\n"), p);
    } else {
        pprintf(mprn, _(" line: %s\n"), p);
    }

    numcount = 0;
    j = 1; /* for the constant */

    for (k=0; k<c->ncols && !err; k++) {
        /* extract field k into c->str, truncating if need be */
        i = 0;
        while (*p && *p != c->delim) {
            if (i < CSVSTRLEN - 1) {
                c->str[i++] = *p;
            }
            p++;
        }
        c->str[i] = '\0';
        if (*p == c->delim) p++;

        if (k == 0 && csv_skip_col_1(c)) {
            ; /* no-op */
        } else if (!joining(c) && cols_subset(c) && skip_data_column(c, k)) {
            ; /* no-op */
        } else {
            if (joining(c)) {
                /* don't discard a failure to record the column */
                err = handle_join_varname(c, k, &j);
            } else if (probing(c) && csv_no_header(c)) {
                sprintf(c->dset->varname[j], "col%d", j);
                j++;
            } else {
                err = process_csv_varname(c, j, &numcount, prn);
                j++;
            }
        }
        if (j == c->dset->v) {
            /* all the series slots in the dataset are accounted for */
#if CDEBUG
            fprintf(stderr, "breaking on j = %d (k = %d)\n", j, k);
#endif
            break;
        }
    }

    if (!err && joining(c) && c->cols_list == NULL) {
        /* no relevant columns were found */
        gretl_errmsg_set("No relevant columns were found");
        err = E_UNKVAR;
    }

    if (err) {
        return err;
    }

    if (csv_no_header(c) || numcount == c->dset->v - 1 ||
        obs_labels_no_varnames(obscol, c->dset, numcount)) {
        if (!csv_no_header(c)) {
            pputs(prn, _("it seems there are no variable names\n"));
            /* then we undercounted the observations by one? */
            if (!rows_subset(c)) {
                err = add_single_obs(c->dset);
            }
        }
        if (!err) {
            /* set up to handle the "no varnames" case */
            csv_set_autoname(c);
            c->datapos = csv_has_bom(c) ? 3 : 0;
            if (!csv_all_cols(c)) {
                if (obs_labels_no_varnames(obscol, c->dset, numcount)) {
                    err = csv_reconfigure_for_markers(c->dset);
                    if (!err) {
                        csv_set_obs_column(c);
                    }
                }
            }
        }
    } else if (numcount > 0) {
        /* a mixture of numeric and non-numeric headings: give up */
        for (i=1; i<c->dset->v; i++) {
            if (check_varname(c->dset->varname[i])) {
                errmsg(1, prn);
                break;
            }
        }
        fprintf(stderr, "numcount = %d\n", numcount);
        err = E_DATA;
    }

    return err;
}
2971
/* Returns 1 if a row mask is in force and row @t is either beyond
   the end of the mask or has a zero entry in it; otherwise 0.
*/

static int row_not_wanted (csvdata *c, int t)
{
    if (c->rowmask == NULL) {
        /* no mask: every row is wanted */
        return 0;
    }

    if (t >= c->masklen) {
        return 1;
    }

    return gretl_vector_get(c->rowmask, t) == 0;
}
2984
2985 /* read numerical data when we've been given a fixed column-reading
2986 specification */
2987
fixed_format_read(csvdata * c,gzFile fp,PRN * prn)2988 static int fixed_format_read (csvdata *c, gzFile fp, PRN *prn)
2989 {
2990 char *p;
2991 int miss_shown = 0;
2992 int *missp = NULL;
2993 int t = 0, s = 0;
2994 int i, k, n, m;
2995 int err = 0;
2996
2997 c->real_n = c->dset->n;
2998
2999 if (csv_has_bom(c)) {
3000 gzseek(fp, 3, SEEK_SET);
3001 }
3002
3003 if (csv_is_verbose(c)) {
3004 missp = &miss_shown;
3005 }
3006
3007 while (csv_fgets(c, fp) && !err) {
3008 tailstrip(c->line);
3009 if (*c->line == '#' || string_is_blank(c->line)) {
3010 continue;
3011 }
3012 if (row_not_wanted(c, s)) {
3013 s++;
3014 continue;
3015 }
3016 m = strlen(c->line);
3017 for (i=1; i<=c->ncols && !err; i++) {
3018 k = c->cols_list[i];
3019 n = c->width_list[i];
3020 if (k + n - 1 > m) {
3021 /* attempting to read out of bounds */
3022 fprintf(stderr, "row %d, column %d: start=%d, width=%d, "
3023 "but line length = %d\n", t+1, i, k, n, m);
3024 err = E_DATA;
3025 break;
3026 }
3027 p = c->line + k - 1;
3028 *c->str = '\0';
3029 strncat(c->str, p, n);
3030 /* Added 2016-11-16: allow trailing blanks in a field
3031 of specified width. This is required for handling
3032 US CPS data.
3033 */
3034 tailstrip(c->str);
3035 if (csv_missval(c->str, i, t+1, missp, prn)) {
3036 c->dset->Z[i][t] = NADBL;
3037 } else {
3038 c->dset->Z[i][t] = csv_atof(c, i);
3039 if (c->dset->Z[i][t] == NON_NUMERIC) {
3040 gretl_errmsg_sprintf(_("At row %d, column %d:\n"), t+1, k);
3041 gretl_errmsg_sprintf(_("'%s' -- no numeric conversion performed!"),
3042 c->str);
3043 err = E_DATA;
3044 }
3045 }
3046 }
3047 s++;
3048 if (++t == c->dset->n) {
3049 break;
3050 }
3051 }
3052
3053 if (err == E_DATA) {
3054 gretl_errmsg_set(_("Invalid column specification"));
3055 }
3056
3057 return err;
3058 }
3059
3060 #define XML1_OK(u) ((u>=0x0020 && u<=0xD7FF) || \
3061 (u>=0xE000 && u<=0xFFFD))
3062
3063 /* Check that an observation label contains only
3064 valid UTF-8, and moreover that every character
3065 is valid in XML 1.0. If not, try recoding from
3066 ISO 8859.
3067 */
3068
maybe_fix_csv_string(gchar * s)3069 static int maybe_fix_csv_string (gchar *s)
3070 {
3071 int err = 0;
3072
3073 if (!g_utf8_validate(s, -1, NULL)) {
3074 GError *gerr = NULL;
3075 gsize wrote = 0;
3076 gchar *tr;
3077
3078 /* try for iso-8859? */
3079 tr = g_convert(s, -1, "UTF-8", "ISO-8859-15",
3080 NULL, &wrote, &gerr);
3081 if (gerr != NULL) {
3082 gretl_errmsg_set(gerr->message);
3083 g_error_free(gerr);
3084 err = E_DATA;
3085 } else {
3086 *s = '\0';
3087 gretl_utf8_strncat(s, tr, CSVSTRLEN-1);
3088 g_free(tr);
3089 }
3090 }
3091
3092 if (!err) {
3093 int i, n = g_utf8_strlen(s, -1);
3094 gunichar u;
3095
3096 for (i=0; i<n; i++) {
3097 u = g_utf8_get_char(s);
3098 if (!XML1_OK(u)) {
3099 return 0;
3100 }
3101 s = g_utf8_next_char(s);
3102 }
3103 }
3104
3105 return err;
3106 }
3107
/* Copy the text in c->str into the observation-marker slot
   c->dset->S[@t], limited to OBSLEN - 1.

   A leading single or double quote is skipped, and a matching
   trailing quote, if present, is removed; a single quote that
   immediately follows the leading quote is skipped too.

   The trailing-quote test requires at least two characters, so a
   field that consists of a lone quote character yields an empty
   label with a length of 0 rather than -1.
*/

static void transcribe_obs_label (csvdata *c, int t)
{
    char *s = c->str;
    char c0 = *s;
    int n = strlen(s);

    /* skip a leading quote, and unquote fully
       if a matching trailing quote is found
    */

    if (c0 == '"' || c0 == '\'') {
        if (n > 1 && s[n-1] == c0) {
            s[n-1] = '\0';
            n--;
        }
        s++;
        n--;
        /* and once more, with feeling... */
        if (s[0] == '\'') {
            s++;
            n--;
        }
    }

    if (n > OBSLEN - 1) {
        n = OBSLEN - 1;
    }

    c->dset->S[t][0] = '\0';
    gretl_utf8_strncat(c->dset->S[t], s, n);
}
3139
/* Read the data rows of a delimited file, starting from the
   current position of @fp, into c->dset.

   Comment lines and blank lines are skipped, as are rows rejected
   by row_not_wanted(); a line containing the (non-empty) string
   c->skipstr is skipped too and reduces c->real_n by one. Each
   remaining line is split into at most c->ncols fields, which are
   copied into c->str (truncated to CSVSTRLEN - 1 bytes) and
   passed through maybe_fix_csv_string(). The first field becomes
   an observation label when csv_skip_col_1() holds and the
   dataset has markers; other wanted fields go to
   process_csv_obs().

   On completion, if c->real_n is less than c->dset->n the surplus
   observations are dropped from the dataset. Returns 0 on
   success, non-zero error code otherwise.
*/
static int real_read_labels_and_data (csvdata *c, gzFile fp, PRN *prn)
{
    char *p;
    int miss_shown = 0;
    int *missp = NULL;
    int truncated = 0;
    int t = 0, s = 0; /* t: observation index; s: data row index for the row mask */
    int i, j, k;
    int err = 0;

    if (csv_is_verbose(c)) {
        missp = &miss_shown;
    }

    c->real_n = c->dset->n;

    while (csv_fgets(c, fp) && !err) {
        /* quoting state: set afresh for each line, but carried
           over from one field to the next within the line */
        int inquote = 0;

        if (*c->line == '#' || string_is_blank(c->line)) {
            continue;
        } else if (*c->skipstr != '\0' && strstr(c->line, c->skipstr)) {
            c->real_n -= 1;
            continue;
        } else if (row_not_wanted(c, s)) {
            s++;
            continue;
        }

        compress_csv_line(c, 0);
        p = c->line;

        /* skip leading space: a single one if space is the
           delimiter, otherwise all of it */
        if (c->delim == ' ') {
            if (*p == ' ') p++;
        } else {
            p += strspn(p, " ");
        }

        j = 1; /* index of the next series to be filled */
        for (k=0; k<c->ncols && !err; k++) {
            i = 0;
            while (*p) {
                /* a delimiter terminates the field only if it
                   is not inside quotes */
                if (csv_keep_quotes(c) && *p == c->qchar) {
                    inquote = !inquote;
                } else if (!inquote && *p == c->delim) {
                    break;
                }
                if (i < CSVSTRLEN - 1) {
                    c->str[i++] = *p;
                } else {
                    /* field too long: count each byte dropped */
                    truncated++;
                }
                p++;
            }
            c->str[i] = '\0';
            err = maybe_fix_csv_string(c->str);
            if (!err) {
                if (k == 0 && csv_skip_col_1(c) && c->dset->S != NULL) {
                    transcribe_obs_label(c, t);
                } else if (cols_subset(c) && skip_data_column(c, k)) {
                    ; /* no-op */
                } else {
                    err = process_csv_obs(c, j++, t, missp, prn);
                }
            }
            if (!err) {
                /* prep for next column */
                if (*p == c->delim) {
                    p++;
                }
                if (c->delim != ' ') {
                    p += strspn(p, " ");
                }
            }
        }

        s++;
        if (++t == c->dset->n) {
            /* all observations in the dataset have been filled */
            break;
        }
    }

    if (truncated) {
        pprintf(prn, _("warning: %d labels were truncated.\n"), truncated);
    }

    if (!err && c->real_n < c->dset->n) {
        /* trim the dataset by the number of lines skipped
           via c->skipstr */
        int drop = c->dset->n - c->real_n;

        err = dataset_drop_observations(c->dset, drop);
    }

    return err;
}
3234
3235 /* When reading a CSV file, should we attempt to parse observation
3236 strings as dates (and impose time-series structure on the data
3237 if this is successful)? In general, yes, but maybe not if we're
3238 reading the data in the context of a "join" operation, since
3239 in this case automatic detection may collide with time-key
3240 information supplied by the user. Current status: we'll skip
3241 the auto-dating stuff when joining unless (a) it's a MIDAS
3242 join (mixed frequencies) and the user has _not_ supplied any
3243 time key specification.
3244 */
3245
csv_skip_dates(csvdata * c)3246 static int csv_skip_dates (csvdata *c)
3247 {
3248 if (c->jspec != NULL) {
3249 /* with --aggr=spread (MIDAS) we'll need dates info,
3250 unless the user have a time key spec
3251 */
3252 return c->jspec->auto_midas == 0;
3253 } else {
3254 return 0;
3255 }
3256 }
3257
/* Position @fp at c->datapos and read the data via
   real_read_labels_and_data(). Afterwards, provided that the
   first column holds observation labels, no row mask is in
   force and csv_skip_dates() does not say otherwise, test the
   labels for date content, recording the result in c->markerpd
   and flagging the data as reversed if so indicated.

   @mprn, if non-NULL, receives a progress message. Returns the
   error code from the read.
*/

static int csv_read_data (csvdata *c, gzFile fp, PRN *prn, PRN *mprn)
{
    int reversed = csv_data_reversed(c);
    int err;

    if (mprn != NULL) {
        pputs(mprn, csv_all_cols(c) ?
              _("scanning for data...\n") :
              _("scanning for row labels and data...\n"));
    }

    gzseek(fp, c->datapos, SEEK_SET);

    err = real_read_labels_and_data(c, fp, prn);

    if (err || !csv_skip_col_1(c) || rows_subset(c) || csv_skip_dates(c)) {
        /* no inspection of the row labels is called for */
        return err;
    }

    c->markerpd = test_markers_for_dates(c->dset, &reversed,
                                         c->skipstr, prn);
    if (reversed) {
        csv_set_data_reversed(c);
    }

    return err;
}
3285
/* Print a "parsing <filename>..." message to @prn. If @fname is
   not valid UTF-8 we try converting it from the locale encoding
   first; should that conversion fail (g_locale_to_utf8() returns
   NULL) we fall back on the name as given rather than passing a
   NULL pointer to the "%s" conversion.
*/

static void print_csv_parsing_header (const char *fname, PRN *prn)
{
    gchar *trfname = NULL;
    const char *shown = fname;

    if (!g_utf8_validate(fname, -1, NULL)) {
        trfname = g_locale_to_utf8(fname, -1, NULL, NULL, NULL);
        if (trfname != NULL) {
            shown = trfname;
        }
    }

    pprintf(prn, "%s %s...\n", _("parsing"), shown);

    /* g_free() accepts NULL */
    g_free(trfname);
}
3297
/* Count the distinct non-NULL names in c->jspec->colnames.

   A name is counted at its first occurrence only: entry i
   contributes to the total if no earlier entry holds the same
   string. This gives the same count as marking duplicates in a
   scratch array, but needs no memory allocation and so cannot
   fail.
*/

static int join_unique_columns (csvdata *c)
{
    const char **cnames = c->jspec->colnames;
    int n = c->jspec->ncols;
    int i, j, ncols = 0;

    for (i=0; i<n; i++) {
        int dup = 0;

        if (cnames[i] == NULL) {
            continue;
        }

        /* has this name been seen already? */
        for (j=0; j<i && !dup; j++) {
            if (cnames[j] != NULL && !strcmp(cnames[i], cnames[j])) {
                dup = 1;
            }
        }

        if (!dup) {
#if CDEBUG
            fprintf(stderr, "join_unique_columns: '%s'\n", cnames[i]);
#endif
            ncols++;
        }
    }

    return ncols;
}
3326
/* Set the number of observations (n) and of series (v, which
   includes the constant) in c->dset, on the basis of the row and
   column counts found by the first pass over the file along with
   any row mask, column selection or join specification in force.
   When probing, n is set to 1. Returns 0 on success, or E_DATA
   if more columns are wanted than are present in the file.
*/

static int csv_set_dataset_dimensions (csvdata *c)
{
    int err = 0;

    c->dset->v = 0;

    if (rows_subset(c)) {
        c->dset->n = n_from_row_mask(c);
    }

    if (fixed_format(c)) {
        if (c->dset->n == 0) {
            c->dset->n = c->nrows;
        }
        c->dset->v = c->ncols + 1;
    } else {
        int present, wanted;

        if (c->dset->n == 0) {
            /* allow for the varnames row, if there is one */
            c->dset->n = csv_no_header(c) ? c->nrows : c->nrows - 1;
        }

        present = c->ncols;
        if (csv_skip_col_1(c)) {
            /* the first column is not data */
            present -= 1;
        }

        if (joining(c)) {
            wanted = join_unique_columns(c);
        } else if (cols_subset(c)) {
            wanted = c->cols_list[0];
        } else {
            wanted = present;
        }

        if (wanted <= present) {
            /* allow for the constant */
            c->dset->v = wanted + 1;
        } else {
            gretl_errmsg_set(_("Invalid column specification"));
            err = E_DATA;
        }
    }

    if (probing(c)) {
        /* don't allocate tons of space for data that
           we won't read right now */
        c->dset->n = 1;
    }

#if CDEBUG
    if (joining(c)) {
        fprintf(stderr, "csv dataset dimensions: v=%d, n=%d\n",
                c->dset->v, c->dset->n);
    }
#endif

    return err;
}
3388
/*
 * real_import_csv:
 * @fname: name of CSV file.
 * @dset: dataset struct.
 * @cols: column specification.
 * @rows: row specification.
 * @join: specification pertaining to "join" command.
 * @probe: also pertains to "join" (via GUI).
 * @pm: location of matrix to accept the data or NULL.
 * @opt: use OPT_N to force interpretation of data columns containing
 * strings as coded (non-numeric) values and not errors; use OPT_H
 * to indicate absence of a header row; use OPT_A to indicate that
 * all columns should be read as data series (i.e. do not try to
 * interpret the first column as observation labels); for use of
 * OPT_T see the help text for the "append" command.
 * @prn: gretl printing struct (or NULL).
 *
 * Open a Comma-Separated Values data file and read the data into
 * the current work space. Shared with gretl_join.c.
 *
 * The file is read in two passes: the first establishes the line
 * length, delimiter and number of fields per row; the second reads
 * the variable names (if any) and then the data. The data may be
 * re-read if a first attempt suggests a different interpretation
 * (thousands separator, non-numeric columns, etc.).
 *
 * On success the result goes to one of four places: the join
 * specification (@join non-NULL), the probe struct (@probe
 * non-NULL), the matrix at @pm (@pm non-NULL), or @dset via
 * merge_or_replace_data().
 *
 * Returns: 0 on successful completion, non-zero otherwise.
 */

int real_import_csv (const char *fname,
                     DATASET *dset,
                     const char *cols,
                     const char *rows,
                     joinspec *join,
                     void *probe,
                     gretl_matrix **pm,
                     gretlopt opt,
                     PRN *prn)
{
    csvdata *c = NULL;
    gzFile fp = NULL;
    PRN *mprn = NULL;   /* for progress messages: @prn, or NULL if messages are off */
    gchar *altname = NULL; /* name of recoded temporary file, if any */
    int recode = 0;
    int popit = 0;      /* set if we pushed the C numeric locale */
    int i, err = 0;

    import_na_init();

    if (gretl_messages_on()) {
        mprn = prn;
    }

    fp = gretl_gzopen(fname, "rb");
    if (fp == NULL) {
        pprintf(prn, _("Couldn't open %s\n"), fname);
        err = E_FOPEN;
        goto csv_bailout;
    }

    c = csvdata_new(dset);
    if (c == NULL) {
        err = E_ALLOC;
        goto csv_bailout;
    }

    /* if the input needs recoding, csv_recode_input() gets to
       replace @fp and set @altname */
    recode = csv_unicode_check(fp, c, prn);
    if (recode) {
        err = csv_recode_input(&fp, fname, &altname, recode, prn);
        if (err) {
            goto csv_bailout;
        }
    }

    if (cols != NULL) {
        err = csvdata_add_cols_list(c, cols, opt);
        if (err) {
            goto csv_bailout;
        } else if (fixed_format(c)) {
            pprintf(mprn, _("using fixed column format\n"));
        }
    }

    if (rows != NULL) {
        err = csvdata_add_row_mask(c, rows);
        if (err) {
            goto csv_bailout;
        }
    }

    if (opt & OPT_H) {
        csv_set_no_header(c);
    }

    /* the matrix, all-columns and verbose options are honored
       only when we're neither joining nor probing */
    if (join != NULL) {
        c->jspec = join;
        c->flags |= CSV_HAVEDATA;
    } else if (probe != NULL) {
        c->probe = probe;
        c->flags |= CSV_HAVEDATA;
    } else {
        if (pm != NULL) {
            csv_set_as_matrix(c);
        }
        if (opt & OPT_A) {
            csv_set_all_cols(c);
        }
        if (opt & OPT_V) {
            csv_set_verbose(c);
        }
    }

    if (opt & OPT_I) {
        csv_unset_keep_quotes(c);
    }

    if (mprn != NULL) {
        print_csv_parsing_header(fname, mprn);
    }

    /* get line length, also check for binary data, etc. */
    c->maxlinelen = csv_max_line_length(fp, c, prn);
    if (c->maxlinelen <= 0) {
        err = E_DATA;
        goto csv_bailout;
    }

    /* settle on a delimiter (and possibly decimal character) in
       light of what was found in the file */
    if (csv_as_matrix(c) && csv_got_semi(c)) {
        if (c->delim == ',' && csv_got_delim(c)) {
            c->decpoint = ',';
        }
        c->delim = ';';
    } else if (!fixed_format(c) && !csv_got_delim(c)) {
        /* set default delimiter */
        if (csv_got_tab(c)) {
            c->delim = '\t';
        } else if (csv_got_semi(c)) {
            c->delim = ';';
        } else {
            c->delim = ' ';
        }
    }

#if CDEBUG
    fprintf(stderr, "fixed_format? %s; got_delim (%c)? %s; got_tab? %s; ",
            fixed_format(c) ? "yes" : "no", c->delim,
            csv_got_delim(c) ? "yes" : "no",
            csv_got_tab(c)? "yes" : "no");
    fprintf(stderr, "decpoint '%c'\n", c->decpoint);
#endif

    /* buffer to hold lines */
    c->line = malloc(c->maxlinelen);
    if (c->line == NULL) {
        err = E_ALLOC;
        goto csv_bailout;
    }

    /* we come back here if the fields check fails and
       semicolon seems worth a try as delimiter */
 alt_delim:

    if (mprn != NULL) {
        if (!fixed_format(c)) {
            pprintf(mprn, _("using delimiter '%c'\n"), c->delim);
        }
        pprintf(mprn, _(" longest line: %d characters\n"), c->maxlinelen - 1);
    }

    if (csv_has_trailing_comma(c) && c->delim != ',') {
        csv_unset_trailing_comma(c);
    }

    gzrewind(fp);

    /* read lines, check for consistency in number of fields */
    err = csv_fields_check(fp, c, mprn);
    if (err && !fixed_format(c)) {
        if (c->delim != ';' && csv_got_semi(c)) {
            c->delim = ';';
            err = 0;
            goto alt_delim;
        }
        pputs(prn, _(csv_msg));
        goto csv_bailout;
    }

    err = csv_set_dataset_dimensions(c);
    if (err) {
        err = E_DATA;
        goto csv_bailout;
    }

    pprintf(mprn, _(" number of variables: %d\n"), c->dset->v - 1);
    pprintf(mprn, _(" number of non-blank lines: %d\n"), c->nrows);

    if (c->dset->n == 0) {
        pputs(prn, _("Invalid data file\n"));
        err = E_DATA;
        goto csv_bailout;
    }

    /* initialize CSV dataset */
    err = start_new_Z(c->dset, 0);
    if (!err && csv_skip_col_1(c)) {
        err = dataset_allocate_obs_markers(c->dset);
    }

    if (err) {
        goto csv_bailout;
    }

    /* second pass */

    gzrewind(fp);

    if (fixed_format(c)) {
        /* no header row to scan: read the data directly and
           skip ahead to the finalization step */
        err = fixed_format_read(c, fp, prn);
        if (err) {
            goto csv_bailout;
        } else {
            csv_set_autoname(c);
            goto csv_continue;
        }
    }

    /* when probing, the variable names are all that's wanted */
    err = csv_varname_scan(c, fp, prn, mprn);
    if (err || probing(c)) {
        goto csv_bailout;
    }

    if (c->decpoint == '.' && get_local_decpoint() == ',') {
        /* we're in a locale that uses decimal comma:
           switch to the C locale */
        gretl_push_c_numeric_locale();
        popit = 1;
    } else if (c->decpoint == ',' && get_local_decpoint() == '.') {
        /* dotsub: define this if we're in a '.' locale and
           we've figured that the decimal character is ',' in
           the file we're reading
        */
        csv_set_dotsub(c);
    }

    err = csv_read_data(c, fp, prn, mprn);

    if (!err) {
        /* try again, under certain conditions */
        if (csv_skip_bad(c)) {
            err = csv_read_data(c, fp, prn, NULL);
        } else if (c->thousep > 0) {
            pprintf(mprn, _("WARNING: it seems '%c' is being used "
                            "as thousands separator\n"), c->thousep);
            /* the decimal character must be the other one */
            c->decpoint = (c->thousep == '.')? ',' : '.';
            if (c->decpoint == ',') {
                if (get_local_decpoint() == '.') {
                    csv_set_dotsub(c);
                } else if (popit) {
                    gretl_pop_c_numeric_locale();
                    popit = 0;
                }
            }
            revise_non_numeric_values(c);
            csv_set_scrub_thousep(c);
            err = csv_read_data(c, fp, prn, NULL);
        }
    }

    if (!err && !probing(c)) {
        err = csv_non_numeric_check(c, prn);
        if (!err && csv_has_non_numeric(c)) {
            /* try once more */
            err = csv_read_data(c, fp, prn, NULL);
        }
    }

    /* restore the numeric locale before any exit */
    if (popit) {
        gretl_pop_c_numeric_locale();
    }

    if (err) {
        goto csv_bailout;
    }

    if (csv_data_reversed(c)) {
        reverse_data(c->dset, mprn);
    }

 csv_continue:

    c->dset->t1 = 0;
    c->dset->t2 = c->dset->n - 1;

    if (c->markerpd > 0) {
        pputs(mprn, _("taking date information from row labels\n\n"));
        if (csv_skip_bad(c)) {
            pprintf(prn, "WARNING: Check your data! gretl has stripped out "
                    "what appear to be\nextraneous lines in a %s dataset: "
                    "this may not be right.\n\n",
                    (c->dset->pd == 4)? "quarterly" : "monthly");
        }
    } else {
        pputs(mprn, _("treating these as undated data\n\n"));
        dataset_obs_info_default(c->dset);
    }

    if (c->dset->pd != 1 || strcmp(c->dset->stobs, "1")) {
        c->dset->structure = TIME_SERIES;
    }

    if (c->st != NULL) {
        /* we have a string table: validate it, then save it
           (joining) or print it (otherwise) */
        err = gretl_string_table_validate(c->st, OPT_NONE);
        if (err) {
            pputs(prn, _("Failed to interpret the data as numeric\n"));
            goto csv_bailout;
        } else if (joining(c)) {
            gretl_string_table_save(c->st, c->dset);
        } else {
            gretl_string_table_print(c->st, c->dset, fname, prn);
        }
    }

    if (csv_as_matrix(c)) {
        /* FIXME placement of this */
        if (csv_autoname(c)) {
            strings_array_free(c->dset->varname, c->dset->v);
            c->dset->varname = NULL;
        }
        /* the matrix is the product in this case: no merger
           with @dset is attempted */
        *pm = gretl_matrix_data_subset(NULL, c->dset, -1, -1,
                                       M_MISSING_OK, &err);
        goto csv_bailout;
    }

    /* If there were observation labels and they were not interpretable
       as dates, and they weren't simply "1, 2, 3, ...", then they
       should probably be preserved; otherwise discard them.
    */
    if (c->dset->S != NULL && c->markerpd >= 0 &&
        c->dset->markers != DAILY_DATE_STRINGS) {
        dataset_destroy_obs_markers(c->dset);
    }

    if (csv_autoname(c)) {
        /* no variable names were found */
        for (i=1; i<c->dset->v; i++) {
            sprintf(c->dset->varname[i], "v%d", i);
        }
    } else {
#if CDEBUG
        int ii;

        for (ii=0; ii<c->dset->v; ii++) {
            fprintf(stderr, " c->dset->varname[%d] = '%s'\n", ii, c->dset->varname[ii]);
        }
#endif
        if (fix_varname_duplicates(c->dset)) {
            pputs(prn, _("warning: some variable names were duplicated\n"));
        }
    }

    if (!joining(c) && !probing(c)) {
        /* record whether @dset was empty before the merger */
        int newdata = (dset->Z == NULL);

        /* not doing a special "join" operation */
        err = merge_or_replace_data(dset, &c->dset, get_merge_opts(opt), prn);

        if (!err && newdata && c->descrip != NULL) {
            /* hand over the descriptive text to @dset */
            dset->descrip = c->descrip;
            c->descrip = NULL;
        }

        if (!err && newdata) {
            dataset_add_import_info(dset, fname, GRETL_CSV);
        }
    }

    /* common exit: reached on success as well as on failure */
 csv_bailout:

    if (fp != NULL) {
        gzclose(fp);
    }

    /* On success in the joining case @c is handed to the join
       specification; in the probing case the dataset is handed
       to the probe struct and @c is freed; otherwise @c is
       simply freed. Note that @c is dereferenced here only when
       err is zero, which can't be the case if @c is NULL.
       NOTE(review): csvdata_free() gets a NULL argument if we
       failed before @c was allocated -- presumably it tolerates
       that; confirm.
    */
    if (!err && c->jspec != NULL) {
        c->jspec->c = c;
    } else if (!err && c->probe != NULL) {
        c->probe->dset = c->dset;
        c->dset = NULL;
        csvdata_free(c);
    } else {
        csvdata_free(c);
    }

    if (altname != NULL) {
        /* delete the recoded file */
        gretl_remove(altname);
        g_free(altname);
    }

    if (err == E_ALLOC) {
        pputs(prn, _("Out of memory\n"));
    }

    return err;
}
3784
3785 /**
3786 * import_csv:
3787 * @fname: name of CSV file.
3788 * @dset: dataset struct.
3789 * @opt: use OPT_N to force interpretation of data colums containing
3790 * strings as coded (non-numeric) values and not errors; for use of
3791 * OPT_T see the help for "append".
3792 * @prn: gretl printing struct (or NULL).
3793 *
3794 * Open a Comma-Separated Values data file and read the data into
3795 * the current work space.
3796 *
3797 * Returns: 0 on successful completion, non-zero otherwise.
3798 */
3799
import_csv(const char * fname,DATASET * dset,gretlopt opt,PRN * prn)3800 int import_csv (const char *fname, DATASET *dset,
3801 gretlopt opt, PRN *prn)
3802 {
3803 const char *cols = NULL;
3804 const char *rows = NULL;
3805 int ci, err;
3806
3807 err = incompatible_options(opt, OPT_F | OPT_L);
3808 if (err) {
3809 /* --cols and --fixed-cols */
3810 return err;
3811 }
3812
3813 ci = (dset != NULL && dset->v > 0)? APPEND : OPEN;
3814
3815 if (opt & OPT_F) {
3816 /* we should have a "--fixed-cols=XXX" specification */
3817 cols = get_optval_string(ci, OPT_F);
3818 if (cols == NULL || *cols == '\0') {
3819 return E_PARSE;
3820 }
3821 } else if (opt & OPT_L) {
3822 /* should have a "--cols=XXX" specification */
3823 cols = get_optval_string(ci, OPT_L);
3824 if (cols == NULL || *cols == '\0') {
3825 return E_PARSE;
3826 }
3827 }
3828
3829 if (opt & OPT_M) {
3830 /* we should have a "--rowmask=XXX" specification */
3831 rows = get_optval_string(ci, OPT_M);
3832 if (rows == NULL || *rows == '\0') {
3833 return E_PARSE;
3834 }
3835 }
3836
3837 return real_import_csv(fname, dset, cols, rows,
3838 NULL, NULL, NULL, opt, prn);
3839 }
3840
import_csv_as_matrix(const char * fname,int * err)3841 gretl_matrix *import_csv_as_matrix (const char *fname, int *err)
3842 {
3843 #if CDEBUG
3844 PRN *prn = gretl_print_new(GRETL_PRINT_STDERR, NULL);
3845 #else
3846 PRN *prn = NULL;
3847 #endif
3848 gretl_matrix *m = NULL;
3849 char csvname[MAXLEN] = {0};
3850 gretlopt opt = OPT_A; /* --all-cols */
3851 int http = 0;
3852
3853 *err = try_http(fname, csvname, &http);
3854
3855 if (!*err && http) {
3856 *err = real_import_csv(csvname, NULL, NULL, NULL,
3857 NULL, NULL, &m, opt, prn);
3858 } else if (!*err) {
3859 char fullname[FILENAME_MAX];
3860
3861 strcpy(fullname, fname);
3862 gretl_maybe_prepend_dir(fullname);
3863 *err = real_import_csv(fullname, NULL, NULL, NULL,
3864 NULL, NULL, &m, opt, prn);
3865 }
3866
3867 gretl_print_destroy(prn);
3868
3869 return m;
3870 }
3871
/* Scan the series names in @dset from index 1 upward (index 0 is
   not examined) looking for an empty name. If one is found and
   OPT_H is already present in @opt, flag an error; if one is found
   and OPT_H is absent, set @rerun to 1 and report success. When all
   names are non-empty @rerun is left untouched and 0 is returned.
*/

static int probe_varnames_check (DATASET *dset, gretlopt opt,
                                 int *rerun)
{
    int nv = dset->v;
    int k = 1;

    /* advance to the first empty name, if any */
    while (k < nv && dset->varname[k][0] != '\0') {
        k++;
    }

    if (k >= nv) {
        /* every name is present */
        return 0;
    }

    if (opt & OPT_H) {
        gretl_errmsg_set("Couldn't find all variable names");
        return E_DATA;
    }

    *rerun = 1;

    return 0;
}
3896
3897 /**
3898 * probe_csv:
3899 * @fname: name of CSV file.
3900 * @varnames: location to receive variable names.
3901 * @nvars: location to receive number of variables (columns).
3902 * @opt: on input, may contain any extra options to pass to
3903 * real_import_csv(); on return, OPT_H (indicating that the
3904 * CSV file has no header) may be added if it seems to be
3905 * required (no header).
3906 *
3907 * Open a Comma-Separated Values data file and read enough to
3908 * determine the variable names.
3909 *
3910 * Returns: 0 on successful completion, non-zero otherwise.
3911 */
3912
probe_csv(const char * fname,char *** varnames,int * nvars,gretlopt * opt)3913 int probe_csv (const char *fname, char ***varnames,
3914 int *nvars, gretlopt *opt)
3915 {
3916 csvprobe probe = {0};
3917 int err;
3918
3919 err = real_import_csv(fname, NULL, NULL, NULL, NULL,
3920 &probe, NULL, *opt, NULL);
3921
3922 if (!err) {
3923 int rerun = 0;
3924
3925 err = probe_varnames_check(probe.dset, *opt, &rerun);
3926
3927 if (err || rerun) {
3928 destroy_dataset(probe.dset);
3929 probe.dset = NULL;
3930 }
3931
3932 if (!err && rerun) {
3933 /* try again with --no-header flag */
3934 *opt |= OPT_H;
3935 err = real_import_csv(fname, NULL, NULL, NULL, NULL,
3936 &probe, NULL, *opt, NULL);
3937 }
3938
3939 if (!err) {
3940 /* steal the varname array */
3941 *varnames = probe.dset->varname;
3942 *nvars = probe.dset->v;
3943 probe.dset->varname = NULL;
3944 }
3945
3946 destroy_dataset(probe.dset);
3947 }
3948
3949 return err;
3950 }
3951
/* Returns 1 if @opt contains OPT_M, or if it contains OPT_F and
   the option string registered for OPEN under OPT_F is the name
   of an existing matrix; otherwise returns 0.
*/

int csv_open_needs_matrix (gretlopt opt)
{
    if (opt & OPT_M) {
        /* --rowmask=matrix */
        return 1;
    }

    if (opt & OPT_F) {
        /* --fixed-cols=whatever */
        const char *optval = get_optval_string(OPEN, OPT_F);

        return (get_matrix_by_name(optval) != NULL)? 1 : 0;
    }

    return 0;
}
3968