1 /*
2 * Read SPSS .sav files : based on spss.c in the GNU R 'foreign' package.
3 * Re-written for use with gretl by Allin Cottrell, November 2008.
4 *
5 * Original notice from spss.c:
6 *
7 * Copyright 2000-2000 Saikat DebRoy <saikat@stat.wisc.edu>
8 * Thomas Lumley <tlumley@u.washington.edu>
9 * Copyright 2005-8 R Core Development Team
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be
16 * useful, but WITHOUT ANY WARRANTY; without even the implied
17 * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
18 * PURPOSE. See the GNU General Public License for more
19 * details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program. If not, see <http://www.gnu.org/licenses/>.
23 */
24
25 #include "libgretl.h"
26 #include "version.h"
27 #include "gretl_string_table.h"
28 #include "swap_bytes.h"
29
30 #include <string.h>
31 #include <errno.h>
32 #include <stdarg.h>
33 #include <stdint.h>
34
35 #define SPSS_DEBUG 0
36
37 enum {
38 SPSS_NUMERIC,
39 SPSS_STRING,
40 SPSS_UNDEF
41 };
42
43 enum {
44 MISSING_NONE, /* no user-missing values */
45 MISSING_1, /* one user-missing value */
46 MISSING_2, /* two user-missing values */
47 MISSING_3, /* three user-missing values */
48 MISSING_RANGE, /* [a,b] */
49 MISSING_LOW, /* (-inf,a] */
50 MISSING_HIGH, /* (a,+inf] */
51 MISSING_RANGE_1, /* [a,b], c */
52 MISSING_LOW_1, /* (-inf,a], b */
53 MISSING_HIGH_1, /* (a,+inf), b */
54 MISSING_MAX /* sentinel */
55 };
56
57 #define MAX_SHORT_STRING 8
58 #define SYSMIS (-DBL_MAX)
59 #define MAX_CASESIZE (INT_MAX / sizeof(double) / 2)
60 #define MAX_CASES (INT_MAX / 2)
61
62 /* Divides nonnegative x by positive y, rounding up */
63 #define DIV_RND_UP(x,y) (((x) + ((y) - 1)) / (y))
64
65 /* Rounds x up to the next multiple of y */
66 #define ROUND_UP(x,y) (((x) + ((y) - 1)) / (y) * (y))
67
68 /* Gives the padding needed to round x to next multiple of y */
69 #define REM_RND_UP(x,y) ((x) % (y) ? (y) - (x) % (y) : 0)
70
71 typedef struct spss_var_ spss_var;
72 typedef struct spss_data_ spss_data;
73 typedef struct spss_labelset_ spss_labelset;
74
75 struct spss_var_ {
76 int type;
77 int gretl_index; /* position in gretl dataset */
78 int width; /* Size of string variables in chars */
79 int n_ok_obs; /* number of non-missing values */
80 int fv, nv; /* Index into values, number of values */
81 int offset; /* Index for retrieving actual values */
82 int miss_type; /* One of the MISSING_* constants */
83 double missing[3]; /* User-missing value */
84 char name[VNAMELEN];
85 char label[MAXLABEL];
86 };
87
88 struct spss_labelset_ {
89 int nlabels;
90 int vtype;
91 int *varlist;
92 double *vals;
93 char **labels;
94 };
95
96 /* SPSS native record: general dataset information */
97 struct sysfile_header {
98 char rec_type[4]; /* Record-type code, "$FL2" */
99 char prod_name[60]; /* Product identification */
100 int32_t layout_code; /* 2 */
101 int32_t case_size; /* Number of 'values per case' (i.e. variables) */
102 int32_t compressed; /* 1 = yes, 0 = no */
103 int32_t weight_index; /* 1-based index of weighting var, or zero */
104 int32_t ncases; /* Number of cases (observations), -1 if unknown */
105 double bias; /* Compression bias (100.0) */
106 char creation_date[9]; /* 'dd mmm yy' creation date of file */
107 char creation_time[8]; /* 'hh:mm:ss' 24-hour creation time */
108 char file_label[64]; /* File label */
109 char padding[3];
110 };
111
112 /* SPSS native record: info on variable */
113 struct sysfile_variable {
114 int32_t rec_type; /* must be 2 */
115 int32_t type; /* 0 = numeric, 1-255 = string width,
116 (allowed to be up to 65535 in 0.8-24)
117 -1 = continued string */
118 int32_t has_var_label; /* 1 = yes, 0 = no */
119 int32_t n_missing_values; /* Missing value code: -3, -2, 0, 1, 2, or 3 */
120 int32_t print; /* Print format */
121 int32_t write; /* Write format */
122 char name[8]; /* Variable name */
123 /* The rest of the structure varies */
124 };
125
126 /* extended info regarding SPSS sav file */
127 struct sav_extension {
128 /* special constants */
129 double sysmis;
130 double highest;
131 double lowest;
132
133 double *buf; /* decompression buffer */
134 double *ptr; /* current location in buffer */
135 double *end; /* end of buffer marker */
136
137 /* compression instruction octet and pointer */
138 unsigned char x[sizeof(double)];
139 unsigned char *y;
140 };
141
142 /* working information on dataset */
143 struct spss_data_ {
144 FILE *fp; /* stream for reading */
145 gretlopt opt; /* option flags */
146 int nvars; /* number of variables (really, 'elements') */
147 int nobs; /* number of observations ('cases') */
148 int swapends; /* reversing endianness? (0/1) */
149 int encoding; /* encoding for strings */
150 int max_sv; /* index of highest-numbered string variable */
151 spss_var *vars; /* info on individual variables */
152 int nlabelsets; /* number of sets of value -> label mappings */
153 spss_labelset **labelsets; /* sets of value -> label mappings */
154 struct sav_extension ext; /* extra info */
155 gretl_string_table *st; /* string table for string-valued variables */
156 char *descrip; /* dataset description */
157 int *droplist; /* list of 'empty' variables to drop */
158 };
159
160 #define CONTD(d,i) (d->vars[i].type == -1)
161
162 static void free_labelset (spss_labelset *lset);
163
164 #if SPSS_DEBUG
mt_string(int mt)165 static const char *mt_string (int mt)
166 {
167 switch(mt) {
168 case MISSING_1: return "MISSING_1";
169 case MISSING_2: return "MISSING_2";
170 case MISSING_3: return "MISSING_3";
171 case MISSING_RANGE: return "MISSING_RANGE";
172 case MISSING_LOW: return "MISSING_LOW";
173 case MISSING_HIGH: return "MISSING_HIGH";
174 case MISSING_RANGE_1: return "MISSING_RANGE_1";
175 case MISSING_LOW_1: return "MISSING_LOW_1";
176 case MISSING_HIGH_1: return "MISSING_HIGH_1";
177 default: return "??";
178 }
179
180 return "??";
181 }
182 #endif
183
second_lowest_double_val(void)184 static double second_lowest_double_val (void)
185 {
186 #ifdef WORDS_BIGENDIAN
187 union {
188 unsigned char c[8];
189 double d;
190 } second_lowest = {{0xff, 0xef, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfe}};
191 #else
192 union {
193 unsigned char c[8];
194 double d;
195 } second_lowest = {{0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xef, 0xff}};
196 #endif
197
198 return second_lowest.d;
199 }
200
sav_error(const char * fmt,...)201 static int sav_error (const char *fmt, ...)
202 {
203 char msg[512];
204 va_list args;
205
206 if (fmt == NULL) {
207 return 1;
208 }
209
210 va_start(args, fmt);
211 vsnprintf(msg, sizeof msg, fmt, args);
212 va_end(args);
213
214 gretl_errmsg_set(msg);
215
216 #if SPSS_DEBUG
217 fprintf(stderr, "spss_error: %s\n", msg);
218 #endif
219
220 return 1;
221 }
222
sav_read_int32(spss_data * sdat,int * err)223 static int sav_read_int32 (spss_data *sdat, int *err)
224 {
225 int32_t ret = 0;
226
227 *err = fread(&ret, sizeof ret, 1, sdat->fp) != 1;
228
229 if (!*err && sdat->swapends) {
230 reverse_int(ret);
231 }
232
233 return ret;
234 }
235
236 /* Read record type 7, subtype 3 */
237
read_rec_7_3(spss_data * sdat,int size,int count)238 static int read_rec_7_3 (spss_data *sdat, int size, int count)
239 {
240 int file_big_endian;
241 int32_t data[8];
242 int i, enc;
243 int err = 0;
244
245 if (size != sizeof(int32_t) || count != 8) {
246 return sav_error("Bad size (%d) or count (%d) field on record type 7, "
247 "subtype 3. Expected size %d, count 8.",
248 size, count, sizeof(int32_t));
249 }
250
251 if (fread(data, sizeof data, 1, sdat->fp) != 1) {
252 return E_DATA;
253 }
254
255 if (sdat->swapends) {
256 for (i=0; i<8; i++) {
257 reverse_int(data[i]);
258 }
259 }
260
261 if (data[4] != 1) {
262 return sav_error("Floating-point representation in SPSS file is not IEEE-754.");
263 }
264
265 #ifdef WORDS_BIGENDIAN
266 file_big_endian = !sdat->swapends;
267 #else
268 file_big_endian = sdat->swapends;
269 #endif
270
271 if ((file_big_endian && data[6] != 1) || (!file_big_endian && data[6] != 2)) {
272 return sav_error("I'm confused over the endianness of this .sav file");
273 }
274
275 enc = sdat->encoding = data[7];
276
277 #if SPSS_DEBUG
278 fprintf(stderr, "rec_7_3 encoding: got %d\n", enc);
279 #endif
280
281 /* See
282 http://www.nabble.com/problem-loading-SPSS-15.0-save-files-t2726500.html
283 */
284
285 if (enc == 1 || enc == 4) {
286 return sav_error("Indicated character encoding (%s) is not ASCII",
287 (enc == 1)? "EBCDIC" : "DEC Kanji");
288 }
289
290 if (enc >= 500) {
291 fprintf(stderr, "Indicated character encoding (%d) may be a Windows codepage\n", enc);
292 } else if (enc > 4) {
293 fprintf(stderr, "Indicated character encoding (%d) is unknown", enc);
294 }
295
296 return err;
297 }
298
299 /* Read record type 7, subtype 4 */
300
read_rec_7_4(spss_data * sdat,int size,int count)301 static int read_rec_7_4 (spss_data *sdat, int size, int count)
302 {
303 double data[3];
304 int i;
305
306 if (size != sizeof(double) || count != 3) {
307 return sav_error("Bad size (%d) or count (%d) field on record type 7, "
308 "subtype 4. Expected size %d, count 8",
309 size, count, sizeof(double));
310 }
311
312 if (fread(data, sizeof data, 1, sdat->fp) != 1) {
313 return E_DATA;
314 }
315
316 if (sdat->swapends) {
317 for (i=0; i<3; i++) {
318 reverse_double(data[i]);
319 }
320 }
321
322 if (data[0] != SYSMIS || data[1] != DBL_MAX || data[2] != second_lowest_double_val()) {
323 sdat->ext.sysmis = data[0];
324 sdat->ext.highest = data[1];
325 sdat->ext.lowest = data[2];
326 fprintf(stderr, "File-indicated value is different from internal value for at "
327 "least one of the three system values. SYSMIS: indicated %g, "
328 "expected %g; HIGHEST: %g, %g; LOWEST: %g, %g\n",
329 data[0], SYSMIS,
330 data[1], DBL_MAX,
331 data[2], second_lowest_double_val());
332 }
333
334 return 0;
335 }
336
get_spss_var_by_name(spss_data * sdat,const char * s)337 static spss_var *get_spss_var_by_name (spss_data *sdat, const char *s)
338 {
339 int i;
340
341 for (i=0; i<sdat->nvars; i++) {
342 if (!strcmp(sdat->vars[i].name, s)) {
343 return &sdat->vars[i];
344 }
345 }
346
347 return NULL;
348 }
349
recode_sav_string(char * targ,const char * src,int encoding,int maxlen)350 static int recode_sav_string (char *targ, const char *src,
351 int encoding, int maxlen)
352 {
353 int err = 0;
354
355 #if SPSS_DEBUG > 1
356 fprintf(stderr, "recode_sav_string: '%s', enc=%d\n", src, encoding);
357 #endif
358
359 *targ = '\0';
360
361 if (g_utf8_validate(src, -1, NULL)) {
362 gretl_utf8_strncat_trim(targ, src, maxlen);
363 } else if (encoding == 28591) {
364 /* https://www.gnu.org/software/pspp/pspp-dev/html_node/ \
365 Machine-Integer-Info-Record.html */
366 gchar *conv;
367 gsize wrote = 0;
368
369 conv = g_convert(src, -1, "UTF-8", "ISO8859-1",
370 NULL, &wrote, NULL);
371 if (conv != NULL) {
372 gretl_utf8_strncat_trim(targ, conv, maxlen);
373 g_free(conv);
374 } else {
375 err = E_DATA;
376 }
377 } else if (encoding >= 500 && encoding <= 9999) {
378 /* try Windows codepage? */
379 char cpage[8];
380 gchar *conv;
381 gsize wrote = 0;
382
383 sprintf(cpage, "CP%d", encoding);
384 conv = g_convert(src, -1, "UTF-8", cpage,
385 NULL, &wrote, NULL);
386 if (conv != NULL) {
387 gretl_utf8_strncat_trim(targ, conv, maxlen);
388 g_free(conv);
389 } else {
390 err = E_DATA;
391 }
392 } else {
393 err = E_DATA;
394 }
395
396 if (err) {
397 strcpy(targ, "unknown");
398 }
399
400 return err;
401 }
402
strip_vname_illegals(char * s)403 static void strip_vname_illegals (char *s)
404 {
405 char name[VNAMELEN] = {0};
406 int i, j = 0;
407
408 for (i=0; s[i] != '\0'; i++) {
409 if (isalnum(s[i]) || s[i] == '_') {
410 name[j++] = s[i];
411 }
412 }
413
414 name[j] = '\0';
415 strcpy(s, name);
416 }
417
check_sav_varname(char * vname,const char * s)418 static int check_sav_varname (char *vname, const char *s)
419 {
420 /* skip any leading garbage */
421 while (*s && !isalpha(*s)) s++;
422
423 strncat(vname, s, VNAMELEN - 1);
424 iso_to_ascii(vname);
425 strip_vname_illegals(vname);
426 return check_varname(vname);
427 }
428
429 /* Read record type 7, subtype 13: long variable names */
430
read_long_varnames(spss_data * sdat,unsigned size,unsigned count)431 static int read_long_varnames (spss_data *sdat, unsigned size,
432 unsigned count)
433 {
434 char *buf, *val;
435 char *p, *endp;
436 spss_var *var;
437
438 if (size != 1 || count == 0) {
439 fprintf(stderr, "Strange record info: size=%u, count=%u,"
440 "ignoring long variable names\n", size, count);
441 return E_DATA;
442 }
443
444 size *= count;
445
446 #if SPSS_DEBUG
447 fprintf(stderr, "long_varnames: getting %u bytes\n", size);
448 #endif
449
450 buf = calloc(size + 1, 1);
451 if (buf == NULL) {
452 return E_ALLOC;
453 }
454
455 if (fread(buf, size, 1, sdat->fp) != 1) {
456 free(buf);
457 return E_DATA;
458 }
459
460 p = buf;
461
462 do {
463 if ((endp = strchr(p, '\t')) != NULL) {
464 *endp = '\0'; /* put null terminator */
465 }
466 if ((val = strchr(p, '=')) == NULL) {
467 fprintf(stderr, "No long name for variable '%s'\n", p);
468 } else {
469 *val = '\0';
470 ++val;
471 /* at this point p is key, val is long name */
472 var = get_spss_var_by_name(sdat, p);
473 if (var != NULL) {
474 /* replace original name with "long name" */
475 #if SPSS_DEBUG > 1
476 fprintf(stderr, "'%s' -> '%s'\n", p, val);
477 #endif
478 *var->name = '\0';
479 check_sav_varname(var->name, val);
480 }
481 }
482 if (endp != NULL) {
483 p = endp + 1;
484 }
485 } while (endp != NULL);
486
487 free(buf);
488
489 return 0;
490 }
491
492 /* read a subrecord of SPSS record type 7 */
493
read_subrecord(spss_data * sdat)494 static int read_subrecord (spss_data *sdat)
495 {
496 struct {
497 int32_t subtype;
498 int32_t size;
499 int32_t count;
500 } data;
501 int skip = 0;
502 int err = 0;
503
504 if (fread(&data, sizeof data, 1, sdat->fp) != 1) {
505 return E_DATA;
506 }
507
508 #if SPSS_DEBUG
509 fprintf(stderr, "subtype = %d, size = %d, count = %d\n",
510 data.subtype, data.size, data.count);
511 #endif
512
513 if (sdat->swapends) {
514 reverse_int(data.subtype);
515 reverse_int(data.size);
516 reverse_int(data.count);
517 }
518
519 switch (data.subtype) {
520 case 3:
521 err = read_rec_7_3(sdat, data.size, data.count);
522 break;
523 case 4:
524 err = read_rec_7_4(sdat, data.size, data.count);
525 break;
526 case 5:
527 case 6:
528 case 11: /* ? Used by SPSS 8.0 */
529 skip = 1;
530 break;
531 case 7: /* Multiple-response sets (later versions of SPSS) */
532 skip = 1;
533 break;
534 case 13:
535 err = read_long_varnames(sdat, data.size, data.count);
536 break;
537 case 16:
538 /* http://www.nabble.com/problem-loading-SPSS-15.0-save-files-t2726500.html */
539 skip = 1;
540 break;
541 case 20:
542 fprintf(stderr, "Found subtype 20: Character Encoding Record (FIXME)\n");
543 skip = 1;
544 break;
545 default:
546 fprintf(stderr, "Unrecognized record type 7, subtype %d encountered "
547 "in sav file\n", data.subtype);
548 skip = 1;
549 }
550
551 if (skip) {
552 int n = data.size * data.count;
553
554 fprintf(stderr, "record type 7, subtype %d: skipping %d bytes\n",
555 data.subtype, n);
556 fseek(sdat->fp, n, SEEK_CUR);
557 }
558
559 return err;
560 }
561
read_type_4(spss_data * sdat,int * err)562 static int read_type_4 (spss_data *sdat, int *err)
563 {
564 int32_t rec_type, n_vars = 0;
565
566 rec_type = sav_read_int32(sdat, err);
567 if (*err) {
568 return 0;
569 }
570
571 if (rec_type != 4) {
572 fprintf(stderr, "Variable index record (type 4) does not immediately\n"
573 "follow value label record (type 3) as it should\n");
574 *err = E_DATA;
575 return 0;
576 }
577
578 n_vars = sav_read_int32(sdat, err);
579 if (*err) {
580 return 0;
581 }
582
583 if (n_vars < 1 || n_vars > sdat->nvars) {
584 *err = sav_error("Number of variables associated with a value label "
585 "(%d) is not between 1 and the number of variables (%d)",
586 n_vars, sdat->nvars);
587 }
588
589 return n_vars;
590 }
591
sdat_remove_labelset(spss_data * sdat)592 static void sdat_remove_labelset (spss_data *sdat)
593 {
594 int n = sdat->nlabelsets - 1;
595
596 if (n < 0) {
597 return;
598 }
599
600 free_labelset(sdat->labelsets[n]);
601 sdat->labelsets[n] = NULL;
602 sdat->nlabelsets = n;
603 }
604
sdat_add_labelset(spss_data * sdat,int n_labels)605 static int sdat_add_labelset (spss_data *sdat, int n_labels)
606 {
607 spss_labelset *lset, **lsets;
608 int n = sdat->nlabelsets + 1;
609 int err = 0;
610
611 if (n_labels <= 0) {
612 return E_DATA;
613 }
614
615 lsets = realloc(sdat->labelsets, n * sizeof *lsets);
616 if (lsets == NULL) {
617 return E_ALLOC;
618 }
619
620 lsets[n-1] = NULL;
621 sdat->labelsets = lsets;
622 sdat->nlabelsets = n;
623
624 lset = malloc(sizeof *lset);
625 if (lset == NULL) {
626 return E_ALLOC;
627 }
628
629 lsets[n-1] = lset;
630 lset->nlabels = n_labels;
631 lset->vtype = SPSS_UNDEF;
632
633 lset->varlist = NULL;
634 lset->vals = NULL;
635 lset->labels = NULL;
636
637 lset->vals = malloc(n_labels * sizeof *lset->vals);
638 if (lset->vals == NULL) {
639 err = E_ALLOC;
640 } else {
641 lset->labels = strings_array_new(n_labels);
642 if (lset->labels == NULL) {
643 err = E_ALLOC;
644 }
645 }
646
647 return err;
648 }
649
check_label_varindex(spss_data * sdat,spss_labelset * lset,int idx)650 static int check_label_varindex (spss_data *sdat, spss_labelset *lset, int idx)
651 {
652 int err = 0;
653
654 if (idx < 1 || idx > sdat->nvars) {
655 err = sav_error("Variable index associated with value label (%d) is "
656 "not between 1 and the number of values (%d)",
657 idx, sdat->nvars);
658 } else {
659 spss_var *v = &sdat->vars[idx-1];
660
661 if (v->type == -1) {
662 err = sav_error("Variable index associated with value label (%d) refers "
663 "to a continuation of a string variable, not to an actual "
664 "variable", idx);
665 } else if (v->type == SPSS_STRING && v->width > MAX_SHORT_STRING) {
666 err = sav_error("Value labels are not allowed on long string variables (%s)",
667 v->name);
668 } else if (lset->vtype == SPSS_UNDEF) {
669 /* record type of first variable */
670 lset->vtype = v->type;
671 } else if (v->type != lset->vtype) {
672 err = sav_error("Variables associated with value label are not all "
673 "of the same type.");
674 }
675 }
676
677 return err;
678 }
679
read_value_labels(spss_data * sdat)680 static int read_value_labels (spss_data *sdat)
681 {
682 FILE *fp = sdat->fp;
683 spss_labelset *lset;
684 int32_t n_labels = 0; /* Number of labels */
685 int32_t n_vars = 0; /* Number of associated variables */
686 int empty_label = 0;
687 int i, err = 0;
688
689 n_labels = sav_read_int32(sdat, &err);
690 if (err) {
691 return err;
692 }
693
694 #if SPSS_DEBUG
695 fprintf(stderr, "\n*** label set: %d labels\n", n_labels);
696 #endif
697
698 err = sdat_add_labelset(sdat, n_labels);
699 if (err) {
700 return err;
701 }
702
703 lset = sdat->labelsets[sdat->nlabelsets - 1];
704
705 /* first step: read the value/label pairs */
706
707 for (i=0; i<n_labels && !err; i++) {
708 char label[256] = {0};
709 double value = 0;
710 unsigned char label_len = 0;
711 int rem, fgot = 0;
712
713 /* read value, label length, label */
714 fgot += fread(&value, sizeof value, 1, fp);
715 fgot += fread(&label_len, 1, 1, fp);
716 fgot += fread(label, label_len, 1, fp);
717
718 #if SPSS_DEBUG
719 fprintf(stderr, "i=%d: fgot=%d, value=%g, len=%d, '%s'\n",
720 i, fgot, value, label_len, label);
721 #endif
722 if (fgot < 3) {
723 if (n_labels == 1 && label_len == 0) {
724 /* tolerate apparent breakage here? */
725 empty_label = 1;
726 } else {
727 err = E_DATA;
728 break;
729 }
730 }
731
732 lset->vals[i] = value;
733 lset->labels[i] = gretl_strdup(label);
734 if (lset->labels[i] == NULL) {
735 err = E_ALLOC;
736 break;
737 }
738
739 /* skip padding */
740 rem = REM_RND_UP(label_len + 1, sizeof(double));
741 if (rem) {
742 fseek(fp, rem, SEEK_CUR);
743 }
744 }
745
746 /* second step: read the type 4 record holding the list of
747 variables to which the value labels are to be applied
748 */
749
750 if (!err) {
751 n_vars = read_type_4(sdat, &err);
752 #if SPSS_DEBUG
753 fprintf(stderr, "Got %d associated variables\n", n_vars);
754 #endif
755 }
756
757 if (!err && n_vars > 0) {
758 lset->varlist = gretl_list_new(n_vars);
759 if (lset->varlist == NULL) {
760 fprintf(stderr, "lset->varlist: failed, n_vars = %d\n", n_vars);
761 err = E_ALLOC;
762 }
763 }
764
765 for (i=0; i<n_vars && !err; i++) {
766 int32_t idx;
767
768 idx = sav_read_int32(sdat, &err);
769 if (err) {
770 break;
771 }
772
773 err = check_label_varindex(sdat, lset, idx);
774 if (!err) {
775 lset->varlist[i+1] = idx;
776 #if SPSS_DEBUG
777 fprintf(stderr, " %3d: idx = %d (%s)\n", i, idx,
778 sdat->vars[idx-1].name);
779 #endif
780 }
781 }
782
783 /* we'll preserve value -> label mappings only for numeric
784 variables */
785 if (!err && (empty_label || lset->vtype != SPSS_NUMERIC)) {
786 sdat_remove_labelset(sdat);
787 }
788
789 return err;
790 }
791
792 /* Reads an SPSS "document" record, type 6, and discards it: for now,
793 this is just to keep our place in the stream */
794
read_documents(spss_data * sdat)795 static int read_documents (spss_data *sdat)
796 {
797 int32_t n_lines;
798 int err = 0;
799
800 n_lines = sav_read_int32(sdat, &err);
801
802 if (n_lines <= 0) {
803 fprintf(stderr, "Number of document lines (%d) must be greater than 0\n",
804 n_lines);
805 err = E_DATA;
806 } else {
807 int i, n = 80 * n_lines;
808 unsigned char c;
809 size_t b = 0;
810
811 for (i=0; i<n; i++) {
812 b = fread(&c, 1, 1, sdat->fp);
813 }
814
815 fprintf(stderr, "read_documents: got %d lines (%d bytes)\n",
816 n_lines, (int) b);
817 }
818
819 return err;
820 }
821
822 /* Read records of types 3, 4, 6, and 7 */
823
read_sav_other_records(spss_data * sdat)824 static int read_sav_other_records (spss_data *sdat)
825 {
826 FILE *fp = sdat->fp;
827 int32_t pad, rec_type = 0;
828 size_t n;
829 int err = 0;
830
831 while (rec_type >= 0 && !err) {
832 rec_type = sav_read_int32(sdat, &err);
833 if (err) {
834 break;
835 }
836
837 #if SPSS_DEBUG > 1
838 fprintf(stderr, "read_sav_other_records: type = %d\n", rec_type);
839 #endif
840
841 switch (rec_type) {
842 case 3:
843 err = read_value_labels(sdat);
844 break;
845 case 4:
846 fprintf(stderr, "Orphaned variable index record (type 4). Type 4\n"
847 "records must immediately follow type 3 records\n");
848 err = E_DATA;
849 break;
850 case 6:
851 err = read_documents(sdat);
852 break;
853 case 7:
854 read_subrecord(sdat);
855 break;
856 case 999:
857 n = fread(&pad, sizeof pad, 1, fp);
858 if (n != 1) {
859 fprintf(stderr, "couldn't read padding\n");
860 }
861 rec_type = -1;
862 break;
863 default:
864 fprintf(stderr, "Unrecognized record type %d\n", rec_type);
865 err = E_DATA;
866 break;
867 }
868 }
869
870 return err;
871 }
872
sdat_add_variables(spss_data * sdat)873 static int sdat_add_variables (spss_data *sdat)
874 {
875 int i;
876
877 sdat->vars = malloc(sdat->nvars * sizeof *sdat->vars);
878 if (sdat->vars == NULL) {
879 return E_ALLOC;
880 }
881
882 for (i=0; i<sdat->nvars; i++) {
883 sdat->vars[i].type = SPSS_NUMERIC;
884 sdat->vars[i].gretl_index = -1;
885 sdat->vars[i].n_ok_obs = 0;
886 sdat->vars[i].miss_type = MISSING_NONE;
887 sdat->vars[i].name[0] = '\0';
888 sdat->vars[i].label[0] = '\0';
889 sdat->vars[i].offset = -1;
890 }
891
892 return 0;
893 }
894
validate_var_info(struct sysfile_variable * sv,int i)895 static int validate_var_info (struct sysfile_variable *sv, int i)
896 {
897 int err = 0;
898
899 if (sv->type < 0 || sv->type > 255) {
900 err = sav_error("position %d: bad variable type code %d\n", i, sv->type);
901 } else if (sv->has_var_label != 0 && sv->has_var_label != 1) {
902 err = sav_error("position %d: variable label indicator field is not 0 or 1\n", i);
903 } else if (sv->n_missing_values < -3 || sv->n_missing_values > 3 ||
904 sv->n_missing_values == -1) {
905 err = sav_error("position %d: missing value indicator field is not "
906 "-3, -2, 0, 1, 2, or 3\n", i);
907 }
908
909 return err;
910 }
911
transcribe_varname(spss_data * sdat,struct sysfile_variable * sv,int i)912 static int transcribe_varname (spss_data *sdat, struct sysfile_variable *sv, int i)
913 {
914 spss_var *v = &sdat->vars[i];
915 unsigned char c = sv->name[0];
916 int j = 0, k = 0;
917
918 if (!isalpha(c) && c != '@' && c != '#') {
919 return sav_error("position %d: Variable name begins with invalid character", i);
920 }
921
922 if (c == '#') {
923 fprintf(stderr, "position %d: Variable name begins with '#'.\n"
924 "Scratch variables should not appear in .sav files\n", i);
925 j++;
926 }
927
928 v->name[k++] = sv->name[j++];
929
930 for (; j<8; j++) {
931 c = sv->name[j];
932 if (isspace(c)) {
933 break;
934 } else if (c == '.') {
935 v->name[k++] = '_';
936 } else if (isalnum(c) || c == '_') {
937 v->name[k++] = c;
938 } else {
939 fprintf(stderr, "position %d: character `\\%03o' (%c) is not valid in a "
940 "variable name\n", i, c, c);
941 }
942 }
943
944 v->name[k] = 0;
945
946 #if SPSS_DEBUG
947 fprintf(stderr, " name = '%s'\n", v->name);
948 #endif
949
950 return 0;
951 }
952
grab_var_label(spss_data * sdat,spss_var * v)953 static int grab_var_label (spss_data *sdat, spss_var *v)
954 {
955 size_t labread, rem = 0;
956 int32_t len;
957 int err = 0;
958
959 len = sav_read_int32(sdat, &err);
960 if (err) {
961 return err;
962 }
963
964 if (len < 0 || len > 65535) {
965 return sav_error("Variable %s indicates label of invalid length %d",
966 v->name, len);
967 }
968
969 labread = ROUND_UP(len, sizeof(int32_t));
970
971 if (labread > MAXLABEL - 1) {
972 rem = labread - (MAXLABEL - 1);
973 labread = MAXLABEL - 1;
974 if (len > MAXLABEL - 1) {
975 len = MAXLABEL - 1;
976 }
977 }
978
979 if (fread(v->label, labread, 1, sdat->fp) != 1) {
980 return sav_error("Variable %s: couldn't read label", v->name);
981 }
982
983 v->label[len] = '\0';
984 tailstrip(v->label);
985
986 #if SPSS_DEBUG
987 fprintf(stderr, " label '%s'\n", v->label);
988 #endif
989
990 if (rem > 0) {
991 /* skip excess label characters */
992 fseek(sdat->fp, rem, SEEK_CUR);
993 }
994
995 return 0;
996 }
997
record_missing_vals_info(spss_data * sdat,spss_var * v,int nmiss)998 static int record_missing_vals_info (spss_data *sdat, spss_var *v,
999 int nmiss)
1000 {
1001 int anmiss = abs(nmiss);
1002 double mv[3];
1003 int j;
1004
1005 if (v->width > MAX_SHORT_STRING) {
1006 return sav_error("Long string variable %s cannot have missing values",
1007 v->name);
1008 }
1009
1010 if (fread(mv, anmiss * sizeof *mv, 1, sdat->fp) != 1) {
1011 return sav_error("%s: couldn't mv", v->name);
1012 }
1013
1014 if (sdat->swapends && v->type == SPSS_NUMERIC) {
1015 for (j=0; j<anmiss; j++) {
1016 reverse_double(mv[j]);
1017 }
1018 }
1019
1020 if (nmiss > 0) {
1021 v->miss_type = nmiss;
1022 if (v->type == SPSS_NUMERIC) {
1023 for (j=0; j<nmiss; j++) {
1024 v->missing[j] = mv[j];
1025 }
1026 } else {
1027 for (j=0; j<nmiss; j++) {
1028 memcpy(&v->missing[j], &mv[j], v->width);
1029 }
1030 }
1031 } else if (v->type == SPSS_STRING) {
1032 return sav_error("String variable %s may not have missing values "
1033 "specified as a range", v->name);
1034 } else {
1035 j = 0;
1036 if (mv[0] == sdat->ext.lowest) {
1037 v->miss_type = MISSING_LOW;
1038 v->missing[j++] = mv[1];
1039 } else if (mv[1] == sdat->ext.highest) {
1040 v->miss_type = MISSING_HIGH;
1041 v->missing[j++] = mv[0];
1042 } else {
1043 v->miss_type = MISSING_RANGE;
1044 v->missing[j++] = mv[0];
1045 v->missing[j++] = mv[1];
1046 }
1047
1048 if (nmiss == -3) {
1049 v->miss_type += 3;
1050 v->missing[j++] = mv[2];
1051 }
1052 }
1053
1054 #if SPSS_DEBUG
1055 fprintf(stderr, " miss_type = %d (%s)\n", v->miss_type, mt_string(v->miss_type));
1056 int i;
1057 for (i=0; i<j; i++) {
1058 fprintf(stderr, " missing[%d] = %g\n", i, v->missing[i]);
1059 }
1060 #endif
1061
1062 return 0;
1063 }
1064
1065 /* read info on the variables in SPSS .sav file */
1066
read_sav_variables(spss_data * sdat,struct sysfile_header * hdr)1067 static int read_sav_variables (spss_data *sdat, struct sysfile_header *hdr)
1068 {
1069 struct sysfile_variable sv;
1070 int long_string_count = 0; /* # of long string continuation
1071 records still expected */
1072 int next_value = 0; /* index to next 'value' structure */
1073 int gidx = 0; /* gretl index number for variable */
1074 int i, err;
1075
1076 err = sdat_add_variables(sdat);
1077 if (err) {
1078 return err;
1079 }
1080
1081 for (i=0; i<sdat->nvars && !err; i++) {
1082 spss_var *v = &sdat->vars[i];
1083
1084 if (fread(&sv, sizeof sv, 1, sdat->fp) != 1) {
1085 err = E_DATA;
1086 break;
1087 }
1088
1089 if (sdat->swapends) {
1090 reverse_int(sv.rec_type);
1091 reverse_int(sv.type);
1092 reverse_int(sv.has_var_label);
1093 reverse_int(sv.n_missing_values);
1094 reverse_int(sv.print);
1095 reverse_int(sv.write);
1096 }
1097
1098 if (sv.rec_type != 2) {
1099 fprintf(stderr, "position %d: Bad record type (%d); the expected "
1100 "value was 2\n", i, sv.rec_type);
1101 err = E_DATA;
1102 break;
1103 }
1104
1105 #if SPSS_DEBUG
1106 fprintf(stderr, "*** position %d\n", i);
1107 fprintf(stderr, " type = %d\n", sv.type);
1108 fprintf(stderr, " has_var_label = %d\n", sv.has_var_label);
1109 fprintf(stderr, " n_missing_values = %d\n", sv.n_missing_values);
1110 #endif
1111
1112 /* If there was a long string previously, make sure that the
1113 continuations are present; otherwise make sure there aren't
1114 any */
1115 if (long_string_count) {
1116 if (sv.type == -1) {
1117 #if SPSS_DEBUG
1118 fprintf(stderr, " long string continuation\n");
1119 #endif
1120 v->type = -1;
1121 long_string_count--;
1122 /* note: no further processing here */
1123 continue;
1124 } else {
1125 err = sav_error("position %d: string variable is missing a "
1126 "continuation record", i);
1127 break;
1128 }
1129 } else if (sv.type == -1) {
1130 fprintf(stderr, "position %d: superfluous string continuation record\n", i);
1131 }
1132
1133 err = validate_var_info(&sv, i);
1134
1135 if (!err) {
1136 err = transcribe_varname(sdat, &sv, i);
1137 }
1138
1139 #if SPSS_DEBUG
1140 fputs((sv.type == SPSS_NUMERIC)? " NUMERIC\n" : " STRING\n", stderr);
1141 #endif
1142
1143 if (!err) {
1144 v->gretl_index = ++gidx;
1145 if (sv.type == SPSS_NUMERIC) {
1146 v->width = 0;
1147 v->offset = next_value++;
1148 v->nv = 1;
1149 } else {
1150 v->type = SPSS_STRING;
1151 v->width = sv.type;
1152 v->nv = DIV_RND_UP(v->width, sizeof(double));
1153 v->offset = next_value;
1154 next_value += v->nv;
1155 long_string_count = v->nv - 1;
1156 }
1157 }
1158
1159 /* get the variable label, if any */
1160 if (!err && sv.has_var_label == 1) {
1161 err = grab_var_label(sdat, v);
1162 }
1163
1164 /* set missing values, if applicable */
1165 if (!err && sv.n_missing_values != 0) {
1166 err = record_missing_vals_info(sdat, v, sv.n_missing_values);
1167 }
1168 }
1169
1170 /* consistency checks */
1171
1172 if (long_string_count != 0) {
1173 fprintf(stderr, "Long string continuation records omitted at end of dictionary\n");
1174 }
1175
1176 if (next_value != hdr->case_size) {
1177 fprintf(stderr, "System file header indicates %d variable positions but %d were "
1178 "read from file\n", hdr->case_size, next_value);
1179 }
1180
1181 return 0;
1182 }
1183
print_product_name(struct sysfile_header * hdr)1184 static void print_product_name (struct sysfile_header *hdr)
1185 {
1186 char prod_name[sizeof hdr->prod_name + 1];
1187 int i;
1188
1189 memcpy(prod_name, hdr->prod_name, sizeof hdr->prod_name);
1190
1191 for (i=0; i<60; i++) {
1192 if (!isprint((unsigned char) prod_name[i])) {
1193 prod_name[i] = ' ';
1194 }
1195 }
1196
1197 for (i=59; i>=0; i--) {
1198 if (!isgraph((unsigned char) prod_name[i])) {
1199 prod_name[i] = '\0';
1200 break;
1201 }
1202 }
1203
1204 prod_name[60] = '\0';
1205
1206 fprintf(stderr, "%s\n", prod_name);
1207 }
1208
sdat_add_descrip(spss_data * sdat,struct sysfile_header * hdr)1209 static void sdat_add_descrip (spss_data *sdat, struct sysfile_header *hdr)
1210 {
1211 char datestr[10];
1212 char *s = hdr->file_label;
1213 char *label = NULL;
1214 int i, n = 11;
1215
1216 memcpy(datestr, hdr->creation_date, 9);
1217 datestr[9] = '\0';
1218 fprintf(stderr, "Creation date %s\n", datestr);
1219
1220 /* preserve the header label if it's not empty */
1221
1222 for (i = sizeof hdr->file_label - 1; i >= 0; i--) {
1223 if (!isspace((unsigned char) s[i]) && s[i] != '\0') {
1224 label = calloc(i + 2, 1);
1225 if (label != NULL) {
1226 memcpy(label, s, i + 1);
1227 label[i + 1] = '\0';
1228 fprintf(stderr, "label: '%s'\n", label);
1229 }
1230 break;
1231 }
1232 }
1233
1234 if (label != NULL) {
1235 n += strlen(label) + 1;
1236 }
1237
1238 sdat->descrip = malloc(n);
1239
1240 if (sdat->descrip != NULL) {
1241 if (label != NULL) {
1242 sprintf(sdat->descrip, "%s\n%s\n", label, datestr);
1243 } else {
1244 sprintf(sdat->descrip, "%s\n", datestr);
1245 }
1246 }
1247
1248 free(label);
1249 }
1250
1251 /* read sav header rcord and check for errors */
1252
read_sav_header(spss_data * sdat,struct sysfile_header * hdr)1253 static int read_sav_header (spss_data *sdat, struct sysfile_header *hdr)
1254 {
1255 int fgot = 0;
1256
1257 fgot += fread(&hdr->rec_type, 4, 1, sdat->fp);
1258 fgot += fread(&hdr->prod_name, 60, 1, sdat->fp);
1259 fgot += fread(&hdr->layout_code, 4, 1, sdat->fp);
1260 fgot += fread(&hdr->case_size, 4, 1, sdat->fp);
1261 fgot += fread(&hdr->compressed, 4, 1, sdat->fp);
1262 fgot += fread(&hdr->weight_index, 4, 1, sdat->fp);
1263 fgot += fread(&hdr->ncases, 4, 1, sdat->fp);
1264 fgot += fread(&hdr->bias, 8, 1, sdat->fp);
1265 fgot += fread(&hdr->creation_date, 9, 1, sdat->fp);
1266 fgot += fread(&hdr->creation_time, 8, 1, sdat->fp);
1267 fgot += fread(&hdr->file_label, 64, 1, sdat->fp);
1268 fgot += fread(&hdr->padding, 3, 1, sdat->fp);
1269
1270 if (fgot < 12) {
1271 fprintf(stderr, "read_sav_header: failed to read full header\n");
1272 return E_DATA;
1273 }
1274
1275 print_product_name(hdr);
1276
1277 /* check endianness */
1278 if (hdr->layout_code == 2 || hdr->layout_code == 3) {
1279 fprintf(stderr, "layout_code = %d, no reverse endianness\n", hdr->layout_code);
1280 } else {
1281 fprintf(stderr, "Need to reverse endianness!\n");
1282 reverse_int(hdr->layout_code);
1283 if (hdr->layout_code != 2 && hdr->layout_code != 3) {
1284 return sav_error("File layout code has unexpected value %d. Value should be 2 or 3, "
1285 "in big-endian or little-endian format", hdr->layout_code);
1286 }
1287 sdat->swapends = 1;
1288 reverse_int(hdr->case_size);
1289 reverse_int(hdr->compressed);
1290 reverse_int(hdr->weight_index);
1291 reverse_int(hdr->ncases);
1292 reverse_double(hdr->bias);
1293 }
1294
1295 /* check basic header info */
1296
1297 if (hdr->case_size <= 0 || hdr->case_size > MAX_CASESIZE) {
1298 return sav_error("Number of elements per case (%d) is not between 1 and %d",
1299 hdr->case_size, MAX_CASESIZE);
1300 }
1301
1302 if (hdr->weight_index < 0 || hdr->weight_index > hdr->case_size) {
1303 return sav_error("Index of weighting variable (%d) is not between 0 and number "
1304 "of elements per case (%d)", hdr->weight_index, hdr->case_size);
1305 }
1306
1307 if (hdr->ncases < -1 || hdr->ncases > MAX_CASES) {
1308 return sav_error("Number of cases in file (%d) is not between -1 and %d",
1309 hdr->ncases, INT_MAX / 2);
1310 }
1311
1312 fprintf(stderr, "case_size (number of variables) = %d\n", hdr->case_size);
1313 fprintf(stderr, "compression = %d\n", hdr->compressed);
1314 fprintf(stderr, "weight index = %d\n", hdr->weight_index);
1315 fprintf(stderr, "ncases = %d\n", hdr->ncases);
1316 fprintf(stderr, "compression bias = %g\n", hdr->bias);
1317
1318 if (hdr->ncases == -1) {
1319 /* is there a way to count cases later? */
1320 return sav_error("Sorry, I don't know what to do with ncases = -1");
1321 }
1322
1323 if (hdr->bias != 100.0) {
1324 fprintf(stderr, "Warning: compression bias (%g) is not the usual value of 100\n",
1325 hdr->bias);
1326 }
1327
1328 sdat->nvars = hdr->case_size;
1329 sdat->nobs = hdr->ncases;
1330
1331 sdat_add_descrip(sdat, hdr);
1332
1333 return 0;
1334 }
1335
1336 /* For a given value of a given varable, check to see if it falls
1337 under a user-defined "missing" category. FIXME: do we really want
1338 to set all such values to NA? This may lose info regarding the
1339 reason why the value is missing.
1340 */
1341
spss_user_missing(spss_var * v,double x)1342 static int spss_user_missing (spss_var *v, double x)
1343 {
1344 int mt = v->miss_type;
1345 double a, b, c;
1346 int n, j, miss;
1347
1348 if (mt == MISSING_NONE) {
1349 return 0;
1350 }
1351
1352 a = b = c = 0;
1353 n = miss = 0;
1354
1355 switch (mt) {
1356 case MISSING_1:
1357 n = 1;
1358 break;
1359 case MISSING_2:
1360 n = 2;
1361 break;
1362 case MISSING_3:
1363 n = 3;
1364 break;
1365 case MISSING_LOW:
1366 case MISSING_HIGH:
1367 a = v->missing[0];
1368 break;
1369 case MISSING_RANGE:
1370 case MISSING_LOW_1:
1371 case MISSING_HIGH_1:
1372 a = v->missing[0];
1373 b = v->missing[1];
1374 break;
1375 case MISSING_RANGE_1:
1376 a = v->missing[0];
1377 b = v->missing[1];
1378 c = v->missing[2];
1379 break;
1380 default:
1381 break;
1382 }
1383
1384 /* FIXME SPSS_NUMERIC vs SPSS_STRING */
1385
1386 if (n > 0) {
1387 for (j=0; j<n; j++) {
1388 if (x == v->missing[j]) {
1389 miss = 1;
1390 break;
1391 }
1392 }
1393 } else if (mt == MISSING_RANGE) {
1394 /* [a,b] */
1395 miss = (x >= a && x <= b);
1396 } else if (mt == MISSING_LOW) {
1397 /* (-inf, a] */
1398 miss = (x <= a);
1399 } else if (mt == MISSING_HIGH) {
1400 /* (a,+inf] */
1401 miss = (x > a);
1402 } else if (mt == MISSING_RANGE_1) {
1403 /* [a,b], c */
1404 miss = (x >= a && x <= b) || x == c;
1405 } else if (mt == MISSING_LOW_1) {
1406 /* (-inf,a], b */
1407 miss = (x <= a) || x == b;
1408 } else if (mt == MISSING_HIGH_1) {
1409 /* (a,+inf), b */
1410 miss = (x > a) || x == b;
1411 }
1412
1413 return miss;
1414 }
1415
has_value_labels(spss_data * sdat,int i)1416 static int has_value_labels (spss_data *sdat, int i)
1417 {
1418 int j;
1419
1420 for (j=0; j<sdat->nlabelsets; j++) {
1421 if (in_gretl_list(sdat->labelsets[j]->varlist, i)) {
1422 return 1;
1423 }
1424 }
1425
1426 return 0;
1427 }
1428
buffer_input(struct sav_extension * ext,FILE * fp)1429 static int buffer_input (struct sav_extension *ext, FILE *fp)
1430 {
1431 size_t amt;
1432
1433 amt = fread(ext->buf, sizeof *ext->buf, 128, fp);
1434
1435 if (ferror(fp)) {
1436 sav_error("Error reading file: %s", strerror(errno));
1437 return 0;
1438 }
1439
1440 ext->ptr = ext->buf;
1441 ext->end = ext->buf + amt;
1442
1443 return amt;
1444 }
1445
1446 /* decompression routine for special SPSS-compressed data */
1447
read_compressed_data(spss_data * sdat,struct sysfile_header * hdr,double * tmp)1448 static int read_compressed_data (spss_data *sdat, struct sysfile_header *hdr,
1449 double *tmp)
1450 {
1451 struct sav_extension *ext = &sdat->ext;
1452 const unsigned char *p_end = ext->x + sizeof(double);
1453 unsigned char *p = ext->y;
1454 const double *tmp_beg = tmp;
1455 const double *tmp_end = tmp + sdat->nvars;
1456 int err = 0, done = 0;
1457
1458 if (ext->buf == NULL) {
1459 ext->buf = malloc(128 * sizeof *ext->buf);
1460 if (ext->buf == NULL) {
1461 return E_ALLOC;
1462 }
1463 }
1464
1465 while (!err) {
1466 for (; p < p_end && !err; p++) {
1467 switch (*p) {
1468 case 0:
1469 /* Code 0 is ignored */
1470 continue;
1471 case 252:
1472 /* Code 252: end of file */
1473 if (tmp_beg != tmp) {
1474 err = sav_error("Compressed data is corrupted: ends "
1475 "partway through a case");
1476 }
1477 break;
1478 case 253:
1479 /* Code 253: the value is stored explicitly
1480 following the instruction bytes */
1481 if (ext->ptr == NULL || ext->ptr >= ext->end) {
1482 if (!buffer_input(ext, sdat->fp)) {
1483 err = sav_error("Unexpected end of file");
1484 }
1485 }
1486 memcpy(tmp++, ext->ptr++, sizeof *tmp);
1487 break;
1488 case 254:
1489 /* Code 254: a string that is all blanks */
1490 memset(tmp++, ' ', sizeof *tmp);
1491 break;
1492 case 255:
1493 /* Code 255: denotes the system-missing value */
1494 *tmp = ext->sysmis;
1495 if (sdat->swapends) {
1496 reverse_double(*tmp);
1497 }
1498 tmp++;
1499 break;
1500 default:
1501 /* Codes 1 through 251 inclusive indicate a value of
1502 (BYTE - BIAS), where BYTE is the byte's value and
1503 BIAS is the compression bias (generally 100.0)
1504 */
1505 *tmp = *p - hdr->bias;
1506 if (sdat->swapends) {
1507 reverse_double(*tmp);
1508 }
1509 tmp++;
1510 break;
1511 }
1512
1513 if (tmp >= tmp_end) {
1514 done = 1;
1515 break;
1516 }
1517 }
1518
1519 if (err || done) {
1520 break;
1521 }
1522
1523 /* Reached the end of this instruction octet: read another */
1524 if (ext->ptr == NULL || ext->ptr >= ext->end) {
1525 if (!buffer_input(ext, sdat->fp)) {
1526 if (tmp_beg != tmp) {
1527 err = sav_error("Unexpected end of file");
1528 }
1529 }
1530 }
1531
1532 memcpy(ext->x, ext->ptr++, sizeof *tmp);
1533 p = ext->x;
1534 }
1535
1536 if (!err) {
1537 /* We filled up an entire record: update state */
1538 ext->y = ++p;
1539 }
1540
1541 return err;
1542 }
1543
value_is_missing(spss_data * sdat,spss_var * v,double x)1544 static int value_is_missing (spss_data *sdat, spss_var *v, double x)
1545 {
1546 if (x == sdat->ext.sysmis) {
1547 return 1;
1548 } else if (spss_user_missing(v, x)) {
1549 /* What should we be doing here? For now, we'll not mark
1550 these values as NA, but neither will we count them as
1551 "OK observations" for accounting purposes. */
1552 return 0;
1553 } else {
1554 v->n_ok_obs += 1;
1555 return 0;
1556 }
1557 }
1558
1559 /* Read values for all variables at observation t */
1560
sav_read_observation(spss_data * sdat,struct sysfile_header * hdr,double * tmp,DATASET * dset,int t)1561 static int sav_read_observation (spss_data *sdat,
1562 struct sysfile_header *hdr,
1563 double *tmp,
1564 DATASET *dset,
1565 int t)
1566 {
1567 spss_var *v;
1568 double xit;
1569 int i, j, err = 0;
1570
1571 if (hdr->compressed) {
1572 err = read_compressed_data(sdat, hdr, tmp);
1573 } else {
1574 size_t amt = fread(tmp, sizeof *tmp, sdat->nvars, sdat->fp);
1575
1576 if (amt != sdat->nvars) {
1577 if (ferror(sdat->fp)) {
1578 err = sav_error("Reading SPSS file: %s", strerror(errno));
1579 } else if (amt != 0) {
1580 err = sav_error("Partial record at end of SPSS file");
1581 }
1582 }
1583 }
1584
1585 j = 1;
1586
1587 for (i=0; i<sdat->nvars && !err; i++) {
1588 v = &sdat->vars[i];
1589
1590 if (v->offset == -1) {
1591 continue;
1592 }
1593
1594 if (v->type == SPSS_NUMERIC) {
1595 xit = tmp[v->offset];
1596 if (sdat->swapends) {
1597 reverse_double(xit);
1598 }
1599 if (value_is_missing(sdat, v, xit)) {
1600 dset->Z[j][t] = NADBL;
1601 } else {
1602 dset->Z[j][t] = xit;
1603 }
1604 } else {
1605 /* variable is of string type */
1606 char raw[256], cval[256];
1607 int ix, len;
1608
1609 len = (v->width < 256)? v->width : 255;
1610 memcpy(raw, &tmp[v->offset], len);
1611 raw[len] = '\0';
1612 tailstrip(raw);
1613 recode_sav_string(cval, raw, sdat->encoding, 255);
1614 #if SPSS_DEBUG
1615 fprintf(stderr, "string val Z[%d][%d] = '%s'\n", j, t, cval);
1616 #endif
1617 ix = gretl_string_table_index(sdat->st, cval, j, 0, NULL);
1618 if (ix > 0) {
1619 dset->Z[j][t] = ix;
1620 v->n_ok_obs += 1;
1621 if (t == 0) {
1622 series_set_discrete(dset, j, 1);
1623 }
1624 } else {
1625 dset->Z[j][t] = NADBL;
1626
1627 }
1628 }
1629 j++;
1630 }
1631
1632 return err;
1633 }
1634
do_drop_empty(spss_data * sdat)1635 static int do_drop_empty (spss_data *sdat)
1636 {
1637 if (sdat->droplist != NULL) {
1638 int mindrop = sdat->droplist[1];
1639 int max_sv = sdat->vars[sdat->max_sv].gretl_index;
1640
1641 if (mindrop > max_sv) {
1642 /* no collision with string table: OK to drop vars */
1643 return 1;
1644 }
1645 }
1646
1647 return 0;
1648 }
1649
read_sav_data(spss_data * sdat,struct sysfile_header * hdr,DATASET * dset,PRN * prn)1650 static int read_sav_data (spss_data *sdat, struct sysfile_header *hdr,
1651 DATASET *dset, PRN *prn)
1652 {
1653 double *tmp = NULL;
1654 char label[MAXLABEL];
1655 int i, j, t, err = 0;
1656
1657 /* temporary storage for one complete observation */
1658 tmp = malloc(sdat->nvars * sizeof *tmp);
1659 if (tmp == NULL) {
1660 err = E_ALLOC;
1661 }
1662
1663 /* transcribe varnames and labels; also mark as discrete any vars
1664 for which we got 'value labels' */
1665 j = 1;
1666 for (i=0; i<sdat->nvars; i++) {
1667 if (!CONTD(sdat, i)) {
1668 recode_sav_string(dset->varname[j], sdat->vars[i].name,
1669 sdat->encoding, VNAMELEN - 1);
1670 recode_sav_string(label, sdat->vars[i].label,
1671 sdat->encoding, MAXLABEL - 1);
1672 series_set_label(dset, j, label);
1673 if (has_value_labels(sdat, i)) {
1674 series_set_discrete(dset, j, 1);
1675 }
1676 j++;
1677 }
1678 }
1679
1680 /* retrieve actual data values */
1681 for (t=0; t<sdat->nobs && !err; t++) {
1682 err = sav_read_observation(sdat, hdr, tmp, dset, t);
1683 if (err) {
1684 fprintf(stderr, "sav_read_case: err = %d at i = %d\n", err, i);
1685 }
1686 }
1687
1688 free(tmp);
1689
1690 /* count valid observations, to determine 'empty' variables */
1691 j = 1;
1692 for (i=0; i<sdat->nvars; i++) {
1693 if (!CONTD(sdat, i)) {
1694 if (sdat->vars[i].n_ok_obs == 0) {
1695 fprintf(stderr, "var %d: no valid observations\n", j);
1696 if (sdat->opt & OPT_D) {
1697 gretl_list_append_term(&sdat->droplist, j);
1698 sdat->vars[i].gretl_index = -1;
1699 }
1700 }
1701 j++;
1702 }
1703 }
1704
1705 /* Delete variables for which we got no observations? This is
1706 governed by OPT_D, and whether or not we have constructed
1707 a 'string table' that would get messed up by dropping
1708 variables.
1709 */
1710 if (do_drop_empty(sdat)) {
1711 err = dataset_drop_listed_variables(sdat->droplist, dset,
1712 NULL, NULL);
1713 }
1714
1715 return err;
1716 }
1717
spss_data_init(spss_data * sdat,FILE * fp)1718 static void spss_data_init (spss_data *sdat, FILE *fp)
1719 {
1720 sdat->fp = fp;
1721 sdat->opt = OPT_NONE;
1722 sdat->nvars = 0;
1723 sdat->nobs = 0;
1724 sdat->swapends = 0;
1725 sdat->encoding = 0;
1726 sdat->max_sv = -1;
1727 sdat->vars = NULL;
1728 sdat->nlabelsets = 0;
1729 sdat->labelsets = NULL;
1730
1731 sdat->ext.buf = sdat->ext.ptr = sdat->ext.end = NULL;
1732 sdat->ext.sysmis = -DBL_MAX;
1733 sdat->ext.highest = DBL_MAX;
1734 sdat->ext.lowest = second_lowest_double_val();
1735
1736 memset(sdat->ext.x, 0, sizeof sdat->ext.x);
1737 sdat->ext.y = sdat->ext.x + sizeof sdat->ext.x;
1738
1739 sdat->st = NULL;
1740 sdat->descrip = NULL;
1741 sdat->droplist = NULL;
1742 }
1743
free_labelset(spss_labelset * lset)1744 static void free_labelset (spss_labelset *lset)
1745 {
1746 if (lset != NULL) {
1747 free(lset->varlist);
1748 free(lset->vals);
1749 strings_array_free(lset->labels, lset->nlabels);
1750 free(lset);
1751 }
1752 }
1753
free_spss_data(spss_data * sdat)1754 static void free_spss_data (spss_data *sdat)
1755 {
1756 int i;
1757
1758 free(sdat->vars);
1759 free(sdat->ext.buf);
1760
1761 if (sdat->labelsets != NULL) {
1762 for (i=0; i<sdat->nlabelsets; i++) {
1763 free_labelset(sdat->labelsets[i]);
1764 }
1765 free(sdat->labelsets);
1766 }
1767
1768 gretl_string_table_destroy(sdat->st);
1769 free(sdat->descrip);
1770 free(sdat->droplist);
1771 }
1772
1773 /* eliminate any elements of @list which represent variables
1774 that have been dropped because they have no valid values
1775 */
1776
prune_labellist(spss_data * sdat,int * list)1777 static void prune_labellist (spss_data *sdat, int *list)
1778 {
1779 int i;
1780
1781 for (i=list[0]; i>0; i--) {
1782 if (in_gretl_list(sdat->droplist, list[i] - 1)) {
1783 gretl_list_delete_at_pos(list, i);
1784 }
1785 }
1786 }
1787
1788 /* Print info on value -> label mappings. Note that the
1789 indices into sdat->vars, stored in the lset->varlist's,
1790 are 1-based.
1791 */
1792
print_labelsets(spss_data * sdat,PRN * prn)1793 static void print_labelsets (spss_data *sdat, PRN *prn)
1794 {
1795 int i, j, lj;
1796
1797 pputs(prn, "\nNote: any values marked with a suffix of '[M]' are annotated\n"
1798 "in the SPSS data file as 'missing'.\n");
1799
1800 for (i=0; i<sdat->nlabelsets; i++) {
1801 spss_labelset *lset = sdat->labelsets[i];
1802 int *vlist = lset->varlist;
1803 spss_var *v;
1804
1805 if (sdat->droplist != NULL) {
1806 prune_labellist(sdat, vlist);
1807 if (vlist[0] == 0) {
1808 continue;
1809 }
1810 }
1811
1812 pputc(prn, '\n');
1813
1814 if (vlist[0] == 1) {
1815 /* mapping applies to just one variable */
1816 lj = vlist[1] - 1;
1817 v = &sdat->vars[lj];
1818 pprintf(prn, "Value -> label mappings for variable %d (%s)\n",
1819 v->gretl_index, v->name);
1820 } else {
1821 /* mapping applies to two or more variables */
1822 pprintf(prn, "Value -> label mappings for the following %d variables\n",
1823 vlist[0]);
1824 for (j=1; j<=vlist[0]; j++) {
1825 lj = vlist[j] - 1;
1826 v = &sdat->vars[lj];
1827 pprintf(prn, " %3d (%s)\n", v->gretl_index, v->name);
1828 }
1829 }
1830
1831 lj = vlist[1] - 1;
1832 v = &sdat->vars[lj];
1833
1834 for (j=0; j<lset->nlabels; j++) {
1835 char label[256];
1836
1837 recode_sav_string(label, lset->labels[j], sdat->encoding, 255);
1838 pprintf(prn, "%10g -> '%s'", lset->vals[j], label);
1839 if (spss_user_missing(v, lset->vals[j])) {
1840 pputs(prn, " [M]\n");
1841 } else {
1842 pputc(prn, '\n');
1843 }
1844 }
1845 }
1846 }
1847
1848 /* print out value -> label mappings (for numeric variables) and
1849 attach the text buffer to the dataset's string table */
1850
add_label_mappings_to_st(spss_data * sdat)1851 static int add_label_mappings_to_st (spss_data *sdat)
1852 {
1853 PRN *prn;
1854 int err = 0;
1855
1856 prn = gretl_print_new(GRETL_PRINT_BUFFER, &err);
1857
1858 if (prn != NULL) {
1859 print_labelsets(sdat, prn);
1860 gretl_string_table_add_extra(sdat->st, prn);
1861 gretl_print_destroy(prn);
1862 }
1863
1864 return err;
1865 }
1866
1867 /* We'll add a 'string table' if (a) the dataset contains one or more
1868 string-valued variables or (b) it contains one or more numerical
1869 variables that have associated value labels. The latter situation
1870 is registered in a non-zero value for sdat->nlabelsets.
1871 */
1872
maybe_add_string_table(spss_data * sdat)1873 static int maybe_add_string_table (spss_data *sdat)
1874 {
1875 int i, nsv = 0;
1876 int err = 0;
1877
1878 for (i=0; i<sdat->nvars; i++) {
1879 if (sdat->vars[i].type > 0) {
1880 sdat->max_sv = i;
1881 nsv++;
1882 }
1883 }
1884
1885 fprintf(stderr, "Found %d string variables\n", nsv);
1886
1887 if (nsv > 0) {
1888 int *list = gretl_list_new(nsv);
1889 int j = 1;
1890
1891 if (list == NULL) {
1892 err = E_ALLOC;
1893 } else {
1894 for (i=0; i<sdat->nvars; i++) {
1895 if (sdat->vars[i].type > 0) {
1896 list[j++] = sdat->vars[i].gretl_index;
1897 }
1898 }
1899 sdat->st = gretl_string_table_new(list);
1900 free(list);
1901 }
1902 } else if (sdat->nlabelsets > 0) {
1903 sdat->st = gretl_string_table_new(NULL);
1904 }
1905
1906 return err;
1907 }
1908
prepare_gretl_dataset(spss_data * sdat,DATASET ** pdset,PRN * prn)1909 static int prepare_gretl_dataset (spss_data *sdat,
1910 DATASET **pdset,
1911 PRN *prn)
1912 {
1913 DATASET *newset = datainfo_new();
1914 int nvars = sdat->nvars + 1;
1915 int i, err = 0;
1916
1917 if (newset == NULL) {
1918 pputs(prn, _("Out of memory\n"));
1919 err = E_ALLOC;
1920 }
1921
1922 for (i=0; i<sdat->nvars && !err; i++) {
1923 if (CONTD(sdat, i)) {
1924 /* not a real variable (string continuation) */
1925 nvars--;
1926 }
1927 }
1928
1929 if (!err) {
1930 maybe_add_string_table(sdat);
1931 }
1932
1933 if (!err) {
1934 newset->v = nvars;
1935 newset->n = sdat->nobs;
1936 /* time-series info? */
1937 err = start_new_Z(newset, 0);
1938 if (err) {
1939 pputs(prn, _("Out of memory\n"));
1940 free_datainfo(newset);
1941 err = E_ALLOC;
1942 }
1943 }
1944
1945 *pdset = newset;
1946
1947 return err;
1948 }
1949
sav_get_data(const char * fname,DATASET * dset,gretlopt opt,PRN * prn)1950 int sav_get_data (const char *fname, DATASET *dset,
1951 gretlopt opt, PRN *prn)
1952 {
1953 spss_data sdat;
1954 struct sysfile_header hdr;
1955 char buf[5] = {0};
1956 FILE *fp;
1957 DATASET *newset = NULL;
1958 int err = 0;
1959
1960 if (sizeof(double) != 8 || sizeof(int) != 4) {
1961 /* be on the safe side */
1962 pputs(prn, _("cannot read SPSS .sav on this platform"));
1963 return E_DATA;
1964 }
1965
1966 fp = gretl_fopen(fname, "rb");
1967 if (fp == NULL) {
1968 return E_FOPEN;
1969 }
1970
1971 if (fread(buf, 4, 1, fp) != 1) {
1972 fclose(fp);
1973 return E_DATA;
1974 }
1975
1976 if (!strncmp("$FL2", buf, 4)) {
1977 /* should be OK */
1978 rewind(fp);
1979 } else {
1980 fprintf(stderr, "file '%s' is not in SPSS sav format", fname);
1981 fclose(fp);
1982 return E_DATA;
1983 }
1984
1985 spss_data_init(&sdat, fp);
1986 err = read_sav_header(&sdat, &hdr);
1987
1988 if (!err) {
1989 if (opt & OPT_D) {
1990 sdat.opt |= OPT_D;
1991 }
1992 read_sav_variables(&sdat, &hdr);
1993 }
1994
1995 if (!err) {
1996 err = read_sav_other_records(&sdat);
1997 }
1998
1999 if (!err) {
2000 err = prepare_gretl_dataset(&sdat, &newset, prn);
2001 }
2002
2003 if (!err) {
2004 err = read_sav_data(&sdat, &hdr, newset, prn);
2005 }
2006
2007 if (err) {
2008 destroy_dataset(newset);
2009 } else {
2010 int merge = (dset->Z != NULL);
2011
2012 if (fix_varname_duplicates(newset)) {
2013 pputs(prn, _("warning: some variable names were duplicated\n"));
2014 }
2015
2016 if (sdat.st != NULL) {
2017 if (sdat.nlabelsets > 0) {
2018 add_label_mappings_to_st(&sdat);
2019 }
2020 gretl_string_table_print(sdat.st, newset, fname, prn);
2021 }
2022
2023 if (sdat.descrip != NULL) {
2024 newset->descrip = sdat.descrip;
2025 sdat.descrip = NULL;
2026 }
2027
2028 err = merge_or_replace_data(dset, &newset, get_merge_opts(opt), prn);
2029
2030 if (!err && !merge) {
2031 dataset_add_import_info(dset, fname, GRETL_SAV);
2032 }
2033 }
2034
2035 fclose(fp);
2036 free_spss_data(&sdat);
2037
2038 return err;
2039 }
2040