1 /*
2  *  Read SPSS .sav files : based on spss.c in the GNU R 'foreign' package.
3  *  Re-written for use with gretl by Allin Cottrell, November 2008.
4  *
5  * Original notice from spss.c:
6  *
7  *  Copyright 2000-2000 Saikat DebRoy <saikat@stat.wisc.edu>
8  *                      Thomas Lumley <tlumley@u.washington.edu>
9  *  Copyright 2005-8 R Core Development Team
10  *  This program is free software; you can redistribute it and/or modify
11  *  it under the terms of the GNU General Public License as published by
12  *  the Free Software Foundation; either version 2 of the License, or
13  *  (at your option) any later version.
14  *
15  *  This program is distributed in the hope that it will be
16  *  useful, but WITHOUT ANY WARRANTY; without even the implied
17  *  warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
18  *  PURPOSE.  See the GNU General Public License for more
19  *  details.
20  *
21  *  You should have received a copy of the GNU General Public License
22  *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
23  */
24 
25 #include "libgretl.h"
26 #include "version.h"
27 #include "gretl_string_table.h"
28 #include "swap_bytes.h"
29 
30 #include <string.h>
31 #include <errno.h>
32 #include <stdarg.h>
33 #include <stdint.h>
34 
35 #define SPSS_DEBUG 0
36 
37 enum {
38     SPSS_NUMERIC,
39     SPSS_STRING,
40     SPSS_UNDEF
41 };
42 
43 enum {
44     MISSING_NONE,      /* no user-missing values */
45     MISSING_1,         /* one user-missing value */
46     MISSING_2,         /* two user-missing values */
47     MISSING_3,         /* three user-missing values */
48     MISSING_RANGE,     /* [a,b] */
49     MISSING_LOW,       /* (-inf,a] */
50     MISSING_HIGH,      /* (a,+inf] */
51     MISSING_RANGE_1,   /* [a,b], c */
52     MISSING_LOW_1,     /* (-inf,a], b */
53     MISSING_HIGH_1,    /* (a,+inf), b */
54     MISSING_MAX        /* sentinel */
55 };
56 
57 #define MAX_SHORT_STRING 8
58 #define SYSMIS (-DBL_MAX)
59 #define MAX_CASESIZE (INT_MAX / sizeof(double) / 2)
60 #define MAX_CASES (INT_MAX / 2)
61 
62 /* Divides nonnegative x by positive y, rounding up */
63 #define DIV_RND_UP(x,y) (((x) + ((y) - 1)) / (y))
64 
65 /* Rounds x up to the next multiple of y */
66 #define ROUND_UP(x,y) (((x) + ((y) - 1)) / (y) * (y))
67 
68 /* Gives the padding needed to round x to next multiple of y */
69 #define REM_RND_UP(x,y) ((x) % (y) ? (y) - (x) % (y) : 0)
70 
71 typedef struct spss_var_ spss_var;
72 typedef struct spss_data_ spss_data;
73 typedef struct spss_labelset_ spss_labelset;
74 
75 struct spss_var_ {
76     int type;
77     int gretl_index;            /* position in gretl dataset */
78     int width;                  /* Size of string variables in chars */
79     int n_ok_obs;               /* number of non-missing values */
80     int fv, nv;                 /* Index into values, number of values */
81     int offset;                 /* Index for retrieving actual values */
82     int miss_type;		/* One of the MISSING_* constants */
83     double missing[3];	        /* User-missing value */
84     char name[VNAMELEN];
85     char label[MAXLABEL];
86 };
87 
88 struct spss_labelset_ {
89     int nlabels;
90     int vtype;
91     int *varlist;
92     double *vals;
93     char **labels;
94 };
95 
96 /* SPSS native record: general dataset information */
97 struct sysfile_header {
98     char rec_type[4];           /* Record-type code, "$FL2" */
99     char prod_name[60];         /* Product identification */
100     int32_t layout_code;        /* 2 */
101     int32_t case_size;          /* Number of 'values per case' (i.e. variables) */
102     int32_t compressed;         /* 1 = yes, 0 = no */
103     int32_t weight_index;       /* 1-based index of weighting var, or zero */
104     int32_t ncases;             /* Number of cases (observations), -1 if unknown */
105     double bias;                /* Compression bias (100.0) */
106     char creation_date[9];      /* 'dd mmm yy' creation date of file */
107     char creation_time[8];      /* 'hh:mm:ss' 24-hour creation time */
108     char file_label[64];        /* File label */
109     char padding[3];
110 };
111 
112 /* SPSS native record: info on variable */
113 struct sysfile_variable {
114     int32_t rec_type;		/* must be 2 */
115     int32_t type;		/* 0 = numeric, 1-255 = string width,
116 				   (allowed to be up to 65535 in 0.8-24)
117 				   -1 = continued string */
118     int32_t has_var_label;	/* 1 = yes, 0 = no */
119     int32_t n_missing_values;	/* Missing value code: -3, -2, 0, 1, 2, or 3 */
120     int32_t print;	        /* Print format */
121     int32_t write;	        /* Write format */
122     char name[8];		/* Variable name */
123     /* The rest of the structure varies */
124 };
125 
126 /* extended info regarding SPSS sav file */
127 struct sav_extension {
128     /* special constants */
129     double sysmis;
130     double highest;
131     double lowest;
132 
133     double *buf; /* decompression buffer */
134     double *ptr; /* current location in buffer */
135     double *end; /* end of buffer marker */
136 
137     /* compression instruction octet and pointer */
138     unsigned char x[sizeof(double)];
139     unsigned char *y;
140 };
141 
142 /* working information on dataset */
143 struct spss_data_ {
144     FILE *fp;                  /* stream for reading */
145     gretlopt opt;              /* option flags */
146     int nvars;                 /* number of variables (really, 'elements') */
147     int nobs;                  /* number of observations ('cases') */
148     int swapends;              /* reversing endianness? (0/1) */
149     int encoding;              /* encoding for strings */
150     int max_sv;                /* index of highest-numbered string variable */
151     spss_var *vars;            /* info on individual variables */
152     int nlabelsets;            /* number of sets of value -> label mappings */
153     spss_labelset **labelsets; /* sets of value -> label mappings */
154     struct sav_extension ext;  /* extra info */
155     gretl_string_table *st;    /* string table for string-valued variables */
156     char *descrip;             /* dataset description */
157     int *droplist;             /* list of 'empty' variables to drop */
158 };
159 
160 #define CONTD(d,i) (d->vars[i].type == -1)
161 
162 static void free_labelset (spss_labelset *lset);
163 
164 #if SPSS_DEBUG
mt_string(int mt)165 static const char *mt_string (int mt)
166 {
167     switch(mt) {
168     case MISSING_1: return "MISSING_1";
169     case MISSING_2: return "MISSING_2";
170     case MISSING_3: return "MISSING_3";
171     case MISSING_RANGE: return "MISSING_RANGE";
172     case MISSING_LOW:   return "MISSING_LOW";
173     case MISSING_HIGH:  return "MISSING_HIGH";
174     case MISSING_RANGE_1: return "MISSING_RANGE_1";
175     case MISSING_LOW_1:   return "MISSING_LOW_1";
176     case MISSING_HIGH_1:  return "MISSING_HIGH_1";
177     default: return "??";
178     }
179 
180     return "??";
181 }
182 #endif
183 
second_lowest_double_val(void)184 static double second_lowest_double_val (void)
185 {
186 #ifdef WORDS_BIGENDIAN
187     union {
188 	unsigned char c[8];
189 	double d;
190     } second_lowest = {{0xff, 0xef, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfe}};
191 #else
192     union {
193 	unsigned char c[8];
194 	double d;
195     } second_lowest = {{0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xef, 0xff}};
196 #endif
197 
198     return second_lowest.d;
199 }
200 
sav_error(const char * fmt,...)201 static int sav_error (const char *fmt, ...)
202 {
203     char msg[512];
204     va_list args;
205 
206     if (fmt == NULL) {
207 	return 1;
208     }
209 
210     va_start(args, fmt);
211     vsnprintf(msg, sizeof msg, fmt, args);
212     va_end(args);
213 
214     gretl_errmsg_set(msg);
215 
216 #if SPSS_DEBUG
217     fprintf(stderr, "spss_error: %s\n", msg);
218 #endif
219 
220     return 1;
221 }
222 
sav_read_int32(spss_data * sdat,int * err)223 static int sav_read_int32 (spss_data *sdat, int *err)
224 {
225     int32_t ret = 0;
226 
227     *err = fread(&ret, sizeof ret, 1, sdat->fp) != 1;
228 
229     if (!*err && sdat->swapends) {
230 	reverse_int(ret);
231     }
232 
233     return ret;
234 }
235 
236 /* Read record type 7, subtype 3 */
237 
read_rec_7_3(spss_data * sdat,int size,int count)238 static int read_rec_7_3 (spss_data *sdat, int size, int count)
239 {
240     int file_big_endian;
241     int32_t data[8];
242     int i, enc;
243     int err = 0;
244 
245     if (size != sizeof(int32_t) || count != 8) {
246 	return sav_error("Bad size (%d) or count (%d) field on record type 7, "
247 			 "subtype 3.  Expected size %d, count 8.",
248 			 size, count, sizeof(int32_t));
249     }
250 
251     if (fread(data, sizeof data, 1, sdat->fp) != 1) {
252 	return E_DATA;
253     }
254 
255     if (sdat->swapends) {
256 	for (i=0; i<8; i++) {
257 	    reverse_int(data[i]);
258 	}
259     }
260 
261     if (data[4] != 1) {
262 	return sav_error("Floating-point representation in SPSS file is not IEEE-754.");
263     }
264 
265 #ifdef WORDS_BIGENDIAN
266     file_big_endian = !sdat->swapends;
267 #else
268     file_big_endian = sdat->swapends;
269 #endif
270 
271     if ((file_big_endian && data[6] != 1) || (!file_big_endian && data[6] != 2)) {
272 	return sav_error("I'm confused over the endianness of this .sav file");
273     }
274 
275     enc = sdat->encoding = data[7];
276 
277 #if SPSS_DEBUG
278     fprintf(stderr, "rec_7_3 encoding: got %d\n", enc);
279 #endif
280 
281     /* See
282        http://www.nabble.com/problem-loading-SPSS-15.0-save-files-t2726500.html
283     */
284 
285     if (enc == 1 || enc == 4) {
286 	return sav_error("Indicated character encoding (%s) is not ASCII",
287 			 (enc == 1)? "EBCDIC" : "DEC Kanji");
288     }
289 
290     if (enc >= 500) {
291 	fprintf(stderr, "Indicated character encoding (%d) may be a Windows codepage\n", enc);
292     } else if (enc > 4) {
293 	fprintf(stderr, "Indicated character encoding (%d) is unknown", enc);
294     }
295 
296     return err;
297 }
298 
299 /* Read record type 7, subtype 4 */
300 
read_rec_7_4(spss_data * sdat,int size,int count)301 static int read_rec_7_4 (spss_data *sdat, int size, int count)
302 {
303     double data[3];
304     int i;
305 
306     if (size != sizeof(double) || count != 3) {
307 	return sav_error("Bad size (%d) or count (%d) field on record type 7, "
308 			 "subtype 4. Expected size %d, count 8",
309 			 size, count, sizeof(double));
310     }
311 
312     if (fread(data, sizeof data, 1, sdat->fp) != 1) {
313 	return E_DATA;
314     }
315 
316     if (sdat->swapends) {
317 	for (i=0; i<3; i++) {
318 	    reverse_double(data[i]);
319 	}
320     }
321 
322     if (data[0] != SYSMIS || data[1] != DBL_MAX || data[2] != second_lowest_double_val()) {
323 	sdat->ext.sysmis = data[0];
324 	sdat->ext.highest = data[1];
325 	sdat->ext.lowest = data[2];
326 	fprintf(stderr, "File-indicated value is different from internal value for at "
327 		"least one of the three system values.  SYSMIS: indicated %g, "
328 		"expected %g; HIGHEST: %g, %g; LOWEST: %g, %g\n",
329 		data[0], SYSMIS,
330 		data[1], DBL_MAX,
331 		data[2], second_lowest_double_val());
332     }
333 
334     return 0;
335 }
336 
get_spss_var_by_name(spss_data * sdat,const char * s)337 static spss_var *get_spss_var_by_name (spss_data *sdat, const char *s)
338 {
339     int i;
340 
341     for (i=0; i<sdat->nvars; i++) {
342 	if (!strcmp(sdat->vars[i].name, s)) {
343 	    return &sdat->vars[i];
344 	}
345     }
346 
347     return NULL;
348 }
349 
recode_sav_string(char * targ,const char * src,int encoding,int maxlen)350 static int recode_sav_string (char *targ, const char *src,
351 			      int encoding, int maxlen)
352 {
353     int err = 0;
354 
355 #if SPSS_DEBUG > 1
356     fprintf(stderr, "recode_sav_string: '%s', enc=%d\n", src, encoding);
357 #endif
358 
359     *targ = '\0';
360 
361     if (g_utf8_validate(src, -1, NULL)) {
362 	gretl_utf8_strncat_trim(targ, src, maxlen);
363     } else if (encoding == 28591) {
364 	/* https://www.gnu.org/software/pspp/pspp-dev/html_node/ \
365 	   Machine-Integer-Info-Record.html */
366 	gchar *conv;
367 	gsize wrote = 0;
368 
369 	conv = g_convert(src, -1, "UTF-8", "ISO8859-1",
370 			 NULL, &wrote, NULL);
371 	if (conv != NULL) {
372 	    gretl_utf8_strncat_trim(targ, conv, maxlen);
373 	    g_free(conv);
374 	} else {
375 	    err = E_DATA;
376 	}
377     } else if (encoding >= 500 && encoding <= 9999) {
378 	/* try Windows codepage? */
379 	char cpage[8];
380 	gchar *conv;
381 	gsize wrote = 0;
382 
383 	sprintf(cpage, "CP%d", encoding);
384 	conv = g_convert(src, -1, "UTF-8", cpage,
385 			 NULL, &wrote, NULL);
386 	if (conv != NULL) {
387 	    gretl_utf8_strncat_trim(targ, conv, maxlen);
388 	    g_free(conv);
389 	} else {
390 	    err = E_DATA;
391 	}
392     } else {
393 	err = E_DATA;
394     }
395 
396     if (err) {
397 	strcpy(targ, "unknown");
398     }
399 
400     return err;
401 }
402 
strip_vname_illegals(char * s)403 static void strip_vname_illegals (char *s)
404 {
405     char name[VNAMELEN] = {0};
406     int i, j = 0;
407 
408     for (i=0; s[i] != '\0'; i++) {
409 	if (isalnum(s[i]) || s[i] == '_') {
410 	    name[j++] = s[i];
411 	}
412     }
413 
414     name[j] = '\0';
415     strcpy(s, name);
416 }
417 
check_sav_varname(char * vname,const char * s)418 static int check_sav_varname (char *vname, const char *s)
419 {
420     /* skip any leading garbage */
421     while (*s && !isalpha(*s)) s++;
422 
423     strncat(vname, s, VNAMELEN - 1);
424     iso_to_ascii(vname);
425     strip_vname_illegals(vname);
426     return check_varname(vname);
427 }
428 
429 /* Read record type 7, subtype 13: long variable names */
430 
read_long_varnames(spss_data * sdat,unsigned size,unsigned count)431 static int read_long_varnames (spss_data *sdat, unsigned size,
432 			       unsigned count)
433 {
434     char *buf, *val;
435     char *p, *endp;
436     spss_var *var;
437 
438     if (size != 1 || count == 0) {
439 	fprintf(stderr, "Strange record info: size=%u, count=%u,"
440 		"ignoring long variable names\n", size, count);
441 	return E_DATA;
442     }
443 
444     size *= count;
445 
446 #if SPSS_DEBUG
447     fprintf(stderr, "long_varnames: getting %u bytes\n", size);
448 #endif
449 
450     buf = calloc(size + 1, 1);
451     if (buf == NULL) {
452 	return E_ALLOC;
453     }
454 
455     if (fread(buf, size, 1, sdat->fp) != 1) {
456 	free(buf);
457 	return E_DATA;
458     }
459 
460     p = buf;
461 
462     do {
463 	if ((endp = strchr(p, '\t')) != NULL) {
464 	    *endp = '\0'; /* put null terminator */
465 	}
466 	if ((val = strchr(p, '=')) == NULL) {
467 	    fprintf(stderr, "No long name for variable '%s'\n", p);
468 	} else {
469 	    *val = '\0';
470 	    ++val;
471 	    /* at this point p is key, val is long name */
472 	    var = get_spss_var_by_name(sdat, p);
473 	    if (var != NULL) {
474 		/* replace original name with "long name" */
475 #if SPSS_DEBUG > 1
476 		fprintf(stderr, "'%s' -> '%s'\n", p, val);
477 #endif
478 		*var->name = '\0';
479 		check_sav_varname(var->name, val);
480 	    }
481 	}
482 	if (endp != NULL) {
483 	    p = endp + 1;
484 	}
485     } while (endp != NULL);
486 
487     free(buf);
488 
489     return 0;
490 }
491 
492 /* read a subrecord of SPSS record type 7 */
493 
read_subrecord(spss_data * sdat)494 static int read_subrecord (spss_data *sdat)
495 {
496     struct {
497 	int32_t subtype;
498 	int32_t size;
499 	int32_t count;
500     } data;
501     int skip = 0;
502     int err = 0;
503 
504     if (fread(&data, sizeof data, 1, sdat->fp) != 1) {
505 	return E_DATA;
506     }
507 
508 #if SPSS_DEBUG
509     fprintf(stderr, "subtype = %d, size = %d, count = %d\n",
510 	    data.subtype, data.size, data.count);
511 #endif
512 
513     if (sdat->swapends) {
514 	reverse_int(data.subtype);
515 	reverse_int(data.size);
516 	reverse_int(data.count);
517     }
518 
519     switch (data.subtype) {
520     case 3:
521 	err = read_rec_7_3(sdat, data.size, data.count);
522 	break;
523     case 4:
524 	err = read_rec_7_4(sdat, data.size, data.count);
525 	break;
526     case 5:
527     case 6:
528     case 11: /* ? Used by SPSS 8.0 */
529 	skip = 1;
530 	break;
531     case 7: /* Multiple-response sets (later versions of SPSS) */
532 	skip = 1;
533 	break;
534     case 13:
535 	err = read_long_varnames(sdat, data.size, data.count);
536 	break;
537     case 16:
538 	/* http://www.nabble.com/problem-loading-SPSS-15.0-save-files-t2726500.html */
539 	skip = 1;
540 	break;
541     case 20:
542 	fprintf(stderr, "Found subtype 20: Character Encoding Record (FIXME)\n");
543 	skip = 1;
544 	break;
545     default:
546 	fprintf(stderr, "Unrecognized record type 7, subtype %d encountered "
547 		"in sav file\n", data.subtype);
548 	skip = 1;
549     }
550 
551     if (skip) {
552 	int n = data.size * data.count;
553 
554 	fprintf(stderr, "record type 7, subtype %d: skipping %d bytes\n",
555 		data.subtype, n);
556 	fseek(sdat->fp, n, SEEK_CUR);
557     }
558 
559     return err;
560 }
561 
read_type_4(spss_data * sdat,int * err)562 static int read_type_4 (spss_data *sdat, int *err)
563 {
564     int32_t rec_type, n_vars = 0;
565 
566     rec_type = sav_read_int32(sdat, err);
567     if (*err) {
568 	return 0;
569     }
570 
571     if (rec_type != 4) {
572 	fprintf(stderr, "Variable index record (type 4) does not immediately\n"
573 		"follow value label record (type 3) as it should\n");
574 	*err = E_DATA;
575 	return 0;
576     }
577 
578     n_vars = sav_read_int32(sdat, err);
579     if (*err) {
580 	return 0;
581     }
582 
583     if (n_vars < 1 || n_vars > sdat->nvars) {
584 	*err = sav_error("Number of variables associated with a value label "
585 			  "(%d) is not between 1 and the number of variables (%d)",
586 			  n_vars, sdat->nvars);
587     }
588 
589     return n_vars;
590 }
591 
sdat_remove_labelset(spss_data * sdat)592 static void sdat_remove_labelset (spss_data *sdat)
593 {
594     int n = sdat->nlabelsets - 1;
595 
596     if (n < 0) {
597 	return;
598     }
599 
600     free_labelset(sdat->labelsets[n]);
601     sdat->labelsets[n] = NULL;
602     sdat->nlabelsets = n;
603 }
604 
sdat_add_labelset(spss_data * sdat,int n_labels)605 static int sdat_add_labelset (spss_data *sdat, int n_labels)
606 {
607     spss_labelset *lset, **lsets;
608     int n = sdat->nlabelsets + 1;
609     int err = 0;
610 
611     if (n_labels <= 0) {
612 	return E_DATA;
613     }
614 
615     lsets = realloc(sdat->labelsets, n * sizeof *lsets);
616     if (lsets == NULL) {
617 	return E_ALLOC;
618     }
619 
620     lsets[n-1] = NULL;
621     sdat->labelsets = lsets;
622     sdat->nlabelsets = n;
623 
624     lset = malloc(sizeof *lset);
625     if (lset == NULL) {
626 	return E_ALLOC;
627     }
628 
629     lsets[n-1] = lset;
630     lset->nlabels = n_labels;
631     lset->vtype = SPSS_UNDEF;
632 
633     lset->varlist = NULL;
634     lset->vals = NULL;
635     lset->labels = NULL;
636 
637     lset->vals = malloc(n_labels * sizeof *lset->vals);
638     if (lset->vals == NULL) {
639 	err = E_ALLOC;
640     } else {
641 	lset->labels = strings_array_new(n_labels);
642 	if (lset->labels == NULL) {
643 	    err = E_ALLOC;
644 	}
645     }
646 
647     return err;
648 }
649 
check_label_varindex(spss_data * sdat,spss_labelset * lset,int idx)650 static int check_label_varindex (spss_data *sdat, spss_labelset *lset, int idx)
651 {
652     int err = 0;
653 
654     if (idx < 1 || idx > sdat->nvars) {
655 	err = sav_error("Variable index associated with value label (%d) is "
656 			"not between 1 and the number of values (%d)",
657 			idx, sdat->nvars);
658     } else {
659 	spss_var *v = &sdat->vars[idx-1];
660 
661 	if (v->type == -1) {
662 	    err = sav_error("Variable index associated with value label (%d) refers "
663 			    "to a continuation of a string variable, not to an actual "
664 			    "variable", idx);
665 	} else if (v->type == SPSS_STRING && v->width > MAX_SHORT_STRING) {
666 	    err = sav_error("Value labels are not allowed on long string variables (%s)",
667 			    v->name);
668 	} else if (lset->vtype == SPSS_UNDEF) {
669 	    /* record type of first variable */
670 	    lset->vtype = v->type;
671 	} else if (v->type != lset->vtype) {
672 	    err = sav_error("Variables associated with value label are not all "
673 			    "of the same type.");
674 	}
675     }
676 
677     return err;
678 }
679 
read_value_labels(spss_data * sdat)680 static int read_value_labels (spss_data *sdat)
681 {
682     FILE *fp = sdat->fp;
683     spss_labelset *lset;
684     int32_t n_labels = 0;  /* Number of labels */
685     int32_t n_vars = 0;    /* Number of associated variables */
686     int empty_label = 0;
687     int i, err = 0;
688 
689     n_labels = sav_read_int32(sdat, &err);
690     if (err) {
691 	return err;
692     }
693 
694 #if SPSS_DEBUG
695     fprintf(stderr, "\n*** label set: %d labels\n", n_labels);
696 #endif
697 
698     err = sdat_add_labelset(sdat, n_labels);
699     if (err) {
700 	return err;
701     }
702 
703     lset = sdat->labelsets[sdat->nlabelsets - 1];
704 
705     /* first step: read the value/label pairs */
706 
707     for (i=0; i<n_labels && !err; i++) {
708 	char label[256] = {0};
709 	double value = 0;
710 	unsigned char label_len = 0;
711 	int rem, fgot = 0;
712 
713 	/* read value, label length, label */
714 	fgot += fread(&value, sizeof value, 1, fp);
715 	fgot += fread(&label_len, 1, 1, fp);
716 	fgot += fread(label, label_len, 1, fp);
717 
718 #if SPSS_DEBUG
719 	fprintf(stderr, "i=%d: fgot=%d, value=%g, len=%d, '%s'\n",
720 		i, fgot, value, label_len, label);
721 #endif
722 	if (fgot < 3) {
723 	    if (n_labels == 1 && label_len == 0) {
724 		/* tolerate apparent breakage here? */
725 		empty_label = 1;
726 	    } else {
727 		err = E_DATA;
728 		break;
729 	    }
730 	}
731 
732 	lset->vals[i] = value;
733 	lset->labels[i] = gretl_strdup(label);
734 	if (lset->labels[i] == NULL) {
735 	    err = E_ALLOC;
736 	    break;
737 	}
738 
739 	/* skip padding */
740 	rem = REM_RND_UP(label_len + 1, sizeof(double));
741 	if (rem) {
742 	    fseek(fp, rem, SEEK_CUR);
743 	}
744     }
745 
746     /* second step: read the type 4 record holding the list of
747        variables to which the value labels are to be applied
748     */
749 
750     if (!err) {
751 	n_vars = read_type_4(sdat, &err);
752 #if SPSS_DEBUG
753 	fprintf(stderr, "Got %d associated variables\n", n_vars);
754 #endif
755     }
756 
757     if (!err && n_vars > 0) {
758 	lset->varlist = gretl_list_new(n_vars);
759 	if (lset->varlist == NULL) {
760 	    fprintf(stderr, "lset->varlist: failed, n_vars = %d\n", n_vars);
761 	    err = E_ALLOC;
762 	}
763     }
764 
765     for (i=0; i<n_vars && !err; i++) {
766 	int32_t idx;
767 
768 	idx = sav_read_int32(sdat, &err);
769 	if (err) {
770 	    break;
771 	}
772 
773 	err = check_label_varindex(sdat, lset, idx);
774 	if (!err) {
775 	    lset->varlist[i+1] = idx;
776 #if SPSS_DEBUG
777 	    fprintf(stderr, " %3d: idx = %d (%s)\n", i, idx,
778 		    sdat->vars[idx-1].name);
779 #endif
780 	}
781     }
782 
783     /* we'll preserve value -> label mappings only for numeric
784        variables */
785     if (!err && (empty_label || lset->vtype != SPSS_NUMERIC)) {
786 	sdat_remove_labelset(sdat);
787     }
788 
789     return err;
790 }
791 
792 /* Reads an SPSS "document" record, type 6, and discards it: for now,
793    this is just to keep our place in the stream */
794 
read_documents(spss_data * sdat)795 static int read_documents (spss_data *sdat)
796 {
797     int32_t n_lines;
798     int err = 0;
799 
800     n_lines = sav_read_int32(sdat, &err);
801 
802     if (n_lines <= 0) {
803 	fprintf(stderr, "Number of document lines (%d) must be greater than 0\n",
804 		n_lines);
805 	err = E_DATA;
806     } else {
807 	int i, n = 80 * n_lines;
808 	unsigned char c;
809 	size_t b = 0;
810 
811 	for (i=0; i<n; i++) {
812 	    b = fread(&c, 1, 1, sdat->fp);
813 	}
814 
815 	fprintf(stderr, "read_documents: got %d lines (%d bytes)\n",
816 		n_lines, (int) b);
817     }
818 
819     return err;
820 }
821 
822 /* Read records of types 3, 4, 6, and 7 */
823 
read_sav_other_records(spss_data * sdat)824 static int read_sav_other_records (spss_data *sdat)
825 {
826     FILE *fp = sdat->fp;
827     int32_t pad, rec_type = 0;
828     size_t n;
829     int err = 0;
830 
831     while (rec_type >= 0 && !err) {
832 	rec_type = sav_read_int32(sdat, &err);
833 	if (err) {
834 	    break;
835 	}
836 
837 #if SPSS_DEBUG > 1
838 	fprintf(stderr, "read_sav_other_records: type = %d\n", rec_type);
839 #endif
840 
841 	switch (rec_type) {
842 	case 3:
843 	    err = read_value_labels(sdat);
844 	    break;
845 	case 4:
846 	    fprintf(stderr, "Orphaned variable index record (type 4).  Type 4\n"
847 		    "records must immediately follow type 3 records\n");
848 	    err = E_DATA;
849 	    break;
850 	case 6:
851 	    err = read_documents(sdat);
852 	    break;
853 	case 7:
854 	    read_subrecord(sdat);
855 	    break;
856 	case 999:
857 	    n = fread(&pad, sizeof pad, 1, fp);
858 	    if (n != 1) {
859 		fprintf(stderr, "couldn't read padding\n");
860 	    }
861 	    rec_type = -1;
862 	    break;
863 	default:
864 	    fprintf(stderr, "Unrecognized record type %d\n", rec_type);
865 	    err = E_DATA;
866 	    break;
867 	}
868     }
869 
870     return err;
871 }
872 
sdat_add_variables(spss_data * sdat)873 static int sdat_add_variables (spss_data *sdat)
874 {
875     int i;
876 
877     sdat->vars = malloc(sdat->nvars * sizeof *sdat->vars);
878     if (sdat->vars == NULL) {
879 	return E_ALLOC;
880     }
881 
882     for (i=0; i<sdat->nvars; i++) {
883 	sdat->vars[i].type = SPSS_NUMERIC;
884 	sdat->vars[i].gretl_index = -1;
885 	sdat->vars[i].n_ok_obs = 0;
886 	sdat->vars[i].miss_type = MISSING_NONE;
887 	sdat->vars[i].name[0] = '\0';
888 	sdat->vars[i].label[0] = '\0';
889 	sdat->vars[i].offset = -1;
890     }
891 
892     return 0;
893 }
894 
validate_var_info(struct sysfile_variable * sv,int i)895 static int validate_var_info (struct sysfile_variable *sv, int i)
896 {
897     int err = 0;
898 
899     if (sv->type < 0 || sv->type > 255) {
900 	err = sav_error("position %d: bad variable type code %d\n", i, sv->type);
901     } else if (sv->has_var_label != 0 && sv->has_var_label != 1) {
902 	err = sav_error("position %d: variable label indicator field is not 0 or 1\n", i);
903     } else if (sv->n_missing_values < -3 || sv->n_missing_values > 3 ||
904 	       sv->n_missing_values == -1) {
905 	err = sav_error("position %d: missing value indicator field is not "
906 			 "-3, -2, 0, 1, 2, or 3\n", i);
907     }
908 
909     return err;
910 }
911 
transcribe_varname(spss_data * sdat,struct sysfile_variable * sv,int i)912 static int transcribe_varname (spss_data *sdat, struct sysfile_variable *sv, int i)
913 {
914     spss_var *v = &sdat->vars[i];
915     unsigned char c = sv->name[0];
916     int j = 0, k = 0;
917 
918     if (!isalpha(c) && c != '@' && c != '#') {
919 	return sav_error("position %d: Variable name begins with invalid character", i);
920     }
921 
922     if (c == '#') {
923 	fprintf(stderr, "position %d: Variable name begins with '#'.\n"
924 		"Scratch variables should not appear in .sav files\n", i);
925 	j++;
926     }
927 
928     v->name[k++] = sv->name[j++];
929 
930     for (; j<8; j++) {
931 	c = sv->name[j];
932 	if (isspace(c)) {
933 	    break;
934 	} else if (c == '.') {
935 	    v->name[k++] = '_';
936 	} else if (isalnum(c) || c == '_') {
937 	    v->name[k++] = c;
938 	} else {
939 	    fprintf(stderr, "position %d: character `\\%03o' (%c) is not valid in a "
940 		    "variable name\n", i, c, c);
941 	}
942     }
943 
944     v->name[k] = 0;
945 
946 #if SPSS_DEBUG
947     fprintf(stderr, " name = '%s'\n", v->name);
948 #endif
949 
950     return 0;
951 }
952 
grab_var_label(spss_data * sdat,spss_var * v)953 static int grab_var_label (spss_data *sdat, spss_var *v)
954 {
955     size_t labread, rem = 0;
956     int32_t len;
957     int err = 0;
958 
959     len = sav_read_int32(sdat, &err);
960     if (err) {
961 	return err;
962     }
963 
964     if (len < 0 || len > 65535) {
965 	return sav_error("Variable %s indicates label of invalid length %d",
966 			  v->name, len);
967     }
968 
969     labread = ROUND_UP(len, sizeof(int32_t));
970 
971     if (labread > MAXLABEL - 1) {
972 	rem = labread - (MAXLABEL - 1);
973 	labread = MAXLABEL - 1;
974 	if (len > MAXLABEL - 1) {
975 	    len = MAXLABEL - 1;
976 	}
977     }
978 
979     if (fread(v->label, labread, 1, sdat->fp) != 1) {
980 	return sav_error("Variable %s: couldn't read label", v->name);
981     }
982 
983     v->label[len] = '\0';
984     tailstrip(v->label);
985 
986 #if SPSS_DEBUG
987     fprintf(stderr, " label '%s'\n", v->label);
988 #endif
989 
990     if (rem > 0) {
991 	/* skip excess label characters */
992 	fseek(sdat->fp, rem, SEEK_CUR);
993     }
994 
995     return 0;
996 }
997 
record_missing_vals_info(spss_data * sdat,spss_var * v,int nmiss)998 static int record_missing_vals_info (spss_data *sdat, spss_var *v,
999 				     int nmiss)
1000 {
1001     int anmiss = abs(nmiss);
1002     double mv[3];
1003     int j;
1004 
1005     if (v->width > MAX_SHORT_STRING) {
1006 	return sav_error("Long string variable %s cannot have missing values",
1007 			  v->name);
1008     }
1009 
1010     if (fread(mv, anmiss * sizeof *mv, 1, sdat->fp) != 1) {
1011 	return sav_error("%s: couldn't mv", v->name);
1012     }
1013 
1014     if (sdat->swapends && v->type == SPSS_NUMERIC) {
1015 	for (j=0; j<anmiss; j++) {
1016 	    reverse_double(mv[j]);
1017 	}
1018     }
1019 
1020     if (nmiss > 0) {
1021 	v->miss_type = nmiss;
1022 	if (v->type == SPSS_NUMERIC) {
1023 	    for (j=0; j<nmiss; j++) {
1024 		v->missing[j] = mv[j];
1025 	    }
1026 	} else {
1027 	    for (j=0; j<nmiss; j++) {
1028 		memcpy(&v->missing[j], &mv[j], v->width);
1029 	    }
1030 	}
1031     } else if (v->type == SPSS_STRING) {
1032 	return sav_error("String variable %s may not have missing values "
1033 			  "specified as a range", v->name);
1034     } else {
1035 	j = 0;
1036 	if (mv[0] == sdat->ext.lowest) {
1037 	    v->miss_type = MISSING_LOW;
1038 	    v->missing[j++] = mv[1];
1039 	} else if (mv[1] == sdat->ext.highest) {
1040 	    v->miss_type = MISSING_HIGH;
1041 	    v->missing[j++] = mv[0];
1042 	} else {
1043 	    v->miss_type = MISSING_RANGE;
1044 	    v->missing[j++] = mv[0];
1045 	    v->missing[j++] = mv[1];
1046 	}
1047 
1048 	if (nmiss == -3) {
1049 	    v->miss_type += 3;
1050 	    v->missing[j++] = mv[2];
1051 	}
1052     }
1053 
1054 #if SPSS_DEBUG
1055     fprintf(stderr, " miss_type = %d (%s)\n", v->miss_type, mt_string(v->miss_type));
1056     int i;
1057     for (i=0; i<j; i++) {
1058 	fprintf(stderr, " missing[%d] = %g\n", i, v->missing[i]);
1059     }
1060 #endif
1061 
1062     return 0;
1063 }
1064 
1065 /* read info on the variables in SPSS .sav file */
1066 
read_sav_variables(spss_data * sdat,struct sysfile_header * hdr)1067 static int read_sav_variables (spss_data *sdat, struct sysfile_header *hdr)
1068 {
1069     struct sysfile_variable sv;
1070     int long_string_count = 0; /* # of long string continuation
1071                                   records still expected */
1072     int next_value = 0;        /* index to next 'value' structure */
1073     int gidx = 0;              /* gretl index number for variable */
1074     int i, err;
1075 
1076     err = sdat_add_variables(sdat);
1077     if (err) {
1078 	return err;
1079     }
1080 
1081     for (i=0; i<sdat->nvars && !err; i++) {
1082 	spss_var *v = &sdat->vars[i];
1083 
1084 	if (fread(&sv, sizeof sv, 1, sdat->fp) != 1) {
1085 	    err = E_DATA;
1086 	    break;
1087 	}
1088 
1089 	if (sdat->swapends) {
1090 	    reverse_int(sv.rec_type);
1091 	    reverse_int(sv.type);
1092 	    reverse_int(sv.has_var_label);
1093 	    reverse_int(sv.n_missing_values);
1094 	    reverse_int(sv.print);
1095 	    reverse_int(sv.write);
1096 	}
1097 
1098 	if (sv.rec_type != 2) {
1099 	    fprintf(stderr, "position %d: Bad record type (%d); the expected "
1100 		    "value was 2\n", i, sv.rec_type);
1101 	    err = E_DATA;
1102 	    break;
1103 	}
1104 
1105 #if SPSS_DEBUG
1106 	fprintf(stderr, "*** position %d\n", i);
1107 	fprintf(stderr, " type = %d\n", sv.type);
1108 	fprintf(stderr, " has_var_label = %d\n", sv.has_var_label);
1109 	fprintf(stderr, " n_missing_values = %d\n", sv.n_missing_values);
1110 #endif
1111 
1112 	/* If there was a long string previously, make sure that the
1113 	   continuations are present; otherwise make sure there aren't
1114 	   any */
1115 	if (long_string_count) {
1116 	    if (sv.type == -1) {
1117 #if SPSS_DEBUG
1118 		fprintf(stderr, " long string continuation\n");
1119 #endif
1120 		v->type = -1;
1121 		long_string_count--;
1122 		/* note: no further processing here */
1123 		continue;
1124 	    } else {
1125 		err = sav_error("position %d: string variable is missing a "
1126 				"continuation record", i);
1127 		break;
1128 	    }
1129 	} else if (sv.type == -1) {
1130 	    fprintf(stderr, "position %d: superfluous string continuation record\n", i);
1131 	}
1132 
1133 	err = validate_var_info(&sv, i);
1134 
1135 	if (!err) {
1136 	    err = transcribe_varname(sdat, &sv, i);
1137 	}
1138 
1139 #if SPSS_DEBUG
1140 	fputs((sv.type == SPSS_NUMERIC)? " NUMERIC\n" : " STRING\n", stderr);
1141 #endif
1142 
1143 	if (!err) {
1144 	    v->gretl_index = ++gidx;
1145 	    if (sv.type == SPSS_NUMERIC) {
1146 		v->width = 0;
1147 		v->offset = next_value++;
1148 		v->nv = 1;
1149 	    } else {
1150 		v->type = SPSS_STRING;
1151 		v->width = sv.type;
1152 		v->nv = DIV_RND_UP(v->width, sizeof(double));
1153 		v->offset = next_value;
1154 		next_value += v->nv;
1155 		long_string_count = v->nv - 1;
1156 	    }
1157 	}
1158 
1159 	/* get the variable label, if any */
1160 	if (!err && sv.has_var_label == 1) {
1161 	    err = grab_var_label(sdat, v);
1162 	}
1163 
1164 	/* set missing values, if applicable */
1165 	if (!err && sv.n_missing_values != 0) {
1166 	    err = record_missing_vals_info(sdat, v, sv.n_missing_values);
1167 	}
1168     }
1169 
1170     /* consistency checks */
1171 
1172     if (long_string_count != 0) {
1173 	fprintf(stderr, "Long string continuation records omitted at end of dictionary\n");
1174     }
1175 
1176     if (next_value != hdr->case_size) {
1177 	fprintf(stderr, "System file header indicates %d variable positions but %d were "
1178 		"read from file\n", hdr->case_size, next_value);
1179     }
1180 
1181     return 0;
1182 }
1183 
print_product_name(struct sysfile_header * hdr)1184 static void print_product_name (struct sysfile_header *hdr)
1185 {
1186     char prod_name[sizeof hdr->prod_name + 1];
1187     int i;
1188 
1189     memcpy(prod_name, hdr->prod_name, sizeof hdr->prod_name);
1190 
1191     for (i=0; i<60; i++) {
1192 	if (!isprint((unsigned char) prod_name[i])) {
1193 	    prod_name[i] = ' ';
1194 	}
1195     }
1196 
1197     for (i=59; i>=0; i--) {
1198 	if (!isgraph((unsigned char) prod_name[i])) {
1199 	    prod_name[i] = '\0';
1200 	    break;
1201 	}
1202     }
1203 
1204     prod_name[60] = '\0';
1205 
1206     fprintf(stderr, "%s\n", prod_name);
1207 }
1208 
sdat_add_descrip(spss_data * sdat,struct sysfile_header * hdr)1209 static void sdat_add_descrip (spss_data *sdat, struct sysfile_header *hdr)
1210 {
1211     char datestr[10];
1212     char *s = hdr->file_label;
1213     char *label = NULL;
1214     int i, n = 11;
1215 
1216     memcpy(datestr, hdr->creation_date, 9);
1217     datestr[9] = '\0';
1218     fprintf(stderr, "Creation date %s\n", datestr);
1219 
1220     /* preserve the header label if it's not empty */
1221 
1222     for (i = sizeof hdr->file_label - 1; i >= 0; i--) {
1223 	if (!isspace((unsigned char) s[i]) && s[i] != '\0') {
1224 	    label = calloc(i + 2, 1);
1225 	    if (label != NULL) {
1226 		memcpy(label, s, i + 1);
1227 		label[i + 1] = '\0';
1228 		fprintf(stderr, "label: '%s'\n", label);
1229 	    }
1230 	    break;
1231 	}
1232     }
1233 
1234     if (label != NULL) {
1235 	n += strlen(label) + 1;
1236     }
1237 
1238     sdat->descrip = malloc(n);
1239 
1240     if (sdat->descrip != NULL) {
1241 	if (label != NULL) {
1242 	    sprintf(sdat->descrip, "%s\n%s\n", label, datestr);
1243 	} else {
1244 	    sprintf(sdat->descrip, "%s\n", datestr);
1245 	}
1246     }
1247 
1248     free(label);
1249 }
1250 
1251 /* read sav header rcord and check for errors */
1252 
read_sav_header(spss_data * sdat,struct sysfile_header * hdr)1253 static int read_sav_header (spss_data *sdat, struct sysfile_header *hdr)
1254 {
1255     int fgot = 0;
1256 
1257     fgot += fread(&hdr->rec_type, 4, 1, sdat->fp);
1258     fgot += fread(&hdr->prod_name, 60, 1, sdat->fp);
1259     fgot += fread(&hdr->layout_code, 4, 1, sdat->fp);
1260     fgot += fread(&hdr->case_size, 4, 1, sdat->fp);
1261     fgot += fread(&hdr->compressed, 4, 1, sdat->fp);
1262     fgot += fread(&hdr->weight_index, 4, 1, sdat->fp);
1263     fgot += fread(&hdr->ncases, 4, 1, sdat->fp);
1264     fgot += fread(&hdr->bias, 8, 1, sdat->fp);
1265     fgot += fread(&hdr->creation_date, 9, 1, sdat->fp);
1266     fgot += fread(&hdr->creation_time, 8, 1, sdat->fp);
1267     fgot += fread(&hdr->file_label, 64, 1, sdat->fp);
1268     fgot += fread(&hdr->padding, 3, 1, sdat->fp);
1269 
1270     if (fgot < 12) {
1271 	fprintf(stderr, "read_sav_header: failed to read full header\n");
1272 	return E_DATA;
1273     }
1274 
1275     print_product_name(hdr);
1276 
1277     /* check endianness */
1278     if (hdr->layout_code == 2 || hdr->layout_code == 3) {
1279 	fprintf(stderr, "layout_code = %d, no reverse endianness\n", hdr->layout_code);
1280     } else {
1281 	fprintf(stderr, "Need to reverse endianness!\n");
1282 	reverse_int(hdr->layout_code);
1283 	if (hdr->layout_code != 2 && hdr->layout_code != 3) {
1284 	    return sav_error("File layout code has unexpected value %d.  Value should be 2 or 3, "
1285 			     "in big-endian or little-endian format", hdr->layout_code);
1286 	}
1287 	sdat->swapends = 1;
1288 	reverse_int(hdr->case_size);
1289 	reverse_int(hdr->compressed);
1290 	reverse_int(hdr->weight_index);
1291 	reverse_int(hdr->ncases);
1292 	reverse_double(hdr->bias);
1293     }
1294 
1295     /* check basic header info */
1296 
1297     if (hdr->case_size <= 0 || hdr->case_size > MAX_CASESIZE) {
1298 	return sav_error("Number of elements per case (%d) is not between 1 and %d",
1299 			 hdr->case_size, MAX_CASESIZE);
1300     }
1301 
1302     if (hdr->weight_index < 0 || hdr->weight_index > hdr->case_size) {
1303 	return sav_error("Index of weighting variable (%d) is not between 0 and number "
1304 			 "of elements per case (%d)", hdr->weight_index, hdr->case_size);
1305     }
1306 
1307     if (hdr->ncases < -1 || hdr->ncases > MAX_CASES) {
1308 	return sav_error("Number of cases in file (%d) is not between -1 and %d",
1309 			 hdr->ncases, INT_MAX / 2);
1310     }
1311 
1312     fprintf(stderr, "case_size (number of variables) = %d\n", hdr->case_size);
1313     fprintf(stderr, "compression = %d\n", hdr->compressed);
1314     fprintf(stderr, "weight index = %d\n", hdr->weight_index);
1315     fprintf(stderr, "ncases = %d\n", hdr->ncases);
1316     fprintf(stderr, "compression bias = %g\n", hdr->bias);
1317 
1318     if (hdr->ncases == -1) {
1319 	/* is there a way to count cases later? */
1320 	return sav_error("Sorry, I don't know what to do with ncases = -1");
1321     }
1322 
1323     if (hdr->bias != 100.0) {
1324 	fprintf(stderr, "Warning: compression bias (%g) is not the usual value of 100\n",
1325 		hdr->bias);
1326     }
1327 
1328     sdat->nvars = hdr->case_size;
1329     sdat->nobs = hdr->ncases;
1330 
1331     sdat_add_descrip(sdat, hdr);
1332 
1333     return 0;
1334 }
1335 
1336 /* For a given value of a given varable, check to see if it falls
1337    under a user-defined "missing" category.  FIXME: do we really want
1338    to set all such values to NA?  This may lose info regarding the
1339    reason why the value is missing.
1340 */
1341 
spss_user_missing(spss_var * v,double x)1342 static int spss_user_missing (spss_var *v, double x)
1343 {
1344     int mt = v->miss_type;
1345     double a, b, c;
1346     int n, j, miss;
1347 
1348     if (mt == MISSING_NONE) {
1349 	return 0;
1350     }
1351 
1352     a = b = c = 0;
1353     n = miss = 0;
1354 
1355     switch (mt) {
1356     case MISSING_1:
1357 	n = 1;
1358 	break;
1359     case MISSING_2:
1360 	n = 2;
1361 	break;
1362     case MISSING_3:
1363 	n = 3;
1364 	break;
1365     case MISSING_LOW:
1366     case MISSING_HIGH:
1367 	a = v->missing[0];
1368 	break;
1369     case MISSING_RANGE:
1370     case MISSING_LOW_1:
1371     case MISSING_HIGH_1:
1372 	a = v->missing[0];
1373 	b = v->missing[1];
1374 	break;
1375     case MISSING_RANGE_1:
1376 	a = v->missing[0];
1377 	b = v->missing[1];
1378 	c = v->missing[2];
1379 	break;
1380     default:
1381 	break;
1382     }
1383 
1384     /* FIXME SPSS_NUMERIC vs SPSS_STRING */
1385 
1386     if (n > 0) {
1387 	for (j=0; j<n; j++) {
1388 	    if (x == v->missing[j]) {
1389 		miss = 1;
1390 		break;
1391 	    }
1392 	}
1393     } else if (mt == MISSING_RANGE) {
1394 	/* [a,b] */
1395 	miss = (x >= a && x <= b);
1396     } else if (mt == MISSING_LOW) {
1397 	/* (-inf, a] */
1398 	miss = (x <= a);
1399     } else if (mt == MISSING_HIGH) {
1400 	/* (a,+inf] */
1401 	miss = (x > a);
1402     } else if (mt == MISSING_RANGE_1) {
1403 	/* [a,b], c */
1404 	miss = (x >= a && x <= b) || x == c;
1405     } else if (mt == MISSING_LOW_1) {
1406 	/* (-inf,a], b */
1407 	miss = (x <= a) || x == b;
1408     } else if (mt == MISSING_HIGH_1) {
1409 	/* (a,+inf), b */
1410 	miss = (x > a) || x == b;
1411     }
1412 
1413     return miss;
1414 }
1415 
has_value_labels(spss_data * sdat,int i)1416 static int has_value_labels (spss_data *sdat, int i)
1417 {
1418     int j;
1419 
1420     for (j=0; j<sdat->nlabelsets; j++) {
1421 	if (in_gretl_list(sdat->labelsets[j]->varlist, i)) {
1422 	    return 1;
1423 	}
1424     }
1425 
1426     return 0;
1427 }
1428 
buffer_input(struct sav_extension * ext,FILE * fp)1429 static int buffer_input (struct sav_extension *ext, FILE *fp)
1430 {
1431     size_t amt;
1432 
1433     amt = fread(ext->buf, sizeof *ext->buf, 128, fp);
1434 
1435     if (ferror(fp)) {
1436 	sav_error("Error reading file: %s", strerror(errno));
1437 	return 0;
1438     }
1439 
1440     ext->ptr = ext->buf;
1441     ext->end = ext->buf + amt;
1442 
1443     return amt;
1444 }
1445 
1446 /* decompression routine for special SPSS-compressed data */
1447 
read_compressed_data(spss_data * sdat,struct sysfile_header * hdr,double * tmp)1448 static int read_compressed_data (spss_data *sdat, struct sysfile_header *hdr,
1449 				 double *tmp)
1450 {
1451     struct sav_extension *ext = &sdat->ext;
1452     const unsigned char *p_end = ext->x + sizeof(double);
1453     unsigned char *p = ext->y;
1454     const double *tmp_beg = tmp;
1455     const double *tmp_end = tmp + sdat->nvars;
1456     int err = 0, done = 0;
1457 
1458     if (ext->buf == NULL) {
1459 	ext->buf = malloc(128 * sizeof *ext->buf);
1460 	if (ext->buf == NULL) {
1461 	    return E_ALLOC;
1462 	}
1463     }
1464 
1465     while (!err) {
1466 	for (; p < p_end && !err; p++) {
1467 	    switch (*p) {
1468 	    case 0:
1469 		/* Code 0 is ignored */
1470 		continue;
1471 	    case 252:
1472 		/* Code 252: end of file */
1473 		if (tmp_beg != tmp) {
1474 		    err = sav_error("Compressed data is corrupted: ends "
1475 				    "partway through a case");
1476 		}
1477 		break;
1478 	    case 253:
1479 		/* Code 253: the value is stored explicitly
1480 		   following the instruction bytes */
1481 		if (ext->ptr == NULL || ext->ptr >= ext->end) {
1482 		    if (!buffer_input(ext, sdat->fp)) {
1483 			err = sav_error("Unexpected end of file");
1484 		    }
1485 		}
1486 		memcpy(tmp++, ext->ptr++, sizeof *tmp);
1487 		break;
1488 	    case 254:
1489 		/* Code 254: a string that is all blanks */
1490 		memset(tmp++, ' ', sizeof *tmp);
1491 		break;
1492 	    case 255:
1493 		/* Code 255: denotes the system-missing value */
1494 		*tmp = ext->sysmis;
1495 		if (sdat->swapends) {
1496 		    reverse_double(*tmp);
1497 		}
1498 		tmp++;
1499 		break;
1500 	    default:
1501 		/* Codes 1 through 251 inclusive indicate a value of
1502 		   (BYTE - BIAS), where BYTE is the byte's value and
1503 		   BIAS is the compression bias (generally 100.0)
1504 		*/
1505 		*tmp = *p - hdr->bias;
1506 		if (sdat->swapends) {
1507 		    reverse_double(*tmp);
1508 		}
1509 		tmp++;
1510 		break;
1511 	    }
1512 
1513 	    if (tmp >= tmp_end) {
1514 		done = 1;
1515 		break;
1516 	    }
1517 	}
1518 
1519 	if (err || done) {
1520 	    break;
1521 	}
1522 
1523 	/* Reached the end of this instruction octet: read another */
1524 	if (ext->ptr == NULL || ext->ptr >= ext->end) {
1525 	    if (!buffer_input(ext, sdat->fp)) {
1526 		if (tmp_beg != tmp) {
1527 		    err = sav_error("Unexpected end of file");
1528 		}
1529 	    }
1530 	}
1531 
1532 	memcpy(ext->x, ext->ptr++, sizeof *tmp);
1533 	p = ext->x;
1534     }
1535 
1536     if (!err) {
1537 	/* We filled up an entire record: update state */
1538 	ext->y = ++p;
1539     }
1540 
1541     return err;
1542 }
1543 
value_is_missing(spss_data * sdat,spss_var * v,double x)1544 static int value_is_missing (spss_data *sdat, spss_var *v, double x)
1545 {
1546     if (x == sdat->ext.sysmis) {
1547 	return 1;
1548     } else if (spss_user_missing(v, x)) {
1549 	/* What should we be doing here?  For now, we'll not mark
1550 	   these values as NA, but neither will we count them as
1551 	   "OK observations" for accounting purposes. */
1552 	return 0;
1553     } else {
1554 	v->n_ok_obs += 1;
1555 	return 0;
1556     }
1557 }
1558 
1559 /* Read values for all variables at observation t */
1560 
sav_read_observation(spss_data * sdat,struct sysfile_header * hdr,double * tmp,DATASET * dset,int t)1561 static int sav_read_observation (spss_data *sdat,
1562 				 struct sysfile_header *hdr,
1563 				 double *tmp,
1564 				 DATASET *dset,
1565 				 int t)
1566 {
1567     spss_var *v;
1568     double xit;
1569     int i, j, err = 0;
1570 
1571     if (hdr->compressed) {
1572 	err = read_compressed_data(sdat, hdr, tmp);
1573     } else {
1574 	size_t amt = fread(tmp, sizeof *tmp, sdat->nvars, sdat->fp);
1575 
1576 	if (amt != sdat->nvars) {
1577 	    if (ferror(sdat->fp)) {
1578 		err = sav_error("Reading SPSS file: %s", strerror(errno));
1579 	    } else if (amt != 0) {
1580 		err = sav_error("Partial record at end of SPSS file");
1581 	    }
1582 	}
1583     }
1584 
1585     j = 1;
1586 
1587     for (i=0; i<sdat->nvars && !err; i++) {
1588 	v = &sdat->vars[i];
1589 
1590 	if (v->offset == -1) {
1591 	    continue;
1592 	}
1593 
1594 	if (v->type == SPSS_NUMERIC) {
1595 	    xit = tmp[v->offset];
1596 	    if (sdat->swapends) {
1597 		reverse_double(xit);
1598 	    }
1599 	    if (value_is_missing(sdat, v, xit)) {
1600 		dset->Z[j][t] = NADBL;
1601 	    } else {
1602 		dset->Z[j][t] = xit;
1603 	    }
1604 	} else {
1605 	    /* variable is of string type */
1606 	    char raw[256], cval[256];
1607 	    int ix, len;
1608 
1609 	    len = (v->width < 256)? v->width : 255;
1610 	    memcpy(raw, &tmp[v->offset], len);
1611 	    raw[len] = '\0';
1612 	    tailstrip(raw);
1613 	    recode_sav_string(cval, raw, sdat->encoding, 255);
1614 #if SPSS_DEBUG
1615 	    fprintf(stderr, "string val Z[%d][%d] = '%s'\n", j, t, cval);
1616 #endif
1617 	    ix = gretl_string_table_index(sdat->st, cval, j, 0, NULL);
1618 	    if (ix > 0) {
1619 		dset->Z[j][t] = ix;
1620 		v->n_ok_obs += 1;
1621 		if (t == 0) {
1622 		    series_set_discrete(dset, j, 1);
1623 		}
1624 	    } else {
1625 		dset->Z[j][t] = NADBL;
1626 
1627 	    }
1628 	}
1629 	j++;
1630     }
1631 
1632     return err;
1633 }
1634 
do_drop_empty(spss_data * sdat)1635 static int do_drop_empty (spss_data *sdat)
1636 {
1637     if (sdat->droplist != NULL) {
1638 	int mindrop = sdat->droplist[1];
1639 	int max_sv = sdat->vars[sdat->max_sv].gretl_index;
1640 
1641 	if (mindrop > max_sv) {
1642 	    /* no collision with string table: OK to drop vars */
1643 	    return 1;
1644 	}
1645     }
1646 
1647     return 0;
1648 }
1649 
read_sav_data(spss_data * sdat,struct sysfile_header * hdr,DATASET * dset,PRN * prn)1650 static int read_sav_data (spss_data *sdat, struct sysfile_header *hdr,
1651 			  DATASET *dset, PRN *prn)
1652 {
1653     double *tmp = NULL;
1654     char label[MAXLABEL];
1655     int i, j, t, err = 0;
1656 
1657     /* temporary storage for one complete observation */
1658     tmp = malloc(sdat->nvars * sizeof *tmp);
1659     if (tmp == NULL) {
1660 	err = E_ALLOC;
1661     }
1662 
1663     /* transcribe varnames and labels; also mark as discrete any vars
1664        for which we got 'value labels' */
1665     j = 1;
1666     for (i=0; i<sdat->nvars; i++) {
1667 	if (!CONTD(sdat, i)) {
1668 	    recode_sav_string(dset->varname[j], sdat->vars[i].name,
1669 			      sdat->encoding, VNAMELEN - 1);
1670 	    recode_sav_string(label, sdat->vars[i].label,
1671 			      sdat->encoding, MAXLABEL - 1);
1672 	    series_set_label(dset, j, label);
1673 	    if (has_value_labels(sdat, i)) {
1674 		series_set_discrete(dset, j, 1);
1675 	    }
1676 	    j++;
1677 	}
1678     }
1679 
1680     /* retrieve actual data values */
1681     for (t=0; t<sdat->nobs && !err; t++) {
1682 	err = sav_read_observation(sdat, hdr, tmp, dset, t);
1683 	if (err) {
1684 	    fprintf(stderr, "sav_read_case: err = %d at i = %d\n", err, i);
1685 	}
1686     }
1687 
1688     free(tmp);
1689 
1690     /* count valid observations, to determine 'empty' variables */
1691     j = 1;
1692     for (i=0; i<sdat->nvars; i++) {
1693 	if (!CONTD(sdat, i)) {
1694 	    if (sdat->vars[i].n_ok_obs == 0) {
1695 		fprintf(stderr, "var %d: no valid observations\n", j);
1696 		if (sdat->opt & OPT_D) {
1697 		    gretl_list_append_term(&sdat->droplist, j);
1698 		    sdat->vars[i].gretl_index = -1;
1699 		}
1700 	    }
1701 	    j++;
1702 	}
1703     }
1704 
1705     /* Delete variables for which we got no observations?  This is
1706        governed by OPT_D, and whether or not we have constructed
1707        a 'string table' that would get messed up by dropping
1708        variables.
1709     */
1710     if (do_drop_empty(sdat)) {
1711 	err = dataset_drop_listed_variables(sdat->droplist, dset,
1712 					    NULL, NULL);
1713     }
1714 
1715     return err;
1716 }
1717 
spss_data_init(spss_data * sdat,FILE * fp)1718 static void spss_data_init (spss_data *sdat, FILE *fp)
1719 {
1720     sdat->fp = fp;
1721     sdat->opt = OPT_NONE;
1722     sdat->nvars = 0;
1723     sdat->nobs = 0;
1724     sdat->swapends = 0;
1725     sdat->encoding = 0;
1726     sdat->max_sv = -1;
1727     sdat->vars = NULL;
1728     sdat->nlabelsets = 0;
1729     sdat->labelsets = NULL;
1730 
1731     sdat->ext.buf = sdat->ext.ptr = sdat->ext.end = NULL;
1732     sdat->ext.sysmis = -DBL_MAX;
1733     sdat->ext.highest = DBL_MAX;
1734     sdat->ext.lowest = second_lowest_double_val();
1735 
1736     memset(sdat->ext.x, 0, sizeof sdat->ext.x);
1737     sdat->ext.y = sdat->ext.x + sizeof sdat->ext.x;
1738 
1739     sdat->st = NULL;
1740     sdat->descrip = NULL;
1741     sdat->droplist = NULL;
1742 }
1743 
free_labelset(spss_labelset * lset)1744 static void free_labelset (spss_labelset *lset)
1745 {
1746     if (lset != NULL) {
1747 	free(lset->varlist);
1748 	free(lset->vals);
1749 	strings_array_free(lset->labels, lset->nlabels);
1750 	free(lset);
1751     }
1752 }
1753 
free_spss_data(spss_data * sdat)1754 static void free_spss_data (spss_data *sdat)
1755 {
1756     int i;
1757 
1758     free(sdat->vars);
1759     free(sdat->ext.buf);
1760 
1761     if (sdat->labelsets != NULL) {
1762 	for (i=0; i<sdat->nlabelsets; i++) {
1763 	    free_labelset(sdat->labelsets[i]);
1764 	}
1765 	free(sdat->labelsets);
1766     }
1767 
1768     gretl_string_table_destroy(sdat->st);
1769     free(sdat->descrip);
1770     free(sdat->droplist);
1771 }
1772 
1773 /* eliminate any elements of @list which represent variables
1774    that have been dropped because they have no valid values
1775 */
1776 
prune_labellist(spss_data * sdat,int * list)1777 static void prune_labellist (spss_data *sdat, int *list)
1778 {
1779     int i;
1780 
1781     for (i=list[0]; i>0; i--) {
1782 	if (in_gretl_list(sdat->droplist, list[i] - 1)) {
1783 	    gretl_list_delete_at_pos(list, i);
1784 	}
1785     }
1786 }
1787 
1788 /* Print info on value -> label mappings.  Note that the
1789    indices into sdat->vars, stored in the lset->varlist's,
1790    are 1-based.
1791 */
1792 
print_labelsets(spss_data * sdat,PRN * prn)1793 static void print_labelsets (spss_data *sdat, PRN *prn)
1794 {
1795     int i, j, lj;
1796 
1797     pputs(prn, "\nNote: any values marked with a suffix of '[M]' are annotated\n"
1798 	  "in the SPSS data file as 'missing'.\n");
1799 
1800     for (i=0; i<sdat->nlabelsets; i++) {
1801 	spss_labelset *lset = sdat->labelsets[i];
1802 	int *vlist = lset->varlist;
1803 	spss_var *v;
1804 
1805 	if (sdat->droplist != NULL) {
1806 	    prune_labellist(sdat, vlist);
1807 	    if (vlist[0] == 0) {
1808 		continue;
1809 	    }
1810 	}
1811 
1812 	pputc(prn, '\n');
1813 
1814 	if (vlist[0] == 1) {
1815 	    /* mapping applies to just one variable */
1816 	    lj = vlist[1] - 1;
1817 	    v = &sdat->vars[lj];
1818 	    pprintf(prn, "Value -> label mappings for variable %d (%s)\n",
1819 		    v->gretl_index, v->name);
1820 	} else {
1821 	    /* mapping applies to two or more variables */
1822 	    pprintf(prn, "Value -> label mappings for the following %d variables\n",
1823 		    vlist[0]);
1824 	    for (j=1; j<=vlist[0]; j++) {
1825 		lj = vlist[j] - 1;
1826 		v = &sdat->vars[lj];
1827 		pprintf(prn, " %3d (%s)\n", v->gretl_index, v->name);
1828 	    }
1829 	}
1830 
1831 	lj = vlist[1] - 1;
1832 	v = &sdat->vars[lj];
1833 
1834 	for (j=0; j<lset->nlabels; j++) {
1835 	    char label[256];
1836 
1837 	    recode_sav_string(label, lset->labels[j], sdat->encoding, 255);
1838 	    pprintf(prn, "%10g -> '%s'", lset->vals[j], label);
1839 	    if (spss_user_missing(v, lset->vals[j])) {
1840 		pputs(prn, " [M]\n");
1841 	    } else {
1842 		pputc(prn, '\n');
1843 	    }
1844 	}
1845     }
1846 }
1847 
1848 /* print out value -> label mappings (for numeric variables) and
1849    attach the text buffer to the dataset's string table */
1850 
add_label_mappings_to_st(spss_data * sdat)1851 static int add_label_mappings_to_st (spss_data *sdat)
1852 {
1853     PRN *prn;
1854     int err = 0;
1855 
1856     prn = gretl_print_new(GRETL_PRINT_BUFFER, &err);
1857 
1858     if (prn != NULL) {
1859 	print_labelsets(sdat, prn);
1860 	gretl_string_table_add_extra(sdat->st, prn);
1861 	gretl_print_destroy(prn);
1862     }
1863 
1864     return err;
1865 }
1866 
1867 /* We'll add a 'string table' if (a) the dataset contains one or more
1868    string-valued variables or (b) it contains one or more numerical
1869    variables that have associated value labels.  The latter situation
1870    is registered in a non-zero value for sdat->nlabelsets.
1871 */
1872 
maybe_add_string_table(spss_data * sdat)1873 static int maybe_add_string_table (spss_data *sdat)
1874 {
1875     int i, nsv = 0;
1876     int err = 0;
1877 
1878     for (i=0; i<sdat->nvars; i++) {
1879 	if (sdat->vars[i].type > 0) {
1880 	    sdat->max_sv = i;
1881 	    nsv++;
1882 	}
1883     }
1884 
1885     fprintf(stderr, "Found %d string variables\n", nsv);
1886 
1887     if (nsv > 0) {
1888 	int *list = gretl_list_new(nsv);
1889 	int j = 1;
1890 
1891 	if (list == NULL) {
1892 	    err = E_ALLOC;
1893 	} else {
1894 	    for (i=0; i<sdat->nvars; i++) {
1895 		if (sdat->vars[i].type > 0) {
1896 		    list[j++] = sdat->vars[i].gretl_index;
1897 		}
1898 	    }
1899 	    sdat->st = gretl_string_table_new(list);
1900 	    free(list);
1901 	}
1902     } else if (sdat->nlabelsets > 0) {
1903 	sdat->st = gretl_string_table_new(NULL);
1904     }
1905 
1906     return err;
1907 }
1908 
prepare_gretl_dataset(spss_data * sdat,DATASET ** pdset,PRN * prn)1909 static int prepare_gretl_dataset (spss_data *sdat,
1910 				  DATASET **pdset,
1911 				  PRN *prn)
1912 {
1913     DATASET *newset = datainfo_new();
1914     int nvars = sdat->nvars + 1;
1915     int i, err = 0;
1916 
1917     if (newset == NULL) {
1918 	pputs(prn, _("Out of memory\n"));
1919 	err = E_ALLOC;
1920     }
1921 
1922     for (i=0; i<sdat->nvars && !err; i++) {
1923 	if (CONTD(sdat, i)) {
1924 	    /* not a real variable (string continuation) */
1925 	    nvars--;
1926 	}
1927     }
1928 
1929     if (!err) {
1930 	maybe_add_string_table(sdat);
1931     }
1932 
1933     if (!err) {
1934 	newset->v = nvars;
1935 	newset->n = sdat->nobs;
1936 	/* time-series info? */
1937 	err = start_new_Z(newset, 0);
1938 	if (err) {
1939 	    pputs(prn, _("Out of memory\n"));
1940 	    free_datainfo(newset);
1941 	    err = E_ALLOC;
1942 	}
1943     }
1944 
1945     *pdset = newset;
1946 
1947     return err;
1948 }
1949 
sav_get_data(const char * fname,DATASET * dset,gretlopt opt,PRN * prn)1950 int sav_get_data (const char *fname, DATASET *dset,
1951 		  gretlopt opt, PRN *prn)
1952 {
1953     spss_data sdat;
1954     struct sysfile_header hdr;
1955     char buf[5] = {0};
1956     FILE *fp;
1957     DATASET *newset = NULL;
1958     int err = 0;
1959 
1960     if (sizeof(double) != 8 || sizeof(int) != 4) {
1961 	/* be on the safe side */
1962 	pputs(prn, _("cannot read SPSS .sav on this platform"));
1963 	return E_DATA;
1964     }
1965 
1966     fp = gretl_fopen(fname, "rb");
1967     if (fp == NULL) {
1968 	return E_FOPEN;
1969     }
1970 
1971     if (fread(buf, 4, 1, fp) != 1) {
1972 	fclose(fp);
1973 	return E_DATA;
1974     }
1975 
1976     if (!strncmp("$FL2", buf, 4)) {
1977 	/* should be OK */
1978 	rewind(fp);
1979     } else {
1980 	fprintf(stderr, "file '%s' is not in SPSS sav format", fname);
1981 	fclose(fp);
1982 	return E_DATA;
1983     }
1984 
1985     spss_data_init(&sdat, fp);
1986     err = read_sav_header(&sdat, &hdr);
1987 
1988     if (!err) {
1989 	if (opt & OPT_D) {
1990 	    sdat.opt |= OPT_D;
1991 	}
1992 	read_sav_variables(&sdat, &hdr);
1993     }
1994 
1995     if (!err) {
1996 	err = read_sav_other_records(&sdat);
1997     }
1998 
1999     if (!err) {
2000 	err = prepare_gretl_dataset(&sdat, &newset, prn);
2001     }
2002 
2003     if (!err) {
2004 	err = read_sav_data(&sdat, &hdr, newset, prn);
2005     }
2006 
2007     if (err) {
2008 	destroy_dataset(newset);
2009     } else {
2010 	int merge = (dset->Z != NULL);
2011 
2012 	if (fix_varname_duplicates(newset)) {
2013 	    pputs(prn, _("warning: some variable names were duplicated\n"));
2014 	}
2015 
2016 	if (sdat.st != NULL) {
2017 	    if (sdat.nlabelsets > 0) {
2018 		add_label_mappings_to_st(&sdat);
2019 	    }
2020 	    gretl_string_table_print(sdat.st, newset, fname, prn);
2021 	}
2022 
2023 	if (sdat.descrip != NULL) {
2024 	    newset->descrip = sdat.descrip;
2025 	    sdat.descrip = NULL;
2026 	}
2027 
2028 	err = merge_or_replace_data(dset, &newset, get_merge_opts(opt), prn);
2029 
2030 	if (!err && !merge) {
2031 	    dataset_add_import_info(dset, fname, GRETL_SAV);
2032 	}
2033     }
2034 
2035     fclose(fp);
2036     free_spss_data(&sdat);
2037 
2038     return err;
2039 }
2040