1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-2000, 2006-2007, 2009-2016 Free Software Foundation, Inc.
3
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
16
17 #include <config.h>
18
19 #include "data/sys-file-private.h"
20
21 #include <errno.h>
22 #include <float.h>
23 #include <inttypes.h>
24 #include <stdlib.h>
25 #include <sys/stat.h>
26 #include <zlib.h>
27
28 #include "data/any-reader.h"
29 #include "data/attributes.h"
30 #include "data/case.h"
31 #include "data/casereader-provider.h"
32 #include "data/casereader.h"
33 #include "data/dictionary.h"
34 #include "data/file-handle-def.h"
35 #include "data/file-name.h"
36 #include "data/format.h"
37 #include "data/identifier.h"
38 #include "data/missing-values.h"
39 #include "data/mrset.h"
40 #include "data/short-names.h"
41 #include "data/value-labels.h"
42 #include "data/value.h"
43 #include "data/variable.h"
44 #include "libpspp/array.h"
45 #include "libpspp/assertion.h"
46 #include "libpspp/compiler.h"
47 #include "libpspp/i18n.h"
48 #include "libpspp/ll.h"
49 #include "libpspp/message.h"
50 #include "libpspp/misc.h"
51 #include "libpspp/pool.h"
52 #include "libpspp/str.h"
53 #include "libpspp/stringi-set.h"
54
55 #include "gl/c-strtod.h"
56 #include "gl/c-ctype.h"
57 #include "gl/inttostr.h"
58 #include "gl/localcharset.h"
59 #include "gl/minmax.h"
60 #include "gl/unlocked-io.h"
61 #include "gl/xalloc.h"
62 #include "gl/xalloc-oversized.h"
63 #include "gl/xsize.h"
64
65 #include "gettext.h"
66 #define _(msgid) gettext (msgid)
67 #define N_(msgid) (msgid)
68
/* Subtypes of type 7 (extension) records.

   read_record() uses the subtype read from the file as an index into the
   'extensions' array of struct sfm_reader; subtypes that fall outside that
   array are skipped with a warning. */
enum
  {
    /* subtypes 0-2 unknown */
    EXT_INTEGER = 3,            /* Machine integer info. */
    EXT_FLOAT = 4,              /* Machine floating-point info. */
    EXT_VAR_SETS = 5,           /* Variable sets. */
    EXT_DATE = 6,               /* DATE. */
    EXT_MRSETS = 7,             /* Multiple response sets. */
    EXT_DATA_ENTRY = 8,         /* SPSS Data Entry. */
    /* subtype 9 unknown */
    EXT_PRODUCT_INFO = 10,      /* Extra product info text. */
    EXT_DISPLAY = 11,           /* Variable display parameters. */
    /* subtype 12 unknown */
    EXT_LONG_NAMES = 13,        /* Long variable names. */
    EXT_LONG_STRINGS = 14,      /* Long strings. */
    /* subtype 15 unknown */
    EXT_NCASES = 16,            /* Extended number of cases. */
    EXT_FILE_ATTRS = 17,        /* Data file attributes. */
    EXT_VAR_ATTRS = 18,         /* Variable attributes. */
    EXT_MRSETS2 = 19,           /* Multiple response sets (extended). */
    EXT_ENCODING = 20,          /* Character encoding. */
    EXT_LONG_LABELS = 21,       /* Value labels for long strings. */
    EXT_LONG_MISSING = 22,      /* Missing values for long strings. */
    /* subtype 23 has no name defined here */
    EXT_DATAVIEW = 24           /* "Format properties in dataview table". */
  };
94
/* Fields from the top-level header record, as filled in by read_header(). */
struct sfm_header_record
  {
    char magic[5];              /* First 4 bytes of file, then null. */
    int weight_idx;             /* 0 if unweighted, otherwise a var index. */
    int nominal_case_size;      /* Number of var positions.  read_header()
                                   replaces a negative or implausibly large
                                   value from the file by -1. */

    /* These correspond to the members of struct any_read_info or a dictionary
       but in the system file's encoding rather than ASCII. */
    char creation_date[10];     /* "dd mmm yy". */
    char creation_time[9];      /* "hh:mm:ss". */
    char eye_catcher[61];       /* Eye-catcher string, then product name. */
    char file_label[65];        /* File label. */
  };
109
/* One type 2 (variable) record, as filled in by read_variable_record().
   read_record() appends one of these to the reader's 'vars' array for every
   type 2 record in the file. */
struct sfm_var_record
  {
    off_t pos;                  /* Presumably the record's offset in the
                                   file -- TODO confirm in
                                   read_variable_record(). */
    int width;                  /* Records with width -1 are not counted as
                                   variables by sfm_get_strings(). */
    char name[9];               /* Name, in the system file's encoding. */
    int print_format;           /* Raw print format; see
                                   parse_format_spec(). */
    int write_format;           /* Raw write format; see
                                   parse_format_spec(). */
    int missing_value_code;
    uint8_t missing[24];        /* Raw missing value data. */
    char *label;                /* Variable label in the system file's
                                   encoding, or null if there is none. */
    struct variable *var;       /* NOTE(review): presumably the dictionary
                                   variable created from this record; the
                                   code that sets it is not visible here. */
  };
122
/* One value/label pair inside a struct sfm_value_label_record. */
struct sfm_value_label
  {
    uint8_t value[8];           /* The value, as 8 raw bytes. */
    char *label;                /* Its label, in the system file's encoding
                                   (sfm_get_strings() reports it as a raw
                                   string). */
  };
128
/* One type 3 (value label) record, as filled in by
   read_value_label_record().  According to the comment in read_record(), the
   code that reads a type 3 record also reads the type 4 record that always
   follows it. */
struct sfm_value_label_record
  {
    off_t pos;                  /* Presumably the record's offset in the
                                   file -- TODO confirm. */
    struct sfm_value_label *labels;     /* Array of 'n_labels' pairs. */
    unsigned int n_labels;

    int *vars;                  /* Array of 'n_vars' elements; presumably
                                   the indexes of the variable records that
                                   the labels apply to -- TODO confirm
                                   against parse_value_labels(). */
    unsigned int n_vars;
  };
138
/* The type 6 (document) record, as read by read_document_record(). */
struct sfm_document_record
  {
    off_t pos;                  /* Presumably the record's offset in the
                                   file -- TODO confirm. */
    char *documents;            /* 'n_lines' lines of exactly 80 bytes each,
                                   stored back to back; sfm_get_strings()
                                   copies each one out and adds its own null
                                   terminator. */
    size_t n_lines;             /* Number of 80-byte lines in 'documents'. */
  };
145
/* A multiple response set in the form produced by parse_mrsets() from an
   EXT_MRSETS or EXT_MRSETS2 record.  Variables are still identified by name
   here; decode_mrsets() later processes these sets together with the
   dictionary. */
struct sfm_mrset
  {
    const char *name;           /* Name. */
    const char *label;          /* Human-readable label for group.
                                   sfm_get_strings() dereferences this
                                   without a null check and treats an empty
                                   string as "no label". */
    enum mrset_type type;       /* Group type. */
    const char **vars;          /* Constituent variables' names. */
    size_t n_vars;              /* Number of constituent variables. */

    /* MRSET_MD only. */
    enum mrset_md_cat_source cat_source; /* Source of category labels. */
    bool label_from_var_label;  /* 'label' taken from variable label? */
    const char *counted;        /* Counted value, as string (may be null). */
  };
159
/* A type 7 (extension) record held in memory.

   read_record() keeps at most one record per subtype in the 'extensions'
   array of struct sfm_reader.  The exception is subtype 18 (EXT_VAR_ATTRS):
   every such record is appended to the reader's 'var_attrs' list instead. */
struct sfm_extension_record
  {
    struct ll ll;               /* In struct sfm_reader 'var_attrs' list. */
    int subtype;                /* Record subtype. */
    off_t pos;                  /* Starting offset in file. */
    unsigned int size;          /* Size of data elements. */
    unsigned int count;         /* Number of data elements. */
    void *data;                 /* Contents.  sfm_get_encoding() and
                                   sfm_get_strings() use this directly as a
                                   C string for some subtypes, so it is
                                   presumably null-terminated by
                                   read_extension_record() -- TODO confirm. */
  };
169
/* System file reader.

   sfm_open() allocates and fills one of these, sfm_decode() turns it into a
   casereader, and sfm_close() releases it. */
struct sfm_reader
  {
    struct any_reader any_reader;       /* sfm_reader_cast() converts a
                                           pointer to this member back into
                                           the enclosing sfm_reader. */

    /* Resource tracking. */
    struct pool *pool;          /* All system file state.  sfm_open() also
                                   registers the reader itself with this
                                   pool. */

    /* File data. */
    struct any_read_info info;
    struct sfm_header_record header;
    struct sfm_var_record *vars;        /* One per type 2 record read. */
    size_t n_vars;
    struct sfm_value_label_record *labels; /* One per type 3 record read. */
    size_t n_labels;
    struct sfm_document_record *document;
    struct sfm_mrset *mrsets;
    size_t n_mrsets;
    struct sfm_extension_record *extensions[32]; /* Indexed by subtype; null
                                                    if no record with that
                                                    subtype was stored. */
    struct ll_list var_attrs;   /* Contains "struct sfm_extension_record"s. */

    /* File state. */
    struct file_handle *fh;     /* File handle. */
    struct fh_lock *lock;       /* Mutual exclusion for file handle. */
    FILE *file;                 /* File stream. */
    off_t pos;                  /* Position in file. */
    bool error;                 /* I/O or corruption error? */
    struct caseproto *proto;    /* Format of output cases. */

    /* File format. */
    enum integer_format integer_format; /* On-disk integer format. */
    enum float_format float_format; /* On-disk floating point format. */
    struct sfm_var *sfm_vars;   /* Variables. */
    size_t sfm_var_cnt;         /* Number of variables. */
    int case_cnt;               /* Number of cases.  sfm_decode() reports -1
                                   to the casereader as CASENUMBER_MAX. */
    const char *encoding;       /* String encoding. */
    bool written_by_readstat; /* From https://github.com/WizardMac/ReadStat? */

    /* Decompression. */
    enum any_compression compression;
    double bias;                /* Compression bias, usually 100.0. */
    uint8_t opcodes[8];         /* Current block of opcodes. */
    size_t opcode_idx;          /* Next opcode to interpret, 8 if none left. */
    bool corruption_warning;    /* Warned about possible corruption? */

    /* ZLIB decompression. */
    long long int ztrailer_ofs; /* Offset of ZLIB trailer at end of file. */
#define ZIN_BUF_SIZE 4096
    uint8_t *zin_buf;           /* Inflation input buffer. */
#define ZOUT_BUF_SIZE 16384
    uint8_t *zout_buf;          /* Inflation output buffer. */
    unsigned int zout_end;      /* Number of bytes of data in zout_buf. */
    unsigned int zout_pos;      /* First unconsumed byte in zout_buf. */
    z_stream zstream;           /* ZLIB inflater. */
  };
225
226 static const struct casereader_class sys_file_casereader_class;
227
228 static struct sfm_reader *
sfm_reader_cast(const struct any_reader * r_)229 sfm_reader_cast (const struct any_reader *r_)
230 {
231 assert (r_->klass == &sys_file_reader_class);
232 return UP_CAST (r_, struct sfm_reader, any_reader);
233 }
234
/* Declared early because sfm_open() calls it on its error path before the
   definition appears. */
static bool sfm_close (struct any_reader *);

/* Diagnostics.  Callers in this file pass the reader, a file offset (r->pos,
   a fixed offset, or -1), and then a printf-style format string with its
   arguments. */
static void sys_msg (struct sfm_reader *r, off_t, int class,
                     const char *format, va_list args)
     PRINTF_FORMAT (4, 0);
static void sys_warn (struct sfm_reader *, off_t, const char *, ...)
     PRINTF_FORMAT (3, 4);
static void sys_error (struct sfm_reader *, off_t, const char *, ...)
     PRINTF_FORMAT (3, 4);

/* Low-level input.  Every one of these is marked WARN_UNUSED_RESULT, so
   callers are expected to check the result. */
static bool read_bytes (struct sfm_reader *, void *, size_t)
  WARN_UNUSED_RESULT;
static int try_read_bytes (struct sfm_reader *, void *, size_t)
  WARN_UNUSED_RESULT;
static bool read_int (struct sfm_reader *, int *) WARN_UNUSED_RESULT;
static bool read_uint (struct sfm_reader *, unsigned int *) WARN_UNUSED_RESULT;
static bool read_int64 (struct sfm_reader *, long long int *)
  WARN_UNUSED_RESULT;
static bool read_uint64 (struct sfm_reader *, unsigned long long int *)
  WARN_UNUSED_RESULT;
static bool read_string (struct sfm_reader *, char *, size_t)
  WARN_UNUSED_RESULT;
static bool skip_bytes (struct sfm_reader *, size_t) WARN_UNUSED_RESULT;
258
/* ZLIB compressed data handling.

   read_dictionary() calls read_zheader() after the dictionary records when
   the file's compression is ANY_COMP_ZLIB.  As with the other input
   functions, results must not be ignored (WARN_UNUSED_RESULT). */
static bool read_zheader (struct sfm_reader *) WARN_UNUSED_RESULT;
static bool open_zstream (struct sfm_reader *) WARN_UNUSED_RESULT;
static bool close_zstream (struct sfm_reader *) WARN_UNUSED_RESULT;
static int read_bytes_zlib (struct sfm_reader *, void *, size_t)
  WARN_UNUSED_RESULT;
static int read_compressed_bytes (struct sfm_reader *, void *, size_t)
  WARN_UNUSED_RESULT;
static int try_read_compressed_bytes (struct sfm_reader *, void *, size_t)
  WARN_UNUSED_RESULT;
static bool read_compressed_float (struct sfm_reader *, double *)
  WARN_UNUSED_RESULT;
271
static char *fix_line_ends (const char *);

/* Extract a number found at offset OFS within DATA.  sfm_get_encoding()
   passes 7 * 4 as OFS, so the offset is presumably counted in bytes -- TODO
   confirm against the definitions. */
static int parse_int (const struct sfm_reader *, const void *data, size_t ofs);
static double parse_float (const struct sfm_reader *,
                           const void *data, size_t ofs);

/* Readers for the individual record types; read_record() dispatches to
   these according to the record type (and, for type 7, the subtype). */
static bool read_variable_record (struct sfm_reader *,
                                  struct sfm_var_record *);
static bool read_value_label_record (struct sfm_reader *,
                                     struct sfm_value_label_record *);
static bool read_document_record (struct sfm_reader *);
static bool read_extension_record (struct sfm_reader *, int subtype,
                                   struct sfm_extension_record **);
static bool skip_extension_record (struct sfm_reader *, int subtype);

/* Helpers that work on a "struct text_record" opened from an extension
   record.  NOTE(review): struct text_record and these functions are defined
   outside the part of the file reviewed here. */
static struct text_record *open_text_record (
  struct sfm_reader *, const struct sfm_extension_record *,
  bool recode_to_utf8);
static void close_text_record (struct sfm_reader *,
                               struct text_record *);
static bool read_variable_to_value_pair (struct sfm_reader *,
                                         struct dictionary *,
                                         struct text_record *,
                                         struct variable **var, char **value);
static void text_warn (struct sfm_reader *r, struct text_record *text,
                       const char *format, ...) PRINTF_FORMAT (3, 4);
static char *text_get_token (struct text_record *,
                             struct substring delimiters, char *delimiter);
static bool text_match (struct text_record *, char c);
static bool text_read_variable_name (struct sfm_reader *, struct dictionary *,
                                     struct text_record *,
                                     struct substring delimiters,
                                     struct variable **);
static bool text_read_short_name (struct sfm_reader *, struct dictionary *,
                                  struct text_record *,
                                  struct substring delimiters,
                                  struct variable **);
static const char *text_parse_counted_string (struct sfm_reader *,
                                              struct text_record *);
static size_t text_pos (const struct text_record *);
static const char *text_get_all (const struct text_record *);
313
/* Dictionary reader. */

/* Argument to parse_format_spec().  Presumably it tells that function
   whether the raw format it is given is a variable's print format or its
   write format -- TODO confirm against the definition. */
enum which_format
  {
    PRINT_FORMAT,
    WRITE_FORMAT
  };
321
/* Reading the raw dictionary records: sfm_open() calls read_dictionary(),
   which calls read_header() and then read_record() for each record. */
static bool read_dictionary (struct sfm_reader *);
static bool read_record (struct sfm_reader *, int type,
                         size_t *allocated_vars, size_t *allocated_labels);
static bool read_header (struct sfm_reader *, struct any_read_info *,
                         struct sfm_header_record *);

/* Interpreting the records that were read.  Apart from parse_mrsets(), which
   sfm_open() calls, these are driven by sfm_decode(), whose comments explain
   the order in which they have to run. */
static void parse_header (struct sfm_reader *,
                          const struct sfm_header_record *,
                          struct any_read_info *, struct dictionary *);
static bool parse_variable_records (struct sfm_reader *, struct dictionary *,
                                    struct sfm_var_record *, size_t n);
static void parse_format_spec (struct sfm_reader *, off_t pos,
                               unsigned int format, enum which_format,
                               struct variable *, int *format_warning_cnt);
static void parse_document (struct dictionary *, struct sfm_document_record *);
static void parse_display_parameters (struct sfm_reader *,
                                      const struct sfm_extension_record *,
                                      struct dictionary *);
static bool parse_machine_integer_info (struct sfm_reader *,
                                        const struct sfm_extension_record *,
                                        struct any_read_info *);
static void parse_machine_float_info (struct sfm_reader *,
                                      const struct sfm_extension_record *);
static void parse_extra_product_info (struct sfm_reader *,
                                      const struct sfm_extension_record *,
                                      struct any_read_info *);
static void parse_mrsets (struct sfm_reader *,
                          const struct sfm_extension_record *,
                          size_t *allocated_mrsets);
static void decode_mrsets (struct sfm_reader *, struct dictionary *);
static void parse_long_var_name_map (struct sfm_reader *,
                                     const struct sfm_extension_record *,
                                     struct dictionary *);
static bool parse_long_string_map (struct sfm_reader *,
                                   const struct sfm_extension_record *,
                                   struct dictionary *);
static void parse_value_labels (struct sfm_reader *, struct dictionary *);
static struct variable *parse_weight_var (struct sfm_reader *,
                                          const struct sfm_var_record *, size_t n_var_recs,
                                          int idx);
static void parse_data_file_attributes (struct sfm_reader *,
                                        const struct sfm_extension_record *,
                                        struct dictionary *);
static void parse_variable_attributes (struct sfm_reader *,
                                       const struct sfm_extension_record *,
                                       struct dictionary *);
static void assign_variable_roles (struct sfm_reader *, struct dictionary *);
static void parse_long_string_value_labels (struct sfm_reader *,
                                            const struct sfm_extension_record *,
                                            struct dictionary *);
static void parse_long_string_missing_values (
  struct sfm_reader *, const struct sfm_extension_record *,
  struct dictionary *);
374
375 /* Frees the strings inside INFO. */
376 void
any_read_info_destroy(struct any_read_info * info)377 any_read_info_destroy (struct any_read_info *info)
378 {
379 if (info)
380 {
381 free (info->creation_date);
382 free (info->creation_time);
383 free (info->product);
384 free (info->product_ext);
385 }
386 }
387
388 /* Tries to open FH for reading as a system file. Returns an sfm_reader if
389 successful, otherwise NULL. */
390 static struct any_reader *
sfm_open(struct file_handle * fh)391 sfm_open (struct file_handle *fh)
392 {
393 size_t allocated_mrsets = 0;
394 struct sfm_reader *r;
395
396 /* Create and initialize reader. */
397 r = xzalloc (sizeof *r);
398 r->any_reader.klass = &sys_file_reader_class;
399 r->pool = pool_create ();
400 pool_register (r->pool, free, r);
401 r->fh = fh_ref (fh);
402 r->opcode_idx = sizeof r->opcodes;
403 ll_init (&r->var_attrs);
404
405 /* TRANSLATORS: this fragment will be interpolated into
406 messages in fh_lock() that identify types of files. */
407 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
408 if (r->lock == NULL)
409 goto error;
410
411 r->file = fn_open (fh, "rb");
412 if (r->file == NULL)
413 {
414 msg (ME, _("Error opening `%s' for reading as a system file: %s."),
415 fh_get_file_name (r->fh), strerror (errno));
416 goto error;
417 }
418
419 if (!read_dictionary (r))
420 goto error;
421
422 if (r->extensions[EXT_MRSETS] != NULL)
423 parse_mrsets (r, r->extensions[EXT_MRSETS], &allocated_mrsets);
424
425 if (r->extensions[EXT_MRSETS2] != NULL)
426 parse_mrsets (r, r->extensions[EXT_MRSETS2], &allocated_mrsets);
427
428 return &r->any_reader;
429
430 error:
431 if (r)
432 sfm_close (&r->any_reader);
433 return NULL;
434 }
435
436 static bool
read_dictionary(struct sfm_reader * r)437 read_dictionary (struct sfm_reader *r)
438 {
439 size_t allocated_vars;
440 size_t allocated_labels;
441
442 if (!read_header (r, &r->info, &r->header))
443 return false;
444
445 allocated_vars = 0;
446 allocated_labels = 0;
447 for (;;)
448 {
449 int type;
450
451 if (!read_int (r, &type))
452 return false;
453 if (type == 999)
454 break;
455 if (!read_record (r, type, &allocated_vars, &allocated_labels))
456 return false;
457 }
458
459 if (!skip_bytes (r, 4))
460 return false;
461
462 if (r->compression == ANY_COMP_ZLIB && !read_zheader (r))
463 return false;
464
465 return true;
466 }
467
468 static bool
read_record(struct sfm_reader * r,int type,size_t * allocated_vars,size_t * allocated_labels)469 read_record (struct sfm_reader *r, int type,
470 size_t *allocated_vars, size_t *allocated_labels)
471 {
472 int subtype;
473
474 switch (type)
475 {
476 case 2:
477 if (r->n_vars >= *allocated_vars)
478 r->vars = pool_2nrealloc (r->pool, r->vars, allocated_vars,
479 sizeof *r->vars);
480 return read_variable_record (r, &r->vars[r->n_vars++]);
481
482 case 3:
483 if (r->n_labels >= *allocated_labels)
484 r->labels = pool_2nrealloc (r->pool, r->labels, allocated_labels,
485 sizeof *r->labels);
486 return read_value_label_record (r, &r->labels[r->n_labels++]);
487
488 case 4:
489 /* A Type 4 record is always immediately after a type 3 record,
490 so the code for type 3 records reads the type 4 record too. */
491 sys_error (r, r->pos, _("Misplaced type 4 record."));
492 return false;
493
494 case 6:
495 if (r->document != NULL)
496 sys_warn (r, r->pos, _("Duplicate type 6 (document) record."));
497 return read_document_record (r);
498
499 case 7:
500 if (!read_int (r, &subtype))
501 return false;
502 else if (subtype < 0
503 || subtype >= sizeof r->extensions / sizeof *r->extensions)
504 {
505 sys_warn (r, r->pos,
506 _("Unrecognized record type 7, subtype %d. For help, "
507 "please send this file to %s and mention that you were "
508 "using %s."),
509 subtype, PACKAGE_BUGREPORT, PACKAGE_STRING);
510 return skip_extension_record (r, subtype);
511 }
512 else if (subtype == 18)
513 {
514 /* System files written by "Stata 14.1/-savespss- 1.77 by S.Radyakin"
515 put each variable attribute into a separate record with subtype
516 18. I'm surprised that SPSS puts up with this. */
517 struct sfm_extension_record *ext;
518 bool ok = read_extension_record (r, subtype, &ext);
519 if (ok && ext)
520 ll_push_tail (&r->var_attrs, &ext->ll);
521 return ok;
522 }
523 else if (r->extensions[subtype] != NULL)
524 {
525 sys_warn (r, r->pos,
526 _("Record type 7, subtype %d found here has the same "
527 "type as the record found near offset 0x%llx. For "
528 "help, please send this file to %s and mention that "
529 "you were using %s."),
530 subtype, (long long int) r->extensions[subtype]->pos,
531 PACKAGE_BUGREPORT, PACKAGE_STRING);
532 return skip_extension_record (r, subtype);
533 }
534 else
535 return read_extension_record (r, subtype, &r->extensions[subtype]);
536
537 default:
538 sys_error (r, r->pos, _("Unrecognized record type %d."), type);
539 return false;
540 }
541
542 NOT_REACHED ();
543 }
544
545 /* Returns the character encoding obtained from R, or a null pointer if R
546 doesn't have an indication of its character encoding. */
547 static const char *
sfm_get_encoding(const struct sfm_reader * r)548 sfm_get_encoding (const struct sfm_reader *r)
549 {
550 /* The EXT_ENCODING record is the best way to determine dictionary
551 encoding. */
552 if (r->extensions[EXT_ENCODING])
553 return r->extensions[EXT_ENCODING]->data;
554
555 /* But EXT_INTEGER is better than nothing as a fallback. */
556 if (r->extensions[EXT_INTEGER])
557 {
558 int codepage = parse_int (r, r->extensions[EXT_INTEGER]->data, 7 * 4);
559 const char *encoding;
560
561 switch (codepage)
562 {
563 case 1:
564 return "EBCDIC-US";
565
566 case 2:
567 case 3:
568 /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
569 respectively. However, many files have character code 2 but data
570 which are clearly not ASCII. Therefore, ignore these values. */
571 break;
572
573 case 4:
574 return "MS_KANJI";
575
576 default:
577 encoding = sys_get_encoding_from_codepage (codepage);
578 if (encoding != NULL)
579 return encoding;
580 break;
581 }
582 }
583
584 /* If the file magic number is EBCDIC then its character data is too. */
585 if (!strcmp (r->header.magic, EBCDIC_MAGIC))
586 return "EBCDIC-US";
587
588 return NULL;
589 }
590
/* Accumulator used by sfm_get_strings().  'titles', 'strings', and 'ids' are
   parallel arrays that add_string__() grows together. */
struct get_strings_aux
  {
    struct pool *pool;          /* Pool that all allocations come from. */
    char **titles;              /* Description of each string. */
    char **strings;             /* Pool-allocated copy of each string. */
    bool *ids;                  /* Whether each string was added as an
                                   identifier (add_id) rather than free-form
                                   text (add_string). */
    size_t allocated;           /* Capacity of each of the three arrays. */
    size_t n;                   /* Number of elements in use. */
  };
600
601 static void
add_string__(struct get_strings_aux * aux,const char * string,bool id,char * title)602 add_string__ (struct get_strings_aux *aux,
603 const char *string, bool id, char *title)
604 {
605 if (aux->n >= aux->allocated)
606 {
607 aux->allocated = 2 * (aux->allocated + 1);
608 aux->titles = pool_realloc (aux->pool, aux->titles,
609 aux->allocated * sizeof *aux->titles);
610 aux->strings = pool_realloc (aux->pool, aux->strings,
611 aux->allocated * sizeof *aux->strings);
612 aux->ids = pool_realloc (aux->pool, aux->ids,
613 aux->allocated * sizeof *aux->ids);
614 }
615
616 aux->titles[aux->n] = title;
617 aux->strings[aux->n] = pool_strdup (aux->pool, string);
618 aux->ids[aux->n] = id;
619 aux->n++;
620 }
621
622 static void PRINTF_FORMAT (3, 4)
add_string(struct get_strings_aux * aux,const char * string,const char * title,...)623 add_string (struct get_strings_aux *aux,
624 const char *string, const char *title, ...)
625 {
626 va_list args;
627
628 va_start (args, title);
629 add_string__ (aux, string, false, pool_vasprintf (aux->pool, title, args));
630 va_end (args);
631 }
632
633 static void PRINTF_FORMAT (3, 4)
add_id(struct get_strings_aux * aux,const char * id,const char * title,...)634 add_id (struct get_strings_aux *aux, const char *id, const char *title, ...)
635 {
636 va_list args;
637
638 va_start (args, title);
639 add_string__ (aux, id, true, pool_vasprintf (aux->pool, title, args));
640 va_end (args);
641 }
642
643 /* Retrieves significant string data from R in its raw format, to allow the
644 caller to try to detect the encoding in use.
645
646 Returns the number of strings retrieved N. Sets each of *TITLESP, *IDSP,
647 and *STRINGSP to an array of N elements allocated from POOL. For each I in
648 0...N-1, UTF-8 string *TITLESP[I] describes *STRINGSP[I], which is in
649 whatever encoding system file R uses. *IDS[I] is true if *STRINGSP[I] must
650 be a valid PSPP language identifier, false if *STRINGSP[I] is free-form
651 text. */
652 static size_t
sfm_get_strings(const struct any_reader * r_,struct pool * pool,char *** titlesp,bool ** idsp,char *** stringsp)653 sfm_get_strings (const struct any_reader *r_, struct pool *pool,
654 char ***titlesp, bool **idsp, char ***stringsp)
655 {
656 struct sfm_reader *r = sfm_reader_cast (r_);
657 const struct sfm_mrset *mrset;
658 struct get_strings_aux aux;
659 size_t var_idx;
660 size_t i, j, k;
661
662 aux.pool = pool;
663 aux.titles = NULL;
664 aux.strings = NULL;
665 aux.ids = NULL;
666 aux.allocated = 0;
667 aux.n = 0;
668
669 var_idx = 0;
670 for (i = 0; i < r->n_vars; i++)
671 if (r->vars[i].width != -1)
672 add_id (&aux, r->vars[i].name, _("Variable %zu"), ++var_idx);
673
674 var_idx = 0;
675 for (i = 0; i < r->n_vars; i++)
676 if (r->vars[i].width != -1)
677 {
678 var_idx++;
679 if (r->vars[i].label)
680 add_string (&aux, r->vars[i].label, _("Variable %zu Label"),
681 var_idx);
682 }
683
684 k = 0;
685 for (i = 0; i < r->n_labels; i++)
686 for (j = 0; j < r->labels[i].n_labels; j++)
687 add_string (&aux, r->labels[i].labels[j].label,
688 _("Value Label %zu"), k++);
689
690 add_string (&aux, r->header.creation_date, _("Creation Date"));
691 add_string (&aux, r->header.creation_time, _("Creation Time"));
692 add_string (&aux, r->header.eye_catcher, _("Product"));
693 add_string (&aux, r->header.file_label, _("File Label"));
694
695 if (r->extensions[EXT_PRODUCT_INFO])
696 add_string (&aux, r->extensions[EXT_PRODUCT_INFO]->data,
697 _("Extra Product Info"));
698
699 if (r->document)
700 {
701 size_t i;
702
703 for (i = 0; i < r->document->n_lines; i++)
704 {
705 char line[81];
706
707 memcpy (line, r->document->documents + i * 80, 80);
708 line[80] = '\0';
709
710 add_string (&aux, line, _("Document Line %zu"), i + 1);
711 }
712 }
713
714 for (mrset = r->mrsets; mrset < &r->mrsets[r->n_mrsets]; mrset++)
715 {
716 size_t mrset_idx = mrset - r->mrsets + 1;
717
718 add_id (&aux, mrset->name, _("MRSET %zu"), mrset_idx);
719 if (mrset->label[0])
720 add_string (&aux, mrset->label, _("MRSET %zu Label"), mrset_idx);
721
722 /* Skip the variables because they ought to be duplicates. */
723
724 if (mrset->counted)
725 add_string (&aux, mrset->counted, _("MRSET %zu Counted Value"),
726 mrset_idx);
727 }
728
729 /* data file attributes */
730 /* variable attributes */
731 /* long var map */
732 /* long string value labels */
733 /* long string missing values */
734
735 *titlesp = aux.titles;
736 *idsp = aux.ids;
737 *stringsp = aux.strings;
738 return aux.n;
739 }
740
/* Decodes the dictionary read from R, saving it into *DICTP.  Character
   strings in R are decoded using ENCODING, or an encoding obtained from R if
   ENCODING is null, or the locale encoding if R specifies no encoding.

   If INFOP is non-null, then it receives additional info about the system
   file, which the caller must eventually free with any_read_info_destroy()
   when it is no longer needed.

   Returns a casereader for the file's cases if successful.  On failure,
   closes R, stores a null pointer in *DICTP, and returns a null pointer.

   This function consumes R.  The caller must not use it again later, even to
   destroy it with sfm_close(): on success R is handed to the returned
   casereader, and on failure R has already been closed here. */
static struct casereader *
sfm_decode (struct any_reader *r_, const char *encoding,
            struct dictionary **dictp, struct any_read_info *infop)
{
  struct sfm_reader *r = sfm_reader_cast (r_);
  struct dictionary *dict;

  if (encoding == NULL)
    {
      encoding = sfm_get_encoding (r);
      if (encoding == NULL)
        {
          sys_warn (r, -1, _("This system file does not indicate its own "
                             "character encoding. Using default encoding "
                             "%s. For best results, specify an encoding "
                             "explicitly. Use SYSFILE INFO with "
                             "ENCODING=\"DETECT\" to analyze the possible "
                             "encodings."),
                    locale_charset ());
          encoding = locale_charset ();
        }
    }

  dict = dict_create (encoding);
  /* Keep the dictionary's own copy of the encoding name rather than the
     caller's pointer. */
  r->encoding = dict_get_encoding (dict);

  /* These records don't use variables at all. */
  if (r->document != NULL)
    parse_document (dict, r->document);

  if (r->extensions[EXT_INTEGER] != NULL
      && !parse_machine_integer_info (r, r->extensions[EXT_INTEGER], &r->info))
    goto error;

  if (r->extensions[EXT_FLOAT] != NULL)
    parse_machine_float_info (r, r->extensions[EXT_FLOAT]);

  if (r->extensions[EXT_PRODUCT_INFO] != NULL)
    parse_extra_product_info (r, r->extensions[EXT_PRODUCT_INFO], &r->info);

  if (r->extensions[EXT_FILE_ATTRS] != NULL)
    parse_data_file_attributes (r, r->extensions[EXT_FILE_ATTRS], dict);

  parse_header (r, &r->header, &r->info, dict);

  /* Parse the variable records, the basis of almost everything else. */
  if (!parse_variable_records (r, dict, r->vars, r->n_vars))
    goto error;

  /* Parse value labels and the weight variable immediately after the variable
     records.  These records use indexes into r->vars[], so we must parse them
     before those indexes become invalidated by very long string variables. */
  parse_value_labels (r, dict);
  if (r->header.weight_idx != 0)
    dict_set_weight (dict, parse_weight_var (r, r->vars, r->n_vars,
                                             r->header.weight_idx));

  if (r->extensions[EXT_DISPLAY] != NULL)
    parse_display_parameters (r, r->extensions[EXT_DISPLAY], dict);

  /* The following records use short names, so they need to be parsed before
     parse_long_var_name_map() changes short names to long names. */
  decode_mrsets (r, dict);

  if (r->extensions[EXT_LONG_STRINGS] != NULL
      && !parse_long_string_map (r, r->extensions[EXT_LONG_STRINGS], dict))
    goto error;

  /* Now rename variables to their long names.  Unlike the other parsers,
     this one is called even when the file has no EXT_LONG_NAMES record, in
     which case it receives a null record. */
  parse_long_var_name_map (r, r->extensions[EXT_LONG_NAMES], dict);

  /* The following records use long names, so they need to follow renaming. */
  if (!ll_is_empty (&r->var_attrs))
    {
      struct sfm_extension_record *ext;
      ll_for_each (ext, struct sfm_extension_record, ll, &r->var_attrs)
        parse_variable_attributes (r, ext, dict);

      /* Roles use the $@Role attribute. */
      assign_variable_roles (r, dict);
    }
  if (r->extensions[EXT_LONG_LABELS] != NULL)
    parse_long_string_value_labels (r, r->extensions[EXT_LONG_LABELS], dict);
  if (r->extensions[EXT_LONG_MISSING] != NULL)
    parse_long_string_missing_values (r, r->extensions[EXT_LONG_MISSING],
                                      dict);

  /* Warn if the actual amount of data per case differs from the
     amount that the header claims.  SPSS version 13 gets this
     wrong when very long strings are involved, so don't warn in
     that case. */
  if (r->header.nominal_case_size > 0
      && r->header.nominal_case_size != r->n_vars
      && r->info.version_major != 13)
    sys_warn (r, -1, _("File header claims %d variable positions but "
                       "%zu were read from file."),
              r->header.nominal_case_size, r->n_vars);

  /* Create an index of dictionary variable widths for
     sfm_read_case to use.  We cannot use the `struct variable's
     from the dictionary we created, because the caller owns the
     dictionary and may destroy or modify its variables. */
  sfm_dictionary_to_sfm_vars (dict, &r->sfm_vars, &r->sfm_var_cnt);
  pool_register (r->pool, free, r->sfm_vars);
  r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool);

  *dictp = dict;
  if (infop)
    {
      /* Hand the info strings over to the caller and clear our copy so that
         sfm_close() does not free them as well. */
      *infop = r->info;
      memset (&r->info, 0, sizeof r->info);
    }

  /* A case count of -1 is passed on as CASENUMBER_MAX. */
  return casereader_create_sequential
    (NULL, r->proto,
     r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
                                       &sys_file_casereader_class, r);

 error:
  sfm_close (r_);
  dict_unref (dict);
  *dictp = NULL;
  return NULL;
}
875
876 /* Closes R, which should have been returned by sfm_open() but not already
877 closed with sfm_decode() or this function.
878 Returns true if an I/O error has occurred on READER, false
879 otherwise. */
880 static bool
sfm_close(struct any_reader * r_)881 sfm_close (struct any_reader *r_)
882 {
883 struct sfm_reader *r = sfm_reader_cast (r_);
884 bool error;
885
886 if (r->file)
887 {
888 if (fn_close (r->fh, r->file) == EOF)
889 {
890 msg (ME, _("Error closing system file `%s': %s."),
891 fh_get_file_name (r->fh), strerror (errno));
892 r->error = true;
893 }
894 r->file = NULL;
895 }
896
897 any_read_info_destroy (&r->info);
898 fh_unlock (r->lock);
899 fh_unref (r->fh);
900
901 error = r->error;
902 pool_destroy (r->pool);
903
904 return !error;
905 }
906
907 /* Destroys READER. */
908 static void
sys_file_casereader_destroy(struct casereader * reader UNUSED,void * r_)909 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
910 {
911 struct sfm_reader *r = r_;
912 sfm_close (&r->any_reader);
913 }
914
915 /* Detects whether FILE is an SPSS system file. Returns 1 if so, 0 if not, and
916 a negative errno value if there is an error reading FILE. */
917 static int
sfm_detect(FILE * file)918 sfm_detect (FILE *file)
919 {
920 char magic[5];
921
922 if (fseek (file, 0, SEEK_SET) != 0)
923 return -errno;
924 if (fread (magic, 4, 1, file) != 1)
925 return ferror (file) ? -errno : 0;
926 magic[4] = '\0';
927
928 return (!strcmp (ASCII_MAGIC, magic)
929 || !strcmp (ASCII_ZMAGIC, magic)
930 || !strcmp (EBCDIC_MAGIC, magic));
931 }
932
933 /* Reads the global header of the system file. Initializes *HEADER and *INFO,
934 except for the string fields in *INFO, which parse_header() will initialize
935 later once the file's encoding is known. */
936 static bool
read_header(struct sfm_reader * r,struct any_read_info * info,struct sfm_header_record * header)937 read_header (struct sfm_reader *r, struct any_read_info *info,
938 struct sfm_header_record *header)
939 {
940 uint8_t raw_layout_code[4];
941 uint8_t raw_bias[8];
942 int compressed;
943 bool zmagic;
944
945 if (!read_string (r, header->magic, sizeof header->magic)
946 || !read_string (r, header->eye_catcher, sizeof header->eye_catcher))
947 return false;
948 r->written_by_readstat = strstr (header->eye_catcher,
949 "https://github.com/WizardMac/ReadStat");
950
951 if (!strcmp (ASCII_MAGIC, header->magic)
952 || !strcmp (EBCDIC_MAGIC, header->magic))
953 zmagic = false;
954 else if (!strcmp (ASCII_ZMAGIC, header->magic))
955 zmagic = true;
956 else
957 {
958 sys_error (r, 0, _("This is not an SPSS system file."));
959 return false;
960 }
961
962 /* Identify integer format. */
963 if (!read_bytes (r, raw_layout_code, sizeof raw_layout_code))
964 return false;
965 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
966 &r->integer_format)
967 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
968 &r->integer_format))
969 || (r->integer_format != INTEGER_MSB_FIRST
970 && r->integer_format != INTEGER_LSB_FIRST))
971 {
972 sys_error (r, 64, _("This is not an SPSS system file."));
973 return false;
974 }
975
976 if (!read_int (r, &header->nominal_case_size))
977 return false;
978
979 if (header->nominal_case_size < 0
980 || header->nominal_case_size > INT_MAX / 16)
981 header->nominal_case_size = -1;
982
983 if (!read_int (r, &compressed))
984 return false;
985 if (!zmagic)
986 {
987 if (compressed == 0)
988 r->compression = ANY_COMP_NONE;
989 else if (compressed == 1)
990 r->compression = ANY_COMP_SIMPLE;
991 else if (compressed != 0)
992 {
993 sys_error (r, 0, "System file header has invalid compression "
994 "value %d.", compressed);
995 return false;
996 }
997 }
998 else
999 {
1000 if (compressed == 2)
1001 r->compression = ANY_COMP_ZLIB;
1002 else
1003 {
1004 sys_error (r, 0, "ZLIB-compressed system file header has invalid "
1005 "compression value %d.", compressed);
1006 return false;
1007 }
1008 }
1009
1010 if (!read_int (r, &header->weight_idx))
1011 return false;
1012
1013 if (!read_int (r, &r->case_cnt))
1014 return false;
1015 if (r->case_cnt > INT_MAX / 2)
1016 r->case_cnt = -1;
1017
1018 /* Identify floating-point format and obtain compression bias. */
1019 if (!read_bytes (r, raw_bias, sizeof raw_bias))
1020 return false;
1021 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
1022 {
1023 uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
1024
1025 if (memcmp (raw_bias, zero_bias, 8))
1026 sys_warn (r, r->pos - 8,
1027 _("Compression bias is not the usual "
1028 "value of 100, or system file uses unrecognized "
1029 "floating-point format."));
1030 else
1031 {
1032 /* Some software is known to write all-zeros to this
1033 field. Such software also writes floating-point
1034 numbers in the format that we expect by default
1035 (it seems that all software most likely does, in
1036 reality), so don't warn in this case. */
1037 }
1038
1039 if (r->integer_format == INTEGER_MSB_FIRST)
1040 r->float_format = FLOAT_IEEE_DOUBLE_BE;
1041 else
1042 r->float_format = FLOAT_IEEE_DOUBLE_LE;
1043 }
1044 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
1045
1046 if (!read_string (r, header->creation_date, sizeof header->creation_date)
1047 || !read_string (r, header->creation_time, sizeof header->creation_time)
1048 || !read_string (r, header->file_label, sizeof header->file_label)
1049 || !skip_bytes (r, 3))
1050 return false;
1051
1052 info->integer_format = r->integer_format;
1053 info->float_format = r->float_format;
1054 info->compression = r->compression;
1055 info->case_cnt = r->case_cnt;
1056
1057 return true;
1058 }
1059
1060 /* Reads a variable (type 2) record from R into RECORD. */
1061 static bool
read_variable_record(struct sfm_reader * r,struct sfm_var_record * record)1062 read_variable_record (struct sfm_reader *r, struct sfm_var_record *record)
1063 {
1064 int has_variable_label;
1065
1066 memset (record, 0, sizeof *record);
1067
1068 record->pos = r->pos;
1069 if (!read_int (r, &record->width)
1070 || !read_int (r, &has_variable_label)
1071 || !read_int (r, &record->missing_value_code)
1072 || !read_int (r, &record->print_format)
1073 || !read_int (r, &record->write_format)
1074 || !read_string (r, record->name, sizeof record->name))
1075 return false;
1076
1077 if (has_variable_label == 1)
1078 {
1079 enum { MAX_LABEL_LEN = 65536 };
1080 unsigned int len, read_len;
1081
1082 if (!read_uint (r, &len))
1083 return false;
1084
1085 /* Read up to MAX_LABEL_LEN bytes of label. */
1086 read_len = MIN (MAX_LABEL_LEN, len);
1087 record->label = pool_malloc (r->pool, read_len + 1);
1088 if (!read_string (r, record->label, read_len + 1))
1089 return false;
1090
1091 /* Skip unread label bytes. */
1092 if (!skip_bytes (r, len - read_len))
1093 return false;
1094
1095 /* Skip label padding up to multiple of 4 bytes. */
1096 if (!skip_bytes (r, ROUND_UP (len, 4) - len))
1097 return false;
1098 }
1099 else if (has_variable_label != 0)
1100 {
1101 sys_error (r, record->pos,
1102 _("Variable label indicator field is not 0 or 1."));
1103 return false;
1104 }
1105
1106 /* Set missing values. */
1107 if (record->missing_value_code != 0)
1108 {
1109 int code = record->missing_value_code;
1110 if (record->width == 0)
1111 {
1112 if (code < -3 || code > 3 || code == -1)
1113 {
1114 sys_error (r, record->pos,
1115 _("Numeric missing value indicator field is not "
1116 "-3, -2, 0, 1, 2, or 3."));
1117 return false;
1118 }
1119 }
1120 else
1121 {
1122 if (code < 1 || code > 3)
1123 {
1124 sys_error (r, record->pos,
1125 _("String missing value indicator field is not "
1126 "0, 1, 2, or 3."));
1127 return false;
1128 }
1129 }
1130
1131 if (!read_bytes (r, record->missing, 8 * abs (code)))
1132 return false;
1133 }
1134
1135 return true;
1136 }
1137
1138 /* Reads value labels from R into RECORD. */
1139 static bool
read_value_label_record(struct sfm_reader * r,struct sfm_value_label_record * record)1140 read_value_label_record (struct sfm_reader *r,
1141 struct sfm_value_label_record *record)
1142 {
1143 size_t i;
1144 int type;
1145
1146 /* Read type 3 record. */
1147 record->pos = r->pos;
1148 if (!read_uint (r, &record->n_labels))
1149 return false;
1150 if (record->n_labels > UINT_MAX / sizeof *record->labels)
1151 {
1152 sys_error (r, r->pos - 4, _("Invalid number of labels %u."),
1153 record->n_labels);
1154 return false;
1155 }
1156 record->labels = pool_nmalloc (r->pool, record->n_labels,
1157 sizeof *record->labels);
1158 for (i = 0; i < record->n_labels; i++)
1159 {
1160 struct sfm_value_label *label = &record->labels[i];
1161 unsigned char label_len;
1162 size_t padded_len;
1163
1164 if (!read_bytes (r, label->value, sizeof label->value))
1165 return false;
1166
1167 /* Read label length. */
1168 if (!read_bytes (r, &label_len, sizeof label_len))
1169 return false;
1170 padded_len = ROUND_UP (label_len + 1, 8);
1171
1172 /* Read label, padding. */
1173 label->label = pool_malloc (r->pool, padded_len + 1);
1174 if (!read_bytes (r, label->label, padded_len - 1))
1175 return false;
1176 label->label[label_len] = '\0';
1177 }
1178
1179 /* Read record type of type 4 record. */
1180 if (!read_int (r, &type))
1181 return false;
1182 if (type != 4)
1183 {
1184 sys_error (r, r->pos - 4,
1185 _("Variable index record (type 4) does not immediately "
1186 "follow value label record (type 3) as it should."));
1187 return false;
1188 }
1189
1190 /* Read number of variables associated with value label from type 4
1191 record. */
1192 if (!read_uint (r, &record->n_vars))
1193 return false;
1194 if (record->n_vars < 1 || record->n_vars > r->n_vars)
1195 {
1196 sys_error (r, r->pos - 4,
1197 _("Number of variables associated with a value label (%u) "
1198 "is not between 1 and the number of variables (%zu)."),
1199 record->n_vars, r->n_vars);
1200 return false;
1201 }
1202
1203 record->vars = pool_nmalloc (r->pool, record->n_vars, sizeof *record->vars);
1204 for (i = 0; i < record->n_vars; i++)
1205 if (!read_int (r, &record->vars[i]))
1206 return false;
1207
1208 return true;
1209 }
1210
1211 /* Reads a document record from R. Returns true if successful, false on
1212 error. */
1213 static bool
read_document_record(struct sfm_reader * r)1214 read_document_record (struct sfm_reader *r)
1215 {
1216 int n_lines;
1217 if (!read_int (r, &n_lines))
1218 return false;
1219 else if (n_lines == 0)
1220 return true;
1221 else if (n_lines < 0 || n_lines >= INT_MAX / DOC_LINE_LENGTH)
1222 {
1223 sys_error (r, r->pos,
1224 _("Number of document lines (%d) "
1225 "must be greater than 0 and less than %d."),
1226 n_lines, INT_MAX / DOC_LINE_LENGTH);
1227 return false;
1228 }
1229
1230 struct sfm_document_record *record;
1231 record = pool_malloc (r->pool, sizeof *record);
1232 record->pos = r->pos;
1233 record->n_lines = n_lines;
1234 record->documents = pool_malloc (r->pool, DOC_LINE_LENGTH * n_lines);
1235 if (!read_bytes (r, record->documents, DOC_LINE_LENGTH * n_lines))
1236 return false;
1237
1238 r->document = record;
1239 return true;
1240 }
1241
1242 static bool
read_extension_record_header(struct sfm_reader * r,int subtype,struct sfm_extension_record * record)1243 read_extension_record_header (struct sfm_reader *r, int subtype,
1244 struct sfm_extension_record *record)
1245 {
1246 record->subtype = subtype;
1247 record->pos = r->pos;
1248 if (!read_uint (r, &record->size) || !read_uint (r, &record->count))
1249 return false;
1250
1251 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
1252 allows an extra byte for a null terminator, used by some
1253 extension processing routines. */
1254 if (record->size != 0
1255 && xsum (1, xtimes (record->count, record->size)) >= UINT_MAX)
1256 {
1257 sys_error (r, record->pos, "Record type 7 subtype %d too large.",
1258 subtype);
1259 return false;
1260 }
1261
1262 return true;
1263 }
1264
1265 /* Reads an extension record from R into RECORD. */
1266 static bool
read_extension_record(struct sfm_reader * r,int subtype,struct sfm_extension_record ** recordp)1267 read_extension_record (struct sfm_reader *r, int subtype,
1268 struct sfm_extension_record **recordp)
1269 {
1270 struct extension_record_type
1271 {
1272 int subtype;
1273 int size;
1274 int count;
1275 };
1276
1277 static const struct extension_record_type types[] =
1278 {
1279 /* Implemented record types. */
1280 { EXT_INTEGER, 4, 8 },
1281 { EXT_FLOAT, 8, 3 },
1282 { EXT_MRSETS, 1, 0 },
1283 { EXT_PRODUCT_INFO, 1, 0 },
1284 { EXT_DISPLAY, 4, 0 },
1285 { EXT_LONG_NAMES, 1, 0 },
1286 { EXT_LONG_STRINGS, 1, 0 },
1287 { EXT_NCASES, 8, 2 },
1288 { EXT_FILE_ATTRS, 1, 0 },
1289 { EXT_VAR_ATTRS, 1, 0 },
1290 { EXT_MRSETS2, 1, 0 },
1291 { EXT_ENCODING, 1, 0 },
1292 { EXT_LONG_LABELS, 1, 0 },
1293 { EXT_LONG_MISSING, 1, 0 },
1294
1295 /* Ignored record types. */
1296 { EXT_VAR_SETS, 0, 0 },
1297 { EXT_DATE, 0, 0 },
1298 { EXT_DATA_ENTRY, 0, 0 },
1299 { EXT_DATAVIEW, 0, 0 },
1300 };
1301
1302 const struct extension_record_type *type;
1303 struct sfm_extension_record *record;
1304 size_t n_bytes;
1305
1306 *recordp = NULL;
1307 record = pool_malloc (r->pool, sizeof *record);
1308 if (!read_extension_record_header (r, subtype, record))
1309 return false;
1310 n_bytes = record->count * record->size;
1311
1312 for (type = types; type < &types[sizeof types / sizeof *types]; type++)
1313 if (subtype == type->subtype)
1314 {
1315 if (type->size > 0 && record->size != type->size)
1316 sys_warn (r, record->pos,
1317 _("Record type 7, subtype %d has bad size %u "
1318 "(expected %d)."), subtype, record->size, type->size);
1319 else if (type->count > 0 && record->count != type->count)
1320 sys_warn (r, record->pos,
1321 _("Record type 7, subtype %d has bad count %u "
1322 "(expected %d)."), subtype, record->count, type->count);
1323 else if (type->count == 0 && type->size == 0)
1324 {
1325 /* Ignore this record. */
1326 }
1327 else
1328 {
1329 char *data = pool_malloc (r->pool, n_bytes + 1);
1330 data[n_bytes] = '\0';
1331
1332 record->data = data;
1333 if (!read_bytes (r, record->data, n_bytes))
1334 return false;
1335 *recordp = record;
1336 return true;
1337 }
1338
1339 goto skip;
1340 }
1341
1342 sys_warn (r, record->pos,
1343 _("Unrecognized record type 7, subtype %d. For help, please "
1344 "send this file to %s and mention that you were using %s."),
1345 subtype, PACKAGE_BUGREPORT, PACKAGE_STRING);
1346
1347 skip:
1348 return skip_bytes (r, n_bytes);
1349 }
1350
1351 static bool
skip_extension_record(struct sfm_reader * r,int subtype)1352 skip_extension_record (struct sfm_reader *r, int subtype)
1353 {
1354 struct sfm_extension_record record;
1355
1356 return (read_extension_record_header (r, subtype, &record)
1357 && skip_bytes (r, record.count * record.size));
1358 }
1359
1360 static void
parse_header(struct sfm_reader * r,const struct sfm_header_record * header,struct any_read_info * info,struct dictionary * dict)1361 parse_header (struct sfm_reader *r, const struct sfm_header_record *header,
1362 struct any_read_info *info, struct dictionary *dict)
1363 {
1364 const char *dict_encoding = dict_get_encoding (dict);
1365 struct substring product;
1366 struct substring label;
1367 char *fixed_label;
1368
1369 /* Convert file label to UTF-8 and put it into DICT. */
1370 label = recode_substring_pool ("UTF-8", dict_encoding,
1371 ss_cstr (header->file_label), r->pool);
1372 ss_trim (&label, ss_cstr (" "));
1373 label.string[label.length] = '\0';
1374 fixed_label = fix_line_ends (label.string);
1375 dict_set_label (dict, fixed_label);
1376 free (fixed_label);
1377
1378 /* Put creation date and time in UTF-8 into INFO. */
1379 info->creation_date = recode_string ("UTF-8", dict_encoding,
1380 header->creation_date, -1);
1381 info->creation_time = recode_string ("UTF-8", dict_encoding,
1382 header->creation_time, -1);
1383
1384 /* Put product name into INFO, dropping eye-catcher string if present. */
1385 product = recode_substring_pool ("UTF-8", dict_encoding,
1386 ss_cstr (header->eye_catcher), r->pool);
1387 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
1388 ss_trim (&product, ss_cstr (" "));
1389 info->product = ss_xstrdup (product);
1390 }
1391
/* Adds a new variable of the given WIDTH to DICT under a name generated by
   dict_make_unique_var_name(), and returns the new variable. */
static struct variable *
add_var_with_generated_name (struct dictionary *dict, int width)
{
  char *generated = dict_make_unique_var_name (dict, NULL, NULL);
  struct variable *new_var = dict_create_var_assert (dict, generated, width);

  /* The temporary name string is no longer needed after creation. */
  free (generated);
  return new_var;
}
1400
/* Creates a variable in DICT for each variable (type 2) record among the
   N_VAR_RECS records in VAR_RECS, which must already have been read by
   read_variable_record().  For each one, sets the variable's name, short
   name, label, missing values, and print and write formats, and stores the
   new variable in the record's `var' member.

   The continuation records (width -1) that follow a long string variable's
   record are checked for and skipped.

   Returns true if successful, false if a record has a bad width or a string
   continuation record is missing. */
static bool
parse_variable_records (struct sfm_reader *r, struct dictionary *dict,
                        struct sfm_var_record *var_recs, size_t n_var_recs)
{
  const char *dict_encoding = dict_get_encoding (dict);
  struct sfm_var_record *rec;
  int n_warnings = 0;           /* Invalid-format warnings issued so far. */

  /* REC is advanced at the bottom of the loop, by the number of records
     that the current variable occupies. */
  for (rec = var_recs; rec < &var_recs[n_var_recs];)
    {
      size_t n_values;
      char *name;
      size_t i;

      /* Recode the name to UTF-8 and cut it off at the first space. */
      name = recode_string_pool ("UTF-8", dict_encoding,
                                 rec->name, -1, r->pool);
      name[strcspn (name, " ")] = '\0';

      if (rec->width < 0 || rec->width > 255)
        {
          sys_error (r, rec->pos,
                     _("Bad width %d for variable %s."), rec->width, name);
          return false;
        }

      /* Create the variable, substituting a generated name if the one in
         the file is invalid, starts with `$' or `#', or cannot be used
         because dict_create_var() fails (reported as a duplicate). */
      struct variable *var;
      if (!dict_id_is_valid (dict, name, false)
          || name[0] == '$' || name[0] == '#')
        {
          var = add_var_with_generated_name (dict, rec->width);
          sys_warn (r, rec->pos, _("Renaming variable with invalid name "
                                   "`%s' to `%s'."), name, var_get_name (var));
        }
      else
        {
          var = dict_create_var (dict, name, rec->width);
          if (var == NULL)
            {
              var = add_var_with_generated_name (dict, rec->width);
              sys_warn (r, rec->pos, _("Renaming variable with duplicate name "
                                       "`%s' to `%s'."),
                        name, var_get_name (var));
            }
        }
      rec->var = var;

      /* Set the short name the same as the long name (even if we renamed
         it). */
      var_set_short_name (var, 0, var_get_name (var));

      /* Get variable label, if any. */
      if (rec->label)
        {
          char *utf8_label;

          utf8_label = recode_string_pool ("UTF-8", dict_encoding,
                                           rec->label, -1, r->pool);
          var_set_label (var, utf8_label);
        }

      /* Set missing values. */
      if (rec->missing_value_code != 0)
        {
          int width = var_get_width (var);
          struct missing_values mv;

          mv_init_pool (r->pool, &mv, width);
          if (var_is_numeric (var))
            {
              /* A negative code means that the first two values form a
                 range.  With a range, code -3 means one discrete value
                 follows it and any other negative code means none;
                 without a range, the code itself is the number of discrete
                 values. */
              bool has_range = rec->missing_value_code < 0;
              int n_discrete = (has_range
                                ? rec->missing_value_code == -3
                                : rec->missing_value_code);
              int ofs = 0;      /* Byte offset into rec->missing. */

              if (has_range)
                {
                  double low = parse_float (r, rec->missing, 0);
                  double high = parse_float (r, rec->missing, 8);

                  /* Deal with SPSS 21 change in representation. */
                  if (low == SYSMIS)
                    low = LOWEST;

                  mv_add_range (&mv, low, high);
                  ofs += 16;
                }

              for (i = 0; i < n_discrete; i++)
                {
                  mv_add_num (&mv, parse_float (r, rec->missing, ofs));
                  ofs += 8;
                }
            }
          else
            /* String variable: each missing value occupies 8 bytes, of
               which at most WIDTH are used. */
            for (i = 0; i < rec->missing_value_code; i++)
              mv_add_str (&mv, rec->missing + 8 * i, MIN (width, 8));
          var_set_missing_values (var, &mv);
        }

      /* Set formats.  The positions passed are REC's position plus 12 and
         16, for use in any warning about the print or write format. */
      parse_format_spec (r, rec->pos + 12, rec->print_format,
                         PRINT_FORMAT, var, &n_warnings);
      parse_format_spec (r, rec->pos + 16, rec->write_format,
                         WRITE_FORMAT, var, &n_warnings);

      /* Account for values.
         Skip long string continuation records, if any.  A variable takes
         one record per 8 bytes of width (numeric variables take one), and
         every record after the first must have width -1. */
      n_values = rec->width == 0 ? 1 : DIV_RND_UP (rec->width, 8);
      for (i = 1; i < n_values; i++)
        if (i + (rec - var_recs) >= n_var_recs || rec[i].width != -1)
          {
            sys_error (r, rec->pos, _("Missing string continuation record."));
            return false;
          }
      rec += n_values;
    }

  return true;
}
1525
1526 /* Translates the format spec from sysfile format to internal
1527 format. */
1528 static void
parse_format_spec(struct sfm_reader * r,off_t pos,unsigned int format,enum which_format which,struct variable * v,int * n_warnings)1529 parse_format_spec (struct sfm_reader *r, off_t pos, unsigned int format,
1530 enum which_format which, struct variable *v,
1531 int *n_warnings)
1532 {
1533 const int max_warnings = 8;
1534 struct fmt_spec f;
1535
1536 if (fmt_from_u32 (format, var_get_width (v), false, &f))
1537 {
1538 if (which == PRINT_FORMAT)
1539 var_set_print_format (v, &f);
1540 else
1541 var_set_write_format (v, &f);
1542 }
1543 else if (format == 0)
1544 {
1545 /* Actually observed in the wild. No point in warning about it. */
1546 }
1547 else if (++*n_warnings <= max_warnings)
1548 {
1549 if (which == PRINT_FORMAT)
1550 sys_warn (r, pos, _("Variable %s with width %d has invalid print "
1551 "format 0x%x."),
1552 var_get_name (v), var_get_width (v), format);
1553 else
1554 sys_warn (r, pos, _("Variable %s with width %d has invalid write "
1555 "format 0x%x."),
1556 var_get_name (v), var_get_width (v), format);
1557
1558 if (*n_warnings == max_warnings)
1559 sys_warn (r, -1, _("Suppressing further invalid format warnings."));
1560 }
1561 }
1562
1563 static void
parse_document(struct dictionary * dict,struct sfm_document_record * record)1564 parse_document (struct dictionary *dict, struct sfm_document_record *record)
1565 {
1566 const char *p;
1567
1568 for (p = record->documents;
1569 p < record->documents + DOC_LINE_LENGTH * record->n_lines;
1570 p += DOC_LINE_LENGTH)
1571 {
1572 struct substring line;
1573
1574 line = recode_substring_pool ("UTF-8", dict_get_encoding (dict),
1575 ss_buffer (p, DOC_LINE_LENGTH), NULL);
1576 ss_rtrim (&line, ss_cstr (" "));
1577 line.string[line.length] = '\0';
1578
1579 dict_add_document_line (dict, line.string, false);
1580
1581 ss_dealloc (&line);
1582 }
1583 }
1584
1585 /* Parses record type 7, subtype 3. */
1586 static bool
parse_machine_integer_info(struct sfm_reader * r,const struct sfm_extension_record * record,struct any_read_info * info)1587 parse_machine_integer_info (struct sfm_reader *r,
1588 const struct sfm_extension_record *record,
1589 struct any_read_info *info)
1590 {
1591 int float_representation, expected_float_format;
1592 int integer_representation, expected_integer_format;
1593
1594 /* Save version info. */
1595 info->version_major = parse_int (r, record->data, 0);
1596 info->version_minor = parse_int (r, record->data, 4);
1597 info->version_revision = parse_int (r, record->data, 8);
1598
1599 /* Check floating point format. */
1600 float_representation = parse_int (r, record->data, 16);
1601 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
1602 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
1603 expected_float_format = 1;
1604 else if (r->float_format == FLOAT_Z_LONG)
1605 expected_float_format = 2;
1606 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
1607 expected_float_format = 3;
1608 else
1609 NOT_REACHED ();
1610 if (float_representation != expected_float_format)
1611 {
1612 sys_error (r, record->pos,
1613 _("Floating-point representation indicated by "
1614 "system file (%d) differs from expected (%d)."),
1615 float_representation, expected_float_format);
1616 return false;
1617 }
1618
1619 /* Check integer format. */
1620 integer_representation = parse_int (r, record->data, 24);
1621 if (r->integer_format == INTEGER_MSB_FIRST)
1622 expected_integer_format = 1;
1623 else if (r->integer_format == INTEGER_LSB_FIRST)
1624 expected_integer_format = 2;
1625 else
1626 NOT_REACHED ();
1627 if (integer_representation != expected_integer_format)
1628 sys_warn (r, record->pos,
1629 _("Integer format indicated by system file (%d) "
1630 "differs from expected (%d)."),
1631 integer_representation, expected_integer_format);
1632
1633 return true;
1634 }
1635
1636 /* Parses record type 7, subtype 4. */
1637 static void
parse_machine_float_info(struct sfm_reader * r,const struct sfm_extension_record * record)1638 parse_machine_float_info (struct sfm_reader *r,
1639 const struct sfm_extension_record *record)
1640 {
1641 double sysmis = parse_float (r, record->data, 0);
1642 double highest = parse_float (r, record->data, 8);
1643 double lowest = parse_float (r, record->data, 16);
1644
1645 if (sysmis != SYSMIS)
1646 sys_warn (r, record->pos,
1647 _("File specifies unexpected value %g (%a) as %s, "
1648 "instead of %g (%a)."),
1649 sysmis, sysmis, "SYSMIS", SYSMIS, SYSMIS);
1650
1651 if (highest != HIGHEST)
1652 sys_warn (r, record->pos,
1653 _("File specifies unexpected value %g (%a) as %s, "
1654 "instead of %g (%a)."),
1655 highest, highest, "HIGHEST", HIGHEST, HIGHEST);
1656
1657 /* SPSS before version 21 used a unique value just bigger than SYSMIS as
1658 LOWEST. SPSS 21 uses SYSMIS for LOWEST, which is OK because LOWEST only
1659 appears in a context (missing values) where SYSMIS cannot. */
1660 if (lowest != LOWEST && lowest != SYSMIS)
1661 sys_warn (r, record->pos,
1662 _("File specifies unexpected value %g (%a) as %s, "
1663 "instead of %g (%a) or %g (%a)."),
1664 lowest, lowest, "LOWEST", LOWEST, LOWEST, SYSMIS, SYSMIS);
1665 }
1666
1667 /* Parses record type 7, subtype 10. */
1668 static void
parse_extra_product_info(struct sfm_reader * r,const struct sfm_extension_record * record,struct any_read_info * info)1669 parse_extra_product_info (struct sfm_reader *r,
1670 const struct sfm_extension_record *record,
1671 struct any_read_info *info)
1672 {
1673 struct text_record *text;
1674
1675 text = open_text_record (r, record, true);
1676 info->product_ext = fix_line_ends (text_get_all (text));
1677 close_text_record (r, text);
1678 }
1679
/* Parses record type 7, subtype 7 or 19.

   Reads multiple response set definitions from the text in RECORD, one per
   line, and appends each complete definition to R's `mrsets' array, growing
   it as needed (*ALLOCATED_MRSETS tracks its allocated size).  The strings
   are stored as they appear in the record; decode_mrsets() later recodes
   them and adds the sets to the dictionary.

   Each line has the form NAME=TYPE..., where TYPE is `C', `D', or `E',
   followed by a counted string for the counted value (for `D' and `E'
   only), a counted string for the label, and then variable names separated
   by spaces.  Parsing stops at the end of the text or, after a warning, at
   the first syntax error that makes a definition unusable; a definition
   that is abandoned this way is not counted in R's `n_mrsets'. */
static void
parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record,
              size_t *allocated_mrsets)
{
  struct text_record *text;

  text = open_text_record (r, record, false);
  for (;;)
    {
      struct sfm_mrset *mrset = NULL;
      size_t allocated_vars = 0;
      /* Initial value only needs to differ from '\n'. */
      char delimiter = '4';

      /* Skip extra line feeds if present. */
      while (text_match (text, '\n'))
        continue;

      /* Claim the next slot in the array, but don't bump n_mrsets until the
         whole definition has been parsed. */
      if (r->n_mrsets >= *allocated_mrsets)
        r->mrsets = pool_2nrealloc (r->pool, r->mrsets, allocated_mrsets,
                                    sizeof *r->mrsets);
      mrset = &r->mrsets[r->n_mrsets];
      memset(mrset, 0, sizeof *mrset);

      /* Set name.  A missing name is how the end of the text is detected. */
      mrset->name = text_get_token (text, ss_cstr ("="), NULL);
      if (mrset->name == NULL)
        break;

      /* Set type. */
      if (text_match (text, 'C'))
        {
          mrset->type = MRSET_MC;
          if (!text_match (text, ' '))
            {
              sys_warn (r, record->pos,
                        _("Missing space following `%c' at offset %zu "
                          "in MRSETS record."), 'C', text_pos (text));
              break;
            }
        }
      else if (text_match (text, 'D'))
        {
          mrset->type = MRSET_MD;
          mrset->cat_source = MRSET_VARLABELS;
        }
      else if (text_match (text, 'E'))
        {
          char *number;

          mrset->type = MRSET_MD;
          mrset->cat_source = MRSET_COUNTEDVALUES;
          if (!text_match (text, ' '))
            {
              sys_warn (r, record->pos,
                        _("Missing space following `%c' at offset %zu "
                          "in MRSETS record."), 'E', text_pos (text));
              break;
            }

          /* Label source: "11" takes the set's label from a variable
             label, "1" is accepted without further effect, and anything
             else only draws a warning. */
          number = text_get_token (text, ss_cstr (" "), NULL);
          if (!number)
            sys_warn (r, record->pos,
                      _("Missing label source value "
                        "following `E' at offset %zu in MRSETS record."),
                      text_pos (text));
          else if (!strcmp (number, "11"))
            mrset->label_from_var_label = true;
          else if (strcmp (number, "1"))
            sys_warn (r, record->pos,
                      _("Unexpected label source value following `E' "
                        "at offset %zu in MRSETS record."),
                      text_pos (text));
        }
      else
        {
          sys_warn (r, record->pos,
                    _("Missing `C', `D', or `E' at offset %zu "
                      "in MRSETS record."),
                    text_pos (text));
          break;
        }

      /* Counted value, for multiple dichotomy sets only. */
      if (mrset->type == MRSET_MD)
        {
          mrset->counted = text_parse_counted_string (r, text);
          if (mrset->counted == NULL)
            break;
        }

      /* Set label. */
      mrset->label = text_parse_counted_string (r, text);
      if (mrset->label == NULL)
        break;

      /* Variable names, up to the end of the line. */
      allocated_vars = 0;
      do
        {
          const char *var;

          var = text_get_token (text, ss_cstr (" \n"), &delimiter);
          if (var == NULL)
            {
              if (delimiter != '\n')
                sys_warn (r, record->pos,
                          _("Missing new-line parsing variable names "
                            "at offset %zu in MRSETS record."),
                          text_pos (text));
              break;
            }

          if (mrset->n_vars >= allocated_vars)
            mrset->vars = pool_2nrealloc (r->pool, mrset->vars,
                                          &allocated_vars,
                                          sizeof *mrset->vars);
          mrset->vars[mrset->n_vars++] = var;
        }
      while (delimiter != '\n');

      /* The definition is complete, so keep it. */
      r->n_mrsets++;
    }
  close_text_record (r, text);
}
1800
/* Converts each of the multiple response sets that parse_mrsets() stored in
   R into a `struct mrset' and adds it to DICT.  Names and labels are recoded
   from R's encoding to UTF-8, and variable names are looked up in DICT.

   A set is dropped, with a warning, if its name is invalid or if fewer than
   two of its variables remain after skipping variables that are not in
   DICT, that are duplicates, or whose type differs from that of the set's
   first variable. */
static void
decode_mrsets (struct sfm_reader *r, struct dictionary *dict)
{
  const struct sfm_mrset *s;

  for (s = r->mrsets; s < &r->mrsets[r->n_mrsets]; s++)
    {
      struct stringi_set var_names;
      struct mrset *mrset;
      char *name;
      int width;
      size_t i;

      name = recode_string ("UTF-8", r->encoding, s->name, -1);
      if (!mrset_is_valid_name (name, dict_get_encoding (dict), false))
        {
          sys_warn (r, -1, _("Invalid multiple response set name `%s'."),
                    name);
          free (name);
          continue;
        }

      /* From here on NAME belongs to MRSET. */
      mrset = xzalloc (sizeof *mrset);
      mrset->name = name;
      mrset->type = s->type;
      mrset->cat_source = s->cat_source;
      mrset->label_from_var_label = s->label_from_var_label;
      if (s->label[0] != '\0')
        mrset->label = recode_string ("UTF-8", r->encoding, s->label, -1);

      /* VAR_NAMES is used to detect variables that are listed twice.
         WIDTH becomes the narrowest width among the accepted variables. */
      stringi_set_init (&var_names);
      mrset->vars = xmalloc (s->n_vars * sizeof *mrset->vars);
      width = INT_MAX;
      for (i = 0; i < s->n_vars; i++)
        {
          struct variable *var;
          char *var_name;

          var_name = recode_string ("UTF-8", r->encoding, s->vars[i], -1);

          /* Variables that are not in DICT are skipped without a
             warning. */
          var = dict_lookup_var (dict, var_name);
          if (var == NULL)
            {
              free (var_name);
              continue;
            }
          if (!stringi_set_insert (&var_names, var_name))
            {
              sys_warn (r, -1,
                        _("MRSET %s contains duplicate variable name %s."),
                        mrset->name, var_name);
              free (var_name);
              continue;
            }
          free (var_name);

          /* If the set has no label of its own and asks for one from a
             variable label, use the label of the first variable that has
             one. */
          if (mrset->label == NULL && mrset->label_from_var_label
              && var_has_label (var))
            mrset->label = xstrdup (var_get_label (var));

          /* All of the variables must have the same type as the first one
             accepted. */
          if (mrset->n_vars
              && var_get_type (var) != var_get_type (mrset->vars[0]))
            {
              sys_warn (r, -1,
                        _("MRSET %s contains both string and "
                          "numeric variables."), mrset->name);
              continue;
            }
          width = MIN (width, var_get_width (var));

          mrset->vars[mrset->n_vars++] = var;
        }

      if (mrset->n_vars < 2)
        {
          if (mrset->n_vars == 0)
            sys_warn (r, -1, _("MRSET %s has no variables."), mrset->name);
          else
            sys_warn (r, -1, _("MRSET %s has only one variable."),
                      mrset->name);
          mrset_destroy (mrset);
          stringi_set_destroy (&var_names);
          continue;
        }

      /* For a multiple dichotomy set, convert the counted value: parsed as
         a number when the width is 0, otherwise copied as a string padded
         on the right with spaces to WIDTH. */
      if (mrset->type == MRSET_MD)
        {
          mrset->width = width;
          value_init (&mrset->counted, width);
          if (width == 0)
            mrset->counted.f = c_strtod (s->counted, NULL);
          else
            value_copy_str_rpad (&mrset->counted, width,
                                 (const uint8_t *) s->counted, ' ');
        }

      dict_add_mrset (dict, mrset);
      stringi_set_destroy (&var_names);
    }
}
1901
1902 /* Read record type 7, subtype 11, which specifies how variables
1903 should be displayed in GUI environments. */
1904 static void
parse_display_parameters(struct sfm_reader * r,const struct sfm_extension_record * record,struct dictionary * dict)1905 parse_display_parameters (struct sfm_reader *r,
1906 const struct sfm_extension_record *record,
1907 struct dictionary *dict)
1908 {
1909 bool includes_width;
1910 bool warned = false;
1911 size_t n_vars;
1912 size_t ofs;
1913 size_t i;
1914
1915 n_vars = dict_get_var_cnt (dict);
1916 if (record->count == 3 * n_vars)
1917 includes_width = true;
1918 else if (record->count == 2 * n_vars)
1919 includes_width = false;
1920 else
1921 {
1922 sys_warn (r, record->pos,
1923 _("Extension 11 has bad count %u (for %zu variables)."),
1924 record->count, n_vars);
1925 return;
1926 }
1927
1928 ofs = 0;
1929 for (i = 0; i < n_vars; ++i)
1930 {
1931 struct variable *v = dict_get_var (dict, i);
1932 int measure, width, align;
1933
1934 measure = parse_int (r, record->data, ofs);
1935 ofs += 4;
1936
1937 if (includes_width)
1938 {
1939 width = parse_int (r, record->data, ofs);
1940 ofs += 4;
1941 }
1942 else
1943 width = 0;
1944
1945 align = parse_int (r, record->data, ofs);
1946 ofs += 4;
1947
1948 /* SPSS sometimes seems to set variables' measure to zero. */
1949 if (0 == measure)
1950 measure = 1;
1951
1952 if (measure < 1 || measure > 3 || align < 0 || align > 2)
1953 {
1954 if (!warned)
1955 sys_warn (r, record->pos,
1956 _("Invalid variable display parameters for variable "
1957 "%zu (%s). Default parameters substituted."),
1958 i, var_get_name (v));
1959 warned = true;
1960 continue;
1961 }
1962
1963 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
1964 : measure == 2 ? MEASURE_ORDINAL
1965 : MEASURE_SCALE));
1966 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
1967 : align == 1 ? ALIGN_RIGHT
1968 : ALIGN_CENTRE));
1969
1970 /* Older versions (SPSS 9.0) sometimes set the display
1971 width to zero. This causes confusion in the GUI, so
1972 only set the width if it is nonzero. */
1973 if (width > 0)
1974 var_set_display_width (v, width);
1975 }
1976 }
1977
/* Renames VAR within DICT to NEW_NAME, keeping VAR's short names intact.

   Renaming a variable may clear its short names, so they are copied
   beforehand and put back afterward.  If the rename is refused, a
   warning is issued at file offset POS. */
static void
rename_var_and_save_short_names (struct sfm_reader *r, off_t pos,
                                 struct dictionary *dict,
                                 struct variable *var, const char *new_name)
{
  /* Take private copies of the short names. */
  size_t n_saved = var_get_short_name_cnt (var);
  char **saved = xnmalloc (n_saved, sizeof *saved);
  for (size_t k = 0; k < n_saved; k++)
    {
      const char *short_name = var_get_short_name (var, k);
      if (short_name != NULL)
        saved[k] = xstrdup (short_name);
      else
        saved[k] = NULL;
    }

  /* Set long name. */
  if (!dict_try_rename_var (dict, var, new_name))
    sys_warn (r, pos, _("Duplicate long variable name `%s'."), new_name);

  /* Put the short names back and release the copies. */
  for (size_t k = 0; k < n_saved; k++)
    {
      var_set_short_name (var, k, saved[k]);
      free (saved[k]);
    }
  free (saved);
}
2010
2011 /* Parses record type 7, subtype 13, which gives the long name that corresponds
2012 to each short name. Modifies variable names in DICT accordingly. */
2013 static void
parse_long_var_name_map(struct sfm_reader * r,const struct sfm_extension_record * record,struct dictionary * dict)2014 parse_long_var_name_map (struct sfm_reader *r,
2015 const struct sfm_extension_record *record,
2016 struct dictionary *dict)
2017 {
2018 struct text_record *text;
2019 struct variable *var;
2020 char *long_name;
2021
2022 if (record == NULL)
2023 {
2024 /* There are no long variable names. Use the short variable names,
2025 converted to lowercase, as the long variable names. */
2026 size_t i;
2027
2028 for (i = 0; i < dict_get_var_cnt (dict); i++)
2029 {
2030 struct variable *var = dict_get_var (dict, i);
2031 char *new_name;
2032
2033 new_name = utf8_to_lower (var_get_name (var));
2034 rename_var_and_save_short_names (r, -1, dict, var, new_name);
2035 free (new_name);
2036 }
2037
2038 return;
2039 }
2040
2041 /* Rename each of the variables, one by one. (In a correctly constructed
2042 system file, this cannot create any intermediate duplicate variable names,
2043 because all of the new variable names are longer than any of the old
2044 variable names and thus there cannot be any overlaps.) */
2045 text = open_text_record (r, record, true);
2046 while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
2047 {
2048 /* Validate long name. */
2049 if (!dict_id_is_valid (dict, long_name, false)
2050 || long_name[0] == '$' || long_name[0] == '#')
2051 {
2052 sys_warn (r, record->pos,
2053 _("Long variable mapping from %s to invalid "
2054 "variable name `%s'."),
2055 var_get_name (var), long_name);
2056 continue;
2057 }
2058
2059 rename_var_and_save_short_names (r, record->pos, dict, var, long_name);
2060 }
2061 close_text_record (r, text);
2062 }
2063
2064 /* Reads record type 7, subtype 14, which gives the real length
2065 of each very long string. Rearranges DICT accordingly. */
2066 static bool
parse_long_string_map(struct sfm_reader * r,const struct sfm_extension_record * record,struct dictionary * dict)2067 parse_long_string_map (struct sfm_reader *r,
2068 const struct sfm_extension_record *record,
2069 struct dictionary *dict)
2070 {
2071 struct text_record *text;
2072 struct variable *var;
2073 char *length_s;
2074
2075 text = open_text_record (r, record, true);
2076 while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
2077 {
2078 size_t idx = var_get_dict_index (var);
2079 long int length;
2080 int segment_cnt;
2081 int i;
2082
2083 /* Get length. */
2084 length = strtol (length_s, NULL, 10);
2085 if (length < 1 || length > MAX_STRING)
2086 {
2087 sys_warn (r, record->pos,
2088 _("%s listed as string of invalid length %s "
2089 "in very long string record."),
2090 var_get_name (var), length_s);
2091 continue;
2092 }
2093
2094 /* Check segments. */
2095 segment_cnt = sfm_width_to_segments (length);
2096 if (segment_cnt == 1)
2097 {
2098 sys_warn (r, record->pos,
2099 _("%s listed in very long string record with width %s, "
2100 "which requires only one segment."),
2101 var_get_name (var), length_s);
2102 continue;
2103 }
2104 if (idx + segment_cnt > dict_get_var_cnt (dict))
2105 {
2106 sys_error (r, record->pos,
2107 _("Very long string %s overflows dictionary."),
2108 var_get_name (var));
2109 return false;
2110 }
2111
2112 /* Get the short names from the segments and check their
2113 lengths. */
2114 for (i = 0; i < segment_cnt; i++)
2115 {
2116 struct variable *seg = dict_get_var (dict, idx + i);
2117 int alloc_width = sfm_segment_alloc_width (length, i);
2118 int width = var_get_width (seg);
2119
2120 if (i > 0)
2121 var_set_short_name (var, i, var_get_short_name (seg, 0));
2122 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
2123 {
2124 sys_error (r, record->pos,
2125 _("Very long string with width %ld has segment %d "
2126 "of width %d (expected %d)."),
2127 length, i, width, alloc_width);
2128 return false;
2129 }
2130 }
2131 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
2132 var_set_width (var, length);
2133 }
2134 close_text_record (r, text);
2135 dict_compact_values (dict);
2136
2137 return true;
2138 }
2139
2140 #define MAX_LABEL_WARNINGS 5
2141
2142 /* Displays a warning for offset OFFSET in the file. */
2143 static void
value_label_warning(struct sfm_reader * r,off_t offset,int * n_label_warnings,const char * format,...)2144 value_label_warning (struct sfm_reader *r, off_t offset, int *n_label_warnings,
2145 const char *format, ...)
2146 {
2147 if (++*n_label_warnings > MAX_LABEL_WARNINGS)
2148 return;
2149
2150 va_list args;
2151
2152 va_start (args, format);
2153 sys_msg (r, offset, MW, format, args);
2154 va_end (args);
2155 }
2156
2157 #define MAX_LABEL_WARNINGS 5
2158
2159 static void
parse_one_value_label_set(struct sfm_reader * r,struct dictionary * dict,const struct sfm_var_record * var_recs,size_t n_var_recs,const struct sfm_value_label_record * record,int * n_label_warnings)2160 parse_one_value_label_set (struct sfm_reader *r, struct dictionary *dict,
2161 const struct sfm_var_record *var_recs,
2162 size_t n_var_recs,
2163 const struct sfm_value_label_record *record,
2164 int *n_label_warnings)
2165 {
2166 char **utf8_labels
2167 = pool_nmalloc (r->pool, record->n_labels, sizeof *utf8_labels);
2168 for (size_t i = 0; i < record->n_labels; i++)
2169 utf8_labels[i] = recode_string_pool ("UTF-8", dict_get_encoding (dict),
2170 record->labels[i].label, -1,
2171 r->pool);
2172
2173 struct variable **vars = pool_nmalloc (r->pool,
2174 record->n_vars, sizeof *vars);
2175 unsigned int n_vars = 0;
2176 for (size_t i = 0; i < record->n_vars; i++)
2177 {
2178 int idx = record->vars[i];
2179 if (idx < 1 || idx > n_var_recs)
2180 {
2181 value_label_warning (
2182 r, record->pos, n_label_warnings,
2183 _("Value label variable index %d not in valid range 1...%zu."),
2184 idx, n_var_recs);
2185 continue;
2186 }
2187
2188 const struct sfm_var_record *rec = &var_recs[idx - 1];
2189 if (rec->var == NULL)
2190 {
2191 value_label_warning (
2192 r, record->pos, n_label_warnings,
2193 _("Value label variable index %d "
2194 "refers to long string continuation."), idx);
2195 continue;
2196 }
2197
2198 vars[n_vars++] = rec->var;
2199 }
2200 if (!n_vars)
2201 return;
2202
2203 for (size_t i = 1; i < n_vars; i++)
2204 if (var_get_type (vars[i]) != var_get_type (vars[0]))
2205 {
2206 value_label_warning (
2207 r, record->pos, n_label_warnings,
2208 _("Variables associated with value label are not all of "
2209 "identical type. Variable %s is %s, but variable "
2210 "%s is %s."),
2211 var_get_name (vars[0]),
2212 var_is_numeric (vars[0]) ? _("numeric") : _("string"),
2213 var_get_name (vars[i]),
2214 var_is_numeric (vars[i]) ? _("numeric") : _("string"));
2215 return;
2216 }
2217
2218 for (size_t i = 0; i < n_vars; i++)
2219 {
2220 struct variable *var = vars[i];
2221 int width = var_get_width (var);
2222 if (width > 8)
2223 {
2224 value_label_warning (
2225 r, record->pos, n_label_warnings,
2226 _("Value labels may not be added to long string "
2227 "variables (e.g. %s) using records types 3 and 4."),
2228 var_get_name (var));
2229 continue;
2230 }
2231
2232 for (size_t j = 0; j < record->n_labels; j++)
2233 {
2234 struct sfm_value_label *label = &record->labels[j];
2235 union value value;
2236
2237 value_init (&value, width);
2238 if (width == 0)
2239 value.f = parse_float (r, label->value, 0);
2240 else
2241 memcpy (value.s, label->value, width);
2242
2243 if (!var_add_value_label (var, &value, utf8_labels[j]))
2244 {
2245 if (r->written_by_readstat)
2246 {
2247 /* Ignore the problem. ReadStat is buggy and emits value
2248 labels whose values are longer than string variables'
2249 widths, that are identical in the actual width of the
2250 variable, e.g. both values "ABC123" and "ABC456" for a
2251 string variable with width 3. */
2252 }
2253 else if (var_is_numeric (var))
2254 value_label_warning (r, record->pos, n_label_warnings,
2255 _("Duplicate value label for %g on %s."),
2256 value.f, var_get_name (var));
2257 else
2258 value_label_warning (
2259 r, record->pos, n_label_warnings,
2260 _("Duplicate value label for `%.*s' on %s."),
2261 width, value.s, var_get_name (var));
2262 }
2263
2264 value_destroy (&value, width);
2265 }
2266 }
2267
2268 pool_free (r->pool, vars);
2269 for (size_t i = 0; i < record->n_labels; i++)
2270 pool_free (r->pool, utf8_labels[i]);
2271 pool_free (r->pool, utf8_labels);
2272 }
2273
2274 static void
parse_value_labels(struct sfm_reader * r,struct dictionary * dict)2275 parse_value_labels (struct sfm_reader *r, struct dictionary *dict)
2276 {
2277 int n_label_warnings = 0;
2278 for (size_t i = 0; i < r->n_labels; i++)
2279 parse_one_value_label_set (r, dict, r->vars, r->n_vars, &r->labels[i],
2280 &n_label_warnings);
2281 if (n_label_warnings > MAX_LABEL_WARNINGS)
2282 sys_warn (r, -1,
2283 _("Suppressed %d additional warnings for value labels."),
2284 n_label_warnings - MAX_LABEL_WARNINGS);
2285 }
2286
2287 static struct variable *
parse_weight_var(struct sfm_reader * r,const struct sfm_var_record * var_recs,size_t n_var_recs,int idx)2288 parse_weight_var (struct sfm_reader *r,
2289 const struct sfm_var_record *var_recs, size_t n_var_recs,
2290 int idx)
2291 {
2292 off_t offset = 76; /* Offset to variable index in header. */
2293
2294 if (idx < 1 || idx > n_var_recs)
2295 {
2296 sys_warn (r, offset,
2297 _("Weight variable index %d not in valid range 1...%zu. "
2298 "Treating file as unweighted."),
2299 idx, n_var_recs);
2300 return NULL;
2301 }
2302
2303 const struct sfm_var_record *rec = &var_recs[idx - 1];
2304 if (rec->var == NULL)
2305 {
2306 sys_warn (r, offset,
2307 _("Weight variable index %d refers to long string "
2308 "continuation. Treating file as unweighted."), idx);
2309 return NULL;
2310 }
2311
2312 struct variable *weight_var = rec->var;
2313 if (!var_is_numeric (weight_var))
2314 {
2315 sys_warn (r, offset, _("Ignoring string variable `%s' set "
2316 "as weighting variable."),
2317 var_get_name (weight_var));
2318 return NULL;
2319 }
2320
2321 return weight_var;
2322 }
2323
/* Parses a set of custom attributes from TEXT into ATTRS.  ATTRS may be
   a null pointer, in which case the attributes are read but discarded.

   Each attribute is a key that ends at `(', followed by one value per
   line and then `)'.  Parsing stops when no further key can be read or
   when a `/' follows an attribute. */
static void
parse_attributes (struct sfm_reader *r, struct text_record *text,
                  struct attrset *attrs)
{
  for (;;)
    {
      /* Parse the key. */
      char *key = text_get_token (text, ss_cstr ("("), NULL);
      if (key == NULL)
        return;

      struct attribute *attr = attribute_create (key);

      /* Parse the values, which are numbered from 1 in messages. */
      int index = 1;
      for (;;)
        {
          char *value = text_get_token (text, ss_cstr ("\n"), NULL);
          if (value == NULL)
            {
              text_warn (r, text, _("Error parsing attribute value %s[%d]."),
                         key, index);
              break;
            }

          size_t length = strlen (value);
          bool quoted = (length >= 2
                         && value[0] == '\''
                         && value[length - 1] == '\'');
          if (quoted)
            {
              /* Strip the surrounding quotes. */
              value[length - 1] = '\0';
              attribute_add_value (attr, value + 1);
            }
          else
            {
              text_warn (r, text,
                         _("Attribute value %s[%d] is not quoted: %s."),
                         key, index, value);
              attribute_add_value (attr, value);
            }

          /* Was this the last value for this attribute? */
          if (text_match (text, ')'))
            break;

          index++;
        }

      /* Keep the attribute only if there is somewhere to put it and it
         has at least one value. */
      bool keep = attrs != NULL && attribute_get_n_values (attr) > 0;
      if (!keep)
        attribute_destroy (attr);
      else if (!attrset_try_add (attrs, attr))
        {
          text_warn (r, text, _("Duplicate attribute %s."),
                     attribute_get_name (attr));
          attribute_destroy (attr);
        }

      if (text_match (text, '/'))
        break;
    }
}
2389
/* Parses record type 7, subtype 17, which lists custom attributes on
   the data file, and adds them to DICT's attribute set. */
static void
parse_data_file_attributes (struct sfm_reader *r,
                            const struct sfm_extension_record *record,
                            struct dictionary *dict)
{
  struct attrset *file_attrs = dict_get_attributes (dict);
  struct text_record *text = open_text_record (r, record, true);

  parse_attributes (r, text, file_attrs);
  close_text_record (r, text);
}
2401
/* Parses record type 7, subtype 18, which lists custom attributes on
   individual variables.  Each entry is a variable name that ends at `:',
   followed by that variable's attributes.  When the name lookup yields
   no variable, the attributes are still parsed, but discarded. */
static void
parse_variable_attributes (struct sfm_reader *r,
                           const struct sfm_extension_record *record,
                           struct dictionary *dict)
{
  struct text_record *text = open_text_record (r, record, true);
  struct variable *var;

  while (text_read_variable_name (r, dict, text, ss_cstr (":"), &var))
    {
      struct attrset *target = NULL;
      if (var != NULL)
        target = var_get_attributes (var);
      parse_attributes (r, text, target);
    }
  close_text_record (r, text);
}
2417
2418 static void
assign_variable_roles(struct sfm_reader * r,struct dictionary * dict)2419 assign_variable_roles (struct sfm_reader *r, struct dictionary *dict)
2420 {
2421 size_t n_warnings = 0;
2422 size_t i;
2423
2424 for (i = 0; i < dict_get_var_cnt (dict); i++)
2425 {
2426 struct variable *var = dict_get_var (dict, i);
2427 struct attrset *attrs = var_get_attributes (var);
2428 const struct attribute *attr = attrset_lookup (attrs, "$@Role");
2429 if (attr != NULL && attribute_get_n_values (attr) > 0)
2430 {
2431 int value = atoi (attribute_get_value (attr, 0));
2432 enum var_role role;
2433
2434 switch (value)
2435 {
2436 case 0:
2437 role = ROLE_INPUT;
2438 break;
2439
2440 case 1:
2441 role = ROLE_TARGET;
2442 break;
2443
2444 case 2:
2445 role = ROLE_BOTH;
2446 break;
2447
2448 case 3:
2449 role = ROLE_NONE;
2450 break;
2451
2452 case 4:
2453 role = ROLE_PARTITION;
2454 break;
2455
2456 case 5:
2457 role = ROLE_SPLIT;
2458 break;
2459
2460 default:
2461 role = ROLE_INPUT;
2462 if (n_warnings++ == 0)
2463 sys_warn (r, -1, _("Invalid role for variable %s."),
2464 var_get_name (var));
2465 }
2466
2467 var_set_role (var, role);
2468 }
2469 }
2470
2471 if (n_warnings > 1)
2472 sys_warn (r, -1, _("%zu other variables had invalid roles."),
2473 n_warnings - 1);
2474 }
2475
2476 static bool
check_overflow(struct sfm_reader * r,const struct sfm_extension_record * record,size_t ofs,size_t length)2477 check_overflow (struct sfm_reader *r,
2478 const struct sfm_extension_record *record,
2479 size_t ofs, size_t length)
2480 {
2481 size_t end = record->size * record->count;
2482 if (length >= end || ofs + length > end)
2483 {
2484 sys_warn (r, record->pos + end,
2485 _("Extension record subtype %d ends unexpectedly."),
2486 record->subtype);
2487 return false;
2488 }
2489 return true;
2490 }
2491
2492 static void
parse_long_string_value_labels(struct sfm_reader * r,const struct sfm_extension_record * record,struct dictionary * dict)2493 parse_long_string_value_labels (struct sfm_reader *r,
2494 const struct sfm_extension_record *record,
2495 struct dictionary *dict)
2496 {
2497 const char *dict_encoding = dict_get_encoding (dict);
2498 size_t end = record->size * record->count;
2499 size_t ofs = 0;
2500
2501 while (ofs < end)
2502 {
2503 char *var_name;
2504 size_t n_labels, i;
2505 struct variable *var;
2506 union value value;
2507 int var_name_len;
2508 int width;
2509
2510 /* Parse variable name length. */
2511 if (!check_overflow (r, record, ofs, 4))
2512 return;
2513 var_name_len = parse_int (r, record->data, ofs);
2514 ofs += 4;
2515
2516 /* Parse variable name, width, and number of labels. */
2517 if (!check_overflow (r, record, ofs, var_name_len)
2518 || !check_overflow (r, record, ofs, var_name_len + 8))
2519 return;
2520 var_name = recode_string_pool ("UTF-8", dict_encoding,
2521 (const char *) record->data + ofs,
2522 var_name_len, r->pool);
2523 width = parse_int (r, record->data, ofs + var_name_len);
2524 n_labels = parse_int (r, record->data, ofs + var_name_len + 4);
2525 ofs += var_name_len + 8;
2526
2527 /* Look up 'var' and validate. */
2528 var = dict_lookup_var (dict, var_name);
2529 if (var == NULL)
2530 sys_warn (r, record->pos + ofs,
2531 _("Ignoring long string value label record for "
2532 "unknown variable %s."), var_name);
2533 else if (var_is_numeric (var))
2534 {
2535 sys_warn (r, record->pos + ofs,
2536 _("Ignoring long string value label record for "
2537 "numeric variable %s."), var_name);
2538 var = NULL;
2539 }
2540 else if (width != var_get_width (var))
2541 {
2542 sys_warn (r, record->pos + ofs,
2543 _("Ignoring long string value label record for variable "
2544 "%s because the record's width (%d) does not match the "
2545 "variable's width (%d)."),
2546 var_name, width, var_get_width (var));
2547 var = NULL;
2548 }
2549
2550 /* Parse values. */
2551 value_init_pool (r->pool, &value, width);
2552 for (i = 0; i < n_labels; i++)
2553 {
2554 size_t value_length, label_length;
2555 bool skip = var == NULL;
2556
2557 /* Parse value length. */
2558 if (!check_overflow (r, record, ofs, 4))
2559 return;
2560 value_length = parse_int (r, record->data, ofs);
2561 ofs += 4;
2562
2563 /* Parse value. */
2564 if (!check_overflow (r, record, ofs, value_length))
2565 return;
2566 if (!skip)
2567 {
2568 if (value_length == width)
2569 memcpy (value.s, (const uint8_t *) record->data + ofs, width);
2570 else
2571 {
2572 sys_warn (r, record->pos + ofs,
2573 _("Ignoring long string value label %zu for "
2574 "variable %s, with width %d, that has bad value "
2575 "width %zu."),
2576 i, var_get_name (var), width, value_length);
2577 skip = true;
2578 }
2579 }
2580 ofs += value_length;
2581
2582 /* Parse label length. */
2583 if (!check_overflow (r, record, ofs, 4))
2584 return;
2585 label_length = parse_int (r, record->data, ofs);
2586 ofs += 4;
2587
2588 /* Parse label. */
2589 if (!check_overflow (r, record, ofs, label_length))
2590 return;
2591 if (!skip)
2592 {
2593 char *label;
2594
2595 label = recode_string_pool ("UTF-8", dict_encoding,
2596 (const char *) record->data + ofs,
2597 label_length, r->pool);
2598 if (!var_add_value_label (var, &value, label))
2599 sys_warn (r, record->pos + ofs,
2600 _("Duplicate value label for `%.*s' on %s."),
2601 width, value.s, var_get_name (var));
2602 pool_free (r->pool, label);
2603 }
2604 ofs += label_length;
2605 }
2606 }
2607 }
2608
2609 static void
parse_long_string_missing_values(struct sfm_reader * r,const struct sfm_extension_record * record,struct dictionary * dict)2610 parse_long_string_missing_values (struct sfm_reader *r,
2611 const struct sfm_extension_record *record,
2612 struct dictionary *dict)
2613 {
2614 const char *dict_encoding = dict_get_encoding (dict);
2615 size_t end = record->size * record->count;
2616 size_t ofs = 0;
2617
2618 while (ofs < end)
2619 {
2620 struct missing_values mv;
2621 char *var_name;
2622 struct variable *var;
2623 int n_missing_values;
2624 int var_name_len;
2625 size_t i;
2626
2627 /* Parse variable name length. */
2628 if (!check_overflow (r, record, ofs, 4))
2629 return;
2630 var_name_len = parse_int (r, record->data, ofs);
2631 ofs += 4;
2632
2633 /* Parse variable name. */
2634 if (!check_overflow (r, record, ofs, var_name_len)
2635 || !check_overflow (r, record, ofs, var_name_len + 1))
2636 return;
2637 var_name = recode_string_pool ("UTF-8", dict_encoding,
2638 (const char *) record->data + ofs,
2639 var_name_len, r->pool);
2640 ofs += var_name_len;
2641
2642 /* Parse number of missing values. */
2643 n_missing_values = ((const uint8_t *) record->data)[ofs];
2644 if (n_missing_values < 1 || n_missing_values > 3)
2645 sys_warn (r, record->pos + ofs,
2646 _("Long string missing values record says variable %s "
2647 "has %d missing values, but only 1 to 3 missing values "
2648 "are allowed."),
2649 var_name, n_missing_values);
2650 ofs++;
2651
2652 /* Look up 'var' and validate. */
2653 var = dict_lookup_var (dict, var_name);
2654 if (var == NULL)
2655 sys_warn (r, record->pos + ofs,
2656 _("Ignoring long string missing value record for "
2657 "unknown variable %s."), var_name);
2658 else if (var_is_numeric (var))
2659 {
2660 sys_warn (r, record->pos + ofs,
2661 _("Ignoring long string missing value record for "
2662 "numeric variable %s."), var_name);
2663 var = NULL;
2664 }
2665
2666 /* Parse values. */
2667 mv_init_pool (r->pool, &mv, var ? var_get_width (var) : 8);
2668 for (i = 0; i < n_missing_values; i++)
2669 {
2670 size_t value_length;
2671
2672 /* Parse value length. */
2673 if (!check_overflow (r, record, ofs, 4))
2674 return;
2675 value_length = parse_int (r, record->data, ofs);
2676 ofs += 4;
2677
2678 /* Parse value. */
2679 if (!check_overflow (r, record, ofs, value_length))
2680 return;
2681 if (var != NULL
2682 && i < 3
2683 && !mv_add_str (&mv, (const uint8_t *) record->data + ofs,
2684 value_length))
2685 sys_warn (r, record->pos + ofs,
2686 _("Ignoring long string missing value %zu for variable "
2687 "%s, with width %d, that has bad value width %zu."),
2688 i, var_get_name (var), var_get_width (var),
2689 value_length);
2690 ofs += value_length;
2691 }
2692 if (var != NULL)
2693 var_set_missing_values (var, &mv);
2694 }
2695 }
2696
/* Case reader. */

/* Error reporting helpers. */
static void partial_record (struct sfm_reader *r);
static void read_error (struct casereader *r, const struct sfm_reader *sfm);

/* Readers for individual values. */
static bool read_case_number (struct sfm_reader *r, double *d);
static int read_case_string (struct sfm_reader *r, uint8_t *s, size_t length);

/* Decoders for compressed data. */
static int read_opcode (struct sfm_reader *r);
static bool read_compressed_number (struct sfm_reader *r, double *d);
static int read_compressed_string (struct sfm_reader *r, uint8_t *dst);

/* Readers for string data in units of 8 bytes. */
static int read_whole_strings (struct sfm_reader *r, uint8_t *s,
                               size_t length);
static bool skip_whole_strings (struct sfm_reader *r, size_t length);
2710
2711 /* Reads and returns one case from READER's file. Returns a null
2712 pointer if not successful. */
2713 static struct ccase *
sys_file_casereader_read(struct casereader * reader,void * r_)2714 sys_file_casereader_read (struct casereader *reader, void *r_)
2715 {
2716 struct sfm_reader *r = r_;
2717 struct ccase *c;
2718 int retval;
2719 int i;
2720
2721 if (r->error || !r->sfm_var_cnt)
2722 return NULL;
2723
2724 c = case_create (r->proto);
2725
2726 for (i = 0; i < r->sfm_var_cnt; i++)
2727 {
2728 struct sfm_var *sv = &r->sfm_vars[i];
2729 union value *v = case_data_rw_idx (c, sv->case_index);
2730
2731 if (sv->var_width == 0)
2732 retval = read_case_number (r, &v->f);
2733 else
2734 {
2735 retval = read_case_string (r, v->s + sv->offset, sv->segment_width);
2736 if (retval == 1)
2737 {
2738 retval = skip_whole_strings (r, ROUND_DOWN (sv->padding, 8));
2739 if (retval == 0)
2740 sys_error (r, r->pos, _("File ends in partial string value."));
2741 }
2742 }
2743
2744 if (retval != 1)
2745 goto eof;
2746 }
2747 return c;
2748
2749 eof:
2750 if (i != 0)
2751 partial_record (r);
2752 if (r->case_cnt != -1)
2753 read_error (reader, r);
2754 case_unref (c);
2755 return NULL;
2756 }
2757
2758 /* Issues an error that R ends in a partial record. */
2759 static void
partial_record(struct sfm_reader * r)2760 partial_record (struct sfm_reader *r)
2761 {
2762 sys_error (r, r->pos, _("File ends in partial case."));
2763 }
2764
2765 /* Issues an error that an unspecified error occurred SFM, and
2766 marks R tainted. */
2767 static void
read_error(struct casereader * r,const struct sfm_reader * sfm)2768 read_error (struct casereader *r, const struct sfm_reader *sfm)
2769 {
2770 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
2771 casereader_force_error (r);
2772 }
2773
2774 /* Reads a number from R and stores its value in *D.
2775 If R is compressed, reads a compressed number;
2776 otherwise, reads a number in the regular way.
2777 Returns true if successful, false if end of file is
2778 reached immediately. */
2779 static bool
read_case_number(struct sfm_reader * r,double * d)2780 read_case_number (struct sfm_reader *r, double *d)
2781 {
2782 if (r->compression == ANY_COMP_NONE)
2783 {
2784 uint8_t number[8];
2785 if (!try_read_bytes (r, number, sizeof number))
2786 return false;
2787 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
2788 return true;
2789 }
2790 else
2791 return read_compressed_number (r, d);
2792 }
2793
/* Reads LENGTH string bytes from R into S.  The file is always consumed
   in units of 8 bytes: if LENGTH is not a multiple of 8, one final
   8-byte unit is read, its leading LENGTH % 8 bytes are stored, and the
   rest is discarded.  Reads compressed strings if R is compressed.

   Returns 1 if successful, 0 if end of file is reached immediately, or
   -1 for some kind of error (including end of file after part of the
   string has been read). */
static int
read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
{
  size_t tail_len = length % 8;
  size_t body_len = ROUND_DOWN (length, 8);

  if (body_len > 0)
    {
      int status = read_whole_strings (r, s, body_len);
      if (status != 1)
        return status;
    }

  if (tail_len == 0)
    return 1;

  /* Read the last unit into a scratch buffer so that nothing is written
     beyond S + LENGTH. */
  uint8_t scratch[8];
  int status = read_whole_strings (r, scratch, sizeof scratch);
  if (status == -1)
    return -1;
  if (status == 0)
    {
      /* End of file here is clean only if nothing was read before it. */
      if (body_len == 0)
        return 0;
      partial_record (r);
      return -1;
    }

  memcpy (s + body_len, scratch, tail_len);
  return 1;
}
2832
2833 /* Reads and returns the next compression opcode from R. */
2834 static int
read_opcode(struct sfm_reader * r)2835 read_opcode (struct sfm_reader *r)
2836 {
2837 assert (r->compression != ANY_COMP_NONE);
2838 for (;;)
2839 {
2840 int opcode;
2841 if (r->opcode_idx >= sizeof r->opcodes)
2842 {
2843
2844 int retval = try_read_compressed_bytes (r, r->opcodes,
2845 sizeof r->opcodes);
2846 if (retval != 1)
2847 return -1;
2848 r->opcode_idx = 0;
2849 }
2850 opcode = r->opcodes[r->opcode_idx++];
2851
2852 if (opcode != 0)
2853 return opcode;
2854 }
2855 }
2856
2857 /* Reads a compressed number from R and stores its value in D.
2858 Returns true if successful, false if end of file is
2859 reached immediately. */
2860 static bool
read_compressed_number(struct sfm_reader * r,double * d)2861 read_compressed_number (struct sfm_reader *r, double *d)
2862 {
2863 int opcode = read_opcode (r);
2864 switch (opcode)
2865 {
2866 case -1:
2867 case 252:
2868 return false;
2869
2870 case 253:
2871 return read_compressed_float (r, d);
2872
2873 case 254:
2874 float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d);
2875 if (!r->corruption_warning)
2876 {
2877 r->corruption_warning = true;
2878 sys_warn (r, r->pos,
2879 _("Possible compressed data corruption: "
2880 "compressed spaces appear in numeric field."));
2881 }
2882 break;
2883
2884 case 255:
2885 *d = SYSMIS;
2886 break;
2887
2888 default:
2889 *d = opcode - r->bias;
2890 break;
2891 }
2892
2893 return true;
2894 }
2895
2896 /* Reads a compressed 8-byte string segment from R and stores it in DST. */
2897 static int
read_compressed_string(struct sfm_reader * r,uint8_t * dst)2898 read_compressed_string (struct sfm_reader *r, uint8_t *dst)
2899 {
2900 int opcode;
2901 int retval;
2902
2903 opcode = read_opcode (r);
2904 switch (opcode)
2905 {
2906 case -1:
2907 case 252:
2908 return 0;
2909
2910 case 253:
2911 retval = read_compressed_bytes (r, dst, 8);
2912 return retval == 1 ? 1 : -1;
2913
2914 case 254:
2915 memset (dst, ' ', 8);
2916 return 1;
2917
2918 default:
2919 {
2920 double value = opcode - r->bias;
2921 float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
2922 if (value == 0.0)
2923 {
2924 /* This has actually been seen "in the wild". The submitter of the
2925 file that showed that the contents decoded as spaces, but they
2926 were at the end of the field so it's possible that the null
2927 bytes just acted as null terminators. */
2928 }
2929 else if (!r->corruption_warning)
2930 {
2931 r->corruption_warning = true;
2932 sys_warn (r, r->pos,
2933 _("Possible compressed data corruption: "
2934 "string contains compressed integer (opcode %d)."),
2935 opcode);
2936 }
2937 }
2938 return 1;
2939 }
2940 }
2941
2942 /* Reads LENGTH string bytes from R into S. LENGTH must be a multiple of 8.
2943 Reads compressed strings if S is compressed. Returns 1 if successful, 0 if
2944 end of file is reached immediately, or -1 for some kind of error. */
2945 static int
read_whole_strings(struct sfm_reader * r,uint8_t * s,size_t length)2946 read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
2947 {
2948 assert (length % 8 == 0);
2949 if (r->compression == ANY_COMP_NONE)
2950 return try_read_bytes (r, s, length);
2951 else
2952 {
2953 size_t ofs;
2954
2955 for (ofs = 0; ofs < length; ofs += 8)
2956 {
2957 int retval = read_compressed_string (r, s + ofs);
2958 if (retval != 1)
2959 {
2960 if (ofs != 0)
2961 {
2962 partial_record (r);
2963 return -1;
2964 }
2965 return retval;
2966 }
2967 }
2968 return 1;
2969 }
2970 }
2971
/* Skips LENGTH string bytes from R by reading them into a scratch buffer
   that is then thrown away.
   LENGTH must be a multiple of 8.
   (LENGTH is also limited to 1024, the size of the scratch buffer, but
   that's only because the current caller never needs more than that
   many bytes.)

   Returns true only if all LENGTH bytes were read.  Returns false if end
   of file is reached immediately or if read_whole_strings() reports an
   error: its result is 1, 0, or -1, and returning it directly as a bool
   would turn the -1 error result into true. */
static bool
skip_whole_strings (struct sfm_reader *r, size_t length)
{
  uint8_t buffer[1024];
  assert (length <= sizeof buffer);
  return read_whole_strings (r, buffer, length) == 1;
}
2985
2986 /* Helpers for reading records that contain structured text
2987 strings. */
2988
2989 /* Maximum number of warnings to issue for a single text
2990 record. */
2991 #define MAX_TEXT_WARNINGS 5
2992
2993 /* State. */
2994 struct text_record
2995 {
2996 struct substring buffer; /* Record contents. */
2997 off_t start; /* Starting offset in file. */
2998 size_t pos; /* Current position in buffer. */
2999 int n_warnings; /* Number of warnings issued or suppressed. */
3000 bool recoded; /* Recoded into UTF-8? */
3001 };
3002
3003 static struct text_record *
open_text_record(struct sfm_reader * r,const struct sfm_extension_record * record,bool recode_to_utf8)3004 open_text_record (struct sfm_reader *r,
3005 const struct sfm_extension_record *record,
3006 bool recode_to_utf8)
3007 {
3008 struct text_record *text;
3009 struct substring raw;
3010
3011 text = pool_alloc (r->pool, sizeof *text);
3012 raw = ss_buffer (record->data, record->size * record->count);
3013 text->start = record->pos;
3014 text->buffer = (recode_to_utf8
3015 ? recode_substring_pool ("UTF-8", r->encoding, raw, r->pool)
3016 : raw);
3017 text->pos = 0;
3018 text->n_warnings = 0;
3019 text->recoded = recode_to_utf8;
3020
3021 return text;
3022 }
3023
3024 /* Closes TEXT, frees its storage, and issues a final warning
3025 about suppressed warnings if necessary. */
3026 static void
close_text_record(struct sfm_reader * r,struct text_record * text)3027 close_text_record (struct sfm_reader *r, struct text_record *text)
3028 {
3029 if (text->n_warnings > MAX_TEXT_WARNINGS)
3030 sys_warn (r, -1, _("Suppressed %d additional related warnings."),
3031 text->n_warnings - MAX_TEXT_WARNINGS);
3032 if (text->recoded)
3033 pool_free (r->pool, ss_data (text->buffer));
3034 }
3035
3036 /* Reads a variable=value pair from TEXT.
3037 Looks up the variable in DICT and stores it into *VAR.
3038 Stores a null-terminated value into *VALUE. */
3039 static bool
read_variable_to_value_pair(struct sfm_reader * r,struct dictionary * dict,struct text_record * text,struct variable ** var,char ** value)3040 read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict,
3041 struct text_record *text,
3042 struct variable **var, char **value)
3043 {
3044 for (;;)
3045 {
3046 if (!text_read_short_name (r, dict, text, ss_cstr ("="), var))
3047 return false;
3048
3049 *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL);
3050 if (*value == NULL)
3051 return false;
3052
3053 text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX),
3054 ss_buffer ("\t\0", 2));
3055
3056 if (*var != NULL)
3057 return true;
3058 }
3059 }
3060
3061 static bool
text_read_variable_name(struct sfm_reader * r,struct dictionary * dict,struct text_record * text,struct substring delimiters,struct variable ** var)3062 text_read_variable_name (struct sfm_reader *r, struct dictionary *dict,
3063 struct text_record *text, struct substring delimiters,
3064 struct variable **var)
3065 {
3066 char *name;
3067
3068 name = text_get_token (text, delimiters, NULL);
3069 if (name == NULL)
3070 return false;
3071
3072 *var = dict_lookup_var (dict, name);
3073 if (*var != NULL)
3074 return true;
3075
3076 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
3077 name);
3078 return false;
3079 }
3080
3081
3082 static bool
text_read_short_name(struct sfm_reader * r,struct dictionary * dict,struct text_record * text,struct substring delimiters,struct variable ** var)3083 text_read_short_name (struct sfm_reader *r, struct dictionary *dict,
3084 struct text_record *text, struct substring delimiters,
3085 struct variable **var)
3086 {
3087 char *short_name = text_get_token (text, delimiters, NULL);
3088 if (short_name == NULL)
3089 return false;
3090
3091 *var = dict_lookup_var (dict, short_name);
3092 if (*var == NULL)
3093 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
3094 short_name);
3095 return true;
3096 }
3097
3098 /* Displays a warning for the current file position, limiting the
3099 number to MAX_TEXT_WARNINGS for TEXT. */
3100 static void
text_warn(struct sfm_reader * r,struct text_record * text,const char * format,...)3101 text_warn (struct sfm_reader *r, struct text_record *text,
3102 const char *format, ...)
3103 {
3104 if (text->n_warnings++ < MAX_TEXT_WARNINGS)
3105 {
3106 va_list args;
3107
3108 va_start (args, format);
3109 sys_msg (r, text->start + text->pos, MW, format, args);
3110 va_end (args);
3111 }
3112 }
3113
3114 static char *
text_get_token(struct text_record * text,struct substring delimiters,char * delimiter)3115 text_get_token (struct text_record *text, struct substring delimiters,
3116 char *delimiter)
3117 {
3118 struct substring token;
3119 char *end;
3120
3121 if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token))
3122 {
3123 if (delimiter != NULL)
3124 *delimiter = ss_data (text->buffer)[text->pos-1];
3125 return NULL;
3126 }
3127
3128 end = &ss_data (token)[ss_length (token)];
3129 if (delimiter != NULL)
3130 *delimiter = *end;
3131 *end = '\0';
3132 return ss_data (token);
3133 }
3134
3135 /* Reads a integer value expressed in decimal, then a space, then a string that
3136 consists of exactly as many bytes as specified by the integer, then a space,
3137 from TEXT. Returns the string, null-terminated, as a subset of TEXT's
3138 buffer (so the caller should not free the string). */
3139 static const char *
text_parse_counted_string(struct sfm_reader * r,struct text_record * text)3140 text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
3141 {
3142 size_t start;
3143 size_t n;
3144 char *s;
3145
3146 start = text->pos;
3147 n = 0;
3148 while (text->pos < text->buffer.length)
3149 {
3150 int c = text->buffer.string[text->pos];
3151 if (c < '0' || c > '9')
3152 break;
3153 n = (n * 10) + (c - '0');
3154 text->pos++;
3155 }
3156 if (text->pos >= text->buffer.length || start == text->pos)
3157 {
3158 sys_warn (r, text->start,
3159 _("Expecting digit at offset %zu in MRSETS record."),
3160 text->pos);
3161 return NULL;
3162 }
3163
3164 if (!text_match (text, ' '))
3165 {
3166 sys_warn (r, text->start,
3167 _("Expecting space at offset %zu in MRSETS record."),
3168 text->pos);
3169 return NULL;
3170 }
3171
3172 if (text->pos + n > text->buffer.length)
3173 {
3174 sys_warn (r, text->start,
3175 _("%zu-byte string starting at offset %zu "
3176 "exceeds record length %zu."),
3177 n, text->pos, text->buffer.length);
3178 return NULL;
3179 }
3180
3181 s = &text->buffer.string[text->pos];
3182 if (s[n] != ' ')
3183 {
3184 sys_warn (r, text->start,
3185 _("Expecting space at offset %zu following %zu-byte string."),
3186 text->pos + n, n);
3187 return NULL;
3188 }
3189 s[n] = '\0';
3190 text->pos += n + 1;
3191 return s;
3192 }
3193
3194 static bool
text_match(struct text_record * text,char c)3195 text_match (struct text_record *text, char c)
3196 {
3197 if (text->pos >= text->buffer.length)
3198 return false;
3199
3200 if (text->buffer.string[text->pos] == c)
3201 {
3202 text->pos++;
3203 return true;
3204 }
3205 else
3206 return false;
3207 }
3208
3209 /* Returns the current byte offset (as converted to UTF-8, if it was converted)
3210 inside the TEXT's string. */
3211 static size_t
text_pos(const struct text_record * text)3212 text_pos (const struct text_record *text)
3213 {
3214 return text->pos;
3215 }
3216
3217 static const char *
text_get_all(const struct text_record * text)3218 text_get_all (const struct text_record *text)
3219 {
3220 return text->buffer.string;
3221 }
3222
3223 /* Messages. */
3224
3225 /* Displays a corruption message. */
3226 static void
sys_msg(struct sfm_reader * r,off_t offset,int class,const char * format,va_list args)3227 sys_msg (struct sfm_reader *r, off_t offset,
3228 int class, const char *format, va_list args)
3229 {
3230 struct string text;
3231
3232 ds_init_empty (&text);
3233 if (offset >= 0)
3234 ds_put_format (&text, _("`%s' near offset 0x%llx: "),
3235 fh_get_file_name (r->fh), (long long int) offset);
3236 else
3237 ds_put_format (&text, _("`%s': "), fh_get_file_name (r->fh));
3238 ds_put_vformat (&text, format, args);
3239
3240 struct msg m = {
3241 .category = msg_class_to_category (class),
3242 .severity = msg_class_to_severity (class),
3243 .text = ds_cstr (&text),
3244 };
3245 msg_emit (&m);
3246 }
3247
3248 /* Displays a warning for offset OFFSET in the file. */
3249 static void
sys_warn(struct sfm_reader * r,off_t offset,const char * format,...)3250 sys_warn (struct sfm_reader *r, off_t offset, const char *format, ...)
3251 {
3252 va_list args;
3253
3254 va_start (args, format);
3255 sys_msg (r, offset, MW, format, args);
3256 va_end (args);
3257 }
3258
3259 /* Displays an error for the current file position and marks it as in an error
3260 state. */
3261 static void
sys_error(struct sfm_reader * r,off_t offset,const char * format,...)3262 sys_error (struct sfm_reader *r, off_t offset, const char *format, ...)
3263 {
3264 va_list args;
3265
3266 va_start (args, format);
3267 sys_msg (r, offset, ME, format, args);
3268 va_end (args);
3269
3270 r->error = true;
3271 }
3272
3273 /* Reads BYTE_CNT bytes into BUF.
3274 Returns 1 if exactly BYTE_CNT bytes are successfully read.
3275 Returns -1 if an I/O error or a partial read occurs.
3276 Returns 0 for an immediate end-of-file and, if EOF_IS_OK is false, reports
3277 an error. */
3278 static inline int
read_bytes_internal(struct sfm_reader * r,bool eof_is_ok,void * buf,size_t byte_cnt)3279 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
3280 void *buf, size_t byte_cnt)
3281 {
3282 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
3283 r->pos += bytes_read;
3284 if (bytes_read == byte_cnt)
3285 return 1;
3286 else if (ferror (r->file))
3287 {
3288 sys_error (r, r->pos, _("System error: %s."), strerror (errno));
3289 return -1;
3290 }
3291 else if (!eof_is_ok || bytes_read != 0)
3292 {
3293 sys_error (r, r->pos, _("Unexpected end of file."));
3294 return -1;
3295 }
3296 else
3297 return 0;
3298 }
3299
/* Reads BYTE_CNT bytes from R into BUF.
   Returns true if all of them were read.
   Returns false, after an error has been reported, upon an I/O error or if
   end of file is encountered, even immediately. */
static bool
read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
{
  int status = read_bytes_internal (r, false, buf, byte_cnt);
  return status == 1;
}
3308
/* Like read_bytes(), except that an immediate end of file is not treated as
   an error.
   Returns 1 if exactly BYTE_CNT bytes are successfully read into BUF.
   Returns 0 if an immediate end-of-file is encountered.
   Returns -1 if an I/O error or a partial read occurs. */
static int
try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
{
  const bool eof_is_ok = true;
  return read_bytes_internal (r, eof_is_ok, buf, byte_cnt);
}
3318
3319 /* Reads a 32-bit signed integer from R and stores its value in host format in
3320 *X. Returns true if successful, otherwise false. */
3321 static bool
read_int(struct sfm_reader * r,int * x)3322 read_int (struct sfm_reader *r, int *x)
3323 {
3324 uint8_t integer[4];
3325 if (read_bytes (r, integer, sizeof integer) != 1)
3326 return false;
3327 *x = integer_get (r->integer_format, integer, sizeof integer);
3328 return true;
3329 }
3330
/* Reads a 32-bit integer from R with read_int() and stores its value,
   converted to unsigned, in *X.  Returns true if successful.  On failure
   returns false and leaves *X untouched. */
static bool
read_uint (struct sfm_reader *r, unsigned int *x)
{
  int y;

  /* read_int() does not store anything on failure, so Y must not be copied
     out in that case: it would still be uninitialized. */
  if (!read_int (r, &y))
    return false;

  *x = y;
  return true;
}
3341
3342 /* Reads a 64-bit signed integer from R and returns its value in
3343 host format. */
3344 static bool
read_int64(struct sfm_reader * r,long long int * x)3345 read_int64 (struct sfm_reader *r, long long int *x)
3346 {
3347 uint8_t integer[8];
3348 if (read_bytes (r, integer, sizeof integer) != 1)
3349 return false;
3350 *x = integer_get (r->integer_format, integer, sizeof integer);
3351 return true;
3352 }
3353
/* Reads a 64-bit integer from R with read_int64() and stores its value,
   converted to unsigned, in *X.  Returns true if successful.  On failure
   returns false and leaves *X untouched. */
static bool
read_uint64 (struct sfm_reader *r, unsigned long long int *x)
{
  long long int y;

  /* read_int64() does not store anything on failure, so Y must not be copied
     out in that case: it would still be uninitialized. */
  if (!read_int64 (r, &y))
    return false;

  *x = y;
  return true;
}
3366
3367 static int
parse_int(const struct sfm_reader * r,const void * data,size_t ofs)3368 parse_int (const struct sfm_reader *r, const void *data, size_t ofs)
3369 {
3370 return integer_get (r->integer_format, (const uint8_t *) data + ofs, 4);
3371 }
3372
3373 static double
parse_float(const struct sfm_reader * r,const void * data,size_t ofs)3374 parse_float (const struct sfm_reader *r, const void *data, size_t ofs)
3375 {
3376 return float_get_double (r->float_format, (const uint8_t *) data + ofs);
3377 }
3378
/* Reads exactly SIZE - 1 bytes from R into BUFFER and stores a null byte
   into BUFFER[SIZE - 1].  SIZE must be positive.  Returns true if
   successful; on failure returns false without storing the terminator. */
static bool
read_string (struct sfm_reader *r, char *buffer, size_t size)
{
  assert (size > 0);

  size_t n_data = size - 1;
  if (!read_bytes (r, buffer, n_data))
    return false;

  buffer[n_data] = '\0';
  return true;
}
3392
/* Skips BYTES bytes forward in R by reading them, up to 1024 at a time, into
   a scratch buffer and discarding them.  Returns true if successful, false
   if any of the reads fails. */
static bool
skip_bytes (struct sfm_reader *r, size_t bytes)
{
  char scratch[1024];
  size_t left = bytes;

  while (left > 0)
    {
      size_t chunk = MIN (sizeof scratch, left);
      if (!read_bytes (r, scratch, chunk))
        return false;
      left -= chunk;
    }

  return true;
}
3408
/* Returns a malloc()'d copy of S in which all lone CRs and CR LF pairs have
   been replaced by LFs.  The caller is responsible for freeing the copy.

   (A product that identifies itself as VOXCO INTERVIEWER 4.3 produces system
   files that use CR-only line ends in the file label and extra product
   info.) */
static char *
fix_line_ends (const char *s)
{
  /* The result is never longer than the input. */
  char *result = xmalloc (strlen (s) + 1);
  char *out = result;
  const char *in = s;

  while (*in != '\0')
    {
      char c = *in++;
      if (c == '\r')
        {
          /* Swallow the LF of a CR LF pair so that it yields a single LF. */
          if (*in == '\n')
            in++;
          c = '\n';
        }
      *out++ = c;
    }
  *out = '\0';

  return result;
}
3437
/* Forward declaration: read_zheader() calls read_ztrailer(), which is
   defined after it. */
static bool
read_ztrailer (struct sfm_reader *r,
               long long int zheader_ofs,
               long long int ztrailer_len);
3442
3443 static void *
zalloc(voidpf pool_,uInt items,uInt size)3444 zalloc (voidpf pool_, uInt items, uInt size)
3445 {
3446 struct pool *pool = pool_;
3447
3448 return (!size || xalloc_oversized (items, size)
3449 ? Z_NULL
3450 : pool_malloc (pool, items * size));
3451 }
3452
3453 static void
zfree(voidpf pool_,voidpf address)3454 zfree (voidpf pool_, voidpf address)
3455 {
3456 struct pool *pool = pool_;
3457
3458 pool_free (pool, address);
3459 }
3460
3461 static bool
read_zheader(struct sfm_reader * r)3462 read_zheader (struct sfm_reader *r)
3463 {
3464 off_t pos = r->pos;
3465 long long int zheader_ofs;
3466 long long int ztrailer_ofs;
3467 long long int ztrailer_len;
3468
3469 if (!read_int64 (r, &zheader_ofs)
3470 || !read_int64 (r, &ztrailer_ofs)
3471 || !read_int64 (r, &ztrailer_len))
3472 return false;
3473
3474 if (zheader_ofs != pos)
3475 {
3476 sys_error (r, pos, _("Wrong ZLIB data header offset %#llx "
3477 "(expected %#llx)."),
3478 zheader_ofs, (long long int) pos);
3479 return false;
3480 }
3481
3482 if (ztrailer_ofs < r->pos)
3483 {
3484 sys_error (r, pos, _("Impossible ZLIB trailer offset 0x%llx."),
3485 ztrailer_ofs);
3486 return false;
3487 }
3488
3489 if (ztrailer_len < 24 || ztrailer_len % 24)
3490 {
3491 sys_error (r, pos, _("Invalid ZLIB trailer length %lld."), ztrailer_len);
3492 return false;
3493 }
3494
3495 r->ztrailer_ofs = ztrailer_ofs;
3496 if (!read_ztrailer (r, zheader_ofs, ztrailer_len))
3497 return false;
3498
3499 if (r->zin_buf == NULL)
3500 {
3501 r->zin_buf = pool_malloc (r->pool, ZIN_BUF_SIZE);
3502 r->zout_buf = pool_malloc (r->pool, ZOUT_BUF_SIZE);
3503 r->zstream.next_in = NULL;
3504 r->zstream.avail_in = 0;
3505 }
3506
3507 r->zstream.zalloc = zalloc;
3508 r->zstream.zfree = zfree;
3509 r->zstream.opaque = r->pool;
3510
3511 return open_zstream (r);
3512 }
3513
3514 static void
seek(struct sfm_reader * r,off_t offset)3515 seek (struct sfm_reader *r, off_t offset)
3516 {
3517 if (fseeko (r->file, offset, SEEK_SET))
3518 sys_error (r, 0, _("%s: seek failed (%s)."),
3519 fh_get_file_name (r->fh), strerror (errno));
3520 r->pos = offset;
3521 }
3522
/* Performs some additional consistency checks on the ZLIB compressed data
   trailer.

   ZHEADER_OFS is the file offset of the ZLIB data header and ZTRAILER_LEN is
   the length of the trailer in bytes, both as read by read_zheader(), which
   also stores the trailer's offset in R->ztrailer_ofs before calling this
   function.

   Returns false, after reporting an error, if the trailer cannot be read or
   is inconsistent.  Otherwise returns true, with the file position left at
   ZHEADER_OFS + 24.  If R's file is not a regular file, the checks are
   skipped entirely: the function returns true without moving the file
   position. */
static bool
read_ztrailer (struct sfm_reader *r,
               long long int zheader_ofs,
               long long int ztrailer_len)
{
  long long int expected_uncmp_ofs;
  long long int expected_cmp_ofs;
  long long int bias;
  long long int zero;
  unsigned int block_size;
  unsigned int n_blocks;
  unsigned int i;
  struct stat s;

  if (fstat (fileno (r->file), &s))
    {
      sys_error (r, 0, _("%s: stat failed (%s)."),
                 fh_get_file_name (r->fh), strerror (errno));
      return false;
    }

  if (!S_ISREG (s.st_mode))
    {
      /* We can't seek to the trailer and then back to the data in this file,
         so skip doing extra checks. */
      return true;
    }

  /* A trailer that does not end exactly at end of file only draws a
     warning. */
  if (r->ztrailer_ofs + ztrailer_len != s.st_size)
    sys_warn (r, r->pos,
              _("End of ZLIB trailer (0x%llx) is not file size (0x%llx)."),
              r->ztrailer_ofs + ztrailer_len, (long long int) s.st_size);

  seek (r, r->ztrailer_ofs);

  /* Read fixed header from ZLIB data trailer.  It is 24 bytes long: two
     64-bit integers (bias and "zero") followed by two 32-bit integers (block
     size and number of blocks). */
  if (!read_int64 (r, &bias))
    return false;
  /* The trailer's bias must be the negation of the bias held in R. */
  if (-bias != r->bias)
    {
      sys_error (r, r->pos, _("ZLIB trailer bias (%lld) differs from "
                              "file header bias (%.2f)."),
                 -bias, r->bias);
      return false;
    }

  if (!read_int64 (r, &zero))
    return false;
  if (zero != 0)
    sys_warn (r, r->pos,
              _("ZLIB trailer \"zero\" field has nonzero value %lld."), zero);

  if (!read_uint (r, &block_size))
    return false;
  if (block_size != ZBLOCK_SIZE)
    sys_warn (r, r->pos,
              _("ZLIB trailer specifies unexpected %u-byte block size."),
              block_size);

  /* Everything after the 24-byte fixed header must be block descriptors of
     24 bytes each, so the trailer length determines the block count. */
  if (!read_uint (r, &n_blocks))
    return false;
  if (n_blocks != (ztrailer_len - 24) / 24)
    {
      sys_error (r, r->pos,
                 _("%lld-byte ZLIB trailer specifies %u data blocks (expected "
                   "%lld)."),
                 ztrailer_len, n_blocks, (ztrailer_len - 24) / 24);
      return false;
    }

  /* Walk the block descriptors.  Each block must start exactly where the
     previous one ended, in both uncompressed and compressed offsets.  The
     uncompressed offsets are expected to start at ZHEADER_OFS and the
     compressed ones just past the 24-byte ZLIB data header. */
  expected_uncmp_ofs = zheader_ofs;
  expected_cmp_ofs = zheader_ofs + 24;
  for (i = 0; i < n_blocks; i++)
    {
      off_t desc_ofs = r->pos;
      unsigned long long int uncompressed_ofs;
      unsigned long long int compressed_ofs;
      unsigned int uncompressed_size;
      unsigned int compressed_size;

      if (!read_uint64 (r, &uncompressed_ofs)
          || !read_uint64 (r, &compressed_ofs)
          || !read_uint (r, &uncompressed_size)
          || !read_uint (r, &compressed_size))
        return false;

      if (uncompressed_ofs != expected_uncmp_ofs)
        {
          sys_error (r, desc_ofs,
                     _("ZLIB block descriptor %u reported uncompressed data "
                       "offset %#llx, when %#llx was expected."),
                     i, uncompressed_ofs, expected_uncmp_ofs);
          return false;
        }

      if (compressed_ofs != expected_cmp_ofs)
        {
          sys_error (r, desc_ofs,
                     _("ZLIB block descriptor %u reported compressed data "
                       "offset %#llx, when %#llx was expected."),
                     i, compressed_ofs, expected_cmp_ofs);
          return false;
        }

      /* Every block but the last should be exactly BLOCK_SIZE bytes when
         uncompressed; the last one may be shorter.  A mismatch only draws a
         warning. */
      if (i < n_blocks - 1)
        {
          if (uncompressed_size != block_size)
            sys_warn (r, desc_ofs,
                      _("ZLIB block descriptor %u reported block size %#x, "
                        "when %#x was expected."),
                      i, uncompressed_size, block_size);
        }
      else
        {
          if (uncompressed_size > block_size)
            sys_warn (r, desc_ofs,
                      _("ZLIB block descriptor %u reported block size %#x, "
                        "when at most %#x was expected."),
                      i, uncompressed_size, block_size);
        }

      /* http://www.zlib.net/zlib_tech.html says that the maximum expansion
         from compression, with worst-case parameters, is 13.5% plus 11 bytes.
         This code checks for an expansion of more than 14.3% plus 11
         bytes. */
      if (compressed_size > uncompressed_size + uncompressed_size / 7 + 11)
        {
          sys_error (r, desc_ofs,
                     _("ZLIB block descriptor %u reports compressed size %u "
                       "and uncompressed size %u."),
                     i, compressed_size, uncompressed_size);
          return false;
        }

      expected_uncmp_ofs += uncompressed_size;
      expected_cmp_ofs += compressed_size;
    }

  /* The compressed blocks must end exactly where the trailer begins. */
  if (expected_cmp_ofs != r->ztrailer_ofs)
    {
      sys_error (r, r->pos, _("ZLIB trailer is at offset %#llx but %#llx "
                              "would be expected from block descriptors."),
                 r->ztrailer_ofs, expected_cmp_ofs);
      return false;
    }

  /* Go back to just past the 24-byte ZLIB data header. */
  seek (r, zheader_ofs + 24);
  return true;
}
3674
3675 static bool
open_zstream(struct sfm_reader * r)3676 open_zstream (struct sfm_reader *r)
3677 {
3678 int error;
3679
3680 r->zout_pos = r->zout_end = 0;
3681 error = inflateInit (&r->zstream);
3682 if (error != Z_OK)
3683 {
3684 sys_error (r, r->pos, _("ZLIB initialization failed (%s)."),
3685 r->zstream.msg);
3686 return false;
3687 }
3688 return true;
3689 }
3690
3691 static bool
close_zstream(struct sfm_reader * r)3692 close_zstream (struct sfm_reader *r)
3693 {
3694 int error;
3695
3696 error = inflateEnd (&r->zstream);
3697 if (error != Z_OK)
3698 {
3699 sys_error (r, r->pos, _("Inconsistency at end of ZLIB stream (%s)."),
3700 r->zstream.msg);
3701 return false;
3702 }
3703 return true;
3704 }
3705
3706 static int
read_bytes_zlib(struct sfm_reader * r,void * buf_,size_t byte_cnt)3707 read_bytes_zlib (struct sfm_reader *r, void *buf_, size_t byte_cnt)
3708 {
3709 uint8_t *buf = buf_;
3710
3711 if (byte_cnt == 0)
3712 return 1;
3713
3714 for (;;)
3715 {
3716 int error;
3717
3718 /* Use already inflated data if there is any. */
3719 if (r->zout_pos < r->zout_end)
3720 {
3721 unsigned int n = MIN (byte_cnt, r->zout_end - r->zout_pos);
3722 memcpy (buf, &r->zout_buf[r->zout_pos], n);
3723 r->zout_pos += n;
3724 byte_cnt -= n;
3725 buf += n;
3726
3727 if (byte_cnt == 0)
3728 return 1;
3729 }
3730
3731 /* We need to inflate some more data.
3732 Get some more input data if we don't have any. */
3733 if (r->zstream.avail_in == 0)
3734 {
3735 unsigned int n = MIN (ZIN_BUF_SIZE, r->ztrailer_ofs - r->pos);
3736 if (n == 0)
3737 return 0;
3738 else
3739 {
3740 int retval = try_read_bytes (r, r->zin_buf, n);
3741 if (retval != 1)
3742 return retval;
3743 r->zstream.avail_in = n;
3744 r->zstream.next_in = r->zin_buf;
3745 }
3746 }
3747
3748 /* Inflate the (remaining) input data. */
3749 r->zstream.avail_out = ZOUT_BUF_SIZE;
3750 r->zstream.next_out = r->zout_buf;
3751 error = inflate (&r->zstream, Z_SYNC_FLUSH);
3752 r->zout_pos = 0;
3753 r->zout_end = r->zstream.next_out - r->zout_buf;
3754 if (r->zout_end == 0)
3755 {
3756 if (error != Z_STREAM_END)
3757 {
3758 sys_error (r, r->pos, _("ZLIB stream inconsistency (%s)."),
3759 r->zstream.msg);
3760 return -1;
3761 }
3762 else if (!close_zstream (r) || !open_zstream (r))
3763 return -1;
3764 }
3765 else
3766 {
3767 /* Process the output data and ignore 'error' for now. ZLIB will
3768 present it to us again on the next inflate() call. */
3769 }
3770 }
3771 }
3772
3773 static int
read_compressed_bytes(struct sfm_reader * r,void * buf,size_t byte_cnt)3774 read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
3775 {
3776 if (r->compression == ANY_COMP_SIMPLE)
3777 return read_bytes (r, buf, byte_cnt);
3778 else
3779 {
3780 int retval = read_bytes_zlib (r, buf, byte_cnt);
3781 if (retval == 0)
3782 sys_error (r, r->pos, _("Unexpected end of ZLIB compressed data."));
3783 return retval;
3784 }
3785 }
3786
3787 static int
try_read_compressed_bytes(struct sfm_reader * r,void * buf,size_t byte_cnt)3788 try_read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
3789 {
3790 if (r->compression == ANY_COMP_SIMPLE)
3791 return try_read_bytes (r, buf, byte_cnt);
3792 else
3793 return read_bytes_zlib (r, buf, byte_cnt);
3794 }
3795
3796 /* Reads a 64-bit floating-point number from R and returns its
3797 value in host format. */
3798 static bool
read_compressed_float(struct sfm_reader * r,double * d)3799 read_compressed_float (struct sfm_reader *r, double *d)
3800 {
3801 uint8_t number[8];
3802
3803 if (!read_compressed_bytes (r, number, sizeof number))
3804 return false;
3805
3806 *d = float_get_double (r->float_format, number);
3807 return true;
3808 }
3809
/* Casereader class for system files.  Supplies the read and destroy
   callbacks and leaves the two remaining members null. */
static const struct casereader_class sys_file_casereader_class =
  {
    sys_file_casereader_read,
    sys_file_casereader_destroy,
    NULL,
    NULL,
  };
3817
/* The system file reader's entry in the "any reader" framework: a
   translatable format name followed by this file's sfm_* callbacks.
   NOTE(review): which any_reader_class member each callback fills is not
   visible here; presumably they follow the member order declared in
   data/any-reader.h -- confirm. */
const struct any_reader_class sys_file_reader_class =
  {
    N_("SPSS System File"),
    sfm_detect,
    sfm_open,
    sfm_close,
    sfm_decode,
    sfm_get_strings,
  };
3827