1
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <errno.h>
5 #include <string.h>
6 #include <math.h>
7 #include <time.h>
8 #include <limits.h>
9 #include <inttypes.h>
10
11 #include "readstat_sas.h"
12 #include "../readstat_iconv.h"
13 #include "../readstat_convert.h"
14 #include "../readstat_writer.h"
15
16 #define SAS_FILE_HEADER_SIZE_32BIT 1024
17 #define SAS_FILE_HEADER_SIZE_64BIT 8192
18 #define SAS_DEFAULT_PAGE_SIZE 4096
19
20 #define SAS_DEFAULT_STRING_ENCODING "WINDOWS-1252"
21
/* 32-byte signature that sas_read_header() expects at the very start of a
 * data file. It is identical to the catalog signature except for byte 15
 * (0x60 here). */
unsigned char sas7bdat_magic_number[32] = {
    /*  0 */ 0x00, 0x00, 0x00, 0x00,
    /*  4 */ 0x00, 0x00, 0x00, 0x00,
    /*  8 */ 0x00, 0x00, 0x00, 0x00,
    /* 12 */ 0xc2, 0xea, 0x81, 0x60,
    /* 16 */ 0xb3, 0x14, 0x11, 0xcf,
    /* 20 */ 0xbd, 0x92, 0x08, 0x00,
    /* 24 */ 0x09, 0xc7, 0x31, 0x8c,
    /* 28 */ 0x18, 0x1f, 0x10, 0x11
};
28
/* 32-byte signature that sas_read_header() expects at the very start of a
 * catalog file. It is identical to the data-file signature except for
 * byte 15 (0x63 here). */
unsigned char sas7bcat_magic_number[32] = {
    /*  0 */ 0x00, 0x00, 0x00, 0x00,
    /*  4 */ 0x00, 0x00, 0x00, 0x00,
    /*  8 */ 0x00, 0x00, 0x00, 0x00,
    /* 12 */ 0xc2, 0xea, 0x81, 0x63,
    /* 16 */ 0xb3, 0x14, 0x11, 0xcf,
    /* 20 */ 0xbd, 0x92, 0x08, 0x00,
    /* 24 */ 0x09, 0xc7, 0x31, 0x8c,
    /* 28 */ 0x18, 0x1f, 0x10, 0x11
};
35
/* This table is cobbled together from extant files and:
 * https://support.sas.com/documentation/cdl/en/nlsref/61893/HTML/default/viewer.htm#a002607278.htm
 * https://support.sas.com/documentation/onlinedoc/dfdmstudio/2.6/dmpdmsug/Content/dfU_Encodings_SAS.html
 *
 * Discrepancies from the official documentation are noted with a comment. It
 * appears that in some instances SAS software uses a newer encoding than
 * what's listed in the docs. In these cases the encoding used by ReadStat
 * represents the author's best guess.
 */
/* Maps the encoding code stored in the file header to a character set name.
 * sas_read_header() scans this table linearly and reports
 * READSTAT_ERROR_UNSUPPORTED_CHARSET when a code is not listed here. */
static readstat_charset_entry_t _charset_table[] = { 
    { .code = 0,     .name = SAS_DEFAULT_STRING_ENCODING },
    { .code = 20,    .name = "UTF-8" },
    { .code = 28,    .name = "US-ASCII" },
    { .code = 29,    .name = "ISO-8859-1" },
    { .code = 30,    .name = "ISO-8859-2" },
    { .code = 31,    .name = "ISO-8859-3" },
    { .code = 32,    .name = "ISO-8859-4" },
    { .code = 33,    .name = "ISO-8859-5" },
    { .code = 34,    .name = "ISO-8859-6" },
    { .code = 35,    .name = "ISO-8859-7" },
    { .code = 36,    .name = "ISO-8859-8" },
    { .code = 37,    .name = "ISO-8859-9" },
    { .code = 39,    .name = "ISO-8859-11" },
    { .code = 40,    .name = "ISO-8859-15" },
    { .code = 41,    .name = "CP437" },
    { .code = 42,    .name = "CP850" },
    { .code = 43,    .name = "CP852" },
    { .code = 44,    .name = "CP857" },
    { .code = 45,    .name = "CP858" },
    { .code = 46,    .name = "CP862" },
    { .code = 47,    .name = "CP864" },
    { .code = 48,    .name = "CP865" },
    { .code = 49,    .name = "CP866" },
    { .code = 50,    .name = "CP869" },
    { .code = 51,    .name = "CP874" },
    { .code = 52,    .name = "CP921" },
    { .code = 53,    .name = "CP922" },
    { .code = 54,    .name = "CP1129" },
    { .code = 55,    .name = "CP720" },
    { .code = 56,    .name = "CP737" },
    { .code = 57,    .name = "CP775" },
    { .code = 58,    .name = "CP860" },
    { .code = 59,    .name = "CP863" },
    { .code = 60,    .name = "WINDOWS-1250" },
    { .code = 61,    .name = "WINDOWS-1251" },
    { .code = 62,    .name = "WINDOWS-1252" },
    { .code = 63,    .name = "WINDOWS-1253" },
    { .code = 64,    .name = "WINDOWS-1254" },
    { .code = 65,    .name = "WINDOWS-1255" },
    { .code = 66,    .name = "WINDOWS-1256" },
    { .code = 67,    .name = "WINDOWS-1257" },
    { .code = 68,    .name = "WINDOWS-1258" },
    { .code = 69,    .name = "MACROMAN" },
    { .code = 70,    .name = "MACARABIC" },
    { .code = 71,    .name = "MACHEBREW" },
    { .code = 72,    .name = "MACGREEK" },
    { .code = 73,    .name = "MACTHAI" },
    { .code = 75,    .name = "MACTURKISH" },
    { .code = 76,    .name = "MACUKRAINE" },
    { .code = 118,   .name = "CP950" },
    { .code = 119,   .name = "EUC-TW" },
    { .code = 123,   .name = "BIG-5" },
    { .code = 125,   .name = "GB18030" }, // "euc-cn" in SAS
    { .code = 126,   .name = "WINDOWS-936" }, // "zwin"
    { .code = 128,   .name = "CP1381" }, // "zpce"
    { .code = 134,   .name = "EUC-JP" },
    { .code = 136,   .name = "CP949" },
    { .code = 137,   .name = "CP942" },
    { .code = 138,   .name = "CP932" }, // "shift-jis" in SAS
    { .code = 140,   .name = "EUC-KR" },
    { .code = 141,   .name = "CP949" }, // "kpce"
    { .code = 142,   .name = "CP949" }, // "kwin"
    { .code = 163,   .name = "MACICELAND" },
    { .code = 167,   .name = "ISO-2022-JP" },
    { .code = 168,   .name = "ISO-2022-KR" },
    { .code = 169,   .name = "ISO-2022-CN" },
    { .code = 172,   .name = "ISO-2022-CN-EXT" },
    { .code = 204,   .name = SAS_DEFAULT_STRING_ENCODING }, // "any" in SAS
    { .code = 205,   .name = "GB18030" },
    { .code = 227,   .name = "ISO-8859-14" },
    { .code = 242,   .name = "ISO-8859-13" },
    { .code = 245,   .name = "MACCROATIAN" },
    { .code = 246,   .name = "MACCYRILLIC" },
    { .code = 247,   .name = "MACROMANIA" },
    { .code = 248,   .name = "SHIFT_JISX0213" },
};
122
/* Offset of the SAS epoch (1960-01-01) relative to the Unix epoch
 * (1970-01-01), in seconds. The value is negative because the SAS epoch is
 * the earlier of the two. */
static time_t sas_epoch() {
    /* Ten years, three of which (1960, 1964, 1968) are leap years. */
    const long days_between_epochs = 3653L;
    const long seconds_per_day = 86400L;

    return -(days_between_epochs * seconds_per_day);
}
126
/*
 * Shift a timestamp read from the file by `epoch` and convert it to time_t.
 *
 * time:  timestamp in seconds, as stored in the file header
 * epoch: offset in seconds added to `time` before conversion
 *
 * Returns 0 for NaN, and saturates to LONG_MAX / LONG_MIN when the shifted
 * value lies outside the range of long, so the final conversion from double
 * is always in range.
 */
static time_t sas_convert_time(double time, time_t epoch) {
    time += epoch;
    if (isnan(time))
        return 0;
    /* With a 64-bit long, (double)LONG_MAX rounds up to 2^63, which is one
     * past the largest long. Use >= so that exactly 2^63 saturates as well
     * instead of reaching the out-of-range conversion below. */
    if (time >= (double)LONG_MAX)
        return LONG_MAX;
    /* (double)LONG_MIN is exact, so equality is safe to convert. */
    if (time < (double)LONG_MIN)
        return LONG_MIN;
    return (time_t)time;
}
137
/* Load an 8-byte unsigned integer from `data`, which may be unaligned.
 * The bytes are reversed with byteswap8() when `bswap` is non-zero. */
uint64_t sas_read8(const char *data, int bswap) {
    uint64_t value;

    memcpy(&value, data, sizeof(value));
    if (bswap)
        value = byteswap8(value);
    return value;
}
143
/* Load a 4-byte unsigned integer from `data`, which may be unaligned.
 * The bytes are reversed with byteswap4() when `bswap` is non-zero. */
uint32_t sas_read4(const char *data, int bswap) {
    uint32_t value;

    memcpy(&value, data, sizeof(value));
    if (bswap)
        value = byteswap4(value);
    return value;
}
149
/* Load a 2-byte unsigned integer from `data`, which may be unaligned.
 * The bytes are reversed with byteswap2() when `bswap` is non-zero. */
uint16_t sas_read2(const char *data, int bswap) {
    uint16_t value;

    memcpy(&value, data, sizeof(value));
    if (bswap)
        value = byteswap2(value);
    return value;
}
155
/* Number of bytes left in a subheader of `len` bytes once a fixed 4 bytes
 * plus two signature-sized fields have been accounted for.
 *
 * The subtraction is unsigned: if `len` is smaller than that overhead the
 * result wraps around rather than going negative. */
size_t sas_subheader_remainder(size_t len, size_t signature_len) {
    const size_t overhead = 4 + 2 * signature_len;

    return len - overhead;
}
159
sas_read_header(readstat_io_t * io,sas_header_info_t * hinfo,readstat_error_handler error_handler,void * user_ctx)160 readstat_error_t sas_read_header(readstat_io_t *io, sas_header_info_t *hinfo,
161 readstat_error_handler error_handler, void *user_ctx) {
162 sas_header_start_t header_start;
163 sas_header_end_t header_end;
164 int retval = READSTAT_OK;
165 char error_buf[1024];
166 time_t epoch = sas_epoch();
167
168 if (io->read(&header_start, sizeof(sas_header_start_t), io->io_ctx) < sizeof(sas_header_start_t)) {
169 retval = READSTAT_ERROR_READ;
170 goto cleanup;
171 }
172 if (memcmp(header_start.magic, sas7bdat_magic_number, sizeof(sas7bdat_magic_number)) != 0 &&
173 memcmp(header_start.magic, sas7bcat_magic_number, sizeof(sas7bcat_magic_number)) != 0) {
174 retval = READSTAT_ERROR_PARSE;
175 goto cleanup;
176 }
177 if (header_start.a1 == SAS_ALIGNMENT_OFFSET_4) {
178 hinfo->pad1 = 4;
179 }
180 if (header_start.a2 == SAS_ALIGNMENT_OFFSET_4) {
181 hinfo->u64 = 1;
182 }
183 int bswap = 0;
184 if (header_start.endian == SAS_ENDIAN_BIG) {
185 bswap = machine_is_little_endian();
186 hinfo->little_endian = 0;
187 } else if (header_start.endian == SAS_ENDIAN_LITTLE) {
188 bswap = !machine_is_little_endian();
189 hinfo->little_endian = 1;
190 } else {
191 retval = READSTAT_ERROR_PARSE;
192 goto cleanup;
193 }
194 int i;
195 for (i=0; i<sizeof(_charset_table)/sizeof(_charset_table[0]); i++) {
196 if (header_start.encoding == _charset_table[i].code) {
197 hinfo->encoding = _charset_table[i].name;
198 break;
199 }
200 }
201 if (hinfo->encoding == NULL) {
202 if (error_handler) {
203 snprintf(error_buf, sizeof(error_buf), "Unsupported character set code: %d", header_start.encoding);
204 error_handler(error_buf, user_ctx);
205 }
206 retval = READSTAT_ERROR_UNSUPPORTED_CHARSET;
207 goto cleanup;
208 }
209 memcpy(hinfo->table_name, header_start.table_name, sizeof(header_start.table_name));
210 if (io->seek(hinfo->pad1, READSTAT_SEEK_CUR, io->io_ctx) == -1) {
211 retval = READSTAT_ERROR_SEEK;
212 goto cleanup;
213 }
214
215 double creation_time, modification_time;
216
217 if (io->read(&creation_time, sizeof(double), io->io_ctx) < sizeof(double)) {
218 retval = READSTAT_ERROR_READ;
219 goto cleanup;
220 }
221 if (bswap)
222 creation_time = byteswap_double(creation_time);
223
224 if (io->read(&modification_time, sizeof(double), io->io_ctx) < sizeof(double)) {
225 retval = READSTAT_ERROR_READ;
226 goto cleanup;
227 }
228 if (bswap)
229 modification_time = byteswap_double(modification_time);
230
231 hinfo->creation_time = sas_convert_time(creation_time, epoch);
232 hinfo->modification_time = sas_convert_time(modification_time, epoch);
233
234 if (io->seek(16, READSTAT_SEEK_CUR, io->io_ctx) == -1) {
235 retval = READSTAT_ERROR_SEEK;
236 goto cleanup;
237 }
238
239 uint32_t header_size, page_size;
240
241 if (io->read(&header_size, sizeof(uint32_t), io->io_ctx) < sizeof(uint32_t)) {
242 retval = READSTAT_ERROR_READ;
243 goto cleanup;
244 }
245 if (io->read(&page_size, sizeof(uint32_t), io->io_ctx) < sizeof(uint32_t)) {
246 retval = READSTAT_ERROR_READ;
247 goto cleanup;
248 }
249
250 hinfo->header_size = bswap ? byteswap4(header_size) : header_size;
251 hinfo->page_size = bswap ? byteswap4(page_size) : page_size;
252
253 if (hinfo->header_size < 1024 || hinfo->page_size < 1024) {
254 retval = READSTAT_ERROR_PARSE;
255 goto cleanup;
256 }
257 if (hinfo->header_size > (1<<24) || hinfo->page_size > (1<<24)) {
258 retval = READSTAT_ERROR_PARSE;
259 goto cleanup;
260 }
261
262 if (hinfo->u64) {
263 hinfo->page_header_size = SAS_PAGE_HEADER_SIZE_64BIT;
264 hinfo->subheader_pointer_size = SAS_SUBHEADER_POINTER_SIZE_64BIT;
265 } else {
266 hinfo->page_header_size = SAS_PAGE_HEADER_SIZE_32BIT;
267 hinfo->subheader_pointer_size = SAS_SUBHEADER_POINTER_SIZE_32BIT;
268 }
269
270 if (hinfo->u64) {
271 uint64_t page_count;
272 if (io->read(&page_count, sizeof(uint64_t), io->io_ctx) < sizeof(uint64_t)) {
273 retval = READSTAT_ERROR_READ;
274 goto cleanup;
275 }
276 hinfo->page_count = bswap ? byteswap8(page_count) : page_count;
277 } else {
278 uint32_t page_count;
279 if (io->read(&page_count, sizeof(uint32_t), io->io_ctx) < sizeof(uint32_t)) {
280 retval = READSTAT_ERROR_READ;
281 goto cleanup;
282 }
283 hinfo->page_count = bswap ? byteswap4(page_count) : page_count;
284 }
285 if (hinfo->page_count > (1<<24)) {
286 retval = READSTAT_ERROR_PARSE;
287 goto cleanup;
288 }
289
290 if (io->seek(8, READSTAT_SEEK_CUR, io->io_ctx) == -1) {
291 retval = READSTAT_ERROR_SEEK;
292 if (error_handler) {
293 snprintf(error_buf, sizeof(error_buf), "ReadStat: Failed to seek forward by %d", 8);
294 error_handler(error_buf, user_ctx);
295 }
296 goto cleanup;
297 }
298 if (io->read(&header_end, sizeof(sas_header_end_t), io->io_ctx) < sizeof(sas_header_end_t)) {
299 retval = READSTAT_ERROR_READ;
300 goto cleanup;
301 }
302 char major;
303 int minor, revision;
304 if (sscanf(header_end.release, "%c.%04dM%1d", &major, &minor, &revision) != 3) {
305 retval = READSTAT_ERROR_PARSE;
306 goto cleanup;
307 }
308
309 if (major >= '1' && major <= '9') {
310 hinfo->major_version = major - '0';
311 } else if (major == 'V') {
312 // It appears that SAS Visual Forecaster reports the major version as "V"
313 // Treat it as version 9 for all intents and purposes
314 hinfo->major_version = 9;
315 } else {
316 retval = READSTAT_ERROR_PARSE;
317 goto cleanup;
318 }
319 hinfo->minor_version = minor;
320 hinfo->revision = revision;
321
322 if ((major == '8' || major == '9') && minor == 0 && revision == 0) {
323 /* A bit of a hack, but most SAS installations are running a minor update */
324 hinfo->vendor = READSTAT_VENDOR_STAT_TRANSFER;
325 } else {
326 hinfo->vendor = READSTAT_VENDOR_SAS;
327 }
328 if (io->seek(hinfo->header_size, READSTAT_SEEK_SET, io->io_ctx) == -1) {
329 retval = READSTAT_ERROR_SEEK;
330 if (error_handler) {
331 snprintf(error_buf, sizeof(error_buf),
332 "ReadStat: Failed to seek to position %" PRId64, hinfo->header_size);
333 error_handler(error_buf, user_ctx);
334 }
335 goto cleanup;
336 }
337
338 cleanup:
339 return retval;
340 }
341
sas_write_header(readstat_writer_t * writer,sas_header_info_t * hinfo,sas_header_start_t header_start)342 readstat_error_t sas_write_header(readstat_writer_t *writer, sas_header_info_t *hinfo, sas_header_start_t header_start) {
343 readstat_error_t retval = READSTAT_OK;
344 time_t epoch = sas_epoch();
345
346 memset(header_start.table_name, ' ', sizeof(header_start.table_name));
347
348 size_t table_name_len = strlen(writer->table_name);
349 if (table_name_len > sizeof(header_start.table_name))
350 table_name_len = sizeof(header_start.table_name);
351
352 if (table_name_len) {
353 memcpy(header_start.table_name, writer->table_name, table_name_len);
354 } else {
355 memcpy(header_start.table_name, "DATASET", sizeof("DATASET")-1);
356 }
357
358 retval = readstat_write_bytes(writer, &header_start, sizeof(sas_header_start_t));
359 if (retval != READSTAT_OK)
360 goto cleanup;
361
362 retval = readstat_write_zeros(writer, hinfo->pad1);
363 if (retval != READSTAT_OK)
364 goto cleanup;
365
366 double creation_time = hinfo->creation_time - epoch;
367
368 retval = readstat_write_bytes(writer, &creation_time, sizeof(double));
369 if (retval != READSTAT_OK)
370 goto cleanup;
371
372 double modification_time = hinfo->modification_time - epoch;
373
374 retval = readstat_write_bytes(writer, &modification_time, sizeof(double));
375 if (retval != READSTAT_OK)
376 goto cleanup;
377
378 retval = readstat_write_zeros(writer, 16);
379 if (retval != READSTAT_OK)
380 goto cleanup;
381
382 uint32_t header_size = hinfo->header_size;
383 uint32_t page_size = hinfo->page_size;
384
385 retval = readstat_write_bytes(writer, &header_size, sizeof(uint32_t));
386 if (retval != READSTAT_OK)
387 goto cleanup;
388
389 retval = readstat_write_bytes(writer, &page_size, sizeof(uint32_t));
390 if (retval != READSTAT_OK)
391 goto cleanup;
392
393 if (hinfo->u64) {
394 uint64_t page_count = hinfo->page_count;
395 retval = readstat_write_bytes(writer, &page_count, sizeof(uint64_t));
396 } else {
397 uint32_t page_count = hinfo->page_count;
398 retval = readstat_write_bytes(writer, &page_count, sizeof(uint32_t));
399 }
400 if (retval != READSTAT_OK)
401 goto cleanup;
402
403 retval = readstat_write_zeros(writer, 8);
404 if (retval != READSTAT_OK)
405 goto cleanup;
406
407 sas_header_end_t header_end = {
408 .host = "9.0401M6Linux"
409 };
410
411 char release[sizeof(header_end.release)+1] = { 0 };
412 snprintf(release, sizeof(release), "%1d.%04dM0", (unsigned int)writer->version % 10, 101);
413 memcpy(header_end.release, release, sizeof(header_end.release));
414
415 retval = readstat_write_bytes(writer, &header_end, sizeof(sas_header_end_t));
416 if (retval != READSTAT_OK)
417 goto cleanup;
418
419 retval = readstat_write_zeros(writer, hinfo->header_size-writer->bytes_written);
420 if (retval != READSTAT_OK)
421 goto cleanup;
422
423 cleanup:
424 return retval;
425 }
426
sas_header_info_init(readstat_writer_t * writer,int is_64bit)427 sas_header_info_t *sas_header_info_init(readstat_writer_t *writer, int is_64bit) {
428 sas_header_info_t *hinfo = calloc(1, sizeof(sas_header_info_t));
429 hinfo->creation_time = writer->timestamp;
430 hinfo->modification_time = writer->timestamp;
431 hinfo->page_size = SAS_DEFAULT_PAGE_SIZE;
432 hinfo->u64 = !!is_64bit;
433
434 if (hinfo->u64) {
435 hinfo->header_size = SAS_FILE_HEADER_SIZE_64BIT;
436 hinfo->page_header_size = SAS_PAGE_HEADER_SIZE_64BIT;
437 hinfo->subheader_pointer_size = SAS_SUBHEADER_POINTER_SIZE_64BIT;
438 } else {
439 hinfo->header_size = SAS_FILE_HEADER_SIZE_32BIT;
440 hinfo->page_header_size = SAS_PAGE_HEADER_SIZE_32BIT;
441 hinfo->subheader_pointer_size = SAS_SUBHEADER_POINTER_SIZE_32BIT;
442 }
443
444 return hinfo;
445 }
446
sas_fill_page(readstat_writer_t * writer,sas_header_info_t * hinfo)447 readstat_error_t sas_fill_page(readstat_writer_t *writer, sas_header_info_t *hinfo) {
448 if ((writer->bytes_written - hinfo->header_size) % hinfo->page_size) {
449 size_t num_zeros = (hinfo->page_size -
450 (writer->bytes_written - hinfo->header_size) % hinfo->page_size);
451 return readstat_write_zeros(writer, num_zeros);
452 }
453 return READSTAT_OK;
454 }
455
sas_validate_name(const char * name,size_t max_len)456 readstat_error_t sas_validate_name(const char *name, size_t max_len) {
457 int j;
458 for (j=0; name[j]; j++) {
459 if (name[j] != '_' &&
460 !(name[j] >= 'a' && name[j] <= 'z') &&
461 !(name[j] >= 'A' && name[j] <= 'Z') &&
462 !(name[j] >= '0' && name[j] <= '9')) {
463 return READSTAT_ERROR_NAME_CONTAINS_ILLEGAL_CHARACTER;
464 }
465 }
466 char first_char = name[0];
467
468 if (!first_char)
469 return READSTAT_ERROR_NAME_IS_ZERO_LENGTH;
470
471 if (first_char != '_' &&
472 !(first_char >= 'a' && first_char <= 'z') &&
473 !(first_char >= 'A' && first_char <= 'Z')) {
474 return READSTAT_ERROR_NAME_BEGINS_WITH_ILLEGAL_CHARACTER;
475 }
476 if (strcmp(name, "_N_") == 0 || strcmp(name, "_ERROR_") == 0 ||
477 strcmp(name, "_NUMERIC_") == 0 || strcmp(name, "_CHARACTER_") == 0 ||
478 strcmp(name, "_ALL_") == 0) {
479 return READSTAT_ERROR_NAME_IS_RESERVED_WORD;
480 }
481
482 if (strlen(name) > max_len)
483 return READSTAT_ERROR_NAME_IS_TOO_LONG;
484
485 return READSTAT_OK;
486 }
487
sas_validate_variable(const readstat_variable_t * variable)488 readstat_error_t sas_validate_variable(const readstat_variable_t *variable) {
489 return sas_validate_name(readstat_variable_get_name(variable), 32);
490 }
491
sas_validate_tag(char tag)492 readstat_error_t sas_validate_tag(char tag) {
493 if (tag == '_' || (tag >= 'A' && tag <= 'Z'))
494 return READSTAT_OK;
495
496 return READSTAT_ERROR_TAGGED_VALUE_IS_OUT_OF_RANGE;
497 }
498
/*
 * Record on `value` which kind of missing value the raw `tag` byte denotes.
 *
 * Two encodings of the tag are accepted:
 *   - the tag is already an ASCII character ('_' or 'A'..'Z'), with an
 *     ASCII period standing for system-missing;
 *   - the tag is a small index: 0 means '_', 1 means system-missing, and
 *     2..27 map to 'A'..'Z'.
 *
 * A tag that resolves to '_' or an uppercase letter marks the value as
 * tagged-missing with that letter; every other tag marks it system-missing.
 */
void sas_assign_tag(readstat_value_t *value, uint8_t tag) {
    uint8_t letter = tag;

    /* Translate the index form into its ASCII equivalent */
    if (tag == 0) {
        letter = '_';
    } else if (tag >= 2 && tag < 28) {
        letter = 'A' + (tag - 2);
    }

    if (sas_validate_tag(letter) != READSTAT_OK) {
        value->tag = 0;
        value->is_system_missing = 1;
        return;
    }

    value->tag = letter;
    value->is_tagged_missing = 1;
}
519