1 
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <errno.h>
5 #include <string.h>
6 #include <math.h>
7 #include <time.h>
8 #include <limits.h>
9 #include <inttypes.h>
10 
11 #include "readstat_sas.h"
12 #include "../readstat_iconv.h"
13 #include "../readstat_convert.h"
14 #include "../readstat_writer.h"
15 
/* File header sizes, in bytes, assigned by sas_header_info_init() for the
 * 32-bit and 64-bit layouts respectively. */
#define SAS_FILE_HEADER_SIZE_32BIT 1024
#define SAS_FILE_HEADER_SIZE_64BIT 8192
/* Page size assigned by sas_header_info_init() to newly created header info. */
#define SAS_DEFAULT_PAGE_SIZE      4096

/* Encoding name used for charset codes 0 and 204 ("any") in _charset_table. */
#define SAS_DEFAULT_STRING_ENCODING "WINDOWS-1252"
21 
/* 32-byte signature that sas_read_header() compares against the start of the
 * file header. It differs from sas7bcat_magic_number only in the byte at
 * offset 15 (0x60 here). */
unsigned char sas7bdat_magic_number[32] = {
    /* offset  0 */ 0x00, 0x00, 0x00, 0x00,
    /* offset  4 */ 0x00, 0x00, 0x00, 0x00,
    /* offset  8 */ 0x00, 0x00, 0x00, 0x00,
    /* offset 12 */ 0xc2, 0xea, 0x81, 0x60,
    /* offset 16 */ 0xb3, 0x14, 0x11, 0xcf,
    /* offset 20 */ 0xbd, 0x92, 0x08, 0x00,
    /* offset 24 */ 0x09, 0xc7, 0x31, 0x8c,
    /* offset 28 */ 0x18, 0x1f, 0x10, 0x11
};
28 
/* 32-byte signature that sas_read_header() accepts as an alternative to
 * sas7bdat_magic_number. It differs only in the byte at offset 15
 * (0x63 here). */
unsigned char sas7bcat_magic_number[32] = {
    /* offset  0 */ 0x00, 0x00, 0x00, 0x00,
    /* offset  4 */ 0x00, 0x00, 0x00, 0x00,
    /* offset  8 */ 0x00, 0x00, 0x00, 0x00,
    /* offset 12 */ 0xc2, 0xea, 0x81, 0x63,
    /* offset 16 */ 0xb3, 0x14, 0x11, 0xcf,
    /* offset 20 */ 0xbd, 0x92, 0x08, 0x00,
    /* offset 24 */ 0x09, 0xc7, 0x31, 0x8c,
    /* offset 28 */ 0x18, 0x1f, 0x10, 0x11
};
35 
/* This table is cobbled together from extant files and:
 * https://support.sas.com/documentation/cdl/en/nlsref/61893/HTML/default/viewer.htm#a002607278.htm
 * https://support.sas.com/documentation/onlinedoc/dfdmstudio/2.6/dmpdmsug/Content/dfU_Encodings_SAS.html
 *
 * Discrepancies from the official documentation are noted with a comment. It
 * appears that in some instances SAS software uses a newer encoding than
 * what's listed in the docs. In these cases the encoding used by ReadStat
 * represents the author's best guess.
 *
 * Each entry maps the numeric encoding code found in the file header to an
 * encoding name. sas_read_header() scans the table front to back and uses the
 * name of the first entry whose code matches.
 */
static readstat_charset_entry_t _charset_table[] = {
    { .code = 0,     .name = SAS_DEFAULT_STRING_ENCODING },
    { .code = 20,    .name = "UTF-8" },
    { .code = 28,    .name = "US-ASCII" },
    { .code = 29,    .name = "ISO-8859-1" },
    { .code = 30,    .name = "ISO-8859-2" },
    { .code = 31,    .name = "ISO-8859-3" },
    { .code = 32,    .name = "ISO-8859-4" },
    { .code = 33,    .name = "ISO-8859-5" },
    { .code = 34,    .name = "ISO-8859-6" },
    { .code = 35,    .name = "ISO-8859-7" },
    { .code = 36,    .name = "ISO-8859-8" },
    { .code = 37,    .name = "ISO-8859-9" },
    { .code = 39,    .name = "ISO-8859-11" },
    { .code = 40,    .name = "ISO-8859-15" },
    { .code = 41,    .name = "CP437" },
    { .code = 42,    .name = "CP850" },
    { .code = 43,    .name = "CP852" },
    { .code = 44,    .name = "CP857" },
    { .code = 45,    .name = "CP858" },
    { .code = 46,    .name = "CP862" },
    { .code = 47,    .name = "CP864" },
    { .code = 48,    .name = "CP865" },
    { .code = 49,    .name = "CP866" },
    { .code = 50,    .name = "CP869" },
    { .code = 51,    .name = "CP874" },
    { .code = 52,    .name = "CP921" },
    { .code = 53,    .name = "CP922" },
    { .code = 54,    .name = "CP1129" },
    { .code = 55,    .name = "CP720" },
    { .code = 56,    .name = "CP737" },
    { .code = 57,    .name = "CP775" },
    { .code = 58,    .name = "CP860" },
    { .code = 59,    .name = "CP863" },
    { .code = 60,    .name = "WINDOWS-1250" },
    { .code = 61,    .name = "WINDOWS-1251" },
    { .code = 62,    .name = "WINDOWS-1252" },
    { .code = 63,    .name = "WINDOWS-1253" },
    { .code = 64,    .name = "WINDOWS-1254" },
    { .code = 65,    .name = "WINDOWS-1255" },
    { .code = 66,    .name = "WINDOWS-1256" },
    { .code = 67,    .name = "WINDOWS-1257" },
    { .code = 68,    .name = "WINDOWS-1258" },
    { .code = 69,    .name = "MACROMAN" },
    { .code = 70,    .name = "MACARABIC" },
    { .code = 71,    .name = "MACHEBREW" },
    { .code = 72,    .name = "MACGREEK" },
    { .code = 73,    .name = "MACTHAI" },
    { .code = 75,    .name = "MACTURKISH" },
    { .code = 76,    .name = "MACUKRAINE" },
    { .code = 118,   .name = "CP950" },
    { .code = 119,   .name = "EUC-TW" },
    { .code = 123,   .name = "BIG-5" },
    { .code = 125,   .name = "GB18030" }, // "euc-cn" in SAS
    { .code = 126,   .name = "WINDOWS-936" }, // "zwin"
    { .code = 128,   .name = "CP1381" }, // "zpce"
    { .code = 134,   .name = "EUC-JP" },
    { .code = 136,   .name = "CP949" },
    { .code = 137,   .name = "CP942" },
    { .code = 138,   .name = "CP932" }, // "shift-jis" in SAS
    { .code = 140,   .name = "EUC-KR" },
    { .code = 141,   .name = "CP949" }, // "kpce"
    { .code = 142,   .name = "CP949" }, // "kwin"
    { .code = 163,   .name = "MACICELAND" },
    { .code = 167,   .name = "ISO-2022-JP" },
    { .code = 168,   .name = "ISO-2022-KR" },
    { .code = 169,   .name = "ISO-2022-CN" },
    { .code = 172,   .name = "ISO-2022-CN-EXT" },
    { .code = 204,   .name = SAS_DEFAULT_STRING_ENCODING }, // "any" in SAS
    { .code = 205,   .name = "GB18030" },
    { .code = 227,   .name = "ISO-8859-14" },
    { .code = 242,   .name = "ISO-8859-13" },
    { .code = 245,   .name = "MACCROATIAN" },
    { .code = 246,   .name = "MACCYRILLIC" },
    { .code = 247,   .name = "MACROMANIA" },
    { .code = 248,   .name = "SHIFT_JISX0213" },
};
122 
/* Returns the offset, in seconds, of the SAS epoch (01-01-1960) relative to
 * the Unix epoch (01-01-1970). The result is negative because 1960 precedes
 * 1970. */
static time_t sas_epoch(void) {
    const int days_1960_to_1970 = 3653;
    const int seconds_per_day = 86400;

    return -(days_1960_to_1970 * seconds_per_day);
}
126 
/* Converts a timestamp stored as a double into a time_t.
 *
 * time:  number of seconds, as read from the file
 * epoch: offset added to `time` before conversion (see sas_epoch())
 *
 * Returns 0 when the shifted value is NaN. Values outside the range of a
 * long are clamped to LONG_MAX / LONG_MIN. Everything else is truncated
 * toward zero by the conversion to time_t.
 */
static time_t sas_convert_time(double time, time_t epoch) {
    time += epoch;
    if (isnan(time))
        return 0;
    /* (double)LONG_MAX is not exactly representable when long is 64 bits wide
     * and rounds up to 2^63, one past LONG_MAX. Clamp with >= so that a value
     * equal to that bound is never handed to the double-to-integer conversion
     * below, where an out-of-range value is undefined behavior. */
    if (time >= (double)LONG_MAX)
        return LONG_MAX;
    /* (double)LONG_MIN is exact (a power of two), so a strict < is enough. */
    if (time < (double)LONG_MIN)
        return LONG_MIN;
    return (time_t)time;
}
137 
/* Reads a 64-bit unsigned integer from the 8 bytes at `data`. The bytes are
 * copied with memcpy, so `data` does not need to be aligned. When `bswap` is
 * nonzero the value is passed through byteswap8() before being returned. */
uint64_t sas_read8(const char *data, int bswap) {
    uint64_t value;

    memcpy(&value, data, 8);
    if (bswap)
        return byteswap8(value);
    return value;
}
143 
/* Reads a 32-bit unsigned integer from the 4 bytes at `data`. The bytes are
 * copied with memcpy, so `data` does not need to be aligned. When `bswap` is
 * nonzero the value is passed through byteswap4() before being returned. */
uint32_t sas_read4(const char *data, int bswap) {
    uint32_t value;

    memcpy(&value, data, 4);
    if (bswap)
        return byteswap4(value);
    return value;
}
149 
/* Reads a 16-bit unsigned integer from the 2 bytes at `data`. The bytes are
 * copied with memcpy, so `data` does not need to be aligned. When `bswap` is
 * nonzero the value is passed through byteswap2() before being returned. */
uint16_t sas_read2(const char *data, int bswap) {
    uint16_t value;

    memcpy(&value, data, 2);
    if (bswap)
        return byteswap2(value);
    return value;
}
155 
/* Returns `len` reduced by a fixed overhead of 4 + 2*signature_len.
 * The arithmetic is unsigned, so the result wraps around if the overhead
 * exceeds `len`. */
size_t sas_subheader_remainder(size_t len, size_t signature_len) {
    const size_t overhead = 2 * signature_len + 4;

    return len - overhead;
}
159 
sas_read_header(readstat_io_t * io,sas_header_info_t * hinfo,readstat_error_handler error_handler,void * user_ctx)160 readstat_error_t sas_read_header(readstat_io_t *io, sas_header_info_t *hinfo,
161         readstat_error_handler error_handler, void *user_ctx) {
162     sas_header_start_t  header_start;
163     sas_header_end_t    header_end;
164     int retval = READSTAT_OK;
165     char error_buf[1024];
166     time_t epoch = sas_epoch();
167 
168     if (io->read(&header_start, sizeof(sas_header_start_t), io->io_ctx) < sizeof(sas_header_start_t)) {
169         retval = READSTAT_ERROR_READ;
170         goto cleanup;
171     }
172     if (memcmp(header_start.magic, sas7bdat_magic_number, sizeof(sas7bdat_magic_number)) != 0 &&
173             memcmp(header_start.magic, sas7bcat_magic_number, sizeof(sas7bcat_magic_number)) != 0) {
174         retval = READSTAT_ERROR_PARSE;
175         goto cleanup;
176     }
177     if (header_start.a1 == SAS_ALIGNMENT_OFFSET_4) {
178         hinfo->pad1 = 4;
179     }
180     if (header_start.a2 == SAS_ALIGNMENT_OFFSET_4) {
181         hinfo->u64 = 1;
182     }
183     int bswap = 0;
184     if (header_start.endian == SAS_ENDIAN_BIG) {
185         bswap = machine_is_little_endian();
186         hinfo->little_endian = 0;
187     } else if (header_start.endian == SAS_ENDIAN_LITTLE) {
188         bswap = !machine_is_little_endian();
189         hinfo->little_endian = 1;
190     } else {
191         retval = READSTAT_ERROR_PARSE;
192         goto cleanup;
193     }
194     int i;
195     for (i=0; i<sizeof(_charset_table)/sizeof(_charset_table[0]); i++) {
196         if (header_start.encoding == _charset_table[i].code) {
197             hinfo->encoding = _charset_table[i].name;
198             break;
199         }
200     }
201     if (hinfo->encoding == NULL) {
202         if (error_handler) {
203             snprintf(error_buf, sizeof(error_buf), "Unsupported character set code: %d", header_start.encoding);
204             error_handler(error_buf, user_ctx);
205         }
206         retval = READSTAT_ERROR_UNSUPPORTED_CHARSET;
207         goto cleanup;
208     }
209     memcpy(hinfo->table_name, header_start.table_name, sizeof(header_start.table_name));
210     if (io->seek(hinfo->pad1, READSTAT_SEEK_CUR, io->io_ctx) == -1) {
211         retval = READSTAT_ERROR_SEEK;
212         goto cleanup;
213     }
214 
215     double creation_time, modification_time;
216 
217     if (io->read(&creation_time, sizeof(double), io->io_ctx) < sizeof(double)) {
218         retval = READSTAT_ERROR_READ;
219         goto cleanup;
220     }
221     if (bswap)
222         creation_time = byteswap_double(creation_time);
223 
224     if (io->read(&modification_time, sizeof(double), io->io_ctx) < sizeof(double)) {
225         retval = READSTAT_ERROR_READ;
226         goto cleanup;
227     }
228     if (bswap)
229         modification_time = byteswap_double(modification_time);
230 
231     hinfo->creation_time = sas_convert_time(creation_time, epoch);
232     hinfo->modification_time = sas_convert_time(modification_time, epoch);
233 
234     if (io->seek(16, READSTAT_SEEK_CUR, io->io_ctx) == -1) {
235         retval = READSTAT_ERROR_SEEK;
236         goto cleanup;
237     }
238 
239     uint32_t header_size, page_size;
240 
241     if (io->read(&header_size, sizeof(uint32_t), io->io_ctx) < sizeof(uint32_t)) {
242         retval = READSTAT_ERROR_READ;
243         goto cleanup;
244     }
245     if (io->read(&page_size, sizeof(uint32_t), io->io_ctx) < sizeof(uint32_t)) {
246         retval = READSTAT_ERROR_READ;
247         goto cleanup;
248     }
249 
250     hinfo->header_size = bswap ? byteswap4(header_size) : header_size;
251     hinfo->page_size = bswap ? byteswap4(page_size) : page_size;
252 
253     if (hinfo->header_size < 1024 || hinfo->page_size < 1024) {
254         retval = READSTAT_ERROR_PARSE;
255         goto cleanup;
256     }
257     if (hinfo->header_size > (1<<24) || hinfo->page_size > (1<<24)) {
258         retval = READSTAT_ERROR_PARSE;
259         goto cleanup;
260     }
261 
262     if (hinfo->u64) {
263         hinfo->page_header_size = SAS_PAGE_HEADER_SIZE_64BIT;
264         hinfo->subheader_pointer_size = SAS_SUBHEADER_POINTER_SIZE_64BIT;
265     } else {
266         hinfo->page_header_size = SAS_PAGE_HEADER_SIZE_32BIT;
267         hinfo->subheader_pointer_size = SAS_SUBHEADER_POINTER_SIZE_32BIT;
268     }
269 
270     if (hinfo->u64) {
271         uint64_t page_count;
272         if (io->read(&page_count, sizeof(uint64_t), io->io_ctx) < sizeof(uint64_t)) {
273             retval = READSTAT_ERROR_READ;
274             goto cleanup;
275         }
276         hinfo->page_count = bswap ? byteswap8(page_count) : page_count;
277     } else {
278         uint32_t page_count;
279         if (io->read(&page_count, sizeof(uint32_t), io->io_ctx) < sizeof(uint32_t)) {
280             retval = READSTAT_ERROR_READ;
281             goto cleanup;
282         }
283         hinfo->page_count = bswap ? byteswap4(page_count) : page_count;
284     }
285     if (hinfo->page_count > (1<<24)) {
286         retval = READSTAT_ERROR_PARSE;
287         goto cleanup;
288     }
289 
290     if (io->seek(8, READSTAT_SEEK_CUR, io->io_ctx) == -1) {
291         retval = READSTAT_ERROR_SEEK;
292         if (error_handler) {
293             snprintf(error_buf, sizeof(error_buf), "ReadStat: Failed to seek forward by %d", 8);
294             error_handler(error_buf, user_ctx);
295         }
296         goto cleanup;
297     }
298     if (io->read(&header_end, sizeof(sas_header_end_t), io->io_ctx) < sizeof(sas_header_end_t)) {
299         retval = READSTAT_ERROR_READ;
300         goto cleanup;
301     }
302     char major;
303     int minor, revision;
304     if (sscanf(header_end.release, "%c.%04dM%1d", &major, &minor, &revision) != 3) {
305         retval = READSTAT_ERROR_PARSE;
306         goto cleanup;
307     }
308 
309     if (major >= '1' && major <= '9') {
310         hinfo->major_version = major - '0';
311     } else if (major == 'V') {
312         // It appears that SAS Visual Forecaster reports the major version as "V"
313         // Treat it as version 9 for all intents and purposes
314         hinfo->major_version = 9;
315     } else {
316         retval = READSTAT_ERROR_PARSE;
317         goto cleanup;
318     }
319     hinfo->minor_version = minor;
320     hinfo->revision = revision;
321 
322     if ((major == '8' || major == '9') && minor == 0 && revision == 0) {
323         /* A bit of a hack, but most SAS installations are running a minor update */
324         hinfo->vendor = READSTAT_VENDOR_STAT_TRANSFER;
325     } else {
326         hinfo->vendor = READSTAT_VENDOR_SAS;
327     }
328     if (io->seek(hinfo->header_size, READSTAT_SEEK_SET, io->io_ctx) == -1) {
329         retval = READSTAT_ERROR_SEEK;
330         if (error_handler) {
331             snprintf(error_buf, sizeof(error_buf),
332                     "ReadStat: Failed to seek to position %" PRId64, hinfo->header_size);
333             error_handler(error_buf, user_ctx);
334         }
335         goto cleanup;
336     }
337 
338 cleanup:
339     return retval;
340 }
341 
sas_write_header(readstat_writer_t * writer,sas_header_info_t * hinfo,sas_header_start_t header_start)342 readstat_error_t sas_write_header(readstat_writer_t *writer, sas_header_info_t *hinfo, sas_header_start_t header_start) {
343     readstat_error_t retval = READSTAT_OK;
344     time_t epoch = sas_epoch();
345 
346     memset(header_start.table_name, ' ', sizeof(header_start.table_name));
347 
348     size_t table_name_len = strlen(writer->table_name);
349     if (table_name_len > sizeof(header_start.table_name))
350         table_name_len = sizeof(header_start.table_name);
351 
352     if (table_name_len) {
353         memcpy(header_start.table_name, writer->table_name, table_name_len);
354     } else {
355         memcpy(header_start.table_name, "DATASET", sizeof("DATASET")-1);
356     }
357 
358     retval = readstat_write_bytes(writer, &header_start, sizeof(sas_header_start_t));
359     if (retval != READSTAT_OK)
360         goto cleanup;
361 
362     retval = readstat_write_zeros(writer, hinfo->pad1);
363     if (retval != READSTAT_OK)
364         goto cleanup;
365 
366     double creation_time = hinfo->creation_time - epoch;
367 
368     retval = readstat_write_bytes(writer, &creation_time, sizeof(double));
369     if (retval != READSTAT_OK)
370         goto cleanup;
371 
372     double modification_time = hinfo->modification_time - epoch;
373 
374     retval = readstat_write_bytes(writer, &modification_time, sizeof(double));
375     if (retval != READSTAT_OK)
376         goto cleanup;
377 
378     retval = readstat_write_zeros(writer, 16);
379     if (retval != READSTAT_OK)
380         goto cleanup;
381 
382     uint32_t header_size = hinfo->header_size;
383     uint32_t page_size = hinfo->page_size;
384 
385     retval = readstat_write_bytes(writer, &header_size, sizeof(uint32_t));
386     if (retval != READSTAT_OK)
387         goto cleanup;
388 
389     retval = readstat_write_bytes(writer, &page_size, sizeof(uint32_t));
390     if (retval != READSTAT_OK)
391         goto cleanup;
392 
393     if (hinfo->u64) {
394         uint64_t page_count = hinfo->page_count;
395         retval = readstat_write_bytes(writer, &page_count, sizeof(uint64_t));
396     } else {
397         uint32_t page_count = hinfo->page_count;
398         retval = readstat_write_bytes(writer, &page_count, sizeof(uint32_t));
399     }
400     if (retval != READSTAT_OK)
401         goto cleanup;
402 
403     retval = readstat_write_zeros(writer, 8);
404     if (retval != READSTAT_OK)
405         goto cleanup;
406 
407     sas_header_end_t header_end = {
408         .host = "9.0401M6Linux"
409     };
410 
411     char release[sizeof(header_end.release)+1] = { 0 };
412     snprintf(release, sizeof(release), "%1d.%04dM0", (unsigned int)writer->version % 10, 101);
413     memcpy(header_end.release, release, sizeof(header_end.release));
414 
415     retval = readstat_write_bytes(writer, &header_end, sizeof(sas_header_end_t));
416     if (retval != READSTAT_OK)
417         goto cleanup;
418 
419     retval = readstat_write_zeros(writer, hinfo->header_size-writer->bytes_written);
420     if (retval != READSTAT_OK)
421         goto cleanup;
422 
423 cleanup:
424     return retval;
425 }
426 
sas_header_info_init(readstat_writer_t * writer,int is_64bit)427 sas_header_info_t *sas_header_info_init(readstat_writer_t *writer, int is_64bit) {
428     sas_header_info_t *hinfo = calloc(1, sizeof(sas_header_info_t));
429     hinfo->creation_time = writer->timestamp;
430     hinfo->modification_time = writer->timestamp;
431     hinfo->page_size = SAS_DEFAULT_PAGE_SIZE;
432     hinfo->u64 = !!is_64bit;
433 
434     if (hinfo->u64) {
435         hinfo->header_size = SAS_FILE_HEADER_SIZE_64BIT;
436         hinfo->page_header_size = SAS_PAGE_HEADER_SIZE_64BIT;
437         hinfo->subheader_pointer_size = SAS_SUBHEADER_POINTER_SIZE_64BIT;
438     } else {
439         hinfo->header_size = SAS_FILE_HEADER_SIZE_32BIT;
440         hinfo->page_header_size = SAS_PAGE_HEADER_SIZE_32BIT;
441         hinfo->subheader_pointer_size = SAS_SUBHEADER_POINTER_SIZE_32BIT;
442     }
443 
444     return hinfo;
445 }
446 
sas_fill_page(readstat_writer_t * writer,sas_header_info_t * hinfo)447 readstat_error_t sas_fill_page(readstat_writer_t *writer, sas_header_info_t *hinfo) {
448     if ((writer->bytes_written - hinfo->header_size) % hinfo->page_size) {
449         size_t num_zeros = (hinfo->page_size -
450                 (writer->bytes_written - hinfo->header_size) % hinfo->page_size);
451         return readstat_write_zeros(writer, num_zeros);
452     }
453     return READSTAT_OK;
454 }
455 
sas_validate_name(const char * name,size_t max_len)456 readstat_error_t sas_validate_name(const char *name, size_t max_len) {
457     int j;
458     for (j=0; name[j]; j++) {
459         if (name[j] != '_' &&
460                 !(name[j] >= 'a' && name[j] <= 'z') &&
461                 !(name[j] >= 'A' && name[j] <= 'Z') &&
462                 !(name[j] >= '0' && name[j] <= '9')) {
463             return READSTAT_ERROR_NAME_CONTAINS_ILLEGAL_CHARACTER;
464         }
465     }
466     char first_char = name[0];
467 
468     if (!first_char)
469         return READSTAT_ERROR_NAME_IS_ZERO_LENGTH;
470 
471     if (first_char != '_' &&
472             !(first_char >= 'a' && first_char <= 'z') &&
473             !(first_char >= 'A' && first_char <= 'Z')) {
474         return READSTAT_ERROR_NAME_BEGINS_WITH_ILLEGAL_CHARACTER;
475     }
476     if (strcmp(name, "_N_") == 0 || strcmp(name, "_ERROR_") == 0 ||
477             strcmp(name, "_NUMERIC_") == 0 || strcmp(name, "_CHARACTER_") == 0 ||
478             strcmp(name, "_ALL_") == 0) {
479         return READSTAT_ERROR_NAME_IS_RESERVED_WORD;
480     }
481 
482     if (strlen(name) > max_len)
483         return READSTAT_ERROR_NAME_IS_TOO_LONG;
484 
485     return READSTAT_OK;
486 }
487 
sas_validate_variable(const readstat_variable_t * variable)488 readstat_error_t sas_validate_variable(const readstat_variable_t *variable) {
489     return sas_validate_name(readstat_variable_get_name(variable), 32);
490 }
491 
sas_validate_tag(char tag)492 readstat_error_t sas_validate_tag(char tag) {
493     if (tag == '_' || (tag >= 'A' && tag <= 'Z'))
494         return READSTAT_OK;
495 
496     return READSTAT_ERROR_TAGGED_VALUE_IS_OUT_OF_RANGE;
497 }
498 
/* Records a missing-value tag on `value`.
 *
 * Two encodings of `tag` are accepted:
 *   - an ASCII code ('_' or 'A'..'Z'); any other character, including the
 *     period used for system-missing, is treated as system-missing
 *   - a small integer: 0 stands for '_', and 2..27 stand for 'A'..'Z'
 *     (as an offset of tag-2 from 'A'); 1 is left unchanged and ends up
 *     as system-missing
 *
 * A tag that validates sets value->tag and value->is_tagged_missing.
 * Anything else sets value->tag to 0 and value->is_system_missing.
 */
void sas_assign_tag(readstat_value_t *value, uint8_t tag) {
    uint8_t ascii_tag = tag;

    /* Translate the integer scheme into the ASCII scheme. */
    if (tag == 0) {
        ascii_tag = '_';
    } else if (tag >= 2 && tag < 28) {
        ascii_tag = 'A' + (tag - 2);
    }

    if (sas_validate_tag(ascii_tag) != READSTAT_OK) {
        value->tag = 0;
        value->is_system_missing = 1;
        return;
    }

    value->tag = ascii_tag;
    value->is_tagged_missing = 1;
}
519