1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
3
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
16
17 #include <config.h>
18
19 #include "data-in.h"
20
21 #include <ctype.h>
22 #include <errno.h>
23 #include <limits.h>
24 #include <math.h>
25 #include <stdarg.h>
26 #include <stdbool.h>
27 #include <stddef.h>
28 #include <stdint.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31
32 #include "calendar.h"
33 #include "dictionary.h"
34 #include "format.h"
35 #include "identifier.h"
36 #include "libpspp/assertion.h"
37 #include "libpspp/compiler.h"
38 #include "libpspp/i18n.h"
39 #include "libpspp/integer-format.h"
40 #include "libpspp/misc.h"
41 #include "libpspp/str.h"
42 #include "settings.h"
43 #include "value.h"
44
45 #include "gl/c-ctype.h"
46 #include "gl/c-strtod.h"
47 #include "gl/minmax.h"
48 #include "gl/xalloc.h"
49
50 #include "gettext.h"
51 #define _(msgid) gettext (msgid)
52
/* Information about parsing one data field.

   data_in() fills one of these in and hands it to the parser for the
   requested format.  Parsers consume INPUT as they go and store their result
   through OUTPUT. */
struct data_in
  {
    struct substring input;     /* Source. */
    enum fmt_type format;       /* Input format. */

    union value *output;        /* Destination. */
    int width;                  /* Output width. */
  };
62
/* Type of a per-format parser.  A parser returns NULL on success, otherwise
   an error string, which data_in() hands back to its own caller. */
typedef char *data_in_parser_func (struct data_in *);

/* Declare one parse_METHOD() function for each FMT entry in format.def. */
#define FMT(NAME, METHOD, IMIN, OMIN, IO, CATEGORY)     \
        static data_in_parser_func parse_##METHOD;
#include "format.def"

static void default_result (struct data_in *);
static bool trim_spaces_and_check_missing (struct data_in *);

static int hexit_value (int c);
72
73 /* Parses the characters in INPUT, which are encoded in the given
74 INPUT_ENCODING, according to FORMAT.
75
76 Stores the parsed representation in OUTPUT, which the caller must have
77 initialized with the given WIDTH (0 for a numeric field, otherwise the
78 string width). If FORMAT is FMT_A, then OUTPUT_ENCODING must specify the
79 correct encoding for OUTPUT (normally obtained via dict_get_encoding()).
80
81 If successful NULL is the return value. Otherwise a string describing
82 the problem is returned. The caller must free this string.
83 */
84 char *
data_in(struct substring input,const char * input_encoding,enum fmt_type format,union value * output,int width,const char * output_encoding)85 data_in (struct substring input, const char *input_encoding,
86 enum fmt_type format,
87 union value *output, int width, const char *output_encoding)
88 {
89 static data_in_parser_func *const handlers[FMT_NUMBER_OF_FORMATS] =
90 {
91 #define FMT(NAME, METHOD, IMIN, OMIN, IO, CATEGORY) parse_##METHOD,
92 #include "format.def"
93 };
94
95 struct data_in i;
96
97 enum fmt_category cat;
98 const char *dest_encoding;
99 char *s;
100 char *error;
101
102 assert ((width != 0) == fmt_is_string (format));
103
104 i.format = format;
105
106 i.output = output;
107 i.width = width;
108
109 if (ss_is_empty (input))
110 {
111 default_result (&i);
112 return NULL;
113 }
114
115 cat = fmt_get_category (format);
116 if (cat & (FMT_CAT_BASIC | FMT_CAT_HEXADECIMAL | FMT_CAT_CUSTOM
117 | FMT_CAT_DATE | FMT_CAT_TIME | FMT_CAT_DATE_COMPONENT))
118 {
119 /* We're going to parse these into numbers. For this purpose we want to
120 deal with them in the local "C" encoding. Any character not in that
121 encoding wouldn't be valid anyhow. */
122 dest_encoding = C_ENCODING;
123 }
124 else if (cat & (FMT_CAT_BINARY | FMT_CAT_LEGACY))
125 {
126 /* Don't recode these binary formats at all, since they are not text. */
127 dest_encoding = NULL;
128 }
129 else
130 {
131 assert (cat == FMT_CAT_STRING);
132 if (format == FMT_AHEX)
133 {
134 /* We want the hex digits in the local "C" encoding, even though the
135 result may not be in that encoding. */
136 dest_encoding = C_ENCODING;
137 }
138 else
139 {
140 /* Use the final output encoding. */
141 dest_encoding = output_encoding;
142 }
143 }
144
145 if (dest_encoding != NULL)
146 {
147 i.input = recode_substring_pool (dest_encoding, input_encoding, input,
148 NULL);
149 s = i.input.string;
150 }
151 else
152 {
153 i.input = input;
154 s = NULL;
155 }
156
157 error = handlers[i.format] (&i);
158 if (error != NULL)
159 default_result (&i);
160
161 free (s);
162
163 return error;
164 }
165
166 bool
data_in_msg(struct substring input,const char * input_encoding,enum fmt_type format,union value * output,int width,const char * output_encoding)167 data_in_msg (struct substring input, const char *input_encoding,
168 enum fmt_type format,
169 union value *output, int width, const char *output_encoding)
170 {
171 char *error = data_in (input, input_encoding, format,
172 output, width, output_encoding);
173 if (error != NULL)
174 {
175 msg (SW, _("Data is not valid as format %s: %s"),
176 fmt_name (format), error);
177 free (error);
178 return false;
179 }
180 else
181 return true;
182 }
183
184 static bool
number_has_implied_decimals(const char * s,enum fmt_type type)185 number_has_implied_decimals (const char *s, enum fmt_type type)
186 {
187 int decimal = settings_get_style (type)->decimal;
188 bool got_digit = false;
189 for (;;)
190 {
191 switch (*s)
192 {
193 case '0': case '1': case '2': case '3': case '4':
194 case '5': case '6': case '7': case '8': case '9':
195 got_digit = true;
196 break;
197
198 case '+': case '-':
199 if (got_digit)
200 return false;
201 break;
202
203 case 'e': case 'E': case 'd': case 'D':
204 return false;
205
206 case '.': case ',':
207 if (*s == decimal)
208 return false;
209 break;
210
211 case '\0':
212 return true;
213
214 default:
215 break;
216 }
217
218 s++;
219 }
220 }
221
222 static bool
has_implied_decimals(struct substring input,const char * input_encoding,enum fmt_type format)223 has_implied_decimals (struct substring input, const char *input_encoding,
224 enum fmt_type format)
225 {
226 bool retval;
227 char *s;
228
229 switch (format)
230 {
231 case FMT_F:
232 case FMT_COMMA:
233 case FMT_DOT:
234 case FMT_DOLLAR:
235 case FMT_PCT:
236 case FMT_E:
237 case FMT_Z:
238 break;
239
240 case FMT_N:
241 case FMT_IB:
242 case FMT_PIB:
243 case FMT_P:
244 case FMT_PK:
245 return true;
246
247 default:
248 return false;
249 }
250
251 s = recode_string (C_ENCODING, input_encoding,
252 ss_data (input), ss_length (input));
253 retval = (format == FMT_Z
254 ? strchr (s, '.') == NULL
255 : number_has_implied_decimals (s, format));
256 free (s);
257
258 return retval;
259 }
260
261 /* In some cases, when no decimal point is explicitly included in numeric
262 input, its position is implied by the number of decimal places in the input
263 format. In such a case, this function may be called just after data_in().
264 Its arguments are a subset of that function's arguments plus D, the number
265 of decimal places associated with FORMAT.
266
267 If it is appropriate, this function modifies the numeric value in OUTPUT. */
268 void
data_in_imply_decimals(struct substring input,const char * input_encoding,enum fmt_type format,int d,union value * output)269 data_in_imply_decimals (struct substring input, const char *input_encoding,
270 enum fmt_type format, int d, union value *output)
271 {
272 if (d > 0 && output->f != SYSMIS
273 && has_implied_decimals (input, input_encoding, format))
274 output->f /= pow (10., d);
275 }
276
277 /* Format parsers. */
278
279 /* Parses F, COMMA, DOT, DOLLAR, PCT, and E input formats. */
280 static char *
parse_number(struct data_in * i)281 parse_number (struct data_in *i)
282 {
283 const struct fmt_number_style *style =
284 settings_get_style (i->format);
285
286 struct string tmp;
287
288 int save_errno;
289 char *tail;
290
291 if (fmt_get_category (i->format) == FMT_CAT_CUSTOM)
292 {
293 style = settings_get_style (FMT_F);
294 }
295
296 /* Trim spaces and check for missing value representation. */
297 if (trim_spaces_and_check_missing (i))
298 return NULL;
299
300 ds_init_empty (&tmp);
301 ds_extend (&tmp, 64);
302
303 /* Prefix character may precede sign. */
304 if (style->prefix.s[0] != '\0')
305 {
306 ss_match_byte (&i->input, style->prefix.s[0]);
307 ss_ltrim (&i->input, ss_cstr (CC_SPACES));
308 }
309
310 /* Sign. */
311 if (ss_match_byte (&i->input, '-'))
312 {
313 ds_put_byte (&tmp, '-');
314 ss_ltrim (&i->input, ss_cstr (CC_SPACES));
315 }
316 else
317 {
318 ss_match_byte (&i->input, '+');
319 ss_ltrim (&i->input, ss_cstr (CC_SPACES));
320 }
321
322 /* Prefix character may follow sign. */
323 if (style->prefix.s[0] != '\0')
324 {
325 ss_match_byte (&i->input, style->prefix.s[0]);
326 ss_ltrim (&i->input, ss_cstr (CC_SPACES));
327 }
328
329 /* Digits before decimal point. */
330 while (c_isdigit (ss_first (i->input)))
331 {
332 ds_put_byte (&tmp, ss_get_byte (&i->input));
333 if (style->grouping != 0)
334 ss_match_byte (&i->input, style->grouping);
335 }
336
337 /* Decimal point and following digits. */
338 if (ss_match_byte (&i->input, style->decimal))
339 {
340 ds_put_byte (&tmp, '.');
341 while (c_isdigit (ss_first (i->input)))
342 ds_put_byte (&tmp, ss_get_byte (&i->input));
343 }
344
345 /* Exponent. */
346 if (!ds_is_empty (&tmp)
347 && !ss_is_empty (i->input)
348 && strchr ("eEdD-+", ss_first (i->input)))
349 {
350 ds_put_byte (&tmp, 'e');
351
352 if (strchr ("eEdD", ss_first (i->input)))
353 {
354 ss_advance (&i->input, 1);
355 ss_match_byte (&i->input, ' ');
356 }
357
358 if (ss_first (i->input) == '-' || ss_first (i->input) == '+')
359 {
360 if (ss_get_byte (&i->input) == '-')
361 ds_put_byte (&tmp, '-');
362 ss_match_byte (&i->input, ' ');
363 }
364
365 while (c_isdigit (ss_first (i->input)))
366 ds_put_byte (&tmp, ss_get_byte (&i->input));
367 }
368
369 /* Suffix character. */
370 if (style->suffix.s[0] != '\0')
371 ss_match_byte (&i->input, style->suffix.s[0]);
372
373 if (!ss_is_empty (i->input))
374 {
375 char *error;
376 if (ds_is_empty (&tmp))
377 error = xstrdup (_("Field contents are not numeric."));
378 else
379 error = xstrdup (_("Number followed by garbage."));
380 ds_destroy (&tmp);
381 return error;
382 }
383
384 /* Let c_strtod() do the conversion. */
385 save_errno = errno;
386 errno = 0;
387 i->output->f = c_strtod (ds_cstr (&tmp), &tail);
388 if (*tail != '\0')
389 {
390 errno = save_errno;
391 ds_destroy (&tmp);
392 return xstrdup (_("Invalid numeric syntax."));
393 }
394 else if (errno == ERANGE)
395 {
396 if (fabs (i->output->f) > 1)
397 {
398 i->output->f = SYSMIS;
399 ds_destroy (&tmp);
400 return xstrdup (_("Too-large number set to system-missing."));
401 }
402 else
403 {
404 i->output->f = 0.0;
405 ds_destroy (&tmp);
406 return xstrdup (_("Too-small number set to zero."));
407 }
408 }
409 else
410 errno = save_errno;
411
412 ds_destroy (&tmp);
413 return NULL;
414 }
415
416 /* Parses N format. */
417 static char *
parse_N(struct data_in * i)418 parse_N (struct data_in *i)
419 {
420 int c;
421
422 i->output->f = 0;
423 while ((c = ss_get_byte (&i->input)) != EOF)
424 {
425 if (!c_isdigit (c))
426 return xstrdup (_("All characters in field must be digits."));
427 i->output->f = i->output->f * 10.0 + (c - '0');
428 }
429
430 return NULL;
431 }
432
433 /* Parses PIBHEX format. */
434 static char *
parse_PIBHEX(struct data_in * i)435 parse_PIBHEX (struct data_in *i)
436 {
437 double n;
438 int c;
439
440 n = 0.0;
441
442 while ((c = ss_get_byte (&i->input)) != EOF)
443 {
444 if (!c_isxdigit (c))
445 return xstrdup (_("Unrecognized character in field."));
446 n = n * 16.0 + hexit_value (c);
447 }
448
449 i->output->f = n;
450 return NULL;
451 }
452
453 /* Parses RBHEX format. */
454 static char *
parse_RBHEX(struct data_in * i)455 parse_RBHEX (struct data_in *i)
456 {
457 double d;
458 size_t j;
459
460 memset (&d, 0, sizeof d);
461 for (j = 0; !ss_is_empty (i->input) && j < sizeof d; j++)
462 {
463 int hi = ss_get_byte (&i->input);
464 int lo = ss_get_byte (&i->input);
465 if (lo == EOF)
466 return xstrdup (_("Field must have even length."));
467 else if (!c_isxdigit (hi) || !c_isxdigit (lo))
468 return xstrdup (_("Field must contain only hex digits."));
469 ((unsigned char *) &d)[j] = 16 * hexit_value (hi) + hexit_value (lo);
470 }
471
472 i->output->f = d;
473
474 return NULL;
475 }
476
/* Digits for Z format.  A character's position modulo 10 is its digit value,
   and the characters from position 20 onward mark a negative number. */
static const char z_digits[] = "0123456789{ABCDEFGHI}JKLMNOPQR";

/* Returns true if C is a Z format digit, false otherwise.  Zero and negative
   values are rejected before the lookup. */
static bool
is_z_digit (int c)
{
  if (c <= 0)
    return false;

  return strchr (z_digits, c) != NULL;
}

/* Returns the (absolute value of the) value of C as a Z format digit.  C
   must be a Z format digit. */
static int
z_digit_value (int c)
{
  const char *pos;

  assert (is_z_digit (c));
  pos = strchr (z_digits, c);
  return (pos - z_digits) % 10;
}

/* Returns true if Z format digit C represents a negative value, false
   otherwise.  C must be a Z format digit. */
static bool
is_negative_z_digit (int c)
{
  const char *pos;

  assert (is_z_digit (c));
  pos = strchr (z_digits, c);
  return pos >= &z_digits[20];
}
504
505 /* Parses Z format. */
506 static char *
parse_Z(struct data_in * i)507 parse_Z (struct data_in *i)
508 {
509 struct string tmp;
510
511 int save_errno;
512
513 bool got_dot = false;
514 bool got_final_digit = false;
515
516 /* Trim spaces and check for missing value representation. */
517 if (trim_spaces_and_check_missing (i))
518 return NULL;
519
520 ds_init_empty (&tmp);
521 ds_extend (&tmp, 64);
522
523 ds_put_byte (&tmp, '+');
524 while (!ss_is_empty (i->input))
525 {
526 int c = ss_get_byte (&i->input);
527 if (c_isdigit (c) && !got_final_digit)
528 ds_put_byte (&tmp, c);
529 else if (is_z_digit (c) && !got_final_digit)
530 {
531 ds_put_byte (&tmp, z_digit_value (c) + '0');
532 if (is_negative_z_digit (c))
533 ds_data (&tmp)[0] = '-';
534 got_final_digit = true;
535 }
536 else if (c == '.' && !got_dot)
537 {
538 ds_put_byte (&tmp, '.');
539 got_dot = true;
540 }
541 else
542 {
543 ds_destroy (&tmp);
544 return xstrdup (_("Invalid zoned decimal syntax."));
545 }
546 }
547
548 if (!ss_is_empty (i->input))
549 {
550 char *error;
551
552 if (ds_length (&tmp) == 1)
553 error = xstrdup (_("Field contents are not numeric."));
554 else
555 error = xstrdup (_("Number followed by garbage."));
556
557 ds_destroy (&tmp);
558 return error;
559 }
560
561 /* Let c_strtod() do the conversion. */
562 save_errno = errno;
563 errno = 0;
564 i->output->f = c_strtod (ds_cstr (&tmp), NULL);
565 if (errno == ERANGE)
566 {
567 if (fabs (i->output->f) > 1)
568 {
569 i->output->f = SYSMIS;
570 ds_destroy (&tmp);
571 return xstrdup (_("Too-large number set to system-missing."));
572 }
573 else
574 {
575 i->output->f = 0.0;
576 ds_destroy (&tmp);
577 return xstrdup (_("Too-small number set to zero."));
578 }
579 }
580 else
581 errno = save_errno;
582
583 ds_destroy (&tmp);
584 return NULL;
585 }
586
587 /* Parses IB format. */
588 static char *
parse_IB(struct data_in * i)589 parse_IB (struct data_in *i)
590 {
591 size_t bytes;
592 uint64_t value;
593 uint64_t sign_bit;
594
595 bytes = MIN (8, ss_length (i->input));
596 value = integer_get (settings_get_input_integer_format (), ss_data (i->input), bytes);
597
598 sign_bit = UINT64_C(1) << (8 * bytes - 1);
599 if (!(value & sign_bit))
600 i->output->f = value;
601 else
602 {
603 /* Sign-extend to full 64 bits. */
604 value -= sign_bit << 1;
605 i->output->f = -(double) -value;
606 }
607
608 return NULL;
609 }
610
611 /* Parses PIB format. */
612 static char *
parse_PIB(struct data_in * i)613 parse_PIB (struct data_in *i)
614 {
615 i->output->f = integer_get (settings_get_input_integer_format (), ss_data (i->input),
616 MIN (8, ss_length (i->input)));
617
618 return NULL;
619 }
620
/* Removes the first byte from S, which must not be empty, and splits it:
   its upper 4 bits go into *HIGH_NIBBLE and its lower 4 bits into
   *LOW_NIBBLE. */
static void
get_nibbles (struct substring *s, int *high_nibble, int *low_nibble)
{
  int byte = ss_get_byte (s);

  assert (byte != EOF);
  *high_nibble = (byte >> 4) & 0xf;
  *low_nibble = byte & 0xf;
}
631
632 /* Parses P format. */
633 static char *
parse_P(struct data_in * i)634 parse_P (struct data_in *i)
635 {
636 int high_nibble, low_nibble;
637
638 i->output->f = 0.0;
639
640 while (ss_length (i->input) > 1)
641 {
642 get_nibbles (&i->input, &high_nibble, &low_nibble);
643 if (high_nibble > 9 || low_nibble > 9)
644 return xstrdup (_("Invalid syntax for P field."));
645 i->output->f = (100 * i->output->f) + (10 * high_nibble) + low_nibble;
646 }
647
648 get_nibbles (&i->input, &high_nibble, &low_nibble);
649 if (high_nibble > 9)
650 return xstrdup (_("Invalid syntax for P field."));
651 i->output->f = (10 * i->output->f) + high_nibble;
652 if (low_nibble < 10)
653 i->output->f = (10 * i->output->f) + low_nibble;
654 else if (low_nibble == 0xb || low_nibble == 0xd)
655 i->output->f = -i->output->f;
656
657 return NULL;
658 }
659
660 /* Parses PK format. */
661 static char *
parse_PK(struct data_in * i)662 parse_PK (struct data_in *i)
663 {
664 i->output->f = 0.0;
665 while (!ss_is_empty (i->input))
666 {
667 int high_nibble, low_nibble;
668
669 get_nibbles (&i->input, &high_nibble, &low_nibble);
670 if (high_nibble > 9 || low_nibble > 9)
671 {
672 i->output->f = SYSMIS;
673 return NULL;
674 }
675 i->output->f = (100 * i->output->f) + (10 * high_nibble) + low_nibble;
676 }
677
678 return NULL;
679 }
680
681 /* Parses RB format. */
682 static char *
parse_RB(struct data_in * i)683 parse_RB (struct data_in *i)
684 {
685 enum float_format ff = settings_get_input_float_format ();
686 size_t size = float_get_size (ff);
687 if (ss_length (i->input) >= size)
688 float_convert (ff, ss_data (i->input),
689 FLOAT_NATIVE_DOUBLE, &i->output->f);
690 else
691 i->output->f = SYSMIS;
692
693 return NULL;
694 }
695
696 /* Parses A format. */
697 static char *
parse_A(struct data_in * i)698 parse_A (struct data_in *i)
699 {
700 /* This is equivalent to buf_copy_rpad, except that we posibly
701 do a character set recoding in the middle. */
702 uint8_t *dst = i->output->s;
703 size_t dst_size = i->width;
704 const char *src = ss_data (i->input);
705 size_t src_size = ss_length (i->input);
706
707 memcpy (dst, src, MIN (src_size, dst_size));
708
709 if (dst_size > src_size)
710 memset (&dst[src_size], ' ', dst_size - src_size);
711
712 return NULL;
713 }
714
715 /* Parses AHEX format. */
716 static char *
parse_AHEX(struct data_in * i)717 parse_AHEX (struct data_in *i)
718 {
719 uint8_t *s = i->output->s;
720 size_t j;
721
722 for (j = 0; ; j++)
723 {
724 int hi = ss_get_byte (&i->input);
725 int lo = ss_get_byte (&i->input);
726 if (hi == EOF)
727 break;
728 else if (lo == EOF)
729 return xstrdup (_("Field must have even length."));
730
731 if (!c_isxdigit (hi) || !c_isxdigit (lo))
732 return xstrdup (_("Field must contain only hex digits."));
733
734 if (j < i->width)
735 s[j] = hexit_value (hi) * 16 + hexit_value (lo);
736 }
737
738 memset (&s[j], ' ', i->width - j);
739
740 return NULL;
741 }
742
743 /* Date & time format components. */
744
/* Sign of a time value.

   parse_time_sign() accepts a sign only while the state is SIGN_NO_TIME, so
   a sign is allowed only in front of the first time component. */
enum time_sign
  {
    SIGN_NO_TIME,       /* No time yet encountered. */
    SIGN_POSITIVE,      /* Positive time. */
    SIGN_NEGATIVE       /* Negative time. */
  };
752
753 /* Parses a signed decimal integer from at most the first
754 MAX_DIGITS characters in I, storing the result into *RESULT.
755 Returns true if successful, false if no integer was
756 present. */
757 static char * WARN_UNUSED_RESULT
parse_int(struct data_in * i,long * result,size_t max_digits)758 parse_int (struct data_in *i, long *result, size_t max_digits)
759 {
760 struct substring head = ss_head (i->input, max_digits);
761 size_t n = ss_get_long (&head, result);
762 if (n)
763 {
764 ss_advance (&i->input, n);
765 return NULL;
766 }
767 else
768 return xstrdup (_("Syntax error in date field."));
769 }
770
/* Parses a day of the month between 1 and 31 from I, storing it into *DAY.
   Returns NULL if successful, otherwise an error message that the caller
   must free. */
static char *
parse_day (struct data_in *i, long *day)
{
  char *error = parse_int (i, day, SIZE_MAX);
  if (error != NULL)
    return error;

  if (*day < 1 || *day > 31)
    return xasprintf (_("Day (%ld) must be between 1 and 31."), *day);

  return NULL;
}
785
786 /* If *TIME_SIGN is SIGN_NO_TIME, allows a sign to precede the
787 time and sets *TIME_SIGN. Otherwise, does not allow a sign. */
788 static void
parse_time_sign(struct data_in * i,enum time_sign * time_sign)789 parse_time_sign (struct data_in *i, enum time_sign *time_sign)
790 {
791 if (*time_sign == SIGN_NO_TIME)
792 {
793 if (ss_match_byte (&i->input, '-'))
794 *time_sign = SIGN_NEGATIVE;
795 else
796 {
797 ss_match_byte (&i->input, '+');
798 *time_sign = SIGN_POSITIVE;
799 }
800 }
801 }
802
/* Parses a nonnegative integer from the beginning of I and adds
   SECONDS_PER_UNIT times its value to *TIME.
   Returns NULL if successful.  If no integer was present or it was negative,
   returns an error message that the caller must free. */
static char *
parse_time_units (struct data_in *i, double seconds_per_unit, double *time)
{
  long n_units;
  char *error = parse_int (i, &n_units, SIZE_MAX);

  if (error != NULL)
    return error;
  if (n_units < 0)
    return xstrdup (_("Syntax error in date field."));

  *time += n_units * seconds_per_unit;
  return NULL;
}
822
823 /* Parses a data delimiter from the beginning of I.
824 Returns true if successful, false if no delimiter was
825 present. */
826 static char *
parse_date_delimiter(struct data_in * i)827 parse_date_delimiter (struct data_in *i)
828 {
829 if (ss_ltrim (&i->input, ss_cstr ("-/.," CC_SPACES)))
830 return NULL;
831
832 return xstrdup (_("Delimiter expected between fields in date."));
833 }
834
/* Discards any leading characters in the CC_SPACES set from the beginning of
   I.  It is not an error if there are none. */
static void
parse_spaces (struct data_in *i)
{
  ss_ltrim (&i->input, ss_cstr (CC_SPACES));
}
841
842 static struct substring
parse_name_token(struct data_in * i)843 parse_name_token (struct data_in *i)
844 {
845 struct substring token;
846 ss_get_bytes (&i->input, ss_span (i->input, ss_cstr (CC_LETTERS)), &token);
847 return token;
848 }
849
850 /* Reads a name from I and sets *OUTPUT to the value associated
851 with that name. If ALLOW_SUFFIXES is true, then names that
852 begin with one of the names are accepted; otherwise, only
853 exact matches (except for case) are allowed.
854 Returns true if successful, false otherwise. */
855 static bool
match_name(struct substring token,const char * const * names,long * output)856 match_name (struct substring token, const char *const *names, long *output)
857 {
858 int i;
859
860 for (i = 1; *names != NULL; i++)
861 if (ss_equals_case (ss_cstr (*names++), token))
862 {
863 *output = i;
864 return true;
865 }
866
867 return false;
868 }
869
870 /* Parses a month name or number from the beginning of I,
871 storing the month (in range 1...12) into *MONTH.
872 Returns true if successful, false if no month was present. */
873 static char *
parse_month(struct data_in * i,long * month)874 parse_month (struct data_in *i, long *month)
875 {
876 if (c_isdigit (ss_first (i->input)))
877 {
878 char *error = parse_int (i, month, SIZE_MAX);
879 if (error != NULL)
880 return error;
881 if (*month >= 1 && *month <= 12)
882 return NULL;
883 }
884 else
885 {
886 static const char *const english_names[] =
887 {
888 "jan", "feb", "mar", "apr", "may", "jun",
889 "jul", "aug", "sep", "oct", "nov", "dec",
890 NULL,
891 };
892
893 static const char *const roman_names[] =
894 {
895 "i", "ii", "iii", "iv", "v", "vi",
896 "vii", "viii", "ix", "x", "xi", "xii",
897 NULL,
898 };
899
900 struct substring token = parse_name_token (i);
901 if (match_name (ss_head (token, 3), english_names, month)
902 || match_name (ss_head (token, 4), roman_names, month))
903 return NULL;
904 }
905
906 return xstrdup (_("Unrecognized month format. Months may be specified "
907 "as Arabic or Roman numerals or as at least 3 letters "
908 "of their English names."));
909 }
910
/* Parses a year of at most MAX_DIGITS from the beginning of I, storing a
   "4-digit" year into *YEAR.  A year from 0 to 99 is placed in the 100-year
   window that starts at the epoch given by settings_get_epoch().

   Returns NULL if successful, otherwise an error message that the caller
   must free. */
static char *
parse_year (struct data_in *i, long *year, size_t max_digits)
{
  char *error = parse_int (i, year, max_digits);
  if (error != NULL)
    return error;

  if (*year >= 0 && *year <= 99)
    {
      const int epoch = settings_get_epoch ();
      const int century = ROUND_DOWN (epoch, 100);
      const int first_year = epoch - century;

      /* Years before the epoch's offset within its century belong to the
         following century. */
      *year += century + (*year < first_year ? 100 : 0);
    }

  if (*year < 1582 || *year > 19999)
    return xasprintf (_("Year (%ld) must be between 1582 and 19999."), *year);

  return NULL;
}
935
936 /* Returns true if input in I has been exhausted,
937 false otherwise. */
938 static char *
parse_trailer(struct data_in * i)939 parse_trailer (struct data_in *i)
940 {
941 if (ss_is_empty (i->input))
942 return NULL;
943
944 return xasprintf (_("Trailing garbage `%.*s' following date."),
945 (int) ss_length (i->input), ss_data (i->input));
946 }
947
948 /* Parses a 3-digit Julian day-of-year value from I into *YDAY.
949 Returns true if successful, false on failure. */
950 static char *
parse_yday(struct data_in * i,long * yday)951 parse_yday (struct data_in *i, long *yday)
952 {
953 struct substring num_s;
954 long num;
955
956 ss_get_bytes (&i->input, 3, &num_s);
957 if (ss_span (num_s, ss_cstr (CC_DIGITS)) != 3)
958 return xstrdup (_("Julian day must have exactly three digits."));
959 else if (!ss_get_long (&num_s, &num) || num < 1 || num > 366)
960 return xasprintf (_("Julian day (%ld) must be between 1 and 366."), num);
961
962 *yday = num;
963 return NULL;
964 }
965
/* Parses a quarter-of-year integer between 1 and 4 from I and stores the
   first month of that quarter into *MONTH.
   Returns NULL if successful, otherwise an error message that the caller
   must free. */
static char *
parse_quarter (struct data_in *i, long int *month)
{
  long q;
  char *error = parse_int (i, &q, SIZE_MAX);

  if (error != NULL)
    return error;

  if (q < 1 || q > 4)
    return xasprintf (_("Quarter (%ld) must be between 1 and 4."), q);

  *month = (q - 1) * 3 + 1;
  return NULL;
}
986
/* Parses a week-of-year integer between 1 and 53 from I and stores the day
   of the year on which that week starts into *YDAY.
   Returns NULL if successful, otherwise an error message that the caller
   must free. */
static char *
parse_week (struct data_in *i, long int *yday)
{
  long wk;
  char *error = parse_int (i, &wk, SIZE_MAX);

  if (error != NULL)
    return error;

  if (wk < 1 || wk > 53)
    return xasprintf (_("Week (%ld) must be between 1 and 53."), wk);

  *yday = (wk - 1) * 7 + 1;
  return NULL;
}
1007
1008 /* Parses a time delimiter from the beginning of I.
1009 Returns true if successful, false if no delimiter was
1010 present. */
1011 static char *
parse_time_delimiter(struct data_in * i)1012 parse_time_delimiter (struct data_in *i)
1013 {
1014 if (ss_ltrim (&i->input, ss_cstr (":" CC_SPACES)) > 0)
1015 return NULL;
1016
1017 return xstrdup (_("Delimiter expected between fields in time."));
1018 }
1019
1020 /* Parses minutes and optional seconds from the beginning of I.
1021 The time is converted into seconds, which are added to
1022 *TIME.
1023 Returns true if successful, false if an error was found. */
1024 static char *
parse_minute_second(struct data_in * i,double * time)1025 parse_minute_second (struct data_in *i, double *time)
1026 {
1027 long minute;
1028 char buf[64];
1029 char *error;
1030 char *cp;
1031
1032 /* Parse minutes. */
1033 error = parse_int (i, &minute, SIZE_MAX);
1034 if (error != NULL)
1035 return error;
1036 if (i->format != FMT_MTIME && (minute < 0 || minute > 59))
1037 return xasprintf (_("Minute (%ld) must be between 0 and 59."), minute);
1038 *time += 60. * minute;
1039
1040 /* Check for seconds. */
1041 if (ss_ltrim (&i->input, ss_cstr (":" CC_SPACES)) == 0
1042 || !c_isdigit (ss_first (i->input)))
1043 return NULL;
1044
1045 /* Parse seconds. */
1046 cp = buf;
1047 while (c_isdigit (ss_first (i->input)))
1048 *cp++ = ss_get_byte (&i->input);
1049 if (ss_match_byte (&i->input, settings_get_decimal_char (FMT_F)))
1050 *cp++ = '.';
1051 while (c_isdigit (ss_first (i->input)))
1052 *cp++ = ss_get_byte (&i->input);
1053 *cp = '\0';
1054
1055 *time += c_strtod (buf, NULL);
1056
1057 return NULL;
1058 }
1059
1060 /* Parses a weekday name from the beginning of I,
1061 storing a value of 1=Sunday...7=Saturday into *WEEKDAY.
1062 Returns true if successful, false if an error was found. */
1063 static char *
parse_weekday(struct data_in * i,long * weekday)1064 parse_weekday (struct data_in *i, long *weekday)
1065 {
1066 static const char *const weekday_names[] =
1067 {
1068 "su", "mo", "tu", "we", "th", "fr", "sa",
1069 NULL,
1070 };
1071
1072 struct substring token = parse_name_token (i);
1073 bool ok = match_name (ss_head (token, 2), weekday_names, weekday);
1074 if (!ok)
1075 return xstrdup (_("Unrecognized weekday name. At least the first two "
1076 "letters of an English weekday name must be "
1077 "specified."));
1078 return NULL;
1079 }
1080
1081 /* Date & time formats. */
1082
1083 /* Parses WKDAY format. */
1084 static char *
parse_WKDAY(struct data_in * i)1085 parse_WKDAY (struct data_in *i)
1086 {
1087 long weekday = 0;
1088 char *error;
1089
1090 if (trim_spaces_and_check_missing (i))
1091 return NULL;
1092
1093 error = parse_weekday (i, &weekday);
1094 if (error == NULL)
1095 error = parse_trailer (i);
1096
1097 i->output->f = weekday;
1098 return error;
1099 }
1100
1101 /* Parses MONTH format. */
1102 static char *
parse_MONTH(struct data_in * i)1103 parse_MONTH (struct data_in *i)
1104 {
1105 long month;
1106 char *error;
1107
1108 if (trim_spaces_and_check_missing (i))
1109 return NULL;
1110
1111 error = parse_month (i, &month);
1112 if (error == NULL)
1113 error = parse_trailer (i);
1114
1115 i->output->f = month;
1116 return error;
1117 }
1118
/* Parses DATE, ADATE, EDATE, JDATE, SDATE, QYR, MOYR, KWYR,
   DATETIME, YMDHMS, MTIME, TIME, and DTIME formats.

   The format's template string from fmt_date_template() drives the parse:
   each run of identical template characters names one field or delimiter,
   and the matching helper consumes it from I.

   Returns NULL on success, including when the field is blank or a lone
   period, in which case the default result is stored.  Otherwise returns the
   first error message produced, which the caller must free. */
static char *
parse_date (struct data_in *i)
{
  /* INT_MIN marks "no year parsed", which means the format has no date
     part. */
  long int year = INT_MIN;
  long int month = 1;
  long int day = 1;
  long int yday = 1;
  double time = 0, date = 0;
  enum time_sign time_sign = SIGN_NO_TIME;

  const char *template = fmt_date_template (i->format, 0);
  size_t template_width = strlen (template);
  char *error;

  if (trim_spaces_and_check_missing (i))
    return NULL;

  while (*template != '\0')
    {
      unsigned char ch = *template;
      int count = 1;

      /* Measure and skip the run of CH in the template. */
      while (template[count] == ch)
        count++;
      template += count;

      switch (ch)
        {
        case 'd':
          /* One or two d's are a day of the month; three or more are a day
             of the year. */
          error = count < 3 ? parse_day (i, &day) : parse_yday (i, &yday);
          break;
        case 'm':
          error = parse_month (i, &month);
          break;
        case 'y':
          {
            size_t max_digits;
            if (!c_isalpha (*template))
              max_digits = SIZE_MAX;
            else
              {
                /* A letter follows the year in the template, so the number
                   of year digits has to be limited: 4 digits if the input is
                   at least 2 bytes longer than the template, otherwise 2. */
                if (ss_length (i->input) >= template_width + 2)
                  max_digits = 4;
                else
                  max_digits = 2;
              }
            error = parse_year (i, &year, max_digits);
          }
          break;
        case 'q':
          error = parse_quarter (i, &month);
          break;
        case 'w':
          error = parse_week (i, &yday);
          break;
        case 'D':
          parse_time_sign (i, &time_sign);
          error = parse_time_units (i, 60. * 60. * 24., &time);
          break;
        case 'H':
          parse_time_sign (i, &time_sign);
          error = parse_time_units (i, 60. * 60., &time);
          break;
        case 'M':
          /* Only in MTIME may a sign be parsed in front of the minutes. */
          if (i->format == FMT_MTIME)
            parse_time_sign (i, &time_sign);
          error = parse_minute_second (i, &time);
          break;
        case '-':
        case '/':
        case '.':
          error = parse_date_delimiter (i);
          break;
        case ':':
          error = parse_time_delimiter (i);
          break;
        case ' ':
          /* A template space is optional white space, except in MOYR, where
             a delimiter is required. */
          if (i->format != FMT_MOYR)
            {
              parse_spaces (i);
              error = NULL;
            }
          else
            error = parse_date_delimiter (i);
          break;
        default:
          /* Any other template character must appear literally in the
             input, in either case. */
          assert (count == 1);
          if (!ss_match_byte (&i->input, c_toupper (ch))
              && !ss_match_byte (&i->input, c_tolower (ch)))
            error = xasprintf (_("`%c' expected in date field."), ch);
          else
            error = NULL;
          break;
        }
      if (error != NULL)
        return error;
    }
  error = parse_trailer (i);
  if (error != NULL)
    return error;

  if (year != INT_MIN)
    {
      char *error;
      double ofs;

      /* Convert the calendar date to a day offset, add the day-of-year
         displacement, and scale from days to seconds. */
      ofs = calendar_gregorian_to_offset (year, month, day, &error);
      if (ofs == SYSMIS)
        return error;
      date = (yday - 1 + ofs) * 60. * 60. * 24.;
    }
  else
    date = 0.;
  i->output->f = date + (time_sign == SIGN_NEGATIVE ? -time : time);

  return NULL;
}
1238
1239 /* Utility functions. */
1240
1241 /* Sets the default result for I.
1242 For a numeric format, this is the value set on SET BLANKS
1243 (typically system-missing); for a string format, it is all
1244 spaces. */
1245 static void
default_result(struct data_in * i)1246 default_result (struct data_in *i)
1247 {
1248 if (fmt_is_string (i->format))
1249 memset (i->output->s, ' ', i->width);
1250 else
1251 i->output->f = settings_get_blanks ();
1252 }
1253
1254 /* Trims leading and trailing spaces from I.
1255 If the result is empty, or a single period character, then
1256 sets the default result and returns true; otherwise, returns
1257 false. */
1258 static bool
trim_spaces_and_check_missing(struct data_in * i)1259 trim_spaces_and_check_missing (struct data_in *i)
1260 {
1261 ss_trim (&i->input, ss_cstr (" "));
1262 if (ss_is_empty (i->input) || ss_equals (i->input, ss_cstr (".")))
1263 {
1264 default_result (i);
1265 return true;
1266 }
1267 return false;
1268 }
1269
/* Returns the integer value, 0 to 15, of hex digit C, which may be in upper
   or lower case.

   C is reduced to a byte before it is examined.  C must be a hex digit:
   any other byte, including a null byte, fails the assertion rather than
   yielding an out-of-range value. */
static int
hexit_value (int c)
{
  const unsigned char byte = (unsigned char) c;
  int value = -1;

  if (byte >= '0' && byte <= '9')
    value = byte - '0';
  else if (byte >= 'a' && byte <= 'f')
    value = byte - 'a' + 10;
  else if (byte >= 'A' && byte <= 'F')
    value = byte - 'A' + 10;

  assert (value >= 0);
  return value;
}
1280