1 /* PSPP - a program for statistical analysis.
2    Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
3 
4    This program is free software: you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation, either version 3 of the License, or
7    (at your option) any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
16 
17 #include <config.h>
18 
19 #include "data-in.h"
20 
21 #include <ctype.h>
22 #include <errno.h>
23 #include <limits.h>
24 #include <math.h>
25 #include <stdarg.h>
26 #include <stdbool.h>
27 #include <stddef.h>
28 #include <stdint.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 
32 #include "calendar.h"
33 #include "dictionary.h"
34 #include "format.h"
35 #include "identifier.h"
36 #include "libpspp/assertion.h"
37 #include "libpspp/compiler.h"
38 #include "libpspp/i18n.h"
39 #include "libpspp/integer-format.h"
40 #include "libpspp/misc.h"
41 #include "libpspp/str.h"
42 #include "settings.h"
43 #include "value.h"
44 
45 #include "gl/c-ctype.h"
46 #include "gl/c-strtod.h"
47 #include "gl/minmax.h"
48 #include "gl/xalloc.h"
49 
50 #include "gettext.h"
51 #define _(msgid) gettext (msgid)
52 
53 /* Information about parsing one data field. */
54 struct data_in
55   {
56     struct substring input;     /* Source. */
57     enum fmt_type format;       /* Input format. */
58 
59     union value *output;        /* Destination. */
60     int width;                  /* Output width. */
61   };
62 
63 typedef char *data_in_parser_func (struct data_in *);
64 #define FMT(NAME, METHOD, IMIN, OMIN, IO, CATEGORY) \
65         static data_in_parser_func parse_##METHOD;
66 #include "format.def"
67 
68 static void default_result (struct data_in *);
69 static bool trim_spaces_and_check_missing (struct data_in *);
70 
71 static int hexit_value (int c);
72 
73 /* Parses the characters in INPUT, which are encoded in the given
74    INPUT_ENCODING, according to FORMAT.
75 
76    Stores the parsed representation in OUTPUT, which the caller must have
77    initialized with the given WIDTH (0 for a numeric field, otherwise the
78    string width).  If FORMAT is FMT_A, then OUTPUT_ENCODING must specify the
79    correct encoding for OUTPUT (normally obtained via dict_get_encoding()).
80 
81    If successful NULL is the return value.  Otherwise a string describing
82    the problem is returned.  The caller must free this string.
83  */
84 char *
data_in(struct substring input,const char * input_encoding,enum fmt_type format,union value * output,int width,const char * output_encoding)85 data_in (struct substring input, const char *input_encoding,
86          enum fmt_type format,
87          union value *output, int width, const char *output_encoding)
88 {
89   static data_in_parser_func *const handlers[FMT_NUMBER_OF_FORMATS] =
90     {
91 #define FMT(NAME, METHOD, IMIN, OMIN, IO, CATEGORY) parse_##METHOD,
92 #include "format.def"
93     };
94 
95   struct data_in i;
96 
97   enum fmt_category cat;
98   const char *dest_encoding;
99   char *s;
100   char *error;
101 
102   assert ((width != 0) == fmt_is_string (format));
103 
104   i.format = format;
105 
106   i.output = output;
107   i.width = width;
108 
109   if (ss_is_empty (input))
110     {
111       default_result (&i);
112       return NULL;
113     }
114 
115   cat = fmt_get_category (format);
116   if (cat & (FMT_CAT_BASIC | FMT_CAT_HEXADECIMAL | FMT_CAT_CUSTOM
117              | FMT_CAT_DATE | FMT_CAT_TIME | FMT_CAT_DATE_COMPONENT))
118     {
119       /* We're going to parse these into numbers.  For this purpose we want to
120          deal with them in the local "C" encoding.  Any character not in that
121          encoding wouldn't be valid anyhow. */
122       dest_encoding = C_ENCODING;
123     }
124   else if (cat & (FMT_CAT_BINARY | FMT_CAT_LEGACY))
125     {
126       /* Don't recode these binary formats at all, since they are not text. */
127       dest_encoding = NULL;
128     }
129   else
130     {
131       assert (cat == FMT_CAT_STRING);
132       if (format == FMT_AHEX)
133         {
134           /* We want the hex digits in the local "C" encoding, even though the
135              result may not be in that encoding. */
136           dest_encoding = C_ENCODING;
137         }
138       else
139         {
140           /* Use the final output encoding. */
141           dest_encoding = output_encoding;
142         }
143     }
144 
145   if (dest_encoding != NULL)
146     {
147       i.input = recode_substring_pool (dest_encoding, input_encoding, input,
148                                        NULL);
149       s = i.input.string;
150     }
151   else
152     {
153       i.input = input;
154       s = NULL;
155     }
156 
157   error = handlers[i.format] (&i);
158   if (error != NULL)
159     default_result (&i);
160 
161   free (s);
162 
163   return error;
164 }
165 
166 bool
data_in_msg(struct substring input,const char * input_encoding,enum fmt_type format,union value * output,int width,const char * output_encoding)167 data_in_msg (struct substring input, const char *input_encoding,
168              enum fmt_type format,
169              union value *output, int width, const char *output_encoding)
170 {
171   char *error = data_in (input, input_encoding, format,
172                          output, width, output_encoding);
173   if (error != NULL)
174     {
175       msg (SW, _("Data is not valid as format %s: %s"),
176            fmt_name (format), error);
177       free (error);
178       return false;
179     }
180   else
181     return true;
182 }
183 
184 static bool
number_has_implied_decimals(const char * s,enum fmt_type type)185 number_has_implied_decimals (const char *s, enum fmt_type type)
186 {
187   int decimal = settings_get_style (type)->decimal;
188   bool got_digit = false;
189   for (;;)
190     {
191       switch (*s)
192         {
193         case '0': case '1': case '2': case '3': case '4':
194         case '5': case '6': case '7': case '8': case '9':
195           got_digit = true;
196           break;
197 
198         case '+': case '-':
199           if (got_digit)
200             return false;
201           break;
202 
203         case 'e': case 'E': case 'd': case 'D':
204           return false;
205 
206         case '.': case ',':
207           if (*s == decimal)
208             return false;
209           break;
210 
211         case '\0':
212           return true;
213 
214         default:
215           break;
216         }
217 
218       s++;
219     }
220 }
221 
222 static bool
has_implied_decimals(struct substring input,const char * input_encoding,enum fmt_type format)223 has_implied_decimals (struct substring input, const char *input_encoding,
224                       enum fmt_type format)
225 {
226   bool retval;
227   char *s;
228 
229   switch (format)
230     {
231     case FMT_F:
232     case FMT_COMMA:
233     case FMT_DOT:
234     case FMT_DOLLAR:
235     case FMT_PCT:
236     case FMT_E:
237     case FMT_Z:
238       break;
239 
240     case FMT_N:
241     case FMT_IB:
242     case FMT_PIB:
243     case FMT_P:
244     case FMT_PK:
245       return true;
246 
247     default:
248       return false;
249     }
250 
251   s = recode_string (C_ENCODING, input_encoding,
252                      ss_data (input), ss_length (input));
253   retval = (format == FMT_Z
254             ? strchr (s, '.') == NULL
255             : number_has_implied_decimals (s, format));
256   free (s);
257 
258   return retval;
259 }
260 
261 /* In some cases, when no decimal point is explicitly included in numeric
262    input, its position is implied by the number of decimal places in the input
263    format.  In such a case, this function may be called just after data_in().
264    Its arguments are a subset of that function's arguments plus D, the number
265    of decimal places associated with FORMAT.
266 
267    If it is appropriate, this function modifies the numeric value in OUTPUT. */
268 void
data_in_imply_decimals(struct substring input,const char * input_encoding,enum fmt_type format,int d,union value * output)269 data_in_imply_decimals (struct substring input, const char *input_encoding,
270                         enum fmt_type format, int d, union value *output)
271 {
272   if (d > 0 && output->f != SYSMIS
273       && has_implied_decimals (input, input_encoding, format))
274     output->f /= pow (10., d);
275 }
276 
277 /* Format parsers. */
278 
279 /* Parses F, COMMA, DOT, DOLLAR, PCT, and E input formats. */
280 static char *
parse_number(struct data_in * i)281 parse_number (struct data_in *i)
282 {
283   const struct fmt_number_style *style =
284     settings_get_style (i->format);
285 
286   struct string tmp;
287 
288   int save_errno;
289   char *tail;
290 
291   if  (fmt_get_category (i->format) == FMT_CAT_CUSTOM)
292     {
293       style = settings_get_style (FMT_F);
294     }
295 
296   /* Trim spaces and check for missing value representation. */
297   if (trim_spaces_and_check_missing (i))
298     return NULL;
299 
300   ds_init_empty (&tmp);
301   ds_extend (&tmp, 64);
302 
303   /* Prefix character may precede sign. */
304   if (style->prefix.s[0] != '\0')
305     {
306       ss_match_byte (&i->input, style->prefix.s[0]);
307       ss_ltrim (&i->input, ss_cstr (CC_SPACES));
308     }
309 
310   /* Sign. */
311   if (ss_match_byte (&i->input, '-'))
312     {
313       ds_put_byte (&tmp, '-');
314       ss_ltrim (&i->input, ss_cstr (CC_SPACES));
315     }
316   else
317     {
318       ss_match_byte (&i->input, '+');
319       ss_ltrim (&i->input, ss_cstr (CC_SPACES));
320     }
321 
322   /* Prefix character may follow sign. */
323   if (style->prefix.s[0] != '\0')
324     {
325       ss_match_byte (&i->input, style->prefix.s[0]);
326       ss_ltrim (&i->input, ss_cstr (CC_SPACES));
327     }
328 
329   /* Digits before decimal point. */
330   while (c_isdigit (ss_first (i->input)))
331     {
332       ds_put_byte (&tmp, ss_get_byte (&i->input));
333       if (style->grouping != 0)
334         ss_match_byte (&i->input, style->grouping);
335     }
336 
337   /* Decimal point and following digits. */
338   if (ss_match_byte (&i->input, style->decimal))
339     {
340       ds_put_byte (&tmp, '.');
341       while (c_isdigit (ss_first (i->input)))
342         ds_put_byte (&tmp, ss_get_byte (&i->input));
343     }
344 
345   /* Exponent. */
346   if (!ds_is_empty (&tmp)
347       && !ss_is_empty (i->input)
348       && strchr ("eEdD-+", ss_first (i->input)))
349     {
350       ds_put_byte (&tmp, 'e');
351 
352       if (strchr ("eEdD", ss_first (i->input)))
353         {
354           ss_advance (&i->input, 1);
355           ss_match_byte (&i->input, ' ');
356         }
357 
358       if (ss_first (i->input) == '-' || ss_first (i->input) == '+')
359         {
360           if (ss_get_byte (&i->input) == '-')
361             ds_put_byte (&tmp, '-');
362           ss_match_byte (&i->input, ' ');
363         }
364 
365       while (c_isdigit (ss_first (i->input)))
366         ds_put_byte (&tmp, ss_get_byte (&i->input));
367     }
368 
369   /* Suffix character. */
370   if (style->suffix.s[0] != '\0')
371     ss_match_byte (&i->input, style->suffix.s[0]);
372 
373   if (!ss_is_empty (i->input))
374     {
375       char *error;
376       if (ds_is_empty (&tmp))
377         error = xstrdup (_("Field contents are not numeric."));
378       else
379         error = xstrdup (_("Number followed by garbage."));
380       ds_destroy (&tmp);
381       return error;
382     }
383 
384   /* Let c_strtod() do the conversion. */
385   save_errno = errno;
386   errno = 0;
387   i->output->f = c_strtod (ds_cstr (&tmp), &tail);
388   if (*tail != '\0')
389     {
390       errno = save_errno;
391       ds_destroy (&tmp);
392       return xstrdup (_("Invalid numeric syntax."));
393     }
394   else if (errno == ERANGE)
395     {
396       if (fabs (i->output->f) > 1)
397         {
398           i->output->f = SYSMIS;
399           ds_destroy (&tmp);
400           return xstrdup (_("Too-large number set to system-missing."));
401         }
402       else
403         {
404           i->output->f = 0.0;
405           ds_destroy (&tmp);
406           return xstrdup (_("Too-small number set to zero."));
407         }
408     }
409   else
410     errno = save_errno;
411 
412   ds_destroy (&tmp);
413   return NULL;
414 }
415 
416 /* Parses N format. */
417 static char *
parse_N(struct data_in * i)418 parse_N (struct data_in *i)
419 {
420   int c;
421 
422   i->output->f = 0;
423   while ((c = ss_get_byte (&i->input)) != EOF)
424     {
425       if (!c_isdigit (c))
426         return xstrdup (_("All characters in field must be digits."));
427       i->output->f = i->output->f * 10.0 + (c - '0');
428     }
429 
430   return NULL;
431 }
432 
433 /* Parses PIBHEX format. */
434 static char *
parse_PIBHEX(struct data_in * i)435 parse_PIBHEX (struct data_in *i)
436 {
437   double n;
438   int c;
439 
440   n = 0.0;
441 
442   while ((c = ss_get_byte (&i->input)) != EOF)
443     {
444       if (!c_isxdigit (c))
445         return xstrdup (_("Unrecognized character in field."));
446       n = n * 16.0 + hexit_value (c);
447     }
448 
449   i->output->f = n;
450   return NULL;
451 }
452 
453 /* Parses RBHEX format. */
454 static char *
parse_RBHEX(struct data_in * i)455 parse_RBHEX (struct data_in *i)
456 {
457   double d;
458   size_t j;
459 
460   memset (&d, 0, sizeof d);
461   for (j = 0; !ss_is_empty (i->input) && j < sizeof d; j++)
462     {
463       int hi = ss_get_byte (&i->input);
464       int lo = ss_get_byte (&i->input);
465       if (lo == EOF)
466         return xstrdup (_("Field must have even length."));
467       else if (!c_isxdigit (hi) || !c_isxdigit (lo))
468         return xstrdup (_("Field must contain only hex digits."));
469       ((unsigned char *) &d)[j] = 16 * hexit_value (hi) + hexit_value (lo);
470     }
471 
472   i->output->f = d;
473 
474   return NULL;
475 }
476 
/* Digits for Z format.  A character's index modulo 10 is its digit value,
   and characters at index 20 and up denote a negative number. */
static const char z_digits[] = "0123456789{ABCDEFGHI}JKLMNOPQR";

/* Returns true if C is a Z format digit, false otherwise. */
static bool
is_z_digit (int c)
{
  /* Zero must be rejected up front, since strchr() would match the
     terminator. */
  if (c <= 0)
    return false;

  return strchr (z_digits, c) != NULL;
}

/* Returns the (absolute value of the) value of C as a Z format digit.
   C must be a Z format digit. */
static int
z_digit_value (int c)
{
  const char *pos;

  assert (is_z_digit (c));
  pos = strchr (z_digits, c);
  return (pos - z_digits) % 10;
}

/* Returns true if Z format digit C represents a negative value, false
   otherwise.  C must be a Z format digit. */
static bool
is_negative_z_digit (int c)
{
  const char *pos;

  assert (is_z_digit (c));
  pos = strchr (z_digits, c);
  return (pos - z_digits) >= 20;
}
504 
505 /* Parses Z format. */
506 static char *
parse_Z(struct data_in * i)507 parse_Z (struct data_in *i)
508 {
509   struct string tmp;
510 
511   int save_errno;
512 
513   bool got_dot = false;
514   bool got_final_digit = false;
515 
516   /* Trim spaces and check for missing value representation. */
517   if (trim_spaces_and_check_missing (i))
518     return NULL;
519 
520   ds_init_empty (&tmp);
521   ds_extend (&tmp, 64);
522 
523   ds_put_byte (&tmp, '+');
524   while (!ss_is_empty (i->input))
525     {
526       int c = ss_get_byte (&i->input);
527       if (c_isdigit (c) && !got_final_digit)
528         ds_put_byte (&tmp, c);
529       else if (is_z_digit (c) && !got_final_digit)
530         {
531           ds_put_byte (&tmp, z_digit_value (c) + '0');
532           if (is_negative_z_digit (c))
533             ds_data (&tmp)[0] = '-';
534           got_final_digit = true;
535         }
536       else if (c == '.' && !got_dot)
537         {
538           ds_put_byte (&tmp, '.');
539           got_dot = true;
540         }
541       else
542         {
543           ds_destroy (&tmp);
544           return xstrdup (_("Invalid zoned decimal syntax."));
545         }
546     }
547 
548   if (!ss_is_empty (i->input))
549     {
550       char *error;
551 
552       if (ds_length (&tmp) == 1)
553         error = xstrdup (_("Field contents are not numeric."));
554       else
555         error = xstrdup (_("Number followed by garbage."));
556 
557       ds_destroy (&tmp);
558       return error;
559     }
560 
561   /* Let c_strtod() do the conversion. */
562   save_errno = errno;
563   errno = 0;
564   i->output->f = c_strtod (ds_cstr (&tmp), NULL);
565   if (errno == ERANGE)
566     {
567       if (fabs (i->output->f) > 1)
568         {
569           i->output->f = SYSMIS;
570           ds_destroy (&tmp);
571           return xstrdup (_("Too-large number set to system-missing."));
572         }
573       else
574         {
575           i->output->f = 0.0;
576           ds_destroy (&tmp);
577           return xstrdup (_("Too-small number set to zero."));
578         }
579     }
580   else
581     errno = save_errno;
582 
583   ds_destroy (&tmp);
584   return NULL;
585 }
586 
587 /* Parses IB format. */
588 static char *
parse_IB(struct data_in * i)589 parse_IB (struct data_in *i)
590 {
591   size_t bytes;
592   uint64_t value;
593   uint64_t sign_bit;
594 
595   bytes = MIN (8, ss_length (i->input));
596   value = integer_get (settings_get_input_integer_format (), ss_data (i->input), bytes);
597 
598   sign_bit = UINT64_C(1) << (8 * bytes - 1);
599   if (!(value & sign_bit))
600     i->output->f = value;
601   else
602     {
603       /* Sign-extend to full 64 bits. */
604       value -= sign_bit << 1;
605       i->output->f = -(double) -value;
606     }
607 
608   return NULL;
609 }
610 
611 /* Parses PIB format. */
612 static char *
parse_PIB(struct data_in * i)613 parse_PIB (struct data_in *i)
614 {
615   i->output->f = integer_get (settings_get_input_integer_format (), ss_data (i->input),
616                               MIN (8, ss_length (i->input)));
617 
618   return NULL;
619 }
620 
/* Removes the first byte from S, which must not be empty, and splits it:
   the upper 4 bits go into *HIGH_NIBBLE and the lower 4 bits into
   *LOW_NIBBLE. */
static void
get_nibbles (struct substring *s, int *high_nibble, int *low_nibble)
{
  int byte = ss_get_byte (s);

  assert (byte != EOF);
  *high_nibble = (byte >> 4) & 0x0f;
  *low_nibble = byte & 0x0f;
}
631 
632 /* Parses P format. */
633 static char *
parse_P(struct data_in * i)634 parse_P (struct data_in *i)
635 {
636   int high_nibble, low_nibble;
637 
638   i->output->f = 0.0;
639 
640   while (ss_length (i->input) > 1)
641     {
642       get_nibbles (&i->input, &high_nibble, &low_nibble);
643       if (high_nibble > 9 || low_nibble > 9)
644         return xstrdup (_("Invalid syntax for P field."));
645       i->output->f = (100 * i->output->f) + (10 * high_nibble) + low_nibble;
646     }
647 
648   get_nibbles (&i->input, &high_nibble, &low_nibble);
649   if (high_nibble > 9)
650     return xstrdup (_("Invalid syntax for P field."));
651   i->output->f = (10 * i->output->f) + high_nibble;
652   if (low_nibble < 10)
653     i->output->f = (10 * i->output->f) + low_nibble;
654   else if (low_nibble == 0xb || low_nibble == 0xd)
655     i->output->f = -i->output->f;
656 
657   return NULL;
658 }
659 
660 /* Parses PK format. */
661 static char *
parse_PK(struct data_in * i)662 parse_PK (struct data_in *i)
663 {
664   i->output->f = 0.0;
665   while (!ss_is_empty (i->input))
666     {
667       int high_nibble, low_nibble;
668 
669       get_nibbles (&i->input, &high_nibble, &low_nibble);
670       if (high_nibble > 9 || low_nibble > 9)
671         {
672           i->output->f = SYSMIS;
673           return NULL;
674         }
675       i->output->f = (100 * i->output->f) + (10 * high_nibble) + low_nibble;
676     }
677 
678   return NULL;
679 }
680 
681 /* Parses RB format. */
682 static char *
parse_RB(struct data_in * i)683 parse_RB (struct data_in *i)
684 {
685   enum float_format ff = settings_get_input_float_format ();
686   size_t size = float_get_size (ff);
687   if (ss_length (i->input) >= size)
688     float_convert (ff, ss_data (i->input),
689                    FLOAT_NATIVE_DOUBLE, &i->output->f);
690   else
691     i->output->f = SYSMIS;
692 
693   return NULL;
694 }
695 
696 /* Parses A format. */
697 static char *
parse_A(struct data_in * i)698 parse_A (struct data_in *i)
699 {
700   /* This is equivalent to buf_copy_rpad, except that we posibly
701      do a character set recoding in the middle. */
702   uint8_t *dst = i->output->s;
703   size_t dst_size = i->width;
704   const char *src = ss_data (i->input);
705   size_t src_size = ss_length (i->input);
706 
707   memcpy (dst, src, MIN (src_size, dst_size));
708 
709   if (dst_size > src_size)
710     memset (&dst[src_size], ' ', dst_size - src_size);
711 
712   return NULL;
713 }
714 
715 /* Parses AHEX format. */
716 static char *
parse_AHEX(struct data_in * i)717 parse_AHEX (struct data_in *i)
718 {
719   uint8_t *s = i->output->s;
720   size_t j;
721 
722   for (j = 0; ; j++)
723     {
724       int hi = ss_get_byte (&i->input);
725       int lo = ss_get_byte (&i->input);
726       if (hi == EOF)
727         break;
728       else if (lo == EOF)
729         return xstrdup (_("Field must have even length."));
730 
731       if (!c_isxdigit (hi) || !c_isxdigit (lo))
732         return xstrdup (_("Field must contain only hex digits."));
733 
734       if (j < i->width)
735         s[j] = hexit_value (hi) * 16 + hexit_value (lo);
736     }
737 
738   memset (&s[j], ' ', i->width - j);
739 
740   return NULL;
741 }
742 
743 /* Date & time format components. */
744 
/* Sign of a time value, as tracked while a date or time field is parsed. */
enum time_sign
{
  SIGN_NO_TIME,       /* No time has been encountered yet. */
  SIGN_POSITIVE,      /* The time is positive. */
  SIGN_NEGATIVE       /* The time is negative. */
};
752 
753 /* Parses a signed decimal integer from at most the first
754    MAX_DIGITS characters in I, storing the result into *RESULT.
755    Returns true if successful, false if no integer was
756    present. */
757 static char * WARN_UNUSED_RESULT
parse_int(struct data_in * i,long * result,size_t max_digits)758 parse_int (struct data_in *i, long *result, size_t max_digits)
759 {
760   struct substring head = ss_head (i->input, max_digits);
761   size_t n = ss_get_long (&head, result);
762   if (n)
763     {
764       ss_advance (&i->input, n);
765       return NULL;
766     }
767   else
768     return xstrdup (_("Syntax error in date field."));
769 }
770 
771 /* Parses a date integer between 1 and 31 from I, storing it into
772    *DAY.
773    Returns true if successful, false if no date was present. */
774 static char *
parse_day(struct data_in * i,long * day)775 parse_day (struct data_in *i, long *day)
776 {
777   char *error = parse_int (i, day, SIZE_MAX);
778   if (error != NULL)
779     return error;
780   if (*day >= 1 && *day <= 31)
781     return NULL;
782 
783   return xasprintf (_("Day (%ld) must be between 1 and 31."), *day);
784 }
785 
786 /* If *TIME_SIGN is SIGN_NO_TIME, allows a sign to precede the
787    time and sets *TIME_SIGN.  Otherwise, does not allow a sign. */
788 static void
parse_time_sign(struct data_in * i,enum time_sign * time_sign)789 parse_time_sign (struct data_in *i, enum time_sign *time_sign)
790 {
791   if (*time_sign == SIGN_NO_TIME)
792     {
793       if (ss_match_byte (&i->input, '-'))
794         *time_sign = SIGN_NEGATIVE;
795       else
796         {
797           ss_match_byte (&i->input, '+');
798           *time_sign = SIGN_POSITIVE;
799         }
800     }
801 }
802 
803 /* Parses an integer from the beginning of I.
804    Adds SECONDS_PER_UNIT times the absolute value of the integer
805    to *TIME.
806    Returns true if successful, false if no integer was present. */
807 static char *
parse_time_units(struct data_in * i,double seconds_per_unit,double * time)808 parse_time_units (struct data_in *i, double seconds_per_unit, double *time)
809 
810 {
811   char *error;
812   long units;
813 
814   error = parse_int (i, &units, SIZE_MAX);
815   if (error != NULL)
816     return error;
817   if (units < 0)
818     return xstrdup (_("Syntax error in date field."));
819   *time += units * seconds_per_unit;
820   return NULL;
821 }
822 
823 /* Parses a data delimiter from the beginning of I.
824    Returns true if successful, false if no delimiter was
825    present. */
826 static char *
parse_date_delimiter(struct data_in * i)827 parse_date_delimiter (struct data_in *i)
828 {
829   if (ss_ltrim (&i->input, ss_cstr ("-/.," CC_SPACES)))
830     return NULL;
831 
832   return xstrdup (_("Delimiter expected between fields in date."));
833 }
834 
835 /* Parses spaces at the beginning of I. */
836 static void
parse_spaces(struct data_in * i)837 parse_spaces (struct data_in *i)
838 {
839   ss_ltrim (&i->input, ss_cstr (CC_SPACES));
840 }
841 
842 static struct substring
parse_name_token(struct data_in * i)843 parse_name_token (struct data_in *i)
844 {
845   struct substring token;
846   ss_get_bytes (&i->input, ss_span (i->input, ss_cstr (CC_LETTERS)), &token);
847   return token;
848 }
849 
850 /* Reads a name from I and sets *OUTPUT to the value associated
851    with that name.  If ALLOW_SUFFIXES is true, then names that
852    begin with one of the names are accepted; otherwise, only
853    exact matches (except for case) are allowed.
854    Returns true if successful, false otherwise. */
855 static bool
match_name(struct substring token,const char * const * names,long * output)856 match_name (struct substring token, const char *const *names, long *output)
857 {
858   int i;
859 
860   for (i = 1; *names != NULL; i++)
861     if (ss_equals_case (ss_cstr (*names++), token))
862       {
863         *output = i;
864         return true;
865       }
866 
867   return false;
868 }
869 
870 /* Parses a month name or number from the beginning of I,
871    storing the month (in range 1...12) into *MONTH.
872    Returns true if successful, false if no month was present. */
873 static char *
parse_month(struct data_in * i,long * month)874 parse_month (struct data_in *i, long *month)
875 {
876   if (c_isdigit (ss_first (i->input)))
877     {
878       char *error = parse_int (i, month, SIZE_MAX);
879       if (error != NULL)
880 	return error;
881       if (*month >= 1 && *month <= 12)
882         return NULL;
883     }
884   else
885     {
886       static const char *const english_names[] =
887         {
888           "jan", "feb", "mar", "apr", "may", "jun",
889           "jul", "aug", "sep", "oct", "nov", "dec",
890           NULL,
891         };
892 
893       static const char *const roman_names[] =
894         {
895           "i", "ii", "iii", "iv", "v", "vi",
896           "vii", "viii", "ix", "x", "xi", "xii",
897           NULL,
898         };
899 
900       struct substring token = parse_name_token (i);
901       if (match_name (ss_head (token, 3), english_names, month)
902           || match_name (ss_head (token, 4), roman_names, month))
903         return NULL;
904     }
905 
906   return xstrdup (_("Unrecognized month format.  Months may be specified "
907                     "as Arabic or Roman numerals or as at least 3 letters "
908                     "of their English names."));
909 }
910 
/* Parses a year of at most MAX_DIGITS from the beginning of I, storing a
   "4-digit" year into *YEAR.  A year from 0 to 99 is mapped into the
   100-year window that starts at the configured epoch.

   Returns NULL if the resulting year is between 1582 and 19999, otherwise
   an error message that the caller must free. */
static char *
parse_year (struct data_in *i, long *year, size_t max_digits)
{
  char *error = parse_int (i, year, max_digits);

  if (error != NULL)
    return error;

  if (*year >= 0 && *year <= 99)
    {
      const int epoch = settings_get_epoch ();
      const int epoch_century = ROUND_DOWN (epoch, 100);

      /* Place the year in the epoch's century, then bump it to the next
         century if that lands before the epoch. */
      *year += epoch_century;
      if (*year < epoch)
        *year += 100;
    }

  if (*year < 1582 || *year > 19999)
    return xasprintf (_("Year (%ld) must be between 1582 and 19999."),
                      *year);

  return NULL;
}
935 
936 /* Returns true if input in I has been exhausted,
937    false otherwise. */
938 static char *
parse_trailer(struct data_in * i)939 parse_trailer (struct data_in *i)
940 {
941   if (ss_is_empty (i->input))
942     return NULL;
943 
944   return xasprintf (_("Trailing garbage `%.*s' following date."),
945                     (int) ss_length (i->input), ss_data (i->input));
946 }
947 
948 /* Parses a 3-digit Julian day-of-year value from I into *YDAY.
949    Returns true if successful, false on failure. */
950 static char *
parse_yday(struct data_in * i,long * yday)951 parse_yday (struct data_in *i, long *yday)
952 {
953   struct substring num_s;
954   long num;
955 
956   ss_get_bytes (&i->input, 3, &num_s);
957   if (ss_span (num_s, ss_cstr (CC_DIGITS)) != 3)
958     return xstrdup (_("Julian day must have exactly three digits."));
959   else if (!ss_get_long (&num_s, &num) || num < 1 || num > 366)
960     return xasprintf (_("Julian day (%ld) must be between 1 and 366."), num);
961 
962   *yday = num;
963   return NULL;
964 }
965 
966 /* Parses a quarter-of-year integer between 1 and 4 from I.
967    Stores the corresponding month into *MONTH.
968    Returns true if successful, false if no quarter was present. */
969 static char *
parse_quarter(struct data_in * i,long int * month)970 parse_quarter (struct data_in *i, long int *month)
971 {
972   long quarter;
973   char *error;
974 
975   error = parse_int (i, &quarter, SIZE_MAX);
976   if (error != NULL)
977     return error;
978   if (quarter >= 1 && quarter <= 4)
979     {
980       *month = (quarter - 1) * 3 + 1;
981       return NULL;
982     }
983 
984   return xasprintf (_("Quarter (%ld) must be between 1 and 4."), quarter);
985 }
986 
987 /* Parses a week-of-year integer between 1 and 53 from I,
988    Stores the corresponding year-of-day into *YDAY.
989    Returns true if successful, false if no week was present. */
990 static char *
parse_week(struct data_in * i,long int * yday)991 parse_week (struct data_in *i, long int *yday)
992 {
993   char *error;
994   long week;
995 
996   error = parse_int (i, &week, SIZE_MAX);
997   if (error != NULL)
998     return error;
999   if (week >= 1 && week <= 53)
1000     {
1001       *yday = (week - 1) * 7 + 1;
1002       return NULL;
1003     }
1004 
1005   return xasprintf (_("Week (%ld) must be between 1 and 53."), week);
1006 }
1007 
1008 /* Parses a time delimiter from the beginning of I.
1009    Returns true if successful, false if no delimiter was
1010    present. */
1011 static char *
parse_time_delimiter(struct data_in * i)1012 parse_time_delimiter (struct data_in *i)
1013 {
1014   if (ss_ltrim (&i->input, ss_cstr (":" CC_SPACES)) > 0)
1015     return NULL;
1016 
1017   return xstrdup (_("Delimiter expected between fields in time."));
1018 }
1019 
1020 /* Parses minutes and optional seconds from the beginning of I.
1021    The time is converted into seconds, which are added to
1022    *TIME.
1023    Returns true if successful, false if an error was found. */
1024 static char *
parse_minute_second(struct data_in * i,double * time)1025 parse_minute_second (struct data_in *i, double *time)
1026 {
1027   long minute;
1028   char buf[64];
1029   char *error;
1030   char *cp;
1031 
1032   /* Parse minutes. */
1033   error = parse_int (i, &minute, SIZE_MAX);
1034   if (error != NULL)
1035     return error;
1036   if (i->format != FMT_MTIME && (minute < 0 || minute > 59))
1037     return xasprintf (_("Minute (%ld) must be between 0 and 59."), minute);
1038   *time += 60. * minute;
1039 
1040   /* Check for seconds. */
1041   if (ss_ltrim (&i->input, ss_cstr (":" CC_SPACES)) == 0
1042       || !c_isdigit (ss_first (i->input)))
1043    return NULL;
1044 
1045   /* Parse seconds. */
1046   cp = buf;
1047   while (c_isdigit (ss_first (i->input)))
1048     *cp++ = ss_get_byte (&i->input);
1049   if (ss_match_byte (&i->input, settings_get_decimal_char (FMT_F)))
1050     *cp++ = '.';
1051   while (c_isdigit (ss_first (i->input)))
1052     *cp++ = ss_get_byte (&i->input);
1053   *cp = '\0';
1054 
1055   *time += c_strtod (buf, NULL);
1056 
1057   return NULL;
1058 }
1059 
1060 /* Parses a weekday name from the beginning of I,
1061    storing a value of 1=Sunday...7=Saturday into *WEEKDAY.
1062    Returns true if successful, false if an error was found. */
1063 static char *
parse_weekday(struct data_in * i,long * weekday)1064 parse_weekday (struct data_in *i, long *weekday)
1065 {
1066   static const char *const weekday_names[] =
1067     {
1068       "su", "mo", "tu", "we", "th", "fr", "sa",
1069       NULL,
1070     };
1071 
1072   struct substring token = parse_name_token (i);
1073   bool ok = match_name (ss_head (token, 2), weekday_names, weekday);
1074   if (!ok)
1075     return xstrdup (_("Unrecognized weekday name.  At least the first two "
1076                       "letters of an English weekday name must be "
1077                       "specified."));
1078   return NULL;
1079 }
1080 
1081 /* Date & time formats. */
1082 
1083 /* Parses WKDAY format. */
1084 static char *
parse_WKDAY(struct data_in * i)1085 parse_WKDAY (struct data_in *i)
1086 {
1087   long weekday = 0;
1088   char *error;
1089 
1090   if (trim_spaces_and_check_missing (i))
1091     return NULL;
1092 
1093   error = parse_weekday (i, &weekday);
1094   if (error == NULL)
1095     error = parse_trailer (i);
1096 
1097   i->output->f = weekday;
1098   return error;
1099 }
1100 
1101 /* Parses MONTH format. */
1102 static char *
parse_MONTH(struct data_in * i)1103 parse_MONTH (struct data_in *i)
1104 {
1105   long month;
1106   char *error;
1107 
1108   if (trim_spaces_and_check_missing (i))
1109     return NULL;
1110 
1111   error = parse_month (i, &month);
1112   if (error == NULL)
1113     error = parse_trailer (i);
1114 
1115   i->output->f = month;
1116   return error;
1117 }
1118 
1119 /* Parses DATE, ADATE, EDATE, JDATE, SDATE, QYR, MOYR, KWYR,
1120    DATETIME, YMDHMS, MTIME, TIME, and DTIME formats. */
1121 static char *
parse_date(struct data_in * i)1122 parse_date (struct data_in *i)
1123 {
1124   long int year = INT_MIN;
1125   long int month = 1;
1126   long int day = 1;
1127   long int yday = 1;
1128   double time = 0, date = 0;
1129   enum time_sign time_sign = SIGN_NO_TIME;
1130 
1131   const char *template = fmt_date_template (i->format, 0);
1132   size_t template_width = strlen (template);
1133   char *error;
1134 
1135   if (trim_spaces_and_check_missing (i))
1136     return NULL;
1137 
1138   while (*template != '\0')
1139     {
1140       unsigned char ch = *template;
1141       int count = 1;
1142 
1143       while (template[count] == ch)
1144         count++;
1145       template += count;
1146 
1147       switch (ch)
1148         {
1149         case 'd':
1150           error = count < 3 ? parse_day (i, &day) : parse_yday (i, &yday);
1151           break;
1152         case 'm':
1153           error = parse_month (i, &month);
1154           break;
1155         case 'y':
1156           {
1157             size_t max_digits;
1158             if (!c_isalpha (*template))
1159               max_digits = SIZE_MAX;
1160             else
1161               {
1162                 if (ss_length (i->input) >= template_width + 2)
1163                   max_digits = 4;
1164                 else
1165                   max_digits = 2;
1166               }
1167             error = parse_year (i, &year, max_digits);
1168           }
1169           break;
1170         case 'q':
1171           error = parse_quarter (i, &month);
1172           break;
1173         case 'w':
1174           error = parse_week (i, &yday);
1175           break;
1176         case 'D':
1177           parse_time_sign (i, &time_sign);
1178           error = parse_time_units (i, 60. * 60. * 24., &time);
1179           break;
1180         case 'H':
1181           parse_time_sign (i, &time_sign);
1182           error = parse_time_units (i, 60. * 60., &time);
1183           break;
1184         case 'M':
1185           if (i->format == FMT_MTIME)
1186             parse_time_sign (i, &time_sign);
1187           error = parse_minute_second (i, &time);
1188           break;
1189         case '-':
1190         case '/':
1191         case '.':
1192           error = parse_date_delimiter (i);
1193           break;
1194         case ':':
1195           error = parse_time_delimiter (i);
1196           break;
1197         case ' ':
1198           if (i->format != FMT_MOYR)
1199             {
1200               parse_spaces (i);
1201               error = NULL;
1202             }
1203           else
1204             error = parse_date_delimiter (i);
1205           break;
1206         default:
1207           assert (count == 1);
1208           if (!ss_match_byte (&i->input, c_toupper (ch))
1209               && !ss_match_byte (&i->input, c_tolower (ch)))
1210             error = xasprintf (_("`%c' expected in date field."), ch);
1211           else
1212             error = NULL;
1213           break;
1214         }
1215       if (error != NULL)
1216         return error;
1217     }
1218   error = parse_trailer (i);
1219   if (error != NULL)
1220     return error;
1221 
1222   if (year != INT_MIN)
1223     {
1224       char *error;
1225       double ofs;
1226 
1227       ofs = calendar_gregorian_to_offset (year, month, day, &error);
1228       if (ofs == SYSMIS)
1229         return error;
1230       date = (yday - 1 + ofs) * 60. * 60. * 24.;
1231     }
1232   else
1233     date = 0.;
1234   i->output->f = date + (time_sign == SIGN_NEGATIVE ? -time : time);
1235 
1236   return NULL;
1237 }
1238 
1239 /* Utility functions. */
1240 
1241 /* Sets the default result for I.
1242    For a numeric format, this is the value set on SET BLANKS
1243    (typically system-missing); for a string format, it is all
1244    spaces. */
1245 static void
default_result(struct data_in * i)1246 default_result (struct data_in *i)
1247 {
1248   if (fmt_is_string (i->format))
1249     memset (i->output->s, ' ', i->width);
1250   else
1251     i->output->f = settings_get_blanks ();
1252 }
1253 
1254 /* Trims leading and trailing spaces from I.
1255    If the result is empty, or a single period character, then
1256    sets the default result and returns true; otherwise, returns
1257    false. */
1258 static bool
trim_spaces_and_check_missing(struct data_in * i)1259 trim_spaces_and_check_missing (struct data_in *i)
1260 {
1261   ss_trim (&i->input, ss_cstr (" "));
1262   if (ss_is_empty (i->input) || ss_equals (i->input, ss_cstr (".")))
1263     {
1264       default_result (i);
1265       return true;
1266     }
1267   return false;
1268 }
1269 
/* Returns the integer value, in the range 0...15, of hex digit
   C, which may be in upper or lower case.  C is converted to
   unsigned char before it is examined.

   C must be a hex digit: any other value, including a null
   byte, fails the assertion. */
static int
hexit_value (int c)
{
  /* Digits and lowercase letters sit at the index equal to their
     value; the uppercase letters follow, 6 positions after
     their lowercase counterparts. */
  static const char hexits[] = "0123456789abcdefABCDEF";
  const unsigned char byte = c;
  /* strchr() treats the terminating null as part of the string,
     so a null byte must be rejected explicitly or it would be
     "found" at the end of the table. */
  const char *cp = byte != '\0' ? strchr (hexits, byte) : NULL;
  int ofs;

  assert (cp != NULL);
  ofs = (int) (cp - hexits);
  return ofs < 16 ? ofs : ofs - 6;
}
1280