1 /* PSPP - a program for statistical analysis.
2    Copyright (C) 2010, 2011, 2013 Free Software Foundation, Inc.
3 
4    This program is free software: you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation, either version 3 of the License, or
7    (at your option) any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
16 
17 #include <config.h>
18 
19 #include "language/lexer/scan.h"
20 
21 #include <limits.h>
22 #include <unistr.h>
23 
24 #include "data/identifier.h"
25 #include "language/lexer/token.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/cast.h"
28 
29 #include "gl/c-ctype.h"
30 #include "gl/c-strtod.h"
31 #include "gl/xmemdup0.h"
32 
33 enum
34   {
35     S_START,
36     S_DASH,
37     S_STRING
38   };
39 
40 #define SS_NL_BEFORE_PLUS (1u << 0)
41 #define SS_PLUS           (1u << 1)
42 #define SS_NL_AFTER_PLUS  (1u << 2)
43 
44 /* Returns the integer value of (hex) digit C. */
45 static int
digit_value(int c)46 digit_value (int c)
47 {
48   switch (c)
49     {
50     case '0': return 0;
51     case '1': return 1;
52     case '2': return 2;
53     case '3': return 3;
54     case '4': return 4;
55     case '5': return 5;
56     case '6': return 6;
57     case '7': return 7;
58     case '8': return 8;
59     case '9': return 9;
60     case 'a': case 'A': return 10;
61     case 'b': case 'B': return 11;
62     case 'c': case 'C': return 12;
63     case 'd': case 'D': return 13;
64     case 'e': case 'E': return 14;
65     case 'f': case 'F': return 15;
66     default: return INT_MAX;
67     }
68 }
69 
70 static bool
scan_quoted_string__(struct substring s,struct token * token)71 scan_quoted_string__ (struct substring s, struct token *token)
72 {
73   int quote;
74 
75   /* Trim ' or " from front and back. */
76   quote = s.string[s.length - 1];
77   s.string++;
78   s.length -= 2;
79 
80   ss_realloc (&token->string, token->string.length + s.length + 1);
81 
82   for (;;)
83     {
84       size_t pos = ss_find_byte (s, quote);
85       if (pos == SIZE_MAX)
86         break;
87 
88       memcpy (ss_end (token->string), s.string, pos + 1);
89       token->string.length += pos + 1;
90       ss_advance (&s, pos + 2);
91     }
92 
93   memcpy (ss_end (token->string), s.string, ss_length (s));
94   token->string.length += ss_length (s);
95 
96   return true;
97 }
98 
99 static bool
scan_hex_string__(struct substring s,struct token * token)100 scan_hex_string__ (struct substring s, struct token *token)
101 {
102   uint8_t *dst;
103   size_t i;
104 
105   /* Trim X' from front and ' from back. */
106   s.string += 2;
107   s.length -= 3;
108 
109   if (s.length % 2 != 0)
110     {
111       token->type = SCAN_BAD_HEX_LENGTH;
112       token->number = s.length;
113       return false;
114     }
115 
116   ss_realloc (&token->string, token->string.length + s.length / 2 + 1);
117   dst = CHAR_CAST (uint8_t *, ss_end (token->string));
118   token->string.length += s.length / 2;
119   for (i = 0; i < s.length; i += 2)
120     {
121       int hi = digit_value (s.string[i]);
122       int lo = digit_value (s.string[i + 1]);
123 
124       if (hi >= 16 || lo >= 16)
125         {
126           token->type = SCAN_BAD_HEX_DIGIT;
127           token->number = s.string[hi >= 16 ? i : i + 1];
128           return false;
129         }
130 
131       *dst++ = hi * 16 + lo;
132     }
133 
134   return true;
135 }
136 
137 static bool
scan_unicode_string__(struct substring s,struct token * token)138 scan_unicode_string__ (struct substring s, struct token *token)
139 {
140   uint8_t *dst;
141   ucs4_t uc;
142   size_t i;
143 
144   /* Trim U' from front and ' from back. */
145   s.string += 2;
146   s.length -= 3;
147 
148   if (s.length < 1 || s.length > 8)
149     {
150       token->type = SCAN_BAD_UNICODE_LENGTH;
151       token->number = s.length;
152       return 0;
153     }
154 
155   ss_realloc (&token->string, token->string.length + 4 + 1);
156 
157   uc = 0;
158   for (i = 0; i < s.length; i++)
159     {
160       int digit = digit_value (s.string[i]);
161       if (digit >= 16)
162         {
163           token->type = SCAN_BAD_UNICODE_DIGIT;
164           token->number = s.string[i];
165           return 0;
166         }
167       uc = uc * 16 + digit;
168     }
169 
170   if ((uc >= 0xd800 && uc < 0xe000) || uc > 0x10ffff)
171     {
172       token->type = SCAN_BAD_UNICODE_CODE_POINT;
173       token->number = uc;
174       return 0;
175     }
176 
177   dst = CHAR_CAST (uint8_t *, ss_end (token->string));
178   token->string.length += u8_uctomb (dst, uc, 4);
179 
180   return true;
181 }
182 
183 static enum scan_result
scan_string_segment__(struct scanner * scanner,enum segment_type type,struct substring s,struct token * token)184 scan_string_segment__ (struct scanner *scanner, enum segment_type type,
185                        struct substring s, struct token *token)
186 {
187   bool ok;
188 
189   switch (type)
190     {
191     case SEG_QUOTED_STRING:
192       ok = scan_quoted_string__ (s, token);
193       break;
194 
195     case SEG_HEX_STRING:
196       ok = scan_hex_string__ (s, token);
197       break;
198 
199     case SEG_UNICODE_STRING:
200       ok = scan_unicode_string__ (s, token);
201       break;
202 
203     default:
204       NOT_REACHED ();
205     }
206 
207   if (ok)
208     {
209       token->type = T_STRING;
210       token->string.string[token->string.length] = '\0';
211       scanner->state = S_STRING;
212       scanner->substate = 0;
213       return SCAN_SAVE;
214     }
215   else
216     {
217       /* The function we called above should have filled in token->type and
218          token->number properly to describe the error. */
219       ss_dealloc (&token->string);
220       token->string = ss_empty ();
221       return SCAN_DONE;
222     }
223 
224 }
225 
226 static enum scan_result
add_bit(struct scanner * scanner,unsigned int bit)227 add_bit (struct scanner *scanner, unsigned int bit)
228 {
229   if (!(scanner->substate & bit))
230     {
231       scanner->substate |= bit;
232       return SCAN_MORE;
233     }
234   else
235     return SCAN_BACK;
236 }
237 
238 static enum scan_result
scan_string__(struct scanner * scanner,enum segment_type type,struct substring s,struct token * token)239 scan_string__ (struct scanner *scanner, enum segment_type type,
240                struct substring s, struct token *token)
241 {
242   switch (type)
243     {
244     case SEG_SPACES:
245     case SEG_COMMENT:
246       return SCAN_MORE;
247 
248     case SEG_NEWLINE:
249       if (scanner->substate & SS_PLUS)
250         return add_bit (scanner, SS_NL_AFTER_PLUS);
251       else
252         return add_bit (scanner, SS_NL_BEFORE_PLUS);
253 
254     case SEG_PUNCT:
255       return (s.length == 1 && s.string[0] == '+'
256               ? add_bit (scanner, SS_PLUS)
257               : SCAN_BACK);
258 
259     case SEG_QUOTED_STRING:
260     case SEG_HEX_STRING:
261     case SEG_UNICODE_STRING:
262       return (scanner->substate & SS_PLUS
263               ? scan_string_segment__ (scanner, type, s, token)
264               : SCAN_BACK);
265 
266     default:
267       return SCAN_BACK;
268     }
269 }
270 
271 static enum token_type
scan_reserved_word__(struct substring word)272 scan_reserved_word__ (struct substring word)
273 {
274   switch (c_toupper (word.string[0]))
275     {
276     case 'B':
277       return T_BY;
278 
279     case 'E':
280       return T_EQ;
281 
282     case 'G':
283       return c_toupper (word.string[1]) == 'E' ? T_GE : T_GT;
284 
285     case 'L':
286       return c_toupper (word.string[1]) == 'E' ? T_LE : T_LT;
287 
288     case 'N':
289       return word.length == 2 ? T_NE : T_NOT;
290 
291     case 'O':
292       return T_OR;
293 
294     case 'T':
295       return T_TO;
296 
297     case 'A':
298       return c_toupper (word.string[1]) == 'L' ? T_ALL : T_AND;
299 
300     case 'W':
301       return T_WITH;
302     }
303 
304   NOT_REACHED ();
305 }
306 
307 static enum token_type
scan_punct1__(char c0)308 scan_punct1__ (char c0)
309 {
310   switch (c0)
311     {
312     case '(': return T_LPAREN;
313     case ')': return T_RPAREN;
314     case ',': return T_COMMA;
315     case '=': return T_EQUALS;
316     case '-': return T_DASH;
317     case '[': return T_LBRACK;
318     case ']': return T_RBRACK;
319     case '&': return T_AND;
320     case '|': return T_OR;
321     case '+': return T_PLUS;
322     case '/': return T_SLASH;
323     case '*': return T_ASTERISK;
324     case '<': return T_LT;
325     case '>': return T_GT;
326     case '~': return T_NOT;
327     }
328 
329   NOT_REACHED ();
330 }
331 
332 static enum token_type
scan_punct2__(char c0,char c1)333 scan_punct2__ (char c0, char c1)
334 {
335   switch (c0)
336     {
337     case '*':
338       return T_EXP;
339 
340     case '<':
341       return c1 == '=' ? T_LE : T_NE;
342 
343     case '>':
344       return T_GE;
345 
346     case '~':
347       return T_NE;
348 
349     case '&':
350       return T_AND;
351 
352     case '|':
353       return T_OR;
354     }
355 
356   NOT_REACHED ();
357 }
358 
359 static enum token_type
scan_punct__(struct substring s)360 scan_punct__ (struct substring s)
361 {
362   return (s.length == 1
363           ? scan_punct1__ (s.string[0])
364           : scan_punct2__ (s.string[0], s.string[1]));
365 }
366 
367 static double
scan_number__(struct substring s)368 scan_number__ (struct substring s)
369 {
370   char buf[128];
371   double number;
372   char *p;
373 
374   if (s.length < sizeof buf)
375     {
376       p = buf;
377       memcpy (buf, s.string, s.length);
378       buf[s.length] = '\0';
379     }
380   else
381     p = xmemdup0 (s.string, s.length);
382 
383   number = c_strtod (p, NULL);
384 
385   if (p != buf)
386     free (p);
387 
388   return number;
389 }
390 
391 static enum scan_result
scan_unexpected_char(const struct substring * s,struct token * token)392 scan_unexpected_char (const struct substring *s, struct token *token)
393 {
394   ucs4_t uc;
395 
396   token->type = SCAN_UNEXPECTED_CHAR;
397   u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, s->string), s->length);
398   token->number = uc;
399 
400   return SCAN_DONE;
401 }
402 
403 const char *
scan_type_to_string(enum scan_type type)404 scan_type_to_string (enum scan_type type)
405 {
406   switch (type)
407     {
408 #define SCAN_TYPE(NAME) case SCAN_##NAME: return #NAME;
409       SCAN_TYPES
410 #undef SCAN_TYPE
411 
412     default:
413       return token_type_to_name ((enum token_type) type);
414     }
415 }
416 
417 bool
is_scan_type(enum scan_type type)418 is_scan_type (enum scan_type type)
419 {
420   return type > SCAN_FIRST && type < SCAN_LAST;
421 }
422 
423 static enum scan_result
scan_start__(struct scanner * scanner,enum segment_type type,struct substring s,struct token * token)424 scan_start__ (struct scanner *scanner, enum segment_type type,
425               struct substring s, struct token *token)
426 {
427   switch (type)
428     {
429     case SEG_NUMBER:
430       token->type = T_POS_NUM;
431       token->number = scan_number__ (s);
432       return SCAN_DONE;
433 
434     case SEG_QUOTED_STRING:
435     case SEG_HEX_STRING:
436     case SEG_UNICODE_STRING:
437       return scan_string_segment__ (scanner, type, s, token);
438 
439     case SEG_UNQUOTED_STRING:
440     case SEG_DO_REPEAT_COMMAND:
441     case SEG_INLINE_DATA:
442     case SEG_DOCUMENT:
443       token->type = T_STRING;
444       ss_alloc_substring (&token->string, s);
445       return SCAN_DONE;
446 
447     case SEG_RESERVED_WORD:
448       token->type = scan_reserved_word__ (s);
449       return SCAN_DONE;
450 
451     case SEG_IDENTIFIER:
452       token->type = T_ID;
453       ss_alloc_substring (&token->string, s);
454       return SCAN_DONE;
455 
456     case SEG_PUNCT:
457       if (s.length == 1 && s.string[0] == '-')
458         {
459           scanner->state = S_DASH;
460           return SCAN_SAVE;
461         }
462       else
463         {
464           token->type = scan_punct__ (s);
465           return SCAN_DONE;
466         }
467 
468     case SEG_SHBANG:
469     case SEG_SPACES:
470     case SEG_COMMENT:
471     case SEG_NEWLINE:
472     case SEG_COMMENT_COMMAND:
473       token->type = SCAN_SKIP;
474       return SCAN_DONE;
475 
476     case SEG_START_DOCUMENT:
477       token->type = T_ID;
478       ss_alloc_substring (&token->string, ss_cstr ("DOCUMENT"));
479       return SCAN_DONE;
480 
481     case SEG_START_COMMAND:
482     case SEG_SEPARATE_COMMANDS:
483     case SEG_END_COMMAND:
484       token->type = T_ENDCMD;
485       return SCAN_DONE;
486 
487     case SEG_END:
488       token->type = T_STOP;
489       return SCAN_DONE;
490 
491     case SEG_EXPECTED_QUOTE:
492       token->type = SCAN_EXPECTED_QUOTE;
493       return SCAN_DONE;
494 
495     case SEG_EXPECTED_EXPONENT:
496       token->type = SCAN_EXPECTED_EXPONENT;
497       ss_alloc_substring (&token->string, s);
498       return SCAN_DONE;
499 
500     case SEG_UNEXPECTED_DOT:
501       token->type = SCAN_UNEXPECTED_DOT;
502       return SCAN_DONE;
503 
504     case SEG_UNEXPECTED_CHAR:
505       return scan_unexpected_char (&s, token);
506     }
507 
508   NOT_REACHED ();
509 }
510 
511 static enum scan_result
scan_dash__(enum segment_type type,struct substring s,struct token * token)512 scan_dash__ (enum segment_type type, struct substring s, struct token *token)
513 {
514   switch (type)
515     {
516     case SEG_SPACES:
517     case SEG_COMMENT:
518       return SCAN_MORE;
519 
520     case SEG_NUMBER:
521       token->type = T_NEG_NUM;
522       token->number = -scan_number__ (s);
523       return SCAN_DONE;
524 
525     default:
526       token->type = T_DASH;
527       return SCAN_BACK;
528     }
529 }
530 
531 /* Initializes SCANNER for scanning a token from a sequence of segments.
532    Initializes TOKEN as the output token.  (The client retains ownership of
533    TOKEN, but it must be preserved across subsequent calls to scanner_push()
534    for SCANNER.)
535 
536    A scanner only produces a single token.  To obtain the next token,
537    re-initialize it by calling this function again.
538 
539    A scanner does not contain any external references, so nothing needs to be
540    done to destroy one.  For the same reason, scanners may be copied with plain
541    struct assignment (or memcpy). */
542 void
scanner_init(struct scanner * scanner,struct token * token)543 scanner_init (struct scanner *scanner, struct token *token)
544 {
545   scanner->state = S_START;
546   token_init (token);
547 }
548 
549 /* Adds the segment with type TYPE and UTF-8 text S to SCANNER.  TOKEN must be
550    the same token passed to scanner_init() for SCANNER, or a copy of it.
551    scanner_push() may modify TOKEN.  The client retains ownership of TOKEN,
552 
553    The possible return values are:
554 
555      - SCAN_DONE: All of the segments that have been passed to scanner_push()
556        form the token now stored in TOKEN.  SCANNER is now "used up" and must
557        be reinitialized with scanner_init() if it is to be used again.
558 
559        Most tokens only consist of a single segment, so this is the most common
560        return value.
561 
562      - SCAN_MORE: The segments passed to scanner_push() don't yet determine a
563        token.  The caller should call scanner_push() again with the next token.
564        (This won't happen if TYPE is SEG_END indicating the end of input.)
565 
566      - SCAN_SAVE: This is similar to SCAN_MORE, with one difference: the caller
567        needs to "save its place" in the stream of segments for a possible
568        future SCAN_BACK return.  This value can be returned more than once in a
569        sequence of scanner_push() calls for SCANNER, but the caller only needs
570        to keep track of the most recent position.
571 
572      - SCAN_BACK: This is similar to SCAN_DONE, but the token consists of only
573        the segments up to and including the segment for which SCAN_SAVE was
574        most recently returned.  Segments following that one should be passed to
575        the next scanner to be initialized.
576 */
577 enum scan_result
scanner_push(struct scanner * scanner,enum segment_type type,struct substring s,struct token * token)578 scanner_push (struct scanner *scanner, enum segment_type type,
579               struct substring s, struct token *token)
580 {
581   switch (scanner->state)
582     {
583     case S_START:
584       return scan_start__ (scanner, type, s, token);
585 
586     case S_DASH:
587       return scan_dash__ (type, s, token);
588 
589     case S_STRING:
590       return scan_string__ (scanner, type, s, token);
591     }
592 
593   NOT_REACHED ();
594 }
595 
596 /* Initializes SLEX for parsing INPUT, which is LENGTH bytes long, in the
597    specified MODE.
598 
599    SLEX has no internal state to free, but it retains a reference to INPUT, so
600    INPUT must not be modified or freed while SLEX is still in use. */
601 void
string_lexer_init(struct string_lexer * slex,const char * input,size_t length,enum segmenter_mode mode)602 string_lexer_init (struct string_lexer *slex, const char *input, size_t length,
603                    enum segmenter_mode mode)
604 {
605   slex->input = input;
606   slex->length = length;
607   slex->offset = 0;
608   segmenter_init (&slex->segmenter, mode);
609 }
610 
611 /*  */
612 bool
string_lexer_next(struct string_lexer * slex,struct token * token)613 string_lexer_next (struct string_lexer *slex, struct token *token)
614 {
615   struct segmenter saved_segmenter;
616   size_t saved_offset = 0;
617 
618   struct scanner scanner;
619 
620   scanner_init (&scanner, token);
621   for (;;)
622     {
623       const char *s = slex->input + slex->offset;
624       size_t left = slex->length - slex->offset;
625       enum segment_type type;
626       int n;
627 
628       n = segmenter_push (&slex->segmenter, s, left, true, &type);
629       assert (n >= 0);
630 
631       slex->offset += n;
632       switch (scanner_push (&scanner, type, ss_buffer (s, n), token))
633         {
634         case SCAN_BACK:
635           slex->segmenter = saved_segmenter;
636           slex->offset = saved_offset;
637           /* Fall through. */
638         case SCAN_DONE:
639           return token->type != T_STOP;
640 
641         case SCAN_MORE:
642           break;
643 
644         case SCAN_SAVE:
645           saved_segmenter = slex->segmenter;
646           saved_offset = slex->offset;
647           break;
648         }
649     }
650 }
651