1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011, 2013 Free Software Foundation, Inc.
3
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
16
17 #include <config.h>
18
19 #include "language/lexer/scan.h"
20
21 #include <limits.h>
22 #include <unistr.h>
23
24 #include "data/identifier.h"
25 #include "language/lexer/token.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/cast.h"
28
29 #include "gl/c-ctype.h"
30 #include "gl/c-strtod.h"
31 #include "gl/xmemdup0.h"
32
33 enum
34 {
35 S_START,
36 S_DASH,
37 S_STRING
38 };
39
40 #define SS_NL_BEFORE_PLUS (1u << 0)
41 #define SS_PLUS (1u << 1)
42 #define SS_NL_AFTER_PLUS (1u << 2)
43
44 /* Returns the integer value of (hex) digit C. */
45 static int
digit_value(int c)46 digit_value (int c)
47 {
48 switch (c)
49 {
50 case '0': return 0;
51 case '1': return 1;
52 case '2': return 2;
53 case '3': return 3;
54 case '4': return 4;
55 case '5': return 5;
56 case '6': return 6;
57 case '7': return 7;
58 case '8': return 8;
59 case '9': return 9;
60 case 'a': case 'A': return 10;
61 case 'b': case 'B': return 11;
62 case 'c': case 'C': return 12;
63 case 'd': case 'D': return 13;
64 case 'e': case 'E': return 14;
65 case 'f': case 'F': return 15;
66 default: return INT_MAX;
67 }
68 }
69
70 static bool
scan_quoted_string__(struct substring s,struct token * token)71 scan_quoted_string__ (struct substring s, struct token *token)
72 {
73 int quote;
74
75 /* Trim ' or " from front and back. */
76 quote = s.string[s.length - 1];
77 s.string++;
78 s.length -= 2;
79
80 ss_realloc (&token->string, token->string.length + s.length + 1);
81
82 for (;;)
83 {
84 size_t pos = ss_find_byte (s, quote);
85 if (pos == SIZE_MAX)
86 break;
87
88 memcpy (ss_end (token->string), s.string, pos + 1);
89 token->string.length += pos + 1;
90 ss_advance (&s, pos + 2);
91 }
92
93 memcpy (ss_end (token->string), s.string, ss_length (s));
94 token->string.length += ss_length (s);
95
96 return true;
97 }
98
99 static bool
scan_hex_string__(struct substring s,struct token * token)100 scan_hex_string__ (struct substring s, struct token *token)
101 {
102 uint8_t *dst;
103 size_t i;
104
105 /* Trim X' from front and ' from back. */
106 s.string += 2;
107 s.length -= 3;
108
109 if (s.length % 2 != 0)
110 {
111 token->type = SCAN_BAD_HEX_LENGTH;
112 token->number = s.length;
113 return false;
114 }
115
116 ss_realloc (&token->string, token->string.length + s.length / 2 + 1);
117 dst = CHAR_CAST (uint8_t *, ss_end (token->string));
118 token->string.length += s.length / 2;
119 for (i = 0; i < s.length; i += 2)
120 {
121 int hi = digit_value (s.string[i]);
122 int lo = digit_value (s.string[i + 1]);
123
124 if (hi >= 16 || lo >= 16)
125 {
126 token->type = SCAN_BAD_HEX_DIGIT;
127 token->number = s.string[hi >= 16 ? i : i + 1];
128 return false;
129 }
130
131 *dst++ = hi * 16 + lo;
132 }
133
134 return true;
135 }
136
137 static bool
scan_unicode_string__(struct substring s,struct token * token)138 scan_unicode_string__ (struct substring s, struct token *token)
139 {
140 uint8_t *dst;
141 ucs4_t uc;
142 size_t i;
143
144 /* Trim U' from front and ' from back. */
145 s.string += 2;
146 s.length -= 3;
147
148 if (s.length < 1 || s.length > 8)
149 {
150 token->type = SCAN_BAD_UNICODE_LENGTH;
151 token->number = s.length;
152 return 0;
153 }
154
155 ss_realloc (&token->string, token->string.length + 4 + 1);
156
157 uc = 0;
158 for (i = 0; i < s.length; i++)
159 {
160 int digit = digit_value (s.string[i]);
161 if (digit >= 16)
162 {
163 token->type = SCAN_BAD_UNICODE_DIGIT;
164 token->number = s.string[i];
165 return 0;
166 }
167 uc = uc * 16 + digit;
168 }
169
170 if ((uc >= 0xd800 && uc < 0xe000) || uc > 0x10ffff)
171 {
172 token->type = SCAN_BAD_UNICODE_CODE_POINT;
173 token->number = uc;
174 return 0;
175 }
176
177 dst = CHAR_CAST (uint8_t *, ss_end (token->string));
178 token->string.length += u8_uctomb (dst, uc, 4);
179
180 return true;
181 }
182
183 static enum scan_result
scan_string_segment__(struct scanner * scanner,enum segment_type type,struct substring s,struct token * token)184 scan_string_segment__ (struct scanner *scanner, enum segment_type type,
185 struct substring s, struct token *token)
186 {
187 bool ok;
188
189 switch (type)
190 {
191 case SEG_QUOTED_STRING:
192 ok = scan_quoted_string__ (s, token);
193 break;
194
195 case SEG_HEX_STRING:
196 ok = scan_hex_string__ (s, token);
197 break;
198
199 case SEG_UNICODE_STRING:
200 ok = scan_unicode_string__ (s, token);
201 break;
202
203 default:
204 NOT_REACHED ();
205 }
206
207 if (ok)
208 {
209 token->type = T_STRING;
210 token->string.string[token->string.length] = '\0';
211 scanner->state = S_STRING;
212 scanner->substate = 0;
213 return SCAN_SAVE;
214 }
215 else
216 {
217 /* The function we called above should have filled in token->type and
218 token->number properly to describe the error. */
219 ss_dealloc (&token->string);
220 token->string = ss_empty ();
221 return SCAN_DONE;
222 }
223
224 }
225
226 static enum scan_result
add_bit(struct scanner * scanner,unsigned int bit)227 add_bit (struct scanner *scanner, unsigned int bit)
228 {
229 if (!(scanner->substate & bit))
230 {
231 scanner->substate |= bit;
232 return SCAN_MORE;
233 }
234 else
235 return SCAN_BACK;
236 }
237
238 static enum scan_result
scan_string__(struct scanner * scanner,enum segment_type type,struct substring s,struct token * token)239 scan_string__ (struct scanner *scanner, enum segment_type type,
240 struct substring s, struct token *token)
241 {
242 switch (type)
243 {
244 case SEG_SPACES:
245 case SEG_COMMENT:
246 return SCAN_MORE;
247
248 case SEG_NEWLINE:
249 if (scanner->substate & SS_PLUS)
250 return add_bit (scanner, SS_NL_AFTER_PLUS);
251 else
252 return add_bit (scanner, SS_NL_BEFORE_PLUS);
253
254 case SEG_PUNCT:
255 return (s.length == 1 && s.string[0] == '+'
256 ? add_bit (scanner, SS_PLUS)
257 : SCAN_BACK);
258
259 case SEG_QUOTED_STRING:
260 case SEG_HEX_STRING:
261 case SEG_UNICODE_STRING:
262 return (scanner->substate & SS_PLUS
263 ? scan_string_segment__ (scanner, type, s, token)
264 : SCAN_BACK);
265
266 default:
267 return SCAN_BACK;
268 }
269 }
270
271 static enum token_type
scan_reserved_word__(struct substring word)272 scan_reserved_word__ (struct substring word)
273 {
274 switch (c_toupper (word.string[0]))
275 {
276 case 'B':
277 return T_BY;
278
279 case 'E':
280 return T_EQ;
281
282 case 'G':
283 return c_toupper (word.string[1]) == 'E' ? T_GE : T_GT;
284
285 case 'L':
286 return c_toupper (word.string[1]) == 'E' ? T_LE : T_LT;
287
288 case 'N':
289 return word.length == 2 ? T_NE : T_NOT;
290
291 case 'O':
292 return T_OR;
293
294 case 'T':
295 return T_TO;
296
297 case 'A':
298 return c_toupper (word.string[1]) == 'L' ? T_ALL : T_AND;
299
300 case 'W':
301 return T_WITH;
302 }
303
304 NOT_REACHED ();
305 }
306
307 static enum token_type
scan_punct1__(char c0)308 scan_punct1__ (char c0)
309 {
310 switch (c0)
311 {
312 case '(': return T_LPAREN;
313 case ')': return T_RPAREN;
314 case ',': return T_COMMA;
315 case '=': return T_EQUALS;
316 case '-': return T_DASH;
317 case '[': return T_LBRACK;
318 case ']': return T_RBRACK;
319 case '&': return T_AND;
320 case '|': return T_OR;
321 case '+': return T_PLUS;
322 case '/': return T_SLASH;
323 case '*': return T_ASTERISK;
324 case '<': return T_LT;
325 case '>': return T_GT;
326 case '~': return T_NOT;
327 }
328
329 NOT_REACHED ();
330 }
331
332 static enum token_type
scan_punct2__(char c0,char c1)333 scan_punct2__ (char c0, char c1)
334 {
335 switch (c0)
336 {
337 case '*':
338 return T_EXP;
339
340 case '<':
341 return c1 == '=' ? T_LE : T_NE;
342
343 case '>':
344 return T_GE;
345
346 case '~':
347 return T_NE;
348
349 case '&':
350 return T_AND;
351
352 case '|':
353 return T_OR;
354 }
355
356 NOT_REACHED ();
357 }
358
359 static enum token_type
scan_punct__(struct substring s)360 scan_punct__ (struct substring s)
361 {
362 return (s.length == 1
363 ? scan_punct1__ (s.string[0])
364 : scan_punct2__ (s.string[0], s.string[1]));
365 }
366
367 static double
scan_number__(struct substring s)368 scan_number__ (struct substring s)
369 {
370 char buf[128];
371 double number;
372 char *p;
373
374 if (s.length < sizeof buf)
375 {
376 p = buf;
377 memcpy (buf, s.string, s.length);
378 buf[s.length] = '\0';
379 }
380 else
381 p = xmemdup0 (s.string, s.length);
382
383 number = c_strtod (p, NULL);
384
385 if (p != buf)
386 free (p);
387
388 return number;
389 }
390
391 static enum scan_result
scan_unexpected_char(const struct substring * s,struct token * token)392 scan_unexpected_char (const struct substring *s, struct token *token)
393 {
394 ucs4_t uc;
395
396 token->type = SCAN_UNEXPECTED_CHAR;
397 u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, s->string), s->length);
398 token->number = uc;
399
400 return SCAN_DONE;
401 }
402
403 const char *
scan_type_to_string(enum scan_type type)404 scan_type_to_string (enum scan_type type)
405 {
406 switch (type)
407 {
408 #define SCAN_TYPE(NAME) case SCAN_##NAME: return #NAME;
409 SCAN_TYPES
410 #undef SCAN_TYPE
411
412 default:
413 return token_type_to_name ((enum token_type) type);
414 }
415 }
416
417 bool
is_scan_type(enum scan_type type)418 is_scan_type (enum scan_type type)
419 {
420 return type > SCAN_FIRST && type < SCAN_LAST;
421 }
422
423 static enum scan_result
scan_start__(struct scanner * scanner,enum segment_type type,struct substring s,struct token * token)424 scan_start__ (struct scanner *scanner, enum segment_type type,
425 struct substring s, struct token *token)
426 {
427 switch (type)
428 {
429 case SEG_NUMBER:
430 token->type = T_POS_NUM;
431 token->number = scan_number__ (s);
432 return SCAN_DONE;
433
434 case SEG_QUOTED_STRING:
435 case SEG_HEX_STRING:
436 case SEG_UNICODE_STRING:
437 return scan_string_segment__ (scanner, type, s, token);
438
439 case SEG_UNQUOTED_STRING:
440 case SEG_DO_REPEAT_COMMAND:
441 case SEG_INLINE_DATA:
442 case SEG_DOCUMENT:
443 token->type = T_STRING;
444 ss_alloc_substring (&token->string, s);
445 return SCAN_DONE;
446
447 case SEG_RESERVED_WORD:
448 token->type = scan_reserved_word__ (s);
449 return SCAN_DONE;
450
451 case SEG_IDENTIFIER:
452 token->type = T_ID;
453 ss_alloc_substring (&token->string, s);
454 return SCAN_DONE;
455
456 case SEG_PUNCT:
457 if (s.length == 1 && s.string[0] == '-')
458 {
459 scanner->state = S_DASH;
460 return SCAN_SAVE;
461 }
462 else
463 {
464 token->type = scan_punct__ (s);
465 return SCAN_DONE;
466 }
467
468 case SEG_SHBANG:
469 case SEG_SPACES:
470 case SEG_COMMENT:
471 case SEG_NEWLINE:
472 case SEG_COMMENT_COMMAND:
473 token->type = SCAN_SKIP;
474 return SCAN_DONE;
475
476 case SEG_START_DOCUMENT:
477 token->type = T_ID;
478 ss_alloc_substring (&token->string, ss_cstr ("DOCUMENT"));
479 return SCAN_DONE;
480
481 case SEG_START_COMMAND:
482 case SEG_SEPARATE_COMMANDS:
483 case SEG_END_COMMAND:
484 token->type = T_ENDCMD;
485 return SCAN_DONE;
486
487 case SEG_END:
488 token->type = T_STOP;
489 return SCAN_DONE;
490
491 case SEG_EXPECTED_QUOTE:
492 token->type = SCAN_EXPECTED_QUOTE;
493 return SCAN_DONE;
494
495 case SEG_EXPECTED_EXPONENT:
496 token->type = SCAN_EXPECTED_EXPONENT;
497 ss_alloc_substring (&token->string, s);
498 return SCAN_DONE;
499
500 case SEG_UNEXPECTED_DOT:
501 token->type = SCAN_UNEXPECTED_DOT;
502 return SCAN_DONE;
503
504 case SEG_UNEXPECTED_CHAR:
505 return scan_unexpected_char (&s, token);
506 }
507
508 NOT_REACHED ();
509 }
510
511 static enum scan_result
scan_dash__(enum segment_type type,struct substring s,struct token * token)512 scan_dash__ (enum segment_type type, struct substring s, struct token *token)
513 {
514 switch (type)
515 {
516 case SEG_SPACES:
517 case SEG_COMMENT:
518 return SCAN_MORE;
519
520 case SEG_NUMBER:
521 token->type = T_NEG_NUM;
522 token->number = -scan_number__ (s);
523 return SCAN_DONE;
524
525 default:
526 token->type = T_DASH;
527 return SCAN_BACK;
528 }
529 }
530
531 /* Initializes SCANNER for scanning a token from a sequence of segments.
532 Initializes TOKEN as the output token. (The client retains ownership of
533 TOKEN, but it must be preserved across subsequent calls to scanner_push()
534 for SCANNER.)
535
536 A scanner only produces a single token. To obtain the next token,
537 re-initialize it by calling this function again.
538
539 A scanner does not contain any external references, so nothing needs to be
540 done to destroy one. For the same reason, scanners may be copied with plain
541 struct assignment (or memcpy). */
542 void
scanner_init(struct scanner * scanner,struct token * token)543 scanner_init (struct scanner *scanner, struct token *token)
544 {
545 scanner->state = S_START;
546 token_init (token);
547 }
548
549 /* Adds the segment with type TYPE and UTF-8 text S to SCANNER. TOKEN must be
550 the same token passed to scanner_init() for SCANNER, or a copy of it.
551 scanner_push() may modify TOKEN. The client retains ownership of TOKEN,
552
553 The possible return values are:
554
555 - SCAN_DONE: All of the segments that have been passed to scanner_push()
556 form the token now stored in TOKEN. SCANNER is now "used up" and must
557 be reinitialized with scanner_init() if it is to be used again.
558
559 Most tokens only consist of a single segment, so this is the most common
560 return value.
561
562 - SCAN_MORE: The segments passed to scanner_push() don't yet determine a
563 token. The caller should call scanner_push() again with the next token.
564 (This won't happen if TYPE is SEG_END indicating the end of input.)
565
566 - SCAN_SAVE: This is similar to SCAN_MORE, with one difference: the caller
567 needs to "save its place" in the stream of segments for a possible
568 future SCAN_BACK return. This value can be returned more than once in a
569 sequence of scanner_push() calls for SCANNER, but the caller only needs
570 to keep track of the most recent position.
571
572 - SCAN_BACK: This is similar to SCAN_DONE, but the token consists of only
573 the segments up to and including the segment for which SCAN_SAVE was
574 most recently returned. Segments following that one should be passed to
575 the next scanner to be initialized.
576 */
577 enum scan_result
scanner_push(struct scanner * scanner,enum segment_type type,struct substring s,struct token * token)578 scanner_push (struct scanner *scanner, enum segment_type type,
579 struct substring s, struct token *token)
580 {
581 switch (scanner->state)
582 {
583 case S_START:
584 return scan_start__ (scanner, type, s, token);
585
586 case S_DASH:
587 return scan_dash__ (type, s, token);
588
589 case S_STRING:
590 return scan_string__ (scanner, type, s, token);
591 }
592
593 NOT_REACHED ();
594 }
595
596 /* Initializes SLEX for parsing INPUT, which is LENGTH bytes long, in the
597 specified MODE.
598
599 SLEX has no internal state to free, but it retains a reference to INPUT, so
600 INPUT must not be modified or freed while SLEX is still in use. */
601 void
string_lexer_init(struct string_lexer * slex,const char * input,size_t length,enum segmenter_mode mode)602 string_lexer_init (struct string_lexer *slex, const char *input, size_t length,
603 enum segmenter_mode mode)
604 {
605 slex->input = input;
606 slex->length = length;
607 slex->offset = 0;
608 segmenter_init (&slex->segmenter, mode);
609 }
610
611 /* */
612 bool
string_lexer_next(struct string_lexer * slex,struct token * token)613 string_lexer_next (struct string_lexer *slex, struct token *token)
614 {
615 struct segmenter saved_segmenter;
616 size_t saved_offset = 0;
617
618 struct scanner scanner;
619
620 scanner_init (&scanner, token);
621 for (;;)
622 {
623 const char *s = slex->input + slex->offset;
624 size_t left = slex->length - slex->offset;
625 enum segment_type type;
626 int n;
627
628 n = segmenter_push (&slex->segmenter, s, left, true, &type);
629 assert (n >= 0);
630
631 slex->offset += n;
632 switch (scanner_push (&scanner, type, ss_buffer (s, n), token))
633 {
634 case SCAN_BACK:
635 slex->segmenter = saved_segmenter;
636 slex->offset = saved_offset;
637 /* Fall through. */
638 case SCAN_DONE:
639 return token->type != T_STOP;
640
641 case SCAN_MORE:
642 break;
643
644 case SCAN_SAVE:
645 saved_segmenter = slex->segmenter;
646 saved_offset = slex->offset;
647 break;
648 }
649 }
650 }
651