1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
3
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
16
17 #include <config.h>
18
19 #include "language/lexer/lexer.h"
20
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <limits.h>
24 #include <math.h>
25 #include <stdarg.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <unictype.h>
29 #include <unistd.h>
30 #include <unistr.h>
31 #include <uniwidth.h>
32
33 #include "language/command.h"
34 #include "language/lexer/scan.h"
35 #include "language/lexer/segment.h"
36 #include "language/lexer/token.h"
37 #include "libpspp/assertion.h"
38 #include "libpspp/cast.h"
39 #include "libpspp/deque.h"
40 #include "libpspp/i18n.h"
41 #include "libpspp/ll.h"
42 #include "libpspp/message.h"
43 #include "libpspp/misc.h"
44 #include "libpspp/str.h"
45 #include "libpspp/u8-istream.h"
46 #include "output/journal.h"
47 #include "output/text-item.h"
48
49 #include "gl/c-ctype.h"
50 #include "gl/minmax.h"
51 #include "gl/xalloc.h"
52 #include "gl/xmemdup0.h"
53
54 #include "gettext.h"
55 #define _(msgid) gettext (msgid)
56 #define N_(msgid) msgid
57
58 /* A token within a lex_source. */
59 struct lex_token
60 {
61 /* The regular token information. */
62 struct token token;
63
64 /* Location of token in terms of the lex_source's buffer.
65 src->tail <= line_pos <= token_pos <= src->head. */
66 size_t token_pos; /* Start of token. */
67 size_t token_len; /* Length of source for token in bytes. */
68 size_t line_pos; /* Start of line containing token_pos. */
69 int first_line; /* Line number at token_pos. */
70 };
71
72 /* A source of tokens, corresponding to a syntax file.
73
74 This is conceptually a lex_reader wrapped with everything needed to convert
75 its UTF-8 bytes into tokens. */
76 struct lex_source
77 {
78 struct ll ll; /* In lexer's list of sources. */
79 struct lex_reader *reader;
80 struct segmenter segmenter;
81 bool eof; /* True if T_STOP was read from 'reader'. */
82
83 /* Buffer of UTF-8 bytes. */
84 char *buffer;
85 size_t allocated; /* Number of bytes allocated. */
86 size_t tail; /* &buffer[0] offset into UTF-8 source. */
87 size_t head; /* &buffer[head - tail] offset into source. */
88
89 /* Positions in source file, tail <= pos <= head for each member here. */
90 size_t journal_pos; /* First byte not yet output to journal. */
91 size_t seg_pos; /* First byte not yet scanned as token. */
92 size_t line_pos; /* First byte of line containing seg_pos. */
93
94 int n_newlines; /* Number of new-lines up to seg_pos. */
95 bool suppress_next_newline;
96
97 /* Tokens. */
98 struct deque deque; /* Indexes into 'tokens'. */
99 struct lex_token *tokens; /* Lookahead tokens for parser. */
100 };
101
/* Forward declarations: creating and destroying a lex_source. */
static struct lex_source *lex_source_create (struct lex_reader *reader);
static void lex_source_destroy (struct lex_source *src);
104
105 /* Lexer. */
106 struct lexer
107 {
108 struct ll_list sources; /* Contains "struct lex_source"s. */
109 };
110
111 static struct lex_source *lex_source__ (const struct lexer *);
112 static const struct lex_token *lex_next__ (const struct lexer *, int n);
113 static void lex_source_push_endcmd__ (struct lex_source *);
114
115 static void lex_source_pop__ (struct lex_source *);
116 static bool lex_source_get__ (const struct lex_source *);
117 static void lex_source_error_valist (struct lex_source *, int n0, int n1,
118 const char *format, va_list)
119 PRINTF_FORMAT (4, 0);
120 static const struct lex_token *lex_source_next__ (const struct lex_source *,
121 int n);
122
123 /* Initializes READER with the specified CLASS and otherwise some reasonable
124 defaults. The caller should fill in the others members as desired. */
125 void
lex_reader_init(struct lex_reader * reader,const struct lex_reader_class * class)126 lex_reader_init (struct lex_reader *reader,
127 const struct lex_reader_class *class)
128 {
129 reader->class = class;
130 reader->syntax = LEX_SYNTAX_AUTO;
131 reader->error = LEX_ERROR_CONTINUE;
132 reader->file_name = NULL;
133 reader->encoding = NULL;
134 reader->line_number = 0;
135 reader->eof = false;
136 }
137
138 /* Frees any file name already in READER and replaces it by a copy of
139 FILE_NAME, or if FILE_NAME is null then clears any existing name. */
140 void
lex_reader_set_file_name(struct lex_reader * reader,const char * file_name)141 lex_reader_set_file_name (struct lex_reader *reader, const char *file_name)
142 {
143 free (reader->file_name);
144 reader->file_name = file_name != NULL ? xstrdup (file_name) : NULL;
145 }
146
147 /* Creates and returns a new lexer. */
148 struct lexer *
lex_create(void)149 lex_create (void)
150 {
151 struct lexer *lexer = xzalloc (sizeof *lexer);
152 ll_init (&lexer->sources);
153 return lexer;
154 }
155
156 /* Destroys LEXER. */
157 void
lex_destroy(struct lexer * lexer)158 lex_destroy (struct lexer *lexer)
159 {
160 if (lexer != NULL)
161 {
162 struct lex_source *source, *next;
163
164 ll_for_each_safe (source, next, struct lex_source, ll, &lexer->sources)
165 lex_source_destroy (source);
166 free (lexer);
167 }
168 }
169
170 /* Inserts READER into LEXER so that the next token read by LEXER comes from
171 READER. Before the caller, LEXER must either be empty or at a T_ENDCMD
172 token. */
173 void
lex_include(struct lexer * lexer,struct lex_reader * reader)174 lex_include (struct lexer *lexer, struct lex_reader *reader)
175 {
176 assert (ll_is_empty (&lexer->sources) || lex_token (lexer) == T_ENDCMD);
177 ll_push_head (&lexer->sources, &lex_source_create (reader)->ll);
178 }
179
180 /* Appends READER to LEXER, so that it will be read after all other current
181 readers have already been read. */
182 void
lex_append(struct lexer * lexer,struct lex_reader * reader)183 lex_append (struct lexer *lexer, struct lex_reader *reader)
184 {
185 ll_push_tail (&lexer->sources, &lex_source_create (reader)->ll);
186 }
187
188 /* Advancing. */
189
190 static struct lex_token *
lex_push_token__(struct lex_source * src)191 lex_push_token__ (struct lex_source *src)
192 {
193 struct lex_token *token;
194
195 if (deque_is_full (&src->deque))
196 src->tokens = deque_expand (&src->deque, src->tokens, sizeof *src->tokens);
197
198 token = &src->tokens[deque_push_front (&src->deque)];
199 token_init (&token->token);
200 return token;
201 }
202
203 static void
lex_source_pop__(struct lex_source * src)204 lex_source_pop__ (struct lex_source *src)
205 {
206 token_destroy (&src->tokens[deque_pop_back (&src->deque)].token);
207 }
208
209 static void
lex_source_pop_front(struct lex_source * src)210 lex_source_pop_front (struct lex_source *src)
211 {
212 token_destroy (&src->tokens[deque_pop_front (&src->deque)].token);
213 }
214
215 /* Advances LEXER to the next token, consuming the current token. */
216 void
lex_get(struct lexer * lexer)217 lex_get (struct lexer *lexer)
218 {
219 struct lex_source *src;
220
221 src = lex_source__ (lexer);
222 if (src == NULL)
223 return;
224
225 if (!deque_is_empty (&src->deque))
226 lex_source_pop__ (src);
227
228 while (deque_is_empty (&src->deque))
229 if (!lex_source_get__ (src))
230 {
231 lex_source_destroy (src);
232 src = lex_source__ (lexer);
233 if (src == NULL)
234 return;
235 }
236 }
237
238 /* Issuing errors. */
239
/* Reports a syntax error at LEXER's current token.  FORMAT, if non-null, is a
   printf-style format string that, together with the arguments after it,
   supplies the detail of the message. */
void
lex_error (struct lexer *lexer, const char *format, ...)
{
  va_list ap;

  va_start (ap, format);
  lex_next_error_valist (lexer, 0, 0, format, ap);
  va_end (ap);
}
251
/* Same as lex_error(), except that the arguments for FORMAT are supplied as
   the va_list ARGS.  The error is reported at LEXER's current token. */
void
lex_error_valist (struct lexer *lexer, const char *format, va_list args)
{
  const int current = 0;        /* Lookahead index of the current token. */

  lex_next_error_valist (lexer, current, current, format, args);
}
259
/* Reports a syntax error for the tokens with lookahead indexes N0 through N1
   (where 0 is the current token).  FORMAT, if non-null, is a printf-style
   format string that, together with the arguments after it, supplies the
   detail of the message. */
void
lex_next_error (struct lexer *lexer, int n0, int n1, const char *format, ...)
{
  va_list ap;

  va_start (ap, format);
  lex_next_error_valist (lexer, n0, n1, format, ap);
  va_end (ap);
}
271
/* Reports a syntax error at LEXER's current token saying that OPTION0, or one
   of the strings that follow it, is expected.  The argument list must be
   terminated by a null pointer.

   At most MAX_OPTIONS (8) options are reported; any further options are
   ignored.  If OPTION0 itself is null, a bare syntax error is reported.

   (The function name is parenthesized in the definition, presumably so that
   a function-like macro of the same name is not expanded here.) */
void
(lex_error_expecting) (struct lexer *lexer, const char *option0, ...)
{
  enum { MAX_OPTIONS = 8 };
  const char *options[MAX_OPTIONS + 1];
  va_list args;
  int n;

  /* Collect options until the null terminator or until MAX_OPTIONS of them
     have been seen.  The array has room for MAX_OPTIONS + 1 elements so that
     the element following the last accepted option can always be stored.

     The bound must be N < MAX_OPTIONS: with N + 1 < MAX_OPTIONS, N could
     never exceed MAX_OPTIONS - 1, which made "case 8" below unreachable and
     silently dropped the eighth option. */
  va_start (args, option0);
  options[0] = option0;
  n = 0;
  while (n < MAX_OPTIONS && options[n] != NULL)
    options[++n] = va_arg (args, const char *);
  va_end (args);

  /* Each count needs its own format string so that translators can handle
     each list form separately. */
  switch (n)
    {
    case 0:
      lex_error (lexer, NULL);
      break;

    case 1:
      lex_error (lexer, _("expecting %s"), options[0]);
      break;

    case 2:
      lex_error (lexer, _("expecting %s or %s"), options[0], options[1]);
      break;

    case 3:
      lex_error (lexer, _("expecting %s, %s, or %s"), options[0], options[1],
                 options[2]);
      break;

    case 4:
      lex_error (lexer, _("expecting %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3]);
      break;

    case 5:
      lex_error (lexer, _("expecting %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4]);
      break;

    case 6:
      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4],
                 options[5]);
      break;

    case 7:
      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4],
                 options[5], options[6]);
      break;

    case 8:
      lex_error (lexer, _("expecting %s, %s, %s, %s, %s, %s, %s, or %s"),
                 options[0], options[1], options[2], options[3], options[4],
                 options[5], options[6], options[7]);
      break;

    default:
      NOT_REACHED ();
    }
}
340
341 /* Reports an error to the effect that subcommand SBC may only be specified
342 once.
343
344 This function does not take a lexer as an argument or use lex_error(),
345 because the result would ordinarily just be redundant: "Syntax error at
346 SUBCOMMAND: Subcommand SUBCOMMAND may only be specified once.", which does
347 not help the user find the error. */
348 void
lex_sbc_only_once(const char * sbc)349 lex_sbc_only_once (const char *sbc)
350 {
351 msg (SE, _("Subcommand %s may only be specified once."), sbc);
352 }
353
354 /* Reports an error to the effect that subcommand SBC is missing.
355
356 This function does not take a lexer as an argument or use lex_error(),
357 because a missing subcommand can normally be detected only after the whole
358 command has been parsed, and so lex_error() would always report "Syntax
359 error at end of command", which does not help the user find the error. */
360 void
lex_sbc_missing(const char * sbc)361 lex_sbc_missing (const char *sbc)
362 {
363 msg (SE, _("Required subcommand %s was not specified."), sbc);
364 }
365
/* Issues a syntax error at LEXER's current token stating that specification
   SPEC may appear only once within subcommand SBC. */
void
lex_spec_only_once (struct lexer *lexer, const char *sbc, const char *spec)
{
  const char *message = _("%s may only be specified once within subcommand %s");

  lex_error (lexer, message, spec, sbc);
}
374
/* Issues a syntax error at LEXER's current token stating that required
   specification SPEC is missing from subcommand SBC.

   The first %s in the message names the specification and the second names
   the subcommand, so SPEC must be passed before SBC. */
void
lex_spec_missing (struct lexer *lexer, const char *sbc, const char *spec)
{
  lex_error (lexer, _("Required %s specification missing from %s subcommand"),
             spec, sbc);
}
383
384 /* Prints a syntax error message containing the current token and
385 given message MESSAGE (if non-null). */
386 void
lex_next_error_valist(struct lexer * lexer,int n0,int n1,const char * format,va_list args)387 lex_next_error_valist (struct lexer *lexer, int n0, int n1,
388 const char *format, va_list args)
389 {
390 struct lex_source *src = lex_source__ (lexer);
391
392 if (src != NULL)
393 lex_source_error_valist (src, n0, n1, format, args);
394 else
395 {
396 struct string s;
397
398 ds_init_empty (&s);
399 ds_put_format (&s, _("Syntax error at end of input"));
400 if (format != NULL)
401 {
402 ds_put_cstr (&s, ": ");
403 ds_put_vformat (&s, format, args);
404 }
405 ds_put_byte (&s, '.');
406 msg (SE, "%s", ds_cstr (&s));
407 ds_destroy (&s);
408 }
409 }
410
411 /* Checks that we're at end of command.
412 If so, returns a successful command completion code.
413 If not, flags a syntax error and returns an error command
414 completion code. */
415 int
lex_end_of_command(struct lexer * lexer)416 lex_end_of_command (struct lexer *lexer)
417 {
418 if (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_STOP)
419 {
420 lex_error (lexer, _("expecting end of command"));
421 return CMD_FAILURE;
422 }
423 else
424 return CMD_SUCCESS;
425 }
426
427 /* Token testing functions. */
428
/* Tests whether LEXER's current token is a number; shorthand for
   lex_next_is_number() with a lookahead of 0. */
bool
lex_is_number (const struct lexer *lexer)
{
  const int current = 0;

  return lex_next_is_number (lexer, current);
}
435
/* Tests whether LEXER's current token is a string; shorthand for
   lex_next_is_string() with a lookahead of 0. */
bool
lex_is_string (const struct lexer *lexer)
{
  const int current = 0;

  return lex_next_is_string (lexer, current);
}
442
/* Returns the numeric value of LEXER's current token.  The current token must
   be a number; see lex_next_number(), which this calls with a lookahead of
   0. */
double
lex_number (const struct lexer *lexer)
{
  const int current = 0;

  return lex_next_number (lexer, current);
}
450
/* Tests whether LEXER's current token is an integer; shorthand for
   lex_next_is_integer() with a lookahead of 0. */
bool
lex_is_integer (const struct lexer *lexer)
{
  const int current = 0;

  return lex_next_is_integer (lexer, current);
}
457
/* Returns the integer value of LEXER's current token.  The current token must
   be an integer; see lex_next_integer(), which this calls with a lookahead of
   0. */
long
lex_integer (const struct lexer *lexer)
{
  const int current = 0;

  return lex_next_integer (lexer, current);
}
465
466 /* Token testing functions with lookahead.
467
468 A value of 0 for N as an argument to any of these functions refers to the
469 current token. Lookahead is limited to the current command. Any N greater
470 than the number of tokens remaining in the current command will be treated
471 as referring to a T_ENDCMD token. */
472
473 /* Returns true if the token N ahead of the current token is a number. */
474 bool
lex_next_is_number(const struct lexer * lexer,int n)475 lex_next_is_number (const struct lexer *lexer, int n)
476 {
477 enum token_type next_token = lex_next_token (lexer, n);
478 return next_token == T_POS_NUM || next_token == T_NEG_NUM;
479 }
480
481 /* Returns true if the token N ahead of the current token is a string. */
482 bool
lex_next_is_string(const struct lexer * lexer,int n)483 lex_next_is_string (const struct lexer *lexer, int n)
484 {
485 return lex_next_token (lexer, n) == T_STRING;
486 }
487
/* Returns the numeric value of the token N ahead of the current one.  That
   token must be a number (this is asserted). */
double
lex_next_number (const struct lexer *lexer, int n)
{
  double value;

  assert (lex_next_is_number (lexer, n));

  value = lex_next_tokval (lexer, n);
  return value;
}
496
/* Tests whether the token N ahead of the current one is an integer, that is,
   a number token whose value has no fractional part and can be converted to
   "long" without overflow.  LONG_MIN itself is not accepted. */
bool
lex_next_is_integer (const struct lexer *lexer, int n)
{
  double value;

  if (!lex_next_is_number (lexer, n))
    return false;

  value = lex_next_tokval (lexer, n);

  /* The upper bound is written as "< -(double) LONG_MIN" (that is, less than
     LONG_MAX + 1) instead of "<= LONG_MAX".  Where long is 64 bits wide,
     LONG_MAX is not exactly representable as a double, so "value <= LONG_MAX"
     would compare against 2**63 and accept it, and converting 2**63 to long
     is undefined.  -(double) LONG_MIN is exact on two's complement systems.
     For integral VALUE the two forms agree whenever LONG_MAX is exactly
     representable. */
  return (value > LONG_MIN
          && value < -(double) LONG_MIN
          && floor (value) == value);
}
509
/* Returns the integer value of the token N ahead of the current one.  That
   token must be an integer (this is asserted). */
long
lex_next_integer (const struct lexer *lexer, int n)
{
  long value;

  assert (lex_next_is_integer (lexer, n));

  value = lex_next_tokval (lexer, n);
  return value;
}
518
519 /* Token matching functions. */
520
521 /* If the current token has the specified TYPE, skips it and returns true.
522 Otherwise, returns false. */
523 bool
lex_match(struct lexer * lexer,enum token_type type)524 lex_match (struct lexer *lexer, enum token_type type)
525 {
526 if (lex_token (lexer) == type)
527 {
528 lex_get (lexer);
529 return true;
530 }
531 else
532 return false;
533 }
534
/* Skips the current token and returns true, if it matches IDENTIFIER, which
   may be abbreviated to its first three letters.  Otherwise, returns false.

   IDENTIFIER must be an ASCII string. */
bool
lex_match_id (struct lexer *lexer, const char *identifier)
{
  enum { MIN_ABBREV = 3 };      /* Shortest accepted abbreviation. */

  return lex_match_id_n (lexer, identifier, MIN_ABBREV);
}
545
546 /* If the current token is IDENTIFIER, skips it and returns true. IDENTIFIER
547 may be abbreviated to its first N letters. Otherwise, returns false.
548
549 IDENTIFIER must be an ASCII string. */
550 bool
lex_match_id_n(struct lexer * lexer,const char * identifier,size_t n)551 lex_match_id_n (struct lexer *lexer, const char *identifier, size_t n)
552 {
553 if (lex_token (lexer) == T_ID
554 && lex_id_match_n (ss_cstr (identifier), lex_tokss (lexer), n))
555 {
556 lex_get (lexer);
557 return true;
558 }
559 else
560 return false;
561 }
562
/* Skips the current token and returns true, if it is an integer whose value
   equals X.  Otherwise, returns false. */
bool
lex_match_int (struct lexer *lexer, int x)
{
  if (!lex_is_integer (lexer) || lex_integer (lexer) != x)
    return false;

  lex_get (lexer);
  return true;
}
576
577 /* Forced matches. */
578
/* Skips the current token and returns true, if it matches IDENTIFIER, which
   may be abbreviated to its first 3 letters.  Otherwise, reports an error
   saying that IDENTIFIER was expected and returns false.

   IDENTIFIER must be an ASCII string. */
bool
lex_force_match_id (struct lexer *lexer, const char *identifier)
{
  if (!lex_match_id (lexer, identifier))
    {
      lex_error_expecting (lexer, identifier);
      return false;
    }

  return true;
}
595
596 /* If the current token has the specified TYPE, skips it and returns true.
597 Otherwise, reports an error and returns false. */
598 bool
lex_force_match(struct lexer * lexer,enum token_type type)599 lex_force_match (struct lexer *lexer, enum token_type type)
600 {
601 if (lex_token (lexer) == type)
602 {
603 lex_get (lexer);
604 return true;
605 }
606 else
607 {
608 const char *type_string = token_type_to_string (type);
609 if (type_string)
610 {
611 char *s = xasprintf ("`%s'", type_string);
612 lex_error_expecting (lexer, s);
613 free (s);
614 }
615 else
616 lex_error_expecting (lexer, token_type_to_name (type));
617
618 return false;
619 }
620 }
621
/* Returns true, without consuming anything, if the current token is a string.
   Otherwise, reports an error and returns false. */
bool
lex_force_string (struct lexer *lexer)
{
  if (!lex_is_string (lexer))
    {
      lex_error (lexer, _("expecting string"));
      return false;
    }

  return true;
}
635
636 /* If the current token is a string or an identifier, does nothing and returns
637 true. Otherwise, reports an error and returns false.
638
639 This is meant for use in syntactic situations where we want to encourage the
640 user to supply a quoted string, but for compatibility we also accept
641 identifiers. (One example of such a situation is file names.) Therefore,
642 the error message issued when the current token is wrong only says that a
643 string is expected and doesn't mention that an identifier would also be
644 accepted. */
645 bool
lex_force_string_or_id(struct lexer * lexer)646 lex_force_string_or_id (struct lexer *lexer)
647 {
648 return lex_token (lexer) == T_ID || lex_force_string (lexer);
649 }
650
/* Returns true, without consuming anything, if the current token is an
   integer.  Otherwise, reports an error and returns false. */
bool
lex_force_int (struct lexer *lexer)
{
  if (!lex_is_integer (lexer))
    {
      lex_error (lexer, _("expecting integer"));
      return false;
    }

  return true;
}
664
/* Returns true, without consuming anything, if the current token is a number.
   Otherwise, reports an error and returns false. */
bool
lex_force_num (struct lexer *lexer)
{
  bool ok = lex_is_number (lexer);

  if (!ok)
    lex_error (lexer, _("expecting number"));
  return ok;
}
676
677 /* If the current token is an identifier, does nothing and returns true.
678 Otherwise, reports an error and returns false. */
679 bool
lex_force_id(struct lexer * lexer)680 lex_force_id (struct lexer *lexer)
681 {
682 if (lex_token (lexer) == T_ID)
683 return true;
684
685 lex_error (lexer, _("expecting identifier"));
686 return false;
687 }
688
689 /* Token accessors. */
690
691 /* Returns the type of LEXER's current token. */
692 enum token_type
lex_token(const struct lexer * lexer)693 lex_token (const struct lexer *lexer)
694 {
695 return lex_next_token (lexer, 0);
696 }
697
/* Returns the number held by LEXER's current token.

   The value is meaningful only for T_NEG_NUM and T_POS_NUM tokens; for any
   other token this function always returns zero. */
double
lex_tokval (const struct lexer *lexer)
{
  const int current = 0;

  return lex_next_tokval (lexer, current);
}
707
/* Returns the string held by LEXER's current token as a null-terminated,
   UTF-8 encoded C string.

   The string is meaningful only for T_ID and T_STRING tokens; for any other
   token this function always returns NULL.

   The returned string's UTF-8 encoding is right for variable names and other
   identifiers.  Use filename_to_utf8() to use it as a filename, and data_in()
   to use it in a "union value". */
const char *
lex_tokcstr (const struct lexer *lexer)
{
  const int current = 0;

  return lex_next_tokcstr (lexer, current);
}
721
722 /* Returns the string in LEXER's current token, UTF-8 encoded. The string is
723 null-terminated (but the null terminator is not included in the returned
724 substring's 'length').
725
726 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
727 this functions this function will always return NULL.
728
729 The UTF-8 encoding of the returned string is correct for variable names and
730 other identifiers. Use filename_to_utf8() to use it as a filename. Use
731 data_in() to use it in a "union value". */
732 struct substring
lex_tokss(const struct lexer * lexer)733 lex_tokss (const struct lexer *lexer)
734 {
735 return lex_next_tokss (lexer, 0);
736 }
737
738 /* Looking ahead.
739
740 A value of 0 for N as an argument to any of these functions refers to the
741 current token. Lookahead is limited to the current command. Any N greater
742 than the number of tokens remaining in the current command will be treated
743 as referring to a T_ENDCMD token. */
744
745 static const struct lex_token *
lex_next__(const struct lexer * lexer_,int n)746 lex_next__ (const struct lexer *lexer_, int n)
747 {
748 struct lexer *lexer = CONST_CAST (struct lexer *, lexer_);
749 struct lex_source *src = lex_source__ (lexer);
750
751 if (src != NULL)
752 return lex_source_next__ (src, n);
753 else
754 {
755 static const struct lex_token stop_token =
756 { TOKEN_INITIALIZER (T_STOP, 0.0, ""), 0, 0, 0, 0 };
757
758 return &stop_token;
759 }
760 }
761
762 static const struct lex_token *
lex_source_next__(const struct lex_source * src,int n)763 lex_source_next__ (const struct lex_source *src, int n)
764 {
765 while (deque_count (&src->deque) <= n)
766 {
767 if (!deque_is_empty (&src->deque))
768 {
769 struct lex_token *front;
770
771 front = &src->tokens[deque_front (&src->deque, 0)];
772 if (front->token.type == T_STOP || front->token.type == T_ENDCMD)
773 return front;
774 }
775
776 lex_source_get__ (src);
777 }
778
779 return &src->tokens[deque_back (&src->deque, n)];
780 }
781
782 /* Returns the "struct token" of the token N after the current one in LEXER.
783 The returned pointer can be invalidated by pretty much any succeeding call
784 into the lexer, although the string pointer within the returned token is
785 only invalidated by consuming the token (e.g. with lex_get()). */
786 const struct token *
lex_next(const struct lexer * lexer,int n)787 lex_next (const struct lexer *lexer, int n)
788 {
789 return &lex_next__ (lexer, n)->token;
790 }
791
792 /* Returns the type of the token N after the current one in LEXER. */
793 enum token_type
lex_next_token(const struct lexer * lexer,int n)794 lex_next_token (const struct lexer *lexer, int n)
795 {
796 return lex_next (lexer, n)->type;
797 }
798
799 /* Returns the number in the tokn N after the current one in LEXER.
800
801 Only T_NEG_NUM and T_POS_NUM tokens have meaningful values. For other
802 tokens this function will always return zero. */
803 double
lex_next_tokval(const struct lexer * lexer,int n)804 lex_next_tokval (const struct lexer *lexer, int n)
805 {
806 const struct token *token = lex_next (lexer, n);
807 return token->number;
808 }
809
810 /* Returns the null-terminated string in the token N after the current one, in
811 UTF-8 encoding.
812
813 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
814 this functions this function will always return NULL.
815
816 The UTF-8 encoding of the returned string is correct for variable names and
817 other identifiers. Use filename_to_utf8() to use it as a filename. Use
818 data_in() to use it in a "union value". */
819 const char *
lex_next_tokcstr(const struct lexer * lexer,int n)820 lex_next_tokcstr (const struct lexer *lexer, int n)
821 {
822 return lex_next_tokss (lexer, n).string;
823 }
824
825 /* Returns the string in the token N after the current one, in UTF-8 encoding.
826 The string is null-terminated (but the null terminator is not included in
827 the returned substring's 'length').
828
829 Only T_ID and T_STRING tokens have meaningful strings. For other tokens
830 this functions this function will always return NULL.
831
832 The UTF-8 encoding of the returned string is correct for variable names and
833 other identifiers. Use filename_to_utf8() to use it as a filename. Use
834 data_in() to use it in a "union value". */
835 struct substring
lex_next_tokss(const struct lexer * lexer,int n)836 lex_next_tokss (const struct lexer *lexer, int n)
837 {
838 return lex_next (lexer, n)->string;
839 }
840
841 static bool
lex_tokens_match(const struct token * actual,const struct token * expected)842 lex_tokens_match (const struct token *actual, const struct token *expected)
843 {
844 if (actual->type != expected->type)
845 return false;
846
847 switch (actual->type)
848 {
849 case T_POS_NUM:
850 case T_NEG_NUM:
851 return actual->number == expected->number;
852
853 case T_ID:
854 return lex_id_match (expected->string, actual->string);
855
856 case T_STRING:
857 return (actual->string.length == expected->string.length
858 && !memcmp (actual->string.string, expected->string.string,
859 actual->string.length));
860
861 default:
862 return true;
863 }
864 }
865
866 /* If LEXER is positioned at the sequence of tokens that may be parsed from S,
867 skips it and returns true. Otherwise, returns false.
868
869 S may consist of an arbitrary sequence of tokens, e.g. "KRUSKAL-WALLIS",
870 "2SLS", or "END INPUT PROGRAM". Identifiers may be abbreviated to their
871 first three letters. */
872 bool
lex_match_phrase(struct lexer * lexer,const char * s)873 lex_match_phrase (struct lexer *lexer, const char *s)
874 {
875 struct string_lexer slex;
876 struct token token;
877 int i;
878
879 i = 0;
880 string_lexer_init (&slex, s, strlen (s), SEG_MODE_INTERACTIVE);
881 while (string_lexer_next (&slex, &token))
882 if (token.type != SCAN_SKIP)
883 {
884 bool match = lex_tokens_match (lex_next (lexer, i++), &token);
885 token_destroy (&token);
886 if (!match)
887 return false;
888 }
889
890 while (i-- > 0)
891 lex_get (lexer);
892 return true;
893 }
894
895 static int
lex_source_get_first_line_number(const struct lex_source * src,int n)896 lex_source_get_first_line_number (const struct lex_source *src, int n)
897 {
898 return lex_source_next__ (src, n)->first_line;
899 }
900
/* Returns the number of new-line characters among the LENGTH bytes that start
   at S. */
static int
count_newlines (char *s, size_t length)
{
  size_t left = length;         /* Bytes not yet searched. */
  int count = 0;

  for (;;)
    {
      char *nl = memchr (s, '\n', left);
      if (nl == NULL)
        return count;

      /* Resume the search just past the new-line that was found. */
      count++;
      left -= (size_t) (nl - s) + 1;
      s = nl + 1;
    }
}
916
917 static int
lex_source_get_last_line_number(const struct lex_source * src,int n)918 lex_source_get_last_line_number (const struct lex_source *src, int n)
919 {
920 const struct lex_token *token = lex_source_next__ (src, n);
921
922 if (token->first_line == 0)
923 return 0;
924 else
925 {
926 char *token_str = &src->buffer[token->token_pos - src->tail];
927 return token->first_line + count_newlines (token_str, token->token_len) + 1;
928 }
929 }
930
931 static int
count_columns(const char * s_,size_t length)932 count_columns (const char *s_, size_t length)
933 {
934 const uint8_t *s = CHAR_CAST (const uint8_t *, s_);
935 int columns;
936 size_t ofs;
937 int mblen;
938
939 columns = 0;
940 for (ofs = 0; ofs < length; ofs += mblen)
941 {
942 ucs4_t uc;
943
944 mblen = u8_mbtouc (&uc, s + ofs, length - ofs);
945 if (uc != '\t')
946 {
947 int width = uc_width (uc, "UTF-8");
948 if (width > 0)
949 columns += width;
950 }
951 else
952 columns = ROUND_UP (columns + 1, 8);
953 }
954
955 return columns + 1;
956 }
957
958 static int
lex_source_get_first_column(const struct lex_source * src,int n)959 lex_source_get_first_column (const struct lex_source *src, int n)
960 {
961 const struct lex_token *token = lex_source_next__ (src, n);
962 return count_columns (&src->buffer[token->line_pos - src->tail],
963 token->token_pos - token->line_pos);
964 }
965
966 static int
lex_source_get_last_column(const struct lex_source * src,int n)967 lex_source_get_last_column (const struct lex_source *src, int n)
968 {
969 const struct lex_token *token = lex_source_next__ (src, n);
970 char *start, *end, *newline;
971
972 start = &src->buffer[token->line_pos - src->tail];
973 end = &src->buffer[(token->token_pos + token->token_len) - src->tail];
974 newline = memrchr (start, '\n', end - start);
975 if (newline != NULL)
976 start = newline + 1;
977 return count_columns (start, end - start);
978 }
979
/* Returns the 1-based number of the line on which the syntax for the token N
   ahead of the current one in LEXER begins.  Returns 0 if LEXER has no
   current source; a token drawn from a source that does not have line
   numbers also yields 0. */
int
lex_get_first_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *src = lex_source__ (lexer);

  if (src == NULL)
    return 0;

  return lex_source_get_first_line_number (src, n);
}
989
/* Reports 1 plus the 1-based number of the line on which the syntax for the
   token N after the current one in LEXER ends.  The result is 0 for a T_STOP
   token, for a token drawn from a source that does not have line numbers, or
   when LEXER has no source at all.

   A token usually sits entirely on one line of syntax.  The two exceptions
   are a T_STRING token assembled from segments on adjacent lines joined by
   "+" punctuators, and a T_NEG_NUM token whose "-" is on one line and whose
   number is on the next. */
int
lex_get_last_line_number (const struct lexer *lexer, int n)
{
  const struct lex_source *source = lex_source__ (lexer);
  if (source == NULL)
    return 0;

  return lex_source_get_last_line_number (source, n);
}
1006
/* Reports the 1-based column in which the syntax for the token N after the
   current one in LEXER starts.  The result is 0 for a T_STOP token or when
   LEXER has no source.

   Columns are counted by character width as displayed in a typical
   fixed-width font: CJK characters are 2 columns wide and combining
   characters are 0 columns wide. */
int
lex_get_first_column (const struct lexer *lexer, int n)
{
  const struct lex_source *source = lex_source__ (lexer);
  if (source == NULL)
    return 0;

  return lex_source_get_first_column (source, n);
}
1020
/* Reports 1 plus the 1-based column in which the syntax for the token N after
   the current one in LEXER ends.  The result is 0 for a T_STOP token or when
   LEXER has no source.

   Columns are counted by character width as displayed in a typical
   fixed-width font: CJK characters are 2 columns wide and combining
   characters are 0 columns wide. */
int
lex_get_last_column (const struct lexer *lexer, int n)
{
  const struct lex_source *source = lex_source__ (lexer);
  if (source == NULL)
    return 0;

  return lex_source_get_last_column (source, n);
}
1034
1035 /* Returns the name of the syntax file from which the current command is drawn.
1036 Returns NULL for a T_STOP token or if the command's source does not have
1037 line numbers.
1038
1039 There is no version of this function that takes an N argument because
1040 lookahead only works to the end of a command and any given command is always
1041 within a single syntax file. */
1042 const char *
lex_get_file_name(const struct lexer * lexer)1043 lex_get_file_name (const struct lexer *lexer)
1044 {
1045 struct lex_source *src = lex_source__ (lexer);
1046 return src == NULL ? NULL : src->reader->file_name;
1047 }
1048
1049 const char *
lex_get_encoding(const struct lexer * lexer)1050 lex_get_encoding (const struct lexer *lexer)
1051 {
1052 struct lex_source *src = lex_source__ (lexer);
1053 return src == NULL ? NULL : src->reader->encoding;
1054 }
1055
1056
1057 /* Returns the syntax mode for the syntax file from which the current drawn is
1058 drawn. Returns LEX_SYNTAX_AUTO for a T_STOP token or if the command's
1059 source does not have line numbers.
1060
1061 There is no version of this function that takes an N argument because
1062 lookahead only works to the end of a command and any given command is always
1063 within a single syntax file. */
1064 enum lex_syntax_mode
lex_get_syntax_mode(const struct lexer * lexer)1065 lex_get_syntax_mode (const struct lexer *lexer)
1066 {
1067 struct lex_source *src = lex_source__ (lexer);
1068 return src == NULL ? LEX_SYNTAX_AUTO : src->reader->syntax;
1069 }
1070
1071 /* Returns the error mode for the syntax file from which the current drawn is
1072 drawn. Returns LEX_ERROR_TERMINAL for a T_STOP token or if the command's
1073 source does not have line numbers.
1074
1075 There is no version of this function that takes an N argument because
1076 lookahead only works to the end of a command and any given command is always
1077 within a single syntax file. */
1078 enum lex_error_mode
lex_get_error_mode(const struct lexer * lexer)1079 lex_get_error_mode (const struct lexer *lexer)
1080 {
1081 struct lex_source *src = lex_source__ (lexer);
1082 return src == NULL ? LEX_ERROR_TERMINAL : src->reader->error;
1083 }
1084
1085 /* If the source that LEXER is currently reading has error mode
1086 LEX_ERROR_TERMINAL, discards all buffered input and tokens, so that the next
1087 token to be read comes directly from whatever is next read from the stream.
1088
1089 It makes sense to call this function after encountering an error in a
1090 command entered on the console, because usually the user would prefer not to
1091 have cascading errors. */
1092 void
lex_interactive_reset(struct lexer * lexer)1093 lex_interactive_reset (struct lexer *lexer)
1094 {
1095 struct lex_source *src = lex_source__ (lexer);
1096 if (src != NULL && src->reader->error == LEX_ERROR_TERMINAL)
1097 {
1098 src->head = src->tail = 0;
1099 src->journal_pos = src->seg_pos = src->line_pos = 0;
1100 src->n_newlines = 0;
1101 src->suppress_next_newline = false;
1102 segmenter_init (&src->segmenter, segmenter_get_mode (&src->segmenter));
1103 while (!deque_is_empty (&src->deque))
1104 lex_source_pop__ (src);
1105 lex_source_push_endcmd__ (src);
1106 }
1107 }
1108
1109 /* Advances past any tokens in LEXER up to a T_ENDCMD or T_STOP. */
1110 void
lex_discard_rest_of_command(struct lexer * lexer)1111 lex_discard_rest_of_command (struct lexer *lexer)
1112 {
1113 while (lex_token (lexer) != T_STOP && lex_token (lexer) != T_ENDCMD)
1114 lex_get (lexer);
1115 }
1116
1117 /* Discards all lookahead tokens in LEXER, then discards all input sources
1118 until it encounters one with error mode LEX_ERROR_TERMINAL or until it
1119 runs out of input sources. */
1120 void
lex_discard_noninteractive(struct lexer * lexer)1121 lex_discard_noninteractive (struct lexer *lexer)
1122 {
1123 struct lex_source *src = lex_source__ (lexer);
1124
1125 if (src != NULL)
1126 {
1127 while (!deque_is_empty (&src->deque))
1128 lex_source_pop__ (src);
1129
1130 for (; src != NULL && src->reader->error != LEX_ERROR_TERMINAL;
1131 src = lex_source__ (lexer))
1132 lex_source_destroy (src);
1133 }
1134 }
1135
1136 static size_t
lex_source_max_tail__(const struct lex_source * src)1137 lex_source_max_tail__ (const struct lex_source *src)
1138 {
1139 const struct lex_token *token;
1140 size_t max_tail;
1141
1142 assert (src->seg_pos >= src->line_pos);
1143 max_tail = MIN (src->journal_pos, src->line_pos);
1144
1145 /* Use the oldest token also. (We know that src->deque cannot be empty
1146 because we are in the process of adding a new token, which is already
1147 initialized enough to use here.) */
1148 token = &src->tokens[deque_back (&src->deque, 0)];
1149 assert (token->token_pos >= token->line_pos);
1150 max_tail = MIN (max_tail, token->line_pos);
1151
1152 return max_tail;
1153 }
1154
1155 static void
lex_source_expand__(struct lex_source * src)1156 lex_source_expand__ (struct lex_source *src)
1157 {
1158 if (src->head - src->tail >= src->allocated)
1159 {
1160 size_t max_tail = lex_source_max_tail__ (src);
1161 if (max_tail > src->tail)
1162 {
1163 /* Advance the tail, freeing up room at the head. */
1164 memmove (src->buffer, src->buffer + (max_tail - src->tail),
1165 src->head - max_tail);
1166 src->tail = max_tail;
1167 }
1168 else
1169 {
1170 /* Buffer is completely full. Expand it. */
1171 src->buffer = x2realloc (src->buffer, &src->allocated);
1172 }
1173 }
1174 else
1175 {
1176 /* There's space available at the head of the buffer. Nothing to do. */
1177 }
1178 }
1179
1180 static void
lex_source_read__(struct lex_source * src)1181 lex_source_read__ (struct lex_source *src)
1182 {
1183 do
1184 {
1185 lex_source_expand__ (src);
1186
1187 size_t head_ofs = src->head - src->tail;
1188 size_t space = src->allocated - head_ofs;
1189 enum prompt_style prompt = segmenter_get_prompt (&src->segmenter);
1190 size_t n = src->reader->class->read (src->reader, &src->buffer[head_ofs],
1191 space, prompt);
1192 assert (n <= space);
1193
1194 if (n == 0)
1195 {
1196 /* End of input. */
1197 src->reader->eof = true;
1198 lex_source_expand__ (src);
1199 return;
1200 }
1201
1202 src->head += n;
1203 }
1204 while (!memchr (&src->buffer[src->seg_pos - src->tail], '\n',
1205 src->head - src->seg_pos));
1206 }
1207
1208 static struct lex_source *
lex_source__(const struct lexer * lexer)1209 lex_source__ (const struct lexer *lexer)
1210 {
1211 return (ll_is_empty (&lexer->sources) ? NULL
1212 : ll_data (ll_head (&lexer->sources), struct lex_source, ll));
1213 }
1214
1215 static struct substring
lex_source_get_syntax__(const struct lex_source * src,int n0,int n1)1216 lex_source_get_syntax__ (const struct lex_source *src, int n0, int n1)
1217 {
1218 const struct lex_token *token0 = lex_source_next__ (src, n0);
1219 const struct lex_token *token1 = lex_source_next__ (src, MAX (n0, n1));
1220 size_t start = token0->token_pos;
1221 size_t end = token1->token_pos + token1->token_len;
1222
1223 return ss_buffer (&src->buffer[start - src->tail], end - start);
1224 }
1225
1226 static void
lex_ellipsize__(struct substring in,char * out,size_t out_size)1227 lex_ellipsize__ (struct substring in, char *out, size_t out_size)
1228 {
1229 size_t out_maxlen;
1230 size_t out_len;
1231 int mblen;
1232
1233 assert (out_size >= 16);
1234 out_maxlen = out_size - 1;
1235 if (in.length > out_maxlen - 3)
1236 out_maxlen -= 3;
1237
1238 for (out_len = 0; out_len < in.length; out_len += mblen)
1239 {
1240 if (in.string[out_len] == '\n'
1241 || in.string[out_len] == '\0'
1242 || (in.string[out_len] == '\r'
1243 && out_len + 1 < in.length
1244 && in.string[out_len + 1] == '\n'))
1245 break;
1246
1247 mblen = u8_mblen (CHAR_CAST (const uint8_t *, in.string + out_len),
1248 in.length - out_len);
1249
1250 if (mblen < 0)
1251 break;
1252
1253 if (out_len + mblen > out_maxlen)
1254 break;
1255 }
1256
1257 memcpy (out, in.string, out_len);
1258 strcpy (&out[out_len], out_len < in.length ? "..." : "");
1259 }
1260
1261 static void
lex_source_error_valist(struct lex_source * src,int n0,int n1,const char * format,va_list args)1262 lex_source_error_valist (struct lex_source *src, int n0, int n1,
1263 const char *format, va_list args)
1264 {
1265 const struct lex_token *token;
1266 struct string s;
1267
1268 ds_init_empty (&s);
1269
1270 token = lex_source_next__ (src, n0);
1271 if (token->token.type == T_ENDCMD)
1272 ds_put_cstr (&s, _("Syntax error at end of command"));
1273 else
1274 {
1275 struct substring syntax = lex_source_get_syntax__ (src, n0, n1);
1276 if (!ss_is_empty (syntax))
1277 {
1278 char syntax_cstr[64];
1279
1280 lex_ellipsize__ (syntax, syntax_cstr, sizeof syntax_cstr);
1281 ds_put_format (&s, _("Syntax error at `%s'"), syntax_cstr);
1282 }
1283 else
1284 ds_put_cstr (&s, _("Syntax error"));
1285 }
1286
1287 if (format)
1288 {
1289 ds_put_cstr (&s, ": ");
1290 ds_put_vformat (&s, format, args);
1291 }
1292 ds_put_byte (&s, '.');
1293
1294 struct msg m = {
1295 .category = MSG_C_SYNTAX,
1296 .severity = MSG_S_ERROR,
1297 .file_name = src->reader->file_name,
1298 .first_line = lex_source_get_first_line_number (src, n0),
1299 .last_line = lex_source_get_last_line_number (src, n1),
1300 .first_column = lex_source_get_first_column (src, n0),
1301 .last_column = lex_source_get_last_column (src, n1),
1302 .text = ds_steal_cstr (&s),
1303 };
1304 msg_emit (&m);
1305 }
1306
1307 static void PRINTF_FORMAT (2, 3)
lex_get_error(struct lex_source * src,const char * format,...)1308 lex_get_error (struct lex_source *src, const char *format, ...)
1309 {
1310 va_list args;
1311 int n;
1312
1313 va_start (args, format);
1314
1315 n = deque_count (&src->deque) - 1;
1316 lex_source_error_valist (src, n, n, format, args);
1317 lex_source_pop_front (src);
1318
1319 va_end (args);
1320 }
1321
/* Attempts to append an additional token into SRC's deque, reading more from
   the underlying lex_reader if necessary.  Returns true if successful, false
   if the deque already represents (a suffix of) the whole lex_reader's
   contents.

   A true return does not guarantee that the deque grew: when the scanner
   reports an error token or SCAN_SKIP, the newly appended token is removed
   again before returning (for errors, after a message has been emitted).

   As a side effect, each line of syntax that has been completely consumed is
   submitted to the output engine as a TEXT_ITEM_SYNTAX text item. */
static bool
lex_source_get__ (const struct lex_source *src_)
{
  /* Appending a token modifies the source, so the const qualifier on the
     parameter is cast away. */
  struct lex_source *src = CONST_CAST (struct lex_source *, src_);
  if (src->eof)
    return false;

  /* State maintained while scanning tokens.  Usually we only need a single
     state, but scanner_push() can return SCAN_SAVE to indicate that the state
     needs to be saved and possibly restored later with SCAN_BACK. */
  struct state
    {
      struct segmenter segmenter;
      enum segment_type last_segment;
      int newlines;             /* Number of newlines encountered so far. */
      /* Maintained here so we can update lex_source's similar members when we
         finish. */
      size_t line_pos;
      size_t seg_pos;
    };

  /* Initialize state. */
  struct state state =
    {
      .segmenter = src->segmenter,
      .newlines = 0,
      .seg_pos = src->seg_pos,
      .line_pos = src->line_pos,
    };
  struct state saved = state;

  /* Append a new token to SRC and initialize it. */
  struct lex_token *token = lex_push_token__ (src);
  struct scanner scanner;
  scanner_init (&scanner, &token->token);
  token->line_pos = src->line_pos;
  token->token_pos = src->seg_pos;
  /* A reader whose line_number is not positive yields tokens whose first_line
     is 0. */
  if (src->reader->line_number > 0)
    token->first_line = src->reader->line_number + src->n_newlines;
  else
    token->first_line = 0;

  /* Extract segments and pass them through the scanner until we obtain a
     token. */
  for (;;)
    {
      /* Extract a segment.  SEGMENT is recomputed on every iteration because
         lex_source_read__() may move or reallocate src->buffer. */
      const char *segment = &src->buffer[state.seg_pos - src->tail];
      size_t seg_maxlen = src->head - state.seg_pos;
      enum segment_type type;
      int seg_len = segmenter_push (&state.segmenter, segment, seg_maxlen,
                                    src->reader->eof, &type);
      if (seg_len < 0)
        {
          /* The segmenter needs more input to produce a segment. */
          assert (!src->reader->eof);
          lex_source_read__ (src);
          continue;
        }

      /* Update state based on the segment. */
      state.last_segment = type;
      state.seg_pos += seg_len;
      if (type == SEG_NEWLINE)
        {
          state.newlines++;
          state.line_pos = state.seg_pos;
        }

      /* Pass the segment into the scanner and try to get a token out. */
      enum scan_result result = scanner_push (&scanner, type,
                                              ss_buffer (segment, seg_len),
                                              &token->token);
      if (result == SCAN_SAVE)
        saved = state;
      else if (result == SCAN_BACK)
        {
          /* Rewind to the state recorded at the most recent SCAN_SAVE (or to
             the initial state if there was none). */
          state = saved;
          break;
        }
      else if (result == SCAN_DONE)
        break;
    }

  /* If we've reached the end of a line, or the end of a command, then pass
     the line to the output engine as a syntax text item.

     A line that ends a command is emitted as soon as the end of the command
     is seen; suppress_next_newline then keeps the new-line that follows from
     emitting it a second time. */
  int n_lines = state.newlines;
  if (state.last_segment == SEG_END_COMMAND && !src->suppress_next_newline)
    {
      n_lines++;
      src->suppress_next_newline = true;
    }
  else if (n_lines > 0 && src->suppress_next_newline)
    {
      n_lines--;
      src->suppress_next_newline = false;
    }
  for (int i = 0; i < n_lines; i++)
    {
      /* Beginning of line. */
      const char *line = &src->buffer[src->journal_pos - src->tail];

      /* Calculate line length, including \n or \r\n end-of-line if present.

         We use src->head even though that may be beyond what we've actually
         converted to tokens (which is only through state.line_pos).  That's
         because, if we're emitting the line due to SEG_END_COMMAND, we want to
         take the whole line through the newline, not just through the '.'. */
      size_t max_len = src->head - src->journal_pos;
      const char *newline = memchr (line, '\n', max_len);
      size_t line_len = newline ? newline - line + 1 : max_len;

      /* Calculate line length excluding end-of-line. */
      size_t copy_len = line_len;
      if (copy_len > 0 && line[copy_len - 1] == '\n')
        copy_len--;
      if (copy_len > 0 && line[copy_len - 1] == '\r')
        copy_len--;

      /* Submit the line as syntax. */
      text_item_submit (text_item_create_nocopy (TEXT_ITEM_SYNTAX,
                                                 xmemdup0 (line, copy_len)));

      src->journal_pos += line_len;
    }

  /* The token covers everything consumed since the previous token ended. */
  token->token_len = state.seg_pos - src->seg_pos;

  /* Commit the scanning state back to SRC. */
  src->segmenter = state.segmenter;
  src->seg_pos = state.seg_pos;
  src->line_pos = state.line_pos;
  src->n_newlines += state.newlines;

  /* Post-process the token.  The SCAN_* cases are scanner error or skip
     results rather than real tokens: each error emits a message through
     lex_get_error(), which also removes the token from the deque. */
  switch (token->token.type)
    {
    default:
      break;

    case T_STOP:
      /* End of input is presented as the end of a command, and src->eof makes
         the next call return false. */
      token->token.type = T_ENDCMD;
      src->eof = true;
      break;

    case SCAN_BAD_HEX_LENGTH:
      lex_get_error (src, _("String of hex digits has %d characters, which "
                            "is not a multiple of 2"),
                     (int) token->token.number);
      break;

    case SCAN_BAD_HEX_DIGIT:
    case SCAN_BAD_UNICODE_DIGIT:
      lex_get_error (src, _("`%c' is not a valid hex digit"),
                     (int) token->token.number);
      break;

    case SCAN_BAD_UNICODE_LENGTH:
      lex_get_error (src, _("Unicode string contains %d bytes, which is "
                            "not in the valid range of 1 to 8 bytes"),
                     (int) token->token.number);
      break;

    case SCAN_BAD_UNICODE_CODE_POINT:
      lex_get_error (src, _("U+%04X is not a valid Unicode code point"),
                     (int) token->token.number);
      break;

    case SCAN_EXPECTED_QUOTE:
      lex_get_error (src, _("Unterminated string constant"));
      break;

    case SCAN_EXPECTED_EXPONENT:
      lex_get_error (src, _("Missing exponent following `%s'"),
                     token->token.string.string);
      break;

    case SCAN_UNEXPECTED_DOT:
      lex_get_error (src, _("Unexpected `.' in middle of command"));
      break;

    case SCAN_UNEXPECTED_CHAR:
      {
        char c_name[16];
        lex_get_error (src, _("Bad character %s in input"),
                       uc_name (token->token.number, c_name));
      }
      break;

    case SCAN_SKIP:
      /* Nothing to report: silently drop the token again. */
      lex_source_pop_front (src);
      break;
    }

  return true;
}
1520
1521 static void
lex_source_push_endcmd__(struct lex_source * src)1522 lex_source_push_endcmd__ (struct lex_source *src)
1523 {
1524 struct lex_token *token = lex_push_token__ (src);
1525 token->token.type = T_ENDCMD;
1526 token->token_pos = 0;
1527 token->token_len = 0;
1528 token->line_pos = 0;
1529 token->first_line = 0;
1530 }
1531
1532 static struct lex_source *
lex_source_create(struct lex_reader * reader)1533 lex_source_create (struct lex_reader *reader)
1534 {
1535 struct lex_source *src;
1536 enum segmenter_mode mode;
1537
1538 src = xzalloc (sizeof *src);
1539 src->reader = reader;
1540
1541 if (reader->syntax == LEX_SYNTAX_AUTO)
1542 mode = SEG_MODE_AUTO;
1543 else if (reader->syntax == LEX_SYNTAX_INTERACTIVE)
1544 mode = SEG_MODE_INTERACTIVE;
1545 else if (reader->syntax == LEX_SYNTAX_BATCH)
1546 mode = SEG_MODE_BATCH;
1547 else
1548 NOT_REACHED ();
1549 segmenter_init (&src->segmenter, mode);
1550
1551 src->tokens = deque_init (&src->deque, 4, sizeof *src->tokens);
1552
1553 lex_source_push_endcmd__ (src);
1554
1555 return src;
1556 }
1557
1558 static void
lex_source_destroy(struct lex_source * src)1559 lex_source_destroy (struct lex_source *src)
1560 {
1561 char *file_name = src->reader->file_name;
1562 char *encoding = src->reader->encoding;
1563 if (src->reader->class->destroy != NULL)
1564 src->reader->class->destroy (src->reader);
1565 free (file_name);
1566 free (encoding);
1567 free (src->buffer);
1568 while (!deque_is_empty (&src->deque))
1569 lex_source_pop__ (src);
1570 free (src->tokens);
1571 ll_remove (&src->ll);
1572 free (src);
1573 }
1574
/* A lex_reader that obtains syntax from a u8_istream, which
   lex_reader_for_file() opens on a named file or on stdin. */
struct lex_file_reader
  {
    struct lex_reader reader;     /* Embedded generic reader;
                                     lex_file_reader_cast() converts a pointer
                                     to it back to the containing struct. */
    struct u8_istream *istream;   /* Stream that lex_file_read() reads. */
  };

/* Forward declaration.  The class is defined further down, after the
   callback functions that it points to. */
static struct lex_reader_class lex_file_reader_class;
1582
1583 /* Creates and returns a new lex_reader that will read from file FILE_NAME (or
1584 from stdin if FILE_NAME is "-"). The file is expected to be encoded with
1585 ENCODING, which should take one of the forms accepted by
1586 u8_istream_for_file(). SYNTAX and ERROR become the syntax mode and error
1587 mode of the new reader, respectively.
1588
1589 Returns a null pointer if FILE_NAME cannot be opened. */
1590 struct lex_reader *
lex_reader_for_file(const char * file_name,const char * encoding,enum lex_syntax_mode syntax,enum lex_error_mode error)1591 lex_reader_for_file (const char *file_name, const char *encoding,
1592 enum lex_syntax_mode syntax,
1593 enum lex_error_mode error)
1594 {
1595 struct lex_file_reader *r;
1596 struct u8_istream *istream;
1597
1598 istream = (!strcmp(file_name, "-")
1599 ? u8_istream_for_fd (encoding, STDIN_FILENO)
1600 : u8_istream_for_file (encoding, file_name, O_RDONLY));
1601 if (istream == NULL)
1602 {
1603 msg (ME, _("Opening `%s': %s."), file_name, strerror (errno));
1604 return NULL;
1605 }
1606
1607 r = xmalloc (sizeof *r);
1608 lex_reader_init (&r->reader, &lex_file_reader_class);
1609 r->reader.syntax = syntax;
1610 r->reader.error = error;
1611 r->reader.file_name = xstrdup (file_name);
1612 r->reader.encoding = encoding ? xstrdup (encoding) : NULL;
1613 r->reader.line_number = 1;
1614 r->istream = istream;
1615
1616 return &r->reader;
1617 }
1618
1619 static struct lex_file_reader *
lex_file_reader_cast(struct lex_reader * r)1620 lex_file_reader_cast (struct lex_reader *r)
1621 {
1622 return UP_CAST (r, struct lex_file_reader, reader);
1623 }
1624
1625 static size_t
lex_file_read(struct lex_reader * r_,char * buf,size_t n,enum prompt_style prompt_style UNUSED)1626 lex_file_read (struct lex_reader *r_, char *buf, size_t n,
1627 enum prompt_style prompt_style UNUSED)
1628 {
1629 struct lex_file_reader *r = lex_file_reader_cast (r_);
1630 ssize_t n_read = u8_istream_read (r->istream, buf, n);
1631 if (n_read < 0)
1632 {
1633 msg (ME, _("Error reading `%s': %s."), r_->file_name, strerror (errno));
1634 return 0;
1635 }
1636 return n_read;
1637 }
1638
1639 static void
lex_file_close(struct lex_reader * r_)1640 lex_file_close (struct lex_reader *r_)
1641 {
1642 struct lex_file_reader *r = lex_file_reader_cast (r_);
1643
1644 if (u8_istream_fileno (r->istream) != STDIN_FILENO)
1645 {
1646 if (u8_istream_close (r->istream) != 0)
1647 msg (ME, _("Error closing `%s': %s."), r_->file_name, strerror (errno));
1648 }
1649 else
1650 u8_istream_free (r->istream);
1651
1652 free (r);
1653 }
1654
1655 static struct lex_reader_class lex_file_reader_class =
1656 {
1657 lex_file_read,
1658 lex_file_close
1659 };
1660
/* A lex_reader that supplies syntax from an in-memory string. */
struct lex_string_reader
  {
    struct lex_reader reader;   /* Embedded generic reader;
                                   lex_string_reader_cast() converts a pointer
                                   to it back to the containing struct. */
    struct substring s;         /* The syntax text; lex_string_close() frees
                                   it with ss_dealloc(). */
    size_t offset;              /* Number of bytes of S that
                                   lex_string_read() has handed out so far. */
  };

/* Forward declaration.  The class is defined further down, after the
   callback functions that it points to. */
static struct lex_reader_class lex_string_reader_class;
1669
1670 /* Creates and returns a new lex_reader for the contents of S, which must be
1671 encoded in the given ENCODING. The new reader takes ownership of S and will free it
1672 with ss_dealloc() when it is closed. */
1673 struct lex_reader *
lex_reader_for_substring_nocopy(struct substring s,const char * encoding)1674 lex_reader_for_substring_nocopy (struct substring s, const char *encoding)
1675 {
1676 struct lex_string_reader *r;
1677
1678 r = xmalloc (sizeof *r);
1679 lex_reader_init (&r->reader, &lex_string_reader_class);
1680 r->reader.syntax = LEX_SYNTAX_AUTO;
1681 r->reader.encoding = encoding ? xstrdup (encoding) : NULL;
1682 r->s = s;
1683 r->offset = 0;
1684
1685 return &r->reader;
1686 }
1687
1688 /* Creates and returns a new lex_reader for a copy of null-terminated string S,
1689 which must be encoded in ENCODING. The caller retains ownership of S. */
1690 struct lex_reader *
lex_reader_for_string(const char * s,const char * encoding)1691 lex_reader_for_string (const char *s, const char *encoding)
1692 {
1693 struct substring ss;
1694 ss_alloc_substring (&ss, ss_cstr (s));
1695 return lex_reader_for_substring_nocopy (ss, encoding);
1696 }
1697
/* Formats the printf()-like FORMAT with the arguments that follow ENCODING,
   then creates and returns a new lex_reader that supplies the formatted
   text.  ENCODING is passed along as the reader's encoding. */
struct lex_reader *
lex_reader_for_format (const char *format, const char *encoding, ...)
{
  va_list ap;
  va_start (ap, encoding);
  char *text = xvasprintf (format, ap);
  va_end (ap);

  /* The reader takes ownership of TEXT. */
  return lex_reader_for_substring_nocopy (ss_cstr (text), encoding);
}
1712
1713 static struct lex_string_reader *
lex_string_reader_cast(struct lex_reader * r)1714 lex_string_reader_cast (struct lex_reader *r)
1715 {
1716 return UP_CAST (r, struct lex_string_reader, reader);
1717 }
1718
1719 static size_t
lex_string_read(struct lex_reader * r_,char * buf,size_t n,enum prompt_style prompt_style UNUSED)1720 lex_string_read (struct lex_reader *r_, char *buf, size_t n,
1721 enum prompt_style prompt_style UNUSED)
1722 {
1723 struct lex_string_reader *r = lex_string_reader_cast (r_);
1724 size_t chunk;
1725
1726 chunk = MIN (n, r->s.length - r->offset);
1727 memcpy (buf, r->s.string + r->offset, chunk);
1728 r->offset += chunk;
1729
1730 return chunk;
1731 }
1732
1733 static void
lex_string_close(struct lex_reader * r_)1734 lex_string_close (struct lex_reader *r_)
1735 {
1736 struct lex_string_reader *r = lex_string_reader_cast (r_);
1737
1738 ss_dealloc (&r->s);
1739 free (r);
1740 }
1741
1742 static struct lex_reader_class lex_string_reader_class =
1743 {
1744 lex_string_read,
1745 lex_string_close
1746 };
1747