1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2010, 2011, 2013, 2016 Free Software Foundation, Inc.
3
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
16
17 #include <config.h>
18
19 #include "language/lexer/segment.h"
20
21 #include <limits.h>
22 #include <unistr.h>
23
24 #include "data/identifier.h"
25 #include "language/lexer/command-name.h"
26 #include "libpspp/assertion.h"
27 #include "libpspp/cast.h"
28
29 #include "gl/c-ctype.h"
30 #include "gl/c-strcase.h"
31
/* States of the segmenter state machine.  A numeric suffix distinguishes
   successive phases of segmenting a single construct.  The order of the
   enumerators fixes their numeric values, so do not rearrange them. */
enum segmenter_state
  {
    S_SHBANG = 0,               /* Checking for a leading "#!" line. */
    S_GENERAL,                  /* Ordinary syntax. */
    S_COMMENT_1,                /* Comment command: text of a line. */
    S_COMMENT_2,                /* Comment command: at a line's newline. */
    S_DOCUMENT_1,               /* DOCUMENT: text of a line. */
    S_DOCUMENT_2,               /* DOCUMENT: at a line's newline. */
    S_DOCUMENT_3,               /* DOCUMENT: emit the end of the command. */
    S_FILE_LABEL,               /* After FILE, when LABEL follows. */
    S_DO_REPEAT_1,              /* DO REPEAT: rest of the command itself. */
    S_DO_REPEAT_2,              /* DO REPEAT: up to the next newline. */
    S_DO_REPEAT_3,              /* DO REPEAT: body, one full line at a time. */
    S_BEGIN_DATA_1,             /* BEGIN DATA: command spans a newline. */
    S_BEGIN_DATA_2,             /* BEGIN DATA: up to the final newline. */
    S_BEGIN_DATA_3,
    S_BEGIN_DATA_4,
    S_TITLE_1,                  /* Unquoted title text follows. */
    S_TITLE_2,
  };
52
/* Flag bits that the general-syntax parsers keep in a segmenter's
   "substate" member. */
#define SS_START_OF_LINE 0x1u       /* Next segment starts a line. */
#define SS_START_OF_COMMAND 0x2u    /* Next segment starts a command. */

/* Forward declaration: used by the comment parser before its definition. */
static int segmenter_detect_command_name__ (const char *input, size_t n,
                                            bool eof, int ofs);
58
59 static int
segmenter_u8_to_uc__(ucs4_t * puc,const char * input_,size_t n,bool eof,size_t ofs)60 segmenter_u8_to_uc__ (ucs4_t *puc, const char *input_, size_t n, bool eof,
61 size_t ofs)
62 {
63 const uint8_t *input = CHAR_CAST (const uint8_t *, input_);
64 int mblen;
65
66 assert (n > ofs);
67
68 input += ofs;
69 n -= ofs;
70
71 mblen = u8_mbtoucr (puc, input, n);
72 if (mblen >= 0)
73 return mblen;
74 else if (mblen != -2)
75 return u8_mbtouc (puc, input, n);
76 else if (eof)
77 {
78 *puc = 0xfffd;
79 return n;
80 }
81 else
82 return -1;
83 }
84
85 static int
segmenter_parse_shbang__(struct segmenter * s,const char * input,size_t n,bool eof,enum segment_type * type)86 segmenter_parse_shbang__ (struct segmenter *s, const char *input, size_t n,
87 bool eof, enum segment_type *type)
88 {
89 if (input[0] == '#')
90 {
91 if (n >= 2)
92 {
93 if (input[1] == '!')
94 {
95 for (int ofs = 2; ; ofs++)
96 {
97 if (ofs >= n)
98 {
99 if (!eof)
100 return -1;
101 }
102 else if (input[ofs] == '\n')
103 {
104 if (input[ofs - 1] == '\r')
105 ofs--;
106 }
107 else
108 continue;
109
110 s->state = S_GENERAL;
111 s->substate = SS_START_OF_COMMAND;
112 *type = SEG_SHBANG;
113 return ofs;
114 }
115 }
116 }
117 else if (!eof)
118 return -1;
119 }
120
121 s->state = S_GENERAL;
122 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
123 return segmenter_push (s, input, n, eof, type);
124 }
125
126 static int
segmenter_parse_digraph__(const char * seconds,struct segmenter * s,const char * input,size_t n,bool eof,enum segment_type * type)127 segmenter_parse_digraph__ (const char *seconds, struct segmenter *s,
128 const char *input, size_t n, bool eof,
129 enum segment_type *type)
130 {
131 assert (s->state == S_GENERAL);
132
133 *type = SEG_PUNCT;
134 s->substate = 0;
135 return (n < 2
136 ? (eof ? 1 : -1)
137 : (strchr (seconds, input[1]) != NULL ? 2 : 1));
138 }
139
/* Scans the body of a comment, starting at offset OFS (just past the
   opening delimiter) within the N bytes at INPUT.

   Returns the offset just past the closing star-slash if one is found, or
   the offset of a new-line if that comes first (the new-line itself is not
   skipped).  If the input runs out first, returns the end offset when EOF
   is true and -1 when it is false. */
static int
skip_comment (const char *input, size_t n, bool eof, size_t ofs)
{
  while (ofs < n)
    {
      char c = input[ofs];

      if (c == '\n')
        return ofs;

      if (c == '*')
        {
          if (ofs + 1 >= n)
            {
              /* Cannot see whether '/' follows. */
              if (!eof)
                return -1;
              return ofs + 1;
            }
          if (input[ofs + 1] == '/')
            return ofs + 2;
        }

      ofs++;
    }

  if (!eof)
    return -1;
  return ofs;
}
157
158 static int
skip_spaces_and_comments(const char * input,size_t n,bool eof,int ofs)159 skip_spaces_and_comments (const char *input, size_t n, bool eof, int ofs)
160 {
161 while (ofs < n)
162 {
163 ucs4_t uc;
164 int mblen;
165
166 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
167 if (mblen < 0)
168 return -1;
169
170 if (uc == '/')
171 {
172 if (ofs + 1 >= n)
173 return eof ? ofs : -1;
174 else if (input[ofs + 1] != '*')
175 return ofs;
176
177 ofs = skip_comment (input, n, eof, ofs + 2);
178 if (ofs < 0)
179 return -1;
180 }
181 else if (lex_uc_is_space (uc) && uc != '\n')
182 ofs += mblen;
183 else
184 return ofs;
185 }
186
187 return eof ? ofs : -1;
188 }
189
/* Checks whether offset OFS within the N bytes at INPUT is at the end of a
   line: at "\n", at "\r\n", or at the end of input when EOF is true.

   Returns 1 if so, 0 if not, and -1 if more input is needed to decide. */
static int
is_end_of_line (const char *input, size_t n, bool eof, int ofs)
{
  if (ofs >= n)
    return eof ? 1 : -1;

  switch (input[ofs])
    {
    case '\n':
      return 1;

    case '\r':
      /* A carriage return counts only as part of "\r\n", or when it is the
         last byte of the whole input. */
      if (ofs + 1 >= n)
        return eof ? 1 : -1;
      return input[ofs + 1] == '\n';

    default:
      return 0;
    }
}
206
/* Like is_end_of_line(), but first skips any spaces and comments that start
   at offset OFS.  Returns 1 if only those lie between OFS and the end of the
   line, 0 if something else does, and -1 if more input is needed. */
static int
at_end_of_line (const char *input, size_t n, bool eof, int ofs)
{
  int next = skip_spaces_and_comments (input, n, eof, ofs);
  if (next < 0)
    return -1;

  return is_end_of_line (input, n, eof, next);
}
216
217 static int
segmenter_parse_newline__(const char * input,size_t n,bool eof,enum segment_type * type)218 segmenter_parse_newline__ (const char *input, size_t n, bool eof,
219 enum segment_type *type)
220 {
221 int ofs;
222
223 if (input[0] == '\n')
224 ofs = 1;
225 else
226 {
227 if (n < 2)
228 {
229 assert (!eof);
230 return -1;
231 }
232
233 assert (input[0] == '\r');
234 assert (input[1] == '\n');
235 ofs = 2;
236 }
237
238 *type = SEG_NEWLINE;
239 return ofs;
240 }
241
242 static int
skip_spaces(const char * input,size_t n,bool eof,size_t ofs)243 skip_spaces (const char *input, size_t n, bool eof, size_t ofs)
244 {
245 while (ofs < n)
246 {
247 ucs4_t uc;
248 int mblen;
249
250 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
251 if (mblen < 0)
252 return -1;
253
254 if (!lex_uc_is_space (uc) || uc == '\n')
255 return ofs;
256
257 ofs += mblen;
258 }
259
260 return eof ? ofs : -1;
261 }
262
/* Starting at offset OFS within the N bytes at INPUT, skips ASCII decimal
   digits.  Returns the offset of the first non-digit.  If the input runs out
   first, returns the end offset when EOF is true and -1 when it is false. */
static int
skip_digits (const char *input, size_t n, bool eof, int ofs)
{
  while (ofs < n)
    {
      if (!c_isdigit (input[ofs]))
        return ofs;
      ofs++;
    }

  return eof ? ofs : -1;
}
271
272 static int
segmenter_parse_number__(struct segmenter * s,const char * input,size_t n,bool eof,enum segment_type * type)273 segmenter_parse_number__ (struct segmenter *s, const char *input, size_t n,
274 bool eof, enum segment_type *type)
275 {
276 int ofs;
277
278 assert (s->state == S_GENERAL);
279
280 ofs = skip_digits (input, n, eof, 0);
281 if (ofs < 0)
282 return -1;
283
284 if (ofs >= n)
285 {
286 if (!eof)
287 return -1;
288 goto number;
289 };
290 if (input[ofs] == '.')
291 {
292 ofs = skip_digits (input, n, eof, ofs + 1);
293 if (ofs < 0)
294 return -1;
295 }
296
297 if (ofs >= n)
298 {
299 if (!eof)
300 return -1;
301 goto number;
302 }
303 if (input[ofs] == 'e' || input[ofs] == 'E')
304 {
305 ofs++;
306 if (ofs >= n)
307 {
308 if (!eof)
309 return -1;
310 goto expected_exponent;
311 }
312
313 if (input[ofs] == '+' || input[ofs] == '-')
314 {
315 ofs++;
316 if (ofs >= n)
317 {
318 if (!eof)
319 return -1;
320 goto expected_exponent;
321 }
322 }
323
324 if (!c_isdigit (input[ofs]))
325 goto expected_exponent;
326
327 ofs = skip_digits (input, n, eof, ofs);
328 if (ofs < 0)
329 return -1;
330 }
331
332 if (input[ofs - 1] == '.')
333 {
334 int eol = at_end_of_line (input, n, eof, ofs);
335 if (eol < 0)
336 return -1;
337 else if (eol)
338 ofs--;
339 }
340
341 number:
342 *type = SEG_NUMBER;
343 s->substate = 0;
344 return ofs;
345
346 expected_exponent:
347 *type = SEG_EXPECTED_EXPONENT;
348 s->substate = 0;
349 return ofs;
350 }
351
/* Returns true if the N bytes at S spell one of the reserved words BY, EQ,
   GE, GT, LE, LT, NE, OR, TO, ALL, AND, NOT, or WITH, ignoring ASCII case;
   returns false otherwise. */
static bool
is_reserved_word (const char *s, int n)
{
  static const struct
    {
      int length;
      const char *word;
    }
  reserved[] =
    {
      { 2, "BY" }, { 2, "EQ" }, { 2, "GE" }, { 2, "GT" }, { 2, "LE" },
      { 2, "LT" }, { 2, "NE" }, { 2, "OR" }, { 2, "TO" },
      { 3, "ALL" }, { 3, "AND" }, { 3, "NOT" },
      { 4, "WITH" },
    };
  const size_t n_reserved = sizeof reserved / sizeof *reserved;

  for (size_t i = 0; i < n_reserved; i++)
    if (reserved[i].length == n
        && !c_strncasecmp (s, reserved[i].word, reserved[i].length))
      return true;

  return false;
}
387
388 static int
segmenter_parse_comment_1__(struct segmenter * s,const char * input,size_t n,bool eof,enum segment_type * type)389 segmenter_parse_comment_1__ (struct segmenter *s,
390 const char *input, size_t n, bool eof,
391 enum segment_type *type)
392 {
393 int endcmd;
394 int ofs;
395
396 endcmd = -2;
397 ofs = 0;
398 while (ofs < n)
399 {
400 ucs4_t uc;
401 int mblen;
402
403 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
404 if (mblen < 0)
405 return -1;
406
407 switch (uc)
408 {
409 case '.':
410 endcmd = ofs;
411 break;
412
413 case '\n':
414 if (ofs > 1 && input[ofs - 1] == '\r')
415 ofs--;
416 if (endcmd == -2)
417 {
418 /* Blank line ends comment command. */
419 s->state = S_GENERAL;
420 s->substate = SS_START_OF_COMMAND;
421 *type = SEG_SEPARATE_COMMANDS;
422 return ofs;
423 }
424 else if (endcmd >= 0)
425 {
426 /* '.' at end of line ends comment command. */
427 s->state = S_GENERAL;
428 s->substate = 0;
429 *type = SEG_COMMENT_COMMAND;
430 return endcmd;
431 }
432 else
433 {
434 /* Comment continues onto next line. */
435 *type = SEG_COMMENT_COMMAND;
436 s->state = S_COMMENT_2;
437 return ofs;
438 }
439 NOT_REACHED ();
440
441 default:
442 if (!lex_uc_is_space (uc))
443 endcmd = -1;
444 break;
445 }
446
447 ofs += mblen;
448 }
449
450 if (eof)
451 {
452 /* End of file. */
453 s->state = S_GENERAL;
454 s->substate = SS_START_OF_COMMAND;
455 *type = SEG_SEPARATE_COMMANDS;
456 return ofs;
457 }
458
459 return -1;
460 }
461
462 static int
segmenter_parse_comment_2__(struct segmenter * s,const char * input,size_t n,bool eof,enum segment_type * type)463 segmenter_parse_comment_2__ (struct segmenter *s, const char *input,
464 size_t n, bool eof, enum segment_type *type)
465 {
466 int ofs = segmenter_parse_newline__ (input, n, eof, type);
467 if (ofs < 0)
468 return -1;
469
470 int new_cmd;
471 if (ofs >= n)
472 {
473 if (!eof)
474 return -1;
475 new_cmd = false;
476 }
477 else
478 {
479 ucs4_t uc;
480 int mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
481 if (mblen < 0)
482 return -1;
483
484 if (uc == '+' || uc == '-' || uc == '.')
485 new_cmd = true;
486 else if (!lex_uc_is_space (uc))
487 switch (s->mode)
488 {
489 case SEG_MODE_INTERACTIVE:
490 new_cmd = false;
491 break;
492
493 case SEG_MODE_BATCH:
494 new_cmd = true;
495 break;
496
497 case SEG_MODE_AUTO:
498 new_cmd = segmenter_detect_command_name__ (input, n, eof, ofs);
499 if (new_cmd < 0)
500 return -1;
501 break;
502
503 default:
504 NOT_REACHED ();
505 }
506 else
507 new_cmd = false;
508 }
509
510 if (new_cmd)
511 {
512 s->state = S_GENERAL;
513 s->substate = SS_START_OF_LINE | SS_START_OF_COMMAND;
514 }
515 else
516 s->state = S_COMMENT_1;
517 return ofs;
518 }
519
520 static int
segmenter_parse_document_1__(struct segmenter * s,const char * input,size_t n,bool eof,enum segment_type * type)521 segmenter_parse_document_1__ (struct segmenter *s, const char *input, size_t n,
522 bool eof, enum segment_type *type)
523 {
524 bool end_cmd;
525 int ofs;
526
527 end_cmd = false;
528 ofs = 0;
529 while (ofs < n)
530 {
531 ucs4_t uc;
532 int mblen;
533
534 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
535 if (mblen < 0)
536 return -1;
537
538 switch (uc)
539 {
540 case '.':
541 end_cmd = true;
542 break;
543
544 case '\n':
545 if (ofs > 1 && input[ofs - 1] == '\r')
546 ofs--;
547
548 *type = SEG_DOCUMENT;
549 s->state = end_cmd ? S_DOCUMENT_3 : S_DOCUMENT_2;
550 return ofs;
551
552 default:
553 if (!lex_uc_is_space (uc))
554 end_cmd = false;
555 break;
556 }
557
558 ofs += mblen;
559 }
560 if (eof)
561 {
562 *type = SEG_DOCUMENT;
563 s->state = S_DOCUMENT_3;
564 return ofs;
565 }
566 return -1;
567 }
568
569 static int
segmenter_parse_document_2__(struct segmenter * s,const char * input,size_t n,bool eof,enum segment_type * type)570 segmenter_parse_document_2__ (struct segmenter *s, const char *input, size_t n,
571 bool eof, enum segment_type *type)
572 {
573 int ofs;
574
575 ofs = segmenter_parse_newline__ (input, n, eof, type);
576 if (ofs < 0)
577 return -1;
578
579 s->state = S_DOCUMENT_1;
580 return ofs;
581 }
582
583 static int
segmenter_parse_document_3__(struct segmenter * s,enum segment_type * type)584 segmenter_parse_document_3__ (struct segmenter *s, enum segment_type *type)
585 {
586 *type = SEG_END_COMMAND;
587 s->state = S_GENERAL;
588 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
589 return 0;
590 }
591
/* Skips spaces and comments starting at offset OFS within the N bytes at
   INPUT, then reports whether unquoted text follows.

   Returns 1 if the next byte is something other than a single quote, a
   double quote, or a new-line; 0 if it is one of those or if the input has
   ended; and -1 if more input is needed. */
static int
segmenter_unquoted (const char *input, size_t n, bool eof, int ofs)
{
  int next = skip_spaces_and_comments (input, n, eof, ofs);
  if (next < 0)
    return -1;

  if (next >= n)
    {
      /* Skipping only reaches the end without failing at end of input. */
      assert (eof);
      return 0;
    }

  switch (input[next])
    {
    case '\'':
    case '"':
    case '\n':
      return 0;

    default:
      return 1;
    }
}
610
611 static int
next_id_in_command(const struct segmenter * s,const char * input,size_t n,bool eof,int ofs,char id[],size_t id_size)612 next_id_in_command (const struct segmenter *s, const char *input, size_t n,
613 bool eof, int ofs, char id[], size_t id_size)
614 {
615 struct segmenter sub;
616
617 assert (id_size > 0);
618
619 sub.mode = s->mode;
620 sub.state = S_GENERAL;
621 sub.substate = 0;
622 for (;;)
623 {
624 enum segment_type type;
625 int retval;
626
627 retval = segmenter_push (&sub, input + ofs, n - ofs, eof, &type);
628 if (retval < 0)
629 {
630 id[0] = '\0';
631 return -1;
632 }
633
634 switch (type)
635 {
636 case SEG_SHBANG:
637 case SEG_SPACES:
638 case SEG_COMMENT:
639 case SEG_NEWLINE:
640 break;
641
642 case SEG_IDENTIFIER:
643 if (retval < id_size)
644 {
645 memcpy (id, input + ofs, retval);
646 id[retval] = '\0';
647 return ofs + retval;
648 }
649 /* fall through */
650
651 case SEG_NUMBER:
652 case SEG_QUOTED_STRING:
653 case SEG_HEX_STRING:
654 case SEG_UNICODE_STRING:
655 case SEG_UNQUOTED_STRING:
656 case SEG_RESERVED_WORD:
657 case SEG_PUNCT:
658 case SEG_COMMENT_COMMAND:
659 case SEG_DO_REPEAT_COMMAND:
660 case SEG_INLINE_DATA:
661 case SEG_START_DOCUMENT:
662 case SEG_DOCUMENT:
663 case SEG_START_COMMAND:
664 case SEG_SEPARATE_COMMANDS:
665 case SEG_END_COMMAND:
666 case SEG_END:
667 case SEG_EXPECTED_QUOTE:
668 case SEG_EXPECTED_EXPONENT:
669 case SEG_UNEXPECTED_DOT:
670 case SEG_UNEXPECTED_CHAR:
671 id[0] = '\0';
672 return ofs + retval;
673 }
674 ofs += retval;
675 }
676 }
677
678 /* Called when INPUT begins with a character that can start off an ID token. */
679 static int
segmenter_parse_id__(struct segmenter * s,const char * input,size_t n,bool eof,enum segment_type * type)680 segmenter_parse_id__ (struct segmenter *s, const char *input, size_t n,
681 bool eof, enum segment_type *type)
682 {
683 ucs4_t uc;
684 int ofs;
685
686 assert (n > 0);
687 assert (s->state == S_GENERAL);
688
689 ofs = u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, input), n);
690 for (;;)
691 {
692 int mblen;
693
694 if (ofs >= n)
695 {
696 if (eof)
697 break;
698 return -1;
699 }
700
701 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
702 if (mblen < 0)
703 return -1;
704 else if (!lex_uc_is_idn (uc))
705 break;
706
707 ofs += mblen;
708 }
709
710 if (input[ofs - 1] == '.')
711 {
712 int eol = at_end_of_line (input, n, eof, ofs);
713 if (eol < 0)
714 return -1;
715 else if (eol)
716 ofs--;
717 }
718
719 if (is_reserved_word (input, ofs))
720 *type = SEG_RESERVED_WORD;
721 else
722 *type = SEG_IDENTIFIER;
723
724 if (s->substate & SS_START_OF_COMMAND)
725 {
726 struct substring word = ss_buffer (input, ofs);
727
728 if (lex_id_match_n (ss_cstr ("COMMENT"), word, 4))
729 {
730 s->state = S_COMMENT_1;
731 return segmenter_parse_comment_1__ (s, input, n, eof, type);
732 }
733 else if (lex_id_match (ss_cstr ("DOCUMENT"), word))
734 {
735 s->state = S_DOCUMENT_1;
736 *type = SEG_START_DOCUMENT;
737 return 0;
738 }
739 else if (lex_id_match (ss_cstr ("TITLE"), word)
740 || lex_id_match (ss_cstr ("SUBTITLE"), word))
741 {
742 int result = segmenter_unquoted (input, n, eof, ofs);
743 if (result < 0)
744 return -1;
745 else if (result)
746 {
747 s->state = S_TITLE_1;
748 return ofs;
749 }
750 }
751 else if (lex_id_match (ss_cstr ("FILE"), word))
752 {
753 char id[16];
754
755 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
756 return -1;
757 else if (lex_id_match (ss_cstr ("LABEL"), ss_cstr (id)))
758 {
759 s->state = S_FILE_LABEL;
760 s->substate = 0;
761 return ofs;
762 }
763 }
764 else if (lex_id_match (ss_cstr ("DO"), word))
765 {
766 char id[16];
767
768 if (next_id_in_command (s, input, n, eof, ofs, id, sizeof id) < 0)
769 return -1;
770 else if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
771 {
772 s->state = S_DO_REPEAT_1;
773 s->substate = 0;
774 return ofs;
775 }
776 }
777 else if (lex_id_match (ss_cstr ("BEGIN"), word))
778 {
779 char id[16];
780 int ofs2;
781
782 ofs2 = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
783 if (ofs2 < 0)
784 return -1;
785 else if (lex_id_match (ss_cstr ("DATA"), ss_cstr (id)))
786 {
787 int eol;
788
789 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2);
790 if (ofs2 < 0)
791 return -1;
792
793 if (ofs2 >= n)
794 assert (eof);
795 else if (input[ofs2] == '.')
796 {
797 ofs2 = skip_spaces_and_comments (input, n, eof, ofs2 + 1);
798 if (ofs2 < 0)
799 return -1;
800 }
801
802 eol = is_end_of_line (input, n, eof, ofs2);
803 if (eol < 0)
804 return -1;
805 else if (eol)
806 {
807 if (memchr (input, '\n', ofs2))
808 s->state = S_BEGIN_DATA_1;
809 else
810 s->state = S_BEGIN_DATA_2;
811 s->substate = 0;
812 return ofs;
813 }
814 }
815 }
816 }
817
818 s->substate = 0;
819 return ofs;
820 }
821
822 static int
segmenter_parse_string__(enum segment_type string_type,int ofs,struct segmenter * s,const char * input,size_t n,bool eof,enum segment_type * type)823 segmenter_parse_string__ (enum segment_type string_type,
824 int ofs, struct segmenter *s,
825 const char *input, size_t n, bool eof,
826 enum segment_type *type)
827 {
828 int quote = input[ofs];
829
830 ofs++;
831 while (ofs < n)
832 if (input[ofs] == quote)
833 {
834 ofs++;
835 if (ofs < n)
836 {
837 if (input[ofs] == quote)
838 {
839 ofs++;
840 continue;
841 }
842 }
843 else if (!eof)
844 return -1;
845
846 *type = string_type;
847 s->substate = 0;
848 return ofs;
849 }
850 else if (input[ofs] == '\n')
851 goto expected_quote;
852 else
853 ofs++;
854
855 if (eof)
856 goto expected_quote;
857
858 return -1;
859
860 expected_quote:
861 *type = SEG_EXPECTED_QUOTE;
862 s->substate = 0;
863 return ofs;
864 }
865
866 static int
segmenter_maybe_parse_string__(enum segment_type string_type,struct segmenter * s,const char * input,size_t n,bool eof,enum segment_type * type)867 segmenter_maybe_parse_string__ (enum segment_type string_type,
868 struct segmenter *s,
869 const char *input, size_t n, bool eof,
870 enum segment_type *type)
871 {
872 if (n < 2)
873 {
874 if (!eof)
875 return -1;
876 }
877 else if (input[1] == '\'' || input[1] == '"')
878 return segmenter_parse_string__ (string_type, 1, s, input, n, eof, type);
879
880 return segmenter_parse_id__ (s, input, n, eof, type);
881 }
882
883 static int
segmenter_parse_mid_command__(struct segmenter * s,const char * input,size_t n,bool eof,enum segment_type * type)884 segmenter_parse_mid_command__ (struct segmenter *s,
885 const char *input, size_t n, bool eof,
886 enum segment_type *type)
887 {
888 ucs4_t uc;
889 int mblen;
890 int ofs;
891
892 assert (s->state == S_GENERAL);
893 assert (!(s->substate & SS_START_OF_LINE));
894
895 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
896 if (mblen < 0)
897 return -1;
898
899 switch (uc)
900 {
901 case '\n':
902 s->substate |= SS_START_OF_LINE;
903 *type = SEG_NEWLINE;
904 return 1;
905
906 case '/':
907 if (n < 2)
908 {
909 if (!eof)
910 return -1;
911 }
912 else if (input[1] == '*')
913 {
914 ofs = skip_comment (input, n, eof, 2);
915 if (ofs < 0)
916 return -1;
917
918 *type = SEG_COMMENT;
919 return ofs;
920 }
921
922 s->substate = 0;
923 *type = SEG_PUNCT;
924 return 1;
925
926 case '(': case ')': case ',': case '=': case '-':
927 case '[': case ']': case '&': case '|': case '+':
928 *type = SEG_PUNCT;
929 s->substate = 0;
930 return 1;
931
932 case '*':
933 if (s->substate & SS_START_OF_COMMAND)
934 {
935 /* '*' at the beginning of a command begins a comment. */
936 s->state = S_COMMENT_1;
937 return segmenter_parse_comment_1__ (s, input, n, eof, type);
938 }
939 else
940 return segmenter_parse_digraph__ ("*", s, input, n, eof, type);
941
942 case '<':
943 return segmenter_parse_digraph__ ("=>", s, input, n, eof, type);
944
945 case '>':
946 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
947
948 case '~':
949 return segmenter_parse_digraph__ ("=", s, input, n, eof, type);
950
951 case '.':
952 if (n < 2)
953 {
954 if (!eof)
955 return -1;
956 }
957 else if (c_isdigit (input[1]))
958 return segmenter_parse_number__ (s, input, n, eof, type);
959
960 int eol = at_end_of_line (input, n, eof, 1);
961 if (eol < 0)
962 return -1;
963
964 if (eol)
965 {
966 *type = SEG_END_COMMAND;
967 s->substate = SS_START_OF_COMMAND;
968 }
969 else
970 *type = SEG_UNEXPECTED_DOT;
971 return 1;
972
973 case '0': case '1': case '2': case '3': case '4':
974 case '5': case '6': case '7': case '8': case '9':
975 return segmenter_parse_number__ (s, input, n, eof, type);
976
977 case 'u': case 'U':
978 return segmenter_maybe_parse_string__ (SEG_UNICODE_STRING,
979 s, input, n, eof, type);
980
981 case 'x': case 'X':
982 return segmenter_maybe_parse_string__ (SEG_HEX_STRING,
983 s, input, n, eof, type);
984
985 case '\'': case '"':
986 return segmenter_parse_string__ (SEG_QUOTED_STRING, 0,
987 s, input, n, eof, type);
988
989 default:
990 if (lex_uc_is_space (uc))
991 {
992 ofs = skip_spaces (input, n, eof, mblen);
993 if (ofs < 0)
994 return -1;
995
996 if (input[ofs - 1] == '\r' && input[ofs] == '\n')
997 {
998 if (ofs == 1)
999 {
1000 s->substate |= SS_START_OF_LINE;
1001 *type = SEG_NEWLINE;
1002 return 2;
1003 }
1004 else
1005 ofs--;
1006 }
1007 *type = SEG_SPACES;
1008 return ofs;
1009 }
1010 else if (lex_uc_is_id1 (uc))
1011 return segmenter_parse_id__ (s, input, n, eof, type);
1012 else
1013 {
1014 *type = SEG_UNEXPECTED_CHAR;
1015 s->substate = 0;
1016 return mblen;
1017 }
1018 }
1019 }
1020
/* qsort() comparison function for an array of strings: A_ and B_ each
   point to a "const char *".  Orders the strings with c_strcasecmp(). */
static int
compare_commands (const void *a_, const void *b_)
{
  const char *a = *(const char *const *) a_;
  const char *b = *(const char *const *) b_;

  return c_strcasecmp (a, b);
}
1031
1032 static const char **
segmenter_get_command_name_candidates(unsigned char first)1033 segmenter_get_command_name_candidates (unsigned char first)
1034 {
1035 #define DEF_CMD(STATES, FLAGS, NAME, FUNCTION) NAME,
1036 #define UNIMPL_CMD(NAME, DESCRIPTION) NAME,
1037 static const char *commands[] =
1038 {
1039 #include "language/command.def"
1040 ""
1041 };
1042 static size_t n_commands = (sizeof commands / sizeof *commands) - 1;
1043 #undef DEF_CMD
1044 #undef UNIMPL_CMD
1045
1046 static bool inited;
1047
1048 static const char **cindex[UCHAR_MAX + 1];
1049
1050 if (!inited)
1051 {
1052 size_t i;
1053
1054 inited = true;
1055
1056 qsort (commands, n_commands, sizeof *commands, compare_commands);
1057 for (i = 0; i < n_commands; i++)
1058 {
1059 unsigned char c = c_toupper (commands[i][0]);
1060 if (cindex[c] == NULL)
1061 cindex[c] = &commands[i];
1062 }
1063 for (i = 0; i <= UCHAR_MAX; i++)
1064 if (cindex[i] == NULL)
1065 cindex[i] = &commands[n_commands];
1066 }
1067
1068 return cindex[c_toupper (first)];
1069 }
1070
1071 static int
segmenter_detect_command_name__(const char * input,size_t n,bool eof,int ofs)1072 segmenter_detect_command_name__ (const char *input, size_t n, bool eof,
1073 int ofs)
1074 {
1075 const char **commands;
1076
1077 input += ofs;
1078 n -= ofs;
1079 ofs = 0;
1080 for (;;)
1081 {
1082 ucs4_t uc;
1083 int mblen;
1084
1085 if (ofs >= n)
1086 {
1087 if (eof)
1088 break;
1089 return -1;
1090 }
1091
1092 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1093 if (mblen < 0)
1094 return -1;
1095
1096 if (uc == '\n'
1097 || !(lex_uc_is_space (uc) || lex_uc_is_idn (uc) || uc == '-'))
1098 break;
1099
1100 ofs += mblen;
1101 }
1102 if (!ofs)
1103 return 0;
1104
1105 if (input[ofs - 1] == '.')
1106 ofs--;
1107
1108 for (commands = segmenter_get_command_name_candidates (input[0]);
1109 c_toupper (input[0]) == c_toupper ((*commands)[0]);
1110 commands++)
1111 {
1112 int missing_words;
1113 bool exact;
1114
1115 if (command_match (ss_cstr (*commands), ss_buffer (input, ofs),
1116 &exact, &missing_words)
1117 && missing_words <= 0)
1118 return 1;
1119 }
1120
1121 return 0;
1122 }
1123
/* Checks what starts at offset OFS within the N bytes at INPUT.

   Returns 1 for a single or double quote, for one of the letters x, X, u,
   or U followed immediately by a quote, and for a new-line.  Returns 0 for
   anything else, and also when the input has ended and EOF is true.
   Returns -1 if more input is needed to decide. */
static int
is_start_of_string__ (const char *input, size_t n, bool eof, int ofs)
{
  if (ofs >= n)
    return eof ? 0 : -1;

  switch (input[ofs])
    {
    case 'x': case 'X':
    case 'u': case 'U':
      /* A string prefix only counts if a quote comes next. */
      if (ofs + 1 >= n)
        return eof ? 0 : -1;
      return input[ofs + 1] == '\'' || input[ofs + 1] == '"';

    case '\'':
    case '"':
    case '\n':
      return 1;

    default:
      return 0;
    }
}
1141
1142 static int
segmenter_parse_start_of_line__(struct segmenter * s,const char * input,size_t n,bool eof,enum segment_type * type)1143 segmenter_parse_start_of_line__ (struct segmenter *s,
1144 const char *input, size_t n, bool eof,
1145 enum segment_type *type)
1146 {
1147 ucs4_t uc;
1148 int mblen;
1149 int ofs;
1150
1151 assert (s->state == S_GENERAL);
1152 assert (s->substate & SS_START_OF_LINE);
1153
1154 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, 0);
1155 if (mblen < 0)
1156 return -1;
1157
1158 switch (uc)
1159 {
1160 case '+':
1161 ofs = skip_spaces_and_comments (input, n, eof, 1);
1162 if (ofs < 0)
1163 return -1;
1164 else
1165 {
1166 int is_string = is_start_of_string__ (input, n, eof, ofs);
1167 if (is_string < 0)
1168 return -1;
1169 else if (is_string)
1170 {
1171 /* This is punctuation that may separate pieces of a string. */
1172 *type = SEG_PUNCT;
1173 s->substate = 0;
1174 return 1;
1175 }
1176 }
1177 /* Fall through. */
1178
1179 case '-':
1180 case '.':
1181 *type = SEG_START_COMMAND;
1182 s->substate = SS_START_OF_COMMAND;
1183 return 1;
1184
1185 default:
1186 if (lex_uc_is_space (uc))
1187 {
1188 int eol = at_end_of_line (input, n, eof, 0);
1189 if (eol < 0)
1190 return -1;
1191 else if (eol)
1192 {
1193 s->substate = SS_START_OF_COMMAND;
1194 *type = SEG_SEPARATE_COMMANDS;
1195 return 0;
1196 }
1197 break;
1198 }
1199
1200 if (s->mode == SEG_MODE_INTERACTIVE || s->substate & SS_START_OF_COMMAND)
1201 break;
1202 else if (s->mode == SEG_MODE_AUTO)
1203 {
1204 int cmd = segmenter_detect_command_name__ (input, n, eof, 0);
1205 if (cmd < 0)
1206 return -1;
1207 else if (cmd == 0)
1208 break;
1209 }
1210 else
1211 assert (s->mode == SEG_MODE_BATCH);
1212
1213 s->substate = SS_START_OF_COMMAND;
1214 *type = SEG_START_COMMAND;
1215 return 0;
1216 }
1217
1218 s->substate = SS_START_OF_COMMAND;
1219 return segmenter_parse_mid_command__ (s, input, n, eof, type);
1220 }
1221
1222 static int
segmenter_parse_file_label__(struct segmenter * s,const char * input,size_t n,bool eof,enum segment_type * type)1223 segmenter_parse_file_label__ (struct segmenter *s,
1224 const char *input, size_t n, bool eof,
1225 enum segment_type *type)
1226 {
1227 struct segmenter sub;
1228 int ofs;
1229
1230 sub = *s;
1231 sub.state = S_GENERAL;
1232 ofs = segmenter_push (&sub, input, n, eof, type);
1233
1234 if (ofs < 0)
1235 return -1;
1236 else if (*type == SEG_IDENTIFIER)
1237 {
1238 int result;
1239
1240 assert (lex_id_match (ss_cstr ("LABEL"),
1241 ss_buffer ((char *) input, ofs)));
1242 result = segmenter_unquoted (input, n, eof, ofs);
1243 if (result < 0)
1244 return -1;
1245 else
1246 {
1247 if (result)
1248 s->state = S_TITLE_1;
1249 else
1250 *s = sub;
1251 return ofs;
1252 }
1253 }
1254 else
1255 {
1256 s->substate = sub.substate;
1257 return ofs;
1258 }
1259 }
1260
1261 static int
segmenter_subparse(struct segmenter * s,const char * input,size_t n,bool eof,enum segment_type * type)1262 segmenter_subparse (struct segmenter *s,
1263 const char *input, size_t n, bool eof,
1264 enum segment_type *type)
1265 {
1266 struct segmenter sub;
1267 int ofs;
1268
1269 sub.mode = s->mode;
1270 sub.state = S_GENERAL;
1271 sub.substate = s->substate;
1272 ofs = segmenter_push (&sub, input, n, eof, type);
1273 s->substate = sub.substate;
1274 return ofs;
1275 }
1276
1277 static int
segmenter_parse_do_repeat_1__(struct segmenter * s,const char * input,size_t n,bool eof,enum segment_type * type)1278 segmenter_parse_do_repeat_1__ (struct segmenter *s,
1279 const char *input, size_t n, bool eof,
1280 enum segment_type *type)
1281 {
1282 int ofs = segmenter_subparse (s, input, n, eof, type);
1283 if (ofs < 0)
1284 return -1;
1285
1286 if (*type == SEG_START_COMMAND || *type == SEG_SEPARATE_COMMANDS)
1287 s->state = S_DO_REPEAT_2;
1288 else if (*type == SEG_END_COMMAND)
1289 {
1290 s->state = S_DO_REPEAT_3;
1291 s->substate = 1;
1292 }
1293
1294 return ofs;
1295 }
1296
1297 static int
segmenter_parse_do_repeat_2__(struct segmenter * s,const char * input,size_t n,bool eof,enum segment_type * type)1298 segmenter_parse_do_repeat_2__ (struct segmenter *s,
1299 const char *input, size_t n, bool eof,
1300 enum segment_type *type)
1301 {
1302 int ofs = segmenter_subparse (s, input, n, eof, type);
1303 if (ofs < 0)
1304 return -1;
1305
1306 if (*type == SEG_NEWLINE)
1307 {
1308 s->state = S_DO_REPEAT_3;
1309 s->substate = 1;
1310 }
1311
1312 return ofs;
1313 }
1314
1315 static bool
check_repeat_command(struct segmenter * s,const char * input,size_t n,bool eof)1316 check_repeat_command (struct segmenter *s,
1317 const char *input, size_t n, bool eof)
1318 {
1319 int direction;
1320 char id[16];
1321 int ofs;
1322
1323 ofs = 0;
1324 if (input[ofs] == '+' || input[ofs] == '-')
1325 ofs++;
1326
1327 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1328 if (ofs < 0)
1329 return false;
1330 else if (lex_id_match (ss_cstr ("DO"), ss_cstr (id)))
1331 direction = 1;
1332 else if (lex_id_match (ss_cstr ("END"), ss_cstr (id)))
1333 direction = -1;
1334 else
1335 return true;
1336
1337 ofs = next_id_in_command (s, input, n, eof, ofs, id, sizeof id);
1338 if (ofs < 0)
1339 return false;
1340
1341 if (lex_id_match (ss_cstr ("REPEAT"), ss_cstr (id)))
1342 s->substate += direction;
1343 return true;
1344 }
1345
1346 static int
segmenter_parse_full_line__(const char * input,size_t n,bool eof,enum segment_type * type)1347 segmenter_parse_full_line__ (const char *input, size_t n, bool eof,
1348 enum segment_type *type)
1349 {
1350 const char *newline = memchr (input, '\n', n);
1351 if (!newline)
1352 return eof ? n : -1;
1353
1354 ptrdiff_t ofs = newline - input;
1355 if (ofs == 0 || (ofs == 1 && input[0] == '\r'))
1356 {
1357 *type = SEG_NEWLINE;
1358 return ofs + 1;
1359 }
1360 else
1361 return ofs - (input[ofs - 1] == '\r');
1362 }
1363
1364 static int
segmenter_parse_do_repeat_3__(struct segmenter * s,const char * input,size_t n,bool eof,enum segment_type * type)1365 segmenter_parse_do_repeat_3__ (struct segmenter *s,
1366 const char *input, size_t n, bool eof,
1367 enum segment_type *type)
1368 {
1369 int ofs;
1370
1371 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1372 if (ofs < 0 || (ofs > 0 && input[ofs - 1] == '\n'))
1373 return ofs;
1374 else if (!check_repeat_command (s, input, n, eof) && !eof)
1375 return -1;
1376 else if (s->substate == 0)
1377 {
1378 s->state = S_GENERAL;
1379 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1380 return segmenter_push (s, input, n, eof, type);
1381 }
1382 else
1383 {
1384 *type = SEG_DO_REPEAT_COMMAND;
1385 return ofs;
1386 }
1387 }
1388
1389 static int
segmenter_parse_begin_data_1__(struct segmenter * s,const char * input,size_t n,bool eof,enum segment_type * type)1390 segmenter_parse_begin_data_1__ (struct segmenter *s,
1391 const char *input, size_t n, bool eof,
1392 enum segment_type *type)
1393 {
1394 int ofs = segmenter_subparse (s, input, n, eof, type);
1395 if (ofs < 0)
1396 return -1;
1397
1398 if (*type == SEG_NEWLINE)
1399 s->state = S_BEGIN_DATA_2;
1400
1401 return ofs;
1402 }
1403
1404 static int
segmenter_parse_begin_data_2__(struct segmenter * s,const char * input,size_t n,bool eof,enum segment_type * type)1405 segmenter_parse_begin_data_2__ (struct segmenter *s,
1406 const char *input, size_t n, bool eof,
1407 enum segment_type *type)
1408 {
1409 int ofs = segmenter_subparse (s, input, n, eof, type);
1410 if (ofs < 0)
1411 return -1;
1412
1413 if (*type == SEG_NEWLINE)
1414 s->state = S_BEGIN_DATA_3;
1415
1416 return ofs;
1417 }
1418
1419 static bool
is_end_data(const char * input,size_t n)1420 is_end_data (const char *input, size_t n)
1421 {
1422 const uint8_t *u_input = CHAR_CAST (const uint8_t *, input);
1423 bool endcmd;
1424 ucs4_t uc;
1425 int mblen;
1426 int ofs;
1427
1428 if (n < 4 || c_strncasecmp (input, "END", 3))
1429 return false;
1430
1431 ofs = 3;
1432 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1433 if (!lex_uc_is_space (uc))
1434 return false;
1435 ofs += mblen;
1436
1437 if (n - ofs < 4 || c_strncasecmp (input + ofs, "DATA", 4))
1438 return false;
1439 ofs += 4;
1440
1441 endcmd = false;
1442 while (ofs < n)
1443 {
1444 mblen = u8_mbtouc (&uc, u_input + ofs, n - ofs);
1445 if (uc == '.')
1446 {
1447 if (endcmd)
1448 return false;
1449 endcmd = true;
1450 }
1451 else if (!lex_uc_is_space (uc))
1452 return false;
1453 ofs += mblen;
1454 }
1455
1456 return true;
1457 }
1458
1459 static int
segmenter_parse_begin_data_3__(struct segmenter * s,const char * input,size_t n,bool eof,enum segment_type * type)1460 segmenter_parse_begin_data_3__ (struct segmenter *s,
1461 const char *input, size_t n, bool eof,
1462 enum segment_type *type)
1463 {
1464 int ofs;
1465
1466 ofs = segmenter_parse_full_line__ (input, n, eof, type);
1467 if (ofs < 0)
1468 return -1;
1469 else if (is_end_data (input, ofs))
1470 {
1471 s->state = S_GENERAL;
1472 s->substate = SS_START_OF_COMMAND | SS_START_OF_LINE;
1473 return segmenter_push (s, input, n, eof, type);
1474 }
1475 else
1476 {
1477 *type = SEG_INLINE_DATA;
1478 s->state = S_BEGIN_DATA_4;
1479 return input[ofs - 1] == '\n' ? 0 : ofs;
1480 }
1481 }
1482
1483 static int
segmenter_parse_begin_data_4__(struct segmenter * s,const char * input,size_t n,bool eof,enum segment_type * type)1484 segmenter_parse_begin_data_4__ (struct segmenter *s,
1485 const char *input, size_t n, bool eof,
1486 enum segment_type *type)
1487 {
1488 int ofs;
1489
1490 ofs = segmenter_parse_newline__ (input, n, eof, type);
1491 if (ofs < 0)
1492 return -1;
1493
1494 s->state = S_BEGIN_DATA_3;
1495 return ofs;
1496 }
1497
1498 static int
segmenter_parse_title_1__(struct segmenter * s,const char * input,size_t n,bool eof,enum segment_type * type)1499 segmenter_parse_title_1__ (struct segmenter *s,
1500 const char *input, size_t n, bool eof,
1501 enum segment_type *type)
1502 {
1503 int ofs;
1504
1505 ofs = skip_spaces (input, n, eof, 0);
1506 if (ofs < 0)
1507 return -1;
1508 s->state = S_TITLE_2;
1509 *type = SEG_SPACES;
1510 return ofs;
1511 }
1512
1513 static int
segmenter_parse_title_2__(struct segmenter * s,const char * input,size_t n,bool eof,enum segment_type * type)1514 segmenter_parse_title_2__ (struct segmenter *s,
1515 const char *input, size_t n, bool eof,
1516 enum segment_type *type)
1517 {
1518 int endcmd;
1519 int ofs;
1520
1521 endcmd = -1;
1522 ofs = 0;
1523 while (ofs < n)
1524 {
1525 ucs4_t uc;
1526 int mblen;
1527
1528 mblen = segmenter_u8_to_uc__ (&uc, input, n, eof, ofs);
1529 if (mblen < 0)
1530 return -1;
1531
1532 switch (uc)
1533 {
1534 case '\n':
1535 goto end_of_line;
1536
1537 case '.':
1538 endcmd = ofs;
1539 break;
1540
1541 default:
1542 if (!lex_uc_is_space (uc))
1543 endcmd = -1;
1544 break;
1545 }
1546
1547 ofs += mblen;
1548 }
1549
1550 if (eof)
1551 {
1552 end_of_line:
1553 s->state = S_GENERAL;
1554 s->substate = 0;
1555 *type = SEG_UNQUOTED_STRING;
1556 return endcmd >= 0 ? endcmd : ofs;
1557 }
1558
1559 return -1;
1560 }
1561
1562 /* Returns the name of segment TYPE as a string. The caller must not modify
1563 or free the returned string.
1564
1565 This is useful only for debugging and testing. */
1566 const char *
segment_type_to_string(enum segment_type type)1567 segment_type_to_string (enum segment_type type)
1568 {
1569 switch (type)
1570 {
1571 #define SEG_TYPE(NAME) case SEG_##NAME: return #NAME;
1572 SEG_TYPES
1573 #undef SEG_TYPE
1574 default:
1575 return "unknown segment type";
1576 }
1577 }
1578
1579 /* Initializes S as a segmenter with the given syntax MODE.
1580
1581 A segmenter does not contain any external references, so nothing needs to be
1582 done to destroy one. For the same reason, segmenters may be copied with
1583 plain struct assignment (or memcpy). */
1584 void
segmenter_init(struct segmenter * s,enum segmenter_mode mode)1585 segmenter_init (struct segmenter *s, enum segmenter_mode mode)
1586 {
1587 s->state = S_SHBANG;
1588 s->substate = 0;
1589 s->mode = mode;
1590 }
1591
1592 /* Returns the mode passed to segmenter_init() for S. */
1593 enum segmenter_mode
segmenter_get_mode(const struct segmenter * s)1594 segmenter_get_mode (const struct segmenter *s)
1595 {
1596 return s->mode;
1597 }
1598
1599 /* Attempts to label a prefix of S's remaining input with a segment type. The
1600 caller supplies the first N bytes of the remaining input as INPUT, which
1601 must be a UTF-8 encoded string. If EOF is true, then the N bytes supplied
1602 are the entire (remainder) of the input; if EOF is false, then further input
1603 is potentially available.
1604
1605 The input may contain '\n' or '\r\n' line ends in any combination.
1606
1607 If successful, returns the number of bytes in the segment at the beginning
1608 of INPUT (between 0 and N, inclusive) and stores the type of that segment
1609 into *TYPE. The next call to segmenter_push() should not include those
1610 bytes as part of INPUT, because they have (figuratively) been consumed by
1611 the segmenter.
1612
1613 Failure occurs only if the segment type of the N bytes in INPUT cannot yet
1614 be determined. In this case segmenter_push() returns -1. If more input is
1615 available, the caller should obtain some more, then call again with a larger
1616 N. If this is not enough, the process might need to repeat again and agin.
1617 If input is exhausted, then the caller may call again setting EOF to true.
1618 segmenter_push() will never return -1 when EOF is true.
1619
1620 The caller must not, in a sequence of calls, supply contradictory input.
1621 That is, bytes provided as part of INPUT in one call, but not consumed, must
1622 not be provided with *different* values on subsequent calls. This is
1623 because segmenter_push() must often make decisions based on looking ahead
1624 beyond the bytes that it consumes. */
1625 int
segmenter_push(struct segmenter * s,const char * input,size_t n,bool eof,enum segment_type * type)1626 segmenter_push (struct segmenter *s, const char *input, size_t n, bool eof,
1627 enum segment_type *type)
1628 {
1629 if (!n)
1630 {
1631 if (eof)
1632 {
1633 *type = SEG_END;
1634 return 0;
1635 }
1636 else
1637 return -1;
1638 }
1639
1640 switch (s->state)
1641 {
1642 case S_SHBANG:
1643 return segmenter_parse_shbang__ (s, input, n, eof, type);
1644
1645 case S_GENERAL:
1646 return (s->substate & SS_START_OF_LINE
1647 ? segmenter_parse_start_of_line__ (s, input, n, eof, type)
1648 : segmenter_parse_mid_command__ (s, input, n, eof, type));
1649
1650 case S_COMMENT_1:
1651 return segmenter_parse_comment_1__ (s, input, n, eof, type);
1652 case S_COMMENT_2:
1653 return segmenter_parse_comment_2__ (s, input, n, eof, type);
1654
1655 case S_DOCUMENT_1:
1656 return segmenter_parse_document_1__ (s, input, n, eof, type);
1657 case S_DOCUMENT_2:
1658 return segmenter_parse_document_2__ (s, input, n, eof, type);
1659 case S_DOCUMENT_3:
1660 return segmenter_parse_document_3__ (s, type);
1661
1662 case S_FILE_LABEL:
1663 return segmenter_parse_file_label__ (s, input, n, eof, type);
1664
1665 case S_DO_REPEAT_1:
1666 return segmenter_parse_do_repeat_1__ (s, input, n, eof, type);
1667 case S_DO_REPEAT_2:
1668 return segmenter_parse_do_repeat_2__ (s, input, n, eof, type);
1669 case S_DO_REPEAT_3:
1670 return segmenter_parse_do_repeat_3__ (s, input, n, eof, type);
1671
1672 case S_BEGIN_DATA_1:
1673 return segmenter_parse_begin_data_1__ (s, input, n, eof, type);
1674 case S_BEGIN_DATA_2:
1675 return segmenter_parse_begin_data_2__ (s, input, n, eof, type);
1676 case S_BEGIN_DATA_3:
1677 return segmenter_parse_begin_data_3__ (s, input, n, eof, type);
1678 case S_BEGIN_DATA_4:
1679 return segmenter_parse_begin_data_4__ (s, input, n, eof, type);
1680
1681 case S_TITLE_1:
1682 return segmenter_parse_title_1__ (s, input, n, eof, type);
1683 case S_TITLE_2:
1684 return segmenter_parse_title_2__ (s, input, n, eof, type);
1685 }
1686
1687 NOT_REACHED ();
1688 }
1689
1690 /* Returns the style of command prompt to display to an interactive user for
1691 input in S. The return value is most accurate in mode SEG_MODE_INTERACTIVE
1692 and at the beginning of a line (that is, if segmenter_push() consumed as
1693 much as possible of the input up to a new-line). */
1694 enum prompt_style
segmenter_get_prompt(const struct segmenter * s)1695 segmenter_get_prompt (const struct segmenter *s)
1696 {
1697 switch (s->state)
1698 {
1699 case S_SHBANG:
1700 return PROMPT_FIRST;
1701
1702 case S_GENERAL:
1703 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1704
1705 case S_COMMENT_1:
1706 case S_COMMENT_2:
1707 return PROMPT_COMMENT;
1708
1709 case S_DOCUMENT_1:
1710 case S_DOCUMENT_2:
1711 return PROMPT_DOCUMENT;
1712 case S_DOCUMENT_3:
1713 return PROMPT_FIRST;
1714
1715 case S_FILE_LABEL:
1716 return PROMPT_LATER;
1717
1718 case S_DO_REPEAT_1:
1719 case S_DO_REPEAT_2:
1720 return s->substate & SS_START_OF_COMMAND ? PROMPT_FIRST : PROMPT_LATER;
1721 case S_DO_REPEAT_3:
1722 return PROMPT_DO_REPEAT;
1723
1724 case S_BEGIN_DATA_1:
1725 return PROMPT_FIRST;
1726 case S_BEGIN_DATA_2:
1727 return PROMPT_LATER;
1728 case S_BEGIN_DATA_3:
1729 case S_BEGIN_DATA_4:
1730 return PROMPT_DATA;
1731
1732 case S_TITLE_1:
1733 case S_TITLE_2:
1734 return PROMPT_FIRST;
1735 }
1736
1737 NOT_REACHED ();
1738 }
1739