1 /* PSPP - a program for statistical analysis. 2 Copyright (C) 2010, 2011, 2013 Free Software Foundation, Inc. 3 4 This program is free software: you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation, either version 3 of the License, or 7 (at your option) any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 16 17 #ifndef SEGMENT_H 18 #define SEGMENT_H 1 19 20 #include <stdbool.h> 21 #include <stddef.h> 22 #include "libpspp/prompt.h" 23 24 /* PSPP syntax segmentation. 25 26 PSPP divides traditional "lexical analysis" or "tokenization" into two 27 phases: a lower-level phase called "segmentation" and a higher-level phase 28 called "scanning". This header file provides declarations for the 29 segmentation phase. scan.h contains declarations for the scanning phase. 30 31 Segmentation accepts a stream of UTF-8 bytes as input. It outputs a label 32 (a segment type) for each byte or contiguous sequence of bytes in the input. 33 It also, in a few corner cases, outputs zero-width segments that label the 34 boundary between a pair of bytes in the input. 35 36 Some segment types correspond directly to tokens; for example, an 37 "identifier" segment (SEG_IDENTIFIER) becomes an identifier token (T_ID) 38 later in lexical analysis. Other segments contribute to tokens but do not 39 correspond diectly; for example, multiple quoted string segments 40 (SEG_QUOTED_STRING) separated by spaces (SEG_SPACES) and "+" punctuators 41 (SEG_PUNCT) may be combined to form a single string token (T_STRING). 42 Still other segments are ignored (e.g. SEG_SPACES) or trigger special 43 behavior such as error messages later in tokenization 44 (e.g. SEG_EXPECTED_QUOTE). 45 */ 46 47 /* Segmentation mode. 48 49 This corresponds to the syntax mode for which a syntax file is intended. 50 This is the only configuration setting for a segmenter. */ 51 enum segmenter_mode 52 { 53 /* Try to interpret input correctly regardless of whether it is written 54 for interactive or batch mode. */ 55 SEG_MODE_AUTO, 56 57 /* Interactive or batch syntax mode. */ 58 SEG_MODE_INTERACTIVE, 59 SEG_MODE_BATCH 60 }; 61 62 #define SEG_TYPES \ 63 SEG_TYPE(NUMBER) \ 64 SEG_TYPE(QUOTED_STRING) \ 65 SEG_TYPE(HEX_STRING) \ 66 SEG_TYPE(UNICODE_STRING) \ 67 SEG_TYPE(UNQUOTED_STRING) \ 68 SEG_TYPE(RESERVED_WORD) \ 69 SEG_TYPE(IDENTIFIER) \ 70 SEG_TYPE(PUNCT) \ 71 \ 72 SEG_TYPE(SHBANG) \ 73 SEG_TYPE(SPACES) \ 74 SEG_TYPE(COMMENT) \ 75 SEG_TYPE(NEWLINE) \ 76 \ 77 SEG_TYPE(COMMENT_COMMAND) \ 78 SEG_TYPE(DO_REPEAT_COMMAND) \ 79 SEG_TYPE(INLINE_DATA) \ 80 \ 81 SEG_TYPE(START_DOCUMENT) \ 82 SEG_TYPE(DOCUMENT) \ 83 \ 84 SEG_TYPE(START_COMMAND) \ 85 SEG_TYPE(SEPARATE_COMMANDS) \ 86 SEG_TYPE(END_COMMAND) \ 87 SEG_TYPE(END) \ 88 \ 89 SEG_TYPE(EXPECTED_QUOTE) \ 90 SEG_TYPE(EXPECTED_EXPONENT) \ 91 SEG_TYPE(UNEXPECTED_DOT) \ 92 SEG_TYPE(UNEXPECTED_CHAR) 93 94 /* Types of segments. */ 95 enum segment_type 96 { 97 #define SEG_TYPE(NAME) SEG_##NAME, 98 SEG_TYPES 99 #undef SEG_TYPE 100 }; 101 102 /* Number of segment types. */ 103 #define SEG_TYPE(NAME) + 1 104 enum { SEG_N_TYPES = SEG_TYPES }; 105 #undef SEG_TYPE 106 107 const char *segment_type_to_string (enum segment_type); 108 109 /* A segmenter. Opaque. */ 110 struct segmenter 111 { 112 unsigned char state; 113 unsigned char substate; 114 unsigned char mode; 115 }; 116 117 void segmenter_init (struct segmenter *, enum segmenter_mode); 118 119 enum segmenter_mode segmenter_get_mode (const struct segmenter *); 120 121 int segmenter_push (struct segmenter *, const char *input, size_t n, bool eof, 122 enum segment_type *); 123 124 enum prompt_style segmenter_get_prompt (const struct segmenter *); 125 126 #endif /* segment.h */ 127