1 #ifndef LEXER_HPP
2 #define LEXER_HPP
3 
4 #include "wayfire/lexer/symbol.hpp"
5 #include "wayfire/variant.hpp"
6 #include <cstddef>
7 #include <set>
8 #include <string>
9 #include <string_view>
10 #include <utility>
11 #include <vector>
12 
13 namespace wf
14 {
15 
/**
 * @brief Set of delimiters. The lexer looks for these to determine the beginning/end of symbols.
 *
 * Declared `inline` (C++17) so that all translation units including this header share one set
 * instead of each constructing its own internal-linkage copy.
 */
inline const std::set<std::string_view> DELIMITERS = {" ", "'", "\""};
20 
21 /**
22  * @brief The lexer_t class takes the given text and parses this in symbols.
23  *
24  * @note Lexer is short for Lexical Parser.
25  *
26  * The lexer is a smart tokenizer that will present the given text as a set of symbols that can be
27  * consumed by a structural parser to do exciting things with.
28  *
29  * It is reversible to the beginning to allow a parser to peek at values ahead of the parse position.
30  * This is helpful when working with a hierarchy of parsers. The inner parser may recognize a symbol
31  * that is not meant for it, reverse the lexer and hand it back to the outer parser.
32  *
33  * To allow fast parsing when doing multipass parsing, all parsed symbols are added to a std::vector.
34  * Reversing the lexer will just go back one value in the std::vector of symbols. This means that the
35  * lexer will only do the parsing once. Even if it is reversed all the way back to the beginning,
36  * followed by new calls to parse_symbol. This is an optimization, making the parser a bit more memory
37  * heavy, but saving time on the string manipulations on the second pass.
38  *
39  * The lexer supports a set of methods that can be used to give accurate error reporting. It is
40  * possible to obtain the lexer text and the position of the current symbol. In case the parser throws
41  * due to an error in the supplied string, the caller can figure out exactly where in the string the
42  * mistake occurred.
43  */
44 class lexer_t
45 {
46 public:
47     /**
48      * @brief lexer_t Constructor.
49      */
50     lexer_t();
51 
52     /**
53      * @brief lexer_t Constructor.
54      *
55      * @param[in] text The text for the lexer to tokenize.
56      */
57     lexer_t(const std::string &text);
58 
59     /**
60      * @brief reset Resets the lexer to the start position.
61      *
62      * Calling reset will also clear the std::vector. If you want to bring back the lexer to the
63      * beginning and force it to re-parse the string, this is the method you want.
64      */
65     void reset();
66 
67     /**
68      * @brief reset Resets the lexer and introduces a new text to parse.
69      *
70      * @param[in] text The text for the lexer to tokenize.
71      */
72     void reset(const std::string &text);
73 
74     /**
75      * @brief parse_symbol Advances 1 symbol through the text and returns it.
76      *
77      * If the lexer has been reversed, this may return from cache rather then parse in the string.
78      *
79      * This method throws std::runtime_error in case a symbol cannot be parsed due to malformed text.
80      *
81      * @return The parser symbol.
82      */
83     symbol_t parse_symbol();
84 
85     /**
86      * @brief reverse Reverses 1 symbol in the lexer text.
87      *
88      * If the lexer is reversed to the beginning (before first symbol), calling reverse again has
89      * no effect.
90      */
91     void reverse();
92 
93     /**
94      * @brief text Gets a reference to the text the lexer is currently parsing.
95      *
96      * Usefull for finding out what is wrong with the parse string.
97      *
98      * @return The current lexer text.
99      */
100     const std::string &text() const;
101 
102     /**
103      * @brief current_symbol_position Gets the character position of the first character of the
104      *                                current symbol.
105      *
106      * @return Character position in the text of the lexer to the beginning of the current symbol.
107      */
108     std::size_t current_symbol_position() const;
109 
110 private:
111     /**
112      * @brief _parse_literal Parses a literal from the current parse position.
113      *
114      * This method throws std::runtime_error in case a literal cannot be parsed due to malformed text.
115      *
116      * @return The parsed literal.
117      */
118     variant_t _parse_literal();
119 
120     /**
121      * @brief _parse_encapsulated_literal Parses a literal that is encapsulated by the given start
122      *                                    and end boundary character.
123      *
124      * e.g. A string literal would be bounded by " at beginning and " at the end.
125      *
126      * e.g. (bis) an xml tag would be bounded by < at beginning and > at the end. This is the reason
127      * why there is a start and end boundary.
128      *
129      * @param[in] s_bound Start boundary character.
130      * @param[in] e_bound End boundary character.
131      *
132      * @return The parsed literal.
133      */
134     variant_t _parse_encapsulated_literal(const std::string &s_bound, const std::string &e_bound);
135 
136     /**
137      * @brief _parse_comment_literal Parses a comment literal.
138      *
139      * A comment literal starts at the # symbol and runs until the first encountered \n character.
140      * @return
141      */
142     variant_t _parse_comment_literal();
143 
144     /**
145      * @brief _size Length of the current parse text.
146      */
147     std::size_t _size;
148 
149     /**
150      * @brief _text The current parse text.
151      */
152     std::string _text;
153 
154     /**
155      * @brief _parse_position The current parse character position.
156      *
157      * This is the index of the parse position in the _text string.
158      */
159     std::size_t _parse_position;
160 
161     /**
162      * @brief _symbol_position The character position of the first character of the current symbol.
163      *
164      * This is the index of the current symbol position in the _text string.
165      */
166     std::size_t _symbol_position;
167 
168     /**
169      * @brief _reversed Counter to indicate how many times reverse has been called from the current
170      *                  parse position.
171      */
172     std::size_t _reversed;
173 
174     /**
175      * @brief _history The std::vector with the already parsed symbols.
176      *
177      * Each symbol that is encountered while servicing the parse_symbol() method is stored here to
178      * support reverse() calls and multipass/peeking.
179      */
180     std::vector<std::pair<std::size_t, symbol_t>> _history;
181 };
182 
183 } // End namespace wf.
184 
185 #endif // LEXER_HPP
186