1 #ifndef LEXER_HPP 2 #define LEXER_HPP 3 4 #include "wayfire/lexer/symbol.hpp" 5 #include "wayfire/variant.hpp" 6 #include <cstddef> 7 #include <set> 8 #include <string> 9 #include <string_view> 10 #include <utility> 11 #include <vector> 12 13 namespace wf 14 { 15 16 /** 17 * @brief List of delimiters. The lexer will look for these to determine beginning/end of symbols. 18 */ 19 static const std::set<std::string_view> DELIMITERS = {" ", "\'", "\""}; 20 21 /** 22 * @brief The lexer_t class takes the given text and parses this in symbols. 23 * 24 * @note Lexer is short for Lexical Parser. 25 * 26 * The lexer is a smart tokenizer that will present the given text as a set of symbols that can be 27 * consumed by a structural parser to do exciting things with. 28 * 29 * It is reversible to the beginning to allow a parser to peek at values ahead of the parse position. 30 * This is helpful when working with a hierarchy of parsers. The inner parser may recognize a symbol 31 * that is not meant for it, reverse the lexer and hand it back to the outer parser. 32 * 33 * To allow fast parsing when doing multipass parsing, all parsed symbols are added to a std::vector. 34 * Reversing the lexer will just go back one value in the std::vector of symbols. This means that the 35 * lexer will only do the parsing once. Even if it is reversed all the way back to the beginning, 36 * followed by new calls to parse_symbol. This is an optimization, making the parser a bit more memory 37 * heavy, but saving time on the string manipulations on the second pass. 38 * 39 * The lexer supports a set of methods that can be used to give accurate error reporting. It is 40 * possible to obtain the lexer text and the position of the current symbol. In case the parser throws 41 * due to an error in the supplied string, the caller can figure out exactly where in the string the 42 * mistake occurred. 43 */ 44 class lexer_t 45 { 46 public: 47 /** 48 * @brief lexer_t Constructor. 49 */ 50 lexer_t(); 51 52 /** 53 * @brief lexer_t Constructor. 54 * 55 * @param[in] text The text for the lexer to tokenize. 56 */ 57 lexer_t(const std::string &text); 58 59 /** 60 * @brief reset Resets the lexer to the start position. 61 * 62 * Calling reset will also clear the std::vector. If you want to bring back the lexer to the 63 * beginning and force it to re-parse the string, this is the method you want. 64 */ 65 void reset(); 66 67 /** 68 * @brief reset Resets the lexer and introduces a new text to parse. 69 * 70 * @param[in] text The text for the lexer to tokenize. 71 */ 72 void reset(const std::string &text); 73 74 /** 75 * @brief parse_symbol Advances 1 symbol through the text and returns it. 76 * 77 * If the lexer has been reversed, this may return from cache rather then parse in the string. 78 * 79 * This method throws std::runtime_error in case a symbol cannot be parsed due to malformed text. 80 * 81 * @return The parser symbol. 82 */ 83 symbol_t parse_symbol(); 84 85 /** 86 * @brief reverse Reverses 1 symbol in the lexer text. 87 * 88 * If the lexer is reversed to the beginning (before first symbol), calling reverse again has 89 * no effect. 90 */ 91 void reverse(); 92 93 /** 94 * @brief text Gets a reference to the text the lexer is currently parsing. 95 * 96 * Usefull for finding out what is wrong with the parse string. 97 * 98 * @return The current lexer text. 99 */ 100 const std::string &text() const; 101 102 /** 103 * @brief current_symbol_position Gets the character position of the first character of the 104 * current symbol. 105 * 106 * @return Character position in the text of the lexer to the beginning of the current symbol. 107 */ 108 std::size_t current_symbol_position() const; 109 110 private: 111 /** 112 * @brief _parse_literal Parses a literal from the current parse position. 113 * 114 * This method throws std::runtime_error in case a literal cannot be parsed due to malformed text. 115 * 116 * @return The parsed literal. 117 */ 118 variant_t _parse_literal(); 119 120 /** 121 * @brief _parse_encapsulated_literal Parses a literal that is encapsulated by the given start 122 * and end boundary character. 123 * 124 * e.g. A string literal would be bounded by " at beginning and " at the end. 125 * 126 * e.g. (bis) an xml tag would be bounded by < at beginning and > at the end. This is the reason 127 * why there is a start and end boundary. 128 * 129 * @param[in] s_bound Start boundary character. 130 * @param[in] e_bound End boundary character. 131 * 132 * @return The parsed literal. 133 */ 134 variant_t _parse_encapsulated_literal(const std::string &s_bound, const std::string &e_bound); 135 136 /** 137 * @brief _parse_comment_literal Parses a comment literal. 138 * 139 * A comment literal starts at the # symbol and runs until the first encountered \n character. 140 * @return 141 */ 142 variant_t _parse_comment_literal(); 143 144 /** 145 * @brief _size Length of the current parse text. 146 */ 147 std::size_t _size; 148 149 /** 150 * @brief _text The current parse text. 151 */ 152 std::string _text; 153 154 /** 155 * @brief _parse_position The current parse character position. 156 * 157 * This is the index of the parse position in the _text string. 158 */ 159 std::size_t _parse_position; 160 161 /** 162 * @brief _symbol_position The character position of the first character of the current symbol. 163 * 164 * This is the index of the current symbol position in the _text string. 165 */ 166 std::size_t _symbol_position; 167 168 /** 169 * @brief _reversed Counter to indicate how many times reverse has been called from the current 170 * parse position. 171 */ 172 std::size_t _reversed; 173 174 /** 175 * @brief _history The std::vector with the already parsed symbols. 176 * 177 * Each symbol that is encountered while servicing the parse_symbol() method is stored here to 178 * support reverse() calls and multipass/peeking. 179 */ 180 std::vector<std::pair<std::size_t, symbol_t>> _history; 181 }; 182 183 } // End namespace wf. 184 185 #endif // LEXER_HPP 186