1 // license:GPL-2.0+ 2 // copyright-holders:Couriersud 3 4 #ifndef PTOKENIZER_H_ 5 #define PTOKENIZER_H_ 6 7 /// 8 /// \file ptokenizer.h 9 /// 10 11 #include "pstream.h" 12 #include "pstring.h" 13 14 #include "penum.h" 15 #include "psource.h" 16 17 #include <unordered_map> 18 #include <vector> 19 20 namespace plib { 21 22 namespace detail { 23 24 PENUM(token_type, 25 IDENTIFIER, 26 NUMBER, 27 TOKEN, 28 STRING, 29 COMMENT, 30 LINEMARKER, 31 SOURCELINE, 32 UNKNOWN, 33 ENDOFFILE 34 ) 35 36 struct token_id_t 37 { 38 public: 39 40 static constexpr std::size_t npos = static_cast<std::size_t>(-1); 41 token_id_ttoken_id_t42 token_id_t() : m_id(npos) {} token_id_ttoken_id_t43 explicit token_id_t(std::size_t id, const pstring &name) 44 : m_id(id) 45 , m_name(name) 46 {} 47 48 PCOPYASSIGNMOVE(token_id_t, default) 49 50 ~token_id_t() = default; 51 idtoken_id_t52 std::size_t id() const { return m_id; } nametoken_id_t53 const pstring & name() const { return m_name; } 54 private: 55 std::size_t m_id; 56 pstring m_name; 57 }; 58 59 struct token_t 60 { token_ttoken_t61 explicit token_t(token_type type) 62 : m_type(type), m_id(token_id_t::npos), m_token("") 63 { 64 } token_ttoken_t65 token_t(token_type type, const pstring &str) 66 : m_type(type), m_id(token_id_t::npos), m_token(str) 67 { 68 } token_ttoken_t69 token_t(const token_id_t &id) 70 : m_type(token_type::TOKEN), m_id(id.id()), m_token(id.name()) 71 { 72 } token_ttoken_t73 token_t(const token_id_t &id, const pstring &str) 74 : m_type(token_type::TOKEN), m_id(id.id()), m_token(str) 75 { 76 } 77 78 PCOPYASSIGNMOVE(token_t, default) 79 80 ~token_t() = default; 81 istoken_t82 bool is(const token_id_t &tok_id) const noexcept { return m_id == tok_id.id(); } is_nottoken_t83 bool is_not(const token_id_t &tok_id) const noexcept { return !is(tok_id); } 84 is_typetoken_t85 bool is_type(const token_type type) const noexcept { return m_type == type; } 86 typetoken_t87 token_type type() const noexcept { return m_type; } 88 strtoken_t89 const pstring &str() const noexcept { return m_token; } 90 91 private: 92 token_type m_type; 93 std::size_t m_id; 94 pstring m_token; 95 }; 96 97 class token_store : public std::vector<token_t> 98 { 99 using std::vector<token_t>::vector; 100 }; 101 102 } // namespace detail 103 104 class ptokenizer 105 { 106 public: ptokenizer()107 explicit ptokenizer() // NOLINT(misc-forwarding-reference-overload, bugprone-forwarding-reference-overload) 108 : m_strm(nullptr) 109 , m_unget(0) 110 , m_string('"') 111 , m_support_line_markers(true) // FIXME 112 , m_token_queue(nullptr) 113 { 114 clear(); 115 } 116 117 PCOPYASSIGNMOVE(ptokenizer, delete) 118 119 virtual ~ptokenizer() = default; 120 121 using token_type = detail::token_type; 122 using token_id_t = detail::token_id_t; 123 using token_t = detail::token_t; 124 using token_store = detail::token_store; 125 126 // tokenizer stuff follows ... 127 register_token(const pstring & token)128 token_id_t register_token(const pstring &token) 129 { 130 token_id_t ret(m_tokens.size(), token); 131 m_tokens.emplace(token, ret); 132 return ret; 133 } 134 identifier_chars(const pstring & s)135 ptokenizer & identifier_chars(const pstring &s) { m_identifier_chars = s; return *this; } number_chars(const pstring & st,const pstring & rem)136 ptokenizer & number_chars(const pstring &st, const pstring & rem) { m_number_chars_start = st; m_number_chars = rem; return *this; } string_char(pstring::value_type c)137 ptokenizer & string_char(pstring::value_type c) { m_string = c; return *this; } whitespace(const pstring & s)138 ptokenizer & whitespace(const pstring & s) { m_whitespace = s; return *this; } comment(const pstring & start,const pstring & end,const pstring & line)139 ptokenizer & comment(const pstring &start, const pstring &end, const pstring &line) 140 { 141 m_tok_comment_start = register_token(start); 142 m_tok_comment_end = register_token(end); 143 m_tok_line_comment = register_token(line); 144 return *this; 145 } 146 append_to_store(putf8_reader * reader,token_store & tokstor)147 void append_to_store(putf8_reader *reader, token_store &tokstor) 148 { 149 clear(); 150 m_strm = reader; 151 // Process tokens into queue 152 token_t ret(token_type::UNKNOWN); 153 m_token_queue = &tokstor; 154 do { 155 ret = get_token_comment(); 156 tokstor.push_back(ret); 157 } while (!ret.is_type(token_type::token_type::ENDOFFILE)); 158 m_token_queue = nullptr; 159 } 160 161 private: 162 clear()163 void clear() 164 { 165 m_cur_line = ""; 166 m_px = m_cur_line.begin(); 167 m_unget = 0; 168 } 169 170 token_t get_token_internal(); 171 172 // get internal token with comment processing 173 token_t get_token_comment(); 174 175 void skipeol(); 176 177 pstring::value_type getc(); 178 void ungetc(pstring::value_type c); 179 eof()180 bool eof() const { return m_strm->eof(); } 181 182 putf8_reader *m_strm; 183 184 pstring m_cur_line; 185 pstring::const_iterator m_px; 186 pstring::value_type m_unget; 187 188 // tokenizer stuff follows ... 189 190 pstring m_identifier_chars; 191 pstring m_number_chars; 192 pstring m_number_chars_start; 193 std::unordered_map<pstring, token_id_t> m_tokens; 194 pstring m_whitespace; 195 pstring::value_type m_string; 196 197 token_id_t m_tok_comment_start; 198 token_id_t m_tok_comment_end; 199 token_id_t m_tok_line_comment; 200 201 protected: 202 bool m_support_line_markers; 203 token_store *m_token_queue; 204 }; 205 206 class ptoken_reader 207 { 208 public: 209 210 using token_t = ptokenizer::token_t; 211 using token_type = ptokenizer::token_type; 212 using token_id_t = ptokenizer::token_id_t; 213 using token_store = ptokenizer::token_store; 214 ptoken_reader()215 explicit ptoken_reader() 216 : m_idx(0) 217 , m_token_store(nullptr) 218 { 219 // add a first entry to the stack 220 m_source_location.emplace_back(plib::source_location("Unknown", 0)); 221 } 222 223 PCOPYASSIGNMOVE(ptoken_reader, delete) 224 225 virtual ~ptoken_reader() = default; 226 set_token_source(const token_store * tokstor)227 void set_token_source(const token_store *tokstor) 228 { 229 m_token_store = tokstor; 230 } 231 232 pstring currentline_str() const; 233 234 // tokenizer stuff follows ... 235 236 token_t get_token(); 237 token_t get_token_raw(); // includes line information 238 pstring get_string(); 239 pstring get_identifier(); 240 pstring get_identifier_or_number(); 241 242 double get_number_double(); 243 long get_number_long(); 244 245 void require_token(const token_id_t &token_num); 246 void require_token(const token_t &tok, const token_id_t &token_num); 247 248 void error(const perrmsg &errs); 249 sourceloc()250 plib::source_location sourceloc() { return m_source_location.back(); } 251 current_line()252 pstring current_line() const { return m_line; } 253 protected: 254 virtual void verror(const pstring &msg) = 0; 255 256 private: 257 bool process_line_token(const token_t &tok); 258 get_token_queue()259 token_t get_token_queue() 260 { 261 if (m_idx < m_token_store->size()) 262 return (*m_token_store)[m_idx++]; 263 return token_t(token_type::ENDOFFILE); 264 } 265 266 // source locations, vector used as stack because we need to loop through stack 267 268 std::vector<plib::source_location> m_source_location; 269 pstring m_line; 270 std::size_t m_idx; 271 const token_store * m_token_store; 272 }; 273 274 } // namespace plib 275 276 #endif // PTOKENIZER_H_ 277