/*
 * PROJECT:     ReactOS host tools
 * LICENSE:     MIT (https://spdx.org/licenses/MIT)
 * PURPOSE:     Tokenizer class implementation
 * COPYRIGHT:   Copyright 2021 Timo Kreuzer <timo.kreuzer@reactos.org>
 */

#include <string>
#include <vector>
#include <fstream>
#include <regex>
#include <ctime>

// Uncomment this for easier debugging
#if 0
#define throw __debugbreak(); throw
#endif

// Accumulated wall-clock seconds spent inside std::regex_search.
// Defined in another translation unit.
extern time_t search_time;

// One token definition: a caller-chosen type id plus the regular expression
// that matches it. The expression must contain exactly ONE capture group
// (validated in CompileMultiRegex); the group index is used to map a match
// back to its TOKEN_DEF.
struct TOKEN_DEF
{
    int Type;
    std::string RegExString;
};

// Lightweight view of a single token inside an externally owned string.
// NOTE: holds a reference to the source text — the text must outlive the Token.
class Token
{
    const std::string& m_text;
    unsigned int m_pos;
    unsigned int m_len;
#if _DEBUG
    std::string m_dbgstr; // materialized token text, for debugger inspection only
#endif
    int m_type;

public:
    Token(const std::string& text, size_t pos, size_t len, int type)
        : m_text(text),
          m_pos(static_cast<unsigned int>(pos)),
          m_len(static_cast<unsigned int>(len)),
          m_type(type)
    {
#if _DEBUG
        m_dbgstr = str();
#endif
    }

    // Returns a copy of the token's text.
    std::string str() const
    {
        return m_text.substr(m_pos, m_len);
    }

    // Returns the TOKEN_DEF::Type this token matched (-1 for end of input).
    int type() const
    {
        return m_type;
    }
};

struct Tokenizer
{
    const std::vector<TOKEN_DEF> &m_tokendefs;
    const std::regex m_re;

    typedef int myint; // NOTE(review): unused; kept for source compatibility

    // Counts top-level capture groups in a regex source string: an unescaped
    // '(' that is outside any character class and not followed by '?'
    // (which would make it a non-capturing group or an assertion).
    static
    unsigned int
    count_captures(const std::string& exp)
    {
        bool in_char_group = false;
        unsigned int count = 0;

        for (size_t i = 0; i < exp.size(); i++)
        {
            char c = exp[i];

            // Skip escaped characters
            if (c == '\\')
            {
                i++;
                continue;
            }

            // Inside a character class '(' has no special meaning; only
            // look for the closing ']'.
            if (in_char_group)
            {
                if (c == ']')
                {
                    in_char_group = false;
                }
                continue;
            }

            if (c == '[')
            {
                in_char_group = true;
                continue;
            }

            if (c == '(')
            {
                // "(?" introduces a non-capturing construct — don't count it.
                // The bounds check makes the trailing-'(' case explicit.
                if ((i + 1 >= exp.size()) || (exp[i + 1] != '?'))
                {
                    count++;
                }
            }
        }

        return count;
    }

    // Combines all token expressions into a single alternation, wrapping each
    // in a non-capturing group. Because every definition contributes exactly
    // one capture group, the index of the matched sub-group identifies the
    // TOKEN_DEF (group i+1 <-> tokendefs[i]); see match().
    // Throws a const char* message if any definition breaks that invariant.
    static
    std::regex
    CompileMultiRegex(const std::vector<TOKEN_DEF> &tokendefs)
    {
        std::string combinedString;

        if (tokendefs.size() == 0)
        {
            return std::regex();
        }

        // Validate all token definitions: each must have exactly one capture group
        for (const auto& def : tokendefs)
        {
            unsigned int count = count_captures(def.RegExString);
            if (count != 1)
            {
                throw "invalid count!\n";
            }
        }

        // Combine all expressions into one (one capture group for each)
        combinedString = "(?:" + tokendefs[0].RegExString + ")";
        for (size_t i = 1; i < tokendefs.size(); i++)
        {
            combinedString += "|(?:" + tokendefs[i].RegExString + ")";
        }

        return std::regex(combinedString, std::regex_constants::icase);
    }

public:

    // Position / length / type triple referring into the tokenized string.
    struct TOKEN_REF
    {
        unsigned int pos;
        unsigned int len;
        int type;
    };

    // NOTE: stores a reference to tokendefs — the vector must outlive the
    // Tokenizer, and its order must not change (match() indexes into it).
    Tokenizer(std::vector<TOKEN_DEF> &tokendefs)
        : m_tokendefs(tokendefs),
          m_re(CompileMultiRegex(tokendefs))
    {
    }

    // Convenience overload: match at the start of the string.
    TOKEN_REF match(std::smatch &matches, const std::string& str) const
    {
        return match(matches, str, 0);
    }

    // Matches the next token at exactly `startpos`.
    // Returns {startpos, 0, -1} at end of input; throws a const char*
    // message if no token definition matches at the current position.
    TOKEN_REF match(std::smatch &matches, const std::string &str, size_t startpos) const
    {
        const std::string::const_iterator first = str.cbegin() + startpos;
        const std::string::const_iterator last = str.cend();

        // If we reached the end, there is nothing more to do
        if (first == last)
        {
            return TOKEN_REF{ static_cast<unsigned int>(startpos), 0, -1 };
        }

        time_t start_time = time(NULL);

        // Try to find a match
        if (!std::regex_search(first, last, matches, m_re))
        {
            throw "Failed to match\n";
        }

        search_time += time(NULL) - start_time;

        // Validate that it's at the start of the string
        if (matches.prefix().matched)
        {
            throw "Failed to match at current position!\n";
        }

        // We have a match, check which one it is. Sub-match i corresponds
        // to tokendefs[i - 1] (one capture group per definition).
        for (size_t i = 1; i < matches.size(); i++)
        {
            if (matches[i].matched)
            {
                unsigned int len = static_cast<unsigned int>(matches.length(i));
                int type = m_tokendefs[i - 1].Type;
                return TOKEN_REF{ static_cast<unsigned int>(startpos), len, type};
            }
        }

        // We should never get here
        throw "Something went wrong!\n";
    }
};


// Eagerly tokenizes a whole string into a vector of TOKEN_REFs.
// NOTE: keeps references to both the tokenizer and the text — both must
// outlive the TokenList.
class TokenList
{
    using TOKEN_REF = typename Tokenizer::TOKEN_REF;

    const Tokenizer& m_tokenizer;
    const std::string& m_text;
    std::vector<TOKEN_REF> m_tokens;

public:

    TokenList(const Tokenizer& tokenizer, const std::string& text)
        : m_tokenizer(tokenizer),
          m_text(text)
    {
        size_t startpos = 0;
        size_t len = m_text.size();
        std::smatch matches;

        // Heuristic pre-allocation: assume an average token length of 5.
        m_tokens.reserve(len / 5);

        while (startpos < len)
        {
            TOKEN_REF tref = m_tokenizer.match(matches, m_text, startpos);

            // A zero-length match would never advance startpos; fail loudly
            // instead of looping forever (happens when a token regex can
            // match the empty string).
            if (tref.len == 0)
            {
                throw "Zero-length match!\n";
            }

            m_tokens.push_back(tref);
            startpos += tref.len;
        }
    }

    size_t size() const
    {
        return m_tokens.size();
    }

    // Materializes token n as a Token view into the original text.
    Token operator[](size_t n) const
    {
        return Token(m_text, m_tokens[n].pos, m_tokens[n].len, m_tokens[n].type);
    }

};