xref: /reactos/sdk/tools/asmpp/tokenizer.hpp (revision 61cc62d1)
/*
 * PROJECT:     ReactOS host tools
 * LICENSE:     MIT (https://spdx.org/licenses/MIT)
 * PURPOSE:     Tokenizer class implementation
 * COPYRIGHT:   Copyright 2021 Timo Kreuzer <timo.kreuzer@reactos.org>
 */
7 
8 #include <string>
9 #include <vector>
10 #include <fstream>
11 #include <regex>
12 #include <ctime>
13 
14 // Uncomment this for easier debugging
15 #if 0
16 #define throw __debugbreak(); throw
17 #endif
18 
19 extern time_t search_time;
20 
// A single token definition: a user-chosen token type paired with the
// regular expression that recognizes it. The expression must contain
// exactly one capture group (enforced by Tokenizer::CompileMultiRegex),
// so the matching sub-group index identifies the token type.
struct TOKEN_DEF
{
    int Type;                // user-defined token type identifier
    std::string RegExString; // regex with exactly one capture group
};
26 
// A lightweight view of one token inside a tokenized source string.
// Holds only an offset/length pair into the original text; the referenced
// string must outlive the Token.
class Token
{
    const std::string& m_source; // the tokenized text (not owned)
    unsigned int m_offset;       // start offset of the token within m_source
    unsigned int m_length;       // token length in characters
#if _DEBUG
    std::string m_dbgstr;        // plain copy of the token text, kept for debugger inspection
#endif
    int m_type;                  // token type (TOKEN_DEF::Type)

public:

    // Creates a token referencing the range [pos, pos + len) within text.
    Token(const std::string& text, size_t pos, size_t len, int type)
        : m_source(text),
          m_offset(static_cast<unsigned int>(pos)),
          m_length(static_cast<unsigned int>(len)),
          m_type(type)
    {
#if _DEBUG
        m_dbgstr = str();
#endif
    }

    // Returns a copy of the token's text.
    std::string str() const
    {
        return m_source.substr(m_offset, m_length);
    }

    // Returns the token type.
    int type() const
    {
        return m_type;
    }
};
60 
61 struct Tokenizer
62 {
63     const std::vector<TOKEN_DEF> &m_tokendefs;
64     const std::regex m_re;
65 
66     typedef int myint;
67 
68     static
69     unsigned int
count_capturesTokenizer70     count_captures(const std::string& exp)
71     {
72         bool in_char_group = false;
73         unsigned int count = 0;
74 
75         for (size_t i = 0; i < exp.size(); i++)
76         {
77             char c = exp[i];
78 
79             // Skip escaped characters
80             if (c == '\\')
81             {
82                 i++;
83                 continue;
84             }
85 
86             if (in_char_group)
87             {
88                 if (c == ']')
89                 {
90                     in_char_group = false;
91                 }
92                 continue;
93             }
94 
95             if (c == '[')
96             {
97                 in_char_group = true;
98                 continue;
99             }
100 
101             if (c == '(')
102             {
103                 if (exp[i + 1] != '?')
104                 {
105                     count++;
106                 }
107             }
108         }
109 
110         return count;
111     }
112 
113     static
114     std::regex
CompileMultiRegexTokenizer115     CompileMultiRegex(const std::vector<TOKEN_DEF> &tokendefs)
116     {
117         std::string combinedString;
118 
119         if (tokendefs.size() == 0)
120         {
121             return std::regex();
122         }
123 
124         // Validate all token definitions
125         for (auto def : tokendefs)
126         {
127             size_t found = -1;
128 
129             // Count capture groups
130             unsigned int count = count_captures(def.RegExString);
131             if (count != 1)
132             {
133                 throw "invalid count!\n";
134             }
135         }
136 
137         // Combine all expressions into one (one capture group for each)
138         combinedString = "(?:" + tokendefs[0].RegExString + ")";
139         for (size_t i = 1; i < tokendefs.size(); i++)
140         {
141             combinedString += "|(?:" + tokendefs[i].RegExString + ")";
142         }
143 
144         return std::regex(combinedString, std::regex_constants::icase);
145     }
146 
147 public:
148 
149     struct TOKEN_REF
150     {
151         unsigned int pos;
152         unsigned int len;
153         int type;
154     };
155 
TokenizerTokenizer156     Tokenizer(std::vector<TOKEN_DEF> &tokendefs)
157         : m_tokendefs(tokendefs),
158           m_re(CompileMultiRegex(tokendefs))
159     {
160     }
161 
matchTokenizer162     TOKEN_REF match(std::smatch &matches, const std::string& str) const
163     {
164         return match(matches, str, 0);
165     }
166 
matchTokenizer167     TOKEN_REF match(std::smatch &matches, const std::string &str, size_t startpos) const
168     {
169         const std::string::const_iterator first = str.cbegin() + startpos;
170         const std::string::const_iterator last = str.cend();
171 
172         // If we reached the end, there is nothing more to do
173         if (first == last)
174         {
175             return TOKEN_REF{ static_cast<unsigned int>(startpos), 0, -1 };
176         }
177 
178         time_t start_time = time(NULL);
179 
180         // Try to find a match
181         if (!std::regex_search(first, last, matches, m_re))
182         {
183             throw "Failed to match\n";
184         }
185 
186         search_time += time(NULL) - start_time;
187 
188         // Validate that it's at the start of the string
189         if (matches.prefix().matched)
190         {
191             throw "Failed to match at current position!\n";
192         }
193 
194         // We have a match, check which one it is
195         for (size_t i = 1; i < matches.size(); i++)
196         {
197             if (matches[i].matched)
198             {
199                 unsigned int len = static_cast<unsigned int>(matches.length(i));
200                 int type = m_tokendefs[i - 1].Type;
201                 return TOKEN_REF{ static_cast<unsigned int>(startpos), len, type};
202             }
203         }
204 
205         // We should never get here
206         throw "Something went wrong!\n";
207     }
208 };
209 
210 
211 class TokenList
212 {
213     using TOKEN_REF = typename Tokenizer::TOKEN_REF;
214 
215     const Tokenizer& m_tokenizer;
216     const std::string& m_text;
217     std::vector<TOKEN_REF> m_tokens;
218 
219 public:
220 
TokenList(const Tokenizer & tokenizer,const std::string & text)221     TokenList(const Tokenizer& tokenizer, const std::string& text)
222         : m_tokenizer(tokenizer),
223           m_text(text)
224     {
225         size_t startpos = 0;
226         size_t len = m_text.size();
227         std::smatch matches;
228 
229         m_tokens.reserve(len / 5);
230 
231         while (startpos < len)
232         {
233             TOKEN_REF tref = m_tokenizer.match(matches, m_text, startpos);
234             m_tokens.push_back(tref);
235             startpos += tref.len;
236         };
237     }
238 
size() const239     size_t size() const
240     {
241         return m_tokens.size();
242     }
243 
operator [](size_t n) const244     Token operator[](size_t n) const
245     {
246         return Token(m_text, m_tokens[n].pos, m_tokens[n].len, m_tokens[n].type);
247     }
248 
249 };
250