1 #include "./wrapped_re2.h"
2 #include "./util.h"
3
4 #include <memory>
5 #include <string>
6 #include <unordered_set>
7 #include <vector>
8
// Upper-case hexadecimal digits indexed by nibble value (0-15).
// Read-only lookup table, hence const.
static const char hex[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'};
10
// Reports whether `ch` is an ASCII capital letter ('A' through 'Z').
inline bool isUpperCaseAlpha(char ch)
{
	const bool belowRange = ch < 'A';
	const bool aboveRange = ch > 'Z';
	return !(belowRange || aboveRange);
}
15
// Reports whether `ch` is an ASCII hexadecimal digit: 0-9, A-F or a-f.
// Letters beyond F/f are NOT hexadecimal digits and must be rejected,
// otherwise sequences such as "\ug" would be treated as code point escapes.
inline bool isHexadecimal(char ch)
{
	return ('0' <= ch && ch <= '9') || ('A' <= ch && ch <= 'F') || ('a' <= ch && ch <= 'f');
}
20
translateRegExp(const char * data,size_t size,bool multiline,std::vector<char> & buffer)21 static bool translateRegExp(const char *data, size_t size, bool multiline, std::vector<char> &buffer)
22 {
23 std::string result;
24 bool changed = false;
25
26 if (!size)
27 {
28 result = "(?:)";
29 changed = true;
30 }
31 else if (multiline)
32 {
33 result = "(?m)";
34 changed = true;
35 }
36
37 for (size_t i = 0; i < size;)
38 {
39 char ch = data[i];
40 if (ch == '\\')
41 {
42 if (i + 1 < size)
43 {
44 ch = data[i + 1];
45 switch (ch)
46 {
47 case '\\':
48 result += "\\\\";
49 i += 2;
50 continue;
51 case 'c':
52 if (i + 2 < size)
53 {
54 ch = data[i + 2];
55 if (isUpperCaseAlpha(ch))
56 {
57 result += "\\x";
58 result += hex[((ch - '@') / 16) & 15];
59 result += hex[(ch - '@') & 15];
60 i += 3;
61 changed = true;
62 continue;
63 }
64 }
65 result += "\\c";
66 i += 2;
67 continue;
68 case 'u':
69 if (i + 2 < size)
70 {
71 ch = data[i + 2];
72 if (isHexadecimal(ch))
73 {
74 result += "\\x{";
75 result += ch;
76 i += 3;
77 for (size_t j = 0; j < 3 && i < size; ++i, ++j)
78 {
79 ch = data[i];
80 if (!isHexadecimal(ch))
81 {
82 break;
83 }
84 result += ch;
85 }
86 result += '}';
87 changed = true;
88 continue;
89 }
90 else if (ch == '{')
91 {
92 result += "\\x";
93 i += 2;
94 changed = true;
95 continue;
96 }
97 }
98 result += "\\u";
99 i += 2;
100 continue;
101 default:
102 result += "\\";
103 size_t sym_size = getUtf8CharSize(ch);
104 result.append(data + i + 1, sym_size);
105 i += sym_size + 1;
106 continue;
107 }
108 }
109 }
110 else if (ch == '/')
111 {
112 result += "\\/";
113 i += 1;
114 changed = true;
115 continue;
116 }
117 else if (ch == '(' && i + 2 < size && data[i + 1] == '?' && data[i + 2] == '<')
118 {
119 if (i + 3 >= size || (data[i + 3] != '=' && data[i + 3] != '!'))
120 {
121 result += "(?P<";
122 i += 3;
123 changed = true;
124 continue;
125 }
126 }
127 size_t sym_size = getUtf8CharSize(ch);
128 result.append(data + i, sym_size);
129 i += sym_size;
130 }
131
132 if (!changed)
133 {
134 return false;
135 }
136
137 buffer.resize(0);
138 buffer.insert(buffer.end(), result.data(), result.data() + result.size());
139 buffer.push_back('\0');
140
141 return true;
142 }
143
escapeRegExp(const char * data,size_t size)144 static std::string escapeRegExp(const char *data, size_t size)
145 {
146 std::string result;
147
148 if (!size)
149 {
150 result = "(?:)";
151 }
152
153 size_t prevBackSlashes = 0;
154 for (size_t i = 0; i < size;)
155 {
156 char ch = data[i];
157 if (ch == '\\')
158 {
159 ++prevBackSlashes;
160 }
161 else if (ch == '/' && !(prevBackSlashes & 1))
162 {
163 result += "\\/";
164 i += 1;
165 prevBackSlashes = 0;
166 continue;
167 }
168 else
169 {
170 prevBackSlashes = 0;
171 }
172 size_t sym_size = getUtf8CharSize(ch);
173 result.append(data + i, sym_size);
174 i += sym_size;
175 }
176
177 return result;
178 }
179
180 bool WrappedRE2::alreadyWarnedAboutUnicode = false;
181
// Text of the notice that is printed (or thrown) when a pattern is created
// without the "u" flag.
static const char *depricationMessage =
	"BMP patterns aren't supported by node-re2. "
	"An implicit \"u\" flag is assumed by the RE2 constructor. "
	"In a future major version, calling the RE2 constructor without the \"u\" flag "
	"may become forbidden, or cause a different behavior. "
	"Please see https://github.com/uhop/node-re2/issues/21 for more information.";
183
// Checks that no capture group name occurs more than once.
//
// groups: map from group index to group name.
// Returns true when all names are distinct (including the empty map),
// false as soon as a repeated name is found.
inline bool ensureUniqueNamedGroups(const std::map<int, std::string> &groups)
{
	std::unordered_set<std::string> names;

	// iterate by reference: copying each (index, name) pair is unnecessary
	for (const auto &group : groups)
	{
		if (!names.insert(group.second).second)
		{
			return false;
		}
	}

	return true;
}
198
// JavaScript-facing constructor: RE2(pattern[, flags]).
//
// pattern: a string, a Buffer, a RegExp or another RE2 object.
// flags:   optional string or Buffer; the characters g, i, m, u and y are
//          recognized, anything else is ignored. When `pattern` is a RegExp or
//          an RE2 object, the flags are taken from that object instead.
//
// Throws a TypeError when `pattern` has none of the supported types, and a
// SyntaxError when RE2 rejects the pattern, when capture group names are
// duplicated, or when the "u" flag is missing and unicodeWarningLevel is THROW.
// When invoked without `new`, it re-invokes itself as a constructor.
NAN_METHOD(WrappedRE2::New)
{

	if (!info.IsConstructCall())
	{
		// call a constructor and return the result

		std::vector<v8::Local<v8::Value>> parameters(info.Length());
		for (size_t i = 0, n = info.Length(); i < n; ++i)
		{
			parameters[i] = info[i];
		}
		auto isolate = v8::Isolate::GetCurrent();
		auto p_tpl = Nan::GetIsolateData<Nan::Persistent<v8::FunctionTemplate>>(isolate);
		// data() is valid for an empty vector, unlike &parameters[0]
		auto newObject = Nan::NewInstance(Nan::GetFunction(p_tpl->Get(isolate)).ToLocalChecked(), static_cast<int>(parameters.size()), parameters.data());
		if (!newObject.IsEmpty())
		{
			info.GetReturnValue().Set(newObject.ToLocalChecked());
		}
		return;
	}

	// process arguments

	std::vector<char> buffer;

	char *data = nullptr;
	size_t size = 0;

	std::string source;
	bool global = false;
	bool ignoreCase = false;
	bool multiline = false;
	bool unicode = false;
	bool sticky = false;

	auto context = Nan::GetCurrentContext();

	// Decodes a JS string as UTF-8 into `buffer` (NUL-terminated) and points
	// `data`/`size` at the decoded bytes.
	auto decodeString = [&buffer, &data, &size](v8::Local<v8::Value> value) {
		size = Nan::DecodeBytes(value, Nan::UTF8);
		buffer.resize(size + 1);
		data = &buffer[0];
		Nan::DecodeWrite(data, size, value, Nan::UTF8);
		buffer[size] = '\0';
	};

	if (info.Length() > 1)
	{
		if (info[1]->IsString())
		{
			decodeString(info[1]);
		}
		else if (node::Buffer::HasInstance(info[1]))
		{
			size = node::Buffer::Length(info[1]);
			data = node::Buffer::Data(info[1]);
		}
		for (size_t i = 0; i < size; ++i)
		{
			switch (data[i])
			{
			case 'g':
				global = true;
				break;
			case 'i':
				ignoreCase = true;
				break;
			case 'm':
				multiline = true;
				break;
			case 'u':
				unicode = true;
				break;
			case 'y':
				sticky = true;
				break;
			}
		}
		// The flags must not be mistaken for a pattern: forget them completely,
		// so an unsupported 1st argument is still reported by the check below.
		size = 0;
		data = nullptr;
	}

	bool needConversion = true;

	if (node::Buffer::HasInstance(info[0]))
	{
		size = node::Buffer::Length(info[0]);
		data = node::Buffer::Data(info[0]);

		source = escapeRegExp(data, size);
	}
	else if (info[0]->IsRegExp())
	{
		const auto *re = v8::RegExp::Cast(*info[0]);

		decodeString(re->GetSource());

		source = escapeRegExp(data, size);

		v8::RegExp::Flags flags = re->GetFlags();
		global = bool(flags & v8::RegExp::kGlobal);
		ignoreCase = bool(flags & v8::RegExp::kIgnoreCase);
		multiline = bool(flags & v8::RegExp::kMultiline);
		unicode = bool(flags & v8::RegExp::kUnicode);
		sticky = bool(flags & v8::RegExp::kSticky);
	}
	else if (info[0]->IsObject() && !info[0]->IsString())
	{
		WrappedRE2 *re2 = nullptr;
		auto object = info[0]->ToObject(context).ToLocalChecked();
		// NOTE(review): any object with internal fields is treated as a wrapped
		// RE2 instance here - confirm no other wrapped objects can reach this.
		if (!object.IsEmpty() && object->InternalFieldCount() > 0)
		{
			re2 = Nan::ObjectWrap::Unwrap<WrappedRE2>(object);
		}
		if (re2)
		{
			// copy an existing RE2 object: its pattern is already translated
			const auto &pattern = re2->regexp.pattern();
			size = pattern.size();
			buffer.assign(pattern.begin(), pattern.end());
			buffer.push_back('\0'); // keeps the buffer non-empty, so &buffer[0] is valid
			data = &buffer[0];
			needConversion = false;

			source = re2->source;

			global = re2->global;
			ignoreCase = re2->ignoreCase;
			multiline = re2->multiline;
			unicode = true;
			sticky = re2->sticky;
		}
	}
	else if (info[0]->IsString())
	{
		decodeString(info[0]);

		source = escapeRegExp(data, size);
	}

	if (!data)
	{
		return Nan::ThrowTypeError("Expected string, Buffer, RegExp, or RE2 as the 1st argument.");
	}

	if (!unicode)
	{
		switch (unicodeWarningLevel)
		{
		case THROW:
			return Nan::ThrowSyntaxError(depricationMessage);
		case WARN:
			printDeprecationWarning(depricationMessage);
			break;
		case WARN_ONCE:
			if (!alreadyWarnedAboutUnicode)
			{
				printDeprecationWarning(depricationMessage);
				alreadyWarnedAboutUnicode = true;
			}
			break;
		default:
			break;
		}
	}

	if (needConversion && translateRegExp(data, size, multiline, buffer))
	{
		// the translation replaced the buffer content (NUL-terminated)
		size = buffer.size() - 1;
		data = &buffer[0];
	}

	// create and return an object

	re2::RE2::Options options;
	options.set_case_sensitive(!ignoreCase);
	options.set_one_line(!multiline); // to track this state, otherwise it is ignored
	options.set_log_errors(false);	  // inappropriate when embedding

	std::unique_ptr<WrappedRE2> re2(new WrappedRE2(re2::StringPiece(data, size), options, source, global, ignoreCase, multiline, sticky));
	if (!re2->regexp.ok())
	{
		return Nan::ThrowSyntaxError(re2->regexp.error().c_str());
	}
	if (!ensureUniqueNamedGroups(re2->regexp.CapturingGroupNames()))
	{
		return Nan::ThrowSyntaxError("duplicate capture group name");
	}
	// ownership passes to the wrapper object once Wrap() has succeeded
	re2->Wrap(info.This());
	re2.release();

	info.GetReturnValue().Set(info.This());
}
395