1 #include "./wrapped_re2.h"
2 #include "./util.h"
3 
4 #include <memory>
5 #include <string>
6 #include <unordered_set>
7 #include <vector>
8 
// Upper-case hexadecimal digits, indexed by nibble value (0 through 15).
static char hex[] = {
	'0', '1', '2', '3', '4', '5', '6', '7',
	'8', '9', 'A', 'B', 'C', 'D', 'E', 'F'};
10 
// Reports whether ch is an ASCII upper-case letter, i.e. lies in 'A'..'Z'.
inline bool isUpperCaseAlpha(char ch)
{
	const bool outsideRange = ch < 'A' || ch > 'Z';
	return !outsideRange;
}
15 
// Reports whether ch is an ASCII hexadecimal digit: '0'..'9', 'A'..'F' or
// 'a'..'f'. Letters beyond F/f are rejected so that sequences such as
// "\uGHIJ" are not mistaken for hexadecimal escapes.
inline bool isHexadecimal(char ch)
{
	return ('0' <= ch && ch <= '9') || ('A' <= ch && ch <= 'F') || ('a' <= ch && ch <= 'f');
}
20 
translateRegExp(const char * data,size_t size,bool multiline,std::vector<char> & buffer)21 static bool translateRegExp(const char *data, size_t size, bool multiline, std::vector<char> &buffer)
22 {
23 	std::string result;
24 	bool changed = false;
25 
26 	if (!size)
27 	{
28 		result = "(?:)";
29 		changed = true;
30 	}
31 	else if (multiline)
32 	{
33 		result = "(?m)";
34 		changed = true;
35 	}
36 
37 	for (size_t i = 0; i < size;)
38 	{
39 		char ch = data[i];
40 		if (ch == '\\')
41 		{
42 			if (i + 1 < size)
43 			{
44 				ch = data[i + 1];
45 				switch (ch)
46 				{
47 				case '\\':
48 					result += "\\\\";
49 					i += 2;
50 					continue;
51 				case 'c':
52 					if (i + 2 < size)
53 					{
54 						ch = data[i + 2];
55 						if (isUpperCaseAlpha(ch))
56 						{
57 							result += "\\x";
58 							result += hex[((ch - '@') / 16) & 15];
59 							result += hex[(ch - '@') & 15];
60 							i += 3;
61 							changed = true;
62 							continue;
63 						}
64 					}
65 					result += "\\c";
66 					i += 2;
67 					continue;
68 				case 'u':
69 					if (i + 2 < size)
70 					{
71 						ch = data[i + 2];
72 						if (isHexadecimal(ch))
73 						{
74 							result += "\\x{";
75 							result += ch;
76 							i += 3;
77 							for (size_t j = 0; j < 3 && i < size; ++i, ++j)
78 							{
79 								ch = data[i];
80 								if (!isHexadecimal(ch))
81 								{
82 									break;
83 								}
84 								result += ch;
85 							}
86 							result += '}';
87 							changed = true;
88 							continue;
89 						}
90 						else if (ch == '{')
91 						{
92 							result += "\\x";
93 							i += 2;
94 							changed = true;
95 							continue;
96 						}
97 					}
98 					result += "\\u";
99 					i += 2;
100 					continue;
101 				default:
102 					result += "\\";
103 					size_t sym_size = getUtf8CharSize(ch);
104 					result.append(data + i + 1, sym_size);
105 					i += sym_size + 1;
106 					continue;
107 				}
108 			}
109 		}
110 		else if (ch == '/')
111 		{
112 			result += "\\/";
113 			i += 1;
114 			changed = true;
115 			continue;
116 		}
117 		else if (ch == '(' && i + 2 < size && data[i + 1] == '?' && data[i + 2] == '<')
118 		{
119 			if (i + 3 >= size || (data[i + 3] != '=' && data[i + 3] != '!'))
120 			{
121 				result += "(?P<";
122 				i += 3;
123 				changed = true;
124 				continue;
125 			}
126 		}
127 		size_t sym_size = getUtf8CharSize(ch);
128 		result.append(data + i, sym_size);
129 		i += sym_size;
130 	}
131 
132 	if (!changed)
133 	{
134 		return false;
135 	}
136 
137 	buffer.resize(0);
138 	buffer.insert(buffer.end(), result.data(), result.data() + result.size());
139 	buffer.push_back('\0');
140 
141 	return true;
142 }
143 
escapeRegExp(const char * data,size_t size)144 static std::string escapeRegExp(const char *data, size_t size)
145 {
146 	std::string result;
147 
148 	if (!size)
149 	{
150 		result = "(?:)";
151 	}
152 
153 	size_t prevBackSlashes = 0;
154 	for (size_t i = 0; i < size;)
155 	{
156 		char ch = data[i];
157 		if (ch == '\\')
158 		{
159 			++prevBackSlashes;
160 		}
161 		else if (ch == '/' && !(prevBackSlashes & 1))
162 		{
163 			result += "\\/";
164 			i += 1;
165 			prevBackSlashes = 0;
166 			continue;
167 		}
168 		else
169 		{
170 			prevBackSlashes = 0;
171 		}
172 		size_t sym_size = getUtf8CharSize(ch);
173 		result.append(data + i, sym_size);
174 		i += sym_size;
175 	}
176 
177 	return result;
178 }
179 
180 bool WrappedRE2::alreadyWarnedAboutUnicode = false;
181 
182 static const char *depricationMessage = "BMP patterns aren't supported by node-re2. An implicit \"u\" flag is assumed by the RE2 constructor. In a future major version, calling the RE2 constructor without the \"u\" flag may become forbidden, or cause a different behavior. Please see https://github.com/uhop/node-re2/issues/21 for more information.";
183 
// Checks that no name occurs more than once among the mapped values.
//
// Parameters:
//   groups - capture group names keyed by an integer.
//
// Returns true when every name in `groups` is distinct (an empty map counts
// as distinct), false as soon as a repeated name is found.
inline bool ensureUniqueNamedGroups(const std::map<int, std::string> &groups)
{
	std::unordered_set<std::string> names;
	names.reserve(groups.size());

	// iterate by reference: copying each (index, name) pair is unnecessary
	for (const auto &group : groups)
	{
		// insert() reports in .second whether the name was newly added
		if (!names.insert(group.second).second)
		{
			return false;
		}
	}

	return true;
}
198 
NAN_METHOD(WrappedRE2::New)199 NAN_METHOD(WrappedRE2::New)
200 {
201 
202 	if (!info.IsConstructCall())
203 	{
204 		// call a constructor and return the result
205 
206 		std::vector<v8::Local<v8::Value>> parameters(info.Length());
207 		for (size_t i = 0, n = info.Length(); i < n; ++i)
208 		{
209 			parameters[i] = info[i];
210 		}
211 		auto isolate = v8::Isolate::GetCurrent();
212 		auto p_tpl = Nan::GetIsolateData<Nan::Persistent<v8::FunctionTemplate>>(isolate);
213 		auto newObject = Nan::NewInstance(Nan::GetFunction(p_tpl->Get(isolate)).ToLocalChecked(), parameters.size(), &parameters[0]);
214 		if (!newObject.IsEmpty())
215 		{
216 			info.GetReturnValue().Set(newObject.ToLocalChecked());
217 		}
218 		return;
219 	}
220 
221 	// process arguments
222 
223 	std::vector<char> buffer;
224 
225 	char *data = NULL;
226 	size_t size = 0;
227 
228 	std::string source;
229 	bool global = false;
230 	bool ignoreCase = false;
231 	bool multiline = false;
232 	bool unicode = false;
233 	bool sticky = false;
234 
235 	auto context = Nan::GetCurrentContext();
236 
237 	if (info.Length() > 1)
238 	{
239 		if (info[1]->IsString())
240 		{
241 			size = Nan::DecodeBytes(info[1], Nan::UTF8);
242 			buffer.resize(size + 1);
243 			data = &buffer[0];
244 			Nan::DecodeWrite(data, size, info[1], Nan::UTF8);
245 			buffer[size] = '\0';
246 		}
247 		else if (node::Buffer::HasInstance(info[1]))
248 		{
249 			size = node::Buffer::Length(info[1]);
250 			data = node::Buffer::Data(info[1]);
251 		}
252 		for (size_t i = 0; i < size; ++i)
253 		{
254 			switch (data[i])
255 			{
256 			case 'g':
257 				global = true;
258 				break;
259 			case 'i':
260 				ignoreCase = true;
261 				break;
262 			case 'm':
263 				multiline = true;
264 				break;
265 			case 'u':
266 				unicode = true;
267 				break;
268 			case 'y':
269 				sticky = true;
270 				break;
271 			}
272 		}
273 		size = 0;
274 	}
275 
276 	bool needConversion = true;
277 
278 	if (node::Buffer::HasInstance(info[0]))
279 	{
280 		size = node::Buffer::Length(info[0]);
281 		data = node::Buffer::Data(info[0]);
282 
283 		source = escapeRegExp(data, size);
284 	}
285 	else if (info[0]->IsRegExp())
286 	{
287 		const auto *re = v8::RegExp::Cast(*info[0]);
288 
289 		auto t = re->GetSource();
290 		size = Nan::DecodeBytes(t, Nan::UTF8);
291 		buffer.resize(size + 1);
292 		data = &buffer[0];
293 		Nan::DecodeWrite(data, size, t, Nan::UTF8);
294 		buffer[size] = '\0';
295 
296 		source = escapeRegExp(data, size);
297 
298 		v8::RegExp::Flags flags = re->GetFlags();
299 		global = bool(flags & v8::RegExp::kGlobal);
300 		ignoreCase = bool(flags & v8::RegExp::kIgnoreCase);
301 		multiline = bool(flags & v8::RegExp::kMultiline);
302 		unicode = bool(flags & v8::RegExp::kUnicode);
303 		sticky = bool(flags & v8::RegExp::kSticky);
304 	}
305 	else if (info[0]->IsObject() && !info[0]->IsString())
306 	{
307 		WrappedRE2 *re2 = nullptr;
308 		auto object = info[0]->ToObject(context).ToLocalChecked();
309 		if (!object.IsEmpty() && object->InternalFieldCount() > 0)
310 		{
311 			re2 = Nan::ObjectWrap::Unwrap<WrappedRE2>(object);
312 		}
313 		if (re2)
314 		{
315 			const auto &pattern = re2->regexp.pattern();
316 			size = pattern.size();
317 			buffer.resize(size);
318 			data = &buffer[0];
319 			memcpy(data, pattern.data(), size);
320 			needConversion = false;
321 
322 			source = re2->source;
323 
324 			global = re2->global;
325 			ignoreCase = re2->ignoreCase;
326 			multiline = re2->multiline;
327 			unicode = true;
328 			sticky = re2->sticky;
329 		}
330 	}
331 	else if (info[0]->IsString())
332 	{
333 		size = Nan::DecodeBytes(info[0], Nan::UTF8);
334 		buffer.resize(size + 1);
335 		data = &buffer[0];
336 		Nan::DecodeWrite(data, size, info[0], Nan::UTF8);
337 		buffer[size] = '\0';
338 
339 		source = escapeRegExp(data, size);
340 	}
341 
342 	if (!data)
343 	{
344 		return Nan::ThrowTypeError("Expected string, Buffer, RegExp, or RE2 as the 1st argument.");
345 	}
346 
347 	if (!unicode)
348 	{
349 		switch (unicodeWarningLevel)
350 		{
351 		case THROW:
352 			return Nan::ThrowSyntaxError(depricationMessage);
353 		case WARN:
354 			printDeprecationWarning(depricationMessage);
355 			break;
356 		case WARN_ONCE:
357 			if (!alreadyWarnedAboutUnicode)
358 			{
359 				printDeprecationWarning(depricationMessage);
360 				alreadyWarnedAboutUnicode = true;
361 			}
362 			break;
363 		default:
364 			break;
365 		}
366 	}
367 
368 	if (needConversion && translateRegExp(data, size, multiline, buffer))
369 	{
370 		size = buffer.size() - 1;
371 		data = &buffer[0];
372 	}
373 
374 	// create and return an object
375 
376 	re2::RE2::Options options;
377 	options.set_case_sensitive(!ignoreCase);
378 	options.set_one_line(!multiline); // to track this state, otherwise it is ignored
379 	options.set_log_errors(false);	  // inappropriate when embedding
380 
381 	std::unique_ptr<WrappedRE2> re2(new WrappedRE2(re2::StringPiece(data, size), options, source, global, ignoreCase, multiline, sticky));
382 	if (!re2->regexp.ok())
383 	{
384 		return Nan::ThrowSyntaxError(re2->regexp.error().c_str());
385 	}
386 	if (!ensureUniqueNamedGroups(re2->regexp.CapturingGroupNames()))
387 	{
388 		return Nan::ThrowSyntaxError("duplicate capture group name");
389 	}
390 	re2->Wrap(info.This());
391 	re2.release();
392 
393 	info.GetReturnValue().Set(info.This());
394 }
395