1 /**
2 * Copyright (c) 2007-2012, Timothy Stack
3 *
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *
9 * * Redistributions of source code must retain the above copyright notice, this
10 * list of conditions and the following disclaimer.
11 * * Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 * * Neither the name of Timothy Stack nor the names of its contributors
15 * may be used to endorse or promote products derived from this software
16 * without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY
19 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
22 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 *
29 * @file pcrepp.cc
30 */
31
32 #include "config.h"
33
34 #include "pcrepp.hh"
35
36 using namespace std;
37
38 const int JIT_STACK_MIN_SIZE = 32 * 1024;
39 const int JIT_STACK_MAX_SIZE = 512 * 1024;
40
operator [](const char * name) const41 pcre_context::capture_t *pcre_context::operator[](const char *name) const
42 {
43 capture_t *retval = nullptr;
44 int index;
45
46 index = this->pc_pcre->name_index(name);
47 if (index != PCRE_ERROR_NOSUBSTRING) {
48 retval = &this->pc_captures[index + 1];
49 }
50
51 return retval;
52 }
53
quote(const char * unquoted)54 std::string pcrepp::quote(const char *unquoted)
55 {
56 std::string retval;
57
58 for (int lpc = 0; unquoted[lpc]; lpc++) {
59 if (isalnum(unquoted[lpc]) ||
60 unquoted[lpc] == '_' ||
61 unquoted[lpc] & 0x80) {
62 retval.push_back(unquoted[lpc]);
63 } else {
64 retval.push_back('\\');
65 retval.push_back(unquoted[lpc]);
66 }
67 }
68
69 return retval;
70 }
71
from_str(std::string pattern,int options)72 Result<pcrepp, pcrepp::compile_error> pcrepp::from_str(std::string pattern, int options)
73 {
74 const char *errptr;
75 int eoff;
76 auto code = pcre_compile(pattern.c_str(),
77 options | PCRE_UTF8,
78 &errptr,
79 &eoff,
80 nullptr);
81
82 if (!code) {
83 return Err(compile_error{errptr, eoff});
84 }
85
86 return Ok(pcrepp(std::move(pattern), code));
87 }
88
find_captures(const char * pattern)89 void pcrepp::find_captures(const char *pattern)
90 {
91 bool in_class = false, in_escape = false, in_literal = false;
92 vector<pcre_context::capture> cap_in_progress;
93
94 for (int lpc = 0; pattern[lpc]; lpc++) {
95 if (in_escape) {
96 in_escape = false;
97 if (pattern[lpc] == 'Q') {
98 in_literal = true;
99 }
100 }
101 else if (in_class) {
102 if (pattern[lpc] == ']') {
103 in_class = false;
104 }
105 if (pattern[lpc] == '\\') {
106 in_escape = true;
107 }
108 }
109 else if (in_literal) {
110 if (pattern[lpc] == '\\' && pattern[lpc + 1] == 'E') {
111 in_literal = false;
112 lpc += 1;
113 }
114 }
115 else {
116 switch (pattern[lpc]) {
117 case '\\':
118 in_escape = true;
119 break;
120 case '[':
121 in_class = true;
122 break;
123 case '(':
124 cap_in_progress.emplace_back(lpc, lpc);
125 break;
126 case ')': {
127 if (!cap_in_progress.empty()) {
128 pcre_context::capture &cap = cap_in_progress.back();
129 char first = '\0', second = '\0', third = '\0';
130 bool is_cap = false;
131
132 cap.c_end = lpc + 1;
133 if (cap.length() >= 2) {
134 first = pattern[cap.c_begin + 1];
135 }
136 if (cap.length() >= 3) {
137 second = pattern[cap.c_begin + 2];
138 }
139 if (cap.length() >= 4) {
140 third = pattern[cap.c_begin + 3];
141 }
142 if (first == '?') {
143 if (second == '\'') {
144 is_cap = true;
145 }
146 if (second == '<' &&
147 (isalpha(third) || third == '_')) {
148 is_cap = true;
149 }
150 if (second == 'P' && third == '<') {
151 is_cap = true;
152 }
153 }
154 else if (first != '*') {
155 is_cap = true;
156 }
157 if (is_cap) {
158 this->p_captures.push_back(cap);
159 }
160 cap_in_progress.pop_back();
161 }
162 break;
163 }
164 }
165 }
166 }
167
168 assert((size_t) this->p_capture_count == this->p_captures.size());
169 }
170
match(pcre_context & pc,pcre_input & pi,int options) const171 bool pcrepp::match(pcre_context &pc, pcre_input &pi, int options) const
172 {
173 int length, startoffset, filtered_options = options;
174 int count = pc.get_max_count();
175 const char *str;
176 int rc;
177
178 pc.set_pcrepp(this);
179 pi.pi_offset = pi.pi_next_offset;
180
181 str = pi.get_string();
182 if (filtered_options & PCRE_ANCHORED) {
183 filtered_options &= ~PCRE_ANCHORED;
184 str = &str[pi.pi_offset];
185 startoffset = 0;
186 length = pi.pi_length - pi.pi_offset;
187 }
188 else {
189 startoffset = pi.pi_offset;
190 length = pi.pi_length;
191 }
192 rc = pcre_exec(this->p_code,
193 this->p_code_extra.in(),
194 str,
195 length,
196 startoffset,
197 filtered_options,
198 (int *)pc.all(),
199 count * 2);
200
201 if (rc < 0) {
202 switch (rc) {
203 case PCRE_ERROR_NOMATCH:
204 break;
205 case PCRE_ERROR_PARTIAL:
206 pc.set_count(1);
207 return true;
208
209 default:
210 break;
211 }
212 }
213 else if (rc == 0) {
214 rc = 0;
215 }
216 else if (pc.all()->c_begin == pc.all()->c_end) {
217 rc = 0;
218 if (pi.pi_next_offset + 1 < pi.pi_length) {
219 pi.pi_next_offset += 1;
220 }
221 }
222 else {
223 if (options & PCRE_ANCHORED) {
224 for (int lpc = 0; lpc < rc; lpc++) {
225 if (pc.all()[lpc].c_begin == -1) {
226 continue;
227 }
228 pc.all()[lpc].c_begin += pi.pi_offset;
229 pc.all()[lpc].c_end += pi.pi_offset;
230 }
231 }
232 pi.pi_next_offset = pc.all()->c_end;
233 }
234
235 pc.set_count(rc);
236
237 return rc > 0;
238 }
239
replace(const char * str,const char * repl) const240 std::string pcrepp::replace(const char *str, const char *repl) const
241 {
242 pcre_context_static<30> pc;
243 pcre_input pi(str);
244 std::string retval;
245 std::string::size_type start = 0;
246
247 while (pi.pi_offset < pi.pi_length) {
248 this->match(pc, pi);
249 auto all = pc.all();
250 bool in_escape = false;
251
252 if (pc.get_count() < 0) {
253 break;
254 }
255
256 retval.append(str, start, (all->c_begin - start));
257 start = all->c_end;
258 for (int lpc = 0; repl[lpc]; lpc++) {
259 auto ch = repl[lpc];
260
261 if (in_escape) {
262 if (isdigit(ch)) {
263 auto capture_index = (ch - '0');
264
265 if (capture_index < pc.get_count()) {
266 retval.append(pi.get_substr_start(&all[capture_index]),
267 pi.get_substr_len(&all[capture_index]));
268 } else if (capture_index > this->p_capture_count) {
269 retval.push_back('\\');
270 retval.push_back(ch);
271 }
272 } else {
273 if (ch != '\\') {
274 retval.push_back('\\');
275 }
276 retval.push_back(ch);
277 }
278 in_escape = false;
279 } else {
280 switch (ch) {
281 case '\\':
282 in_escape = true;
283 break;
284 default:
285 retval.push_back(ch);
286 break;
287 }
288 }
289 }
290 }
291 retval.append(str, start, std::string::npos);
292
293 return retval;
294 }
295
study()296 void pcrepp::study()
297 {
298 const char *errptr;
299
300 this->p_code_extra = pcre_study(this->p_code,
301 #ifdef PCRE_STUDY_JIT_COMPILE
302 PCRE_STUDY_JIT_COMPILE,
303 #else
304 0,
305 #endif
306 &errptr);
307 if (!this->p_code_extra && errptr) {
308 // log_error("pcre_study error: %s", errptr);
309 }
310 if (this->p_code_extra != nullptr) {
311 pcre_extra *extra = this->p_code_extra;
312
313 extra->flags |= (PCRE_EXTRA_MATCH_LIMIT |
314 PCRE_EXTRA_MATCH_LIMIT_RECURSION);
315 extra->match_limit = 10000;
316 extra->match_limit_recursion = 500;
317 #ifdef PCRE_STUDY_JIT_COMPILE
318 // pcre_assign_jit_stack(extra, nullptr, jit_stack());
319 #endif
320 }
321 pcre_fullinfo(this->p_code,
322 this->p_code_extra,
323 PCRE_INFO_OPTIONS,
324 &this->p_options);
325 pcre_fullinfo(this->p_code,
326 this->p_code_extra,
327 PCRE_INFO_CAPTURECOUNT,
328 &this->p_capture_count);
329 pcre_fullinfo(this->p_code,
330 this->p_code_extra,
331 PCRE_INFO_NAMECOUNT,
332 &this->p_named_count);
333 pcre_fullinfo(this->p_code,
334 this->p_code_extra,
335 PCRE_INFO_NAMEENTRYSIZE,
336 &this->p_name_len);
337 pcre_fullinfo(this->p_code,
338 this->p_code_extra,
339 PCRE_INFO_NAMETABLE,
340 &this->p_named_entries);
341 }
342
343 #ifdef PCRE_STUDY_JIT_COMPILE
jit_stack()344 pcre_jit_stack *pcrepp::jit_stack()
345 {
346 static pcre_jit_stack *retval = nullptr;
347
348 if (retval == nullptr) {
349 retval = pcre_jit_stack_alloc(JIT_STACK_MIN_SIZE, JIT_STACK_MAX_SIZE);
350 }
351
352 return retval;
353 }
354
355 #else
356 #warning "pcrejit is not available, search performance will be degraded"
357
pcre_free_study(pcre_extra * extra)358 void pcrepp::pcre_free_study(pcre_extra *extra)
359 {
360 free(extra);
361 }
362 #endif
363