1 /**
2  * Copyright (c) 2007-2012, Timothy Stack
3  *
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *
9  * * Redistributions of source code must retain the above copyright notice, this
10  * list of conditions and the following disclaimer.
11  * * Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  * * Neither the name of Timothy Stack nor the names of its contributors
15  * may be used to endorse or promote products derived from this software
16  * without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY
19  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21  * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
22  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25  * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  *
29  * @file pcrepp.cc
30  */
31 
32 #include "config.h"
33 
34 #include "pcrepp.hh"
35 
36 using namespace std;
37 
38 const int JIT_STACK_MIN_SIZE = 32 * 1024;
39 const int JIT_STACK_MAX_SIZE = 512 * 1024;
40 
operator [](const char * name) const41 pcre_context::capture_t *pcre_context::operator[](const char *name) const
42 {
43     capture_t *retval = nullptr;
44     int index;
45 
46     index = this->pc_pcre->name_index(name);
47     if (index != PCRE_ERROR_NOSUBSTRING) {
48         retval = &this->pc_captures[index + 1];
49     }
50 
51     return retval;
52 }
53 
quote(const char * unquoted)54 std::string pcrepp::quote(const char *unquoted)
55 {
56     std::string retval;
57 
58     for (int lpc = 0; unquoted[lpc]; lpc++) {
59         if (isalnum(unquoted[lpc]) ||
60             unquoted[lpc] == '_' ||
61             unquoted[lpc] & 0x80) {
62             retval.push_back(unquoted[lpc]);
63         } else {
64             retval.push_back('\\');
65             retval.push_back(unquoted[lpc]);
66         }
67     }
68 
69     return retval;
70 }
71 
from_str(std::string pattern,int options)72 Result<pcrepp, pcrepp::compile_error> pcrepp::from_str(std::string pattern, int options)
73 {
74     const char *errptr;
75     int eoff;
76     auto code = pcre_compile(pattern.c_str(),
77                              options | PCRE_UTF8,
78                              &errptr,
79                              &eoff,
80                              nullptr);
81 
82     if (!code) {
83         return Err(compile_error{errptr, eoff});
84     }
85 
86     return Ok(pcrepp(std::move(pattern), code));
87 }
88 
find_captures(const char * pattern)89 void pcrepp::find_captures(const char *pattern)
90 {
91     bool in_class = false, in_escape = false, in_literal = false;
92     vector<pcre_context::capture> cap_in_progress;
93 
94     for (int lpc = 0; pattern[lpc]; lpc++) {
95         if (in_escape) {
96             in_escape = false;
97             if (pattern[lpc] == 'Q') {
98                 in_literal = true;
99             }
100         }
101         else if (in_class) {
102             if (pattern[lpc] == ']') {
103                 in_class = false;
104             }
105             if (pattern[lpc] == '\\') {
106                 in_escape = true;
107             }
108         }
109         else if (in_literal) {
110             if (pattern[lpc] == '\\' && pattern[lpc + 1] == 'E') {
111                 in_literal = false;
112                 lpc += 1;
113             }
114         }
115         else {
116             switch (pattern[lpc]) {
117                 case '\\':
118                     in_escape = true;
119                     break;
120                 case '[':
121                     in_class = true;
122                     break;
123                 case '(':
124                     cap_in_progress.emplace_back(lpc, lpc);
125                     break;
126                 case ')': {
127                     if (!cap_in_progress.empty()) {
128                         pcre_context::capture &cap = cap_in_progress.back();
129                         char first = '\0', second = '\0', third = '\0';
130                         bool is_cap = false;
131 
132                         cap.c_end = lpc + 1;
133                         if (cap.length() >= 2) {
134                             first = pattern[cap.c_begin + 1];
135                         }
136                         if (cap.length() >= 3) {
137                             second = pattern[cap.c_begin + 2];
138                         }
139                         if (cap.length() >= 4) {
140                             third = pattern[cap.c_begin + 3];
141                         }
142                         if (first == '?') {
143                             if (second == '\'') {
144                                 is_cap = true;
145                             }
146                             if (second == '<' &&
147                                 (isalpha(third) || third == '_')) {
148                                 is_cap = true;
149                             }
150                             if (second == 'P' && third == '<') {
151                                 is_cap = true;
152                             }
153                         }
154                         else if (first != '*') {
155                             is_cap = true;
156                         }
157                         if (is_cap) {
158                             this->p_captures.push_back(cap);
159                         }
160                         cap_in_progress.pop_back();
161                     }
162                     break;
163                 }
164             }
165         }
166     }
167 
168     assert((size_t) this->p_capture_count == this->p_captures.size());
169 }
170 
match(pcre_context & pc,pcre_input & pi,int options) const171 bool pcrepp::match(pcre_context &pc, pcre_input &pi, int options) const
172 {
173     int         length, startoffset, filtered_options = options;
174     int         count = pc.get_max_count();
175     const char *str;
176     int         rc;
177 
178     pc.set_pcrepp(this);
179     pi.pi_offset = pi.pi_next_offset;
180 
181     str = pi.get_string();
182     if (filtered_options & PCRE_ANCHORED) {
183         filtered_options &= ~PCRE_ANCHORED;
184         str         = &str[pi.pi_offset];
185         startoffset = 0;
186         length      = pi.pi_length - pi.pi_offset;
187     }
188     else {
189         startoffset = pi.pi_offset;
190         length      = pi.pi_length;
191     }
192     rc = pcre_exec(this->p_code,
193                    this->p_code_extra.in(),
194                    str,
195                    length,
196                    startoffset,
197                    filtered_options,
198                    (int *)pc.all(),
199                    count * 2);
200 
201     if (rc < 0) {
202         switch (rc) {
203             case PCRE_ERROR_NOMATCH:
204                 break;
205             case PCRE_ERROR_PARTIAL:
206                 pc.set_count(1);
207                 return true;
208 
209             default:
210                 break;
211         }
212     }
213     else if (rc == 0) {
214         rc = 0;
215     }
216     else if (pc.all()->c_begin == pc.all()->c_end) {
217         rc = 0;
218         if (pi.pi_next_offset + 1 < pi.pi_length) {
219             pi.pi_next_offset += 1;
220         }
221     }
222     else {
223         if (options & PCRE_ANCHORED) {
224             for (int lpc = 0; lpc < rc; lpc++) {
225                 if (pc.all()[lpc].c_begin == -1) {
226                     continue;
227                 }
228                 pc.all()[lpc].c_begin += pi.pi_offset;
229                 pc.all()[lpc].c_end   += pi.pi_offset;
230             }
231         }
232         pi.pi_next_offset = pc.all()->c_end;
233     }
234 
235     pc.set_count(rc);
236 
237     return rc > 0;
238 }
239 
replace(const char * str,const char * repl) const240 std::string pcrepp::replace(const char *str, const char *repl) const
241 {
242     pcre_context_static<30> pc;
243     pcre_input pi(str);
244     std::string retval;
245     std::string::size_type start = 0;
246 
247     while (pi.pi_offset < pi.pi_length) {
248         this->match(pc, pi);
249         auto all = pc.all();
250         bool in_escape = false;
251 
252         if (pc.get_count() < 0) {
253             break;
254         }
255 
256         retval.append(str, start, (all->c_begin - start));
257         start = all->c_end;
258         for (int lpc = 0; repl[lpc]; lpc++) {
259             auto ch = repl[lpc];
260 
261             if (in_escape) {
262                 if (isdigit(ch)) {
263                     auto capture_index = (ch - '0');
264 
265                     if (capture_index < pc.get_count()) {
266                         retval.append(pi.get_substr_start(&all[capture_index]),
267                                       pi.get_substr_len(&all[capture_index]));
268                     } else if (capture_index > this->p_capture_count) {
269                         retval.push_back('\\');
270                         retval.push_back(ch);
271                     }
272                 } else {
273                     if (ch != '\\') {
274                         retval.push_back('\\');
275                     }
276                     retval.push_back(ch);
277                 }
278                 in_escape = false;
279             } else {
280                 switch (ch) {
281                     case '\\':
282                         in_escape = true;
283                         break;
284                     default:
285                         retval.push_back(ch);
286                         break;
287                 }
288             }
289         }
290     }
291     retval.append(str, start, std::string::npos);
292 
293     return retval;
294 }
295 
study()296 void pcrepp::study()
297 {
298     const char *errptr;
299 
300     this->p_code_extra = pcre_study(this->p_code,
301 #ifdef PCRE_STUDY_JIT_COMPILE
302                                     PCRE_STUDY_JIT_COMPILE,
303 #else
304         0,
305 #endif
306                                     &errptr);
307     if (!this->p_code_extra && errptr) {
308         // log_error("pcre_study error: %s", errptr);
309     }
310     if (this->p_code_extra != nullptr) {
311         pcre_extra *extra = this->p_code_extra;
312 
313         extra->flags |= (PCRE_EXTRA_MATCH_LIMIT |
314                          PCRE_EXTRA_MATCH_LIMIT_RECURSION);
315         extra->match_limit           = 10000;
316         extra->match_limit_recursion = 500;
317 #ifdef PCRE_STUDY_JIT_COMPILE
318         // pcre_assign_jit_stack(extra, nullptr, jit_stack());
319 #endif
320     }
321     pcre_fullinfo(this->p_code,
322                   this->p_code_extra,
323                   PCRE_INFO_OPTIONS,
324                   &this->p_options);
325     pcre_fullinfo(this->p_code,
326                   this->p_code_extra,
327                   PCRE_INFO_CAPTURECOUNT,
328                   &this->p_capture_count);
329     pcre_fullinfo(this->p_code,
330                   this->p_code_extra,
331                   PCRE_INFO_NAMECOUNT,
332                   &this->p_named_count);
333     pcre_fullinfo(this->p_code,
334                   this->p_code_extra,
335                   PCRE_INFO_NAMEENTRYSIZE,
336                   &this->p_name_len);
337     pcre_fullinfo(this->p_code,
338                   this->p_code_extra,
339                   PCRE_INFO_NAMETABLE,
340                   &this->p_named_entries);
341 }
342 
343 #ifdef PCRE_STUDY_JIT_COMPILE
jit_stack()344 pcre_jit_stack *pcrepp::jit_stack()
345 {
346     static pcre_jit_stack *retval = nullptr;
347 
348     if (retval == nullptr) {
349         retval = pcre_jit_stack_alloc(JIT_STACK_MIN_SIZE, JIT_STACK_MAX_SIZE);
350     }
351 
352     return retval;
353 }
354 
355 #else
356 #warning "pcrejit is not available, search performance will be degraded"
357 
pcre_free_study(pcre_extra * extra)358 void pcrepp::pcre_free_study(pcre_extra *extra)
359 {
360     free(extra);
361 }
362 #endif
363