1 /**
2 * Copyright (c) 2007-2013, Timothy Stack
3 *
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *
9 * * Redistributions of source code must retain the above copyright notice, this
10 * list of conditions and the following disclaimer.
11 * * Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 * * Neither the name of Timothy Stack nor the names of its contributors
15 * may be used to endorse or promote products derived from this software
16 * without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY
19 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
22 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 *
29 * @file pcrepp.hh
30 *
31 * A C++ adapter for the pcre library. The interface provided here has a
32 * different focus than the pcrecpp.h file included in the pcre distribution.
 * The standard pcrecpp.h interface is more concerned with regular expressions
 * that are digesting data to be used within the program itself, whereas this
 * interface deals with regular expressions entered by the user and with
 * processing a series of matches on text files.
37 */
38
39 #ifndef pcrepp_hh
40 #define pcrepp_hh
41
42 #ifdef HAVE_PCRE_H
43 #include <pcre.h>
44 #elif HAVE_PCRE_PCRE_H
45 #include <pcre/pcre.h>
46 #else
47 #error "pcre.h not found?"
48 #endif
49
50 #include <string.h>
51
52 #include <cassert>
53 #include <string>
54 #include <memory>
55 #include <utility>
56 #include <vector>
57 #include <exception>
58
59 #include "auto_mem.hh"
60 #include "base/intern_string.hh"
61 #include "base/result.h"
62
63 #include <stdio.h>
64
65 class pcrepp;
66
67 /**
68 * Context that tracks captures found during a match operation. This class is a
69 * base that defines iterator methods and fields, but does not allocate space
70 * for the capture array.
71 */
class pcre_context {
public:
    /**
     * The range of a single capture, stored as a pair of offsets into the
     * subject string.  The range is half-open: c_begin is included and c_end
     * is not.
     */
    typedef struct capture {
        capture()
        {
            /* We don't initialize anything since it's a perf hit. */
        }

        capture(int begin, int end) : c_begin(begin), c_end(end)
        {
            assert(begin <= end);
        }

        int c_begin;
        int c_end;

        /**
         * Move c_begin forward past any leading whitespace, stopping at
         * c_end.
         *
         * @param str The string that the offsets in this capture index into.
         */
        void ltrim(const char *str)
        {
            // The <ctype.h> classifiers are only defined for EOF and for
            // values representable as unsigned char.  Convert first so that
            // bytes >= 0x80 are not passed as negative values when plain
            // char is signed.
            while (this->c_begin < this->c_end
                   && isspace(static_cast<unsigned char>(str[this->c_begin])))
            {
                this->c_begin += 1;
            }
        }

        /** @return True if pos lies within [c_begin, c_end). */
        bool contains(int pos) const
        {
            return this->c_begin <= pos && pos < this->c_end;
        }

        /** @return True unless c_begin holds the -1 "not set" marker. */
        bool is_valid() const { return this->c_begin != -1; }

        /** @return The distance between c_begin and c_end. */
        int length() const { return this->c_end - this->c_begin; }

        /** @return True if c_begin and c_end are equal. */
        bool empty() const { return this->c_begin == this->c_end; }
    } capture_t;
    typedef capture_t *iterator;
    typedef const capture_t *const_iterator;

    /** @return The maximum number of strings this context can capture. */
    int get_max_count() const
    {
        return this->pc_max_count;
    }

    /** Record the number of entries of the capture array that are in use. */
    void set_count(int count)
    {
        this->pc_count = count;
    }

    /** @return The value last passed to set_count(), initially zero. */
    int get_count() const
    {
        return this->pc_count;
    }

    /**
     * Remember the regular expression associated with this context.  The
     * elaborated type specifier is used because only a pointer is stored, so
     * nothing more than the class name is needed here.
     */
    void set_pcrepp(const class pcrepp *src) { this->pc_pcre = src; }

    /**
     * @return a capture_t that covers all of the text that was matched.
     */
    capture_t *all() const { return pc_captures; }

    /** @return An iterator to the first capture. */
    iterator begin() { return pc_captures + 1; }
    /** @return An iterator that refers to the end of the capture array. */
    iterator end() { return pc_captures + pc_count; }

    /**
     * @param offset The zero-based index of a capture; the entry returned by
     * all() is not counted.
     * @return The capture at the given index or nullptr if offset is
     * negative.  No upper bound check is made.
     */
    capture_t *operator[](int offset) const
    {
        if (offset < 0) {
            return nullptr;
        }
        return &this->pc_captures[offset + 1];
    }

    capture_t *operator[](const char *name) const;

    capture_t *operator[](const std::string &name) const
    {
        return (*this)[name.c_str()];
    }

    /**
     * @return The first capture after the all() entry, among the entries in
     * use, for which is_valid() is true, or nullptr if there is none.
     */
    capture_t *first_valid() const
    {
        for (int lpc = 1; lpc < this->pc_count; lpc++) {
            if (this->pc_captures[lpc].is_valid()) {
                return &this->pc_captures[lpc];
            }
        }

        return nullptr;
    }

protected:
    /**
     * @param captures The storage for the capture array, supplied by the
     * derived class.
     * @param max_count The value to report from get_max_count().
     */
    pcre_context(capture_t *captures, int max_count)
        : pc_pcre(nullptr),
          pc_captures(captures),
          pc_max_count(max_count),
          pc_count(0)
    {
    }

    const class pcrepp *pc_pcre;
    capture_t *pc_captures;
    int pc_max_count;
    int pc_count;
};
166
167 struct capture_if_not {
capture_if_notcapture_if_not168 capture_if_not(int begin) : cin_begin(begin) { };
169
operator ()capture_if_not170 bool operator()(const pcre_context::capture_t &cap) const
171 {
172 return cap.c_begin != this->cin_begin;
173 }
174
175 int cin_begin;
176 };
177
178 inline
skip_invalid_captures(pcre_context::iterator iter,pcre_context::iterator pc_end)179 pcre_context::iterator skip_invalid_captures(pcre_context::iterator iter,
180 pcre_context::iterator pc_end)
181 {
182 for (; iter != pc_end; ++iter) {
183 if (iter->c_begin == -1) {
184 continue;
185 }
186 }
187
188 return iter;
189 }
190
191 /**
192 * A pcre_context that allocates storage for the capture array within the object
193 * itself.
194 */
195 template<size_t MAX_COUNT>
196 class pcre_context_static : public pcre_context {
197 public:
pcre_context_static()198 pcre_context_static()
199 : pcre_context(this->pc_match_buffer, MAX_COUNT + 1) { };
200
201 private:
202 capture_t pc_match_buffer[MAX_COUNT + 1];
203 };
204
205 /**
206 *
207 */
208 class pcre_input {
209 public:
pcre_input(const char * str,size_t off=0,size_t len=-1)210 pcre_input(const char *str, size_t off = 0, size_t len = -1)
211 : pi_offset(off),
212 pi_next_offset(off),
213 pi_length(len),
214 pi_string(str)
215 {
216 if (this->pi_length == (size_t)-1) {
217 this->pi_length = strlen(str);
218 }
219 };
220
pcre_input(const string_fragment & s)221 pcre_input(const string_fragment &s)
222 : pi_offset(0),
223 pi_next_offset(0),
224 pi_length(s.length()),
225 pi_string(s.data()) {};
226
227 pcre_input(const string_fragment &&) = delete;
228
pcre_input(const std::string & str,size_t off=0)229 pcre_input(const std::string &str, size_t off = 0)
230 : pi_offset(off),
231 pi_next_offset(off),
232 pi_length(str.length()),
233 pi_string(str.c_str()) {};
234
235 pcre_input(const std::string &&, size_t off = 0) = delete;
236
get_string() const237 const char *get_string() const { return this->pi_string; };
238
get_substr_start(pcre_context::const_iterator iter) const239 const char *get_substr_start(pcre_context::const_iterator iter) const
240 {
241 return &this->pi_string[iter->c_begin];
242 };
243
get_substr_len(pcre_context::const_iterator iter) const244 size_t get_substr_len(pcre_context::const_iterator iter) const
245 {
246 return iter->length();
247 };
248
get_substr(pcre_context::const_iterator iter) const249 std::string get_substr(pcre_context::const_iterator iter) const
250 {
251 if (iter->c_begin == -1) {
252 return "";
253 }
254 return std::string(&this->pi_string[iter->c_begin],
255 iter->length());
256 };
257
get_substr_i(pcre_context::const_iterator iter) const258 intern_string_t get_substr_i(pcre_context::const_iterator iter) const {
259 return intern_string::lookup(&this->pi_string[iter->c_begin], iter->length());
260 };
261
get_substr_opt(pcre_context::const_iterator iter) const262 nonstd::optional<std::string> get_substr_opt(pcre_context::const_iterator iter) const {
263 if (iter->is_valid()) {
264 return std::string(&this->pi_string[iter->c_begin], iter->length());
265 }
266
267 return nonstd::nullopt;
268 }
269
get_substr(pcre_context::const_iterator iter,char * dst) const270 void get_substr(pcre_context::const_iterator iter, char *dst) const {
271 memcpy(dst, &this->pi_string[iter->c_begin], iter->length());
272 dst[iter->length()] = '\0';
273 };
274
reset_next_offset()275 void reset_next_offset() {
276 this->pi_next_offset = this->pi_offset;
277 };
278
reset(const char * str,size_t off=0,size_t len=-1)279 void reset(const char *str, size_t off = 0, size_t len = -1)
280 {
281 this->pi_string = str;
282 this->pi_offset = off;
283 this->pi_next_offset = off;
284 if (this->pi_length == (size_t)-1) {
285 this->pi_length = strlen(str);
286 }
287 else {
288 this->pi_length = len;
289 }
290 }
291
reset(const std::string & str,size_t off=0)292 void reset(const std::string &str, size_t off = 0)
293 {
294 this->reset(str.c_str(), off, str.length());
295 };
296
297 size_t pi_offset;
298 size_t pi_next_offset;
299 size_t pi_length;
300 private:
301 const char *pi_string;
302 };
303
/**
 * Overlay for one entry in a table of named captures.  Each entry is a
 * two-byte capture number, most-significant byte first, followed by the name
 * of the capture.
 */
struct pcre_named_capture {
    /**
     * Forward iterator over a table of entries that are all the same size.
     */
    class iterator {
    public:
        /**
         * @param pnc The entry to start at.
         * @param name_len The number of bytes to step by to reach the next
         * entry.
         */
        iterator(pcre_named_capture *pnc, size_t name_len)
            : i_named_capture(pnc), i_name_len(name_len)
        {
        }

        iterator() : i_named_capture(nullptr), i_name_len(0) {}

        const pcre_named_capture &operator*() const
        {
            return *this->i_named_capture;
        }

        const pcre_named_capture *operator->() const
        {
            return this->i_named_capture;
        }

        /** Only the positions are compared, not the step sizes. */
        bool operator!=(const iterator &rhs) const
        {
            return this->i_named_capture != rhs.i_named_capture;
        }

        /** Step forward by i_name_len bytes. */
        iterator &operator++()
        {
            char *ptr = reinterpret_cast<char *>(this->i_named_capture);

            ptr += this->i_name_len;
            this->i_named_capture
                = reinterpret_cast<pcre_named_capture *>(ptr);
            return *this;
        }

    private:
        pcre_named_capture *i_named_capture;
        size_t i_name_len;
    };

    /**
     * @return The capture number stored in the first two bytes of the entry,
     * less one.
     */
    int index() const
    {
        // Widen through unsigned char so that a byte >= 0x80 is not
        // sign-extended where plain char is signed, which would corrupt the
        // combined value.
        const int msb = static_cast<unsigned char>(this->pnc_index_msb);
        const int lsb = static_cast<unsigned char>(this->pnc_index_lsb);

        return ((msb << 8) | lsb) - 1;
    }

    char pnc_index_msb;
    char pnc_index_lsb;
    char pnc_name[];
};
347
348 struct pcre_extractor {
349 const pcre_context &pe_context;
350 const pcre_input &pe_input;
351
352 template<typename T>
get_substr_ipcre_extractor353 intern_string_t get_substr_i(T name) const {
354 return this->pe_input.get_substr_i(this->pe_context[name]);
355 };
356
357 template<typename T>
get_substrpcre_extractor358 std::string get_substr(T name) const {
359 return this->pe_input.get_substr(this->pe_context[name]);
360 };
361 };
362
363 class pcrepp {
364 public:
365 class error : public std::exception {
366 public:
error(std::string msg,int offset=0)367 error(std::string msg, int offset = 0)
368 : e_msg(std::move(msg)), e_offset(offset) { };
369
what() const370 const char *what() const noexcept override {
371 return this->e_msg.c_str();
372 };
373
374 const std::string e_msg;
375 int e_offset;
376 };
377
378 static std::string quote(const char *unquoted);
379
quote(const std::string & unquoted)380 static std::string quote(const std::string& unquoted) {
381 return quote(unquoted.c_str());
382 }
383
384 struct compile_error {
385 const char *ce_msg;
386 int ce_offset;
387 };
388
389 static Result<pcrepp, compile_error> from_str(std::string pattern, int options = 0);
390
pcrepp(pcre * code)391 pcrepp(pcre *code) : p_code(code), p_code_extra(pcre_free_study)
392 {
393 pcre_refcount(this->p_code, 1);
394 this->study();
395 };
396
pcrepp(std::string pattern,pcre * code)397 pcrepp(std::string pattern, pcre *code)
398 : p_code(code),
399 p_pattern(std::move(pattern)),
400 p_code_extra(pcre_free_study)
401 {
402 pcre_refcount(this->p_code, 1);
403 this->study();
404 this->find_captures(this->p_pattern.c_str());
405 };
406
pcrepp(const char * pattern,int options=0)407 explicit pcrepp(const char *pattern, int options = 0)
408 : p_pattern(pattern), p_code_extra(pcre_free_study)
409 {
410 const char *errptr;
411 int eoff;
412
413 if ((this->p_code = pcre_compile(pattern,
414 options,
415 &errptr,
416 &eoff,
417 nullptr)) == nullptr) {
418 throw error(errptr, eoff);
419 }
420
421 pcre_refcount(this->p_code, 1);
422 this->study();
423 this->find_captures(pattern);
424 };
425
pcrepp(const std::string & pattern,int options=0)426 explicit pcrepp(const std::string &pattern, int options = 0)
427 : p_pattern(pattern), p_code_extra(pcre_free_study)
428 {
429 const char *errptr;
430 int eoff;
431
432 if ((this->p_code = pcre_compile(pattern.c_str(),
433 options | PCRE_UTF8,
434 &errptr,
435 &eoff,
436 nullptr)) == nullptr) {
437 throw error(errptr, eoff);
438 }
439
440 pcre_refcount(this->p_code, 1);
441 this->study();
442 this->find_captures(pattern.c_str());
443 };
444
pcrepp()445 pcrepp() {
446 }
447
pcrepp(const pcrepp & other)448 pcrepp(const pcrepp &other)
449 : p_code(other.p_code),
450 p_pattern(other.p_pattern),
451 p_code_extra(pcre_free_study),
452 p_captures(other.p_captures)
453 {
454 pcre_refcount(this->p_code, 1);
455 this->study();
456 };
457
pcrepp(pcrepp && other)458 pcrepp(pcrepp &&other)
459 : p_code(other.p_code),
460 p_pattern(std::move(other.p_pattern)),
461 p_code_extra(pcre_free_study),
462 p_capture_count(other.p_capture_count),
463 p_named_count(other.p_named_count),
464 p_name_len(other.p_name_len),
465 p_options(other.p_options),
466 p_named_entries(other.p_named_entries),
467 p_captures(std::move(other.p_captures)) {
468 pcre_refcount(this->p_code, 1);
469 this->p_code_extra = std::move(other.p_code_extra);
470 }
471
~pcrepp()472 virtual ~pcrepp()
473 {
474 this->clear();
475 };
476
operator =(pcrepp && other)477 pcrepp& operator=(pcrepp&& other) noexcept {
478 if (this == &other) {
479 return *this;
480 }
481
482 this->p_code = other.p_code;
483 pcre_refcount(this->p_code, 1);
484 this->p_pattern = std::move(other.p_pattern);
485 this->p_code_extra = std::move(other.p_code_extra);
486 this->p_capture_count = other.p_capture_count;
487 this->p_named_count = other.p_named_count;
488 this->p_name_len = other.p_name_len;
489 this->p_options = other.p_options;
490 this->p_named_entries = other.p_named_entries;
491 this->p_captures = std::move(other.p_captures);
492
493 return *this;
494 }
495
get_pattern() const496 const std::string& get_pattern() const {
497 return this->p_pattern;
498 }
499
empty() const500 bool empty() const {
501 return this->p_pattern.empty();
502 }
503
clear()504 void clear() {
505 if (this->p_code && pcre_refcount(this->p_code, -1) == 0) {
506 free(this->p_code);
507 this->p_code = nullptr;
508 }
509 this->p_pattern.clear();
510 this->p_code_extra.reset();
511 this->p_capture_count = 0;
512 this->p_named_count = 0;
513 this->p_name_len = 0;
514 this->p_options = 0;
515 this->p_named_entries = nullptr;
516 this->p_captures.clear();
517 }
518
named_begin() const519 pcre_named_capture::iterator named_begin() const {
520 return {this->p_named_entries, static_cast<size_t>(this->p_name_len)};
521 };
522
named_end() const523 pcre_named_capture::iterator named_end() const {
524 char *ptr = (char *)this->p_named_entries;
525
526 ptr += this->p_named_count * this->p_name_len;
527 return {(pcre_named_capture *)ptr,
528 static_cast<size_t>(this->p_name_len)};
529 };
530
captures() const531 const std::vector<pcre_context::capture> &captures() const {
532 return this->p_captures;
533 };
534
cap_begin() const535 std::vector<pcre_context::capture>::const_iterator cap_begin() const {
536 return this->p_captures.begin();
537 };
538
cap_end() const539 std::vector<pcre_context::capture>::const_iterator cap_end() const {
540 return this->p_captures.end();
541 };
542
name_index(const std::string & name) const543 int name_index(const std::string &name) const {
544 return this->name_index(name.c_str());
545 };
546
name_index(const char * name) const547 int name_index(const char *name) const {
548 int retval = pcre_get_stringnumber(this->p_code, name);
549
550 if (retval == PCRE_ERROR_NOSUBSTRING) {
551 return retval;
552 }
553
554 return retval - 1;
555 };
556
name_for_capture(int index) const557 const char *name_for_capture(int index) const {
558 for (pcre_named_capture::iterator iter = this->named_begin();
559 iter != this->named_end();
560 ++iter) {
561 if (iter->index() == index) {
562 return iter->pnc_name;
563 }
564 }
565 return "";
566 };
567
get_capture_count() const568 int get_capture_count() const {
569 return this->p_capture_count;
570 };
571
572 bool match(pcre_context &pc, pcre_input &pi, int options = 0) const;
573
574 template<size_t MATCH_COUNT>
match(pcre_input & pi,int options=0) const575 nonstd::optional<pcre_context_static<MATCH_COUNT>> match(pcre_input &pi, int options = 0) const {
576 pcre_context_static<MATCH_COUNT> pc;
577
578 if (this->match(pc, pi, options)) {
579 return pc;
580 }
581
582 return nonstd::nullopt;
583 }
584
585 std::string replace(const char *str, const char *repl) const;
586
match_partial(pcre_input & pi) const587 size_t match_partial(pcre_input &pi) const {
588 size_t length = pi.pi_length;
589 int rc;
590
591 do {
592 rc = pcre_exec(this->p_code,
593 this->p_code_extra.in(),
594 pi.get_string(),
595 length,
596 pi.pi_offset,
597 PCRE_PARTIAL,
598 nullptr,
599 0);
600 switch (rc) {
601 case 0:
602 case PCRE_ERROR_PARTIAL:
603 return length;
604 }
605 length -= 1;
606 } while (length > 0);
607
608 return length;
609 };
610
611 // #undef PCRE_STUDY_JIT_COMPILE
612 #ifdef PCRE_STUDY_JIT_COMPILE
613 static pcre_jit_stack *jit_stack();
614
615 #else
616 static void pcre_free_study(pcre_extra *);
617 #endif
618
619 void study();
620
621 void find_captures(const char *pattern);
622
623 pcre *p_code{nullptr};
624 std::string p_pattern;
625 auto_mem<pcre_extra> p_code_extra;
626 int p_capture_count{0};
627 int p_named_count{0};
628 int p_name_len{0};
629 unsigned long p_options{0};
630 pcre_named_capture *p_named_entries{nullptr};
631 std::vector<pcre_context::capture> p_captures;
632 };
633
634 #endif
635