1 /**
2  * Copyright (c) 2007-2013, Timothy Stack
3  *
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *
9  * * Redistributions of source code must retain the above copyright notice, this
10  * list of conditions and the following disclaimer.
11  * * Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  * * Neither the name of Timothy Stack nor the names of its contributors
15  * may be used to endorse or promote products derived from this software
16  * without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY
19  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21  * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
22  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25  * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  *
29  * @file pcrepp.hh
30  *
31  * A C++ adapter for the pcre library.  The interface provided here has a
32  * different focus than the pcrecpp.h file included in the pcre distribution.
33  * The standard pcrecpp.h interface is more concerned with regular expressions
 * that are digesting data to be used within the program itself, whereas this
 * interface deals with regular expressions entered by the user and with
 * processing a series of matches on text files.
37  */
38 
39 #ifndef pcrepp_hh
40 #define pcrepp_hh
41 
42 #ifdef HAVE_PCRE_H
43 #include <pcre.h>
44 #elif HAVE_PCRE_PCRE_H
45 #include <pcre/pcre.h>
46 #else
47 #error "pcre.h not found?"
48 #endif
49 
50 #include <string.h>
51 
52 #include <cassert>
53 #include <string>
54 #include <memory>
55 #include <utility>
56 #include <vector>
57 #include <exception>
58 
59 #include "auto_mem.hh"
60 #include "base/intern_string.hh"
61 #include "base/result.h"
62 
63 #include <stdio.h>
64 
65 class pcrepp;
66 
67 /**
68  * Context that tracks captures found during a match operation.  This class is a
69  * base that defines iterator methods and fields, but does not allocate space
70  * for the capture array.
71  */
class pcre_context {
public:
    /**
     * A pair of offsets, [c_begin, c_end), into the string that was matched.
     * A capture whose c_begin is -1 is treated as invalid (see is_valid()).
     */
    typedef struct capture {
        capture() {
            /* We don't initialize anything since it's a perf hit. */
        }

        /**
         * @param begin Offset of the first character in the capture.
         * @param end Offset just past the last character in the capture.
         */
        capture(int begin, int end) : c_begin(begin), c_end(end)
        {
            assert(begin <= end);
        }

        int c_begin;
        int c_end;

        /**
         * Advance c_begin past any leading whitespace in the capture.
         *
         * @param str The string that the offsets in this capture refer to.
         */
        void ltrim(const char *str) {
            // isspace() is only defined for values representable as an
            // unsigned char (or EOF), so plain char values must be converted
            // before the call or bytes >= 0x80 invoke undefined behavior.
            while (this->c_begin < this->c_end &&
                   isspace(static_cast<unsigned char>(str[this->c_begin]))) {
                this->c_begin += 1;
            }
        }

        /** @return True if pos falls inside [c_begin, c_end). */
        bool contains(int pos) const {
            return this->c_begin <= pos && pos < this->c_end;
        }

        /** @return True if c_begin is something other than -1. */
        bool is_valid() const { return this->c_begin != -1; }

        /** @return The number of characters covered by this capture. */
        int length() const { return this->c_end - this->c_begin; }

        /** @return True if the capture covers zero characters. */
        bool empty() const { return this->c_begin == this->c_end; }
    } capture_t;
    typedef capture_t       *iterator;
    typedef const capture_t *const_iterator;

    /** @return The maximum number of strings this context can capture. */
    int get_max_count() const
    {
        return this->pc_max_count;
    }

    /** Set the number of entries of the capture array that are in use. */
    void set_count(int count)
    {
        this->pc_count = count;
    }

    /** @return The number of entries of the capture array that are in use. */
    int get_count() const
    {
        return this->pc_count;
    }

    /**
     * Remember the regular expression this context is being used with.  The
     * elaborated type specifier lets this class be compiled without the full
     * definition of pcrepp being visible.
     */
    void set_pcrepp(const class pcrepp *src) { this->pc_pcre = src; }

    /**
     * @return a capture_t that covers all of the text that was matched.
     */
    capture_t *all() const { return pc_captures; }

    /** @return An iterator to the first capture. */
    iterator begin() { return pc_captures + 1; }
    /** @return An iterator that refers to the end of the capture array. */
    iterator end() { return pc_captures + pc_count; }

    /**
     * @param offset The zero-based index of a capture group.  Entry zero of
     * the underlying array is the whole match, so the lookup is shifted by
     * one.  No upper-bound check is done.
     * @return The capture at the given index or nullptr if offset is negative.
     */
    capture_t *operator[](int offset) const {
        if (offset < 0) {
            return nullptr;
        }
        return &this->pc_captures[offset + 1];
    }

    /** Look up a capture by name; defined outside of this header. */
    capture_t *operator[](const char *name) const;

    capture_t *operator[](const std::string &name) const {
        return (*this)[name.c_str()];
    }

    /**
     * @return The first capture after the whole-match entry for which
     * is_valid() is true, or nullptr if there is none.
     */
    capture_t *first_valid() const {
        for (int lpc = 1; lpc < this->pc_count; lpc++) {
            if (this->pc_captures[lpc].is_valid()) {
                return &this->pc_captures[lpc];
            }
        }

        return nullptr;
    }

protected:
    /**
     * @param captures Storage for the capture array, owned by the subclass.
     * @param max_count The number of elements in the captures array.
     */
    pcre_context(capture_t *captures, int max_count)
        : pc_pcre(nullptr), pc_captures(captures), pc_max_count(max_count), pc_count(0) { }

    const class pcrepp *pc_pcre;
    capture_t *pc_captures;
    int        pc_max_count;
    int        pc_count;
};
166 
167 struct capture_if_not {
capture_if_notcapture_if_not168     capture_if_not(int begin) : cin_begin(begin) { };
169 
operator ()capture_if_not170     bool operator()(const pcre_context::capture_t &cap) const
171     {
172         return cap.c_begin != this->cin_begin;
173     }
174 
175     int cin_begin;
176 };
177 
178 inline
skip_invalid_captures(pcre_context::iterator iter,pcre_context::iterator pc_end)179 pcre_context::iterator skip_invalid_captures(pcre_context::iterator iter,
180                                              pcre_context::iterator pc_end)
181 {
182     for (; iter != pc_end; ++iter) {
183         if (iter->c_begin == -1) {
184             continue;
185         }
186     }
187 
188     return iter;
189 }
190 
191 /**
192  * A pcre_context that allocates storage for the capture array within the object
193  * itself.
194  */
195 template<size_t MAX_COUNT>
196 class pcre_context_static : public pcre_context {
197 public:
pcre_context_static()198     pcre_context_static()
199         : pcre_context(this->pc_match_buffer, MAX_COUNT + 1) { };
200 
201 private:
202     capture_t pc_match_buffer[MAX_COUNT + 1];
203 };
204 
205 /**
206  *
207  */
208 class pcre_input {
209 public:
pcre_input(const char * str,size_t off=0,size_t len=-1)210     pcre_input(const char *str, size_t off = 0, size_t len = -1)
211         : pi_offset(off),
212           pi_next_offset(off),
213           pi_length(len),
214           pi_string(str)
215     {
216         if (this->pi_length == (size_t)-1) {
217             this->pi_length = strlen(str);
218         }
219     };
220 
pcre_input(const string_fragment & s)221     pcre_input(const string_fragment &s)
222         : pi_offset(0),
223           pi_next_offset(0),
224           pi_length(s.length()),
225           pi_string(s.data()) {};
226 
227     pcre_input(const string_fragment &&) = delete;
228 
pcre_input(const std::string & str,size_t off=0)229     pcre_input(const std::string &str, size_t off = 0)
230         : pi_offset(off),
231           pi_next_offset(off),
232           pi_length(str.length()),
233           pi_string(str.c_str()) {};
234 
235     pcre_input(const std::string &&, size_t off = 0) = delete;
236 
get_string() const237     const char *get_string() const { return this->pi_string; };
238 
get_substr_start(pcre_context::const_iterator iter) const239     const char *get_substr_start(pcre_context::const_iterator iter) const
240     {
241         return &this->pi_string[iter->c_begin];
242     };
243 
get_substr_len(pcre_context::const_iterator iter) const244     size_t get_substr_len(pcre_context::const_iterator iter) const
245     {
246         return iter->length();
247     };
248 
get_substr(pcre_context::const_iterator iter) const249     std::string get_substr(pcre_context::const_iterator iter) const
250     {
251         if (iter->c_begin == -1) {
252             return "";
253         }
254         return std::string(&this->pi_string[iter->c_begin],
255                            iter->length());
256     };
257 
get_substr_i(pcre_context::const_iterator iter) const258     intern_string_t get_substr_i(pcre_context::const_iterator iter) const {
259         return intern_string::lookup(&this->pi_string[iter->c_begin], iter->length());
260     };
261 
get_substr_opt(pcre_context::const_iterator iter) const262     nonstd::optional<std::string> get_substr_opt(pcre_context::const_iterator iter) const {
263         if (iter->is_valid()) {
264             return std::string(&this->pi_string[iter->c_begin], iter->length());
265         }
266 
267         return nonstd::nullopt;
268     }
269 
get_substr(pcre_context::const_iterator iter,char * dst) const270     void get_substr(pcre_context::const_iterator iter, char *dst) const {
271         memcpy(dst, &this->pi_string[iter->c_begin], iter->length());
272         dst[iter->length()] = '\0';
273     };
274 
reset_next_offset()275     void reset_next_offset() {
276         this->pi_next_offset = this->pi_offset;
277     };
278 
reset(const char * str,size_t off=0,size_t len=-1)279     void reset(const char *str, size_t off = 0, size_t len = -1)
280     {
281         this->pi_string      = str;
282         this->pi_offset      = off;
283         this->pi_next_offset = off;
284         if (this->pi_length == (size_t)-1) {
285             this->pi_length = strlen(str);
286         }
287         else {
288             this->pi_length = len;
289         }
290     }
291 
reset(const std::string & str,size_t off=0)292     void reset(const std::string &str, size_t off = 0)
293     {
294         this->reset(str.c_str(), off, str.length());
295     };
296 
297     size_t pi_offset;
298     size_t pi_next_offset;
299     size_t pi_length;
300 private:
301     const char *pi_string;
302 };
303 
/**
 * Overlay for one entry in a table of named captures.  Each entry is a
 * two-byte, most-significant-byte-first capture number followed by the name.
 * Entries are a fixed number of bytes apart, and that stride is supplied by
 * whoever constructs the iterator.
 */
struct pcre_named_capture {
    /** Forward iterator that steps through the table one entry at a time. */
    class iterator {
    public:
        /**
         * @param pnc The entry to start at.
         * @param name_len The size of a table entry in bytes.
         */
        iterator(pcre_named_capture *pnc, size_t name_len)
            : i_named_capture(pnc), i_name_len(name_len)
        {
        }

        iterator() : i_named_capture(nullptr), i_name_len(0) { }

        const pcre_named_capture &operator*() const {
            return *this->i_named_capture;
        }

        const pcre_named_capture *operator->() const {
            return this->i_named_capture;
        }

        bool operator!=(const iterator &rhs) const {
            return this->i_named_capture != rhs.i_named_capture;
        }

        /** Step forward by one table entry. */
        iterator &operator++() {
            char *ptr = reinterpret_cast<char *>(this->i_named_capture);

            ptr += this->i_name_len;
            this->i_named_capture = reinterpret_cast<pcre_named_capture *>(ptr);
            return *this;
        }

    private:
        pcre_named_capture *i_named_capture;
        size_t i_name_len;
    };

    /**
     * @return The zero-based index of the capture group, which is the
     * capture number stored in the entry minus one.
     */
    int index() const {
        // Plain char may be signed, so go through unsigned char to keep
        // bytes >= 0x80 from being sign-extended into a negative number.
        const int msb = static_cast<unsigned char>(this->pnc_index_msb);
        const int lsb = static_cast<unsigned char>(this->pnc_index_lsb);

        return ((msb << 8) | lsb) - 1;
    }

    char pnc_index_msb;
    char pnc_index_lsb;
    char pnc_name[];
};
347 
348 struct pcre_extractor {
349     const pcre_context &pe_context;
350     const pcre_input &pe_input;
351 
352     template<typename T>
get_substr_ipcre_extractor353     intern_string_t get_substr_i(T name) const {
354         return this->pe_input.get_substr_i(this->pe_context[name]);
355     };
356 
357     template<typename T>
get_substrpcre_extractor358     std::string get_substr(T name) const {
359         return this->pe_input.get_substr(this->pe_context[name]);
360     };
361 };
362 
363 class pcrepp {
364 public:
365     class error : public std::exception {
366 public:
error(std::string msg,int offset=0)367         error(std::string msg, int offset = 0)
368             : e_msg(std::move(msg)), e_offset(offset) { };
369 
what() const370         const char *what() const noexcept override {
371             return this->e_msg.c_str();
372         };
373 
374         const std::string e_msg;
375         int e_offset;
376     };
377 
378     static std::string quote(const char *unquoted);
379 
quote(const std::string & unquoted)380     static std::string quote(const std::string& unquoted) {
381         return quote(unquoted.c_str());
382     }
383 
384     struct compile_error {
385         const char *ce_msg;
386         int ce_offset;
387     };
388 
389     static Result<pcrepp, compile_error> from_str(std::string pattern, int options = 0);
390 
pcrepp(pcre * code)391     pcrepp(pcre *code) : p_code(code), p_code_extra(pcre_free_study)
392     {
393         pcre_refcount(this->p_code, 1);
394         this->study();
395     };
396 
pcrepp(std::string pattern,pcre * code)397     pcrepp(std::string pattern, pcre *code)
398         : p_code(code),
399           p_pattern(std::move(pattern)),
400           p_code_extra(pcre_free_study)
401     {
402         pcre_refcount(this->p_code, 1);
403         this->study();
404         this->find_captures(this->p_pattern.c_str());
405     };
406 
pcrepp(const char * pattern,int options=0)407     explicit pcrepp(const char *pattern, int options = 0)
408             : p_pattern(pattern), p_code_extra(pcre_free_study)
409     {
410         const char *errptr;
411         int         eoff;
412 
413         if ((this->p_code = pcre_compile(pattern,
414                                          options,
415                                          &errptr,
416                                          &eoff,
417                                          nullptr)) == nullptr) {
418             throw error(errptr, eoff);
419         }
420 
421         pcre_refcount(this->p_code, 1);
422         this->study();
423         this->find_captures(pattern);
424     };
425 
pcrepp(const std::string & pattern,int options=0)426     explicit pcrepp(const std::string &pattern, int options = 0)
427             : p_pattern(pattern), p_code_extra(pcre_free_study)
428     {
429         const char *errptr;
430         int         eoff;
431 
432         if ((this->p_code = pcre_compile(pattern.c_str(),
433                                          options | PCRE_UTF8,
434                                          &errptr,
435                                          &eoff,
436                                          nullptr)) == nullptr) {
437             throw error(errptr, eoff);
438         }
439 
440         pcre_refcount(this->p_code, 1);
441         this->study();
442         this->find_captures(pattern.c_str());
443     };
444 
pcrepp()445     pcrepp() {
446     }
447 
pcrepp(const pcrepp & other)448     pcrepp(const pcrepp &other)
449         : p_code(other.p_code),
450           p_pattern(other.p_pattern),
451           p_code_extra(pcre_free_study),
452           p_captures(other.p_captures)
453     {
454         pcre_refcount(this->p_code, 1);
455         this->study();
456     };
457 
pcrepp(pcrepp && other)458     pcrepp(pcrepp &&other)
459         : p_code(other.p_code),
460           p_pattern(std::move(other.p_pattern)),
461           p_code_extra(pcre_free_study),
462           p_capture_count(other.p_capture_count),
463           p_named_count(other.p_named_count),
464           p_name_len(other.p_name_len),
465           p_options(other.p_options),
466           p_named_entries(other.p_named_entries),
467           p_captures(std::move(other.p_captures)) {
468         pcre_refcount(this->p_code, 1);
469         this->p_code_extra = std::move(other.p_code_extra);
470     }
471 
~pcrepp()472     virtual ~pcrepp()
473     {
474         this->clear();
475     };
476 
operator =(pcrepp && other)477     pcrepp& operator=(pcrepp&& other) noexcept {
478         if (this == &other) {
479             return *this;
480         }
481 
482         this->p_code = other.p_code;
483         pcre_refcount(this->p_code, 1);
484         this->p_pattern = std::move(other.p_pattern);
485         this->p_code_extra = std::move(other.p_code_extra);
486         this->p_capture_count = other.p_capture_count;
487         this->p_named_count = other.p_named_count;
488         this->p_name_len = other.p_name_len;
489         this->p_options = other.p_options;
490         this->p_named_entries = other.p_named_entries;
491         this->p_captures = std::move(other.p_captures);
492 
493         return *this;
494     }
495 
get_pattern() const496     const std::string& get_pattern() const {
497         return this->p_pattern;
498     }
499 
empty() const500     bool empty() const {
501         return this->p_pattern.empty();
502     }
503 
clear()504     void clear() {
505         if (this->p_code && pcre_refcount(this->p_code, -1) == 0) {
506             free(this->p_code);
507             this->p_code = nullptr;
508         }
509         this->p_pattern.clear();
510         this->p_code_extra.reset();
511         this->p_capture_count = 0;
512         this->p_named_count = 0;
513         this->p_name_len = 0;
514         this->p_options = 0;
515         this->p_named_entries = nullptr;
516         this->p_captures.clear();
517     }
518 
named_begin() const519     pcre_named_capture::iterator named_begin() const {
520         return {this->p_named_entries, static_cast<size_t>(this->p_name_len)};
521     };
522 
named_end() const523     pcre_named_capture::iterator named_end() const {
524         char *ptr = (char *)this->p_named_entries;
525 
526         ptr += this->p_named_count * this->p_name_len;
527         return {(pcre_named_capture *)ptr,
528                 static_cast<size_t>(this->p_name_len)};
529     };
530 
captures() const531     const std::vector<pcre_context::capture> &captures() const {
532         return this->p_captures;
533     };
534 
cap_begin() const535     std::vector<pcre_context::capture>::const_iterator cap_begin() const {
536         return this->p_captures.begin();
537     };
538 
cap_end() const539     std::vector<pcre_context::capture>::const_iterator cap_end() const {
540         return this->p_captures.end();
541     };
542 
name_index(const std::string & name) const543     int name_index(const std::string &name) const {
544         return this->name_index(name.c_str());
545     };
546 
name_index(const char * name) const547     int name_index(const char *name) const {
548         int retval = pcre_get_stringnumber(this->p_code, name);
549 
550         if (retval == PCRE_ERROR_NOSUBSTRING) {
551             return retval;
552         }
553 
554         return retval - 1;
555     };
556 
name_for_capture(int index) const557     const char *name_for_capture(int index) const {
558         for (pcre_named_capture::iterator iter = this->named_begin();
559              iter != this->named_end();
560              ++iter) {
561             if (iter->index() == index) {
562                 return iter->pnc_name;
563             }
564         }
565         return "";
566     };
567 
get_capture_count() const568     int get_capture_count() const {
569         return this->p_capture_count;
570     };
571 
572     bool match(pcre_context &pc, pcre_input &pi, int options = 0) const;
573 
574     template<size_t MATCH_COUNT>
match(pcre_input & pi,int options=0) const575     nonstd::optional<pcre_context_static<MATCH_COUNT>> match(pcre_input &pi, int options = 0) const {
576         pcre_context_static<MATCH_COUNT> pc;
577 
578         if (this->match(pc, pi, options)) {
579             return pc;
580         }
581 
582         return nonstd::nullopt;
583     }
584 
585     std::string replace(const char *str, const char *repl) const;
586 
match_partial(pcre_input & pi) const587     size_t match_partial(pcre_input &pi) const {
588         size_t length = pi.pi_length;
589         int rc;
590 
591         do {
592             rc = pcre_exec(this->p_code,
593                            this->p_code_extra.in(),
594                            pi.get_string(),
595                            length,
596                            pi.pi_offset,
597                            PCRE_PARTIAL,
598                            nullptr,
599                            0);
600             switch (rc) {
601                 case 0:
602                 case PCRE_ERROR_PARTIAL:
603                     return length;
604             }
605             length -= 1;
606         } while (length > 0);
607 
608         return length;
609     };
610 
611 // #undef PCRE_STUDY_JIT_COMPILE
612 #ifdef PCRE_STUDY_JIT_COMPILE
613     static pcre_jit_stack *jit_stack();
614 
615 #else
616     static void pcre_free_study(pcre_extra *);
617 #endif
618 
619     void study();
620 
621     void find_captures(const char *pattern);
622 
623     pcre *p_code{nullptr};
624     std::string p_pattern;
625     auto_mem<pcre_extra> p_code_extra;
626     int p_capture_count{0};
627     int p_named_count{0};
628     int p_name_len{0};
629     unsigned long p_options{0};
630     pcre_named_capture *p_named_entries{nullptr};
631     std::vector<pcre_context::capture> p_captures;
632 };
633 
634 #endif
635