1 // Copyright (c) 2010, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 //     * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 //     * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 //     * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // Author: Sanjay Ghemawat
31 
32 #ifdef HAVE_CONFIG_H
33 #include "config.h"
34 #endif
35 
36 #include <stdlib.h>
37 #include <stdio.h>
38 #include <ctype.h>
39 #include <limits.h>      /* for SHRT_MIN, USHRT_MAX, etc */
40 #include <string.h>      /* for memcpy */
41 #include <assert.h>
42 #include <errno.h>
43 #include <string>
44 #include <algorithm>
45 
46 #include "pcrecpp_internal.h"
47 #include "pcre.h"
48 #include "pcrecpp.h"
49 #include "pcre_stringpiece.h"
50 
51 
52 namespace pcrecpp {
53 
54 // Maximum number of args we can set
55 static const int kMaxArgs = 16;
56 static const int kVecSize = (1 + kMaxArgs) * 3;  // results + PCRE workspace
57 
58 // Special object that stands-in for no argument
59 Arg RE::no_arg((void*)NULL);
60 
61 // This is for ABI compatibility with old versions of pcre (pre-7.6),
62 // which defined a global no_arg variable instead of putting it in the
63 // RE class.  This works on GCC >= 3, at least.  It definitely works
64 // for ELF, but may not for other object formats (Mach-O, for
65 // instance, does not support aliases.)  We could probably have a more
66 // inclusive test if we ever needed it.  (Note that not only the
67 // __attribute__ syntax, but also __USER_LABEL_PREFIX__, are
68 // gnu-specific.)
69 #if defined(__GNUC__) && __GNUC__ >= 3 && defined(__ELF__) \
70        && !defined(__INTEL_COMPILER) && !defined(__LCC__)
71 # define ULP_AS_STRING(x)            ULP_AS_STRING_INTERNAL(x)
72 # define ULP_AS_STRING_INTERNAL(x)   #x
73 # define USER_LABEL_PREFIX_STR       ULP_AS_STRING(__USER_LABEL_PREFIX__)
74 extern Arg no_arg
75   __attribute__((alias(USER_LABEL_PREFIX_STR "_ZN7pcrecpp2RE6no_argE")));
76 #endif
77 
78 // If a regular expression has no error, its error_ field points here
79 static const string empty_string;
80 
81 // If the user doesn't ask for any options, we just use this one
82 static RE_Options default_options;
83 
84 // Specials for the start of patterns. See comments where start_options is used
85 // below. (PH June 2018)
86 static const char *start_options[] = {
87   "(*UTF8)",
88   "(*UTF)",
89   "(*UCP)",
90   "(*NO_START_OPT)",
91   "(*NO_AUTO_POSSESS)",
92   "(*LIMIT_RECURSION=",
93   "(*LIMIT_MATCH=",
94   "(*CRLF)",
95   "(*LF)",
96   "(*CR)",
97   "(*BSR_UNICODE)",
98   "(*BSR_ANYCRLF)",
99   "(*ANYCRLF)",
100   "(*ANY)",
101   "" };
102 
Init(const string & pat,const RE_Options * options)103 void RE::Init(const string& pat, const RE_Options* options) {
104   pattern_ = pat;
105   if (options == NULL) {
106     options_ = default_options;
107   } else {
108     options_ = *options;
109   }
110   error_ = &empty_string;
111   re_full_ = NULL;
112   re_partial_ = NULL;
113 
114   re_partial_ = Compile(UNANCHORED);
115   if (re_partial_ != NULL) {
116     re_full_ = Compile(ANCHOR_BOTH);
117   }
118 }
119 
Cleanup()120 void RE::Cleanup() {
121   if (re_full_ != NULL)         (*pcre_free)(re_full_);
122   if (re_partial_ != NULL)      (*pcre_free)(re_partial_);
123   if (error_ != &empty_string)  delete error_;
124 }
125 
126 
~RE()127 RE::~RE() {
128   Cleanup();
129 }
130 
131 
Compile(Anchor anchor)132 pcre* RE::Compile(Anchor anchor) {
133   // First, convert RE_Options into pcre options
134   int pcre_options = 0;
135   pcre_options = options_.all_options();
136 
137   // Special treatment for anchoring.  This is needed because at
138   // runtime pcre only provides an option for anchoring at the
139   // beginning of a string (unless you use offset).
140   //
141   // There are three types of anchoring we want:
142   //    UNANCHORED      Compile the original pattern, and use
143   //                    a pcre unanchored match.
144   //    ANCHOR_START    Compile the original pattern, and use
145   //                    a pcre anchored match.
146   //    ANCHOR_BOTH     Tack a "\z" to the end of the original pattern
147   //                    and use a pcre anchored match.
148 
149   const char* compile_error;
150   int eoffset;
151   pcre* re;
152   if (anchor != ANCHOR_BOTH) {
153     re = pcre_compile(pattern_.c_str(), pcre_options,
154                       &compile_error, &eoffset, NULL);
155   } else {
156     // Tack a '\z' at the end of RE.  Parenthesize it first so that
157     // the '\z' applies to all top-level alternatives in the regexp.
158 
159     /* When this code was written (for PCRE 6.0) it was enough just to
160     parenthesize the entire pattern. Unfortunately, when the feature of
161     starting patterns with (*UTF8) or (*CR) etc. was added to PCRE patterns,
162     this code was never updated. This bug was not noticed till 2018, long after
163     PCRE became obsolescent and its maintainer no longer around. Since PCRE is
164     frozen, I have added a hack to check for all the existing "start of
165     pattern" specials - knowing that no new ones will ever be added. I am not a
166     C++ programmer, so the code style is no doubt crude. It is also
167     inefficient, but is only run when the pattern starts with "(*".
168     PH June 2018. */
169 
170     string wrapped = "";
171 
172     if (pattern_.c_str()[0] == '(' && pattern_.c_str()[1] == '*') {
173       int kk, klen, kmat;
174       for (;;) {   // Loop for any number of leading items
175 
176         for (kk = 0; start_options[kk][0] != 0; kk++) {
177           klen = strlen(start_options[kk]);
178           kmat = strncmp(pattern_.c_str(), start_options[kk], klen);
179           if (kmat >= 0) break;
180         }
181         if (kmat != 0) break;  // Not found
182 
183         // If the item ended in "=" we must copy digits up to ")".
184 
185         if (start_options[kk][klen-1] == '=') {
186           while (isdigit(pattern_.c_str()[klen])) klen++;
187           if (pattern_.c_str()[klen] != ')') break;  // Syntax error
188           klen++;
189         }
190 
191         // Move the item from the pattern to the start of the wrapped string.
192 
193         wrapped += pattern_.substr(0, klen);
194         pattern_.erase(0, klen);
195       }
196     }
197 
198     // Wrap the rest of the pattern.
199 
200     wrapped += "(?:";  // A non-counting grouping operator
201     wrapped += pattern_;
202     wrapped += ")\\z";
203     re = pcre_compile(wrapped.c_str(), pcre_options,
204                       &compile_error, &eoffset, NULL);
205   }
206   if (re == NULL) {
207     if (error_ == &empty_string) error_ = new string(compile_error);
208   }
209   return re;
210 }
211 
212 /***** Matching interfaces *****/
213 
FullMatch(const StringPiece & text,const Arg & ptr1,const Arg & ptr2,const Arg & ptr3,const Arg & ptr4,const Arg & ptr5,const Arg & ptr6,const Arg & ptr7,const Arg & ptr8,const Arg & ptr9,const Arg & ptr10,const Arg & ptr11,const Arg & ptr12,const Arg & ptr13,const Arg & ptr14,const Arg & ptr15,const Arg & ptr16) const214 bool RE::FullMatch(const StringPiece& text,
215                    const Arg& ptr1,
216                    const Arg& ptr2,
217                    const Arg& ptr3,
218                    const Arg& ptr4,
219                    const Arg& ptr5,
220                    const Arg& ptr6,
221                    const Arg& ptr7,
222                    const Arg& ptr8,
223                    const Arg& ptr9,
224                    const Arg& ptr10,
225                    const Arg& ptr11,
226                    const Arg& ptr12,
227                    const Arg& ptr13,
228                    const Arg& ptr14,
229                    const Arg& ptr15,
230                    const Arg& ptr16) const {
231   const Arg* args[kMaxArgs];
232   int n = 0;
233   if (&ptr1  == &no_arg) { goto done; } args[n++] = &ptr1;
234   if (&ptr2  == &no_arg) { goto done; } args[n++] = &ptr2;
235   if (&ptr3  == &no_arg) { goto done; } args[n++] = &ptr3;
236   if (&ptr4  == &no_arg) { goto done; } args[n++] = &ptr4;
237   if (&ptr5  == &no_arg) { goto done; } args[n++] = &ptr5;
238   if (&ptr6  == &no_arg) { goto done; } args[n++] = &ptr6;
239   if (&ptr7  == &no_arg) { goto done; } args[n++] = &ptr7;
240   if (&ptr8  == &no_arg) { goto done; } args[n++] = &ptr8;
241   if (&ptr9  == &no_arg) { goto done; } args[n++] = &ptr9;
242   if (&ptr10 == &no_arg) { goto done; } args[n++] = &ptr10;
243   if (&ptr11 == &no_arg) { goto done; } args[n++] = &ptr11;
244   if (&ptr12 == &no_arg) { goto done; } args[n++] = &ptr12;
245   if (&ptr13 == &no_arg) { goto done; } args[n++] = &ptr13;
246   if (&ptr14 == &no_arg) { goto done; } args[n++] = &ptr14;
247   if (&ptr15 == &no_arg) { goto done; } args[n++] = &ptr15;
248   if (&ptr16 == &no_arg) { goto done; } args[n++] = &ptr16;
249  done:
250 
251   int consumed;
252   int vec[kVecSize];
253   return DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize);
254 }
255 
PartialMatch(const StringPiece & text,const Arg & ptr1,const Arg & ptr2,const Arg & ptr3,const Arg & ptr4,const Arg & ptr5,const Arg & ptr6,const Arg & ptr7,const Arg & ptr8,const Arg & ptr9,const Arg & ptr10,const Arg & ptr11,const Arg & ptr12,const Arg & ptr13,const Arg & ptr14,const Arg & ptr15,const Arg & ptr16) const256 bool RE::PartialMatch(const StringPiece& text,
257                       const Arg& ptr1,
258                       const Arg& ptr2,
259                       const Arg& ptr3,
260                       const Arg& ptr4,
261                       const Arg& ptr5,
262                       const Arg& ptr6,
263                       const Arg& ptr7,
264                       const Arg& ptr8,
265                       const Arg& ptr9,
266                       const Arg& ptr10,
267                       const Arg& ptr11,
268                       const Arg& ptr12,
269                       const Arg& ptr13,
270                       const Arg& ptr14,
271                       const Arg& ptr15,
272                       const Arg& ptr16) const {
273   const Arg* args[kMaxArgs];
274   int n = 0;
275   if (&ptr1  == &no_arg) { goto done; } args[n++] = &ptr1;
276   if (&ptr2  == &no_arg) { goto done; } args[n++] = &ptr2;
277   if (&ptr3  == &no_arg) { goto done; } args[n++] = &ptr3;
278   if (&ptr4  == &no_arg) { goto done; } args[n++] = &ptr4;
279   if (&ptr5  == &no_arg) { goto done; } args[n++] = &ptr5;
280   if (&ptr6  == &no_arg) { goto done; } args[n++] = &ptr6;
281   if (&ptr7  == &no_arg) { goto done; } args[n++] = &ptr7;
282   if (&ptr8  == &no_arg) { goto done; } args[n++] = &ptr8;
283   if (&ptr9  == &no_arg) { goto done; } args[n++] = &ptr9;
284   if (&ptr10 == &no_arg) { goto done; } args[n++] = &ptr10;
285   if (&ptr11 == &no_arg) { goto done; } args[n++] = &ptr11;
286   if (&ptr12 == &no_arg) { goto done; } args[n++] = &ptr12;
287   if (&ptr13 == &no_arg) { goto done; } args[n++] = &ptr13;
288   if (&ptr14 == &no_arg) { goto done; } args[n++] = &ptr14;
289   if (&ptr15 == &no_arg) { goto done; } args[n++] = &ptr15;
290   if (&ptr16 == &no_arg) { goto done; } args[n++] = &ptr16;
291  done:
292 
293   int consumed;
294   int vec[kVecSize];
295   return DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize);
296 }
297 
Consume(StringPiece * input,const Arg & ptr1,const Arg & ptr2,const Arg & ptr3,const Arg & ptr4,const Arg & ptr5,const Arg & ptr6,const Arg & ptr7,const Arg & ptr8,const Arg & ptr9,const Arg & ptr10,const Arg & ptr11,const Arg & ptr12,const Arg & ptr13,const Arg & ptr14,const Arg & ptr15,const Arg & ptr16) const298 bool RE::Consume(StringPiece* input,
299                  const Arg& ptr1,
300                  const Arg& ptr2,
301                  const Arg& ptr3,
302                  const Arg& ptr4,
303                  const Arg& ptr5,
304                  const Arg& ptr6,
305                  const Arg& ptr7,
306                  const Arg& ptr8,
307                  const Arg& ptr9,
308                  const Arg& ptr10,
309                  const Arg& ptr11,
310                  const Arg& ptr12,
311                  const Arg& ptr13,
312                  const Arg& ptr14,
313                  const Arg& ptr15,
314                  const Arg& ptr16) const {
315   const Arg* args[kMaxArgs];
316   int n = 0;
317   if (&ptr1  == &no_arg) { goto done; } args[n++] = &ptr1;
318   if (&ptr2  == &no_arg) { goto done; } args[n++] = &ptr2;
319   if (&ptr3  == &no_arg) { goto done; } args[n++] = &ptr3;
320   if (&ptr4  == &no_arg) { goto done; } args[n++] = &ptr4;
321   if (&ptr5  == &no_arg) { goto done; } args[n++] = &ptr5;
322   if (&ptr6  == &no_arg) { goto done; } args[n++] = &ptr6;
323   if (&ptr7  == &no_arg) { goto done; } args[n++] = &ptr7;
324   if (&ptr8  == &no_arg) { goto done; } args[n++] = &ptr8;
325   if (&ptr9  == &no_arg) { goto done; } args[n++] = &ptr9;
326   if (&ptr10 == &no_arg) { goto done; } args[n++] = &ptr10;
327   if (&ptr11 == &no_arg) { goto done; } args[n++] = &ptr11;
328   if (&ptr12 == &no_arg) { goto done; } args[n++] = &ptr12;
329   if (&ptr13 == &no_arg) { goto done; } args[n++] = &ptr13;
330   if (&ptr14 == &no_arg) { goto done; } args[n++] = &ptr14;
331   if (&ptr15 == &no_arg) { goto done; } args[n++] = &ptr15;
332   if (&ptr16 == &no_arg) { goto done; } args[n++] = &ptr16;
333  done:
334 
335   int consumed;
336   int vec[kVecSize];
337   if (DoMatchImpl(*input, ANCHOR_START, &consumed,
338                   args, n, vec, kVecSize)) {
339     input->remove_prefix(consumed);
340     return true;
341   } else {
342     return false;
343   }
344 }
345 
FindAndConsume(StringPiece * input,const Arg & ptr1,const Arg & ptr2,const Arg & ptr3,const Arg & ptr4,const Arg & ptr5,const Arg & ptr6,const Arg & ptr7,const Arg & ptr8,const Arg & ptr9,const Arg & ptr10,const Arg & ptr11,const Arg & ptr12,const Arg & ptr13,const Arg & ptr14,const Arg & ptr15,const Arg & ptr16) const346 bool RE::FindAndConsume(StringPiece* input,
347                         const Arg& ptr1,
348                         const Arg& ptr2,
349                         const Arg& ptr3,
350                         const Arg& ptr4,
351                         const Arg& ptr5,
352                         const Arg& ptr6,
353                         const Arg& ptr7,
354                         const Arg& ptr8,
355                         const Arg& ptr9,
356                         const Arg& ptr10,
357                         const Arg& ptr11,
358                         const Arg& ptr12,
359                         const Arg& ptr13,
360                         const Arg& ptr14,
361                         const Arg& ptr15,
362                         const Arg& ptr16) const {
363   const Arg* args[kMaxArgs];
364   int n = 0;
365   if (&ptr1  == &no_arg) { goto done; } args[n++] = &ptr1;
366   if (&ptr2  == &no_arg) { goto done; } args[n++] = &ptr2;
367   if (&ptr3  == &no_arg) { goto done; } args[n++] = &ptr3;
368   if (&ptr4  == &no_arg) { goto done; } args[n++] = &ptr4;
369   if (&ptr5  == &no_arg) { goto done; } args[n++] = &ptr5;
370   if (&ptr6  == &no_arg) { goto done; } args[n++] = &ptr6;
371   if (&ptr7  == &no_arg) { goto done; } args[n++] = &ptr7;
372   if (&ptr8  == &no_arg) { goto done; } args[n++] = &ptr8;
373   if (&ptr9  == &no_arg) { goto done; } args[n++] = &ptr9;
374   if (&ptr10 == &no_arg) { goto done; } args[n++] = &ptr10;
375   if (&ptr11 == &no_arg) { goto done; } args[n++] = &ptr11;
376   if (&ptr12 == &no_arg) { goto done; } args[n++] = &ptr12;
377   if (&ptr13 == &no_arg) { goto done; } args[n++] = &ptr13;
378   if (&ptr14 == &no_arg) { goto done; } args[n++] = &ptr14;
379   if (&ptr15 == &no_arg) { goto done; } args[n++] = &ptr15;
380   if (&ptr16 == &no_arg) { goto done; } args[n++] = &ptr16;
381  done:
382 
383   int consumed;
384   int vec[kVecSize];
385   if (DoMatchImpl(*input, UNANCHORED, &consumed,
386                   args, n, vec, kVecSize)) {
387     input->remove_prefix(consumed);
388     return true;
389   } else {
390     return false;
391   }
392 }
393 
Replace(const StringPiece & rewrite,string * str) const394 bool RE::Replace(const StringPiece& rewrite,
395                  string *str) const {
396   int vec[kVecSize];
397   int matches = TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize);
398   if (matches == 0)
399     return false;
400 
401   string s;
402   if (!Rewrite(&s, rewrite, *str, vec, matches))
403     return false;
404 
405   assert(vec[0] >= 0);
406   assert(vec[1] >= 0);
407   str->replace(vec[0], vec[1] - vec[0], s);
408   return true;
409 }
410 
411 // Returns PCRE_NEWLINE_CRLF, PCRE_NEWLINE_CR, or PCRE_NEWLINE_LF.
412 // Note that PCRE_NEWLINE_CRLF is defined to be P_N_CR | P_N_LF.
413 // Modified by PH to add PCRE_NEWLINE_ANY and PCRE_NEWLINE_ANYCRLF.
414 
NewlineMode(int pcre_options)415 static int NewlineMode(int pcre_options) {
416   // TODO: if we can make it threadsafe, cache this var
417   int newline_mode = 0;
418   /* if (newline_mode) return newline_mode; */  // do this once it's cached
419   if (pcre_options & (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
420                       PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF)) {
421     newline_mode = (pcre_options &
422                     (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
423                      PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF));
424   } else {
425     int newline;
426     pcre_config(PCRE_CONFIG_NEWLINE, &newline);
427     if (newline == 10)
428       newline_mode = PCRE_NEWLINE_LF;
429     else if (newline == 13)
430       newline_mode = PCRE_NEWLINE_CR;
431     else if (newline == 3338)
432       newline_mode = PCRE_NEWLINE_CRLF;
433     else if (newline == -1)
434       newline_mode = PCRE_NEWLINE_ANY;
435     else if (newline == -2)
436       newline_mode = PCRE_NEWLINE_ANYCRLF;
437     else
438       assert(NULL == "Unexpected return value from pcre_config(NEWLINE)");
439   }
440   return newline_mode;
441 }
442 
GlobalReplace(const StringPiece & rewrite,string * str) const443 int RE::GlobalReplace(const StringPiece& rewrite,
444                       string *str) const {
445   int count = 0;
446   int vec[kVecSize];
447   string out;
448   int start = 0;
449   bool last_match_was_empty_string = false;
450 
451   while (start <= static_cast<int>(str->length())) {
452     // If the previous match was for the empty string, we shouldn't
453     // just match again: we'll match in the same way and get an
454     // infinite loop.  Instead, we do the match in a special way:
455     // anchored -- to force another try at the same position --
456     // and with a flag saying that this time, ignore empty matches.
457     // If this special match returns, that means there's a non-empty
458     // match at this position as well, and we can continue.  If not,
459     // we do what perl does, and just advance by one.
460     // Notice that perl prints '@@@' for this;
461     //    perl -le '$_ = "aa"; s/b*|aa/@/g; print'
462     int matches;
463     if (last_match_was_empty_string) {
464       matches = TryMatch(*str, start, ANCHOR_START, false, vec, kVecSize);
465       if (matches <= 0) {
466         int matchend = start + 1;     // advance one character.
467         // If the current char is CR and we're in CRLF mode, skip LF too.
468         // Note it's better to call pcre_fullinfo() than to examine
469         // all_options(), since options_ could have changed bewteen
470         // compile-time and now, but this is simpler and safe enough.
471         // Modified by PH to add ANY and ANYCRLF.
472         if (matchend < static_cast<int>(str->length()) &&
473             (*str)[start] == '\r' && (*str)[matchend] == '\n' &&
474             (NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF ||
475              NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANY ||
476              NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANYCRLF)) {
477           matchend++;
478         }
479         // We also need to advance more than one char if we're in utf8 mode.
480 #ifdef SUPPORT_UTF
481         if (options_.utf8()) {
482           while (matchend < static_cast<int>(str->length()) &&
483                  ((*str)[matchend] & 0xc0) == 0x80)
484             matchend++;
485         }
486 #endif
487         if (start < static_cast<int>(str->length()))
488           out.append(*str, start, matchend - start);
489         start = matchend;
490         last_match_was_empty_string = false;
491         continue;
492       }
493     } else {
494       matches = TryMatch(*str, start, UNANCHORED, true, vec, kVecSize);
495       if (matches <= 0)
496         break;
497     }
498     int matchstart = vec[0], matchend = vec[1];
499     assert(matchstart >= start);
500     assert(matchend >= matchstart);
501     out.append(*str, start, matchstart - start);
502     Rewrite(&out, rewrite, *str, vec, matches);
503     start = matchend;
504     count++;
505     last_match_was_empty_string = (matchstart == matchend);
506   }
507 
508   if (count == 0)
509     return 0;
510 
511   if (start < static_cast<int>(str->length()))
512     out.append(*str, start, str->length() - start);
513   swap(out, *str);
514   return count;
515 }
516 
Extract(const StringPiece & rewrite,const StringPiece & text,string * out) const517 bool RE::Extract(const StringPiece& rewrite,
518                  const StringPiece& text,
519                  string *out) const {
520   int vec[kVecSize];
521   int matches = TryMatch(text, 0, UNANCHORED, true, vec, kVecSize);
522   if (matches == 0)
523     return false;
524   out->erase();
525   return Rewrite(out, rewrite, text, vec, matches);
526 }
527 
QuoteMeta(const StringPiece & unquoted)528 /*static*/ string RE::QuoteMeta(const StringPiece& unquoted) {
529   string result;
530 
531   // Escape any ascii character not in [A-Za-z_0-9].
532   //
533   // Note that it's legal to escape a character even if it has no
534   // special meaning in a regular expression -- so this function does
535   // that.  (This also makes it identical to the perl function of the
536   // same name; see `perldoc -f quotemeta`.)  The one exception is
537   // escaping NUL: rather than doing backslash + NUL, like perl does,
538   // we do '\0', because pcre itself doesn't take embedded NUL chars.
539   for (int ii = 0; ii < unquoted.size(); ++ii) {
540     // Note that using 'isalnum' here raises the benchmark time from
541     // 32ns to 58ns:
542     if (unquoted[ii] == '\0') {
543       result += "\\0";
544     } else if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
545                (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
546                (unquoted[ii] < '0' || unquoted[ii] > '9') &&
547                unquoted[ii] != '_' &&
548                // If this is the part of a UTF8 or Latin1 character, we need
549                // to copy this byte without escaping.  Experimentally this is
550                // what works correctly with the regexp library.
551                !(unquoted[ii] & 128)) {
552       result += '\\';
553       result += unquoted[ii];
554     } else {
555       result += unquoted[ii];
556     }
557   }
558 
559   return result;
560 }
561 
562 /***** Actual matching and rewriting code *****/
563 
TryMatch(const StringPiece & text,int startpos,Anchor anchor,bool empty_ok,int * vec,int vecsize) const564 int RE::TryMatch(const StringPiece& text,
565                  int startpos,
566                  Anchor anchor,
567                  bool empty_ok,
568                  int *vec,
569                  int vecsize) const {
570   pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_;
571   if (re == NULL) {
572     //fprintf(stderr, "Matching against invalid re: %s\n", error_->c_str());
573     return 0;
574   }
575 
576   pcre_extra extra = { 0, 0, 0, 0, 0, 0, 0, 0 };
577   if (options_.match_limit() > 0) {
578     extra.flags |= PCRE_EXTRA_MATCH_LIMIT;
579     extra.match_limit = options_.match_limit();
580   }
581   if (options_.match_limit_recursion() > 0) {
582     extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
583     extra.match_limit_recursion = options_.match_limit_recursion();
584   }
585 
586   // int options = 0;
587   // Changed by PH as a result of bugzilla #1288
588   int options = (options_.all_options() & PCRE_NO_UTF8_CHECK);
589 
590   if (anchor != UNANCHORED)
591     options |= PCRE_ANCHORED;
592   if (!empty_ok)
593     options |= PCRE_NOTEMPTY;
594 
595   int rc = pcre_exec(re,              // The regular expression object
596                      &extra,
597                      (text.data() == NULL) ? "" : text.data(),
598                      text.size(),
599                      startpos,
600                      options,
601                      vec,
602                      vecsize);
603 
604   // Handle errors
605   if (rc == PCRE_ERROR_NOMATCH) {
606     return 0;
607   } else if (rc < 0) {
608     //fprintf(stderr, "Unexpected return code: %d when matching '%s'\n",
609     //        re, pattern_.c_str());
610     return 0;
611   } else if (rc == 0) {
612     // pcre_exec() returns 0 as a special case when the number of
613     // capturing subpatterns exceeds the size of the vector.
614     // When this happens, there is a match and the output vector
615     // is filled, but we miss out on the positions of the extra subpatterns.
616     rc = vecsize / 2;
617   }
618 
619   return rc;
620 }
621 
DoMatchImpl(const StringPiece & text,Anchor anchor,int * consumed,const Arg * const * args,int n,int * vec,int vecsize) const622 bool RE::DoMatchImpl(const StringPiece& text,
623                      Anchor anchor,
624                      int* consumed,
625                      const Arg* const* args,
626                      int n,
627                      int* vec,
628                      int vecsize) const {
629   assert((1 + n) * 3 <= vecsize);  // results + PCRE workspace
630   int matches = TryMatch(text, 0, anchor, true, vec, vecsize);
631   assert(matches >= 0);  // TryMatch never returns negatives
632   if (matches == 0)
633     return false;
634 
635   *consumed = vec[1];
636 
637   if (n == 0 || args == NULL) {
638     // We are not interested in results
639     return true;
640   }
641 
642   if (NumberOfCapturingGroups() < n) {
643     // RE has fewer capturing groups than number of arg pointers passed in
644     return false;
645   }
646 
647   // If we got here, we must have matched the whole pattern.
648   // We do not need (can not do) any more checks on the value of 'matches' here
649   // -- see the comment for TryMatch.
650   for (int i = 0; i < n; i++) {
651     const int start = vec[2*(i+1)];
652     const int limit = vec[2*(i+1)+1];
653     if (!args[i]->Parse(text.data() + start, limit-start)) {
654       // TODO: Should we indicate what the error was?
655       return false;
656     }
657   }
658 
659   return true;
660 }
661 
DoMatch(const StringPiece & text,Anchor anchor,int * consumed,const Arg * const args[],int n) const662 bool RE::DoMatch(const StringPiece& text,
663                  Anchor anchor,
664                  int* consumed,
665                  const Arg* const args[],
666                  int n) const {
667   assert(n >= 0);
668   size_t const vecsize = (1 + n) * 3;  // results + PCRE workspace
669                                        // (as for kVecSize)
670   int space[21];   // use stack allocation for small vecsize (common case)
671   int* vec = vecsize <= 21 ? space : new int[vecsize];
672   bool retval = DoMatchImpl(text, anchor, consumed, args, n, vec, (int)vecsize);
673   if (vec != space) delete [] vec;
674   return retval;
675 }
676 
Rewrite(string * out,const StringPiece & rewrite,const StringPiece & text,int * vec,int veclen) const677 bool RE::Rewrite(string *out, const StringPiece &rewrite,
678                  const StringPiece &text, int *vec, int veclen) const {
679   for (const char *s = rewrite.data(), *end = s + rewrite.size();
680        s < end; s++) {
681     int c = *s;
682     if (c == '\\') {
683       c = *++s;
684       if (isdigit(c)) {
685         int n = (c - '0');
686         if (n >= veclen) {
687           //fprintf(stderr, requested group %d in regexp %.*s\n",
688           //        n, rewrite.size(), rewrite.data());
689           return false;
690         }
691         int start = vec[2 * n];
692         if (start >= 0)
693           out->append(text.data() + start, vec[2 * n + 1] - start);
694       } else if (c == '\\') {
695         *out += '\\';
696       } else {
697         //fprintf(stderr, "invalid rewrite pattern: %.*s\n",
698         //        rewrite.size(), rewrite.data());
699         return false;
700       }
701     } else {
702       *out += c;
703     }
704   }
705   return true;
706 }
707 
708 // Return the number of capturing subpatterns, or -1 if the
709 // regexp wasn't valid on construction.
NumberOfCapturingGroups() const710 int RE::NumberOfCapturingGroups() const {
711   if (re_partial_ == NULL) return -1;
712 
713   int result;
714   int pcre_retval = pcre_fullinfo(re_partial_,  // The regular expression object
715                                   NULL,         // We did not study the pattern
716                                   PCRE_INFO_CAPTURECOUNT,
717                                   &result);
718   assert(pcre_retval == 0);
719   return result;
720 }
721 
722 /***** Parsers for various types *****/
723 
parse_null(const char * str,int n,void * dest)724 bool Arg::parse_null(const char* str, int n, void* dest) {
725   (void)str;
726   (void)n;
727   // We fail if somebody asked us to store into a non-NULL void* pointer
728   return (dest == NULL);
729 }
730 
parse_string(const char * str,int n,void * dest)731 bool Arg::parse_string(const char* str, int n, void* dest) {
732   if (dest == NULL) return true;
733   reinterpret_cast<string*>(dest)->assign(str, n);
734   return true;
735 }
736 
parse_stringpiece(const char * str,int n,void * dest)737 bool Arg::parse_stringpiece(const char* str, int n, void* dest) {
738   if (dest == NULL) return true;
739   reinterpret_cast<StringPiece*>(dest)->set(str, n);
740   return true;
741 }
742 
parse_char(const char * str,int n,void * dest)743 bool Arg::parse_char(const char* str, int n, void* dest) {
744   if (n != 1) return false;
745   if (dest == NULL) return true;
746   *(reinterpret_cast<char*>(dest)) = str[0];
747   return true;
748 }
749 
parse_uchar(const char * str,int n,void * dest)750 bool Arg::parse_uchar(const char* str, int n, void* dest) {
751   if (n != 1) return false;
752   if (dest == NULL) return true;
753   *(reinterpret_cast<unsigned char*>(dest)) = str[0];
754   return true;
755 }
756 
757 // Largest number spec that we are willing to parse
758 static const int kMaxNumberLength = 32;
759 
760 // REQUIRES "buf" must have length at least kMaxNumberLength+1
761 // REQUIRES "n > 0"
762 // Copies "str" into "buf" and null-terminates if necessary.
763 // Returns one of:
764 //      a. "str" if no termination is needed
765 //      b. "buf" if the string was copied and null-terminated
766 //      c. "" if the input was invalid and has no hope of being parsed
TerminateNumber(char * buf,const char * str,int n)767 static const char* TerminateNumber(char* buf, const char* str, int n) {
768   if ((n > 0) && isspace(*str)) {
769     // We are less forgiving than the strtoxxx() routines and do not
770     // allow leading spaces.
771     return "";
772   }
773 
774   // See if the character right after the input text may potentially
775   // look like a digit.
776   if (isdigit(str[n]) ||
777       ((str[n] >= 'a') && (str[n] <= 'f')) ||
778       ((str[n] >= 'A') && (str[n] <= 'F'))) {
779     if (n > kMaxNumberLength) return ""; // Input too big to be a valid number
780     memcpy(buf, str, n);
781     buf[n] = '\0';
782     return buf;
783   } else {
784     // We can parse right out of the supplied string, so return it.
785     return str;
786   }
787 }
788 
parse_long_radix(const char * str,int n,void * dest,int radix)789 bool Arg::parse_long_radix(const char* str,
790                            int n,
791                            void* dest,
792                            int radix) {
793   if (n == 0) return false;
794   char buf[kMaxNumberLength+1];
795   str = TerminateNumber(buf, str, n);
796   char* end;
797   errno = 0;
798   long r = strtol(str, &end, radix);
799   if (end != str + n) return false;   // Leftover junk
800   if (errno) return false;
801   if (dest == NULL) return true;
802   *(reinterpret_cast<long*>(dest)) = r;
803   return true;
804 }
805 
parse_ulong_radix(const char * str,int n,void * dest,int radix)806 bool Arg::parse_ulong_radix(const char* str,
807                             int n,
808                             void* dest,
809                             int radix) {
810   if (n == 0) return false;
811   char buf[kMaxNumberLength+1];
812   str = TerminateNumber(buf, str, n);
813   if (str[0] == '-') return false;    // strtoul() on a negative number?!
814   char* end;
815   errno = 0;
816   unsigned long r = strtoul(str, &end, radix);
817   if (end != str + n) return false;   // Leftover junk
818   if (errno) return false;
819   if (dest == NULL) return true;
820   *(reinterpret_cast<unsigned long*>(dest)) = r;
821   return true;
822 }
823 
parse_short_radix(const char * str,int n,void * dest,int radix)824 bool Arg::parse_short_radix(const char* str,
825                             int n,
826                             void* dest,
827                             int radix) {
828   long r;
829   if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
830   if (r < SHRT_MIN || r > SHRT_MAX) return false;       // Out of range
831   if (dest == NULL) return true;
832   *(reinterpret_cast<short*>(dest)) = static_cast<short>(r);
833   return true;
834 }
835 
parse_ushort_radix(const char * str,int n,void * dest,int radix)836 bool Arg::parse_ushort_radix(const char* str,
837                              int n,
838                              void* dest,
839                              int radix) {
840   unsigned long r;
841   if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
842   if (r > USHRT_MAX) return false;                      // Out of range
843   if (dest == NULL) return true;
844   *(reinterpret_cast<unsigned short*>(dest)) = static_cast<unsigned short>(r);
845   return true;
846 }
847 
parse_int_radix(const char * str,int n,void * dest,int radix)848 bool Arg::parse_int_radix(const char* str,
849                           int n,
850                           void* dest,
851                           int radix) {
852   long r;
853   if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
854   if (r < INT_MIN || r > INT_MAX) return false;         // Out of range
855   if (dest == NULL) return true;
856   *(reinterpret_cast<int*>(dest)) = r;
857   return true;
858 }
859 
parse_uint_radix(const char * str,int n,void * dest,int radix)860 bool Arg::parse_uint_radix(const char* str,
861                            int n,
862                            void* dest,
863                            int radix) {
864   unsigned long r;
865   if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
866   if (r > UINT_MAX) return false;                       // Out of range
867   if (dest == NULL) return true;
868   *(reinterpret_cast<unsigned int*>(dest)) = r;
869   return true;
870 }
871 
parse_longlong_radix(const char * str,int n,void * dest,int radix)872 bool Arg::parse_longlong_radix(const char* str,
873                                int n,
874                                void* dest,
875                                int radix) {
876 #ifndef HAVE_LONG_LONG
877   return false;
878 #else
879   if (n == 0) return false;
880   char buf[kMaxNumberLength+1];
881   str = TerminateNumber(buf, str, n);
882   char* end;
883   errno = 0;
884 #if defined HAVE_STRTOQ
885   long long r = strtoq(str, &end, radix);
886 #elif defined HAVE_STRTOLL
887   long long r = strtoll(str, &end, radix);
888 #elif defined HAVE__STRTOI64
889   long long r = _strtoi64(str, &end, radix);
890 #elif defined HAVE_STRTOIMAX
891   long long r = strtoimax(str, &end, radix);
892 #else
893 #error parse_longlong_radix: cannot convert input to a long-long
894 #endif
895   if (end != str + n) return false;   // Leftover junk
896   if (errno) return false;
897   if (dest == NULL) return true;
898   *(reinterpret_cast<long long*>(dest)) = r;
899   return true;
900 #endif   /* HAVE_LONG_LONG */
901 }
902 
parse_ulonglong_radix(const char * str,int n,void * dest,int radix)903 bool Arg::parse_ulonglong_radix(const char* str,
904                                 int n,
905                                 void* dest,
906                                 int radix) {
907 #ifndef HAVE_UNSIGNED_LONG_LONG
908   return false;
909 #else
910   if (n == 0) return false;
911   char buf[kMaxNumberLength+1];
912   str = TerminateNumber(buf, str, n);
913   if (str[0] == '-') return false;    // strtoull() on a negative number?!
914   char* end;
915   errno = 0;
916 #if defined HAVE_STRTOQ
917   unsigned long long r = strtouq(str, &end, radix);
918 #elif defined HAVE_STRTOLL
919   unsigned long long r = strtoull(str, &end, radix);
920 #elif defined HAVE__STRTOI64
921   unsigned long long r = _strtoui64(str, &end, radix);
922 #elif defined HAVE_STRTOIMAX
923   unsigned long long r = strtoumax(str, &end, radix);
924 #else
925 #error parse_ulonglong_radix: cannot convert input to a long-long
926 #endif
927   if (end != str + n) return false;   // Leftover junk
928   if (errno) return false;
929   if (dest == NULL) return true;
930   *(reinterpret_cast<unsigned long long*>(dest)) = r;
931   return true;
932 #endif   /* HAVE_UNSIGNED_LONG_LONG */
933 }
934 
parse_double(const char * str,int n,void * dest)935 bool Arg::parse_double(const char* str, int n, void* dest) {
936   if (n == 0) return false;
937   static const int kMaxLength = 200;
938   char buf[kMaxLength];
939   if (n >= kMaxLength) return false;
940   memcpy(buf, str, n);
941   buf[n] = '\0';
942   errno = 0;
943   char* end;
944   double r = strtod(buf, &end);
945   if (end != buf + n) return false;   // Leftover junk
946   if (errno) return false;
947   if (dest == NULL) return true;
948   *(reinterpret_cast<double*>(dest)) = r;
949   return true;
950 }
951 
parse_float(const char * str,int n,void * dest)952 bool Arg::parse_float(const char* str, int n, void* dest) {
953   double r;
954   if (!parse_double(str, n, &r)) return false;
955   if (dest == NULL) return true;
956   *(reinterpret_cast<float*>(dest)) = static_cast<float>(r);
957   return true;
958 }
959 
960 
961 #define DEFINE_INTEGER_PARSERS(name)                                    \
962   bool Arg::parse_##name(const char* str, int n, void* dest) {          \
963     return parse_##name##_radix(str, n, dest, 10);                      \
964   }                                                                     \
965   bool Arg::parse_##name##_hex(const char* str, int n, void* dest) {    \
966     return parse_##name##_radix(str, n, dest, 16);                      \
967   }                                                                     \
968   bool Arg::parse_##name##_octal(const char* str, int n, void* dest) {  \
969     return parse_##name##_radix(str, n, dest, 8);                       \
970   }                                                                     \
971   bool Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
972     return parse_##name##_radix(str, n, dest, 0);                       \
973   }
974 
975 DEFINE_INTEGER_PARSERS(short)      /*                                   */
976 DEFINE_INTEGER_PARSERS(ushort)     /*                                   */
977 DEFINE_INTEGER_PARSERS(int)        /* Don't use semicolons after these  */
978 DEFINE_INTEGER_PARSERS(uint)       /* statements because they can cause */
979 DEFINE_INTEGER_PARSERS(long)       /* compiler warnings if the checking */
980 DEFINE_INTEGER_PARSERS(ulong)      /* level is turned up high enough.   */
981 DEFINE_INTEGER_PARSERS(longlong)   /*                                   */
982 DEFINE_INTEGER_PARSERS(ulonglong)  /*                                   */
983 
984 #undef DEFINE_INTEGER_PARSERS
985 
986 }   // namespace pcrecpp
987