1 // Copyright 2003-2009 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #ifndef RE2_RE2_H
6 #define RE2_RE2_H
7
8 // C++ interface to the re2 regular-expression library.
9 // RE2 supports Perl-style regular expressions (with extensions like
10 // \d, \w, \s, ...).
11 //
12 // -----------------------------------------------------------------------
13 // REGEXP SYNTAX:
14 //
15 // This module uses the re2 library and hence supports
16 // its syntax for regular expressions, which is similar to Perl's with
17 // some of the more complicated things thrown away. In particular,
18 // backreferences and generalized assertions are not available, nor is \Z.
19 //
20 // See http://code.google.com/p/re2/wiki/Syntax for the syntax
21 // supported by RE2, and a comparison with PCRE and PERL regexps.
22 //
23 // For those not familiar with Perl's regular expressions,
24 // here are some examples of the most commonly used extensions:
25 //
26 // "hello (\\w+) world" -- \w matches a "word" character
27 // "version (\\d+)" -- \d matches a digit
28 // "hello\\s+world" -- \s matches any whitespace character
//   "\\b(\\w+)\\b"       -- \b matches the empty string at a word boundary
30 // "(?i)hello" -- (?i) turns on case-insensitive matching
31 // "/\\*(.*?)\\*/" -- .*? matches . minimum no. of times possible
32 //
33 // -----------------------------------------------------------------------
34 // MATCHING INTERFACE:
35 //
36 // The "FullMatch" operation checks that supplied text matches a
37 // supplied pattern exactly.
38 //
39 // Example: successful match
40 // CHECK(RE2::FullMatch("hello", "h.*o"));
41 //
42 // Example: unsuccessful match (requires full match):
43 // CHECK(!RE2::FullMatch("hello", "e"));
44 //
45 // -----------------------------------------------------------------------
46 // UTF-8 AND THE MATCHING INTERFACE:
47 //
48 // By default, the pattern and input text are interpreted as UTF-8.
49 // The RE2::Latin1 option causes them to be interpreted as Latin-1.
50 //
51 // Example:
52 // CHECK(RE2::FullMatch(utf8_string, RE2(utf8_pattern)));
53 // CHECK(RE2::FullMatch(latin1_string, RE2(latin1_pattern, RE2::Latin1)));
54 //
55 // -----------------------------------------------------------------------
56 // MATCHING WITH SUB-STRING EXTRACTION:
57 //
58 // You can supply extra pointer arguments to extract matched subpieces.
59 //
60 // Example: extracts "ruby" into "s" and 1234 into "i"
61 // int i;
62 // string s;
63 // CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i));
64 //
65 // Example: fails because string cannot be stored in integer
66 // CHECK(!RE2::FullMatch("ruby", "(.*)", &i));
67 //
68 // Example: fails because there aren't enough sub-patterns:
69 // CHECK(!RE2::FullMatch("ruby:1234", "\\w+:\\d+", &s));
70 //
71 // Example: does not try to extract any extra sub-patterns
72 // CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s));
73 //
74 // Example: does not try to extract into NULL
75 // CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i));
76 //
77 // Example: integer overflow causes failure
78 // CHECK(!RE2::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i));
79 //
80 // NOTE(rsc): Asking for substrings slows successful matches quite a bit.
81 // This may get a little faster in the future, but right now is slower
82 // than PCRE. On the other hand, failed matches run *very* fast (faster
83 // than PCRE), as do matches without substring extraction.
84 //
85 // -----------------------------------------------------------------------
86 // PARTIAL MATCHES
87 //
88 // You can use the "PartialMatch" operation when you want the pattern
89 // to match any substring of the text.
90 //
91 // Example: simple search for a string:
92 // CHECK(RE2::PartialMatch("hello", "ell"));
93 //
94 // Example: find first number in a string
95 // int number;
96 // CHECK(RE2::PartialMatch("x*100 + 20", "(\\d+)", &number));
97 // CHECK_EQ(number, 100);
98 //
99 // -----------------------------------------------------------------------
100 // PRE-COMPILED REGULAR EXPRESSIONS
101 //
102 // RE2 makes it easy to use any string as a regular expression, without
103 // requiring a separate compilation step.
104 //
105 // If speed is of the essence, you can create a pre-compiled "RE2"
106 // object from the pattern and use it multiple times. If you do so,
107 // you can typically parse text faster than with sscanf.
108 //
109 // Example: precompile pattern for faster matching:
110 // RE2 pattern("h.*o");
111 // while (ReadLine(&str)) {
112 // if (RE2::FullMatch(str, pattern)) ...;
113 // }
114 //
115 // -----------------------------------------------------------------------
116 // SCANNING TEXT INCREMENTALLY
117 //
118 // The "Consume" operation may be useful if you want to repeatedly
119 // match regular expressions at the front of a string and skip over
120 // them as they match. This requires use of the "StringPiece" type,
121 // which represents a sub-range of a real string.
122 //
123 // Example: read lines of the form "var = value" from a string.
124 // string contents = ...; // Fill string somehow
125 // StringPiece input(contents); // Wrap a StringPiece around it
126 //
127 // string var;
128 // int value;
129 // while (RE2::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) {
130 // ...;
131 // }
132 //
133 // Each successful call to "Consume" will set "var/value", and also
134 // advance "input" so it points past the matched text. Note that if the
135 // regular expression matches an empty string, input will advance
136 // by 0 bytes. If the regular expression being used might match
137 // an empty string, the loop body must check for this case and either
138 // advance the string or break out of the loop.
139 //
140 // The "FindAndConsume" operation is similar to "Consume" but does not
141 // anchor your match at the beginning of the string. For example, you
142 // could extract all words from a string by repeatedly calling
143 // RE2::FindAndConsume(&input, "(\\w+)", &word)
144 //
145 // -----------------------------------------------------------------------
146 // USING VARIABLE NUMBER OF ARGUMENTS
147 //
148 // The above operations require you to know the number of arguments
149 // when you write the code. This is not always possible or easy (for
150 // example, the regular expression may be calculated at run time).
// You can use the "N" version of the operations when the number of
// match arguments is determined at run time.
153 //
154 // Example:
155 // const RE2::Arg* args[10];
156 // int n;
157 // // ... populate args with pointers to RE2::Arg values ...
158 // // ... set n to the number of RE2::Arg objects ...
159 // bool match = RE2::FullMatchN(input, pattern, args, n);
160 //
161 // The last statement is equivalent to
162 //
163 // bool match = RE2::FullMatch(input, pattern,
164 // *args[0], *args[1], ..., *args[n - 1]);
165 //
166 // -----------------------------------------------------------------------
167 // PARSING HEX/OCTAL/C-RADIX NUMBERS
168 //
169 // By default, if you pass a pointer to a numeric value, the
170 // corresponding text is interpreted as a base-10 number. You can
171 // instead wrap the pointer with a call to one of the operators Hex(),
172 // Octal(), or CRadix() to interpret the text in another base. The
173 // CRadix operator interprets C-style "0" (base-8) and "0x" (base-16)
174 // prefixes, but defaults to base-10.
175 //
176 // Example:
177 // int a, b, c, d;
//   CHECK(RE2::FullMatch("100 40 0100 0x40", "(.*) (.*) (.*) (.*)",
//         RE2::Octal(&a), RE2::Hex(&b), RE2::CRadix(&c), RE2::CRadix(&d)));
180 // will leave 64 in a, b, c, and d.
181
182 #include <stdint.h>
183 #include <map>
184 #include <string>
185 #include "re2/stringpiece.h"
186 #include "re2/variadic_function.h"
187
// Enable the "long long" / "unsigned long long" overloads unless the build
// has already defined RE2_HAVE_LONGLONG itself.
// NOTE: this header tests the macro with #ifdef, so only its presence
// matters, not its value.
#if !defined(RE2_HAVE_LONGLONG)
#define RE2_HAVE_LONGLONG 1
#endif
191
192 namespace re2 {
193
194 using std::string;
195 using std::map;
196 class Mutex;
197 class Prog;
198 class Regexp;
199
// Constructor tag for objects with static storage class.
//
// Passing LINKER_INITIALIZED tells the constructor to leave the object's
// state untouched, and tells the reader that declaring a static instance
// of the class is legal.  Ordinarily a static variable with a constructor
// or destructor is unsafe, because invocation order is undefined.  It is
// safe only IF the type can be initialized by filling it with zeroes
// (which the loader does for static variables) AND its destructor does
// nothing to the storage.  Such a type can declare
//   explicit MyClass(LinkerInitialized x) {}
// and be instantiated as
//   static MyClass my_variable_name(LINKER_INITIALIZED);
enum LinkerInitialized {
  LINKER_INITIALIZED
};
214
215 // Interface for regular expression matching. Also corresponds to a
216 // pre-compiled regular expression. An "RE2" object is safe for
217 // concurrent use by multiple threads.
218 class RE2 {
219 public:
220 // We convert user-passed pointers into special Arg objects
221 class Arg;
222 class Options;
223
224 // Defined in set.h.
225 class Set;
226
// Error codes reported by error_code().  NoError (== 0) means the RE2
// was created properly.
enum ErrorCode {
  NoError = 0,

  // Unexpected error.
  ErrorInternal,

  // Parse errors.

  // Bad escape sequence.
  ErrorBadEscape,
  // Bad character class.
  ErrorBadCharClass,
  // Bad character class range.
  ErrorBadCharRange,
  // Missing closing ].
  ErrorMissingBracket,
  // Missing closing ).
  ErrorMissingParen,
  // Trailing \ at end of regexp.
  ErrorTrailingBackslash,
  // Repeat argument missing, e.g. "*".
  ErrorRepeatArgument,
  // Bad repetition argument.
  ErrorRepeatSize,
  // Bad repetition operator.
  ErrorRepeatOp,
  // Bad perl operator.
  ErrorBadPerlOp,
  // Invalid UTF-8 in regexp.
  ErrorBadUTF8,
  // Bad named capture group.
  ErrorBadNamedCapture,
  // Pattern too large (compile failed).
  ErrorPatternTooLarge
};
248
// Predefined common option sets.
// If you need something more complicated, instantiate an Options object
// (possibly passing one of these values to the Options constructor),
// change its settings, and pass that Options object to the RE2
// constructor.
enum CannedOptions {
  DefaultOptions = 0,
  // Treat input as Latin-1 (default is UTF-8).
  Latin1,
  // POSIX syntax, leftmost-longest match.
  POSIX,
  // Do not log about regexp parse errors.
  Quiet
};
260
261 // Need to have the const char* and const string& forms for implicit
262 // conversions when passing string literals to FullMatch and PartialMatch.
263 // Otherwise the StringPiece form would be sufficient.
264 #ifndef SWIG
265 RE2(const char* pattern);
266 RE2(const string& pattern);
267 #endif
268 RE2(const StringPiece& pattern);
269 RE2(const StringPiece& pattern, const Options& option);
270 ~RE2();
271
272 // Returns whether RE2 was created properly.
ok()273 bool ok() const { return error_code() == NoError; }
274
275 // The string specification for this RE2. E.g.
276 // RE2 re("ab*c?d+");
277 // re.pattern(); // "ab*c?d+"
pattern()278 const string& pattern() const { return pattern_; }
279
280 // If RE2 could not be created properly, returns an error string.
281 // Else returns the empty string.
error()282 const string& error() const { return *error_; }
283
284 // If RE2 could not be created properly, returns an error code.
285 // Else returns RE2::NoError (== 0).
error_code()286 ErrorCode error_code() const { return error_code_; }
287
288 // If RE2 could not be created properly, returns the offending
289 // portion of the regexp.
error_arg()290 const string& error_arg() const { return error_arg_; }
291
292 // Returns the program size, a very approximate measure of a regexp's "cost".
293 // Larger numbers are more expensive than smaller numbers.
294 int ProgramSize() const;
295
296 // Returns the underlying Regexp; not for general use.
297 // Returns entire_regexp_ so that callers don't need
298 // to know about prefix_ and prefix_foldcase_.
Regexp()299 re2::Regexp* Regexp() const { return entire_regexp_; }
300
301 /***** The useful part: the matching interface *****/
302
303 // Matches "text" against "pattern". If pointer arguments are
304 // supplied, copies matched sub-patterns into them.
305 //
306 // You can pass in a "const char*" or a "string" for "text".
307 // You can pass in a "const char*" or a "string" or a "RE2" for "pattern".
308 //
309 // The provided pointer arguments can be pointers to any scalar numeric
310 // type, or one of:
311 // string (matched piece is copied to string)
312 // StringPiece (StringPiece is mutated to point to matched piece)
313 // T (where "bool T::ParseFrom(const char*, int)" exists)
314 // (void*)NULL (the corresponding matched sub-pattern is not copied)
315 //
316 // Returns true iff all of the following conditions are satisfied:
317 // a. "text" matches "pattern" exactly
318 // b. The number of matched sub-patterns is >= number of supplied pointers
319 // c. The "i"th argument has a suitable type for holding the
320 // string captured as the "i"th sub-pattern. If you pass in
321 // NULL for the "i"th argument, or pass fewer arguments than
322 // number of sub-patterns, "i"th captured sub-pattern is
323 // ignored.
324 //
325 // CAVEAT: An optional sub-pattern that does not exist in the
326 // matched string is assigned the empty string. Therefore, the
327 // following will return false (because the empty string is not a
328 // valid number):
329 // int number;
330 // RE2::FullMatch("abc", "[a-z]+(\\d+)?", &number);
331 static bool FullMatchN(const StringPiece& text, const RE2& re,
332 const Arg* const args[], int argc);
333 static const VariadicFunction2<
334 bool, const StringPiece&, const RE2&, Arg, RE2::FullMatchN> FullMatch;
335
336 // Exactly like FullMatch(), except that "pattern" is allowed to match
337 // a substring of "text".
338 static bool PartialMatchN(const StringPiece& text, const RE2& re, // 3..16 args
339 const Arg* const args[], int argc);
340 static const VariadicFunction2<
341 bool, const StringPiece&, const RE2&, Arg, RE2::PartialMatchN> PartialMatch;
342
343 // Like FullMatch() and PartialMatch(), except that pattern has to
344 // match a prefix of "text", and "input" is advanced past the matched
345 // text. Note: "input" is modified iff this routine returns true.
346 static bool ConsumeN(StringPiece* input, const RE2& pattern, // 3..16 args
347 const Arg* const args[], int argc);
348 static const VariadicFunction2<
349 bool, StringPiece*, const RE2&, Arg, RE2::ConsumeN> Consume;
350
351 // Like Consume(..), but does not anchor the match at the beginning of the
352 // string. That is, "pattern" need not start its match at the beginning of
353 // "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds the next
354 // word in "s" and stores it in "word".
355 static bool FindAndConsumeN(StringPiece* input, const RE2& pattern,
356 const Arg* const args[], int argc);
357 static const VariadicFunction2<
358 bool, StringPiece*, const RE2&, Arg, RE2::FindAndConsumeN> FindAndConsume;
359
360 // Replace the first match of "pattern" in "str" with "rewrite".
361 // Within "rewrite", backslash-escaped digits (\1 to \9) can be
362 // used to insert text matching corresponding parenthesized group
363 // from the pattern. \0 in "rewrite" refers to the entire matching
364 // text. E.g.,
365 //
366 // string s = "yabba dabba doo";
367 // CHECK(RE2::Replace(&s, "b+", "d"));
368 //
369 // will leave "s" containing "yada dabba doo"
370 //
371 // Returns true if the pattern matches and a replacement occurs,
372 // false otherwise.
373 static bool Replace(string *str,
374 const RE2& pattern,
375 const StringPiece& rewrite);
376
377 // Like Replace(), except replaces successive non-overlapping occurrences
378 // of the pattern in the string with the rewrite. E.g.
379 //
380 // string s = "yabba dabba doo";
381 // CHECK(RE2::GlobalReplace(&s, "b+", "d"));
382 //
383 // will leave "s" containing "yada dada doo"
384 // Replacements are not subject to re-matching.
385 //
386 // Because GlobalReplace only replaces non-overlapping matches,
387 // replacing "ana" within "banana" makes only one replacement, not two.
388 //
389 // Returns the number of replacements made.
390 static int GlobalReplace(string *str,
391 const RE2& pattern,
392 const StringPiece& rewrite);
393
394 // Like Replace, except that if the pattern matches, "rewrite"
395 // is copied into "out" with substitutions. The non-matching
396 // portions of "text" are ignored.
397 //
398 // Returns true iff a match occurred and the extraction happened
399 // successfully; if no match occurs, the string is left unaffected.
400 static bool Extract(const StringPiece &text,
401 const RE2& pattern,
402 const StringPiece &rewrite,
403 string *out);
404
405 // Escapes all potentially meaningful regexp characters in
406 // 'unquoted'. The returned string, used as a regular expression,
407 // will exactly match the original string. For example,
408 // 1.5-2.0?
409 // may become:
410 // 1\.5\-2\.0\?
411 static string QuoteMeta(const StringPiece& unquoted);
412
413 // Computes range for any strings matching regexp. The min and max can in
414 // some cases be arbitrarily precise, so the caller gets to specify the
415 // maximum desired length of string returned.
416 //
417 // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
418 // string s that is an anchored match for this regexp satisfies
419 // min <= s && s <= max.
420 //
421 // Note that PossibleMatchRange() will only consider the first copy of an
422 // infinitely repeated element (i.e., any regexp element followed by a '*' or
423 // '+' operator). Regexps with "{N}" constructions are not affected, as those
424 // do not compile down to infinite repetitions.
425 //
426 // Returns true on success, false on error.
427 bool PossibleMatchRange(string* min, string* max, int maxlen) const;
428
429 // Generic matching interface
430
// Kind of anchoring requested for a match.
enum Anchor {
  // No anchoring.
  UNANCHORED,
  // Anchor at start only.
  ANCHOR_START,
  // Anchor at start and end.
  ANCHOR_BOTH
};
437
438 // Return the number of capturing subpatterns, or -1 if the
439 // regexp wasn't valid on construction. The overall match ($0)
440 // does not count: if the regexp is "(a)(b)", returns 2.
441 int NumberOfCapturingGroups() const;
442
443
444 // Return a map from names to capturing indices.
445 // The map records the index of the leftmost group
446 // with the given name.
447 // Only valid until the re is deleted.
448 const map<string, int>& NamedCapturingGroups() const;
449
450 // Return a map from capturing indices to names.
451 // The map has no entries for unnamed groups.
452 // Only valid until the re is deleted.
453 const map<int, string>& CapturingGroupNames() const;
454
455 // General matching routine.
456 // Match against text starting at offset startpos
457 // and stopping the search at offset endpos.
458 // Returns true if match found, false if not.
459 // On a successful match, fills in match[] (up to nmatch entries)
460 // with information about submatches.
461 // I.e. matching RE2("(foo)|(bar)baz") on "barbazbla" will return true,
462 // setting match[0] = "barbaz", match[1] = NULL, match[2] = "bar",
463 // match[3] = NULL, ..., up to match[nmatch-1] = NULL.
464 //
465 // Don't ask for more match information than you will use:
466 // runs much faster with nmatch == 1 than nmatch > 1, and
467 // runs even faster if nmatch == 0.
468 // Doesn't make sense to use nmatch > 1 + NumberOfCapturingGroups(),
469 // but will be handled correctly.
470 //
471 // Passing text == StringPiece(NULL, 0) will be handled like any other
472 // empty string, but note that on return, it will not be possible to tell
473 // whether submatch i matched the empty string or did not match:
474 // either way, match[i] == NULL.
475 bool Match(const StringPiece& text,
476 int startpos,
477 int endpos,
478 Anchor anchor,
479 StringPiece *match,
480 int nmatch) const;
481
482 // Check that the given rewrite string is suitable for use with this
483 // regular expression. It checks that:
484 // * The regular expression has enough parenthesized subexpressions
485 // to satisfy all of the \N tokens in rewrite
486 // * The rewrite string doesn't have any syntax errors. E.g.,
487 // '\' followed by anything other than a digit or '\'.
488 // A true return value guarantees that Replace() and Extract() won't
489 // fail because of a bad rewrite string.
490 bool CheckRewriteString(const StringPiece& rewrite, string* error) const;
491
492 // Returns the maximum submatch needed for the rewrite to be done by
493 // Replace(). E.g. if rewrite == "foo \\2,\\1", returns 2.
494 static int MaxSubmatch(const StringPiece& rewrite);
495
// Append the "rewrite" string, with backslash substitutions from "vec",
// to string "out".
// Returns true on success.  This method can fail because of a malformed
// rewrite string.  CheckRewriteString guarantees that the rewrite will
// be successful.
501 bool Rewrite(string *out,
502 const StringPiece &rewrite,
503 const StringPiece* vec,
504 int veclen) const;
505
506 // Constructor options
507 class Options {
508 public:
509 // The options are (defaults in parentheses):
510 //
511 // utf8 (true) text and pattern are UTF-8; otherwise Latin-1
512 // posix_syntax (false) restrict regexps to POSIX egrep syntax
513 // longest_match (false) search for longest match, not first match
514 // log_errors (true) log syntax and execution errors to ERROR
515 // max_mem (see below) approx. max memory footprint of RE2
516 // literal (false) interpret string as literal, not regexp
517 // never_nl (false) never match \n, even if it is in regexp
518 // dot_nl (false) dot matches everything including new line
519 // never_capture (false) parse all parens as non-capturing
520 // case_sensitive (true) match is case-sensitive (regexp can override
521 // with (?i) unless in posix_syntax mode)
522 //
523 // The following options are only consulted when posix_syntax == true.
524 // (When posix_syntax == false these features are always enabled and
525 // cannot be turned off.)
526 // perl_classes (false) allow Perl's \d \s \w \D \S \W
527 // word_boundary (false) allow Perl's \b \B (word boundary and not)
528 // one_line (false) ^ and $ only match beginning and end of text
529 //
530 // The max_mem option controls how much memory can be used
531 // to hold the compiled form of the regexp (the Prog) and
532 // its cached DFA graphs. Code Search placed limits on the number
533 // of Prog instructions and DFA states: 10,000 for both.
534 // In RE2, those limits would translate to about 240 KB per Prog
535 // and perhaps 2.5 MB per DFA (DFA state sizes vary by regexp; RE2 does a
536 // better job of keeping them small than Code Search did).
537 // Each RE2 has two Progs (one forward, one reverse), and each Prog
538 // can have two DFAs (one first match, one longest match).
539 // That makes 4 DFAs:
540 //
541 // forward, first-match - used for UNANCHORED or ANCHOR_LEFT searches
542 // if opt.longest_match() == false
543 // forward, longest-match - used for all ANCHOR_BOTH searches,
544 // and the other two kinds if
545 // opt.longest_match() == true
546 // reverse, first-match - never used
547 // reverse, longest-match - used as second phase for unanchored searches
548 //
549 // The RE2 memory budget is statically divided between the two
550 // Progs and then the DFAs: two thirds to the forward Prog
551 // and one third to the reverse Prog. The forward Prog gives half
552 // of what it has left over to each of its DFAs. The reverse Prog
553 // gives it all to its longest-match DFA.
554 //
555 // Once a DFA fills its budget, it flushes its cache and starts over.
556 // If this happens too often, RE2 falls back on the NFA implementation.
557
558 // For now, make the default budget something close to Code Search.
559 static const int kDefaultMaxMem = 8<<20;
560
561 enum Encoding {
562 EncodingUTF8 = 1,
563 EncodingLatin1
564 };
565
Options()566 Options() :
567 encoding_(EncodingUTF8),
568 posix_syntax_(false),
569 longest_match_(false),
570 log_errors_(true),
571 max_mem_(kDefaultMaxMem),
572 literal_(false),
573 never_nl_(false),
574 dot_nl_(false),
575 never_capture_(false),
576 case_sensitive_(true),
577 perl_classes_(false),
578 word_boundary_(false),
579 one_line_(false) {
580 }
581
582 /*implicit*/ Options(CannedOptions);
583
encoding()584 Encoding encoding() const { return encoding_; }
set_encoding(Encoding encoding)585 void set_encoding(Encoding encoding) { encoding_ = encoding; }
586
587 // Legacy interface to encoding.
588 // TODO(rsc): Remove once clients have been converted.
utf8()589 bool utf8() const { return encoding_ == EncodingUTF8; }
set_utf8(bool b)590 void set_utf8(bool b) {
591 if (b) {
592 encoding_ = EncodingUTF8;
593 } else {
594 encoding_ = EncodingLatin1;
595 }
596 }
597
posix_syntax()598 bool posix_syntax() const { return posix_syntax_; }
set_posix_syntax(bool b)599 void set_posix_syntax(bool b) { posix_syntax_ = b; }
600
longest_match()601 bool longest_match() const { return longest_match_; }
set_longest_match(bool b)602 void set_longest_match(bool b) { longest_match_ = b; }
603
log_errors()604 bool log_errors() const { return log_errors_; }
set_log_errors(bool b)605 void set_log_errors(bool b) { log_errors_ = b; }
606
max_mem()607 int64_t max_mem() const { return max_mem_; }
set_max_mem(int64_t m)608 void set_max_mem(int64_t m) { max_mem_ = m; }
609
literal()610 bool literal() const { return literal_; }
set_literal(bool b)611 void set_literal(bool b) { literal_ = b; }
612
never_nl()613 bool never_nl() const { return never_nl_; }
set_never_nl(bool b)614 void set_never_nl(bool b) { never_nl_ = b; }
615
dot_nl()616 bool dot_nl() const { return dot_nl_; }
set_dot_nl(bool b)617 void set_dot_nl(bool b) { dot_nl_ = b; }
618
never_capture()619 bool never_capture() const { return never_capture_; }
set_never_capture(bool b)620 void set_never_capture(bool b) { never_capture_ = b; }
621
case_sensitive()622 bool case_sensitive() const { return case_sensitive_; }
set_case_sensitive(bool b)623 void set_case_sensitive(bool b) { case_sensitive_ = b; }
624
perl_classes()625 bool perl_classes() const { return perl_classes_; }
set_perl_classes(bool b)626 void set_perl_classes(bool b) { perl_classes_ = b; }
627
word_boundary()628 bool word_boundary() const { return word_boundary_; }
set_word_boundary(bool b)629 void set_word_boundary(bool b) { word_boundary_ = b; }
630
one_line()631 bool one_line() const { return one_line_; }
set_one_line(bool b)632 void set_one_line(bool b) { one_line_ = b; }
633
Copy(const Options & src)634 void Copy(const Options& src) {
635 encoding_ = src.encoding_;
636 posix_syntax_ = src.posix_syntax_;
637 longest_match_ = src.longest_match_;
638 log_errors_ = src.log_errors_;
639 max_mem_ = src.max_mem_;
640 literal_ = src.literal_;
641 never_nl_ = src.never_nl_;
642 dot_nl_ = src.dot_nl_;
643 never_capture_ = src.never_capture_;
644 case_sensitive_ = src.case_sensitive_;
645 perl_classes_ = src.perl_classes_;
646 word_boundary_ = src.word_boundary_;
647 one_line_ = src.one_line_;
648 }
649
650 int ParseFlags() const;
651
652 private:
653 Encoding encoding_;
654 bool posix_syntax_;
655 bool longest_match_;
656 bool log_errors_;
657 int64_t max_mem_;
658 bool literal_;
659 bool never_nl_;
660 bool dot_nl_;
661 bool never_capture_;
662 bool case_sensitive_;
663 bool perl_classes_;
664 bool word_boundary_;
665 bool one_line_;
666
667 //DISALLOW_EVIL_CONSTRUCTORS(Options);
668 Options(const Options&);
669 void operator=(const Options&);
670 };
671
672 // Returns the options set in the constructor.
options()673 const Options& options() const { return options_; };
674
675 // Argument converters; see below.
676 static inline Arg CRadix(short* x);
677 static inline Arg CRadix(unsigned short* x);
678 static inline Arg CRadix(int* x);
679 static inline Arg CRadix(unsigned int* x);
680 static inline Arg CRadix(long* x);
681 static inline Arg CRadix(unsigned long* x);
682 #ifdef RE2_HAVE_LONGLONG
683 static inline Arg CRadix(long long* x);
684 static inline Arg CRadix(unsigned long long* x);
685 #endif
686
687 static inline Arg Hex(short* x);
688 static inline Arg Hex(unsigned short* x);
689 static inline Arg Hex(int* x);
690 static inline Arg Hex(unsigned int* x);
691 static inline Arg Hex(long* x);
692 static inline Arg Hex(unsigned long* x);
693 #ifdef RE2_HAVE_LONGLONG
694 static inline Arg Hex(long long* x);
695 static inline Arg Hex(unsigned long long* x);
696 #endif
697
698 static inline Arg Octal(short* x);
699 static inline Arg Octal(unsigned short* x);
700 static inline Arg Octal(int* x);
701 static inline Arg Octal(unsigned int* x);
702 static inline Arg Octal(long* x);
703 static inline Arg Octal(unsigned long* x);
704 #ifdef RE2_HAVE_LONGLONG
705 static inline Arg Octal(long long* x);
706 static inline Arg Octal(unsigned long long* x);
707 #endif
708
709 private:
710 void Init(const StringPiece& pattern, const Options& options);
711
712 bool DoMatch(const StringPiece& text,
713 Anchor anchor,
714 int* consumed,
715 const Arg* const args[],
716 int n) const;
717
718 re2::Prog* ReverseProg() const;
719
720 mutable Mutex* mutex_;
721 string pattern_; // string regular expression
722 Options options_; // option flags
723 string prefix_; // required prefix (before regexp_)
724 bool prefix_foldcase_; // prefix is ASCII case-insensitive
725 re2::Regexp* entire_regexp_; // parsed regular expression
726 re2::Regexp* suffix_regexp_; // parsed regular expression, prefix removed
727 re2::Prog* prog_; // compiled program for regexp
728 mutable re2::Prog* rprog_; // reverse program for regexp
729 bool is_one_pass_; // can use prog_->SearchOnePass?
730 mutable const string* error_; // Error indicator
731 // (or points to empty string)
732 mutable ErrorCode error_code_; // Error code
733 mutable string error_arg_; // Fragment of regexp showing error
734 mutable int num_captures_; // Number of capturing groups
735
736 // Map from capture names to indices
737 mutable const map<string, int>* named_groups_;
738
739 // Map from capture indices to names
740 mutable const map<int, string>* group_names_;
741
742 //DISALLOW_EVIL_CONSTRUCTORS(RE2);
743 RE2(const RE2&);
744 void operator=(const RE2&);
745 };
746
747 /***** Implementation details *****/
748
749 // Hex/Octal/Binary?
750
// Special class for parsing into objects that define a ParseFrom() method.
// T must provide a member callable as
//   bool T::ParseFrom(const char* str, int n)
template <class T>
class _RE2_MatchObject {
 public:
  // Parses the n bytes at str into the T object that dest points to by
  // calling T::ParseFrom(str, n), and returns that call's result.
  // A NULL dest means there is nowhere to store the value: ParseFrom is
  // not called and true is returned.
  static inline bool Parse(const char* str, int n, void* dest) {
    if (dest == NULL) return true;
    // dest is a type-erased T*; static_cast is the named cast intended
    // for converting void* back to the original object pointer type.
    T* object = static_cast<T*>(dest);
    return object->ParseFrom(str, n);
  }
};
761
762 class RE2::Arg {
763 public:
764 // Empty constructor so we can declare arrays of RE2::Arg
765 Arg();
766
767 // Constructor specially designed for NULL arguments
768 Arg(void*);
769
770 typedef bool (*Parser)(const char* str, int n, void* dest);
771
772 // Type-specific parsers
773 #define MAKE_PARSER(type,name) \
774 Arg(type* p) : arg_(p), parser_(name) { } \
775 Arg(type* p, Parser parser) : arg_(p), parser_(parser) { } \
776
777
778 MAKE_PARSER(char, parse_char);
779 MAKE_PARSER(signed char, parse_char);
780 MAKE_PARSER(unsigned char, parse_uchar);
781 MAKE_PARSER(short, parse_short);
782 MAKE_PARSER(unsigned short, parse_ushort);
783 MAKE_PARSER(int, parse_int);
784 MAKE_PARSER(unsigned int, parse_uint);
785 MAKE_PARSER(long, parse_long);
786 MAKE_PARSER(unsigned long, parse_ulong);
787 #ifdef RE2_HAVE_LONGLONG
788 MAKE_PARSER(long long, parse_longlong);
789 MAKE_PARSER(unsigned long long, parse_ulonglong);
790 #endif
791 MAKE_PARSER(float, parse_float);
792 MAKE_PARSER(double, parse_double);
793 MAKE_PARSER(string, parse_string);
794 MAKE_PARSER(StringPiece, parse_stringpiece);
795
796 #undef MAKE_PARSER
797
798 // Generic constructor
799 template <class T> Arg(T*, Parser parser);
800 // Generic constructor template
Arg(T * p)801 template <class T> Arg(T* p)
802 : arg_(p), parser_(_RE2_MatchObject<T>::Parse) {
803 }
804
805 // Parse the data
806 bool Parse(const char* str, int n) const;
807
808 private:
809 void* arg_;
810 Parser parser_;
811
812 static bool parse_null (const char* str, int n, void* dest);
813 static bool parse_char (const char* str, int n, void* dest);
814 static bool parse_uchar (const char* str, int n, void* dest);
815 static bool parse_float (const char* str, int n, void* dest);
816 static bool parse_double (const char* str, int n, void* dest);
817 static bool parse_string (const char* str, int n, void* dest);
818 static bool parse_stringpiece (const char* str, int n, void* dest);
819
820 #define DECLARE_INTEGER_PARSER(name) \
821 private: \
822 static bool parse_ ## name(const char* str, int n, void* dest); \
823 static bool parse_ ## name ## _radix( \
824 const char* str, int n, void* dest, int radix); \
825 public: \
826 static bool parse_ ## name ## _hex(const char* str, int n, void* dest); \
827 static bool parse_ ## name ## _octal(const char* str, int n, void* dest); \
828 static bool parse_ ## name ## _cradix(const char* str, int n, void* dest)
829
830 DECLARE_INTEGER_PARSER(short);
831 DECLARE_INTEGER_PARSER(ushort);
832 DECLARE_INTEGER_PARSER(int);
833 DECLARE_INTEGER_PARSER(uint);
834 DECLARE_INTEGER_PARSER(long);
835 DECLARE_INTEGER_PARSER(ulong);
836 #ifdef RE2_HAVE_LONGLONG
837 DECLARE_INTEGER_PARSER(longlong);
838 DECLARE_INTEGER_PARSER(ulonglong);
839 #endif
840
841 #undef DECLARE_INTEGER_PARSER
842 };
843
Arg()844 inline RE2::Arg::Arg() : arg_(NULL), parser_(parse_null) { }
Arg(void * p)845 inline RE2::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { }
846
Parse(const char * str,int n)847 inline bool RE2::Arg::Parse(const char* str, int n) const {
848 return (*parser_)(str, n, arg_);
849 }
850
851 // This part of the parser, appropriate only for ints, deals with bases
852 #define MAKE_INTEGER_PARSER(type, name) \
853 inline RE2::Arg RE2::Hex(type* ptr) { \
854 return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _hex); } \
855 inline RE2::Arg RE2::Octal(type* ptr) { \
856 return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _octal); } \
857 inline RE2::Arg RE2::CRadix(type* ptr) { \
858 return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _cradix); }
859
860 MAKE_INTEGER_PARSER(short, short)
861 MAKE_INTEGER_PARSER(unsigned short, ushort)
862 MAKE_INTEGER_PARSER(int, int)
863 MAKE_INTEGER_PARSER(unsigned int, uint)
864 MAKE_INTEGER_PARSER(long, long)
865 MAKE_INTEGER_PARSER(unsigned long, ulong)
866 #ifdef RE2_HAVE_LONGLONG
867 MAKE_INTEGER_PARSER(long long, longlong)
868 MAKE_INTEGER_PARSER(unsigned long long, ulonglong)
869 #endif
870
871 #undef MAKE_INTEGER_PARSER
872
873 } // namespace re2
874
875 using re2::RE2;
876
877 #endif /* RE2_RE2_H */
878