1 //-
2 // Copyright 2017 Jason Lingle
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 //! Strategies for generating strings and byte strings from regular
11 //! expressions.
12 
13 use crate::std_facade::{Box, Cow, String, ToOwned, Vec};
14 use core::fmt;
15 use core::mem;
16 use core::ops::RangeInclusive;
17 use core::u32;
18 
19 use regex_syntax::hir::{
20     self, Hir,
21     HirKind::*,
22     Literal::*,
23     RepetitionKind::{self, *},
24     RepetitionRange::*,
25 };
26 use regex_syntax::{Error as ParseError, Parser};
27 
28 use crate::bool;
29 use crate::char;
30 use crate::collection::{size_range, vec, SizeRange};
31 use crate::strategy::*;
32 use crate::test_runner::*;
33 
34 /// Wraps the regex that forms the `Strategy` for `String` so that a sensible
35 /// `Default` can be given. The default is a string of non-control characters.
36 #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
37 pub struct StringParam(&'static str);
38 
39 impl From<StringParam> for &'static str {
from(x: StringParam) -> Self40     fn from(x: StringParam) -> Self {
41         x.0
42     }
43 }
44 
45 impl From<&'static str> for StringParam {
from(x: &'static str) -> Self46     fn from(x: &'static str) -> Self {
47         StringParam(x)
48     }
49 }
50 
51 impl Default for StringParam {
default() -> Self52     fn default() -> Self {
53         StringParam("\\PC*")
54     }
55 }
56 
57 // quick_error! uses bare trait objects, so we enclose its invocation here in a
58 // module so the lint can be disabled just for it.
59 #[allow(bare_trait_objects)]
60 mod error_container {
61     use super::*;
62 
63     quick_error! {
64         /// Errors which may occur when preparing a regular expression for use with
65         /// string generation.
66         #[derive(Debug)]
67         pub enum Error {
68             /// The string passed as the regex was not syntactically valid.
69             RegexSyntax(err: ParseError) {
70                 from()
71                     source(err)
72                     display("{}", err)
73             }
74             /// The regex was syntactically valid, but contains elements not
75             /// supported by proptest.
76             UnsupportedRegex(message: &'static str) {
77                 display("{}", message)
78             }
79         }
80     }
81 }
82 
83 pub use self::error_container::Error;
84 
85 opaque_strategy_wrapper! {
86     /// Strategy which generates values (i.e., `String` or `Vec<u8>`) matching
87     /// a regular expression.
88     ///
89     /// Created by various functions in this module.
90     #[derive(Debug)]
91     pub struct RegexGeneratorStrategy[<T>][where T : fmt::Debug]
92         (SBoxedStrategy<T>) -> RegexGeneratorValueTree<T>;
93     /// `ValueTree` corresponding to `RegexGeneratorStrategy`.
94     pub struct RegexGeneratorValueTree[<T>][where T : fmt::Debug]
95         (Box<dyn ValueTree<Value = T>>) -> T;
96 }
97 
98 impl Strategy for str {
99     type Tree = RegexGeneratorValueTree<String>;
100     type Value = String;
101 
new_tree(&self, runner: &mut TestRunner) -> NewTree<Self>102     fn new_tree(&self, runner: &mut TestRunner) -> NewTree<Self> {
103         string_regex(self).unwrap().new_tree(runner)
104     }
105 }
106 
107 type ParseResult<T> = Result<RegexGeneratorStrategy<T>, Error>;
108 
109 #[doc(hidden)]
110 /// A type which knows how to produce a `Strategy` from a regular expression
111 /// generating the type.
112 ///
113 /// This trait exists for the benefit of `#[proptest(regex = "...")]`.
114 /// It is semver exempt, so use at your own risk.
115 /// If you found a use for the trait beyond `Vec<u8>` and `String`,
116 /// please file an issue at https://github.com/AltSysrq/proptest.
117 pub trait StrategyFromRegex: Sized + fmt::Debug {
118     type Strategy: Strategy<Value = Self>;
119 
120     /// Produce a strategy for `Self` from the `regex`.
from_regex(regex: &str) -> Self::Strategy121     fn from_regex(regex: &str) -> Self::Strategy;
122 }
123 
124 impl StrategyFromRegex for String {
125     type Strategy = RegexGeneratorStrategy<Self>;
126 
from_regex(regex: &str) -> Self::Strategy127     fn from_regex(regex: &str) -> Self::Strategy {
128         string_regex(regex).unwrap()
129     }
130 }
131 
132 impl StrategyFromRegex for Vec<u8> {
133     type Strategy = RegexGeneratorStrategy<Self>;
134 
from_regex(regex: &str) -> Self::Strategy135     fn from_regex(regex: &str) -> Self::Strategy {
136         bytes_regex(regex).unwrap()
137     }
138 }
139 
140 /// Creates a strategy which generates strings matching the given regular
141 /// expression.
142 ///
143 /// If you don't need error handling and aren't limited by setup time, it is
144 /// also possible to directly use a `&str` as a strategy with the same effect.
string_regex(regex: &str) -> ParseResult<String>145 pub fn string_regex(regex: &str) -> ParseResult<String> {
146     string_regex_parsed(&regex_to_hir(regex)?)
147 }
148 
149 /// Like `string_regex()`, but allows providing a pre-parsed expression.
string_regex_parsed(expr: &Hir) -> ParseResult<String>150 pub fn string_regex_parsed(expr: &Hir) -> ParseResult<String> {
151     bytes_regex_parsed(expr)
152         .map(|v| {
153             v.prop_map(|bytes| {
154                 String::from_utf8(bytes).expect("non-utf8 string")
155             })
156             .sboxed()
157         })
158         .map(RegexGeneratorStrategy)
159 }
160 
161 /// Creates a strategy which generates byte strings matching the given regular
162 /// expression.
bytes_regex(regex: &str) -> ParseResult<Vec<u8>>163 pub fn bytes_regex(regex: &str) -> ParseResult<Vec<u8>> {
164     bytes_regex_parsed(&regex_to_hir(regex)?)
165 }
166 
167 /// Like `bytes_regex()`, but allows providing a pre-parsed expression.
bytes_regex_parsed(expr: &Hir) -> ParseResult<Vec<u8>>168 pub fn bytes_regex_parsed(expr: &Hir) -> ParseResult<Vec<u8>> {
169     match expr.kind() {
170         Empty => Ok(Just(vec![]).sboxed()),
171 
172         Literal(lit) => Ok(Just(match lit {
173             Unicode(scalar) => to_bytes(*scalar),
174             Byte(byte) => vec![*byte],
175         })
176         .sboxed()),
177 
178         Class(class) => Ok(match class {
179             hir::Class::Unicode(class) => {
180                 unicode_class_strategy(class).prop_map(to_bytes).sboxed()
181             }
182             hir::Class::Bytes(class) => {
183                 let subs = class.iter().map(|r| r.start()..=r.end());
184                 Union::new(subs).prop_map(|b| vec![b]).sboxed()
185             }
186         }),
187 
188         Repetition(rep) => Ok(vec(
189             bytes_regex_parsed(&rep.hir)?,
190             to_range(rep.kind.clone())?,
191         )
192         .prop_map(|parts| {
193             parts.into_iter().fold(vec![], |mut acc, child| {
194                 acc.extend(child);
195                 acc
196             })
197         })
198         .sboxed()),
199 
200         Group(group) => bytes_regex_parsed(&group.hir).map(|v| v.0),
201 
202         Concat(subs) => {
203             let subs = ConcatIter {
204                 iter: subs.iter(),
205                 buf: vec![],
206                 next: None,
207             };
208             let ext = |(mut lhs, rhs): (Vec<_>, _)| {
209                 lhs.extend(rhs);
210                 lhs
211             };
212             Ok(subs
213                 .fold(Ok(None), |accum: Result<_, Error>, rhs| {
214                     Ok(match accum? {
215                         None => Some(rhs?.sboxed()),
216                         Some(accum) => {
217                             Some((accum, rhs?).prop_map(ext).sboxed())
218                         }
219                     })
220                 })?
221                 .unwrap_or_else(|| Just(vec![]).sboxed()))
222         }
223 
224         Alternation(subs) => {
225             Ok(Union::try_new(subs.iter().map(bytes_regex_parsed))?.sboxed())
226         }
227 
228         Anchor(_) => {
229             unsupported("line/text anchors not supported for string generation")
230         }
231 
232         WordBoundary(_) => unsupported(
233             "word boundary tests not supported for string generation",
234         ),
235     }
236     .map(RegexGeneratorStrategy)
237 }
238 
unicode_class_strategy( class: &hir::ClassUnicode, ) -> char::CharStrategy<'static>239 fn unicode_class_strategy(
240     class: &hir::ClassUnicode,
241 ) -> char::CharStrategy<'static> {
242     static NONL_RANGES: &[RangeInclusive<char>] = &[
243         '\x00'..='\x09',
244         // Multiple instances of the latter range to partially make up
245         // for the bias of having such a tiny range in the control
246         // characters.
247         '\x0B'..=::core::char::MAX,
248         '\x0B'..=::core::char::MAX,
249         '\x0B'..=::core::char::MAX,
250         '\x0B'..=::core::char::MAX,
251         '\x0B'..=::core::char::MAX,
252     ];
253 
254     let dotnnl = |x: &hir::ClassUnicodeRange, y: &hir::ClassUnicodeRange| {
255         x.start() == '\0'
256             && x.end() == '\x09'
257             && y.start() == '\x0B'
258             && y.end() == '\u{10FFFF}'
259     };
260 
261     char::ranges(match class.ranges() {
262         [x, y] if dotnnl(x, y) || dotnnl(y, x) => Cow::Borrowed(NONL_RANGES),
263         _ => Cow::Owned(class.iter().map(|r| r.start()..=r.end()).collect()),
264     })
265 }
266 
267 struct ConcatIter<'a, I> {
268     buf: Vec<u8>,
269     iter: I,
270     next: Option<&'a Hir>,
271 }
272 
flush_lit_buf<I>( it: &mut ConcatIter<'_, I>, ) -> Option<ParseResult<Vec<u8>>>273 fn flush_lit_buf<I>(
274     it: &mut ConcatIter<'_, I>,
275 ) -> Option<ParseResult<Vec<u8>>> {
276     Some(Ok(RegexGeneratorStrategy(
277         Just(mem::replace(&mut it.buf, vec![])).sboxed(),
278     )))
279 }
280 
281 impl<'a, I: Iterator<Item = &'a Hir>> Iterator for ConcatIter<'a, I> {
282     type Item = ParseResult<Vec<u8>>;
283 
next(&mut self) -> Option<Self::Item>284     fn next(&mut self) -> Option<Self::Item> {
285         // A left-over node, process it first:
286         if let Some(next) = self.next.take() {
287             return Some(bytes_regex_parsed(next));
288         }
289 
290         // Accumulate a literal sequence as long as we can:
291         while let Some(next) = self.iter.next() {
292             match next.kind() {
293                 // A literal. Accumulate:
294                 Literal(Unicode(scalar)) => self.buf.extend(to_bytes(*scalar)),
295                 Literal(Byte(byte)) => self.buf.push(*byte),
296                 // Encountered a non-literal.
297                 _ => {
298                     return if !self.buf.is_empty() {
299                         // We've accumulated a literal from before, flush it out.
300                         // Store this node so we deal with it the next call.
301                         self.next = Some(next);
302                         flush_lit_buf(self)
303                     } else {
304                         // We didn't; just yield this node.
305                         Some(bytes_regex_parsed(next))
306                     };
307                 }
308             }
309         }
310 
311         // Flush out any accumulated literal from before.
312         if !self.buf.is_empty() {
313             flush_lit_buf(self)
314         } else {
315             self.next.take().map(bytes_regex_parsed)
316         }
317     }
318 }
319 
to_range(kind: RepetitionKind) -> Result<SizeRange, Error>320 fn to_range(kind: RepetitionKind) -> Result<SizeRange, Error> {
321     Ok(match kind {
322         ZeroOrOne => size_range(0..=1),
323         ZeroOrMore => size_range(0..=32),
324         OneOrMore => size_range(1..=32),
325         Range(range) => match range {
326             Exactly(count) if u32::MAX == count => {
327                 return unsupported(
328                     "Cannot have repetition of exactly u32::MAX",
329                 )
330             }
331             Exactly(count) => size_range(count as usize),
332             AtLeast(min) => {
333                 let max = if min < u32::MAX as u32 / 2 {
334                     min as usize * 2
335                 } else {
336                     u32::MAX as usize
337                 };
338                 size_range((min as usize)..max)
339             }
340             Bounded(_, max) if u32::MAX == max => {
341                 return unsupported("Cannot have repetition max of u32::MAX")
342             }
343             Bounded(min, max) => size_range((min as usize)..(max as usize + 1)),
344         },
345     })
346 }
347 
to_bytes(khar: char) -> Vec<u8>348 fn to_bytes(khar: char) -> Vec<u8> {
349     let mut buf = [0u8; 4];
350     khar.encode_utf8(&mut buf).as_bytes().to_owned()
351 }
352 
regex_to_hir(pattern: &str) -> Result<Hir, Error>353 fn regex_to_hir(pattern: &str) -> Result<Hir, Error> {
354     Ok(Parser::new().parse(pattern)?)
355 }
356 
unsupported<T>(error: &'static str) -> Result<T, Error>357 fn unsupported<T>(error: &'static str) -> Result<T, Error> {
358     Err(Error::UnsupportedRegex(error))
359 }
360 
361 #[cfg(test)]
362 mod test {
363     use std::collections::HashSet;
364 
365     use regex::Regex;
366 
367     use super::*;
368 
do_test( pattern: &str, min_distinct: usize, max_distinct: usize, iterations: usize, )369     fn do_test(
370         pattern: &str,
371         min_distinct: usize,
372         max_distinct: usize,
373         iterations: usize,
374     ) {
375         let generated = generate_values_matching_regex(pattern, iterations);
376         assert!(
377             generated.len() >= min_distinct,
378             "Expected to generate at least {} strings, but only \
379              generated {}",
380             min_distinct,
381             generated.len()
382         );
383         assert!(
384             generated.len() <= max_distinct,
385             "Expected to generate at most {} strings, but \
386              generated {}",
387             max_distinct,
388             generated.len()
389         );
390     }
391 
generate_values_matching_regex( pattern: &str, iterations: usize, ) -> HashSet<String>392     fn generate_values_matching_regex(
393         pattern: &str,
394         iterations: usize,
395     ) -> HashSet<String> {
396         let rx = Regex::new(pattern).unwrap();
397         let mut generated = HashSet::new();
398 
399         let strategy = string_regex(pattern).unwrap();
400         let mut runner = TestRunner::deterministic();
401         for _ in 0..iterations {
402             let mut value = strategy.new_tree(&mut runner).unwrap();
403 
404             loop {
405                 let s = value.current();
406                 let ok = if let Some(matsch) = rx.find(&s) {
407                     0 == matsch.start() && s.len() == matsch.end()
408                 } else {
409                     false
410                 };
411                 if !ok {
412                     panic!(
413                         "Generated string {:?} which does not match {:?}",
414                         s, pattern
415                     );
416                 }
417 
418                 generated.insert(s);
419 
420                 if !value.simplify() {
421                     break;
422                 }
423             }
424         }
425         generated
426     }
427 
428     #[test]
test_case_insensitive_produces_all_available_values()429     fn test_case_insensitive_produces_all_available_values() {
430         let mut expected: HashSet<String> = HashSet::new();
431         expected.insert("a".into());
432         expected.insert("b".into());
433         expected.insert("A".into());
434         expected.insert("B".into());
435         assert_eq!(generate_values_matching_regex("(?i:a|B)", 64), expected);
436     }
437 
438     #[test]
test_literal()439     fn test_literal() {
440         do_test("foo", 1, 1, 8);
441     }
442 
443     #[test]
test_casei_literal()444     fn test_casei_literal() {
445         do_test("(?i:fOo)", 8, 8, 64);
446     }
447 
448     #[test]
test_alternation()449     fn test_alternation() {
450         do_test("foo|bar|baz", 3, 3, 16);
451     }
452 
453     #[test]
test_repitition()454     fn test_repitition() {
455         do_test("a{0,8}", 9, 9, 64);
456     }
457 
458     #[test]
test_question()459     fn test_question() {
460         do_test("a?", 2, 2, 16);
461     }
462 
463     #[test]
test_star()464     fn test_star() {
465         do_test("a*", 33, 33, 256);
466     }
467 
468     #[test]
test_plus()469     fn test_plus() {
470         do_test("a+", 32, 32, 256);
471     }
472 
473     #[test]
test_n_to_range()474     fn test_n_to_range() {
475         do_test("a{4,}", 4, 4, 64);
476     }
477 
478     #[test]
test_concatenation()479     fn test_concatenation() {
480         do_test("(foo|bar)(xyzzy|plugh)", 4, 4, 32);
481     }
482 
483     #[test]
test_ascii_class()484     fn test_ascii_class() {
485         do_test("[[:digit:]]", 10, 10, 256);
486     }
487 
488     #[test]
test_unicode_class()489     fn test_unicode_class() {
490         do_test("\\p{Greek}", 24, 512, 256);
491     }
492 
493     #[test]
test_dot()494     fn test_dot() {
495         do_test(".", 200, 65536, 256);
496     }
497 
498     #[test]
test_dot_s()499     fn test_dot_s() {
500         do_test("(?s).", 200, 65536, 256);
501     }
502 
503     #[test]
test_backslash_d_plus()504     fn test_backslash_d_plus() {
505         do_test("\\d+", 1, 65536, 256);
506     }
507 
assert_send_and_sync<T: Send + Sync>(_: T)508     fn assert_send_and_sync<T: Send + Sync>(_: T) {}
509 
510     #[test]
regex_strategy_is_send_and_sync()511     fn regex_strategy_is_send_and_sync() {
512         assert_send_and_sync(string_regex(".").unwrap());
513     }
514 
515     macro_rules! consistent {
516         ($name:ident, $value:expr) => {
517             #[test]
518             fn $name() {
519                 test_generates_matching_strings($value);
520             }
521         };
522     }
523 
test_generates_matching_strings(pattern: &str)524     fn test_generates_matching_strings(pattern: &str) {
525         use std::time;
526 
527         let mut runner = TestRunner::default();
528         let start = time::Instant::now();
529 
530         // If we don't support this regex, just move on quietly
531         if let Ok(strategy) = string_regex(pattern) {
532             let rx = Regex::new(pattern).unwrap();
533 
534             for _ in 0..1000 {
535                 let mut val = strategy.new_tree(&mut runner).unwrap();
536                 // No more than 1000 simplify steps to keep test time down
537                 for _ in 0..1000 {
538                     let s = val.current();
539                     assert!(
540                         rx.is_match(&s),
541                         "Produced string {:?}, which does not match {:?}",
542                         s,
543                         pattern
544                     );
545 
546                     if !val.simplify() {
547                         break;
548                     }
549                 }
550 
551                 // Quietly stop testing if we've run for >10 s
552                 if start.elapsed().as_secs() > 10 {
553                     break;
554                 }
555             }
556         }
557     }
558 
559     include!("regex-contrib/crates_regex.rs");
560 }
561