1 //-
2 // Copyright 2017 Jason Lingle
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9
10 //! Strategies for generating strings and byte strings from regular
11 //! expressions.
12
13 use crate::std_facade::{Box, Cow, String, ToOwned, Vec};
14 use core::fmt;
15 use core::mem;
16 use core::ops::RangeInclusive;
17 use core::u32;
18
19 use regex_syntax::hir::{
20 self, Hir,
21 HirKind::*,
22 Literal::*,
23 RepetitionKind::{self, *},
24 RepetitionRange::*,
25 };
26 use regex_syntax::{Error as ParseError, Parser};
27
28 use crate::bool;
29 use crate::char;
30 use crate::collection::{size_range, vec, SizeRange};
31 use crate::strategy::*;
32 use crate::test_runner::*;
33
34 /// Wraps the regex that forms the `Strategy` for `String` so that a sensible
35 /// `Default` can be given. The default is a string of non-control characters.
36 #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
37 pub struct StringParam(&'static str);
38
39 impl From<StringParam> for &'static str {
from(x: StringParam) -> Self40 fn from(x: StringParam) -> Self {
41 x.0
42 }
43 }
44
45 impl From<&'static str> for StringParam {
from(x: &'static str) -> Self46 fn from(x: &'static str) -> Self {
47 StringParam(x)
48 }
49 }
50
51 impl Default for StringParam {
default() -> Self52 fn default() -> Self {
53 StringParam("\\PC*")
54 }
55 }
56
57 // quick_error! uses bare trait objects, so we enclose its invocation here in a
58 // module so the lint can be disabled just for it.
59 #[allow(bare_trait_objects)]
60 mod error_container {
61 use super::*;
62
63 quick_error! {
64 /// Errors which may occur when preparing a regular expression for use with
65 /// string generation.
66 #[derive(Debug)]
67 pub enum Error {
68 /// The string passed as the regex was not syntactically valid.
69 RegexSyntax(err: ParseError) {
70 from()
71 source(err)
72 display("{}", err)
73 }
74 /// The regex was syntactically valid, but contains elements not
75 /// supported by proptest.
76 UnsupportedRegex(message: &'static str) {
77 display("{}", message)
78 }
79 }
80 }
81 }
82
83 pub use self::error_container::Error;
84
85 opaque_strategy_wrapper! {
86 /// Strategy which generates values (i.e., `String` or `Vec<u8>`) matching
87 /// a regular expression.
88 ///
89 /// Created by various functions in this module.
90 #[derive(Debug)]
91 pub struct RegexGeneratorStrategy[<T>][where T : fmt::Debug]
92 (SBoxedStrategy<T>) -> RegexGeneratorValueTree<T>;
93 /// `ValueTree` corresponding to `RegexGeneratorStrategy`.
94 pub struct RegexGeneratorValueTree[<T>][where T : fmt::Debug]
95 (Box<dyn ValueTree<Value = T>>) -> T;
96 }
97
98 impl Strategy for str {
99 type Tree = RegexGeneratorValueTree<String>;
100 type Value = String;
101
new_tree(&self, runner: &mut TestRunner) -> NewTree<Self>102 fn new_tree(&self, runner: &mut TestRunner) -> NewTree<Self> {
103 string_regex(self).unwrap().new_tree(runner)
104 }
105 }
106
107 type ParseResult<T> = Result<RegexGeneratorStrategy<T>, Error>;
108
109 #[doc(hidden)]
110 /// A type which knows how to produce a `Strategy` from a regular expression
111 /// generating the type.
112 ///
113 /// This trait exists for the benefit of `#[proptest(regex = "...")]`.
114 /// It is semver exempt, so use at your own risk.
115 /// If you found a use for the trait beyond `Vec<u8>` and `String`,
116 /// please file an issue at https://github.com/AltSysrq/proptest.
117 pub trait StrategyFromRegex: Sized + fmt::Debug {
118 type Strategy: Strategy<Value = Self>;
119
120 /// Produce a strategy for `Self` from the `regex`.
from_regex(regex: &str) -> Self::Strategy121 fn from_regex(regex: &str) -> Self::Strategy;
122 }
123
124 impl StrategyFromRegex for String {
125 type Strategy = RegexGeneratorStrategy<Self>;
126
from_regex(regex: &str) -> Self::Strategy127 fn from_regex(regex: &str) -> Self::Strategy {
128 string_regex(regex).unwrap()
129 }
130 }
131
132 impl StrategyFromRegex for Vec<u8> {
133 type Strategy = RegexGeneratorStrategy<Self>;
134
from_regex(regex: &str) -> Self::Strategy135 fn from_regex(regex: &str) -> Self::Strategy {
136 bytes_regex(regex).unwrap()
137 }
138 }
139
140 /// Creates a strategy which generates strings matching the given regular
141 /// expression.
142 ///
143 /// If you don't need error handling and aren't limited by setup time, it is
144 /// also possible to directly use a `&str` as a strategy with the same effect.
string_regex(regex: &str) -> ParseResult<String>145 pub fn string_regex(regex: &str) -> ParseResult<String> {
146 string_regex_parsed(®ex_to_hir(regex)?)
147 }
148
149 /// Like `string_regex()`, but allows providing a pre-parsed expression.
string_regex_parsed(expr: &Hir) -> ParseResult<String>150 pub fn string_regex_parsed(expr: &Hir) -> ParseResult<String> {
151 bytes_regex_parsed(expr)
152 .map(|v| {
153 v.prop_map(|bytes| {
154 String::from_utf8(bytes).expect("non-utf8 string")
155 })
156 .sboxed()
157 })
158 .map(RegexGeneratorStrategy)
159 }
160
161 /// Creates a strategy which generates byte strings matching the given regular
162 /// expression.
bytes_regex(regex: &str) -> ParseResult<Vec<u8>>163 pub fn bytes_regex(regex: &str) -> ParseResult<Vec<u8>> {
164 bytes_regex_parsed(®ex_to_hir(regex)?)
165 }
166
167 /// Like `bytes_regex()`, but allows providing a pre-parsed expression.
bytes_regex_parsed(expr: &Hir) -> ParseResult<Vec<u8>>168 pub fn bytes_regex_parsed(expr: &Hir) -> ParseResult<Vec<u8>> {
169 match expr.kind() {
170 Empty => Ok(Just(vec![]).sboxed()),
171
172 Literal(lit) => Ok(Just(match lit {
173 Unicode(scalar) => to_bytes(*scalar),
174 Byte(byte) => vec![*byte],
175 })
176 .sboxed()),
177
178 Class(class) => Ok(match class {
179 hir::Class::Unicode(class) => {
180 unicode_class_strategy(class).prop_map(to_bytes).sboxed()
181 }
182 hir::Class::Bytes(class) => {
183 let subs = class.iter().map(|r| r.start()..=r.end());
184 Union::new(subs).prop_map(|b| vec![b]).sboxed()
185 }
186 }),
187
188 Repetition(rep) => Ok(vec(
189 bytes_regex_parsed(&rep.hir)?,
190 to_range(rep.kind.clone())?,
191 )
192 .prop_map(|parts| {
193 parts.into_iter().fold(vec![], |mut acc, child| {
194 acc.extend(child);
195 acc
196 })
197 })
198 .sboxed()),
199
200 Group(group) => bytes_regex_parsed(&group.hir).map(|v| v.0),
201
202 Concat(subs) => {
203 let subs = ConcatIter {
204 iter: subs.iter(),
205 buf: vec![],
206 next: None,
207 };
208 let ext = |(mut lhs, rhs): (Vec<_>, _)| {
209 lhs.extend(rhs);
210 lhs
211 };
212 Ok(subs
213 .fold(Ok(None), |accum: Result<_, Error>, rhs| {
214 Ok(match accum? {
215 None => Some(rhs?.sboxed()),
216 Some(accum) => {
217 Some((accum, rhs?).prop_map(ext).sboxed())
218 }
219 })
220 })?
221 .unwrap_or_else(|| Just(vec![]).sboxed()))
222 }
223
224 Alternation(subs) => {
225 Ok(Union::try_new(subs.iter().map(bytes_regex_parsed))?.sboxed())
226 }
227
228 Anchor(_) => {
229 unsupported("line/text anchors not supported for string generation")
230 }
231
232 WordBoundary(_) => unsupported(
233 "word boundary tests not supported for string generation",
234 ),
235 }
236 .map(RegexGeneratorStrategy)
237 }
238
unicode_class_strategy( class: &hir::ClassUnicode, ) -> char::CharStrategy<'static>239 fn unicode_class_strategy(
240 class: &hir::ClassUnicode,
241 ) -> char::CharStrategy<'static> {
242 static NONL_RANGES: &[RangeInclusive<char>] = &[
243 '\x00'..='\x09',
244 // Multiple instances of the latter range to partially make up
245 // for the bias of having such a tiny range in the control
246 // characters.
247 '\x0B'..=::core::char::MAX,
248 '\x0B'..=::core::char::MAX,
249 '\x0B'..=::core::char::MAX,
250 '\x0B'..=::core::char::MAX,
251 '\x0B'..=::core::char::MAX,
252 ];
253
254 let dotnnl = |x: &hir::ClassUnicodeRange, y: &hir::ClassUnicodeRange| {
255 x.start() == '\0'
256 && x.end() == '\x09'
257 && y.start() == '\x0B'
258 && y.end() == '\u{10FFFF}'
259 };
260
261 char::ranges(match class.ranges() {
262 [x, y] if dotnnl(x, y) || dotnnl(y, x) => Cow::Borrowed(NONL_RANGES),
263 _ => Cow::Owned(class.iter().map(|r| r.start()..=r.end()).collect()),
264 })
265 }
266
267 struct ConcatIter<'a, I> {
268 buf: Vec<u8>,
269 iter: I,
270 next: Option<&'a Hir>,
271 }
272
flush_lit_buf<I>( it: &mut ConcatIter<'_, I>, ) -> Option<ParseResult<Vec<u8>>>273 fn flush_lit_buf<I>(
274 it: &mut ConcatIter<'_, I>,
275 ) -> Option<ParseResult<Vec<u8>>> {
276 Some(Ok(RegexGeneratorStrategy(
277 Just(mem::replace(&mut it.buf, vec![])).sboxed(),
278 )))
279 }
280
281 impl<'a, I: Iterator<Item = &'a Hir>> Iterator for ConcatIter<'a, I> {
282 type Item = ParseResult<Vec<u8>>;
283
next(&mut self) -> Option<Self::Item>284 fn next(&mut self) -> Option<Self::Item> {
285 // A left-over node, process it first:
286 if let Some(next) = self.next.take() {
287 return Some(bytes_regex_parsed(next));
288 }
289
290 // Accumulate a literal sequence as long as we can:
291 while let Some(next) = self.iter.next() {
292 match next.kind() {
293 // A literal. Accumulate:
294 Literal(Unicode(scalar)) => self.buf.extend(to_bytes(*scalar)),
295 Literal(Byte(byte)) => self.buf.push(*byte),
296 // Encountered a non-literal.
297 _ => {
298 return if !self.buf.is_empty() {
299 // We've accumulated a literal from before, flush it out.
300 // Store this node so we deal with it the next call.
301 self.next = Some(next);
302 flush_lit_buf(self)
303 } else {
304 // We didn't; just yield this node.
305 Some(bytes_regex_parsed(next))
306 };
307 }
308 }
309 }
310
311 // Flush out any accumulated literal from before.
312 if !self.buf.is_empty() {
313 flush_lit_buf(self)
314 } else {
315 self.next.take().map(bytes_regex_parsed)
316 }
317 }
318 }
319
to_range(kind: RepetitionKind) -> Result<SizeRange, Error>320 fn to_range(kind: RepetitionKind) -> Result<SizeRange, Error> {
321 Ok(match kind {
322 ZeroOrOne => size_range(0..=1),
323 ZeroOrMore => size_range(0..=32),
324 OneOrMore => size_range(1..=32),
325 Range(range) => match range {
326 Exactly(count) if u32::MAX == count => {
327 return unsupported(
328 "Cannot have repetition of exactly u32::MAX",
329 )
330 }
331 Exactly(count) => size_range(count as usize),
332 AtLeast(min) => {
333 let max = if min < u32::MAX as u32 / 2 {
334 min as usize * 2
335 } else {
336 u32::MAX as usize
337 };
338 size_range((min as usize)..max)
339 }
340 Bounded(_, max) if u32::MAX == max => {
341 return unsupported("Cannot have repetition max of u32::MAX")
342 }
343 Bounded(min, max) => size_range((min as usize)..(max as usize + 1)),
344 },
345 })
346 }
347
to_bytes(khar: char) -> Vec<u8>348 fn to_bytes(khar: char) -> Vec<u8> {
349 let mut buf = [0u8; 4];
350 khar.encode_utf8(&mut buf).as_bytes().to_owned()
351 }
352
regex_to_hir(pattern: &str) -> Result<Hir, Error>353 fn regex_to_hir(pattern: &str) -> Result<Hir, Error> {
354 Ok(Parser::new().parse(pattern)?)
355 }
356
unsupported<T>(error: &'static str) -> Result<T, Error>357 fn unsupported<T>(error: &'static str) -> Result<T, Error> {
358 Err(Error::UnsupportedRegex(error))
359 }
360
361 #[cfg(test)]
362 mod test {
363 use std::collections::HashSet;
364
365 use regex::Regex;
366
367 use super::*;
368
do_test( pattern: &str, min_distinct: usize, max_distinct: usize, iterations: usize, )369 fn do_test(
370 pattern: &str,
371 min_distinct: usize,
372 max_distinct: usize,
373 iterations: usize,
374 ) {
375 let generated = generate_values_matching_regex(pattern, iterations);
376 assert!(
377 generated.len() >= min_distinct,
378 "Expected to generate at least {} strings, but only \
379 generated {}",
380 min_distinct,
381 generated.len()
382 );
383 assert!(
384 generated.len() <= max_distinct,
385 "Expected to generate at most {} strings, but \
386 generated {}",
387 max_distinct,
388 generated.len()
389 );
390 }
391
generate_values_matching_regex( pattern: &str, iterations: usize, ) -> HashSet<String>392 fn generate_values_matching_regex(
393 pattern: &str,
394 iterations: usize,
395 ) -> HashSet<String> {
396 let rx = Regex::new(pattern).unwrap();
397 let mut generated = HashSet::new();
398
399 let strategy = string_regex(pattern).unwrap();
400 let mut runner = TestRunner::deterministic();
401 for _ in 0..iterations {
402 let mut value = strategy.new_tree(&mut runner).unwrap();
403
404 loop {
405 let s = value.current();
406 let ok = if let Some(matsch) = rx.find(&s) {
407 0 == matsch.start() && s.len() == matsch.end()
408 } else {
409 false
410 };
411 if !ok {
412 panic!(
413 "Generated string {:?} which does not match {:?}",
414 s, pattern
415 );
416 }
417
418 generated.insert(s);
419
420 if !value.simplify() {
421 break;
422 }
423 }
424 }
425 generated
426 }
427
428 #[test]
test_case_insensitive_produces_all_available_values()429 fn test_case_insensitive_produces_all_available_values() {
430 let mut expected: HashSet<String> = HashSet::new();
431 expected.insert("a".into());
432 expected.insert("b".into());
433 expected.insert("A".into());
434 expected.insert("B".into());
435 assert_eq!(generate_values_matching_regex("(?i:a|B)", 64), expected);
436 }
437
438 #[test]
test_literal()439 fn test_literal() {
440 do_test("foo", 1, 1, 8);
441 }
442
443 #[test]
test_casei_literal()444 fn test_casei_literal() {
445 do_test("(?i:fOo)", 8, 8, 64);
446 }
447
448 #[test]
test_alternation()449 fn test_alternation() {
450 do_test("foo|bar|baz", 3, 3, 16);
451 }
452
453 #[test]
test_repitition()454 fn test_repitition() {
455 do_test("a{0,8}", 9, 9, 64);
456 }
457
458 #[test]
test_question()459 fn test_question() {
460 do_test("a?", 2, 2, 16);
461 }
462
463 #[test]
test_star()464 fn test_star() {
465 do_test("a*", 33, 33, 256);
466 }
467
468 #[test]
test_plus()469 fn test_plus() {
470 do_test("a+", 32, 32, 256);
471 }
472
473 #[test]
test_n_to_range()474 fn test_n_to_range() {
475 do_test("a{4,}", 4, 4, 64);
476 }
477
478 #[test]
test_concatenation()479 fn test_concatenation() {
480 do_test("(foo|bar)(xyzzy|plugh)", 4, 4, 32);
481 }
482
483 #[test]
test_ascii_class()484 fn test_ascii_class() {
485 do_test("[[:digit:]]", 10, 10, 256);
486 }
487
488 #[test]
test_unicode_class()489 fn test_unicode_class() {
490 do_test("\\p{Greek}", 24, 512, 256);
491 }
492
493 #[test]
test_dot()494 fn test_dot() {
495 do_test(".", 200, 65536, 256);
496 }
497
498 #[test]
test_dot_s()499 fn test_dot_s() {
500 do_test("(?s).", 200, 65536, 256);
501 }
502
503 #[test]
test_backslash_d_plus()504 fn test_backslash_d_plus() {
505 do_test("\\d+", 1, 65536, 256);
506 }
507
assert_send_and_sync<T: Send + Sync>(_: T)508 fn assert_send_and_sync<T: Send + Sync>(_: T) {}
509
510 #[test]
regex_strategy_is_send_and_sync()511 fn regex_strategy_is_send_and_sync() {
512 assert_send_and_sync(string_regex(".").unwrap());
513 }
514
515 macro_rules! consistent {
516 ($name:ident, $value:expr) => {
517 #[test]
518 fn $name() {
519 test_generates_matching_strings($value);
520 }
521 };
522 }
523
test_generates_matching_strings(pattern: &str)524 fn test_generates_matching_strings(pattern: &str) {
525 use std::time;
526
527 let mut runner = TestRunner::default();
528 let start = time::Instant::now();
529
530 // If we don't support this regex, just move on quietly
531 if let Ok(strategy) = string_regex(pattern) {
532 let rx = Regex::new(pattern).unwrap();
533
534 for _ in 0..1000 {
535 let mut val = strategy.new_tree(&mut runner).unwrap();
536 // No more than 1000 simplify steps to keep test time down
537 for _ in 0..1000 {
538 let s = val.current();
539 assert!(
540 rx.is_match(&s),
541 "Produced string {:?}, which does not match {:?}",
542 s,
543 pattern
544 );
545
546 if !val.simplify() {
547 break;
548 }
549 }
550
551 // Quietly stop testing if we've run for >10 s
552 if start.elapsed().as_secs() > 10 {
553 break;
554 }
555 }
556 }
557 }
558
559 include!("regex-contrib/crates_regex.rs");
560 }
561