/*
 * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 
17 #![allow(deprecated)]
18 
19 use crate::regexp::feature::Feature;
20 use crate::regexp::{RegExp, RegExpConfig};
21 use itertools::Itertools;
22 use std::io::ErrorKind;
23 use std::path::PathBuf;
24 
25 /// This struct builds regular expressions from user-provided test cases.
26 pub struct RegExpBuilder {
27     test_cases: Vec<String>,
28     config: RegExpConfig,
29 }
30 
31 impl RegExpBuilder {
32     /// Specifies the test cases to build the regular expression from.
33     ///
34     /// The test cases need not be sorted because `RegExpBuilder` sorts them internally.
35     ///
36     /// ⚠ Panics if `test_cases` is empty.
from<T: Clone + Into<String>>(test_cases: &[T]) -> Self37     pub fn from<T: Clone + Into<String>>(test_cases: &[T]) -> Self {
38         if test_cases.is_empty() {
39             panic!("No test cases have been provided for regular expression generation");
40         }
41         Self {
42             test_cases: test_cases.iter().cloned().map(|it| it.into()).collect_vec(),
43             config: RegExpConfig::new(),
44         }
45     }
46 
47     /// Specifies a text file containing test cases to build the regular expression from.
48     ///
49     /// The test cases need not be sorted because `RegExpBuilder` sorts them internally.
50     ///
51     /// Each test case needs to be on a separate line.
52     /// Lines may be ended with either a newline (`\n`) or
53     /// a carriage return with a line feed (`\r\n`).
54     /// The final line ending is optional.
55     ///
56     /// ⚠ Panics if:
57     /// - the file cannot be found
58     /// - the file's encoding is not valid UTF-8 data
59     /// - the file cannot be opened because of conflicting permissions
from_file<T: Into<PathBuf>>(file_path: T) -> Self60     pub fn from_file<T: Into<PathBuf>>(file_path: T) -> Self {
61         match std::fs::read_to_string(file_path.into()) {
62             Ok(file_content) => Self {
63                 test_cases: file_content.lines().map(|it| it.to_string()).collect_vec(),
64                 config: RegExpConfig::new(),
65             },
66             Err(error) => match error.kind() {
67                 ErrorKind::NotFound => panic!("The specified file could not be found"),
68                 ErrorKind::InvalidData => {
69                     panic!("The specified file's encoding is not valid UTF-8")
70                 }
71                 ErrorKind::PermissionDenied => {
72                     panic!("Permission denied: The specified file could not be opened")
73                 }
74                 _ => panic!("{}", error),
75             },
76         }
77     }
78 
79     /// Tells `RegExpBuilder` to convert any Unicode decimal digit to character class `\d`.
80     ///
81     /// This method takes precedence over
82     /// [`with_conversion_of_words`](Self::with_conversion_of_words) if both are set.
83     /// Decimal digits are converted to `\d`, the remaining word characters to `\w`.
84     ///
85     /// This method takes precedence over
86     /// [`with_conversion_of_non_whitespace`](Self::with_conversion_of_non_whitespace) if both are set.
87     /// Decimal digits are converted to `\d`, the remaining non-whitespace characters to `\S`.
with_conversion_of_digits(&mut self) -> &mut Self88     pub fn with_conversion_of_digits(&mut self) -> &mut Self {
89         self.config.is_digit_converted = true;
90         self
91     }
92 
93     /// Tells `RegExpBuilder` to convert any character which is not
94     /// a Unicode decimal digit to character class `\D`.
95     ///
96     /// This method takes precedence over
97     /// [`with_conversion_of_non_words`](Self::with_conversion_of_non_words) if both are set.
98     /// Non-digits which are also non-word characters are converted to `\D`.
99     ///
100     /// This method takes precedence over
101     /// [`with_conversion_of_non_whitespace`](Self::with_conversion_of_non_whitespace) if both are set.
102     /// Non-digits which are also non-space characters are converted to `\D`.
with_conversion_of_non_digits(&mut self) -> &mut Self103     pub fn with_conversion_of_non_digits(&mut self) -> &mut Self {
104         self.config.is_non_digit_converted = true;
105         self
106     }
107 
108     /// Tells `RegExpBuilder` to convert any Unicode whitespace character to character class `\s`.
109     ///
110     /// This method takes precedence over
111     /// [`with_conversion_of_non_digits`](Self::with_conversion_of_non_digits) if both are set.
112     /// Whitespace characters are converted to `\s`, the remaining non-digit characters to `\D`.
113     ///
114     /// This method takes precedence over
115     /// [`with_conversion_of_non_words`](Self::with_conversion_of_non_words) if both are set.
116     /// Whitespace characters are converted to `\s`, the remaining non-word characters to `\W`.
with_conversion_of_whitespace(&mut self) -> &mut Self117     pub fn with_conversion_of_whitespace(&mut self) -> &mut Self {
118         self.config.is_space_converted = true;
119         self
120     }
121 
122     /// Tells `RegExpBuilder` to convert any character which is not
123     /// a Unicode whitespace character to character class `\S`.
with_conversion_of_non_whitespace(&mut self) -> &mut Self124     pub fn with_conversion_of_non_whitespace(&mut self) -> &mut Self {
125         self.config.is_non_space_converted = true;
126         self
127     }
128 
129     /// Tells `RegExpBuilder` to convert any Unicode word character to character class `\w`.
130     ///
131     /// This method takes precedence over
132     /// [`with_conversion_of_non_digits`](Self::with_conversion_of_non_digits) if both are set.
133     /// Word characters are converted to `\w`, the remaining non-digit characters to `\D`.
134     ///
135     /// This method takes precedence over
136     /// [`with_conversion_of_non_whitespace`](Self::with_conversion_of_non_whitespace) if both are set.
137     /// Word characters are converted to `\w`, the remaining non-space characters to `\S`.
with_conversion_of_words(&mut self) -> &mut Self138     pub fn with_conversion_of_words(&mut self) -> &mut Self {
139         self.config.is_word_converted = true;
140         self
141     }
142 
143     /// Tells `RegExpBuilder` to convert any character which is not
144     /// a Unicode word character to character class `\W`.
145     ///
146     /// This method takes precedence over
147     /// [`with_conversion_of_non_whitespace`](Self::with_conversion_of_non_whitespace) if both are set.
148     /// Non-words which are also non-space characters are converted to `\W`.
with_conversion_of_non_words(&mut self) -> &mut Self149     pub fn with_conversion_of_non_words(&mut self) -> &mut Self {
150         self.config.is_non_word_converted = true;
151         self
152     }
153 
154     /// Tells `RegExpBuilder` to detect repeated non-overlapping substrings and
155     /// to convert them to `{min,max}` quantifier notation.
with_conversion_of_repetitions(&mut self) -> &mut Self156     pub fn with_conversion_of_repetitions(&mut self) -> &mut Self {
157         self.config.is_repetition_converted = true;
158         self
159     }
160 
161     /// Tells `RegExpBuilder` to enable case-insensitive matching of test cases
162     /// so that letters match both upper and lower case.
with_case_insensitive_matching(&mut self) -> &mut Self163     pub fn with_case_insensitive_matching(&mut self) -> &mut Self {
164         self.config.is_case_insensitive_matching = true;
165         self
166     }
167 
168     /// Tells `RegExpBuilder` to replace non-capturing groups by capturing ones.
with_capturing_groups(&mut self) -> &mut Self169     pub fn with_capturing_groups(&mut self) -> &mut Self {
170         self.config.is_capturing_group_enabled = true;
171         self
172     }
173 
174     /// Tells `RegExpBuilder` which conversions should be performed during
175     /// regular expression generation. The available conversion features
176     /// are listed in the [`Feature`](./enum.Feature.html#variants) enum.
177     ///
178     /// ⚠ Panics if `features` is empty.
179     #[deprecated(since = "1.3.0", note = "This method will be removed in 1.4.0.")]
with_conversion_of(&mut self, features: &[Feature]) -> &mut Self180     pub fn with_conversion_of(&mut self, features: &[Feature]) -> &mut Self {
181         if features.is_empty() {
182             panic!("No conversion features have been provided for regular expression generation");
183         }
184         self.config.conversion_features = features.to_vec();
185         self
186     }
187 
188     /// Specifies the minimum quantity of substring repetitions to be converted if
189     /// [`with_conversion_of_repetitions`](Self::with_conversion_of_repetitions) is set.
190     ///
191     /// If the quantity is not explicitly set with this method, a default value of 1 will be used.
192     ///
193     /// ⚠ Panics if `quantity` is zero.
with_minimum_repetitions(&mut self, quantity: u32) -> &mut Self194     pub fn with_minimum_repetitions(&mut self, quantity: u32) -> &mut Self {
195         if quantity == 0 {
196             panic!("Quantity of minimum repetitions must not be zero");
197         }
198         self.config.minimum_repetitions = quantity;
199         self
200     }
201 
202     /// Specifies the minimum length a repeated substring must have in order to be converted if
203     /// [`with_conversion_of_repetitions`](Self::with_conversion_of_repetitions) is set.
204     ///
205     /// If the length is not explicitly set with this method, a default value of 1 will be used.
206     ///
207     /// ⚠ Panics if `length` is zero.
with_minimum_substring_length(&mut self, length: u32) -> &mut Self208     pub fn with_minimum_substring_length(&mut self, length: u32) -> &mut Self {
209         if length == 0 {
210             panic!("Minimum substring length must not be zero");
211         }
212         self.config.minimum_substring_length = length;
213         self
214     }
215 
216     /// Tells `RegExpBuilder` to convert non-ASCII characters to unicode escape sequences.
217     /// The parameter `use_surrogate_pairs` specifies whether to convert astral code planes
218     /// (range `U+010000` to `U+10FFFF`) to surrogate pairs.
with_escaping_of_non_ascii_chars(&mut self, use_surrogate_pairs: bool) -> &mut Self219     pub fn with_escaping_of_non_ascii_chars(&mut self, use_surrogate_pairs: bool) -> &mut Self {
220         self.config.is_non_ascii_char_escaped = true;
221         self.config.is_astral_code_point_converted_to_surrogate = use_surrogate_pairs;
222         self
223     }
224 
225     /// Tells `RegExpBuilder` to produce a nicer looking regular expression in verbose mode.
with_verbose_mode(&mut self) -> &mut Self226     pub fn with_verbose_mode(&mut self) -> &mut Self {
227         self.config.is_verbose_mode_enabled = true;
228         self
229     }
230 
231     /// Tells `RegExpBuilder` to remove the caret anchor '^' from the resulting regular
232     /// expression, thereby allowing to match the test cases also when they do not occur
233     /// at the start of a string.
without_start_anchor(&mut self) -> &mut Self234     pub fn without_start_anchor(&mut self) -> &mut Self {
235         self.config.is_start_anchor_disabled = true;
236         self
237     }
238 
239     /// Tells `RegExpBuilder` to remove the dollar sign anchor '$' from the resulting regular
240     /// expression, thereby allowing to match the test cases also when they do not occur
241     /// at the end of a string.
without_end_anchor(&mut self) -> &mut Self242     pub fn without_end_anchor(&mut self) -> &mut Self {
243         self.config.is_end_anchor_disabled = true;
244         self
245     }
246 
247     /// Tells `RegExpBuilder` to remove the caret and dollar sign anchors from the resulting
248     /// regular expression, thereby allowing to match the test cases also when they occur
249     /// within a larger string that contains other content as well.
without_anchors(&mut self) -> &mut Self250     pub fn without_anchors(&mut self) -> &mut Self {
251         self.config.is_start_anchor_disabled = true;
252         self.config.is_end_anchor_disabled = true;
253         self
254     }
255 
256     /// Tells `RegExpBuilder` to provide syntax highlighting for the resulting regular expression.
257     ///
258     /// ⚠ This method may only be used if the resulting regular expression is meant to
259     /// be printed to the console. The regex string representation returned from enabling
260     /// this setting cannot be fed into the [*regex*](https://crates.io/crates/regex) crate.
with_syntax_highlighting(&mut self) -> &mut Self261     pub fn with_syntax_highlighting(&mut self) -> &mut Self {
262         self.config.is_output_colorized = true;
263         self
264     }
265 
266     /// Builds the actual regular expression using the previously given settings.
267     /// Every generated regular expression is surrounded by the anchors `^` and `$`
268     /// so that substrings not being part of the test cases are not matched accidentally.
build(&mut self) -> String269     pub fn build(&mut self) -> String {
270         RegExp::from(&mut self.test_cases, &self.config).to_string()
271     }
272 }
273