1 /* 2 * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #![allow(deprecated)] 18 19 use crate::regexp::feature::Feature; 20 use crate::regexp::{RegExp, RegExpConfig}; 21 use itertools::Itertools; 22 use std::io::ErrorKind; 23 use std::path::PathBuf; 24 25 /// This struct builds regular expressions from user-provided test cases. 26 pub struct RegExpBuilder { 27 test_cases: Vec<String>, 28 config: RegExpConfig, 29 } 30 31 impl RegExpBuilder { 32 /// Specifies the test cases to build the regular expression from. 33 /// 34 /// The test cases need not be sorted because `RegExpBuilder` sorts them internally. 35 /// 36 /// ⚠ Panics if `test_cases` is empty. from<T: Clone + Into<String>>(test_cases: &[T]) -> Self37 pub fn from<T: Clone + Into<String>>(test_cases: &[T]) -> Self { 38 if test_cases.is_empty() { 39 panic!("No test cases have been provided for regular expression generation"); 40 } 41 Self { 42 test_cases: test_cases.iter().cloned().map(|it| it.into()).collect_vec(), 43 config: RegExpConfig::new(), 44 } 45 } 46 47 /// Specifies a text file containing test cases to build the regular expression from. 48 /// 49 /// The test cases need not be sorted because `RegExpBuilder` sorts them internally. 50 /// 51 /// Each test case needs to be on a separate line. 52 /// Lines may be ended with either a newline (`\n`) or 53 /// a carriage return with a line feed (`\r\n`). 54 /// The final line ending is optional. 55 /// 56 /// ⚠ Panics if: 57 /// - the file cannot be found 58 /// - the file's encoding is not valid UTF-8 data 59 /// - the file cannot be opened because of conflicting permissions from_file<T: Into<PathBuf>>(file_path: T) -> Self60 pub fn from_file<T: Into<PathBuf>>(file_path: T) -> Self { 61 match std::fs::read_to_string(file_path.into()) { 62 Ok(file_content) => Self { 63 test_cases: file_content.lines().map(|it| it.to_string()).collect_vec(), 64 config: RegExpConfig::new(), 65 }, 66 Err(error) => match error.kind() { 67 ErrorKind::NotFound => panic!("The specified file could not be found"), 68 ErrorKind::InvalidData => { 69 panic!("The specified file's encoding is not valid UTF-8") 70 } 71 ErrorKind::PermissionDenied => { 72 panic!("Permission denied: The specified file could not be opened") 73 } 74 _ => panic!("{}", error), 75 }, 76 } 77 } 78 79 /// Tells `RegExpBuilder` to convert any Unicode decimal digit to character class `\d`. 80 /// 81 /// This method takes precedence over 82 /// [`with_conversion_of_words`](Self::with_conversion_of_words) if both are set. 83 /// Decimal digits are converted to `\d`, the remaining word characters to `\w`. 84 /// 85 /// This method takes precedence over 86 /// [`with_conversion_of_non_whitespace`](Self::with_conversion_of_non_whitespace) if both are set. 87 /// Decimal digits are converted to `\d`, the remaining non-whitespace characters to `\S`. with_conversion_of_digits(&mut self) -> &mut Self88 pub fn with_conversion_of_digits(&mut self) -> &mut Self { 89 self.config.is_digit_converted = true; 90 self 91 } 92 93 /// Tells `RegExpBuilder` to convert any character which is not 94 /// a Unicode decimal digit to character class `\D`. 95 /// 96 /// This method takes precedence over 97 /// [`with_conversion_of_non_words`](Self::with_conversion_of_non_words) if both are set. 98 /// Non-digits which are also non-word characters are converted to `\D`. 99 /// 100 /// This method takes precedence over 101 /// [`with_conversion_of_non_whitespace`](Self::with_conversion_of_non_whitespace) if both are set. 102 /// Non-digits which are also non-space characters are converted to `\D`. with_conversion_of_non_digits(&mut self) -> &mut Self103 pub fn with_conversion_of_non_digits(&mut self) -> &mut Self { 104 self.config.is_non_digit_converted = true; 105 self 106 } 107 108 /// Tells `RegExpBuilder` to convert any Unicode whitespace character to character class `\s`. 109 /// 110 /// This method takes precedence over 111 /// [`with_conversion_of_non_digits`](Self::with_conversion_of_non_digits) if both are set. 112 /// Whitespace characters are converted to `\s`, the remaining non-digit characters to `\D`. 113 /// 114 /// This method takes precedence over 115 /// [`with_conversion_of_non_words`](Self::with_conversion_of_non_words) if both are set. 116 /// Whitespace characters are converted to `\s`, the remaining non-word characters to `\W`. with_conversion_of_whitespace(&mut self) -> &mut Self117 pub fn with_conversion_of_whitespace(&mut self) -> &mut Self { 118 self.config.is_space_converted = true; 119 self 120 } 121 122 /// Tells `RegExpBuilder` to convert any character which is not 123 /// a Unicode whitespace character to character class `\S`. with_conversion_of_non_whitespace(&mut self) -> &mut Self124 pub fn with_conversion_of_non_whitespace(&mut self) -> &mut Self { 125 self.config.is_non_space_converted = true; 126 self 127 } 128 129 /// Tells `RegExpBuilder` to convert any Unicode word character to character class `\w`. 130 /// 131 /// This method takes precedence over 132 /// [`with_conversion_of_non_digits`](Self::with_conversion_of_non_digits) if both are set. 133 /// Word characters are converted to `\w`, the remaining non-digit characters to `\D`. 134 /// 135 /// This method takes precedence over 136 /// [`with_conversion_of_non_whitespace`](Self::with_conversion_of_non_whitespace) if both are set. 137 /// Word characters are converted to `\w`, the remaining non-space characters to `\S`. with_conversion_of_words(&mut self) -> &mut Self138 pub fn with_conversion_of_words(&mut self) -> &mut Self { 139 self.config.is_word_converted = true; 140 self 141 } 142 143 /// Tells `RegExpBuilder` to convert any character which is not 144 /// a Unicode word character to character class `\W`. 145 /// 146 /// This method takes precedence over 147 /// [`with_conversion_of_non_whitespace`](Self::with_conversion_of_non_whitespace) if both are set. 148 /// Non-words which are also non-space characters are converted to `\W`. with_conversion_of_non_words(&mut self) -> &mut Self149 pub fn with_conversion_of_non_words(&mut self) -> &mut Self { 150 self.config.is_non_word_converted = true; 151 self 152 } 153 154 /// Tells `RegExpBuilder` to detect repeated non-overlapping substrings and 155 /// to convert them to `{min,max}` quantifier notation. with_conversion_of_repetitions(&mut self) -> &mut Self156 pub fn with_conversion_of_repetitions(&mut self) -> &mut Self { 157 self.config.is_repetition_converted = true; 158 self 159 } 160 161 /// Tells `RegExpBuilder` to enable case-insensitive matching of test cases 162 /// so that letters match both upper and lower case. with_case_insensitive_matching(&mut self) -> &mut Self163 pub fn with_case_insensitive_matching(&mut self) -> &mut Self { 164 self.config.is_case_insensitive_matching = true; 165 self 166 } 167 168 /// Tells `RegExpBuilder` to replace non-capturing groups by capturing ones. with_capturing_groups(&mut self) -> &mut Self169 pub fn with_capturing_groups(&mut self) -> &mut Self { 170 self.config.is_capturing_group_enabled = true; 171 self 172 } 173 174 /// Tells `RegExpBuilder` which conversions should be performed during 175 /// regular expression generation. The available conversion features 176 /// are listed in the [`Feature`](./enum.Feature.html#variants) enum. 177 /// 178 /// ⚠ Panics if `features` is empty. 179 #[deprecated(since = "1.3.0", note = "This method will be removed in 1.4.0.")] with_conversion_of(&mut self, features: &[Feature]) -> &mut Self180 pub fn with_conversion_of(&mut self, features: &[Feature]) -> &mut Self { 181 if features.is_empty() { 182 panic!("No conversion features have been provided for regular expression generation"); 183 } 184 self.config.conversion_features = features.to_vec(); 185 self 186 } 187 188 /// Specifies the minimum quantity of substring repetitions to be converted if 189 /// [`with_conversion_of_repetitions`](Self::with_conversion_of_repetitions) is set. 190 /// 191 /// If the quantity is not explicitly set with this method, a default value of 1 will be used. 192 /// 193 /// ⚠ Panics if `quantity` is zero. with_minimum_repetitions(&mut self, quantity: u32) -> &mut Self194 pub fn with_minimum_repetitions(&mut self, quantity: u32) -> &mut Self { 195 if quantity == 0 { 196 panic!("Quantity of minimum repetitions must not be zero"); 197 } 198 self.config.minimum_repetitions = quantity; 199 self 200 } 201 202 /// Specifies the minimum length a repeated substring must have in order to be converted if 203 /// [`with_conversion_of_repetitions`](Self::with_conversion_of_repetitions) is set. 204 /// 205 /// If the length is not explicitly set with this method, a default value of 1 will be used. 206 /// 207 /// ⚠ Panics if `length` is zero. with_minimum_substring_length(&mut self, length: u32) -> &mut Self208 pub fn with_minimum_substring_length(&mut self, length: u32) -> &mut Self { 209 if length == 0 { 210 panic!("Minimum substring length must not be zero"); 211 } 212 self.config.minimum_substring_length = length; 213 self 214 } 215 216 /// Tells `RegExpBuilder` to convert non-ASCII characters to unicode escape sequences. 217 /// The parameter `use_surrogate_pairs` specifies whether to convert astral code planes 218 /// (range `U+010000` to `U+10FFFF`) to surrogate pairs. with_escaping_of_non_ascii_chars(&mut self, use_surrogate_pairs: bool) -> &mut Self219 pub fn with_escaping_of_non_ascii_chars(&mut self, use_surrogate_pairs: bool) -> &mut Self { 220 self.config.is_non_ascii_char_escaped = true; 221 self.config.is_astral_code_point_converted_to_surrogate = use_surrogate_pairs; 222 self 223 } 224 225 /// Tells `RegExpBuilder` to produce a nicer looking regular expression in verbose mode. with_verbose_mode(&mut self) -> &mut Self226 pub fn with_verbose_mode(&mut self) -> &mut Self { 227 self.config.is_verbose_mode_enabled = true; 228 self 229 } 230 231 /// Tells `RegExpBuilder` to remove the caret anchor '^' from the resulting regular 232 /// expression, thereby allowing to match the test cases also when they do not occur 233 /// at the start of a string. without_start_anchor(&mut self) -> &mut Self234 pub fn without_start_anchor(&mut self) -> &mut Self { 235 self.config.is_start_anchor_disabled = true; 236 self 237 } 238 239 /// Tells `RegExpBuilder` to remove the dollar sign anchor '$' from the resulting regular 240 /// expression, thereby allowing to match the test cases also when they do not occur 241 /// at the end of a string. without_end_anchor(&mut self) -> &mut Self242 pub fn without_end_anchor(&mut self) -> &mut Self { 243 self.config.is_end_anchor_disabled = true; 244 self 245 } 246 247 /// Tells `RegExpBuilder` to remove the caret and dollar sign anchors from the resulting 248 /// regular expression, thereby allowing to match the test cases also when they occur 249 /// within a larger string that contains other content as well. without_anchors(&mut self) -> &mut Self250 pub fn without_anchors(&mut self) -> &mut Self { 251 self.config.is_start_anchor_disabled = true; 252 self.config.is_end_anchor_disabled = true; 253 self 254 } 255 256 /// Tells `RegExpBuilder` to provide syntax highlighting for the resulting regular expression. 257 /// 258 /// ⚠ This method may only be used if the resulting regular expression is meant to 259 /// be printed to the console. The regex string representation returned from enabling 260 /// this setting cannot be fed into the [*regex*](https://crates.io/crates/regex) crate. with_syntax_highlighting(&mut self) -> &mut Self261 pub fn with_syntax_highlighting(&mut self) -> &mut Self { 262 self.config.is_output_colorized = true; 263 self 264 } 265 266 /// Builds the actual regular expression using the previously given settings. 267 /// Every generated regular expression is surrounded by the anchors `^` and `$` 268 /// so that substrings not being part of the test cases are not matched accidentally. build(&mut self) -> String269 pub fn build(&mut self) -> String { 270 RegExp::from(&mut self.test_cases, &self.config).to_string() 271 } 272 } 273