1 /* 2 * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 use crate::ast::{Quantifier, Substring}; 18 use crate::char::{Grapheme, GraphemeCluster}; 19 use crate::fsm::Dfa; 20 use crate::regexp::RegExpConfig; 21 use itertools::EitherOrBoth::Both; 22 use itertools::Itertools; 23 use ndarray::{Array1, Array2}; 24 use petgraph::prelude::EdgeRef; 25 use std::cmp::Reverse; 26 use std::collections::BTreeSet; 27 28 #[derive(Clone, Debug, Eq, PartialEq)] 29 pub enum Expression { 30 Alternation(Vec<Expression>, RegExpConfig), 31 CharacterClass(BTreeSet<char>, RegExpConfig), 32 Concatenation(Box<Expression>, Box<Expression>, RegExpConfig), 33 Literal(GraphemeCluster, RegExpConfig), 34 Repetition(Box<Expression>, Quantifier, RegExpConfig), 35 } 36 37 impl Expression { from(dfa: Dfa, config: &RegExpConfig) -> Self38 pub(crate) fn from(dfa: Dfa, config: &RegExpConfig) -> Self { 39 let states = dfa.states_in_depth_first_order(); 40 let state_count = dfa.state_count(); 41 42 let mut a = Array2::<Option<Expression>>::default((state_count, state_count)); 43 let mut b = Array1::<Option<Expression>>::default(state_count); 44 45 for (i, state) in states.iter().enumerate() { 46 if dfa.is_final_state(*state) { 47 b[i] = Some(Expression::new_literal( 48 GraphemeCluster::from("", config), 49 config, 50 )); 51 } 52 53 for edge in dfa.outgoing_edges(*state) { 54 let literal = Expression::new_literal( 55 GraphemeCluster::new(edge.weight().clone(), config), 56 config, 57 ); 58 let j = states.iter().position(|&it| it == edge.target()).unwrap(); 59 60 a[(i, j)] = if a[(i, j)].is_some() { 61 Self::union(&a[(i, j)], &Some(literal), config) 62 } else { 63 Some(literal) 64 } 65 } 66 } 67 68 for n in (0..state_count).rev() { 69 if a[(n, n)].is_some() { 70 b[n] = Self::concatenate( 71 &Self::repeat_zero_or_more_times(&a[(n, n)], config), 72 &b[n], 73 config, 74 ); 75 for j in 0..n { 76 a[(n, j)] = Self::concatenate( 77 &Self::repeat_zero_or_more_times(&a[(n, n)], config), 78 &a[(n, j)], 79 config, 80 ); 81 } 82 } 83 84 for i in 0..n { 85 if a[(i, n)].is_some() { 86 b[i] = 87 Self::union(&b[i], &Self::concatenate(&a[(i, n)], &b[n], config), config); 88 for j in 0..n { 89 a[(i, j)] = Self::union( 90 &a[(i, j)], 91 &Self::concatenate(&a[(i, n)], &a[(n, j)], config), 92 config, 93 ); 94 } 95 } 96 } 97 } 98 99 if !b.is_empty() && b[0].is_some() { 100 b[0].as_ref().unwrap().clone() 101 } else { 102 Expression::new_literal(GraphemeCluster::from("", config), config) 103 } 104 } 105 new_alternation(expr1: Expression, expr2: Expression, config: &RegExpConfig) -> Self106 fn new_alternation(expr1: Expression, expr2: Expression, config: &RegExpConfig) -> Self { 107 let mut options: Vec<Expression> = vec![]; 108 Self::flatten_alternations(&mut options, vec![expr1, expr2]); 109 options.sort_by_key(|option| Reverse(option.len())); 110 Expression::Alternation(options, config.clone()) 111 } 112 new_character_class( first_char_set: BTreeSet<char>, second_char_set: BTreeSet<char>, config: &RegExpConfig, ) -> Self113 fn new_character_class( 114 first_char_set: BTreeSet<char>, 115 second_char_set: BTreeSet<char>, 116 config: &RegExpConfig, 117 ) -> Self { 118 let union_set = first_char_set.union(&second_char_set).copied().collect(); 119 Expression::CharacterClass(union_set, config.clone()) 120 } 121 new_concatenation(expr1: Expression, expr2: Expression, config: &RegExpConfig) -> Self122 fn new_concatenation(expr1: Expression, expr2: Expression, config: &RegExpConfig) -> Self { 123 Expression::Concatenation(Box::from(expr1), Box::from(expr2), config.clone()) 124 } 125 new_literal(cluster: GraphemeCluster, config: &RegExpConfig) -> Self126 fn new_literal(cluster: GraphemeCluster, config: &RegExpConfig) -> Self { 127 Expression::Literal(cluster, config.clone()) 128 } 129 new_repetition(expr: Expression, quantifier: Quantifier, config: &RegExpConfig) -> Self130 fn new_repetition(expr: Expression, quantifier: Quantifier, config: &RegExpConfig) -> Self { 131 Expression::Repetition(Box::from(expr), quantifier, config.clone()) 132 } 133 is_empty(&self) -> bool134 fn is_empty(&self) -> bool { 135 match self { 136 Expression::Literal(cluster, _) => cluster.is_empty(), 137 _ => false, 138 } 139 } 140 is_single_codepoint(&self) -> bool141 pub(crate) fn is_single_codepoint(&self) -> bool { 142 match self { 143 Expression::CharacterClass(_, _) => true, 144 Expression::Literal(cluster, config) => { 145 cluster.char_count(config.is_non_ascii_char_escaped) == 1 146 && cluster.graphemes().first().unwrap().maximum() == 1 147 } 148 _ => false, 149 } 150 } 151 len(&self) -> usize152 fn len(&self) -> usize { 153 match self { 154 Expression::Alternation(options, _) => options.first().unwrap().len(), 155 Expression::CharacterClass(_, _) => 1, 156 Expression::Concatenation(expr1, expr2, _) => expr1.len() + expr2.len(), 157 Expression::Literal(cluster, _) => cluster.size(), 158 Expression::Repetition(expr, _, _) => expr.len(), 159 } 160 } 161 precedence(&self) -> u8162 pub(crate) fn precedence(&self) -> u8 { 163 match self { 164 Expression::Alternation(_, _) | Expression::CharacterClass(_, _) => 1, 165 Expression::Concatenation(_, _, _) | Expression::Literal(_, _) => 2, 166 Expression::Repetition(_, _, _) => 3, 167 } 168 } 169 remove_substring(&mut self, substring: &Substring, length: usize)170 pub(crate) fn remove_substring(&mut self, substring: &Substring, length: usize) { 171 match self { 172 Expression::Concatenation(expr1, expr2, _) => match substring { 173 Substring::Prefix => { 174 if let Expression::Literal(_, _) = **expr1 { 175 expr1.remove_substring(substring, length) 176 } 177 } 178 Substring::Suffix => { 179 if let Expression::Literal(_, _) = **expr2 { 180 expr2.remove_substring(substring, length) 181 } 182 } 183 }, 184 Expression::Literal(cluster, _) => match substring { 185 Substring::Prefix => { 186 cluster.graphemes_mut().drain(..length); 187 } 188 Substring::Suffix => { 189 let graphemes = cluster.graphemes_mut(); 190 graphemes.drain(graphemes.len() - length..); 191 } 192 }, 193 _ => (), 194 } 195 } 196 value(&self, substring: Option<&Substring>) -> Option<Vec<Grapheme>>197 pub(crate) fn value(&self, substring: Option<&Substring>) -> Option<Vec<Grapheme>> { 198 match self { 199 Expression::Concatenation(expr1, expr2, _) => match substring { 200 Some(value) => match value { 201 Substring::Prefix => expr1.value(None), 202 Substring::Suffix => expr2.value(None), 203 }, 204 None => None, 205 }, 206 Expression::Literal(cluster, _) => Some(cluster.graphemes().clone()), 207 _ => None, 208 } 209 } 210 repeat_zero_or_more_times( expr: &Option<Expression>, config: &RegExpConfig, ) -> Option<Expression>211 fn repeat_zero_or_more_times( 212 expr: &Option<Expression>, 213 config: &RegExpConfig, 214 ) -> Option<Expression> { 215 expr.as_ref() 216 .map(|value| Expression::new_repetition(value.clone(), Quantifier::KleeneStar, config)) 217 } 218 concatenate( a: &Option<Expression>, b: &Option<Expression>, config: &RegExpConfig, ) -> Option<Expression>219 fn concatenate( 220 a: &Option<Expression>, 221 b: &Option<Expression>, 222 config: &RegExpConfig, 223 ) -> Option<Expression> { 224 if a.is_none() || b.is_none() { 225 return None; 226 } 227 228 let expr1 = a.as_ref().unwrap(); 229 let expr2 = b.as_ref().unwrap(); 230 231 if expr1.is_empty() { 232 return b.clone(); 233 } 234 if expr2.is_empty() { 235 return a.clone(); 236 } 237 238 if let (Expression::Literal(graphemes_a, config), Expression::Literal(graphemes_b, _)) = 239 (&expr1, &expr2) 240 { 241 return Some(Expression::new_literal( 242 GraphemeCluster::merge(graphemes_a, graphemes_b, config), 243 config, 244 )); 245 } 246 247 if let (Expression::Literal(graphemes_a, _), Expression::Concatenation(first, second, _)) = 248 (&expr1, &expr2) 249 { 250 if let Expression::Literal(graphemes_first, config) = &**first { 251 let literal = Expression::new_literal( 252 GraphemeCluster::merge(graphemes_a, graphemes_first, config), 253 config, 254 ); 255 return Some(Expression::new_concatenation( 256 literal, 257 *second.clone(), 258 config, 259 )); 260 } 261 } 262 263 if let (Expression::Literal(graphemes_b, _), Expression::Concatenation(first, second, _)) = 264 (&expr2, &expr1) 265 { 266 if let Expression::Literal(graphemes_second, config) = &**second { 267 let literal = Expression::new_literal( 268 GraphemeCluster::merge(graphemes_second, graphemes_b, config), 269 config, 270 ); 271 return Some(Expression::new_concatenation( 272 *first.clone(), 273 literal, 274 config, 275 )); 276 } 277 } 278 279 Some(Expression::new_concatenation( 280 expr1.clone(), 281 expr2.clone(), 282 config, 283 )) 284 } 285 union( a: &Option<Expression>, b: &Option<Expression>, config: &RegExpConfig, ) -> Option<Expression>286 fn union( 287 a: &Option<Expression>, 288 b: &Option<Expression>, 289 config: &RegExpConfig, 290 ) -> Option<Expression> { 291 if let (Some(mut expr1), Some(mut expr2)) = (a.clone(), b.clone()) { 292 if expr1 != expr2 { 293 let common_prefix = 294 Self::remove_common_substring(&mut expr1, &mut expr2, Substring::Prefix); 295 let common_suffix = 296 Self::remove_common_substring(&mut expr1, &mut expr2, Substring::Suffix); 297 298 let mut result = if expr1.is_empty() { 299 Some(Expression::new_repetition( 300 expr2.clone(), 301 Quantifier::QuestionMark, 302 config, 303 )) 304 } else if expr2.is_empty() { 305 Some(Expression::new_repetition( 306 expr1.clone(), 307 Quantifier::QuestionMark, 308 config, 309 )) 310 } else { 311 None 312 }; 313 314 if result.is_none() { 315 if let Expression::Repetition(expr, quantifier, _) = &expr1 { 316 if quantifier == &Quantifier::QuestionMark { 317 let alternation = 318 Expression::new_alternation(*expr.clone(), expr2.clone(), config); 319 result = Some(Expression::new_repetition( 320 alternation, 321 Quantifier::QuestionMark, 322 config, 323 )); 324 } 325 } 326 } 327 328 if result.is_none() { 329 if let Expression::Repetition(expr, quantifier, _) = &expr2 { 330 if quantifier == &Quantifier::QuestionMark { 331 let alternation = 332 Expression::new_alternation(expr1.clone(), *expr.clone(), config); 333 result = Some(Expression::new_repetition( 334 alternation, 335 Quantifier::QuestionMark, 336 config, 337 )); 338 } 339 } 340 } 341 342 if result.is_none() && expr1.is_single_codepoint() && expr2.is_single_codepoint() { 343 let first_char_set = Self::extract_character_set(expr1.clone()); 344 let second_char_set = Self::extract_character_set(expr2.clone()); 345 result = Some(Expression::new_character_class( 346 first_char_set, 347 second_char_set, 348 config, 349 )); 350 } 351 352 if result.is_none() { 353 result = Some(Expression::new_alternation(expr1, expr2, config)); 354 } 355 356 if let Some(prefix) = common_prefix { 357 result = Some(Expression::new_concatenation( 358 Expression::new_literal( 359 GraphemeCluster::from_graphemes(prefix, config), 360 config, 361 ), 362 result.unwrap(), 363 config, 364 )); 365 } 366 367 if let Some(suffix) = common_suffix { 368 result = Some(Expression::new_concatenation( 369 result.unwrap(), 370 Expression::new_literal( 371 GraphemeCluster::from_graphemes(suffix, config), 372 config, 373 ), 374 config, 375 )); 376 } 377 378 result 379 } else if a.is_some() { 380 a.clone() 381 } else if b.is_some() { 382 b.clone() 383 } else { 384 None 385 } 386 } else if a.is_some() { 387 a.clone() 388 } else if b.is_some() { 389 b.clone() 390 } else { 391 None 392 } 393 } 394 flatten_alternations( flattened_options: &mut Vec<Expression>, current_options: Vec<Expression>, )395 fn flatten_alternations( 396 flattened_options: &mut Vec<Expression>, 397 current_options: Vec<Expression>, 398 ) { 399 for option in current_options { 400 if let Expression::Alternation(expr_options, _) = option { 401 Self::flatten_alternations(flattened_options, expr_options); 402 } else { 403 flattened_options.push(option); 404 } 405 } 406 } 407 extract_character_set(expr: Expression) -> BTreeSet<char>408 fn extract_character_set(expr: Expression) -> BTreeSet<char> { 409 match expr { 410 Expression::Literal(cluster, _) => { 411 let single_char = cluster 412 .graphemes() 413 .first() 414 .unwrap() 415 .value() 416 .chars() 417 .next() 418 .unwrap(); 419 btreeset![single_char] 420 } 421 Expression::CharacterClass(char_set, _) => char_set, 422 _ => BTreeSet::new(), 423 } 424 } 425 remove_common_substring( a: &mut Expression, b: &mut Expression, substring: Substring, ) -> Option<Vec<Grapheme>>426 fn remove_common_substring( 427 a: &mut Expression, 428 b: &mut Expression, 429 substring: Substring, 430 ) -> Option<Vec<Grapheme>> { 431 let common_substring = Self::find_common_substring(a, b, &substring); 432 if let Some(value) = &common_substring { 433 a.remove_substring(&substring, value.len()); 434 b.remove_substring(&substring, value.len()); 435 } 436 common_substring 437 } 438 find_common_substring( a: &Expression, b: &Expression, substring: &Substring, ) -> Option<Vec<Grapheme>>439 fn find_common_substring( 440 a: &Expression, 441 b: &Expression, 442 substring: &Substring, 443 ) -> Option<Vec<Grapheme>> { 444 let mut graphemes_a = a.value(Some(substring)).unwrap_or_else(Vec::new); 445 let mut graphemes_b = b.value(Some(substring)).unwrap_or_else(Vec::new); 446 let mut common_graphemes = vec![]; 447 448 if let Substring::Suffix = substring { 449 graphemes_a.reverse(); 450 graphemes_b.reverse(); 451 } 452 453 for pair in graphemes_a.iter().zip_longest(graphemes_b.iter()) { 454 match pair { 455 Both(grapheme_a, grapheme_b) => { 456 if grapheme_a == grapheme_b { 457 common_graphemes.push(grapheme_a.clone()); 458 } else { 459 break; 460 } 461 } 462 _ => break, 463 } 464 } 465 466 if let Substring::Suffix = substring { 467 common_graphemes.reverse(); 468 } 469 470 if common_graphemes.is_empty() { 471 None 472 } else { 473 Some(common_graphemes) 474 } 475 } 476 } 477 478 #[cfg(test)] 479 mod tests { 480 use super::*; 481 482 #[test] ensure_correct_string_representation_of_alternation_1()483 fn ensure_correct_string_representation_of_alternation_1() { 484 let config = RegExpConfig::new(); 485 let literal1 = Expression::new_literal(GraphemeCluster::from("abc", &config), &config); 486 let literal2 = Expression::new_literal(GraphemeCluster::from("def", &config), &config); 487 let alternation = Expression::new_alternation(literal1, literal2, &config); 488 assert_eq!(alternation.to_string(), "abc|def"); 489 } 490 491 #[test] ensure_correct_string_representation_of_alternation_2()492 fn ensure_correct_string_representation_of_alternation_2() { 493 let config = RegExpConfig::new(); 494 let literal1 = Expression::new_literal(GraphemeCluster::from("a", &config), &config); 495 let literal2 = Expression::new_literal(GraphemeCluster::from("ab", &config), &config); 496 let literal3 = Expression::new_literal(GraphemeCluster::from("abc", &config), &config); 497 let alternation1 = Expression::new_alternation(literal1, literal2, &config); 498 let alternation2 = Expression::new_alternation(alternation1, literal3, &config); 499 assert_eq!(alternation2.to_string(), "abc|ab|a"); 500 } 501 502 #[test] ensure_correct_string_representation_of_character_class_1()503 fn ensure_correct_string_representation_of_character_class_1() { 504 let config = RegExpConfig::new(); 505 let char_class = Expression::new_character_class(btreeset!['a'], btreeset!['b'], &config); 506 assert_eq!(char_class.to_string(), "[ab]"); 507 } 508 509 #[test] ensure_correct_string_representation_of_character_class_2()510 fn ensure_correct_string_representation_of_character_class_2() { 511 let config = RegExpConfig::new(); 512 let char_class = 513 Expression::new_character_class(btreeset!['a', 'b'], btreeset!['c'], &config); 514 assert_eq!(char_class.to_string(), "[a-c]"); 515 } 516 517 #[test] ensure_correct_string_representation_of_concatenation_1()518 fn ensure_correct_string_representation_of_concatenation_1() { 519 let config = RegExpConfig::new(); 520 let literal1 = Expression::new_literal(GraphemeCluster::from("abc", &config), &config); 521 let literal2 = Expression::new_literal(GraphemeCluster::from("def", &config), &config); 522 let concatenation = Expression::new_concatenation(literal1, literal2, &config); 523 assert_eq!(concatenation.to_string(), "abcdef"); 524 } 525 526 #[test] ensure_correct_string_representation_of_concatenation_2()527 fn ensure_correct_string_representation_of_concatenation_2() { 528 let config = RegExpConfig::new(); 529 let literal1 = Expression::new_literal(GraphemeCluster::from("abc", &config), &config); 530 let literal2 = Expression::new_literal(GraphemeCluster::from("def", &config), &config); 531 let repetition = Expression::new_repetition(literal1, Quantifier::KleeneStar, &config); 532 let concatenation = Expression::new_concatenation(repetition, literal2, &config); 533 assert_eq!(concatenation.to_string(), "(?:abc)*def"); 534 } 535 536 #[test] ensure_correct_removal_of_prefix_in_literal()537 fn ensure_correct_removal_of_prefix_in_literal() { 538 let config = RegExpConfig::new(); 539 let mut literal = 540 Expression::new_literal(GraphemeCluster::from("abcdef", &config), &config); 541 assert_eq!( 542 literal.value(None), 543 Some( 544 vec!["a", "b", "c", "d", "e", "f"] 545 .iter() 546 .map(|&it| Grapheme::from(it, &config)) 547 .collect_vec() 548 ) 549 ); 550 551 literal.remove_substring(&Substring::Prefix, 2); 552 assert_eq!( 553 literal.value(None), 554 Some( 555 vec!["c", "d", "e", "f"] 556 .iter() 557 .map(|&it| Grapheme::from(it, &config)) 558 .collect_vec() 559 ) 560 ); 561 } 562 563 #[test] ensure_correct_removal_of_suffix_in_literal()564 fn ensure_correct_removal_of_suffix_in_literal() { 565 let config = RegExpConfig::new(); 566 let mut literal = 567 Expression::new_literal(GraphemeCluster::from("abcdef", &config), &config); 568 assert_eq!( 569 literal.value(None), 570 Some( 571 vec!["a", "b", "c", "d", "e", "f"] 572 .iter() 573 .map(|&it| Grapheme::from(it, &config)) 574 .collect_vec() 575 ) 576 ); 577 578 literal.remove_substring(&Substring::Suffix, 2); 579 assert_eq!( 580 literal.value(None), 581 Some( 582 vec!["a", "b", "c", "d"] 583 .iter() 584 .map(|&it| Grapheme::from(it, &config)) 585 .collect_vec() 586 ) 587 ); 588 } 589 590 #[test] ensure_correct_string_representation_of_repetition_1()591 fn ensure_correct_string_representation_of_repetition_1() { 592 let config = RegExpConfig::new(); 593 let literal = Expression::new_literal(GraphemeCluster::from("abc", &config), &config); 594 let repetition = Expression::new_repetition(literal, Quantifier::KleeneStar, &config); 595 assert_eq!(repetition.to_string(), "(?:abc)*"); 596 } 597 598 #[test] ensure_correct_string_representation_of_repetition_2()599 fn ensure_correct_string_representation_of_repetition_2() { 600 let config = RegExpConfig::new(); 601 let literal = Expression::new_literal(GraphemeCluster::from("a", &config), &config); 602 let repetition = Expression::new_repetition(literal, Quantifier::QuestionMark, &config); 603 assert_eq!(repetition.to_string(), "a?"); 604 } 605 } 606