1 /*
2  * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 use crate::ast::{Quantifier, Substring};
18 use crate::char::{Grapheme, GraphemeCluster};
19 use crate::fsm::Dfa;
20 use crate::regexp::RegExpConfig;
21 use itertools::EitherOrBoth::Both;
22 use itertools::Itertools;
23 use ndarray::{Array1, Array2};
24 use petgraph::prelude::EdgeRef;
25 use std::cmp::Reverse;
26 use std::collections::BTreeSet;
27 
28 #[derive(Clone, Debug, Eq, PartialEq)]
29 pub enum Expression {
30     Alternation(Vec<Expression>, RegExpConfig),
31     CharacterClass(BTreeSet<char>, RegExpConfig),
32     Concatenation(Box<Expression>, Box<Expression>, RegExpConfig),
33     Literal(GraphemeCluster, RegExpConfig),
34     Repetition(Box<Expression>, Quantifier, RegExpConfig),
35 }
36 
37 impl Expression {
from(dfa: Dfa, config: &RegExpConfig) -> Self38     pub(crate) fn from(dfa: Dfa, config: &RegExpConfig) -> Self {
39         let states = dfa.states_in_depth_first_order();
40         let state_count = dfa.state_count();
41 
42         let mut a = Array2::<Option<Expression>>::default((state_count, state_count));
43         let mut b = Array1::<Option<Expression>>::default(state_count);
44 
45         for (i, state) in states.iter().enumerate() {
46             if dfa.is_final_state(*state) {
47                 b[i] = Some(Expression::new_literal(
48                     GraphemeCluster::from("", config),
49                     config,
50                 ));
51             }
52 
53             for edge in dfa.outgoing_edges(*state) {
54                 let literal = Expression::new_literal(
55                     GraphemeCluster::new(edge.weight().clone(), config),
56                     config,
57                 );
58                 let j = states.iter().position(|&it| it == edge.target()).unwrap();
59 
60                 a[(i, j)] = if a[(i, j)].is_some() {
61                     Self::union(&a[(i, j)], &Some(literal), config)
62                 } else {
63                     Some(literal)
64                 }
65             }
66         }
67 
68         for n in (0..state_count).rev() {
69             if a[(n, n)].is_some() {
70                 b[n] = Self::concatenate(
71                     &Self::repeat_zero_or_more_times(&a[(n, n)], config),
72                     &b[n],
73                     config,
74                 );
75                 for j in 0..n {
76                     a[(n, j)] = Self::concatenate(
77                         &Self::repeat_zero_or_more_times(&a[(n, n)], config),
78                         &a[(n, j)],
79                         config,
80                     );
81                 }
82             }
83 
84             for i in 0..n {
85                 if a[(i, n)].is_some() {
86                     b[i] =
87                         Self::union(&b[i], &Self::concatenate(&a[(i, n)], &b[n], config), config);
88                     for j in 0..n {
89                         a[(i, j)] = Self::union(
90                             &a[(i, j)],
91                             &Self::concatenate(&a[(i, n)], &a[(n, j)], config),
92                             config,
93                         );
94                     }
95                 }
96             }
97         }
98 
99         if !b.is_empty() && b[0].is_some() {
100             b[0].as_ref().unwrap().clone()
101         } else {
102             Expression::new_literal(GraphemeCluster::from("", config), config)
103         }
104     }
105 
new_alternation(expr1: Expression, expr2: Expression, config: &RegExpConfig) -> Self106     fn new_alternation(expr1: Expression, expr2: Expression, config: &RegExpConfig) -> Self {
107         let mut options: Vec<Expression> = vec![];
108         Self::flatten_alternations(&mut options, vec![expr1, expr2]);
109         options.sort_by_key(|option| Reverse(option.len()));
110         Expression::Alternation(options, config.clone())
111     }
112 
new_character_class( first_char_set: BTreeSet<char>, second_char_set: BTreeSet<char>, config: &RegExpConfig, ) -> Self113     fn new_character_class(
114         first_char_set: BTreeSet<char>,
115         second_char_set: BTreeSet<char>,
116         config: &RegExpConfig,
117     ) -> Self {
118         let union_set = first_char_set.union(&second_char_set).copied().collect();
119         Expression::CharacterClass(union_set, config.clone())
120     }
121 
new_concatenation(expr1: Expression, expr2: Expression, config: &RegExpConfig) -> Self122     fn new_concatenation(expr1: Expression, expr2: Expression, config: &RegExpConfig) -> Self {
123         Expression::Concatenation(Box::from(expr1), Box::from(expr2), config.clone())
124     }
125 
new_literal(cluster: GraphemeCluster, config: &RegExpConfig) -> Self126     fn new_literal(cluster: GraphemeCluster, config: &RegExpConfig) -> Self {
127         Expression::Literal(cluster, config.clone())
128     }
129 
new_repetition(expr: Expression, quantifier: Quantifier, config: &RegExpConfig) -> Self130     fn new_repetition(expr: Expression, quantifier: Quantifier, config: &RegExpConfig) -> Self {
131         Expression::Repetition(Box::from(expr), quantifier, config.clone())
132     }
133 
is_empty(&self) -> bool134     fn is_empty(&self) -> bool {
135         match self {
136             Expression::Literal(cluster, _) => cluster.is_empty(),
137             _ => false,
138         }
139     }
140 
is_single_codepoint(&self) -> bool141     pub(crate) fn is_single_codepoint(&self) -> bool {
142         match self {
143             Expression::CharacterClass(_, _) => true,
144             Expression::Literal(cluster, config) => {
145                 cluster.char_count(config.is_non_ascii_char_escaped) == 1
146                     && cluster.graphemes().first().unwrap().maximum() == 1
147             }
148             _ => false,
149         }
150     }
151 
len(&self) -> usize152     fn len(&self) -> usize {
153         match self {
154             Expression::Alternation(options, _) => options.first().unwrap().len(),
155             Expression::CharacterClass(_, _) => 1,
156             Expression::Concatenation(expr1, expr2, _) => expr1.len() + expr2.len(),
157             Expression::Literal(cluster, _) => cluster.size(),
158             Expression::Repetition(expr, _, _) => expr.len(),
159         }
160     }
161 
precedence(&self) -> u8162     pub(crate) fn precedence(&self) -> u8 {
163         match self {
164             Expression::Alternation(_, _) | Expression::CharacterClass(_, _) => 1,
165             Expression::Concatenation(_, _, _) | Expression::Literal(_, _) => 2,
166             Expression::Repetition(_, _, _) => 3,
167         }
168     }
169 
remove_substring(&mut self, substring: &Substring, length: usize)170     pub(crate) fn remove_substring(&mut self, substring: &Substring, length: usize) {
171         match self {
172             Expression::Concatenation(expr1, expr2, _) => match substring {
173                 Substring::Prefix => {
174                     if let Expression::Literal(_, _) = **expr1 {
175                         expr1.remove_substring(substring, length)
176                     }
177                 }
178                 Substring::Suffix => {
179                     if let Expression::Literal(_, _) = **expr2 {
180                         expr2.remove_substring(substring, length)
181                     }
182                 }
183             },
184             Expression::Literal(cluster, _) => match substring {
185                 Substring::Prefix => {
186                     cluster.graphemes_mut().drain(..length);
187                 }
188                 Substring::Suffix => {
189                     let graphemes = cluster.graphemes_mut();
190                     graphemes.drain(graphemes.len() - length..);
191                 }
192             },
193             _ => (),
194         }
195     }
196 
value(&self, substring: Option<&Substring>) -> Option<Vec<Grapheme>>197     pub(crate) fn value(&self, substring: Option<&Substring>) -> Option<Vec<Grapheme>> {
198         match self {
199             Expression::Concatenation(expr1, expr2, _) => match substring {
200                 Some(value) => match value {
201                     Substring::Prefix => expr1.value(None),
202                     Substring::Suffix => expr2.value(None),
203                 },
204                 None => None,
205             },
206             Expression::Literal(cluster, _) => Some(cluster.graphemes().clone()),
207             _ => None,
208         }
209     }
210 
repeat_zero_or_more_times( expr: &Option<Expression>, config: &RegExpConfig, ) -> Option<Expression>211     fn repeat_zero_or_more_times(
212         expr: &Option<Expression>,
213         config: &RegExpConfig,
214     ) -> Option<Expression> {
215         expr.as_ref()
216             .map(|value| Expression::new_repetition(value.clone(), Quantifier::KleeneStar, config))
217     }
218 
concatenate( a: &Option<Expression>, b: &Option<Expression>, config: &RegExpConfig, ) -> Option<Expression>219     fn concatenate(
220         a: &Option<Expression>,
221         b: &Option<Expression>,
222         config: &RegExpConfig,
223     ) -> Option<Expression> {
224         if a.is_none() || b.is_none() {
225             return None;
226         }
227 
228         let expr1 = a.as_ref().unwrap();
229         let expr2 = b.as_ref().unwrap();
230 
231         if expr1.is_empty() {
232             return b.clone();
233         }
234         if expr2.is_empty() {
235             return a.clone();
236         }
237 
238         if let (Expression::Literal(graphemes_a, config), Expression::Literal(graphemes_b, _)) =
239             (&expr1, &expr2)
240         {
241             return Some(Expression::new_literal(
242                 GraphemeCluster::merge(graphemes_a, graphemes_b, config),
243                 config,
244             ));
245         }
246 
247         if let (Expression::Literal(graphemes_a, _), Expression::Concatenation(first, second, _)) =
248             (&expr1, &expr2)
249         {
250             if let Expression::Literal(graphemes_first, config) = &**first {
251                 let literal = Expression::new_literal(
252                     GraphemeCluster::merge(graphemes_a, graphemes_first, config),
253                     config,
254                 );
255                 return Some(Expression::new_concatenation(
256                     literal,
257                     *second.clone(),
258                     config,
259                 ));
260             }
261         }
262 
263         if let (Expression::Literal(graphemes_b, _), Expression::Concatenation(first, second, _)) =
264             (&expr2, &expr1)
265         {
266             if let Expression::Literal(graphemes_second, config) = &**second {
267                 let literal = Expression::new_literal(
268                     GraphemeCluster::merge(graphemes_second, graphemes_b, config),
269                     config,
270                 );
271                 return Some(Expression::new_concatenation(
272                     *first.clone(),
273                     literal,
274                     config,
275                 ));
276             }
277         }
278 
279         Some(Expression::new_concatenation(
280             expr1.clone(),
281             expr2.clone(),
282             config,
283         ))
284     }
285 
union( a: &Option<Expression>, b: &Option<Expression>, config: &RegExpConfig, ) -> Option<Expression>286     fn union(
287         a: &Option<Expression>,
288         b: &Option<Expression>,
289         config: &RegExpConfig,
290     ) -> Option<Expression> {
291         if let (Some(mut expr1), Some(mut expr2)) = (a.clone(), b.clone()) {
292             if expr1 != expr2 {
293                 let common_prefix =
294                     Self::remove_common_substring(&mut expr1, &mut expr2, Substring::Prefix);
295                 let common_suffix =
296                     Self::remove_common_substring(&mut expr1, &mut expr2, Substring::Suffix);
297 
298                 let mut result = if expr1.is_empty() {
299                     Some(Expression::new_repetition(
300                         expr2.clone(),
301                         Quantifier::QuestionMark,
302                         config,
303                     ))
304                 } else if expr2.is_empty() {
305                     Some(Expression::new_repetition(
306                         expr1.clone(),
307                         Quantifier::QuestionMark,
308                         config,
309                     ))
310                 } else {
311                     None
312                 };
313 
314                 if result.is_none() {
315                     if let Expression::Repetition(expr, quantifier, _) = &expr1 {
316                         if quantifier == &Quantifier::QuestionMark {
317                             let alternation =
318                                 Expression::new_alternation(*expr.clone(), expr2.clone(), config);
319                             result = Some(Expression::new_repetition(
320                                 alternation,
321                                 Quantifier::QuestionMark,
322                                 config,
323                             ));
324                         }
325                     }
326                 }
327 
328                 if result.is_none() {
329                     if let Expression::Repetition(expr, quantifier, _) = &expr2 {
330                         if quantifier == &Quantifier::QuestionMark {
331                             let alternation =
332                                 Expression::new_alternation(expr1.clone(), *expr.clone(), config);
333                             result = Some(Expression::new_repetition(
334                                 alternation,
335                                 Quantifier::QuestionMark,
336                                 config,
337                             ));
338                         }
339                     }
340                 }
341 
342                 if result.is_none() && expr1.is_single_codepoint() && expr2.is_single_codepoint() {
343                     let first_char_set = Self::extract_character_set(expr1.clone());
344                     let second_char_set = Self::extract_character_set(expr2.clone());
345                     result = Some(Expression::new_character_class(
346                         first_char_set,
347                         second_char_set,
348                         config,
349                     ));
350                 }
351 
352                 if result.is_none() {
353                     result = Some(Expression::new_alternation(expr1, expr2, config));
354                 }
355 
356                 if let Some(prefix) = common_prefix {
357                     result = Some(Expression::new_concatenation(
358                         Expression::new_literal(
359                             GraphemeCluster::from_graphemes(prefix, config),
360                             config,
361                         ),
362                         result.unwrap(),
363                         config,
364                     ));
365                 }
366 
367                 if let Some(suffix) = common_suffix {
368                     result = Some(Expression::new_concatenation(
369                         result.unwrap(),
370                         Expression::new_literal(
371                             GraphemeCluster::from_graphemes(suffix, config),
372                             config,
373                         ),
374                         config,
375                     ));
376                 }
377 
378                 result
379             } else if a.is_some() {
380                 a.clone()
381             } else if b.is_some() {
382                 b.clone()
383             } else {
384                 None
385             }
386         } else if a.is_some() {
387             a.clone()
388         } else if b.is_some() {
389             b.clone()
390         } else {
391             None
392         }
393     }
394 
flatten_alternations( flattened_options: &mut Vec<Expression>, current_options: Vec<Expression>, )395     fn flatten_alternations(
396         flattened_options: &mut Vec<Expression>,
397         current_options: Vec<Expression>,
398     ) {
399         for option in current_options {
400             if let Expression::Alternation(expr_options, _) = option {
401                 Self::flatten_alternations(flattened_options, expr_options);
402             } else {
403                 flattened_options.push(option);
404             }
405         }
406     }
407 
extract_character_set(expr: Expression) -> BTreeSet<char>408     fn extract_character_set(expr: Expression) -> BTreeSet<char> {
409         match expr {
410             Expression::Literal(cluster, _) => {
411                 let single_char = cluster
412                     .graphemes()
413                     .first()
414                     .unwrap()
415                     .value()
416                     .chars()
417                     .next()
418                     .unwrap();
419                 btreeset![single_char]
420             }
421             Expression::CharacterClass(char_set, _) => char_set,
422             _ => BTreeSet::new(),
423         }
424     }
425 
remove_common_substring( a: &mut Expression, b: &mut Expression, substring: Substring, ) -> Option<Vec<Grapheme>>426     fn remove_common_substring(
427         a: &mut Expression,
428         b: &mut Expression,
429         substring: Substring,
430     ) -> Option<Vec<Grapheme>> {
431         let common_substring = Self::find_common_substring(a, b, &substring);
432         if let Some(value) = &common_substring {
433             a.remove_substring(&substring, value.len());
434             b.remove_substring(&substring, value.len());
435         }
436         common_substring
437     }
438 
find_common_substring( a: &Expression, b: &Expression, substring: &Substring, ) -> Option<Vec<Grapheme>>439     fn find_common_substring(
440         a: &Expression,
441         b: &Expression,
442         substring: &Substring,
443     ) -> Option<Vec<Grapheme>> {
444         let mut graphemes_a = a.value(Some(substring)).unwrap_or_else(Vec::new);
445         let mut graphemes_b = b.value(Some(substring)).unwrap_or_else(Vec::new);
446         let mut common_graphemes = vec![];
447 
448         if let Substring::Suffix = substring {
449             graphemes_a.reverse();
450             graphemes_b.reverse();
451         }
452 
453         for pair in graphemes_a.iter().zip_longest(graphemes_b.iter()) {
454             match pair {
455                 Both(grapheme_a, grapheme_b) => {
456                     if grapheme_a == grapheme_b {
457                         common_graphemes.push(grapheme_a.clone());
458                     } else {
459                         break;
460                     }
461                 }
462                 _ => break,
463             }
464         }
465 
466         if let Substring::Suffix = substring {
467             common_graphemes.reverse();
468         }
469 
470         if common_graphemes.is_empty() {
471             None
472         } else {
473             Some(common_graphemes)
474         }
475     }
476 }
477 
478 #[cfg(test)]
479 mod tests {
480     use super::*;
481 
482     #[test]
ensure_correct_string_representation_of_alternation_1()483     fn ensure_correct_string_representation_of_alternation_1() {
484         let config = RegExpConfig::new();
485         let literal1 = Expression::new_literal(GraphemeCluster::from("abc", &config), &config);
486         let literal2 = Expression::new_literal(GraphemeCluster::from("def", &config), &config);
487         let alternation = Expression::new_alternation(literal1, literal2, &config);
488         assert_eq!(alternation.to_string(), "abc|def");
489     }
490 
491     #[test]
ensure_correct_string_representation_of_alternation_2()492     fn ensure_correct_string_representation_of_alternation_2() {
493         let config = RegExpConfig::new();
494         let literal1 = Expression::new_literal(GraphemeCluster::from("a", &config), &config);
495         let literal2 = Expression::new_literal(GraphemeCluster::from("ab", &config), &config);
496         let literal3 = Expression::new_literal(GraphemeCluster::from("abc", &config), &config);
497         let alternation1 = Expression::new_alternation(literal1, literal2, &config);
498         let alternation2 = Expression::new_alternation(alternation1, literal3, &config);
499         assert_eq!(alternation2.to_string(), "abc|ab|a");
500     }
501 
502     #[test]
ensure_correct_string_representation_of_character_class_1()503     fn ensure_correct_string_representation_of_character_class_1() {
504         let config = RegExpConfig::new();
505         let char_class = Expression::new_character_class(btreeset!['a'], btreeset!['b'], &config);
506         assert_eq!(char_class.to_string(), "[ab]");
507     }
508 
509     #[test]
ensure_correct_string_representation_of_character_class_2()510     fn ensure_correct_string_representation_of_character_class_2() {
511         let config = RegExpConfig::new();
512         let char_class =
513             Expression::new_character_class(btreeset!['a', 'b'], btreeset!['c'], &config);
514         assert_eq!(char_class.to_string(), "[a-c]");
515     }
516 
517     #[test]
ensure_correct_string_representation_of_concatenation_1()518     fn ensure_correct_string_representation_of_concatenation_1() {
519         let config = RegExpConfig::new();
520         let literal1 = Expression::new_literal(GraphemeCluster::from("abc", &config), &config);
521         let literal2 = Expression::new_literal(GraphemeCluster::from("def", &config), &config);
522         let concatenation = Expression::new_concatenation(literal1, literal2, &config);
523         assert_eq!(concatenation.to_string(), "abcdef");
524     }
525 
526     #[test]
ensure_correct_string_representation_of_concatenation_2()527     fn ensure_correct_string_representation_of_concatenation_2() {
528         let config = RegExpConfig::new();
529         let literal1 = Expression::new_literal(GraphemeCluster::from("abc", &config), &config);
530         let literal2 = Expression::new_literal(GraphemeCluster::from("def", &config), &config);
531         let repetition = Expression::new_repetition(literal1, Quantifier::KleeneStar, &config);
532         let concatenation = Expression::new_concatenation(repetition, literal2, &config);
533         assert_eq!(concatenation.to_string(), "(?:abc)*def");
534     }
535 
536     #[test]
ensure_correct_removal_of_prefix_in_literal()537     fn ensure_correct_removal_of_prefix_in_literal() {
538         let config = RegExpConfig::new();
539         let mut literal =
540             Expression::new_literal(GraphemeCluster::from("abcdef", &config), &config);
541         assert_eq!(
542             literal.value(None),
543             Some(
544                 vec!["a", "b", "c", "d", "e", "f"]
545                     .iter()
546                     .map(|&it| Grapheme::from(it, &config))
547                     .collect_vec()
548             )
549         );
550 
551         literal.remove_substring(&Substring::Prefix, 2);
552         assert_eq!(
553             literal.value(None),
554             Some(
555                 vec!["c", "d", "e", "f"]
556                     .iter()
557                     .map(|&it| Grapheme::from(it, &config))
558                     .collect_vec()
559             )
560         );
561     }
562 
563     #[test]
ensure_correct_removal_of_suffix_in_literal()564     fn ensure_correct_removal_of_suffix_in_literal() {
565         let config = RegExpConfig::new();
566         let mut literal =
567             Expression::new_literal(GraphemeCluster::from("abcdef", &config), &config);
568         assert_eq!(
569             literal.value(None),
570             Some(
571                 vec!["a", "b", "c", "d", "e", "f"]
572                     .iter()
573                     .map(|&it| Grapheme::from(it, &config))
574                     .collect_vec()
575             )
576         );
577 
578         literal.remove_substring(&Substring::Suffix, 2);
579         assert_eq!(
580             literal.value(None),
581             Some(
582                 vec!["a", "b", "c", "d"]
583                     .iter()
584                     .map(|&it| Grapheme::from(it, &config))
585                     .collect_vec()
586             )
587         );
588     }
589 
590     #[test]
ensure_correct_string_representation_of_repetition_1()591     fn ensure_correct_string_representation_of_repetition_1() {
592         let config = RegExpConfig::new();
593         let literal = Expression::new_literal(GraphemeCluster::from("abc", &config), &config);
594         let repetition = Expression::new_repetition(literal, Quantifier::KleeneStar, &config);
595         assert_eq!(repetition.to_string(), "(?:abc)*");
596     }
597 
598     #[test]
ensure_correct_string_representation_of_repetition_2()599     fn ensure_correct_string_representation_of_repetition_2() {
600         let config = RegExpConfig::new();
601         let literal = Expression::new_literal(GraphemeCluster::from("a", &config), &config);
602         let repetition = Expression::new_repetition(literal, Quantifier::QuestionMark, &config);
603         assert_eq!(repetition.to_string(), "a?");
604     }
605 }
606