1 /*!
2 This module provides a regular expression printer for `Hir`.
3 */
4 
5 use std::fmt;
6 
7 use crate::hir::visitor::{self, Visitor};
8 use crate::hir::{self, Hir, HirKind};
9 use crate::is_meta_character;
10 
/// A builder for constructing a printer.
///
/// Note that since a printer doesn't have any configuration knobs, this type
/// remains unexported.
#[derive(Clone, Debug)]
struct PrinterBuilder {
    // Unit-typed placeholder: the builder currently stores no settings.
    _priv: (),
}
19 
20 impl Default for PrinterBuilder {
default() -> PrinterBuilder21     fn default() -> PrinterBuilder {
22         PrinterBuilder::new()
23     }
24 }
25 
26 impl PrinterBuilder {
new() -> PrinterBuilder27     fn new() -> PrinterBuilder {
28         PrinterBuilder { _priv: () }
29     }
30 
build(&self) -> Printer31     fn build(&self) -> Printer {
32         Printer { _priv: () }
33     }
34 }
35 
/// A printer for a regular expression's high-level intermediate
/// representation.
///
/// A printer converts a high-level intermediate representation (HIR) to a
/// regular expression pattern string. This particular printer uses constant
/// stack space and heap space proportional to the size of the HIR.
///
/// Since this printer is only using the HIR, the pattern it prints will likely
/// not resemble the original pattern at all. For example, a pattern like
/// `\pL` will have its entire class written out.
///
/// The purpose of this printer is to provide a means to mutate an HIR and then
/// build a regular expression from the result of that mutation. (A regex
/// library could provide a constructor from this HIR explicitly, but that
/// creates an unnecessary public coupling between the regex library and this
/// specific HIR representation.)
#[derive(Debug)]
pub struct Printer {
    // Private unit placeholder: because the field is not public, code outside
    // this module cannot construct a `Printer` with a struct literal and has
    // to go through `Printer::new`.
    _priv: (),
}
56 
57 impl Printer {
58     /// Create a new printer.
new() -> Printer59     pub fn new() -> Printer {
60         PrinterBuilder::new().build()
61     }
62 
63     /// Print the given `Ast` to the given writer. The writer must implement
64     /// `fmt::Write`. Typical implementations of `fmt::Write` that can be used
65     /// here are a `fmt::Formatter` (which is available in `fmt::Display`
66     /// implementations) or a `&mut String`.
print<W: fmt::Write>(&mut self, hir: &Hir, wtr: W) -> fmt::Result67     pub fn print<W: fmt::Write>(&mut self, hir: &Hir, wtr: W) -> fmt::Result {
68         visitor::visit(hir, Writer { printer: self, wtr: wtr })
69     }
70 }
71 
/// The visitor state used while printing: a borrowed printer paired with the
/// destination that receives the pattern text.
#[derive(Debug)]
struct Writer<'p, W> {
    // The printer that started this traversal. None of the code visible in
    // this module reads it; it is only stored.
    printer: &'p mut Printer,
    // The output sink. The impls in this module require `W: fmt::Write`.
    wtr: W,
}
77 
78 impl<'p, W: fmt::Write> Visitor for Writer<'p, W> {
79     type Output = ();
80     type Err = fmt::Error;
81 
finish(self) -> fmt::Result82     fn finish(self) -> fmt::Result {
83         Ok(())
84     }
85 
visit_pre(&mut self, hir: &Hir) -> fmt::Result86     fn visit_pre(&mut self, hir: &Hir) -> fmt::Result {
87         match *hir.kind() {
88             HirKind::Empty
89             | HirKind::Repetition(_)
90             | HirKind::Concat(_)
91             | HirKind::Alternation(_) => {}
92             HirKind::Literal(hir::Literal::Unicode(c)) => {
93                 self.write_literal_char(c)?;
94             }
95             HirKind::Literal(hir::Literal::Byte(b)) => {
96                 self.write_literal_byte(b)?;
97             }
98             HirKind::Class(hir::Class::Unicode(ref cls)) => {
99                 self.wtr.write_str("[")?;
100                 for range in cls.iter() {
101                     if range.start() == range.end() {
102                         self.write_literal_char(range.start())?;
103                     } else {
104                         self.write_literal_char(range.start())?;
105                         self.wtr.write_str("-")?;
106                         self.write_literal_char(range.end())?;
107                     }
108                 }
109                 self.wtr.write_str("]")?;
110             }
111             HirKind::Class(hir::Class::Bytes(ref cls)) => {
112                 self.wtr.write_str("(?-u:[")?;
113                 for range in cls.iter() {
114                     if range.start() == range.end() {
115                         self.write_literal_class_byte(range.start())?;
116                     } else {
117                         self.write_literal_class_byte(range.start())?;
118                         self.wtr.write_str("-")?;
119                         self.write_literal_class_byte(range.end())?;
120                     }
121                 }
122                 self.wtr.write_str("])")?;
123             }
124             HirKind::Anchor(hir::Anchor::StartLine) => {
125                 self.wtr.write_str("(?m:^)")?;
126             }
127             HirKind::Anchor(hir::Anchor::EndLine) => {
128                 self.wtr.write_str("(?m:$)")?;
129             }
130             HirKind::Anchor(hir::Anchor::StartText) => {
131                 self.wtr.write_str(r"\A")?;
132             }
133             HirKind::Anchor(hir::Anchor::EndText) => {
134                 self.wtr.write_str(r"\z")?;
135             }
136             HirKind::WordBoundary(hir::WordBoundary::Unicode) => {
137                 self.wtr.write_str(r"\b")?;
138             }
139             HirKind::WordBoundary(hir::WordBoundary::UnicodeNegate) => {
140                 self.wtr.write_str(r"\B")?;
141             }
142             HirKind::WordBoundary(hir::WordBoundary::Ascii) => {
143                 self.wtr.write_str(r"(?-u:\b)")?;
144             }
145             HirKind::WordBoundary(hir::WordBoundary::AsciiNegate) => {
146                 self.wtr.write_str(r"(?-u:\B)")?;
147             }
148             HirKind::Group(ref x) => match x.kind {
149                 hir::GroupKind::CaptureIndex(_) => {
150                     self.wtr.write_str("(")?;
151                 }
152                 hir::GroupKind::CaptureName { ref name, .. } => {
153                     write!(self.wtr, "(?P<{}>", name)?;
154                 }
155                 hir::GroupKind::NonCapturing => {
156                     self.wtr.write_str("(?:")?;
157                 }
158             },
159         }
160         Ok(())
161     }
162 
visit_post(&mut self, hir: &Hir) -> fmt::Result163     fn visit_post(&mut self, hir: &Hir) -> fmt::Result {
164         match *hir.kind() {
165             // Handled during visit_pre
166             HirKind::Empty
167             | HirKind::Literal(_)
168             | HirKind::Class(_)
169             | HirKind::Anchor(_)
170             | HirKind::WordBoundary(_)
171             | HirKind::Concat(_)
172             | HirKind::Alternation(_) => {}
173             HirKind::Repetition(ref x) => {
174                 match x.kind {
175                     hir::RepetitionKind::ZeroOrOne => {
176                         self.wtr.write_str("?")?;
177                     }
178                     hir::RepetitionKind::ZeroOrMore => {
179                         self.wtr.write_str("*")?;
180                     }
181                     hir::RepetitionKind::OneOrMore => {
182                         self.wtr.write_str("+")?;
183                     }
184                     hir::RepetitionKind::Range(ref x) => match *x {
185                         hir::RepetitionRange::Exactly(m) => {
186                             write!(self.wtr, "{{{}}}", m)?;
187                         }
188                         hir::RepetitionRange::AtLeast(m) => {
189                             write!(self.wtr, "{{{},}}", m)?;
190                         }
191                         hir::RepetitionRange::Bounded(m, n) => {
192                             write!(self.wtr, "{{{},{}}}", m, n)?;
193                         }
194                     },
195                 }
196                 if !x.greedy {
197                     self.wtr.write_str("?")?;
198                 }
199             }
200             HirKind::Group(_) => {
201                 self.wtr.write_str(")")?;
202             }
203         }
204         Ok(())
205     }
206 
visit_alternation_in(&mut self) -> fmt::Result207     fn visit_alternation_in(&mut self) -> fmt::Result {
208         self.wtr.write_str("|")
209     }
210 }
211 
212 impl<'p, W: fmt::Write> Writer<'p, W> {
write_literal_char(&mut self, c: char) -> fmt::Result213     fn write_literal_char(&mut self, c: char) -> fmt::Result {
214         if is_meta_character(c) {
215             self.wtr.write_str("\\")?;
216         }
217         self.wtr.write_char(c)
218     }
219 
220     fn write_literal_byte(&mut self, b: u8) -> fmt::Result {
221         let c = b as char;
222         if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() {
223             self.write_literal_char(c)
224         } else {
225             write!(self.wtr, "(?-u:\\x{:02X})", b)
226         }
227     }
228 
229     fn write_literal_class_byte(&mut self, b: u8) -> fmt::Result {
230         let c = b as char;
231         if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() {
232             self.write_literal_char(c)
233         } else {
234             write!(self.wtr, "\\x{:02X}", b)
235         }
236     }
237 }
238 
#[cfg(test)]
mod tests {
    use super::Printer;
    use crate::ParserBuilder;

    /// Round-trips `given` through a parser built with the default
    /// `ParserBuilder` settings and checks that it prints as `expected`.
    fn roundtrip(given: &str, expected: &str) {
        roundtrip_with(|b| b, given, expected);
    }

    /// Like `roundtrip`, but with `allow_invalid_utf8(true)` set on the
    /// parser builder.
    fn roundtrip_bytes(given: &str, expected: &str) {
        roundtrip_with(|b| b.allow_invalid_utf8(true), given, expected);
    }

    /// Parses `given` into an HIR using a builder configured by `f`, prints
    /// that HIR, and asserts that the printed pattern both parses again with
    /// the same builder and is exactly `expected`.
    fn roundtrip_with<F>(mut f: F, given: &str, expected: &str)
    where
        F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder,
    {
        let mut builder = ParserBuilder::new();
        f(&mut builder);
        let hir = builder.build().parse(given).unwrap();

        let mut printer = Printer::new();
        let mut dst = String::new();
        printer.print(&hir, &mut dst).unwrap();

        // Check that the result is actually valid.
        builder.build().parse(&dst).unwrap();

        assert_eq!(expected, dst);
    }

    // Literals: characters are printed as themselves, while a non-ASCII byte
    // literal is printed as a hex escape inside `(?-u:...)`.
    #[test]
    fn print_literal() {
        roundtrip("a", "a");
        roundtrip(r"\xff", "\u{FF}");
        roundtrip_bytes(r"\xff", "\u{FF}");
        roundtrip_bytes(r"(?-u)\xff", r"(?-u:\xFF)");
        roundtrip("☃", "☃");
    }

    // Character classes are printed from their ranges, so set operations in
    // the input show up already resolved in the output.
    #[test]
    fn print_class() {
        roundtrip(r"[a]", r"[a]");
        roundtrip(r"[a-z]", r"[a-z]");
        roundtrip(r"[a-z--b-c--x-y]", r"[ad-wz]");
        roundtrip(r"[^\x01-\u{10FFFF}]", "[\u{0}]");
        roundtrip(r"[-]", r"[\-]");
        roundtrip(r"[☃-⛄]", r"[☃-⛄]");

        roundtrip(r"(?-u)[a]", r"(?-u:[a])");
        roundtrip(r"(?-u)[a-z]", r"(?-u:[a-z])");
        roundtrip_bytes(r"(?-u)[a-\xFF]", r"(?-u:[a-\xFF])");

        // The following test that the printer escapes meta characters
        // in character classes.
        roundtrip(r"[\[]", r"[\[]");
        roundtrip(r"[Z-_]", r"[Z-_]");
        roundtrip(r"[Z-_--Z]", r"[\[-_]");

        // The following test that the printer escapes meta characters
        // in byte oriented character classes.
        roundtrip_bytes(r"(?-u)[\[]", r"(?-u:[\[])");
        roundtrip_bytes(r"(?-u)[Z-_]", r"(?-u:[Z-_])");
        roundtrip_bytes(r"(?-u)[Z-_--Z]", r"(?-u:[\[-_])");
    }

    // Text anchors print as `\A` / `\z`; line anchors carry their own
    // `(?m:...)` wrapper.
    #[test]
    fn print_anchor() {
        roundtrip(r"^", r"\A");
        roundtrip(r"$", r"\z");
        roundtrip(r"(?m)^", r"(?m:^)");
        roundtrip(r"(?m)$", r"(?m:$)");
    }

    // ASCII word boundaries are wrapped in `(?-u:...)`; Unicode ones are
    // printed bare.
    #[test]
    fn print_word_boundary() {
        roundtrip(r"\b", r"\b");
        roundtrip(r"\B", r"\B");
        roundtrip(r"(?-u)\b", r"(?-u:\b)");
        roundtrip_bytes(r"(?-u)\B", r"(?-u:\B)");
    }

    // Every repetition operator, in greedy and non-greedy form. The `(?U)`
    // inputs print with an explicit trailing `?`.
    #[test]
    fn print_repetition() {
        roundtrip("a?", "a?");
        roundtrip("a??", "a??");
        roundtrip("(?U)a?", "a??");

        roundtrip("a*", "a*");
        roundtrip("a*?", "a*?");
        roundtrip("(?U)a*", "a*?");

        roundtrip("a+", "a+");
        roundtrip("a+?", "a+?");
        roundtrip("(?U)a+", "a+?");

        roundtrip("a{1}", "a{1}");
        roundtrip("a{1,}", "a{1,}");
        roundtrip("a{1,5}", "a{1,5}");
        roundtrip("a{1}?", "a{1}?");
        roundtrip("a{1,}?", "a{1,}?");
        roundtrip("a{1,5}?", "a{1,5}?");
        roundtrip("(?U)a{1}", "a{1}?");
        roundtrip("(?U)a{1,}", "a{1,}?");
        roundtrip("(?U)a{1,5}", "a{1,5}?");
    }

    // Capturing, named and non-capturing groups, both empty and non-empty,
    // plus nesting.
    #[test]
    fn print_group() {
        roundtrip("()", "()");
        roundtrip("(?P<foo>)", "(?P<foo>)");
        roundtrip("(?:)", "(?:)");

        roundtrip("(a)", "(a)");
        roundtrip("(?P<foo>a)", "(?P<foo>a)");
        roundtrip("(?:a)", "(?:a)");

        roundtrip("((((a))))", "((((a))))");
    }

    // Alternations, including ones whose branches are all empty.
    #[test]
    fn print_alternation() {
        roundtrip("|", "|");
        roundtrip("||", "||");

        roundtrip("a|b", "a|b");
        roundtrip("a|b|c", "a|b|c");
        roundtrip("foo|bar|quux", "foo|bar|quux");
    }
}
369