1 //! Generates an iterator type `Matcher` that looks roughly like
2 //!
3 //! ```ignore
4 //! mod intern_token {
5 //!     extern crate regex as regex;
6 //!
7 //!     #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
8 //!     pub struct Token<'input>(pub usize, pub &'input str);
9 //!     //                           ~~~~~~     ~~~~~~~~~~~
10 //!     //                           token      token
11 //!     //                           index      text
12 //!     //                           (type)
13 //!
14 //!     impl<'a> fmt::Display for Token<'a> { ... }
15 //!
16 //!     pub struct MatcherBuilder {
17 //!         regex_set: regex::RegexSet,
18 //!         regex_vec: Vec<regex::Regex>,
19 //!     }
20 //!
21 //!     impl MatcherBuilder {
//!         fn new() -> MatcherBuilder { ... }
23 //!         fn matcher<'input, 'builder>(&'builder self, s: &'input str) -> Matcher<'input, 'builder> { ... }
24 //!     }
25 //!
26 //!     pub struct Matcher<'input, 'builder> {
27 //!         text: &'input str,
28 //!         consumed: usize,
29 //!         regex_set: &'builder regex::RegexSet,
30 //!         regex_vec: &'builder Vec<regex::Regex>,
31 //!     }
32 //!
//!     impl<'input, 'builder> Matcher<'input, 'builder> {
34 //!         fn tokenize(&self, text: &str) -> Option<(usize, usize)> { ... }
35 //!     }
36 //!
//!     impl<'input, 'builder> Iterator for Matcher<'input, 'builder> {
38 //!         type Item = Result<(usize, Token<'input>, usize), ParseError>;
39 //!         //                  ~~~~~  ~~~~~~~~~~~~~  ~~~~~
40 //!         //                  start  token          end
41 //!     }
42 //! }
43 //! ```
44 
45 use grammar::parse_tree::InternToken;
46 use grammar::repr::{Grammar, TerminalLiteral};
47 use lexer::re;
48 use rust::RustWrite;
49 use std::io::{self, Write};
50 
compile<W: Write>( grammar: &Grammar, intern_token: &InternToken, out: &mut RustWrite<W>, ) -> io::Result<()>51 pub fn compile<W: Write>(
52     grammar: &Grammar,
53     intern_token: &InternToken,
54     out: &mut RustWrite<W>,
55 ) -> io::Result<()> {
56     let prefix = &grammar.prefix;
57 
58     rust!(out, "#[cfg_attr(rustfmt, rustfmt_skip)]");
59     rust!(out, "mod {}intern_token {{", prefix);
60     rust!(out, "#![allow(unused_imports)]");
61     out.write_uses("", &grammar)?;
62     rust!(out, "extern crate regex as {}regex;", prefix);
63     rust!(out, "use std::fmt as {}fmt;", prefix);
64     rust!(out, "");
65     rust!(
66         out,
67         "#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]"
68     );
69     rust!(out, "pub struct Token<'input>(pub usize, pub &'input str);");
70     rust!(out, "impl<'a> {}fmt::Display for Token<'a> {{", prefix);
71     rust!(
72         out,
73         "fn fmt<'f>(&self, formatter: &mut {}fmt::Formatter<'f>) -> Result<(), {}fmt::Error> {{",
74         prefix,
75         prefix
76     );
77     rust!(out, "{}fmt::Display::fmt(self.1, formatter)", prefix);
78     rust!(out, "}}");
79     rust!(out, "}}");
80     rust!(out, "");
81     rust!(out, "pub struct {}MatcherBuilder {{", prefix);
82     rust!(out, "regex_set: {}regex::RegexSet,", prefix);
83     rust!(out, "regex_vec: Vec<{}regex::Regex>,", prefix);
84     rust!(out, "}}");
85     rust!(out, "");
86     rust!(out, "impl {}MatcherBuilder {{", prefix);
87     rust!(out, "pub fn new() -> {}MatcherBuilder {{", prefix);
88 
89     // create a vector of rust string literals with the text of each
90     // regular expression
91     let regex_strings: Vec<String> = {
92         intern_token
93             .match_entries
94             .iter()
95             .map(|match_entry| match match_entry.match_literal {
96                 TerminalLiteral::Quoted(ref s) => re::parse_literal(&s),
97                 TerminalLiteral::Regex(ref s) => re::parse_regex(&s).unwrap(),
98             })
99             .map(|regex| {
100                 // make sure all regex are anchored at the beginning of the input
101                 format!("^({})", regex)
102             })
103             .map(|regex_str| {
104                 // create a rust string with text of the regex; the Debug impl
105                 // will add quotes and escape
106                 format!("{:?}", regex_str)
107             })
108             .collect()
109     };
110 
111     rust!(out, "let {}strs: &[&str] = &[", prefix);
112     for literal in &regex_strings {
113         rust!(out, "{},", literal);
114     }
115     rust!(out, "];");
116     rust!(
117         out,
118         "let {}regex_set = {}regex::RegexSet::new({}strs).unwrap();",
119         prefix,
120         prefix,
121         prefix
122     );
123 
124     rust!(out, "let {}regex_vec = vec![", prefix);
125     for literal in &regex_strings {
126         rust!(out, "{}regex::Regex::new({}).unwrap(),", prefix, literal);
127     }
128     rust!(out, "];");
129 
130     rust!(
131         out,
132         "{0}MatcherBuilder {{ regex_set: {0}regex_set, regex_vec: {0}regex_vec }}",
133         prefix
134     );
135     rust!(out, "}}"); // fn new()
136     rust!(
137         out,
138         "pub fn matcher<'input, 'builder>(&'builder self, s: &'input str) \
139          -> {}Matcher<'input, 'builder> {{",
140         prefix
141     );
142     rust!(out, "{}Matcher {{", prefix);
143     rust!(out, "text: s,");
144     rust!(out, "consumed: 0,");
145     rust!(out, "regex_set: &self.regex_set,");
146     rust!(out, "regex_vec: &self.regex_vec,");
147     rust!(out, "}}"); // struct literal
148     rust!(out, "}}"); // fn matcher()
149     rust!(out, "}}"); // impl MatcherBuilder
150     rust!(out, "");
151     rust!(out, "pub struct {}Matcher<'input, 'builder> {{", prefix);
152     rust!(out, "text: &'input str,"); // remaining input
153     rust!(out, "consumed: usize,"); // number of chars consumed thus far
154     rust!(out, "regex_set: &'builder {}regex::RegexSet,", prefix);
155     rust!(out, "regex_vec: &'builder Vec<{}regex::Regex>,", prefix);
156     rust!(out, "}}");
157     rust!(out, "");
158     rust!(
159         out,
160         "impl<'input, 'builder> Iterator for {}Matcher<'input, 'builder> {{",
161         prefix
162     );
163     rust!(
164         out,
165         "type Item = Result<(usize, Token<'input>, usize), \
166          {}lalrpop_util::ParseError<usize,Token<'input>,{}>>;",
167         prefix,
168         grammar.types.error_type()
169     );
170     rust!(out, "");
171     rust!(out, "fn next(&mut self) -> Option<Self::Item> {{");
172 
173     // start by trimming whitespace from left
174     rust!(out, "let {}text = self.text.trim_start();", prefix);
175     rust!(
176         out,
177         "let {}whitespace = self.text.len() - {}text.len();",
178         prefix,
179         prefix
180     );
181     rust!(
182         out,
183         "let {}start_offset = self.consumed + {}whitespace;",
184         prefix,
185         prefix
186     );
187 
188     // if nothing left, return None
189     rust!(out, "if {}text.is_empty() {{", prefix);
190     rust!(out, "self.text = {}text;", prefix);
191     rust!(out, "self.consumed = {}start_offset;", prefix);
192     rust!(out, "None");
193     rust!(out, "}} else {{");
194 
195     // otherwise, use regex-set to find list of matching tokens
196     rust!(
197         out,
198         "let {}matches = self.regex_set.matches({}text);",
199         prefix,
200         prefix
201     );
202 
203     // if nothing matched, return an error
204     rust!(out, "if !{}matches.matched_any() {{", prefix);
205     rust!(
206         out,
207         "Some(Err({}lalrpop_util::ParseError::InvalidToken {{",
208         prefix
209     );
210     rust!(out, "location: {}start_offset,", prefix);
211     rust!(out, "}}))");
212     rust!(out, "}} else {{");
213 
214     // otherwise, have to find longest, highest-priority match. We have the literals
215     // sorted in order of increasing precedence, so we'll iterate over them one by one,
216     // checking if each one matches, and remembering the longest one.
217     rust!(out, "let mut {}longest_match = 0;", prefix); // length of longest match
218     rust!(out, "let mut {}index = 0;", prefix); // index of longest match
219     rust!(
220         out,
221         "for {}i in 0 .. {} {{",
222         prefix,
223         intern_token.match_entries.len()
224     );
225     rust!(out, "if {}matches.matched({}i) {{", prefix, prefix);
226 
227     // re-run the regex to find out how long this particular match
228     // was, then compare that against the longest-match so far. Note
229     // that the order of the tuple is carefully constructed to ensure
230     // that (a) we get the longest-match but (b) if two matches are
231     // equal, we get the largest index. This is because the indices
232     // are sorted in order of increasing priority, and because we know
233     // that indices of equal priority cannot both match (because of
234     // the DFA check).
235     rust!(
236         out,
237         "let {}match = self.regex_vec[{}i].find({}text).unwrap();",
238         prefix,
239         prefix,
240         prefix
241     );
242     rust!(out, "let {}len = {}match.end();", prefix, prefix);
243     rust!(out, "if {}len >= {}longest_match {{", prefix, prefix);
244     rust!(out, "{}longest_match = {}len;", prefix, prefix);
245     rust!(out, "{}index = {}i;", prefix, prefix);
246     rust!(out, "}}"); // if is longest match
247     rust!(out, "}}"); // if matches.matched(i)
248     rust!(out, "}}"); // for loop
249 
250     // transform the result into the expected return value
251     rust!(
252         out,
253         "let {}result = &{}text[..{}longest_match];",
254         prefix,
255         prefix,
256         prefix
257     );
258     rust!(
259         out,
260         "let {}remaining = &{}text[{}longest_match..];",
261         prefix,
262         prefix,
263         prefix
264     );
265     rust!(
266         out,
267         "let {}end_offset = {}start_offset + {}longest_match;",
268         prefix,
269         prefix,
270         prefix
271     );
272     rust!(out, "self.text = {}remaining;", prefix);
273     rust!(out, "self.consumed = {}end_offset;", prefix);
274     rust!(
275         out,
276         "Some(Ok(({}start_offset, Token({}index, {}result), {}end_offset)))",
277         prefix,
278         prefix,
279         prefix,
280         prefix
281     );
282 
283     rust!(out, "}}"); // else
284     rust!(out, "}}"); // else
285     rust!(out, "}}"); // fn
286     rust!(out, "}}"); // impl
287     rust!(out, "}}"); // mod
288     Ok(())
289 }
290