1 /*!
2 
3 Generates an iterator type `Matcher` that looks roughly like
4 
5 ```ignore
6 mod intern_token {
7     extern crate regex as regex;
8 
9     #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
10     pub struct Token<'input>(pub usize, pub &'input str);
11     //                           ~~~~~~     ~~~~~~~~~~~
12     //                           token      token
13     //                           index      text
14     //                           (type)
15 
16     impl<'a> fmt::Display for Token<'a> { ... }
17 
18     pub struct MatcherBuilder {
19         regex_set: regex::RegexSet,
20         regex_vec: Vec<regex::Regex>,
21     }
22 
23     impl MatcherBuilder {
        fn new() -> MatcherBuilder { ... }
25         fn matcher<'input, 'builder>(&'builder self, s: &'input str) -> Matcher<'input, 'builder> { ... }
26     }
27 
28     pub struct Matcher<'input, 'builder> {
29         text: &'input str,
30         consumed: usize,
31         regex_set: &'builder regex::RegexSet,
32         regex_vec: &'builder Vec<regex::Regex>,
33     }
34 
    impl<'input, 'builder> Matcher<'input, 'builder> {
36         fn tokenize(&self, text: &str) -> Option<(usize, usize)> { ... }
37     }
38 
    impl<'input, 'builder> Iterator for Matcher<'input, 'builder> {
40         type Item = Result<(usize, Token<'input>, usize), ParseError>;
41         //                  ~~~~~  ~~~~~~~~~~~~~  ~~~~~
42         //                  start  token          end
43     }
44 }
45 ```
46 
47  */
48 
49 use grammar::parse_tree::InternToken;
50 use grammar::repr::{Grammar, TerminalLiteral};
51 use lexer::re;
52 use rust::RustWrite;
53 use std::io::{self, Write};
54 
compile<W: Write>( grammar: &Grammar, intern_token: &InternToken, out: &mut RustWrite<W>, ) -> io::Result<()>55 pub fn compile<W: Write>(
56     grammar: &Grammar,
57     intern_token: &InternToken,
58     out: &mut RustWrite<W>,
59 ) -> io::Result<()> {
60     let prefix = &grammar.prefix;
61 
62     rust!(out, "#[cfg_attr(rustfmt, rustfmt_skip)]");
63     rust!(out, "mod {}intern_token {{", prefix);
64     rust!(out, "#![allow(unused_imports)]");
65     try!(out.write_uses("", &grammar));
66     rust!(out, "extern crate regex as {}regex;", prefix);
67     rust!(out, "use std::fmt as {}fmt;", prefix);
68     rust!(out, "");
69     rust!(
70         out,
71         "#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]"
72     );
73     rust!(out, "pub struct Token<'input>(pub usize, pub &'input str);");
74     rust!(out, "impl<'a> {}fmt::Display for Token<'a> {{", prefix);
75     rust!(
76         out,
77         "fn fmt(&self, formatter: &mut {}fmt::Formatter) -> Result<(), {}fmt::Error> {{",
78         prefix,
79         prefix
80     );
81     rust!(out, "{}fmt::Display::fmt(self.1, formatter)", prefix);
82     rust!(out, "}}");
83     rust!(out, "}}");
84     rust!(out, "");
85     rust!(out, "pub struct {}MatcherBuilder {{", prefix);
86     rust!(out, "regex_set: {}regex::RegexSet,", prefix);
87     rust!(out, "regex_vec: Vec<{}regex::Regex>,", prefix);
88     rust!(out, "}}");
89     rust!(out, "");
90     rust!(out, "impl {}MatcherBuilder {{", prefix);
91     rust!(out, "pub fn new() -> {}MatcherBuilder {{", prefix);
92 
93     // create a vector of rust string literals with the text of each
94     // regular expression
95     let regex_strings: Vec<String> = {
96         intern_token
97             .match_entries
98             .iter()
99             .map(|match_entry| match match_entry.match_literal {
100                 TerminalLiteral::Quoted(ref s) => re::parse_literal(&s),
101                 TerminalLiteral::Regex(ref s) => re::parse_regex(&s).unwrap(),
102             })
103             .map(|regex| {
104                 // make sure all regex are anchored at the beginning of the input
105                 format!("^({})", regex)
106             })
107             .map(|regex_str| {
108                 // create a rust string with text of the regex; the Debug impl
109                 // will add quotes and escape
110                 format!("{:?}", regex_str)
111             })
112             .collect()
113     };
114 
115     rust!(out, "let {}strs: &[&str] = &[", prefix);
116     for literal in &regex_strings {
117         rust!(out, "{},", literal);
118     }
119     rust!(out, "];");
120     rust!(
121         out,
122         "let {}regex_set = {}regex::RegexSet::new({}strs).unwrap();",
123         prefix,
124         prefix,
125         prefix
126     );
127 
128     rust!(out, "let {}regex_vec = vec![", prefix);
129     for literal in &regex_strings {
130         rust!(out, "{}regex::Regex::new({}).unwrap(),", prefix, literal);
131     }
132     rust!(out, "];");
133 
134     rust!(
135         out,
136         "{0}MatcherBuilder {{ regex_set: {0}regex_set, regex_vec: {0}regex_vec }}",
137         prefix
138     );
139     rust!(out, "}}"); // fn new()
140     rust!(
141         out,
142         "pub fn matcher<'input, 'builder>(&'builder self, s: &'input str) \
143          -> {}Matcher<'input, 'builder> {{",
144         prefix
145     );
146     rust!(out, "{}Matcher {{", prefix);
147     rust!(out, "text: s,");
148     rust!(out, "consumed: 0,");
149     rust!(out, "regex_set: &self.regex_set,");
150     rust!(out, "regex_vec: &self.regex_vec,");
151     rust!(out, "}}"); // struct literal
152     rust!(out, "}}"); // fn matcher()
153     rust!(out, "}}"); // impl MatcherBuilder
154     rust!(out, "");
155     rust!(out, "pub struct {}Matcher<'input, 'builder> {{", prefix);
156     rust!(out, "text: &'input str,"); // remaining input
157     rust!(out, "consumed: usize,"); // number of chars consumed thus far
158     rust!(out, "regex_set: &'builder {}regex::RegexSet,", prefix);
159     rust!(out, "regex_vec: &'builder Vec<{}regex::Regex>,", prefix);
160     rust!(out, "}}");
161     rust!(out, "");
162     rust!(
163         out,
164         "impl<'input, 'builder> Iterator for {}Matcher<'input, 'builder> {{",
165         prefix
166     );
167     rust!(
168         out,
169         "type Item = Result<(usize, Token<'input>, usize), \
170          {}lalrpop_util::ParseError<usize,Token<'input>,{}>>;",
171         prefix,
172         grammar.types.error_type()
173     );
174     rust!(out, "");
175     rust!(out, "fn next(&mut self) -> Option<Self::Item> {{");
176 
177     // start by trimming whitespace from left
178     rust!(out, "#[allow(deprecated)]");
179     rust!(out, "let {}text = self.text.trim_left();", prefix);
180     rust!(
181         out,
182         "let {}whitespace = self.text.len() - {}text.len();",
183         prefix,
184         prefix
185     );
186     rust!(
187         out,
188         "let {}start_offset = self.consumed + {}whitespace;",
189         prefix,
190         prefix
191     );
192 
193     // if nothing left, return None
194     rust!(out, "if {}text.is_empty() {{", prefix);
195     rust!(out, "self.text = {}text;", prefix);
196     rust!(out, "self.consumed = {}start_offset;", prefix);
197     rust!(out, "None");
198     rust!(out, "}} else {{");
199 
200     // otherwise, use regex-set to find list of matching tokens
201     rust!(
202         out,
203         "let {}matches = self.regex_set.matches({}text);",
204         prefix,
205         prefix
206     );
207 
208     // if nothing matched, return an error
209     rust!(out, "if !{}matches.matched_any() {{", prefix);
210     rust!(
211         out,
212         "Some(Err({}lalrpop_util::ParseError::InvalidToken {{",
213         prefix
214     );
215     rust!(out, "location: {}start_offset,", prefix);
216     rust!(out, "}}))");
217     rust!(out, "}} else {{");
218 
219     // otherwise, have to find longest, highest-priority match. We have the literals
220     // sorted in order of increasing precedence, so we'll iterate over them one by one,
221     // checking if each one matches, and remembering the longest one.
222     rust!(out, "let mut {}longest_match = 0;", prefix); // length of longest match
223     rust!(out, "let mut {}index = 0;", prefix); // index of longest match
224     rust!(
225         out,
226         "for {}i in 0 .. {} {{",
227         prefix,
228         intern_token.match_entries.len()
229     );
230     rust!(out, "if {}matches.matched({}i) {{", prefix, prefix);
231 
232     // re-run the regex to find out how long this particular match
233     // was, then compare that against the longest-match so far. Note
234     // that the order of the tuple is carefully constructed to ensure
235     // that (a) we get the longest-match but (b) if two matches are
236     // equal, we get the largest index. This is because the indices
237     // are sorted in order of increasing priority, and because we know
238     // that indices of equal priority cannot both match (because of
239     // the DFA check).
240     rust!(
241         out,
242         "let {}match = self.regex_vec[{}i].find({}text).unwrap();",
243         prefix,
244         prefix,
245         prefix
246     );
247     rust!(out, "let {}len = {}match.end();", prefix, prefix);
248     rust!(out, "if {}len >= {}longest_match {{", prefix, prefix);
249     rust!(out, "{}longest_match = {}len;", prefix, prefix);
250     rust!(out, "{}index = {}i;", prefix, prefix);
251     rust!(out, "}}"); // if is longest match
252     rust!(out, "}}"); // if matches.matched(i)
253     rust!(out, "}}"); // for loop
254 
255     // transform the result into the expected return value
256     rust!(
257         out,
258         "let {}result = &{}text[..{}longest_match];",
259         prefix,
260         prefix,
261         prefix
262     );
263     rust!(
264         out,
265         "let {}remaining = &{}text[{}longest_match..];",
266         prefix,
267         prefix,
268         prefix
269     );
270     rust!(
271         out,
272         "let {}end_offset = {}start_offset + {}longest_match;",
273         prefix,
274         prefix,
275         prefix
276     );
277     rust!(out, "self.text = {}remaining;", prefix);
278     rust!(out, "self.consumed = {}end_offset;", prefix);
279     rust!(
280         out,
281         "Some(Ok(({}start_offset, Token({}index, {}result), {}end_offset)))",
282         prefix,
283         prefix,
284         prefix,
285         prefix
286     );
287 
288     rust!(out, "}}"); // else
289     rust!(out, "}}"); // else
290     rust!(out, "}}"); // fn
291     rust!(out, "}}"); // impl
292     rust!(out, "}}"); // mod
293     Ok(())
294 }
295