/*!

Generates an iterator type `Matcher` that looks roughly like

```ignore
mod intern_token {
    extern crate regex as regex;

    #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
    pub struct Token<'input>(pub usize, pub &'input str);
    //                           ~~~~~~     ~~~~~~~~~~~
    //                           token      token
    //                           index      text
    //                           (type)

    impl<'a> fmt::Display for Token<'a> { ... }

    pub struct MatcherBuilder {
        regex_set: regex::RegexSet,
        regex_vec: Vec<regex::Regex>,
    }

    impl MatcherBuilder {
        fn new() -> MatcherBuilder { ... }
        fn matcher<'input, 'builder>(&'builder self, s: &'input str) -> Matcher<'input, 'builder> { ... }
    }

    pub struct Matcher<'input, 'builder> {
        text: &'input str,
        consumed: usize,
        regex_set: &'builder regex::RegexSet,
        regex_vec: &'builder Vec<regex::Regex>,
    }

    impl<'input, 'builder> Matcher<'input, 'builder> {
        fn tokenize(&self, text: &str) -> Option<(usize, usize)> { ... }
    }

    impl<'input, 'builder> Iterator for Matcher<'input, 'builder> {
        type Item = Result<(usize, Token<'input>, usize), ParseError>;
        //                  ~~~~~  ~~~~~~~~~~~~~  ~~~~~
        //                  start  token          end
    }
}
```

*/
48
49 use grammar::parse_tree::InternToken;
50 use grammar::repr::{Grammar, TerminalLiteral};
51 use lexer::re;
52 use rust::RustWrite;
53 use std::io::{self, Write};
54
compile<W: Write>( grammar: &Grammar, intern_token: &InternToken, out: &mut RustWrite<W>, ) -> io::Result<()>55 pub fn compile<W: Write>(
56 grammar: &Grammar,
57 intern_token: &InternToken,
58 out: &mut RustWrite<W>,
59 ) -> io::Result<()> {
60 let prefix = &grammar.prefix;
61
62 rust!(out, "#[cfg_attr(rustfmt, rustfmt_skip)]");
63 rust!(out, "mod {}intern_token {{", prefix);
64 rust!(out, "#![allow(unused_imports)]");
65 try!(out.write_uses("", &grammar));
66 rust!(out, "extern crate regex as {}regex;", prefix);
67 rust!(out, "use std::fmt as {}fmt;", prefix);
68 rust!(out, "");
69 rust!(
70 out,
71 "#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]"
72 );
73 rust!(out, "pub struct Token<'input>(pub usize, pub &'input str);");
74 rust!(out, "impl<'a> {}fmt::Display for Token<'a> {{", prefix);
75 rust!(
76 out,
77 "fn fmt(&self, formatter: &mut {}fmt::Formatter) -> Result<(), {}fmt::Error> {{",
78 prefix,
79 prefix
80 );
81 rust!(out, "{}fmt::Display::fmt(self.1, formatter)", prefix);
82 rust!(out, "}}");
83 rust!(out, "}}");
84 rust!(out, "");
85 rust!(out, "pub struct {}MatcherBuilder {{", prefix);
86 rust!(out, "regex_set: {}regex::RegexSet,", prefix);
87 rust!(out, "regex_vec: Vec<{}regex::Regex>,", prefix);
88 rust!(out, "}}");
89 rust!(out, "");
90 rust!(out, "impl {}MatcherBuilder {{", prefix);
91 rust!(out, "pub fn new() -> {}MatcherBuilder {{", prefix);
92
93 // create a vector of rust string literals with the text of each
94 // regular expression
95 let regex_strings: Vec<String> = {
96 intern_token
97 .match_entries
98 .iter()
99 .map(|match_entry| match match_entry.match_literal {
100 TerminalLiteral::Quoted(ref s) => re::parse_literal(&s),
101 TerminalLiteral::Regex(ref s) => re::parse_regex(&s).unwrap(),
102 })
103 .map(|regex| {
104 // make sure all regex are anchored at the beginning of the input
105 format!("^({})", regex)
106 })
107 .map(|regex_str| {
108 // create a rust string with text of the regex; the Debug impl
109 // will add quotes and escape
110 format!("{:?}", regex_str)
111 })
112 .collect()
113 };
114
115 rust!(out, "let {}strs: &[&str] = &[", prefix);
116 for literal in ®ex_strings {
117 rust!(out, "{},", literal);
118 }
119 rust!(out, "];");
120 rust!(
121 out,
122 "let {}regex_set = {}regex::RegexSet::new({}strs).unwrap();",
123 prefix,
124 prefix,
125 prefix
126 );
127
128 rust!(out, "let {}regex_vec = vec![", prefix);
129 for literal in ®ex_strings {
130 rust!(out, "{}regex::Regex::new({}).unwrap(),", prefix, literal);
131 }
132 rust!(out, "];");
133
134 rust!(
135 out,
136 "{0}MatcherBuilder {{ regex_set: {0}regex_set, regex_vec: {0}regex_vec }}",
137 prefix
138 );
139 rust!(out, "}}"); // fn new()
140 rust!(
141 out,
142 "pub fn matcher<'input, 'builder>(&'builder self, s: &'input str) \
143 -> {}Matcher<'input, 'builder> {{",
144 prefix
145 );
146 rust!(out, "{}Matcher {{", prefix);
147 rust!(out, "text: s,");
148 rust!(out, "consumed: 0,");
149 rust!(out, "regex_set: &self.regex_set,");
150 rust!(out, "regex_vec: &self.regex_vec,");
151 rust!(out, "}}"); // struct literal
152 rust!(out, "}}"); // fn matcher()
153 rust!(out, "}}"); // impl MatcherBuilder
154 rust!(out, "");
155 rust!(out, "pub struct {}Matcher<'input, 'builder> {{", prefix);
156 rust!(out, "text: &'input str,"); // remaining input
157 rust!(out, "consumed: usize,"); // number of chars consumed thus far
158 rust!(out, "regex_set: &'builder {}regex::RegexSet,", prefix);
159 rust!(out, "regex_vec: &'builder Vec<{}regex::Regex>,", prefix);
160 rust!(out, "}}");
161 rust!(out, "");
162 rust!(
163 out,
164 "impl<'input, 'builder> Iterator for {}Matcher<'input, 'builder> {{",
165 prefix
166 );
167 rust!(
168 out,
169 "type Item = Result<(usize, Token<'input>, usize), \
170 {}lalrpop_util::ParseError<usize,Token<'input>,{}>>;",
171 prefix,
172 grammar.types.error_type()
173 );
174 rust!(out, "");
175 rust!(out, "fn next(&mut self) -> Option<Self::Item> {{");
176
177 // start by trimming whitespace from left
178 rust!(out, "#[allow(deprecated)]");
179 rust!(out, "let {}text = self.text.trim_left();", prefix);
180 rust!(
181 out,
182 "let {}whitespace = self.text.len() - {}text.len();",
183 prefix,
184 prefix
185 );
186 rust!(
187 out,
188 "let {}start_offset = self.consumed + {}whitespace;",
189 prefix,
190 prefix
191 );
192
193 // if nothing left, return None
194 rust!(out, "if {}text.is_empty() {{", prefix);
195 rust!(out, "self.text = {}text;", prefix);
196 rust!(out, "self.consumed = {}start_offset;", prefix);
197 rust!(out, "None");
198 rust!(out, "}} else {{");
199
200 // otherwise, use regex-set to find list of matching tokens
201 rust!(
202 out,
203 "let {}matches = self.regex_set.matches({}text);",
204 prefix,
205 prefix
206 );
207
208 // if nothing matched, return an error
209 rust!(out, "if !{}matches.matched_any() {{", prefix);
210 rust!(
211 out,
212 "Some(Err({}lalrpop_util::ParseError::InvalidToken {{",
213 prefix
214 );
215 rust!(out, "location: {}start_offset,", prefix);
216 rust!(out, "}}))");
217 rust!(out, "}} else {{");
218
219 // otherwise, have to find longest, highest-priority match. We have the literals
220 // sorted in order of increasing precedence, so we'll iterate over them one by one,
221 // checking if each one matches, and remembering the longest one.
222 rust!(out, "let mut {}longest_match = 0;", prefix); // length of longest match
223 rust!(out, "let mut {}index = 0;", prefix); // index of longest match
224 rust!(
225 out,
226 "for {}i in 0 .. {} {{",
227 prefix,
228 intern_token.match_entries.len()
229 );
230 rust!(out, "if {}matches.matched({}i) {{", prefix, prefix);
231
232 // re-run the regex to find out how long this particular match
233 // was, then compare that against the longest-match so far. Note
234 // that the order of the tuple is carefully constructed to ensure
235 // that (a) we get the longest-match but (b) if two matches are
236 // equal, we get the largest index. This is because the indices
237 // are sorted in order of increasing priority, and because we know
238 // that indices of equal priority cannot both match (because of
239 // the DFA check).
240 rust!(
241 out,
242 "let {}match = self.regex_vec[{}i].find({}text).unwrap();",
243 prefix,
244 prefix,
245 prefix
246 );
247 rust!(out, "let {}len = {}match.end();", prefix, prefix);
248 rust!(out, "if {}len >= {}longest_match {{", prefix, prefix);
249 rust!(out, "{}longest_match = {}len;", prefix, prefix);
250 rust!(out, "{}index = {}i;", prefix, prefix);
251 rust!(out, "}}"); // if is longest match
252 rust!(out, "}}"); // if matches.matched(i)
253 rust!(out, "}}"); // for loop
254
255 // transform the result into the expected return value
256 rust!(
257 out,
258 "let {}result = &{}text[..{}longest_match];",
259 prefix,
260 prefix,
261 prefix
262 );
263 rust!(
264 out,
265 "let {}remaining = &{}text[{}longest_match..];",
266 prefix,
267 prefix,
268 prefix
269 );
270 rust!(
271 out,
272 "let {}end_offset = {}start_offset + {}longest_match;",
273 prefix,
274 prefix,
275 prefix
276 );
277 rust!(out, "self.text = {}remaining;", prefix);
278 rust!(out, "self.consumed = {}end_offset;", prefix);
279 rust!(
280 out,
281 "Some(Ok(({}start_offset, Token({}index, {}result), {}end_offset)))",
282 prefix,
283 prefix,
284 prefix,
285 prefix
286 );
287
288 rust!(out, "}}"); // else
289 rust!(out, "}}"); // else
290 rust!(out, "}}"); // fn
291 rust!(out, "}}"); // impl
292 rust!(out, "}}"); // mod
293 Ok(())
294 }
295