1 //! Generates an iterator type `Matcher` that looks roughly like
2 //!
3 //! ```ignore
4 //! mod intern_token {
5 //! extern crate regex as regex;
6 //!
7 //! #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
//! pub struct Token<'input>(pub usize, pub &'input str);
//! //                           ~~~~~~     ~~~~~~~~~~~
//! //                           token      token
//! //                           index      text
//! //                           (type)
13 //!
14 //! impl<'a> fmt::Display for Token<'a> { ... }
15 //!
16 //! pub struct MatcherBuilder {
17 //! regex_set: regex::RegexSet,
18 //! regex_vec: Vec<regex::Regex>,
19 //! }
20 //!
21 //! impl MatcherBuilder {
//! fn new() -> MatcherBuilder { ... }
23 //! fn matcher<'input, 'builder>(&'builder self, s: &'input str) -> Matcher<'input, 'builder> { ... }
24 //! }
25 //!
26 //! pub struct Matcher<'input, 'builder> {
27 //! text: &'input str,
28 //! consumed: usize,
29 //! regex_set: &'builder regex::RegexSet,
30 //! regex_vec: &'builder Vec<regex::Regex>,
31 //! }
32 //!
//! impl<'input, 'builder> Matcher<'input, 'builder> {
34 //! fn tokenize(&self, text: &str) -> Option<(usize, usize)> { ... }
35 //! }
36 //!
//! impl<'input, 'builder> Iterator for Matcher<'input, 'builder> {
//! type Item = Result<(usize, Token<'input>, usize), ParseError>;
//! //                  ~~~~~  ~~~~~~~~~~~~~  ~~~~~
//! //                  start  token          end
41 //! }
42 //! }
43 //! ```
44
45 use grammar::parse_tree::InternToken;
46 use grammar::repr::{Grammar, TerminalLiteral};
47 use lexer::re;
48 use rust::RustWrite;
49 use std::io::{self, Write};
50
compile<W: Write>( grammar: &Grammar, intern_token: &InternToken, out: &mut RustWrite<W>, ) -> io::Result<()>51 pub fn compile<W: Write>(
52 grammar: &Grammar,
53 intern_token: &InternToken,
54 out: &mut RustWrite<W>,
55 ) -> io::Result<()> {
56 let prefix = &grammar.prefix;
57
58 rust!(out, "#[cfg_attr(rustfmt, rustfmt_skip)]");
59 rust!(out, "mod {}intern_token {{", prefix);
60 rust!(out, "#![allow(unused_imports)]");
61 out.write_uses("", &grammar)?;
62 rust!(out, "extern crate regex as {}regex;", prefix);
63 rust!(out, "use std::fmt as {}fmt;", prefix);
64 rust!(out, "");
65 rust!(
66 out,
67 "#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]"
68 );
69 rust!(out, "pub struct Token<'input>(pub usize, pub &'input str);");
70 rust!(out, "impl<'a> {}fmt::Display for Token<'a> {{", prefix);
71 rust!(
72 out,
73 "fn fmt<'f>(&self, formatter: &mut {}fmt::Formatter<'f>) -> Result<(), {}fmt::Error> {{",
74 prefix,
75 prefix
76 );
77 rust!(out, "{}fmt::Display::fmt(self.1, formatter)", prefix);
78 rust!(out, "}}");
79 rust!(out, "}}");
80 rust!(out, "");
81 rust!(out, "pub struct {}MatcherBuilder {{", prefix);
82 rust!(out, "regex_set: {}regex::RegexSet,", prefix);
83 rust!(out, "regex_vec: Vec<{}regex::Regex>,", prefix);
84 rust!(out, "}}");
85 rust!(out, "");
86 rust!(out, "impl {}MatcherBuilder {{", prefix);
87 rust!(out, "pub fn new() -> {}MatcherBuilder {{", prefix);
88
89 // create a vector of rust string literals with the text of each
90 // regular expression
91 let regex_strings: Vec<String> = {
92 intern_token
93 .match_entries
94 .iter()
95 .map(|match_entry| match match_entry.match_literal {
96 TerminalLiteral::Quoted(ref s) => re::parse_literal(&s),
97 TerminalLiteral::Regex(ref s) => re::parse_regex(&s).unwrap(),
98 })
99 .map(|regex| {
100 // make sure all regex are anchored at the beginning of the input
101 format!("^({})", regex)
102 })
103 .map(|regex_str| {
104 // create a rust string with text of the regex; the Debug impl
105 // will add quotes and escape
106 format!("{:?}", regex_str)
107 })
108 .collect()
109 };
110
111 rust!(out, "let {}strs: &[&str] = &[", prefix);
112 for literal in ®ex_strings {
113 rust!(out, "{},", literal);
114 }
115 rust!(out, "];");
116 rust!(
117 out,
118 "let {}regex_set = {}regex::RegexSet::new({}strs).unwrap();",
119 prefix,
120 prefix,
121 prefix
122 );
123
124 rust!(out, "let {}regex_vec = vec![", prefix);
125 for literal in ®ex_strings {
126 rust!(out, "{}regex::Regex::new({}).unwrap(),", prefix, literal);
127 }
128 rust!(out, "];");
129
130 rust!(
131 out,
132 "{0}MatcherBuilder {{ regex_set: {0}regex_set, regex_vec: {0}regex_vec }}",
133 prefix
134 );
135 rust!(out, "}}"); // fn new()
136 rust!(
137 out,
138 "pub fn matcher<'input, 'builder>(&'builder self, s: &'input str) \
139 -> {}Matcher<'input, 'builder> {{",
140 prefix
141 );
142 rust!(out, "{}Matcher {{", prefix);
143 rust!(out, "text: s,");
144 rust!(out, "consumed: 0,");
145 rust!(out, "regex_set: &self.regex_set,");
146 rust!(out, "regex_vec: &self.regex_vec,");
147 rust!(out, "}}"); // struct literal
148 rust!(out, "}}"); // fn matcher()
149 rust!(out, "}}"); // impl MatcherBuilder
150 rust!(out, "");
151 rust!(out, "pub struct {}Matcher<'input, 'builder> {{", prefix);
152 rust!(out, "text: &'input str,"); // remaining input
153 rust!(out, "consumed: usize,"); // number of chars consumed thus far
154 rust!(out, "regex_set: &'builder {}regex::RegexSet,", prefix);
155 rust!(out, "regex_vec: &'builder Vec<{}regex::Regex>,", prefix);
156 rust!(out, "}}");
157 rust!(out, "");
158 rust!(
159 out,
160 "impl<'input, 'builder> Iterator for {}Matcher<'input, 'builder> {{",
161 prefix
162 );
163 rust!(
164 out,
165 "type Item = Result<(usize, Token<'input>, usize), \
166 {}lalrpop_util::ParseError<usize,Token<'input>,{}>>;",
167 prefix,
168 grammar.types.error_type()
169 );
170 rust!(out, "");
171 rust!(out, "fn next(&mut self) -> Option<Self::Item> {{");
172
173 // start by trimming whitespace from left
174 rust!(out, "let {}text = self.text.trim_start();", prefix);
175 rust!(
176 out,
177 "let {}whitespace = self.text.len() - {}text.len();",
178 prefix,
179 prefix
180 );
181 rust!(
182 out,
183 "let {}start_offset = self.consumed + {}whitespace;",
184 prefix,
185 prefix
186 );
187
188 // if nothing left, return None
189 rust!(out, "if {}text.is_empty() {{", prefix);
190 rust!(out, "self.text = {}text;", prefix);
191 rust!(out, "self.consumed = {}start_offset;", prefix);
192 rust!(out, "None");
193 rust!(out, "}} else {{");
194
195 // otherwise, use regex-set to find list of matching tokens
196 rust!(
197 out,
198 "let {}matches = self.regex_set.matches({}text);",
199 prefix,
200 prefix
201 );
202
203 // if nothing matched, return an error
204 rust!(out, "if !{}matches.matched_any() {{", prefix);
205 rust!(
206 out,
207 "Some(Err({}lalrpop_util::ParseError::InvalidToken {{",
208 prefix
209 );
210 rust!(out, "location: {}start_offset,", prefix);
211 rust!(out, "}}))");
212 rust!(out, "}} else {{");
213
214 // otherwise, have to find longest, highest-priority match. We have the literals
215 // sorted in order of increasing precedence, so we'll iterate over them one by one,
216 // checking if each one matches, and remembering the longest one.
217 rust!(out, "let mut {}longest_match = 0;", prefix); // length of longest match
218 rust!(out, "let mut {}index = 0;", prefix); // index of longest match
219 rust!(
220 out,
221 "for {}i in 0 .. {} {{",
222 prefix,
223 intern_token.match_entries.len()
224 );
225 rust!(out, "if {}matches.matched({}i) {{", prefix, prefix);
226
227 // re-run the regex to find out how long this particular match
228 // was, then compare that against the longest-match so far. Note
229 // that the order of the tuple is carefully constructed to ensure
230 // that (a) we get the longest-match but (b) if two matches are
231 // equal, we get the largest index. This is because the indices
232 // are sorted in order of increasing priority, and because we know
233 // that indices of equal priority cannot both match (because of
234 // the DFA check).
235 rust!(
236 out,
237 "let {}match = self.regex_vec[{}i].find({}text).unwrap();",
238 prefix,
239 prefix,
240 prefix
241 );
242 rust!(out, "let {}len = {}match.end();", prefix, prefix);
243 rust!(out, "if {}len >= {}longest_match {{", prefix, prefix);
244 rust!(out, "{}longest_match = {}len;", prefix, prefix);
245 rust!(out, "{}index = {}i;", prefix, prefix);
246 rust!(out, "}}"); // if is longest match
247 rust!(out, "}}"); // if matches.matched(i)
248 rust!(out, "}}"); // for loop
249
250 // transform the result into the expected return value
251 rust!(
252 out,
253 "let {}result = &{}text[..{}longest_match];",
254 prefix,
255 prefix,
256 prefix
257 );
258 rust!(
259 out,
260 "let {}remaining = &{}text[{}longest_match..];",
261 prefix,
262 prefix,
263 prefix
264 );
265 rust!(
266 out,
267 "let {}end_offset = {}start_offset + {}longest_match;",
268 prefix,
269 prefix,
270 prefix
271 );
272 rust!(out, "self.text = {}remaining;", prefix);
273 rust!(out, "self.consumed = {}end_offset;", prefix);
274 rust!(
275 out,
276 "Some(Ok(({}start_offset, Token({}index, {}result), {}end_offset)))",
277 prefix,
278 prefix,
279 prefix,
280 prefix
281 );
282
283 rust!(out, "}}"); // else
284 rust!(out, "}}"); // else
285 rust!(out, "}}"); // fn
286 rust!(out, "}}"); // impl
287 rust!(out, "}}"); // mod
288 Ok(())
289 }
290