//! If an extern token is provided, then this pass validates that
//! terminal IDs have conversions. Otherwise, it generates a
//! tokenizer. This can only be done after macro expansion because
//! some macro arguments never make it into an actual production and
//! are only used in `if` conditions; we use string literals for
//! those, but they do not have to have a defined conversion.
7
8 use super::{NormError, NormResult};
9
10 use collections::{Map, Set};
11 use grammar::consts::*;
12 use grammar::parse_tree::*;
13 use lexer::dfa::{self, DFAConstructionError, Precedence};
14 use lexer::nfa::NFAConstructionError::*;
15 use lexer::re;
16 use string_cache::DefaultAtom as Atom;
17
18 #[cfg(test)]
19 mod test;
20
validate(mut grammar: Grammar) -> NormResult<Grammar>21 pub fn validate(mut grammar: Grammar) -> NormResult<Grammar> {
22 let mode = {
23 let mode = if let Some(enum_token) = grammar.enum_token() {
24 assert!(
25 grammar.match_token().is_none(),
26 "validator permitted both an extern/match section"
27 );
28
29 TokenMode::Extern {
30 conversions: enum_token
31 .conversions
32 .iter()
33 .map(|conversion| conversion.from.clone())
34 .collect(),
35 }
36 } else {
37 TokenMode::Internal {
38 match_block: MatchBlock::new(grammar.match_token())?,
39 }
40 };
41
42 let mut validator = Validator {
43 grammar: &grammar,
44 mode,
45 };
46
47 validator.validate()?;
48
49 validator.mode
50 };
51
52 match mode {
53 TokenMode::Extern { .. } => {
54 // If using an external tokenizer, we're all done at this point.
55 }
56 TokenMode::Internal { match_block } => {
57 // Otherwise, construct the `InternToken` item.
58 construct(&mut grammar, match_block)?;
59 }
60 }
61
62 Ok(grammar)
63 }
64
///////////////////////////////////////////////////////////////////////////
// Validation phase -- this phase walks the grammar and visits all
// terminals. If using an external set of tokens, it checks that all
// terminals have a defined conversion to some pattern. Otherwise,
// it collects all terminals into the `all_literals` set for later use.
70
/// Walks the grammar and checks each terminal reference against the
/// active `TokenMode`.
struct Validator<'grammar> {
    // The grammar being validated (read-only here).
    grammar: &'grammar Grammar,
    // Extern conversions to check against, or the `MatchBlock` we
    // are accumulating literals into.
    mode: TokenMode,
}
75
/// How terminals are supplied: declared by an `extern { ... }`
/// section, or synthesized by us from a `match { ... }` section (or
/// its implicit default).
enum TokenMode {
    /// If there is an `extern { ... }` section that defines
    /// conversions of the form `TERMINAL => PATTERN`, then this is a
    /// set of those terminals. These are the only terminals that the
    /// user should be using.
    Extern { conversions: Set<TerminalString> },

    /// Otherwise, we are synthesizing the tokenizer. In that case,
    /// `match_block` summarizes the data from the `match { ... }`
    /// section, if any. If there was no `match` section, or the
    /// section contains a wildcard, the user can also use additional
    /// terminals in the grammar.
    Internal { match_block: MatchBlock },
}
90
/// Data summarizing the `match { }` block, along with any literals we
/// scraped up.
#[derive(Default)]
struct MatchBlock {
    /// This map stores the `match { }` entries. If `catch_all`
    /// is true, then we will grow this set with "identity mappings"
    /// for new literals that we find.
    match_entries: Vec<MatchEntry>,

    /// The names of all terminals the user can legally type. If
    /// `catch_all` is true, then if we encounter additional
    /// terminal literals in the grammar, we will add them to this
    /// set.
    match_user_names: Set<TerminalString>,

    /// For each terminal literal that we have to match, the span
    /// where it appeared in user's source. This can either be in the
    /// `match { }` section or else in the grammar somewhere (if added
    /// due to a catch-all, or there is no match section).
    spans: Map<TerminalLiteral, Span>,

    /// True if we should permit unrecognized literals to be used.
    catch_all: bool,
}
115
116 impl MatchBlock {
117 /// Creates a `MatchBlock` by reading the data out of the `match {
118 /// ... }` block that the user provided (if any).
new(opt_match_token: Option<&MatchToken>) -> NormResult<Self>119 fn new(opt_match_token: Option<&MatchToken>) -> NormResult<Self> {
120 let mut match_block = Self::default();
121 if let Some(match_token) = opt_match_token {
122 for (idx, mc) in match_token.contents.iter().enumerate() {
123 let precedence = match_token.contents.len() - idx;
124 for item in &mc.items {
125 match *item {
126 MatchItem::Unmapped(ref sym, span) => {
127 match_block.add_match_entry(
128 precedence,
129 sym.clone(),
130 TerminalString::Literal(sym.clone()),
131 span,
132 )?;
133 }
134 MatchItem::Mapped(ref sym, ref user, span) => {
135 match_block.add_match_entry(
136 precedence,
137 sym.clone(),
138 user.clone(),
139 span,
140 )?;
141 }
142 MatchItem::CatchAll(_) => {
143 match_block.catch_all = true;
144 }
145 }
146 }
147 }
148 } else {
149 // no match block is equivalent to `match { _ }`
150 match_block.catch_all = true;
151 }
152 Ok(match_block)
153 }
154
add_match_entry( &mut self, match_group_precedence: usize, sym: TerminalLiteral, user_name: TerminalString, span: Span, ) -> NormResult<()>155 fn add_match_entry(
156 &mut self,
157 match_group_precedence: usize,
158 sym: TerminalLiteral,
159 user_name: TerminalString,
160 span: Span,
161 ) -> NormResult<()> {
162 if let Some(_old_span) = self.spans.insert(sym.clone(), span) {
163 return_err!(span, "multiple match entries for `{}`", sym);
164 }
165
166 // NB: It's legal for multiple regex to produce same terminal.
167 self.match_user_names.insert(user_name.clone());
168
169 self.match_entries.push(MatchEntry {
170 precedence: match_group_precedence * 2 + sym.base_precedence(),
171 match_literal: sym,
172 user_name,
173 });
174 Ok(())
175 }
176
add_literal_from_grammar(&mut self, sym: TerminalLiteral, span: Span) -> NormResult<()>177 fn add_literal_from_grammar(&mut self, sym: TerminalLiteral, span: Span) -> NormResult<()> {
178 // Already saw this literal, maybe in a match entry, maybe in the grammar.
179 if self
180 .match_user_names
181 .contains(&TerminalString::Literal(sym.clone()))
182 {
183 return Ok(());
184 }
185
186 if !self.catch_all {
187 return_err!(
188 span,
189 "terminal `{}` does not have a match mapping defined for it",
190 sym
191 );
192 }
193
194 self.match_user_names
195 .insert(TerminalString::Literal(sym.clone()));
196
197 self.match_entries.push(MatchEntry {
198 precedence: sym.base_precedence(),
199 match_literal: sym.clone(),
200 user_name: TerminalString::Literal(sym.clone()),
201 });
202
203 self.spans.insert(sym, span);
204
205 Ok(())
206 }
207 }
208
209 impl<'grammar> Validator<'grammar> {
validate(&mut self) -> NormResult<()>210 fn validate(&mut self) -> NormResult<()> {
211 for item in &self.grammar.items {
212 match *item {
213 GrammarItem::Use(..) => {}
214 GrammarItem::MatchToken(..) => {}
215 GrammarItem::ExternToken(_) => {}
216 GrammarItem::InternToken(_) => {}
217 GrammarItem::Nonterminal(ref data) => {
218 for alternative in &data.alternatives {
219 self.validate_alternative(alternative)?;
220 }
221 }
222 }
223 }
224 Ok(())
225 }
226
validate_alternative(&mut self, alternative: &Alternative) -> NormResult<()>227 fn validate_alternative(&mut self, alternative: &Alternative) -> NormResult<()> {
228 assert!(alternative.condition.is_none()); // macro expansion should have removed these
229 self.validate_expr(&alternative.expr)?;
230 Ok(())
231 }
232
validate_expr(&mut self, expr: &ExprSymbol) -> NormResult<()>233 fn validate_expr(&mut self, expr: &ExprSymbol) -> NormResult<()> {
234 for symbol in &expr.symbols {
235 self.validate_symbol(symbol)?;
236 }
237 Ok(())
238 }
239
validate_symbol(&mut self, symbol: &Symbol) -> NormResult<()>240 fn validate_symbol(&mut self, symbol: &Symbol) -> NormResult<()> {
241 match symbol.kind {
242 SymbolKind::Expr(ref expr) => {
243 self.validate_expr(expr)?;
244 }
245 SymbolKind::Terminal(ref term) => {
246 self.validate_terminal(symbol.span, term)?;
247 }
248 SymbolKind::Nonterminal(_) => {}
249 SymbolKind::Repeat(ref repeat) => {
250 self.validate_symbol(&repeat.symbol)?;
251 }
252 SymbolKind::Choose(ref sym) | SymbolKind::Name(_, ref sym) => {
253 self.validate_symbol(sym)?;
254 }
255 SymbolKind::Lookahead | SymbolKind::Lookbehind | SymbolKind::Error => {}
256 SymbolKind::AmbiguousId(ref id) => {
257 panic!("ambiguous id `{}` encountered after name resolution", id)
258 }
259 SymbolKind::Macro(..) => {
260 panic!("macro not removed: {:?}", symbol);
261 }
262 }
263
264 Ok(())
265 }
266
validate_terminal(&mut self, span: Span, term: &TerminalString) -> NormResult<()>267 fn validate_terminal(&mut self, span: Span, term: &TerminalString) -> NormResult<()> {
268 match self.mode {
269 // If there is an extern token definition, validate that
270 // this terminal has a defined conversion.
271 TokenMode::Extern { ref conversions } => {
272 if !conversions.contains(term) {
273 return_err!(
274 span,
275 "terminal `{}` does not have a pattern defined for it",
276 term
277 );
278 }
279 }
280
281 // If there is no extern token definition, then collect
282 // the terminal literals ("class", r"[a-z]+") into a set.
283 TokenMode::Internal {
284 ref mut match_block,
285 } => {
286 match *term {
287 TerminalString::Bare(_) => assert!(
288 match_block.match_user_names.contains(term),
289 "bare terminal without match entry: {}",
290 term
291 ),
292
293 TerminalString::Literal(ref l) => {
294 match_block.add_literal_from_grammar(l.clone(), span)?
295 }
296
297 // Error is a builtin terminal that always exists
298 TerminalString::Error => (),
299 }
300 }
301 }
302
303 Ok(())
304 }
305 }
306
///////////////////////////////////////////////////////////////////////////
// Construction phase -- if we are constructing a tokenizer, this
// phase builds up an internal token DFA.
construct(grammar: &mut Grammar, match_block: MatchBlock) -> NormResult<()>311 fn construct(grammar: &mut Grammar, match_block: MatchBlock) -> NormResult<()> {
312 let MatchBlock {
313 mut match_entries,
314 spans,
315 ..
316 } = match_block;
317
318 // Sort match entries by order of increasing precedence.
319 match_entries.sort();
320
321 // Build up two vectors, one of parsed regular expressions and
322 // one of precedences, that are parallel with `literals`.
323 let mut regexs = Vec::with_capacity(match_entries.len());
324 let mut precedences = Vec::with_capacity(match_entries.len());
325 {
326 for match_entry in &match_entries {
327 precedences.push(Precedence(match_entry.precedence));
328 match match_entry.match_literal {
329 TerminalLiteral::Quoted(ref s) => {
330 regexs.push(re::parse_literal(&s));
331 }
332 TerminalLiteral::Regex(ref s) => {
333 match re::parse_regex(&s) {
334 Ok(regex) => regexs.push(regex),
335 Err(error) => {
336 let literal_span = spans[&match_entry.match_literal];
337 // FIXME -- take offset into account for
338 // span; this requires knowing how many #
339 // the user used, which we do not track
340 return_err!(literal_span, "invalid regular expression: {}", error);
341 }
342 }
343 }
344 }
345 }
346 Ok(())
347 }?;
348
349 let dfa = match dfa::build_dfa(®exs, &precedences) {
350 Ok(dfa) => dfa,
351 Err(DFAConstructionError::NFAConstructionError { index, error }) => {
352 let feature = match error {
353 NamedCaptures => r#"named captures (`(?P<foo>...)`)"#,
354 NonGreedy => r#""non-greedy" repetitions (`*?` or `+?`)"#,
355 WordBoundary => r#"word boundaries (`\b` or `\B`)"#,
356 LineBoundary => r#"line boundaries (`^` or `$`)"#,
357 TextBoundary => r#"text boundaries (`^` or `$`)"#,
358 ByteRegex => r#"byte-based matches"#,
359 };
360 let literal = &match_entries[index.index()].match_literal;
361 return_err!(
362 spans[literal],
363 "{} are not supported in regular expressions",
364 feature
365 )
366 }
367 Err(DFAConstructionError::Ambiguity { match0, match1 }) => {
368 let literal0 = &match_entries[match0.index()].match_literal;
369 let literal1 = &match_entries[match1.index()].match_literal;
370 // FIXME(#88) -- it'd be nice to give an example here
371 return_err!(
372 spans[literal0],
373 "ambiguity detected between the terminal `{}` and the terminal `{}`",
374 literal0,
375 literal1
376 )
377 }
378 };
379
380 grammar
381 .items
382 .push(GrammarItem::InternToken(InternToken { match_entries, dfa }));
383
384 // we need to inject a `'input` lifetime and `input: &'input str` parameter as well:
385
386 let input_lifetime = Lifetime::input();
387 for parameter in &grammar.type_parameters {
388 match parameter {
389 TypeParameter::Lifetime(i) if *i == input_lifetime => {
390 return_err!(
391 grammar.span,
392 "since there is no external token enum specified, \
393 the `'input` lifetime is implicit and cannot be declared"
394 );
395 }
396 _ => {}
397 }
398 }
399
400 let input_parameter = Atom::from(INPUT_PARAMETER);
401 for parameter in &grammar.parameters {
402 if parameter.name == input_parameter {
403 return_err!(
404 grammar.span,
405 "since there is no external token enum specified, \
406 the `input` parameter is implicit and cannot be declared"
407 );
408 }
409 }
410
411 grammar
412 .type_parameters
413 .insert(0, TypeParameter::Lifetime(input_lifetime.clone()));
414
415 let parameter = Parameter {
416 name: input_parameter,
417 ty: TypeRef::Ref {
418 lifetime: Some(input_lifetime),
419 mutable: false,
420 referent: Box::new(TypeRef::Id(Atom::from("str"))),
421 },
422 };
423 grammar.parameters.push(parameter);
424
425 Ok(())
426 }
427