1 //! If an extern token is provided, then this pass validates that
2 //! terminal IDs have conversions. Otherwise, it generates a
3 //! tokenizer. This can only be done after macro expansion because
4 //! some macro arguments never make it into an actual production and
5 //! are only used in `if` conditions; we use string literals for
6 //! those, but they do not have to have a defined conversion.
7 
8 use super::{NormError, NormResult};
9 
10 use collections::{Map, Set};
11 use grammar::consts::*;
12 use grammar::parse_tree::*;
13 use lexer::dfa::{self, DFAConstructionError, Precedence};
14 use lexer::nfa::NFAConstructionError::*;
15 use lexer::re;
16 use string_cache::DefaultAtom as Atom;
17 
18 #[cfg(test)]
19 mod test;
20 
validate(mut grammar: Grammar) -> NormResult<Grammar>21 pub fn validate(mut grammar: Grammar) -> NormResult<Grammar> {
22     let mode = {
23         let mode = if let Some(enum_token) = grammar.enum_token() {
24             assert!(
25                 grammar.match_token().is_none(),
26                 "validator permitted both an extern/match section"
27             );
28 
29             TokenMode::Extern {
30                 conversions: enum_token
31                     .conversions
32                     .iter()
33                     .map(|conversion| conversion.from.clone())
34                     .collect(),
35             }
36         } else {
37             TokenMode::Internal {
38                 match_block: MatchBlock::new(grammar.match_token())?,
39             }
40         };
41 
42         let mut validator = Validator {
43             grammar: &grammar,
44             mode,
45         };
46 
47         validator.validate()?;
48 
49         validator.mode
50     };
51 
52     match mode {
53         TokenMode::Extern { .. } => {
54             // If using an external tokenizer, we're all done at this point.
55         }
56         TokenMode::Internal { match_block } => {
57             // Otherwise, construct the `InternToken` item.
58             construct(&mut grammar, match_block)?;
59         }
60     }
61 
62     Ok(grammar)
63 }
64 
65 ///////////////////////////////////////////////////////////////////////////
66 // Validation phase -- this phase walks the grammar and visits all
67 // terminals. If using an external set of tokens, it checks that all
68 // terminals have a defined conversion to some pattern. Otherwise,
69 // it collects all terminals into the `all_literals` set for later use.
70 
71 struct Validator<'grammar> {
72     grammar: &'grammar Grammar,
73     mode: TokenMode,
74 }
75 
76 enum TokenMode {
77     /// If there is an `extern { ... }` section that defines
78     /// conversions of the form `TERMINAL => PATTERN`, then this is a
79     /// set of those terminals. These are the only terminals that the
80     /// user should be using.
81     Extern { conversions: Set<TerminalString> },
82 
83     /// Otherwise, we are synthesizing the tokenizer. In that case,
84     /// `match_block` summarizes the data from the `match { ... }`
85     /// section, if any. If there was no `match` section, or the
86     /// section contains a wildcard, the user can also use additional
87     /// terminals in the grammar.
88     Internal { match_block: MatchBlock },
89 }
90 
91 /// Data summarizing the `match { }` block, along with any literals we
92 /// scraped up.
93 #[derive(Default)]
94 struct MatchBlock {
95     /// This map stores the `match { }` entries. If `match_catch_all`
96     /// is true, then we will grow this set with "identity mappings"
97     /// for new literals that we find.
98     match_entries: Vec<MatchEntry>,
99 
100     /// The names of all terminals the user can legally type. If
101     /// `match_catch_all` is true, then if we encounter additional
102     /// terminal literals in the grammar, we will add them to this
103     /// set.
104     match_user_names: Set<TerminalString>,
105 
106     /// For each terminal literal that we have to match, the span
107     /// where it appeared in user's source.  This can either be in the
108     /// `match { }` section or else in the grammar somewhere (if added
109     /// due to a catch-all, or there is no match section).
110     spans: Map<TerminalLiteral, Span>,
111 
112     /// True if we should permit unrecognized literals to be used.
113     catch_all: bool,
114 }
115 
116 impl MatchBlock {
117     /// Creates a `MatchBlock` by reading the data out of the `match {
118     /// ... }` block that the user provided (if any).
new(opt_match_token: Option<&MatchToken>) -> NormResult<Self>119     fn new(opt_match_token: Option<&MatchToken>) -> NormResult<Self> {
120         let mut match_block = Self::default();
121         if let Some(match_token) = opt_match_token {
122             for (idx, mc) in match_token.contents.iter().enumerate() {
123                 let precedence = match_token.contents.len() - idx;
124                 for item in &mc.items {
125                     match *item {
126                         MatchItem::Unmapped(ref sym, span) => {
127                             match_block.add_match_entry(
128                                 precedence,
129                                 sym.clone(),
130                                 TerminalString::Literal(sym.clone()),
131                                 span,
132                             )?;
133                         }
134                         MatchItem::Mapped(ref sym, ref user, span) => {
135                             match_block.add_match_entry(
136                                 precedence,
137                                 sym.clone(),
138                                 user.clone(),
139                                 span,
140                             )?;
141                         }
142                         MatchItem::CatchAll(_) => {
143                             match_block.catch_all = true;
144                         }
145                     }
146                 }
147             }
148         } else {
149             // no match block is equivalent to `match { _ }`
150             match_block.catch_all = true;
151         }
152         Ok(match_block)
153     }
154 
add_match_entry( &mut self, match_group_precedence: usize, sym: TerminalLiteral, user_name: TerminalString, span: Span, ) -> NormResult<()>155     fn add_match_entry(
156         &mut self,
157         match_group_precedence: usize,
158         sym: TerminalLiteral,
159         user_name: TerminalString,
160         span: Span,
161     ) -> NormResult<()> {
162         if let Some(_old_span) = self.spans.insert(sym.clone(), span) {
163             return_err!(span, "multiple match entries for `{}`", sym);
164         }
165 
166         // NB: It's legal for multiple regex to produce same terminal.
167         self.match_user_names.insert(user_name.clone());
168 
169         self.match_entries.push(MatchEntry {
170             precedence: match_group_precedence * 2 + sym.base_precedence(),
171             match_literal: sym,
172             user_name,
173         });
174         Ok(())
175     }
176 
add_literal_from_grammar(&mut self, sym: TerminalLiteral, span: Span) -> NormResult<()>177     fn add_literal_from_grammar(&mut self, sym: TerminalLiteral, span: Span) -> NormResult<()> {
178         // Already saw this literal, maybe in a match entry, maybe in the grammar.
179         if self
180             .match_user_names
181             .contains(&TerminalString::Literal(sym.clone()))
182         {
183             return Ok(());
184         }
185 
186         if !self.catch_all {
187             return_err!(
188                 span,
189                 "terminal `{}` does not have a match mapping defined for it",
190                 sym
191             );
192         }
193 
194         self.match_user_names
195             .insert(TerminalString::Literal(sym.clone()));
196 
197         self.match_entries.push(MatchEntry {
198             precedence: sym.base_precedence(),
199             match_literal: sym.clone(),
200             user_name: TerminalString::Literal(sym.clone()),
201         });
202 
203         self.spans.insert(sym, span);
204 
205         Ok(())
206     }
207 }
208 
209 impl<'grammar> Validator<'grammar> {
validate(&mut self) -> NormResult<()>210     fn validate(&mut self) -> NormResult<()> {
211         for item in &self.grammar.items {
212             match *item {
213                 GrammarItem::Use(..) => {}
214                 GrammarItem::MatchToken(..) => {}
215                 GrammarItem::ExternToken(_) => {}
216                 GrammarItem::InternToken(_) => {}
217                 GrammarItem::Nonterminal(ref data) => {
218                     for alternative in &data.alternatives {
219                         self.validate_alternative(alternative)?;
220                     }
221                 }
222             }
223         }
224         Ok(())
225     }
226 
validate_alternative(&mut self, alternative: &Alternative) -> NormResult<()>227     fn validate_alternative(&mut self, alternative: &Alternative) -> NormResult<()> {
228         assert!(alternative.condition.is_none()); // macro expansion should have removed these
229         self.validate_expr(&alternative.expr)?;
230         Ok(())
231     }
232 
validate_expr(&mut self, expr: &ExprSymbol) -> NormResult<()>233     fn validate_expr(&mut self, expr: &ExprSymbol) -> NormResult<()> {
234         for symbol in &expr.symbols {
235             self.validate_symbol(symbol)?;
236         }
237         Ok(())
238     }
239 
validate_symbol(&mut self, symbol: &Symbol) -> NormResult<()>240     fn validate_symbol(&mut self, symbol: &Symbol) -> NormResult<()> {
241         match symbol.kind {
242             SymbolKind::Expr(ref expr) => {
243                 self.validate_expr(expr)?;
244             }
245             SymbolKind::Terminal(ref term) => {
246                 self.validate_terminal(symbol.span, term)?;
247             }
248             SymbolKind::Nonterminal(_) => {}
249             SymbolKind::Repeat(ref repeat) => {
250                 self.validate_symbol(&repeat.symbol)?;
251             }
252             SymbolKind::Choose(ref sym) | SymbolKind::Name(_, ref sym) => {
253                 self.validate_symbol(sym)?;
254             }
255             SymbolKind::Lookahead | SymbolKind::Lookbehind | SymbolKind::Error => {}
256             SymbolKind::AmbiguousId(ref id) => {
257                 panic!("ambiguous id `{}` encountered after name resolution", id)
258             }
259             SymbolKind::Macro(..) => {
260                 panic!("macro not removed: {:?}", symbol);
261             }
262         }
263 
264         Ok(())
265     }
266 
validate_terminal(&mut self, span: Span, term: &TerminalString) -> NormResult<()>267     fn validate_terminal(&mut self, span: Span, term: &TerminalString) -> NormResult<()> {
268         match self.mode {
269             // If there is an extern token definition, validate that
270             // this terminal has a defined conversion.
271             TokenMode::Extern { ref conversions } => {
272                 if !conversions.contains(term) {
273                     return_err!(
274                         span,
275                         "terminal `{}` does not have a pattern defined for it",
276                         term
277                     );
278                 }
279             }
280 
281             // If there is no extern token definition, then collect
282             // the terminal literals ("class", r"[a-z]+") into a set.
283             TokenMode::Internal {
284                 ref mut match_block,
285             } => {
286                 match *term {
287                     TerminalString::Bare(_) => assert!(
288                         match_block.match_user_names.contains(term),
289                         "bare terminal without match entry: {}",
290                         term
291                     ),
292 
293                     TerminalString::Literal(ref l) => {
294                         match_block.add_literal_from_grammar(l.clone(), span)?
295                     }
296 
297                     // Error is a builtin terminal that always exists
298                     TerminalString::Error => (),
299                 }
300             }
301         }
302 
303         Ok(())
304     }
305 }
306 
307 ///////////////////////////////////////////////////////////////////////////
308 // Construction phase -- if we are constructing a tokenizer, this
309 // phase builds up an internal token DFA.
310 
construct(grammar: &mut Grammar, match_block: MatchBlock) -> NormResult<()>311 fn construct(grammar: &mut Grammar, match_block: MatchBlock) -> NormResult<()> {
312     let MatchBlock {
313         mut match_entries,
314         spans,
315         ..
316     } = match_block;
317 
318     // Sort match entries by order of increasing precedence.
319     match_entries.sort();
320 
321     // Build up two vectors, one of parsed regular expressions and
322     // one of precedences, that are parallel with `literals`.
323     let mut regexs = Vec::with_capacity(match_entries.len());
324     let mut precedences = Vec::with_capacity(match_entries.len());
325     {
326         for match_entry in &match_entries {
327             precedences.push(Precedence(match_entry.precedence));
328             match match_entry.match_literal {
329                 TerminalLiteral::Quoted(ref s) => {
330                     regexs.push(re::parse_literal(&s));
331                 }
332                 TerminalLiteral::Regex(ref s) => {
333                     match re::parse_regex(&s) {
334                         Ok(regex) => regexs.push(regex),
335                         Err(error) => {
336                             let literal_span = spans[&match_entry.match_literal];
337                             // FIXME -- take offset into account for
338                             // span; this requires knowing how many #
339                             // the user used, which we do not track
340                             return_err!(literal_span, "invalid regular expression: {}", error);
341                         }
342                     }
343                 }
344             }
345         }
346         Ok(())
347     }?;
348 
349     let dfa = match dfa::build_dfa(&regexs, &precedences) {
350         Ok(dfa) => dfa,
351         Err(DFAConstructionError::NFAConstructionError { index, error }) => {
352             let feature = match error {
353                 NamedCaptures => r#"named captures (`(?P<foo>...)`)"#,
354                 NonGreedy => r#""non-greedy" repetitions (`*?` or `+?`)"#,
355                 WordBoundary => r#"word boundaries (`\b` or `\B`)"#,
356                 LineBoundary => r#"line boundaries (`^` or `$`)"#,
357                 TextBoundary => r#"text boundaries (`^` or `$`)"#,
358                 ByteRegex => r#"byte-based matches"#,
359             };
360             let literal = &match_entries[index.index()].match_literal;
361             return_err!(
362                 spans[literal],
363                 "{} are not supported in regular expressions",
364                 feature
365             )
366         }
367         Err(DFAConstructionError::Ambiguity { match0, match1 }) => {
368             let literal0 = &match_entries[match0.index()].match_literal;
369             let literal1 = &match_entries[match1.index()].match_literal;
370             // FIXME(#88) -- it'd be nice to give an example here
371             return_err!(
372                 spans[literal0],
373                 "ambiguity detected between the terminal `{}` and the terminal `{}`",
374                 literal0,
375                 literal1
376             )
377         }
378     };
379 
380     grammar
381         .items
382         .push(GrammarItem::InternToken(InternToken { match_entries, dfa }));
383 
384     // we need to inject a `'input` lifetime and `input: &'input str` parameter as well:
385 
386     let input_lifetime = Lifetime::input();
387     for parameter in &grammar.type_parameters {
388         match parameter {
389             TypeParameter::Lifetime(i) if *i == input_lifetime => {
390                 return_err!(
391                     grammar.span,
392                     "since there is no external token enum specified, \
393                      the `'input` lifetime is implicit and cannot be declared"
394                 );
395             }
396             _ => {}
397         }
398     }
399 
400     let input_parameter = Atom::from(INPUT_PARAMETER);
401     for parameter in &grammar.parameters {
402         if parameter.name == input_parameter {
403             return_err!(
404                 grammar.span,
405                 "since there is no external token enum specified, \
406                  the `input` parameter is implicit and cannot be declared"
407             );
408         }
409     }
410 
411     grammar
412         .type_parameters
413         .insert(0, TypeParameter::Lifetime(input_lifetime.clone()));
414 
415     let parameter = Parameter {
416         name: input_parameter,
417         ty: TypeRef::Ref {
418             lifetime: Some(input_lifetime),
419             mutable: false,
420             referent: Box::new(TypeRef::Id(Atom::from("str"))),
421         },
422     };
423     grammar.parameters.push(parameter);
424 
425     Ok(())
426 }
427