1 //! If an extern token is provided, then this pass validates that
2 //! terminal IDs have conversions. Otherwise, it generates a
3 //! tokenizer. This can only be done after macro expansion because
4 //! some macro arguments never make it into an actual production and
5 //! are only used in `if` conditions; we use string literals for
6 //! those, but they do not have to have a defined conversion.
7 
8 use super::{NormError, NormResult};
9 
10 use crate::collections::{Map, Set};
11 use crate::grammar::consts::*;
12 use crate::grammar::parse_tree::*;
13 use crate::lexer::dfa::{self, DFAConstructionError, Precedence};
14 use crate::lexer::nfa::NFAConstructionError::*;
15 use crate::lexer::re;
16 use string_cache::DefaultAtom as Atom;
17 
18 #[cfg(test)]
19 mod test;
20 
validate(mut grammar: Grammar) -> NormResult<Grammar>21 pub fn validate(mut grammar: Grammar) -> NormResult<Grammar> {
22     let mode = {
23         let mode = if let Some(enum_token) = grammar.enum_token() {
24             assert!(
25                 grammar.match_token().is_none(),
26                 "validator permitted both an extern/match section"
27             );
28 
29             TokenMode::Extern {
30                 conversions: enum_token
31                     .conversions
32                     .iter()
33                     .map(|conversion| conversion.from.clone())
34                     .collect(),
35             }
36         } else {
37             if cfg!(not(feature = "lexer")) {
38                 return_err!(
39                     Span::default(),
40                     "The `lexer` feature must be specified unless an `extern` lexer is defined"
41                 );
42             }
43             TokenMode::Internal {
44                 match_block: MatchBlock::new(grammar.match_token())?,
45             }
46         };
47 
48         let mut validator = Validator {
49             grammar: &grammar,
50             mode,
51         };
52 
53         validator.validate()?;
54 
55         validator.mode
56     };
57 
58     match mode {
59         TokenMode::Extern { .. } => {
60             // If using an external tokenizer, we're all done at this point.
61         }
62         TokenMode::Internal { match_block } => {
63             // Otherwise, construct the `InternToken` item.
64             construct(&mut grammar, match_block)?;
65         }
66     }
67 
68     Ok(grammar)
69 }
70 
71 ///////////////////////////////////////////////////////////////////////////
72 // Validation phase -- this phase walks the grammar and visits all
73 // terminals. If using an external set of tokens, it checks that all
74 // terminals have a defined conversion to some pattern. Otherwise,
75 // it collects all terminals into the `all_literals` set for later use.
76 
77 struct Validator<'grammar> {
78     grammar: &'grammar Grammar,
79     mode: TokenMode,
80 }
81 
82 enum TokenMode {
83     /// If there is an `extern { ... }` section that defines
84     /// conversions of the form `TERMINAL => PATTERN`, then this is a
85     /// set of those terminals. These are the only terminals that the
86     /// user should be using.
87     Extern { conversions: Set<TerminalString> },
88 
89     /// Otherwise, we are synthesizing the tokenizer. In that case,
90     /// `match_block` summarizes the data from the `match { ... }`
91     /// section, if any. If there was no `match` section, or the
92     /// section contains a wildcard, the user can also use additional
93     /// terminals in the grammar.
94     Internal { match_block: MatchBlock },
95 }
96 
97 /// Data summarizing the `match { }` block, along with any literals we
98 /// scraped up.
99 #[derive(Default)]
100 struct MatchBlock {
101     /// This map stores the `match { }` entries. If `match_catch_all`
102     /// is true, then we will grow this set with "identity mappings"
103     /// for new literals that we find.
104     match_entries: Vec<MatchEntry>,
105 
106     /// The names of all terminals the user can legally type. If
107     /// `match_catch_all` is true, then if we encounter additional
108     /// terminal literals in the grammar, we will add them to this
109     /// set.
110     match_user_names: Set<TerminalString>,
111 
112     /// For each terminal literal that we have to match, the span
113     /// where it appeared in user's source.  This can either be in the
114     /// `match { }` section or else in the grammar somewhere (if added
115     /// due to a catch-all, or there is no match section).
116     spans: Map<TerminalLiteral, Span>,
117 
118     /// True if we should permit unrecognized literals to be used.
119     catch_all: bool,
120 }
121 
122 impl MatchBlock {
123     /// Creates a `MatchBlock` by reading the data out of the `match {
124     /// ... }` block that the user provided (if any).
new(opt_match_token: Option<&MatchToken>) -> NormResult<Self>125     fn new(opt_match_token: Option<&MatchToken>) -> NormResult<Self> {
126         let mut match_block = Self::default();
127         if let Some(match_token) = opt_match_token {
128             for (idx, mc) in match_token.contents.iter().enumerate() {
129                 let precedence = match_token.contents.len() - idx;
130                 for item in &mc.items {
131                     match *item {
132                         MatchItem::Unmapped(ref sym, span) => {
133                             match_block.add_match_entry(
134                                 precedence,
135                                 sym.clone(),
136                                 MatchMapping::Terminal(TerminalString::Literal(sym.clone())),
137                                 span,
138                             )?;
139                         }
140                         MatchItem::Mapped(ref sym, ref user, span) => {
141                             match_block.add_match_entry(
142                                 precedence,
143                                 sym.clone(),
144                                 user.clone(),
145                                 span,
146                             )?;
147                         }
148                         MatchItem::CatchAll(_) => {
149                             match_block.catch_all = true;
150                         }
151                     }
152                 }
153             }
154         } else {
155             // no match block is equivalent to `match { _ }`
156             match_block.catch_all = true;
157         }
158         Ok(match_block)
159     }
160 
add_match_entry( &mut self, match_group_precedence: usize, sym: TerminalLiteral, user_name: MatchMapping, span: Span, ) -> NormResult<()>161     fn add_match_entry(
162         &mut self,
163         match_group_precedence: usize,
164         sym: TerminalLiteral,
165         user_name: MatchMapping,
166         span: Span,
167     ) -> NormResult<()> {
168         if let Some(_old_span) = self.spans.insert(sym.clone(), span) {
169             return_err!(span, "multiple match entries for `{}`", sym);
170         }
171 
172         // NB: It's legal for multiple regex to produce same terminal.
173         if let MatchMapping::Terminal(user_name) = &user_name {
174             self.match_user_names.insert(user_name.clone());
175         }
176 
177         self.match_entries.push(MatchEntry {
178             precedence: match_group_precedence * 2 + sym.base_precedence(),
179             match_literal: sym,
180             user_name,
181         });
182         Ok(())
183     }
184 
add_literal_from_grammar(&mut self, sym: TerminalLiteral, span: Span) -> NormResult<()>185     fn add_literal_from_grammar(&mut self, sym: TerminalLiteral, span: Span) -> NormResult<()> {
186         // Already saw this literal, maybe in a match entry, maybe in the grammar.
187         if self
188             .match_user_names
189             .contains(&TerminalString::Literal(sym.clone()))
190         {
191             return Ok(());
192         }
193 
194         if !self.catch_all {
195             return_err!(
196                 span,
197                 "terminal `{}` does not have a match mapping defined for it",
198                 sym
199             );
200         }
201 
202         self.match_user_names
203             .insert(TerminalString::Literal(sym.clone()));
204 
205         self.match_entries.push(MatchEntry {
206             precedence: sym.base_precedence(),
207             match_literal: sym.clone(),
208             user_name: MatchMapping::Terminal(TerminalString::Literal(sym.clone())),
209         });
210 
211         self.spans.insert(sym, span);
212 
213         Ok(())
214     }
215 }
216 
217 impl<'grammar> Validator<'grammar> {
validate(&mut self) -> NormResult<()>218     fn validate(&mut self) -> NormResult<()> {
219         for item in &self.grammar.items {
220             match *item {
221                 GrammarItem::Use(..) => {}
222                 GrammarItem::MatchToken(..) => {}
223                 GrammarItem::ExternToken(_) => {}
224                 GrammarItem::InternToken(_) => {}
225                 GrammarItem::Nonterminal(ref data) => {
226                     for alternative in &data.alternatives {
227                         self.validate_alternative(alternative)?;
228                     }
229                 }
230             }
231         }
232         Ok(())
233     }
234 
validate_alternative(&mut self, alternative: &Alternative) -> NormResult<()>235     fn validate_alternative(&mut self, alternative: &Alternative) -> NormResult<()> {
236         assert!(alternative.condition.is_none()); // macro expansion should have removed these
237         self.validate_expr(&alternative.expr)?;
238         Ok(())
239     }
240 
validate_expr(&mut self, expr: &ExprSymbol) -> NormResult<()>241     fn validate_expr(&mut self, expr: &ExprSymbol) -> NormResult<()> {
242         for symbol in &expr.symbols {
243             self.validate_symbol(symbol)?;
244         }
245         Ok(())
246     }
247 
validate_symbol(&mut self, symbol: &Symbol) -> NormResult<()>248     fn validate_symbol(&mut self, symbol: &Symbol) -> NormResult<()> {
249         match symbol.kind {
250             SymbolKind::Expr(ref expr) => {
251                 self.validate_expr(expr)?;
252             }
253             SymbolKind::Terminal(ref term) => {
254                 self.validate_terminal(symbol.span, term)?;
255             }
256             SymbolKind::Nonterminal(_) => {}
257             SymbolKind::Repeat(ref repeat) => {
258                 self.validate_symbol(&repeat.symbol)?;
259             }
260             SymbolKind::Choose(ref sym) | SymbolKind::Name(_, ref sym) => {
261                 self.validate_symbol(sym)?;
262             }
263             SymbolKind::Lookahead | SymbolKind::Lookbehind | SymbolKind::Error => {}
264             SymbolKind::AmbiguousId(ref id) => {
265                 panic!("ambiguous id `{}` encountered after name resolution", id)
266             }
267             SymbolKind::Macro(..) => {
268                 panic!("macro not removed: {:?}", symbol);
269             }
270         }
271 
272         Ok(())
273     }
274 
validate_terminal(&mut self, span: Span, term: &TerminalString) -> NormResult<()>275     fn validate_terminal(&mut self, span: Span, term: &TerminalString) -> NormResult<()> {
276         match self.mode {
277             // If there is an extern token definition, validate that
278             // this terminal has a defined conversion.
279             TokenMode::Extern { ref conversions } => {
280                 if !conversions.contains(term) {
281                     return_err!(
282                         span,
283                         "terminal `{}` does not have a pattern defined for it",
284                         term
285                     );
286                 }
287             }
288 
289             // If there is no extern token definition, then collect
290             // the terminal literals ("class", r"[a-z]+") into a set.
291             TokenMode::Internal {
292                 ref mut match_block,
293             } => {
294                 match *term {
295                     TerminalString::Bare(_) => assert!(
296                         match_block.match_user_names.contains(term),
297                         "bare terminal without match entry: {}",
298                         term
299                     ),
300 
301                     TerminalString::Literal(ref l) => {
302                         match_block.add_literal_from_grammar(l.clone(), span)?
303                     }
304 
305                     // Error is a builtin terminal that always exists
306                     TerminalString::Error => (),
307                 }
308             }
309         }
310 
311         Ok(())
312     }
313 }
314 
315 ///////////////////////////////////////////////////////////////////////////
316 // Construction phase -- if we are constructing a tokenizer, this
317 // phase builds up an internal token DFA.
318 
construct(grammar: &mut Grammar, match_block: MatchBlock) -> NormResult<()>319 fn construct(grammar: &mut Grammar, match_block: MatchBlock) -> NormResult<()> {
320     let MatchBlock {
321         mut match_entries,
322         spans,
323         ..
324     } = match_block;
325 
326     // Sort match entries by order of increasing precedence.
327     match_entries.sort();
328 
329     // Build up two vectors, one of parsed regular expressions and
330     // one of precedences, that are parallel with `literals`.
331     let mut regexs = Vec::with_capacity(match_entries.len());
332     let mut precedences = Vec::with_capacity(match_entries.len());
333     for match_entry in &match_entries {
334         precedences.push(Precedence(match_entry.precedence));
335         match match_entry.match_literal {
336             TerminalLiteral::Quoted(ref s) => {
337                 regexs.push(re::parse_literal(&s));
338             }
339             TerminalLiteral::Regex(ref s) => {
340                 match re::parse_regex(&s) {
341                     Ok(regex) => regexs.push(regex),
342                     Err(error) => {
343                         let literal_span = spans[&match_entry.match_literal];
344                         // FIXME -- take offset into account for
345                         // span; this requires knowing how many #
346                         // the user used, which we do not track
347                         return_err!(literal_span, "invalid regular expression: {}", error);
348                     }
349                 }
350             }
351         }
352     }
353 
354     let dfa = match dfa::build_dfa(&regexs, &precedences) {
355         Ok(dfa) => dfa,
356         Err(DFAConstructionError::NFAConstructionError { index, error }) => {
357             let feature = match error {
358                 NamedCaptures => r#"named captures (`(?P<foo>...)`)"#,
359                 NonGreedy => r#""non-greedy" repetitions (`*?` or `+?`)"#,
360                 WordBoundary => r#"word boundaries (`\b` or `\B`)"#,
361                 LineBoundary => r#"line boundaries (`^` or `$`)"#,
362                 TextBoundary => r#"text boundaries (`^` or `$`)"#,
363                 ByteRegex => r#"byte-based matches"#,
364             };
365             let literal = &match_entries[index.index()].match_literal;
366             return_err!(
367                 spans[literal],
368                 "{} are not supported in regular expressions",
369                 feature
370             )
371         }
372         Err(DFAConstructionError::Ambiguity { match0, match1 }) => {
373             let literal0 = &match_entries[match0.index()].match_literal;
374             let literal1 = &match_entries[match1.index()].match_literal;
375             // FIXME(#88) -- it'd be nice to give an example here
376             return_err!(
377                 spans[literal0],
378                 "ambiguity detected between the terminal `{}` and the terminal `{}`",
379                 literal0,
380                 literal1
381             )
382         }
383     };
384 
385     grammar
386         .items
387         .push(GrammarItem::InternToken(InternToken { match_entries, dfa }));
388 
389     // we need to inject a `'input` lifetime and `input: &'input str` parameter as well:
390 
391     let input_lifetime = Lifetime::input();
392     for parameter in &grammar.type_parameters {
393         match parameter {
394             TypeParameter::Lifetime(i) if *i == input_lifetime => {
395                 return_err!(
396                     grammar.span,
397                     "since there is no external token enum specified, \
398                      the `'input` lifetime is implicit and cannot be declared"
399                 );
400             }
401             _ => {}
402         }
403     }
404 
405     let input_parameter = Atom::from(INPUT_PARAMETER);
406     for parameter in &grammar.parameters {
407         if parameter.name == input_parameter {
408             return_err!(
409                 grammar.span,
410                 "since there is no external token enum specified, \
411                  the `input` parameter is implicit and cannot be declared"
412             );
413         }
414     }
415 
416     grammar
417         .type_parameters
418         .insert(0, TypeParameter::Lifetime(input_lifetime.clone()));
419 
420     let parameter = Parameter {
421         name: input_parameter,
422         ty: TypeRef::Ref {
423             lifetime: Some(input_lifetime),
424             mutable: false,
425             referent: Box::new(TypeRef::Id(Atom::from("str"))),
426         },
427     };
428     grammar.parameters.push(parameter);
429 
430     Ok(())
431 }
432