1 //! If an extern token is provided, then this pass validates that
2 //! terminal IDs have conversions. Otherwise, it generates a
3 //! tokenizer. This can only be done after macro expansion because
4 //! some macro arguments never make it into an actual production and
5 //! are only used in `if` conditions; we use string literals for
6 //! those, but they do not have to have a defined conversion.
7
8 use super::{NormError, NormResult};
9
10 use crate::collections::{Map, Set};
11 use crate::grammar::consts::*;
12 use crate::grammar::parse_tree::*;
13 use crate::lexer::dfa::{self, DFAConstructionError, Precedence};
14 use crate::lexer::nfa::NFAConstructionError::*;
15 use crate::lexer::re;
16 use string_cache::DefaultAtom as Atom;
17
18 #[cfg(test)]
19 mod test;
20
validate(mut grammar: Grammar) -> NormResult<Grammar>21 pub fn validate(mut grammar: Grammar) -> NormResult<Grammar> {
22 let mode = {
23 let mode = if let Some(enum_token) = grammar.enum_token() {
24 assert!(
25 grammar.match_token().is_none(),
26 "validator permitted both an extern/match section"
27 );
28
29 TokenMode::Extern {
30 conversions: enum_token
31 .conversions
32 .iter()
33 .map(|conversion| conversion.from.clone())
34 .collect(),
35 }
36 } else {
37 if cfg!(not(feature = "lexer")) {
38 return_err!(
39 Span::default(),
40 "The `lexer` feature must be specified unless an `extern` lexer is defined"
41 );
42 }
43 TokenMode::Internal {
44 match_block: MatchBlock::new(grammar.match_token())?,
45 }
46 };
47
48 let mut validator = Validator {
49 grammar: &grammar,
50 mode,
51 };
52
53 validator.validate()?;
54
55 validator.mode
56 };
57
58 match mode {
59 TokenMode::Extern { .. } => {
60 // If using an external tokenizer, we're all done at this point.
61 }
62 TokenMode::Internal { match_block } => {
63 // Otherwise, construct the `InternToken` item.
64 construct(&mut grammar, match_block)?;
65 }
66 }
67
68 Ok(grammar)
69 }
70
71 ///////////////////////////////////////////////////////////////////////////
72 // Validation phase -- this phase walks the grammar and visits all
73 // terminals. If using an external set of tokens, it checks that all
74 // terminals have a defined conversion to some pattern. Otherwise,
75 // it collects all terminals into the `all_literals` set for later use.
76
77 struct Validator<'grammar> {
78 grammar: &'grammar Grammar,
79 mode: TokenMode,
80 }
81
82 enum TokenMode {
83 /// If there is an `extern { ... }` section that defines
84 /// conversions of the form `TERMINAL => PATTERN`, then this is a
85 /// set of those terminals. These are the only terminals that the
86 /// user should be using.
87 Extern { conversions: Set<TerminalString> },
88
89 /// Otherwise, we are synthesizing the tokenizer. In that case,
90 /// `match_block` summarizes the data from the `match { ... }`
91 /// section, if any. If there was no `match` section, or the
92 /// section contains a wildcard, the user can also use additional
93 /// terminals in the grammar.
94 Internal { match_block: MatchBlock },
95 }
96
97 /// Data summarizing the `match { }` block, along with any literals we
98 /// scraped up.
99 #[derive(Default)]
100 struct MatchBlock {
101 /// This map stores the `match { }` entries. If `match_catch_all`
102 /// is true, then we will grow this set with "identity mappings"
103 /// for new literals that we find.
104 match_entries: Vec<MatchEntry>,
105
106 /// The names of all terminals the user can legally type. If
107 /// `match_catch_all` is true, then if we encounter additional
108 /// terminal literals in the grammar, we will add them to this
109 /// set.
110 match_user_names: Set<TerminalString>,
111
112 /// For each terminal literal that we have to match, the span
113 /// where it appeared in user's source. This can either be in the
114 /// `match { }` section or else in the grammar somewhere (if added
115 /// due to a catch-all, or there is no match section).
116 spans: Map<TerminalLiteral, Span>,
117
118 /// True if we should permit unrecognized literals to be used.
119 catch_all: bool,
120 }
121
122 impl MatchBlock {
123 /// Creates a `MatchBlock` by reading the data out of the `match {
124 /// ... }` block that the user provided (if any).
new(opt_match_token: Option<&MatchToken>) -> NormResult<Self>125 fn new(opt_match_token: Option<&MatchToken>) -> NormResult<Self> {
126 let mut match_block = Self::default();
127 if let Some(match_token) = opt_match_token {
128 for (idx, mc) in match_token.contents.iter().enumerate() {
129 let precedence = match_token.contents.len() - idx;
130 for item in &mc.items {
131 match *item {
132 MatchItem::Unmapped(ref sym, span) => {
133 match_block.add_match_entry(
134 precedence,
135 sym.clone(),
136 MatchMapping::Terminal(TerminalString::Literal(sym.clone())),
137 span,
138 )?;
139 }
140 MatchItem::Mapped(ref sym, ref user, span) => {
141 match_block.add_match_entry(
142 precedence,
143 sym.clone(),
144 user.clone(),
145 span,
146 )?;
147 }
148 MatchItem::CatchAll(_) => {
149 match_block.catch_all = true;
150 }
151 }
152 }
153 }
154 } else {
155 // no match block is equivalent to `match { _ }`
156 match_block.catch_all = true;
157 }
158 Ok(match_block)
159 }
160
add_match_entry( &mut self, match_group_precedence: usize, sym: TerminalLiteral, user_name: MatchMapping, span: Span, ) -> NormResult<()>161 fn add_match_entry(
162 &mut self,
163 match_group_precedence: usize,
164 sym: TerminalLiteral,
165 user_name: MatchMapping,
166 span: Span,
167 ) -> NormResult<()> {
168 if let Some(_old_span) = self.spans.insert(sym.clone(), span) {
169 return_err!(span, "multiple match entries for `{}`", sym);
170 }
171
172 // NB: It's legal for multiple regex to produce same terminal.
173 if let MatchMapping::Terminal(user_name) = &user_name {
174 self.match_user_names.insert(user_name.clone());
175 }
176
177 self.match_entries.push(MatchEntry {
178 precedence: match_group_precedence * 2 + sym.base_precedence(),
179 match_literal: sym,
180 user_name,
181 });
182 Ok(())
183 }
184
add_literal_from_grammar(&mut self, sym: TerminalLiteral, span: Span) -> NormResult<()>185 fn add_literal_from_grammar(&mut self, sym: TerminalLiteral, span: Span) -> NormResult<()> {
186 // Already saw this literal, maybe in a match entry, maybe in the grammar.
187 if self
188 .match_user_names
189 .contains(&TerminalString::Literal(sym.clone()))
190 {
191 return Ok(());
192 }
193
194 if !self.catch_all {
195 return_err!(
196 span,
197 "terminal `{}` does not have a match mapping defined for it",
198 sym
199 );
200 }
201
202 self.match_user_names
203 .insert(TerminalString::Literal(sym.clone()));
204
205 self.match_entries.push(MatchEntry {
206 precedence: sym.base_precedence(),
207 match_literal: sym.clone(),
208 user_name: MatchMapping::Terminal(TerminalString::Literal(sym.clone())),
209 });
210
211 self.spans.insert(sym, span);
212
213 Ok(())
214 }
215 }
216
217 impl<'grammar> Validator<'grammar> {
validate(&mut self) -> NormResult<()>218 fn validate(&mut self) -> NormResult<()> {
219 for item in &self.grammar.items {
220 match *item {
221 GrammarItem::Use(..) => {}
222 GrammarItem::MatchToken(..) => {}
223 GrammarItem::ExternToken(_) => {}
224 GrammarItem::InternToken(_) => {}
225 GrammarItem::Nonterminal(ref data) => {
226 for alternative in &data.alternatives {
227 self.validate_alternative(alternative)?;
228 }
229 }
230 }
231 }
232 Ok(())
233 }
234
validate_alternative(&mut self, alternative: &Alternative) -> NormResult<()>235 fn validate_alternative(&mut self, alternative: &Alternative) -> NormResult<()> {
236 assert!(alternative.condition.is_none()); // macro expansion should have removed these
237 self.validate_expr(&alternative.expr)?;
238 Ok(())
239 }
240
validate_expr(&mut self, expr: &ExprSymbol) -> NormResult<()>241 fn validate_expr(&mut self, expr: &ExprSymbol) -> NormResult<()> {
242 for symbol in &expr.symbols {
243 self.validate_symbol(symbol)?;
244 }
245 Ok(())
246 }
247
validate_symbol(&mut self, symbol: &Symbol) -> NormResult<()>248 fn validate_symbol(&mut self, symbol: &Symbol) -> NormResult<()> {
249 match symbol.kind {
250 SymbolKind::Expr(ref expr) => {
251 self.validate_expr(expr)?;
252 }
253 SymbolKind::Terminal(ref term) => {
254 self.validate_terminal(symbol.span, term)?;
255 }
256 SymbolKind::Nonterminal(_) => {}
257 SymbolKind::Repeat(ref repeat) => {
258 self.validate_symbol(&repeat.symbol)?;
259 }
260 SymbolKind::Choose(ref sym) | SymbolKind::Name(_, ref sym) => {
261 self.validate_symbol(sym)?;
262 }
263 SymbolKind::Lookahead | SymbolKind::Lookbehind | SymbolKind::Error => {}
264 SymbolKind::AmbiguousId(ref id) => {
265 panic!("ambiguous id `{}` encountered after name resolution", id)
266 }
267 SymbolKind::Macro(..) => {
268 panic!("macro not removed: {:?}", symbol);
269 }
270 }
271
272 Ok(())
273 }
274
validate_terminal(&mut self, span: Span, term: &TerminalString) -> NormResult<()>275 fn validate_terminal(&mut self, span: Span, term: &TerminalString) -> NormResult<()> {
276 match self.mode {
277 // If there is an extern token definition, validate that
278 // this terminal has a defined conversion.
279 TokenMode::Extern { ref conversions } => {
280 if !conversions.contains(term) {
281 return_err!(
282 span,
283 "terminal `{}` does not have a pattern defined for it",
284 term
285 );
286 }
287 }
288
289 // If there is no extern token definition, then collect
290 // the terminal literals ("class", r"[a-z]+") into a set.
291 TokenMode::Internal {
292 ref mut match_block,
293 } => {
294 match *term {
295 TerminalString::Bare(_) => assert!(
296 match_block.match_user_names.contains(term),
297 "bare terminal without match entry: {}",
298 term
299 ),
300
301 TerminalString::Literal(ref l) => {
302 match_block.add_literal_from_grammar(l.clone(), span)?
303 }
304
305 // Error is a builtin terminal that always exists
306 TerminalString::Error => (),
307 }
308 }
309 }
310
311 Ok(())
312 }
313 }
314
315 ///////////////////////////////////////////////////////////////////////////
316 // Construction phase -- if we are constructing a tokenizer, this
317 // phase builds up an internal token DFA.
318
construct(grammar: &mut Grammar, match_block: MatchBlock) -> NormResult<()>319 fn construct(grammar: &mut Grammar, match_block: MatchBlock) -> NormResult<()> {
320 let MatchBlock {
321 mut match_entries,
322 spans,
323 ..
324 } = match_block;
325
326 // Sort match entries by order of increasing precedence.
327 match_entries.sort();
328
329 // Build up two vectors, one of parsed regular expressions and
330 // one of precedences, that are parallel with `literals`.
331 let mut regexs = Vec::with_capacity(match_entries.len());
332 let mut precedences = Vec::with_capacity(match_entries.len());
333 for match_entry in &match_entries {
334 precedences.push(Precedence(match_entry.precedence));
335 match match_entry.match_literal {
336 TerminalLiteral::Quoted(ref s) => {
337 regexs.push(re::parse_literal(&s));
338 }
339 TerminalLiteral::Regex(ref s) => {
340 match re::parse_regex(&s) {
341 Ok(regex) => regexs.push(regex),
342 Err(error) => {
343 let literal_span = spans[&match_entry.match_literal];
344 // FIXME -- take offset into account for
345 // span; this requires knowing how many #
346 // the user used, which we do not track
347 return_err!(literal_span, "invalid regular expression: {}", error);
348 }
349 }
350 }
351 }
352 }
353
354 let dfa = match dfa::build_dfa(®exs, &precedences) {
355 Ok(dfa) => dfa,
356 Err(DFAConstructionError::NFAConstructionError { index, error }) => {
357 let feature = match error {
358 NamedCaptures => r#"named captures (`(?P<foo>...)`)"#,
359 NonGreedy => r#""non-greedy" repetitions (`*?` or `+?`)"#,
360 WordBoundary => r#"word boundaries (`\b` or `\B`)"#,
361 LineBoundary => r#"line boundaries (`^` or `$`)"#,
362 TextBoundary => r#"text boundaries (`^` or `$`)"#,
363 ByteRegex => r#"byte-based matches"#,
364 };
365 let literal = &match_entries[index.index()].match_literal;
366 return_err!(
367 spans[literal],
368 "{} are not supported in regular expressions",
369 feature
370 )
371 }
372 Err(DFAConstructionError::Ambiguity { match0, match1 }) => {
373 let literal0 = &match_entries[match0.index()].match_literal;
374 let literal1 = &match_entries[match1.index()].match_literal;
375 // FIXME(#88) -- it'd be nice to give an example here
376 return_err!(
377 spans[literal0],
378 "ambiguity detected between the terminal `{}` and the terminal `{}`",
379 literal0,
380 literal1
381 )
382 }
383 };
384
385 grammar
386 .items
387 .push(GrammarItem::InternToken(InternToken { match_entries, dfa }));
388
389 // we need to inject a `'input` lifetime and `input: &'input str` parameter as well:
390
391 let input_lifetime = Lifetime::input();
392 for parameter in &grammar.type_parameters {
393 match parameter {
394 TypeParameter::Lifetime(i) if *i == input_lifetime => {
395 return_err!(
396 grammar.span,
397 "since there is no external token enum specified, \
398 the `'input` lifetime is implicit and cannot be declared"
399 );
400 }
401 _ => {}
402 }
403 }
404
405 let input_parameter = Atom::from(INPUT_PARAMETER);
406 for parameter in &grammar.parameters {
407 if parameter.name == input_parameter {
408 return_err!(
409 grammar.span,
410 "since there is no external token enum specified, \
411 the `input` parameter is implicit and cannot be declared"
412 );
413 }
414 }
415
416 grammar
417 .type_parameters
418 .insert(0, TypeParameter::Lifetime(input_lifetime.clone()));
419
420 let parameter = Parameter {
421 name: input_parameter,
422 ty: TypeRef::Ref {
423 lifetime: Some(input_lifetime),
424 mutable: false,
425 referent: Box::new(TypeRef::Id(Atom::from("str"))),
426 },
427 };
428 grammar.parameters.push(parameter);
429
430 Ok(())
431 }
432