1 //! Definition of a lexer for the WebAssembly text format.
2 //!
//! This module provides a [`Lexer`][] type which is an iterator over the raw
//! tokens of a WebAssembly text file. A [`Lexer`][] accounts for every single
//! byte in a WebAssembly text file, returning tokens even for comments and
//! whitespace. Typically you'll ignore comments and whitespace, however.
7 //!
8 //! If you'd like to iterate over the tokens in a file you can do so via:
9 //!
10 //! ```
11 //! # fn foo() -> Result<(), wast::Error> {
12 //! use wast::lexer::Lexer;
13 //!
14 //! let wat = "(module (func $foo))";
15 //! for token in Lexer::new(wat) {
16 //! println!("{:?}", token?);
17 //! }
18 //! # Ok(())
19 //! # }
20 //! ```
21 //!
22 //! Note that you'll typically not use this module but will rather use
23 //! [`ParseBuffer`](crate::parser::ParseBuffer) instead.
24 //!
25 //! [`Lexer`]: crate::lexer::Lexer
26
27 use crate::{Error, Span};
28 use std::borrow::Cow;
29 use std::char;
30 use std::fmt;
31 use std::str;
32
/// A structure used to lex the s-expression syntax of WAT files.
///
/// This structure is used to generate [`Token`] items, which should account for
/// every single byte of the input as we iterate over it. A [`LexError`] is
/// reported for any non-lexable text.
#[derive(Clone)]
pub struct Lexer<'a> {
    // The tail of `input` that has not been lexed yet; every token that is
    // produced is split off the front of this slice.
    remaining: &'a str,
    // The complete original source text. The current byte offset is derived
    // from `input.len() - remaining.len()`.
    input: &'a str,
    // When `false` (the default), "confusing" unicode characters found in
    // comments and string literals are turned into lex errors.
    allow_confusing_unicode: bool,
}
44
/// A fragment of source lex'd from an input string.
///
/// This enumeration contains all kinds of fragments, including comments and
/// whitespace. For most cases you'll probably ignore these and simply look at
/// tokens.
#[derive(Debug, PartialEq)]
pub enum Token<'a> {
    /// A line comment, preceded with `;;`. The payload is the comment's source
    /// text up to, but not including, the terminating newline.
    LineComment(&'a str),

    /// A block comment, surrounded by `(;` and `;)`. Note that these can be
    /// nested. The payload includes both delimiters.
    BlockComment(&'a str),

    /// A fragment of source that represents whitespace.
    Whitespace(&'a str),

    /// A left-parenthesis, including the source text for where it comes from.
    LParen(&'a str),
    /// A right-parenthesis, including the source text for where it comes from.
    RParen(&'a str),

    /// A string literal, which is actually a list of bytes.
    String(WasmString<'a>),

    /// An identifier (like `$foo`).
    ///
    /// All identifiers start with `$`, are followed by at least one more
    /// `idchar`, and the payload here is the original source text.
    Id(&'a str),

    /// A keyword, or something that starts with a lowercase ASCII letter.
    ///
    /// The payload here is the original source text.
    Keyword(&'a str),

    /// A reserved series of `idchar` symbols. Unknown what this is meant to be
    /// used for, you'll probably generate an error about an unexpected token.
    Reserved(&'a str),

    /// An integer.
    Integer(Integer<'a>),

    /// A float.
    Float(Float<'a>),
}
91
/// Errors that can be generated while lexing.
///
/// All lexing errors have line/column/position information as well as a
/// `LexError` indicating what kind of error happened while lexing.
#[derive(Debug, Clone, PartialEq)]
pub enum LexError {
    /// A dangling block comment was found with an unbalanced `(;` which was
    /// never terminated in the file.
    DanglingBlockComment,

    /// An unexpected character was encountered when generally parsing and
    /// looking for something else.
    Unexpected(char),

    /// An invalid `char` in a string literal was found.
    InvalidStringElement(char),

    /// An invalid string escape letter was found (the thing after the `\` in
    /// string literals).
    InvalidStringEscape(char),

    /// An invalid hexadecimal digit was found.
    InvalidHexDigit(char),

    /// An invalid base-10 digit was found.
    InvalidDigit(char),

    /// Parsing expected `wanted` but ended up finding `found` instead where the
    /// two characters aren't the same.
    Expected {
        /// The character that was expected to be found.
        wanted: char,
        /// The character that was actually found.
        found: char,
    },

    /// We needed to parse more but EOF (or end of the string) was encountered.
    UnexpectedEof,

    /// A number failed to parse because it was too big to fit within the target
    /// type.
    NumberTooBig,

    /// An invalid unicode value was found in a `\u{...}` escape in a string,
    /// only valid unicode scalars can be escaped that way.
    InvalidUnicodeValue(u32),

    /// A lone underscore was found when parsing a number, since underscores
    /// should always be preceded and succeeded with a digit of some form.
    LoneUnderscore,

    /// A "confusing" unicode character is present in a comment or a string
    /// literal, such as a character that changes the direction text is
    /// typically displayed in editors. This could cause the human-read
    /// version to behave differently than the compiler-visible version, so
    /// these are simply rejected for now.
    ConfusingUnicode(char),

    // Hidden placeholder variant; its `Display` arm is `unreachable!()`, so it
    // is never meant to be constructed.
    #[doc(hidden)]
    __Nonexhaustive,
}
153
/// An explicit sign written in front of a numeric literal.
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum SignToken {
    /// Plus sign: `+`.
    Plus,
    /// Minus sign: `-`.
    Minus,
}
162
/// A parsed integer, signed or unsigned.
///
/// Methods can be used to access the value of the integer.
#[derive(Debug, PartialEq)]
pub struct Integer<'a>(Box<IntegerInner<'a>>);

// Boxed payload of `Integer`.
#[derive(Debug, PartialEq)]
struct IntegerInner<'a> {
    // The explicit `+`/`-` written in the source, if any.
    sign: Option<SignToken>,
    // The original source text of the literal.
    src: &'a str,
    // The digits with underscores and any `0x` prefix removed; a leading `-`
    // is included for negative literals.
    val: Cow<'a, str>,
    // `true` when the literal was written with a `0x` prefix.
    hex: bool,
}
176
/// A parsed float.
///
/// Methods can be used to access the value of the float.
#[derive(Debug, PartialEq)]
pub struct Float<'a>(Box<FloatInner<'a>>);

// Boxed payload of `Float`.
#[derive(Debug, PartialEq)]
struct FloatInner<'a> {
    // The original source text of the literal.
    src: &'a str,
    // The literal broken into its components (or its `nan`/`inf` form).
    val: FloatVal<'a>,
}
188
/// A parsed string literal.
#[derive(Debug, PartialEq)]
pub struct WasmString<'a>(Box<WasmStringInner<'a>>);

// Boxed payload of `WasmString`.
#[derive(Debug, PartialEq)]
struct WasmStringInner<'a> {
    // The original source text of the literal, including both quotes.
    src: &'a str,
    // The decoded bytes of the literal; borrowed from the source when the
    // literal contains no escape sequences.
    val: Cow<'a, [u8]>,
}
198
/// Possible parsed float values.
#[derive(Debug, PartialEq)]
pub enum FloatVal<'a> {
    /// A float `NaN` representation.
    Nan {
        /// The specific payload bits written as `nan:0x...`, if any were
        /// given.
        val: Option<u64>,
        /// Whether or not this is a negative `NaN`.
        negative: bool,
    },
    /// A float infinity representation.
    Inf {
        #[allow(missing_docs)]
        negative: bool,
    },
    /// A parsed and separated floating point value.
    Val {
        /// Whether or not the `integral` and `decimal` are specified in hex.
        hex: bool,
        /// The float parts before the `.`
        integral: Cow<'a, str>,
        /// The float parts after the `.`
        decimal: Option<Cow<'a, str>>,
        /// The exponent to multiply this `integral.decimal` portion of the
        /// float by. If `hex` is true this is `2^exponent` and otherwise it's
        /// `10^exponent`.
        exponent: Option<Cow<'a, str>>,
    },
}
228
// Byte pattern matching every `idchar` of the text format, listed in ASCII
// order. Usable anywhere a pattern is expected.
//
// https://webassembly.github.io/spec/core/text/values.html#text-idchar
macro_rules! idchars {
    () => {
        b'!'
        | b'#'
        | b'$'
        | b'%'
        | b'&'
        | b'\''
        | b'*'
        | b'+'
        | b'-'
        | b'.'
        | b'/'
        | b'0'..=b'9'
        | b':'
        | b'<'
        | b'='
        | b'>'
        | b'?'
        | b'@'
        | b'A'..=b'Z'
        | b'\\'
        | b'^'
        | b'_'
        | b'`'
        | b'a'..=b'z'
        | b'|'
        | b'~'
    }
}
260
261 impl<'a> Lexer<'a> {
262 /// Creates a new lexer which will lex the `input` source string.
new(input: &str) -> Lexer<'_>263 pub fn new(input: &str) -> Lexer<'_> {
264 Lexer {
265 remaining: input,
266 input,
267 allow_confusing_unicode: false,
268 }
269 }
270
/// Returns the original source input that we're lexing.
///
/// This is always the full string handed to [`Lexer::new`], regardless of
/// how many tokens have been produced so far.
pub fn input(&self) -> &'a str {
    self.input
}
275
/// Configures whether "confusing" unicode characters are allowed while
/// lexing.
///
/// If allowed then no error will happen if these characters are found, but
/// otherwise if disallowed a lex error will be produced when these
/// characters are found. Confusing characters are denied by default.
///
/// For now "confusing characters" are primarily related to the "trojan
/// source" problem where it refers to characters which cause humans to read
/// text differently than this lexer, such as characters that alter the
/// left-to-right display of the source code.
///
/// Returns `self` so that calls can be chained.
pub fn allow_confusing_unicode(&mut self, allow: bool) -> &mut Self {
    self.allow_confusing_unicode = allow;
    self
}
291
292 /// Lexes the next token in the input.
293 ///
294 /// Returns `Some` if a token is found or `None` if we're at EOF.
295 ///
296 /// # Errors
297 ///
298 /// Returns an error if the input is malformed.
parse(&mut self) -> Result<Option<Token<'a>>, Error>299 pub fn parse(&mut self) -> Result<Option<Token<'a>>, Error> {
300 let pos = self.cur();
301 // This `match` generally parses the grammar specified at
302 //
303 // https://webassembly.github.io/spec/core/text/lexical.html#text-token
304 let byte = match self.remaining.as_bytes().get(0) {
305 Some(b) => b,
306 None => return Ok(None),
307 };
308
309 match byte {
310 // Open-parens check the next character to see if this is the start
311 // of a block comment, otherwise it's just a bland left-paren
312 // token.
313 b'(' => match self.remaining.as_bytes().get(1) {
314 Some(b';') => {
315 let mut level = 1;
316 // Note that we're doing a byte-level search here for the
317 // close-delimiter of `;)`. The actual source text is utf-8
318 // encode in `self.remaining` but due to how utf-8 works we
319 // can safely search for an ASCII byte since it'll never
320 // otherwise appear in the middle of a codepoint and if we
321 // find it then it's guaranteed to be the right byte.
322 //
323 // Mainly we're avoiding the overhead of decoding utf-8
324 // characters into a Rust `char` since it's otherwise
325 // unnecessary work.
326 let mut iter = self.remaining.as_bytes()[2..].iter();
327 while let Some(ch) = iter.next() {
328 match ch {
329 b'(' => {
330 if let Some(b';') = iter.as_slice().get(0) {
331 level += 1;
332 iter.next();
333 }
334 }
335 b';' => {
336 if let Some(b')') = iter.as_slice().get(0) {
337 level -= 1;
338 iter.next();
339 if level == 0 {
340 let len = self.remaining.len() - iter.as_slice().len();
341 let (comment, remaining) = self.remaining.split_at(len);
342 self.remaining = remaining;
343 self.check_confusing_comment(comment)?;
344 return Ok(Some(Token::BlockComment(comment)));
345 }
346 }
347 }
348 _ => {}
349 }
350 }
351 Err(self.error(pos, LexError::DanglingBlockComment))
352 }
353 _ => Ok(Some(Token::LParen(self.split_first_byte()))),
354 },
355
356 b')' => Ok(Some(Token::RParen(self.split_first_byte()))),
357
358 b'"' => {
359 let val = self.string()?;
360 let src = &self.input[pos..self.cur()];
361 return Ok(Some(Token::String(WasmString(Box::new(WasmStringInner {
362 val,
363 src,
364 })))));
365 }
366
367 // https://webassembly.github.io/spec/core/text/lexical.html#white-space
368 b' ' | b'\n' | b'\r' | b'\t' => Ok(Some(Token::Whitespace(self.split_ws()))),
369
370 c @ idchars!() => {
371 let reserved = self.split_while(|b| match b {
372 idchars!() => true,
373 _ => false,
374 });
375
376 // https://webassembly.github.io/spec/core/text/values.html#integers
377 if let Some(number) = self.number(reserved) {
378 Ok(Some(number))
379 // https://webassembly.github.io/spec/core/text/values.html#text-id
380 } else if *c == b'$' && reserved.len() > 1 {
381 Ok(Some(Token::Id(reserved)))
382 // https://webassembly.github.io/spec/core/text/lexical.html#text-keyword
383 } else if b'a' <= *c && *c <= b'z' {
384 Ok(Some(Token::Keyword(reserved)))
385 } else {
386 Ok(Some(Token::Reserved(reserved)))
387 }
388 }
389
390 // This could be a line comment, otherwise `;` is a reserved token.
391 // The second byte is checked to see if it's a `;;` line comment
392 b';' => match self.remaining.as_bytes().get(1) {
393 Some(b';') => {
394 let comment = self.split_until(b'\n');
395 self.check_confusing_comment(comment)?;
396 Ok(Some(Token::LineComment(comment)))
397 }
398 _ => Ok(Some(Token::Reserved(self.split_first_byte()))),
399 },
400
401 // Other known reserved tokens other than `;`
402 b',' | b'[' | b']' | b'{' | b'}' => Ok(Some(Token::Reserved(self.split_first_byte()))),
403
404 _ => {
405 let ch = self.remaining.chars().next().unwrap();
406 Err(self.error(pos, LexError::Unexpected(ch)))
407 }
408 }
409 }
410
split_first_byte(&mut self) -> &'a str411 fn split_first_byte(&mut self) -> &'a str {
412 let (token, remaining) = self.remaining.split_at(1);
413 self.remaining = remaining;
414 token
415 }
416
split_until(&mut self, byte: u8) -> &'a str417 fn split_until(&mut self, byte: u8) -> &'a str {
418 let pos = memchr::memchr(byte, self.remaining.as_bytes()).unwrap_or(self.remaining.len());
419 let (ret, remaining) = self.remaining.split_at(pos);
420 self.remaining = remaining;
421 ret
422 }
423
/// Consumes and returns the leading run of whitespace bytes (`' '`, `'\t'`,
/// `'\r'`, `'\n'`) from the remaining input. The returned slice is empty if
/// the remaining input does not start with whitespace.
fn split_ws(&mut self) -> &'a str {
    // This table is a byte lookup table to determine whether a byte is a
    // whitespace byte. There are only 4 whitespace bytes for the `*.wat`
    // format right now which are ' ', '\t', '\r', and '\n'. These 4 bytes
    // have a '1' in the table below.
    //
    // Due to how utf-8 works (our input is guaranteed to be utf-8) it is
    // known that if these bytes are found they're guaranteed to be the
    // whitespace byte, so they can be safely skipped and we don't have to
    // do full utf-8 decoding. This means that the goal of this function is
    // to find the first non-whitespace byte in `self.remaining`.
    //
    // For now this lookup table seems to be the fastest, but projects like
    // https://github.com/lemire/despacer show other simd algorithms which
    // can possibly accelerate this even more. Note that `*.wat` files often
    // have a lot of whitespace so this function is typically quite hot when
    // parsing inputs.
    #[rustfmt::skip]
    const WS: [u8; 256] = [
        //                                   \t \n       \r
        /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
        /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        //        ' '
        /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0xa0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0xb0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0xc0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0xd0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0xe0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0xf0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    ];
    // Index of the first byte that is not whitespace, or the full length when
    // everything left is whitespace.
    let pos = self
        .remaining
        .as_bytes()
        .iter()
        .position(|b| WS[*b as usize] != 1)
        .unwrap_or(self.remaining.len());
    let (ret, remaining) = self.remaining.split_at(pos);
    self.remaining = remaining;
    ret
}
472
split_while(&mut self, f: impl Fn(u8) -> bool) -> &'a str473 fn split_while(&mut self, f: impl Fn(u8) -> bool) -> &'a str {
474 let pos = self
475 .remaining
476 .as_bytes()
477 .iter()
478 .position(|b| !f(*b))
479 .unwrap_or(self.remaining.len());
480 let (ret, remaining) = self.remaining.split_at(pos);
481 self.remaining = remaining;
482 ret
483 }
484
/// Attempts to interpret `src` as an integer or float literal.
///
/// Returns `Some(Token::Integer(..))` or `Some(Token::Float(..))` when the
/// whole of `src` forms a number (including the special `inf`, `nan` and
/// `nan:0x...` forms), and `None` when it does not.
fn number(&self, src: &'a str) -> Option<Token<'a>> {
    // Peel off an optional leading sign; `num` is the text after it.
    let (sign, num) = if src.starts_with('+') {
        (Some(SignToken::Plus), &src[1..])
    } else if src.starts_with('-') {
        (Some(SignToken::Minus), &src[1..])
    } else {
        (None, src)
    };

    let negative = sign == Some(SignToken::Minus);

    // Handle `inf` and `nan` which are special numbers here
    if num == "inf" {
        return Some(Token::Float(Float(Box::new(FloatInner {
            src,
            val: FloatVal::Inf { negative },
        }))));
    } else if num == "nan" {
        return Some(Token::Float(Float(Box::new(FloatInner {
            src,
            val: FloatVal::Nan {
                val: None,
                negative,
            },
        }))));
    } else if num.starts_with("nan:0x") {
        // `nan:0x<hex>` carries an explicit payload; the hex digits must run
        // to the end of the text and fit in a `u64`.
        let mut it = num[6..].chars();
        let to_parse = skip_undescores(&mut it, false, char::is_ascii_hexdigit)?;
        if it.next().is_some() {
            return None;
        }
        let n = u64::from_str_radix(&to_parse, 16).ok()?;
        return Some(Token::Float(Float(Box::new(FloatInner {
            src,
            val: FloatVal::Nan {
                val: Some(n),
                negative,
            },
        }))));
    }

    // Figure out if we're a hex number or not
    let (mut it, hex, test_valid) = if num.starts_with("0x") {
        (
            num[2..].chars(),
            true,
            char::is_ascii_hexdigit as fn(&char) -> bool,
        )
    } else {
        (
            num.chars(),
            false,
            char::is_ascii_digit as fn(&char) -> bool,
        )
    };

    // Evaluate the first part, moving out all underscores
    let val = skip_undescores(&mut it, negative, test_valid)?;

    // Peek (on a clone, so nothing is consumed) to see whether any text
    // follows the leading digits.
    match it.clone().next() {
        // If we're followed by something this may be a float so keep going.
        Some(_) => {}

        // Otherwise this is a valid integer literal!
        None => {
            return Some(Token::Integer(Integer(Box::new(IntegerInner {
                sign,
                src,
                val,
                hex,
            }))))
        }
    }

    // A number can optionally be after the decimal so only actually try to
    // parse one if it's there.
    let decimal = if it.clone().next() == Some('.') {
        it.next();
        match it.clone().next() {
            Some(c) if test_valid(&c) => Some(skip_undescores(&mut it, false, test_valid)?),
            Some(_) | None => None,
        }
    } else {
        None
    };

    // Figure out if there's an exponential part here to make a float, and
    // if so parse it but defer its actual calculation until later.
    //
    // Hex floats use `p`/`P` and decimal floats use `e`/`E`; in both cases
    // the exponent digits themselves are decimal.
    let exponent = match (hex, it.next()) {
        (true, Some('p')) | (true, Some('P')) | (false, Some('e')) | (false, Some('E')) => {
            let negative = match it.clone().next() {
                Some('-') => {
                    it.next();
                    true
                }
                Some('+') => {
                    it.next();
                    false
                }
                _ => false,
            };
            Some(skip_undescores(&mut it, negative, char::is_ascii_digit)?)
        }
        (_, None) => None,
        _ => return None,
    };

    // We should have eaten everything by now, if not then this is surely
    // not a float or integer literal.
    if it.next().is_some() {
        return None;
    }

    return Some(Token::Float(Float(Box::new(FloatInner {
        src,
        val: FloatVal::Val {
            hex,
            integral: val,
            exponent,
            decimal,
        },
    }))));

    // Consumes a run of digits accepted by `good` from `it`, allowing single
    // underscores between digits, and returns the digits with the
    // underscores removed (prefixed with `-` when `negative` is set).
    //
    // Returns `None` if the run does not start with a digit, or if it ends
    // with an underscore. The text is borrowed from the input unless an
    // underscore or the `-` prefix forces a fresh `String` to be built.
    fn skip_undescores<'a>(
        it: &mut str::Chars<'a>,
        negative: bool,
        good: fn(&char) -> bool,
    ) -> Option<Cow<'a, str>> {
        enum State {
            // Every digit so far is a contiguous prefix of `input`.
            Raw,
            // Digits are being copied into an owned buffer.
            Collecting(String),
        }
        let mut last_underscore = false;
        let mut state = if negative {
            State::Collecting("-".to_string())
        } else {
            State::Raw
        };
        let input = it.as_str();
        let first = it.next()?;
        if !good(&first) {
            return None;
        }
        if let State::Collecting(s) = &mut state {
            s.push(first);
        }
        // Number of digits accepted so far; only used to slice `input` while
        // still in the `Raw` state.
        let mut last = 1;
        while let Some(c) = it.clone().next() {
            // A second consecutive underscore fails this test and then fails
            // `good` below, ending the loop with `last_underscore` still set.
            if c == '_' && !last_underscore {
                if let State::Raw = state {
                    state = State::Collecting(input[..last].to_string());
                }
                it.next();
                last_underscore = true;
                continue;
            }
            if !good(&c) {
                break;
            }
            if let State::Collecting(s) = &mut state {
                s.push(c);
            }
            last_underscore = false;
            it.next();
            last += 1;
        }
        if last_underscore {
            return None;
        }
        Some(match state {
            State::Raw => input[..last].into(),
            State::Collecting(s) => s.into(),
        })
    }
}
660
661 /// Verifies that `comment`, which is about to be returned, has a "confusing
662 /// unicode character" in it and should instead be transformed into an
663 /// error.
check_confusing_comment(&self, comment: &str) -> Result<(), Error>664 fn check_confusing_comment(&self, comment: &str) -> Result<(), Error> {
665 if self.allow_confusing_unicode {
666 return Ok(());
667 }
668
669 // In an effort to avoid utf-8 decoding the entire `comment` the search
670 // here is a bit more optimized. This checks for the `0xe2` byte because
671 // in the utf-8 encoding that's the leading encoding byte for all
672 // "confusing characters". Each instance of 0xe2 is checked to see if it
673 // starts a confusing character, and if so that's returned.
674 //
675 // Also note that 0xe2 will never be found in the middle of a codepoint,
676 // it's always the start of a codepoint. This means that if our special
677 // characters show up they're guaranteed to start with 0xe2 bytes.
678 let bytes = comment.as_bytes();
679 for pos in memchr::Memchr::new(0xe2, bytes) {
680 if let Some(c) = comment[pos..].chars().next() {
681 if is_confusing_unicode(c) {
682 // Note that `self.cur()` accounts for already having
683 // parsed `comment`, so we move backwards to where
684 // `comment` started and then add the index within
685 // `comment`.
686 let pos = self.cur() - comment.len() + pos;
687 return Err(self.error(pos, LexError::ConfusingUnicode(c)));
688 }
689 }
690 }
691
692 Ok(())
693 }
694
695 /// Reads everything for a literal string except the leading `"`. Returns
696 /// the string value that has been read.
697 ///
698 /// https://webassembly.github.io/spec/core/text/values.html#text-string
string(&mut self) -> Result<Cow<'a, [u8]>, Error>699 fn string(&mut self) -> Result<Cow<'a, [u8]>, Error> {
700 let mut it = self.remaining[1..].chars();
701 let result = Lexer::parse_str(&mut it, self.allow_confusing_unicode);
702 let end = self.input.len() - it.as_str().len();
703 self.remaining = &self.input[end..];
704 result.map_err(|e| {
705 let err_pos = match &e {
706 LexError::UnexpectedEof => self.input.len(),
707 _ => self.input[..end].char_indices().next_back().unwrap().0,
708 };
709 self.error(err_pos, e)
710 })
711 }
712
parse_str( it: &mut str::Chars<'a>, allow_confusing_unicode: bool, ) -> Result<Cow<'a, [u8]>, LexError>713 fn parse_str(
714 it: &mut str::Chars<'a>,
715 allow_confusing_unicode: bool,
716 ) -> Result<Cow<'a, [u8]>, LexError> {
717 enum State {
718 Start,
719 String(Vec<u8>),
720 }
721 let orig = it.as_str();
722 let mut state = State::Start;
723 loop {
724 match it.next().ok_or(LexError::UnexpectedEof)? {
725 '"' => break,
726 '\\' => {
727 match state {
728 State::String(_) => {}
729 State::Start => {
730 let pos = orig.len() - it.as_str().len() - 1;
731 state = State::String(orig[..pos].as_bytes().to_vec());
732 }
733 }
734 let buf = match &mut state {
735 State::String(b) => b,
736 State::Start => unreachable!(),
737 };
738 match it.next().ok_or(LexError::UnexpectedEof)? {
739 '"' => buf.push(b'"'),
740 '\'' => buf.push(b'\''),
741 't' => buf.push(b'\t'),
742 'n' => buf.push(b'\n'),
743 'r' => buf.push(b'\r'),
744 '\\' => buf.push(b'\\'),
745 'u' => {
746 Lexer::must_eat_char(it, '{')?;
747 let n = Lexer::hexnum(it)?;
748 let c = char::from_u32(n)
749 .ok_or_else(|| LexError::InvalidUnicodeValue(n))?;
750 buf.extend(c.encode_utf8(&mut [0; 4]).as_bytes());
751 Lexer::must_eat_char(it, '}')?;
752 }
753 c1 if c1.is_ascii_hexdigit() => {
754 let c2 = Lexer::hexdigit(it)?;
755 buf.push(to_hex(c1) * 16 + c2);
756 }
757 c => return Err(LexError::InvalidStringEscape(c)),
758 }
759 }
760 c if (c as u32) < 0x20 || c as u32 == 0x7f => {
761 return Err(LexError::InvalidStringElement(c))
762 }
763 c if !allow_confusing_unicode && is_confusing_unicode(c) => {
764 return Err(LexError::ConfusingUnicode(c))
765 }
766 c => match &mut state {
767 State::Start => {}
768 State::String(v) => {
769 v.extend(c.encode_utf8(&mut [0; 4]).as_bytes());
770 }
771 },
772 }
773 }
774 match state {
775 State::Start => Ok(orig[..orig.len() - it.as_str().len() - 1].as_bytes().into()),
776 State::String(s) => Ok(s.into()),
777 }
778 }
779
hexnum(it: &mut str::Chars<'_>) -> Result<u32, LexError>780 fn hexnum(it: &mut str::Chars<'_>) -> Result<u32, LexError> {
781 let n = Lexer::hexdigit(it)?;
782 let mut last_underscore = false;
783 let mut n = n as u32;
784 while let Some(c) = it.clone().next() {
785 if c == '_' {
786 it.next();
787 last_underscore = true;
788 continue;
789 }
790 if !c.is_ascii_hexdigit() {
791 break;
792 }
793 last_underscore = false;
794 it.next();
795 n = n
796 .checked_mul(16)
797 .and_then(|n| n.checked_add(to_hex(c) as u32))
798 .ok_or(LexError::NumberTooBig)?;
799 }
800 if last_underscore {
801 return Err(LexError::LoneUnderscore);
802 }
803 Ok(n)
804 }
805
806 /// Reads a hexidecimal digit from the input stream, returning where it's
807 /// defined and the hex value. Returns an error on EOF or an invalid hex
808 /// digit.
hexdigit(it: &mut str::Chars<'_>) -> Result<u8, LexError>809 fn hexdigit(it: &mut str::Chars<'_>) -> Result<u8, LexError> {
810 let ch = Lexer::must_char(it)?;
811 if ch.is_ascii_hexdigit() {
812 Ok(to_hex(ch))
813 } else {
814 Err(LexError::InvalidHexDigit(ch))
815 }
816 }
817
818 /// Reads the next character from the input string and where it's located,
819 /// returning an error if the input stream is empty.
must_char(it: &mut str::Chars<'_>) -> Result<char, LexError>820 fn must_char(it: &mut str::Chars<'_>) -> Result<char, LexError> {
821 it.next().ok_or(LexError::UnexpectedEof)
822 }
823
824 /// Expects that a specific character must be read next
must_eat_char(it: &mut str::Chars<'_>, wanted: char) -> Result<(), LexError>825 fn must_eat_char(it: &mut str::Chars<'_>, wanted: char) -> Result<(), LexError> {
826 let found = Lexer::must_char(it)?;
827 if wanted == found {
828 Ok(())
829 } else {
830 Err(LexError::Expected { wanted, found })
831 }
832 }
833
834 /// Returns the current position of our iterator through the input string
cur(&self) -> usize835 fn cur(&self) -> usize {
836 self.input.len() - self.remaining.len()
837 }
838
839 /// Creates an error at `pos` with the specified `kind`
error(&self, pos: usize, kind: LexError) -> Error840 fn error(&self, pos: usize, kind: LexError) -> Error {
841 Error::lex(Span { offset: pos }, self.input, kind)
842 }
843 }
844
845 impl<'a> Iterator for Lexer<'a> {
846 type Item = Result<Token<'a>, Error>;
847
next(&mut self) -> Option<Self::Item>848 fn next(&mut self) -> Option<Self::Item> {
849 self.parse().transpose()
850 }
851 }
852
853 impl<'a> Token<'a> {
854 /// Returns the original source text for this token.
src(&self) -> &'a str855 pub fn src(&self) -> &'a str {
856 match self {
857 Token::Whitespace(s) => s,
858 Token::BlockComment(s) => s,
859 Token::LineComment(s) => s,
860 Token::LParen(s) => s,
861 Token::RParen(s) => s,
862 Token::String(s) => s.src(),
863 Token::Id(s) => s,
864 Token::Keyword(s) => s,
865 Token::Reserved(s) => s,
866 Token::Integer(i) => i.src(),
867 Token::Float(f) => f.src(),
868 }
869 }
870 }
871
872 impl<'a> Integer<'a> {
873 /// Returns the sign token for this integer.
sign(&self) -> Option<SignToken>874 pub fn sign(&self) -> Option<SignToken> {
875 self.0.sign
876 }
877
878 /// Returns the original source text for this integer.
src(&self) -> &'a str879 pub fn src(&self) -> &'a str {
880 self.0.src
881 }
882
883 /// Returns the value string that can be parsed for this integer, as well as
884 /// the base that it should be parsed in
val(&self) -> (&str, u32)885 pub fn val(&self) -> (&str, u32) {
886 (&self.0.val, if self.0.hex { 16 } else { 10 })
887 }
888 }
889
890 impl<'a> Float<'a> {
891 /// Returns the original source text for this integer.
src(&self) -> &'a str892 pub fn src(&self) -> &'a str {
893 self.0.src
894 }
895
896 /// Returns a parsed value of this float with all of the components still
897 /// listed as strings.
val(&self) -> &FloatVal<'a>898 pub fn val(&self) -> &FloatVal<'a> {
899 &self.0.val
900 }
901 }
902
903 impl<'a> WasmString<'a> {
904 /// Returns the original source text for this string.
src(&self) -> &'a str905 pub fn src(&self) -> &'a str {
906 self.0.src
907 }
908
909 /// Returns a parsed value, as a list of bytes, for this string.
val(&self) -> &[u8]910 pub fn val(&self) -> &[u8] {
911 &self.0.val
912 }
913 }
914
/// Converts a hexadecimal digit character into its numeric value.
///
/// Callers are expected to pass only characters that are hex digits; any
/// other character is treated as if it were a decimal digit.
fn to_hex(c: char) -> u8 {
    let (base, offset) = match c {
        'a'..='f' => (b'a', 10),
        'A'..='F' => (b'A', 10),
        _ => (b'0', 0),
    };
    c as u8 - base + offset
}
922
923 impl fmt::Display for LexError {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result924 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
925 use LexError::*;
926 match self {
927 DanglingBlockComment => f.write_str("unterminated block comment")?,
928 Unexpected(c) => write!(f, "unexpected character '{}'", escape_char(*c))?,
929 InvalidStringElement(c) => {
930 write!(f, "invalid character in string '{}'", escape_char(*c))?
931 }
932 InvalidStringEscape(c) => write!(f, "invalid string escape '{}'", escape_char(*c))?,
933 InvalidHexDigit(c) => write!(f, "invalid hex digit '{}'", escape_char(*c))?,
934 InvalidDigit(c) => write!(f, "invalid decimal digit '{}'", escape_char(*c))?,
935 Expected { wanted, found } => write!(
936 f,
937 "expected '{}' but found '{}'",
938 escape_char(*wanted),
939 escape_char(*found)
940 )?,
941 UnexpectedEof => write!(f, "unexpected end-of-file")?,
942 NumberTooBig => f.write_str("number is too big to parse")?,
943 InvalidUnicodeValue(c) => write!(f, "invalid unicode scalar value 0x{:x}", c)?,
944 LoneUnderscore => write!(f, "bare underscore in numeric literal")?,
945 ConfusingUnicode(c) => write!(f, "likely-confusing unicode character found {:?}", c)?,
946 __Nonexhaustive => unreachable!(),
947 }
948 Ok(())
949 }
950 }
951
/// Renders `c` for inclusion in an error message.
///
/// Tab, carriage return, newline, backslash and single quote get a
/// backslash escape; a double quote and all other printable ASCII
/// characters are emitted as-is; everything else is written as a `\u{...}`
/// escape.
fn escape_char(c: char) -> String {
    let fixed = match c {
        '\t' => "\\t",
        '\r' => "\\r",
        '\n' => "\\n",
        '\\' => "\\\\",
        '\'' => "\\'",
        '"' => "\"",
        '\x20'..='\x7e' => return c.to_string(),
        _ => return c.escape_unicode().to_string(),
    };
    fixed.to_string()
}
964
/// This is an attempt to protect against the "trojan source" [1] problem where
/// unicode characters can cause editors to render source code differently
/// for humans than the compiler itself sees.
///
/// To mitigate this issue, and because it's relatively rare in practice,
/// this simply rejects characters of that form.
///
/// Returns `true` for the bidirectional embedding, override and isolate
/// controls together with the characters that terminate them, and `false`
/// for every other character.
///
/// Every character listed here encodes to utf-8 with a leading 0xe2 byte,
/// which lets callers pre-filter on that byte before decoding.
///
/// [1]: https://www.trojansource.codes/
fn is_confusing_unicode(ch: char) -> bool {
    match ch {
        // LRE, RLE, PDF, LRO, RLO
        '\u{202a}' | '\u{202b}' | '\u{202c}' | '\u{202d}' | '\u{202e}'
        // LRI, RLI, FSI, PDI
        | '\u{2066}' | '\u{2067}' | '\u{2068}' | '\u{2069}'
        // U+206C was historically listed in place of U+202C (PDF); it is
        // still rejected so that nothing previously refused becomes accepted.
        | '\u{206c}' => true,
        _ => false,
    }
}
980
#[cfg(test)]
mod tests {
    use super::*;

    // The first token of whitespace-led input is a `Whitespace` token that
    // ends before the first non-whitespace character.
    #[test]
    fn ws_smoke() {
        fn get_whitespace(input: &str) -> &str {
            match Lexer::new(input).parse().expect("no first token") {
                Some(Token::Whitespace(s)) => s,
                other => panic!("unexpected {:?}", other),
            }
        }
        assert_eq!(get_whitespace(" "), " ");
        assert_eq!(get_whitespace(" "), " ");
        assert_eq!(get_whitespace(" \n "), " \n ");
        assert_eq!(get_whitespace(" x"), " ");
        assert_eq!(get_whitespace(" ;"), " ");
    }

    // A `;;` comment runs up to, but does not include, the next newline.
    #[test]
    fn line_comment_smoke() {
        fn get_line_comment(input: &str) -> &str {
            match Lexer::new(input).parse().expect("no first token") {
                Some(Token::LineComment(s)) => s,
                other => panic!("unexpected {:?}", other),
            }
        }
        assert_eq!(get_line_comment(";;"), ";;");
        assert_eq!(get_line_comment(";; xyz"), ";; xyz");
        assert_eq!(get_line_comment(";; xyz\nabc"), ";; xyz");
        assert_eq!(get_line_comment(";;\nabc"), ";;");
        assert_eq!(get_line_comment(";; \nabc"), ";; ");
    }

    // A `(; ... ;)` comment is one token, including any nested block comment.
    #[test]
    fn block_comment_smoke() {
        fn get_block_comment(input: &str) -> &str {
            match Lexer::new(input).parse().expect("no first token") {
                Some(Token::BlockComment(s)) => s,
                other => panic!("unexpected {:?}", other),
            }
        }
        assert_eq!(get_block_comment("(;;)"), "(;;)");
        assert_eq!(get_block_comment("(; ;)"), "(; ;)");
        assert_eq!(get_block_comment("(; (;;) ;)"), "(; (;;) ;)");
    }

    // Lexes `input` and returns its first token, panicking if lexing fails
    // or the input yields no token at all.
    fn get_token(input: &str) -> Token<'_> {
        Lexer::new(input)
            .parse()
            .expect("no first token")
            .expect("no token")
    }

    // Only the first `(` is consumed.
    #[test]
    fn lparen() {
        assert_eq!(get_token("(("), Token::LParen("("));
    }

    // Only the leading `)` is consumed.
    #[test]
    fn rparen() {
        assert_eq!(get_token(")("), Token::RParen(")"));
    }

    // String tokens keep the full source text in `src()` and expose the
    // decoded bytes through `val()`.
    #[test]
    fn strings() {
        fn get_string(input: &str) -> Vec<u8> {
            match get_token(input) {
                Token::String(s) => {
                    assert_eq!(input, s.src());
                    s.val().to_vec()
                }
                other => panic!("not string {:?}", other),
            }
        }
        assert_eq!(&*get_string("\"\""), b"");
        assert_eq!(&*get_string("\"a\""), b"a");
        assert_eq!(&*get_string("\"a b c d\""), b"a b c d");
        assert_eq!(&*get_string("\"\\\"\""), b"\"");
        assert_eq!(&*get_string("\"\\'\""), b"'");
        assert_eq!(&*get_string("\"\\n\""), b"\n");
        assert_eq!(&*get_string("\"\\t\""), b"\t");
        assert_eq!(&*get_string("\"\\r\""), b"\r");
        assert_eq!(&*get_string("\"\\\\\""), b"\\");
        assert_eq!(&*get_string("\"\\01\""), &[1]);
        assert_eq!(&*get_string("\"\\u{1}\""), &[1]);
        assert_eq!(
            &*get_string("\"\\u{0f3}\""),
            '\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes()
        );
        assert_eq!(
            &*get_string("\"\\u{0_f_3}\""),
            '\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes()
        );

        // Every two-hex-digit escape decodes to the corresponding raw byte.
        for i in 0..=255i32 {
            let s = format!("\"\\{:02x}\"", i);
            assert_eq!(&*get_string(&s), &[i as u8]);
        }
    }

    // `$`-prefixed identifiers end at whitespace or at a following comment.
    #[test]
    fn id() {
        fn get_id(input: &str) -> &str {
            match get_token(input) {
                Token::Id(s) => s,
                other => panic!("not id {:?}", other),
            }
        }
        assert_eq!(get_id("$x"), "$x");
        assert_eq!(get_id("$xyz"), "$xyz");
        assert_eq!(get_id("$x_z"), "$x_z");
        assert_eq!(get_id("$0^"), "$0^");
        assert_eq!(get_id("$0^;;"), "$0^");
        assert_eq!(get_id("$0^ ;;"), "$0^");
    }

    // Keywords end at trailing whitespace.
    #[test]
    fn keyword() {
        fn get_keyword(input: &str) -> &str {
            match get_token(input) {
                Token::Keyword(s) => s,
                other => panic!("not keyword {:?}", other),
            }
        }
        assert_eq!(get_keyword("x"), "x");
        assert_eq!(get_keyword("xyz"), "xyz");
        assert_eq!(get_keyword("x_z"), "x_z");
        assert_eq!(get_keyword("x_z "), "x_z");
        assert_eq!(get_keyword("x_z "), "x_z");
    }

    // A bare `$` and `^_x` both lex as `Reserved` tokens.
    #[test]
    fn reserved() {
        fn get_reserved(input: &str) -> &str {
            match get_token(input) {
                Token::Reserved(s) => s,
                other => panic!("not reserved {:?}", other),
            }
        }
        assert_eq!(get_reserved("$ "), "$");
        assert_eq!(get_reserved("^_x "), "^_x");
    }

    // Integer tokens keep the original text in `src()`; the value string
    // drops a leading `+`, underscores, and the `0x` prefix, but keeps `-`.
    #[test]
    fn integer() {
        fn get_integer(input: &str) -> String {
            match get_token(input) {
                Token::Integer(i) => {
                    assert_eq!(input, i.src());
                    i.val().0.to_string()
                }
                other => panic!("not integer {:?}", other),
            }
        }
        assert_eq!(get_integer("1"), "1");
        assert_eq!(get_integer("0"), "0");
        assert_eq!(get_integer("-1"), "-1");
        assert_eq!(get_integer("+1"), "1");
        assert_eq!(get_integer("+1_000"), "1000");
        assert_eq!(get_integer("+1_0_0_0"), "1000");
        assert_eq!(get_integer("+0x10"), "10");
        assert_eq!(get_integer("-0x10"), "-10");
        assert_eq!(get_integer("0x10"), "10");
    }

    // Float tokens keep the original text in `src()` and are broken down
    // into a `FloatVal` covering NaN, infinity, and decimal/hex values.
    #[test]
    fn float() {
        fn get_float(input: &str) -> FloatVal<'_> {
            match get_token(input) {
                Token::Float(i) => {
                    assert_eq!(input, i.src());
                    i.0.val
                }
                other => panic!("not float {:?}", other),
            }
        }
        assert_eq!(
            get_float("nan"),
            FloatVal::Nan {
                val: None,
                negative: false
            },
        );
        assert_eq!(
            get_float("-nan"),
            FloatVal::Nan {
                val: None,
                negative: true,
            },
        );
        assert_eq!(
            get_float("+nan"),
            FloatVal::Nan {
                val: None,
                negative: false,
            },
        );
        assert_eq!(
            get_float("+nan:0x1"),
            FloatVal::Nan {
                val: Some(1),
                negative: false,
            },
        );
        assert_eq!(
            get_float("nan:0x7f_ffff"),
            FloatVal::Nan {
                val: Some(0x7fffff),
                negative: false,
            },
        );
        assert_eq!(get_float("inf"), FloatVal::Inf { negative: false });
        assert_eq!(get_float("-inf"), FloatVal::Inf { negative: true });
        assert_eq!(get_float("+inf"), FloatVal::Inf { negative: false });

        assert_eq!(
            get_float("1.2"),
            FloatVal::Val {
                integral: "1".into(),
                decimal: Some("2".into()),
                exponent: None,
                hex: false,
            },
        );
        assert_eq!(
            get_float("1.2e3"),
            FloatVal::Val {
                integral: "1".into(),
                decimal: Some("2".into()),
                exponent: Some("3".into()),
                hex: false,
            },
        );
        assert_eq!(
            get_float("-1_2.1_1E+0_1"),
            FloatVal::Val {
                integral: "-12".into(),
                decimal: Some("11".into()),
                exponent: Some("01".into()),
                hex: false,
            },
        );
        assert_eq!(
            get_float("+1_2.1_1E-0_1"),
            FloatVal::Val {
                integral: "12".into(),
                decimal: Some("11".into()),
                exponent: Some("-01".into()),
                hex: false,
            },
        );
        assert_eq!(
            get_float("0x1_2.3_4p5_6"),
            FloatVal::Val {
                integral: "12".into(),
                decimal: Some("34".into()),
                exponent: Some("56".into()),
                hex: true,
            },
        );
        assert_eq!(
            get_float("+0x1_2.3_4P-5_6"),
            FloatVal::Val {
                integral: "12".into(),
                decimal: Some("34".into()),
                exponent: Some("-56".into()),
                hex: true,
            },
        );
        assert_eq!(
            get_float("1."),
            FloatVal::Val {
                integral: "1".into(),
                decimal: None,
                exponent: None,
                hex: false,
            },
        );
        assert_eq!(
            get_float("0x1p-24"),
            FloatVal::Val {
                integral: "1".into(),
                decimal: None,
                exponent: Some("-24".into()),
                hex: true,
            },
        );
    }
}
1271