1 //! Definition of a lexer for the WebAssembly text format.
2 //!
//! This module provides a [`Lexer`][] type which is an iterator over the raw
//! tokens of a WebAssembly text file. A [`Lexer`][] accounts for every single
//! byte in a WebAssembly text file, returning tokens even for comments and
//! whitespace. Typically you'll ignore comments and whitespace, however.
7 //!
8 //! If you'd like to iterate over the tokens in a file you can do so via:
9 //!
10 //! ```
11 //! # fn foo() -> Result<(), wast::Error> {
12 //! use wast::lexer::Lexer;
13 //!
14 //! let wat = "(module (func $foo))";
15 //! for token in Lexer::new(wat) {
16 //!     println!("{:?}", token?);
17 //! }
18 //! # Ok(())
19 //! # }
20 //! ```
21 //!
22 //! Note that you'll typically not use this module but will rather use
23 //! [`ParseBuffer`](crate::parser::ParseBuffer) instead.
24 //!
25 //! [`Lexer`]: crate::lexer::Lexer
26 
27 use crate::{Error, Span};
28 use std::borrow::Cow;
29 use std::char;
30 use std::fmt;
31 use std::str;
32 
/// A structure used to lex the s-expression syntax of WAT files.
///
/// This structure is used to generate [`Token`] items, which should account for
/// every single byte of the input as we iterate over it. Non-lexable text is
/// reported as an error described by a [`LexError`].
#[derive(Clone)]
pub struct Lexer<'a> {
    /// The portion of `input` that has not been lexed yet.
    remaining: &'a str,
    /// The complete source text this lexer was created with.
    input: &'a str,
    /// When `true`, "confusing" unicode characters found in comments and
    /// string literals are accepted rather than reported as errors.
    allow_confusing_unicode: bool,
}
44 
/// A fragment of source lex'd from an input string.
///
/// This enumeration contains all kinds of fragments, including comments and
/// whitespace. For most cases you'll probably ignore these and simply look at
/// tokens.
#[derive(Debug, PartialEq)]
pub enum Token<'a> {
    /// A line comment, starting with `;;` and running up to (but not
    /// including) the next `\n` or the end of the input. The payload is the
    /// source text of the comment, including the leading `;;`.
    LineComment(&'a str),

    /// A block comment, surrounded by `(;` and `;)`. Note that these can be
    /// nested. The payload is the source text including both delimiters.
    BlockComment(&'a str),

    /// A fragment of source that represents whitespace (a run of spaces,
    /// tabs, carriage returns and newlines).
    Whitespace(&'a str),

    /// A left-parenthesis, including the source text for where it comes from.
    LParen(&'a str),
    /// A right-parenthesis, including the source text for where it comes from.
    RParen(&'a str),

    /// A string literal, which is actually a list of bytes.
    String(WasmString<'a>),

    /// An identifier (like `$foo`).
    ///
    /// All identifiers start with `$`, followed by at least one more `idchar`,
    /// and the payload here is the original source text.
    Id(&'a str),

    /// A keyword: a run of `idchar` symbols that starts with a lowercase ASCII
    /// letter and is not a number.
    ///
    /// The payload here is the original source text.
    Keyword(&'a str),

    /// A reserved series of `idchar` symbols, or one of the standalone
    /// reserved characters `;`, `,`, `[`, `]`, `{` and `}`. Unknown what this
    /// is meant to be used for, you'll probably generate an error about an
    /// unexpected token.
    Reserved(&'a str),

    /// An integer.
    Integer(Integer<'a>),

    /// A float.
    Float(Float<'a>),
}
91 
/// Errors that can be generated while lexing.
///
/// All lexing errors have line/column/position information as well as a
/// `LexError` indicating what kind of error happened while lexing.
#[derive(Debug, Clone, PartialEq)]
pub enum LexError {
    /// A dangling block comment was found with an unbalanced `(;` which was
    /// never terminated in the file.
    DanglingBlockComment,

    /// An unexpected character was encountered when generally parsing and
    /// looking for something else.
    Unexpected(char),

    /// An invalid `char` in a string literal was found.
    InvalidStringElement(char),

    /// An invalid string escape letter was found (the thing after the `\` in
    /// string literals).
    InvalidStringEscape(char),

    /// An invalid hexadecimal digit was found.
    InvalidHexDigit(char),

    /// An invalid base-10 digit was found.
    InvalidDigit(char),

    /// Parsing expected `wanted` but ended up finding `found` instead where the
    /// two characters aren't the same.
    Expected {
        /// The character that was expected to be found.
        wanted: char,
        /// The character that was actually found.
        found: char,
    },

    /// We needed to parse more but EOF (or end of the string) was encountered.
    UnexpectedEof,

    /// A number failed to parse because it was too big to fit within the target
    /// type.
    NumberTooBig,

    /// An invalid unicode value was found in a `\u{...}` escape in a string,
    /// only valid unicode scalars can be escaped that way.
    InvalidUnicodeValue(u32),

    /// A lone underscore was found when parsing a number, since underscores
    /// should always be preceded and succeeded with a digit of some form.
    LoneUnderscore,

    /// A "confusing" unicode character is present in a comment or a string
    /// literal, such as a character that changes the direction text is
    /// typically displayed in editors. This could cause the human-read
    /// version to behave differently than the compiler-visible version, so
    /// these are simply rejected for now.
    ConfusingUnicode(char),

    // Hidden placeholder variant; not documented publicly.
    #[doc(hidden)]
    __Nonexhaustive,
}
153 
/// A sign token for an integer.
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum SignToken {
    /// Plus sign: `+`.
    Plus,
    /// Minus sign: `-`.
    Minus,
}
162 
/// A parsed integer, signed or unsigned.
///
/// Methods can be used to access the value of the integer.
#[derive(Debug, PartialEq)]
pub struct Integer<'a>(Box<IntegerInner<'a>>);
168 
/// Boxed payload of an [`Integer`] token.
#[derive(Debug, PartialEq)]
struct IntegerInner<'a> {
    /// The explicit sign written in the source, if there was one.
    sign: Option<SignToken>,
    /// The original source text of the whole literal, sign included.
    src: &'a str,
    /// The digits of the literal with underscores and any `0x` prefix
    /// removed; a leading `-` is kept when the literal is negative.
    val: Cow<'a, str>,
    /// Whether the digits in `val` are hexadecimal.
    hex: bool,
}
176 
/// A parsed float.
///
/// Methods can be used to access the value of the float.
#[derive(Debug, PartialEq)]
pub struct Float<'a>(Box<FloatInner<'a>>);
182 
/// Boxed payload of a [`Float`] token.
#[derive(Debug, PartialEq)]
struct FloatInner<'a> {
    /// The original source text of the whole literal, sign included.
    src: &'a str,
    /// The parsed, but not yet evaluated, pieces of the literal.
    val: FloatVal<'a>,
}
188 
/// A parsed string literal, holding both its source text and its decoded
/// bytes.
#[derive(Debug, PartialEq)]
pub struct WasmString<'a>(Box<WasmStringInner<'a>>);
192 
/// Boxed payload of a [`WasmString`] token.
#[derive(Debug, PartialEq)]
struct WasmStringInner<'a> {
    /// The slice of the original input that the literal was lexed from.
    src: &'a str,
    /// The bytes of the string after escapes have been decoded; borrowed from
    /// the input when the literal contains no escapes.
    val: Cow<'a, [u8]>,
}
198 
/// Possible parsed float values.
#[derive(Debug, PartialEq)]
pub enum FloatVal<'a> {
    /// A float `NaN` representation.
    Nan {
        /// The specific payload bits to encode for this float, if a
        /// `nan:0x...` payload was written.
        val: Option<u64>,
        /// Whether or not this is a negative `NaN`.
        negative: bool,
    },
    /// An infinite float representation.
    Inf {
        #[allow(missing_docs)]
        negative: bool,
    },
    /// A parsed and separated floating point value.
    Val {
        /// Whether or not the `integral` and `decimal` are specified in hex.
        hex: bool,
        /// The float parts before the `.`
        integral: Cow<'a, str>,
        /// The float parts after the `.`
        decimal: Option<Cow<'a, str>>,
        /// The exponent to multiply this `integral.decimal` portion of the
        /// float by. If `hex` is true this is `2^exponent` and otherwise it's
        /// `10^exponent`.
        exponent: Option<Cow<'a, str>>,
    },
}
228 
// Expands to an or-pattern matching every ASCII byte that counts as an
// `idchar`, so it can be used directly in `match` arms over bytes.
//
// https://webassembly.github.io/spec/core/text/values.html#text-idchar
macro_rules! idchars {
    () => {
        b'0'..=b'9'
        | b'A'..=b'Z'
        | b'a'..=b'z'
        | b'!'
        | b'#'
        | b'$'
        | b'%'
        | b'&'
        | b'\''
        | b'*'
        | b'+'
        | b'-'
        | b'.'
        | b'/'
        | b':'
        | b'<'
        | b'='
        | b'>'
        | b'?'
        | b'@'
        | b'\\'
        | b'^'
        | b'_'
        | b'`'
        | b'|'
        | b'~'
    }
}
260 
261 impl<'a> Lexer<'a> {
    /// Creates a new lexer which will lex the `input` source string.
    ///
    /// Lexing starts at the beginning of `input`, and "confusing" unicode
    /// characters are rejected until configured otherwise.
    pub fn new(input: &str) -> Lexer<'_> {
        Lexer {
            remaining: input,
            input,
            allow_confusing_unicode: false,
        }
    }
270 
    /// Returns the original source input that we're lexing.
    ///
    /// This is always the full string the lexer was created with, regardless
    /// of how much of it has already been lexed.
    pub fn input(&self) -> &'a str {
        self.input
    }
275 
    /// Configures whether "confusing" unicode characters are allowed while
    /// lexing.
    ///
    /// If allowed then no error will happen if these characters are found, but
    /// otherwise if disallowed a lex error will be produced when these
    /// characters are found. Confusing characters are denied by default.
    ///
    /// For now "confusing characters" are primarily related to the "trojan
    /// source" problem where it refers to characters which cause humans to read
    /// text differently than this lexer, such as characters that alter the
    /// left-to-right display of the source code.
    ///
    /// Returns `self` so that configuration calls can be chained.
    pub fn allow_confusing_unicode(&mut self, allow: bool) -> &mut Self {
        self.allow_confusing_unicode = allow;
        self
    }
291 
292     /// Lexes the next token in the input.
293     ///
294     /// Returns `Some` if a token is found or `None` if we're at EOF.
295     ///
296     /// # Errors
297     ///
298     /// Returns an error if the input is malformed.
parse(&mut self) -> Result<Option<Token<'a>>, Error>299     pub fn parse(&mut self) -> Result<Option<Token<'a>>, Error> {
300         let pos = self.cur();
301         // This `match` generally parses the grammar specified at
302         //
303         // https://webassembly.github.io/spec/core/text/lexical.html#text-token
304         let byte = match self.remaining.as_bytes().get(0) {
305             Some(b) => b,
306             None => return Ok(None),
307         };
308 
309         match byte {
310             // Open-parens check the next character to see if this is the start
311             // of a block comment, otherwise it's just a bland left-paren
312             // token.
313             b'(' => match self.remaining.as_bytes().get(1) {
314                 Some(b';') => {
315                     let mut level = 1;
316                     // Note that we're doing a byte-level search here for the
317                     // close-delimiter of `;)`. The actual source text is utf-8
318                     // encode in `self.remaining` but due to how utf-8 works we
319                     // can safely search for an ASCII byte since it'll never
320                     // otherwise appear in the middle of a codepoint and if we
321                     // find it then it's guaranteed to be the right byte.
322                     //
323                     // Mainly we're avoiding the overhead of decoding utf-8
324                     // characters into a Rust `char` since it's otherwise
325                     // unnecessary work.
326                     let mut iter = self.remaining.as_bytes()[2..].iter();
327                     while let Some(ch) = iter.next() {
328                         match ch {
329                             b'(' => {
330                                 if let Some(b';') = iter.as_slice().get(0) {
331                                     level += 1;
332                                     iter.next();
333                                 }
334                             }
335                             b';' => {
336                                 if let Some(b')') = iter.as_slice().get(0) {
337                                     level -= 1;
338                                     iter.next();
339                                     if level == 0 {
340                                         let len = self.remaining.len() - iter.as_slice().len();
341                                         let (comment, remaining) = self.remaining.split_at(len);
342                                         self.remaining = remaining;
343                                         self.check_confusing_comment(comment)?;
344                                         return Ok(Some(Token::BlockComment(comment)));
345                                     }
346                                 }
347                             }
348                             _ => {}
349                         }
350                     }
351                     Err(self.error(pos, LexError::DanglingBlockComment))
352                 }
353                 _ => Ok(Some(Token::LParen(self.split_first_byte()))),
354             },
355 
356             b')' => Ok(Some(Token::RParen(self.split_first_byte()))),
357 
358             b'"' => {
359                 let val = self.string()?;
360                 let src = &self.input[pos..self.cur()];
361                 return Ok(Some(Token::String(WasmString(Box::new(WasmStringInner {
362                     val,
363                     src,
364                 })))));
365             }
366 
367             // https://webassembly.github.io/spec/core/text/lexical.html#white-space
368             b' ' | b'\n' | b'\r' | b'\t' => Ok(Some(Token::Whitespace(self.split_ws()))),
369 
370             c @ idchars!() => {
371                 let reserved = self.split_while(|b| match b {
372                     idchars!() => true,
373                     _ => false,
374                 });
375 
376                 // https://webassembly.github.io/spec/core/text/values.html#integers
377                 if let Some(number) = self.number(reserved) {
378                     Ok(Some(number))
379                 // https://webassembly.github.io/spec/core/text/values.html#text-id
380                 } else if *c == b'$' && reserved.len() > 1 {
381                     Ok(Some(Token::Id(reserved)))
382                 // https://webassembly.github.io/spec/core/text/lexical.html#text-keyword
383                 } else if b'a' <= *c && *c <= b'z' {
384                     Ok(Some(Token::Keyword(reserved)))
385                 } else {
386                     Ok(Some(Token::Reserved(reserved)))
387                 }
388             }
389 
390             // This could be a line comment, otherwise `;` is a reserved token.
391             // The second byte is checked to see if it's a `;;` line comment
392             b';' => match self.remaining.as_bytes().get(1) {
393                 Some(b';') => {
394                     let comment = self.split_until(b'\n');
395                     self.check_confusing_comment(comment)?;
396                     Ok(Some(Token::LineComment(comment)))
397                 }
398                 _ => Ok(Some(Token::Reserved(self.split_first_byte()))),
399             },
400 
401             // Other known reserved tokens other than `;`
402             b',' | b'[' | b']' | b'{' | b'}' => Ok(Some(Token::Reserved(self.split_first_byte()))),
403 
404             _ => {
405                 let ch = self.remaining.chars().next().unwrap();
406                 Err(self.error(pos, LexError::Unexpected(ch)))
407             }
408         }
409     }
410 
split_first_byte(&mut self) -> &'a str411     fn split_first_byte(&mut self) -> &'a str {
412         let (token, remaining) = self.remaining.split_at(1);
413         self.remaining = remaining;
414         token
415     }
416 
split_until(&mut self, byte: u8) -> &'a str417     fn split_until(&mut self, byte: u8) -> &'a str {
418         let pos = memchr::memchr(byte, self.remaining.as_bytes()).unwrap_or(self.remaining.len());
419         let (ret, remaining) = self.remaining.split_at(pos);
420         self.remaining = remaining;
421         ret
422     }
423 
split_ws(&mut self) -> &'a str424     fn split_ws(&mut self) -> &'a str {
425         // This table is a byte lookup table to determine whether a byte is a
426         // whitespace byte. There are only 4 whitespace bytes for the `*.wat`
427         // format right now which are ' ', '\t', '\r', and '\n'. These 4 bytes
428         // have a '1' in the table below.
429         //
430         // Due to how utf-8 works (our input is guaranteed to be utf-8) it is
431         // known that if these bytes are found they're guaranteed to be the
432         // whitespace byte, so they can be safely skipped and we don't have to
433         // do full utf-8 decoding. This means that the goal of this function is
434         // to find the first non-whitespace byte in `self.remaining`.
435         //
436         // For now this lookup table seems to be the fastest, but projects like
437         // https://github.com/lemire/despacer show other simd algorithms which
438         // can possibly accelerate this even more. Note that `*.wat` files often
439         // have a lot of whitespace so this function is typically quite hot when
440         // parsing inputs.
441         #[rustfmt::skip]
442         const WS: [u8; 256] = [
443             //                                   \t \n       \r
444             /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
445             /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
446             //        ' '
447             /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
448             /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
449             /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
450             /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
451             /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
452             /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
453             /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
454             /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
455             /* 0xa0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
456             /* 0xb0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
457             /* 0xc0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
458             /* 0xd0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
459             /* 0xe0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
460             /* 0xf0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
461         ];
462         let pos = self
463             .remaining
464             .as_bytes()
465             .iter()
466             .position(|b| WS[*b as usize] != 1)
467             .unwrap_or(self.remaining.len());
468         let (ret, remaining) = self.remaining.split_at(pos);
469         self.remaining = remaining;
470         ret
471     }
472 
split_while(&mut self, f: impl Fn(u8) -> bool) -> &'a str473     fn split_while(&mut self, f: impl Fn(u8) -> bool) -> &'a str {
474         let pos = self
475             .remaining
476             .as_bytes()
477             .iter()
478             .position(|b| !f(*b))
479             .unwrap_or(self.remaining.len());
480         let (ret, remaining) = self.remaining.split_at(pos);
481         self.remaining = remaining;
482         ret
483     }
484 
number(&self, src: &'a str) -> Option<Token<'a>>485     fn number(&self, src: &'a str) -> Option<Token<'a>> {
486         let (sign, num) = if src.starts_with('+') {
487             (Some(SignToken::Plus), &src[1..])
488         } else if src.starts_with('-') {
489             (Some(SignToken::Minus), &src[1..])
490         } else {
491             (None, src)
492         };
493 
494         let negative = sign == Some(SignToken::Minus);
495 
496         // Handle `inf` and `nan` which are special numbers here
497         if num == "inf" {
498             return Some(Token::Float(Float(Box::new(FloatInner {
499                 src,
500                 val: FloatVal::Inf { negative },
501             }))));
502         } else if num == "nan" {
503             return Some(Token::Float(Float(Box::new(FloatInner {
504                 src,
505                 val: FloatVal::Nan {
506                     val: None,
507                     negative,
508                 },
509             }))));
510         } else if num.starts_with("nan:0x") {
511             let mut it = num[6..].chars();
512             let to_parse = skip_undescores(&mut it, false, char::is_ascii_hexdigit)?;
513             if it.next().is_some() {
514                 return None;
515             }
516             let n = u64::from_str_radix(&to_parse, 16).ok()?;
517             return Some(Token::Float(Float(Box::new(FloatInner {
518                 src,
519                 val: FloatVal::Nan {
520                     val: Some(n),
521                     negative,
522                 },
523             }))));
524         }
525 
526         // Figure out if we're a hex number or not
527         let (mut it, hex, test_valid) = if num.starts_with("0x") {
528             (
529                 num[2..].chars(),
530                 true,
531                 char::is_ascii_hexdigit as fn(&char) -> bool,
532             )
533         } else {
534             (
535                 num.chars(),
536                 false,
537                 char::is_ascii_digit as fn(&char) -> bool,
538             )
539         };
540 
541         // Evaluate the first part, moving out all underscores
542         let val = skip_undescores(&mut it, negative, test_valid)?;
543 
544         match it.clone().next() {
545             // If we're followed by something this may be a float so keep going.
546             Some(_) => {}
547 
548             // Otherwise this is a valid integer literal!
549             None => {
550                 return Some(Token::Integer(Integer(Box::new(IntegerInner {
551                     sign,
552                     src,
553                     val,
554                     hex,
555                 }))))
556             }
557         }
558 
559         // A number can optionally be after the decimal so only actually try to
560         // parse one if it's there.
561         let decimal = if it.clone().next() == Some('.') {
562             it.next();
563             match it.clone().next() {
564                 Some(c) if test_valid(&c) => Some(skip_undescores(&mut it, false, test_valid)?),
565                 Some(_) | None => None,
566             }
567         } else {
568             None
569         };
570 
571         // Figure out if there's an exponential part here to make a float, and
572         // if so parse it but defer its actual calculation until later.
573         let exponent = match (hex, it.next()) {
574             (true, Some('p')) | (true, Some('P')) | (false, Some('e')) | (false, Some('E')) => {
575                 let negative = match it.clone().next() {
576                     Some('-') => {
577                         it.next();
578                         true
579                     }
580                     Some('+') => {
581                         it.next();
582                         false
583                     }
584                     _ => false,
585                 };
586                 Some(skip_undescores(&mut it, negative, char::is_ascii_digit)?)
587             }
588             (_, None) => None,
589             _ => return None,
590         };
591 
592         // We should have eaten everything by now, if not then this is surely
593         // not a float or integer literal.
594         if it.next().is_some() {
595             return None;
596         }
597 
598         return Some(Token::Float(Float(Box::new(FloatInner {
599             src,
600             val: FloatVal::Val {
601                 hex,
602                 integral: val,
603                 exponent,
604                 decimal,
605             },
606         }))));
607 
608         fn skip_undescores<'a>(
609             it: &mut str::Chars<'a>,
610             negative: bool,
611             good: fn(&char) -> bool,
612         ) -> Option<Cow<'a, str>> {
613             enum State {
614                 Raw,
615                 Collecting(String),
616             }
617             let mut last_underscore = false;
618             let mut state = if negative {
619                 State::Collecting("-".to_string())
620             } else {
621                 State::Raw
622             };
623             let input = it.as_str();
624             let first = it.next()?;
625             if !good(&first) {
626                 return None;
627             }
628             if let State::Collecting(s) = &mut state {
629                 s.push(first);
630             }
631             let mut last = 1;
632             while let Some(c) = it.clone().next() {
633                 if c == '_' && !last_underscore {
634                     if let State::Raw = state {
635                         state = State::Collecting(input[..last].to_string());
636                     }
637                     it.next();
638                     last_underscore = true;
639                     continue;
640                 }
641                 if !good(&c) {
642                     break;
643                 }
644                 if let State::Collecting(s) = &mut state {
645                     s.push(c);
646                 }
647                 last_underscore = false;
648                 it.next();
649                 last += 1;
650             }
651             if last_underscore {
652                 return None;
653             }
654             Some(match state {
655                 State::Raw => input[..last].into(),
656                 State::Collecting(s) => s.into(),
657             })
658         }
659     }
660 
661     /// Verifies that `comment`, which is about to be returned, has a "confusing
662     /// unicode character" in it and should instead be transformed into an
663     /// error.
check_confusing_comment(&self, comment: &str) -> Result<(), Error>664     fn check_confusing_comment(&self, comment: &str) -> Result<(), Error> {
665         if self.allow_confusing_unicode {
666             return Ok(());
667         }
668 
669         // In an effort to avoid utf-8 decoding the entire `comment` the search
670         // here is a bit more optimized. This checks for the `0xe2` byte because
671         // in the utf-8 encoding that's the leading encoding byte for all
672         // "confusing characters". Each instance of 0xe2 is checked to see if it
673         // starts a confusing character, and if so that's returned.
674         //
675         // Also note that 0xe2 will never be found in the middle of a codepoint,
676         // it's always the start of a codepoint. This means that if our special
677         // characters show up they're guaranteed to start with 0xe2 bytes.
678         let bytes = comment.as_bytes();
679         for pos in memchr::Memchr::new(0xe2, bytes) {
680             if let Some(c) = comment[pos..].chars().next() {
681                 if is_confusing_unicode(c) {
682                     // Note that `self.cur()` accounts for already having
683                     // parsed `comment`, so we move backwards to where
684                     // `comment` started and then add the index within
685                     // `comment`.
686                     let pos = self.cur() - comment.len() + pos;
687                     return Err(self.error(pos, LexError::ConfusingUnicode(c)));
688                 }
689             }
690         }
691 
692         Ok(())
693     }
694 
695     /// Reads everything for a literal string except the leading `"`. Returns
696     /// the string value that has been read.
697     ///
698     /// https://webassembly.github.io/spec/core/text/values.html#text-string
string(&mut self) -> Result<Cow<'a, [u8]>, Error>699     fn string(&mut self) -> Result<Cow<'a, [u8]>, Error> {
700         let mut it = self.remaining[1..].chars();
701         let result = Lexer::parse_str(&mut it, self.allow_confusing_unicode);
702         let end = self.input.len() - it.as_str().len();
703         self.remaining = &self.input[end..];
704         result.map_err(|e| {
705             let err_pos = match &e {
706                 LexError::UnexpectedEof => self.input.len(),
707                 _ => self.input[..end].char_indices().next_back().unwrap().0,
708             };
709             self.error(err_pos, e)
710         })
711     }
712 
parse_str( it: &mut str::Chars<'a>, allow_confusing_unicode: bool, ) -> Result<Cow<'a, [u8]>, LexError>713     fn parse_str(
714         it: &mut str::Chars<'a>,
715         allow_confusing_unicode: bool,
716     ) -> Result<Cow<'a, [u8]>, LexError> {
717         enum State {
718             Start,
719             String(Vec<u8>),
720         }
721         let orig = it.as_str();
722         let mut state = State::Start;
723         loop {
724             match it.next().ok_or(LexError::UnexpectedEof)? {
725                 '"' => break,
726                 '\\' => {
727                     match state {
728                         State::String(_) => {}
729                         State::Start => {
730                             let pos = orig.len() - it.as_str().len() - 1;
731                             state = State::String(orig[..pos].as_bytes().to_vec());
732                         }
733                     }
734                     let buf = match &mut state {
735                         State::String(b) => b,
736                         State::Start => unreachable!(),
737                     };
738                     match it.next().ok_or(LexError::UnexpectedEof)? {
739                         '"' => buf.push(b'"'),
740                         '\'' => buf.push(b'\''),
741                         't' => buf.push(b'\t'),
742                         'n' => buf.push(b'\n'),
743                         'r' => buf.push(b'\r'),
744                         '\\' => buf.push(b'\\'),
745                         'u' => {
746                             Lexer::must_eat_char(it, '{')?;
747                             let n = Lexer::hexnum(it)?;
748                             let c = char::from_u32(n)
749                                 .ok_or_else(|| LexError::InvalidUnicodeValue(n))?;
750                             buf.extend(c.encode_utf8(&mut [0; 4]).as_bytes());
751                             Lexer::must_eat_char(it, '}')?;
752                         }
753                         c1 if c1.is_ascii_hexdigit() => {
754                             let c2 = Lexer::hexdigit(it)?;
755                             buf.push(to_hex(c1) * 16 + c2);
756                         }
757                         c => return Err(LexError::InvalidStringEscape(c)),
758                     }
759                 }
760                 c if (c as u32) < 0x20 || c as u32 == 0x7f => {
761                     return Err(LexError::InvalidStringElement(c))
762                 }
763                 c if !allow_confusing_unicode && is_confusing_unicode(c) => {
764                     return Err(LexError::ConfusingUnicode(c))
765                 }
766                 c => match &mut state {
767                     State::Start => {}
768                     State::String(v) => {
769                         v.extend(c.encode_utf8(&mut [0; 4]).as_bytes());
770                     }
771                 },
772             }
773         }
774         match state {
775             State::Start => Ok(orig[..orig.len() - it.as_str().len() - 1].as_bytes().into()),
776             State::String(s) => Ok(s.into()),
777         }
778     }
779 
hexnum(it: &mut str::Chars<'_>) -> Result<u32, LexError>780     fn hexnum(it: &mut str::Chars<'_>) -> Result<u32, LexError> {
781         let n = Lexer::hexdigit(it)?;
782         let mut last_underscore = false;
783         let mut n = n as u32;
784         while let Some(c) = it.clone().next() {
785             if c == '_' {
786                 it.next();
787                 last_underscore = true;
788                 continue;
789             }
790             if !c.is_ascii_hexdigit() {
791                 break;
792             }
793             last_underscore = false;
794             it.next();
795             n = n
796                 .checked_mul(16)
797                 .and_then(|n| n.checked_add(to_hex(c) as u32))
798                 .ok_or(LexError::NumberTooBig)?;
799         }
800         if last_underscore {
801             return Err(LexError::LoneUnderscore);
802         }
803         Ok(n)
804     }
805 
806     /// Reads a hexidecimal digit from the input stream, returning where it's
807     /// defined and the hex value. Returns an error on EOF or an invalid hex
808     /// digit.
hexdigit(it: &mut str::Chars<'_>) -> Result<u8, LexError>809     fn hexdigit(it: &mut str::Chars<'_>) -> Result<u8, LexError> {
810         let ch = Lexer::must_char(it)?;
811         if ch.is_ascii_hexdigit() {
812             Ok(to_hex(ch))
813         } else {
814             Err(LexError::InvalidHexDigit(ch))
815         }
816     }
817 
818     /// Reads the next character from the input string and where it's located,
819     /// returning an error if the input stream is empty.
must_char(it: &mut str::Chars<'_>) -> Result<char, LexError>820     fn must_char(it: &mut str::Chars<'_>) -> Result<char, LexError> {
821         it.next().ok_or(LexError::UnexpectedEof)
822     }
823 
824     /// Expects that a specific character must be read next
must_eat_char(it: &mut str::Chars<'_>, wanted: char) -> Result<(), LexError>825     fn must_eat_char(it: &mut str::Chars<'_>, wanted: char) -> Result<(), LexError> {
826         let found = Lexer::must_char(it)?;
827         if wanted == found {
828             Ok(())
829         } else {
830             Err(LexError::Expected { wanted, found })
831         }
832     }
833 
834     /// Returns the current position of our iterator through the input string
cur(&self) -> usize835     fn cur(&self) -> usize {
836         self.input.len() - self.remaining.len()
837     }
838 
839     /// Creates an error at `pos` with the specified `kind`
error(&self, pos: usize, kind: LexError) -> Error840     fn error(&self, pos: usize, kind: LexError) -> Error {
841         Error::lex(Span { offset: pos }, self.input, kind)
842     }
843 }
844 
845 impl<'a> Iterator for Lexer<'a> {
846     type Item = Result<Token<'a>, Error>;
847 
next(&mut self) -> Option<Self::Item>848     fn next(&mut self) -> Option<Self::Item> {
849         self.parse().transpose()
850     }
851 }
852 
853 impl<'a> Token<'a> {
854     /// Returns the original source text for this token.
src(&self) -> &'a str855     pub fn src(&self) -> &'a str {
856         match self {
857             Token::Whitespace(s) => s,
858             Token::BlockComment(s) => s,
859             Token::LineComment(s) => s,
860             Token::LParen(s) => s,
861             Token::RParen(s) => s,
862             Token::String(s) => s.src(),
863             Token::Id(s) => s,
864             Token::Keyword(s) => s,
865             Token::Reserved(s) => s,
866             Token::Integer(i) => i.src(),
867             Token::Float(f) => f.src(),
868         }
869     }
870 }
871 
872 impl<'a> Integer<'a> {
873     /// Returns the sign token for this integer.
sign(&self) -> Option<SignToken>874     pub fn sign(&self) -> Option<SignToken> {
875         self.0.sign
876     }
877 
878     /// Returns the original source text for this integer.
src(&self) -> &'a str879     pub fn src(&self) -> &'a str {
880         self.0.src
881     }
882 
883     /// Returns the value string that can be parsed for this integer, as well as
884     /// the base that it should be parsed in
val(&self) -> (&str, u32)885     pub fn val(&self) -> (&str, u32) {
886         (&self.0.val, if self.0.hex { 16 } else { 10 })
887     }
888 }
889 
890 impl<'a> Float<'a> {
891     /// Returns the original source text for this integer.
src(&self) -> &'a str892     pub fn src(&self) -> &'a str {
893         self.0.src
894     }
895 
896     /// Returns a parsed value of this float with all of the components still
897     /// listed as strings.
val(&self) -> &FloatVal<'a>898     pub fn val(&self) -> &FloatVal<'a> {
899         &self.0.val
900     }
901 }
902 
903 impl<'a> WasmString<'a> {
904     /// Returns the original source text for this string.
src(&self) -> &'a str905     pub fn src(&self) -> &'a str {
906         self.0.src
907     }
908 
909     /// Returns a parsed value, as a list of bytes, for this string.
val(&self) -> &[u8]910     pub fn val(&self) -> &[u8] {
911         &self.0.val
912     }
913 }
914 
/// Converts an ASCII hexadecimal digit into its numeric value.
///
/// Letters `a-f` and `A-F` map to 10 through 15; every other character is
/// treated as a decimal digit relative to `'0'`, so callers are expected to
/// pass only characters already validated as hex digits.
fn to_hex(c: char) -> u8 {
    let byte = c as u8;
    if ('a'..='f').contains(&c) {
        byte - b'a' + 10
    } else if ('A'..='F').contains(&c) {
        byte - b'A' + 10
    } else {
        byte - b'0'
    }
}
922 
923 impl fmt::Display for LexError {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result924     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
925         use LexError::*;
926         match self {
927             DanglingBlockComment => f.write_str("unterminated block comment")?,
928             Unexpected(c) => write!(f, "unexpected character '{}'", escape_char(*c))?,
929             InvalidStringElement(c) => {
930                 write!(f, "invalid character in string '{}'", escape_char(*c))?
931             }
932             InvalidStringEscape(c) => write!(f, "invalid string escape '{}'", escape_char(*c))?,
933             InvalidHexDigit(c) => write!(f, "invalid hex digit '{}'", escape_char(*c))?,
934             InvalidDigit(c) => write!(f, "invalid decimal digit '{}'", escape_char(*c))?,
935             Expected { wanted, found } => write!(
936                 f,
937                 "expected '{}' but found '{}'",
938                 escape_char(*wanted),
939                 escape_char(*found)
940             )?,
941             UnexpectedEof => write!(f, "unexpected end-of-file")?,
942             NumberTooBig => f.write_str("number is too big to parse")?,
943             InvalidUnicodeValue(c) => write!(f, "invalid unicode scalar value 0x{:x}", c)?,
944             LoneUnderscore => write!(f, "bare underscore in numeric literal")?,
945             ConfusingUnicode(c) => write!(f, "likely-confusing unicode character found {:?}", c)?,
946             __Nonexhaustive => unreachable!(),
947         }
948         Ok(())
949     }
950 }
951 
/// Renders `c` for inclusion in an error message.
///
/// Tab, carriage return, newline, backslash and single quote are rendered as
/// backslash escapes; a double quote and all other printable ASCII characters
/// are rendered verbatim; anything else uses the `\u{..}` form produced by
/// `char::escape_unicode`.
fn escape_char(c: char) -> String {
    let fixed = match c {
        '\t' => "\\t",
        '\r' => "\\r",
        '\n' => "\\n",
        '\\' => "\\\\",
        '\'' => "\\\'",
        '\"' => "\"",
        '\x20'..='\x7e' => return String::from(c),
        _ => return c.escape_unicode().to_string(),
    };
    String::from(fixed)
}
964 
/// This is an attempt to protect against the "trojan source" [1] problem where
/// unicode characters can cause editors to render source code differently
/// for humans than the compiler itself sees.
///
/// To mitigate this issue, and because it's relatively rare in practice,
/// this simply rejects characters of that form: the bidirectional embedding,
/// override and isolate controls together with their terminators.
///
/// Returns `true` if `ch` is one of those characters.
///
/// [1]: https://www.trojansource.codes/
fn is_confusing_unicode(ch: char) -> bool {
    matches!(
        ch,
        '\u{202a}' // LEFT-TO-RIGHT EMBEDDING
            | '\u{202b}' // RIGHT-TO-LEFT EMBEDDING
            | '\u{202c}' // POP DIRECTIONAL FORMATTING
            | '\u{202d}' // LEFT-TO-RIGHT OVERRIDE
            | '\u{202e}' // RIGHT-TO-LEFT OVERRIDE
            | '\u{2066}' // LEFT-TO-RIGHT ISOLATE
            | '\u{2067}' // RIGHT-TO-LEFT ISOLATE
            | '\u{2068}' // FIRST STRONG ISOLATE
            | '\u{2069}' // POP DIRECTIONAL ISOLATE
            // Not a bidi control, but it has always been rejected here; kept
            // so that no previously rejected input becomes accepted.
            | '\u{206c}' // INHIBIT ARABIC FORM SHAPING
    )
}
980 
#[cfg(test)]
mod tests {
    use super::*;

    /// Leading whitespace is lexed as a single `Whitespace` token that stops
    /// at the first non-whitespace character.
    #[test]
    fn ws_smoke() {
        fn get_whitespace(input: &str) -> &str {
            match Lexer::new(input).parse().expect("no first token") {
                Some(Token::Whitespace(s)) => s,
                other => panic!("unexpected {:?}", other),
            }
        }
        assert_eq!(get_whitespace(" "), " ");
        assert_eq!(get_whitespace("  "), "  ");
        assert_eq!(get_whitespace("  \n "), "  \n ");
        assert_eq!(get_whitespace("  x"), "  ");
        assert_eq!(get_whitespace("  ;"), "  ");
    }

    /// A `;;` comment extends up to, but not including, the newline.
    #[test]
    fn line_comment_smoke() {
        fn get_line_comment(input: &str) -> &str {
            match Lexer::new(input).parse().expect("no first token") {
                Some(Token::LineComment(s)) => s,
                other => panic!("unexpected {:?}", other),
            }
        }
        assert_eq!(get_line_comment(";;"), ";;");
        assert_eq!(get_line_comment(";; xyz"), ";; xyz");
        assert_eq!(get_line_comment(";; xyz\nabc"), ";; xyz");
        assert_eq!(get_line_comment(";;\nabc"), ";;");
        assert_eq!(get_line_comment(";;   \nabc"), ";;   ");
    }

    /// Block comments, including nested ones, are lexed as one token.
    #[test]
    fn block_comment_smoke() {
        fn get_block_comment(input: &str) -> &str {
            match Lexer::new(input).parse().expect("no first token") {
                Some(Token::BlockComment(s)) => s,
                other => panic!("unexpected {:?}", other),
            }
        }
        assert_eq!(get_block_comment("(;;)"), "(;;)");
        assert_eq!(get_block_comment("(; ;)"), "(; ;)");
        assert_eq!(get_block_comment("(; (;;) ;)"), "(; (;;) ;)");
    }

    /// Lexes `input` and returns its first token, panicking if lexing fails
    /// or if the input contains no token at all.
    fn get_token(input: &str) -> Token<'_> {
        Lexer::new(input)
            .parse()
            .expect("no first token")
            .expect("no token")
    }

    #[test]
    fn lparen() {
        assert_eq!(get_token("(("), Token::LParen("("));
    }

    #[test]
    fn rparen() {
        assert_eq!(get_token(")("), Token::RParen(")"));
    }

    /// String literals keep their full source text and decode their escapes
    /// into raw bytes.
    #[test]
    fn strings() {
        fn get_string(input: &str) -> Vec<u8> {
            match get_token(input) {
                Token::String(s) => {
                    assert_eq!(input, s.src());
                    s.val().to_vec()
                }
                other => panic!("not string {:?}", other),
            }
        }
        assert_eq!(&*get_string("\"\""), b"");
        assert_eq!(&*get_string("\"a\""), b"a");
        assert_eq!(&*get_string("\"a b c d\""), b"a b c d");
        assert_eq!(&*get_string("\"\\\"\""), b"\"");
        assert_eq!(&*get_string("\"\\'\""), b"'");
        assert_eq!(&*get_string("\"\\n\""), b"\n");
        assert_eq!(&*get_string("\"\\t\""), b"\t");
        assert_eq!(&*get_string("\"\\r\""), b"\r");
        assert_eq!(&*get_string("\"\\\\\""), b"\\");
        assert_eq!(&*get_string("\"\\01\""), &[1]);
        assert_eq!(&*get_string("\"\\u{1}\""), &[1]);
        assert_eq!(
            &*get_string("\"\\u{0f3}\""),
            '\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes()
        );
        assert_eq!(
            &*get_string("\"\\u{0_f_3}\""),
            '\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes()
        );

        // Every two-digit hex escape decodes to the corresponding byte.
        for i in 0..=255i32 {
            let s = format!("\"\\{:02x}\"", i);
            assert_eq!(&*get_string(&s), &[i as u8]);
        }
    }

    #[test]
    fn id() {
        fn get_id(input: &str) -> &str {
            match get_token(input) {
                Token::Id(s) => s,
                other => panic!("not id {:?}", other),
            }
        }
        assert_eq!(get_id("$x"), "$x");
        assert_eq!(get_id("$xyz"), "$xyz");
        assert_eq!(get_id("$x_z"), "$x_z");
        assert_eq!(get_id("$0^"), "$0^");
        assert_eq!(get_id("$0^;;"), "$0^");
        assert_eq!(get_id("$0^ ;;"), "$0^");
    }

    #[test]
    fn keyword() {
        fn get_keyword(input: &str) -> &str {
            match get_token(input) {
                Token::Keyword(s) => s,
                other => panic!("not keyword {:?}", other),
            }
        }
        assert_eq!(get_keyword("x"), "x");
        assert_eq!(get_keyword("xyz"), "xyz");
        assert_eq!(get_keyword("x_z"), "x_z");
        assert_eq!(get_keyword("x_z "), "x_z");
    }

    #[test]
    fn reserved() {
        fn get_reserved(input: &str) -> &str {
            match get_token(input) {
                Token::Reserved(s) => s,
                other => panic!("not reserved {:?}", other),
            }
        }
        assert_eq!(get_reserved("$ "), "$");
        assert_eq!(get_reserved("^_x "), "^_x");
    }

    /// Integer values drop a leading `+`, underscores and the `0x` prefix,
    /// while a leading `-` is kept.
    #[test]
    fn integer() {
        fn get_integer(input: &str) -> String {
            match get_token(input) {
                Token::Integer(i) => {
                    assert_eq!(input, i.src());
                    i.val().0.to_string()
                }
                other => panic!("not integer {:?}", other),
            }
        }
        assert_eq!(get_integer("1"), "1");
        assert_eq!(get_integer("0"), "0");
        assert_eq!(get_integer("-1"), "-1");
        assert_eq!(get_integer("+1"), "1");
        assert_eq!(get_integer("+1_000"), "1000");
        assert_eq!(get_integer("+1_0_0_0"), "1000");
        assert_eq!(get_integer("+0x10"), "10");
        assert_eq!(get_integer("-0x10"), "-10");
        assert_eq!(get_integer("0x10"), "10");
    }

    /// Floats are split into their textual components (or recognized as
    /// `nan`/`inf`) without being converted to a numeric value.
    #[test]
    fn float() {
        fn get_float(input: &str) -> FloatVal<'_> {
            match get_token(input) {
                Token::Float(i) => {
                    assert_eq!(input, i.src());
                    i.0.val
                }
                other => panic!("not float {:?}", other),
            }
        }
        assert_eq!(
            get_float("nan"),
            FloatVal::Nan {
                val: None,
                negative: false
            },
        );
        assert_eq!(
            get_float("-nan"),
            FloatVal::Nan {
                val: None,
                negative: true,
            },
        );
        assert_eq!(
            get_float("+nan"),
            FloatVal::Nan {
                val: None,
                negative: false,
            },
        );
        assert_eq!(
            get_float("+nan:0x1"),
            FloatVal::Nan {
                val: Some(1),
                negative: false,
            },
        );
        assert_eq!(
            get_float("nan:0x7f_ffff"),
            FloatVal::Nan {
                val: Some(0x7fffff),
                negative: false,
            },
        );
        assert_eq!(get_float("inf"), FloatVal::Inf { negative: false });
        assert_eq!(get_float("-inf"), FloatVal::Inf { negative: true });
        assert_eq!(get_float("+inf"), FloatVal::Inf { negative: false });

        assert_eq!(
            get_float("1.2"),
            FloatVal::Val {
                integral: "1".into(),
                decimal: Some("2".into()),
                exponent: None,
                hex: false,
            },
        );
        assert_eq!(
            get_float("1.2e3"),
            FloatVal::Val {
                integral: "1".into(),
                decimal: Some("2".into()),
                exponent: Some("3".into()),
                hex: false,
            },
        );
        assert_eq!(
            get_float("-1_2.1_1E+0_1"),
            FloatVal::Val {
                integral: "-12".into(),
                decimal: Some("11".into()),
                exponent: Some("01".into()),
                hex: false,
            },
        );
        assert_eq!(
            get_float("+1_2.1_1E-0_1"),
            FloatVal::Val {
                integral: "12".into(),
                decimal: Some("11".into()),
                exponent: Some("-01".into()),
                hex: false,
            },
        );
        assert_eq!(
            get_float("0x1_2.3_4p5_6"),
            FloatVal::Val {
                integral: "12".into(),
                decimal: Some("34".into()),
                exponent: Some("56".into()),
                hex: true,
            },
        );
        assert_eq!(
            get_float("+0x1_2.3_4P-5_6"),
            FloatVal::Val {
                integral: "12".into(),
                decimal: Some("34".into()),
                exponent: Some("-56".into()),
                hex: true,
            },
        );
        assert_eq!(
            get_float("1."),
            FloatVal::Val {
                integral: "1".into(),
                decimal: None,
                exponent: None,
                hex: false,
            },
        );
        assert_eq!(
            get_float("0x1p-24"),
            FloatVal::Val {
                integral: "1".into(),
                decimal: None,
                exponent: Some("-24".into()),
                hex: true,
            },
        );
    }
}
1271