1 use crate::{
2     visitors::{Visit, VisitMut, Visitor, VisitorMut},
3     ShortString,
4 };
5 
6 use full_moon_derive::symbols;
7 #[cfg(feature = "serde")]
8 use serde::{Deserialize, Serialize};
9 use std::{cmp::Ordering, fmt, str::FromStr};
10 
// Every keyword and operator lexeme known to the tokenizer, written as
// `Variant => "source text"` pairs and handed to the `symbols!` macro from
// `full_moon_derive`.
// NOTE(review): the macro expansion is not visible in this file; the `Symbol`
// type, `Symbol::from_str`, `parse_keyword` and `ParseSymbol` used further down
// presumably come from it — confirm in `full_moon_derive`.
// NOTE(review): wherever one lexeme is a prefix of another, the longer one is
// listed first (`elseif` before `else`, `..=`/`...` before `..` before `.`,
// `==` before `=`, ...). This looks deliberate — verify how the macro matches
// before reordering entries.
symbols!(
    And => "and",
    Break => "break",
    Do => "do",
    ElseIf => "elseif",
    Else => "else",
    End => "end",
    False => "false",
    For => "for",
    Function => "function",
    If => "if",
    In => "in",
    Local => "local",
    Nil => "nil",
    Not => "not",
    Or => "or",
    Repeat => "repeat",
    Return => "return",
    Then => "then",
    True => "true",
    Until => "until",
    While => "while",
    // TODO: This is only valid in Lua 5.2
    Goto => "goto",

    // TODO: This is only valid in Roblox
    PlusEqual => "+=",
    MinusEqual => "-=",
    StarEqual => "*=",
    SlashEqual => "/=",
    PercentEqual => "%=",
    CaretEqual => "^=",
    TwoDotsEqual => "..=",
    // TODO: This is only valid in Roblox
    Ampersand => "&",
    // TODO: This is only valid in Roblox
    ThinArrow => "->",
    // TODO: This is only valid in Roblox and Lua 5.2
    TwoColons => "::",
    Caret => "^",
    Colon => ":",
    Comma => ",",
    Ellipse => "...",
    TwoDots => "..",
    Dot => ".",
    TwoEqual => "==",
    Equal => "=",
    GreaterThanEqual => ">=",
    GreaterThan => ">",
    Hash => "#",
    LeftBrace => "{",
    LeftBracket => "[",
    LeftParen => "(",
    LessThanEqual => "<=",
    LessThan => "<",
    Minus => "-",
    Percent => "%",
    // TODO: This is only valid in Roblox
    Pipe => "|",
    Plus => "+",
    // TODO: This is only valid in Roblox
    QuestionMark => "?",
    RightBrace => "}",
    RightBracket => "]",
    RightParen => ")",
    Semicolon => ";",
    Slash => "/",
    Star => "*",
    TildeEqual => "~=",
);
81 
/// The possible errors that can happen while tokenizing.
#[derive(Clone, Debug, PartialEq)]
#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))]
pub enum TokenizerErrorType {
    /// A multi-line comment was opened (`--[[`, `--[=[`, ...) but never closed
    UnclosedComment,
    /// A string was opened (quoted or with long brackets) but never closed
    UnclosedString,
    /// A `#!` was found at a place where the tokenizer does not accept a shebang line
    UnexpectedShebang,
    /// A character was found that does not start any token; carries that character.
    /// Also returned from [`TokenReference::symbol`] when non-whitespace text
    /// follows the symbol.
    UnexpectedToken(char),
    /// The text passed is not a valid symbol; carries the rejected text.
    /// Returned from [`TokenReference::symbol`]
    InvalidSymbol(String),
}
98 
/// The type of tokens in parsed code, together with the data needed to
/// reproduce each token's text.
///
/// Use [`TokenType::kind`] when only the category of the token matters.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))]
#[cfg_attr(feature = "serde", serde(tag = "type"))]
#[non_exhaustive]
pub enum TokenType {
    /// End of file, should always be the very last token
    Eof,

    /// An identifier, such as `foo`
    Identifier {
        /// The identifier itself
        identifier: ShortString,
    },

    /// A multi line comment in the format of `--[[ comment ]]`
    MultiLineComment {
        /// Number of equals signs, if any, for the multi line comment
        /// For example, `--[=[` would have a `blocks` value of `1`
        blocks: usize,
        /// The comment itself, ignoring opening and closing tags
        comment: ShortString,
    },

    /// A literal number, such as `3.3`
    Number {
        /// The text representing the number, includes details such as `0x`
        text: ShortString,
    },

    /// A shebang line
    Shebang {
        /// The shebang line itself. As produced by the tokenizer in this file,
        /// this includes the leading `#!` and the terminating line ending.
        line: ShortString,
    },

    /// A single line comment, such as `-- comment`
    SingleLineComment {
        /// The comment, ignoring initial `--`
        comment: ShortString,
    },

    /// A literal string, such as "Hello, world"
    StringLiteral {
        /// The literal itself, ignoring quotation marks
        literal: ShortString,
        #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
        /// Number of equals signs used for a multi line string, if it is one
        /// For example, `[=[string]=]` would have a `multi_line` value of Some(1)
        /// `[[string]]` would have a `multi_line` value of Some(0)
        /// A string such as `"string"` would have a `multi_line` value of None
        multi_line: Option<usize>,
        /// The type of quotation mark used to make the string
        quote_type: StringLiteralQuoteType,
    },

    /// A [`Symbol`], such as `local` or `+`
    Symbol {
        /// The symbol itself
        symbol: Symbol,
    },

    /// Whitespace, such as tabs or new lines
    Whitespace {
        /// Characters consisting of the whitespace
        characters: ShortString,
    },
}
167 
168 impl TokenType {
169     /// Returns whether a token can be practically ignored in most cases
170     /// Comments and whitespace will return `true`, everything else will return `false`
is_trivia(&self) -> bool171     pub fn is_trivia(&self) -> bool {
172         matches!(
173             self,
174             TokenType::Shebang { .. }
175                 | TokenType::SingleLineComment { .. }
176                 | TokenType::MultiLineComment { .. }
177                 | TokenType::Whitespace { .. }
178         )
179     }
180 
181     /// Returns the kind of the token type.
182     ///
183     /// ```rust
184     /// use full_moon::{ShortString, tokenizer::{TokenKind, TokenType}};
185     ///
186     /// assert_eq!(
187     ///     TokenType::Identifier {
188     ///         identifier: ShortString::new("hello")
189     ///     }.kind(),
190     ///     TokenKind::Identifier,
191     /// );
192     /// ```
kind(&self) -> TokenKind193     pub fn kind(&self) -> TokenKind {
194         match self {
195             TokenType::Eof => TokenKind::Eof,
196             TokenType::Identifier { .. } => TokenKind::Identifier,
197             TokenType::MultiLineComment { .. } => TokenKind::MultiLineComment,
198             TokenType::Number { .. } => TokenKind::Number,
199             TokenType::Shebang { .. } => TokenKind::Shebang,
200             TokenType::SingleLineComment { .. } => TokenKind::SingleLineComment,
201             TokenType::StringLiteral { .. } => TokenKind::StringLiteral,
202             TokenType::Symbol { .. } => TokenKind::Symbol,
203             TokenType::Whitespace { .. } => TokenKind::Whitespace,
204         }
205     }
206 
207     /// Returns a whitespace `TokenType` consisting of spaces
spaces(spaces: usize) -> Self208     pub fn spaces(spaces: usize) -> Self {
209         TokenType::Whitespace {
210             characters: " ".repeat(spaces).into(),
211         }
212     }
213 
214     /// Returns a whitespace `TokenType` consisting of tabs
tabs(tabs: usize) -> Self215     pub fn tabs(tabs: usize) -> Self {
216         TokenType::Whitespace {
217             characters: "\t".repeat(tabs).into(),
218         }
219     }
220 }
221 
/// The kind of token. Contains no additional data.
///
/// Each variant corresponds to the [`TokenType`] variant of the same name;
/// see [`TokenType::kind`].
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
#[non_exhaustive]
pub enum TokenKind {
    /// End of file, should always be the very last token
    Eof,
    /// An identifier, such as `foo`
    Identifier,
    /// A multi line comment in the format of `--[[ comment ]]`
    MultiLineComment,
    /// A literal number, such as `3.3`
    Number,
    /// The shebang line
    Shebang,
    /// A single line comment, such as `-- comment`
    SingleLineComment,
    /// A literal string, such as "Hello, world"
    StringLiteral,
    /// A [`Symbol`], such as `local` or `+`
    Symbol,
    /// Whitespace, such as tabs or new lines
    Whitespace,
}
245 
/// A token, consisting of its start and end [`Position`] and a [`TokenType`]
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))]
pub struct Token {
    // Where the token begins; exposed through `Token::start_position`.
    pub(crate) start_position: Position,
    // Where the token ends; exposed through `Token::end_position`.
    pub(crate) end_position: Position,
    // What the token is, plus the data needed to print it back out.
    pub(crate) token_type: TokenType,
}
254 
255 impl Token {
256     /// Creates a token with a zero position
new(token_type: TokenType) -> Token257     pub fn new(token_type: TokenType) -> Token {
258         Token {
259             start_position: Position::default(),
260             end_position: Position::default(),
261             token_type,
262         }
263     }
264 
265     /// The position a token begins at
start_position(&self) -> Position266     pub fn start_position(&self) -> Position {
267         self.start_position
268     }
269 
270     /// The position a token ends at
end_position(&self) -> Position271     pub fn end_position(&self) -> Position {
272         self.end_position
273     }
274 
275     /// The type of token as well as the data needed to represent it
276     /// If you don't need any other information, use [`token_kind`](Token::token_kind) instead.
token_type(&self) -> &TokenType277     pub fn token_type(&self) -> &TokenType {
278         &self.token_type
279     }
280 
281     /// The kind of token with no additional data.
282     /// If you need any information such as idenitfier names, use [`token_type`](Token::token_type) instead.
token_kind(&self) -> TokenKind283     pub fn token_kind(&self) -> TokenKind {
284         self.token_type().kind()
285     }
286 }
287 
288 impl fmt::Display for Token {
fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result289     fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
290         use self::TokenType::*;
291 
292         match &*self.token_type() {
293             Eof => "".to_string(),
294             Number { text } => text.to_string(),
295             Identifier { identifier } => identifier.to_string(),
296             MultiLineComment { blocks, comment } => {
297                 format!("--[{0}[{1}]{0}]", "=".repeat(*blocks), comment)
298             }
299             Shebang { line } => line.to_string(),
300             SingleLineComment { comment } => format!("--{}", comment),
301             StringLiteral {
302                 literal,
303                 multi_line,
304                 quote_type,
305             } => {
306                 if let Some(blocks) = multi_line {
307                     format!("[{0}[{1}]{0}]", "=".repeat(*blocks), literal.to_string())
308                 } else {
309                     format!("{0}{1}{0}", quote_type.to_string(), literal.to_string())
310                 }
311             }
312             Symbol { symbol } => symbol.to_string(),
313             Whitespace { characters } => characters.to_string(),
314         }
315         .fmt(formatter)
316     }
317 }
318 
319 impl PartialEq<Self> for Token {
eq(&self, rhs: &Self) -> bool320     fn eq(&self, rhs: &Self) -> bool {
321         self.start_position() == rhs.start_position()
322             && self.end_position() == rhs.end_position()
323             && self.token_type == rhs.token_type
324     }
325 }
326 
327 impl Eq for Token {}
328 
329 impl Ord for Token {
cmp(&self, other: &Self) -> Ordering330     fn cmp(&self, other: &Self) -> Ordering {
331         self.start_position().cmp(&other.start_position())
332     }
333 }
334 
335 impl PartialOrd for Token {
partial_cmp(&self, other: &Self) -> Option<Ordering>336     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
337         Some(self.cmp(other))
338     }
339 }
340 
341 impl Visit for Token {
visit<V: Visitor>(&self, visitor: &mut V)342     fn visit<V: Visitor>(&self, visitor: &mut V) {
343         visitor.visit_token(self);
344 
345         match self.token_kind() {
346             TokenKind::Eof => {}
347             TokenKind::Identifier => visitor.visit_identifier(self),
348             TokenKind::MultiLineComment => visitor.visit_multi_line_comment(self),
349             TokenKind::Number => visitor.visit_number(self),
350             TokenKind::Shebang => {}
351             TokenKind::SingleLineComment => visitor.visit_single_line_comment(self),
352             TokenKind::StringLiteral => visitor.visit_string_literal(self),
353             TokenKind::Symbol => visitor.visit_symbol(self),
354             TokenKind::Whitespace => visitor.visit_whitespace(self),
355         }
356     }
357 }
358 
359 impl VisitMut for Token {
visit_mut<V: VisitorMut>(self, visitor: &mut V) -> Self360     fn visit_mut<V: VisitorMut>(self, visitor: &mut V) -> Self {
361         let token = visitor.visit_token(self);
362 
363         match token.token_kind() {
364             TokenKind::Eof => token,
365             TokenKind::Identifier => visitor.visit_identifier(token),
366             TokenKind::MultiLineComment => visitor.visit_multi_line_comment(token),
367             TokenKind::Number => visitor.visit_number(token),
368             TokenKind::Shebang => token,
369             TokenKind::SingleLineComment => visitor.visit_single_line_comment(token),
370             TokenKind::StringLiteral => visitor.visit_string_literal(token),
371             TokenKind::Symbol => visitor.visit_symbol(token),
372             TokenKind::Whitespace => visitor.visit_whitespace(token),
373         }
374     }
375 }
376 
/// A reference to a token used by Ast's: the token itself together with the
/// trivia tokens stored before and after it.
/// Dereferences to a [`Token`]
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))]
pub struct TokenReference {
    // Trivia tokens printed before `token`.
    pub(crate) leading_trivia: Vec<Token>,
    // The token this reference stands for (the `Deref` target).
    pub(crate) token: Token,
    // Trivia tokens printed after `token`.
    pub(crate) trailing_trivia: Vec<Token>,
}
386 
387 impl TokenReference {
388     /// Creates a TokenReference from leading/trailing trivia as well as the leading token
new(leading_trivia: Vec<Token>, token: Token, trailing_trivia: Vec<Token>) -> Self389     pub fn new(leading_trivia: Vec<Token>, token: Token, trailing_trivia: Vec<Token>) -> Self {
390         Self {
391             leading_trivia,
392             token,
393             trailing_trivia,
394         }
395     }
396 
397     /// Returns a symbol with the leading and trailing whitespace
398     /// Only whitespace is supported
399     /// ```rust
400     /// # use full_moon::tokenizer::{Symbol, TokenReference, TokenType, TokenizerErrorType};
401     /// # fn main() -> Result<(), Box<TokenizerErrorType>> {
402     /// let symbol = TokenReference::symbol("\nreturn ")?;
403     /// assert_eq!(symbol.leading_trivia().next().unwrap().to_string(), "\n");
404     /// assert_eq!(symbol.token().token_type(), &TokenType::Symbol {
405     ///     symbol: Symbol::Return,
406     /// });
407     /// assert_eq!(symbol.trailing_trivia().next().unwrap().to_string(), " ");
408     /// assert!(TokenReference::symbol("isnt whitespace").is_err());
409     /// assert!(TokenReference::symbol(" notasymbol ").is_err());
410     /// # Ok(())
411     /// # }
412     /// ```
symbol(text: &str) -> Result<Self, TokenizerErrorType>413     pub fn symbol(text: &str) -> Result<Self, TokenizerErrorType> {
414         let mut chars = text.chars().peekable();
415 
416         let mut leading_trivia = String::new();
417         while let Some(character) = chars.peek() {
418             if character.is_ascii_whitespace() {
419                 leading_trivia.push(chars.next().unwrap());
420             } else {
421                 break;
422             }
423         }
424 
425         let mut symbol_text = String::new();
426         while let Some(character) = chars.peek() {
427             if !character.is_ascii_whitespace() {
428                 symbol_text.push(chars.next().unwrap());
429             } else {
430                 break;
431             }
432         }
433 
434         let symbol = Symbol::from_str(&symbol_text)
435             .map_err(|_| TokenizerErrorType::InvalidSymbol(symbol_text))?;
436 
437         let mut trailing_trivia = String::new();
438         while let Some(character) = chars.peek() {
439             if character.is_ascii_whitespace() {
440                 trailing_trivia.push(chars.next().unwrap());
441             } else {
442                 return Err(TokenizerErrorType::UnexpectedToken(*character));
443             }
444         }
445 
446         Ok(Self {
447             leading_trivia: vec![Token::new(TokenType::Whitespace {
448                 characters: leading_trivia.into(),
449             })],
450             token: Token::new(TokenType::Symbol { symbol }),
451             trailing_trivia: vec![Token::new(TokenType::Whitespace {
452                 characters: trailing_trivia.into(),
453             })],
454         })
455     }
456 
457     /// Returns the inner token.
token(&self) -> &Token458     pub fn token(&self) -> &Token {
459         &self.token
460     }
461 
462     /// Returns the leading trivia
leading_trivia(&self) -> impl Iterator<Item = &Token>463     pub fn leading_trivia(&self) -> impl Iterator<Item = &Token> {
464         self.leading_trivia.iter()
465     }
466 
467     /// Returns the trailing trivia
trailing_trivia(&self) -> impl Iterator<Item = &Token>468     pub fn trailing_trivia(&self) -> impl Iterator<Item = &Token> {
469         self.trailing_trivia.iter()
470     }
471 
472     /// Creates a clone of the current TokenReference with the new inner token, preserving trivia.
with_token(&self, token: Token) -> Self473     pub fn with_token(&self, token: Token) -> Self {
474         Self {
475             token,
476             leading_trivia: self.leading_trivia.clone(),
477             trailing_trivia: self.trailing_trivia.clone(),
478         }
479     }
480 }
481 
482 impl std::borrow::Borrow<Token> for &TokenReference {
borrow(&self) -> &Token483     fn borrow(&self) -> &Token {
484         &**self
485     }
486 }
487 
/// Dereferencing a `TokenReference` yields its inner [`Token`], so `Token`
/// methods can be called directly on a reference.
impl std::ops::Deref for TokenReference {
    type Target = Token;

    fn deref(&self) -> &Self::Target {
        &self.token
    }
}
495 
496 impl fmt::Display for TokenReference {
fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result497     fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
498         for trivia in &self.leading_trivia {
499             formatter.write_str(&trivia.to_string())?;
500         }
501 
502         formatter.write_str(&self.token.to_string())?;
503 
504         for trivia in &self.trailing_trivia {
505             formatter.write_str(&trivia.to_string())?;
506         }
507 
508         Ok(())
509     }
510 }
511 
512 impl PartialEq<Self> for TokenReference {
eq(&self, other: &Self) -> bool513     fn eq(&self, other: &Self) -> bool {
514         (**self).eq(other)
515             && self.leading_trivia == other.leading_trivia
516             && self.trailing_trivia == other.trailing_trivia
517     }
518 }
519 
520 impl Eq for TokenReference {}
521 
522 impl Ord for TokenReference {
cmp(&self, other: &Self) -> Ordering523     fn cmp(&self, other: &Self) -> Ordering {
524         (**self).cmp(&**other)
525     }
526 }
527 
528 impl PartialOrd for TokenReference {
partial_cmp(&self, other: &Self) -> Option<Ordering>529     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
530         Some(self.cmp(other))
531     }
532 }
533 
534 impl Visit for TokenReference {
visit<V: Visitor>(&self, visitor: &mut V)535     fn visit<V: Visitor>(&self, visitor: &mut V) {
536         visitor.visit_token(self);
537 
538         if matches!(self.token().token_kind(), TokenKind::Eof) {
539             visitor.visit_eof(self);
540         }
541 
542         self.leading_trivia.visit(visitor);
543         self.token.visit(visitor);
544         self.trailing_trivia.visit(visitor);
545     }
546 }
547 
548 impl VisitMut for TokenReference {
visit_mut<V: VisitorMut>(self, visitor: &mut V) -> Self549     fn visit_mut<V: VisitorMut>(self, visitor: &mut V) -> Self {
550         let mut token_reference = visitor.visit_token_reference(self);
551 
552         if matches!(token_reference.token().token_kind(), TokenKind::Eof) {
553             token_reference = visitor.visit_eof(token_reference);
554         }
555 
556         token_reference.leading_trivia = token_reference.leading_trivia.visit_mut(visitor);
557         token_reference.token = token_reference.token.visit_mut(visitor);
558         token_reference.trailing_trivia = token_reference.trailing_trivia.visit_mut(visitor);
559         token_reference
560     }
561 }
562 
/// Used to represent exact positions of tokens in code
///
/// The default position has `bytes`, `line` and `character` all set to zero.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))]
pub struct Position {
    // Byte offset of the position; see `Position::bytes`.
    pub(crate) bytes: usize,
    // Line the position lies on; see `Position::line`.
    pub(crate) line: usize,
    // Index of the character on that line; see `Position::character`.
    pub(crate) character: usize,
}
571 
impl Position {
    /// How many bytes, ignoring lines, it would take to find this position
    pub fn bytes(self) -> usize {
        self.bytes
    }

    /// Index of the character on the line for this position
    pub fn character(self) -> usize {
        self.character
    }

    /// Line the position lies on
    pub fn line(self) -> usize {
        self.line
    }
}
588 
589 impl Ord for Position {
cmp(&self, other: &Self) -> Ordering590     fn cmp(&self, other: &Self) -> Ordering {
591         self.bytes.cmp(&other.bytes)
592     }
593 }
594 
595 impl PartialOrd for Position {
partial_cmp(&self, other: &Self) -> Option<Ordering>596     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
597         Some(self.cmp(other))
598     }
599 }
600 
// Pairs a token type with an `advance` count.
// NOTE(review): nothing in the visible part of this file constructs or reads
// this struct; `advance` is presumably how far the input moves forward past
// the token — confirm against its users, or remove it if there are none.
#[derive(Clone, Debug, PartialEq)]
struct TokenAdvancement {
    pub advance: usize,
    pub token_type: TokenType,
}
606 
/// The types of quotes used in a Lua string
///
/// Only `Double` and `Single` can be formatted with `Display`; formatting
/// `Brackets` panics.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))]
#[non_exhaustive]
pub enum StringLiteralQuoteType {
    /// Strings formatted \[\[with brackets\]\]
    Brackets,
    /// Strings formatted "with double quotes"
    Double,
    /// Strings formatted 'with single quotes'
    Single,
}
619 
620 impl fmt::Display for StringLiteralQuoteType {
fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result621     fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
622         match *self {
623             StringLiteralQuoteType::Brackets => unreachable!(),
624             StringLiteralQuoteType::Double => "\"",
625             StringLiteralQuoteType::Single => "'",
626         }
627         .fmt(formatter)
628     }
629 }
630 
// What a grammar rule hands back for one token: the recognised token type, or
// the tokenizer error the consumed input should be reported as.
type RawToken = Result<TokenType, TokenizerErrorType>;

// Lets grammar actions write `token_type.into()` to produce an `Ok` raw token.
impl From<TokenType> for RawToken {
    fn from(token_type: TokenType) -> RawToken {
        Ok(token_type)
    }
}
638 
// Lets grammar actions write `error.into()` to produce an `Err` raw token.
impl From<TokenizerErrorType> for RawToken {
    fn from(error: TokenizerErrorType) -> RawToken {
        Err(error)
    }
}
644 
// The tokenizer grammar, written with rust-peg over `str` input.
//
// Token-producing rules return a `RawToken`: `Ok(TokenType)` for a recognised
// token, or `Err(TokenizerErrorType)` for input that the grammar consumes but
// that must be reported as an error (unclosed strings/comments, stray `#!`).
// Alternatives separated by `/` are tried in the order written.
peg::parser! {
    grammar tokens() for str {
        use super::ParseSymbol;
        use peg::ParseLiteral;
        use super::StringLiteralQuoteType as QuoteType;

        // A line break: `\n` or `\r\n`.
        rule line_ending()
            = "\n" / "\r\n"
        // A single space or tab.
        rule space()
            = [' '|'\t']

        // One whitespace token: a run of spaces/tabs optionally followed by a
        // single line ending, or a lone line ending. A token therefore never
        // continues past a line ending.
        pub(super) rule whitespace() -> RawToken
            = chars:$( space()+ line_ending()? / line_ending() )
              { TokenType::Whitespace { characters:chars.into() }.into() }

        // Opening long bracket `[`, any number of `=`, `[`; yields the `=` run.
        rule multi_line_start() -> &'input str
            = "[" block:$("="*) "[" {block}

        // Closing long bracket for the given `=` run: `]`, `block`, `]`.
        // `##parse_string_literal(block)` invokes the input's
        // `parse_string_literal` (from `peg::ParseLiteral`, imported above) with
        // `block`; presumably it matches exactly that text.
        rule multi_line_end(block: &'input str)
            = "]" ##parse_string_literal(block) "]"

        // A complete long-bracket block. Content is every character up to the
        // first closing bracket with the same number of `=`.
        // Yields (number of `=`, content without the brackets).
        rule multi_line_block() -> (usize, &'input str)
            = block:multi_line_start()
              content:$((!multi_line_end(block) [_])*)
              multi_line_end(block)
              { (block.len(), content) }

        // A long-bracket string. If an opening bracket is present but no
        // complete block matched, the rest of the input is consumed and
        // reported as an unclosed string.
        rule multi_line_quote() -> RawToken
            = v:multi_line_block() { TokenType::StringLiteral {
                multi_line: Some(v.0),
                literal:v.1.into(),
                quote_type: QuoteType::Brackets,
            }.into()}
            / &multi_line_start() [_]+ { TokenizerErrorType::UnclosedString.into() }

        // A backslash followed by any one character.
        rule escape()
            = "\\" [_]

        // Any one character that is not the quote, `\r`, `\n` or a backslash.
        rule quote_char(quote: &str)
            = !(##parse_string_literal(quote) / ['\r'|'\n'|'\\']) [_]

        // A string delimited by `quote` on both sides; the literal (possibly
        // empty) excludes the quotes. If the closing quote cannot be matched,
        // the rest of the input is consumed and reported as an unclosed string.
        rule quoted(quote: &str, quote_type: QuoteType) -> RawToken
            = ##parse_string_literal(quote)
              literal:$((quote_char(quote) / escape())+ / )
              ##parse_string_literal(quote)
              { TokenType::StringLiteral { multi_line: None, literal:literal.into(), quote_type }.into() }
            / ##parse_string_literal(quote) [_]* {TokenizerErrorType::UnclosedString.into() }

        // A double-quoted or single-quoted string.
        rule single_line_quote() -> RawToken
            = quoted("\"", (QuoteType::Double))
            / quoted("\'", (QuoteType::Single))

        // Any string literal: long-bracket form first, then quoted forms.
        pub(super) rule string_literal() -> RawToken
            = multi_line_quote()
            / single_line_quote()

        // A shebang line: `#!`, everything up to the next line ending, and that
        // line ending itself. The line ending is required.
        pub(super) rule shebang() -> RawToken
            = line:$("#!" (!line_ending() [_])* line_ending())
              {TokenType::Shebang{line:line.into()}.into()}

        // A name: a letter or `_` followed by letters, digits or `_`.
        // Names for which `parse_keyword` returns a symbol become `Symbol`
        // tokens, all others become `Identifier` tokens.
        // NOTE(review): `parse_keyword` is defined outside this block; the
        // trailing `expected!("identifier")` presumably only labels the failure
        // for peg's error reporting — confirm against the peg documentation.
        pub(super) rule identifier() -> RawToken
            = id:$(['_'|'a'..='z'|'A'..='Z'] ['_'|'a'..='z'|'A'..='Z'|'0'..='9']*)
              { match parse_keyword(id) {
                    Some(symbol) => TokenType::Symbol { symbol }.into(),
                    None => TokenType::Identifier { identifier: id.into() }.into(),
              }}
            / expected!("identifier")

        // A comment, tried in this order:
        //  1. `--` plus a complete long-bracket block: multi line comment;
        //  2. `--` plus an opening long bracket with no matching close: the rest
        //     of the input is consumed and reported as an unclosed comment;
        //  3. `--` plus everything up to (not including) `\r` or `\n`: single
        //     line comment.
        pub(super) rule comment() -> RawToken
            = "--" v:multi_line_block()
              { TokenType::MultiLineComment { blocks: v.0, comment: v.1.into() }.into() }
            / "--" multi_line_start() [_]* { TokenizerErrorType::UnclosedComment.into() }
            / "--" comment:$(([^ '\r'|'\n'])*)
              { TokenType::SingleLineComment { comment: comment.into() }.into() }

        // Matches nothing; succeeds only when the `roblox` feature is enabled.
        // Used as a guard in the number rules below.
        rule roblox()
            = {? if cfg!(feature = "roblox") {
                Ok(())
            } else {
                Err("roblox not enabled")
            }}

        // Binary literal (`0b`/`0B` then `0`, `1` or `_`); roblox feature only.
        rule roblox_number() -> &'input str
            = roblox() n:$(("0b"/"0B") ['0'|'1'|'_']+) {n}

        // Hexadecimal literal (`0x`/`0X` then hex digits). `_` is accepted among
        // the digits only with the roblox feature.
        rule hex_number() -> &'input str
            = roblox() n:$(("0x"/"0X") ['0'..='9'|'a'..='f'|'A'..='F'|'_']+) {n}
            / !roblox() n:$(("0x"/"0X") ['0'..='9'|'a'..='f'|'A'..='F']+) {n}

        // A run of decimal digits. With the roblox feature the first character
        // must be a digit and later ones may also be `_`.
        rule digit_with_separator() -> &'input str
            = roblox() n:$(['0'..='9'] ['0'..='9'|'_']*) {n}
            / !roblox() n:$(['0'..='9']+) {n}

        // Decimal number with an integer part: digits, an optional `.` with
        // optional fractional digits, and an optional exponent.
        rule basic_number() -> &'input str
            = $(
                digit_with_separator()
                ("." digit_with_separator()?)?
                (['e'|'E'] ['-'|'+']? digit_with_separator())?
            )

        // Decimal number without an integer part, such as `.5`, with an
        // optional exponent.
        rule no_int_fractional_number() -> &'input str
            = $(
                "." digit_with_separator()
                (['e'|'E'] ['-'|'+']? digit_with_separator())?
            )

        // Any number literal; the matched text is kept verbatim.
        pub(super) rule number() -> RawToken
            = n:(
                roblox_number()
              / hex_number()
              / basic_number()
              / no_int_fractional_number()
            ) { TokenType::Number { text:n.into() }.into() }

        // A symbol, as recognised by the input's `parse_symbol` method
        // (from `super::ParseSymbol`, defined outside this block).
        pub(super) rule symbol() -> RawToken = symbol:##parse_symbol() { TokenType::Symbol{symbol}.into() }

        // A single token. Order matters: comments are tried before symbols,
        // numbers before symbols, and a `#!` in token position is reported as
        // an unexpected shebang before `#` can match as a symbol.
        rule token() -> RawToken
            = whitespace()
            / comment()
            / number()
            / string_literal()
            / "#!" { TokenizerErrorType::UnexpectedShebang.into() }
            / symbol()
            / identifier()

        // The whole input: an optional shebang line followed by any number of
        // tokens. Each raw token is paired with the offset reported by
        // `position!()` right after it, i.e. where that token ends.
        pub(crate) rule tokens() -> Vec<(RawToken, usize)>
            = shebang:(shebang:shebang() pos:position!() {(shebang,pos)})?
              body:( token:token() pos:position!() {(token,pos)})*
              {
                  let mut body = body;
                  if let Some(shebang) = shebang {
                      body.insert(0, shebang)
                  }
                  body
              }
    }
}
782 
/// Information about an error that occurs while tokenizing: what went wrong
/// and where. Displayed as `<description> at line <line>, column <character>`.
#[derive(Clone, Debug, PartialEq)]
#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))]
pub struct TokenizerError {
    /// The type of error
    error: TokenizerErrorType,
    /// The position of the token that caused the error
    position: Position,
}
792 
impl TokenizerError {
    /// The type of error
    pub fn error(&self) -> &TokenizerErrorType {
        &self.error
    }

    /// The position of the token that caused the error
    pub fn position(&self) -> Position {
        self.position
    }
}
804 
805 impl fmt::Display for TokenizerError {
fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result806     fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
807         write!(
808             formatter,
809             "{} at line {}, column {}",
810             match &self.error {
811                 TokenizerErrorType::UnclosedComment => "unclosed comment".to_string(),
812                 TokenizerErrorType::UnclosedString => "unclosed string".to_string(),
813                 TokenizerErrorType::UnexpectedShebang => "unexpected shebang".to_string(),
814                 TokenizerErrorType::UnexpectedToken(character) => {
815                     format!("unexpected character {}", character)
816                 }
817                 TokenizerErrorType::InvalidSymbol(symbol) => {
818                     format!("invalid symbol {}", symbol)
819                 }
820             },
821             self.position.line,
822             self.position.character,
823         )
824     }
825 }
826 
827 impl std::error::Error for TokenizerError {}
828 
829 impl From<peg::str::LineCol> for Position {
from(location: peg::str::LineCol) -> Position830     fn from(location: peg::str::LineCol) -> Position {
831         Position {
832             bytes: location.offset,
833             line: location.line,
834             character: location.column,
835         }
836     }
837 }
838 
// Accumulates finished tokens, in the order they are pushed.
struct TokenCollector {
    // Tokens collected so far; `finish` appends the `Eof` token and returns it.
    result: Vec<Token>,
}
842 
843 // Collector
844 impl TokenCollector {
new() -> Self845     fn new() -> Self {
846         Self { result: Vec::new() }
847     }
push( &mut self, start_position: Position, raw_token: RawToken, end_position: Position, ) -> Result<(), TokenizerError>848     fn push(
849         &mut self,
850         start_position: Position,
851         raw_token: RawToken,
852         end_position: Position,
853     ) -> Result<(), TokenizerError> {
854         match raw_token {
855             Ok(token_type) => {
856                 self.result.push(Token {
857                     start_position,
858                     end_position,
859                     token_type,
860                 });
861                 Ok(())
862             }
863             Err(error) => Err(TokenizerError {
864                 error,
865                 position: start_position,
866             }),
867         }
868     }
finish(mut self, eof_position: Position) -> Vec<Token>869     fn finish(mut self, eof_position: Position) -> Vec<Token> {
870         self.result.push(Token {
871             start_position: eof_position,
872             end_position: eof_position,
873             token_type: TokenType::Eof,
874         });
875         self.result
876     }
877 }
878 
from_parser_error( code: &'_ str, ) -> impl Fn(peg::error::ParseError<peg::str::LineCol>) -> TokenizerError + '_879 fn from_parser_error(
880     code: &'_ str,
881 ) -> impl Fn(peg::error::ParseError<peg::str::LineCol>) -> TokenizerError + '_ {
882     move |err| TokenizerError {
883         error: TokenizerErrorType::UnexpectedToken(
884             code[err.location.offset..].chars().next().expect(
885                 "(internal full-moon error) Text overflow while giving unexpected token error",
886             ),
887         ),
888         position: err.location.into(),
889     }
890 }
891 
892 /// Returns a list of tokens.
893 /// You probably want [`parse`](crate::parse) instead.
894 ///
895 /// # Errors
896 ///
897 /// If the code passed is malformed from normal Lua expectations,
898 /// a [`TokenizerError`] will be returned.
899 ///
900 /// ```rust
901 /// # use full_moon::tokenizer::tokens;
902 /// assert!(tokens("local x = 1").is_ok());
903 /// assert!(tokens("local 4 = end").is_ok()); // tokens does *not* check validity of code, only tokenizing
904 /// assert!(tokens("--[[ Unclosed comment!").is_err());
905 /// ```
tokens(code: &str) -> Result<Vec<Token>, TokenizerError>906 pub fn tokens(code: &str) -> Result<Vec<Token>, TokenizerError> {
907     let mut tokens = TokenCollector::new();
908 
909     let mut raw_tokens = tokens::tokens(code).map_err(from_parser_error(code))?;
910 
911     // rust-peg lets us easily get the offset associated with
912     // (the end of) each token, but not the line or column
913     // information. We iterate over the characters to match
914     // up the tokens with the row/column information.
915     let mut raw_tokens = raw_tokens.drain(..);
916 
917     let mut position = Position {
918         bytes: 0,
919         character: 1,
920         line: 1,
921     };
922     let mut next_is_new_line = false;
923     let mut start_position = position;
924     if let Some((mut token_type, mut token_offset)) = raw_tokens.next() {
925         for character in code.chars() {
926             if character == '\n' {
927                 next_is_new_line = true;
928             } else {
929                 position.character += 1;
930             }
931 
932             position.bytes += character.len_utf8();
933 
934             let end_position = position;
935 
936             if next_is_new_line {
937                 next_is_new_line = false;
938                 position.line += 1;
939                 position.character = 1;
940             }
941 
942             if token_offset == end_position.bytes {
943                 tokens.push(start_position, token_type, end_position)?;
944                 start_position = position;
945                 if let Some((next_token_type, next_token_offset)) = raw_tokens.next() {
946                     token_type = next_token_type;
947                     token_offset = next_token_offset;
948                 } else {
949                     break;
950                 }
951             }
952         }
953     }
954 
955     if let Some((token_type, token_offset)) = raw_tokens.next() {
956         panic!("(internal full-moon error) Found token {:?} with offset {:?} which is past the end of source", token_type, token_offset);
957     }
958 
959     Ok(tokens.finish(position))
960 }
961 
#[cfg(test)]
mod tests {
    use crate::tokenizer::*;
    use pretty_assertions::assert_eq;

    // Two forms are accepted:
    //
    // * `test_rule!(rule("code"), expected)` runs the named grammar rule on
    //   the code, compares its outcome with `expected`, then falls through
    //   to the second form.
    // * `test_rule!("code", expected)` runs the full tokenizer. For an `Ok`
    //   expectation the first token must match. For an `Err` expectation the
    //   error is only compared when tokenizing actually fails.
    macro_rules! test_rule {
        ($rule:ident($code:expr), $result:expr) => {
            let code: &str = $code;
            let expected: RawToken = $result.into();

            let rule_outcome = tokens::$rule(code)
                .map_err(|err| from_parser_error(code)(err).error)
                .and_then(|inner| inner);

            assert_eq!(rule_outcome, expected);
            test_rule!(code, expected)
        };
        ($code:expr, $result:expr) => {
            let code: &str = $code;
            let expected: RawToken = $result.into();

            match expected {
                Ok(expected_type) => {
                    let tokenized = tokens(code).expect("couldn't tokenize");
                    let first_token = tokenized.first().expect("tokenized response is empty");
                    assert_eq!(*first_token.token_type(), expected_type);
                }

                Err(expected_error) => {
                    // A successful tokenization is tolerated here: a rule can
                    // reject input that the tokenizer as a whole accepts.
                    if let Err(TokenizerError { error, .. }) = tokens(code) {
                        assert_eq!(error, expected_error);
                    }
                }
            };
        };
    }

    /// Single-line comments, multi-line comments with and without `=`
    /// blocks, and an empty comment.
    #[test]
    fn test_rule_comment() {
        test_rule!(
            comment("-- hello world"),
            TokenType::SingleLineComment {
                comment: " hello world".into()
            }
        );

        test_rule!(
            comment("--[[ hello world ]]"),
            TokenType::MultiLineComment {
                blocks: 0,
                comment: " hello world ".into()
            }
        );

        test_rule!(
            comment("--[=[ hello world ]=]"),
            TokenType::MultiLineComment {
                blocks: 1,
                comment: " hello world ".into()
            }
        );
        test_rule!(
            comment("--"),
            TokenType::SingleLineComment { comment: "".into() }
        );
    }

    /// Integer and decimal number literals keep their source text.
    #[test]
    fn test_rule_numbers() {
        test_rule!(number("213"), TokenType::Number { text: "213".into() });

        test_rule!(number("1"), TokenType::Number { text: "1".into() });

        test_rule!(
            number("123.45"),
            TokenType::Number {
                text: "123.45".into(),
            }
        );
    }

    /// Binary literals; ignored unless the `roblox` feature is enabled.
    #[test]
    #[cfg_attr(not(feature = "roblox"), ignore)]
    fn test_rule_binary_literals() {
        test_rule!(
            number("0b101"),
            TokenType::Number {
                text: "0b101".into(),
            }
        );
    }

    /// Identifiers, including trailing underscores, and rejection of a
    /// leading digit by the `identifier` rule.
    #[test]
    fn test_rule_identifier() {
        test_rule!(
            identifier("hello"),
            TokenType::Identifier {
                identifier: "hello".into(),
            }
        );

        test_rule!(
            "hello world",
            TokenType::Identifier {
                identifier: "hello".into(),
            }
        );

        test_rule!(
            identifier("hello___"),
            TokenType::Identifier {
                identifier: "hello___".into(),
            }
        );

        test_rule!(identifier("123"), TokenizerErrorType::UnexpectedToken('1'));
    }

    /// A keyword passed to the `identifier` rule comes back as a symbol.
    #[test]
    fn test_rule_symbols() {
        test_rule!(
            identifier("local"),
            TokenType::Symbol {
                symbol: Symbol::Local
            }
        );
    }

    /// A whitespace token ends after its first newline.
    #[test]
    fn test_rule_whitespace() {
        test_rule!(
            "\t  \n\t",
            TokenType::Whitespace {
                characters: "\t  \n".into(),
            }
        );

        test_rule!(
            "\thello",
            TokenType::Whitespace {
                characters: "\t".into(),
            }
        );

        test_rule!(
            "\t\t\nhello",
            TokenType::Whitespace {
                characters: "\t\t\n".into(),
            }
        );

        test_rule!(
            "\n\thello",
            TokenType::Whitespace {
                characters: "\n".into(),
            }
        );
    }

    /// Double-quoted strings, an escaped newline inside a string, and an
    /// unclosed string.
    #[test]
    fn test_rule_string_literal() {
        test_rule!(
            string_literal("\"hello\""),
            TokenType::StringLiteral {
                literal: "hello".into(),
                multi_line: None,
                quote_type: StringLiteralQuoteType::Double,
            }
        );

        test_rule!(
            string_literal("\"hello\\\nworld\""),
            TokenType::StringLiteral {
                literal: "hello\\\nworld".into(),
                multi_line: None,
                quote_type: StringLiteralQuoteType::Double,
            }
        );

        test_rule!(
            string_literal("\"hello"),
            TokenizerErrorType::UnclosedString
        );
    }

    /// Longer matches win over symbols that are a prefix of them.
    #[test]
    fn test_symbols_within_symbols() {
        // "index" should not return "in"
        test_rule!(
            identifier("index"),
            TokenType::Identifier {
                identifier: "index".into()
            }
        );

        // "<=" should not return "<"
        test_rule!(
            symbol("<="),
            TokenType::Symbol {
                symbol: Symbol::LessThanEqual,
            }
        );
    }

    /// A shebang line is recognised by the `shebang` rule, but not after
    /// leading whitespace.
    #[test]
    fn test_rule_shebang() {
        test_rule!(
            shebang("#!/usr/bin/env lua\n"),
            TokenType::Shebang {
                line: "#!/usr/bin/env lua\n".into()
            }
        );
        // Don't recognize with a whitespace.
        test_rule!(
            " #!/usr/bin/env lua\n",
            TokenizerErrorType::UnexpectedShebang
        );
    }

    /// A lone newline token ends on the same line and column it started on.
    #[test]
    fn test_new_line_on_same_line() {
        let tokenized = tokens("\n").unwrap();

        let expected = Token {
            start_position: Position {
                bytes: 0,
                character: 1,
                line: 1,
            },

            end_position: Position {
                bytes: 1,
                character: 1,
                line: 1,
            },

            token_type: TokenType::Whitespace {
                characters: "\n".into(),
            },
        };

        assert_eq!(tokenized[0], expected);
    }

    /// Inputs containing non-ASCII characters; the results are discarded,
    /// so this only checks that tokenizing them does not panic.
    #[test]
    fn test_fuzzer() {
        let _ = tokens("*ա");
        let _ = tokens("̹(");
        let _ = tokens("¹;");
    }
}
1213