1 use crate::{
2 visitors::{Visit, VisitMut, Visitor, VisitorMut},
3 ShortString,
4 };
5
6 use full_moon_derive::symbols;
7 #[cfg(feature = "serde")]
8 use serde::{Deserialize, Serialize};
9 use std::{cmp::Ordering, fmt, str::FromStr};
10
// Table of every fixed-text token: reserved words first, then operators and punctuation.
// NOTE(review): operators that share a prefix are listed longest-first here
// (`..=` / `...` / `..` / `.`, `==` / `=`, `<=` / `<`). Presumably the matcher generated
// by `symbols!` tries entries in order, so keep that ordering when adding entries --
// confirm against the macro implementation.
symbols!(
    And => "and",
    Break => "break",
    Do => "do",
    ElseIf => "elseif",
    Else => "else",
    End => "end",
    False => "false",
    For => "for",
    Function => "function",
    If => "if",
    In => "in",
    Local => "local",
    Nil => "nil",
    Not => "not",
    Or => "or",
    Repeat => "repeat",
    Return => "return",
    Then => "then",
    True => "true",
    Until => "until",
    While => "while",
    // TODO: This is only valid in Lua 5.2
    Goto => "goto",

    // TODO: This is only valid in Roblox
    PlusEqual => "+=",
    MinusEqual => "-=",
    StarEqual => "*=",
    SlashEqual => "/=",
    PercentEqual => "%=",
    CaretEqual => "^=",
    TwoDotsEqual => "..=",
    // TODO: This is only valid in Roblox
    Ampersand => "&",
    // TODO: This is only valid in Roblox
    ThinArrow => "->",
    // TODO: This is only valid in Roblox and Lua 5.2
    TwoColons => "::",
    Caret => "^",
    Colon => ":",
    Comma => ",",
    Ellipse => "...",
    TwoDots => "..",
    Dot => ".",
    TwoEqual => "==",
    Equal => "=",
    GreaterThanEqual => ">=",
    GreaterThan => ">",
    Hash => "#",
    LeftBrace => "{",
    LeftBracket => "[",
    LeftParen => "(",
    LessThanEqual => "<=",
    LessThan => "<",
    Minus => "-",
    Percent => "%",
    // TODO: This is only valid in Roblox
    Pipe => "|",
    Plus => "+",
    // TODO: This is only valid in Roblox
    QuestionMark => "?",
    RightBrace => "}",
    RightBracket => "]",
    RightParen => ")",
    Semicolon => ";",
    Slash => "/",
    Star => "*",
    TildeEqual => "~=",
);
81
/// The possible errors that can happen while tokenizing.
#[derive(Clone, Debug, PartialEq)]
#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))]
pub enum TokenizerErrorType {
    /// A multi-line comment was opened but never closed
    UnclosedComment,
    /// A string was opened but never closed
    UnclosedString,
    /// A `#!` was found somewhere a shebang line is not recognized
    UnexpectedShebang,
    /// An unexpected character was found; carries that character
    UnexpectedToken(char),
    /// The text passed is not a valid symbol; carries the rejected text.
    /// Returned from [`TokenReference::symbol`]
    InvalidSymbol(String),
}
98
/// The type of a token in parsed code, together with the text needed to reproduce it.
/// Use [`TokenKind`] when only the category of the token matters.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))]
#[cfg_attr(feature = "serde", serde(tag = "type"))]
#[non_exhaustive]
pub enum TokenType {
    /// End of file, should always be the very last token
    Eof,

    /// An identifier, such as `foo`
    Identifier {
        /// The identifier itself
        identifier: ShortString,
    },

    /// A multi line comment in the format of `--[[ comment ]]`
    MultiLineComment {
        /// Number of equals signs, if any, for the multi line comment.
        /// For example, `--[=[` would have a `blocks` value of `1`
        blocks: usize,
        /// The comment itself, ignoring opening and closing tags
        comment: ShortString,
    },

    /// A literal number, such as `3.3`
    Number {
        /// The text representing the number, includes details such as `0x`
        text: ShortString,
    },

    /// A shebang line
    Shebang {
        /// The shebang line itself
        line: ShortString,
    },

    /// A single line comment, such as `-- comment`
    SingleLineComment {
        /// The comment, ignoring initial `--`
        comment: ShortString,
    },

    /// A literal string, such as "Hello, world"
    StringLiteral {
        /// The literal itself, ignoring quotation marks
        literal: ShortString,
        #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
        /// Number of equals signs used for a multi line string, if it is one.
        /// For example, `[=[string]=]` would have a `multi_line` value of Some(1),
        /// `[[string]]` would have a `multi_line` value of Some(0), and
        /// a string such as `"string"` would have a `multi_line` value of None
        multi_line: Option<usize>,
        /// The type of quotation mark used to make the string
        quote_type: StringLiteralQuoteType,
    },

    /// A [`Symbol`], such as `local` or `+`
    Symbol {
        /// The symbol itself
        symbol: Symbol,
    },

    /// Whitespace, such as tabs or new lines
    Whitespace {
        /// The characters that make up the whitespace
        characters: ShortString,
    },
}
167
168 impl TokenType {
169 /// Returns whether a token can be practically ignored in most cases
170 /// Comments and whitespace will return `true`, everything else will return `false`
is_trivia(&self) -> bool171 pub fn is_trivia(&self) -> bool {
172 matches!(
173 self,
174 TokenType::Shebang { .. }
175 | TokenType::SingleLineComment { .. }
176 | TokenType::MultiLineComment { .. }
177 | TokenType::Whitespace { .. }
178 )
179 }
180
181 /// Returns the kind of the token type.
182 ///
183 /// ```rust
184 /// use full_moon::{ShortString, tokenizer::{TokenKind, TokenType}};
185 ///
186 /// assert_eq!(
187 /// TokenType::Identifier {
188 /// identifier: ShortString::new("hello")
189 /// }.kind(),
190 /// TokenKind::Identifier,
191 /// );
192 /// ```
kind(&self) -> TokenKind193 pub fn kind(&self) -> TokenKind {
194 match self {
195 TokenType::Eof => TokenKind::Eof,
196 TokenType::Identifier { .. } => TokenKind::Identifier,
197 TokenType::MultiLineComment { .. } => TokenKind::MultiLineComment,
198 TokenType::Number { .. } => TokenKind::Number,
199 TokenType::Shebang { .. } => TokenKind::Shebang,
200 TokenType::SingleLineComment { .. } => TokenKind::SingleLineComment,
201 TokenType::StringLiteral { .. } => TokenKind::StringLiteral,
202 TokenType::Symbol { .. } => TokenKind::Symbol,
203 TokenType::Whitespace { .. } => TokenKind::Whitespace,
204 }
205 }
206
207 /// Returns a whitespace `TokenType` consisting of spaces
spaces(spaces: usize) -> Self208 pub fn spaces(spaces: usize) -> Self {
209 TokenType::Whitespace {
210 characters: " ".repeat(spaces).into(),
211 }
212 }
213
214 /// Returns a whitespace `TokenType` consisting of tabs
tabs(tabs: usize) -> Self215 pub fn tabs(tabs: usize) -> Self {
216 TokenType::Whitespace {
217 characters: "\t".repeat(tabs).into(),
218 }
219 }
220 }
221
/// The kind of token. Contains no additional data.
/// Has one variant for each variant of [`TokenType`].
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
#[non_exhaustive]
pub enum TokenKind {
    /// End of file, should always be the very last token
    Eof,
    /// An identifier, such as `foo`
    Identifier,
    /// A multi line comment in the format of `--[[ comment ]]`
    MultiLineComment,
    /// A literal number, such as `3.3`
    Number,
    /// The shebang line
    Shebang,
    /// A single line comment, such as `-- comment`
    SingleLineComment,
    /// A literal string, such as "Hello, world"
    StringLiteral,
    /// A [`Symbol`], such as `local` or `+`
    Symbol,
    /// Whitespace, such as tabs or new lines
    Whitespace,
}
245
/// A token, consisting of its start and end [`Position`] and a [`TokenType`]
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))]
pub struct Token {
    /// Where the token begins
    pub(crate) start_position: Position,
    /// Where the token ends
    pub(crate) end_position: Position,
    /// What the token is, plus the text needed to print it back out
    pub(crate) token_type: TokenType,
}
254
255 impl Token {
256 /// Creates a token with a zero position
new(token_type: TokenType) -> Token257 pub fn new(token_type: TokenType) -> Token {
258 Token {
259 start_position: Position::default(),
260 end_position: Position::default(),
261 token_type,
262 }
263 }
264
265 /// The position a token begins at
start_position(&self) -> Position266 pub fn start_position(&self) -> Position {
267 self.start_position
268 }
269
270 /// The position a token ends at
end_position(&self) -> Position271 pub fn end_position(&self) -> Position {
272 self.end_position
273 }
274
275 /// The type of token as well as the data needed to represent it
276 /// If you don't need any other information, use [`token_kind`](Token::token_kind) instead.
token_type(&self) -> &TokenType277 pub fn token_type(&self) -> &TokenType {
278 &self.token_type
279 }
280
281 /// The kind of token with no additional data.
282 /// If you need any information such as idenitfier names, use [`token_type`](Token::token_type) instead.
token_kind(&self) -> TokenKind283 pub fn token_kind(&self) -> TokenKind {
284 self.token_type().kind()
285 }
286 }
287
288 impl fmt::Display for Token {
fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result289 fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
290 use self::TokenType::*;
291
292 match &*self.token_type() {
293 Eof => "".to_string(),
294 Number { text } => text.to_string(),
295 Identifier { identifier } => identifier.to_string(),
296 MultiLineComment { blocks, comment } => {
297 format!("--[{0}[{1}]{0}]", "=".repeat(*blocks), comment)
298 }
299 Shebang { line } => line.to_string(),
300 SingleLineComment { comment } => format!("--{}", comment),
301 StringLiteral {
302 literal,
303 multi_line,
304 quote_type,
305 } => {
306 if let Some(blocks) = multi_line {
307 format!("[{0}[{1}]{0}]", "=".repeat(*blocks), literal.to_string())
308 } else {
309 format!("{0}{1}{0}", quote_type.to_string(), literal.to_string())
310 }
311 }
312 Symbol { symbol } => symbol.to_string(),
313 Whitespace { characters } => characters.to_string(),
314 }
315 .fmt(formatter)
316 }
317 }
318
319 impl PartialEq<Self> for Token {
eq(&self, rhs: &Self) -> bool320 fn eq(&self, rhs: &Self) -> bool {
321 self.start_position() == rhs.start_position()
322 && self.end_position() == rhs.end_position()
323 && self.token_type == rhs.token_type
324 }
325 }
326
// Marker impl: `Token` equality (see the `PartialEq` impl) is a full equivalence relation.
impl Eq for Token {}
328
329 impl Ord for Token {
cmp(&self, other: &Self) -> Ordering330 fn cmp(&self, other: &Self) -> Ordering {
331 self.start_position().cmp(&other.start_position())
332 }
333 }
334
335 impl PartialOrd for Token {
partial_cmp(&self, other: &Self) -> Option<Ordering>336 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
337 Some(self.cmp(other))
338 }
339 }
340
341 impl Visit for Token {
visit<V: Visitor>(&self, visitor: &mut V)342 fn visit<V: Visitor>(&self, visitor: &mut V) {
343 visitor.visit_token(self);
344
345 match self.token_kind() {
346 TokenKind::Eof => {}
347 TokenKind::Identifier => visitor.visit_identifier(self),
348 TokenKind::MultiLineComment => visitor.visit_multi_line_comment(self),
349 TokenKind::Number => visitor.visit_number(self),
350 TokenKind::Shebang => {}
351 TokenKind::SingleLineComment => visitor.visit_single_line_comment(self),
352 TokenKind::StringLiteral => visitor.visit_string_literal(self),
353 TokenKind::Symbol => visitor.visit_symbol(self),
354 TokenKind::Whitespace => visitor.visit_whitespace(self),
355 }
356 }
357 }
358
359 impl VisitMut for Token {
visit_mut<V: VisitorMut>(self, visitor: &mut V) -> Self360 fn visit_mut<V: VisitorMut>(self, visitor: &mut V) -> Self {
361 let token = visitor.visit_token(self);
362
363 match token.token_kind() {
364 TokenKind::Eof => token,
365 TokenKind::Identifier => visitor.visit_identifier(token),
366 TokenKind::MultiLineComment => visitor.visit_multi_line_comment(token),
367 TokenKind::Number => visitor.visit_number(token),
368 TokenKind::Shebang => token,
369 TokenKind::SingleLineComment => visitor.visit_single_line_comment(token),
370 TokenKind::StringLiteral => visitor.visit_string_literal(token),
371 TokenKind::Symbol => visitor.visit_symbol(token),
372 TokenKind::Whitespace => visitor.visit_whitespace(token),
373 }
374 }
375 }
376
/// A reference to a token used by Ast's: the token itself plus the trivia around it.
/// Dereferences to a [`Token`]
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))]
pub struct TokenReference {
    /// Trivia tokens that come before `token`
    pub(crate) leading_trivia: Vec<Token>,
    /// The token this reference stands for
    pub(crate) token: Token,
    /// Trivia tokens that come after `token`
    pub(crate) trailing_trivia: Vec<Token>,
}
386
387 impl TokenReference {
388 /// Creates a TokenReference from leading/trailing trivia as well as the leading token
new(leading_trivia: Vec<Token>, token: Token, trailing_trivia: Vec<Token>) -> Self389 pub fn new(leading_trivia: Vec<Token>, token: Token, trailing_trivia: Vec<Token>) -> Self {
390 Self {
391 leading_trivia,
392 token,
393 trailing_trivia,
394 }
395 }
396
397 /// Returns a symbol with the leading and trailing whitespace
398 /// Only whitespace is supported
399 /// ```rust
400 /// # use full_moon::tokenizer::{Symbol, TokenReference, TokenType, TokenizerErrorType};
401 /// # fn main() -> Result<(), Box<TokenizerErrorType>> {
402 /// let symbol = TokenReference::symbol("\nreturn ")?;
403 /// assert_eq!(symbol.leading_trivia().next().unwrap().to_string(), "\n");
404 /// assert_eq!(symbol.token().token_type(), &TokenType::Symbol {
405 /// symbol: Symbol::Return,
406 /// });
407 /// assert_eq!(symbol.trailing_trivia().next().unwrap().to_string(), " ");
408 /// assert!(TokenReference::symbol("isnt whitespace").is_err());
409 /// assert!(TokenReference::symbol(" notasymbol ").is_err());
410 /// # Ok(())
411 /// # }
412 /// ```
symbol(text: &str) -> Result<Self, TokenizerErrorType>413 pub fn symbol(text: &str) -> Result<Self, TokenizerErrorType> {
414 let mut chars = text.chars().peekable();
415
416 let mut leading_trivia = String::new();
417 while let Some(character) = chars.peek() {
418 if character.is_ascii_whitespace() {
419 leading_trivia.push(chars.next().unwrap());
420 } else {
421 break;
422 }
423 }
424
425 let mut symbol_text = String::new();
426 while let Some(character) = chars.peek() {
427 if !character.is_ascii_whitespace() {
428 symbol_text.push(chars.next().unwrap());
429 } else {
430 break;
431 }
432 }
433
434 let symbol = Symbol::from_str(&symbol_text)
435 .map_err(|_| TokenizerErrorType::InvalidSymbol(symbol_text))?;
436
437 let mut trailing_trivia = String::new();
438 while let Some(character) = chars.peek() {
439 if character.is_ascii_whitespace() {
440 trailing_trivia.push(chars.next().unwrap());
441 } else {
442 return Err(TokenizerErrorType::UnexpectedToken(*character));
443 }
444 }
445
446 Ok(Self {
447 leading_trivia: vec![Token::new(TokenType::Whitespace {
448 characters: leading_trivia.into(),
449 })],
450 token: Token::new(TokenType::Symbol { symbol }),
451 trailing_trivia: vec![Token::new(TokenType::Whitespace {
452 characters: trailing_trivia.into(),
453 })],
454 })
455 }
456
457 /// Returns the inner token.
token(&self) -> &Token458 pub fn token(&self) -> &Token {
459 &self.token
460 }
461
462 /// Returns the leading trivia
leading_trivia(&self) -> impl Iterator<Item = &Token>463 pub fn leading_trivia(&self) -> impl Iterator<Item = &Token> {
464 self.leading_trivia.iter()
465 }
466
467 /// Returns the trailing trivia
trailing_trivia(&self) -> impl Iterator<Item = &Token>468 pub fn trailing_trivia(&self) -> impl Iterator<Item = &Token> {
469 self.trailing_trivia.iter()
470 }
471
472 /// Creates a clone of the current TokenReference with the new inner token, preserving trivia.
with_token(&self, token: Token) -> Self473 pub fn with_token(&self, token: Token) -> Self {
474 Self {
475 token,
476 leading_trivia: self.leading_trivia.clone(),
477 trailing_trivia: self.trailing_trivia.clone(),
478 }
479 }
480 }
481
482 impl std::borrow::Borrow<Token> for &TokenReference {
borrow(&self) -> &Token483 fn borrow(&self) -> &Token {
484 &**self
485 }
486 }
487
488 impl std::ops::Deref for TokenReference {
489 type Target = Token;
490
deref(&self) -> &Self::Target491 fn deref(&self) -> &Self::Target {
492 &self.token
493 }
494 }
495
496 impl fmt::Display for TokenReference {
fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result497 fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
498 for trivia in &self.leading_trivia {
499 formatter.write_str(&trivia.to_string())?;
500 }
501
502 formatter.write_str(&self.token.to_string())?;
503
504 for trivia in &self.trailing_trivia {
505 formatter.write_str(&trivia.to_string())?;
506 }
507
508 Ok(())
509 }
510 }
511
512 impl PartialEq<Self> for TokenReference {
eq(&self, other: &Self) -> bool513 fn eq(&self, other: &Self) -> bool {
514 (**self).eq(other)
515 && self.leading_trivia == other.leading_trivia
516 && self.trailing_trivia == other.trailing_trivia
517 }
518 }
519
// Marker impl: `TokenReference` equality (see the `PartialEq` impl) is a full equivalence relation.
impl Eq for TokenReference {}
521
522 impl Ord for TokenReference {
cmp(&self, other: &Self) -> Ordering523 fn cmp(&self, other: &Self) -> Ordering {
524 (**self).cmp(&**other)
525 }
526 }
527
528 impl PartialOrd for TokenReference {
partial_cmp(&self, other: &Self) -> Option<Ordering>529 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
530 Some(self.cmp(other))
531 }
532 }
533
534 impl Visit for TokenReference {
visit<V: Visitor>(&self, visitor: &mut V)535 fn visit<V: Visitor>(&self, visitor: &mut V) {
536 visitor.visit_token(self);
537
538 if matches!(self.token().token_kind(), TokenKind::Eof) {
539 visitor.visit_eof(self);
540 }
541
542 self.leading_trivia.visit(visitor);
543 self.token.visit(visitor);
544 self.trailing_trivia.visit(visitor);
545 }
546 }
547
548 impl VisitMut for TokenReference {
visit_mut<V: VisitorMut>(self, visitor: &mut V) -> Self549 fn visit_mut<V: VisitorMut>(self, visitor: &mut V) -> Self {
550 let mut token_reference = visitor.visit_token_reference(self);
551
552 if matches!(token_reference.token().token_kind(), TokenKind::Eof) {
553 token_reference = visitor.visit_eof(token_reference);
554 }
555
556 token_reference.leading_trivia = token_reference.leading_trivia.visit_mut(visitor);
557 token_reference.token = token_reference.token.visit_mut(visitor);
558 token_reference.trailing_trivia = token_reference.trailing_trivia.visit_mut(visitor);
559 token_reference
560 }
561 }
562
/// Used to represent exact positions of tokens in code.
///
/// Positions produced by [`tokens`] start counting at byte 0, line 1, character 1.
/// The `Default` position has all three fields set to zero.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))]
pub struct Position {
    /// Byte offset from the start of the source
    pub(crate) bytes: usize,
    /// Line the position lies on
    pub(crate) line: usize,
    /// Index of the character within its line
    pub(crate) character: usize,
}
571
572 impl Position {
573 /// How many bytes, ignoring lines, it would take to find this position
bytes(self) -> usize574 pub fn bytes(self) -> usize {
575 self.bytes
576 }
577
578 /// Index of the character on the line for this position
character(self) -> usize579 pub fn character(self) -> usize {
580 self.character
581 }
582
583 /// Line the position lies on
line(self) -> usize584 pub fn line(self) -> usize {
585 self.line
586 }
587 }
588
589 impl Ord for Position {
cmp(&self, other: &Self) -> Ordering590 fn cmp(&self, other: &Self) -> Ordering {
591 self.bytes.cmp(&other.bytes)
592 }
593 }
594
595 impl PartialOrd for Position {
partial_cmp(&self, other: &Self) -> Option<Ordering>596 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
597 Some(self.cmp(other))
598 }
599 }
600
// Pairs a token type with an advance count.
// NOTE(review): nothing in the visible part of this file uses this type -- check whether
// it is still needed.
#[derive(Clone, Debug, PartialEq)]
struct TokenAdvancement {
    pub advance: usize,
    pub token_type: TokenType,
}
606
/// The types of quotes used in a Lua string
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))]
#[non_exhaustive]
pub enum StringLiteralQuoteType {
    /// Strings formatted \[\[with brackets\]\].
    /// This variant has no `Display` output; formatting it panics.
    Brackets,
    /// Strings formatted "with double quotes"
    Double,
    /// Strings formatted 'with single quotes'
    Single,
}
619
620 impl fmt::Display for StringLiteralQuoteType {
fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result621 fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
622 match *self {
623 StringLiteralQuoteType::Brackets => unreachable!(),
624 StringLiteralQuoteType::Double => "\"",
625 StringLiteralQuoteType::Single => "'",
626 }
627 .fmt(formatter)
628 }
629 }
630
// What each grammar rule produces: a token type, or the error found in its place.
type RawToken = Result<TokenType, TokenizerErrorType>;
632
633 impl From<TokenType> for RawToken {
from(token_type: TokenType) -> RawToken634 fn from(token_type: TokenType) -> RawToken {
635 Ok(token_type)
636 }
637 }
638
639 impl From<TokenizerErrorType> for RawToken {
from(error: TokenizerErrorType) -> RawToken640 fn from(error: TokenizerErrorType) -> RawToken {
641 Err(error)
642 }
643 }
644
// The PEG grammar that splits source text into raw tokens.
// Alternatives separated by `/` are ordered choices, so their order is significant.
// Several rules report a problem by *succeeding* with an `Err` RawToken rather than by
// failing to parse; `TokenCollector::push` later turns that into a `TokenizerError`.
peg::parser! {
    grammar tokens() for str {
        use super::ParseSymbol;
        use peg::ParseLiteral;
        use super::StringLiteralQuoteType as QuoteType;

        // A line feed, or a carriage return followed by a line feed.
        rule line_ending()
            = "\n" / "\r\n"
        // A single space or tab.
        rule space()
            = [' '|'\t']

        // Either a run of spaces/tabs optionally followed by ONE line ending, or a lone
        // line ending. A whitespace token therefore never continues past a line ending.
        pub(super) rule whitespace() -> RawToken
            = chars:$( space()+ line_ending()? / line_ending() )
              { TokenType::Whitespace { characters:chars.into() }.into() }

        // Opening long bracket `[`, zero or more `=`, `[`. Returns the run of `=`.
        rule multi_line_start() -> &'input str
            = "[" block:$("="*) "[" {block}

        // Closing long bracket carrying exactly the same run of `=` as the opener.
        rule multi_line_end(block: &'input str)
            = "]" ##parse_string_literal(block) "]"

        // A complete long-bracket block. Returns (number of `=`, text between the brackets).
        rule multi_line_block() -> (usize, &'input str)
            = block:multi_line_start()
              content:$((!multi_line_end(block) [_])*)
              multi_line_end(block)
              { (block.len(), content) }

        // A long-bracket string. If an opener is present but no matching closer was found,
        // the second alternative consumes the rest of the input and yields UnclosedString.
        rule multi_line_quote() -> RawToken
            = v:multi_line_block() { TokenType::StringLiteral {
                multi_line: Some(v.0),
                literal:v.1.into(),
                quote_type: QuoteType::Brackets,
            }.into()}
            / &multi_line_start() [_]+ { TokenizerErrorType::UnclosedString.into() }

        // A backslash followed by any one character (including a line break).
        rule escape()
            = "\\" [_]

        // Any one character except the active quote, CR, LF or backslash.
        rule quote_char(quote: &str)
            = !(##parse_string_literal(quote) / ['\r'|'\n'|'\\']) [_]

        // A quoted string delimited by `quote`; the literal keeps escapes verbatim and may
        // be empty. If the string cannot be closed, the second alternative consumes the
        // rest of the input and yields UnclosedString.
        rule quoted(quote: &str, quote_type: QuoteType) -> RawToken
            = ##parse_string_literal(quote)
              literal:$((quote_char(quote) / escape())+ / )
              ##parse_string_literal(quote)
              { TokenType::StringLiteral { multi_line: None, literal:literal.into(), quote_type }.into() }
            / ##parse_string_literal(quote) [_]* {TokenizerErrorType::UnclosedString.into() }

        // A double-quoted or single-quoted string.
        rule single_line_quote() -> RawToken
            = quoted("\"", (QuoteType::Double))
            / quoted("\'", (QuoteType::Single))

        // Any string literal; long-bracket strings are tried first.
        pub(super) rule string_literal() -> RawToken
            = multi_line_quote()
            / single_line_quote()

        // `#!` up to and including the line ending. The line ending is required, and the
        // captured text includes both the `#!` and the line ending.
        pub(super) rule shebang() -> RawToken
            = line:$("#!" (!line_ending() [_])* line_ending())
              {TokenType::Shebang{line:line.into()}.into()}

        // A letter or `_` followed by letters, digits or `_`. Words that `parse_keyword`
        // recognizes become Symbol tokens; everything else is an Identifier.
        pub(super) rule identifier() -> RawToken
            = id:$(['_'|'a'..='z'|'A'..='Z'] ['_'|'a'..='z'|'A'..='Z'|'0'..='9']*)
              { match parse_keyword(id) {
                  Some(symbol) => TokenType::Symbol { symbol }.into(),
                  None => TokenType::Identifier { identifier: id.into() }.into(),
              }}
            / expected!("identifier")

        // Comments, tried in this order: a complete `--[[ ]]` block; a `--[[` opener with
        // no closer (consumes the rest of the input, yields UnclosedComment); a single-line
        // comment running up to, but not including, the next CR or LF.
        pub(super) rule comment() -> RawToken
            = "--" v:multi_line_block()
              { TokenType::MultiLineComment { blocks: v.0, comment: v.1.into() }.into() }
            / "--" multi_line_start() [_]* { TokenizerErrorType::UnclosedComment.into() }
            / "--" comment:$(([^ '\r'|'\n'])*)
              { TokenType::SingleLineComment { comment: comment.into() }.into() }

        // Consumes nothing; succeeds only when the crate is built with the `roblox` feature.
        rule roblox()
            = {? if cfg!(feature = "roblox") {
                Ok(())
            } else {
                Err("roblox not enabled")
            }}

        // Binary literal (`0b` / `0B`), roblox only; `_` is accepted among the digits.
        rule roblox_number() -> &'input str
            = roblox() n:$(("0b"/"0B") ['0'|'1'|'_']+) {n}

        // Hexadecimal literal; `_` is accepted among the digits only with roblox.
        rule hex_number() -> &'input str
            = roblox() n:$(("0x"/"0X") ['0'..='9'|'a'..='f'|'A'..='F'|'_']+) {n}
            / !roblox() n:$(("0x"/"0X") ['0'..='9'|'a'..='f'|'A'..='F']+) {n}

        // A run of decimal digits; with roblox, `_` is accepted after the first digit.
        rule digit_with_separator() -> &'input str
            = roblox() n:$(['0'..='9'] ['0'..='9'|'_']*) {n}
            / !roblox() n:$(['0'..='9']+) {n}

        // Digits, an optional `.` with optional fraction digits, and an optional exponent.
        rule basic_number() -> &'input str
            = $(
                digit_with_separator()
                ("." digit_with_separator()?)?
                (['e'|'E'] ['-'|'+']? digit_with_separator())?
            )

        // A number with no integer part, such as `.5`, with an optional exponent.
        rule no_int_fractional_number() -> &'input str
            = $(
                "." digit_with_separator()
                (['e'|'E'] ['-'|'+']? digit_with_separator())?
            )

        // Any number. The prefixed forms are tried before plain decimals.
        pub(super) rule number() -> RawToken
            = n:(
                roblox_number()
                / hex_number()
                / basic_number()
                / no_int_fractional_number()
            ) { TokenType::Number { text:n.into() }.into() }

        // Any operator or punctuation symbol, matched by `ParseSymbol::parse_symbol`.
        pub(super) rule symbol() -> RawToken = symbol:##parse_symbol() { TokenType::Symbol{symbol}.into() }

        // One token of any type. A `#!` met here (anywhere the leading shebang rule did
        // not already consume it) yields UnexpectedShebang.
        rule token() -> RawToken
            = whitespace()
            / comment()
            / number()
            / string_literal()
            / "#!" { TokenizerErrorType::UnexpectedShebang.into() }
            / symbol()
            / identifier()

        // The whole input: an optional shebang line at the very start, then tokens.
        // Each raw token is paired with the byte offset just past its end.
        pub(crate) rule tokens() -> Vec<(RawToken, usize)>
            = shebang:(shebang:shebang() pos:position!() {(shebang,pos)})?
              body:( token:token() pos:position!() {(token,pos)})*
              {
                  let mut body = body;
                  if let Some(shebang) = shebang {
                      body.insert(0, shebang)
                  }
                  body
              }
    }
}
782
/// Information about an error that occurs while tokenizing:
/// what went wrong and where in the source it happened.
#[derive(Clone, Debug, PartialEq)]
#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))]
pub struct TokenizerError {
    /// The type of error
    error: TokenizerErrorType,
    /// The position of the token that caused the error
    position: Position,
}
792
793 impl TokenizerError {
794 /// The type of error
error(&self) -> &TokenizerErrorType795 pub fn error(&self) -> &TokenizerErrorType {
796 &self.error
797 }
798
799 /// The position of the token that caused the error
position(&self) -> Position800 pub fn position(&self) -> Position {
801 self.position
802 }
803 }
804
805 impl fmt::Display for TokenizerError {
fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result806 fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
807 write!(
808 formatter,
809 "{} at line {}, column {}",
810 match &self.error {
811 TokenizerErrorType::UnclosedComment => "unclosed comment".to_string(),
812 TokenizerErrorType::UnclosedString => "unclosed string".to_string(),
813 TokenizerErrorType::UnexpectedShebang => "unexpected shebang".to_string(),
814 TokenizerErrorType::UnexpectedToken(character) => {
815 format!("unexpected character {}", character)
816 }
817 TokenizerErrorType::InvalidSymbol(symbol) => {
818 format!("invalid symbol {}", symbol)
819 }
820 },
821 self.position.line,
822 self.position.character,
823 )
824 }
825 }
826
// Uses the default `Error` methods; the message comes from the `Display` impl.
impl std::error::Error for TokenizerError {}
828
829 impl From<peg::str::LineCol> for Position {
from(location: peg::str::LineCol) -> Position830 fn from(location: peg::str::LineCol) -> Position {
831 Position {
832 bytes: location.offset,
833 line: location.line,
834 character: location.column,
835 }
836 }
837 }
838
// Accumulates finished tokens, in source order, while `tokens` walks the input.
struct TokenCollector {
    // Tokens collected so far
    result: Vec<Token>,
}
842
843 // Collector
844 impl TokenCollector {
new() -> Self845 fn new() -> Self {
846 Self { result: Vec::new() }
847 }
push( &mut self, start_position: Position, raw_token: RawToken, end_position: Position, ) -> Result<(), TokenizerError>848 fn push(
849 &mut self,
850 start_position: Position,
851 raw_token: RawToken,
852 end_position: Position,
853 ) -> Result<(), TokenizerError> {
854 match raw_token {
855 Ok(token_type) => {
856 self.result.push(Token {
857 start_position,
858 end_position,
859 token_type,
860 });
861 Ok(())
862 }
863 Err(error) => Err(TokenizerError {
864 error,
865 position: start_position,
866 }),
867 }
868 }
finish(mut self, eof_position: Position) -> Vec<Token>869 fn finish(mut self, eof_position: Position) -> Vec<Token> {
870 self.result.push(Token {
871 start_position: eof_position,
872 end_position: eof_position,
873 token_type: TokenType::Eof,
874 });
875 self.result
876 }
877 }
878
from_parser_error( code: &'_ str, ) -> impl Fn(peg::error::ParseError<peg::str::LineCol>) -> TokenizerError + '_879 fn from_parser_error(
880 code: &'_ str,
881 ) -> impl Fn(peg::error::ParseError<peg::str::LineCol>) -> TokenizerError + '_ {
882 move |err| TokenizerError {
883 error: TokenizerErrorType::UnexpectedToken(
884 code[err.location.offset..].chars().next().expect(
885 "(internal full-moon error) Text overflow while giving unexpected token error",
886 ),
887 ),
888 position: err.location.into(),
889 }
890 }
891
/// Returns a list of tokens, ending with an [`Eof`](TokenType::Eof) token.
/// You probably want [`parse`](crate::parse) instead.
///
/// # Errors
///
/// If the code passed is malformed from normal Lua expectations,
/// a [`TokenizerError`] will be returned.
///
/// ```rust
/// # use full_moon::tokenizer::tokens;
/// assert!(tokens("local x = 1").is_ok());
/// assert!(tokens("local 4 = end").is_ok()); // tokens does *not* check validity of code, only tokenizing
/// assert!(tokens("--[[ Unclosed comment!").is_err());
/// ```
pub fn tokens(code: &str) -> Result<Vec<Token>, TokenizerError> {
    let mut tokens = TokenCollector::new();

    // Each raw token is paired with the byte offset just past its end.
    let mut raw_tokens = tokens::tokens(code).map_err(from_parser_error(code))?;

    // rust-peg lets us easily get the offset associated with
    // (the end of) each token, but not the line or column
    // information. We iterate over the characters to match
    // up the tokens with the row/column information.
    let mut raw_tokens = raw_tokens.drain(..);

    // Lines and characters are counted from 1, bytes from 0.
    let mut position = Position {
        bytes: 0,
        character: 1,
        line: 1,
    };
    let mut next_is_new_line = false;
    let mut start_position = position;
    if let Some((mut token_type, mut token_offset)) = raw_tokens.next() {
        for character in code.chars() {
            // A line feed does not advance the column; it is recorded so that the
            // line change can be applied after the end position has been captured.
            if character == '\n' {
                next_is_new_line = true;
            } else {
                position.character += 1;
            }

            position.bytes += character.len_utf8();

            // Captured before the line change below, so a token that ends with a
            // line feed still ends on the line that line feed is on.
            let end_position = position;

            if next_is_new_line {
                next_is_new_line = false;
                position.line += 1;
                position.character = 1;
            }

            // This character is the last one of the current token: emit the token
            // (or bail out if it is an error) and move on to the next raw token.
            if token_offset == end_position.bytes {
                tokens.push(start_position, token_type, end_position)?;
                start_position = position;
                if let Some((next_token_type, next_token_offset)) = raw_tokens.next() {
                    token_type = next_token_type;
                    token_offset = next_token_offset;
                } else {
                    break;
                }
            }
        }
    }

    // Any raw token still waiting in the iterator was never reached by the walk above.
    if let Some((token_type, token_offset)) = raw_tokens.next() {
        panic!("(internal full-moon error) Found token {:?} with offset {:?} which is past the end of source", token_type, token_offset);
    }

    // The Eof token sits at wherever the walk stopped.
    Ok(tokens.finish(position))
}
961
#[cfg(test)]
mod tests {
    use crate::tokenizer::*;
    use pretty_assertions::assert_eq;

    // Checks tokenizer output against an expected token type or error type.
    //
    // `test_rule!(rule(code), expected)` runs the single grammar rule `rule` on `code`,
    // compares its result with `expected`, and then also runs the second form.
    //
    // `test_rule!(code, expected)` runs the full `tokens` function on `code`. When a
    // token type is expected, the FIRST token must have that type. When an error is
    // expected, the error is compared only if `tokens` actually fails; a successful
    // tokenization passes silently.
    macro_rules! test_rule {
        ($rule:ident($code:expr), $result:expr) => {
            let code: &str = $code;
            let result: RawToken = $result.into();

            assert_eq!(
                tokens::$rule(code)
                    .map_err(|err| from_parser_error(code)(err).error)
                    .and_then(|v| v),
                result,
            );
            test_rule!(code, result)
        };
        ($code:expr, $result:expr) => {
            let code: &str = $code;
            let result: RawToken = $result.into();

            match result {
                Ok(token) => {
                    let tokens = tokens(code).expect("couldn't tokenize");
                    let first_token = &tokens.get(0).expect("tokenized response is empty");
                    assert_eq!(*first_token.token_type(), token);
                }

                Err(expected) => {
                    if let Err(TokenizerError { error, .. }) = tokens($code) {
                        assert_eq!(error, expected);
                    }
                }
            };
        };
    }

    // Single-line and multi-line comments, including an empty `--`.
    #[test]
    fn test_rule_comment() {
        test_rule!(
            comment("-- hello world"),
            TokenType::SingleLineComment {
                comment: " hello world".into()
            }
        );

        test_rule!(
            comment("--[[ hello world ]]"),
            TokenType::MultiLineComment {
                blocks: 0,
                comment: " hello world ".into()
            }
        );

        test_rule!(
            comment("--[=[ hello world ]=]"),
            TokenType::MultiLineComment {
                blocks: 1,
                comment: " hello world ".into()
            }
        );
        test_rule!(
            comment("--"),
            TokenType::SingleLineComment { comment: "".into() }
        );
    }

    // Plain decimal numbers keep their text verbatim.
    #[test]
    fn test_rule_numbers() {
        test_rule!(number("213"), TokenType::Number { text: "213".into() });

        test_rule!(number("1"), TokenType::Number { text: "1".into() });

        test_rule!(
            number("123.45"),
            TokenType::Number {
                text: "123.45".into(),
            }
        );
    }

    // Binary literals are only recognized with the `roblox` feature.
    #[test]
    #[cfg_attr(not(feature = "roblox"), ignore)]
    fn test_rule_binary_literals() {
        test_rule!(
            number("0b101"),
            TokenType::Number {
                text: "0b101".into(),
            }
        );
    }

    // Identifiers, and text that cannot start an identifier.
    #[test]
    fn test_rule_identifier() {
        test_rule!(
            identifier("hello"),
            TokenType::Identifier {
                identifier: "hello".into(),
            }
        );

        test_rule!(
            "hello world",
            TokenType::Identifier {
                identifier: "hello".into(),
            }
        );

        test_rule!(
            identifier("hello___"),
            TokenType::Identifier {
                identifier: "hello___".into(),
            }
        );

        test_rule!(identifier("123"), TokenizerErrorType::UnexpectedToken('1'));
    }

    // Keywords come out of the identifier rule as symbols.
    #[test]
    fn test_rule_symbols() {
        test_rule!(
            identifier("local"),
            TokenType::Symbol {
                symbol: Symbol::Local
            }
        );
    }

    // A whitespace token stops right after the first line ending.
    #[test]
    fn test_rule_whitespace() {
        test_rule!(
            "\t  \n\t",
            TokenType::Whitespace {
                characters: "\t  \n".into(),
            }
        );

        test_rule!(
            "\thello",
            TokenType::Whitespace {
                characters: "\t".into(),
            }
        );

        test_rule!(
            "\t\t\nhello",
            TokenType::Whitespace {
                characters: "\t\t\n".into(),
            }
        );

        test_rule!(
            "\n\thello",
            TokenType::Whitespace {
                characters: "\n".into(),
            }
        );
    }

    // Quoted strings, an escaped line break inside a string, and an unclosed string.
    #[test]
    fn test_rule_string_literal() {
        test_rule!(
            string_literal("\"hello\""),
            TokenType::StringLiteral {
                literal: "hello".into(),
                multi_line: None,
                quote_type: StringLiteralQuoteType::Double,
            }
        );

        test_rule!(
            string_literal("\"hello\\\nworld\""),
            TokenType::StringLiteral {
                literal: "hello\\\nworld".into(),
                multi_line: None,
                quote_type: StringLiteralQuoteType::Double,
            }
        );

        test_rule!(
            string_literal("\"hello"),
            TokenizerErrorType::UnclosedString
        );
    }

    #[test]
    fn test_symbols_within_symbols() {
        // "index" should not return "in"
        test_rule!(
            identifier("index"),
            TokenType::Identifier {
                identifier: "index".into()
            }
        );

        // "<=" should not return "<"
        test_rule!(
            symbol("<="),
            TokenType::Symbol {
                symbol: Symbol::LessThanEqual,
            }
        );
    }

    #[test]
    fn test_rule_shebang() {
        test_rule!(
            shebang("#!/usr/bin/env lua\n"),
            TokenType::Shebang {
                line: "#!/usr/bin/env lua\n".into()
            }
        );
        // Don't recognize with a whitespace.
        test_rule!(
            " #!/usr/bin/env lua\n",
            TokenizerErrorType::UnexpectedShebang
        );
    }

    // A line feed token ends on the line it started on, at the same column.
    #[test]
    fn test_new_line_on_same_line() {
        assert_eq!(
            tokens("\n").unwrap()[0],
            Token {
                start_position: Position {
                    bytes: 0,
                    character: 1,
                    line: 1,
                },

                end_position: Position {
                    bytes: 1,
                    character: 1,
                    line: 1,
                },

                token_type: TokenType::Whitespace {
                    characters: "\n".into()
                },
            }
        );
    }

    // Inputs containing multi-byte characters; the results are discarded, so these
    // only check that tokenizing them does not panic.
    #[test]
    fn test_fuzzer() {
        let _ = tokens("*ա");
        let _ = tokens("̹(");
        let _ = tokens("¹;");
    }
}
1213