1 use logos::Logos;
2 
3 use super::kind::SyntaxKind;
4 
5 #[derive(Debug, PartialEq, Eq, Clone, Copy, Hash, PartialOrd, Ord, Logos)]
6 #[allow(non_camel_case_types)]
7 #[repr(u16)]
8 enum Token {
9     #[regex(r"\s+")]
10     WHITESPACE = 2,
11 
12     #[regex(r"%[^\r\n]*")]
13     COMMENT,
14 
15     #[token("{")]
16     L_CURLY,
17 
18     #[token("}")]
19     R_CURLY,
20 
21     #[token("[")]
22     L_BRACK,
23 
24     #[token("]")]
25     R_BRACK,
26 
27     #[token("(")]
28     L_PAREN,
29 
30     #[token(")")]
31     R_PAREN,
32 
33     #[regex(r"#\d?")]
34     PARAMETER,
35 
36     #[token(",")]
37     COMMA,
38 
39     #[token("=")]
40     EQUALITY_SIGN,
41 
42     #[regex(r"[^\s\\%\{\},\$\[\]\(\)=\#]+")]
43     #[error]
44     WORD,
45 
46     #[regex(r"\$\$?")]
47     DOLLAR,
48 
49     #[regex(r"\\([^\r\n]|[@a-zA-Z:_]+\*?)?")]
50     GENERIC_COMMAND_NAME,
51 
52     #[regex(r"\\begin")]
53     BEGIN_ENVIRONMENT_NAME,
54 
55     #[regex(r"\\end")]
56     END_ENVIRONMENT_NAME,
57 
58     #[regex(r"\\\[")]
59     BEGIN_EQUATION_NAME,
60 
61     #[regex(r"\\\]")]
62     END_EQUATION_NAME,
63 
64     #[regex(r"\\part\*?")]
65     PART_NAME,
66 
67     #[regex(r"\\chapter\*?")]
68     CHAPTER_NAME,
69 
70     #[regex(r"\\section\*?")]
71     SECTION_NAME,
72 
73     #[regex(r"\\subsection\*?")]
74     SUBSECTION_NAME,
75 
76     #[regex(r"\\subsubsection\*?")]
77     SUBSUBSECTION_NAME,
78 
79     #[regex(r"\\paragraph\*?")]
80     PARAGRAPH_NAME,
81 
82     #[regex(r"\\subparagraph\*?")]
83     SUBPARAGRAPH_NAME,
84 
85     #[regex(r"\\item")]
86     ENUM_ITEM_NAME,
87 
88     #[regex(r"\\caption")]
89     CAPTION_NAME,
90 
91     #[regex(r"\\cite|\\cite\*|\\Cite|\\nocite|\\citet|\\citep|\\citet\*|\\citep\*|\\citeauthor|\\citeauthor\*|\\Citeauthor|\\Citeauthor\*|\\citetitle|\\citetitle\*|\\citeyear|\\citeyear\*|\\citedate|\\citedate\*|\\citeurl|\\fullcite|\\citeyearpar|\\citealt|\\citealp|\\citetext|\\parencite|\\parencite\*|\\Parencite|\\footcite|\\footfullcite|\\footcitetext|\\textcite|\\Textcite|\\smartcite|\\Smartcite|\\supercite|\\autocite|\\Autocite|\\autocite\*|\\Autocite\*|\\volcite|\\Volcite|\\pvolcite|\\Pvolcite|\\fvolcite|\\ftvolcite|\\svolcite|\\Svolcite|\\tvolcite|\\Tvolcite|\\avolcite|\\Avolcite|\\notecite|\\notecite|\\pnotecite|\\Pnotecite|\\fnotecite|\\citeA|\\citeA\*")]
92     CITATION_NAME,
93 
94     #[regex(r"\\usepackage|\\RequirePackage")]
95     PACKAGE_INCLUDE_NAME,
96 
97     #[regex(r"\\documentclass")]
98     CLASS_INCLUDE_NAME,
99 
100     #[regex(r"\\include|\\subfileinclude|\\input|\\subfile")]
101     LATEX_INCLUDE_NAME,
102 
103     #[regex(r"\\addbibresource")]
104     BIBLATEX_INCLUDE_NAME,
105 
106     #[regex(r"\\bibliography")]
107     BIBTEX_INCLUDE_NAME,
108 
109     #[regex(r"\\includegraphics")]
110     GRAPHICS_INCLUDE_NAME,
111 
112     #[regex(r"\\includesvg")]
113     SVG_INCLUDE_NAME,
114 
115     #[regex(r"\\includeinkscape")]
116     INKSCAPE_INCLUDE_NAME,
117 
118     #[regex(r"\\verbatiminput|\\VerbatimInput")]
119     VERBATIM_INCLUDE_NAME,
120 
121     #[regex(r"\\import|\\subimport|\\inputfrom|\\subimportfrom|\\includefrom|\\subincludefrom")]
122     IMPORT_NAME,
123 
124     #[regex(r"\\label")]
125     LABEL_DEFINITION_NAME,
126 
127     #[regex(r"\\ref|\\vref|\\Vref|\\autoref|\\pageref|\\cref|\\Cref|\\cref*|\\Cref*|\\namecref|\\nameCref|\\lcnamecref|\\namecrefs|\\nameCrefs|\\lcnamecrefs|\\labelcref|\\labelcpageref|\\eqref")]
128     LABEL_REFERENCE_NAME,
129 
130     #[regex(r"\\crefrange\*?|\\Crefrange\*?")]
131     LABEL_REFERENCE_RANGE_NAME,
132 
133     #[regex(r"\\newlabel")]
134     LABEL_NUMBER_NAME,
135 
136     #[regex(r"\\newcommand\*?|\\renewcommand|\\DeclareRobustCommand")]
137     COMMAND_DEFINITION_NAME,
138 
139     #[regex(r"\\DeclareMathOperator\*?")]
140     MATH_OPERATOR_NAME,
141 
142     #[regex(r"\\newglossaryentry")]
143     GLOSSARY_ENTRY_DEFINITION_NAME,
144 
145     #[regex(r"\\gls|\\Gls|\\GLS|\\glspl|\\Glspl|\\GLSpl|\\glsdisp|\\glslink|\\glstext|\\Glstext|\\GLStext|\\glsfirst|\\Glsfirst|\\GLSfirst|\\glsplural|\\Glsplural|\\GLSplural|\\glsfirstplural|\\Glsfirstplural|\\GLSfirstplural|\\glsname|\\Glsname|\\GLSname|\\glssymbol|\\Glssymbol|\\glsdesc|\\Glsdesc|\\GLSdesc|\\glsuseri|\\Glsuseri|\\GLSuseri|\\glsuserii|\\Glsuserii|\\GLSuserii|\\glsuseriii|\\Glsuseriii|\\GLSuseriii|\\glsuseriv|\\Glsuseriv|\\GLSuseriv|\\glsuserv|\\Glsuserv|\\GLSuserv|\\glsuservi|\\Glsuservi|\\GLSuservi")]
146     GLOSSARY_ENTRY_REFERENCE_NAME,
147 
148     #[regex(r"\\newacronym")]
149     ACRONYM_DEFINITION_NAME,
150 
151     #[regex(r"\\DeclareAcronym")]
152     ACRONYM_DECLARATION_NAME,
153 
154     #[regex(r"\\acrshort|\\Acrshort|\\ACRshort|\\acrshortpl|\\Acrshortpl|\\ACRshortpl|\\acrlong|\\Acrlong|\\ACRlong|\\acrlongpl|\\Acrlongpl|\\ACRlongpl|\\acrfull|\\Acrfull|\\ACRfull|\\acrfullpl|\\Acrfullpl|\\ACRfullpl|\\acs|\\Acs|\\acsp|\\Acsp|\\acl|\\Acl|\\aclp|\\Aclp|\\acf|\\Acf|\\acfp|\\Acfp|\\ac|\\Ac|\\acp|\\glsentrylong|\\Glsentrylong|\\glsentrylongpl|\\Glsentrylongpl|\\glsentryshort|\\Glsentryshort|\\glsentryshortpl|\\Glsentryshortpl|\\glsentryfullpl|\\Glsentryfullpl")]
155     ACRONYM_REFERENCE_NAME,
156 
157     #[regex(r"\\newtheorem|\\declaretheorem")]
158     THEOREM_DEFINITION_NAME,
159 
160     #[regex(r"\\color|\\colorbox|\\textcolor|\\pagecolor")]
161     COLOR_REFERENCE_NAME,
162 
163     #[regex(r"\\definecolor")]
164     COLOR_DEFINITION_NAME,
165 
166     #[regex(r"\\definecolorset")]
167     COLOR_SET_DEFINITION_NAME,
168 
169     #[regex(r"\\usepgflibrary|\\usetikzlibrary")]
170     TIKZ_LIBRARY_IMPORT_NAME,
171 
172     #[regex(r"\\newenvironment|\\newenvironment*")]
173     ENVIRONMENT_DEFINITION_NAME,
174 }
175 
/// A fully tokenized source text that is consumed one token at a time.
///
/// Each entry of `tokens` pairs a `SyntaxKind` with the slice of the input it
/// was lexed from. `Lexer::new` stores the entries in reverse source order, so
/// the next token to be read is always the last element of the vector.
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct Lexer<'a> {
    tokens: Vec<(SyntaxKind, &'a str)>,
}
180 
181 impl<'a> Lexer<'a> {
new(text: &'a str) -> Self182     pub fn new(text: &'a str) -> Self {
183         let mut tokens = Vec::new();
184         let mut lexer = Token::lexer(text);
185         while let Some(kind) = lexer.next() {
186             tokens.push((
187                 unsafe { std::mem::transmute::<Token, SyntaxKind>(kind) },
188                 lexer.slice(),
189             ));
190         }
191         tokens.reverse();
192         Self { tokens }
193     }
194 
peek(&self) -> Option<SyntaxKind>195     pub fn peek(&self) -> Option<SyntaxKind> {
196         self.tokens.last().map(|(kind, _)| *kind)
197     }
198 
eat(&mut self) -> Option<(SyntaxKind, &'a str)>199     pub fn eat(&mut self) -> Option<(SyntaxKind, &'a str)> {
200         self.tokens.pop()
201     }
202 }
203 
#[cfg(test)]
mod tests {
    use insta::assert_debug_snapshot;

    use super::*;

    /// Lexes `text` and returns the tokens in source order.
    fn verify(text: &str) -> Vec<(SyntaxKind, &str)> {
        // `Lexer` keeps its tokens reversed; undo that so the snapshots read
        // from left to right.
        Lexer::new(text).tokens.into_iter().rev().collect()
    }

    /// Empty input.
    #[test]
    fn test_empty() {
        assert_debug_snapshot!(verify(r#""#));
    }

    /// Braces, parentheses, brackets, comma and equality sign between words.
    #[test]
    fn test_delimiters() {
        assert_debug_snapshot!(verify(r#"{foo} (bar) [baz, qux = foo-bar]"#));
    }

    /// A command definition whose body contains the parameter `#1`.
    #[test]
    fn test_command_with_parameter() {
        assert_debug_snapshot!(verify(r#"\newcommand{\id}[1]{#1}"#));
    }

    /// A starred sectioning command.
    #[test]
    fn test_command_with_star() {
        assert_debug_snapshot!(verify(r#"\section*{Foo}"#));
    }

    /// An escaped percent sign followed by text.
    #[test]
    fn test_escape_sequence() {
        assert_debug_snapshot!(verify(r#"\% hello"#));
    }

    /// Text delimited by `$` and `$$`.
    #[test]
    fn test_formula() {
        assert_debug_snapshot!(verify(r#"$ f(x) = y $$"#));
    }

    /// Comments on two lines separated by `\r\n`.
    #[test]
    fn test_comment() {
        assert_debug_snapshot!(verify("hello %world\r\ntest %test"));
    }

    /// A lone `#` without a digit.
    #[test]
    fn test_invalid_parameter() {
        assert_debug_snapshot!(verify(r#"#"#));
    }
}
256