1 use logos::Logos; 2 3 use super::kind::SyntaxKind; 4 5 #[derive(Debug, PartialEq, Eq, Clone, Copy, Hash, PartialOrd, Ord, Logos)] 6 #[allow(non_camel_case_types)] 7 #[repr(u16)] 8 enum Token { 9 #[regex(r"\s+")] 10 WHITESPACE = 2, 11 12 #[regex(r"%[^\r\n]*")] 13 COMMENT, 14 15 #[token("{")] 16 L_CURLY, 17 18 #[token("}")] 19 R_CURLY, 20 21 #[token("[")] 22 L_BRACK, 23 24 #[token("]")] 25 R_BRACK, 26 27 #[token("(")] 28 L_PAREN, 29 30 #[token(")")] 31 R_PAREN, 32 33 #[regex(r"#\d?")] 34 PARAMETER, 35 36 #[token(",")] 37 COMMA, 38 39 #[token("=")] 40 EQUALITY_SIGN, 41 42 #[regex(r"[^\s\\%\{\},\$\[\]\(\)=\#]+")] 43 #[error] 44 WORD, 45 46 #[regex(r"\$\$?")] 47 DOLLAR, 48 49 #[regex(r"\\([^\r\n]|[@a-zA-Z:_]+\*?)?")] 50 GENERIC_COMMAND_NAME, 51 52 #[regex(r"\\begin")] 53 BEGIN_ENVIRONMENT_NAME, 54 55 #[regex(r"\\end")] 56 END_ENVIRONMENT_NAME, 57 58 #[regex(r"\\\[")] 59 BEGIN_EQUATION_NAME, 60 61 #[regex(r"\\\]")] 62 END_EQUATION_NAME, 63 64 #[regex(r"\\part\*?")] 65 PART_NAME, 66 67 #[regex(r"\\chapter\*?")] 68 CHAPTER_NAME, 69 70 #[regex(r"\\section\*?")] 71 SECTION_NAME, 72 73 #[regex(r"\\subsection\*?")] 74 SUBSECTION_NAME, 75 76 #[regex(r"\\subsubsection\*?")] 77 SUBSUBSECTION_NAME, 78 79 #[regex(r"\\paragraph\*?")] 80 PARAGRAPH_NAME, 81 82 #[regex(r"\\subparagraph\*?")] 83 SUBPARAGRAPH_NAME, 84 85 #[regex(r"\\item")] 86 ENUM_ITEM_NAME, 87 88 #[regex(r"\\caption")] 89 CAPTION_NAME, 90 91 #[regex(r"\\cite|\\cite\*|\\Cite|\\nocite|\\citet|\\citep|\\citet\*|\\citep\*|\\citeauthor|\\citeauthor\*|\\Citeauthor|\\Citeauthor\*|\\citetitle|\\citetitle\*|\\citeyear|\\citeyear\*|\\citedate|\\citedate\*|\\citeurl|\\fullcite|\\citeyearpar|\\citealt|\\citealp|\\citetext|\\parencite|\\parencite\*|\\Parencite|\\footcite|\\footfullcite|\\footcitetext|\\textcite|\\Textcite|\\smartcite|\\Smartcite|\\supercite|\\autocite|\\Autocite|\\autocite\*|\\Autocite\*|\\volcite|\\Volcite|\\pvolcite|\\Pvolcite|\\fvolcite|\\ftvolcite|\\svolcite|\\Svolcite|\\tvolcite|\\Tvolcite|\\avolcite|\\Avolcite|\\notecite|\\notecite|\\pnotecite|\\Pnotecite|\\fnotecite|\\citeA|\\citeA\*")] 92 CITATION_NAME, 93 94 #[regex(r"\\usepackage|\\RequirePackage")] 95 PACKAGE_INCLUDE_NAME, 96 97 #[regex(r"\\documentclass")] 98 CLASS_INCLUDE_NAME, 99 100 #[regex(r"\\include|\\subfileinclude|\\input|\\subfile")] 101 LATEX_INCLUDE_NAME, 102 103 #[regex(r"\\addbibresource")] 104 BIBLATEX_INCLUDE_NAME, 105 106 #[regex(r"\\bibliography")] 107 BIBTEX_INCLUDE_NAME, 108 109 #[regex(r"\\includegraphics")] 110 GRAPHICS_INCLUDE_NAME, 111 112 #[regex(r"\\includesvg")] 113 SVG_INCLUDE_NAME, 114 115 #[regex(r"\\includeinkscape")] 116 INKSCAPE_INCLUDE_NAME, 117 118 #[regex(r"\\verbatiminput|\\VerbatimInput")] 119 VERBATIM_INCLUDE_NAME, 120 121 #[regex(r"\\import|\\subimport|\\inputfrom|\\subimportfrom|\\includefrom|\\subincludefrom")] 122 IMPORT_NAME, 123 124 #[regex(r"\\label")] 125 LABEL_DEFINITION_NAME, 126 127 #[regex(r"\\ref|\\vref|\\Vref|\\autoref|\\pageref|\\cref|\\Cref|\\cref*|\\Cref*|\\namecref|\\nameCref|\\lcnamecref|\\namecrefs|\\nameCrefs|\\lcnamecrefs|\\labelcref|\\labelcpageref|\\eqref")] 128 LABEL_REFERENCE_NAME, 129 130 #[regex(r"\\crefrange\*?|\\Crefrange\*?")] 131 LABEL_REFERENCE_RANGE_NAME, 132 133 #[regex(r"\\newlabel")] 134 LABEL_NUMBER_NAME, 135 136 #[regex(r"\\newcommand\*?|\\renewcommand|\\DeclareRobustCommand")] 137 COMMAND_DEFINITION_NAME, 138 139 #[regex(r"\\DeclareMathOperator\*?")] 140 MATH_OPERATOR_NAME, 141 142 #[regex(r"\\newglossaryentry")] 143 GLOSSARY_ENTRY_DEFINITION_NAME, 144 145 #[regex(r"\\gls|\\Gls|\\GLS|\\glspl|\\Glspl|\\GLSpl|\\glsdisp|\\glslink|\\glstext|\\Glstext|\\GLStext|\\glsfirst|\\Glsfirst|\\GLSfirst|\\glsplural|\\Glsplural|\\GLSplural|\\glsfirstplural|\\Glsfirstplural|\\GLSfirstplural|\\glsname|\\Glsname|\\GLSname|\\glssymbol|\\Glssymbol|\\glsdesc|\\Glsdesc|\\GLSdesc|\\glsuseri|\\Glsuseri|\\GLSuseri|\\glsuserii|\\Glsuserii|\\GLSuserii|\\glsuseriii|\\Glsuseriii|\\GLSuseriii|\\glsuseriv|\\Glsuseriv|\\GLSuseriv|\\glsuserv|\\Glsuserv|\\GLSuserv|\\glsuservi|\\Glsuservi|\\GLSuservi")] 146 GLOSSARY_ENTRY_REFERENCE_NAME, 147 148 #[regex(r"\\newacronym")] 149 ACRONYM_DEFINITION_NAME, 150 151 #[regex(r"\\DeclareAcronym")] 152 ACRONYM_DECLARATION_NAME, 153 154 #[regex(r"\\acrshort|\\Acrshort|\\ACRshort|\\acrshortpl|\\Acrshortpl|\\ACRshortpl|\\acrlong|\\Acrlong|\\ACRlong|\\acrlongpl|\\Acrlongpl|\\ACRlongpl|\\acrfull|\\Acrfull|\\ACRfull|\\acrfullpl|\\Acrfullpl|\\ACRfullpl|\\acs|\\Acs|\\acsp|\\Acsp|\\acl|\\Acl|\\aclp|\\Aclp|\\acf|\\Acf|\\acfp|\\Acfp|\\ac|\\Ac|\\acp|\\glsentrylong|\\Glsentrylong|\\glsentrylongpl|\\Glsentrylongpl|\\glsentryshort|\\Glsentryshort|\\glsentryshortpl|\\Glsentryshortpl|\\glsentryfullpl|\\Glsentryfullpl")] 155 ACRONYM_REFERENCE_NAME, 156 157 #[regex(r"\\newtheorem|\\declaretheorem")] 158 THEOREM_DEFINITION_NAME, 159 160 #[regex(r"\\color|\\colorbox|\\textcolor|\\pagecolor")] 161 COLOR_REFERENCE_NAME, 162 163 #[regex(r"\\definecolor")] 164 COLOR_DEFINITION_NAME, 165 166 #[regex(r"\\definecolorset")] 167 COLOR_SET_DEFINITION_NAME, 168 169 #[regex(r"\\usepgflibrary|\\usetikzlibrary")] 170 TIKZ_LIBRARY_IMPORT_NAME, 171 172 #[regex(r"\\newenvironment|\\newenvironment*")] 173 ENVIRONMENT_DEFINITION_NAME, 174 } 175 176 #[derive(Debug, PartialEq, Eq, Clone)] 177 pub struct Lexer<'a> { 178 tokens: Vec<(SyntaxKind, &'a str)>, 179 } 180 181 impl<'a> Lexer<'a> { new(text: &'a str) -> Self182 pub fn new(text: &'a str) -> Self { 183 let mut tokens = Vec::new(); 184 let mut lexer = Token::lexer(text); 185 while let Some(kind) = lexer.next() { 186 tokens.push(( 187 unsafe { std::mem::transmute::<Token, SyntaxKind>(kind) }, 188 lexer.slice(), 189 )); 190 } 191 tokens.reverse(); 192 Self { tokens } 193 } 194 peek(&self) -> Option<SyntaxKind>195 pub fn peek(&self) -> Option<SyntaxKind> { 196 self.tokens.last().map(|(kind, _)| *kind) 197 } 198 eat(&mut self) -> Option<(SyntaxKind, &'a str)>199 pub fn eat(&mut self) -> Option<(SyntaxKind, &'a str)> { 200 self.tokens.pop() 201 } 202 } 203 204 #[cfg(test)] 205 mod tests { 206 use insta::assert_debug_snapshot; 207 208 use super::*; 209 verify(text: &str) -> Vec<(SyntaxKind, &str)>210 fn verify(text: &str) -> Vec<(SyntaxKind, &str)> { 211 let mut tokens = Lexer::new(text).tokens; 212 tokens.reverse(); 213 tokens 214 } 215 216 #[test] test_empty()217 fn test_empty() { 218 assert_debug_snapshot!(verify(r#""#)); 219 } 220 221 #[test] test_delimiters()222 fn test_delimiters() { 223 assert_debug_snapshot!(verify(r#"{foo} (bar) [baz, qux = foo-bar]"#)); 224 } 225 226 #[test] test_command_with_parameter()227 fn test_command_with_parameter() { 228 assert_debug_snapshot!(verify(r#"\newcommand{\id}[1]{#1}"#)); 229 } 230 231 #[test] test_command_with_star()232 fn test_command_with_star() { 233 assert_debug_snapshot!(verify(r#"\section*{Foo}"#)); 234 } 235 236 #[test] test_escape_sequence()237 fn test_escape_sequence() { 238 assert_debug_snapshot!(verify(r#"\% hello"#)); 239 } 240 241 #[test] test_formula()242 fn test_formula() { 243 assert_debug_snapshot!(verify(r#"$ f(x) = y $$"#)); 244 } 245 246 #[test] test_comment()247 fn test_comment() { 248 assert_debug_snapshot!(verify("hello %world\r\ntest %test")); 249 } 250 251 #[test] test_invalid_parameter()252 fn test_invalid_parameter() { 253 assert_debug_snapshot!(verify(r#"#"#)) 254 } 255 } 256