# -*- coding: utf-8 -*-
# Copyright (c) 2021 Rocky Bernstein
# Copyright (c) 2016 rsmenon
# Licensed under the MIT License (https://opensource.org/licenses/MIT)
"""Pygments lexer for Mathematica / Mathics source text.

The lexer is a Pygments ``RegexLexer`` whose raw token stream is passed
through a small chain of annotators (see ``MathematicaAnnotations``) that
re-tag builtins, unicode characters and locally scoped variables.
"""

from collections import defaultdict

from pygments.lexer import RegexLexer, include, words, bygroups
from pygments.token import Token as PToken

import mathics_pygments.builtins as mma


class Regex:
    """Regular-expression fragments from which the lexer rules are built.

    Later fragments are composed from earlier ones with f-strings, so the
    order of the definitions below matters.
    """

    IDENTIFIER = r"[a-zA-Z\$][a-zA-Z0-9\$]*"
    # A named character: a backslash, "[", an identifier and "]".
    NAMED_CHARACTER = fr"\\\[{IDENTIFIER}\]"
    # Identifiers / named characters joined by backticks, with an optional
    # leading and trailing backtick.
    SYMBOLS = (
        fr"[`]?({IDENTIFIER}|{NAMED_CHARACTER})"
        fr"(`({IDENTIFIER}|{NAMED_CHARACTER}))*[`]?"
    )
    INTEGER = r"[0-9]+"
    FLOAT = f"({INTEGER})?[.][0-9]+|{INTEGER}[.]"
    REAL = fr"({INTEGER}|{FLOAT})`({INTEGER}|{FLOAT})?|{FLOAT}"
    BASE_NUMBER = fr"{INTEGER}\s*\^\^\s*({REAL}|{INTEGER})"
    SCIENTIFIC_NUMBER = fr"({REAL}|{INTEGER})\s*\*\^\s*{INTEGER}"
    PATTERNS = fr"{SYMBOLS}\_{{1,3}}({SYMBOLS})?|({SYMBOLS})?\_{{1,3}}{SYMBOLS}"
    SLOTS = fr"#{SYMBOLS}|#\"{SYMBOLS}\"|#{{1,2}}[0-9]*"
    # "::" followed by optional whitespace and a symbol.  This is a raw
    # string, so the whitespace class must be written with a single
    # backslash; a doubled backslash would demand a literal backslash after
    # "::" and the rule could never match ordinary "Symbol::tag" text.
    MESSAGES = fr"(::)(\s*)({SYMBOLS})"
    GROUPINGS = words(mma.GROUPINGS).get()
    OPERATORS = words(mma.OPERATORS).get()
    MATHICS_MESSAGE = r"(\w+)::(\w+):( )(.+)"


class MToken:
    """
    Mathics Tokens. Like Pygments Token but for Mathics.

    Class variables contain Mathics tokens like BUILTIN, COMMENT.
    These variables hold the corresponding Pygments token-name value.

    The style, e.g. "colorful", "zenburn", "xcode", ultimately determines
    how this appears on a terminal.
    """

    BUILTIN = PToken.Name.Function
    COMMENT = PToken.Comment
    GROUP = PToken.Punctuation
    LOCAL_SCOPE = PToken.Name.Variable.Class
    MESSAGE = PToken.Name.Exception
    NAMESPACE = PToken.Namespace
    NUMBER = PToken.Number
    OPERATOR = PToken.Operator
    PATTERN = PToken.Name.Tag
    PUNCTUATION = PToken.Punctuation
    SLOT = PToken.Name.Function
    STRING = PToken.Generic.Emph
    TEXT = PToken.Generic.Output
    SYMBOL = PToken.Name.Variable
    UNKNOWN = PToken.Error
    WHITESPACE = PToken.Text.Whitespace


class MathematicaLexer(RegexLexer):
    """Regex-based lexer whose tokens are post-processed by annotators."""

    name = "Mathematica"
    aliases = [
        "mathematica",
        "mathics",
        "mma",
        "nb",
        "wl",
        "wolfram",
        "wolfram-language",
    ]
    filenames = ["*.cdf", "*.m", "*.ma", "*.nb", "*.wl"]
    mimetypes = [
        "application/mathematica",
        "application/vnd.wolfram.mathematica",
        "application/vnd.wolfram.mathematica.package",
        "application/vnd.wolfram.cdf",
        "application/vnd.wolfram.cdf.text",
    ]
    tokens = {
        "root": [
            (r"\(\*", MToken.COMMENT, "comments"),
            (r'"', MToken.STRING, "strings"),
            include("numbers"),
            (Regex.PATTERNS, MToken.PATTERN),
            (Regex.SYMBOLS, MToken.SYMBOL),
            (
                Regex.MATHICS_MESSAGE,
                bygroups(
                    MToken.OPERATOR, MToken.WHITESPACE, MToken.TEXT, MToken.TEXT
                ),
            ),
            (Regex.SLOTS, MToken.SLOT),
            (Regex.GROUPINGS, MToken.GROUP),
            (
                Regex.MESSAGES,
                bygroups(MToken.OPERATOR, MToken.WHITESPACE, MToken.MESSAGE),
            ),
            (Regex.OPERATORS, MToken.OPERATOR),
            (r"\s+", MToken.WHITESPACE),
            # Note IDENTIFIER should come after tokens that have IDENTIFIER
            # parts, like SYMBOLS. Otherwise we may have System`foo matching
            # identifier System over Symbol System`foo
            #
            # I don't understand why this is not a problem in
            # pygments-mathematica.
            (Regex.IDENTIFIER, MToken.SYMBOL),
        ],
        "comments": [
            (r"[^\*\(\)]+", MToken.COMMENT),
            (r"\*[^\)]", MToken.COMMENT),
            # Comments nest: "(*" pushes another level, "*)" pops one.
            (r"\(\*", MToken.COMMENT, "#push"),
            (r"\*\)", MToken.COMMENT, "#pop"),
            (r"\([^\*]?|[^\*]?\)", MToken.COMMENT),
        ],
        "numbers": [
            (Regex.BASE_NUMBER, MToken.NUMBER),
            (Regex.SCIENTIFIC_NUMBER, MToken.NUMBER),
            (Regex.REAL, MToken.NUMBER),
            (Regex.INTEGER, MToken.NUMBER),
        ],
        "strings": [
            (r'[^"\\]+', MToken.STRING),
            # There is deliberately no line-anchored rule for a backslash or
            # quote here.  Such a rule swallowed a closing quote that sits at
            # the start of a line without leaving this state, so the string
            # never ended.  The rules below already handle every backslash
            # and quote.
            (r"(\\n|\\r)", MToken.STRING),
            (r'\\"', MToken.STRING),
            (r"\\", MToken.STRING),
            (r'"', MToken.STRING, "#pop"),
        ],
    }

    def get_tokens_unprocessed(self, text, stack=("root",)):
        """Yield ``(index, token, value)`` triples with annotations applied.

        Each triple produced by ``RegexLexer`` is passed, in order, through
        the ``builtins``, ``unicode`` and ``lexical_scope`` annotators of a
        fresh ``MathematicaAnnotations`` instance, so scope tracking starts
        clean on every call.

        :param text: the source text to tokenize.
        :param stack: initial lexer state stack, forwarded to ``RegexLexer``.
        """
        annotator = MathematicaAnnotations()
        annotations = (
            annotator.builtins,
            annotator.unicode,
            annotator.lexical_scope,
        )
        for result in super().get_tokens_unprocessed(text, stack):
            for annotate in annotations:
                result = annotate(*result)

            yield result


class _State(dict):
    """A dict whose items can also be read and written as attributes.

    Reading an attribute that has not been set returns ``None`` instead of
    raising ``AttributeError``.
    """

    def __getattr__(self, attr):
        return self.get(attr)

    __setattr__ = dict.__setitem__


class MathematicaAnnotations:
    """Token post-processors applied on top of the regex lexer's output.

    Every annotator takes and returns an ``(index, token, value)`` triple.
    ``lexical_scope`` is stateful and must see the tokens in source order.
    """

    # Grouping tokens, other than brackets and braces, that open or close a
    # group inside an active scope.
    # NOTE(review): in the copy of the file this was restored from, the
    # non-ASCII members of these tuples were blank.  They are assumed to be
    # the left/right angle-bracket and left/right association characters
    # (U+2329/U+232A, U+27E8/U+27E9, U+F113/U+F114) -- confirm against
    # mma.UNICODE_GROUPINGS.
    _OPENING_GROUPS = ("<|", "\u2329", "\u27e8", "\uf113")
    _CLOSING_GROUPS = ("|>", "\u232a", "\u27e9", "\uf114")

    # Builtins whose first argument declares local variables.
    _SCOPE_KEYWORDS = ("Block", "With", "Module")

    def __init__(self):
        self.scope = _State()
        self._reset_scope_state()

    @staticmethod
    def builtins(index, token, value):
        """Re-tag a SYMBOL token as BUILTIN if it names a system symbol."""
        if token is MToken.SYMBOL and value in mma.SYSTEM_SYMBOLS:
            return index, MToken.BUILTIN, value
        return index, token, value

    @staticmethod
    def unicode(index, token, value):
        """Classify characters the regex lexer could not match.

        UNKNOWN tokens are looked up, in order, in the unicode system-symbol,
        grouping, operator and undefined-symbol tables of ``mma``; a miss
        leaves the token as UNKNOWN.  All other tokens pass through
        unchanged.
        """
        if token is not MToken.UNKNOWN:
            return index, token, value

        if value in mma.UNICODE_SYSTEM_SYMBOLS:
            new_token = MToken.BUILTIN
        elif value in mma.UNICODE_GROUPINGS:
            new_token = MToken.GROUP
        elif value in mma.UNICODE_OPERATORS:
            new_token = MToken.OPERATOR
        elif value in mma.UNICODE_SYSTEM_UNDEFINED_SYMBOLS:
            new_token = MToken.SYMBOL
        else:
            new_token = MToken.UNKNOWN
        return index, new_token, value

    def _reset_scope_state(self):
        """Return the scope tracker to its initial, inactive state."""
        # keyword = True denotes the presence of a trigger symbol such as
        # Block, With, Module. When keyword is True and is followed by a [,
        # then the parser enters an active state.
        self.scope.keyword = False
        self.scope.active = False

        # level tracks the nestedness of local scopes
        # (e.g. Block[{x = Block[{y = ...}, ...]}, ...])
        self.scope.level = 0

        # The next three variables are counters that track opening and
        # closing brackets, braces and other groupings (associations, angle
        # brackets, etc.) at each level. Braces are tracked only immediately
        # after entering an active scope, which is where the local variables
        # are defined.
        self.scope.brackets = defaultdict(int)
        self.scope.braces = defaultdict(int)
        self.scope.other_groups = defaultdict(int)

        # stack_state is a tuple of the above three counters at each level
        # when the parser is inside a local variable definition region, i.e.
        # when the parser is at { in Block[{x = 1}, x].  Until a snapshot is
        # stored for a level, the default value is 0, which never compares
        # equal to a snapshot tuple.
        self.scope.stack_state = defaultdict(int)

        # variables is the set of symbols/builtins that have been identified
        # as being in a local scope at each level. rhs is True when the
        # parser is in the RHS of an assignment (= or :=).
        self.scope.variables = defaultdict(set)
        self.scope.rhs = defaultdict(bool)

    def _reset_scope_level(self, level):
        """Forget everything recorded for ``level`` in the per-level maps."""
        scope_vars = (
            self.scope.brackets,
            self.scope.braces,
            self.scope.other_groups,
            self.scope.stack_state,
            self.scope.variables,
            self.scope.rhs,
        )
        for var in scope_vars:
            var.pop(level, None)

    def _get_stack_state(self, level):
        """Return the (brackets, braces, other_groups) counters at ``level``."""
        return (
            self.scope.brackets[level],
            self.scope.braces[level],
            self.scope.other_groups[level],
        )

    def lexical_scope(self, index, token, value):
        """Re-tag locally scoped variables as LOCAL_SCOPE.

        Tracks Block / With / Module calls.  Symbols declared in the leading
        ``{...}`` list of such a call, and later uses of those symbols inside
        the same call, are returned with the LOCAL_SCOPE token; every other
        token is returned unchanged.  Whitespace is ignored entirely and does
        not disturb the tracking state.
        """
        level = self.scope.level
        if token is MToken.WHITESPACE:
            return index, token, value

        if (
            self.scope.active
            and token is MToken.GROUP
            and value in self._OPENING_GROUPS
        ):
            self.scope.other_groups[level] += 1
            return index, token, value
        elif (
            self.scope.active
            and token is MToken.GROUP
            and value in self._CLOSING_GROUPS
        ):
            self.scope.other_groups[level] -= 1
            return index, token, value

        if self.scope.active and token is MToken.GROUP and value == "}":
            if self.scope.braces[level]:
                self.scope.braces[level] -= 1

            # Leaving the local variables section also ends any pending RHS.
            if not self.scope.braces[level]:
                self.scope.rhs[level] = False

            return index, token, value

        if self.scope.active and token is MToken.GROUP and value == "]":
            if self.scope.brackets[level]:
                self.scope.brackets[level] -= 1
                # The bracket that opened this scope has been closed: drop
                # the level's bookkeeping and return to the enclosing scope.
                if not self.scope.brackets[level] and level:
                    self._reset_scope_level(level)
                    self.scope.level -= 1

            if not self.scope.level:
                self._reset_scope_state()

            return index, token, value

        if token is MToken.BUILTIN and value in self._SCOPE_KEYWORDS:
            self.scope.keyword = True
            return index, token, value

        if token is MToken.GROUP and value == "[":
            # Enter an active state only if the preceding non-whitespace
            # token is one of the scope keyword symbols. If it is already in
            # an active state, the counter is incremented.
            if self.scope.keyword:
                self.scope.active = True
                self.scope.level += 1
                self.scope.keyword = False

            if self.scope.active:
                self.scope.brackets[self.scope.level] += 1

            return index, token, value

        if self.scope.active and token is MToken.GROUP and value == "{":
            if level not in self.scope.variables:
                # The parser is not yet in the local variables section so
                # initialize counters and containers and take a snapshot of
                # the stack state. The frozen stack state is used later to
                # identify the end of the RHS in an assignment expression.
                self.scope.variables[level] = set()
                self.scope.braces[level] += 1
                self.scope.stack_state[level] = self._get_stack_state(level)
            elif level in self.scope.variables and self.scope.braces[level]:
                # The parser is inside the local variables section.
                self.scope.braces[level] += 1
            else:
                # In all other cases don't modify the stack.
                pass

            return index, token, value

        if (
            self.scope.active
            and self.scope.braces[level]
            and token in (MToken.SYMBOL, MToken.BUILTIN)
        ):
            # The parser is inside the local variables section and on a
            # builtin or a generic symbol token. If it isn't in the RHS of an
            # assignment expression, then modify the token and add the value
            # to the set of local scope variables at this level.
            if not self.scope.rhs[level]:
                self.scope.variables[level].add(value)
                return index, MToken.LOCAL_SCOPE, value
            else:
                return index, token, value

        elif self.scope.active and self.scope.braces[level]:
            # If the parser is on an assignment operator, mark rhs = True so
            # that symbols from the RHS of the assignment are not considered
            # as local variables. The rhs value is reset when:
            #   1. the parser is on a , inside the local variables section
            #      and the stack state is the same as when it entered the
            #      section. For example, in Block[{x = 1, y = 2}, x + y], the
            #      stack state is the same at { and the first ,. But in
            #      Block[{x = {1, a}, y = 2}, x + y], the stack state is not
            #      the same at { and the first , so it is still part of the
            #      RHS.
            #   2. if it has exited the local variables section (handled
            #      earlier)
            if token is MToken.OPERATOR and value in ("=", ":="):
                self.scope.rhs[level] = True
            elif (
                token is MToken.GROUP
                and value == ","
                and self._get_stack_state(level) == self.scope.stack_state[level]
            ):
                self.scope.rhs[level] = False

            return index, token, value

        elif self.scope.active and token in (MToken.SYMBOL, MToken.BUILTIN):
            # If the code has reached here, the parser is outside the local
            # variables section and in the body of the scoping function.
            if value in self.scope.variables[level]:
                return index, MToken.LOCAL_SCOPE, value
            else:
                return index, token, value

        # Any other token separates a scope keyword from a following "[".
        self.scope.keyword = False
        return index, token, value