1# -*- coding: utf-8 -*-
2# Copyright (c) 2021 Rocky Bernstein
3# Copyright (c) 2016 rsmenon
4# Licensed under the MIT License (https://opensource.org/licenses/MIT)
5
6from collections import defaultdict
7
8from pygments.lexer import RegexLexer, include, words, bygroups
9from pygments.token import Token as PToken
10
11import mathics_pygments.builtins as mma
12
13
class Regex:
    """Regular-expression fragments from which the lexer rules are assembled.

    Every attribute is a pattern string. Later attributes interpolate earlier
    ones through f-strings, so the order of definition matters.
    """

    # A bare name: a letter or "$" followed by letters, digits or "$".
    IDENTIFIER = r"[a-zA-Z\$][a-zA-Z0-9\$]*"

    # A backslash followed by a bracketed identifier, e.g. \[Alpha].
    NAMED_CHARACTER = fr"\\\[{IDENTIFIER}\]"

    # Identifiers and/or named characters joined by backquotes, with an
    # optional leading and trailing backquote, e.g. System`foo.
    SYMBOLS = (fr'[`]?({IDENTIFIER}|{NAMED_CHARACTER})(`({IDENTIFIER}|{NAMED_CHARACTER}))*[`]?')

    # One or more decimal digits.
    INTEGER = r"[0-9]+"

    # Digits containing a decimal point: ".5", "1.5" or "1.".
    FLOAT = f"({INTEGER})?[.][0-9]+|{INTEGER}[.]"

    # An integer or float followed by a backquote and an optional
    # integer/float, or a plain float.
    REAL = fr"({INTEGER}|{FLOAT})`({INTEGER}|{FLOAT})?|{FLOAT}"

    # <integer> ^^ <real or integer>, with optional whitespace around "^^".
    BASE_NUMBER = fr"{INTEGER}\s*\^\^\s*({REAL}|{INTEGER})"

    # <real or integer> *^ <integer>, with optional whitespace around "*^".
    SCIENTIFIC_NUMBER = fr"({REAL}|{INTEGER})\s*\*\^\s*{INTEGER}"

    # A symbol followed by one to three underscores and an optional symbol,
    # or an optional symbol, one to three underscores and a symbol.
    PATTERNS = fr"{SYMBOLS}\_{{1,3}}({SYMBOLS})?|({SYMBOLS})?\_{{1,3}}{SYMBOLS}"

    # "#" followed by a symbol or a double-quoted symbol, or one or two "#"
    # followed by optional digits.
    SLOTS = fr"#{SYMBOLS}|#\"{SYMBOLS}\"|#{{1,2}}[0-9]*"

    # "::", optional whitespace, then a symbol. The first three capture groups
    # are the "::", the whitespace and the symbol.
    # The whitespace group must be written "\s" here: this is a raw string, so
    # "\\s" would require a literal backslash and "::name" would never match.
    MESSAGES = fr"(::)(\s*)({SYMBOLS})"

    # Regexes built by Pygments' words() helper from the grouping and operator
    # collections of the builtins module.
    GROUPINGS = words(mma.GROUPINGS).get()
    OPERATORS = words(mma.OPERATORS).get()

    # "<word>::<word>: <text>" with four capture groups: the two words, the
    # single space after the colon, and the rest of the line.
    MATHICS_MESSAGE = "(\\w+)::(\\w+):( )(.+)"
29
30
class MToken:
    """
    Token names used by the Mathics lexer, expressed as Pygments token types.

    Each class variable names a Mathics lexical category (BUILTIN, COMMENT,
    ...) and holds the Pygments token type that the category is reported as.

    The selected Pygments style, e.g. "colorful", "zenburn" or "xcode",
    ultimately determines how each of these appears on a terminal.
    """

    # Names
    SYMBOL = PToken.Name.Variable
    BUILTIN = PToken.Name.Function
    SLOT = PToken.Name.Function
    PATTERN = PToken.Name.Tag
    LOCAL_SCOPE = PToken.Name.Variable.Class
    MESSAGE = PToken.Name.Exception
    NAMESPACE = PToken.Namespace

    # Literals
    NUMBER = PToken.Number
    STRING = PToken.Generic.Emph

    # Operators and punctuation
    OPERATOR = PToken.Operator
    GROUP = PToken.Punctuation
    PUNCTUATION = PToken.Punctuation

    # Text, comments and everything else
    COMMENT = PToken.Comment
    TEXT = PToken.Generic.Output
    WHITESPACE = PToken.Text.Whitespace
    UNKNOWN = PToken.Error
57
58
class MathematicaLexer(RegexLexer):
    """Pygments lexer for Mathematica / Wolfram Language / Mathics source.

    The regex rules in ``tokens`` produce a first-pass token stream.
    ``get_tokens_unprocessed`` then passes every token through the
    ``MathematicaAnnotations`` passes, which may change its token type.
    """

    name = "Mathematica"
    aliases = ["mathematica", "mathics", "mma", "nb", "wl", "wolfram", "wolfram-language"]
    filenames = ["*.cdf", "*.m", "*.ma", "*.nb", "*.wl"]
    mimetypes = [
        "application/mathematica",
        "application/vnd.wolfram.mathematica",
        "application/vnd.wolfram.mathematica.package",
        "application/vnd.wolfram.cdf",
        "application/vnd.wolfram.cdf.text",
    ]
    tokens = {
        "root": [
            (r"\(\*", MToken.COMMENT, "comments"),
            (r'"', MToken.STRING, "strings"),
            include("numbers"),
            (Regex.PATTERNS, MToken.PATTERN),

            (Regex.SYMBOLS, MToken.SYMBOL),
            # NOTE(review): Regex.SYMBOLS above will normally consume the
            # leading word before this rule is tried, and the "::" / ":"
            # separators are outside every capture group -- confirm that the
            # rule is reached and emits the text that is intended.
            (
                Regex.MATHICS_MESSAGE,
                bygroups(MToken.OPERATOR, MToken.WHITESPACE, MToken.TEXT, MToken.TEXT),
            ),

            (Regex.SLOTS, MToken.SLOT),
            (Regex.GROUPINGS, MToken.GROUP),
            (
                Regex.MESSAGES,
                bygroups(MToken.OPERATOR, MToken.WHITESPACE, MToken.MESSAGE),
            ),
            (Regex.OPERATORS, MToken.OPERATOR),
            (r"\s+", MToken.WHITESPACE),

            # Note IDENTIFIER should come after tokens that have IDENTIFIER parts, like SYMBOLS.
            # Otherwise we may have System`foo matching identifier System over Symbol System`foo
            #
            # I don't understand why this is not a problem in  pygments-mathematica.
            (Regex.IDENTIFIER, MToken.SYMBOL),
        ],
        # Comments nest: "(*" pushes another "comments" state, "*)" pops one.
        "comments": [
            (r"[^\*\(\)]+", MToken.COMMENT),
            (r"\*[^\)]", MToken.COMMENT),
            (r"\(\*", MToken.COMMENT, "#push"),
            (r"\*\)", MToken.COMMENT, "#pop"),
            (r"\([^\*]?|[^\*]?\)", MToken.COMMENT),
        ],
        # Most specific number forms first so that e.g. "2^^101" is not split.
        "numbers": [
            (Regex.BASE_NUMBER, MToken.NUMBER),
            (Regex.SCIENTIFIC_NUMBER, MToken.NUMBER),
            (Regex.REAL, MToken.NUMBER),
            (Regex.INTEGER, MToken.NUMBER),
        ],
        "strings": [
            (r'[^"\\]+', MToken.STRING),
            # NOTE(review): if the lexer runs with re.MULTILINE, "^" also
            # matches after a newline, so a quote at the start of a line would
            # be taken as string content instead of closing the string --
            # confirm this is intended.
            (r'^[\\"]', MToken.STRING),
            (r"(\\n|\\r)", MToken.STRING),
            (r'\\"', MToken.STRING),
            (r"\\", MToken.STRING),
            (r'"', MToken.STRING, "#pop"),
        ],
    }

    def get_tokens_unprocessed(self, text, stack=("root",)):
        """Yield ``(index, token, value)`` triples for ``text``.

        Each triple produced by the regex rules is passed, in order, through
        the ``builtins``, ``unicode`` and ``lexical_scope`` annotation passes.

        :param text: the source text to tokenize.
        :param stack: initial lexer state stack, forwarded to ``RegexLexer``.
        """
        # A fresh annotator per call, so scope tracking starts clean each time.
        annotator = MathematicaAnnotations()
        annotations = (annotator.builtins, annotator.unicode, annotator.lexical_scope)
        # Forward ``stack`` so a caller-supplied initial state is honoured
        # instead of always starting from the default.
        for result in super().get_tokens_unprocessed(text, stack):
            for annotate in annotations:
                result = annotate(*result)

            yield result
130
131
class _State(dict):
    """A dict whose items can also be read and written as attributes.

    Reading an attribute with no matching key gives ``None`` rather than
    raising. Assigning an attribute stores it as a dictionary item.
    """

    def __getattr__(self, attr):
        # Only called when normal attribute lookup fails, so dict methods such
        # as ``get`` or ``pop`` are never shadowed by keys.
        try:
            return self[attr]
        except KeyError:
            return None

    def __setattr__(self, attr, value):
        self[attr] = value
137
138
class MathematicaAnnotations:
    """Post-processing passes that refine the token type of lexed tokens.

    Each pass takes an ``(index, token, value)`` triple and returns a triple
    of the same shape, possibly with a different token type. ``builtins`` and
    ``unicode`` are stateless. ``lexical_scope`` keeps state in ``self.scope``
    across calls, so one instance should see the tokens of one input in order.
    """

    def __init__(self):
        self.scope = _State()
        self._reset_scope_state()

    @staticmethod
    def builtins(index, token, value):
        """Retag a SYMBOL whose text is a known system symbol as BUILTIN."""
        if token is MToken.SYMBOL and value in mma.SYSTEM_SYMBOLS:
            return index, MToken.BUILTIN, value
        return index, token, value

    @staticmethod
    def unicode(index, token, value):
        """Retag an UNKNOWN token according to the Unicode tables in ``mma``.

        The tables are checked in this order: system symbols, groupings,
        operators, undefined system symbols. A value found in none of them
        stays UNKNOWN. Tokens of any other type are returned unchanged.
        """
        if token is not MToken.UNKNOWN:
            return index, token, value

        if value in mma.UNICODE_SYSTEM_SYMBOLS:
            new_token = MToken.BUILTIN
        elif value in mma.UNICODE_GROUPINGS:
            new_token = MToken.GROUP
        elif value in mma.UNICODE_OPERATORS:
            new_token = MToken.OPERATOR
        elif value in mma.UNICODE_SYSTEM_UNDEFINED_SYMBOLS:
            new_token = MToken.SYMBOL
        else:
            new_token = MToken.UNKNOWN
        return index, new_token, value

    def _reset_scope_state(self):
        """Reset all scope-tracking state to its initial, inactive values."""
        # keyword = True denotes the presence of a trigger symbol such as Block, With, Module
        # When keyword is True and is followed by a [, then the parser enters an active state
        self.scope.keyword = False
        self.scope.active = False

        # level tracks the nestedness of local scopes (e.g. Block[{x = Block[{y = ...}, ...]}, ...])
        self.scope.level = 0

        # The next three variables are per-level counters that track opening and closing
        # brackets, braces and other groupings (associations, angle brackets, etc.).
        # Braces are tracked only immediately after entering an active scope, which is where the
        # local variables are defined.
        self.scope.brackets = defaultdict(int)
        self.scope.braces = defaultdict(int)
        self.scope.other_groups = defaultdict(int)

        # stack_state is a tuple of the above three counters at each level when the parser is inside
        # a local variable definition region. i.e. when the parser is at { in Block[{x = 1}, x]
        self.scope.stack_state = defaultdict(int)

        # variables is the set of symbols/builtins that have been identified as being in a local
        # scope at each level. rhs is True when the parser is in the RHS of an assignment (= or :=)
        self.scope.variables = defaultdict(set)
        self.scope.rhs = defaultdict(bool)

    def _reset_scope_level(self, level):
        """Discard every per-level record kept for ``level``."""
        scope_vars = (
            self.scope.brackets,
            self.scope.braces,
            self.scope.other_groups,
            self.scope.stack_state,
            self.scope.variables,
            self.scope.rhs,
        )
        for var in scope_vars:
            # A default avoids a KeyError for containers that never saw this level.
            var.pop(level, None)

    def _get_stack_state(self, level):
        """Return the ``(brackets, braces, other_groups)`` counters for ``level``."""
        return (
            self.scope.brackets[level],
            self.scope.braces[level],
            self.scope.other_groups[level],
        )

    def lexical_scope(self, index, token, value):
        """Retag local variables of ``Block``/``With``/``Module`` as LOCAL_SCOPE.

        The method is a small state machine driven by the tokens it is fed in
        order. A scope keyword followed by ``[`` opens a new level. Symbols in
        the first ``{...}`` of that level, outside the right-hand side of an
        assignment, are recorded as local variables and retagged. The same
        names are retagged again when they appear later at that level. All
        other tokens are returned unchanged.
        """
        level = self.scope.level
        if token is MToken.WHITESPACE:
            return index, token, value

        # NOTE(review): the second and third entries of the two tuples below appear as plain
        # spaces here; presumably they were Unicode bracket characters -- confirm against the
        # upstream source.
        if self.scope.active and token is MToken.GROUP and value in ("<|", " ", " "):
            self.scope.other_groups[level] += 1
            return index, token, value
        elif self.scope.active and token is MToken.GROUP and value in ("|>", " ", " "):
            self.scope.other_groups[level] -= 1
            return index, token, value

        if self.scope.active and token is MToken.GROUP and value == "}":
            if self.scope.braces[level]:
                self.scope.braces[level] -= 1

            # Leaving the outermost brace of the local variables section also ends any RHS.
            if not self.scope.braces[level]:
                self.scope.rhs[level] = False

            return index, token, value

        if self.scope.active and token is MToken.GROUP and value == "]":
            if self.scope.brackets[level]:
                self.scope.brackets[level] -= 1
                # The bracket that opened this level has been closed: drop the level.
                if not self.scope.brackets[level] and level:
                    self._reset_scope_level(level)
                    self.scope.level -= 1

                # No enclosing scope is left, so return to the inactive state.
                if not self.scope.level:
                    self._reset_scope_state()

            return index, token, value

        if token is MToken.BUILTIN and value in ("Block", "With", "Module"):
            self.scope.keyword = True
            return index, token, value

        if token is MToken.GROUP and value == "[":
            # Enter an active state only if the preceding non-whitespace token is one of the scope
            # keyword symbols. If it is already in an active state, the counter is incremented.
            if self.scope.keyword:
                self.scope.active = True
                self.scope.level += 1
                self.scope.keyword = False

            if self.scope.active:
                # Uses the (possibly just incremented) current level, not the local ``level``.
                self.scope.brackets[self.scope.level] += 1

            return index, token, value

        if self.scope.active and token is MToken.GROUP and value == "{":
            if level not in self.scope.variables:
                # The parser is not yet in the local variables section so initialize counters and
                # containers and take a snapshot of the stack state. The frozen stack state is used
                # later to identify the end of the RHS in an assignment expression.
                self.scope.variables[level] = set()
                self.scope.braces[level] += 1
                self.scope.stack_state[level] = self._get_stack_state(level)
            elif self.scope.braces[level]:
                # The parser is inside the local variables section.
                self.scope.braces[level] += 1
            # In all other cases don't modify the stack.

            return index, token, value

        if (
            self.scope.active
            and self.scope.braces[level]
            and token in (MToken.SYMBOL, MToken.BUILTIN)
        ):
            # The parser is inside the local variables section and on a builtin or a generic symbol
            # token. If it isn't in the RHS of an assignment expression, then modify the token and
            # add the value to the list of local scope variables at this level.
            if not self.scope.rhs[level]:
                self.scope.variables[level].add(value)
                return index, MToken.LOCAL_SCOPE, value
            else:
                return index, token, value

        elif self.scope.active and self.scope.braces[level]:
            # If the parser is on an assignment operator, mark rhs = True so that symbols from the
            # RHS of the assignment are not considered as local variables. The rhs value is reset
            # when:
            #   1. the parser is on a , inside the local variables section and the stack state
            #      is the same as when it entered the section. For example, in
            #      Block[{x = 1, y = 2}, x + y], the stack state is the same at { and the first ,.
            #      But in Block[{x = {1, a}, y = 2}, x + y], the stack state is not the same at {
            #      and the first , so it is still part of the RHS.
            #   2. if it has exited the local variables section (handled earlier)
            if token is MToken.OPERATOR and value in ("=", ":="):
                self.scope.rhs[level] = True
            elif (
                token is MToken.GROUP
                and value == ","
                and self._get_stack_state(level) == self.scope.stack_state[level]
            ):
                self.scope.rhs[level] = False

            return index, token, value

        elif self.scope.active and token in (MToken.SYMBOL, MToken.BUILTIN):
            # If the code has reached here, the parser is outside the local variables section and in
            # the body of the scoping function.
            if value in self.scope.variables[level]:
                return index, MToken.LOCAL_SCOPE, value
            else:
                return index, token, value

        # Any other token cancels a pending scope keyword.
        self.scope.keyword = False
        return index, token, value
324