1# -*- coding: utf-8 -*-
2"""
3    pygments.lexers.stata
4    ~~~~~~~~~~~~~~~~~~~~~
5
6    Lexer for Stata
7
8    :copyright: Copyright 2006-2020 by the Pygments team, see AUTHORS.
9    :license: BSD, see LICENSE for details.
10"""
11
12import re
13from pygments.lexer import RegexLexer, default, include, words
14from pygments.token import Comment, Keyword, Name, Number, \
15    String, Text, Operator
16
17from pygments.lexers._stata_builtins import builtins_base, builtins_functions
18
19__all__ = ['StataLexer']
20
21
class StataLexer(RegexLexer):
    """
    For `Stata <http://www.stata.com/>`_ do files.

    The lexer is a pure state machine: ``root`` tries comments, strings,
    macros, numbers, keywords, operators and display formats in that order
    and falls back to emitting a single character as ``Text``.  Comments,
    compound strings and macros can nest, which is modelled by pushing
    states onto the lexer stack.

    .. versionadded:: 2.2
    """
    # Syntax based on
    # - http://fmwww.bc.edu/RePEc/bocode/s/synlightlist.ado
    # - https://github.com/isagalaev/highlight.js/blob/master/src/languages/stata.js
    # - https://github.com/jpitblado/vim-stata/blob/master/syntax/stata.vim

    name      = 'Stata'
    aliases   = ['stata', 'do']
    filenames = ['*.do', '*.ado']
    mimetypes = ['text/x-stata', 'text/stata', 'application/x-stata']
    # DOTALL lets the single-character '.' fallbacks consume newlines too;
    # MULTILINE makes '^' match at the start of every line.
    flags     = re.MULTILINE | re.DOTALL

    tokens = {
        'root': [
            include('comments'),
            include('strings'),
            include('macros'),
            include('numbers'),
            include('keywords'),
            include('operators'),
            include('format'),
            (r'.', Text),
        ],
        # Comments are a complicated beast in Stata because they can be
        # nested and there are a few corner cases with that. See:
        # - github.com/kylebarron/language-stata/issues/90
        # - statalist.org/forums/forum/general-stata-discussion/general/1448244
        'comments': [
            (r'(^//|(?<=\s)//)(?!/)', Comment.Single, 'comments-double-slash'),
            (r'^\s*\*', Comment.Single, 'comments-star'),
            (r'/\*', Comment.Multiline, 'comments-block'),
            (r'(^///|(?<=\s)///)', Comment.Special, 'comments-triple-slash')
        ],
        'comments-block': [
            (r'/\*', Comment.Multiline, '#push'),
            # this ends and restarts a comment block. but need to catch this so
            # that it doesn't start _another_ level of comment blocks
            (r'\*/\*', Comment.Multiline),
            (r'(\*/\s+\*(?!/)[^\n]*)|(\*/)', Comment.Multiline, '#pop'),
            # Match anything else as a character inside the comment
            (r'.', Comment.Multiline),
        ],
        'comments-star': [
            (r'///.*?\n', Comment.Single,
                ('#pop', 'comments-triple-slash')),
            (r'(^//|(?<=\s)//)(?!/)', Comment.Single,
                ('#pop', 'comments-double-slash')),
            (r'/\*', Comment.Multiline, 'comments-block'),
            # The last character before the newline ends the star comment.
            (r'.(?=\n)', Comment.Single, '#pop'),
            (r'.', Comment.Single),
        ],
        'comments-triple-slash': [
            (r'\n', Comment.Special, '#pop'),
            # A // breaks out of a comment for the rest of the line
            (r'//.*?(?=\n)', Comment.Single, '#pop'),
            (r'.', Comment.Special),
        ],
        'comments-double-slash': [
            (r'\n', Text, '#pop'),
            (r'.', Comment.Single),
        ],
        # `"compound string"' and regular "string"; note the former are
        # nested.
        'strings': [
            (r'`"', String, 'string-compound'),
            (r'(?<!`)"', String, 'string-regular'),
        ],
        'string-compound': [
            (r'`"', String, '#push'),
            (r'"\'', String, '#pop'),
            (r'\\\\|\\"|\\\$|\\`|\\\n', String.Escape),
            include('macros'),
            (r'.', String)
        ],
        'string-regular': [
            # A closing quote (not the "' that closes a compound string) or
            # the end of the line terminates a regular string.
            (r'(")(?!\')|(?=\n)', String, '#pop'),
            (r'\\\\|\\"|\\\$|\\`|\\\n', String.Escape),
            include('macros'),
            (r'.', String)
        ],
        # A local is usually
        #     `\w{0,31}'
        #     `:extended macro'
        #     `=expression'
        #     `[rsen](results)'
        #     `(++--)scalar(++--)'
        #
        # However, there are all sorts of weird rules wrt edge
        # cases. Instead of writing 27 exceptions, anything inside
        # `' is a local.
        #
        # A global is more restricted, so we do follow rules. Note only
        # locals explicitly enclosed ${} can be nested.
        'macros': [
            (r'\$(\{|(?=[$`]))', Name.Variable.Global, 'macro-global-nested'),
            (r'\$', Name.Variable.Global,  'macro-global-name'),
            (r'`', Name.Variable, 'macro-local'),
        ],
        'macro-local': [
            (r'`', Name.Variable, '#push'),
            (r"'", Name.Variable, '#pop'),
            (r'\$(\{|(?=[$`]))', Name.Variable.Global, 'macro-global-nested'),
            (r'\$', Name.Variable.Global, 'macro-global-name'),
            (r'.', Name.Variable),  # fallback
        ],
        'macro-global-nested': [
            (r'\$(\{|(?=[$`]))', Name.Variable.Global, '#push'),
            (r'\}', Name.Variable.Global, '#pop'),
            (r'\$', Name.Variable.Global, 'macro-global-name'),
            (r'`', Name.Variable, 'macro-local'),
            (r'\w', Name.Variable.Global),  # fallback
            default('#pop'),
        ],
        'macro-global-name': [
            # Every rule leaves this state.  A state transition has to be a
            # single third element; to pop and then enter another state it
            # must be a tuple, otherwise the extra element is ignored.
            (r'\$(\{|(?=[$`]))', Name.Variable.Global,
                ('#pop', 'macro-global-nested')),
            (r'\$', Name.Variable.Global,
                ('#pop', 'macro-global-name')),
            (r'`', Name.Variable,
                ('#pop', 'macro-local')),
            (r'\w{1,32}', Name.Variable.Global, '#pop'),
            # A `$` that is not followed by a macro name (for instance the
            # end-of-line anchor in a regular expression such as "abc$") is
            # not a macro reference.  Leave the state instead of getting
            # stuck here and flagging the rest of the line as errors.
            default('#pop'),
        ],
        # Built in functions and statements
        'keywords': [
            (words(builtins_functions, prefix=r'\b', suffix=r'(?=\()'),
             Name.Function),
            (words(builtins_base, prefix=r'(^\s*|\s)', suffix=r'\b'),
             Keyword),
        ],
        # http://www.stata.com/help.cgi?operators
        # Multi-character operators are listed before the single characters
        # they start with, so that e.g. `~=` and `!=` become one token.
        'operators': [
            (r'-|==|<=|>=|<|>|&|!=', Operator),
            (r'\*|\+|\^|/|~=|!|~|\|', Operator)
        ],
        # Stata numbers
        'numbers': [
            # decimal number
            (r'\b[+-]?([0-9]+(\.[0-9]+)?|\.[0-9]+|\.)([eE][+-]?[0-9]+)?[i]?\b',
             Number),
        ],
        # Stata formats
        'format': [
            (r'%-?\d{1,2}(\.\d{1,2})?[gfe]c?', Name.Other),
            (r'%(21x|16H|16L|8H|8L)', Name.Other),
            (r'%-?(tc|tC|td|tw|tm|tq|th|ty|tg)\S{0,32}', Name.Other),
            (r'%[-~]?\d{1,4}s', Name.Other),
        ]
    }
172