1-- Copyright 2006-2021 Mitchell. See LICENSE.
2-- Python LPeg lexer.
3
4local lexer = require('lexer')
5local token, word_match = lexer.token, lexer.word_match
6local P, S = lpeg.P, lpeg.S
7
8local lex = lexer.new('python', {fold_by_indentation = true})
9
10-- Whitespace.
11lex:add_rule('whitespace', token(lexer.WHITESPACE, lexer.space^1))
12
13-- Keywords.
14lex:add_rule('keyword', token(lexer.KEYWORD, word_match[[
15  and as assert async await break class continue def del elif else except exec
16  finally for from global if import in is lambda nonlocal not or pass print
17  raise return try while with yield
18  -- Descriptors/attr access.
19  __get__ __set__ __delete__ __slots__
20  -- Class.
21  __new__ __init__ __del__ __repr__ __str__ __cmp__ __index__ __lt__ __le__
22  __gt__ __ge__ __eq__ __ne__ __hash__ __nonzero__ __getattr__ __getattribute__
23  __setattr__ __delattr__ __call__
24  -- Operator.
25  __add__ __sub__ __mul__ __div__ __floordiv__ __mod__ __divmod__ __pow__
26  __and__ __xor__ __or__ __lshift__ __rshift__ __nonzero__ __neg__ __pos__
27  __abs__ __invert__ __iadd__ __isub__ __imul__ __idiv__ __ifloordiv__ __imod__
28  __ipow__ __iand__ __ixor__ __ior__ __ilshift__ __irshift__
29  -- Conversions.
30  __int__ __long__ __float__ __complex__ __oct__ __hex__ __coerce__
31  -- Containers.
32  __len__ __getitem__ __missing__ __setitem__ __delitem__ __contains__ __iter__
33  __getslice__ __setslice__ __delslice__
34  -- Module and class attribs.
35  __doc__ __name__ __dict__ __file__ __path__ __module__ __bases__ __class__
36  __self__
37  -- Stdlib/sys.
38  __builtin__ __future__ __main__ __import__ __stdin__ __stdout__ __stderr__
39  -- Other.
40  __debug__ __doc__ __import__ __name__
41]]))
42
43-- Functions.
44lex:add_rule('function', token(lexer.FUNCTION, word_match[[
45  abs all any apply basestring bool buffer callable chr classmethod cmp coerce
46  compile complex copyright credits delattr dict dir divmod enumerate eval
47  execfile exit file filter float frozenset getattr globals hasattr hash help
48  hex id input int intern isinstance issubclass iter len license list locals
49  long map max min object oct open ord pow property quit range raw_input reduce
50  reload repr reversed round set setattr slice sorted staticmethod str sum super
51  tuple type unichr unicode vars xrange zip
52]]))
53
54-- Constants.
55lex:add_rule('constant', token(lexer.CONSTANT, word_match[[
56  ArithmeticError AssertionError AttributeError BaseException DeprecationWarning
57  EOFError Ellipsis EnvironmentError Exception False FloatingPointError
58  FutureWarning GeneratorExit IOError ImportError ImportWarning IndentationError
59  IndexError KeyError KeyboardInterrupt LookupError MemoryError NameError None
60  NotImplemented NotImplementedError OSError OverflowError
61  PendingDeprecationWarning ReferenceError RuntimeError RuntimeWarning
62  StandardError StopIteration SyntaxError SyntaxWarning SystemError SystemExit
63  TabError True TypeError UnboundLocalError UnicodeDecodeError
64  UnicodeEncodeError UnicodeError UnicodeTranslateError UnicodeWarning
65  UserWarning ValueError Warning ZeroDivisionError
66]]))
67
68-- Self.
69lex:add_rule('self', token('self', P('self')))
70lex:add_style('self', lexer.styles.type)
71
72-- Identifiers.
73lex:add_rule('identifier', token(lexer.IDENTIFIER, lexer.word))
74
75-- Comments.
76lex:add_rule('comment', token(lexer.COMMENT, lexer.to_eol('#', true)))
77
78-- Strings.
79local sq_str = P('u')^-1 * lexer.range("'", true)
80local dq_str = P('U')^-1 * lexer.range('"', true)
81local tq_str = lexer.range("'''") + lexer.range('"""')
82-- TODO: raw_strs cannot end in single \.
83local raw_sq_str = P('u')^-1 * 'r' * lexer.range("'", false, false)
84local raw_dq_str = P('U')^-1 * 'R' * lexer.range('"', false, false)
85lex:add_rule('string', token(lexer.STRING, tq_str + sq_str + dq_str +
86  raw_sq_str + raw_dq_str))
87
88-- Numbers.
89local dec = lexer.dec_num * S('Ll')^-1
90local bin = '0b' * S('01')^1 * ('_' * S('01')^1)^0
91local oct = lexer.oct_num * S('Ll')^-1
92local integer = S('+-')^-1 * (bin + lexer.hex_num + oct + dec)
93lex:add_rule('number', token(lexer.NUMBER, lexer.float + integer))
94
95-- Decorators.
96lex:add_rule('decorator', token('decorator', lexer.to_eol('@')))
97lex:add_style('decorator', lexer.styles.preprocessor)
98
99-- Operators.
100lex:add_rule('operator', token(lexer.OPERATOR, S('!%^&*()[]{}-=+/|:;.,?<>~`')))
101
102return lex
103