1-- Copyright 2006-2021 Mitchell. See LICENSE.
2-- Perl LPeg lexer.
3
4local lexer = require('lexer')
5local token, word_match = lexer.token, lexer.word_match
6local P, S, V = lpeg.P, lpeg.S, lpeg.V
7
8local lex = lexer.new('perl')
9
10-- Whitespace.
11lex:add_rule('perl', token(lexer.WHITESPACE, lexer.space^1))
12
13-- Keywords.
14lex:add_rule('keyword', token(lexer.KEYWORD, word_match[[
15  STDIN STDOUT STDERR BEGIN END CHECK INIT
16  require use
17  break continue do each else elsif foreach for if last local my next our
18  package return sub unless until while __FILE__ __LINE__ __PACKAGE__
19  and or not eq ne lt gt le ge
20]]))
21
22-- Markers.
23lex:add_rule('marker', token(lexer.COMMENT, word_match('__DATA__ __END__') *
24  lexer.any^0))
25
26-- Functions.
27lex:add_rule('function', token(lexer.FUNCTION, word_match[[
28  abs accept alarm atan2 bind binmode bless caller chdir chmod chomp chop chown
29  chr chroot closedir close connect cos crypt dbmclose dbmopen defined delete
30  die dump each endgrent endhostent endnetent endprotoent endpwent endservent
31  eof eval exec exists exit exp fcntl fileno flock fork format formline getc
32  getgrent getgrgid getgrnam gethostbyaddr gethostbyname gethostent getlogin
33  getnetbyaddr getnetbyname getnetent getpeername getpgrp getppid getpriority
34  getprotobyname getprotobynumber getprotoent getpwent getpwnam getpwuid
35  getservbyname getservbyport getservent getsockname getsockopt glob gmtime goto
36  grep hex import index int ioctl join keys kill lcfirst lc length link listen
37  localtime log lstat map mkdir msgctl msgget msgrcv msgsnd new oct opendir open
38  ord pack pipe pop pos printf print prototype push quotemeta rand readdir read
39  readlink recv redo ref rename reset reverse rewinddir rindex rmdir scalar
40  seekdir seek select semctl semget semop send setgrent sethostent setnetent
41  setpgrp setpriority setprotoent setpwent setservent setsockopt shift shmctl
42  shmget shmread shmwrite shutdown sin sleep socket socketpair sort splice split
43  sprintf sqrt srand stat study substr symlink syscall sysread sysseek system
44  syswrite telldir tell tied tie time times truncate ucfirst uc umask undef
45  unlink unpack unshift untie utime values vec wait waitpid wantarray warn write
46]]))
47
48local delimiter_matches = {['('] = ')', ['['] = ']', ['{'] = '}', ['<'] = '>'}
49local literal_delimited = P(function(input, index) -- for single delimiter sets
50  local delimiter = input:sub(index, index)
51  if not delimiter:find('%w') then -- only non alpha-numerics
52    local patt
53    if delimiter_matches[delimiter] then
54      -- Handle nested delimiter/matches in strings.
55      local s, e = delimiter, delimiter_matches[delimiter]
56      patt = lexer.range(s, e, false, true, true)
57    else
58      patt = lexer.range(delimiter)
59    end
60    local match_pos = lpeg.match(patt, input, index)
61    return match_pos or #input + 1
62  end
63end)
64local literal_delimited2 = P(function(input, index) -- for 2 delimiter sets
65  local delimiter = input:sub(index, index)
66  -- Only consider non-alpha-numerics and non-spaces as delimiters. The
67  -- non-spaces are used to ignore operators like "-s".
68  if not delimiter:find('[%w ]') then
69    local patt
70    if delimiter_matches[delimiter] then
71      -- Handle nested delimiter/matches in strings.
72      local s, e = delimiter, delimiter_matches[delimiter]
73      patt = lexer.range(s, e, false, true, true)
74    else
75      patt = lexer.range(delimiter)
76    end
77    local first_match_pos = lpeg.match(patt, input, index)
78    local final_match_pos = lpeg.match(patt, input, first_match_pos - 1)
79    if not final_match_pos then -- using (), [], {}, or <> notation
80      final_match_pos = lpeg.match(lexer.space^0 * patt, input, first_match_pos)
81    end
82    return final_match_pos or #input + 1
83  end
84end)
85
86-- Strings.
87local sq_str = lexer.range("'")
88local dq_str = lexer.range('"')
89local cmd_str = lexer.range('`')
90local heredoc = '<<' * P(function(input, index)
91  local s, e, delimiter = input:find('([%a_][%w_]*)[\n\r\f;]+', index)
92  if s == index and delimiter then
93    local end_heredoc = '[\n\r\f]+'
94    local _, e = input:find(end_heredoc .. delimiter, e)
95    return e and e + 1 or #input + 1
96  end
97end)
98local lit_str = 'q' * P('q')^-1 * literal_delimited
99local lit_array = 'qw' * literal_delimited
100local lit_cmd = 'qx' * literal_delimited
101local lit_tr = (P('tr') + 'y') * literal_delimited2 * S('cds')^0
102local string = token(lexer.STRING, sq_str + dq_str + cmd_str + heredoc +
103  lit_str + lit_array + lit_cmd + lit_tr)
104local regex_str = #P('/') * lexer.last_char_includes('-<>+*!~\\=%&|^?:;([{') *
105  lexer.range('/', true) * S('imosx')^0
106local lit_regex = 'qr' * literal_delimited * S('imosx')^0
107local lit_match = 'm' * literal_delimited * S('cgimosx')^0
108local lit_sub = 's' * literal_delimited2 * S('ecgimosx')^0
109local regex = token(lexer.REGEX, regex_str + lit_regex + lit_match + lit_sub)
110lex:add_rule('string', string + regex)
111
112-- Identifiers.
113lex:add_rule('identifier', token(lexer.IDENTIFIER, lexer.word))
114
115-- Comments.
116local line_comment = lexer.to_eol('#', true)
117local block_comment = lexer.range(lexer.starts_line('=' * lexer.alpha),
118  lexer.starts_line('=cut'))
119lex:add_rule('comment', token(lexer.COMMENT, block_comment + line_comment))
120
121-- Numbers.
122lex:add_rule('number', token(lexer.NUMBER, lexer.number))
123
124-- Variables.
125local special_var = '$' * (
126  '^' * S('ADEFHILMOPSTWX')^-1 +
127  S('\\"[]\'&`+*.,;=%~?@<>(|/!-') +
128  ':' * (lexer.any - ':') +
129  P('$') * -lexer.word +
130  lexer.digit^1)
131local plain_var = ('$#' + S('$@%')) * P('$')^0 * lexer.word + '$#'
132lex:add_rule('variable', token(lexer.VARIABLE, special_var + plain_var))
133
134-- Operators.
135lex:add_rule('operator', token(lexer.OPERATOR, S('-<>+*!~\\=/%&|^.,?:;()[]{}')))
136
137-- Fold points.
138lex:add_fold_point(lexer.OPERATOR, '[', ']')
139lex:add_fold_point(lexer.OPERATOR, '{', '}')
140lex:add_fold_point(lexer.COMMENT, lexer.fold_consecutive_lines('#'))
141
142return lex
143