1-- Copyright 2006-2021 Mitchell. See LICENSE. 2-- Perl LPeg lexer. 3 4local lexer = require('lexer') 5local token, word_match = lexer.token, lexer.word_match 6local P, S, V = lpeg.P, lpeg.S, lpeg.V 7 8local lex = lexer.new('perl') 9 10-- Whitespace. 11lex:add_rule('perl', token(lexer.WHITESPACE, lexer.space^1)) 12 13-- Keywords. 14lex:add_rule('keyword', token(lexer.KEYWORD, word_match[[ 15 STDIN STDOUT STDERR BEGIN END CHECK INIT 16 require use 17 break continue do each else elsif foreach for if last local my next our 18 package return sub unless until while __FILE__ __LINE__ __PACKAGE__ 19 and or not eq ne lt gt le ge 20]])) 21 22-- Markers. 23lex:add_rule('marker', token(lexer.COMMENT, word_match('__DATA__ __END__') * 24 lexer.any^0)) 25 26-- Functions. 27lex:add_rule('function', token(lexer.FUNCTION, word_match[[ 28 abs accept alarm atan2 bind binmode bless caller chdir chmod chomp chop chown 29 chr chroot closedir close connect cos crypt dbmclose dbmopen defined delete 30 die dump each endgrent endhostent endnetent endprotoent endpwent endservent 31 eof eval exec exists exit exp fcntl fileno flock fork format formline getc 32 getgrent getgrgid getgrnam gethostbyaddr gethostbyname gethostent getlogin 33 getnetbyaddr getnetbyname getnetent getpeername getpgrp getppid getpriority 34 getprotobyname getprotobynumber getprotoent getpwent getpwnam getpwuid 35 getservbyname getservbyport getservent getsockname getsockopt glob gmtime goto 36 grep hex import index int ioctl join keys kill lcfirst lc length link listen 37 localtime log lstat map mkdir msgctl msgget msgrcv msgsnd new oct opendir open 38 ord pack pipe pop pos printf print prototype push quotemeta rand readdir read 39 readlink recv redo ref rename reset reverse rewinddir rindex rmdir scalar 40 seekdir seek select semctl semget semop send setgrent sethostent setnetent 41 setpgrp setpriority setprotoent setpwent setservent setsockopt shift shmctl 42 shmget shmread shmwrite shutdown sin sleep socket socketpair sort splice split 43 sprintf sqrt srand stat study substr symlink syscall sysread sysseek system 44 syswrite telldir tell tied tie time times truncate ucfirst uc umask undef 45 unlink unpack unshift untie utime values vec wait waitpid wantarray warn write 46]])) 47 48local delimiter_matches = {['('] = ')', ['['] = ']', ['{'] = '}', ['<'] = '>'} 49local literal_delimited = P(function(input, index) -- for single delimiter sets 50 local delimiter = input:sub(index, index) 51 if not delimiter:find('%w') then -- only non alpha-numerics 52 local patt 53 if delimiter_matches[delimiter] then 54 -- Handle nested delimiter/matches in strings. 55 local s, e = delimiter, delimiter_matches[delimiter] 56 patt = lexer.range(s, e, false, true, true) 57 else 58 patt = lexer.range(delimiter) 59 end 60 local match_pos = lpeg.match(patt, input, index) 61 return match_pos or #input + 1 62 end 63end) 64local literal_delimited2 = P(function(input, index) -- for 2 delimiter sets 65 local delimiter = input:sub(index, index) 66 -- Only consider non-alpha-numerics and non-spaces as delimiters. The 67 -- non-spaces are used to ignore operators like "-s". 68 if not delimiter:find('[%w ]') then 69 local patt 70 if delimiter_matches[delimiter] then 71 -- Handle nested delimiter/matches in strings. 72 local s, e = delimiter, delimiter_matches[delimiter] 73 patt = lexer.range(s, e, false, true, true) 74 else 75 patt = lexer.range(delimiter) 76 end 77 local first_match_pos = lpeg.match(patt, input, index) 78 local final_match_pos = lpeg.match(patt, input, first_match_pos - 1) 79 if not final_match_pos then -- using (), [], {}, or <> notation 80 final_match_pos = lpeg.match(lexer.space^0 * patt, input, first_match_pos) 81 end 82 return final_match_pos or #input + 1 83 end 84end) 85 86-- Strings. 87local sq_str = lexer.range("'") 88local dq_str = lexer.range('"') 89local cmd_str = lexer.range('`') 90local heredoc = '<<' * P(function(input, index) 91 local s, e, delimiter = input:find('([%a_][%w_]*)[\n\r\f;]+', index) 92 if s == index and delimiter then 93 local end_heredoc = '[\n\r\f]+' 94 local _, e = input:find(end_heredoc .. delimiter, e) 95 return e and e + 1 or #input + 1 96 end 97end) 98local lit_str = 'q' * P('q')^-1 * literal_delimited 99local lit_array = 'qw' * literal_delimited 100local lit_cmd = 'qx' * literal_delimited 101local lit_tr = (P('tr') + 'y') * literal_delimited2 * S('cds')^0 102local string = token(lexer.STRING, sq_str + dq_str + cmd_str + heredoc + 103 lit_str + lit_array + lit_cmd + lit_tr) 104local regex_str = #P('/') * lexer.last_char_includes('-<>+*!~\\=%&|^?:;([{') * 105 lexer.range('/', true) * S('imosx')^0 106local lit_regex = 'qr' * literal_delimited * S('imosx')^0 107local lit_match = 'm' * literal_delimited * S('cgimosx')^0 108local lit_sub = 's' * literal_delimited2 * S('ecgimosx')^0 109local regex = token(lexer.REGEX, regex_str + lit_regex + lit_match + lit_sub) 110lex:add_rule('string', string + regex) 111 112-- Identifiers. 113lex:add_rule('identifier', token(lexer.IDENTIFIER, lexer.word)) 114 115-- Comments. 116local line_comment = lexer.to_eol('#', true) 117local block_comment = lexer.range(lexer.starts_line('=' * lexer.alpha), 118 lexer.starts_line('=cut')) 119lex:add_rule('comment', token(lexer.COMMENT, block_comment + line_comment)) 120 121-- Numbers. 122lex:add_rule('number', token(lexer.NUMBER, lexer.number)) 123 124-- Variables. 125local special_var = '$' * ( 126 '^' * S('ADEFHILMOPSTWX')^-1 + 127 S('\\"[]\'&`+*.,;=%~?@<>(|/!-') + 128 ':' * (lexer.any - ':') + 129 P('$') * -lexer.word + 130 lexer.digit^1) 131local plain_var = ('$#' + S('$@%')) * P('$')^0 * lexer.word + '$#' 132lex:add_rule('variable', token(lexer.VARIABLE, special_var + plain_var)) 133 134-- Operators. 135lex:add_rule('operator', token(lexer.OPERATOR, S('-<>+*!~\\=/%&|^.,?:;()[]{}'))) 136 137-- Fold points. 138lex:add_fold_point(lexer.OPERATOR, '[', ']') 139lex:add_fold_point(lexer.OPERATOR, '{', '}') 140lex:add_fold_point(lexer.COMMENT, lexer.fold_consecutive_lines('#')) 141 142return lex 143