local pl_lexer = [===[
--- Lexical scanner for creating a sequence of tokens from text.
-- `lexer.scan(s)` returns an iterator over all tokens found in the
-- string `s`. Each step of the iterator produces two values: a token type
-- string (such as 'string' for a quoted string, 'iden' for an identifier)
-- and the value of the token.
--
-- Versions specialized for Lua and C are available; these also handle block comments
-- and classify keywords as 'keyword' tokens. For example:
--
--    > s = 'for i=1,n do'
--    > for t,v in lexer.lua(s) do print(t,v) end
--    keyword for
--    iden    i
--    =       =
--    number  1
--    ,       ,
--    iden    n
--    keyword do
--
-- See the Guide for further @{06-data.md.Lexical_Scanning|discussion}
-- @module pl.lexer

local yield = coroutine.yield
local wrap = coroutine.wrap
local strfind, strsub = string.find, string.sub
local append = table.insert

-- Raise an error unless `val` has type `tp`. Level 2 makes the error point
-- at the function that called assert_arg rather than at assert_arg itself.
local function assert_arg(idx,val,tp)
    if type(val) == tp then return end
    error("argument "..idx.." must be "..tp, 2)
end

local lexer = {}

-- Lua patterns used by the match tables; all are anchored at the scan position.
local NUMBER1 = '^[%+%-]?%d+%.?%d*[eE][%+%-]?%d+' -- signed, with exponent
local NUMBER2 = '^[%+%-]?%d+%.?%d*'                -- signed, no exponent
local NUMBER3 = '^0x[%da-fA-F]+'                   -- hexadecimal
local NUMBER4 = '^%d+%.?%d*[eE][%+%-]?%d+'         -- unsigned, with exponent
local NUMBER5 = '^%d+%.?%d*'                       -- unsigned, no exponent
local IDEN = '^[%a_][%w_]*'
local WSPACE = '^%s+'
local STRING0 = [[^(['\"]).-\\%1]]   -- closing quote preceded by two backslashes
local STRING1 = [[^(['\"]).-[^\]%1]] -- closing quote preceded by a non-backslash
local STRING3 = "^((['\"])%2)" -- empty string
local PREPRO = '^#.-[^\\]\n'

-- match tables and keyword sets are built lazily, on first use
local plain_matches,lua_matches,cpp_matches,lua_keyword,cpp_keyword

-- yield a token whose type is its own text (operators, punctuation)
local function tdump(text)
    return yield(text,text)
end

-- yield a 'number' token; the text is converted with tonumber when
-- the `number` option is set
local function ndump(text,opts)
    local value = text
    if opts and opts.number then
        value = tonumber(text)
    end
    return yield("number",value)
end

-- regular strings, single or double quotes; usually we want them
-- without the quotes
local function sdump(text,opts)
    local strip = opts and opts.string
    if strip then
        text = text:sub(2,-2)
    end
    return yield("string",text)
end

-- long Lua strings need
-- extra work to get rid of the quotes
-- (findres[3] is the run of '=' signs captured between the brackets)
local function sdump_l(text,opts,findres)
    if opts and opts.string then
        -- the opening bracket is "[" .. equals .. "[", so the content starts
        -- at position 3 + #equals and ends the same distance from the end
        local skip = 3
        local equals = findres[3]
        if equals then
            skip = skip + #equals
        end
        text = text:sub(skip,-skip)
    end
    return yield("string",text)
end

-- quoted literal reported as 'char'; the quotes are dropped when the
-- `string` option is set
local function chdump(text,opts)
    local strip = opts and opts.string
    if strip then
        text = text:sub(2,-2)
    end
    return yield("char",text)
end

-- comment token, text passed through unchanged
local function cdump(text)
    return yield('comment',text)
end

-- whitespace token
local function wsdump (text)
    return yield("space",text)
end

-- preprocessor line token
local function pdump (text)
    return yield('prepro',text)
end

-- identifier token for the plain scanner (no keyword classification)
local function plain_vdump(text)
    return yield("iden",text)
end

-- identifier or keyword, according to the Lua keyword set
local function lua_vdump(text)
    local kind = lua_keyword[text] and "keyword" or "iden"
    return yield(kind,text)
end

-- identifier or keyword, according to the C/C++ keyword set
local function cpp_vdump(text)
    local kind = cpp_keyword[text] and "keyword" or "iden"
    return yield(kind,text)
end

--- create a plain token iterator from a string or file-like object.
-- @string s the string
-- @tab matches an optional match table (set of pattern-action pairs)
-- @tab[opt] filter a table of token types to exclude, by default `{space=true}`
-- @tab[opt] options a table of options; by default, `{number=true,string=true}`,
-- which means convert numbers and strip string quotes.
function lexer.scan (s,matches,filter,options)
    --assert_arg(1,s,'string')
    -- anything that is not a string is treated as a file-like object and
    -- is read one line at a time with `file:read()`
    local file = type(s) ~= 'string' and s
    filter = filter or {space=true}
    options = options or {number=true,string=true}
    -- filtering is keyed by action function, so translate the flags once
    if filter.space then filter[wsdump] = true end
    if filter.comments then filter[cdump] = true end
    if not matches then
        if not plain_matches then
            plain_matches = {
                {WSPACE,wsdump},
                {NUMBER3,ndump},
                {IDEN,plain_vdump},
                {NUMBER1,ndump},
                {NUMBER2,ndump},
                {STRING3,sdump},
                {STRING0,sdump},
                {STRING1,sdump},
                {'^.',tdump}
            }
        end
        matches = plain_matches
    end

    -- coroutine body: tries each pattern in order at the current position
    -- and hands the first match to its action, which yields the token
    local function lex ()
        if type(s)=='string' and s=='' then return end
        local line = 1
        if file then
            -- an empty file produces no tokens instead of raising on nil..'\n'
            local first = file:read()
            if not first then return end
            s = first..'\n'
        end
        local sz = #s
        local idx = 1
        while true do
            for _,m in ipairs(matches) do
                local pat,fun = m[1],m[2]
                local findres = { strfind(s,pat,idx) }
                local i1,i2 = findres[1],findres[2]
                if i1 then
                    local tok = strsub(s,i1,i2)
                    idx = i2 + 1
                    -- `res` is what the consumer passed back when resuming
                    -- the iterator (see insert/getline/lineno/getrest). It is
                    -- local to this match so that a filtered token can never
                    -- replay the reply given for an earlier token.
                    local res
                    if not filter[fun] then
                        lexer.finished = idx > sz
                        res = fun(tok,options,findres)
                    end
                    if res then
                        local tp = type(res)
                        -- insert a token list
                        if tp == 'table' then
                            yield('','')
                            for _,t in ipairs(res) do
                                yield(t[1],t[2])
                            end
                        elseif tp == 'string' then -- or search up to some special pattern
                            local j1,j2 = strfind(s,res,idx)
                            if j1 then
                                tok = strsub(s,j1,j2)
                                idx = j2 + 1
                                yield('',tok)
                            else
                                yield('','')
                                idx = sz + 1
                            end
                        else
                            -- any other value is a request for the position
                            yield(line,idx)
                        end
                    end
                    if idx > sz then
                        if not file then return end
                        -- current line exhausted: fetch the next one
                        line = line + 1
                        s = file:read()
                        if not s then return end
                        s = s .. '\n'
                        idx,sz = 1,#s
                    end
                    -- restart from the first pattern at the new position
                    break
                end
            end
        end
    end
    return wrap(lex)
end

local function isstring (s)
    return type(s) == 'string'
end

--- insert tokens into a stream.
-- @param tok a token stream
-- @param a1 a string is the type, a table is a token list and
-- a function is assumed to be a token-like iterator (returns type & value)
-- @string a2 a string is the value
function lexer.insert (tok,a1,a2)
    if not a1 then return end
    local ts
    if isstring(a1) and isstring(a2) then
        ts = {{a1,a2}}
    elseif type(a1) == 'function' then
        ts = {}
        for t,v in a1() do
            append(ts,{t,v})
        end
    else
        ts = a1
    end
    -- resuming the stream with a table makes the scanner emit its entries
    tok(ts)
end

--- get everything in a stream up to a newline.
-- @param tok a token stream
-- @return a string
function lexer.getline (tok)
    local _,v = tok('.-\n')
    return v
end

--- get current line number.
-- Only available if the input source is a file-like object.
-- @param tok a token stream
-- @return the line number and current column
function lexer.lineno (tok)
    return tok(0)
end

--- get the rest of the stream.
-- @param tok a token stream
-- @return a string
function lexer.getrest (tok)
    local _,v = tok('.+')
    return v
end

--- get the Lua keywords as a set-like table.
-- So `res["and"]` etc would be `true`.
-- @return a table
function lexer.get_keywords ()
    if lua_keyword then return lua_keyword end
    -- first call: build the set once and cache it
    local words = {
        "and", "break", "do",
        "else", "elseif", "end",
        "false", "for", "function",
        "if", "in", "local", "nil",
        "not", "or", "repeat",
        "return", "then", "true",
        "until", "while",
    }
    local set = {}
    for i = 1, #words do
        set[words[i]] = true
    end
    lua_keyword = set
    return lua_keyword
end

--- create a Lua token iterator from a string or file-like object.
-- Will return the token type and value.
-- @string s the string
-- @tab[opt] filter a table of token types to exclude, by default `{space=true,comments=true}`
-- @tab[opt] options a table of options; by default, `{number=true,string=true}`,
-- which means convert numbers and strip string quotes.
function lexer.lua(s,filter,options)
    if not filter then
        filter = {space=true,comments=true}
    end
    -- make sure the keyword set consulted by lua_vdump exists
    lexer.get_keywords()
    if not lua_matches then
        -- order matters: the first pattern that matches wins
        local rules = {
            {WSPACE,wsdump},
            {NUMBER3,ndump},
            {IDEN,lua_vdump},
            {NUMBER4,ndump},
            {NUMBER5,ndump},
            {STRING3,sdump},
            {STRING0,sdump},
            {STRING1,sdump},
            {'^%-%-%[(=*)%[.-%]%1%]',cdump},
            {'^%-%-.-\n',cdump},
            {'^%[(=*)%[.-%]%1%]',sdump_l},
        }
        -- multi-character operators first, then any single character
        local operators = {
            '^==', '^~=', '^<=', '^>=',
            '^%.%.%.', '^%.%.', '^//',
            '^.',
        }
        for _,pat in ipairs(operators) do
            append(rules,{pat,tdump})
        end
        lua_matches = rules
    end
    return lexer.scan(s,lua_matches,filter,options)
end

--- create a C/C++ token iterator from a string or file-like object.
-- Will return the token type and value.
-- @string s the string
-- @tab[opt] filter a table of token types to exclude, by default `{comments=true}`
-- (whitespace tokens are returned unless `space=true` is given)
-- @tab[opt] options a table of options; by default, `{number=true,string=true}`,
-- which means convert numbers and strip string quotes.
function lexer.cpp(s,filter,options)
    -- with no filter only comments are dropped; whitespace is still returned
    if not filter then
        filter = {comments=true}
    end
    if not cpp_keyword then
        local words = {
            "class", "break", "do", "sizeof",
            "else", "continue", "struct",
            "false", "for", "public", "void",
            "private", "protected", "goto",
            "if", "static", "const", "typedef",
            "enum", "char", "int", "bool",
            "long", "float", "true", "delete",
            "double", "while", "new",
            "namespace", "try", "catch",
            "switch", "case", "extern",
            "return", "default", "unsigned", "signed",
            "union", "volatile", "register", "short",
        }
        local set = {}
        for i = 1, #words do
            set[words[i]] = true
        end
        cpp_keyword = set
    end
    if not cpp_matches then
        -- order matters: the first pattern that matches wins
        local rules = {
            {WSPACE,wsdump},
            {PREPRO,pdump},
            {NUMBER3,ndump},
            {IDEN,cpp_vdump},
            {NUMBER4,ndump},
            {NUMBER5,ndump},
            {STRING3,sdump},
            -- every non-empty quoted literal, whichever quote character
            -- opens it, goes through chdump and is reported as 'char'
            {STRING1,chdump},
            {'^//.-\n',cdump},
            {'^/%*.-%*/',cdump},
        }
        -- two-character operators first, then any single character
        local operators = {
            '^==', '^!=', '^<=', '^>=', '^->', '^&&', '^||',
            '^%+%+', '^%-%-', '^%+=', '^%-=', '^%*=', '^/=',
            '^|=', '^%^=', '^::',
            '^.',
        }
        for _,pat in ipairs(operators) do
            append(rules,{pat,tdump})
        end
        cpp_matches = rules
    end
    return lexer.scan(s,cpp_matches,filter,options)
end

--- get a list of parameters separated by a delimiter from a stream.
-- @param tok the token stream
-- @string[opt=')'] endtoken end of list. Can be '\n'
-- @string[opt=','] delim separator
-- @return a list of token lists, followed by the `{type,value}` pair of the
-- token that ended the list; `nil,'EOS'` if the stream ends first.
function lexer.get_separated_list(tok,endtoken,delim)
    endtoken = endtoken or ')'
    delim = delim or ','
    local groups = {}   -- one token list per parameter
    local depth = 1     -- used to count ( and )
    local current = {}  -- tokens of the parameter being collected

    -- a newline terminator shows up inside a 'space' token, anything else
    -- is compared against the token type directly
    local at_end
    if endtoken == '\n' then
        at_end = function(t,val)
            return t == 'space' and val:find '\n'
        end
    else
        at_end = function (t)
            return t == endtoken
        end
    end

    while true do
        local token,value = tok()
        if not token then return nil,'EOS' end -- end of stream is an error!
        if at_end(token,value) and depth == 1 then
            append(groups,current)
            return groups,{token,value}
        end
        if token == '(' then
            depth = depth + 1
            append(current,{'(','('})
        elseif token == ')' then
            depth = depth - 1
            if depth == 0 then -- finished with parm list
                append(groups,current)
                return groups,{token,value}
            end
            append(current,{')',')'})
        elseif token == delim and depth == 1 then
            append(groups,current) -- a new parm
            current = {}
        else
            -- a token without a value is stored with its type as the value
            append(current,{token,value or token})
        end
    end
end

--- get the next non-space token from the stream.
-- @param tok the token stream.
-- @return the type and value of the first token whose type is not 'space'
function lexer.skipws (tok)
    local t,v
    repeat
        t,v = tok()
    until t ~= 'space'
    return t,v
end

local skipws = lexer.skipws

--- get the next token, which must be of the expected type.
-- Throws an error if this type does not match!
-- @param tok the token stream
-- @string expected_type the token type
-- @bool no_skip_ws if true, whitespace tokens are NOT skipped first
-- @return the value of the token that was read
function lexer.expecting (tok,expected_type,no_skip_ws)
    assert_arg(1,tok,'function')
    assert_arg(2,expected_type,'string')
    local t,v
    if no_skip_ws then
        t,v = tok()
    else
        t,v = skipws(tok)
    end
    if t ~= expected_type then error ("expecting "..expected_type,2) end
    return v
end

return lexer
]===]

-- Compile and run the embedded lexer module. `loadstring` is tried first so
-- the script also works on Lua 5.1, where `load` does not accept a string;
-- `assert` surfaces a compile error instead of "attempt to call a nil value".
local loadchunk = loadstring or load
local lexer = assert( loadchunk( pl_lexer, 'lexer.lua' ) )()

-- Bit string emitted for every symbol the encoder knows: single whitespace
-- characters, keywords, operators, plus the pseudo-symbols `literal`
-- (introduces raw text) and `eof` (end of stream).
-- NOTE(review): the number after each entry is presumably the symbol's
-- frequency in the corpus the codes were derived from - confirm.
-- NOTE(review): `self` has a code but identifiers are always written as
-- literals by `encode`, so that entry is never used here - confirm intent
-- against the decoder before changing it.
local enctab = {
  [ " " ]        = '0', -- 221208
  [ "." ]        = '100', -- 62755
  [ "=" ]        = '1010', -- 29673
  [ "end" ]      = '1011000', -- 3844
  [ "if" ]       = '10110010', -- 1967
  [ "==" ]       = '10110011', -- 2003
  [ "function" ] = '101101000', -- 1006
  [ "local" ]    = '101101001', -- 1096
  [ "<=" ]       = '1011010100', -- 541
  [ "while" ]    = '1011010101', -- 557
  [ ":" ]        = '10110101100', -- 289
  [ "<" ]        = '10110101101', -- 290
  [ "-" ]        = '10110101110', -- 304
  [ "for" ]      = '10110101111', -- 314
  [ "then" ]     = '10110110', -- 2637
  [ "and" ]      = '1011011100', -- 644
  [ ">=" ]       = '10110111010', -- 319
  [ ">" ]        = '10110111011', -- 326
  [ "elseif" ]   = '1011011110', -- 670
  [ "else" ]     = '10110111110', -- 381
  [ "%" ]        = '101101111110', -- 198
  [ "nil" ]      = '101101111111', -- 201
  [ "self" ]     = '10111', -- 24196
  [ "\n" ]       = '1100', -- 45578
  [ "true" ]     = '11010000', -- 2799
  [ "," ]        = '11010001', -- 2815
  [ "[" ]        = '1101001', -- 6744
  [ ")" ]        = '110101', -- 12925
  [ "(" ]        = '110110', -- 12925
  [ "]" ]        = '1101110', -- 6744
  [ "false" ]    = '11011110', -- 3512
  [ "{" ]        = '11011111000', -- 429
  [ "}" ]        = '11011111001', -- 429
  [ "do" ]       = '1101111101', -- 871
  [ "+" ]        = '1101111110', -- 900
  [ "~=" ]       = '1101111111000', -- 99
  [ "return" ]   = '11011111110010', -- 55
  [ "/" ]        = '1101111111001100', -- 13
  [ "^" ]        = '1101111111001101', -- 14
  [ "#" ]        = '110111111100111', -- 28
  [ "or" ]       = '110111111101', -- 220
  [ "not" ]      = '110111111110', -- 244
  [ "//" ]       = '1101111111110', -- 121
  [ "in" ]       = '110111111111100', -- 34
  [ "*" ]        = '110111111111101', -- 35
  [ "until" ]    = '11011111111111000', -- 7
  [ "|" ]        = '1101111111111100100000', -- 0
  [ "goto" ]     = '1101111111111100100001', -- 0
  [ "break" ]    = '1101111111111100100010', -- 0
  [ ">>" ]       = '11011111111111001000110', -- 0
  [ "\9" ]       = '11011111111111001000111', -- 0
  [ "~" ]        = '11011111111111001001000', -- 0
  [ "::" ]       = '11011111111111001001001', -- 0
  [ ";" ]        = '11011111111111001001010', -- 0
  [ "eof" ]      = '11011111111111001001011', -- 0
  [ "<<" ]       = '11011111111111001001100', -- 0
  [ "\11" ]      = '11011111111111001001101', -- 0
  [ "\13" ]      = '11011111111111001001110', -- 0
  [ "\12" ]      = '11011111111111001001111', -- 0
  [ "..." ]      = '1101111111111100101', -- 4
  [ "repeat" ]   = '110111111111110011', -- 7
  [ "&" ]        = '1101111111111101', -- 20
  [ ".." ]       = '110111111111111', -- 39
  [ "literal" ]  = '111', -- 116390
}

-- Print a formatted message to stderr and terminate with exit status 1.
-- `format` and the remaining arguments are passed to string.format as
-- separate values (the previous version passed them packed in one table,
-- so '%s' never received the intended argument).
local function errorout( format, ... )
  io.stderr:write( string.format( format, ... ), '\n' )
  os.exit( 1 )
end

-- Return the bit string for `symbol`, raising a descriptive error when the
-- table has no entry (instead of failing later on a nil length).
local function codefor( symbol )
  local bits = enctab[ symbol ]

  if not bits then
    error( string.format( 'no code for symbol %q', symbol ) )
  end

  return bits
end

-- Append each character of the bit string `bits` ('0'/'1') to the array
-- `encoded` as the number 0 or 1.
local function addbits( encoded, bits )
  for i = 1, #bits do
    encoded[ #encoded + 1 ] = tonumber( bits:sub( i, i ) )
  end
end

-- Append `str` as raw text: every byte as 8 bits, most significant bit
-- first, followed by an all-zero terminator byte.
local function addliteral( encoded, str )
  -- a zero byte inside the text would be read back as the terminator
  if str:find( '\0', 1, true ) then
    error( 'literal contains a NUL byte and cannot be encoded' )
  end

  -- NOTE(review): one extra '1' bit is emitted when the stream happens to be
  -- byte-aligned at this point; the reason is not visible in this file -
  -- presumably the decoder expects it. Kept exactly as before.
  if #encoded % 8 == 0 then
    addbits( encoded, '1' )
  end

  for i = 1, #str do
    local k = str:byte( i )

    -- plain arithmetic instead of bit32, which only exists in Lua 5.2
    for j = 7, 0, -1 do
      encoded[ #encoded + 1 ] = math.floor( k / 2 ^ j ) % 2
    end
  end

  addbits( encoded, '00000000' )
end

-- Pack the eight bits encoded[i] .. encoded[i+7] (most significant first)
-- into a one-character string.
local function bitstobyte( encoded, i )
  local byte = 0

  for j = 0, 7 do
    byte = byte * 2 + encoded[ i + j ]
  end

  return string.char( byte )
end

-- Encode Lua source text into the bit-stream format and return it as a
-- binary string. Whitespace characters, keywords and operators are written
-- with their codes from `enctab`; identifiers, numbers, strings and comments
-- are written as the `literal` code followed by their text. The stream ends
-- with the `eof` code and is zero-padded to a whole number of bytes.
local function encode( source )
  local encoded = {}

  -- Empty filter: whitespace and comments are kept. Empty options: the lexer
  -- returns number and string lexemes exactly as written in the source.
  -- Re-rendering them (tostring / quote stripping followed by '%q') lost
  -- numeric precision and double-escaped every backslash escape in strings.
  for token, lexeme in lexer.lua( source, {}, {} ) do
    if token == 'space' then
      -- a run of whitespace is encoded one character at a time
      for i = 1, #lexeme do
        addbits( encoded, codefor( lexeme:sub( i, i ) ) )
      end
    elseif token == 'iden' or token == 'number'
        or token == 'string' or token == 'comment' then
      addbits( encoded, enctab.literal )
      addliteral( encoded, lexeme )
    else
      -- keywords and operator/punctuation tokens
      addbits( encoded, codefor( lexeme ) )
    end
  end

  addbits( encoded, enctab.eof )

  while #encoded % 8 ~= 0 do
    addbits( encoded, '0' )
  end

  local s = {}

  for i = 1, #encoded, 8 do
    s[ #s + 1 ] = bitstobyte( encoded, i )
  end

  return table.concat( s )
end

-- Command-line entry point: encode the file named by args[1] and write the
-- result to the file named by args[2]. Prints a usage line and returns 0
-- unless exactly two arguments are given; any I/O failure is reported on
-- stderr (including the system error text) and exits with status 1.
local function main( args )
  if #args ~= 2 then
    io.write( 'Usage: lua bsenc.lua <input.lua> <output.bs>\n' )
    return 0
  end

  local file, err = io.open( args[ 1 ] )

  if not file then
    errorout( 'Error opening %s: %s', args[ 1 ], tostring( err ) )
  end

  local source = file:read( '*a' )
  file:close()

  if not source then
    errorout( 'Could not read from %s', args[ 1 ] )
  end

  local encoded = encode( source )

  file, err = io.open( args[ 2 ], 'wb' )

  if not file then
    errorout( 'Error opening %s: %s', args[ 2 ], tostring( err ) )
  end

  local ok, werr = file:write( encoded )
  file:close()

  if not ok then
    errorout( 'Could not write to %s: %s', args[ 2 ], tostring( werr ) )
  end

  return 0
end

return main( arg )