1local pl_lexer = [===[
2--- Lexical scanner for creating a sequence of tokens from text.
3-- `lexer.scan(s)` returns an iterator over all tokens found in the
4-- string `s`. This iterator returns two values, a token type string
5-- (such as 'string' for quoted string, 'iden' for identifier) and the value of the
6-- token.
8-- Versions specialized for Lua and C are available; these also handle block comments
9-- and classify keywords as 'keyword' tokens. For example:
11--    > s = 'for i=1,n do'
12--    > for t,v in lexer.lua(s)  do print(t,v) end
13--    keyword for
14--    iden    i
15--    =       =
16--    number  1
17--    ,       ,
18--    iden    n
19--    keyword do
21-- See the Guide for further @{06-data.md.Lexical_Scanning|discussion}
22-- @module pl.lexer
24local yield,wrap = coroutine.yield,coroutine.wrap
25local strfind = string.find
26local strsub = string.sub
27local append = table.insert
-- Argument validator: does nothing when `val` has the Lua type named by `tp`,
-- otherwise raises "argument <idx> must be <tp>" at error level 2.
local function assert_arg(idx,val,tp)
    if type(val) == tp then return end
    error("argument "..idx.." must be "..tp, 2)
end
-- The module table that is returned at the end of this chunk.
35local lexer = {}
-- Token patterns. Each one is anchored with '^' so it only matches at the
-- scan position passed to string.find.
-- NUMBER1/NUMBER2: optionally signed decimal number, with / without exponent
-- (used by the plain scanner).
37local NUMBER1 = '^[%+%-]?%d+%.?%d*[eE][%+%-]?%d+'
38local NUMBER2 = '^[%+%-]?%d+%.?%d*'
-- NUMBER3: hexadecimal integer introduced by a lower-case "0x".
39local NUMBER3 = '^0x[%da-fA-F]+'
-- NUMBER4/NUMBER5: unsigned variants of NUMBER1/NUMBER2 (used by the Lua and
-- C/C++ scanners, where a leading sign is a separate operator token).
40local NUMBER4 = '^%d+%.?%d*[eE][%+%-]?%d+'
41local NUMBER5 = '^%d+%.?%d*'
-- IDEN: a letter or underscore followed by alphanumerics/underscores.
42local IDEN = '^[%a_][%w_]*'
-- WSPACE: a run of whitespace characters.
43local WSPACE = '^%s+'
-- STRING0/STRING1: quoted text running (lazily) up to a matching quote that is
-- preceded by two backslashes (STRING0) or by a non-backslash character
-- (STRING1); the opening quote is capture 1.
44local STRING0 = [[^(['\"]).-\\%1]]
45local STRING1 = [[^(['\"]).-[^\]%1]]
46local STRING3 = "^((['\"])%2)" -- empty string
-- PREPRO: from '#' up to the first newline that is not preceded by a backslash.
47local PREPRO = '^#.-[^\\]\n'
-- Match tables and keyword sets; they start out nil and are built on first use
-- by lexer.scan, lexer.lua, lexer.cpp and lexer.get_keywords.
49local plain_matches,lua_matches,cpp_matches,lua_keyword,cpp_keyword
-- Catch-all action: the matched text is yielded as both token type and value.
local function tdump(text)
    return yield(text,text)
end
-- Action for numeric literals: yields a "number" token. The text is converted
-- with tonumber() when `options.number` is set, otherwise passed through.
local function ndump(text,options)
    local value = text
    if options and options.number then
        value = tonumber(text)
    end
    return yield("number",value)
end
-- Action for ordinary quoted strings (single or double quotes): yields a
-- "string" token. When `options.string` is set the first and last characters
-- (the quotes) are removed from the value.
local function sdump(text,options)
    local strip = options and options.string
    return yield("string",strip and text:sub(2,-2) or text)
end
-- Action for long-bracket Lua strings: yields a "string" token. When
-- `options.string` is set the delimiters are cut off; their width is 2 plus the
-- length of the third find result (the run of '=' captured by the pattern).
local function sdump_l(text,options,findres)
    if not (options and options.string) then
        return yield("string",text)
    end
    local level = findres[3]
    local quotelen = level and (3 + level:len()) or 3
    return yield("string",text:sub(quotelen,-quotelen))
end
-- Action yielding a "char" token; when `options.string` is set the first and
-- last characters (the quotes) are removed from the value.
local function chdump(text,options)
    local strip = options and options.string
    return yield("char",strip and text:sub(2,-2) or text)
end
-- Action yielding the matched text as a 'comment' token.
local function cdump(text)
    return yield('comment',text)
end
-- Action yielding the matched whitespace as a "space" token.
local function wsdump (text)
    return yield("space",text)
end
-- Action yielding the matched text as a 'prepro' token.
local function pdump (text)
    return yield('prepro',text)
end
-- Identifier action for the plain scanner: always an "iden" token.
local function plain_vdump(name)
    return yield("iden",name)
end
-- Identifier action for the Lua scanner: names present in `lua_keyword`
-- become "keyword" tokens, everything else is an "iden" token.
local function lua_vdump(name)
    local kind = lua_keyword[name] and "keyword" or "iden"
    return yield(kind,name)
end
-- Identifier action for the C/C++ scanner: names present in `cpp_keyword`
-- become "keyword" tokens, everything else is an "iden" token.
local function cpp_vdump(name)
    local kind = cpp_keyword[name] and "keyword" or "iden"
    return yield(kind,name)
end
--- create a plain token iterator from a string or file-like object.
-- The returned iterator is a wrapped coroutine. Calling it with no argument
-- produces the next token (type, value). Calling it with an argument feeds
-- that value back to the scanner: a table is treated as a list of
-- `{type,value}` tokens to emit, a string as a pattern to search for from the
-- current position, and any other value makes it report line and index.
-- @param s the string, or an object with a `read` method returning lines
-- @tab matches an optional match table (set of pattern-action pairs)
-- @tab[opt] filter a table of token types to exclude, by default `{space=true}`
-- @tab[opt] options a table of options; by default, `{number=true,string=true}`,
-- which means convert numbers and strip string quotes.
function lexer.scan (s,matches,filter,options)
    local file = type(s) ~= 'string' and s
    filter = filter or {space=true}
    options = options or {number=true,string=true}
    -- Filtering is keyed on the action function, so translate the
    -- user-facing flags into entries for the corresponding actions.
    if filter.space then filter[wsdump] = true end
    if filter.comments then filter[cdump] = true end
    if not matches then
        if not plain_matches then
            plain_matches = {
                {WSPACE,wsdump},
                {NUMBER3,ndump},
                {IDEN,plain_vdump},
                {NUMBER1,ndump},
                {NUMBER2,ndump},
                {STRING3,sdump},
                {STRING0,sdump},
                {STRING1,sdump},
                {'^.',tdump}
            }
        end
        matches = plain_matches
    end
    local function lex ()
        if type(s)=='string' and s=='' then return end
        local line = 1
        if file then
            -- an empty file yields no tokens instead of failing on nil..'\n'
            local first = file:read()
            if not first then return end
            s = first..'\n'
        end
        local sz = #s
        local idx = 1
        while true do
            for _,m in ipairs(matches) do
                local pat,fun = m[1],m[2]
                local findres = { strfind(s,pat,idx) }
                local i1,i2 = findres[1],findres[2]
                if i1 then
                    local tok = strsub(s,i1,i2)
                    idx = i2 + 1
                    -- `res` is what the consumer sent back through the yield
                    -- inside the action. It is local to this token so that a
                    -- filtered token never re-processes a previous request.
                    local res
                    if not filter[fun] then
                        lexer.finished = idx > sz
                        res = fun(tok,options,findres)
                    end
                    if res then
                        local tp = type(res)
                        -- insert a token list
                        if tp=='table' then
                            yield('','')
                            for _,t in ipairs(res) do
                                yield(t[1],t[2])
                            end
                        elseif tp == 'string' then -- or search up to some special pattern
                            i1,i2 = strfind(s,res,idx)
                            if i1 then
                                tok = strsub(s,i1,i2)
                                idx = i2 + 1
                                yield('',tok)
                            else
                                yield('','')
                                idx = sz + 1
                            end
                        else
                            yield(line,idx)
                        end
                    end
                    if idx > sz then
                        if file then
                            -- current line exhausted: fetch the next one
                            line = line + 1
                            s = file:read()
                            if not s then return end
                            s = s .. '\n'
                            idx ,sz = 1,#s
                            break
                        else
                            return
                        end
                    else break end
                end
            end
        end
    end
    return wrap(lex)
end
-- True when `value` is a Lua string.
local function isstring (value)
    local kind = type(value)
    return kind == 'string'
end
--- insert tokens into a stream.
-- Nothing happens when `a1` is nil or false.
-- @param tok a token stream
-- @param a1 either a token type (string, with `a2` as its value), a list of
-- `{type,value}` tokens, or a function; a function is called and its result is
-- iterated with a generic `for`, collecting (type, value) pairs
-- @string a2 the token value when `a1` is a type string
function lexer.insert (tok,a1,a2)
    if not a1 then return end
    local pending
    if type(a1) == 'function' then
        pending = {}
        for ttype,tvalue in a1() do
            append(pending,{ttype,tvalue})
        end
    elseif isstring(a1) and isstring(a2) then
        pending = {{a1,a2}}
    else
        pending = a1
    end
    -- hand the list to the scanner coroutine, which emits it
    tok(pending)
end
--- get everything in a stream up to and including the next newline.
-- Sends the pattern `.-\n` to the stream and returns the second result.
-- @param tok a token stream
-- @return a string
function lexer.getline (tok)
    local _,text = tok('.-\n')
    return text
end
--- get current line number.
-- Only available if the input source is a file-like object.
-- Sends the number 0 to the stream and returns whatever it answers.
-- @param tok a token stream
-- @return the line number and current column
function lexer.lineno (tok)
    local request = 0
    return tok(request)
end
--- get the rest of the stream.
-- Sends the pattern `.+` to the stream and returns the second result.
-- @param tok a token stream
-- @return a string
function lexer.getrest (tok)
    local _,text = tok('.+')
    return text
end
--- get the Lua keywords as a set-like table.
-- So `res["and"]` etc would be `true`. The table is built on the first call
-- and the same table is returned afterwards.
-- @return a table
function lexer.get_keywords ()
    if lua_keyword then return lua_keyword end
    lua_keyword = {}
    local words = {
        "and", "break", "do", "else", "elseif", "end", "false",
        "for", "function", "if", "in", "local", "nil", "not",
        "or", "repeat", "return", "then", "true", "until", "while",
    }
    for _,word in ipairs(words) do
        lua_keyword[word] = true
    end
    return lua_keyword
end
--- create a Lua token iterator from a string or file-like object.
-- Will return the token type and value. The match table is built once, on the
-- first call, and reused afterwards.
-- @string s the string
-- @tab[opt] filter a table of token types to exclude, by default `{space=true,comments=true}`
-- @tab[opt] options a table of options; by default, `{number=true,string=true}`,
-- which means convert numbers and strip string quotes.
function lexer.lua(s,filter,options)
    if not filter then
        filter = {space=true,comments=true}
    end
    -- make sure the keyword set used by lua_vdump exists
    lexer.get_keywords()
    lua_matches = lua_matches or {
        {WSPACE,wsdump},
        {NUMBER3,ndump},
        {IDEN,lua_vdump},
        {NUMBER4,ndump},
        {NUMBER5,ndump},
        {STRING3,sdump},
        {STRING0,sdump},
        {STRING1,sdump},
        -- block comments are tried before line comments
        {'^%-%-%[(=*)%[.-%]%1%]',cdump},
        {'^%-%-.-\n',cdump},
        {'^%[(=*)%[.-%]%1%]',sdump_l},
        -- multi-character operators are tried before the catch-all
        {'^==',tdump},
        {'^~=',tdump},
        {'^<=',tdump},
        {'^>=',tdump},
        {'^%.%.%.',tdump},
        {'^%.%.',tdump},
        {'^//',tdump},
        {'^.',tdump}
    }
    return lexer.scan(s,lua_matches,filter,options)
end
--- create a C/C++ token iterator from a string or file-like object.
-- Will return the token type and value. The keyword set and match table are
-- built once, on the first call, and reused afterwards.
-- @string s the string
-- @tab[opt] filter a table of token types to exclude, by default `{comments=true}`
-- @tab[opt] options a table of options; by default, `{number=true,string=true}`,
-- which means convert numbers and strip string quotes.
function lexer.cpp(s,filter,options)
    if not filter then
        filter = {comments=true}
    end
    if not cpp_keyword then
        cpp_keyword = {}
        local words = {
            "class", "break", "do", "sizeof", "else", "continue", "struct",
            "false", "for", "public", "void", "private", "protected", "goto",
            "if", "static", "const", "typedef", "enum", "char", "int", "bool",
            "long", "float", "true", "delete", "double", "while", "new",
            "namespace", "try", "catch", "switch", "case", "extern",
            "return", "default", "unsigned", "signed",
            "union", "volatile", "register", "short",
        }
        for _,word in ipairs(words) do
            cpp_keyword[word] = true
        end
    end
    cpp_matches = cpp_matches or {
        {WSPACE,wsdump},
        {PREPRO,pdump},
        {NUMBER3,ndump},
        {IDEN,cpp_vdump},
        {NUMBER4,ndump},
        {NUMBER5,ndump},
        {STRING3,sdump},
        {STRING1,chdump},
        {'^//.-\n',cdump},
        {'^/%*.-%*/',cdump},
        -- multi-character operators are tried before the catch-all
        {'^==',tdump},
        {'^!=',tdump},
        {'^<=',tdump},
        {'^>=',tdump},
        {'^->',tdump},
        {'^&&',tdump},
        {'^||',tdump},
        {'^%+%+',tdump},
        {'^%-%-',tdump},
        {'^%+=',tdump},
        {'^%-=',tdump},
        {'^%*=',tdump},
        {'^/=',tdump},
        {'^|=',tdump},
        {'^%^=',tdump},
        {'^::',tdump},
        {'^.',tdump}
    }
    return lexer.scan(s,cpp_matches,filter,options)
end
--- get a list of parameters separated by a delimiter from a stream.
-- Parentheses are counted, so delimiters and end tokens inside nested
-- parentheses belong to the current parameter. A closing parenthesis that
-- balances the implicit opening one also ends the list.
-- @param tok the token stream
-- @string[opt=')'] endtoken end of list. Can be '\n'
-- @string[opt=','] delim separator
-- @return a list of token lists, or nil and 'EOS' if the stream ran out.
-- @return the `{type,value}` pair of the token that ended the list
function lexer.get_separated_list(tok,endtoken,delim)
    endtoken = endtoken or ')'
    delim = delim or ','
    local lists = {}
    local current = {}
    local depth = 1 -- nesting of ( and ); we start inside one level
    local newline_ends = endtoken == '\n'

    -- record a token in the parameter being collected; the value defaults
    -- to the type
    local function push (t,val)
        append(current,{t,val or t})
    end

    local token,value
    while true do
        token,value = tok()
        if not token then return nil,'EOS' end -- end of stream is an error!
        local at_end
        if newline_ends then
            -- a newline only ever shows up inside a space token
            at_end = token == 'space' and value:find '\n'
        else
            at_end = token == endtoken
        end
        if at_end and depth == 1 then
            break
        elseif token == '(' then
            depth = depth + 1
            push('(')
        elseif token == ')' then
            depth = depth - 1
            if depth == 0 then break end -- finished with parm list
            push(')')
        elseif token == delim and depth == 1 then
            append(lists,current) -- a new parm
            current = {}
        else
            push(token,value)
        end
    end
    -- both ways of leaving the loop close the parameter in progress
    append(lists,current)
    return lists,{token,value}
end
--- get the next non-space token from the stream.
-- Reads tokens until one whose type is not 'space' (or the end of the stream).
-- @param tok the token stream.
-- @return the token type and value
function lexer.skipws (tok)
    local ttype,value
    repeat
        ttype,value = tok()
    until ttype ~= 'space'
    return ttype,value
end

local skipws = lexer.skipws
--- get the next token, which must be of the expected type.
-- Throws an error if this type does not match!
-- @param tok the token stream
-- @string expected_type the token type
-- @bool no_skip_ws when true, take the very next token; otherwise leading
-- 'space' tokens are skipped first
-- @return the value of the token
function lexer.expecting (tok,expected_type,no_skip_ws)
    assert_arg(1,tok,'function')
    assert_arg(2,expected_type,'string')
    local ttype,value
    if not no_skip_ws then
        ttype,value = skipws(tok)
    else
        ttype,value = tok()
    end
    if ttype == expected_type then
        return value
    end
    error ("expecting "..expected_type,2)
end
464return lexer
-- Compile the embedded lexer source and run the chunk; its return value is
-- the lexer module table.
467local lexer = load( pl_lexer, 'lexer.lua' )()
-- Encoding table: maps a symbol (whitespace character, keyword, operator, or
-- the pseudo-symbols "literal" and "eof") to its code, written as a string of
-- '0'/'1' characters. The trailing number on each line is presumably the
-- symbol frequency the codes were derived from -- TODO confirm.
469local enctab = {
470  [ " " ] = '0', -- 221208
471  [ "." ] = '100', -- 62755
472  [ "=" ] = '1010', -- 29673
473  [ "end" ] = '1011000', -- 3844
474  [ "if" ] = '10110010', -- 1967
475  [ "==" ] = '10110011', -- 2003
476  [ "function" ] = '101101000', -- 1006
477  [ "local" ] = '101101001', -- 1096
478  [ "<=" ] = '1011010100', -- 541
479  [ "while" ] = '1011010101', -- 557
480  [ ":" ] = '10110101100', -- 289
481  [ "<" ] = '10110101101', -- 290
482  [ "-" ] = '10110101110', -- 304
483  [ "for" ] = '10110101111', -- 314
484  [ "then" ] = '10110110', -- 2637
485  [ "and" ] = '1011011100', -- 644
486  [ ">=" ] = '10110111010', -- 319
487  [ ">" ] = '10110111011', -- 326
488  [ "elseif" ] = '1011011110', -- 670
489  [ "else" ] = '10110111110', -- 381
490  [ "%" ] = '101101111110', -- 198
491  [ "nil" ] = '101101111111', -- 201
492  [ "self" ] = '10111', -- 24196
493  [ "\
494" ] = '1100', -- 45578
495  [ "true" ] = '11010000', -- 2799
496  [ "," ] = '11010001', -- 2815
497  [ "[" ] = '1101001', -- 6744
498  [ ")" ] = '110101', -- 12925
499  [ "(" ] = '110110', -- 12925
500  [ "]" ] = '1101110', -- 6744
501  [ "false" ] = '11011110', -- 3512
502  [ "{" ] = '11011111000', -- 429
503  [ "}" ] = '11011111001', -- 429
504  [ "do" ] = '1101111101', -- 871
505  [ "+" ] = '1101111110', -- 900
506  [ "~=" ] = '1101111111000', -- 99
507  [ "return" ] = '11011111110010', -- 55
508  [ "/" ] = '1101111111001100', -- 13
509  [ "^" ] = '1101111111001101', -- 14
510  [ "#" ] = '110111111100111', -- 28
511  [ "or" ] = '110111111101', -- 220
512  [ "not" ] = '110111111110', -- 244
513  [ "//" ] = '1101111111110', -- 121
514  [ "in" ] = '110111111111100', -- 34
515  [ "*" ] = '110111111111101', -- 35
516  [ "until" ] = '11011111111111000', -- 7
517  [ "|" ] = '1101111111111100100000', -- 0
518  [ "goto" ] = '1101111111111100100001', -- 0
519  [ "break" ] = '1101111111111100100010', -- 0
520  [ ">>" ] = '11011111111111001000110', -- 0
521  [ "\9" ] = '11011111111111001000111', -- 0
522  [ "~" ] = '11011111111111001001000', -- 0
523  [ "::" ] = '11011111111111001001001', -- 0
524  [ ";" ] = '11011111111111001001010', -- 0
525  [ "eof" ] = '11011111111111001001011', -- 0
526  [ "<<" ] = '11011111111111001001100', -- 0
527  [ "\11" ] = '11011111111111001001101', -- 0
528  [ "\13" ] = '11011111111111001001110', -- 0
529  [ "\12" ] = '11011111111111001001111', -- 0
530  [ "..." ] = '1101111111111100101', -- 4
531  [ "repeat" ] = '110111111111110011', -- 7
532  [ "&" ] = '1101111111111101', -- 20
533  [ ".." ] = '110111111111111', -- 39
534  [ "literal" ] = '111', -- 116390
--- Print a formatted error message to stderr and exit with status 1.
-- The remaining arguments are forwarded to string.format as individual
-- values, so that `%s` receives the value rather than a table of values.
-- @string format a string.format template
-- @param ... values substituted into the template
local function errorout( format, ... )
  io.stderr:write( string.format( format, ... ), '\n' )
  os.exit( 1 )
end
--- Append each character of a '0'/'1' string to the bit array as a number.
-- @tab encoded array of bits, extended in place
-- @string bits the characters to append
local function addbits( encoded, bits )
  local count = #encoded

  for digit in bits:gmatch( '.' ) do
    count = count + 1
    -- adding 0 coerces the one-character string to a number
    encoded[ count ] = digit + 0
  end
end
--- Append a literal to the bit array: every byte of `str` as 8 bits, most
-- significant bit first, followed by eight 0 bits as terminator.
-- The bits are extracted with plain arithmetic, so the function does not
-- depend on the `bit32` library.
-- @tab encoded array of bits, extended in place
-- @string str the literal text
local function addliteral( encoded, str )
  -- When the stream is currently byte-aligned one extra 1 bit is emitted
  -- first. NOTE(review): presumably the decoder relies on this -- confirm.
  if #encoded % 8 == 0 then
    addbits( encoded, '1' )
  end

  for i = 1, #str do
    local k = str:byte( i )

    for j = 7, 0, -1 do
      -- bit j of k: shift right by j places, keep the lowest bit
      local bit = math.floor( k / 2 ^ j ) % 2
      addbits( encoded, bit == 1 and '1' or '0' )
    end
  end

  -- terminator; NOTE(review): a zero byte inside `str` would produce the
  -- same bit pattern -- confirm literals never contain one
  addbits( encoded, '00000000' )
end
--- Pack eight consecutive bits into a one-character string.
-- @tab encoded array of 0/1 numbers
-- @int i index of the most significant bit
-- @return a string of length 1
local function bitstobyte( encoded, i )
  local value = 0

  -- most significant bit first
  for offset = i, i + 7 do
    value = value * 2 + encoded[ offset ]
  end

  return string.char( value )
end
--- Encode Lua source text as a packed bit stream.
-- The source is tokenised with `lexer.lua` using an empty filter, so
-- whitespace and comments are kept. Whitespace characters, keywords and
-- operators are written with their code from `enctab`. Identifiers, numbers,
-- strings and comments are written as the "literal" code followed by their
-- text. Numbers arrive already converted by the lexer and are written back
-- with tostring(); strings arrive unquoted and are re-quoted with `%q`.
-- The stream ends with the "eof" code and is zero-padded to a whole byte.
-- @string source the Lua source text
-- @return the encoded data as a string
local function encode( source )
  local encoded = {}

  -- Append the code of a table-coded symbol, raising a descriptive error when
  -- the symbol has no entry (instead of failing on a nil inside addbits).
  local function addsymbol( symbol )
    local bits = enctab[ symbol ]

    if not bits then
      error( string.format( 'no encoding for symbol %q', tostring( symbol ) ) )
    end

    addbits( encoded, bits )
  end

  for token, lexeme in lexer.lua( source, {} ) do
    if token == 'space' then
      -- whitespace is coded one character at a time
      for i = 1, #lexeme do
        addsymbol( lexeme:sub( i, i ) )
      end
    elseif token == 'iden' or token == 'comment' then
      addbits( encoded, enctab.literal )
      addliteral( encoded, lexeme )
    elseif token == 'number' then
      addbits( encoded, enctab.literal )
      addliteral( encoded, tostring( lexeme ) )
    elseif token == 'string' then
      addbits( encoded, enctab.literal )
      addliteral( encoded, string.format( '%q', lexeme ) )
    else
      -- keywords and operators
      addsymbol( lexeme )
    end
  end

  addbits( encoded, enctab.eof )

  -- pad to a byte boundary
  while #encoded % 8 ~= 0 do
    addbits( encoded, '0' )
  end

  local s = {}

  for i = 1, #encoded, 8 do
    s[ #s + 1 ] = bitstobyte( encoded, i )
  end

  return table.concat( s )
end
--- Command-line entry point: encode the file named by args[1] into args[2].
-- Prints a usage line and returns 0 unless exactly two arguments are given.
-- Failures to open or read a file are reported through `errorout`.
-- @tab args the command-line arguments
local function main( args )
  if #args ~= 2 then
    io.write( 'Usage: lua bsenc.lua <input.lua> <output.bs>\n' )
    return 0
  end

  local inname, outname = args[ 1 ], args[ 2 ]

  -- read the whole input
  local input = io.open( inname )

  if not input then
    errorout( 'Error opening %s', inname )
  end

  local source = input:read( '*a' )
  input:close()

  if not source then
    errorout( 'Could not read from %s', inname )
  end

  local encoded = encode( source )

  -- write the result in binary mode
  local output = io.open( outname, 'wb' )

  if not output then
    errorout( 'Error opening %s', outname )
  end

  output:write( encoded )
  output:close()
end

return main( arg )