local pl_lexer = [===[
--- Lexical scanner for creating a sequence of tokens from text.
-- `lexer.scan(s)` returns an iterator over all tokens found in the
-- string `s`. This iterator returns two values, a token type string
-- (such as 'string' for quoted string, 'iden' for identifier) and the value of the
-- token.
--
-- Versions specialized for Lua and C are available; these also handle block comments
-- and classify keywords as 'keyword' tokens. For example:
--
--    > s = 'for i=1,n do'
--    > for t,v in lexer.lua(s) do print(t,v) end
--    keyword for
--    iden    i
--    =       =
--    number  1
--    ,       ,
--    iden    n
--    keyword do
--
-- See the Guide for further @{06-data.md.Lexical_Scanning|discussion}
-- @module pl.lexer

local yield,wrap = coroutine.yield,coroutine.wrap
local strfind = string.find
local strsub = string.sub
local append = table.insert

local function assert_arg(idx,val,tp)
    if type(val) ~= tp then
        error("argument "..idx.." must be "..tp, 2)
    end
end

local lexer = {}

local NUMBER1 = '^[%+%-]?%d+%.?%d*[eE][%+%-]?%d+'
local NUMBER2 = '^[%+%-]?%d+%.?%d*'
local NUMBER3 = '^0x[%da-fA-F]+'
local NUMBER4 = '^%d+%.?%d*[eE][%+%-]?%d+'
local NUMBER5 = '^%d+%.?%d*'
local IDEN = '^[%a_][%w_]*'
local WSPACE = '^%s+'
local STRING0 = [[^(['\"]).-\\%1]]
local STRING1 = [[^(['\"]).-[^\]%1]]
local STRING3 = "^((['\"])%2)" -- empty string
local PREPRO = '^#.-[^\\]\n'
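
-- Note on ordering: the match tables below are scanned first-to-last and
-- the first pattern that matches wins, so the hex form NUMBER3 comes
-- before the decimal forms and the catch-all '^.' is always last.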

local plain_matches,lua_matches,cpp_matches,lua_keyword,cpp_keyword

local function tdump(tok)
    return yield(tok,tok)
end

local function ndump(tok,options)
    if options and options.number then
        tok = tonumber(tok)
    end
    return yield("number",tok)
end

-- regular strings, single or double quotes; usually we want them
-- without the quotes
local function sdump(tok,options)
    if options and options.string then
        tok = tok:sub(2,-2)
    end
    return yield("string",tok)
end

-- long Lua strings need extra work to get rid of the quotes
local function sdump_l(tok,options,findres)
    if options and options.string then
        local quotelen = 3
        if findres[3] then
            quotelen = quotelen + findres[3]:len()
        end
        tok = tok:sub(quotelen,-1 * quotelen)
    end
    return yield("string",tok)
end

local function chdump(tok,options)
    if options and options.string then
        tok = tok:sub(2,-2)
    end
    return yield("char",tok)
end

local function cdump(tok)
    return yield('comment',tok)
end

local function wsdump (tok)
    return yield("space",tok)
end

local function pdump (tok)
    return yield('prepro',tok)
end

local function plain_vdump(tok)
    return yield("iden",tok)
end

local function lua_vdump(tok)
    if lua_keyword[tok] then
        return yield("keyword",tok)
    else
        return yield("iden",tok)
    end
end

local function cpp_vdump(tok)
    if cpp_keyword[tok] then
        return yield("keyword",tok)
    else
        return yield("iden",tok)
    end
end
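
-- Each dump function above yields a (type, value) pair back through the
-- coroutine created in lexer.scan. Whatever the caller later passes into
-- the iterator becomes yield's return value, which is how the stream
-- "commands" used by lexer.getline, lexer.lineno and lexer.getrest work.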

--- create a plain token iterator from a string or file-like object.
-- @string s the string
-- @tab matches an optional match table (set of pattern-action pairs)
-- @tab[opt] filter a table of token types to exclude, by default `{space=true}`
-- @tab[opt] options a table of options; by default, `{number=true,string=true}`,
-- which means convert numbers and strip string quotes.
function lexer.scan (s,matches,filter,options)
    --assert_arg(1,s,'string')
    local file = type(s) ~= 'string' and s
    filter = filter or {space=true}
    options = options or {number=true,string=true}
    if filter then
        if filter.space then filter[wsdump] = true end
        if filter.comments then
            filter[cdump] = true
        end
    end
    if not matches then
        if not plain_matches then
            plain_matches = {
                {WSPACE,wsdump},
                {NUMBER3,ndump},
                {IDEN,plain_vdump},
                {NUMBER1,ndump},
                {NUMBER2,ndump},
                {STRING3,sdump},
                {STRING0,sdump},
                {STRING1,sdump},
                {'^.',tdump}
            }
        end
        matches = plain_matches
    end
    local function lex ()
        if type(s)=='string' and s=='' then return end
        local findres,i1,i2,idx,res1,res2,tok,pat,fun,capt
        local line = 1
        if file then s = file:read()..'\n' end
        local sz = #s
        local idx = 1
        --print('sz',sz)
        while true do
            for _,m in ipairs(matches) do
                pat = m[1]
                fun = m[2]
                findres = { strfind(s,pat,idx) }
                i1 = findres[1]
                i2 = findres[2]
                if i1 then
                    tok = strsub(s,i1,i2)
                    idx = i2 + 1
                    if not (filter and filter[fun]) then
                        lexer.finished = idx > sz
                        res1,res2 = fun(tok,options,findres)
                    end
                    if res1 then
                        local tp = type(res1)
                        -- insert a token list
                        if tp=='table' then
                            yield('','')
                            for _,t in ipairs(res1) do
                                yield(t[1],t[2])
                            end
                        elseif tp == 'string' then -- or search up to some special pattern
                            i1,i2 = strfind(s,res1,idx)
                            if i1 then
                                tok = strsub(s,i1,i2)
                                idx = i2 + 1
                                yield('',tok)
                            else
                                yield('','')
                                idx = sz + 1
                            end
                            --if idx > sz then return end
                        else
                            yield(line,idx)
                        end
                    end
                    if idx > sz then
                        if file then
                            --repeat -- next non-empty line
                                line = line + 1
                                s = file:read()
                                if not s then return end
                            --until not s:match '^%s*$'
                            s = s .. '\n'
                            idx ,sz = 1,#s
                            break
                        else
                            return
                        end
                    else break end
                end
            end
        end
    end
    return wrap(lex)
end
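
-- Usage sketch (illustrative, not executed here): scanning with the
-- default match table,
--
--   for t, v in lexer.scan('x = 42') do print(t, v) end
--
-- prints iden/x, =/=, number/42; spaces are filtered out by default.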

local function isstring (s)
    return type(s) == 'string'
end

--- insert tokens into a stream.
-- @param tok a token stream
-- @param a1 a string is the type, a table is a token list and
-- a function is assumed to be a token-like iterator (returns type & value)
-- @string a2 a string is the value
function lexer.insert (tok,a1,a2)
    if not a1 then return end
    local ts
    if isstring(a1) and isstring(a2) then
        ts = {{a1,a2}}
    elseif type(a1) == 'function' then
        ts = {}
        for t,v in a1() do
            append(ts,{t,v})
        end
    else
        ts = a1
    end
    tok(ts)
end

--- get everything in a stream up to a newline.
-- @param tok a token stream
-- @return a string
function lexer.getline (tok)
    local t,v = tok('.-\n')
    return v
end

--- get current line number.
-- Only available if the input source is a file-like object.
-- @param tok a token stream
-- @return the line number and current column
function lexer.lineno (tok)
    return tok(0)
end

--- get the rest of the stream.
-- @param tok a token stream
-- @return a string
function lexer.getrest (tok)
    local t,v = tok('.+')
    return v
end

--- get the Lua keywords as a set-like table.
-- So `res["and"]` etc would be `true`.
-- @return a table
function lexer.get_keywords ()
    if not lua_keyword then
        lua_keyword = {
            ["and"] = true, ["break"] = true,  ["do"] = true,
            ["else"] = true, ["elseif"] = true, ["end"] = true,
            ["false"] = true, ["for"] = true, ["function"] = true,
            ["if"] = true, ["in"] = true,  ["local"] = true, ["nil"] = true,
            ["not"] = true, ["or"] = true, ["repeat"] = true,
            ["return"] = true, ["then"] = true, ["true"] = true,
            ["until"] = true,  ["while"] = true
        }
    end
    return lua_keyword
end

--- create a Lua token iterator from a string or file-like object.
-- Will return the token type and value.
-- @string s the string
-- @tab[opt] filter a table of token types to exclude, by default `{space=true,comments=true}`
-- @tab[opt] options a table of options; by default, `{number=true,string=true}`,
-- which means convert numbers and strip string quotes.
function lexer.lua(s,filter,options)
    filter = filter or {space=true,comments=true}
    lexer.get_keywords()
    if not lua_matches then
        lua_matches = {
            {WSPACE,wsdump},
            {NUMBER3,ndump},
            {IDEN,lua_vdump},
            {NUMBER4,ndump},
            {NUMBER5,ndump},
            {STRING3,sdump},
            {STRING0,sdump},
            {STRING1,sdump},
            {'^%-%-%[(=*)%[.-%]%1%]',cdump},
            {'^%-%-.-\n',cdump},
            {'^%[(=*)%[.-%]%1%]',sdump_l},
            {'^==',tdump},
            {'^~=',tdump},
            {'^<=',tdump},
            {'^>=',tdump},
            {'^%.%.%.',tdump},
            {'^%.%.',tdump},
            {'^//',tdump},
            {'^.',tdump}
        }
    end
    return lexer.scan(s,lua_matches,filter,options)
end

--- create a C/C++ token iterator from a string or file-like object.
-- Will return the token type and value.
-- @string s the string
-- @tab[opt] filter a table of token types to exclude, by default `{comments=true}`
-- @tab[opt] options a table of options; by default, `{number=true,string=true}`,
-- which means convert numbers and strip string quotes.
function lexer.cpp(s,filter,options)
    filter = filter or {comments=true}
    if not cpp_keyword then
        cpp_keyword = {
            ["class"] = true, ["break"] = true,  ["do"] = true, ["sizeof"] = true,
            ["else"] = true, ["continue"] = true, ["struct"] = true,
            ["false"] = true, ["for"] = true, ["public"] = true, ["void"] = true,
            ["private"] = true, ["protected"] = true, ["goto"] = true,
            ["if"] = true, ["static"] = true,  ["const"] = true, ["typedef"] = true,
            ["enum"] = true, ["char"] = true, ["int"] = true, ["bool"] = true,
            ["long"] = true, ["float"] = true, ["true"] = true, ["delete"] = true,
            ["double"] = true,  ["while"] = true, ["new"] = true,
            ["namespace"] = true, ["try"] = true, ["catch"] = true,
            ["switch"] = true, ["case"] = true, ["extern"] = true,
            ["return"] = true,["default"] = true,['unsigned']  = true,['signed'] = true,
            ["union"] =  true, ["volatile"] = true, ["register"] = true,["short"] = true,
        }
    end
    if not cpp_matches then
        cpp_matches = {
            {WSPACE,wsdump},
            {PREPRO,pdump},
            {NUMBER3,ndump},
            {IDEN,cpp_vdump},
            {NUMBER4,ndump},
            {NUMBER5,ndump},
            {STRING3,sdump},
            {STRING1,chdump},
            {'^//.-\n',cdump},
            {'^/%*.-%*/',cdump},
            {'^==',tdump},
            {'^!=',tdump},
            {'^<=',tdump},
            {'^>=',tdump},
            {'^->',tdump},
            {'^&&',tdump},
            {'^||',tdump},
            {'^%+%+',tdump},
            {'^%-%-',tdump},
            {'^%+=',tdump},
            {'^%-=',tdump},
            {'^%*=',tdump},
            {'^/=',tdump},
            {'^|=',tdump},
            {'^%^=',tdump},
            {'^::',tdump},
            {'^.',tdump}
        }
    end
    return lexer.scan(s,cpp_matches,filter,options)
end

--- get a list of parameters separated by a delimiter from a stream.
-- @param tok the token stream
-- @string[opt=')'] endtoken end of list. Can be '\n'
-- @string[opt=','] delim separator
-- @return a list of token lists.
function lexer.get_separated_list(tok,endtoken,delim)
    endtoken = endtoken or ')'
    delim = delim or ','
    local parm_values = {}
    local level = 1 -- used to count ( and )
    local tl = {}
    local function tappend (tl,t,val)
        val = val or t
        append(tl,{t,val})
    end
    local is_end
    if endtoken == '\n' then
        is_end = function(t,val)
            return t == 'space' and val:find '\n'
        end
    else
        is_end = function (t)
            return t == endtoken
        end
    end
    local token,value
    while true do
        token,value=tok()
        if not token then return nil,'EOS' end -- end of stream is an error!
        if is_end(token,value) and level == 1 then
            append(parm_values,tl)
            break
        elseif token == '(' then
            level = level + 1
            tappend(tl,'(')
        elseif token == ')' then
            level = level - 1
            if level == 0 then -- finished with parm list
                append(parm_values,tl)
                break
            else
                tappend(tl,')')
            end
        elseif token == delim and level == 1 then
            append(parm_values,tl) -- a new parm
            tl = {}
        else
            tappend(tl,token,value)
        end
    end
    return parm_values,{token,value}
end
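
-- Usage sketch: with the stream positioned just after the '(' in 'f(a,b)',
-- lexer.get_separated_list(tok) returns { {{'iden','a'}}, {{'iden','b'}} },
-- one token list per parameter, plus the terminating {token,value} pair.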

--- get the next non-space token from the stream.
-- @param tok the token stream.
function lexer.skipws (tok)
    local t,v = tok()
    while t == 'space' do
        t,v = tok()
    end
    return t,v
end

local skipws = lexer.skipws

--- get the next token, which must be of the expected type.
-- Throws an error if this type does not match!
-- @param tok the token stream
-- @string expected_type the token type
-- @bool no_skip_ws whether we should skip whitespace
function lexer.expecting (tok,expected_type,no_skip_ws)
    assert_arg(1,tok,'function')
    assert_arg(2,expected_type,'string')
    local t,v
    if no_skip_ws then
        t,v = tok()
    else
        t,v = skipws(tok)
    end
    if t ~= expected_type then error ("expecting "..expected_type,2) end
    return v
end

return lexer
]===]

-- Compile and run the embedded lexer module. Note: the script targets
-- Lua 5.2 (load() on a string, plus the bit32 library used below).
local lexer = load( pl_lexer, 'lexer.lua' )()

local enctab = {
  [ " " ] = '0', -- 221208
  [ "." ] = '100', -- 62755
  [ "=" ] = '1010', -- 29673
  [ "end" ] = '1011000', -- 3844
  [ "if" ] = '10110010', -- 1967
  [ "==" ] = '10110011', -- 2003
  [ "function" ] = '101101000', -- 1006
  [ "local" ] = '101101001', -- 1096
  [ "<=" ] = '1011010100', -- 541
  [ "while" ] = '1011010101', -- 557
  [ ":" ] = '10110101100', -- 289
  [ "<" ] = '10110101101', -- 290
  [ "-" ] = '10110101110', -- 304
  [ "for" ] = '10110101111', -- 314
  [ "then" ] = '10110110', -- 2637
  [ "and" ] = '1011011100', -- 644
  [ ">=" ] = '10110111010', -- 319
  [ ">" ] = '10110111011', -- 326
  [ "elseif" ] = '1011011110', -- 670
  [ "else" ] = '10110111110', -- 381
  [ "%" ] = '101101111110', -- 198
  [ "nil" ] = '101101111111', -- 201
  [ "self" ] = '10111', -- 24196
  [ "\n" ] = '1100', -- 45578
  [ "true" ] = '11010000', -- 2799
  [ "," ] = '11010001', -- 2815
  [ "[" ] = '1101001', -- 6744
  [ ")" ] = '110101', -- 12925
  [ "(" ] = '110110', -- 12925
  [ "]" ] = '1101110', -- 6744
  [ "false" ] = '11011110', -- 3512
  [ "{" ] = '11011111000', -- 429
  [ "}" ] = '11011111001', -- 429
  [ "do" ] = '1101111101', -- 871
  [ "+" ] = '1101111110', -- 900
  [ "~=" ] = '1101111111000', -- 99
  [ "return" ] = '11011111110010', -- 55
  [ "/" ] = '1101111111001100', -- 13
  [ "^" ] = '1101111111001101', -- 14
  [ "#" ] = '110111111100111', -- 28
  [ "or" ] = '110111111101', -- 220
  [ "not" ] = '110111111110', -- 244
  [ "//" ] = '1101111111110', -- 121
  [ "in" ] = '110111111111100', -- 34
  [ "*" ] = '110111111111101', -- 35
  [ "until" ] = '11011111111111000', -- 7
  [ "|" ] = '1101111111111100100000', -- 0
  [ "goto" ] = '1101111111111100100001', -- 0
  [ "break" ] = '1101111111111100100010', -- 0
  [ ">>" ] = '11011111111111001000110', -- 0
  [ "\9" ] = '11011111111111001000111', -- 0
  [ "~" ] = '11011111111111001001000', -- 0
  [ "::" ] = '11011111111111001001001', -- 0
  [ ";" ] = '11011111111111001001010', -- 0
  [ "eof" ] = '11011111111111001001011', -- 0
  [ "<<" ] = '11011111111111001001100', -- 0
  [ "\11" ] = '11011111111111001001101', -- 0
  [ "\13" ] = '11011111111111001001110', -- 0
  [ "\12" ] = '11011111111111001001111', -- 0
  [ "..." ] = '1101111111111100101', -- 4
  [ "repeat" ] = '110111111111110011', -- 7
  [ "&" ] = '1101111111111101', -- 20
  [ ".." ] = '110111111111111', -- 39
  [ "literal" ] = '111', -- 116390
}
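
-- enctab is a static, Huffman-style prefix code: no code is a prefix of
-- another, so a decoder can consume bits until exactly one entry matches.
-- The trailing counts are the token frequencies the code was evidently
-- built from. 'literal' escapes identifiers, numbers, strings and
-- comments (see addliteral below) and 'eof' terminates the stream.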

-- Prints a formatted message to stderr and exits with failure.
local function errorout( format, ... )
  io.stderr:write( string.format( format, ... ), '\n' )
  os.exit( 1 )
end

-- Appends each character of a '0'/'1' string to the bit array as a number.
local function addbits( encoded, bits )
  for i = 1, #bits do
    encoded[ #encoded + 1 ] = bits:sub( i, i ) + 0
  end
end

-- Appends a literal (identifier, number, string or comment) as a run of
-- 8-bit bytes, terminated by a NUL byte. If the stream is currently
-- byte-aligned, a single '1' bit is emitted first so the literal never
-- begins exactly on a byte boundary.
local function addliteral( encoded, str )
  if #encoded % 8 == 0 then
    addbits( encoded, '1' )
  end

  for i = 1, #str do
    local k = str:byte( i, i )

    -- Emit the bits of the byte, most significant first.
    for j = 7, 0, -1 do
      if bit32.band( k, bit32.lshift( 1, j ) ) ~= 0 then
        addbits( encoded, '1' )
      else
        addbits( encoded, '0' )
      end
    end
  end

  -- NUL terminator marks the end of the literal.
  addbits( encoded, '00000000' )
end
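
-- Example: with the stream byte-aligned, addliteral( encoded, 'A' ) emits
-- the marker bit '1', then 01000001 (0x41, MSB first), then the 00000000
-- terminator.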

-- Packs the eight bits starting at encoded[ i ] into one character,
-- most significant bit first.
local function bitstobyte( encoded, i )
  local bit = 128
  local byte = 0

  for j = 0, 7 do
    byte = byte + encoded[ i + j ] * bit
    bit = bit / 2
  end

  return string.char( byte )
end
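
-- Example: the bits 0,1,0,0,0,0,0,1 pack to 0*128 + 1*64 + 1*1 = 65, i.e. 'A'.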

local function encode( source )
  local encoded = {}

  -- The empty filter lets spaces and comments through so they can be
  -- encoded as well.
  for token, lexeme in lexer.lua( source, {} ) do
    if token == 'space' then
      -- Whitespace is encoded character by character.
      for i = 1, #lexeme do
        addbits( encoded, enctab[ lexeme:sub( i, i ) ] )
      end
    elseif token == 'keyword' then
      addbits( encoded, enctab[ lexeme ] )
    elseif token == 'iden' then
      addbits( encoded, enctab.literal )
      addliteral( encoded, lexeme )
    elseif token == 'number' then
      addbits( encoded, enctab.literal )
      addliteral( encoded, tostring( lexeme ) )
    elseif token == 'string' then
      -- The lexer strips the quotes, so put them back with %q.
      addbits( encoded, enctab.literal )
      addliteral( encoded, string.format( '%q', lexeme ) )
    elseif token == 'comment' then
      addbits( encoded, enctab.literal )
      addliteral( encoded, lexeme )
    else
      -- Operators and punctuation map directly to codes.
      local bits = enctab[ lexeme ]

      if not bits then
        errorout( 'No code for token %q', lexeme )
      end

      addbits( encoded, bits )
    end
  end

  addbits( encoded, enctab.eof )

  -- Pad the stream with zero bits to a whole number of bytes.
  while #encoded % 8 ~= 0 do
    addbits( encoded, '0' )
  end

  local s = {}

  for i = 1, #encoded, 8 do
    s[ #s + 1 ] = bitstobyte( encoded, i )
  end

  return table.concat( s )
end
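
--[=[ Decoding sketch (illustrative only; the actual decoder is a separate
tool and is not part of this file). Because enctab is a prefix code, a
decoder can invert the table and consume bits until a code matches:

  local dectab = {}
  for token, bits in pairs( enctab ) do dectab[ bits ] = token end

  local function decodetoken( readbit ) -- readbit returns '0' or '1'
    local bits = ''
    repeat
      bits = bits .. readbit()
    until dectab[ bits ]
    return dectab[ bits ]
  end
]=]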

local function main( args )
  if #args ~= 2 then
    io.write( 'Usage: lua bsenc.lua <input.lua> <output.bs>\n' )
    return 0
  end

  local file, err = io.open( args[ 1 ] )

  if not file then
    errorout( 'Error opening %s: %s', args[ 1 ], err )
  end

  local source = file:read( '*a' )
  file:close()

  if not source then
    errorout( 'Could not read from %s', args[ 1 ] )
  end

  local encoded = encode( source )

  file, err = io.open( args[ 2 ], 'wb' )

  if not file then
    errorout( 'Error opening %s: %s', args[ 2 ], err )
  end

  file:write( encoded )
  file:close()
end

return main( arg )
