1%% Copyright (c) 2008,2009 Robert Virding. All rights reserved.
2%%
3%% Redistribution and use in source and binary forms, with or without
4%% modification, are permitted provided that the following conditions
5%% are met:
6%%
7%% 1. Redistributions of source code must retain the above copyright
8%% notice, this list of conditions and the following disclaimer.
9%% 2. Redistributions in binary form must reproduce the above copyright
10%% notice, this list of conditions and the following disclaimer in the
11%% documentation and/or other materials provided with the distribution.
12%%
13%% THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
14%% "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
15%% LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
16%% FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
17%% COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
18%% INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19%% BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
20%% LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
21%% CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22%% LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
23%% ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
24%% POSSIBILITY OF SUCH DAMAGE.
25
26%%% A Lexical Analyser Generator for Erlang.
27%%%
28%%% Most of the algorithms used here are taken pretty much as
29%%% described in the "Dragon Book" by Aho, Sethi and Ullman. Some
%% complementary details were taken from "Compiler Design in C" by
%% Holub.
32
33-module(leex).
34
35-export([compile/3,file/1,file/2,format_error/1]).
36
37-import(lists, [member/2,reverse/1,sort/1,keysort/2,
38                map/2,foldl/3,foldr/3,foreach/2,flatmap/2]).
39-import(ordsets, [is_element/2,add_element/2,union/2]).
40-import(orddict, [store/3]).
41
42-include("erl_compile.hrl").
43%%-include_lib("stdlib/include/erl_compile.hrl").
44
45-define(LEEXINC, "leexinc.hrl").        % Include file
46-define(LEEXLIB, parsetools).           % Leex is in lib parsetools
47%%-define(LEEXLIB, leex).               % Leex is in lib leex
48
49-define(DEFS_HEAD, "Definitions.").
50-define(RULE_HEAD, "Rules.").
51-define(CODE_HEAD, "Erlang code.").
52
53-record(leex, {xfile=[],        % Xrl file
54               efile=[],        % Erl file
55               ifile=[],        % Include file
56               gfile=[],        % Graph file
57               module,          % Module name
58               opts=[],         % Options
59               encoding=none,   % Encoding of Xrl file
60               % posix=false,   % POSIX regular expressions
61               errors=[],
62               warnings=[]
63              }).
64
65-record(nfa_state, {no,edges=[],accept=noaccept}).
66-record(dfa_state, {no,nfa=[],trans=[],accept=noaccept}).
67
68%%%
69%%% Exported functions
70%%%
71
72%%% Interface to erl_compile.
73
%% compile(Input, Output, #options{}) -> ok | error.
%%  Entry point used by erl_compile (erlc).  Translates the
%%  erl_compile options record into a leex option list and runs
%%  file/2, reducing its return value to the ok | error that
%%  erl_compile expects.
compile(Input0, Output0,
        #options{warning = WarnLevel, verbose=Verbose, includes=Includes,
		 specific=Specific}) ->
    Input = assure_extension(shorten_filename(Input0), ".xrl"),
    Output = assure_extension(shorten_filename(Output0), ".erl"),
    %% Only the first include directory (if any) is used.
    Includefile = lists:sublist(Includes, 1),
    Werror = proplists:get_bool(warnings_as_errors, Specific),
    Opts = [{scannerfile,Output},{includefile,Includefile},{verbose,Verbose},
            {report_errors,true},{report_warnings,WarnLevel > 0},
	    {warnings_as_errors, Werror}],
    case file(Input, Opts) of
        {ok, _} ->
            ok;
        error ->
            error
    end.
90
91-type error_info() :: {erl_anno:line() | 'none',
92                       module(), ErrorDescriptor :: term()}.
93-type errors() :: [{file:filename(), [error_info()]}].
94-type warnings() :: [{file:filename(), [error_info()]}].
95-type ok_ret() :: {'ok', Scannerfile :: file:filename()}
96                | {'ok', Scannerfile :: file:filename(), warnings()}.
97-type error_ret() :: 'error'
98                  | {'error', Errors :: errors(), Warnings :: warnings()}.
99-type leex_ret() :: ok_ret() | error_ret().
100
-spec file(FileName) -> leex_ret() when
      FileName :: file:filename().

%% Generate a scanner from FileName using the default options.
file(File) -> file(File, []).
105
-spec file(FileName, Options) -> leex_ret() when
      FileName :: file:filename(),
      Options :: Option | [Option],
      Option :: {'dfa_graph', boolean()}
              | {'includefile', Includefile :: file:filename()}
              | {'report_errors', boolean()}
              | {'report_warnings', boolean()}
              | {'report', boolean()}
              | {'return_errors', boolean()}
              | {'return_warnings', boolean()}
              | {'return', boolean()}
              | {'scannerfile', Scannerfile :: file:filename()}
              | {'verbose', boolean()}
              | {'warnings_as_errors', boolean()}
              | 'dfa_graph'
              | 'report_errors' | 'report_warnings' | 'report'
              | 'return_errors' | 'return_warnings' | 'return'
              | 'verbose' | 'warnings_as_errors'.

%% Generate a scanner from the .xrl file File.  Recognized options
%% from the ERL_COMPILER_OPTIONS environment variable are appended
%% after the explicit ones, so explicit options take precedence.
%% All later processing stages abort by throwing an updated #leex{}
%% state; that throw is caught here and turned into the documented
%% return value by leex_ret/1.
file(File, Opts0) when is_list(Opts0) ->
    case is_filename(File) of
        no -> erlang:error(badarg, [File,Opts0]);
        _ -> ok
    end,
    EnvOpts0 = env_default_opts(),
    EnvOpts = select_recognized_opts(EnvOpts0),
    Opts1 = Opts0 ++ EnvOpts,
    Opts = case options(Opts1) of
               badarg ->
                   erlang:error(badarg, [File,Opts0]);
               Options ->
                   Options
           end,
    St0 = #leex{},
    St1 = filenames(File, Opts, St0),   % Get all the filenames
    St = try
             {ok,REAs,Actions,Code,St2} = parse_file(St1),
             {DFA,DF} = make_dfa(REAs, St2),
             case werror(St2) of
                 false ->
                     St3 = out_file(St2, DFA, DF, Actions, Code),
                     case lists:member(dfa_graph, St3#leex.opts) of
                         true -> out_dfa_graph(St3, DFA, DF);
                         false -> St3
                     end;
                 true ->
                     %% warnings_as_errors in effect: skip generation.
                     St2
             end
         catch #leex{}=St4 ->           % 'throw' is the default class
             St4
         end,
    leex_ret(St);
%% A single option may be given without a list wrapper.
file(File, Opt) ->
    file(File, [Opt]).
160
-spec format_error(ErrorDescriptor) -> io_lib:chars() when
      ErrorDescriptor :: term().

%% Turn an error descriptor produced by this module into printable
%% chars().  The clauses mirror the error terms that the file parser
%% and the regexp engine can emit.
format_error({file_error, Reason}) ->
    io_lib:fwrite("~ts", [file:format_error(Reason)]);
format_error(missing_defs) -> "missing Definitions";
format_error(missing_rules) -> "missing Rules";
format_error(missing_code) -> "missing Erlang code";
format_error(empty_rules) -> "no rules";
format_error(bad_rule) -> "bad rule";
format_error({regexp,E}) ->
    Explain = case E of
                  {interval_range,_} -> "interval range";
                  {unterminated,Cs} -> "unterminated " ++ Cs;
                  {illegal_char,Cs} -> "illegal character " ++ Cs;
                  {char_class,What} ->
                      ["illegal character class ",io_lib:write_string(What)]
              end,
    ["bad regexp `",Explain,"'"];
format_error(ignored_characters) ->
    "ignored characters";
format_error(cannot_parse) ->
    io_lib:fwrite("cannot parse; probably encoding mismatch", []).
188
189%%%
190%%% Local functions
191%%%
192
%% Ensure File carries extension Ext, appending it when absent.
assure_extension(File, Ext) ->
    strip_extension(File, Ext) ++ Ext.
195
%% Assumes File is a filename.  Remove Ext when it is File's current
%% extension; otherwise leave File untouched.
strip_extension(File, Ext) ->
    case filename:extension(File) =:= Ext of
        true -> filename:rootname(File);
        false -> File
    end.
202
%% Copied from compile.erl.
%% Read default compiler options from the ERL_COMPILER_OPTIONS
%% environment variable.  The value is scanned and parsed as one
%% Erlang term; a non-list term is wrapped in a list.  Unparsable
%% values are reported on stdout and ignored.
env_default_opts() ->
    Key = "ERL_COMPILER_OPTIONS",
    case os:getenv(Key) of
	false -> [];
	Str when is_list(Str) ->
	    case erl_scan:string(Str) of
		{ok,Tokens,_} ->
                    %% The variable holds a bare term; add the
                    %% terminating dot erl_parse needs.
                    Dot = {dot, erl_anno:new(1)},
		    case erl_parse:parse_term(Tokens ++ [Dot]) of
			{ok,List} when is_list(List) -> List;
			{ok,Term} -> [Term];
			{error,_Reason} ->
			    io:format("Ignoring bad term in ~s\n", [Key]),
			    []
		    end;
		{error, {_,_,_Reason}, _} ->
		    io:format("Ignoring bad term in ~s\n", [Key]),
		    []
	    end
    end.
224
%% Keep only the options leex knows about; used to filter the
%% options coming from ERL_COMPILER_OPTIONS.
select_recognized_opts(Options0) ->
    Recognized = all_options(),
    [Opt || {Name, _} = Opt <- preprocess_options(Options0),
            lists:member(Name, Recognized)].
231
%% Normalize a user supplied option list: validate it, fill in the
%% defaults for absent options, and reduce {Opt,true}/{Opt,false}
%% pairs to a bare atom / absence.  Returns badarg on any
%% unrecognized or malformed option.
options(Options0) ->
    Options = preprocess_options(Options0),
    AllOptions = all_options(),
    case check_options(Options, AllOptions, []) of
        badarg ->
            badarg;
        OptionValues ->
            %% Every recognized option gets a value, explicit or default.
            Complete =
                [case lists:keyfind(Opt, 1, OptionValues) of
                     false -> {Opt, default_option(Opt)};
                     Found -> Found
                 end || Opt <- AllOptions],
            lists:foldr(fun({_, false}, Acc) -> Acc;
                           ({Opt, true}, Acc) -> [Opt | Acc];
                           (Pair, Acc) -> [Pair | Acc]
                        end, [], Complete)
    end.
251
%% Expand the report/return shorthands in an option list into their
%% *_errors/*_warnings components, preserving order.
preprocess_options(Options) ->
    lists:foldr(fun preproc_opt/2, [], Options).
254
%% Expand one option shorthand.  The bare atoms return/report are the
%% same as {return,true}/{report,true}; anything else goes through
%% atom_option/1, which turns recognized bare atoms into {Atom,true}.
preproc_opt(return, Os) -> preproc_opt({return, true}, Os);
preproc_opt(report, Os) -> preproc_opt({report, true}, Os);
preproc_opt({return, T}, Os) ->
    [{return_errors, T}, {return_warnings, T} | Os];
preproc_opt({report, T}, Os) ->
    [{report_errors, T}, {report_warnings, T} | Os];
preproc_opt(Option, Os) ->
    [try atom_option(Option) catch error:_ -> Option end | Os].
265
%% Validate a preprocessed option list against AllOptions,
%% accumulating normalized {Option,Value} pairs.  Returns badarg on
%% the first invalid option or on a non-proper option list.
check_options([{Option, FileName0} | Rest], AllOptions, Acc)
  when Option =:= includefile; Option =:= scannerfile ->
    case is_filename(FileName0) of
        no ->
            badarg;
        FileName ->
            check_options(Rest, AllOptions, [{Option, FileName} | Acc])
    end;
check_options([{Option, Bool} | Rest], AllOptions, Acc)
  when is_boolean(Bool) ->
    case lists:member(Option, AllOptions) of
        true ->
            check_options(Rest, AllOptions, [{Option, Bool} | Acc]);
        false ->
            badarg
    end;
check_options([], _AllOptions, Acc) ->
    Acc;
check_options(_Options, _, _Acc) ->
    badarg.
286
%% Every option leex recognizes, in canonical order.
all_options() ->
    [dfa_graph, includefile, report_errors, report_warnings,
     return_errors, return_warnings, scannerfile, verbose,
     warnings_as_errors].
291
%% Default value for each recognized option.  Only the report options
%% default to true; the two file options default to "unset" ([]).
default_option(report_errors) -> true;
default_option(report_warnings) -> true;
default_option(includefile) -> [];
default_option(scannerfile) -> [];
default_option(Option) when Option =:= dfa_graph;
                            Option =:= return_errors;
                            Option =:= return_warnings;
                            Option =:= verbose;
                            Option =:= warnings_as_errors ->
    false.
301
%% Map a recognized bare-atom option to its {Option,true} pair; any
%% other term is returned unchanged.
atom_option(Key) when Key =:= dfa_graph;
                      Key =:= report_errors;
                      Key =:= report_warnings;
                      Key =:= warnings_as_errors;
                      Key =:= return_errors;
                      Key =:= return_warnings;
                      Key =:= verbose ->
    {Key, true};
atom_option(Key) -> Key.
310
%% Flatten T as a filename; returns the atom no when T is not valid
%% filename chardata.
is_filename(T) ->
    try
        filename:flatten(T)
    catch
        error:_ -> no
    end.
315
%% Make Name0 relative to the current working directory when it lies
%% underneath it; otherwise return it unchanged.
shorten_filename(Name0) ->
    {ok, Cwd} = file:get_cwd(),
    case string:prefix(Name0, Cwd) of
        nomatch ->
            Name0;
        Tail ->
            %% Drop the path separator left after removing Cwd.
            case unicode:characters_to_list(Tail) of
                [$/ | Relative] -> Relative;
                Relative -> Relative
            end
    end.
326
%% leex_ret(State) -> leex_ret().
%%  Build the final return value of file/2: first report errors and
%%  warnings on stdout as the report_* options request, then return
%%  {ok,...} or error/{error,...} depending on the return_* options
%%  and on whether warnings count as errors.
leex_ret(St) ->
    report_errors(St),
    report_warnings(St),
    Es = pack_errors(St#leex.errors),
    Ws = pack_warnings(St#leex.warnings),
    Werror = werror(St),
    if
        Werror ->
            %% Warnings promoted to errors force the error return.
            do_error_return(St, Es, Ws);
        Es =:= [] ->
            case member(return_warnings, St#leex.opts) of
                true -> {ok, St#leex.efile, Ws};
                false -> {ok, St#leex.efile}
            end;
        true ->
            do_error_return(St, Es, Ws)
    end.
344
%% The error return of file/2: detailed {error,Es,Ws} when the
%% return_errors option is set, otherwise the bare atom error.
do_error_return(St, Es, Ws) ->
    case lists:member(return_errors, St#leex.opts) of
        true -> {error, Es, Ws};
        false -> error
    end.
350
%% True when warnings exist and the warnings_as_errors option is set,
%% i.e. when the warnings should abort scanner generation.
werror(St) ->
    St#leex.warnings =/= [] andalso
        lists:member(warnings_as_errors, St#leex.opts).
354
%% Group all errors under the (single) source file name, sorted by
%% position.  Returns [] when there are no errors.
pack_errors([{File,_}|_] = Es) ->
    [{File, [E || {_, E} <- lists:sort(Es)]}];
pack_errors([]) ->
    [].
359
%% Group all warnings under the (single) source file name, sorted by
%% position.  Returns [] when there are no warnings.
pack_warnings([{File,_}|_] = Ws) ->
    [{File, [W || {_, W} <- lists:sort(Ws)]}];
pack_warnings([]) ->
    [].
364
%% Print every collected error on stdout, sorted by position, when
%% the report_errors option is set.  Positionless errors carry the
%% atom 'none' instead of a line number and omit the ":Line" part.
report_errors(St) ->
    when_opt(fun () ->
                     foreach(fun({File,{none,Mod,E}}) ->
                                     io:fwrite("~ts: ~ts\n",
                                               [File,Mod:format_error(E)]);
                                ({File,{Line,Mod,E}}) ->
                                     io:fwrite("~ts:~w: ~ts\n",
                                               [File,Line,Mod:format_error(E)])
                             end, sort(St#leex.errors))
             end, report_errors, St#leex.opts).
375
%% Print every collected warning on stdout, sorted by position.
%% Warnings are printed when report_warnings is set, or when they are
%% being treated as errors and report_errors is set; in the latter
%% case the "Warning: " prefix is dropped since they really are
%% errors.
report_warnings(St) ->
    Werror = member(warnings_as_errors, St#leex.opts),
    Prefix = case Werror of
		 true -> "";
		 false -> "Warning: "
	     end,
    ReportWerror = Werror andalso member(report_errors, St#leex.opts),
    ShouldReport = member(report_warnings, St#leex.opts) orelse ReportWerror,
    when_bool(fun () ->
		      foreach(fun({File,{none,Mod,W}}) ->
				      io:fwrite("~ts: ~s~ts\n",
						[File,Prefix,
						 Mod:format_error(W)]);
				 ({File,{Line,Mod,W}}) ->
				      io:fwrite("~ts:~w: ~s~ts\n",
						[File,Line,Prefix,
						 Mod:format_error(W)])
			      end, sort(St#leex.warnings))
	      end, ShouldReport).
395
-spec add_error(_, #leex{}) -> no_return().
%% Record error E against the source .xrl file and abort processing
%% by throwing the updated state (caught in file/2).
add_error(E, St) ->
    add_error(St#leex.xfile, E, St).
399
%% Record Error against File and abort processing by throwing the
%% updated state (caught in file/2).  Never returns normally.
add_error(File, Error, St) ->
    throw(St#leex{errors = [{File,Error}|St#leex.errors]}).
402
%% Record warning W at Line.  Unlike errors, warnings do not abort:
%% the updated state is returned normally.
add_warning(Line, W, St) ->
    St#leex{warnings = [{St#leex.xfile,{Line,leex,W}}|St#leex.warnings]}.
405
406%% filenames(File, Options, State) -> State.
407%%  The default output dir is the current directory unless an
408%%  explicit one has been given in the options.
409
filenames(File, Opts, St0) ->
    Dir = filename:dirname(File),
    Base = filename:basename(File, ".xrl"),
    Xfile = filename:join(Dir, Base ++ ".xrl"),
    Efile = Base ++ ".erl",
    Gfile = Base ++ ".dot",
    %% The generated module is named after the input file.
    Module = list_to_atom(Base),
    St1 = St0#leex{xfile=Xfile,
                   opts=Opts,
                   module=Module},
    {includefile,Ifile0} = lists:keyfind(includefile, 1, Opts),
    Ifile = inc_file_name(Ifile0),
    %% Test for explicit scanner file.
    {scannerfile,Ofile} = lists:keyfind(scannerfile, 1, Opts),
    if
        Ofile =:= [] ->
            %% No explicit output file: place .erl and .dot next to
            %% the input file.
            St1#leex{efile=filename:join(Dir, Efile),
                     ifile=Ifile,
                     gfile=filename:join(Dir, Gfile)};
        true ->
            %% Explicit output file: the graph file follows it.
            D = filename:dirname(Ofile),
            St1#leex{efile=Ofile,
                     ifile=Ifile,
                     gfile=filename:join(D, Gfile)}
    end.
435
%% Run Do() when Opt is present in Opts; otherwise do nothing.
when_opt(Do, Opt, Opts) ->
    case member(Opt, Opts) of
        false -> ok;
        true -> Do()
    end.
441
%% Run Do() when Bool is true; otherwise do nothing.
when_bool(Do, true) -> Do();
when_bool(_Do, false) -> ok.
447
%% Print Format/Args on stdout, but only when the verbose option is set.
verbose_print(St, Format, Args) ->
    when_opt(fun () -> io:fwrite(Format, Args) end, verbose, St#leex.opts).
450
451%% parse_file(State) -> {ok,[REA],[Action],Code,NewState} | throw(NewState)
452%%  when
453%%      REA = {RegExp,ActionNo};
454%%      Action = {ActionNo,ActionString};
455%%      Code = {StartLine, StartPos, NumOfLines}. Where the Erlang code is.
456%%
457%%  Read and parse the file Xfile.
458%%  After each section of the file has been parsed we directly call the
459%%  next section. This is done when we detect a line we don't recognise
460%%  in the current section. The file format is very simple and Erlang
461%%  token based, we allow empty lines and Erlang style comments.
462
parse_file(St0) ->
    case file:open(St0#leex.xfile, [read]) of
        {ok,Xfile} ->
            %% Honour any coding comment at the top of the Xrl file.
            St1 = St0#leex{encoding = epp:set_encoding(Xfile)},
            try
                verbose_print(St1, "Parsing file ~ts, ", [St1#leex.xfile]),
                %% We KNOW that errors throw so we can ignore them here.
                {ok,Line1,St2} = parse_head(Xfile, St1),
                {ok,Line2,Macs,St3} = parse_defs(Xfile, Line1, St2),
                {ok,Line3,REAs,Actions,St4} =
                    parse_rules(Xfile, Line2, Macs, St3),
                {ok,Code,St5} = parse_code(Xfile, Line3, St4),
                verbose_print(St5, "contained ~w rules.~n", [length(REAs)]),
                {ok,REAs,Actions,Code,St5}
            after ok = file:close(Xfile)    % Always close, even on throw
            end;
        {error,Error} ->
            add_error({none,leex,{file_error,Error}}, St0)
    end.
482
483%% parse_head(File, State) -> {ok,NextLine,State}.
484%%  Parse the head of the file. Skip all comments and blank lines.
485
486parse_head(Ifile, St) -> {ok,nextline(Ifile, 0, St),St}.
487
488%% parse_defs(File, Line, State) -> {ok,NextLine,Macros,State}.
489%%  Parse the macro definition section of a file. This must exist.
490%%  The section is ended by a non-blank line which is not a macro def.
491
parse_defs(Ifile, {ok,?DEFS_HEAD ++ Rest,L}, St) ->
    %% Anything after "Definitions." on the same line is ignored
    %% (with a warning).
    St1 = warn_ignored_chars(L, Rest, St),
    parse_defs(Ifile, nextline(Ifile, L, St), [], St1);
parse_defs(_, {ok,_,L}, St) ->
    add_error({L,leex,missing_defs}, St);
parse_defs(_, {eof,L}, St) ->
    add_error({L,leex,missing_defs}, St).
499
parse_defs(Ifile, {ok,Chars,L}=Line, Ms, St) ->
    %% This little beauty matches out a macro definition, RE's are so clear.
    %% Group 1 is the macro name (must start with uppercase or '_'),
    %% group 2 is its replacement text up to trailing whitespace.
    MS = "^[ \t]*([A-Z_][A-Za-z0-9_]*)[ \t]*=[ \t]*([^ \t\r\n]*)[ \t\r\n]*\$",
    case re:run(Chars, MS, [{capture,all_but_first,list},unicode]) of
        {match,[Name,Def]} ->
            %%io:fwrite("~p = ~p\n", [Name,Def]),
            parse_defs(Ifile, nextline(Ifile, L, St), [{Name,Def}|Ms], St);
        _ -> {ok,Line,Ms,St}                    % Anything else ends the section
    end;
parse_defs(_, Line, Ms, St) ->
    %% eof (or error) also ends the section.
    {ok,Line,Ms,St}.
511
512%% parse_rules(File, Line, Macros, State) -> {ok,NextLine,REAs,Actions,State}.
513%%  Parse the RE rules section of the file. This must exist.
514
parse_rules(Ifile, {ok,?RULE_HEAD ++ Rest,L}, Ms, St) ->
    %% Anything after "Rules." on the same line is ignored (with a
    %% warning).
    St1 = warn_ignored_chars(L, Rest, St),
    parse_rules(Ifile, nextline(Ifile, L, St), Ms, [], [], 0, St1);
parse_rules(_, {ok,_,L}, _, St) ->
    add_error({L,leex,missing_rules}, St);
parse_rules(_, {eof,L}, _, St) ->
    add_error({L,leex,missing_rules}, St).
522
523%% parse_rules(File, Result, Macros, RegExpActions, Actions, Acount, State) ->
524%%      {ok,NextCLine,RegExpActions,Actions,NewState} | throw(NewState)
525
%% Parse rule lines until the "Erlang code." header or eof is seen.
%% N numbers the actions, so REAs and actions stay paired.
parse_rules(Ifile, NextLine, Ms, REAs, As, N, St) ->
    case NextLine of
        {ok,?CODE_HEAD ++ _Rest,_} ->
            parse_rules_end(Ifile, NextLine, REAs, As, St);
        {ok,Chars,L0} ->
            %%io:fwrite("~w: ~p~n", [L0,Chars]),
            case collect_rule(Ifile, Chars, L0) of
                {ok,Re,Atoks,L1} ->
                    {ok,REA,A,St1} = parse_rule(Re, L0, Atoks, Ms, N, St),
                    parse_rules(Ifile, nextline(Ifile, L1, St), Ms,
                                [REA|REAs], [A|As], N+1, St1);
                {error,E} -> add_error(E, St)
            end;
        {eof,_} ->
            parse_rules_end(Ifile, NextLine, REAs, As, St)
    end.
542
%% Finish the rules section: having no rules at all is an error,
%% otherwise return the collected regexps and actions in source
%% order (they were accumulated in reverse).
parse_rules_end(_, {ok,_,L}, [], [], St) ->
    add_error({L,leex,empty_rules}, St);
parse_rules_end(_, {eof,L}, [], [], St) ->
    add_error({L,leex,empty_rules}, St);
parse_rules_end(_, NextLine, REAs, As, St) ->
    %% Must be *VERY* careful to put rules in correct order!
    {ok, NextLine, lists:reverse(REAs), lists:reverse(As), St}.
550
551%% collect_rule(File, Line, Lineno) ->
552%%      {ok,RegExp,ActionTokens,NewLineno} | {error,E}.
%% Collect a complete rule by reading lines until the regexp and
%% action have been read. Keep track of line number.
555
collect_rule(Ifile, Chars, L0) ->
    %% The regexp is the leading run of non-whitespace characters;
    %% the remainder of the line starts the action, which may span
    %% several lines.
    {RegExp,Rest} = string:take(Chars, " \t\r\n", true),
    case collect_action(Ifile, Rest, L0, []) of
        %% The action must start with ':', which is dropped here.
        {ok,[{':',_}|Toks],L1} -> {ok,RegExp,Toks,L1};
        {ok,_,_} -> {error,{L0,leex,bad_rule}};
        {eof,L1} -> {error,{L1,leex,bad_rule}};
        {error,E,_} -> {error,E}
    end.
564
%% Scan the action using erl_scan's continuation interface, reading
%% further lines until a complete (dot-terminated) token sequence
%% has been seen.
collect_action(_Ifile, {error, _}, L, _Cont0) ->
    %% io:get_line/2 failed, most likely an encoding mismatch.
    {error, {L, leex, cannot_parse}, ignored_end_line};
collect_action(Ifile, Chars, L0, Cont0) ->
    case erl_scan:tokens(Cont0, Chars, L0) of
        {done,{ok,Toks,_},_} -> {ok,Toks,L0};
        {done,{eof,_},_} -> {eof,L0};
        {done,{error,E,_},_} -> {error,E,L0};
        {more,Cont1} ->
            collect_action(Ifile, io:get_line(Ifile, leex), L0+1, Cont1)
    end.
575
576%% parse_rule(RegExpString, RegExpLine, ActionTokens, Macros, Counter, State) ->
577%%      {ok,{RE,Action},ActionData,State}.
578%%  Parse one regexp after performing macro substition.
579
%% A rule whose action is just "." gets the empty action.
parse_rule(S, Line, [{dot,_}], Ms, N, St) ->
    case parse_rule_regexp(S, Ms, St) of
        {ok,R} ->
            {ok,{R,N},{N,empty_action},St};
        {error,E} ->
            add_error({Line,leex,E}, St)
    end;
parse_rule(S, Line, Atoks, Ms, N, St) ->
    case parse_rule_regexp(S, Ms, St) of
        {ok,R} ->
            %%io:fwrite("RE = ~p~n", [R]),
            %% Check for token variables.  Recording which of the
            %% predefined variables the action uses lets the
            %% generated scanner compute only what is needed.
            TokenChars = var_used('TokenChars', Atoks),
            TokenLen = var_used('TokenLen', Atoks),
            TokenLine = var_used('TokenLine', Atoks),
            {ok,{R,N},{N,Atoks,TokenChars,TokenLen,TokenLine},St};
        {error,E} ->
            add_error({Line,leex,E}, St)
    end.
599
%% True when the variable Name occurs as a {var,_,Name} token in
%% Toks.  keyfind stops at the first token whose third element is
%% Name, whatever its tag; it counts only if that token is a var.
var_used(Name, Toks) ->
    case lists:keyfind(Name, 3, Toks) of
        {var, _, Name} -> true;
        _Other -> false
    end.
605
606%% parse_rule_regexp(RegExpString, Macros, State) ->
607%%     {ok,RegExp} | {error,Error}.
608%% Substitute in macros and parse RegExpString. Cannot use re:replace
609%% here as it uses info in replace string (&).
610
%% Substitute each macro reference {M} in RE0 by its expansion, then
%% parse the resulting regexp.  re:replace cannot be used for the
%% substitution since "&" in the replacement would be special.
parse_rule_regexp(RE0, [{M,Exp}|Ms], St) ->
    Parts = re:split(RE0, "\\{" ++ M ++ "\\}", [{return,list},unicode]),
    RE1 = lists:append(lists:join(Exp, Parts)),
    parse_rule_regexp(RE1, Ms, St);
parse_rule_regexp(RE, [], St) ->
    %%io:fwrite("RE = ~p~n", [RE]),
    case re_parse(RE, St) of
        {ok,R} -> {ok,R};
        {error,E} -> {error,{regexp,E}}
    end.
621
622%% parse_code(File, Line, State) -> {ok,Code,NewState}.
623%%  Finds the line and the position where the code section of the file
624%%  begins. This must exist.
625
parse_code(Ifile, {ok,?CODE_HEAD ++ Rest,CodeL}, St) ->
    St1 = warn_ignored_chars(CodeL, Rest, St),
    %% Remember where the code starts; it is copied verbatim from the
    %% file later using this position.
    {ok, CodePos} = file:position(Ifile, cur),
    %% Just count the lines; copy the code from file to file later.
    EndCodeLine = count_lines(Ifile, CodeL, St),
    NCodeLines = EndCodeLine - CodeL,
    {ok,{CodeL,CodePos,NCodeLines},St1};
parse_code(_, {ok,_,L}, St) ->
    add_error({L,leex,missing_code}, St);
parse_code(_, {eof,L}, St) ->
    add_error({L,leex,missing_code}, St).
637
%% Count the remaining lines of File, continuing the count from N.
%% A read error aborts via add_error (encoding mismatch).
count_lines(File, N, St) ->
    case io:get_line(File, leex) of
        eof ->
            N;
        {error, _} ->
            add_error({N+1, leex, cannot_parse}, St);
        _Line ->
            count_lines(File, N+1, St)
    end.
644
645%% nextline(InputFile, PrevLineNo, State) -> {ok,Chars,LineNo} | {eof,LineNo}.
646%%  Get the next line skipping comment lines and blank lines.
647
nextline(Ifile, L, St) ->
    case io:get_line(Ifile, leex) of
        eof -> {eof,L};
        {error, _} -> add_error({L+1, leex, cannot_parse}, St);
        Chars ->
            %% Skip the line when, after leading whitespace, it is
            %% empty or starts a %-comment.
            case string:take(Chars, " \t\n") of
                {_, [$%|_Rest]} -> nextline(Ifile, L+1, St);
                {_, []} -> nextline(Ifile, L+1, St);
                _Other -> {ok,Chars,L+1}
            end
    end.
659
%% Add an ignored_characters warning for Line when S contains any
%% visible characters; otherwise return the state unchanged.
warn_ignored_chars(Line, S, St) ->
    case non_white(S) of
        [] -> St;
        [_|_] -> add_warning(Line, ignored_characters, St)
    end.
665
%% The visible characters of S: anything above the space character
%% that is not in the Latin-1 control range 128..160.
non_white(S) ->
    lists:filter(fun(C) ->
                         C > $\s andalso (C < 16#80 orelse C > 16#A0)
                 end, S).
668
669%% This is the regular expression grammar used. It is equivalent to the
670%% one used in AWK, except that we allow ^ $ to be used anywhere and fail
671%% in the matching.
672%%
673%% reg -> alt : '$1'.
674%% alt -> seq "|" seq ... : {alt,['$1','$2'...]}.
675%% seq -> repeat repeat ... : {seq,['$1','$2'...]}.
676%% repeat -> repeat "*" : {kclosure,'$1'}.
677%% repeat -> repeat "+" : {pclosure,'$1'}.
678%% repeat -> repeat "?" : {optional,'$1'}.
679%% repeat -> repeat "{" [Min],[Max] "}" : {interval,'$1',Min,Max}
680%% repeat -> single : '$1'.
681%% single -> "(" reg ")" : {sub,'$2',Number}.
682%% single -> "^" : bos/bol.
683%% single -> "$" : eos/eol.
684%% single -> "." : any.
685%% single -> "[" class "]" : {char_class,char_class('$2')}
686%% single -> "[" "^" class "]" : {comp_class,char_class('$3')}.
687%% single -> "\"" chars "\"" : {lit,'$2'}.
688%% single -> "\\" char : {lit,['$2']}.
689%% single -> char : {lit,['$1']}.
690%% single -> empty : epsilon.
691%%  The grammar of the current regular expressions. The actual parser
692%%  is a recursive descent implementation of the grammar.
693
694%% re_parse(Chars, State) -> {ok,RegExp} | {error,Error}.
695
%% Parse the whole regexp in Cs0.  re_reg/3 throws {parse_error,E}
%% on bad input; the old-style catch turns that throw into an error
%% return here.  Characters left over after a successful parse mean
%% the parse was stopped by an illegal character.
re_parse(Cs0, St) ->
    case catch re_reg(Cs0, 0, St) of
        {RE,_,[]} -> {ok,RE};
        {_,_,[C|_]} -> {error,{illegal_char,[C]}};
        {parse_error,E} -> {error,E}
    end.
702
703parse_error(E) -> throw({parse_error,E}).
704
705re_reg(Cs, Sn, St) -> re_alt(Cs, Sn, St).
706
%% alt -> seq ("|" seq)*.  A single alternative is returned unwrapped.
re_alt(Cs0, Sn0, St) ->
    {L,Sn1,Cs1} = re_seq(Cs0, Sn0, St),
    case re_alt1(Cs1, Sn1, St) of
        {[],Sn2,Cs2} -> {L,Sn2,Cs2};
        {Rs,Sn2,Cs2} -> {{alt,[L|Rs]},Sn2,Cs2}
    end.
713
%% Collect the remaining "|"-separated alternatives, if any.
re_alt1([$||Cs0], Sn0, St) ->
    {L,Sn1,Cs1} = re_seq(Cs0, Sn0, St),
    {Rs,Sn2,Cs2} = re_alt1(Cs1, Sn1, St),
    {[L|Rs],Sn2,Cs2};
re_alt1(Cs, Sn, _) -> {[],Sn,Cs}.
719
720%% Parse a sequence of regexps. Don't allow the empty sequence.
721%% re_seq(Cs0, Sn0, St) ->
722%%     {L,Sn1,Cs1} = repeat(Cs0, Sn0, St),
723%%     case re_seq1(Cs1, Sn1, St) of
724%%      {[],Sn2,Cs2} -> {L,Sn2,Cs2};
725%%      {Rs,Sn2,Cs2} -> {{seq,[L|Rs]},Sn2,Cs2}
726%%     end.
727
728%% re_seq(Chars, SubNumber, State) -> {RegExp,SubNumber,Chars}.
729%% Parse a sequence of regexps. Allow the empty sequence, returns epsilon.
730
re_seq(Cs0, Sn0, St) ->
    case re_seq1(Cs0, Sn0, St) of
        {[],Sn1,Cs1} -> {epsilon,Sn1,Cs1};      % Empty sequence
        {[R],Sn1,Cs1} -> {R,Sn1,Cs1};           % Single element unwrapped
        {Rs,Sn1,Cs1} -> {{seq,Rs},Sn1,Cs1}
    end.
737
%% Collect repeats until "|", ")" or the end of input stops the
%% sequence.
re_seq1([C|_]=Cs0, Sn0, St) when C =/= $|, C =/= $) ->
    {L,Sn1,Cs1} = re_repeat(Cs0, Sn0, St),
    {Rs,Sn2,Cs2} = re_seq1(Cs1, Sn1, St),
    {[L|Rs],Sn2,Cs2};
re_seq1(Cs, Sn, _) -> {[],Sn,Cs}.
743
744%% re_repeat(Chars, SubNumber, State) -> {RegExp,SubNumber,Chars}.
745
%% repeat -> single ("*" | "+" | "?")*.
re_repeat(Cs0, Sn0, St) ->
    {S,Sn1,Cs1} = re_single(Cs0, Sn0, St),
    re_repeat1(Cs1, Sn1, S, St).
749
%% Wrap S in a closure operator for each trailing *, + or ?.
re_repeat1([$*|Cs], Sn, S, St) -> re_repeat1(Cs, Sn, {kclosure,S}, St);
re_repeat1([$+|Cs], Sn, S, St) -> re_repeat1(Cs, Sn, {pclosure,S}, St);
re_repeat1([$?|Cs], Sn, S, St) -> re_repeat1(Cs, Sn, {optional,S}, St);
%% { only starts interval when ere is true, otherwise normal character.
%% re_repeat1([${|Cs0], Sn, S, #leex{posix=true}=St) ->    % $}
%%     case re_interval_range(Cs0) of
%%         {Min,Max,[$}|Cs1]} when is_integer(Min), is_integer(Max), Min =< Max ->
%%             re_repeat1(Cs1, Sn, {interval,S,Min,Max}, St);
%%         {Min,Max,[$}|Cs1]} when is_integer(Min), is_atom(Max) ->
%%             re_repeat1(Cs1, Sn, {interval,S,Min,Max}, St);
%%         {_,_,Cs1} -> parse_error({interval_range,string_between([${|Cs0], Cs1)})
%%     end;
re_repeat1(Cs, Sn, S, _) -> {S,Sn,Cs}.
763
764%% re_single(Chars, SubNumber, State) -> {RegExp,SubNumber,Chars}.
765%% Parse a re_single regexp.
766
%% A parenthesized group; the group itself is currently not captured,
%% only the sub-expression count is maintained.
re_single([$(|Cs0], Sn0, St) ->                % $)
    Sn1 = Sn0 + 1,                             % Keep track of sub count anyway
    case re_reg(Cs0, Sn1, St) of
        {S,Sn2,[$)|Cs1]} -> {S,Sn2,Cs1};
        %%{S,Sn2,[$)|Cs1]} -> {{sub,S,Sn1},Sn2,Cs1};
        _ -> parse_error({unterminated,"("})
    end;
%% These are not legal inside a regexp.
%% re_single([$^|Cs], Sn, St) -> {bos,Sn,Cs};
%% re_single([$$|Cs], Sn, St) -> {eos,Sn,Cs};
%% re_single([$.|Cs], Sn, St) -> {any,Sn,Cs};
%% "." matches anything except newline, expressed as a complemented class.
re_single([$.|Cs], Sn, _) -> {{comp_class,"\n"},Sn,Cs}; % Do this here?
re_single("[^" ++ Cs0, Sn, St) ->
    %% Complemented character class [^...].
    case re_char_class(Cs0, St) of
        {Cc,[$]|Cs1]} -> {{comp_class,Cc},Sn,Cs1};
        _ -> parse_error({unterminated,"["})
    end;
re_single([$[|Cs0], Sn, St) ->
    %% Plain character class [...].
    case re_char_class(Cs0, St) of
        {Cc,[$]|Cs1]} -> {{char_class,Cc},Sn,Cs1};
        _ -> parse_error({unterminated,"["})
    end;
re_single([$\\|Cs0], Sn, _) ->
    %% Backslash escape: a single literal character.
    {C,Cs1} = re_char($\\, Cs0),
    {{lit,[C]},Sn,Cs1};
re_single([C|Cs0], Sn, St) ->
    %% Any other character is a literal, unless it is an (unescaped)
    %% metacharacter, which is an error here.
    case special_char(C, St) of
        true -> parse_error({illegal_char,[C]});
        false ->
            {C,Cs1} = re_char(C, Cs0),
            {{lit,[C]},Sn,Cs1}
    end.
799
800-define(IS_HEX(C), C >= $0 andalso C =< $9 orelse
801        C >= $A andalso C =< $F orelse
802        C >= $a andalso C =< $f).
803
804%% re_char(Char, Chars) -> {CharValue,Chars}.
805%%  Reads one character value from the input list, it knows about escapes.
806
%% Decode one character value, handling backslash escapes: three
%% octal digits, \xHH, \x{H...}, and the named escapes handled by
%% escape_char/1.  An unescaped character stands for itself.
re_char($\\, [O1,O2,O3|S]) when
      O1 >= $0, O1 =< $7, O2 >= $0, O2 =< $7, O3 >= $0, O3 =< $7 ->
    %% \ooo: three octal digits.
    {(O1 - $0)*64 + (O2 - $0)*8 + (O3 - $0), S};
re_char($\\, [$x,H1,H2|S]) when ?IS_HEX(H1), ?IS_HEX(H2) ->
    %% \xHH: exactly two hex digits.
    {erlang:list_to_integer([H1,H2], 16), S};
re_char($\\, [$x,${|S0]) ->
    %% \x{H...}: any number of hex digits up to the closing brace.
    re_hex(S0, []);
re_char($\\, [$x|_]) ->
    parse_error({illegal_char,"\\x"});
re_char($\\, [C|S]) -> {escape_char(C), S};
re_char($\\, []) -> parse_error({unterminated,"\\"});
re_char(C, S) -> {C, S}.                        % Just this character
819
%% Read the digits of a \x{H...} escape up to the closing brace.
%% Only values up to the Unicode maximum 16#10FFFF are accepted, and
%% at least one hex digit is required: previously "\\x{}" reached
%% list_to_integer("", 16), which raises badarg and crashed leex
%% (re_parse's catch has no clause for {'EXIT',_}); now it is
%% reported as an ordinary regexp error.
re_hex([C|Cs], L) when ?IS_HEX(C) -> re_hex(Cs, [C|L]);
re_hex([$}|_], []) ->
    %% Empty brace pair: no digits at all.
    parse_error({illegal_char,"\\x{}"});
re_hex([$}|S], L0) ->
    L = lists:reverse(L0),
    case erlang:list_to_integer(L, 16) of
        C when C =< 16#10FFFF -> {C,S};
        _ -> parse_error({illegal_char,[$\\,$x,${|L]++"}"})
    end;
re_hex(_, _) -> parse_error({unterminated,"\\x{"}).
828
829%% special_char(Char, State) -> bool().
830%% These are the special characters for an ERE.
831%% N.B. ]}) are only special in the context after [{(.
832
%% (${ would also be special if POSIX interval expressions were
%% supported.)
special_char(C, _) -> lists:member(C, "^.[$()|*+?\\").
846
847%% re_char_class(Chars, State) -> {CharClass,Chars}.
848%% Parse a character class.
849
re_char_class([$]|Cs], St) ->                   % Must special case this.
    %% A ] first in a class is a literal ], not the terminator.
    re_char_class(Cs, [$]], St);
re_char_class(Cs, St) -> re_char_class(Cs, [], St).
853
854%% re_char_class("[:" ++ Cs0, Cc, #leex{posix=true}=St) ->
855%%     %% POSIX char class only.
856%%     case posix_cc(Cs0) of
857%%         {Pcl,":]" ++ Cs1} -> re_char_class(Cs1, [{posix,Pcl}|Cc], St);
858%%         {_,Cs1} -> parse_error({posix_cc,string_between(Cs0, Cs1)})
859%%     end;
%% re_char_class(Chars, Acc, State) -> {CharClass,RestChars}.
%%  Collect the elements of a bracket expression: single characters
%%  or {range,First,Last} ranges written as "F-L". Stops at ].
re_char_class([C1|Cs0], Cc, St) when C1 =/= $] ->
    case re_char(C1, Cs0) of
        {Cf,[$-,C2|Cs1]} when C2 =/= $] ->
            %% "F-L": a range, which must be non-empty (Cf < Cl).
            case re_char(C2, Cs1) of
                {Cl,Cs2} when Cf < Cl ->
                    re_char_class(Cs2, [{range,Cf,Cl}|Cc], St);
                {_,Cs2} ->
                    parse_error({char_class,string_between([C1|Cs0], Cs2)})
            end;
        {C,Cs1} -> re_char_class(Cs1, [C|Cc], St)
    end;
re_char_class(Cs, Cc, _) -> {reverse(Cc),Cs}.   % Preserve order
872
873%% posix_cc(String) -> {PosixClass,RestString}.
874%%  Handle POSIX character classes.
875
876%% posix_cc("alnum" ++ Cs) -> {alnum,Cs};
877%% posix_cc("alpha" ++ Cs) -> {alpha,Cs};
878%% posix_cc("blank" ++ Cs) -> {blank,Cs};
879%% posix_cc("cntrl" ++ Cs) -> {cntrl,Cs};
880%% posix_cc("digit" ++ Cs) -> {digit,Cs};
881%% posix_cc("graph" ++ Cs) -> {graph,Cs};
882%% posix_cc("lower" ++ Cs) -> {lower,Cs};
883%% posix_cc("print" ++ Cs) -> {print,Cs};
884%% posix_cc("punct" ++ Cs) -> {punct,Cs};
885%% posix_cc("space" ++ Cs) -> {space,Cs};
886%% posix_cc("upper" ++ Cs) -> {upper,Cs};
887%% posix_cc("xdigit" ++ Cs) -> {xdigit,Cs};
888%% posix_cc(Cs) -> parse_error({posix_cc,string:slice(Cs, 0, 5)}).
889
%% escape_char(Char) -> Char'.
%%  Map the character following a backslash to the character it
%%  denotes; a character without a special meaning escapes to itself.
escape_char(C) ->
    case lists:keyfind(C, 1, [{$n,$\n},{$r,$\r},{$t,$\t},
                              {$v,$\v},{$b,$\b},{$f,$\f},
                              {$e,$\e},{$s,$\s},{$d,$\d}]) of
        {C,Esc} -> Esc;
        false -> C                              % Pass it straight through
    end.
900
901%% re_interval_range(Chars) -> {Min,Max,RestChars}.
902%% NoInt     -> none,none
903%% Int       -> Int,none
904%% Int,      -> Int,any
905%% Int1,Int2 -> Int1,Int2
906
907%% re_interval_range(Cs0) ->
908%%     case re_number(Cs0) of
909%%         {none,Cs1} -> {none,none,Cs1};
910%%         {N,[$,|Cs1]} ->
911%%             case re_number(Cs1) of
912%%                 {none,Cs2} -> {N,any,Cs2};
913%%                 {M,Cs2} -> {N,M,Cs2}
914%%             end;
915%%         {N,Cs1} -> {N,none,Cs1}
916%%     end.
917
918%% re_number([C|Cs]) when C >= $0, C =< $9 ->
919%%     re_number(Cs, C - $0);
920%% re_number(Cs) -> {none,Cs}.
921
922%% re_number([C|Cs], Acc) when C >= $0, C =< $9 ->
923%%     re_number(Cs, 10*Acc + (C - $0));
924%% re_number(Cs, Acc) -> {Acc,Cs}.
925
%% string_between(Cs1, Cs2) -> string().
%%  Return the prefix of Cs1 that was consumed before reaching the
%%  suffix Cs2, i.e. Cs1 with its trailing Cs2 characters removed.
string_between(Cs1, Cs2) ->
    Consumed = string:length(Cs1) - string:length(Cs2),
    string:slice(Cs1, 0, Consumed).
928
929%% We use standard methods, Thompson's construction and subset
930%% construction, to create first an NFA and then a DFA from the
931%% regexps. A non-standard feature is that we work with sets of
%% character ranges (crs) instead of sets of characters. This is most
933%% noticeable when constructing DFAs. The major benefit is that we can
934%% handle characters from any set, not just limited ASCII or 8859,
935%% even 16/32 bit unicode.
936%%
937%% The whole range of characters is 0-maxchar, where maxchar is a BIG
938%% number. We don't make any assumptions about the size of maxchar, it
939%% is just bigger than any character.
940%%
941%% Using character ranges makes describing many regexps very simple,
942%% for example the regexp "." just becomes the range
943%% [{0-9},{11-maxchar}].
944
%% make_dfa(RegExpActions, State) -> {DFA,DfaStartState}.
%% Build a DFA from a list of {RegExp,Action}: first a combined NFA
%% (whose accept field has values {accept,Action}|noaccept), then a
%% DFA which is finally minimised.
948
make_dfa(REAs, St) ->
    %% NFA (Thompson) -> DFA (subset construction) -> minimised DFA.
    {Nfa,NfaFirst} = build_combined_nfa(REAs),
    verbose_print(St, "NFA contains ~w states, ", [tuple_size(Nfa)]),
    {Dfa0,DfaFirst0} = build_dfa(Nfa, NfaFirst),
    verbose_print(St, "DFA contains ~w states, ", [length(Dfa0)]),
    {Dfa,DfaFirst} = minimise_dfa(Dfa0, DfaFirst0),
    verbose_print(St, "minimised to ~w states.~n", [length(Dfa)]),
    {Dfa,DfaFirst}.
960
961%% build_combined_nfa(RegExpActionList) -> {NFA,FirstState}.
962%%  Build the combined NFA using Thompson's construction straight out
963%%  of the book. Build the separate NFAs in the same order as the
%%  rules so that the accepting states have ascending
965%%  state numbers. Start numbering the states from 1 as we put the
966%%  states in a tuple with the state number as the index.
967%%
968%%  The edges from a state are a list of {CharRange,State} | {epsilon,State}.
969
build_combined_nfa(REAs) ->
    {Nfa0,Firsts,Free} = build_nfa_list(REAs, [], [], 1),
    %% Add a master start state with an epsilon edge to each rule's NFA.
    Start = #nfa_state{no=Free,edges=epsilon_trans(Firsts)},
    {list_to_tuple(lists:keysort(#nfa_state.no, [Start|Nfa0])),Free}.
974
%% build_nfa_list(RegExpActions, NFAAcc, FirstsAcc, FreeState) ->
%%      {NFA,Firsts,FreeState}.
%%  Build one NFA per rule, collecting the start states in rule order.
build_nfa_list([{RE,Action}|REAs], NFA0, Firsts0, Free0) ->
    {NFA1,Free1,First} = build_nfa(RE, Free0, Action),
    build_nfa_list(REAs, NFA1 ++ NFA0, [First|Firsts0], Free1);
build_nfa_list([], NFA, Firsts, Free) ->
    {NFA,lists:reverse(Firsts),Free}.
980
%% epsilon_trans(States) -> [Edge].
%%  An epsilon edge to each of the given states.
epsilon_trans(Firsts) ->
    lists:map(fun (F) -> {epsilon,F} end, Firsts).
982
983%% build_nfa(RegExp, NextState, Action) -> {NFA,NextState,FirstState}.
984%%  When building the NFA states for a regexp we don't build the end
985%%  state, just allocate a State for it and return this state's
986%%  number. This allows us to avoid building unnecessary states for
987%%  concatenation which would then have to be removed by overwriting
988%%  an existing state.
989
build_nfa(RE, N0, Action) ->
    %% N0 is this rule's start state; the interior states are
    %% numbered from N0+1. The end state E is only a number until we
    %% materialise it here as the rule's accepting state.
    {NFA,N1,E} = build_nfa(RE, N0+1, N0, []),
    {[#nfa_state{no=E,accept={accept,Action}}|NFA],N1,N0}.
993
994%% build_nfa(RegExp, NextState, FirstState, NFA) -> {NFA,NextState,EndState}.
995%%  Build an NFA from the RegExp. NFA is a list of #nfa_state{} in no
996%%  predefined order. NextState is the number of the next free state
%%  to use, FirstState is the state which must be the start for
998%%  this regexp as a previous regexp refers to it, EndState is the
999%%  state to which this NFA will exit to. The number of the returned
1000%%  EndState is already allocated!
1001
build_nfa({alt,REs}, N, F, NFA) ->              % R1|R2|...
    build_nfa_alt(REs, N, F, NFA);
build_nfa({seq,REs}, N, F, NFA) ->              % R1R2...
    build_nfa_seq(REs, N, F, NFA);
build_nfa({kclosure,RE}, N0, F, NFA0) ->        % R*
    %% F goes to the inner NFA or (zero matches) straight to E; the
    %% inner end state loops back to N0 or exits to E.
    {NFA1,N1,E1} = build_nfa(RE, N0+1, N0, NFA0),
    E = N1,                             % End state
    {[#nfa_state{no=F,edges=[{epsilon,N0},{epsilon,E}]},
      #nfa_state{no=E1,edges=[{epsilon,N0},{epsilon,E}]}|NFA1],
     N1+1,E};
build_nfa({pclosure,RE}, N0, F, NFA0) ->        % R+
    %% As for R* but without the zero-match edge from F to E.
    {NFA1,N1,E1} = build_nfa(RE, N0+1, N0, NFA0),
    E = N1,                             % End state
    {[#nfa_state{no=F,edges=[{epsilon,N0}]},
      #nfa_state{no=E1,edges=[{epsilon,N0},{epsilon,E}]}|NFA1],
     N1+1,E};
build_nfa({optional,RE}, N0, F, NFA0) ->        % R?
    %% As for R* but without the loop back from E1 to N0.
    {NFA1,N1,E1} = build_nfa(RE, N0+1, N0, NFA0),
    E = N1,                             % End state
    {[#nfa_state{no=F,edges=[{epsilon,N0},{epsilon,E}]},
      #nfa_state{no=E1,edges=[{epsilon,E}]}|NFA1],
     N1+1,E};
build_nfa({char_class,Cc}, N, F, NFA) ->        % [...]
    {[#nfa_state{no=F,edges=[{pack_cc(Cc),N}]}|NFA],N+1,N};
build_nfa({comp_class,Cc}, N, F, NFA) ->        % [^...]
    {[#nfa_state{no=F,edges=[{comp_class(Cc),N}]}|NFA],N+1,N};
build_nfa({lit,Cs}, N, F, NFA) ->               % Implicit concatenation
    build_nfa_lit(Cs, N, F, NFA);
build_nfa(epsilon, N, F, NFA) ->                % Just an epsilon transition
    {[#nfa_state{no=F,edges=[{epsilon,N}]}|NFA],N+1,N}.
1032
1033%% build_nfa_lit(Chars, NextState, FirstState, NFA) -> {NFA,NextState,EndState}.
1034%%  Build an NFA for the sequence of literal characters.
1035
build_nfa_lit([C|Cs], N, F, NFA) ->
    %% One state per literal character, chained F -> N -> N+1 -> ...
    build_nfa_lit(Cs, N+1, N, [#nfa_state{no=F,edges=[{[{C,C}],N}]}|NFA]);
build_nfa_lit([], N, F, NFA) -> {NFA,N,F}.
1040
1041%% build_nfa_lit([C|Cs], N, F, NFA0) when is_integer(C) ->
1042%%     NFA1 = [#nfa_state{no=F,edges=[{[{C,C}],N}]}|NFA0],
1043%%     build_nfa_lit(Cs, N+1, N, NFA1);
1044%% build_nfa_lit([], N, F, NFA) -> {NFA,N,F}.
1045
1046%% build_nfa_seq(REs, NextState, FirstState, NFA) -> {NFA,NextState,EndState}.
1047%%  Build an NFA for the regexps in a sequence.
1048
build_nfa_seq([RE|REs], N0, F, NFA0) ->
    %% Each sub-NFA starts at the previous one's end state.
    {NFA1,N1,E1} = build_nfa(RE, N0, F, NFA0),
    build_nfa_seq(REs, N1, E1, NFA1);
build_nfa_seq([], N, F, NFA) -> {NFA,N,F}.
1052
1053%% build_nfa_seq([RE|REs], N0, F, NFA0) ->
1054%%     {NFA1,N1,E1} = build_nfa(RE, N0, F, NFA0),
1055%%     build_nfa_seq(REs, N1, E1, NFA1);
1056%% build_nfa_seq([], N, F, NFA) -> {NFA,N,F}.
1057
1058%% build_nfa_alt(REs, NextState, FirstState, NFA) -> {NFA,NextState,EndState}.
1059%%  Build an NFA for the regexps in an alternative. N.B. we don't
1060%%  handle empty alts here but the parser should never generate them
1061%%  anyway.
1062
build_nfa_alt([RE], N, F, NFA) -> build_nfa(RE, N, F, NFA);
build_nfa_alt([RE|REs], N0, F, NFA0) ->
    %% F splits to this alternative (start N0) or to the remaining
    %% alternatives (start N1); both exits are tied together with
    %% epsilon edges to the common end state E, allocated last.
    {NFA1,N1,E1} = build_nfa(RE, N0+1, N0, NFA0),
    {NFA2,N2,E2} = build_nfa_alt(REs, N1+1, N1, NFA1),
    E = N2,                                     % End state
    {[#nfa_state{no=F,edges=[{epsilon,N0},{epsilon,N1}]},
      #nfa_state{no=E1,edges=[{epsilon,E}]},
      #nfa_state{no=E2,edges=[{epsilon,E}]}|NFA2],
     N2+1,E}.
1072
1073%% build_nfa_alt(REs, NextState, FirstState, NFA) -> {NFA,NextState,EndState}.
1074%%  Build an NFA for the regexps in an alternative. Make one big
1075%%  epsilon split state, not necessary but fun.
1076
1077%% build_nfa_alt(REs, N0, F0, NFA0) ->
1078%%     E = N0,                                  % Must reserve End state first
1079%%     {Fs,{NFA1,N1}} = mapfoldl(fun (RE, {NFA,N}) ->
1080%%                                    build_nfa_alt1(RE, N, E, NFA)
1081%%                            end, {NFA0,N0+1}, REs),
1082%%     {[#nfa_state{no=F0,edges=epsilon_trans(Fs)},
1083%%       #nfa_state{no=E,edges=[{epsilon,N1}]}|NFA1],N1+1,N1}.
1084
1085%% build_nfa_alt1(RE, N0, End, NFA0) ->
1086%%     {NFA1,N1,E} = build_nfa(RE, N0+1, N0, NFA0),
1087%%     {N0,{[#nfa_state{no=E,edges=[{epsilon,End}]}|NFA1],N1}}.
1088
1089%% pack_cc(CharClass) -> CharClass
1090%%  Pack and optimise a character class specification (bracket
1091%%  expression). First sort it and then compact it.
1092
pack_cc(Cc) ->
    %% Turn the parsed class elements into an ordered set of
    %% {First,Last} ranges; a singleton character C becomes {C,C}.
    Crs = lists:foldl(fun ({range,Cf,Cl}, Set) ->
                              ordsets:add_element({Cf,Cl}, Set);
                          (C, Set) ->
                              ordsets:add_element({C,C}, Set)
                      end, ordsets:new(), Cc),
    pack_crs(Crs).                              % An ordset IS a list!

%% pack_crs([CharRange]) -> [CharRange].
%%  Compact a sorted range list by merging contained, overlapping and
%%  adjacent ranges.
pack_crs([{C1,C2}=Cr,{C3,C4}|Crs]) when C1 =< C3, C2 >= C4 ->
    %% Range 2 lies inside range 1 - drop it.
    pack_crs([Cr|Crs]);
pack_crs([{C1,C2},{C3,C4}|Crs]) when C2 >= C3, C2 < C4 ->
    %% The ranges overlap - extend range 1 over both.
    pack_crs([{C1,C4}|Crs]);
pack_crs([{C1,C2},{C3,C4}|Crs]) when C2 + 1 =:= C3 ->
    %% The ranges are adjacent - fuse them.
    pack_crs([{C1,C4}|Crs]);
pack_crs([Cr|Crs]) -> [Cr|pack_crs(Crs)];
pack_crs([]) -> [].
1113
%% comp_class(CharClass) -> [CharRange].
%%  The complement of a character class: the ranges from 0 up to
%%  maxchar that the class does NOT cover.
comp_class(Cc) ->
    comp_crs(pack_cc(Cc), 0).
1119
%% comp_crs([CharRange], NextChar) -> [CharRange].
%%  Emit the gaps between the given sorted disjoint ranges, from
%%  NextChar up to maxchar.
comp_crs([{0,C2}|Crs], 0) ->                    % No gap before a range at 0
    comp_crs(Crs, C2+1);
comp_crs([{C1,C2}|Crs], Next) ->
    [{Next,C1-1}|comp_crs(Crs, C2+1)];
comp_crs([], Next) -> [{Next,maxchar}].
1125
1126%% build_dfa(NFA, NfaFirstState) -> {DFA,DfaFirstState}.
1127%%  Build a DFA from an NFA using "subset construction". The major
1128%%  difference from the book is that we keep the marked and unmarked
%%  DFA states in separate lists. New DFA states are added to the
1130%%  unmarked list and states are marked by moving them to the marked
1131%%  list. We assume that the NFA accepting state numbers are in
1132%%  ascending order for the rules and use ordsets to keep this order.
1133
build_dfa(NFA, Nf) ->
    %% DFA state 0 is the epsilon closure of the NFA start state.
    Start = #dfa_state{no=0,nfa=eclosure([Nf], NFA)},
    {build_dfa([Start], 1, [], NFA),0}.
1137
1138%% build_dfa([UnMarked], NextState, [Marked], NFA) -> DFA.
1139%%  Traverse the unmarked states. Temporarily add the current unmarked
1140%%  state to the marked list before calculating translation, this is
1141%%  to avoid adding too many duplicate states. Add it properly to the
1142%%  marked list afterwards with correct translations.
1143
build_dfa([U|Us0], N0, Ms, NFA) ->
    %% U is provisionally on the marked list while its transitions
    %% are computed, then replaced by the fully marked state.
    {Ts,Us1,N1} = build_dfa(U#dfa_state.nfa, Us0, N0, [], [U|Ms], NFA),
    Marked = U#dfa_state{trans=Ts,accept=accept(U#dfa_state.nfa, NFA)},
    build_dfa(Us1, N1, [Marked|Ms], NFA);
build_dfa([], _, Ms, _) -> Ms.
1149
1150%% build_dfa([NfaState], [Unmarked], NextState, [Transition], [Marked], NFA) ->
1151%%      {Transitions,UnmarkedStates,NextState}.
1152%%  Foreach NFA state set calculate the legal translations. N.B. must
1153%%  search *BOTH* the unmarked and marked lists to check if DFA state
1154%%  already exists. As the range of characters is potentially VERY
1155%%  large we cannot explicitly test all characters. Instead we first
1156%%  calculate the set of all disjoint character ranges which are
1157%%  possible candidates to the set of NFA states. The transitions are
1158%%  an orddict so we get the transition lists in ascending order.
1159
build_dfa(Set, Us, N, Ts, Ms, NFA) ->
    %% Every character range on a non-epsilon edge out of Set.
    Crs0 = [Cr || S <- Set,
                  {Crs,_St} <- (element(S, NFA))#nfa_state.edges,
                  Crs =/= epsilon,
                  Cr <- Crs],
    Crs1 = lists:usort(Crs0),                   % Must remove duplicates!
    %% Split into disjoint ranges and use those as the test ranges.
    build_dfa(disjoint_crs(Crs1), Set, Us, N, Ts, Ms, NFA).
1171
1172%% disjoint_crs([CharRange]) -> [CharRange].
1173%%  Take a sorted list of char ranges and make a sorted list of
1174%%  disjoint char ranges. No new char range extends past an existing
1175%%  char range.
1176
disjoint_crs([{_,C2}=R1,{C3,_}=R2|Rest]) when C2 < C3 ->
    %% C1  C2
    %%        C3  C4        Already disjoint.
    [R1|disjoint_crs([R2|Rest])];
disjoint_crs([{C1,C2},{C3,C4}|Rest]) when C1 =:= C3 ->
    %% C1     C2
    %% C3       C4          Same start: push back the tail C2+1..C4.
    [{C1,C2}|disjoint_crs(ordsets:add_element({C2+1,C4}, Rest))];
disjoint_crs([{C1,C2},{C3,C4}|Rest]) when C1 < C3, C2 >= C3, C2 < C4 ->
    %% C1     C2
    %%    C3     C4         Overlap: emit C1..C3-1, split at C2.
    [{C1,C3-1}|disjoint_crs(ordsets:union([{C3,C2},{C2+1,C4}], Rest))];
disjoint_crs([{C1,C2},{C3,C4}|Rest]) when C1 < C3, C2 =:= C4 ->
    %% C1      C2
    %%    C3   C4           Common end: emit C1..C3-1, push back C3..C4.
    [{C1,C3-1}|disjoint_crs(ordsets:add_element({C3,C4}, Rest))];
disjoint_crs([{C1,C2},{C3,C4}|Rest]) when C1 < C3, C2 > C4 ->
    %% C1        C2
    %%    C3   C4           Containment: emit C1..C3-1, split at C4.
    [{C1,C3-1}|disjoint_crs(ordsets:union([{C3,C4},{C4+1,C2}], Rest))];
disjoint_crs([R|Rest]) -> [R|disjoint_crs(Rest)];
disjoint_crs([]) -> [].
1199
build_dfa([Cr|Crs], Set, Us, N, Ts, Ms, NFA) ->
    case eclosure(move(Set, Cr, NFA), NFA) of
        [] ->                                   % No move on this range
            build_dfa(Crs, Set, Us, N, Ts, Ms, NFA);
        S ->
            case dfa_state_exist(S, Us, Ms) of
                {yes,T} ->                      % Known DFA state
                    build_dfa(Crs, Set, Us, N, orddict:store(Cr, T, Ts),
                              Ms, NFA);
                no ->                           % A brand new DFA state
                    U = #dfa_state{no=N,nfa=S},
                    build_dfa(Crs, Set, [U|Us], N+1, orddict:store(Cr, N, Ts),
                              Ms, NFA)
            end
    end;
build_dfa([], _, Us, N, Ts, _, _) ->
    {Ts,Us,N}.
1215
1216%% dfa_state_exist(Set, Unmarked, Marked) -> {yes,State} | no.
1217
dfa_state_exist(S, Us, Ms) ->
    %% The unmarked states are searched before the marked ones.
    case lists:keyfind(S, #dfa_state.nfa, Us ++ Ms) of
        #dfa_state{no=T} -> {yes,T};
        false -> no
    end.
1227
1228%% eclosure([State], NFA) -> [State].
1229%% move([State], Char, NFA) -> [State].
1230%%  These are straight out of the book. As eclosure uses ordsets then
1231%%  the generated state sets are in ascending order.
1232
eclosure(Sts, NFA) -> eclosure(Sts, NFA, ordsets:new()).

eclosure([St|Sts], NFA, Ec0) ->
    #nfa_state{edges=Es} = element(St, NFA),
    %% Queue the epsilon successors not already in the closure.
    New = [N || {epsilon,N} <- Es, not ordsets:is_element(N, Ec0)],
    eclosure(New ++ Sts, NFA, ordsets:add_element(St, Ec0));
eclosure([], _, Ec) -> Ec.
1241
move(Sts, Cr, NFA) ->
    [To || From <- Sts,
           {Crs,To} <- (element(From, NFA))#nfa_state.edges,
           Crs =/= epsilon,                     % Skip epsilon edges
           in_crs(Cr, Crs)].
1248
%% in_crs(CharRange, [CharRange]) -> boolean().
%%  Test whether the range is covered by, or exactly present in, one
%%  of the ranges in the list.
in_crs({C1,C2}, [{C3,C4}|_]) when C1 >= C3, C2 =< C4 -> true;
in_crs(Cr, [Cr|_]) -> true;                     % Exact match, e.g. bos/eos
in_crs(Cr, [_|Rest]) -> in_crs(Cr, Rest);
in_crs(_, []) -> false.
1253
1254%% accept([State], NFA) -> {accept,A} | noaccept.
1255%%  Scan down the state list until we find an accepting state.
1256
accept([St|Sts], NFA) ->
    case element(St, NFA) of
        #nfa_state{accept={accept,_}=A} -> A;   % First accepting state wins
        #nfa_state{accept=noaccept} -> accept(Sts, NFA)
    end;
accept([], _) -> noaccept.
1263
1264%% minimise_dfa(DFA, DfaFirst) -> {DFA,DfaFirst}.
%%  Minimise the DFA by removing equivalent states. Two states are
%%  equivalent if both their transitions and their accept status are
%%  the same. First repeatedly run through the DFA state list removing
1268%%  equivalent states and updating remaining transitions with
1269%%  remaining equivalent state numbers. When no more reductions are
1270%%  possible then pack the remaining state numbers to get consecutive
1271%%  states.
1272
minimise_dfa(DFA0, Df0) ->
    case min_dfa(DFA0) of
        {DFA1,[]} ->                    % No reduction!
            %% Fixpoint reached: renumber the states consecutively
            %% and apply those rewrites to transitions and start state.
            {DFA2,Rs} = pack_dfa(DFA1),
            {min_update(DFA2, Rs),min_use(Df0, Rs)};
        {DFA1,Rs} ->
            %% Some states were merged; apply the rewrites and retry.
            minimise_dfa(min_update(DFA1, Rs), min_use(Df0, Rs))
    end.
1281
%% min_dfa(DFA) -> {DFA,Rewrites}.
%%  One reduction pass: for each state, delete all later states that
%%  are equivalent to it, accumulating the rewrites.
min_dfa(DFA) -> min_dfa(DFA, [], []).

min_dfa([D|DFA0], Rs0, Done) ->
    {DFA1,Rs1} = min_delete(DFA0, D#dfa_state.trans, D#dfa_state.accept,
                            D#dfa_state.no, Rs0, []),
    min_dfa(DFA1, Rs1, [D|Done]);
min_dfa([], Rs, Done) -> {Done,Rs}.
1289
1290%% min_delete(States, Trans, Action, NewN, Rs, MiniDFA) -> {MiniDFA,Rs}.
%%  Delete all states with the same transitions and action. Return
1292%%  rewrites and minimised DFA with no duplicate states.
1293
%% N.B. the first clause head repeats the bound T and A, so it only
%% matches states whose transitions and accept are exactly those of
%% the state being kept; such duplicates are rewritten to NewN.
min_delete([#dfa_state{no=N,trans=T,accept=A}|DFA], T, A, NewN, Rs, MDFA) ->
    min_delete(DFA, T, A, NewN, [{N,NewN}|Rs], MDFA);
min_delete([D|DFA], T, A, NewN, Rs, MDFA) ->
    min_delete(DFA, T, A, NewN, Rs, [D|MDFA]);
min_delete([], _, _, _, Rs, MDFA) -> {MDFA,Rs}.
1299
%% min_update(DFA, Rewrites) -> DFA.
%%  Apply the state-number rewrites to every transition in the DFA.
min_update(DFA, Rs) ->
    [ D#dfa_state{trans=min_update_trans(D#dfa_state.trans, Rs)} || D <- DFA ].

min_update_trans(Tr, Rs) ->
    [ {C,min_use(S, Rs)} || {C,S} <- Tr ].
1305
%% min_use(State, Rewrites) -> State.
%%  Map a state number through the rewrite list; the first matching
%%  rewrite wins and an unlisted state is unchanged.
min_use(Old, Rs) ->
    case lists:keyfind(Old, 1, Rs) of
        {Old,New} -> New;
        false -> Old
    end.
1309
%% pack_dfa(DFA) -> {DFA,Rewrites}.
%%  Renumber the remaining DFA states consecutively from 0,
%%  returning the rewrites needed to patch up the transitions.
pack_dfa(DFA) -> pack_dfa(DFA, 0, [], []).

pack_dfa([D|DFA], NewN, Rs, PDFA) ->
    pack_dfa(DFA, NewN+1,
             [{D#dfa_state.no,NewN}|Rs], [D#dfa_state{no=NewN}|PDFA]);
pack_dfa([], _, Rs, PDFA) -> {PDFA,Rs}.
1316
1317%% The main output is the yystate function which is built from the
1318%% DFA. It has the spec:
1319%%
1320%% yystate() -> InitialState.
1321%% yystate(State, InChars, Line, CurrTokLen, AcceptAction, AcceptLen) ->
1322%%      {Action, AcceptLength, RestChars, Line} |         Accepting end state
1323%%      {Action, AcceptLength, RestChars, Line, State} |  Accepting state
1324%%      {reject, AcceptLength, CurrTokLen, RestChars, Line, State} |
1325%%      {Action, AcceptLength, CurrTokLen, RestChars, Line, State}.
1326
1327%% The return CurrTokLen is always the current number of characters
1328%% scanned in the current token. The returns have the following
1329%% meanings:
1330%% {Action, AcceptLength, RestChars, Line} -
1331%%  The scanner has reached an accepting end-state, for example after
1332%%  a regexp "abc". Action is the action number and AcceptLength is
1333%%  the length of the matching token.
1334%%
1335%% {Action, AcceptLength, RestChars, Line, State} -
1336%%  The scanner has reached an accepting transition state, for example
1337%%  after c in regexp "abc(xyz)?", continuation depends on
1338%%  RestChars. If RestChars == [] (no more current characters) then we
1339%%  need to get more characters to see if it is an end-state,
1340%%  otherwise (eof or chars) then we have not found continuing
1341%%  characters and it is an end state.
1342%%
1343%% {reject, AcceptLength, CurrTokLen, RestChars, Line, State} -
1344%% {Action, AcceptLength, CurrTokLen, RestChars, Line, State} -
1345%%  The scanner has reached a non-accepting transition state. If
1346%%  RestChars == [] we need to get more characters to continue.
1347%%  Otherwise if 'reject' then no accepting state has been reached it
1348%%  is an error. If we have an Action and AcceptLength then these are
1349%%  the last accept state, use them and continue from there.
1350
1351%% out_file(LeexState, DFA, DfaStart, [Action], Code) -> ok | error.
1352%%  Generate an output .erl file from the include file, the DFA and
1353%%  the code for the actions.
1354
out_file(St0, DFA, DF, Actions, Code) ->
    verbose_print(St0, "Writing file ~ts, ", [St0#leex.efile]),
    case open_inc_file(St0) of
        {ok,Ifile} ->
            %% The after clauses guarantee both files are closed,
            %% whatever happens while generating the output.
            try
                case file:open(St0#leex.efile, [write]) of
                    {ok,Ofile} ->
                        set_encoding(St0, Ofile),
                        try
                            output_encoding_comment(Ofile, St0),
                            output_file_directive(Ofile, St0#leex.ifile, 0),
                            out_file(Ifile, Ofile, St0, DFA, DF, Actions,
                                     Code, 1),
                            verbose_print(St0, "ok~n", []),
                            St0
                        after ok = file:close(Ofile)
                        end;
                    {error,Error} ->
                        verbose_print(St0, "error~n", []),
                        add_error({none,leex,{file_error,Error}}, St0)
                end
            after ok = file:close(Ifile)
            end;
        {{error,Error},Ifile} ->
            add_error(Ifile, {none,leex,{file_error,Error}}, St0)
    end.
1381
%% open_inc_file(State) -> {ok,File} | {{error,Error},FileName}.
%%  Open the include (skeleton) file, letting epp pick up any
%%  encoding comment in it.
open_inc_file(State) ->
    Ifile = State#leex.ifile,
    case file:open(Ifile, [read]) of
        {ok,F} ->
            _ = epp:set_encoding(F),
            {ok,F};
        Error -> {Error,Ifile}
    end.
1390
%% inc_file_name(FileName) -> FileName.
%%  The skeleton file to use; an empty name selects the default one
%%  in parsetools/include.
inc_file_name([]) ->
    filename:join([code:lib_dir(parsetools), "include", ?LEEXINC]);
inc_file_name(Filename) ->
    Filename.
1396
1397%% out_file(IncFile, OutFile, State, DFA, DfaStart, Actions, Code, Line) -> ok
1398%%  Copy the include file line by line substituting special lines with
1399%%  generated code. We cheat by only looking at the first 5
1400%%  characters.
1401
out_file(Ifile, Ofile, St, DFA, DF, Actions, Code, L) ->
    case io:get_line(Ifile, leex) of
        eof -> output_file_directive(Ofile, St#leex.ifile, L);
        {error, _} -> add_error(St#leex.ifile, {L, leex, cannot_parse}, St);
        Line ->
            %% Lines beginning with a ## marker are replaced by
            %% generated code; everything else is copied verbatim.
            case string:slice(Line, 0, 5) of
                "##mod" -> out_module(Ofile, St);
                "##cod" -> out_erlang_code(Ofile, St, Code, L);
                "##dfa" -> out_dfa(Ofile, St, DFA, Code, DF, L);
                "##act" -> out_actions(Ofile, St#leex.xfile, Actions);
                _ -> io:put_chars(Ofile, Line)
            end,
            out_file(Ifile, Ofile, St, DFA, DF, Actions, Code, L+1)
    end.
1416
%% out_module(File, State) -> ok.
%%  Output the -module attribute for the generated scanner.
out_module(File, St) ->
    io:fwrite(File, "-module(~w).\n", [St#leex.module]).

%% out_erlang_code(File, State, Code, Line) -> ok.
%%  Copy the user's Erlang code section from the .xrl file into the
%%  output, bracketed by file directives so compiler messages point
%%  at the right source.
out_erlang_code(File, St, Code, L) ->
    {CodeL,CodePos,_NCodeLines} = Code,
    output_file_directive(File, St#leex.xfile, CodeL),
    {ok,Xfile} = file:open(St#leex.xfile, [read]),
    try
        set_encoding(St, Xfile),
        {ok,_} = file:position(Xfile, CodePos),
        ok = file_copy(Xfile, File)
    after
        ok = file:close(Xfile)
    end,
    io:nl(File),
    output_file_directive(File, St#leex.ifile, L).

%% file_copy(FromFile, ToFile) -> ok.
%%  Copy FromFile to ToFile, line by line, from the current position.
file_copy(From, To) ->
    case io:get_line(From, leex) of
        eof -> ok;
        Line when is_list(Line) ->
            io:fwrite(To, "~ts", [Line]),
            file_copy(From, To)
    end.
1441
%% out_dfa(File, State, DFA, Code, DfaStart, Line) -> ok.
%%  Output yystate/0 and all the yystate/6 clauses generated from
%%  the DFA states.
out_dfa(File, St, DFA, Code, DF, L) ->
    {_CodeL,_CodePos,NCodeLines} = Code,
    %% Three file attributes before this one...
    output_file_directive(File, St#leex.efile, L+(NCodeLines-1)+3),
    io:fwrite(File, "yystate() -> ~w.~n~n", [DF]),
    foreach(fun (S) -> out_trans(File, S) end, DFA),
    %% The catch-all clause reports the current state to the driver.
    io:fwrite(File, "yystate(S, Ics, Line, Tlen, Action, Alen) ->~n", []),
    io:fwrite(File, "    {Action,Alen,Tlen,Ics,Line,S}.~n", []).

%% out_trans(File, DfaState) -> ok.
%%  Output the yystate/6 clauses for one DFA state.
out_trans(File, #dfa_state{no=N,trans=[],accept={accept,A}}) ->
    %% Accepting end state, guaranteed done.
    io:fwrite(File, "yystate(~w, Ics, Line, Tlen, _, _) ->~n", [N]),
    io:fwrite(File, "    {~w,Tlen,Ics,Line};~n", [A]);
out_trans(File, #dfa_state{no=N,trans=Tr,accept={accept,A}}) ->
    %% Accepting state, but there may be more.
    foreach(fun (T) -> out_accept_tran(File, N, A, T) end, pack_trans(Tr)),
    io:fwrite(File, "yystate(~w, Ics, Line, Tlen, _, _) ->~n", [N]),
    io:fwrite(File, "    {~w,Tlen,Ics,Line,~w};~n", [A,N]);
out_trans(File, #dfa_state{no=N,trans=Tr,accept=noaccept}) ->
    %% Non-accepting transition state.
    foreach(fun (T) -> out_noaccept_tran(File, N, T) end, pack_trans(Tr)),
    io:fwrite(File, "yystate(~w, Ics, Line, Tlen, Action, Alen) ->~n", [N]),
    io:fwrite(File, "    {Action,Alen,Tlen,Ics,Line,~w};~n", [N]).
1465
%% out_accept_tran(File, State, Action, Transition) -> ok.
%%  Output one yystate/6 clause for a transition out of an accepting
%%  state. A transition on $\n is special-cased so the generated
%%  clause increments Line.
out_accept_tran(File, N, A, {{Cf,maxchar},S}) ->
    out_accept_head_max(File, N, Cf),
    out_accept_body(File, S, "Line", A);
out_accept_tran(File, N, A, {{Cf,Cl},S}) ->
    out_accept_head_range(File, N, Cf, Cl),
    out_accept_body(File, S, "Line", A);
out_accept_tran(File, N, A, {$\n,S}) ->
    out_accept_head_1(File, N, $\n),
    out_accept_body(File, S, "Line+1", A);
out_accept_tran(File, N, A, {C,S}) ->
    out_accept_head_1(File, N, C),
    out_accept_body(File, S, "Line", A).

%% In an accepting state the old Action/Alen are dead, so the
%% generated heads ignore them ("_").
out_accept_head_1(File, State, Char) ->
    out_head_1(File, State, Char, "_", "_").

out_accept_head_max(File, State, Min) ->
    out_head_max(File, State, Min, "_", "_").

out_accept_head_range(File, State, Min, Max) ->
    out_head_range(File, State, Min, Max, "_", "_").

out_accept_body(File, Next, Line, Action) ->
    out_body(File, Next, Line, io_lib:write(Action), "Tlen").

%% out_noaccept_tran(File, State, Transition) -> ok.
%%  As out_accept_tran/4 but for a non-accepting state: the clauses
%%  thread the incoming Action and Alen through unchanged.
out_noaccept_tran(File, N, {{Cf,maxchar},S}) ->
    out_noaccept_head_max(File, N, Cf),
    out_noaccept_body(File, S, "Line");
out_noaccept_tran(File, N, {{Cf,Cl},S}) ->
    out_noaccept_head_range(File, N, Cf, Cl),
    out_noaccept_body(File, S, "Line");
out_noaccept_tran(File, N, {$\n,S}) ->
    out_noaccept_head_1(File, N, $\n),
    out_noaccept_body(File, S, "Line+1");
out_noaccept_tran(File, N, {C,S}) ->
    out_noaccept_head_1(File, N, C),
    out_noaccept_body(File, S, "Line").

out_noaccept_head_1(File, State, Char) ->
    out_head_1(File, State, Char, "Action", "Alen").

out_noaccept_head_max(File, State, Min) ->
    out_head_max(File, State, Min, "Action", "Alen").

out_noaccept_head_range(File, State, Min, Max) ->
    out_head_range(File, State, Min, Max, "Action", "Alen").

out_noaccept_body(File, Next, Line) ->
    out_body(File, Next, Line, "Action", "Alen").

%% Clause head generators: single character, open-ended range
%% (C >= Min) and bounded range (Min =< C =< Max).
out_head_1(File, State, Char, Action, Alen) ->
    io:fwrite(File, "yystate(~w, [~w|Ics], Line, Tlen, ~s, ~s) ->\n",
              [State,Char,Action,Alen]).

out_head_max(File, State, Min, Action, Alen) ->
    io:fwrite(File, "yystate(~w, [C|Ics], Line, Tlen, ~s, ~s) when C >= ~w ->\n",
              [State,Action,Alen,Min]).

out_head_range(File, State, Min, Max, Action, Alen) ->
    io:fwrite(File, "yystate(~w, [C|Ics], Line, Tlen, ~s, ~s) when C >= ~w, C =< ~w ->\n",
              [State,Action,Alen,Min,Max]).

out_body(File, Next, Line, Action, Alen) ->
    io:fwrite(File, "    yystate(~w, Ics, ~s, Tlen+1, ~s, ~s);\n",
              [Next,Line,Action,Alen]).
1531
1532%% pack_trans([{Crange,State}]) -> [{Crange,State}] when
1533%%      Crange = {Char,Char} | Char.
1534%%  Pack the translation table into something more suitable for
1535%%  generating code. We KNOW how the pattern matching compiler works
1536%%  so solitary characters are stored before ranges. We do this by
1537%%  prepending singletons to the front of the packed transitions and
1538%%  appending ranges to the back. This preserves the smallest to
1539%%  largest order of ranges. Newline characters, $\n, are always
%%  extracted and handled as singletons.
1541
pack_trans(Trs) -> pack_trans(Trs, []).

pack_trans([{{C,C},S}|Trs], Acc) ->             % Singletons to the head
    %% Don't add the same singleton transition twice.
    case lists:member({C,S}, Acc) of
        true -> pack_trans(Trs, Acc);
        false -> pack_trans(Trs, [{C,S}|Acc])
    end;
%% $\n is always handled as a singleton: split off any range end
%% touching it and re-process the remainder.
pack_trans([{{Cf,$\n},S}|Trs], Acc) ->
    pack_trans([{{Cf,$\n-1},S}|Trs], [{$\n,S}|Acc]);
pack_trans([{{$\n,Cl},S}|Trs], Acc) ->
    pack_trans([{{$\n+1,Cl},S}|Trs], [{$\n,S}|Acc]);
pack_trans([{{Cf,Cl},S}|Trs], Acc) when Cf < $\n, Cl > $\n ->
    pack_trans([{{Cf,$\n-1},S},{{$\n+1,Cl},S}|Trs], [{$\n,S}|Acc]);
%% A two-character range is packed as two singletons.
pack_trans([{{Cf,Cl},S}|Trs], Acc) when Cl =:= Cf + 1 ->
    pack_trans(Trs, [{Cf,S},{Cl,S}|Acc]);
pack_trans([Tr|Trs], Acc) ->
    %% True ranges are appended so they stay behind the singletons
    %% and keep their smallest-to-largest order.
    pack_trans(Trs, Acc ++ [Tr]);
pack_trans([], Acc) -> Acc.
1567
1568%% out_actions(File, XrlFile, ActionList) -> ok.
1569%% Write out the action table.
1570
%% out_actions(File, XrlFile, ActionList) -> ok.
%%  Emit the yyaction/4 dispatch clauses, a final error clause, and
%%  then the per-action helper functions.
out_actions(File, XrlFile, As) ->
    Prepped = prep_out_actions(As),
    lists:foreach(fun (A) -> out_action(File, A) end, Prepped),
    io:fwrite(File, "yyaction(_, _, _, _) -> error.~n", []),
    lists:foreach(fun (A) -> out_action_code(File, XrlFile, A) end, Prepped).
1576
prep_out_actions(As) ->
    [prep_out_action(A) || A <- As].

%% prep_out_action(Action) -> PreppedAction.
%%  Precompute everything out_action/2 and out_action_code/3 need for
%%  one action: which pseudo variables are used, the generated helper
%%  name, and its argument list (as a list and as a comma string).
prep_out_action({A,empty_action}) ->
    {A,empty_action};
prep_out_action({A,Code,TokenChars,TokenLen,TokenLine}) ->
    Used = [{TokenChars,"TokenChars"},
            {TokenLen,"TokenLen"},
            {TokenLine,"TokenLine"},
            {TokenChars,"YYtcs"},
            {TokenLen or TokenChars,"TokenLen"}],
    %% Unused pseudo variables become "_" in the generated clause head.
    Vars = [case U of true -> V; false -> "_" end || {U,V} <- Used],
    [Chars,Len,Line|_] = Vars,
    Args = [V || V <- [Chars,Len,Line], V =/= "_"],
    Name = list_to_atom(lists:concat([yyaction_,A])),
    {A,Code,Vars,Name,Args,lists:join(", ", Args)}.
1593
%% out_action(File, PrepAction) -> ok.
%%  Emit one yyaction/4 dispatch clause; when the action uses
%%  TokenChars the clause first materialises it from YYtcs.
out_action(File, {A,empty_action}) ->
    io:fwrite(File, "yyaction(~w, _, _, _) -> skip_token;~n", [A]);
out_action(File, {A,_Code,Vars,Name,_Args,ArgsChars}) ->
    [_,_,Line,Tcs,Len] = Vars,
    io:fwrite(File, "yyaction(~w, ~s, ~s, ~s) ->~n", [A,Len,Tcs,Line]),
    case Tcs of
        "_" -> ok;
        _ ->
            io:fwrite(File, "    TokenChars = yypre(YYtcs, TokenLen),~n", [])
    end,
    io:fwrite(File, "    ~s(~s);~n", [Name, ArgsChars]).
1605
%% out_action_code(File, XrlFile, PrepAction) -> ok.
%%  Write the helper function implementing one user action, preceded
%%  by an inline compile directive and a -file() directive pointing
%%  back at the original .xrl source line so compiler diagnostics
%%  refer to the user's own code.
out_action_code(_File, _XrlFile, {_A,empty_action}) ->
    ok;
out_action_code(File, XrlFile, {_A,Code,_Vars,Name,Args,ArgsChars}) ->
    %% Should set the file to the .erl file, but instead assumes that
    %% ?LEEXINC is syntactically correct.
    io:fwrite(File, "\n-compile({inline,~w/~w}).\n", [Name, length(Args)]),
    %% Line of the first token of the action code in the .xrl file.
    L = erl_scan:line(hd(Code)),
    output_file_directive(File, XrlFile, L-2),
    io:fwrite(File, "~s(~s) ->~n", [Name, ArgsChars]),
    io:fwrite(File, "    ~ts\n", [pp_tokens(Code, L, File)]).
1616
1617%% pp_tokens(Tokens, Line, File) -> [char()].
1618%%  Prints the tokens keeping the line breaks of the original code.
1619
pp_tokens(Tokens, Line0, File) -> pp_tokens(Tokens, Line0, File, none).

pp_tokens([], _CurLine, _File, _Prev) -> [];
pp_tokens([Tok|Toks], CurLine, File, Prev) ->
    TokLine = erl_scan:line(Tok),
    %% Separator first (may emit newlines/indent), then the token text.
    [pp_sep(TokLine, CurLine, Prev, Tok), pp_symbol(Tok, File) |
     pp_tokens(Toks, TokLine, File, Tok)].
1627
%% Variables keep their printed name, end-of-form dot prints as ".",
%% any other token with a value goes through format_symbol/2, and
%% bare punctuation/keyword tokens print their atom.
pp_symbol({var,_,Name}, _File) -> atom_to_list(Name);
pp_symbol({dot,_}, _File) -> ".";
pp_symbol({_,_,Value}, File) -> format_symbol(Value, File);
pp_symbol({Name,_}, _File) -> atom_to_list(Name).
1632
%% pp_sep(Line, Line0, PrevToken, Token) -> chardata().
%%  Newline plus indent for every source line skipped; otherwise a
%%  single space, suppressed around tight-binding punctuation.
pp_sep(Line, Line0, Prev, T) when Line > Line0 ->
    ["\n    " | pp_sep(Line - 1, Line0, Prev, T)];
pp_sep(_, _, {P,_}, _) when P =:= '.'; P =:= '#';
                            P =:= '('; P =:= '[' ->
    "";                                 % No space after these
pp_sep(_, _, _, {P,_}) when P =:= '.'; P =:= '#';
                            P =:= ','; P =:= ')' ->
    "";                                 % No space before these
pp_sep(_, _, _, _) -> " ".
1644
1645%% out_dfa_graph(LeexState, DFA, DfaStart) -> ok | error.
1646%%  Writes the DFA to a .dot file in DOT-format which can be viewed
1647%%  with Graphviz.
1648
out_dfa_graph(St, DFA, DF) ->
    verbose_print(St, "Writing DFA to file ~ts, ", [St#leex.gfile]),
    case file:open(St#leex.gfile, [write]) of
        {ok,Gfile} ->
            try
                %% Set the same encoding as infile:
                set_encoding(St, Gfile),
                io:fwrite(Gfile, "digraph DFA {~n", []),
                out_dfa_states(Gfile, DFA, DF),
                out_dfa_edges(Gfile, DFA),
                io:fwrite(Gfile, "}~n", []),
                verbose_print(St, "ok~n", []),
                %% On success the leex state is returned unchanged.
                St
            %% Always close the file, whether the writes succeeded or not.
            after ok = file:close(Gfile)
            end;
        {error,Error} ->
            %% Could not open the .dot file: record the error in the state.
            verbose_print(St, "error~n", []),
            add_error({none,leex,{file_error,Error}}, St)
    end.
1668
%% out_dfa_states(File, DFA, StartState) -> ok.
%%  Emit one DOT node declaration per DFA state, then a blank line.
out_dfa_states(File, DFA, DF) ->
    lists:foreach(fun (St) -> out_dfa_state(File, DF, St) end, DFA),
    io:fwrite(File, "~n", []).
1672
%% out_dfa_state(File, StartState, DfaState) -> ok.
%%  Accepting states are drawn as double circles; the start state is
%%  additionally coloured green.
out_dfa_state(File, Start, #dfa_state{no=N, accept={accept,_}}) ->
    out_dfa_node(File, N, "doublecircle", N =:= Start);
out_dfa_state(File, Start, #dfa_state{no=N, accept=noaccept}) ->
    out_dfa_node(File, N, "circle", N =:= Start).

%% out_dfa_node(File, No, Shape, IsStart) -> ok.
out_dfa_node(File, N, Shape, true) ->
    io:fwrite(File, "  ~b [shape=~s color=green];~n", [N, Shape]);
out_dfa_node(File, N, Shape, false) ->
    io:fwrite(File, "  ~b [shape=~s];~n", [N, Shape]).
1681
%% out_dfa_edges(File, DFA) -> ok.
%%  For each state, group its (packed) transitions by target state so
%%  every pair of connected states gets a single edge whose label
%%  lists all the characters/ranges leading there.
out_dfa_edges(File, DFA) ->
    foreach(fun (#dfa_state{no=S,trans=Trans}) ->
                    Pt = pack_trans(Trans),
                    %% Tdict: target state -> list of char ranges.
                    Tdict = foldl(fun ({Cr,T}, D) ->
                                          orddict:append(T, Cr, D)
                                  end, orddict:new(), Pt),
                    foreach(fun (T) ->
                                    Crs = orddict:fetch(T, Tdict),
                                    Edgelab = dfa_edgelabel(Crs, File),
                                    io:fwrite(File, "  ~b -> ~b [label=\"~ts\"];~n",
                                              [S,T,Edgelab])
                            end, sort(orddict:fetch_keys(Tdict)))
            end, DFA).
1695
%% dfa_edgelabel(Cranges, File) -> chardata().
%%  A single character prints bare; otherwise render a character-class
%%  style "[a-z0...]" label from the ranges and singletons.
dfa_edgelabel([C], File) when is_integer(C) -> quote(C, File);
dfa_edgelabel(Cranges, File) ->
    Parts = [case Cr of
                 {From,To} -> [quote(From, File), "-", quote(To, File)];
                 C -> [quote(C, File)]
             end || Cr <- Cranges],
    "[" ++ Parts ++ "]".
1702
%% set_encoding(LeexState, IoDevice) -> ok.
%%  Put the device into the encoding recorded in the state, falling
%%  back to epp's default when none was recorded.
set_encoding(#leex{encoding = Enc}, File) ->
    Actual = case Enc of
                 none -> epp:default_encoding();
                 E -> E
             end,
    ok = io:setopts(File, [{encoding, Actual}]).
1707
%% output_encoding_comment(File, LeexState) -> ok.
%%  Write the encoding as a %% comment line (text supplied by
%%  epp:encoding_to_string/1); nothing when no encoding was recorded.
output_encoding_comment(_File, #leex{encoding = none}) -> ok;
output_encoding_comment(File, #leex{encoding = Enc}) ->
    io:fwrite(File, <<"%% ~s\n">>, [epp:encoding_to_string(Enc)]).
1712
%% output_file_directive(File, Filename, Line) -> ok.
%%  Emit a -file(Name, Line) attribute with the name quoted to match
%%  the output device's encoding.
output_file_directive(File, Filename, Line) ->
    Quoted = format_filename(Filename, File),
    io:fwrite(File, <<"-file(~ts, ~w).\n">>, [Quoted, Line]).
1716
%% format_filename(Filename, File) -> chardata().
%%  Flatten the (possibly deep) filename and quote it as a string
%%  literal in the encoding of the output device.
format_filename(Filename0, File) ->
    Flat = filename:flatten(Filename0),
    case enc(File) of
        latin1  -> io_lib:write_string_as_latin1(Flat);
        unicode -> io_lib:write_string(Flat)
    end.
1723
%% format_symbol(Symbol, File) -> chardata().
%%  Pretty-print a token value, translating (~tp) only when the
%%  output device is unicode-capable.
format_symbol(Symbol, File) ->
    case enc(File) of
        latin1  -> io_lib:fwrite("~p", [Symbol]);
        unicode -> io_lib:fwrite("~tp", [Symbol])
    end.
1730
%% enc(IoDevice) -> latin1 | unicode.
%%  Read the device's encoding option, defaulting to latin1 if it is
%%  somehow unset (should never happen).
enc(File) ->
    case lists:keyfind(encoding, 1, io:getopts(File)) of
        {encoding, Enc} -> Enc;
        false -> latin1
    end.
1736
%% quote(Char | maxchar, File) -> string().
%%  Render a character for use inside the double-quoted DOT edge label
%%  built by dfa_edgelabel/2.  Character-class metacharacters get one
%%  backslash; control characters get a doubled backslash (the Erlang
%%  literal "\\\\n" is the four chars \\n in the label source) --
%%  presumably so one escape level survives DOT's own string parsing;
%%  confirm against Graphviz label quoting rules.
quote($^, _File)  -> "\\^";
quote($., _File)  -> "\\.";
quote($$, _File)  -> "\\$";
quote($-, _File)  -> "\\-";
quote($[, _File)  -> "\\[";
quote($], _File)  -> "\\]";
quote($\s, _File) -> "\\\\s";
quote($\", _File) -> "\\\"";
quote($\b, _File) -> "\\\\b";
quote($\f, _File) -> "\\\\f";
quote($\n, _File) -> "\\\\n";
quote($\r, _File) -> "\\\\r";
quote($\t, _File) -> "\\\\t";
quote($\e, _File) -> "\\\\e";
quote($\v, _File) -> "\\\\v";
quote($\d, _File) -> "\\\\d";
quote($\\, _File) -> "\\\\";
quote(C, File) when is_integer(C) ->
    %% Must remove the $ and get the \'s right.
    %% io_lib:write_char/1 yields e.g. "$a" or "$\\n"; strip the "$"
    %% and double a leading backslash for the label.
    S = case enc(File) of
            unicode -> io_lib:write_char(C);
            latin1  -> io_lib:write_char_as_latin1(C)
        end,
    case S of
        [$$,$\\|Cs] -> "\\\\" ++ Cs;
        [$$|Cs] -> Cs
    end;
quote(maxchar, _File) ->
    "MAXCHAR".
1766