1%% Copyright (c) 2008,2009 Robert Virding. All rights reserved. 2%% 3%% Redistribution and use in source and binary forms, with or without 4%% modification, are permitted provided that the following conditions 5%% are met: 6%% 7%% 1. Redistributions of source code must retain the above copyright 8%% notice, this list of conditions and the following disclaimer. 9%% 2. Redistributions in binary form must reproduce the above copyright 10%% notice, this list of conditions and the following disclaimer in the 11%% documentation and/or other materials provided with the distribution. 12%% 13%% THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 14%% "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 15%% LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 16%% FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 17%% COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 18%% INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 19%% BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20%% LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 21%% CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22%% LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 23%% ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 24%% POSSIBILITY OF SUCH DAMAGE. 25 26%%% A Lexical Analyser Generator for Erlang. 27%%% 28%%% Most of the algorithms used here are taken pretty much as 29%%% described in the "Dragon Book" by Aho, Sethi and Ullman. Some 30%%% completing details were taken from "Compiler Design in C" by 31%%% Hollub. 32 33-module(leex). 34 35-export([compile/3,file/1,file/2,format_error/1]). 36 37-import(lists, [member/2,reverse/1,sort/1,keysort/2, 38 map/2,foldl/3,foldr/3,foreach/2,flatmap/2]). 39-import(ordsets, [is_element/2,add_element/2,union/2]). 40-import(orddict, [store/3]). 41 42-include("erl_compile.hrl"). 43%%-include_lib("stdlib/include/erl_compile.hrl"). 44 45-define(LEEXINC, "leexinc.hrl"). % Include file 46-define(LEEXLIB, parsetools). % Leex is in lib parsetools 47%%-define(LEEXLIB, leex). % Leex is in lib leex 48 49-define(DEFS_HEAD, "Definitions."). 50-define(RULE_HEAD, "Rules."). 51-define(CODE_HEAD, "Erlang code."). 52 53-record(leex, {xfile=[], % Xrl file 54 efile=[], % Erl file 55 ifile=[], % Include file 56 gfile=[], % Graph file 57 module, % Module name 58 opts=[], % Options 59 encoding=none, % Encoding of Xrl file 60 % posix=false, % POSIX regular expressions 61 errors=[], 62 warnings=[] 63 }). 64 65-record(nfa_state, {no,edges=[],accept=noaccept}). 66-record(dfa_state, {no,nfa=[],trans=[],accept=noaccept}). 67 68%%% 69%%% Exported functions 70%%% 71 72%%% Interface to erl_compile. 73 74compile(Input0, Output0, 75 #options{warning = WarnLevel, verbose=Verbose, includes=Includes, 76 specific=Specific}) -> 77 Input = assure_extension(shorten_filename(Input0), ".xrl"), 78 Output = assure_extension(shorten_filename(Output0), ".erl"), 79 Includefile = lists:sublist(Includes, 1), 80 Werror = proplists:get_bool(warnings_as_errors, Specific), 81 Opts = [{scannerfile,Output},{includefile,Includefile},{verbose,Verbose}, 82 {report_errors,true},{report_warnings,WarnLevel > 0}, 83 {warnings_as_errors, Werror}], 84 case file(Input, Opts) of 85 {ok, _} -> 86 ok; 87 error -> 88 error 89 end. 90 91-type error_info() :: {erl_anno:line() | 'none', 92 module(), ErrorDescriptor :: term()}. 93-type errors() :: [{file:filename(), [error_info()]}]. 94-type warnings() :: [{file:filename(), [error_info()]}]. 95-type ok_ret() :: {'ok', Scannerfile :: file:filename()} 96 | {'ok', Scannerfile :: file:filename(), warnings()}. 97-type error_ret() :: 'error' 98 | {'error', Errors :: errors(), Warnings :: warnings()}. 99-type leex_ret() :: ok_ret() | error_ret(). 100 101-spec file(FileName) -> leex_ret() when 102 FileName :: file:filename(). 103 104file(File) -> file(File, []). 105 106-spec file(FileName, Options) -> leex_ret() when 107 FileName :: file:filename(), 108 Options :: Option | [Option], 109 Option :: {'dfa_graph', boolean()} 110 | {'includefile', Includefile :: file:filename()} 111 | {'report_errors', boolean()} 112 | {'report_warnings', boolean()} 113 | {'report', boolean()} 114 | {'return_errors', boolean()} 115 | {'return_warnings', boolean()} 116 | {'return', boolean()} 117 | {'scannerfile', Scannerfile :: file:filename()} 118 | {'verbose', boolean()} 119 | {'warnings_as_errors', boolean()} 120 | 'dfa_graph' 121 | 'report_errors' | 'report_warnings' | 'report' 122 | 'return_errors' | 'return_warnings' | 'return' 123 | 'verbose' | 'warnings_as_errors'. 124 125file(File, Opts0) when is_list(Opts0) -> 126 case is_filename(File) of 127 no -> erlang:error(badarg, [File,Opts0]); 128 _ -> ok 129 end, 130 EnvOpts0 = env_default_opts(), 131 EnvOpts = select_recognized_opts(EnvOpts0), 132 Opts1 = Opts0 ++ EnvOpts, 133 Opts = case options(Opts1) of 134 badarg -> 135 erlang:error(badarg, [File,Opts0]); 136 Options -> 137 Options 138 end, 139 St0 = #leex{}, 140 St1 = filenames(File, Opts, St0), % Get all the filenames 141 St = try 142 {ok,REAs,Actions,Code,St2} = parse_file(St1), 143 {DFA,DF} = make_dfa(REAs, St2), 144 case werror(St2) of 145 false -> 146 St3 = out_file(St2, DFA, DF, Actions, Code), 147 case lists:member(dfa_graph, St3#leex.opts) of 148 true -> out_dfa_graph(St3, DFA, DF); 149 false -> St3 150 end; 151 true -> 152 St2 153 end 154 catch #leex{}=St4 -> 155 St4 156 end, 157 leex_ret(St); 158file(File, Opt) -> 159 file(File, [Opt]). 160 161-spec format_error(ErrorDescriptor) -> io_lib:chars() when 162 ErrorDescriptor :: term(). 163 164format_error({file_error, Reason}) -> 165 io_lib:fwrite("~ts",[file:format_error(Reason)]); 166format_error(missing_defs) -> "missing Definitions"; 167format_error(missing_rules) -> "missing Rules"; 168format_error(missing_code) -> "missing Erlang code"; 169format_error(empty_rules) -> "no rules"; 170format_error(bad_rule) -> "bad rule"; 171format_error({regexp,E})-> 172 Es = case E of 173 {interval_range,_} -> "interval range"; 174 {unterminated,Cs} -> 175 "unterminated " ++ Cs; 176 {illegal_char,Cs} -> 177 "illegal character " ++ Cs; 178%% {posix_cc,What} -> 179%% ["illegal POSIX character class ",io_lib:write_string(What)]; 180 {char_class,What} -> 181 ["illegal character class ",io_lib:write_string(What)] 182 end, 183 ["bad regexp `",Es,"'"]; 184format_error(ignored_characters) -> 185 "ignored characters"; 186format_error(cannot_parse) -> 187 io_lib:fwrite("cannot parse; probably encoding mismatch", []). 188 189%%% 190%%% Local functions 191%%% 192 193assure_extension(File, Ext) -> 194 lists:concat([strip_extension(File, Ext), Ext]). 195 196%% Assumes File is a filename. 197strip_extension(File, Ext) -> 198 case filename:extension(File) of 199 Ext -> filename:rootname(File); 200 _Other -> File 201 end. 202 203%% Copied from compile.erl. 204env_default_opts() -> 205 Key = "ERL_COMPILER_OPTIONS", 206 case os:getenv(Key) of 207 false -> []; 208 Str when is_list(Str) -> 209 case erl_scan:string(Str) of 210 {ok,Tokens,_} -> 211 Dot = {dot, erl_anno:new(1)}, 212 case erl_parse:parse_term(Tokens ++ [Dot]) of 213 {ok,List} when is_list(List) -> List; 214 {ok,Term} -> [Term]; 215 {error,_Reason} -> 216 io:format("Ignoring bad term in ~s\n", [Key]), 217 [] 218 end; 219 {error, {_,_,_Reason}, _} -> 220 io:format("Ignoring bad term in ~s\n", [Key]), 221 [] 222 end 223 end. 224 225select_recognized_opts(Options0) -> 226 Options = preprocess_options(Options0), 227 AllOptions = all_options(), 228 [Option || 229 {Name, _} = Option <- Options, 230 lists:member(Name, AllOptions)]. 231 232options(Options0) -> 233 Options1 = preprocess_options(Options0), 234 AllOptions = all_options(), 235 case check_options(Options1, AllOptions, []) of 236 badarg -> 237 badarg; 238 OptionValues -> 239 AllOptionValues = 240 [case lists:keyfind(Option, 1, OptionValues) of 241 false -> 242 {Option, default_option(Option)}; 243 OptionValue -> 244 OptionValue 245 end || Option <- AllOptions], 246 foldr(fun({_, false}, L) -> L; 247 ({Option, true}, L) -> [Option | L]; 248 (OptionValue, L) -> [OptionValue | L] 249 end, [], AllOptionValues) 250 end. 251 252preprocess_options(Options) -> 253 foldr(fun preproc_opt/2, [], Options). 254 255preproc_opt(return, Os) -> 256 [{return_errors, true}, {return_warnings, true} | Os]; 257preproc_opt(report, Os) -> 258 [{report_errors, true}, {report_warnings, true} | Os]; 259preproc_opt({return, T}, Os) -> 260 [{return_errors, T}, {return_warnings, T} | Os]; 261preproc_opt({report, T}, Os) -> 262 [{report_errors, T}, {report_warnings, T} | Os]; 263preproc_opt(Option, Os) -> 264 [try atom_option(Option) catch error:_ -> Option end | Os]. 265 266check_options([{Option, FileName0} | Options], AllOptions, L) 267 when Option =:= includefile; Option =:= scannerfile -> 268 case is_filename(FileName0) of 269 no -> 270 badarg; 271 Filename -> 272 check_options(Options, AllOptions, [{Option, Filename} | L]) 273 end; 274check_options([{Option, Boolean} | Options], AllOptions, L) 275 when is_boolean(Boolean) -> 276 case lists:member(Option, AllOptions) of 277 true -> 278 check_options(Options, AllOptions, [{Option, Boolean} | L]); 279 false -> 280 badarg 281 end; 282check_options([], _AllOptions, L) -> 283 L; 284check_options(_Options, _, _L) -> 285 badarg. 286 287all_options() -> 288 [dfa_graph,includefile,report_errors,report_warnings, 289 return_errors,return_warnings,scannerfile,verbose, 290 warnings_as_errors]. 291 292default_option(dfa_graph) -> false; 293default_option(includefile) -> []; 294default_option(report_errors) -> true; 295default_option(report_warnings) -> true; 296default_option(return_errors) -> false; 297default_option(return_warnings) -> false; 298default_option(scannerfile) -> []; 299default_option(verbose) -> false; 300default_option(warnings_as_errors) -> false. 301 302atom_option(dfa_graph) -> {dfa_graph,true}; 303atom_option(report_errors) -> {report_errors,true}; 304atom_option(report_warnings) -> {report_warnings,true}; 305atom_option(warnings_as_errors) -> {warnings_as_errors,true}; 306atom_option(return_errors) -> {return_errors,true}; 307atom_option(verbose) -> {verbose,true}; 308atom_option(return_warnings) -> {return_warnings,true}; 309atom_option(Key) -> Key. 310 311is_filename(T) -> 312 try filename:flatten(T) 313 catch error: _ -> no 314 end. 315 316shorten_filename(Name0) -> 317 {ok,Cwd} = file:get_cwd(), 318 case string:prefix(Name0, Cwd) of 319 nomatch -> Name0; 320 Rest -> 321 case unicode:characters_to_list(Rest) of 322 "/"++N -> N; 323 N -> N 324 end 325 end. 326 327leex_ret(St) -> 328 report_errors(St), 329 report_warnings(St), 330 Es = pack_errors(St#leex.errors), 331 Ws = pack_warnings(St#leex.warnings), 332 Werror = werror(St), 333 if 334 Werror -> 335 do_error_return(St, Es, Ws); 336 Es =:= [] -> 337 case member(return_warnings, St#leex.opts) of 338 true -> {ok, St#leex.efile, Ws}; 339 false -> {ok, St#leex.efile} 340 end; 341 true -> 342 do_error_return(St, Es, Ws) 343 end. 344 345do_error_return(St, Es, Ws) -> 346 case member(return_errors, St#leex.opts) of 347 true -> {error, Es, Ws}; 348 false -> error 349 end. 350 351werror(St) -> 352 St#leex.warnings =/= [] 353 andalso member(warnings_as_errors, St#leex.opts). 354 355pack_errors([{File,_} | _] = Es) -> 356 [{File, flatmap(fun({_,E}) -> [E] end, sort(Es))}]; 357pack_errors([]) -> 358 []. 359 360pack_warnings([{File,_} | _] = Ws) -> 361 [{File, flatmap(fun({_,W}) -> [W] end, sort(Ws))}]; 362pack_warnings([]) -> 363 []. 364 365report_errors(St) -> 366 when_opt(fun () -> 367 foreach(fun({File,{none,Mod,E}}) -> 368 io:fwrite("~ts: ~ts\n", 369 [File,Mod:format_error(E)]); 370 ({File,{Line,Mod,E}}) -> 371 io:fwrite("~ts:~w: ~ts\n", 372 [File,Line,Mod:format_error(E)]) 373 end, sort(St#leex.errors)) 374 end, report_errors, St#leex.opts). 375 376report_warnings(St) -> 377 Werror = member(warnings_as_errors, St#leex.opts), 378 Prefix = case Werror of 379 true -> ""; 380 false -> "Warning: " 381 end, 382 ReportWerror = Werror andalso member(report_errors, St#leex.opts), 383 ShouldReport = member(report_warnings, St#leex.opts) orelse ReportWerror, 384 when_bool(fun () -> 385 foreach(fun({File,{none,Mod,W}}) -> 386 io:fwrite("~ts: ~s~ts\n", 387 [File,Prefix, 388 Mod:format_error(W)]); 389 ({File,{Line,Mod,W}}) -> 390 io:fwrite("~ts:~w: ~s~ts\n", 391 [File,Line,Prefix, 392 Mod:format_error(W)]) 393 end, sort(St#leex.warnings)) 394 end, ShouldReport). 395 396-spec add_error(_, #leex{}) -> no_return(). 397add_error(E, St) -> 398 add_error(St#leex.xfile, E, St). 399 400add_error(File, Error, St) -> 401 throw(St#leex{errors = [{File,Error}|St#leex.errors]}). 402 403add_warning(Line, W, St) -> 404 St#leex{warnings = [{St#leex.xfile,{Line,leex,W}}|St#leex.warnings]}. 405 406%% filenames(File, Options, State) -> State. 407%% The default output dir is the current directory unless an 408%% explicit one has been given in the options. 409 410filenames(File, Opts, St0) -> 411 Dir = filename:dirname(File), 412 Base = filename:basename(File, ".xrl"), 413 Xfile = filename:join(Dir, Base ++ ".xrl"), 414 Efile = Base ++ ".erl", 415 Gfile = Base ++ ".dot", 416 Module = list_to_atom(Base), 417 St1 = St0#leex{xfile=Xfile, 418 opts=Opts, 419 module=Module}, 420 {includefile,Ifile0} = lists:keyfind(includefile, 1, Opts), 421 Ifile = inc_file_name(Ifile0), 422 %% Test for explicit scanner file. 423 {scannerfile,Ofile} = lists:keyfind(scannerfile, 1, Opts), 424 if 425 Ofile =:= [] -> 426 St1#leex{efile=filename:join(Dir, Efile), 427 ifile=Ifile, 428 gfile=filename:join(Dir, Gfile)}; 429 true -> 430 D = filename:dirname(Ofile), 431 St1#leex{efile=Ofile, 432 ifile=Ifile, 433 gfile=filename:join(D, Gfile)} 434 end. 435 436when_opt(Do, Opt, Opts) -> 437 case member(Opt, Opts) of 438 true -> Do(); 439 false -> ok 440 end. 441 442when_bool(Do, Bool) -> 443 case Bool of 444 true -> Do(); 445 false -> ok 446 end. 447 448verbose_print(St, Format, Args) -> 449 when_opt(fun () -> io:fwrite(Format, Args) end, verbose, St#leex.opts). 450 451%% parse_file(State) -> {ok,[REA],[Action],Code,NewState} | throw(NewState) 452%% when 453%% REA = {RegExp,ActionNo}; 454%% Action = {ActionNo,ActionString}; 455%% Code = {StartLine, StartPos, NumOfLines}. Where the Erlang code is. 456%% 457%% Read and parse the file Xfile. 458%% After each section of the file has been parsed we directly call the 459%% next section. This is done when we detect a line we don't recognise 460%% in the current section. The file format is very simple and Erlang 461%% token based, we allow empty lines and Erlang style comments. 462 463parse_file(St0) -> 464 case file:open(St0#leex.xfile, [read]) of 465 {ok,Xfile} -> 466 St1 = St0#leex{encoding = epp:set_encoding(Xfile)}, 467 try 468 verbose_print(St1, "Parsing file ~ts, ", [St1#leex.xfile]), 469 %% We KNOW that errors throw so we can ignore them here. 470 {ok,Line1,St2} = parse_head(Xfile, St1), 471 {ok,Line2,Macs,St3} = parse_defs(Xfile, Line1, St2), 472 {ok,Line3,REAs,Actions,St4} = 473 parse_rules(Xfile, Line2, Macs, St3), 474 {ok,Code,St5} = parse_code(Xfile, Line3, St4), 475 verbose_print(St5, "contained ~w rules.~n", [length(REAs)]), 476 {ok,REAs,Actions,Code,St5} 477 after ok = file:close(Xfile) 478 end; 479 {error,Error} -> 480 add_error({none,leex,{file_error,Error}}, St0) 481 end. 482 483%% parse_head(File, State) -> {ok,NextLine,State}. 484%% Parse the head of the file. Skip all comments and blank lines. 485 486parse_head(Ifile, St) -> {ok,nextline(Ifile, 0, St),St}. 487 488%% parse_defs(File, Line, State) -> {ok,NextLine,Macros,State}. 489%% Parse the macro definition section of a file. This must exist. 490%% The section is ended by a non-blank line which is not a macro def. 491 492parse_defs(Ifile, {ok,?DEFS_HEAD ++ Rest,L}, St) -> 493 St1 = warn_ignored_chars(L, Rest, St), 494 parse_defs(Ifile, nextline(Ifile, L, St), [], St1); 495parse_defs(_, {ok,_,L}, St) -> 496 add_error({L,leex,missing_defs}, St); 497parse_defs(_, {eof,L}, St) -> 498 add_error({L,leex,missing_defs}, St). 499 500parse_defs(Ifile, {ok,Chars,L}=Line, Ms, St) -> 501 %% This little beauty matches out a macro definition, RE's are so clear. 502 MS = "^[ \t]*([A-Z_][A-Za-z0-9_]*)[ \t]*=[ \t]*([^ \t\r\n]*)[ \t\r\n]*\$", 503 case re:run(Chars, MS, [{capture,all_but_first,list},unicode]) of 504 {match,[Name,Def]} -> 505 %%io:fwrite("~p = ~p\n", [Name,Def]), 506 parse_defs(Ifile, nextline(Ifile, L, St), [{Name,Def}|Ms], St); 507 _ -> {ok,Line,Ms,St} % Anything else 508 end; 509parse_defs(_, Line, Ms, St) -> 510 {ok,Line,Ms,St}. 511 512%% parse_rules(File, Line, Macros, State) -> {ok,NextLine,REAs,Actions,State}. 513%% Parse the RE rules section of the file. This must exist. 514 515parse_rules(Ifile, {ok,?RULE_HEAD ++ Rest,L}, Ms, St) -> 516 St1 = warn_ignored_chars(L, Rest, St), 517 parse_rules(Ifile, nextline(Ifile, L, St), Ms, [], [], 0, St1); 518parse_rules(_, {ok,_,L}, _, St) -> 519 add_error({L,leex,missing_rules}, St); 520parse_rules(_, {eof,L}, _, St) -> 521 add_error({L,leex,missing_rules}, St). 522 523%% parse_rules(File, Result, Macros, RegExpActions, Actions, Acount, State) -> 524%% {ok,NextCLine,RegExpActions,Actions,NewState} | throw(NewState) 525 526parse_rules(Ifile, NextLine, Ms, REAs, As, N, St) -> 527 case NextLine of 528 {ok,?CODE_HEAD ++ _Rest,_} -> 529 parse_rules_end(Ifile, NextLine, REAs, As, St); 530 {ok,Chars,L0} -> 531 %%io:fwrite("~w: ~p~n", [L0,Chars]), 532 case collect_rule(Ifile, Chars, L0) of 533 {ok,Re,Atoks,L1} -> 534 {ok,REA,A,St1} = parse_rule(Re, L0, Atoks, Ms, N, St), 535 parse_rules(Ifile, nextline(Ifile, L1, St), Ms, 536 [REA|REAs], [A|As], N+1, St1); 537 {error,E} -> add_error(E, St) 538 end; 539 {eof,_} -> 540 parse_rules_end(Ifile, NextLine, REAs, As, St) 541 end. 542 543parse_rules_end(_, {ok,_,L}, [], [], St) -> 544 add_error({L,leex,empty_rules}, St); 545parse_rules_end(_, {eof,L}, [], [], St) -> 546 add_error({L,leex,empty_rules}, St); 547parse_rules_end(_, NextLine, REAs, As, St) -> 548 %% Must be *VERY* careful to put rules in correct order! 549 {ok,NextLine,reverse(REAs),reverse(As),St}. 550 551%% collect_rule(File, Line, Lineno) -> 552%% {ok,RegExp,ActionTokens,NewLineno} | {error,E}. 553%% Collect a complete rule by reading lines until the the regexp and 554%% action has been read. Keep track of line number. 555 556collect_rule(Ifile, Chars, L0) -> 557 {RegExp,Rest} = string:take(Chars, " \t\r\n", true), 558 case collect_action(Ifile, Rest, L0, []) of 559 {ok,[{':',_}|Toks],L1} -> {ok,RegExp,Toks,L1}; 560 {ok,_,_} -> {error,{L0,leex,bad_rule}}; 561 {eof,L1} -> {error,{L1,leex,bad_rule}}; 562 {error,E,_} -> {error,E} 563 end. 564 565collect_action(_Ifile, {error, _}, L, _Cont0) -> 566 {error, {L, leex, cannot_parse}, ignored_end_line}; 567collect_action(Ifile, Chars, L0, Cont0) -> 568 case erl_scan:tokens(Cont0, Chars, L0) of 569 {done,{ok,Toks,_},_} -> {ok,Toks,L0}; 570 {done,{eof,_},_} -> {eof,L0}; 571 {done,{error,E,_},_} -> {error,E,L0}; 572 {more,Cont1} -> 573 collect_action(Ifile, io:get_line(Ifile, leex), L0+1, Cont1) 574 end. 575 576%% parse_rule(RegExpString, RegExpLine, ActionTokens, Macros, Counter, State) -> 577%% {ok,{RE,Action},ActionData,State}. 578%% Parse one regexp after performing macro substition. 579 580parse_rule(S, Line, [{dot,_}], Ms, N, St) -> 581 case parse_rule_regexp(S, Ms, St) of 582 {ok,R} -> 583 {ok,{R,N},{N,empty_action},St}; 584 {error,E} -> 585 add_error({Line,leex,E}, St) 586 end; 587parse_rule(S, Line, Atoks, Ms, N, St) -> 588 case parse_rule_regexp(S, Ms, St) of 589 {ok,R} -> 590 %%io:fwrite("RE = ~p~n", [R]), 591 %% Check for token variables. 592 TokenChars = var_used('TokenChars', Atoks), 593 TokenLen = var_used('TokenLen', Atoks), 594 TokenLine = var_used('TokenLine', Atoks), 595 {ok,{R,N},{N,Atoks,TokenChars,TokenLen,TokenLine},St}; 596 {error,E} -> 597 add_error({Line,leex,E}, St) 598 end. 599 600var_used(Name, Toks) -> 601 case lists:keyfind(Name, 3, Toks) of 602 {var,_,Name} -> true; %It's the var we want 603 _ -> false 604 end. 605 606%% parse_rule_regexp(RegExpString, Macros, State) -> 607%% {ok,RegExp} | {error,Error}. 608%% Substitute in macros and parse RegExpString. Cannot use re:replace 609%% here as it uses info in replace string (&). 610 611parse_rule_regexp(RE0, [{M,Exp}|Ms], St) -> 612 Split= re:split(RE0, "\\{" ++ M ++ "\\}", [{return,list},unicode]), 613 RE1 = lists:append(lists:join(Exp, Split)), 614 parse_rule_regexp(RE1, Ms, St); 615parse_rule_regexp(RE, [], St) -> 616 %%io:fwrite("RE = ~p~n", [RE]), 617 case re_parse(RE, St) of 618 {ok,R} -> {ok,R}; 619 {error,E} -> {error,{regexp,E}} 620 end. 621 622%% parse_code(File, Line, State) -> {ok,Code,NewState}. 623%% Finds the line and the position where the code section of the file 624%% begins. This must exist. 625 626parse_code(Ifile, {ok,?CODE_HEAD ++ Rest,CodeL}, St) -> 627 St1 = warn_ignored_chars(CodeL, Rest, St), 628 {ok, CodePos} = file:position(Ifile, cur), 629 %% Just count the lines; copy the code from file to file later. 630 EndCodeLine = count_lines(Ifile, CodeL, St), 631 NCodeLines = EndCodeLine - CodeL, 632 {ok,{CodeL,CodePos,NCodeLines},St1}; 633parse_code(_, {ok,_,L}, St) -> 634 add_error({L,leex,missing_code}, St); 635parse_code(_, {eof,L}, St) -> 636 add_error({L,leex,missing_code}, St). 637 638count_lines(File, N, St) -> 639 case io:get_line(File, leex) of 640 eof -> N; 641 {error, _} -> add_error({N+1, leex, cannot_parse}, St); 642 _Line -> count_lines(File, N+1, St) 643 end. 644 645%% nextline(InputFile, PrevLineNo, State) -> {ok,Chars,LineNo} | {eof,LineNo}. 646%% Get the next line skipping comment lines and blank lines. 647 648nextline(Ifile, L, St) -> 649 case io:get_line(Ifile, leex) of 650 eof -> {eof,L}; 651 {error, _} -> add_error({L+1, leex, cannot_parse}, St); 652 Chars -> 653 case string:take(Chars, " \t\n") of 654 {_, [$%|_Rest]} -> nextline(Ifile, L+1, St); 655 {_, []} -> nextline(Ifile, L+1, St); 656 _Other -> {ok,Chars,L+1} 657 end 658 end. 659 660warn_ignored_chars(Line, S, St) -> 661 case non_white(S) of 662 [] -> St; 663 _ -> add_warning(Line, ignored_characters, St) 664 end. 665 666non_white(S) -> 667 [C || C <- S, C > $\s, C < $\200 orelse C > $\240]. 668 669%% This is the regular expression grammar used. It is equivalent to the 670%% one used in AWK, except that we allow ^ $ to be used anywhere and fail 671%% in the matching. 672%% 673%% reg -> alt : '$1'. 674%% alt -> seq "|" seq ... : {alt,['$1','$2'...]}. 675%% seq -> repeat repeat ... : {seq,['$1','$2'...]}. 676%% repeat -> repeat "*" : {kclosure,'$1'}. 677%% repeat -> repeat "+" : {pclosure,'$1'}. 678%% repeat -> repeat "?" : {optional,'$1'}. 679%% repeat -> repeat "{" [Min],[Max] "}" : {interval,'$1',Min,Max} 680%% repeat -> single : '$1'. 681%% single -> "(" reg ")" : {sub,'$2',Number}. 682%% single -> "^" : bos/bol. 683%% single -> "$" : eos/eol. 684%% single -> "." : any. 685%% single -> "[" class "]" : {char_class,char_class('$2')} 686%% single -> "[" "^" class "]" : {comp_class,char_class('$3')}. 687%% single -> "\"" chars "\"" : {lit,'$2'}. 688%% single -> "\\" char : {lit,['$2']}. 689%% single -> char : {lit,['$1']}. 690%% single -> empty : epsilon. 691%% The grammar of the current regular expressions. The actual parser 692%% is a recursive descent implementation of the grammar. 693 694%% re_parse(Chars, State) -> {ok,RegExp} | {error,Error}. 695 696re_parse(Cs0, St) -> 697 case catch re_reg(Cs0, 0, St) of 698 {RE,_,[]} -> {ok,RE}; 699 {_,_,[C|_]} -> {error,{illegal_char,[C]}}; 700 {parse_error,E} -> {error,E} 701 end. 702 703parse_error(E) -> throw({parse_error,E}). 704 705re_reg(Cs, Sn, St) -> re_alt(Cs, Sn, St). 706 707re_alt(Cs0, Sn0, St) -> 708 {L,Sn1,Cs1} = re_seq(Cs0, Sn0, St), 709 case re_alt1(Cs1, Sn1, St) of 710 {[],Sn2,Cs2} -> {L,Sn2,Cs2}; 711 {Rs,Sn2,Cs2} -> {{alt,[L|Rs]},Sn2,Cs2} 712 end. 713 714re_alt1([$||Cs0], Sn0, St) -> 715 {L,Sn1,Cs1} = re_seq(Cs0, Sn0, St), 716 {Rs,Sn2,Cs2} = re_alt1(Cs1, Sn1, St), 717 {[L|Rs],Sn2,Cs2}; 718re_alt1(Cs, Sn, _) -> {[],Sn,Cs}. 719 720%% Parse a sequence of regexps. Don't allow the empty sequence. 721%% re_seq(Cs0, Sn0, St) -> 722%% {L,Sn1,Cs1} = repeat(Cs0, Sn0, St), 723%% case re_seq1(Cs1, Sn1, St) of 724%% {[],Sn2,Cs2} -> {L,Sn2,Cs2}; 725%% {Rs,Sn2,Cs2} -> {{seq,[L|Rs]},Sn2,Cs2} 726%% end. 727 728%% re_seq(Chars, SubNumber, State) -> {RegExp,SubNumber,Chars}. 729%% Parse a sequence of regexps. Allow the empty sequence, returns epsilon. 730 731re_seq(Cs0, Sn0, St) -> 732 case re_seq1(Cs0, Sn0, St) of 733 {[],Sn1,Cs1} -> {epsilon,Sn1,Cs1}; 734 {[R],Sn1,Cs1} -> {R,Sn1,Cs1}; 735 {Rs,Sn1,Cs1} -> {{seq,Rs},Sn1,Cs1} 736 end. 737 738re_seq1([C|_]=Cs0, Sn0, St) when C =/= $|, C =/= $) -> 739 {L,Sn1,Cs1} = re_repeat(Cs0, Sn0, St), 740 {Rs,Sn2,Cs2} = re_seq1(Cs1, Sn1, St), 741 {[L|Rs],Sn2,Cs2}; 742re_seq1(Cs, Sn, _) -> {[],Sn,Cs}. 743 744%% re_repeat(Chars, SubNumber, State) -> {RegExp,SubNumber,Chars}. 745 746re_repeat(Cs0, Sn0, St) -> 747 {S,Sn1,Cs1} = re_single(Cs0, Sn0, St), 748 re_repeat1(Cs1, Sn1, S, St). 749 750re_repeat1([$*|Cs], Sn, S, St) -> re_repeat1(Cs, Sn, {kclosure,S}, St); 751re_repeat1([$+|Cs], Sn, S, St) -> re_repeat1(Cs, Sn, {pclosure,S}, St); 752re_repeat1([$?|Cs], Sn, S, St) -> re_repeat1(Cs, Sn, {optional,S}, St); 753%% { only starts interval when ere is true, otherwise normal character. 754%% re_repeat1([${|Cs0], Sn, S, #leex{posix=true}=St) -> % $} 755%% case re_interval_range(Cs0) of 756%% {Min,Max,[$}|Cs1]} when is_integer(Min), is_integer(Max), Min =< Max -> 757%% re_repeat1(Cs1, Sn, {interval,S,Min,Max}, St); 758%% {Min,Max,[$}|Cs1]} when is_integer(Min), is_atom(Max) -> 759%% re_repeat1(Cs1, Sn, {interval,S,Min,Max}, St); 760%% {_,_,Cs1} -> parse_error({interval_range,string_between([${|Cs0], Cs1)}) 761%% end; 762re_repeat1(Cs, Sn, S, _) -> {S,Sn,Cs}. 763 764%% re_single(Chars, SubNumber, State) -> {RegExp,SubNumber,Chars}. 765%% Parse a re_single regexp. 766 767re_single([$(|Cs0], Sn0, St) -> % $) 768 Sn1 = Sn0 + 1, % Keep track of sub count anyway 769 case re_reg(Cs0, Sn1, St) of 770 {S,Sn2,[$)|Cs1]} -> {S,Sn2,Cs1}; 771 %%{S,Sn2,[$)|Cs1]} -> {{sub,S,Sn1},Sn2,Cs1}; 772 _ -> parse_error({unterminated,"("}) 773 end; 774%% These are not legal inside a regexp. 775%% re_single([$^|Cs], Sn, St) -> {bos,Sn,Cs}; 776%% re_single([$$|Cs], Sn, St) -> {eos,Sn,Cs}; 777%% re_single([$.|Cs], Sn, St) -> {any,Sn,Cs}; 778re_single([$.|Cs], Sn, _) -> {{comp_class,"\n"},Sn,Cs}; % Do this here? 779re_single("[^" ++ Cs0, Sn, St) -> 780 case re_char_class(Cs0, St) of 781 {Cc,[$]|Cs1]} -> {{comp_class,Cc},Sn,Cs1}; 782 _ -> parse_error({unterminated,"["}) 783 end; 784re_single([$[|Cs0], Sn, St) -> 785 case re_char_class(Cs0, St) of 786 {Cc,[$]|Cs1]} -> {{char_class,Cc},Sn,Cs1}; 787 _ -> parse_error({unterminated,"["}) 788 end; 789re_single([$\\|Cs0], Sn, _) -> 790 {C,Cs1} = re_char($\\, Cs0), 791 {{lit,[C]},Sn,Cs1}; 792re_single([C|Cs0], Sn, St) -> 793 case special_char(C, St) of 794 true -> parse_error({illegal_char,[C]}); 795 false -> 796 {C,Cs1} = re_char(C, Cs0), 797 {{lit,[C]},Sn,Cs1} 798 end. 799 800-define(IS_HEX(C), C >= $0 andalso C =< $9 orelse 801 C >= $A andalso C =< $F orelse 802 C >= $a andalso C =< $f). 803 804%% re_char(Char, Chars) -> {CharValue,Chars}. 805%% Reads one character value from the input list, it knows about escapes. 806 807re_char($\\, [O1,O2,O3|S]) when 808 O1 >= $0, O1 =< $7, O2 >= $0, O2 =< $7, O3 >= $0, O3 =< $7 -> 809 {(O1*8 + O2)*8 + O3 - 73*$0,S}; 810re_char($\\, [$x,H1,H2|S]) when ?IS_HEX(H1), ?IS_HEX(H2) -> 811 {erlang:list_to_integer([H1,H2], 16),S}; 812re_char($\\,[$x,${|S0]) -> 813 re_hex(S0, []); 814re_char($\\,[$x|_]) -> 815 parse_error({illegal_char,"\\x"}); 816re_char($\\, [C|S]) -> {escape_char(C),S}; 817re_char($\\, []) -> parse_error({unterminated,"\\"}); 818re_char(C, S) -> {C,S}. % Just this character 819 820re_hex([C|Cs], L) when ?IS_HEX(C) -> re_hex(Cs, [C|L]); 821re_hex([$}|S], L0) -> 822 L = lists:reverse(L0), 823 case erlang:list_to_integer(L, 16) of 824 C when C =< 16#10FFFF -> {C,S}; 825 _ -> parse_error({illegal_char,[$\\,$x,${|L]++"}"}) 826 end; 827re_hex(_, _) -> parse_error({unterminated,"\\x{"}). 828 829%% special_char(Char, State) -> bool(). 830%% These are the special characters for an ERE. 831%% N.B. ]}) are only special in the context after [{(. 832 833special_char($^, _) -> true; 834special_char($., _) -> true; 835special_char($[, _) -> true; 836special_char($$, _) -> true; 837special_char($(, _) -> true; 838special_char($), _) -> true; 839special_char($|, _) -> true; 840special_char($*, _) -> true; 841special_char($+, _) -> true; 842special_char($?, _) -> true; 843%% special_char(${, #leex{posix=true}) -> true; % Only when POSIX set 844special_char($\\, _) -> true; 845special_char(_, _) -> false. 846 847%% re_char_class(Chars, State) -> {CharClass,Chars}. 848%% Parse a character class. 849 850re_char_class([$]|Cs], St) -> % Must special case this. 851 re_char_class(Cs, [$]], St); 852re_char_class(Cs, St) -> re_char_class(Cs, [], St). 853 854%% re_char_class("[:" ++ Cs0, Cc, #leex{posix=true}=St) -> 855%% %% POSIX char class only. 856%% case posix_cc(Cs0) of 857%% {Pcl,":]" ++ Cs1} -> re_char_class(Cs1, [{posix,Pcl}|Cc], St); 858%% {_,Cs1} -> parse_error({posix_cc,string_between(Cs0, Cs1)}) 859%% end; 860re_char_class([C1|Cs0], Cc, St) when C1 =/= $] -> 861 case re_char(C1, Cs0) of 862 {Cf,[$-,C2|Cs1]} when C2 =/= $] -> 863 case re_char(C2, Cs1) of 864 {Cl,Cs2} when Cf < Cl -> 865 re_char_class(Cs2, [{range,Cf,Cl}|Cc], St); 866 {_,Cs2} -> 867 parse_error({char_class,string_between([C1|Cs0], Cs2)}) 868 end; 869 {C,Cs1} -> re_char_class(Cs1, [C|Cc], St) 870 end; 871re_char_class(Cs, Cc, _) -> {reverse(Cc),Cs}. % Preserve order 872 873%% posix_cc(String) -> {PosixClass,RestString}. 874%% Handle POSIX character classes. 875 876%% posix_cc("alnum" ++ Cs) -> {alnum,Cs}; 877%% posix_cc("alpha" ++ Cs) -> {alpha,Cs}; 878%% posix_cc("blank" ++ Cs) -> {blank,Cs}; 879%% posix_cc("cntrl" ++ Cs) -> {cntrl,Cs}; 880%% posix_cc("digit" ++ Cs) -> {digit,Cs}; 881%% posix_cc("graph" ++ Cs) -> {graph,Cs}; 882%% posix_cc("lower" ++ Cs) -> {lower,Cs}; 883%% posix_cc("print" ++ Cs) -> {print,Cs}; 884%% posix_cc("punct" ++ Cs) -> {punct,Cs}; 885%% posix_cc("space" ++ Cs) -> {space,Cs}; 886%% posix_cc("upper" ++ Cs) -> {upper,Cs}; 887%% posix_cc("xdigit" ++ Cs) -> {xdigit,Cs}; 888%% posix_cc(Cs) -> parse_error({posix_cc,string:slice(Cs, 0, 5)}). 889 890escape_char($n) -> $\n; % \n = LF 891escape_char($r) -> $\r; % \r = CR 892escape_char($t) -> $\t; % \t = TAB 893escape_char($v) -> $\v; % \v = VT 894escape_char($b) -> $\b; % \b = BS 895escape_char($f) -> $\f; % \f = FF 896escape_char($e) -> $\e; % \e = ESC 897escape_char($s) -> $\s; % \s = SPACE 898escape_char($d) -> $\d; % \d = DEL 899escape_char(C) -> C. % Pass it straight through 900 901%% re_interval_range(Chars) -> {Min,Max,RestChars}. 902%% NoInt -> none,none 903%% Int -> Int,none 904%% Int, -> Int,any 905%% Int1,Int2 -> Int1,Int2 906 907%% re_interval_range(Cs0) -> 908%% case re_number(Cs0) of 909%% {none,Cs1} -> {none,none,Cs1}; 910%% {N,[$,|Cs1]} -> 911%% case re_number(Cs1) of 912%% {none,Cs2} -> {N,any,Cs2}; 913%% {M,Cs2} -> {N,M,Cs2} 914%% end; 915%% {N,Cs1} -> {N,none,Cs1} 916%% end. 917 918%% re_number([C|Cs]) when C >= $0, C =< $9 -> 919%% re_number(Cs, C - $0); 920%% re_number(Cs) -> {none,Cs}. 921 922%% re_number([C|Cs], Acc) when C >= $0, C =< $9 -> 923%% re_number(Cs, 10*Acc + (C - $0)); 924%% re_number(Cs, Acc) -> {Acc,Cs}. 925 926string_between(Cs1, Cs2) -> 927 string:slice(Cs1, 0, string:length(Cs1)-string:length(Cs2)). 928 929%% We use standard methods, Thompson's construction and subset 930%% construction, to create first an NFA and then a DFA from the 931%% regexps. A non-standard feature is that we work with sets of 932%% character ranges (crs) instead sets of characters. This is most 933%% noticeable when constructing DFAs. The major benefit is that we can 934%% handle characters from any set, not just limited ASCII or 8859, 935%% even 16/32 bit unicode. 936%% 937%% The whole range of characters is 0-maxchar, where maxchar is a BIG 938%% number. We don't make any assumptions about the size of maxchar, it 939%% is just bigger than any character. 940%% 941%% Using character ranges makes describing many regexps very simple, 942%% for example the regexp "." just becomes the range 943%% [{0-9},{11-maxchar}]. 944 945%% make_nfa(RegExpActions) -> {ok,{NFA,StartState}} | {error,E}. 946%% Build a complete nfa from a list of {RegExp,Action}. The NFA field 947%% accept has values {yes,Action}|no. The NFA is a list of states. 948 949make_dfa(REAs, St) -> 950 {NFA,NF} = build_combined_nfa(REAs), 951 verbose_print(St, "NFA contains ~w states, ", [tuple_size(NFA)]), 952 {DFA0,DF0} = build_dfa(NFA, NF), 953 verbose_print(St, "DFA contains ~w states, ", [length(DFA0)]), 954 {DFA,DF} = minimise_dfa(DFA0, DF0), 955 verbose_print(St, "minimised to ~w states.~n", [length(DFA)]), 956 %%io:fwrite("~p\n", [{NF,NFA}]), 957 %%io:fwrite("~p\n", [{DF0,DFA0}]), 958 %%io:fwrite("~p\n", [{DF,DFA}]), 959 {DFA,DF}. 960 961%% build_combined_nfa(RegExpActionList) -> {NFA,FirstState}. 962%% Build the combined NFA using Thompson's construction straight out 963%% of the book. Build the separate NFAs in the same order as the 964%% rules so that the accepting have ascending states have ascending 965%% state numbers. Start numbering the states from 1 as we put the 966%% states in a tuple with the state number as the index. 967%% 968%% The edges from a state are a list of {CharRange,State} | {epsilon,State}. 969 970build_combined_nfa(REAs) -> 971 {NFA0,Firsts,Free} = build_nfa_list(REAs, [], [], 1), 972 F = #nfa_state{no=Free,edges=epsilon_trans(Firsts)}, 973 {list_to_tuple(keysort(#nfa_state.no, [F|NFA0])),Free}. 974 975build_nfa_list([{RE,Action}|REAs], NFA0, Firsts, Free0) -> 976 {NFA1,Free1,First} = build_nfa(RE, Free0, Action), 977 build_nfa_list(REAs, NFA1 ++ NFA0, [First|Firsts], Free1); 978build_nfa_list([], NFA, Firsts, Free) -> 979 {NFA,reverse(Firsts),Free}. 980 981epsilon_trans(Firsts) -> [ {epsilon,F} || F <- Firsts ]. 982 983%% build_nfa(RegExp, NextState, Action) -> {NFA,NextState,FirstState}. 984%% When building the NFA states for a regexp we don't build the end 985%% state, just allocate a State for it and return this state's 986%% number. This allows us to avoid building unnecessary states for 987%% concatenation which would then have to be removed by overwriting 988%% an existing state. 989 990build_nfa(RE, N0, Action) -> 991 {NFA,N1,E} = build_nfa(RE, N0+1, N0, []), 992 {[#nfa_state{no=E,accept={accept,Action}}|NFA],N1,N0}. 993 994%% build_nfa(RegExp, NextState, FirstState, NFA) -> {NFA,NextState,EndState}. 995%% Build an NFA from the RegExp. NFA is a list of #nfa_state{} in no 996%% predefined order. NextState is the number of the next free state 997%% to use, FirstState is the the state which must be the start for 998%% this regexp as a previous regexp refers to it, EndState is the 999%% state to which this NFA will exit to. The number of the returned 1000%% EndState is already allocated! 1001 1002build_nfa({alt,REs}, N, F, NFA) -> 1003 build_nfa_alt(REs, N, F, NFA); 1004build_nfa({seq,REs}, N, F, NFA) -> 1005 build_nfa_seq(REs, N, F, NFA); 1006build_nfa({kclosure,RE}, N0, F, NFA0) -> 1007 {NFA1,N1,E1} = build_nfa(RE, N0+1, N0, NFA0), 1008 E = N1, % End state 1009 {[#nfa_state{no=F,edges=[{epsilon,N0},{epsilon,E}]}, 1010 #nfa_state{no=E1,edges=[{epsilon,N0},{epsilon,E}]}|NFA1], 1011 N1+1,E}; 1012build_nfa({pclosure,RE}, N0, F, NFA0) -> 1013 {NFA1,N1,E1} = build_nfa(RE, N0+1, N0, NFA0), 1014 E = N1, % End state 1015 {[#nfa_state{no=F,edges=[{epsilon,N0}]}, 1016 #nfa_state{no=E1,edges=[{epsilon,N0},{epsilon,E}]}|NFA1], 1017 N1+1,E}; 1018build_nfa({optional,RE}, N0, F, NFA0) -> 1019 {NFA1,N1,E1} = build_nfa(RE, N0+1, N0, NFA0), 1020 E = N1, % End state 1021 {[#nfa_state{no=F,edges=[{epsilon,N0},{epsilon,E}]}, 1022 #nfa_state{no=E1,edges=[{epsilon,E}]}|NFA1], 1023 N1+1,E}; 1024build_nfa({char_class,Cc}, N, F, NFA) -> 1025 {[#nfa_state{no=F,edges=[{pack_cc(Cc),N}]}|NFA],N+1,N}; 1026build_nfa({comp_class,Cc}, N, F, NFA) -> 1027 {[#nfa_state{no=F,edges=[{comp_class(Cc),N}]}|NFA],N+1,N}; 1028build_nfa({lit,Cs}, N, F, NFA) -> % Implicit concatenation 1029 build_nfa_lit(Cs, N, F, NFA); 1030build_nfa(epsilon, N, F, NFA) -> % Just an epsilon transition 1031 {[#nfa_state{no=F,edges=[{epsilon,N}]}|NFA],N+1,N}. 1032 1033%% build_nfa_lit(Chars, NextState, FirstState, NFA) -> {NFA,NextState,EndState}. 1034%% Build an NFA for the sequence of literal characters. 1035 1036build_nfa_lit(Cs, N0, F0, NFA0) -> 1037 foldl(fun (C, {NFA,N,F}) -> 1038 {[#nfa_state{no=F,edges=[{[{C,C}],N}]}|NFA],N+1,N} 1039 end, {NFA0,N0,F0}, Cs). 1040 1041%% build_nfa_lit([C|Cs], N, F, NFA0) when is_integer(C) -> 1042%% NFA1 = [#nfa_state{no=F,edges=[{[{C,C}],N}]}|NFA0], 1043%% build_nfa_lit(Cs, N+1, N, NFA1); 1044%% build_nfa_lit([], N, F, NFA) -> {NFA,N,F}. 1045 1046%% build_nfa_seq(REs, NextState, FirstState, NFA) -> {NFA,NextState,EndState}. 1047%% Build an NFA for the regexps in a sequence. 1048 1049build_nfa_seq(REs, N0, F0, NFA0) -> 1050 foldl(fun (RE, {NFA,N,F}) -> build_nfa(RE, N, F, NFA) end, 1051 {NFA0,N0,F0}, REs). 1052 1053%% build_nfa_seq([RE|REs], N0, F, NFA0) -> 1054%% {NFA1,N1,E1} = build_nfa(RE, N0, F, NFA0), 1055%% build_nfa_seq(REs, N1, E1, NFA1); 1056%% build_nfa_seq([], N, F, NFA) -> {NFA,N,F}. 1057 1058%% build_nfa_alt(REs, NextState, FirstState, NFA) -> {NFA,NextState,EndState}. 1059%% Build an NFA for the regexps in an alternative. N.B. we don't 1060%% handle empty alts here but the parser should never generate them 1061%% anyway. 1062 1063build_nfa_alt([RE], N, F, NFA) -> build_nfa(RE, N, F, NFA); 1064build_nfa_alt([RE|REs], N0, F, NFA0) -> 1065 {NFA1,N1,E1} = build_nfa(RE, N0+1, N0, NFA0), 1066 {NFA2,N2,E2} = build_nfa_alt(REs, N1+1, N1, NFA1), 1067 E = N2, % End state 1068 {[#nfa_state{no=F,edges=[{epsilon,N0},{epsilon,N1}]}, 1069 #nfa_state{no=E1,edges=[{epsilon,E}]}, 1070 #nfa_state{no=E2,edges=[{epsilon,E}]}|NFA2], 1071 N2+1,E}. 1072 1073%% build_nfa_alt(REs, NextState, FirstState, NFA) -> {NFA,NextState,EndState}. 1074%% Build an NFA for the regexps in an alternative. Make one big 1075%% epsilon split state, not necessary but fun. 1076 1077%% build_nfa_alt(REs, N0, F0, NFA0) -> 1078%% E = N0, % Must reserve End state first 1079%% {Fs,{NFA1,N1}} = mapfoldl(fun (RE, {NFA,N}) -> 1080%% build_nfa_alt1(RE, N, E, NFA) 1081%% end, {NFA0,N0+1}, REs), 1082%% {[#nfa_state{no=F0,edges=epsilon_trans(Fs)}, 1083%% #nfa_state{no=E,edges=[{epsilon,N1}]}|NFA1],N1+1,N1}. 1084 1085%% build_nfa_alt1(RE, N0, End, NFA0) -> 1086%% {NFA1,N1,E} = build_nfa(RE, N0+1, N0, NFA0), 1087%% {N0,{[#nfa_state{no=E,edges=[{epsilon,End}]}|NFA1],N1}}. 1088 1089%% pack_cc(CharClass) -> CharClass 1090%% Pack and optimise a character class specification (bracket 1091%% expression). First sort it and then compact it. 1092 1093pack_cc(Cc) -> 1094 Crs = foldl(fun ({range,Cf,Cl}, Set) -> add_element({Cf,Cl}, Set); 1095 (C, Set) -> add_element({C,C}, Set) 1096 end, ordsets:new(), Cc), 1097 pack_crs(Crs). % An ordset IS a list! 1098 1099pack_crs([{C1,C2}=Cr,{C3,C4}|Crs]) when C1 =< C3, C2 >= C4 -> 1100 %% C1 C2 1101 %% C3 C4 1102 pack_crs([Cr|Crs]); 1103pack_crs([{C1,C2},{C3,C4}|Crs]) when C2 >= C3, C2 < C4 -> 1104 %% C1 C2 1105 %% C3 C4 1106 pack_crs([{C1,C4}|Crs]); 1107pack_crs([{C1,C2},{C3,C4}|Crs]) when C2 + 1 =:= C3 -> 1108 %% C1 C2 1109 %% C3 C4 1110 pack_crs([{C1,C4}|Crs]); 1111pack_crs([Cr|Crs]) -> [Cr|pack_crs(Crs)]; 1112pack_crs([]) -> []. 1113 1114comp_class(Cc) -> 1115 Crs = pack_cc(Cc), 1116 Comp = comp_crs(Crs, 0), 1117 %% io:fwrite("comp: ~p\n ~p\n", [Crs,Comp]), 1118 Comp. 1119 1120comp_crs([{0,C2}|Crs], 0) -> % Get first range right 1121 comp_crs(Crs, C2+1); 1122comp_crs([{C1,C2}|Crs], Last) -> 1123 [{Last,C1-1}|comp_crs(Crs, C2+1)]; 1124comp_crs([], Last) -> [{Last,maxchar}]. 1125 1126%% build_dfa(NFA, NfaFirstState) -> {DFA,DfaFirstState}. 1127%% Build a DFA from an NFA using "subset construction". The major 1128%% difference from the book is that we keep the marked and unmarked 1129%% DFA states in seperate lists. New DFA states are added to the 1130%% unmarked list and states are marked by moving them to the marked 1131%% list. We assume that the NFA accepting state numbers are in 1132%% ascending order for the rules and use ordsets to keep this order. 1133 1134build_dfa(NFA, Nf) -> 1135 D = #dfa_state{no=0,nfa=eclosure([Nf], NFA)}, 1136 {build_dfa([D], 1, [], NFA),0}. 1137 1138%% build_dfa([UnMarked], NextState, [Marked], NFA) -> DFA. 1139%% Traverse the unmarked states. Temporarily add the current unmarked 1140%% state to the marked list before calculating translation, this is 1141%% to avoid adding too many duplicate states. Add it properly to the 1142%% marked list afterwards with correct translations. 1143 1144build_dfa([U|Us0], N0, Ms, NFA) -> 1145 {Ts,Us1,N1} = build_dfa(U#dfa_state.nfa, Us0, N0, [], [U|Ms], NFA), 1146 M = U#dfa_state{trans=Ts,accept=accept(U#dfa_state.nfa, NFA)}, 1147 build_dfa(Us1, N1, [M|Ms], NFA); 1148build_dfa([], _, Ms, _) -> Ms. 1149 1150%% build_dfa([NfaState], [Unmarked], NextState, [Transition], [Marked], NFA) -> 1151%% {Transitions,UnmarkedStates,NextState}. 1152%% Foreach NFA state set calculate the legal translations. N.B. must 1153%% search *BOTH* the unmarked and marked lists to check if DFA state 1154%% already exists. As the range of characters is potentially VERY 1155%% large we cannot explicitly test all characters. Instead we first 1156%% calculate the set of all disjoint character ranges which are 1157%% possible candidates to the set of NFA states. The transitions are 1158%% an orddict so we get the transition lists in ascending order. 1159 1160build_dfa(Set, Us, N, Ts, Ms, NFA) -> 1161 %% List of all transition sets. 1162 Crs0 = [Cr || S <- Set, 1163 {Crs,_St} <- (element(S, NFA))#nfa_state.edges, 1164 Crs =/= epsilon, % Not an epsilon transition 1165 Cr <- Crs ], 1166 Crs1 = lists:usort(Crs0), % Must remove duplicates! 1167 %% Build list of disjoint test ranges. 1168 Test = disjoint_crs(Crs1), 1169 %% io:fwrite("bd: ~p\n ~p\n ~p\n ~p\n", [Set,Crs0,Crs1,Test]), 1170 build_dfa(Test, Set, Us, N, Ts, Ms, NFA). 1171 1172%% disjoint_crs([CharRange]) -> [CharRange]. 1173%% Take a sorted list of char ranges and make a sorted list of 1174%% disjoint char ranges. No new char range extends past an existing 1175%% char range. 1176 1177disjoint_crs([{_C1,C2}=Cr1,{C3,_C4}=Cr2|Crs]) when C2 < C3 -> 1178 %% C1 C2 1179 %% C3 C4 1180 [Cr1|disjoint_crs([Cr2|Crs])]; 1181disjoint_crs([{C1,C2},{C3,C4}|Crs]) when C1 =:= C3 -> 1182 %% C1 C2 1183 %% C3 C4 1184 [{C1,C2}|disjoint_crs(add_element({C2+1,C4}, Crs))]; 1185disjoint_crs([{C1,C2},{C3,C4}|Crs]) when C1 < C3, C2 >= C3, C2 < C4 -> 1186 %% C1 C2 1187 %% C3 C4 1188 [{C1,C3-1}|disjoint_crs(union([{C3,C2},{C2+1,C4}], Crs))]; 1189disjoint_crs([{C1,C2},{C3,C4}|Crs]) when C1 < C3, C2 =:= C4 -> 1190 %% C1 C2 1191 %% C3 C4 1192 [{C1,C3-1}|disjoint_crs(add_element({C3,C4}, Crs))]; 1193disjoint_crs([{C1,C2},{C3,C4}|Crs]) when C1 < C3, C2 > C4 -> 1194 %% C1 C2 1195 %% C3 C4 1196 [{C1,C3-1}|disjoint_crs(union([{C3,C4},{C4+1,C2}], Crs))]; 1197disjoint_crs([Cr|Crs]) -> [Cr|disjoint_crs(Crs)]; 1198disjoint_crs([]) -> []. 1199 1200build_dfa([Cr|Crs], Set, Us, N, Ts, Ms, NFA) -> 1201 case eclosure(move(Set, Cr, NFA), NFA) of 1202 S when S =/= [] -> 1203 case dfa_state_exist(S, Us, Ms) of 1204 {yes,T} -> 1205 build_dfa(Crs, Set, Us, N, store(Cr, T, Ts), Ms, NFA); 1206 no -> 1207 U = #dfa_state{no=N,nfa=S}, 1208 build_dfa(Crs, Set, [U|Us], N+1, store(Cr, N, Ts), Ms, NFA) 1209 end; 1210 [] -> 1211 build_dfa(Crs, Set, Us, N, Ts, Ms, NFA) 1212 end; 1213build_dfa([], _, Us, N, Ts, _, _) -> 1214 {Ts,Us,N}. 1215 1216%% dfa_state_exist(Set, Unmarked, Marked) -> {yes,State} | no. 1217 1218dfa_state_exist(S, Us, Ms) -> 1219 case lists:keyfind(S, #dfa_state.nfa, Us) of 1220 #dfa_state{no=T} -> {yes,T}; 1221 false -> 1222 case lists:keyfind(S, #dfa_state.nfa, Ms) of 1223 #dfa_state{no=T} -> {yes,T}; 1224 false -> no 1225 end 1226 end. 1227 1228%% eclosure([State], NFA) -> [State]. 1229%% move([State], Char, NFA) -> [State]. 1230%% These are straight out of the book. As eclosure uses ordsets then 1231%% the generated state sets are in ascending order. 1232 1233eclosure(Sts, NFA) -> eclosure(Sts, NFA, []). 1234 1235eclosure([St|Sts], NFA, Ec) -> 1236 #nfa_state{edges=Es} = element(St, NFA), 1237 eclosure([ N || {epsilon,N} <- Es, 1238 not is_element(N, Ec) ] ++ Sts, 1239 NFA, add_element(St, Ec)); 1240eclosure([], _, Ec) -> Ec. 1241 1242move(Sts, Cr, NFA) -> 1243 %% io:fwrite("move1: ~p\n", [{Sts,Cr}]), 1244 [ St || N <- Sts, 1245 {Crs,St} <- (element(N, NFA))#nfa_state.edges, 1246 Crs =/= epsilon, % Not an epsilon transition 1247 in_crs(Cr, Crs) ]. 1248 1249in_crs({C1,C2}, [{C3,C4}|_Crs]) when C1 >= C3, C2 =< C4 -> true; 1250in_crs(Cr, [Cr|_Crs]) -> true; % Catch bos and eos. 1251in_crs(Cr, [_|Crs]) -> in_crs(Cr, Crs); 1252in_crs(_Cr, []) -> false. 1253 1254%% accept([State], NFA) -> {accept,A} | noaccept. 1255%% Scan down the state list until we find an accepting state. 1256 1257accept([St|Sts], NFA) -> 1258 case element(St, NFA) of 1259 #nfa_state{accept={accept,A}} -> {accept,A}; 1260 #nfa_state{accept=noaccept} -> accept(Sts, NFA) 1261 end; 1262accept([], _) -> noaccept. 1263 1264%% minimise_dfa(DFA, DfaFirst) -> {DFA,DfaFirst}. 1265%% Minimise the DFA by removing equivalent states. We consider a 1266%% state if both the transitions and the their accept state is the 1267%% same. First repeatedly run throught the DFA state list removing 1268%% equivalent states and updating remaining transitions with 1269%% remaining equivalent state numbers. When no more reductions are 1270%% possible then pack the remaining state numbers to get consecutive 1271%% states. 1272 1273minimise_dfa(DFA0, Df0) -> 1274 case min_dfa(DFA0) of 1275 {DFA1,[]} -> % No reduction! 1276 {DFA2,Rs} = pack_dfa(DFA1), 1277 {min_update(DFA2, Rs),min_use(Df0, Rs)}; 1278 {DFA1,Rs} -> 1279 minimise_dfa(min_update(DFA1, Rs), min_use(Df0, Rs)) 1280 end. 1281 1282min_dfa(DFA) -> min_dfa(DFA, [], []). 1283 1284min_dfa([D|DFA0], Rs0, MDFA) -> 1285 {DFA1,Rs1} = min_delete(DFA0, D#dfa_state.trans, D#dfa_state.accept, 1286 D#dfa_state.no, Rs0, []), 1287 min_dfa(DFA1, Rs1, [D|MDFA]); 1288min_dfa([], Rs, MDFA) -> {MDFA,Rs}. 1289 1290%% min_delete(States, Trans, Action, NewN, Rs, MiniDFA) -> {MiniDFA,Rs}. 1291%% Delete all states with same transactions and action. Return 1292%% rewrites and minimised DFA with no duplicate states. 1293 1294min_delete([#dfa_state{no=N,trans=T,accept=A}|DFA], T, A, NewN, Rs, MDFA) -> 1295 min_delete(DFA, T, A, NewN, [{N,NewN}|Rs], MDFA); 1296min_delete([D|DFA], T, A, NewN, Rs, MDFA) -> 1297 min_delete(DFA, T, A, NewN, Rs, [D|MDFA]); 1298min_delete([], _, _, _, Rs, MDFA) -> {MDFA,Rs}. 1299 1300min_update(DFA, Rs) -> 1301 [ D#dfa_state{trans=min_update_trans(D#dfa_state.trans, Rs)} || D <- DFA ]. 1302 1303min_update_trans(Tr, Rs) -> 1304 [ {C,min_use(S, Rs)} || {C,S} <- Tr ]. 1305 1306min_use(Old, [{Old,New}|_]) -> New; 1307min_use(Old, [_|Reds]) -> min_use(Old, Reds); 1308min_use(Old, []) -> Old. 1309 1310pack_dfa(DFA) -> pack_dfa(DFA, 0, [], []). 1311 1312pack_dfa([D|DFA], NewN, Rs, PDFA) -> 1313 pack_dfa(DFA, NewN+1, 1314 [{D#dfa_state.no,NewN}|Rs], [D#dfa_state{no=NewN}|PDFA]); 1315pack_dfa([], _, Rs, PDFA) -> {PDFA,Rs}. 1316 1317%% The main output is the yystate function which is built from the 1318%% DFA. It has the spec: 1319%% 1320%% yystate() -> InitialState. 1321%% yystate(State, InChars, Line, CurrTokLen, AcceptAction, AcceptLen) -> 1322%% {Action, AcceptLength, RestChars, Line} | Accepting end state 1323%% {Action, AcceptLength, RestChars, Line, State} | Accepting state 1324%% {reject, AcceptLength, CurrTokLen, RestChars, Line, State} | 1325%% {Action, AcceptLength, CurrTokLen, RestChars, Line, State}. 1326 1327%% The return CurrTokLen is always the current number of characters 1328%% scanned in the current token. The returns have the following 1329%% meanings: 1330%% {Action, AcceptLength, RestChars, Line} - 1331%% The scanner has reached an accepting end-state, for example after 1332%% a regexp "abc". Action is the action number and AcceptLength is 1333%% the length of the matching token. 1334%% 1335%% {Action, AcceptLength, RestChars, Line, State} - 1336%% The scanner has reached an accepting transition state, for example 1337%% after c in regexp "abc(xyz)?", continuation depends on 1338%% RestChars. If RestChars == [] (no more current characters) then we 1339%% need to get more characters to see if it is an end-state, 1340%% otherwise (eof or chars) then we have not found continuing 1341%% characters and it is an end state. 1342%% 1343%% {reject, AcceptLength, CurrTokLen, RestChars, Line, State} - 1344%% {Action, AcceptLength, CurrTokLen, RestChars, Line, State} - 1345%% The scanner has reached a non-accepting transition state. If 1346%% RestChars == [] we need to get more characters to continue. 1347%% Otherwise if 'reject' then no accepting state has been reached it 1348%% is an error. If we have an Action and AcceptLength then these are 1349%% the last accept state, use them and continue from there. 1350 1351%% out_file(LeexState, DFA, DfaStart, [Action], Code) -> ok | error. 1352%% Generate an output .erl file from the include file, the DFA and 1353%% the code for the actions. 1354 1355out_file(St0, DFA, DF, Actions, Code) -> 1356 verbose_print(St0, "Writing file ~ts, ", [St0#leex.efile]), 1357 case open_inc_file(St0) of 1358 {ok,Ifile} -> 1359 try 1360 case file:open(St0#leex.efile, [write]) of 1361 {ok,Ofile} -> 1362 set_encoding(St0, Ofile), 1363 try 1364 output_encoding_comment(Ofile, St0), 1365 output_file_directive(Ofile, St0#leex.ifile, 0), 1366 out_file(Ifile, Ofile, St0, DFA, DF, Actions, 1367 Code, 1), 1368 verbose_print(St0, "ok~n", []), 1369 St0 1370 after ok = file:close(Ofile) 1371 end; 1372 {error,Error} -> 1373 verbose_print(St0, "error~n", []), 1374 add_error({none,leex,{file_error,Error}}, St0) 1375 end 1376 after ok = file:close(Ifile) 1377 end; 1378 {{error,Error},Ifile} -> 1379 add_error(Ifile, {none,leex,{file_error,Error}}, St0) 1380 end. 1381 1382open_inc_file(State) -> 1383 Ifile = State#leex.ifile, 1384 case file:open(Ifile, [read]) of 1385 {ok,F} -> 1386 _ = epp:set_encoding(F), 1387 {ok,F}; 1388 Error -> {Error,Ifile} 1389 end. 1390 1391inc_file_name([]) -> 1392 Incdir = filename:join(code:lib_dir(parsetools), "include"), 1393 filename:join(Incdir, ?LEEXINC); 1394inc_file_name(Filename) -> 1395 Filename. 1396 1397%% out_file(IncFile, OutFile, State, DFA, DfaStart, Actions, Code, Line) -> ok 1398%% Copy the include file line by line substituting special lines with 1399%% generated code. We cheat by only looking at the first 5 1400%% characters. 1401 1402out_file(Ifile, Ofile, St, DFA, DF, Actions, Code, L) -> 1403 case io:get_line(Ifile, leex) of 1404 eof -> output_file_directive(Ofile, St#leex.ifile, L); 1405 {error, _} -> add_error(St#leex.ifile, {L, leex, cannot_parse}, St); 1406 Line -> 1407 case string:slice(Line, 0, 5) of 1408 "##mod" -> out_module(Ofile, St); 1409 "##cod" -> out_erlang_code(Ofile, St, Code, L); 1410 "##dfa" -> out_dfa(Ofile, St, DFA, Code, DF, L); 1411 "##act" -> out_actions(Ofile, St#leex.xfile, Actions); 1412 _ -> io:put_chars(Ofile, Line) 1413 end, 1414 out_file(Ifile, Ofile, St, DFA, DF, Actions, Code, L+1) 1415 end. 1416 1417out_module(File, St) -> 1418 io:fwrite(File, "-module(~w).\n", [St#leex.module]). 1419 1420out_erlang_code(File, St, Code, L) -> 1421 {CodeL,CodePos,_NCodeLines} = Code, 1422 output_file_directive(File, St#leex.xfile, CodeL), 1423 {ok,Xfile} = file:open(St#leex.xfile, [read]), 1424 try 1425 set_encoding(St, Xfile), 1426 {ok,_} = file:position(Xfile, CodePos), 1427 ok = file_copy(Xfile, File) 1428 after 1429 ok = file:close(Xfile) 1430 end, 1431 io:nl(File), 1432 output_file_directive(File, St#leex.ifile, L). 1433 1434file_copy(From, To) -> 1435 case io:get_line(From, leex) of 1436 eof -> ok; 1437 Line when is_list(Line) -> 1438 io:fwrite(To, "~ts", [Line]), 1439 file_copy(From, To) 1440 end. 1441 1442out_dfa(File, St, DFA, Code, DF, L) -> 1443 {_CodeL,_CodePos,NCodeLines} = Code, 1444 %% Three file attributes before this one... 1445 output_file_directive(File, St#leex.efile, L+(NCodeLines-1)+3), 1446 io:fwrite(File, "yystate() -> ~w.~n~n", [DF]), 1447 foreach(fun (S) -> out_trans(File, S) end, DFA), 1448 io:fwrite(File, "yystate(S, Ics, Line, Tlen, Action, Alen) ->~n", []), 1449 io:fwrite(File, " {Action,Alen,Tlen,Ics,Line,S}.~n", []). 1450 1451out_trans(File, #dfa_state{no=N,trans=[],accept={accept,A}}) -> 1452 %% Accepting end state, guaranteed done. 1453 io:fwrite(File, "yystate(~w, Ics, Line, Tlen, _, _) ->~n", [N]), 1454 io:fwrite(File, " {~w,Tlen,Ics,Line};~n", [A]); 1455out_trans(File, #dfa_state{no=N,trans=Tr,accept={accept,A}}) -> 1456 %% Accepting state, but there maybe more. 1457 foreach(fun (T) -> out_accept_tran(File, N, A, T) end, pack_trans(Tr)), 1458 io:fwrite(File, "yystate(~w, Ics, Line, Tlen, _, _) ->~n", [N]), 1459 io:fwrite(File, " {~w,Tlen,Ics,Line,~w};~n", [A,N]); 1460out_trans(File, #dfa_state{no=N,trans=Tr,accept=noaccept}) -> 1461 %% Non-accepting transition state. 1462 foreach(fun (T) -> out_noaccept_tran(File, N, T) end, pack_trans(Tr)), 1463 io:fwrite(File, "yystate(~w, Ics, Line, Tlen, Action, Alen) ->~n", [N]), 1464 io:fwrite(File, " {Action,Alen,Tlen,Ics,Line,~w};~n", [N]). 1465 1466out_accept_tran(File, N, A, {{Cf,maxchar},S}) -> 1467 out_accept_head_max(File, N, Cf), 1468 out_accept_body(File, S, "Line", A); 1469out_accept_tran(File, N, A, {{Cf,Cl},S}) -> 1470 out_accept_head_range(File, N, Cf, Cl), 1471 out_accept_body(File, S, "Line", A); 1472out_accept_tran(File, N, A, {$\n,S}) -> 1473 out_accept_head_1(File, N, $\n), 1474 out_accept_body(File, S, "Line+1", A); 1475out_accept_tran(File, N, A, {C,S}) -> 1476 out_accept_head_1(File, N, C), 1477 out_accept_body(File, S, "Line", A). 1478 1479out_accept_head_1(File, State, Char) -> 1480 out_head_1(File, State, Char, "_", "_"). 1481 1482out_accept_head_max(File, State, Min) -> 1483 out_head_max(File, State, Min, "_", "_"). 1484 1485out_accept_head_range(File, State, Min, Max) -> 1486 out_head_range(File, State, Min, Max, "_", "_"). 1487 1488out_accept_body(File, Next, Line, Action) -> 1489 out_body(File, Next, Line, io_lib:write(Action), "Tlen"). 1490 1491out_noaccept_tran(File, N, {{Cf,maxchar},S}) -> 1492 out_noaccept_head_max(File, N, Cf), 1493 out_noaccept_body(File, S, "Line"); 1494out_noaccept_tran(File, N, {{Cf,Cl},S}) -> 1495 out_noaccept_head_range(File, N, Cf, Cl), 1496 out_noaccept_body(File, S, "Line"); 1497out_noaccept_tran(File, N, {$\n,S}) -> 1498 out_noaccept_head_1(File, N, $\n), 1499 out_noaccept_body(File, S, "Line+1"); 1500out_noaccept_tran(File, N, {C,S}) -> 1501 out_noaccept_head_1(File, N, C), 1502 out_noaccept_body(File, S, "Line"). 1503 1504out_noaccept_head_1(File, State, Char) -> 1505 out_head_1(File, State, Char, "Action", "Alen"). 1506 1507out_noaccept_head_max(File, State, Min) -> 1508 out_head_max(File, State, Min, "Action", "Alen"). 1509 1510out_noaccept_head_range(File, State, Min, Max) -> 1511 out_head_range(File, State, Min, Max, "Action", "Alen"). 1512 1513out_noaccept_body(File, Next, Line) -> 1514 out_body(File, Next, Line, "Action", "Alen"). 1515 1516out_head_1(File, State, Char, Action, Alen) -> 1517 io:fwrite(File, "yystate(~w, [~w|Ics], Line, Tlen, ~s, ~s) ->\n", 1518 [State,Char,Action,Alen]). 1519 1520out_head_max(File, State, Min, Action, Alen) -> 1521 io:fwrite(File, "yystate(~w, [C|Ics], Line, Tlen, ~s, ~s) when C >= ~w ->\n", 1522 [State,Action,Alen,Min]). 1523 1524out_head_range(File, State, Min, Max, Action, Alen) -> 1525 io:fwrite(File, "yystate(~w, [C|Ics], Line, Tlen, ~s, ~s) when C >= ~w, C =< ~w ->\n", 1526 [State,Action,Alen,Min,Max]). 1527 1528out_body(File, Next, Line, Action, Alen) -> 1529 io:fwrite(File, " yystate(~w, Ics, ~s, Tlen+1, ~s, ~s);\n", 1530 [Next,Line,Action,Alen]). 1531 1532%% pack_trans([{Crange,State}]) -> [{Crange,State}] when 1533%% Crange = {Char,Char} | Char. 1534%% Pack the translation table into something more suitable for 1535%% generating code. We KNOW how the pattern matching compiler works 1536%% so solitary characters are stored before ranges. We do this by 1537%% prepending singletons to the front of the packed transitions and 1538%% appending ranges to the back. This preserves the smallest to 1539%% largest order of ranges. Newline characters, $\n, are always 1540%% extracted and handled as singeltons. 1541 1542pack_trans(Trs) -> pack_trans(Trs, []). 1543 1544%% pack_trans(Trs) -> 1545%% Trs1 = pack_trans(Trs, []), 1546%% io:fwrite("tr:~p\n=> ~p\n", [Trs,Trs1]), 1547%% Trs1. 1548 1549pack_trans([{{C,C},S}|Trs], Pt) -> % Singletons to the head 1550 case lists:member({C,S}, Pt) of 1551 true -> pack_trans(Trs, Pt); 1552 false -> pack_trans(Trs, [{C,S}|Pt]) 1553 end; 1554%% Special detection and handling of $\n. 1555pack_trans([{{Cf,$\n},S}|Trs], Pt) -> 1556 pack_trans([{{Cf,$\n-1},S}|Trs], [{$\n,S}|Pt]); 1557pack_trans([{{$\n,Cl},S}|Trs], Pt) -> 1558 pack_trans([{{$\n+1,Cl},S}|Trs], [{$\n,S}|Pt]); 1559pack_trans([{{Cf,Cl},S}|Trs], Pt) when Cf < $\n, Cl > $\n -> 1560 pack_trans([{{Cf,$\n-1},S},{{$\n+1,Cl},S}|Trs], [{$\n,S}|Pt]); 1561%% Small ranges become singletons. 1562pack_trans([{{Cf,Cl},S}|Trs], Pt) when Cl =:= Cf + 1 -> 1563 pack_trans(Trs, [{Cf,S},{Cl,S}|Pt]); 1564pack_trans([Tr|Trs], Pt) -> % The default uninteresting case 1565 pack_trans(Trs, Pt ++ [Tr]); 1566pack_trans([], Pt) -> Pt. 1567 1568%% out_actions(File, XrlFile, ActionList) -> ok. 1569%% Write out the action table. 1570 1571out_actions(File, XrlFile, As) -> 1572 As1 = prep_out_actions(As), 1573 foreach(fun (A) -> out_action(File, A) end, As1), 1574 io:fwrite(File, "yyaction(_, _, _, _) -> error.~n", []), 1575 foreach(fun (A) -> out_action_code(File, XrlFile, A) end, As1). 1576 1577prep_out_actions(As) -> 1578 map(fun ({A,empty_action}) -> 1579 {A,empty_action}; 1580 ({A,Code,TokenChars,TokenLen,TokenLine}) -> 1581 Vs = [{TokenChars,"TokenChars"}, 1582 {TokenLen,"TokenLen"}, 1583 {TokenLine,"TokenLine"}, 1584 {TokenChars,"YYtcs"}, 1585 {TokenLen or TokenChars,"TokenLen"}], 1586 Vars = [if F -> S; true -> "_" end || {F,S} <- Vs], 1587 Name = list_to_atom(lists:concat([yyaction_,A])), 1588 [Chars,Len,Line,_,_] = Vars, 1589 Args = [V || V <- [Chars,Len,Line], V =/= "_"], 1590 ArgsChars = lists:join(", ", Args), 1591 {A,Code,Vars,Name,Args,ArgsChars} 1592 end, As). 1593 1594out_action(File, {A,empty_action}) -> 1595 io:fwrite(File, "yyaction(~w, _, _, _) -> skip_token;~n", [A]); 1596out_action(File, {A,_Code,Vars,Name,_Args,ArgsChars}) -> 1597 [_,_,Line,Tcs,Len] = Vars, 1598 io:fwrite(File, "yyaction(~w, ~s, ~s, ~s) ->~n", [A,Len,Tcs,Line]), 1599 if 1600 Tcs =/= "_" -> 1601 io:fwrite(File, " TokenChars = yypre(YYtcs, TokenLen),~n", []); 1602 true -> ok 1603 end, 1604 io:fwrite(File, " ~s(~s);~n", [Name, ArgsChars]). 1605 1606out_action_code(_File, _XrlFile, {_A,empty_action}) -> 1607 ok; 1608out_action_code(File, XrlFile, {_A,Code,_Vars,Name,Args,ArgsChars}) -> 1609 %% Should set the file to the .erl file, but instead assumes that 1610 %% ?LEEXINC is syntactically correct. 1611 io:fwrite(File, "\n-compile({inline,~w/~w}).\n", [Name, length(Args)]), 1612 L = erl_scan:line(hd(Code)), 1613 output_file_directive(File, XrlFile, L-2), 1614 io:fwrite(File, "~s(~s) ->~n", [Name, ArgsChars]), 1615 io:fwrite(File, " ~ts\n", [pp_tokens(Code, L, File)]). 1616 1617%% pp_tokens(Tokens, Line, File) -> [char()]. 1618%% Prints the tokens keeping the line breaks of the original code. 1619 1620pp_tokens(Tokens, Line0, File) -> pp_tokens(Tokens, Line0, File, none). 1621 1622pp_tokens([], _Line0, _, _) -> []; 1623pp_tokens([T | Ts], Line0, File, Prev) -> 1624 Line = erl_scan:line(T), 1625 [pp_sep(Line, Line0, Prev, T), 1626 pp_symbol(T, File) | pp_tokens(Ts, Line, File, T)]. 1627 1628pp_symbol({var,_,Var}, _) -> atom_to_list(Var); 1629pp_symbol({_,_,Symbol}, File) -> format_symbol(Symbol, File); 1630pp_symbol({dot, _}, _) -> "."; 1631pp_symbol({Symbol, _}, _) -> atom_to_list(Symbol). 1632 1633pp_sep(Line, Line0, Prev, T) when Line > Line0 -> 1634 ["\n " | pp_sep(Line - 1, Line0, Prev, T)]; 1635pp_sep(_, _, {'.',_}, _) -> ""; % No space after '.' (not a dot) 1636pp_sep(_, _, {'#',_}, _) -> ""; % No space after '#' 1637pp_sep(_, _, {'(',_}, _) -> ""; % No space after '(' 1638pp_sep(_, _, {'[',_}, _) -> ""; % No space after '[' 1639pp_sep(_, _, _, {'.',_}) -> ""; % No space before '.' 1640pp_sep(_, _, _, {'#',_}) -> ""; % No space before '#' 1641pp_sep(_, _, _, {',',_}) -> ""; % No space before ',' 1642pp_sep(_, _, _, {')',_}) -> ""; % No space before ')' 1643pp_sep(_, _, _, _) -> " ". 1644 1645%% out_dfa_graph(LeexState, DFA, DfaStart) -> ok | error. 1646%% Writes the DFA to a .dot file in DOT-format which can be viewed 1647%% with Graphviz. 1648 1649out_dfa_graph(St, DFA, DF) -> 1650 verbose_print(St, "Writing DFA to file ~ts, ", [St#leex.gfile]), 1651 case file:open(St#leex.gfile, [write]) of 1652 {ok,Gfile} -> 1653 try 1654 %% Set the same encoding as infile: 1655 set_encoding(St, Gfile), 1656 io:fwrite(Gfile, "digraph DFA {~n", []), 1657 out_dfa_states(Gfile, DFA, DF), 1658 out_dfa_edges(Gfile, DFA), 1659 io:fwrite(Gfile, "}~n", []), 1660 verbose_print(St, "ok~n", []), 1661 St 1662 after ok = file:close(Gfile) 1663 end; 1664 {error,Error} -> 1665 verbose_print(St, "error~n", []), 1666 add_error({none,leex,{file_error,Error}}, St) 1667 end. 1668 1669out_dfa_states(File, DFA, DF) -> 1670 foreach(fun (S) -> out_dfa_state(File, DF, S) end, DFA), 1671 io:fwrite(File, "~n", []). 1672 1673out_dfa_state(File, DF, #dfa_state{no=DF, accept={accept,_}}) -> 1674 io:fwrite(File, " ~b [shape=doublecircle color=green];~n", [DF]); 1675out_dfa_state(File, DF, #dfa_state{no=DF, accept=noaccept}) -> 1676 io:fwrite(File, " ~b [shape=circle color=green];~n", [DF]); 1677out_dfa_state(File, _, #dfa_state{no=S, accept={accept,_}}) -> 1678 io:fwrite(File, " ~b [shape=doublecircle];~n", [S]); 1679out_dfa_state(File, _, #dfa_state{no=S, accept=noaccept}) -> 1680 io:fwrite(File, " ~b [shape=circle];~n", [S]). 1681 1682out_dfa_edges(File, DFA) -> 1683 foreach(fun (#dfa_state{no=S,trans=Trans}) -> 1684 Pt = pack_trans(Trans), 1685 Tdict = foldl(fun ({Cr,T}, D) -> 1686 orddict:append(T, Cr, D) 1687 end, orddict:new(), Pt), 1688 foreach(fun (T) -> 1689 Crs = orddict:fetch(T, Tdict), 1690 Edgelab = dfa_edgelabel(Crs, File), 1691 io:fwrite(File, " ~b -> ~b [label=\"~ts\"];~n", 1692 [S,T,Edgelab]) 1693 end, sort(orddict:fetch_keys(Tdict))) 1694 end, DFA). 1695 1696dfa_edgelabel([C], File) when is_integer(C) -> quote(C, File); 1697dfa_edgelabel(Cranges, File) -> 1698 %% io:fwrite("el: ~p\n", [Cranges]), 1699 "[" ++ map(fun ({A,B}) -> [quote(A, File), "-", quote(B, File)]; 1700 (C) -> [quote(C, File)] 1701 end, Cranges) ++ "]". 1702 1703set_encoding(#leex{encoding = none}, File) -> 1704 ok = io:setopts(File, [{encoding, epp:default_encoding()}]); 1705set_encoding(#leex{encoding = E}, File) -> 1706 ok = io:setopts(File, [{encoding, E}]). 1707 1708output_encoding_comment(_File, #leex{encoding = none}) -> 1709 ok; 1710output_encoding_comment(File, #leex{encoding = Encoding}) -> 1711 io:fwrite(File, <<"%% ~s\n">>, [epp:encoding_to_string(Encoding)]). 1712 1713output_file_directive(File, Filename, Line) -> 1714 io:fwrite(File, <<"-file(~ts, ~w).\n">>, 1715 [format_filename(Filename, File), Line]). 1716 1717format_filename(Filename0, File) -> 1718 Filename = filename:flatten(Filename0), 1719 case enc(File) of 1720 unicode -> io_lib:write_string(Filename); 1721 latin1 -> io_lib:write_string_as_latin1(Filename) 1722 end. 1723 1724format_symbol(Symbol, File) -> 1725 Format = case enc(File) of 1726 latin1 -> "~p"; 1727 unicode -> "~tp" 1728 end, 1729 io_lib:fwrite(Format, [Symbol]). 1730 1731enc(File) -> 1732 case lists:keyfind(encoding, 1, io:getopts(File)) of 1733 false -> latin1; % should never happen 1734 {encoding, Enc} -> Enc 1735 end. 1736 1737quote($^, _File) -> "\\^"; 1738quote($., _File) -> "\\."; 1739quote($$, _File) -> "\\$"; 1740quote($-, _File) -> "\\-"; 1741quote($[, _File) -> "\\["; 1742quote($], _File) -> "\\]"; 1743quote($\s, _File) -> "\\\\s"; 1744quote($\", _File) -> "\\\""; 1745quote($\b, _File) -> "\\\\b"; 1746quote($\f, _File) -> "\\\\f"; 1747quote($\n, _File) -> "\\\\n"; 1748quote($\r, _File) -> "\\\\r"; 1749quote($\t, _File) -> "\\\\t"; 1750quote($\e, _File) -> "\\\\e"; 1751quote($\v, _File) -> "\\\\v"; 1752quote($\d, _File) -> "\\\\d"; 1753quote($\\, _File) -> "\\\\"; 1754quote(C, File) when is_integer(C) -> 1755 %% Must remove the $ and get the \'s right. 1756 S = case enc(File) of 1757 unicode -> io_lib:write_char(C); 1758 latin1 -> io_lib:write_char_as_latin1(C) 1759 end, 1760 case S of 1761 [$$,$\\|Cs] -> "\\\\" ++ Cs; 1762 [$$|Cs] -> Cs 1763 end; 1764quote(maxchar, _File) -> 1765 "MAXCHAR". 1766