1%% Copyright (c) 2008,2009 Robert Virding. All rights reserved. 2%% 3%% Redistribution and use in source and binary forms, with or without 4%% modification, are permitted provided that the following conditions 5%% are met: 6%% 7%% 1. Redistributions of source code must retain the above copyright 8%% notice, this list of conditions and the following disclaimer. 9%% 2. Redistributions in binary form must reproduce the above copyright 10%% notice, this list of conditions and the following disclaimer in the 11%% documentation and/or other materials provided with the distribution. 12%% 13%% THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 14%% "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 15%% LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 16%% FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 17%% COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 18%% INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 19%% BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20%% LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 21%% CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22%% LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 23%% ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 24%% POSSIBILITY OF SUCH DAMAGE. 25 26%%% A Lexical Analyser Generator for Erlang. 27%%% 28%%% Most of the algorithms used here are taken pretty much as 29%%% described in the "Dragon Book" by Aho, Sethi and Ullman. Some 30%%% completing details were taken from "Compiler Design in C" by 31%%% Hollub. 32 33-module(leex). 34 35-export([compile/3,file/1,file/2,format_error/1]). 36 37-import(lists, [member/2,reverse/1,sort/1,delete/2, 38 keysort/2,keydelete/3, 39 map/2,foldl/3,foreach/2,flatmap/2]). 40-import(ordsets, [is_element/2,add_element/2,union/2]). 41-import(orddict, [store/3]). 42 43-include("erl_compile.hrl"). 44%%-include_lib("stdlib/include/erl_compile.hrl"). 45 46-define(LEEXINC, "leexinc.hrl"). % Include file 47-define(LEEXLIB, parsetools). % Leex is in lib parsetools 48%%-define(LEEXLIB, leex). % Leex is in lib leex 49 50-define(DEFS_HEAD, "Definitions."). 51-define(RULE_HEAD, "Rules."). 52-define(CODE_HEAD, "Erlang code."). 53 54-record(leex, {xfile=[], % Xrl file 55 efile=[], % Erl file 56 ifile=[], % Include file 57 gfile=[], % Graph file 58 module, % Module name 59 opts=[], % Options 60 encoding=none, % Encoding of Xrl file 61 % posix=false, % POSIX regular expressions 62 errors=[], 63 warnings=[] 64 }). 65 66-record(nfa_state, {no,edges=[],accept=noaccept}). 67-record(dfa_state, {no,nfa=[],trans=[],accept=noaccept}). 68 69%%% 70%%% Exported functions 71%%% 72 73%%% Interface to erl_compile. 74 75compile(Input0, Output0, 76 #options{warning = WarnLevel, verbose=Verbose, includes=Includes, 77 specific=Specific}) -> 78 Input = assure_extension(shorten_filename(Input0), ".xrl"), 79 Output = assure_extension(shorten_filename(Output0), ".erl"), 80 Includefile = lists:sublist(Includes, 1), 81 Werror = proplists:get_bool(warnings_as_errors, Specific), 82 Opts = [{scannerfile,Output},{includefile,Includefile},{verbose,Verbose}, 83 {report_errors,true},{report_warnings,WarnLevel > 0}, 84 {warnings_as_errors, Werror}], 85 case file(Input, Opts) of 86 {ok, _} -> 87 ok; 88 error -> 89 error 90 end. 91 92%% file(File) -> ok | error. 93%% file(File, Options) -> ok | error. 94 95file(File) -> file(File, []). 96 97file(File, Opts0) -> 98 case is_filename(File) of 99 no -> erlang:error(badarg, [File,Opts0]); 100 _ -> ok 101 end, 102 Opts = case options(Opts0) of 103 badarg -> 104 erlang:error(badarg, [File,Opts0]); 105 Options -> 106 Options 107 end, 108 St0 = #leex{}, 109 St1 = filenames(File, Opts, St0), % Get all the filenames 110 St = try 111 {ok,REAs,Actions,Code,St2} = parse_file(St1), 112 {DFA,DF} = make_dfa(REAs, St2), 113 case werror(St2) of 114 false -> 115 St3 = out_file(St2, DFA, DF, Actions, Code), 116 case lists:member(dfa_graph, St3#leex.opts) of 117 true -> out_dfa_graph(St3, DFA, DF); 118 false -> St3 119 end; 120 true -> 121 St2 122 end 123 catch #leex{}=St4 -> 124 St4 125 end, 126 leex_ret(St). 127 128format_error({file_error, Reason}) -> 129 io_lib:fwrite("~ts",[file:format_error(Reason)]); 130format_error(missing_defs) -> "missing Definitions"; 131format_error(missing_rules) -> "missing Rules"; 132format_error(missing_code) -> "missing Erlang code"; 133format_error(empty_rules) -> "no rules"; 134format_error(bad_rule) -> "bad rule"; 135format_error({regexp,E})-> 136 Es = case E of 137 {interval_range,_} -> "interval range"; 138 {unterminated,Cs} -> 139 "unterminated " ++ Cs; 140 {illegal_char,Cs} -> 141 "illegal character " ++ Cs; 142%% {posix_cc,What} -> 143%% ["illegal POSIX character class ",io_lib:write_string(What)]; 144 {char_class,What} -> 145 ["illegal character class ",io_lib:write_string(What)] 146 end, 147 ["bad regexp `",Es,"'"]; 148format_error(ignored_characters) -> 149 "ignored characters"; 150format_error(cannot_parse) -> 151 io_lib:fwrite("cannot parse; probably encoding mismatch", []). 152 153%%% 154%%% Local functions 155%%% 156 157assure_extension(File, Ext) -> 158 lists:concat([strip_extension(File, Ext), Ext]). 159 160%% Assumes File is a filename. 161strip_extension(File, Ext) -> 162 case filename:extension(File) of 163 Ext -> filename:rootname(File); 164 _Other -> File 165 end. 166 167options(Options0) when is_list(Options0) -> 168 try 169 Options = flatmap(fun(return) -> short_option(return, true); 170 (report) -> short_option(report, true); 171 ({return,T}) -> short_option(return, T); 172 ({report,T}) -> short_option(report, T); 173 (T) -> [T] 174 end, Options0), 175 options(Options, [scannerfile,includefile,report_errors, 176 report_warnings,warnings_as_errors, 177 return_errors,return_warnings, 178 verbose,dfa_graph], []) 179 catch error: _ -> badarg 180 end; 181options(Option) -> 182 options([Option]). 183 184short_option(return, T) -> 185 [{return_errors,T}, {return_warnings,T}]; 186short_option(report, T) -> 187 [{report_errors,T}, {report_warnings,T}]. 188 189options(Options0, [Key|Keys], L) when is_list(Options0) -> 190 Options = case member(Key, Options0) of 191 true -> 192 [atom_option(Key)|delete(Key, Options0)]; 193 false -> 194 Options0 195 end, 196 V = case lists:keyfind(Key, 1, Options) of 197 {Key, Filename0} when Key =:= includefile; 198 Key =:= scannerfile -> 199 case is_filename(Filename0) of 200 no -> 201 badarg; 202 Filename -> 203 {ok,[{Key,Filename}]} 204 end; 205 {Key, Bool} = KB when is_boolean(Bool) -> 206 {ok, [KB]}; 207 {Key, _} -> 208 badarg; 209 false -> 210 {ok,[{Key,default_option(Key)}]} 211 end, 212 case V of 213 badarg -> 214 badarg; 215 {ok,KeyValueL} -> 216 NewOptions = keydelete(Key, 1, Options), 217 options(NewOptions, Keys, KeyValueL ++ L) 218 end; 219options([], [], L) -> 220 foldl(fun({_,false}, A) -> A; 221 ({Tag,true}, A) -> [Tag|A]; 222 (F,A) -> [F|A] 223 end, [], L); 224options(_Options, _, _L) -> 225 badarg. 226 227default_option(dfa_graph) -> false; 228default_option(includefile) -> []; 229default_option(report_errors) -> true; 230default_option(report_warnings) -> true; 231default_option(warnings_as_errors) -> false; 232default_option(return_errors) -> false; 233default_option(return_warnings) -> false; 234default_option(scannerfile) -> []; 235default_option(verbose) -> false. 236 237atom_option(dfa_graph) -> {dfa_graph,true}; 238atom_option(report_errors) -> {report_errors,true}; 239atom_option(report_warnings) -> {report_warnings,true}; 240atom_option(warnings_as_errors) -> {warnings_as_errors,true}; 241atom_option(return_errors) -> {return_errors,true}; 242atom_option(return_warnings) -> {return_warnings,true}; 243atom_option(verbose) -> {verbose,true}; 244atom_option(Key) -> Key. 245 246is_filename(T) -> 247 try filename:flatten(T) 248 catch error: _ -> no 249 end. 250 251shorten_filename(Name0) -> 252 {ok,Cwd} = file:get_cwd(), 253 case string:prefix(Name0, Cwd) of 254 nomatch -> Name0; 255 Rest -> 256 case unicode:characters_to_list(Rest) of 257 "/"++N -> N; 258 N -> N 259 end 260 end. 261 262leex_ret(St) -> 263 report_errors(St), 264 report_warnings(St), 265 Es = pack_errors(St#leex.errors), 266 Ws = pack_warnings(St#leex.warnings), 267 Werror = werror(St), 268 if 269 Werror -> 270 do_error_return(St, Es, Ws); 271 Es =:= [] -> 272 case member(return_warnings, St#leex.opts) of 273 true -> {ok, St#leex.efile, Ws}; 274 false -> {ok, St#leex.efile} 275 end; 276 true -> 277 do_error_return(St, Es, Ws) 278 end. 279 280do_error_return(St, Es, Ws) -> 281 case member(return_errors, St#leex.opts) of 282 true -> {error, Es, Ws}; 283 false -> error 284 end. 285 286werror(St) -> 287 St#leex.warnings =/= [] 288 andalso member(warnings_as_errors, St#leex.opts). 289 290pack_errors([{File,_} | _] = Es) -> 291 [{File, flatmap(fun({_,E}) -> [E] end, sort(Es))}]; 292pack_errors([]) -> 293 []. 294 295pack_warnings([{File,_} | _] = Ws) -> 296 [{File, flatmap(fun({_,W}) -> [W] end, sort(Ws))}]; 297pack_warnings([]) -> 298 []. 299 300report_errors(St) -> 301 when_opt(fun () -> 302 foreach(fun({File,{none,Mod,E}}) -> 303 io:fwrite("~ts: ~ts\n", 304 [File,Mod:format_error(E)]); 305 ({File,{Line,Mod,E}}) -> 306 io:fwrite("~ts:~w: ~ts\n", 307 [File,Line,Mod:format_error(E)]) 308 end, sort(St#leex.errors)) 309 end, report_errors, St#leex.opts). 310 311report_warnings(St) -> 312 Werror = member(warnings_as_errors, St#leex.opts), 313 Prefix = case Werror of 314 true -> ""; 315 false -> "Warning: " 316 end, 317 ReportWerror = Werror andalso member(report_errors, St#leex.opts), 318 ShouldReport = member(report_warnings, St#leex.opts) orelse ReportWerror, 319 when_bool(fun () -> 320 foreach(fun({File,{none,Mod,W}}) -> 321 io:fwrite("~ts: ~s~ts\n", 322 [File,Prefix, 323 Mod:format_error(W)]); 324 ({File,{Line,Mod,W}}) -> 325 io:fwrite("~ts:~w: ~s~ts\n", 326 [File,Line,Prefix, 327 Mod:format_error(W)]) 328 end, sort(St#leex.warnings)) 329 end, ShouldReport). 330 331-spec add_error(_, #leex{}) -> no_return(). 332add_error(E, St) -> 333 add_error(St#leex.xfile, E, St). 334 335add_error(File, Error, St) -> 336 throw(St#leex{errors = [{File,Error}|St#leex.errors]}). 337 338add_warning(Line, W, St) -> 339 St#leex{warnings = [{St#leex.xfile,{Line,leex,W}}|St#leex.warnings]}. 340 341%% filenames(File, Options, State) -> State. 342%% The default output dir is the current directory unless an 343%% explicit one has been given in the options. 344 345filenames(File, Opts, St0) -> 346 Dir = filename:dirname(File), 347 Base = filename:basename(File, ".xrl"), 348 Xfile = filename:join(Dir, Base ++ ".xrl"), 349 Efile = Base ++ ".erl", 350 Gfile = Base ++ ".dot", 351 Module = list_to_atom(Base), 352 St1 = St0#leex{xfile=Xfile, 353 opts=Opts, 354 module=Module}, 355 {includefile,Ifile0} = lists:keyfind(includefile, 1, Opts), 356 Ifile = inc_file_name(Ifile0), 357 %% Test for explicit scanner file. 358 {scannerfile,Ofile} = lists:keyfind(scannerfile, 1, Opts), 359 if 360 Ofile =:= [] -> 361 St1#leex{efile=filename:join(Dir, Efile), 362 ifile=Ifile, 363 gfile=filename:join(Dir, Gfile)}; 364 true -> 365 D = filename:dirname(Ofile), 366 St1#leex{efile=Ofile, 367 ifile=Ifile, 368 gfile=filename:join(D, Gfile)} 369 end. 370 371when_opt(Do, Opt, Opts) -> 372 case member(Opt, Opts) of 373 true -> Do(); 374 false -> ok 375 end. 376 377when_bool(Do, Bool) -> 378 case Bool of 379 true -> Do(); 380 false -> ok 381 end. 382 383verbose_print(St, Format, Args) -> 384 when_opt(fun () -> io:fwrite(Format, Args) end, verbose, St#leex.opts). 385 386%% parse_file(State) -> {ok,[REA],[Action],Code,NewState} | throw(NewState) 387%% when 388%% REA = {RegExp,ActionNo}; 389%% Action = {ActionNo,ActionString}; 390%% Code = {StartLine, StartPos, NumOfLines}. Where the Erlang code is. 391%% 392%% Read and parse the file Xfile. 393%% After each section of the file has been parsed we directly call the 394%% next section. This is done when we detect a line we don't recognise 395%% in the current section. The file format is very simple and Erlang 396%% token based, we allow empty lines and Erlang style comments. 397 398parse_file(St0) -> 399 case file:open(St0#leex.xfile, [read]) of 400 {ok,Xfile} -> 401 St1 = St0#leex{encoding = epp:set_encoding(Xfile)}, 402 try 403 verbose_print(St1, "Parsing file ~ts, ", [St1#leex.xfile]), 404 %% We KNOW that errors throw so we can ignore them here. 405 {ok,Line1,St2} = parse_head(Xfile, St1), 406 {ok,Line2,Macs,St3} = parse_defs(Xfile, Line1, St2), 407 {ok,Line3,REAs,Actions,St4} = 408 parse_rules(Xfile, Line2, Macs, St3), 409 {ok,Code,St5} = parse_code(Xfile, Line3, St4), 410 verbose_print(St5, "contained ~w rules.~n", [length(REAs)]), 411 {ok,REAs,Actions,Code,St5} 412 after ok = file:close(Xfile) 413 end; 414 {error,Error} -> 415 add_error({none,leex,{file_error,Error}}, St0) 416 end. 417 418%% parse_head(File, State) -> {ok,NextLine,State}. 419%% Parse the head of the file. Skip all comments and blank lines. 420 421parse_head(Ifile, St) -> {ok,nextline(Ifile, 0, St),St}. 422 423%% parse_defs(File, Line, State) -> {ok,NextLine,Macros,State}. 424%% Parse the macro definition section of a file. This must exist. 425%% The section is ended by a non-blank line which is not a macro def. 426 427parse_defs(Ifile, {ok,?DEFS_HEAD ++ Rest,L}, St) -> 428 St1 = warn_ignored_chars(L, Rest, St), 429 parse_defs(Ifile, nextline(Ifile, L, St), [], St1); 430parse_defs(_, {ok,_,L}, St) -> 431 add_error({L,leex,missing_defs}, St); 432parse_defs(_, {eof,L}, St) -> 433 add_error({L,leex,missing_defs}, St). 434 435parse_defs(Ifile, {ok,Chars,L}=Line, Ms, St) -> 436 %% This little beauty matches out a macro definition, RE's are so clear. 437 MS = "^[ \t]*([A-Z_][A-Za-z0-9_]*)[ \t]*=[ \t]*([^ \t\r\n]*)[ \t\r\n]*\$", 438 case re:run(Chars, MS, [{capture,all_but_first,list},unicode]) of 439 {match,[Name,Def]} -> 440 %%io:fwrite("~p = ~p\n", [Name,Def]), 441 parse_defs(Ifile, nextline(Ifile, L, St), [{Name,Def}|Ms], St); 442 _ -> {ok,Line,Ms,St} % Anything else 443 end; 444parse_defs(_, Line, Ms, St) -> 445 {ok,Line,Ms,St}. 446 447%% parse_rules(File, Line, Macros, State) -> {ok,NextLine,REAs,Actions,State}. 448%% Parse the RE rules section of the file. This must exist. 449 450parse_rules(Ifile, {ok,?RULE_HEAD ++ Rest,L}, Ms, St) -> 451 St1 = warn_ignored_chars(L, Rest, St), 452 parse_rules(Ifile, nextline(Ifile, L, St), Ms, [], [], 0, St1); 453parse_rules(_, {ok,_,L}, _, St) -> 454 add_error({L,leex,missing_rules}, St); 455parse_rules(_, {eof,L}, _, St) -> 456 add_error({L,leex,missing_rules}, St). 457 458%% parse_rules(File, Result, Macros, RegExpActions, Actions, Acount, State) -> 459%% {ok,NextCLine,RegExpActions,Actions,NewState} | throw(NewState) 460 461parse_rules(Ifile, NextLine, Ms, REAs, As, N, St) -> 462 case NextLine of 463 {ok,?CODE_HEAD ++ _Rest,_} -> 464 parse_rules_end(Ifile, NextLine, REAs, As, St); 465 {ok,Chars,L0} -> 466 %%io:fwrite("~w: ~p~n", [L0,Chars]), 467 case collect_rule(Ifile, Chars, L0) of 468 {ok,Re,Atoks,L1} -> 469 {ok,REA,A,St1} = parse_rule(Re, L0, Atoks, Ms, N, St), 470 parse_rules(Ifile, nextline(Ifile, L1, St), Ms, 471 [REA|REAs], [A|As], N+1, St1); 472 {error,E} -> add_error(E, St) 473 end; 474 {eof,_} -> 475 parse_rules_end(Ifile, NextLine, REAs, As, St) 476 end. 477 478parse_rules_end(_, {ok,_,L}, [], [], St) -> 479 add_error({L,leex,empty_rules}, St); 480parse_rules_end(_, {eof,L}, [], [], St) -> 481 add_error({L,leex,empty_rules}, St); 482parse_rules_end(_, NextLine, REAs, As, St) -> 483 %% Must be *VERY* careful to put rules in correct order! 484 {ok,NextLine,reverse(REAs),reverse(As),St}. 485 486%% collect_rule(File, Line, Lineno) -> 487%% {ok,RegExp,ActionTokens,NewLineno} | {error,E}. 488%% Collect a complete rule by reading lines until the the regexp and 489%% action has been read. Keep track of line number. 490 491collect_rule(Ifile, Chars, L0) -> 492 {RegExp,Rest} = string:take(Chars, " \t\r\n", true), 493 case collect_action(Ifile, Rest, L0, []) of 494 {ok,[{':',_}|Toks],L1} -> {ok,RegExp,Toks,L1}; 495 {ok,_,_} -> {error,{L0,leex,bad_rule}}; 496 {eof,L1} -> {error,{L1,leex,bad_rule}}; 497 {error,E,_} -> {error,E} 498 end. 499 500collect_action(_Ifile, {error, _}, L, _Cont0) -> 501 {error, {L, leex, cannot_parse}, ignored_end_line}; 502collect_action(Ifile, Chars, L0, Cont0) -> 503 case erl_scan:tokens(Cont0, Chars, L0) of 504 {done,{ok,Toks,_},_} -> {ok,Toks,L0}; 505 {done,{eof,_},_} -> {eof,L0}; 506 {done,{error,E,_},_} -> {error,E,L0}; 507 {more,Cont1} -> 508 collect_action(Ifile, io:get_line(Ifile, leex), L0+1, Cont1) 509 end. 510 511%% parse_rule(RegExpString, RegExpLine, ActionTokens, Macros, Counter, State) -> 512%% {ok,{RE,Action},ActionData,State}. 513%% Parse one regexp after performing macro substition. 514 515parse_rule(S, Line, [{dot,_}], Ms, N, St) -> 516 case parse_rule_regexp(S, Ms, St) of 517 {ok,R} -> 518 {ok,{R,N},{N,empty_action},St}; 519 {error,E} -> 520 add_error({Line,leex,E}, St) 521 end; 522parse_rule(S, Line, Atoks, Ms, N, St) -> 523 case parse_rule_regexp(S, Ms, St) of 524 {ok,R} -> 525 %%io:fwrite("RE = ~p~n", [R]), 526 %% Check for token variables. 527 TokenChars = var_used('TokenChars', Atoks), 528 TokenLen = var_used('TokenLen', Atoks), 529 TokenLine = var_used('TokenLine', Atoks), 530 {ok,{R,N},{N,Atoks,TokenChars,TokenLen,TokenLine},St}; 531 {error,E} -> 532 add_error({Line,leex,E}, St) 533 end. 534 535var_used(Name, Toks) -> 536 case lists:keyfind(Name, 3, Toks) of 537 {var,_,Name} -> true; %It's the var we want 538 _ -> false 539 end. 540 541%% parse_rule_regexp(RegExpString, Macros, State) -> 542%% {ok,RegExp} | {error,Error}. 543%% Substitute in macros and parse RegExpString. Cannot use re:replace 544%% here as it uses info in replace string (&). 545 546parse_rule_regexp(RE0, [{M,Exp}|Ms], St) -> 547 Split= re:split(RE0, "\\{" ++ M ++ "\\}", [{return,list},unicode]), 548 RE1 = lists:append(lists:join(Exp, Split)), 549 parse_rule_regexp(RE1, Ms, St); 550parse_rule_regexp(RE, [], St) -> 551 %%io:fwrite("RE = ~p~n", [RE]), 552 case re_parse(RE, St) of 553 {ok,R} -> {ok,R}; 554 {error,E} -> {error,{regexp,E}} 555 end. 556 557%% parse_code(File, Line, State) -> {ok,Code,NewState}. 558%% Finds the line and the position where the code section of the file 559%% begins. This must exist. 560 561parse_code(Ifile, {ok,?CODE_HEAD ++ Rest,CodeL}, St) -> 562 St1 = warn_ignored_chars(CodeL, Rest, St), 563 {ok, CodePos} = file:position(Ifile, cur), 564 %% Just count the lines; copy the code from file to file later. 565 EndCodeLine = count_lines(Ifile, CodeL, St), 566 NCodeLines = EndCodeLine - CodeL, 567 {ok,{CodeL,CodePos,NCodeLines},St1}; 568parse_code(_, {ok,_,L}, St) -> 569 add_error({L,leex,missing_code}, St); 570parse_code(_, {eof,L}, St) -> 571 add_error({L,leex,missing_code}, St). 572 573count_lines(File, N, St) -> 574 case io:get_line(File, leex) of 575 eof -> N; 576 {error, _} -> add_error({N+1, leex, cannot_parse}, St); 577 _Line -> count_lines(File, N+1, St) 578 end. 579 580%% nextline(InputFile, PrevLineNo, State) -> {ok,Chars,LineNo} | {eof,LineNo}. 581%% Get the next line skipping comment lines and blank lines. 582 583nextline(Ifile, L, St) -> 584 case io:get_line(Ifile, leex) of 585 eof -> {eof,L}; 586 {error, _} -> add_error({L+1, leex, cannot_parse}, St); 587 Chars -> 588 case string:take(Chars, " \t\n") of 589 {_, [$%|_Rest]} -> nextline(Ifile, L+1, St); 590 {_, []} -> nextline(Ifile, L+1, St); 591 _Other -> {ok,Chars,L+1} 592 end 593 end. 594 595warn_ignored_chars(Line, S, St) -> 596 case non_white(S) of 597 [] -> St; 598 _ -> add_warning(Line, ignored_characters, St) 599 end. 600 601non_white(S) -> 602 [C || C <- S, C > $\s, C < $\200 orelse C > $\240]. 603 604%% This is the regular expression grammar used. It is equivalent to the 605%% one used in AWK, except that we allow ^ $ to be used anywhere and fail 606%% in the matching. 607%% 608%% reg -> alt : '$1'. 609%% alt -> seq "|" seq ... : {alt,['$1','$2'...]}. 610%% seq -> repeat repeat ... : {seq,['$1','$2'...]}. 611%% repeat -> repeat "*" : {kclosure,'$1'}. 612%% repeat -> repeat "+" : {pclosure,'$1'}. 613%% repeat -> repeat "?" : {optional,'$1'}. 614%% repeat -> repeat "{" [Min],[Max] "}" : {interval,'$1',Min,Max} 615%% repeat -> single : '$1'. 616%% single -> "(" reg ")" : {sub,'$2',Number}. 617%% single -> "^" : bos/bol. 618%% single -> "$" : eos/eol. 619%% single -> "." : any. 620%% single -> "[" class "]" : {char_class,char_class('$2')} 621%% single -> "[" "^" class "]" : {comp_class,char_class('$3')}. 622%% single -> "\"" chars "\"" : {lit,'$2'}. 623%% single -> "\\" char : {lit,['$2']}. 624%% single -> char : {lit,['$1']}. 625%% single -> empty : epsilon. 626%% The grammar of the current regular expressions. The actual parser 627%% is a recursive descent implementation of the grammar. 628 629%% re_parse(Chars, State) -> {ok,RegExp} | {error,Error}. 630 631re_parse(Cs0, St) -> 632 case catch re_reg(Cs0, 0, St) of 633 {RE,_,[]} -> {ok,RE}; 634 {_,_,[C|_]} -> {error,{illegal_char,[C]}}; 635 {parse_error,E} -> {error,E} 636 end. 637 638parse_error(E) -> throw({parse_error,E}). 639 640re_reg(Cs, Sn, St) -> re_alt(Cs, Sn, St). 641 642re_alt(Cs0, Sn0, St) -> 643 {L,Sn1,Cs1} = re_seq(Cs0, Sn0, St), 644 case re_alt1(Cs1, Sn1, St) of 645 {[],Sn2,Cs2} -> {L,Sn2,Cs2}; 646 {Rs,Sn2,Cs2} -> {{alt,[L|Rs]},Sn2,Cs2} 647 end. 648 649re_alt1([$||Cs0], Sn0, St) -> 650 {L,Sn1,Cs1} = re_seq(Cs0, Sn0, St), 651 {Rs,Sn2,Cs2} = re_alt1(Cs1, Sn1, St), 652 {[L|Rs],Sn2,Cs2}; 653re_alt1(Cs, Sn, _) -> {[],Sn,Cs}. 654 655%% Parse a sequence of regexps. Don't allow the empty sequence. 656%% re_seq(Cs0, Sn0, St) -> 657%% {L,Sn1,Cs1} = repeat(Cs0, Sn0, St), 658%% case re_seq1(Cs1, Sn1, St) of 659%% {[],Sn2,Cs2} -> {L,Sn2,Cs2}; 660%% {Rs,Sn2,Cs2} -> {{seq,[L|Rs]},Sn2,Cs2} 661%% end. 662 663%% re_seq(Chars, SubNumber, State) -> {RegExp,SubNumber,Chars}. 664%% Parse a sequence of regexps. Allow the empty sequence, returns epsilon. 665 666re_seq(Cs0, Sn0, St) -> 667 case re_seq1(Cs0, Sn0, St) of 668 {[],Sn1,Cs1} -> {epsilon,Sn1,Cs1}; 669 {[R],Sn1,Cs1} -> {R,Sn1,Cs1}; 670 {Rs,Sn1,Cs1} -> {{seq,Rs},Sn1,Cs1} 671 end. 672 673re_seq1([C|_]=Cs0, Sn0, St) when C =/= $|, C =/= $) -> 674 {L,Sn1,Cs1} = re_repeat(Cs0, Sn0, St), 675 {Rs,Sn2,Cs2} = re_seq1(Cs1, Sn1, St), 676 {[L|Rs],Sn2,Cs2}; 677re_seq1(Cs, Sn, _) -> {[],Sn,Cs}. 678 679%% re_repeat(Chars, SubNumber, State) -> {RegExp,SubNumber,Chars}. 680 681re_repeat(Cs0, Sn0, St) -> 682 {S,Sn1,Cs1} = re_single(Cs0, Sn0, St), 683 re_repeat1(Cs1, Sn1, S, St). 684 685re_repeat1([$*|Cs], Sn, S, St) -> re_repeat1(Cs, Sn, {kclosure,S}, St); 686re_repeat1([$+|Cs], Sn, S, St) -> re_repeat1(Cs, Sn, {pclosure,S}, St); 687re_repeat1([$?|Cs], Sn, S, St) -> re_repeat1(Cs, Sn, {optional,S}, St); 688%% { only starts interval when ere is true, otherwise normal character. 689%% re_repeat1([${|Cs0], Sn, S, #leex{posix=true}=St) -> % $} 690%% case re_interval_range(Cs0) of 691%% {Min,Max,[$}|Cs1]} when is_integer(Min), is_integer(Max), Min =< Max -> 692%% re_repeat1(Cs1, Sn, {interval,S,Min,Max}, St); 693%% {Min,Max,[$}|Cs1]} when is_integer(Min), is_atom(Max) -> 694%% re_repeat1(Cs1, Sn, {interval,S,Min,Max}, St); 695%% {_,_,Cs1} -> parse_error({interval_range,string_between([${|Cs0], Cs1)}) 696%% end; 697re_repeat1(Cs, Sn, S, _) -> {S,Sn,Cs}. 698 699%% re_single(Chars, SubNumber, State) -> {RegExp,SubNumber,Chars}. 700%% Parse a re_single regexp. 701 702re_single([$(|Cs0], Sn0, St) -> % $) 703 Sn1 = Sn0 + 1, % Keep track of sub count anyway 704 case re_reg(Cs0, Sn1, St) of 705 {S,Sn2,[$)|Cs1]} -> {S,Sn2,Cs1}; 706 %%{S,Sn2,[$)|Cs1]} -> {{sub,S,Sn1},Sn2,Cs1}; 707 _ -> parse_error({unterminated,"("}) 708 end; 709%% These are not legal inside a regexp. 710%% re_single([$^|Cs], Sn, St) -> {bos,Sn,Cs}; 711%% re_single([$$|Cs], Sn, St) -> {eos,Sn,Cs}; 712%% re_single([$.|Cs], Sn, St) -> {any,Sn,Cs}; 713re_single([$.|Cs], Sn, _) -> {{comp_class,"\n"},Sn,Cs}; % Do this here? 714re_single("[^" ++ Cs0, Sn, St) -> 715 case re_char_class(Cs0, St) of 716 {Cc,[$]|Cs1]} -> {{comp_class,Cc},Sn,Cs1}; 717 _ -> parse_error({unterminated,"["}) 718 end; 719re_single([$[|Cs0], Sn, St) -> 720 case re_char_class(Cs0, St) of 721 {Cc,[$]|Cs1]} -> {{char_class,Cc},Sn,Cs1}; 722 _ -> parse_error({unterminated,"["}) 723 end; 724re_single([$\\|Cs0], Sn, _) -> 725 {C,Cs1} = re_char($\\, Cs0), 726 {{lit,[C]},Sn,Cs1}; 727re_single([C|Cs0], Sn, St) -> 728 case special_char(C, St) of 729 true -> parse_error({illegal_char,[C]}); 730 false -> 731 {C,Cs1} = re_char(C, Cs0), 732 {{lit,[C]},Sn,Cs1} 733 end. 734 735-define(IS_HEX(C), C >= $0 andalso C =< $9 orelse 736 C >= $A andalso C =< $F orelse 737 C >= $a andalso C =< $f). 738 739%% re_char(Char, Chars) -> {CharValue,Chars}. 740%% Reads one character value from the input list, it knows about escapes. 741 742re_char($\\, [O1,O2,O3|S]) when 743 O1 >= $0, O1 =< $7, O2 >= $0, O2 =< $7, O3 >= $0, O3 =< $7 -> 744 {(O1*8 + O2)*8 + O3 - 73*$0,S}; 745re_char($\\, [$x,H1,H2|S]) when ?IS_HEX(H1), ?IS_HEX(H2) -> 746 {erlang:list_to_integer([H1,H2], 16),S}; 747re_char($\\,[$x,${|S0]) -> 748 re_hex(S0, []); 749re_char($\\,[$x|_]) -> 750 parse_error({illegal_char,"\\x"}); 751re_char($\\, [C|S]) -> {escape_char(C),S}; 752re_char($\\, []) -> parse_error({unterminated,"\\"}); 753re_char(C, S) -> {C,S}. % Just this character 754 755re_hex([C|Cs], L) when ?IS_HEX(C) -> re_hex(Cs, [C|L]); 756re_hex([$}|S], L0) -> 757 L = lists:reverse(L0), 758 case erlang:list_to_integer(L, 16) of 759 C when C =< 16#10FFFF -> {C,S}; 760 _ -> parse_error({illegal_char,[$\\,$x,${|L]++"}"}) 761 end; 762re_hex(_, _) -> parse_error({unterminated,"\\x{"}). 763 764%% special_char(Char, State) -> bool(). 765%% These are the special characters for an ERE. 766%% N.B. ]}) are only special in the context after [{(. 767 768special_char($^, _) -> true; 769special_char($., _) -> true; 770special_char($[, _) -> true; 771special_char($$, _) -> true; 772special_char($(, _) -> true; 773special_char($), _) -> true; 774special_char($|, _) -> true; 775special_char($*, _) -> true; 776special_char($+, _) -> true; 777special_char($?, _) -> true; 778%% special_char(${, #leex{posix=true}) -> true; % Only when POSIX set 779special_char($\\, _) -> true; 780special_char(_, _) -> false. 781 782%% re_char_class(Chars, State) -> {CharClass,Chars}. 783%% Parse a character class. 784 785re_char_class([$]|Cs], St) -> % Must special case this. 786 re_char_class(Cs, [$]], St); 787re_char_class(Cs, St) -> re_char_class(Cs, [], St). 788 789%% re_char_class("[:" ++ Cs0, Cc, #leex{posix=true}=St) -> 790%% %% POSIX char class only. 791%% case posix_cc(Cs0) of 792%% {Pcl,":]" ++ Cs1} -> re_char_class(Cs1, [{posix,Pcl}|Cc], St); 793%% {_,Cs1} -> parse_error({posix_cc,string_between(Cs0, Cs1)}) 794%% end; 795re_char_class([C1|Cs0], Cc, St) when C1 =/= $] -> 796 case re_char(C1, Cs0) of 797 {Cf,[$-,C2|Cs1]} when C2 =/= $] -> 798 case re_char(C2, Cs1) of 799 {Cl,Cs2} when Cf < Cl -> 800 re_char_class(Cs2, [{range,Cf,Cl}|Cc], St); 801 {_,Cs2} -> 802 parse_error({char_class,string_between([C1|Cs0], Cs2)}) 803 end; 804 {C,Cs1} -> re_char_class(Cs1, [C|Cc], St) 805 end; 806re_char_class(Cs, Cc, _) -> {reverse(Cc),Cs}. % Preserve order 807 808%% posix_cc(String) -> {PosixClass,RestString}. 809%% Handle POSIX character classes. 810 811%% posix_cc("alnum" ++ Cs) -> {alnum,Cs}; 812%% posix_cc("alpha" ++ Cs) -> {alpha,Cs}; 813%% posix_cc("blank" ++ Cs) -> {blank,Cs}; 814%% posix_cc("cntrl" ++ Cs) -> {cntrl,Cs}; 815%% posix_cc("digit" ++ Cs) -> {digit,Cs}; 816%% posix_cc("graph" ++ Cs) -> {graph,Cs}; 817%% posix_cc("lower" ++ Cs) -> {lower,Cs}; 818%% posix_cc("print" ++ Cs) -> {print,Cs}; 819%% posix_cc("punct" ++ Cs) -> {punct,Cs}; 820%% posix_cc("space" ++ Cs) -> {space,Cs}; 821%% posix_cc("upper" ++ Cs) -> {upper,Cs}; 822%% posix_cc("xdigit" ++ Cs) -> {xdigit,Cs}; 823%% posix_cc(Cs) -> parse_error({posix_cc,string:slice(Cs, 0, 5)}). 824 825escape_char($n) -> $\n; % \n = LF 826escape_char($r) -> $\r; % \r = CR 827escape_char($t) -> $\t; % \t = TAB 828escape_char($v) -> $\v; % \v = VT 829escape_char($b) -> $\b; % \b = BS 830escape_char($f) -> $\f; % \f = FF 831escape_char($e) -> $\e; % \e = ESC 832escape_char($s) -> $\s; % \s = SPACE 833escape_char($d) -> $\d; % \d = DEL 834escape_char(C) -> C. % Pass it straight through 835 836%% re_interval_range(Chars) -> {Min,Max,RestChars}. 837%% NoInt -> none,none 838%% Int -> Int,none 839%% Int, -> Int,any 840%% Int1,Int2 -> Int1,Int2 841 842%% re_interval_range(Cs0) -> 843%% case re_number(Cs0) of 844%% {none,Cs1} -> {none,none,Cs1}; 845%% {N,[$,|Cs1]} -> 846%% case re_number(Cs1) of 847%% {none,Cs2} -> {N,any,Cs2}; 848%% {M,Cs2} -> {N,M,Cs2} 849%% end; 850%% {N,Cs1} -> {N,none,Cs1} 851%% end. 852 853%% re_number([C|Cs]) when C >= $0, C =< $9 -> 854%% re_number(Cs, C - $0); 855%% re_number(Cs) -> {none,Cs}. 856 857%% re_number([C|Cs], Acc) when C >= $0, C =< $9 -> 858%% re_number(Cs, 10*Acc + (C - $0)); 859%% re_number(Cs, Acc) -> {Acc,Cs}. 860 861string_between(Cs1, Cs2) -> 862 string:slice(Cs1, 0, string:length(Cs1)-string:length(Cs2)). 863 864%% We use standard methods, Thompson's construction and subset 865%% construction, to create first an NFA and then a DFA from the 866%% regexps. A non-standard feature is that we work with sets of 867%% character ranges (crs) instead sets of characters. This is most 868%% noticeable when constructing DFAs. The major benefit is that we can 869%% handle characters from any set, not just limited ASCII or 8859, 870%% even 16/32 bit unicode. 871%% 872%% The whole range of characters is 0-maxchar, where maxchar is a BIG 873%% number. We don't make any assumptions about the size of maxchar, it 874%% is just bigger than any character. 875%% 876%% Using character ranges makes describing many regexps very simple, 877%% for example the regexp "." just becomes the range 878%% [{0-9},{11-maxchar}]. 879 880%% make_nfa(RegExpActions) -> {ok,{NFA,StartState}} | {error,E}. 881%% Build a complete nfa from a list of {RegExp,Action}. The NFA field 882%% accept has values {yes,Action}|no. The NFA is a list of states. 883 884make_dfa(REAs, St) -> 885 {NFA,NF} = build_combined_nfa(REAs), 886 verbose_print(St, "NFA contains ~w states, ", [tuple_size(NFA)]), 887 {DFA0,DF0} = build_dfa(NFA, NF), 888 verbose_print(St, "DFA contains ~w states, ", [length(DFA0)]), 889 {DFA,DF} = minimise_dfa(DFA0, DF0), 890 verbose_print(St, "minimised to ~w states.~n", [length(DFA)]), 891 %%io:fwrite("~p\n", [{NF,NFA}]), 892 %%io:fwrite("~p\n", [{DF0,DFA0}]), 893 %%io:fwrite("~p\n", [{DF,DFA}]), 894 {DFA,DF}. 895 896%% build_combined_nfa(RegExpActionList) -> {NFA,FirstState}. 897%% Build the combined NFA using Thompson's construction straight out 898%% of the book. Build the separate NFAs in the same order as the 899%% rules so that the accepting have ascending states have ascending 900%% state numbers. Start numbering the states from 1 as we put the 901%% states in a tuple with the state number as the index. 902%% 903%% The edges from a state are a list of {CharRange,State} | {epsilon,State}. 904 905build_combined_nfa(REAs) -> 906 {NFA0,Firsts,Free} = build_nfa_list(REAs, [], [], 1), 907 F = #nfa_state{no=Free,edges=epsilon_trans(Firsts)}, 908 {list_to_tuple(keysort(#nfa_state.no, [F|NFA0])),Free}. 909 910build_nfa_list([{RE,Action}|REAs], NFA0, Firsts, Free0) -> 911 {NFA1,Free1,First} = build_nfa(RE, Free0, Action), 912 build_nfa_list(REAs, NFA1 ++ NFA0, [First|Firsts], Free1); 913build_nfa_list([], NFA, Firsts, Free) -> 914 {NFA,reverse(Firsts),Free}. 915 916epsilon_trans(Firsts) -> [ {epsilon,F} || F <- Firsts ]. 917 918%% build_nfa(RegExp, NextState, Action) -> {NFA,NextState,FirstState}. 919%% When building the NFA states for a regexp we don't build the end 920%% state, just allocate a State for it and return this state's 921%% number. This allows us to avoid building unnecessary states for 922%% concatenation which would then have to be removed by overwriting 923%% an existing state. 924 925build_nfa(RE, N0, Action) -> 926 {NFA,N1,E} = build_nfa(RE, N0+1, N0, []), 927 {[#nfa_state{no=E,accept={accept,Action}}|NFA],N1,N0}. 928 929%% build_nfa(RegExp, NextState, FirstState, NFA) -> {NFA,NextState,EndState}. 930%% Build an NFA from the RegExp. NFA is a list of #nfa_state{} in no 931%% predefined order. NextState is the number of the next free state 932%% to use, FirstState is the the state which must be the start for 933%% this regexp as a previous regexp refers to it, EndState is the 934%% state to which this NFA will exit to. The number of the returned 935%% EndState is already allocated! 936 937build_nfa({alt,REs}, N, F, NFA) -> 938 build_nfa_alt(REs, N, F, NFA); 939build_nfa({seq,REs}, N, F, NFA) -> 940 build_nfa_seq(REs, N, F, NFA); 941build_nfa({kclosure,RE}, N0, F, NFA0) -> 942 {NFA1,N1,E1} = build_nfa(RE, N0+1, N0, NFA0), 943 E = N1, % End state 944 {[#nfa_state{no=F,edges=[{epsilon,N0},{epsilon,E}]}, 945 #nfa_state{no=E1,edges=[{epsilon,N0},{epsilon,E}]}|NFA1], 946 N1+1,E}; 947build_nfa({pclosure,RE}, N0, F, NFA0) -> 948 {NFA1,N1,E1} = build_nfa(RE, N0+1, N0, NFA0), 949 E = N1, % End state 950 {[#nfa_state{no=F,edges=[{epsilon,N0}]}, 951 #nfa_state{no=E1,edges=[{epsilon,N0},{epsilon,E}]}|NFA1], 952 N1+1,E}; 953build_nfa({optional,RE}, N0, F, NFA0) -> 954 {NFA1,N1,E1} = build_nfa(RE, N0+1, N0, NFA0), 955 E = N1, % End state 956 {[#nfa_state{no=F,edges=[{epsilon,N0},{epsilon,E}]}, 957 #nfa_state{no=E1,edges=[{epsilon,E}]}|NFA1], 958 N1+1,E}; 959build_nfa({char_class,Cc}, N, F, NFA) -> 960 {[#nfa_state{no=F,edges=[{pack_cc(Cc),N}]}|NFA],N+1,N}; 961build_nfa({comp_class,Cc}, N, F, NFA) -> 962 {[#nfa_state{no=F,edges=[{comp_class(Cc),N}]}|NFA],N+1,N}; 963build_nfa({lit,Cs}, N, F, NFA) -> % Implicit concatenation 964 build_nfa_lit(Cs, N, F, NFA); 965build_nfa(epsilon, N, F, NFA) -> % Just an epsilon transition 966 {[#nfa_state{no=F,edges=[{epsilon,N}]}|NFA],N+1,N}. 967 968%% build_nfa_lit(Chars, NextState, FirstState, NFA) -> {NFA,NextState,EndState}. 969%% Build an NFA for the sequence of literal characters. 970 971build_nfa_lit(Cs, N0, F0, NFA0) -> 972 foldl(fun (C, {NFA,N,F}) -> 973 {[#nfa_state{no=F,edges=[{[{C,C}],N}]}|NFA],N+1,N} 974 end, {NFA0,N0,F0}, Cs). 975 976%% build_nfa_lit([C|Cs], N, F, NFA0) when is_integer(C) -> 977%% NFA1 = [#nfa_state{no=F,edges=[{[{C,C}],N}]}|NFA0], 978%% build_nfa_lit(Cs, N+1, N, NFA1); 979%% build_nfa_lit([], N, F, NFA) -> {NFA,N,F}. 980 981%% build_nfa_seq(REs, NextState, FirstState, NFA) -> {NFA,NextState,EndState}. 982%% Build an NFA for the regexps in a sequence. 983 984build_nfa_seq(REs, N0, F0, NFA0) -> 985 foldl(fun (RE, {NFA,N,F}) -> build_nfa(RE, N, F, NFA) end, 986 {NFA0,N0,F0}, REs). 987 988%% build_nfa_seq([RE|REs], N0, F, NFA0) -> 989%% {NFA1,N1,E1} = build_nfa(RE, N0, F, NFA0), 990%% build_nfa_seq(REs, N1, E1, NFA1); 991%% build_nfa_seq([], N, F, NFA) -> {NFA,N,F}. 992 993%% build_nfa_alt(REs, NextState, FirstState, NFA) -> {NFA,NextState,EndState}. 994%% Build an NFA for the regexps in an alternative. N.B. we don't 995%% handle empty alts here but the parser should never generate them 996%% anyway. 997 998build_nfa_alt([RE], N, F, NFA) -> build_nfa(RE, N, F, NFA); 999build_nfa_alt([RE|REs], N0, F, NFA0) -> 1000 {NFA1,N1,E1} = build_nfa(RE, N0+1, N0, NFA0), 1001 {NFA2,N2,E2} = build_nfa_alt(REs, N1+1, N1, NFA1), 1002 E = N2, % End state 1003 {[#nfa_state{no=F,edges=[{epsilon,N0},{epsilon,N1}]}, 1004 #nfa_state{no=E1,edges=[{epsilon,E}]}, 1005 #nfa_state{no=E2,edges=[{epsilon,E}]}|NFA2], 1006 N2+1,E}. 1007 1008%% build_nfa_alt(REs, NextState, FirstState, NFA) -> {NFA,NextState,EndState}. 1009%% Build an NFA for the regexps in an alternative. Make one big 1010%% epsilon split state, not necessary but fun. 1011 1012%% build_nfa_alt(REs, N0, F0, NFA0) -> 1013%% E = N0, % Must reserve End state first 1014%% {Fs,{NFA1,N1}} = mapfoldl(fun (RE, {NFA,N}) -> 1015%% build_nfa_alt1(RE, N, E, NFA) 1016%% end, {NFA0,N0+1}, REs), 1017%% {[#nfa_state{no=F0,edges=epsilon_trans(Fs)}, 1018%% #nfa_state{no=E,edges=[{epsilon,N1}]}|NFA1],N1+1,N1}. 1019 1020%% build_nfa_alt1(RE, N0, End, NFA0) -> 1021%% {NFA1,N1,E} = build_nfa(RE, N0+1, N0, NFA0), 1022%% {N0,{[#nfa_state{no=E,edges=[{epsilon,End}]}|NFA1],N1}}. 1023 1024%% pack_cc(CharClass) -> CharClass 1025%% Pack and optimise a character class specification (bracket 1026%% expression). First sort it and then compact it. 1027 1028pack_cc(Cc) -> 1029 Crs = foldl(fun ({range,Cf,Cl}, Set) -> add_element({Cf,Cl}, Set); 1030 (C, Set) -> add_element({C,C}, Set) 1031 end, ordsets:new(), Cc), 1032 pack_crs(Crs). % An ordset IS a list! 1033 1034pack_crs([{C1,C2}=Cr,{C3,C4}|Crs]) when C1 =< C3, C2 >= C4 -> 1035 %% C1 C2 1036 %% C3 C4 1037 pack_crs([Cr|Crs]); 1038pack_crs([{C1,C2},{C3,C4}|Crs]) when C2 >= C3, C2 < C4 -> 1039 %% C1 C2 1040 %% C3 C4 1041 pack_crs([{C1,C4}|Crs]); 1042pack_crs([{C1,C2},{C3,C4}|Crs]) when C2 + 1 =:= C3 -> 1043 %% C1 C2 1044 %% C3 C4 1045 pack_crs([{C1,C4}|Crs]); 1046pack_crs([Cr|Crs]) -> [Cr|pack_crs(Crs)]; 1047pack_crs([]) -> []. 1048 1049comp_class(Cc) -> 1050 Crs = pack_cc(Cc), 1051 Comp = comp_crs(Crs, 0), 1052 %% io:fwrite("comp: ~p\n ~p\n", [Crs,Comp]), 1053 Comp. 1054 1055comp_crs([{0,C2}|Crs], 0) -> % Get first range right 1056 comp_crs(Crs, C2+1); 1057comp_crs([{C1,C2}|Crs], Last) -> 1058 [{Last,C1-1}|comp_crs(Crs, C2+1)]; 1059comp_crs([], Last) -> [{Last,maxchar}]. 1060 1061%% build_dfa(NFA, NfaFirstState) -> {DFA,DfaFirstState}. 1062%% Build a DFA from an NFA using "subset construction". The major 1063%% difference from the book is that we keep the marked and unmarked 1064%% DFA states in seperate lists. New DFA states are added to the 1065%% unmarked list and states are marked by moving them to the marked 1066%% list. We assume that the NFA accepting state numbers are in 1067%% ascending order for the rules and use ordsets to keep this order. 1068 1069build_dfa(NFA, Nf) -> 1070 D = #dfa_state{no=0,nfa=eclosure([Nf], NFA)}, 1071 {build_dfa([D], 1, [], NFA),0}. 1072 1073%% build_dfa([UnMarked], NextState, [Marked], NFA) -> DFA. 1074%% Traverse the unmarked states. Temporarily add the current unmarked 1075%% state to the marked list before calculating translation, this is 1076%% to avoid adding too many duplicate states. Add it properly to the 1077%% marked list afterwards with correct translations. 1078 1079build_dfa([U|Us0], N0, Ms, NFA) -> 1080 {Ts,Us1,N1} = build_dfa(U#dfa_state.nfa, Us0, N0, [], [U|Ms], NFA), 1081 M = U#dfa_state{trans=Ts,accept=accept(U#dfa_state.nfa, NFA)}, 1082 build_dfa(Us1, N1, [M|Ms], NFA); 1083build_dfa([], _, Ms, _) -> Ms. 1084 1085%% build_dfa([NfaState], [Unmarked], NextState, [Transition], [Marked], NFA) -> 1086%% {Transitions,UnmarkedStates,NextState}. 1087%% Foreach NFA state set calculate the legal translations. N.B. must 1088%% search *BOTH* the unmarked and marked lists to check if DFA state 1089%% already exists. As the range of characters is potentially VERY 1090%% large we cannot explicitly test all characters. Instead we first 1091%% calculate the set of all disjoint character ranges which are 1092%% possible candidates to the set of NFA states. The transitions are 1093%% an orddict so we get the transition lists in ascending order. 1094 1095build_dfa(Set, Us, N, Ts, Ms, NFA) -> 1096 %% List of all transition sets. 1097 Crs0 = [Cr || S <- Set, 1098 {Crs,_St} <- (element(S, NFA))#nfa_state.edges, 1099 Crs =/= epsilon, % Not an epsilon transition 1100 Cr <- Crs ], 1101 Crs1 = lists:usort(Crs0), % Must remove duplicates! 1102 %% Build list of disjoint test ranges. 1103 Test = disjoint_crs(Crs1), 1104 %% io:fwrite("bd: ~p\n ~p\n ~p\n ~p\n", [Set,Crs0,Crs1,Test]), 1105 build_dfa(Test, Set, Us, N, Ts, Ms, NFA). 1106 1107%% disjoint_crs([CharRange]) -> [CharRange]. 1108%% Take a sorted list of char ranges and make a sorted list of 1109%% disjoint char ranges. No new char range extends past an existing 1110%% char range. 1111 1112disjoint_crs([{_C1,C2}=Cr1,{C3,_C4}=Cr2|Crs]) when C2 < C3 -> 1113 %% C1 C2 1114 %% C3 C4 1115 [Cr1|disjoint_crs([Cr2|Crs])]; 1116disjoint_crs([{C1,C2},{C3,C4}|Crs]) when C1 =:= C3 -> 1117 %% C1 C2 1118 %% C3 C4 1119 [{C1,C2}|disjoint_crs(add_element({C2+1,C4}, Crs))]; 1120disjoint_crs([{C1,C2},{C3,C4}|Crs]) when C1 < C3, C2 >= C3, C2 < C4 -> 1121 %% C1 C2 1122 %% C3 C4 1123 [{C1,C3-1}|disjoint_crs(union([{C3,C2},{C2+1,C4}], Crs))]; 1124disjoint_crs([{C1,C2},{C3,C4}|Crs]) when C1 < C3, C2 =:= C4 -> 1125 %% C1 C2 1126 %% C3 C4 1127 [{C1,C3-1}|disjoint_crs(add_element({C3,C4}, Crs))]; 1128disjoint_crs([{C1,C2},{C3,C4}|Crs]) when C1 < C3, C2 > C4 -> 1129 %% C1 C2 1130 %% C3 C4 1131 [{C1,C3-1}|disjoint_crs(union([{C3,C4},{C4+1,C2}], Crs))]; 1132disjoint_crs([Cr|Crs]) -> [Cr|disjoint_crs(Crs)]; 1133disjoint_crs([]) -> []. 1134 1135build_dfa([Cr|Crs], Set, Us, N, Ts, Ms, NFA) -> 1136 case eclosure(move(Set, Cr, NFA), NFA) of 1137 S when S =/= [] -> 1138 case dfa_state_exist(S, Us, Ms) of 1139 {yes,T} -> 1140 build_dfa(Crs, Set, Us, N, store(Cr, T, Ts), Ms, NFA); 1141 no -> 1142 U = #dfa_state{no=N,nfa=S}, 1143 build_dfa(Crs, Set, [U|Us], N+1, store(Cr, N, Ts), Ms, NFA) 1144 end; 1145 [] -> 1146 build_dfa(Crs, Set, Us, N, Ts, Ms, NFA) 1147 end; 1148build_dfa([], _, Us, N, Ts, _, _) -> 1149 {Ts,Us,N}. 1150 1151%% dfa_state_exist(Set, Unmarked, Marked) -> {yes,State} | no. 1152 1153dfa_state_exist(S, Us, Ms) -> 1154 case lists:keyfind(S, #dfa_state.nfa, Us) of 1155 #dfa_state{no=T} -> {yes,T}; 1156 false -> 1157 case lists:keyfind(S, #dfa_state.nfa, Ms) of 1158 #dfa_state{no=T} -> {yes,T}; 1159 false -> no 1160 end 1161 end. 1162 1163%% eclosure([State], NFA) -> [State]. 1164%% move([State], Char, NFA) -> [State]. 1165%% These are straight out of the book. As eclosure uses ordsets then 1166%% the generated state sets are in ascending order. 1167 1168eclosure(Sts, NFA) -> eclosure(Sts, NFA, []). 1169 1170eclosure([St|Sts], NFA, Ec) -> 1171 #nfa_state{edges=Es} = element(St, NFA), 1172 eclosure([ N || {epsilon,N} <- Es, 1173 not is_element(N, Ec) ] ++ Sts, 1174 NFA, add_element(St, Ec)); 1175eclosure([], _, Ec) -> Ec. 1176 1177move(Sts, Cr, NFA) -> 1178 %% io:fwrite("move1: ~p\n", [{Sts,Cr}]), 1179 [ St || N <- Sts, 1180 {Crs,St} <- (element(N, NFA))#nfa_state.edges, 1181 Crs =/= epsilon, % Not an epsilon transition 1182 in_crs(Cr, Crs) ]. 1183 1184in_crs({C1,C2}, [{C3,C4}|_Crs]) when C1 >= C3, C2 =< C4 -> true; 1185in_crs(Cr, [Cr|_Crs]) -> true; % Catch bos and eos. 1186in_crs(Cr, [_|Crs]) -> in_crs(Cr, Crs); 1187in_crs(_Cr, []) -> false. 1188 1189%% accept([State], NFA) -> {accept,A} | noaccept. 1190%% Scan down the state list until we find an accepting state. 1191 1192accept([St|Sts], NFA) -> 1193 case element(St, NFA) of 1194 #nfa_state{accept={accept,A}} -> {accept,A}; 1195 #nfa_state{accept=noaccept} -> accept(Sts, NFA) 1196 end; 1197accept([], _) -> noaccept. 1198 1199%% minimise_dfa(DFA, DfaFirst) -> {DFA,DfaFirst}. 1200%% Minimise the DFA by removing equivalent states. We consider a 1201%% state if both the transitions and the their accept state is the 1202%% same. First repeatedly run throught the DFA state list removing 1203%% equivalent states and updating remaining transitions with 1204%% remaining equivalent state numbers. When no more reductions are 1205%% possible then pack the remaining state numbers to get consecutive 1206%% states. 1207 1208minimise_dfa(DFA0, Df0) -> 1209 case min_dfa(DFA0) of 1210 {DFA1,[]} -> % No reduction! 1211 {DFA2,Rs} = pack_dfa(DFA1), 1212 {min_update(DFA2, Rs),min_use(Df0, Rs)}; 1213 {DFA1,Rs} -> 1214 minimise_dfa(min_update(DFA1, Rs), min_use(Df0, Rs)) 1215 end. 1216 1217min_dfa(DFA) -> min_dfa(DFA, [], []). 1218 1219min_dfa([D|DFA0], Rs0, MDFA) -> 1220 {DFA1,Rs1} = min_delete(DFA0, D#dfa_state.trans, D#dfa_state.accept, 1221 D#dfa_state.no, Rs0, []), 1222 min_dfa(DFA1, Rs1, [D|MDFA]); 1223min_dfa([], Rs, MDFA) -> {MDFA,Rs}. 1224 1225%% min_delete(States, Trans, Action, NewN, Rs, MiniDFA) -> {MiniDFA,Rs}. 1226%% Delete all states with same transactions and action. Return 1227%% rewrites and minimised DFA with no duplicate states. 1228 1229min_delete([#dfa_state{no=N,trans=T,accept=A}|DFA], T, A, NewN, Rs, MDFA) -> 1230 min_delete(DFA, T, A, NewN, [{N,NewN}|Rs], MDFA); 1231min_delete([D|DFA], T, A, NewN, Rs, MDFA) -> 1232 min_delete(DFA, T, A, NewN, Rs, [D|MDFA]); 1233min_delete([], _, _, _, Rs, MDFA) -> {MDFA,Rs}. 1234 1235min_update(DFA, Rs) -> 1236 [ D#dfa_state{trans=min_update_trans(D#dfa_state.trans, Rs)} || D <- DFA ]. 1237 1238min_update_trans(Tr, Rs) -> 1239 [ {C,min_use(S, Rs)} || {C,S} <- Tr ]. 1240 1241min_use(Old, [{Old,New}|_]) -> New; 1242min_use(Old, [_|Reds]) -> min_use(Old, Reds); 1243min_use(Old, []) -> Old. 1244 1245pack_dfa(DFA) -> pack_dfa(DFA, 0, [], []). 1246 1247pack_dfa([D|DFA], NewN, Rs, PDFA) -> 1248 pack_dfa(DFA, NewN+1, 1249 [{D#dfa_state.no,NewN}|Rs], [D#dfa_state{no=NewN}|PDFA]); 1250pack_dfa([], _, Rs, PDFA) -> {PDFA,Rs}. 1251 1252%% The main output is the yystate function which is built from the 1253%% DFA. It has the spec: 1254%% 1255%% yystate() -> InitialState. 1256%% yystate(State, InChars, Line, CurrTokLen, AcceptAction, AcceptLen) -> 1257%% {Action, AcceptLength, RestChars, Line} | Accepting end state 1258%% {Action, AcceptLength, RestChars, Line, State} | Accepting state 1259%% {reject, AcceptLength, CurrTokLen, RestChars, Line, State} | 1260%% {Action, AcceptLength, CurrTokLen, RestChars, Line, State}. 1261 1262%% The return CurrTokLen is always the current number of characters 1263%% scanned in the current token. The returns have the following 1264%% meanings: 1265%% {Action, AcceptLength, RestChars, Line} - 1266%% The scanner has reached an accepting end-state, for example after 1267%% a regexp "abc". Action is the action number and AcceptLength is 1268%% the length of the matching token. 1269%% 1270%% {Action, AcceptLength, RestChars, Line, State} - 1271%% The scanner has reached an accepting transition state, for example 1272%% after c in regexp "abc(xyz)?", continuation depends on 1273%% RestChars. If RestChars == [] (no more current characters) then we 1274%% need to get more characters to see if it is an end-state, 1275%% otherwise (eof or chars) then we have not found continuing 1276%% characters and it is an end state. 1277%% 1278%% {reject, AcceptLength, CurrTokLen, RestChars, Line, State} - 1279%% {Action, AcceptLength, CurrTokLen, RestChars, Line, State} - 1280%% The scanner has reached a non-accepting transition state. If 1281%% RestChars == [] we need to get more characters to continue. 1282%% Otherwise if 'reject' then no accepting state has been reached it 1283%% is an error. If we have an Action and AcceptLength then these are 1284%% the last accept state, use them and continue from there. 1285 1286%% out_file(LeexState, DFA, DfaStart, [Action], Code) -> ok | error. 1287%% Generate an output .erl file from the include file, the DFA and 1288%% the code for the actions. 1289 1290out_file(St0, DFA, DF, Actions, Code) -> 1291 verbose_print(St0, "Writing file ~ts, ", [St0#leex.efile]), 1292 case open_inc_file(St0) of 1293 {ok,Ifile} -> 1294 try 1295 case file:open(St0#leex.efile, [write]) of 1296 {ok,Ofile} -> 1297 set_encoding(St0, Ofile), 1298 try 1299 output_encoding_comment(Ofile, St0), 1300 output_file_directive(Ofile, St0#leex.ifile, 0), 1301 out_file(Ifile, Ofile, St0, DFA, DF, Actions, 1302 Code, 1), 1303 verbose_print(St0, "ok~n", []), 1304 St0 1305 after ok = file:close(Ofile) 1306 end; 1307 {error,Error} -> 1308 verbose_print(St0, "error~n", []), 1309 add_error({none,leex,{file_error,Error}}, St0) 1310 end 1311 after ok = file:close(Ifile) 1312 end; 1313 {{error,Error},Ifile} -> 1314 add_error(Ifile, {none,leex,{file_error,Error}}, St0) 1315 end. 1316 1317open_inc_file(State) -> 1318 Ifile = State#leex.ifile, 1319 case file:open(Ifile, [read]) of 1320 {ok,F} -> 1321 _ = epp:set_encoding(F), 1322 {ok,F}; 1323 Error -> {Error,Ifile} 1324 end. 1325 1326inc_file_name([]) -> 1327 Incdir = filename:join(code:lib_dir(parsetools), "include"), 1328 filename:join(Incdir, ?LEEXINC); 1329inc_file_name(Filename) -> 1330 Filename. 1331 1332%% out_file(IncFile, OutFile, State, DFA, DfaStart, Actions, Code, Line) -> ok 1333%% Copy the include file line by line substituting special lines with 1334%% generated code. We cheat by only looking at the first 5 1335%% characters. 1336 1337out_file(Ifile, Ofile, St, DFA, DF, Actions, Code, L) -> 1338 case io:get_line(Ifile, leex) of 1339 eof -> output_file_directive(Ofile, St#leex.ifile, L); 1340 {error, _} -> add_error(St#leex.ifile, {L, leex, cannot_parse}, St); 1341 Line -> 1342 case string:slice(Line, 0, 5) of 1343 "##mod" -> out_module(Ofile, St); 1344 "##cod" -> out_erlang_code(Ofile, St, Code, L); 1345 "##dfa" -> out_dfa(Ofile, St, DFA, Code, DF, L); 1346 "##act" -> out_actions(Ofile, St#leex.xfile, Actions); 1347 _ -> io:put_chars(Ofile, Line) 1348 end, 1349 out_file(Ifile, Ofile, St, DFA, DF, Actions, Code, L+1) 1350 end. 1351 1352out_module(File, St) -> 1353 io:fwrite(File, "-module(~w).\n", [St#leex.module]). 1354 1355out_erlang_code(File, St, Code, L) -> 1356 {CodeL,CodePos,_NCodeLines} = Code, 1357 output_file_directive(File, St#leex.xfile, CodeL), 1358 {ok,Xfile} = file:open(St#leex.xfile, [read]), 1359 try 1360 set_encoding(St, Xfile), 1361 {ok,_} = file:position(Xfile, CodePos), 1362 ok = file_copy(Xfile, File) 1363 after 1364 ok = file:close(Xfile) 1365 end, 1366 io:nl(File), 1367 output_file_directive(File, St#leex.ifile, L). 1368 1369file_copy(From, To) -> 1370 case io:get_line(From, leex) of 1371 eof -> ok; 1372 Line when is_list(Line) -> 1373 io:fwrite(To, "~ts", [Line]), 1374 file_copy(From, To) 1375 end. 1376 1377out_dfa(File, St, DFA, Code, DF, L) -> 1378 {_CodeL,_CodePos,NCodeLines} = Code, 1379 %% Three file attributes before this one... 1380 output_file_directive(File, St#leex.efile, L+(NCodeLines-1)+3), 1381 io:fwrite(File, "yystate() -> ~w.~n~n", [DF]), 1382 foreach(fun (S) -> out_trans(File, S) end, DFA), 1383 io:fwrite(File, "yystate(S, Ics, Line, Tlen, Action, Alen) ->~n", []), 1384 io:fwrite(File, " {Action,Alen,Tlen,Ics,Line,S}.~n", []). 1385 1386out_trans(File, #dfa_state{no=N,trans=[],accept={accept,A}}) -> 1387 %% Accepting end state, guaranteed done. 1388 io:fwrite(File, "yystate(~w, Ics, Line, Tlen, _, _) ->~n", [N]), 1389 io:fwrite(File, " {~w,Tlen,Ics,Line};~n", [A]); 1390out_trans(File, #dfa_state{no=N,trans=Tr,accept={accept,A}}) -> 1391 %% Accepting state, but there maybe more. 1392 foreach(fun (T) -> out_accept_tran(File, N, A, T) end, pack_trans(Tr)), 1393 io:fwrite(File, "yystate(~w, Ics, Line, Tlen, _, _) ->~n", [N]), 1394 io:fwrite(File, " {~w,Tlen,Ics,Line,~w};~n", [A,N]); 1395out_trans(File, #dfa_state{no=N,trans=Tr,accept=noaccept}) -> 1396 %% Non-accepting transition state. 1397 foreach(fun (T) -> out_noaccept_tran(File, N, T) end, pack_trans(Tr)), 1398 io:fwrite(File, "yystate(~w, Ics, Line, Tlen, Action, Alen) ->~n", [N]), 1399 io:fwrite(File, " {Action,Alen,Tlen,Ics,Line,~w};~n", [N]). 1400 1401out_accept_tran(File, N, A, {{Cf,maxchar},S}) -> 1402 out_accept_head_max(File, N, Cf), 1403 out_accept_body(File, S, "Line", A); 1404out_accept_tran(File, N, A, {{Cf,Cl},S}) -> 1405 out_accept_head_range(File, N, Cf, Cl), 1406 out_accept_body(File, S, "Line", A); 1407out_accept_tran(File, N, A, {$\n,S}) -> 1408 out_accept_head_1(File, N, $\n), 1409 out_accept_body(File, S, "Line+1", A); 1410out_accept_tran(File, N, A, {C,S}) -> 1411 out_accept_head_1(File, N, C), 1412 out_accept_body(File, S, "Line", A). 1413 1414out_accept_head_1(File, State, Char) -> 1415 out_head_1(File, State, Char, "_", "_"). 1416 1417out_accept_head_max(File, State, Min) -> 1418 out_head_max(File, State, Min, "_", "_"). 1419 1420out_accept_head_range(File, State, Min, Max) -> 1421 out_head_range(File, State, Min, Max, "_", "_"). 1422 1423out_accept_body(File, Next, Line, Action) -> 1424 out_body(File, Next, Line, io_lib:write(Action), "Tlen"). 1425 1426out_noaccept_tran(File, N, {{Cf,maxchar},S}) -> 1427 out_noaccept_head_max(File, N, Cf), 1428 out_noaccept_body(File, S, "Line"); 1429out_noaccept_tran(File, N, {{Cf,Cl},S}) -> 1430 out_noaccept_head_range(File, N, Cf, Cl), 1431 out_noaccept_body(File, S, "Line"); 1432out_noaccept_tran(File, N, {$\n,S}) -> 1433 out_noaccept_head_1(File, N, $\n), 1434 out_noaccept_body(File, S, "Line+1"); 1435out_noaccept_tran(File, N, {C,S}) -> 1436 out_noaccept_head_1(File, N, C), 1437 out_noaccept_body(File, S, "Line"). 1438 1439out_noaccept_head_1(File, State, Char) -> 1440 out_head_1(File, State, Char, "Action", "Alen"). 1441 1442out_noaccept_head_max(File, State, Min) -> 1443 out_head_max(File, State, Min, "Action", "Alen"). 1444 1445out_noaccept_head_range(File, State, Min, Max) -> 1446 out_head_range(File, State, Min, Max, "Action", "Alen"). 1447 1448out_noaccept_body(File, Next, Line) -> 1449 out_body(File, Next, Line, "Action", "Alen"). 1450 1451out_head_1(File, State, Char, Action, Alen) -> 1452 io:fwrite(File, "yystate(~w, [~w|Ics], Line, Tlen, ~s, ~s) ->\n", 1453 [State,Char,Action,Alen]). 1454 1455out_head_max(File, State, Min, Action, Alen) -> 1456 io:fwrite(File, "yystate(~w, [C|Ics], Line, Tlen, ~s, ~s) when C >= ~w ->\n", 1457 [State,Action,Alen,Min]). 1458 1459out_head_range(File, State, Min, Max, Action, Alen) -> 1460 io:fwrite(File, "yystate(~w, [C|Ics], Line, Tlen, ~s, ~s) when C >= ~w, C =< ~w ->\n", 1461 [State,Action,Alen,Min,Max]). 1462 1463out_body(File, Next, Line, Action, Alen) -> 1464 io:fwrite(File, " yystate(~w, Ics, ~s, Tlen+1, ~s, ~s);\n", 1465 [Next,Line,Action,Alen]). 1466 1467%% pack_trans([{Crange,State}]) -> [{Crange,State}] when 1468%% Crange = {Char,Char} | Char. 1469%% Pack the translation table into something more suitable for 1470%% generating code. We KNOW how the pattern matching compiler works 1471%% so solitary characters are stored before ranges. We do this by 1472%% prepending singletons to the front of the packed transitions and 1473%% appending ranges to the back. This preserves the smallest to 1474%% largest order of ranges. Newline characters, $\n, are always 1475%% extracted and handled as singeltons. 1476 1477pack_trans(Trs) -> pack_trans(Trs, []). 1478 1479%% pack_trans(Trs) -> 1480%% Trs1 = pack_trans(Trs, []), 1481%% io:fwrite("tr:~p\n=> ~p\n", [Trs,Trs1]), 1482%% Trs1. 1483 1484pack_trans([{{C,C},S}|Trs], Pt) -> % Singletons to the head 1485 pack_trans(Trs, [{C,S}|Pt]); 1486%% Special detection and handling of $\n. 1487pack_trans([{{Cf,$\n},S}|Trs], Pt) -> 1488 pack_trans([{{Cf,$\n-1},S}|Trs], [{$\n,S}|Pt]); 1489pack_trans([{{$\n,Cl},S}|Trs], Pt) -> 1490 pack_trans([{{$\n+1,Cl},S}|Trs], [{$\n,S}|Pt]); 1491pack_trans([{{Cf,Cl},S}|Trs], Pt) when Cf < $\n, Cl > $\n -> 1492 pack_trans([{{Cf,$\n-1},S},{{$\n+1,Cl},S}|Trs], [{$\n,S}|Pt]); 1493%% Small ranges become singletons. 1494pack_trans([{{Cf,Cl},S}|Trs], Pt) when Cl =:= Cf + 1 -> 1495 pack_trans(Trs, [{Cf,S},{Cl,S}|Pt]); 1496pack_trans([Tr|Trs], Pt) -> % The default uninteresting case 1497 pack_trans(Trs, Pt ++ [Tr]); 1498pack_trans([], Pt) -> Pt. 1499 1500%% out_actions(File, XrlFile, ActionList) -> ok. 1501%% Write out the action table. 1502 1503out_actions(File, XrlFile, As) -> 1504 As1 = prep_out_actions(As), 1505 foreach(fun (A) -> out_action(File, A) end, As1), 1506 io:fwrite(File, "yyaction(_, _, _, _) -> error.~n", []), 1507 foreach(fun (A) -> out_action_code(File, XrlFile, A) end, As1). 1508 1509prep_out_actions(As) -> 1510 map(fun ({A,empty_action}) -> 1511 {A,empty_action}; 1512 ({A,Code,TokenChars,TokenLen,TokenLine}) -> 1513 Vs = [{TokenChars,"TokenChars"}, 1514 {TokenLen,"TokenLen"}, 1515 {TokenLine,"TokenLine"}, 1516 {TokenChars,"YYtcs"}, 1517 {TokenLen or TokenChars,"TokenLen"}], 1518 Vars = [if F -> S; true -> "_" end || {F,S} <- Vs], 1519 Name = list_to_atom(lists:concat([yyaction_,A])), 1520 [Chars,Len,Line,_,_] = Vars, 1521 Args = [V || V <- [Chars,Len,Line], V =/= "_"], 1522 ArgsChars = lists:join(", ", Args), 1523 {A,Code,Vars,Name,Args,ArgsChars} 1524 end, As). 1525 1526out_action(File, {A,empty_action}) -> 1527 io:fwrite(File, "yyaction(~w, _, _, _) -> skip_token;~n", [A]); 1528out_action(File, {A,_Code,Vars,Name,_Args,ArgsChars}) -> 1529 [_,_,Line,Tcs,Len] = Vars, 1530 io:fwrite(File, "yyaction(~w, ~s, ~s, ~s) ->~n", [A,Len,Tcs,Line]), 1531 if 1532 Tcs =/= "_" -> 1533 io:fwrite(File, " TokenChars = yypre(YYtcs, TokenLen),~n", []); 1534 true -> ok 1535 end, 1536 io:fwrite(File, " ~s(~s);~n", [Name, ArgsChars]). 1537 1538out_action_code(_File, _XrlFile, {_A,empty_action}) -> 1539 ok; 1540out_action_code(File, XrlFile, {_A,Code,_Vars,Name,Args,ArgsChars}) -> 1541 %% Should set the file to the .erl file, but instead assumes that 1542 %% ?LEEXINC is syntactically correct. 1543 io:fwrite(File, "\n-compile({inline,~w/~w}).\n", [Name, length(Args)]), 1544 L = erl_scan:line(hd(Code)), 1545 output_file_directive(File, XrlFile, L-2), 1546 io:fwrite(File, "~s(~s) ->~n", [Name, ArgsChars]), 1547 io:fwrite(File, " ~ts\n", [pp_tokens(Code, L, File)]). 1548 1549%% pp_tokens(Tokens, Line, File) -> [char()]. 1550%% Prints the tokens keeping the line breaks of the original code. 1551 1552pp_tokens(Tokens, Line0, File) -> pp_tokens(Tokens, Line0, File, none). 1553 1554pp_tokens([], _Line0, _, _) -> []; 1555pp_tokens([T | Ts], Line0, File, Prev) -> 1556 Line = erl_scan:line(T), 1557 [pp_sep(Line, Line0, Prev, T), 1558 pp_symbol(T, File) | pp_tokens(Ts, Line, File, T)]. 1559 1560pp_symbol({var,_,Var}, _) -> atom_to_list(Var); 1561pp_symbol({_,_,Symbol}, File) -> format_symbol(Symbol, File); 1562pp_symbol({dot, _}, _) -> "."; 1563pp_symbol({Symbol, _}, _) -> atom_to_list(Symbol). 1564 1565pp_sep(Line, Line0, Prev, T) when Line > Line0 -> 1566 ["\n " | pp_sep(Line - 1, Line0, Prev, T)]; 1567pp_sep(_, _, {'.',_}, _) -> ""; % No space after '.' (not a dot) 1568pp_sep(_, _, {'#',_}, _) -> ""; % No space after '#' 1569pp_sep(_, _, {'(',_}, _) -> ""; % No space after '(' 1570pp_sep(_, _, {'[',_}, _) -> ""; % No space after '[' 1571pp_sep(_, _, _, {'.',_}) -> ""; % No space before '.' 1572pp_sep(_, _, _, {'#',_}) -> ""; % No space before '#' 1573pp_sep(_, _, _, {',',_}) -> ""; % No space before ',' 1574pp_sep(_, _, _, {')',_}) -> ""; % No space before ')' 1575pp_sep(_, _, _, _) -> " ". 1576 1577%% out_dfa_graph(LeexState, DFA, DfaStart) -> ok | error. 1578%% Writes the DFA to a .dot file in DOT-format which can be viewed 1579%% with Graphviz. 1580 1581out_dfa_graph(St, DFA, DF) -> 1582 verbose_print(St, "Writing DFA to file ~ts, ", [St#leex.gfile]), 1583 case file:open(St#leex.gfile, [write]) of 1584 {ok,Gfile} -> 1585 try 1586 %% Set the same encoding as infile: 1587 set_encoding(St, Gfile), 1588 io:fwrite(Gfile, "digraph DFA {~n", []), 1589 out_dfa_states(Gfile, DFA, DF), 1590 out_dfa_edges(Gfile, DFA), 1591 io:fwrite(Gfile, "}~n", []), 1592 verbose_print(St, "ok~n", []), 1593 St 1594 after ok = file:close(Gfile) 1595 end; 1596 {error,Error} -> 1597 verbose_print(St, "error~n", []), 1598 add_error({none,leex,{file_error,Error}}, St) 1599 end. 1600 1601out_dfa_states(File, DFA, DF) -> 1602 foreach(fun (S) -> out_dfa_state(File, DF, S) end, DFA), 1603 io:fwrite(File, "~n", []). 1604 1605out_dfa_state(File, DF, #dfa_state{no=DF, accept={accept,_}}) -> 1606 io:fwrite(File, " ~b [shape=doublecircle color=green];~n", [DF]); 1607out_dfa_state(File, DF, #dfa_state{no=DF, accept=noaccept}) -> 1608 io:fwrite(File, " ~b [shape=circle color=green];~n", [DF]); 1609out_dfa_state(File, _, #dfa_state{no=S, accept={accept,_}}) -> 1610 io:fwrite(File, " ~b [shape=doublecircle];~n", [S]); 1611out_dfa_state(File, _, #dfa_state{no=S, accept=noaccept}) -> 1612 io:fwrite(File, " ~b [shape=circle];~n", [S]). 1613 1614out_dfa_edges(File, DFA) -> 1615 foreach(fun (#dfa_state{no=S,trans=Trans}) -> 1616 Pt = pack_trans(Trans), 1617 Tdict = foldl(fun ({Cr,T}, D) -> 1618 orddict:append(T, Cr, D) 1619 end, orddict:new(), Pt), 1620 foreach(fun (T) -> 1621 Crs = orddict:fetch(T, Tdict), 1622 Edgelab = dfa_edgelabel(Crs, File), 1623 io:fwrite(File, " ~b -> ~b [label=\"~ts\"];~n", 1624 [S,T,Edgelab]) 1625 end, sort(orddict:fetch_keys(Tdict))) 1626 end, DFA). 1627 1628dfa_edgelabel([C], File) when is_integer(C) -> quote(C, File); 1629dfa_edgelabel(Cranges, File) -> 1630 %% io:fwrite("el: ~p\n", [Cranges]), 1631 "[" ++ map(fun ({A,B}) -> [quote(A, File), "-", quote(B, File)]; 1632 (C) -> [quote(C, File)] 1633 end, Cranges) ++ "]". 1634 1635set_encoding(#leex{encoding = none}, File) -> 1636 ok = io:setopts(File, [{encoding, epp:default_encoding()}]); 1637set_encoding(#leex{encoding = E}, File) -> 1638 ok = io:setopts(File, [{encoding, E}]). 1639 1640output_encoding_comment(_File, #leex{encoding = none}) -> 1641 ok; 1642output_encoding_comment(File, #leex{encoding = Encoding}) -> 1643 io:fwrite(File, <<"%% ~s\n">>, [epp:encoding_to_string(Encoding)]). 1644 1645output_file_directive(File, Filename, Line) -> 1646 io:fwrite(File, <<"-file(~ts, ~w).\n">>, 1647 [format_filename(Filename, File), Line]). 1648 1649format_filename(Filename0, File) -> 1650 Filename = filename:flatten(Filename0), 1651 case enc(File) of 1652 unicode -> io_lib:write_string(Filename); 1653 latin1 -> io_lib:write_string_as_latin1(Filename) 1654 end. 1655 1656format_symbol(Symbol, File) -> 1657 Format = case enc(File) of 1658 latin1 -> "~p"; 1659 unicode -> "~tp" 1660 end, 1661 io_lib:fwrite(Format, [Symbol]). 1662 1663enc(File) -> 1664 case lists:keyfind(encoding, 1, io:getopts(File)) of 1665 false -> latin1; % should never happen 1666 {encoding, Enc} -> Enc 1667 end. 1668 1669quote($^, _File) -> "\\^"; 1670quote($., _File) -> "\\."; 1671quote($$, _File) -> "\\$"; 1672quote($-, _File) -> "\\-"; 1673quote($[, _File) -> "\\["; 1674quote($], _File) -> "\\]"; 1675quote($\s, _File) -> "\\\\s"; 1676quote($\", _File) -> "\\\""; 1677quote($\b, _File) -> "\\\\b"; 1678quote($\f, _File) -> "\\\\f"; 1679quote($\n, _File) -> "\\\\n"; 1680quote($\r, _File) -> "\\\\r"; 1681quote($\t, _File) -> "\\\\t"; 1682quote($\e, _File) -> "\\\\e"; 1683quote($\v, _File) -> "\\\\v"; 1684quote($\d, _File) -> "\\\\d"; 1685quote($\\, _File) -> "\\\\"; 1686quote(C, File) when is_integer(C) -> 1687 %% Must remove the $ and get the \'s right. 1688 S = case enc(File) of 1689 unicode -> io_lib:write_char(C); 1690 latin1 -> io_lib:write_char_as_latin1(C) 1691 end, 1692 case S of 1693 [$$,$\\|Cs] -> "\\\\" ++ Cs; 1694 [$$|Cs] -> Cs 1695 end; 1696quote(maxchar, _File) -> 1697 "MAXCHAR". 1698