1%% 2%% %CopyrightBegin% 3%% 4%% Copyright Ericsson AB 2006-2017. All Rights Reserved. 5%% 6%% Licensed under the Apache License, Version 2.0 (the "License"); 7%% you may not use this file except in compliance with the License. 8%% You may obtain a copy of the License at 9%% 10%% http://www.apache.org/licenses/LICENSE-2.0 11%% 12%% Unless required by applicable law or agreed to in writing, software 13%% distributed under the License is distributed on an "AS IS" BASIS, 14%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15%% See the License for the specific language governing permissions and 16%% limitations under the License. 17%% 18%% %CopyrightEnd% 19%% 20 21%% 22-module(xmerl_regexp). 23 24%% This module provides a basic set of regular expression functions 25%% for strings. The functions provided are taken from AWK. 26%% 27%% Note that we interpret the syntax tree of a regular expression 28%% directly instead of converting it to an NFA and then interpreting 29%% that. This method seems to go significantly faster. 30 31-export([sh_to_awk/1,parse/1,format_error/1,match/2,first_match/2,matches/2]). 32-export([sub/3,gsub/3,split/2,sub_match/2,sub_first_match/2]). 33 34-export([make_nfa/1,make_dfa/1,make_dfa/2,compile/1]). 35 36-import(string, [substr/2,substr/3]). 37-import(lists, [reverse/1,reverse/2,last/1,duplicate/2,seq/2]). 38-import(lists, [member/2,keysearch/3,keysort/2,map/2,foldl/3]). 39-import(ordsets, [is_element/2,add_element/2,union/2,subtract/2]). 40 41%%-compile([export_all]). 42 43-export([setup/1,compile_proc/2]). 44 45-include("xmerl_internal.hrl"). 46 47setup(RE0) -> 48 RE = setup(RE0, [$^]), 49 Pid = spawn(?MODULE,compile_proc,[self(),RE]), 50 receive 51 {ok,Result} -> 52 Result 53 after 2000 -> 54 exit(Pid,force), 55 parse(RE) 56 end. 57 %% compile(RE). 58%%RE. 59compile_proc(From,RE) -> 60 Res = compile(RE), 61 From ! {ok,Res}. 62 63 64setup([$\\,$d|S],Acc) -> setup(S,"]9-0[" ++Acc); 65setup([$\\,$D|S],Acc) -> setup(S,"]9-0^[" ++Acc); 66setup([$\\,$s|S],Acc) -> setup(S,"]s\\t\\n\\r\\[" ++Acc); 67setup([$\\,$S|S],Acc) -> setup(S,"]\\s\\t\\n\\r^[" ++Acc); 68setup([$\\,$i|S],Acc) -> setup(S,"]z-aZ-A_:[" ++Acc); %% Only Latin-1 now 69setup([$\\,$I|S],Acc) -> setup(S,"]z-aZ-A_:^[" ++Acc); 70setup([$\\,$c|S],Acc) -> setup(S,"]9-0z-aZ-A_:."++[183]++"-[" ++Acc); 71setup([$\\,$C|S],Acc) -> setup(S,"]9-0z-aZ-A_:."++[183]++"-^[" ++Acc); 72%% fixme setup([$\\,$w|S]) -> {{char_class,"\s\t\n\r"},S}; 73%% fixme setup([$\\,$W|S]) -> {{comp_class,"\s\t\n\r"},S}; 74%% Letter, Any 75%% fixme setup(["\\p{L}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 76%% fixme setup(["\\P{L}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 77%% Letter, Uppercase 78%% fixme setup(["\\p{Lu}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 79%% fixme setup(["\\P{Lu}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 80%% Letter, Lowercase 81%% fixme setup(["\\p{Ll}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 82%% fixme setup(["\\P{Ll}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 83%% Letter, Titlecase 84%% fixme setup(["\\p{Lt}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 85%% fixme setup(["\\P{Lt}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 86%% Letter, Modifier 87%% fixme setup(["\\p{Lm}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 88%% fixme setup(["\\P{Lm}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 89%% Letter, Other 90%% fixme setup(["\\p{Lo}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 91%% fixme setup(["\\P{Lo}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 92%% Mark, Any 93%% fixme setup(["\\p{M}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 94%% fixme setup(["\\P{M}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 95%% Mark, Nonspacing 96%% fixme setup(["\\p{Mn}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 97%% fixme setup(["\\P{Mn}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 98%% Mark, Spacing Combining 99%% fixme setup(["\\p{Mc}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 100%% fixme setup(["\\P{Mc}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 101%% Mark, Enclosing 102%% fixme setup(["\\p{Me}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 103%% fixme setup(["\\P{Me}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 104%% Number, Any 105%% fixme setup(["\\p{N}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 106%% fixme setup(["\\P{N}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 107%% Number, Decimal Digit 108%% fixme setup(["\\p{Nd}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 109%% fixme setup(["\\P{Nd}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 110%% Number, Letter 111%% fixme setup(["\\p{Nl}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 112%% fixme setup(["\\P{Nl}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 113%% Number, Other 114%% fixme setup(["\\p{No}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 115%% fixme setup(["\\P{No}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 116%% Punctuation, Any 117%% fixme setup(["\\p{P}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 118%% fixme setup(["\\P{P}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 119%% Punctuation, Connector 120%% fixme setup(["\\p{Pc}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 121%% fixme setup(["\\P{Pc}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 122%% Punctuation, Dash 123%% fixme setup(["\\p{Pd}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 124%% fixme setup(["\\P{Pd}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 125%% Punctuation, Open 126%% fixme setup(["\\p{Ps}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 127%% fixme setup(["\\P{Ps}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 128%% Punctuation, Close 129%% fixme setup(["\\p{Pe}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 130%% fixme setup(["\\P{Pe}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 131%% Punctuation, Initial quote (may behave like Ps or Pe, depending on usage) 132%% fixme setup(["\\p{Pi}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 133%% fixme setup(["\\P{Pi}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 134%% Punctuation, Final quote (may behave like Ps or Pe, depending on usage) 135%% fixme setup(["\\p{Pf}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 136%% fixme setup(["\\P{Pf}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 137%% Punctuation, Other 138%% fixme setup(["\\p{Po}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 139%% fixme setup(["\\P{Po}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 140%% Symbol, Any 141%% fixme setup(["\\p{S}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 142%% fixme setup(["\\P{S}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 143%% Symbol, Math 144%% fixme setup(["\\p{Sm}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 145%% fixme setup(["\\P{Sm}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 146%% Symbol, Currency 147%% fixme setup(["\\p{Sc}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 148%% fixme setup(["\\P{Sc}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 149%% Symbol, Modifier 150%% fixme setup(["\\p{Sk}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 151%% fixme setup(["\\P{Sk}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 152%% Symbol, Other 153%% fixme setup(["\\p{So}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 154%% fixme setup(["\\P{So}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 155%% Separator, Any 156%% fixme setup(["\\p{Z}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 157%% fixme setup(["\\P{Z}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 158%% Separator, Space 159%% fixme setup(["\\p{Zs}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 160%% fixme setup(["\\P{Zs}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 161%% Separator, Line 162%% fixme setup(["\\p{Zl}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 163%% fixme setup(["\\P{Zl}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 164%% Separator, Paragraph 165%% fixme setup(["\\p{Zp}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 166%% fixme setup(["\\P{Zp}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 167%% Other, Any 168%% fixme setup(["\\p{C}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 169%% fixme setup(["\\P{C}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 170%% Other, Control 171%% fixme setup(["\\p{Cc}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 172%% fixme setup(["\\P{Cc}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 173%% Other, Format 174%% fixme setup(["\\p{Cf}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 175%% fixme setup(["\\P{Cf}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 176%% Other, Surrogate not supported by schema recommendation 177%% fixme setup(["\\p{Cs}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 178%% fixme setup(["\\P{Cs}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 179%% Other, Private Use 180%% fixme setup(["\\p{Co}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 181%% fixme setup(["\\P{Co}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 182%% Other, Not assigned (no characters in the file have this property) 183%% fixme setup(["\\p{Cn}" ++ S) -> {{char_class,"\s\t\n\r"},S}; 184%% fixme setup(["\\P{Cn}" ++ S) -> {{comp_class,"\s\t\n\r"},S}; 185setup([A|S], Acc) -> setup(S, [A|Acc]); 186setup([],Acc) -> reverse([$$|Acc]). 187 188%% sh_to_awk(ShellRegExp) 189%% Convert a sh style regexp into a full AWK one. The main difficulty is 190%% getting character sets right as the conventions are different. 191 192sh_to_awk(Sh) -> "^(" ++ sh_to_awk_1(Sh). %Fix the beginning 193 194sh_to_awk_1([$*|Sh]) -> %This matches any string 195 ".*" ++ sh_to_awk_1(Sh); 196sh_to_awk_1([$?|Sh]) -> %This matches any character 197 [$.|sh_to_awk_1(Sh)]; 198sh_to_awk_1([$[,$^,$]|Sh]) -> %This takes careful handling 199 "\\^" ++ sh_to_awk_1(Sh); 200%% Must move '^' to end. 201sh_to_awk_1("[^" ++ Sh) -> [$[|sh_to_awk_2(Sh, true)]; 202sh_to_awk_1("[!" ++ Sh) -> "[^" ++ sh_to_awk_2(Sh, false); 203sh_to_awk_1([$[|Sh]) -> [$[|sh_to_awk_2(Sh, false)]; 204sh_to_awk_1([C|Sh]) -> 205 %% Unspecialise everything else which is not an escape character. 206 case sh_special_char(C) of 207 true -> [$\\,C|sh_to_awk_1(Sh)]; 208 false -> [C|sh_to_awk_1(Sh)] 209 end; 210sh_to_awk_1([]) -> ")$". %Fix the end 211 212sh_to_awk_2([$]|Sh], UpArrow) -> [$]|sh_to_awk_3(Sh, UpArrow)]; 213sh_to_awk_2(Sh, UpArrow) -> sh_to_awk_3(Sh, UpArrow). 214 215sh_to_awk_3([$]|Sh], true) -> "^]" ++ sh_to_awk_1(Sh); 216sh_to_awk_3([$]|Sh], false) -> [$]|sh_to_awk_1(Sh)]; 217sh_to_awk_3([C|Sh], UpArrow) -> [C|sh_to_awk_3(Sh, UpArrow)]; 218sh_to_awk_3([], true) -> [$^|sh_to_awk_1([])]; 219sh_to_awk_3([], false) -> sh_to_awk_1([]). 220 221%% -type sh_special_char(char()) -> bool(). 222%% Test if a character is a special character. 223 224sh_special_char($|) -> true; 225sh_special_char($*) -> true; 226sh_special_char($+) -> true; 227sh_special_char($?) -> true; 228sh_special_char($() -> true; 229sh_special_char($)) -> true; 230sh_special_char($\\) -> true; 231sh_special_char($^) -> true; 232sh_special_char($$) -> true; 233sh_special_char($.) -> true; 234sh_special_char($[) -> true; 235sh_special_char($]) -> true; 236sh_special_char($") -> true; 237sh_special_char(_C) -> false. 238 239%% parse(RegExp) -> {ok,RE} | {error,E}. 240%% Parse the regexp described in the string RegExp. 241 242parse(S) -> 243 case catch reg(S, 0) of 244 {R,Sc,[]} -> {ok,{regexp,{R,Sc}}}; 245 {_R,_Sc,[C|_]} -> {error,{illegal,[C]}}; 246 {error,E} -> {error,E} 247 end. 248 249%% format_error(Error) -> String. 250 251format_error({interval_range,What}) -> 252 ["illegal interval range",io_lib:write_string(What)]; 253format_error({illegal,What}) -> ["illegal character `",What,"'"]; 254format_error({unterminated,What}) -> ["unterminated `",What,"'"]; 255format_error({posix_cc,What}) -> 256 ["illegal POSIX character class ",io_lib:write_string(What)]; 257format_error({char_class,What}) -> 258 ["illegal character class ",io_lib:write_string(What)]. 259 260%% match(String, RegExp) -> {match,Start,Length} | nomatch | {error,E}. 261%% Find the longest match of RegExp in String. 262 263match(S, RegExp) when is_list(RegExp) -> 264 case parse(RegExp) of 265 {ok,RE} -> match(S, RE); 266 {error,E} -> {error,E} 267 end; 268match(S, {regexp,RE}) -> 269 case match_re(RE, S, 1, 0, -1) of 270 {Start,Len} when Len >= 0 -> 271 {match,Start,Len}; 272 {_Start,_Len} -> nomatch 273 end; 274match(S, {comp_regexp,RE}) -> 275 case match_comp(RE, S, 1, 0, -1) of 276 {Start,Len} when Len >= 0 -> 277 {match,Start,Len}; 278 {_Start,_Len} -> nomatch 279 end. 280 281match_re(RE, [_|Cs]=S0, P0, Mst, Mlen) -> 282 case re_apply(S0, P0, RE) of 283 {match,P1,_S1,_Subs} -> 284 Len = P1-P0, 285 if Len > Mlen -> match_re(RE, Cs, P0+1, P0, Len); 286 true -> match_re(RE, Cs, P0+1, Mst, Mlen) 287 end; 288 nomatch -> match_re(RE, Cs, P0+1, Mst, Mlen); 289 never_match -> {Mst,Mlen} %No need to go on 290 end; 291match_re(_RE, _S, _P, Mst, Mlen) -> {Mst,Mlen}. 292 293match_comp(RE, [_|Cs]=S0, P0, Mst, Mlen) -> 294 case comp_apply(S0, P0, RE) of 295 {match,P1,_S1} -> 296 Len = P1-P0, 297 if Len > Mlen -> match_comp(RE, Cs, P0+1, P0, Len); 298 true -> match_comp(RE, Cs, P0+1, Mst, Mlen) 299 end; 300 nomatch -> match_comp(RE, Cs, P0+1, Mst, Mlen) 301 end; 302match_comp(_RE, _S, _P, Mst, Mlen) -> {Mst,Mlen}. 303 304%% match_re(RE, S0, Pos0, Mst, Mlen) -> 305%% case first_match_re(RE, S0, Pos0) of 306%% {St,Len,_} -> %Found a match 307%% Pos1 = St + 1, %Where to start next match 308%% S1 = lists:nthtail(Pos1-Pos0, S0), 309%% if Len > Mlen -> match_re(RE, S1, Pos1, St, Len); 310%% true -> match_re(RE, S1, Pos1, Mst, Mlen) 311%% end; 312%% nomatch -> {Mst,Mlen} 313%% end. 314 315%% match_comp(RE, S0, Pos0, Mst, Mlen) -> 316%% case first_match_comp(RE, S0, Pos0) of 317%% {St,Len} -> %Found a match 318%% Pos1 = St + 1, %Where to start next match 319%% S1 = lists:nthtail(Pos1-Pos0, S0), 320%% if Len > Mlen -> match_comp(RE, S1, Pos1, St, Len); 321%% true -> match_comp(RE, S1, Pos1, Mst, Mlen) 322%% end; 323%% nomatch -> {Mst,Mlen} 324%% end. 325 326%% first_match(String, RegExp) -> {match,Start,Length} | nomatch | {error,E}. 327%% Find the first match of RegExp in String. 328 329first_match(S, RegExp) when is_list(RegExp) -> 330 case parse(RegExp) of 331 {ok,RE} -> first_match(S, RE); 332 {error,E} -> {error,E} 333 end; 334first_match(S, {regexp,RE}) -> 335 case first_match_re(RE, S, 1) of 336 {Start,Len,_} -> {match,Start,Len}; 337 nomatch -> nomatch 338 end; 339first_match(S, {comp_regexp,RE}) -> 340 case first_match_comp(RE, S, 1) of 341 {Start,Len} -> {match,Start,Len}; 342 nomatch -> nomatch 343 end. 344 345first_match_re(RE, S, St) when S /= [] -> 346 case re_apply(S, St, RE) of 347 {match,P,_Rest,Subs} -> {St,P-St,Subs}; 348 nomatch -> first_match_re(RE, tl(S), St+1); 349 never_match -> nomatch 350 end; 351first_match_re(_RE, [], _St) -> nomatch. 352 353first_match_comp(RE, S, St) when S /= [] -> 354 case comp_apply(S, St, RE) of 355 {match,P,_Rest} -> {St,P-St}; 356 nomatch -> first_match_comp(RE, tl(S), St+1) 357 end; 358first_match_comp(_RE, [], _St) -> nomatch. 359 360%% matches(String, RegExp) -> {match,[{Start,Length}]} | {error,E}. 361%% Return the all the non-overlapping matches of RegExp in String. 362 363matches(S, RegExp) when is_list(RegExp) -> 364 case parse(RegExp) of 365 {ok,RE} -> matches(S, RE); 366 {error,E} -> {error,E} 367 end; 368matches(S, {regexp,RE}) -> {match,matches_re(S, RE, 1)}; 369matches(S, {comp_regexp,RE}) -> {match,matches_comp(S, RE, 1)}. 370 371matches_re([_|Cs]=S0, RE, P0) -> 372 case re_apply(S0, P0, RE) of 373 {match,P0,S1,_Subs} -> %0 length match 374 [{P0,0}|matches_re(tl(S1), RE, P0+1)]; 375 {match,P1,S1,_Subs} -> 376 [{P0,P1-P0}|matches_re(S1, RE, P1)]; 377 nomatch -> matches_re(Cs, RE, P0+1); 378 never_match -> [] 379 end; 380matches_re([], _RE, _P) -> []. 381 382matches_comp([_|Cs]=S0, RE, P0) -> 383 case comp_apply(S0, P0, RE) of 384 {match,P0,S1} -> %0 length match 385 [{P0,0}|matches_comp(tl(S1), RE, P0+1)]; 386 {match,P1,S1} -> 387 [{P0,P1-P0}|matches_comp(S1, RE, P1)]; 388 nomatch -> matches_comp(Cs, RE, P0+1) 389 end; 390matches_comp([], _RE, _P) -> []. 391 392%% sub(String, RegExp, Replace) -> {ok,RepString,RepCount} | {error,E}. 393%% Substitute the first match of the regular expression RegExp with 394%% the string Replace in String. Accept pre-parsed regular 395%% expressions. 396 397sub(String, RegExp, Rep) when is_list(RegExp) -> 398 case parse(RegExp) of 399 {ok,RE} -> sub(String, RE, Rep); 400 {error,E} -> {error,E} 401 end; 402sub(String, {regexp,RE}, Rep) -> 403 case sub_re(String, 1, RE, [], Rep) of 404 {yes,NewStr} -> {ok,NewStr,1}; 405 no -> {ok,String,0} 406 end; 407sub(String, {comp_regexp,RE}, Rep) -> 408 case sub_comp(String, 1, RE, [], Rep) of 409 {yes,NewStr} -> {ok,NewStr,1}; 410 no -> {ok,String,0} 411 end. 412 413%% sub_re(String, Position, Regexp, Before, Replacement) -> 414%% {NewString,Count}. 415%% sub_comp(String, Position, Regexp, Before, Replacement) -> 416%% {NewString,Count}. 417%% Step forward over String until a match is found saving stepped over 418%% chars in Before. Return reversed Before prepended to replacement 419%% and rest of string. 420 421sub_re([C|Cs]=S0, P0, RE, Bef, Rep) -> 422 case re_apply(S0, P0, RE) of 423 {match,P0,_S1,_} -> %Ignore 0 length match 424 sub_re(Cs, P0+1, RE, [C|Bef], Rep); 425 {match,P1,Rest,_Gps} -> 426 {yes,reverse(Bef, sub_repl(Rep, substr(S0, 1, P1-P0), Rest))}; 427 nomatch -> sub_re(Cs, P0+1, RE, [C|Bef], Rep); 428 never_match -> no %No need to go on 429 end; 430sub_re([], _P, _RE, _Bef, _Rep) -> no. 431 432sub_comp([C|Cs]=S0, P0, RE, Bef, Rep) -> 433 case comp_apply(S0, P0, RE) of 434 {match,P0,_S1} -> %Ignore 0 length match 435 sub_comp(Cs, P0+1, RE, [C|Bef], Rep); 436 {match,P1,Rest} -> 437 {yes,reverse(Bef, sub_repl(Rep, substr(S0, 1, P1-P0), Rest))}; 438 nomatch -> sub_comp(Cs, P0+1, RE, [C|Bef], Rep) 439 end; 440sub_comp([], _P, _RE, _Bef, _Rep) -> no. 441 442sub_repl([$&|Rep], M, Rest) -> M ++ sub_repl(Rep, M, Rest); 443sub_repl("\\&" ++ Rep, M, Rest) -> [$&|sub_repl(Rep, M, Rest)]; 444sub_repl([C|Rep], M, Rest) -> [C|sub_repl(Rep, M, Rest)]; 445sub_repl([], _M, Rest) -> Rest. 446 447%% gsub(String, RegExp, Replace) -> {ok,RepString,RepCount} | {error,E}. 448%% Substitute every match of the regular expression RegExp with the 449%% string New in String. Accept pre-parsed regular expressions. 450 451gsub(String, RegExp, Rep) when is_list(RegExp) -> 452 case parse(RegExp) of 453 {ok,RE} -> gsub(String, RE, Rep); 454 {error,E} -> {error,E} 455 end; 456gsub(String, {regexp,RE}, Rep) -> 457 case gsub_re(String, 1, RE, [], Rep) of 458 {NewStr,N} -> {ok,NewStr,N}; 459 no -> {ok,String,0} %No substitutions 460 end; 461gsub(String, {comp_regexp,RE}, Rep) -> 462 case gsub_comp(String, 1, RE, [], Rep) of 463 {NewStr,N} -> {ok,NewStr,N}; 464 no -> {ok,String,0} %No substitutions 465 end. 466 467%% gsub_re(String, Position, Regexp, Before, Replacement) -> 468%% {NewString,Count}. 469%% gsub_comp(String, Position, Regexp, Before, Replacement) -> 470%% {NewString,Count}. 471%% Step forward over String until a match is found saving stepped over 472%% chars in Before. Call recursively to do rest of string after 473%% match. Return reversed Before prepended to return from recursive 474%% call. 475 476gsub_re([C|Cs]=S0, P0, RE, Bef, Rep) -> 477 case re_apply(S0, P0, RE) of 478 {match,P0,_S1,_} -> %Ignore 0 length match 479 gsub_re(Cs, P0+1, RE, [C|Bef], Rep); 480 {match,P1,S1,_Gps} -> 481 case gsub_re(S1, P1, RE, [], Rep) of 482 {NewStr,N0} -> %Substituitions 483 {reverse(Bef, sub_repl(Rep, substr(S0, 1, P1-P0), NewStr)), 484 N0+1}; 485 no -> %No substituitions. 486 {reverse(Bef, sub_repl(Rep, substr(S0, 1, P1-P0), S1)),1} 487 end; 488 %%No match so step forward saving C on Bef. 489 nomatch -> gsub_re(Cs, P0+1, RE, [C|Bef], Rep); 490 never_match -> no %No need to go on 491 end; 492gsub_re([], _P, _RE, _Bef, _Rep) -> no. 493 494gsub_comp([C|Cs]=S0, P0, RE, Bef, Rep) -> 495 case comp_apply(S0, P0, RE) of 496 {match,P0,_S1} -> %Ignore 0 length match 497 gsub_comp(Cs, P0+1, RE, [C|Bef], Rep); 498 {match,P1,S1} -> 499 case gsub_comp(S1, P1, RE, [], Rep) of 500 {NewStr,N0} -> %Substituitions 501 {reverse(Bef, sub_repl(Rep, substr(S0, 1, P1-P0), NewStr)), 502 N0+1}; 503 no -> %No substituitions. 504 {reverse(Bef, sub_repl(Rep, substr(S0, 1, P1-P0), S1)),1} 505 end; 506 %%No match so step forward saving C on Bef. 507 nomatch -> gsub_comp(Cs, P0+1, RE, [C|Bef], Rep) 508 end; 509gsub_comp([], _P, _RE, _Bef, _Rep) -> no. 510 511%% split(String, RegExp) -> {ok,[SubString]} | {error,E}. 512%% Split a string into substrings where the RegExp describes the 513%% field seperator. The RegExp " " is specially treated. 514 515split(String, " ") -> %This is really special 516 {ok,{regexp,RE}} = parse("[ \t]+"), 517 case split_apply_re(String, RE, true) of 518 [[]|Ss] -> {ok,Ss}; 519 Ss -> {ok,Ss} 520 end; 521split(String, RegExp) when is_list(RegExp) -> 522 case parse(RegExp) of 523 {ok,{regexp,RE}} -> {ok,split_apply_re(String, RE, false)}; 524 {error,E} -> {error,E} 525 end; 526split(String, {regexp,RE}) -> {ok,split_apply_re(String, RE, false)}; 527split(String, {comp_regexp,RE}) -> {ok,split_apply_comp(String, RE, false)}. 528 529split_apply_re(S, RE, Trim) -> split_apply_re(S, 1, RE, Trim, []). 530 531split_apply_re([], _P, _RE, true, []) -> []; 532split_apply_re([], _P, _RE, _T, Sub) -> [reverse(Sub)]; 533split_apply_re([C|Cs]=S, P0, RE, T, Sub) -> 534 case re_apply(S, P0, RE) of 535 {match,P0,_S1,_} -> %Ignore 0 length match 536 split_apply_re(Cs, P0+1, RE, T, [C|Sub]); 537 {match,P1,S1,_} -> 538 [reverse(Sub)|split_apply_re(S1, P1, RE, T, [])]; 539 nomatch -> 540 split_apply_re(Cs, P0+1, RE, T, [C|Sub]); 541 never_match -> [reverse(Sub, S)] %No need to go on 542 end. 543 544split_apply_comp(S, RE, Trim) -> split_apply_comp(S, 1, RE, Trim, []). 545 546%%split_apply_comp([], _P, _RE, true, []) -> []; 547split_apply_comp([], _P, _RE, _T, Sub) -> [reverse(Sub)]; 548split_apply_comp([C|Cs]=S, P0, RE, T, Sub) -> 549 case comp_apply(S, P0, RE) of 550 {match,P0,_S1} -> %Ignore 0 length match 551 split_apply_comp(Cs, P0+1, RE, T, [C|Sub]); 552 {match,P1,S1} -> 553 [reverse(Sub)|split_apply_comp(S1, P1, RE, T, [])]; 554 nomatch -> 555 split_apply_comp(Cs, P0+1, RE, T, [C|Sub]) 556 end. 557 558%% sub_match(String, RegExp) -> 559%% {match,Start,Length,SubExprs} | nomatch | {error,E}. 560%% Find the longest match of RegExp in String. 561 562sub_match(S, RegExp) when is_list(RegExp) -> 563 case parse(RegExp) of 564 {ok,RE} -> sub_match(S, RE); 565 {error,E} -> {error,E} 566 end; 567sub_match(S, {regexp,RE}) -> 568 case sub_match_re(RE, S, 1, 0, -1, none) of 569 {Start,Len,Subs} when Len >= 0 -> 570 {match,Start,Len,Subs}; 571 {_Start,_Len,_Subs} -> nomatch 572 end. 573 574sub_match_re(RE, S0, Pos0, Mst, Mlen, Msubs) -> 575 case first_match_re(RE, S0, Pos0) of 576 {St,Len,Subs} -> %Found a match 577 Pos1 = St + 1, %Where to start next match 578 S1 = lists:nthtail(Pos1-Pos0, S0), 579 if Len > Mlen -> sub_match_re(RE, S1, Pos1, St, Len, Subs); 580 true -> sub_match_re(RE, S1, Pos1, Mst, Mlen, Msubs) 581 end; 582 nomatch -> {Mst,Mlen,Msubs} 583 end. 584 585%% sub_first_match(String, RegExp) -> 586%% {match,Start,Length,SubExprs} | nomatch | {error,E}. 587%% Find the longest match of RegExp in String, return Start and Length 588%% as well as tuple of sub-expression matches. 589 590sub_first_match(S, RegExp) when is_list(RegExp) -> 591 {ok,RE} = parse(RegExp), 592 sub_first_match(S, RE); 593sub_first_match(S, {regexp,RE}) -> 594 case first_match_re(RE, S, 1) of 595 {St,Len,Subs} -> {match,St,Len,Subs}; 596 nomatch -> nomatch 597 end. 598 599 600%% This is the regular expression grammar used. It is equivalent to the 601%% one used in AWK, except that we allow ^ $ to be used anywhere and fail 602%% in the matching. 603%% 604%% reg -> reg1 : '$1'. 605%% reg1 -> reg1 "|" reg2 : {'or','$1','$2'}. 606%% reg1 -> reg2 : '$1'. 607%% reg2 -> reg2 reg3 : {concat,'$1','$2'}. 608%% reg2 -> reg3 : '$1'. 609%% reg3 -> reg3 "*" : {kclosure,'$1'}. 610%% reg3 -> reg3 "+" : {pclosure,'$1'}. 611%% reg3 -> reg3 "?" : {optional,'$1'}. 612%% reg3 -> reg3 "{" [Min],[Max] "}" : {closure_range, Num, '$1'} see below 613%% reg3 -> reg4 : '$1'. 614%% reg4 -> "(" reg ")" : '$2'. 615%% reg4 -> "\\" char : '$2'. 616%% reg4 -> "^" : bos. 617%% reg4 -> "$" : eos. 618%% reg4 -> "." : char. 619%% reg4 -> "[" class "]" : {char_class,char_class('$2')} 620%% reg4 -> "[" "^" class "]" : {comp_class,char_class('$3')} 621%% reg4 -> "\"" chars "\"" : char_string('$2') 622%% reg4 -> char : '$1'. 623%% reg4 -> empty : epsilon. 624%% The grammar of the current regular expressions. The actual parser 625%% is a recursive descent implementation of the grammar. 626 627reg(S, Sc) -> reg1(S, Sc). 628 629%% reg1 -> reg2 reg1' 630%% reg1' -> "|" reg2 631%% reg1' -> empty 632 633reg1(S0, Sc0) -> 634 {L,Sc1,S1} = reg2(S0, Sc0), 635 reg1p(S1, L, Sc1). 636 637reg1p([$||S0], L, Sc0) -> 638 {R,Sc1,S1} = reg2(S0, Sc0), 639 reg1p(S1, {'or',L,R}, Sc1); 640reg1p(S, L, Sc) -> {L,Sc,S}. 641 642%% reg2 -> reg3 reg2' 643%% reg2' -> reg3 644%% reg2' -> empty 645 646reg2(S0, Sc0) -> 647 {L,Sc1,S1} = reg3(S0, Sc0), 648 reg2p(S1, L, Sc1). 649 650reg2p([C|S0], L, Sc0) when C /= $|, C /= $) -> 651 {R,Sc1,S1} = reg3([C|S0], Sc0), 652 %% reg2p(S1, {concat,L,R}, Sc1); 653 case is_integer(R) of 654 true -> 655 case L of 656 {literal,Lit} -> 657 reg2p(S1, {literal,Lit ++[R]}, Sc1); 658 {concat,S2,Char} when is_integer(Char) -> 659 reg2p(S1, {concat,S2,{literal,[Char,R]}}, Sc1); 660 {concat,S2,{literal,Lit}} -> 661 reg2p(S1, {concat,S2,{literal,Lit ++ [R]}}, Sc1); 662 Char when is_integer(Char) -> 663 reg2p(S1, {literal,[Char,R]}, Sc1); 664 _ -> 665 reg2p(S1, {concat,L,R}, Sc1) 666 end; 667 false -> 668 reg2p(S1, {concat,L,R}, Sc1) 669 end; 670reg2p(S, L, Sc) -> {L,Sc,S}. 671 672%% reg3 -> reg4 reg3' 673%% reg3' -> "*" reg3' 674%% reg3' -> "+" reg3' 675%% reg3' -> "?" reg3' 676%% reg3' -> "{" [Min],[Max] "}" reg3' 677%% reg3' -> empty 678 679reg3(S0, Sc0) -> 680 {L,Sc1,S1} = reg4(S0, Sc0), 681 reg3p(S1, L, Sc1). 682 683reg3p([$*|S], L, Sc) -> reg3p(S, {kclosure,L}, Sc); 684reg3p([$+|S], L, Sc) -> reg3p(S, {pclosure,L}, Sc); 685reg3p([$?|S], L, Sc) -> reg3p(S, {optional,L}, Sc); 686reg3p([${|Cs0], L, Sc) -> % $} 687 case interval_range(Cs0) of 688 {none,none,_Cs1} -> parse_error({interval_range,[${|Cs0]}); 689 {N,M,[$}|Cs1]} -> reg3p(Cs1, {iclosure,L,N,M}, Sc); 690 {_N,_M,_Cs1} -> parse_error({unterminated,"{"}) 691 end; 692reg3p(S, L, Sc) -> {L,Sc,S}. 693 694reg4([$(|S0], Sc0) -> 695 Sc1 = Sc0+1, 696 case reg(S0, Sc1) of 697 {R,Sc2,[$)|S1]} -> {{subexpr,Sc1,R},Sc2,S1}; 698 {_R,_Sc,_S} -> parse_error({unterminated,"("}) 699 end; 700reg4([$^|S], Sc) -> {bos,Sc,S}; 701reg4([$$|S], Sc) -> {eos,Sc,S}; 702reg4([$.|S], Sc) -> {{comp_class,"\n"},Sc,S}; 703reg4("[^" ++ S0, Sc) -> 704 case char_class(S0) of 705 {Cc,[$]|S1]} -> {{comp_class,Cc},Sc,S1}; 706 {_Cc,_S} -> parse_error({unterminated,"["}) 707 end; 708reg4([$[|S0], Sc) -> 709 case char_class(S0) of 710 {Cc,[$]|S1]} -> {{char_class,Cc},Sc,S1}; 711 {_Cc,_S1} -> parse_error({unterminated,"["}) 712 end; 713%reg4([$"|S0], Sc) -> 714% case char_string(S0) of 715% {St,[$"|S1]} -> {St,Sc,S1}; 716% {St,S1} -> parse_error({unterminated,"\""}) 717% end; 718reg4([C0|S0], Sc) when 719 is_integer(C0), C0 /= $*, C0 /= $+, C0 /= $?, C0 /= $], C0 /= $), C0 /= $} -> 720 %% Handle \ quoted characters as well, at least those we see. 721 {C1,S1} = char(C0, S0), 722 {C1,Sc,S1}; 723reg4(S=[$)|_], Sc) -> {epsilon,Sc,S}; 724reg4([C|_S], _Sc) -> parse_error({illegal,[C]}); 725reg4([], Sc) -> {epsilon,Sc,[]}. 726 727char($\\, [O1,O2,O3|S]) when 728 O1 >= $0, O1 =< $7, O2 >= $0, O2 =< $7, O3 >= $0, O3 =< $7 -> 729 {(O1*8 + O2)*8 + O3 - 73*$0,S}; 730char($\\, [C|S]) -> {escape_char(C),S}; 731char($\\, []) -> parse_error({unterminated,"\\"}); 732char(C, S) -> {C,S}. 733 734escape_char($n) -> $\n; %\n = LF 735escape_char($r) -> $\r; %\r = CR 736escape_char($t) -> $\t; %\t = TAB 737escape_char($v) -> $\v; %\v = VT 738escape_char($b) -> $\b; %\b = BS 739escape_char($f) -> $\f; %\f = FF 740escape_char($e) -> $\e; %\e = ESC 741escape_char($s) -> $\s; %\s = SPACE 742escape_char($d) -> $\d; %\d = DEL 743escape_char(C) -> C. 744 745char_class([$]|S0]) -> 746 {Cc,S1} = char_class(S0, [$]]), 747 {pack_cc(Cc),S1}; 748char_class(S0) -> 749 {Cc,S1} = char_class(S0, []), 750 {pack_cc(Cc),S1}. 751 752pack_cc(Cc0) -> 753 %% First sort the list. 754 Cc1 = lists:usort(fun ({Cf1,_}, {Cf2,_}) -> Cf1 < Cf2; 755 ({Cf1,_}, C) -> Cf1 < C; 756 (C, {Cf,_}) -> C < Cf; 757 (C1, C2) -> C1 =< C2 758 end, Cc0), 759 pack_cc1(Cc1). 760 761pack_cc1([{Cf1,Cl1},{Cf2,Cl2}|Cc]) when Cl1 >= Cf2, Cl1 =< Cl2 -> 762 pack_cc1([{Cf1,Cl2}|Cc]); 763pack_cc1([{Cf1,Cl1},{Cf2,Cl2}|Cc]) when Cl1 >= Cf2, Cl1 >= Cl2 -> 764 pack_cc1([{Cf1,Cl1}|Cc]); 765pack_cc1([{Cf1,Cl1},{Cf2,Cl2}|Cc]) when Cl1+1 == Cf2 -> 766 pack_cc1([{Cf1,Cl2}|Cc]); 767pack_cc1([{Cf,Cl},C|Cc]) when Cl >= C -> pack_cc1([{Cf,Cl}|Cc]); 768pack_cc1([{Cf,Cl},C|Cc]) when Cl+1 == C -> pack_cc1([{Cf,C}|Cc]); 769pack_cc1([C,{Cf,Cl}|Cc]) when C == Cf-1 -> pack_cc1([{C,Cl}|Cc]); 770pack_cc1([C1,C2|Cc]) when C1+1 == C2 -> pack_cc1([{C1,C2}|Cc]); 771pack_cc1([C|Cc]) -> [C|pack_cc1(Cc)]; 772pack_cc1([]) -> []. 773 774char_class("[:" ++ S0, Cc0) -> %Start of POSIX char class 775 case posix_cc(S0, Cc0) of 776 {Cc1,":]" ++ S1} -> char_class(S1, Cc1); 777 {_,_S1} -> parse_error({posix_cc,"[:" ++ S0}) 778 end; 779char_class([C1|S0], Cc) when C1 /= $] -> 780 case char(C1, S0) of 781 {Cf,[$-,C2|S1]} when C2 /= $] -> 782 case char(C2, S1) of 783 {Cl,S2} when Cf < Cl -> char_class(S2, [{Cf,Cl}|Cc]); 784 {_Cl,_S2} -> parse_error({char_class,[C1|S0]}) 785 end; 786 {C,S1} -> char_class(S1, [C|Cc]) 787 end; 788char_class(S, Cc) -> {Cc,S}. 789 790%% posix_cc(String, CharClass) -> {NewCharClass,RestString}. 791%% Handle POSIX character classes, use Latin-1 character set. 792 793posix_cc("alnum" ++ S, Cc) -> 794 {[{$0,$9},{$A,$Z},{192,214},{216,223},{$a,$z},{224,246},{248,255}|Cc],S}; 795posix_cc("alpha" ++ S, Cc) -> 796 {[{$A,$Z},{192,214},{216,223},{$a,$z},{224,246},{248,255}|Cc],S}; 797posix_cc("blank" ++ S, Cc) -> {[$\s,$\t,160|Cc],S}; 798posix_cc("cntrl" ++ S, Cc) -> {[{0,31},{127,159}|Cc],S}; 799posix_cc("digit" ++ S, Cc) -> {[{$0,$9}|Cc],S}; 800posix_cc("graph" ++ S, Cc) -> {[{33,126},{161,255}|Cc],S}; 801posix_cc("lower" ++ S, Cc) -> {[{$a,$z},{224,246},{248,255}|Cc],S}; 802posix_cc("print" ++ S, Cc) -> {[{32,126},{160,255}|Cc],S}; 803posix_cc("punct" ++ S, Cc) -> {[{$!,$/},{$:,$?},{${,$~},{161,191}|Cc],S}; 804posix_cc("space" ++ S, Cc) -> {[$\s,$\t,$\f,$\r,$\v,160|Cc],S}; 805posix_cc("upper" ++ S, Cc) -> {[{$A,$Z},{192,214},{216,223}|Cc],S}; 806posix_cc("xdigit" ++ S, Cc) -> {[{$a,$f},{$A,$F},{$0,$9}|Cc],S}; 807posix_cc(S, _Cc) -> parse_error({posix_cc,"[:" ++ S}). 808 809interval_range(Cs0) -> 810 case number(Cs0) of 811 {none,Cs1} -> {none,none,Cs1}; 812 {N,[$,|Cs1]} -> 813 case number(Cs1) of 814 {none,Cs2} -> {N,any,Cs2}; 815 {M,Cs2} -> {N,M,Cs2} 816 end; 817 {N,Cs1} -> {N,none,Cs1} 818 end. 819 820number([C|Cs]) when C >= $0, C =< $9 -> 821 number(Cs, C - $0); 822number(Cs) -> {none,Cs}. 823 824number([C|Cs], Acc) when C >= $0, C =< $9 -> 825 number(Cs, 10*Acc + (C - $0)); 826number(Cs, Acc) -> {Acc,Cs}. 827 828parse_error(E) -> throw({error,E}). 829 830%char_string([C|S]) when C /= $" -> char_string(S, C); 831%char_string(S) -> {epsilon,S}. 832 833%char_string([C|S0], L) when C /= $" -> 834% char_string(S0, {concat,L,C}); 835%char_string(S, L) -> {L,S}. 836 837%% re_apply(String, StartPos, RegExp) -> 838%% {match,RestPos,Rest,SubExprs} | nomatch. 839%% 840%% Apply the (parse of the) regular expression RegExp to String. If 841%% there is a match return the position of the remaining string and 842%% the string if else return 'nomatch'. 843%% 844%% StartPos should be the real start position as it is used to decide 845%% if we are at the beginning of the string. 846 847re_apply(S, St, {RE,Sc}) -> 848 Subs = erlang:make_tuple(Sc, none), %Make a sub-regexp table. 849 Res = re_apply(RE, [], S, St, Subs), 850 %% ?dbg("~p x ~p -> ~p\n", [RE,S,Res]), 851 Res. 852 853re_apply(epsilon, More, S, P, Subs) -> %This always matches 854 re_apply_more(More, S, P, Subs); 855re_apply({'or',RE1,RE2}, More, S, P, Subs) -> 856 re_apply_or(re_apply(RE1, More, S, P, Subs), 857 re_apply(RE2, More, S, P, Subs)); 858re_apply({concat,RE1,RE2}, More, S0, P, Subs) -> 859 re_apply(RE1, [RE2|More], S0, P, Subs); 860re_apply({literal,[C|Lcs]}, More, [C|S], P, Subs) -> 861 re_apply_lit(Lcs, More, S, P+1, Subs); %Have matched first char 862re_apply({kclosure,RE}, More, S0, P0, Subs0) -> 863 %% Greedy so try RE first, no difference here actually. 864 Loop = case re_apply(RE, [], S0, P0, Subs0) of 865 {match,P0,_S1,_Subs1} -> %0 length match, don't loop! 866 nomatch; 867 {match,P1,S1,Subs1} -> 868 re_apply_more([{kclosure,RE}|More], S1, P1, Subs1); 869 nomatch -> nomatch; 870 never_match -> never_match 871 end, 872 re_apply_or(Loop, re_apply_more(More, S0, P0, Subs0)); 873re_apply({pclosure,RE}, More, S, P, Subs) -> 874 re_apply(RE, [{kclosure,RE}|More], S, P, Subs); 875re_apply({optional,RE}, More, S, P, Subs) -> 876 %% Greedy so try RE first, no difference here actually. 877 re_apply_or(re_apply(RE, More, S, P, Subs), 878 re_apply_more(More, S, P, Subs)); 879re_apply({iclosure,RE,N,M}, More, S, P, Subs) when N > 0 -> 880 re_apply(RE, [{iclosure,RE,N-1,M}|More], S, P, Subs); 881re_apply({iclosure,RE,0,M}, More, S, P, Subs) -> 882 Exp = expand_opt(RE, M), 883 re_apply(Exp, More, S, P, Subs); 884re_apply({subexpr,N,RE}, More, S, P, Subs) -> 885 re_apply(RE, [{endsub,N,P}|More], S, P, Subs); 886re_apply({endsub,N,St}, More, S, P, Subs0) -> 887 Subs1 = setelement(N, Subs0, {St,P-St}), %Record sub-expr 888 re_apply_more(More, S, P, Subs1); 889re_apply(bos, More, S, 1, Subs) -> re_apply_more(More, S, 1, Subs); 890re_apply(bos, _More, _S, _, _) -> never_match; 891re_apply(eos, More, [$\n], P, Subs) -> re_apply_more(More, [], P, Subs); 892re_apply(eos, More, [], P, Subs) -> re_apply_more(More, [], P, Subs); 893re_apply({char_class,Cc}, More, [C|S], P, Subs) -> 894 case in_char_class(C, Cc) of 895 true -> re_apply_more(More, S, P+1, Subs); 896 false -> nomatch 897 end; 898re_apply({comp_class,Cc}, More, [C|S], P, Subs) -> 899 case in_char_class(C, Cc) of 900 true -> nomatch; 901 false -> re_apply_more(More, S, P+1, Subs) 902 end; 903re_apply(C, More, [C|S], P, Subs) when is_integer(C) -> 904 re_apply_more(More, S, P+1, Subs); 905re_apply(_RE, _More, _S, _P, _Subs) -> 906 %% ?dbg("~p : ~p\n", [_RE,_S]), 907 nomatch. 908 909%% re_apply_more([RegExp], String, Length, SubsExprs) -> 910%% {match,RestPos,Rest,SubExprs} | nomatch. 911 912re_apply_more([RE|More], S, P, Subs) -> re_apply(RE, More, S, P, Subs); 913re_apply_more([], S, P, Subs) -> {match,P,S,Subs}. 914 915%% re_apply_lit(Literal, More, String, Position, SubExprs) -> 916%% {match,RestPos,Rest,SubExprs} | nomatch. 917re_apply_lit([C|Lit], More, [C|Cs], P, Subs) -> 918 re_apply_lit(Lit, More, Cs, P+1, Subs); 919re_apply_lit([], More, Cs, P, Subs) -> 920 re_apply_more(More, Cs, P, Subs); 921re_apply_lit(_Lit, _More, _Cs, _P, _Subs) -> 922 nomatch. 923 924%% expand_iclosure(RE, N, M) -> RE. 925 926expand_iclosure(RE, 0, M) -> expand_opt(RE, M); 927expand_iclosure(RE, N, M) -> 928 {concat,RE,expand_iclosure(RE, N-1, M)}. 929 930%% expand_opt(RegExp, Count) -> RE. 931%% Handle all the cases. 932 933expand_opt(_RE, none) -> epsilon; 934expand_opt(RE, any) -> {kclosure,RE}; 935expand_opt(_RE, 0) -> epsilon; 936expand_opt(RE, 1) -> {optional,RE}; 937expand_opt(RE, N) -> 938 {optional,{concat,RE,expand_opt(RE, N-1)}}. 939 940%% find_prefix(PrefixStr, SourceStr) 941%% if PrefixStr is a prefix of Str then return {ok,RemainingStr} 942%% otherwise return false 943 944%% find_prefix([C|Prest], [C|Rest]) -> 945%% find_prefix(Prest, Rest); 946%% find_prefix([], Rest) -> {yes,Rest}; 947%% find_prefix(_, _) -> no. 948 949%% in_char_class(Char, Class) -> bool(). 950 951in_char_class(C, [{C1,C2}|_Cc]) when C >= C1, C =< C2 -> true; 952in_char_class(C, [C|_Cc]) -> true; 953in_char_class(C, [_|Cc]) -> in_char_class(C, Cc); 954in_char_class(_C, []) -> false. 955 956%% re_apply_or(Match1, Match2, SubExprs) -> 957%% {match,RestPos,Rest,SubExprs} | nomatch. 958%% If we want the best match then choose the longest match, else just 959%% choose one by trying sequentially. 960 961re_apply_or(M1={match,P1,_,_},{match,P2,_,_}) when P1 >= P2 -> M1; 962re_apply_or({match,_,_,_}, M2={match,_,_,_}) -> M2; 963re_apply_or(never_match, R2) -> R2; 964re_apply_or(R1, never_match) -> R1; 965re_apply_or(nomatch, R2) -> R2; 966re_apply_or(R1, nomatch) -> R1. 967 968%% Record definitions for the NFA, DFA and compiler. 969 970-record(nfa_state, {no,edges=[],accept=no}). 971-record(dfa_state, {no,nfa=[],trans=[],accept=no}). 972 973-record(c_state, {no,trans=[],tmin=0,smin=none,tmax=0,smax=none, 974 accept=false,spec=[]}). 975 976%% We use standard methods, Thompson's construction and subset 977%% construction, to create first an NFA and then a DFA from the 978%% regexps. A non-standard feature is that we work with sets of 979%% character ranges (crs) instead sets of characters. This is most 980%% noticeable when constructing DFAs. The major benefit is that we can 981%% handle characters from any set, not just limited ASCII or 8859, 982%% even 16/32 bit unicode. 983%% 984%% The whole range of characters is 0-maxchar, where maxchar is a BIG 985%% number. We don't make any assumptions about the size of maxchar, it 986%% is just bigger than any character. 987%% 988%% Using character ranges makes describing many regexps very simple, 989%% for example the regexp "." just becomes the range 990%% [{0-9},{11-maxchar}]. 991 992%% make_nfa(RegExpActions) -> {ok,{NFA,StartState}} | {error,E}. 993%% Build a complete nfa from a list of {RegExp,Action}. The NFA field 994%% accept has values {yes,Action}|no. The NFA is a list of states. 995 996make_nfa(REAs0) -> 997 case parse_reas(REAs0) of 998 {ok,REAs1} -> 999 {NFA,Start} = build_combined_nfa(REAs1), 1000 {ok,{NFA,Start}}; 1001 {error,E} -> {error,E} 1002 end. 1003 1004%% make_dfa(RegExpActions) -> {ok,{DFA,StartState}} | {error,E}. 1005%% make_dfa(RegExpActions, LowestState) -> {ok,{DFA,StartState}} | {error,E}. 1006%% Build a complete dfa from a list of {RegExp,Action}. The DFA field 1007%% accept has values {yes,Action}|no. If multiple Regexps can result 1008%% in same match string then RegExpActions list define priority. 1009 1010make_dfa(REAs) -> make_dfa(REAs, 0). 1011 1012make_dfa(REAs0, Low) -> 1013 case parse_reas(REAs0) of 1014 {ok,REAs1} -> 1015 {NFA,Start0} = build_combined_nfa(REAs1), 1016 {DFA0,Start1} = build_dfa(NFA, Start0), 1017 {DFA,Start} = minimise_dfa(DFA0, Start1, Low), 1018 {ok,{DFA,Start}}; 1019 {error,E} -> {error,E} 1020 end. 1021 1022parse_reas(REAs) -> parse_reas(REAs, []). 1023 1024parse_reas([{{regexp,{R,_Sc}},A}|REAs], S) -> %Already parsed 1025 parse_reas(REAs, [{R,A}|S]); 1026parse_reas([{RegExp,A}|REAs], S) -> 1027 case parse(RegExp) of 1028 {ok,{regexp,{R,_Sc}}} -> parse_reas(REAs, [{R,A}|S]); 1029 {error,E} -> {error,E} 1030 end; 1031parse_reas([], Stack) -> {ok,reverse(Stack)}. 1032 1033%% build_combined_nfa(RegExpActionList) -> {NFA,StartState}. 1034%% Build the combined NFA using Thompson's construction straight out 1035%% of the book. Build the separate NFAs in the same order as the 1036%% rules so that the accepting have ascending states have ascending 1037%% state numbers. Start numbering the states from 1 as we put the 1038%% states in a tuple with the state number as the index. 1039 1040build_combined_nfa(REAs) -> 1041 {NFA,Starts,Next} = build_nfa_list(REAs, [], [], 1), 1042 F = #nfa_state{no=Next,edges=epsilon_trans(Starts),accept=no}, 1043 {[F|NFA],Next}. 1044 1045build_nfa_list([{RE,Action}|REAs], NFA0, Starts, Next0) -> 1046 {NFA1,Next1,Start} = build_nfa(RE, Next0, Action), 1047 build_nfa_list(REAs, NFA1 ++ NFA0, [Start|Starts], Next1); 1048build_nfa_list([], NFA, Starts, Next) -> 1049 {NFA,reverse(Starts),Next}. 1050 1051epsilon_trans(Sts) -> [ {epsilon,S} || S <- Sts ]. 1052 1053%% build_nfa(RegExp, NextState, Action) -> {NFA,NextFreeState,StartState}. 1054%% When building the NFA states for a ??? we don't build the end 1055%% state, just allocate a State for it and return this state 1056%% number. This allows us to avoid building unnecessary states for 1057%% concatenation which would then have to be removed by overwriting 1058%% an existing state. 1059 1060build_nfa(RE, Next, Action) -> 1061 {NFA,N,E} = build_nfa(RE, Next+1, Next, []), 1062 {[#nfa_state{no=E,accept={yes,Action}}|NFA],N,Next}. 1063 1064%% build_nfa(RegExp, NextState, StartState, NFA) -> {NFA,NextState,EndState}. 1065%% The NFA is a list of nfa_state is no predefined order. The state 1066%% number of the returned EndState is already allocated! 1067 1068build_nfa({'or',RE1,RE2}, N0, S, NFA0) -> 1069 {NFA1,N1,E1} = build_nfa(RE1, N0+1, N0, NFA0), 1070 {NFA2,N2,E2} = build_nfa(RE2, N1+1, N1, NFA1), 1071 E = N2, 1072 {[#nfa_state{no=S,edges=[{epsilon,N0},{epsilon,N1}]}, 1073 #nfa_state{no=E1,edges=[{epsilon,E}]}, 1074 #nfa_state{no=E2,edges=[{epsilon,E}]}|NFA2], 1075 N2+1,E}; 1076build_nfa({literal,[]}, N, S, NFA) -> 1077 {NFA,N,S}; 1078build_nfa({literal,[C|Cs]}, N0, S, NFA0) -> 1079 {NFA1,N1,E1} = build_nfa(C, N0, S, NFA0), 1080 build_nfa({literal,Cs}, N1, E1, NFA1); 1081build_nfa({concat,RE1,RE2}, N0, S, NFA0) -> 1082 {NFA1,N1,E1} = build_nfa(RE1, N0, S, NFA0), 1083 {NFA2,N2,E2} = build_nfa(RE2, N1, E1, NFA1), 1084 {NFA2,N2,E2}; 1085build_nfa({kclosure,RE}, N0, S, NFA0) -> 1086 {NFA1,N1,E1} = build_nfa(RE, N0+1, N0, NFA0), 1087 E = N1, 1088 {[#nfa_state{no=S,edges=[{epsilon,N0},{epsilon,E}]}, 1089 #nfa_state{no=E1,edges=[{epsilon,N0},{epsilon,E}]}|NFA1], 1090 N1+1,E}; 1091build_nfa({pclosure,RE}, N0, S, NFA0) -> 1092 {NFA1,N1,E1} = build_nfa(RE, N0+1, N0, NFA0), 1093 E = N1, 1094 {[#nfa_state{no=S,edges=[{epsilon,N0}]}, 1095 #nfa_state{no=E1,edges=[{epsilon,N0},{epsilon,E}]}|NFA1], 1096 N1+1,E}; 1097build_nfa({optional,RE}, N0, S, NFA0) -> 1098 {NFA1,N1,E1} = build_nfa(RE, N0+1, N0, NFA0), 1099 E = N1, 1100 {[#nfa_state{no=S,edges=[{epsilon,N0},{epsilon,E}]}, 1101 #nfa_state{no=E1,edges=[{epsilon,E}]}|NFA1], 1102 N1+1,E}; 1103build_nfa({iclosure,RE,I1,I2}, N, S, NFA) -> 1104 Exp = expand_iclosure(RE, I1, I2), 1105 build_nfa(Exp, N, S, NFA); 1106build_nfa({char_class,Cc}, N, S, NFA) -> 1107 {[#nfa_state{no=S,edges=[{nfa_char_class(Cc),N}]}|NFA],N+1,N}; 1108build_nfa({comp_class,Cc}, N, S, NFA) -> 1109 {[#nfa_state{no=S,edges=[{nfa_comp_class(Cc),N}]}|NFA],N+1,N}; 1110build_nfa(epsilon, N, S, NFA) -> 1111 {NFA,N,S}; 1112build_nfa({group,RE}, N, S, NFA) -> %%% FIXME %%%%%%% 1113 build_nfa(RE, N, S, NFA); 1114build_nfa({subexpr,_N,RE}, N, S, NFA) -> %%% FIXME %%%%%%% 1115 build_nfa(RE, N, S, NFA); 1116build_nfa(bos, N, S, NFA) -> 1117 {[#nfa_state{no=S,edges=[{[bos],N}]}|NFA],N+1,N}; 1118build_nfa(eos, N, S, NFA) -> 1119 {[#nfa_state{no=S,edges=[{[eos],N}]}|NFA],N+1,N}; 1120%%{[#nfa_state{no=S,edges=[{[eos],N}]}|NFA],N+1,N}; 1121build_nfa(C, N, S, NFA) when is_integer(C) -> 1122 {[#nfa_state{no=S,edges=[{[{C,C}],N}]}|NFA],N+1,N}. 1123 1124nfa_char_class(Cc) -> 1125 Crs = lists:foldl(fun({C1,C2}, Set) -> add_element({C1,C2}, Set); 1126 (C, Set) -> add_element({C,C}, Set) end, [], Cc), 1127 %% ?dbg("cc: ~p\n", [Crs]), 1128 pack_crs(Crs). 1129 1130pack_crs([{C1,C2}=Cr,{C3,C4}|Crs]) when C1 =< C3, C2 >= C4 -> 1131 %% C1 C2 1132 %% C3 C4 1133 pack_crs([Cr|Crs]); 1134pack_crs([{C1,C2},{C3,C4}|Crs]) when C2 >= C3, C2 < C4 -> 1135 %% C1 C2 1136 %% C3 C4 1137 pack_crs([{C1,C4}|Crs]); 1138pack_crs([{C1,C2},{C3,C4}|Crs]) when C2 + 1 == C3 -> 1139 %% C1 C2 1140 %% C3 C4 1141 pack_crs([{C1,C4}|Crs]); 1142pack_crs([Cr|Crs]) -> [Cr|pack_crs(Crs)]; 1143pack_crs([]) -> []. 1144 1145nfa_comp_class(Cc) -> 1146 Crs = nfa_char_class(Cc), 1147 %% ?dbg("comp: ~p\n", [Crs]), 1148 comp_crs(Crs, 0). 1149 1150comp_crs([{C1,C2}|Crs], Last) -> 1151 [{Last,C1-1}|comp_crs(Crs, C2+1)]; 1152comp_crs([], Last) -> [{Last,maxchar}]. 1153 1154%% build_dfa(NFA, NfaStartState) -> {DFA,DfaStartState}. 1155%% Build a DFA from an NFA using "subset construction". The major 1156%% difference from the book is that we keep the marked and unmarked 1157%% DFA states in separate lists. New DFA states are added to the 1158%% unmarked list and states are marked by moving them to the marked 1159%% list. We assume that the NFA accepting state numbers are in 1160%% ascending order for the rules and use ordsets to keep this order. 1161 1162build_dfa(NFA0, Start) -> 1163 %% We want NFA as sorted tuple for fast access, assume lowest state 1. 1164 NFA1 = list_to_tuple(keysort(#nfa_state.no, NFA0)), 1165 D = #dfa_state{no=0,nfa=eclosure([Start], NFA1),accept=no}, 1166 {build_dfa([D], 1, [], NFA1),0}. 1167 1168%% build_dfa([UnMarked], NextState, [Marked], NFA) -> DFA. 1169%% Traverse the unmarked states. Temporarily add the current unmarked 1170%% state to the marked list before calculating translation, this is 1171%% to avoid adding too many duplicate states. Add it properly to the 1172%% marked list afterwards with correct translations. 1173 1174build_dfa([U|Us0], N0, Ms, NFA) -> 1175 {Ts,Us1,N1} = build_dfa(U#dfa_state.nfa, Us0, N0, [], [U|Ms], NFA), 1176 M = U#dfa_state{trans=Ts,accept=accept(U#dfa_state.nfa, NFA)}, 1177 build_dfa(Us1, N1, [M|Ms], NFA); 1178build_dfa([], _N, Ms, _NFA) -> Ms. 1179 1180%% build_dfa([NfaState], [Unmarked], NextState, [Transition], [Marked], NFA) -> 1181%% {Transitions,UnmarkedStates,NextState}. 1182%% Foreach NFA state set calculate the legal translations. N.B. must 1183%% search *BOTH* the unmarked and marked lists to check if DFA state 1184%% already exists. As the range of characters is potentially VERY 1185%% large we cannot explicitly test all characters. Instead we first 1186%% calculate the set of all disjoint character ranges which are 1187%% possible candidates to the set of NFA states. 1188 1189build_dfa(Set, Us, N, Ts, Ms, NFA) -> 1190 %% List of all transition sets. 1191 Crs0 = [Cr || S <- Set, 1192 {Crs,_St} <- (element(S, NFA))#nfa_state.edges, 1193 is_list(Crs), 1194 Cr <- Crs ], 1195 Crs1 = lists:usort(Crs0), %Must remove duplicates! 1196 %% Build list of disjoint test ranges. 1197 Test = disjoint_crs(Crs1), 1198 %% ?dbg("bd: ~p\n ~p\n ~p\n ~p\n", [Set,Crs0,Crs1,Test]), 1199 build_dfa(Test, Set, Us, N, Ts, Ms, NFA). 1200 1201%% disjoint_crs([CharRange]) -> [CharRange]. 1202%% Take a sorted list of char ranges and make a sorted list of 1203%% disjoint char ranges. No new char range extends past an existing 1204%% char range. 1205 1206disjoint_crs([{_C1,C2}=Cr1,{C3,_C4}=Cr2|Crs]) when C2 < C3 -> 1207 %% C1 C2 1208 %% C3 C4 1209 [Cr1|disjoint_crs([Cr2|Crs])]; 1210disjoint_crs([{C1,C2},{C3,C4}|Crs]) when C1 == C3 -> 1211 %% C1 C2 1212 %% C3 C4 1213 [{C1,C2}|disjoint_crs(add_element({C2+1,C4}, Crs))]; 1214disjoint_crs([{C1,C2},{C3,C4}|Crs]) when C1 < C3, C2 >= C3, C2 < C4 -> 1215 %% C1 C2 1216 %% C3 C4 1217 [{C1,C3-1}|disjoint_crs(union([{C3,C2},{C2+1,C4}], Crs))]; 1218disjoint_crs([{C1,C2},{C3,C4}|Crs]) when C1 < C3, C2 == C4 -> 1219 %% C1 C2 1220 %% C3 C4 1221 [{C1,C3-1}|disjoint_crs(add_element({C3,C4}, Crs))]; 1222disjoint_crs([{C1,C2},{C3,C4}|Crs]) when C1 < C3, C2 > C4 -> 1223 %% C1 C2 1224 %% C3 C4 1225 [{C1,C3-1}|disjoint_crs(union([{C3,C4},{C4+1,C2}], Crs))]; 1226disjoint_crs([Cr|Crs]) -> [Cr|disjoint_crs(Crs)]; 1227disjoint_crs([]) -> []. 1228 1229build_dfa([Cr|Crs], Set, Us, N, Ts, Ms, NFA) -> 1230 case eclosure(move(Set, Cr, NFA), NFA) of 1231 S when S /= [] -> 1232 case keysearch(S, #dfa_state.nfa, Us) of 1233 {value,#dfa_state{no=T}} -> 1234 build_dfa(Crs, Set, Us, N, [{Cr,T}|Ts], Ms, NFA); 1235 false -> 1236 case keysearch(S, #dfa_state.nfa, Ms) of 1237 {value,#dfa_state{no=T}} -> 1238 build_dfa(Crs, Set, Us, N, [{Cr,T}|Ts], Ms, NFA); 1239 false -> 1240 U = #dfa_state{no=N,nfa=S}, 1241 build_dfa(Crs, Set, [U|Us], N+1, [{Cr,N}|Ts], Ms, NFA) 1242 end 1243 end; 1244 [] -> 1245 build_dfa(Crs, Set, Us, N, Ts, Ms, NFA) 1246 end; 1247build_dfa([], _Set, Us, N, Ts, _Ms, _NFA) -> 1248 {Ts,Us,N}. 1249 1250%% eclosure([State], NFA) -> [State]. 1251%% move([State], Char, NFA) -> [State]. 1252%% These are straight out of the book. As eclosure uses ordsets then 1253%% the generated state sets are in ascending order. 1254 1255eclosure(Sts, NFA) -> eclosure(Sts, NFA, []). 1256 1257eclosure([St|Sts], NFA, Ec) -> 1258 #nfa_state{edges=Es} = element(St, NFA), 1259 eclosure([ N || {epsilon,N} <- Es, 1260 not is_element(N, Ec) ] ++ Sts, 1261 NFA, add_element(St, Ec)); 1262eclosure([], _NFA, Ec) -> Ec. 1263 1264move(Sts, Cr, NFA) -> 1265 [ St || N <- Sts, 1266 {Crs,St} <- (element(N, NFA))#nfa_state.edges, 1267 is_list(Crs), 1268%% begin 1269%% ?dbg("move1: ~p\n", [{Sts,Cr,Crs,in_crs(Cr,Crs)}]), 1270%% true 1271%% end, 1272 in_crs(Cr, Crs) ]. 1273 1274in_crs({C1,C2}, [{C3,C4}|_Crs]) when C1 >= C3, C2 =< C4 -> true; 1275in_crs(Cr, [Cr|_Crs]) -> true; %Catch bos and eos. 1276in_crs(Cr, [_|Crs]) -> in_crs(Cr, Crs); 1277in_crs(_Cr, []) -> false. 1278 1279%% accept([State], NFA) -> true | false. 1280%% Scan down the state list until we find an accepting state. 1281 1282accept([St|Sts], NFA) -> 1283 case element(St, NFA) of 1284 #nfa_state{accept={yes,A}} -> {yes,A}; 1285 #nfa_state{accept=no} -> accept(Sts, NFA) 1286 end; 1287accept([], _NFA) -> no. 1288 1289%% minimise_dfa(DFA, StartState, FirstState) -> {DFA,StartState}. 1290%% Minimise the DFA by removing equivalent states. We consider a 1291%% state if both the transitions and the their accept state is the 1292%% same. First repeatedly run throught the DFA state list removing 1293%% equivalent states and updating remaining transitions with 1294%% remaining equivalent state numbers. When no more reductions are 1295%% possible then pack the remaining state numbers to get consecutive 1296%% states. 1297 1298minimise_dfa(DFA0, Start, N) -> 1299 case min_dfa(DFA0) of 1300 {DFA1,[]} -> %No reduction! 1301 {DFA2,Rs} = pack_dfa(DFA1, N), 1302 {min_update(DFA2, Rs),min_new_state(Start, Rs)}; 1303 {DFA1,Rs} -> 1304 minimise_dfa(min_update(DFA1, Rs), min_new_state(Start, Rs), N) 1305 end. 1306 1307min_dfa(DFA) -> min_dfa(DFA, [], []). 1308 1309min_dfa([D|DFA0], Rs0, MDFA) -> 1310 {DFA1,Rs1} = min_delete(DFA0, D#dfa_state.trans, D#dfa_state.accept, 1311 D#dfa_state.no, Rs0, []), 1312 min_dfa(DFA1, Rs1, [D|MDFA]); 1313min_dfa([], Rs, MDFA) -> {MDFA,Rs}. 1314 1315min_delete([#dfa_state{no=N,trans=T,accept=A}|DFA], T, A, NewN, Rs, MDFA) -> 1316 min_delete(DFA, T, A, NewN, [{N,NewN}|Rs], MDFA); 1317min_delete([D|DFA], T, A, NewN, Rs, MDFA) -> 1318 min_delete(DFA, T, A, NewN, Rs, [D|MDFA]); 1319min_delete([], _T, _A, _NewN, Rs, MDFA) -> {MDFA,Rs}. 1320 1321min_update(DFA, Rs) -> 1322 [ D#dfa_state{trans=min_update_trans(D#dfa_state.trans, Rs)} || D <- DFA ]. 1323 1324min_update_trans(Tr, Rs) -> 1325 [ {C,min_new_state(S, Rs)} || {C,S} <- Tr ]. 1326 1327min_new_state(Old, [{Old,New}|_Reds]) -> New; 1328min_new_state(Old, [_R|Reds]) -> min_new_state(Old, Reds); 1329min_new_state(Old, []) -> Old. 1330 1331pack_dfa(DFA, N) -> pack_dfa(DFA, N, [], []). 1332 1333pack_dfa([D|DFA], NewN, Rs, PDFA) -> 1334 pack_dfa(DFA, NewN+1, [{D#dfa_state.no,NewN}|Rs], 1335 [D#dfa_state{no=NewN}|PDFA]); 1336pack_dfa([], _NewN, Rs, PDFA) -> {PDFA,Rs}. 1337 1338%% comp_apply(String, StartPos, DFAReg) -> {match,RestPos,Rest} | nomatch. 1339%% Apply the DFA of a regular expression to a string. If 1340%% there is a match return the position of the remaining string and 1341%% the string if else return 'nomatch'. 1342%% 1343%% StartPos should be the real start position as it is used to decide 1344%% if we are at the beginning of the string. 1345 1346comp_apply(Cs, P, {DFA,Start,_Fail}) -> 1347 comp_apply(element(Start, DFA), Cs, P, DFA, nomatch). 1348 1349comp_apply(#c_state{spec=[]}=St, Cs, P, DFA, Accept) -> 1350 comp_apply_tr(St, Cs, P, DFA, Accept); 1351comp_apply(#c_state{spec=Sp}=St, Cs, P, DFA, Accept) -> 1352 comp_apply_sp(St, Cs, P, DFA, Accept, Sp). 1353 1354comp_apply_tr(#c_state{trans=none,accept=A}, Cs, P, _DFA, Accept) -> 1355 %% End state. 1356 accept_value(A, Cs, P, Accept); 1357comp_apply_tr(#c_state{trans=Tr,tmin=Tmin,smin=Smin,tmax=Tmax,smax=Smax,accept=A}, 1358 [C|Cs]=Cs0, P, DFA, Accept) -> 1359 %% Get the next state number to go to. 1360 NextSt = if C =< Tmin -> Smin; %Below transition table 1361 C >= Tmax -> Smax; %Above transition table 1362 true -> %Otherwise use table 1363 element(C - Tmin, Tr) 1364 end, 1365 comp_apply(element(NextSt, DFA), Cs, P+1, DFA, 1366 accept_value(A, Cs0, P, Accept)); 1367comp_apply_tr(#c_state{trans=_Tr,accept=A}, [], P, _DFA, Accept) -> 1368 accept_value(A, [], P, Accept). 1369 1370comp_apply_sp(_St, Cs, 1, DFA, Accept, [{bos,S}|_]) -> 1371 comp_apply(element(S, DFA), Cs, 1, DFA, Accept); 1372comp_apply_sp(_St, [$\n], P, DFA, Accept, [{eos,S}|_]) -> 1373 comp_apply(element(S, DFA), [], P, DFA, Accept); 1374comp_apply_sp(_St, [], P, DFA, Accept, [{eos,S}|_]) -> 1375 comp_apply(element(S, DFA), [], P, DFA, Accept); 1376comp_apply_sp(St, Cs, P, DFA, Accept, [_|Sp]) -> 1377 comp_apply_sp(St, Cs, P, DFA, Accept, Sp); 1378comp_apply_sp(St, Cs, P, DFA, Accept, []) -> 1379 comp_apply_tr(St, Cs, P, DFA, Accept). 1380 1381accept_value(true, Cs, P, _Accept) -> {match,P,Cs}; 1382accept_value(false, _Cs, _P, Accept) -> Accept. 1383 1384%% compile(RegExp) -> {ok,RE} | {error,E}. 1385%% Parse the regexp described in the string RegExp. 1386 1387compile(RegExp) -> 1388 case make_dfa([{RegExp,yes}], 2) of 1389 {ok,{DFA0,Start}} -> 1390 Fail = 1, 1391 DFA1 = [#dfa_state{no=Fail,accept=no,trans=[]}|DFA0], 1392 DFA = tuplelise_dfa(DFA1, 1), 1393 {ok,{comp_regexp,{DFA,Start,Fail}}}; 1394 {error,E} -> {error,E} 1395 end. 1396 1397%% tuplelise_dfa(DFAstates, NoAcceptState) -> {{CompState},FirstState}. 1398 1399tuplelise_dfa(DFA0, NoAccept) -> 1400 DFA1 = map(fun (#dfa_state{no=N,trans=Ts,accept=A}) -> 1401 {Tr,Tmin,Smin,Tmax,Smax,Sp} = build_trans(Ts, NoAccept), 1402 #c_state{no=N,trans=Tr,tmin=Tmin,smin=Smin, 1403 tmax=Tmax,smax=Smax, 1404 accept=fix_accept(A),spec=Sp} 1405 end, DFA0), 1406 list_to_tuple(keysort(#dfa_state.no, DFA1)). 1407 1408build_trans(Ts0, NoAccept) -> 1409 %% Split transitions into character ranges and specials. 1410 {Ts1,Sp1} = foldl(fun ({{_,_},_}=T, {Ts,Sp}) -> {[T|Ts],Sp}; 1411 ({_,_}=T, {Ts,Sp}) -> {Ts,[T|Sp]} 1412 end, {[],[]}, Ts0), 1413 if Ts1 == [] -> 1414 {none,none,none,none,none,Sp1}; 1415 true -> 1416 %% Have transitions, convert to tuple. 1417 Ts2 = keysort(1, Ts1), 1418 {Tmin,Smin,Ts3} = min_trans(Ts2, NoAccept), 1419 %% ?dbg("exptr: ~p\n", [{Ts3,Tmin}]), 1420 {Trans,Tmax,Smax} = expand_trans(Ts3, Tmin, NoAccept), 1421 {list_to_tuple(Trans),Tmin,Smin,Tmax,Smax,Sp1} 1422 end. 1423 1424min_trans([{{0,C2},S}|Crs], _Def) -> {C2,S,Crs}; 1425min_trans([{{C1,_C2},_S}|_]=Crs, Def) -> {C1-1,Def,Crs}. 1426 1427expand_trans([{{C1,maxchar},S}], Last, Def) -> 1428 Trs = duplicate(C1-(Last+1), Def), 1429 {Trs,C1,S}; 1430expand_trans([{{C1,C2},S}], Last, Def) -> 1431 Trs = duplicate(C1-(Last+1), Def) ++ duplicate(C2-C1+1, S), 1432 {Trs,C2+1,Def}; 1433expand_trans([{{C1,C2},S}|Crs], Last, Def) -> 1434 {Trs0,Tmax,Smax} = expand_trans(Crs, C2, Def), 1435 Trs1 = duplicate(C1-(Last+1), Def) ++ duplicate(C2-C1+1, S) ++ Trs0, 1436 {Trs1,Tmax,Smax}. 1437 1438fix_accept({yes,_}) -> true; 1439fix_accept(no) -> false. 1440 1441