1import functools 2 3from guppy.etc.RE_Rect import chooserects 4from guppy.etc.IterPermute import iterpermute 5 6 7class InfiniteError(Exception): 8 pass 9 10 11class WordsMemo: 12 def __init__(self, re, ch): 13 self.re = re 14 self.ch = ch 15 self.xs = {} 16 self.N = 0 17 18 def get_words_of_length(self, N): 19 # Return a list of words of length up to N 20 if N not in self.xs: 21 self.xs[N] = self.re.get_words_of_length_memoized(N, self) 22 return self.xs[N] 23 24 def get_words_of_length_upto(self, N): 25 # Return all words of length up to N, in the form 26 # [(0, <list of words of length 0>), 27 # (1, <list of words of length 0>), 28 # ...] 29 xsu = [] 30 for i in range(N+1): 31 xs = self.get_words_of_length(i) 32 if xs: 33 xsu.append((i, xs)) 34 return xsu 35 36 37REBASE = tuple 38 39 40class RE(REBASE): 41 # Regular expression nodes 42 # The operators are choosen to be compatible with Pythonic standards: 43 # o sets : using | for union 44 # o strings, sequences : using + for concatenation. 45 # 46 # This differs from mathematical presentations of regular 47 # expressions where + is the union, but it seemed more important 48 # to not confuse the Python usage. 49 50 # There are also operators for closure x*, x+ that can not be 51 # represented directly in Python expressions and these were choosen 52 # to use a function call syntax. 53 # The following table summarizes the operators. 54 55 # RE node expr re lib mathematical name 56 57 # x + y x y x y Concatenation 58 # x | y x | y x + y Union 59 # x('*') x* x* Kleene closure 60 # x('+') x+ x+ Positive closure 61 # x('?') x? 62 63 _re_special = r'.^$*+?{}\[]|()' 64 65 def __add__(a, b): 66 if isinstance(b, RE): 67 return concat(a, b) 68 else: 69 return Concatenation(a, Single(b)) 70 71 def __call__(a, *args, **kwds): 72 if not kwds: 73 if args == ('*',): 74 return KleeneClosure(a) 75 elif args == ('+',): 76 return PositiveClosure(a) 77 elif args == ('?',): 78 return EpsilonOrOne(a) 79 raise ValueError( 80 "Argument to regular expression must be '*' or '+' or '?'") 81 82 def __hash__(self): 83 return hash((self._name, tuple(self))) 84 85 def __eq__(a, b): 86 return (a._name == b._name and 87 tuple(a) == tuple(b)) 88 89 def __lt__(a, b): 90 if a._name == b._name: 91 return tuple(a) < tuple(b) 92 else: 93 return a._name < b._name 94 95 def __or__(a, b): 96 return Union(a, b) 97 98 def get_num_closures(self): 99 ns = 0 100 for ch in self: 101 ns += ch.get_num_closures() 102 return ns 103 104 def get_num_syms(self): 105 ns = 0 106 for ch in self: 107 ns += ch.get_num_syms() 108 return ns 109 110 def get_sum_sym_lengths(self): 111 ns = 0 112 for ch in self: 113 ns += ch.get_sum_sym_lengths() 114 return ns 115 116 def get_words_memo(self): 117 ch = [x.get_words_memo() for x in self] 118 return WordsMemo(self, ch) 119 120 def get_words_of_length(self, N): 121 xs = self.get_words_memo() 122 return xs.get_words_of_length(N) 123 124 def mapchildren(self, f): 125 return self.__class__(*[f(x) for x in self]) 126 127 def regexpform(self): 128 return self.mappedrepr(regexpname) 129 130 def reversed(self): 131 return self.mapchildren(lambda x: x.reversed()) 132 133 def rempretup(self): 134 def f(x): 135 if isinstance(x, Seq): 136 if x is not Epsilon and isinstance(x[0], tuple): 137 ws = x[1:] 138 return Seq(*ws) 139 else: 140 return x 141 return x.mapchildren(f) 142 143 return f(self) 144 145 def seqatoms(self): 146 sa = [] 147 self.apseqatoms(sa.append) 148 return sa 149 150 def sequni(self): 151 d = {} 152 us = [] 153 154 def ap(x): 155 if x not in d: 156 d[x] = 1 157 us.append(x) 158 self.apseq(ap) 159 return Union(*us) 160 161 def shform(self, conc=' '): 162 r = self.mappedrepr(regexpname) 163 if conc != ' ': 164 r = conc.join(r.split(' ')) 165 return r 166 167 def simplified(self, *a, **k): 168 return self 169 170 def simulform(self): 171 def f(x): 172 if x == '': 173 return '()' 174 return str(x) 175 return self.mappedrepr(f) 176 177 178def regexpname(s): 179 if s == '': 180 return '()' 181 special = RE._re_special 182 ren = [] 183 for c in str(s): 184 if c in special+"', ": 185 #c = r'\%s'%c 186 c = '' 187 ren.append(c) 188 return ''.join(ren) 189 190 191class Seq(RE): 192 _priority = 0 193 _name = 'Seq' 194 195 def __new__(clas, *symbols): 196 if not symbols: 197 return Epsilon 198 return REBASE.__new__(clas, symbols) 199 200 def __repr__(self): 201 return '%s(%s)' % (self.__class__.__name__, ', '.join(['%r' % (x,) for x in self])) 202 203 def apseq(self, ap): 204 ap(self) 205 206 def apseqatoms(self, ap): 207 for x in self: 208 ap(Single(x)) 209 210 def get_num_closures(self): 211 return 0 212 213 def get_num_syms(self): 214 return len(self) 215 216 def get_sum_sym_lengths(self): 217 s = 0 218 for x in self: 219 s += len(str(x)) 220 return s 221 222 def get_words_memo(self): 223 return WordsMemo(self, ()) 224 225 def get_words_of_length_memoized(self, N, memo): 226 if N == len(self): 227 return [self] 228 else: 229 return [] 230 231 def limited(self, N): 232 return self 233 234 def mappedrepr(self, f): 235 if not self: 236 return f('') 237 return ' '.join(['%s' % (f(x),) for x in self]) 238 239 def reversed(self): 240 r = list(self) 241 r.reverse() 242 return self.__class__(*r) 243 244 def unionsplitted(self): 245 return [self] 246 247 248def Single(symbol): 249 return REBASE.__new__(Seq, (symbol,)) 250 251 252Epsilon = REBASE.__new__(Seq, ()) 253 254 255def concat(*args): 256 args = [x for x in args if x is not Epsilon] 257 if len(args) < 2: 258 if not args: 259 return Epsilon 260 return args[0] 261 return REBASE.__new__(Concatenation, args) 262 263 264class Concatenation(RE): 265 _priority = 2 266 _name = 'Concat' 267 268 def __new__(clas, *args): 269 if len(args) < 2: 270 if not args: 271 return Epsilon 272 return args[0] 273 return REBASE.__new__(clas, args) 274 275 def __repr__(self): 276 rs = [] 277 for ch in self: 278 r = '%r' % (ch,) 279 if ch._priority > self._priority: 280 r = '(%s)' % (r,) 281 rs.append(r) 282 return ' + '.join(rs) 283 284 def apseq(self, ap): 285 uns = [x.sequni() for x in self] 286 ixs = [0]*len(uns) 287 while 1: 288 xs = [] 289 for (i, us) in enumerate(uns): 290 for x in us[ixs[i]]: 291 if x is not Epsilon: 292 xs.append(x) 293 ap(Seq(*xs)) 294 j = 0 295 for j, ix in enumerate(ixs): 296 ix += 1 297 if ix >= len(uns[j]): 298 ix = 0 299 ixs[j] = ix 300 if ix != 0: 301 break 302 else: 303 break 304 305 def apseqatoms(self, ap): 306 for x in self: 307 x.apseqatoms(ap) 308 309 def get_words_of_length_memoized(self, N, memo): 310 chxs = [] 311 for ch in memo.ch: 312 chxs.append(ch.get_words_of_length_upto(N)) 313 xs = [] 314 seen = {} 315 316 def ads(xx, i, n): 317 if i == len(chxs): 318 if n == N: 319 for toconc in iterpermute(*xx): 320 conc = simple_Concatenation(toconc) 321 if conc not in seen: 322 xs.append(conc) 323 seen[conc] = 1 324 else: 325 for m, x in chxs[i]: 326 if n + m <= N: 327 ads(xx + [x], i + 1, n + m) 328 329 ads([], 0, 0) 330 return xs 331 332 def limited(self, N): 333 return Concatenation(*[x.limited(N) for x in self]) 334 335 def mappedrepr(self, f): 336 rs = [] 337 for ch in self: 338 r = ch.mappedrepr(f) 339 if ch._priority > self._priority: 340 r = '(%s)' % (r,) 341 rs.append(r) 342 return ' '.join(rs) 343 344 def reversed(self): 345 r = [x.reversed() for x in self] 346 r.reverse() 347 return self.__class__(*r) 348 349 def simplified(self, *a, **k): 350 conc = [x.simplified(*a, **k) for x in self] 351 sa = [] 352 for c in conc: 353 for a in c.seqatoms(): 354 sa.append(a) 355 return simple_Concatenation(sa) 356 357 def unionsplitted(self): 358 runs = [] 359 uns = [] 360 for (i, x) in enumerate(self): 361 us = x.unionsplitted() 362 if len(us) > 1: 363 uns.append((i, us)) 364 if not uns: 365 return [self] 366 ixs = [0]*len(uns) 367 ch = list(self) 368 while 1: 369 xs = [] 370 i0 = 0 371 for j, (i, us) in enumerate(uns): 372 xs.extend(ch[i0:i]) 373 ix = ixs[j] 374 xs.append(us[ix]) 375 i0 = i + 1 376 xs.extend(ch[i0:]) 377 runs.append(concat(*xs)) 378 379 j = 0 380 for j, ix in enumerate(ixs): 381 ix += 1 382 if ix >= len(uns[j][1]): 383 ix = 0 384 ixs[j] = ix 385 if ix != 0: 386 break 387 else: 388 return runs 389 390 391class SimplifiedConcatenation(Concatenation): 392 def simplified(self, *a, **k): 393 return self 394 395 396def conclosure(conc): 397 # Simplification noted Mar 5 2005 398 # Simplify ... b b* ... or ... b* b ... to ... b+ ... 399 # conc is a sequence of regular expressions 400 401 seen = {} 402 nconc = [] 403 w0 = None 404 for w in conc: 405 if w0 is not None: 406 if (w._name == '*' and # Not isinstance(KleeneClosure), would catch PositiveClosure 407 w[0] == w0): 408 w = PositiveClosure(w0) 409 elif (w0._name == '*' and 410 w0[0] == w): 411 w = PositiveClosure(w) 412 else: 413 if w0 is not None: 414 nconc.append(w0) 415 w0 = w 416 if w0 is not None: 417 nconc.append(w0) 418 return nconc 419 420 421def simple_Concatenation(conc): 422 if len(conc) > 1: 423 conc0 = conc 424 conc = conclosure(conc) 425 nconc = [] 426 i = 0 427 j = 0 428 while i < len(conc): 429 e = conc[i] 430 if not isinstance(e, Seq): 431 i += 1 432 nconc.append(e) 433 continue 434 j = i 435 while j < len(conc): 436 if not isinstance(conc[j], Seq): 437 break 438 j += 1 439 if j == i + 1: 440 nconc.append(e) 441 else: 442 syms = [] 443 for k in range(i, j): 444 e = conc[k] 445 syms.extend(list(e)) 446 nconc.append(Seq(*syms)) 447 i = j 448 if len(nconc) > 1: 449 return Concatenation(*nconc) 450 elif nconc: 451 return nconc[0] 452 else: 453 return Epsilon 454 455 456gauges = [ 457 lambda x:x.get_num_syms(), 458 lambda x:x.get_num_closures(), 459 lambda x:x.get_sum_sym_lengths() 460] 461 462 463def simpleunion(lines): 464 choosen = chooserects(lines, gauges) 465 have_epsilon = 0 466 while 1: 467 if len(choosen) == 1 and (choosen[0].width == 0 or len(choosen[0].lines) == 1): 468 us = [] 469 for line in choosen[0].lines: 470 if line: 471 us.append(line) 472 else: 473 have_epsilon = 1 474 break 475 us = [] 476 for r in choosen: 477 conc = r.get_common_part() 478 olines = r.get_uncommons() 479 u = simpleunion(olines) 480 if u is not Epsilon: 481 if r.dir == -1: 482 conc = [u]+conc 483 else: 484 conc = conc + [u] 485 if conc: 486 us.append(conc) 487 else: 488 have_epsilon = 1 489 assert not isinstance(us[-1], str) 490 491 choosen = chooserects(us, gauges) 492 493 if len(us) > 1: 494 nus = [simple_Concatenation(line) for line in us] 495 u = SimplifiedUnion(*nus) 496 elif us: 497 u = simple_Concatenation(us[0]) 498 else: 499 u = None 500 if have_epsilon: 501 if u is not None: 502 u = simple_EpsilonOrOne(u) 503 else: 504 u = Epsilon 505 506 return u 507 508 509class Union(RE): 510 _priority = 3 511 _name = 'Union' 512 513 def __new__(clas, *args): 514 return REBASE.__new__(clas, args) 515 516 def __repr__(self): 517 rs = [] 518 for ch in self: 519 r = '%r' % (ch,) 520 if ch._priority > self._priority: 521 r = '(%s)' % r 522 rs.append(r) 523 return ' | '.join(rs) 524 525 def apseq(self, ap): 526 for c in self: 527 c.apseq(ap) 528 529 def apseqatoms(self, ap): 530 for x in self: 531 x.apseqatoms(ap) 532 533 def get_words_of_length_memoized(self, N, memo): 534 xs = [] 535 seen = {} 536 for ch in memo.ch: 537 for x in ch.get_words_of_length(N): 538 if x not in seen: 539 seen[x] = 1 540 xs.append(x) 541 return xs 542 543 def limited(self, N): 544 uni = [x.limited(N) for x in self] 545 for i, x in enumerate(uni): 546 if x is not self[i]: 547 return self.__class__(*uni) 548 return self 549 550 def mappedrepr(self, f): 551 rs = [] 552 for ch in self: 553 r = '%s' % (ch.mappedrepr(f),) 554 if ch._priority > self._priority: 555 r = '(%s)' % r 556 rs.append(r) 557 return ' | '.join(rs) 558 559 def simplified(self, args=None, *a, **k): 560 if args is None: 561 args = [x.simplified() for x in self.unionsplitted()] 562 #args = [x for x in self.unionsplitted()] 563 564 # Create a simplfied union 565 # Assuming args are simplified, non-unions 566 567 ch = [a.seqatoms() for a in args] 568 return simpleunion(ch) 569 570 def unionsplitted(self): 571 us = [] 572 for x in self: 573 us.extend(list(x.unionsplitted())) 574 return us 575 576 577class SimplifiedUnion(Union): 578 def simplified(self, *a, **k): 579 return self 580 581 582class Called(RE): 583 _priority = 1 584 585 def __new__(clas, arg): 586 return REBASE.__new__(clas, (arg,)) 587 588 def __repr__(self): 589 ch = self[0] 590 r = '%r' % (ch,) 591 if ch._priority > self._priority: 592 r = '(%s)' % r 593 return "%s(%r)" % (r, self._name) 594 595 def apseqatoms(self, ap): 596 ap(self) 597 598 def get_num_closures(self): 599 return 1 + self[0].get_num_closures() 600 601 def mappedrepr(self, f): 602 ch = self[0] 603 r = ch.mappedrepr(f) 604 if (ch._priority > self._priority 605 or isinstance(ch, Seq) and len(ch) > 1): 606 r = '(%s)' % r 607 return "%s%s" % (r, self._name) 608 609 def simplified(self, *a, **k): 610 return self.__class__(self[0].simplified(*a, **k)) 611 612 613class Closure(Called): 614 def get_words_of_length_memoized(self, N, memo): 615 if N == 0: 616 return [Epsilon] 617 if N == 1: 618 return memo.ch[0].get_words_of_length(1) 619 xs = [] 620 seen = {} 621 for i in range(1, N): 622 a = memo.get_words_of_length(i) 623 b = memo.get_words_of_length(N-i) 624 for ai in a: 625 for bi in b: 626 aibi = simple_Concatenation((ai, bi)) 627 if aibi not in seen: 628 xs.append(aibi) 629 seen[aibi] = 1 630 for x in memo.ch[0].get_words_of_length(N): 631 if x not in seen: 632 xs.append(x) 633 seen[x] = 1 634 return xs 635 636 def unionsplitted(self): 637 return [self] 638 639 640class KleeneClosure(Closure): 641 _name = '*' 642 643 def apseq(self, ap): 644 raise InfiniteError( 645 'apseq: Regular expression is infinite: contains a Kleene Closure') 646 647 def limited(self, N): 648 if N == 0: 649 return Epsilon 650 cl = self[0].limited(N) 651 uni = [] 652 for i in range(N+1): 653 toconc = [cl]*i 654 uni.append(Concatenation(*toconc)) 655 return Union(*uni) 656 657 def simplified(self, *a, **k): 658 return simple_KleeneClosure(self[0].simplified(*a, **k)) 659 660 661def simple_KleeneClosure(x): 662 # (b+)* -> b* 663 if x._name == '+': 664 return simple_KleeneClosure(x[0]) 665 return KleeneClosure(x) 666 667 668class PositiveClosure(Closure): 669 _name = '+' 670 671 def apseq(self, ap): 672 raise InfiniteError( 673 'apseq: Regular expression is infinite: contains a Positive Closure') 674 675 def apseqatoms(self, ap): 676 self[0].apseqatoms(ap) 677 simple_KleeneClosure(self[0]).apseqatoms(ap) 678 679 def get_words_of_length_memoized(self, N, memo): 680 if N <= 1: 681 return memo.ch[0].get_words_of_length(N) 682 return Closure.get_words_of_length_memoized(self, N, memo) 683 684 def limited(self, N): 685 a = self[0].limited(N) 686 b = KleeneClosure(self[0]).limited(N) 687 return Concatenation(a, b) 688 689 690class EpsilonOrOne(Called): 691 _name = '?' 692 693 def apseq(self, ap): 694 ap(Epsilon) 695 self[0].apseq(ap) 696 697 def get_words_of_length_memoized(self, N, memo): 698 if N == 0: 699 return [Epsilon] 700 return memo.ch[0].get_words_of_length(N) 701 702 def limited(self, N): 703 x = self[0].limited(N) 704 if x is not self[0]: 705 self = self.__class__(x) 706 return self 707 708 def simplified(self, *a, **k): 709 return simple_EpsilonOrOne(self[0].simplified(*a, **k)) 710 711 def unionsplitted(self): 712 return [Epsilon] + list(self[0].unionsplitted()) 713 714 715def simple_EpsilonOrOne(x): 716 # (a+)? -> a* 717 718 if x._name == '+': 719 return simple_KleeneClosure(x) 720 721 # (a*)? -> a* 722 if x._name == '*': 723 return x 724 725 return EpsilonOrOne(x) 726 727 728class RegularSystem: 729 730 def __init__(self, table, Start, final_states): 731 self.table = table 732 self.Start = Start 733 self.Final = '358f0eca5c34bacdfbf6a8ac0ccf84bc' 734 self.final_states = final_states 735 736 def pp(self): 737 738 def statename(state): 739 try: 740 name = self.names[state] 741 except KeyError: 742 name = str(state) 743 return name 744 745 def transname(trans): 746 name = trans.simulform() 747 if trans._priority > 1: 748 name = '(%s)' % (name,) 749 return name 750 751 self.setup_names() 752 753 X = self.X 754 755 xs = [self.Start]+self.order 756 xs.append(self.Final) 757 for Xk in xs: 758 if Xk not in X: 759 continue 760 print('%3s = ' % (statename(Xk),), end=' ') 761 Tk = X[Xk] 762 es = [] 763 for Xj in xs: 764 if Xj in Tk: 765 es.append('%s %s' % (transname(Tk[Xj]), statename(Xj))) 766 if es: 767 print(' | '.join(es)) 768 else: 769 print() 770 771 def setup_equations(self): 772 table = self.table 773 final_states = self.final_states 774 Final = self.Final 775 self.X = X = {Final: {}} 776 for Xi, transitions in list(table.items()): 777 X[Xi] = Ti = {} 778 for (symbol, Xj) in list(transitions.items()): 779 Ti.setdefault(Xj, []).append(Single(symbol)) 780 for Xj, Aij in list(Ti.items()): 781 if len(Aij) > 1: 782 Aij.sort() 783 Aij = Union(*Aij) 784 else: 785 Aij = Aij[0] 786 Ti[Xj] = Aij 787 if Xi in final_states: 788 Ti[Final] = Epsilon 789 790 def setup_order(self): 791 def dists(X, start): 792 i = 0 793 S = {start: i} 794 news = [start] 795 while news: 796 oldnews = news 797 news = [] 798 i += 1 799 for s in oldnews: 800 if s not in X: 801 continue 802 for t in X[s]: 803 if t not in S: 804 news.append(t) 805 S[t] = i 806 return S 807 808 def start_distance(x): 809 return start_dists[x] 810 811 def sumt(f): 812 memo = {} 813 814 def g(x): 815 if x in memo: 816 return memo[x] 817 s = 0.0 818 for y in X[x]: 819 s += f(y) 820 memo[x] = s 821 return s 822 return g 823 824 def cmp3(x, y): 825 # Comparison for the sorting of equation solving order 826 # First in list = solved last 827 828 if x is y: 829 return 0 830 831 # Equations with more terms are resolved later 832 c = len(X[y]) - len(X[x]) 833 if c: 834 return c 835 836 # The equations with terms more distant from start node will be resolved earlier 837 i = 0 838 while i < 10: # 4 was enough with tests so far at Feb 24 2005 839 try: 840 f = sumdists[i] 841 except IndexError: 842 f = sumt(sumdists[i-1]) 843 sumdists.append(f) 844 c = f(x) - f(y) 845 if c: 846 return c 847 i += 1 848 849 return (x > y) - (x < y) 850 851 sumdists = [start_distance] 852 X = self.X 853 Start = self.Start 854 Final = self.Final 855 start_dists = dists(X, Start) 856 order = [x for x in start_dists if x is not Start and x is not Final] 857 order.sort(key=functools.cmp_to_key(cmp3)) 858 self.order = order 859 860 def setup_names(self): 861 try: 862 self.order 863 except AttributeError: 864 self.setup_order() 865 self.names = {} 866 self.names[self.Start] = 'X0' 867 868 for i, s in enumerate(self.order): 869 self.names[s] = 'X%d' % (i+1) 870 self.names[self.Final] = 'Final' 871 872 def solve(self): 873 # Set up equation system 874 875 self.setup_equations() 876 self.setup_order() 877 878 X = self.X 879 Start = self.Start 880 Final = self.Final 881 todo = list(self.order) 882 883 # Solve equation system 884 885 while todo: 886 Xk = todo.pop() 887 Tk = X[Xk] 888 889 if Xk in Tk: 890 # Recursive equation 891 # Eliminate Akk Xk, using Adler's theorem 892 # Given: 893 # Xk = Ak0 X0 | ... Akk Xk |.. Akn Xkn 894 # we get: 895 # Xk = Akk* (Ak0 X0 | ... <no Xk> ... | Akn Xn) 896 # which we evaluate to: 897 # Xk = Bk0 X0 | ... Bkn Xn 898 # where coefficients get the new values 899 # Bki := Akk* Aki 900 901 Akk = Tk[Xk] 902 del Tk[Xk] 903 904 AkkStar = Akk('*') 905 for Xi, Aki in list(Tk.items()): 906 Bki = AkkStar + Aki 907 Tk[Xi] = Bki 908 909 # Substitute Xk in each other equation in X 910 # containing Xk, except eqv. Xk itself, which will not be used any more.. 911 912 del X[Xk] 913 914 for Xj, Tj in list(X.items()): 915 Bjk = Tj.get(Xk) 916 if Bjk is None: 917 continue 918 del Tj[Xk] 919 for Xji, Tk_Xji in list(Tk.items()): 920 Cji = (Bjk + Tk_Xji) 921 Bji = Tj.get(Xji) 922 if Bji is not None: 923 Cji = Bji | Cji 924 Tj[Xji] = Cji 925 926 # The equation system is now solved 927 # The result is in Final term of Start equation 928 929 return X[Start][Final] 930 931 932Nothing = Union() 933 934 935def SolveFSA(fsa): 936 RS = RegularSystem(fsa.table, fsa.start_state, fsa.final_states) 937 return RS.solve() 938