RE.py - OpenGrok cross reference for /dports/devel/py-guppy3/guppy3-3.1.2/guppy/etc/RE.py

import functools

from guppy.etc.RE_Rect import chooserects
from guppy.etc.IterPermute import iterpermute


class InfiniteError(Exception):
    pass


class WordsMemo:
    def __init__(self, re, ch):
        self.re = re
        self.ch = ch
        self.xs = {}
        self.N = 0

    def get_words_of_length(self, N):
        # Return a list of words of length up to N
        if N not in self.xs:
            self.xs[N] = self.re.get_words_of_length_memoized(N, self)
        return self.xs[N]

    def get_words_of_length_upto(self, N):
        # Return all words of length up to N, in the form
        # [(0, <list of words of length 0>),
        #  (1, <list of words of length 0>),
        #  ...]
        xsu = []
        for i in range(N+1):
            xs = self.get_words_of_length(i)
            if xs:
                xsu.append((i, xs))
        return xsu


REBASE = tuple


class RE(REBASE):
    # Regular expression nodes
    # The operators are choosen to be compatible with Pythonic standards:
    #   o sets               : using | for union
    #   o strings, sequences : using + for concatenation.
    #
    # This differs from mathematical presentations of regular
    # expressions where + is the union, but it seemed more important
    # to not confuse the Python usage.

    # There are also operators for closure x*, x+ that can not be
    # represented directly in Python expressions and these were choosen
    # to use a function call syntax.
    # The following table summarizes the operators.

    #   RE node expr    re lib          mathematical    name

    #   x + y           x y             x y             Concatenation
    #   x | y           x | y           x + y           Union
    #   x('*')          x*              x*              Kleene closure
    #   x('+')          x+              x+              Positive closure
    #   x('?')          x?

    _re_special = r'.^$*+?{}\[]|()'

    def __add__(a, b):
        if isinstance(b, RE):
            return concat(a, b)
        else:
            return Concatenation(a, Single(b))

    def __call__(a, *args, **kwds):
        if not kwds:
            if args == ('*',):
                return KleeneClosure(a)
            elif args == ('+',):
                return PositiveClosure(a)
            elif args == ('?',):
                return EpsilonOrOne(a)
        raise ValueError(
            "Argument to regular expression must be '*' or '+' or '?'")

    def __hash__(self):
        return hash((self._name, tuple(self)))

    def __eq__(a, b):
        return (a._name == b._name and
                tuple(a) == tuple(b))

    def __lt__(a, b):
        if a._name == b._name:
            return tuple(a) < tuple(b)
        else:
            return a._name < b._name

    def __or__(a, b):
        return Union(a, b)

    def get_num_closures(self):
        ns = 0
        for ch in self:
            ns += ch.get_num_closures()
        return ns

    def get_num_syms(self):
        ns = 0
        for ch in self:
            ns += ch.get_num_syms()
        return ns

    def get_sum_sym_lengths(self):
        ns = 0
        for ch in self:
            ns += ch.get_sum_sym_lengths()
        return ns

    def get_words_memo(self):
        ch = [x.get_words_memo() for x in self]
        return WordsMemo(self, ch)

    def get_words_of_length(self, N):
        xs = self.get_words_memo()
        return xs.get_words_of_length(N)

    def mapchildren(self, f):
        return self.__class__(*[f(x) for x in self])

    def regexpform(self):
        return self.mappedrepr(regexpname)

    def reversed(self):
        return self.mapchildren(lambda x: x.reversed())

    def rempretup(self):
        def f(x):
            if isinstance(x, Seq):
                if x is not Epsilon and isinstance(x[0], tuple):
                    ws = x[1:]
                    return Seq(*ws)
                else:
                    return x
            return x.mapchildren(f)

        return f(self)

    def seqatoms(self):
        sa = []
        self.apseqatoms(sa.append)
        return sa

    def sequni(self):
        d = {}
        us = []

        def ap(x):
            if x not in d:
                d[x] = 1
                us.append(x)
        self.apseq(ap)
        return Union(*us)

    def shform(self, conc=' '):
        r = self.mappedrepr(regexpname)
        if conc != ' ':
            r = conc.join(r.split(' '))
        return r

    def simplified(self, *a, **k):
        return self

    def simulform(self):
        def f(x):
            if x == '':
                return '()'
            return str(x)
        return self.mappedrepr(f)


def regexpname(s):
    if s == '':
        return '()'
    special = RE._re_special
    ren = []
    for c in str(s):
        if c in special+"', ":
            #c = r'\%s'%c
            c = ''
        ren.append(c)
    return ''.join(ren)


class Seq(RE):
    _priority = 0
    _name = 'Seq'

    def __new__(clas, *symbols):
        if not symbols:
            return Epsilon
        return REBASE.__new__(clas, symbols)

    def __repr__(self):
        return '%s(%s)' % (self.__class__.__name__, ', '.join(['%r' % (x,) for x in self]))

    def apseq(self, ap):
        ap(self)

    def apseqatoms(self, ap):
        for x in self:
            ap(Single(x))

    def get_num_closures(self):
        return 0

    def get_num_syms(self):
        return len(self)

    def get_sum_sym_lengths(self):
        s = 0
        for x in self:
            s += len(str(x))
        return s

    def get_words_memo(self):
        return WordsMemo(self, ())

    def get_words_of_length_memoized(self, N, memo):
        if N == len(self):
            return [self]
        else:
            return []

    def limited(self, N):
        return self

    def mappedrepr(self, f):
        if not self:
            return f('')
        return ' '.join(['%s' % (f(x),) for x in self])

    def reversed(self):
        r = list(self)
        r.reverse()
        return self.__class__(*r)

    def unionsplitted(self):
        return [self]


def Single(symbol):
    return REBASE.__new__(Seq, (symbol,))


Epsilon = REBASE.__new__(Seq, ())


def concat(*args):
    args = [x for x in args if x is not Epsilon]
    if len(args) < 2:
        if not args:
            return Epsilon
        return args[0]
    return REBASE.__new__(Concatenation, args)


class Concatenation(RE):
    _priority = 2
    _name = 'Concat'

    def __new__(clas, *args):
        if len(args) < 2:
            if not args:
                return Epsilon
            return args[0]
        return REBASE.__new__(clas, args)

    def __repr__(self):
        rs = []
        for ch in self:
            r = '%r' % (ch,)
            if ch._priority > self._priority:
                r = '(%s)' % (r,)
            rs.append(r)
        return ' + '.join(rs)

    def apseq(self, ap):
        uns = [x.sequni() for x in self]
        ixs = [0]*len(uns)
        while 1:
            xs = []
            for (i, us) in enumerate(uns):
                for x in us[ixs[i]]:
                    if x is not Epsilon:
                        xs.append(x)
            ap(Seq(*xs))
            j = 0
            for j, ix in enumerate(ixs):
                ix += 1
                if ix >= len(uns[j]):
                    ix = 0
                ixs[j] = ix
                if ix != 0:
                    break
            else:
                break

    def apseqatoms(self, ap):
        for x in self:
            x.apseqatoms(ap)

    def get_words_of_length_memoized(self, N, memo):
        chxs = []
        for ch in memo.ch:
            chxs.append(ch.get_words_of_length_upto(N))
        xs = []
        seen = {}

        def ads(xx, i, n):
            if i == len(chxs):
                if n == N:
                    for toconc in iterpermute(*xx):
                        conc = simple_Concatenation(toconc)
                        if conc not in seen:
                            xs.append(conc)
                            seen[conc] = 1
            else:
                for m, x in chxs[i]:
                    if n + m <= N:
                        ads(xx + [x], i + 1, n + m)

        ads([], 0, 0)
        return xs

    def limited(self, N):
        return Concatenation(*[x.limited(N) for x in self])

    def mappedrepr(self, f):
        rs = []
        for ch in self:
            r = ch.mappedrepr(f)
            if ch._priority > self._priority:
                r = '(%s)' % (r,)
            rs.append(r)
        return ' '.join(rs)

    def reversed(self):
        r = [x.reversed() for x in self]
        r.reverse()
        return self.__class__(*r)

    def simplified(self, *a, **k):
        conc = [x.simplified(*a, **k) for x in self]
        sa = []
        for c in conc:
            for a in c.seqatoms():
                sa.append(a)
        return simple_Concatenation(sa)

    def unionsplitted(self):
        runs = []
        uns = []
        for (i, x) in enumerate(self):
            us = x.unionsplitted()
            if len(us) > 1:
                uns.append((i, us))
        if not uns:
            return [self]
        ixs = [0]*len(uns)
        ch = list(self)
        while 1:
            xs = []
            i0 = 0
            for j, (i, us) in enumerate(uns):
                xs.extend(ch[i0:i])
                ix = ixs[j]
                xs.append(us[ix])
                i0 = i + 1
            xs.extend(ch[i0:])
            runs.append(concat(*xs))

            j = 0
            for j, ix in enumerate(ixs):
                ix += 1
                if ix >= len(uns[j][1]):
                    ix = 0
                ixs[j] = ix
                if ix != 0:
                    break
            else:
                return runs


class SimplifiedConcatenation(Concatenation):
    def simplified(self, *a, **k):
        return self


def conclosure(conc):
    # Simplification noted Mar 5 2005
    # Simplify ... b b* ... or ... b* b ... to ... b+ ...
    # conc is a sequence of regular expressions

    seen = {}
    nconc = []
    w0 = None
    for w in conc:
        if w0 is not None:
            if (w._name == '*' and      # Not isinstance(KleeneClosure), would catch PositiveClosure
                    w[0] == w0):
                w = PositiveClosure(w0)
            elif (w0._name == '*' and
                  w0[0] == w):
                w = PositiveClosure(w)
            else:
                if w0 is not None:
                    nconc.append(w0)
        w0 = w
    if w0 is not None:
        nconc.append(w0)
    return nconc


def simple_Concatenation(conc):
    if len(conc) > 1:
        conc0 = conc
        conc = conclosure(conc)
    nconc = []
    i = 0
    j = 0
    while i < len(conc):
        e = conc[i]
        if not isinstance(e, Seq):
            i += 1
            nconc.append(e)
            continue
        j = i
        while j < len(conc):
            if not isinstance(conc[j], Seq):
                break
            j += 1
        if j == i + 1:
            nconc.append(e)
        else:
            syms = []
            for k in range(i, j):
                e = conc[k]
                syms.extend(list(e))
            nconc.append(Seq(*syms))
        i = j
    if len(nconc) > 1:
        return Concatenation(*nconc)
    elif nconc:
        return nconc[0]
    else:
        return Epsilon


gauges = [
    lambda x:x.get_num_syms(),
    lambda x:x.get_num_closures(),
    lambda x:x.get_sum_sym_lengths()
]


def simpleunion(lines):
    choosen = chooserects(lines, gauges)
    have_epsilon = 0
    while 1:
        if len(choosen) == 1 and (choosen[0].width == 0 or len(choosen[0].lines) == 1):
            us = []
            for line in choosen[0].lines:
                if line:
                    us.append(line)
                else:
                    have_epsilon = 1
            break
        us = []
        for r in choosen:
            conc = r.get_common_part()
            olines = r.get_uncommons()
            u = simpleunion(olines)
            if u is not Epsilon:
                if r.dir == -1:
                    conc = [u]+conc
                else:
                    conc = conc + [u]
            if conc:
                us.append(conc)
            else:
                have_epsilon = 1
            assert not isinstance(us[-1], str)

        choosen = chooserects(us, gauges)

    if len(us) > 1:
        nus = [simple_Concatenation(line) for line in us]
        u = SimplifiedUnion(*nus)
    elif us:
        u = simple_Concatenation(us[0])
    else:
        u = None
    if have_epsilon:
        if u is not None:
            u = simple_EpsilonOrOne(u)
        else:
            u = Epsilon

    return u


class Union(RE):
    _priority = 3
    _name = 'Union'

    def __new__(clas, *args):
        return REBASE.__new__(clas, args)

    def __repr__(self):
        rs = []
        for ch in self:
            r = '%r' % (ch,)
            if ch._priority > self._priority:
                r = '(%s)' % r
            rs.append(r)
        return ' | '.join(rs)

    def apseq(self, ap):
        for c in self:
            c.apseq(ap)

    def apseqatoms(self, ap):
        for x in self:
            x.apseqatoms(ap)

    def get_words_of_length_memoized(self, N, memo):
        xs = []
        seen = {}
        for ch in memo.ch:
            for x in ch.get_words_of_length(N):
                if x not in seen:
                    seen[x] = 1
                    xs.append(x)
        return xs

    def limited(self, N):
        uni = [x.limited(N) for x in self]
        for i, x in enumerate(uni):
            if x is not self[i]:
                return self.__class__(*uni)
        return self

    def mappedrepr(self, f):
        rs = []
        for ch in self:
            r = '%s' % (ch.mappedrepr(f),)
            if ch._priority > self._priority:
                r = '(%s)' % r
            rs.append(r)
        return ' | '.join(rs)

    def simplified(self, args=None, *a, **k):
        if args is None:
            args = [x.simplified() for x in self.unionsplitted()]
            #args = [x for x in self.unionsplitted()]

        # Create a simplfied union
        # Assuming args are simplified, non-unions

        ch = [a.seqatoms() for a in args]
        return simpleunion(ch)

    def unionsplitted(self):
        us = []
        for x in self:
            us.extend(list(x.unionsplitted()))
        return us


class SimplifiedUnion(Union):
    def simplified(self, *a, **k):
        return self


class Called(RE):
    _priority = 1

    def __new__(clas, arg):
        return REBASE.__new__(clas, (arg,))

    def __repr__(self):
        ch = self[0]
        r = '%r' % (ch,)
        if ch._priority > self._priority:
            r = '(%s)' % r
        return "%s(%r)" % (r, self._name)

    def apseqatoms(self, ap):
        ap(self)

    def get_num_closures(self):
        return 1 + self[0].get_num_closures()

    def mappedrepr(self, f):
        ch = self[0]
        r = ch.mappedrepr(f)
        if (ch._priority > self._priority
                or isinstance(ch, Seq) and len(ch) > 1):
            r = '(%s)' % r
        return "%s%s" % (r, self._name)

    def simplified(self, *a, **k):
        return self.__class__(self[0].simplified(*a, **k))


class Closure(Called):
    def get_words_of_length_memoized(self, N, memo):
        if N == 0:
            return [Epsilon]
        if N == 1:
            return memo.ch[0].get_words_of_length(1)
        xs = []
        seen = {}
        for i in range(1, N):
            a = memo.get_words_of_length(i)
            b = memo.get_words_of_length(N-i)
            for ai in a:
                for bi in b:
                    aibi = simple_Concatenation((ai, bi))
                    if aibi not in seen:
                        xs.append(aibi)
                        seen[aibi] = 1
        for x in memo.ch[0].get_words_of_length(N):
            if x not in seen:
                xs.append(x)
                seen[x] = 1
        return xs

    def unionsplitted(self):
        return [self]


class KleeneClosure(Closure):
    _name = '*'

    def apseq(self, ap):
        raise InfiniteError(
            'apseq: Regular expression is infinite: contains a Kleene Closure')

    def limited(self, N):
        if N == 0:
            return Epsilon
        cl = self[0].limited(N)
        uni = []
        for i in range(N+1):
            toconc = [cl]*i
            uni.append(Concatenation(*toconc))
        return Union(*uni)

    def simplified(self, *a, **k):
        return simple_KleeneClosure(self[0].simplified(*a, **k))


def simple_KleeneClosure(x):
    # (b+)* -> b*
    if x._name == '+':
        return simple_KleeneClosure(x[0])
    return KleeneClosure(x)


class PositiveClosure(Closure):
    _name = '+'

    def apseq(self, ap):
        raise InfiniteError(
            'apseq: Regular expression is infinite: contains a Positive Closure')

    def apseqatoms(self, ap):
        self[0].apseqatoms(ap)
        simple_KleeneClosure(self[0]).apseqatoms(ap)

    def get_words_of_length_memoized(self, N, memo):
        if N <= 1:
            return memo.ch[0].get_words_of_length(N)
        return Closure.get_words_of_length_memoized(self, N, memo)

    def limited(self, N):
        a = self[0].limited(N)
        b = KleeneClosure(self[0]).limited(N)
        return Concatenation(a, b)


class EpsilonOrOne(Called):
    _name = '?'

    def apseq(self, ap):
        ap(Epsilon)
        self[0].apseq(ap)

    def get_words_of_length_memoized(self, N, memo):
        if N == 0:
            return [Epsilon]
        return memo.ch[0].get_words_of_length(N)

    def limited(self, N):
        x = self[0].limited(N)
        if x is not self[0]:
            self = self.__class__(x)
        return self

    def simplified(self, *a, **k):
        return simple_EpsilonOrOne(self[0].simplified(*a, **k))

    def unionsplitted(self):
        return [Epsilon] + list(self[0].unionsplitted())


def simple_EpsilonOrOne(x):
    # (a+)? -> a*

    if x._name == '+':
        return simple_KleeneClosure(x)

    # (a*)? -> a*
    if x._name == '*':
        return x

    return EpsilonOrOne(x)


class RegularSystem:

    def __init__(self, table, Start, final_states):
        self.table = table
        self.Start = Start
        self.Final = '358f0eca5c34bacdfbf6a8ac0ccf84bc'
        self.final_states = final_states

    def pp(self):

        def statename(state):
            try:
                name = self.names[state]
            except KeyError:
                name = str(state)
            return name

        def transname(trans):
            name = trans.simulform()
            if trans._priority > 1:
                name = '(%s)' % (name,)
            return name

        self.setup_names()

        X = self.X

        xs = [self.Start]+self.order
        xs.append(self.Final)
        for Xk in xs:
            if Xk not in X:
                continue
            print('%3s = ' % (statename(Xk),), end=' ')
            Tk = X[Xk]
            es = []
            for Xj in xs:
                if Xj in Tk:
                    es.append('%s %s' % (transname(Tk[Xj]), statename(Xj)))
            if es:
                print(' | '.join(es))
            else:
                print()

    def setup_equations(self):
        table = self.table
        final_states = self.final_states
        Final = self.Final
        self.X = X = {Final: {}}
        for Xi, transitions in list(table.items()):
            X[Xi] = Ti = {}
            for (symbol, Xj) in list(transitions.items()):
                Ti.setdefault(Xj, []).append(Single(symbol))
            for Xj, Aij in list(Ti.items()):
                if len(Aij) > 1:
                    Aij.sort()
                    Aij = Union(*Aij)
                else:
                    Aij = Aij[0]
                Ti[Xj] = Aij
            if Xi in final_states:
                Ti[Final] = Epsilon

    def setup_order(self):
        def dists(X, start):
            i = 0
            S = {start: i}
            news = [start]
            while news:
                oldnews = news
                news = []
                i += 1
                for s in oldnews:
                    if s not in X:
                        continue
                    for t in X[s]:
                        if t not in S:
                            news.append(t)
                            S[t] = i
            return S

        def start_distance(x):
            return start_dists[x]

        def sumt(f):
            memo = {}

            def g(x):
                if x in memo:
                    return memo[x]
                s = 0.0
                for y in X[x]:
                    s += f(y)
                memo[x] = s
                return s
            return g

        def cmp3(x, y):
            # Comparison for the sorting of equation solving order
            # First in list = solved last

            if x is y:
                return 0

            # Equations with more terms are resolved later
            c = len(X[y]) - len(X[x])
            if c:
                return c

            # The equations with terms more distant from start node will be resolved earlier
            i = 0
            while i < 10:  # 4 was enough with tests so far at Feb 24 2005
                try:
                    f = sumdists[i]
                except IndexError:
                    f = sumt(sumdists[i-1])
                    sumdists.append(f)
                c = f(x) - f(y)
                if c:
                    return c
                i += 1

            return (x > y) - (x < y)

        sumdists = [start_distance]
        X = self.X
        Start = self.Start
        Final = self.Final
        start_dists = dists(X, Start)
        order = [x for x in start_dists if x is not Start and x is not Final]
        order.sort(key=functools.cmp_to_key(cmp3))
        self.order = order

    def setup_names(self):
        try:
            self.order
        except AttributeError:
            self.setup_order()
        self.names = {}
        self.names[self.Start] = 'X0'

        for i, s in enumerate(self.order):
            self.names[s] = 'X%d' % (i+1)
        self.names[self.Final] = 'Final'

    def solve(self):
        # Set up equation system

        self.setup_equations()
        self.setup_order()

        X = self.X
        Start = self.Start
        Final = self.Final
        todo = list(self.order)

        # Solve equation system

        while todo:
            Xk = todo.pop()
            Tk = X[Xk]

            if Xk in Tk:
                # Recursive equation
                # Eliminate Akk Xk, using Adler's theorem
                # Given:
                # Xk = Ak0 X0 | ... Akk Xk |.. Akn Xkn
                # we get:
                # Xk = Akk* (Ak0 X0 | ... <no Xk> ... | Akn Xn)
                # which we evaluate to:
                # Xk = Bk0 X0 | ... Bkn Xn
                # where coefficients get the new values
                # Bki := Akk* Aki

                Akk = Tk[Xk]
                del Tk[Xk]

                AkkStar = Akk('*')
                for Xi, Aki in list(Tk.items()):
                    Bki = AkkStar + Aki
                    Tk[Xi] = Bki

            # Substitute Xk in each other equation in X
            # containing Xk, except eqv. Xk itself, which will not be used any more..

            del X[Xk]

            for Xj, Tj in list(X.items()):
                Bjk = Tj.get(Xk)
                if Bjk is None:
                    continue
                del Tj[Xk]
                for Xji, Tk_Xji in list(Tk.items()):
                    Cji = (Bjk + Tk_Xji)
                    Bji = Tj.get(Xji)
                    if Bji is not None:
                        Cji = Bji | Cji
                    Tj[Xji] = Cji

        # The equation system is now solved
        # The result is in Final term of Start equation

        return X[Start][Final]


Nothing = Union()


def SolveFSA(fsa):
    RS = RegularSystem(fsa.table, fsa.start_state, fsa.final_states)
    return RS.solve()