1# built-in
2from difflib import SequenceMatcher as _SequenceMatcher
3
4# app
5from ..utils import find_ngrams
6from .base import BaseSimilarity as _BaseSimilarity
7
8
9try:
10    import numpy
11except ImportError:
12    from array import array
13    numpy = None
14
15
16__all__ = [
17    'lcsseq', 'lcsstr', 'ratcliff_obershelp',
18    'LCSSeq', 'LCSStr', 'RatcliffObershelp',
19]
20
21
22class LCSSeq(_BaseSimilarity):
23    """longest common subsequence similarity
24
25    https://en.wikipedia.org/wiki/Longest_common_subsequence_problem
26    """
27    def __init__(self, qval=1, test_func=None, external=True):
28        self.qval = qval
29        self.test_func = test_func or self._ident
30        self.external = external
31
32    def _dynamic(self, seq1, seq2):
33        """
34        https://github.com/chrislit/abydos/blob/master/abydos/distance/_lcsseq.py
35        http://www.dis.uniroma1.it/~bonifaci/algo/LCSSEQ.py
36        http://rosettacode.org/wiki/Longest_common_subsequence#Dynamic_Programming_8
37        """
38        if numpy:
39            lengths = numpy.zeros((len(seq1) + 1, len(seq2) + 1), dtype=int)
40        else:
41            lengths = [array('L', [0] * (len(seq2) + 1)) for _ in range(len(seq1) + 1)]
42
43        # row 0 and column 0 are initialized to 0 already
44        for i, char1 in enumerate(seq1):
45            for j, char2 in enumerate(seq2):
46                if char1 == char2:
47                    lengths[i + 1][j + 1] = lengths[i][j] + 1
48                else:
49                    lengths[i + 1][j + 1] = max(lengths[i + 1][j], lengths[i][j + 1])
50
51        # read the substring out from the matrix
52        result = ''
53        i, j = len(seq1), len(seq2)
54        while i != 0 and j != 0:
55            if lengths[i][j] == lengths[i - 1][j]:
56                i -= 1
57            elif lengths[i][j] == lengths[i][j - 1]:
58                j -= 1
59            else:
60                assert seq1[i - 1] == seq2[j - 1]
61                result = seq1[i - 1] + result
62                i -= 1
63                j -= 1
64        return result
65
66    def _recursive(self, *sequences):
67        if not all(sequences):
68            return type(sequences[0])()  # empty sequence
69        if self.test_func(*[s[-1] for s in sequences]):
70            c = sequences[0][-1]
71            sequences = [s[:-1] for s in sequences]
72            return self(*sequences) + c
73        m = type(sequences[0])()  # empty sequence
74        for i, s in enumerate(sequences):
75            ss = sequences[:i] + (s[:-1], ) + sequences[i + 1:]
76            m = max([self(*ss), m], key=len)
77        return m
78
79    def __call__(self, *sequences):
80        if not sequences:
81            return ''
82        sequences = self._get_sequences(*sequences)
83        if len(sequences) == 2:
84            return self._dynamic(*sequences)
85        else:
86            return self._recursive(*sequences)
87
88    def similarity(self, *sequences):
89        return len(self(*sequences))
90
91
92class LCSStr(_BaseSimilarity):
93    """longest common substring similarity
94    """
95    def _standart(self, s1, s2):
96        matcher = _SequenceMatcher(a=s1, b=s2)
97        match = matcher.find_longest_match(0, len(s1), 0, len(s2))
98        return s1[match.a: match.a + match.size]
99
100    def _custom(self, *sequences):
101        short = min(sequences, key=len)
102        length = len(short)
103        for n in range(length, 0, -1):
104            for subseq in find_ngrams(short, n):
105                subseq = ''.join(subseq)
106                for seq in sequences:
107                    if subseq not in seq:
108                        break
109                else:
110                    return subseq
111        return type(short)()  # empty sequence
112
113    def __call__(self, *sequences):
114        if not all(sequences):
115            return ''
116        length = len(sequences)
117        if length == 0:
118            return ''
119        if length == 1:
120            return sequences[0]
121
122        sequences = self._get_sequences(*sequences)
123        if length == 2 and max(map(len, sequences)) < 200:
124            return self._standart(*sequences)
125        return self._custom(*sequences)
126
127    def similarity(self, *sequences):
128        return len(self(*sequences))
129
130
131class RatcliffObershelp(_BaseSimilarity):
132    """Ratcliff-Obershelp similarity
133    This follows the Ratcliff-Obershelp algorithm to derive a similarity
134    measure:
135        1. Find the length of the longest common substring in sequences.
136        2. Recurse on the strings to the left & right of each this substring
137           in sequences. The base case is a 0 length common substring, in which
138           case, return 0. Otherwise, return the sum of the current longest
139           common substring and the left & right recursed sums.
140        3. Multiply this length by 2 and divide by the sum of the lengths of
141           sequences.
142
143    https://en.wikipedia.org/wiki/Gestalt_Pattern_Matching
144    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/ratcliff-obershelp.js
145    https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html
146    """
147
148    def maximum(self, *sequences):
149        return 1
150
151    def _find(self, *sequences):
152        subseq = LCSStr()(*sequences)
153        length = len(subseq)
154        if length == 0:
155            return 0
156        before = [s[:s.find(subseq)] for s in sequences]
157        after = [s[s.find(subseq) + length:] for s in sequences]
158        return self._find(*before) + length + self._find(*after)
159
160    def __call__(self, *sequences):
161        result = self.quick_answer(*sequences)
162        if result is not None:
163            return result
164        scount = len(sequences)  # sequences count
165        ecount = sum(map(len, sequences))  # elements count
166        sequences = self._get_sequences(*sequences)
167        return scount * self._find(*sequences) / ecount
168
169
170lcsseq = LCSSeq()
171lcsstr = LCSStr()
172ratcliff_obershelp = RatcliffObershelp()
173