1# built-in 2from difflib import SequenceMatcher as _SequenceMatcher 3 4# app 5from ..utils import find_ngrams 6from .base import BaseSimilarity as _BaseSimilarity 7 8 9try: 10 import numpy 11except ImportError: 12 from array import array 13 numpy = None 14 15 16__all__ = [ 17 'lcsseq', 'lcsstr', 'ratcliff_obershelp', 18 'LCSSeq', 'LCSStr', 'RatcliffObershelp', 19] 20 21 22class LCSSeq(_BaseSimilarity): 23 """longest common subsequence similarity 24 25 https://en.wikipedia.org/wiki/Longest_common_subsequence_problem 26 """ 27 def __init__(self, qval=1, test_func=None, external=True): 28 self.qval = qval 29 self.test_func = test_func or self._ident 30 self.external = external 31 32 def _dynamic(self, seq1, seq2): 33 """ 34 https://github.com/chrislit/abydos/blob/master/abydos/distance/_lcsseq.py 35 http://www.dis.uniroma1.it/~bonifaci/algo/LCSSEQ.py 36 http://rosettacode.org/wiki/Longest_common_subsequence#Dynamic_Programming_8 37 """ 38 if numpy: 39 lengths = numpy.zeros((len(seq1) + 1, len(seq2) + 1), dtype=int) 40 else: 41 lengths = [array('L', [0] * (len(seq2) + 1)) for _ in range(len(seq1) + 1)] 42 43 # row 0 and column 0 are initialized to 0 already 44 for i, char1 in enumerate(seq1): 45 for j, char2 in enumerate(seq2): 46 if char1 == char2: 47 lengths[i + 1][j + 1] = lengths[i][j] + 1 48 else: 49 lengths[i + 1][j + 1] = max(lengths[i + 1][j], lengths[i][j + 1]) 50 51 # read the substring out from the matrix 52 result = '' 53 i, j = len(seq1), len(seq2) 54 while i != 0 and j != 0: 55 if lengths[i][j] == lengths[i - 1][j]: 56 i -= 1 57 elif lengths[i][j] == lengths[i][j - 1]: 58 j -= 1 59 else: 60 assert seq1[i - 1] == seq2[j - 1] 61 result = seq1[i - 1] + result 62 i -= 1 63 j -= 1 64 return result 65 66 def _recursive(self, *sequences): 67 if not all(sequences): 68 return type(sequences[0])() # empty sequence 69 if self.test_func(*[s[-1] for s in sequences]): 70 c = sequences[0][-1] 71 sequences = [s[:-1] for s in sequences] 72 return self(*sequences) + c 73 m = type(sequences[0])() # empty sequence 74 for i, s in enumerate(sequences): 75 ss = sequences[:i] + (s[:-1], ) + sequences[i + 1:] 76 m = max([self(*ss), m], key=len) 77 return m 78 79 def __call__(self, *sequences): 80 if not sequences: 81 return '' 82 sequences = self._get_sequences(*sequences) 83 if len(sequences) == 2: 84 return self._dynamic(*sequences) 85 else: 86 return self._recursive(*sequences) 87 88 def similarity(self, *sequences): 89 return len(self(*sequences)) 90 91 92class LCSStr(_BaseSimilarity): 93 """longest common substring similarity 94 """ 95 def _standart(self, s1, s2): 96 matcher = _SequenceMatcher(a=s1, b=s2) 97 match = matcher.find_longest_match(0, len(s1), 0, len(s2)) 98 return s1[match.a: match.a + match.size] 99 100 def _custom(self, *sequences): 101 short = min(sequences, key=len) 102 length = len(short) 103 for n in range(length, 0, -1): 104 for subseq in find_ngrams(short, n): 105 subseq = ''.join(subseq) 106 for seq in sequences: 107 if subseq not in seq: 108 break 109 else: 110 return subseq 111 return type(short)() # empty sequence 112 113 def __call__(self, *sequences): 114 if not all(sequences): 115 return '' 116 length = len(sequences) 117 if length == 0: 118 return '' 119 if length == 1: 120 return sequences[0] 121 122 sequences = self._get_sequences(*sequences) 123 if length == 2 and max(map(len, sequences)) < 200: 124 return self._standart(*sequences) 125 return self._custom(*sequences) 126 127 def similarity(self, *sequences): 128 return len(self(*sequences)) 129 130 131class RatcliffObershelp(_BaseSimilarity): 132 """Ratcliff-Obershelp similarity 133 This follows the Ratcliff-Obershelp algorithm to derive a similarity 134 measure: 135 1. Find the length of the longest common substring in sequences. 136 2. Recurse on the strings to the left & right of each this substring 137 in sequences. The base case is a 0 length common substring, in which 138 case, return 0. Otherwise, return the sum of the current longest 139 common substring and the left & right recursed sums. 140 3. Multiply this length by 2 and divide by the sum of the lengths of 141 sequences. 142 143 https://en.wikipedia.org/wiki/Gestalt_Pattern_Matching 144 https://github.com/Yomguithereal/talisman/blob/master/src/metrics/ratcliff-obershelp.js 145 https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html 146 """ 147 148 def maximum(self, *sequences): 149 return 1 150 151 def _find(self, *sequences): 152 subseq = LCSStr()(*sequences) 153 length = len(subseq) 154 if length == 0: 155 return 0 156 before = [s[:s.find(subseq)] for s in sequences] 157 after = [s[s.find(subseq) + length:] for s in sequences] 158 return self._find(*before) + length + self._find(*after) 159 160 def __call__(self, *sequences): 161 result = self.quick_answer(*sequences) 162 if result is not None: 163 return result 164 scount = len(sequences) # sequences count 165 ecount = sum(map(len, sequences)) # elements count 166 sequences = self._get_sequences(*sequences) 167 return scount * self._find(*sequences) / ecount 168 169 170lcsseq = LCSSeq() 171lcsstr = LCSStr() 172ratcliff_obershelp = RatcliffObershelp() 173