# built-in
from collections import Counter
from contextlib import suppress

# app
from ..libraries import prototype
from ..utils import find_ngrams


# module-level registry of external libraries, cloned from the shared prototype
libraries = prototype.clone()
libraries.optimize()


class Base:
    """Base class for distance-like algorithms.

    Subclasses implement ``__call__`` which returns the distance between
    the given sequences; all other measures are derived from it.
    """

    def __init__(self, qval=1, external=True):
        # qval: how sequences are split before comparing (see _get_sequences)
        self.qval = qval
        # external: whether external_answer may delegate to external libraries
        self.external = external

    def __call__(self, *sequences):
        raise NotImplementedError

    @staticmethod
    def maximum(*sequences):
        """Get maximum possible value.

        This is the length of the longest sequence, or 0 when no sequences
        are passed (instead of failing with ValueError on an empty ``max``).
        """
        return max(map(len, sequences), default=0)

    def distance(self, *sequences):
        """Get distance between sequences.
        """
        return self(*sequences)

    def similarity(self, *sequences):
        """Get sequences similarity.

        similarity = maximum - distance
        """
        return self.maximum(*sequences) - self.distance(*sequences)

    def normalized_distance(self, *sequences):
        """Get distance from 0 to 1.

        Returns 0 when the maximum possible value is 0, so there is
        no division by zero.
        """
        maximum = self.maximum(*sequences)
        if maximum == 0:
            return 0
        return self.distance(*sequences) / maximum

    def normalized_similarity(self, *sequences):
        """Get similarity from 0 to 1.

        normalized_similarity = 1 - normalized_distance
        """
        return 1 - self.normalized_distance(*sequences)

    def external_answer(self, *sequences):
        """Try to get answer from known external libraries.

        Returns the first result produced by an external library,
        or None if the feature is disabled or no library gave an answer.
        """
        # if this feature is disabled
        if not getattr(self, 'external', False):
            return None
        # external libs don't support a custom test_func
        if hasattr(self, 'test_func') and self.test_func is not self._ident:
            return None
        # try to get external libs for the algorithm (looked up by class name)
        libs = libraries.get_libs(self.__class__.__name__)
        for lib in libs:
            # if conditions are not satisfied
            if not lib.check_conditions(self, *sequences):
                continue
            # if the library function is not available
            # (presumably the library is not installed)
            if not lib.get_function():
                continue

            prepared_sequences = lib.prepare(*sequences)
            # deliberately best-effort: side libraries fail silently
            # and the next library is tried
            with suppress(Exception):
                return lib.func(*prepared_sequences)
        return None

    def quick_answer(self, *sequences):
        """Try to get answer quickly, without calling the main implementation.

        If there are no sequences, 1 sequence, or all sequences are equal
        then return 0.
        If any sequence is empty then return maximum.
        Finally, try to get an external answer.
        Returns None when no quick answer is available.
        """
        if len(sequences) < 2 or self._ident(*sequences):
            return 0
        if not all(sequences):
            return self.maximum(*sequences)
        # try to get the answer from external libs (None if there is none)
        return self.external_answer(*sequences)

    @staticmethod
    def _ident(*elements):
        """Return True if all elements are equal.
        """
        try:
            # for hashable elements
            return len(set(elements)) == 1
        except TypeError:
            # for unhashable elements: compare every neighbouring pair
            pairs = zip(elements, elements[1:])
            return not any(e1 != e2 for e1, e2 in pairs)

    def _get_sequences(self, *sequences):
        """Prepare sequences.

        falsy qval (None or 0): split text by words
        qval=1: do not split sequences. For text this means comparing by letters.
        qval>1: split sequences by q-grams
        """
        # by words
        if not self.qval:
            return [s.split() for s in sequences]
        # by chars
        if self.qval == 1:
            return sequences
        # by n-grams
        return [find_ngrams(s, self.qval) for s in sequences]

    def _get_counters(self, *sequences):
        """Prepare sequences and convert them to Counters.
        """
        # already Counters
        if all(isinstance(s, Counter) for s in sequences):
            return sequences
        return [Counter(s) for s in self._get_sequences(*sequences)]

    def _intersect_counters(self, *sequences):
        """Return the intersection (``&``) of all given Counters.

        Works on a copy, so the first Counter is not modified.
        """
        intersection = sequences[0].copy()
        for s in sequences[1:]:
            intersection &= s
        return intersection

    def _union_counters(self, *sequences):
        """Return the union (``|``) of all given Counters.

        Works on a copy, so the first Counter is not modified.
        """
        union = sequences[0].copy()
        for s in sequences[1:]:
            union |= s
        return union

    def _sum_counters(self, *sequences):
        """Return the sum (``+``) of all given Counters.

        Works on a copy, so the first Counter is not modified.
        """
        result = sequences[0].copy()
        for s in sequences[1:]:
            result += s
        return result

    def _count_counters(self, counter):
        """Return all elements count from Counter.

        If the instance has a truthy ``as_set`` attribute, count only
        distinct elements; otherwise sum up all the counts.
        """
        if getattr(self, 'as_set', False):
            return len(set(counter))
        return sum(counter.values())

    def __repr__(self):
        return '{name}({data})'.format(
            name=type(self).__name__,
            data=self.__dict__,
        )


class BaseSimilarity(Base):
    """Base class for similarity-like algorithms.

    Subclasses implement ``__call__`` which returns the similarity;
    the distance is derived as ``maximum - similarity``.
    """

    def distance(self, *sequences):
        """Get distance between sequences.

        distance = maximum - similarity
        """
        return self.maximum(*sequences) - self.similarity(*sequences)

    def similarity(self, *sequences):
        """Get sequences similarity.
        """
        return self(*sequences)

    def quick_answer(self, *sequences):
        """Try to get answer quickly, without calling the main implementation.

        If there are no sequences, 1 sequence, or all sequences are equal
        then return maximum.
        If any sequence is empty then return 0.
        Finally, try to get an external answer.
        Returns None when no quick answer is available.
        """
        if len(sequences) < 2 or self._ident(*sequences):
            return self.maximum(*sequences)
        if not all(sequences):
            return 0
        # try to get the answer from external libs (None if there is none)
        return self.external_answer(*sequences)