from collections.abc import Iterable
from numbers import Real
import zlib

import numpy as np

from Orange import data


def _get_variable(dat, variable, expected_type=None, expected_name=""):
    """Get the variable instance from data.

    Parameters
    ----------
    dat : object
        Source to resolve `variable` against; its `domain` or `variable`
        attribute is used when present.
    variable : data.Variable or a key accepted by ``dat.domain[...]``
        A variable instance is returned as is (after checking it against
        ``dat.variable`` if `dat` has one); anything else is looked up.
    expected_type : type, optional
        If given, the resolved variable must be an instance of this type.
    expected_name : str
        Human-readable name of the expected type, used in error messages.

    Raises
    ------
    ValueError
        If the variable cannot be resolved, does not match ``dat.variable``,
        or is not of the expected type.
    """
    failed = False
    if isinstance(variable, data.Variable):
        datvar = getattr(dat, "variable", None)
        if datvar is not None and datvar is not variable:
            raise ValueError("variable does not match the variable in the data")
    elif hasattr(dat, "domain"):
        variable = dat.domain[variable]
    elif hasattr(dat, "variable"):
        variable = dat.variable
    else:
        failed = True
    if failed or (expected_type is not None
                  and not isinstance(variable, expected_type)):
        if isinstance(variable, data.Variable):
            raise ValueError(
                "expected %s variable not %s" % (expected_name, variable))
        # `expected_type` is None when the lookup itself failed without a
        # type constraint; fall back to the generic variable class so that
        # the intended ValueError is raised instead of an AttributeError.
        expected = data.Variable if expected_type is None else expected_type
        raise ValueError("expected %s, not '%s'" % (
            expected.__name__, type(variable).__name__))
    return variable


class Distribution(np.ndarray):
    """Base class for distributions stored as numpy arrays.

    Subclasses set the `unknowns` and `variable` attributes on the
    instances they create.
    """

    def __eq__(self, other):
        # Equal contents and, if `other` tracks unknowns, equal unknowns.
        return (
            np.array_equal(self, other) and
            (not hasattr(other, "unknowns") or self.unknowns == other.unknowns)
        )

    def __ne__(self, other):
        return not self == other

    def __hash__(self):
        # Checksum of the raw array buffer combined with the unknowns.
        return zlib.adler32(self) ^ hash(self.unknowns)

    def sample(self, size=None, replace=True):
        """Get a random sample from the distribution.

        Parameters
        ----------
        size : Optional[Union[int, Tuple[int, ...]]]
        replace : bool

        Returns
        -------
        Union[float, data.Value, np.ndarray]

        """
        raise NotImplementedError

    def normalize(self):
        """Normalize the distribution to a probability distribution."""
        raise NotImplementedError

    def min(self):
        """Get the smallest value for the distribution.

        If the variable is not ordinal, return None.

        """
        raise NotImplementedError

    def max(self):
        """Get the largest value for the distribution.

        If the variable is not ordinal, return None.

        """
        raise NotImplementedError


class Discrete(Distribution):
    """Distribution of a discrete variable: one entry per value."""

    def __new__(cls, dat, variable=None, unknowns=None):
        """Create a distribution from a data storage or from given counts.

        Parameters
        ----------
        dat : data.Storage, sequence or None
            A storage to compute the distribution from, an array-like of
            counts to copy, or None for an all-zero distribution (in which
            case `variable` determines the length).
        variable : data.Variable or key into ``dat.domain``, optional
        unknowns : number, optional
            Must not be given together with a data storage.
        """
        if isinstance(dat, data.Storage):
            if unknowns is not None:
                raise TypeError("incompatible arguments (data storage and 'unknowns'")
            return cls.from_data(dat, variable)

        if variable is not None:
            variable = _get_variable(dat, variable)
            n = len(variable.values)
        else:
            n = len(dat)

        self = super().__new__(cls, n)
        self.variable = variable
        if dat is None:
            self[:] = 0
            self.unknowns = unknowns or 0
        else:
            self[:] = dat
            self.unknowns = (unknowns if unknowns is not None
                             else getattr(dat, "unknowns", 0))
        return self

    @classmethod
    def from_data(cls, data, variable):
        """Compute the distribution of `variable` over the storage `data`.

        Uses ``data._compute_distributions`` and, if that raises
        NotImplementedError, counts instance by instance.
        """
        variable = _get_variable(data, variable)
        try:
            dist, unknowns = data._compute_distributions([variable])[0]
            self = super().__new__(cls, len(dist))
            self[:] = dist
            self.unknowns = unknowns
        except NotImplementedError:
            self = super().__new__(cls, len(variable.values))
            self[:] = np.zeros(len(variable.values))
            self.unknowns = 0
            if data.has_weights():
                for inst, w in zip(data, data.W):
                    val = inst[variable]
                    if not np.isnan(val):
                        self[int(val)] += w
                    else:
                        self.unknowns += w
            else:
                for inst in data:
                    val = inst[variable]
                    if val == val:  # false only for NaN, i.e. unknown
                        self[int(val)] += 1
                    else:
                        self.unknowns += 1
        self.variable = variable
        return self

    @property
    def array_with_unknowns(self):
        """
        This property returns a distribution array with unknowns added
        at the end

        Returns
        -------
        np.array
            Array with appended unknowns at the end of the row.
        """
        return np.append(np.array(self), self.unknowns)

    def __getitem__(self, index):
        # String indices are translated through the variable.
        if isinstance(index, str):
            index = self.variable.to_val(index)
        return super().__getitem__(index)

    def __setitem__(self, index, value):
        if isinstance(index, str):
            index = self.variable.to_val(index)
        super().__setitem__(index, value)

    def __add__(self, other):
        s = super().__add__(other)
        s.unknowns = self.unknowns + getattr(other, "unknowns", 0)
        return s

    def __iadd__(self, other):
        super().__iadd__(other)
        self.unknowns += getattr(other, "unknowns", 0)
        return self

    def __sub__(self, other):
        s = super().__sub__(other)
        s.unknowns = self.unknowns - getattr(other, "unknowns", 0)
        return s

    def __isub__(self, other):
        super().__isub__(other)
        self.unknowns -= getattr(other, "unknowns", 0)
        return self

    def __mul__(self, other):
        s = super().__mul__(other)
        if isinstance(other, Real):
            # Scale unknowns by the same factor as the counts.
            s.unknowns = self.unknowns * other
        return s

    def __imul__(self, other):
        super().__imul__(other)
        if isinstance(other, Real):
            self.unknowns *= other
        return self

    def __truediv__(self, other):
        s = super().__truediv__(other)
        if isinstance(other, Real):
            s.unknowns = self.unknowns / other
        return s

    def __itruediv__(self, other):
        super().__itruediv__(other)
        if isinstance(other, Real):
            self.unknowns /= other
        return self

    # Python 2 style names, kept so that explicit callers continue to work.
    __div__ = __truediv__
    __idiv__ = __itruediv__

    def normalize(self):
        """Scale the counts (and unknowns) in place so the counts sum to 1.

        If the total is not above 1e-6, the counts are set to a uniform
        distribution instead and `unknowns` is left untouched.
        """
        # Work on a plain-ndarray view of the same memory so that the
        # arithmetic does not go through the operators overridden above.
        values = np.asarray(self)
        t = values.sum()
        if t > 1e-6:
            values /= t
            self.unknowns /= t
        elif values.shape[0]:
            values[:] = 1 / values.shape[0]

    def modus(self):
        """Return the most frequent value (as `data.Value` if possible)."""
        val = np.argmax(self)
        return data.Value(self.variable, val) if self.variable is not None else val

    def sample(self, size=None, replace=True):
        """Draw values at random, weighted by the distribution.

        The distribution itself is not modified. If the total count is not
        above 1e-6, values are drawn uniformly.
        """
        counts = np.asarray(self, dtype=float)
        total = counts.sum()
        # p=None makes np.random.choice sample uniformly.
        probabilities = counts / total if total > 1e-6 else None
        value_indices = np.random.choice(
            len(counts), size, replace, probabilities)
        if isinstance(value_indices, Iterable):
            to_value = np.vectorize(lambda idx: data.Value(self.variable, idx))
            return to_value(value_indices)
        return data.Value(self.variable, value_indices)

    def min(self):
        return None

    def max(self):
        return None

    def sum(self, *args, **kwargs):
        """Sum as for ndarray; the result carries this object's unknowns."""
        res = super().sum(*args, **kwargs)
        res.unknowns = self.unknowns
        return res


def _count_continuous_values(storage, variable):
    """Count the values of `variable` by iterating over `storage`.

    Returns a ``(2, n)`` float array whose first row holds the distinct
    known values in increasing order and whose second row holds their
    (weighted) counts, together with the (weighted) number of unknowns.

    Assumes, as `Discrete.from_data` does, that ``inst[variable]`` yields
    a float-like value that is NaN when unknown.
    """
    if storage.has_weights():
        pairs = ((inst[variable], w) for inst, w in zip(storage, storage.W))
    else:
        pairs = ((inst[variable], 1) for inst in storage)
    totals = {}
    unknowns = 0
    for val, weight in pairs:
        val = float(val)
        if val != val:  # NaN marks an unknown value
            unknowns += weight
        else:
            totals[val] = totals.get(val, 0) + weight
    values = sorted(totals)
    dist = np.array([values, [totals[v] for v in values]], dtype=float)
    return dist, unknowns


class Continuous(Distribution):
    """Distribution of a continuous variable.

    Stored as a two-row array: row 0 holds values, row 1 their counts.
    """

    def __new__(cls, dat, variable=None, unknowns=None):
        """Create a distribution from a storage, a size, or an array.

        Parameters
        ----------
        dat : data.Storage, int or array-like
            A storage to compute the distribution from, the number of
            columns of an all-zero ``(2, dat)`` distribution, or an
            array-like to copy.
        variable : data.Variable or key into ``dat.domain``, optional
        unknowns : number, optional
            Must not be given together with a data storage.
        """
        if isinstance(dat, data.Storage):
            if unknowns is not None:
                raise TypeError("incompatible arguments (data storage and 'unknowns'")
            return cls.from_data(variable, dat)
        if isinstance(dat, int):
            self = super().__new__(cls, (2, dat))
            self[:] = 0
            self.unknowns = unknowns or 0
        else:
            if not isinstance(dat, np.ndarray):
                dat = np.asarray(dat)
            self = super().__new__(cls, dat.shape)
            self[:] = dat
            self.unknowns = (unknowns if unknowns is not None
                             else getattr(dat, "unknowns", 0))
        self.variable = variable
        return self

    @classmethod
    def from_data(cls, variable, data):
        """Compute the distribution of `variable` over the storage `data`.

        Uses ``data._compute_distributions`` and, if that raises
        NotImplementedError, counts instance by instance.
        """
        variable = _get_variable(data, variable)
        try:
            dist, unknowns = data._compute_distributions([variable])[0]
        except NotImplementedError:
            dist, unknowns = _count_continuous_values(data, variable)

        self = super().__new__(cls, dist.shape)
        self[:] = dist
        self.unknowns = unknowns
        self.variable = variable
        return self

    def normalize(self):
        """Scale the counts (row 1) and unknowns in place to sum to 1.

        If the total is not above 1e-6, the counts are set to a uniform
        distribution instead and `unknowns` is left untouched.
        """
        t = np.sum(self[1, :])
        if t > 1e-6:
            self[1, :] /= t
            self.unknowns /= t
        elif self.shape[1]:
            self[1, :] = 1 / self.shape[1]

    def modus(self):
        """Return the value with the largest count."""
        val = np.argmax(self[1, :])
        return self[0, val]

    def min(self):
        return self[0, 0]

    def max(self):
        return self[0, -1]

    def sample(self, size=None, replace=True):
        """Draw values at random, weighted by their counts."""
        # Normalize a copy so that this distribution stays unchanged.
        normalized = Continuous(self, self.variable, self.unknowns)
        normalized.normalize()
        return np.random.choice(self[0, :], size, replace, normalized[1, :])

    def mean(self):
        """Return the count-weighted mean of the values."""
        return np.average(np.asarray(self[0]), weights=np.asarray(self[1]))

    def variance(self):
        """Return the count-weighted variance of the values."""
        mean = self.mean()
        return np.dot((self[0] - mean) ** 2, self[1]) / np.sum(self[1])

    def standard_deviation(self):
        """Return the square root of `variance()`."""
        return np.sqrt(self.variance())


def class_distribution(data):
    """Get the distribution of the class variable(s).

    Returns a single distribution if the domain has `class_var`, a list of
    distributions if it only has `class_vars`, and raises ValueError
    otherwise.
    """
    if data.domain.class_var:
        return get_distribution(data, data.domain.class_var)
    elif data.domain.class_vars:
        return [get_distribution(data, cls) for cls in data.domain.class_vars]
    else:
        raise ValueError("domain has no class attribute")


def get_distribution(dat, variable, unknowns=None):
    """Get the distribution of the given variable.

    Returns a `Discrete` or `Continuous` instance depending on the
    variable's type; raises TypeError for any other kind of variable.
    """
    variable = _get_variable(dat, variable)
    if variable.is_discrete:
        return Discrete(dat, variable, unknowns)
    elif variable.is_continuous:
        return Continuous(dat, variable, unknowns)
    else:
        raise TypeError("cannot compute distribution of '%s'" % type(variable).__name__)


def get_distributions(dat, skipDiscrete=False, skipContinuous=False):
    """Get the distributions of all variables in the data.

    Parameters
    ----------
    dat : data storage with a `domain`
    skipDiscrete : bool
        Leave out discrete variables.
    skipContinuous : bool
        Leave out continuous variables.

    Returns
    -------
    list
        One distribution per selected variable; empty if both kinds are
        skipped.
    """
    variables = dat.domain.variables
    if skipDiscrete:
        if skipContinuous:
            return []
        columns = [i for i, var in enumerate(variables) if var.is_continuous]
    elif skipContinuous:
        columns = [i for i, var in enumerate(variables) if var.is_discrete]
    else:
        columns = None  # None requests all columns

    try:
        dist_unks = dat._compute_distributions(columns)
    except NotImplementedError:
        # The storage has no optimized implementation; compute one by one.
        indices = range(len(variables)) if columns is None else columns
        return [get_distribution(dat, i) for i in indices]

    if columns is None:
        columns = range(len(variables))
    return [get_distribution(dist, variables[col], unks)
            for col, (dist, unks) in zip(columns, dist_unks)]


def get_distributions_for_columns(data, columns):
    """Compute the distributions for columns.

    Parameters
    ----------
    data : data.Table
    columns : list
        List of column indices into the `data.domain` (indices can be
        :class:`int` or instances of `Orange.data.Variable`)

    Returns
    -------
    list
        One distribution per entry in `columns`, in the same order.
    """
    domain = data.domain
    # Normalize the columns to int indices
    columns = [col if isinstance(col, int) else domain.index(col)
               for col in columns]
    try:
        # Try the optimized code path (query the table|storage directly).
        dist_unks = data._compute_distributions(columns)
    except NotImplementedError:
        # Use default slow(er) implementation.
        return [get_distribution(data, i) for i in columns]
    else:
        # dist_unks is a list of (values, unknowns)
        return [get_distribution(dist, domain[col], unknown)
                for col, (dist, unknown) in zip(columns, dist_unks)]