1# This module defines abstract base classes; derived classes are abstract, too 2# pylint: disable=abstract-method 3 4import numpy as np 5import sklearn.metrics as skl_metrics 6 7from Orange.data import Table, Domain, Instance, RowInstance 8from Orange.misc import DistMatrix 9from Orange.preprocess import SklImpute 10from Orange.statistics import util 11 12 13# TODO: When we upgrade to numpy 1.13, change use argument copy=False in 14# nan_to_num instead of assignment 15 16# TODO this *private* function is called from several widgets to prepare 17# data for calling the below classes. After we (mostly) stopped relying 18# on sklearn.metrics, this is (mostly) unnecessary 19# Afterwards, also remove the following line: 20# pylint: disable=redefined-outer-name 21def _preprocess(table, impute=True): 22 """Remove categorical attributes and impute missing values.""" 23 if not len(table): # this can be an array, pylint: disable=len-as-condition 24 return table 25 new_domain = Domain( 26 [a for a in table.domain.attributes if a.is_continuous], 27 table.domain.class_vars, 28 table.domain.metas) 29 new_data = table.transform(new_domain) 30 if impute: 31 new_data = SklImpute()(new_data) 32 return new_data 33 34 35# TODO I have put this function here as a substitute the above `_preprocess`. 36# None of them really belongs here; (re?)move them, eventually. 37def remove_discrete_features(data): 38 """Remove discrete columns from the data.""" 39 new_domain = Domain( 40 [a for a in data.domain.attributes if a.is_continuous], 41 data.domain.class_vars, 42 data.domain.metas) 43 return data.transform(new_domain) 44 45 46def remove_nonbinary_features(data): 47 """Remove non-binary columns from the data.""" 48 new_domain = Domain( 49 [a for a in data.domain.attributes 50 if a.is_discrete and len(a.values) == 2], 51 data.domain.class_vars, 52 data.domain.metas) 53 return data.transform(new_domain) 54 55def impute(data): 56 """Impute missing values.""" 57 return SklImpute()(data) 58 59 60def _orange_to_numpy(x): 61 """ 62 Return :class:`numpy.ndarray` (dense or sparse) with attribute data 63 from the given instance of :class:`Orange.data.Table`, 64 :class:`Orange.data.RowInstance` or :class:`Orange.data.Instance`. 65 """ 66 if isinstance(x, Table): 67 return x.X 68 elif isinstance(x, Instance): 69 return np.atleast_2d(x.x) 70 elif isinstance(x, np.ndarray): 71 return np.atleast_2d(x) 72 else: 73 return x # e.g. None 74 75 76class Distance: 77 """ 78 Base class for construction of distances models (:obj:`DistanceModel`). 79 80 Distances can be computed between all pairs of rows in one table, or 81 between pairs where one row is from one table and one from another. 82 83 If `axis` is set to `0`, the class computes distances between all pairs 84 of columns in a table. Distances between columns from separate tables are 85 probably meaningless, thus unsupported. 86 87 The class can be used as follows: 88 89 - Constructor is called only with keyword argument `axis` that 90 specifies the axis over which the distances are computed, and with other 91 subclass-specific keyword arguments. 92 - Next, we call the method `fit(data)` to produce an instance of 93 :obj:`DistanceModel`; the instance stores any parameters needed for 94 computation of distances, such as statistics for normalization and 95 handling of missing data. 96 - We can then call the :obj:`DistanceModel` with data to compute the 97 distance between its rows or columns, or with two data tables to 98 compute distances between all pairs of rows. 99 100 The second, shorter way to use this class is to call the constructor with 101 one or two data tables and any additional keyword arguments. Constructor 102 will execute the above steps and return :obj:`~Orange.misc.DistMatrix`. 103 Such usage is here for backward compatibility, practicality and efficiency. 104 105 Args: 106 e1 (:obj:`~Orange.data.Table` or :obj:`~Orange.data.Instance` or \ 107 :obj:`np.ndarray` or `None`): 108 data on which to train the model and compute the distances 109 e2 (:obj:`~Orange.data.Table` or :obj:`~Orange.data.Instance` or \ 110 :obj:`np.ndarray` or `None`): 111 if present, the class computes distances with pairs coming from 112 the two tables 113 axis (int): 114 axis over which the distances are computed, 1 (default) for 115 rows, 0 for columns 116 impute (bool): 117 if `True` (default is `False`), nans in the computed distances 118 are replaced with zeros, and infs with very large numbers. 119 callback (callable or None): 120 callback function 121 122 Attributes: 123 axis (int): 124 axis over which the distances are computed, 1 (default) for 125 rows, 0 for columns 126 impute (bool): 127 if `True` (default is `False`), nans in the computed distances 128 are replaced with zeros, and infs with very large numbers. 129 normalize (bool): 130 if `True`, columns are normalized before computation. This attribute 131 applies only if the distance supports normalization. 132 133 The capabilities of the metrics are described with class attributes. 134 135 If class attribute `supports_discrete` is `True`, the distance 136 also uses discrete attributes to compute row distances. The use of discrete 137 attributes depends upon the type of distance; e.g. Jaccard distance observes 138 whether the value is zero or non-zero, while Euclidean and Manhattan 139 distance observes whether a pair of values is same or different. 140 141 Class attribute `supports_missing` indicates that the distance can cope 142 with missing data. In such cases, letting the distance handle it should 143 be preferred over pre-imputation of missing values. 144 145 Class attribute `supports_normalization` indicates that the constructor 146 accepts an argument `normalize`. If set to `True`, the metric will attempt 147 to normalize the values in a sense that each attribute will have equal 148 influence. For instance, the Euclidean distance subtract the mean and 149 divides the result by the deviation, while Manhattan distance uses the 150 median and MAD. 151 152 If class attribute `supports_sparse` is `True`, the class will handle 153 sparse data. Currently, all classes that do handle it rely on fallbacks to 154 SKL metrics. These, however, do not support discrete data and missing 155 values, and will fail silently. 156 """ 157 supports_sparse = False 158 supports_discrete = False 159 supports_normalization = False 160 supports_missing = True 161 162 # Predefined here to silence pylint, which doesn't look into __new__ 163 normalize = False 164 axis = 1 165 impute = False 166 167 def __new__(cls, e1=None, e2=None, axis=1, impute=False, 168 callback=None, **kwargs): 169 self = super().__new__(cls) 170 self.axis = axis 171 self.impute = impute 172 self.callback = callback 173 # Ugly, but needed to allow allow setting subclass-specific parameters 174 # (such as normalize) when `e1` is not `None` and the `__new__` in the 175 # subclass is skipped 176 self.__dict__.update(**kwargs) 177 if e1 is None: 178 return self 179 180 # Fallbacks for sparse data and numpy tables. Remove when subclasses 181 # no longer use fallbacks for sparse data, and handling numpy tables 182 # becomes obsolete (or handled elsewhere) 183 if (not hasattr(e1, "domain") 184 or hasattr(e1, "is_sparse") and e1.is_sparse()): 185 fallback = getattr(self, "fallback", None) 186 if fallback is not None: 187 # pylint: disable=not-callable 188 return fallback(e1, e2, axis, impute) 189 190 # Magic constructor 191 model = self.fit(e1) 192 return model(e1, e2) 193 194 def fit(self, data): 195 """ 196 Abstract method returning :obj:`DistanceModel` fit to the data 197 198 Args: 199 e1 (Orange.data.Table, Orange.data.Instance, np.ndarray): 200 data for fitting the distance model 201 202 Returns: 203 model (DistanceModel) 204 """ 205 raise NotImplementedError 206 207 @staticmethod 208 def check_no_discrete(n_vals): 209 """ 210 Raise an exception if there are any discrete attributes. 211 212 Args: 213 n_vals (list of int): number of attributes values, 0 for continuous 214 """ 215 if any(n_vals): 216 raise ValueError("columns with discrete values are incommensurable") 217 218 219class DistanceModel: 220 """ 221 Base class for classes that compute distances between data rows or columns. 222 Instances of these classes are not constructed directly but returned by 223 the corresponding instances of :obj:`Distance`. 224 225 Attributes: 226 axis (int, readonly): 227 axis over which the distances are computed, 1 (default) for 228 rows, 0 for columns 229 impute (bool): 230 if `True` (default is `False`), nans in the computed distances 231 are replaced with zeros, and infs with very large numbers 232 callback (callable or None): 233 callback function 234 235 """ 236 def __init__(self, axis, impute=False, callback=None): 237 self._axis = axis 238 self.impute = impute 239 self.callback = callback 240 241 @property 242 def axis(self): 243 return self._axis 244 245 def __call__(self, e1, e2=None): 246 """ 247 If e2 is omitted, calculate distances between all rows (axis=1) or 248 columns (axis=2) of e1. If e2 is present, calculate distances between 249 all pairs if rows from e1 and e2. 250 251 This method converts the data into numpy arrays, calls the method 252 `compute_data` and packs the result into `DistMatrix`. Subclasses are 253 expected to define the `compute_data` and not the `__call__` method. 254 255 Args: 256 e1 (Orange.data.Table or Orange.data.Instance or numpy.ndarray): 257 input data 258 e2 (Orange.data.Table or Orange.data.Instance or numpy.ndarray): 259 secondary data 260 261 Returns: 262 A distance matrix (Orange.misc.distmatrix.DistMatrix) 263 """ 264 if self.axis == 0 and e2 is not None: 265 # Backward compatibility fix 266 if e2 is e1: 267 e2 = None 268 else: 269 raise ValueError("Two tables cannot be compared by columns") 270 271 x1 = _orange_to_numpy(e1) 272 x2 = _orange_to_numpy(e2) 273 with np.errstate(invalid="ignore"): # nans are handled below 274 dist = self.compute_distances(x1, x2) 275 if self.impute and np.isnan(dist).any(): 276 dist = np.nan_to_num(dist) 277 if isinstance(e1, (Table, RowInstance)): 278 dist = DistMatrix(dist, e1, e2, self.axis) 279 else: 280 dist = DistMatrix(dist) 281 return dist 282 283 def compute_distances(self, x1, x2): 284 """ 285 Abstract method for computation of distances between rows or columns of 286 `x1`, or between rows of `x1` and `x2`. Do not call directly.""" 287 raise NotImplementedError 288 289 290class FittedDistanceModel(DistanceModel): 291 """ 292 Base class for models that store attribute-related data for normalization 293 and imputation, and that treat discrete and continuous columns separately. 294 295 Attributes: 296 attributes (list of `Variable`): attributes on which the model was fit 297 discrete (np.ndarray): bool array indicating discrete attributes 298 continuous (np.ndarray): bool array indicating continuous attributes 299 normalize (bool): 300 if `True` (default is `False`) continuous columns are normalized 301 callback (callable or None): callback function 302 """ 303 def __init__(self, attributes, axis=1, impute=False, callback=None): 304 super().__init__(axis, impute, callback) 305 self.attributes = attributes 306 self.discrete = None 307 self.continuous = None 308 self.normalize = False 309 310 def __call__(self, e1, e2=None): 311 if self.attributes is not None and ( 312 e1.domain.attributes != self.attributes 313 or e2 is not None and e2.domain.attributes != self.attributes): 314 raise ValueError("mismatching domains") 315 return super().__call__(e1, e2) 316 317 def continuous_columns(self, x1, x2, offset, scale): 318 """ 319 Extract and scale continuous columns from data tables. 320 If the second table is None, it defaults to the first table. 321 322 Values are scaled if `self.normalize` is `True`. 323 324 Args: 325 x1 (np.ndarray): first table 326 x2 (np.ndarray or None): second table 327 offset (float): a constant (e.g. mean, median) subtracted from data 328 scale: (float): divider (e.g. deviation) 329 330 Returns: 331 data1 (np.ndarray): scaled continuous columns from `x1` 332 data2 (np.ndarray): scaled continuous columns from `x2` or `x1` 333 """ 334 if self.continuous.all() and not self.normalize: 335 data1, data2 = x1, x2 336 else: 337 data1 = x1[:, self.continuous] 338 if x2 is not None: 339 data2 = x2[:, self.continuous] 340 if self.normalize: 341 data1 = x1[:, self.continuous] 342 data1 -= offset 343 data1 /= scale 344 if x2 is not None: 345 data2 = x2[:, self.continuous] 346 data2 -= offset 347 data2 /= scale 348 if x2 is None: 349 data2 = data1 350 return data1, data2 351 352 def discrete_columns(self, x1, x2): 353 """ 354 Return discrete columns from the given tables. 355 If the second table is None, it defaults to the first table. 356 """ 357 if self.discrete.all(): 358 data1, data2 = x1, x1 if x2 is None else x2 359 else: 360 data1 = x1[:, self.discrete] 361 data2 = data1 if x2 is None else x2[:, self.discrete] 362 return data1, data2 363 364 365class FittedDistance(Distance): 366 """ 367 Base class for fitting models that store attribute-related data for 368 normalization and imputation, and that treat discrete and continuous 369 columns separately. 370 371 The class implements a method `fit` that calls either `fit_columns` 372 or `fit_rows` with the data and the number of values for discrete 373 attributes. The provided method `fit_rows` calls methods 374 `get_discrete_stats` and `get_continuous_stats` that can be implemented 375 in derived classes. 376 377 Class attribute `rows_model_type` contains the type of the model returned by 378 `fit_rows`. 379 """ 380 rows_model_type = None #: Option[FittedDistanceModel] 381 382 def fit(self, data): 383 """ 384 Prepare the data on attributes, call `fit_cols` or `fit_rows` and 385 return the resulting model. 386 """ 387 x = _orange_to_numpy(data) 388 if hasattr(data, "domain"): 389 attributes = data.domain.attributes 390 n_vals = np.fromiter( 391 (len(attr.values) if attr.is_discrete else 0 392 for attr in attributes), 393 dtype=np.int32, count=len(attributes)) 394 else: 395 assert isinstance(x, np.ndarray) 396 attributes = None 397 n_vals = np.zeros(x.shape[1], dtype=np.int32) 398 return [self.fit_cols, self.fit_rows][self.axis](attributes, x, n_vals) 399 400 def fit_cols(self, attributes, x, n_vals): 401 """ 402 Return DistanceModel for computation of distances between columns. 403 Derived classes must define this method. 404 405 Args: 406 attributes (list of Orange.data.Variable): list of attributes 407 x (np.ndarray): data 408 n_vals (np.ndarray): number of attribute values, 0 for continuous 409 """ 410 raise NotImplementedError 411 412 def fit_rows(self, attributes, x, n_vals): 413 """ 414 Return a DistanceModel for computation of distances between rows. 415 416 The model type is `self.row_distance_model`. It stores the data for 417 imputation of discrete and continuous values, and for normalization 418 of continuous values. Typical examples are the Euclidean and Manhattan 419 distance, for which the following data is stored: 420 421 For continuous columns: 422 423 - offsets[col] is the number subtracted from values in column `col` 424 - scales[col] is the divisor for values in columns `col` 425 - dist_missing2_cont[col]: the value used for distance between two 426 missing values in column `col` 427 428 For discrete values: 429 430 - dist_missing_disc[col, value] is the distance added for the given 431 `value` in the column `col` if the value for the other row is 432 missing 433 - dist_missing2_disc[col]: the distance between two missing values in 434 column `col` 435 """ 436 n_cols = len(n_vals) 437 438 discrete = n_vals > 0 439 n_bins = max(n_vals, default=0) 440 n_discrete = sum(discrete) 441 dist_missing_disc = np.zeros((n_discrete, n_bins), dtype=float) 442 dist_missing2_disc = np.zeros(n_discrete, dtype=float) 443 444 continuous = ~discrete 445 n_continuous = sum(continuous) 446 offsets = np.zeros(n_continuous, dtype=float) 447 scales = np.empty(n_continuous, dtype=float) 448 dist_missing2_cont = np.zeros(n_continuous, dtype=float) 449 450 curr_disc = curr_cont = 0 451 for col in range(n_cols): 452 column = x[:, col] 453 if np.isnan(column).all(): 454 continuous[col] = discrete[col] = False 455 elif discrete[col]: 456 discrete_stats = self.get_discrete_stats(column, n_bins) 457 if discrete_stats is not None: 458 dist_missing_disc[curr_disc], \ 459 dist_missing2_disc[curr_disc] = discrete_stats 460 curr_disc += 1 461 else: 462 continuous_stats = self.get_continuous_stats(column) 463 if continuous_stats is not None: 464 offsets[curr_cont], scales[curr_cont],\ 465 dist_missing2_cont[curr_cont] = continuous_stats 466 curr_cont += 1 467 else: 468 continuous[col] = False 469 # pylint: disable=not-callable 470 return self.rows_model_type( 471 attributes, impute, getattr(self, "normalize", False), 472 continuous, discrete, 473 offsets[:curr_cont], scales[:curr_cont], 474 dist_missing2_cont[:curr_cont], 475 dist_missing_disc, dist_missing2_disc, 476 self.callback) 477 478 @staticmethod 479 def get_discrete_stats(column, n_bins): 480 """ 481 Return tables used computing distance between missing discrete values. 482 483 Args: 484 column (np.ndarray): column data 485 n_bins (int): maximal number of bins in the dataset 486 487 Returns: 488 dist_missing_disc (np.ndarray): `dist_missing_disc[value]` is 489 1 - probability of `value`, which is used as the distance added 490 for the given `value` in the column `col` if the value for the 491 other row is missing 492 dist_missing2_disc (float): the distance between two missing 493 values in this columns 494 """ 495 dist = util.bincount(column, minlength=n_bins)[0] 496 dist /= max(1, sum(dist)) 497 return 1 - dist, 1 - np.sum(dist ** 2) 498 499 def get_continuous_stats(self, column): 500 """ 501 Compute statistics for imputation and normalization of continuous data. 502 Derived classes must define this method. 503 504 Args: 505 column (np.ndarray): column data 506 507 Returns: 508 offset (float): the number subtracted from values in column 509 scales (float): the divisor for values in column 510 dist_missing2_cont (float): the value used for distance between two 511 missing values in column 512 """ 513 raise NotImplementedError 514 515 516# Fallbacks for distances in sparse data 517# To be removed as the corresponding functionality is implemented properly 518 519class SklDistance: 520 """ 521 Wrapper for functions sklearn's metrics. Used only as temporary fallbacks 522 when `Euclidean`, `Manhattan`, `Cosine` or `Jaccard` are given sparse data 523 or raw numpy arrays. These classes can't handle discrete or missing data 524 and normalization. Do not use for wrapping new classes. 525 """ 526 def __init__(self, metric): 527 self.metric = metric 528 529 def __call__(self, e1, e2=None, axis=1, impute=False): 530 x1 = _orange_to_numpy(e1) 531 x2 = _orange_to_numpy(e2) 532 if axis == 0: 533 x1 = x1.T 534 if x2 is not None: 535 x2 = x2.T 536 dist = skl_metrics.pairwise.pairwise_distances( 537 x1, x2, metric=self.metric) 538 if impute and np.isnan(dist).any(): 539 dist = np.nan_to_num(dist) 540 if isinstance(e1, (Table, RowInstance)): 541 dist_matrix = DistMatrix(dist, e1, e2, axis) 542 else: 543 dist_matrix = DistMatrix(dist) 544 return dist_matrix 545