1# This module defines abstract base classes; derived classes are abstract, too
2# pylint: disable=abstract-method
3
4import numpy as np
5import sklearn.metrics as skl_metrics
6
7from Orange.data import Table, Domain, Instance, RowInstance
8from Orange.misc import DistMatrix
9from Orange.preprocess import SklImpute
10from Orange.statistics import util
11
12
13# TODO: When we upgrade to numpy 1.13, change use argument copy=False in
14# nan_to_num instead of assignment
15
16# TODO this *private* function is called from several widgets to prepare
17# data for calling the below classes. After we (mostly) stopped relying
18# on sklearn.metrics, this is (mostly) unnecessary
19# Afterwards, also remove the following line:
20# pylint: disable=redefined-outer-name
21def _preprocess(table, impute=True):
22    """Remove categorical attributes and impute missing values."""
23    if not len(table):  # this can be an array, pylint: disable=len-as-condition
24        return table
25    new_domain = Domain(
26        [a for a in table.domain.attributes if a.is_continuous],
27        table.domain.class_vars,
28        table.domain.metas)
29    new_data = table.transform(new_domain)
30    if impute:
31        new_data = SklImpute()(new_data)
32    return new_data
33
34
35# TODO I have put this function here as a substitute the above `_preprocess`.
36# None of them really belongs here; (re?)move them, eventually.
37def remove_discrete_features(data):
38    """Remove discrete columns from the data."""
39    new_domain = Domain(
40        [a for a in data.domain.attributes if a.is_continuous],
41        data.domain.class_vars,
42        data.domain.metas)
43    return data.transform(new_domain)
44
45
46def remove_nonbinary_features(data):
47    """Remove non-binary columns from the data."""
48    new_domain = Domain(
49        [a for a in data.domain.attributes
50         if a.is_discrete and len(a.values) == 2],
51        data.domain.class_vars,
52        data.domain.metas)
53    return data.transform(new_domain)
54
55def impute(data):
56    """Impute missing values."""
57    return SklImpute()(data)
58
59
60def _orange_to_numpy(x):
61    """
62    Return :class:`numpy.ndarray` (dense or sparse) with attribute data
63    from the given instance of :class:`Orange.data.Table`,
64    :class:`Orange.data.RowInstance` or :class:`Orange.data.Instance`.
65    """
66    if isinstance(x, Table):
67        return x.X
68    elif isinstance(x, Instance):
69        return np.atleast_2d(x.x)
70    elif isinstance(x, np.ndarray):
71        return np.atleast_2d(x)
72    else:
73        return x  # e.g. None
74
75
76class Distance:
77    """
78    Base class for construction of distances models (:obj:`DistanceModel`).
79
80    Distances can be computed between all pairs of rows in one table, or
81    between pairs where one row is from one table and one from another.
82
83    If `axis` is set to `0`, the class computes distances between all pairs
84    of columns in a table. Distances between columns from separate tables are
85    probably meaningless, thus unsupported.
86
87    The class can be used as follows:
88
89    - Constructor is called only with keyword argument `axis` that
90      specifies the axis over which the distances are computed, and with other
91      subclass-specific keyword arguments.
92    - Next, we call the method `fit(data)` to produce an instance of
93      :obj:`DistanceModel`; the instance stores any parameters needed for
94      computation of distances, such as statistics for normalization and
95      handling of missing data.
96    - We can then call the :obj:`DistanceModel` with data to compute the
97      distance between its rows or columns, or with two data tables to
98      compute distances between all pairs of rows.
99
100    The second, shorter way to use this class is to call the constructor with
101    one or two data tables and any additional keyword arguments. Constructor
102    will execute the above steps and return :obj:`~Orange.misc.DistMatrix`.
103    Such usage is here for backward compatibility, practicality and efficiency.
104
105    Args:
106        e1 (:obj:`~Orange.data.Table` or :obj:`~Orange.data.Instance` or \
107                :obj:`np.ndarray` or `None`):
108            data on which to train the model and compute the distances
109        e2 (:obj:`~Orange.data.Table` or :obj:`~Orange.data.Instance` or \
110                :obj:`np.ndarray` or `None`):
111            if present, the class computes distances with pairs coming from
112            the two tables
113        axis (int):
114            axis over which the distances are computed, 1 (default) for
115            rows, 0 for columns
116        impute (bool):
117            if `True` (default is `False`), nans in the computed distances
118            are replaced with zeros, and infs with very large numbers.
119        callback (callable or None):
120            callback function
121
122    Attributes:
123        axis (int):
124            axis over which the distances are computed, 1 (default) for
125            rows, 0 for columns
126        impute (bool):
127            if `True` (default is `False`), nans in the computed distances
128            are replaced with zeros, and infs with very large numbers.
129        normalize (bool):
130            if `True`, columns are normalized before computation. This attribute
131            applies only if the distance supports normalization.
132
133    The capabilities of the metrics are described with class attributes.
134
135    If class attribute `supports_discrete` is `True`, the distance
136    also uses discrete attributes to compute row distances. The use of discrete
137    attributes depends upon the type of distance; e.g. Jaccard distance observes
138    whether the value is zero or non-zero, while Euclidean and Manhattan
139    distance observes whether a pair of values is same or different.
140
141    Class attribute `supports_missing` indicates that the distance can cope
142    with missing data. In such cases, letting the distance handle it should
143    be preferred over pre-imputation of missing values.
144
145    Class attribute `supports_normalization` indicates that the constructor
146    accepts an argument `normalize`. If set to `True`, the metric will attempt
147    to normalize the values in a sense that each attribute will have equal
148    influence. For instance, the Euclidean distance subtract the mean and
149    divides the result by the deviation, while Manhattan distance uses the
150    median and MAD.
151
152    If class attribute `supports_sparse` is `True`, the class will handle
153    sparse data. Currently, all classes that do handle it rely on fallbacks to
154    SKL metrics. These, however, do not support discrete data and missing
155    values, and will fail silently.
156    """
157    supports_sparse = False
158    supports_discrete = False
159    supports_normalization = False
160    supports_missing = True
161
162    # Predefined here to silence pylint, which doesn't look into __new__
163    normalize = False
164    axis = 1
165    impute = False
166
167    def __new__(cls, e1=None, e2=None, axis=1, impute=False,
168                callback=None, **kwargs):
169        self = super().__new__(cls)
170        self.axis = axis
171        self.impute = impute
172        self.callback = callback
173        # Ugly, but needed to allow allow setting subclass-specific parameters
174        # (such as normalize) when `e1` is not `None` and the `__new__` in the
175        # subclass is skipped
176        self.__dict__.update(**kwargs)
177        if e1 is None:
178            return self
179
180        # Fallbacks for sparse data and numpy tables. Remove when subclasses
181        # no longer use fallbacks for sparse data, and handling numpy tables
182        # becomes obsolete (or handled elsewhere)
183        if (not hasattr(e1, "domain")
184                or hasattr(e1, "is_sparse") and e1.is_sparse()):
185            fallback = getattr(self, "fallback", None)
186            if fallback is not None:
187                # pylint: disable=not-callable
188                return fallback(e1, e2, axis, impute)
189
190        # Magic constructor
191        model = self.fit(e1)
192        return model(e1, e2)
193
194    def fit(self, data):
195        """
196        Abstract method returning :obj:`DistanceModel` fit to the data
197
198        Args:
199            e1 (Orange.data.Table, Orange.data.Instance, np.ndarray):
200                data for fitting the distance model
201
202        Returns:
203            model (DistanceModel)
204        """
205        raise NotImplementedError
206
207    @staticmethod
208    def check_no_discrete(n_vals):
209        """
210        Raise an exception if there are any discrete attributes.
211
212        Args:
213            n_vals (list of int): number of attributes values, 0 for continuous
214        """
215        if any(n_vals):
216            raise ValueError("columns with discrete values are incommensurable")
217
218
219class DistanceModel:
220    """
221    Base class for classes that compute distances between data rows or columns.
222    Instances of these classes are not constructed directly but returned by
223    the corresponding instances of :obj:`Distance`.
224
225    Attributes:
226        axis (int, readonly):
227            axis over which the distances are computed, 1 (default) for
228            rows, 0 for columns
229        impute (bool):
230            if `True` (default is `False`), nans in the computed distances
231            are replaced with zeros, and infs with very large numbers
232        callback (callable or None):
233            callback function
234
235    """
236    def __init__(self, axis, impute=False, callback=None):
237        self._axis = axis
238        self.impute = impute
239        self.callback = callback
240
241    @property
242    def axis(self):
243        return self._axis
244
245    def __call__(self, e1, e2=None):
246        """
247        If e2 is omitted, calculate distances between all rows (axis=1) or
248        columns (axis=2) of e1. If e2 is present, calculate distances between
249        all pairs if rows from e1 and e2.
250
251        This method converts the data into numpy arrays, calls the method
252        `compute_data` and packs the result into `DistMatrix`. Subclasses are
253        expected to define the `compute_data` and not the `__call__` method.
254
255        Args:
256            e1 (Orange.data.Table or Orange.data.Instance or numpy.ndarray):
257                input data
258            e2 (Orange.data.Table or Orange.data.Instance or numpy.ndarray):
259                secondary data
260
261        Returns:
262            A distance matrix (Orange.misc.distmatrix.DistMatrix)
263        """
264        if self.axis == 0 and e2 is not None:
265            # Backward compatibility fix
266            if e2 is e1:
267                e2 = None
268            else:
269                raise ValueError("Two tables cannot be compared by columns")
270
271        x1 = _orange_to_numpy(e1)
272        x2 = _orange_to_numpy(e2)
273        with np.errstate(invalid="ignore"):  # nans are handled below
274            dist = self.compute_distances(x1, x2)
275            if self.impute and np.isnan(dist).any():
276                dist = np.nan_to_num(dist)
277            if isinstance(e1, (Table, RowInstance)):
278                dist = DistMatrix(dist, e1, e2, self.axis)
279            else:
280                dist = DistMatrix(dist)
281            return dist
282
283    def compute_distances(self, x1, x2):
284        """
285        Abstract method for computation of distances between rows or columns of
286        `x1`, or between rows of `x1` and `x2`. Do not call directly."""
287        raise NotImplementedError
288
289
290class FittedDistanceModel(DistanceModel):
291    """
292    Base class for models that store attribute-related data for normalization
293    and imputation, and that treat discrete and continuous columns separately.
294
295    Attributes:
296        attributes (list of `Variable`): attributes on which the model was fit
297        discrete (np.ndarray): bool array indicating discrete attributes
298        continuous (np.ndarray): bool array indicating continuous attributes
299        normalize (bool):
300            if `True` (default is `False`) continuous columns are normalized
301        callback (callable or None): callback function
302    """
303    def __init__(self, attributes, axis=1, impute=False, callback=None):
304        super().__init__(axis, impute, callback)
305        self.attributes = attributes
306        self.discrete = None
307        self.continuous = None
308        self.normalize = False
309
310    def __call__(self, e1, e2=None):
311        if self.attributes is not None and (
312                e1.domain.attributes != self.attributes
313                or e2 is not None and e2.domain.attributes != self.attributes):
314            raise ValueError("mismatching domains")
315        return super().__call__(e1, e2)
316
317    def continuous_columns(self, x1, x2, offset, scale):
318        """
319        Extract and scale continuous columns from data tables.
320        If the second table is None, it defaults to the first table.
321
322        Values are scaled if `self.normalize` is `True`.
323
324        Args:
325            x1 (np.ndarray): first table
326            x2 (np.ndarray or None): second table
327            offset (float): a constant (e.g. mean, median) subtracted from data
328            scale: (float): divider (e.g. deviation)
329
330        Returns:
331            data1 (np.ndarray): scaled continuous columns from `x1`
332            data2 (np.ndarray): scaled continuous columns from `x2` or `x1`
333        """
334        if self.continuous.all() and not self.normalize:
335            data1, data2 = x1, x2
336        else:
337            data1 = x1[:, self.continuous]
338            if x2 is not None:
339                data2 = x2[:, self.continuous]
340            if self.normalize:
341                data1 = x1[:, self.continuous]
342                data1 -= offset
343                data1 /= scale
344                if x2 is not None:
345                    data2 = x2[:, self.continuous]
346                    data2 -= offset
347                    data2 /= scale
348        if x2 is None:
349            data2 = data1
350        return data1, data2
351
352    def discrete_columns(self, x1, x2):
353        """
354        Return discrete columns from the given tables.
355        If the second table is None, it defaults to the first table.
356        """
357        if self.discrete.all():
358            data1, data2 = x1, x1 if x2 is None else x2
359        else:
360            data1 = x1[:, self.discrete]
361            data2 = data1 if x2 is None else x2[:, self.discrete]
362        return data1, data2
363
364
365class FittedDistance(Distance):
366    """
367    Base class for fitting models that store attribute-related data for
368    normalization and imputation, and that treat discrete and continuous
369    columns separately.
370
371    The class implements a method `fit` that calls either `fit_columns`
372    or `fit_rows` with the data and the number of values for discrete
373    attributes. The provided method `fit_rows` calls methods
374    `get_discrete_stats` and `get_continuous_stats` that can be implemented
375    in derived classes.
376
377    Class attribute `rows_model_type` contains the type of the model returned by
378    `fit_rows`.
379    """
380    rows_model_type = None  #: Option[FittedDistanceModel]
381
382    def fit(self, data):
383        """
384        Prepare the data on attributes, call `fit_cols` or `fit_rows` and
385        return the resulting model.
386        """
387        x = _orange_to_numpy(data)
388        if hasattr(data, "domain"):
389            attributes = data.domain.attributes
390            n_vals = np.fromiter(
391                (len(attr.values) if attr.is_discrete else 0
392                 for attr in attributes),
393                dtype=np.int32, count=len(attributes))
394        else:
395            assert isinstance(x, np.ndarray)
396            attributes = None
397            n_vals = np.zeros(x.shape[1], dtype=np.int32)
398        return [self.fit_cols, self.fit_rows][self.axis](attributes, x, n_vals)
399
400    def fit_cols(self, attributes, x, n_vals):
401        """
402        Return DistanceModel for computation of distances between columns.
403        Derived classes must define this method.
404
405        Args:
406            attributes (list of Orange.data.Variable): list of attributes
407            x (np.ndarray): data
408            n_vals (np.ndarray): number of attribute values, 0 for continuous
409        """
410        raise NotImplementedError
411
412    def fit_rows(self, attributes, x, n_vals):
413        """
414        Return a DistanceModel for computation of distances between rows.
415
416        The model type is `self.row_distance_model`. It stores the data for
417        imputation of discrete and continuous values, and for normalization
418        of continuous values. Typical examples are the Euclidean and Manhattan
419        distance, for which the following data is stored:
420
421        For continuous columns:
422
423        - offsets[col] is the number subtracted from values in column `col`
424        - scales[col] is the divisor for values in columns `col`
425        - dist_missing2_cont[col]: the value used for distance between two
426            missing values in column `col`
427
428        For discrete values:
429
430        - dist_missing_disc[col, value] is the distance added for the given
431            `value` in the column `col` if the value for the other row is
432            missing
433        - dist_missing2_disc[col]: the distance between two missing values in
434            column `col`
435        """
436        n_cols = len(n_vals)
437
438        discrete = n_vals > 0
439        n_bins = max(n_vals, default=0)
440        n_discrete = sum(discrete)
441        dist_missing_disc = np.zeros((n_discrete, n_bins), dtype=float)
442        dist_missing2_disc = np.zeros(n_discrete, dtype=float)
443
444        continuous = ~discrete
445        n_continuous = sum(continuous)
446        offsets = np.zeros(n_continuous, dtype=float)
447        scales = np.empty(n_continuous, dtype=float)
448        dist_missing2_cont = np.zeros(n_continuous, dtype=float)
449
450        curr_disc = curr_cont = 0
451        for col in range(n_cols):
452            column = x[:, col]
453            if np.isnan(column).all():
454                continuous[col] = discrete[col] = False
455            elif discrete[col]:
456                discrete_stats = self.get_discrete_stats(column, n_bins)
457                if discrete_stats is not None:
458                    dist_missing_disc[curr_disc], \
459                    dist_missing2_disc[curr_disc] = discrete_stats
460                    curr_disc += 1
461            else:
462                continuous_stats = self.get_continuous_stats(column)
463                if continuous_stats is not None:
464                    offsets[curr_cont], scales[curr_cont],\
465                    dist_missing2_cont[curr_cont] = continuous_stats
466                    curr_cont += 1
467                else:
468                    continuous[col] = False
469        # pylint: disable=not-callable
470        return self.rows_model_type(
471            attributes, impute, getattr(self, "normalize", False),
472            continuous, discrete,
473            offsets[:curr_cont], scales[:curr_cont],
474            dist_missing2_cont[:curr_cont],
475            dist_missing_disc, dist_missing2_disc,
476            self.callback)
477
478    @staticmethod
479    def get_discrete_stats(column, n_bins):
480        """
481        Return tables used computing distance between missing discrete values.
482
483        Args:
484            column (np.ndarray): column data
485            n_bins (int): maximal number of bins in the dataset
486
487        Returns:
488            dist_missing_disc (np.ndarray): `dist_missing_disc[value]` is
489                1 - probability of `value`, which is used as the distance added
490                for the given `value` in the column `col` if the value for the
491                other row is missing
492            dist_missing2_disc (float): the distance between two missing
493                values in this columns
494        """
495        dist = util.bincount(column, minlength=n_bins)[0]
496        dist /= max(1, sum(dist))
497        return 1 - dist, 1 - np.sum(dist ** 2)
498
499    def get_continuous_stats(self, column):
500        """
501        Compute statistics for imputation and normalization of continuous data.
502        Derived classes must define this method.
503
504        Args:
505            column (np.ndarray): column data
506
507        Returns:
508            offset (float): the number subtracted from values in column
509            scales (float): the divisor for values in column
510            dist_missing2_cont (float): the value used for distance between two
511                missing values in column
512        """
513        raise NotImplementedError
514
515
516# Fallbacks for distances in sparse data
517# To be removed as the corresponding functionality is implemented properly
518
519class SklDistance:
520    """
521    Wrapper for functions sklearn's metrics. Used only as temporary fallbacks
522    when `Euclidean`, `Manhattan`, `Cosine` or `Jaccard` are given sparse data
523    or raw numpy arrays. These classes can't handle discrete or missing data
524    and normalization. Do not use for wrapping new classes.
525    """
526    def __init__(self, metric):
527        self.metric = metric
528
529    def __call__(self, e1, e2=None, axis=1, impute=False):
530        x1 = _orange_to_numpy(e1)
531        x2 = _orange_to_numpy(e2)
532        if axis == 0:
533            x1 = x1.T
534            if x2 is not None:
535                x2 = x2.T
536        dist = skl_metrics.pairwise.pairwise_distances(
537            x1, x2, metric=self.metric)
538        if impute and np.isnan(dist).any():
539            dist = np.nan_to_num(dist)
540        if isinstance(e1, (Table, RowInstance)):
541            dist_matrix = DistMatrix(dist, e1, e2, axis)
542        else:
543            dist_matrix = DistMatrix(dist)
544        return dist_matrix
545