1from collections.abc import Iterable
2from numbers import Real
3import zlib
4
5import numpy as np
6
7from Orange import data
8
9
def _get_variable(dat, variable, expected_type=None, expected_name=""):
    """Resolve `variable` to a variable instance, using `dat` for look-up.

    Parameters
    ----------
    dat : object
        Source of the variable. If it has a ``domain`` attribute, `variable`
        is looked up as ``dat.domain[variable]``; otherwise, if it has a
        ``variable`` attribute, that attribute is used.
    variable : data.Variable or domain key
        A variable instance (returned as is, after checking it against
        ``dat.variable`` when that attribute exists and is not None), or
        a key accepted by ``dat.domain[...]``.
    expected_type : type, optional
        If given, the resolved variable must be an instance of this type.
    expected_name : str
        Human-readable name of the expected kind; used only in the error
        message raised for a variable of the wrong type.

    Returns
    -------
    The resolved variable.

    Raises
    ------
    ValueError
        If `variable` conflicts with ``dat.variable``, cannot be resolved
        from `dat`, or is not an instance of `expected_type`.
    """
    failed = False
    if isinstance(variable, data.Variable):
        datvar = getattr(dat, "variable", None)
        if datvar is not None and datvar is not variable:
            raise ValueError("variable does not match the variable in the data")
    elif hasattr(dat, "domain"):
        variable = dat.domain[variable]
    elif hasattr(dat, "variable"):
        variable = dat.variable
    else:
        failed = True
    if failed or (expected_type is not None and not isinstance(variable, expected_type)):
        if isinstance(variable, data.Variable):
            raise ValueError("expected %s variable not %s" % (expected_name, variable))
        # `expected_type` is None when the look-up itself failed and the
        # caller imposed no type constraint; fall back to the generic
        # variable class so that a ValueError (not an AttributeError on
        # None.__name__) is raised.
        expected = data.Variable if expected_type is None else expected_type
        raise ValueError("expected %s, not '%s'" % (
            expected.__name__, type(variable).__name__))
    return variable
30
31
class Distribution(np.ndarray):
    """Base class for distributions stored as numpy arrays.

    Equality and hashing read an ``unknowns`` attribute from the instance;
    this class does not set it itself, so it must be assigned by whoever
    creates the instance. The sampling, normalization and range methods
    are placeholders that raise NotImplementedError here.
    """

    def __eq__(self, other):
        """Return a single bool: same array contents and same `unknowns`.

        `unknowns` is compared only when `other` has such an attribute.
        """
        return (
            np.array_equal(self, other) and
            (not hasattr(other, "unknowns") or self.unknowns == other.unknowns)
        )

    def __ne__(self, other):
        return not self == other

    def __hash__(self):
        """Hash the raw array bytes combined with the hash of `unknowns`."""
        # adler32 only accepts C-contiguous buffers; strided views (e.g. a
        # column slice) would otherwise raise. For an already contiguous
        # array this is a no-copy conversion and the hash is unchanged.
        return zlib.adler32(np.ascontiguousarray(self)) ^ hash(self.unknowns)

    def sample(self, size=None, replace=True):
        """Get a random sample from the distribution.

        Parameters
        ----------
        size : Optional[Union[int, Tuple[int, ...]]]
        replace : bool

        Returns
        -------
        Union[float, data.Value, np.ndarray]

        """
        raise NotImplementedError

    def normalize(self):
        """Normalize the distribution to a probability distribution."""
        raise NotImplementedError

    def min(self):
        """Get the smallest value for the distribution.

        If the variable is not ordinal, return None.

        """
        raise NotImplementedError

    def max(self):
        """Get the largest value for the distribution.

        If the variable is not ordinal, return None.

        """
        raise NotImplementedError
79
80
class Discrete(Distribution):
    """Distribution of a discrete variable: one (weighted) count per value.

    The array is one-dimensional and indexed by value index. Besides the
    counts, an instance carries `variable` (the described variable, or
    None) and `unknowns` (the amount of missing values).
    """

    def __new__(cls, dat, variable=None, unknowns=None):
        """Create a distribution from a data storage or from given counts.

        Parameters
        ----------
        dat : data.Storage, sequence or None
            A storage (the distribution is computed from it), a sequence of
            counts to copy, or None for an all-zero distribution (which
            requires `variable` to determine the length).
        variable : optional
            The variable, or a key resolved by `_get_variable`.
        unknowns : number, optional
            Amount of missing values. Must not be given together with a
            data storage. Defaults to ``dat.unknowns`` if present, else 0.

        Raises
        ------
        TypeError
            If `dat` is a data storage and `unknowns` is given.
        """
        if isinstance(dat, data.Storage):
            if unknowns is not None:
                raise TypeError("incompatible arguments (data storage and 'unknowns'")
            return cls.from_data(dat, variable)

        if variable is not None:
            variable = _get_variable(dat, variable)
            n = len(variable.values)
        else:
            n = len(dat)

        self = super().__new__(cls, n)
        self.variable = variable
        if dat is None:
            self[:] = 0
            self.unknowns = unknowns or 0
        else:
            self[:] = dat
            self.unknowns = unknowns if unknowns is not None else getattr(dat, "unknowns", 0)
        return self

    @classmethod
    def from_data(cls, data, variable):
        """Compute the distribution of `variable` from the storage `data`.

        Uses ``data._compute_distributions`` when the storage implements
        it; otherwise counts values by iterating over the instances.
        """
        variable = _get_variable(data, variable)
        try:
            dist, unknowns = data._compute_distributions([variable])[0]
            self = super().__new__(cls, len(dist))
            self[:] = dist
            self.unknowns = unknowns
        except NotImplementedError:
            # Generic slow path: tally the values one instance at a time.
            self = super().__new__(cls, len(variable.values))
            self[:] = 0
            self.unknowns = 0
            if data.has_weights():
                for inst, w in zip(data, data.W):
                    val = inst[variable]
                    if not np.isnan(val):
                        self[int(val)] += w
                    else:
                        self.unknowns += w
            else:
                for inst in data:
                    val = inst[variable]
                    if not np.isnan(val):
                        self[int(val)] += 1
                    else:
                        self.unknowns += 1
        self.variable = variable
        return self

    @property
    def array_with_unknowns(self):
        """
        This property returns a distribution array with unknowns added
        at the end

        Returns
        -------
        np.array
            Array with appended unknowns at the end of the row.
        """
        return np.append(np.array(self), self.unknowns)

    def __getitem__(self, index):
        # A string index is a value name; translate it through the variable.
        if isinstance(index, str):
            index = self.variable.to_val(index)
        return super().__getitem__(index)

    def __setitem__(self, index, value):
        # A string index is a value name; translate it through the variable.
        if isinstance(index, str):
            index = self.variable.to_val(index)
        super().__setitem__(index, value)

    def __add__(self, other):
        s = super().__add__(other)
        s.unknowns = self.unknowns + getattr(other, "unknowns", 0)
        return s

    def __iadd__(self, other):
        super().__iadd__(other)
        self.unknowns += getattr(other, "unknowns", 0)
        return self

    def __sub__(self, other):
        s = super().__sub__(other)
        s.unknowns = self.unknowns - getattr(other, "unknowns", 0)
        return s

    def __isub__(self, other):
        super().__isub__(other)
        self.unknowns -= getattr(other, "unknowns", 0)
        return self

    def __mul__(self, other):
        """Multiply the counts; a real scalar also scales `unknowns`."""
        s = super().__mul__(other)
        if isinstance(other, Real):
            s.unknowns = self.unknowns * other
        return s

    def __imul__(self, other):
        super().__imul__(other)
        if isinstance(other, Real):
            self.unknowns *= other
        return self

    def __truediv__(self, other):
        """Divide the counts; a real scalar also divides `unknowns`."""
        s = super().__truediv__(other)
        # Slices/views of a distribution do not carry `unknowns`; dividing
        # them must keep working as plain array division.
        if isinstance(other, Real) and hasattr(self, "unknowns"):
            # np.true_divide follows array semantics (inf/nan plus a
            # warning) instead of raising ZeroDivisionError on zero.
            s.unknowns = np.true_divide(self.unknowns, other)
        return s

    def __itruediv__(self, other):
        super().__itruediv__(other)
        if isinstance(other, Real) and hasattr(self, "unknowns"):
            self.unknowns = np.true_divide(self.unknowns, other)
        return self

    # Python 2 spellings, kept so that explicit calls keep working.
    __div__ = __truediv__
    __idiv__ = __itruediv__

    def normalize(self):
        """Scale the counts in place so that they sum to 1.

        `unknowns` is divided by the same total. If the total does not
        exceed 1e-6, the distribution is set to uniform instead and
        `unknowns` is left unchanged.
        """
        # Work on a plain-ndarray view of the same memory so that the total
        # is an ordinary scalar and the overloaded operators are bypassed.
        counts = np.asarray(self)
        t = counts.sum()
        if t > 1e-6:
            counts /= t
            self.unknowns /= t
        elif self.shape[0]:
            counts[:] = 1 / self.shape[0]

    def modus(self):
        """Return the most frequent value (its index if there is no variable)."""
        val = np.argmax(self)
        return data.Value(self.variable, val) if self.variable is not None else val

    def sample(self, size=None, replace=True):
        """Draw random values with probabilities proportional to the counts.

        The distribution itself is not modified. If the total count does
        not exceed 1e-6, values are drawn uniformly (the same threshold as
        used by `normalize`).

        Parameters
        ----------
        size : Optional[Union[int, Tuple[int, ...]]]
        replace : bool

        Returns
        -------
        Union[data.Value, np.ndarray]
        """
        counts = np.asarray(self, dtype=float)
        total = counts.sum()
        # p=None makes np.random.choice sample uniformly.
        probabilities = counts / total if total > 1e-6 else None
        value_indices = np.random.choice(len(self), size, replace, probabilities)
        if isinstance(value_indices, Iterable):
            to_value = np.vectorize(lambda idx: data.Value(self.variable, idx))
            return to_value(value_indices)
        return data.Value(self.variable, value_indices)

    def min(self):
        return None

    def max(self):
        return None

    def sum(self, *args, **kwargs):
        """Sum like the base array and copy `unknowns` onto the result."""
        # NOTE(review): relies on the base-class sum returning an object
        # that accepts new attributes (an array, not a bare numpy scalar)
        # - confirm against the numpy version in use.
        res = super().sum(*args, **kwargs)
        res.unknowns = self.unknowns
        return res
229
230
class Continuous(Distribution):
    """Distribution of a continuous variable.

    The array has shape (2, N): row 0 holds the values and row 1 their
    (weighted) counts. Besides the array, an instance carries `variable`
    and `unknowns` (the amount of missing values).
    """

    def __new__(cls, dat, variable=None, unknowns=None):
        """Create a distribution from a storage, a size or a given array.

        Parameters
        ----------
        dat : data.Storage, int or array-like
            A storage (the distribution is computed from it), an int N
            (an all-zero distribution of shape (2, N)), or an array-like
            whose contents are copied.
        variable : optional
            The described variable.
        unknowns : number, optional
            Amount of missing values. Must not be given together with a
            data storage. Defaults to ``dat.unknowns`` if present, else 0.

        Raises
        ------
        TypeError
            If `dat` is a data storage and `unknowns` is given.
        """
        if isinstance(dat, data.Storage):
            if unknowns is not None:
                raise TypeError("incompatible arguments (data storage and 'unknowns'")
            return cls.from_data(variable, dat)
        if isinstance(dat, int):
            self = super().__new__(cls, (2, dat))
            self[:] = 0
            self.unknowns = unknowns or 0
        else:
            if not isinstance(dat, np.ndarray):
                dat = np.asarray(dat)
            self = super().__new__(cls, dat.shape)
            self[:] = dat
            self.unknowns = (unknowns if unknowns is not None else getattr(dat, "unknowns", 0))
        self.variable = variable
        return self

    @classmethod
    def from_data(cls, variable, data):
        """Compute the distribution of `variable` from the storage `data`.

        Note the argument order (variable first). Uses
        ``data._compute_distributions`` when the storage implements it;
        otherwise the values are tallied by iterating over the instances.
        """
        variable = _get_variable(data, variable)
        try:
            dist, unknowns = data._compute_distributions([variable])[0]
        except NotImplementedError:
            dist, unknowns = cls._count_values(data, variable)

        self = super().__new__(cls, dist.shape)
        self[:] = dist
        self.unknowns = unknowns
        self.variable = variable
        return self

    @staticmethod
    def _count_values(data, variable):
        """Tally the distinct values of `variable` by iterating over `data`.

        Returns a (2, N) array (sorted distinct values; summed weights) and
        the total weight of missing (NaN) values. Instances count as 1 each
        when the storage has no weights.
        """
        values = np.array([float(inst[variable]) for inst in data], dtype=float)
        if data.has_weights():
            weights = np.asarray(data.W, dtype=float).reshape(-1)
        else:
            weights = np.ones(len(values))
        known = ~np.isnan(values)
        unknowns = weights[~known].sum()
        # np.unique returns the sorted distinct values together with, for
        # each input element, the index of its distinct value; bincount
        # then sums the weights per distinct value.
        distinct, inverse = np.unique(values[known], return_inverse=True)
        totals = np.bincount(inverse.reshape(-1), weights=weights[known],
                             minlength=len(distinct))
        return np.vstack((distinct, totals)), unknowns

    def normalize(self):
        """Scale the counts (row 1) in place so that they sum to 1.

        `unknowns` is divided by the same total. If the total does not
        exceed 1e-6, the counts are set to uniform instead.
        """
        t = np.sum(self[1, :])
        if t > 1e-6:
            self[1, :] /= t
            self.unknowns /= t
        elif self.shape[1]:
            self[1, :] = 1 / self.shape[1]

    def modus(self):
        """Return the value with the largest count."""
        val = np.argmax(self[1, :])
        return self[0, val]

    def min(self):
        """Return the first stored value."""
        return self[0, 0]

    def max(self):
        """Return the last stored value."""
        return self[0, -1]

    def sample(self, size=None, replace=True):
        """Draw random values with probabilities proportional to the counts.

        Parameters
        ----------
        size : Optional[Union[int, Tuple[int, ...]]]
        replace : bool
        """
        # Normalize a copy so that this distribution stays untouched.
        normalized = Continuous(self, self.variable, self.unknowns)
        normalized.normalize()
        return np.random.choice(self[0, :], size, replace, normalized[1, :])

    def mean(self):
        """Return the mean of the values, weighted by their counts."""
        return np.average(np.asarray(self[0]), weights=np.asarray(self[1]))

    def variance(self):
        """Return the count-weighted variance around the weighted mean."""
        mean = self.mean()
        return np.dot((self[0] - mean) ** 2, self[1]) / np.sum(self[1])

    def standard_deviation(self):
        """Return the square root of `variance()`."""
        return np.sqrt(self.variance())
309
310
def class_distribution(data):
    """Return the distribution(s) of the class variable(s) of `data`.

    A single distribution is returned when the domain has one class
    variable, a list of distributions when it has several.

    Raises
    ------
    ValueError
        If the domain has no class variable at all.
    """
    if data.domain.class_var:
        return get_distribution(data, data.domain.class_var)
    if not data.domain.class_vars:
        raise ValueError("domain has no class attribute")
    return [get_distribution(data, target) for target in data.domain.class_vars]
319
320
def get_distribution(dat, variable, unknowns=None):
    """Build the distribution object that matches the kind of `variable`.

    `variable` is first resolved with `_get_variable`; a discrete variable
    yields a `Discrete` distribution and a continuous one a `Continuous`
    distribution, both constructed from ``(dat, variable, unknowns)``.

    Raises
    ------
    TypeError
        If the variable is neither discrete nor continuous.
    """
    variable = _get_variable(dat, variable)
    if variable.is_discrete:
        factory = Discrete
    elif variable.is_continuous:
        factory = Continuous
    else:
        raise TypeError("cannot compute distribution of '%s'" % type(variable).__name__)
    return factory(dat, variable, unknowns)
330
331
def get_distributions(dat, skipDiscrete=False, skipContinuous=False):
    """Return the distributions of the variables in ``dat.domain.variables``.

    Parameters
    ----------
    dat : data storage with a ``domain``
    skipDiscrete : bool
        Leave out discrete variables.
    skipContinuous : bool
        Leave out continuous variables.

    Returns
    -------
    list
        One distribution per selected variable; empty if both kinds are
        skipped.
    """
    variables = dat.domain.variables
    if skipDiscrete and skipContinuous:
        return []
    if skipDiscrete:
        columns = [idx for idx, var in enumerate(variables) if var.is_continuous]
    elif skipContinuous:
        columns = [idx for idx, var in enumerate(variables) if var.is_discrete]
    else:
        # None asks the storage for all columns.
        columns = None
    try:
        # Fast path: the storage computes the distributions itself.
        computed = dat._compute_distributions(columns)
        if columns is None:
            columns = np.arange(len(variables))
        return [get_distribution(dist, variables[col], unks)
                for col, (dist, unks) in zip(columns, computed)]
    except NotImplementedError:
        # Slow path: compute each distribution separately from the data.
        if columns is None:
            columns = np.arange(len(variables))
        return [get_distribution(dat, col) for col in columns]
355
356
def get_distributions_for_columns(data, columns):
    """Compute the distributions for columns.

    Parameters
    ----------
    data : data.Table
        The data to compute the distributions from.
    columns : list
        List of column indices into the `data.domain` (indices can be
        :class:`int` or instances of `Orange.data.Variable`)

    Returns
    -------
    list
        One distribution per entry of `columns`, in the same order.
    """
    domain = data.domain
    # Normalize the columns to int indices.
    indices = []
    for col in columns:
        indices.append(col if isinstance(col, int) else domain.index(col))
    try:
        # Optimized path: ask the table/storage directly.
        computed = data._compute_distributions(indices)
    except NotImplementedError:
        # Default, slower implementation.
        return [get_distribution(data, idx) for idx in indices]
    # `computed` is a list of (values, unknowns) pairs.
    return [get_distribution(dist, domain[idx], unknown)
            for idx, (dist, unknown) in zip(indices, computed)]
380