1# Test methods with long descriptive names can omit docstrings
2# Test internal methods
3# pylint: disable=missing-docstring, protected-access
4
5import unittest
6from unittest.mock import Mock
7import warnings
8
9import numpy as np
10import scipy.sparse as sp
11
12from Orange.statistics import distribution
13from Orange import data
14from Orange.tests import test_filename
15
16
def assert_dist_equal(dist, expected):
    """Assert that `dist`, converted with `np.asarray`, equals `expected`.

    The comparison is exact and element-wise; an `AssertionError` from
    `np.testing.assert_array_equal` is raised on any mismatch.
    """
    actual = np.asarray(dist)
    np.testing.assert_array_equal(actual, expected)
19
20
def assert_dist_almost_equal(dist, expected):
    """Assert that `dist`, converted with `np.asarray`, is close to `expected`.

    The comparison is delegated to `np.testing.assert_almost_equal` with its
    default precision; an `AssertionError` is raised on a mismatch.
    """
    actual = np.asarray(dist)
    np.testing.assert_almost_equal(actual, expected)
23
24
class TestDiscreteDistribution(unittest.TestCase):
    """Tests for ``distribution.Discrete``.

    Most tests use the distribution of the "type" class variable of the
    "zoo" table. ``setUp`` also builds a small table with two discrete
    columns containing missing values; their distributions are stored in
    ``self.rgb`` and ``self.num``.
    """

    def setUp(self):
        # Expected absolute frequencies of "type" in zoo, and the same
        # frequencies scaled to sum to 1
        self.freqs = [4.0, 20.0, 13.0, 8.0, 10.0, 41.0, 5.0]
        s = sum(self.freqs)
        self.rfreqs = [x/s for x in self.freqs]

        self.data = data.Table.from_numpy(
            data.Domain(
                attributes=[
                    data.DiscreteVariable('rgb', values=('r', 'g', 'b', 'a')),
                    data.DiscreteVariable('num', values=('1', '2', '3')),
                ]
            ),
            X=np.array([
                [0, 2, 0, 1, 1, 0, np.nan, 1],
                [0, 2, 0, np.nan, 1, 2, np.nan, 1],
            ]).T
        )
        self.rgb, self.num = distribution.get_distributions(self.data)

    def test_from_table(self):
        # The same distribution must result whether the variable is given
        # by name, by descriptor, by index or via class_distribution
        d = data.Table("zoo")
        disc = distribution.Discrete(d, "type")
        self.assertIsInstance(disc, np.ndarray)
        self.assertIs(disc.variable, d.domain["type"])
        self.assertEqual(disc.unknowns, 0)
        assert_dist_equal(disc, self.freqs)

        disc2 = distribution.Discrete(d, d.domain.class_var)
        self.assertIsInstance(disc2, np.ndarray)
        self.assertIs(disc2.variable, d.domain.class_var)
        self.assertEqual(disc, disc2)

        disc3 = distribution.Discrete(d, len(d.domain.attributes))
        self.assertIsInstance(disc3, np.ndarray)
        self.assertIs(disc3.variable, d.domain.class_var)
        self.assertEqual(disc, disc3)

        disc5 = distribution.class_distribution(d)
        self.assertIsInstance(disc5, np.ndarray)
        self.assertIs(disc5.variable, d.domain.class_var)
        self.assertEqual(disc, disc5)

    def test_construction(self):
        d = data.Table("zoo")

        disc = distribution.Discrete(d, "type")
        self.assertIsInstance(disc, np.ndarray)
        self.assertIs(disc.variable, d.domain["type"])
        self.assertEqual(disc.unknowns, 0)
        self.assertIs(disc.variable, d.domain.class_var)

        # Construction from a plain list of frequencies, without a variable
        disc7 = distribution.Discrete(self.freqs)
        self.assertIsInstance(disc7, np.ndarray)
        self.assertIsNone(disc7.variable)
        self.assertEqual(disc7.unknowns, 0)
        self.assertEqual(disc, disc7)

        # Construction from a variable alone gives an all-zero distribution
        disc1 = distribution.Discrete(None, d.domain.class_var)
        self.assertIsInstance(disc1, np.ndarray)
        self.assertIs(disc1.variable, d.domain.class_var)
        self.assertEqual(disc1.unknowns, 0)
        assert_dist_equal(disc1, [0]*len(d.domain.class_var.values))

    def test_fallback(self):
        d = data.Table("zoo")
        default = distribution.Discrete(d, "type")

        # Make the table's own computation unavailable; constructing the
        # distribution must still give the same result
        d._compute_distributions = Mock(side_effect=NotImplementedError)
        fallback = distribution.Discrete(d, "type")

        np.testing.assert_almost_equal(
            np.asarray(fallback), np.asarray(default))
        np.testing.assert_almost_equal(fallback.unknowns, default.unknowns)

    def test_fallback_with_weights_and_nan(self):
        d = data.Table("zoo")
        # A locally seeded generator keeps the weights reproducible and
        # leaves numpy's global random state untouched
        rng = np.random.RandomState(0)
        d.set_weights(rng.uniform(0., 1., size=len(d)))
        d.Y[::10] = np.nan

        default = distribution.Discrete(d, "type")
        d._compute_distributions = Mock(side_effect=NotImplementedError)
        fallback = distribution.Discrete(d, "type")

        np.testing.assert_almost_equal(
            np.asarray(fallback), np.asarray(default))
        np.testing.assert_almost_equal(fallback.unknowns, default.unknowns)

    def test_equality(self):
        d = data.Table("zoo")
        d1 = distribution.Discrete(d, 0)
        d2 = distribution.Discrete(d, 0)
        d3 = distribution.Discrete(d, 1)

        self.assertEqual(d1, d1)
        self.assertEqual(d1, d2)
        self.assertNotEqual(d1, d3)

    def test_indexing(self):
        d = data.Table("zoo")
        indamphibian = d.domain.class_var.to_val("amphibian")

        disc = distribution.class_distribution(d)

        self.assertEqual(len(disc), len(d.domain.class_var.values))

        # Elements can be read and written by value name or by index
        self.assertEqual(disc["mammal"], 41)
        self.assertEqual(disc[indamphibian], 4)

        disc["mammal"] = 100
        self.assertEqual(disc[d.domain.class_var.to_val("mammal")], 100)

        disc[indamphibian] = 33
        self.assertEqual(disc["amphibian"], 33)

        # A freshly computed distribution is unaffected by the edits above
        disc = distribution.class_distribution(d)
        self.assertEqual(list(disc), self.freqs)

    def test_hash(self):
        d = data.Table("zoo")
        disc = distribution.Discrete(d, "type")

        disc2 = distribution.Discrete(d, d.domain.class_var)
        self.assertEqual(hash(disc), hash(disc2))

        # The hash must follow changes of frequencies ...
        disc2[0] += 1
        self.assertNotEqual(hash(disc), hash(disc2))

        disc2[0] -= 1
        self.assertEqual(hash(disc), hash(disc2))

        # ... and of the number of unknowns
        disc2.unknowns += 1
        self.assertNotEqual(hash(disc), hash(disc2))

    def test_add(self):
        d = data.Table("zoo")
        disc = distribution.Discrete(d, "type")

        disc += [1, 2, 3, 4, 5, 6, 7]
        self.assertEqual(disc, [5.0, 22.0, 16.0, 12.0, 15.0, 47.0, 12.0])

        disc2 = distribution.Discrete(d, d.domain.class_var)

        disc3 = disc - disc2
        self.assertEqual(disc3, list(range(1, 8)))

        disc3 *= 2
        self.assertEqual(disc3, [2*x for x in range(1, 8)])

    def test_normalize(self):
        d = data.Table("zoo")
        disc = distribution.Discrete(d, "type")
        disc.normalize()
        self.assertEqual(disc, self.rfreqs)
        # Normalizing a second time must not change anything
        disc.normalize()
        self.assertEqual(disc, self.rfreqs)

        # An all-zero distribution is expected to normalize to uniform
        disc1 = distribution.Discrete(None, d.domain.class_var)
        disc1.normalize()
        v = len(d.domain.class_var.values)
        assert_dist_almost_equal(disc1, [1/v]*v)

    def test_modus(self):
        d = data.Table("zoo")
        disc = distribution.Discrete(d, "type")
        self.assertEqual(str(disc.modus()), "mammal")

    def test_sample(self):
        ans = self.num.sample((500, 2), replace=True)
        np.testing.assert_equal(np.unique(ans), [0, 1, 2])

        # Check that sampling a single value works too
        self.assertIn(self.num.sample(), [0, 1, 2])

    def test_min_max(self):
        # Min and max don't make sense in the context of nominal variables
        self.assertIsNone(self.rgb.min())
        self.assertIsNone(self.rgb.max())
        # `num` has numeric-looking labels but is still a plain
        # DiscreteVariable, so min and max are None for it as well
        self.assertIsNone(self.num.min())
        self.assertIsNone(self.num.max())

    def test_array_with_unknowns(self):
        d = data.Table("zoo")
        # Turn one class value into a missing one; per the expected
        # frequencies below, this removes one instance from the sixth value
        d.Y[0] = np.nan
        disc = distribution.Discrete(d, "type")
        self.assertIsInstance(disc, np.ndarray)
        self.assertEqual(disc.unknowns, 1)
        true_freq = [4., 20., 13., 8., 10., 40., 5.]
        assert_dist_equal(disc, true_freq)
        # array_with_unknowns appends the number of unknowns to the counts
        np.testing.assert_array_equal(disc.array_with_unknowns,
                                      np.append(true_freq, 1))
217
218
class TestContinuousDistribution(unittest.TestCase):
    """Tests for ``distribution.Continuous``.

    The "iris" table is loaded once for the whole class, together with a
    small two-column numeric table whose distributions are stored in
    ``cls.n1`` and ``cls.n2``. ``setUp`` provides ``self.freqs``, the
    expected distribution of iris' petal length as a two-row array (values
    in the first row, their frequencies in the second).
    """

    @classmethod
    def setUpClass(cls):
        cls.iris = data.Table("iris")

        cls.data = data.Table.from_numpy(
            data.Domain(
                attributes=[
                    data.ContinuousVariable('n1'),
                    data.ContinuousVariable('n2'),
                ]
            ),
            X=np.array([range(10), [1, 1, 1, 5, 5, 8, 9, np.nan, 9, 9]]).T
        )
        cls.n1, cls.n2 = distribution.get_distributions(cls.data)

    def setUp(self):
        # Rebuilt for every test because test_normalize modifies it in place
        self.freqs = np.array([(1.0, 1), (1.1, 1), (1.2, 2), (1.3, 7), (1.4, 12),
                               (1.5, 14), (1.6, 7), (1.7, 4), (1.9, 2), (3.0, 1),
                               (3.3, 2), (3.5, 2), (3.6, 1), (3.7, 1), (3.8, 1),
                               (3.9, 3), (4.0, 5), (4.1, 3), (4.2, 4), (4.3, 2),
                               (4.4, 4), (4.5, 8), (4.6, 3), (4.7, 5), (4.8, 4),
                               (4.9, 5), (5.0, 4), (5.1, 8), (5.2, 2), (5.3, 2),
                               (5.4, 2), (5.5, 3), (5.6, 6), (5.7, 3), (5.8, 3),
                               (5.9, 2), (6.0, 2), (6.1, 3), (6.3, 1), (6.4, 1),
                               (6.6, 1), (6.7, 2), (6.9, 1)]).T

    def test_from_table(self):
        d = self.iris
        petal_length = d.columns.petal_length

        # The variable may be given by name, by descriptor or by index
        for attr in ["petal length", d.domain[2], 2]:
            disc = distribution.Continuous(d, attr)
            self.assertIsInstance(disc, np.ndarray)
            self.assertIs(disc.variable, petal_length)
            self.assertEqual(disc.unknowns, 0)
            assert_dist_almost_equal(disc, self.freqs)

    def test_construction(self):
        d = self.iris
        petal_length = d.columns.petal_length

        disc = distribution.Continuous(d, "petal length")

        # From a two-row array, without a variable
        disc7 = distribution.Continuous(self.freqs)
        self.assertIsInstance(disc7, np.ndarray)
        self.assertIsNone(disc7.variable)
        self.assertEqual(disc7.unknowns, 0)
        self.assertEqual(disc, disc7)

        # From a two-row array and a variable
        disc7 = distribution.Continuous(self.freqs, petal_length)
        self.assertIsInstance(disc7, np.ndarray)
        self.assertIs(disc7.variable, petal_length)
        self.assertEqual(disc7.unknowns, 0)
        self.assertEqual(disc, disc7)

        # From an integer: an all-zero distribution with that many columns
        disc1 = distribution.Continuous(10, petal_length)
        self.assertIsInstance(disc1, np.ndarray)
        self.assertIs(disc1.variable, petal_length)
        self.assertEqual(disc1.unknowns, 0)
        assert_dist_equal(disc1, np.zeros((2, 10)))

        # From nested lists
        dd = [list(range(5)), [1, 1, 2, 5, 1]]
        disc2 = distribution.Continuous(dd)
        self.assertIsInstance(disc2, np.ndarray)
        self.assertIsNone(disc2.variable)
        self.assertEqual(disc2.unknowns, 0)
        assert_dist_equal(disc2, dd)

    def test_hash(self):
        d = self.iris
        petal_length = d.columns.petal_length

        disc = distribution.Continuous(d, "petal length")
        disc2 = distribution.Continuous(d, petal_length)
        self.assertEqual(hash(disc), hash(disc2))

        # The hash must follow changes of the array contents ...
        disc2[0, 0] += 1
        self.assertNotEqual(hash(disc), hash(disc2))

        disc2[0, 0] -= 1
        self.assertEqual(hash(disc), hash(disc2))

        # ... and of the number of unknowns
        disc2.unknowns += 1
        self.assertNotEqual(hash(disc), hash(disc2))

    def test_normalize(self):
        d = self.iris
        petal_length = d.columns.petal_length

        disc = distribution.Continuous(d, "petal length")

        assert_dist_equal(disc, self.freqs)
        disc.normalize()
        # Only the frequencies row is expected to be scaled; the expected
        # frequencies sum to 150
        self.freqs[1, :] /= 150
        assert_dist_equal(disc, self.freqs)

        # An all-zero distribution is expected to get uniform frequencies
        disc1 = distribution.Continuous(10, petal_length)
        disc1.normalize()
        f = np.zeros((2, 10))
        f[1, :] = 0.1
        assert_dist_equal(disc1, f)

    def test_modus(self):
        disc = distribution.Continuous([list(range(5)), [1, 1, 2, 5, 1]])
        self.assertEqual(disc.modus(), 3)

    def test_random(self):
        d = self.iris

        disc = distribution.Continuous(d, "petal length")
        ans = set()
        for _ in range(1000):
            v = disc.sample()
            # A sample must be one of the expected values (first row);
            # matching against the whole array would also accept numbers
            # that only appear among the frequencies
            self.assertIn(v, self.freqs[0])
            ans.add(v)
        self.assertGreater(len(ans), 10)

    def test_min_max(self):
        # min and max must be plain values, not Continuous instances
        self.assertEqual(self.n1.min(), 0)
        self.assertFalse(isinstance(self.n1.min(), distribution.Continuous))
        self.assertEqual(self.n1.max(), 9)
        self.assertFalse(isinstance(self.n1.max(), distribution.Continuous))
342
343
class TestClassDistribution(unittest.TestCase):
    """Tests for ``distribution.class_distribution``."""

    def test_class_distribution(self):
        zoo = data.Table("zoo")
        class_dist = distribution.class_distribution(zoo)

        self.assertIsInstance(class_dist, np.ndarray)
        self.assertIs(class_dist.variable, zoo.domain["type"])
        self.assertEqual(class_dist.unknowns, 0)
        assert_dist_equal(class_dist, [4.0, 20.0, 13.0, 8.0, 10.0, 41.0, 5.0])

    def test_multiple_target_variables(self):
        # Three class variables sharing the same four values
        targets = [
            data.DiscreteVariable(name, values=('r', 'g', 'b', 'a'))
            for name in ('c1', 'c2', 'c3')
        ]
        domain = data.Domain(
            attributes=[data.ContinuousVariable('n1')],
            class_vars=targets,
        )
        # NOTE(review): the class value 4 has no matching label in the
        # four-valued variables above -- confirm this is intended
        table = data.Table.from_numpy(
            domain,
            X=np.array([range(5)]).T,
            Y=np.array([[0, 1, 2, 3, 4]] * 3).T
        )

        dists = distribution.class_distribution(table)

        # One Discrete distribution is expected per class variable
        self.assertEqual(len(dists), 3)
        for dist in dists:
            self.assertIsInstance(dist, distribution.Discrete)
373
374
class TestGetDistribution(unittest.TestCase):
    """Tests ``distribution.get_distribution`` on the iris table."""

    def test_get_distribution(self):
        iris = data.Table("iris")

        # Discrete variable: the class
        class_var = iris.domain.class_var
        class_dist = distribution.get_distribution(iris, class_var)
        self.assertIsInstance(class_dist, np.ndarray)
        self.assertIs(class_dist.variable, class_var)
        self.assertEqual(class_dist.unknowns, 0)
        assert_dist_equal(class_dist, [50, 50, 50])

        # Continuous variable: petal length. The expected result is written
        # as (value, frequency) pairs and transposed into two rows.
        expected = np.array([(1.0, 1), (1.1, 1), (1.2, 2), (1.3, 7), (1.4, 12),
                             (1.5, 14), (1.6, 7), (1.7, 4), (1.9, 2), (3.0, 1),
                             (3.3, 2), (3.5, 2), (3.6, 1), (3.7, 1), (3.8, 1),
                             (3.9, 3), (4.0, 5), (4.1, 3), (4.2, 4), (4.3, 2),
                             (4.4, 4), (4.5, 8), (4.6, 3), (4.7, 5), (4.8, 4),
                             (4.9, 5), (5.0, 4), (5.1, 8), (5.2, 2), (5.3, 2),
                             (5.4, 2), (5.5, 3), (5.6, 6), (5.7, 3), (5.8, 3),
                             (5.9, 2), (6.0, 2), (6.1, 3), (6.3, 1), (6.4, 1),
                             (6.6, 1), (6.7, 2), (6.9, 1)]).T
        petal_dist = distribution.get_distribution(
            iris, iris.columns.petal_length)
        assert_dist_equal(petal_dist, expected)
397
398
class TestDomainDistribution(unittest.TestCase):
    """Tests for ``distribution.get_distributions`` and for the table's
    ``_compute_distributions`` on dense, sparse and meta data."""

    def test_get_distributions(self):
        d = data.Table("iris")
        ddist = distribution.get_distributions(d)

        # Four continuous attributes followed by the discrete class
        self.assertEqual(len(ddist), 5)
        for i in range(4):
            self.assertIsInstance(ddist[i], distribution.Continuous)
        self.assertIsInstance(ddist[-1], distribution.Discrete)

        freqs = np.array([(1.0, 1), (1.1, 1), (1.2, 2), (1.3, 7), (1.4, 12),
                          (1.5, 14), (1.6, 7), (1.7, 4), (1.9, 2), (3.0, 1),
                          (3.3, 2), (3.5, 2), (3.6, 1), (3.7, 1), (3.8, 1),
                          (3.9, 3), (4.0, 5), (4.1, 3), (4.2, 4), (4.3, 2),
                          (4.4, 4), (4.5, 8), (4.6, 3), (4.7, 5), (4.8, 4),
                          (4.9, 5), (5.0, 4), (5.1, 8), (5.2, 2), (5.3, 2),
                          (5.4, 2), (5.5, 3), (5.6, 6), (5.7, 3), (5.8, 3),
                          (5.9, 2), (6.0, 2), (6.1, 3), (6.3, 1), (6.4, 1),
                          (6.6, 1), (6.7, 2), (6.9, 1)]).T
        assert_dist_equal(ddist[2], freqs)
        assert_dist_equal(ddist[-1], [50, 50, 50])

    def test_sparse_get_distributions(self):
        def assert_dist_and_unknowns(computed, goal_dist):
            # Check the distribution itself and that its unknowns equal the
            # total number (or total weight) of rows of `d` minus the sum of
            # the expected frequencies. `d` is read from the enclosing test
            # at call time, so the helper follows later changes of weights.
            goal_dist = np.array(goal_dist)
            sum_dist = np.sum(goal_dist[1, :] if goal_dist.ndim == 2 else goal_dist)
            n_all = np.sum(d.W) if d.has_weights() else len(d)

            assert_dist_almost_equal(computed, goal_dist)
            self.assertEqual(computed.unknowns, n_all - sum_dist)

        domain = data.Domain(
            [data.DiscreteVariable("d%i" % i, values=tuple("abc")) for i in range(10)] +
            [data.ContinuousVariable("c%i" % i) for i in range(10)])

        # pylint: disable=bad-whitespace
        X = sp.csr_matrix(
            # 0  1  2  3       4       5       6  7  8  9 10 11 12   13 14 15 16      17 18 19
            # --------------------------------------------------------------------------------
            [[0, 2, 0, 2,      1,      1,      2, 0, 0, 1, 0, 0, 0,   1, 1, 0, 2, np.nan, 2, 0],
             [0, 0, 1, 1, np.nan, np.nan,      1, 0, 2, 0, 0, 0, 0,   0, 2, 0, 1, np.nan, 0, 0],
             [0, 0, 0, 1,      0,      2, np.nan, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0,      0, 0, 0],
             [0, 0, 0, 0,      0,      0,      0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0,      0, 0, 0],
             [0, 0, 2, 0,      0,      0,      1, 0, 0, 0, 0, 0, 0, 1.1, 0, 0, 0,      0, 0, 0]]
        )
        # Ignore SparseEfficiencyWarning for the rest of this test only.
        # The current warning filters are saved here and restored by the
        # cleanup, so the "ignore" filter does not leak into other tests.
        saved_filters = warnings.catch_warnings()
        saved_filters.__enter__()
        self.addCleanup(saved_filters.__exit__, None, None, None)
        warnings.filterwarnings("ignore", ".*", sp.SparseEfficiencyWarning)
        # Presumably stores an explicit zero in the sparse matrix -- the
        # expected counts below still treat this element as an ordinary 0
        X[0, 0] = 0

        d = data.Table.from_numpy(domain, X)
        ddist = distribution.get_distributions(d)

        # Unweighted data: discrete columns 0-9
        self.assertEqual(len(ddist), 20)
        zeros = [5, 0, 0]
        assert_dist_and_unknowns(ddist[0], zeros)
        assert_dist_and_unknowns(ddist[1], [4, 0, 1])
        assert_dist_and_unknowns(ddist[2], [3, 1, 1])
        assert_dist_and_unknowns(ddist[3], [2, 2, 1])
        assert_dist_and_unknowns(ddist[4], [3, 1, 0])
        assert_dist_and_unknowns(ddist[5], [2, 1, 1])
        assert_dist_and_unknowns(ddist[6], [1, 2, 1])
        assert_dist_and_unknowns(ddist[7], zeros)
        assert_dist_and_unknowns(ddist[8], [4, 0, 1])
        assert_dist_and_unknowns(ddist[9], [4, 1, 0])

        # Unweighted data: continuous columns 10-19
        zeros = [[0], [5]]
        assert_dist_and_unknowns(ddist[10], zeros)
        assert_dist_and_unknowns(ddist[11], zeros)
        assert_dist_and_unknowns(ddist[12], zeros)
        assert_dist_and_unknowns(ddist[13], [[0, 1, 1.1], [3, 1, 1]])
        assert_dist_and_unknowns(ddist[14], [[0, 1, 2], [3, 1, 1]])
        assert_dist_and_unknowns(ddist[15], zeros)
        assert_dist_and_unknowns(ddist[16], [[0, 1, 2], [3, 1, 1]])
        assert_dist_and_unknowns(ddist[17], [[0], [3]])
        assert_dist_and_unknowns(ddist[18], [[0, 2], [4, 1]])
        assert_dist_and_unknowns(ddist[19], zeros)

        # Repeat with row weights 1..5 (total weight 15)
        d.set_weights(np.array([1, 2, 3, 4, 5]))
        ddist = distribution.get_distributions(d)

        self.assertEqual(len(ddist), 20)
        assert_dist_and_unknowns(ddist[0], [15, 0, 0])
        assert_dist_and_unknowns(ddist[1], [14, 0, 1])
        assert_dist_and_unknowns(ddist[2], [8, 2, 5])
        assert_dist_and_unknowns(ddist[3], [9, 5, 1])
        assert_dist_and_unknowns(ddist[4], [12, 1, 0])
        assert_dist_and_unknowns(ddist[5], [9, 1, 3])
        assert_dist_and_unknowns(ddist[6], [4, 7, 1])
        assert_dist_and_unknowns(ddist[7], [15, 0, 0])
        assert_dist_and_unknowns(ddist[8], [13, 0, 2])
        assert_dist_and_unknowns(ddist[9], [14, 1, 0])

        zeros = [[0], [15]]
        assert_dist_and_unknowns(ddist[10], zeros)
        assert_dist_and_unknowns(ddist[11], zeros)
        assert_dist_and_unknowns(ddist[12], zeros)
        assert_dist_and_unknowns(ddist[13], [[0, 1, 1.1], [9, 1, 5]])
        assert_dist_and_unknowns(ddist[14], [[0, 1, 2], [12, 1, 2]])
        assert_dist_and_unknowns(ddist[15], zeros)
        assert_dist_and_unknowns(ddist[16], [[0, 1, 2], [12, 2, 1]])
        assert_dist_and_unknowns(ddist[17], [[0], [12]])
        assert_dist_and_unknowns(ddist[18], [[0, 2], [14, 1]])
        assert_dist_and_unknowns(ddist[19], zeros)

    def test_compute_distributions_metas(self):
        d = data.Table(test_filename("datasets/test9.tab"))
        variable = d.domain[-2]
        dist, _ = d._compute_distributions([variable])[0]
        assert_dist_equal(dist, [3, 3, 2])
        # repeat with nan values
        # Preconditions on the fixture, checked with unittest assertions so
        # that they are not skipped when Python runs with -O
        self.assertEqual(d.metas.dtype.kind, "O")
        self.assertEqual(d.metas[0, 1], 0)
        d.metas[0, 1] = np.nan
        dist, nanc = d._compute_distributions([variable])[0]
        assert_dist_equal(dist, [2, 3, 2])
        self.assertEqual(nanc, 1)
515
516
class TestContinuous(unittest.TestCase):
    """Summary statistics of a ``Continuous`` distribution built from an array.

    Every test compares a statistic of the distribution with the same
    statistic computed by numpy on the data written out sample by sample.
    """

    def setUp(self):
        # pylint: disable=bad-whitespace
        # First row: values; second row: how many times each value occurs
        histogram = np.array([[0, 5, 10],
                              [9, 0,  1]])
        self.dist = distribution.Continuous(histogram)
        # The same data as a flat list: nine zeros and a single ten
        self.samples = ([0] * 9) + [10]

    def test_mean(self):
        expected = np.mean(self.samples)
        self.assertEqual(self.dist.mean(), expected)

    def test_variance(self):
        expected = np.var(self.samples)
        self.assertEqual(self.dist.variance(), expected)

    def test_standard_deviation(self):
        expected = np.std(self.samples)
        self.assertEqual(self.dist.standard_deviation(), expected)
541
542
# Run the tests of this module when the file is executed as a script
if __name__ == "__main__":
    unittest.main()
545