# Test methods with long descriptive names can omit docstrings
# Test internal methods
# pylint: disable=missing-docstring, protected-access
"""Unit tests for ``Orange.statistics.distribution``."""

import unittest
from unittest.mock import Mock
import warnings

import numpy as np
import scipy.sparse as sp

from Orange.statistics import distribution
from Orange import data
from Orange.tests import test_filename


def assert_dist_equal(dist, expected):
    """Assert that `dist`, viewed as a plain array, equals `expected`."""
    np.testing.assert_array_equal(np.asarray(dist), expected)


def assert_dist_almost_equal(dist, expected):
    """Assert that `dist`, viewed as a plain array, nearly equals `expected`."""
    np.testing.assert_almost_equal(np.asarray(dist), expected)


def _petal_length_freqs():
    """Return the expected distribution of iris "petal length".

    The result has two rows: the distinct values and their counts.  A new
    array is built on every call because some tests modify it in place.
    """
    return np.array([(1.0, 1), (1.1, 1), (1.2, 2), (1.3, 7), (1.4, 12),
                     (1.5, 14), (1.6, 7), (1.7, 4), (1.9, 2), (3.0, 1),
                     (3.3, 2), (3.5, 2), (3.6, 1), (3.7, 1), (3.8, 1),
                     (3.9, 3), (4.0, 5), (4.1, 3), (4.2, 4), (4.3, 2),
                     (4.4, 4), (4.5, 8), (4.6, 3), (4.7, 5), (4.8, 4),
                     (4.9, 5), (5.0, 4), (5.1, 8), (5.2, 2), (5.3, 2),
                     (5.4, 2), (5.5, 3), (5.6, 6), (5.7, 3), (5.8, 3),
                     (5.9, 2), (6.0, 2), (6.1, 3), (6.3, 1), (6.4, 1),
                     (6.6, 1), (6.7, 2), (6.9, 1)]).T


class TestDiscreteDistribution(unittest.TestCase):
    """Tests of `distribution.Discrete`."""

    def setUp(self):
        # Expected class ("type") frequencies of the zoo data set, and the
        # same frequencies scaled so that they sum to 1.
        self.freqs = [4.0, 20.0, 13.0, 8.0, 10.0, 41.0, 5.0]
        s = sum(self.freqs)
        self.rfreqs = [x/s for x in self.freqs]

        # A small hand-made table with two discrete columns and some
        # missing values.
        self.data = data.Table.from_numpy(
            data.Domain(
                attributes=[
                    data.DiscreteVariable('rgb', values=('r', 'g', 'b', 'a')),
                    data.DiscreteVariable('num', values=('1', '2', '3')),
                ]
            ),
            X=np.array([
                [0, 2, 0, 1, 1, 0, np.nan, 1],
                [0, 2, 0, np.nan, 1, 2, np.nan, 1],
            ]).T
        )
        self.rgb, self.num = distribution.get_distributions(self.data)

    def test_from_table(self):
        d = data.Table("zoo")
        disc = distribution.Discrete(d, "type")
        self.assertIsInstance(disc, np.ndarray)
        self.assertIs(disc.variable, d.domain["type"])
        self.assertEqual(disc.unknowns, 0)
        assert_dist_equal(disc, self.freqs)

        # The variable can also be given as a descriptor ...
        disc2 = distribution.Discrete(d, d.domain.class_var)
        self.assertIsInstance(disc2, np.ndarray)
        self.assertIs(disc2.variable, d.domain.class_var)
        self.assertEqual(disc, disc2)

        # ... or as a column index
        disc3 = distribution.Discrete(d, len(d.domain.attributes))
        self.assertIsInstance(disc3, np.ndarray)
        self.assertIs(disc3.variable, d.domain.class_var)
        self.assertEqual(disc, disc3)

        disc5 = distribution.class_distribution(d)
        self.assertIsInstance(disc5, np.ndarray)
        self.assertIs(disc5.variable, d.domain.class_var)
        self.assertEqual(disc, disc5)

    def test_construction(self):
        d = data.Table("zoo")

        disc = distribution.Discrete(d, "type")
        self.assertIsInstance(disc, np.ndarray)
        self.assertIs(disc.variable, d.domain["type"])
        self.assertEqual(disc.unknowns, 0)
        self.assertIs(disc.variable, d.domain.class_var)

        # Construction from a plain list of frequencies
        disc7 = distribution.Discrete(self.freqs)
        self.assertIsInstance(disc7, np.ndarray)
        self.assertIsNone(disc7.variable)
        self.assertEqual(disc7.unknowns, 0)
        self.assertEqual(disc, disc7)

        # Construction from a variable alone gives an all-zero distribution
        disc1 = distribution.Discrete(None, d.domain.class_var)
        self.assertIsInstance(disc1, np.ndarray)
        self.assertIs(disc1.variable, d.domain.class_var)
        self.assertEqual(disc1.unknowns, 0)
        assert_dist_equal(disc1, [0]*len(d.domain.class_var.values))

    def test_fallback(self):
        d = data.Table("zoo")
        default = distribution.Discrete(d, "type")

        # Make the table's own computation unavailable; the result must
        # match the one computed before the method was replaced.
        d._compute_distributions = Mock(side_effect=NotImplementedError)
        fallback = distribution.Discrete(d, "type")

        np.testing.assert_almost_equal(
            np.asarray(fallback), np.asarray(default))
        np.testing.assert_almost_equal(fallback.unknowns, default.unknowns)

    def test_fallback_with_weights_and_nan(self):
        d = data.Table("zoo")
        d.set_weights(np.random.uniform(0., 1., size=len(d)))
        d.Y[::10] = np.nan

        default = distribution.Discrete(d, "type")
        d._compute_distributions = Mock(side_effect=NotImplementedError)
        fallback = distribution.Discrete(d, "type")

        np.testing.assert_almost_equal(
            np.asarray(fallback), np.asarray(default))
        np.testing.assert_almost_equal(fallback.unknowns, default.unknowns)

    def test_equality(self):
        d = data.Table("zoo")
        d1 = distribution.Discrete(d, 0)
        d2 = distribution.Discrete(d, 0)
        d3 = distribution.Discrete(d, 1)

        self.assertEqual(d1, d1)
        self.assertEqual(d1, d2)
        self.assertNotEqual(d1, d3)

    def test_indexing(self):
        d = data.Table("zoo")
        indamphibian = d.domain.class_var.to_val("amphibian")

        disc = distribution.class_distribution(d)

        self.assertEqual(len(disc), len(d.domain.class_var.values))

        # Elements can be addressed by value name or by index
        self.assertEqual(disc["mammal"], 41)
        self.assertEqual(disc[indamphibian], 4)

        disc["mammal"] = 100
        self.assertEqual(disc[d.domain.class_var.to_val("mammal")], 100)

        disc[indamphibian] = 33
        self.assertEqual(disc["amphibian"], 33)

        disc = distribution.class_distribution(d)
        self.assertEqual(list(disc), self.freqs)

    def test_hash(self):
        d = data.Table("zoo")
        disc = distribution.Discrete(d, "type")

        disc2 = distribution.Discrete(d, d.domain.class_var)
        self.assertEqual(hash(disc), hash(disc2))

        disc2[0] += 1
        self.assertNotEqual(hash(disc), hash(disc2))

        disc2[0] -= 1
        self.assertEqual(hash(disc), hash(disc2))

        disc2.unknowns += 1
        self.assertNotEqual(hash(disc), hash(disc2))

    def test_add(self):
        d = data.Table("zoo")
        disc = distribution.Discrete(d, "type")

        disc += [1, 2, 3, 4, 5, 6, 7]
        self.assertEqual(disc, [5.0, 22.0, 16.0, 12.0, 15.0, 47.0, 12.0])

        disc2 = distribution.Discrete(d, d.domain.class_var)

        disc3 = disc - disc2
        self.assertEqual(disc3, list(range(1, 8)))

        disc3 *= 2
        self.assertEqual(disc3, [2*x for x in range(1, 8)])

    def test_normalize(self):
        d = data.Table("zoo")
        disc = distribution.Discrete(d, "type")
        disc.normalize()
        self.assertEqual(disc, self.rfreqs)
        # Normalizing twice must not change the result
        disc.normalize()
        self.assertEqual(disc, self.rfreqs)

        # An all-zero distribution is expected to normalize to uniform
        disc1 = distribution.Discrete(None, d.domain.class_var)
        disc1.normalize()
        v = len(d.domain.class_var.values)
        assert_dist_almost_equal(disc1, [1/v]*v)

    def test_modus(self):
        d = data.Table("zoo")
        disc = distribution.Discrete(d, "type")
        self.assertEqual(str(disc.modus()), "mammal")

    def test_sample(self):
        ans = self.num.sample((500, 2), replace=True)
        np.testing.assert_equal(np.unique(ans), [0, 1, 2])

        # Check that sampling a single value works too
        self.assertIn(self.num.sample(), [0, 1, 2])

    def test_min_max(self):
        # Min and max don't make sense in the context of nominal variables
        self.assertEqual(self.rgb.min(), None)
        self.assertEqual(self.rgb.max(), None)
        # 'num' has number-like values, but it is declared as a plain
        # DiscreteVariable, so None is expected here as well
        self.assertEqual(self.num.min(), None)
        self.assertEqual(self.num.max(), None)

    def test_array_with_unknowns(self):
        d = data.Table("zoo")
        d.Y[0] = np.nan
        disc = distribution.Discrete(d, "type")
        self.assertIsInstance(disc, np.ndarray)
        self.assertEqual(disc.unknowns, 1)
        true_freq = [4., 20., 13., 8., 10., 40., 5.]
        assert_dist_equal(disc, true_freq)
        # The count of unknowns is appended as the last element
        np.testing.assert_array_equal(disc.array_with_unknowns,
                                      np.append(true_freq, 1))


class TestContinuousDistribution(unittest.TestCase):
    """Tests of `distribution.Continuous`."""

    @classmethod
    def setUpClass(cls):
        cls.iris = data.Table("iris")

        # A small hand-made table with two continuous columns, the second
        # of which contains a missing value.
        cls.data = data.Table.from_numpy(
            data.Domain(
                attributes=[
                    data.ContinuousVariable('n1'),
                    data.ContinuousVariable('n2'),
                ]
            ),
            X=np.array([range(10), [1, 1, 1, 5, 5, 8, 9, np.nan, 9, 9]]).T
        )
        cls.n1, cls.n2 = distribution.get_distributions(cls.data)

    def setUp(self):
        # Rebuilt for every test because test_normalize changes it in place
        self.freqs = _petal_length_freqs()

    def test_from_table(self):
        d = self.iris
        petal_length = d.columns.petal_length

        # The variable may be given by name, descriptor or index
        for attr in ["petal length", d.domain[2], 2]:
            disc = distribution.Continuous(d, attr)
            self.assertIsInstance(disc, np.ndarray)
            self.assertIs(disc.variable, petal_length)
            self.assertEqual(disc.unknowns, 0)
            assert_dist_almost_equal(disc, self.freqs)

    def test_construction(self):
        d = self.iris
        petal_length = d.columns.petal_length

        disc = distribution.Continuous(d, "petal length")

        # Construction from an array of values and frequencies
        disc7 = distribution.Continuous(self.freqs)
        self.assertIsInstance(disc7, np.ndarray)
        self.assertIsNone(disc7.variable)
        self.assertEqual(disc7.unknowns, 0)
        self.assertEqual(disc, disc7)

        # The same, with the variable given explicitly
        disc7 = distribution.Continuous(self.freqs, petal_length)
        self.assertIsInstance(disc7, np.ndarray)
        self.assertIs(disc7.variable, petal_length)
        self.assertEqual(disc7.unknowns, 0)
        self.assertEqual(disc, disc7)

        # Construction from an integer gives an all-zero 2 x n array
        disc1 = distribution.Continuous(10, petal_length)
        self.assertIsInstance(disc1, np.ndarray)
        self.assertIs(disc1.variable, petal_length)
        self.assertEqual(disc1.unknowns, 0)
        assert_dist_equal(disc1, np.zeros((2, 10)))

        dd = [list(range(5)), [1, 1, 2, 5, 1]]
        disc2 = distribution.Continuous(dd)
        self.assertIsInstance(disc2, np.ndarray)
        self.assertIsNone(disc2.variable)
        self.assertEqual(disc2.unknowns, 0)
        assert_dist_equal(disc2, dd)

    def test_hash(self):
        d = self.iris
        petal_length = d.columns.petal_length

        disc = distribution.Continuous(d, "petal length")
        disc2 = distribution.Continuous(d, petal_length)
        self.assertEqual(hash(disc), hash(disc2))

        disc2[0, 0] += 1
        self.assertNotEqual(hash(disc), hash(disc2))

        disc2[0, 0] -= 1
        self.assertEqual(hash(disc), hash(disc2))

        disc2.unknowns += 1
        self.assertNotEqual(hash(disc), hash(disc2))

    def test_normalize(self):
        d = self.iris
        petal_length = d.columns.petal_length

        disc = distribution.Continuous(d, "petal length")

        assert_dist_equal(disc, self.freqs)
        disc.normalize()
        # The counts in self.freqs sum to 150
        self.freqs[1, :] /= 150
        assert_dist_equal(disc, self.freqs)

        # An all-zero distribution is expected to normalize to uniform
        disc1 = distribution.Continuous(10, petal_length)
        disc1.normalize()
        f = np.zeros((2, 10))
        f[1, :] = 0.1
        assert_dist_equal(disc1, f)

    def test_modus(self):
        disc = distribution.Continuous([list(range(5)), [1, 1, 2, 5, 1]])
        self.assertEqual(disc.modus(), 3)

    def test_random(self):
        d = self.iris

        disc = distribution.Continuous(d, "petal length")
        ans = set()
        for _ in range(1000):
            v = disc.sample()
            self.assertIn(v, self.freqs)
            ans.add(v)
        # Sampling should produce a variety of values, not just a few
        self.assertGreater(len(ans), 10)

    def test_min_max(self):
        # min and max must be plain values, not Continuous instances
        self.assertEqual(self.n1.min(), 0)
        self.assertFalse(isinstance(self.n1.min(), distribution.Continuous))
        self.assertEqual(self.n1.max(), 9)
        self.assertFalse(isinstance(self.n1.max(), distribution.Continuous))


class TestClassDistribution(unittest.TestCase):
    """Tests of `distribution.class_distribution`."""

    def test_class_distribution(self):
        d = data.Table("zoo")
        disc = distribution.class_distribution(d)
        self.assertIsInstance(disc, np.ndarray)
        self.assertIs(disc.variable, d.domain["type"])
        self.assertEqual(disc.unknowns, 0)
        assert_dist_equal(disc, [4.0, 20.0, 13.0, 8.0, 10.0, 41.0, 5.0])

    def test_multiple_target_variables(self):
        d = data.Table.from_numpy(
            data.Domain(
                attributes=[data.ContinuousVariable('n1')],
                class_vars=[
                    data.DiscreteVariable('c1', values=('r', 'g', 'b', 'a')),
                    data.DiscreteVariable('c2', values=('r', 'g', 'b', 'a')),
                    data.DiscreteVariable('c3', values=('r', 'g', 'b', 'a')),
                ]
            ),
            X=np.array([range(5)]).T,
            Y=np.array([
                [0, 1, 2, 3, 4],
                [0, 1, 2, 3, 4],
                [0, 1, 2, 3, 4],
            ]).T
        )
        # One distribution per class variable is expected
        dists = distribution.class_distribution(d)
        self.assertEqual(len(dists), 3)
        self.assertTrue(all(isinstance(dist, distribution.Discrete)
                            for dist in dists))


class TestGetDistribution(unittest.TestCase):
    """Tests of `distribution.get_distribution`."""

    def test_get_distribution(self):
        d = data.Table("iris")
        cls = d.domain.class_var
        disc = distribution.get_distribution(d, cls)
        self.assertIsInstance(disc, np.ndarray)
        self.assertIs(disc.variable, cls)
        self.assertEqual(disc.unknowns, 0)
        assert_dist_equal(disc, [50, 50, 50])

        petal_length = d.columns.petal_length
        freqs = _petal_length_freqs()
        disc = distribution.get_distribution(d, petal_length)
        assert_dist_equal(disc, freqs)


class TestDomainDistribution(unittest.TestCase):
    """Tests of `distribution.get_distributions` and related table methods."""

    def test_get_distributions(self):
        d = data.Table("iris")
        ddist = distribution.get_distributions(d)

        # Four continuous attributes followed by the discrete class
        self.assertEqual(len(ddist), 5)
        for i in range(4):
            self.assertIsInstance(ddist[i], distribution.Continuous)
        self.assertIsInstance(ddist[-1], distribution.Discrete)

        freqs = _petal_length_freqs()
        assert_dist_equal(ddist[2], freqs)
        assert_dist_equal(ddist[-1], [50, 50, 50])

    def test_sparse_get_distributions(self):
        def assert_dist_and_unknowns(computed, goal_dist):
            # `d` is read from the enclosing scope at call time; the number
            # of unknowns must equal the total (weighted) number of rows
            # minus the sum of the expected frequencies.
            goal_dist = np.array(goal_dist)
            sum_dist = np.sum(
                goal_dist[1, :] if goal_dist.ndim == 2 else goal_dist)
            n_all = np.sum(d.W) if d.has_weights() else len(d)

            assert_dist_almost_equal(computed, goal_dist)
            self.assertEqual(computed.unknowns, n_all - sum_dist)

        domain = data.Domain(
            [data.DiscreteVariable("d%i" % i, values=tuple("abc"))
             for i in range(10)] +
            [data.ContinuousVariable("c%i" % i) for i in range(10)])

        # pylint: disable=bad-whitespace
        X = sp.csr_matrix(
            # 0  1  2  3       4       5       6  7  8  9 10 11 12   13 14 15 16      17 18 19
            # --------------------------------------------------------------------------------
            [[0, 2, 0, 2,      1,      1,      2, 0, 0, 1, 0, 0, 0,   1, 1, 0, 2, np.nan, 2, 0],
             [0, 0, 1, 1, np.nan, np.nan,      1, 0, 2, 0, 0, 0, 0,   0, 2, 0, 1, np.nan, 0, 0],
             [0, 0, 0, 1,      0,      2, np.nan, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0,      0, 0, 0],
             [0, 0, 0, 0,      0,      0,      0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0,      0, 0, 0],
             [0, 0, 2, 0,      0,      0,      1, 0, 0, 0, 0, 0, 0, 1.1, 0, 0, 0,      0, 0, 0]]
        )

        # Silence SparseEfficiencyWarning for the rest of this test only;
        # catch_warnings restores the global filters on exit, so the
        # suppression does not leak into other tests.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", ".*", sp.SparseEfficiencyWarning)
            # Presumably stores an explicit zero in the sparse matrix; this
            # assignment is what the filter above is meant for.
            X[0, 0] = 0

            d = data.Table.from_numpy(domain, X)
            ddist = distribution.get_distributions(d)

            self.assertEqual(len(ddist), 20)
            zeros = [5, 0, 0]
            assert_dist_and_unknowns(ddist[0], zeros)
            assert_dist_and_unknowns(ddist[1], [4, 0, 1])
            assert_dist_and_unknowns(ddist[2], [3, 1, 1])
            assert_dist_and_unknowns(ddist[3], [2, 2, 1])
            assert_dist_and_unknowns(ddist[4], [3, 1, 0])
            assert_dist_and_unknowns(ddist[5], [2, 1, 1])
            assert_dist_and_unknowns(ddist[6], [1, 2, 1])
            assert_dist_and_unknowns(ddist[7], zeros)
            assert_dist_and_unknowns(ddist[8], [4, 0, 1])
            assert_dist_and_unknowns(ddist[9], [4, 1, 0])

            zeros = [[0], [5]]
            assert_dist_and_unknowns(ddist[10], zeros)
            assert_dist_and_unknowns(ddist[11], zeros)
            assert_dist_and_unknowns(ddist[12], zeros)
            assert_dist_and_unknowns(ddist[13], [[0, 1, 1.1], [3, 1, 1]])
            assert_dist_and_unknowns(ddist[14], [[0, 1, 2], [3, 1, 1]])
            assert_dist_and_unknowns(ddist[15], zeros)
            assert_dist_and_unknowns(ddist[16], [[0, 1, 2], [3, 1, 1]])
            assert_dist_and_unknowns(ddist[17], [[0], [3]])
            assert_dist_and_unknowns(ddist[18], [[0, 2], [4, 1]])
            assert_dist_and_unknowns(ddist[19], zeros)

            # Repeat with row weights 1..5
            d.set_weights(np.array([1, 2, 3, 4, 5]))
            ddist = distribution.get_distributions(d)

            self.assertEqual(len(ddist), 20)
            assert_dist_and_unknowns(ddist[0], [15, 0, 0])
            assert_dist_and_unknowns(ddist[1], [14, 0, 1])
            assert_dist_and_unknowns(ddist[2], [8, 2, 5])
            assert_dist_and_unknowns(ddist[3], [9, 5, 1])
            assert_dist_and_unknowns(ddist[4], [12, 1, 0])
            assert_dist_and_unknowns(ddist[5], [9, 1, 3])
            assert_dist_and_unknowns(ddist[6], [4, 7, 1])
            assert_dist_and_unknowns(ddist[7], [15, 0, 0])
            assert_dist_and_unknowns(ddist[8], [13, 0, 2])
            assert_dist_and_unknowns(ddist[9], [14, 1, 0])

            zeros = [[0], [15]]
            assert_dist_and_unknowns(ddist[10], zeros)
            assert_dist_and_unknowns(ddist[11], zeros)
            assert_dist_and_unknowns(ddist[12], zeros)
            assert_dist_and_unknowns(ddist[13], [[0, 1, 1.1], [9, 1, 5]])
            assert_dist_and_unknowns(ddist[14], [[0, 1, 2], [12, 1, 2]])
            assert_dist_and_unknowns(ddist[15], zeros)
            assert_dist_and_unknowns(ddist[16], [[0, 1, 2], [12, 2, 1]])
            assert_dist_and_unknowns(ddist[17], [[0], [12]])
            assert_dist_and_unknowns(ddist[18], [[0, 2], [14, 1]])
            assert_dist_and_unknowns(ddist[19], zeros)

    def test_compute_distributions_metas(self):
        d = data.Table(test_filename("datasets/test9.tab"))
        variable = d.domain[-2]
        dist, _ = d._compute_distributions([variable])[0]
        assert_dist_equal(dist, [3, 3, 2])
        # repeat with nan values; first verify the preconditions the second
        # half of the test relies on
        self.assertEqual(d.metas.dtype.kind, "O")
        self.assertEqual(d.metas[0, 1], 0)
        d.metas[0, 1] = np.nan
        dist, nanc = d._compute_distributions([variable])[0]
        assert_dist_equal(dist, [2, 3, 2])
        self.assertEqual(nanc, 1)


class TestContinuous(unittest.TestCase):
    """Tests of summary statistics of `distribution.Continuous`."""

    def test_mean(self):
        # pylint: disable=bad-whitespace
        # First row: values; second row: how many times each value occurs
        x = np.array([[0, 5, 10],
                      [9, 0,  1]])
        dist = distribution.Continuous(x)

        self.assertEqual(dist.mean(), np.mean(([0] * 9) + [10]))

    def test_variance(self):
        # pylint: disable=bad-whitespace
        x = np.array([[0, 5, 10],
                      [9, 0,  1]])
        dist = distribution.Continuous(x)

        self.assertEqual(dist.variance(), np.var(([0] * 9) + [10]))

    def test_standard_deviation(self):
        # pylint: disable=bad-whitespace
        x = np.array([[0, 5, 10],
                      [9, 0,  1]])
        dist = distribution.Continuous(x)

        self.assertEqual(dist.standard_deviation(), np.std(([0] * 9) + [10]))


if __name__ == "__main__":
    unittest.main()