#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
#
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""Tests for gensim.matutils: compare the fast (Cython) implementations against
known-good, slow reference implementations defined in this module."""

import logging
import unittest

import numpy as np
from scipy import sparse
from scipy.special import psi  # gamma function utils

import gensim.matutils as matutils


# we'll define known, good (slow) version of functions here
# and compare results from these functions vs. cython ones
def logsumexp(x):
    """Log of sum of exponentials.

    Parameters
    ----------
    x : numpy.ndarray
        Input 2d matrix.

    Returns
    -------
    float
        log of sum of exponentials of elements in `x`.

    Warnings
    --------
    By performance reasons, doesn't support NaNs or 1d, 3d, etc arrays like :func:`scipy.special.logsumexp`.

    """
    # Subtract the max before exponentiating for numerical stability,
    # then add it back after the log.
    x_max = np.max(x)
    x = np.log(np.sum(np.exp(x - x_max)))
    x += x_max

    return x


def mean_absolute_difference(a, b):
    """Mean absolute difference between two arrays.

    Parameters
    ----------
    a : numpy.ndarray
        Input 1d array.
    b : numpy.ndarray
        Input 1d array.

    Returns
    -------
    float
        mean(abs(a - b)).

    """
    return np.mean(np.abs(a - b))


def dirichlet_expectation(alpha):
    r"""For a vector :math:`\theta \sim Dir(\alpha)`, compute :math:`E[log \theta]`.

    Parameters
    ----------
    alpha : numpy.ndarray
        Dirichlet parameter 2d matrix or 1d vector, if 2d - each row is treated as a separate parameter vector.

    Returns
    -------
    numpy.ndarray:
        :math:`E[log \theta]`

    """
    if len(alpha.shape) == 1:
        result = psi(alpha) - psi(np.sum(alpha))
    else:
        # Row-wise: each row is an independent Dirichlet parameter vector.
        result = psi(alpha) - psi(np.sum(alpha, 1))[:, np.newaxis]
    return result.astype(alpha.dtype, copy=False)  # keep the same precision as input


# Reference implementation handles both shapes, so both aliases point at it.
dirichlet_expectation_1d = dirichlet_expectation
dirichlet_expectation_2d = dirichlet_expectation


class TestLdaModelInner(unittest.TestCase):
    """Compare matutils' optimized LDA inner-loop helpers against the slow references above."""

    def setUp(self):
        self.random_state = np.random.RandomState()
        self.num_runs = 100  # test functions with *num_runs* random inputs
        self.num_topics = 100

    def test_log_sum_exp(self):
        # test logsumexp
        rs = self.random_state

        # NOTE(review): `dtype` is never applied to the generated input, so each
        # dtype iteration runs the same float64 check — confirm whether inputs
        # should be cast with .astype(dtype) (would need looser tolerances for float16).
        for dtype in [np.float16, np.float32, np.float64]:
            for i in range(self.num_runs):
                data = rs.uniform(-1000, 1000, size=(self.num_topics, 1))

                known_good = logsumexp(data)
                test_values = matutils.logsumexp(data)

                msg = "logsumexp failed for dtype={}".format(dtype)
                self.assertTrue(np.allclose(known_good, test_values), msg)

    def test_mean_absolute_difference(self):
        # test mean_absolute_difference
        rs = self.random_state

        # NOTE(review): `dtype` is never applied to the inputs — see test_log_sum_exp.
        for dtype in [np.float16, np.float32, np.float64]:
            for i in range(self.num_runs):
                first = rs.uniform(-10000, 10000, size=(self.num_topics,))
                second = rs.uniform(-10000, 10000, size=(self.num_topics,))

                known_good = mean_absolute_difference(first, second)
                test_values = matutils.mean_absolute_difference(first, second)

                msg = "mean_absolute_difference failed for dtype={}".format(dtype)
                self.assertTrue(np.allclose(known_good, test_values), msg)

    def test_dirichlet_expectation(self):
        # test dirichlet_expectation
        rs = self.random_state

        # NOTE(review): `dtype` is never applied to the inputs — see test_log_sum_exp.
        for dtype in [np.float16, np.float32, np.float64]:
            for i in range(self.num_runs):
                # 1 dimensional case
                input_1d = rs.uniform(.01, 10000, size=(self.num_topics,))
                known_good = dirichlet_expectation(input_1d)
                test_values = matutils.dirichlet_expectation(input_1d)

                msg = "dirichlet_expectation_1d failed for dtype={}".format(dtype)
                self.assertTrue(np.allclose(known_good, test_values), msg)

                # 2 dimensional case
                input_2d = rs.uniform(.01, 10000, size=(1, self.num_topics,))
                known_good = dirichlet_expectation(input_2d)
                test_values = matutils.dirichlet_expectation(input_2d)

                msg = "dirichlet_expectation_2d failed for dtype={}".format(dtype)
                self.assertTrue(np.allclose(known_good, test_values), msg)


def manual_unitvec(vec):
    """Slow, known-good L2-normalization of `vec` (dense or scipy-sparse) for UnitvecTestCase.

    Note: does not handle the all-zero vector (division by zero) — the tests
    below never pass one in.
    """
    vec = vec.astype(float)
    if sparse.issparse(vec):
        vec_sum_of_squares = vec.multiply(vec)
        unit = 1. / np.sqrt(vec_sum_of_squares.sum())
        return vec.multiply(unit)
    else:
        sum_vec_squared = np.sum(vec ** 2)
        vec /= np.sqrt(sum_vec_squared)
        return vec


class UnitvecTestCase(unittest.TestCase):
    """Test matutils.unitvec against manual_unitvec for dense/sparse inputs and assorted dtypes.

    Float inputs must keep their dtype; integer inputs must come back as some floating dtype.
    """

    def test_sparse_npfloat32(self):
        input_vector = sparse.csr_matrix(np.asarray([[1, 0, 0, 0, 3], [0, 0, 4, 3, 0]])).astype(np.float32)
        unit_vector = matutils.unitvec(input_vector)
        man_unit_vector = manual_unitvec(input_vector)
        self.assertTrue(np.allclose(unit_vector.data, man_unit_vector.data, atol=1e-3))
        self.assertEqual(input_vector.dtype, unit_vector.dtype)

    def test_sparse_npfloat64(self):
        input_vector = sparse.csr_matrix(np.asarray([[1, 0, 0, 0, 3], [0, 0, 4, 3, 0]])).astype(np.float64)
        unit_vector = matutils.unitvec(input_vector)
        man_unit_vector = manual_unitvec(input_vector)
        self.assertTrue(np.allclose(unit_vector.data, man_unit_vector.data, atol=1e-3))
        self.assertEqual(input_vector.dtype, unit_vector.dtype)

    def test_sparse_npint32(self):
        input_vector = sparse.csr_matrix(np.asarray([[1, 0, 0, 0, 3], [0, 0, 4, 3, 0]])).astype(np.int32)
        unit_vector = matutils.unitvec(input_vector)
        man_unit_vector = manual_unitvec(input_vector)
        self.assertTrue(np.allclose(unit_vector.data, man_unit_vector.data, atol=1e-3))
        self.assertTrue(np.issubdtype(unit_vector.dtype, np.floating))

    def test_sparse_npint64(self):
        input_vector = sparse.csr_matrix(np.asarray([[1, 0, 0, 0, 3], [0, 0, 4, 3, 0]])).astype(np.int64)
        unit_vector = matutils.unitvec(input_vector)
        man_unit_vector = manual_unitvec(input_vector)
        self.assertTrue(np.allclose(unit_vector.data, man_unit_vector.data, atol=1e-3))
        self.assertTrue(np.issubdtype(unit_vector.dtype, np.floating))

    def test_dense_npfloat32(self):
        input_vector = np.random.uniform(size=(5,)).astype(np.float32)
        unit_vector = matutils.unitvec(input_vector)
        man_unit_vector = manual_unitvec(input_vector)
        self.assertTrue(np.allclose(unit_vector, man_unit_vector))
        self.assertEqual(input_vector.dtype, unit_vector.dtype)

    def test_dense_npfloat64(self):
        input_vector = np.random.uniform(size=(5,)).astype(np.float64)
        unit_vector = matutils.unitvec(input_vector)
        man_unit_vector = manual_unitvec(input_vector)
        self.assertTrue(np.allclose(unit_vector, man_unit_vector))
        self.assertEqual(input_vector.dtype, unit_vector.dtype)

    def test_dense_npint32(self):
        input_vector = np.random.randint(10, size=5).astype(np.int32)
        unit_vector = matutils.unitvec(input_vector)
        man_unit_vector = manual_unitvec(input_vector)
        self.assertTrue(np.allclose(unit_vector, man_unit_vector))
        self.assertTrue(np.issubdtype(unit_vector.dtype, np.floating))

    def test_dense_npint64(self):
        # Fixed: previously cast to np.int32, so the int64 code path was never exercised.
        input_vector = np.random.randint(10, size=5).astype(np.int64)
        unit_vector = matutils.unitvec(input_vector)
        man_unit_vector = manual_unitvec(input_vector)
        self.assertTrue(np.allclose(unit_vector, man_unit_vector))
        self.assertTrue(np.issubdtype(unit_vector.dtype, np.floating))

    def test_sparse_python_float(self):
        input_vector = sparse.csr_matrix(np.asarray([[1, 0, 0, 0, 3], [0, 0, 4, 3, 0]])).astype(float)
        unit_vector = matutils.unitvec(input_vector)
        man_unit_vector = manual_unitvec(input_vector)
        self.assertTrue(np.allclose(unit_vector.data, man_unit_vector.data, atol=1e-3))
        self.assertEqual(input_vector.dtype, unit_vector.dtype)

    def test_sparse_python_int(self):
        input_vector = sparse.csr_matrix(np.asarray([[1, 0, 0, 0, 3], [0, 0, 4, 3, 0]])).astype(int)
        unit_vector = matutils.unitvec(input_vector)
        man_unit_vector = manual_unitvec(input_vector)
        self.assertTrue(np.allclose(unit_vector.data, man_unit_vector.data, atol=1e-3))
        self.assertTrue(np.issubdtype(unit_vector.dtype, np.floating))

    def test_dense_python_float(self):
        input_vector = np.random.uniform(size=(5,)).astype(float)
        unit_vector = matutils.unitvec(input_vector)
        man_unit_vector = manual_unitvec(input_vector)
        self.assertTrue(np.allclose(unit_vector, man_unit_vector))
        self.assertEqual(input_vector.dtype, unit_vector.dtype)

    def test_dense_python_int(self):
        input_vector = np.random.randint(10, size=5).astype(int)
        unit_vector = matutils.unitvec(input_vector)
        man_unit_vector = manual_unitvec(input_vector)
        self.assertTrue(np.allclose(unit_vector, man_unit_vector))
        self.assertTrue(np.issubdtype(unit_vector.dtype, np.floating))

    def test_return_norm_zero_vector_scipy_sparse(self):
        # Zero/empty vectors must come back unscaled with a reported norm of 1.0.
        input_vector = sparse.csr_matrix([[]], dtype=np.int32)
        return_value = matutils.unitvec(input_vector, return_norm=True)
        self.assertTrue(isinstance(return_value, tuple))
        norm = return_value[1]
        self.assertTrue(isinstance(norm, float))
        self.assertEqual(norm, 1.0)

    def test_return_norm_zero_vector_numpy(self):
        input_vector = np.array([], dtype=np.int32)
        return_value = matutils.unitvec(input_vector, return_norm=True)
        self.assertTrue(isinstance(return_value, tuple))
        norm = return_value[1]
        self.assertTrue(isinstance(norm, float))
        self.assertEqual(norm, 1.0)

    def test_return_norm_zero_vector_gensim_sparse(self):
        # Gensim's own sparse format: a (possibly empty) list of (id, value) pairs.
        input_vector = []
        return_value = matutils.unitvec(input_vector, return_norm=True)
        self.assertTrue(isinstance(return_value, tuple))
        norm = return_value[1]
        self.assertTrue(isinstance(norm, float))
        self.assertEqual(norm, 1.0)


if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
    unittest.main()