1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3#
4#
5# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
6# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
7import logging
8import unittest
9import numpy as np
10from scipy import sparse
11from scipy.special import psi  # gamma function utils
12
13import gensim.matutils as matutils
14
15
# Known-good (but slow) reference versions of the functions are defined here;
# the tests below compare their results against the optimized (Cython) versions.
def logsumexp(x):
    """Reference log-sum-exp taken over every element of `x`.

    Parameters
    ----------
    x : numpy.ndarray
        Input 2d matrix.

    Returns
    -------
    float
        ``log(sum(exp(x)))`` computed over all elements of `x`.

    Notes
    -----
    The largest element is subtracted before exponentiating and added back
    after the logarithm, so no exponent passed to ``exp`` is greater than zero.

    """
    shift = np.max(x)
    return np.log(np.sum(np.exp(x - shift))) + shift
41
42
def mean_absolute_difference(a, b):
    """Reference implementation: average absolute element-wise gap between `a` and `b`.

    Parameters
    ----------
    a : numpy.ndarray
        Input 1d array.
    b : numpy.ndarray
        Input 1d array.

    Returns
    -------
    float
        mean(abs(a - b)).

    """
    gap = np.absolute(a - b)
    return np.mean(gap)
60
61
def dirichlet_expectation(alpha):
    r"""Reference computation of :math:`E[log \theta]` for :math:`\theta \sim Dir(\alpha)`.

    Parameters
    ----------
    alpha : numpy.ndarray
        Dirichlet parameter 2d matrix or 1d vector, if 2d - each row is treated as a separate parameter vector.

    Returns
    -------
    numpy.ndarray:
        :math:`E[log \theta]`, i.e. ``psi(alpha) - psi(sum(alpha))``, with the same dtype as `alpha`.

    """
    digamma_alpha = psi(alpha)
    if alpha.ndim == 1:
        digamma_total = psi(np.sum(alpha))
    else:
        # one total per row, turned into a column so that it broadcasts across that row
        digamma_total = psi(np.sum(alpha, 1))[:, np.newaxis]
    expectation = digamma_alpha - digamma_total
    # cast back (without copying when already matching) to keep the input's precision
    return expectation.astype(alpha.dtype, copy=False)
81
82
# Both shape-specific names are bound to the same reference function,
# which selects the 1d or 2d formula from the shape of its argument.
dirichlet_expectation_1d = dirichlet_expectation_2d = dirichlet_expectation
85
86
class TestLdaModelInner(unittest.TestCase):
    """Compare the functions exposed by ``matutils`` with the reference versions defined in this module.

    NOTE(review): every test loops over float16/float32/float64, but the loop variable is only used
    to build the failure message -- the arrays returned by ``RandomState.uniform`` are handed to both
    implementations without being cast to it, so the three passes exercise the same kind of input.
    TODO confirm whether the inputs were meant to be cast to `dtype`.
    """

    def setUp(self):
        # unseeded generator: every run of the suite draws different inputs
        self.random_state = np.random.RandomState()
        self.num_runs = 100  # number of random inputs tried per dtype
        self.num_topics = 100  # number of elements in every random input

    def _check_close(self, expected, actual, msg):
        # fail with `msg` unless both results agree within numpy.allclose's default tolerances
        self.assertTrue(np.allclose(expected, actual), msg)

    def test_log_sum_exp(self):
        # logsumexp: reference vs. matutils on random column vectors
        rng = self.random_state
        shape = (self.num_topics, 1)

        for dtype in [np.float16, np.float32, np.float64]:
            msg = "logsumexp failed for dtype={}".format(dtype)
            for _ in range(self.num_runs):
                sample = rng.uniform(-1000, 1000, size=shape)
                self._check_close(logsumexp(sample), matutils.logsumexp(sample), msg)

    def test_mean_absolute_difference(self):
        # mean_absolute_difference: reference vs. matutils on pairs of random vectors
        rng = self.random_state
        shape = (self.num_topics,)

        for dtype in [np.float16, np.float32, np.float64]:
            msg = "mean_absolute_difference failed for dtype={}".format(dtype)
            for _ in range(self.num_runs):
                first = rng.uniform(-10000, 10000, size=shape)
                second = rng.uniform(-10000, 10000, size=shape)
                self._check_close(
                    mean_absolute_difference(first, second),
                    matutils.mean_absolute_difference(first, second),
                    msg,
                )

    def test_dirichlet_expectation(self):
        # dirichlet_expectation: reference vs. matutils, first on a vector, then on a one-row matrix
        rng = self.random_state
        cases = (
            ((self.num_topics,), "dirichlet_expectation_1d failed for dtype={}"),
            ((1, self.num_topics,), "dirichlet_expectation_2d failed for dtype={}"),
        )

        for dtype in [np.float16, np.float32, np.float64]:
            for _ in range(self.num_runs):
                for shape, template in cases:
                    alpha = rng.uniform(.01, 10000, size=shape)
                    self._check_close(
                        dirichlet_expectation(alpha),
                        matutils.dirichlet_expectation(alpha),
                        template.format(dtype),
                    )
143
144
def manual_unitvec(vec):
    """Scale `vec` to unit Euclidean length by hand; reference result for UnitvecTestCase.

    Parameters
    ----------
    vec : numpy.ndarray or scipy sparse matrix
        Input values. A float copy is made first, so the argument itself is not modified.

    Returns
    -------
    numpy.ndarray or scipy sparse matrix
        Float copy of `vec` with every entry divided by the square root of the
        sum of squares of all entries (for a sparse matrix: of the whole matrix).

    Notes
    -----
    There is no guard for an all-zero input: the norm is then zero and the division
    produces non-finite values.

    """
    as_float = vec.astype(float)

    if not sparse.issparse(as_float):
        norm = np.sqrt(np.sum(as_float ** 2))
        as_float /= norm
        return as_float

    squared = as_float.multiply(as_float)  # element-wise squares
    inverse_norm = 1. / np.sqrt(squared.sum())
    return as_float.multiply(inverse_norm)
156
157
class UnitvecTestCase(unittest.TestCase):
    """Check ``matutils.unitvec`` against :func:`manual_unitvec` for several containers and dtypes.

    For every input the normalised values must match the manual computation. In addition,
    floating-point inputs must keep their dtype, while integer inputs must come back with
    some floating-point dtype. The ``return_norm`` tests check that an empty input yields
    a tuple whose second item is the float ``1.0``.
    """

    def _check_result_dtype(self, input_vector, unit_vector):
        # float input: dtype must be preserved; integer input: result must be floating point
        if np.issubdtype(input_vector.dtype, np.floating):
            self.assertEqual(input_vector.dtype, unit_vector.dtype)
        else:
            self.assertTrue(np.issubdtype(unit_vector.dtype, np.floating))

    def _check_sparse(self, dtype):
        # compare unitvec with the manual computation on a fixed 2x5 CSR matrix cast to `dtype`
        input_vector = sparse.csr_matrix(np.asarray([[1, 0, 0, 0, 3], [0, 0, 4, 3, 0]])).astype(dtype)
        unit_vector = matutils.unitvec(input_vector)
        man_unit_vector = manual_unitvec(input_vector)
        self.assertTrue(np.allclose(unit_vector.data, man_unit_vector.data, atol=1e-3))
        self._check_result_dtype(input_vector, unit_vector)

    def _check_dense(self, input_vector):
        # compare unitvec with the manual computation on the given dense vector
        unit_vector = matutils.unitvec(input_vector)
        man_unit_vector = manual_unitvec(input_vector)
        self.assertTrue(np.allclose(unit_vector, man_unit_vector))
        self._check_result_dtype(input_vector, unit_vector)

    def _check_zero_vector_norm(self, input_vector):
        # with return_norm=True an empty input must give back (<anything>, 1.0) with a float norm
        return_value = matutils.unitvec(input_vector, return_norm=True)
        self.assertIsInstance(return_value, tuple)
        norm = return_value[1]
        self.assertIsInstance(norm, float)
        self.assertEqual(norm, 1.0)

    @staticmethod
    def _random_int_vector(dtype):
        # Lower bound 1 keeps every entry non-zero: an all-zero draw would make
        # manual_unitvec divide by zero and the NaN comparison fail spuriously.
        return np.random.randint(1, 10, size=5).astype(dtype)

    @staticmethod
    def _random_float_vector(dtype):
        return np.random.uniform(size=(5,)).astype(dtype)

    def test_sparse_npfloat32(self):
        self._check_sparse(np.float32)

    def test_sparse_npfloat64(self):
        self._check_sparse(np.float64)

    def test_sparse_npint32(self):
        self._check_sparse(np.int32)

    def test_sparse_npint64(self):
        self._check_sparse(np.int64)

    def test_dense_npfloat32(self):
        self._check_dense(self._random_float_vector(np.float32))

    def test_dense_npfloat64(self):
        self._check_dense(self._random_float_vector(np.float64))

    def test_dense_npint32(self):
        self._check_dense(self._random_int_vector(np.int32))

    def test_dense_npint64(self):
        # previously cast to np.int32 by mistake, so the int64 path was never exercised
        self._check_dense(self._random_int_vector(np.int64))

    def test_sparse_python_float(self):
        self._check_sparse(float)

    def test_sparse_python_int(self):
        self._check_sparse(int)

    def test_dense_python_float(self):
        self._check_dense(self._random_float_vector(float))

    def test_dense_python_int(self):
        self._check_dense(self._random_int_vector(int))

    def test_return_norm_zero_vector_scipy_sparse(self):
        self._check_zero_vector_norm(sparse.csr_matrix([[]], dtype=np.int32))

    def test_return_norm_zero_vector_numpy(self):
        self._check_zero_vector_norm(np.array([], dtype=np.int32))

    def test_return_norm_zero_vector_gensim_sparse(self):
        self._check_zero_vector_norm([])
267
268
if __name__ == '__main__':
    # When executed as a script: log at DEBUG level with a timestamp and level prefix, then run the suite.
    log_format = '%(asctime)s : %(levelname)s : %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=log_format)
    unittest.main()
272