1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3#
4# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
5# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
6
7"""
8Automated tests for checking transformation algorithms (the models package).
9"""
10
11
12import logging
13import unittest
14
15import numpy as np
16import scipy.linalg
17
18from gensim import matutils
19from gensim.corpora.mmcorpus import MmCorpus
20from gensim.models import lsimodel
21from gensim.test import basetmtests
22from gensim.test.utils import datapath, get_tmpfile
23
24
class TestLsiModel(unittest.TestCase, basetmtests.TestBaseTopicModel):
    """Tests for `lsimodel.LsiModel`, run against the ``testcorpus.mm`` test corpus."""

    def setUp(self):
        """Load the test corpus and train the 2-topic LSI model used by most tests."""
        self.corpus = MmCorpus(datapath('testcorpus.mm'))
        self.model = lsimodel.LsiModel(self.corpus, num_topics=2)

    def _assert_equal_up_to_sign(self, actual, expected, **tolerances):
        """Assert that `actual` and `expected` agree element-wise in absolute value.

        Only magnitudes are compared, so entries that differ just by sign count as equal.
        Extra keyword arguments (e.g. ``atol``) are forwarded to `np.allclose`.
        """
        self.assertTrue(
            np.allclose(np.abs(actual), np.abs(expected), **tolerances),
            msg="%r does not match %r up to sign" % (actual, expected),
        )

    def _assert_singular_values_match(self, model):
        """Assert that `model.projection.s` matches the top singular values of a dense SVD of the corpus."""
        dense = matutils.corpus2dense(self.corpus, self.corpus.num_terms)
        # only the singular values are needed; the singular vectors are discarded
        _, s, _ = scipy.linalg.svd(dense, full_matrices=False)
        self.assertTrue(
            np.allclose(s[:2], model.projection.s),
            msg="singular values %r do not match the expected %r" % (model.projection.s, s[:2]),
        )

    def _assert_same_after_reload(self, model, loaded):
        """Assert that `loaded` has the same topics count, projection and empty-vector output as `model`."""
        self.assertEqual(model.num_topics, loaded.num_topics)
        self.assertTrue(np.allclose(model.projection.u, loaded.projection.u))
        self.assertTrue(np.allclose(model.projection.s, loaded.projection.s))
        tstvec = []
        self.assertTrue(np.allclose(model[tstvec], loaded[tstvec]))  # try projecting an empty vector

    def test_transform(self):
        """Test lsi[vector] transformation."""
        model = self.model

        # make sure the decomposition is accurate enough: singular values must match
        self._assert_singular_values_match(model)

        # transform one document
        doc = list(self.corpus)[0]
        transformed = model[doc]
        vec = matutils.sparse2full(transformed, 2)  # convert to dense vector, for easier equality tests
        # scaled LSI version (the non-scaled LSI version would be [-0.1973928, 0.05591352])
        expected = np.array([-0.6594664, 0.142115444])
        self._assert_equal_up_to_sign(vec, expected)

    def test_transform_float32(self):
        """Test lsi[vector] transformation with a model trained using ``dtype=np.float32``."""
        model = lsimodel.LsiModel(self.corpus, num_topics=2, dtype=np.float32)

        # make sure the decomposition is accurate enough: singular values must match
        self._assert_singular_values_match(model)
        # the requested dtype must be used for the stored projection
        self.assertEqual(model.projection.u.dtype, np.float32)
        self.assertEqual(model.projection.s.dtype, np.float32)

        # transform one document
        doc = list(self.corpus)[0]
        transformed = model[doc]
        vec = matutils.sparse2full(transformed, 2)  # convert to dense vector, for easier equality tests
        expected = np.array([-0.6594664, 0.142115444])  # scaled LSI version
        # looser absolute tolerance than in `test_transform`
        self._assert_equal_up_to_sign(vec, expected, atol=1.e-5)

    def test_corpus_transform(self):
        """Test lsi[corpus] transformation."""
        model = self.model
        got = np.vstack([matutils.sparse2full(doc, 2) for doc in model[self.corpus]])
        expected = np.array([
            [0.65946639, 0.14211544],
            [2.02454305, -0.42088759],
            [1.54655361, 0.32358921],
            [1.81114125, 0.5890525],
            [0.9336738, -0.27138939],
            [0.01274618, -0.49016181],
            [0.04888203, -1.11294699],
            [0.08063836, -1.56345594],
            [0.27381003, -1.34694159],
        ])
        self._assert_equal_up_to_sign(got, expected)

    def test_online_transform(self):
        """Test that adding documents incrementally ends up matching a model trained in one go."""
        corpus = list(self.corpus)
        doc = corpus[0]  # use the corpus' first document for testing

        # reference model: compute everything at once
        model2 = lsimodel.LsiModel(corpus=corpus, num_topics=5)
        # start with no documents, we will add them later
        model = lsimodel.LsiModel(corpus=None, id2word=model2.id2word, num_topics=5)

        # train model on a single document
        model.add_documents([corpus[0]])

        # transform the testing document with this partial transformation
        vec = matutils.sparse2full(model[doc], model.num_topics)
        expected = np.array([-1.73205078, 0.0, 0.0, 0.0, 0.0])  # scaled LSI version
        self._assert_equal_up_to_sign(vec, expected, atol=1e-6)

        # train on 4 extra docs, in chunks of 2 documents
        model.add_documents(corpus[1:5], chunksize=2)

        # transform a document with this partial transformation
        vec = matutils.sparse2full(model[doc], model.num_topics)
        expected = np.array([-0.66493785, -0.28314203, -1.56376302, 0.05488682, 0.17123269])  # scaled LSI version
        self._assert_equal_up_to_sign(vec, expected, atol=1e-6)

        # train on the rest of documents
        model.add_documents(corpus[5:])

        # make sure the final transformation is the same as if we had decomposed the whole corpus at once
        vec1 = matutils.sparse2full(model[doc], model.num_topics)
        vec2 = matutils.sparse2full(model2[doc], model2.num_topics)
        self._assert_equal_up_to_sign(vec1, vec2, atol=1e-5)

    def test_persistence(self):
        """Test that a saved and re-loaded model is equivalent to the original."""
        fname = get_tmpfile('gensim_models_lsi.tst')
        model = self.model
        model.save(fname)
        model2 = lsimodel.LsiModel.load(fname)
        self._assert_same_after_reload(model, model2)

    def test_persistence_compressed(self):
        """Test save/load round-trip through a ``.gz`` file name."""
        fname = get_tmpfile('gensim_models_lsi.tst.gz')
        model = self.model
        model.save(fname)
        model2 = lsimodel.LsiModel.load(fname, mmap=None)
        self._assert_same_after_reload(model, model2)

    def test_large_mmap(self):
        """Test saving with ``sep_limit=0`` and loading the projection arrays back via mmap."""
        fname = get_tmpfile('gensim_models_lsi.tst')
        model = self.model

        # test storing the internal arrays into separate files
        model.save(fname, sep_limit=0)

        # now load the external arrays via mmap
        model2 = lsimodel.LsiModel.load(fname, mmap='r')
        self.assertIsInstance(model2.projection.u, np.memmap)
        self.assertIsInstance(model2.projection.s, np.memmap)
        self._assert_same_after_reload(model, model2)

    def test_large_mmap_compressed(self):
        """Test saving with ``sep_limit=0`` to a ``.gz`` file name; only the save step is exercised."""
        fname = get_tmpfile('gensim_models_lsi.tst.gz')
        model = self.model

        # test storing the internal arrays into separate files
        model.save(fname, sep_limit=0)

        # Loading with mmap='r' is intentionally not checked here: a former
        # `assertRaises(IOError, lsimodel.LsiModel.load, fname, mmap='r')` check was
        # disabled with the note that this setup has no arrays to be mmaped.

    def test_docs_processed(self):
        """Test that the model counted every document of the training corpus."""
        self.assertEqual(self.model.docs_processed, 9)
        self.assertEqual(self.model.docs_processed, self.corpus.num_docs)

    def test_get_topics(self):
        """Test that `get_topics` yields one float64 array of vocabulary length per topic."""
        topics = self.model.get_topics()
        vocab_size = len(self.model.id2word)
        for topic in topics:
            self.assertIsInstance(topic, np.ndarray)
            self.assertEqual(topic.dtype, np.float64)
            self.assertEqual(vocab_size, topic.shape[0])
            # LSI topics are not probability distributions, so there is no sum-to-one check here
185
186
if __name__ == '__main__':
    # When run as a script, log at DEBUG level with timestamps and then run the tests.
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s : %(levelname)s : %(message)s',
    )
    unittest.main()
190