#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Automated tests for checking transformation algorithms (the models package).
"""


import logging
import unittest

import numpy as np
import scipy.linalg

from gensim import matutils
from gensim.corpora.mmcorpus import MmCorpus
from gensim.models import lsimodel
from gensim.test import basetmtests
from gensim.test.utils import datapath, get_tmpfile


class TestLsiModel(unittest.TestCase, basetmtests.TestBaseTopicModel):
    """Tests for `lsimodel.LsiModel` trained on the bundled `testcorpus.mm` corpus."""

    def setUp(self):
        """Load the test corpus and train a 2-topic LSI model shared by most tests."""
        self.corpus = MmCorpus(datapath('testcorpus.mm'))
        self.model = lsimodel.LsiModel(self.corpus, num_topics=2)

    def _reference_singular_values(self):
        """Return the singular values of the dense corpus matrix, computed directly by scipy."""
        dense = matutils.corpus2dense(self.corpus, self.corpus.num_terms)
        # Only the singular values are compared, so skip computing the U / Vt factors.
        return scipy.linalg.svd(dense, full_matrices=False, compute_uv=False)

    def _assert_models_equivalent(self, model, model2):
        """Assert that `model2` has the same topic count, projection and empty-doc output as `model`."""
        self.assertEqual(model.num_topics, model2.num_topics)
        self.assertTrue(np.allclose(model.projection.u, model2.projection.u))
        self.assertTrue(np.allclose(model.projection.s, model2.projection.s))
        tstvec = []
        self.assertTrue(np.allclose(model[tstvec], model2[tstvec]))  # try projecting an empty vector

    def test_transform(self):
        """Test lsi[vector] transformation."""
        model = self.model

        # make sure the decomposition is accurate enough: singular values must match
        s = self._reference_singular_values()
        self.assertTrue(np.allclose(s[:2], model.projection.s))

        # transform one document
        doc = list(self.corpus)[0]
        transformed = model[doc]
        vec = matutils.sparse2full(transformed, 2)  # convert to dense vector, for easier equality tests
        # scaled LSI version (the non-scaled version would be [-0.1973928, 0.05591352])
        expected = np.array([-0.6594664, 0.142115444])
        self.assertTrue(np.allclose(abs(vec), abs(expected)))  # transformed entries must be equal up to sign

    def test_transform_float32(self):
        """Test lsi[vector] transformation with a model trained using dtype=np.float32."""
        model = lsimodel.LsiModel(self.corpus, num_topics=2, dtype=np.float32)

        # make sure the decomposition is accurate enough: singular values must match
        s = self._reference_singular_values()
        self.assertTrue(np.allclose(s[:2], model.projection.s))
        # the requested dtype must be used for the stored projection
        self.assertEqual(model.projection.u.dtype, np.float32)
        self.assertEqual(model.projection.s.dtype, np.float32)

        # transform one document
        doc = list(self.corpus)[0]
        transformed = model[doc]
        vec = matutils.sparse2full(transformed, 2)  # convert to dense vector, for easier equality tests
        expected = np.array([-0.6594664, 0.142115444])  # scaled LSI version
        # transformed entries must be equal up to sign
        self.assertTrue(np.allclose(abs(vec), abs(expected), atol=1.e-5))

    def test_corpus_transform(self):
        """Test lsi[corpus] transformation."""
        model = self.model
        got = np.vstack([matutils.sparse2full(doc, 2) for doc in model[self.corpus]])
        expected = np.array([
            [0.65946639, 0.14211544],
            [2.02454305, -0.42088759],
            [1.54655361, 0.32358921],
            [1.81114125, 0.5890525],
            [0.9336738, -0.27138939],
            [0.01274618, -0.49016181],
            [0.04888203, -1.11294699],
            [0.08063836, -1.56345594],
            [0.27381003, -1.34694159],
        ])
        self.assertTrue(np.allclose(abs(got), abs(expected)))  # must equal up to sign

    def test_online_transform(self):
        """Test that training incrementally via add_documents matches training on the whole corpus at once."""
        corpus = list(self.corpus)
        doc = corpus[0]  # use the corpus' first document for testing

        # reference model: compute everything at once
        model2 = lsimodel.LsiModel(corpus=corpus, num_topics=5)
        # start with no documents, we will add them later
        model = lsimodel.LsiModel(corpus=None, id2word=model2.id2word, num_topics=5)

        # train model on a single document
        model.add_documents([corpus[0]])

        # transform the testing document with this partial transformation
        transformed = model[doc]
        vec = matutils.sparse2full(transformed, model.num_topics)  # convert to dense vector
        expected = np.array([-1.73205078, 0.0, 0.0, 0.0, 0.0])  # scaled LSI version
        # transformed entries must be equal up to sign
        self.assertTrue(np.allclose(abs(vec), abs(expected), atol=1e-6))

        # train on another 4 documents, in chunks of 2 documents
        model.add_documents(corpus[1:5], chunksize=2)

        # transform a document with this partial transformation
        transformed = model[doc]
        vec = matutils.sparse2full(transformed, model.num_topics)  # convert to dense vector
        expected = np.array([-0.66493785, -0.28314203, -1.56376302, 0.05488682, 0.17123269])  # scaled LSI version
        # transformed entries must be equal up to sign
        self.assertTrue(np.allclose(abs(vec), abs(expected), atol=1e-6))

        # train on the rest of documents
        model.add_documents(corpus[5:])

        # make sure the final transformation is the same as if we had decomposed the whole corpus at once
        vec1 = matutils.sparse2full(model[doc], model.num_topics)
        vec2 = matutils.sparse2full(model2[doc], model2.num_topics)
        # the two LSI representations must equal up to sign
        self.assertTrue(np.allclose(abs(vec1), abs(vec2), atol=1e-5))

    def test_persistence(self):
        """Test that a model survives a save/load round trip."""
        fname = get_tmpfile('gensim_models_lsi.tst')
        model = self.model
        model.save(fname)
        model2 = lsimodel.LsiModel.load(fname)
        self._assert_models_equivalent(model, model2)

    def test_persistence_compressed(self):
        """Test that a model survives a save/load round trip through a `.gz` file name."""
        fname = get_tmpfile('gensim_models_lsi.tst.gz')
        model = self.model
        model.save(fname)
        model2 = lsimodel.LsiModel.load(fname, mmap=None)
        self._assert_models_equivalent(model, model2)

    def test_large_mmap(self):
        """Test saving the internal arrays to separate files and loading them back via mmap."""
        fname = get_tmpfile('gensim_models_lsi.tst')
        model = self.model

        # test storing the internal arrays into separate files
        model.save(fname, sep_limit=0)

        # now load the external arrays via mmap
        model2 = lsimodel.LsiModel.load(fname, mmap='r')
        self.assertIsInstance(model2.projection.u, np.memmap)
        self.assertIsInstance(model2.projection.s, np.memmap)
        self._assert_models_equivalent(model, model2)

    def test_large_mmap_compressed(self):
        """Test that saving to a `.gz` file name with separately stored arrays does not fail."""
        fname = get_tmpfile('gensim_models_lsi.tst.gz')
        model = self.model

        # test storing the internal arrays into separate files
        model.save(fname, sep_limit=0)

        # NOTE: only the save step is exercised. The intended follow-up check was that
        # `lsimodel.LsiModel.load(fname, mmap='r')` raises IOError, but it was disabled
        # (it previously sat unreachable behind a bare `return`) with the original note:
        # "turns out this test doesn't exercise this because there are no arrays to be
        # mmaped!".

    def test_docs_processed(self):
        """Test that the model counted every document of the training corpus."""
        self.assertEqual(self.model.docs_processed, 9)
        self.assertEqual(self.model.docs_processed, self.corpus.num_docs)

    def test_get_topics(self):
        """Test that get_topics() yields one float64 vector of vocabulary length per topic."""
        topics = self.model.get_topics()
        vocab_size = len(self.model.id2word)
        for topic in topics:
            self.assertIsInstance(topic, np.ndarray)
            self.assertEqual(topic.dtype, np.float64)
            self.assertEqual(vocab_size, topic.shape[0])
            # LSI topics are not probability distributions, so unlike for other topic
            # models there is deliberately no check that each topic sums to 1.0.


if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
    unittest.main()