#!/usr/bin/env python
# encoding: utf-8

"""Module contains common utilities used in automated code tests for Gensim modules.

Attributes:

module_path : str
    Full path to this module directory.

common_texts : list of list of str
    Toy dataset.

common_dictionary : :class:`~gensim.corpora.dictionary.Dictionary`
    Dictionary of toy dataset.

common_corpus : list of list of (int, int)
    Corpus of toy dataset.

lee_corpus_list : list
    Every document produced by :class:`~gensim.test.utils.LeeCorpus`, loaded once at import time.


Examples:

It's easy to keep objects in a temporary folder and reuse them if needed:

.. sourcecode:: pycon

    >>> from gensim.models import word2vec
    >>> from gensim.test.utils import get_tmpfile, common_texts
    >>>
    >>> model = word2vec.Word2Vec(common_texts, min_count=1)
    >>> temp_path = get_tmpfile('toy_w2v')
    >>> model.save(temp_path)
    >>>
    >>> new_model = word2vec.Word2Vec.load(temp_path)
    >>> result = new_model.wv.most_similar("human", topn=1)

Let's print the first document in the toy dataset and then recreate it using its corpus and dictionary.

.. sourcecode:: pycon

    >>> from gensim.test.utils import common_texts, common_dictionary, common_corpus
    >>> print(common_texts[0])
    ['human', 'interface', 'computer']
    >>> assert common_dictionary.doc2bow(common_texts[0]) == common_corpus[0]

We can find our toy set in the test data directory.

.. sourcecode:: pycon

    >>> from gensim.test.utils import datapath
    >>>
    >>> with open(datapath("testcorpus.txt")) as f:
    ...     texts = [line.strip().split() for line in f]
    >>> print(texts[0])
    ['computer', 'human', 'interface']

If you don't need to keep temporary objects on disk, use :func:`~gensim.test.utils.temporary_file`:

.. sourcecode:: pycon

    >>> from gensim.test.utils import temporary_file, common_corpus, common_dictionary
    >>> from gensim.models import LdaModel
    >>>
    >>> with temporary_file("temp.txt") as tf:
    ...     lda = LdaModel(common_corpus, id2word=common_dictionary, num_topics=3)
    ...     lda.save(tf)

"""

import contextlib
import tempfile
import os
import shutil

from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess

module_path = os.path.dirname(__file__)  # needed because sample data files are located in the same folder


def datapath(fname):
    """Get full path for file `fname` in the `test_data` directory placed in this module directory.

    The path is only composed; the function does not check that the file exists.

    Parameters
    ----------
    fname : str
        Name of file.

    Returns
    -------
    str
        Full path to `fname` in test_data folder.

    Example
    -------
    Let's get the path of a test corpus file in Matrix Market format and iterate over its documents.

    .. sourcecode:: pycon

        >>> from gensim.corpora import MmCorpus
        >>> from gensim.test.utils import datapath
        >>>
        >>> corpus = MmCorpus(datapath("testcorpus.mm"))
        >>> for document in corpus:
        ...     pass


    """
    return os.path.join(module_path, 'test_data', fname)


def get_tmpfile(suffix):
    """Get full path to file `suffix` in a newly created temporary folder.

    This function doesn't create the file itself (it only generates a unique path).
    Every call creates a new temporary directory, so consecutive calls return different paths.
    The directory is not removed by this function; use :func:`~gensim.test.utils.temporary_file`
    when automatic cleanup is needed.

    Parameters
    ----------
    suffix : str
        Name of the file inside the temporary folder (used as the last path component).

    Returns
    -------
    str
        Path to `suffix` file in temporary folder.

    Examples
    --------
    Using this function we may get path to temporary file and use it, for example, to store temporary model.

    .. sourcecode:: pycon

        >>> from gensim.models import LsiModel
        >>> from gensim.test.utils import get_tmpfile, common_dictionary, common_corpus
        >>>
        >>> tmp_f = get_tmpfile("toy_lsi_model")
        >>>
        >>> model = LsiModel(common_corpus, id2word=common_dictionary)
        >>> model.save(tmp_f)
        >>>
        >>> loaded_model = LsiModel.load(tmp_f)

    """
    return os.path.join(tempfile.mkdtemp(), suffix)


@contextlib.contextmanager
def temporary_file(name=""):
    """This context manager creates a temporary directory and yields the full path of file `name` inside it.

    The temporary directory, together with any files placed in it, will be deleted at the end of the context.
    Note that the file `name` itself is not created; only its path is provided.

    Parameters
    ----------
    name : str
        Filename.

    Yields
    ------
    str
        Path to file `name` in temporary directory.

    Examples
    --------
    This example demonstrates that created temporary directory (and included
    files) will be deleted at the end of context.

    .. sourcecode:: pycon

        >>> import os
        >>> from gensim.test.utils import temporary_file
        >>> with temporary_file("temp.txt") as tf, open(tf, 'w') as outfile:
        ...     outfile.write("my extremely useful information")
        ...     print("Is this file exists? {}".format(os.path.exists(tf)))
        ...     print("Is this folder exists? {}".format(os.path.exists(os.path.dirname(tf))))
        Is this file exists? True
        Is this folder exists? True
        >>>
        >>> print("Is this file exists? {}".format(os.path.exists(tf)))
        Is this file exists? False
        >>> print("Is this folder exists? {}".format(os.path.exists(os.path.dirname(tf))))
        Is this folder exists? False

    """

    # note : when dropping python2.7 support, we can use tempfile.TemporaryDirectory
    tmp = tempfile.mkdtemp()
    try:
        yield os.path.join(tmp, name)
    finally:
        # Best-effort cleanup: runs even if the body of the `with` raised, and
        # failures while removing the directory are deliberately ignored.
        shutil.rmtree(tmp, ignore_errors=True)


# set up vars used in testing ("Deerwester" from the web tutorial)
common_texts = [
    ['human', 'interface', 'computer'],
    ['survey', 'user', 'computer', 'system', 'response', 'time'],
    ['eps', 'user', 'interface', 'system'],
    ['system', 'human', 'system', 'eps'],
    ['user', 'response', 'time'],
    ['trees'],
    ['graph', 'trees'],
    ['graph', 'minors', 'trees'],
    ['graph', 'minors', 'survey']
]

common_dictionary = Dictionary(common_texts)
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]


class LeeCorpus:
    """Iterable over the `lee_background.cor` file from the test data directory.

    Each line of the file is treated as one document and is passed through
    :func:`~gensim.utils.simple_preprocess` before being yielded.
    The file is re-opened on every iteration, so the object can be iterated multiple times.

    """
    def __iter__(self):
        # NOTE(review): the file is opened with the platform default text encoding;
        # confirm the data file's actual encoding before pinning one explicitly.
        with open(datapath('lee_background.cor')) as f:
            for line in f:
                yield simple_preprocess(line)


# Materialized once at import time, so importing this module reads the whole file.
lee_corpus_list = list(LeeCorpus())