#!/usr/bin/env python
# encoding: utf-8

"""Module contains common utilities used in automated code tests for Gensim modules.

Attributes:

module_path : str
    Full path to this module directory.

common_texts : list of list of str
    Toy dataset.

common_dictionary : :class:`~gensim.corpora.dictionary.Dictionary`
    Dictionary of toy dataset.

common_corpus : list of list of (int, int)
    Corpus of toy dataset.


Examples:

It's easy to keep objects in a temporary folder and reuse them if needed:

.. sourcecode:: pycon

    >>> from gensim.models import word2vec
    >>> from gensim.test.utils import get_tmpfile, common_texts
    >>>
    >>> model = word2vec.Word2Vec(common_texts, min_count=1)
    >>> temp_path = get_tmpfile('toy_w2v')
    >>> model.save(temp_path)
    >>>
    >>> new_model = word2vec.Word2Vec.load(temp_path)
    >>> result = new_model.wv.most_similar("human", topn=1)

Let's print the first document in the toy dataset and then recreate it using its corpus and dictionary.

.. sourcecode:: pycon

    >>> from gensim.test.utils import common_texts, common_dictionary, common_corpus
    >>> print(common_texts[0])
    ['human', 'interface', 'computer']
    >>> assert common_dictionary.doc2bow(common_texts[0]) == common_corpus[0]

We can find our toy set in the test data directory.

.. sourcecode:: pycon

    >>> from gensim.test.utils import datapath
    >>>
    >>> with open(datapath("testcorpus.txt")) as f:
    ...     texts = [line.strip().split() for line in f]
    >>> print(texts[0])
    ['computer', 'human', 'interface']

If you don't need to keep temporary objects on disk, use :func:`~gensim.test.utils.temporary_file`:

.. sourcecode:: pycon

    >>> from gensim.test.utils import temporary_file, common_corpus, common_dictionary
    >>> from gensim.models import LdaModel
    >>>
    >>> with temporary_file("temp.txt") as tf:
    ...     lda = LdaModel(common_corpus, id2word=common_dictionary, num_topics=3)
    ...     lda.save(tf)

"""
69
import contextlib
import os
import shutil
import tempfile

from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
77
# Directory that holds this module. The sample data files live in a `test_data` folder
# inside it, so paths to them are built relative to this value.
module_path = os.path.dirname(__file__)
79
80
def datapath(fname):
    """Get full path for file `fname` in the `test_data` directory placed in this module directory.

    The path is only composed, not checked: no verification is made that the file exists.

    Parameters
    ----------
    fname : str
        Name of file.

    Returns
    -------
    str
        Full path to `fname` in test_data folder.

    Example
    -------
    Let's get the path of the toy corpus stored in `testcorpus.mm`, load it and iterate over it.

    .. sourcecode:: pycon

        >>> from gensim.corpora import MmCorpus
        >>> from gensim.test.utils import datapath
        >>>
        >>> corpus = MmCorpus(datapath("testcorpus.mm"))
        >>> for document in corpus:
        ...     pass


    """
    return os.path.join(module_path, 'test_data', fname)
111
112
def get_tmpfile(suffix):
    """Get full path to file `suffix` inside a newly created temporary folder.

    The file itself is NOT created, but every call creates a fresh temporary directory
    (through :func:`tempfile.mkdtemp`) to hold it. Consequently, consecutive calls return
    different paths, and the directory is not removed by this function.

    Parameters
    ----------
    suffix : str
        Name of the file, used as the last component of the returned path.

    Returns
    -------
    str
        Path to `suffix` file in temporary folder.

    Examples
    --------
    Using this function we may get path to temporary file and use it, for example, to store temporary model.

    .. sourcecode:: pycon

        >>> from gensim.models import LsiModel
        >>> from gensim.test.utils import get_tmpfile, common_dictionary, common_corpus
        >>>
        >>> tmp_f = get_tmpfile("toy_lsi_model")
        >>>
        >>> model = LsiModel(common_corpus, id2word=common_dictionary)
        >>> model.save(tmp_f)
        >>>
        >>> loaded_model = LsiModel.load(tmp_f)

    """
    # mkdtemp() creates a uniquely named directory, which is what makes the returned path unique.
    return os.path.join(tempfile.mkdtemp(), suffix)
146
147
@contextlib.contextmanager
def temporary_file(name=""):
    """Context manager that yields the full path of `name` inside a freshly created temporary directory.

    The temporary directory is created when the context is entered and is deleted, together with
    all files in it, when the context is left. Note that the file `name` itself is NOT created.

    Parameters
    ----------
    name : str
        Filename.

    Yields
    ------
    str
        Path to file `name` in temporary directory.

    Examples
    --------
    This example demonstrates that the created temporary directory (and included
    files) will be deleted at the end of the context.

    .. sourcecode:: pycon

        >>> import os
        >>> from gensim.test.utils import temporary_file
        >>> with temporary_file("temp.txt") as tf, open(tf, 'w') as outfile:
        ...     outfile.write("my extremely useful information")
        ...     print("Is this file exists? {}".format(os.path.exists(tf)))
        ...     print("Is this folder exists? {}".format(os.path.exists(os.path.dirname(tf))))
        Is this file exists? True
        Is this folder exists? True
        >>>
        >>> print("Is this file exists? {}".format(os.path.exists(tf)))
        Is this file exists? False
        >>> print("Is this folder exists? {}".format(os.path.exists(os.path.dirname(tf))))
        Is this folder exists? False

    """

    # note : when dropping python2.7 support, we can use tempfile.TemporaryDirectory
    tmp = tempfile.mkdtemp()
    try:
        yield os.path.join(tmp, name)
    finally:
        # Runs even if the body of the `with` block raised; removal errors are ignored.
        shutil.rmtree(tmp, ignore_errors=True)
192
193
# set up vars used in testing ("Deerwester" from the web tutorial)
# Toy dataset: each inner list is one already-tokenized document.
common_texts = [
    ['human', 'interface', 'computer'],
    ['survey', 'user', 'computer', 'system', 'response', 'time'],
    ['eps', 'user', 'interface', 'system'],
    ['system', 'human', 'system', 'eps'],
    ['user', 'response', 'time'],
    ['trees'],
    ['graph', 'trees'],
    ['graph', 'minors', 'trees'],
    ['graph', 'minors', 'survey']
]

# Dictionary built from the toy dataset, and the same documents converted with its `doc2bow`.
common_dictionary = Dictionary(common_texts)
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
209
210
class LeeCorpus:
    """Iterable over the `lee_background.cor` file from the test data directory.

    Every iteration re-opens the file, so the object can be iterated more than once.
    """

    def __iter__(self):
        """Yield the result of `simple_preprocess` for each line of the file, in file order."""
        # NOTE(review): the file is opened with the default text encoding - confirm this
        # matches the encoding of the data file.
        with open(datapath('lee_background.cor')) as f:
            for line in f:
                yield simple_preprocess(line)
216
217
# The whole corpus as an in-memory list. It is built eagerly, so the data file is
# read once when this module is imported.
lee_corpus_list = list(LeeCorpus())
219