1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3#
4# Copyright (C) 2013 Radim Rehurek <radimrehurek@seznam.cz>
5# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
6
7"""Compute similarities across a collection of documents in the Vector Space Model.
8
9The main class is :class:`~gensim.similarities.docsim.Similarity`, which builds an index for a given set of documents.
10
11Once the index is built, you can perform efficient queries like "Tell me how similar is this query document to each
12document in the index?". The result is a vector of numbers as large as the size of the initial set of documents,
13that is, one float for each index document. Alternatively, you can also request only the top-N most
14similar index documents to the query.
15
16
17How It Works
18------------
19The :class:`~gensim.similarities.docsim.Similarity` class splits the index into several smaller sub-indexes ("shards"),
20which are disk-based. If your entire index fits in memory (~one million documents per 1GB of RAM),
21you can also use the :class:`~gensim.similarities.docsim.MatrixSimilarity`
22or :class:`~gensim.similarities.docsim.SparseMatrixSimilarity` classes directly.
23These are more simple but do not scale as well: they keep the entire index in RAM, no sharding. They also do not
24support adding new document to the index dynamically.
25
26Once the index has been initialized, you can query for document similarity simply by
27
28.. sourcecode:: pycon
29
30    >>> from gensim.test.utils import common_corpus, common_dictionary, get_tmpfile
31    >>>
32    >>> index_tmpfile = get_tmpfile("index")
33    >>> query = [(1, 2), (6, 1), (7, 2)]
34    >>>
35    >>> index = Similarity(index_tmpfile, common_corpus, num_features=len(common_dictionary))  # build the index
36    >>> similarities = index[query]  # get similarities between the query and all index documents
37
38If you have more query documents, you can submit them all at once, in a batch
39
40.. sourcecode:: pycon
41
42    >>> from gensim.test.utils import common_corpus, common_dictionary, get_tmpfile
43    >>>
44    >>> index_tmpfile = get_tmpfile("index")
45    >>> batch_of_documents = common_corpus[:]  # only as example
46    >>> index = Similarity(index_tmpfile, common_corpus, num_features=len(common_dictionary))  # build the index
47    >>>
48    >>> # the batch is simply an iterable of documents, aka gensim corpus:
49    >>> for similarities in index[batch_of_documents]:
50    ...     pass
51
52The benefit of this batch (aka "chunked") querying is a much better performance.
53To see the speed-up on your machine, run ``python -m gensim.test.simspeed``
54(compare to my results `here <http://groups.google.com/group/gensim/msg/4f6f171a869e4fca?>`_).
55
56There is also a special syntax for when you need similarity of documents in the index
57to the index itself (i.e. queries = the indexed documents themselves). This special syntax
58uses the faster, batch queries internally and **is ideal for all-vs-all pairwise similarities**:
59
60.. sourcecode:: pycon
61
62    >>> from gensim.test.utils import common_corpus, common_dictionary, get_tmpfile
63    >>>
64    >>> index_tmpfile = get_tmpfile("index")
65    >>> index = Similarity(index_tmpfile, common_corpus, num_features=len(common_dictionary))  # build the index
66    >>>
67    >>> for similarities in index:  # yield similarities of the 1st indexed document, then 2nd...
68    ...     pass
69
70"""
71import logging
72import itertools
73import os
74import heapq
75
76import numpy
77import scipy.sparse
78
79from gensim import interfaces, utils, matutils
80
81
82logger = logging.getLogger(__name__)
83
84PARALLEL_SHARDS = False
85try:
86    import multiprocessing
87    # by default, don't parallelize queries. uncomment the following line if you want that.
88#    PARALLEL_SHARDS = multiprocessing.cpu_count() # use #parallel processes = #CPus
89except ImportError:
90    pass
91
92
93class Shard(utils.SaveLoad):
94    """A proxy that represents a single shard instance within :class:`~gensim.similarity.docsim.Similarity` index.
95
96    Basically just wraps :class:`~gensim.similarities.docsim.MatrixSimilarity`,
97    :class:`~gensim.similarities.docsim.SparseMatrixSimilarity`, etc, so that it mmaps from disk on request (query).
98
99    """
100    def __init__(self, fname, index):
101        """
102
103        Parameters
104        ----------
105        fname : str
106            Path to top-level directory (file) to traverse for corpus documents.
107        index : :class:`~gensim.interfaces.SimilarityABC`
108            Index object.
109
110        """
111        self.dirname, self.fname = os.path.split(fname)
112        self.length = len(index)
113        self.cls = index.__class__
114        logger.info("saving index shard to %s", self.fullname())
115        index.save(self.fullname())
116        self.index = self.get_index()
117
118    def fullname(self):
119        """Get full path to shard file.
120
121        Return
122        ------
123        str
124            Path to shard instance.
125
126        """
127        return os.path.join(self.dirname, self.fname)
128
129    def __len__(self):
130        """Get length."""
131        return self.length
132
133    def __getstate__(self):
134        """Special handler for pickle.
135
136        Returns
137        -------
138        dict
139            Object that contains state of current instance without `index`.
140
141        """
142        result = self.__dict__.copy()
143        # (S)MS objects must be loaded via load() because of mmap (simple pickle.load won't do)
144        if 'index' in result:
145            del result['index']
146        return result
147
148    def __str__(self):
149        return "%s Shard(%i documents in %s)" % (self.cls.__name__, len(self), self.fullname())
150
151    def get_index(self):
152        """Load & get index.
153
154        Returns
155        -------
156        :class:`~gensim.interfaces.SimilarityABC`
157            Index instance.
158
159        """
160        if not hasattr(self, 'index'):
161            logger.debug("mmaping index from %s", self.fullname())
162            self.index = self.cls.load(self.fullname(), mmap='r')
163        return self.index
164
165    def get_document_id(self, pos):
166        """Get index vector at position `pos`.
167
168        Parameters
169        ----------
170        pos : int
171            Vector position.
172
173        Return
174        ------
175        {:class:`scipy.sparse.csr_matrix`, :class:`numpy.ndarray`}
176            Index vector. Type depends on underlying index.
177
178        Notes
179        -----
180        The vector is of the same type as the underlying index (ie., dense for
181        :class:`~gensim.similarities.docsim.MatrixSimilarity`
182        and scipy.sparse for :class:`~gensim.similarities.docsim.SparseMatrixSimilarity`.
183
184        """
185        assert 0 <= pos < len(self), "requested position out of range"
186        return self.get_index().index[pos]
187
188    def __getitem__(self, query):
189        """Get similarities of document (or corpus) `query` to all documents in the corpus.
190
191        Parameters
192        ----------
193        query : {iterable of list of (int, number) , list of (int, number))}
194            Document or corpus.
195
196        Returns
197        -------
198        :class:`numpy.ndarray`
199            Similarities of document/corpus if index is :class:`~gensim.similarities.docsim.MatrixSimilarity` **or**
200        :class:`scipy.sparse.csr_matrix`
201            for case if index is :class:`~gensim.similarities.docsim.SparseMatrixSimilarity`.
202
203        """
204        index = self.get_index()
205        try:
206            index.num_best = self.num_best
207            index.normalize = self.normalize
208        except Exception:
209            raise ValueError("num_best and normalize have to be set before querying a proxy Shard object")
210        return index[query]
211
212
213def query_shard(args):
214    """Helper for request query from shard, same as shard[query].
215
216    Parameters
217    ---------
218    args : (list of (int, number), :class:`~gensim.interfaces.SimilarityABC`)
219        Query and Shard instances
220
221    Returns
222    -------
223    :class:`numpy.ndarray` or :class:`scipy.sparse.csr_matrix`
224        Similarities of the query against documents indexed in this shard.
225
226    """
227    query, shard = args  # simulate starmap (not part of multiprocessing in older Pythons)
228    logger.debug("querying shard %s num_best=%s in process %s", shard, shard.num_best, os.getpid())
229    result = shard[query]
230    logger.debug("finished querying shard %s in process %s", shard, os.getpid())
231    return result
232
233
234def _nlargest(n, iterable):
235    """Helper for extracting n documents with maximum similarity.
236
237    Parameters
238    ----------
239    n : int
240        Number of elements to be extracted
241    iterable : iterable of list of (int, float)
242        Iterable containing documents with computed similarities
243
244    Returns
245    -------
246    :class:`list`
247        List with the n largest elements from the dataset defined by iterable.
248
249    Notes
250    -----
251    Elements are compared by the absolute value of similarity, because negative value of similarity
252    does not mean some form of dissimilarity.
253
254    """
255    return heapq.nlargest(n, itertools.chain(*iterable), key=lambda item: abs(item[1]))
256
257
258class Similarity(interfaces.SimilarityABC):
259    """Compute cosine similarity of a dynamic query against a corpus of documents ('the index').
260
261    The index supports adding new documents dynamically.
262
263    Notes
264    -----
265    Scalability is achieved by sharding the index into smaller pieces, each of which fits into core memory
266    The shards themselves are simply stored as files to disk and mmap'ed back as needed.
267
268    Examples
269    --------
270    .. sourcecode:: pycon
271
272        >>> from gensim.corpora.textcorpus import TextCorpus
273        >>> from gensim.test.utils import datapath, get_tmpfile
274        >>> from gensim.similarities import Similarity
275        >>>
276        >>> corpus = TextCorpus(datapath('testcorpus.mm'))
277        >>> index_temp = get_tmpfile("index")
278        >>> index = Similarity(index_temp, corpus, num_features=400)  # create index
279        >>>
280        >>> query = next(iter(corpus))
281        >>> result = index[query]  # search similar to `query` in index
282        >>>
283        >>> for sims in index[corpus]:  # if you have more query documents, you can submit them all at once, in a batch
284        ...     pass
285        >>>
286        >>> # There is also a special syntax for when you need similarity of documents in the index
287        >>> # to the index itself (i.e. queries=indexed documents themselves). This special syntax
288        >>> # uses the faster, batch queries internally and **is ideal for all-vs-all pairwise similarities**:
289        >>> for similarities in index:  # yield similarities of the 1st indexed document, then 2nd...
290        ...     pass
291
292    See Also
293    --------
294    :class:`~gensim.similarities.docsim.MatrixSimilarity`
295        Index similarity (dense with cosine distance).
296    :class:`~gensim.similarities.docsim.SparseMatrixSimilarity`
297        Index similarity (sparse with cosine distance).
298    :class:`~gensim.similarities.docsim.WmdSimilarity`
299        Index similarity (with word-mover distance).
300
301    """
302
303    def __init__(self, output_prefix, corpus, num_features, num_best=None, chunksize=256, shardsize=32768, norm='l2'):
304        """
305
306        Parameters
307        ----------
308        output_prefix : str
309            Prefix for shard filename. If None, a random filename in temp will be used.
310        corpus : iterable of list of (int, number)
311            Corpus in streamed Gensim bag-of-words format.
312        num_features : int
313            Size of the dictionary (number of features).
314        num_best : int, optional
315            If set, return only the `num_best` most similar documents, always leaving out documents with similarity = 0.
316            Otherwise, return a full vector with one float for every document in the index.
317        chunksize : int, optional
318            Size of query chunks. Used internally when the query is an entire corpus.
319        shardsize : int, optional
320            Maximum shard size, in documents. Choose a value so that a `shardsize x chunksize` matrix of floats fits
321            comfortably into your RAM.
322        norm : {'l1', 'l2'}, optional
323            Normalization to use.
324
325        Notes
326        -----
327        Documents are split (internally, transparently) into shards of `shardsize` documents each, and each shard
328        converted to a matrix, for faster BLAS calls. Each shard is stored to disk under `output_prefix.shard_number`.
329
330        If you don't specify an output prefix, a random filename in temp will be used.
331
332        If your entire index fits in memory (~1 million documents per 1GB of RAM), you can also use the
333        :class:`~gensim.similarities.docsim.MatrixSimilarity` or
334        :class:`~gensim.similarities.docsim.SparseMatrixSimilarity` classes directly.
335        These are more simple but do not scale as well (they keep the entire index in RAM, no sharding).
336        They also do not support adding new document dynamically.
337
338        """
339        if output_prefix is None:
340            # undocumented feature: set output_prefix=None to create the server in temp
341            self.output_prefix = utils.randfname(prefix='simserver')
342        else:
343            self.output_prefix = output_prefix
344        logger.info("starting similarity index under %s", self.output_prefix)
345        self.num_features = num_features
346        self.num_best = num_best
347        self.norm = norm
348        self.chunksize = int(chunksize)
349        self.shardsize = shardsize
350        self.shards = []
351        self.fresh_docs, self.fresh_nnz = [], 0
352
353        if corpus is not None:
354            self.add_documents(corpus)
355
356    def __len__(self):
357        """Get length of index."""
358        return len(self.fresh_docs) + sum(len(shard) for shard in self.shards)
359
360    def __str__(self):
361        return "Similarity index with %i documents in %i shards (stored under %s)" % (
362            len(self), len(self.shards), self.output_prefix
363        )
364
365    def add_documents(self, corpus):
366        """Extend the index with new documents.
367
368        Parameters
369        ----------
370        corpus : iterable of list of (int, number)
371            Corpus in BoW format.
372
373        Notes
374        -----
375        Internally, documents are buffered and then spilled to disk when there's `self.shardsize` of them
376        (or when a query is issued).
377
378        Examples
379        --------
380        .. sourcecode:: pycon
381
382            >>> from gensim.corpora.textcorpus import TextCorpus
383            >>> from gensim.test.utils import datapath, get_tmpfile
384            >>> from gensim.similarities import Similarity
385            >>>
386            >>> corpus = TextCorpus(datapath('testcorpus.mm'))
387            >>> index_temp = get_tmpfile("index")
388            >>> index = Similarity(index_temp, corpus, num_features=400)  # create index
389            >>>
390            >>> one_more_corpus = TextCorpus(datapath('testcorpus.txt'))
391            >>> index.add_documents(one_more_corpus)  # add more documents in corpus
392
393        """
394        min_ratio = 1.0  # 0.5 to only reopen shards that are <50% complete
395        if self.shards and len(self.shards[-1]) < min_ratio * self.shardsize:
396            # The last shard was incomplete (<; load it back and add the documents there, don't start a new shard
397            self.reopen_shard()
398        for doc in corpus:
399            if isinstance(doc, numpy.ndarray):
400                doclen = len(doc)
401            elif scipy.sparse.issparse(doc):
402                doclen = doc.nnz
403            else:
404                doclen = len(doc)
405                if doclen < 0.3 * self.num_features:
406                    doc = matutils.unitvec(matutils.corpus2csc([doc], self.num_features).T, self.norm)
407                else:
408                    doc = matutils.unitvec(matutils.sparse2full(doc, self.num_features), self.norm)
409            self.fresh_docs.append(doc)
410            self.fresh_nnz += doclen
411            if len(self.fresh_docs) >= self.shardsize:
412                self.close_shard()
413            if len(self.fresh_docs) % 10000 == 0:
414                logger.info("PROGRESS: fresh_shard size=%i", len(self.fresh_docs))
415
416    def shardid2filename(self, shardid):
417        """Get shard file by `shardid`.
418
419        Parameters
420        ----------
421        shardid : int
422            Shard index.
423
424        Return
425        ------
426        str
427            Path to shard file.
428
429        """
430        if self.output_prefix.endswith('.'):
431            return "%s%s" % (self.output_prefix, shardid)
432        else:
433            return "%s.%s" % (self.output_prefix, shardid)
434
435    def close_shard(self):
436        """Force the latest shard to close (be converted to a matrix and stored to disk).
437         Do nothing if no new documents added since last call.
438
439        Notes
440        -----
441        The shard is closed even if it is not full yet (its size is smaller than `self.shardsize`).
442        If documents are added later via :meth:`~gensim.similarities.docsim.MatrixSimilarity.add_documents`
443        this incomplete shard will be loaded again and completed.
444
445        """
446        if not self.fresh_docs:
447            return
448        shardid = len(self.shards)
449        # consider the shard sparse if its density is < 30%
450        issparse = 0.3 > 1.0 * self.fresh_nnz / (len(self.fresh_docs) * self.num_features)
451        if issparse:
452            index = SparseMatrixSimilarity(
453                self.fresh_docs, num_terms=self.num_features, num_docs=len(self.fresh_docs), num_nnz=self.fresh_nnz
454            )
455        else:
456            index = MatrixSimilarity(self.fresh_docs, num_features=self.num_features)
457        logger.info("creating %s shard #%s", 'sparse' if issparse else 'dense', shardid)
458        shard = Shard(self.shardid2filename(shardid), index)
459        shard.num_best = self.num_best
460        shard.num_nnz = self.fresh_nnz
461        self.shards.append(shard)
462        self.fresh_docs, self.fresh_nnz = [], 0
463
464    def reopen_shard(self):
465        """Reopen an incomplete shard."""
466        assert self.shards
467        if self.fresh_docs:
468            raise ValueError("cannot reopen a shard with fresh documents in index")
469        last_shard = self.shards[-1]
470        last_index = last_shard.get_index()
471        logger.info("reopening an incomplete shard of %i documents", len(last_shard))
472
473        self.fresh_docs = list(last_index.index)
474        self.fresh_nnz = last_shard.num_nnz
475        del self.shards[-1]  # remove the shard from index, *but its file on disk is not deleted*
476        logger.debug("reopen complete")
477
478    def query_shards(self, query):
479        """Apply shard[query] to each shard in `self.shards`. Used internally.
480
481        Parameters
482        ----------
483        query : {iterable of list of (int, number) , list of (int, number))}
484            Document in BoW format or corpus of documents.
485
486        Returns
487        -------
488        (None, list of individual shard query results)
489            Query results.
490
491        """
492        args = zip([query] * len(self.shards), self.shards)
493        if PARALLEL_SHARDS and PARALLEL_SHARDS > 1:
494            logger.debug("spawning %i query processes", PARALLEL_SHARDS)
495            pool = multiprocessing.Pool(PARALLEL_SHARDS)
496            result = pool.imap(query_shard, args, chunksize=1 + len(self.shards) / PARALLEL_SHARDS)
497        else:
498            # serial processing, one shard after another
499            pool = None
500            result = map(query_shard, args)
501        return pool, result
502
503    def __getitem__(self, query):
504        """Get similarities of the document (or corpus) `query` to all documents in the corpus.
505
506        Parameters
507        ----------
508        query : {iterable of list of (int, number) , list of (int, number))}
509            A single document in bag-of-words format, or a corpus (iterable) of such documents.
510
511        Return
512        ------
513        :class:`numpy.ndarray` or :class:`scipy.sparse.csr_matrix`
514            Similarities of the query against this index.
515
516        Notes
517        -----
518        If `query` is a corpus (iterable of documents), return a matrix of similarities of
519        all query documents vs. all corpus document. This batch query is more efficient than computing the similarities
520        one document after another.
521
522        Examples
523        --------
524        .. sourcecode:: pycon
525
526            >>> from gensim.corpora.textcorpus import TextCorpus
527            >>> from gensim.test.utils import datapath
528            >>> from gensim.similarities import Similarity
529            >>>
530            >>> corpus = TextCorpus(datapath('testcorpus.txt'))
531            >>> index = Similarity('temp', corpus, num_features=400)
532            >>> result = index[corpus]  # pairwise similarities of each document against each document
533
534        """
535        self.close_shard()  # no-op if no documents added to index since last query
536
537        # reset num_best and normalize parameters, in case they were changed dynamically
538        for shard in self.shards:
539            shard.num_best = self.num_best
540            shard.normalize = self.norm
541
542        # there are 4 distinct code paths, depending on whether input `query` is
543        # a corpus (or numpy/scipy matrix) or a single document, and whether the
544        # similarity result should be a full array or only num_best most similar
545        # documents.
546        pool, shard_results = self.query_shards(query)
547        if self.num_best is None:
548            # user asked for all documents => just stack the sub-results into a single matrix
549            # (works for both corpus / single doc query)
550            result = numpy.hstack(list(shard_results))
551        else:
552            # the following uses a lot of lazy evaluation and (optionally) parallel
553            # processing, to improve query latency and minimize memory footprint.
554            offsets = numpy.cumsum([0] + [len(shard) for shard in self.shards])
555
556            def convert(shard_no, doc):
557                return [(doc_index + offsets[shard_no], sim) for doc_index, sim in doc]
558
559            is_corpus, query = utils.is_corpus(query)
560            is_corpus = is_corpus or hasattr(query, 'ndim') and query.ndim > 1 and query.shape[0] > 1
561            if not is_corpus:
562                # user asked for num_best most similar and query is a single doc
563                results = (convert(shard_no, result) for shard_no, result in enumerate(shard_results))
564                result = _nlargest(self.num_best, results)
565            else:
566                # the trickiest combination: returning num_best results when query was a corpus
567                results = []
568                for shard_no, result in enumerate(shard_results):
569                    shard_result = [convert(shard_no, doc) for doc in result]
570                    results.append(shard_result)
571                result = []
572                for parts in zip(*results):
573                    merged = _nlargest(self.num_best, parts)
574                    result.append(merged)
575        if pool:
576            # gc doesn't seem to collect the Pools, eventually leading to
577            # "IOError 24: too many open files". so let's terminate it manually.
578            pool.terminate()
579
580        return result
581
582    def vector_by_id(self, docpos):
583        """Get the indexed vector corresponding to the document at position `docpos`.
584
585        Parameters
586        ----------
587        docpos : int
588            Document position
589
590        Return
591        ------
592        :class:`scipy.sparse.csr_matrix`
593            Indexed vector.
594
595        Examples
596        --------
597        .. sourcecode:: pycon
598
599            >>> from gensim.corpora.textcorpus import TextCorpus
600            >>> from gensim.test.utils import datapath
601            >>> from gensim.similarities import Similarity
602            >>>
603            >>> # Create index:
604            >>> corpus = TextCorpus(datapath('testcorpus.txt'))
605            >>> index = Similarity('temp', corpus, num_features=400)
606            >>> vector = index.vector_by_id(1)
607
608        """
609        self.close_shard()  # no-op if no documents added to index since last query
610        pos = 0
611        for shard in self.shards:
612            pos += len(shard)
613            if docpos < pos:
614                break
615        if not self.shards or docpos < 0 or docpos >= pos:
616            raise ValueError("invalid document position: %s (must be 0 <= x < %s)" % (docpos, len(self)))
617        result = shard.get_document_id(docpos - pos + len(shard))
618        return result
619
620    def similarity_by_id(self, docpos):
621        """Get similarity of a document specified by its index position `docpos`.
622
623        Parameters
624        ----------
625        docpos : int
626            Document position in the index.
627
628        Return
629        ------
630        :class:`numpy.ndarray` or :class:`scipy.sparse.csr_matrix`
631            Similarities of the given document against this index.
632
633        Examples
634        --------
635        .. sourcecode:: pycon
636
637            >>> from gensim.corpora.textcorpus import TextCorpus
638            >>> from gensim.test.utils import datapath
639            >>> from gensim.similarities import Similarity
640            >>>
641            >>> corpus = TextCorpus(datapath('testcorpus.txt'))
642            >>> index = Similarity('temp', corpus, num_features=400)
643            >>> similarities = index.similarity_by_id(1)
644
645        """
646        query = self.vector_by_id(docpos)
647        norm, self.norm = self.norm, False
648        result = self[query]
649        self.norm = norm
650        return result
651
652    def __iter__(self):
653        """For each index document in index, compute cosine similarity against all other documents in the index.
654        Uses :meth:`~gensim.similarities.docsim.Similarity.iter_chunks` internally.
655
656        Yields
657        ------
658        :class:`numpy.ndarray` or :class:`scipy.sparse.csr_matrix`
659            Similarities of each document in turn against the index.
660
661        """
662        # turn off query normalization (vectors in the index are already normalized, save some CPU)
663        norm, self.norm = self.norm, False
664
665        for chunk in self.iter_chunks():
666            if chunk.shape[0] > 1:
667                for sim in self[chunk]:
668                    yield sim
669            else:
670                yield self[chunk]
671
672        self.norm = norm  # restore normalization
673
674    def iter_chunks(self, chunksize=None):
675        """Iteratively yield the index as chunks of document vectors, each of size <= chunksize.
676
677        Parameters
678        ----------
679        chunksize : int, optional
680            Size of chunk,, if None - `self.chunksize` will be used.
681
682        Yields
683        ------
684        :class:`numpy.ndarray` or :class:`scipy.sparse.csr_matrix`
685            Chunks of the index as 2D arrays. The arrays are either dense or sparse, depending on
686            whether the shard was storing dense or sparse vectors.
687
688        """
689        self.close_shard()
690
691        if chunksize is None:
692            # if not explicitly specified, use the chunksize from the constructor
693            chunksize = self.chunksize
694
695        for shard in self.shards:
696            query = shard.get_index().index
697            for chunk_start in range(0, query.shape[0], chunksize):
698                # scipy.sparse doesn't allow slicing beyond real size of the matrix
699                # (unlike numpy). so, clip the end of the chunk explicitly to make
700                # scipy.sparse happy
701                chunk_end = min(query.shape[0], chunk_start + chunksize)
702                chunk = query[chunk_start: chunk_end]  # create a view
703                yield chunk
704
705    def check_moved(self):
706        """Update shard locations, for case where the server prefix location changed on the filesystem."""
707        dirname = os.path.dirname(self.output_prefix)
708        for shard in self.shards:
709            shard.dirname = dirname
710
711    def save(self, fname=None, *args, **kwargs):
712        """Save the index object via pickling under `fname`. See also :meth:`~gensim.docsim.Similarity.load()`.
713
714        Parameters
715        ----------
716        fname : str, optional
717            Path for save index, if not provided - will be saved to `self.output_prefix`.
718        *args : object
719            Arguments, see :meth:`gensim.utils.SaveLoad.save`.
720        **kwargs : object
721            Keyword arguments, see :meth:`gensim.utils.SaveLoad.save`.
722
723        Notes
724        -----
725        Will call :meth:`~gensim.similarities.Similarity.close_shard` internally to spill
726        any unfinished shards to disk first.
727
728        Examples
729        --------
730        .. sourcecode:: pycon
731
732            >>> from gensim.corpora.textcorpus import TextCorpus
733            >>> from gensim.test.utils import datapath, get_tmpfile
734            >>> from gensim.similarities import Similarity
735            >>>
736            >>> temp_fname = get_tmpfile("index")
737            >>> output_fname = get_tmpfile("saved_index")
738            >>>
739            >>> corpus = TextCorpus(datapath('testcorpus.txt'))
740            >>> index = Similarity(output_fname, corpus, num_features=400)
741            >>>
742            >>> index.save(output_fname)
743            >>> loaded_index = index.load(output_fname)
744
745        """
746        self.close_shard()
747        if fname is None:
748            fname = self.output_prefix
749        super(Similarity, self).save(fname, *args, **kwargs)
750
751    def destroy(self):
752        """Delete all files under self.output_prefix Index is not usable anymore after calling this method."""
753        import glob
754        for fname in glob.glob(self.output_prefix + '*'):
755            logger.info("deleting %s", fname)
756            os.remove(fname)
757
758
759class MatrixSimilarity(interfaces.SimilarityABC):
760    """Compute cosine similarity against a corpus of documents by storing the index matrix in memory.
761
762    Unless the entire matrix fits into main memory, use :class:`~gensim.similarities.docsim.Similarity` instead.
763
764    Examples
765    --------
766    .. sourcecode:: pycon
767
768        >>> from gensim.test.utils import common_corpus, common_dictionary
769        >>> from gensim.similarities import MatrixSimilarity
770        >>>
771        >>> query = [(1, 2), (5, 4)]
772        >>> index = MatrixSimilarity(common_corpus, num_features=len(common_dictionary))
773        >>> sims = index[query]
774
775    """
776    def __init__(self, corpus, num_best=None, dtype=numpy.float32, num_features=None, chunksize=256, corpus_len=None):
777        """
778
779        Parameters
780        ----------
781        corpus : iterable of list of (int, number)
782            Corpus in streamed Gensim bag-of-words format.
783        num_best : int, optional
784            If set, return only the `num_best` most similar documents, always leaving out documents with similarity = 0.
785            Otherwise, return a full vector with one float for every document in the index.
786        num_features : int
787            Size of the dictionary (number of features).
788        corpus_len : int, optional
789            Number of documents in `corpus`. If not specified, will scan the corpus to determine the matrix size.
790        chunksize : int, optional
791            Size of query chunks. Used internally when the query is an entire corpus.
792        dtype : numpy.dtype, optional
793            Datatype to store the internal matrix in.
794
795        """
796        if num_features is None:
797            logger.warning(
798                "scanning corpus to determine the number of features (consider setting `num_features` explicitly)"
799            )
800            num_features = 1 + utils.get_max_id(corpus)
801
802        self.num_features = num_features
803        self.num_best = num_best
804        self.normalize = True
805        self.chunksize = chunksize
806        if corpus_len is None:
807            corpus_len = len(corpus)
808
809        if corpus is not None:
810            if self.num_features <= 0:
811                raise ValueError(
812                    "cannot index a corpus with zero features (you must specify either `num_features` "
813                    "or a non-empty corpus in the constructor)"
814                )
815            logger.info("creating matrix with %i documents and %i features", corpus_len, num_features)
816            self.index = numpy.empty(shape=(corpus_len, num_features), dtype=dtype)
817            # iterate over corpus, populating the numpy index matrix with (normalized)
818            # document vectors
819            for docno, vector in enumerate(corpus):
820                if docno % 1000 == 0:
821                    logger.debug("PROGRESS: at document #%i/%i", docno, corpus_len)
822                # individual documents in fact may be in numpy.scipy.sparse format as well.
823                # it's not documented because other it's not fully supported throughout.
824                # the user better know what he's doing (no normalization, must
825                # explicitly supply num_features etc).
826                if isinstance(vector, numpy.ndarray):
827                    pass
828                elif scipy.sparse.issparse(vector):
829                    vector = vector.toarray().flatten()
830                else:
831                    vector = matutils.unitvec(matutils.sparse2full(vector, num_features))
832                self.index[docno] = vector
833
834    def __len__(self):
835        return self.index.shape[0]
836
837    def get_similarities(self, query):
838        """Get similarity between `query` and this index.
839
840        Warnings
841        --------
842        Do not use this function directly, use the :class:`~gensim.similarities.docsim.MatrixSimilarity.__getitem__`
843        instead.
844
845        Parameters
846        ----------
847        query : {list of (int, number), iterable of list of (int, number), :class:`scipy.sparse.csr_matrix`}
848            Document or collection of documents.
849
850        Return
851        ------
852        :class:`numpy.ndarray`
853            Similarity matrix.
854
855        """
856        is_corpus, query = utils.is_corpus(query)
857        if is_corpus:
858            query = numpy.asarray(
859                [matutils.sparse2full(vec, self.num_features) for vec in query],
860                dtype=self.index.dtype
861            )
862        else:
863            if scipy.sparse.issparse(query):
864                query = query.toarray()  # convert sparse to dense
865            elif isinstance(query, numpy.ndarray):
866                pass
867            else:
868                # default case: query is a single vector in sparse gensim format
869                query = matutils.sparse2full(query, self.num_features)
870            query = numpy.asarray(query, dtype=self.index.dtype)
871
872        # do a little transposition dance to stop numpy from making a copy of
873        # self.index internally in numpy.dot (very slow).
874        result = numpy.dot(self.index, query.T).T  # return #queries x #index
875        return result  # XXX: removed casting the result from array to list; does anyone care?
876
877    def __str__(self):
878        return "%s<%i docs, %i features>" % (self.__class__.__name__, len(self), self.index.shape[1])
879
880
881class SoftCosineSimilarity(interfaces.SimilarityABC):
882    """Compute soft cosine similarity against a corpus of documents by storing the index matrix in memory.
883
884    Examples
885    --------
886    .. sourcecode:: pycon
887
888        >>> from gensim.test.utils import common_texts
889        >>> from gensim.corpora import Dictionary
890        >>> from gensim.models import Word2Vec
891        >>> from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix
892        >>> from gensim.similarities import WordEmbeddingSimilarityIndex
893        >>>
894        >>> model = Word2Vec(common_texts, vector_size=20, min_count=1)  # train word-vectors
895        >>> termsim_index = WordEmbeddingSimilarityIndex(model.wv)
896        >>> dictionary = Dictionary(common_texts)
897        >>> bow_corpus = [dictionary.doc2bow(document) for document in common_texts]
898        >>> similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)  # construct similarity matrix
899        >>> docsim_index = SoftCosineSimilarity(bow_corpus, similarity_matrix, num_best=10)
900        >>>
901        >>> query = 'graph trees computer'.split()  # make a query
902        >>> sims = docsim_index[dictionary.doc2bow(query)]  # calculate similarity of query to each doc from bow_corpus
903
904    Check out `the Gallery <https://radimrehurek.com/gensim/auto_examples/tutorials/run_scm.html>`__
905    for more examples.
906
907    """
908    def __init__(self, corpus, similarity_matrix, num_best=None, chunksize=256, normalized=(True, True)):
909        """
910
911        Parameters
912        ----------
913        corpus: iterable of list of (int, float)
914            A list of documents in the BoW format.
915        similarity_matrix : :class:`gensim.similarities.SparseTermSimilarityMatrix`
916            A term similarity matrix.
917        num_best : int, optional
918            The number of results to retrieve for a query, if None - return similarities with all elements from corpus.
919        chunksize: int, optional
920            Size of one corpus chunk.
921        normalized : tuple of {True, False, 'maintain'}, optional
922            First/second value specifies whether the query/document vectors in the inner product
923            will be L2-normalized (True; corresponds to the soft cosine similarity measure; default),
924            maintain their L2-norm during change of basis ('maintain'; corresponds to query
925            expansion with partial membership), or kept as-is (False;
926            corresponds to query expansion).
927
928        See Also
929        --------
930        :class:`~gensim.similarities.termsim.SparseTermSimilarityMatrix`
931            A sparse term similarity matrix built using a term similarity index.
932        :class:`~gensim.similarities.termsim.LevenshteinSimilarityIndex`
933            A term similarity index that computes Levenshtein similarities between terms.
934        :class:`~gensim.similarities.termsim.WordEmbeddingSimilarityIndex`
935            A term similarity index that computes cosine similarities between word embeddings.
936
937        """
938        self.similarity_matrix = similarity_matrix
939
940        self.corpus = corpus
941        self.num_best = num_best
942        self.chunksize = chunksize
943        self.normalized = normalized
944
945        # Normalization of features is undesirable, since soft cosine similarity requires special
946        # normalization using the similarity matrix. Therefore, we would just be normalizing twice,
947        # increasing the numerical error.
948        self.normalize = False
949
950        # index is simply an array from 0 to size of corpus.
951        self.index = numpy.arange(len(corpus))
952
953    def __len__(self):
954        return len(self.corpus)
955
956    def get_similarities(self, query):
957        """Get similarity between `query` and this index.
958
959        Warnings
960        --------
961        Do not use this function directly; use the `self[query]` syntax instead.
962
963        Parameters
964        ----------
965        query : {list of (int, number), iterable of list of (int, number)}
966            Document or collection of documents.
967
968        Return
969        ------
970        :class:`numpy.ndarray`
971            Similarity matrix.
972
973        """
974        if not self.corpus:
975            return numpy.array()
976
977        is_corpus, query = utils.is_corpus(query)
978        if not is_corpus and isinstance(query, numpy.ndarray):
979            query = [self.corpus[i] for i in query]  # convert document indexes to actual documents
980        result = self.similarity_matrix.inner_product(query, self.corpus, normalized=self.normalized)
981
982        if scipy.sparse.issparse(result):
983            return numpy.asarray(result.todense())
984        if numpy.isscalar(result):
985            return numpy.array(result)
986        return numpy.asarray(result)[0]
987
988    def __str__(self):
989        return "%s<%i docs, %i features>" % (self.__class__.__name__, len(self), self.similarity_matrix.shape[0])
990
991
992class WmdSimilarity(interfaces.SimilarityABC):
993    """Compute negative WMD similarity against a corpus of documents.
994
995    Check out `the Gallery <https://radimrehurek.com/gensim/auto_examples/tutorials/run_wmd.html>`__
996    for more examples.
997
998    When using this code, please consider citing the following papers:
999
1000    * `Ofir Pele and Michael Werman, "A linear time histogram metric for improved SIFT matching"
1001      <http://www.cs.huji.ac.il/~werman/Papers/ECCV2008.pdf>`_
1002    * `Ofir Pele and Michael Werman, "Fast and robust earth mover's distances"
1003      <http://www.cs.huji.ac.il/~werman/Papers/ICCV2009.pdf>`_
1004    * `Matt Kusner et al. "From Word Embeddings To Document Distances"
1005      <http://proceedings.mlr.press/v37/kusnerb15.pdf>`_
1006
1007    Example
1008    -------
1009    .. sourcecode:: pycon
1010
1011        >>> from gensim.test.utils import common_texts
1012        >>> from gensim.models import Word2Vec
1013        >>> from gensim.similarities import WmdSimilarity
1014        >>>
1015        >>> model = Word2Vec(common_texts, vector_size=20, min_count=1)  # train word-vectors
1016        >>>
1017        >>> index = WmdSimilarity(common_texts, model)
1018        >>> # Make query.
1019        >>> query = ['trees']
1020        >>> sims = index[query]
1021
1022    """
1023    def __init__(self, corpus, kv_model, num_best=None, chunksize=256):
1024        """
1025
1026        Parameters
1027        ----------
1028        corpus: iterable of list of str
1029            A list of documents, each of which is a list of tokens.
1030        kv_model: :class:`~gensim.models.keyedvectors.KeyedVectors`
1031            A set of KeyedVectors
1032        num_best: int, optional
1033            Number of results to retrieve.
1034        chunksize : int, optional
1035            Size of chunk.
1036
1037        """
1038        self.corpus = corpus
1039        self.wv = kv_model
1040        self.num_best = num_best
1041        self.chunksize = chunksize
1042
1043        # Normalization of features is not possible, as corpus is a list (of lists) of strings.
1044        self.normalize = False
1045
1046        # index is simply an array from 0 to size of corpus.
1047        self.index = numpy.arange(len(corpus))
1048
1049    def __len__(self):
1050        """Get size of corpus."""
1051        return len(self.corpus)
1052
1053    def get_similarities(self, query):
1054        """Get similarity between `query` and this index.
1055
1056        Warnings
1057        --------
1058        Do not use this function directly; use the `self[query]` syntax instead.
1059
1060        Parameters
1061        ----------
1062        query : {list of str, iterable of list of str}
1063            Document or collection of documents.
1064
1065        Return
1066        ------
1067        :class:`numpy.ndarray`
1068            Similarity matrix.
1069
1070        """
1071        if isinstance(query, numpy.ndarray):
1072            # Convert document indexes to actual documents.
1073            query = [self.corpus[i] for i in query]
1074
1075        if not query or not isinstance(query[0], list):
1076            query = [query]
1077
1078        n_queries = len(query)
1079        result = []
1080        for qidx in range(n_queries):
1081            # Compute similarity for each query.
1082            qresult = [self.wv.wmdistance(document, query[qidx]) for document in self.corpus]
1083            qresult = numpy.array(qresult)
1084            qresult = 1. / (1. + qresult)  # Similarity is the negative of the distance.
1085
1086            # Append single query result to list of all results.
1087            result.append(qresult)
1088
1089        if len(result) == 1:
1090            # Only one query.
1091            result = result[0]
1092        else:
1093            result = numpy.array(result)
1094
1095        return result
1096
1097    def __str__(self):
1098        return "%s<%i docs, %i features>" % (self.__class__.__name__, len(self), self.w2v_model.wv.syn0.shape[1])
1099
1100
1101class SparseMatrixSimilarity(interfaces.SimilarityABC):
1102    """Compute cosine similarity against a corpus of documents by storing the index matrix in memory.
1103
1104    Notes
1105    -----
1106    Use this if your input corpus contains sparse vectors (such as TF-IDF documents) and fits into RAM.
1107
1108    The matrix is internally stored as a :class:`scipy.sparse.csr_matrix` matrix. Unless the entire
1109    matrix fits into main memory, use :class:`~gensim.similarities.docsim.Similarity` instead.
1110
1111    Takes an optional `maintain_sparsity` argument, setting this to True
1112    causes `get_similarities` to return a sparse matrix instead of a
1113    dense representation if possible.
1114
1115    See also
1116    --------
1117    :class:`~gensim.similarities.docsim.Similarity`
1118        Index similarity (wrapper for other inheritors of :class:`~gensim.interfaces.SimilarityABC`).
1119    :class:`~gensim.similarities.docsim.MatrixSimilarity`
1120        Index similarity (dense with cosine distance).
1121
1122    """
1123    def __init__(self, corpus, num_features=None, num_terms=None, num_docs=None, num_nnz=None,
1124                 num_best=None, chunksize=500, dtype=numpy.float32, maintain_sparsity=False):
1125        """
1126
1127        Parameters
1128        ----------
1129        corpus: iterable of list of (int, float)
1130            A list of documents in the BoW format.
1131        num_features : int, optional
1132            Size of the dictionary. Must be either specified, or present in `corpus.num_terms`.
1133        num_terms : int, optional
1134            Alias for `num_features`, you can use either.
1135        num_docs : int, optional
1136            Number of documents in `corpus`. Will be calculated if not provided.
1137        num_nnz : int, optional
1138            Number of non-zero elements in `corpus`. Will be calculated if not provided.
1139        num_best : int, optional
1140            If set, return only the `num_best` most similar documents, always leaving out documents with similarity = 0.
1141            Otherwise, return a full vector with one float for every document in the index.
1142        chunksize : int, optional
1143            Size of query chunks. Used internally when the query is an entire corpus.
1144        dtype : numpy.dtype, optional
1145            Data type of the internal matrix.
1146        maintain_sparsity : bool, optional
1147            Return sparse arrays from :meth:`~gensim.similarities.docsim.SparseMatrixSimilarity.get_similarities`?
1148
1149        """
1150        self.num_best = num_best
1151        self.normalize = True
1152        self.chunksize = chunksize
1153        self.maintain_sparsity = maintain_sparsity
1154
1155        if corpus is not None:
1156            logger.info("creating sparse index")
1157
1158            # iterate over input corpus, populating the sparse index matrix
1159            try:
1160                # use the more efficient corpus generation version, if the input
1161                # `corpus` is MmCorpus-like (knows its shape and number of non-zeroes).
1162                num_terms, num_docs, num_nnz = corpus.num_terms, corpus.num_docs, corpus.num_nnz
1163                logger.debug("using efficient sparse index creation")
1164            except AttributeError:
1165                # no MmCorpus, use the slower version (or maybe user supplied the
1166                # num_* params in constructor)
1167                pass
1168            if num_features is not None:
1169                # num_terms is just an alias for num_features, for compatibility with MatrixSimilarity
1170                num_terms = num_features
1171            if num_terms is None:
1172                raise ValueError("refusing to guess the number of sparse features: specify num_features explicitly")
1173            corpus = (matutils.scipy2sparse(v) if scipy.sparse.issparse(v) else
1174                      (matutils.full2sparse(v) if isinstance(v, numpy.ndarray) else
1175                       matutils.unitvec(v)) for v in corpus)
1176            self.index = matutils.corpus2csc(
1177                corpus, num_terms=num_terms, num_docs=num_docs, num_nnz=num_nnz,
1178                dtype=dtype, printprogress=10000
1179            ).T
1180
1181            # convert to Compressed Sparse Row for efficient row slicing and multiplications
1182            self.index = self.index.tocsr()  # currently no-op, CSC.T is already CSR
1183            logger.info("created %r", self.index)
1184
1185    def __len__(self):
1186        """Get size of index."""
1187        return self.index.shape[0]
1188
1189    def get_similarities(self, query):
1190        """Get similarity between `query` and this index.
1191
1192        Warnings
1193        --------
1194        Do not use this function directly; use the `self[query]` syntax instead.
1195
1196        Parameters
1197        ----------
1198        query : {list of (int, number), iterable of list of (int, number), :class:`scipy.sparse.csr_matrix`}
1199            Document or collection of documents.
1200
1201        Return
1202        ------
1203        :class:`numpy.ndarray`
1204            Similarity matrix (if maintain_sparsity=False) **OR**
1205        :class:`scipy.sparse.csc`
1206            otherwise
1207
1208        """
1209        is_corpus, query = utils.is_corpus(query)
1210        if is_corpus:
1211            query = matutils.corpus2csc(query, self.index.shape[1], dtype=self.index.dtype)
1212        else:
1213            if scipy.sparse.issparse(query):
1214                query = query.T  # convert documents=rows to documents=columns
1215            elif isinstance(query, numpy.ndarray):
1216                if query.ndim == 1:
1217                    query.shape = (1, len(query))
1218                query = scipy.sparse.csr_matrix(query, dtype=self.index.dtype).T
1219            else:
1220                # default case: query is a single vector, in sparse gensim format
1221                query = matutils.corpus2csc([query], self.index.shape[1], dtype=self.index.dtype)
1222
1223        # compute cosine similarity against every other document in the collection
1224        result = self.index * query.tocsc()  # N x T * T x C = N x C
1225        if result.shape[1] == 1 and not is_corpus:
1226            # for queries of one document, return a 1d array
1227            result = result.toarray().flatten()
1228        elif self.maintain_sparsity:
1229            # avoid converting to dense array if maintaining sparsity
1230            result = result.T
1231        else:
1232            # otherwise, return a 2d matrix (#queries x #index)
1233            result = result.toarray().T
1234        return result
1235