1#!/usr/bin/env python 2# -*- coding: utf-8 -*- 3# 4# Copyright (C) 2013 Radim Rehurek <radimrehurek@seznam.cz> 5# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 7"""Compute similarities across a collection of documents in the Vector Space Model. 8 9The main class is :class:`~gensim.similarities.docsim.Similarity`, which builds an index for a given set of documents. 10 11Once the index is built, you can perform efficient queries like "Tell me how similar is this query document to each 12document in the index?". The result is a vector of numbers as large as the size of the initial set of documents, 13that is, one float for each index document. Alternatively, you can also request only the top-N most 14similar index documents to the query. 15 16 17How It Works 18------------ 19The :class:`~gensim.similarities.docsim.Similarity` class splits the index into several smaller sub-indexes ("shards"), 20which are disk-based. If your entire index fits in memory (~one million documents per 1GB of RAM), 21you can also use the :class:`~gensim.similarities.docsim.MatrixSimilarity` 22or :class:`~gensim.similarities.docsim.SparseMatrixSimilarity` classes directly. 23These are more simple but do not scale as well: they keep the entire index in RAM, no sharding. They also do not 24support adding new document to the index dynamically. 25 26Once the index has been initialized, you can query for document similarity simply by 27 28.. sourcecode:: pycon 29 30 >>> from gensim.test.utils import common_corpus, common_dictionary, get_tmpfile 31 >>> 32 >>> index_tmpfile = get_tmpfile("index") 33 >>> query = [(1, 2), (6, 1), (7, 2)] 34 >>> 35 >>> index = Similarity(index_tmpfile, common_corpus, num_features=len(common_dictionary)) # build the index 36 >>> similarities = index[query] # get similarities between the query and all index documents 37 38If you have more query documents, you can submit them all at once, in a batch 39 40.. 
sourcecode:: pycon 41 42 >>> from gensim.test.utils import common_corpus, common_dictionary, get_tmpfile 43 >>> 44 >>> index_tmpfile = get_tmpfile("index") 45 >>> batch_of_documents = common_corpus[:] # only as example 46 >>> index = Similarity(index_tmpfile, common_corpus, num_features=len(common_dictionary)) # build the index 47 >>> 48 >>> # the batch is simply an iterable of documents, aka gensim corpus: 49 >>> for similarities in index[batch_of_documents]: 50 ... pass 51 52The benefit of this batch (aka "chunked") querying is a much better performance. 53To see the speed-up on your machine, run ``python -m gensim.test.simspeed`` 54(compare to my results `here <http://groups.google.com/group/gensim/msg/4f6f171a869e4fca?>`_). 55 56There is also a special syntax for when you need similarity of documents in the index 57to the index itself (i.e. queries = the indexed documents themselves). This special syntax 58uses the faster, batch queries internally and **is ideal for all-vs-all pairwise similarities**: 59 60.. sourcecode:: pycon 61 62 >>> from gensim.test.utils import common_corpus, common_dictionary, get_tmpfile 63 >>> 64 >>> index_tmpfile = get_tmpfile("index") 65 >>> index = Similarity(index_tmpfile, common_corpus, num_features=len(common_dictionary)) # build the index 66 >>> 67 >>> for similarities in index: # yield similarities of the 1st indexed document, then 2nd... 68 ... pass 69 70""" 71import logging 72import itertools 73import os 74import heapq 75 76import numpy 77import scipy.sparse 78 79from gensim import interfaces, utils, matutils 80 81 82logger = logging.getLogger(__name__) 83 84PARALLEL_SHARDS = False 85try: 86 import multiprocessing 87 # by default, don't parallelize queries. uncomment the following line if you want that. 
88# PARALLEL_SHARDS = multiprocessing.cpu_count() # use #parallel processes = #CPus 89except ImportError: 90 pass 91 92 93class Shard(utils.SaveLoad): 94 """A proxy that represents a single shard instance within :class:`~gensim.similarity.docsim.Similarity` index. 95 96 Basically just wraps :class:`~gensim.similarities.docsim.MatrixSimilarity`, 97 :class:`~gensim.similarities.docsim.SparseMatrixSimilarity`, etc, so that it mmaps from disk on request (query). 98 99 """ 100 def __init__(self, fname, index): 101 """ 102 103 Parameters 104 ---------- 105 fname : str 106 Path to top-level directory (file) to traverse for corpus documents. 107 index : :class:`~gensim.interfaces.SimilarityABC` 108 Index object. 109 110 """ 111 self.dirname, self.fname = os.path.split(fname) 112 self.length = len(index) 113 self.cls = index.__class__ 114 logger.info("saving index shard to %s", self.fullname()) 115 index.save(self.fullname()) 116 self.index = self.get_index() 117 118 def fullname(self): 119 """Get full path to shard file. 120 121 Return 122 ------ 123 str 124 Path to shard instance. 125 126 """ 127 return os.path.join(self.dirname, self.fname) 128 129 def __len__(self): 130 """Get length.""" 131 return self.length 132 133 def __getstate__(self): 134 """Special handler for pickle. 135 136 Returns 137 ------- 138 dict 139 Object that contains state of current instance without `index`. 140 141 """ 142 result = self.__dict__.copy() 143 # (S)MS objects must be loaded via load() because of mmap (simple pickle.load won't do) 144 if 'index' in result: 145 del result['index'] 146 return result 147 148 def __str__(self): 149 return "%s Shard(%i documents in %s)" % (self.cls.__name__, len(self), self.fullname()) 150 151 def get_index(self): 152 """Load & get index. 153 154 Returns 155 ------- 156 :class:`~gensim.interfaces.SimilarityABC` 157 Index instance. 
158 159 """ 160 if not hasattr(self, 'index'): 161 logger.debug("mmaping index from %s", self.fullname()) 162 self.index = self.cls.load(self.fullname(), mmap='r') 163 return self.index 164 165 def get_document_id(self, pos): 166 """Get index vector at position `pos`. 167 168 Parameters 169 ---------- 170 pos : int 171 Vector position. 172 173 Return 174 ------ 175 {:class:`scipy.sparse.csr_matrix`, :class:`numpy.ndarray`} 176 Index vector. Type depends on underlying index. 177 178 Notes 179 ----- 180 The vector is of the same type as the underlying index (ie., dense for 181 :class:`~gensim.similarities.docsim.MatrixSimilarity` 182 and scipy.sparse for :class:`~gensim.similarities.docsim.SparseMatrixSimilarity`. 183 184 """ 185 assert 0 <= pos < len(self), "requested position out of range" 186 return self.get_index().index[pos] 187 188 def __getitem__(self, query): 189 """Get similarities of document (or corpus) `query` to all documents in the corpus. 190 191 Parameters 192 ---------- 193 query : {iterable of list of (int, number) , list of (int, number))} 194 Document or corpus. 195 196 Returns 197 ------- 198 :class:`numpy.ndarray` 199 Similarities of document/corpus if index is :class:`~gensim.similarities.docsim.MatrixSimilarity` **or** 200 :class:`scipy.sparse.csr_matrix` 201 for case if index is :class:`~gensim.similarities.docsim.SparseMatrixSimilarity`. 202 203 """ 204 index = self.get_index() 205 try: 206 index.num_best = self.num_best 207 index.normalize = self.normalize 208 except Exception: 209 raise ValueError("num_best and normalize have to be set before querying a proxy Shard object") 210 return index[query] 211 212 213def query_shard(args): 214 """Helper for request query from shard, same as shard[query]. 
215 216 Parameters 217 --------- 218 args : (list of (int, number), :class:`~gensim.interfaces.SimilarityABC`) 219 Query and Shard instances 220 221 Returns 222 ------- 223 :class:`numpy.ndarray` or :class:`scipy.sparse.csr_matrix` 224 Similarities of the query against documents indexed in this shard. 225 226 """ 227 query, shard = args # simulate starmap (not part of multiprocessing in older Pythons) 228 logger.debug("querying shard %s num_best=%s in process %s", shard, shard.num_best, os.getpid()) 229 result = shard[query] 230 logger.debug("finished querying shard %s in process %s", shard, os.getpid()) 231 return result 232 233 234def _nlargest(n, iterable): 235 """Helper for extracting n documents with maximum similarity. 236 237 Parameters 238 ---------- 239 n : int 240 Number of elements to be extracted 241 iterable : iterable of list of (int, float) 242 Iterable containing documents with computed similarities 243 244 Returns 245 ------- 246 :class:`list` 247 List with the n largest elements from the dataset defined by iterable. 248 249 Notes 250 ----- 251 Elements are compared by the absolute value of similarity, because negative value of similarity 252 does not mean some form of dissimilarity. 253 254 """ 255 return heapq.nlargest(n, itertools.chain(*iterable), key=lambda item: abs(item[1])) 256 257 258class Similarity(interfaces.SimilarityABC): 259 """Compute cosine similarity of a dynamic query against a corpus of documents ('the index'). 260 261 The index supports adding new documents dynamically. 262 263 Notes 264 ----- 265 Scalability is achieved by sharding the index into smaller pieces, each of which fits into core memory 266 The shards themselves are simply stored as files to disk and mmap'ed back as needed. 267 268 Examples 269 -------- 270 .. 
class Similarity(interfaces.SimilarityABC):
    """Compute cosine similarity of a dynamic query against a corpus of documents ('the index').

    The index supports adding new documents dynamically.

    Notes
    -----
    Scalability is achieved by sharding the index into smaller pieces, each of which fits into core memory.
    The shards themselves are simply stored as files to disk and mmap'ed back as needed.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.corpora.textcorpus import TextCorpus
        >>> from gensim.test.utils import datapath, get_tmpfile
        >>> from gensim.similarities import Similarity
        >>>
        >>> corpus = TextCorpus(datapath('testcorpus.mm'))
        >>> index_temp = get_tmpfile("index")
        >>> index = Similarity(index_temp, corpus, num_features=400)  # create index
        >>>
        >>> query = next(iter(corpus))
        >>> result = index[query]  # search similar to `query` in index
        >>>
        >>> for sims in index[corpus]:  # if you have more query documents, you can submit them all at once, in a batch
        ...     pass
        >>>
        >>> # There is also a special syntax for when you need similarity of documents in the index
        >>> # to the index itself (i.e. queries=indexed documents themselves). This special syntax
        >>> # uses the faster, batch queries internally and **is ideal for all-vs-all pairwise similarities**:
        >>> for similarities in index:  # yield similarities of the 1st indexed document, then 2nd...
        ...     pass

    See Also
    --------
    :class:`~gensim.similarities.docsim.MatrixSimilarity`
        Index similarity (dense with cosine distance).
    :class:`~gensim.similarities.docsim.SparseMatrixSimilarity`
        Index similarity (sparse with cosine distance).
    :class:`~gensim.similarities.docsim.WmdSimilarity`
        Index similarity (with word-mover distance).

    """

    def __init__(self, output_prefix, corpus, num_features, num_best=None, chunksize=256, shardsize=32768, norm='l2'):
        """

        Parameters
        ----------
        output_prefix : str
            Prefix for shard filename. If None, a random filename in temp will be used.
        corpus : iterable of list of (int, number)
            Corpus in streamed Gensim bag-of-words format.
        num_features : int
            Size of the dictionary (number of features).
        num_best : int, optional
            If set, return only the `num_best` most similar documents, always leaving out documents with similarity = 0.
            Otherwise, return a full vector with one float for every document in the index.
        chunksize : int, optional
            Size of query chunks. Used internally when the query is an entire corpus.
        shardsize : int, optional
            Maximum shard size, in documents. Choose a value so that a `shardsize x chunksize` matrix of floats fits
            comfortably into your RAM.
        norm : {'l1', 'l2'}, optional
            Normalization to use.

        Notes
        -----
        Documents are split (internally, transparently) into shards of `shardsize` documents each, and each shard
        converted to a matrix, for faster BLAS calls. Each shard is stored to disk under `output_prefix.shard_number`.

        If you don't specify an output prefix, a random filename in temp will be used.

        If your entire index fits in memory (~1 million documents per 1GB of RAM), you can also use the
        :class:`~gensim.similarities.docsim.MatrixSimilarity` or
        :class:`~gensim.similarities.docsim.SparseMatrixSimilarity` classes directly.
        These are more simple but do not scale as well (they keep the entire index in RAM, no sharding).
        They also do not support adding new documents dynamically.

        """
        if output_prefix is None:
            # undocumented feature: set output_prefix=None to create the server in temp
            self.output_prefix = utils.randfname(prefix='simserver')
        else:
            self.output_prefix = output_prefix
        logger.info("starting similarity index under %s", self.output_prefix)
        self.num_features = num_features
        self.num_best = num_best
        self.norm = norm
        self.chunksize = int(chunksize)
        self.shardsize = shardsize
        self.shards = []
        self.fresh_docs, self.fresh_nnz = [], 0

        if corpus is not None:
            self.add_documents(corpus)

    def __len__(self):
        """Get length of index (number of documents, both sharded and not-yet-sharded)."""
        return len(self.fresh_docs) + sum(len(shard) for shard in self.shards)

    def __str__(self):
        return "Similarity index with %i documents in %i shards (stored under %s)" % (
            len(self), len(self.shards), self.output_prefix
        )

    def add_documents(self, corpus):
        """Extend the index with new documents.

        Parameters
        ----------
        corpus : iterable of list of (int, number)
            Corpus in BoW format.

        Notes
        -----
        Internally, documents are buffered and then spilled to disk when there's `self.shardsize` of them
        (or when a query is issued).

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora.textcorpus import TextCorpus
            >>> from gensim.test.utils import datapath, get_tmpfile
            >>> from gensim.similarities import Similarity
            >>>
            >>> corpus = TextCorpus(datapath('testcorpus.mm'))
            >>> index_temp = get_tmpfile("index")
            >>> index = Similarity(index_temp, corpus, num_features=400)  # create index
            >>>
            >>> one_more_corpus = TextCorpus(datapath('testcorpus.txt'))
            >>> index.add_documents(one_more_corpus)  # add more documents in corpus

        """
        min_ratio = 1.0  # 0.5 to only reopen shards that are <50% complete
        if self.shards and len(self.shards[-1]) < min_ratio * self.shardsize:
            # The last shard was incomplete; load it back and add the documents there, don't start a new shard
            self.reopen_shard()
        for doc in corpus:
            if isinstance(doc, numpy.ndarray):
                doclen = len(doc)
            elif scipy.sparse.issparse(doc):
                doclen = doc.nnz
            else:
                doclen = len(doc)
            # keep the document sparse if it has <30% non-zeroes, otherwise densify it
            if doclen < 0.3 * self.num_features:
                doc = matutils.unitvec(matutils.corpus2csc([doc], self.num_features).T, self.norm)
            else:
                doc = matutils.unitvec(matutils.sparse2full(doc, self.num_features), self.norm)
            self.fresh_docs.append(doc)
            self.fresh_nnz += doclen
            if len(self.fresh_docs) >= self.shardsize:
                self.close_shard()
            if len(self.fresh_docs) % 10000 == 0:
                logger.info("PROGRESS: fresh_shard size=%i", len(self.fresh_docs))

    def shardid2filename(self, shardid):
        """Get shard file by `shardid`.

        Parameters
        ----------
        shardid : int
            Shard index.

        Return
        ------
        str
            Path to shard file.

        """
        if self.output_prefix.endswith('.'):
            return "%s%s" % (self.output_prefix, shardid)
        else:
            return "%s.%s" % (self.output_prefix, shardid)

    def close_shard(self):
        """Force the latest shard to close (be converted to a matrix and stored to disk).
        Do nothing if no new documents added since last call.

        Notes
        -----
        The shard is closed even if it is not full yet (its size is smaller than `self.shardsize`).
        If documents are added later via :meth:`~gensim.similarities.docsim.MatrixSimilarity.add_documents`
        this incomplete shard will be loaded again and completed.

        """
        if not self.fresh_docs:
            return
        shardid = len(self.shards)
        # consider the shard sparse if its density is < 30%
        issparse = 0.3 > 1.0 * self.fresh_nnz / (len(self.fresh_docs) * self.num_features)
        if issparse:
            index = SparseMatrixSimilarity(
                self.fresh_docs, num_terms=self.num_features, num_docs=len(self.fresh_docs), num_nnz=self.fresh_nnz
            )
        else:
            index = MatrixSimilarity(self.fresh_docs, num_features=self.num_features)
        logger.info("creating %s shard #%s", 'sparse' if issparse else 'dense', shardid)
        shard = Shard(self.shardid2filename(shardid), index)
        shard.num_best = self.num_best
        shard.num_nnz = self.fresh_nnz
        self.shards.append(shard)
        self.fresh_docs, self.fresh_nnz = [], 0

    def reopen_shard(self):
        """Reopen an incomplete shard, moving its documents back into the fresh (in-RAM) buffer."""
        assert self.shards
        if self.fresh_docs:
            raise ValueError("cannot reopen a shard with fresh documents in index")
        last_shard = self.shards[-1]
        last_index = last_shard.get_index()
        logger.info("reopening an incomplete shard of %i documents", len(last_shard))

        self.fresh_docs = list(last_index.index)
        self.fresh_nnz = last_shard.num_nnz
        del self.shards[-1]  # remove the shard from index, *but its file on disk is not deleted*
        logger.debug("reopen complete")

    def query_shards(self, query):
        """Apply shard[query] to each shard in `self.shards`. Used internally.

        Parameters
        ----------
        query : {iterable of list of (int, number) , list of (int, number))}
            Document in BoW format or corpus of documents.

        Returns
        -------
        (None or :class:`multiprocessing.pool.Pool`, iterable of individual shard query results)
            The pool (so the caller can terminate it) and the per-shard query results.

        """
        args = zip([query] * len(self.shards), self.shards)
        if PARALLEL_SHARDS and PARALLEL_SHARDS > 1:
            logger.debug("spawning %i query processes", PARALLEL_SHARDS)
            pool = multiprocessing.Pool(PARALLEL_SHARDS)
            # chunksize must be an int: `/` yields a float on Python 3, which
            # Pool.imap rejects with TypeError; use integer division instead
            result = pool.imap(query_shard, args, chunksize=1 + len(self.shards) // PARALLEL_SHARDS)
        else:
            # serial processing, one shard after another
            pool = None
            result = map(query_shard, args)
        return pool, result

    def __getitem__(self, query):
        """Get similarities of the document (or corpus) `query` to all documents in the corpus.

        Parameters
        ----------
        query : {iterable of list of (int, number) , list of (int, number))}
            A single document in bag-of-words format, or a corpus (iterable) of such documents.

        Return
        ------
        :class:`numpy.ndarray` or :class:`scipy.sparse.csr_matrix`
            Similarities of the query against this index.

        Notes
        -----
        If `query` is a corpus (iterable of documents), return a matrix of similarities of
        all query documents vs. all corpus document. This batch query is more efficient than computing the similarities
        one document after another.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora.textcorpus import TextCorpus
            >>> from gensim.test.utils import datapath
            >>> from gensim.similarities import Similarity
            >>>
            >>> corpus = TextCorpus(datapath('testcorpus.txt'))
            >>> index = Similarity('temp', corpus, num_features=400)
            >>> result = index[corpus]  # pairwise similarities of each document against each document

        """
        self.close_shard()  # no-op if no documents added to index since last query

        # reset num_best and normalize parameters, in case they were changed dynamically
        for shard in self.shards:
            shard.num_best = self.num_best
            shard.normalize = self.norm

        # there are 4 distinct code paths, depending on whether input `query` is
        # a corpus (or numpy/scipy matrix) or a single document, and whether the
        # similarity result should be a full array or only num_best most similar
        # documents.
        pool, shard_results = self.query_shards(query)
        if self.num_best is None:
            # user asked for all documents => just stack the sub-results into a single matrix
            # (works for both corpus / single doc query)
            result = numpy.hstack(list(shard_results))
        else:
            # the following uses a lot of lazy evaluation and (optionally) parallel
            # processing, to improve query latency and minimize memory footprint.
            offsets = numpy.cumsum([0] + [len(shard) for shard in self.shards])

            def convert(shard_no, doc):
                # translate shard-local document indexes to global index positions
                return [(doc_index + offsets[shard_no], sim) for doc_index, sim in doc]

            is_corpus, query = utils.is_corpus(query)
            is_corpus = is_corpus or hasattr(query, 'ndim') and query.ndim > 1 and query.shape[0] > 1
            if not is_corpus:
                # user asked for num_best most similar and query is a single doc
                results = (convert(shard_no, result) for shard_no, result in enumerate(shard_results))
                result = _nlargest(self.num_best, results)
            else:
                # the trickiest combination: returning num_best results when query was a corpus
                results = []
                for shard_no, result in enumerate(shard_results):
                    shard_result = [convert(shard_no, doc) for doc in result]
                    results.append(shard_result)
                result = []
                for parts in zip(*results):
                    merged = _nlargest(self.num_best, parts)
                    result.append(merged)
        if pool:
            # gc doesn't seem to collect the Pools, eventually leading to
            # "IOError 24: too many open files". so let's terminate it manually.
            pool.terminate()

        return result

    def vector_by_id(self, docpos):
        """Get the indexed vector corresponding to the document at position `docpos`.

        Parameters
        ----------
        docpos : int
            Document position.

        Return
        ------
        :class:`scipy.sparse.csr_matrix`
            Indexed vector.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora.textcorpus import TextCorpus
            >>> from gensim.test.utils import datapath
            >>> from gensim.similarities import Similarity
            >>>
            >>> # Create index:
            >>> corpus = TextCorpus(datapath('testcorpus.txt'))
            >>> index = Similarity('temp', corpus, num_features=400)
            >>> vector = index.vector_by_id(1)

        """
        self.close_shard()  # no-op if no documents added to index since last query
        pos = 0
        for shard in self.shards:
            pos += len(shard)
            if docpos < pos:
                break
        if not self.shards or docpos < 0 or docpos >= pos:
            raise ValueError("invalid document position: %s (must be 0 <= x < %s)" % (docpos, len(self)))
        result = shard.get_document_id(docpos - pos + len(shard))
        return result

    def similarity_by_id(self, docpos):
        """Get similarity of a document specified by its index position `docpos`.

        Parameters
        ----------
        docpos : int
            Document position in the index.

        Return
        ------
        :class:`numpy.ndarray` or :class:`scipy.sparse.csr_matrix`
            Similarities of the given document against this index.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora.textcorpus import TextCorpus
            >>> from gensim.test.utils import datapath
            >>> from gensim.similarities import Similarity
            >>>
            >>> corpus = TextCorpus(datapath('testcorpus.txt'))
            >>> index = Similarity('temp', corpus, num_features=400)
            >>> similarities = index.similarity_by_id(1)

        """
        query = self.vector_by_id(docpos)
        # the indexed vector is already normalized; skip re-normalizing it for this query
        norm, self.norm = self.norm, False
        result = self[query]
        self.norm = norm
        return result

    def __iter__(self):
        """For each index document in index, compute cosine similarity against all other documents in the index.
        Uses :meth:`~gensim.similarities.docsim.Similarity.iter_chunks` internally.

        Yields
        ------
        :class:`numpy.ndarray` or :class:`scipy.sparse.csr_matrix`
            Similarities of each document in turn against the index.

        """
        # turn off query normalization (vectors in the index are already normalized, save some CPU)
        norm, self.norm = self.norm, False

        for chunk in self.iter_chunks():
            if chunk.shape[0] > 1:
                for sim in self[chunk]:
                    yield sim
            else:
                yield self[chunk]

        self.norm = norm  # restore normalization

    def iter_chunks(self, chunksize=None):
        """Iteratively yield the index as chunks of document vectors, each of size <= chunksize.

        Parameters
        ----------
        chunksize : int, optional
            Size of chunk; if None, `self.chunksize` will be used.

        Yields
        ------
        :class:`numpy.ndarray` or :class:`scipy.sparse.csr_matrix`
            Chunks of the index as 2D arrays. The arrays are either dense or sparse, depending on
            whether the shard was storing dense or sparse vectors.

        """
        self.close_shard()

        if chunksize is None:
            # if not explicitly specified, use the chunksize from the constructor
            chunksize = self.chunksize

        for shard in self.shards:
            query = shard.get_index().index
            for chunk_start in range(0, query.shape[0], chunksize):
                # scipy.sparse doesn't allow slicing beyond real size of the matrix
                # (unlike numpy). so, clip the end of the chunk explicitly to make
                # scipy.sparse happy
                chunk_end = min(query.shape[0], chunk_start + chunksize)
                chunk = query[chunk_start: chunk_end]  # create a view
                yield chunk

    def check_moved(self):
        """Update shard locations, for case where the server prefix location changed on the filesystem."""
        dirname = os.path.dirname(self.output_prefix)
        for shard in self.shards:
            shard.dirname = dirname

    def save(self, fname=None, *args, **kwargs):
        """Save the index object via pickling under `fname`. See also :meth:`~gensim.docsim.Similarity.load()`.

        Parameters
        ----------
        fname : str, optional
            Path for save index, if not provided - will be saved to `self.output_prefix`.
        *args : object
            Arguments, see :meth:`gensim.utils.SaveLoad.save`.
        **kwargs : object
            Keyword arguments, see :meth:`gensim.utils.SaveLoad.save`.

        Notes
        -----
        Will call :meth:`~gensim.similarities.Similarity.close_shard` internally to spill
        any unfinished shards to disk first.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora.textcorpus import TextCorpus
            >>> from gensim.test.utils import datapath, get_tmpfile
            >>> from gensim.similarities import Similarity
            >>>
            >>> output_fname = get_tmpfile("saved_index")
            >>> corpus = TextCorpus(datapath('testcorpus.txt'))
            >>> index = Similarity(output_fname, corpus, num_features=400)
            >>>
            >>> index.save(output_fname)
            >>> loaded_index = index.load(output_fname)

        """
        self.close_shard()
        if fname is None:
            fname = self.output_prefix
        super(Similarity, self).save(fname, *args, **kwargs)

    def destroy(self):
        """Delete all files under self.output_prefix. Index is not usable anymore after calling this method."""
        import glob
        for fname in glob.glob(self.output_prefix + '*'):
            logger.info("deleting %s", fname)
            os.remove(fname)
class MatrixSimilarity(interfaces.SimilarityABC):
    """Compute cosine similarity against a corpus of documents by storing the index matrix in memory.

    Unless the entire matrix fits into main memory, use :class:`~gensim.similarities.docsim.Similarity` instead.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.test.utils import common_corpus, common_dictionary
        >>> from gensim.similarities import MatrixSimilarity
        >>>
        >>> query = [(1, 2), (5, 4)]
        >>> index = MatrixSimilarity(common_corpus, num_features=len(common_dictionary))
        >>> sims = index[query]

    """
    def __init__(self, corpus, num_best=None, dtype=numpy.float32, num_features=None, chunksize=256, corpus_len=None):
        """

        Parameters
        ----------
        corpus : iterable of list of (int, number)
            Corpus in streamed Gensim bag-of-words format.
        num_best : int, optional
            If set, return only the `num_best` most similar documents, always leaving out documents with similarity = 0.
            Otherwise, return a full vector with one float for every document in the index.
        num_features : int
            Size of the dictionary (number of features).
        corpus_len : int, optional
            Number of documents in `corpus`. If not specified, will scan the corpus to determine the matrix size.
        chunksize : int, optional
            Size of query chunks. Used internally when the query is an entire corpus.
        dtype : numpy.dtype, optional
            Datatype to store the internal matrix in.

        """
        if num_features is None:
            logger.warning(
                "scanning corpus to determine the number of features (consider setting `num_features` explicitly)"
            )
            num_features = 1 + utils.get_max_id(corpus)

        self.num_features = num_features
        self.num_best = num_best
        self.normalize = True
        self.chunksize = chunksize
        if corpus_len is None and corpus is not None:
            # guard against corpus=None: calling len(None) unconditionally here used to
            # raise TypeError even though a None corpus is explicitly handled below
            corpus_len = len(corpus)

        if corpus is not None:
            if self.num_features <= 0:
                raise ValueError(
                    "cannot index a corpus with zero features (you must specify either `num_features` "
                    "or a non-empty corpus in the constructor)"
                )
            logger.info("creating matrix with %i documents and %i features", corpus_len, num_features)
            self.index = numpy.empty(shape=(corpus_len, num_features), dtype=dtype)
            # iterate over corpus, populating the numpy index matrix with (normalized)
            # document vectors
            for docno, vector in enumerate(corpus):
                if docno % 1000 == 0:
                    logger.debug("PROGRESS: at document #%i/%i", docno, corpus_len)
                # individual documents in fact may be in numpy.scipy.sparse format as well.
                # it's not documented because otherwise it's not fully supported throughout.
                # the user better know what he's doing (no normalization, must
                # explicitly supply num_features etc).
                if isinstance(vector, numpy.ndarray):
                    pass
                elif scipy.sparse.issparse(vector):
                    vector = vector.toarray().flatten()
                else:
                    vector = matutils.unitvec(matutils.sparse2full(vector, num_features))
                self.index[docno] = vector

    def __len__(self):
        return self.index.shape[0]

    def get_similarities(self, query):
        """Get similarity between `query` and this index.

        Warnings
        --------
        Do not use this function directly, use the :class:`~gensim.similarities.docsim.MatrixSimilarity.__getitem__`
        instead.

        Parameters
        ----------
        query : {list of (int, number), iterable of list of (int, number), :class:`scipy.sparse.csr_matrix`}
            Document or collection of documents.

        Return
        ------
        :class:`numpy.ndarray`
            Similarity matrix.

        """
        is_corpus, query = utils.is_corpus(query)
        if is_corpus:
            query = numpy.asarray(
                [matutils.sparse2full(vec, self.num_features) for vec in query],
                dtype=self.index.dtype
            )
        else:
            if scipy.sparse.issparse(query):
                query = query.toarray()  # convert sparse to dense
            elif isinstance(query, numpy.ndarray):
                pass
            else:
                # default case: query is a single vector in sparse gensim format
                query = matutils.sparse2full(query, self.num_features)
            query = numpy.asarray(query, dtype=self.index.dtype)

        # do a little transposition dance to stop numpy from making a copy of
        # self.index internally in numpy.dot (very slow).
        result = numpy.dot(self.index, query.T).T  # return #queries x #index
        return result  # XXX: removed casting the result from array to list; does anyone care?

    def __str__(self):
        return "%s<%i docs, %i features>" % (self.__class__.__name__, len(self), self.index.shape[1])
class SoftCosineSimilarity(interfaces.SimilarityABC):
    """Compute soft cosine similarity against a corpus of documents by storing the index matrix in memory.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.test.utils import common_texts
        >>> from gensim.corpora import Dictionary
        >>> from gensim.models import Word2Vec
        >>> from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix
        >>> from gensim.similarities import WordEmbeddingSimilarityIndex
        >>>
        >>> model = Word2Vec(common_texts, vector_size=20, min_count=1)  # train word-vectors
        >>> termsim_index = WordEmbeddingSimilarityIndex(model.wv)
        >>> dictionary = Dictionary(common_texts)
        >>> bow_corpus = [dictionary.doc2bow(document) for document in common_texts]
        >>> similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)  # construct similarity matrix
        >>> docsim_index = SoftCosineSimilarity(bow_corpus, similarity_matrix, num_best=10)
        >>>
        >>> query = 'graph trees computer'.split()  # make a query
        >>> sims = docsim_index[dictionary.doc2bow(query)]  # calculate similarity of query to each doc from bow_corpus

    Check out `the Gallery <https://radimrehurek.com/gensim/auto_examples/tutorials/run_scm.html>`__
    for more examples.

    """
    def __init__(self, corpus, similarity_matrix, num_best=None, chunksize=256, normalized=(True, True)):
        """

        Parameters
        ----------
        corpus: iterable of list of (int, float)
            A list of documents in the BoW format.
        similarity_matrix : :class:`gensim.similarities.SparseTermSimilarityMatrix`
            A term similarity matrix.
        num_best : int, optional
            The number of results to retrieve for a query, if None - return similarities with all elements from corpus.
        chunksize: int, optional
            Size of one corpus chunk.
        normalized : tuple of {True, False, 'maintain'}, optional
            First/second value specifies whether the query/document vectors in the inner product
            will be L2-normalized (True; corresponds to the soft cosine similarity measure; default),
            maintain their L2-norm during change of basis ('maintain'; corresponds to query
            expansion with partial membership), or kept as-is (False;
            corresponds to query expansion).

        See Also
        --------
        :class:`~gensim.similarities.termsim.SparseTermSimilarityMatrix`
            A sparse term similarity matrix built using a term similarity index.
        :class:`~gensim.similarities.termsim.LevenshteinSimilarityIndex`
            A term similarity index that computes Levenshtein similarities between terms.
        :class:`~gensim.similarities.termsim.WordEmbeddingSimilarityIndex`
            A term similarity index that computes cosine similarities between word embeddings.

        """
        self.similarity_matrix = similarity_matrix

        self.corpus = corpus
        self.num_best = num_best
        self.chunksize = chunksize
        self.normalized = normalized

        # Normalization of features is undesirable, since soft cosine similarity requires special
        # normalization using the similarity matrix. Therefore, we would just be normalizing twice,
        # increasing the numerical error.
        self.normalize = False

        # index is simply an array from 0 to size of corpus.
        self.index = numpy.arange(len(corpus))

    def __len__(self):
        return len(self.corpus)

    def get_similarities(self, query):
        """Get similarity between `query` and this index.

        Warnings
        --------
        Do not use this function directly; use the `self[query]` syntax instead.

        Parameters
        ----------
        query : {list of (int, number), iterable of list of (int, number)}
            Document or collection of documents.

        Return
        ------
        :class:`numpy.ndarray`
            Similarity matrix.

        """
        if not self.corpus:
            # numpy.array() with no argument raises TypeError; an explicit empty
            # list is required to return an empty result for an empty index
            return numpy.array([])

        is_corpus, query = utils.is_corpus(query)
        if not is_corpus and isinstance(query, numpy.ndarray):
            query = [self.corpus[i] for i in query]  # convert document indexes to actual documents
        result = self.similarity_matrix.inner_product(query, self.corpus, normalized=self.normalized)

        if scipy.sparse.issparse(result):
            return numpy.asarray(result.todense())
        if numpy.isscalar(result):
            return numpy.array(result)
        return numpy.asarray(result)[0]

    def __str__(self):
        return "%s<%i docs, %i features>" % (self.__class__.__name__, len(self), self.similarity_matrix.shape[0])
972 973 """ 974 if not self.corpus: 975 return numpy.array() 976 977 is_corpus, query = utils.is_corpus(query) 978 if not is_corpus and isinstance(query, numpy.ndarray): 979 query = [self.corpus[i] for i in query] # convert document indexes to actual documents 980 result = self.similarity_matrix.inner_product(query, self.corpus, normalized=self.normalized) 981 982 if scipy.sparse.issparse(result): 983 return numpy.asarray(result.todense()) 984 if numpy.isscalar(result): 985 return numpy.array(result) 986 return numpy.asarray(result)[0] 987 988 def __str__(self): 989 return "%s<%i docs, %i features>" % (self.__class__.__name__, len(self), self.similarity_matrix.shape[0]) 990 991 992class WmdSimilarity(interfaces.SimilarityABC): 993 """Compute negative WMD similarity against a corpus of documents. 994 995 Check out `the Gallery <https://radimrehurek.com/gensim/auto_examples/tutorials/run_wmd.html>`__ 996 for more examples. 997 998 When using this code, please consider citing the following papers: 999 1000 * `Ofir Pele and Michael Werman, "A linear time histogram metric for improved SIFT matching" 1001 <http://www.cs.huji.ac.il/~werman/Papers/ECCV2008.pdf>`_ 1002 * `Ofir Pele and Michael Werman, "Fast and robust earth mover's distances" 1003 <http://www.cs.huji.ac.il/~werman/Papers/ICCV2009.pdf>`_ 1004 * `Matt Kusner et al. "From Word Embeddings To Document Distances" 1005 <http://proceedings.mlr.press/v37/kusnerb15.pdf>`_ 1006 1007 Example 1008 ------- 1009 .. sourcecode:: pycon 1010 1011 >>> from gensim.test.utils import common_texts 1012 >>> from gensim.models import Word2Vec 1013 >>> from gensim.similarities import WmdSimilarity 1014 >>> 1015 >>> model = Word2Vec(common_texts, vector_size=20, min_count=1) # train word-vectors 1016 >>> 1017 >>> index = WmdSimilarity(common_texts, model) 1018 >>> # Make query. 
1019 >>> query = ['trees'] 1020 >>> sims = index[query] 1021 1022 """ 1023 def __init__(self, corpus, kv_model, num_best=None, chunksize=256): 1024 """ 1025 1026 Parameters 1027 ---------- 1028 corpus: iterable of list of str 1029 A list of documents, each of which is a list of tokens. 1030 kv_model: :class:`~gensim.models.keyedvectors.KeyedVectors` 1031 A set of KeyedVectors 1032 num_best: int, optional 1033 Number of results to retrieve. 1034 chunksize : int, optional 1035 Size of chunk. 1036 1037 """ 1038 self.corpus = corpus 1039 self.wv = kv_model 1040 self.num_best = num_best 1041 self.chunksize = chunksize 1042 1043 # Normalization of features is not possible, as corpus is a list (of lists) of strings. 1044 self.normalize = False 1045 1046 # index is simply an array from 0 to size of corpus. 1047 self.index = numpy.arange(len(corpus)) 1048 1049 def __len__(self): 1050 """Get size of corpus.""" 1051 return len(self.corpus) 1052 1053 def get_similarities(self, query): 1054 """Get similarity between `query` and this index. 1055 1056 Warnings 1057 -------- 1058 Do not use this function directly; use the `self[query]` syntax instead. 1059 1060 Parameters 1061 ---------- 1062 query : {list of str, iterable of list of str} 1063 Document or collection of documents. 1064 1065 Return 1066 ------ 1067 :class:`numpy.ndarray` 1068 Similarity matrix. 1069 1070 """ 1071 if isinstance(query, numpy.ndarray): 1072 # Convert document indexes to actual documents. 1073 query = [self.corpus[i] for i in query] 1074 1075 if not query or not isinstance(query[0], list): 1076 query = [query] 1077 1078 n_queries = len(query) 1079 result = [] 1080 for qidx in range(n_queries): 1081 # Compute similarity for each query. 1082 qresult = [self.wv.wmdistance(document, query[qidx]) for document in self.corpus] 1083 qresult = numpy.array(qresult) 1084 qresult = 1. / (1. + qresult) # Similarity is the negative of the distance. 1085 1086 # Append single query result to list of all results. 
1087 result.append(qresult) 1088 1089 if len(result) == 1: 1090 # Only one query. 1091 result = result[0] 1092 else: 1093 result = numpy.array(result) 1094 1095 return result 1096 1097 def __str__(self): 1098 return "%s<%i docs, %i features>" % (self.__class__.__name__, len(self), self.w2v_model.wv.syn0.shape[1]) 1099 1100 1101class SparseMatrixSimilarity(interfaces.SimilarityABC): 1102 """Compute cosine similarity against a corpus of documents by storing the index matrix in memory. 1103 1104 Notes 1105 ----- 1106 Use this if your input corpus contains sparse vectors (such as TF-IDF documents) and fits into RAM. 1107 1108 The matrix is internally stored as a :class:`scipy.sparse.csr_matrix` matrix. Unless the entire 1109 matrix fits into main memory, use :class:`~gensim.similarities.docsim.Similarity` instead. 1110 1111 Takes an optional `maintain_sparsity` argument, setting this to True 1112 causes `get_similarities` to return a sparse matrix instead of a 1113 dense representation if possible. 1114 1115 See also 1116 -------- 1117 :class:`~gensim.similarities.docsim.Similarity` 1118 Index similarity (wrapper for other inheritors of :class:`~gensim.interfaces.SimilarityABC`). 1119 :class:`~gensim.similarities.docsim.MatrixSimilarity` 1120 Index similarity (dense with cosine distance). 1121 1122 """ 1123 def __init__(self, corpus, num_features=None, num_terms=None, num_docs=None, num_nnz=None, 1124 num_best=None, chunksize=500, dtype=numpy.float32, maintain_sparsity=False): 1125 """ 1126 1127 Parameters 1128 ---------- 1129 corpus: iterable of list of (int, float) 1130 A list of documents in the BoW format. 1131 num_features : int, optional 1132 Size of the dictionary. Must be either specified, or present in `corpus.num_terms`. 1133 num_terms : int, optional 1134 Alias for `num_features`, you can use either. 1135 num_docs : int, optional 1136 Number of documents in `corpus`. Will be calculated if not provided. 
        num_nnz : int, optional
            Number of non-zero elements in `corpus`. Will be calculated if not provided.
        num_best : int, optional
            If set, return only the `num_best` most similar documents, always leaving out documents with similarity = 0.
            Otherwise, return a full vector with one float for every document in the index.
        chunksize : int, optional
            Size of query chunks. Used internally when the query is an entire corpus.
        dtype : numpy.dtype, optional
            Data type of the internal matrix.
        maintain_sparsity : bool, optional
            Return sparse arrays from :meth:`~gensim.similarities.docsim.SparseMatrixSimilarity.get_similarities`?

        """
        self.num_best = num_best
        # Cosine similarity requires unit-length index vectors; the base class
        # normalizes queries when `normalize` is True.
        self.normalize = True
        self.chunksize = chunksize
        self.maintain_sparsity = maintain_sparsity

        if corpus is not None:
            logger.info("creating sparse index")

            # iterate over input corpus, populating the sparse index matrix
            try:
                # use the more efficient corpus generation version, if the input
                # `corpus` is MmCorpus-like (knows its shape and number of non-zeroes).
                num_terms, num_docs, num_nnz = corpus.num_terms, corpus.num_docs, corpus.num_nnz
                logger.debug("using efficient sparse index creation")
            except AttributeError:
                # no MmCorpus, use the slower version (or maybe user supplied the
                # num_* params in constructor)
                pass
            if num_features is not None:
                # num_terms is just an alias for num_features, for compatibility with MatrixSimilarity
                num_terms = num_features
            if num_terms is None:
                raise ValueError("refusing to guess the number of sparse features: specify num_features explicitly")
            # Convert each document to gensim sparse format: scipy sparse vectors and
            # dense ndarrays are converted as-is, while plain gensim vectors are also
            # L2-normalized via `unitvec` (needed for the cosine computation below).
            corpus = (matutils.scipy2sparse(v) if scipy.sparse.issparse(v) else
                      (matutils.full2sparse(v) if isinstance(v, numpy.ndarray) else
                       matutils.unitvec(v)) for v in corpus)
            self.index = matutils.corpus2csc(
                corpus, num_terms=num_terms, num_docs=num_docs, num_nnz=num_nnz,
                dtype=dtype, printprogress=10000
            ).T

            # convert to Compressed Sparse Row for efficient row slicing and multiplications
            self.index = self.index.tocsr()  # currently no-op, CSC.T is already CSR
            logger.info("created %r", self.index)

    def __len__(self):
        """Get size of index."""
        return self.index.shape[0]

    def get_similarities(self, query):
        """Get similarity between `query` and this index.

        Warnings
        --------
        Do not use this function directly; use the `self[query]` syntax instead.

        Parameters
        ----------
        query : {list of (int, number), iterable of list of (int, number), :class:`scipy.sparse.csr_matrix`}
            Document or collection of documents.

        Return
        ------
        :class:`numpy.ndarray`
            Similarity matrix (if maintain_sparsity=False) **OR**
        :class:`scipy.sparse.csc`
            otherwise

        """
        # Normalize the query to a sparse matrix with documents as COLUMNS (T x C),
        # so it can be multiplied against the N x T index below.
        is_corpus, query = utils.is_corpus(query)
        if is_corpus:
            query = matutils.corpus2csc(query, self.index.shape[1], dtype=self.index.dtype)
        else:
            if scipy.sparse.issparse(query):
                query = query.T  # convert documents=rows to documents=columns
            elif isinstance(query, numpy.ndarray):
                if query.ndim == 1:
                    # NOTE(review): this reshapes the caller's array in place — confirm callers don't rely on the 1d shape afterwards.
                    query.shape = (1, len(query))
                query = scipy.sparse.csr_matrix(query, dtype=self.index.dtype).T
            else:
                # default case: query is a single vector, in sparse gensim format
                query = matutils.corpus2csc([query], self.index.shape[1], dtype=self.index.dtype)

        # compute cosine similarity against every other document in the collection
        result = self.index * query.tocsc()  # N x T * T x C = N x C
        if result.shape[1] == 1 and not is_corpus:
            # for queries of one document, return a 1d array
            result = result.toarray().flatten()
        elif self.maintain_sparsity:
            # avoid converting to dense array if maintaining sparsity
            result = result.T
        else:
            # otherwise, return a 2d matrix (#queries x #index)
            result = result.toarray().T
        return result