1#!/usr/bin/env python 2# -*- coding: utf-8 -*- 3# 4# Copyright (C) 2011 Radim Rehurek <radimrehurek@seznam.cz> 5# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 7"""Math helper functions.""" 8 9from __future__ import with_statement 10 11 12import logging 13import math 14 15from gensim import utils 16 17import numpy as np 18import scipy.sparse 19from scipy.stats import entropy 20import scipy.linalg 21from scipy.linalg.lapack import get_lapack_funcs 22from scipy.linalg.special_matrices import triu 23from scipy.special import psi # gamma function utils 24 25 26logger = logging.getLogger(__name__) 27 28 29def blas(name, ndarray): 30 """Helper for getting the appropriate BLAS function, using :func:`scipy.linalg.get_blas_funcs`. 31 32 Parameters 33 ---------- 34 name : str 35 Name(s) of BLAS functions, without the type prefix. 36 ndarray : numpy.ndarray 37 Arrays can be given to determine optimal prefix of BLAS routines. 38 39 Returns 40 ------- 41 object 42 BLAS function for the needed operation on the given data type. 43 44 """ 45 return scipy.linalg.get_blas_funcs((name,), (ndarray,))[0] 46 47 48def argsort(x, topn=None, reverse=False): 49 """Efficiently calculate indices of the `topn` smallest elements in array `x`. 50 51 Parameters 52 ---------- 53 x : array_like 54 Array to get the smallest element indices from. 55 topn : int, optional 56 Number of indices of the smallest (greatest) elements to be returned. 57 If not given, indices of all elements will be returned in ascending (descending) order. 58 reverse : bool, optional 59 Return the `topn` greatest elements in descending order, 60 instead of smallest elements in ascending order? 61 62 Returns 63 ------- 64 numpy.ndarray 65 Array of `topn` indices that sort the array in the requested order. 66 67 """ 68 x = np.asarray(x) # unify code path for when `x` is not a np array (list, tuple...) 69 if topn is None: 70 topn = x.size 71 if topn <= 0: 72 return [] 73 if reverse: 74 x = -x 75 if topn >= x.size or not hasattr(np, 'argpartition'): 76 return np.argsort(x)[:topn] 77 # np >= 1.8 has a fast partial argsort, use that! 78 most_extreme = np.argpartition(x, topn)[:topn] 79 return most_extreme.take(np.argsort(x.take(most_extreme))) # resort topn into order 80 81 82def corpus2csc(corpus, num_terms=None, dtype=np.float64, num_docs=None, num_nnz=None, printprogress=0): 83 """Convert a streamed corpus in bag-of-words format into a sparse matrix `scipy.sparse.csc_matrix`, 84 with documents as columns. 85 86 Notes 87 ----- 88 If the number of terms, documents and non-zero elements is known, you can pass 89 them here as parameters and a (much) more memory efficient code path will be taken. 90 91 Parameters 92 ---------- 93 corpus : iterable of iterable of (int, number) 94 Input corpus in BoW format 95 num_terms : int, optional 96 Number of terms in `corpus`. If provided, the `corpus.num_terms` attribute (if any) will be ignored. 97 dtype : data-type, optional 98 Data type of output CSC matrix. 99 num_docs : int, optional 100 Number of documents in `corpus`. If provided, the `corpus.num_docs` attribute (in any) will be ignored. 101 num_nnz : int, optional 102 Number of non-zero elements in `corpus`. If provided, the `corpus.num_nnz` attribute (if any) will be ignored. 103 printprogress : int, optional 104 Log a progress message at INFO level once every `printprogress` documents. 0 to turn off progress logging. 105 106 Returns 107 ------- 108 scipy.sparse.csc_matrix 109 `corpus` converted into a sparse CSC matrix. 110 111 See Also 112 -------- 113 :class:`~gensim.matutils.Sparse2Corpus` 114 Convert sparse format to Gensim corpus format. 115 116 """ 117 try: 118 # if the input corpus has the `num_nnz`, `num_docs` and `num_terms` attributes 119 # (as is the case with MmCorpus for example), we can use a more efficient code path 120 if num_terms is None: 121 num_terms = corpus.num_terms 122 if num_docs is None: 123 num_docs = corpus.num_docs 124 if num_nnz is None: 125 num_nnz = corpus.num_nnz 126 except AttributeError: 127 pass # not a MmCorpus... 128 if printprogress: 129 logger.info("creating sparse matrix from corpus") 130 if num_terms is not None and num_docs is not None and num_nnz is not None: 131 # faster and much more memory-friendly version of creating the sparse csc 132 posnow, indptr = 0, [0] 133 indices = np.empty((num_nnz,), dtype=np.int32) # HACK assume feature ids fit in 32bit integer 134 data = np.empty((num_nnz,), dtype=dtype) 135 for docno, doc in enumerate(corpus): 136 if printprogress and docno % printprogress == 0: 137 logger.info("PROGRESS: at document #%i/%i", docno, num_docs) 138 posnext = posnow + len(doc) 139 # zip(*doc) transforms doc to (token_indices, token_counts] 140 indices[posnow: posnext], data[posnow: posnext] = zip(*doc) if doc else ([], []) 141 indptr.append(posnext) 142 posnow = posnext 143 assert posnow == num_nnz, "mismatch between supplied and computed number of non-zeros" 144 result = scipy.sparse.csc_matrix((data, indices, indptr), shape=(num_terms, num_docs), dtype=dtype) 145 else: 146 # slower version; determine the sparse matrix parameters during iteration 147 num_nnz, data, indices, indptr = 0, [], [], [0] 148 for docno, doc in enumerate(corpus): 149 if printprogress and docno % printprogress == 0: 150 logger.info("PROGRESS: at document #%i", docno) 151 152 # zip(*doc) transforms doc to (token_indices, token_counts] 153 doc_indices, doc_data = zip(*doc) if doc else ([], []) 154 indices.extend(doc_indices) 155 data.extend(doc_data) 156 num_nnz += len(doc) 157 indptr.append(num_nnz) 158 if num_terms is None: 159 num_terms = max(indices) + 1 if indices else 0 160 num_docs = len(indptr) - 1 161 # now num_docs, num_terms and num_nnz contain the correct values 162 data = np.asarray(data, dtype=dtype) 163 indices = np.asarray(indices) 164 result = scipy.sparse.csc_matrix((data, indices, indptr), shape=(num_terms, num_docs), dtype=dtype) 165 return result 166 167 168def pad(mat, padrow, padcol): 169 """Add additional rows/columns to `mat`. The new rows/columns will be initialized with zeros. 170 171 Parameters 172 ---------- 173 mat : numpy.ndarray 174 Input 2D matrix 175 padrow : int 176 Number of additional rows 177 padcol : int 178 Number of additional columns 179 180 Returns 181 ------- 182 numpy.matrixlib.defmatrix.matrix 183 Matrix with needed padding. 184 185 """ 186 if padrow < 0: 187 padrow = 0 188 if padcol < 0: 189 padcol = 0 190 rows, cols = mat.shape 191 return np.block([ 192 [mat, np.zeros((rows, padcol))], 193 [np.zeros((padrow, cols + padcol))], 194 ]) 195 196 197def zeros_aligned(shape, dtype, order='C', align=128): 198 """Get array aligned at `align` byte boundary in memory. 199 200 Parameters 201 ---------- 202 shape : int or (int, int) 203 Shape of array. 204 dtype : data-type 205 Data type of array. 206 order : {'C', 'F'}, optional 207 Whether to store multidimensional data in C- or Fortran-contiguous (row- or column-wise) order in memory. 208 align : int, optional 209 Boundary for alignment in bytes. 210 211 Returns 212 ------- 213 numpy.ndarray 214 Aligned array. 215 216 """ 217 nbytes = np.prod(shape, dtype=np.int64) * np.dtype(dtype).itemsize 218 buffer = np.zeros(nbytes + align, dtype=np.uint8) # problematic on win64 ("maximum allowed dimension exceeded") 219 start_index = -buffer.ctypes.data % align 220 return buffer[start_index: start_index + nbytes].view(dtype).reshape(shape, order=order) 221 222 223def ismatrix(m): 224 """Check whether `m` is a 2D `numpy.ndarray` or `scipy.sparse` matrix. 225 226 Parameters 227 ---------- 228 m : object 229 Object to check. 230 231 Returns 232 ------- 233 bool 234 Is `m` a 2D `numpy.ndarray` or `scipy.sparse` matrix. 235 236 """ 237 return isinstance(m, np.ndarray) and m.ndim == 2 or scipy.sparse.issparse(m) 238 239 240def any2sparse(vec, eps=1e-9): 241 """Convert a numpy.ndarray or `scipy.sparse` vector into the Gensim bag-of-words format. 242 243 Parameters 244 ---------- 245 vec : {`numpy.ndarray`, `scipy.sparse`} 246 Input vector 247 eps : float, optional 248 Value used for threshold, all coordinates less than `eps` will not be presented in result. 249 250 Returns 251 ------- 252 list of (int, float) 253 Vector in BoW format. 254 255 """ 256 if isinstance(vec, np.ndarray): 257 return dense2vec(vec, eps) 258 if scipy.sparse.issparse(vec): 259 return scipy2sparse(vec, eps) 260 return [(int(fid), float(fw)) for fid, fw in vec if np.abs(fw) > eps] 261 262 263def scipy2scipy_clipped(matrix, topn, eps=1e-9): 264 """Get the 'topn' elements of the greatest magnitude (absolute value) from a `scipy.sparse` vector or matrix. 265 266 Parameters 267 ---------- 268 matrix : `scipy.sparse` 269 Input vector or matrix (1D or 2D sparse array). 270 topn : int 271 Number of greatest elements, in absolute value, to return. 272 eps : float 273 Ignored. 274 275 Returns 276 ------- 277 `scipy.sparse.csr.csr_matrix` 278 Clipped matrix. 279 280 """ 281 if not scipy.sparse.issparse(matrix): 282 raise ValueError("'%s' is not a scipy sparse vector." % matrix) 283 if topn <= 0: 284 return scipy.sparse.csr_matrix([]) 285 # Return clipped sparse vector if input is a sparse vector. 286 if matrix.shape[0] == 1: 287 # use np.argpartition/argsort and only form tuples that are actually returned. 288 biggest = argsort(abs(matrix.data), topn, reverse=True) 289 indices, data = matrix.indices.take(biggest), matrix.data.take(biggest) 290 return scipy.sparse.csr_matrix((data, indices, [0, len(indices)])) 291 # Return clipped sparse matrix if input is a matrix, processing row by row. 292 else: 293 matrix_indices = [] 294 matrix_data = [] 295 matrix_indptr = [0] 296 # calling abs() on entire matrix once is faster than calling abs() iteratively for each row 297 matrix_abs = abs(matrix) 298 for i in range(matrix.shape[0]): 299 v = matrix.getrow(i) 300 v_abs = matrix_abs.getrow(i) 301 # Sort and clip each row vector first. 302 biggest = argsort(v_abs.data, topn, reverse=True) 303 indices, data = v.indices.take(biggest), v.data.take(biggest) 304 # Store the topn indices and values of each row vector. 305 matrix_data.append(data) 306 matrix_indices.append(indices) 307 matrix_indptr.append(matrix_indptr[-1] + min(len(indices), topn)) 308 matrix_indices = np.concatenate(matrix_indices).ravel() 309 matrix_data = np.concatenate(matrix_data).ravel() 310 # Instantiate and return a sparse csr_matrix which preserves the order of indices/data. 311 return scipy.sparse.csr.csr_matrix( 312 (matrix_data, matrix_indices, matrix_indptr), 313 shape=(matrix.shape[0], np.max(matrix_indices) + 1) 314 ) 315 316 317def scipy2sparse(vec, eps=1e-9): 318 """Convert a scipy.sparse vector into the Gensim bag-of-words format. 319 320 Parameters 321 ---------- 322 vec : `scipy.sparse` 323 Sparse vector. 324 325 eps : float, optional 326 Value used for threshold, all coordinates less than `eps` will not be presented in result. 327 328 Returns 329 ------- 330 list of (int, float) 331 Vector in Gensim bag-of-words format. 332 333 """ 334 vec = vec.tocsr() 335 assert vec.shape[0] == 1 336 return [(int(pos), float(val)) for pos, val in zip(vec.indices, vec.data) if np.abs(val) > eps] 337 338 339class Scipy2Corpus: 340 """Convert a sequence of dense/sparse vectors into a streamed Gensim corpus object. 341 342 See Also 343 -------- 344 :func:`~gensim.matutils.corpus2csc` 345 Convert corpus in Gensim format to `scipy.sparse.csc` matrix. 346 347 """ 348 def __init__(self, vecs): 349 """ 350 351 Parameters 352 ---------- 353 vecs : iterable of {`numpy.ndarray`, `scipy.sparse`} 354 Input vectors. 355 356 """ 357 self.vecs = vecs 358 359 def __iter__(self): 360 for vec in self.vecs: 361 if isinstance(vec, np.ndarray): 362 yield full2sparse(vec) 363 else: 364 yield scipy2sparse(vec) 365 366 def __len__(self): 367 return len(self.vecs) 368 369 370def sparse2full(doc, length): 371 """Convert a document in Gensim bag-of-words format into a dense numpy array. 372 373 Parameters 374 ---------- 375 doc : list of (int, number) 376 Document in BoW format. 377 length : int 378 Vector dimensionality. This cannot be inferred from the BoW, and you must supply it explicitly. 379 This is typically the vocabulary size or number of topics, depending on how you created `doc`. 380 381 Returns 382 ------- 383 numpy.ndarray 384 Dense numpy vector for `doc`. 385 386 See Also 387 -------- 388 :func:`~gensim.matutils.full2sparse` 389 Convert dense array to gensim bag-of-words format. 390 391 """ 392 result = np.zeros(length, dtype=np.float32) # fill with zeroes (default value) 393 # convert indices to int as numpy 1.12 no longer indexes by floats 394 doc = ((int(id_), float(val_)) for (id_, val_) in doc) 395 396 doc = dict(doc) 397 # overwrite some of the zeroes with explicit values 398 result[list(doc)] = list(doc.values()) 399 return result 400 401 402def full2sparse(vec, eps=1e-9): 403 """Convert a dense numpy array into the Gensim bag-of-words format. 404 405 Parameters 406 ---------- 407 vec : numpy.ndarray 408 Dense input vector. 409 eps : float 410 Feature weight threshold value. Features with `abs(weight) < eps` are considered sparse and 411 won't be included in the BOW result. 412 413 Returns 414 ------- 415 list of (int, float) 416 BoW format of `vec`, with near-zero values omitted (sparse vector). 417 418 See Also 419 -------- 420 :func:`~gensim.matutils.sparse2full` 421 Convert a document in Gensim bag-of-words format into a dense numpy array. 422 423 """ 424 vec = np.asarray(vec, dtype=float) 425 nnz = np.nonzero(abs(vec) > eps)[0] 426 return list(zip(nnz, vec.take(nnz))) 427 428 429dense2vec = full2sparse 430 431 432def full2sparse_clipped(vec, topn, eps=1e-9): 433 """Like :func:`~gensim.matutils.full2sparse`, but only return the `topn` elements of the greatest magnitude (abs). 434 435 This is more efficient that sorting a vector and then taking the greatest values, especially 436 where `len(vec) >> topn`. 437 438 Parameters 439 ---------- 440 vec : numpy.ndarray 441 Input dense vector 442 topn : int 443 Number of greatest (abs) elements that will be presented in result. 444 eps : float 445 Threshold value, if coordinate in `vec` < eps, this will not be presented in result. 446 447 Returns 448 ------- 449 list of (int, float) 450 Clipped vector in BoW format. 451 452 See Also 453 -------- 454 :func:`~gensim.matutils.full2sparse` 455 Convert dense array to gensim bag-of-words format. 456 457 """ 458 # use np.argpartition/argsort and only form tuples that are actually returned. 459 # this is about 40x faster than explicitly forming all 2-tuples to run sort() or heapq.nlargest() on. 460 if topn <= 0: 461 return [] 462 vec = np.asarray(vec, dtype=float) 463 nnz = np.nonzero(abs(vec) > eps)[0] 464 biggest = nnz.take(argsort(abs(vec).take(nnz), topn, reverse=True)) 465 return list(zip(biggest, vec.take(biggest))) 466 467 468def corpus2dense(corpus, num_terms, num_docs=None, dtype=np.float32): 469 """Convert corpus into a dense numpy 2D array, with documents as columns. 470 471 Parameters 472 ---------- 473 corpus : iterable of iterable of (int, number) 474 Input corpus in the Gensim bag-of-words format. 475 num_terms : int 476 Number of terms in the dictionary. X-axis of the resulting matrix. 477 num_docs : int, optional 478 Number of documents in the corpus. If provided, a slightly more memory-efficient code path is taken. 479 Y-axis of the resulting matrix. 480 dtype : data-type, optional 481 Data type of the output matrix. 482 483 Returns 484 ------- 485 numpy.ndarray 486 Dense 2D array that presents `corpus`. 487 488 See Also 489 -------- 490 :class:`~gensim.matutils.Dense2Corpus` 491 Convert dense matrix to Gensim corpus format. 492 493 """ 494 if num_docs is not None: 495 # we know the number of documents => don't bother column_stacking 496 docno, result = -1, np.empty((num_terms, num_docs), dtype=dtype) 497 for docno, doc in enumerate(corpus): 498 result[:, docno] = sparse2full(doc, num_terms) 499 assert docno + 1 == num_docs 500 else: 501 # The below used to be a generator, but NumPy deprecated generator as of 1.16 with: 502 # """ 503 # FutureWarning: arrays to stack must be passed as a "sequence" type such as list or tuple. 504 # Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error 505 # in the future. 506 # """ 507 result = np.column_stack([sparse2full(doc, num_terms) for doc in corpus]) 508 return result.astype(dtype) 509 510 511class Dense2Corpus: 512 """Treat dense numpy array as a streamed Gensim corpus in the bag-of-words format. 513 514 Notes 515 ----- 516 No data copy is made (changes to the underlying matrix imply changes in the streamed corpus). 517 518 See Also 519 -------- 520 :func:`~gensim.matutils.corpus2dense` 521 Convert Gensim corpus to dense matrix. 522 :class:`~gensim.matutils.Sparse2Corpus` 523 Convert sparse matrix to Gensim corpus format. 524 525 """ 526 def __init__(self, dense, documents_columns=True): 527 """ 528 529 Parameters 530 ---------- 531 dense : numpy.ndarray 532 Corpus in dense format. 533 documents_columns : bool, optional 534 Documents in `dense` represented as columns, as opposed to rows? 535 536 """ 537 if documents_columns: 538 self.dense = dense.T 539 else: 540 self.dense = dense 541 542 def __iter__(self): 543 """Iterate over the corpus. 544 545 Yields 546 ------ 547 list of (int, float) 548 Document in BoW format. 549 550 """ 551 for doc in self.dense: 552 yield full2sparse(doc.flat) 553 554 def __len__(self): 555 return len(self.dense) 556 557 558class Sparse2Corpus: 559 """Convert a matrix in scipy.sparse format into a streaming Gensim corpus. 560 561 See Also 562 -------- 563 :func:`~gensim.matutils.corpus2csc` 564 Convert gensim corpus format to `scipy.sparse.csc` matrix 565 :class:`~gensim.matutils.Dense2Corpus` 566 Convert dense matrix to gensim corpus. 567 568 """ 569 def __init__(self, sparse, documents_columns=True): 570 """ 571 572 Parameters 573 ---------- 574 sparse : `scipy.sparse` 575 Corpus scipy sparse format 576 documents_columns : bool, optional 577 Documents will be column? 578 579 """ 580 if documents_columns: 581 self.sparse = sparse.tocsc() 582 else: 583 self.sparse = sparse.tocsr().T # make sure shape[1]=number of docs (needed in len()) 584 585 def __iter__(self): 586 """ 587 588 Yields 589 ------ 590 list of (int, float) 591 Document in BoW format. 592 593 """ 594 for indprev, indnow in zip(self.sparse.indptr, self.sparse.indptr[1:]): 595 yield list(zip(self.sparse.indices[indprev:indnow], self.sparse.data[indprev:indnow])) 596 597 def __len__(self): 598 return self.sparse.shape[1] 599 600 def __getitem__(self, document_index): 601 """Retrieve a document vector from the corpus by its index. 602 603 Parameters 604 ---------- 605 document_index : int 606 Index of document 607 608 Returns 609 ------- 610 list of (int, number) 611 Document in BoW format. 612 613 """ 614 indprev = self.sparse.indptr[document_index] 615 indnow = self.sparse.indptr[document_index + 1] 616 return list(zip(self.sparse.indices[indprev:indnow], self.sparse.data[indprev:indnow])) 617 618 619def veclen(vec): 620 """Calculate L2 (euclidean) length of a vector. 621 622 Parameters 623 ---------- 624 vec : list of (int, number) 625 Input vector in sparse bag-of-words format. 626 627 Returns 628 ------- 629 float 630 Length of `vec`. 631 632 """ 633 if len(vec) == 0: 634 return 0.0 635 length = 1.0 * math.sqrt(sum(val**2 for _, val in vec)) 636 assert length > 0.0, "sparse documents must not contain any explicit zero entries" 637 return length 638 639 640def ret_normalized_vec(vec, length): 641 """Normalize a vector in L2 (Euclidean unit norm). 642 643 Parameters 644 ---------- 645 vec : list of (int, number) 646 Input vector in BoW format. 647 length : float 648 Length of vector 649 650 Returns 651 ------- 652 list of (int, number) 653 L2-normalized vector in BoW format. 654 655 """ 656 if length != 1.0: 657 return [(termid, val / length) for termid, val in vec] 658 else: 659 return list(vec) 660 661 662def ret_log_normalize_vec(vec, axis=1): 663 log_max = 100.0 664 if len(vec.shape) == 1: 665 max_val = np.max(vec) 666 log_shift = log_max - np.log(len(vec) + 1.0) - max_val 667 tot = np.sum(np.exp(vec + log_shift)) 668 log_norm = np.log(tot) - log_shift 669 vec -= log_norm 670 else: 671 if axis == 1: # independently normalize each sample 672 max_val = np.max(vec, 1) 673 log_shift = log_max - np.log(vec.shape[1] + 1.0) - max_val 674 tot = np.sum(np.exp(vec + log_shift[:, np.newaxis]), 1) 675 log_norm = np.log(tot) - log_shift 676 vec = vec - log_norm[:, np.newaxis] 677 elif axis == 0: # normalize each feature 678 k = ret_log_normalize_vec(vec.T) 679 return k[0].T, k[1] 680 else: 681 raise ValueError("'%s' is not a supported axis" % axis) 682 return vec, log_norm 683 684 685blas_nrm2 = blas('nrm2', np.array([], dtype=float)) 686blas_scal = blas('scal', np.array([], dtype=float)) 687 688 689def unitvec(vec, norm='l2', return_norm=False): 690 """Scale a vector to unit length. 691 692 Parameters 693 ---------- 694 vec : {numpy.ndarray, scipy.sparse, list of (int, float)} 695 Input vector in any format 696 norm : {'l1', 'l2', 'unique'}, optional 697 Metric to normalize in. 698 return_norm : bool, optional 699 Return the length of vector `vec`, in addition to the normalized vector itself? 700 701 Returns 702 ------- 703 numpy.ndarray, scipy.sparse, list of (int, float)} 704 Normalized vector in same format as `vec`. 705 float 706 Length of `vec` before normalization, if `return_norm` is set. 707 708 Notes 709 ----- 710 Zero-vector will be unchanged. 711 712 """ 713 supported_norms = ('l1', 'l2', 'unique') 714 if norm not in supported_norms: 715 raise ValueError("'%s' is not a supported norm. Currently supported norms are %s." % (norm, supported_norms)) 716 717 if scipy.sparse.issparse(vec): 718 vec = vec.tocsr() 719 if norm == 'l1': 720 veclen = np.sum(np.abs(vec.data)) 721 if norm == 'l2': 722 veclen = np.sqrt(np.sum(vec.data ** 2)) 723 if norm == 'unique': 724 veclen = vec.nnz 725 if veclen > 0.0: 726 if np.issubdtype(vec.dtype, np.integer): 727 vec = vec.astype(float) 728 vec /= veclen 729 if return_norm: 730 return vec, veclen 731 else: 732 return vec 733 else: 734 if return_norm: 735 return vec, 1.0 736 else: 737 return vec 738 739 if isinstance(vec, np.ndarray): 740 if norm == 'l1': 741 veclen = np.sum(np.abs(vec)) 742 if norm == 'l2': 743 if vec.size == 0: 744 veclen = 0.0 745 else: 746 veclen = blas_nrm2(vec) 747 if norm == 'unique': 748 veclen = np.count_nonzero(vec) 749 if veclen > 0.0: 750 if np.issubdtype(vec.dtype, np.integer): 751 vec = vec.astype(float) 752 if return_norm: 753 return blas_scal(1.0 / veclen, vec).astype(vec.dtype), veclen 754 else: 755 return blas_scal(1.0 / veclen, vec).astype(vec.dtype) 756 else: 757 if return_norm: 758 return vec, 1.0 759 else: 760 return vec 761 762 try: 763 first = next(iter(vec)) # is there at least one element? 764 except StopIteration: 765 if return_norm: 766 return vec, 1.0 767 else: 768 return vec 769 770 if isinstance(first, (tuple, list)) and len(first) == 2: # gensim sparse format 771 if norm == 'l1': 772 length = float(sum(abs(val) for _, val in vec)) 773 if norm == 'l2': 774 length = 1.0 * math.sqrt(sum(val ** 2 for _, val in vec)) 775 if norm == 'unique': 776 length = 1.0 * len(vec) 777 assert length > 0.0, "sparse documents must not contain any explicit zero entries" 778 if return_norm: 779 return ret_normalized_vec(vec, length), length 780 else: 781 return ret_normalized_vec(vec, length) 782 else: 783 raise ValueError("unknown input type") 784 785 786def cossim(vec1, vec2): 787 """Get cosine similarity between two sparse vectors. 788 789 Cosine similarity is a number between `<-1.0, 1.0>`, higher means more similar. 790 791 Parameters 792 ---------- 793 vec1 : list of (int, float) 794 Vector in BoW format. 795 vec2 : list of (int, float) 796 Vector in BoW format. 797 798 Returns 799 ------- 800 float 801 Cosine similarity between `vec1` and `vec2`. 802 803 """ 804 vec1, vec2 = dict(vec1), dict(vec2) 805 if not vec1 or not vec2: 806 return 0.0 807 vec1len = 1.0 * math.sqrt(sum(val * val for val in vec1.values())) 808 vec2len = 1.0 * math.sqrt(sum(val * val for val in vec2.values())) 809 assert vec1len > 0.0 and vec2len > 0.0, "sparse documents must not contain any explicit zero entries" 810 if len(vec2) < len(vec1): 811 vec1, vec2 = vec2, vec1 # swap references so that we iterate over the shorter vector 812 result = sum(value * vec2.get(index, 0.0) for index, value in vec1.items()) 813 result /= vec1len * vec2len # rescale by vector lengths 814 return result 815 816 817def isbow(vec): 818 """Checks if a vector is in the sparse Gensim bag-of-words format. 819 820 Parameters 821 ---------- 822 vec : object 823 Object to check. 824 825 Returns 826 ------- 827 bool 828 Is `vec` in BoW format. 829 830 """ 831 if scipy.sparse.issparse(vec): 832 vec = vec.todense().tolist() 833 try: 834 id_, val_ = vec[0] # checking first value to see if it is in bag of words format by unpacking 835 int(id_), float(val_) 836 except IndexError: 837 return True # this is to handle the empty input case 838 except (ValueError, TypeError): 839 return False 840 return True 841 842 843def _convert_vec(vec1, vec2, num_features=None): 844 if scipy.sparse.issparse(vec1): 845 vec1 = vec1.toarray() 846 if scipy.sparse.issparse(vec2): 847 vec2 = vec2.toarray() # converted both the vectors to dense in case they were in sparse matrix 848 if isbow(vec1) and isbow(vec2): # if they are in bag of words format we make it dense 849 if num_features is not None: # if not None, make as large as the documents drawing from 850 dense1 = sparse2full(vec1, num_features) 851 dense2 = sparse2full(vec2, num_features) 852 return dense1, dense2 853 else: 854 max_len = max(len(vec1), len(vec2)) 855 dense1 = sparse2full(vec1, max_len) 856 dense2 = sparse2full(vec2, max_len) 857 return dense1, dense2 858 else: 859 # this conversion is made because if it is not in bow format, it might be a list within a list after conversion 860 # the scipy implementation of Kullback fails in such a case so we pick up only the nested list. 861 if len(vec1) == 1: 862 vec1 = vec1[0] 863 if len(vec2) == 1: 864 vec2 = vec2[0] 865 return vec1, vec2 866 867 868def kullback_leibler(vec1, vec2, num_features=None): 869 """Calculate Kullback-Leibler distance between two probability distributions using `scipy.stats.entropy`. 870 871 Parameters 872 ---------- 873 vec1 : {scipy.sparse, numpy.ndarray, list of (int, float)} 874 Distribution vector. 875 vec2 : {scipy.sparse, numpy.ndarray, list of (int, float)} 876 Distribution vector. 877 num_features : int, optional 878 Number of features in the vectors. 879 880 Returns 881 ------- 882 float 883 Kullback-Leibler distance between `vec1` and `vec2`. 884 Value in range [0, +∞) where values closer to 0 mean less distance (higher similarity). 885 886 """ 887 vec1, vec2 = _convert_vec(vec1, vec2, num_features=num_features) 888 return entropy(vec1, vec2) 889 890 891def jensen_shannon(vec1, vec2, num_features=None): 892 """Calculate Jensen-Shannon distance between two probability distributions using `scipy.stats.entropy`. 893 894 Parameters 895 ---------- 896 vec1 : {scipy.sparse, numpy.ndarray, list of (int, float)} 897 Distribution vector. 898 vec2 : {scipy.sparse, numpy.ndarray, list of (int, float)} 899 Distribution vector. 900 num_features : int, optional 901 Number of features in the vectors. 902 903 Returns 904 ------- 905 float 906 Jensen-Shannon distance between `vec1` and `vec2`. 907 908 Notes 909 ----- 910 This is a symmetric and finite "version" of :func:`gensim.matutils.kullback_leibler`. 911 912 """ 913 vec1, vec2 = _convert_vec(vec1, vec2, num_features=num_features) 914 avg_vec = 0.5 * (vec1 + vec2) 915 return 0.5 * (entropy(vec1, avg_vec) + entropy(vec2, avg_vec)) 916 917 918def hellinger(vec1, vec2): 919 """Calculate Hellinger distance between two probability distributions. 920 921 Parameters 922 ---------- 923 vec1 : {scipy.sparse, numpy.ndarray, list of (int, float)} 924 Distribution vector. 925 vec2 : {scipy.sparse, numpy.ndarray, list of (int, float)} 926 Distribution vector. 927 928 Returns 929 ------- 930 float 931 Hellinger distance between `vec1` and `vec2`. 932 Value in range `[0, 1]`, where 0 is min distance (max similarity) and 1 is max distance (min similarity). 933 934 """ 935 if scipy.sparse.issparse(vec1): 936 vec1 = vec1.toarray() 937 if scipy.sparse.issparse(vec2): 938 vec2 = vec2.toarray() 939 if isbow(vec1) and isbow(vec2): 940 # if it is a BoW format, instead of converting to dense we use dictionaries to calculate appropriate distance 941 vec1, vec2 = dict(vec1), dict(vec2) 942 indices = set(list(vec1.keys()) + list(vec2.keys())) 943 sim = np.sqrt( 944 0.5 * sum((np.sqrt(vec1.get(index, 0.0)) - np.sqrt(vec2.get(index, 0.0)))**2 for index in indices) 945 ) 946 return sim 947 else: 948 sim = np.sqrt(0.5 * ((np.sqrt(vec1) - np.sqrt(vec2))**2).sum()) 949 return sim 950 951 952def jaccard(vec1, vec2): 953 """Calculate Jaccard distance between two vectors. 954 955 Parameters 956 ---------- 957 vec1 : {scipy.sparse, numpy.ndarray, list of (int, float)} 958 Distribution vector. 959 vec2 : {scipy.sparse, numpy.ndarray, list of (int, float)} 960 Distribution vector. 961 962 Returns 963 ------- 964 float 965 Jaccard distance between `vec1` and `vec2`. 966 Value in range `[0, 1]`, where 0 is min distance (max similarity) and 1 is max distance (min similarity). 967 968 """ 969 970 # converting from sparse for easier manipulation 971 if scipy.sparse.issparse(vec1): 972 vec1 = vec1.toarray() 973 if scipy.sparse.issparse(vec2): 974 vec2 = vec2.toarray() 975 if isbow(vec1) and isbow(vec2): 976 # if it's in bow format, we use the following definitions: 977 # union = sum of the 'weights' of both the bags 978 # intersection = lowest weight for a particular id; basically the number of common words or items 979 union = sum(weight for id_, weight in vec1) + sum(weight for id_, weight in vec2) 980 vec1, vec2 = dict(vec1), dict(vec2) 981 intersection = 0.0 982 for feature_id, feature_weight in vec1.items(): 983 intersection += min(feature_weight, vec2.get(feature_id, 0.0)) 984 return 1 - float(intersection) / float(union) 985 else: 986 # if it isn't in bag of words format, we can use sets to calculate intersection and union 987 if isinstance(vec1, np.ndarray): 988 vec1 = vec1.tolist() 989 if isinstance(vec2, np.ndarray): 990 vec2 = vec2.tolist() 991 vec1 = set(vec1) 992 vec2 = set(vec2) 993 intersection = vec1 & vec2 994 union = vec1 | vec2 995 return 1 - float(len(intersection)) / float(len(union)) 996 997 998def jaccard_distance(set1, set2): 999 """Calculate Jaccard distance between two sets. 1000 1001 Parameters 1002 ---------- 1003 set1 : set 1004 Input set. 1005 set2 : set 1006 Input set. 1007 1008 Returns 1009 ------- 1010 float 1011 Jaccard distance between `set1` and `set2`. 1012 Value in range `[0, 1]`, where 0 is min distance (max similarity) and 1 is max distance (min similarity). 1013 """ 1014 1015 union_cardinality = len(set1 | set2) 1016 if union_cardinality == 0: # Both sets are empty 1017 return 1. 1018 1019 return 1. - float(len(set1 & set2)) / float(union_cardinality) 1020 1021 1022try: 1023 # try to load fast, cythonized code if possible 1024 from gensim._matutils import logsumexp, mean_absolute_difference, dirichlet_expectation 1025 1026except ImportError: 1027 def logsumexp(x): 1028 """Log of sum of exponentials. 1029 1030 Parameters 1031 ---------- 1032 x : numpy.ndarray 1033 Input 2d matrix. 1034 1035 Returns 1036 ------- 1037 float 1038 log of sum of exponentials of elements in `x`. 1039 1040 Warnings 1041 -------- 1042 For performance reasons, doesn't support NaNs or 1d, 3d, etc arrays like :func:`scipy.special.logsumexp`. 1043 1044 """ 1045 x_max = np.max(x) 1046 x = np.log(np.sum(np.exp(x - x_max))) 1047 x += x_max 1048 1049 return x 1050 1051 def mean_absolute_difference(a, b): 1052 """Mean absolute difference between two arrays. 1053 1054 Parameters 1055 ---------- 1056 a : numpy.ndarray 1057 Input 1d array. 1058 b : numpy.ndarray 1059 Input 1d array. 1060 1061 Returns 1062 ------- 1063 float 1064 mean(abs(a - b)). 1065 1066 """ 1067 return np.mean(np.abs(a - b)) 1068 1069 def dirichlet_expectation(alpha): 1070 """Expected value of log(theta) where theta is drawn from a Dirichlet distribution. 1071 1072 Parameters 1073 ---------- 1074 alpha : numpy.ndarray 1075 Dirichlet parameter 2d matrix or 1d vector, if 2d - each row is treated as a separate parameter vector. 1076 1077 Returns 1078 ------- 1079 numpy.ndarray 1080 Log of expected values, dimension same as `alpha.ndim`. 1081 1082 """ 1083 if len(alpha.shape) == 1: 1084 result = psi(alpha) - psi(np.sum(alpha)) 1085 else: 1086 result = psi(alpha) - psi(np.sum(alpha, 1))[:, np.newaxis] 1087 return result.astype(alpha.dtype, copy=False) # keep the same precision as input 1088 1089 1090def qr_destroy(la): 1091 """Get QR decomposition of `la[0]`. 1092 1093 Parameters 1094 ---------- 1095 la : list of numpy.ndarray 1096 Run QR decomposition on the first elements of `la`. Must not be empty. 1097 1098 Returns 1099 ------- 1100 (numpy.ndarray, numpy.ndarray) 1101 Matrices :math:`Q` and :math:`R`. 1102 1103 Notes 1104 ----- 1105 Using this function is less memory intense than calling `scipy.linalg.qr(la[0])`, 1106 because the memory used in `la[0]` is reclaimed earlier. This makes a difference when 1107 decomposing very large arrays, where every memory copy counts. 1108 1109 Warnings 1110 -------- 1111 Content of `la` as well as `la[0]` gets destroyed in the process. Again, for memory-effiency reasons. 1112 1113 """ 1114 a = np.asfortranarray(la[0]) 1115 del la[0], la # now `a` is the only reference to the input matrix 1116 m, n = a.shape 1117 # perform q, r = QR(a); code hacked out of scipy.linalg.qr 1118 logger.debug("computing QR of %s dense matrix", str(a.shape)) 1119 geqrf, = get_lapack_funcs(('geqrf',), (a,)) 1120 qr, tau, work, info = geqrf(a, lwork=-1, overwrite_a=True) 1121 qr, tau, work, info = geqrf(a, lwork=work[0], overwrite_a=True) 1122 del a # free up mem 1123 assert info >= 0 1124 r = triu(qr[:n, :n]) 1125 if m < n: # rare case, #features < #topics 1126 qr = qr[:, :m] # retains fortran order 1127 gorgqr, = get_lapack_funcs(('orgqr',), (qr,)) 1128 q, work, info = gorgqr(qr, tau, lwork=-1, overwrite_a=True) 1129 q, work, info = gorgqr(qr, tau, lwork=work[0], overwrite_a=True) 1130 assert info >= 0, "qr failed" 1131 assert q.flags.f_contiguous 1132 return q, r 1133 1134 1135class MmWriter: 1136 """Store a corpus in `Matrix Market format <https://math.nist.gov/MatrixMarket/formats.html>`_, 1137 using :class:`~gensim.corpora.mmcorpus.MmCorpus`. 1138 1139 Notes 1140 ----- 1141 The output is written one document at a time, not the whole matrix at once (unlike e.g. `scipy.io.mmread`). 1142 This allows you to write corpora which are larger than the available RAM. 1143 1144 The output file is created in a single pass through the input corpus, so that the input can be 1145 a once-only stream (generator). 1146 1147 To achieve this, a fake MM header is written first, corpus statistics are collected 1148 during the pass (shape of the matrix, number of non-zeroes), followed by a seek back to the beginning of the file, 1149 rewriting the fake header with the final values. 1150 1151 """ 1152 HEADER_LINE = b'%%MatrixMarket matrix coordinate real general\n' # the only supported MM format 1153 1154 def __init__(self, fname): 1155 """ 1156 1157 Parameters 1158 ---------- 1159 fname : str 1160 Path to output file. 1161 1162 """ 1163 self.fname = fname 1164 if fname.endswith(".gz") or fname.endswith('.bz2'): 1165 raise NotImplementedError("compressed output not supported with MmWriter") 1166 self.fout = utils.open(self.fname, 'wb+') # open for both reading and writing 1167 self.headers_written = False 1168 1169 def write_headers(self, num_docs, num_terms, num_nnz): 1170 """Write headers to file. 1171 1172 Parameters 1173 ---------- 1174 num_docs : int 1175 Number of documents in corpus. 1176 num_terms : int 1177 Number of term in corpus. 1178 num_nnz : int 1179 Number of non-zero elements in corpus. 1180 1181 """ 1182 self.fout.write(MmWriter.HEADER_LINE) 1183 1184 if num_nnz < 0: 1185 # we don't know the matrix shape/density yet, so only log a general line 1186 logger.info("saving sparse matrix to %s", self.fname) 1187 self.fout.write(utils.to_utf8(' ' * 50 + '\n')) # 48 digits must be enough for everybody 1188 else: 1189 logger.info( 1190 "saving sparse %sx%s matrix with %i non-zero entries to %s", 1191 num_docs, num_terms, num_nnz, self.fname 1192 ) 1193 self.fout.write(utils.to_utf8('%s %s %s\n' % (num_docs, num_terms, num_nnz))) 1194 self.last_docno = -1 1195 self.headers_written = True 1196 1197 def fake_headers(self, num_docs, num_terms, num_nnz): 1198 """Write "fake" headers to file, to be rewritten once we've scanned the entire corpus. 1199 1200 Parameters 1201 ---------- 1202 num_docs : int 1203 Number of documents in corpus. 1204 num_terms : int 1205 Number of term in corpus. 1206 num_nnz : int 1207 Number of non-zero elements in corpus. 1208 1209 """ 1210 stats = '%i %i %i' % (num_docs, num_terms, num_nnz) 1211 if len(stats) > 50: 1212 raise ValueError('Invalid stats: matrix too large!') 1213 self.fout.seek(len(MmWriter.HEADER_LINE)) 1214 self.fout.write(utils.to_utf8(stats)) 1215 1216 def write_vector(self, docno, vector): 1217 """Write a single sparse vector to the file. 1218 1219 Parameters 1220 ---------- 1221 docno : int 1222 Number of document. 1223 vector : list of (int, number) 1224 Document in BoW format. 1225 1226 Returns 1227 ------- 1228 (int, int) 1229 Max word index in vector and len of vector. If vector is empty, return (-1, 0). 1230 1231 """ 1232 assert self.headers_written, "must write Matrix Market file headers before writing data!" 1233 assert self.last_docno < docno, "documents %i and %i not in sequential order!" % (self.last_docno, docno) 1234 vector = sorted((i, w) for i, w in vector if abs(w) > 1e-12) # ignore near-zero entries 1235 for termid, weight in vector: # write term ids in sorted order 1236 # +1 because MM format starts counting from 1 1237 self.fout.write(utils.to_utf8("%i %i %s\n" % (docno + 1, termid + 1, weight))) 1238 self.last_docno = docno 1239 return (vector[-1][0], len(vector)) if vector else (-1, 0) 1240 1241 @staticmethod 1242 def write_corpus(fname, corpus, progress_cnt=1000, index=False, num_terms=None, metadata=False): 1243 """Save the corpus to disk in `Matrix Market format <https://math.nist.gov/MatrixMarket/formats.html>`_. 1244 1245 Parameters 1246 ---------- 1247 fname : str 1248 Filename of the resulting file. 1249 corpus : iterable of list of (int, number) 1250 Corpus in streamed bag-of-words format. 1251 progress_cnt : int, optional 1252 Print progress for every `progress_cnt` number of documents. 1253 index : bool, optional 1254 Return offsets? 1255 num_terms : int, optional 1256 Number of terms in the corpus. If provided, the `corpus.num_terms` attribute (if any) will be ignored. 1257 metadata : bool, optional 1258 Generate a metadata file? 1259 1260 Returns 1261 ------- 1262 offsets : {list of int, None} 1263 List of offsets (if index=True) or nothing. 1264 1265 Notes 1266 ----- 1267 Documents are processed one at a time, so the whole corpus is allowed to be larger than the available RAM. 1268 1269 See Also 1270 -------- 1271 :func:`gensim.corpora.mmcorpus.MmCorpus.save_corpus` 1272 Save corpus to disk. 1273 1274 """ 1275 mw = MmWriter(fname) 1276 1277 # write empty headers to the file (with enough space to be overwritten later) 1278 mw.write_headers(-1, -1, -1) # will print 50 spaces followed by newline on the stats line 1279 1280 # calculate necessary header info (nnz elements, num terms, num docs) while writing out vectors 1281 _num_terms, num_nnz = 0, 0 1282 docno, poslast = -1, -1 1283 offsets = [] 1284 if hasattr(corpus, 'metadata'): 1285 orig_metadata = corpus.metadata 1286 corpus.metadata = metadata 1287 if metadata: 1288 docno2metadata = {} 1289 else: 1290 metadata = False 1291 for docno, doc in enumerate(corpus): 1292 if metadata: 1293 bow, data = doc 1294 docno2metadata[docno] = data 1295 else: 1296 bow = doc 1297 if docno % progress_cnt == 0: 1298 logger.info("PROGRESS: saving document #%i", docno) 1299 if index: 1300 posnow = mw.fout.tell() 1301 if posnow == poslast: 1302 offsets[-1] = -1 1303 offsets.append(posnow) 1304 poslast = posnow 1305 max_id, veclen = mw.write_vector(docno, bow) 1306 _num_terms = max(_num_terms, 1 + max_id) 1307 num_nnz += veclen 1308 if metadata: 1309 utils.pickle(docno2metadata, fname + '.metadata.cpickle') 1310 corpus.metadata = orig_metadata 1311 1312 num_docs = docno + 1 1313 num_terms = num_terms or _num_terms 1314 1315 if num_docs * num_terms != 0: 1316 logger.info( 1317 "saved %ix%i matrix, density=%.3f%% (%i/%i)", 1318 num_docs, num_terms, 100.0 * num_nnz / (num_docs * num_terms), num_nnz, num_docs * num_terms 1319 ) 1320 1321 # now write proper headers, by seeking and overwriting the spaces written earlier 1322 mw.fake_headers(num_docs, num_terms, num_nnz) 1323 1324 mw.close() 1325 if index: 1326 return offsets 1327 1328 def __del__(self): 1329 """Close `self.fout` file. Alias for :meth:`~gensim.matutils.MmWriter.close`. 1330 1331 Warnings 1332 -------- 1333 Closing the file explicitly via the close() method is preferred and safer. 1334 1335 """ 1336 self.close() # does nothing if called twice (on an already closed file), so no worries 1337 1338 def close(self): 1339 """Close `self.fout` file.""" 1340 logger.debug("closing %s", self.fname) 1341 if hasattr(self, 'fout'): 1342 self.fout.close() 1343 1344 1345try: 1346 from gensim.corpora._mmreader import MmReader # noqa: F401 1347except ImportError: 1348 raise utils.NO_CYTHON 1349