1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3#
4# Copyright (C) 2011 Radim Rehurek <radimrehurek@seznam.cz>
5# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
6
7"""Math helper functions."""
8
9from __future__ import with_statement
10
11
12import logging
13import math
14
15from gensim import utils
16
17import numpy as np
18import scipy.sparse
19from scipy.stats import entropy
20import scipy.linalg
21from scipy.linalg.lapack import get_lapack_funcs
22from scipy.linalg.special_matrices import triu
23from scipy.special import psi  # gamma function utils
24
25
26logger = logging.getLogger(__name__)
27
28
29def blas(name, ndarray):
30    """Helper for getting the appropriate BLAS function, using :func:`scipy.linalg.get_blas_funcs`.
31
32    Parameters
33    ----------
34    name : str
35        Name(s) of BLAS functions, without the type prefix.
36    ndarray : numpy.ndarray
37        Arrays can be given to determine optimal prefix of BLAS routines.
38
39    Returns
40    -------
41    object
42        BLAS function for the needed operation on the given data type.
43
44    """
45    return scipy.linalg.get_blas_funcs((name,), (ndarray,))[0]
46
47
48def argsort(x, topn=None, reverse=False):
49    """Efficiently calculate indices of the `topn` smallest elements in array `x`.
50
51    Parameters
52    ----------
53    x : array_like
54        Array to get the smallest element indices from.
55    topn : int, optional
56        Number of indices of the smallest (greatest) elements to be returned.
57        If not given, indices of all elements will be returned in ascending (descending) order.
58    reverse : bool, optional
59        Return the `topn` greatest elements in descending order,
60        instead of smallest elements in ascending order?
61
62    Returns
63    -------
64    numpy.ndarray
65        Array of `topn` indices that sort the array in the requested order.
66
67    """
68    x = np.asarray(x)  # unify code path for when `x` is not a np array (list, tuple...)
69    if topn is None:
70        topn = x.size
71    if topn <= 0:
72        return []
73    if reverse:
74        x = -x
75    if topn >= x.size or not hasattr(np, 'argpartition'):
76        return np.argsort(x)[:topn]
77    # np >= 1.8 has a fast partial argsort, use that!
78    most_extreme = np.argpartition(x, topn)[:topn]
79    return most_extreme.take(np.argsort(x.take(most_extreme)))  # resort topn into order
80
81
82def corpus2csc(corpus, num_terms=None, dtype=np.float64, num_docs=None, num_nnz=None, printprogress=0):
83    """Convert a streamed corpus in bag-of-words format into a sparse matrix `scipy.sparse.csc_matrix`,
84    with documents as columns.
85
86    Notes
87    -----
88    If the number of terms, documents and non-zero elements is known, you can pass
89    them here as parameters and a (much) more memory efficient code path will be taken.
90
91    Parameters
92    ----------
93    corpus : iterable of iterable of (int, number)
94        Input corpus in BoW format
95    num_terms : int, optional
96        Number of terms in `corpus`. If provided, the `corpus.num_terms` attribute (if any) will be ignored.
97    dtype : data-type, optional
98        Data type of output CSC matrix.
99    num_docs : int, optional
100        Number of documents in `corpus`. If provided, the `corpus.num_docs` attribute (in any) will be ignored.
101    num_nnz : int, optional
102        Number of non-zero elements in `corpus`. If provided, the `corpus.num_nnz` attribute (if any) will be ignored.
103    printprogress : int, optional
104        Log a progress message at INFO level once every `printprogress` documents. 0 to turn off progress logging.
105
106    Returns
107    -------
108    scipy.sparse.csc_matrix
109        `corpus` converted into a sparse CSC matrix.
110
111    See Also
112    --------
113    :class:`~gensim.matutils.Sparse2Corpus`
114        Convert sparse format to Gensim corpus format.
115
116    """
117    try:
118        # if the input corpus has the `num_nnz`, `num_docs` and `num_terms` attributes
119        # (as is the case with MmCorpus for example), we can use a more efficient code path
120        if num_terms is None:
121            num_terms = corpus.num_terms
122        if num_docs is None:
123            num_docs = corpus.num_docs
124        if num_nnz is None:
125            num_nnz = corpus.num_nnz
126    except AttributeError:
127        pass  # not a MmCorpus...
128    if printprogress:
129        logger.info("creating sparse matrix from corpus")
130    if num_terms is not None and num_docs is not None and num_nnz is not None:
131        # faster and much more memory-friendly version of creating the sparse csc
132        posnow, indptr = 0, [0]
133        indices = np.empty((num_nnz,), dtype=np.int32)  # HACK assume feature ids fit in 32bit integer
134        data = np.empty((num_nnz,), dtype=dtype)
135        for docno, doc in enumerate(corpus):
136            if printprogress and docno % printprogress == 0:
137                logger.info("PROGRESS: at document #%i/%i", docno, num_docs)
138            posnext = posnow + len(doc)
139            # zip(*doc) transforms doc to (token_indices, token_counts]
140            indices[posnow: posnext], data[posnow: posnext] = zip(*doc) if doc else ([], [])
141            indptr.append(posnext)
142            posnow = posnext
143        assert posnow == num_nnz, "mismatch between supplied and computed number of non-zeros"
144        result = scipy.sparse.csc_matrix((data, indices, indptr), shape=(num_terms, num_docs), dtype=dtype)
145    else:
146        # slower version; determine the sparse matrix parameters during iteration
147        num_nnz, data, indices, indptr = 0, [], [], [0]
148        for docno, doc in enumerate(corpus):
149            if printprogress and docno % printprogress == 0:
150                logger.info("PROGRESS: at document #%i", docno)
151
152            # zip(*doc) transforms doc to (token_indices, token_counts]
153            doc_indices, doc_data = zip(*doc) if doc else ([], [])
154            indices.extend(doc_indices)
155            data.extend(doc_data)
156            num_nnz += len(doc)
157            indptr.append(num_nnz)
158        if num_terms is None:
159            num_terms = max(indices) + 1 if indices else 0
160        num_docs = len(indptr) - 1
161        # now num_docs, num_terms and num_nnz contain the correct values
162        data = np.asarray(data, dtype=dtype)
163        indices = np.asarray(indices)
164        result = scipy.sparse.csc_matrix((data, indices, indptr), shape=(num_terms, num_docs), dtype=dtype)
165    return result
166
167
168def pad(mat, padrow, padcol):
169    """Add additional rows/columns to `mat`. The new rows/columns will be initialized with zeros.
170
171    Parameters
172    ----------
173    mat : numpy.ndarray
174        Input 2D matrix
175    padrow : int
176        Number of additional rows
177    padcol : int
178        Number of additional columns
179
180    Returns
181    -------
182    numpy.matrixlib.defmatrix.matrix
183        Matrix with needed padding.
184
185    """
186    if padrow < 0:
187        padrow = 0
188    if padcol < 0:
189        padcol = 0
190    rows, cols = mat.shape
191    return np.block([
192        [mat, np.zeros((rows, padcol))],
193        [np.zeros((padrow, cols + padcol))],
194    ])
195
196
197def zeros_aligned(shape, dtype, order='C', align=128):
198    """Get array aligned at `align` byte boundary in memory.
199
200    Parameters
201    ----------
202    shape : int or (int, int)
203        Shape of array.
204    dtype : data-type
205        Data type of array.
206    order : {'C', 'F'}, optional
207        Whether to store multidimensional data in C- or Fortran-contiguous (row- or column-wise) order in memory.
208    align : int, optional
209        Boundary for alignment in bytes.
210
211    Returns
212    -------
213    numpy.ndarray
214        Aligned array.
215
216    """
217    nbytes = np.prod(shape, dtype=np.int64) * np.dtype(dtype).itemsize
218    buffer = np.zeros(nbytes + align, dtype=np.uint8)  # problematic on win64 ("maximum allowed dimension exceeded")
219    start_index = -buffer.ctypes.data % align
220    return buffer[start_index: start_index + nbytes].view(dtype).reshape(shape, order=order)
221
222
223def ismatrix(m):
224    """Check whether `m` is a 2D `numpy.ndarray` or `scipy.sparse` matrix.
225
226    Parameters
227    ----------
228    m : object
229        Object to check.
230
231    Returns
232    -------
233    bool
234        Is `m` a 2D `numpy.ndarray` or `scipy.sparse` matrix.
235
236    """
237    return isinstance(m, np.ndarray) and m.ndim == 2 or scipy.sparse.issparse(m)
238
239
240def any2sparse(vec, eps=1e-9):
241    """Convert a numpy.ndarray or `scipy.sparse` vector into the Gensim bag-of-words format.
242
243    Parameters
244    ----------
245    vec : {`numpy.ndarray`, `scipy.sparse`}
246        Input vector
247    eps : float, optional
248        Value used for threshold, all coordinates less than `eps` will not be presented in result.
249
250    Returns
251    -------
252    list of (int, float)
253        Vector in BoW format.
254
255    """
256    if isinstance(vec, np.ndarray):
257        return dense2vec(vec, eps)
258    if scipy.sparse.issparse(vec):
259        return scipy2sparse(vec, eps)
260    return [(int(fid), float(fw)) for fid, fw in vec if np.abs(fw) > eps]
261
262
263def scipy2scipy_clipped(matrix, topn, eps=1e-9):
264    """Get the 'topn' elements of the greatest magnitude (absolute value) from a `scipy.sparse` vector or matrix.
265
266    Parameters
267    ----------
268    matrix : `scipy.sparse`
269        Input vector or matrix (1D or 2D sparse array).
270    topn : int
271        Number of greatest elements, in absolute value, to return.
272    eps : float
273        Ignored.
274
275    Returns
276    -------
277    `scipy.sparse.csr.csr_matrix`
278        Clipped matrix.
279
280    """
281    if not scipy.sparse.issparse(matrix):
282        raise ValueError("'%s' is not a scipy sparse vector." % matrix)
283    if topn <= 0:
284        return scipy.sparse.csr_matrix([])
285    # Return clipped sparse vector if input is a sparse vector.
286    if matrix.shape[0] == 1:
287        # use np.argpartition/argsort and only form tuples that are actually returned.
288        biggest = argsort(abs(matrix.data), topn, reverse=True)
289        indices, data = matrix.indices.take(biggest), matrix.data.take(biggest)
290        return scipy.sparse.csr_matrix((data, indices, [0, len(indices)]))
291    # Return clipped sparse matrix if input is a matrix, processing row by row.
292    else:
293        matrix_indices = []
294        matrix_data = []
295        matrix_indptr = [0]
296        # calling abs() on entire matrix once is faster than calling abs() iteratively for each row
297        matrix_abs = abs(matrix)
298        for i in range(matrix.shape[0]):
299            v = matrix.getrow(i)
300            v_abs = matrix_abs.getrow(i)
301            # Sort and clip each row vector first.
302            biggest = argsort(v_abs.data, topn, reverse=True)
303            indices, data = v.indices.take(biggest), v.data.take(biggest)
304            # Store the topn indices and values of each row vector.
305            matrix_data.append(data)
306            matrix_indices.append(indices)
307            matrix_indptr.append(matrix_indptr[-1] + min(len(indices), topn))
308        matrix_indices = np.concatenate(matrix_indices).ravel()
309        matrix_data = np.concatenate(matrix_data).ravel()
310        # Instantiate and return a sparse csr_matrix which preserves the order of indices/data.
311        return scipy.sparse.csr.csr_matrix(
312            (matrix_data, matrix_indices, matrix_indptr),
313            shape=(matrix.shape[0], np.max(matrix_indices) + 1)
314        )
315
316
317def scipy2sparse(vec, eps=1e-9):
318    """Convert a scipy.sparse vector into the Gensim bag-of-words format.
319
320    Parameters
321    ----------
322    vec : `scipy.sparse`
323        Sparse vector.
324
325    eps : float, optional
326        Value used for threshold, all coordinates less than `eps` will not be presented in result.
327
328    Returns
329    -------
330    list of (int, float)
331        Vector in Gensim bag-of-words format.
332
333    """
334    vec = vec.tocsr()
335    assert vec.shape[0] == 1
336    return [(int(pos), float(val)) for pos, val in zip(vec.indices, vec.data) if np.abs(val) > eps]
337
338
339class Scipy2Corpus:
340    """Convert a sequence of dense/sparse vectors into a streamed Gensim corpus object.
341
342    See Also
343    --------
344    :func:`~gensim.matutils.corpus2csc`
345        Convert corpus in Gensim format to `scipy.sparse.csc` matrix.
346
347    """
348    def __init__(self, vecs):
349        """
350
351        Parameters
352        ----------
353        vecs : iterable of {`numpy.ndarray`, `scipy.sparse`}
354            Input vectors.
355
356        """
357        self.vecs = vecs
358
359    def __iter__(self):
360        for vec in self.vecs:
361            if isinstance(vec, np.ndarray):
362                yield full2sparse(vec)
363            else:
364                yield scipy2sparse(vec)
365
366    def __len__(self):
367        return len(self.vecs)
368
369
370def sparse2full(doc, length):
371    """Convert a document in Gensim bag-of-words format into a dense numpy array.
372
373    Parameters
374    ----------
375    doc : list of (int, number)
376        Document in BoW format.
377    length : int
378        Vector dimensionality. This cannot be inferred from the BoW, and you must supply it explicitly.
379        This is typically the vocabulary size or number of topics, depending on how you created `doc`.
380
381    Returns
382    -------
383    numpy.ndarray
384        Dense numpy vector for `doc`.
385
386    See Also
387    --------
388    :func:`~gensim.matutils.full2sparse`
389        Convert dense array to gensim bag-of-words format.
390
391    """
392    result = np.zeros(length, dtype=np.float32)  # fill with zeroes (default value)
393    # convert indices to int as numpy 1.12 no longer indexes by floats
394    doc = ((int(id_), float(val_)) for (id_, val_) in doc)
395
396    doc = dict(doc)
397    # overwrite some of the zeroes with explicit values
398    result[list(doc)] = list(doc.values())
399    return result
400
401
402def full2sparse(vec, eps=1e-9):
403    """Convert a dense numpy array into the Gensim bag-of-words format.
404
405    Parameters
406    ----------
407    vec : numpy.ndarray
408        Dense input vector.
409    eps : float
410        Feature weight threshold value. Features with `abs(weight) < eps` are considered sparse and
411        won't be included in the BOW result.
412
413    Returns
414    -------
415    list of (int, float)
416        BoW format of `vec`, with near-zero values omitted (sparse vector).
417
418    See Also
419    --------
420    :func:`~gensim.matutils.sparse2full`
421        Convert a document in Gensim bag-of-words format into a dense numpy array.
422
423    """
424    vec = np.asarray(vec, dtype=float)
425    nnz = np.nonzero(abs(vec) > eps)[0]
426    return list(zip(nnz, vec.take(nnz)))
427
428
429dense2vec = full2sparse
430
431
432def full2sparse_clipped(vec, topn, eps=1e-9):
433    """Like :func:`~gensim.matutils.full2sparse`, but only return the `topn` elements of the greatest magnitude (abs).
434
435    This is more efficient that sorting a vector and then taking the greatest values, especially
436    where `len(vec) >> topn`.
437
438    Parameters
439    ----------
440    vec : numpy.ndarray
441        Input dense vector
442    topn : int
443        Number of greatest (abs) elements that will be presented in result.
444    eps : float
445        Threshold value, if coordinate in `vec` < eps, this will not be presented in result.
446
447    Returns
448    -------
449    list of (int, float)
450        Clipped vector in BoW format.
451
452    See Also
453    --------
454    :func:`~gensim.matutils.full2sparse`
455        Convert dense array to gensim bag-of-words format.
456
457    """
458    # use np.argpartition/argsort and only form tuples that are actually returned.
459    # this is about 40x faster than explicitly forming all 2-tuples to run sort() or heapq.nlargest() on.
460    if topn <= 0:
461        return []
462    vec = np.asarray(vec, dtype=float)
463    nnz = np.nonzero(abs(vec) > eps)[0]
464    biggest = nnz.take(argsort(abs(vec).take(nnz), topn, reverse=True))
465    return list(zip(biggest, vec.take(biggest)))
466
467
468def corpus2dense(corpus, num_terms, num_docs=None, dtype=np.float32):
469    """Convert corpus into a dense numpy 2D array, with documents as columns.
470
471    Parameters
472    ----------
473    corpus : iterable of iterable of (int, number)
474        Input corpus in the Gensim bag-of-words format.
475    num_terms : int
476        Number of terms in the dictionary. X-axis of the resulting matrix.
477    num_docs : int, optional
478        Number of documents in the corpus. If provided, a slightly more memory-efficient code path is taken.
479        Y-axis of the resulting matrix.
480    dtype : data-type, optional
481        Data type of the output matrix.
482
483    Returns
484    -------
485    numpy.ndarray
486        Dense 2D array that presents `corpus`.
487
488    See Also
489    --------
490    :class:`~gensim.matutils.Dense2Corpus`
491        Convert dense matrix to Gensim corpus format.
492
493    """
494    if num_docs is not None:
495        # we know the number of documents => don't bother column_stacking
496        docno, result = -1, np.empty((num_terms, num_docs), dtype=dtype)
497        for docno, doc in enumerate(corpus):
498            result[:, docno] = sparse2full(doc, num_terms)
499        assert docno + 1 == num_docs
500    else:
501        # The below used to be a generator, but NumPy deprecated generator as of 1.16 with:
502        # """
503        # FutureWarning: arrays to stack must be passed as a "sequence" type such as list or tuple.
504        # Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error
505        # in the future.
506        # """
507        result = np.column_stack([sparse2full(doc, num_terms) for doc in corpus])
508    return result.astype(dtype)
509
510
511class Dense2Corpus:
512    """Treat dense numpy array as a streamed Gensim corpus in the bag-of-words format.
513
514    Notes
515    -----
516    No data copy is made (changes to the underlying matrix imply changes in the streamed corpus).
517
518    See Also
519    --------
520    :func:`~gensim.matutils.corpus2dense`
521        Convert Gensim corpus to dense matrix.
522    :class:`~gensim.matutils.Sparse2Corpus`
523        Convert sparse matrix to Gensim corpus format.
524
525    """
526    def __init__(self, dense, documents_columns=True):
527        """
528
529        Parameters
530        ----------
531        dense : numpy.ndarray
532            Corpus in dense format.
533        documents_columns : bool, optional
534            Documents in `dense` represented as columns, as opposed to rows?
535
536        """
537        if documents_columns:
538            self.dense = dense.T
539        else:
540            self.dense = dense
541
542    def __iter__(self):
543        """Iterate over the corpus.
544
545        Yields
546        ------
547        list of (int, float)
548            Document in BoW format.
549
550        """
551        for doc in self.dense:
552            yield full2sparse(doc.flat)
553
554    def __len__(self):
555        return len(self.dense)
556
557
558class Sparse2Corpus:
559    """Convert a matrix in scipy.sparse format into a streaming Gensim corpus.
560
561    See Also
562    --------
563    :func:`~gensim.matutils.corpus2csc`
564        Convert gensim corpus format to `scipy.sparse.csc` matrix
565    :class:`~gensim.matutils.Dense2Corpus`
566        Convert dense matrix to gensim corpus.
567
568    """
569    def __init__(self, sparse, documents_columns=True):
570        """
571
572        Parameters
573        ----------
574        sparse : `scipy.sparse`
575            Corpus scipy sparse format
576        documents_columns : bool, optional
577            Documents will be column?
578
579        """
580        if documents_columns:
581            self.sparse = sparse.tocsc()
582        else:
583            self.sparse = sparse.tocsr().T  # make sure shape[1]=number of docs (needed in len())
584
585    def __iter__(self):
586        """
587
588        Yields
589        ------
590        list of (int, float)
591            Document in BoW format.
592
593        """
594        for indprev, indnow in zip(self.sparse.indptr, self.sparse.indptr[1:]):
595            yield list(zip(self.sparse.indices[indprev:indnow], self.sparse.data[indprev:indnow]))
596
597    def __len__(self):
598        return self.sparse.shape[1]
599
600    def __getitem__(self, document_index):
601        """Retrieve a document vector from the corpus by its index.
602
603        Parameters
604        ----------
605        document_index : int
606            Index of document
607
608        Returns
609        -------
610        list of (int, number)
611            Document in BoW format.
612
613        """
614        indprev = self.sparse.indptr[document_index]
615        indnow = self.sparse.indptr[document_index + 1]
616        return list(zip(self.sparse.indices[indprev:indnow], self.sparse.data[indprev:indnow]))
617
618
619def veclen(vec):
620    """Calculate L2 (euclidean) length of a vector.
621
622    Parameters
623    ----------
624    vec : list of (int, number)
625        Input vector in sparse bag-of-words format.
626
627    Returns
628    -------
629    float
630        Length of `vec`.
631
632    """
633    if len(vec) == 0:
634        return 0.0
635    length = 1.0 * math.sqrt(sum(val**2 for _, val in vec))
636    assert length > 0.0, "sparse documents must not contain any explicit zero entries"
637    return length
638
639
640def ret_normalized_vec(vec, length):
641    """Normalize a vector in L2 (Euclidean unit norm).
642
643    Parameters
644    ----------
645    vec : list of (int, number)
646        Input vector in BoW format.
647    length : float
648        Length of vector
649
650    Returns
651    -------
652    list of (int, number)
653        L2-normalized vector in BoW format.
654
655    """
656    if length != 1.0:
657        return [(termid, val / length) for termid, val in vec]
658    else:
659        return list(vec)
660
661
662def ret_log_normalize_vec(vec, axis=1):
663    log_max = 100.0
664    if len(vec.shape) == 1:
665        max_val = np.max(vec)
666        log_shift = log_max - np.log(len(vec) + 1.0) - max_val
667        tot = np.sum(np.exp(vec + log_shift))
668        log_norm = np.log(tot) - log_shift
669        vec -= log_norm
670    else:
671        if axis == 1:  # independently normalize each sample
672            max_val = np.max(vec, 1)
673            log_shift = log_max - np.log(vec.shape[1] + 1.0) - max_val
674            tot = np.sum(np.exp(vec + log_shift[:, np.newaxis]), 1)
675            log_norm = np.log(tot) - log_shift
676            vec = vec - log_norm[:, np.newaxis]
677        elif axis == 0:  # normalize each feature
678            k = ret_log_normalize_vec(vec.T)
679            return k[0].T, k[1]
680        else:
681            raise ValueError("'%s' is not a supported axis" % axis)
682    return vec, log_norm
683
684
685blas_nrm2 = blas('nrm2', np.array([], dtype=float))
686blas_scal = blas('scal', np.array([], dtype=float))
687
688
689def unitvec(vec, norm='l2', return_norm=False):
690    """Scale a vector to unit length.
691
692    Parameters
693    ----------
694    vec : {numpy.ndarray, scipy.sparse, list of (int, float)}
695        Input vector in any format
696    norm : {'l1', 'l2', 'unique'}, optional
697        Metric to normalize in.
698    return_norm : bool, optional
699        Return the length of vector `vec`, in addition to the normalized vector itself?
700
701    Returns
702    -------
703    numpy.ndarray, scipy.sparse, list of (int, float)}
704        Normalized vector in same format as `vec`.
705    float
706        Length of `vec` before normalization, if `return_norm` is set.
707
708    Notes
709    -----
710    Zero-vector will be unchanged.
711
712    """
713    supported_norms = ('l1', 'l2', 'unique')
714    if norm not in supported_norms:
715        raise ValueError("'%s' is not a supported norm. Currently supported norms are %s." % (norm, supported_norms))
716
717    if scipy.sparse.issparse(vec):
718        vec = vec.tocsr()
719        if norm == 'l1':
720            veclen = np.sum(np.abs(vec.data))
721        if norm == 'l2':
722            veclen = np.sqrt(np.sum(vec.data ** 2))
723        if norm == 'unique':
724            veclen = vec.nnz
725        if veclen > 0.0:
726            if np.issubdtype(vec.dtype, np.integer):
727                vec = vec.astype(float)
728            vec /= veclen
729            if return_norm:
730                return vec, veclen
731            else:
732                return vec
733        else:
734            if return_norm:
735                return vec, 1.0
736            else:
737                return vec
738
739    if isinstance(vec, np.ndarray):
740        if norm == 'l1':
741            veclen = np.sum(np.abs(vec))
742        if norm == 'l2':
743            if vec.size == 0:
744                veclen = 0.0
745            else:
746                veclen = blas_nrm2(vec)
747        if norm == 'unique':
748            veclen = np.count_nonzero(vec)
749        if veclen > 0.0:
750            if np.issubdtype(vec.dtype, np.integer):
751                vec = vec.astype(float)
752            if return_norm:
753                return blas_scal(1.0 / veclen, vec).astype(vec.dtype), veclen
754            else:
755                return blas_scal(1.0 / veclen, vec).astype(vec.dtype)
756        else:
757            if return_norm:
758                return vec, 1.0
759            else:
760                return vec
761
762    try:
763        first = next(iter(vec))  # is there at least one element?
764    except StopIteration:
765        if return_norm:
766            return vec, 1.0
767        else:
768            return vec
769
770    if isinstance(first, (tuple, list)) and len(first) == 2:  # gensim sparse format
771        if norm == 'l1':
772            length = float(sum(abs(val) for _, val in vec))
773        if norm == 'l2':
774            length = 1.0 * math.sqrt(sum(val ** 2 for _, val in vec))
775        if norm == 'unique':
776            length = 1.0 * len(vec)
777        assert length > 0.0, "sparse documents must not contain any explicit zero entries"
778        if return_norm:
779            return ret_normalized_vec(vec, length), length
780        else:
781            return ret_normalized_vec(vec, length)
782    else:
783        raise ValueError("unknown input type")
784
785
786def cossim(vec1, vec2):
787    """Get cosine similarity between two sparse vectors.
788
789    Cosine similarity is a number between `<-1.0, 1.0>`, higher means more similar.
790
791    Parameters
792    ----------
793    vec1 : list of (int, float)
794        Vector in BoW format.
795    vec2 : list of (int, float)
796        Vector in BoW format.
797
798    Returns
799    -------
800    float
801        Cosine similarity between `vec1` and `vec2`.
802
803    """
804    vec1, vec2 = dict(vec1), dict(vec2)
805    if not vec1 or not vec2:
806        return 0.0
807    vec1len = 1.0 * math.sqrt(sum(val * val for val in vec1.values()))
808    vec2len = 1.0 * math.sqrt(sum(val * val for val in vec2.values()))
809    assert vec1len > 0.0 and vec2len > 0.0, "sparse documents must not contain any explicit zero entries"
810    if len(vec2) < len(vec1):
811        vec1, vec2 = vec2, vec1  # swap references so that we iterate over the shorter vector
812    result = sum(value * vec2.get(index, 0.0) for index, value in vec1.items())
813    result /= vec1len * vec2len  # rescale by vector lengths
814    return result
815
816
817def isbow(vec):
818    """Checks if a vector is in the sparse Gensim bag-of-words format.
819
820    Parameters
821    ----------
822    vec : object
823        Object to check.
824
825    Returns
826    -------
827    bool
828        Is `vec` in BoW format.
829
830    """
831    if scipy.sparse.issparse(vec):
832        vec = vec.todense().tolist()
833    try:
834        id_, val_ = vec[0]  # checking first value to see if it is in bag of words format by unpacking
835        int(id_), float(val_)
836    except IndexError:
837        return True  # this is to handle the empty input case
838    except (ValueError, TypeError):
839        return False
840    return True
841
842
843def _convert_vec(vec1, vec2, num_features=None):
844    if scipy.sparse.issparse(vec1):
845        vec1 = vec1.toarray()
846    if scipy.sparse.issparse(vec2):
847        vec2 = vec2.toarray()  # converted both the vectors to dense in case they were in sparse matrix
848    if isbow(vec1) and isbow(vec2):  # if they are in bag of words format we make it dense
849        if num_features is not None:  # if not None, make as large as the documents drawing from
850            dense1 = sparse2full(vec1, num_features)
851            dense2 = sparse2full(vec2, num_features)
852            return dense1, dense2
853        else:
854            max_len = max(len(vec1), len(vec2))
855            dense1 = sparse2full(vec1, max_len)
856            dense2 = sparse2full(vec2, max_len)
857            return dense1, dense2
858    else:
859        # this conversion is made because if it is not in bow format, it might be a list within a list after conversion
860        # the scipy implementation of Kullback fails in such a case so we pick up only the nested list.
861        if len(vec1) == 1:
862            vec1 = vec1[0]
863        if len(vec2) == 1:
864            vec2 = vec2[0]
865        return vec1, vec2
866
867
868def kullback_leibler(vec1, vec2, num_features=None):
869    """Calculate Kullback-Leibler distance between two probability distributions using `scipy.stats.entropy`.
870
871    Parameters
872    ----------
873    vec1 : {scipy.sparse, numpy.ndarray, list of (int, float)}
874        Distribution vector.
875    vec2 : {scipy.sparse, numpy.ndarray, list of (int, float)}
876        Distribution vector.
877    num_features : int, optional
878        Number of features in the vectors.
879
880    Returns
881    -------
882    float
883        Kullback-Leibler distance between `vec1` and `vec2`.
884        Value in range [0, +∞) where values closer to 0 mean less distance (higher similarity).
885
886    """
887    vec1, vec2 = _convert_vec(vec1, vec2, num_features=num_features)
888    return entropy(vec1, vec2)
889
890
891def jensen_shannon(vec1, vec2, num_features=None):
892    """Calculate Jensen-Shannon distance between two probability distributions using `scipy.stats.entropy`.
893
894    Parameters
895    ----------
896    vec1 : {scipy.sparse, numpy.ndarray, list of (int, float)}
897        Distribution vector.
898    vec2 : {scipy.sparse, numpy.ndarray, list of (int, float)}
899        Distribution vector.
900    num_features : int, optional
901        Number of features in the vectors.
902
903    Returns
904    -------
905    float
906        Jensen-Shannon distance between `vec1` and `vec2`.
907
908    Notes
909    -----
910    This is a symmetric and finite "version" of :func:`gensim.matutils.kullback_leibler`.
911
912    """
913    vec1, vec2 = _convert_vec(vec1, vec2, num_features=num_features)
914    avg_vec = 0.5 * (vec1 + vec2)
915    return 0.5 * (entropy(vec1, avg_vec) + entropy(vec2, avg_vec))
916
917
918def hellinger(vec1, vec2):
919    """Calculate Hellinger distance between two probability distributions.
920
921    Parameters
922    ----------
923    vec1 : {scipy.sparse, numpy.ndarray, list of (int, float)}
924        Distribution vector.
925    vec2 : {scipy.sparse, numpy.ndarray, list of (int, float)}
926        Distribution vector.
927
928    Returns
929    -------
930    float
931        Hellinger distance between `vec1` and `vec2`.
932        Value in range `[0, 1]`, where 0 is min distance (max similarity) and 1 is max distance (min similarity).
933
934    """
935    if scipy.sparse.issparse(vec1):
936        vec1 = vec1.toarray()
937    if scipy.sparse.issparse(vec2):
938        vec2 = vec2.toarray()
939    if isbow(vec1) and isbow(vec2):
940        # if it is a BoW format, instead of converting to dense we use dictionaries to calculate appropriate distance
941        vec1, vec2 = dict(vec1), dict(vec2)
942        indices = set(list(vec1.keys()) + list(vec2.keys()))
943        sim = np.sqrt(
944            0.5 * sum((np.sqrt(vec1.get(index, 0.0)) - np.sqrt(vec2.get(index, 0.0)))**2 for index in indices)
945        )
946        return sim
947    else:
948        sim = np.sqrt(0.5 * ((np.sqrt(vec1) - np.sqrt(vec2))**2).sum())
949        return sim
950
951
952def jaccard(vec1, vec2):
953    """Calculate Jaccard distance between two vectors.
954
955    Parameters
956    ----------
957    vec1 : {scipy.sparse, numpy.ndarray, list of (int, float)}
958        Distribution vector.
959    vec2 : {scipy.sparse, numpy.ndarray, list of (int, float)}
960        Distribution vector.
961
962    Returns
963    -------
964    float
965        Jaccard distance between `vec1` and `vec2`.
966        Value in range `[0, 1]`, where 0 is min distance (max similarity) and 1 is max distance (min similarity).
967
968    """
969
970    # converting from sparse for easier manipulation
971    if scipy.sparse.issparse(vec1):
972        vec1 = vec1.toarray()
973    if scipy.sparse.issparse(vec2):
974        vec2 = vec2.toarray()
975    if isbow(vec1) and isbow(vec2):
976        # if it's in bow format, we use the following definitions:
977        # union = sum of the 'weights' of both the bags
978        # intersection = lowest weight for a particular id; basically the number of common words or items
979        union = sum(weight for id_, weight in vec1) + sum(weight for id_, weight in vec2)
980        vec1, vec2 = dict(vec1), dict(vec2)
981        intersection = 0.0
982        for feature_id, feature_weight in vec1.items():
983            intersection += min(feature_weight, vec2.get(feature_id, 0.0))
984        return 1 - float(intersection) / float(union)
985    else:
986        # if it isn't in bag of words format, we can use sets to calculate intersection and union
987        if isinstance(vec1, np.ndarray):
988            vec1 = vec1.tolist()
989        if isinstance(vec2, np.ndarray):
990            vec2 = vec2.tolist()
991        vec1 = set(vec1)
992        vec2 = set(vec2)
993        intersection = vec1 & vec2
994        union = vec1 | vec2
995        return 1 - float(len(intersection)) / float(len(union))
996
997
998def jaccard_distance(set1, set2):
999    """Calculate Jaccard distance between two sets.
1000
1001    Parameters
1002    ----------
1003    set1 : set
1004        Input set.
1005    set2 : set
1006        Input set.
1007
1008    Returns
1009    -------
1010    float
1011        Jaccard distance between `set1` and `set2`.
1012        Value in range `[0, 1]`, where 0 is min distance (max similarity) and 1 is max distance (min similarity).
1013    """
1014
1015    union_cardinality = len(set1 | set2)
1016    if union_cardinality == 0:  # Both sets are empty
1017        return 1.
1018
1019    return 1. - float(len(set1 & set2)) / float(union_cardinality)
1020
1021
1022try:
1023    # try to load fast, cythonized code if possible
1024    from gensim._matutils import logsumexp, mean_absolute_difference, dirichlet_expectation
1025
1026except ImportError:
1027    def logsumexp(x):
1028        """Log of sum of exponentials.
1029
1030        Parameters
1031        ----------
1032        x : numpy.ndarray
1033            Input 2d matrix.
1034
1035        Returns
1036        -------
1037        float
1038            log of sum of exponentials of elements in `x`.
1039
1040        Warnings
1041        --------
1042        For performance reasons, doesn't support NaNs or 1d, 3d, etc arrays like :func:`scipy.special.logsumexp`.
1043
1044        """
1045        x_max = np.max(x)
1046        x = np.log(np.sum(np.exp(x - x_max)))
1047        x += x_max
1048
1049        return x
1050
1051    def mean_absolute_difference(a, b):
1052        """Mean absolute difference between two arrays.
1053
1054        Parameters
1055        ----------
1056        a : numpy.ndarray
1057            Input 1d array.
1058        b : numpy.ndarray
1059            Input 1d array.
1060
1061        Returns
1062        -------
1063        float
1064            mean(abs(a - b)).
1065
1066        """
1067        return np.mean(np.abs(a - b))
1068
1069    def dirichlet_expectation(alpha):
1070        """Expected value of log(theta) where theta is drawn from a Dirichlet distribution.
1071
1072        Parameters
1073        ----------
1074        alpha : numpy.ndarray
1075            Dirichlet parameter 2d matrix or 1d vector, if 2d - each row is treated as a separate parameter vector.
1076
1077        Returns
1078        -------
1079        numpy.ndarray
1080            Log of expected values, dimension same as `alpha.ndim`.
1081
1082        """
1083        if len(alpha.shape) == 1:
1084            result = psi(alpha) - psi(np.sum(alpha))
1085        else:
1086            result = psi(alpha) - psi(np.sum(alpha, 1))[:, np.newaxis]
1087        return result.astype(alpha.dtype, copy=False)  # keep the same precision as input
1088
1089
1090def qr_destroy(la):
1091    """Get QR decomposition of `la[0]`.
1092
1093    Parameters
1094    ----------
1095    la : list of numpy.ndarray
1096        Run QR decomposition on the first elements of `la`. Must not be empty.
1097
1098    Returns
1099    -------
1100    (numpy.ndarray, numpy.ndarray)
1101        Matrices :math:`Q` and :math:`R`.
1102
1103    Notes
1104    -----
1105    Using this function is less memory intense than calling `scipy.linalg.qr(la[0])`,
1106    because the memory used in `la[0]` is reclaimed earlier. This makes a difference when
1107    decomposing very large arrays, where every memory copy counts.
1108
1109    Warnings
1110    --------
1111    Content of `la` as well as `la[0]` gets destroyed in the process. Again, for memory-effiency reasons.
1112
1113    """
1114    a = np.asfortranarray(la[0])
1115    del la[0], la  # now `a` is the only reference to the input matrix
1116    m, n = a.shape
1117    # perform q, r = QR(a); code hacked out of scipy.linalg.qr
1118    logger.debug("computing QR of %s dense matrix", str(a.shape))
1119    geqrf, = get_lapack_funcs(('geqrf',), (a,))
1120    qr, tau, work, info = geqrf(a, lwork=-1, overwrite_a=True)
1121    qr, tau, work, info = geqrf(a, lwork=work[0], overwrite_a=True)
1122    del a  # free up mem
1123    assert info >= 0
1124    r = triu(qr[:n, :n])
1125    if m < n:  # rare case, #features < #topics
1126        qr = qr[:, :m]  # retains fortran order
1127    gorgqr, = get_lapack_funcs(('orgqr',), (qr,))
1128    q, work, info = gorgqr(qr, tau, lwork=-1, overwrite_a=True)
1129    q, work, info = gorgqr(qr, tau, lwork=work[0], overwrite_a=True)
1130    assert info >= 0, "qr failed"
1131    assert q.flags.f_contiguous
1132    return q, r
1133
1134
1135class MmWriter:
1136    """Store a corpus in `Matrix Market format <https://math.nist.gov/MatrixMarket/formats.html>`_,
1137    using :class:`~gensim.corpora.mmcorpus.MmCorpus`.
1138
1139    Notes
1140    -----
1141    The output is written one document at a time, not the whole matrix at once (unlike e.g. `scipy.io.mmread`).
1142    This allows you to write corpora which are larger than the available RAM.
1143
1144    The output file is created in a single pass through the input corpus, so that the input can be
1145    a once-only stream (generator).
1146
1147    To achieve this, a fake MM header is written first, corpus statistics are collected
1148    during the pass (shape of the matrix, number of non-zeroes), followed by a seek back to the beginning of the file,
1149    rewriting the fake header with the final values.
1150
1151    """
1152    HEADER_LINE = b'%%MatrixMarket matrix coordinate real general\n'  # the only supported MM format
1153
1154    def __init__(self, fname):
1155        """
1156
1157        Parameters
1158        ----------
1159        fname : str
1160            Path to output file.
1161
1162        """
1163        self.fname = fname
1164        if fname.endswith(".gz") or fname.endswith('.bz2'):
1165            raise NotImplementedError("compressed output not supported with MmWriter")
1166        self.fout = utils.open(self.fname, 'wb+')  # open for both reading and writing
1167        self.headers_written = False
1168
1169    def write_headers(self, num_docs, num_terms, num_nnz):
1170        """Write headers to file.
1171
1172        Parameters
1173        ----------
1174        num_docs : int
1175            Number of documents in corpus.
1176        num_terms : int
1177            Number of term in corpus.
1178        num_nnz : int
1179            Number of non-zero elements in corpus.
1180
1181        """
1182        self.fout.write(MmWriter.HEADER_LINE)
1183
1184        if num_nnz < 0:
1185            # we don't know the matrix shape/density yet, so only log a general line
1186            logger.info("saving sparse matrix to %s", self.fname)
1187            self.fout.write(utils.to_utf8(' ' * 50 + '\n'))  # 48 digits must be enough for everybody
1188        else:
1189            logger.info(
1190                "saving sparse %sx%s matrix with %i non-zero entries to %s",
1191                num_docs, num_terms, num_nnz, self.fname
1192            )
1193            self.fout.write(utils.to_utf8('%s %s %s\n' % (num_docs, num_terms, num_nnz)))
1194        self.last_docno = -1
1195        self.headers_written = True
1196
1197    def fake_headers(self, num_docs, num_terms, num_nnz):
1198        """Write "fake" headers to file, to be rewritten once we've scanned the entire corpus.
1199
1200        Parameters
1201        ----------
1202        num_docs : int
1203            Number of documents in corpus.
1204        num_terms : int
1205            Number of term in corpus.
1206        num_nnz : int
1207            Number of non-zero elements in corpus.
1208
1209        """
1210        stats = '%i %i %i' % (num_docs, num_terms, num_nnz)
1211        if len(stats) > 50:
1212            raise ValueError('Invalid stats: matrix too large!')
1213        self.fout.seek(len(MmWriter.HEADER_LINE))
1214        self.fout.write(utils.to_utf8(stats))
1215
1216    def write_vector(self, docno, vector):
1217        """Write a single sparse vector to the file.
1218
1219        Parameters
1220        ----------
1221        docno : int
1222            Number of document.
1223        vector : list of (int, number)
1224            Document in BoW format.
1225
1226        Returns
1227        -------
1228        (int, int)
1229            Max word index in vector and len of vector. If vector is empty, return (-1, 0).
1230
1231        """
1232        assert self.headers_written, "must write Matrix Market file headers before writing data!"
1233        assert self.last_docno < docno, "documents %i and %i not in sequential order!" % (self.last_docno, docno)
1234        vector = sorted((i, w) for i, w in vector if abs(w) > 1e-12)  # ignore near-zero entries
1235        for termid, weight in vector:  # write term ids in sorted order
1236            # +1 because MM format starts counting from 1
1237            self.fout.write(utils.to_utf8("%i %i %s\n" % (docno + 1, termid + 1, weight)))
1238        self.last_docno = docno
1239        return (vector[-1][0], len(vector)) if vector else (-1, 0)
1240
1241    @staticmethod
1242    def write_corpus(fname, corpus, progress_cnt=1000, index=False, num_terms=None, metadata=False):
1243        """Save the corpus to disk in `Matrix Market format <https://math.nist.gov/MatrixMarket/formats.html>`_.
1244
1245        Parameters
1246        ----------
1247        fname : str
1248            Filename of the resulting file.
1249        corpus : iterable of list of (int, number)
1250            Corpus in streamed bag-of-words format.
1251        progress_cnt : int, optional
1252            Print progress for every `progress_cnt` number of documents.
1253        index : bool, optional
1254            Return offsets?
1255        num_terms : int, optional
1256            Number of terms in the corpus. If provided, the `corpus.num_terms` attribute (if any) will be ignored.
1257        metadata : bool, optional
1258            Generate a metadata file?
1259
1260        Returns
1261        -------
1262        offsets : {list of int, None}
1263            List of offsets (if index=True) or nothing.
1264
1265        Notes
1266        -----
1267        Documents are processed one at a time, so the whole corpus is allowed to be larger than the available RAM.
1268
1269        See Also
1270        --------
1271        :func:`gensim.corpora.mmcorpus.MmCorpus.save_corpus`
1272            Save corpus to disk.
1273
1274        """
1275        mw = MmWriter(fname)
1276
1277        # write empty headers to the file (with enough space to be overwritten later)
1278        mw.write_headers(-1, -1, -1)  # will print 50 spaces followed by newline on the stats line
1279
1280        # calculate necessary header info (nnz elements, num terms, num docs) while writing out vectors
1281        _num_terms, num_nnz = 0, 0
1282        docno, poslast = -1, -1
1283        offsets = []
1284        if hasattr(corpus, 'metadata'):
1285            orig_metadata = corpus.metadata
1286            corpus.metadata = metadata
1287            if metadata:
1288                docno2metadata = {}
1289        else:
1290            metadata = False
1291        for docno, doc in enumerate(corpus):
1292            if metadata:
1293                bow, data = doc
1294                docno2metadata[docno] = data
1295            else:
1296                bow = doc
1297            if docno % progress_cnt == 0:
1298                logger.info("PROGRESS: saving document #%i", docno)
1299            if index:
1300                posnow = mw.fout.tell()
1301                if posnow == poslast:
1302                    offsets[-1] = -1
1303                offsets.append(posnow)
1304                poslast = posnow
1305            max_id, veclen = mw.write_vector(docno, bow)
1306            _num_terms = max(_num_terms, 1 + max_id)
1307            num_nnz += veclen
1308        if metadata:
1309            utils.pickle(docno2metadata, fname + '.metadata.cpickle')
1310            corpus.metadata = orig_metadata
1311
1312        num_docs = docno + 1
1313        num_terms = num_terms or _num_terms
1314
1315        if num_docs * num_terms != 0:
1316            logger.info(
1317                "saved %ix%i matrix, density=%.3f%% (%i/%i)",
1318                num_docs, num_terms, 100.0 * num_nnz / (num_docs * num_terms), num_nnz, num_docs * num_terms
1319            )
1320
1321        # now write proper headers, by seeking and overwriting the spaces written earlier
1322        mw.fake_headers(num_docs, num_terms, num_nnz)
1323
1324        mw.close()
1325        if index:
1326            return offsets
1327
1328    def __del__(self):
1329        """Close `self.fout` file. Alias for :meth:`~gensim.matutils.MmWriter.close`.
1330
1331        Warnings
1332        --------
1333        Closing the file explicitly via the close() method is preferred and safer.
1334
1335        """
1336        self.close()  # does nothing if called twice (on an already closed file), so no worries
1337
1338    def close(self):
1339        """Close `self.fout` file."""
1340        logger.debug("closing %s", self.fname)
1341        if hasattr(self, 'fout'):
1342            self.fout.close()
1343
1344
1345try:
1346    from gensim.corpora._mmreader import MmReader  # noqa: F401
1347except ImportError:
1348    raise utils.NO_CYTHON
1349