1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3#
4# Copyright (C) 2011 Radim Rehurek <radimrehurek@seznam.cz>
5# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
6
"""Optimized `Latent Dirichlet Allocation (LDA) <https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation>`_ in Python.
8
9For a faster implementation of LDA (parallelized for multicore machines), see also :mod:`gensim.models.ldamulticore`.
10
11This module allows both LDA model estimation from a training corpus and inference of topic
12distribution on new, unseen documents. The model can also be updated with new documents
13for online training.
14
The core estimation code is based on the `onlineldavb.py script
<https://github.com/blei-lab/onlineldavb/blob/master/onlineldavb.py>`_, by `Hoffman, Blei, Bach:
Online Learning for Latent Dirichlet Allocation, NIPS 2010
<https://www.di.ens.fr/~fbach/mdhnips2010.pdf>`_.
19
20The algorithm:
21
22#. Is **streamed**: training documents may come in sequentially, no random access required.
23#. Runs in **constant memory** w.r.t. the number of documents: size of the training corpus does not affect memory
24   footprint, can process corpora larger than RAM.
25#. Is **distributed**: makes use of a cluster of machines, if available, to speed up model estimation.
26
27
28Usage examples
29--------------
30
31Train an LDA model using a Gensim corpus
32
33.. sourcecode:: pycon
34
    >>> from gensim.models import LdaModel
    >>> from gensim.test.utils import common_texts
    >>> from gensim.corpora.dictionary import Dictionary
37    >>>
38    >>> # Create a corpus from a list of texts
39    >>> common_dictionary = Dictionary(common_texts)
40    >>> common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
41    >>>
42    >>> # Train the model on the corpus.
43    >>> lda = LdaModel(common_corpus, num_topics=10)
44
45Save a model to disk, or reload a pre-trained model
46
47.. sourcecode:: pycon
48
49    >>> from gensim.test.utils import datapath
50    >>>
51    >>> # Save model to disk.
52    >>> temp_file = datapath("model")
53    >>> lda.save(temp_file)
54    >>>
55    >>> # Load a potentially pretrained model from disk.
56    >>> lda = LdaModel.load(temp_file)
57
Query the model using new, unseen documents
59
60.. sourcecode:: pycon
61
62    >>> # Create a new corpus, made of previously unseen documents.
63    >>> other_texts = [
64    ...     ['computer', 'time', 'graph'],
65    ...     ['survey', 'response', 'eps'],
66    ...     ['human', 'system', 'computer']
67    ... ]
68    >>> other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]
69    >>>
70    >>> unseen_doc = other_corpus[0]
71    >>> vector = lda[unseen_doc]  # get topic probability distribution for a document
72
73Update the model by incrementally training on the new corpus
74
75.. sourcecode:: pycon
76
77    >>> lda.update(other_corpus)
78    >>> vector = lda[unseen_doc]
79
80A lot of parameters can be tuned to optimize training for your specific case
81
82.. sourcecode:: pycon
83
84    >>> lda = LdaModel(common_corpus, num_topics=50, alpha='auto', eval_every=5)  # learn asymmetric alpha from data
85
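The trained topics can then be inspected directly. For instance (a small illustrative sketch; the exact
words and weights will differ from run to run):

.. sourcecode:: pycon

    >>> # Human-readable summaries of a few topics, and the topic mixture of one document.
    >>> topic_summaries = lda.print_topics(num_topics=5, num_words=3)
    >>> doc_topics = lda.get_document_topics(common_corpus[0])
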
86"""
87
88import logging
89import numbers
90import os
91import time
92from collections import defaultdict
93
94import numpy as np
95from scipy.special import gammaln, psi  # gamma function utils
96from scipy.special import polygamma
97
98from gensim import interfaces, utils, matutils
99from gensim.matutils import (
100    kullback_leibler, hellinger, jaccard_distance, jensen_shannon,
101    dirichlet_expectation, logsumexp, mean_absolute_difference,
102)
103from gensim.models import basemodel, CoherenceModel
104from gensim.models.callbacks import Callback
105
106
107logger = logging.getLogger(__name__)
108
109
110def update_dir_prior(prior, N, logphat, rho):
111    """Update a given prior using Newton's method, described in
112    `J. Huang: "Maximum Likelihood Estimation of Dirichlet Distribution Parameters"
113    <http://jonathan-huang.org/research/dirichlet/dirichlet.pdf>`_.
114
115    Parameters
116    ----------
117    prior : list of float
118        The prior for each possible outcome at the previous iteration (to be updated).
119    N : int
120        Number of observations.
121    logphat : list of float
122        Log probabilities for the current estimation, also called "observed sufficient statistics".
123    rho : float
124        Learning rate.
125
126    Returns
127    -------
128    list of float
129        The updated prior.
130
131    """
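    # Gradient of the Dirichlet log-likelihood with respect to the prior
    # (psi is the digamma function), following Huang's derivation.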
132    gradf = N * (psi(np.sum(prior)) - psi(prior) + logphat)
133
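    # The Hessian is a diagonal matrix plus a rank-one term: H = diag(q) + c * 1 1^T,
    # so the Newton direction can be computed in O(K) without forming the full matrix;
    # b is the correction coming from the rank-one part.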
134    c = N * polygamma(1, np.sum(prior))
135    q = -N * polygamma(1, prior)
136
137    b = np.sum(gradf / q) / (1 / c + np.sum(1 / q))
138
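    # Newton direction dprior = -H^{-1} gradf; the learning rate rho damps the step below.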
139    dprior = -(gradf - b) / q
140
141    updated_prior = rho * dprior + prior
142    if all(updated_prior > 0):
143        prior = updated_prior
144    else:
145        logger.warning("updated prior is not positive")
146    return prior
147
148
149class LdaState(utils.SaveLoad):
150    """Encapsulate information for distributed computation of :class:`~gensim.models.ldamodel.LdaModel` objects.
151
152    Objects of this class are sent over the network, so try to keep them lean to
153    reduce traffic.
154
155    """
156    def __init__(self, eta, shape, dtype=np.float32):
157        """
158
159        Parameters
160        ----------
161        eta : numpy.ndarray
162            The prior probabilities assigned to each term.
163        shape : tuple of (int, int)
164            Shape of the sufficient statistics: (number of topics to be found, number of terms in the vocabulary).
165        dtype : type
166            Overrides the numpy array default types.
167
168        """
169        self.eta = eta.astype(dtype, copy=False)
170        self.sstats = np.zeros(shape, dtype=dtype)
171        self.numdocs = 0
172        self.dtype = dtype
173
174    def reset(self):
175        """Prepare the state for a new EM iteration (reset sufficient stats)."""
176        self.sstats[:] = 0.0
177        self.numdocs = 0
178
179    def merge(self, other):
180        """Merge the result of an E step from one node with that of another node (summing up sufficient statistics).
181
182        The merging is trivial and after merging all cluster nodes, we have the
183        exact same result as if the computation was run on a single node (no
184        approximation).
185
186        Parameters
187        ----------
188        other : :class:`~gensim.models.ldamodel.LdaState`
189            The state object with which the current one will be merged.
190
191        """
192        assert other is not None
193        self.sstats += other.sstats
194        self.numdocs += other.numdocs
195
196    def blend(self, rhot, other, targetsize=None):
197        """Merge the current state with another one using a weighted average for the sufficient statistics.
198
199        The number of documents is stretched in both state objects, so that they are of comparable magnitude.
200        This procedure corresponds to the stochastic gradient update from
        `Hoffman et al.: "Online Learning for Latent Dirichlet Allocation"
202        <https://www.di.ens.fr/~fbach/mdhnips2010.pdf>`_, see equations (5) and (9).
203
204        Parameters
205        ----------
206        rhot : float
207            Weight of the `other` state in the computed average. A value of 0.0 means that `other`
208            is completely ignored. A value of 1.0 means `self` is completely ignored.
209        other : :class:`~gensim.models.ldamodel.LdaState`
210            The state object with which the current one will be merged.
211        targetsize : int, optional
212            The number of documents to stretch both states to.
213
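        Examples
        --------
        A toy sketch with two freshly initialized states (shapes and values here are arbitrary):

        .. sourcecode:: pycon

            >>> import numpy as np
            >>>
            >>> eta = np.full(4, 0.1, dtype=np.float32)
            >>> this, other = LdaState(eta, (2, 4)), LdaState(eta, (2, 4))
            >>> other.sstats += 1.0
            >>> other.numdocs = 10
            >>> this.blend(0.5, other, targetsize=10)  # equal-weight average, stretched to 10 documents
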
214        """
215        assert other is not None
216        if targetsize is None:
217            targetsize = self.numdocs
218
219        # stretch the current model's expected n*phi counts to target size
220        if self.numdocs == 0 or targetsize == self.numdocs:
221            scale = 1.0
222        else:
223            scale = 1.0 * targetsize / self.numdocs
224        self.sstats *= (1.0 - rhot) * scale
225
226        # stretch the incoming n*phi counts to target size
227        if other.numdocs == 0 or targetsize == other.numdocs:
228            scale = 1.0
229        else:
230            logger.info("merging changes from %i documents into a model of %i documents", other.numdocs, targetsize)
231            scale = 1.0 * targetsize / other.numdocs
232        self.sstats += rhot * scale * other.sstats
233
234        self.numdocs = targetsize
235
236    def blend2(self, rhot, other, targetsize=None):
237        """Merge the current state with another one using a weighted sum for the sufficient statistics.
238
239        In contrast to :meth:`~gensim.models.ldamodel.LdaState.blend`, the sufficient statistics are not scaled
240        prior to aggregation.
241
242        Parameters
243        ----------
244        rhot : float
245            Unused.
246        other : :class:`~gensim.models.ldamodel.LdaState`
247            The state object with which the current one will be merged.
248        targetsize : int, optional
249            The number of documents to stretch both states to.
250
251        """
252        assert other is not None
253        if targetsize is None:
254            targetsize = self.numdocs
255
256        # merge the two matrices by summing
257        self.sstats += other.sstats
258        self.numdocs = targetsize
259
260    def get_lambda(self):
261        """Get the parameters of the posterior over the topics, also referred to as "the topics".
262
263        Returns
264        -------
265        numpy.ndarray
266            Parameters of the posterior probability over topics.
267
268        """
269        return self.eta + self.sstats
270
271    def get_Elogbeta(self):
272        """Get the log (posterior) probabilities for each topic.
273
274        Returns
275        -------
276        numpy.ndarray
277            Posterior probabilities for each topic.
278        """
279        return dirichlet_expectation(self.get_lambda())
280
281    @classmethod
282    def load(cls, fname, *args, **kwargs):
283        """Load a previously stored state from disk.
284
285        Overrides :class:`~gensim.utils.SaveLoad.load` by enforcing the `dtype` parameter
286        to ensure backwards compatibility.
287
288        Parameters
289        ----------
290        fname : str
291            Path to file that contains the needed object.
        args : object
            Positional parameters to be propagated to :meth:`~gensim.utils.SaveLoad.load`.
        kwargs : object
            Key-word parameters to be propagated to :meth:`~gensim.utils.SaveLoad.load`.
296
297        Returns
298        -------
299        :class:`~gensim.models.ldamodel.LdaState`
300            The state loaded from the given file.
301
302        """
303        result = super(LdaState, cls).load(fname, *args, **kwargs)
304
305        # dtype could be absent in old models
306        if not hasattr(result, 'dtype'):
307            result.dtype = np.float64  # float64 was implicitly used before (because it's the default in numpy)
            logger.info("dtype was not set in saved %s file %s, assuming np.float64", result.__class__.__name__, fname)
309
310        return result
311
312
313class LdaModel(interfaces.TransformationABC, basemodel.BaseTopicModel):
314    """Train and use Online Latent Dirichlet Allocation (OLDA) models as presented in
    `Hoffman et al.: "Online Learning for Latent Dirichlet Allocation" <https://www.di.ens.fr/~fbach/mdhnips2010.pdf>`_.
316
317    Examples
    --------
319    Initialize a model using a Gensim corpus
320
321    .. sourcecode:: pycon
322
323        >>> from gensim.test.utils import common_corpus
324        >>>
325        >>> lda = LdaModel(common_corpus, num_topics=10)
326
327    You can then infer topic distributions on new, unseen documents.
328
329    .. sourcecode:: pycon
330
331        >>> doc_bow = [(1, 0.3), (2, 0.1), (0, 0.09)]
332        >>> doc_lda = lda[doc_bow]
333
334    The model can be updated (trained) with new documents.
335
336    .. sourcecode:: pycon
337
        >>> # In practice, this would be a corpus different from the initial training
        >>> # corpus; we reuse the same documents here for simplicity.
339        >>> other_corpus = common_corpus
340        >>>
341        >>> lda.update(other_corpus)
342
    Model persistence is achieved through :meth:`~gensim.models.ldamodel.LdaModel.load` and
344    :meth:`~gensim.models.ldamodel.LdaModel.save` methods.
345
346    """
347    def __init__(self, corpus=None, num_topics=100, id2word=None,
348                 distributed=False, chunksize=2000, passes=1, update_every=1,
349                 alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10,
350                 iterations=50, gamma_threshold=0.001, minimum_probability=0.01,
351                 random_state=None, ns_conf=None, minimum_phi_value=0.01,
352                 per_word_topics=False, callbacks=None, dtype=np.float32):
353        """
354
355        Parameters
356        ----------
357        corpus : iterable of list of (int, float), optional
358            Stream of document vectors or sparse matrix of shape (`num_documents`, `num_terms`).
359            If you have a CSC in-memory matrix, you can convert it to a
360            streamed corpus with the help of gensim.matutils.Sparse2Corpus.
361            If not given, the model is left untrained (presumably because you want to call
362            :meth:`~gensim.models.ldamodel.LdaModel.update` manually).
363        num_topics : int, optional
364            The number of requested latent topics to be extracted from the training corpus.
365        id2word : {dict of (int, str), :class:`gensim.corpora.dictionary.Dictionary`}
366            Mapping from word IDs to words. It is used to determine the vocabulary size, as well as for
367            debugging and topic printing.
368        distributed : bool, optional
369            Whether distributed computing should be used to accelerate training.
370        chunksize :  int, optional
371            Number of documents to be used in each training chunk.
372        passes : int, optional
373            Number of passes through the corpus during training.
        update_every : int, optional
            Number of chunks to accumulate before each model update (each worker processes `update_every`
            chunks in distributed mode). Set to 0 for batch learning, or > 0 for online (mini-batch) learning.
        alpha : {numpy.ndarray, str}, optional
            Can be set to a 1D array of length equal to the number of expected topics that expresses
            our a-priori belief for each topic's probability (see the example at the end of this docstring).
            Alternatively, default prior selecting strategies can be employed by supplying a string:

                * 'symmetric': Default; uses a fixed symmetric prior of `1.0 / num_topics` per topic,
                * 'asymmetric': Uses a fixed normalized asymmetric prior of `1.0 / (topic_index + sqrt(num_topics))`,
                * 'auto': Learns an asymmetric prior from the corpus (not available if `distributed==True`).
385        eta : {float, np.array, str}, optional
386            A-priori belief on word probability, this can be:
387
388                * scalar for a symmetric prior over topic/word probability,
389                * vector of length num_words to denote an asymmetric user defined probability for each word,
390                * matrix of shape (num_topics, num_words) to assign a probability for each word-topic combination,
391                * the string 'auto' to learn the asymmetric prior from the data.
        decay : float, optional
            A number in (0.5, 1] that weights what percentage of the previous lambda value is forgotten
            when each new document is examined. Corresponds to Kappa from
            `Matthew D. Hoffman, David M. Blei, Francis Bach:
            "Online Learning for Latent Dirichlet Allocation NIPS'10" <https://www.di.ens.fr/~fbach/mdhnips2010.pdf>`_.
        offset : float, optional
            Hyper-parameter that controls how much we slow down the first few iterations.
            Corresponds to Tau_0 from `Matthew D. Hoffman, David M. Blei, Francis Bach:
            "Online Learning for Latent Dirichlet Allocation NIPS'10" <https://www.di.ens.fr/~fbach/mdhnips2010.pdf>`_.
        eval_every : int, optional
            Log perplexity is estimated every `eval_every` model updates. Setting this to 1 slows down training by ~2x.
        iterations : int, optional
            Maximum number of E-step iterations when inferring the topic distribution of a document.
405        gamma_threshold : float, optional
406            Minimum change in the value of the gamma parameters to continue iterating.
407        minimum_probability : float, optional
408            Topics with a probability lower than this threshold will be filtered out.
409        random_state : {np.random.RandomState, int}, optional
410            Either a randomState object or a seed to generate one. Useful for reproducibility.
411        ns_conf : dict of (str, object), optional
            Keyword parameters propagated to :func:`gensim.utils.getNS` to get a Pyro4 nameserver.
413            Only used if `distributed` is set to True.
414        minimum_phi_value : float, optional
            If `per_word_topics` is True, this represents a lower bound on the term probabilities.
416        per_word_topics : bool
417            If True, the model also computes a list of topics, sorted in descending order of most likely topics for
418            each word, along with their phi values multiplied by the feature length (i.e. word count).
419        callbacks : list of :class:`~gensim.models.callbacks.Callback`
420            Metric callbacks to log and visualize evaluation metrics of the model during training.
421        dtype : {numpy.float16, numpy.float32, numpy.float64}, optional
422            Data-type to use during calculations inside model. All inputs are also converted.
423
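        Examples
        --------
        Passing an explicit prior instead of a string strategy (a small sketch; the values are arbitrary):

        .. sourcecode:: pycon

            >>> import numpy as np
            >>> from gensim.test.utils import common_corpus
            >>>
            >>> custom_alpha = np.full(10, 0.05, dtype=np.float32)  # one concentration parameter per topic
            >>> lda = LdaModel(common_corpus, num_topics=10, alpha=custom_alpha, eta=0.01)
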
424        """
425        self.dtype = np.finfo(dtype).dtype
426
427        # store user-supplied parameters
428        self.id2word = id2word
429        if corpus is None and self.id2word is None:
430            raise ValueError(
431                'at least one of corpus/id2word must be specified, to establish input space dimensionality'
432            )
433
434        if self.id2word is None:
435            logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
436            self.id2word = utils.dict_from_corpus(corpus)
437            self.num_terms = len(self.id2word)
438        elif len(self.id2word) > 0:
439            self.num_terms = 1 + max(self.id2word.keys())
440        else:
441            self.num_terms = 0
442
443        if self.num_terms == 0:
444            raise ValueError("cannot compute LDA over an empty collection (no terms)")
445
446        self.distributed = bool(distributed)
447        self.num_topics = int(num_topics)
448        self.chunksize = chunksize
449        self.decay = decay
450        self.offset = offset
451        self.minimum_probability = minimum_probability
452        self.num_updates = 0
453
454        self.passes = passes
455        self.update_every = update_every
456        self.eval_every = eval_every
457        self.minimum_phi_value = minimum_phi_value
458        self.per_word_topics = per_word_topics
459        self.callbacks = callbacks
460
461        self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')
462
463        assert self.alpha.shape == (self.num_topics,), \
464            "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics)
465
466        if isinstance(eta, str):
467            if eta == 'asymmetric':
468                raise ValueError("The 'asymmetric' option cannot be used for eta")
469
470        self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta')
471
472        self.random_state = utils.get_random_state(random_state)
473
474        assert self.eta.shape == (self.num_terms,) or self.eta.shape == (self.num_topics, self.num_terms), (
            "Invalid eta shape. Got shape %s, but expected (%d,) or (%d, %d)" %
476            (str(self.eta.shape), self.num_terms, self.num_topics, self.num_terms))
477
478        # VB constants
479        self.iterations = iterations
480        self.gamma_threshold = gamma_threshold
481
482        # set up distributed environment if necessary
483        if not distributed:
484            logger.info("using serial LDA version on this node")
485            self.dispatcher = None
486            self.numworkers = 1
487        else:
488            if self.optimize_alpha:
489                raise NotImplementedError("auto-optimizing alpha not implemented in distributed LDA")
490            # set up distributed version
491            try:
492                import Pyro4
493                if ns_conf is None:
494                    ns_conf = {}
495
496                with utils.getNS(**ns_conf) as ns:
497                    from gensim.models.lda_dispatcher import LDA_DISPATCHER_PREFIX
498                    self.dispatcher = Pyro4.Proxy(ns.list(prefix=LDA_DISPATCHER_PREFIX)[LDA_DISPATCHER_PREFIX])
                    logger.debug("looking for dispatcher at %s", str(self.dispatcher._pyroUri))
500                    self.dispatcher.initialize(
501                        id2word=self.id2word, num_topics=self.num_topics, chunksize=chunksize,
502                        alpha=alpha, eta=eta, distributed=False
503                    )
504                    self.numworkers = len(self.dispatcher.getworkers())
505                    logger.info("using distributed version with %i workers", self.numworkers)
506            except Exception as err:
507                logger.error("failed to initialize distributed LDA (%s)", err)
508                raise RuntimeError("failed to initialize distributed LDA (%s)" % err)
509
510        # Initialize the variational distribution q(beta|lambda)
511        self.state = LdaState(self.eta, (self.num_topics, self.num_terms), dtype=self.dtype)
512        self.state.sstats[...] = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
513        self.expElogbeta = np.exp(dirichlet_expectation(self.state.sstats))
514
515        # Check that we haven't accidentally fallen back to np.float64
516        assert self.eta.dtype == self.dtype
517        assert self.expElogbeta.dtype == self.dtype
518
519        # if a training corpus was provided, start estimating the model right away
520        if corpus is not None:
521            use_numpy = self.dispatcher is not None
522            start = time.time()
523            self.update(corpus, chunks_as_numpy=use_numpy)
524            self.add_lifecycle_event(
525                "created",
526                msg=f"trained {self} in {time.time() - start:.2f}s",
527            )
528
529    def init_dir_prior(self, prior, name):
530        """Initialize priors for the Dirichlet distribution.
531
532        Parameters
533        ----------
534        prior : {str, list of float, numpy.ndarray of float, float}
535            A-priori belief on word probability. If `name` == 'eta' then the prior can be:
536
537                * scalar for a symmetric prior over topic/word probability,
538                * vector of length num_words to denote an asymmetric user defined probability for each word,
539                * matrix of shape (num_topics, num_words) to assign a probability for each word-topic combination,
540                * the string 'auto' to learn the asymmetric prior from the data.
541
542            If `name` == 'alpha', then the prior can be:
543
544                * an 1D array of length equal to the number of expected topics,
545                * 'symmetric': Uses a fixed symmetric prior per topic,
546                * 'asymmetric': Uses a fixed normalized asymmetric prior of `1.0 / (topic_index + sqrt(num_topics))`,
547                * 'auto': Learns an asymmetric prior from the corpus.
548        name : {'alpha', 'eta'}
            Whether the `prior` is parameterized by the alpha vector (1 parameter per topic)
            or by the eta (1 parameter per unique term in the vocabulary).

        Returns
        -------
        init_prior : numpy.ndarray
            Initialized Dirichlet prior: shape (`self.num_topics`,) for 'alpha', and (`self.num_terms`,)
            (or (`self.num_topics`, `self.num_terms`) if a matrix was passed) for 'eta'.
        is_auto : bool
            Flag indicating whether the prior should be learned from data (i.e. whether 'auto' was passed).

552        """
553        if prior is None:
554            prior = 'symmetric'
555
556        if name == 'alpha':
557            prior_shape = self.num_topics
558        elif name == 'eta':
559            prior_shape = self.num_terms
560        else:
561            raise ValueError("'name' must be 'alpha' or 'eta'")
562
563        is_auto = False
564
565        if isinstance(prior, str):
566            if prior == 'symmetric':
567                logger.info("using symmetric %s at %s", name, 1.0 / self.num_topics)
568                init_prior = np.fromiter(
569                    (1.0 / self.num_topics for i in range(prior_shape)),
570                    dtype=self.dtype, count=prior_shape,
571                )
572            elif prior == 'asymmetric':
573                init_prior = np.fromiter(
574                    (1.0 / (i + np.sqrt(prior_shape)) for i in range(prior_shape)),
575                    dtype=self.dtype, count=prior_shape,
576                )
577                init_prior /= init_prior.sum()
578                logger.info("using asymmetric %s %s", name, list(init_prior))
579            elif prior == 'auto':
580                is_auto = True
                init_prior = np.fromiter(
                    (1.0 / self.num_topics for i in range(prior_shape)),
                    dtype=self.dtype, count=prior_shape,
                )
583                if name == 'alpha':
584                    logger.info("using autotuned %s, starting with %s", name, list(init_prior))
585            else:
586                raise ValueError("Unable to determine proper %s value given '%s'" % (name, prior))
587        elif isinstance(prior, list):
588            init_prior = np.asarray(prior, dtype=self.dtype)
589        elif isinstance(prior, np.ndarray):
590            init_prior = prior.astype(self.dtype, copy=False)
591        elif isinstance(prior, (np.number, numbers.Real)):
592            init_prior = np.fromiter((prior for i in range(prior_shape)), dtype=self.dtype)
593        else:
594            raise ValueError("%s must be either a np array of scalars, list of scalars, or scalar" % name)
595
596        return init_prior, is_auto
597
598    def __str__(self):
599        """Get a string representation of the current object.
600
601        Returns
602        -------
603        str
604            Human readable representation of the most important model parameters.
605
606        """
607        return "LdaModel(num_terms=%s, num_topics=%s, decay=%s, chunksize=%s)" % (
608            self.num_terms, self.num_topics, self.decay, self.chunksize
609        )
610
611    def sync_state(self, current_Elogbeta=None):
        """Propagate the state's topic probabilities to the inner object's attribute.
613
614        Parameters
615        ----------
        current_Elogbeta : numpy.ndarray, optional
            Posterior probabilities for each topic.
            If omitted, it will be fetched from the model's state.
619
620        """
621        if current_Elogbeta is None:
622            current_Elogbeta = self.state.get_Elogbeta()
623        self.expElogbeta = np.exp(current_Elogbeta)
624        assert self.expElogbeta.dtype == self.dtype
625
626    def clear(self):
627        """Clear the model's state to free some memory. Used in the distributed implementation."""
628        self.state = None
629        self.Elogbeta = None
630
631    def inference(self, chunk, collect_sstats=False):
632        """Given a chunk of sparse document vectors, estimate gamma (parameters controlling the topic weights)
633        for each document in the chunk.
634
        This function does not modify the model. The whole input chunk of documents is assumed to fit in RAM;
        chunking of a large corpus must be done earlier in the pipeline. It avoids computing the `phi` variational
        parameter directly, using the optimization presented in
        `Lee, Seung: "Algorithms for non-negative matrix factorization"
        <https://papers.nips.cc/paper/1861-algorithms-for-non-negative-matrix-factorization.pdf>`_.
640
641        Parameters
642        ----------
643        chunk : list of list of (int, float)
644            The corpus chunk on which the inference step will be performed.
645        collect_sstats : bool, optional
646            If set to True, also collect (and return) sufficient statistics needed to update the model's topic-word
647            distributions.
648
649        Returns
650        -------
        (numpy.ndarray, {numpy.ndarray, None})
            The first element is always returned: the gamma matrix of inferred topic weights, one row per document
            in the chunk. The second element is only returned if `collect_sstats` == True and corresponds to the
            sufficient statistics for the M step.
654
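        Examples
        --------
        A minimal sketch of a single inference call, using the bundled test corpus as the chunk:

        .. sourcecode:: pycon

            >>> from gensim.test.utils import common_corpus
            >>>
            >>> lda = LdaModel(common_corpus, num_topics=3)
            >>> gamma, sstats = lda.inference(common_corpus, collect_sstats=True)
            >>> num_docs, num_topics = gamma.shape  # one topic-weight vector per input document
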
655        """
656        try:
657            len(chunk)
658        except TypeError:
659            # convert iterators/generators to plain list, so we have len() etc.
660            chunk = list(chunk)
661        if len(chunk) > 1:
662            logger.debug("performing inference on a chunk of %i documents", len(chunk))
663
664        # Initialize the variational distribution q(theta|gamma) for the chunk
665        gamma = self.random_state.gamma(100., 1. / 100., (len(chunk), self.num_topics)).astype(self.dtype, copy=False)
666        Elogtheta = dirichlet_expectation(gamma)
667        expElogtheta = np.exp(Elogtheta)
668
669        assert Elogtheta.dtype == self.dtype
670        assert expElogtheta.dtype == self.dtype
671
672        if collect_sstats:
673            sstats = np.zeros_like(self.expElogbeta, dtype=self.dtype)
674        else:
675            sstats = None
676        converged = 0
677
678        # Now, for each document d update that document's gamma and phi
679        # Inference code copied from Hoffman's `onlineldavb.py` (esp. the
680        # Lee&Seung trick which speeds things up by an order of magnitude, compared
681        # to Blei's original LDA-C code, cool!).
682        integer_types = (int, np.integer,)
683        epsilon = np.finfo(self.dtype).eps
684        for d, doc in enumerate(chunk):
685            if len(doc) > 0 and not isinstance(doc[0][0], integer_types):
686                # make sure the term IDs are ints, otherwise np will get upset
687                ids = [int(idx) for idx, _ in doc]
688            else:
689                ids = [idx for idx, _ in doc]
690            cts = np.fromiter((cnt for _, cnt in doc), dtype=self.dtype, count=len(doc))
691            gammad = gamma[d, :]
692            Elogthetad = Elogtheta[d, :]
693            expElogthetad = expElogtheta[d, :]
694            expElogbetad = self.expElogbeta[:, ids]
695
696            # The optimal phi_{dwk} is proportional to expElogthetad_k * expElogbetad_w.
697            # phinorm is the normalizer.
698            # TODO treat zeros explicitly, instead of adding epsilon?
699            phinorm = np.dot(expElogthetad, expElogbetad) + epsilon
700
701            # Iterate between gamma and phi until convergence
702            for _ in range(self.iterations):
703                lastgamma = gammad
704                # We represent phi implicitly to save memory and time.
705                # Substituting the value of the optimal phi back into
706                # the update for gamma gives this update. Cf. Lee&Seung 2001.
707                gammad = self.alpha + expElogthetad * np.dot(cts / phinorm, expElogbetad.T)
708                Elogthetad = dirichlet_expectation(gammad)
709                expElogthetad = np.exp(Elogthetad)
710                phinorm = np.dot(expElogthetad, expElogbetad) + epsilon
711                # If gamma hasn't changed much, we're done.
712                meanchange = mean_absolute_difference(gammad, lastgamma)
713                if meanchange < self.gamma_threshold:
714                    converged += 1
715                    break
716            gamma[d, :] = gammad
717            assert gammad.dtype == self.dtype
718            if collect_sstats:
719                # Contribution of document d to the expected sufficient
720                # statistics for the M step.
721                sstats[:, ids] += np.outer(expElogthetad.T, cts / phinorm)
722
723        if len(chunk) > 1:
724            logger.debug("%i/%i documents converged within %i iterations", converged, len(chunk), self.iterations)
725
726        if collect_sstats:
727            # This step finishes computing the sufficient statistics for the
728            # M step, so that
729            # sstats[k, w] = \sum_d n_{dw} * phi_{dwk}
730            # = \sum_d n_{dw} * exp{Elogtheta_{dk} + Elogbeta_{kw}} / phinorm_{dw}.
731            sstats *= self.expElogbeta
732            assert sstats.dtype == self.dtype
733
734        assert gamma.dtype == self.dtype
735        return gamma, sstats
736
737    def do_estep(self, chunk, state=None):
738        """Perform inference on a chunk of documents, and accumulate the collected sufficient statistics.
739
740        Parameters
741        ----------
742        chunk : list of list of (int, float)
743            The corpus chunk on which the inference step will be performed.
744        state : :class:`~gensim.models.ldamodel.LdaState`, optional
745            The state to be updated with the newly accumulated sufficient statistics. If none, the models
746            `self.state` is updated.
747
748        Returns
749        -------
750        numpy.ndarray
751            Gamma parameters controlling the topic weights, shape (`len(chunk)`, `self.num_topics`).
752
753        """
754        if state is None:
755            state = self.state
756        gamma, sstats = self.inference(chunk, collect_sstats=True)
757        state.sstats += sstats
758        state.numdocs += gamma.shape[0]  # avoids calling len(chunk) on a generator
759        assert gamma.dtype == self.dtype
760        return gamma
761
762    def update_alpha(self, gammat, rho):
763        """Update parameters for the Dirichlet prior on the per-document topic weights.
764
765        Parameters
766        ----------
767        gammat : numpy.ndarray
768            Previous topic weight parameters.
769        rho : float
770            Learning rate.
771
772        Returns
773        -------
774        numpy.ndarray
            The updated alpha parameters.
776
777        """
778        N = float(len(gammat))
779        logphat = sum(dirichlet_expectation(gamma) for gamma in gammat) / N
780        assert logphat.dtype == self.dtype
781
782        self.alpha = update_dir_prior(self.alpha, N, logphat, rho)
783        logger.info("optimized alpha %s", list(self.alpha))
784
785        assert self.alpha.dtype == self.dtype
786        return self.alpha
787
788    def update_eta(self, lambdat, rho):
789        """Update parameters for the Dirichlet prior on the per-topic word weights.
790
791        Parameters
792        ----------
793        lambdat : numpy.ndarray
794            Previous lambda parameters.
795        rho : float
796            Learning rate.
797
798        Returns
799        -------
800        numpy.ndarray
801            The updated eta parameters.
802
803        """
804        N = float(lambdat.shape[0])
805        logphat = (sum(dirichlet_expectation(lambda_) for lambda_ in lambdat) / N).reshape((self.num_terms,))
806        assert logphat.dtype == self.dtype
807
808        self.eta = update_dir_prior(self.eta, N, logphat, rho)
809
810        assert self.eta.dtype == self.dtype
811        return self.eta
812
813    def log_perplexity(self, chunk, total_docs=None):
814        """Calculate and return per-word likelihood bound, using a chunk of documents as evaluation corpus.
815
        Also output the calculated statistics, including the perplexity=2^(-bound), to the log at INFO level.
817
818        Parameters
819        ----------
820        chunk : list of list of (int, float)
821            The corpus chunk on which the inference step will be performed.
822        total_docs : int, optional
823            Number of docs used for evaluation of the perplexity.
824
825        Returns
826        -------
        float
            The per-word likelihood bound; the perplexity logged at INFO level is computed as `2 ** (-bound)`.
829
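        Examples
        --------
        A short sketch (the value fluctuates between runs because training is stochastic):

        .. sourcecode:: pycon

            >>> from gensim.test.utils import common_corpus
            >>>
            >>> lda = LdaModel(common_corpus, num_topics=3)
            >>> per_word_bound = lda.log_perplexity(common_corpus)
            >>> perplexity = 2 ** (-per_word_bound)  # the perplexity estimate reported in the log
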
830        """
831        if total_docs is None:
832            total_docs = len(chunk)
833        corpus_words = sum(cnt for document in chunk for _, cnt in document)
834        subsample_ratio = 1.0 * total_docs / len(chunk)
835        perwordbound = self.bound(chunk, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words)
836        logger.info(
837            "%.3f per-word bound, %.1f perplexity estimate based on a held-out corpus of %i documents with %i words",
838            perwordbound, np.exp2(-perwordbound), len(chunk), corpus_words
839        )
840        return perwordbound
841
842    def update(self, corpus, chunksize=None, decay=None, offset=None,
843               passes=None, update_every=None, eval_every=None, iterations=None,
844               gamma_threshold=None, chunks_as_numpy=False):
845        """Train the model with new documents, by EM-iterating over the corpus until the topics converge, or until
846        the maximum number of allowed iterations is reached. `corpus` must be an iterable.
847
848        In distributed mode, the E step is distributed over a cluster of machines.
849
850        Notes
851        -----
852        This update also supports updating an already trained model with new documents; the two models are then merged
853        in proportion to the number of old vs. new documents. This feature is still experimental for non-stationary
854        input streams. For stationary input (no topic drift in new documents), on the other hand, this equals the
855        online update of `Matthew D. Hoffman, David M. Blei, Francis Bach:
856        "Online Learning for Latent Dirichlet Allocation NIPS'10" <https://www.di.ens.fr/~fbach/mdhnips2010.pdf>`_.
857        and is guaranteed to converge for any `decay` in (0.5, 1.0). Additionally, for smaller corpus sizes, an
858        increasing `offset` may be beneficial (see Table 1 in the same paper).
859
860        Parameters
861        ----------
862        corpus : iterable of list of (int, float), optional
863            Stream of document vectors or sparse matrix of shape (`num_documents`, `num_terms`) used to update the
864            model.
865        chunksize :  int, optional
866            Number of documents to be used in each training chunk.
        decay : float, optional
            A number in (0.5, 1] that weights what percentage of the previous lambda value is forgotten
            when each new document is examined. Corresponds to Kappa from
            `Matthew D. Hoffman, David M. Blei, Francis Bach:
            "Online Learning for Latent Dirichlet Allocation NIPS'10" <https://www.di.ens.fr/~fbach/mdhnips2010.pdf>`_.
        offset : float, optional
            Hyper-parameter that controls how much we slow down the first few iterations.
            Corresponds to Tau_0 from `Matthew D. Hoffman, David M. Blei, Francis Bach:
            "Online Learning for Latent Dirichlet Allocation NIPS'10" <https://www.di.ens.fr/~fbach/mdhnips2010.pdf>`_.
        passes : int, optional
            Number of passes through the corpus during training.
        update_every : int, optional
            Number of chunks to accumulate before each model update (each worker processes `update_every`
            chunks in distributed mode). Set to 0 for batch learning, or > 0 for online (mini-batch) learning.
        eval_every : int, optional
            Log perplexity is estimated every `eval_every` model updates. Setting this to 1 slows down training by ~2x.
        iterations : int, optional
            Maximum number of E-step iterations when inferring the topic distribution of a document.
885        gamma_threshold : float, optional
886            Minimum change in the value of the gamma parameters to continue iterating.
        chunks_as_numpy : bool, optional
            Whether each chunk passed to the inference step should be a numpy.ndarray or not. Numpy can in some
            settings turn the term IDs into floats; these will be converted back into integers in inference, which
            incurs a performance hit. For distributed computing it may be desirable to keep the chunks as
            `numpy.ndarray`.
891
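        Examples
        --------
        Continue training an already initialized model on more documents (a sketch that simply re-uses
        the bundled test corpus):

        .. sourcecode:: pycon

            >>> from gensim.test.utils import common_corpus
            >>>
            >>> lda = LdaModel(common_corpus, num_topics=3)
            >>> lda.update(common_corpus, passes=2)  # two more passes over the (same) documents
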
892        """
893        # use parameters given in constructor, unless user explicitly overrode them
894        if decay is None:
895            decay = self.decay
896        if offset is None:
897            offset = self.offset
898        if passes is None:
899            passes = self.passes
900        if update_every is None:
901            update_every = self.update_every
902        if eval_every is None:
903            eval_every = self.eval_every
904        if iterations is None:
905            iterations = self.iterations
906        if gamma_threshold is None:
907            gamma_threshold = self.gamma_threshold
908
909        try:
910            lencorpus = len(corpus)
911        except Exception:
912            logger.warning("input corpus stream has no len(); counting documents")
913            lencorpus = sum(1 for _ in corpus)
914        if lencorpus == 0:
915            logger.warning("LdaModel.update() called with an empty corpus")
916            return
917
918        if chunksize is None:
919            chunksize = min(lencorpus, self.chunksize)
920
921        self.state.numdocs += lencorpus
922
923        if update_every:
924            updatetype = "online"
925            if passes == 1:
926                updatetype += " (single-pass)"
927            else:
928                updatetype += " (multi-pass)"
929            updateafter = min(lencorpus, update_every * self.numworkers * chunksize)
930        else:
931            updatetype = "batch"
932            updateafter = lencorpus
933        evalafter = min(lencorpus, (eval_every or 0) * self.numworkers * chunksize)
934
935        updates_per_pass = max(1, lencorpus / updateafter)
936        logger.info(
937            "running %s LDA training, %s topics, %i passes over "
938            "the supplied corpus of %i documents, updating model once "
939            "every %i documents, evaluating perplexity every %i documents, "
940            "iterating %ix with a convergence threshold of %f",
941            updatetype, self.num_topics, passes, lencorpus,
942            updateafter, evalafter, iterations,
943            gamma_threshold
944        )
945
946        if updates_per_pass * passes < 10:
947            logger.warning(
948                "too few updates, training might not converge; "
949                "consider increasing the number of passes or iterations to improve accuracy"
950            )
951
952        # rho is the "speed" of updating; TODO try other fncs
953        # pass_ + num_updates handles increasing the starting t for each pass,
954        # while allowing it to "reset" on the first pass of each update
955        def rho():
956            return pow(offset + pass_ + (self.num_updates / chunksize), -decay)
957
958        if self.callbacks:
959            # pass the list of input callbacks to Callback class
960            callback = Callback(self.callbacks)
961            callback.set_model(self)
962            # initialize metrics list to store metric values after every epoch
963            self.metrics = defaultdict(list)
964
965        for pass_ in range(passes):
966            if self.dispatcher:
967                logger.info('initializing %s workers', self.numworkers)
968                self.dispatcher.reset(self.state)
969            else:
970                other = LdaState(self.eta, self.state.sstats.shape, self.dtype)
971            dirty = False
972
973            reallen = 0
974            chunks = utils.grouper(corpus, chunksize, as_numpy=chunks_as_numpy, dtype=self.dtype)
975            for chunk_no, chunk in enumerate(chunks):
976                reallen += len(chunk)  # keep track of how many documents we've processed so far
977
978                if eval_every and ((reallen == lencorpus) or ((chunk_no + 1) % (eval_every * self.numworkers) == 0)):
979                    self.log_perplexity(chunk, total_docs=lencorpus)
980
981                if self.dispatcher:
982                    # add the chunk to dispatcher's job queue, so workers can munch on it
983                    logger.info(
984                        "PROGRESS: pass %i, dispatching documents up to #%i/%i",
985                        pass_, chunk_no * chunksize + len(chunk), lencorpus
986                    )
987                    # this will eventually block until some jobs finish, because the queue has a small finite length
988                    self.dispatcher.putjob(chunk)
989                else:
990                    logger.info(
991                        "PROGRESS: pass %i, at document #%i/%i",
992                        pass_, chunk_no * chunksize + len(chunk), lencorpus
993                    )
994                    gammat = self.do_estep(chunk, other)
995
996                    if self.optimize_alpha:
997                        self.update_alpha(gammat, rho())
998
999                dirty = True
1000                del chunk
1001
                # Perform an M step, at a cadence determined by update_every; don't do this after every chunk.
1003                if update_every and (chunk_no + 1) % (update_every * self.numworkers) == 0:
1004                    if self.dispatcher:
1005                        # distributed mode: wait for all workers to finish
1006                        logger.info("reached the end of input; now waiting for all remaining jobs to finish")
1007                        other = self.dispatcher.getstate()
1008                    self.do_mstep(rho(), other, pass_ > 0)
1009                    del other  # frees up memory
1010
1011                    if self.dispatcher:
1012                        logger.info('initializing workers')
1013                        self.dispatcher.reset(self.state)
1014                    else:
1015                        other = LdaState(self.eta, self.state.sstats.shape, self.dtype)
1016                    dirty = False
1017
1018            if reallen != lencorpus:
1019                raise RuntimeError("input corpus size changed during training (don't use generators as input)")
1020
1021            # append current epoch's metric values
1022            if self.callbacks:
1023                current_metrics = callback.on_epoch_end(pass_)
1024                for metric, value in current_metrics.items():
1025                    self.metrics[metric].append(value)
1026
1027            if dirty:
1028                # finish any remaining updates
1029                if self.dispatcher:
1030                    # distributed mode: wait for all workers to finish
1031                    logger.info("reached the end of input; now waiting for all remaining jobs to finish")
1032                    other = self.dispatcher.getstate()
1033                self.do_mstep(rho(), other, pass_ > 0)
1034                del other
1035                dirty = False
1036
1037    def do_mstep(self, rho, other, extra_pass=False):
1038        """Maximization step: use linear interpolation between the existing topics and
1039        collected sufficient statistics in `other` to update the topics.
1040
1041        Parameters
1042        ----------
1043        rho : float
1044            Learning rate.
        other : :class:`~gensim.models.ldamodel.LdaState`
            The state object whose sufficient statistics will be used to update the topics.
1047        extra_pass : bool, optional
1048            Whether this step required an additional pass over the corpus.
1049
1050        """
1051        logger.debug("updating topics")
1052        # update self with the new blend; also keep track of how much did
1053        # the topics change through this update, to assess convergence
1054        previous_Elogbeta = self.state.get_Elogbeta()
1055        self.state.blend(rho, other)
1056
1057        current_Elogbeta = self.state.get_Elogbeta()
1058        self.sync_state(current_Elogbeta)
1059
1060        # print out some debug info at the end of each EM iteration
1061        self.print_topics(5)
1062        diff = mean_absolute_difference(previous_Elogbeta.ravel(), current_Elogbeta.ravel())
1063        logger.info("topic diff=%f, rho=%f", diff, rho)
1064
1065        if self.optimize_eta:
1066            self.update_eta(self.state.get_lambda(), rho)
1067
1068        if not extra_pass:
1069            # only update if this isn't an additional pass
1070            self.num_updates += other.numdocs
1071
1072    def bound(self, corpus, gamma=None, subsample_ratio=1.0):
1073        """Estimate the variational bound of documents from the corpus as E_q[log p(corpus)] - E_q[log q(corpus)].
1074
1075        Parameters
1076        ----------
1077        corpus : iterable of list of (int, float), optional
1078            Stream of document vectors or sparse matrix of shape (`num_documents`, `num_terms`) used to estimate the
1079            variational bounds.
1080        gamma : numpy.ndarray, optional
1081            Topic weight variational parameters for each document. If not supplied, it will be inferred from the model.
1082        subsample_ratio : float, optional
1083            Percentage of the whole corpus represented by the passed `corpus` argument (in case this was a sample).
            Set to 1.0 if the whole corpus was passed. This is used as a multiplicative factor to scale the likelihood
1085            appropriately.
1086
1087        Returns
1088        -------
1089        numpy.ndarray
1090            The variational bound score calculated for each document.
1091
1092        """
1093        score = 0.0
1094        _lambda = self.state.get_lambda()
1095        Elogbeta = dirichlet_expectation(_lambda)
1096
1097        for d, doc in enumerate(corpus):  # stream the input doc-by-doc, in case it's too large to fit in RAM
1098            if d % self.chunksize == 0:
1099                logger.debug("bound: at document #%i", d)
1100            if gamma is None:
1101                gammad, _ = self.inference([doc])
1102            else:
1103                gammad = gamma[d]
1104            Elogthetad = dirichlet_expectation(gammad)
1105
1106            assert gammad.dtype == self.dtype
1107            assert Elogthetad.dtype == self.dtype
1108
1109            # E[log p(doc | theta, beta)]
1110            score += sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
1111
1112            # E[log p(theta | alpha) - log q(theta | gamma)]; assumes alpha is a vector
1113            score += np.sum((self.alpha - gammad) * Elogthetad)
1114            score += np.sum(gammaln(gammad) - gammaln(self.alpha))
1115            score += gammaln(np.sum(self.alpha)) - gammaln(np.sum(gammad))
1116
1117        # Compensate likelihood for when `corpus` above is only a sample of the whole corpus. This ensures
1118        # that the likelihood is always roughly on the same scale.
1119        score *= subsample_ratio
1120
1121        # E[log p(beta | eta) - log q (beta | lambda)]; assumes eta is a scalar
1122        score += np.sum((self.eta - _lambda) * Elogbeta)
1123        score += np.sum(gammaln(_lambda) - gammaln(self.eta))
1124
1125        if np.ndim(self.eta) == 0:
1126            sum_eta = self.eta * self.num_terms
1127        else:
1128            sum_eta = np.sum(self.eta)
1129
1130        score += np.sum(gammaln(sum_eta) - gammaln(np.sum(_lambda, 1)))
1131
1132        return score
1133
1134    def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):
1135        """Get a representation for selected topics.
1136
1137        Parameters
1138        ----------
1139        num_topics : int, optional
1140            Number of topics to be returned. Unlike LSA, there is no natural ordering between the topics in LDA.
            The returned subset of all topics is therefore arbitrary and may change between two LDA
1142            training runs.
1143        num_words : int, optional
1144            Number of words to be presented for each topic. These will be the most relevant words (assigned the highest
1145            probability for each topic).
1146        log : bool, optional
1147            Whether the output is also logged, besides being returned.
1148        formatted : bool, optional
1149            Whether the topic representations should be formatted as strings. If False, they are returned as
            2-tuples of (word, probability).
1151
1152        Returns
1153        -------
1154        list of {str, tuple of (str, float)}
1155            a list of topics, each represented either as a string (when `formatted` == True) or word-probability
1156            pairs.
1157
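        Examples
        --------
        A quick sketch (which topics are chosen, and their words and weights, depend on the trained model):

        .. sourcecode:: pycon

            >>> from gensim.test.utils import common_corpus, common_dictionary
            >>>
            >>> lda = LdaModel(common_corpus, num_topics=3, id2word=common_dictionary)
            >>> formatted = lda.show_topics(num_topics=2, num_words=5)
            >>> word_probability_pairs = lda.show_topics(num_topics=2, num_words=5, formatted=False)
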
1158        """
1159        if num_topics < 0 or num_topics >= self.num_topics:
1160            num_topics = self.num_topics
1161            chosen_topics = range(num_topics)
1162        else:
1163            num_topics = min(num_topics, self.num_topics)
1164
1165            # add a little random jitter, to randomize results around the same alpha
1166            sort_alpha = self.alpha + 0.0001 * self.random_state.rand(len(self.alpha))
1167            # random_state.rand returns float64, but converting back to dtype won't speed up anything
1168
1169            sorted_topics = list(matutils.argsort(sort_alpha))
1170            chosen_topics = sorted_topics[:num_topics // 2] + sorted_topics[-num_topics // 2:]
1171
1172        shown = []
1173
1174        topic = self.state.get_lambda()
1175        for i in chosen_topics:
1176            topic_ = topic[i]
1177            topic_ = topic_ / topic_.sum()  # normalize to probability distribution
1178            bestn = matutils.argsort(topic_, num_words, reverse=True)
1179            topic_ = [(self.id2word[id], topic_[id]) for id in bestn]
1180            if formatted:
1181                topic_ = ' + '.join('%.3f*"%s"' % (v, k) for k, v in topic_)
1182
1183            shown.append((i, topic_))
1184            if log:
1185                logger.info("topic #%i (%.3f): %s", i, self.alpha[i], topic_)
1186
1187        return shown
1188
1189    def show_topic(self, topicid, topn=10):
        """Get the representation for a single topic. Words here are the actual strings, in contrast to
        :meth:`~gensim.models.ldamodel.LdaModel.get_topic_terms` which represents words by their vocabulary IDs.
1192
1193        Parameters
1194        ----------
1195        topicid : int
1196            The ID of the topic to be returned
1197        topn : int, optional
1198            Number of the most significant words that are associated with the topic.
1199
1200        Returns
1201        -------
1202        list of (str, float)
1203            Word - probability pairs for the most relevant words generated by the topic.
1204
1205        """
1206        return [(self.id2word[id], value) for id, value in self.get_topic_terms(topicid, topn)]
1207
1208    def get_topics(self):
1209        """Get the term-topic matrix learned during inference.
1210
1211        Returns
1212        -------
1213        numpy.ndarray
1214            The probability for each word in each topic, shape (`num_topics`, `vocabulary_size`).
1215
1216        """
1217        topics = self.state.get_lambda()
1218        return topics / topics.sum(axis=1)[:, None]
1219
1220    def get_topic_terms(self, topicid, topn=10):
        """Get the representation for a single topic. Words are represented by their integer IDs, in contrast to
        :meth:`~gensim.models.ldamodel.LdaModel.show_topic` which represents words by the actual strings.
1223
1224        Parameters
1225        ----------
1226        topicid : int
1227            The ID of the topic to be returned
1228        topn : int, optional
1229            Number of the most significant words that are associated with the topic.
1230
1231        Returns
1232        -------
1233        list of (int, float)
1234            Word ID - probability pairs for the most relevant words generated by the topic.
1235
1236        """
1237        topic = self.get_topics()[topicid]
1238        topic = topic / topic.sum()  # normalize to probability distribution
1239        bestn = matutils.argsort(topic, topn, reverse=True)
1240        return [(idx, topic[idx]) for idx in bestn]
1241
1242    def top_topics(self, corpus=None, texts=None, dictionary=None, window_size=None,
1243                   coherence='u_mass', topn=20, processes=-1):
        """Get the topics sorted by coherence score, together with the coherence score of each topic.
1245
1246        Parameters
1247        ----------
1248        corpus : iterable of list of (int, float), optional
1249            Corpus in BoW format.
1250        texts : list of list of str, optional
            Tokenized texts, needed for coherence models that use a sliding window based probability estimator
            (i.e. coherence=`c_something`).
1253        dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            Gensim dictionary mapping of word IDs to words, used to create the corpus.
1255            If `model.id2word` is present, this is not needed. If both are provided, passed `dictionary` will be used.
1256        window_size : int, optional
1257            Is the size of the window to be used for coherence measures using boolean sliding window as their
1258            probability estimator. For 'u_mass' this doesn't matter.
            If None - the default window sizes are used, which are: 'c_v' - 110, 'c_uci' - 10, 'c_npmi' - 10.
1260        coherence : {'u_mass', 'c_v', 'c_uci', 'c_npmi'}, optional
1261            Coherence measure to be used.
            The fastest method is 'u_mass'; 'c_uci' is also known as `c_pmi`.
            For 'u_mass', `corpus` should be provided; if `texts` is provided, it will be converted to a corpus
            using the dictionary. For 'c_v', 'c_uci' and 'c_npmi', `texts` should be provided (`corpus` isn't needed).
1265        topn : int, optional
1266            Integer corresponding to the number of top words to be extracted from each topic.
1267        processes : int, optional
1268            Number of processes to use for probability estimation phase, any value less than 1 will be interpreted as
1269            num_cpus - 1.
1270
1271        Returns
1272        -------
1273        list of (list of (int, str), float)
1274            Each element in the list is a pair of a topic representation and its coherence score. Topic representations
1275            are distributions of words, represented as a list of pairs of word IDs and their probabilities.
1276
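        Examples
        --------
        A minimal sketch, assuming `lda` was trained on a BoW corpus `corpus` (the 'u_mass' measure only
        needs the corpus, not the raw texts):

        .. sourcecode:: pycon

            >>> topics_with_scores = lda.top_topics(corpus=corpus, coherence='u_mass', topn=5)
            >>> best_topic, best_score = topics_with_scores[0]  # topics are sorted by decreasing coherence
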
1277        """
1278        cm = CoherenceModel(
1279            model=self, corpus=corpus, texts=texts, dictionary=dictionary,
1280            window_size=window_size, coherence=coherence, topn=topn,
1281            processes=processes
1282        )
1283        coherence_scores = cm.get_coherence_per_topic()
1284
1285        str_topics = []
1286        for topic in self.get_topics():  # topic = array of vocab_size floats, one per term
1287            bestn = matutils.argsort(topic, topn=topn, reverse=True)  # top terms for topic
1288            beststr = [(topic[_id], self.id2word[_id]) for _id in bestn]  # membership, token
1289            str_topics.append(beststr)  # list of topn (float membership, token) tuples
1290
1291        scored_topics = zip(str_topics, coherence_scores)
1292        return sorted(scored_topics, key=lambda tup: tup[1], reverse=True)
1293
1294    def get_document_topics(self, bow, minimum_probability=None, minimum_phi_value=None,
1295                            per_word_topics=False):
1296        """Get the topic distribution for the given document.
1297
1298        Parameters
1299        ----------
        bow : list of (int, float)
1301            The document in BOW format.
1302        minimum_probability : float
1303            Topics with an assigned probability lower than this threshold will be discarded.
1304        minimum_phi_value : float
1305            If `per_word_topics` is True, this represents a lower bound on the term probabilities that are included.
            If set to None, a value of 1e-8 is used to prevent 0s.
1307        per_word_topics : bool
1308            If True, this function will also return two extra lists as explained in the "Returns" section.
1309
1310        Returns
1311        -------
1312        list of (int, float)
1313            Topic distribution for the whole document. Each element in the list is a pair of a topic's id, and
1314            the probability that was assigned to it.
        list of (int, list of (int, float)), optional
1316            Most probable topics per word. Each element in the list is a pair of a word's id, and a list of
1317            topics sorted by their relevance to this word. Only returned if `per_word_topics` was set to True.
1318        list of (int, list of float), optional
1319            Phi relevance values, multiplied by the feature length, for each word-topic combination.
1320            Each element in the list is a pair of a word's id and a list of the phi values between this word and
1321            each topic. Only returned if `per_word_topics` was set to True.
1322
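        Examples
        --------
        A minimal sketch, assuming `lda` is a trained model and `dictionary` is the
        :class:`~gensim.corpora.dictionary.Dictionary` used to build its training corpus
        (the tokens below are purely illustrative):

        .. sourcecode:: pycon

            >>> bow = dictionary.doc2bow(['graph', 'trees'])
            >>> doc_topics = lda.get_document_topics(bow, minimum_probability=0.05)
            >>>
            >>> # with per_word_topics=True, three lists are returned instead of one
            >>> doc_topics, word_topics, word_phis = lda.get_document_topics(bow, per_word_topics=True)
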
1323        """
1324        if minimum_probability is None:
1325            minimum_probability = self.minimum_probability
1326        minimum_probability = max(minimum_probability, 1e-8)  # never allow zero values in sparse output
1327
1328        if minimum_phi_value is None:
1329            minimum_phi_value = self.minimum_probability
1330        minimum_phi_value = max(minimum_phi_value, 1e-8)  # never allow zero values in sparse output
1331
1332        # if the input vector is a corpus, return a transformed corpus
1333        is_corpus, corpus = utils.is_corpus(bow)
1334        if is_corpus:
1335            kwargs = dict(
1336                per_word_topics=per_word_topics,
1337                minimum_probability=minimum_probability,
1338                minimum_phi_value=minimum_phi_value
1339            )
1340            return self._apply(corpus, **kwargs)
1341
1342        gamma, phis = self.inference([bow], collect_sstats=per_word_topics)
1343        topic_dist = gamma[0] / sum(gamma[0])  # normalize distribution
1344
1345        document_topics = [
1346            (topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist)
1347            if topicvalue >= minimum_probability
1348        ]
1349
1350        if not per_word_topics:
1351            return document_topics
1352
1353        word_topic = []  # contains word and corresponding topic
1354        word_phi = []  # contains word and phi values
1355        for word_type, weight in bow:
1356            phi_values = []  # contains (phi_value, topic) pairing to later be sorted
1357            phi_topic = []  # contains topic and corresponding phi value to be returned 'raw' to user
1358            for topic_id in range(0, self.num_topics):
1359                if phis[topic_id][word_type] >= minimum_phi_value:
1360                    # appends phi values for each topic for that word
1361                    # these phi values are scaled by feature length
1362                    phi_values.append((phis[topic_id][word_type], topic_id))
1363                    phi_topic.append((topic_id, phis[topic_id][word_type]))
1364
            # append the (word_id, [(topic_0, phi_value), (topic_1, phi_value), ...]) pair
            word_phi.append((word_type, phi_topic))
            # sort this word's topics by descending phi value, keeping only the topic IDs:
            # (word_id, [most_probable_topic_id, second_most_probable_topic_id, ...])
1369            sorted_phi_values = sorted(phi_values, reverse=True)
1370            topics_sorted = [x[1] for x in sorted_phi_values]
1371            word_topic.append((word_type, topics_sorted))
1372
        return document_topics, word_topic, word_phi  # returns a 3-tuple when per_word_topics is True
1374
1375    def get_term_topics(self, word_id, minimum_probability=None):
1376        """Get the most relevant topics to the given word.
1377
1378        Parameters
1379        ----------
        word_id : int or str
            The word for which the topic distribution will be computed, given either as its vocabulary ID (int)
            or as the word itself (str).
1382        minimum_probability : float, optional
1383            Topics with an assigned probability below this threshold will be discarded.
1384
1385        Returns
1386        -------
1387        list of (int, float)
1388            The relevant topics represented as pairs of their ID and their assigned probability, sorted
1389            by relevance to the given word.
1390
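        Examples
        --------
        A minimal sketch, assuming `lda` is a trained model; per the implementation, the word may be passed
        either as its vocabulary ID or as the word string itself:

        .. sourcecode:: pycon

            >>> topics_for_word = lda.get_term_topics('graph', minimum_probability=0.01)  # list of (topic_id, prob)
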
1391        """
1392        if minimum_probability is None:
1393            minimum_probability = self.minimum_probability
1394        minimum_probability = max(minimum_probability, 1e-8)  # never allow zero values in sparse output
1395
1396        # if user enters word instead of id in vocab, change to get id
1397        if isinstance(word_id, str):
1398            word_id = self.id2word.doc2bow([word_id])[0][0]
1399
1400        values = []
1401        for topic_id in range(0, self.num_topics):
1402            if self.expElogbeta[topic_id][word_id] >= minimum_probability:
1403                values.append((topic_id, self.expElogbeta[topic_id][word_id]))
1404
1405        return values
1406
1407    def diff(self, other, distance="kullback_leibler", num_words=100,
1408             n_ann_terms=10, diagonal=False, annotation=True, normed=True):
1409        """Calculate the difference in topic distributions between two models: `self` and `other`.
1410
1411        Parameters
1412        ----------
1413        other : :class:`~gensim.models.ldamodel.LdaModel`
1414            The model which will be compared against the current object.
1415        distance : {'kullback_leibler', 'hellinger', 'jaccard', 'jensen_shannon'}
1416            The distance metric to calculate the difference with.
1417        num_words : int, optional
1418            The number of most relevant words used if `distance == 'jaccard'`. Also used for annotating topics.
1419        n_ann_terms : int, optional
1420            Max number of words in intersection/symmetric difference between topics. Used for annotation.
1421        diagonal : bool, optional
1422            Whether we need the difference between identical topics (the diagonal of the difference matrix).
1423        annotation : bool, optional
1424            Whether the intersection or difference of words between two topics should be returned.
1425        normed : bool, optional
1426            Whether the matrix should be normalized or not.
1427
1428        Returns
1429        -------
1430        numpy.ndarray
1431            A difference matrix. Each element corresponds to the difference between the two topics,
1432            shape (`self.num_topics`, `other.num_topics`)
1433        numpy.ndarray, optional
1434            Annotation matrix where for each pair we include the word from the intersection of the two topics,
1435            and the word from the symmetric difference of the two topics. Only included if `annotation == True`.
1436            Shape (`self.num_topics`, `other_model.num_topics`, 2).
1437
1438        Examples
1439        --------
1440        Get the differences between each pair of topics inferred by two models
1441
1442        .. sourcecode:: pycon
1443
1444            >>> from gensim.models.ldamulticore import LdaMulticore
1445            >>> from gensim.test.utils import datapath
1446            >>>
1447            >>> m1 = LdaMulticore.load(datapath("lda_3_0_1_model"))
1448            >>> m2 = LdaMulticore.load(datapath("ldamodel_python_3_5"))
1449            >>> mdiff, annotation = m1.diff(m2)
1450            >>> topic_diff = mdiff  # get matrix with difference for each topic pair from `m1` and `m2`
1451
1452        """
1453        distances = {
1454            "kullback_leibler": kullback_leibler,
1455            "hellinger": hellinger,
1456            "jaccard": jaccard_distance,
1457            "jensen_shannon": jensen_shannon
1458        }
1459
1460        if distance not in distances:
1461            valid_keys = ", ".join("`{}`".format(x) for x in distances.keys())
1462            raise ValueError("Incorrect distance, valid only {}".format(valid_keys))
1463
1464        if not isinstance(other, self.__class__):
            raise ValueError("The parameter `other` must be of type `{}`".format(self.__class__.__name__))
1466
1467        distance_func = distances[distance]
1468        d1, d2 = self.get_topics(), other.get_topics()
1469        t1_size, t2_size = d1.shape[0], d2.shape[0]
1470        annotation_terms = None
1471
1472        fst_topics = [{w for (w, _) in self.show_topic(topic, topn=num_words)} for topic in range(t1_size)]
1473        snd_topics = [{w for (w, _) in other.show_topic(topic, topn=num_words)} for topic in range(t2_size)]
1474
1475        if distance == "jaccard":
1476            d1, d2 = fst_topics, snd_topics
1477
1478        if diagonal:
1479            assert t1_size == t2_size, \
1480                "Both input models should have same no. of topics, " \
1481                "as the diagonal will only be valid in a square matrix"
1482            # initialize z and annotation array
1483            z = np.zeros(t1_size)
1484            if annotation:
1485                annotation_terms = np.zeros(t1_size, dtype=list)
1486        else:
1487            # initialize z and annotation matrix
1488            z = np.zeros((t1_size, t2_size))
1489            if annotation:
1490                annotation_terms = np.zeros((t1_size, t2_size), dtype=list)
1491
1492        # iterate over each cell in the initialized z and annotation
1493        for topic in np.ndindex(z.shape):
1494            topic1 = topic[0]
1495            if diagonal:
1496                topic2 = topic1
1497            else:
1498                topic2 = topic[1]
1499
1500            z[topic] = distance_func(d1[topic1], d2[topic2])
1501            if annotation:
1502                pos_tokens = fst_topics[topic1] & snd_topics[topic2]
1503                neg_tokens = fst_topics[topic1].symmetric_difference(snd_topics[topic2])
1504
1505                pos_tokens = list(pos_tokens)[:min(len(pos_tokens), n_ann_terms)]
1506                neg_tokens = list(neg_tokens)[:min(len(neg_tokens), n_ann_terms)]
1507
1508                annotation_terms[topic] = [pos_tokens, neg_tokens]
1509
1510        if normed:
1511            if np.abs(np.max(z)) > 1e-8:
1512                z /= np.max(z)
1513
1514        return z, annotation_terms
1515
1516    def __getitem__(self, bow, eps=None):
1517        """Get the topic distribution for the given document.
1518
1519        Wraps :meth:`~gensim.models.ldamodel.LdaModel.get_document_topics` to support an operator style call.
1520        Uses the model's current state (set using constructor arguments) to fill in the additional arguments of the
1521        wrapper method.
1522
1523        Parameters
        ----------
1525        bow : list of (int, float)
1526            The document in BOW format.
1527        eps : float, optional
1528            Topics with an assigned probability lower than this threshold will be discarded.
1529
1530        Returns
1531        -------
1532        list of (int, float)
1533            Topic distribution for the given document. Each topic is represented as a pair of its ID and the probability
1534            assigned to it.
1535
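        Examples
        --------
        Operator-style usage, equivalent to calling :meth:`~gensim.models.ldamodel.LdaModel.get_document_topics`
        with the model's defaults (a sketch, assuming `lda` is trained and `doc_bow` is a document in BoW format):

        .. sourcecode:: pycon

            >>> topic_dist = lda[doc_bow]  # list of (topic_id, probability) pairs for this document
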
1536        """
1537        return self.get_document_topics(bow, eps, self.minimum_phi_value, self.per_word_topics)
1538
1539    def save(self, fname, ignore=('state', 'dispatcher'), separately=None, *args, **kwargs):
1540        """Save the model to a file.
1541
1542        Large internal arrays may be stored into separate files, with `fname` as prefix.
1543
1544        Notes
1545        -----
1546        If you intend to use models across Python 2/3 versions there are a few things to
1547        keep in mind:
1548
1549          1. The pickled Python dictionaries will not work across Python versions
1550          2. The `save` method does not automatically save all numpy arrays separately, only
1551             those ones that exceed `sep_limit` set in :meth:`~gensim.utils.SaveLoad.save`. The main
1552             concern here is the `alpha` array if for instance using `alpha='auto'`.
1553
1554        Please refer to the `wiki recipes section
1555        <https://github.com/RaRe-Technologies/gensim/wiki/
1556        Recipes-&-FAQ#q9-how-do-i-load-a-model-in-python-3-that-was-trained-and-saved-using-python-2>`_
1557        for an example on how to work around these issues.
1558
1559        See Also
1560        --------
1561        :meth:`~gensim.models.ldamodel.LdaModel.load`
1562            Load model.
1563
1564        Parameters
1565        ----------
1566        fname : str
1567            Path to the system file where the model will be persisted.
1568        ignore : tuple of str, optional
1569            The named attributes in the tuple will be left out of the pickled model. The reason why
1570            the internal `state` is ignored by default is that it uses its own serialisation rather than the one
1571            provided by this method.
1572        separately : {list of str, None}, optional
1573            If None -  automatically detect large numpy/scipy.sparse arrays in the object being stored, and store
1574            them into separate files. This avoids pickle memory errors and allows `mmap`'ing large arrays
            back on load efficiently. If list of str - these attributes will be stored in separate files,
1576            the automatic check is not performed in this case.
1577        *args
1578            Positional arguments propagated to :meth:`~gensim.utils.SaveLoad.save`.
1579        **kwargs
1580            Key word arguments propagated to :meth:`~gensim.utils.SaveLoad.save`.
1581
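        Examples
        --------
        A minimal sketch; `get_tmpfile` just provides a writable temporary path here:

        .. sourcecode:: pycon

            >>> from gensim.test.utils import get_tmpfile
            >>>
            >>> fname = get_tmpfile("lda_model")
            >>> lda.save(fname)  # the state and id2word mapping are written to fname + '.state' / '.id2word'
            >>> loaded = LdaModel.load(fname)
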
1582        """
1583        if self.state is not None:
1584            self.state.save(utils.smart_extension(fname, '.state'), *args, **kwargs)
1585        # Save the dictionary separately if not in 'ignore'.
1586        if 'id2word' not in ignore:
1587            utils.pickle(self.id2word, utils.smart_extension(fname, '.id2word'))
1588
1589        # make sure 'state', 'id2word' and 'dispatcher' are ignored from the pickled object, even if
1590        # someone sets the ignore list themselves
1591        if ignore is not None and ignore:
1592            if isinstance(ignore, str):
1593                ignore = [ignore]
1594            ignore = [e for e in ignore if e]  # make sure None and '' are not in the list
1595            ignore = list({'state', 'dispatcher', 'id2word'} | set(ignore))
1596        else:
1597            ignore = ['state', 'dispatcher', 'id2word']
1598
1599        # make sure 'expElogbeta' and 'sstats' are ignored from the pickled object, even if
1600        # someone sets the separately list themselves.
1601        separately_explicit = ['expElogbeta', 'sstats']
1602        # Also add 'alpha' and 'eta' to separately list if they are set 'auto' or some
1603        # array manually.
1604        if (isinstance(self.alpha, str) and self.alpha == 'auto') or \
1605                (isinstance(self.alpha, np.ndarray) and len(self.alpha.shape) != 1):
1606            separately_explicit.append('alpha')
1607        if (isinstance(self.eta, str) and self.eta == 'auto') or \
1608                (isinstance(self.eta, np.ndarray) and len(self.eta.shape) != 1):
1609            separately_explicit.append('eta')
1610        # Merge separately_explicit with separately.
1611        if separately:
1612            if isinstance(separately, str):
1613                separately = [separately]
1614            separately = [e for e in separately if e]  # make sure None and '' are not in the list
1615            separately = list(set(separately_explicit) | set(separately))
1616        else:
1617            separately = separately_explicit
1618        super(LdaModel, self).save(fname, ignore=ignore, separately=separately, *args, **kwargs)
1619
1620    @classmethod
1621    def load(cls, fname, *args, **kwargs):
1622        """Load a previously saved :class:`gensim.models.ldamodel.LdaModel` from file.
1623
1624        See Also
1625        --------
1626        :meth:`~gensim.models.ldamodel.LdaModel.save`
1627            Save model.
1628
1629        Parameters
1630        ----------
1631        fname : str
1632            Path to the file where the model is stored.
1633        *args
1634            Positional arguments propagated to :meth:`~gensim.utils.SaveLoad.load`.
1635        **kwargs
1636            Key word arguments propagated to :meth:`~gensim.utils.SaveLoad.load`.
1637
1638        Examples
1639        --------
1640        Large arrays can be memmap'ed back as read-only (shared memory) by setting `mmap='r'`:
1641
1642        .. sourcecode:: pycon
1643
1644            >>> from gensim.test.utils import datapath
1645            >>>
1646            >>> fname = datapath("lda_3_0_1_model")
1647            >>> lda = LdaModel.load(fname, mmap='r')
1648
1649        """
1650        kwargs['mmap'] = kwargs.get('mmap', None)
1651        result = super(LdaModel, cls).load(fname, *args, **kwargs)
1652
1653        # check if `random_state` attribute has been set after main pickle load
1654        # if set -> the model to be loaded was saved using a >= 0.13.2 version of Gensim
1655        # if not set -> the model to be loaded was saved using a < 0.13.2 version of Gensim,
1656        # so set `random_state` as the default value
1657        if not hasattr(result, 'random_state'):
1658            result.random_state = utils.get_random_state(None)  # using default value `get_random_state(None)`
1659            logging.warning("random_state not set so using default value")
1660
1661        # dtype could be absent in old models
1662        if not hasattr(result, 'dtype'):
1663            result.dtype = np.float64  # float64 was implicitly used before (cause it's default in numpy)
1664            logging.info("dtype was not set in saved %s file %s, assuming np.float64", result.__class__.__name__, fname)
1665
1666        state_fname = utils.smart_extension(fname, '.state')
1667        try:
1668            result.state = LdaState.load(state_fname, *args, **kwargs)
1669        except Exception as e:
1670            logging.warning("failed to load state from %s: %s", state_fname, e)
1671
1672        id2word_fname = utils.smart_extension(fname, '.id2word')
1673        # check if `id2word_fname` file is present on disk
1674        # if present -> the model to be loaded was saved using a >= 0.13.2 version of Gensim,
1675        # so set `result.id2word` using the `id2word_fname` file
1676        # if not present -> the model to be loaded was saved using a < 0.13.2 version of Gensim,
1677        # so `result.id2word` already set after the main pickle load
1678        if os.path.isfile(id2word_fname):
1679            try:
1680                result.id2word = utils.unpickle(id2word_fname)
1681            except Exception as e:
1682                logging.warning("failed to load id2word dictionary from %s: %s", id2word_fname, e)
1683        return result
1684