1#!/usr/bin/env python 2# -*- coding: utf-8 -*- 3# 4# Copyright (C) 2011 Radim Rehurek <radimrehurek@seznam.cz> 5# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 7"""Optimized `Latent Dirichlet Allocation (LDA) <https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation>` in Python. 8 9For a faster implementation of LDA (parallelized for multicore machines), see also :mod:`gensim.models.ldamulticore`. 10 11This module allows both LDA model estimation from a training corpus and inference of topic 12distribution on new, unseen documents. The model can also be updated with new documents 13for online training. 14 15The core estimation code is based on the `onlineldavb.py script 16<https://github.com/blei-lab/onlineldavb/blob/master/onlineldavb.py>`_, by `Hoffman, Blei, Bach: 17Online Learning for Latent Dirichlet Allocation, NIPS 2010 18<https://scholar.google.com/citations?hl=en&user=IeHKeGYAAAAJ&view_op=list_works>`_. 19 20The algorithm: 21 22#. Is **streamed**: training documents may come in sequentially, no random access required. 23#. Runs in **constant memory** w.r.t. the number of documents: size of the training corpus does not affect memory 24 footprint, can process corpora larger than RAM. 25#. Is **distributed**: makes use of a cluster of machines, if available, to speed up model estimation. 26 27 28Usage examples 29-------------- 30 31Train an LDA model using a Gensim corpus 32 33.. sourcecode:: pycon 34 35 >>> from gensim.test.utils import common_texts 36 >>> from gensim.corpora.dictionary import Dictionary 37 >>> 38 >>> # Create a corpus from a list of texts 39 >>> common_dictionary = Dictionary(common_texts) 40 >>> common_corpus = [common_dictionary.doc2bow(text) for text in common_texts] 41 >>> 42 >>> # Train the model on the corpus. 43 >>> lda = LdaModel(common_corpus, num_topics=10) 44 45Save a model to disk, or reload a pre-trained model 46 47.. sourcecode:: pycon 48 49 >>> from gensim.test.utils import datapath 50 >>> 51 >>> # Save model to disk. 52 >>> temp_file = datapath("model") 53 >>> lda.save(temp_file) 54 >>> 55 >>> # Load a potentially pretrained model from disk. 56 >>> lda = LdaModel.load(temp_file) 57 58Query, the model using new, unseen documents 59 60.. sourcecode:: pycon 61 62 >>> # Create a new corpus, made of previously unseen documents. 63 >>> other_texts = [ 64 ... ['computer', 'time', 'graph'], 65 ... ['survey', 'response', 'eps'], 66 ... ['human', 'system', 'computer'] 67 ... ] 68 >>> other_corpus = [common_dictionary.doc2bow(text) for text in other_texts] 69 >>> 70 >>> unseen_doc = other_corpus[0] 71 >>> vector = lda[unseen_doc] # get topic probability distribution for a document 72 73Update the model by incrementally training on the new corpus 74 75.. sourcecode:: pycon 76 77 >>> lda.update(other_corpus) 78 >>> vector = lda[unseen_doc] 79 80A lot of parameters can be tuned to optimize training for your specific case 81 82.. sourcecode:: pycon 83 84 >>> lda = LdaModel(common_corpus, num_topics=50, alpha='auto', eval_every=5) # learn asymmetric alpha from data 85 86""" 87 88import logging 89import numbers 90import os 91import time 92from collections import defaultdict 93 94import numpy as np 95from scipy.special import gammaln, psi # gamma function utils 96from scipy.special import polygamma 97 98from gensim import interfaces, utils, matutils 99from gensim.matutils import ( 100 kullback_leibler, hellinger, jaccard_distance, jensen_shannon, 101 dirichlet_expectation, logsumexp, mean_absolute_difference, 102) 103from gensim.models import basemodel, CoherenceModel 104from gensim.models.callbacks import Callback 105 106 107logger = logging.getLogger(__name__) 108 109 110def update_dir_prior(prior, N, logphat, rho): 111 """Update a given prior using Newton's method, described in 112 `J. Huang: "Maximum Likelihood Estimation of Dirichlet Distribution Parameters" 113 <http://jonathan-huang.org/research/dirichlet/dirichlet.pdf>`_. 114 115 Parameters 116 ---------- 117 prior : list of float 118 The prior for each possible outcome at the previous iteration (to be updated). 119 N : int 120 Number of observations. 121 logphat : list of float 122 Log probabilities for the current estimation, also called "observed sufficient statistics". 123 rho : float 124 Learning rate. 125 126 Returns 127 ------- 128 list of float 129 The updated prior. 130 131 """ 132 gradf = N * (psi(np.sum(prior)) - psi(prior) + logphat) 133 134 c = N * polygamma(1, np.sum(prior)) 135 q = -N * polygamma(1, prior) 136 137 b = np.sum(gradf / q) / (1 / c + np.sum(1 / q)) 138 139 dprior = -(gradf - b) / q 140 141 updated_prior = rho * dprior + prior 142 if all(updated_prior > 0): 143 prior = updated_prior 144 else: 145 logger.warning("updated prior is not positive") 146 return prior 147 148 149class LdaState(utils.SaveLoad): 150 """Encapsulate information for distributed computation of :class:`~gensim.models.ldamodel.LdaModel` objects. 151 152 Objects of this class are sent over the network, so try to keep them lean to 153 reduce traffic. 154 155 """ 156 def __init__(self, eta, shape, dtype=np.float32): 157 """ 158 159 Parameters 160 ---------- 161 eta : numpy.ndarray 162 The prior probabilities assigned to each term. 163 shape : tuple of (int, int) 164 Shape of the sufficient statistics: (number of topics to be found, number of terms in the vocabulary). 165 dtype : type 166 Overrides the numpy array default types. 167 168 """ 169 self.eta = eta.astype(dtype, copy=False) 170 self.sstats = np.zeros(shape, dtype=dtype) 171 self.numdocs = 0 172 self.dtype = dtype 173 174 def reset(self): 175 """Prepare the state for a new EM iteration (reset sufficient stats).""" 176 self.sstats[:] = 0.0 177 self.numdocs = 0 178 179 def merge(self, other): 180 """Merge the result of an E step from one node with that of another node (summing up sufficient statistics). 181 182 The merging is trivial and after merging all cluster nodes, we have the 183 exact same result as if the computation was run on a single node (no 184 approximation). 185 186 Parameters 187 ---------- 188 other : :class:`~gensim.models.ldamodel.LdaState` 189 The state object with which the current one will be merged. 190 191 """ 192 assert other is not None 193 self.sstats += other.sstats 194 self.numdocs += other.numdocs 195 196 def blend(self, rhot, other, targetsize=None): 197 """Merge the current state with another one using a weighted average for the sufficient statistics. 198 199 The number of documents is stretched in both state objects, so that they are of comparable magnitude. 200 This procedure corresponds to the stochastic gradient update from 201 `Hoffman et al. :"Online Learning for Latent Dirichlet Allocation" 202 <https://www.di.ens.fr/~fbach/mdhnips2010.pdf>`_, see equations (5) and (9). 203 204 Parameters 205 ---------- 206 rhot : float 207 Weight of the `other` state in the computed average. A value of 0.0 means that `other` 208 is completely ignored. A value of 1.0 means `self` is completely ignored. 209 other : :class:`~gensim.models.ldamodel.LdaState` 210 The state object with which the current one will be merged. 211 targetsize : int, optional 212 The number of documents to stretch both states to. 213 214 """ 215 assert other is not None 216 if targetsize is None: 217 targetsize = self.numdocs 218 219 # stretch the current model's expected n*phi counts to target size 220 if self.numdocs == 0 or targetsize == self.numdocs: 221 scale = 1.0 222 else: 223 scale = 1.0 * targetsize / self.numdocs 224 self.sstats *= (1.0 - rhot) * scale 225 226 # stretch the incoming n*phi counts to target size 227 if other.numdocs == 0 or targetsize == other.numdocs: 228 scale = 1.0 229 else: 230 logger.info("merging changes from %i documents into a model of %i documents", other.numdocs, targetsize) 231 scale = 1.0 * targetsize / other.numdocs 232 self.sstats += rhot * scale * other.sstats 233 234 self.numdocs = targetsize 235 236 def blend2(self, rhot, other, targetsize=None): 237 """Merge the current state with another one using a weighted sum for the sufficient statistics. 238 239 In contrast to :meth:`~gensim.models.ldamodel.LdaState.blend`, the sufficient statistics are not scaled 240 prior to aggregation. 241 242 Parameters 243 ---------- 244 rhot : float 245 Unused. 246 other : :class:`~gensim.models.ldamodel.LdaState` 247 The state object with which the current one will be merged. 248 targetsize : int, optional 249 The number of documents to stretch both states to. 250 251 """ 252 assert other is not None 253 if targetsize is None: 254 targetsize = self.numdocs 255 256 # merge the two matrices by summing 257 self.sstats += other.sstats 258 self.numdocs = targetsize 259 260 def get_lambda(self): 261 """Get the parameters of the posterior over the topics, also referred to as "the topics". 262 263 Returns 264 ------- 265 numpy.ndarray 266 Parameters of the posterior probability over topics. 267 268 """ 269 return self.eta + self.sstats 270 271 def get_Elogbeta(self): 272 """Get the log (posterior) probabilities for each topic. 273 274 Returns 275 ------- 276 numpy.ndarray 277 Posterior probabilities for each topic. 278 """ 279 return dirichlet_expectation(self.get_lambda()) 280 281 @classmethod 282 def load(cls, fname, *args, **kwargs): 283 """Load a previously stored state from disk. 284 285 Overrides :class:`~gensim.utils.SaveLoad.load` by enforcing the `dtype` parameter 286 to ensure backwards compatibility. 287 288 Parameters 289 ---------- 290 fname : str 291 Path to file that contains the needed object. 292 args : object 293 Positional parameters to be propagated to class:`~gensim.utils.SaveLoad.load` 294 kwargs : object 295 Key-word parameters to be propagated to class:`~gensim.utils.SaveLoad.load` 296 297 Returns 298 ------- 299 :class:`~gensim.models.ldamodel.LdaState` 300 The state loaded from the given file. 301 302 """ 303 result = super(LdaState, cls).load(fname, *args, **kwargs) 304 305 # dtype could be absent in old models 306 if not hasattr(result, 'dtype'): 307 result.dtype = np.float64 # float64 was implicitly used before (because it's the default in numpy) 308 logging.info("dtype was not set in saved %s file %s, assuming np.float64", result.__class__.__name__, fname) 309 310 return result 311 312 313class LdaModel(interfaces.TransformationABC, basemodel.BaseTopicModel): 314 """Train and use Online Latent Dirichlet Allocation (OLDA) models as presented in 315 `Hoffman et al. :"Online Learning for Latent Dirichlet Allocation" <https://www.di.ens.fr/~fbach/mdhnips2010.pdf>`_. 316 317 Examples 318 ------- 319 Initialize a model using a Gensim corpus 320 321 .. sourcecode:: pycon 322 323 >>> from gensim.test.utils import common_corpus 324 >>> 325 >>> lda = LdaModel(common_corpus, num_topics=10) 326 327 You can then infer topic distributions on new, unseen documents. 328 329 .. sourcecode:: pycon 330 331 >>> doc_bow = [(1, 0.3), (2, 0.1), (0, 0.09)] 332 >>> doc_lda = lda[doc_bow] 333 334 The model can be updated (trained) with new documents. 335 336 .. sourcecode:: pycon 337 338 >>> # In practice (corpus =/= initial training corpus), but we use the same here for simplicity. 339 >>> other_corpus = common_corpus 340 >>> 341 >>> lda.update(other_corpus) 342 343 Model persistency is achieved through :meth:`~gensim.models.ldamodel.LdaModel.load` and 344 :meth:`~gensim.models.ldamodel.LdaModel.save` methods. 345 346 """ 347 def __init__(self, corpus=None, num_topics=100, id2word=None, 348 distributed=False, chunksize=2000, passes=1, update_every=1, 349 alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10, 350 iterations=50, gamma_threshold=0.001, minimum_probability=0.01, 351 random_state=None, ns_conf=None, minimum_phi_value=0.01, 352 per_word_topics=False, callbacks=None, dtype=np.float32): 353 """ 354 355 Parameters 356 ---------- 357 corpus : iterable of list of (int, float), optional 358 Stream of document vectors or sparse matrix of shape (`num_documents`, `num_terms`). 359 If you have a CSC in-memory matrix, you can convert it to a 360 streamed corpus with the help of gensim.matutils.Sparse2Corpus. 361 If not given, the model is left untrained (presumably because you want to call 362 :meth:`~gensim.models.ldamodel.LdaModel.update` manually). 363 num_topics : int, optional 364 The number of requested latent topics to be extracted from the training corpus. 365 id2word : {dict of (int, str), :class:`gensim.corpora.dictionary.Dictionary`} 366 Mapping from word IDs to words. It is used to determine the vocabulary size, as well as for 367 debugging and topic printing. 368 distributed : bool, optional 369 Whether distributed computing should be used to accelerate training. 370 chunksize : int, optional 371 Number of documents to be used in each training chunk. 372 passes : int, optional 373 Number of passes through the corpus during training. 374 update_every : int, optional 375 Number of documents to be iterated through for each update. 376 Set to 0 for batch learning, > 1 for online iterative learning. 377 alpha : {numpy.ndarray, str}, optional 378 Can be set to an 1D array of length equal to the number of expected topics that expresses 379 our a-priori belief for each topics' probability. 380 Alternatively default prior selecting strategies can be employed by supplying a string: 381 382 * 'symmetric': Default; uses a fixed symmetric prior per topic, 383 * 'asymmetric': Uses a fixed normalized asymmetric prior of `1.0 / (topic_index + sqrt(num_topics))`, 384 * 'auto': Learns an asymmetric prior from the corpus (not available if `distributed==True`). 385 eta : {float, np.array, str}, optional 386 A-priori belief on word probability, this can be: 387 388 * scalar for a symmetric prior over topic/word probability, 389 * vector of length num_words to denote an asymmetric user defined probability for each word, 390 * matrix of shape (num_topics, num_words) to assign a probability for each word-topic combination, 391 * the string 'auto' to learn the asymmetric prior from the data. 392 decay : float, optional 393 A number between (0.5, 1] to weight what percentage of the previous lambda value is forgotten 394 when each new document is examined. Corresponds to Kappa from 395 `Matthew D. Hoffman, David M. Blei, Francis Bach: 396 "Online Learning for Latent Dirichlet Allocation NIPS'10" <https://www.di.ens.fr/~fbach/mdhnips2010.pdf>`_. 397 offset : float, optional 398 Hyper-parameter that controls how much we will slow down the first steps the first few iterations. 399 Corresponds to Tau_0 from `Matthew D. Hoffman, David M. Blei, Francis Bach: 400 "Online Learning for Latent Dirichlet Allocation NIPS'10" <https://www.di.ens.fr/~fbach/mdhnips2010.pdf>`_. 401 eval_every : int, optional 402 Log perplexity is estimated every that many updates. Setting this to one slows down training by ~2x. 403 iterations : int, optional 404 Maximum number of iterations through the corpus when inferring the topic distribution of a corpus. 405 gamma_threshold : float, optional 406 Minimum change in the value of the gamma parameters to continue iterating. 407 minimum_probability : float, optional 408 Topics with a probability lower than this threshold will be filtered out. 409 random_state : {np.random.RandomState, int}, optional 410 Either a randomState object or a seed to generate one. Useful for reproducibility. 411 ns_conf : dict of (str, object), optional 412 Key word parameters propagated to :func:`gensim.utils.getNS` to get a Pyro4 Nameserved. 413 Only used if `distributed` is set to True. 414 minimum_phi_value : float, optional 415 if `per_word_topics` is True, this represents a lower bound on the term probabilities. 416 per_word_topics : bool 417 If True, the model also computes a list of topics, sorted in descending order of most likely topics for 418 each word, along with their phi values multiplied by the feature length (i.e. word count). 419 callbacks : list of :class:`~gensim.models.callbacks.Callback` 420 Metric callbacks to log and visualize evaluation metrics of the model during training. 421 dtype : {numpy.float16, numpy.float32, numpy.float64}, optional 422 Data-type to use during calculations inside model. All inputs are also converted. 423 424 """ 425 self.dtype = np.finfo(dtype).dtype 426 427 # store user-supplied parameters 428 self.id2word = id2word 429 if corpus is None and self.id2word is None: 430 raise ValueError( 431 'at least one of corpus/id2word must be specified, to establish input space dimensionality' 432 ) 433 434 if self.id2word is None: 435 logger.warning("no word id mapping provided; initializing from corpus, assuming identity") 436 self.id2word = utils.dict_from_corpus(corpus) 437 self.num_terms = len(self.id2word) 438 elif len(self.id2word) > 0: 439 self.num_terms = 1 + max(self.id2word.keys()) 440 else: 441 self.num_terms = 0 442 443 if self.num_terms == 0: 444 raise ValueError("cannot compute LDA over an empty collection (no terms)") 445 446 self.distributed = bool(distributed) 447 self.num_topics = int(num_topics) 448 self.chunksize = chunksize 449 self.decay = decay 450 self.offset = offset 451 self.minimum_probability = minimum_probability 452 self.num_updates = 0 453 454 self.passes = passes 455 self.update_every = update_every 456 self.eval_every = eval_every 457 self.minimum_phi_value = minimum_phi_value 458 self.per_word_topics = per_word_topics 459 self.callbacks = callbacks 460 461 self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha') 462 463 assert self.alpha.shape == (self.num_topics,), \ 464 "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics) 465 466 if isinstance(eta, str): 467 if eta == 'asymmetric': 468 raise ValueError("The 'asymmetric' option cannot be used for eta") 469 470 self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta') 471 472 self.random_state = utils.get_random_state(random_state) 473 474 assert self.eta.shape == (self.num_terms,) or self.eta.shape == (self.num_topics, self.num_terms), ( 475 "Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" % 476 (str(self.eta.shape), self.num_terms, self.num_topics, self.num_terms)) 477 478 # VB constants 479 self.iterations = iterations 480 self.gamma_threshold = gamma_threshold 481 482 # set up distributed environment if necessary 483 if not distributed: 484 logger.info("using serial LDA version on this node") 485 self.dispatcher = None 486 self.numworkers = 1 487 else: 488 if self.optimize_alpha: 489 raise NotImplementedError("auto-optimizing alpha not implemented in distributed LDA") 490 # set up distributed version 491 try: 492 import Pyro4 493 if ns_conf is None: 494 ns_conf = {} 495 496 with utils.getNS(**ns_conf) as ns: 497 from gensim.models.lda_dispatcher import LDA_DISPATCHER_PREFIX 498 self.dispatcher = Pyro4.Proxy(ns.list(prefix=LDA_DISPATCHER_PREFIX)[LDA_DISPATCHER_PREFIX]) 499 logger.debug("looking for dispatcher at %s" % str(self.dispatcher._pyroUri)) 500 self.dispatcher.initialize( 501 id2word=self.id2word, num_topics=self.num_topics, chunksize=chunksize, 502 alpha=alpha, eta=eta, distributed=False 503 ) 504 self.numworkers = len(self.dispatcher.getworkers()) 505 logger.info("using distributed version with %i workers", self.numworkers) 506 except Exception as err: 507 logger.error("failed to initialize distributed LDA (%s)", err) 508 raise RuntimeError("failed to initialize distributed LDA (%s)" % err) 509 510 # Initialize the variational distribution q(beta|lambda) 511 self.state = LdaState(self.eta, (self.num_topics, self.num_terms), dtype=self.dtype) 512 self.state.sstats[...] = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms)) 513 self.expElogbeta = np.exp(dirichlet_expectation(self.state.sstats)) 514 515 # Check that we haven't accidentally fallen back to np.float64 516 assert self.eta.dtype == self.dtype 517 assert self.expElogbeta.dtype == self.dtype 518 519 # if a training corpus was provided, start estimating the model right away 520 if corpus is not None: 521 use_numpy = self.dispatcher is not None 522 start = time.time() 523 self.update(corpus, chunks_as_numpy=use_numpy) 524 self.add_lifecycle_event( 525 "created", 526 msg=f"trained {self} in {time.time() - start:.2f}s", 527 ) 528 529 def init_dir_prior(self, prior, name): 530 """Initialize priors for the Dirichlet distribution. 531 532 Parameters 533 ---------- 534 prior : {str, list of float, numpy.ndarray of float, float} 535 A-priori belief on word probability. If `name` == 'eta' then the prior can be: 536 537 * scalar for a symmetric prior over topic/word probability, 538 * vector of length num_words to denote an asymmetric user defined probability for each word, 539 * matrix of shape (num_topics, num_words) to assign a probability for each word-topic combination, 540 * the string 'auto' to learn the asymmetric prior from the data. 541 542 If `name` == 'alpha', then the prior can be: 543 544 * an 1D array of length equal to the number of expected topics, 545 * 'symmetric': Uses a fixed symmetric prior per topic, 546 * 'asymmetric': Uses a fixed normalized asymmetric prior of `1.0 / (topic_index + sqrt(num_topics))`, 547 * 'auto': Learns an asymmetric prior from the corpus. 548 name : {'alpha', 'eta'} 549 Whether the `prior` is parameterized by the alpha vector (1 parameter per topic) 550 or by the eta (1 parameter per unique term in the vocabulary). 551 552 """ 553 if prior is None: 554 prior = 'symmetric' 555 556 if name == 'alpha': 557 prior_shape = self.num_topics 558 elif name == 'eta': 559 prior_shape = self.num_terms 560 else: 561 raise ValueError("'name' must be 'alpha' or 'eta'") 562 563 is_auto = False 564 565 if isinstance(prior, str): 566 if prior == 'symmetric': 567 logger.info("using symmetric %s at %s", name, 1.0 / self.num_topics) 568 init_prior = np.fromiter( 569 (1.0 / self.num_topics for i in range(prior_shape)), 570 dtype=self.dtype, count=prior_shape, 571 ) 572 elif prior == 'asymmetric': 573 init_prior = np.fromiter( 574 (1.0 / (i + np.sqrt(prior_shape)) for i in range(prior_shape)), 575 dtype=self.dtype, count=prior_shape, 576 ) 577 init_prior /= init_prior.sum() 578 logger.info("using asymmetric %s %s", name, list(init_prior)) 579 elif prior == 'auto': 580 is_auto = True 581 init_prior = np.fromiter((1.0 / self.num_topics for i in range(prior_shape)), 582 dtype=self.dtype, count=prior_shape) 583 if name == 'alpha': 584 logger.info("using autotuned %s, starting with %s", name, list(init_prior)) 585 else: 586 raise ValueError("Unable to determine proper %s value given '%s'" % (name, prior)) 587 elif isinstance(prior, list): 588 init_prior = np.asarray(prior, dtype=self.dtype) 589 elif isinstance(prior, np.ndarray): 590 init_prior = prior.astype(self.dtype, copy=False) 591 elif isinstance(prior, (np.number, numbers.Real)): 592 init_prior = np.fromiter((prior for i in range(prior_shape)), dtype=self.dtype) 593 else: 594 raise ValueError("%s must be either a np array of scalars, list of scalars, or scalar" % name) 595 596 return init_prior, is_auto 597 598 def __str__(self): 599 """Get a string representation of the current object. 600 601 Returns 602 ------- 603 str 604 Human readable representation of the most important model parameters. 605 606 """ 607 return "LdaModel(num_terms=%s, num_topics=%s, decay=%s, chunksize=%s)" % ( 608 self.num_terms, self.num_topics, self.decay, self.chunksize 609 ) 610 611 def sync_state(self, current_Elogbeta=None): 612 """Propagate the states topic probabilities to the inner object's attribute. 613 614 Parameters 615 ---------- 616 current_Elogbeta: numpy.ndarray 617 Posterior probabilities for each topic, optional. 618 If omitted, it will get Elogbeta from state. 619 620 """ 621 if current_Elogbeta is None: 622 current_Elogbeta = self.state.get_Elogbeta() 623 self.expElogbeta = np.exp(current_Elogbeta) 624 assert self.expElogbeta.dtype == self.dtype 625 626 def clear(self): 627 """Clear the model's state to free some memory. Used in the distributed implementation.""" 628 self.state = None 629 self.Elogbeta = None 630 631 def inference(self, chunk, collect_sstats=False): 632 """Given a chunk of sparse document vectors, estimate gamma (parameters controlling the topic weights) 633 for each document in the chunk. 634 635 This function does not modify the model The whole input chunk of document is assumed to fit in RAM; 636 chunking of a large corpus must be done earlier in the pipeline. Avoids computing the `phi` variational 637 parameter directly using the optimization presented in 638 `Lee, Seung: Algorithms for non-negative matrix factorization" 639 <https://papers.nips.cc/paper/1861-algorithms-for-non-negative-matrix-factorization.pdf>`_. 640 641 Parameters 642 ---------- 643 chunk : list of list of (int, float) 644 The corpus chunk on which the inference step will be performed. 645 collect_sstats : bool, optional 646 If set to True, also collect (and return) sufficient statistics needed to update the model's topic-word 647 distributions. 648 649 Returns 650 ------- 651 (numpy.ndarray, {numpy.ndarray, None}) 652 The first element is always returned and it corresponds to the states gamma matrix. The second element is 653 only returned if `collect_sstats` == True and corresponds to the sufficient statistics for the M step. 654 655 """ 656 try: 657 len(chunk) 658 except TypeError: 659 # convert iterators/generators to plain list, so we have len() etc. 660 chunk = list(chunk) 661 if len(chunk) > 1: 662 logger.debug("performing inference on a chunk of %i documents", len(chunk)) 663 664 # Initialize the variational distribution q(theta|gamma) for the chunk 665 gamma = self.random_state.gamma(100., 1. / 100., (len(chunk), self.num_topics)).astype(self.dtype, copy=False) 666 Elogtheta = dirichlet_expectation(gamma) 667 expElogtheta = np.exp(Elogtheta) 668 669 assert Elogtheta.dtype == self.dtype 670 assert expElogtheta.dtype == self.dtype 671 672 if collect_sstats: 673 sstats = np.zeros_like(self.expElogbeta, dtype=self.dtype) 674 else: 675 sstats = None 676 converged = 0 677 678 # Now, for each document d update that document's gamma and phi 679 # Inference code copied from Hoffman's `onlineldavb.py` (esp. the 680 # Lee&Seung trick which speeds things up by an order of magnitude, compared 681 # to Blei's original LDA-C code, cool!). 682 integer_types = (int, np.integer,) 683 epsilon = np.finfo(self.dtype).eps 684 for d, doc in enumerate(chunk): 685 if len(doc) > 0 and not isinstance(doc[0][0], integer_types): 686 # make sure the term IDs are ints, otherwise np will get upset 687 ids = [int(idx) for idx, _ in doc] 688 else: 689 ids = [idx for idx, _ in doc] 690 cts = np.fromiter((cnt for _, cnt in doc), dtype=self.dtype, count=len(doc)) 691 gammad = gamma[d, :] 692 Elogthetad = Elogtheta[d, :] 693 expElogthetad = expElogtheta[d, :] 694 expElogbetad = self.expElogbeta[:, ids] 695 696 # The optimal phi_{dwk} is proportional to expElogthetad_k * expElogbetad_w. 697 # phinorm is the normalizer. 698 # TODO treat zeros explicitly, instead of adding epsilon? 699 phinorm = np.dot(expElogthetad, expElogbetad) + epsilon 700 701 # Iterate between gamma and phi until convergence 702 for _ in range(self.iterations): 703 lastgamma = gammad 704 # We represent phi implicitly to save memory and time. 705 # Substituting the value of the optimal phi back into 706 # the update for gamma gives this update. Cf. Lee&Seung 2001. 707 gammad = self.alpha + expElogthetad * np.dot(cts / phinorm, expElogbetad.T) 708 Elogthetad = dirichlet_expectation(gammad) 709 expElogthetad = np.exp(Elogthetad) 710 phinorm = np.dot(expElogthetad, expElogbetad) + epsilon 711 # If gamma hasn't changed much, we're done. 712 meanchange = mean_absolute_difference(gammad, lastgamma) 713 if meanchange < self.gamma_threshold: 714 converged += 1 715 break 716 gamma[d, :] = gammad 717 assert gammad.dtype == self.dtype 718 if collect_sstats: 719 # Contribution of document d to the expected sufficient 720 # statistics for the M step. 721 sstats[:, ids] += np.outer(expElogthetad.T, cts / phinorm) 722 723 if len(chunk) > 1: 724 logger.debug("%i/%i documents converged within %i iterations", converged, len(chunk), self.iterations) 725 726 if collect_sstats: 727 # This step finishes computing the sufficient statistics for the 728 # M step, so that 729 # sstats[k, w] = \sum_d n_{dw} * phi_{dwk} 730 # = \sum_d n_{dw} * exp{Elogtheta_{dk} + Elogbeta_{kw}} / phinorm_{dw}. 731 sstats *= self.expElogbeta 732 assert sstats.dtype == self.dtype 733 734 assert gamma.dtype == self.dtype 735 return gamma, sstats 736 737 def do_estep(self, chunk, state=None): 738 """Perform inference on a chunk of documents, and accumulate the collected sufficient statistics. 739 740 Parameters 741 ---------- 742 chunk : list of list of (int, float) 743 The corpus chunk on which the inference step will be performed. 744 state : :class:`~gensim.models.ldamodel.LdaState`, optional 745 The state to be updated with the newly accumulated sufficient statistics. If none, the models 746 `self.state` is updated. 747 748 Returns 749 ------- 750 numpy.ndarray 751 Gamma parameters controlling the topic weights, shape (`len(chunk)`, `self.num_topics`). 752 753 """ 754 if state is None: 755 state = self.state 756 gamma, sstats = self.inference(chunk, collect_sstats=True) 757 state.sstats += sstats 758 state.numdocs += gamma.shape[0] # avoids calling len(chunk) on a generator 759 assert gamma.dtype == self.dtype 760 return gamma 761 762 def update_alpha(self, gammat, rho): 763 """Update parameters for the Dirichlet prior on the per-document topic weights. 764 765 Parameters 766 ---------- 767 gammat : numpy.ndarray 768 Previous topic weight parameters. 769 rho : float 770 Learning rate. 771 772 Returns 773 ------- 774 numpy.ndarray 775 Sequence of alpha parameters. 776 777 """ 778 N = float(len(gammat)) 779 logphat = sum(dirichlet_expectation(gamma) for gamma in gammat) / N 780 assert logphat.dtype == self.dtype 781 782 self.alpha = update_dir_prior(self.alpha, N, logphat, rho) 783 logger.info("optimized alpha %s", list(self.alpha)) 784 785 assert self.alpha.dtype == self.dtype 786 return self.alpha 787 788 def update_eta(self, lambdat, rho): 789 """Update parameters for the Dirichlet prior on the per-topic word weights. 790 791 Parameters 792 ---------- 793 lambdat : numpy.ndarray 794 Previous lambda parameters. 795 rho : float 796 Learning rate. 797 798 Returns 799 ------- 800 numpy.ndarray 801 The updated eta parameters. 802 803 """ 804 N = float(lambdat.shape[0]) 805 logphat = (sum(dirichlet_expectation(lambda_) for lambda_ in lambdat) / N).reshape((self.num_terms,)) 806 assert logphat.dtype == self.dtype 807 808 self.eta = update_dir_prior(self.eta, N, logphat, rho) 809 810 assert self.eta.dtype == self.dtype 811 return self.eta 812 813 def log_perplexity(self, chunk, total_docs=None): 814 """Calculate and return per-word likelihood bound, using a chunk of documents as evaluation corpus. 815 816 Also output the calculated statistics, including the perplexity=2^(-bound), to log at INFO level. 817 818 Parameters 819 ---------- 820 chunk : list of list of (int, float) 821 The corpus chunk on which the inference step will be performed. 822 total_docs : int, optional 823 Number of docs used for evaluation of the perplexity. 824 825 Returns 826 ------- 827 numpy.ndarray 828 The variational bound score calculated for each word. 829 830 """ 831 if total_docs is None: 832 total_docs = len(chunk) 833 corpus_words = sum(cnt for document in chunk for _, cnt in document) 834 subsample_ratio = 1.0 * total_docs / len(chunk) 835 perwordbound = self.bound(chunk, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words) 836 logger.info( 837 "%.3f per-word bound, %.1f perplexity estimate based on a held-out corpus of %i documents with %i words", 838 perwordbound, np.exp2(-perwordbound), len(chunk), corpus_words 839 ) 840 return perwordbound 841 842 def update(self, corpus, chunksize=None, decay=None, offset=None, 843 passes=None, update_every=None, eval_every=None, iterations=None, 844 gamma_threshold=None, chunks_as_numpy=False): 845 """Train the model with new documents, by EM-iterating over the corpus until the topics converge, or until 846 the maximum number of allowed iterations is reached. `corpus` must be an iterable. 847 848 In distributed mode, the E step is distributed over a cluster of machines. 849 850 Notes 851 ----- 852 This update also supports updating an already trained model with new documents; the two models are then merged 853 in proportion to the number of old vs. new documents. This feature is still experimental for non-stationary 854 input streams. For stationary input (no topic drift in new documents), on the other hand, this equals the 855 online update of `Matthew D. Hoffman, David M. Blei, Francis Bach: 856 "Online Learning for Latent Dirichlet Allocation NIPS'10" <https://www.di.ens.fr/~fbach/mdhnips2010.pdf>`_. 857 and is guaranteed to converge for any `decay` in (0.5, 1.0). Additionally, for smaller corpus sizes, an 858 increasing `offset` may be beneficial (see Table 1 in the same paper). 859 860 Parameters 861 ---------- 862 corpus : iterable of list of (int, float), optional 863 Stream of document vectors or sparse matrix of shape (`num_documents`, `num_terms`) used to update the 864 model. 865 chunksize : int, optional 866 Number of documents to be used in each training chunk. 867 decay : float, optional 868 A number between (0.5, 1] to weight what percentage of the previous lambda value is forgotten 869 when each new document is examined. Corresponds to Kappa from 870 `Matthew D. Hoffman, David M. Blei, Francis Bach: 871 "Online Learning for Latent Dirichlet Allocation NIPS'10" <https://www.di.ens.fr/~fbach/mdhnips2010.pdf>`_. 872 offset : float, optional 873 Hyper-parameter that controls how much we will slow down the first steps the first few iterations. 874 Corresponds to Tau_0 from `Matthew D. Hoffman, David M. Blei, Francis Bach: 875 "Online Learning for Latent Dirichlet Allocation NIPS'10" <https://www.di.ens.fr/~fbach/mdhnips2010.pdf>`_. 876 passes : int, optional 877 Number of passes through the corpus during training. 878 update_every : int, optional 879 Number of documents to be iterated through for each update. 880 Set to 0 for batch learning, > 1 for online iterative learning. 881 eval_every : int, optional 882 Log perplexity is estimated every that many updates. Setting this to one slows down training by ~2x. 883 iterations : int, optional 884 Maximum number of iterations through the corpus when inferring the topic distribution of a corpus. 885 gamma_threshold : float, optional 886 Minimum change in the value of the gamma parameters to continue iterating. 887 chunks_as_numpy : bool, optional 888 Whether each chunk passed to the inference step should be a numpy.ndarray or not. Numpy can in some settings 889 turn the term IDs into floats, these will be converted back into integers in inference, which incurs a 890 performance hit. For distributed computing it may be desirable to keep the chunks as `numpy.ndarray`. 891 892 """ 893 # use parameters given in constructor, unless user explicitly overrode them 894 if decay is None: 895 decay = self.decay 896 if offset is None: 897 offset = self.offset 898 if passes is None: 899 passes = self.passes 900 if update_every is None: 901 update_every = self.update_every 902 if eval_every is None: 903 eval_every = self.eval_every 904 if iterations is None: 905 iterations = self.iterations 906 if gamma_threshold is None: 907 gamma_threshold = self.gamma_threshold 908 909 try: 910 lencorpus = len(corpus) 911 except Exception: 912 logger.warning("input corpus stream has no len(); counting documents") 913 lencorpus = sum(1 for _ in corpus) 914 if lencorpus == 0: 915 logger.warning("LdaModel.update() called with an empty corpus") 916 return 917 918 if chunksize is None: 919 chunksize = min(lencorpus, self.chunksize) 920 921 self.state.numdocs += lencorpus 922 923 if update_every: 924 updatetype = "online" 925 if passes == 1: 926 updatetype += " (single-pass)" 927 else: 928 updatetype += " (multi-pass)" 929 updateafter = min(lencorpus, update_every * self.numworkers * chunksize) 930 else: 931 updatetype = "batch" 932 updateafter = lencorpus 933 evalafter = min(lencorpus, (eval_every or 0) * self.numworkers * chunksize) 934 935 updates_per_pass = max(1, lencorpus / updateafter) 936 logger.info( 937 "running %s LDA training, %s topics, %i passes over " 938 "the supplied corpus of %i documents, updating model once " 939 "every %i documents, evaluating perplexity every %i documents, " 940 "iterating %ix with a convergence threshold of %f", 941 updatetype, self.num_topics, passes, lencorpus, 942 updateafter, evalafter, iterations, 943 gamma_threshold 944 ) 945 946 if updates_per_pass * passes < 10: 947 logger.warning( 948 "too few updates, training might not converge; " 949 "consider increasing the number of passes or iterations to improve accuracy" 950 ) 951 952 # rho is the "speed" of updating; TODO try other fncs 953 # pass_ + num_updates handles increasing the starting t for each pass, 954 # while allowing it to "reset" on the first pass of each update 955 def rho(): 956 return pow(offset + pass_ + (self.num_updates / chunksize), -decay) 957 958 if self.callbacks: 959 # pass the list of input callbacks to Callback class 960 callback = Callback(self.callbacks) 961 callback.set_model(self) 962 # initialize metrics list to store metric values after every epoch 963 self.metrics = defaultdict(list) 964 965 for pass_ in range(passes): 966 if self.dispatcher: 967 logger.info('initializing %s workers', self.numworkers) 968 self.dispatcher.reset(self.state) 969 else: 970 other = LdaState(self.eta, self.state.sstats.shape, self.dtype) 971 dirty = False 972 973 reallen = 0 974 chunks = utils.grouper(corpus, chunksize, as_numpy=chunks_as_numpy, dtype=self.dtype) 975 for chunk_no, chunk in enumerate(chunks): 976 reallen += len(chunk) # keep track of how many documents we've processed so far 977 978 if eval_every and ((reallen == lencorpus) or ((chunk_no + 1) % (eval_every * self.numworkers) == 0)): 979 self.log_perplexity(chunk, total_docs=lencorpus) 980 981 if self.dispatcher: 982 # add the chunk to dispatcher's job queue, so workers can munch on it 983 logger.info( 984 "PROGRESS: pass %i, dispatching documents up to #%i/%i", 985 pass_, chunk_no * chunksize + len(chunk), lencorpus 986 ) 987 # this will eventually block until some jobs finish, because the queue has a small finite length 988 self.dispatcher.putjob(chunk) 989 else: 990 logger.info( 991 "PROGRESS: pass %i, at document #%i/%i", 992 pass_, chunk_no * chunksize + len(chunk), lencorpus 993 ) 994 gammat = self.do_estep(chunk, other) 995 996 if self.optimize_alpha: 997 self.update_alpha(gammat, rho()) 998 999 dirty = True 1000 del chunk 1001 1002 # perform an M step. determine when based on update_every, don't do this after every chunk 1003 if update_every and (chunk_no + 1) % (update_every * self.numworkers) == 0: 1004 if self.dispatcher: 1005 # distributed mode: wait for all workers to finish 1006 logger.info("reached the end of input; now waiting for all remaining jobs to finish") 1007 other = self.dispatcher.getstate() 1008 self.do_mstep(rho(), other, pass_ > 0) 1009 del other # frees up memory 1010 1011 if self.dispatcher: 1012 logger.info('initializing workers') 1013 self.dispatcher.reset(self.state) 1014 else: 1015 other = LdaState(self.eta, self.state.sstats.shape, self.dtype) 1016 dirty = False 1017 1018 if reallen != lencorpus: 1019 raise RuntimeError("input corpus size changed during training (don't use generators as input)") 1020 1021 # append current epoch's metric values 1022 if self.callbacks: 1023 current_metrics = callback.on_epoch_end(pass_) 1024 for metric, value in current_metrics.items(): 1025 self.metrics[metric].append(value) 1026 1027 if dirty: 1028 # finish any remaining updates 1029 if self.dispatcher: 1030 # distributed mode: wait for all workers to finish 1031 logger.info("reached the end of input; now waiting for all remaining jobs to finish") 1032 other = self.dispatcher.getstate() 1033 self.do_mstep(rho(), other, pass_ > 0) 1034 del other 1035 dirty = False 1036 1037 def do_mstep(self, rho, other, extra_pass=False): 1038 """Maximization step: use linear interpolation between the existing topics and 1039 collected sufficient statistics in `other` to update the topics. 1040 1041 Parameters 1042 ---------- 1043 rho : float 1044 Learning rate. 1045 other : :class:`~gensim.models.ldamodel.LdaModel` 1046 The model whose sufficient statistics will be used to update the topics. 1047 extra_pass : bool, optional 1048 Whether this step required an additional pass over the corpus. 1049 1050 """ 1051 logger.debug("updating topics") 1052 # update self with the new blend; also keep track of how much did 1053 # the topics change through this update, to assess convergence 1054 previous_Elogbeta = self.state.get_Elogbeta() 1055 self.state.blend(rho, other) 1056 1057 current_Elogbeta = self.state.get_Elogbeta() 1058 self.sync_state(current_Elogbeta) 1059 1060 # print out some debug info at the end of each EM iteration 1061 self.print_topics(5) 1062 diff = mean_absolute_difference(previous_Elogbeta.ravel(), current_Elogbeta.ravel()) 1063 logger.info("topic diff=%f, rho=%f", diff, rho) 1064 1065 if self.optimize_eta: 1066 self.update_eta(self.state.get_lambda(), rho) 1067 1068 if not extra_pass: 1069 # only update if this isn't an additional pass 1070 self.num_updates += other.numdocs 1071 1072 def bound(self, corpus, gamma=None, subsample_ratio=1.0): 1073 """Estimate the variational bound of documents from the corpus as E_q[log p(corpus)] - E_q[log q(corpus)]. 1074 1075 Parameters 1076 ---------- 1077 corpus : iterable of list of (int, float), optional 1078 Stream of document vectors or sparse matrix of shape (`num_documents`, `num_terms`) used to estimate the 1079 variational bounds. 1080 gamma : numpy.ndarray, optional 1081 Topic weight variational parameters for each document. If not supplied, it will be inferred from the model. 1082 subsample_ratio : float, optional 1083 Percentage of the whole corpus represented by the passed `corpus` argument (in case this was a sample). 1084 Set to 1.0 if the whole corpus was passed.This is used as a multiplicative factor to scale the likelihood 1085 appropriately. 1086 1087 Returns 1088 ------- 1089 numpy.ndarray 1090 The variational bound score calculated for each document. 1091 1092 """ 1093 score = 0.0 1094 _lambda = self.state.get_lambda() 1095 Elogbeta = dirichlet_expectation(_lambda) 1096 1097 for d, doc in enumerate(corpus): # stream the input doc-by-doc, in case it's too large to fit in RAM 1098 if d % self.chunksize == 0: 1099 logger.debug("bound: at document #%i", d) 1100 if gamma is None: 1101 gammad, _ = self.inference([doc]) 1102 else: 1103 gammad = gamma[d] 1104 Elogthetad = dirichlet_expectation(gammad) 1105 1106 assert gammad.dtype == self.dtype 1107 assert Elogthetad.dtype == self.dtype 1108 1109 # E[log p(doc | theta, beta)] 1110 score += sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc) 1111 1112 # E[log p(theta | alpha) - log q(theta | gamma)]; assumes alpha is a vector 1113 score += np.sum((self.alpha - gammad) * Elogthetad) 1114 score += np.sum(gammaln(gammad) - gammaln(self.alpha)) 1115 score += gammaln(np.sum(self.alpha)) - gammaln(np.sum(gammad)) 1116 1117 # Compensate likelihood for when `corpus` above is only a sample of the whole corpus. This ensures 1118 # that the likelihood is always roughly on the same scale. 1119 score *= subsample_ratio 1120 1121 # E[log p(beta | eta) - log q (beta | lambda)]; assumes eta is a scalar 1122 score += np.sum((self.eta - _lambda) * Elogbeta) 1123 score += np.sum(gammaln(_lambda) - gammaln(self.eta)) 1124 1125 if np.ndim(self.eta) == 0: 1126 sum_eta = self.eta * self.num_terms 1127 else: 1128 sum_eta = np.sum(self.eta) 1129 1130 score += np.sum(gammaln(sum_eta) - gammaln(np.sum(_lambda, 1))) 1131 1132 return score 1133 1134 def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): 1135 """Get a representation for selected topics. 1136 1137 Parameters 1138 ---------- 1139 num_topics : int, optional 1140 Number of topics to be returned. Unlike LSA, there is no natural ordering between the topics in LDA. 1141 The returned topics subset of all topics is therefore arbitrary and may change between two LDA 1142 training runs. 1143 num_words : int, optional 1144 Number of words to be presented for each topic. These will be the most relevant words (assigned the highest 1145 probability for each topic). 1146 log : bool, optional 1147 Whether the output is also logged, besides being returned. 1148 formatted : bool, optional 1149 Whether the topic representations should be formatted as strings. If False, they are returned as 1150 2 tuples of (word, probability). 1151 1152 Returns 1153 ------- 1154 list of {str, tuple of (str, float)} 1155 a list of topics, each represented either as a string (when `formatted` == True) or word-probability 1156 pairs. 1157 1158 """ 1159 if num_topics < 0 or num_topics >= self.num_topics: 1160 num_topics = self.num_topics 1161 chosen_topics = range(num_topics) 1162 else: 1163 num_topics = min(num_topics, self.num_topics) 1164 1165 # add a little random jitter, to randomize results around the same alpha 1166 sort_alpha = self.alpha + 0.0001 * self.random_state.rand(len(self.alpha)) 1167 # random_state.rand returns float64, but converting back to dtype won't speed up anything 1168 1169 sorted_topics = list(matutils.argsort(sort_alpha)) 1170 chosen_topics = sorted_topics[:num_topics // 2] + sorted_topics[-num_topics // 2:] 1171 1172 shown = [] 1173 1174 topic = self.state.get_lambda() 1175 for i in chosen_topics: 1176 topic_ = topic[i] 1177 topic_ = topic_ / topic_.sum() # normalize to probability distribution 1178 bestn = matutils.argsort(topic_, num_words, reverse=True) 1179 topic_ = [(self.id2word[id], topic_[id]) for id in bestn] 1180 if formatted: 1181 topic_ = ' + '.join('%.3f*"%s"' % (v, k) for k, v in topic_) 1182 1183 shown.append((i, topic_)) 1184 if log: 1185 logger.info("topic #%i (%.3f): %s", i, self.alpha[i], topic_) 1186 1187 return shown 1188 1189 def show_topic(self, topicid, topn=10): 1190 """Get the representation for a single topic. Words here are the actual strings, in constrast to 1191 :meth:`~gensim.models.ldamodel.LdaModel.get_topic_terms` that represents words by their vocabulary ID. 1192 1193 Parameters 1194 ---------- 1195 topicid : int 1196 The ID of the topic to be returned 1197 topn : int, optional 1198 Number of the most significant words that are associated with the topic. 1199 1200 Returns 1201 ------- 1202 list of (str, float) 1203 Word - probability pairs for the most relevant words generated by the topic. 1204 1205 """ 1206 return [(self.id2word[id], value) for id, value in self.get_topic_terms(topicid, topn)] 1207 1208 def get_topics(self): 1209 """Get the term-topic matrix learned during inference. 1210 1211 Returns 1212 ------- 1213 numpy.ndarray 1214 The probability for each word in each topic, shape (`num_topics`, `vocabulary_size`). 1215 1216 """ 1217 topics = self.state.get_lambda() 1218 return topics / topics.sum(axis=1)[:, None] 1219 1220 def get_topic_terms(self, topicid, topn=10): 1221 """Get the representation for a single topic. Words the integer IDs, in constrast to 1222 :meth:`~gensim.models.ldamodel.LdaModel.show_topic` that represents words by the actual strings. 1223 1224 Parameters 1225 ---------- 1226 topicid : int 1227 The ID of the topic to be returned 1228 topn : int, optional 1229 Number of the most significant words that are associated with the topic. 1230 1231 Returns 1232 ------- 1233 list of (int, float) 1234 Word ID - probability pairs for the most relevant words generated by the topic. 1235 1236 """ 1237 topic = self.get_topics()[topicid] 1238 topic = topic / topic.sum() # normalize to probability distribution 1239 bestn = matutils.argsort(topic, topn, reverse=True) 1240 return [(idx, topic[idx]) for idx in bestn] 1241 1242 def top_topics(self, corpus=None, texts=None, dictionary=None, window_size=None, 1243 coherence='u_mass', topn=20, processes=-1): 1244 """Get the topics with the highest coherence score the coherence for each topic. 1245 1246 Parameters 1247 ---------- 1248 corpus : iterable of list of (int, float), optional 1249 Corpus in BoW format. 1250 texts : list of list of str, optional 1251 Tokenized texts, needed for coherence models that use sliding window based (i.e. coherence=`c_something`) 1252 probability estimator . 1253 dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional 1254 Gensim dictionary mapping of id word to create corpus. 1255 If `model.id2word` is present, this is not needed. If both are provided, passed `dictionary` will be used. 1256 window_size : int, optional 1257 Is the size of the window to be used for coherence measures using boolean sliding window as their 1258 probability estimator. For 'u_mass' this doesn't matter. 1259 If None - the default window sizes are used which are: 'c_v' - 110, 'c_uci' - 10, 'c_npmi' - 10. 1260 coherence : {'u_mass', 'c_v', 'c_uci', 'c_npmi'}, optional 1261 Coherence measure to be used. 1262 Fastest method - 'u_mass', 'c_uci' also known as `c_pmi`. 1263 For 'u_mass' corpus should be provided, if texts is provided, it will be converted to corpus 1264 using the dictionary. For 'c_v', 'c_uci' and 'c_npmi' `texts` should be provided (`corpus` isn't needed) 1265 topn : int, optional 1266 Integer corresponding to the number of top words to be extracted from each topic. 1267 processes : int, optional 1268 Number of processes to use for probability estimation phase, any value less than 1 will be interpreted as 1269 num_cpus - 1. 1270 1271 Returns 1272 ------- 1273 list of (list of (int, str), float) 1274 Each element in the list is a pair of a topic representation and its coherence score. Topic representations 1275 are distributions of words, represented as a list of pairs of word IDs and their probabilities. 1276 1277 """ 1278 cm = CoherenceModel( 1279 model=self, corpus=corpus, texts=texts, dictionary=dictionary, 1280 window_size=window_size, coherence=coherence, topn=topn, 1281 processes=processes 1282 ) 1283 coherence_scores = cm.get_coherence_per_topic() 1284 1285 str_topics = [] 1286 for topic in self.get_topics(): # topic = array of vocab_size floats, one per term 1287 bestn = matutils.argsort(topic, topn=topn, reverse=True) # top terms for topic 1288 beststr = [(topic[_id], self.id2word[_id]) for _id in bestn] # membership, token 1289 str_topics.append(beststr) # list of topn (float membership, token) tuples 1290 1291 scored_topics = zip(str_topics, coherence_scores) 1292 return sorted(scored_topics, key=lambda tup: tup[1], reverse=True) 1293 1294 def get_document_topics(self, bow, minimum_probability=None, minimum_phi_value=None, 1295 per_word_topics=False): 1296 """Get the topic distribution for the given document. 1297 1298 Parameters 1299 ---------- 1300 bow : corpus : list of (int, float) 1301 The document in BOW format. 1302 minimum_probability : float 1303 Topics with an assigned probability lower than this threshold will be discarded. 1304 minimum_phi_value : float 1305 If `per_word_topics` is True, this represents a lower bound on the term probabilities that are included. 1306 If set to None, a value of 1e-8 is used to prevent 0s. 1307 per_word_topics : bool 1308 If True, this function will also return two extra lists as explained in the "Returns" section. 1309 1310 Returns 1311 ------- 1312 list of (int, float) 1313 Topic distribution for the whole document. Each element in the list is a pair of a topic's id, and 1314 the probability that was assigned to it. 1315 list of (int, list of (int, float), optional 1316 Most probable topics per word. Each element in the list is a pair of a word's id, and a list of 1317 topics sorted by their relevance to this word. Only returned if `per_word_topics` was set to True. 1318 list of (int, list of float), optional 1319 Phi relevance values, multiplied by the feature length, for each word-topic combination. 1320 Each element in the list is a pair of a word's id and a list of the phi values between this word and 1321 each topic. Only returned if `per_word_topics` was set to True. 1322 1323 """ 1324 if minimum_probability is None: 1325 minimum_probability = self.minimum_probability 1326 minimum_probability = max(minimum_probability, 1e-8) # never allow zero values in sparse output 1327 1328 if minimum_phi_value is None: 1329 minimum_phi_value = self.minimum_probability 1330 minimum_phi_value = max(minimum_phi_value, 1e-8) # never allow zero values in sparse output 1331 1332 # if the input vector is a corpus, return a transformed corpus 1333 is_corpus, corpus = utils.is_corpus(bow) 1334 if is_corpus: 1335 kwargs = dict( 1336 per_word_topics=per_word_topics, 1337 minimum_probability=minimum_probability, 1338 minimum_phi_value=minimum_phi_value 1339 ) 1340 return self._apply(corpus, **kwargs) 1341 1342 gamma, phis = self.inference([bow], collect_sstats=per_word_topics) 1343 topic_dist = gamma[0] / sum(gamma[0]) # normalize distribution 1344 1345 document_topics = [ 1346 (topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist) 1347 if topicvalue >= minimum_probability 1348 ] 1349 1350 if not per_word_topics: 1351 return document_topics 1352 1353 word_topic = [] # contains word and corresponding topic 1354 word_phi = [] # contains word and phi values 1355 for word_type, weight in bow: 1356 phi_values = [] # contains (phi_value, topic) pairing to later be sorted 1357 phi_topic = [] # contains topic and corresponding phi value to be returned 'raw' to user 1358 for topic_id in range(0, self.num_topics): 1359 if phis[topic_id][word_type] >= minimum_phi_value: 1360 # appends phi values for each topic for that word 1361 # these phi values are scaled by feature length 1362 phi_values.append((phis[topic_id][word_type], topic_id)) 1363 phi_topic.append((topic_id, phis[topic_id][word_type])) 1364 1365 # list with ({word_id => [(topic_0, phi_value), (topic_1, phi_value) ...]). 1366 word_phi.append((word_type, phi_topic)) 1367 # sorts the topics based on most likely topic 1368 # returns a list like ({word_id => [topic_id_most_probable, topic_id_second_most_probable, ...]). 1369 sorted_phi_values = sorted(phi_values, reverse=True) 1370 topics_sorted = [x[1] for x in sorted_phi_values] 1371 word_topic.append((word_type, topics_sorted)) 1372 1373 return document_topics, word_topic, word_phi # returns 2-tuple 1374 1375 def get_term_topics(self, word_id, minimum_probability=None): 1376 """Get the most relevant topics to the given word. 1377 1378 Parameters 1379 ---------- 1380 word_id : int 1381 The word for which the topic distribution will be computed. 1382 minimum_probability : float, optional 1383 Topics with an assigned probability below this threshold will be discarded. 1384 1385 Returns 1386 ------- 1387 list of (int, float) 1388 The relevant topics represented as pairs of their ID and their assigned probability, sorted 1389 by relevance to the given word. 1390 1391 """ 1392 if minimum_probability is None: 1393 minimum_probability = self.minimum_probability 1394 minimum_probability = max(minimum_probability, 1e-8) # never allow zero values in sparse output 1395 1396 # if user enters word instead of id in vocab, change to get id 1397 if isinstance(word_id, str): 1398 word_id = self.id2word.doc2bow([word_id])[0][0] 1399 1400 values = [] 1401 for topic_id in range(0, self.num_topics): 1402 if self.expElogbeta[topic_id][word_id] >= minimum_probability: 1403 values.append((topic_id, self.expElogbeta[topic_id][word_id])) 1404 1405 return values 1406 1407 def diff(self, other, distance="kullback_leibler", num_words=100, 1408 n_ann_terms=10, diagonal=False, annotation=True, normed=True): 1409 """Calculate the difference in topic distributions between two models: `self` and `other`. 1410 1411 Parameters 1412 ---------- 1413 other : :class:`~gensim.models.ldamodel.LdaModel` 1414 The model which will be compared against the current object. 1415 distance : {'kullback_leibler', 'hellinger', 'jaccard', 'jensen_shannon'} 1416 The distance metric to calculate the difference with. 1417 num_words : int, optional 1418 The number of most relevant words used if `distance == 'jaccard'`. Also used for annotating topics. 1419 n_ann_terms : int, optional 1420 Max number of words in intersection/symmetric difference between topics. Used for annotation. 1421 diagonal : bool, optional 1422 Whether we need the difference between identical topics (the diagonal of the difference matrix). 1423 annotation : bool, optional 1424 Whether the intersection or difference of words between two topics should be returned. 1425 normed : bool, optional 1426 Whether the matrix should be normalized or not. 1427 1428 Returns 1429 ------- 1430 numpy.ndarray 1431 A difference matrix. Each element corresponds to the difference between the two topics, 1432 shape (`self.num_topics`, `other.num_topics`) 1433 numpy.ndarray, optional 1434 Annotation matrix where for each pair we include the word from the intersection of the two topics, 1435 and the word from the symmetric difference of the two topics. Only included if `annotation == True`. 1436 Shape (`self.num_topics`, `other_model.num_topics`, 2). 1437 1438 Examples 1439 -------- 1440 Get the differences between each pair of topics inferred by two models 1441 1442 .. sourcecode:: pycon 1443 1444 >>> from gensim.models.ldamulticore import LdaMulticore 1445 >>> from gensim.test.utils import datapath 1446 >>> 1447 >>> m1 = LdaMulticore.load(datapath("lda_3_0_1_model")) 1448 >>> m2 = LdaMulticore.load(datapath("ldamodel_python_3_5")) 1449 >>> mdiff, annotation = m1.diff(m2) 1450 >>> topic_diff = mdiff # get matrix with difference for each topic pair from `m1` and `m2` 1451 1452 """ 1453 distances = { 1454 "kullback_leibler": kullback_leibler, 1455 "hellinger": hellinger, 1456 "jaccard": jaccard_distance, 1457 "jensen_shannon": jensen_shannon 1458 } 1459 1460 if distance not in distances: 1461 valid_keys = ", ".join("`{}`".format(x) for x in distances.keys()) 1462 raise ValueError("Incorrect distance, valid only {}".format(valid_keys)) 1463 1464 if not isinstance(other, self.__class__): 1465 raise ValueError("The parameter `other` must be of type `{}`".format(self.__name__)) 1466 1467 distance_func = distances[distance] 1468 d1, d2 = self.get_topics(), other.get_topics() 1469 t1_size, t2_size = d1.shape[0], d2.shape[0] 1470 annotation_terms = None 1471 1472 fst_topics = [{w for (w, _) in self.show_topic(topic, topn=num_words)} for topic in range(t1_size)] 1473 snd_topics = [{w for (w, _) in other.show_topic(topic, topn=num_words)} for topic in range(t2_size)] 1474 1475 if distance == "jaccard": 1476 d1, d2 = fst_topics, snd_topics 1477 1478 if diagonal: 1479 assert t1_size == t2_size, \ 1480 "Both input models should have same no. of topics, " \ 1481 "as the diagonal will only be valid in a square matrix" 1482 # initialize z and annotation array 1483 z = np.zeros(t1_size) 1484 if annotation: 1485 annotation_terms = np.zeros(t1_size, dtype=list) 1486 else: 1487 # initialize z and annotation matrix 1488 z = np.zeros((t1_size, t2_size)) 1489 if annotation: 1490 annotation_terms = np.zeros((t1_size, t2_size), dtype=list) 1491 1492 # iterate over each cell in the initialized z and annotation 1493 for topic in np.ndindex(z.shape): 1494 topic1 = topic[0] 1495 if diagonal: 1496 topic2 = topic1 1497 else: 1498 topic2 = topic[1] 1499 1500 z[topic] = distance_func(d1[topic1], d2[topic2]) 1501 if annotation: 1502 pos_tokens = fst_topics[topic1] & snd_topics[topic2] 1503 neg_tokens = fst_topics[topic1].symmetric_difference(snd_topics[topic2]) 1504 1505 pos_tokens = list(pos_tokens)[:min(len(pos_tokens), n_ann_terms)] 1506 neg_tokens = list(neg_tokens)[:min(len(neg_tokens), n_ann_terms)] 1507 1508 annotation_terms[topic] = [pos_tokens, neg_tokens] 1509 1510 if normed: 1511 if np.abs(np.max(z)) > 1e-8: 1512 z /= np.max(z) 1513 1514 return z, annotation_terms 1515 1516 def __getitem__(self, bow, eps=None): 1517 """Get the topic distribution for the given document. 1518 1519 Wraps :meth:`~gensim.models.ldamodel.LdaModel.get_document_topics` to support an operator style call. 1520 Uses the model's current state (set using constructor arguments) to fill in the additional arguments of the 1521 wrapper method. 1522 1523 Parameters 1524 --------- 1525 bow : list of (int, float) 1526 The document in BOW format. 1527 eps : float, optional 1528 Topics with an assigned probability lower than this threshold will be discarded. 1529 1530 Returns 1531 ------- 1532 list of (int, float) 1533 Topic distribution for the given document. Each topic is represented as a pair of its ID and the probability 1534 assigned to it. 1535 1536 """ 1537 return self.get_document_topics(bow, eps, self.minimum_phi_value, self.per_word_topics) 1538 1539 def save(self, fname, ignore=('state', 'dispatcher'), separately=None, *args, **kwargs): 1540 """Save the model to a file. 1541 1542 Large internal arrays may be stored into separate files, with `fname` as prefix. 1543 1544 Notes 1545 ----- 1546 If you intend to use models across Python 2/3 versions there are a few things to 1547 keep in mind: 1548 1549 1. The pickled Python dictionaries will not work across Python versions 1550 2. The `save` method does not automatically save all numpy arrays separately, only 1551 those ones that exceed `sep_limit` set in :meth:`~gensim.utils.SaveLoad.save`. The main 1552 concern here is the `alpha` array if for instance using `alpha='auto'`. 1553 1554 Please refer to the `wiki recipes section 1555 <https://github.com/RaRe-Technologies/gensim/wiki/ 1556 Recipes-&-FAQ#q9-how-do-i-load-a-model-in-python-3-that-was-trained-and-saved-using-python-2>`_ 1557 for an example on how to work around these issues. 1558 1559 See Also 1560 -------- 1561 :meth:`~gensim.models.ldamodel.LdaModel.load` 1562 Load model. 1563 1564 Parameters 1565 ---------- 1566 fname : str 1567 Path to the system file where the model will be persisted. 1568 ignore : tuple of str, optional 1569 The named attributes in the tuple will be left out of the pickled model. The reason why 1570 the internal `state` is ignored by default is that it uses its own serialisation rather than the one 1571 provided by this method. 1572 separately : {list of str, None}, optional 1573 If None - automatically detect large numpy/scipy.sparse arrays in the object being stored, and store 1574 them into separate files. This avoids pickle memory errors and allows `mmap`'ing large arrays 1575 back on load efficiently. If list of str - this attributes will be stored in separate files, 1576 the automatic check is not performed in this case. 1577 *args 1578 Positional arguments propagated to :meth:`~gensim.utils.SaveLoad.save`. 1579 **kwargs 1580 Key word arguments propagated to :meth:`~gensim.utils.SaveLoad.save`. 1581 1582 """ 1583 if self.state is not None: 1584 self.state.save(utils.smart_extension(fname, '.state'), *args, **kwargs) 1585 # Save the dictionary separately if not in 'ignore'. 1586 if 'id2word' not in ignore: 1587 utils.pickle(self.id2word, utils.smart_extension(fname, '.id2word')) 1588 1589 # make sure 'state', 'id2word' and 'dispatcher' are ignored from the pickled object, even if 1590 # someone sets the ignore list themselves 1591 if ignore is not None and ignore: 1592 if isinstance(ignore, str): 1593 ignore = [ignore] 1594 ignore = [e for e in ignore if e] # make sure None and '' are not in the list 1595 ignore = list({'state', 'dispatcher', 'id2word'} | set(ignore)) 1596 else: 1597 ignore = ['state', 'dispatcher', 'id2word'] 1598 1599 # make sure 'expElogbeta' and 'sstats' are ignored from the pickled object, even if 1600 # someone sets the separately list themselves. 1601 separately_explicit = ['expElogbeta', 'sstats'] 1602 # Also add 'alpha' and 'eta' to separately list if they are set 'auto' or some 1603 # array manually. 1604 if (isinstance(self.alpha, str) and self.alpha == 'auto') or \ 1605 (isinstance(self.alpha, np.ndarray) and len(self.alpha.shape) != 1): 1606 separately_explicit.append('alpha') 1607 if (isinstance(self.eta, str) and self.eta == 'auto') or \ 1608 (isinstance(self.eta, np.ndarray) and len(self.eta.shape) != 1): 1609 separately_explicit.append('eta') 1610 # Merge separately_explicit with separately. 1611 if separately: 1612 if isinstance(separately, str): 1613 separately = [separately] 1614 separately = [e for e in separately if e] # make sure None and '' are not in the list 1615 separately = list(set(separately_explicit) | set(separately)) 1616 else: 1617 separately = separately_explicit 1618 super(LdaModel, self).save(fname, ignore=ignore, separately=separately, *args, **kwargs) 1619 1620 @classmethod 1621 def load(cls, fname, *args, **kwargs): 1622 """Load a previously saved :class:`gensim.models.ldamodel.LdaModel` from file. 1623 1624 See Also 1625 -------- 1626 :meth:`~gensim.models.ldamodel.LdaModel.save` 1627 Save model. 1628 1629 Parameters 1630 ---------- 1631 fname : str 1632 Path to the file where the model is stored. 1633 *args 1634 Positional arguments propagated to :meth:`~gensim.utils.SaveLoad.load`. 1635 **kwargs 1636 Key word arguments propagated to :meth:`~gensim.utils.SaveLoad.load`. 1637 1638 Examples 1639 -------- 1640 Large arrays can be memmap'ed back as read-only (shared memory) by setting `mmap='r'`: 1641 1642 .. sourcecode:: pycon 1643 1644 >>> from gensim.test.utils import datapath 1645 >>> 1646 >>> fname = datapath("lda_3_0_1_model") 1647 >>> lda = LdaModel.load(fname, mmap='r') 1648 1649 """ 1650 kwargs['mmap'] = kwargs.get('mmap', None) 1651 result = super(LdaModel, cls).load(fname, *args, **kwargs) 1652 1653 # check if `random_state` attribute has been set after main pickle load 1654 # if set -> the model to be loaded was saved using a >= 0.13.2 version of Gensim 1655 # if not set -> the model to be loaded was saved using a < 0.13.2 version of Gensim, 1656 # so set `random_state` as the default value 1657 if not hasattr(result, 'random_state'): 1658 result.random_state = utils.get_random_state(None) # using default value `get_random_state(None)` 1659 logging.warning("random_state not set so using default value") 1660 1661 # dtype could be absent in old models 1662 if not hasattr(result, 'dtype'): 1663 result.dtype = np.float64 # float64 was implicitly used before (cause it's default in numpy) 1664 logging.info("dtype was not set in saved %s file %s, assuming np.float64", result.__class__.__name__, fname) 1665 1666 state_fname = utils.smart_extension(fname, '.state') 1667 try: 1668 result.state = LdaState.load(state_fname, *args, **kwargs) 1669 except Exception as e: 1670 logging.warning("failed to load state from %s: %s", state_fname, e) 1671 1672 id2word_fname = utils.smart_extension(fname, '.id2word') 1673 # check if `id2word_fname` file is present on disk 1674 # if present -> the model to be loaded was saved using a >= 0.13.2 version of Gensim, 1675 # so set `result.id2word` using the `id2word_fname` file 1676 # if not present -> the model to be loaded was saved using a < 0.13.2 version of Gensim, 1677 # so `result.id2word` already set after the main pickle load 1678 if os.path.isfile(id2word_fname): 1679 try: 1680 result.id2word = utils.unpickle(id2word_fname) 1681 except Exception as e: 1682 logging.warning("failed to load id2word dictionary from %s: %s", id2word_fname, e) 1683 return result 1684