#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012 Radim Rehurek <radimrehurek@seznam.cz>
# Copyright (C) 2017 Mohit Rathore <mrmohitrathoremr@gmail.com>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""This module implements functionality related to the `Term Frequency - Inverse Document Frequency
<https://en.wikipedia.org/wiki/Tf%E2%80%93idf>`_ vector space bag-of-words models.

For a more in-depth exposition of TF-IDF and its various SMART variants (normalization, weighting schemes),
see the blog post at https://rare-technologies.com/pivoted-document-length-normalisation/

"""

import logging
from functools import partial
import re

import numpy as np

from gensim import interfaces, matutils, utils
from gensim.utils import deprecated


logger = logging.getLogger(__name__)


def resolve_weights(smartirs):
    """Check the validity of `smartirs` parameters and resolve letter aliases.

    Parameters
    ----------
    smartirs : str
        `smartirs` or SMART (System for the Mechanical Analysis and Retrieval of Text)
        Information Retrieval System, a mnemonic scheme for denoting tf-idf weighting
        variants in the vector space model. The mnemonic for representing a combination
        of weights takes the form ddd, where the letters represents the term weighting of the document vector.
        for more information visit `SMART Information Retrieval System
        <https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System>`_.

    Returns
    -------
    str of (local_letter, global_letter, normalization_letter)
        The three letters with aliases resolved (`t` -> `n` for the local letter,
        `x` -> `n` for the global and normalization letters).

    local_letter : str
        Term frequency weighing, one of:
            * `b` - binary,
            * `t` or `n` - raw,
            * `a` - augmented,
            * `l` - logarithm,
            * `d` - double logarithm,
            * `L` - log average.
    global_letter : str
        Document frequency weighting, one of:
            * `x` or `n` - none,
            * `f` - idf,
            * `t` - zero-corrected idf,
            * `p` - probabilistic idf.
    normalization_letter : str
        Document normalization, one of:
            * `x` or `n` - none,
            * `c` - cosine,
            * `u` - pivoted unique,
            * `b` - pivoted character length.

    Raises
    ------
    ValueError
        If `smartirs` is not a string of length 3 or one of the decomposed value
        doesn't fit the list of permissible values.
    """
    if isinstance(smartirs, str):
        # the ddd.qqq notation describes two models at once; reject it with a dedicated message
        match = re.match(r"(?P<ddd>...)\.(?P<qqq>...)", smartirs)
        if match:
            raise ValueError(
                "The notation {ddd}.{qqq} specifies two term-weighting schemes, "
                "one for collection documents ({ddd}) and one for queries ({qqq}). "
                "You must train two separate tf-idf models.".format(
                    ddd=match.group("ddd"),
                    qqq=match.group("qqq"),
                )
            )
    if not isinstance(smartirs, str) or len(smartirs) != 3:
        # str.format instead of `+`: concatenating a non-string would raise TypeError
        # and mask the documented ValueError
        raise ValueError("Expected a string of length 3 got {}".format(smartirs))

    w_tf, w_df, w_n = smartirs

    if w_tf not in 'btnaldL':
        raise ValueError("Expected term frequency weight to be one of 'btnaldL', got {}".format(w_tf))

    if w_df not in 'xnftp':
        raise ValueError("Expected inverse document frequency weight to be one of 'xnftp', got {}".format(w_df))

    if w_n not in 'xncub':
        raise ValueError("Expected normalization weight to be one of 'xncub', got {}".format(w_n))

    # resolve aliases
    if w_tf == "t":
        w_tf = "n"
    if w_df == "x":
        w_df = "n"
    if w_n == "x":
        w_n = "n"

    return w_tf + w_df + w_n


def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0):
    r"""Compute inverse-document-frequency for a term with the given document frequency `docfreq`:
    :math:`idf = add + log_{log\_base} \frac{totaldocs}{docfreq}`

    Parameters
    ----------
    docfreq : {int, float}
        Document frequency.
    totaldocs : int
        Total number of documents.
    log_base : float, optional
        Base of logarithm.
    add : float, optional
        Offset.

    Returns
    -------
    float
        Inverse document frequency.

    """
    return add + np.log(float(totaldocs) / docfreq) / np.log(log_base)


def precompute_idfs(wglobal, dfs, total_docs):
    """Pre-compute the inverse document frequency mapping for all terms.

    Parameters
    ----------
    wglobal : function
        Custom function for calculating the "global" weighting function.
        See for example the SMART alternatives under :func:`~gensim.models.tfidfmodel.smartirs_wglobal`.
    dfs : dict
        Dictionary mapping `term_id` into how many documents did that term appear in.
    total_docs : int
        Total number of documents.

    Returns
    -------
    dict of (int, float)
        Inverse document frequencies in the format `{term_id_1: idfs_1, term_id_2: idfs_2, ...}`.

    """
    # not strictly necessary and could be computed on the fly in TfidfModel__getitem__.
    # this method is here just to speed things up a little.
    return {termid: wglobal(df, total_docs) for termid, df in dfs.items()}


def smartirs_wlocal(tf, local_scheme):
    """Calculate local term weight for a term using the weighting scheme specified in `local_scheme`.

    Parameters
    ----------
    tf : numpy.ndarray
        Term frequencies of one document. The `a`, `b` and `L` schemes call array methods
        (`max`, `astype`, `mean`) on it, so a plain Python number is not sufficient for them.
    local_scheme : {'b', 'n', 'a', 'l', 'd', 'L'}
        Local transformation scheme.

    Returns
    -------
    numpy.ndarray
        Calculated local weights. `None` is returned implicitly if `local_scheme` is not
        one of the letters listed above.

    """
    if local_scheme == "n":
        return tf
    elif local_scheme == "l":
        return 1 + np.log2(tf)
    elif local_scheme == "d":
        return 1 + np.log2(1 + np.log2(tf))
    elif local_scheme == "a":
        return 0.5 + (0.5 * tf / tf.max(axis=0))
    elif local_scheme == "b":
        return tf.astype('bool').astype('int')
    elif local_scheme == "L":
        return (1 + np.log2(tf)) / (1 + np.log2(tf.mean(axis=0)))


def smartirs_wglobal(docfreq, totaldocs, global_scheme):
    """Calculate global document weight based on the weighting scheme specified in `global_scheme`.

    Parameters
    ----------
    docfreq : int
        Document frequency.
    totaldocs : int
        Total number of documents.
    global_scheme : {'n', 'f', 't', 'p'}
        Global transformation scheme.

    Returns
    -------
    float
        Calculated global weight. `None` is returned implicitly if `global_scheme` is not
        one of the letters listed above.

    """
    if global_scheme == "n":
        return 1.0
    elif global_scheme == "f":
        return np.log2(1.0 * totaldocs / docfreq)
    elif global_scheme == "t":
        return np.log2((totaldocs + 1.0) / docfreq)
    elif global_scheme == "p":
        # clamp at zero: terms present in more than half of the documents would get a negative weight
        return max(0, np.log2((1.0 * totaldocs - docfreq) / docfreq))


@deprecated("Function will be removed in 4.0.0")
def smartirs_normalize(x, norm_scheme, return_norm=False):
    """Normalize a vector using the normalization scheme specified in `norm_scheme`.

    Parameters
    ----------
    x : numpy.ndarray
        The tf-idf vector.
    norm_scheme : {'n', 'c'}
        Document length normalization scheme.
    return_norm : bool, optional
        Return the length of `x` as well?

    Returns
    -------
    numpy.ndarray
        Normalized array.
    float (only if return_norm is set)
        Norm of `x`.
    """
    if norm_scheme == "n":
        if return_norm:
            # the vector itself is left untouched; unitvec is called only to obtain its length
            _, length = matutils.unitvec(x, return_norm=return_norm)
            return x, length
        else:
            return x
    elif norm_scheme == "c":
        return matutils.unitvec(x, return_norm=return_norm)


class TfidfModel(interfaces.TransformationABC):
    """Objects of this class realize the transformation between word-document co-occurrence matrix (int)
    into a locally/globally weighted TF-IDF matrix (positive floats).

    Examples
    --------
    .. sourcecode:: pycon

        >>> import gensim.downloader as api
        >>> from gensim.models import TfidfModel
        >>> from gensim.corpora import Dictionary
        >>>
        >>> dataset = api.load("text8")
        >>> dct = Dictionary(dataset)  # fit dictionary
        >>> corpus = [dct.doc2bow(line) for line in dataset]  # convert corpus to BoW format
        >>>
        >>> model = TfidfModel(corpus)  # fit model
        >>> vector = model[corpus[0]]  # apply model to the first corpus document

    """
    def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.identity,
                 wglobal=df2idf, normalize=True, smartirs=None, pivot=None, slope=0.25):
        r"""Compute TF-IDF by multiplying a local component (term frequency) with a global component
        (inverse document frequency), and normalizing the resulting documents to unit length.
        Formula for non-normalized weight of term :math:`i` in document :math:`j` in a corpus of :math:`D` documents

        .. math:: weight_{i,j} = frequency_{i,j} * log_2 \frac{D}{document\_freq_{i}}

        or, more generally

        .. math:: weight_{i,j} = wlocal(frequency_{i,j}) * wglobal(document\_freq_{i}, D)

        so you can plug in your own custom :math:`wlocal` and :math:`wglobal` functions.

        Parameters
        ----------
        corpus : iterable of iterable of (int, int), optional
            Input corpus
        id2word : {dict, :class:`~gensim.corpora.Dictionary`}, optional
            Mapping token - id, that was used for converting input data to bag of words format.
        dictionary : :class:`~gensim.corpora.Dictionary`
            If `dictionary` is specified, it must be a `corpora.Dictionary` object and it will be used
            to directly construct the inverse document frequency mapping (then `corpus`, if specified, is ignored).
        wlocal : callable, optional
            Function for local weighting, default for `wlocal` is :func:`~gensim.utils.identity`
            (other options: :func:`numpy.sqrt`, `lambda tf: 0.5 + (0.5 * tf / tf.max())`, etc.).
        wglobal : callable, optional
            Function for global weighting, default is :func:`~gensim.models.tfidfmodel.df2idf`.
        normalize : {bool, callable}, optional
            Normalize document vectors to unit euclidean length? You can also inject your own function into `normalize`.
        smartirs : str, optional
            SMART (System for the Mechanical Analysis and Retrieval of Text) Information Retrieval System,
            a mnemonic scheme for denoting tf-idf weighting variants in the vector space model.
            The mnemonic for representing a combination of weights takes the form XYZ,
            for example 'ntc', 'bpn' and so on, where the letters represents the term weighting of the document vector.

            Term frequency weighing:
                * `b` - binary,
                * `t` or `n` - raw,
                * `a` - augmented,
                * `l` - logarithm,
                * `d` - double logarithm,
                * `L` - log average.

            Document frequency weighting:
                * `x` or `n` - none,
                * `f` - idf,
                * `t` - zero-corrected idf,
                * `p` - probabilistic idf.

            Document normalization:
                * `x` or `n` - none,
                * `c` - cosine,
                * `u` - pivoted unique,
                * `b` - pivoted character length.

            Default is 'nfc'.
            For more information visit `SMART Information Retrieval System
            <https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System>`_.
        pivot : float or None, optional
            In information retrieval, TF-IDF is biased against long documents [1]_. Pivoted document length
            normalization solves this problem by changing the norm of a document to `slope * old_norm + (1.0 -
            slope) * pivot`.

            You can either set the `pivot` by hand, or you can let Gensim figure it out automatically with the following
            two steps:

                * Set either the `u` or `b` document normalization in the `smartirs` parameter.
                * Set either the `corpus` or `dictionary` parameter. The `pivot` will be automatically determined from
                  the properties of the `corpus` or `dictionary`. The `b` normalization needs term lengths and
                  therefore requires the `dictionary` parameter.

            If `pivot` is None and you don't follow steps 1 and 2, then pivoted document length normalization will be
            disabled. Default is None.

            See also the blog post at https://rare-technologies.com/pivoted-document-length-normalisation/.
        slope : float, optional
            In information retrieval, TF-IDF is biased against long documents [1]_. Pivoted document length
            normalization solves this problem by changing the norm of a document to `slope * old_norm + (1.0 -
            slope) * pivot`.

            Setting the `slope` to 0.0 uses only the `pivot` as the norm, and setting the `slope` to 1.0 effectively
            disables pivoted document length normalization. Singhal [2]_ suggests setting the `slope` between 0.2 and
            0.3 for best results. Default is 0.25.

            See also the blog post at https://rare-technologies.com/pivoted-document-length-normalisation/.

        See Also
        --------
        ~gensim.sklearn_api.tfidf.TfIdfTransformer : Class that also uses the SMART scheme.
        resolve_weights : Function that also uses the SMART scheme.

        References
        ----------
        .. [1] Singhal, A., Buckley, C., & Mitra, M. (1996). `Pivoted Document Length
           Normalization <http://singhal.info/pivoted-dln.pdf>`_. *SIGIR Forum*, 51, 176–184.
        .. [2] Singhal, A. (2001). `Modern information retrieval: A brief overview <http://singhal.info/ieee2001.pdf>`_.
           *IEEE Data Eng. Bull.*, 24(4), 35–43.

        """
        self.id2word = id2word
        self.wlocal, self.wglobal, self.normalize = wlocal, wglobal, normalize
        self.num_docs, self.num_nnz, self.idfs = None, None, None
        self.smartirs = resolve_weights(smartirs) if smartirs is not None else None
        self.slope = slope
        self.pivot = pivot
        self.eps = 1e-12

        if smartirs:
            # a SMART mnemonic overrides the user-supplied wlocal / wglobal callables
            n_tf, n_df, n_n = self.smartirs
            self.wlocal = partial(smartirs_wlocal, local_scheme=n_tf)
            self.wglobal = partial(smartirs_wglobal, global_scheme=n_df)

        if dictionary:
            # user supplied a Dictionary object, which already contains all the
            # statistics we need to construct the IDF mapping. we can skip the
            # step that goes through the corpus (= an optimization).
            if corpus:
                logger.warning(
                    "constructor received both corpus and explicit inverse document frequencies; ignoring the corpus"
                )
            self.num_docs, self.num_nnz = dictionary.num_docs, dictionary.num_nnz
            self.cfs = dictionary.cfs.copy()
            self.dfs = dictionary.dfs.copy()
            self.term_lens = {termid: len(term) for termid, term in dictionary.items()}
            self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
            if not id2word:
                self.id2word = dictionary
        elif corpus:
            self.initialize(corpus)
        else:
            # NOTE: everything is left uninitialized; presumably the model will
            # be initialized in some other way
            pass

        # If smartirs is not None, override pivot and normalize
        if not smartirs:
            return
        if self.pivot is not None:
            if n_n in 'ub':
                logger.warning("constructor received pivot; ignoring smartirs[2]")
            return
        if n_n in 'ub' and callable(self.normalize):
            logger.warning("constructor received smartirs; ignoring normalize")
        if n_n in 'ub' and not dictionary and not corpus:
            logger.warning("constructor received no corpus or dictionary; ignoring smartirs[2]")
        elif n_n == "u":
            self.pivot = 1.0 * self.num_nnz / self.num_docs
        elif n_n == "b":
            if not dictionary:
                # collection frequencies and term lengths are only available from a Dictionary;
                # a bare corpus cannot provide them, so leave the pivot disabled instead of crashing
                logger.warning("constructor received no dictionary; ignoring smartirs[2]")
            else:
                self.pivot = 1.0 * sum(
                    self.cfs[termid] * (self.term_lens[termid] + 1.0) for termid in dictionary.keys()
                ) / self.num_docs

    @classmethod
    def load(cls, *args, **kwargs):
        """Load a previously saved TfidfModel class. Handles backwards compatibility from
        older TfidfModel versions which did not use pivoted document normalization.

        Attributes missing from the loaded object (`pivot`, `slope`, `smartirs`) are filled in
        with fallback values and the fact is logged.

        """
        model = super(TfidfModel, cls).load(*args, **kwargs)
        if not hasattr(model, 'pivot'):
            model.pivot = None
            logger.info('older version of %s loaded without pivot arg', cls.__name__)
            logger.info('Setting pivot to %s.', model.pivot)
        if not hasattr(model, 'slope'):
            model.slope = 0.65
            logger.info('older version of %s loaded without slope arg', cls.__name__)
            logger.info('Setting slope to %s.', model.slope)
        if not hasattr(model, 'smartirs'):
            model.smartirs = None
            logger.info('older version of %s loaded without smartirs arg', cls.__name__)
            logger.info('Setting smartirs to %s.', model.smartirs)
        return model

    def __str__(self):
        return "TfidfModel(num_docs=%s, num_nnz=%s)" % (self.num_docs, self.num_nnz)

    def initialize(self, corpus):
        """Compute inverse document weights, which will be used to modify term frequencies for documents.

        Parameters
        ----------
        corpus : iterable of iterable of (int, int)
            Input corpus.

        """
        logger.info("collecting document frequencies")
        dfs = {}
        # docno starts at -1 so that an empty corpus yields num_docs == 0
        numnnz, docno = 0, -1

        for docno, bow in enumerate(corpus):
            if docno % 10000 == 0:
                logger.info("PROGRESS: processing document #%i", docno)
            numnnz += len(bow)
            for termid, _ in bow:
                dfs[termid] = dfs.get(termid, 0) + 1
        # keep some stats about the training corpus
        self.num_docs = docno + 1
        self.num_nnz = numnnz
        self.cfs = None
        self.dfs = dfs
        # a bare corpus carries no token strings, hence no term lengths; `term_lens` is the
        # attribute the rest of the class reads
        self.term_lens = None
        # previously the only attribute set here; kept so existing code reading it keeps working
        self.term_lengths = None
        # and finally compute the idf weights
        self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
        self.add_lifecycle_event(
            "initialize",
            msg=(
                f"calculated IDF weights for {self.num_docs} documents and {max(dfs.keys()) + 1 if dfs else 0}"
                f" features ({self.num_nnz} matrix non-zeros)"
            ),
        )

    def __getitem__(self, bow, eps=1e-12):
        """Get the tf-idf representation of an input vector and/or corpus.

        Parameters
        ----------
        bow : {list of (int, int), iterable of iterable of (int, int)}
            Input document in the `sparse Gensim bag-of-words format
            <https://radimrehurek.com/gensim/intro.html#core-concepts>`_,
            or a streamed corpus of such documents.
        eps : float
            Threshold value, will remove all position that have tfidf-value less than `eps`.
            The value is also stored on the model as `self.eps`.

        Returns
        -------
        vector : list of (int, float)
            TfIdf vector, if `bow` is a single document
        :class:`~gensim.interfaces.TransformedCorpus`
            TfIdf corpus, if `bow` is a corpus.

        """
        self.eps = eps
        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
        # as strict application of the IDF formula would dictate)

        termid_array, tf_array = [], []
        for termid, tf in bow:
            termid_array.append(termid)
            tf_array.append(tf)

        tf_array = self.wlocal(np.array(tf_array))

        vector = [
            (termid, tf * self.idfs.get(termid))
            for termid, tf in zip(termid_array, tf_array) if abs(self.idfs.get(termid, 0.0)) > self.eps
        ]

        # and finally, normalize the vector either to unit length, or use a
        # user-defined normalization function
        if self.smartirs:
            n_n = self.smartirs[2]
            if n_n == "n" or (n_n in 'ub' and self.pivot is None):
                if self.pivot is not None:
                    _, old_norm = matutils.unitvec(vector, return_norm=True)
                norm_vector = vector
            elif n_n == "c":
                if self.pivot is not None:
                    _, old_norm = matutils.unitvec(vector, return_norm=True)
                else:
                    norm_vector = matutils.unitvec(vector)
            elif n_n == "u":
                _, old_norm = matutils.unitvec(vector, return_norm=True, norm='unique')
            elif n_n == "b":
                old_norm = sum(freq * (self.term_lens[termid] + 1.0) for termid, freq in bow)
        else:
            # resolve the boolean flag into a callable once; the result is cached on the model
            if self.normalize is True:
                self.normalize = matutils.unitvec
            elif self.normalize is False:
                self.normalize = utils.identity

            if self.pivot is not None:
                _, old_norm = self.normalize(vector, return_norm=True)
            else:
                norm_vector = self.normalize(vector)

        if self.pivot is None:
            norm_vector = [(termid, weight) for termid, weight in norm_vector if abs(weight) > self.eps]
        else:
            # pivoted normalization divides the raw (un-normalized) weights by the pivoted norm
            pivoted_norm = (1 - self.slope) * self.pivot + self.slope * old_norm
            norm_vector = [
                (termid, weight / float(pivoted_norm))
                for termid, weight in vector
                if abs(weight / float(pivoted_norm)) > self.eps
            ]
        return norm_vector