1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
4# Copyright (C) 2012 Radim Rehurek <radimrehurek@seznam.cz>
5# Copyright (C) 2017 Mohit Rathore <mrmohitrathoremr@gmail.com>
6# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
8"""This module implements functionality related to the `Term Frequency - Inverse Document Frequency
<https://en.wikipedia.org/wiki/Tf%E2%80%93idf>`_ vector space bag-of-words models.
11For a more in-depth exposition of TF-IDF and its various SMART variants (normalization, weighting schemes),
12see the blog post at https://rare-technologies.com/pivoted-document-length-normalisation/
16import logging
17from functools import partial
18import re
20import numpy as np
22from gensim import interfaces, matutils, utils
23from gensim.utils import deprecated
# Module-level logger, named after this module; used for progress, warning and
# backward-compatibility messages emitted by the functions and classes below.
logger = logging.getLogger(__name__)
def resolve_weights(smartirs):
    """Check the validity of `smartirs` parameters and resolve letter aliases.

    Parameters
    ----------
    smartirs : str
        `smartirs` or SMART (System for the Mechanical Analysis and Retrieval of Text)
        Information Retrieval System, a mnemonic scheme for denoting tf-idf weighting
        variants in the vector space model. The mnemonic for representing a combination
        of weights takes the form ddd, where the letters represents the term weighting of the document vector.
        for more information visit `SMART Information Retrieval System
        <https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System>`_.

    Returns
    -------
    str of (local_letter, global_letter, normalization_letter)
        The three letters concatenated, with the aliases `t` (term frequency), `x` (document
        frequency) and `x` (normalization) all replaced by `n`.

    local_letter : str
        Term frequency weighing, one of:
            * `b` - binary,
            * `t` or `n` - raw,
            * `a` - augmented,
            * `l` - logarithm,
            * `d` - double logarithm,
            * `L` - log average.
    global_letter : str
        Document frequency weighting, one of:
            * `x` or `n` - none,
            * `f` - idf,
            * `t` - zero-corrected idf,
            * `p` - probabilistic idf.
    normalization_letter : str
        Document normalization, one of:
            * `x` or `n` - none,
            * `c` - cosine,
            * `u` - pivoted unique,
            * `b` - pivoted character length.

    Raises
    ------
    ValueError
        If `smartirs` is not a string of length 3, if it uses the two-scheme ``ddd.qqq`` notation,
        or if one of the decomposed value doesn't fit the list of permissible values.

    """
    if isinstance(smartirs, str):
        # The `ddd.qqq` notation describes two models at once; reject it with a dedicated message.
        match = re.match(r"(?P<ddd>...)\.(?P<qqq>...)", smartirs)
        if match:
            raise ValueError(
                "The notation {ddd}.{qqq} specifies two term-weighting schemes, "
                "one for collection documents ({ddd}) and one for queries ({qqq}). "
                "You must train two separate tf-idf models.".format(
                    ddd=match.group("ddd"),
                    qqq=match.group("qqq"),
                )
            )

    if not isinstance(smartirs, str) or len(smartirs) != 3:
        # Use formatting rather than `+` so that non-string input still produces the
        # documented ValueError instead of a TypeError from str concatenation.
        raise ValueError("Expected a string of length 3 got {}".format(smartirs))

    w_tf, w_df, w_n = smartirs

    if w_tf not in 'btnaldL':
        raise ValueError("Expected term frequency weight to be one of 'btnaldL', got {}".format(w_tf))

    if w_df not in 'xnftp':
        raise ValueError("Expected inverse document frequency weight to be one of 'xnftp', got {}".format(w_df))

    if w_n not in 'xncub':
        raise ValueError("Expected normalization weight to be one of 'xncub', got {}".format(w_n))

    # resolve aliases
    if w_tf == "t":
        w_tf = "n"
    if w_df == "x":
        w_df = "n"
    if w_n == "x":
        w_n = "n"

    return w_tf + w_df + w_n
def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0):
    r"""Return the inverse document frequency of a term that occurs in `docfreq` documents:
    :math:`idf = add + log_{log\_base} \frac{totaldocs}{docfreq}`

    Parameters
    ----------
    docfreq : {int, float}
        Document frequency of the term.
    totaldocs : int
        Total number of documents.
    log_base : float, optional
        Base of the logarithm.
    add : float, optional
        Constant offset added to the logarithm.

    Returns
    -------
    float
        Inverse document frequency.

    """
    ratio = float(totaldocs) / docfreq
    # change of base: log_b(x) = ln(x) / ln(b)
    scaled_log = np.log(ratio) / np.log(log_base)
    return add + scaled_log
def precompute_idfs(wglobal, dfs, total_docs):
    """Build the `term_id -> idf` mapping for every term in `dfs` up front.

    Parameters
    ----------
    wglobal : function
        Custom function for calculating the "global" weighting function; it is called as
        `wglobal(document_frequency, total_docs)`.
        See for example the SMART alternatives under :func:`~gensim.models.tfidfmodel.smartirs_wglobal`.
    dfs : dict
        Dictionary mapping `term_id` into how many documents did that term appear in.
    total_docs : int
        Total number of documents.

    Returns
    -------
    dict of (int, float)
        Inverse document frequencies in the format `{term_id_1: idfs_1, term_id_2: idfs_2, ...}`.

    """
    # Precomputing is an optimization only: the same values could be computed
    # lazily, per term, in TfidfModel.__getitem__.
    idfs = {}
    for term_id, doc_freq in dfs.items():
        idfs[term_id] = wglobal(doc_freq, total_docs)
    return idfs
def smartirs_wlocal(tf, local_scheme):
    """Apply the SMART local (term frequency) weighting selected by `local_scheme` to `tf`.

    Parameters
    ----------
    tf : int
        Term frequency. The `a`, `b` and `L` schemes call array methods (`max`, `astype`, `mean`)
        on `tf`, so for those schemes a :class:`numpy.ndarray` of term frequencies is required.
    local_scheme : {'b', 'n', 'a', 'l', 'd', 'L'}
        Local transformation scheme.

    Returns
    -------
    float
        Calculated local weight; `None` if `local_scheme` is not one of the letters above.

    """
    if local_scheme == "n":
        # raw term frequency, returned untouched
        return tf
    if local_scheme == "b":
        # binary: 1 where the term occurs, 0 otherwise
        return tf.astype('bool').astype('int')
    if local_scheme == "a":
        # augmented: scaled by the largest term frequency
        largest = tf.max(axis=0)
        return 0.5 + (0.5 * tf / largest)
    if local_scheme == "l":
        # logarithm
        return 1 + np.log2(tf)
    if local_scheme == "d":
        # double logarithm
        inner = 1 + np.log2(tf)
        return 1 + np.log2(inner)
    if local_scheme == "L":
        # log average: log weight relative to the log of the mean term frequency
        numerator = 1 + np.log2(tf)
        denominator = 1 + np.log2(tf.mean(axis=0))
        return numerator / denominator
    return None
def smartirs_wglobal(docfreq, totaldocs, global_scheme):
    """Compute the SMART global (document frequency) weight selected by `global_scheme`.

    Parameters
    ----------
    docfreq : int
        Document frequency.
    totaldocs : int
        Total number of documents.
    global_scheme : {'n', 'f', 't', 'p'}
        Global transformation scheme.

    Returns
    -------
    float
        Calculated global weight; `None` if `global_scheme` is not one of the letters above.

    """
    if global_scheme == "n":
        # no global weighting
        return 1.0
    if global_scheme == "f":
        # idf
        ratio = 1.0 * totaldocs / docfreq
        return np.log2(ratio)
    if global_scheme == "t":
        # zero-corrected idf
        corrected_ratio = (totaldocs + 1.0) / docfreq
        return np.log2(corrected_ratio)
    if global_scheme == "p":
        # probabilistic idf, clipped at zero from below
        odds = (1.0 * totaldocs - docfreq) / docfreq
        return max(0, np.log2(odds))
    return None
@deprecated("Function will be removed in 4.0.0")
def smartirs_normalize(x, norm_scheme, return_norm=False):
    """Apply the document length normalization selected by `norm_scheme` to the vector `x`.

    Parameters
    ----------
    x : numpy.ndarray
        The tf-idf vector.
    norm_scheme : {'n', 'c'}
        Document length normalization scheme: `n` leaves `x` untouched,
        `c` delegates to :func:`~gensim.matutils.unitvec`.
    return_norm : bool, optional
        Return the length of `x` as well?

    Returns
    -------
    numpy.ndarray
        Normalized array (`x` itself for the `n` scheme).
    float (only if return_norm is set)
        Norm of `x`.

    Notes
    -----
    Any other `norm_scheme` value yields `None`.

    """
    if norm_scheme == "c":
        return matutils.unitvec(x, return_norm=return_norm)
    if norm_scheme != "n":
        return None
    if not return_norm:
        return x
    # `x` itself is left unnormalized; unitvec is called only to obtain its length.
    _, length = matutils.unitvec(x, return_norm=return_norm)
    return x, length
class TfidfModel(interfaces.TransformationABC):
    """Objects of this class realize the transformation between word-document co-occurrence matrix (int)
    into a locally/globally weighted TF-IDF matrix (positive floats).

    Examples
    --------
    .. sourcecode:: pycon

        >>> import gensim.downloader as api
        >>> from gensim.models import TfidfModel
        >>> from gensim.corpora import Dictionary
        >>>
        >>> dataset = api.load("text8")
        >>> dct = Dictionary(dataset)  # fit dictionary
        >>> corpus = [dct.doc2bow(line) for line in dataset]  # convert corpus to BoW format
        >>>
        >>> model = TfidfModel(corpus)  # fit model
        >>> vector = model[corpus[0]]  # apply model to the first corpus document

    """
    def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.identity,
                 wglobal=df2idf, normalize=True, smartirs=None, pivot=None, slope=0.25):
        r"""Compute TF-IDF by multiplying a local component (term frequency) with a global component
        (inverse document frequency), and normalizing the resulting documents to unit length.
        Formula for non-normalized weight of term :math:`i` in document :math:`j` in a corpus of :math:`D` documents

        .. math:: weight_{i,j} = frequency_{i,j} * log_2 \frac{D}{document\_freq_{i}}

        or, more generally

        .. math:: weight_{i,j} = wlocal(frequency_{i,j}) * wglobal(document\_freq_{i}, D)

        so you can plug in your own custom :math:`wlocal` and :math:`wglobal` functions.

        Parameters
        ----------
        corpus : iterable of iterable of (int, int), optional
            Input corpus
        id2word : {dict, :class:`~gensim.corpora.Dictionary`}, optional
            Mapping token - id, that was used for converting input data to bag of words format.
        dictionary : :class:`~gensim.corpora.Dictionary`
            If `dictionary` is specified, it must be a `corpora.Dictionary` object and it will be used.
            to directly construct the inverse document frequency mapping (then `corpus`, if specified, is ignored).
        wlocal : callable, optional
            Function for local weighting, default for `wlocal` is :func:`~gensim.utils.identity`
            (other options: :func:`numpy.sqrt`, `lambda tf: 0.5 + (0.5 * tf / tf.max())`, etc.).
            Overridden when `smartirs` is given.
        wglobal : callable, optional
            Function for global weighting, default is :func:`~gensim.models.tfidfmodel.df2idf`.
            Overridden when `smartirs` is given.
        normalize : {bool, callable}, optional
            Normalize document vectors to unit euclidean length? You can also inject your own function into `normalize`.
        smartirs : str, optional
            SMART (System for the Mechanical Analysis and Retrieval of Text) Information Retrieval System,
            a mnemonic scheme for denoting tf-idf weighting variants in the vector space model.
            The mnemonic for representing a combination of weights takes the form XYZ,
            for example 'ntc', 'bpn' and so on, where the letters represents the term weighting of the document vector.

            Term frequency weighing:
                * `b` - binary,
                * `t` or `n` - raw,
                * `a` - augmented,
                * `l` - logarithm,
                * `d` - double logarithm,
                * `L` - log average.

            Document frequency weighting:
                * `x` or `n` - none,
                * `f` - idf,
                * `t` - zero-corrected idf,
                * `p` - probabilistic idf.

            Document normalization:
                * `x` or `n` - none,
                * `c` - cosine,
                * `u` - pivoted unique,
                * `b` - pivoted character length.

            Default is None, in which case `wlocal`, `wglobal` and `normalize` are used as given.
            For more information visit `SMART Information Retrieval System
            <https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System>`_.
        pivot : float or None, optional
            In information retrieval, TF-IDF is biased against long documents [1]_. Pivoted document length
            normalization solves this problem by changing the norm of a document to `slope * old_norm + (1.0 -
            slope) * pivot`.

            You can either set the `pivot` by hand, or you can let Gensim figure it out automatically with the following
            two steps:

                * Set either the `u` or `b` document normalization in the `smartirs` parameter.
                * Set either the `corpus` or `dictionary` parameter. The `pivot` will be automatically determined from
                  the properties of the `corpus` or `dictionary`. The `b` normalization needs term lengths and
                  collection frequencies, which only a `dictionary` provides; with a `corpus` alone a warning
                  is logged and no pivot is set.

            If `pivot` is None and you don't follow steps 1 and 2, then pivoted document length normalization will be
            disabled. Default is None.

            See also the blog post at https://rare-technologies.com/pivoted-document-length-normalisation/.
        slope : float, optional
            In information retrieval, TF-IDF is biased against long documents [1]_. Pivoted document length
            normalization solves this problem by changing the norm of a document to `slope * old_norm + (1.0 -
            slope) * pivot`.

            Setting the `slope` to 0.0 uses only the `pivot` as the norm, and setting the `slope` to 1.0 effectively
            disables pivoted document length normalization. Singhal [2]_ suggests setting the `slope` between 0.2 and
            0.3 for best results. Default is 0.25.

            See also the blog post at https://rare-technologies.com/pivoted-document-length-normalisation/.

        See Also
        --------
        ~gensim.sklearn_api.tfidf.TfIdfTransformer : Class that also uses the SMART scheme.
        resolve_weights : Function that also uses the SMART scheme.

        References
        ----------
        .. [1] Singhal, A., Buckley, C., & Mitra, M. (1996). `Pivoted Document Length
           Normalization <http://singhal.info/pivoted-dln.pdf>`_. *SIGIR Forum*, 51, 176–184.
        .. [2] Singhal, A. (2001). `Modern information retrieval: A brief overview <http://singhal.info/ieee2001.pdf>`_.
           *IEEE Data Eng. Bull.*, 24(4), 35–43.

        """
        self.id2word = id2word
        self.wlocal, self.wglobal, self.normalize = wlocal, wglobal, normalize
        self.num_docs, self.num_nnz, self.idfs = None, None, None
        self.smartirs = resolve_weights(smartirs) if smartirs is not None else None
        self.slope = slope
        self.pivot = pivot
        self.eps = 1e-12

        if smartirs:
            # the SMART letters take precedence over user-supplied wlocal / wglobal
            n_tf, n_df, n_n = self.smartirs
            self.wlocal = partial(smartirs_wlocal, local_scheme=n_tf)
            self.wglobal = partial(smartirs_wglobal, global_scheme=n_df)

        if dictionary:
            # user supplied a Dictionary object, which already contains all the
            # statistics we need to construct the IDF mapping. we can skip the
            # step that goes through the corpus (= an optimization).
            if corpus:
                logger.warning(
                    "constructor received both corpus and explicit inverse document frequencies; ignoring the corpus"
                )
            self.num_docs, self.num_nnz = dictionary.num_docs, dictionary.num_nnz
            self.cfs = dictionary.cfs.copy()
            self.dfs = dictionary.dfs.copy()
            self.term_lens = {termid: len(term) for termid, term in dictionary.items()}
            self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
            if not id2word:
                self.id2word = dictionary
        elif corpus:
            self.initialize(corpus)
        else:
            # NOTE: everything is left uninitialized; presumably the model will
            # be initialized in some other way
            pass

        # If smartirs is not None, override pivot and normalize
        if not smartirs:
            return
        if self.pivot is not None:
            # an explicit pivot always wins over an automatically determined one
            if n_n in 'ub':
                logger.warning("constructor received pivot; ignoring smartirs[2]")
            return
        if n_n in 'ub' and callable(self.normalize):
            logger.warning("constructor received smartirs; ignoring normalize")
        if n_n in 'ub' and not dictionary and not corpus:
            logger.warning("constructor received no corpus or dictionary; ignoring smartirs[2]")
        elif n_n == "u":
            # pivot = average number of non-zero entries per document
            self.pivot = 1.0 * self.num_nnz / self.num_docs
        elif n_n == "b":
            if not dictionary:
                # Only a Dictionary provides the collection frequencies and term lengths
                # needed here; a corpus-only model has neither, so leave the pivot unset
                # (which disables pivoted normalization) instead of crashing.
                logger.warning(
                    "constructor received no dictionary, which is required to determine the pivot "
                    "for pivoted character length normalization; ignoring smartirs[2]"
                )
            else:
                # pivot = average of (term length + 1) summed over all term occurrences, per document
                self.pivot = 1.0 * sum(
                    self.cfs[termid] * (self.term_lens[termid] + 1.0) for termid in dictionary.keys()
                ) / self.num_docs

    @classmethod
    def load(cls, *args, **kwargs):
        """Load a previously saved TfidfModel class. Handles backwards compatibility from
        older TfidfModel versions which did not use pivoted document normalization.

        Attributes missing from the loaded object (`pivot`, `slope`, `smartirs`) are filled in
        with fallback values and the fact is logged.

        """
        model = super(TfidfModel, cls).load(*args, **kwargs)
        if not hasattr(model, 'pivot'):
            model.pivot = None
            logger.info('older version of %s loaded without pivot arg', cls.__name__)
            logger.info('Setting pivot to %s.', model.pivot)
        if not hasattr(model, 'slope'):
            # NOTE(review): this fallback differs from the constructor default (0.25) - confirm intent.
            model.slope = 0.65
            logger.info('older version of %s loaded without slope arg', cls.__name__)
            logger.info('Setting slope to %s.', model.slope)
        if not hasattr(model, 'smartirs'):
            model.smartirs = None
            logger.info('older version of %s loaded without smartirs arg', cls.__name__)
            logger.info('Setting smartirs to %s.', model.smartirs)
        return model

    def __str__(self):
        """Return a short description with the number of documents and non-zero entries seen."""
        return "TfidfModel(num_docs=%s, num_nnz=%s)" % (self.num_docs, self.num_nnz)

    def initialize(self, corpus):
        """Compute inverse document weights, which will be used to modify term frequencies for documents.

        Sets `num_docs`, `num_nnz`, `dfs` and `idfs` from a single pass over `corpus`. Collection
        frequencies (`cfs`) and term lengths (`term_lens`) cannot be derived here and are set to None.

        Parameters
        ----------
        corpus : iterable of iterable of (int, int)
            Input corpus.

        """
        logger.info("collecting document frequencies")
        dfs = {}
        # docno starts at -1 so that an empty corpus yields num_docs == 0
        numnnz, docno = 0, -1

        for docno, bow in enumerate(corpus):
            if docno % 10000 == 0:
                logger.info("PROGRESS: processing document #%i", docno)
            numnnz += len(bow)
            for termid, _ in bow:
                dfs[termid] = dfs.get(termid, 0) + 1
        # keep some stats about the training corpus
        self.num_docs = docno + 1
        self.num_nnz = numnnz
        self.cfs = None
        self.dfs = dfs
        # `term_lens` is the attribute the rest of the class reads; the corpus does not carry term strings
        self.term_lens = None
        # legacy attribute name, kept so that code reading it keeps working
        self.term_lengths = None
        # and finally compute the idf weights
        self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
        self.add_lifecycle_event(
            "initialize",
            msg=(
                f"calculated IDF weights for {self.num_docs} documents and {max(dfs) + 1 if dfs else 0}"
                f" features ({self.num_nnz} matrix non-zeros)"
            ),
        )

    def __getitem__(self, bow, eps=1e-12):
        """Get the tf-idf representation of an input vector and/or corpus.

        Parameters
        ----------
        bow : {list of (int, int), iterable of iterable of (int, int)}
            Input document in the `sparse Gensim bag-of-words format
            <https://radimrehurek.com/gensim/intro.html#core-concepts>`_,
            or a streamed corpus of such documents.
        eps : float
            Threshold value, will remove all position that have tfidf-value less than `eps`.
            The value is also stored on the model as `self.eps`.

        Returns
        -------
        vector : list of (int, float)
            TfIdf vector, if `bow` is a single document
        :class:`~gensim.interfaces.TransformedCorpus`
            TfIdf corpus, if `bow` is a corpus.

        Raises
        ------
        ValueError
            If the `b` (pivoted character length) normalization is requested together with a pivot,
            but the model has no term lengths (it was not constructed from a `dictionary`).

        """
        self.eps = eps
        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
        # as strict application of the IDF formula would dictate)

        termid_array, tf_array = [], []
        for termid, tf in bow:
            termid_array.append(termid)
            tf_array.append(tf)

        # local weighting is applied to the whole array at once
        tf_array = self.wlocal(np.array(tf_array))

        vector = [
            (termid, tf * self.idfs.get(termid))
            for termid, tf in zip(termid_array, tf_array) if abs(self.idfs.get(termid, 0.0)) > self.eps
        ]

        # and finally, normalize the vector either to unit length, or use a
        # user-defined normalization function
        if self.smartirs:
            n_n = self.smartirs[2]
            if n_n == "n" or (n_n in 'ub' and self.pivot is None):
                if self.pivot is not None:
                    _, old_norm = matutils.unitvec(vector, return_norm=True)
                norm_vector = vector
            elif n_n == "c":
                if self.pivot is not None:
                    _, old_norm = matutils.unitvec(vector, return_norm=True)
                else:
                    norm_vector = matutils.unitvec(vector)
            elif n_n == "u":
                _, old_norm = matutils.unitvec(vector, return_norm=True, norm='unique')
            elif n_n == "b":
                # term lengths exist only on models built from a Dictionary; older pickles
                # may lack the attribute altogether, hence getattr
                term_lens = getattr(self, 'term_lens', None)
                if term_lens is None:
                    raise ValueError(
                        "pivoted character length normalization (smartirs 'b') requires term lengths, "
                        "which are only available when the model is constructed from a `dictionary`"
                    )
                old_norm = sum(freq * (term_lens[termid] + 1.0) for termid, freq in bow)
        else:
            # resolve the boolean flag into a callable (stored on the model for later calls)
            if self.normalize is True:
                self.normalize = matutils.unitvec
            elif self.normalize is False:
                self.normalize = utils.identity

            if self.pivot is not None:
                # NOTE(review): with normalize=False this passes `return_norm` to utils.identity -
                # confirm that it accepts the keyword.
                _, old_norm = self.normalize(vector, return_norm=True)
            else:
                norm_vector = self.normalize(vector)

        if self.pivot is None:
            norm_vector = [(termid, weight) for termid, weight in norm_vector if abs(weight) > self.eps]
        else:
            # pivoted normalization: divide the raw weights by the pivoted norm
            pivoted_norm = (1 - self.slope) * self.pivot + self.slope * old_norm
            norm_vector = [
                (termid, weight / float(pivoted_norm))
                for termid, weight in vector
                if abs(weight / float(pivoted_norm)) > self.eps
            ]
        return norm_vector