#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012 Radim Rehurek <radimrehurek@seznam.cz>
# Copyright (C) 2017 Mohit Rathore <mrmohitrathoremr@gmail.com>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""This module implements functionality related to the `Term Frequency - Inverse Document Frequency
<https://en.wikipedia.org/wiki/Tf%E2%80%93idf>`_ vector space bag-of-words models.

For a more in-depth exposition of TF-IDF and its various SMART variants (normalization, weighting schemes),
see the blog post at https://rare-technologies.com/pivoted-document-length-normalisation/

"""

import logging
from functools import partial
import re

import numpy as np

from gensim import interfaces, matutils, utils
from gensim.utils import deprecated


logger = logging.getLogger(__name__)


def resolve_weights(smartirs):
    """Check the validity of `smartirs` parameters.

    Parameters
    ----------
    smartirs : str
        `smartirs` or SMART (System for the Mechanical Analysis and Retrieval of Text)
        Information Retrieval System, a mnemonic scheme for denoting tf-idf weighting
        variants in the vector space model. The mnemonic for representing a combination
        of weights takes the form ddd, where the letters represent the term weighting of the document vector.
        For more information, visit `SMART Information Retrieval System
        <https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System>`_.

    Returns
    -------
    str of (local_letter, global_letter, normalization_letter)

    local_letter : str
        Term frequency weighting, one of:
            * `b` - binary,
            * `t` or `n` - raw,
            * `a` - augmented,
            * `l` - logarithm,
            * `d` - double logarithm,
            * `L` - log average.
    global_letter : str
        Document frequency weighting, one of:
            * `x` or `n` - none,
            * `f` - idf,
            * `t` - zero-corrected idf,
            * `p` - probabilistic idf.
    normalization_letter : str
        Document normalization, one of:
            * `x` or `n` - none,
            * `c` - cosine,
            * `u` - pivoted unique,
            * `b` - pivoted character length.

    Raises
    ------
    ValueError
        If `smartirs` is not a string of length 3 or one of the decomposed values
        is not in the list of permissible values.
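
    Examples
    --------
    A minimal sketch of resolving a SMART mnemonic into its canonical form; the alias letters
    (`t` for raw term frequency, `x` for "none") are mapped onto `n`:

    .. sourcecode:: pycon

        >>> from gensim.models.tfidfmodel import resolve_weights
        >>>
        >>> resolve_weights('ntc')  # raw tf, plain idf, cosine normalization
        'ntc'
        >>> resolve_weights('txx')  # all three letters are aliases of 'n'
        'nnn'
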
    """
    if isinstance(smartirs, str) and re.match(r"...\....", smartirs):
        match = re.match(r"(?P<ddd>...)\.(?P<qqq>...)", smartirs)
        raise ValueError(
            "The notation {ddd}.{qqq} specifies two term-weighting schemes, "
            "one for collection documents ({ddd}) and one for queries ({qqq}). "
            "You must train two separate tf-idf models.".format(
                ddd=match.group("ddd"),
                qqq=match.group("qqq"),
            )
        )
    if not isinstance(smartirs, str) or len(smartirs) != 3:
        raise ValueError("Expected a string of length 3, got {}".format(smartirs))

    w_tf, w_df, w_n = smartirs

    if w_tf not in 'btnaldL':
        raise ValueError("Expected term frequency weight to be one of 'btnaldL', got {}".format(w_tf))

    if w_df not in 'xnftp':
        raise ValueError("Expected inverse document frequency weight to be one of 'xnftp', got {}".format(w_df))

    if w_n not in 'xncub':
        raise ValueError("Expected normalization weight to be one of 'xncub', got {}".format(w_n))

    # resolve aliases
    if w_tf == "t":
        w_tf = "n"
    if w_df == "x":
        w_df = "n"
    if w_n == "x":
        w_n = "n"

    return w_tf + w_df + w_n


def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0):
    r"""Compute inverse-document-frequency for a term with the given document frequency `docfreq`:
    :math:`idf = add + \log_{log\_base} \frac{totaldocs}{docfreq}`

    Parameters
    ----------
    docfreq : {int, float}
        Document frequency.
    totaldocs : int
        Total number of documents.
    log_base : float, optional
        Base of logarithm.
    add : float, optional
        Offset.

    Returns
    -------
    float
        Inverse document frequency.

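    Examples
    --------
    A small illustrative sketch of the default base-2 weighting (the `0.0` case follows directly
    from the formula above; the value in the final comment is an approximation, not a doctest):

    .. sourcecode:: pycon

        >>> from gensim.models.tfidfmodel import df2idf
        >>>
        >>> print(df2idf(docfreq=10, totaldocs=10))  # a term occurring in every document gets zero weight
        0.0
        >>> idf = df2idf(docfreq=5, totaldocs=1000)  # roughly log2(200), i.e. about 7.6
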
    """
    return add + np.log(float(totaldocs) / docfreq) / np.log(log_base)


def precompute_idfs(wglobal, dfs, total_docs):
    """Pre-compute the inverse document frequency mapping for all terms.

    Parameters
    ----------
    wglobal : function
        Custom function for calculating the "global" weight of a term.
        See for example the SMART alternatives under :func:`~gensim.models.tfidfmodel.smartirs_wglobal`.
    dfs : dict
        Dictionary mapping `term_id` to the number of documents that term appeared in.
    total_docs : int
        Total number of documents.

    Returns
    -------
    dict of (int, float)
        Inverse document frequencies in the format `{term_id_1: idfs_1, term_id_2: idfs_2, ...}`.

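    Examples
    --------
    A minimal sketch with a hand-made document frequency mapping (the term ids and counts
    below are invented purely for illustration):

    .. sourcecode:: pycon

        >>> from gensim.models.tfidfmodel import df2idf, precompute_idfs
        >>>
        >>> dfs = {0: 2, 1: 5, 2: 10}  # term 0 appears in 2 documents, term 1 in 5, term 2 in 10
        >>> idfs = precompute_idfs(df2idf, dfs, total_docs=10)
        >>> sorted(idfs)  # one idf value per term id
        [0, 1, 2]
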
    """
    # not strictly necessary and could be computed on the fly in TfidfModel.__getitem__.
    # this method is here just to speed things up a little.
    return {termid: wglobal(df, total_docs) for termid, df in dfs.items()}


def smartirs_wlocal(tf, local_scheme):
    """Calculate the local term weight using the weighting scheme specified in `local_scheme`.

    Parameters
    ----------
    tf : {int, numpy.ndarray}
        Term frequency (the `a`, `b` and `L` schemes require a numpy array).
    local_scheme : {'b', 'n', 'a', 'l', 'd', 'L'}
        Local transformation scheme.

    Returns
    -------
    float or numpy.ndarray
        Calculated local weight.

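    Examples
    --------
    A minimal sketch of two schemes applied to a small, made-up frequency vector
    (powers of two are used so the logarithms come out exact):

    .. sourcecode:: pycon

        >>> import numpy as np
        >>> from gensim.models.tfidfmodel import smartirs_wlocal
        >>>
        >>> tf = np.array([1, 2, 4])
        >>> print(smartirs_wlocal(tf, 'b'))  # binary: keep only presence/absence
        [1 1 1]
        >>> print(smartirs_wlocal(tf, 'l'))  # logarithmic: 1 + log2(tf)
        [1. 2. 3.]
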
    """
    if local_scheme == "n":
        return tf
    elif local_scheme == "l":
        return 1 + np.log2(tf)
    elif local_scheme == "d":
        return 1 + np.log2(1 + np.log2(tf))
    elif local_scheme == "a":
        return 0.5 + (0.5 * tf / tf.max(axis=0))
    elif local_scheme == "b":
        return tf.astype('bool').astype('int')
    elif local_scheme == "L":
        return (1 + np.log2(tf)) / (1 + np.log2(tf.mean(axis=0)))


def smartirs_wglobal(docfreq, totaldocs, global_scheme):
    """Calculate global document weight based on the weighting scheme specified in `global_scheme`.

    Parameters
    ----------
    docfreq : int
        Document frequency.
    totaldocs : int
        Total number of documents.
    global_scheme : {'n', 'f', 't', 'p'}
        Global transformation scheme.

    Returns
    -------
    float
        Calculated global weight.

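    Examples
    --------
    A minimal sketch comparing the plain and zero-corrected idf schemes on made-up counts
    (chosen so the base-2 logarithms are exact):

    .. sourcecode:: pycon

        >>> from gensim.models.tfidfmodel import smartirs_wglobal
        >>>
        >>> print(smartirs_wglobal(docfreq=8, totaldocs=32, global_scheme='f'))  # log2(32 / 8)
        2.0
        >>> print(smartirs_wglobal(docfreq=8, totaldocs=31, global_scheme='t'))  # log2((31 + 1) / 8)
        2.0
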
    """
    if global_scheme == "n":
        return 1.0
    elif global_scheme == "f":
        return np.log2(1.0 * totaldocs / docfreq)
    elif global_scheme == "t":
        return np.log2((totaldocs + 1.0) / docfreq)
    elif global_scheme == "p":
        return max(0, np.log2((1.0 * totaldocs - docfreq) / docfreq))


@deprecated("Function will be removed in 4.0.0")
def smartirs_normalize(x, norm_scheme, return_norm=False):
    """Normalize a vector using the normalization scheme specified in `norm_scheme`.

    Parameters
    ----------
    x : numpy.ndarray
        The tf-idf vector.
    norm_scheme : {'n', 'c'}
        Document length normalization scheme.
    return_norm : bool, optional
        Return the length of `x` as well?

    Returns
    -------
    numpy.ndarray
        Normalized array.
    float (only if return_norm is set)
        Norm of `x`.
    """
    if norm_scheme == "n":
        if return_norm:
            _, length = matutils.unitvec(x, return_norm=return_norm)
            return x, length
        else:
            return x
    elif norm_scheme == "c":
        return matutils.unitvec(x, return_norm=return_norm)


class TfidfModel(interfaces.TransformationABC):
    """Objects of this class realize the transformation from a word-document co-occurrence matrix (integer counts)
    into a locally/globally weighted TF-IDF matrix (positive floats).

    Examples
    --------
    .. sourcecode:: pycon

        >>> import gensim.downloader as api
        >>> from gensim.models import TfidfModel
        >>> from gensim.corpora import Dictionary
        >>>
        >>> dataset = api.load("text8")
        >>> dct = Dictionary(dataset)  # fit dictionary
        >>> corpus = [dct.doc2bow(line) for line in dataset]  # convert corpus to BoW format
        >>>
        >>> model = TfidfModel(corpus)  # fit model
        >>> vector = model[corpus[0]]  # apply model to the first corpus document

    """
    def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.identity,
                 wglobal=df2idf, normalize=True, smartirs=None, pivot=None, slope=0.25):
        r"""Compute TF-IDF by multiplying a local component (term frequency) with a global component
        (inverse document frequency), and normalizing the resulting documents to unit length.
        Formula for non-normalized weight of term :math:`i` in document :math:`j` in a corpus of :math:`D` documents:

        .. math:: weight_{i,j} = frequency_{i,j} * \log_2 \frac{D}{document\_freq_{i}}

        or, more generally

        .. math:: weight_{i,j} = wlocal(frequency_{i,j}) * wglobal(document\_freq_{i}, D)

        so you can plug in your own custom :math:`wlocal` and :math:`wglobal` functions.

        Parameters
        ----------
        corpus : iterable of iterable of (int, int), optional
            Input corpus.
        id2word : {dict, :class:`~gensim.corpora.Dictionary`}, optional
            Mapping from tokens to ids that was used for converting input data to the bag-of-words format.
        dictionary : :class:`~gensim.corpora.Dictionary`, optional
            If `dictionary` is specified, it must be a `corpora.Dictionary` object and it will be used
            to directly construct the inverse document frequency mapping (then `corpus`, if specified, is ignored).
        wlocal : callable, optional
            Function for local weighting, default is :func:`~gensim.utils.identity`
            (other options: :func:`numpy.sqrt`, `lambda tf: 0.5 + (0.5 * tf / tf.max())`, etc.).
        wglobal : callable, optional
            Function for global weighting, default is :func:`~gensim.models.tfidfmodel.df2idf`.
        normalize : {bool, callable}, optional
            Normalize document vectors to unit Euclidean length? You can also inject your own function into `normalize`.
        smartirs : str, optional
            SMART (System for the Mechanical Analysis and Retrieval of Text) Information Retrieval System,
            a mnemonic scheme for denoting tf-idf weighting variants in the vector space model.
            The mnemonic for representing a combination of weights takes the form XYZ,
            for example 'ntc', 'bpn' and so on, where the letters represent the term weighting of the document vector.

            Term frequency weighting:
                * `b` - binary,
                * `t` or `n` - raw,
                * `a` - augmented,
                * `l` - logarithm,
                * `d` - double logarithm,
                * `L` - log average.

            Document frequency weighting:
                * `x` or `n` - none,
                * `f` - idf,
                * `t` - zero-corrected idf,
                * `p` - probabilistic idf.

            Document normalization:
                * `x` or `n` - none,
                * `c` - cosine,
                * `u` - pivoted unique,
                * `b` - pivoted character length.

            Default is None, which behaves like the 'nfc' scheme.
            For more information visit `SMART Information Retrieval System
            <https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System>`_.
        pivot : float or None, optional
            In information retrieval, TF-IDF is biased against long documents [1]_. Pivoted document length
            normalization solves this problem by changing the norm of a document to `slope * old_norm + (1.0 -
            slope) * pivot`.

            You can either set the `pivot` by hand, or you can let Gensim figure it out automatically with the following
            two steps:

                * Set either the `u` or `b` document normalization in the `smartirs` parameter.
                * Set either the `corpus` or `dictionary` parameter. The `pivot` will be automatically determined from
                  the properties of the `corpus` or `dictionary`.

            If `pivot` is None and you don't follow steps 1 and 2, then pivoted document length normalization will be
            disabled. Default is None.

            See also the blog post at https://rare-technologies.com/pivoted-document-length-normalisation/.
        slope : float, optional
            In information retrieval, TF-IDF is biased against long documents [1]_. Pivoted document length
            normalization solves this problem by changing the norm of a document to `slope * old_norm + (1.0 -
            slope) * pivot`.

            Setting the `slope` to 0.0 uses only the `pivot` as the norm, and setting the `slope` to 1.0 effectively
            disables pivoted document length normalization. Singhal [2]_ suggests setting the `slope` between 0.2 and
            0.3 for best results. Default is 0.25.

            See also the blog post at https://rare-technologies.com/pivoted-document-length-normalisation/.

        See Also
        --------
        ~gensim.sklearn_api.tfidf.TfIdfTransformer : Class that also uses the SMART scheme.
        resolve_weights : Function that also uses the SMART scheme.

        References
        ----------
        .. [1] Singhal, A., Buckley, C., & Mitra, M. (1996). `Pivoted Document Length
           Normalization <http://singhal.info/pivoted-dln.pdf>`_. *SIGIR Forum*, 51, 176–184.
        .. [2] Singhal, A. (2001). `Modern information retrieval: A brief overview <http://singhal.info/ieee2001.pdf>`_.
           *IEEE Data Eng. Bull.*, 24(4), 35–43.

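        Examples
        --------
        A small, self-contained sketch (the toy corpus below is invented purely for illustration)
        showing an explicit SMART scheme and an automatically determined pivot:

        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>> from gensim.models import TfidfModel
            >>>
            >>> docs = [["cat", "sat", "mat"], ["dog", "sat", "log"], ["cat", "dog"]]
            >>> dct = Dictionary(docs)
            >>> corpus = [dct.doc2bow(doc) for doc in docs]
            >>>
            >>> model = TfidfModel(corpus, smartirs='ntc')  # raw tf, plain idf, cosine normalization
            >>> pivoted = TfidfModel(dictionary=dct, smartirs='Ltu', slope=0.3)  # pivot taken from `dct`
            >>> vector = model[corpus[0]]
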
        """
        self.id2word = id2word
        self.wlocal, self.wglobal, self.normalize = wlocal, wglobal, normalize
        self.num_docs, self.num_nnz, self.idfs = None, None, None
        self.smartirs = resolve_weights(smartirs) if smartirs is not None else None
        self.slope = slope
        self.pivot = pivot
        self.eps = 1e-12

        if smartirs:
            n_tf, n_df, n_n = self.smartirs
            self.wlocal = partial(smartirs_wlocal, local_scheme=n_tf)
            self.wglobal = partial(smartirs_wglobal, global_scheme=n_df)

        if dictionary:
            # user supplied a Dictionary object, which already contains all the
            # statistics we need to construct the IDF mapping. we can skip the
            # step that goes through the corpus (= an optimization).
            if corpus:
                logger.warning(
                    "constructor received both corpus and explicit inverse document frequencies; ignoring the corpus"
                )
            self.num_docs, self.num_nnz = dictionary.num_docs, dictionary.num_nnz
            self.cfs = dictionary.cfs.copy()
            self.dfs = dictionary.dfs.copy()
            self.term_lens = {termid: len(term) for termid, term in dictionary.items()}
            self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
            if not id2word:
                self.id2word = dictionary
        elif corpus:
            self.initialize(corpus)
        else:
            # NOTE: everything is left uninitialized; presumably the model will
            # be initialized in some other way
            pass

        # If smartirs is set and no explicit pivot was given, derive the pivot from the corpus/dictionary statistics
        if not smartirs:
            return
        if self.pivot is not None:
            if n_n in 'ub':
                logger.warning("constructor received pivot; ignoring smartirs[2]")
            return
        if n_n in 'ub' and callable(self.normalize):
            logger.warning("constructor received smartirs; ignoring normalize")
        if n_n in 'ub' and not dictionary and not corpus:
            logger.warning("constructor received no corpus or dictionary; ignoring smartirs[2]")
        elif n_n == "u":
            self.pivot = 1.0 * self.num_nnz / self.num_docs
        elif n_n == "b":
            self.pivot = 1.0 * sum(
                self.cfs[termid] * (self.term_lens[termid] + 1.0) for termid in dictionary.keys()
            ) / self.num_docs

    @classmethod
    def load(cls, *args, **kwargs):
        """Load a previously saved TfidfModel object. Handles backwards compatibility with
        older TfidfModel versions which did not use pivoted document normalization.

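        Examples
        --------
        A minimal sketch of a save/load round trip; `model` is assumed to be an already trained
        :class:`~gensim.models.tfidfmodel.TfidfModel` and the file name is just an illustration:

        .. sourcecode:: pycon

            >>> from gensim.models import TfidfModel
            >>> from gensim.test.utils import get_tmpfile
            >>>
            >>> tmp_fname = get_tmpfile("tfidf.model")
            >>> model.save(tmp_fname)
            >>> loaded = TfidfModel.load(tmp_fname)
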
        """
        model = super(TfidfModel, cls).load(*args, **kwargs)
        if not hasattr(model, 'pivot'):
            model.pivot = None
            logger.info('older version of %s loaded without pivot arg', cls.__name__)
            logger.info('Setting pivot to %s.', model.pivot)
        if not hasattr(model, 'slope'):
            model.slope = 0.65
            logger.info('older version of %s loaded without slope arg', cls.__name__)
            logger.info('Setting slope to %s.', model.slope)
        if not hasattr(model, 'smartirs'):
            model.smartirs = None
            logger.info('older version of %s loaded without smartirs arg', cls.__name__)
            logger.info('Setting smartirs to %s.', model.smartirs)
        return model

    def __str__(self):
        return "TfidfModel(num_docs=%s, num_nnz=%s)" % (self.num_docs, self.num_nnz)

    def initialize(self, corpus):
        """Compute inverse document weights, which will be used to modify term frequencies for documents.

        Parameters
        ----------
        corpus : iterable of iterable of (int, int)
            Input corpus.

        """
        logger.info("collecting document frequencies")
        dfs = {}
        numnnz, docno = 0, -1

        for docno, bow in enumerate(corpus):
            if docno % 10000 == 0:
                logger.info("PROGRESS: processing document #%i", docno)
            numnnz += len(bow)
            for termid, _ in bow:
                dfs[termid] = dfs.get(termid, 0) + 1
        # keep some stats about the training corpus
        self.num_docs = docno + 1
        self.num_nnz = numnnz
        self.cfs = None
        self.dfs = dfs
        self.term_lens = None
        # and finally compute the idf weights
        self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
        self.add_lifecycle_event(
            "initialize",
            msg=(
                f"calculated IDF weights for {self.num_docs} documents and {max(dfs.keys()) + 1 if dfs else 0}"
                f" features ({self.num_nnz} matrix non-zeros)"
            ),
        )

    def __getitem__(self, bow, eps=1e-12):
        """Get the tf-idf representation of an input vector and/or corpus.

        Parameters
        ----------
        bow : {list of (int, int), iterable of iterable of (int, int)}
            Input document in the `sparse Gensim bag-of-words format
            <https://radimrehurek.com/gensim/intro.html#core-concepts>`_,
            or a streamed corpus of such documents.
        eps : float
            Threshold value; all entries with a tf-idf value less than `eps` will be removed.

        Returns
        -------
        vector : list of (int, float)
            TfIdf vector, if `bow` is a single document.
        :class:`~gensim.interfaces.TransformedCorpus`
            TfIdf corpus, if `bow` is a corpus.

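        Examples
        --------
        A minimal sketch (toy corpus invented for illustration) of transforming one document and
        then a whole corpus; term ids with zero idf are dropped from the output:

        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>> from gensim.models import TfidfModel
            >>>
            >>> docs = [["a", "b", "b"], ["b", "c"]]
            >>> dct = Dictionary(docs)
            >>> corpus = [dct.doc2bow(doc) for doc in docs]
            >>> model = TfidfModel(corpus)
            >>>
            >>> vector = model[corpus[0]]  # list of (term_id, tfidf_weight) pairs
            >>> transformed_corpus = model[corpus]  # a TransformedCorpus, evaluated lazily
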
        """
        self.eps = eps
        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
        # as strict application of the IDF formula would dictate)

        termid_array, tf_array = [], []
        for termid, tf in bow:
            termid_array.append(termid)
            tf_array.append(tf)

        tf_array = self.wlocal(np.array(tf_array))

        vector = [
            (termid, tf * self.idfs.get(termid))
            for termid, tf in zip(termid_array, tf_array) if abs(self.idfs.get(termid, 0.0)) > self.eps
        ]

        # and finally, normalize the vector either to unit length, or use a
        # user-defined normalization function
        if self.smartirs:
            n_n = self.smartirs[2]
            if n_n == "n" or (n_n in 'ub' and self.pivot is None):
                if self.pivot is not None:
                    _, old_norm = matutils.unitvec(vector, return_norm=True)
                norm_vector = vector
            elif n_n == "c":
                if self.pivot is not None:
                    _, old_norm = matutils.unitvec(vector, return_norm=True)
                else:
                    norm_vector = matutils.unitvec(vector)
            elif n_n == "u":
                _, old_norm = matutils.unitvec(vector, return_norm=True, norm='unique')
            elif n_n == "b":
                old_norm = sum(freq * (self.term_lens[termid] + 1.0) for termid, freq in bow)
        else:
            if self.normalize is True:
                self.normalize = matutils.unitvec
            elif self.normalize is False:
                self.normalize = utils.identity

            if self.pivot is not None:
                _, old_norm = self.normalize(vector, return_norm=True)
            else:
                norm_vector = self.normalize(vector)

        if self.pivot is None:
            norm_vector = [(termid, weight) for termid, weight in norm_vector if abs(weight) > self.eps]
        else:
            pivoted_norm = (1 - self.slope) * self.pivot + self.slope * old_norm
            norm_vector = [
                (termid, weight / float(pivoted_norm))
                for termid, weight in vector
                if abs(weight / float(pivoted_norm)) > self.eps
            ]
        return norm_vector
