#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2013 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

r"""This module contains functions to compute confirmation on a pair of words or word subsets.

Notes
-----
The advantage of indirect confirmation measures is that they compute the similarity of words in :math:`W'` and
:math:`W^{*}` with respect to direct confirmations to all words. E.g., suppose `x` and `z` are both competing
brands of cars, which semantically support each other. However, both brands are seldom mentioned
together in documents in the reference corpus, while their confirmations to other words like “road”
or “speed” do strongly correlate. This would be reflected by an indirect confirmation measure.
Thus, indirect confirmation measures may capture semantic support that direct measures would miss.

The formula used to compute the indirect confirmation measure is

.. math::

    \widetilde{m}_{sim(m, \gamma)}(W', W^{*}) = s_{sim}(\vec{v}^{\,}_{m,\gamma}(W'), \vec{v}^{\,}_{m,\gamma}(W^{*}))

where :math:`s_{sim}` can be cosine, Dice or Jaccard similarity and

.. math::

    \vec{v}^{\,}_{m,\gamma}(W') = \Bigg \{ \sum_{w_{i} \in W'} m(w_{i}, w_{j})^{\gamma} \Bigg \}_{j = 1,...,|W|}
"""

import itertools
import logging

import numpy as np
import scipy.sparse as sps

from gensim.topic_coherence.direct_confirmation_measure import aggregate_segment_sims, log_ratio_measure

logger = logging.getLogger(__name__)


def word2vec_similarity(segmented_topics, accumulator, with_std=False, with_support=False):
    """For each topic segmentation, compute average cosine similarity using a
    :class:`~gensim.topic_coherence.text_analysis.WordVectorsAccumulator`.

    Parameters
    ----------
    segmented_topics : list of lists of (int, `numpy.ndarray`)
        Output from :func:`~gensim.topic_coherence.segmentation.s_one_set`.
    accumulator : :class:`~gensim.topic_coherence.text_analysis.WordVectorsAccumulator` or
                  :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator`
        Word occurrence accumulator.
    with_std : bool, optional
        True to also include standard deviation across topic segment sets
        in addition to the mean coherence for each topic.
    with_support : bool, optional
        True to also include support across topic segments. The support is defined as
        the number of pairwise similarity comparisons used to compute the overall topic coherence.

    Returns
    -------
    list of (float[, float[, int]])
        Cosine word2vec similarities per topic (with std/support if `with_std`/`with_support` is set).

    Examples
    --------
    .. sourcecode:: pycon

        >>> import numpy as np
        >>> from gensim.corpora.dictionary import Dictionary
        >>> from gensim.topic_coherence import indirect_confirmation_measure
        >>> from gensim.topic_coherence import text_analysis
        >>>
        >>> # create segmentation
        >>> segmentation = [[(1, np.array([1, 2])), (2, np.array([1, 2]))]]
        >>>
        >>> # create accumulator
        >>> dictionary = Dictionary()
        >>> dictionary.id2token = {1: 'fake', 2: 'tokens'}
        >>> accumulator = text_analysis.WordVectorsAccumulator({1, 2}, dictionary)
        >>> _ = accumulator.accumulate([['fake', 'tokens'], ['tokens', 'fake']], 5)
        >>>
        >>> # should be approximately (0.726752426218, 0.00695475919227)
        >>> mean, std = indirect_confirmation_measure.word2vec_similarity(segmentation, accumulator, with_std=True)[0]

    """
    topic_coherences = []
    total_oov = 0

    for topic_index, topic_segments in enumerate(segmented_topics):
        segment_sims = []
        num_oov = 0
        for w_prime, w_star in topic_segments:
            # a segment may be a single word id or an iterable of ids; normalize to lists
            if not hasattr(w_prime, '__iter__'):
                w_prime = [w_prime]
            if not hasattr(w_star, '__iter__'):
                w_star = [w_star]

            try:
                segment_sims.append(accumulator.ids_similarity(w_prime, w_star))
            except ZeroDivisionError:
                # the segment's terms are missing from the word2vec vocabulary
                num_oov += 1

        if num_oov > 0:
            total_oov += 1
            logger.warning(
                "%d segments for topic %d contain terms not in the word2vec model vocabulary",
                num_oov, topic_index)
        topic_coherences.append(aggregate_segment_sims(segment_sims, with_std, with_support))

    if total_oov > 0:
        logger.warning("%d topics contain terms not in the word2vec model vocabulary", total_oov)
    return topic_coherences


def cosine_similarity(segmented_topics, accumulator, topics, measure='nlr',
                      gamma=1, with_std=False, with_support=False):
    """Calculate the indirect cosine measure.

    Parameters
    ----------
    segmented_topics : list of lists of (int, `numpy.ndarray`)
        Output from the segmentation module of the segmented topics.
    accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator`
        Output from the probability_estimation module.
    topics : list of numpy.ndarray
        Topics obtained from the trained topic model.
    measure : str, optional
        Direct confirmation measure to be used. The only supported value is "nlr" (normalized log ratio).
    gamma : float, optional
        Gamma value for computing the :math:`W'` and :math:`W^{*}` vectors.
    with_std : bool, optional
        True to also include standard deviation across topic segment sets in addition to the mean coherence
        for each topic; default is False.
    with_support : bool, optional
        True to also include support across topic segments. The support is defined as the number of pairwise
        similarity comparisons used to compute the overall topic coherence.

    Returns
    -------
    list
        List of indirect cosine similarity measures, one for each topic.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.corpora.dictionary import Dictionary
        >>> from gensim.topic_coherence import indirect_confirmation_measure, text_analysis
        >>> import numpy as np
        >>>
        >>> # create accumulator
        >>> dictionary = Dictionary()
        >>> dictionary.id2token = {1: 'fake', 2: 'tokens'}
        >>> accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary)
        >>> accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}}
        >>> accumulator._num_docs = 5
        >>>
        >>> # create topics
        >>> topics = [np.array([1, 2])]
        >>>
        >>> # create segmentation
        >>> segmentation = [[(1, np.array([1, 2])), (2, np.array([1, 2]))]]
        >>> obtained = indirect_confirmation_measure.cosine_similarity(segmentation, accumulator, topics, 'nlr', 1)
        >>> print(obtained[0])
        0.623018926945

    """
    context_vectors = ContextVectorComputer(measure, topics, accumulator, gamma)

    topic_coherences = []
    for topic_words, topic_segments in zip(topics, segmented_topics):
        topic_words = tuple(topic_words)  # because tuples are hashable
        segment_sims = np.zeros(len(topic_segments))
        for i, (w_prime, w_star) in enumerate(topic_segments):
            w_prime_cv = context_vectors[w_prime, topic_words]
            w_star_cv = context_vectors[w_star, topic_words]
            segment_sims[i] = _cossim(w_prime_cv, w_star_cv)

        topic_coherences.append(aggregate_segment_sims(segment_sims, with_std, with_support))

    return topic_coherences


class ContextVectorComputer:
    """Lazily compute context vectors for topic segments.

    Parameters
    ----------
    measure : str
        Confirmation measure.
    topics : list of numpy.ndarray
        Topics.
    accumulator : :class:`~gensim.topic_coherence.text_analysis.WordVectorsAccumulator` or
                  :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator`
        Word occurrence accumulator from probability_estimation.
    gamma : float
        Value for computing vectors.

    Attributes
    ----------
    sim_cache : dict
        Cache of similarities between tokens (pairs of word ids), e.g. (1, 2).
    context_vector_cache : dict
        Mapping from (segment, topic_words) --> context_vector.

    Example
    -------
    .. sourcecode:: pycon

        >>> from gensim.corpora.dictionary import Dictionary
        >>> from gensim.topic_coherence import indirect_confirmation_measure, text_analysis
        >>> import numpy as np
        >>>
        >>> # create measure, topics
        >>> measure = 'nlr'
        >>> topics = [np.array([1, 2])]
        >>>
        >>> # create accumulator
        >>> dictionary = Dictionary()
        >>> dictionary.id2token = {1: 'fake', 2: 'tokens'}
        >>> accumulator = text_analysis.WordVectorsAccumulator({1, 2}, dictionary)
        >>> _ = accumulator.accumulate([['fake', 'tokens'], ['tokens', 'fake']], 5)
        >>> cont_vect_comp = indirect_confirmation_measure.ContextVectorComputer(measure, topics, accumulator, 1)
        >>> cont_vect_comp.mapping
        {1: 0, 2: 1}
        >>> cont_vect_comp.vocab_size
        2

    """

    def __init__(self, measure, topics, accumulator, gamma):
        if measure == 'nlr':
            self.similarity = _pair_npmi
        else:
            raise ValueError(
                "The direct confirmation measure you entered is not currently supported.")

        self.mapping = _map_to_contiguous(topics)
        self.vocab_size = len(self.mapping)
        self.accumulator = accumulator
        self.gamma = gamma
        self.sim_cache = {}
        self.context_vector_cache = {}

    def __getitem__(self, idx):
        """Forward `computer[segment_word_ids, topic_word_ids]` to :meth:`compute_context_vector`."""
        return self.compute_context_vector(*idx)

    def compute_context_vector(self, segment_word_ids, topic_word_ids):
        """Return the context vector for (segment_word_ids, topic_word_ids), computing and caching it if necessary.

        Parameters
        ----------
        segment_word_ids : list
            Ids of words in segment.
        topic_word_ids : list
            Ids of words in topic.

        Returns
        -------
        :class:`scipy.sparse.csr_matrix`
            The cached context vector if one exists for this key, otherwise a freshly
            computed (and newly cached) one.
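
        Examples
        --------
        A toy run, reusing the hypothetical inverted-index setup from the
        :func:`cosine_similarity` docstring; the second lookup is served from the cache:

        .. sourcecode:: pycon

            >>> from gensim.corpora.dictionary import Dictionary
            >>> from gensim.topic_coherence import indirect_confirmation_measure, text_analysis
            >>> import numpy as np
            >>>
            >>> dictionary = Dictionary()
            >>> dictionary.id2token = {1: 'fake', 2: 'tokens'}
            >>> accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary)
            >>> accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}}
            >>> accumulator._num_docs = 5
            >>> topics = [np.array([1, 2])]
            >>>
            >>> cvc = indirect_confirmation_measure.ContextVectorComputer('nlr', topics, accumulator, 1)
            >>> cv = cvc[(1,), (1, 2)]  # context vector of segment {1} w.r.t. topic words (1, 2)
            >>> cv.shape  # one entry per topic word
            (2, 1)
            >>> cvc[(1,), (1, 2)] is cv  # same key again: returned straight from the cache
            True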

        """
        key = _key_for_segment(segment_word_ids, topic_word_ids)
        context_vector = self.context_vector_cache.get(key, None)
        if context_vector is None:
            context_vector = self._make_seg(segment_word_ids, topic_word_ids)
            self.context_vector_cache[key] = context_vector
        return context_vector

    def _make_seg(self, segment_word_ids, topic_word_ids):
        """Build the context vector for a segment (internal helper function).

        Parameters
        ----------
        segment_word_ids : iterable of int or int
            Ids of words in segment.
        topic_word_ids : list
            Ids of words in topic.

        Returns
        -------
        :class:`scipy.sparse.csr_matrix`
            Context vector as a matrix in Compressed Sparse Row format.

        """
        context_vector = sps.lil_matrix((self.vocab_size, 1))
        if not hasattr(segment_word_ids, '__iter__'):
            segment_word_ids = (segment_word_ids,)

        # entry j of the vector accumulates m(w_i, w_j)^gamma over all w_i in the segment
        for w_j in topic_word_ids:
            idx = (self.mapping[w_j], 0)
            for pair in (tuple(sorted((w_i, w_j))) for w_i in segment_word_ids):
                if pair not in self.sim_cache:
                    self.sim_cache[pair] = self.similarity(pair, self.accumulator)

                context_vector[idx] += self.sim_cache[pair] ** self.gamma

        return context_vector.tocsr()


def _pair_npmi(pair, accumulator):
    r"""Compute normalized pointwise mutual information (**NPMI**) between a pair of words.

    Parameters
    ----------
    pair : (int, int)
        The pair of words (word_id1, word_id2).
    accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator`
        Word occurrence accumulator from probability_estimation.

    Returns
    -------
    float
        NPMI between the pair of words.
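
    Notes
    -----
    This delegates to :func:`~gensim.topic_coherence.direct_confirmation_measure.log_ratio_measure`
    with ``normalize=True``. As a sketch of what that computes, with :math:`\epsilon` a small
    smoothing constant that avoids taking the logarithm of zero:

    .. math::

        NPMI(w_i, w_j) = \frac{\log \frac{P(w_i, w_j) + \epsilon}{P(w_i) P(w_j)}}{-\log(P(w_i, w_j) + \epsilon)}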

    """
    return log_ratio_measure([[pair]], accumulator, True)[0]


def _cossim(cv1, cv2):
    """Cosine similarity between two sparse column vectors."""
    return cv1.T.dot(cv2)[0, 0] / (_magnitude(cv1) * _magnitude(cv2))


def _magnitude(sparse_vec):
    """Euclidean (L2) norm of a sparse vector."""
    return np.sqrt(np.sum(sparse_vec.data ** 2))


def _map_to_contiguous(ids_iterable):
    """Map each unique id from `ids_iterable` to a contiguous index, starting at 0."""
    uniq_ids = {}
    n = 0
    for id_ in itertools.chain.from_iterable(ids_iterable):
        if id_ not in uniq_ids:
            uniq_ids[id_] = n
            n += 1
    return uniq_ids


def _key_for_segment(segment, topic_words):
    """A segment may be a single number or an iterable of them."""
    segment_key = tuple(segment) if hasattr(segment, '__iter__') else segment
    return segment_key, topic_words