#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2013 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

r"""This module contains functions to compute confirmation on a pair of words or word subsets.

Notes
-----
The advantage of indirect confirmation measure is that it computes similarity of words in :math:`W'` and
:math:`W^{*}` with respect to direct confirmations to all words. E.g. suppose `x` and `z` are both competing
brands of cars, which semantically support each other. However, both brands are seldom mentioned
together in documents in the reference corpus. But their confirmations to other words like “road”
or “speed” do strongly correlate. This would be reflected by an indirect confirmation measure.
Thus, indirect confirmation measures may capture semantic support that direct measures would miss.

The formula used to compute indirect confirmation measure is

.. math::

    \widetilde{m}_{sim(m, \gamma)}(W', W^{*}) = s_{sim}(\vec{v}^{\,}_{m,\gamma}(W'), \vec{v}^{\,}_{m,\gamma}(W^{*}))


where :math:`s_{sim}` can be cosine, dice or jaccard similarity and

.. math::

    \vec{v}^{\,}_{m,\gamma}(W') = \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|}

"""

import itertools
import logging

import numpy as np
import scipy.sparse as sps

from gensim.topic_coherence.direct_confirmation_measure import aggregate_segment_sims, log_ratio_measure

logger = logging.getLogger(__name__)


def word2vec_similarity(segmented_topics, accumulator, with_std=False, with_support=False):
    """For each topic segmentation, compute average cosine similarity using a
    :class:`~gensim.topic_coherence.text_analysis.WordVectorsAccumulator`.

    Parameters
    ----------
    segmented_topics : list of lists of (int, `numpy.ndarray`)
        Output from the :func:`~gensim.topic_coherence.segmentation.s_one_set`.
    accumulator : :class:`~gensim.topic_coherence.text_analysis.WordVectorsAccumulator` or
                  :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator`
        Word occurrence accumulator.
    with_std : bool, optional
        True to also include standard deviation across topic segment sets
        in addition to the mean coherence for each topic.
    with_support : bool, optional
        True to also include support across topic segments. The support is defined as
        the number of pairwise similarity comparisons that were used to compute the overall topic coherence.

    Returns
    -------
    list of (float[, float[, int]])
        Cosine word2vec similarities per topic (with std/support if `with_std`, `with_support`).

    Examples
    --------
    .. sourcecode:: pycon

        >>> import numpy as np
        >>> from gensim.corpora.dictionary import Dictionary
        >>> from gensim.topic_coherence import indirect_confirmation_measure
        >>> from gensim.topic_coherence import text_analysis
        >>>
        >>> # create segmentation
        >>> segmentation = [[(1, np.array([1, 2])), (2, np.array([1, 2]))]]
        >>>
        >>> # create accumulator
        >>> dictionary = Dictionary()
        >>> dictionary.id2token = {1: 'fake', 2: 'tokens'}
        >>> accumulator = text_analysis.WordVectorsAccumulator({1, 2}, dictionary)
        >>> _ = accumulator.accumulate([['fake', 'tokens'], ['tokens', 'fake']], 5)
        >>>
        >>> # should be (0.726752426218 0.00695475919227)
        >>> mean, std = indirect_confirmation_measure.word2vec_similarity(segmentation, accumulator, with_std=True)[0]

    """
    topic_coherences = []
    # Number of topics for which at least one segment pair could not be scored.
    topics_with_oov = 0

    for topic_index, topic_segments in enumerate(segmented_topics):
        segment_sims = []
        num_oov = 0
        for w_prime, w_star in topic_segments:
            # Each side of a segment may be a single word id or a collection of ids;
            # normalise both to iterables before asking for their similarity.
            if not hasattr(w_prime, '__iter__'):
                w_prime = [w_prime]
            if not hasattr(w_star, '__iter__'):
                w_star = [w_star]

            try:
                segment_sims.append(accumulator.ids_similarity(w_prime, w_star))
            except ZeroDivisionError:
                # A ZeroDivisionError from the accumulator is treated as an
                # out-of-vocabulary segment: it is counted and left out of the mean.
                num_oov += 1

        if num_oov > 0:
            topics_with_oov += 1
            logger.warning(
                "%d terms for topic %d are not in word2vec model vocabulary",
                num_oov, topic_index)
        topic_coherences.append(aggregate_segment_sims(segment_sims, with_std, with_support))

    if topics_with_oov > 0:
        # The counter is incremented once per affected topic, so report it as a topic count.
        logger.warning(
            "%d topics contain terms that are not in word2vec model vocabulary", topics_with_oov)
    return topic_coherences


def cosine_similarity(segmented_topics, accumulator, topics, measure='nlr',
                      gamma=1, with_std=False, with_support=False):
    """Calculate the indirect cosine measure.

    Parameters
    ----------
    segmented_topics: list of lists of (int, `numpy.ndarray`)
        Output from the segmentation module of the segmented topics.
    accumulator: :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator`
        Output from the probability_estimation module.
    topics: list of `numpy.ndarray`
        Topics obtained from the trained topic model.
    measure : str, optional
        Direct confirmation measure to be used. Supported values are "nlr" (normalized log ratio).
    gamma: float, optional
        Gamma value for computing :math:`W'` and :math:`W^{*}` vectors.
    with_std : bool
        True to also include standard deviation across topic segment sets in addition to the mean coherence
        for each topic; default is False.
    with_support : bool
        True to also include support across topic segments. The support is defined as the number of pairwise
        similarity comparisons that were used to compute the overall topic coherence.

    Returns
    -------
    list
        List of indirect cosine similarity measure for each topic.

    Raises
    ------
    ValueError
        If `measure` is not a supported direct confirmation measure.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.corpora.dictionary import Dictionary
        >>> from gensim.topic_coherence import indirect_confirmation_measure, text_analysis
        >>> import numpy as np
        >>>
        >>> # create accumulator
        >>> dictionary = Dictionary()
        >>> dictionary.id2token = {1: 'fake', 2: 'tokens'}
        >>> accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary)
        >>> accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}}
        >>> accumulator._num_docs = 5
        >>>
        >>> # create topics
        >>> topics = [np.array([1, 2])]
        >>>
        >>> # create segmentation
        >>> segmentation = [[(1, np.array([1, 2])), (2, np.array([1, 2]))]]
        >>> obtained = indirect_confirmation_measure.cosine_similarity(segmentation, accumulator, topics, 'nlr', 1)
        >>> print(obtained[0])
        0.623018926945

    """
    context_vectors = ContextVectorComputer(measure, topics, accumulator, gamma)

    topic_coherences = []
    for topic_words, topic_segments in zip(topics, segmented_topics):
        topic_words = tuple(topic_words)  # because tuples are hashable
        segment_sims = np.zeros(len(topic_segments))
        for i, (w_prime, w_star) in enumerate(topic_segments):
            w_prime_cv = context_vectors[w_prime, topic_words]
            w_star_cv = context_vectors[w_star, topic_words]
            segment_sims[i] = _cossim(w_prime_cv, w_star_cv)

        topic_coherences.append(aggregate_segment_sims(segment_sims, with_std, with_support))

    return topic_coherences


class ContextVectorComputer:
    """Lazily compute context vectors for topic segments.

    Parameters
    ----------
    measure: str
        Confirmation measure.
    topics: list of numpy.array
        Topics.
    accumulator : :class:`~gensim.topic_coherence.text_analysis.WordVectorsAccumulator` or
                  :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator`
        Word occurrence accumulator from probability_estimation.
    gamma: float
        Value for computing vectors.

    Attributes
    ----------
    sim_cache: dict
        Cache similarities between tokens (pairs of word ids), e.g. (1, 2).
    context_vector_cache: dict
        Mapping from (segment, topic_words) --> context_vector.

    Example
    -------
    .. sourcecode:: pycon

        >>> from gensim.corpora.dictionary import Dictionary
        >>> from gensim.topic_coherence import indirect_confirmation_measure, text_analysis
        >>> import numpy as np
        >>>
        >>> # create measure, topics
        >>> measure = 'nlr'
        >>> topics = [np.array([1, 2])]
        >>>
        >>> # create accumulator
        >>> dictionary = Dictionary()
        >>> dictionary.id2token = {1: 'fake', 2: 'tokens'}
        >>> accumulator = text_analysis.WordVectorsAccumulator({1, 2}, dictionary)
        >>> _ = accumulator.accumulate([['fake', 'tokens'], ['tokens', 'fake']], 5)
        >>> cont_vect_comp = indirect_confirmation_measure.ContextVectorComputer(measure, topics, accumulator, 1)
        >>> cont_vect_comp.mapping
        {1: 0, 2: 1}
        >>> cont_vect_comp.vocab_size
        2

    """

    def __init__(self, measure, topics, accumulator, gamma):

        if measure == 'nlr':
            self.similarity = _pair_npmi
        else:
            raise ValueError(
                "The direct confirmation measure you entered is not currently supported.")

        # Word ids across all topics are remapped to 0..vocab_size-1 so they can
        # be used as row indices of the context vectors.
        self.mapping = _map_to_contiguous(topics)
        self.vocab_size = len(self.mapping)
        self.accumulator = accumulator
        self.gamma = gamma
        self.sim_cache = {}
        self.context_vector_cache = {}

    def __getitem__(self, idx):
        """Return the context vector for ``idx = (segment_word_ids, topic_word_ids)``."""
        return self.compute_context_vector(*idx)

    def compute_context_vector(self, segment_word_ids, topic_word_ids):
        """Check if (segment_word_ids, topic_word_ids) context vector has been cached.

        Parameters
        ----------
        segment_word_ids: list
            Ids of words in segment.
        topic_word_ids: list
            Ids of words in topic.

        Returns
        -------
        csr_matrix :class:`~scipy.sparse.csr`
            If context vector has been cached, then return corresponding context vector,
            else compute, cache, and return.

        """
        # The topic ids become part of a dict key, so they must be hashable;
        # converting here lets callers pass a list or array as documented.
        topic_word_ids = tuple(topic_word_ids)
        key = _key_for_segment(segment_word_ids, topic_word_ids)
        context_vector = self.context_vector_cache.get(key, None)
        if context_vector is None:
            context_vector = self._make_seg(segment_word_ids, topic_word_ids)
            self.context_vector_cache[key] = context_vector
        return context_vector

    def _make_seg(self, segment_word_ids, topic_word_ids):
        """Return context vectors for segmentation (Internal helper function).

        Parameters
        ----------
        segment_word_ids : iterable or int
            Ids of words in segment.
        topic_word_ids : list
            Ids of words in topic.

        Returns
        -------
        csr_matrix :class:`~scipy.sparse.csr`
            Matrix in Compressed Sparse Row format

        """
        context_vector = sps.lil_matrix((self.vocab_size, 1))
        if not hasattr(segment_word_ids, '__iter__'):
            segment_word_ids = (segment_word_ids,)

        for w_j in topic_word_ids:
            idx = (self.mapping[w_j], 0)
            # Pairs are sorted so that (a, b) and (b, a) share one cache entry.
            for pair in (tuple(sorted((w_i, w_j))) for w_i in segment_word_ids):
                if pair not in self.sim_cache:
                    self.sim_cache[pair] = self.similarity(pair, self.accumulator)

                context_vector[idx] += self.sim_cache[pair] ** self.gamma

        return context_vector.tocsr()


def _pair_npmi(pair, accumulator):
    """Compute normalized pairwise mutual information (**NPMI**) between a pair of words.

    Parameters
    ----------
    pair : (int, int)
        The pair of words (word_id1, word_id2).
    accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator`
        Word occurrence accumulator from probability_estimation.

    Returns
    -------
    float
        NPMI between a pair of words.

    """
    return log_ratio_measure([[pair]], accumulator, True)[0]


def _cossim(cv1, cv2):
    """Return the cosine similarity of two sparse column vectors.

    The dot product is divided by the product of the two magnitudes. The
    division is not guarded, so a vector of zero magnitude yields a
    division by zero.

    """
    return cv1.T.dot(cv2)[0, 0] / (_magnitude(cv1) * _magnitude(cv2))


def _magnitude(sparse_vec):
    """Return the Euclidean norm of a sparse vector, computed from its stored values."""
    return np.sqrt(np.sum(sparse_vec.data ** 2))


def _map_to_contiguous(ids_iterable):
    """Map every distinct id found in an iterable of iterables to 0, 1, 2, ... in first-seen order."""
    uniq_ids = {}
    n = 0
    for id_ in itertools.chain.from_iterable(ids_iterable):
        if id_ not in uniq_ids:
            uniq_ids[id_] = n
            n += 1
    return uniq_ids


def _key_for_segment(segment, topic_words):
    """Build a cache key; a segment may be a single number or an iterable of them."""
    segment_key = tuple(segment) if hasattr(segment, '__iter__') else segment
    return segment_key, topic_words