1# -*- coding: utf-8 -*- 2# Natural Language Toolkit: WordNet 3# 4# Copyright (C) 2001-2019 NLTK Project 5# Author: Steven Bethard <Steven.Bethard@colorado.edu> 6# Steven Bird <stevenbird1@gmail.com> 7# Edward Loper <edloper@gmail.com> 8# Nitin Madnani <nmadnani@ets.org> 9# Nasruddin A’aidil Shari 10# Sim Wei Ying Geraldine 11# Soe Lynn 12# Francis Bond <bond@ieee.org> 13# URL: <http://nltk.org/> 14# For license information, see LICENSE.TXT 15 16""" 17An NLTK interface for WordNet 18 19WordNet is a lexical database of English. 20Using synsets, helps find conceptual relationships between words 21such as hypernyms, hyponyms, synonyms, antonyms etc. 22 23For details about WordNet see: 24http://wordnet.princeton.edu/ 25 26This module also allows you to find lemmas in languages 27other than English from the Open Multilingual Wordnet 28http://compling.hss.ntu.edu.sg/omw/ 29 30""" 31 32from __future__ import print_function, unicode_literals 33 34import math 35import re 36from itertools import islice, chain 37from functools import total_ordering 38from operator import itemgetter 39from collections import defaultdict, deque 40 41from six import iteritems 42from six.moves import range 43 44from nltk.corpus.reader import CorpusReader 45from nltk.util import binary_search_file as _binary_search_file 46from nltk.probability import FreqDist 47from nltk.compat import python_2_unicode_compatible 48from nltk.internals import deprecated 49 50###################################################################### 51# Table of Contents 52###################################################################### 53# - Constants 54# - Data Classes 55# - WordNetError 56# - Lemma 57# - Synset 58# - WordNet Corpus Reader 59# - WordNet Information Content Corpus Reader 60# - Similarity Metrics 61# - Demo 62 63###################################################################### 64# Constants 65###################################################################### 66 67#: Positive infinity (for similarity functions) 68_INF = 1e300 69 70# { Part-of-speech constants 71ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v' 72# } 73 74POS_LIST = [NOUN, VERB, ADJ, ADV] 75 76# A table of strings that are used to express verb frames. 77VERB_FRAME_STRINGS = ( 78 None, 79 "Something %s", 80 "Somebody %s", 81 "It is %sing", 82 "Something is %sing PP", 83 "Something %s something Adjective/Noun", 84 "Something %s Adjective/Noun", 85 "Somebody %s Adjective", 86 "Somebody %s something", 87 "Somebody %s somebody", 88 "Something %s somebody", 89 "Something %s something", 90 "Something %s to somebody", 91 "Somebody %s on something", 92 "Somebody %s somebody something", 93 "Somebody %s something to somebody", 94 "Somebody %s something from somebody", 95 "Somebody %s somebody with something", 96 "Somebody %s somebody of something", 97 "Somebody %s something on somebody", 98 "Somebody %s somebody PP", 99 "Somebody %s something PP", 100 "Somebody %s PP", 101 "Somebody's (body part) %s", 102 "Somebody %s somebody to INFINITIVE", 103 "Somebody %s somebody INFINITIVE", 104 "Somebody %s that CLAUSE", 105 "Somebody %s to somebody", 106 "Somebody %s to INFINITIVE", 107 "Somebody %s whether INFINITIVE", 108 "Somebody %s somebody into V-ing something", 109 "Somebody %s something with something", 110 "Somebody %s INFINITIVE", 111 "Somebody %s VERB-ing", 112 "It %s that CLAUSE", 113 "Something %s INFINITIVE", 114) 115 116SENSENUM_RE = re.compile(r'\.[\d]+\.') 117 118 119###################################################################### 120# Data Classes 121###################################################################### 122 123 124class WordNetError(Exception): 125 """An exception class for wordnet-related errors.""" 126 127 128@total_ordering 129class _WordNetObject(object): 130 """A common base class for lemmas and synsets.""" 131 132 def hypernyms(self): 133 return self._related('@') 134 135 def _hypernyms(self): 136 return self._related('@') 137 138 def instance_hypernyms(self): 139 return self._related('@i') 140 141 def _instance_hypernyms(self): 142 return self._related('@i') 143 144 def hyponyms(self): 145 return self._related('~') 146 147 def instance_hyponyms(self): 148 return self._related('~i') 149 150 def member_holonyms(self): 151 return self._related('#m') 152 153 def substance_holonyms(self): 154 return self._related('#s') 155 156 def part_holonyms(self): 157 return self._related('#p') 158 159 def member_meronyms(self): 160 return self._related('%m') 161 162 def substance_meronyms(self): 163 return self._related('%s') 164 165 def part_meronyms(self): 166 return self._related('%p') 167 168 def topic_domains(self): 169 return self._related(';c') 170 171 def in_topic_domains(self): 172 return self._related('-c') 173 174 def region_domains(self): 175 return self._related(';r') 176 177 def in_region_domains(self): 178 return self._related('-r') 179 180 def usage_domains(self): 181 return self._related(';u') 182 183 def in_usage_domains(self): 184 return self._related('-u') 185 186 def attributes(self): 187 return self._related('=') 188 189 def entailments(self): 190 return self._related('*') 191 192 def causes(self): 193 return self._related('>') 194 195 def also_sees(self): 196 return self._related('^') 197 198 def verb_groups(self): 199 return self._related('$') 200 201 def similar_tos(self): 202 return self._related('&') 203 204 def __hash__(self): 205 return hash(self._name) 206 207 def __eq__(self, other): 208 return self._name == other._name 209 210 def __ne__(self, other): 211 return self._name != other._name 212 213 def __lt__(self, other): 214 return self._name < other._name 215 216 217@python_2_unicode_compatible 218class Lemma(_WordNetObject): 219 """ 220 The lexical entry for a single morphological form of a 221 sense-disambiguated word. 222 223 Create a Lemma from a "<word>.<pos>.<number>.<lemma>" string where: 224 <word> is the morphological stem identifying the synset 225 <pos> is one of the module attributes ADJ, ADJ_SAT, ADV, NOUN or VERB 226 <number> is the sense number, counting from 0. 227 <lemma> is the morphological form of interest 228 229 Note that <word> and <lemma> can be different, e.g. the Synset 230 'salt.n.03' has the Lemmas 'salt.n.03.salt', 'salt.n.03.saltiness' and 231 'salt.n.03.salinity'. 232 233 Lemma attributes, accessible via methods with the same name:: 234 235 - name: The canonical name of this lemma. 236 - synset: The synset that this lemma belongs to. 237 - syntactic_marker: For adjectives, the WordNet string identifying the 238 syntactic position relative modified noun. See: 239 http://wordnet.princeton.edu/man/wninput.5WN.html#sect10 240 For all other parts of speech, this attribute is None. 241 - count: The frequency of this lemma in wordnet. 242 243 Lemma methods: 244 245 Lemmas have the following methods for retrieving related Lemmas. They 246 correspond to the names for the pointer symbols defined here: 247 http://wordnet.princeton.edu/man/wninput.5WN.html#sect3 248 These methods all return lists of Lemmas: 249 250 - antonyms 251 - hypernyms, instance_hypernyms 252 - hyponyms, instance_hyponyms 253 - member_holonyms, substance_holonyms, part_holonyms 254 - member_meronyms, substance_meronyms, part_meronyms 255 - topic_domains, region_domains, usage_domains 256 - attributes 257 - derivationally_related_forms 258 - entailments 259 - causes 260 - also_sees 261 - verb_groups 262 - similar_tos 263 - pertainyms 264 """ 265 266 __slots__ = [ 267 '_wordnet_corpus_reader', 268 '_name', 269 '_syntactic_marker', 270 '_synset', 271 '_frame_strings', 272 '_frame_ids', 273 '_lexname_index', 274 '_lex_id', 275 '_lang', 276 '_key', 277 ] 278 279 def __init__( 280 self, 281 wordnet_corpus_reader, 282 synset, 283 name, 284 lexname_index, 285 lex_id, 286 syntactic_marker, 287 ): 288 self._wordnet_corpus_reader = wordnet_corpus_reader 289 self._name = name 290 self._syntactic_marker = syntactic_marker 291 self._synset = synset 292 self._frame_strings = [] 293 self._frame_ids = [] 294 self._lexname_index = lexname_index 295 self._lex_id = lex_id 296 self._lang = 'eng' 297 298 self._key = None # gets set later. 299 300 def name(self): 301 return self._name 302 303 def syntactic_marker(self): 304 return self._syntactic_marker 305 306 def synset(self): 307 return self._synset 308 309 def frame_strings(self): 310 return self._frame_strings 311 312 def frame_ids(self): 313 return self._frame_ids 314 315 def lang(self): 316 return self._lang 317 318 def key(self): 319 return self._key 320 321 def __repr__(self): 322 tup = type(self).__name__, self._synset._name, self._name 323 return "%s('%s.%s')" % tup 324 325 def _related(self, relation_symbol): 326 get_synset = self._wordnet_corpus_reader.synset_from_pos_and_offset 327 if (self._name, relation_symbol) not in self._synset._lemma_pointers: 328 return [] 329 return [ 330 get_synset(pos, offset)._lemmas[lemma_index] 331 for pos, offset, lemma_index in self._synset._lemma_pointers[ 332 self._name, relation_symbol 333 ] 334 ] 335 336 def count(self): 337 """Return the frequency count for this Lemma""" 338 return self._wordnet_corpus_reader.lemma_count(self) 339 340 def antonyms(self): 341 return self._related('!') 342 343 def derivationally_related_forms(self): 344 return self._related('+') 345 346 def pertainyms(self): 347 return self._related('\\') 348 349 350@python_2_unicode_compatible 351class Synset(_WordNetObject): 352 """Create a Synset from a "<lemma>.<pos>.<number>" string where: 353 <lemma> is the word's morphological stem 354 <pos> is one of the module attributes ADJ, ADJ_SAT, ADV, NOUN or VERB 355 <number> is the sense number, counting from 0. 356 357 Synset attributes, accessible via methods with the same name: 358 359 - name: The canonical name of this synset, formed using the first lemma 360 of this synset. Note that this may be different from the name 361 passed to the constructor if that string used a different lemma to 362 identify the synset. 363 - pos: The synset's part of speech, matching one of the module level 364 attributes ADJ, ADJ_SAT, ADV, NOUN or VERB. 365 - lemmas: A list of the Lemma objects for this synset. 366 - definition: The definition for this synset. 367 - examples: A list of example strings for this synset. 368 - offset: The offset in the WordNet dict file of this synset. 369 - lexname: The name of the lexicographer file containing this synset. 370 371 Synset methods: 372 373 Synsets have the following methods for retrieving related Synsets. 374 They correspond to the names for the pointer symbols defined here: 375 http://wordnet.princeton.edu/man/wninput.5WN.html#sect3 376 These methods all return lists of Synsets. 377 378 - hypernyms, instance_hypernyms 379 - hyponyms, instance_hyponyms 380 - member_holonyms, substance_holonyms, part_holonyms 381 - member_meronyms, substance_meronyms, part_meronyms 382 - attributes 383 - entailments 384 - causes 385 - also_sees 386 - verb_groups 387 - similar_tos 388 389 Additionally, Synsets support the following methods specific to the 390 hypernym relation: 391 392 - root_hypernyms 393 - common_hypernyms 394 - lowest_common_hypernyms 395 396 Note that Synsets do not support the following relations because 397 these are defined by WordNet as lexical relations: 398 399 - antonyms 400 - derivationally_related_forms 401 - pertainyms 402 """ 403 404 __slots__ = [ 405 '_pos', 406 '_offset', 407 '_name', 408 '_frame_ids', 409 '_lemmas', 410 '_lemma_names', 411 '_definition', 412 '_examples', 413 '_lexname', 414 '_pointers', 415 '_lemma_pointers', 416 '_max_depth', 417 '_min_depth', 418 ] 419 420 def __init__(self, wordnet_corpus_reader): 421 self._wordnet_corpus_reader = wordnet_corpus_reader 422 # All of these attributes get initialized by 423 # WordNetCorpusReader._synset_from_pos_and_line() 424 425 self._pos = None 426 self._offset = None 427 self._name = None 428 self._frame_ids = [] 429 self._lemmas = [] 430 self._lemma_names = [] 431 self._definition = None 432 self._examples = [] 433 self._lexname = None # lexicographer name 434 self._all_hypernyms = None 435 436 self._pointers = defaultdict(set) 437 self._lemma_pointers = defaultdict(list) 438 439 def pos(self): 440 return self._pos 441 442 def offset(self): 443 return self._offset 444 445 def name(self): 446 return self._name 447 448 def frame_ids(self): 449 return self._frame_ids 450 451 def definition(self): 452 return self._definition 453 454 def examples(self): 455 return self._examples 456 457 def lexname(self): 458 return self._lexname 459 460 def _needs_root(self): 461 if self._pos == NOUN: 462 if self._wordnet_corpus_reader.get_version() == '1.6': 463 return True 464 else: 465 return False 466 elif self._pos == VERB: 467 return True 468 469 def lemma_names(self, lang='eng'): 470 '''Return all the lemma_names associated with the synset''' 471 if lang == 'eng': 472 return self._lemma_names 473 else: 474 self._wordnet_corpus_reader._load_lang_data(lang) 475 476 i = self._wordnet_corpus_reader.ss2of(self, lang) 477 if i in self._wordnet_corpus_reader._lang_data[lang][0]: 478 return self._wordnet_corpus_reader._lang_data[lang][0][i] 479 else: 480 return [] 481 482 def lemmas(self, lang='eng'): 483 '''Return all the lemma objects associated with the synset''' 484 if lang == 'eng': 485 return self._lemmas 486 else: 487 self._wordnet_corpus_reader._load_lang_data(lang) 488 lemmark = [] 489 lemmy = self.lemma_names(lang) 490 for lem in lemmy: 491 temp = Lemma( 492 self._wordnet_corpus_reader, 493 self, 494 lem, 495 self._wordnet_corpus_reader._lexnames.index(self.lexname()), 496 0, 497 None, 498 ) 499 temp._lang = lang 500 lemmark.append(temp) 501 return lemmark 502 503 def root_hypernyms(self): 504 """Get the topmost hypernyms of this synset in WordNet.""" 505 506 result = [] 507 seen = set() 508 todo = [self] 509 while todo: 510 next_synset = todo.pop() 511 if next_synset not in seen: 512 seen.add(next_synset) 513 next_hypernyms = ( 514 next_synset.hypernyms() + next_synset.instance_hypernyms() 515 ) 516 if not next_hypernyms: 517 result.append(next_synset) 518 else: 519 todo.extend(next_hypernyms) 520 return result 521 522 # Simpler implementation which makes incorrect assumption that 523 # hypernym hierarchy is acyclic: 524 # 525 # if not self.hypernyms(): 526 # return [self] 527 # else: 528 # return list(set(root for h in self.hypernyms() 529 # for root in h.root_hypernyms())) 530 def max_depth(self): 531 """ 532 :return: The length of the longest hypernym path from this 533 synset to the root. 534 """ 535 536 if "_max_depth" not in self.__dict__: 537 hypernyms = self.hypernyms() + self.instance_hypernyms() 538 if not hypernyms: 539 self._max_depth = 0 540 else: 541 self._max_depth = 1 + max(h.max_depth() for h in hypernyms) 542 return self._max_depth 543 544 def min_depth(self): 545 """ 546 :return: The length of the shortest hypernym path from this 547 synset to the root. 548 """ 549 550 if "_min_depth" not in self.__dict__: 551 hypernyms = self.hypernyms() + self.instance_hypernyms() 552 if not hypernyms: 553 self._min_depth = 0 554 else: 555 self._min_depth = 1 + min(h.min_depth() for h in hypernyms) 556 return self._min_depth 557 558 def closure(self, rel, depth=-1): 559 """Return the transitive closure of source under the rel 560 relationship, breadth-first 561 562 >>> from nltk.corpus import wordnet as wn 563 >>> dog = wn.synset('dog.n.01') 564 >>> hyp = lambda s:s.hypernyms() 565 >>> list(dog.closure(hyp)) 566 [Synset('canine.n.02'), Synset('domestic_animal.n.01'), 567 Synset('carnivore.n.01'), Synset('animal.n.01'), 568 Synset('placental.n.01'), Synset('organism.n.01'), 569 Synset('mammal.n.01'), Synset('living_thing.n.01'), 570 Synset('vertebrate.n.01'), Synset('whole.n.02'), 571 Synset('chordate.n.01'), Synset('object.n.01'), 572 Synset('physical_entity.n.01'), Synset('entity.n.01')] 573 574 """ 575 from nltk.util import breadth_first 576 577 synset_offsets = [] 578 for synset in breadth_first(self, rel, depth): 579 if synset._offset != self._offset: 580 if synset._offset not in synset_offsets: 581 synset_offsets.append(synset._offset) 582 yield synset 583 584 def hypernym_paths(self): 585 """ 586 Get the path(s) from this synset to the root, where each path is a 587 list of the synset nodes traversed on the way to the root. 588 589 :return: A list of lists, where each list gives the node sequence 590 connecting the initial ``Synset`` node and a root node. 591 """ 592 paths = [] 593 594 hypernyms = self.hypernyms() + self.instance_hypernyms() 595 if len(hypernyms) == 0: 596 paths = [[self]] 597 598 for hypernym in hypernyms: 599 for ancestor_list in hypernym.hypernym_paths(): 600 ancestor_list.append(self) 601 paths.append(ancestor_list) 602 return paths 603 604 def common_hypernyms(self, other): 605 """ 606 Find all synsets that are hypernyms of this synset and the 607 other synset. 608 609 :type other: Synset 610 :param other: other input synset. 611 :return: The synsets that are hypernyms of both synsets. 612 """ 613 if not self._all_hypernyms: 614 self._all_hypernyms = set( 615 self_synset 616 for self_synsets in self._iter_hypernym_lists() 617 for self_synset in self_synsets 618 ) 619 if not other._all_hypernyms: 620 other._all_hypernyms = set( 621 other_synset 622 for other_synsets in other._iter_hypernym_lists() 623 for other_synset in other_synsets 624 ) 625 return list(self._all_hypernyms.intersection(other._all_hypernyms)) 626 627 def lowest_common_hypernyms(self, other, simulate_root=False, use_min_depth=False): 628 """ 629 Get a list of lowest synset(s) that both synsets have as a hypernym. 630 When `use_min_depth == False` this means that the synset which appears 631 as a hypernym of both `self` and `other` with the lowest maximum depth 632 is returned or if there are multiple such synsets at the same depth 633 they are all returned 634 635 However, if `use_min_depth == True` then the synset(s) which has/have 636 the lowest minimum depth and appear(s) in both paths is/are returned. 637 638 By setting the use_min_depth flag to True, the behavior of NLTK2 can be 639 preserved. This was changed in NLTK3 to give more accurate results in a 640 small set of cases, generally with synsets concerning people. (eg: 641 'chef.n.01', 'fireman.n.01', etc.) 642 643 This method is an implementation of Ted Pedersen's "Lowest Common 644 Subsumer" method from the Perl Wordnet module. It can return either 645 "self" or "other" if they are a hypernym of the other. 646 647 :type other: Synset 648 :param other: other input synset 649 :type simulate_root: bool 650 :param simulate_root: The various verb taxonomies do not 651 share a single root which disallows this metric from working for 652 synsets that are not connected. This flag (False by default) 653 creates a fake root that connects all the taxonomies. Set it 654 to True to enable this behavior. For the noun taxonomy, 655 there is usually a default root except for WordNet version 1.6. 656 If you are using wordnet 1.6, a fake root will need to be added 657 for nouns as well. 658 :type use_min_depth: bool 659 :param use_min_depth: This setting mimics older (v2) behavior of NLTK 660 wordnet If True, will use the min_depth function to calculate the 661 lowest common hypernyms. This is known to give strange results for 662 some synset pairs (eg: 'chef.n.01', 'fireman.n.01') but is retained 663 for backwards compatibility 664 :return: The synsets that are the lowest common hypernyms of both 665 synsets 666 """ 667 synsets = self.common_hypernyms(other) 668 if simulate_root: 669 fake_synset = Synset(None) 670 fake_synset._name = '*ROOT*' 671 fake_synset.hypernyms = lambda: [] 672 fake_synset.instance_hypernyms = lambda: [] 673 synsets.append(fake_synset) 674 675 try: 676 if use_min_depth: 677 max_depth = max(s.min_depth() for s in synsets) 678 unsorted_lch = [s for s in synsets if s.min_depth() == max_depth] 679 else: 680 max_depth = max(s.max_depth() for s in synsets) 681 unsorted_lch = [s for s in synsets if s.max_depth() == max_depth] 682 return sorted(unsorted_lch) 683 except ValueError: 684 return [] 685 686 def hypernym_distances(self, distance=0, simulate_root=False): 687 """ 688 Get the path(s) from this synset to the root, counting the distance 689 of each node from the initial node on the way. A set of 690 (synset, distance) tuples is returned. 691 692 :type distance: int 693 :param distance: the distance (number of edges) from this hypernym to 694 the original hypernym ``Synset`` on which this method was called. 695 :return: A set of ``(Synset, int)`` tuples where each ``Synset`` is 696 a hypernym of the first ``Synset``. 697 """ 698 distances = set([(self, distance)]) 699 for hypernym in self._hypernyms() + self._instance_hypernyms(): 700 distances |= hypernym.hypernym_distances(distance + 1, simulate_root=False) 701 if simulate_root: 702 fake_synset = Synset(None) 703 fake_synset._name = '*ROOT*' 704 fake_synset_distance = max(distances, key=itemgetter(1))[1] 705 distances.add((fake_synset, fake_synset_distance + 1)) 706 return distances 707 708 def _shortest_hypernym_paths(self, simulate_root): 709 if self._name == '*ROOT*': 710 return {self: 0} 711 712 queue = deque([(self, 0)]) 713 path = {} 714 715 while queue: 716 s, depth = queue.popleft() 717 if s in path: 718 continue 719 path[s] = depth 720 721 depth += 1 722 queue.extend((hyp, depth) for hyp in s._hypernyms()) 723 queue.extend((hyp, depth) for hyp in s._instance_hypernyms()) 724 725 if simulate_root: 726 fake_synset = Synset(None) 727 fake_synset._name = '*ROOT*' 728 path[fake_synset] = max(path.values()) + 1 729 730 return path 731 732 def shortest_path_distance(self, other, simulate_root=False): 733 """ 734 Returns the distance of the shortest path linking the two synsets (if 735 one exists). For each synset, all the ancestor nodes and their 736 distances are recorded and compared. The ancestor node common to both 737 synsets that can be reached with the minimum number of traversals is 738 used. If no ancestor nodes are common, None is returned. If a node is 739 compared with itself 0 is returned. 740 741 :type other: Synset 742 :param other: The Synset to which the shortest path will be found. 743 :return: The number of edges in the shortest path connecting the two 744 nodes, or None if no path exists. 745 """ 746 747 if self == other: 748 return 0 749 750 dist_dict1 = self._shortest_hypernym_paths(simulate_root) 751 dist_dict2 = other._shortest_hypernym_paths(simulate_root) 752 753 # For each ancestor synset common to both subject synsets, find the 754 # connecting path length. Return the shortest of these. 755 756 inf = float('inf') 757 path_distance = inf 758 for synset, d1 in iteritems(dist_dict1): 759 d2 = dist_dict2.get(synset, inf) 760 path_distance = min(path_distance, d1 + d2) 761 762 return None if math.isinf(path_distance) else path_distance 763 764 def tree(self, rel, depth=-1, cut_mark=None): 765 """ 766 >>> from nltk.corpus import wordnet as wn 767 >>> dog = wn.synset('dog.n.01') 768 >>> hyp = lambda s:s.hypernyms() 769 >>> from pprint import pprint 770 >>> pprint(dog.tree(hyp)) 771 [Synset('dog.n.01'), 772 [Synset('canine.n.02'), 773 [Synset('carnivore.n.01'), 774 [Synset('placental.n.01'), 775 [Synset('mammal.n.01'), 776 [Synset('vertebrate.n.01'), 777 [Synset('chordate.n.01'), 778 [Synset('animal.n.01'), 779 [Synset('organism.n.01'), 780 [Synset('living_thing.n.01'), 781 [Synset('whole.n.02'), 782 [Synset('object.n.01'), 783 [Synset('physical_entity.n.01'), 784 [Synset('entity.n.01')]]]]]]]]]]]]], 785 [Synset('domestic_animal.n.01'), 786 [Synset('animal.n.01'), 787 [Synset('organism.n.01'), 788 [Synset('living_thing.n.01'), 789 [Synset('whole.n.02'), 790 [Synset('object.n.01'), 791 [Synset('physical_entity.n.01'), [Synset('entity.n.01')]]]]]]]]] 792 """ 793 794 tree = [self] 795 if depth != 0: 796 tree += [x.tree(rel, depth - 1, cut_mark) for x in rel(self)] 797 elif cut_mark: 798 tree += [cut_mark] 799 return tree 800 801 # interface to similarity methods 802 def path_similarity(self, other, verbose=False, simulate_root=True): 803 """ 804 Path Distance Similarity: 805 Return a score denoting how similar two word senses are, based on the 806 shortest path that connects the senses in the is-a (hypernym/hypnoym) 807 taxonomy. The score is in the range 0 to 1, except in those cases where 808 a path cannot be found (will only be true for verbs as there are many 809 distinct verb taxonomies), in which case None is returned. A score of 810 1 represents identity i.e. comparing a sense with itself will return 1. 811 812 :type other: Synset 813 :param other: The ``Synset`` that this ``Synset`` is being compared to. 814 :type simulate_root: bool 815 :param simulate_root: The various verb taxonomies do not 816 share a single root which disallows this metric from working for 817 synsets that are not connected. This flag (True by default) 818 creates a fake root that connects all the taxonomies. Set it 819 to false to disable this behavior. For the noun taxonomy, 820 there is usually a default root except for WordNet version 1.6. 821 If you are using wordnet 1.6, a fake root will be added for nouns 822 as well. 823 :return: A score denoting the similarity of the two ``Synset`` objects, 824 normally between 0 and 1. None is returned if no connecting path 825 could be found. 1 is returned if a ``Synset`` is compared with 826 itself. 827 """ 828 829 distance = self.shortest_path_distance( 830 other, simulate_root=simulate_root and self._needs_root() 831 ) 832 if distance is None or distance < 0: 833 return None 834 return 1.0 / (distance + 1) 835 836 def lch_similarity(self, other, verbose=False, simulate_root=True): 837 """ 838 Leacock Chodorow Similarity: 839 Return a score denoting how similar two word senses are, based on the 840 shortest path that connects the senses (as above) and the maximum depth 841 of the taxonomy in which the senses occur. The relationship is given as 842 -log(p/2d) where p is the shortest path length and d is the taxonomy 843 depth. 844 845 :type other: Synset 846 :param other: The ``Synset`` that this ``Synset`` is being compared to. 847 :type simulate_root: bool 848 :param simulate_root: The various verb taxonomies do not 849 share a single root which disallows this metric from working for 850 synsets that are not connected. This flag (True by default) 851 creates a fake root that connects all the taxonomies. Set it 852 to false to disable this behavior. For the noun taxonomy, 853 there is usually a default root except for WordNet version 1.6. 854 If you are using wordnet 1.6, a fake root will be added for nouns 855 as well. 856 :return: A score denoting the similarity of the two ``Synset`` objects, 857 normally greater than 0. None is returned if no connecting path 858 could be found. If a ``Synset`` is compared with itself, the 859 maximum score is returned, which varies depending on the taxonomy 860 depth. 861 """ 862 863 if self._pos != other._pos: 864 raise WordNetError( 865 'Computing the lch similarity requires ' 866 '%s and %s to have the same part of speech.' % (self, other) 867 ) 868 869 need_root = self._needs_root() 870 871 if self._pos not in self._wordnet_corpus_reader._max_depth: 872 self._wordnet_corpus_reader._compute_max_depth(self._pos, need_root) 873 874 depth = self._wordnet_corpus_reader._max_depth[self._pos] 875 876 distance = self.shortest_path_distance( 877 other, simulate_root=simulate_root and need_root 878 ) 879 880 if distance is None or distance < 0 or depth == 0: 881 return None 882 return -math.log((distance + 1) / (2.0 * depth)) 883 884 def wup_similarity(self, other, verbose=False, simulate_root=True): 885 """ 886 Wu-Palmer Similarity: 887 Return a score denoting how similar two word senses are, based on the 888 depth of the two senses in the taxonomy and that of their Least Common 889 Subsumer (most specific ancestor node). Previously, the scores computed 890 by this implementation did _not_ always agree with those given by 891 Pedersen's Perl implementation of WordNet Similarity. However, with 892 the addition of the simulate_root flag (see below), the score for 893 verbs now almost always agree but not always for nouns. 894 895 The LCS does not necessarily feature in the shortest path connecting 896 the two senses, as it is by definition the common ancestor deepest in 897 the taxonomy, not closest to the two senses. Typically, however, it 898 will so feature. Where multiple candidates for the LCS exist, that 899 whose shortest path to the root node is the longest will be selected. 900 Where the LCS has multiple paths to the root, the longer path is used 901 for the purposes of the calculation. 902 903 :type other: Synset 904 :param other: The ``Synset`` that this ``Synset`` is being compared to. 905 :type simulate_root: bool 906 :param simulate_root: The various verb taxonomies do not 907 share a single root which disallows this metric from working for 908 synsets that are not connected. This flag (True by default) 909 creates a fake root that connects all the taxonomies. Set it 910 to false to disable this behavior. For the noun taxonomy, 911 there is usually a default root except for WordNet version 1.6. 912 If you are using wordnet 1.6, a fake root will be added for nouns 913 as well. 914 :return: A float score denoting the similarity of the two ``Synset`` 915 objects, normally greater than zero. If no connecting path between 916 the two senses can be found, None is returned. 917 918 """ 919 920 need_root = self._needs_root() 921 # Note that to preserve behavior from NLTK2 we set use_min_depth=True 922 # It is possible that more accurate results could be obtained by 923 # removing this setting and it should be tested later on 924 subsumers = self.lowest_common_hypernyms( 925 other, simulate_root=simulate_root and need_root, use_min_depth=True 926 ) 927 928 # If no LCS was found return None 929 if len(subsumers) == 0: 930 return None 931 932 subsumer = self if self in subsumers else subsumers[0] 933 934 # Get the longest path from the LCS to the root, 935 # including a correction: 936 # - add one because the calculations include both the start and end 937 # nodes 938 depth = subsumer.max_depth() + 1 939 940 # Note: No need for an additional add-one correction for non-nouns 941 # to account for an imaginary root node because that is now 942 # automatically handled by simulate_root 943 # if subsumer._pos != NOUN: 944 # depth += 1 945 946 # Get the shortest path from the LCS to each of the synsets it is 947 # subsuming. Add this to the LCS path length to get the path 948 # length from each synset to the root. 949 len1 = self.shortest_path_distance( 950 subsumer, simulate_root=simulate_root and need_root 951 ) 952 len2 = other.shortest_path_distance( 953 subsumer, simulate_root=simulate_root and need_root 954 ) 955 if len1 is None or len2 is None: 956 return None 957 len1 += depth 958 len2 += depth 959 return (2.0 * depth) / (len1 + len2) 960 961 def res_similarity(self, other, ic, verbose=False): 962 """ 963 Resnik Similarity: 964 Return a score denoting how similar two word senses are, based on the 965 Information Content (IC) of the Least Common Subsumer (most specific 966 ancestor node). 967 968 :type other: Synset 969 :param other: The ``Synset`` that this ``Synset`` is being compared to. 970 :type ic: dict 971 :param ic: an information content object (as returned by 972 ``nltk.corpus.wordnet_ic.ic()``). 973 :return: A float score denoting the similarity of the two ``Synset`` 974 objects. Synsets whose LCS is the root node of the taxonomy will 975 have a score of 0 (e.g. N['dog'][0] and N['table'][0]). 976 """ 977 978 ic1, ic2, lcs_ic = _lcs_ic(self, other, ic) 979 return lcs_ic 980 981 def jcn_similarity(self, other, ic, verbose=False): 982 """ 983 Jiang-Conrath Similarity: 984 Return a score denoting how similar two word senses are, based on the 985 Information Content (IC) of the Least Common Subsumer (most specific 986 ancestor node) and that of the two input Synsets. The relationship is 987 given by the equation 1 / (IC(s1) + IC(s2) - 2 * IC(lcs)). 988 989 :type other: Synset 990 :param other: The ``Synset`` that this ``Synset`` is being compared to. 991 :type ic: dict 992 :param ic: an information content object (as returned by 993 ``nltk.corpus.wordnet_ic.ic()``). 994 :return: A float score denoting the similarity of the two ``Synset`` 995 objects. 996 """ 997 998 if self == other: 999 return _INF 1000 1001 ic1, ic2, lcs_ic = _lcs_ic(self, other, ic) 1002 1003 # If either of the input synsets are the root synset, or have a 1004 # frequency of 0 (sparse data problem), return 0. 1005 if ic1 == 0 or ic2 == 0: 1006 return 0 1007 1008 ic_difference = ic1 + ic2 - 2 * lcs_ic 1009 1010 if ic_difference == 0: 1011 return _INF 1012 1013 return 1 / ic_difference 1014 1015 def lin_similarity(self, other, ic, verbose=False): 1016 """ 1017 Lin Similarity: 1018 Return a score denoting how similar two word senses are, based on the 1019 Information Content (IC) of the Least Common Subsumer (most specific 1020 ancestor node) and that of the two input Synsets. The relationship is 1021 given by the equation 2 * IC(lcs) / (IC(s1) + IC(s2)). 1022 1023 :type other: Synset 1024 :param other: The ``Synset`` that this ``Synset`` is being compared to. 1025 :type ic: dict 1026 :param ic: an information content object (as returned by 1027 ``nltk.corpus.wordnet_ic.ic()``). 1028 :return: A float score denoting the similarity of the two ``Synset`` 1029 objects, in the range 0 to 1. 1030 """ 1031 1032 ic1, ic2, lcs_ic = _lcs_ic(self, other, ic) 1033 return (2.0 * lcs_ic) / (ic1 + ic2) 1034 1035 def _iter_hypernym_lists(self): 1036 """ 1037 :return: An iterator over ``Synset`` objects that are either proper 1038 hypernyms or instance of hypernyms of the synset. 1039 """ 1040 todo = [self] 1041 seen = set() 1042 while todo: 1043 for synset in todo: 1044 seen.add(synset) 1045 yield todo 1046 todo = [ 1047 hypernym 1048 for synset in todo 1049 for hypernym in (synset.hypernyms() + synset.instance_hypernyms()) 1050 if hypernym not in seen 1051 ] 1052 1053 def __repr__(self): 1054 return "%s('%s')" % (type(self).__name__, self._name) 1055 1056 def _related(self, relation_symbol, sort=True): 1057 get_synset = self._wordnet_corpus_reader.synset_from_pos_and_offset 1058 if relation_symbol not in self._pointers: 1059 return [] 1060 pointer_tuples = self._pointers[relation_symbol] 1061 r = [get_synset(pos, offset) for pos, offset in pointer_tuples] 1062 if sort: 1063 r.sort() 1064 return r 1065 1066 1067###################################################################### 1068# WordNet Corpus Reader 1069###################################################################### 1070 1071 1072class WordNetCorpusReader(CorpusReader): 1073 """ 1074 A corpus reader used to access wordnet or its variants. 1075 """ 1076 1077 _ENCODING = 'utf8' 1078 1079 # { Part-of-speech constants 1080 ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v' 1081 # } 1082 1083 # { Filename constants 1084 _FILEMAP = {ADJ: 'adj', ADV: 'adv', NOUN: 'noun', VERB: 'verb'} 1085 # } 1086 1087 # { Part of speech constants 1088 _pos_numbers = {NOUN: 1, VERB: 2, ADJ: 3, ADV: 4, ADJ_SAT: 5} 1089 _pos_names = dict(tup[::-1] for tup in _pos_numbers.items()) 1090 # } 1091 1092 #: A list of file identifiers for all the fileids used by this 1093 #: corpus reader. 1094 _FILES = ( 1095 'cntlist.rev', 1096 'lexnames', 1097 'index.sense', 1098 'index.adj', 1099 'index.adv', 1100 'index.noun', 1101 'index.verb', 1102 'data.adj', 1103 'data.adv', 1104 'data.noun', 1105 'data.verb', 1106 'adj.exc', 1107 'adv.exc', 1108 'noun.exc', 1109 'verb.exc', 1110 ) 1111 1112 def __init__(self, root, omw_reader): 1113 """ 1114 Construct a new wordnet corpus reader, with the given root 1115 directory. 1116 """ 1117 super(WordNetCorpusReader, self).__init__( 1118 root, self._FILES, encoding=self._ENCODING 1119 ) 1120 1121 # A index that provides the file offset 1122 # Map from lemma -> pos -> synset_index -> offset 1123 self._lemma_pos_offset_map = defaultdict(dict) 1124 1125 # A cache so we don't have to reconstuct synsets 1126 # Map from pos -> offset -> synset 1127 self._synset_offset_cache = defaultdict(dict) 1128 1129 # A lookup for the maximum depth of each part of speech. Useful for 1130 # the lch similarity metric. 1131 self._max_depth = defaultdict(dict) 1132 1133 # Corpus reader containing omw data. 1134 self._omw_reader = omw_reader 1135 1136 # A cache to store the wordnet data of multiple languages 1137 self._lang_data = defaultdict(list) 1138 1139 self._data_file_map = {} 1140 self._exception_map = {} 1141 self._lexnames = [] 1142 self._key_count_file = None 1143 self._key_synset_file = None 1144 1145 # Load the lexnames 1146 for i, line in enumerate(self.open('lexnames')): 1147 index, lexname, _ = line.split() 1148 assert int(index) == i 1149 self._lexnames.append(lexname) 1150 1151 # Load the indices for lemmas and synset offsets 1152 self._load_lemma_pos_offset_map() 1153 1154 # load the exception file data into memory 1155 self._load_exception_map() 1156 1157 # Open Multilingual WordNet functions, contributed by 1158 # Nasruddin A’aidil Shari, Sim Wei Ying Geraldine, and Soe Lynn 1159 1160 def of2ss(self, of): 1161 ''' take an id and return the synsets ''' 1162 return self.synset_from_pos_and_offset(of[-1], int(of[:8])) 1163 1164 def ss2of(self, ss, lang=None): 1165 ''' return the ID of the synset ''' 1166 pos = ss.pos() 1167 # Only these 3 WordNets retain the satellite pos tag 1168 if lang not in ["nld", "lit", "slk"] and pos == 's': 1169 pos = 'a' 1170 return "{:08d}-{}".format(ss.offset(), pos) 1171 1172 def _load_lang_data(self, lang): 1173 ''' load the wordnet data of the requested language from the file to 1174 the cache, _lang_data ''' 1175 1176 if lang in self._lang_data.keys(): 1177 return 1178 1179 if lang not in self.langs(): 1180 raise WordNetError("Language is not supported.") 1181 1182 f = self._omw_reader.open('{0:}/wn-data-{0:}.tab'.format(lang)) 1183 self.custom_lemmas(f, lang) 1184 f.close() 1185 1186 def langs(self): 1187 ''' return a list of languages supported by Multilingual Wordnet ''' 1188 import os 1189 1190 langs = ['eng'] 1191 fileids = self._omw_reader.fileids() 1192 for fileid in fileids: 1193 file_name, file_extension = os.path.splitext(fileid) 1194 if file_extension == '.tab': 1195 langs.append(file_name.split('-')[-1]) 1196 1197 return langs 1198 1199 def _load_lemma_pos_offset_map(self): 1200 for suffix in self._FILEMAP.values(): 1201 1202 # parse each line of the file (ignoring comment lines) 1203 for i, line in enumerate(self.open('index.%s' % suffix)): 1204 if line.startswith(' '): 1205 continue 1206 1207 _iter = iter(line.split()) 1208 1209 def _next_token(): 1210 return next(_iter) 1211 1212 try: 1213 1214 # get the lemma and part-of-speech 1215 lemma = _next_token() 1216 pos = _next_token() 1217 1218 # get the number of synsets for this lemma 1219 n_synsets = int(_next_token()) 1220 assert n_synsets > 0 1221 1222 # get and ignore the pointer symbols for all synsets of 1223 # this lemma 1224 n_pointers = int(_next_token()) 1225 [_next_token() for _ in range(n_pointers)] 1226 1227 # same as number of synsets 1228 n_senses = int(_next_token()) 1229 assert n_synsets == n_senses 1230 1231 # get and ignore number of senses ranked according to 1232 # frequency 1233 _next_token() 1234 1235 # get synset offsets 1236 synset_offsets = [int(_next_token()) for _ in range(n_synsets)] 1237 1238 # raise more informative error with file name and line number 1239 except (AssertionError, ValueError) as e: 1240 tup = ('index.%s' % suffix), (i + 1), e 1241 raise WordNetError('file %s, line %i: %s' % tup) 1242 1243 # map lemmas and parts of speech to synsets 1244 self._lemma_pos_offset_map[lemma][pos] = synset_offsets 1245 if pos == ADJ: 1246 self._lemma_pos_offset_map[lemma][ADJ_SAT] = synset_offsets 1247 1248 def _load_exception_map(self): 1249 # load the exception file data into memory 1250 for pos, suffix in self._FILEMAP.items(): 1251 self._exception_map[pos] = {} 1252 for line in self.open('%s.exc' % suffix): 1253 terms = line.split() 1254 self._exception_map[pos][terms[0]] = terms[1:] 1255 self._exception_map[ADJ_SAT] = self._exception_map[ADJ] 1256 1257 def _compute_max_depth(self, pos, simulate_root): 1258 """ 1259 Compute the max depth for the given part of speech. This is 1260 used by the lch similarity metric. 1261 """ 1262 depth = 0 1263 for ii in self.all_synsets(pos): 1264 try: 1265 depth = max(depth, ii.max_depth()) 1266 except RuntimeError: 1267 print(ii) 1268 if simulate_root: 1269 depth += 1 1270 self._max_depth[pos] = depth 1271 1272 def get_version(self): 1273 fh = self._data_file(ADJ) 1274 for line in fh: 1275 match = re.search(r'WordNet (\d+\.\d+) Copyright', line) 1276 if match is not None: 1277 version = match.group(1) 1278 fh.seek(0) 1279 return version 1280 1281 ############################################################# 1282 # Loading Lemmas 1283 ############################################################# 1284 1285 def lemma(self, name, lang='eng'): 1286 '''Return lemma object that matches the name''' 1287 # cannot simply split on first '.', 1288 # e.g.: '.45_caliber.a.01..45_caliber' 1289 separator = SENSENUM_RE.search(name).end() 1290 1291 synset_name, lemma_name = name[: separator - 1], name[separator:] 1292 1293 synset = self.synset(synset_name) 1294 for lemma in synset.lemmas(lang): 1295 if lemma._name == lemma_name: 1296 return lemma 1297 raise WordNetError('no lemma %r in %r' % (lemma_name, synset_name)) 1298 1299 def lemma_from_key(self, key): 1300 # Keys are case sensitive and always lower-case 1301 key = key.lower() 1302 1303 lemma_name, lex_sense = key.split('%') 1304 pos_number, lexname_index, lex_id, _, _ = lex_sense.split(':') 1305 pos = self._pos_names[int(pos_number)] 1306 1307 # open the key -> synset file if necessary 1308 if self._key_synset_file is None: 1309 self._key_synset_file = self.open('index.sense') 1310 1311 # Find the synset for the lemma. 1312 synset_line = _binary_search_file(self._key_synset_file, key) 1313 if not synset_line: 1314 raise WordNetError("No synset found for key %r" % key) 1315 offset = int(synset_line.split()[1]) 1316 synset = self.synset_from_pos_and_offset(pos, offset) 1317 1318 # return the corresponding lemma 1319 for lemma in synset._lemmas: 1320 if lemma._key == key: 1321 return lemma 1322 raise WordNetError("No lemma found for for key %r" % key) 1323 1324 ############################################################# 1325 # Loading Synsets 1326 ############################################################# 1327 def synset(self, name): 1328 # split name into lemma, part of speech and synset number 1329 lemma, pos, synset_index_str = name.lower().rsplit('.', 2) 1330 synset_index = int(synset_index_str) - 1 1331 1332 # get the offset for this synset 1333 try: 1334 offset = self._lemma_pos_offset_map[lemma][pos][synset_index] 1335 except KeyError: 1336 message = 'no lemma %r with part of speech %r' 1337 raise WordNetError(message % (lemma, pos)) 1338 except IndexError: 1339 n_senses = len(self._lemma_pos_offset_map[lemma][pos]) 1340 message = "lemma %r with part of speech %r has only %i %s" 1341 if n_senses == 1: 1342 tup = lemma, pos, n_senses, "sense" 1343 else: 1344 tup = lemma, pos, n_senses, "senses" 1345 raise WordNetError(message % tup) 1346 1347 # load synset information from the appropriate file 1348 synset = self.synset_from_pos_and_offset(pos, offset) 1349 1350 # some basic sanity checks on loaded attributes 1351 if pos == 's' and synset._pos == 'a': 1352 message = ( 1353 'adjective satellite requested but only plain ' 1354 'adjective found for lemma %r' 1355 ) 1356 raise WordNetError(message % lemma) 1357 assert synset._pos == pos or (pos == 'a' and synset._pos == 's') 1358 1359 # Return the synset object. 1360 return synset 1361 1362 def _data_file(self, pos): 1363 """ 1364 Return an open file pointer for the data file for the given 1365 part of speech. 1366 """ 1367 if pos == ADJ_SAT: 1368 pos = ADJ 1369 if self._data_file_map.get(pos) is None: 1370 fileid = 'data.%s' % self._FILEMAP[pos] 1371 self._data_file_map[pos] = self.open(fileid) 1372 return self._data_file_map[pos] 1373 1374 def synset_from_pos_and_offset(self, pos, offset): 1375 # Check to see if the synset is in the cache 1376 if offset in self._synset_offset_cache[pos]: 1377 return self._synset_offset_cache[pos][offset] 1378 1379 data_file = self._data_file(pos) 1380 data_file.seek(offset) 1381 data_file_line = data_file.readline() 1382 synset = self._synset_from_pos_and_line(pos, data_file_line) 1383 assert synset._offset == offset 1384 self._synset_offset_cache[pos][offset] = synset 1385 return synset 1386 1387 @deprecated('Use public method synset_from_pos_and_offset() instead') 1388 def _synset_from_pos_and_offset(self, *args, **kwargs): 1389 """ 1390 Hack to help people like the readers of 1391 http://stackoverflow.com/a/27145655/1709587 1392 who were using this function before it was officially a public method 1393 """ 1394 return self.synset_from_pos_and_offset(*args, **kwargs) 1395 1396 def _synset_from_pos_and_line(self, pos, data_file_line): 1397 # Construct a new (empty) synset. 1398 synset = Synset(self) 1399 1400 # parse the entry for this synset 1401 try: 1402 1403 # parse out the definitions and examples from the gloss 1404 columns_str, gloss = data_file_line.split('|') 1405 gloss = gloss.strip() 1406 definitions = [] 1407 for gloss_part in gloss.split(';'): 1408 gloss_part = gloss_part.strip() 1409 if gloss_part.startswith('"'): 1410 synset._examples.append(gloss_part.strip('"')) 1411 else: 1412 definitions.append(gloss_part) 1413 synset._definition = '; '.join(definitions) 1414 1415 # split the other info into fields 1416 _iter = iter(columns_str.split()) 1417 1418 def _next_token(): 1419 return next(_iter) 1420 1421 # get the offset 1422 synset._offset = int(_next_token()) 1423 1424 # determine the lexicographer file name 1425 lexname_index = int(_next_token()) 1426 synset._lexname = self._lexnames[lexname_index] 1427 1428 # get the part of speech 1429 synset._pos = _next_token() 1430 1431 # create Lemma objects for each lemma 1432 n_lemmas = int(_next_token(), 16) 1433 for _ in range(n_lemmas): 1434 # get the lemma name 1435 lemma_name = _next_token() 1436 # get the lex_id (used for sense_keys) 1437 lex_id = int(_next_token(), 16) 1438 # If the lemma has a syntactic marker, extract it. 1439 m = re.match(r'(.*?)(\(.*\))?$', lemma_name) 1440 lemma_name, syn_mark = m.groups() 1441 # create the lemma object 1442 lemma = Lemma(self, synset, lemma_name, lexname_index, lex_id, syn_mark) 1443 synset._lemmas.append(lemma) 1444 synset._lemma_names.append(lemma._name) 1445 1446 # collect the pointer tuples 1447 n_pointers = int(_next_token()) 1448 for _ in range(n_pointers): 1449 symbol = _next_token() 1450 offset = int(_next_token()) 1451 pos = _next_token() 1452 lemma_ids_str = _next_token() 1453 if lemma_ids_str == '0000': 1454 synset._pointers[symbol].add((pos, offset)) 1455 else: 1456 source_index = int(lemma_ids_str[:2], 16) - 1 1457 target_index = int(lemma_ids_str[2:], 16) - 1 1458 source_lemma_name = synset._lemmas[source_index]._name 1459 lemma_pointers = synset._lemma_pointers 1460 tups = lemma_pointers[source_lemma_name, symbol] 1461 tups.append((pos, offset, target_index)) 1462 1463 # read the verb frames 1464 try: 1465 frame_count = int(_next_token()) 1466 except StopIteration: 1467 pass 1468 else: 1469 for _ in range(frame_count): 1470 # read the plus sign 1471 plus = _next_token() 1472 assert plus == '+' 1473 # read the frame and lemma number 1474 frame_number = int(_next_token()) 1475 frame_string_fmt = VERB_FRAME_STRINGS[frame_number] 1476 lemma_number = int(_next_token(), 16) 1477 # lemma number of 00 means all words in the synset 1478 if lemma_number == 0: 1479 synset._frame_ids.append(frame_number) 1480 for lemma in synset._lemmas: 1481 lemma._frame_ids.append(frame_number) 1482 lemma._frame_strings.append(frame_string_fmt % lemma._name) 1483 # only a specific word in the synset 1484 else: 1485 lemma = synset._lemmas[lemma_number - 1] 1486 lemma._frame_ids.append(frame_number) 1487 lemma._frame_strings.append(frame_string_fmt % lemma._name) 1488 1489 # raise a more informative error with line text 1490 except ValueError as e: 1491 raise WordNetError('line %r: %s' % (data_file_line, e)) 1492 1493 # set sense keys for Lemma objects - note that this has to be 1494 # done afterwards so that the relations are available 1495 for lemma in synset._lemmas: 1496 if synset._pos == ADJ_SAT: 1497 head_lemma = synset.similar_tos()[0]._lemmas[0] 1498 head_name = head_lemma._name 1499 head_id = '%02d' % head_lemma._lex_id 1500 else: 1501 head_name = head_id = '' 1502 tup = ( 1503 lemma._name, 1504 WordNetCorpusReader._pos_numbers[synset._pos], 1505 lemma._lexname_index, 1506 lemma._lex_id, 1507 head_name, 1508 head_id, 1509 ) 1510 lemma._key = ('%s%%%d:%02d:%02d:%s:%s' % tup).lower() 1511 1512 # the canonical name is based on the first lemma 1513 lemma_name = synset._lemmas[0]._name.lower() 1514 offsets = self._lemma_pos_offset_map[lemma_name][synset._pos] 1515 sense_index = offsets.index(synset._offset) 1516 tup = lemma_name, synset._pos, sense_index + 1 1517 synset._name = '%s.%s.%02i' % tup 1518 1519 return synset 1520 1521 def synset_from_sense_key(self, sense_key): 1522 """ 1523 Retrieves synset based on a given sense_key. Sense keys can be 1524 obtained from lemma.key() 1525 1526 From https://wordnet.princeton.edu/wordnet/man/senseidx.5WN.html: 1527 A sense_key is represented as: 1528 lemma % lex_sense (e.g. 'dog%1:18:01::') 1529 where lex_sense is encoded as: 1530 ss_type:lex_filenum:lex_id:head_word:head_id 1531 1532 lemma: ASCII text of word/collocation, in lower case 1533 ss_type: synset type for the sense (1 digit int) 1534 The synset type is encoded as follows: 1535 1 NOUN 1536 2 VERB 1537 3 ADJECTIVE 1538 4 ADVERB 1539 5 ADJECTIVE SATELLITE 1540 lex_filenum: name of lexicographer file containing the synset for the sense (2 digit int) 1541 lex_id: when paired with lemma, uniquely identifies a sense in the lexicographer file (2 digit int) 1542 head_word: lemma of the first word in satellite's head synset 1543 Only used if sense is in an adjective satellite synset 1544 head_id: uniquely identifies sense in a lexicographer file when paired with head_word 1545 Only used if head_word is present (2 digit int) 1546 """ 1547 sense_key_regex = re.compile(r"(.*)\%(.*):(.*):(.*):(.*):(.*)") 1548 synset_types = {1: NOUN, 2: VERB, 3: ADJ, 4: ADV, 5: ADJ_SAT} 1549 lemma, ss_type, _, lex_id, _, _ = sense_key_regex.match(sense_key).groups() 1550 1551 # check that information extracted from sense_key is valid 1552 error = None 1553 if not lemma: 1554 error = "lemma" 1555 elif int(ss_type) not in synset_types: 1556 error = "ss_type" 1557 elif int(lex_id) < 0 or int(lex_id) > 99: 1558 error = "lex_id" 1559 if error: 1560 raise WordNetError( 1561 "valid {} could not be extracted from the sense key".format(error) 1562 ) 1563 1564 synset_id = '.'.join([lemma, synset_types[int(ss_type)], lex_id]) 1565 return self.synset(synset_id) 1566 1567 ############################################################# 1568 # Retrieve synsets and lemmas. 1569 ############################################################# 1570 1571 def synsets(self, lemma, pos=None, lang='eng', check_exceptions=True): 1572 """Load all synsets with a given lemma and part of speech tag. 1573 If no pos is specified, all synsets for all parts of speech 1574 will be loaded. 1575 If lang is specified, all the synsets associated with the lemma name 1576 of that language will be returned. 1577 """ 1578 lemma = lemma.lower() 1579 1580 if lang == 'eng': 1581 get_synset = self.synset_from_pos_and_offset 1582 index = self._lemma_pos_offset_map 1583 if pos is None: 1584 pos = POS_LIST 1585 return [ 1586 get_synset(p, offset) 1587 for p in pos 1588 for form in self._morphy(lemma, p, check_exceptions) 1589 for offset in index[form].get(p, []) 1590 ] 1591 1592 else: 1593 self._load_lang_data(lang) 1594 synset_list = [] 1595 if lemma in self._lang_data[lang][1]: 1596 for l in self._lang_data[lang][1][lemma]: 1597 if pos is not None and l[-1] != pos: 1598 continue 1599 synset_list.append(self.of2ss(l)) 1600 return synset_list 1601 1602 def lemmas(self, lemma, pos=None, lang='eng'): 1603 """Return all Lemma objects with a name matching the specified lemma 1604 name and part of speech tag. Matches any part of speech tag if none is 1605 specified.""" 1606 1607 lemma = lemma.lower() 1608 if lang == 'eng': 1609 return [ 1610 lemma_obj 1611 for synset in self.synsets(lemma, pos) 1612 for lemma_obj in synset.lemmas() 1613 if lemma_obj.name().lower() == lemma 1614 ] 1615 1616 else: 1617 self._load_lang_data(lang) 1618 lemmas = [] 1619 syn = self.synsets(lemma, lang=lang) 1620 for s in syn: 1621 if pos is not None and s.pos() != pos: 1622 continue 1623 for lemma_obj in s.lemmas(lang=lang): 1624 if lemma_obj.name().lower() == lemma: 1625 lemmas.append(lemma_obj) 1626 return lemmas 1627 1628 def all_lemma_names(self, pos=None, lang='eng'): 1629 """Return all lemma names for all synsets for the given 1630 part of speech tag and language or languages. If pos is 1631 not specified, all synsets for all parts of speech will 1632 be used.""" 1633 1634 if lang == 'eng': 1635 if pos is None: 1636 return iter(self._lemma_pos_offset_map) 1637 else: 1638 return ( 1639 lemma 1640 for lemma in self._lemma_pos_offset_map 1641 if pos in self._lemma_pos_offset_map[lemma] 1642 ) 1643 else: 1644 self._load_lang_data(lang) 1645 lemma = [] 1646 for i in self._lang_data[lang][0]: 1647 if pos is not None and i[-1] != pos: 1648 continue 1649 lemma.extend(self._lang_data[lang][0][i]) 1650 1651 lemma = list(set(lemma)) 1652 return lemma 1653 1654 def all_synsets(self, pos=None): 1655 """Iterate over all synsets with a given part of speech tag. 1656 If no pos is specified, all synsets for all parts of speech 1657 will be loaded. 1658 """ 1659 if pos is None: 1660 pos_tags = self._FILEMAP.keys() 1661 else: 1662 pos_tags = [pos] 1663 1664 cache = self._synset_offset_cache 1665 from_pos_and_line = self._synset_from_pos_and_line 1666 1667 # generate all synsets for each part of speech 1668 for pos_tag in pos_tags: 1669 # Open the file for reading. Note that we can not re-use 1670 # the file poitners from self._data_file_map here, because 1671 # we're defining an iterator, and those file pointers might 1672 # be moved while we're not looking. 1673 if pos_tag == ADJ_SAT: 1674 pos_tag = ADJ 1675 fileid = 'data.%s' % self._FILEMAP[pos_tag] 1676 data_file = self.open(fileid) 1677 1678 try: 1679 # generate synsets for each line in the POS file 1680 offset = data_file.tell() 1681 line = data_file.readline() 1682 while line: 1683 if not line[0].isspace(): 1684 if offset in cache[pos_tag]: 1685 # See if the synset is cached 1686 synset = cache[pos_tag][offset] 1687 else: 1688 # Otherwise, parse the line 1689 synset = from_pos_and_line(pos_tag, line) 1690 cache[pos_tag][offset] = synset 1691 1692 # adjective satellites are in the same file as 1693 # adjectives so only yield the synset if it's actually 1694 # a satellite 1695 if synset._pos == ADJ_SAT: 1696 yield synset 1697 1698 # for all other POS tags, yield all synsets (this means 1699 # that adjectives also include adjective satellites) 1700 else: 1701 yield synset 1702 offset = data_file.tell() 1703 line = data_file.readline() 1704 1705 # close the extra file handle we opened 1706 except: 1707 data_file.close() 1708 raise 1709 else: 1710 data_file.close() 1711 1712 def words(self, lang='eng'): 1713 """return lemmas of the given language as list of words""" 1714 return self.all_lemma_names(lang=lang) 1715 1716 def license(self, lang='eng'): 1717 """Return the contents of LICENSE (for omw) 1718 use lang=lang to get the license for an individual language""" 1719 if lang == 'eng': 1720 return self.open("LICENSE").read() 1721 elif lang in self.langs(): 1722 return self._omw_reader.open("{}/LICENSE".format(lang)).read() 1723 elif lang == 'omw': 1724 # under the assumption you don't mean Omwunra-Toqura 1725 return self._omw_reader.open("LICENSE").read() 1726 elif lang in self._lang_data: 1727 raise WordNetError("Cannot determine license for user-provided tab file") 1728 else: 1729 raise WordNetError("Language is not supported.") 1730 1731 def readme(self, lang='omw'): 1732 """Return the contents of README (for omw) 1733 use lang=lang to get the readme for an individual language""" 1734 if lang == 'eng': 1735 return self.open("README").read() 1736 elif lang in self.langs(): 1737 return self._omw_reader.open("{}/README".format(lang)).read() 1738 elif lang == 'omw': 1739 # under the assumption you don't mean Omwunra-Toqura 1740 return self._omw_reader.open("README").read() 1741 elif lang in self._lang_data: 1742 raise WordNetError("No README for user-provided tab file") 1743 else: 1744 raise WordNetError("Language is not supported.") 1745 1746 def citation(self, lang='omw'): 1747 """Return the contents of citation.bib file (for omw) 1748 use lang=lang to get the citation for an individual language""" 1749 if lang == 'eng': 1750 return self.open("citation.bib").read() 1751 elif lang in self.langs(): 1752 return self._omw_reader.open("{}/citation.bib".format(lang)).read() 1753 elif lang == 'omw': 1754 # under the assumption you don't mean Omwunra-Toqura 1755 return self._omw_reader.open("citation.bib").read() 1756 elif lang in self._lang_data: 1757 raise WordNetError("citation not known for user-provided tab file") 1758 else: 1759 raise WordNetError("Language is not supported.") 1760 1761 ############################################################# 1762 # Misc 1763 ############################################################# 1764 def lemma_count(self, lemma): 1765 """Return the frequency count for this Lemma""" 1766 # Currently, count is only work for English 1767 if lemma._lang != 'eng': 1768 return 0 1769 # open the count file if we haven't already 1770 if self._key_count_file is None: 1771 self._key_count_file = self.open('cntlist.rev') 1772 # find the key in the counts file and return the count 1773 line = _binary_search_file(self._key_count_file, lemma._key) 1774 if line: 1775 return int(line.rsplit(' ', 1)[-1]) 1776 else: 1777 return 0 1778 1779 def path_similarity(self, synset1, synset2, verbose=False, simulate_root=True): 1780 return synset1.path_similarity(synset2, verbose, simulate_root) 1781 1782 path_similarity.__doc__ = Synset.path_similarity.__doc__ 1783 1784 def lch_similarity(self, synset1, synset2, verbose=False, simulate_root=True): 1785 return synset1.lch_similarity(synset2, verbose, simulate_root) 1786 1787 lch_similarity.__doc__ = Synset.lch_similarity.__doc__ 1788 1789 def wup_similarity(self, synset1, synset2, verbose=False, simulate_root=True): 1790 return synset1.wup_similarity(synset2, verbose, simulate_root) 1791 1792 wup_similarity.__doc__ = Synset.wup_similarity.__doc__ 1793 1794 def res_similarity(self, synset1, synset2, ic, verbose=False): 1795 return synset1.res_similarity(synset2, ic, verbose) 1796 1797 res_similarity.__doc__ = Synset.res_similarity.__doc__ 1798 1799 def jcn_similarity(self, synset1, synset2, ic, verbose=False): 1800 return synset1.jcn_similarity(synset2, ic, verbose) 1801 1802 jcn_similarity.__doc__ = Synset.jcn_similarity.__doc__ 1803 1804 def lin_similarity(self, synset1, synset2, ic, verbose=False): 1805 return synset1.lin_similarity(synset2, ic, verbose) 1806 1807 lin_similarity.__doc__ = Synset.lin_similarity.__doc__ 1808 1809 ############################################################# 1810 # Morphy 1811 ############################################################# 1812 # Morphy, adapted from Oliver Steele's pywordnet 1813 def morphy(self, form, pos=None, check_exceptions=True): 1814 """ 1815 Find a possible base form for the given form, with the given 1816 part of speech, by checking WordNet's list of exceptional 1817 forms, and by recursively stripping affixes for this part of 1818 speech until a form in WordNet is found. 1819 1820 >>> from nltk.corpus import wordnet as wn 1821 >>> print(wn.morphy('dogs')) 1822 dog 1823 >>> print(wn.morphy('churches')) 1824 church 1825 >>> print(wn.morphy('aardwolves')) 1826 aardwolf 1827 >>> print(wn.morphy('abaci')) 1828 abacus 1829 >>> wn.morphy('hardrock', wn.ADV) 1830 >>> print(wn.morphy('book', wn.NOUN)) 1831 book 1832 >>> wn.morphy('book', wn.ADJ) 1833 """ 1834 1835 if pos is None: 1836 morphy = self._morphy 1837 analyses = chain(a for p in POS_LIST for a in morphy(form, p)) 1838 else: 1839 analyses = self._morphy(form, pos, check_exceptions) 1840 1841 # get the first one we find 1842 first = list(islice(analyses, 1)) 1843 if len(first) == 1: 1844 return first[0] 1845 else: 1846 return None 1847 1848 MORPHOLOGICAL_SUBSTITUTIONS = { 1849 NOUN: [ 1850 ('s', ''), 1851 ('ses', 's'), 1852 ('ves', 'f'), 1853 ('xes', 'x'), 1854 ('zes', 'z'), 1855 ('ches', 'ch'), 1856 ('shes', 'sh'), 1857 ('men', 'man'), 1858 ('ies', 'y'), 1859 ], 1860 VERB: [ 1861 ('s', ''), 1862 ('ies', 'y'), 1863 ('es', 'e'), 1864 ('es', ''), 1865 ('ed', 'e'), 1866 ('ed', ''), 1867 ('ing', 'e'), 1868 ('ing', ''), 1869 ], 1870 ADJ: [('er', ''), ('est', ''), ('er', 'e'), ('est', 'e')], 1871 ADV: [], 1872 } 1873 1874 MORPHOLOGICAL_SUBSTITUTIONS[ADJ_SAT] = MORPHOLOGICAL_SUBSTITUTIONS[ADJ] 1875 1876 def _morphy(self, form, pos, check_exceptions=True): 1877 # from jordanbg: 1878 # Given an original string x 1879 # 1. Apply rules once to the input to get y1, y2, y3, etc. 1880 # 2. Return all that are in the database 1881 # 3. If there are no matches, keep applying rules until you either 1882 # find a match or you can't go any further 1883 1884 exceptions = self._exception_map[pos] 1885 substitutions = self.MORPHOLOGICAL_SUBSTITUTIONS[pos] 1886 1887 def apply_rules(forms): 1888 return [ 1889 form[: -len(old)] + new 1890 for form in forms 1891 for old, new in substitutions 1892 if form.endswith(old) 1893 ] 1894 1895 def filter_forms(forms): 1896 result = [] 1897 seen = set() 1898 for form in forms: 1899 if form in self._lemma_pos_offset_map: 1900 if pos in self._lemma_pos_offset_map[form]: 1901 if form not in seen: 1902 result.append(form) 1903 seen.add(form) 1904 return result 1905 1906 # 0. Check the exception lists 1907 if check_exceptions: 1908 if form in exceptions: 1909 return filter_forms([form] + exceptions[form]) 1910 1911 # 1. Apply rules once to the input to get y1, y2, y3, etc. 1912 forms = apply_rules([form]) 1913 1914 # 2. Return all that are in the database (and check the original too) 1915 results = filter_forms([form] + forms) 1916 if results: 1917 return results 1918 1919 # 3. If there are no matches, keep applying rules until we find a match 1920 while forms: 1921 forms = apply_rules(forms) 1922 results = filter_forms(forms) 1923 if results: 1924 return results 1925 1926 # Return an empty list if we can't find anything 1927 return [] 1928 1929 ############################################################# 1930 # Create information content from corpus 1931 ############################################################# 1932 def ic(self, corpus, weight_senses_equally=False, smoothing=1.0): 1933 """ 1934 Creates an information content lookup dictionary from a corpus. 1935 1936 :type corpus: CorpusReader 1937 :param corpus: The corpus from which we create an information 1938 content dictionary. 1939 :type weight_senses_equally: bool 1940 :param weight_senses_equally: If this is True, gives all 1941 possible senses equal weight rather than dividing by the 1942 number of possible senses. (If a word has 3 synses, each 1943 sense gets 0.3333 per appearance when this is False, 1.0 when 1944 it is true.) 1945 :param smoothing: How much do we smooth synset counts (default is 1.0) 1946 :type smoothing: float 1947 :return: An information content dictionary 1948 """ 1949 counts = FreqDist() 1950 for ww in corpus.words(): 1951 counts[ww] += 1 1952 1953 ic = {} 1954 for pp in POS_LIST: 1955 ic[pp] = defaultdict(float) 1956 1957 # Initialize the counts with the smoothing value 1958 if smoothing > 0.0: 1959 for ss in self.all_synsets(): 1960 pos = ss._pos 1961 if pos == ADJ_SAT: 1962 pos = ADJ 1963 ic[pos][ss._offset] = smoothing 1964 1965 for ww in counts: 1966 possible_synsets = self.synsets(ww) 1967 if len(possible_synsets) == 0: 1968 continue 1969 1970 # Distribute weight among possible synsets 1971 weight = float(counts[ww]) 1972 if not weight_senses_equally: 1973 weight /= float(len(possible_synsets)) 1974 1975 for ss in possible_synsets: 1976 pos = ss._pos 1977 if pos == ADJ_SAT: 1978 pos = ADJ 1979 for level in ss._iter_hypernym_lists(): 1980 for hh in level: 1981 ic[pos][hh._offset] += weight 1982 # Add the weight to the root 1983 ic[pos][0] += weight 1984 return ic 1985 1986 def custom_lemmas(self, tab_file, lang): 1987 """ 1988 Reads a custom tab file containing mappings of lemmas in the given 1989 language to Princeton WordNet 3.0 synset offsets, allowing NLTK's 1990 WordNet functions to then be used with that language. 1991 1992 See the "Tab files" section at http://compling.hss.ntu.edu.sg/omw/ for 1993 documentation on the Multilingual WordNet tab file format. 1994 1995 :param tab_file: Tab file as a file or file-like object 1996 :type lang str 1997 :param lang ISO 639-3 code of the language of the tab file 1998 """ 1999 if len(lang) != 3: 2000 raise ValueError('lang should be a (3 character) ISO 639-3 code') 2001 self._lang_data[lang] = [defaultdict(list), defaultdict(list)] 2002 for l in tab_file.readlines(): 2003 if isinstance(l, bytes): 2004 # Support byte-stream files (e.g. as returned by Python 2's 2005 # open() function) as well as text-stream ones 2006 l = l.decode('utf-8') 2007 l = l.replace('\n', '') 2008 l = l.replace(' ', '_') 2009 if l[0] != '#': 2010 word = l.split('\t') 2011 self._lang_data[lang][0][word[0]].append(word[2]) 2012 self._lang_data[lang][1][word[2].lower()].append(word[0]) 2013 # Make sure no more entries are accidentally added subsequently 2014 self._lang_data[lang][0].default_factory = None 2015 self._lang_data[lang][1].default_factory = None 2016 2017 2018###################################################################### 2019# WordNet Information Content Corpus Reader 2020###################################################################### 2021 2022 2023class WordNetICCorpusReader(CorpusReader): 2024 """ 2025 A corpus reader for the WordNet information content corpus. 2026 """ 2027 2028 def __init__(self, root, fileids): 2029 CorpusReader.__init__(self, root, fileids, encoding='utf8') 2030 2031 # this load function would be more efficient if the data was pickled 2032 # Note that we can't use NLTK's frequency distributions because 2033 # synsets are overlapping (each instance of a synset also counts 2034 # as an instance of its hypernyms) 2035 def ic(self, icfile): 2036 """ 2037 Load an information content file from the wordnet_ic corpus 2038 and return a dictionary. This dictionary has just two keys, 2039 NOUN and VERB, whose values are dictionaries that map from 2040 synsets to information content values. 2041 2042 :type icfile: str 2043 :param icfile: The name of the wordnet_ic file (e.g. "ic-brown.dat") 2044 :return: An information content dictionary 2045 """ 2046 ic = {} 2047 ic[NOUN] = defaultdict(float) 2048 ic[VERB] = defaultdict(float) 2049 for num, line in enumerate(self.open(icfile)): 2050 if num == 0: # skip the header 2051 continue 2052 fields = line.split() 2053 offset = int(fields[0][:-1]) 2054 value = float(fields[1]) 2055 pos = _get_pos(fields[0]) 2056 if len(fields) == 3 and fields[2] == "ROOT": 2057 # Store root count. 2058 ic[pos][0] += value 2059 if value != 0: 2060 ic[pos][offset] = value 2061 return ic 2062 2063 2064###################################################################### 2065# Similarity metrics 2066###################################################################### 2067 2068# TODO: Add in the option to manually add a new root node; this will be 2069# useful for verb similarity as there exist multiple verb taxonomies. 2070 2071# More information about the metrics is available at 2072# http://marimba.d.umn.edu/similarity/measures.html 2073 2074 2075def path_similarity(synset1, synset2, verbose=False, simulate_root=True): 2076 return synset1.path_similarity(synset2, verbose, simulate_root) 2077 2078 2079def lch_similarity(synset1, synset2, verbose=False, simulate_root=True): 2080 return synset1.lch_similarity(synset2, verbose, simulate_root) 2081 2082 2083def wup_similarity(synset1, synset2, verbose=False, simulate_root=True): 2084 return synset1.wup_similarity(synset2, verbose, simulate_root) 2085 2086 2087def res_similarity(synset1, synset2, ic, verbose=False): 2088 return synset1.res_similarity(synset2, verbose) 2089 2090 2091def jcn_similarity(synset1, synset2, ic, verbose=False): 2092 return synset1.jcn_similarity(synset2, verbose) 2093 2094 2095def lin_similarity(synset1, synset2, ic, verbose=False): 2096 return synset1.lin_similarity(synset2, verbose) 2097 2098 2099path_similarity.__doc__ = Synset.path_similarity.__doc__ 2100lch_similarity.__doc__ = Synset.lch_similarity.__doc__ 2101wup_similarity.__doc__ = Synset.wup_similarity.__doc__ 2102res_similarity.__doc__ = Synset.res_similarity.__doc__ 2103jcn_similarity.__doc__ = Synset.jcn_similarity.__doc__ 2104lin_similarity.__doc__ = Synset.lin_similarity.__doc__ 2105 2106 2107def _lcs_ic(synset1, synset2, ic, verbose=False): 2108 """ 2109 Get the information content of the least common subsumer that has 2110 the highest information content value. If two nodes have no 2111 explicit common subsumer, assume that they share an artificial 2112 root node that is the hypernym of all explicit roots. 2113 2114 :type synset1: Synset 2115 :param synset1: First input synset. 2116 :type synset2: Synset 2117 :param synset2: Second input synset. Must be the same part of 2118 speech as the first synset. 2119 :type ic: dict 2120 :param ic: an information content object (as returned by ``load_ic()``). 2121 :return: The information content of the two synsets and their most 2122 informative subsumer 2123 """ 2124 if synset1._pos != synset2._pos: 2125 raise WordNetError( 2126 'Computing the least common subsumer requires ' 2127 '%s and %s to have the same part of speech.' % (synset1, synset2) 2128 ) 2129 2130 ic1 = information_content(synset1, ic) 2131 ic2 = information_content(synset2, ic) 2132 subsumers = synset1.common_hypernyms(synset2) 2133 if len(subsumers) == 0: 2134 subsumer_ic = 0 2135 else: 2136 subsumer_ic = max(information_content(s, ic) for s in subsumers) 2137 2138 if verbose: 2139 print("> LCS Subsumer by content:", subsumer_ic) 2140 2141 return ic1, ic2, subsumer_ic 2142 2143 2144# Utility functions 2145 2146 2147def information_content(synset, ic): 2148 try: 2149 icpos = ic[synset._pos] 2150 except KeyError: 2151 msg = 'Information content file has no entries for part-of-speech: %s' 2152 raise WordNetError(msg % synset._pos) 2153 2154 counts = icpos[synset._offset] 2155 if counts == 0: 2156 return _INF 2157 else: 2158 return -math.log(counts / icpos[0]) 2159 2160 2161# get the part of speech (NOUN or VERB) from the information content record 2162# (each identifier has a 'n' or 'v' suffix) 2163 2164 2165def _get_pos(field): 2166 if field[-1] == 'n': 2167 return NOUN 2168 elif field[-1] == 'v': 2169 return VERB 2170 else: 2171 msg = ( 2172 "Unidentified part of speech in WordNet Information Content file " 2173 "for field %s" % field 2174 ) 2175 raise ValueError(msg) 2176 2177 2178# unload corpus after tests 2179def teardown_module(module=None): 2180 from nltk.corpus import wordnet 2181 2182 wordnet._unload() 2183