1# -*- coding: utf-8 -*-
2# Natural Language Toolkit: WordNet
3#
4# Copyright (C) 2001-2019 NLTK Project
5# Author: Steven Bethard <Steven.Bethard@colorado.edu>
6#         Steven Bird <stevenbird1@gmail.com>
7#         Edward Loper <edloper@gmail.com>
8#         Nitin Madnani <nmadnani@ets.org>
9#         Nasruddin A’aidil Shari
10#         Sim Wei Ying Geraldine
11#         Soe Lynn
12#         Francis Bond <bond@ieee.org>
13# URL: <http://nltk.org/>
14# For license information, see LICENSE.TXT
15
16"""
17An NLTK interface for WordNet
18
19WordNet is a lexical database of English.
20Using synsets, helps find conceptual relationships between words
21such as hypernyms, hyponyms, synonyms, antonyms etc.
22
23For details about WordNet see:
24http://wordnet.princeton.edu/
25
26This module also allows you to find lemmas in languages
27other than English from the Open Multilingual Wordnet
28http://compling.hss.ntu.edu.sg/omw/
29
30"""
31
32from __future__ import print_function, unicode_literals
33
34import math
35import re
36from itertools import islice, chain
37from functools import total_ordering
38from operator import itemgetter
39from collections import defaultdict, deque
40
41from six import iteritems
42from six.moves import range
43
44from nltk.corpus.reader import CorpusReader
45from nltk.util import binary_search_file as _binary_search_file
46from nltk.probability import FreqDist
47from nltk.compat import python_2_unicode_compatible
48from nltk.internals import deprecated
49
50######################################################################
51# Table of Contents
52######################################################################
53# - Constants
54# - Data Classes
55#   - WordNetError
56#   - Lemma
57#   - Synset
58# - WordNet Corpus Reader
59# - WordNet Information Content Corpus Reader
60# - Similarity Metrics
61# - Demo
62
63######################################################################
64# Constants
65######################################################################
66
67#: Positive infinity (for similarity functions)
68_INF = 1e300
69
70# { Part-of-speech constants
71ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
72# }
73
74POS_LIST = [NOUN, VERB, ADJ, ADV]
75
76# A table of strings that are used to express verb frames.
77VERB_FRAME_STRINGS = (
78    None,
79    "Something %s",
80    "Somebody %s",
81    "It is %sing",
82    "Something is %sing PP",
83    "Something %s something Adjective/Noun",
84    "Something %s Adjective/Noun",
85    "Somebody %s Adjective",
86    "Somebody %s something",
87    "Somebody %s somebody",
88    "Something %s somebody",
89    "Something %s something",
90    "Something %s to somebody",
91    "Somebody %s on something",
92    "Somebody %s somebody something",
93    "Somebody %s something to somebody",
94    "Somebody %s something from somebody",
95    "Somebody %s somebody with something",
96    "Somebody %s somebody of something",
97    "Somebody %s something on somebody",
98    "Somebody %s somebody PP",
99    "Somebody %s something PP",
100    "Somebody %s PP",
101    "Somebody's (body part) %s",
102    "Somebody %s somebody to INFINITIVE",
103    "Somebody %s somebody INFINITIVE",
104    "Somebody %s that CLAUSE",
105    "Somebody %s to somebody",
106    "Somebody %s to INFINITIVE",
107    "Somebody %s whether INFINITIVE",
108    "Somebody %s somebody into V-ing something",
109    "Somebody %s something with something",
110    "Somebody %s INFINITIVE",
111    "Somebody %s VERB-ing",
112    "It %s that CLAUSE",
113    "Something %s INFINITIVE",
114)
115
116SENSENUM_RE = re.compile(r'\.[\d]+\.')
117
118
119######################################################################
120# Data Classes
121######################################################################
122
123
124class WordNetError(Exception):
125    """An exception class for wordnet-related errors."""
126
127
128@total_ordering
129class _WordNetObject(object):
130    """A common base class for lemmas and synsets."""
131
132    def hypernyms(self):
133        return self._related('@')
134
135    def _hypernyms(self):
136        return self._related('@')
137
138    def instance_hypernyms(self):
139        return self._related('@i')
140
141    def _instance_hypernyms(self):
142        return self._related('@i')
143
144    def hyponyms(self):
145        return self._related('~')
146
147    def instance_hyponyms(self):
148        return self._related('~i')
149
150    def member_holonyms(self):
151        return self._related('#m')
152
153    def substance_holonyms(self):
154        return self._related('#s')
155
156    def part_holonyms(self):
157        return self._related('#p')
158
159    def member_meronyms(self):
160        return self._related('%m')
161
162    def substance_meronyms(self):
163        return self._related('%s')
164
165    def part_meronyms(self):
166        return self._related('%p')
167
168    def topic_domains(self):
169        return self._related(';c')
170
171    def in_topic_domains(self):
172        return self._related('-c')
173
174    def region_domains(self):
175        return self._related(';r')
176
177    def in_region_domains(self):
178        return self._related('-r')
179
180    def usage_domains(self):
181        return self._related(';u')
182
183    def in_usage_domains(self):
184        return self._related('-u')
185
186    def attributes(self):
187        return self._related('=')
188
189    def entailments(self):
190        return self._related('*')
191
192    def causes(self):
193        return self._related('>')
194
195    def also_sees(self):
196        return self._related('^')
197
198    def verb_groups(self):
199        return self._related('$')
200
201    def similar_tos(self):
202        return self._related('&')
203
204    def __hash__(self):
205        return hash(self._name)
206
207    def __eq__(self, other):
208        return self._name == other._name
209
210    def __ne__(self, other):
211        return self._name != other._name
212
213    def __lt__(self, other):
214        return self._name < other._name
215
216
217@python_2_unicode_compatible
218class Lemma(_WordNetObject):
219    """
220    The lexical entry for a single morphological form of a
221    sense-disambiguated word.
222
223    Create a Lemma from a "<word>.<pos>.<number>.<lemma>" string where:
224    <word> is the morphological stem identifying the synset
225    <pos> is one of the module attributes ADJ, ADJ_SAT, ADV, NOUN or VERB
226    <number> is the sense number, counting from 0.
227    <lemma> is the morphological form of interest
228
229    Note that <word> and <lemma> can be different, e.g. the Synset
230    'salt.n.03' has the Lemmas 'salt.n.03.salt', 'salt.n.03.saltiness' and
231    'salt.n.03.salinity'.
232
233    Lemma attributes, accessible via methods with the same name::
234
235    - name: The canonical name of this lemma.
236    - synset: The synset that this lemma belongs to.
237    - syntactic_marker: For adjectives, the WordNet string identifying the
238      syntactic position relative modified noun. See:
239      http://wordnet.princeton.edu/man/wninput.5WN.html#sect10
240      For all other parts of speech, this attribute is None.
241    - count: The frequency of this lemma in wordnet.
242
243    Lemma methods:
244
245    Lemmas have the following methods for retrieving related Lemmas. They
246    correspond to the names for the pointer symbols defined here:
247    http://wordnet.princeton.edu/man/wninput.5WN.html#sect3
248    These methods all return lists of Lemmas:
249
250    - antonyms
251    - hypernyms, instance_hypernyms
252    - hyponyms, instance_hyponyms
253    - member_holonyms, substance_holonyms, part_holonyms
254    - member_meronyms, substance_meronyms, part_meronyms
255    - topic_domains, region_domains, usage_domains
256    - attributes
257    - derivationally_related_forms
258    - entailments
259    - causes
260    - also_sees
261    - verb_groups
262    - similar_tos
263    - pertainyms
264    """
265
266    __slots__ = [
267        '_wordnet_corpus_reader',
268        '_name',
269        '_syntactic_marker',
270        '_synset',
271        '_frame_strings',
272        '_frame_ids',
273        '_lexname_index',
274        '_lex_id',
275        '_lang',
276        '_key',
277    ]
278
279    def __init__(
280        self,
281        wordnet_corpus_reader,
282        synset,
283        name,
284        lexname_index,
285        lex_id,
286        syntactic_marker,
287    ):
288        self._wordnet_corpus_reader = wordnet_corpus_reader
289        self._name = name
290        self._syntactic_marker = syntactic_marker
291        self._synset = synset
292        self._frame_strings = []
293        self._frame_ids = []
294        self._lexname_index = lexname_index
295        self._lex_id = lex_id
296        self._lang = 'eng'
297
298        self._key = None  # gets set later.
299
300    def name(self):
301        return self._name
302
303    def syntactic_marker(self):
304        return self._syntactic_marker
305
306    def synset(self):
307        return self._synset
308
309    def frame_strings(self):
310        return self._frame_strings
311
312    def frame_ids(self):
313        return self._frame_ids
314
315    def lang(self):
316        return self._lang
317
318    def key(self):
319        return self._key
320
321    def __repr__(self):
322        tup = type(self).__name__, self._synset._name, self._name
323        return "%s('%s.%s')" % tup
324
325    def _related(self, relation_symbol):
326        get_synset = self._wordnet_corpus_reader.synset_from_pos_and_offset
327        if (self._name, relation_symbol) not in self._synset._lemma_pointers:
328            return []
329        return [
330            get_synset(pos, offset)._lemmas[lemma_index]
331            for pos, offset, lemma_index in self._synset._lemma_pointers[
332                self._name, relation_symbol
333            ]
334        ]
335
336    def count(self):
337        """Return the frequency count for this Lemma"""
338        return self._wordnet_corpus_reader.lemma_count(self)
339
340    def antonyms(self):
341        return self._related('!')
342
343    def derivationally_related_forms(self):
344        return self._related('+')
345
346    def pertainyms(self):
347        return self._related('\\')
348
349
350@python_2_unicode_compatible
351class Synset(_WordNetObject):
352    """Create a Synset from a "<lemma>.<pos>.<number>" string where:
353    <lemma> is the word's morphological stem
354    <pos> is one of the module attributes ADJ, ADJ_SAT, ADV, NOUN or VERB
355    <number> is the sense number, counting from 0.
356
357    Synset attributes, accessible via methods with the same name:
358
359    - name: The canonical name of this synset, formed using the first lemma
360      of this synset. Note that this may be different from the name
361      passed to the constructor if that string used a different lemma to
362      identify the synset.
363    - pos: The synset's part of speech, matching one of the module level
364      attributes ADJ, ADJ_SAT, ADV, NOUN or VERB.
365    - lemmas: A list of the Lemma objects for this synset.
366    - definition: The definition for this synset.
367    - examples: A list of example strings for this synset.
368    - offset: The offset in the WordNet dict file of this synset.
369    - lexname: The name of the lexicographer file containing this synset.
370
371    Synset methods:
372
373    Synsets have the following methods for retrieving related Synsets.
374    They correspond to the names for the pointer symbols defined here:
375    http://wordnet.princeton.edu/man/wninput.5WN.html#sect3
376    These methods all return lists of Synsets.
377
378    - hypernyms, instance_hypernyms
379    - hyponyms, instance_hyponyms
380    - member_holonyms, substance_holonyms, part_holonyms
381    - member_meronyms, substance_meronyms, part_meronyms
382    - attributes
383    - entailments
384    - causes
385    - also_sees
386    - verb_groups
387    - similar_tos
388
389    Additionally, Synsets support the following methods specific to the
390    hypernym relation:
391
392    - root_hypernyms
393    - common_hypernyms
394    - lowest_common_hypernyms
395
396    Note that Synsets do not support the following relations because
397    these are defined by WordNet as lexical relations:
398
399    - antonyms
400    - derivationally_related_forms
401    - pertainyms
402    """
403
404    __slots__ = [
405        '_pos',
406        '_offset',
407        '_name',
408        '_frame_ids',
409        '_lemmas',
410        '_lemma_names',
411        '_definition',
412        '_examples',
413        '_lexname',
414        '_pointers',
415        '_lemma_pointers',
416        '_max_depth',
417        '_min_depth',
418    ]
419
420    def __init__(self, wordnet_corpus_reader):
421        self._wordnet_corpus_reader = wordnet_corpus_reader
422        # All of these attributes get initialized by
423        # WordNetCorpusReader._synset_from_pos_and_line()
424
425        self._pos = None
426        self._offset = None
427        self._name = None
428        self._frame_ids = []
429        self._lemmas = []
430        self._lemma_names = []
431        self._definition = None
432        self._examples = []
433        self._lexname = None  # lexicographer name
434        self._all_hypernyms = None
435
436        self._pointers = defaultdict(set)
437        self._lemma_pointers = defaultdict(list)
438
439    def pos(self):
440        return self._pos
441
442    def offset(self):
443        return self._offset
444
445    def name(self):
446        return self._name
447
448    def frame_ids(self):
449        return self._frame_ids
450
451    def definition(self):
452        return self._definition
453
454    def examples(self):
455        return self._examples
456
457    def lexname(self):
458        return self._lexname
459
460    def _needs_root(self):
461        if self._pos == NOUN:
462            if self._wordnet_corpus_reader.get_version() == '1.6':
463                return True
464            else:
465                return False
466        elif self._pos == VERB:
467            return True
468
469    def lemma_names(self, lang='eng'):
470        '''Return all the lemma_names associated with the synset'''
471        if lang == 'eng':
472            return self._lemma_names
473        else:
474            self._wordnet_corpus_reader._load_lang_data(lang)
475
476            i = self._wordnet_corpus_reader.ss2of(self, lang)
477            if i in self._wordnet_corpus_reader._lang_data[lang][0]:
478                return self._wordnet_corpus_reader._lang_data[lang][0][i]
479            else:
480                return []
481
482    def lemmas(self, lang='eng'):
483        '''Return all the lemma objects associated with the synset'''
484        if lang == 'eng':
485            return self._lemmas
486        else:
487            self._wordnet_corpus_reader._load_lang_data(lang)
488            lemmark = []
489            lemmy = self.lemma_names(lang)
490            for lem in lemmy:
491                temp = Lemma(
492                    self._wordnet_corpus_reader,
493                    self,
494                    lem,
495                    self._wordnet_corpus_reader._lexnames.index(self.lexname()),
496                    0,
497                    None,
498                )
499                temp._lang = lang
500                lemmark.append(temp)
501            return lemmark
502
503    def root_hypernyms(self):
504        """Get the topmost hypernyms of this synset in WordNet."""
505
506        result = []
507        seen = set()
508        todo = [self]
509        while todo:
510            next_synset = todo.pop()
511            if next_synset not in seen:
512                seen.add(next_synset)
513                next_hypernyms = (
514                    next_synset.hypernyms() + next_synset.instance_hypernyms()
515                )
516                if not next_hypernyms:
517                    result.append(next_synset)
518                else:
519                    todo.extend(next_hypernyms)
520        return result
521
522    # Simpler implementation which makes incorrect assumption that
523    # hypernym hierarchy is acyclic:
524    #
525    #        if not self.hypernyms():
526    #            return [self]
527    #        else:
528    #            return list(set(root for h in self.hypernyms()
529    #                            for root in h.root_hypernyms()))
530    def max_depth(self):
531        """
532        :return: The length of the longest hypernym path from this
533        synset to the root.
534        """
535
536        if "_max_depth" not in self.__dict__:
537            hypernyms = self.hypernyms() + self.instance_hypernyms()
538            if not hypernyms:
539                self._max_depth = 0
540            else:
541                self._max_depth = 1 + max(h.max_depth() for h in hypernyms)
542        return self._max_depth
543
544    def min_depth(self):
545        """
546        :return: The length of the shortest hypernym path from this
547        synset to the root.
548        """
549
550        if "_min_depth" not in self.__dict__:
551            hypernyms = self.hypernyms() + self.instance_hypernyms()
552            if not hypernyms:
553                self._min_depth = 0
554            else:
555                self._min_depth = 1 + min(h.min_depth() for h in hypernyms)
556        return self._min_depth
557
558    def closure(self, rel, depth=-1):
559        """Return the transitive closure of source under the rel
560        relationship, breadth-first
561
562            >>> from nltk.corpus import wordnet as wn
563            >>> dog = wn.synset('dog.n.01')
564            >>> hyp = lambda s:s.hypernyms()
565            >>> list(dog.closure(hyp))
566            [Synset('canine.n.02'), Synset('domestic_animal.n.01'),
567            Synset('carnivore.n.01'), Synset('animal.n.01'),
568            Synset('placental.n.01'), Synset('organism.n.01'),
569            Synset('mammal.n.01'), Synset('living_thing.n.01'),
570            Synset('vertebrate.n.01'), Synset('whole.n.02'),
571            Synset('chordate.n.01'), Synset('object.n.01'),
572            Synset('physical_entity.n.01'), Synset('entity.n.01')]
573
574        """
575        from nltk.util import breadth_first
576
577        synset_offsets = []
578        for synset in breadth_first(self, rel, depth):
579            if synset._offset != self._offset:
580                if synset._offset not in synset_offsets:
581                    synset_offsets.append(synset._offset)
582                    yield synset
583
584    def hypernym_paths(self):
585        """
586        Get the path(s) from this synset to the root, where each path is a
587        list of the synset nodes traversed on the way to the root.
588
589        :return: A list of lists, where each list gives the node sequence
590           connecting the initial ``Synset`` node and a root node.
591        """
592        paths = []
593
594        hypernyms = self.hypernyms() + self.instance_hypernyms()
595        if len(hypernyms) == 0:
596            paths = [[self]]
597
598        for hypernym in hypernyms:
599            for ancestor_list in hypernym.hypernym_paths():
600                ancestor_list.append(self)
601                paths.append(ancestor_list)
602        return paths
603
604    def common_hypernyms(self, other):
605        """
606        Find all synsets that are hypernyms of this synset and the
607        other synset.
608
609        :type other: Synset
610        :param other: other input synset.
611        :return: The synsets that are hypernyms of both synsets.
612        """
613        if not self._all_hypernyms:
614            self._all_hypernyms = set(
615                self_synset
616                for self_synsets in self._iter_hypernym_lists()
617                for self_synset in self_synsets
618            )
619        if not other._all_hypernyms:
620            other._all_hypernyms = set(
621                other_synset
622                for other_synsets in other._iter_hypernym_lists()
623                for other_synset in other_synsets
624            )
625        return list(self._all_hypernyms.intersection(other._all_hypernyms))
626
627    def lowest_common_hypernyms(self, other, simulate_root=False, use_min_depth=False):
628        """
629        Get a list of lowest synset(s) that both synsets have as a hypernym.
630        When `use_min_depth == False` this means that the synset which appears
631        as a hypernym of both `self` and `other` with the lowest maximum depth
632        is returned or if there are multiple such synsets at the same depth
633        they are all returned
634
635        However, if `use_min_depth == True` then the synset(s) which has/have
636        the lowest minimum depth and appear(s) in both paths is/are returned.
637
638        By setting the use_min_depth flag to True, the behavior of NLTK2 can be
639        preserved. This was changed in NLTK3 to give more accurate results in a
640        small set of cases, generally with synsets concerning people. (eg:
641        'chef.n.01', 'fireman.n.01', etc.)
642
643        This method is an implementation of Ted Pedersen's "Lowest Common
644        Subsumer" method from the Perl Wordnet module. It can return either
645        "self" or "other" if they are a hypernym of the other.
646
647        :type other: Synset
648        :param other: other input synset
649        :type simulate_root: bool
650        :param simulate_root: The various verb taxonomies do not
651            share a single root which disallows this metric from working for
652            synsets that are not connected. This flag (False by default)
653            creates a fake root that connects all the taxonomies. Set it
654            to True to enable this behavior. For the noun taxonomy,
655            there is usually a default root except for WordNet version 1.6.
656            If you are using wordnet 1.6, a fake root will need to be added
657            for nouns as well.
658        :type use_min_depth: bool
659        :param use_min_depth: This setting mimics older (v2) behavior of NLTK
660            wordnet If True, will use the min_depth function to calculate the
661            lowest common hypernyms. This is known to give strange results for
662            some synset pairs (eg: 'chef.n.01', 'fireman.n.01') but is retained
663            for backwards compatibility
664        :return: The synsets that are the lowest common hypernyms of both
665            synsets
666        """
667        synsets = self.common_hypernyms(other)
668        if simulate_root:
669            fake_synset = Synset(None)
670            fake_synset._name = '*ROOT*'
671            fake_synset.hypernyms = lambda: []
672            fake_synset.instance_hypernyms = lambda: []
673            synsets.append(fake_synset)
674
675        try:
676            if use_min_depth:
677                max_depth = max(s.min_depth() for s in synsets)
678                unsorted_lch = [s for s in synsets if s.min_depth() == max_depth]
679            else:
680                max_depth = max(s.max_depth() for s in synsets)
681                unsorted_lch = [s for s in synsets if s.max_depth() == max_depth]
682            return sorted(unsorted_lch)
683        except ValueError:
684            return []
685
686    def hypernym_distances(self, distance=0, simulate_root=False):
687        """
688        Get the path(s) from this synset to the root, counting the distance
689        of each node from the initial node on the way. A set of
690        (synset, distance) tuples is returned.
691
692        :type distance: int
693        :param distance: the distance (number of edges) from this hypernym to
694            the original hypernym ``Synset`` on which this method was called.
695        :return: A set of ``(Synset, int)`` tuples where each ``Synset`` is
696           a hypernym of the first ``Synset``.
697        """
698        distances = set([(self, distance)])
699        for hypernym in self._hypernyms() + self._instance_hypernyms():
700            distances |= hypernym.hypernym_distances(distance + 1, simulate_root=False)
701        if simulate_root:
702            fake_synset = Synset(None)
703            fake_synset._name = '*ROOT*'
704            fake_synset_distance = max(distances, key=itemgetter(1))[1]
705            distances.add((fake_synset, fake_synset_distance + 1))
706        return distances
707
708    def _shortest_hypernym_paths(self, simulate_root):
709        if self._name == '*ROOT*':
710            return {self: 0}
711
712        queue = deque([(self, 0)])
713        path = {}
714
715        while queue:
716            s, depth = queue.popleft()
717            if s in path:
718                continue
719            path[s] = depth
720
721            depth += 1
722            queue.extend((hyp, depth) for hyp in s._hypernyms())
723            queue.extend((hyp, depth) for hyp in s._instance_hypernyms())
724
725        if simulate_root:
726            fake_synset = Synset(None)
727            fake_synset._name = '*ROOT*'
728            path[fake_synset] = max(path.values()) + 1
729
730        return path
731
732    def shortest_path_distance(self, other, simulate_root=False):
733        """
734        Returns the distance of the shortest path linking the two synsets (if
735        one exists). For each synset, all the ancestor nodes and their
736        distances are recorded and compared. The ancestor node common to both
737        synsets that can be reached with the minimum number of traversals is
738        used. If no ancestor nodes are common, None is returned. If a node is
739        compared with itself 0 is returned.
740
741        :type other: Synset
742        :param other: The Synset to which the shortest path will be found.
743        :return: The number of edges in the shortest path connecting the two
744            nodes, or None if no path exists.
745        """
746
747        if self == other:
748            return 0
749
750        dist_dict1 = self._shortest_hypernym_paths(simulate_root)
751        dist_dict2 = other._shortest_hypernym_paths(simulate_root)
752
753        # For each ancestor synset common to both subject synsets, find the
754        # connecting path length. Return the shortest of these.
755
756        inf = float('inf')
757        path_distance = inf
758        for synset, d1 in iteritems(dist_dict1):
759            d2 = dist_dict2.get(synset, inf)
760            path_distance = min(path_distance, d1 + d2)
761
762        return None if math.isinf(path_distance) else path_distance
763
764    def tree(self, rel, depth=-1, cut_mark=None):
765        """
766        >>> from nltk.corpus import wordnet as wn
767        >>> dog = wn.synset('dog.n.01')
768        >>> hyp = lambda s:s.hypernyms()
769        >>> from pprint import pprint
770        >>> pprint(dog.tree(hyp))
771        [Synset('dog.n.01'),
772         [Synset('canine.n.02'),
773          [Synset('carnivore.n.01'),
774           [Synset('placental.n.01'),
775            [Synset('mammal.n.01'),
776             [Synset('vertebrate.n.01'),
777              [Synset('chordate.n.01'),
778               [Synset('animal.n.01'),
779                [Synset('organism.n.01'),
780                 [Synset('living_thing.n.01'),
781                  [Synset('whole.n.02'),
782                   [Synset('object.n.01'),
783                    [Synset('physical_entity.n.01'),
784                     [Synset('entity.n.01')]]]]]]]]]]]]],
785         [Synset('domestic_animal.n.01'),
786          [Synset('animal.n.01'),
787           [Synset('organism.n.01'),
788            [Synset('living_thing.n.01'),
789             [Synset('whole.n.02'),
790              [Synset('object.n.01'),
791               [Synset('physical_entity.n.01'), [Synset('entity.n.01')]]]]]]]]]
792        """
793
794        tree = [self]
795        if depth != 0:
796            tree += [x.tree(rel, depth - 1, cut_mark) for x in rel(self)]
797        elif cut_mark:
798            tree += [cut_mark]
799        return tree
800
801    # interface to similarity methods
802    def path_similarity(self, other, verbose=False, simulate_root=True):
803        """
804        Path Distance Similarity:
805        Return a score denoting how similar two word senses are, based on the
806        shortest path that connects the senses in the is-a (hypernym/hypnoym)
807        taxonomy. The score is in the range 0 to 1, except in those cases where
808        a path cannot be found (will only be true for verbs as there are many
809        distinct verb taxonomies), in which case None is returned. A score of
810        1 represents identity i.e. comparing a sense with itself will return 1.
811
812        :type other: Synset
813        :param other: The ``Synset`` that this ``Synset`` is being compared to.
814        :type simulate_root: bool
815        :param simulate_root: The various verb taxonomies do not
816            share a single root which disallows this metric from working for
817            synsets that are not connected. This flag (True by default)
818            creates a fake root that connects all the taxonomies. Set it
819            to false to disable this behavior. For the noun taxonomy,
820            there is usually a default root except for WordNet version 1.6.
821            If you are using wordnet 1.6, a fake root will be added for nouns
822            as well.
823        :return: A score denoting the similarity of the two ``Synset`` objects,
824            normally between 0 and 1. None is returned if no connecting path
825            could be found. 1 is returned if a ``Synset`` is compared with
826            itself.
827        """
828
829        distance = self.shortest_path_distance(
830            other, simulate_root=simulate_root and self._needs_root()
831        )
832        if distance is None or distance < 0:
833            return None
834        return 1.0 / (distance + 1)
835
836    def lch_similarity(self, other, verbose=False, simulate_root=True):
837        """
838        Leacock Chodorow Similarity:
839        Return a score denoting how similar two word senses are, based on the
840        shortest path that connects the senses (as above) and the maximum depth
841        of the taxonomy in which the senses occur. The relationship is given as
842        -log(p/2d) where p is the shortest path length and d is the taxonomy
843        depth.
844
845        :type  other: Synset
846        :param other: The ``Synset`` that this ``Synset`` is being compared to.
847        :type simulate_root: bool
848        :param simulate_root: The various verb taxonomies do not
849            share a single root which disallows this metric from working for
850            synsets that are not connected. This flag (True by default)
851            creates a fake root that connects all the taxonomies. Set it
852            to false to disable this behavior. For the noun taxonomy,
853            there is usually a default root except for WordNet version 1.6.
854            If you are using wordnet 1.6, a fake root will be added for nouns
855            as well.
856        :return: A score denoting the similarity of the two ``Synset`` objects,
857            normally greater than 0. None is returned if no connecting path
858            could be found. If a ``Synset`` is compared with itself, the
859            maximum score is returned, which varies depending on the taxonomy
860            depth.
861        """
862
863        if self._pos != other._pos:
864            raise WordNetError(
865                'Computing the lch similarity requires '
866                '%s and %s to have the same part of speech.' % (self, other)
867            )
868
869        need_root = self._needs_root()
870
871        if self._pos not in self._wordnet_corpus_reader._max_depth:
872            self._wordnet_corpus_reader._compute_max_depth(self._pos, need_root)
873
874        depth = self._wordnet_corpus_reader._max_depth[self._pos]
875
876        distance = self.shortest_path_distance(
877            other, simulate_root=simulate_root and need_root
878        )
879
880        if distance is None or distance < 0 or depth == 0:
881            return None
882        return -math.log((distance + 1) / (2.0 * depth))
883
884    def wup_similarity(self, other, verbose=False, simulate_root=True):
885        """
886        Wu-Palmer Similarity:
887        Return a score denoting how similar two word senses are, based on the
888        depth of the two senses in the taxonomy and that of their Least Common
889        Subsumer (most specific ancestor node). Previously, the scores computed
890        by this implementation did _not_ always agree with those given by
891        Pedersen's Perl implementation of WordNet Similarity. However, with
892        the addition of the simulate_root flag (see below), the score for
893        verbs now almost always agree but not always for nouns.
894
895        The LCS does not necessarily feature in the shortest path connecting
896        the two senses, as it is by definition the common ancestor deepest in
897        the taxonomy, not closest to the two senses. Typically, however, it
898        will so feature. Where multiple candidates for the LCS exist, that
899        whose shortest path to the root node is the longest will be selected.
900        Where the LCS has multiple paths to the root, the longer path is used
901        for the purposes of the calculation.
902
903        :type  other: Synset
904        :param other: The ``Synset`` that this ``Synset`` is being compared to.
905        :type simulate_root: bool
906        :param simulate_root: The various verb taxonomies do not
907            share a single root which disallows this metric from working for
908            synsets that are not connected. This flag (True by default)
909            creates a fake root that connects all the taxonomies. Set it
910            to false to disable this behavior. For the noun taxonomy,
911            there is usually a default root except for WordNet version 1.6.
912            If you are using wordnet 1.6, a fake root will be added for nouns
913            as well.
914        :return: A float score denoting the similarity of the two ``Synset``
915            objects, normally greater than zero. If no connecting path between
916            the two senses can be found, None is returned.
917
918        """
919
920        need_root = self._needs_root()
921        # Note that to preserve behavior from NLTK2 we set use_min_depth=True
922        # It is possible that more accurate results could be obtained by
923        # removing this setting and it should be tested later on
924        subsumers = self.lowest_common_hypernyms(
925            other, simulate_root=simulate_root and need_root, use_min_depth=True
926        )
927
928        # If no LCS was found return None
929        if len(subsumers) == 0:
930            return None
931
932        subsumer = self if self in subsumers else subsumers[0]
933
934        # Get the longest path from the LCS to the root,
935        # including a correction:
936        # - add one because the calculations include both the start and end
937        #   nodes
938        depth = subsumer.max_depth() + 1
939
940        # Note: No need for an additional add-one correction for non-nouns
941        # to account for an imaginary root node because that is now
942        # automatically handled by simulate_root
943        # if subsumer._pos != NOUN:
944        #     depth += 1
945
946        # Get the shortest path from the LCS to each of the synsets it is
947        # subsuming.  Add this to the LCS path length to get the path
948        # length from each synset to the root.
949        len1 = self.shortest_path_distance(
950            subsumer, simulate_root=simulate_root and need_root
951        )
952        len2 = other.shortest_path_distance(
953            subsumer, simulate_root=simulate_root and need_root
954        )
955        if len1 is None or len2 is None:
956            return None
957        len1 += depth
958        len2 += depth
959        return (2.0 * depth) / (len1 + len2)
960
961    def res_similarity(self, other, ic, verbose=False):
962        """
963        Resnik Similarity:
964        Return a score denoting how similar two word senses are, based on the
965        Information Content (IC) of the Least Common Subsumer (most specific
966        ancestor node).
967
968        :type  other: Synset
969        :param other: The ``Synset`` that this ``Synset`` is being compared to.
970        :type ic: dict
971        :param ic: an information content object (as returned by
972            ``nltk.corpus.wordnet_ic.ic()``).
973        :return: A float score denoting the similarity of the two ``Synset``
974            objects. Synsets whose LCS is the root node of the taxonomy will
975            have a score of 0 (e.g. N['dog'][0] and N['table'][0]).
976        """
977
978        ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
979        return lcs_ic
980
981    def jcn_similarity(self, other, ic, verbose=False):
982        """
983        Jiang-Conrath Similarity:
984        Return a score denoting how similar two word senses are, based on the
985        Information Content (IC) of the Least Common Subsumer (most specific
986        ancestor node) and that of the two input Synsets. The relationship is
987        given by the equation 1 / (IC(s1) + IC(s2) - 2 * IC(lcs)).
988
989        :type  other: Synset
990        :param other: The ``Synset`` that this ``Synset`` is being compared to.
991        :type  ic: dict
992        :param ic: an information content object (as returned by
993            ``nltk.corpus.wordnet_ic.ic()``).
994        :return: A float score denoting the similarity of the two ``Synset``
995            objects.
996        """
997
998        if self == other:
999            return _INF
1000
1001        ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
1002
1003        # If either of the input synsets are the root synset, or have a
1004        # frequency of 0 (sparse data problem), return 0.
1005        if ic1 == 0 or ic2 == 0:
1006            return 0
1007
1008        ic_difference = ic1 + ic2 - 2 * lcs_ic
1009
1010        if ic_difference == 0:
1011            return _INF
1012
1013        return 1 / ic_difference
1014
1015    def lin_similarity(self, other, ic, verbose=False):
1016        """
1017        Lin Similarity:
1018        Return a score denoting how similar two word senses are, based on the
1019        Information Content (IC) of the Least Common Subsumer (most specific
1020        ancestor node) and that of the two input Synsets. The relationship is
1021        given by the equation 2 * IC(lcs) / (IC(s1) + IC(s2)).
1022
1023        :type other: Synset
1024        :param other: The ``Synset`` that this ``Synset`` is being compared to.
1025        :type ic: dict
1026        :param ic: an information content object (as returned by
1027            ``nltk.corpus.wordnet_ic.ic()``).
1028        :return: A float score denoting the similarity of the two ``Synset``
1029            objects, in the range 0 to 1.
1030        """
1031
1032        ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
1033        return (2.0 * lcs_ic) / (ic1 + ic2)
1034
1035    def _iter_hypernym_lists(self):
1036        """
1037        :return: An iterator over ``Synset`` objects that are either proper
1038        hypernyms or instance of hypernyms of the synset.
1039        """
1040        todo = [self]
1041        seen = set()
1042        while todo:
1043            for synset in todo:
1044                seen.add(synset)
1045            yield todo
1046            todo = [
1047                hypernym
1048                for synset in todo
1049                for hypernym in (synset.hypernyms() + synset.instance_hypernyms())
1050                if hypernym not in seen
1051            ]
1052
1053    def __repr__(self):
1054        return "%s('%s')" % (type(self).__name__, self._name)
1055
1056    def _related(self, relation_symbol, sort=True):
1057        get_synset = self._wordnet_corpus_reader.synset_from_pos_and_offset
1058        if relation_symbol not in self._pointers:
1059            return []
1060        pointer_tuples = self._pointers[relation_symbol]
1061        r = [get_synset(pos, offset) for pos, offset in pointer_tuples]
1062        if sort:
1063            r.sort()
1064        return r
1065
1066
1067######################################################################
1068# WordNet Corpus Reader
1069######################################################################
1070
1071
1072class WordNetCorpusReader(CorpusReader):
1073    """
1074    A corpus reader used to access wordnet or its variants.
1075    """
1076
1077    _ENCODING = 'utf8'
1078
1079    # { Part-of-speech constants
1080    ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
1081    # }
1082
1083    # { Filename constants
1084    _FILEMAP = {ADJ: 'adj', ADV: 'adv', NOUN: 'noun', VERB: 'verb'}
1085    # }
1086
1087    # { Part of speech constants
1088    _pos_numbers = {NOUN: 1, VERB: 2, ADJ: 3, ADV: 4, ADJ_SAT: 5}
1089    _pos_names = dict(tup[::-1] for tup in _pos_numbers.items())
1090    # }
1091
1092    #: A list of file identifiers for all the fileids used by this
1093    #: corpus reader.
1094    _FILES = (
1095        'cntlist.rev',
1096        'lexnames',
1097        'index.sense',
1098        'index.adj',
1099        'index.adv',
1100        'index.noun',
1101        'index.verb',
1102        'data.adj',
1103        'data.adv',
1104        'data.noun',
1105        'data.verb',
1106        'adj.exc',
1107        'adv.exc',
1108        'noun.exc',
1109        'verb.exc',
1110    )
1111
1112    def __init__(self, root, omw_reader):
1113        """
1114        Construct a new wordnet corpus reader, with the given root
1115        directory.
1116        """
1117        super(WordNetCorpusReader, self).__init__(
1118            root, self._FILES, encoding=self._ENCODING
1119        )
1120
1121        # A index that provides the file offset
1122        # Map from lemma -> pos -> synset_index -> offset
1123        self._lemma_pos_offset_map = defaultdict(dict)
1124
1125        # A cache so we don't have to reconstuct synsets
1126        # Map from pos -> offset -> synset
1127        self._synset_offset_cache = defaultdict(dict)
1128
1129        # A lookup for the maximum depth of each part of speech.  Useful for
1130        # the lch similarity metric.
1131        self._max_depth = defaultdict(dict)
1132
1133        # Corpus reader containing omw data.
1134        self._omw_reader = omw_reader
1135
1136        # A cache to store the wordnet data of multiple languages
1137        self._lang_data = defaultdict(list)
1138
1139        self._data_file_map = {}
1140        self._exception_map = {}
1141        self._lexnames = []
1142        self._key_count_file = None
1143        self._key_synset_file = None
1144
1145        # Load the lexnames
1146        for i, line in enumerate(self.open('lexnames')):
1147            index, lexname, _ = line.split()
1148            assert int(index) == i
1149            self._lexnames.append(lexname)
1150
1151        # Load the indices for lemmas and synset offsets
1152        self._load_lemma_pos_offset_map()
1153
1154        # load the exception file data into memory
1155        self._load_exception_map()
1156
1157    # Open Multilingual WordNet functions, contributed by
1158    # Nasruddin A’aidil Shari, Sim Wei Ying Geraldine, and Soe Lynn
1159
1160    def of2ss(self, of):
1161        ''' take an id and return the synsets '''
1162        return self.synset_from_pos_and_offset(of[-1], int(of[:8]))
1163
1164    def ss2of(self, ss, lang=None):
1165        ''' return the ID of the synset '''
1166        pos = ss.pos()
1167        # Only these 3 WordNets retain the satellite pos tag
1168        if lang not in ["nld", "lit", "slk"] and pos == 's':
1169            pos = 'a'
1170        return "{:08d}-{}".format(ss.offset(), pos)
1171
1172    def _load_lang_data(self, lang):
1173        ''' load the wordnet data of the requested language from the file to
1174        the cache, _lang_data '''
1175
1176        if lang in self._lang_data.keys():
1177            return
1178
1179        if lang not in self.langs():
1180            raise WordNetError("Language is not supported.")
1181
1182        f = self._omw_reader.open('{0:}/wn-data-{0:}.tab'.format(lang))
1183        self.custom_lemmas(f, lang)
1184        f.close()
1185
1186    def langs(self):
1187        ''' return a list of languages supported by Multilingual Wordnet '''
1188        import os
1189
1190        langs = ['eng']
1191        fileids = self._omw_reader.fileids()
1192        for fileid in fileids:
1193            file_name, file_extension = os.path.splitext(fileid)
1194            if file_extension == '.tab':
1195                langs.append(file_name.split('-')[-1])
1196
1197        return langs
1198
1199    def _load_lemma_pos_offset_map(self):
1200        for suffix in self._FILEMAP.values():
1201
1202            # parse each line of the file (ignoring comment lines)
1203            for i, line in enumerate(self.open('index.%s' % suffix)):
1204                if line.startswith(' '):
1205                    continue
1206
1207                _iter = iter(line.split())
1208
1209                def _next_token():
1210                    return next(_iter)
1211
1212                try:
1213
1214                    # get the lemma and part-of-speech
1215                    lemma = _next_token()
1216                    pos = _next_token()
1217
1218                    # get the number of synsets for this lemma
1219                    n_synsets = int(_next_token())
1220                    assert n_synsets > 0
1221
1222                    # get and ignore the pointer symbols for all synsets of
1223                    # this lemma
1224                    n_pointers = int(_next_token())
1225                    [_next_token() for _ in range(n_pointers)]
1226
1227                    # same as number of synsets
1228                    n_senses = int(_next_token())
1229                    assert n_synsets == n_senses
1230
1231                    # get and ignore number of senses ranked according to
1232                    # frequency
1233                    _next_token()
1234
1235                    # get synset offsets
1236                    synset_offsets = [int(_next_token()) for _ in range(n_synsets)]
1237
1238                # raise more informative error with file name and line number
1239                except (AssertionError, ValueError) as e:
1240                    tup = ('index.%s' % suffix), (i + 1), e
1241                    raise WordNetError('file %s, line %i: %s' % tup)
1242
1243                # map lemmas and parts of speech to synsets
1244                self._lemma_pos_offset_map[lemma][pos] = synset_offsets
1245                if pos == ADJ:
1246                    self._lemma_pos_offset_map[lemma][ADJ_SAT] = synset_offsets
1247
1248    def _load_exception_map(self):
1249        # load the exception file data into memory
1250        for pos, suffix in self._FILEMAP.items():
1251            self._exception_map[pos] = {}
1252            for line in self.open('%s.exc' % suffix):
1253                terms = line.split()
1254                self._exception_map[pos][terms[0]] = terms[1:]
1255        self._exception_map[ADJ_SAT] = self._exception_map[ADJ]
1256
1257    def _compute_max_depth(self, pos, simulate_root):
1258        """
1259        Compute the max depth for the given part of speech.  This is
1260        used by the lch similarity metric.
1261        """
1262        depth = 0
1263        for ii in self.all_synsets(pos):
1264            try:
1265                depth = max(depth, ii.max_depth())
1266            except RuntimeError:
1267                print(ii)
1268        if simulate_root:
1269            depth += 1
1270        self._max_depth[pos] = depth
1271
1272    def get_version(self):
1273        fh = self._data_file(ADJ)
1274        for line in fh:
1275            match = re.search(r'WordNet (\d+\.\d+) Copyright', line)
1276            if match is not None:
1277                version = match.group(1)
1278                fh.seek(0)
1279                return version
1280
1281    #############################################################
1282    # Loading Lemmas
1283    #############################################################
1284
1285    def lemma(self, name, lang='eng'):
1286        '''Return lemma object that matches the name'''
1287        # cannot simply split on first '.',
1288        # e.g.: '.45_caliber.a.01..45_caliber'
1289        separator = SENSENUM_RE.search(name).end()
1290
1291        synset_name, lemma_name = name[: separator - 1], name[separator:]
1292
1293        synset = self.synset(synset_name)
1294        for lemma in synset.lemmas(lang):
1295            if lemma._name == lemma_name:
1296                return lemma
1297        raise WordNetError('no lemma %r in %r' % (lemma_name, synset_name))
1298
1299    def lemma_from_key(self, key):
1300        # Keys are case sensitive and always lower-case
1301        key = key.lower()
1302
1303        lemma_name, lex_sense = key.split('%')
1304        pos_number, lexname_index, lex_id, _, _ = lex_sense.split(':')
1305        pos = self._pos_names[int(pos_number)]
1306
1307        # open the key -> synset file if necessary
1308        if self._key_synset_file is None:
1309            self._key_synset_file = self.open('index.sense')
1310
1311        # Find the synset for the lemma.
1312        synset_line = _binary_search_file(self._key_synset_file, key)
1313        if not synset_line:
1314            raise WordNetError("No synset found for key %r" % key)
1315        offset = int(synset_line.split()[1])
1316        synset = self.synset_from_pos_and_offset(pos, offset)
1317
1318        # return the corresponding lemma
1319        for lemma in synset._lemmas:
1320            if lemma._key == key:
1321                return lemma
1322        raise WordNetError("No lemma found for for key %r" % key)
1323
1324    #############################################################
1325    # Loading Synsets
1326    #############################################################
1327    def synset(self, name):
1328        # split name into lemma, part of speech and synset number
1329        lemma, pos, synset_index_str = name.lower().rsplit('.', 2)
1330        synset_index = int(synset_index_str) - 1
1331
1332        # get the offset for this synset
1333        try:
1334            offset = self._lemma_pos_offset_map[lemma][pos][synset_index]
1335        except KeyError:
1336            message = 'no lemma %r with part of speech %r'
1337            raise WordNetError(message % (lemma, pos))
1338        except IndexError:
1339            n_senses = len(self._lemma_pos_offset_map[lemma][pos])
1340            message = "lemma %r with part of speech %r has only %i %s"
1341            if n_senses == 1:
1342                tup = lemma, pos, n_senses, "sense"
1343            else:
1344                tup = lemma, pos, n_senses, "senses"
1345            raise WordNetError(message % tup)
1346
1347        # load synset information from the appropriate file
1348        synset = self.synset_from_pos_and_offset(pos, offset)
1349
1350        # some basic sanity checks on loaded attributes
1351        if pos == 's' and synset._pos == 'a':
1352            message = (
1353                'adjective satellite requested but only plain '
1354                'adjective found for lemma %r'
1355            )
1356            raise WordNetError(message % lemma)
1357        assert synset._pos == pos or (pos == 'a' and synset._pos == 's')
1358
1359        # Return the synset object.
1360        return synset
1361
1362    def _data_file(self, pos):
1363        """
1364        Return an open file pointer for the data file for the given
1365        part of speech.
1366        """
1367        if pos == ADJ_SAT:
1368            pos = ADJ
1369        if self._data_file_map.get(pos) is None:
1370            fileid = 'data.%s' % self._FILEMAP[pos]
1371            self._data_file_map[pos] = self.open(fileid)
1372        return self._data_file_map[pos]
1373
1374    def synset_from_pos_and_offset(self, pos, offset):
1375        # Check to see if the synset is in the cache
1376        if offset in self._synset_offset_cache[pos]:
1377            return self._synset_offset_cache[pos][offset]
1378
1379        data_file = self._data_file(pos)
1380        data_file.seek(offset)
1381        data_file_line = data_file.readline()
1382        synset = self._synset_from_pos_and_line(pos, data_file_line)
1383        assert synset._offset == offset
1384        self._synset_offset_cache[pos][offset] = synset
1385        return synset
1386
1387    @deprecated('Use public method synset_from_pos_and_offset() instead')
1388    def _synset_from_pos_and_offset(self, *args, **kwargs):
1389        """
1390        Hack to help people like the readers of
1391        http://stackoverflow.com/a/27145655/1709587
1392        who were using this function before it was officially a public method
1393        """
1394        return self.synset_from_pos_and_offset(*args, **kwargs)
1395
1396    def _synset_from_pos_and_line(self, pos, data_file_line):
1397        # Construct a new (empty) synset.
1398        synset = Synset(self)
1399
1400        # parse the entry for this synset
1401        try:
1402
1403            # parse out the definitions and examples from the gloss
1404            columns_str, gloss = data_file_line.split('|')
1405            gloss = gloss.strip()
1406            definitions = []
1407            for gloss_part in gloss.split(';'):
1408                gloss_part = gloss_part.strip()
1409                if gloss_part.startswith('"'):
1410                    synset._examples.append(gloss_part.strip('"'))
1411                else:
1412                    definitions.append(gloss_part)
1413            synset._definition = '; '.join(definitions)
1414
1415            # split the other info into fields
1416            _iter = iter(columns_str.split())
1417
1418            def _next_token():
1419                return next(_iter)
1420
1421            # get the offset
1422            synset._offset = int(_next_token())
1423
1424            # determine the lexicographer file name
1425            lexname_index = int(_next_token())
1426            synset._lexname = self._lexnames[lexname_index]
1427
1428            # get the part of speech
1429            synset._pos = _next_token()
1430
1431            # create Lemma objects for each lemma
1432            n_lemmas = int(_next_token(), 16)
1433            for _ in range(n_lemmas):
1434                # get the lemma name
1435                lemma_name = _next_token()
1436                # get the lex_id (used for sense_keys)
1437                lex_id = int(_next_token(), 16)
1438                # If the lemma has a syntactic marker, extract it.
1439                m = re.match(r'(.*?)(\(.*\))?$', lemma_name)
1440                lemma_name, syn_mark = m.groups()
1441                # create the lemma object
1442                lemma = Lemma(self, synset, lemma_name, lexname_index, lex_id, syn_mark)
1443                synset._lemmas.append(lemma)
1444                synset._lemma_names.append(lemma._name)
1445
1446            # collect the pointer tuples
1447            n_pointers = int(_next_token())
1448            for _ in range(n_pointers):
1449                symbol = _next_token()
1450                offset = int(_next_token())
1451                pos = _next_token()
1452                lemma_ids_str = _next_token()
1453                if lemma_ids_str == '0000':
1454                    synset._pointers[symbol].add((pos, offset))
1455                else:
1456                    source_index = int(lemma_ids_str[:2], 16) - 1
1457                    target_index = int(lemma_ids_str[2:], 16) - 1
1458                    source_lemma_name = synset._lemmas[source_index]._name
1459                    lemma_pointers = synset._lemma_pointers
1460                    tups = lemma_pointers[source_lemma_name, symbol]
1461                    tups.append((pos, offset, target_index))
1462
1463            # read the verb frames
1464            try:
1465                frame_count = int(_next_token())
1466            except StopIteration:
1467                pass
1468            else:
1469                for _ in range(frame_count):
1470                    # read the plus sign
1471                    plus = _next_token()
1472                    assert plus == '+'
1473                    # read the frame and lemma number
1474                    frame_number = int(_next_token())
1475                    frame_string_fmt = VERB_FRAME_STRINGS[frame_number]
1476                    lemma_number = int(_next_token(), 16)
1477                    # lemma number of 00 means all words in the synset
1478                    if lemma_number == 0:
1479                        synset._frame_ids.append(frame_number)
1480                        for lemma in synset._lemmas:
1481                            lemma._frame_ids.append(frame_number)
1482                            lemma._frame_strings.append(frame_string_fmt % lemma._name)
1483                    # only a specific word in the synset
1484                    else:
1485                        lemma = synset._lemmas[lemma_number - 1]
1486                        lemma._frame_ids.append(frame_number)
1487                        lemma._frame_strings.append(frame_string_fmt % lemma._name)
1488
1489        # raise a more informative error with line text
1490        except ValueError as e:
1491            raise WordNetError('line %r: %s' % (data_file_line, e))
1492
1493        # set sense keys for Lemma objects - note that this has to be
1494        # done afterwards so that the relations are available
1495        for lemma in synset._lemmas:
1496            if synset._pos == ADJ_SAT:
1497                head_lemma = synset.similar_tos()[0]._lemmas[0]
1498                head_name = head_lemma._name
1499                head_id = '%02d' % head_lemma._lex_id
1500            else:
1501                head_name = head_id = ''
1502            tup = (
1503                lemma._name,
1504                WordNetCorpusReader._pos_numbers[synset._pos],
1505                lemma._lexname_index,
1506                lemma._lex_id,
1507                head_name,
1508                head_id,
1509            )
1510            lemma._key = ('%s%%%d:%02d:%02d:%s:%s' % tup).lower()
1511
1512        # the canonical name is based on the first lemma
1513        lemma_name = synset._lemmas[0]._name.lower()
1514        offsets = self._lemma_pos_offset_map[lemma_name][synset._pos]
1515        sense_index = offsets.index(synset._offset)
1516        tup = lemma_name, synset._pos, sense_index + 1
1517        synset._name = '%s.%s.%02i' % tup
1518
1519        return synset
1520
1521    def synset_from_sense_key(self, sense_key):
1522        """
1523        Retrieves synset based on a given sense_key. Sense keys can be
1524        obtained from lemma.key()
1525
1526        From https://wordnet.princeton.edu/wordnet/man/senseidx.5WN.html:
1527        A sense_key is represented as:
1528            lemma % lex_sense (e.g. 'dog%1:18:01::')
1529        where lex_sense is encoded as:
1530            ss_type:lex_filenum:lex_id:head_word:head_id
1531
1532        lemma:       ASCII text of word/collocation, in lower case
1533        ss_type:     synset type for the sense (1 digit int)
1534                     The synset type is encoded as follows:
1535                     1    NOUN
1536                     2    VERB
1537                     3    ADJECTIVE
1538                     4    ADVERB
1539                     5    ADJECTIVE SATELLITE
1540        lex_filenum: name of lexicographer file containing the synset for the sense (2 digit int)
1541        lex_id:      when paired with lemma, uniquely identifies a sense in the lexicographer file (2 digit int)
1542        head_word:   lemma of the first word in satellite's head synset
1543                     Only used if sense is in an adjective satellite synset
1544        head_id:     uniquely identifies sense in a lexicographer file when paired with head_word
1545                     Only used if head_word is present (2 digit int)
1546        """
1547        sense_key_regex = re.compile(r"(.*)\%(.*):(.*):(.*):(.*):(.*)")
1548        synset_types = {1: NOUN, 2: VERB, 3: ADJ, 4: ADV, 5: ADJ_SAT}
1549        lemma, ss_type, _, lex_id, _, _ = sense_key_regex.match(sense_key).groups()
1550
1551        # check that information extracted from sense_key is valid
1552        error = None
1553        if not lemma:
1554            error = "lemma"
1555        elif int(ss_type) not in synset_types:
1556            error = "ss_type"
1557        elif int(lex_id) < 0 or int(lex_id) > 99:
1558            error = "lex_id"
1559        if error:
1560            raise WordNetError(
1561                "valid {} could not be extracted from the sense key".format(error)
1562            )
1563
1564        synset_id = '.'.join([lemma, synset_types[int(ss_type)], lex_id])
1565        return self.synset(synset_id)
1566
1567    #############################################################
1568    # Retrieve synsets and lemmas.
1569    #############################################################
1570
1571    def synsets(self, lemma, pos=None, lang='eng', check_exceptions=True):
1572        """Load all synsets with a given lemma and part of speech tag.
1573        If no pos is specified, all synsets for all parts of speech
1574        will be loaded.
1575        If lang is specified, all the synsets associated with the lemma name
1576        of that language will be returned.
1577        """
1578        lemma = lemma.lower()
1579
1580        if lang == 'eng':
1581            get_synset = self.synset_from_pos_and_offset
1582            index = self._lemma_pos_offset_map
1583            if pos is None:
1584                pos = POS_LIST
1585            return [
1586                get_synset(p, offset)
1587                for p in pos
1588                for form in self._morphy(lemma, p, check_exceptions)
1589                for offset in index[form].get(p, [])
1590            ]
1591
1592        else:
1593            self._load_lang_data(lang)
1594            synset_list = []
1595            if lemma in self._lang_data[lang][1]:
1596                for l in self._lang_data[lang][1][lemma]:
1597                    if pos is not None and l[-1] != pos:
1598                        continue
1599                    synset_list.append(self.of2ss(l))
1600            return synset_list
1601
1602    def lemmas(self, lemma, pos=None, lang='eng'):
1603        """Return all Lemma objects with a name matching the specified lemma
1604        name and part of speech tag. Matches any part of speech tag if none is
1605        specified."""
1606
1607        lemma = lemma.lower()
1608        if lang == 'eng':
1609            return [
1610                lemma_obj
1611                for synset in self.synsets(lemma, pos)
1612                for lemma_obj in synset.lemmas()
1613                if lemma_obj.name().lower() == lemma
1614            ]
1615
1616        else:
1617            self._load_lang_data(lang)
1618            lemmas = []
1619            syn = self.synsets(lemma, lang=lang)
1620            for s in syn:
1621                if pos is not None and s.pos() != pos:
1622                    continue
1623                for lemma_obj in s.lemmas(lang=lang):
1624                    if lemma_obj.name().lower() == lemma:
1625                        lemmas.append(lemma_obj)
1626            return lemmas
1627
1628    def all_lemma_names(self, pos=None, lang='eng'):
1629        """Return all lemma names for all synsets for the given
1630        part of speech tag and language or languages. If pos is
1631        not specified, all synsets for all parts of speech will
1632        be used."""
1633
1634        if lang == 'eng':
1635            if pos is None:
1636                return iter(self._lemma_pos_offset_map)
1637            else:
1638                return (
1639                    lemma
1640                    for lemma in self._lemma_pos_offset_map
1641                    if pos in self._lemma_pos_offset_map[lemma]
1642                )
1643        else:
1644            self._load_lang_data(lang)
1645            lemma = []
1646            for i in self._lang_data[lang][0]:
1647                if pos is not None and i[-1] != pos:
1648                    continue
1649                lemma.extend(self._lang_data[lang][0][i])
1650
1651            lemma = list(set(lemma))
1652            return lemma
1653
1654    def all_synsets(self, pos=None):
1655        """Iterate over all synsets with a given part of speech tag.
1656        If no pos is specified, all synsets for all parts of speech
1657        will be loaded.
1658        """
1659        if pos is None:
1660            pos_tags = self._FILEMAP.keys()
1661        else:
1662            pos_tags = [pos]
1663
1664        cache = self._synset_offset_cache
1665        from_pos_and_line = self._synset_from_pos_and_line
1666
1667        # generate all synsets for each part of speech
1668        for pos_tag in pos_tags:
1669            # Open the file for reading.  Note that we can not re-use
1670            # the file poitners from self._data_file_map here, because
1671            # we're defining an iterator, and those file pointers might
1672            # be moved while we're not looking.
1673            if pos_tag == ADJ_SAT:
1674                pos_tag = ADJ
1675            fileid = 'data.%s' % self._FILEMAP[pos_tag]
1676            data_file = self.open(fileid)
1677
1678            try:
1679                # generate synsets for each line in the POS file
1680                offset = data_file.tell()
1681                line = data_file.readline()
1682                while line:
1683                    if not line[0].isspace():
1684                        if offset in cache[pos_tag]:
1685                            # See if the synset is cached
1686                            synset = cache[pos_tag][offset]
1687                        else:
1688                            # Otherwise, parse the line
1689                            synset = from_pos_and_line(pos_tag, line)
1690                            cache[pos_tag][offset] = synset
1691
1692                        # adjective satellites are in the same file as
1693                        # adjectives so only yield the synset if it's actually
1694                        # a satellite
1695                        if synset._pos == ADJ_SAT:
1696                            yield synset
1697
1698                        # for all other POS tags, yield all synsets (this means
1699                        # that adjectives also include adjective satellites)
1700                        else:
1701                            yield synset
1702                    offset = data_file.tell()
1703                    line = data_file.readline()
1704
1705            # close the extra file handle we opened
1706            except:
1707                data_file.close()
1708                raise
1709            else:
1710                data_file.close()
1711
1712    def words(self, lang='eng'):
1713        """return lemmas of the given language as list of words"""
1714        return self.all_lemma_names(lang=lang)
1715
1716    def license(self, lang='eng'):
1717        """Return the contents of LICENSE (for omw)
1718           use lang=lang to get the license for an individual language"""
1719        if lang == 'eng':
1720            return self.open("LICENSE").read()
1721        elif lang in self.langs():
1722            return self._omw_reader.open("{}/LICENSE".format(lang)).read()
1723        elif lang == 'omw':
1724            # under the assumption you don't mean Omwunra-Toqura
1725            return self._omw_reader.open("LICENSE").read()
1726        elif lang in self._lang_data:
1727            raise WordNetError("Cannot determine license for user-provided tab file")
1728        else:
1729            raise WordNetError("Language is not supported.")
1730
1731    def readme(self, lang='omw'):
1732        """Return the contents of README (for omw)
1733           use lang=lang to get the readme for an individual language"""
1734        if lang == 'eng':
1735            return self.open("README").read()
1736        elif lang in self.langs():
1737            return self._omw_reader.open("{}/README".format(lang)).read()
1738        elif lang == 'omw':
1739            # under the assumption you don't mean Omwunra-Toqura
1740            return self._omw_reader.open("README").read()
1741        elif lang in self._lang_data:
1742            raise WordNetError("No README for user-provided tab file")
1743        else:
1744            raise WordNetError("Language is not supported.")
1745
1746    def citation(self, lang='omw'):
1747        """Return the contents of citation.bib file (for omw)
1748           use lang=lang to get the citation for an individual language"""
1749        if lang == 'eng':
1750            return self.open("citation.bib").read()
1751        elif lang in self.langs():
1752            return self._omw_reader.open("{}/citation.bib".format(lang)).read()
1753        elif lang == 'omw':
1754            # under the assumption you don't mean Omwunra-Toqura
1755            return self._omw_reader.open("citation.bib").read()
1756        elif lang in self._lang_data:
1757            raise WordNetError("citation not known for user-provided tab file")
1758        else:
1759            raise WordNetError("Language is not supported.")
1760
1761    #############################################################
1762    # Misc
1763    #############################################################
1764    def lemma_count(self, lemma):
1765        """Return the frequency count for this Lemma"""
1766        # Currently, count is only work for English
1767        if lemma._lang != 'eng':
1768            return 0
1769        # open the count file if we haven't already
1770        if self._key_count_file is None:
1771            self._key_count_file = self.open('cntlist.rev')
1772        # find the key in the counts file and return the count
1773        line = _binary_search_file(self._key_count_file, lemma._key)
1774        if line:
1775            return int(line.rsplit(' ', 1)[-1])
1776        else:
1777            return 0
1778
1779    def path_similarity(self, synset1, synset2, verbose=False, simulate_root=True):
1780        return synset1.path_similarity(synset2, verbose, simulate_root)
1781
1782    path_similarity.__doc__ = Synset.path_similarity.__doc__
1783
1784    def lch_similarity(self, synset1, synset2, verbose=False, simulate_root=True):
1785        return synset1.lch_similarity(synset2, verbose, simulate_root)
1786
1787    lch_similarity.__doc__ = Synset.lch_similarity.__doc__
1788
1789    def wup_similarity(self, synset1, synset2, verbose=False, simulate_root=True):
1790        return synset1.wup_similarity(synset2, verbose, simulate_root)
1791
1792    wup_similarity.__doc__ = Synset.wup_similarity.__doc__
1793
1794    def res_similarity(self, synset1, synset2, ic, verbose=False):
1795        return synset1.res_similarity(synset2, ic, verbose)
1796
1797    res_similarity.__doc__ = Synset.res_similarity.__doc__
1798
1799    def jcn_similarity(self, synset1, synset2, ic, verbose=False):
1800        return synset1.jcn_similarity(synset2, ic, verbose)
1801
1802    jcn_similarity.__doc__ = Synset.jcn_similarity.__doc__
1803
1804    def lin_similarity(self, synset1, synset2, ic, verbose=False):
1805        return synset1.lin_similarity(synset2, ic, verbose)
1806
1807    lin_similarity.__doc__ = Synset.lin_similarity.__doc__
1808
1809    #############################################################
1810    # Morphy
1811    #############################################################
1812    # Morphy, adapted from Oliver Steele's pywordnet
1813    def morphy(self, form, pos=None, check_exceptions=True):
1814        """
1815        Find a possible base form for the given form, with the given
1816        part of speech, by checking WordNet's list of exceptional
1817        forms, and by recursively stripping affixes for this part of
1818        speech until a form in WordNet is found.
1819
1820        >>> from nltk.corpus import wordnet as wn
1821        >>> print(wn.morphy('dogs'))
1822        dog
1823        >>> print(wn.morphy('churches'))
1824        church
1825        >>> print(wn.morphy('aardwolves'))
1826        aardwolf
1827        >>> print(wn.morphy('abaci'))
1828        abacus
1829        >>> wn.morphy('hardrock', wn.ADV)
1830        >>> print(wn.morphy('book', wn.NOUN))
1831        book
1832        >>> wn.morphy('book', wn.ADJ)
1833        """
1834
1835        if pos is None:
1836            morphy = self._morphy
1837            analyses = chain(a for p in POS_LIST for a in morphy(form, p))
1838        else:
1839            analyses = self._morphy(form, pos, check_exceptions)
1840
1841        # get the first one we find
1842        first = list(islice(analyses, 1))
1843        if len(first) == 1:
1844            return first[0]
1845        else:
1846            return None
1847
1848    MORPHOLOGICAL_SUBSTITUTIONS = {
1849        NOUN: [
1850            ('s', ''),
1851            ('ses', 's'),
1852            ('ves', 'f'),
1853            ('xes', 'x'),
1854            ('zes', 'z'),
1855            ('ches', 'ch'),
1856            ('shes', 'sh'),
1857            ('men', 'man'),
1858            ('ies', 'y'),
1859        ],
1860        VERB: [
1861            ('s', ''),
1862            ('ies', 'y'),
1863            ('es', 'e'),
1864            ('es', ''),
1865            ('ed', 'e'),
1866            ('ed', ''),
1867            ('ing', 'e'),
1868            ('ing', ''),
1869        ],
1870        ADJ: [('er', ''), ('est', ''), ('er', 'e'), ('est', 'e')],
1871        ADV: [],
1872    }
1873
1874    MORPHOLOGICAL_SUBSTITUTIONS[ADJ_SAT] = MORPHOLOGICAL_SUBSTITUTIONS[ADJ]
1875
1876    def _morphy(self, form, pos, check_exceptions=True):
1877        # from jordanbg:
1878        # Given an original string x
1879        # 1. Apply rules once to the input to get y1, y2, y3, etc.
1880        # 2. Return all that are in the database
1881        # 3. If there are no matches, keep applying rules until you either
1882        #    find a match or you can't go any further
1883
1884        exceptions = self._exception_map[pos]
1885        substitutions = self.MORPHOLOGICAL_SUBSTITUTIONS[pos]
1886
1887        def apply_rules(forms):
1888            return [
1889                form[: -len(old)] + new
1890                for form in forms
1891                for old, new in substitutions
1892                if form.endswith(old)
1893            ]
1894
1895        def filter_forms(forms):
1896            result = []
1897            seen = set()
1898            for form in forms:
1899                if form in self._lemma_pos_offset_map:
1900                    if pos in self._lemma_pos_offset_map[form]:
1901                        if form not in seen:
1902                            result.append(form)
1903                            seen.add(form)
1904            return result
1905
1906        # 0. Check the exception lists
1907        if check_exceptions:
1908            if form in exceptions:
1909                return filter_forms([form] + exceptions[form])
1910
1911        # 1. Apply rules once to the input to get y1, y2, y3, etc.
1912        forms = apply_rules([form])
1913
1914        # 2. Return all that are in the database (and check the original too)
1915        results = filter_forms([form] + forms)
1916        if results:
1917            return results
1918
1919        # 3. If there are no matches, keep applying rules until we find a match
1920        while forms:
1921            forms = apply_rules(forms)
1922            results = filter_forms(forms)
1923            if results:
1924                return results
1925
1926        # Return an empty list if we can't find anything
1927        return []
1928
1929    #############################################################
1930    # Create information content from corpus
1931    #############################################################
1932    def ic(self, corpus, weight_senses_equally=False, smoothing=1.0):
1933        """
1934        Creates an information content lookup dictionary from a corpus.
1935
1936        :type corpus: CorpusReader
1937        :param corpus: The corpus from which we create an information
1938        content dictionary.
1939        :type weight_senses_equally: bool
1940        :param weight_senses_equally: If this is True, gives all
1941        possible senses equal weight rather than dividing by the
1942        number of possible senses.  (If a word has 3 synses, each
1943        sense gets 0.3333 per appearance when this is False, 1.0 when
1944        it is true.)
1945        :param smoothing: How much do we smooth synset counts (default is 1.0)
1946        :type smoothing: float
1947        :return: An information content dictionary
1948        """
1949        counts = FreqDist()
1950        for ww in corpus.words():
1951            counts[ww] += 1
1952
1953        ic = {}
1954        for pp in POS_LIST:
1955            ic[pp] = defaultdict(float)
1956
1957        # Initialize the counts with the smoothing value
1958        if smoothing > 0.0:
1959            for ss in self.all_synsets():
1960                pos = ss._pos
1961                if pos == ADJ_SAT:
1962                    pos = ADJ
1963                ic[pos][ss._offset] = smoothing
1964
1965        for ww in counts:
1966            possible_synsets = self.synsets(ww)
1967            if len(possible_synsets) == 0:
1968                continue
1969
1970            # Distribute weight among possible synsets
1971            weight = float(counts[ww])
1972            if not weight_senses_equally:
1973                weight /= float(len(possible_synsets))
1974
1975            for ss in possible_synsets:
1976                pos = ss._pos
1977                if pos == ADJ_SAT:
1978                    pos = ADJ
1979                for level in ss._iter_hypernym_lists():
1980                    for hh in level:
1981                        ic[pos][hh._offset] += weight
1982                # Add the weight to the root
1983                ic[pos][0] += weight
1984        return ic
1985
1986    def custom_lemmas(self, tab_file, lang):
1987        """
1988        Reads a custom tab file containing mappings of lemmas in the given
1989        language to Princeton WordNet 3.0 synset offsets, allowing NLTK's
1990        WordNet functions to then be used with that language.
1991
1992        See the "Tab files" section at http://compling.hss.ntu.edu.sg/omw/ for
1993        documentation on the Multilingual WordNet tab file format.
1994
1995        :param tab_file: Tab file as a file or file-like object
1996        :type  lang str
1997        :param lang ISO 639-3 code of the language of the tab file
1998        """
1999        if len(lang) != 3:
2000            raise ValueError('lang should be a (3 character) ISO 639-3 code')
2001        self._lang_data[lang] = [defaultdict(list), defaultdict(list)]
2002        for l in tab_file.readlines():
2003            if isinstance(l, bytes):
2004                # Support byte-stream files (e.g. as returned by Python 2's
2005                # open() function) as well as text-stream ones
2006                l = l.decode('utf-8')
2007            l = l.replace('\n', '')
2008            l = l.replace(' ', '_')
2009            if l[0] != '#':
2010                word = l.split('\t')
2011                self._lang_data[lang][0][word[0]].append(word[2])
2012                self._lang_data[lang][1][word[2].lower()].append(word[0])
2013        # Make sure no more entries are accidentally added subsequently
2014        self._lang_data[lang][0].default_factory = None
2015        self._lang_data[lang][1].default_factory = None
2016
2017
2018######################################################################
2019# WordNet Information Content Corpus Reader
2020######################################################################
2021
2022
2023class WordNetICCorpusReader(CorpusReader):
2024    """
2025    A corpus reader for the WordNet information content corpus.
2026    """
2027
2028    def __init__(self, root, fileids):
2029        CorpusReader.__init__(self, root, fileids, encoding='utf8')
2030
2031    # this load function would be more efficient if the data was pickled
2032    # Note that we can't use NLTK's frequency distributions because
2033    # synsets are overlapping (each instance of a synset also counts
2034    # as an instance of its hypernyms)
2035    def ic(self, icfile):
2036        """
2037        Load an information content file from the wordnet_ic corpus
2038        and return a dictionary.  This dictionary has just two keys,
2039        NOUN and VERB, whose values are dictionaries that map from
2040        synsets to information content values.
2041
2042        :type icfile: str
2043        :param icfile: The name of the wordnet_ic file (e.g. "ic-brown.dat")
2044        :return: An information content dictionary
2045        """
2046        ic = {}
2047        ic[NOUN] = defaultdict(float)
2048        ic[VERB] = defaultdict(float)
2049        for num, line in enumerate(self.open(icfile)):
2050            if num == 0:  # skip the header
2051                continue
2052            fields = line.split()
2053            offset = int(fields[0][:-1])
2054            value = float(fields[1])
2055            pos = _get_pos(fields[0])
2056            if len(fields) == 3 and fields[2] == "ROOT":
2057                # Store root count.
2058                ic[pos][0] += value
2059            if value != 0:
2060                ic[pos][offset] = value
2061        return ic
2062
2063
2064######################################################################
2065# Similarity metrics
2066######################################################################
2067
2068# TODO: Add in the option to manually add a new root node; this will be
2069# useful for verb similarity as there exist multiple verb taxonomies.
2070
2071# More information about the metrics is available at
2072# http://marimba.d.umn.edu/similarity/measures.html
2073
2074
2075def path_similarity(synset1, synset2, verbose=False, simulate_root=True):
2076    return synset1.path_similarity(synset2, verbose, simulate_root)
2077
2078
2079def lch_similarity(synset1, synset2, verbose=False, simulate_root=True):
2080    return synset1.lch_similarity(synset2, verbose, simulate_root)
2081
2082
2083def wup_similarity(synset1, synset2, verbose=False, simulate_root=True):
2084    return synset1.wup_similarity(synset2, verbose, simulate_root)
2085
2086
2087def res_similarity(synset1, synset2, ic, verbose=False):
2088    return synset1.res_similarity(synset2, verbose)
2089
2090
2091def jcn_similarity(synset1, synset2, ic, verbose=False):
2092    return synset1.jcn_similarity(synset2, verbose)
2093
2094
2095def lin_similarity(synset1, synset2, ic, verbose=False):
2096    return synset1.lin_similarity(synset2, verbose)
2097
2098
2099path_similarity.__doc__ = Synset.path_similarity.__doc__
2100lch_similarity.__doc__ = Synset.lch_similarity.__doc__
2101wup_similarity.__doc__ = Synset.wup_similarity.__doc__
2102res_similarity.__doc__ = Synset.res_similarity.__doc__
2103jcn_similarity.__doc__ = Synset.jcn_similarity.__doc__
2104lin_similarity.__doc__ = Synset.lin_similarity.__doc__
2105
2106
2107def _lcs_ic(synset1, synset2, ic, verbose=False):
2108    """
2109    Get the information content of the least common subsumer that has
2110    the highest information content value.  If two nodes have no
2111    explicit common subsumer, assume that they share an artificial
2112    root node that is the hypernym of all explicit roots.
2113
2114    :type synset1: Synset
2115    :param synset1: First input synset.
2116    :type synset2: Synset
2117    :param synset2: Second input synset.  Must be the same part of
2118    speech as the first synset.
2119    :type  ic: dict
2120    :param ic: an information content object (as returned by ``load_ic()``).
2121    :return: The information content of the two synsets and their most
2122    informative subsumer
2123    """
2124    if synset1._pos != synset2._pos:
2125        raise WordNetError(
2126            'Computing the least common subsumer requires '
2127            '%s and %s to have the same part of speech.' % (synset1, synset2)
2128        )
2129
2130    ic1 = information_content(synset1, ic)
2131    ic2 = information_content(synset2, ic)
2132    subsumers = synset1.common_hypernyms(synset2)
2133    if len(subsumers) == 0:
2134        subsumer_ic = 0
2135    else:
2136        subsumer_ic = max(information_content(s, ic) for s in subsumers)
2137
2138    if verbose:
2139        print("> LCS Subsumer by content:", subsumer_ic)
2140
2141    return ic1, ic2, subsumer_ic
2142
2143
2144# Utility functions
2145
2146
2147def information_content(synset, ic):
2148    try:
2149        icpos = ic[synset._pos]
2150    except KeyError:
2151        msg = 'Information content file has no entries for part-of-speech: %s'
2152        raise WordNetError(msg % synset._pos)
2153
2154    counts = icpos[synset._offset]
2155    if counts == 0:
2156        return _INF
2157    else:
2158        return -math.log(counts / icpos[0])
2159
2160
2161# get the part of speech (NOUN or VERB) from the information content record
2162# (each identifier has a 'n' or 'v' suffix)
2163
2164
2165def _get_pos(field):
2166    if field[-1] == 'n':
2167        return NOUN
2168    elif field[-1] == 'v':
2169        return VERB
2170    else:
2171        msg = (
2172            "Unidentified part of speech in WordNet Information Content file "
2173            "for field %s" % field
2174        )
2175        raise ValueError(msg)
2176
2177
2178# unload corpus after tests
2179def teardown_module(module=None):
2180    from nltk.corpus import wordnet
2181
2182    wordnet._unload()
2183