# Module wordnet.py
#
# Original author: Oliver Steele <steele@osteele.com>
# Project Page: http://sourceforge.net/projects/pywordnet
#
# Copyright (c) 1998-2004 by Oliver Steele.  Use is permitted under
# the Artistic License
# <http://www.opensource.org/licenses/artistic-license.html>

"""An OO interface to the WordNet database.

Usage
-----
>>> from wordnet import *

>>> # Retrieve words from the database
>>> N['dog']
dog(n.)
>>> V['dog']
dog(v.)
>>> ADJ['clear']
clear(adj.)
>>> ADV['clearly']
clearly(adv.)

>>> # Examine a word's senses and pointers:
>>> N['dog'].getSenses()
('dog' in {noun: dog, domestic dog, Canis familiaris}, 'dog' in {noun: frump, dog}, 'dog' in {noun: dog}, 'dog' in {noun: cad, bounder, blackguard, dog, hound, heel}, 'dog' in {noun: frank, frankfurter, hotdog, hot dog, dog, wiener, wienerwurst, weenie}, 'dog' in {noun: pawl, detent, click, dog}, 'dog' in {noun: andiron, firedog, dog, dog-iron})
>>> # Extract the first sense
>>> dog = N['dog'][0]   # aka N['dog'].getSenses()[0]
>>> dog
'dog' in {noun: dog, domestic dog, Canis familiaris}
>>> dog.getPointers()[:5]
(hypernym -> {noun: canine, canid}, member meronym -> {noun: Canis, genus Canis}, member meronym -> {noun: pack}, hyponym -> {noun: pooch, doggie, doggy, barker, bow-wow}, hyponym -> {noun: cur, mongrel, mutt})
>>> dog.getPointerTargets(MEMBER_MERONYM)
[{noun: Canis, genus Canis}, {noun: pack}]
"""

__author__  = "Oliver Steele <steele@osteele.com>"
__version__ = "2.0.1"

import string
import os
from os import environ
from types import IntType, StringType


#
# Configuration variables
#

WNHOME = environ.get('WNHOME', {
    'mac': ":",
    'dos': "C:\\wn16",
    'nt': "C:\\Program Files\\WordNet\\2.0"}
                     .get(os.name, "/usr/local/share/py-wordnet"))
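
# WNHOME can be overridden in the environment before this module is
# imported; for example (a sketch -- the path is illustrative, not a
# shipped default):
#
#   import os
#   os.environ['WNHOME'] = '/opt/WordNet-2.0'
#   import wordnet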

WNSEARCHDIR = environ.get('WNSEARCHDIR', WNHOME)

ReadableRepresentations = 1
"""If true, repr(word), repr(sense), and repr(synset) return
human-readable strings instead of strings that evaluate to an object
equal to the argument.

This breaks the contract for repr, but it makes the system much more
usable from the command line."""

_TraceLookups = 0

_FILE_OPEN_MODE = os.name in ('dos', 'nt') and 'rb' or 'r'  # work around a Windows Python bug


#
# Enumerated types
#

NOUN = 'noun'
VERB = 'verb'
ADJECTIVE = 'adjective'
ADVERB = 'adverb'
PartsOfSpeech = (NOUN, VERB, ADJECTIVE, ADVERB)

ANTONYM = 'antonym'
HYPERNYM = 'hypernym'
HYPONYM = 'hyponym'
ATTRIBUTE = 'attribute'
ALSO_SEE = 'also see'
ENTAILMENT = 'entailment'
CAUSE = 'cause'
VERB_GROUP = 'verb group'
MEMBER_MERONYM = 'member meronym'
SUBSTANCE_MERONYM = 'substance meronym'
PART_MERONYM = 'part meronym'
MEMBER_HOLONYM = 'member holonym'
SUBSTANCE_HOLONYM = 'substance holonym'
PART_HOLONYM = 'part holonym'
SIMILAR = 'similar'
PARTICIPLE_OF = 'participle of'
PERTAINYM = 'pertainym'
# New in wn 2.0:
FRAMES = 'frames'
CLASSIF_CATEGORY = 'domain category'
CLASSIF_USAGE = 'domain usage'
CLASSIF_REGIONAL = 'domain regional'
CLASS_CATEGORY = 'class category'
CLASS_USAGE = 'class usage'
CLASS_REGIONAL = 'class regional'

POINTER_TYPES = (
    ANTONYM,
    HYPERNYM,
    HYPONYM,
    ATTRIBUTE,
    ALSO_SEE,
    ENTAILMENT,
    CAUSE,
    VERB_GROUP,
    MEMBER_MERONYM,
    SUBSTANCE_MERONYM,
    PART_MERONYM,
    MEMBER_HOLONYM,
    SUBSTANCE_HOLONYM,
    PART_HOLONYM,
    SIMILAR,
    PARTICIPLE_OF,
    PERTAINYM,
    # New in wn 2.0:
    FRAMES,
    CLASSIF_CATEGORY,
    CLASSIF_USAGE,
    CLASSIF_REGIONAL,
    CLASS_CATEGORY,
    CLASS_USAGE,
    CLASS_REGIONAL,
    )

ATTRIBUTIVE = 'attributive'
PREDICATIVE = 'predicative'
IMMEDIATE_POSTNOMINAL = 'immediate postnominal'
ADJECTIVE_POSITIONS = (ATTRIBUTIVE, PREDICATIVE, IMMEDIATE_POSTNOMINAL, None)

VERB_FRAME_STRINGS = (
    None,
    "Something %s",
    "Somebody %s",
    "It is %sing",
    "Something is %sing PP",
    "Something %s something Adjective/Noun",
    "Something %s Adjective/Noun",
    "Somebody %s Adjective",
    "Somebody %s something",
    "Somebody %s somebody",
    "Something %s somebody",
    "Something %s something",
    "Something %s to somebody",
    "Somebody %s on something",
    "Somebody %s somebody something",
    "Somebody %s something to somebody",
    "Somebody %s something from somebody",
    "Somebody %s somebody with something",
    "Somebody %s somebody of something",
    "Somebody %s something on somebody",
    "Somebody %s somebody PP",
    "Somebody %s something PP",
    "Somebody %s PP",
    "Somebody's (body part) %s",
    "Somebody %s somebody to INFINITIVE",
    "Somebody %s somebody INFINITIVE",
    "Somebody %s that CLAUSE",
    "Somebody %s to somebody",
    "Somebody %s to INFINITIVE",
    "Somebody %s whether INFINITIVE",
    "Somebody %s somebody into V-ing something",
    "Somebody %s something with something",
    "Somebody %s INFINITIVE",
    "Somebody %s VERB-ing",
    "It %s that CLAUSE",
    "Something %s INFINITIVE")
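
# WordNet frame numbers are 1-based, so slot 0 is None and a frame
# number indexes the tuple directly.  Each string has a %s slot for the
# verb form; for example (a sketch; index 8 is "Somebody %s something"):
#
#   >>> VERB_FRAME_STRINGS[8] % 'decides'
#   'Somebody decides something'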


#
# Domain classes
#
class Word:
    """An index into the database.

    Each word has one or more Senses, which can be accessed via
    ``word.getSenses()`` or through the index notation, ``word[n]``.

    Fields
    ------
      form : string
          The orthographic representation of the word.
      pos : string
          The part of speech -- one of NOUN, VERB, ADJECTIVE, ADVERB.
      string : string
          Same as form (for compatibility with version 1.0).
      taggedSenseCount : integer
          The number of senses that are tagged.

    Examples
    --------
    >>> N['dog'].pos
    'noun'
    >>> N['dog'].form
    'dog'
    >>> N['dog'].taggedSenseCount
    1
    """

    def __init__(self, line):
        """Initialize the word from a line of a WN POS file."""
        tokens = string.split(line)
        ints = map(int, tokens[int(tokens[3]) + 4:])
        self.form = string.replace(tokens[0], '_', ' ')
        "Orthographic representation of the word."
        self.pos = _normalizePOS(tokens[1])
        "Part of speech.  One of NOUN, VERB, ADJECTIVE, ADVERB."
        self.taggedSenseCount = ints[1]
        "Number of senses that are tagged."
        self._synsetOffsets = ints[2:ints[0] + 2]

    def getPointers(self, pointerType=None):
        """Pointers connect senses and synsets, not words.
        Try word[0].getPointers() instead."""
        raise NotImplementedError(self.getPointers.__doc__)

    def getPointerTargets(self, pointerType=None):
        """Pointers connect senses and synsets, not words.
        Try word[0].getPointerTargets() instead."""
        raise NotImplementedError(self.getPointerTargets.__doc__)

    def getSenses(self):
        """Return a sequence of senses.

        >>> N['dog'].getSenses()
        ('dog' in {noun: dog, domestic dog, Canis familiaris}, 'dog' in {noun: frump, dog}, 'dog' in {noun: dog}, 'dog' in {noun: cad, bounder, blackguard, dog, hound, heel}, 'dog' in {noun: frank, frankfurter, hotdog, hot dog, dog, wiener, wienerwurst, weenie}, 'dog' in {noun: pawl, detent, click, dog}, 'dog' in {noun: andiron, firedog, dog, dog-iron})
        """
        if not hasattr(self, '_senses'):
            def getSense(offset, pos=self.pos, form=self.form):
                return getSynset(pos, offset)[form]
            self._senses = tuple(map(getSense, self._synsetOffsets))
            del self._synsetOffsets
        return self._senses

    # Deprecated.  Present for backwards compatibility.
    def senses(self):
        return self.getSenses()

    def isTagged(self):
        """Return 1 if any sense is tagged.

        >>> N['dog'].isTagged()
        1
        """
        return self.taggedSenseCount > 0

    def getAdjectivePositions(self):
        """Return a sequence of adjective positions that this word can
        appear in.  These are elements of ADJECTIVE_POSITIONS.

        >>> ADJ['clear'].getAdjectivePositions()
        [None, 'predicative']
        """
        positions = {}
        for sense in self.getSenses():
            positions[sense.position] = 1
        return positions.keys()

    adjectivePositions = getAdjectivePositions # backwards compatibility

    def __cmp__(self, other):
        """
        >>> N['cat'] < N['dog']
        1
        >>> N['dog'] < V['dog']
        1
        """
        return _compareInstances(self, other, ('pos', 'form'))

    def __str__(self):
        """Return a human-readable representation.

        >>> str(N['dog'])
        'dog(n.)'
        """
        abbrs = {NOUN: 'n.', VERB: 'v.', ADJECTIVE: 'adj.', ADVERB: 'adv.'}
        return self.form + "(" + abbrs[self.pos] + ")"

    def __repr__(self):
        """If ReadableRepresentations is true, return a human-readable
        representation, e.g. 'dog(n.)'.

        If ReadableRepresentations is false, return a machine-readable
        representation, e.g. "getWord('dog', 'noun')".
        """
        if ReadableRepresentations:
            return str(self)
        return "getWord" + repr((self.form, self.pos))

    #
    # Sequence protocol (a Word's elements are its Senses)
    #
    def __nonzero__(self):
        return 1

    def __len__(self):
        return len(self.getSenses())

    def __getitem__(self, index):
        return self.getSenses()[index]

    def __getslice__(self, i, j):
        return self.getSenses()[i:j]
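
# A Word behaves as a sequence of its Senses, so the usual iteration
# idioms apply; for example (a sketch, assuming the standard database):
#
#   for sense in N['dog']:
#       print sense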


class Synset:
    """A set of synonyms that share a common meaning.

    Each synset contains one or more Senses, which represent a
    specific sense of a specific word.  Senses can be retrieved via
    synset.getSenses() or through the index notations synset[0],
    synset[string], or synset[word].  Synsets also originate zero or
    more typed pointers, which can be accessed via
    synset.getPointers() or synset.getPointers(pointerType).  The
    targets of a synset's pointers can be retrieved via
    synset.getPointerTargets() or
    synset.getPointerTargets(pointerType), which are equivalent to
    map(Pointer.target, synset.getPointers(...)).

    Fields
    ------
      pos : string
          The part of speech -- one of NOUN, VERB, ADJECTIVE, ADVERB.
      offset : integer
          An integer offset into the part-of-speech file.  Together
          with pos, this can be used as a unique id.
      gloss : string
          A gloss for the sense.
      verbFrames : [integer]
          A sequence of integers that index into
          VERB_FRAME_STRINGS.  These list the verb frames that any
          Sense in this synset participates in.  (See also
          Sense.verbFrames.)  Defined only for verbs.

          >>> V['think'][0].synset.verbFrames
          (5, 9)
    """

    def __init__(self, pos, offset, line):
        "Initialize the synset from a line of a WN data file."
        self.pos = pos
        "part of speech -- one of NOUN, VERB, ADJECTIVE, ADVERB."
        self.offset = offset
        """integer offset into the part-of-speech file.  Together
        with pos, this can be used as a unique id."""
        tokens = string.split(line[:string.index(line, '|')])
        self.ssType = tokens[2]
        self.gloss = string.strip(line[string.index(line, '|') + 1:])
        self.lexname = Lexname.lexnames[int(tokens[1])]
        (self._senseTuples, remainder) = _partition(tokens[4:], 2, string.atoi(tokens[3], 16))
        (self._pointerTuples, remainder) = _partition(remainder[1:], 4, int(remainder[0]))
        if pos == VERB:
            (vfTuples, remainder) = _partition(remainder[1:], 3, int(remainder[0]))
            def extractVerbFrames(index, vfTuples):
                # A frame applies to this sense if its word number is 0
                # (meaning every word in the synset) or the sense's own
                # 1-based index.
                return tuple(map(lambda t: string.atoi(t[1]),
                                 filter(lambda t, i=index: string.atoi(t[2], 16) in (0, i),
                                        vfTuples)))
            senseVerbFrames = []
            for index in range(1, len(self._senseTuples) + 1):
                senseVerbFrames.append(extractVerbFrames(index, vfTuples))
            self._senseVerbFrames = senseVerbFrames
            self.verbFrames = tuple(extractVerbFrames(None, vfTuples))
            """A sequence of integers that index into
            VERB_FRAME_STRINGS.  These list the verb frames that any
            Sense in this synset participates in.  (See also
            Sense.verbFrames.)  Defined only for verbs."""

    def getSenses(self):
        """Return a sequence of Senses.

        >>> N['dog'][0].getSenses()
        ('dog' in {noun: dog, domestic dog, Canis familiaris},)
        """
        if not hasattr(self, '_senses'):
            def loadSense(senseTuple, verbFrames=None, synset=self):
                return Sense(synset, senseTuple, verbFrames)
            if self.pos == VERB:
                self._senses = tuple(map(loadSense, self._senseTuples, self._senseVerbFrames))
                del self._senseVerbFrames
            else:
                self._senses = tuple(map(loadSense, self._senseTuples))
            del self._senseTuples
        return self._senses

    senses = getSenses

    def getPointers(self, pointerType=None):
        """Return a sequence of Pointers.

        If pointerType is specified, only pointers of that type are
        returned.  In this case, pointerType should be an element of
        POINTER_TYPES.

        >>> N['dog'][0].getPointers()[:5]
        (hypernym -> {noun: canine, canid}, member meronym -> {noun: Canis, genus Canis}, member meronym -> {noun: pack}, hyponym -> {noun: pooch, doggie, doggy, barker, bow-wow}, hyponym -> {noun: cur, mongrel, mutt})
        >>> N['dog'][0].getPointers(HYPERNYM)
        (hypernym -> {noun: canine, canid},)
        """
        if not hasattr(self, '_pointers'):
            def loadPointer(tuple, synset=self):
                return Pointer(synset.offset, tuple)
            self._pointers = tuple(map(loadPointer, self._pointerTuples))
            del self._pointerTuples
        if pointerType is None:
            return self._pointers
        else:
            _requirePointerType(pointerType)
            return filter(lambda pointer, type=pointerType: pointer.type == type, self._pointers)

    pointers = getPointers # backwards compatibility

    def getPointerTargets(self, pointerType=None):
        """Return a sequence of Senses or Synsets.

        If pointerType is specified, only targets of pointers of that
        type are returned.  In this case, pointerType should be an
        element of POINTER_TYPES.

        >>> N['dog'][0].getPointerTargets()[:5]
        [{noun: canine, canid}, {noun: Canis, genus Canis}, {noun: pack}, {noun: pooch, doggie, doggy, barker, bow-wow}, {noun: cur, mongrel, mutt}]
        >>> N['dog'][0].getPointerTargets(HYPERNYM)
        [{noun: canine, canid}]
        """
        return map(Pointer.target, self.getPointers(pointerType))

    pointerTargets = getPointerTargets # backwards compatibility

    def isTagged(self):
        """Return 1 if any sense is tagged.

        >>> N['dog'][0].isTagged()
        1
        >>> N['dog'][1].isTagged()
        0
        """
        return len(filter(Sense.isTagged, self.getSenses())) > 0

    def __str__(self):
        """Return a human-readable representation.

        >>> str(N['dog'][0].synset)
        '{noun: dog, domestic dog, Canis familiaris}'
        """
        return "{" + self.pos + ": " + string.join(map(lambda sense: sense.form, self.getSenses()), ", ") + "}"

    def __repr__(self):
        """If ReadableRepresentations is true, return a human-readable
        representation, e.g. '{noun: dog, domestic dog, Canis familiaris}'.

        If ReadableRepresentations is false, return a machine-readable
        representation, e.g. "getSynset(pos, 1234)".
        """
        if ReadableRepresentations:
            return str(self)
        return "getSynset" + repr((self.pos, self.offset))

    def __cmp__(self, other):
        return _compareInstances(self, other, ('pos', 'offset'))

    #
    # Sequence protocol (a Synset's elements are its senses).
    #
    def __nonzero__(self):
        return 1

    def __len__(self):
        """
        >>> len(N['dog'][0].synset)
        3
        """
        return len(self.getSenses())

    def __getitem__(self, idx):
        """
        >>> N['dog'][0].synset[0] == N['dog'][0]
        1
        >>> N['dog'][0].synset['dog'] == N['dog'][0]
        1
        >>> N['dog'][0].synset[N['dog']] == N['dog'][0]
        1
        >>> N['cat'][6]
        'cat' in {noun: big cat, cat}
        """
        senses = self.getSenses()
        if isinstance(idx, Word):
            idx = idx.form
        if isinstance(idx, StringType):
            idx = _index(idx, map(lambda sense: sense.form, senses)) or \
                  _index(idx, map(lambda sense: sense.form, senses), _equalsIgnoreCase)
        return senses[idx]

    def __getslice__(self, i, j):
        return self.getSenses()[i:j]

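
# Hypernym chains can be walked with an ordinary loop; for example (a
# sketch that follows only the first HYPERNYM pointer at each step):
#
#   synset = N['dog'][0].synset
#   while synset.getPointers(HYPERNYM):
#       synset = synset.getPointerTargets(HYPERNYM)[0]
#       print synset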

class Sense:
    """A specific meaning of a specific word -- the intersection of a Word and a Synset.

    Fields
    ------
      form : string
          The orthographic representation of the Word this is a Sense of.
      pos : string
          The part of speech -- one of NOUN, VERB, ADJECTIVE, ADVERB.
      string : string
          The same as form (for compatibility with version 1.0).
      synset : Synset
          The Synset that this Sense is a sense of.
      verbFrames : [integer]
          A sequence of integers that index into
          VERB_FRAME_STRINGS.  These list the verb frames that this
          Sense participates in.  Defined only for verbs.

          >>> decide = V['decide'][0].synset   # first synset for 'decide'
          >>> decide[0].verbFrames
          (8, 2, 26, 29)
          >>> decide[1].verbFrames
          (8, 2)
          >>> decide[2].verbFrames
          (8, 26, 29)
    """

    def __init__(sense, synset, senseTuple, verbFrames=None):
        "Initialize a sense from a synset's senseTuple."
        # synset is stored by key (pos, offset) rather than object
        # reference, to avoid creating a circular reference between
        # Senses and Synsets that would prevent the vm from
        # garbage-collecting them.
        sense.pos = synset.pos
        "part of speech -- one of NOUN, VERB, ADJECTIVE, ADVERB"
        sense.synsetOffset = synset.offset
        "synset key.  This is used to retrieve the sense."
        sense.verbFrames = verbFrames
        """A sequence of integers that index into
        VERB_FRAME_STRINGS.  These list the verb frames that this
        Sense participates in.  Defined only for verbs."""
        (form, idString) = senseTuple
        sense.position = None
        if '(' in form:
            # Strip a parenthesized adjective-position marker, e.g.
            # 'clear(p)' marks a predicative-only adjective.
            index = string.index(form, '(')
            key = form[index + 1:-1]
            form = form[:index]
            if key == 'a':
                sense.position = ATTRIBUTIVE
            elif key == 'p':
                sense.position = PREDICATIVE
            elif key == 'ip':
                sense.position = IMMEDIATE_POSTNOMINAL
            else:
                raise ValueError("unknown attribute " + key)
        sense.form = string.replace(form, '_', ' ')
        "orthographic representation of the Word this is a Sense of."

    def __getattr__(self, name):
        # see the note at __init__ about why 'synset' is provided as a
        # 'virtual' slot
        if name == 'synset':
            return getSynset(self.pos, self.synsetOffset)
        elif name == 'lexname':
            return self.synset.lexname
        else:
            raise AttributeError, name

    def __str__(self):
        """Return a human-readable representation.

        >>> str(N['dog'][0])
        "'dog' in {noun: dog, domestic dog, Canis familiaris}"
        """
        return repr(self.form) + " in " + str(self.synset)

    def __repr__(self):
        """If ReadableRepresentations is true, return a human-readable
        representation, e.g. "'dog' in {noun: dog, domestic dog, Canis familiaris}".

        If ReadableRepresentations is false, return a machine-readable
        representation, e.g. "getSynset(pos, 1234)['dog']".
        """
        if ReadableRepresentations:
            return str(self)
        return "%s[%s]" % (repr(self.synset), repr(self.form))

    def getPointers(self, pointerType=None):
        """Return a sequence of Pointers.

        If pointerType is specified, only pointers of that type are
        returned.  In this case, pointerType should be an element of
        POINTER_TYPES.

        >>> N['dog'][0].getPointers()[:5]
        (hypernym -> {noun: canine, canid}, member meronym -> {noun: Canis, genus Canis}, member meronym -> {noun: pack}, hyponym -> {noun: pooch, doggie, doggy, barker, bow-wow}, hyponym -> {noun: cur, mongrel, mutt})
        >>> N['dog'][0].getPointers(HYPERNYM)
        (hypernym -> {noun: canine, canid},)
        """
        senseIndex = _index(self, self.synset.getSenses())
        def pointsFromThisSense(pointer, selfIndex=senseIndex):
            # A source index of 0 means the pointer applies to every
            # sense in the synset.
            return pointer.sourceIndex == 0 or pointer.sourceIndex - 1 == selfIndex
        return filter(pointsFromThisSense, self.synset.getPointers(pointerType))

    pointers = getPointers # backwards compatibility

    def getPointerTargets(self, pointerType=None):
        """Return a sequence of Senses or Synsets.

        If pointerType is specified, only targets of pointers of that
        type are returned.  In this case, pointerType should be an
        element of POINTER_TYPES.

        >>> N['dog'][0].getPointerTargets()[:5]
        [{noun: canine, canid}, {noun: Canis, genus Canis}, {noun: pack}, {noun: pooch, doggie, doggy, barker, bow-wow}, {noun: cur, mongrel, mutt}]
        >>> N['dog'][0].getPointerTargets(HYPERNYM)
        [{noun: canine, canid}]
        """
        return map(Pointer.target, self.getPointers(pointerType))

    pointerTargets = getPointerTargets # backwards compatibility

    def getSenses(self):
        return self,

    senses = getSenses # backwards compatibility

    def isTagged(self):
        """Return 1 if any sense is tagged.

        >>> N['dog'][0].isTagged()
        1
        >>> N['dog'][1].isTagged()
        0
        """
        word = self.word()
        return _index(self, word.getSenses()) < word.taggedSenseCount

    def getWord(self):
        return getWord(self.form, self.pos)

    word = getWord # backwards compatibility

    def __cmp__(self, other):
        def senseIndex(sense, synset=self.synset):
            return _index(sense, synset.getSenses(), testfn=lambda a, b: a.form == b.form)
        return _compareInstances(self, other, ('synset',)) or cmp(senseIndex(self), senseIndex(other))

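
# The 'synset' and 'lexname' attributes are materialized lazily through
# __getattr__; for example (a sketch):
#
#   dog = N['dog'][0]
#   dog.synset     # looked up on demand via (pos, synsetOffset)
#   dog.lexname    # delegates to the synset's Lexname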

class Pointer:
    """A typed directional relationship between Senses or Synsets.

    Fields
    ------
      type : string
          One of POINTER_TYPES.
      pos : string
          The part of speech -- one of NOUN, VERB, ADJECTIVE, ADVERB.
    """

    _POINTER_TYPE_TABLE = {
        '!': ANTONYM,
        '@': HYPERNYM,
        '~': HYPONYM,
        '=': ATTRIBUTE,
        '^': ALSO_SEE,
        '*': ENTAILMENT,
        '>': CAUSE,
        '$': VERB_GROUP,
        '#m': MEMBER_MERONYM,
        '#s': SUBSTANCE_MERONYM,
        '#p': PART_MERONYM,
        '%m': MEMBER_HOLONYM,
        '%s': SUBSTANCE_HOLONYM,
        '%p': PART_HOLONYM,
        '&': SIMILAR,
        '<': PARTICIPLE_OF,
        '\\': PERTAINYM,
        # New in wn 2.0:
        '+': FRAMES,
        ';c': CLASSIF_CATEGORY,
        ';u': CLASSIF_USAGE,
        ';r': CLASSIF_REGIONAL,
        '-c': CLASS_CATEGORY,
        '-u': CLASS_USAGE,
        '-r': CLASS_REGIONAL,
        }

    def __init__(self, sourceOffset, pointerTuple):
        (type, offset, pos, indices) = pointerTuple
        self.type = Pointer._POINTER_TYPE_TABLE[type]
        """One of POINTER_TYPES."""
        self.sourceOffset = sourceOffset
        self.targetOffset = int(offset)
        self.pos = _normalizePOS(pos)
        """part of speech -- one of NOUN, VERB, ADJECTIVE, ADVERB"""
        indices = string.atoi(indices, 16)
        # The high byte is the source word number and the low byte the
        # target word number; 0 designates the whole synset.
        self.sourceIndex = indices >> 8
        self.targetIndex = indices & 255

    def getSource(self):
        synset = getSynset(self.pos, self.sourceOffset)
        if self.sourceIndex:
            return synset[self.sourceIndex - 1]
        else:
            return synset

    source = getSource # backwards compatibility

    def getTarget(self):
        synset = getSynset(self.pos, self.targetOffset)
        if self.targetIndex:
            return synset[self.targetIndex - 1]
        else:
            return synset

    target = getTarget # backwards compatibility

    def __str__(self):
        return self.type + " -> " + str(self.target())

    def __repr__(self):
        if ReadableRepresentations:
            return str(self)
        return "<" + str(self) + ">"

    def __cmp__(self, other):
        diff = _compareInstances(self, other, ('pos', 'sourceOffset'))
        if diff:
            return diff
        synset = self.source()
        def pointerIndex(sense, synset=synset):
            return _index(sense, synset.getPointers(), testfn=lambda a, b: not _compareInstances(a, b, ('type', 'sourceIndex', 'targetIndex')))
        return cmp(pointerIndex(self), pointerIndex(other))

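
# Pointers are normally reached from a Sense or Synset rather than
# constructed directly; for example (a sketch):
#
#   p = N['dog'][0].getPointers(HYPERNYM)[0]
#   p.type          # 'hypernym'
#   p.getTarget()   # {noun: canine, canid}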

# Loading the lexnames
# Klaus Ries <ries@cs.cmu.edu>

class Lexname:
    dict = {}
    lexnames = []

    def __init__(self, name, category):
        self.name = name
        self.category = category
        Lexname.dict[name] = self
        Lexname.lexnames.append(self)

    def __str__(self):
        return self.name

def setupLexnames():
    for l in open(WNSEARCHDIR + '/lexnames').readlines():
        i, name, category = string.split(l)
        Lexname(name, PartsOfSpeech[int(category) - 1])

setupLexnames()
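
# The lexnames file has one whitespace-delimited entry per line: a file
# number, a lexicographer file name, and a syntactic-category number
# (1-4, indexing PartsOfSpeech).  Illustrative lines (abridged):
#
#   00  adj.all  3
#   02  adv.all  4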

#
# Dictionary
#
class Dictionary:
    """A Dictionary contains all the Words in a given part of speech.
    This module defines four dictionaries, bound to N, V, ADJ, and ADV.

    Indexing a dictionary by a string retrieves the word named by that
    string, e.g. dict['dog'].  Indexing by an integer n retrieves the
    nth word, e.g.  dict[0].  Access by an arbitrary integer is very
    slow except in the special case where the words are accessed
    sequentially; this is to support the use of dictionaries as the
    range of a for statement and as the sequence argument to map and
    filter.

    Example
    -------
    >>> N['dog']
    dog(n.)

    Fields
    ------
      pos : string
          The part of speech -- one of NOUN, VERB, ADJECTIVE, ADVERB.
    """

    def __init__(self, pos, filenameroot):
        self.pos = pos
        """part of speech -- one of NOUN, VERB, ADJECTIVE, ADVERB"""
        self.indexFile = _IndexFile(pos, filenameroot)
        self.dataFile = open(_dataFilePathname(filenameroot), _FILE_OPEN_MODE)

    def __repr__(self):
        dictionaryVariables = {N: 'N', V: 'V', ADJ: 'ADJ', ADV: 'ADV'}
        if dictionaryVariables.get(self):
            return self.__module__ + "." + dictionaryVariables[self]
        return "<%s.%s instance for %s>" % (self.__module__, "Dictionary", self.pos)

    def getWord(self, form, line=None):
        key = string.replace(string.lower(form), ' ', '_')
        pos = self.pos
        def loader(key=key, line=line, indexFile=self.indexFile):
            line = line or indexFile.get(key)
            return line and Word(line)
        word = _entityCache.get((pos, key), loader)
        if word:
            return word
        else:
            raise KeyError, "%s is not in the %s database" % (repr(form), repr(pos))

    def getSynset(self, offset):
        pos = self.pos
        def loader(pos=pos, offset=offset, dataFile=self.dataFile):
            return Synset(pos, offset, _lineAt(dataFile, offset))
        return _entityCache.get((pos, offset), loader)

    def _buildIndexCacheFile(self):
        self.indexFile._buildIndexCacheFile()

    #
    # Sequence protocol (a Dictionary's items are its Words)
    #
    def __nonzero__(self):
        """Return true.  (Defining this avoids scanning the whole index
        file to compute len when a Dictionary is used in test position.)

        >>> N and 'true'
        'true'
        """
        return 1

    def __len__(self):
        """Return the number of index entries.

        >>> len(ADJ)
        21435
        """
        if not hasattr(self, 'length'):
            self.length = len(self.indexFile)
        return self.length

    def __getslice__(self, a, b):
        results = []
        if type(a) == type('') and type(b) == type(''):
            raise NotImplementedError("slicing by strings is not implemented")
        elif type(a) == type(1) and type(b) == type(1):
            for i in range(a, b):
                results.append(self[i])
        else:
            raise TypeError
        return results

    def __getitem__(self, index):
        """If index is a String, return the Word whose form is
        index.  If index is an integer n, return the Word at the
        n'th position in the index file.

        >>> N['dog']
        dog(n.)
        >>> N[0]
        'hood(n.)
        """
        if isinstance(index, StringType):
            return self.getWord(index)
        elif isinstance(index, IntType):
            line = self.indexFile[index]
            return self.getWord(string.replace(line[:string.find(line, ' ')], '_', ' '), line)
        else:
            raise TypeError, "%s is not a String or Int" % repr(index)

    #
    # Dictionary protocol
    #
    # a Dictionary's values are its words, keyed by their form
    #

    def get(self, key, default=None):
        """Return the Word whose form is _key_, or _default_.

        >>> N.get('dog')
        dog(n.)
        >>> N.get('inu')
        """
        try:
            return self[key]
        except LookupError:
            return default

    def keys(self):
        """Return a sorted list of strings that index words in this
        dictionary."""
        return self.indexFile.keys()

    def has_key(self, form):
        """Return true iff the argument indexes a word in this dictionary.

        >>> N.has_key('dog')
        1
        >>> N.has_key('inu')
        0
        """
        return self.indexFile.has_key(form)

    #
    # Testing
    #

    def _testKeys(self):
        """Verify that index lookup can find each word in the index file."""
        print "Testing: ", self
        file = open(self.indexFile.file.name, _FILE_OPEN_MODE)
        counter = 0
        while 1:
            line = file.readline()
            if line == '': break
            if line[0] != ' ':
                key = string.replace(line[:string.find(line, ' ')], '_', ' ')
                if (counter % 1000) == 0:
                    print "%s..." % (key,),
                    import sys
                    sys.stdout.flush()
                counter = counter + 1
                self[key]
        file.close()
        print "done."

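
# A Dictionary supports enough of the mapping protocol that dict-style
# code mostly carries over; for example (a sketch; 'inu' is simply a
# key that isn't in the noun database):
#
#   if N.has_key('dog'):
#       print N['dog']
#   print N.get('inu', 'not found')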

class _IndexFile:
    """An _IndexFile is an implementation class that presents a
    Sequence and Dictionary interface to a sorted index file."""

    def __init__(self, pos, filenameroot):
        self.pos = pos
        self.file = open(_indexFilePathname(filenameroot), _FILE_OPEN_MODE)
        self.offsetLineCache = {}   # Table of (pathname, offset) -> (line, nextOffset)
        self.rewind()
        self.shelfname = os.path.join(WNSEARCHDIR, pos + ".pyidx")
        try:
            import shelve
            self.indexCache = shelve.open(self.shelfname, 'r')
        except:
            # It's fine if the shelf index hasn't been built yet; see
            # _buildIndexCacheFile below.
            pass

    def rewind(self):
        """Skip past the header (lines that begin with a space) to the
        first index line."""
        self.file.seek(0)
        while 1:
            offset = self.file.tell()
            line = self.file.readline()
            if (line[0] != ' '):
                break
        self.nextIndex = 0
        self.nextOffset = offset

    #
    # Sequence protocol (an _IndexFile's items are its lines)
    #
    def __nonzero__(self):
        return 1

    def __len__(self):
        if hasattr(self, 'indexCache'):
            return len(self.indexCache)
        self.rewind()
        lines = 0
        while 1:
            line = self.file.readline()
            if line == "":
                break
            lines = lines + 1
        return lines

    def __getitem__(self, index):
        if isinstance(index, StringType):
            if hasattr(self, 'indexCache'):
                return self.indexCache[index]
            return binarySearchFile(self.file, index, self.offsetLineCache, 8)
        elif isinstance(index, IntType):
            if hasattr(self, 'indexCache'):
                return self.get(self.keys()[index])
            if index < self.nextIndex:
                self.rewind()
            while self.nextIndex <= index:
                self.file.seek(self.nextOffset)
                line = self.file.readline()
                if line == "":
                    raise IndexError, "index out of range"
                self.nextIndex = self.nextIndex + 1
                self.nextOffset = self.file.tell()
            return line
        else:
            raise TypeError, "%s is not a String or Int" % repr(index)

    #
    # Dictionary protocol
    #
    # (an _IndexFile's values are its lines, keyed by the first word)
    #

    def get(self, key, default=None):
        try:
            return self[key]
        except LookupError:
            return default

    def keys(self):
        if hasattr(self, 'indexCache'):
            keys = self.indexCache.keys()
            keys.sort()
            return keys
        else:
            keys = []
            self.rewind()
            while 1:
                line = self.file.readline()
                if not line: break
                key = line.split(' ', 1)[0]
                keys.append(key.replace('_', ' '))
            return keys

    def has_key(self, key):
        key = key.replace(' ', '_') # test case: V['haze over']
        if hasattr(self, 'indexCache'):
            return self.indexCache.has_key(key)
        return self.get(key) is not None

    #
    # Index file
    #

    def _buildIndexCacheFile(self):
        import shelve
        import os
        print "Building %s:" % (self.shelfname,),
        tempname = self.shelfname + ".temp"
        try:
            indexCache = shelve.open(tempname)
            self.rewind()
            count = 0
            while 1:
                offset, line = self.file.tell(), self.file.readline()
                if not line: break
                key = line[:string.find(line, ' ')]
                if (count % 1000) == 0:
                    print "%s..." % (key,),
                    import sys
                    sys.stdout.flush()
                indexCache[key] = line
                count = count + 1
            indexCache.close()
            os.rename(tempname, self.shelfname)
        finally:
            try: os.remove(tempname)
            except: pass
        print "done."
        self.indexCache = shelve.open(self.shelfname, 'r')

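
# The shelf-backed index is optional: keyed lookups fall back to a
# binary search of the plain index file when no .pyidx shelf exists.
# Building the shelves once speeds up later runs (a sketch):
#
#   import wordnet
#   wordnet.buildIndexFiles()   # writes noun.pyidx etc. into WNSEARCHDIR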

#
# Lookup functions
#

def getWord(form, pos='noun'):
    "Return a word with the given lexical form and pos."
    return _dictionaryFor(pos).getWord(form)

def getSense(form, pos='noun', senseno=0):
    "Lookup a sense by its sense number.  Used by repr(sense)."
    return getWord(form, pos)[senseno]

def getSynset(pos, offset):
    "Lookup a synset by its offset.  Used by repr(synset)."
    return _dictionaryFor(pos).getSynset(offset)

getword, getsense, getsynset = getWord, getSense, getSynset
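
# A Sense stores its synset by key, so the key round-trips through
# getSynset; for example (a sketch):
#
#   dog = N['dog'][0]
#   getSynset(dog.pos, dog.synsetOffset) == dog.synset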

#
# Private utilities
#

def _requirePointerType(pointerType):
    if pointerType not in POINTER_TYPES:
        raise TypeError, repr(pointerType) + " is not a pointer type"
    return pointerType

def _compareInstances(a, b, fields):
    """Return -1, 0, or 1 according to a comparison first by type,
    then by class, and finally by each of fields."""
    if not hasattr(b, '__class__'):
        return cmp(type(a), type(b))
    elif a.__class__ != b.__class__:
        return cmp(a.__class__, b.__class__)
    for field in fields:
        diff = cmp(getattr(a, field), getattr(b, field))
        if diff:
            return diff
    return 0

def _equalsIgnoreCase(a, b):
    """Return true iff a and b have the same lowercase representation.

    >>> _equalsIgnoreCase('dog', 'Dog')
    1
    >>> _equalsIgnoreCase('dOg', 'DOG')
    1
    """
    return a == b or string.lower(a) == string.lower(b)

#
# File utilities
#
def _dataFilePathname(filenameroot):
    if os.name in ('dos', 'nt'):
        path = os.path.join(WNSEARCHDIR, filenameroot + ".dat")
        if os.path.exists(path):
            return path
    return os.path.join(WNSEARCHDIR, "data." + filenameroot)

def _indexFilePathname(filenameroot):
    if os.name in ('dos', 'nt'):
        path = os.path.join(WNSEARCHDIR, filenameroot + ".idx")
        if os.path.exists(path):
            return path
    return os.path.join(WNSEARCHDIR, "index." + filenameroot)

def binarySearchFile(file, key, cache={}, cacheDepth=-1):
    """Return the line of the sorted file that starts with key followed
    by a space, or None if there is no such line."""
    from stat import ST_SIZE
    key = key + ' '
    keylen = len(key)
    start, end = 0, os.stat(file.name)[ST_SIZE]
    currentDepth = 0
    while start < end:
        lastState = start, end
        middle = (start + end) / 2
        if cache.get(middle):
            offset, line = cache[middle]
        else:
            # Seek to the byte in the middle of the range, then discard
            # the (probably partial) line to align on a line boundary.
            file.seek(max(0, middle - 1))
            if middle > 0:
                file.readline()
            offset, line = file.tell(), file.readline()
            if currentDepth < cacheDepth:
                cache[middle] = (offset, line)
        if offset > end:
            assert end != middle - 1, "infinite loop"
            end = middle - 1
        elif line[:keylen] == key:
            return line
        elif line > key:
            assert end != middle - 1, "infinite loop"
            end = middle - 1
        elif line < key:
            start = offset + len(line) - 1
        currentDepth = currentDepth + 1
        thisState = start, end
        if lastState == thisState:
            # detects the condition where we're searching past the end
            # of the file, which is otherwise difficult to detect
            return None
    return None
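
# For example (a sketch; assumes the noun index is installed), this
# fetches the index line for 'dog' without scanning the whole file:
#
#   f = open(_indexFilePathname('noun'), _FILE_OPEN_MODE)
#   print binarySearchFile(f, 'dog')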

def _lineAt(file, offset):
    file.seek(offset)
    return file.readline()


#
# Sequence Utility Functions
#

def _index(key, sequence, testfn=None, keyfn=None):
    """Return the index of key within sequence, using testfn for
    comparison and transforming items of sequence by keyfn first.

    >>> _index('e', 'hello')
    1
    >>> _index('E', 'hello', testfn=_equalsIgnoreCase)
    1
    >>> _index('x', 'hello')
    """
    index = 0
    for element in sequence:
        value = element
        if keyfn:
            value = keyfn(value)
        if (not testfn and value == key) or (testfn and testfn(value, key)):
            return index
        index = index + 1
    return None

def _partition(sequence, size, count):
    """Partition sequence into count subsequences of size length, and
    a remainder.

    Return (partitions, remainder), where partitions is a sequence of
    count subsequences, each of length size, and concatenating the
    partitions and the remainder reproduces sequence."""
    partitions = []
    for index in range(0, size * count, size):
        partitions.append(sequence[index:index + size])
    return (partitions, sequence[size * count:])
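
# For example (a sketch):
#
#   >>> _partition('abcdef', 2, 2)
#   (['ab', 'cd'], 'ef')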


#
# Cache management
#
# Some kind of cache is necessary since Sense -> Synset references are
# stored by key, and it's nice not to have to cons a new copy of a
# Synset that's been paged in each time a Sense's synset is retrieved.
# Ideally, we'd use a weak dict, but there aren't any.  A strong dict
# reintroduces the problem that eliminating the Sense <-> Synset
# circularity was intended to resolve: every entity ever seen is
# preserved forever, making operations that iterate over the entire
# database prohibitive.
#
# The LRUCache approximates a weak dict in the case where temporal
# locality is good.

class _LRUCache:
    """A cache of values such that the least recently used element is
    flushed when the cache fills.

    Private fields
    --------------
    values
      a dict from key -> (value, timestamp)
    history
      a dict from timestamp -> key
    nextTimestamp
      the timestamp to use with the next value that's added
    oldestTimestamp
      the timestamp of the oldest element (the next one to remove),
      or slightly lower than that

    Keeping both dicts lets us retrieve the key given the timestamp,
    and the timestamp given the key.  (Also the value given either
    one.)  That's necessary so that we can reorder the history given
    a key, and also manipulate the values dict given a timestamp.

    I haven't tried changing history to a List.  An earlier
    implementation of history as a List was slower than what's here,
    but the two implementations aren't directly comparable."""

    def __init__(this, capacity):
        this.capacity = capacity
        this.clear()

    def clear(this):
        this.values = {}
        this.history = {}
        this.oldestTimestamp = 0
        this.nextTimestamp = 1

    def removeOldestEntry(this):
        while this.oldestTimestamp < this.nextTimestamp:
            if this.history.get(this.oldestTimestamp):
                key = this.history[this.oldestTimestamp]
                del this.history[this.oldestTimestamp]
                del this.values[key]
                return
            this.oldestTimestamp = this.oldestTimestamp + 1

    def setCapacity(this, capacity):
        if capacity == 0:
            this.clear()
        else:
            this.capacity = capacity
            while len(this.values) > this.capacity:
                this.removeOldestEntry()

    def get(this, key, loadfn=None):
        value = None
        if this.values:
            pair = this.values.get(key)
            if pair:
                (value, timestamp) = pair
                del this.history[timestamp]
        if value is None:
            value = loadfn and loadfn()
        if this.values is not None:
            timestamp = this.nextTimestamp
            this.nextTimestamp = this.nextTimestamp + 1
            this.values[key] = (value, timestamp)
            this.history[timestamp] = key
            if len(this.values) > this.capacity:
                this.removeOldestEntry()
        return value

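
# Typical use pairs get() with a loader thunk that is only invoked on a
# cache miss; for example (a sketch -- loadEntity stands in for any
# expensive loader and is hypothetical):
#
#   cache = _LRUCache(2)
#   cache.get('a', lambda: loadEntity('a'))   # miss: calls loadEntity
#   cache.get('a', lambda: loadEntity('a'))   # hit: returns cached value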

class _NullCache:
    """A NullCache implements the Cache interface (the interface that
    LRUCache implements), but doesn't store any values."""

    def clear(this):
        pass

    def get(this, key, loadfn=None):
        return loadfn and loadfn()


DEFAULT_CACHE_CAPACITY = 1000
_entityCache = _LRUCache(DEFAULT_CACHE_CAPACITY)

def disableCache():
    """Disable the entity cache."""
    global _entityCache
    _entityCache = _NullCache()

def enableCache():
    """Enable the entity cache."""
    global _entityCache
    if not isinstance(_entityCache, _LRUCache):
        _entityCache = _LRUCache(DEFAULT_CACHE_CAPACITY)

def clearCache():
    """Clear the entity cache."""
    _entityCache.clear()

def setCacheCapacity(capacity=DEFAULT_CACHE_CAPACITY):
    """Set the capacity of the entity cache."""
    enableCache()
    _entityCache.setCapacity(capacity)

setCacheSize = setCacheCapacity # for compatibility with version 1.0


#
# POS Dictionaries (must be initialized after file utilities)
#

N = Dictionary(NOUN, 'noun')
V = Dictionary(VERB, 'verb')
ADJ = Dictionary(ADJECTIVE, 'adj')
ADV = Dictionary(ADVERB, 'adv')
Dictionaries = (N, V, ADJ, ADV)


#
# Part-of-speech tag normalization tables (must be initialized after
# POS dictionaries)
#

_POSNormalizationTable = {}
_POStoDictionaryTable = {}

def _initializePOSTables():
    global _POSNormalizationTable, _POStoDictionaryTable
    _POSNormalizationTable = {}
    _POStoDictionaryTable = {}
    for pos, abbreviations in (
            (NOUN, "noun n n."),
            (VERB, "verb v v."),
            (ADJECTIVE, "adjective adj adj. a s"),
            (ADVERB, "adverb adv adv. r")):
        tokens = string.split(abbreviations)
        for token in tokens:
            _POSNormalizationTable[token] = pos
            _POSNormalizationTable[string.upper(token)] = pos
    for dict in Dictionaries:
        _POSNormalizationTable[dict] = dict.pos
        _POStoDictionaryTable[dict.pos] = dict

_initializePOSTables()

def _normalizePOS(pos):
    norm = _POSNormalizationTable.get(pos)
    if norm:
        return norm
    raise TypeError, repr(pos) + " is not a part of speech type"

def _dictionaryFor(pos):
    pos = _normalizePOS(pos)
    dict = _POStoDictionaryTable.get(pos)
    if dict is None:
        raise RuntimeError, "The " + repr(pos) + " dictionary has not been created"
    return dict
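
# The normalization table accepts common POS spellings as well as the
# dictionary objects themselves; for example (a sketch):
#
#   _normalizePOS('n') == NOUN
#   _normalizePOS(ADJ) == ADJECTIVE
#   _dictionaryFor('adv.') is ADV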

def buildIndexFiles():
    for dict in Dictionaries:
        dict._buildIndexCacheFile()


#
# Testing
#

def _testKeys():
    # This is slow, so don't do it as part of the normal test procedure.
    for dictionary in Dictionaries:
        dictionary._testKeys()

def _test(reset=0):
    import doctest, wordnet
    if reset:
        doctest.master = None # This keeps doctest from complaining after a reload.
    return doctest.testmod(wordnet)
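
# Convenience entry point: running the module directly executes the
# doctest suite (assumes the WordNet 2.0 database files are installed
# under WNSEARCHDIR).
if __name__ == '__main__':
    _test()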