# Module wordnet.py
#
# Original author: Oliver Steele <steele@osteele.com>
# Project Page: http://sourceforge.net/projects/pywordnet
#
# Copyright (c) 1998-2004 by Oliver Steele.  Use is permitted under
# the Artistic License
# <http://www.opensource.org/licenses/artistic-license.html>

"""An OO interface to the WordNet database.

Usage
-----
>>> from wordnet import *

>>> # Retrieve words from the database
>>> N['dog']
dog(n.)
>>> V['dog']
dog(v.)
>>> ADJ['clear']
clear(adj.)
>>> ADV['clearly']
clearly(adv.)

>>> # Examine a word's senses and pointers:
>>> N['dog'].getSenses()
('dog' in {noun: dog, domestic dog, Canis familiaris}, 'dog' in {noun: frump, dog}, 'dog' in {noun: dog}, 'dog' in {noun: cad, bounder, blackguard, dog, hound, heel}, 'dog' in {noun: frank, frankfurter, hotdog, hot dog, dog, wiener, wienerwurst, weenie}, 'dog' in {noun: pawl, detent, click, dog}, 'dog' in {noun: andiron, firedog, dog, dog-iron})
>>> # Extract the first sense
>>> dog = N['dog'][0]   # aka N['dog'].getSenses()[0]
>>> dog
'dog' in {noun: dog, domestic dog, Canis familiaris}
>>> dog.getPointers()[:5]
(hypernym -> {noun: canine, canid}, member meronym -> {noun: Canis, genus Canis}, member meronym -> {noun: pack}, hyponym -> {noun: pooch, doggie, doggy, barker, bow-wow}, hyponym -> {noun: cur, mongrel, mutt})
>>> dog.getPointerTargets(MEMBER_MERONYM)
[{noun: Canis, genus Canis}, {noun: pack}]
"""

__author__ = "Oliver Steele <steele@osteele.com>"
__version__ = "2.0.1"

import string
import os
from os import environ
from types import IntType, ListType, StringType, TupleType


#
# Configuration variables
#

WNHOME = environ.get('WNHOME', {
    'mac': ":",
    'dos': "C:\\wn16",
    'nt': "C:\\Program Files\\WordNet\\2.0"}
    .get(os.name, "/usr/local/share/py-wordnet"))

WNSEARCHDIR = environ.get('WNSEARCHDIR', WNHOME)

ReadableRepresentations = 1
"""If true, repr(word), repr(sense), and repr(synset) return
human-readable strings instead of strings that evaluate to an object
equal to the argument.

This breaks the contract for repr, but it makes the system much more
usable from the command line."""
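
# A minimal sketch of what toggling ReadableRepresentations does (the
# helper name is invented for illustration and is not part of the
# original API; it can only be called once the module is fully loaded):
def _exampleMachineReadableRepr():
    global ReadableRepresentations
    ReadableRepresentations = 0
    print repr(N['dog'])    # prints: getWord('dog', 'noun')
    ReadableRepresentations = 1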

_TraceLookups = 0

_FILE_OPEN_MODE = os.name in ('dos', 'nt') and 'rb' or 'r'  # work around a Windows Python bug


#
# Enumerated types
#

NOUN = 'noun'
VERB = 'verb'
ADJECTIVE = 'adjective'
ADVERB = 'adverb'
PartsOfSpeech = (NOUN, VERB, ADJECTIVE, ADVERB)

ANTONYM = 'antonym'
HYPERNYM = 'hypernym'
HYPONYM = 'hyponym'
ATTRIBUTE = 'attribute'
ALSO_SEE = 'also see'
ENTAILMENT = 'entailment'
CAUSE = 'cause'
VERB_GROUP = 'verb group'
MEMBER_MERONYM = 'member meronym'
SUBSTANCE_MERONYM = 'substance meronym'
PART_MERONYM = 'part meronym'
MEMBER_HOLONYM = 'member holonym'
SUBSTANCE_HOLONYM = 'substance holonym'
PART_HOLONYM = 'part holonym'
SIMILAR = 'similar'
PARTICIPLE_OF = 'participle of'
PERTAINYM = 'pertainym'
# New in wn 2.0:
FRAMES = 'frames'
CLASSIF_CATEGORY = 'domain category'
CLASSIF_USAGE = 'domain usage'
CLASSIF_REGIONAL = 'domain regional'
CLASS_CATEGORY = 'class category'
CLASS_USAGE = 'class usage'
CLASS_REGIONAL = 'class regional'

POINTER_TYPES = (
    ANTONYM,
    HYPERNYM,
    HYPONYM,
    ATTRIBUTE,
    ALSO_SEE,
    ENTAILMENT,
    CAUSE,
    VERB_GROUP,
    MEMBER_MERONYM,
    SUBSTANCE_MERONYM,
    PART_MERONYM,
    MEMBER_HOLONYM,
    SUBSTANCE_HOLONYM,
    PART_HOLONYM,
    SIMILAR,
    PARTICIPLE_OF,
    PERTAINYM,
    # New in wn 2.0:
    FRAMES,
    CLASSIF_CATEGORY,
    CLASSIF_USAGE,
    CLASSIF_REGIONAL,
    CLASS_CATEGORY,
    CLASS_USAGE,
    CLASS_REGIONAL,
    )

ATTRIBUTIVE = 'attributive'
PREDICATIVE = 'predicative'
IMMEDIATE_POSTNOMINAL = 'immediate postnominal'
ADJECTIVE_POSITIONS = (ATTRIBUTIVE, PREDICATIVE, IMMEDIATE_POSTNOMINAL, None)

VERB_FRAME_STRINGS = (
    None,
    "Something %s",
    "Somebody %s",
    "It is %sing",
    "Something is %sing PP",
    "Something %s something Adjective/Noun",
    "Something %s Adjective/Noun",
    "Somebody %s Adjective",
    "Somebody %s something",
    "Somebody %s somebody",
    "Something %s somebody",
    "Something %s something",
    "Something %s to somebody",
    "Somebody %s on something",
    "Somebody %s somebody something",
    "Somebody %s something to somebody",
    "Somebody %s something from somebody",
    "Somebody %s somebody with something",
    "Somebody %s somebody of something",
    "Somebody %s something on somebody",
    "Somebody %s somebody PP",
    "Somebody %s something PP",
    "Somebody %s PP",
    "Somebody's (body part) %s",
    "Somebody %s somebody to INFINITIVE",
    "Somebody %s somebody INFINITIVE",
    "Somebody %s that CLAUSE",
    "Somebody %s to somebody",
    "Somebody %s to INFINITIVE",
    "Somebody %s whether INFINITIVE",
    "Somebody %s somebody into V-ing something",
    "Somebody %s something with something",
    "Somebody %s INFINITIVE",
    "Somebody %s VERB-ing",
    "It %s that CLAUSE",
    "Something %s INFINITIVE")
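
# Each frame template above has one "%s" slot for the verb form.  A
# minimal rendering sketch (the helper and the sample indices are
# illustrative, not taken from the database):
def _exampleVerbFrame():
    print VERB_FRAME_STRINGS[2] % 'runs'    # Somebody runs
    print VERB_FRAME_STRINGS[8] % 'sees'    # Somebody sees something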

#
# Domain classes
#
class Word:
    """An index into the database.

    Each word has one or more Senses, which can be accessed via
    ``word.getSenses()`` or through the index notation, ``word[n]``.

    Fields
    ------
    form : string
        The orthographic representation of the word.
    pos : string
        The part of speech -- one of NOUN, VERB, ADJECTIVE, ADVERB.
    string : string
        Same as form (for compatibility with version 1.0).
    taggedSenseCount : integer
        The number of senses that are tagged.

    Examples
    --------
    >>> N['dog'].pos
    'noun'
    >>> N['dog'].form
    'dog'
    >>> N['dog'].taggedSenseCount
    1
    """

    def __init__(self, line):
        """Initialize the word from a line of a WN POS file."""
        tokens = string.split(line)
        ints = map(int, tokens[int(tokens[3]) + 4:])
        self.form = string.replace(tokens[0], '_', ' ')
        "Orthographic representation of the word."
        self.pos = _normalizePOS(tokens[1])
        "Part of speech.  One of NOUN, VERB, ADJECTIVE, ADVERB."
        self.taggedSenseCount = ints[1]
        "Number of senses that are tagged."
        self._synsetOffsets = ints[2:ints[0] + 2]

    def getPointers(self, pointerType=None):
        """Pointers connect senses and synsets, not words.
        Try word[0].getPointers() instead."""
        raise self.getPointers.__doc__

    def getPointerTargets(self, pointerType=None):
        """Pointers connect senses and synsets, not words.
        Try word[0].getPointerTargets() instead."""
        raise self.getPointerTargets.__doc__

    def getSenses(self):
        """Return a sequence of senses.

        >>> N['dog'].getSenses()
        ('dog' in {noun: dog, domestic dog, Canis familiaris}, 'dog' in {noun: frump, dog}, 'dog' in {noun: dog}, 'dog' in {noun: cad, bounder, blackguard, dog, hound, heel}, 'dog' in {noun: frank, frankfurter, hotdog, hot dog, dog, wiener, wienerwurst, weenie}, 'dog' in {noun: pawl, detent, click, dog}, 'dog' in {noun: andiron, firedog, dog, dog-iron})
        """
        if not hasattr(self, '_senses'):
            def getSense(offset, pos=self.pos, form=self.form):
                return getSynset(pos, offset)[form]
            self._senses = tuple(map(getSense, self._synsetOffsets))
            del self._synsetOffsets
        return self._senses

    # Deprecated.  Present for backwards compatibility.
    def senses(self):
        return self.getSenses()

    def isTagged(self):
        """Return 1 if any sense is tagged.

        >>> N['dog'].isTagged()
        1
        """
        return self.taggedSenseCount > 0

    def getAdjectivePositions(self):
        """Return a sequence of adjective positions that this word can
        appear in.  These are elements of ADJECTIVE_POSITIONS.

        >>> ADJ['clear'].getAdjectivePositions()
        [None, 'predicative']
        """
        positions = {}
        for sense in self.getSenses():
            positions[sense.position] = 1
        return positions.keys()

    adjectivePositions = getAdjectivePositions  # backwards compatibility

    def __cmp__(self, other):
        """
        >>> N['cat'] < N['dog']
        1
        >>> N['dog'] < V['dog']
        1
        """
        return _compareInstances(self, other, ('pos', 'form'))

    def __str__(self):
        """Return a human-readable representation.

        >>> str(N['dog'])
        'dog(n.)'
        """
        abbrs = {NOUN: 'n.', VERB: 'v.', ADJECTIVE: 'adj.', ADVERB: 'adv.'}
        return self.form + "(" + abbrs[self.pos] + ")"
301 """ 302 if ReadableRepresentations: 303 return str(self) 304 return "getWord" + `(self.form, self.pos)` 305 306 # 307 # Sequence protocol (a Word's elements are its Senses) 308 # 309 def __nonzero__(self): 310 return 1 311 312 def __len__(self): 313 return len(self.getSenses()) 314 315 def __getitem__(self, index): 316 return self.getSenses()[index] 317 318 def __getslice__(self, i, j): 319 return self.getSenses()[i:j] 320 321 322class Synset: 323 """A set of synonyms that share a common meaning. 324 325 Each synonym contains one or more Senses, which represent a 326 specific sense of a specific word. Senses can be retrieved via 327 synset.getSenses() or through the index notations synset[0], 328 synset[string], or synset[word]. Synsets also originate zero or 329 more typed pointers, which can be accessed via 330 synset.getPointers() or synset.getPointers(pointerType). The 331 targets of a synset pointer can be retrieved via 332 synset.getPointerTargets() or 333 synset.getPointerTargets(pointerType), which are equivalent to 334 map(Pointer.target, synset.getPointerTargets(...)). 335 336 Fields 337 ------ 338 pos : string 339 The part of speech -- one of NOUN, VERB, ADJECTIVE, ADVERB. 340 offset : integer 341 An integer offset into the part-of-speech file. Together 342 with pos, this can be used as a unique id. 343 gloss : string 344 A gloss for the sense. 345 verbFrames : [integer] 346 A sequence of integers that index into 347 VERB_FRAME_STRINGS. These list the verb frames that any 348 Sense in this synset participates in. (See also 349 Sense.verbFrames.) Defined only for verbs. 350 351 >>> V['think'][0].synset.verbFrames 352 (5, 9) 353 """ 354 355 def __init__(self, pos, offset, line): 356 "Initialize the synset from a line off a WN synset file." 357 self.pos = pos 358 "part of speech -- one of NOUN, VERB, ADJECTIVE, ADVERB." 359 self.offset = offset 360 """integer offset into the part-of-speech file. Together 361 with pos, this can be used as a unique id.""" 362 tokens = string.split(line[:string.index(line, '|')]) 363 self.ssType = tokens[2] 364 self.gloss = string.strip(line[string.index(line, '|') + 1:]) 365 self.lexname = Lexname.lexnames[int(tokens[1])] 366 (self._senseTuples, remainder) = _partition(tokens[4:], 2, string.atoi(tokens[3], 16)) 367 (self._pointerTuples, remainder) = _partition(remainder[1:], 4, int(remainder[0])) 368 if pos == VERB: 369 (vfTuples, remainder) = _partition(remainder[1:], 3, int(remainder[0])) 370 def extractVerbFrames(index, vfTuples): 371 return tuple(map(lambda t:string.atoi(t[1]), filter(lambda t,i=index:string.atoi(t[2],16) in (0, i), vfTuples))) 372 senseVerbFrames = [] 373 for index in range(1, len(self._senseTuples) + 1): 374 senseVerbFrames.append(extractVerbFrames(index, vfTuples)) 375 self._senseVerbFrames = senseVerbFrames 376 self.verbFrames = tuple(extractVerbFrames(None, vfTuples)) 377 """A sequence of integers that index into 378 VERB_FRAME_STRINGS. These list the verb frames that any 379 Sense in this synset participates in. (See also 380 Sense.verbFrames.) Defined only for verbs.""" 381 382 def getSenses(self): 383 """Return a sequence of Senses. 

    def getSenses(self):
        """Return a sequence of Senses.

        >>> N['dog'][0].getSenses()
        ('dog' in {noun: dog, domestic dog, Canis familiaris},)
        """
        if not hasattr(self, '_senses'):
            def loadSense(senseTuple, verbFrames=None, synset=self):
                return Sense(synset, senseTuple, verbFrames)
            if self.pos == VERB:
                self._senses = tuple(map(loadSense, self._senseTuples, self._senseVerbFrames))
                del self._senseVerbFrames
            else:
                self._senses = tuple(map(loadSense, self._senseTuples))
            del self._senseTuples
        return self._senses

    senses = getSenses

    def getPointers(self, pointerType=None):
        """Return a sequence of Pointers.

        If pointerType is specified, only pointers of that type are
        returned.  In this case, pointerType should be an element of
        POINTER_TYPES.

        >>> N['dog'][0].getPointers()[:5]
        (hypernym -> {noun: canine, canid}, member meronym -> {noun: Canis, genus Canis}, member meronym -> {noun: pack}, hyponym -> {noun: pooch, doggie, doggy, barker, bow-wow}, hyponym -> {noun: cur, mongrel, mutt})
        >>> N['dog'][0].getPointers(HYPERNYM)
        (hypernym -> {noun: canine, canid},)
        """
        if not hasattr(self, '_pointers'):
            def loadPointer(tuple, synset=self):
                return Pointer(synset.offset, tuple)
            self._pointers = tuple(map(loadPointer, self._pointerTuples))
            del self._pointerTuples
        if pointerType == None:
            return self._pointers
        else:
            _requirePointerType(pointerType)
            return filter(lambda pointer, type=pointerType: pointer.type == type, self._pointers)

    pointers = getPointers  # backwards compatibility

    def getPointerTargets(self, pointerType=None):
        """Return a sequence of Senses or Synsets.

        If pointerType is specified, only targets of pointers of that
        type are returned.  In this case, pointerType should be an
        element of POINTER_TYPES.

        >>> N['dog'][0].getPointerTargets()[:5]
        [{noun: canine, canid}, {noun: Canis, genus Canis}, {noun: pack}, {noun: pooch, doggie, doggy, barker, bow-wow}, {noun: cur, mongrel, mutt}]
        >>> N['dog'][0].getPointerTargets(HYPERNYM)
        [{noun: canine, canid}]
        """
        return map(Pointer.target, self.getPointers(pointerType))

    pointerTargets = getPointerTargets  # backwards compatibility

    def isTagged(self):
        """Return 1 if any sense is tagged.

        >>> N['dog'][0].isTagged()
        1
        >>> N['dog'][1].isTagged()
        0
        """
        return len(filter(Sense.isTagged, self.getSenses())) > 0

    def __str__(self):
        """Return a human-readable representation.

        >>> str(N['dog'][0].synset)
        '{noun: dog, domestic dog, Canis familiaris}'
        """
        return "{" + self.pos + ": " + string.joinfields(map(lambda sense: sense.form, self.getSenses()), ", ") + "}"

    def __repr__(self):
        """If ReadableRepresentations is true, return a human-readable
        representation, e.g. '{noun: dog, domestic dog, Canis familiaris}'.

        If ReadableRepresentations is false, return a machine-readable
        representation, e.g. "getSynset(pos, 1234)".
        """
        if ReadableRepresentations:
            return str(self)
        return "getSynset" + `(self.pos, self.offset)`

    def __cmp__(self, other):
        return _compareInstances(self, other, ('pos', 'offset'))

    #
    # Sequence protocol (a Synset's elements are its senses).
    #
    def __nonzero__(self):
        return 1

    def __len__(self):
        """
        >>> len(N['dog'][0].synset)
        3
        """
        return len(self.getSenses())

    def __getitem__(self, idx):
        """
        >>> N['dog'][0].synset[0] == N['dog'][0]
        1
        >>> N['dog'][0].synset['dog'] == N['dog'][0]
        1
        >>> N['dog'][0].synset[N['dog']] == N['dog'][0]
        1
        >>> N['cat'][6]
        'cat' in {noun: big cat, cat}
        """
        senses = self.getSenses()
        if isinstance(idx, Word):
            idx = idx.form
        if isinstance(idx, StringType):
            idx = _index(idx, map(lambda sense: sense.form, senses)) or \
                  _index(idx, map(lambda sense: sense.form, senses), _equalsIgnoreCase)
        return senses[idx]

    def __getslice__(self, i, j):
        return self.getSenses()[i:j]
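
# A small traversal sketch (the helper name is invented; it is not part
# of the original API).  HYPERNYM pointers lead from a synset to its
# more general synsets, so repeatedly following the first target walks
# up the taxonomy:
def _exampleHypernymChain(synset):
    while synset:
        print str(synset)
        hypernyms = synset.getPointerTargets(HYPERNYM)
        synset = hypernyms and hypernyms[0] or None

# e.g. _exampleHypernymChain(N['dog'][0].synset) prints {noun: dog, ...},
# then {noun: canine, canid}, and so on up the hierarchy.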
592 """ 593 if ReadableRepresentations: 594 return str(self) 595 return "%s[%s]" % (`self.synset`, `self.form`) 596 597 def getPointers(self, pointerType=None): 598 """Return a sequence of Pointers. 599 600 If pointerType is specified, only pointers of that type are 601 returned. In this case, pointerType should be an element of 602 POINTER_TYPES. 603 604 >>> N['dog'][0].getPointers()[:5] 605 (hypernym -> {noun: canine, canid}, member meronym -> {noun: Canis, genus Canis}, member meronym -> {noun: pack}, hyponym -> {noun: pooch, doggie, doggy, barker, bow-wow}, hyponym -> {noun: cur, mongrel, mutt}) 606 >>> N['dog'][0].getPointers(HYPERNYM) 607 (hypernym -> {noun: canine, canid},) 608 """ 609 senseIndex = _index(self, self.synset.getSenses()) 610 def pointsFromThisSense(pointer, selfIndex=senseIndex): 611 return pointer.sourceIndex == 0 or pointer.sourceIndex - 1 == selfIndex 612 return filter(pointsFromThisSense, self.synset.getPointers(pointerType)) 613 614 pointers = getPointers # backwards compatability 615 616 def getPointerTargets(self, pointerType=None): 617 """Return a sequence of Senses or Synsets. 618 619 If pointerType is specified, only targets of pointers of that 620 type are returned. In this case, pointerType should be an 621 element of POINTER_TYPES. 622 623 >>> N['dog'][0].getPointerTargets()[:5] 624 [{noun: canine, canid}, {noun: Canis, genus Canis}, {noun: pack}, {noun: pooch, doggie, doggy, barker, bow-wow}, {noun: cur, mongrel, mutt}] 625 >>> N['dog'][0].getPointerTargets(HYPERNYM) 626 [{noun: canine, canid}] 627 """ 628 return map(Pointer.target, self.getPointers(pointerType)) 629 630 pointerTargets = getPointerTargets # backwards compatability 631 632 def getSenses(self): 633 return self, 634 635 senses = getSenses # backwards compatability 636 637 def isTagged(self): 638 """Return 1 if any sense is tagged. 639 640 >>> N['dog'][0].isTagged() 641 1 642 >>> N['dog'][1].isTagged() 643 0 644 """ 645 word = self.word() 646 return _index(self, word.getSenses()) < word.taggedSenseCount 647 648 def getWord(self): 649 return getWord(self.form, self.pos) 650 651 word = getWord # backwards compatability 652 653 def __cmp__(self, other): 654 def senseIndex(sense, synset=self.synset): 655 return _index(sense, synset.getSenses(), testfn=lambda a,b: a.form == b.form) 656 return _compareInstances(self, other, ('synset',)) or cmp(senseIndex(self), senseIndex(other)) 657 658 659class Pointer: 660 """ A typed directional relationship between Senses or Synsets. 661 662 Fields 663 ------ 664 type : string 665 One of POINTER_TYPES. 666 pos : string 667 The part of speech -- one of NOUN, VERB, ADJECTIVE, ADVERB. 
668 """ 669 670 _POINTER_TYPE_TABLE = { 671 '!': ANTONYM, 672 '@': HYPERNYM, 673 '~': HYPONYM, 674 '=': ATTRIBUTE, 675 '^': ALSO_SEE, 676 '*': ENTAILMENT, 677 '>': CAUSE, 678 '$': VERB_GROUP, 679 '#m': MEMBER_MERONYM, 680 '#s': SUBSTANCE_MERONYM, 681 '#p': PART_MERONYM, 682 '%m': MEMBER_HOLONYM, 683 '%s': SUBSTANCE_HOLONYM, 684 '%p': PART_HOLONYM, 685 '&': SIMILAR, 686 '<': PARTICIPLE_OF, 687 '\\': PERTAINYM, 688 # New in wn 2.0: 689 '+': FRAMES, 690 ';c': CLASSIF_CATEGORY, 691 ';u': CLASSIF_USAGE, 692 ';r': CLASSIF_REGIONAL, 693 '-c': CLASS_CATEGORY, 694 '-u': CLASS_USAGE, 695 '-r': CLASS_REGIONAL 696 } 697 698 def __init__(self, sourceOffset, pointerTuple): 699 (type, offset, pos, indices) = pointerTuple 700 self.type = Pointer._POINTER_TYPE_TABLE[type] 701 """One of POINTER_TYPES.""" 702 self.sourceOffset = sourceOffset 703 self.targetOffset = int(offset) 704 self.pos = _normalizePOS(pos) 705 """part of speech -- one of NOUN, VERB, ADJECTIVE, ADVERB""" 706 indices = string.atoi(indices, 16) 707 self.sourceIndex = indices >> 8 708 self.targetIndex = indices & 255 709 710 def getSource(self): 711 synset = getSynset(self.pos, self.sourceOffset) 712 if self.sourceIndex: 713 return synset[self.sourceIndex - 1] 714 else: 715 return synset 716 717 source = getSource # backwards compatability 718 719 def getTarget(self): 720 synset = getSynset(self.pos, self.targetOffset) 721 if self.targetIndex: 722 return synset[self.targetIndex - 1] 723 else: 724 return synset 725 726 target = getTarget # backwards compatability 727 728 def __str__(self): 729 return self.type + " -> " + str(self.target()) 730 731 def __repr__(self): 732 if ReadableRepresentations: 733 return str(self) 734 return "<" + str(self) + ">" 735 736 def __cmp__(self, other): 737 diff = _compareInstances(self, other, ('pos', 'sourceOffset')) 738 if diff: 739 return diff 740 synset = self.source() 741 def pointerIndex(sense, synset=synset): 742 return _index(sense, synset.getPointers(), testfn=lambda a,b: not _compareInstances(a, b, ('type', 'sourceIndex', 'targetIndex'))) 743 return cmp(pointerIndex(self), pointerIndex(other)) 744 745 746# Loading the lexnames 747# Klaus Ries <ries@cs.cmu.edu> 748 749class Lexname: 750 dict = {} 751 lexnames = [] 752 753 def __init__(self,name,category): 754 self.name = name 755 self.category = category 756 Lexname.dict[name] = self 757 Lexname.lexnames.append(self) 758 759 def __str__(self): 760 return self.name 761 762def setupLexnames(): 763 for l in open(WNSEARCHDIR+'/lexnames').readlines(): 764 i,name,category = string.split(l) 765 Lexname(name,PartsOfSpeech[int(category)-1]) 766 767setupLexnames() 768 769# 770# Dictionary 771# 772class Dictionary: 773 774 """A Dictionary contains all the Words in a given part of speech. 775 This module defines four dictionaries, bound to N, V, ADJ, and ADV. 776 777 Indexing a dictionary by a string retrieves the word named by that 778 string, e.g. dict['dog']. Indexing by an integer n retrieves the 779 nth word, e.g. dict[0]. Access by an arbitrary integer is very 780 slow except in the special case where the words are accessed 781 sequentially; this is to support the use of dictionaries as the 782 range of a for statement and as the sequence argument to map and 783 filter. 784 785 Example 786 ------- 787 >>> N['dog'] 788 dog(n.) 789 790 Fields 791 ------ 792 pos : string 793 The part of speech -- one of NOUN, VERB, ADJECTIVE, ADVERB. 
794 """ 795 796 def __init__(self, pos, filenameroot): 797 self.pos = pos 798 """part of speech -- one of NOUN, VERB, ADJECTIVE, ADVERB""" 799 self.indexFile = _IndexFile(pos, filenameroot) 800 self.dataFile = open(_dataFilePathname(filenameroot), _FILE_OPEN_MODE) 801 802 def __repr__(self): 803 dictionaryVariables = {N: 'N', V: 'V', ADJ: 'ADJ', ADV: 'ADV'} 804 if dictionaryVariables.get(self): 805 return self.__module__ + "." + dictionaryVariables[self] 806 return "<%s.%s instance for %s>" % (self.__module__, "Dictionary", self.pos) 807 808 def getWord(self, form, line=None): 809 key = string.replace(string.lower(form), ' ', '_') 810 pos = self.pos 811 def loader(key=key, line=line, indexFile=self.indexFile): 812 line = line or indexFile.get(key) 813 return line and Word(line) 814 word = _entityCache.get((pos, key), loader) 815 if word: 816 return word 817 else: 818 raise KeyError, "%s is not in the %s database" % (`form`, `pos`) 819 820 def getSynset(self, offset): 821 pos = self.pos 822 def loader(pos=pos, offset=offset, dataFile=self.dataFile): 823 return Synset(pos, offset, _lineAt(dataFile, offset)) 824 return _entityCache.get((pos, offset), loader) 825 826 def _buildIndexCacheFile(self): 827 self.indexFile._buildIndexCacheFile() 828 829 # 830 # Sequence protocol (a Dictionary's items are its Words) 831 # 832 def __nonzero__(self): 833 """Return false. (This is to avoid scanning the whole index file 834 to compute len when a Dictionary is used in test position.) 835 836 >>> N and 'true' 837 'true' 838 """ 839 return 1 840 841 def __len__(self): 842 """Return the number of index entries. 843 844 >>> len(ADJ) 845 21435 846 """ 847 if not hasattr(self, 'length'): 848 self.length = len(self.indexFile) 849 return self.length 850 851 def __getslice__(self, a, b): 852 results = [] 853 if type(a) == type('') and type(b) == type(''): 854 raise "unimplemented" 855 elif type(a) == type(1) and type(b) == type(1): 856 for i in range(a, b): 857 results.append(self[i]) 858 else: 859 raise TypeError 860 return results 861 862 def __getitem__(self, index): 863 """If index is a String, return the Word whose form is 864 index. If index is an integer n, return the Word 865 indexed by the n'th Word in the Index file. 866 867 >>> N['dog'] 868 dog(n.) 869 >>> N[0] 870 'hood(n.) 871 """ 872 if isinstance(index, StringType): 873 return self.getWord(index) 874 elif isinstance(index, IntType): 875 line = self.indexFile[index] 876 return self.getWord(string.replace(line[:string.find(line, ' ')], '_', ' '), line) 877 else: 878 raise TypeError, "%s is not a String or Int" % `index` 879 880 # 881 # Dictionary protocol 882 # 883 # a Dictionary's values are its words, keyed by their form 884 # 885 886 def get(self, key, default=None): 887 """Return the Word whose form is _key_, or _default_. 888 889 >>> N.get('dog') 890 dog(n.) 891 >>> N.get('inu') 892 """ 893 try: 894 return self[key] 895 except LookupError: 896 return default 897 898 def keys(self): 899 """Return a sorted list of strings that index words in this 900 dictionary.""" 901 return self.indexFile.keys() 902 903 def has_key(self, form): 904 """Return true iff the argument indexes a word in this dictionary. 

    def has_key(self, form):
        """Return true iff the argument indexes a word in this dictionary.

        >>> N.has_key('dog')
        1
        >>> N.has_key('inu')
        0
        """
        return self.indexFile.has_key(form)

    #
    # Testing
    #

    def _testKeys(self):
        """Verify that index lookup can find each word in the index file."""
        print "Testing: ", self
        file = open(self.indexFile.file.name, _FILE_OPEN_MODE)
        counter = 0
        while 1:
            line = file.readline()
            if line == '': break
            if line[0] != ' ':
                key = string.replace(line[:string.find(line, ' ')], '_', ' ')
                if (counter % 1000) == 0:
                    print "%s..." % (key,),
                    import sys
                    sys.stdout.flush()
                counter = counter + 1
                self[key]
        file.close()
        print "done."
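
# Sketch (illustrative helper, not original API): dictionaries iterate
# efficiently in index-file order, so bounded scans are practical, e.g.
# counting the tagged words among the first 'limit' entries:
def _exampleCountTagged(dictionary, limit=1000):
    count = 0
    for word in dictionary[0:limit]:
        if word.isTagged():
            count = count + 1
    return count

# e.g. print _exampleCountTagged(N)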

class _IndexFile:
    """An _IndexFile is an implementation class that presents a
    Sequence and Dictionary interface to a sorted index file."""

    def __init__(self, pos, filenameroot):
        self.pos = pos
        self.file = open(_indexFilePathname(filenameroot), _FILE_OPEN_MODE)
        self.offsetLineCache = {}   # Table of (pathname, offset) -> (line, nextOffset)
        self.rewind()
        self.shelfname = os.path.join(WNSEARCHDIR, pos + ".pyidx")
        try:
            import shelve
            self.indexCache = shelve.open(self.shelfname, 'r')
        except:
            pass

    def rewind(self):
        self.file.seek(0)
        while 1:
            offset = self.file.tell()
            line = self.file.readline()
            if (line[0] != ' '):
                break
        self.nextIndex = 0
        self.nextOffset = offset

    #
    # Sequence protocol (an _IndexFile's items are its lines)
    #
    def __nonzero__(self):
        return 1

    def __len__(self):
        if hasattr(self, 'indexCache'):
            return len(self.indexCache)
        self.rewind()
        lines = 0
        while 1:
            line = self.file.readline()
            if line == "":
                break
            lines = lines + 1
        return lines

    def __getitem__(self, index):
        if isinstance(index, StringType):
            if hasattr(self, 'indexCache'):
                return self.indexCache[index]
            return binarySearchFile(self.file, index, self.offsetLineCache, 8)
        elif isinstance(index, IntType):
            if hasattr(self, 'indexCache'):
                return self.get(self.keys()[index])
            if index < self.nextIndex:
                self.rewind()
            while self.nextIndex <= index:
                self.file.seek(self.nextOffset)
                line = self.file.readline()
                if line == "":
                    raise IndexError, "index out of range"
                self.nextIndex = self.nextIndex + 1
                self.nextOffset = self.file.tell()
            return line
        else:
            raise TypeError, "%s is not a String or Int" % `index`

    #
    # Dictionary protocol
    #
    # (an _IndexFile's values are its lines, keyed by the first word)
    #

    def get(self, key, default=None):
        try:
            return self[key]
        except LookupError:
            return default

    def keys(self):
        if hasattr(self, 'indexCache'):
            keys = self.indexCache.keys()
            keys.sort()
            return keys
        else:
            keys = []
            self.rewind()
            while 1:
                line = self.file.readline()
                if not line: break
                key = line.split(' ', 1)[0]
                keys.append(key.replace('_', ' '))
            return keys

    def has_key(self, key):
        key = key.replace(' ', '_')   # test case: V['haze over']
        if hasattr(self, 'indexCache'):
            return self.indexCache.has_key(key)
        return self.get(key) != None

    #
    # Index file
    #

    def _buildIndexCacheFile(self):
        import shelve
        import os
        print "Building %s:" % (self.shelfname,),
        tempname = self.shelfname + ".temp"
        try:
            indexCache = shelve.open(tempname)
            self.rewind()
            count = 0
            while 1:
                offset, line = self.file.tell(), self.file.readline()
                if not line: break
                key = line[:string.find(line, ' ')]
                if (count % 1000) == 0:
                    print "%s..." % (key,),
                    import sys
                    sys.stdout.flush()
                indexCache[key] = line
                count = count + 1
            indexCache.close()
            os.rename(tempname, self.shelfname)
        finally:
            try: os.remove(tempname)
            except: pass
        print "done."
        self.indexCache = shelve.open(self.shelfname, 'r')

#
# Lookup functions
#

def getWord(form, pos='noun'):
    "Return a word with the given lexical form and pos."
    return _dictionaryFor(pos).getWord(form)

def getSense(form, pos='noun', senseno=0):
    "Lookup a sense by its sense number.  Used by repr(sense)."
    return getWord(form, pos)[senseno]

def getSynset(pos, offset):
    "Lookup a synset by its offset.  Used by repr(synset)."
    return _dictionaryFor(pos).getSynset(offset)

getword, getsense, getsynset = getWord, getSense, getSynset

#
# Private utilities
#

def _requirePointerType(pointerType):
    if pointerType not in POINTER_TYPES:
        raise TypeError, `pointerType` + " is not a pointer type"
    return pointerType

def _compareInstances(a, b, fields):
    """Return -1, 0, or 1 according to a comparison first by type,
    then by class, and finally by each of fields."""
    if not hasattr(b, '__class__'):
        return cmp(type(a), type(b))
    elif a.__class__ != b.__class__:
        return cmp(a.__class__, b.__class__)
    for field in fields:
        diff = cmp(getattr(a, field), getattr(b, field))
        if diff:
            return diff
    return 0

def _equalsIgnoreCase(a, b):
    """Return true iff a and b have the same lowercase representation.

    >>> _equalsIgnoreCase('dog', 'Dog')
    1
    >>> _equalsIgnoreCase('dOg', 'DOG')
    1
    """
    return a == b or string.lower(a) == string.lower(b)


#
# File utilities
#
def _dataFilePathname(filenameroot):
    if os.name in ('dos', 'nt'):
        path = os.path.join(WNSEARCHDIR, filenameroot + ".dat")
        if os.path.exists(path):
            return path
    return os.path.join(WNSEARCHDIR, "data." + filenameroot)

def _indexFilePathname(filenameroot):
    if os.name in ('dos', 'nt'):
        path = os.path.join(WNSEARCHDIR, filenameroot + ".idx")
        if os.path.exists(path):
            return path
    return os.path.join(WNSEARCHDIR, "index." + filenameroot)

def binarySearchFile(file, key, cache={}, cacheDepth=-1):
    from stat import ST_SIZE
    key = key + ' '
    keylen = len(key)
    start, end = 0, os.stat(file.name)[ST_SIZE]
    currentDepth = 0
    while start < end:
        lastState = start, end
        middle = (start + end) / 2
        if cache.get(middle):
            offset, line = cache[middle]
        else:
            file.seek(max(0, middle - 1))
            if middle > 0:
                file.readline()
            offset, line = file.tell(), file.readline()
            if currentDepth < cacheDepth:
                cache[middle] = (offset, line)
        if offset > end:
            assert end != middle - 1, "infinite loop"
            end = middle - 1
        elif line[:keylen] == key:
            return line
        elif line > key:
            assert end != middle - 1, "infinite loop"
            end = middle - 1
        elif line < key:
            start = offset + len(line) - 1
        currentDepth = currentDepth + 1
        thisState = start, end
        if lastState == thisState:
            # detects the condition where we're searching past the end
            # of the file, which is otherwise difficult to detect
            return None
    return None

def _lineAt(file, offset):
    file.seek(offset)
    return file.readline()


#
# Sequence Utility Functions
#

def _index(key, sequence, testfn=None, keyfn=None):
    """Return the index of key within sequence, using testfn for
    comparison and transforming items of sequence by keyfn first.

    >>> _index('e', 'hello')
    1
    >>> _index('E', 'hello', testfn=_equalsIgnoreCase)
    1
    >>> _index('x', 'hello')
    """
    index = 0
    for element in sequence:
        value = element
        if keyfn:
            value = keyfn(value)
        if (not testfn and value == key) or (testfn and testfn(value, key)):
            return index
        index = index + 1
    return None

def _partition(sequence, size, count):
    """Partition sequence into count subsequences of length size, plus
    a remainder.

    Return (partitions, remainder), where partitions is a sequence of
    count subsequences of length size, and
    apply(append, partitions) + remainder == sequence."""

    partitions = []
    for index in range(0, size * count, size):
        partitions.append(sequence[index:index + size])
    return (partitions, sequence[size * count:])
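
# Quick illustrations of the sequence helpers above (the helper name is
# invented; the printed values follow directly from the definitions):
def _exampleSequenceUtilities():
    print _index('l', 'hello')          # 2
    print _partition(range(5), 2, 2)    # ([[0, 1], [2, 3]], [4])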

#
# Cache management
#
# Some kind of cache is necessary since Sense -> Synset references are
# stored by key, and it's nice not to have to cons a new copy of a
# Synset that's been paged in each time a Sense's synset is retrieved.
# Ideally, we'd use a weak dict, but there aren't any.  A strong dict
# reintroduces the problem that eliminating the Sense <-> Synset
# circularity was intended to resolve: every entity ever seen is
# preserved forever, making operations that iterate over the entire
# database prohibitive.
#
# The LRUCache approximates a weak dict in the case where temporal
# locality is good.

class _LRUCache:
    """A cache of values such that the least recently used element is
    flushed when the cache fills.

    Private fields
    --------------
    values
        a dict from key -> (value, timestamp)
    history
        a dict from timestamp -> key
    nextTimestamp
        the timestamp to use with the next value that's added.
    oldestTimestamp
        the timestamp of the oldest element (the next one to remove),
        or slightly lower than that.

    This lets us retrieve the key given the timestamp, and the
    timestamp given the key.  (Also the value given either one.)
    That's necessary so that we can reorder the history given a key,
    and also manipulate the values dict given a timestamp.

    I haven't tried changing history to a List.  An earlier
    implementation of history as a List was slower than what's here,
    but the two implementations aren't directly comparable."""

    def __init__(this, capacity):
        this.capacity = capacity
        this.clear()

    def clear(this):
        this.values = {}
        this.history = {}
        this.oldestTimestamp = 0
        this.nextTimestamp = 1

    def removeOldestEntry(this):
        while this.oldestTimestamp < this.nextTimestamp:
            if this.history.get(this.oldestTimestamp):
                key = this.history[this.oldestTimestamp]
                del this.history[this.oldestTimestamp]
                del this.values[key]
                return
            this.oldestTimestamp = this.oldestTimestamp + 1

    def setCapacity(this, capacity):
        if capacity == 0:
            this.clear()
        else:
            this.capacity = capacity
            while len(this.values) > this.capacity:
                this.removeOldestEntry()

    def get(this, key, loadfn=None):
        value = None
        if this.values:
            pair = this.values.get(key)
            if pair:
                (value, timestamp) = pair
                del this.history[timestamp]
        if value == None:
            value = loadfn and loadfn()
        if this.values != None:
            timestamp = this.nextTimestamp
            this.nextTimestamp = this.nextTimestamp + 1
            this.values[key] = (value, timestamp)
            this.history[timestamp] = key
            if len(this.values) > this.capacity:
                this.removeOldestEntry()
        return value


class _NullCache:
    """A NullCache implements the Cache interface (the interface that
    LRUCache implements), but doesn't store any values."""

    def clear(this):
        pass

    def get(this, key, loadfn=None):
        return loadfn and loadfn()


DEFAULT_CACHE_CAPACITY = 1000
_entityCache = _LRUCache(DEFAULT_CACHE_CAPACITY)

def disableCache():
    """Disable the entity cache."""
    global _entityCache
    _entityCache = _NullCache()

def enableCache():
    """Enable the entity cache."""
    global _entityCache
    if not isinstance(_entityCache, _LRUCache):
        _entityCache = _LRUCache(DEFAULT_CACHE_CAPACITY)

def clearCache():
    """Clear the entity cache."""
    _entityCache.clear()

def setCacheCapacity(capacity=DEFAULT_CACHE_CAPACITY):
    """Set the capacity of the entity cache."""
    enableCache()
    _entityCache.setCapacity(capacity)

setCacheSize = setCacheCapacity   # for compatibility with version 1.0


#
# POS Dictionaries (must be initialized after file utilities)
#

N = Dictionary(NOUN, 'noun')
V = Dictionary(VERB, 'verb')
ADJ = Dictionary(ADJECTIVE, 'adj')
ADV = Dictionary(ADVERB, 'adv')
Dictionaries = (N, V, ADJ, ADV)
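
# Sketch: for jobs that sweep the whole database, the default cache
# capacity can be raised, or the cache disabled, using the functions
# defined above (the helper name and the number are illustrative):
def _exampleCacheTuning():
    setCacheCapacity(10000)    # keep more entities in memory
    print N['dog'][0].getPointerTargets(HYPERNYM)
    setCacheCapacity()         # restore the default capacity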

#
# Part-of-speech tag normalization tables (must be initialized after
# POS dictionaries)
#

_POSNormalizationTable = {}
_POStoDictionaryTable = {}

def _initializePOSTables():
    global _POSNormalizationTable, _POStoDictionaryTable
    _POSNormalizationTable = {}
    _POStoDictionaryTable = {}
    for pos, abbreviations in (
            (NOUN, "noun n n."),
            (VERB, "verb v v."),
            (ADJECTIVE, "adjective adj adj. a s"),
            (ADVERB, "adverb adv adv. r")):
        tokens = string.split(abbreviations)
        for token in tokens:
            _POSNormalizationTable[token] = pos
            _POSNormalizationTable[string.upper(token)] = pos
    for dict in Dictionaries:
        _POSNormalizationTable[dict] = dict.pos
        _POStoDictionaryTable[dict.pos] = dict

_initializePOSTables()

def _normalizePOS(pos):
    norm = _POSNormalizationTable.get(pos)
    if norm:
        return norm
    raise TypeError, `pos` + " is not a part of speech type"

def _dictionaryFor(pos):
    pos = _normalizePOS(pos)
    dict = _POStoDictionaryTable.get(pos)
    if dict == None:
        raise RuntimeError, "The " + `pos` + " dictionary has not been created"
    return dict

def buildIndexFiles():
    for dict in Dictionaries:
        dict._buildIndexCacheFile()


#
# Testing
#

def _testKeys():
    # This is slow, so don't do it as part of the normal test procedure.
    for dictionary in Dictionaries:
        dictionary._testKeys()

def _test(reset=0):
    import doctest, wordnet
    if reset:
        doctest.master = None   # This keeps doctest from complaining after a reload.
    return doctest.testmod(wordnet)
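
# Run the doctest suite when this module is executed as a script (a
# small convenience wrapper around _test; the original module defines
# no entry point):
if __name__ == '__main__':
    _test()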