# Natural Language Toolkit: Relation Extraction
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
Code for extracting relational triples from the ieer and conll2002 corpora.

Relations are stored internally as dictionaries ('reldicts').

The two serialization outputs are "rtuple" and "clause".

- An rtuple is a tuple of the form ``(subj, filler, obj)``,
  where ``subj`` and ``obj`` are pairs of Named Entity mentions, and ``filler`` is the string of words
  occurring between ``sub`` and ``obj`` (with no intervening NEs). Strings are printed via ``repr()`` to
  circumvent locale variations in rendering utf-8 encoded strings.
- A clause is an atom of the form ``relsym(subjsym, objsym)``,
  where the relation, subject and object have been canonicalized to single strings.
"""

# todo: get a more general solution to canonicalized symbols for clauses -- maybe use xmlcharrefs?

from collections import defaultdict
import re

# FIX: import the stdlib module directly instead of through the third-party
# ``six`` compatibility shim -- ``six.moves.html_entities`` resolves to
# exactly this module on Python 3, so behavior is unchanged and the external
# dependency disappears.
import html.entities as html_entities

# Dictionary that associates corpora with NE classes
NE_CLASSES = {
    'ieer': [
        'LOCATION',
        'ORGANIZATION',
        'PERSON',
        'DURATION',
        'DATE',
        'CARDINAL',
        'PERCENT',
        'MONEY',
        'MEASURE',
    ],
    'conll2002': ['LOC', 'PER', 'ORG'],
    'ace': [
        'LOCATION',
        'ORGANIZATION',
        'PERSON',
        'DURATION',
        'DATE',
        'CARDINAL',
        'PERCENT',
        'MONEY',
        'MEASURE',
        'FACILITY',
        'GPE',
    ],
}

# Allow abbreviated class labels
short2long = dict(LOC='LOCATION', ORG='ORGANIZATION', PER='PERSON')
long2short = dict(LOCATION='LOC', ORGANIZATION='ORG', PERSON='PER')


def _expand(type):
    """
    Expand an abbreviated NE class name (e.g. 'LOC' -> 'LOCATION').
    Names without a known expansion are returned unchanged.

    :type type: str
    :rtype: str
    """
    try:
        return short2long[type]
    except KeyError:
        return type


def class_abbrev(type):
    """
    Abbreviate an NE class name (e.g. 'LOCATION' -> 'LOC').
    Names without a known abbreviation are returned unchanged.

    :type type: str
    :rtype: str
    """
    try:
        return long2short[type]
    except KeyError:
        return type


def _join(lst, sep=' ', untag=False):
    """
    Join a list into a string, turning tagged (word, tag) tuples into
    'word/tag' strings, or just the words when ``untag`` is True.

    :param untag: if ``True``, omit the tag from tagged input strings.
    :type lst: list
    :rtype: str
    """
    try:
        # Fast path: ``lst`` is a list of plain strings.
        return sep.join(lst)
    except TypeError:
        # ``lst`` is a list of (word, tag) tuples.
        if untag:
            return sep.join(tup[0] for tup in lst)
        from nltk.tag import tuple2str

        return sep.join(tuple2str(tup) for tup in lst)


def descape_entity(m, defs=html_entities.entitydefs):
    r"""
    Translate one entity to its ISO Latin value.
    Inspired by example from effbot.org

    Example::

        pattern = re.compile(r"&(\w+?);")
        pattern.sub(descape_entity, 'mcglashan_&amp;_sarrail')
        # -> 'mcglashan_&_sarrail'

    NOTE: ``defs`` is a shared module-level lookup table; it is only read,
    never mutated, so the mutable-default idiom is safe here.
    """
    try:
        return defs[m.group(1)]
    except KeyError:
        # Unknown entity: leave the matched text untouched.
        return m.group(0)  # use as is


def list2sym(lst):
    """
    Convert a list of strings (or tagged tuples) into a canonical symbol:
    items are joined with '_', lowercased, HTML entities are descaped, and
    full stops are removed.

    :type lst: list
    :return: a Unicode string without whitespace
    :rtype: unicode
    """
    sym = _join(lst, '_', untag=True)
    sym = sym.lower()
    # FIX: raw string avoids the invalid '\w' escape-sequence warning.
    ENT = re.compile(r"&(\w+?);")
    sym = ENT.sub(descape_entity, sym)
    sym = sym.replace('.', '')
    return sym


def tree2semi_rel(tree):
    """
    Group a chunk structure into a list of 'semi-relations' of the form (list(str), ``Tree``).

    In order to facilitate the construction of (``Tree``, string, ``Tree``) triples, this
    identifies pairs whose first member is a list (possibly empty) of terminal
    strings, and whose second member is a ``Tree`` of the form (NE_label, terminals).

    Note that terminals following the last NE chunk in ``tree`` are dropped,
    since they complete no pair.

    :param tree: a chunk tree
    :return: a list of pairs (list(str), ``Tree``)
    :rtype: list of tuple
    """
    from nltk.tree import Tree

    semi_rels = []
    semi_rel = [[], None]

    for dtr in tree:
        if not isinstance(dtr, Tree):
            # Accumulate terminals until the next NE chunk appears.
            semi_rel[0].append(dtr)
        else:
            # dtr is a Tree: close off the current pair, start a fresh one.
            semi_rel[1] = dtr
            semi_rels.append(semi_rel)
            semi_rel = [[], None]
    return semi_rels


def semi_rel2reldict(pairs, window=5, trace=False):
    """
    Converts the pairs generated by ``tree2semi_rel`` into a 'reldict': a dictionary which
    stores information about the subject and object NEs plus the filler between them.
    Additionally, a left and right context of length =< window are captured (within
    a given input sentence).

    At least three pairs are required, since each relation's right context is
    drawn from the pair that follows the object NE.

    :param pairs: a list of (list(str), ``Tree``) pairs, as generated by ``tree2semi_rel``
    :param window: a threshold for the number of items to include in the left and right context
    :type window: int
    :return: 'relation' dictionaries whose keys are 'lcon', 'subjclass', 'subjtext', 'subjsym', 'filler', 'objclass', 'objtext', 'objsym' and 'rcon'
    :rtype: list(defaultdict)
    """
    result = []
    while len(pairs) > 2:
        reldict = defaultdict(str)
        reldict['lcon'] = _join(pairs[0][0][-window:])
        reldict['subjclass'] = pairs[0][1].label()
        reldict['subjtext'] = _join(pairs[0][1].leaves())
        reldict['subjsym'] = list2sym(pairs[0][1].leaves())
        reldict['filler'] = _join(pairs[1][0])
        reldict['untagged_filler'] = _join(pairs[1][0], untag=True)
        reldict['objclass'] = pairs[1][1].label()
        reldict['objtext'] = _join(pairs[1][1].leaves())
        reldict['objsym'] = list2sym(pairs[1][1].leaves())
        reldict['rcon'] = _join(pairs[2][0][:window])
        if trace:
            print(
                "(%s(%s, %s)"
                % (
                    reldict['untagged_filler'],
                    reldict['subjclass'],
                    reldict['objclass'],
                )
            )
        result.append(reldict)
        # Slide forward by one pair so the next subject is the current object.
        pairs = pairs[1:]
    return result


def extract_rels(subjclass, objclass, doc, corpus='ace', pattern=None, window=10):
    """
    Filter the output of ``semi_rel2reldict`` according to specified NE classes and a filler pattern.

    The parameters ``subjclass`` and ``objclass`` can be used to restrict the
    Named Entities to particular types (any of 'LOCATION', 'ORGANIZATION',
    'PERSON', 'DURATION', 'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE').

    :param subjclass: the class of the subject Named Entity.
    :type subjclass: str
    :param objclass: the class of the object Named Entity.
    :type objclass: str
    :param doc: input document
    :type doc: ieer document or a list of chunk trees
    :param corpus: name of the corpus to take as input; possible values are
        'ieer', 'conll2002' and 'ace'
    :type corpus: str
    :param pattern: a regular expression for filtering the fillers of
        retrieved triples; if ``None``, fillers are not filtered by pattern.
    :type pattern: SRE_Pattern
    :param window: filters out fillers which exceed this threshold
    :type window: int
    :return: see ``semi_rel2reldict``
    :rtype: list(defaultdict)
    """
    # Accept abbreviated class labels ('PER', 'ORG', ...) for either slot.
    if subjclass and subjclass not in NE_CLASSES[corpus]:
        if _expand(subjclass) in NE_CLASSES[corpus]:
            subjclass = _expand(subjclass)
        else:
            raise ValueError(
                "your value for the subject type has not been recognized: %s"
                % subjclass
            )
    if objclass and objclass not in NE_CLASSES[corpus]:
        if _expand(objclass) in NE_CLASSES[corpus]:
            objclass = _expand(objclass)
        else:
            raise ValueError(
                "your value for the object type has not been recognized: %s" % objclass
            )

    if corpus == 'ace' or corpus == 'conll2002':
        pairs = tree2semi_rel(doc)
    elif corpus == 'ieer':
        pairs = tree2semi_rel(doc.text) + tree2semi_rel(doc.headline)
    else:
        raise ValueError("corpus type not recognized")

    reldicts = semi_rel2reldict(pairs)

    def relfilter(rel):
        # FIX: the documented default ``pattern=None`` used to raise
        # AttributeError here; treat None as "accept any filler".
        return (
            rel['subjclass'] == subjclass
            and len(rel['filler'].split()) <= window
            and (pattern is None or pattern.match(rel['filler']))
            and rel['objclass'] == objclass
        )

    return list(filter(relfilter, reldicts))


def rtuple(reldict, lcon=False, rcon=False):
    """
    Pretty print the reldict as an rtuple.

    :param reldict: a relation dictionary
    :type reldict: defaultdict
    :param lcon: if ``True``, prepend the left context
    :param rcon: if ``True``, append the right context
    :rtype: str
    """
    items = [
        class_abbrev(reldict['subjclass']),
        reldict['subjtext'],
        reldict['filler'],
        class_abbrev(reldict['objclass']),
        reldict['objtext'],
    ]
    format = '[%s: %r] %r [%s: %r]'
    if lcon:
        items = [reldict['lcon']] + items
        format = '...%r)' + format
    if rcon:
        items.append(reldict['rcon'])
        format = format + '(%r...'
    printargs = tuple(items)
    return format % printargs


def clause(reldict, relsym):
    """
    Print the relation in clausal form.

    :param reldict: a relation dictionary
    :type reldict: defaultdict
    :param relsym: a label for the relation
    :type relsym: str
    :rtype: str
    """
    items = (relsym, reldict['subjsym'], reldict['objsym'])
    return "%s(%r, %r)" % items


#######################################################
# Demos of relation extraction with regular expressions
#######################################################

############################################
# Example of in(ORG, LOC)
############################################
def in_demo(trace=0, sql=True):
    """
    Select pairs of organizations and locations whose mentions occur with an
    intervening occurrence of the preposition "in".

    If the sql parameter is set to True, then the entity pairs are loaded into
    an in-memory database, and subsequently pulled out using an SQL "SELECT"
    query.
    """
    from nltk.corpus import ieer

    if sql:
        try:
            import sqlite3

            connection = sqlite3.connect(":memory:")
            # FIX: ``sqlite3.OptimizedUnicode`` is an alias of ``str`` on
            # Python 3 and is deprecated; use ``str`` directly.
            connection.text_factory = str
            cur = connection.cursor()
            cur.execute(
                """create table Locations
                (OrgName text, LocationName text, DocID text)"""
            )
        except ImportError:
            import warnings

            warnings.warn("Cannot import sqlite; sql flag will be ignored.")

    IN = re.compile(r'.*\bin\b(?!\b.+ing)')

    print()
    print("IEER: in(ORG, LOC) -- just the clauses:")
    print("=" * 45)

    for file in ieer.fileids():
        for doc in ieer.parsed_docs(file):
            if trace:
                print(doc.docno)
                print("=" * 15)
            for rel in extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
                print(clause(rel, relsym='IN'))
                if sql:
                    try:
                        # FIX: renamed from ``rtuple``, which shadowed the
                        # module-level ``rtuple`` function.
                        db_row = (rel['subjtext'], rel['objtext'], doc.docno)
                        cur.execute(
                            """insert into Locations
                            values (?, ?, ?)""",
                            db_row,
                        )
                        connection.commit()
                    except NameError:
                        # sqlite3 failed to import; ``cur`` never got defined.
                        pass

    if sql:
        try:
            cur.execute(
                """select OrgName from Locations
                where LocationName = 'Atlanta'"""
            )
            print()
            print("Extract data from SQL table: ORGs in Atlanta")
            print("-" * 15)
            for row in cur:
                print(row)
        except NameError:
            pass


############################################
# Example of has_role(PER, LOC)
############################################


def roles_demo(trace=0):
    """
    Select pairs of persons and organizations linked by a role expression
    (e.g. "X, president of Y") in the IEER corpus, printed as raw rtuples.
    """
    from nltk.corpus import ieer

    # Raw string: keeps the ``\s`` escapes literal for the regex engine.
    roles = r"""
    (.*(                   # assorted roles
    analyst|
    chair(wo)?man|
    commissioner|
    counsel|
    director|
    economist|
    editor|
    executive|
    foreman|
    governor|
    head|
    lawyer|
    leader|
    librarian).*)|
    manager|
    partner|
    president|
    producer|
    professor|
    researcher|
    spokes(wo)?man|
    writer|
    ,\sof\sthe?\s*         # "X, of (the) Y"
    """
    ROLES = re.compile(roles, re.VERBOSE)

    print()
    print("IEER: has_role(PER, ORG) -- raw rtuples:")
    print("=" * 45)

    for file in ieer.fileids():
        for doc in ieer.parsed_docs(file):
            lcon = rcon = False
            if trace:
                print(doc.docno)
                print("=" * 15)
                lcon = rcon = True
            for rel in extract_rels('PER', 'ORG', doc, corpus='ieer', pattern=ROLES):
                print(rtuple(rel, lcon=lcon, rcon=rcon))


##############################################
### Show what's in the IEER Headlines
##############################################


def ieer_headlines():
    """
    Print the docno and headline chunk tree of the first 20 IEER documents.
    """
    from nltk.corpus import ieer

    print("IEER: First 20 Headlines")
    print("=" * 45)

    trees = [
        (doc.docno, doc.headline)
        for file in ieer.fileids()
        for doc in ieer.parsed_docs(file)
    ]
    for tree in trees[:20]:
        print()
        print("%s:\n%s" % tree)


#############################################
## Dutch CONLL2002: take_on_role(PER, ORG
#############################################


def conllned(trace=1):
    """
    Find the copula+'van' relation ('of') in the Dutch tagged training corpus
    from CoNLL 2002.
    """
    from nltk.corpus import conll2002

    vnv = r"""
    (
    is/V|    # 3rd sing present and
    was/V|   # past forms of the verb zijn ('be')
    werd/V|  # and also present
    wordt/V  # past of worden ('become)
    )
    .*       # followed by anything
    van/Prep # followed by van ('of')
    """
    VAN = re.compile(vnv, re.VERBOSE)

    print()
    print("Dutch CoNLL2002: van(PER, ORG) -- raw rtuples with context:")
    print("=" * 45)

    for doc in conll2002.chunked_sents('ned.train'):
        lcon = rcon = False
        if trace:
            lcon = rcon = True
        for rel in extract_rels(
            'PER', 'ORG', doc, corpus='conll2002', pattern=VAN, window=10
        ):
            print(rtuple(rel, lcon=lcon, rcon=rcon))


#############################################
## Spanish CONLL2002: (PER, ORG)
#############################################


def conllesp():
    """
    Find de(ORG, LOC) relations in the Spanish CoNLL 2002 training corpus
    and print the first 10 as clauses.
    """
    from nltk.corpus import conll2002

    de = """
    .*
    (
    de/SP|
    del/SP
    )
    """
    DE = re.compile(de, re.VERBOSE)

    print()
    print("Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:")
    print("=" * 45)
    rels = [
        rel
        for doc in conll2002.chunked_sents('esp.train')
        for rel in extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern=DE)
    ]
    for r in rels[:10]:
        print(clause(r, relsym='DE'))
    print()


def ne_chunked():
    """
    NE-chunk the first 1500 Penn Treebank sentences with the NLTK NE chunker
    and print role(PER, ORG) rtuples.
    """
    # FIX: import nltk locally -- this function previously relied on the
    # ``import nltk`` performed only under ``if __name__ == '__main__'``,
    # so calling it from library code raised NameError.
    import nltk

    print()
    print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker")
    print("=" * 45)
    ROLE = re.compile(
        r'.*(chairman|president|trader|scientist|economist|analyst|partner).*'
    )
    for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]):
        sent = nltk.ne_chunk(sent)
        rels = extract_rels('PER', 'ORG', sent, corpus='ace', pattern=ROLE, window=7)
        for rel in rels:
            print('{0:<5}{1}'.format(i, rtuple(rel)))


if __name__ == '__main__':
    in_demo(trace=0)
    roles_demo(trace=0)
    conllned()
    conllesp()
    ieer_headlines()
    ne_chunked()