1# Natural Language Toolkit: Relation Extraction
2#
3# Copyright (C) 2001-2019 NLTK Project
4# Author: Ewan Klein <ewan@inf.ed.ac.uk>
5# URL: <http://nltk.org/>
6# For license information, see LICENSE.TXT
7
8"""
9Code for extracting relational triples from the ieer and conll2002 corpora.
10
11Relations are stored internally as dictionaries ('reldicts').
12
13The two serialization outputs are "rtuple" and "clause".
14
15- An rtuple is a tuple of the form ``(subj, filler, obj)``,
16  where ``subj`` and ``obj`` are pairs of Named Entity mentions, and ``filler`` is the string of words
  occurring between ``subj`` and ``obj`` (with no intervening NEs). Strings are printed via ``repr()`` to
18  circumvent locale variations in rendering utf-8 encoded strings.
19- A clause is an atom of the form ``relsym(subjsym, objsym)``,
20  where the relation, subject and object have been canonicalized to single strings.
21"""
22from __future__ import print_function
23
24# todo: get a more general solution to canonicalized symbols for clauses -- maybe use xmlcharrefs?
25
26from collections import defaultdict
27import re
28
29from six.moves import html_entities
30
31# Dictionary that associates corpora with NE classes
32NE_CLASSES = {
33    'ieer': [
34        'LOCATION',
35        'ORGANIZATION',
36        'PERSON',
37        'DURATION',
38        'DATE',
39        'CARDINAL',
40        'PERCENT',
41        'MONEY',
42        'MEASURE',
43    ],
44    'conll2002': ['LOC', 'PER', 'ORG'],
45    'ace': [
46        'LOCATION',
47        'ORGANIZATION',
48        'PERSON',
49        'DURATION',
50        'DATE',
51        'CARDINAL',
52        'PERCENT',
53        'MONEY',
54        'MEASURE',
55        'FACILITY',
56        'GPE',
57    ],
58}
59
60# Allow abbreviated class labels
61short2long = dict(LOC='LOCATION', ORG='ORGANIZATION', PER='PERSON')
62long2short = dict(LOCATION='LOC', ORGANIZATION='ORG', PERSON='PER')
63
64
def _expand(type):
    """
    Expand an abbreviated NE class name (e.g. 'PER') to its long form.

    Unknown names are returned unchanged.

    :type type: str
    :rtype: str
    """
    return short2long.get(type, type)
75
76
def class_abbrev(type):
    """
    Abbreviate an NE class name (e.g. 'PERSON' becomes 'PER').

    Unknown names are returned unchanged.

    :type type: str
    :rtype: str
    """
    return long2short.get(type, type)
87
88
89def _join(lst, sep=' ', untag=False):
90    """
91    Join a list into a string, turning tags tuples into tag strings or just words.
92    :param untag: if ``True``, omit the tag from tagged input strings.
93    :type lst: list
94    :rtype: str
95    """
96    try:
97        return sep.join(lst)
98    except TypeError:
99        if untag:
100            return sep.join(tup[0] for tup in lst)
101        from nltk.tag import tuple2str
102
103        return sep.join(tuple2str(tup) for tup in lst)
104
105
def descape_entity(m, defs=html_entities.entitydefs):
    """
    Translate one HTML entity match to its ISO Latin value.
    Inspired by example from effbot.org

    Intended as the replacement callable for ``re.sub`` with a pattern
    such as ``r"&(\\w+?);"``; unknown entities are left untouched.

    :param m: a regex match whose group 1 is the entity name (e.g. 'amp')
    :param defs: mapping from entity names to their replacement strings
    :rtype: str
    """
    name = m.group(1)
    if name in defs:
        return defs[name]
    return m.group(0)  # unknown entity: keep the original text
124
125
def list2sym(lst):
    """
    Convert a list of strings into a canonical symbol.

    The (possibly tagged) tokens are joined with underscores, lowercased,
    HTML character entities are descaped, and full stops are removed.

    :type lst: list
    :return: a Unicode string without whitespace
    :rtype: unicode
    """
    sym = _join(lst, '_', untag=True)
    sym = sym.lower()
    # Raw string: "\w" in a plain string literal is an invalid escape
    # sequence (SyntaxWarning, and an error on newer Pythons).
    ENT = re.compile(r"&(\w+?);")
    sym = ENT.sub(descape_entity, sym)
    sym = sym.replace('.', '')
    return sym
139
140
def tree2semi_rel(tree):
    """
    Group a chunk structure into a list of 'semi-relations' of the form (list(str), ``Tree``).

    In order to facilitate the construction of (``Tree``, string, ``Tree``) triples, this
    identifies pairs whose first member is a list (possibly empty) of terminal
    strings, and whose second member is a ``Tree`` of the form (NE_label, terminals).

    Terminals that follow the final NE subtree are discarded.

    :param tree: a chunk tree
    :return: a list of pairs (list(str), ``Tree``)
    :rtype: list of tuple
    """

    from nltk.tree import Tree

    grouped = []
    pending_words = []

    for child in tree:
        if isinstance(child, Tree):
            # An NE subtree closes off the current group of words.
            grouped.append([pending_words, child])
            pending_words = []
        else:
            pending_words.append(child)
    return grouped
168
169
def semi_rel2reldict(pairs, window=5, trace=False):
    """
    Convert the pairs generated by ``tree2semi_rel`` into 'reldicts'.

    Each reldict describes a subject NE and an object NE plus the filler
    between them, together with left and right contexts of at most
    ``window`` tokens taken from the same input sentence.

    :param pairs: (list(str), ``Tree``) pairs, as generated by ``tree2semi_rel``
    :param window: a threshold for the number of items to include in the left and right context
    :type window: int
    :param trace: if true, print a trace line for each reldict built
    :return: 'relation' dictionaries whose keys are 'lcon', 'subjclass',
        'subjtext', 'subjsym', 'filler', 'untagged_filler', 'objclass',
        'objtext', 'objsym' and 'rcon'
    :rtype: list(defaultdict)
    """
    reldicts = []
    # Slide a three-pair window: subject NE, filler + object NE, right context.
    for i in range(len(pairs) - 2):
        lcon_words, subj_tree = pairs[i]
        filler_words, obj_tree = pairs[i + 1]
        rcon_words = pairs[i + 2][0]

        rel = defaultdict(str)
        rel['lcon'] = _join(lcon_words[-window:])
        rel['subjclass'] = subj_tree.label()
        rel['subjtext'] = _join(subj_tree.leaves())
        rel['subjsym'] = list2sym(subj_tree.leaves())
        rel['filler'] = _join(filler_words)
        rel['untagged_filler'] = _join(filler_words, untag=True)
        rel['objclass'] = obj_tree.label()
        rel['objtext'] = _join(obj_tree.leaves())
        rel['objsym'] = list2sym(obj_tree.leaves())
        rel['rcon'] = _join(rcon_words[:window])
        if trace:
            print(
                "(%s(%s, %s)"
                % (
                    rel['untagged_filler'],
                    rel['subjclass'],
                    rel['objclass'],
                )
            )
        reldicts.append(rel)
    return reldicts
208
209
def extract_rels(subjclass, objclass, doc, corpus='ace', pattern=None, window=10):
    """
    Filter the output of ``semi_rel2reldict`` according to specified NE classes and a filler pattern.

    The parameters ``subjclass`` and ``objclass`` can be used to restrict the
    Named Entities to particular types (any of 'LOCATION', 'ORGANIZATION',
    'PERSON', 'DURATION', 'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE').

    :param subjclass: the class of the subject Named Entity.
    :type subjclass: str
    :param objclass: the class of the object Named Entity.
    :type objclass: str
    :param doc: input document
    :type doc: ieer document or a list of chunk trees
    :param corpus: name of the corpus to take as input; possible values are
        'ieer', 'conll2002' and 'ace'
    :type corpus: str
    :param pattern: a regular expression for filtering the fillers of
        retrieved triples; if ``None``, no filler-based filtering is applied.
    :type pattern: SRE_Pattern
    :param window: filters out fillers which exceed this threshold
    :type window: int
    :return: see ``semi_rel2reldict``
    :rtype: list(defaultdict)
    """

    # Accept abbreviated class labels (e.g. 'PER') when the corpus uses
    # the long form, and vice versa via _expand.
    if subjclass and subjclass not in NE_CLASSES[corpus]:
        if _expand(subjclass) in NE_CLASSES[corpus]:
            subjclass = _expand(subjclass)
        else:
            raise ValueError(
                "your value for the subject type has not been recognized: %s"
                % subjclass
            )
    if objclass and objclass not in NE_CLASSES[corpus]:
        if _expand(objclass) in NE_CLASSES[corpus]:
            objclass = _expand(objclass)
        else:
            raise ValueError(
                "your value for the object type has not been recognized: %s" % objclass
            )

    if corpus == 'ace' or corpus == 'conll2002':
        pairs = tree2semi_rel(doc)
    elif corpus == 'ieer':
        # IEER documents carry two chunk trees: the body text and the headline.
        pairs = tree2semi_rel(doc.text) + tree2semi_rel(doc.headline)
    else:
        raise ValueError("corpus type not recognized")

    reldicts = semi_rel2reldict(pairs)

    def relfilter(rd):
        """Keep reldicts with matching NE classes and an acceptable filler."""
        return (
            rd['subjclass'] == subjclass
            and len(rd['filler'].split()) <= window
            # Guard against pattern=None: previously pattern.match raised
            # AttributeError when the default was left in place.  None now
            # means "accept any filler".
            and (pattern is None or pattern.match(rd['filler']))
            and rd['objclass'] == objclass
        )

    return list(filter(relfilter, reldicts))
269
270
def rtuple(reldict, lcon=False, rcon=False):
    """
    Render the reldict as an rtuple string.

    :param reldict: a relation dictionary
    :type reldict: defaultdict
    :param lcon: if true, prepend the left context
    :param rcon: if true, append the right context
    :rtype: str
    """
    fmt = '[%s: %r] %r [%s: %r]'
    values = [
        class_abbrev(reldict['subjclass']),
        reldict['subjtext'],
        reldict['filler'],
        class_abbrev(reldict['objclass']),
        reldict['objtext'],
    ]
    if lcon:
        fmt = '...%r)' + fmt
        values.insert(0, reldict['lcon'])
    if rcon:
        fmt = fmt + '(%r...'
        values.append(reldict['rcon'])
    return fmt % tuple(values)
293
294
def clause(reldict, relsym):
    """
    Render the relation in clausal form ``relsym(subjsym, objsym)``.

    :param reldict: a relation dictionary
    :type reldict: defaultdict
    :param relsym: a label for the relation
    :type relsym: str
    :rtype: str
    """
    return "%s(%r, %r)" % (relsym, reldict['subjsym'], reldict['objsym'])
305
306
307#######################################################
308# Demos of relation extraction with regular expressions
309#######################################################
310
311############################################
312# Example of in(ORG, LOC)
313############################################
def in_demo(trace=0, sql=True):
    """
    Select pairs of organizations and locations whose mentions occur with an
    intervening occurrence of the preposition "in".

    If the sql parameter is set to True, then the entity pairs are loaded into
    an in-memory database, and subsequently pulled out using an SQL "SELECT"
    query.

    :param trace: if nonzero, print each document number as it is processed
    :param sql: if True, also store (org, loc, docid) rows in an in-memory
        sqlite3 table and run a sample SELECT against it
    """
    from nltk.corpus import ieer

    if sql:
        try:
            import sqlite3

            connection = sqlite3.connect(":memory:")
            # NOTE(review): sqlite3.OptimizedUnicode was deprecated and then
            # removed in Python 3.12; this line fails there -- confirm the
            # supported Python versions.
            connection.text_factory = sqlite3.OptimizedUnicode
            cur = connection.cursor()
            cur.execute(
                """create table Locations
            (OrgName text, LocationName text, DocID text)"""
            )
        except ImportError:
            import warnings

            # If sqlite is unavailable, fall through: 'cur'/'connection'
            # stay unbound and the NameError handlers below skip SQL work.
            warnings.warn("Cannot import sqlite; sql flag will be ignored.")

    # Filler must contain the word "in"; the lookahead appears intended to
    # exclude progressive constructions such as "trading in" -- TODO confirm.
    IN = re.compile(r'.*\bin\b(?!\b.+ing)')

    print()
    print("IEER: in(ORG, LOC) -- just the clauses:")
    print("=" * 45)

    for file in ieer.fileids():
        for doc in ieer.parsed_docs(file):
            if trace:
                print(doc.docno)
                print("=" * 15)
            for rel in extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
                print(clause(rel, relsym='IN'))
                if sql:
                    try:
                        # NOTE(review): this local shadows the module-level
                        # rtuple() function within this loop body.
                        rtuple = (rel['subjtext'], rel['objtext'], doc.docno)
                        cur.execute(
                            """insert into Locations
                                    values (?, ?, ?)""",
                            rtuple,
                        )
                        connection.commit()
                    except NameError:
                        # 'cur' is undefined when the sqlite import failed.
                        pass

    if sql:
        try:
            cur.execute(
                """select OrgName from Locations
                        where LocationName = 'Atlanta'"""
            )
            print()
            print("Extract data from SQL table: ORGs in Atlanta")
            print("-" * 15)
            for row in cur:
                print(row)
        except NameError:
            # Again: no-op when the sqlite setup did not run.
            pass
379
380
381############################################
382# Example of has_role(PER, LOC)
383############################################
384
385
def roles_demo(trace=0):
    """
    Select pairs of people and organizations from the IEER corpus whose
    mentions are linked by a role expression (e.g. "X, president of Y").

    :param trace: if nonzero, print each document number and include the
        left/right context in the printed rtuples
    """
    from nltk.corpus import ieer

    roles = """
    (.*(                   # assorted roles
    analyst|
    chair(wo)?man|
    commissioner|
    counsel|
    director|
    economist|
    editor|
    executive|
    foreman|
    governor|
    head|
    lawyer|
    leader|
    librarian).*)|
    manager|
    partner|
    president|
    producer|
    professor|
    researcher|
    spokes(wo)?man|
    writer|
    ,\sof\sthe?\s*  # "X, of (the) Y"
    """
    # NOTE(review): the alternatives after "librarian).*)|" (manager, partner,
    # ...) sit outside the "(.*(...).*)" group, so they match only when the
    # filler is exactly that word -- confirm this asymmetry is intended.
    ROLES = re.compile(roles, re.VERBOSE)

    print()
    print("IEER: has_role(PER, ORG) -- raw rtuples:")
    print("=" * 45)

    for file in ieer.fileids():
        for doc in ieer.parsed_docs(file):
            lcon = rcon = False
            if trace:
                print(doc.docno)
                print("=" * 15)
                lcon = rcon = True
            for rel in extract_rels('PER', 'ORG', doc, corpus='ieer', pattern=ROLES):
                print(rtuple(rel, lcon=lcon, rcon=rcon))
430
431
432##############################################
433### Show what's in the IEER Headlines
434##############################################
435
436
def ieer_headlines():
    """
    Print the document number and headline chunk tree for the first 20
    documents in the IEER corpus.
    """
    from nltk.corpus import ieer
    from nltk.tree import Tree

    print("IEER: First 20 Headlines")
    print("=" * 45)

    headlines = []
    for file in ieer.fileids():
        for doc in ieer.parsed_docs(file):
            headlines.append((doc.docno, doc.headline))

    for docno, headline in headlines[:20]:
        print()
        print("%s:\n%s" % (docno, headline))
453
454
455#############################################
## Dutch CONLL2002: take_on_role(PER, ORG)
457#############################################
458
459
def conllned(trace=1):
    """
    Find the copula+'van' relation ('of') in the Dutch tagged training corpus
    from CoNLL 2002.

    :param trace: if nonzero, include the left/right context in the output
    """

    from nltk.corpus import conll2002

    vnv = """
    (
    is/V|    # 3rd sing present and
    was/V|   # past forms of the verb zijn ('be')
    werd/V|  # and also present
    wordt/V  # past of worden ('become)
    )
    .*       # followed by anything
    van/Prep # followed by van ('of')
    """
    # The filler is matched against "word/tag" strings, hence the /V and
    # /Prep suffixes in the verbose pattern above.
    VAN = re.compile(vnv, re.VERBOSE)

    print()
    print("Dutch CoNLL2002: van(PER, ORG) -- raw rtuples with context:")
    print("=" * 45)

    for doc in conll2002.chunked_sents('ned.train'):
        lcon = rcon = False
        if trace:
            lcon = rcon = True
        for rel in extract_rels(
            'PER', 'ORG', doc, corpus='conll2002', pattern=VAN, window=10
        ):
            print(rtuple(rel, lcon=lcon, rcon=rcon))
492
493
494#############################################
495## Spanish CONLL2002: (PER, ORG)
496#############################################
497
498
def conllesp():
    """
    Find 'de'/'del' ('of') relations between organizations and locations
    in the Spanish CoNLL 2002 training corpus and print the first 10 as
    clauses.
    """
    from nltk.corpus import conll2002

    de = """
    .*
    (
    de/SP|
    del/SP
    )
    """
    DE = re.compile(de, re.VERBOSE)

    print()
    print("Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:")
    print("=" * 45)

    matches = []
    for doc in conll2002.chunked_sents('esp.train'):
        matches.extend(
            extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern=DE)
        )
    for rel in matches[:10]:
        print(clause(rel, relsym='DE'))
    print()
522
523
def ne_chunked():
    """
    Demo: NE-chunk the first 1500 Penn Treebank tagged sentences with the
    default NLTK NE chunker and print has-role(PER, ORG) rtuples.
    """
    # Local import: previously this function relied on the global 'nltk'
    # bound only inside the __main__ guard, so calling ne_chunked() from an
    # importing module raised NameError.
    import nltk

    print()
    print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker")
    print("=" * 45)
    ROLE = re.compile(
        r'.*(chairman|president|trader|scientist|economist|analyst|partner).*'
    )
    for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]):
        sent = nltk.ne_chunk(sent)
        # (The dead 'rels = []' initializer was removed; the loop variable
        # below is assigned fresh on every iteration.)
        rels = extract_rels('PER', 'ORG', sent, corpus='ace', pattern=ROLE, window=7)
        for rel in rels:
            print('{0:<5}{1}'.format(i, rtuple(rel)))
537
538
# Run every demo in sequence when executed as a script.
if __name__ == '__main__':
    import nltk
    # NOTE(review): 'relextract' is imported but never referenced below;
    # confirm whether it can be dropped.
    from nltk.sem import relextract

    in_demo(trace=0)
    roles_demo(trace=0)
    conllned()
    conllesp()
    ieer_headlines()
    ne_chunked()
549