1# Natural Language Toolkit: Regular Expression Chunkers 2# 3# Copyright (C) 2001-2019 NLTK Project 4# Author: Edward Loper <edloper@gmail.com> 5# Steven Bird <stevenbird1@gmail.com> (minor additions) 6# URL: <http://nltk.org/> 7# For license information, see LICENSE.TXT 8from __future__ import print_function, unicode_literals 9from __future__ import division 10 11import re 12 13from six import string_types 14 15from nltk.tree import Tree 16from nltk.chunk.api import ChunkParserI 17from nltk.compat import python_2_unicode_compatible, unicode_repr 18 19##////////////////////////////////////////////////////// 20## ChunkString 21##////////////////////////////////////////////////////// 22 23 24@python_2_unicode_compatible 25class ChunkString(object): 26 """ 27 A string-based encoding of a particular chunking of a text. 28 Internally, the ``ChunkString`` class uses a single string to 29 encode the chunking of the input text. This string contains a 30 sequence of angle-bracket delimited tags, with chunking indicated 31 by braces. An example of this encoding is:: 32 33 {<DT><JJ><NN>}<VBN><IN>{<DT><NN>}<.>{<DT><NN>}<VBD><.> 34 35 ``ChunkString`` are created from tagged texts (i.e., lists of 36 ``tokens`` whose type is ``TaggedType``). Initially, nothing is 37 chunked. 38 39 The chunking of a ``ChunkString`` can be modified with the ``xform()`` 40 method, which uses a regular expression to transform the string 41 representation. These transformations should only add and remove 42 braces; they should *not* modify the sequence of angle-bracket 43 delimited tags. 44 45 :type _str: str 46 :ivar _str: The internal string representation of the text's 47 encoding. This string representation contains a sequence of 48 angle-bracket delimited tags, with chunking indicated by 49 braces. An example of this encoding is:: 50 51 {<DT><JJ><NN>}<VBN><IN>{<DT><NN>}<.>{<DT><NN>}<VBD><.> 52 53 :type _pieces: list(tagged tokens and chunks) 54 :ivar _pieces: The tagged tokens and chunks encoded by this ``ChunkString``. 55 :ivar _debug: The debug level. See the constructor docs. 56 57 :cvar IN_CHUNK_PATTERN: A zero-width regexp pattern string that 58 will only match positions that are in chunks. 59 :cvar IN_CHINK_PATTERN: A zero-width regexp pattern string that 60 will only match positions that are in chinks. 61 """ 62 63 CHUNK_TAG_CHAR = r'[^\{\}<>]' 64 CHUNK_TAG = r'(<%s+?>)' % CHUNK_TAG_CHAR 65 66 IN_CHUNK_PATTERN = r'(?=[^\{]*\})' 67 IN_CHINK_PATTERN = r'(?=[^\}]*(\{|$))' 68 69 # These are used by _verify 70 _CHUNK = r'(\{%s+?\})+?' % CHUNK_TAG 71 _CHINK = r'(%s+?)+?' % CHUNK_TAG 72 _VALID = re.compile(r'^(\{?%s\}?)*?$' % CHUNK_TAG) 73 _BRACKETS = re.compile('[^\{\}]+') 74 _BALANCED_BRACKETS = re.compile(r'(\{\})*$') 75 76 def __init__(self, chunk_struct, debug_level=1): 77 """ 78 Construct a new ``ChunkString`` that encodes the chunking of 79 the text ``tagged_tokens``. 80 81 :type chunk_struct: Tree 82 :param chunk_struct: The chunk structure to be further chunked. 83 :type debug_level: int 84 :param debug_level: The level of debugging which should be 85 applied to transformations on the ``ChunkString``. The 86 valid levels are: 87 - 0: no checks 88 - 1: full check on to_chunkstruct 89 - 2: full check on to_chunkstruct and cursory check after 90 each transformation. 91 - 3: full check on to_chunkstruct and full check after 92 each transformation. 93 We recommend you use at least level 1. You should 94 probably use level 3 if you use any non-standard 95 subclasses of ``RegexpChunkRule``. 96 """ 97 self._root_label = chunk_struct.label() 98 self._pieces = chunk_struct[:] 99 tags = [self._tag(tok) for tok in self._pieces] 100 self._str = '<' + '><'.join(tags) + '>' 101 self._debug = debug_level 102 103 def _tag(self, tok): 104 if isinstance(tok, tuple): 105 return tok[1] 106 elif isinstance(tok, Tree): 107 return tok.label() 108 else: 109 raise ValueError('chunk structures must contain tagged ' 'tokens or trees') 110 111 def _verify(self, s, verify_tags): 112 """ 113 Check to make sure that ``s`` still corresponds to some chunked 114 version of ``_pieces``. 115 116 :type verify_tags: bool 117 :param verify_tags: Whether the individual tags should be 118 checked. If this is false, ``_verify`` will check to make 119 sure that ``_str`` encodes a chunked version of *some* 120 list of tokens. If this is true, then ``_verify`` will 121 check to make sure that the tags in ``_str`` match those in 122 ``_pieces``. 123 124 :raise ValueError: if the internal string representation of 125 this ``ChunkString`` is invalid or not consistent with _pieces. 126 """ 127 # Check overall form 128 if not ChunkString._VALID.match(s): 129 raise ValueError( 130 'Transformation generated invalid ' 'chunkstring:\n %s' % s 131 ) 132 133 # Check that parens are balanced. If the string is long, we 134 # have to do this in pieces, to avoid a maximum recursion 135 # depth limit for regular expressions. 136 brackets = ChunkString._BRACKETS.sub('', s) 137 for i in range(1 + len(brackets) // 5000): 138 substr = brackets[i * 5000 : i * 5000 + 5000] 139 if not ChunkString._BALANCED_BRACKETS.match(substr): 140 raise ValueError( 141 'Transformation generated invalid ' 'chunkstring:\n %s' % s 142 ) 143 144 if verify_tags <= 0: 145 return 146 147 tags1 = (re.split(r'[\{\}<>]+', s))[1:-1] 148 tags2 = [self._tag(piece) for piece in self._pieces] 149 if tags1 != tags2: 150 raise ValueError( 151 'Transformation generated invalid ' 'chunkstring: tag changed' 152 ) 153 154 def to_chunkstruct(self, chunk_label='CHUNK'): 155 """ 156 Return the chunk structure encoded by this ``ChunkString``. 157 158 :rtype: Tree 159 :raise ValueError: If a transformation has generated an 160 invalid chunkstring. 161 """ 162 if self._debug > 0: 163 self._verify(self._str, 1) 164 165 # Use this alternating list to create the chunkstruct. 166 pieces = [] 167 index = 0 168 piece_in_chunk = 0 169 for piece in re.split('[{}]', self._str): 170 171 # Find the list of tokens contained in this piece. 172 length = piece.count('<') 173 subsequence = self._pieces[index : index + length] 174 175 # Add this list of tokens to our pieces. 176 if piece_in_chunk: 177 pieces.append(Tree(chunk_label, subsequence)) 178 else: 179 pieces += subsequence 180 181 # Update index, piece_in_chunk 182 index += length 183 piece_in_chunk = not piece_in_chunk 184 185 return Tree(self._root_label, pieces) 186 187 def xform(self, regexp, repl): 188 """ 189 Apply the given transformation to the string encoding of this 190 ``ChunkString``. In particular, find all occurrences that match 191 ``regexp``, and replace them using ``repl`` (as done by 192 ``re.sub``). 193 194 This transformation should only add and remove braces; it 195 should *not* modify the sequence of angle-bracket delimited 196 tags. Furthermore, this transformation may not result in 197 improper bracketing. Note, in particular, that bracketing may 198 not be nested. 199 200 :type regexp: str or regexp 201 :param regexp: A regular expression matching the substring 202 that should be replaced. This will typically include a 203 named group, which can be used by ``repl``. 204 :type repl: str 205 :param repl: An expression specifying what should replace the 206 matched substring. Typically, this will include a named 207 replacement group, specified by ``regexp``. 208 :rtype: None 209 :raise ValueError: If this transformation generated an 210 invalid chunkstring. 211 """ 212 # Do the actual substitution 213 s = re.sub(regexp, repl, self._str) 214 215 # The substitution might have generated "empty chunks" 216 # (substrings of the form "{}"). Remove them, so they don't 217 # interfere with other transformations. 218 s = re.sub('\{\}', '', s) 219 220 # Make sure that the transformation was legal. 221 if self._debug > 1: 222 self._verify(s, self._debug - 2) 223 224 # Commit the transformation. 225 self._str = s 226 227 def __repr__(self): 228 """ 229 Return a string representation of this ``ChunkString``. 230 It has the form:: 231 232 <ChunkString: '{<DT><JJ><NN>}<VBN><IN>{<DT><NN>}'> 233 234 :rtype: str 235 """ 236 return '<ChunkString: %s>' % unicode_repr(self._str) 237 238 def __str__(self): 239 """ 240 Return a formatted representation of this ``ChunkString``. 241 This representation will include extra spaces to ensure that 242 tags will line up with the representation of other 243 ``ChunkStrings`` for the same text, regardless of the chunking. 244 245 :rtype: str 246 """ 247 # Add spaces to make everything line up. 248 str = re.sub(r'>(?!\})', r'> ', self._str) 249 str = re.sub(r'([^\{])<', r'\1 <', str) 250 if str[0] == '<': 251 str = ' ' + str 252 return str 253 254 255##////////////////////////////////////////////////////// 256## Chunking Rules 257##////////////////////////////////////////////////////// 258 259 260@python_2_unicode_compatible 261class RegexpChunkRule(object): 262 """ 263 A rule specifying how to modify the chunking in a ``ChunkString``, 264 using a transformational regular expression. The 265 ``RegexpChunkRule`` class itself can be used to implement any 266 transformational rule based on regular expressions. There are 267 also a number of subclasses, which can be used to implement 268 simpler types of rules, based on matching regular expressions. 269 270 Each ``RegexpChunkRule`` has a regular expression and a 271 replacement expression. When a ``RegexpChunkRule`` is "applied" 272 to a ``ChunkString``, it searches the ``ChunkString`` for any 273 substring that matches the regular expression, and replaces it 274 using the replacement expression. This search/replace operation 275 has the same semantics as ``re.sub``. 276 277 Each ``RegexpChunkRule`` also has a description string, which 278 gives a short (typically less than 75 characters) description of 279 the purpose of the rule. 280 281 This transformation defined by this ``RegexpChunkRule`` should 282 only add and remove braces; it should *not* modify the sequence 283 of angle-bracket delimited tags. Furthermore, this transformation 284 may not result in nested or mismatched bracketing. 285 """ 286 287 def __init__(self, regexp, repl, descr): 288 """ 289 Construct a new RegexpChunkRule. 290 291 :type regexp: regexp or str 292 :param regexp: The regular expression for this ``RegexpChunkRule``. 293 When this rule is applied to a ``ChunkString``, any 294 substring that matches ``regexp`` will be replaced using 295 the replacement string ``repl``. Note that this must be a 296 normal regular expression, not a tag pattern. 297 :type repl: str 298 :param repl: The replacement expression for this ``RegexpChunkRule``. 299 When this rule is applied to a ``ChunkString``, any substring 300 that matches ``regexp`` will be replaced using ``repl``. 301 :type descr: str 302 :param descr: A short description of the purpose and/or effect 303 of this rule. 304 """ 305 if isinstance(regexp, string_types): 306 regexp = re.compile(regexp) 307 self._repl = repl 308 self._descr = descr 309 self._regexp = regexp 310 311 def apply(self, chunkstr): 312 # Keep docstring generic so we can inherit it. 313 """ 314 Apply this rule to the given ``ChunkString``. See the 315 class reference documentation for a description of what it 316 means to apply a rule. 317 318 :type chunkstr: ChunkString 319 :param chunkstr: The chunkstring to which this rule is applied. 320 :rtype: None 321 :raise ValueError: If this transformation generated an 322 invalid chunkstring. 323 """ 324 chunkstr.xform(self._regexp, self._repl) 325 326 def descr(self): 327 """ 328 Return a short description of the purpose and/or effect of 329 this rule. 330 331 :rtype: str 332 """ 333 return self._descr 334 335 def __repr__(self): 336 """ 337 Return a string representation of this rule. It has the form:: 338 339 <RegexpChunkRule: '{<IN|VB.*>}'->'<IN>'> 340 341 Note that this representation does not include the 342 description string; that string can be accessed 343 separately with the ``descr()`` method. 344 345 :rtype: str 346 """ 347 return ( 348 '<RegexpChunkRule: ' 349 + unicode_repr(self._regexp.pattern) 350 + '->' 351 + unicode_repr(self._repl) 352 + '>' 353 ) 354 355 @staticmethod 356 def fromstring(s): 357 """ 358 Create a RegexpChunkRule from a string description. 359 Currently, the following formats are supported:: 360 361 {regexp} # chunk rule 362 }regexp{ # chink rule 363 regexp}{regexp # split rule 364 regexp{}regexp # merge rule 365 366 Where ``regexp`` is a regular expression for the rule. Any 367 text following the comment marker (``#``) will be used as 368 the rule's description: 369 370 >>> from nltk.chunk.regexp import RegexpChunkRule 371 >>> RegexpChunkRule.fromstring('{<DT>?<NN.*>+}') 372 <ChunkRule: '<DT>?<NN.*>+'> 373 """ 374 # Split off the comment (but don't split on '\#') 375 m = re.match(r'(?P<rule>(\\.|[^#])*)(?P<comment>#.*)?', s) 376 rule = m.group('rule').strip() 377 comment = (m.group('comment') or '')[1:].strip() 378 379 # Pattern bodies: chunk, chink, split, merge 380 try: 381 if not rule: 382 raise ValueError('Empty chunk pattern') 383 if rule[0] == '{' and rule[-1] == '}': 384 return ChunkRule(rule[1:-1], comment) 385 elif rule[0] == '}' and rule[-1] == '{': 386 return ChinkRule(rule[1:-1], comment) 387 elif '}{' in rule: 388 left, right = rule.split('}{') 389 return SplitRule(left, right, comment) 390 elif '{}' in rule: 391 left, right = rule.split('{}') 392 return MergeRule(left, right, comment) 393 elif re.match('[^{}]*{[^{}]*}[^{}]*', rule): 394 left, chunk, right = re.split('[{}]', rule) 395 return ChunkRuleWithContext(left, chunk, right, comment) 396 else: 397 raise ValueError('Illegal chunk pattern: %s' % rule) 398 except (ValueError, re.error): 399 raise ValueError('Illegal chunk pattern: %s' % rule) 400 401 402@python_2_unicode_compatible 403class ChunkRule(RegexpChunkRule): 404 """ 405 A rule specifying how to add chunks to a ``ChunkString``, using a 406 matching tag pattern. When applied to a ``ChunkString``, it will 407 find any substring that matches this tag pattern and that is not 408 already part of a chunk, and create a new chunk containing that 409 substring. 410 """ 411 412 def __init__(self, tag_pattern, descr): 413 414 """ 415 Construct a new ``ChunkRule``. 416 417 :type tag_pattern: str 418 :param tag_pattern: This rule's tag pattern. When 419 applied to a ``ChunkString``, this rule will 420 chunk any substring that matches this tag pattern and that 421 is not already part of a chunk. 422 :type descr: str 423 :param descr: A short description of the purpose and/or effect 424 of this rule. 425 """ 426 self._pattern = tag_pattern 427 regexp = re.compile( 428 '(?P<chunk>%s)%s' 429 % (tag_pattern2re_pattern(tag_pattern), ChunkString.IN_CHINK_PATTERN) 430 ) 431 RegexpChunkRule.__init__(self, regexp, '{\g<chunk>}', descr) 432 433 def __repr__(self): 434 """ 435 Return a string representation of this rule. It has the form:: 436 437 <ChunkRule: '<IN|VB.*>'> 438 439 Note that this representation does not include the 440 description string; that string can be accessed 441 separately with the ``descr()`` method. 442 443 :rtype: str 444 """ 445 return '<ChunkRule: ' + unicode_repr(self._pattern) + '>' 446 447 448@python_2_unicode_compatible 449class ChinkRule(RegexpChunkRule): 450 """ 451 A rule specifying how to remove chinks to a ``ChunkString``, 452 using a matching tag pattern. When applied to a 453 ``ChunkString``, it will find any substring that matches this 454 tag pattern and that is contained in a chunk, and remove it 455 from that chunk, thus creating two new chunks. 456 """ 457 458 def __init__(self, tag_pattern, descr): 459 """ 460 Construct a new ``ChinkRule``. 461 462 :type tag_pattern: str 463 :param tag_pattern: This rule's tag pattern. When 464 applied to a ``ChunkString``, this rule will 465 find any substring that matches this tag pattern and that 466 is contained in a chunk, and remove it from that chunk, 467 thus creating two new chunks. 468 :type descr: str 469 :param descr: A short description of the purpose and/or effect 470 of this rule. 471 """ 472 self._pattern = tag_pattern 473 regexp = re.compile( 474 '(?P<chink>%s)%s' 475 % (tag_pattern2re_pattern(tag_pattern), ChunkString.IN_CHUNK_PATTERN) 476 ) 477 RegexpChunkRule.__init__(self, regexp, '}\g<chink>{', descr) 478 479 def __repr__(self): 480 """ 481 Return a string representation of this rule. It has the form:: 482 483 <ChinkRule: '<IN|VB.*>'> 484 485 Note that this representation does not include the 486 description string; that string can be accessed 487 separately with the ``descr()`` method. 488 489 :rtype: str 490 """ 491 return '<ChinkRule: ' + unicode_repr(self._pattern) + '>' 492 493 494@python_2_unicode_compatible 495class UnChunkRule(RegexpChunkRule): 496 """ 497 A rule specifying how to remove chunks to a ``ChunkString``, 498 using a matching tag pattern. When applied to a 499 ``ChunkString``, it will find any complete chunk that matches this 500 tag pattern, and un-chunk it. 501 """ 502 503 def __init__(self, tag_pattern, descr): 504 """ 505 Construct a new ``UnChunkRule``. 506 507 :type tag_pattern: str 508 :param tag_pattern: This rule's tag pattern. When 509 applied to a ``ChunkString``, this rule will 510 find any complete chunk that matches this tag pattern, 511 and un-chunk it. 512 :type descr: str 513 :param descr: A short description of the purpose and/or effect 514 of this rule. 515 """ 516 self._pattern = tag_pattern 517 regexp = re.compile('\{(?P<chunk>%s)\}' % tag_pattern2re_pattern(tag_pattern)) 518 RegexpChunkRule.__init__(self, regexp, '\g<chunk>', descr) 519 520 def __repr__(self): 521 """ 522 Return a string representation of this rule. It has the form:: 523 524 <UnChunkRule: '<IN|VB.*>'> 525 526 Note that this representation does not include the 527 description string; that string can be accessed 528 separately with the ``descr()`` method. 529 530 :rtype: str 531 """ 532 return '<UnChunkRule: ' + unicode_repr(self._pattern) + '>' 533 534 535@python_2_unicode_compatible 536class MergeRule(RegexpChunkRule): 537 """ 538 A rule specifying how to merge chunks in a ``ChunkString``, using 539 two matching tag patterns: a left pattern, and a right pattern. 540 When applied to a ``ChunkString``, it will find any chunk whose end 541 matches left pattern, and immediately followed by a chunk whose 542 beginning matches right pattern. It will then merge those two 543 chunks into a single chunk. 544 """ 545 546 def __init__(self, left_tag_pattern, right_tag_pattern, descr): 547 """ 548 Construct a new ``MergeRule``. 549 550 :type right_tag_pattern: str 551 :param right_tag_pattern: This rule's right tag 552 pattern. When applied to a ``ChunkString``, this 553 rule will find any chunk whose end matches 554 ``left_tag_pattern``, and immediately followed by a chunk 555 whose beginning matches this pattern. It will 556 then merge those two chunks into a single chunk. 557 :type left_tag_pattern: str 558 :param left_tag_pattern: This rule's left tag 559 pattern. When applied to a ``ChunkString``, this 560 rule will find any chunk whose end matches 561 this pattern, and immediately followed by a chunk 562 whose beginning matches ``right_tag_pattern``. It will 563 then merge those two chunks into a single chunk. 564 565 :type descr: str 566 :param descr: A short description of the purpose and/or effect 567 of this rule. 568 """ 569 # Ensure that the individual patterns are coherent. E.g., if 570 # left='(' and right=')', then this will raise an exception: 571 re.compile(tag_pattern2re_pattern(left_tag_pattern)) 572 re.compile(tag_pattern2re_pattern(right_tag_pattern)) 573 574 self._left_tag_pattern = left_tag_pattern 575 self._right_tag_pattern = right_tag_pattern 576 regexp = re.compile( 577 '(?P<left>%s)}{(?=%s)' 578 % ( 579 tag_pattern2re_pattern(left_tag_pattern), 580 tag_pattern2re_pattern(right_tag_pattern), 581 ) 582 ) 583 RegexpChunkRule.__init__(self, regexp, '\g<left>', descr) 584 585 def __repr__(self): 586 """ 587 Return a string representation of this rule. It has the form:: 588 589 <MergeRule: '<NN|DT|JJ>', '<NN|JJ>'> 590 591 Note that this representation does not include the 592 description string; that string can be accessed 593 separately with the ``descr()`` method. 594 595 :rtype: str 596 """ 597 return ( 598 '<MergeRule: ' 599 + unicode_repr(self._left_tag_pattern) 600 + ', ' 601 + unicode_repr(self._right_tag_pattern) 602 + '>' 603 ) 604 605 606@python_2_unicode_compatible 607class SplitRule(RegexpChunkRule): 608 """ 609 A rule specifying how to split chunks in a ``ChunkString``, using 610 two matching tag patterns: a left pattern, and a right pattern. 611 When applied to a ``ChunkString``, it will find any chunk that 612 matches the left pattern followed by the right pattern. It will 613 then split the chunk into two new chunks, at the point between the 614 two pattern matches. 615 """ 616 617 def __init__(self, left_tag_pattern, right_tag_pattern, descr): 618 """ 619 Construct a new ``SplitRule``. 620 621 :type right_tag_pattern: str 622 :param right_tag_pattern: This rule's right tag 623 pattern. When applied to a ``ChunkString``, this rule will 624 find any chunk containing a substring that matches 625 ``left_tag_pattern`` followed by this pattern. It will 626 then split the chunk into two new chunks at the point 627 between these two matching patterns. 628 :type left_tag_pattern: str 629 :param left_tag_pattern: This rule's left tag 630 pattern. When applied to a ``ChunkString``, this rule will 631 find any chunk containing a substring that matches this 632 pattern followed by ``right_tag_pattern``. It will then 633 split the chunk into two new chunks at the point between 634 these two matching patterns. 635 :type descr: str 636 :param descr: A short description of the purpose and/or effect 637 of this rule. 638 """ 639 # Ensure that the individual patterns are coherent. E.g., if 640 # left='(' and right=')', then this will raise an exception: 641 re.compile(tag_pattern2re_pattern(left_tag_pattern)) 642 re.compile(tag_pattern2re_pattern(right_tag_pattern)) 643 644 self._left_tag_pattern = left_tag_pattern 645 self._right_tag_pattern = right_tag_pattern 646 regexp = re.compile( 647 '(?P<left>%s)(?=%s)' 648 % ( 649 tag_pattern2re_pattern(left_tag_pattern), 650 tag_pattern2re_pattern(right_tag_pattern), 651 ) 652 ) 653 RegexpChunkRule.__init__(self, regexp, r'\g<left>}{', descr) 654 655 def __repr__(self): 656 """ 657 Return a string representation of this rule. It has the form:: 658 659 <SplitRule: '<NN>', '<DT>'> 660 661 Note that this representation does not include the 662 description string; that string can be accessed 663 separately with the ``descr()`` method. 664 665 :rtype: str 666 """ 667 return ( 668 '<SplitRule: ' 669 + unicode_repr(self._left_tag_pattern) 670 + ', ' 671 + unicode_repr(self._right_tag_pattern) 672 + '>' 673 ) 674 675 676@python_2_unicode_compatible 677class ExpandLeftRule(RegexpChunkRule): 678 """ 679 A rule specifying how to expand chunks in a ``ChunkString`` to the left, 680 using two matching tag patterns: a left pattern, and a right pattern. 681 When applied to a ``ChunkString``, it will find any chunk whose beginning 682 matches right pattern, and immediately preceded by a chink whose 683 end matches left pattern. It will then expand the chunk to incorporate 684 the new material on the left. 685 """ 686 687 def __init__(self, left_tag_pattern, right_tag_pattern, descr): 688 """ 689 Construct a new ``ExpandRightRule``. 690 691 :type right_tag_pattern: str 692 :param right_tag_pattern: This rule's right tag 693 pattern. When applied to a ``ChunkString``, this 694 rule will find any chunk whose beginning matches 695 ``right_tag_pattern``, and immediately preceded by a chink 696 whose end matches this pattern. It will 697 then merge those two chunks into a single chunk. 698 :type left_tag_pattern: str 699 :param left_tag_pattern: This rule's left tag 700 pattern. When applied to a ``ChunkString``, this 701 rule will find any chunk whose beginning matches 702 this pattern, and immediately preceded by a chink 703 whose end matches ``left_tag_pattern``. It will 704 then expand the chunk to incorporate the new material on the left. 705 706 :type descr: str 707 :param descr: A short description of the purpose and/or effect 708 of this rule. 709 """ 710 # Ensure that the individual patterns are coherent. E.g., if 711 # left='(' and right=')', then this will raise an exception: 712 re.compile(tag_pattern2re_pattern(left_tag_pattern)) 713 re.compile(tag_pattern2re_pattern(right_tag_pattern)) 714 715 self._left_tag_pattern = left_tag_pattern 716 self._right_tag_pattern = right_tag_pattern 717 regexp = re.compile( 718 '(?P<left>%s)\{(?P<right>%s)' 719 % ( 720 tag_pattern2re_pattern(left_tag_pattern), 721 tag_pattern2re_pattern(right_tag_pattern), 722 ) 723 ) 724 RegexpChunkRule.__init__(self, regexp, '{\g<left>\g<right>', descr) 725 726 def __repr__(self): 727 """ 728 Return a string representation of this rule. It has the form:: 729 730 <ExpandLeftRule: '<NN|DT|JJ>', '<NN|JJ>'> 731 732 Note that this representation does not include the 733 description string; that string can be accessed 734 separately with the ``descr()`` method. 735 736 :rtype: str 737 """ 738 return ( 739 '<ExpandLeftRule: ' 740 + unicode_repr(self._left_tag_pattern) 741 + ', ' 742 + unicode_repr(self._right_tag_pattern) 743 + '>' 744 ) 745 746 747@python_2_unicode_compatible 748class ExpandRightRule(RegexpChunkRule): 749 """ 750 A rule specifying how to expand chunks in a ``ChunkString`` to the 751 right, using two matching tag patterns: a left pattern, and a 752 right pattern. When applied to a ``ChunkString``, it will find any 753 chunk whose end matches left pattern, and immediately followed by 754 a chink whose beginning matches right pattern. It will then 755 expand the chunk to incorporate the new material on the right. 756 """ 757 758 def __init__(self, left_tag_pattern, right_tag_pattern, descr): 759 """ 760 Construct a new ``ExpandRightRule``. 761 762 :type right_tag_pattern: str 763 :param right_tag_pattern: This rule's right tag 764 pattern. When applied to a ``ChunkString``, this 765 rule will find any chunk whose end matches 766 ``left_tag_pattern``, and immediately followed by a chink 767 whose beginning matches this pattern. It will 768 then merge those two chunks into a single chunk. 769 :type left_tag_pattern: str 770 :param left_tag_pattern: This rule's left tag 771 pattern. When applied to a ``ChunkString``, this 772 rule will find any chunk whose end matches 773 this pattern, and immediately followed by a chink 774 whose beginning matches ``right_tag_pattern``. It will 775 then expand the chunk to incorporate the new material on the right. 776 777 :type descr: str 778 :param descr: A short description of the purpose and/or effect 779 of this rule. 780 """ 781 # Ensure that the individual patterns are coherent. E.g., if 782 # left='(' and right=')', then this will raise an exception: 783 re.compile(tag_pattern2re_pattern(left_tag_pattern)) 784 re.compile(tag_pattern2re_pattern(right_tag_pattern)) 785 786 self._left_tag_pattern = left_tag_pattern 787 self._right_tag_pattern = right_tag_pattern 788 regexp = re.compile( 789 '(?P<left>%s)\}(?P<right>%s)' 790 % ( 791 tag_pattern2re_pattern(left_tag_pattern), 792 tag_pattern2re_pattern(right_tag_pattern), 793 ) 794 ) 795 RegexpChunkRule.__init__(self, regexp, '\g<left>\g<right>}', descr) 796 797 def __repr__(self): 798 """ 799 Return a string representation of this rule. It has the form:: 800 801 <ExpandRightRule: '<NN|DT|JJ>', '<NN|JJ>'> 802 803 Note that this representation does not include the 804 description string; that string can be accessed 805 separately with the ``descr()`` method. 806 807 :rtype: str 808 """ 809 return ( 810 '<ExpandRightRule: ' 811 + unicode_repr(self._left_tag_pattern) 812 + ', ' 813 + unicode_repr(self._right_tag_pattern) 814 + '>' 815 ) 816 817 818@python_2_unicode_compatible 819class ChunkRuleWithContext(RegexpChunkRule): 820 """ 821 A rule specifying how to add chunks to a ``ChunkString``, using 822 three matching tag patterns: one for the left context, one for the 823 chunk, and one for the right context. When applied to a 824 ``ChunkString``, it will find any substring that matches the chunk 825 tag pattern, is surrounded by substrings that match the two 826 context patterns, and is not already part of a chunk; and create a 827 new chunk containing the substring that matched the chunk tag 828 pattern. 829 830 Caveat: Both the left and right context are consumed when this 831 rule matches; therefore, if you need to find overlapping matches, 832 you will need to apply your rule more than once. 833 """ 834 835 def __init__( 836 self, 837 left_context_tag_pattern, 838 chunk_tag_pattern, 839 right_context_tag_pattern, 840 descr, 841 ): 842 """ 843 Construct a new ``ChunkRuleWithContext``. 844 845 :type left_context_tag_pattern: str 846 :param left_context_tag_pattern: A tag pattern that must match 847 the left context of ``chunk_tag_pattern`` for this rule to 848 apply. 849 :type chunk_tag_pattern: str 850 :param chunk_tag_pattern: A tag pattern that must match for this 851 rule to apply. If the rule does apply, then this pattern 852 also identifies the substring that will be made into a chunk. 853 :type right_context_tag_pattern: str 854 :param right_context_tag_pattern: A tag pattern that must match 855 the right context of ``chunk_tag_pattern`` for this rule to 856 apply. 857 :type descr: str 858 :param descr: A short description of the purpose and/or effect 859 of this rule. 860 """ 861 # Ensure that the individual patterns are coherent. E.g., if 862 # left='(' and right=')', then this will raise an exception: 863 re.compile(tag_pattern2re_pattern(left_context_tag_pattern)) 864 re.compile(tag_pattern2re_pattern(chunk_tag_pattern)) 865 re.compile(tag_pattern2re_pattern(right_context_tag_pattern)) 866 867 self._left_context_tag_pattern = left_context_tag_pattern 868 self._chunk_tag_pattern = chunk_tag_pattern 869 self._right_context_tag_pattern = right_context_tag_pattern 870 regexp = re.compile( 871 '(?P<left>%s)(?P<chunk>%s)(?P<right>%s)%s' 872 % ( 873 tag_pattern2re_pattern(left_context_tag_pattern), 874 tag_pattern2re_pattern(chunk_tag_pattern), 875 tag_pattern2re_pattern(right_context_tag_pattern), 876 ChunkString.IN_CHINK_PATTERN, 877 ) 878 ) 879 replacement = r'\g<left>{\g<chunk>}\g<right>' 880 RegexpChunkRule.__init__(self, regexp, replacement, descr) 881 882 def __repr__(self): 883 """ 884 Return a string representation of this rule. It has the form:: 885 886 <ChunkRuleWithContext: '<IN>', '<NN>', '<DT>'> 887 888 Note that this representation does not include the 889 description string; that string can be accessed 890 separately with the ``descr()`` method. 891 892 :rtype: str 893 """ 894 return '<ChunkRuleWithContext: %r, %r, %r>' % ( 895 self._left_context_tag_pattern, 896 self._chunk_tag_pattern, 897 self._right_context_tag_pattern, 898 ) 899 900 901##////////////////////////////////////////////////////// 902## Tag Pattern Format Conversion 903##////////////////////////////////////////////////////// 904 905# this should probably be made more strict than it is -- e.g., it 906# currently accepts 'foo'. 907CHUNK_TAG_PATTERN = re.compile( 908 r'^((%s|<%s>)*)$' % ('([^\{\}<>]|\{\d+,?\}|\{\d*,\d+\})+', '[^\{\}<>]+') 909) 910 911 912def tag_pattern2re_pattern(tag_pattern): 913 """ 914 Convert a tag pattern to a regular expression pattern. A "tag 915 pattern" is a modified version of a regular expression, designed 916 for matching sequences of tags. The differences between regular 917 expression patterns and tag patterns are: 918 919 - In tag patterns, ``'<'`` and ``'>'`` act as parentheses; so 920 ``'<NN>+'`` matches one or more repetitions of ``'<NN>'``, not 921 ``'<NN'`` followed by one or more repetitions of ``'>'``. 922 - Whitespace in tag patterns is ignored. So 923 ``'<DT> | <NN>'`` is equivalant to ``'<DT>|<NN>'`` 924 - In tag patterns, ``'.'`` is equivalant to ``'[^{}<>]'``; so 925 ``'<NN.*>'`` matches any single tag starting with ``'NN'``. 926 927 In particular, ``tag_pattern2re_pattern`` performs the following 928 transformations on the given pattern: 929 930 - Replace '.' with '[^<>{}]' 931 - Remove any whitespace 932 - Add extra parens around '<' and '>', to make '<' and '>' act 933 like parentheses. E.g., so that in '<NN>+', the '+' has scope 934 over the entire '<NN>'; and so that in '<NN|IN>', the '|' has 935 scope over 'NN' and 'IN', but not '<' or '>'. 936 - Check to make sure the resulting pattern is valid. 937 938 :type tag_pattern: str 939 :param tag_pattern: The tag pattern to convert to a regular 940 expression pattern. 941 :raise ValueError: If ``tag_pattern`` is not a valid tag pattern. 942 In particular, ``tag_pattern`` should not include braces; and it 943 should not contain nested or mismatched angle-brackets. 944 :rtype: str 945 :return: A regular expression pattern corresponding to 946 ``tag_pattern``. 947 """ 948 # Clean up the regular expression 949 tag_pattern = re.sub(r'\s', '', tag_pattern) 950 tag_pattern = re.sub(r'<', '(<(', tag_pattern) 951 tag_pattern = re.sub(r'>', ')>)', tag_pattern) 952 953 # Check the regular expression 954 if not CHUNK_TAG_PATTERN.match(tag_pattern): 955 raise ValueError('Bad tag pattern: %r' % tag_pattern) 956 957 # Replace "." with CHUNK_TAG_CHAR. 958 # We have to do this after, since it adds {}[]<>s, which would 959 # confuse CHUNK_TAG_PATTERN. 960 # PRE doesn't have lookback assertions, so reverse twice, and do 961 # the pattern backwards (with lookahead assertions). This can be 962 # made much cleaner once we can switch back to SRE. 963 def reverse_str(str): 964 lst = list(str) 965 lst.reverse() 966 return ''.join(lst) 967 968 tc_rev = reverse_str(ChunkString.CHUNK_TAG_CHAR) 969 reversed = reverse_str(tag_pattern) 970 reversed = re.sub(r'\.(?!\\(\\\\)*($|[^\\]))', tc_rev, reversed) 971 tag_pattern = reverse_str(reversed) 972 973 return tag_pattern 974 975 976##////////////////////////////////////////////////////// 977## RegexpChunkParser 978##////////////////////////////////////////////////////// 979 980 981@python_2_unicode_compatible 982class RegexpChunkParser(ChunkParserI): 983 """ 984 A regular expression based chunk parser. ``RegexpChunkParser`` uses a 985 sequence of "rules" to find chunks of a single type within a 986 text. The chunking of the text is encoded using a ``ChunkString``, 987 and each rule acts by modifying the chunking in the 988 ``ChunkString``. The rules are all implemented using regular 989 expression matching and substitution. 990 991 The ``RegexpChunkRule`` class and its subclasses (``ChunkRule``, 992 ``ChinkRule``, ``UnChunkRule``, ``MergeRule``, and ``SplitRule``) 993 define the rules that are used by ``RegexpChunkParser``. Each rule 994 defines an ``apply()`` method, which modifies the chunking encoded 995 by a given ``ChunkString``. 996 997 :type _rules: list(RegexpChunkRule) 998 :ivar _rules: The list of rules that should be applied to a text. 999 :type _trace: int 1000 :ivar _trace: The default level of tracing. 1001 1002 """ 1003 1004 def __init__(self, rules, chunk_label='NP', root_label='S', trace=0): 1005 """ 1006 Construct a new ``RegexpChunkParser``. 1007 1008 :type rules: list(RegexpChunkRule) 1009 :param rules: The sequence of rules that should be used to 1010 generate the chunking for a tagged text. 1011 :type chunk_label: str 1012 :param chunk_label: The node value that should be used for 1013 chunk subtrees. This is typically a short string 1014 describing the type of information contained by the chunk, 1015 such as ``"NP"`` for base noun phrases. 1016 :type root_label: str 1017 :param root_label: The node value that should be used for the 1018 top node of the chunk structure. 1019 :type trace: int 1020 :param trace: The level of tracing that should be used when 1021 parsing a text. ``0`` will generate no tracing output; 1022 ``1`` will generate normal tracing output; and ``2`` or 1023 higher will generate verbose tracing output. 1024 """ 1025 self._rules = rules 1026 self._trace = trace 1027 self._chunk_label = chunk_label 1028 self._root_label = root_label 1029 1030 def _trace_apply(self, chunkstr, verbose): 1031 """ 1032 Apply each rule of this ``RegexpChunkParser`` to ``chunkstr``, in 1033 turn. Generate trace output between each rule. If ``verbose`` 1034 is true, then generate verbose output. 1035 1036 :type chunkstr: ChunkString 1037 :param chunkstr: The chunk string to which each rule should be 1038 applied. 1039 :type verbose: bool 1040 :param verbose: Whether output should be verbose. 1041 :rtype: None 1042 """ 1043 print('# Input:') 1044 print(chunkstr) 1045 for rule in self._rules: 1046 rule.apply(chunkstr) 1047 if verbose: 1048 print('#', rule.descr() + ' (' + unicode_repr(rule) + '):') 1049 else: 1050 print('#', rule.descr() + ':') 1051 print(chunkstr) 1052 1053 def _notrace_apply(self, chunkstr): 1054 """ 1055 Apply each rule of this ``RegexpChunkParser`` to ``chunkstr``, in 1056 turn. 1057 1058 :param chunkstr: The chunk string to which each rule should be 1059 applied. 1060 :type chunkstr: ChunkString 1061 :rtype: None 1062 """ 1063 1064 for rule in self._rules: 1065 rule.apply(chunkstr) 1066 1067 def parse(self, chunk_struct, trace=None): 1068 """ 1069 :type chunk_struct: Tree 1070 :param chunk_struct: the chunk structure to be (further) chunked 1071 :type trace: int 1072 :param trace: The level of tracing that should be used when 1073 parsing a text. ``0`` will generate no tracing output; 1074 ``1`` will generate normal tracing output; and ``2`` or 1075 highter will generate verbose tracing output. This value 1076 overrides the trace level value that was given to the 1077 constructor. 1078 :rtype: Tree 1079 :return: a chunk structure that encodes the chunks in a given 1080 tagged sentence. A chunk is a non-overlapping linguistic 1081 group, such as a noun phrase. The set of chunks 1082 identified in the chunk structure depends on the rules 1083 used to define this ``RegexpChunkParser``. 1084 """ 1085 if len(chunk_struct) == 0: 1086 print('Warning: parsing empty text') 1087 return Tree(self._root_label, []) 1088 1089 try: 1090 chunk_struct.label() 1091 except AttributeError: 1092 chunk_struct = Tree(self._root_label, chunk_struct) 1093 1094 # Use the default trace value? 1095 if trace is None: 1096 trace = self._trace 1097 1098 chunkstr = ChunkString(chunk_struct) 1099 1100 # Apply the sequence of rules to the chunkstring. 1101 if trace: 1102 verbose = trace > 1 1103 self._trace_apply(chunkstr, verbose) 1104 else: 1105 self._notrace_apply(chunkstr) 1106 1107 # Use the chunkstring to create a chunk structure. 1108 return chunkstr.to_chunkstruct(self._chunk_label) 1109 1110 def rules(self): 1111 """ 1112 :return: the sequence of rules used by ``RegexpChunkParser``. 1113 :rtype: list(RegexpChunkRule) 1114 """ 1115 return self._rules 1116 1117 def __repr__(self): 1118 """ 1119 :return: a concise string representation of this 1120 ``RegexpChunkParser``. 1121 :rtype: str 1122 """ 1123 return "<RegexpChunkParser with %d rules>" % len(self._rules) 1124 1125 def __str__(self): 1126 """ 1127 :return: a verbose string representation of this ``RegexpChunkParser``. 1128 :rtype: str 1129 """ 1130 s = "RegexpChunkParser with %d rules:\n" % len(self._rules) 1131 margin = 0 1132 for rule in self._rules: 1133 margin = max(margin, len(rule.descr())) 1134 if margin < 35: 1135 format = " %" + repr(-(margin + 3)) + "s%s\n" 1136 else: 1137 format = " %s\n %s\n" 1138 for rule in self._rules: 1139 s += format % (rule.descr(), unicode_repr(rule)) 1140 return s[:-1] 1141 1142 1143##////////////////////////////////////////////////////// 1144## Chunk Grammar 1145##////////////////////////////////////////////////////// 1146 1147 1148@python_2_unicode_compatible 1149class RegexpParser(ChunkParserI): 1150 """ 1151 A grammar based chunk parser. ``chunk.RegexpParser`` uses a set of 1152 regular expression patterns to specify the behavior of the parser. 1153 The chunking of the text is encoded using a ``ChunkString``, and 1154 each rule acts by modifying the chunking in the ``ChunkString``. 1155 The rules are all implemented using regular expression matching 1156 and substitution. 1157 1158 A grammar contains one or more clauses in the following form:: 1159 1160 NP: 1161 {<DT|JJ>} # chunk determiners and adjectives 1162 }<[\.VI].*>+{ # chink any tag beginning with V, I, or . 1163 <.*>}{<DT> # split a chunk at a determiner 1164 <DT|JJ>{}<NN.*> # merge chunk ending with det/adj 1165 # with one starting with a noun 1166 1167 The patterns of a clause are executed in order. An earlier 1168 pattern may introduce a chunk boundary that prevents a later 1169 pattern from executing. Sometimes an individual pattern will 1170 match on multiple, overlapping extents of the input. As with 1171 regular expression substitution more generally, the chunker will 1172 identify the first match possible, then continue looking for matches 1173 after this one has ended. 1174 1175 The clauses of a grammar are also executed in order. A cascaded 1176 chunk parser is one having more than one clause. The maximum depth 1177 of a parse tree created by this chunk parser is the same as the 1178 number of clauses in the grammar. 1179 1180 When tracing is turned on, the comment portion of a line is displayed 1181 each time the corresponding pattern is applied. 1182 1183 :type _start: str 1184 :ivar _start: The start symbol of the grammar (the root node of 1185 resulting trees) 1186 :type _stages: int 1187 :ivar _stages: The list of parsing stages corresponding to the grammar 1188 1189 """ 1190 1191 def __init__(self, grammar, root_label='S', loop=1, trace=0): 1192 """ 1193 Create a new chunk parser, from the given start state 1194 and set of chunk patterns. 1195 1196 :param grammar: The grammar, or a list of RegexpChunkParser objects 1197 :type grammar: str or list(RegexpChunkParser) 1198 :param root_label: The top node of the tree being created 1199 :type root_label: str or Nonterminal 1200 :param loop: The number of times to run through the patterns 1201 :type loop: int 1202 :type trace: int 1203 :param trace: The level of tracing that should be used when 1204 parsing a text. ``0`` will generate no tracing output; 1205 ``1`` will generate normal tracing output; and ``2`` or 1206 higher will generate verbose tracing output. 1207 """ 1208 self._trace = trace 1209 self._stages = [] 1210 self._grammar = grammar 1211 self._loop = loop 1212 1213 if isinstance(grammar, string_types): 1214 self._read_grammar(grammar, root_label, trace) 1215 else: 1216 # Make sur the grammar looks like it has the right type: 1217 type_err = ( 1218 'Expected string or list of RegexpChunkParsers ' 'for the grammar.' 1219 ) 1220 try: 1221 grammar = list(grammar) 1222 except: 1223 raise TypeError(type_err) 1224 for elt in grammar: 1225 if not isinstance(elt, RegexpChunkParser): 1226 raise TypeError(type_err) 1227 self._stages = grammar 1228 1229 def _read_grammar(self, grammar, root_label, trace): 1230 """ 1231 Helper function for __init__: read the grammar if it is a 1232 string. 1233 """ 1234 rules = [] 1235 lhs = None 1236 for line in grammar.split('\n'): 1237 line = line.strip() 1238 1239 # New stage begins if there's an unescaped ':' 1240 m = re.match('(?P<nonterminal>(\\.|[^:])*)(:(?P<rule>.*))', line) 1241 if m: 1242 # Record the stage that we just completed. 1243 self._add_stage(rules, lhs, root_label, trace) 1244 # Start a new stage. 1245 lhs = m.group('nonterminal').strip() 1246 rules = [] 1247 line = m.group('rule').strip() 1248 1249 # Skip blank & comment-only lines 1250 if line == '' or line.startswith('#'): 1251 continue 1252 1253 # Add the rule 1254 rules.append(RegexpChunkRule.fromstring(line)) 1255 1256 # Record the final stage 1257 self._add_stage(rules, lhs, root_label, trace) 1258 1259 def _add_stage(self, rules, lhs, root_label, trace): 1260 """ 1261 Helper function for __init__: add a new stage to the parser. 1262 """ 1263 if rules != []: 1264 if not lhs: 1265 raise ValueError('Expected stage marker (eg NP:)') 1266 parser = RegexpChunkParser( 1267 rules, chunk_label=lhs, root_label=root_label, trace=trace 1268 ) 1269 self._stages.append(parser) 1270 1271 def parse(self, chunk_struct, trace=None): 1272 """ 1273 Apply the chunk parser to this input. 1274 1275 :type chunk_struct: Tree 1276 :param chunk_struct: the chunk structure to be (further) chunked 1277 (this tree is modified, and is also returned) 1278 :type trace: int 1279 :param trace: The level of tracing that should be used when 1280 parsing a text. ``0`` will generate no tracing output; 1281 ``1`` will generate normal tracing output; and ``2`` or 1282 highter will generate verbose tracing output. This value 1283 overrides the trace level value that was given to the 1284 constructor. 1285 :return: the chunked output. 1286 :rtype: Tree 1287 """ 1288 if trace is None: 1289 trace = self._trace 1290 for i in range(self._loop): 1291 for parser in self._stages: 1292 chunk_struct = parser.parse(chunk_struct, trace=trace) 1293 return chunk_struct 1294 1295 def __repr__(self): 1296 """ 1297 :return: a concise string representation of this ``chunk.RegexpParser``. 1298 :rtype: str 1299 """ 1300 return "<chunk.RegexpParser with %d stages>" % len(self._stages) 1301 1302 def __str__(self): 1303 """ 1304 :return: a verbose string representation of this 1305 ``RegexpParser``. 1306 :rtype: str 1307 """ 1308 s = "chunk.RegexpParser with %d stages:\n" % len(self._stages) 1309 margin = 0 1310 for parser in self._stages: 1311 s += "%s\n" % parser 1312 return s[:-1] 1313 1314 1315##////////////////////////////////////////////////////// 1316## Demonstration code 1317##////////////////////////////////////////////////////// 1318 1319 1320def demo_eval(chunkparser, text): 1321 """ 1322 Demonstration code for evaluating a chunk parser, using a 1323 ``ChunkScore``. This function assumes that ``text`` contains one 1324 sentence per line, and that each sentence has the form expected by 1325 ``tree.chunk``. It runs the given chunk parser on each sentence in 1326 the text, and scores the result. It prints the final score 1327 (precision, recall, and f-measure); and reports the set of chunks 1328 that were missed and the set of chunks that were incorrect. (At 1329 most 10 missing chunks and 10 incorrect chunks are reported). 1330 1331 :param chunkparser: The chunkparser to be tested 1332 :type chunkparser: ChunkParserI 1333 :param text: The chunked tagged text that should be used for 1334 evaluation. 1335 :type text: str 1336 """ 1337 from nltk import chunk 1338 from nltk.tree import Tree 1339 1340 # Evaluate our chunk parser. 1341 chunkscore = chunk.ChunkScore() 1342 1343 for sentence in text.split('\n'): 1344 print(sentence) 1345 sentence = sentence.strip() 1346 if not sentence: 1347 continue 1348 gold = chunk.tagstr2tree(sentence) 1349 tokens = gold.leaves() 1350 test = chunkparser.parse(Tree('S', tokens), trace=1) 1351 chunkscore.score(gold, test) 1352 print() 1353 1354 print('/' + ('=' * 75) + '\\') 1355 print('Scoring', chunkparser) 1356 print(('-' * 77)) 1357 print('Precision: %5.1f%%' % (chunkscore.precision() * 100), ' ' * 4, end=' ') 1358 print('Recall: %5.1f%%' % (chunkscore.recall() * 100), ' ' * 6, end=' ') 1359 print('F-Measure: %5.1f%%' % (chunkscore.f_measure() * 100)) 1360 1361 # Missed chunks. 1362 if chunkscore.missed(): 1363 print('Missed:') 1364 missed = chunkscore.missed() 1365 for chunk in missed[:10]: 1366 print(' ', ' '.join(map(str, chunk))) 1367 if len(chunkscore.missed()) > 10: 1368 print(' ...') 1369 1370 # Incorrect chunks. 1371 if chunkscore.incorrect(): 1372 print('Incorrect:') 1373 incorrect = chunkscore.incorrect() 1374 for chunk in incorrect[:10]: 1375 print(' ', ' '.join(map(str, chunk))) 1376 if len(chunkscore.incorrect()) > 10: 1377 print(' ...') 1378 1379 print('\\' + ('=' * 75) + '/') 1380 print() 1381 1382 1383def demo(): 1384 """ 1385 A demonstration for the ``RegexpChunkParser`` class. A single text is 1386 parsed with four different chunk parsers, using a variety of rules 1387 and strategies. 1388 """ 1389 1390 from nltk import chunk, Tree 1391 1392 text = """\ 1393 [ the/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] ./. 1394 [ John/NNP ] saw/VBD [the/DT cats/NNS] [the/DT dog/NN] chased/VBD ./. 1395 [ John/NNP ] thinks/VBZ [ Mary/NN ] saw/VBD [ the/DT cat/NN ] sit/VB on/IN [ the/DT mat/NN ]./. 1396 """ 1397 1398 print('*' * 75) 1399 print('Evaluation text:') 1400 print(text) 1401 print('*' * 75) 1402 print() 1403 1404 grammar = r""" 1405 NP: # NP stage 1406 {<DT>?<JJ>*<NN>} # chunk determiners, adjectives and nouns 1407 {<NNP>+} # chunk proper nouns 1408 """ 1409 cp = chunk.RegexpParser(grammar) 1410 demo_eval(cp, text) 1411 1412 grammar = r""" 1413 NP: 1414 {<.*>} # start by chunking each tag 1415 }<[\.VI].*>+{ # unchunk any verbs, prepositions or periods 1416 <DT|JJ>{}<NN.*> # merge det/adj with nouns 1417 """ 1418 cp = chunk.RegexpParser(grammar) 1419 demo_eval(cp, text) 1420 1421 grammar = r""" 1422 NP: {<DT>?<JJ>*<NN>} # chunk determiners, adjectives and nouns 1423 VP: {<TO>?<VB.*>} # VP = verb words 1424 """ 1425 cp = chunk.RegexpParser(grammar) 1426 demo_eval(cp, text) 1427 1428 grammar = r""" 1429 NP: {<.*>*} # start by chunking everything 1430 }<[\.VI].*>+{ # chink any verbs, prepositions or periods 1431 <.*>}{<DT> # separate on determiners 1432 PP: {<IN><NP>} # PP = preposition + noun phrase 1433 VP: {<VB.*><NP|PP>*} # VP = verb words + NPs and PPs 1434 """ 1435 cp = chunk.RegexpParser(grammar) 1436 demo_eval(cp, text) 1437 1438 # Evaluation 1439 1440 from nltk.corpus import conll2000 1441 1442 print() 1443 print("Demonstration of empty grammar:") 1444 1445 cp = chunk.RegexpParser("") 1446 print(chunk.accuracy(cp, conll2000.chunked_sents('test.txt', chunk_types=('NP',)))) 1447 1448 print() 1449 print("Demonstration of accuracy evaluation using CoNLL tags:") 1450 1451 grammar = r""" 1452 NP: 1453 {<.*>} # start by chunking each tag 1454 }<[\.VI].*>+{ # unchunk any verbs, prepositions or periods 1455 <DT|JJ>{}<NN.*> # merge det/adj with nouns 1456 """ 1457 cp = chunk.RegexpParser(grammar) 1458 print(chunk.accuracy(cp, conll2000.chunked_sents('test.txt')[:5])) 1459 1460 print() 1461 print("Demonstration of tagged token input") 1462 1463 grammar = r""" 1464 NP: {<.*>*} # start by chunking everything 1465 }<[\.VI].*>+{ # chink any verbs, prepositions or periods 1466 <.*>}{<DT> # separate on determiners 1467 PP: {<IN><NP>} # PP = preposition + noun phrase 1468 VP: {<VB.*><NP|PP>*} # VP = verb words + NPs and PPs 1469 """ 1470 cp = chunk.RegexpParser(grammar) 1471 print( 1472 cp.parse( 1473 [ 1474 ("the", "DT"), 1475 ("little", "JJ"), 1476 ("cat", "NN"), 1477 ("sat", "VBD"), 1478 ("on", "IN"), 1479 ("the", "DT"), 1480 ("mat", "NN"), 1481 (".", "."), 1482 ] 1483 ) 1484 ) 1485 1486 1487if __name__ == '__main__': 1488 demo() 1489