1# Natural Language Toolkit: Regular Expression Chunkers
2#
3# Copyright (C) 2001-2019 NLTK Project
4# Author: Edward Loper <edloper@gmail.com>
5#         Steven Bird <stevenbird1@gmail.com> (minor additions)
6# URL: <http://nltk.org/>
7# For license information, see LICENSE.TXT
8from __future__ import print_function, unicode_literals
9from __future__ import division
10
11import re
12
13from six import string_types
14
15from nltk.tree import Tree
16from nltk.chunk.api import ChunkParserI
17from nltk.compat import python_2_unicode_compatible, unicode_repr
18
19##//////////////////////////////////////////////////////
20##  ChunkString
21##//////////////////////////////////////////////////////
22
23
@python_2_unicode_compatible
class ChunkString(object):
    """
    A string-based encoding of a particular chunking of a text.
    Internally, the ``ChunkString`` class uses a single string to
    encode the chunking of the input text.  This string contains a
    sequence of angle-bracket delimited tags, with chunking indicated
    by braces.  An example of this encoding is::

        {<DT><JJ><NN>}<VBN><IN>{<DT><NN>}<.>{<DT><NN>}<VBD><.>

    ``ChunkString`` objects are created from chunk structures (trees
    whose children are tagged tokens or subtrees).  Initially, nothing
    is chunked.

    The chunking of a ``ChunkString`` can be modified with the ``xform()``
    method, which uses a regular expression to transform the string
    representation.  These transformations should only add and remove
    braces; they should *not* modify the sequence of angle-bracket
    delimited tags.

    :type _str: str
    :ivar _str: The internal string representation of the text's
        encoding.  This string representation contains a sequence of
        angle-bracket delimited tags, with chunking indicated by
        braces.  An example of this encoding is::

            {<DT><JJ><NN>}<VBN><IN>{<DT><NN>}<.>{<DT><NN>}<VBD><.>

        An empty text is encoded as the empty string.

    :type _pieces: list(tagged tokens and chunks)
    :ivar _pieces: The tagged tokens and chunks encoded by this ``ChunkString``.
    :ivar _root_label: The label of the chunk structure this
        ``ChunkString`` was built from; reused by ``to_chunkstruct()``.
    :ivar _debug: The debug level.  See the constructor docs.

    :cvar IN_CHUNK_PATTERN: A zero-width regexp pattern string that
        will only match positions that are in chunks.
    :cvar IN_CHINK_PATTERN: A zero-width regexp pattern string that
        will only match positions that are in chinks.
    """

    CHUNK_TAG_CHAR = r'[^\{\}<>]'
    CHUNK_TAG = r'(<%s+?>)' % CHUNK_TAG_CHAR

    IN_CHUNK_PATTERN = r'(?=[^\{]*\})'
    IN_CHINK_PATTERN = r'(?=[^\}]*(\{|$))'

    # These are used by _verify
    _CHUNK = r'(\{%s+?\})+?' % CHUNK_TAG
    _CHINK = r'(%s+?)+?' % CHUNK_TAG
    _VALID = re.compile(r'^(\{?%s\}?)*?$' % CHUNK_TAG)
    # Raw string: the escaped braces are invalid escape sequences in a
    # normal string literal.
    _BRACKETS = re.compile(r'[^\{\}]+')
    _BALANCED_BRACKETS = re.compile(r'(\{\})*$')

    def __init__(self, chunk_struct, debug_level=1):
        """
        Construct a new ``ChunkString`` that encodes the chunking of
        the text in ``chunk_struct``.

        :type chunk_struct: Tree
        :param chunk_struct: The chunk structure to be further chunked.
        :type debug_level: int
        :param debug_level: The level of debugging which should be
            applied to transformations on the ``ChunkString``.  The
            valid levels are:
                - 0: no checks
                - 1: full check on to_chunkstruct
                - 2: full check on to_chunkstruct and cursory check after
                   each transformation.
                - 3: full check on to_chunkstruct and full check after
                   each transformation.
            We recommend you use at least level 1.  You should
            probably use level 3 if you use any non-standard
            subclasses of ``RegexpChunkRule``.
        :raise ValueError: If ``chunk_struct`` contains something that
            is neither a tuple nor a ``Tree``.
        """
        self._root_label = chunk_struct.label()
        self._pieces = chunk_struct[:]
        tags = [self._tag(tok) for tok in self._pieces]
        # An empty text must be encoded as '' rather than '<>': '<>' is
        # not a valid tag, so _verify() would reject it.
        self._str = ('<' + '><'.join(tags) + '>') if tags else ''
        self._debug = debug_level

    def _tag(self, tok):
        """
        Return the tag that represents ``tok`` in the string encoding.

        :param tok: A tuple (whose second element is used as the tag)
            or a ``Tree`` (whose label is used as the tag).
        :raise ValueError: If ``tok`` is neither a tuple nor a ``Tree``.
        """
        if isinstance(tok, tuple):
            return tok[1]
        elif isinstance(tok, Tree):
            return tok.label()
        else:
            raise ValueError('chunk structures must contain tagged tokens or trees')

    def _verify(self, s, verify_tags):
        """
        Check to make sure that ``s`` still corresponds to some chunked
        version of ``_pieces``.

        :type s: str
        :param s: The candidate string encoding to check.
        :type verify_tags: bool
        :param verify_tags: Whether the individual tags should be
            checked.  If this is false, ``_verify`` will check to make
            sure that ``s`` encodes a chunked version of *some*
            list of tokens.  If this is true, then ``_verify`` will
            check to make sure that the tags in ``s`` match those in
            ``_pieces``.

        :raise ValueError: if the string representation ``s`` is
            invalid or not consistent with _pieces.
        """
        # Check overall form
        if not ChunkString._VALID.match(s):
            raise ValueError('Transformation generated invalid chunkstring:\n  %s' % s)

        # Check that parens are balanced.  If the string is long, we
        # have to do this in pieces, to avoid a maximum recursion
        # depth limit for regular expressions.  The piece size is even,
        # so a "{}" pair is never split across two pieces.
        brackets = ChunkString._BRACKETS.sub('', s)
        for start in range(0, len(brackets), 5000):
            substr = brackets[start : start + 5000]
            if not ChunkString._BALANCED_BRACKETS.match(substr):
                raise ValueError(
                    'Transformation generated invalid chunkstring:\n  %s' % s
                )

        if verify_tags <= 0:
            return

        # Splitting on runs of delimiters leaves an empty string at
        # each end; drop those before comparing.
        tags1 = (re.split(r'[\{\}<>]+', s))[1:-1]
        tags2 = [self._tag(piece) for piece in self._pieces]
        if tags1 != tags2:
            raise ValueError('Transformation generated invalid chunkstring: tag changed')

    def to_chunkstruct(self, chunk_label='CHUNK'):
        """
        Return the chunk structure encoded by this ``ChunkString``.

        :type chunk_label: str
        :param chunk_label: The label given to each chunk subtree.
        :rtype: Tree
        :raise ValueError: If a transformation has generated an
            invalid chunkstring.
        """
        if self._debug > 0:
            self._verify(self._str, 1)

        # Splitting on braces yields an alternating list of segments:
        # chink, chunk, chink, ...  (the first segment is always
        # outside any chunk, possibly empty).
        pieces = []
        index = 0
        piece_in_chunk = False
        for piece in re.split('[{}]', self._str):

            # Each tag opens with exactly one '<', so counting them
            # gives the number of tokens covered by this segment.
            length = piece.count('<')
            subsequence = self._pieces[index : index + length]

            # Add this list of tokens to our pieces.
            if piece_in_chunk:
                pieces.append(Tree(chunk_label, subsequence))
            else:
                pieces += subsequence

            # Update index, piece_in_chunk
            index += length
            piece_in_chunk = not piece_in_chunk

        return Tree(self._root_label, pieces)

    def xform(self, regexp, repl):
        """
        Apply the given transformation to the string encoding of this
        ``ChunkString``.  In particular, find all occurrences that match
        ``regexp``, and replace them using ``repl`` (as done by
        ``re.sub``).

        This transformation should only add and remove braces; it
        should *not* modify the sequence of angle-bracket delimited
        tags.  Furthermore, this transformation may not result in
        improper bracketing.  Note, in particular, that bracketing may
        not be nested.

        :type regexp: str or regexp
        :param regexp: A regular expression matching the substring
            that should be replaced.  This will typically include a
            named group, which can be used by ``repl``.
        :type repl: str
        :param repl: An expression specifying what should replace the
            matched substring.  Typically, this will include a named
            replacement group, specified by ``regexp``.
        :rtype: None
        :raise ValueError: If this transformation generated an
            invalid chunkstring (only checked at debug level 2 or
            higher).
        """
        # Do the actual substitution
        s = re.sub(regexp, repl, self._str)

        # The substitution might have generated "empty chunks"
        # (substrings of the form "{}").  Remove them, so they don't
        # interfere with other transformations.
        s = s.replace('{}', '')

        # Make sure that the transformation was legal.  Level 2 does a
        # cursory check (verify_tags == 0); level 3 also checks tags.
        if self._debug > 1:
            self._verify(s, self._debug - 2)

        # Commit the transformation.
        self._str = s

    def __repr__(self):
        """
        Return a string representation of this ``ChunkString``.
        It has the form::

            <ChunkString: '{<DT><JJ><NN>}<VBN><IN>{<DT><NN>}'>

        :rtype: str
        """
        return '<ChunkString: %s>' % unicode_repr(self._str)

    def __str__(self):
        """
        Return a formatted representation of this ``ChunkString``.
        This representation will include extra spaces to ensure that
        tags will line up with the representation of other
        ``ChunkStrings`` for the same text, regardless of the chunking.

        :rtype: str
        """
        # Add spaces to make everything line up.
        spaced = re.sub(r'>(?!\})', r'> ', self._str)
        spaced = re.sub(r'([^\{])<', r'\1 <', spaced)
        # startswith() is safe on an empty encoding, unlike indexing.
        if spaced.startswith('<'):
            spaced = ' ' + spaced
        return spaced
253
254
255##//////////////////////////////////////////////////////
256##  Chunking Rules
257##//////////////////////////////////////////////////////
258
259
@python_2_unicode_compatible
class RegexpChunkRule(object):
    """
    A rule specifying how to modify the chunking in a ``ChunkString``,
    using a transformational regular expression.  The
    ``RegexpChunkRule`` class itself can be used to implement any
    transformational rule based on regular expressions.  There are
    also a number of subclasses, which can be used to implement
    simpler types of rules, based on matching regular expressions.

    Each ``RegexpChunkRule`` has a regular expression and a
    replacement expression.  When a ``RegexpChunkRule`` is "applied"
    to a ``ChunkString``, it searches the ``ChunkString`` for any
    substring that matches the regular expression, and replaces it
    using the replacement expression.  This search/replace operation
    has the same semantics as ``re.sub``.

    Each ``RegexpChunkRule`` also has a description string, which
    gives a short (typically less than 75 characters) description of
    the purpose of the rule.

    The transformation defined by this ``RegexpChunkRule`` should
    only add and remove braces; it should *not* modify the sequence
    of angle-bracket delimited tags.  Furthermore, this transformation
    may not result in nested or mismatched bracketing.
    """

    def __init__(self, regexp, repl, descr):
        """
        Construct a new RegexpChunkRule.

        :type regexp: regexp or str
        :param regexp: The regular expression for this ``RegexpChunkRule``.
            When this rule is applied to a ``ChunkString``, any
            substring that matches ``regexp`` will be replaced using
            the replacement string ``repl``.  Note that this must be a
            normal regular expression, not a tag pattern.  A string is
            compiled with ``re.compile``.
        :type repl: str
        :param repl: The replacement expression for this ``RegexpChunkRule``.
            When this rule is applied to a ``ChunkString``, any substring
            that matches ``regexp`` will be replaced using ``repl``.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        if isinstance(regexp, string_types):
            regexp = re.compile(regexp)
        self._repl = repl
        self._descr = descr
        self._regexp = regexp

    def apply(self, chunkstr):
        # Keep docstring generic so we can inherit it.
        """
        Apply this rule to the given ``ChunkString``.  See the
        class reference documentation for a description of what it
        means to apply a rule.

        :type chunkstr: ChunkString
        :param chunkstr: The chunkstring to which this rule is applied.
        :rtype: None
        :raise ValueError: If this transformation generated an
            invalid chunkstring.
        """
        chunkstr.xform(self._regexp, self._repl)

    def descr(self):
        """
        Return a short description of the purpose and/or effect of
        this rule.

        :rtype: str
        """
        return self._descr

    def __repr__(self):
        """
        Return a string representation of this rule.  It has the form::

            <RegexpChunkRule: '{<IN|VB.*>}'->'<IN>'>

        Note that this representation does not include the
        description string; that string can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        return (
            '<RegexpChunkRule: '
            + unicode_repr(self._regexp.pattern)
            + '->'
            + unicode_repr(self._repl)
            + '>'
        )

    @staticmethod
    def fromstring(s):
        """
        Create a RegexpChunkRule from a string description.
        Currently, the following formats are supported::

          {regexp}               # chunk rule
          }regexp{               # chink rule
          regexp}{regexp         # split rule
          regexp{}regexp         # merge rule
          regexp{regexp}regexp   # chunk rule with context

        Where ``regexp`` is a regular expression for the rule.  Any
        text following the comment marker (``#``) will be used as
        the rule's description:

        >>> from nltk.chunk.regexp import RegexpChunkRule
        >>> RegexpChunkRule.fromstring('{<DT>?<NN.*>+}')
        <ChunkRule: '<DT>?<NN.*>+'>

        :type s: str
        :param s: The string description of the rule.
        :rtype: RegexpChunkRule
        :raise ValueError: If the rule part of ``s`` is empty, or if it
            does not match any supported format, or if one of its
            patterns cannot be turned into a rule.
        """
        # Split off the comment (but don't split on '\#')
        m = re.match(r'(?P<rule>(\\.|[^#])*)(?P<comment>#.*)?', s)
        rule = m.group('rule').strip()
        # Drop the leading '#' from the comment, if there is one.
        comment = (m.group('comment') or '')[1:].strip()

        # Checked outside the try block so that this specific message
        # reaches the caller instead of being replaced by the generic
        # "Illegal chunk pattern" one.
        if not rule:
            raise ValueError('Empty chunk pattern')

        # Pattern bodies: chunk, chink, split, merge, chunk-with-context
        try:
            if rule[0] == '{' and rule[-1] == '}':
                return ChunkRule(rule[1:-1], comment)
            elif rule[0] == '}' and rule[-1] == '{':
                return ChinkRule(rule[1:-1], comment)
            elif '}{' in rule:
                # Unpacking fails (ValueError) if '}{' occurs twice.
                left, right = rule.split('}{')
                return SplitRule(left, right, comment)
            elif '{}' in rule:
                left, right = rule.split('{}')
                return MergeRule(left, right, comment)
            elif re.match(r'[^{}]*{[^{}]*}[^{}]*', rule):
                # Unpacking fails (ValueError) if there are extra braces
                # after the part that matched.
                left, chunk, right = re.split('[{}]', rule)
                return ChunkRuleWithContext(left, chunk, right, comment)
        except (ValueError, re.error):
            raise ValueError('Illegal chunk pattern: %s' % rule)

        # No supported format matched.
        raise ValueError('Illegal chunk pattern: %s' % rule)
400
401
@python_2_unicode_compatible
class ChunkRule(RegexpChunkRule):
    """
    A rule specifying how to add chunks to a ``ChunkString``, using a
    matching tag pattern.  When applied to a ``ChunkString``, it will
    find any substring that matches this tag pattern and that is not
    already part of a chunk, and create a new chunk containing that
    substring.
    """

    def __init__(self, tag_pattern, descr):
        """
        Construct a new ``ChunkRule``.

        :type tag_pattern: str
        :param tag_pattern: This rule's tag pattern.  When
            applied to a ``ChunkString``, this rule will
            chunk any substring that matches this tag pattern and that
            is not already part of a chunk.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        self._pattern = tag_pattern
        # The trailing zero-width pattern restricts matches to
        # positions that are in chinks.
        regexp = re.compile(
            '(?P<chunk>%s)%s'
            % (tag_pattern2re_pattern(tag_pattern), ChunkString.IN_CHINK_PATTERN)
        )
        # Raw string: the group reference is not a valid escape
        # sequence in a normal string literal.
        RegexpChunkRule.__init__(self, regexp, r'{\g<chunk>}', descr)

    def __repr__(self):
        """
        Return a string representation of this rule.  It has the form::

            <ChunkRule: '<IN|VB.*>'>

        Note that this representation does not include the
        description string; that string can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        return '<ChunkRule: ' + unicode_repr(self._pattern) + '>'
446
447
@python_2_unicode_compatible
class ChinkRule(RegexpChunkRule):
    """
    A rule specifying how to remove chinks from the chunks of a
    ``ChunkString``, using a matching tag pattern.  When applied to a
    ``ChunkString``, it will find any substring that matches this
    tag pattern and that is contained in a chunk, and remove it
    from that chunk by closing the chunk before the substring and
    reopening it after.
    """

    def __init__(self, tag_pattern, descr):
        """
        Construct a new ``ChinkRule``.

        :type tag_pattern: str
        :param tag_pattern: This rule's tag pattern.  When
            applied to a ``ChunkString``, this rule will
            find any substring that matches this tag pattern and that
            is contained in a chunk, and remove it from that chunk
            by surrounding it with a closing and an opening brace.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        self._pattern = tag_pattern
        # The trailing zero-width pattern restricts matches to
        # positions that are in chunks.
        regexp = re.compile(
            '(?P<chink>%s)%s'
            % (tag_pattern2re_pattern(tag_pattern), ChunkString.IN_CHUNK_PATTERN)
        )
        # Raw string: the group reference is not a valid escape
        # sequence in a normal string literal.
        RegexpChunkRule.__init__(self, regexp, r'}\g<chink>{', descr)

    def __repr__(self):
        """
        Return a string representation of this rule.  It has the form::

            <ChinkRule: '<IN|VB.*>'>

        Note that this representation does not include the
        description string; that string can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        return '<ChinkRule: ' + unicode_repr(self._pattern) + '>'
492
493
@python_2_unicode_compatible
class UnChunkRule(RegexpChunkRule):
    """
    A rule specifying how to remove chunks from a ``ChunkString``,
    using a matching tag pattern.  When applied to a
    ``ChunkString``, it will find any complete chunk that matches this
    tag pattern, and un-chunk it.
    """

    def __init__(self, tag_pattern, descr):
        """
        Construct a new ``UnChunkRule``.

        :type tag_pattern: str
        :param tag_pattern: This rule's tag pattern.  When
            applied to a ``ChunkString``, this rule will
            find any complete chunk that matches this tag pattern,
            and un-chunk it.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        self._pattern = tag_pattern
        # Raw strings: escaped braces and the group reference are not
        # valid escape sequences in normal string literals.  The
        # literal braces on both sides make the pattern match only a
        # complete chunk.
        regexp = re.compile(r'\{(?P<chunk>%s)\}' % tag_pattern2re_pattern(tag_pattern))
        RegexpChunkRule.__init__(self, regexp, r'\g<chunk>', descr)

    def __repr__(self):
        """
        Return a string representation of this rule.  It has the form::

            <UnChunkRule: '<IN|VB.*>'>

        Note that this representation does not include the
        description string; that string can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        return '<UnChunkRule: ' + unicode_repr(self._pattern) + '>'
533
534
@python_2_unicode_compatible
class MergeRule(RegexpChunkRule):
    """
    A rule specifying how to merge chunks in a ``ChunkString``, using
    two matching tag patterns: a left pattern, and a right pattern.
    When applied to a ``ChunkString``, it will find any chunk whose end
    matches left pattern, and immediately followed by a chunk whose
    beginning matches right pattern.  It will then merge those two
    chunks into a single chunk.
    """

    def __init__(self, left_tag_pattern, right_tag_pattern, descr):
        """
        Construct a new ``MergeRule``.

        :type left_tag_pattern: str
        :param left_tag_pattern: This rule's left tag
            pattern.  When applied to a ``ChunkString``, this
            rule will find any chunk whose end matches
            this pattern, and immediately followed by a chunk
            whose beginning matches ``right_tag_pattern``.  It will
            then merge those two chunks into a single chunk.
        :type right_tag_pattern: str
        :param right_tag_pattern: This rule's right tag
            pattern.  When applied to a ``ChunkString``, this
            rule will find any chunk whose end matches
            ``left_tag_pattern``, and immediately followed by a chunk
            whose beginning matches this pattern.  It will
            then merge those two chunks into a single chunk.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        :raise re.error: If either converted pattern is not a valid
            regular expression on its own.
        """
        # Ensure that the individual patterns are coherent.  E.g., if
        # left='(' and right=')', then this will raise an exception.
        # Each pattern is converted once and the result is reused for
        # the combined expression below.
        left_re = tag_pattern2re_pattern(left_tag_pattern)
        re.compile(left_re)
        right_re = tag_pattern2re_pattern(right_tag_pattern)
        re.compile(right_re)

        self._left_tag_pattern = left_tag_pattern
        self._right_tag_pattern = right_tag_pattern
        # The right pattern is only a lookahead, so the replacement
        # just drops the "}{" boundary between the two chunks.
        regexp = re.compile('(?P<left>%s)}{(?=%s)' % (left_re, right_re))
        # Raw string: the group reference is not a valid escape
        # sequence in a normal string literal.
        RegexpChunkRule.__init__(self, regexp, r'\g<left>', descr)

    def __repr__(self):
        """
        Return a string representation of this rule.  It has the form::

            <MergeRule: '<NN|DT|JJ>', '<NN|JJ>'>

        Note that this representation does not include the
        description string; that string can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        return (
            '<MergeRule: '
            + unicode_repr(self._left_tag_pattern)
            + ', '
            + unicode_repr(self._right_tag_pattern)
            + '>'
        )
604
605
@python_2_unicode_compatible
class SplitRule(RegexpChunkRule):
    """
    A rule that splits chunks in a ``ChunkString``.  It is built from
    two tag patterns, a left one and a right one.  Wherever a match
    of the left pattern is directly followed by a match of the right
    pattern, the rule inserts a chunk boundary between the two
    matches, so that the chunk is divided into two chunks at that
    point.
    """

    def __init__(self, left_tag_pattern, right_tag_pattern, descr):
        """
        Construct a new ``SplitRule``.

        :type left_tag_pattern: str
        :param left_tag_pattern: The tag pattern that must match
            immediately before the split point.
        :type right_tag_pattern: str
        :param right_tag_pattern: The tag pattern that must match
            immediately after the split point.  It is used as a
            lookahead only.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        # Compile each side on its own first, so that a pattern that
        # is only valid in combination with the other one (e.g.
        # left='(' and right=')') raises an exception.
        for side_pattern in (left_tag_pattern, right_tag_pattern):
            re.compile(tag_pattern2re_pattern(side_pattern))

        self._left_tag_pattern = left_tag_pattern
        self._right_tag_pattern = right_tag_pattern

        left_re = tag_pattern2re_pattern(left_tag_pattern)
        right_re = tag_pattern2re_pattern(right_tag_pattern)
        split_regexp = re.compile('(?P<left>{})(?={})'.format(left_re, right_re))

        # Re-emit the left match followed by a chunk boundary.
        RegexpChunkRule.__init__(self, split_regexp, r'\g<left>}{', descr)

    def __repr__(self):
        """
        Return a string representation of this rule.  It has the form::

            <SplitRule: '<NN>', '<DT>'>

        The description string is not part of this representation;
        use the ``descr()`` method to obtain it.

        :rtype: str
        """
        return '<SplitRule: %s, %s>' % (
            unicode_repr(self._left_tag_pattern),
            unicode_repr(self._right_tag_pattern),
        )
674
675
@python_2_unicode_compatible
class ExpandLeftRule(RegexpChunkRule):
    """
    A rule specifying how to expand chunks in a ``ChunkString`` to the left,
    using two matching tag patterns: a left pattern, and a right pattern.
    When applied to a ``ChunkString``, it will find any chunk whose beginning
    matches right pattern, and immediately preceded by a chink whose
    end matches left pattern.  It will then expand the chunk to incorporate
    the new material on the left.
    """

    def __init__(self, left_tag_pattern, right_tag_pattern, descr):
        """
        Construct a new ``ExpandLeftRule``.

        :type left_tag_pattern: str
        :param left_tag_pattern: This rule's left tag
            pattern.  When applied to a ``ChunkString``, this
            rule will find any chunk whose beginning matches
            ``right_tag_pattern``, and immediately preceded by a chink
            whose end matches this pattern.  It will
            then expand the chunk to incorporate the new material on the left.
        :type right_tag_pattern: str
        :param right_tag_pattern: This rule's right tag
            pattern.  When applied to a ``ChunkString``, this
            rule will find any chunk whose beginning matches
            this pattern, and immediately preceded by a chink
            whose end matches ``left_tag_pattern``.  It will
            then expand the chunk to incorporate the new material on the left.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        :raise re.error: If either converted pattern is not a valid
            regular expression on its own.
        """
        # Ensure that the individual patterns are coherent.  E.g., if
        # left='(' and right=')', then this will raise an exception.
        # Each pattern is converted once and the result is reused for
        # the combined expression below.
        left_re = tag_pattern2re_pattern(left_tag_pattern)
        re.compile(left_re)
        right_re = tag_pattern2re_pattern(right_tag_pattern)
        re.compile(right_re)

        self._left_tag_pattern = left_tag_pattern
        self._right_tag_pattern = right_tag_pattern
        # Raw strings: the escaped brace and the group references are
        # not valid escape sequences in normal string literals.
        regexp = re.compile(r'(?P<left>%s)\{(?P<right>%s)' % (left_re, right_re))
        # Move the opening brace in front of the left match.
        RegexpChunkRule.__init__(self, regexp, r'{\g<left>\g<right>', descr)

    def __repr__(self):
        """
        Return a string representation of this rule.  It has the form::

            <ExpandLeftRule: '<NN|DT|JJ>', '<NN|JJ>'>

        Note that this representation does not include the
        description string; that string can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        return (
            '<ExpandLeftRule: '
            + unicode_repr(self._left_tag_pattern)
            + ', '
            + unicode_repr(self._right_tag_pattern)
            + '>'
        )
745
746
@python_2_unicode_compatible
class ExpandRightRule(RegexpChunkRule):
    """
    A rule specifying how to expand chunks in a ``ChunkString`` to the
    right, using two matching tag patterns: a left pattern, and a
    right pattern.  When applied to a ``ChunkString``, it will find any
    chunk whose end matches left pattern, and immediately followed by
    a chink whose beginning matches right pattern.  It will then
    expand the chunk to incorporate the new material on the right.
    """

    def __init__(self, left_tag_pattern, right_tag_pattern, descr):
        """
        Construct a new ``ExpandRightRule``.

        :type left_tag_pattern: str
        :param left_tag_pattern: This rule's left tag
            pattern.  When applied to a ``ChunkString``, this
            rule will find any chunk whose end matches
            this pattern, and immediately followed by a chink
            whose beginning matches ``right_tag_pattern``.  It will
            then expand the chunk to incorporate the new material on the right.
        :type right_tag_pattern: str
        :param right_tag_pattern: This rule's right tag
            pattern.  When applied to a ``ChunkString``, this
            rule will find any chunk whose end matches
            ``left_tag_pattern``, and immediately followed by a chink
            whose beginning matches this pattern.  It will
            then expand the chunk to incorporate the new material on the right.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        :raise re.error: If either converted pattern is not a valid
            regular expression on its own.
        """
        # Ensure that the individual patterns are coherent.  E.g., if
        # left='(' and right=')', then this will raise an exception.
        # Each pattern is converted once and the result is reused for
        # the combined expression below.
        left_re = tag_pattern2re_pattern(left_tag_pattern)
        re.compile(left_re)
        right_re = tag_pattern2re_pattern(right_tag_pattern)
        re.compile(right_re)

        self._left_tag_pattern = left_tag_pattern
        self._right_tag_pattern = right_tag_pattern
        # Raw strings: the escaped brace and the group references are
        # not valid escape sequences in normal string literals.
        regexp = re.compile(r'(?P<left>%s)\}(?P<right>%s)' % (left_re, right_re))
        # Move the closing brace behind the right match.
        RegexpChunkRule.__init__(self, regexp, r'\g<left>\g<right>}', descr)

    def __repr__(self):
        """
        Return a string representation of this rule.  It has the form::

            <ExpandRightRule: '<NN|DT|JJ>', '<NN|JJ>'>

        Note that this representation does not include the
        description string; that string can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        return (
            '<ExpandRightRule: '
            + unicode_repr(self._left_tag_pattern)
            + ', '
            + unicode_repr(self._right_tag_pattern)
            + '>'
        )
816
817
@python_2_unicode_compatible
class ChunkRuleWithContext(RegexpChunkRule):
    """
    A chunking rule that is sensitive to its surroundings.  It is built
    from three tag patterns: a left context, the chunk itself, and a
    right context.  Applied to a ``ChunkString``, it looks for a
    substring matching the chunk pattern that is flanked by substrings
    matching the two context patterns and that is not already inside a
    chunk, and wraps the part matched by the chunk pattern in a new
    chunk.

    Caveat: a match consumes both the left and the right context, so
    overlapping matches are only found by applying the rule more than
    once.
    """

    def __init__(
        self,
        left_context_tag_pattern,
        chunk_tag_pattern,
        right_context_tag_pattern,
        descr,
    ):
        """
        Construct a new ``ChunkRuleWithContext``.

        :type left_context_tag_pattern: str
        :param left_context_tag_pattern: Tag pattern that has to match
            immediately to the left of ``chunk_tag_pattern``.
        :type chunk_tag_pattern: str
        :param chunk_tag_pattern: Tag pattern selecting the substring
            that becomes a chunk when the rule applies.
        :type right_context_tag_pattern: str
        :param right_context_tag_pattern: Tag pattern that has to match
            immediately to the right of ``chunk_tag_pattern``.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        tag_patterns = (
            left_context_tag_pattern,
            chunk_tag_pattern,
            right_context_tag_pattern,
        )

        # Each pattern has to be a coherent regular expression on its
        # own.  E.g., left='(' and right=')' raises here even though the
        # concatenation of the two would compile.
        for tag_pattern in tag_patterns:
            re.compile(tag_pattern2re_pattern(tag_pattern))

        (
            self._left_context_tag_pattern,
            self._chunk_tag_pattern,
            self._right_context_tag_pattern,
        ) = tag_patterns

        # Three named groups followed by the in-chink pattern taken from
        # ``ChunkString``.
        pieces = [tag_pattern2re_pattern(tag_pattern) for tag_pattern in tag_patterns]
        pieces.append(ChunkString.IN_CHINK_PATTERN)
        regexp = re.compile(
            '(?P<left>%s)(?P<chunk>%s)(?P<right>%s)%s' % tuple(pieces)
        )

        # Only the middle group gets braces; the context is re-emitted
        # unchanged.
        RegexpChunkRule.__init__(
            self, regexp, r'\g<left>{\g<chunk>}\g<right>', descr
        )

    def __repr__(self):
        """
        Return a string representation of this rule.  It has the form
        (note the two spaces after the colon)::

            <ChunkRuleWithContext:  '<IN>', '<NN>', '<DT>'>

        The description string is not part of this representation; use
        the ``descr()`` method to get it.

        :rtype: str
        """
        shown = (
            self._left_context_tag_pattern,
            self._chunk_tag_pattern,
            self._right_context_tag_pattern,
        )
        return '<ChunkRuleWithContext:  %r, %r, %r>' % shown
899
900
901##//////////////////////////////////////////////////////
902##  Tag Pattern Format Conversion
903##//////////////////////////////////////////////////////
904
# Validates a tag pattern *after* tag_pattern2re_pattern() has stripped
# whitespace and wrapped the angle brackets: a sequence of either
#   - runs of non-brace/non-angle characters and {m}, {m,}, {m,n}, {,n}
#     repetition counts, or
#   - a single <...> group with no nested braces or angle brackets.
#
# this should probably be made more strict than it is -- e.g., it
# currently accepts 'foo'.
#
# Both fragments are raw strings so that '\{', '\}' and '\d' reach the
# regex engine untouched rather than being invalid string escapes.
CHUNK_TAG_PATTERN = re.compile(
    r'^((%s|<%s>)*)$' % (r'([^\{\}<>]|\{\d+,?\}|\{\d*,\d+\})+', r'[^\{\}<>]+')
)
910
911
def tag_pattern2re_pattern(tag_pattern):
    """
    Translate a tag pattern into a regular expression pattern.  A "tag
    pattern" is a regular-expression dialect tailored to matching
    sequences of tags.  It differs from a plain regular expression in
    three ways:

        - ``'<'`` and ``'>'`` behave like parentheses, so ``'<NN>+'``
          matches one or more repetitions of ``'<NN>'`` rather than
          ``'<NN'`` followed by one or more ``'>'``.
        - Whitespace is ignored, so ``'<DT> | <NN>'`` is equivalent to
          ``'<DT>|<NN>'``.
        - ``'.'`` stands for ``ChunkString.CHUNK_TAG_CHAR`` (any
          character other than braces and angle brackets), so
          ``'<NN.*>'`` matches any single tag starting with ``'NN'``.

    The conversion is carried out in these steps:

        - Remove all whitespace.
        - Wrap every ``'<'`` and ``'>'`` in extra parentheses, so that
          in ``'<NN>+'`` the ``'+'`` applies to the whole ``'<NN>'``,
          and in ``'<NN|IN>'`` the ``'|'`` applies to ``'NN'`` and
          ``'IN'`` but not to the angle brackets.
        - Check the result against ``CHUNK_TAG_PATTERN``.
        - Replace every unescaped ``'.'`` with
          ``ChunkString.CHUNK_TAG_CHAR``.

    :type tag_pattern: str
    :param tag_pattern: The tag pattern to convert to a regular
        expression pattern.
    :raise ValueError: If ``tag_pattern`` is not a valid tag pattern.
        In particular, ``tag_pattern`` should not include braces; and it
        should not contain nested or mismatched angle-brackets.  The
        message shows the pattern after whitespace removal and bracket
        wrapping.
    :rtype: str
    :return: A regular expression pattern corresponding to
        ``tag_pattern``.
    """
    # Normalise the pattern: drop whitespace, then give each angle
    # bracket its extra pair of parentheses.
    for target, substitute in ((r'\s', ''), (r'<', '(<('), (r'>', ')>)')):
        tag_pattern = re.sub(target, substitute, tag_pattern)

    # Validate before the '.' expansion: the expansion inserts braces
    # and angle brackets that CHUNK_TAG_PATTERN would reject.
    if CHUNK_TAG_PATTERN.match(tag_pattern) is None:
        raise ValueError('Bad tag pattern: %r' % tag_pattern)

    # Expand every '.' that is not escaped, i.e. not preceded by an odd
    # number of backslashes.  Instead of a look-behind, the string is
    # reversed and the test is written as a negative look-ahead: in the
    # reversed text an escaped '.' is a '.' followed by an odd run of
    # backslashes and then either the end or a non-backslash.  The
    # replacement is reversed as well so that the final flip restores it.
    flipped_tag_char = ChunkString.CHUNK_TAG_CHAR[::-1]
    flipped = tag_pattern[::-1]
    flipped = re.sub(r'\.(?!\\(\\\\)*($|[^\\]))', flipped_tag_char, flipped)

    return flipped[::-1]
974
975
976##//////////////////////////////////////////////////////
977##  RegexpChunkParser
978##//////////////////////////////////////////////////////
979
980
@python_2_unicode_compatible
class RegexpChunkParser(ChunkParserI):
    """
    A chunk parser driven by regular expressions.  A
    ``RegexpChunkParser`` holds an ordered sequence of "rules" and uses
    them to find chunks of a single type within a text.  The chunking
    is held in a ``ChunkString``; every rule works by editing that
    ``ChunkString`` through regular expression matching and
    substitution.

    The rules are instances of ``RegexpChunkRule`` or one of its
    subclasses (``ChunkRule``, ``ChinkRule``, ``UnChunkRule``,
    ``MergeRule``, and ``SplitRule``).  Each of them provides an
    ``apply()`` method that modifies the chunking encoded by a given
    ``ChunkString``.

    :type _rules: list(RegexpChunkRule)
    :ivar _rules: The list of rules that should be applied to a text.
    :type _trace: int
    :ivar _trace: The default level of tracing.
    :type _chunk_label: str
    :ivar _chunk_label: The label passed to ``to_chunkstruct()`` when
        the chunk structure is built.
    :type _root_label: str
    :ivar _root_label: The label used for any ``Tree`` this parser
        creates as a root.

    """

    def __init__(self, rules, chunk_label='NP', root_label='S', trace=0):
        """
        Construct a new ``RegexpChunkParser``.

        :type rules: list(RegexpChunkRule)
        :param rules: The rules to apply, in order, to produce the
            chunking of a tagged text.
        :type chunk_label: str
        :param chunk_label: The node value given to chunk subtrees;
            typically a short name for the kind of chunk, such as
            ``"NP"`` for base noun phrases.
        :type root_label: str
        :param root_label: The node value given to the top node of the
            chunk structure.
        :type trace: int
        :param trace: The default tracing level for ``parse()``.  ``0``
            produces no tracing output; ``1`` produces normal tracing
            output; ``2`` or higher produces verbose tracing output.
        """
        self._chunk_label = chunk_label
        self._root_label = root_label
        self._rules = rules
        self._trace = trace

    def _trace_apply(self, chunkstr, verbose):
        """
        Apply every rule of this ``RegexpChunkParser`` to ``chunkstr``
        in order, printing the chunk string before the first rule and
        again after each rule.

        :type chunkstr: ChunkString
        :param chunkstr: The chunk string to which each rule should be
            applied.
        :type verbose: bool
        :param verbose: If true, the heading printed for each rule also
            includes the rule's representation, not just its
            description.
        :rtype: None
        """
        print('# Input:')
        print(chunkstr)
        for rule in self._rules:
            rule.apply(chunkstr)
            heading = rule.descr()
            if verbose:
                heading = heading + ' (' + unicode_repr(rule) + ')'
            print('#', heading + ':')
            print(chunkstr)

    def _notrace_apply(self, chunkstr):
        """
        Apply every rule of this ``RegexpChunkParser`` to ``chunkstr``
        in order, without printing anything.

        :param chunkstr: The chunk string to which each rule should be
            applied.
        :type chunkstr: ChunkString
        :rtype: None
        """
        for rule in self._rules:
            rule.apply(chunkstr)

    def parse(self, chunk_struct, trace=None):
        """
        Chunk ``chunk_struct`` by running all rules over it.

        :type chunk_struct: Tree
        :param chunk_struct: the chunk structure to be (further)
            chunked.  An input without a ``label()`` method is wrapped
            in a ``Tree`` labelled with this parser's root label.
        :type trace: int
        :param trace: The level of tracing that should be used when
            parsing a text.  ``0`` will generate no tracing output;
            ``1`` will generate normal tracing output; and ``2`` or
            higher will generate verbose tracing output.  When given,
            this value overrides the trace level that was passed to the
            constructor.
        :rtype: Tree
        :return: a chunk structure that encodes the chunks in a given
            tagged sentence.  A chunk is a non-overlapping linguistic
            group, such as a noun phrase.  Which chunks are found
            depends on the rules this ``RegexpChunkParser`` was built
            with.  For empty input a warning is printed and an empty
            tree with the root label is returned.
        """
        if len(chunk_struct) == 0:
            print('Warning: parsing empty text')
            return Tree(self._root_label, [])

        # Anything that does not answer label() is treated as a plain
        # sequence and gets a root node of its own.
        try:
            chunk_struct.label()
        except AttributeError:
            chunk_struct = Tree(self._root_label, chunk_struct)

        # An explicit trace argument wins over the constructor default.
        level = self._trace if trace is None else trace

        chunkstr = ChunkString(chunk_struct)

        # Run the rules, with or without tracing output.
        if level:
            self._trace_apply(chunkstr, level > 1)
        else:
            self._notrace_apply(chunkstr)

        # Turn the edited chunk string back into a chunk structure.
        return chunkstr.to_chunkstruct(self._chunk_label)

    def rules(self):
        """
        :return: the sequence of rules used by ``RegexpChunkParser``.
        :rtype: list(RegexpChunkRule)
        """
        return self._rules

    def __repr__(self):
        """
        :return: a concise string representation of this
            ``RegexpChunkParser``.
        :rtype: str
        """
        return "<RegexpChunkParser with %d rules>" % len(self._rules)

    def __str__(self):
        """
        :return: a verbose string representation of this
            ``RegexpChunkParser``: a header line followed by one entry
            per rule showing its description and its representation.
        :rtype: str
        """
        header = "RegexpChunkParser with %d rules:\n" % len(self._rules)

        # Widest description decides the layout: short descriptions are
        # left-justified in a column next to the rule, long ones get a
        # line of their own.
        widest = max([0] + [len(rule.descr()) for rule in self._rules])
        if widest < 35:
            row = "    %" + repr(-(widest + 3)) + "s%s\n"
        else:
            row = "    %s\n      %s\n"

        rows = [row % (rule.descr(), unicode_repr(rule)) for rule in self._rules]
        text = header + ''.join(rows)
        # Drop the trailing newline.
        return text[:-1]
1141
1142
1143##//////////////////////////////////////////////////////
1144##  Chunk Grammar
1145##//////////////////////////////////////////////////////
1146
1147
@python_2_unicode_compatible
class RegexpParser(ChunkParserI):
    r"""
    A grammar based chunk parser.  ``chunk.RegexpParser`` uses a set of
    regular expression patterns to specify the behavior of the parser.
    The chunking of the text is encoded using a ``ChunkString``, and
    each rule acts by modifying the chunking in the ``ChunkString``.
    The rules are all implemented using regular expression matching
    and substitution.

    A grammar contains one or more clauses in the following form::

     NP:
       {<DT|JJ>}          # chunk determiners and adjectives
       }<[\.VI].*>+{      # chink any tag beginning with V, I, or .
       <.*>}{<DT>         # split a chunk at a determiner
       <DT|JJ>{}<NN.*>    # merge chunk ending with det/adj
                          # with one starting with a noun

    The patterns of a clause are executed in order.  An earlier
    pattern may introduce a chunk boundary that prevents a later
    pattern from executing.  Sometimes an individual pattern will
    match on multiple, overlapping extents of the input.  As with
    regular expression substitution more generally, the chunker will
    identify the first match possible, then continue looking for matches
    after this one has ended.

    The clauses of a grammar are also executed in order.  A cascaded
    chunk parser is one having more than one clause.  The maximum depth
    of a parse tree created by this chunk parser is the same as the
    number of clauses in the grammar.

    When tracing is turned on, the comment portion of a line is displayed
    each time the corresponding pattern is applied.

    :type _grammar: str or list(RegexpChunkParser)
    :ivar _grammar: The grammar exactly as it was passed to the
        constructor.
    :type _stages: list(RegexpChunkParser)
    :ivar _stages: The list of parsing stages corresponding to the grammar
    :type _loop: int
    :ivar _loop: The number of times ``parse()`` runs through all stages.
    :type _trace: int
    :ivar _trace: The default level of tracing.

    """

    def __init__(self, grammar, root_label='S', loop=1, trace=0):
        """
        Create a new chunk parser, from the given start state
        and set of chunk patterns.

        :param grammar: The grammar, or a list of RegexpChunkParser objects
        :type grammar: str or list(RegexpChunkParser)
        :param root_label: The top node of the tree being created.  It
            is only used when ``grammar`` is a string; ready-made
            stages keep the root label they were built with.
        :type root_label: str or Nonterminal
        :param loop: The number of times to run through the patterns
        :type loop: int
        :type trace: int
        :param trace: The level of tracing that should be used when
            parsing a text.  ``0`` will generate no tracing output;
            ``1`` will generate normal tracing output; and ``2`` or
            higher will generate verbose tracing output.
        :raise TypeError: If ``grammar`` is neither a string nor an
            iterable consisting only of ``RegexpChunkParser`` objects.
        :raise ValueError: If a string grammar contains a rule before
            any stage marker (e.g. ``NP:``).
        """
        self._trace = trace
        self._stages = []
        self._grammar = grammar
        self._loop = loop

        if isinstance(grammar, string_types):
            self._read_grammar(grammar, root_label, trace)
        else:
            # Make sure the grammar looks like it has the right type:
            type_err = 'Expected string or list of RegexpChunkParsers for the grammar.'
            try:
                grammar = list(grammar)
            except Exception:
                # Not a bare ``except``: KeyboardInterrupt and
                # SystemExit must not be turned into a TypeError.
                raise TypeError(type_err)
            for elt in grammar:
                if not isinstance(elt, RegexpChunkParser):
                    raise TypeError(type_err)
            self._stages = grammar

    def _read_grammar(self, grammar, root_label, trace):
        """
        Helper function for __init__: read the grammar if it is a
        string.  The grammar is processed line by line; a line that
        contains a ``':'`` starts a new stage named by the text in
        front of the first colon, and whatever follows the colon is
        treated as the first rule line of that stage.  Blank lines and
        lines starting with ``'#'`` are skipped; every other line is
        turned into a rule with ``RegexpChunkRule.fromstring``.

        :param grammar: The grammar text.
        :type grammar: str
        :param root_label: Root label handed to every stage.
        :param trace: Trace level handed to every stage.
        :type trace: int
        :raise ValueError: If a rule appears before any stage marker.
        """
        rules = []
        lhs = None
        for line in grammar.split('\n'):
            line = line.strip()

            # New stage begins if the line contains a ':'.
            # NOTE(review): the non-raw literal '\\.' below is the regex
            # '\.' (a literal dot), not "backslash + any character", so
            # a backslash does not protect a colon; any ':' on the line,
            # including one inside a trailing comment, is taken as the
            # stage separator.  Left unchanged because existing grammars
            # depend on the current splitting.
            m = re.match('(?P<nonterminal>(\\.|[^:])*)(:(?P<rule>.*))', line)
            if m:
                # Record the stage that we just completed.
                self._add_stage(rules, lhs, root_label, trace)
                # Start a new stage.
                lhs = m.group('nonterminal').strip()
                rules = []
                line = m.group('rule').strip()

            # Skip blank & comment-only lines
            if line == '' or line.startswith('#'):
                continue

            # Add the rule
            rules.append(RegexpChunkRule.fromstring(line))

        # Record the final stage
        self._add_stage(rules, lhs, root_label, trace)

    def _add_stage(self, rules, lhs, root_label, trace):
        """
        Helper function for __init__: add a new stage to the parser.
        Nothing is added when ``rules`` is empty.

        :param rules: The rules collected for the stage.
        :type rules: list(RegexpChunkRule)
        :param lhs: The stage name, used as the chunk label.
        :type lhs: str or None
        :param root_label: Root label for the stage's parser.
        :param trace: Trace level for the stage's parser.
        :type trace: int
        :raise ValueError: If there are rules but no stage name.
        """
        if not rules:
            return
        if not lhs:
            raise ValueError('Expected stage marker (eg NP:)')
        parser = RegexpChunkParser(
            rules, chunk_label=lhs, root_label=root_label, trace=trace
        )
        self._stages.append(parser)

    def parse(self, chunk_struct, trace=None):
        """
        Apply the chunk parser to this input.  Every stage is applied
        in order, and the whole sequence of stages is repeated ``loop``
        times, each stage receiving the output of the previous one.

        :type chunk_struct: Tree
        :param chunk_struct: the chunk structure to be (further) chunked
        :type trace: int
        :param trace: The level of tracing that should be used when
            parsing a text.  ``0`` will generate no tracing output;
            ``1`` will generate normal tracing output; and ``2`` or
            higher will generate verbose tracing output.  This value
            overrides the trace level value that was given to the
            constructor.
        :return: the chunked output.
        :rtype: Tree
        """
        if trace is None:
            trace = self._trace
        for _ in range(self._loop):
            for parser in self._stages:
                chunk_struct = parser.parse(chunk_struct, trace=trace)
        return chunk_struct

    def __repr__(self):
        """
        :return: a concise string representation of this ``chunk.RegexpParser``.
        :rtype: str
        """
        return "<chunk.RegexpParser with %d stages>" % len(self._stages)

    def __str__(self):
        """
        :return: a verbose string representation of this
            ``RegexpParser``: a header line followed by the verbose
            representation of every stage.
        :rtype: str
        """
        s = "chunk.RegexpParser with %d stages:\n" % len(self._stages)
        for parser in self._stages:
            s += "%s\n" % parser
        # Drop the trailing newline.
        return s[:-1]
1314
1315##//////////////////////////////////////////////////////
1316##  Demonstration code
1317##//////////////////////////////////////////////////////
1318
1319
def demo_eval(chunkparser, text):
    """
    Demonstrate how a chunk parser is evaluated with a ``ChunkScore``.
    ``text`` is expected to hold one sentence per line, each in the
    format accepted by ``chunk.tagstr2tree``.  Every non-blank line is
    echoed, parsed with ``chunkparser`` (tracing switched on), and
    scored against the chunking given in the text.  Afterwards the
    function prints precision, recall and f-measure, followed by up to
    10 missed chunks and up to 10 incorrect chunks.

    :param chunkparser: The chunkparser to be tested
    :type chunkparser: ChunkParserI
    :param text: The chunked tagged text that should be used for
        evaluation.
    :type text: str
    """
    from nltk import chunk
    from nltk.tree import Tree

    def report(heading, fetch):
        # Print at most ten of the chunks returned by ``fetch`` under
        # ``heading``; print nothing at all when there are none.
        if fetch():
            print(heading)
            listed = fetch()
            for item in listed[:10]:
                print('  ', ' '.join(map(str, item)))
            if len(fetch()) > 10:
                print('  ...')

    # Score the parser sentence by sentence.
    scorer = chunk.ChunkScore()

    for raw_line in text.split('\n'):
        print(raw_line)
        stripped = raw_line.strip()
        if stripped:
            gold = chunk.tagstr2tree(stripped)
            # Parse the bare tokens so the parser cannot see the gold
            # chunking.
            guess = chunkparser.parse(Tree('S', gold.leaves()), trace=1)
            scorer.score(gold, guess)
            print()

    # Summary box.
    rule_of_equals = '=' * 75
    print('/' + rule_of_equals + '\\')
    print('Scoring', chunkparser)
    print('-' * 77)
    print('Precision: %5.1f%%' % (scorer.precision() * 100), ' ' * 4, end=' ')
    print('Recall: %5.1f%%' % (scorer.recall() * 100), ' ' * 6, end=' ')
    print('F-Measure: %5.1f%%' % (scorer.f_measure() * 100))

    report('Missed:', scorer.missed)
    report('Incorrect:', scorer.incorrect)

    print('\\' + rule_of_equals + '/')
    print()
1381
1382
def demo():
    """
    Demonstrate the ``RegexpChunkParser`` class.  One small text is
    evaluated with four different grammars via ``demo_eval``; after
    that, accuracy on CoNLL-2000 data is printed for an empty grammar
    and for one of the grammars, and finally a list of tagged tokens is
    parsed directly with a cascaded grammar.
    """

    from nltk import chunk, Tree

    text = """\
    [ the/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] ./.
    [ John/NNP ] saw/VBD [the/DT cats/NNS] [the/DT dog/NN] chased/VBD ./.
    [ John/NNP ] thinks/VBZ [ Mary/NN ] saw/VBD [ the/DT cat/NN ] sit/VB on/IN [ the/DT mat/NN ]./.
    """

    stars = '*' * 75
    print(stars)
    print('Evaluation text:')
    print(text)
    print(stars)
    print()

    # Chunk rules only.
    simple_grammar = r"""
    NP:                   # NP stage
      {<DT>?<JJ>*<NN>}    # chunk determiners, adjectives and nouns
      {<NNP>+}            # chunk proper nouns
    """

    # Chunk everything, chink the unwanted tags, then merge.
    merge_grammar = r"""
    NP:
      {<.*>}              # start by chunking each tag
      }<[\.VI].*>+{       # unchunk any verbs, prepositions or periods
      <DT|JJ>{}<NN.*>     # merge det/adj with nouns
    """

    # Two independent stages.
    two_stage_grammar = r"""
    NP: {<DT>?<JJ>*<NN>}    # chunk determiners, adjectives and nouns
    VP: {<TO>?<VB.*>}       # VP = verb words
    """

    # Cascade: later stages refer to chunks built by earlier ones.
    cascade_grammar = r"""
    NP: {<.*>*}             # start by chunking everything
        }<[\.VI].*>+{       # chink any verbs, prepositions or periods
        <.*>}{<DT>          # separate on determiners
    PP: {<IN><NP>}          # PP = preposition + noun phrase
    VP: {<VB.*><NP|PP>*}    # VP = verb words + NPs and PPs
    """

    for grammar in (
        simple_grammar,
        merge_grammar,
        two_stage_grammar,
        cascade_grammar,
    ):
        demo_eval(chunk.RegexpParser(grammar), text)

    # Evaluation

    from nltk.corpus import conll2000

    print()
    print("Demonstration of empty grammar:")

    empty_parser = chunk.RegexpParser("")
    np_sents = conll2000.chunked_sents('test.txt', chunk_types=('NP',))
    print(chunk.accuracy(empty_parser, np_sents))

    print()
    print("Demonstration of accuracy evaluation using CoNLL tags:")

    merge_parser = chunk.RegexpParser(merge_grammar)
    print(chunk.accuracy(merge_parser, conll2000.chunked_sents('test.txt')[:5]))

    print()
    print("Demonstration of tagged token input")

    cascade_parser = chunk.RegexpParser(cascade_grammar)
    tagged_tokens = [
        ("the", "DT"),
        ("little", "JJ"),
        ("cat", "NN"),
        ("sat", "VBD"),
        ("on", "IN"),
        ("the", "DT"),
        ("mat", "NN"),
        (".", "."),
    ]
    print(cascade_parser.parse(tagged_tokens))
1485
1486
# Script entry point: run the chunker demonstration when this module is
# executed directly rather than imported.
if __name__ == '__main__':
    demo()
1489