1# Natural Language Toolkit: Shift-Reduce Parser 2# 3# Copyright (C) 2001-2019 NLTK Project 4# Author: Edward Loper <edloper@gmail.com> 5# Steven Bird <stevenbird1@gmail.com> 6# URL: <http://nltk.org/> 7# For license information, see LICENSE.TXT 8from __future__ import print_function, unicode_literals 9 10from nltk.grammar import Nonterminal 11from nltk.tree import Tree 12from nltk.compat import unicode_repr 13 14from nltk.parse.api import ParserI 15 16##////////////////////////////////////////////////////// 17## Shift/Reduce Parser 18##////////////////////////////////////////////////////// 19class ShiftReduceParser(ParserI): 20 """ 21 A simple bottom-up CFG parser that uses two operations, "shift" 22 and "reduce", to find a single parse for a text. 23 24 ``ShiftReduceParser`` maintains a stack, which records the 25 structure of a portion of the text. This stack is a list of 26 strings and Trees that collectively cover a portion of 27 the text. For example, while parsing the sentence "the dog saw 28 the man" with a typical grammar, ``ShiftReduceParser`` will produce 29 the following stack, which covers "the dog saw":: 30 31 [(NP: (Det: 'the') (N: 'dog')), (V: 'saw')] 32 33 ``ShiftReduceParser`` attempts to extend the stack to cover the 34 entire text, and to combine the stack elements into a single tree, 35 producing a complete parse for the sentence. 36 37 Initially, the stack is empty. It is extended to cover the text, 38 from left to right, by repeatedly applying two operations: 39 40 - "shift" moves a token from the beginning of the text to the 41 end of the stack. 42 - "reduce" uses a CFG production to combine the rightmost stack 43 elements into a single Tree. 44 45 Often, more than one operation can be performed on a given stack. 46 In this case, ``ShiftReduceParser`` uses the following heuristics 47 to decide which operation to perform: 48 49 - Only shift if no reductions are available. 50 - If multiple reductions are available, then apply the reduction 51 whose CFG production is listed earliest in the grammar. 52 53 Note that these heuristics are not guaranteed to choose an 54 operation that leads to a parse of the text. Also, if multiple 55 parses exists, ``ShiftReduceParser`` will return at most one of 56 them. 57 58 :see: ``nltk.grammar`` 59 """ 60 61 def __init__(self, grammar, trace=0): 62 """ 63 Create a new ``ShiftReduceParser``, that uses ``grammar`` to 64 parse texts. 65 66 :type grammar: Grammar 67 :param grammar: The grammar used to parse texts. 68 :type trace: int 69 :param trace: The level of tracing that should be used when 70 parsing a text. ``0`` will generate no tracing output; 71 and higher numbers will produce more verbose tracing 72 output. 73 """ 74 self._grammar = grammar 75 self._trace = trace 76 self._check_grammar() 77 78 def grammar(self): 79 return self._grammar 80 81 def parse(self, tokens): 82 tokens = list(tokens) 83 self._grammar.check_coverage(tokens) 84 85 # initialize the stack. 86 stack = [] 87 remaining_text = tokens 88 89 # Trace output. 90 if self._trace: 91 print('Parsing %r' % " ".join(tokens)) 92 self._trace_stack(stack, remaining_text) 93 94 # iterate through the text, pushing the token onto 95 # the stack, then reducing the stack. 96 while len(remaining_text) > 0: 97 self._shift(stack, remaining_text) 98 while self._reduce(stack, remaining_text): 99 pass 100 101 # Did we reduce everything? 102 if len(stack) == 1: 103 # Did we end up with the right category? 104 if stack[0].label() == self._grammar.start().symbol(): 105 yield stack[0] 106 107 def _shift(self, stack, remaining_text): 108 """ 109 Move a token from the beginning of ``remaining_text`` to the 110 end of ``stack``. 111 112 :type stack: list(str and Tree) 113 :param stack: A list of strings and Trees, encoding 114 the structure of the text that has been parsed so far. 115 :type remaining_text: list(str) 116 :param remaining_text: The portion of the text that is not yet 117 covered by ``stack``. 118 :rtype: None 119 """ 120 stack.append(remaining_text[0]) 121 remaining_text.remove(remaining_text[0]) 122 if self._trace: 123 self._trace_shift(stack, remaining_text) 124 125 def _match_rhs(self, rhs, rightmost_stack): 126 """ 127 :rtype: bool 128 :return: true if the right hand side of a CFG production 129 matches the rightmost elements of the stack. ``rhs`` 130 matches ``rightmost_stack`` if they are the same length, 131 and each element of ``rhs`` matches the corresponding 132 element of ``rightmost_stack``. A nonterminal element of 133 ``rhs`` matches any Tree whose node value is equal 134 to the nonterminal's symbol. A terminal element of ``rhs`` 135 matches any string whose type is equal to the terminal. 136 :type rhs: list(terminal and Nonterminal) 137 :param rhs: The right hand side of a CFG production. 138 :type rightmost_stack: list(string and Tree) 139 :param rightmost_stack: The rightmost elements of the parser's 140 stack. 141 """ 142 143 if len(rightmost_stack) != len(rhs): 144 return False 145 for i in range(len(rightmost_stack)): 146 if isinstance(rightmost_stack[i], Tree): 147 if not isinstance(rhs[i], Nonterminal): 148 return False 149 if rightmost_stack[i].label() != rhs[i].symbol(): 150 return False 151 else: 152 if isinstance(rhs[i], Nonterminal): 153 return False 154 if rightmost_stack[i] != rhs[i]: 155 return False 156 return True 157 158 def _reduce(self, stack, remaining_text, production=None): 159 """ 160 Find a CFG production whose right hand side matches the 161 rightmost stack elements; and combine those stack elements 162 into a single Tree, with the node specified by the 163 production's left-hand side. If more than one CFG production 164 matches the stack, then use the production that is listed 165 earliest in the grammar. The new Tree replaces the 166 elements in the stack. 167 168 :rtype: Production or None 169 :return: If a reduction is performed, then return the CFG 170 production that the reduction is based on; otherwise, 171 return false. 172 :type stack: list(string and Tree) 173 :param stack: A list of strings and Trees, encoding 174 the structure of the text that has been parsed so far. 175 :type remaining_text: list(str) 176 :param remaining_text: The portion of the text that is not yet 177 covered by ``stack``. 178 """ 179 if production is None: 180 productions = self._grammar.productions() 181 else: 182 productions = [production] 183 184 # Try each production, in order. 185 for production in productions: 186 rhslen = len(production.rhs()) 187 188 # check if the RHS of a production matches the top of the stack 189 if self._match_rhs(production.rhs(), stack[-rhslen:]): 190 191 # combine the tree to reflect the reduction 192 tree = Tree(production.lhs().symbol(), stack[-rhslen:]) 193 stack[-rhslen:] = [tree] 194 195 # We reduced something 196 if self._trace: 197 self._trace_reduce(stack, production, remaining_text) 198 return production 199 200 # We didn't reduce anything 201 return None 202 203 def trace(self, trace=2): 204 """ 205 Set the level of tracing output that should be generated when 206 parsing a text. 207 208 :type trace: int 209 :param trace: The trace level. A trace level of ``0`` will 210 generate no tracing output; and higher trace levels will 211 produce more verbose tracing output. 212 :rtype: None 213 """ 214 # 1: just show shifts. 215 # 2: show shifts & reduces 216 # 3: display which tokens & productions are shifed/reduced 217 self._trace = trace 218 219 def _trace_stack(self, stack, remaining_text, marker=' '): 220 """ 221 Print trace output displaying the given stack and text. 222 223 :rtype: None 224 :param marker: A character that is printed to the left of the 225 stack. This is used with trace level 2 to print 'S' 226 before shifted stacks and 'R' before reduced stacks. 227 """ 228 s = ' ' + marker + ' [ ' 229 for elt in stack: 230 if isinstance(elt, Tree): 231 s += unicode_repr(Nonterminal(elt.label())) + ' ' 232 else: 233 s += unicode_repr(elt) + ' ' 234 s += '* ' + ' '.join(remaining_text) + ']' 235 print(s) 236 237 def _trace_shift(self, stack, remaining_text): 238 """ 239 Print trace output displaying that a token has been shifted. 240 241 :rtype: None 242 """ 243 if self._trace > 2: 244 print('Shift %r:' % stack[-1]) 245 if self._trace == 2: 246 self._trace_stack(stack, remaining_text, 'S') 247 elif self._trace > 0: 248 self._trace_stack(stack, remaining_text) 249 250 def _trace_reduce(self, stack, production, remaining_text): 251 """ 252 Print trace output displaying that ``production`` was used to 253 reduce ``stack``. 254 255 :rtype: None 256 """ 257 if self._trace > 2: 258 rhs = " ".join(production.rhs()) 259 print('Reduce %r <- %s' % (production.lhs(), rhs)) 260 if self._trace == 2: 261 self._trace_stack(stack, remaining_text, 'R') 262 elif self._trace > 1: 263 self._trace_stack(stack, remaining_text) 264 265 def _check_grammar(self): 266 """ 267 Check to make sure that all of the CFG productions are 268 potentially useful. If any productions can never be used, 269 then print a warning. 270 271 :rtype: None 272 """ 273 productions = self._grammar.productions() 274 275 # Any production whose RHS is an extension of another production's RHS 276 # will never be used. 277 for i in range(len(productions)): 278 for j in range(i + 1, len(productions)): 279 rhs1 = productions[i].rhs() 280 rhs2 = productions[j].rhs() 281 if rhs1[: len(rhs2)] == rhs2: 282 print('Warning: %r will never be used' % productions[i]) 283 284 285##////////////////////////////////////////////////////// 286## Stepping Shift/Reduce Parser 287##////////////////////////////////////////////////////// 288class SteppingShiftReduceParser(ShiftReduceParser): 289 """ 290 A ``ShiftReduceParser`` that allows you to setp through the parsing 291 process, performing a single operation at a time. It also allows 292 you to change the parser's grammar midway through parsing a text. 293 294 The ``initialize`` method is used to start parsing a text. 295 ``shift`` performs a single shift operation, and ``reduce`` performs 296 a single reduce operation. ``step`` will perform a single reduce 297 operation if possible; otherwise, it will perform a single shift 298 operation. ``parses`` returns the set of parses that have been 299 found by the parser. 300 301 :ivar _history: A list of ``(stack, remaining_text)`` pairs, 302 containing all of the previous states of the parser. This 303 history is used to implement the ``undo`` operation. 304 :see: ``nltk.grammar`` 305 """ 306 307 def __init__(self, grammar, trace=0): 308 super(SteppingShiftReduceParser, self).__init__(grammar, trace) 309 self._stack = None 310 self._remaining_text = None 311 self._history = [] 312 313 def parse(self, tokens): 314 tokens = list(tokens) 315 self.initialize(tokens) 316 while self.step(): 317 pass 318 return self.parses() 319 320 def stack(self): 321 """ 322 :return: The parser's stack. 323 :rtype: list(str and Tree) 324 """ 325 return self._stack 326 327 def remaining_text(self): 328 """ 329 :return: The portion of the text that is not yet covered by the 330 stack. 331 :rtype: list(str) 332 """ 333 return self._remaining_text 334 335 def initialize(self, tokens): 336 """ 337 Start parsing a given text. This sets the parser's stack to 338 ``[]`` and sets its remaining text to ``tokens``. 339 """ 340 self._stack = [] 341 self._remaining_text = tokens 342 self._history = [] 343 344 def step(self): 345 """ 346 Perform a single parsing operation. If a reduction is 347 possible, then perform that reduction, and return the 348 production that it is based on. Otherwise, if a shift is 349 possible, then perform it, and return True. Otherwise, 350 return False. 351 352 :return: False if no operation was performed; True if a shift was 353 performed; and the CFG production used to reduce if a 354 reduction was performed. 355 :rtype: Production or bool 356 """ 357 return self.reduce() or self.shift() 358 359 def shift(self): 360 """ 361 Move a token from the beginning of the remaining text to the 362 end of the stack. If there are no more tokens in the 363 remaining text, then do nothing. 364 365 :return: True if the shift operation was successful. 366 :rtype: bool 367 """ 368 if len(self._remaining_text) == 0: 369 return False 370 self._history.append((self._stack[:], self._remaining_text[:])) 371 self._shift(self._stack, self._remaining_text) 372 return True 373 374 def reduce(self, production=None): 375 """ 376 Use ``production`` to combine the rightmost stack elements into 377 a single Tree. If ``production`` does not match the 378 rightmost stack elements, then do nothing. 379 380 :return: The production used to reduce the stack, if a 381 reduction was performed. If no reduction was performed, 382 return None. 383 384 :rtype: Production or None 385 """ 386 self._history.append((self._stack[:], self._remaining_text[:])) 387 return_val = self._reduce(self._stack, self._remaining_text, production) 388 389 if not return_val: 390 self._history.pop() 391 return return_val 392 393 def undo(self): 394 """ 395 Return the parser to its state before the most recent 396 shift or reduce operation. Calling ``undo`` repeatedly return 397 the parser to successively earlier states. If no shift or 398 reduce operations have been performed, ``undo`` will make no 399 changes. 400 401 :return: true if an operation was successfully undone. 402 :rtype: bool 403 """ 404 if len(self._history) == 0: 405 return False 406 (self._stack, self._remaining_text) = self._history.pop() 407 return True 408 409 def reducible_productions(self): 410 """ 411 :return: A list of the productions for which reductions are 412 available for the current parser state. 413 :rtype: list(Production) 414 """ 415 productions = [] 416 for production in self._grammar.productions(): 417 rhslen = len(production.rhs()) 418 if self._match_rhs(production.rhs(), self._stack[-rhslen:]): 419 productions.append(production) 420 return productions 421 422 def parses(self): 423 """ 424 :return: An iterator of the parses that have been found by this 425 parser so far. 426 :rtype: iter(Tree) 427 """ 428 if ( 429 len(self._remaining_text) == 0 430 and len(self._stack) == 1 431 and self._stack[0].label() == self._grammar.start().symbol() 432 ): 433 yield self._stack[0] 434 435 # copied from nltk.parser 436 437 def set_grammar(self, grammar): 438 """ 439 Change the grammar used to parse texts. 440 441 :param grammar: The new grammar. 442 :type grammar: CFG 443 """ 444 self._grammar = grammar 445 446 447##////////////////////////////////////////////////////// 448## Demonstration Code 449##////////////////////////////////////////////////////// 450 451 452def demo(): 453 """ 454 A demonstration of the shift-reduce parser. 455 """ 456 457 from nltk import parse, CFG 458 459 grammar = CFG.fromstring( 460 """ 461 S -> NP VP 462 NP -> Det N | Det N PP 463 VP -> V NP | V NP PP 464 PP -> P NP 465 NP -> 'I' 466 N -> 'man' | 'park' | 'telescope' | 'dog' 467 Det -> 'the' | 'a' 468 P -> 'in' | 'with' 469 V -> 'saw' 470 """ 471 ) 472 473 sent = 'I saw a man in the park'.split() 474 475 parser = parse.ShiftReduceParser(grammar, trace=2) 476 for p in parser.parse(sent): 477 print(p) 478 479 480if __name__ == '__main__': 481 demo() 482