'''Default SimpleParse EBNF grammar as a generator with productions

This module defines the original SimpleParse
grammar.  It uses the generator objects directly
as this is the first grammar being written.

Contents:

    SPGenerator -- generator holding the hand-built productions that
        describe the SimpleParse EBNF syntax itself
    declaration -- the same grammar written out in SimpleParse EBNF
    Parser -- parser which turns an EBNF string into a new generator
    SPGrammarProcessor -- processor converting the result tree of an
        EBNF parse into generator definitions
'''
# NOTE: the wildcard imports are kept on purpose.  The element-token
# classes (Name, Literal, Range, ...) and the dispatch helpers
# (dispatch, dispatchList, getString, lines, DispatchProcessor) used
# below are not defined in this file, they arrive through these imports.
from simpleparse.objectgenerator import *
from simpleparse import generator, baseparser
from simpleparse.dispatchprocessor import *

# Python 2/3 compatibility names for unicode handling
try:
    _unichr = unichr
    _unicode = unicode
except NameError:
    _unichr = chr
    _unicode = str

# character sets shared by several productions below
_OCTAL_DIGITS = "01234567"
_HEX_DIGITS = "0123456789abcdefABCDEF"

# note that whitespace is slightly different
# due to a bug with NULL-matching repeating groups
# we make all the ts references ts?
whitespace = Name(value="ts", report=0)
element_token = Name(value="element_token")
literal = Name(value="literal")
group = Name(value="group")
characterrange = Name(value="range")
name = Name(value="name")


SPGenerator = generator.Generator()

SPGenerator.addDefinition(
    "declarationset",
    Name(value="declaration", repeating=1),
)

SPGenerator.addDefinition(
    "declaration",
    SequentialGroup(
        children=[
            whitespace,
            FirstOfGroup(
                children=[
                    Name(value="unreportedname"),
                    Name(value="expandedname"),
                    Name(value="name"),
                ],
            ),
            whitespace,
            Literal(value=":"),
            Literal(value=":", optional=1),
            Literal(value="="),
            Name(value="seq_group"),
        ],
    )
)

SPGenerator.addDefinition(
    "group",
    SequentialGroup(
        children=[
            Literal(value="("),
            Name(value="seq_group"),
            Literal(value=")"),
        ],
        expanded=1,
    )
)

# the alternatives allowed at each position of a sequential group
_seq_children = FirstOfGroup(
    children=[
        Name(value="error_on_fail"),
        Name(value="fo_group"),
        Name(value="element_token"),
    ],
)

SPGenerator.addDefinition(
    "seq_group",
    SequentialGroup(
        children=[
            whitespace,
            _seq_children,
            SequentialGroup(
                children=[
                    whitespace,
                    Name(value="seq_indicator"),
                    whitespace,
                    _seq_children,
                ],
                repeating=1, optional=1,
            ),
            whitespace,
        ],
    ),
)

SPGenerator.addDefinition(
    "fo_group",
    SequentialGroup(
        children=[
            element_token,
            SequentialGroup(
                children=[
                    whitespace,
                    Name(value="fo_indicator"),
                    whitespace,
                    element_token,
                ],
                repeating=1,
            ),
        ],
    )
)
SPGenerator.addDefinition(
    "seq_indicator",
    Literal(value=",", report=0),
)
SPGenerator.addDefinition(
    "fo_indicator",
    Literal(value="/", report=0),
)

SPGenerator.addDefinition(
    "element_token",
    SequentialGroup(
        children=[
            Name(value="lookahead_indicator", optional=1),
            whitespace,
            Name(value="negpos_indicator", optional=1),
            whitespace,
            FirstOfGroup(
                children=[
                    literal,
                    characterrange,
                    group,
                    name,
                ]
            ),
            whitespace,
            Name(value="occurence_indicator", optional=1),
            whitespace,
            Name(value="error_on_fail", optional=1),
        ]
    )
)

SPGenerator.addDefinition(
    "negpos_indicator",
    Range(value="+-")
)
SPGenerator.addDefinition(
    "lookahead_indicator",
    Literal(value="?"),
)

SPGenerator.addDefinition(
    "occurence_indicator",
    Range(value="+*?"),
)
SPGenerator.addDefinition(
    "error_on_fail",
    SequentialGroup(
        children=[
            Literal(value="!"),
            SequentialGroup(
                children=[
                    whitespace,
                    Name(value="literal"),
                ],
                optional=1,
            ),
        ],
    ),
)

SPGenerator.addDefinition(
    "unreportedname",
    SequentialGroup(
        children=[
            Literal(value="<"),
            whitespace,
            name,
            whitespace,
            Literal(value=">"),
        ]
    )
)
SPGenerator.addDefinition(
    "expandedname",
    SequentialGroup(
        children=[
            Literal(value=">"),
            whitespace,
            name,
            whitespace,
            Literal(value="<"),
        ]
    )
)

SPGenerator.addDefinition(
    "name",  # [a-zA-Z_],[a-zA-Z0-9_]*
    SequentialGroup(
        children=[
            Range(value='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_'),
            Range(
                value='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789',
                optional=1, repeating=1,
            ),
        ]
    )
)

SPGenerator.addDefinition(
    "ts",  # ( [ \011-\015]+ / comment )*
    FirstOfGroup(
        children=[
            Range(value=' \011\012\013\014\015', repeating=1),
            Name(value="comment"),
        ],
        repeating=1, optional=1,
    )
)
SPGenerator.addDefinition(
    "comment",  # '#',-'\n'*,'\n'
    SequentialGroup(
        children=[
            Literal(value="#"),
            Literal(value="\n", negative=1, repeating=1, optional=1),
            Literal(value="\n"),
        ],
    ),
)

SPGenerator.addDefinition(
    "literalDecorator",  # literalDecorator := [c]
    Range(value='c')
)

SPGenerator.addDefinition(
    "literal",  # literalDecorator?,("'",(CHARNOSNGLQUOTE/ESCAPEDCHAR)*,"'") / ('"',(CHARNODBLQUOTE/ESCAPEDCHAR)*,'"')
    SequentialGroup(
        children=[
            Name(value='literalDecorator', optional=1),
            FirstOfGroup(
                children=[
                    SequentialGroup(
                        children=[
                            Literal(value="'"),
                            FirstOfGroup(
                                children=[
                                    Name(value="CHARNOSNGLQUOTE"),
                                    Name(value="ESCAPEDCHAR"),
                                ],
                                optional=1, repeating=1,
                            ),
                            Literal(value="'"),
                        ],
                    ),
                    SequentialGroup(
                        children=[
                            Literal(value='"'),
                            FirstOfGroup(
                                children=[
                                    Name(value="CHARNODBLQUOTE"),
                                    Name(value="ESCAPEDCHAR"),
                                ],
                                optional=1, repeating=1,
                            ),
                            Literal(value='"'),
                        ],
                    )
                ],
            ),
        ],
    )
)

SPGenerator.addDefinition(
    "range",  # '[',CHARBRACE?,CHARDASH?, (CHARRANGE/CHARNOBRACE)*, CHARDASH?,']'
    SequentialGroup(
        children=[
            Literal(value="["),
            Name(value="CHARBRACE", optional=1),
            Name(value="CHARDASH", optional=1),
            FirstOfGroup(
                children=[
                    Name(value="CHARRANGE"),
                    Name(value="CHARNOBRACE"),
                ],
                optional=1, repeating=1,
            ),
            Name(value="CHARDASH", optional=1),
            Literal(value="]"),
        ],
    )
)
SPGenerator.addDefinition(
    "CHARBRACE",
    Literal(value="]"),
)
SPGenerator.addDefinition(
    "CHARDASH",
    Literal(value="-"),
)
SPGenerator.addDefinition(
    "CHARRANGE",  # CHARRANGE := CHARNOBRACE, '-', CHARNOBRACE
    SequentialGroup(
        children=[
            Name(value="CHARNOBRACE"),
            Literal(value="-"),
            Name(value="CHARNOBRACE"),
        ],
    ),
)
SPGenerator.addDefinition(
    "CHARNOBRACE",  # CHARNOBRACE := ESCAPEDCHAR/CHAR
    FirstOfGroup(
        children=[
            Name(value="ESCAPEDCHAR"),
            Name(value="CHAR"),
        ],
    ),
)
SPGenerator.addDefinition(
    "CHAR",  # CHAR := -[]]
    Literal(
        value="]",
        negative=1,
    ),
)

SPGenerator.addDefinition(
    "ESCAPEDCHAR",  # '\\',( SPECIALESCAPEDCHAR / ([xX],HEXESCAPEDCHAR) / OCTALESCAPEDCHAR / ([uU],UNICODEESCAPEDCHAR) )
    SequentialGroup(
        children=[
            Literal(value="\\"),
            FirstOfGroup(
                children=[
                    Name(value="SPECIALESCAPEDCHAR"),
                    SequentialGroup(
                        children=[
                            Range(value='xX'),
                            Name(value="HEXESCAPEDCHAR"),
                        ]
                    ),
                    Name(value="OCTALESCAPEDCHAR"),
                    SequentialGroup(
                        children=[
                            Range(value='uU'),
                            Name(value='UNICODEESCAPEDCHAR'),
                        ],
                    ),
                ],
            ),
        ],
    )
)

SPGenerator.addDefinition(
    "SPECIALESCAPEDCHAR",
    Range(value='\\abfnrtv"\''),
)

SPGenerator.addDefinition(
    "OCTALESCAPEDCHAR",  # [0-7],[0-7]?,[0-7]?
    SequentialGroup(
        children=[
            Range(value=_OCTAL_DIGITS),
            Range(value=_OCTAL_DIGITS, optional=1),
            Range(value=_OCTAL_DIGITS, optional=1),
        ],
    )
)
SPGenerator.addDefinition(
    "HEXESCAPEDCHAR",  # [0-9a-fA-F],[0-9a-fA-F]
    SequentialGroup(
        children=[
            Range(value=_HEX_DIGITS),
            Range(value=_HEX_DIGITS),
        ],
    )
)
SPGenerator.addDefinition(
    "UNICODEESCAPEDCHAR",  # four hex digits, optionally followed by four more
    SequentialGroup(
        children=[
            Range(value=_HEX_DIGITS),
            Range(value=_HEX_DIGITS),
            Range(value=_HEX_DIGITS),
            Range(value=_HEX_DIGITS),
            SequentialGroup(
                children=[
                    Range(value=_HEX_DIGITS),
                    Range(value=_HEX_DIGITS),
                    Range(value=_HEX_DIGITS),
                    Range(value=_HEX_DIGITS),
                ],
                optional=True,
            )
        ]
    )
)

SPGenerator.addDefinition(
    "CHARNODBLQUOTE",  # -[\\"]+
    Range(value='\\"', negative=1, repeating=1),
)
SPGenerator.addDefinition(
    "CHARNOSNGLQUOTE",  # -[\\']+
    Range(value="\\'", negative=1, repeating=1),
)

# The same grammar, written in SimpleParse EBNF.
declaration = r"""declarationset := declaration+
declaration := ts, (unreportedname/expandedname/name) ,ts,':',':'?,'=',seq_group

element_token := lookahead_indicator?, ts, negpos_indicator?,ts, (literal/range/group/name),ts, occurence_indicator?, ts, error_on_fail?

negpos_indicator := [-+]
lookahead_indicator := "?"
occurence_indicator := [+*?]
error_on_fail := "!", (ts,literal)?

>group< := '(',seq_group, ')'
seq_group := ts,(error_on_fail/fo_group/element_token),
    (ts, seq_indicator, ts,
    (error_on_fail/fo_group/element_token)
    )*, ts

fo_group := element_token, (ts, fo_indicator, ts, element_token)+


# following two are likely something peoples might want to
# replace in many instances...
<fo_indicator> := "/"
<seq_indicator> := ','

unreportedname := '<', name, '>'
expandedname := '>', name, '<'
name := [a-zA-Z_],[a-zA-Z0-9_]*
<ts> := ( [ \011-\015]+ / comment )*
comment := '#',-'\n'*,'\n'
literal := literalDecorator?,("'",(CHARNOSNGLQUOTE/ESCAPEDCHAR)*,"'") / ('"',(CHARNODBLQUOTE/ESCAPEDCHAR)*,'"')
literalDecorator := [c]



range := '[',CHARBRACE?,CHARDASH?, (CHARRANGE/CHARNOBRACE)*, CHARDASH?,']'
CHARBRACE := ']'
CHARDASH := '-'
CHARRANGE := CHARNOBRACE, '-', CHARNOBRACE
CHARNOBRACE := ESCAPEDCHAR/CHAR
CHAR := -[]]
ESCAPEDCHAR := '\\',( SPECIALESCAPEDCHAR / ('x',HEXESCAPEDCHAR) / ([uU],UNICODEESCAPEDCHAR) / OCTALESCAPEDCHAR )
SPECIALESCAPEDCHAR := [\\abfnrtv"']
OCTALESCAPEDCHAR := [0-7],[0-7]?,[0-7]?
HEXESCAPEDCHAR := [0-9a-fA-F],[0-9a-fA-F]
CHARNODBLQUOTE := -[\\"]+
CHARNOSNGLQUOTE := -[\\']+
UNICODEESCAPEDCHAR := [0-9a-fA-F],[0-9a-fA-F],[0-9a-fA-F],[0-9a-fA-F],([0-9a-fA-F],[0-9a-fA-F],[0-9a-fA-F],[0-9a-fA-F])?
"""


### Now the interpreter objects...
class Parser(baseparser.BaseParser):
    """Parser which generates new parsers from EBNF grammars

    This parser class allows you to pass in an EBNF grammar as
    the initialisation parameter.  The EBNF is processed, and a
    SimpleParse generator object is created as self.generator.

    Unlike most Parsers, this object is intended to be re-created
    for each bit of data it parses (i.e. each EBNF), so it warps
    the standard API a lot.
    """
    _rootProduction = 'declarationset'

    def __init__(self, ebnf, prebuilts=(), methodSource=None, definitionSources=()):
        """Create a new generator based on the EBNF in simpleparse format

        ebnf -- the grammar text to parse
        prebuilts -- sequence of (name, table) pairs handed to the
            SPGrammarProcessor
        methodSource -- accepted for API compatibility, not used here
        definitionSources -- sequence of definition sources handed to
            the SPGrammarProcessor

        Raises ValueError if the grammar text cannot be parsed all
        the way to its end.
        """
        processor = SPGrammarProcessor(prebuilts, definitionSources)
        success, tags, nextPosition = self.parse(
            ebnf, self._rootProduction, processor=processor
        )
        if nextPosition != len(ebnf):
            # parsing stopped early, report where and show what is left
            lineNumber = lines(0, nextPosition, ebnf)
            raise ValueError(
                """Unable to complete parsing of the EBNF, stopped at line %s (%s chars of %s)
Unparsed:\n%s...""" % (lineNumber, nextPosition, len(ebnf), ebnf[nextPosition:nextPosition + 100])
            )
        self.generator = processor.generator

    def buildTagger(self, name=None, processor=None):
        """Build the tag-table for parsing the EBNF for this parser"""
        return SPGenerator.buildParser(name, processor)


class SPGrammarProcessor(DispatchProcessor):
    """Processing object for post-processing an EBNF into a new generator

    Each method named after a production of the grammar receives the
    result tuple (tag, left, right, sublist) for a match of that
    production together with the buffer that was parsed.
    """
    ### top level
    def __init__(self, prebuilts=(), definitionSources=()):
        """Create the target generator and seed it

        prebuilts -- sequence of (name, table) pairs; ElementToken
            instances are registered as-is, anything else is wrapped
            in a Prebuilt element
        definitionSources -- sequence of sources registered with the
            generator via addDefinitionSource
        """
        self.generator = generator.Generator()
        for (name, table) in prebuilts:
            if isinstance(table, ElementToken):
                self.generator.addDefinition(name, table)
            else:
                self.generator.addDefinition(name, Prebuilt(value=table))
        for source in definitionSources:
            self.generator.addDefinitionSource(source)

    def declaration(self, info, buffer):
        '''Base declaration from the grammar, a "production" or "rule"

        Works out the production name and its report/expanded flags
        from the form of the name (<name>, >name< or plain name),
        builds the content element and adds it to self.generator.
        '''
        (tag, left, right, sublist) = info
        name = sublist[0]
        expanded = 0
        if name[0] == "unreportedname":
            name = name[3][0]
            # note that the info is stored in the wrong place :(
            report = 0
        elif name[0] == 'expandedname':
            report = 1
            expanded = 1
            name = name[3][0]
        else:
            report = 1
        name = getString(name, buffer)
        # made available to _config_error_on_fail while the content
        # of this production is being processed
        self.currentProduction = name
        content = dispatch(self, sublist[1], buffer)
        content.report = report
        content.expanded = expanded
        self.generator.addDefinition(
            name,
            content,
        )
        del self.currentProduction

    ### element configuration
    def element_token(self, info, buffer):
        '''get the children, then configure

        Every child is dispatched; the indicator children supply the
        flags and the one remaining child is the element itself, which
        is configured with those flags and returned.
        '''
        (tag, left, right, sublist) = info
        base = None
        negative = 0
        optional = 0
        repeating = 0
        lookahead = 0
        errorOnFail = None
        for tup in sublist:
            result = dispatch(self, tup, buffer)
            if tup[0] == 'negpos_indicator':
                negative = result
            elif tup[0] == 'occurence_indicator':
                optional, repeating = result
            elif tup[0] == 'lookahead_indicator':
                lookahead = result
            elif tup[0] == 'error_on_fail':
                # we do some extra work here: the "expected" text runs
                # from the start of this element token up to the start
                # of the error_on_fail marker
                errorOnFail = result
                self._config_error_on_fail(errorOnFail, (tag, left, tup[1], []), buffer)
            else:
                base = result
        base.optional = optional
        base.negative = negative
        base.repeating = repeating
        base.lookahead = lookahead
        if errorOnFail:
            base.errorOnFail = errorOnFail
        return base

    ### generator-node-builders
    def seq_group(self, info, buffer):
        """Process a sequential-group into a SequentialGroup element token

        Once an ErrorOnFail child has been seen, every following item
        gets its own configured copy of it.  A group with a single
        item returns that item directly.

        Raises ValueError if the group has no element-token child.
        """
        (tag, left, right, sublist) = info
        children = dispatchList(self, sublist, buffer)
        errorOnFail = None
        result = []
        for (item, tup) in zip(children, sublist):
            if isinstance(item, ErrorOnFail):
                errorOnFail = item
            else:
                if errorOnFail:
                    item.errorOnFail = errorOnFail.copy()
                    self._config_error_on_fail(
                        item.errorOnFail,
                        tup,
                        buffer
                    )
                result.append(item)
        if len(result) == 1:
            # single-item sequential group (very common)
            return result[0]
        elif not result:
            raise ValueError("""SequentialGroup on line %s doesn't have an element-token child! grammar was %s""" % (lines(0, left, buffer), buffer[left:left + 25]))
        base = SequentialGroup(
            children=result,
        )
        return base

    def fo_group(self, info, buffer):
        """Process a first-of-group into a FirstOf element token"""
        (tag, left, right, sublist) = info
        children = dispatchList(self, sublist, buffer)
        if len(children) == 1:
            # this should never happen, but if it does, we can deal with it I suppose...
            return children[0]
        base = FirstOfGroup(
            children=children
        )
        return base

    def literal(self, info, buffer):
        '''Turn a literal result into a literal generator

        A leading literalDecorator child selects CILiteral instead of
        Literal; the remaining children are joined into the value.
        '''
        (tag, left, right, sublist) = info
        if sublist and sublist[0][0] == 'literalDecorator':
            # right now only have the one decorator...
            sublist = sublist[1:]
            classObject = CILiteral
        else:
            classObject = Literal
        elements = dispatchList(self, sublist, buffer)
        ### Should check for CILiteral with non-CI string or single-character value!
        return classObject(value="".join(elements))

    def range(self, info, buffer):
        """Turn a range result into a Range element token

        The value is the concatenation of the dispatched children.
        """
        # NOTE: a CharSet-based alternative for Range implementations
        # without requiresExpandedSet was sketched here at one point
        # but was never implemented.
        (tag, left, right, sublist) = info
        return Range(
            value=''.join(dispatchList(self, sublist, buffer)),
        )

    def name(self, tup, buffer):
        """Turn a name result into a Name reference element token"""
        return Name(
            value=getString(tup, buffer),
        )

    ### simple translators
    # indicator character -> (optional, repeating)
    occurenceIndicatorMap = {
        '*': (1, 1),
        '+': (0, 1),
        '?': (1, 0),
    }

    def occurence_indicator(self, tup, buffer):
        '''Return optional, repeating as a tuple of true/false values'''
        value = getString(tup, buffer)
        return self.occurenceIndicatorMap[value]

    def lookahead_indicator(self, tup, buffer):
        """If present, the lookahead indicator just says "yes", so just return 1"""
        return 1

    def error_on_fail(self, info, buffer):
        """If present, we are going to make the current object an errorOnFail type,

        If there's a string literal child, then we use it to create the
        "message" attribute of the errorOnFail object.
        """
        (tag, left, right, children) = info
        err = ErrorOnFail()
        if children:
            (tag, left, right, children) = children[0]
            if children and children[0][0] == 'literalDecorator':
                # the decorator is not part of the message text and
                # has no handler of its own, so drop it the same way
                # literal() does
                children = children[1:]
            message = "".join(dispatchList(self, children, buffer))
            err.message = message
        return err

    def _config_error_on_fail(self, errorOnFail, tup, buffer):
        """Configure an error-on-fail instance for a given child tuple"""
        # what we expected to find...
        errorOnFail.expected = buffer[tup[1]:tup[2]]
        if hasattr(self, "currentProduction"):
            errorOnFail.production = self.currentProduction

    # indicator character -> negative flag
    negposIndicatorMap = {
        '+': 0,
        '-': 1,
    }

    def negpos_indicator(self, tup, buffer):
        '''return whether indicates negative'''
        value = getString(tup, buffer)
        return self.negposIndicatorMap[value]

    def CHARNODBLQUOTE(self, tup, buffer):
        """Return the matched text unchanged"""
        return getString(tup, buffer)
    CHAR = CHARNOSNGLQUOTE = CHARNODBLQUOTE

    def ESCAPEDCHAR(self, info, buffer):
        """Return the decoded character(s) produced by the children"""
        (tag, left, right, sublist) = info
        return "".join(dispatchList(self, sublist, buffer))

    # escape letter -> character it stands for
    specialescapedmap = {
        'a': '\a',
        'b': '\b',
        'f': '\f',
        'n': '\n',
        'r': '\r',
        't': '\t',
        'v': '\v',
        '\\': '\\',
        '"': '"',
        "'": "'",
    }

    def SPECIALESCAPEDCHAR(self, tup, buffer):
        """Translate a single-letter escape via specialescapedmap"""
        return self.specialescapedmap[getString(tup, buffer)]

    def OCTALESCAPEDCHAR(self, tup, buffer):
        """Decode the matched octal digits into a character"""
        return chr(int(getString(tup, buffer), 8))

    def HEXESCAPEDCHAR(self, tup, buffer):
        """Decode the matched hex digits into a character"""
        return chr(int(getString(tup, buffer), 16))

    def CHARNOBRACE(self, info, buffer):
        """Return the character produced by the single child"""
        (tag, left, right, sublist) = info
        return "".join(dispatchList(self, sublist, buffer))

    def CHARRANGE(self, info, buffer):
        '''Create a string from first to second item

        The end points are swapped if given in descending order.
        Raises ValueError if exactly one end point is a unicode value.
        '''
        (tag, left, right, sublist) = info
        first, second = dispatchList(self, sublist, buffer)
        if second < first:
            second, first = first, second
        if isinstance(first, _unicode) or isinstance(second, _unicode):
            _chr = _unichr
            if not (isinstance(second, _unicode) and isinstance(first, _unicode)):
                raise ValueError('Range %s uses one unicode and one string escape, cannot mix' % (buffer[left:right]))
        else:
            _chr = chr
        first, second = list(map(ord, (first, second)))
        return u''.join([_chr(u) for u in range(first, second + 1)])

    def CHARDASH(self, tup, buffer):
        """A literal dash inside a range"""
        return '-'

    def CHARBRACE(self, tup, buffer):
        """A literal closing bracket inside a range"""
        return ']'

    def UNICODEESCAPEDCHAR(self, info, buffer):
        """Decode a unicode-escaped hex character into a character value"""
        (tag, left, right, sublist) = info
        char = _unichr(int(buffer[left:right], 16))
        return char