1#======================================================================= 2# 3# Python Lexical Analyser 4# 5# Regular Expressions 6# 7#======================================================================= 8 9import array 10import string 11import types 12from sys import maxint 13 14import Errors 15 16# 17# Constants 18# 19 20BOL = 'bol' 21EOL = 'eol' 22EOF = 'eof' 23 24nl_code = ord('\n') 25 26# 27# Helper functions 28# 29 30def chars_to_ranges(s): 31 """ 32 Return a list of character codes consisting of pairs 33 [code1a, code1b, code2a, code2b,...] which cover all 34 the characters in |s|. 35 """ 36 char_list = list(s) 37 char_list.sort() 38 i = 0 39 n = len(char_list) 40 result = [] 41 while i < n: 42 code1 = ord(char_list[i]) 43 code2 = code1 + 1 44 i = i + 1 45 while i < n and code2 >= ord(char_list[i]): 46 code2 = code2 + 1 47 i = i + 1 48 result.append(code1) 49 result.append(code2) 50 return result 51 52def uppercase_range(code1, code2): 53 """ 54 If the range of characters from code1 to code2-1 includes any 55 lower case letters, return the corresponding upper case range. 56 """ 57 code3 = max(code1, ord('a')) 58 code4 = min(code2, ord('z') + 1) 59 if code3 < code4: 60 d = ord('A') - ord('a') 61 return (code3 + d, code4 + d) 62 else: 63 return None 64 65def lowercase_range(code1, code2): 66 """ 67 If the range of characters from code1 to code2-1 includes any 68 upper case letters, return the corresponding lower case range. 69 """ 70 code3 = max(code1, ord('A')) 71 code4 = min(code2, ord('Z') + 1) 72 if code3 < code4: 73 d = ord('a') - ord('A') 74 return (code3 + d, code4 + d) 75 else: 76 return None 77 78def CodeRanges(code_list): 79 """ 80 Given a list of codes as returned by chars_to_ranges, return 81 an RE which will match a character in any of the ranges. 82 """ 83 re_list = [] 84 for i in xrange(0, len(code_list), 2): 85 re_list.append(CodeRange(code_list[i], code_list[i + 1])) 86 return apply(Alt, tuple(re_list)) 87 88def CodeRange(code1, code2): 89 """ 90 CodeRange(code1, code2) is an RE which matches any character 91 with a code |c| in the range |code1| <= |c| < |code2|. 92 """ 93 if code1 <= nl_code < code2: 94 return Alt(RawCodeRange(code1, nl_code), 95 RawNewline, 96 RawCodeRange(nl_code + 1, code2)) 97 else: 98 return RawCodeRange(code1, code2) 99 100# 101# Abstract classes 102# 103 104class RE: 105 """RE is the base class for regular expression constructors. 106 The following operators are defined on REs: 107 108 re1 + re2 is an RE which matches |re1| followed by |re2| 109 re1 | re2 is an RE which matches either |re1| or |re2| 110 """ 111 112 nullable = 1 # True if this RE can match 0 input symbols 113 match_nl = 1 # True if this RE can match a string ending with '\n' 114 str = None # Set to a string to override the class's __str__ result 115 116 def build_machine(self, machine, initial_state, final_state, 117 match_bol, nocase): 118 """ 119 This method should add states to |machine| to implement this 120 RE, starting at |initial_state| and ending at |final_state|. 121 If |match_bol| is true, the RE must be able to match at the 122 beginning of a line. If nocase is true, upper and lower case 123 letters should be treated as equivalent. 124 """ 125 raise exceptions.UnimplementedMethod("%s.build_machine not implemented" % 126 self.__class__.__name__) 127 128 def build_opt(self, m, initial_state, c): 129 """ 130 Given a state |s| of machine |m|, return a new state 131 reachable from |s| on character |c| or epsilon. 132 """ 133 s = m.new_state() 134 initial_state.link_to(s) 135 initial_state.add_transition(c, s) 136 return s 137 138 def __add__(self, other): 139 return Seq(self, other) 140 141 def __or__(self, other): 142 return Alt(self, other) 143 144 def __str__(self): 145 if self.str: 146 return self.str 147 else: 148 return self.calc_str() 149 150 def check_re(self, num, value): 151 if not isinstance(value, RE): 152 self.wrong_type(num, value, "Plex.RE instance") 153 154 def check_string(self, num, value): 155 if type(value) <> type(''): 156 self.wrong_type(num, value, "string") 157 158 def check_char(self, num, value): 159 self.check_string(num, value) 160 if len(value) <> 1: 161 raise Errors.PlexValueError("Invalid value for argument %d of Plex.%s." 162 "Expected a string of length 1, got: %s" % ( 163 num, self.__class__.__name__, repr(value))) 164 165 def wrong_type(self, num, value, expected): 166 if type(value) == types.InstanceType: 167 got = "%s.%s instance" % ( 168 value.__class__.__module__, value.__class__.__name__) 169 else: 170 got = type(value).__name__ 171 raise Errors.PlexTypeError("Invalid type for argument %d of Plex.%s " 172 "(expected %s, got %s" % ( 173 num, self.__class__.__name__, expected, got)) 174 175# 176# Primitive RE constructors 177# ------------------------- 178# 179# These are the basic REs from which all others are built. 180# 181 182## class Char(RE): 183## """ 184## Char(c) is an RE which matches the character |c|. 185## """ 186 187## nullable = 0 188 189## def __init__(self, char): 190## self.char = char 191## self.match_nl = char == '\n' 192 193## def build_machine(self, m, initial_state, final_state, match_bol, nocase): 194## c = self.char 195## if match_bol and c <> BOL: 196## s1 = self.build_opt(m, initial_state, BOL) 197## else: 198## s1 = initial_state 199## if c == '\n' or c == EOF: 200## s1 = self.build_opt(m, s1, EOL) 201## if len(c) == 1: 202## code = ord(self.char) 203## s1.add_transition((code, code+1), final_state) 204## if nocase and is_letter_code(code): 205## code2 = other_case_code(code) 206## s1.add_transition((code2, code2+1), final_state) 207## else: 208## s1.add_transition(c, final_state) 209 210## def calc_str(self): 211## return "Char(%s)" % repr(self.char) 212 213def Char(c): 214 """ 215 Char(c) is an RE which matches the character |c|. 216 """ 217 if len(c) == 1: 218 result = CodeRange(ord(c), ord(c) + 1) 219 else: 220 result = SpecialSymbol(c) 221 result.str = "Char(%s)" % repr(c) 222 return result 223 224class RawCodeRange(RE): 225 """ 226 RawCodeRange(code1, code2) is a low-level RE which matches any character 227 with a code |c| in the range |code1| <= |c| < |code2|, where the range 228 does not include newline. For internal use only. 229 """ 230 nullable = 0 231 match_nl = 0 232 range = None # (code, code) 233 uppercase_range = None # (code, code) or None 234 lowercase_range = None # (code, code) or None 235 236 def __init__(self, code1, code2): 237 self.range = (code1, code2) 238 self.uppercase_range = uppercase_range(code1, code2) 239 self.lowercase_range = lowercase_range(code1, code2) 240 241 def build_machine(self, m, initial_state, final_state, match_bol, nocase): 242 if match_bol: 243 initial_state = self.build_opt(m, initial_state, BOL) 244 initial_state.add_transition(self.range, final_state) 245 if nocase: 246 if self.uppercase_range: 247 initial_state.add_transition(self.uppercase_range, final_state) 248 if self.lowercase_range: 249 initial_state.add_transition(self.lowercase_range, final_state) 250 251 def calc_str(self): 252 return "CodeRange(%d,%d)" % (self.code1, self.code2) 253 254class _RawNewline(RE): 255 """ 256 RawNewline is a low-level RE which matches a newline character. 257 For internal use only. 258 """ 259 nullable = 0 260 match_nl = 1 261 262 def build_machine(self, m, initial_state, final_state, match_bol, nocase): 263 if match_bol: 264 initial_state = self.build_opt(m, initial_state, BOL) 265 s = self.build_opt(m, initial_state, EOL) 266 s.add_transition((nl_code, nl_code + 1), final_state) 267 268RawNewline = _RawNewline() 269 270 271class SpecialSymbol(RE): 272 """ 273 SpecialSymbol(sym) is an RE which matches the special input 274 symbol |sym|, which is one of BOL, EOL or EOF. 275 """ 276 nullable = 0 277 match_nl = 0 278 sym = None 279 280 def __init__(self, sym): 281 self.sym = sym 282 283 def build_machine(self, m, initial_state, final_state, match_bol, nocase): 284 # Sequences 'bol bol' and 'bol eof' are impossible, so only need 285 # to allow for bol if sym is eol 286 if match_bol and self.sym == EOL: 287 initial_state = self.build_opt(m, initial_state, BOL) 288 initial_state.add_transition(self.sym, final_state) 289 290 291class Seq(RE): 292 """Seq(re1, re2, re3...) is an RE which matches |re1| followed by 293 |re2| followed by |re3|...""" 294 295 def __init__(self, *re_list): 296 nullable = 1 297 for i in xrange(len(re_list)): 298 re = re_list[i] 299 self.check_re(i, re) 300 nullable = nullable and re.nullable 301 self.re_list = re_list 302 self.nullable = nullable 303 i = len(re_list) 304 match_nl = 0 305 while i: 306 i = i - 1 307 re = re_list[i] 308 if re.match_nl: 309 match_nl = 1 310 break 311 if not re.nullable: 312 break 313 self.match_nl = match_nl 314 315 def build_machine(self, m, initial_state, final_state, match_bol, nocase): 316 re_list = self.re_list 317 if len(re_list) == 0: 318 initial_state.link_to(final_state) 319 else: 320 s1 = initial_state 321 n = len(re_list) 322 for i in xrange(n): 323 if i < n - 1: 324 s2 = m.new_state() 325 else: 326 s2 = final_state 327 re = re_list[i] 328 re.build_machine(m, s1, s2, match_bol, nocase) 329 s1 = s2 330 match_bol = re.match_nl or (match_bol and re.nullable) 331 332 def calc_str(self): 333 return "Seq(%s)" % string.join(map(str, self.re_list), ",") 334 335 336class Alt(RE): 337 """Alt(re1, re2, re3...) is an RE which matches either |re1| or 338 |re2| or |re3|...""" 339 340 def __init__(self, *re_list): 341 self.re_list = re_list 342 nullable = 0 343 match_nl = 0 344 nullable_res = [] 345 non_nullable_res = [] 346 i = 1 347 for re in re_list: 348 self.check_re(i, re) 349 if re.nullable: 350 nullable_res.append(re) 351 nullable = 1 352 else: 353 non_nullable_res.append(re) 354 if re.match_nl: 355 match_nl = 1 356 i = i + 1 357 self.nullable_res = nullable_res 358 self.non_nullable_res = non_nullable_res 359 self.nullable = nullable 360 self.match_nl = match_nl 361 362 def build_machine(self, m, initial_state, final_state, match_bol, nocase): 363 for re in self.nullable_res: 364 re.build_machine(m, initial_state, final_state, match_bol, nocase) 365 if self.non_nullable_res: 366 if match_bol: 367 initial_state = self.build_opt(m, initial_state, BOL) 368 for re in self.non_nullable_res: 369 re.build_machine(m, initial_state, final_state, 0, nocase) 370 371 def calc_str(self): 372 return "Alt(%s)" % string.join(map(str, self.re_list), ",") 373 374 375class Rep1(RE): 376 """Rep1(re) is an RE which matches one or more repetitions of |re|.""" 377 378 def __init__(self, re): 379 self.check_re(1, re) 380 self.re = re 381 self.nullable = re.nullable 382 self.match_nl = re.match_nl 383 384 def build_machine(self, m, initial_state, final_state, match_bol, nocase): 385 s1 = m.new_state() 386 s2 = m.new_state() 387 initial_state.link_to(s1) 388 self.re.build_machine(m, s1, s2, match_bol or self.re.match_nl, nocase) 389 s2.link_to(s1) 390 s2.link_to(final_state) 391 392 def calc_str(self): 393 return "Rep1(%s)" % self.re 394 395 396class SwitchCase(RE): 397 """ 398 SwitchCase(re, nocase) is an RE which matches the same strings as RE, 399 but treating upper and lower case letters according to |nocase|. If 400 |nocase| is true, case is ignored, otherwise it is not. 401 """ 402 re = None 403 nocase = None 404 405 def __init__(self, re, nocase): 406 self.re = re 407 self.nocase = nocase 408 self.nullable = re.nullable 409 self.match_nl = re.match_nl 410 411 def build_machine(self, m, initial_state, final_state, match_bol, nocase): 412 self.re.build_machine(m, initial_state, final_state, match_bol, 413 self.nocase) 414 415 def calc_str(self): 416 if self.nocase: 417 name = "NoCase" 418 else: 419 name = "Case" 420 return "%s(%s)" % (name, self.re) 421 422# 423# Composite RE constructors 424# ------------------------- 425# 426# These REs are defined in terms of the primitive REs. 427# 428 429Empty = Seq() 430Empty.__doc__ = \ 431 """ 432 Empty is an RE which matches the empty string. 433 """ 434Empty.str = "Empty" 435 436def Str1(s): 437 """ 438 Str1(s) is an RE which matches the literal string |s|. 439 """ 440 result = apply(Seq, tuple(map(Char, s))) 441 result.str = "Str(%s)" % repr(s) 442 return result 443 444def Str(*strs): 445 """ 446 Str(s) is an RE which matches the literal string |s|. 447 Str(s1, s2, s3, ...) is an RE which matches any of |s1| or |s2| or |s3|... 448 """ 449 if len(strs) == 1: 450 return Str1(strs[0]) 451 else: 452 result = apply(Alt, tuple(map(Str1, strs))) 453 result.str = "Str(%s)" % string.join(map(repr, strs), ",") 454 return result 455 456def Any(s): 457 """ 458 Any(s) is an RE which matches any character in the string |s|. 459 """ 460 #result = apply(Alt, tuple(map(Char, s))) 461 result = CodeRanges(chars_to_ranges(s)) 462 result.str = "Any(%s)" % repr(s) 463 return result 464 465def AnyBut(s): 466 """ 467 AnyBut(s) is an RE which matches any character (including 468 newline) which is not in the string |s|. 469 """ 470 ranges = chars_to_ranges(s) 471 ranges.insert(0, -maxint) 472 ranges.append(maxint) 473 result = CodeRanges(ranges) 474 result.str = "AnyBut(%s)" % repr(s) 475 return result 476 477AnyChar = AnyBut("") 478AnyChar.__doc__ = \ 479 """ 480 AnyChar is an RE which matches any single character (including a newline). 481 """ 482AnyChar.str = "AnyChar" 483 484def Range(s1, s2 = None): 485 """ 486 Range(c1, c2) is an RE which matches any single character in the range 487 |c1| to |c2| inclusive. 488 Range(s) where |s| is a string of even length is an RE which matches 489 any single character in the ranges |s[0]| to |s[1]|, |s[2]| to |s[3]|,... 490 """ 491 if s2: 492 result = CodeRange(ord(s1), ord(s2) + 1) 493 result.str = "Range(%s,%s)" % (s1, s2) 494 else: 495 ranges = [] 496 for i in range(0, len(s1), 2): 497 ranges.append(CodeRange(ord(s1[i]), ord(s1[i+1]) + 1)) 498 result = apply(Alt, tuple(ranges)) 499 result.str = "Range(%s)" % repr(s1) 500 return result 501 502def Opt(re): 503 """ 504 Opt(re) is an RE which matches either |re| or the empty string. 505 """ 506 result = Alt(re, Empty) 507 result.str = "Opt(%s)" % re 508 return result 509 510def Rep(re): 511 """ 512 Rep(re) is an RE which matches zero or more repetitions of |re|. 513 """ 514 result = Opt(Rep1(re)) 515 result.str = "Rep(%s)" % re 516 return result 517 518def NoCase(re): 519 """ 520 NoCase(re) is an RE which matches the same strings as RE, but treating 521 upper and lower case letters as equivalent. 522 """ 523 return SwitchCase(re, nocase = 1) 524 525def Case(re): 526 """ 527 Case(re) is an RE which matches the same strings as RE, but treating 528 upper and lower case letters as distinct, i.e. it cancels the effect 529 of any enclosing NoCase(). 530 """ 531 return SwitchCase(re, nocase = 0) 532 533# 534# RE Constants 535# 536 537Bol = Char(BOL) 538Bol.__doc__ = \ 539 """ 540 Bol is an RE which matches the beginning of a line. 541 """ 542Bol.str = "Bol" 543 544Eol = Char(EOL) 545Eol.__doc__ = \ 546 """ 547 Eol is an RE which matches the end of a line. 548 """ 549Eol.str = "Eol" 550 551Eof = Char(EOF) 552Eof.__doc__ = \ 553 """ 554 Eof is an RE which matches the end of the file. 555 """ 556Eof.str = "Eof" 557 558