1# 2# Secret Labs' Regular Expression Engine 3# 4# convert template to internal format 5# 6# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. 7# 8# See the sre.py file for information on usage and redistribution. 9# 10 11"""Internal support module for sre""" 12 13import _sre, sys 14 15from sre_constants import * 16 17assert _sre.MAGIC == MAGIC, "SRE module mismatch" 18 19if _sre.CODESIZE == 2: 20 MAXCODE = 65535 21else: 22 MAXCODE = 0xFFFFFFFFL 23 24def _identityfunction(x): 25 return x 26 27def set(seq): 28 s = {} 29 for elem in seq: 30 s[elem] = 1 31 return s 32 33_LITERAL_CODES = set([LITERAL, NOT_LITERAL]) 34_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT]) 35_SUCCESS_CODES = set([SUCCESS, FAILURE]) 36_ASSERT_CODES = set([ASSERT, ASSERT_NOT]) 37 38def _compile(code, pattern, flags): 39 # internal: compile a (sub)pattern 40 emit = code.append 41 _len = len 42 LITERAL_CODES = _LITERAL_CODES 43 REPEATING_CODES = _REPEATING_CODES 44 SUCCESS_CODES = _SUCCESS_CODES 45 ASSERT_CODES = _ASSERT_CODES 46 for op, av in pattern: 47 if op in LITERAL_CODES: 48 if flags & SRE_FLAG_IGNORECASE: 49 emit(OPCODES[OP_IGNORE[op]]) 50 emit(_sre.getlower(av, flags)) 51 else: 52 emit(OPCODES[op]) 53 emit(av) 54 elif op is IN: 55 if flags & SRE_FLAG_IGNORECASE: 56 emit(OPCODES[OP_IGNORE[op]]) 57 def fixup(literal, flags=flags): 58 return _sre.getlower(literal, flags) 59 else: 60 emit(OPCODES[op]) 61 fixup = _identityfunction 62 skip = _len(code); emit(0) 63 _compile_charset(av, flags, code, fixup) 64 code[skip] = _len(code) - skip 65 elif op is ANY: 66 if flags & SRE_FLAG_DOTALL: 67 emit(OPCODES[ANY_ALL]) 68 else: 69 emit(OPCODES[ANY]) 70 elif op in REPEATING_CODES: 71 if flags & SRE_FLAG_TEMPLATE: 72 raise error, "internal: unsupported template operator" 73 emit(OPCODES[REPEAT]) 74 skip = _len(code); emit(0) 75 emit(av[0]) 76 emit(av[1]) 77 _compile(code, av[2], flags) 78 emit(OPCODES[SUCCESS]) 79 code[skip] = _len(code) - skip 80 elif _simple(av) and op is not REPEAT: 81 if op is MAX_REPEAT: 82 emit(OPCODES[REPEAT_ONE]) 83 else: 84 emit(OPCODES[MIN_REPEAT_ONE]) 85 skip = _len(code); emit(0) 86 emit(av[0]) 87 emit(av[1]) 88 _compile(code, av[2], flags) 89 emit(OPCODES[SUCCESS]) 90 code[skip] = _len(code) - skip 91 else: 92 emit(OPCODES[REPEAT]) 93 skip = _len(code); emit(0) 94 emit(av[0]) 95 emit(av[1]) 96 _compile(code, av[2], flags) 97 code[skip] = _len(code) - skip 98 if op is MAX_REPEAT: 99 emit(OPCODES[MAX_UNTIL]) 100 else: 101 emit(OPCODES[MIN_UNTIL]) 102 elif op is SUBPATTERN: 103 if av[0]: 104 emit(OPCODES[MARK]) 105 emit((av[0]-1)*2) 106 # _compile_info(code, av[1], flags) 107 _compile(code, av[1], flags) 108 if av[0]: 109 emit(OPCODES[MARK]) 110 emit((av[0]-1)*2+1) 111 elif op in SUCCESS_CODES: 112 emit(OPCODES[op]) 113 elif op in ASSERT_CODES: 114 emit(OPCODES[op]) 115 skip = _len(code); emit(0) 116 if av[0] >= 0: 117 emit(0) # look ahead 118 else: 119 lo, hi = av[1].getwidth() 120 if lo != hi: 121 raise error, "look-behind requires fixed-width pattern" 122 emit(lo) # look behind 123 _compile(code, av[1], flags) 124 emit(OPCODES[SUCCESS]) 125 code[skip] = _len(code) - skip 126 elif op is CALL: 127 emit(OPCODES[op]) 128 skip = _len(code); emit(0) 129 _compile(code, av, flags) 130 emit(OPCODES[SUCCESS]) 131 code[skip] = _len(code) - skip 132 elif op is AT: 133 emit(OPCODES[op]) 134 if flags & SRE_FLAG_MULTILINE: 135 av = AT_MULTILINE.get(av, av) 136 if flags & SRE_FLAG_LOCALE: 137 av = AT_LOCALE.get(av, av) 138 elif flags & SRE_FLAG_UNICODE: 139 av = AT_UNICODE.get(av, av) 140 emit(ATCODES[av]) 141 elif op is BRANCH: 142 emit(OPCODES[op]) 143 tail = [] 144 tailappend = tail.append 145 for av in av[1]: 146 skip = _len(code); emit(0) 147 # _compile_info(code, av, flags) 148 _compile(code, av, flags) 149 emit(OPCODES[JUMP]) 150 tailappend(_len(code)); emit(0) 151 code[skip] = _len(code) - skip 152 emit(0) # end of branch 153 for tail in tail: 154 code[tail] = _len(code) - tail 155 elif op is CATEGORY: 156 emit(OPCODES[op]) 157 if flags & SRE_FLAG_LOCALE: 158 av = CH_LOCALE[av] 159 elif flags & SRE_FLAG_UNICODE: 160 av = CH_UNICODE[av] 161 emit(CHCODES[av]) 162 elif op is GROUPREF: 163 if flags & SRE_FLAG_IGNORECASE: 164 emit(OPCODES[OP_IGNORE[op]]) 165 else: 166 emit(OPCODES[op]) 167 emit(av-1) 168 elif op is GROUPREF_EXISTS: 169 emit(OPCODES[op]) 170 emit(av[0]-1) 171 skipyes = _len(code); emit(0) 172 _compile(code, av[1], flags) 173 if av[2]: 174 emit(OPCODES[JUMP]) 175 skipno = _len(code); emit(0) 176 code[skipyes] = _len(code) - skipyes + 1 177 _compile(code, av[2], flags) 178 code[skipno] = _len(code) - skipno 179 else: 180 code[skipyes] = _len(code) - skipyes + 1 181 else: 182 raise ValueError, ("unsupported operand type", op) 183 184def _compile_charset(charset, flags, code, fixup=None): 185 # compile charset subprogram 186 emit = code.append 187 if fixup is None: 188 fixup = _identityfunction 189 for op, av in _optimize_charset(charset, fixup): 190 emit(OPCODES[op]) 191 if op is NEGATE: 192 pass 193 elif op is LITERAL: 194 emit(fixup(av)) 195 elif op is RANGE: 196 emit(fixup(av[0])) 197 emit(fixup(av[1])) 198 elif op is CHARSET: 199 code.extend(av) 200 elif op is BIGCHARSET: 201 code.extend(av) 202 elif op is CATEGORY: 203 if flags & SRE_FLAG_LOCALE: 204 emit(CHCODES[CH_LOCALE[av]]) 205 elif flags & SRE_FLAG_UNICODE: 206 emit(CHCODES[CH_UNICODE[av]]) 207 else: 208 emit(CHCODES[av]) 209 else: 210 raise error, "internal: unsupported set operator" 211 emit(OPCODES[FAILURE]) 212 213def _optimize_charset(charset, fixup): 214 # internal: optimize character set 215 out = [] 216 outappend = out.append 217 charmap = [0]*256 218 try: 219 for op, av in charset: 220 if op is NEGATE: 221 outappend((op, av)) 222 elif op is LITERAL: 223 charmap[fixup(av)] = 1 224 elif op is RANGE: 225 for i in range(fixup(av[0]), fixup(av[1])+1): 226 charmap[i] = 1 227 elif op is CATEGORY: 228 # XXX: could append to charmap tail 229 return charset # cannot compress 230 except IndexError: 231 # character set contains unicode characters 232 return _optimize_unicode(charset, fixup) 233 # compress character map 234 i = p = n = 0 235 runs = [] 236 runsappend = runs.append 237 for c in charmap: 238 if c: 239 if n == 0: 240 p = i 241 n = n + 1 242 elif n: 243 runsappend((p, n)) 244 n = 0 245 i = i + 1 246 if n: 247 runsappend((p, n)) 248 if len(runs) <= 2: 249 # use literal/range 250 for p, n in runs: 251 if n == 1: 252 outappend((LITERAL, p)) 253 else: 254 outappend((RANGE, (p, p+n-1))) 255 if len(out) < len(charset): 256 return out 257 else: 258 # use bitmap 259 data = _mk_bitmap(charmap) 260 outappend((CHARSET, data)) 261 return out 262 return charset 263 264def _mk_bitmap(bits): 265 data = [] 266 dataappend = data.append 267 if _sre.CODESIZE == 2: 268 start = (1, 0) 269 else: 270 start = (1L, 0L) 271 m, v = start 272 for c in bits: 273 if c: 274 v = v + m 275 m = m + m 276 if m > MAXCODE: 277 dataappend(v) 278 m, v = start 279 return data 280 281# To represent a big charset, first a bitmap of all characters in the 282# set is constructed. Then, this bitmap is sliced into chunks of 256 283# characters, duplicate chunks are eliminated, and each chunk is 284# given a number. In the compiled expression, the charset is 285# represented by a 16-bit word sequence, consisting of one word for 286# the number of different chunks, a sequence of 256 bytes (128 words) 287# of chunk numbers indexed by their original chunk position, and a 288# sequence of chunks (16 words each). 289 290# Compression is normally good: in a typical charset, large ranges of 291# Unicode will be either completely excluded (e.g. if only cyrillic 292# letters are to be matched), or completely included (e.g. if large 293# subranges of Kanji match). These ranges will be represented by 294# chunks of all one-bits or all zero-bits. 295 296# Matching can be also done efficiently: the more significant byte of 297# the Unicode character is an index into the chunk number, and the 298# less significant byte is a bit index in the chunk (just like the 299# CHARSET matching). 300 301# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets 302# of the basic multilingual plane; an efficient representation 303# for all of UTF-16 has not yet been developed. This means, 304# in particular, that negated charsets cannot be represented as 305# bigcharsets. 306 307def _optimize_unicode(charset, fixup): 308 # problems with optimization in Jython, forget about it for now 309 return charset 310 311 try: 312 import array 313 except ImportError: 314 return charset 315 charmap = [0]*65536 316 negate = 0 317 try: 318 for op, av in charset: 319 if op is NEGATE: 320 negate = 1 321 elif op is LITERAL: 322 charmap[fixup(av)] = 1 323 elif op is RANGE: 324 for i in xrange(fixup(av[0]), fixup(av[1])+1): 325 charmap[i] = 1 326 elif op is CATEGORY: 327 # XXX: could expand category 328 return charset # cannot compress 329 except IndexError: 330 # non-BMP characters 331 return charset 332 if negate: 333 if sys.maxunicode != 65535: 334 # XXX: negation does not work with big charsets 335 return charset 336 for i in xrange(65536): 337 charmap[i] = not charmap[i] 338 comps = {} 339 mapping = [0]*256 340 block = 0 341 data = [] 342 for i in xrange(256): 343 chunk = tuple(charmap[i*256:(i+1)*256]) 344 new = comps.setdefault(chunk, block) 345 mapping[i] = new 346 if new == block: 347 block = block + 1 348 data = data + _mk_bitmap(chunk) 349 header = [block] 350 if _sre.CODESIZE == 2: 351 code = 'H' 352 else: 353 # change this for Jython from 'I', since that will expand to 354 # long, and cause needless complexity (or so it seems) 355 code = 'i' 356 # Convert block indices to byte array of 256 bytes 357 mapping = array.array('b', mapping).tostring() 358 # Convert byte array to word array 359 mapping = array.array(code, mapping) 360 assert mapping.itemsize == _sre.CODESIZE 361 header = header + mapping.tolist() 362 data[0:0] = header 363 return [(BIGCHARSET, data)] 364 365def _simple(av): 366 # check if av is a "simple" operator 367 lo, hi = av[2].getwidth() 368 if lo == 0 and hi == MAXREPEAT: 369 raise error, "nothing to repeat" 370 return lo == hi == 1 and av[2][0][0] != SUBPATTERN 371 372def _compile_info(code, pattern, flags): 373 # internal: compile an info block. in the current version, 374 # this contains min/max pattern width, and an optional literal 375 # prefix or a character map 376 lo, hi = pattern.getwidth() 377 if lo == 0: 378 return # not worth it 379 # look for a literal prefix 380 prefix = [] 381 prefixappend = prefix.append 382 prefix_skip = 0 383 charset = [] # not used 384 charsetappend = charset.append 385 if not (flags & SRE_FLAG_IGNORECASE): 386 # look for literal prefix 387 for op, av in pattern.data: 388 if op is LITERAL: 389 if len(prefix) == prefix_skip: 390 prefix_skip = prefix_skip + 1 391 prefixappend(av) 392 elif op is SUBPATTERN and len(av[1]) == 1: 393 op, av = av[1][0] 394 if op is LITERAL: 395 prefixappend(av) 396 else: 397 break 398 else: 399 break 400 # if no prefix, look for charset prefix 401 if not prefix and pattern.data: 402 op, av = pattern.data[0] 403 if op is SUBPATTERN and av[1]: 404 op, av = av[1][0] 405 if op is LITERAL: 406 charsetappend((op, av)) 407 elif op is BRANCH: 408 c = [] 409 cappend = c.append 410 for p in av[1]: 411 if not p: 412 break 413 op, av = p[0] 414 if op is LITERAL: 415 cappend((op, av)) 416 else: 417 break 418 else: 419 charset = c 420 elif op is BRANCH: 421 c = [] 422 cappend = c.append 423 for p in av[1]: 424 if not p: 425 break 426 op, av = p[0] 427 if op is LITERAL: 428 cappend((op, av)) 429 else: 430 break 431 else: 432 charset = c 433 elif op is IN: 434 charset = av 435## if prefix: 436## print "*** PREFIX", prefix, prefix_skip 437## if charset: 438## print "*** CHARSET", charset 439 # add an info block 440 emit = code.append 441 emit(OPCODES[INFO]) 442 skip = len(code); emit(0) 443 # literal flag 444 mask = 0 445 if prefix: 446 mask = SRE_INFO_PREFIX 447 if len(prefix) == prefix_skip == len(pattern.data): 448 mask = mask + SRE_INFO_LITERAL 449 elif charset: 450 mask = mask + SRE_INFO_CHARSET 451 emit(mask) 452 # pattern length 453 if lo < MAXCODE: 454 emit(lo) 455 else: 456 emit(MAXCODE) 457 prefix = prefix[:MAXCODE] 458 if hi < MAXCODE: 459 emit(hi) 460 else: 461 emit(0) 462 # add literal prefix 463 if prefix: 464 emit(len(prefix)) # length 465 emit(prefix_skip) # skip 466 code.extend(prefix) 467 # generate overlap table 468 table = [-1] + ([0]*len(prefix)) 469 for i in xrange(len(prefix)): 470 table[i+1] = table[i]+1 471 while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]: 472 table[i+1] = table[table[i+1]-1]+1 473 code.extend(table[1:]) # don't store first entry 474 elif charset: 475 _compile_charset(charset, flags, code) 476 code[skip] = len(code) - skip 477 478try: 479 unicode 480except NameError: 481 STRING_TYPES = (type(""),) 482else: 483 STRING_TYPES = (type(""), type(unicode(""))) 484 485def isstring(obj): 486 for tp in STRING_TYPES: 487 if isinstance(obj, tp): 488 return 1 489 return 0 490 491def _code(p, flags): 492 493 flags = p.pattern.flags | flags 494 code = [] 495 496 # compile info block 497 _compile_info(code, p, flags) 498 499 # compile the pattern 500 _compile(code, p.data, flags) 501 502 code.append(OPCODES[SUCCESS]) 503 504 return code 505 506def compile(p, flags=0): 507 # internal: convert pattern list to internal format 508 509 if isstring(p): 510 import sre_parse 511 pattern = p 512 p = sre_parse.parse(p, flags) 513 else: 514 pattern = None 515 516 code = _code(p, flags) 517 518 # print code 519 520 # XXX: <fl> get rid of this limitation! 521 if p.pattern.groups > 100: 522 raise AssertionError( 523 "sorry, but this version only supports 100 named groups" 524 ) 525 526 # map in either direction 527 groupindex = p.pattern.groupdict 528 indexgroup = [None] * p.pattern.groups 529 for k, i in groupindex.items(): 530 indexgroup[i] = k 531 532 return _sre.compile( 533 pattern, flags | p.pattern.flags, code, 534 p.pattern.groups-1, 535 groupindex, indexgroup 536 ) 537