#=======================================================================
#
#   Python Lexical Analyser
#
#   Traditional Regular Expression Syntax
#
#=======================================================================

from __future__ import absolute_import

from .Regexps import Alt, Seq, Rep, Rep1, Opt, Any, AnyBut, Bol, Eol, Char
from .Errors import PlexError


class RegexpSyntaxError(PlexError):
    """Raised by REParser.error() when a regexp string cannot be parsed."""
    pass


def re(s):
    """
    Convert traditional string representation of regular expression |s|
    into Plex representation.

    Raises RegexpSyntaxError if |s| is not well formed.
    """
    return REParser(s).parse_re()


class REParser(object):
    """
    Recursive-descent parser for the traditional regexp syntax.

    Parser state:
      s   -- the string being parsed
      i   -- index in |s| of the current character
      c   -- the current character, or '' once the end of |s| is reached
      end -- true once |i| has moved past the last character of |s|
    """

    def __init__(self, s):
        self.s = s
        self.i = -1
        self.end = False
        # Load the first character (or set |end| for an empty string).
        self.next()

    def parse_re(self):
        """
        Parse the whole string and return the resulting regexp object.

        Raises RegexpSyntaxError if parsing stops before the end of the
        string (for instance on an unbalanced ')').
        """
        re = self.parse_alt()
        if not self.end:
            self.error("Unexpected %s" % repr(self.c))
        return re

    def parse_alt(self):
        """Parse a set of alternative regexps."""
        re = self.parse_seq()
        if self.c == '|':
            re_list = [re]
            while self.c == '|':
                self.next()
                re_list.append(self.parse_seq())
            re = Alt(*re_list)
        return re

    def parse_seq(self):
        """Parse a sequence of regexps."""
        re_list = []
        # A sequence stops at the end of input, at '|' (next alternative)
        # or at ')' (end of the enclosing group).
        while not self.end and self.c not in "|)":
            re_list.append(self.parse_mod())
        return Seq(*re_list)

    def parse_mod(self):
        """Parse a primitive regexp followed by *, +, ? modifiers."""
        re = self.parse_prim()
        # The |end| check is required: at end of input |c| is '', and
        # '' is "in" every string.
        while not self.end and self.c in "*+?":
            if self.c == '*':
                re = Rep(re)
            elif self.c == '+':
                re = Rep1(re)
            else:  # self.c == '?'
                re = Opt(re)
            self.next()
        return re

    def parse_prim(self):
        """
        Parse a primitive regexp.

        '.' becomes AnyBut("\\n"), '^' becomes Bol, '$' becomes Eol,
        '(' ... ')' is a parenthesised group and '[' ... ']' is a charset.
        A backslash makes the following character literal; any other
        character becomes Char(c).
        """
        c = self.get()
        if c == '.':
            re = AnyBut("\n")
        elif c == '^':
            re = Bol
        elif c == '$':
            re = Eol
        elif c == '(':
            re = self.parse_alt()
            self.expect(')')
        elif c == '[':
            re = self.parse_charset()
            self.expect(']')
        else:
            if c == '\\':
                c = self.get()
            re = Char(c)
        return re

    def parse_charset(self):
        """
        Parse a charset. Does not include the surrounding [].

        A leading '^' selects AnyBut instead of Any.  A ']' in first
        position (after the optional '^') is taken literally, as is a
        '-' that is immediately followed by the closing ']'.

        Raises RegexpSyntaxError for a range whose first character sorts
        after its last one (e.g. "z-a").
        """
        char_list = []
        invert = False
        if self.c == '^':
            invert = True
            self.next()
        if self.c == ']':
            char_list.append(']')
            self.next()
        while not self.end and self.c != ']':
            c1 = self.get()
            if self.c == '-' and self.lookahead(1) != ']':
                self.next()
                c2 = self.get()
                first, last = ord(c1), ord(c2)
                if first > last:
                    # Without this check the range would silently
                    # contribute no characters at all.
                    self.error("Bad character range %s" % repr(c1 + '-' + c2))
                char_list.extend(chr(a) for a in range(first, last + 1))
            else:
                char_list.append(c1)
        chars = ''.join(char_list)
        if invert:
            return AnyBut(chars)
        else:
            return Any(chars)

    def next(self):
        """Advance to the next char."""
        s = self.s
        i = self.i = self.i + 1
        if i < len(s):
            self.c = s[i]
        else:
            self.c = ''
            self.end = True

    def get(self):
        """
        Return the current char and advance past it.

        Raises RegexpSyntaxError if the end of the string has already
        been reached.
        """
        if self.end:
            self.error("Premature end of string")
        c = self.c
        self.next()
        return c

    def lookahead(self, n):
        """Look ahead n chars.  Returns '' when that is past the end."""
        j = self.i + n
        if j < len(self.s):
            return self.s[j]
        else:
            return ''

    def expect(self, c):
        """
        Expect to find character |c| at current position.
        Raises an exception otherwise.
        """
        if self.c == c:
            self.next()
        else:
            self.error("Missing %s" % repr(c))

    def error(self, mess):
        """Raise exception to signal syntax error in regexp."""
        raise RegexpSyntaxError("Syntax error in regexp %s at position %d: %s" % (
            repr(self.s), self.i, mess))