1#=======================================================================
2#
3#	 Python Lexical Analyser
4#
5#	 Regular Expressions
6#
7#=======================================================================
8
9import array
10import string
11import types
12from sys import maxint
13
14import Errors
15
16#
17#	 Constants
18#
19
20BOL = 'bol'
21EOL = 'eol'
22EOF = 'eof'
23
24nl_code = ord('\n')
25
26#
27#	 Helper functions
28#
29
30def chars_to_ranges(s):
31	"""
32	Return a list of character codes consisting of pairs
33	[code1a, code1b, code2a, code2b,...] which cover all
34	the characters in |s|.
35	"""
36	char_list = list(s)
37	char_list.sort()
38	i = 0
39	n = len(char_list)
40	result = []
41	while i < n:
42		code1 = ord(char_list[i])
43		code2 = code1 + 1
44		i = i + 1
45		while i < n and code2 >= ord(char_list[i]):
46			code2 = code2 + 1
47			i = i + 1
48		result.append(code1)
49		result.append(code2)
50	return result
51
52def uppercase_range(code1, code2):
53	"""
54	If the range of characters from code1 to code2-1 includes any
55	lower case letters, return the corresponding upper case range.
56	"""
57	code3 = max(code1, ord('a'))
58	code4 = min(code2, ord('z') + 1)
59	if code3 < code4:
60		d = ord('A') - ord('a')
61		return (code3 + d, code4 + d)
62	else:
63		return None
64
65def lowercase_range(code1, code2):
66	"""
67	If the range of characters from code1 to code2-1 includes any
68	upper case letters, return the corresponding lower case range.
69	"""
70	code3 = max(code1, ord('A'))
71	code4 = min(code2, ord('Z') + 1)
72	if code3 < code4:
73		d = ord('a') - ord('A')
74		return (code3 + d, code4 + d)
75	else:
76		return None
77
78def CodeRanges(code_list):
79	"""
80	Given a list of codes as returned by chars_to_ranges, return
81	an RE which will match a character in any of the ranges.
82	"""
83	re_list = []
84	for i in xrange(0, len(code_list), 2):
85		re_list.append(CodeRange(code_list[i], code_list[i + 1]))
86	return apply(Alt, tuple(re_list))
87
88def CodeRange(code1, code2):
89	"""
90	CodeRange(code1, code2) is an RE which matches any character
91	with a code |c| in the range |code1| <= |c| < |code2|.
92	"""
93	if code1 <= nl_code < code2:
94		return Alt(RawCodeRange(code1, nl_code),
95							 RawNewline,
96							 RawCodeRange(nl_code + 1, code2))
97	else:
98		return RawCodeRange(code1, code2)
99
100#
101#	 Abstract classes
102#
103
104class RE:
105	"""RE is the base class for regular expression constructors.
106	The following operators are defined on REs:
107
108		 re1 + re2		 is an RE which matches |re1| followed by |re2|
109		 re1 | re2		 is an RE which matches either |re1| or |re2|
110	"""
111
112	nullable = 1 # True if this RE can match 0 input symbols
113	match_nl = 1 # True if this RE can match a string ending with '\n'
114	str = None	 # Set to a string to override the class's __str__ result
115
116	def build_machine(self, machine, initial_state, final_state,
117										match_bol, nocase):
118		"""
119		This method should add states to |machine| to implement this
120		RE, starting at |initial_state| and ending at |final_state|.
121		If |match_bol| is true, the RE must be able to match at the
122		beginning of a line. If nocase is true, upper and lower case
123		letters should be treated as equivalent.
124		"""
125		raise exceptions.UnimplementedMethod("%s.build_machine not implemented" %
126			self.__class__.__name__)
127
128	def build_opt(self, m, initial_state, c):
129		"""
130		Given a state |s| of machine |m|, return a new state
131		reachable from |s| on character |c| or epsilon.
132		"""
133		s = m.new_state()
134		initial_state.link_to(s)
135		initial_state.add_transition(c, s)
136		return s
137
138	def __add__(self, other):
139		return Seq(self, other)
140
141	def __or__(self, other):
142		return Alt(self, other)
143
144	def __str__(self):
145		if self.str:
146			return self.str
147		else:
148			return self.calc_str()
149
150	def check_re(self, num, value):
151		if not isinstance(value, RE):
152			self.wrong_type(num, value, "Plex.RE instance")
153
154	def check_string(self, num, value):
155		if type(value) <> type(''):
156			self.wrong_type(num, value, "string")
157
158	def check_char(self, num, value):
159		self.check_string(num, value)
160		if len(value) <> 1:
161			raise Errors.PlexValueError("Invalid value for argument %d of Plex.%s."
162				"Expected a string of length 1, got: %s" % (
163					num, self.__class__.__name__, repr(value)))
164
165	def wrong_type(self, num, value, expected):
166		if type(value) == types.InstanceType:
167				got = "%s.%s instance" % (
168					value.__class__.__module__, value.__class__.__name__)
169		else:
170			got = type(value).__name__
171		raise Errors.PlexTypeError("Invalid type for argument %d of Plex.%s "
172										"(expected %s, got %s" % (
173											num, self.__class__.__name__, expected, got))
174
175#
176#	 Primitive RE constructors
177#	 -------------------------
178#
179#	 These are the basic REs from which all others are built.
180#
181
182## class Char(RE):
183##	 """
184##	 Char(c) is an RE which matches the character |c|.
185##	 """
186
187##	 nullable = 0
188
189##	 def __init__(self, char):
190##		 self.char = char
191##		 self.match_nl = char == '\n'
192
193##	 def build_machine(self, m, initial_state, final_state, match_bol, nocase):
194##		 c = self.char
195##		 if match_bol and c <> BOL:
196##			 s1 = self.build_opt(m, initial_state, BOL)
197##		 else:
198##			 s1 = initial_state
199##		 if c == '\n' or c == EOF:
200##			 s1 = self.build_opt(m, s1, EOL)
201##		 if len(c) == 1:
202##			 code = ord(self.char)
203##			 s1.add_transition((code, code+1), final_state)
204##			 if nocase and is_letter_code(code):
205##				 code2 = other_case_code(code)
206##				 s1.add_transition((code2, code2+1), final_state)
207##		 else:
208##			 s1.add_transition(c, final_state)
209
210##	 def calc_str(self):
211##		 return "Char(%s)" % repr(self.char)
212
213def Char(c):
214	"""
215	Char(c) is an RE which matches the character |c|.
216	"""
217	if len(c) == 1:
218		result = CodeRange(ord(c), ord(c) + 1)
219	else:
220		result = SpecialSymbol(c)
221	result.str = "Char(%s)" % repr(c)
222	return result
223
224class RawCodeRange(RE):
225	"""
226	RawCodeRange(code1, code2) is a low-level RE which matches any character
227	with a code |c| in the range |code1| <= |c| < |code2|, where the range
228	does not include newline. For internal use only.
229	"""
230	nullable = 0
231	match_nl = 0
232	range = None					 # (code, code)
233	uppercase_range = None # (code, code) or None
234	lowercase_range = None # (code, code) or None
235
236	def __init__(self, code1, code2):
237		self.range = (code1, code2)
238		self.uppercase_range = uppercase_range(code1, code2)
239		self.lowercase_range = lowercase_range(code1, code2)
240
241	def build_machine(self, m, initial_state, final_state, match_bol, nocase):
242		if match_bol:
243			initial_state = self.build_opt(m, initial_state, BOL)
244		initial_state.add_transition(self.range, final_state)
245		if nocase:
246			if self.uppercase_range:
247				initial_state.add_transition(self.uppercase_range, final_state)
248			if self.lowercase_range:
249				initial_state.add_transition(self.lowercase_range, final_state)
250
251	def calc_str(self):
252		return "CodeRange(%d,%d)" % (self.code1, self.code2)
253
254class _RawNewline(RE):
255	"""
256	RawNewline is a low-level RE which matches a newline character.
257	For internal use only.
258	"""
259	nullable = 0
260	match_nl = 1
261
262	def build_machine(self, m, initial_state, final_state, match_bol, nocase):
263		if match_bol:
264			initial_state = self.build_opt(m, initial_state, BOL)
265		s = self.build_opt(m, initial_state, EOL)
266		s.add_transition((nl_code, nl_code + 1), final_state)
267
268RawNewline = _RawNewline()
269
270
271class SpecialSymbol(RE):
272	"""
273	SpecialSymbol(sym) is an RE which matches the special input
274	symbol |sym|, which is one of BOL, EOL or EOF.
275	"""
276	nullable = 0
277	match_nl = 0
278	sym = None
279
280	def __init__(self, sym):
281		self.sym = sym
282
283	def build_machine(self, m, initial_state, final_state, match_bol, nocase):
284		# Sequences 'bol bol' and 'bol eof' are impossible, so only need
285		# to allow for bol if sym is eol
286		if match_bol and self.sym == EOL:
287			initial_state = self.build_opt(m, initial_state, BOL)
288		initial_state.add_transition(self.sym, final_state)
289
290
291class Seq(RE):
292	"""Seq(re1, re2, re3...) is an RE which matches |re1| followed by
293	|re2| followed by |re3|..."""
294
295	def __init__(self, *re_list):
296		nullable = 1
297		for i in xrange(len(re_list)):
298			re = re_list[i]
299			self.check_re(i, re)
300			nullable = nullable and re.nullable
301		self.re_list = re_list
302		self.nullable = nullable
303		i = len(re_list)
304		match_nl = 0
305		while i:
306			i = i - 1
307			re = re_list[i]
308			if re.match_nl:
309				match_nl = 1
310				break
311			if not re.nullable:
312				break
313		self.match_nl = match_nl
314
315	def build_machine(self, m, initial_state, final_state, match_bol, nocase):
316		re_list = self.re_list
317		if len(re_list) == 0:
318			initial_state.link_to(final_state)
319		else:
320			s1 = initial_state
321			n = len(re_list)
322			for i in xrange(n):
323				if i < n - 1:
324					s2 = m.new_state()
325				else:
326					s2 = final_state
327				re = re_list[i]
328				re.build_machine(m, s1, s2, match_bol, nocase)
329				s1 = s2
330				match_bol = re.match_nl or (match_bol and re.nullable)
331
332	def calc_str(self):
333		return "Seq(%s)" % string.join(map(str, self.re_list), ",")
334
335
336class Alt(RE):
337	"""Alt(re1, re2, re3...) is an RE which matches either |re1| or
338	|re2| or |re3|..."""
339
340	def __init__(self, *re_list):
341		self.re_list = re_list
342		nullable = 0
343		match_nl = 0
344		nullable_res = []
345		non_nullable_res = []
346		i = 1
347		for re in re_list:
348			self.check_re(i, re)
349			if re.nullable:
350				nullable_res.append(re)
351				nullable = 1
352			else:
353				non_nullable_res.append(re)
354			if re.match_nl:
355				match_nl = 1
356			i = i + 1
357		self.nullable_res = nullable_res
358		self.non_nullable_res = non_nullable_res
359		self.nullable = nullable
360		self.match_nl = match_nl
361
362	def build_machine(self, m, initial_state, final_state, match_bol, nocase):
363		for re in self.nullable_res:
364			re.build_machine(m, initial_state, final_state, match_bol, nocase)
365		if self.non_nullable_res:
366			if match_bol:
367				initial_state = self.build_opt(m, initial_state, BOL)
368			for re in self.non_nullable_res:
369				re.build_machine(m, initial_state, final_state, 0, nocase)
370
371	def calc_str(self):
372		return "Alt(%s)" % string.join(map(str, self.re_list), ",")
373
374
375class Rep1(RE):
376	"""Rep1(re) is an RE which matches one or more repetitions of |re|."""
377
378	def __init__(self, re):
379		self.check_re(1, re)
380		self.re = re
381		self.nullable = re.nullable
382		self.match_nl = re.match_nl
383
384	def build_machine(self, m, initial_state, final_state, match_bol, nocase):
385		s1 = m.new_state()
386		s2 = m.new_state()
387		initial_state.link_to(s1)
388		self.re.build_machine(m, s1, s2, match_bol or self.re.match_nl, nocase)
389		s2.link_to(s1)
390		s2.link_to(final_state)
391
392	def calc_str(self):
393		return "Rep1(%s)" % self.re
394
395
396class SwitchCase(RE):
397	"""
398	SwitchCase(re, nocase) is an RE which matches the same strings as RE,
399	but treating upper and lower case letters according to |nocase|. If
400	|nocase| is true, case is ignored, otherwise it is not.
401	"""
402	re = None
403	nocase = None
404
405	def __init__(self, re, nocase):
406		self.re = re
407		self.nocase = nocase
408		self.nullable = re.nullable
409		self.match_nl = re.match_nl
410
411	def build_machine(self, m, initial_state, final_state, match_bol, nocase):
412		self.re.build_machine(m, initial_state, final_state, match_bol,
413													self.nocase)
414
415	def calc_str(self):
416		if self.nocase:
417			name = "NoCase"
418		else:
419			name = "Case"
420		return "%s(%s)" % (name, self.re)
421
422#
423#	 Composite RE constructors
424#	 -------------------------
425#
426#	 These REs are defined in terms of the primitive REs.
427#
428
429Empty = Seq()
430Empty.__doc__ = \
431	"""
432	Empty is an RE which matches the empty string.
433	"""
434Empty.str = "Empty"
435
436def Str1(s):
437	"""
438	Str1(s) is an RE which matches the literal string |s|.
439	"""
440	result = apply(Seq, tuple(map(Char, s)))
441	result.str = "Str(%s)" % repr(s)
442	return result
443
444def Str(*strs):
445	"""
446	Str(s) is an RE which matches the literal string |s|.
447	Str(s1, s2, s3, ...) is an RE which matches any of |s1| or |s2| or |s3|...
448	"""
449	if len(strs) == 1:
450		return Str1(strs[0])
451	else:
452		result = apply(Alt, tuple(map(Str1, strs)))
453		result.str = "Str(%s)" % string.join(map(repr, strs), ",")
454		return result
455
456def Any(s):
457	"""
458	Any(s) is an RE which matches any character in the string |s|.
459	"""
460	#result = apply(Alt, tuple(map(Char, s)))
461	result = CodeRanges(chars_to_ranges(s))
462	result.str = "Any(%s)" % repr(s)
463	return result
464
465def AnyBut(s):
466	"""
467	AnyBut(s) is an RE which matches any character (including
468	newline) which is not in the string |s|.
469	"""
470	ranges = chars_to_ranges(s)
471	ranges.insert(0, -maxint)
472	ranges.append(maxint)
473	result = CodeRanges(ranges)
474	result.str = "AnyBut(%s)" % repr(s)
475	return result
476
477AnyChar = AnyBut("")
478AnyChar.__doc__ = \
479	"""
480	AnyChar is an RE which matches any single character (including a newline).
481	"""
482AnyChar.str = "AnyChar"
483
484def Range(s1, s2 = None):
485	"""
486	Range(c1, c2) is an RE which matches any single character in the range
487	|c1| to |c2| inclusive.
488	Range(s) where |s| is a string of even length is an RE which matches
489	any single character in the ranges |s[0]| to |s[1]|, |s[2]| to |s[3]|,...
490	"""
491	if s2:
492		result = CodeRange(ord(s1), ord(s2) + 1)
493		result.str = "Range(%s,%s)" % (s1, s2)
494	else:
495		ranges = []
496		for i in range(0, len(s1), 2):
497			ranges.append(CodeRange(ord(s1[i]), ord(s1[i+1]) + 1))
498		result = apply(Alt, tuple(ranges))
499		result.str = "Range(%s)" % repr(s1)
500	return result
501
502def Opt(re):
503	"""
504	Opt(re) is an RE which matches either |re| or the empty string.
505	"""
506	result = Alt(re, Empty)
507	result.str = "Opt(%s)" % re
508	return result
509
510def Rep(re):
511	"""
512	Rep(re) is an RE which matches zero or more repetitions of |re|.
513	"""
514	result = Opt(Rep1(re))
515	result.str = "Rep(%s)" % re
516	return result
517
518def NoCase(re):
519	"""
520	NoCase(re) is an RE which matches the same strings as RE, but treating
521	upper and lower case letters as equivalent.
522	"""
523	return SwitchCase(re, nocase = 1)
524
525def Case(re):
526	"""
527	Case(re) is an RE which matches the same strings as RE, but treating
528	upper and lower case letters as distinct, i.e. it cancels the effect
529	of any enclosing NoCase().
530	"""
531	return SwitchCase(re, nocase = 0)
532
533#
534#	 RE Constants
535#
536
537Bol = Char(BOL)
538Bol.__doc__ = \
539	"""
540	Bol is an RE which matches the beginning of a line.
541	"""
542Bol.str = "Bol"
543
544Eol = Char(EOL)
545Eol.__doc__ = \
546	"""
547	Eol is an RE which matches the end of a line.
548	"""
549Eol.str = "Eol"
550
551Eof = Char(EOF)
552Eof.__doc__ = \
553	"""
554	Eof is an RE which matches the end of the file.
555	"""
556Eof.str = "Eof"
557
558