1# -*- coding: utf-8; mode: Python; indent-tabs-mode: t; tab-width: 4; python-indent: 4 -*-
2
3# Copyright (C) 2012  Olga Yakovleva <yakovleva.o.v@gmail.com>
4
5# This program is free software: you can redistribute it and/or modify
6# it under the terms of the GNU General Public License as published by
7# the Free Software Foundation, either version 3 of the License, or
8# (at your option) any later version.
9
10# This program is distributed in the hope that it will be useful,
11# but WITHOUT ANY WARRANTY; without even the implied warranty of
12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13# GNU General Public License for more details.
14
15# You should have received a copy of the GNU General Public License
16# along with this program.  If not, see <http://www.gnu.org/licenses/>.
17
18import pyparsing as pyp
19
# Lowercase letters of the Russian alphabet (including "ё"), kept as a set of
# one-character strings.
letters=set()
letters.update(u"абвгдеёжзийклмнопрстуфхцчшщьыъэюя")
21
class node(object):
	"""Common base of the syntax tree nodes.

	The constructor takes the three arguments (source string, location,
	tokens), stores the location and hands the tokens to initialize(),
	which subclasses override.
	"""

	def __init__(self,src_str,loc,toks):
		"""Record where the node was matched and process its tokens.

		src_str -- the source string (not used by the base class)
		loc -- location value, stored as self.location
		toks -- matched tokens, forwarded unchanged to initialize()
		"""
		self.location=loc
		self.initialize(toks)

	def initialize(self,toks):
		"""Hook for subclasses; the base implementation ignores toks."""
		return None
29
class letter_node(node):
	"""Node for a single literal letter."""

	def initialize(self,toks):
		"""Keep the first matched token as the letter."""
		first_token=toks[0]
		self.letter=first_token

	def format_as_foma_regex(self):
		"""Return the stored letter unchanged."""
		letter=self.letter
		return letter
36
class sequence_node(node):
	"""Node for a concatenation of subexpressions."""

	def initialize(self,toks):
		"""Store the members of the first (grouped) token as a plain list."""
		group=toks[0]
		self.subexpressions=group.asList()

	def format_as_foma_regex(self):
		"""Return the formatted subexpressions separated by spaces, in brackets."""
		parts=[]
		for child in self.subexpressions:
			parts.append(child.format_as_foma_regex())
		return "["+u" ".join(parts)+"]"
43
class alternation_node(node):
	"""Node for a choice between subexpressions."""

	def initialize(self,toks):
		"""Store the members of the first (grouped) token as a plain list."""
		group=toks[0]
		self.subexpressions=group.asList()

	def format_as_foma_regex(self):
		"""Return the formatted subexpressions joined with " | ", in brackets."""
		branches=[child.format_as_foma_regex() for child in self.subexpressions]
		body=u" | ".join(branches)
		return "["+body+"]"
50
class repetition_node(node):
	"""Node for an expression followed by a postfix operator ("*", "+" or "?").

	Attributes:
	expression -- the operand node
	operator -- the operator string applied to it
	"""

	def initialize(self,toks):
		"""Read the operand and its operator(s) from the first (grouped) token.

		The group has the form [operand, op1, op2, ...]: pyparsing collects a
		whole chain of left-associative unary operators into one group.
		Every operator except the last is folded into a nested
		repetition_node, so that none of them is silently dropped and
		"а+?" becomes the optional of "а+".

		Raises IndexError if the group contains no operator.
		"""
		group=list(toks[0])
		operand=group[0]
		operators=group[1:]
		for inner_operator in operators[:-1]:
			# Wrap what has been built so far; the nested list mimics the
			# grouped token structure this method expects.
			operand=repetition_node(None,self.location,[[operand,inner_operator]])
		self.expression=operand
		self.operator=operators[-1]

	def format_as_foma_regex(self):
		"""Return the operand's regex with the operator applied.

		"?" is written by wrapping the operand in parentheses; any other
		operator is appended to the operand as is.
		"""
		operand_regex=self.expression.format_as_foma_regex()
		if self.operator=="?":
			return "("+operand_regex+")"
		return operand_regex+self.operator
61
class letterset_node(node):
	"""Node for a bracketed set of letters, optionally negated with "^".

	Attributes:
	negated -- True when the first token is "^"
	letters -- the distinct letters of the set, sorted
	"""

	def initialize(self,toks):
		"""Read the negation flag and the letters from the tokens.

		The first token is compared with "^"; the last token is the group of
		letters, which is de-duplicated and sorted.
		"""
		self.negated=toks[0]=="^"
		self.letters=sorted(set(toks[-1].asList()))

	def format_as_foma_regex(self):
		"""Return the letters joined with "|" in brackets.

		A negated set is prefixed with a backslash.
		"""
		# The backslash is escaped explicitly: "\[" is an invalid escape
		# sequence that only happens to produce the same two characters.
		opening="\\[" if self.negated else "["
		return opening+u"|".join(self.letters)+"]"
69
class dot_node(node):
	"""Node for the "." token."""

	def format_as_foma_regex(self):
		"""Return "?", the text emitted for this node."""
		foma_symbol="?"
		return foma_symbol
73
# Grammar of the supported regular expression subset.
Letter=pyp.oneOf(list(letters))
# Letter.copy() is taken here, before Letter receives its parse action below,
# so that the letters inside a set are not wrapped by letter_node;
# letterset_node sorts and joins them as strings.
Letterset=pyp.Literal("[").suppress()+pyp.Optional(pyp.Literal("^"))+pyp.Group(pyp.OneOrMore(Letter.copy()))+pyp.Literal("]").suppress()
Dot=pyp.Literal(".")
# Optional anchors: "^" at the start of the string and "$" at its end.
StartAnchor=pyp.Optional(pyp.StringStart()+pyp.Literal("^"))
EndAnchor=pyp.Optional(pyp.Literal("$")+pyp.StringEnd())
Letter.setParseAction(letter_node)
Letterset.setParseAction(letterset_node)
Dot.setParseAction(dot_node)
Atom=Letter|Letterset|Dot
# pyparsing renamed operatorPrecedence to infixNotation and newer releases no
# longer provide the old name, so use the new name whenever it exists and fall
# back to the old one on older installations.
_infix_notation=getattr(pyp,"infixNotation",None) or pyp.operatorPrecedence
# Operators are listed from the tightest binding to the loosest: postfix
# repetition, concatenation (no operator text, hence Empty) and alternation.
Regex0=_infix_notation(Atom,
	[(pyp.oneOf("* + ?"),1,pyp.opAssoc.LEFT,repetition_node),
	(pyp.Empty(),2,pyp.opAssoc.LEFT,sequence_node),
	(pyp.Literal("|").suppress(),2,pyp.opAssoc.LEFT,alternation_node)])
Regex=StartAnchor.setResultsName("start_of_string")+Regex0.setResultsName("root")+EndAnchor.setResultsName("end_of_string")
88
def parse(string,parse_all=False):
	"""Parse a regular expression with the Regex grammar.

	string -- the text of the regular expression
	parse_all -- when true, the whole string has to match the grammar;
	             with the default (False) parsing stops at the first
	             character that cannot be matched and the remaining input
	             is ignored without an error

	Returns the pyparsing results object; the parts are available under
	the names "start_of_string", "root" and "end_of_string".
	pyparsing raises ParseException when the text does not match.
	"""
	# Passed positionally: the second parameter of parseString is the
	# "parse all" flag, whatever its keyword spelling is.
	return Regex.parseString(string,parse_all)
91