# -*- coding: utf-8; mode: Python; indent-tabs-mode: t; tab-width: 4; python-indent: 4 -*-

# Copyright (C) 2012 Olga Yakovleva <yakovleva.o.v@gmail.com>

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Parse a small regular-expression dialect and render it as a foma regex.

The accepted syntax consists of single letters from ``letters``, letter
sets (``[...]`` and ``[^...]``), the dot, the postfix operators ``*``,
``+`` and ``?``, juxtaposition (sequence), ``|`` (alternation) and the
optional ``^`` / ``$`` anchors.  Every syntactic construct is turned into
a node object exposing ``format_as_foma_regex()``.
"""

import pyparsing as pyp

# Alphabet of letters the grammar accepts as literal symbols.
letters = set(u"абвгдеёжзийклмнопрстуфхцчшщьыъэюя")


class node(object):
	"""Base class of all parse-tree nodes.

	Instances are created by pyparsing, which calls the class as a parse
	action with the ``(source string, location, tokens)`` arguments.
	"""

	def __init__(self, src_str, loc, toks):
		# Only the location is kept; the source string itself is not stored.
		self.location = loc
		self.initialize(toks)

	def initialize(self, toks):
		"""Hook for subclasses to extract their data from ``toks``."""
		pass


class letter_node(node):
	"""A single literal letter."""

	def initialize(self, toks):
		self.letter = toks[0]

	def format_as_foma_regex(self):
		"""Return the letter itself."""
		return self.letter


class sequence_node(node):
	"""Two or more subexpressions that must match one after another."""

	def initialize(self, toks):
		self.subexpressions = toks[0].asList()

	def format_as_foma_regex(self):
		"""Return the subexpressions joined by spaces inside ``[ ]``."""
		parts = (expr.format_as_foma_regex() for expr in self.subexpressions)
		return "[" + u" ".join(parts) + "]"


class alternation_node(node):
	"""Two or more alternative subexpressions separated by ``|``."""

	def initialize(self, toks):
		self.subexpressions = toks[0].asList()

	def format_as_foma_regex(self):
		"""Return the subexpressions joined by `` | `` inside ``[ ]``."""
		parts = (expr.format_as_foma_regex() for expr in self.subexpressions)
		return "[" + u" | ".join(parts) + "]"


class repetition_node(node):
	"""An expression followed by one or more postfix operators.

	Attributes:
	expression -- the operand node
	operators  -- all postfix operators, in source order
	operator   -- the first operator (kept for backward compatibility)
	"""

	def initialize(self, toks):
		# pyparsing groups an operand together with *all* the unary
		# left-associative operators that follow it, so the group may
		# hold more than one operator (e.g. "x*?").  Keep every one of
		# them; reading only the first would silently drop the rest.
		items = list(toks[0])
		self.expression = items[0]
		self.operators = items[1:]
		self.operator = self.operators[0]

	def format_as_foma_regex(self):
		"""Return the operand with every operator applied in order.

		``?`` is rendered by wrapping the current result in parentheses;
		``*`` and ``+`` are appended unchanged.
		"""
		result = self.expression.format_as_foma_regex()
		for operator in self.operators:
			if operator == "?":
				result = "(" + result + ")"
			else:
				result = result + operator
		return result


class letterset_node(node):
	"""A bracketed set of letters, optionally negated with ``^``."""

	def initialize(self, toks):
		# The opening "^" is optional: when present it is the first token,
		# otherwise the first token is the group of letters.
		self.negated = toks[0] == "^"
		# Duplicates are removed and the letters are put in sorted order.
		self.letters = sorted(set(toks[-1].asList()))

	def format_as_foma_regex(self):
		"""Return the letters joined by ``|`` in ``[ ]``, or ``\\[ ]`` if negated."""
		# "\\[" spells the backslash explicitly instead of relying on the
		# invalid escape sequence "\[" (same value, no warning).
		opening = "\\[" if self.negated else "["
		return opening + u"|".join(self.letters) + "]"


class dot_node(node):
	"""The ``.`` wildcard."""

	def format_as_foma_regex(self):
		"""Return ``?``."""
		return "?"


# --- Grammar -------------------------------------------------------------

Letter = pyp.oneOf(list(letters))
# Letter.copy() is taken before the parse action is attached to Letter
# below, so a letter set receives plain strings rather than letter_node
# objects.
Letterset = (pyp.Literal("[").suppress()
	+ pyp.Optional(pyp.Literal("^"))
	+ pyp.Group(pyp.OneOrMore(Letter.copy()))
	+ pyp.Literal("]").suppress())
Dot = pyp.Literal(".")
StartAnchor = pyp.Optional(pyp.StringStart() + pyp.Literal("^"))
EndAnchor = pyp.Optional(pyp.Literal("$") + pyp.StringEnd())
Letter.setParseAction(letter_node)
Letterset.setParseAction(letterset_node)
Dot.setParseAction(dot_node)
Atom = Letter | Letterset | Dot
# Operator levels: postfix repetition, then juxtaposition (an empty
# operator between two operands), then alternation.
Regex0 = pyp.operatorPrecedence(Atom, [
	(pyp.oneOf("* + ?"), 1, pyp.opAssoc.LEFT, repetition_node),
	(pyp.Empty(), 2, pyp.opAssoc.LEFT, sequence_node),
	(pyp.Literal("|").suppress(), 2, pyp.opAssoc.LEFT, alternation_node),
])
Regex = (StartAnchor.setResultsName("start_of_string")
	+ Regex0.setResultsName("root")
	+ EndAnchor.setResultsName("end_of_string"))


def parse(string, parse_all=False):
	"""Parse ``string`` with the ``Regex`` grammar.

	Arguments:
	string    -- the regular expression text to parse
	parse_all -- when true, require the grammar to consume the whole
	             input; the default (False) keeps the historical
	             behaviour of ignoring any unparsed trailing text

	Returns the pyparsing results, with the named parts
	``start_of_string``, ``root`` and ``end_of_string``.

	Raises ``pyparsing.ParseException`` if the input does not match.
	"""
	return Regex.parseString(string, parse_all)