1#Copyright ReportLab Europe Ltd. 2000-2017
2#see license.txt for license details
3#history https://hg.reportlab.com/hg-public/reportlab/log/tip/tools/docco/t_parse.py
4"""
5Template parsing module inspired by REXX (with thanks to Donn Cave for discussion).
6
7Template initialization has the form:
8   T = Template(template_string, wild_card_marker, single_char_marker,
9             x = regex_x, y = regex_y, ...)
10Parsing has the form
11   ([match1, match2, ..., matchn], lastindex) = T.PARSE(string)
12
13Only the first argument is mandatory.
14
15The resultant object efficiently parses strings that match the template_string,
16giving a list of substrings that correspond to each "directive" of the template.
17
18Template directives:
19
20  Wildcard:
21    The template may be initialized with a wildcard that matches any string
22    up to the string matching the next directive (which may not be a wild
23    card or single character marker) or the next literal sequence of characters
24    of the template.  The character that represents a wildcard is specified
25    by the wild_card_marker parameter, which has no default.
26
27    For example, using X as the wildcard:
28
29
30    >>> T = Template("prefixXinteriorX", "X")
31    >>> T.PARSE("prefix this is before interior and this is after")
    ([' this is before ', ' and this is after'], 48)
33    >>> T = Template("<X>X<X>", "X")
34    >>> T.PARSE('<A HREF="index.html">go to index</A>')
35    (['A HREF="index.html"', 'go to index', '/A'], 36)
36
37    Obviously the character used to represent the wildcard must be distinct
38    from the characters used to represent literals or other directives.
39
40  Fixed length character sequences:
41    The template may have a marker character which indicates a fixed
42    length field.  All adjacent instances of this marker will be matched
43    by a substring of the same length in the parsed string.  For example:
44
45      >>> T = Template("NNN-NN-NNNN", single_char_marker="N")
46      >>> T.PARSE("1-2-34-5-12")
47      (['1-2', '34', '5-12'], 11)
48      >>> T.PARSE("111-22-3333")
49      (['111', '22', '3333'], 11)
50      >>> T.PARSE("1111-22-3333")
51      ValueError: literal not found at (3, '-')
52
53    A template may have multiple fixed length markers, which allows fixed
54    length fields to be adjacent, but recognized separately.  For example:
55
56      >>> T = Template("MMDDYYX", "X", "MDY")
57      >>> T.PARSE("112489 Somebody's birthday!")
58      (['11', '24', '89', " Somebody's birthday!"], 27)
59
60  Regular expression markers:
61    The template may have markers associated with regular expressions.
    The regular expressions may be given either as pattern strings or as
    compiled pattern objects.
63    For example:
64      >>> T = Template("v: s i", v=id, s=str, i=int)
65      >>> T.PARSE("this_is_an_identifier: 'a string' 12344")
66      (['this_is_an_identifier', "'a string'", '12344'], 39)
67      >>>
68
69    Here id, str, and int are regular expression conveniences provided by
70    this module.
71
72  Directive markers may be mixed and matched, except that wildcards cannot precede
73  wildcards or single character markers.
74  Example:
75>>> T = Template("ssnum: NNN-NN-NNNN, fn=X, ln=X, age=I, quote=Q", "X", "N", I=int, Q=str)
76>>> T.PARSE("ssnum: 123-45-6789, fn=Aaron, ln=Watters, age=13, quote='do be do be do'")
77(['123', '45', '6789', 'Aaron', 'Watters', '13', "'do be do be do'"], 72)
78>>>
79
80"""
81
82import re, string
83from reportlab.lib.utils import ascii_letters
84
85#
86# template parsing
87#
88# EG: T = Template("(NNN)NNN-NNNN X X", "X", "N")
89#     ([area, exch, ext, fn, ln], index) = T.PARSE("(908)949-2726 Aaron Watters")
90#
class Template:
    """REXX-style template that splits a string into directive matches.

    The template string is analysed once, in the constructor, into a
    "parse sequence" of literals and directives.  PARSE then walks that
    sequence over an input string and returns the substring matched by
    each directive.

    Directives:
      wildcard marker
          matches everything up to the next literal or regular expression
          directive, or to the end of the string when it is the last
          element of the template.
      single character markers
          a run of N identical marker characters matches exactly N
          characters of the input.
      regular expression markers
          given as keyword arguments mapping a one-character marker to a
          pattern string or to a compiled pattern object.

    Attributes set by the constructor:
      template     the template string as given
      wild_card    the wildcard marker as given (may be None)
      char         first character of single_char_marker, or the value
                   passed when it is empty/None
      markers      list of every marker character
      marker_dict  marker -> compiled regular expression
      parse_seq    list of (indicator, data) pairs:
                     (None, literal_text)
                     (wild_card_marker, None)
                     (primary_single_char_marker, field_length)
                     (regex_marker, compiled_pattern)
      ndirectives  number of non-literal entries in parse_seq
    """

    def __init__(self,
                 template,
                 wild_card_marker=None,
                 single_char_marker=None,
                 **marker_to_regex_dict):
        """Analyse template into a parse sequence.

        template             -- the template string
        wild_card_marker     -- single character standing for a wildcard
        single_char_marker   -- one or more characters, each of which marks
                                fixed length fields
        marker_to_regex_dict -- marker=pattern pairs; a pattern is either a
                                string (compiled here) or an object offering
                                match() and search()

        Raises ValueError if a marker is longer than one character, if two
        wildcards are adjacent, or if a wildcard is immediately followed by
        a single character marker.
        """
        self.template = template
        self.wild_card = wild_card_marker
        self.char = single_char_marker
        # determine the set of markers for this template
        markers = list(marker_to_regex_dict)
        if wild_card_marker:
            markers.append(wild_card_marker)
        single_char_primary = None
        if single_char_marker:
            # allow multiple scm's; all of them are recorded in the parse
            # sequence under the first one so PARSE needs a single test
            markers.extend(single_char_marker)
            self.char = single_char_primary = single_char_marker[0]
        self.markers = markers
        for mark in markers:
            if len(mark) > 1:
                raise ValueError("Marks must be single characters: "+repr(mark))
        # compile the regular expressions if needed.
        # type("") is used rather than the name ``str`` because this module
        # rebinds the global ``str`` to a compiled pattern further down,
        # which would make isinstance(rgex, str) fail at run time.
        string_type = type("")
        self.marker_dict = marker_dict = {}
        for mark, rgex in marker_to_regex_dict.items():
            if isinstance(rgex, string_type):
                rgex = re.compile(rgex)
            marker_dict[mark] = rgex
        # determine the parse sequence
        parse_seq = []
        # dummy last char
        lastchar = None
        index = 0
        last = len(template)
        # count the number of directives encountered
        ndirectives = 0
        while index < last:
            start = index
            thischar = template[index]
            # Only meaningful when a wildcard marker was supplied: without
            # this guard None == None would be true at the very start of the
            # template and reject a leading single character marker.
            after_wild_card = bool(wild_card_marker) and lastchar == wild_card_marker
            # is it a wildcard?
            if thischar == wild_card_marker:
                if after_wild_card:
                    raise ValueError("two wild cards in sequence is not allowed")
                parse_seq.append((wild_card_marker, None))
                index += 1
                ndirectives += 1
            # is it a sequence of single character markers?
            elif single_char_marker and thischar in single_char_marker:
                if after_wild_card:
                    raise ValueError("wild card cannot precede single char marker")
                # consume the whole run of this particular marker
                while index < last and template[index] == thischar:
                    index += 1
                parse_seq.append((single_char_primary, index-start))
                ndirectives += 1
            # is it a literal sequence?
            elif thischar not in markers:
                while index < last and template[index] not in markers:
                    index += 1
                parse_seq.append((None, template[start:index]))
            # otherwise it must be a re marker
            else:
                parse_seq.append((thischar, marker_dict[thischar]))
                ndirectives += 1
                index += 1
            lastchar = template[index-1]
        self.parse_seq = parse_seq
        self.ndirectives = ndirectives

    @staticmethod
    def _wildcard_end(s, currentindex, next_step):
        """Return the index in s where a wildcard starting at currentindex ends.

        next_step is the parse sequence entry following the wildcard; the
        constructor guarantees it is a literal or a regular expression.
        """
        (nextindicator, nextdata) = next_step
        if nextindicator is None:
            # search for literal
            last = s.find(nextdata, currentindex)
            if last < currentindex:
                raise ValueError("couldn't terminate wild with lit "+repr(currentindex))
            return last
        # data is a re, search for it; search() yields a match object or None
        found = nextdata.search(s, currentindex)
        if found is None:
            raise ValueError("couldn't terminate wild with re "+repr(currentindex))
        return found.start()

    def PARSE(self, s, start=0):
        """Match s against the template, beginning at index start.

        Returns (matches, index): matches holds one substring per directive
        in template order, index is the position in s just after the last
        element matched.

        Raises ValueError when a literal is not found at the expected
        position, when a wildcard cannot be terminated, or when a regular
        expression does not match.  Fixed length fields are sliced without
        checking that s is long enough.
        """
        ndirectives = self.ndirectives
        wild_card = self.wild_card
        single_char = self.char
        parse_seq = self.parse_seq
        final_index = len(parse_seq) - 1
        # make a list long enough for substitutions for directives
        result = [None] * ndirectives
        current_directive_index = 0
        currentindex = start
        # scan through the parse sequence, recognizing
        for parse_index, (indicator, data) in enumerate(parse_seq):
            # is it a literal indicator?
            if indicator is None:
                if s.find(data, currentindex) != currentindex:
                    raise ValueError("literal not found at "+repr((currentindex,data)))
                currentindex = currentindex + len(data)
                continue
            # anything else is a directive
            if indicator == wild_card:
                if parse_index == final_index:
                    # the last directive matches the rest of the string
                    last = len(s)
                else:
                    # otherwise the next element decides where it ends
                    last = self._wildcard_end(s, currentindex, parse_seq[parse_index+1])
            elif indicator == single_char:
                # data is length to eat
                last = currentindex + data
            else:
                # other directives are always regular expressions;
                # match() yields a match object or None
                matched = data.match(s, currentindex)
                if matched is None:
                    raise ValueError("couldn't match re at "+repr(currentindex))
                last = matched.end()
            result[current_directive_index] = s[currentindex:last]
            current_directive_index += 1
            currentindex = last
        # sanity check
        if current_directive_index != ndirectives:
            raise SystemError("not enough directives found?")
        return (result, currentindex)
215
# some useful regular expressions

# one character from ascii_letters, then any run of ascii_letters,
# digits or underscores
USERNAMEREGEX = "[{0}][{0}{1}_]*".format(ascii_letters, string.digits)
# a single-quoted run containing neither a quote nor a newline
STRINGLITREGEX = "'[^\n']*'"
# one or more digits
SIMPLEINTREGEX = "[%s]+" % string.digits

# Compiled forms.  These rebind the names id, str and int at module level;
# the module docstring documents them as conveniences for building templates.
id, str, int = [
    re.compile(pattern)
    for pattern in (USERNAMEREGEX, STRINGLITREGEX, SIMPLEINTREGEX)
]
224
def test():
    """Build a few sample templates and print what each one parses.

    The first four templates are stored in the module globals T, T1, T2
    and T3; the last one is kept local.
    """
    global T, T1, T2, T3

    # fixed length fields followed by two wildcards
    T = Template("(NNN)NNN-NNNN X X", "X", "N")
    phone_parse = T.PARSE("(908)949-2726 Aaron Watters")
    print(phone_parse)

    # regular expression directives separated by literals
    T1 = Template("s --> s blah", s=str)
    quoted = "' <-- a string --> ' --> 'blah blah another string blah' blah"
    quoted_parse = T1.PARSE(quoted)
    print(quoted_parse)

    # regular expression, fixed length, regular expression, wildcard
    T2 = Template("s --> NNNiX", "X", "N", s=str, i=int)
    mixed_parse = T2.PARSE("'A STRING' --> 15964653alpha beta gamma")
    print(mixed_parse)

    # wildcards terminated by regular expressions
    T3 = Template("XsXi", "X", "N", s=str, i=int)
    wild_parse = T3.PARSE("prefix'string'interior1234junk not parsed")
    print(wild_parse)

    # several distinct fixed length markers side by side
    date_template = Template("MMDDYYX", "X", "MDY")
    date_parse = date_template.PARSE("122961 Somebody's birthday!")
    print(date_parse)
243
244
# run the demonstration when executed as a script
if __name__ == "__main__":
    test()
246