#Copyright ReportLab Europe Ltd. 2000-2017
#see license.txt for license details
#history https://hg.reportlab.com/hg-public/reportlab/log/tip/tools/docco/t_parse.py
"""
Template parsing module inspired by REXX (with thanks to Donn Cave for discussion).

Template initialization has the form:
   T = Template(template_string, wild_card_marker, single_char_marker,
             x = regex_x, y = regex_y, ...)
Parsing has the form
   ([match1, match2, ..., matchn], lastindex) = T.PARSE(string)

Only the first argument is mandatory.

The resultant object efficiently parses strings that match the template_string,
giving a list of substrings that correspond to each "directive" of the template.

Template directives:

  Wildcard:
    The template may be initialized with a wildcard that matches any string
    up to the string matching the next directive (which may not be a wild
    card or single character marker) or the next literal sequence of characters
    of the template.  The character that represents a wildcard is specified
    by the wild_card_marker parameter, which defaults to None (no wildcard).

    For example, using X as the wildcard:


    >>> T = Template("prefixXinteriorX", "X")
    >>> T.PARSE("prefix this is before interior and this is after")
    ([' this is before ', ' and this is after'], 48)
    >>> T = Template("<X>X<X>", "X")
    >>> T.PARSE('<A HREF="index.html">go to index</A>')
    (['A HREF="index.html"', 'go to index', '/A'], 36)

    Obviously the character used to represent the wildcard must be distinct
    from the characters used to represent literals or other directives.

  Fixed length character sequences:
    The template may have a marker character which indicates a fixed
    length field.  All adjacent instances of this marker will be matched
    by a substring of the same length in the parsed string.  For example:

      >>> T = Template("NNN-NN-NNNN", single_char_marker="N")
      >>> T.PARSE("1-2-34-5-12")
      (['1-2', '34', '5-12'], 11)
      >>> T.PARSE("111-22-3333")
      (['111', '22', '3333'], 11)
      >>> T.PARSE("1111-22-3333")
      ValueError: literal not found at (3, '-')

    A template may have multiple fixed length markers, which allows fixed
    length fields to be adjacent, but recognized separately.  For example:

      >>> T = Template("MMDDYYX", "X", "MDY")
      >>> T.PARSE("112489 Somebody's birthday!")
      (['11', '24', '89', " Somebody's birthday!"], 27)

  Regular expression markers:
    The template may have markers associated with regular expressions.
    The regular expressions may be given either as pattern strings or as
    compiled regular expression objects.
    For example:
      >>> T = Template("v: s i", v=id, s=str, i=int)
      >>> T.PARSE("this_is_an_identifier: 'a string' 12344")
      (['this_is_an_identifier', "'a string'", '12344'], 39)
      >>>

    Here id, str, and int are regular expression conveniences provided by
    this module.

  Directive markers may be mixed and matched, except that wildcards cannot precede
  wildcards or single character markers.
  Example:
>>> T = Template("ssnum: NNN-NN-NNNN, fn=X, ln=X, age=I, quote=Q", "X", "N", I=int, Q=str)
>>> T.PARSE("ssnum: 123-45-6789, fn=Aaron, ln=Watters, age=13, quote='do be do be do'")
(['123', '45', '6789', 'Aaron', 'Watters', '13', "'do be do be do'"], 72)
>>>

"""

import re
import string

try:
    from reportlab.lib.utils import ascii_letters
except ImportError:
    # Allow the module to be used without reportlab installed.
    ascii_letters = string.ascii_letters

# This module deliberately exports regex conveniences named ``id``, ``str``
# and ``int`` (see the bottom of the file).  Keep a private handle on the
# builtin string type so type checks inside the class still work after the
# global name ``str`` has been rebound to a compiled pattern.
_builtin_str = str

#
# template parsing
#
# EG: T = Template("(NNN)NNN-NNNN X X", "X", "N")
#     ([area, exch, ext, fn, ln], index) = T.PARSE("(908)949-2726 Aaron Watters")
#
class Template:
    """Compiled REXX-style template that splits a string into directive matches.

    The constructor turns the template string into ``parse_seq``, a list of
    ``(indicator, data)`` pairs:

      * ``(None, literal_text)``          -- a literal run of characters
      * ``(wild_card_marker, None)``      -- a wildcard
      * ``(primary_single_char, length)`` -- a fixed length field
      * ``(marker, compiled_regex)``      -- a regular expression directive

    ``PARSE`` then walks that sequence over an input string.
    """

    def __init__(self,
                 template,
                 wild_card_marker=None,
                 single_char_marker=None,
                 **marker_to_regex_dict):
        """Compile ``template`` into a parse sequence.

        template            -- the template string.
        wild_card_marker    -- optional single character standing for a wildcard.
        single_char_marker  -- optional string; each character in it marks a
                               fixed length field (runs of the same character
                               form one field).
        marker_to_regex_dict -- keyword arguments mapping a single character
                               marker to a regular expression (pattern string
                               or compiled object).

        Raises ValueError if a marker is longer than one character, if two
        wildcards are adjacent, or if a wildcard directly precedes a fixed
        length field.
        """
        self.template = template
        self.wild_card = wild_card_marker
        self.char = single_char_marker
        # determine the set of markers for this template
        markers = list(marker_to_regex_dict.keys())
        if wild_card_marker:
            markers.append(wild_card_marker)
        if single_char_marker:
            # allow multiple scm's; they are all reported under the first one
            markers.extend(single_char_marker)
            self.char = single_char_primary = single_char_marker[0]
        self.markers = markers
        for mark in markers:
            if len(mark) > 1:
                raise ValueError("Marks must be single characters: "+repr(mark))
        # compile the regular expressions if needed
        self.marker_dict = marker_dict = {}
        for mark, rgex in marker_to_regex_dict.items():
            # NB: the global name ``str`` is a compiled regex in this module,
            # so the builtin type must be referenced through _builtin_str.
            if isinstance(rgex, _builtin_str):
                rgex = re.compile(rgex)
            marker_dict[mark] = rgex
        # determine the parse sequence
        parse_seq = []
        # dummy last char
        lastchar = None
        index = 0
        last = len(template)
        # count the number of directives encountered
        ndirectives = 0
        while index < last:
            start = index
            thischar = template[index]
            # is it a wildcard?
            if thischar == wild_card_marker:
                if lastchar == wild_card_marker:
                    raise ValueError("two wild cards in sequence is not allowed")
                parse_seq.append((wild_card_marker, None))
                index = index+1
                ndirectives = ndirectives+1
            # is it a sequence of single character markers?
            elif single_char_marker and thischar in single_char_marker:
                # Only meaningful when a wildcard marker exists: without this
                # guard the initial lastchar (None) would compare equal to an
                # unset wildcard marker (None) and reject valid templates.
                if wild_card_marker and lastchar == wild_card_marker:
                    raise ValueError("wild card cannot precede single char marker")
                while index < last and template[index] == thischar:
                    index = index+1
                parse_seq.append((single_char_primary, index-start))
                ndirectives = ndirectives+1
            # is it a literal sequence?
            elif thischar not in markers:
                while index < last and template[index] not in markers:
                    index = index+1
                parse_seq.append((None, template[start:index]))
            # otherwise it must be a re marker
            else:
                rgex = marker_dict[thischar]
                parse_seq.append((thischar, rgex))
                ndirectives = ndirectives+1
                index = index+1
            lastchar = template[index-1]
        self.parse_seq = parse_seq
        self.ndirectives = ndirectives

    def PARSE(self, s, start=0):
        """Match ``s`` (from offset ``start``) against the template.

        Returns ``(matches, index)`` where ``matches`` is the list of
        substrings matched by each directive, in template order, and
        ``index`` is the offset in ``s`` just past the last matched character.

        Raises ValueError when a literal, fixed length field, wildcard
        terminator or regular expression cannot be matched.
        """
        ndirectives = self.ndirectives
        wild_card = self.wild_card
        single_char = self.char
        parse_seq = self.parse_seq
        final_index = len(parse_seq) - 1
        # make a list long enough for substitutions for directives
        result = [None] * ndirectives
        current_directive_index = 0
        currentindex = start
        # scan through the parse sequence, recognizing
        for parse_index, (indicator, data) in enumerate(parse_seq):
            # is it a literal indicator?
            if indicator is None:
                if not s.startswith(data, currentindex):
                    raise ValueError("literal not found at "+repr((currentindex,data)))
                currentindex = currentindex + len(data)
                continue
            # anything else is a directive
            if indicator == wild_card:
                if parse_index == final_index:
                    # the last directive matches the rest of the string
                    last = len(s)
                else:
                    # the next element (re or literal) terminates the wildcard
                    (nextindicator, nextdata) = parse_seq[parse_index+1]
                    if nextindicator is None:
                        # search for literal
                        last = s.find(nextdata, currentindex)
                        if last < currentindex:
                            raise ValueError("couldn't terminate wild with lit "+repr(currentindex))
                    else:
                        # data is a re, search for it; the wildcard ends
                        # where the first match of that re begins
                        found = nextdata.search(s, currentindex)
                        if found is None:
                            raise ValueError("couldn't terminate wild with re "+repr(currentindex))
                        last = found.start()
            elif indicator == single_char:
                # data is length to eat
                last = currentindex + data
                if last > len(s):
                    raise ValueError("string too short for fixed length field at "+repr(currentindex))
            else:
                # other directives are always regular expressions, anchored
                # at the current position
                found = data.match(s, currentindex)
                if found is None:
                    raise ValueError("couldn't match re at "+repr(currentindex))
                last = found.end()
            result[current_directive_index] = s[currentindex:last]
            current_directive_index = current_directive_index+1
            currentindex = last
        # sanity check
        if current_directive_index != ndirectives:
            raise SystemError("not enough directives found?")
        return (result, currentindex)

# some useful regular expressions
USERNAMEREGEX = \
      "["+ascii_letters+"]["+ascii_letters+string.digits+"_]*"
STRINGLITREGEX = "'[^\n']*'"
SIMPLEINTREGEX = "["+string.digits+"]+"
# NOTE: these names intentionally shadow builtins; they are part of the
# module's documented interface (e.g. Template("v: s i", v=id, s=str, i=int)).
id = re.compile(USERNAMEREGEX)
str = re.compile(STRINGLITREGEX)
int = re.compile(SIMPLEINTREGEX)

def test():
    """Print the results of parsing a handful of sample templates."""
    global T, T1, T2, T3

    T = Template("(NNN)NNN-NNNN X X", "X", "N")
    print(T.PARSE("(908)949-2726 Aaron Watters"))

    T1 = Template("s --> s blah", s=str)
    s = "' <-- a string --> ' --> 'blah blah another string blah' blah"
    print(T1.PARSE(s))

    T2 = Template("s --> NNNiX", "X", "N", s=str, i=int)
    print(T2.PARSE("'A STRING' --> 15964653alpha beta gamma"))

    T3 = Template("XsXi", "X", "N", s=str, i=int)
    print(T3.PARSE("prefix'string'interior1234junk not parsed"))

    T4 = Template("MMDDYYX", "X", "MDY")
    print(T4.PARSE("122961 Somebody's birthday!"))


if __name__=="__main__": test()