"""
Abbreviation parser: turns an abbreviation string such as ``a>b+c`` into a
tree of ``TreeNode`` objects (see ``parse``).

@author Sergey Chikuyonok (serge.che@gmail.com)
@link http://chikuyonok.ru
"""
import re

# A "word": element/attribute name or id/class value
re_word = re.compile(r'^[\w\-:\$]+')
# A quoted attribute value; group 2 is the content between the quotes
re_attr_string = re.compile(r'^(["\'])((?:(?!\1)[^\\]|\\.)*)\1')
# Names accepted by TreeNode.set_abbreviation (optional trailing '+')
re_valid_name = re.compile(r'^[\w\d\-_\$\:@!]+\+?$', re.IGNORECASE)
# An unquoted attribute value: everything up to the first whitespace or end
re_attr_unquoted = re.compile(r'(.+?)(\s|$)')


def char_at(text, pos):
    """
    Returns character at specified index of text.
    If index is out of range, returns empty string
    """
    return text[pos] if pos < len(text) else ''


def split_expression(expr):
    """
    Split expression by node name and its content, if exists. E.g. if we pass
    'a{Text}' expression, it will be split into 'a' and 'Text'.

    Braces inside an attribute set ('[...]') are ignored, and square brackets
    inside a text node are ignored. A closing brace that has no matching
    opening brace is skipped.

    @type expr: str
    @return: Result tuple with two elements: node name and its content
             (content is None if no complete text node was found)
    """
    # fast test on text node
    if '{' not in expr:
        return expr, None

    attr_lvl = 0
    text_lvl = 0
    brace_stack = []

    for i, ch in enumerate(expr):
        if ch == '[':
            if not text_lvl:
                attr_lvl += 1
        elif ch == ']':
            if not text_lvl:
                attr_lvl -= 1
        elif ch == '{':
            if not attr_lvl:
                text_lvl += 1
                brace_stack.append(i)
        elif ch == '}':
            # 'brace_stack' is empty for an unmatched '}': skip it instead
            # of popping from an empty list
            if not attr_lvl and brace_stack:
                text_lvl -= 1
                brace_start = brace_stack.pop()
                if text_lvl == 0:
                    # found braces bounds
                    return expr[0:brace_start], expr[brace_start + 1:i]

    # if we are here, then no valid text node found
    return expr, None


def parse_attributes(s):
    """
    Parses tag attributes extracted from abbreviation

    Example of incoming data:
    #header
    .some.data
    .some.data#header
    [attr]
    #item[attr=Hello other="World"].class

    @type s: str
    @return: Tuple of (name, attributes). 'name' is made of the characters
             that precede the first '#', '.' or '['; 'attributes' is a list
             of {'name': ..., 'value': ...} dicts. All classes are merged
             into a single space-separated 'class' entry.
    """
    result = []
    name = ''
    collect_name = True
    class_name = None
    char_map = {'#': 'id', '.': 'class'}

    # walk char-by-char
    i = 0
    il = len(s)

    while i < il:
        ch = s[i]
        if ch == '#':  # id
            val = get_word(i + 1, s)
            result.append({'name': char_map[ch], 'value': val})
            i += len(val) + 1
            collect_name = False
        elif ch == '.':  # class
            val = get_word(i + 1, s)
            if not class_name:
                # remember object pointer for value modification
                class_name = {'name': char_map[ch], 'value': ''}
                result.append(class_name)

            if class_name['value']:
                class_name['value'] += ' '
            class_name['value'] += val
            i += len(val) + 1
            collect_name = False
        elif ch == '[':  # begin attribute set
            # search for end of set
            try:
                end_ix = s.index(']', i)
            except ValueError:
                # invalid (unterminated) attribute set, stop searching
                i = il
            else:
                result.extend(extract_attributes(s[i + 1:end_ix]))
                # leave 'i' on the ']' itself: the next iteration skips it
                i = end_ix

            collect_name = False
        else:
            if collect_name:
                name += ch
            i += 1

    return name, result


def get_word(ix, s):
    """
    Get word, starting at 'ix' character of 's'.
    Returns empty string if there is no word at that position
    """
    m = re_word.match(s[ix:])
    return m.group(0) if m else ''


def extract_attributes(attr_set):
    """
    Extract attributes and their values from attribute set (the text found
    between '[' and ']'). Values may be quoted, unquoted or absent; an absent
    value is stored as empty string.

    @type attr_set: str
    @return: List of {'name': ..., 'value': ...} dicts
    """
    attr_set = attr_set.strip()
    loop_count = 100  # endless loop protection
    result = []

    while attr_set and loop_count:
        attr_name = get_word(0, attr_set)
        if not attr_name:
            # something wrong, can't extract attribute name
            break

        attr = {'name': attr_name, 'value': ''}
        name_len = len(attr_name)

        # let's see if attribute has value
        if char_at(attr_set, name_len) == '=':
            value_start = name_len + 1
            if char_at(attr_set, value_start) in ('"', "'"):
                # we have a quoted string
                m = re_attr_string.match(attr_set[value_start:])
                consumed = m and len(m.group(0))
                value = m and m.group(2)
            else:
                # unquoted string
                m = re_attr_unquoted.match(attr_set[value_start:])
                consumed = m and len(m.group(1))
                value = m and m.group(1)

            if m:
                attr['value'] = value
                attr_set = attr_set[value_start + consumed:].strip()
            else:
                # something wrong, break loop (the attribute is still
                # recorded, with an empty value)
                attr_set = ''
        else:
            attr_set = attr_set[name_len:].strip()

        result.append(attr)
        loop_count -= 1

    return result


def squash(node):
    """
    Optimizes tree node: replaces empty nodes with their children.
    Hoisted children get 'node' as their new parent.

    This is a single pass: the first child hoisted into an empty node's place
    is not re-examined, so empty nodes may remain afterwards (optimize_tree
    repeats the pass until none are left).

    @type node: TreeNode
    @return: TreeNode
    """
    i = 0
    # the list is modified in place, so its length is re-read on every step
    while i < len(node.children):
        child = node.children[i]
        if child.is_empty():
            for grandchild in child.children:
                # keep parent pointers in sync with the new tree shape
                grandchild.parent = node
            node.children[i:i + 1] = child.children
        i += 1

    return node


def optimize_tree(node):
    """
    Recursively removes empty nodes from the tree, replacing each of them
    with its children

    @type node: TreeNode
    @return: TreeNode
    """
    while node.has_empty_children():
        squash(node)

    for child in node.children:
        optimize_tree(child)

    return node


def parse(abbr):
    """
    Parses abbreviation into tree with respect of groups,
    text nodes and attributes. Each node of the tree is a single
    abbreviation. Tree represents actual structure of the outputted
    result

    Recognized operators (outside of '{...}' text and '[...]' attributes):
    '>' child, '+' sibling, '(...)' group and '(...)*N' group multiplication.
    A '+' that is the last character is kept as part of the abbreviation.

    @param abbr: Abbreviation to parse
    @type abbr: str
    @return: TreeNode (root node; it has no abbreviation of its own)
    @raise ZenInvalidAbbreviation: if a node name is not valid
    """
    root = TreeNode()
    context = root.add_child()
    i = 0
    il = len(abbr)
    text_lvl = 0
    attr_lvl = 0
    group_stack = [root]
    # one-element list so that dump_token() can reset the token without
    # 'nonlocal' (keeps the module runnable on Python 2)
    token = ['']

    def dump_token():
        # assign collected characters to the current context node
        if token[0]:
            context.set_abbreviation(token[0])
        token[0] = ''

    while i < il:
        ch = abbr[i]
        prev_ch = abbr[i - 1] if i else ''
        is_plain = not text_lvl and not attr_lvl

        if ch == '{':
            if not attr_lvl:
                text_lvl += 1
            token[0] += ch
        elif ch == '}':
            if not attr_lvl:
                text_lvl -= 1
            token[0] += ch
        elif ch == '[':
            if not text_lvl:
                attr_lvl += 1
            token[0] += ch
        elif ch == ']':
            if not text_lvl:
                attr_lvl -= 1
            token[0] += ch
        elif ch == '(' and is_plain:
            # beginning of the new group
            dump_token()

            if prev_ch != '+' and prev_ch != '>':
                # previous char is not an operator, assume it's
                # a sibling
                context = context.parent.add_child()

            group_stack.append(context)
            context = context.add_child()
        elif ch == ')' and is_plain:
            # end of the group, pop stack
            dump_token()
            context = group_stack.pop()

            if char_at(abbr, i + 1) == '*':
                # group multiplication: collect digits that follow '*'
                group_mul = ''
                for j in range(i + 2, il):
                    n_ch = abbr[j]
                    if n_ch.isdigit():
                        group_mul += n_ch
                    else:
                        break

                # skip '*' and the digits
                i += len(group_mul) + 1
                # the same group node is appended again for every repeat
                for _ in range(int(group_mul or 1) - 1):
                    context.parent.add_child(context)
        elif ch == '+' and is_plain and i != il - 1:  # sibling operator
            dump_token()
            context = context.parent.add_child()
        elif ch == '>' and is_plain:  # child operator
            dump_token()
            context = context.add_child()
        else:
            token[0] += ch

        i += 1

    # put the final token
    dump_token()
    return optimize_tree(root)


class TreeNode(object):
    """
    Single node of parsed abbreviation tree
    """
    # trailing multiplier: '*' optionally followed by digits
    re_multiplier = re.compile(r'\*(\d+)?$')

    def __init__(self, parent=None):
        self.abbreviation = ''
        self.parent = parent
        self.children = []
        self.count = 1
        self.name = None
        self.text = None
        self.attributes = []
        self.is_repeating = False
        self.has_implicit_name = False

    def add_child(self, child=None):
        """
        Adds passed or creates new child
        @type child: TreeNode
        @return: TreeNode
        """
        if child is None:
            child = TreeNode()
        child.parent = self
        self.children.append(child)
        return child

    def replace(self, node):
        """
        Replace current node in parent's child list with another node.
        Does nothing if current node has no parent or is not listed in
        parent's children.
        @type node: TreeNode
        """
        if self.parent:
            children = self.parent.children
            if self in children:
                children[children.index(self)] = node
                self.parent = None

    def set_abbreviation(self, abbr):
        """
        Sets abbreviation that belongs to current node and fills 'count',
        'is_repeating', 'name', 'text' and 'attributes' from it
        @type abbr: str
        @raise ZenInvalidAbbreviation: if resulting node name is not valid
        """
        self.abbreviation = abbr
        m = self.re_multiplier.search(abbr)
        if m:
            digits = m.group(1)
            # no digits ('a*') and a zero multiplier ('a*0') both give 1
            self.count = int(digits or 1) or 1
            self.is_repeating = not digits
            abbr = abbr[0:-len(m.group(0))]

        if abbr:
            name, self.text = split_expression(abbr)

            if name:
                self.name, self.attributes = parse_attributes(name)
                if not self.name:
                    # attributes only, e.g. '#header'
                    self.name = 'div'
                    self.has_implicit_name = True

        # validate name
        if self.name and not re_valid_name.match(self.name):
            raise ZenInvalidAbbreviation(self.name)

    def get_abbreviation(self):
        """
        Returns abbreviation passed to set_abbreviation (empty string if
        it was never set)
        """
        return self.abbreviation

    def to_string(self, level=0):
        """
        Dump current tree node into a formatted string: one line per node,
        prefixed with 'level' dashes, children one level deeper
        """
        output = '(empty)'
        if self.abbreviation:
            output = self.name or ''

            if self.text is not None:
                if output:
                    output += ' '
                output += '{text: "' + self.text + '"}'

            if self.attributes:
                pairs = ['%s="%s"' % (a['name'], a['value'])
                         for a in self.attributes]
                output += ' [' + ', '.join(pairs) + ']'

        lines = [('-' * level) + output + '\n']
        lines.extend(child.to_string(level + 1) for child in self.children)
        return ''.join(lines)

    def __repr__(self):
        return self.to_string()

    def has_empty_children(self):
        """
        Check if current node contains children with empty
        <code>abbreviation</code> property
        """
        return any(child.is_empty() for child in self.children)

    def is_empty(self):
        """
        Check if current node has no abbreviation
        """
        return not self.abbreviation

    def is_text_node(self):
        """
        Check if current node is a text-only node. The result is truthy or
        falsy, not necessarily a bool
        """
        return not self.name and self.text


class ZenInvalidAbbreviation(Exception):
    """
    Invalid abbreviation error
    @since: 0.7
    """
    def __init__(self, value):
        super(ZenInvalidAbbreviation, self).__init__(value)
        self.value = value

    def __str__(self):
        return repr(self.value)