'''
@author Sergey Chikuyonok (serge.che@gmail.com)
@link http://chikuyonok.ru
'''
from zencoding.parser import css, xml
import re


def is_stop_char(token):
    """
    Tells whether a token terminates the current selector/value chunk.

    Token types for stop characters are the single-character strings
    '{', '}', ';' and ':', so the substring test below acts as a
    membership test for those types.

    @param token: token dict with a 'type' key
    @type token: dict
    @return: bool
    """
    return token['type'] in '{};:'


def char_at(text, pos):
    """
    Returns character at specified index of text.
    If index is out of range, returns empty string.

    @type text: str
    @type pos: int
    @return: str
    """
    # The lower bound guard matters: without it a negative pos would wrap
    # around (Python negative indexing) and return a character from the
    # end of the string, contradicting the documented contract.
    return text[pos] if 0 <= pos < len(text) else ''


def calculate_nl_length(content, pos):
    """
    Calculates newline width at specified position in content:
    2 for a Windows '\r\n' pair, 1 for a bare '\n' (or '\r').

    @param content: str
    @param pos: int
    @return: int
    """
    if char_at(content, pos) == '\r' and char_at(content, pos + 1) == '\n':
        return 2

    return 1


def post_process_optimized(optimized, original):
    """
    Post-process optimized tokens: collapse tokens for complex values.

    For each 'value' token, walks the original (pre-optimization) tokens
    it was built from and records, in token['children'], the [start, end]
    ranges of runs of non-whitespace subtokens. Adjacent non-whitespace
    subtokens are merged into a single child range; whitespace splits
    children apart.

    @param optimized: Optimized tokens
    @type optimized: list
    @param original: Original preprocessed tokens
    @type original: list
    @return: the same `optimized` list, mutated in place
    """
    for token in optimized:
        child = None
        if token['type'] == 'value':
            token['children'] = []
            child = None

            subtoken_start = token['ref_start_ix']

            while subtoken_start <= token['ref_end_ix']:
                subtoken = original[subtoken_start]
                if subtoken['type'] != 'white':
                    if not child:
                        # start a new child range
                        child = [subtoken['start'], subtoken['end']]
                    else:
                        # extend the current child range
                        child[1] = subtoken['end']
                elif child:
                    # whitespace closes the current child range
                    token['children'].append(child)
                    child = None

                subtoken_start += 1

            if child:  # push last token
                token['children'].append(child)

    return optimized


def make_token(type='', value='', pos=0, ix=0):
    """
    Creates a token dict describing a source-code chunk.

    NOTE: the `type` parameter name shadows the builtin; kept as-is
    because callers pass it by keyword.

    @param type: token type (e.g. 'selector', 'value', 'white', '{')
    @param value: token text content
    @param pos: character index of the token start in the source
    @param ix: index of the reference token that starts/ends this token
    @return: dict
    """
    value = value or ''
    return {
        'type': type or '',
        'content': value,
        'start': pos,
        'end': pos + len(value),
        # Reference token index that starts current token
        'ref_start_ix': ix,
        # Reference token index that ends current token
        'ref_end_ix': ix
    }


def parse_css(source, offset=0):
    """
    Parses CSS and optimizes parsed chunks.

    @param source: CSS source code fragment
    @type source: str
    @param offset: Offset of CSS fragment inside whole document
    @type offset: int
    @return: list
    """
    return optimize_css(css.parse(source), offset, source)


def parse_html(tag, offset=0):
    """
    Parses HTML and optimizes parsed chunks.

    @param tag: HTML source code fragment
    @type tag: str
    @param offset: Offset of HTML fragment inside whole document
    @type offset: int
    @return: list
    """
    tokens = xml.parse(tag)
    result = []
    i = 0
    loop = 1000  # infinite loop protection

    try:
        while loop:
            loop -= 1
            t = tokens['next']()
            if not t:
                break
            else:
                # NOTE(review): token keys are inconsistent here — the token
                # is built from t['style']/t['content'] but the cursor is
                # advanced by len(t['value']). Verify against the xml parser's
                # token schema; if 'content' and 'value' differ, offsets drift.
                result.append(make_token(t['style'], t['content'], offset + i, 0))
                i += len(t['value'])
    except xml.StopIteration:
        # the parser module raises its own StopIteration to signal exhaustion
        pass

    return result


class ExtList(list):
    """
    Plain list subclass: exists only so an `original` attribute can be
    attached to the optimized-token list (builtin lists reject attributes).
    """
    def __init__(self):
        super(ExtList, self).__init__()
        # original (pre-optimization) tokens, filled by optimize_css()
        self.original = []


def optimize_css(tokens, offset, content):
    """
    Optimizes parsed CSS tokens: combines selector chunks, complex values
    into a single chunk.

    @param tokens: Tokens produced by <code>CSSEX.lex()</code>
    @type tokens: list
    @param offset: CSS rule offset in source code (character index)
    @type offset: int
    @param content: Original CSS source code
    @type content: str
    @return: list of optimized tokens (ExtList with `.original` attached)
    """
    offset = offset or 0
    result = ExtList()
    _o = 0          # width consumed on the current source line
    i = 0           # current index into `tokens` (read by add_token)
    delta = 0       # running character offset of the current line start
    in_rules = False
    in_value = False
    # accumulator tokens: while one is active, subsequent tokens of the
    # same kind are appended to it instead of creating new result entries
    acc_tokens = {
        'selector': None,
        'value': None
    }
    orig_tokens = []
    acc_type = None

    def add_token(token, type):
        # Either extend the active accumulator of `type`, start a new one,
        # or (for non-accumulated types) emit a standalone token.
        # Reads `i` and `delta` from the enclosing loop at call time.
        if type and type in acc_tokens:
            if not acc_tokens[type]:
                acc_tokens[type] = make_token(type, token['value'], offset + delta + token['charstart'], i)
                result.append(acc_tokens[type])
            else:
                acc_tokens[type]['content'] += token['value']
                acc_tokens[type]['end'] += len(token['value'])
                acc_tokens[type]['ref_end_ix'] = i
        else:
            result.append(make_token(token['type'], token['value'], offset + delta + token['charstart'], i))

    for i, token in enumerate(tokens):
        acc_type = None

        if token['type'] == 'line':
            delta += _o
            nl_size = calculate_nl_length(content, delta) if content else 1
            tok_value = '\n' if nl_size == 1 else '\r\n'

            orig_tokens.append(make_token(token['type'], tok_value, offset + delta))

            result.append(make_token(token['type'], tok_value, offset + delta, i))
            delta += nl_size
            _o = 0

            continue

        orig_tokens.append(make_token(token['type'], token['value'], offset + delta + token['charstart']))

        # use charstart and length because of incorrect charend
        # computation for whitespace
        _o = token['charstart'] + len(token['value'])

        if token['type'] != 'white':
            if token['type'] == '{':
                in_rules = True
                acc_tokens['selector'] = None
            elif in_rules:
                if token['type'] == ':':
                    in_value = True
                elif token['type'] == ';':
                    in_value = False
                    acc_tokens['value'] = None
                elif token['type'] == '}':
                    in_value = in_rules = False
                    acc_tokens['value'] = None
                elif in_value or acc_tokens['value']:
                    acc_type = 'value'
            elif acc_tokens['selector'] or (not in_rules and not is_stop_char(token)):
                # start selector token
                acc_type = 'selector'

            add_token(token, acc_type)
        else:
            # whitespace token, decide where it should be:
            # drop it entirely if the next token is a stop character
            if i < len(tokens) - 1 and is_stop_char(tokens[i + 1]):
                continue

            # otherwise fold it into whichever accumulator is active
            if acc_tokens['selector'] or acc_tokens['value']:
                add_token(token, 'selector' if acc_tokens['selector'] else 'value')

    result.original = orig_tokens
    return post_process_optimized(result, orig_tokens)


def extract_css_rule(content, pos, is_backward=False):
    """
    Extracts single CSS rule definition from source code.

    @param content: CSS source code
    @type content: str
    @param pos: Character position where to start source code extraction
    @type pos: int
    @param is_backward: when True, a '}' found while scanning left does not
        stop the search (used for backward-matching mode)
    @type is_backward: bool
    @return: (start, end) character range of the rule, or None if no
        complete rule was found
    """
    result = ''
    c_len = len(content)
    offset = pos
    brace_pos = -1

    # search left until we find rule edge
    while offset >= 0:
        ch = content[offset]
        if ch == '{':
            brace_pos = offset
            break
        elif ch == '}' and not is_backward:
            # hit the end of a previous rule: restart forward scan just after it
            offset += 1
            break

        offset -= 1

    # search right for full rule set
    while offset < c_len:
        ch = content[offset]
        if ch == '{':
            brace_pos = offset
        elif ch == '}':
            if brace_pos != -1:
                result = content[brace_pos:offset + 1]
            break

        offset += 1

    if result:
        # find CSS selector: scan left from the opening brace until a
        # character that cannot belong to a selector
        offset = brace_pos - 1
        selector = ''
        while offset >= 0:
            ch = content[offset]
            if ch in '{}/\\<>':
                break
            offset -= 1

        # also trim whitespace (MULTILINE: strips leading whitespace at
        # every line start within the selector text)
        re_white = re.compile(r'^[\s\n\r]+', re.MULTILINE)
        selector = re.sub(re_white, '', content[offset + 1:brace_pos])
        return (brace_pos - len(selector), brace_pos + len(result))

    return None


# function alias
token = make_token