#
# Copyright (c) 2003 Richard Jones (http://mechanicalcat.net/richard)
# Copyright (c) 2002 ekit.com Inc (http://www.ekit-inc.com/)
# Copyright (c) 2001 Bizar Software Pty Ltd (http://www.bizarsoftware.com.au/)
#
# See the README for full license details.
#
# HISTORY:
# This code is heavily based on the TAL parsing code from the Zope Page
# Templates effort at www.zope.org. No copyright or license accompanied
# that code.
#
# $Id: SimpleDOM.py,v 1.7 2004/08/26 02:50:19 richard Exp $

'''A Simple DOM parser

Simple usage:
>>> import SimpleDOM
>>> parser = SimpleDOM.SimpleDOMParser()
>>> parser.parseString("""<html><head><title>My Document</title></head>
... <body>
...  <p>This is a paragraph!!!</p>
...  <p>This is another para!!</p>
... </body>
... </html>""")
>>> dom = parser.getDOM()
>>> dom.getByName('p')
[<SimpleDOMNode "p" {} (1 elements)>, <SimpleDOMNode "p" {} (1 elements)>]
>>> dom.getByName('p')[0][0]
'This is a paragraph!!!'
>>> dom.getByName('title')[0][0]
'My Document'
'''

import sys
import string

# NOTE this is using a modified HTMLParser
from HTMLParser import HTMLParser, HTMLParseError
from utility import Upload

BOOLEAN_HTML_ATTRS = [
    # List of Boolean attributes in HTML that may be given in
    # minimized form (e.g. <img ismap> rather than <img ismap="">)
    # From http://www.w3.org/TR/xhtml1/#guidelines (C.10)
    "compact", "nowrap", "ismap", "declare", "noshade", "checked",
    "disabled", "readonly", "multiple", "selected", "noresize",
    "defer"
    ]

EMPTY_HTML_TAGS = [
    # List of HTML tags with an empty content model; these are
    # rendered in minimized form, e.g. <img />.
    # From http://www.w3.org/TR/xhtml1/#dtds
    "base", "meta", "link", "hr", "br", "param", "img", "area",
    "input", "col", "basefont", "isindex", "frame",
    ]

PARA_LEVEL_HTML_TAGS = [
    # List of HTML elements that close open paragraph-level elements
    # and are themselves paragraph-level.
    "h1", "h2", "h3", "h4", "h5", "h6", "p",
    ]

BLOCK_CLOSING_TAG_MAP = {
    # Maps a tag being opened to the open tags it implicitly closes.
    "tr": ("tr", "td", "th"),
    "td": ("td", "th"),
    "th": ("td", "th"),
    "li": ("li",),
    "dd": ("dd", "dt"),
    "dt": ("dd", "dt"),
    "option": ("option",),
    }

BLOCK_LEVEL_HTML_TAGS = [
    # List of HTML tags that denote larger sections than paragraphs.
    "blockquote", "table", "tr", "th", "td", "thead", "tfoot", "tbody",
    "noframe", "div", "form", "font", "p",
    "ul", "ol", "li", "dl", "dt", "dd",
    ]


class NestingError(HTMLParseError):
    """Exception raised when elements aren't properly nested.

    The offending close tag is available as ``self.endtag``.
    """

    def __init__(self, tagstack, endtag, position=(None, None)):
        '''Build the error message from the currently open tags.

        tagstack -- sequence of the names of the tags open when the
                    mismatched close tag was seen (may be empty)
        endtag   -- name of the close tag that could not be matched
        position -- passed through unchanged to HTMLParseError
        '''
        self.endtag = endtag
        if tagstack:
            if len(tagstack) == 1:
                msg = ('Open tag <%s> does not match close tag </%s>'
                       % (tagstack[0], endtag))
            else:
                msg = ('Open tags <%s> do not match close tag </%s>'
                       % ('>, <'.join(tagstack), endtag))
        else:
            msg = 'No tags are open to match </%s>' % endtag
        HTMLParseError.__init__(self, msg, position)


class EmptyTagError(NestingError):
    """Exception raised when empty elements have an end tag.

    The offending tag is available as ``self.tag`` and, for consistency
    with NestingError, also as ``self.endtag``.
    """

    def __init__(self, tag, position=(None, None)):
        '''Record the tag and build the error message.

        tag      -- name of the empty element that was given a close tag
        position -- passed through unchanged to HTMLParseError
        '''
        self.tag = tag
        # NestingError.__init__ is deliberately bypassed (the message is
        # different), so set the attribute it would have set; code that
        # catches NestingError can then always read .endtag.
        self.endtag = tag
        msg = 'Close tag </%s> should be removed' % tag
        HTMLParseError.__init__(self, msg, position)


# Sentinel meaning "no default supplied" in SimpleDOMNode.getattr(); a
# unique object is used so that None remains a usable default value.
_marker = []


class SimpleDOMNode:
    '''Simple class that represents a tag in a HTML document. The node may
    have contents which are represented as a sequence of tags or strings
    of text.

    node.name -- get the "name" attribute
    node[N]   -- get the Nth entry in the contents list
    len(node) -- number of sub-content objects

    The tag name, attribute dictionary and contents list are stored in the
    instance __dict__ under the keys '__name', '__attributes' and
    '__contents'.  They are not reachable through ordinary attribute
    access, which leaves normal attribute access free to look up the
    tag's attributes (see __getattr__).
    '''
    def __init__(self, name, attributes, contents):
        '''Store the tag name, attribute dictionary and contents list.

        The arguments are stored as given, not copied.
        '''
        self.__dict__['__name'] = name
        self.__dict__['__attributes'] = attributes
        self.__dict__['__contents'] = contents

    def getByName(self, name, r=None):
        '''Return all nodes of type "name" from the contents of this DOM
        using a depth-first search.

        "r" is the list that matches are appended to; it is used by the
        recursion and a new list is created when it is not supplied.
        '''
        if r is None:
            r = []
        for entry in self.getContents():
            if isinstance(entry, SimpleDOMNode):
                if entry.__dict__['__name'] == name:
                    r.append(entry)
                entry.getByName(name, r)
        return r

    def getById(self, name, id):
        '''Return the first node of type "name" whose "id" attribute
        equals "id", searching the contents of this DOM depth-first.

        Raises ValueError if there is no such node.
        '''
        for entry in self.getByName(name):
            if entry.hasattr('id') and entry.id == id:
                return entry
        raise ValueError('No %r with id %r' % (name, id))

    def getByNameFlat(self, name):
        '''Return all nodes of type "name" from the contents of this node.
        NON-RECURSIVE.
        '''
        r = []
        for entry in self.getContents():
            if isinstance(entry, SimpleDOMNode):
                if entry.__dict__['__name'] == name:
                    r.append(entry)
        return r

    def getPath(self, path):
        '''Walk down the tree following "path" and return the node reached.

        "path" is a sequence of (name, count) pairs.  Each step moves to
        the direct child node of type "name" that has "count" earlier
        siblings of the same type (so count 0 selects the first one).

        A step that finds no such child is skipped: the walk carries on
        from the current node with the next pair, and no error is raised.
        '''
        current = self
        for name, count in path:
            for entry in current.getContents():
                if isinstance(entry, SimpleDOMNode) and \
                        entry.__dict__['__name'] == name:
                    if not count:
                        current = entry
                        break
                    count -= 1
        return current

    def hasChildNodes(self):
        '''Determine if the Node has any content nodes (rather than just text).

        Returns 1 if it does and 0 otherwise.
        '''
        for entry in self.getContents():
            if isinstance(entry, SimpleDOMNode):
                return 1
        return 0

    def getContents(self):
        '''Return the list of contents (child nodes and text strings).'''
        return self.__dict__['__contents']

    def __getitem__(self, item):
        '''Index (or slice) the contents list.'''
        return self.getContents()[item]

    def hasattr(self, attr):
        '''Determine whether the tag has the attribute "attr".'''
        return attr in self.__dict__['__attributes']

    def getattr(self, attr, default=_marker):
        '''Return the value of the tag attribute "attr".

        If the attribute is missing, return "default" when one was given
        and raise AttributeError otherwise.
        '''
        attributes = self.__dict__['__attributes']
        if attr in attributes:
            return attributes[attr]
        if default is _marker:
            raise AttributeError(attr)
        return default

    def __getattr__(self, attr):
        '''Expose the tag's attributes as attributes of the node.

        Raises AttributeError when the tag has no such attribute.
        '''
        # Use .get() so that a lookup on an instance whose __init__ has not
        # run (e.g. while copy or pickle is rebuilding it) raises
        # AttributeError, as the attribute protocol requires, not KeyError.
        attributes = self.__dict__.get('__attributes', {})
        if attr in attributes:
            return attributes[attr]
        # Ordinary attribute access checks the instance __dict__ before it
        # ever calls __getattr__, so this only matters for direct calls.
        if attr in self.__dict__:
            return self.__dict__[attr]
        raise AttributeError(attr)

    def __len__(self):
        '''Return the number of entries in the contents list.'''
        return len(self.getContents())

    def getContentString(self):
        '''Return the str() of every content entry, concatenated.'''
        return ''.join([str(content) for content in self.getContents()])

    def __str__(self):
        '''Render the node, its attributes and its contents as markup.

        Attributes listed in BOOLEAN_HTML_ATTRS are written in minimized
        form; tags listed in EMPTY_HTML_TAGS get no close tag.  Attribute
        values are written as-is, without any escaping.
        '''
        name = self.__dict__['__name']
        attrs = []
        for key, value in self.__dict__['__attributes'].items():
            if key in BOOLEAN_HTML_ATTRS:
                attrs.append(key)
            else:
                attrs.append('%s="%s"' % (key, value))
        if attrs:
            s = '<%s %s>' % (name, ' '.join(attrs))
        else:
            s = '<%s>' % name
        s = s + self.getContentString()
        if name in EMPTY_HTML_TAGS:
            return s
        return s + '</%s>' % name

    def __repr__(self):
        '''Short description: tag name, attributes and number of contents.'''
        return '<SimpleDOMNode "%s" %s (%s elements)>' % (
            self.__dict__['__name'], self.__dict__['__attributes'],
            len(self.getContents()))

    def extractElements(self, path=(), include_submit=0, include_button=0):
        ''' Pull a form's elements out of the document given the path to the
            form.

            "path" is a sequence of (name, index) pairs; each step selects
            entry "index" of getByName(name) on the node reached so far.
            With an empty path this node itself is used as the form.

            For most elements, the returned dictionary has a key:value pair
            holding the input elements name and value.

            For radio, checkboxes and selects, the value is a dictionary
            holding:

              value or name: 'selected'  (note: not 'checked')

            where the value of the input/option is used but if not
            present then the name is used.

            Inputs of type "image" are always skipped.  Inputs of type
            "submit" and "button" are skipped unless include_submit /
            include_button is true; when included, those without a name
            (or named "submit" / "button") are stored under the keys
            "submit0", "submit1", ... / "button0", "button1", ...

            Inputs of type "file" are stored as Upload('').

            Any other input that has no "name" attribute raises
            AttributeError.
        '''
        form = self
        for name, element in path:
            form = form.getByName(name)[element]
        elements = {}
        submits = 0
        buttons = 0
        for field in form.getByName('input'):
            if not field.hasattr('type'):
                elements[field.name] = field.getattr('value', '')
            elif field.type == 'image':
                continue
            elif field.type == 'button' and not include_button:
                continue
            elif field.type == 'submit' and not include_submit:
                continue
            elif field.type == 'file':
                elements[field.name] = Upload('')
            elif field.type in ['checkbox', 'radio']:
                group = elements.setdefault(field.name, {})
                # A missing *or empty* value falls back to the control name.
                key = field.getattr('value', None) or field.name
                if field.hasattr('checked'):
                    group[key] = 'selected'
                else:
                    group[key] = ''
            elif field.type == 'submit':
                name = field.getattr('name', 'submit')
                if name == 'submit':
                    name = 'submit%s' % str(submits)
                    submits = submits + 1
                elements[name] = field.getattr('value', '')
            elif field.type == 'button':
                name = field.getattr('name', 'button')
                if name == 'button':
                    name = 'button%s' % str(buttons)
                    buttons = buttons + 1
                elements[name] = field.getattr('value', '')
            else:
                elements[field.name] = field.getattr('value', '')
        for textarea in form.getByName('textarea'):
            if len(textarea):
                elements[textarea.name] = textarea.getContentString()
            else:
                elements[textarea.name] = ''
        for select in form.getByName('select'):
            choices = elements[select.name] = {}
            selected = first = None
            for option in select.getByName('option'):
                if option.hasattr('value'):
                    key = option.value
                elif len(option) > 0:
                    key = option[0]
                else:
                    # no value and no content: nothing to key the option on
                    continue
                if first is None:
                    first = key
                if option.hasattr('selected'):
                    choices[key] = 'selected'
                    selected = 1
                else:
                    choices[key] = ''
            # When there is no size, or a size of one, and no option is
            # marked selected, mark the first option as selected.  The size
            # is compared against both 1 and '1' because a size read from
            # markup is a string, which never equals the integer 1.
            if ((not select.hasattr('size') or select.size in (1, '1'))
                    and selected is None and first is not None):
                choices[first] = 'selected'

        return elements


class SimpleDOMParser(HTMLParser):
    '''HTMLParser that builds a tree of SimpleDOMNode objects.

    Feed it markup with parseString() or parseFile() and then fetch the
    result with getDOM().
    '''
    def __init__(self, debug=0):
        '''Set up empty parser state.

        debug -- when true, the handlers print a trace of what they do
        '''
        HTMLParser.__init__(self)
        # names of the tags that are currently open, innermost last
        self.tagstack = []
        self.__debug = debug

        # DOM stuff
        # self.dom is the top-level contents list; self.content is the
        # contents list new entries are currently appended to, and
        # self.stack holds the enclosing contents lists.
        self.content = self.dom = []
        self.stack = []

    def parseFile(self, file):
        '''Read the file named "file" and parse its contents.'''
        f = open(file)
        try:
            # close the file even when the read fails
            data = f.read()
        finally:
            f.close()
        self.parseString(data)

    def parseString(self, data):
        '''Parse the markup in "data", then close any tags left open.'''
        self.feed(data)
        self.close()
        while self.tagstack:
            self.implied_endtag(self.tagstack[-1], 2)

    def getDOM(self):
        '''Return a node named "The Document" wrapping the parsed content.'''
        return SimpleDOMNode('The Document', {}, self.dom)

    # Overriding HTMLParser methods

    def handle_starttag(self, tag, attrs):
        '''Open a new node for "tag".

        Attribute names are lower-cased.  Tags in EMPTY_HTML_TAGS are
        closed again straight away.
        '''
        if self.__debug:
            print('\n>handle_starttag %s' % (tag,))
            print(self.tagstack)
        self.close_para_tags(tag)
        self.tagstack.append(tag)
        d = {}
        for k, v in attrs:
            d[k.lower()] = v
        self.emitStartElement(tag, d)
        if tag in EMPTY_HTML_TAGS:
            self.implied_endtag(tag, -1)

    def handle_startendtag(self, tag, attrs):
        '''Add a node without contents for a self-closed tag.

        Attribute names are lower-cased.
        '''
        if self.__debug:
            print('><handle_startendtag %s' % (tag,))
            print(self.tagstack)
        self.close_para_tags(tag)
        d = {}
        for k, v in attrs:
            d[k.lower()] = v
        self.emitStartElement(tag, d, isend=1)

    def handle_endtag(self, tag):
        '''Close the node for "tag", first closing any tags open inside it.

        Raises EmptyTagError for the close tag of an EMPTY_HTML_TAGS
        element, and NestingError (from close_enclosed_tags) if "tag" is
        not open.
        '''
        if self.__debug:
            print('<handle_endtag %s' % (tag,))
            print(self.tagstack)
        if tag in EMPTY_HTML_TAGS:
            # </img> etc. in the source is an error
            raise EmptyTagError(tag, self.getpos())
        self.close_enclosed_tags(tag)
        self.emitEndElement(tag)
        self.tagstack.pop()

    def close_para_tags(self, tag):
        '''Close the open tags that the start of "tag" implicitly ends.

        Nothing is closed for tags in EMPTY_HTML_TAGS.  For tags in
        BLOCK_CLOSING_TAG_MAP, and otherwise for paragraph- and block-level
        tags, the tag stack is scanned from the outermost tag inwards to
        find the index to close back to; every tag from that index up is
        then closed as an implied end tag.
        '''
        if tag in EMPTY_HTML_TAGS:
            return
        close_to = -1
        if tag in BLOCK_CLOSING_TAG_MAP:
            blocks_to_close = BLOCK_CLOSING_TAG_MAP[tag]
            for i, t in enumerate(self.tagstack):
                if t in blocks_to_close:
                    if close_to == -1:
                        close_to = i
                elif t in BLOCK_LEVEL_HTML_TAGS:
                    # a block-level tag further in resets the candidate
                    close_to = -1
        elif tag in PARA_LEVEL_HTML_TAGS + BLOCK_LEVEL_HTML_TAGS:
            for i, t in enumerate(self.tagstack):
                if t in BLOCK_LEVEL_HTML_TAGS:
                    # a block-level tag further in resets the candidate
                    close_to = -1
                elif t in PARA_LEVEL_HTML_TAGS:
                    if close_to == -1:
                        close_to = i
        if close_to >= 0:
            while len(self.tagstack) > close_to:
                self.implied_endtag(self.tagstack[-1], 1)

    def close_enclosed_tags(self, tag):
        '''Close every tag opened since "tag", leaving "tag" innermost.

        Raises NestingError if "tag" is not open at all.
        '''
        if tag not in self.tagstack:
            raise NestingError(self.tagstack, tag, self.getpos())
        while tag != self.tagstack[-1]:
            self.implied_endtag(self.tagstack[-1], 1)
        assert self.tagstack[-1] == tag

    def implied_endtag(self, tag, implied):
        '''Close the innermost open tag, which must be "tag".

        "implied" is one of -1, 1 or 2; this module passes -1 from
        handle_starttag (empty element), 1 from close_para_tags and
        close_enclosed_tags, and 2 from parseString (end of input).
        '''
        if self.__debug:
            print('<implied_endtag %s %s' % (tag, implied))
            print(self.tagstack)
        assert tag == self.tagstack[-1]
        assert implied in (-1, 1, 2)
        isend = (implied < 0)
        self.emitEndElement(tag, isend=isend, implied=implied)
        self.tagstack.pop()

    def handle_charref(self, name):
        '''Keep a character reference in the content as "&#name;" text.'''
        self.emitText("&#%s;" % name)

    def handle_entityref(self, name):
        '''Keep an entity reference in the content as "&name;" text.'''
        self.emitText("&%s;" % name)

    def handle_data(self, data):
        '''Add character data to the current content.'''
        self.emitText(data)

    def handle_comment(self, data):
        '''Keep a comment in the content as "<!--...-->" text.'''
        self.emitText("<!--%s-->" % data)

    def handle_decl(self, data):
        '''Keep a declaration in the content as "<!...>" text.'''
        self.emitText("<!%s>" % data)

    def handle_pi(self, data):
        '''Keep a processing instruction in the content as "<?...>" text.'''
        self.emitText("<?%s>" % data)

    def emitStartTag(self, name, attrlist, isend=0):
        '''Append a new node to the current content.

        With "isend" true the node gets an empty contents list of its own
        and the current content is unchanged; otherwise the new node's
        contents list becomes the current content.
        '''
        if isend:
            if self.__debug:
                print('*** content')
            self.content.append(SimpleDOMNode(name, attrlist, []))
        else:
            # generate a new scope and push the current one on the stack
            if self.__debug:
                print('*** push')
            newcontent = []
            self.stack.append(self.content)
            self.content.append(SimpleDOMNode(name, attrlist, newcontent))
            self.content = newcontent

    def emitEndTag(self, name):
        '''Make the enclosing contents list current again.'''
        if self.__debug:
            print('*** pop')
        self.content = self.stack.pop()

    def emitText(self, text):
        '''Append "text" to the current content.'''
        self.content.append(text)

    def emitStartElement(self, name, attrlist, isend=0):
        '''Emit the start of an element and, if "isend", its end as well.'''
        # Handle the simple, common case
        self.emitStartTag(name, attrlist, isend)
        if isend:
            self.emitEndElement(name, isend)

    def emitEndElement(self, name, isend=0, implied=0):
        '''Emit the end of an element.

        Nothing is done when "isend" is true and "implied" is false: that
        is the self-closed case, for which emitStartTag opened no scope.
        '''
        if not isend or implied:
            self.emitEndTag(name)


if __name__ == '__main__':
    # Parse the file named on the command line, or /tmp/test.html when no
    # argument is given, and pretty-print the resulting document node.
    import pprint

    if len(sys.argv) > 1:
        source = sys.argv[1]
    else:
        source = '/tmp/test.html'
    tester = SimpleDOMParser(debug=0)
    tester.parseFile(source)
    dom = tester.getDOM()
#    html = dom.getByNameFlat('html')[0]
#    body = html.getByNameFlat('body')[0]
#    table = body.getByNameFlat('table')[0]
#    tr = table.getByNameFlat('tr')[1]
#    td = tr.getByNameFlat('td')[2]
#    print td
    pprint.pprint(dom)