1#
2# Copyright (c) 2003 Richard Jones (http://mechanicalcat.net/richard)
3# Copyright (c) 2002 ekit.com Inc (http://www.ekit-inc.com/)
4# Copyright (c) 2001 Bizar Software Pty Ltd (http://www.bizarsoftware.com.au/)
5#
6# See the README for full license details.
7#
8# HISTORY:
9# This code is heavily based on the TAL parsing code from the Zope Page
10# Templates effort at www.zope.org. No copyright or license accompanied
11# that code.
12#
13# $Id: SimpleDOM.py,v 1.7 2004/08/26 02:50:19 richard Exp $
14
15'''A Simple DOM parser
16
17Simple usage:
18>>> import SimpleDOM
19>>> parser = SimpleDOM.SimpleDOMParser()
20>>> parser.parseString("""<html><head><title>My Document</title></head>
21... <body>
22...  <p>This is a paragraph!!!</p>
23...  <p>This is another para!!</p>
24... </body>
25... </html>""")
26>>> dom = parser.getDOM()
27>>> dom.getByName('p')
28[<SimpleDOMNode "p" {} (1 elements)>, <SimpleDOMNode "p" {} (1 elements)>]
29>>> dom.getByName('p')[0][0]
30'This is a paragraph!!!'
31>>> dom.getByName('title')[0][0]
32'My Document'
33'''
34
35import sys, string
36
37# NOTE this is using a modified HTMLParser
38from HTMLParser import HTMLParser, HTMLParseError
39from utility import Upload
40
# Boolean attributes in HTML that may be given in minimized form
# (e.g. <img ismap> rather than <img ismap="">).
# From http://www.w3.org/TR/xhtml1/#guidelines (C.10)
BOOLEAN_HTML_ATTRS = [
    "compact",
    "nowrap",
    "ismap",
    "declare",
    "noshade",
    "checked",
    "disabled",
    "readonly",
    "multiple",
    "selected",
    "noresize",
    "defer",
]

# HTML tags with an empty content model; these are rendered without a
# closing tag and must not be given one in the source.
# From http://www.w3.org/TR/xhtml1/#dtds
EMPTY_HTML_TAGS = [
    "base",
    "meta",
    "link",
    "hr",
    "br",
    "param",
    "img",
    "area",
    "input",
    "col",
    "basefont",
    "isindex",
    "frame",
]

# HTML elements that close open paragraph-level elements and are
# themselves paragraph-level.
PARA_LEVEL_HTML_TAGS = [
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "p",
]

# Maps a start tag to the open tags it implicitly closes when it is
# encountered (e.g. a new <li> closes a still-open <li>).
BLOCK_CLOSING_TAG_MAP = {
    "tr": ("tr", "td", "th"),
    "td": ("td", "th"),
    "th": ("td", "th"),
    "li": ("li",),
    "dd": ("dd", "dt"),
    "dt": ("dd", "dt"),
    "option": ("option",),
}

# HTML tags that denote larger sections than paragraphs.  This stays a
# list because it is concatenated with PARA_LEVEL_HTML_TAGS elsewhere.
BLOCK_LEVEL_HTML_TAGS = [
    "blockquote",
    "table",
    "tr",
    "th",
    "td",
    "thead",
    "tfoot",
    "tbody",
    "noframe",
    "div",
    "form",
    "font",
    "p",
    "ul",
    "ol",
    "li",
    "dl",
    "dt",
    "dd",
]
80
81
class NestingError(HTMLParseError):
    """Exception raised when elements aren't properly nested.

    The offending close tag is kept on the instance as ``endtag``.
    """

    def __init__(self, tagstack, endtag, position=(None, None)):
        """Build the error message from the open tags and the close tag.

        tagstack -- sequence of currently open tag names (may be empty)
        endtag   -- name of the close tag that could not be matched
        position -- position passed straight through to HTMLParseError
        """
        self.endtag = endtag
        if not tagstack:
            msg = 'No tags are open to match </%s>' % endtag
        elif len(tagstack) == 1:
            msg = ('Open tag <%s> does not match close tag </%s>'
                   % (tagstack[0], endtag))
        else:
            # str.join replaces the deprecated string.join() function,
            # producing the same text.
            msg = ('Open tags <%s> do not match close tag </%s>'
                   % ('>, <'.join(tagstack), endtag))
        HTMLParseError.__init__(self, msg, position)
97
class EmptyTagError(NestingError):
    """Exception raised when empty elements have an end tag.

    The offending tag name is kept on the instance as ``tag``.
    """

    def __init__(self, tag, position=(None, None)):
        self.tag = tag
        # Deliberately bypasses NestingError.__init__: there is no tag
        # stack to report, only the superfluous close tag.
        HTMLParseError.__init__(
            self, 'Close tag </%s> should be removed' % tag, position)
105
# Unique sentinel so SimpleDOMNode.getattr() can tell "no default given"
# apart from an explicit default such as None.
_marker = []


class SimpleDOMNode:
    '''Simple class that represents a tag in a HTML document. The node may
       have contents which are represented as a sequence of tags or strings
       of text.

       node.name  -- get the "name" attribute
       node[N]    -- get the Nth entry in the contents list
       len(node)  -- number of sub-content objects

       HTML attributes are exposed as Python attributes through
       __getattr__; the node's own state is therefore stored in the
       instance dictionary under the keys '__name', '__attributes' and
       '__contents' so that it does not hide attributes such as "name".
    '''

    def __init__(self, name, attributes, contents):
        '''Create a node.

           name       -- the tag name
           attributes -- dictionary of the tag's attributes
           contents   -- list of child nodes and/or text strings
        '''
        state = self.__dict__
        state['__name'] = name
        state['__attributes'] = attributes
        state['__contents'] = contents

    def getByName(self, name, r=None):
        '''Return all nodes of type "name" from the contents of this DOM
           using a depth-first search.

           "r" is the result list being accumulated during recursion;
           callers normally leave it out.
        '''
        if r is None:
            r = []
        for entry in self.getContents():
            if not isinstance(entry, SimpleDOMNode):
                continue
            if entry.__dict__['__name'] == name:
                r.append(entry)
            entry.getByName(name, r)
        return r

    def getById(self, name, id):
        '''Return the first node of type "name" (depth-first order) whose
           "id" attribute equals the given id.

           Raises ValueError if there is no such node.
        '''
        for entry in self.getByName(name):
            if entry.hasattr('id') and entry.id == id:
                return entry
        raise ValueError('No %r with id %r' % (name, id))

    def getByNameFlat(self, name):
        '''Return all nodes of type "name" from the contents of this node.
           NON-RECURSIVE.
        '''
        return [entry for entry in self.getContents()
                if isinstance(entry, SimpleDOMNode)
                and entry.__dict__['__name'] == name]

    def getPath(self, path):
        '''Walk down from this node along "path" and return the node
           reached.

           "path" is a sequence of (name, count) pairs. Each step moves to
           the direct child of type "name" that has "count" same-named
           siblings before it (so count 0 is the first one). A step with no
           such child is skipped silently and the walk carries on from the
           node reached so far.
        '''
        current = self
        for name, count in path:
            for entry in current.getContents():
                if not isinstance(entry, SimpleDOMNode):
                    continue
                if entry.__dict__['__name'] != name:
                    continue
                if not count:
                    current = entry
                    break
                count -= 1
        return current

    def hasChildNodes(self):
        '''Determine if the Node has any content nodes (rather than just text).

           Returns 1 if so, 0 otherwise.
        '''
        for entry in self.getContents():
            if isinstance(entry, SimpleDOMNode):
                return 1
        return 0

    def getContents(self):
        '''Return the list of child nodes and text strings.'''
        return self.__dict__['__contents']

    def __getitem__(self, item):
        return self.getContents()[item]

    def hasattr(self, attr):
        '''Determine whether the tag has the HTML attribute "attr".'''
        return attr in self.__dict__['__attributes']

    def getattr(self, attr, default=_marker):
        '''Return the value of the HTML attribute "attr".

           If the attribute is missing return "default" when one was
           supplied, otherwise raise AttributeError.
        '''
        attributes = self.__dict__['__attributes']
        if attr in attributes:
            return attributes[attr]
        if default is _marker:
            raise AttributeError(attr)
        return default

    def __getattr__(self, attr):
        # Only called when normal lookup fails, i.e. for HTML attributes.
        # .get() keeps this an AttributeError (not a KeyError) on an
        # instance whose __init__ has not run, e.g. while being copied.
        attributes = self.__dict__.get('__attributes')
        if attributes is not None and attr in attributes:
            return attributes[attr]
        if attr in self.__dict__:
            return self.__dict__[attr]
        raise AttributeError(attr)

    def __len__(self):
        return len(self.getContents())

    def getContentString(self):
        '''Return the str() of every content entry, concatenated.'''
        return ''.join([str(content) for content in self.getContents()])

    def __str__(self):
        '''Render the node and its contents back to markup.

           Attribute values are written out as-is (no escaping). Boolean
           attributes are written in minimized form and tags with an empty
           content model get no closing tag.
        '''
        name = self.__dict__['__name']
        rendered = []
        for key, value in self.__dict__['__attributes'].items():
            if key in BOOLEAN_HTML_ATTRS:
                rendered.append(key)
            else:
                rendered.append('%s="%s"' % (key, value))
        if rendered:
            s = '<%s %s>' % (name, ' '.join(rendered))
        else:
            s = '<%s>' % name
        s = s + self.getContentString()
        if name in EMPTY_HTML_TAGS:
            return s
        return s + '</%s>' % name

    def __repr__(self):
        return '<SimpleDOMNode "%s" %s (%s elements)>' % (
            self.__dict__['__name'], self.__dict__['__attributes'],
            len(self.getContents()))

    def extractElements(self, path=(), include_submit=0, include_button=0):
        ''' Pull a form's elements out of the document given the path to the
            form.

            path is a sequence of (name, index) pairs; each step selects
            getByName(name)[index] below the node reached so far. With an
            empty path this node itself is treated as the form.

            For most elements, the returned dictionary has a key:value pair
            holding the input elements name and value.

            For radio, checkboxes and selects, the value is a dictionary
            holding:

              value or name: 'selected'    (note: not 'checked')

            where the value of the input/option is used but if not
            present (or empty, for radio/checkbox) then the name is used.
            Options without a value are keyed by their first content entry.

            Image inputs are always skipped. Submit and button inputs are
            skipped unless include_submit / include_button is true; when
            included, those that have no name (or are literally named
            'submit' / 'button') are stored as 'submit0', 'submit1', ...
            / 'button0', 'button1', ... File inputs are stored as
            Upload('').

            A select with no "size" attribute, or a size of 1, and no
            option marked selected has its first option selected.
        '''
        form = self
        for name, element in path:
            form = form.getByName(name)[element]

        # Which anonymous control types the caller asked for, and how many
        # of each have been numbered so far.
        wanted = {'submit': include_submit, 'button': include_button}
        counters = {'submit': 0, 'button': 0}

        elements = {}
        for field in form.getByName('input'):
            if not field.hasattr('type'):
                elements[field.name] = field.getattr('value', '')
                continue

            kind = field.type
            if kind == 'image':
                continue
            if kind in ('submit', 'button'):
                if not wanted[kind]:
                    continue
                label = field.getattr('name', kind)
                if label == kind:
                    label = '%s%s' % (kind, counters[kind])
                    counters[kind] = counters[kind] + 1
                elements[label] = field.getattr('value', '')
            elif kind == 'file':
                elements[field.name] = Upload('')
            elif kind in ('checkbox', 'radio'):
                choices = elements.setdefault(field.name, {})
                # A missing *or empty* value falls back to the name.
                key = field.getattr('value', None) or field.name
                if field.hasattr('checked'):
                    choices[key] = 'selected'
                else:
                    choices[key] = ''
            else:
                elements[field.name] = field.getattr('value', '')

        for textarea in form.getByName('textarea'):
            # An empty textarea yields '' from getContentString().
            elements[textarea.name] = textarea.getContentString()

        for select in form.getByName('select'):
            options = select.getByName('option')
            d = elements[select.name] = {}
            selected = first = None
            for option in options:
                if option.hasattr('value'):
                    key = option.value
                elif len(option) > 0:
                    key = option[0]
                else:
                    continue
                if first is None:
                    first = key
                if option.hasattr('selected'):
                    d[key] = 'selected'
                    selected = 1
                else:
                    d[key] = ''
            # Parsed attribute values are strings, so comparing the raw
            # value with the integer 1 never matched size="1".
            single_row = (not select.hasattr('size')
                          or str(select.size).strip() == '1')
            if single_row and selected is None and first is not None:
                d[first] = 'selected'

        return elements
312
class SimpleDOMParser(HTMLParser):
    '''HTMLParser subclass that builds a tree of SimpleDOMNode objects.

       Feed it markup with parseString() or parseFile() and fetch the
       result with getDOM(). Tags left open are closed implicitly, either
       when a following tag requires it (see close_para_tags) or at the end
       of the input. Character/entity references, comments, declarations
       and processing instructions are kept as text in their source form.
    '''

    def __init__(self, debug=0):
        '''Create a parser; a true "debug" prints a trace of the parse.'''
        HTMLParser.__init__(self)
        # names of the tags that are currently open, outermost first
        self.tagstack = []
        self.__debug = debug

        #  DOM stuff
        # "dom" is the top-level contents list; "content" is the contents
        # list of the innermost open node and "stack" holds the contents
        # lists of its ancestors.
        self.content = self.dom = []
        self.stack = []

    def parseFile(self, file):
        '''Parse the contents of the file named "file".'''
        f = open(file)
        try:
            # make sure the handle is released even if the read fails
            data = f.read()
        finally:
            f.close()
        self.parseString(data)

    def parseString(self, data):
        '''Parse "data", then implicitly close any tags still open.'''
        self.feed(data)
        self.close()
        while self.tagstack:
            self.implied_endtag(self.tagstack[-1], 2)

    def getDOM(self):
        '''Return a root node (named 'The Document') wrapping the parsed
           contents.
        '''
        return SimpleDOMNode('The Document', {}, self.dom)

    # Overriding HTMLParser methods

    def handle_starttag(self, tag, attrs):
        if self.__debug:
            print('\n>handle_starttag %s' % (tag,))
            print(self.tagstack)
        self.close_para_tags(tag)
        self.tagstack.append(tag)
        d = {}
        for k, v in attrs:
            d[k.lower()] = v
        self.emitStartElement(tag, d)
        if tag in EMPTY_HTML_TAGS:
            # tags with no content model are closed straight away
            self.implied_endtag(tag, -1)

    def handle_startendtag(self, tag, attrs):
        if self.__debug:
            print('><handle_startendtag %s' % (tag,))
            print(self.tagstack)
        self.close_para_tags(tag)
        d = {}
        for k, v in attrs:
            d[k.lower()] = v
        self.emitStartElement(tag, d, isend=1)

    def handle_endtag(self, tag):
        if self.__debug:
            print('<handle_endtag %s' % (tag,))
            print(self.tagstack)
        if tag in EMPTY_HTML_TAGS:
            # </img> etc. in the source is an error
            raise EmptyTagError(tag, self.getpos())
        self.close_enclosed_tags(tag)
        self.emitEndElement(tag)
        self.tagstack.pop()

    def close_para_tags(self, tag):
        '''Implicitly close open tags that the start tag "tag" terminates.

           Tags with an empty content model close nothing. Otherwise the
           tag stack is scanned from the outermost tag inwards for the
           first open tag that "tag" terminates; finding a block-level tag
           that is not itself terminated restarts the search below it. If a
           tag to close is found, it and everything opened after it are
           closed.
        '''
        if tag in EMPTY_HTML_TAGS:
            return
        close_to = -1
        if tag in BLOCK_CLOSING_TAG_MAP:
            blocks_to_close = BLOCK_CLOSING_TAG_MAP[tag]
            for i, t in enumerate(self.tagstack):
                if t in blocks_to_close:
                    if close_to == -1:
                        close_to = i
                elif t in BLOCK_LEVEL_HTML_TAGS:
                    close_to = -1
        elif tag in PARA_LEVEL_HTML_TAGS + BLOCK_LEVEL_HTML_TAGS:
            for i, t in enumerate(self.tagstack):
                # The block-level test comes first, so a tag present in
                # both lists (such as "p") is treated as block-level here.
                if t in BLOCK_LEVEL_HTML_TAGS:
                    close_to = -1
                elif t in PARA_LEVEL_HTML_TAGS:
                    if close_to == -1:
                        close_to = i
        if close_to >= 0:
            while len(self.tagstack) > close_to:
                self.implied_endtag(self.tagstack[-1], 1)

    def close_enclosed_tags(self, tag):
        '''Implicitly close every tag opened since "tag" was opened.

           Raises NestingError if "tag" is not open at all.
        '''
        if tag not in self.tagstack:
            raise NestingError(self.tagstack, tag, self.getpos())
        while tag != self.tagstack[-1]:
            self.implied_endtag(self.tagstack[-1], 1)
        assert self.tagstack[-1] == tag

    def implied_endtag(self, tag, implied):
        '''Close the innermost open tag without a close tag in the source.

           "implied" says why: -1 for a tag with an empty content model
           closed right after its start tag, 1 for a tag closed because of
           a following tag, 2 for a tag still open at the end of the input.
        '''
        if self.__debug:
            print('<implied_endtag %s %s' % (tag, implied))
            print(self.tagstack)
        assert tag == self.tagstack[-1]
        assert implied in (-1, 1, 2)
        isend = (implied < 0)
        self.emitEndElement(tag, isend=isend, implied=implied)
        self.tagstack.pop()

    def handle_charref(self, name):
        self.emitText("&#%s;" % name)

    def handle_entityref(self, name):
        self.emitText("&%s;" % name)

    def handle_data(self, data):
        self.emitText(data)

    def handle_comment(self, data):
        self.emitText("<!--%s-->" % data)

    def handle_decl(self, data):
        self.emitText("<!%s>" % data)

    def handle_pi(self, data):
        self.emitText("<?%s>" % data)

    def emitStartTag(self, name, attrlist, isend=0):
        '''Add a new node to the current contents.

           With a true "isend" the node is complete and simply appended;
           otherwise its (empty) contents list becomes the current one
           until emitEndTag() is called.
        '''
        if isend:
            if self.__debug:
                print('*** content')
            self.content.append(SimpleDOMNode(name, attrlist, []))
        else:
            # generate a new scope and push the current one on the stack
            if self.__debug:
                print('*** push')
            newcontent = []
            self.stack.append(self.content)
            self.content.append(SimpleDOMNode(name, attrlist, newcontent))
            self.content = newcontent

    def emitEndTag(self, name):
        '''Return to the contents list of the enclosing node.'''
        if self.__debug:
            print('*** pop')
        self.content = self.stack.pop()

    def emitText(self, text):
        '''Append a text string to the current contents.'''
        self.content.append(text)

    def emitStartElement(self, name, attrlist, isend=0):
        # Handle the simple, common case
        self.emitStartTag(name, attrlist, isend)
        if isend:
            self.emitEndElement(name, isend)

    def emitEndElement(self, name, isend=0, implied=0):
        # A self-closed element (isend without implied) never pushed a
        # scope in emitStartTag(), so there is nothing to pop for it.
        if not isend or implied:
            self.emitEndTag(name)
461
462
if __name__ == '__main__':
    # Ad-hoc manual check: parse a scratch file and dump the resulting tree.
    import pprint

    tester = SimpleDOMParser(debug=0)
    tester.parseFile('/tmp/test.html')
    dom = tester.getDOM()
    # Example of drilling down to a single table cell:
    #    html = dom.getByNameFlat('html')[0]
    #    body = html.getByNameFlat('body')[0]
    #    table = body.getByNameFlat('table')[0]
    #    tr = table.getByNameFlat('tr')[1]
    #    td = tr.getByNameFlat('td')[2]
    #    print td
    pprint.pprint(dom)
474
475