1# Copyright 2012 Google Inc. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14#
15
16"""Adapter between Gumbo and BeautifulSoup.
17
18This parses an HTML document and gives back a BeautifulSoup object, which you
19can then manipulate like a normal BeautifulSoup parse tree.
20"""
21
22__author__ = 'jdtang@google.com (Jonathan Tang)'
23
24import BeautifulSoup
25
26import gumboc
27
28
29def _utf8(text):
30  return text.decode('utf-8', 'replace')
31
32
33def _add_source_info(obj, original_text, start_pos, end_pos):
34  obj.original = str(original_text)
35  obj.line = start_pos.line
36  obj.col = start_pos.column
37  obj.offset = start_pos.offset
38  if end_pos:
39    obj.end_line = end_pos.line
40    obj.end_col = end_pos.column
41    obj.end_offset = end_pos.offset
42
43
44def _convert_attrs(attrs):
45  # TODO(jdtang): Ideally attributes would pass along their positions as well,
46  # but I can't extend the built in str objects with new attributes.  Maybe work
47  # around this with a subclass in some way...
48  return [(_utf8(attr.name), _utf8(attr.value)) for attr in attrs]
49
50
51def _add_document(soup, element):
52  # Currently ignored, since there's no real place for this in the BeautifulSoup
53  # API.
54  pass
55
56
def _add_element(soup, element):
  """Builds a BeautifulSoup.Tag, including its subtree, from a gumbo element.

  Args:
    soup: The BeautifulSoup object passed to the Tag constructor and to every
      child conversion.
    element: Gumbo element data exposing tag_name, attributes, children,
      original_tag, start_pos, end_pos and original_end_tag.

  Returns:
    The new Tag with its converted children appended, annotated with source
    position info and an original_end_tag string.
  """
  # TODO(jdtang): Expose next/previous in gumbo so they can be passed along to
  # BeautifulSoup.
  name = _utf8(element.tag_name)
  attrs = _convert_attrs(element.attributes)
  tag = BeautifulSoup.Tag(soup, name, attrs)

  # Children are converted recursively through the _add_node dispatcher.
  for child in element.children:
    converted = _add_node(soup, child)
    tag.append(converted)

  _add_source_info(
      tag, element.original_tag, element.start_pos, element.end_pos)
  tag.original_end_tag = str(element.original_end_tag)
  return tag
68
69
70def _add_text(cls):
71  def add_text_internal(soup, element):
72    text = cls(_utf8(element.text))
73    _add_source_info(text, element.original_text, element.start_pos, None)
74    return text
75  return add_text_internal
76
77
# Dispatch table used by _add_node: the handler at index i converts nodes whose
# node.type.value == i, so the ORDER of entries is significant.
# NOTE(review): presumably this mirrors gumbo's node-type enum (document,
# element, text, CDATA, comment, whitespace, template) -- confirm against
# gumboc before reordering or extending.
_HANDLERS = (
    # Indices 0-1: document and element handlers.
    [_add_document, _add_element]
    # Indices 2-5: one separate text handler per string class, in this order.
    + list(map(_add_text, (
        BeautifulSoup.NavigableString,
        BeautifulSoup.CData,
        BeautifulSoup.Comment,
        BeautifulSoup.NavigableString,
    )))
    # Index 6: handled exactly like an element.
    + [_add_element]
)
87
88
def _add_node(soup, node):
  """Converts one gumbo node by dispatching on its type.

  Args:
    soup: The BeautifulSoup object handed through to the handler.
    node: Gumbo node exposing type.value (an index into _HANDLERS) and
      contents (the payload given to the handler).

  Returns:
    Whatever the selected handler returns.
  """
  handler = _HANDLERS[node.type.value]
  return handler(soup, node.contents)
91
92
93def _add_next_prev_pointers(soup):
94  def _traverse(node):
95    # .findAll requires the .next pointer, which is what we're trying to add
96    # when we call this, and so we manually supply a generator to yield the
97    # nodes in DOM order.
98    yield node
99    try:
100      for child in node.contents:
101        for descendant in _traverse(child):
102          yield descendant
103    except AttributeError:
104      # Not an element.
105      return
106
107  nodes = sorted(_traverse(soup), key=lambda node: node.offset)
108  if nodes:
109    nodes[0].previous = None
110    nodes[-1].next = None
111  for i, node in enumerate(nodes[1:-1], 1):
112    nodes[i-1].next = node
113    node.previous = nodes[i-1]
114
115
def parse(text, **kwargs):
  """Parses HTML with gumbo and returns the result as a BeautifulSoup tree.

  Args:
    text: The HTML to parse; passed straight through to gumboc.parse.
    **kwargs: Extra options forwarded unchanged to gumboc.parse.

  Returns:
    A BeautifulSoup.BeautifulSoup object with the converted root node appended
    and next/previous pointers filled in.
  """
  with gumboc.parse(text, **kwargs) as output:
    # NOTE(review): the whole conversion stays inside the with block;
    # presumably the parse output is only valid until the context exits --
    # confirm against gumboc.
    soup = BeautifulSoup.BeautifulSoup()
    root = _add_node(soup, output.contents.root.contents)
    soup.append(root)
    _add_next_prev_pointers(soup)
    return soup
122