# Copyright 2012 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Adapter between Gumbo and BeautifulSoup.

This parses an HTML document and gives back a BeautifulSoup object, which you
can then manipulate like a normal BeautifulSoup parse tree.
"""

__author__ = 'jdtang@google.com (Jonathan Tang)'

import BeautifulSoup

import gumboc


def _utf8(text):
    """Decode a UTF-8 byte string, replacing any undecodable bytes."""
    return text.decode('utf-8', 'replace')


def _add_source_info(obj, original_text, start_pos, end_pos):
    """Attach source-location attributes to a converted node.

    Args:
      obj: The object to annotate (attributes are set directly on it).
      original_text: The node's original source text; stored as str() of it.
      start_pos: Position object exposing .line, .column and .offset.
      end_pos: Position object with the same fields, or a false value to skip
        the end_* attributes.
    """
    obj.original = str(original_text)
    obj.line = start_pos.line
    obj.col = start_pos.column
    obj.offset = start_pos.offset
    if end_pos:
        obj.end_line = end_pos.line
        obj.end_col = end_pos.column
        obj.end_offset = end_pos.offset


def _convert_attrs(attrs):
    """Return the attributes as a list of decoded (name, value) tuples."""
    # TODO(jdtang): Ideally attributes would pass along their positions as
    # well, but I can't extend the built in str objects with new attributes.
    # Maybe work around this with a subclass in some way...
    return [(_utf8(attr.name), _utf8(attr.value)) for attr in attrs]


def _add_document(soup, element):
    """Handler for document nodes; does nothing and returns None."""
    # Currently ignored, since there's no real place for this in the
    # BeautifulSoup API.
    pass


def _add_element(soup, element):
    """Convert an element node (and, recursively, its children) to a Tag.

    Args:
      soup: The BeautifulSoup object passed to the Tag constructor.
      element: The element contents; this function reads its tag_name,
        attributes, children, original_tag, original_end_tag, start_pos and
        end_pos.

    Returns:
      A BeautifulSoup.Tag carrying the source-location attributes set by
      _add_source_info plus an `original_end_tag` string.
    """
    # TODO(jdtang): Expose next/previous in gumbo so they can be passed along
    # to BeautifulSoup.
    tag = BeautifulSoup.Tag(
        soup, _utf8(element.tag_name), _convert_attrs(element.attributes))
    for child in element.children:
        tag.append(_add_node(soup, child))
    _add_source_info(
        tag, element.original_tag, element.start_pos, element.end_pos)
    tag.original_end_tag = str(element.original_end_tag)
    return tag


def _add_text(cls):
    """Build a handler that wraps a text-like node in `cls`.

    Args:
      cls: Callable invoked with the decoded node text to build the result
        (the handler table passes BeautifulSoup string classes).

    Returns:
      A function (soup, element) -> cls instance annotated with the start
      position of the node (no end position is recorded).
    """
    def add_text_internal(soup, element):
        text = cls(_utf8(element.text))
        _add_source_info(text, element.original_text, element.start_pos, None)
        return text
    return add_text_internal


# Dispatch table indexed by node.type.value (see _add_node). The order has to
# match the node-type numbering used by gumboc -- presumably document,
# element, text, CDATA, comment, whitespace, template; confirm against gumboc.
_HANDLERS = [
    _add_document,
    _add_element,
    _add_text(BeautifulSoup.NavigableString),
    _add_text(BeautifulSoup.CData),
    _add_text(BeautifulSoup.Comment),
    _add_text(BeautifulSoup.NavigableString),
    _add_element,
]


def _add_node(soup, node):
    """Convert a single node by dispatching on its type to _HANDLERS."""
    return _HANDLERS[node.type.value](soup, node.contents)


def _add_next_prev_pointers(soup):
    """Set the .next / .previous pointers on every node under `soup`.

    All nodes reachable through .contents (including `soup` itself) are
    collected, ordered by their .offset attribute, and chained together: the
    first node gets previous = None, the last gets next = None, and every
    consecutive pair is linked in both directions.
    """
    def _traverse(node):
        # .findAll requires the .next pointer, which is what we're trying to
        # add when we call this, and so we manually supply a generator to
        # yield the nodes in DOM order.
        yield node
        try:
            children = node.contents
        except AttributeError:
            # Not an element.
            return
        for child in children:
            for descendant in _traverse(child):
                yield descendant

    nodes = sorted(_traverse(soup), key=lambda node: node.offset)
    if nodes:
        nodes[0].previous = None
        nodes[-1].next = None
    # Link every adjacent pair, including the final one; iterating over
    # nodes[1:-1] would leave the last node unlinked from its predecessor.
    for prev_node, node in zip(nodes, nodes[1:]):
        prev_node.next = node
        node.previous = prev_node


def parse(text, **kwargs):
    """Parse `text` with Gumbo and return the result as a BeautifulSoup tree.

    Args:
      text: The HTML source handed to gumboc.parse.
      **kwargs: Extra keyword arguments forwarded unchanged to gumboc.parse.

    Returns:
      A BeautifulSoup.BeautifulSoup object whose single appended child is the
      converted root element, with next/previous pointers filled in.
    """
    # The conversion happens inside the `with` block, while the gumboc parse
    # output is still open.
    with gumboc.parse(text, **kwargs) as output:
        soup = BeautifulSoup.BeautifulSoup()
        soup.append(_add_node(soup, output.contents.root.contents))
        _add_next_prev_pointers(soup)
        return soup