#!/usr/local/bin/python3.8
# vim:fileencoding=utf-8


__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'

import re

from lxml.etree import Element as LxmlElement
import html5_parser

from calibre import xml_replace_entities
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
from calibre.utils.cleantext import clean_xml_chars

XHTML_NS = 'http://www.w3.org/1999/xhtml'


def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True,
                linenumber_attribute=None, replace_entities=True, fix_newlines=True):
    '''
    Parse ``raw`` with html5_parser and return the root element.

    :param raw: Markup as text or bytes. Bytes are decoded with ``decoder``
        when one is given, otherwise with xml_to_unicode().
    :param decoder: Optional callable mapping bytes to text.
    :param log: Accepted for signature compatibility; not used here.
    :param discard_namespaces: When True the root is expected to be a plain
        ``html`` tag, otherwise ``html`` in the XHTML namespace with no prefix.
    :param line_numbers: Accepted for signature compatibility; not used here.
    :param linenumber_attribute: Passed to html5_parser as ``line_number_attr``.
    :param replace_entities: Run xml_replace_entities() over the text first.
    :param fix_newlines: Convert ``\\r\\n`` and ``\\r`` line endings to ``\\n``.
    :raises ValueError: If the parsed root is not the expected ``html`` element.
    '''
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    if replace_entities:
        raw = xml_replace_entities(raw)
    if fix_newlines:
        raw = raw.replace('\r\n', '\n').replace('\r', '\n')
    raw = clean_xml_chars(raw)
    root = html5_parser.parse(
        raw, maybe_xhtml=not discard_namespaces, line_number_attr=linenumber_attribute,
        keep_doctype=False, sanitize_names=True)
    if discard_namespaces:
        root_is_valid = root.tag == 'html'
    else:
        root_is_valid = root.tag == '{%s}%s' % (XHTML_NS, 'html') and not root.prefix
    if not root_is_valid:
        raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix))
    return root


def handle_private_entities(data):
    '''
    Expand entities declared in the preamble that precedes the opening html tag.

    If the text before the first ``<html`` (or ``<HTML``) contains a
    ``<!DOCTYPE`` with ``<!ENTITY name value>`` declarations, that preamble is
    replaced by an equal number of newlines (so line numbers are preserved)
    and every ``&name;`` reference in the remaining text is replaced by the
    declared value. Otherwise ``data`` is returned unchanged.
    '''
    idx = data.find('<html')
    if idx == -1:
        idx = data.find('<HTML')
    if idx == -1:
        return data
    pre = data[:idx]
    if '<!DOCTYPE' not in pre:
        return data

    # Collect user defined entities
    user_entities = {}
    for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
        val = match.group(2)
        quote = val[:1]
        # Entity values may be delimited by either double or single quotes
        if quote in ('"', "'") and val.endswith(quote):
            val = val[1:-1]
        user_entities[match.group(1)] = val
    if not user_entities:
        return data

    data = ('\n' * pre.count('\n')) + data[idx:]
    # Entity names can contain regex metacharacters such as '.', so they must
    # be escaped; otherwise the pattern could match text that is not a key of
    # user_entities (KeyError in the callback) or fail to compile at all.
    pat = re.compile(r'&(%s);' % '|'.join(map(re.escape, user_entities)))
    return pat.sub(lambda m: user_entities[m.group(1)], data)


def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None,
          replace_entities=True, force_html5_parse=False):
    '''
    Parse ``raw`` as strict XML, falling back to the HTML 5 parser on failure.

    :param raw: Markup as text or bytes. Bytes are decoded with ``decoder``
        when one is given, otherwise with xml_to_unicode().
    :param decoder: Optional callable mapping bytes to text.
    :param log: Optional logger; its ``exception()`` method is called when the
        XML parse fails. Also forwarded to parse_html5().
    :param line_numbers: Forwarded to parse_html5().
    :param linenumber_attribute: If set, name of the attribute that receives
        each element's source line number.
    :param replace_entities: Run xml_replace_entities() over the text first.
    :param force_html5_parse: Skip the XML attempt and use parse_html5() directly.
    :return: The root element of the parsed tree.
    '''
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    raw = handle_private_entities(raw)
    if replace_entities:
        raw = xml_replace_entities(raw).replace('\0', '')  # Handle &#0;
    raw = raw.replace('\r\n', '\n').replace('\r', '\n')

    # Remove any preamble before the opening html tag as it can cause problems,
    # especially doctypes, preserve the original linenumbers by inserting
    # newlines at the start. Only the first 2048 characters are searched.
    match = re.search(r'<\s*html', raw[:2048], flags=re.I)
    if match is not None:
        newlines = raw.count('\n', 0, match.start())
        raw = ('\n' * newlines) + raw[match.start():]

    raw = strip_encoding_declarations(raw, limit=10*1024, preserve_newlines=True)
    if force_html5_parse:
        return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute,
                           replace_entities=False, fix_newlines=False)
    try:
        ans = safe_xml_fromstring(raw, recover=False)
        if ans.tag != '{%s}html' % XHTML_NS:
            raise ValueError('Root tag is not <html> in the XHTML namespace')
        if linenumber_attribute:
            for elem in ans.iter(LxmlElement):
                if elem.sourceline is not None:
                    elem.set(linenumber_attribute, str(elem.sourceline))
        return ans
    except Exception:
        # Deliberate best effort: anything that is not well-formed XHTML is
        # re-parsed as tag soup instead of failing.
        if log is not None:
            log.exception('Failed to parse as XML, parsing as tag soup')
        return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute,
                           replace_entities=False, fix_newlines=False)


if __name__ == '__main__':
    from lxml import etree
    root = parse_html5('\n<html><head><title>a\n</title><p b=1 c=2 a=0> \n<b>b<svg ass="wipe" viewbox="0">', discard_namespaces=False)
    print(etree.tostring(root, encoding='utf-8'))
    print()