1# -*- coding: utf-8 -*- 2from __future__ import absolute_import 3 4import unittest 5import sys 6 7from .common_imports import StringIO, etree, HelperTestCase, _str, _bytes, _chr, needs_libxml 8 9try: 10 unicode 11except NameError: 12 unicode = str 13 14ascii_uni = _bytes('a').decode('utf8') 15 16klingon = _bytes("\\uF8D2").decode("unicode_escape") # not valid for XML names 17 18invalid_tag = _bytes("test").decode('utf8') + klingon 19 20uni = _bytes('\\xc3\\u0680\\u3120').decode("unicode_escape") # some non-ASCII characters 21 22uxml = _bytes("<test><title>test \\xc3\\xa1\\u3120</title><h1>page \\xc3\\xa1\\u3120 title</h1></test>" 23 ).decode("unicode_escape") 24 25 26class UnicodeTestCase(HelperTestCase): 27 def test__str(self): 28 # test the testing framework, namely _str from common_imports 29 self.assertEqual(_str('\x10'), _str('\u0010')) 30 self.assertEqual(_str('\x10'), _str('\U00000010')) 31 self.assertEqual(_str('\u1234'), _str('\U00001234')) 32 33 def test_unicode_xml(self): 34 tree = etree.XML('<p>%s</p>' % uni) 35 self.assertEqual(uni, tree.text) 36 37 @needs_libxml(2, 9, 5) # not sure, at least 2.9.4 fails 38 def test_wide_unicode_xml(self): 39 if sys.maxunicode < 1114111: 40 return # skip test 41 tree = etree.XML(_bytes('<p>\\U00026007</p>').decode('unicode_escape')) 42 self.assertEqual(1, len(tree.text)) 43 self.assertEqual(_bytes('\\U00026007').decode('unicode_escape'), 44 tree.text) 45 46 def test_unicode_xml_broken(self): 47 uxml = ('<?xml version="1.0" encoding="UTF-8"?>' + 48 '<p>%s</p>' % uni) 49 self.assertRaises(ValueError, etree.XML, uxml) 50 51 def test_unicode_tag(self): 52 el = etree.Element(uni) 53 self.assertEqual(uni, el.tag) 54 55 def test_unicode_tag_invalid(self): 56 # sadly, Klingon is not well-formed 57 self.assertRaises(ValueError, etree.Element, invalid_tag) 58 59 def test_unicode_nstag(self): 60 tag = "{http://abc/}%s" % uni 61 el = etree.Element(tag) 62 self.assertEqual(tag, el.tag) 63 64 def test_unicode_ns_invalid(self): 65 # namespace URIs must conform to RFC 3986 66 tag = "{http://%s/}abc" % uni 67 self.assertRaises(ValueError, etree.Element, tag) 68 69 def test_unicode_nstag_invalid(self): 70 # sadly, Klingon is not well-formed 71 tag = "{http://abc/}%s" % invalid_tag 72 self.assertRaises(ValueError, etree.Element, tag) 73 74 def test_unicode_qname(self): 75 qname = etree.QName(uni, uni) 76 tag = "{%s}%s" % (uni, uni) 77 self.assertEqual(qname.text, tag) 78 self.assertEqual(unicode(qname), tag) 79 80 def test_unicode_qname_invalid(self): 81 self.assertRaises(ValueError, etree.QName, invalid_tag) 82 83 def test_unicode_attr(self): 84 el = etree.Element('foo', {'bar': uni}) 85 self.assertEqual(uni, el.attrib['bar']) 86 87 def test_unicode_comment(self): 88 el = etree.Comment(uni) 89 self.assertEqual(uni, el.text) 90 91 def test_unicode_repr1(self): 92 x = etree.Element(_str('å')) 93 # must not raise UnicodeEncodeError 94 repr(x) 95 96 def test_unicode_repr2(self): 97 x = etree.Comment(_str('ö')) 98 repr(x) 99 100 def test_unicode_repr3(self): 101 x = etree.ProcessingInstruction(_str('Å'), _str('\u0131')) 102 repr(x) 103 104 def test_unicode_repr4(self): 105 x = etree.Entity(_str('ä')) 106 repr(x) 107 108 def test_unicode_text(self): 109 e = etree.Element('e') 110 111 def settext(text): 112 e.text = text 113 114 self.assertRaises(ValueError, settext, _str('ab\ufffe')) 115 self.assertRaises(ValueError, settext, _str('ö\ffff')) 116 self.assertRaises(ValueError, settext, _str('\u0123\ud800')) 117 self.assertRaises(ValueError, settext, _str('x\ud8ff')) 118 self.assertRaises(ValueError, settext, _str('\U00010000\udfff')) 119 self.assertRaises(ValueError, settext, _str('abd\x00def')) 120 # should not Raise 121 settext(_str('\ud7ff\ue000\U00010000\U0010FFFFäöas')) 122 123 for char_val in range(0xD800, 0xDFFF+1): 124 self.assertRaises(ValueError, settext, 'abc' + _chr(char_val)) 125 self.assertRaises(ValueError, settext, _chr(char_val)) 126 self.assertRaises(ValueError, settext, _chr(char_val) + 'abc') 127 128 self.assertRaises(ValueError, settext, _bytes('\xe4')) 129 self.assertRaises(ValueError, settext, _bytes('\x80')) 130 self.assertRaises(ValueError, settext, _bytes('\xff')) 131 self.assertRaises(ValueError, settext, _bytes('\x08')) 132 self.assertRaises(ValueError, settext, _bytes('\x19')) 133 self.assertRaises(ValueError, settext, _bytes('\x20\x00')) 134 # should not Raise 135 settext(_bytes('\x09\x0A\x0D\x20\x60\x7f')) 136 137 def test_uniname(self): 138 Element = etree.Element 139 def el(name): 140 return Element(name) 141 142 self.assertRaises(ValueError, el, ':') 143 self.assertRaises(ValueError, el, '0a') 144 self.assertRaises(ValueError, el, _str('\u203f')) 145 # should not Raise 146 el(_str('\u0132')) 147 148 149 150 def test_unicode_parse_stringio(self): 151 el = etree.parse(StringIO('<p>%s</p>' % uni)).getroot() 152 self.assertEqual(uni, el.text) 153 154## def test_parse_fileobject_unicode(self): 155## # parse unicode from unnamed file object (not supported by ElementTree) 156## f = SillyFileLike(uxml) 157## root = etree.parse(f).getroot() 158## self.assertEqual(unicode(etree.tostring(root, 'UTF-8'), 'UTF-8'), 159## uxml) 160 161 162class EncodingsTestCase(HelperTestCase): 163 def test_illegal_utf8(self): 164 data = _bytes('<test>\x80\x80\x80</test>', encoding='iso8859-1') 165 self.assertRaises(etree.XMLSyntaxError, etree.fromstring, data) 166 167 def test_illegal_utf8_recover(self): 168 data = _bytes('<test>\x80\x80\x80</test>', encoding='iso8859-1') 169 parser = etree.XMLParser(recover=True) 170 self.assertRaises(etree.XMLSyntaxError, etree.fromstring, data, parser) 171 172 def _test_encoding(self, encoding, xml_encoding_name=None): 173 foo = """<?xml version='1.0' encoding='%s'?>\n<tag attrib='123'></tag>""" % ( 174 xml_encoding_name or encoding) 175 root = etree.fromstring(foo.encode(encoding)) 176 self.assertEqual('tag', root.tag) 177 178 doc_encoding = root.getroottree().docinfo.encoding 179 self.assertTrue( 180 doc_encoding.lower().rstrip('lbe'), 181 (xml_encoding_name or encoding).lower().rstrip('lbe')) 182 183 def test_utf8_fromstring(self): 184 self._test_encoding('utf-8') 185 186 def test_utf8sig_fromstring(self): 187 self._test_encoding('utf_8_sig', 'utf-8') 188 189 def test_utf16_fromstring(self): 190 self._test_encoding('utf-16') 191 192 def test_utf16LE_fromstring(self): 193 self._test_encoding('utf-16le', 'utf-16') 194 195 def test_utf16BE_fromstring(self): 196 self._test_encoding('utf-16be', 'utf-16') 197 198 def test_utf32_fromstring(self): 199 self._test_encoding('utf-32', 'utf-32') 200 201 def test_utf32LE_fromstring(self): 202 self._test_encoding('utf-32le', 'utf-32') 203 204 def test_utf32BE_fromstring(self): 205 self._test_encoding('utf-32be', 'utf-32') 206 207 208def test_suite(): 209 suite = unittest.TestSuite() 210 suite.addTests([unittest.makeSuite(UnicodeTestCase)]) 211 suite.addTests([unittest.makeSuite(EncodingsTestCase)]) 212 return suite 213