1# -*- coding: utf-8 -*-
2from __future__ import absolute_import
3
4import unittest
5import sys
6
7from .common_imports import StringIO, etree, HelperTestCase, _str, _bytes, _chr, needs_libxml
8
9try:
10    unicode
11except NameError:
12    unicode = str
13
14ascii_uni = _bytes('a').decode('utf8')
15
16klingon = _bytes("\\uF8D2").decode("unicode_escape") # not valid for XML names
17
18invalid_tag = _bytes("test").decode('utf8') + klingon
19
20uni = _bytes('\\xc3\\u0680\\u3120').decode("unicode_escape") # some non-ASCII characters
21
22uxml = _bytes("<test><title>test \\xc3\\xa1\\u3120</title><h1>page \\xc3\\xa1\\u3120 title</h1></test>"
23              ).decode("unicode_escape")
24
25
class UnicodeTestCase(HelperTestCase):
    """Tests for unicode tag names, text, attributes and parser input in etree."""

    def test__str(self):
        # test the testing framework, namely _str from common_imports:
        # the \x, \u and \U spellings of a character must compare equal
        self.assertEqual(_str('\x10'), _str('\u0010'))
        self.assertEqual(_str('\x10'), _str('\U00000010'))
        self.assertEqual(_str('\u1234'), _str('\U00001234'))

    def test_unicode_xml(self):
        tree = etree.XML('<p>%s</p>' % uni)
        self.assertEqual(uni, tree.text)

    @needs_libxml(2, 9, 5)  # not sure, at least 2.9.4 fails
    def test_wide_unicode_xml(self):
        if sys.maxunicode < 1114111:
            # Narrow unicode build: report the test as skipped instead of
            # letting it silently count as a pass.
            self.skipTest("requires a wide unicode build of Python")
        wide_char = _bytes('\\U00026007').decode('unicode_escape')
        tree = etree.XML(_bytes('<p>\\U00026007</p>').decode('unicode_escape'))
        self.assertEqual(1, len(tree.text))
        self.assertEqual(wide_char, tree.text)

    def test_unicode_xml_broken(self):
        # a unicode string that carries an XML encoding declaration is rejected
        declared_xml = ('<?xml version="1.0" encoding="UTF-8"?>' +
                        '<p>%s</p>' % uni)
        self.assertRaises(ValueError, etree.XML, declared_xml)

    def test_unicode_tag(self):
        el = etree.Element(uni)
        self.assertEqual(uni, el.tag)

    def test_unicode_tag_invalid(self):
        # sadly, Klingon is not well-formed
        self.assertRaises(ValueError, etree.Element, invalid_tag)

    def test_unicode_nstag(self):
        tag = "{http://abc/}%s" % uni
        el = etree.Element(tag)
        self.assertEqual(tag, el.tag)

    def test_unicode_ns_invalid(self):
        # namespace URIs must conform to RFC 3986
        tag = "{http://%s/}abc" % uni
        self.assertRaises(ValueError, etree.Element, tag)

    def test_unicode_nstag_invalid(self):
        # sadly, Klingon is not well-formed
        tag = "{http://abc/}%s" % invalid_tag
        self.assertRaises(ValueError, etree.Element, tag)

    def test_unicode_qname(self):
        qname = etree.QName(uni, uni)
        tag = "{%s}%s" % (uni, uni)
        self.assertEqual(qname.text, tag)
        self.assertEqual(unicode(qname), tag)

    def test_unicode_qname_invalid(self):
        self.assertRaises(ValueError, etree.QName, invalid_tag)

    def test_unicode_attr(self):
        el = etree.Element('foo', {'bar': uni})
        self.assertEqual(uni, el.attrib['bar'])

    def test_unicode_comment(self):
        el = etree.Comment(uni)
        self.assertEqual(uni, el.text)

    # The repr tests below only check that repr() of a node with a
    # non-ASCII name/content must not raise UnicodeEncodeError.

    def test_unicode_repr1(self):
        x = etree.Element(_str('å'))
        # must not raise UnicodeEncodeError
        repr(x)

    def test_unicode_repr2(self):
        x = etree.Comment(_str('ö'))
        repr(x)

    def test_unicode_repr3(self):
        x = etree.ProcessingInstruction(_str('Å'), _str('\u0131'))
        repr(x)

    def test_unicode_repr4(self):
        x = etree.Entity(_str('ä'))
        repr(x)

    def test_unicode_text(self):
        e = etree.Element('e')

        def settext(text):
            # attribute assignment wrapped in a callable for assertRaises
            e.text = text

        self.assertRaises(ValueError, settext, _str('ab\ufffe'))
        # U+FFFF (previously misspelled as '\ffff', i.e. form feed + "fff")
        self.assertRaises(ValueError, settext, _str('ö\uffff'))
        self.assertRaises(ValueError, settext, _str('\u0123\ud800'))
        self.assertRaises(ValueError, settext, _str('x\ud8ff'))
        self.assertRaises(ValueError, settext, _str('\U00010000\udfff'))
        self.assertRaises(ValueError, settext, _str('abd\x00def'))
        # should not Raise
        settext(_str('\ud7ff\ue000\U00010000\U0010FFFFäöas'))

        # every code point in the surrogate range must be rejected,
        # whether it appears at the end, alone, or at the start of the text
        for char_val in range(0xD800, 0xDFFF+1):
            self.assertRaises(ValueError, settext, 'abc' + _chr(char_val))
            self.assertRaises(ValueError, settext, _chr(char_val))
            self.assertRaises(ValueError, settext, _chr(char_val) + 'abc')

        # byte strings
        self.assertRaises(ValueError, settext, _bytes('\xe4'))
        self.assertRaises(ValueError, settext, _bytes('\x80'))
        self.assertRaises(ValueError, settext, _bytes('\xff'))
        self.assertRaises(ValueError, settext, _bytes('\x08'))
        self.assertRaises(ValueError, settext, _bytes('\x19'))
        self.assertRaises(ValueError, settext, _bytes('\x20\x00'))
        # should not Raise
        settext(_bytes('\x09\x0A\x0D\x20\x60\x7f'))

    def test_uniname(self):
        el = etree.Element

        self.assertRaises(ValueError, el, ':')
        self.assertRaises(ValueError, el, '0a')
        self.assertRaises(ValueError, el, _str('\u203f'))
        # should not Raise
        el(_str('\u0132'))

    def test_unicode_parse_stringio(self):
        el = etree.parse(StringIO('<p>%s</p>' % uni)).getroot()
        self.assertEqual(uni, el.text)
153
154##     def test_parse_fileobject_unicode(self):
155##         # parse unicode from unnamed file object (not supported by ElementTree)
156##         f = SillyFileLike(uxml)
157##         root = etree.parse(f).getroot()
158##         self.assertEqual(unicode(etree.tostring(root, 'UTF-8'), 'UTF-8'),
159##                           uxml)
160
161
162class EncodingsTestCase(HelperTestCase):
163    def test_illegal_utf8(self):
164        data = _bytes('<test>\x80\x80\x80</test>', encoding='iso8859-1')
165        self.assertRaises(etree.XMLSyntaxError, etree.fromstring, data)
166
167    def test_illegal_utf8_recover(self):
168        data = _bytes('<test>\x80\x80\x80</test>', encoding='iso8859-1')
169        parser = etree.XMLParser(recover=True)
170        self.assertRaises(etree.XMLSyntaxError, etree.fromstring, data, parser)
171
172    def _test_encoding(self, encoding, xml_encoding_name=None):
173        foo = """<?xml version='1.0' encoding='%s'?>\n<tag attrib='123'></tag>""" % (
174            xml_encoding_name or encoding)
175        root = etree.fromstring(foo.encode(encoding))
176        self.assertEqual('tag', root.tag)
177
178        doc_encoding = root.getroottree().docinfo.encoding
179        self.assertTrue(
180            doc_encoding.lower().rstrip('lbe'),
181            (xml_encoding_name or encoding).lower().rstrip('lbe'))
182
183    def test_utf8_fromstring(self):
184        self._test_encoding('utf-8')
185
186    def test_utf8sig_fromstring(self):
187        self._test_encoding('utf_8_sig', 'utf-8')
188
189    def test_utf16_fromstring(self):
190        self._test_encoding('utf-16')
191
192    def test_utf16LE_fromstring(self):
193        self._test_encoding('utf-16le', 'utf-16')
194
195    def test_utf16BE_fromstring(self):
196        self._test_encoding('utf-16be', 'utf-16')
197
198    def test_utf32_fromstring(self):
199        self._test_encoding('utf-32', 'utf-32')
200
201    def test_utf32LE_fromstring(self):
202        self._test_encoding('utf-32le', 'utf-32')
203
204    def test_utf32BE_fromstring(self):
205        self._test_encoding('utf-32be', 'utf-32')
206
207
def test_suite():
    """Return a suite holding all tests of the test case classes in this module."""
    # unittest.makeSuite() is deprecated since Python 3.11 and removed in
    # 3.13; the default loader collects the same "test*" methods.
    load_tests_from = unittest.defaultTestLoader.loadTestsFromTestCase
    suite = unittest.TestSuite()
    suite.addTests([load_tests_from(UnicodeTestCase)])
    suite.addTests([load_tests_from(EncodingsTestCase)])
    return suite
213