1"""
2Test the implementation of the PEP 540: the UTF-8 Mode.
3"""
4
5import locale
6import sys
7import textwrap
8import unittest
9from test import support
10from test.support.script_helper import assert_python_ok, assert_python_failure
11
12
# Platform switches and locale names shared by the tests in this module.
MS_WINDOWS = sys.platform == 'win32'
VXWORKS = sys.platform == 'vxworks'
POSIX_LOCALES = ('C', 'POSIX')
16
class UTF8ModeTests(unittest.TestCase):
    """Exercise the UTF-8 Mode by running child Python processes.

    Each test launches an interpreter through get_output() and compares
    the text printed by the child with the expected value.
    """

    # Environment variables handed to every child process; keyword
    # arguments given to get_output() are layered on top of these.
    DEFAULT_ENV = {
        'PYTHONUTF8': '',
        'PYTHONLEGACYWINDOWSFSENCODING': '',
        'PYTHONCOERCECLOCALE': '0',
    }

    def posix_locale(self):
        """Return True if the current LC_CTYPE locale is in POSIX_LOCALES."""
        return locale.setlocale(locale.LC_CTYPE, None) in POSIX_LOCALES

    def get_output(self, *args, failure=False, **kw):
        """Run a child interpreter and return one of its outputs as text.

        *args* are the command line arguments.  *kw* are environment
        variables which extend or override DEFAULT_ENV.

        If *failure* is true, the run goes through assert_python_failure()
        and item 2 of its result is used; otherwise it goes through
        assert_python_ok() and item 1 is used (script_helper documents the
        result as a (return code, stdout, stderr) tuple).  The selected
        bytes are decoded and trailing "\\n" and "\\r" characters are removed.
        """
        env = {**self.DEFAULT_ENV, **kw}
        if failure:
            raw = assert_python_failure(*args, **env)[2]
        else:
            raw = assert_python_ok(*args, **env)[1]
        return raw.decode().rstrip("\n\r")

    @unittest.skipIf(MS_WINDOWS, 'Windows has no POSIX locale')
    def test_posix_locale(self):
        # With LC_ALL set to a POSIX locale, the child is expected to
        # report sys.flags.utf8_mode == 1.
        code = 'import sys; print(sys.flags.utf8_mode)'

        for name in POSIX_LOCALES:
            with self.subTest(LC_ALL=name):
                self.assertEqual(self.get_output('-c', code, LC_ALL=name),
                                 '1')

    def test_xoption(self):
        code = 'import sys; print(sys.flags.utf8_mode)'

        # (-X option value, expected flag).  "utf8=1" is an undocumented
        # but accepted syntax.
        cases = (
            ('utf8', '1'),
            ('utf8=1', '1'),
            ('utf8=0', '0'),
        )
        for xoption, flag in cases:
            self.assertEqual(self.get_output('-X', xoption, '-c', code),
                             flag)

        if MS_WINDOWS:
            # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 Mode
            # and has the priority over -X utf8
            legacy = self.get_output('-X', 'utf8', '-c', code,
                                     PYTHONLEGACYWINDOWSFSENCODING='1')
            self.assertEqual(legacy, '0')

    def test_env_var(self):
        code = 'import sys; print(sys.flags.utf8_mode)'

        # The flag printed by the child mirrors the PYTHONUTF8 value.
        for value in ('1', '0'):
            self.assertEqual(self.get_output('-c', code, PYTHONUTF8=value),
                             value)

        # -X utf8 has the priority over PYTHONUTF8
        overridden = self.get_output('-X', 'utf8=0', '-c', code,
                                     PYTHONUTF8='1')
        self.assertEqual(overridden, '0')

        if MS_WINDOWS:
            # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 mode
            # and has the priority over PYTHONUTF8
            legacy = self.get_output('-X', 'utf8', '-c', code,
                                     PYTHONUTF8='1',
                                     PYTHONLEGACYWINDOWSFSENCODING='1')
            self.assertEqual(legacy, '0')

        # The -E check cannot run under the POSIX locale, since the POSIX
        # locale enables the UTF-8 mode on its own.
        if not self.posix_locale():
            # PYTHONUTF8 should be ignored if -E is used
            ignored = self.get_output('-E', '-c', code, PYTHONUTF8='1')
            self.assertEqual(ignored, '0')

        # invalid mode: the child must fail and explain why
        message = self.get_output('-c', code, PYTHONUTF8='xxx',
                                  failure=True)
        self.assertIn('invalid PYTHONUTF8 environment variable value',
                      message.rstrip())

    def test_filesystemencoding(self):
        code = textwrap.dedent('''
            import sys
            print("{}/{}".format(sys.getfilesystemencoding(),
                                 sys.getfilesystemencodeerrors()))
        ''')

        # The expected error handler differs between Windows and the rest.
        expected = ('utf-8/surrogatepass' if MS_WINDOWS
                    else 'utf-8/surrogateescape')
        self.assertEqual(self.get_output('-X', 'utf8', '-c', code),
                         expected)

        if MS_WINDOWS:
            # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 mode
            # and has the priority over -X utf8 and PYTHONUTF8
            legacy = self.get_output('-X', 'utf8', '-c', code,
                                     PYTHONUTF8='strict',
                                     PYTHONLEGACYWINDOWSFSENCODING='1')
            self.assertEqual(legacy, 'mbcs/replace')

    def test_stdio(self):
        code = textwrap.dedent('''
            import sys
            print(f"stdin: {sys.stdin.encoding}/{sys.stdin.errors}")
            print(f"stdout: {sys.stdout.encoding}/{sys.stdout.errors}")
            print(f"stderr: {sys.stderr.encoding}/{sys.stderr.errors}")
        ''')

        def stdio_report(ioencoding):
            # Lines printed by a "-X utf8" child run with the given
            # PYTHONIOENCODING value.
            text = self.get_output('-X', 'utf8', '-c', code,
                                   PYTHONIOENCODING=ioencoding)
            return text.splitlines()

        self.assertEqual(stdio_report(''),
                         ['stdin: utf-8/surrogateescape',
                          'stdout: utf-8/surrogateescape',
                          'stderr: utf-8/backslashreplace'])

        # PYTHONIOENCODING has the priority over PYTHONUTF8
        self.assertEqual(stdio_report('latin1'),
                         ['stdin: iso8859-1/strict',
                          'stdout: iso8859-1/strict',
                          'stderr: iso8859-1/backslashreplace'])

        # Only the error handler is given: the encoding stays utf-8.
        self.assertEqual(stdio_report(':namereplace'),
                         ['stdin: utf-8/namereplace',
                          'stdout: utf-8/namereplace',
                          'stderr: utf-8/backslashreplace'])

    def test_io(self):
        # open() without explicit encoding/errors, run with PYTHONUTF8=1;
        # this very file serves as the file to open.
        code = textwrap.dedent('''
            import sys
            filename = sys.argv[1]
            with open(filename) as fp:
                print(f"{fp.encoding}/{fp.errors}")
        ''')

        report = self.get_output('-c', code, __file__, PYTHONUTF8='1')
        self.assertEqual(report, 'UTF-8/strict')

    def _check_io_encoding(self, module, encoding=None, errors=None):
        """Check open() from *module* when run with PYTHONUTF8=1.

        The child opens this file with the given *encoding* and/or
        *errors* (each is omitted from the call when false) and prints the
        resulting "encoding/errors" pair.  An omitted value is expected to
        be reported as 'UTF-8' or 'strict' respectively.
        """
        # Keyword arguments which are explicitly passed to open().
        requested = (('encoding', encoding), ('errors', errors))
        open_kwargs = ', '.join(f'{name}={value!r}'
                                for name, value in requested if value)

        template = textwrap.dedent('''
            import sys
            from %s import open
            filename = sys.argv[1]
            with open(filename, %s) as fp:
                print(f"{fp.encoding}/{fp.errors}")
        ''')
        report = self.get_output('-c', template % (module, open_kwargs),
                                 __file__, PYTHONUTF8='1')

        expected = f"{encoding or 'UTF-8'}/{errors or 'strict'}"
        self.assertEqual(report, expected)

    def check_io_encoding(self, module):
        """Run _check_io_encoding() for *module* with three combinations.

        The combinations are: encoding only, errors only, and both.
        """
        combinations = (
            {'encoding': 'latin1'},
            {'errors': 'namereplace'},
            {'encoding': 'latin1', 'errors': 'namereplace'},
        )
        for open_kwargs in combinations:
            self._check_io_encoding(module, **open_kwargs)

    def test_io_encoding(self):
        self.check_io_encoding('io')

    def test_pyio_encoding(self):
        self.check_io_encoding('_pyio')

    def test_locale_getpreferredencoding(self):
        code = 'import locale; print(locale.getpreferredencoding(False), locale.getpreferredencoding(True))'
        expected = 'UTF-8 UTF-8'

        self.assertEqual(self.get_output('-X', 'utf8', '-c', code), expected)

        # Same expectation when LC_ALL selects a POSIX locale.
        for name in POSIX_LOCALES:
            with self.subTest(LC_ALL=name):
                report = self.get_output('-X', 'utf8', '-c', code,
                                         LC_ALL=name)
                self.assertEqual(report, expected)

    @unittest.skipIf(MS_WINDOWS, 'test specific to Unix')
    def test_cmd_line(self):
        # A non-ASCII command line argument, passed to the child as bytes.
        arg = 'h\xe9\u20ac'.encode('utf-8')
        arg_utf8 = arg.decode('utf-8')
        arg_ascii = arg.decode('ascii', 'surrogateescape')
        code = 'import locale, sys; print("%s:%s" % (locale.getpreferredencoding(), ascii(sys.argv[1:])))'

        def check(utf8_opt, expected, **kw):
            # Only the part after the first ':' (the ascii() form of the
            # child's sys.argv[1:]) is compared; the whole output is used
            # as the failure message.
            out = self.get_output('-X', utf8_opt, '-c', code, arg, **kw)
            _, _, argv_text = out.partition(':')
            self.assertEqual(argv_text.rstrip(), ascii(expected), out)

        check('utf8', [arg_utf8])
        for name in POSIX_LOCALES:
            with self.subTest(LC_ALL=name):
                check('utf8', [arg_utf8], LC_ALL=name)

        # Expected decoding of the argument under a POSIX locale when the
        # UTF-8 Mode is explicitly disabled; it depends on the platform.
        if sys.platform == 'darwin' or support.is_android or VXWORKS:
            c_arg = arg_utf8
        elif sys.platform.startswith("aix"):
            c_arg = arg.decode('iso-8859-1')
        else:
            c_arg = arg_ascii
        for name in POSIX_LOCALES:
            with self.subTest(LC_ALL=name):
                check('utf8=0', [c_arg], LC_ALL=name)

    def test_optim_level(self):
        # CPython: Py_Main() must not increment Py_OptimizeFlag twice when
        # -X utf8 requires parsing the configuration twice (when the
        # encoding changes after reading the configuration, the
        # configuration is read again with the new encoding).
        code = 'import sys; print(sys.flags.optimize)'
        for option, level in (('-O', '1'), ('-OO', '2')):
            report = self.get_output('-X', 'utf8', option, '-c', code)
            self.assertEqual(report, level)

        # The same holds for the flag set by -E.
        code = 'import sys; print(sys.flags.ignore_environment)'
        report = self.get_output('-X', 'utf8', '-E', '-c', code)
        self.assertEqual(report, '1')
252
253
# Run the tests of this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()
256