1# Tests the attempted automatic coercion of the C locale to a UTF-8 locale
2
3import locale
4import os
5import shutil
6import subprocess
7import sys
8import sysconfig
9import unittest
10from collections import namedtuple
11
12import test.support
13from test.support.script_helper import (
14    run_python_until_end,
15    interpreter_requires_environment,
16)
17
# Locale names that are expected to behave as the "C" locale
EXPECTED_C_LOCALE_EQUIVALENTS = ["C", "invalid.ascii"]

# Default encodings expected in the C locale, for the filesystem encoding
# and for the standard streams respectively
EXPECTED_C_LOCALE_FS_ENCODING = "ascii"
EXPECTED_C_LOCALE_STREAM_ENCODING = "ascii"

# Whether coercion is expected when no locale is specified at all
EXPECT_COERCION_IN_DEFAULT_LOCALE = True

TARGET_LOCALES = ["C.UTF-8", "C.utf8", "UTF-8"]

# Platform dependent overrides. The branches are mutually exclusive, so at
# most one of them adjusts the defaults set above.
if sys.platform == "cygwin":
    # Cygwin defaults to using C.UTF-8
    # TODO: Work out a robust dynamic test for this that doesn't rely on
    #       CPython's own locale handling machinery
    EXPECT_COERCION_IN_DEFAULT_LOCALE = False
elif sys.platform == "darwin":
    # FS encoding is UTF-8 on macOS
    EXPECTED_C_LOCALE_FS_ENCODING = "utf-8"
elif sys.platform.startswith("aix"):
    # AIX uses iso8859-1 in the C locale, other *nix platforms use ASCII
    EXPECTED_C_LOCALE_FS_ENCODING = "iso8859-1"
    EXPECTED_C_LOCALE_STREAM_ENCODING = "iso8859-1"
elif sys.platform.startswith("linux"):
    if not test.support.is_android:
        # Linux distros typically alias the POSIX locale directly to the C
        # locale.
        # TODO: Once https://bugs.python.org/issue30672 is addressed, we'll be
        #       able to check this case unconditionally
        EXPECTED_C_LOCALE_EQUIVALENTS.append("POSIX")
    else:
        # Android defaults to using UTF-8 for all system interfaces
        EXPECTED_C_LOCALE_FS_ENCODING = "utf-8"
        EXPECTED_C_LOCALE_STREAM_ENCODING = "utf-8"
55
56# Note that the above expectations are still wrong in some cases, such as:
57# * Windows when PYTHONLEGACYWINDOWSFSENCODING is set
58# * Any platform other than AIX that uses latin-1 in the C locale
59# * Any Linux distro where POSIX isn't a simple alias for the C locale
60# * Any Linux distro where the default locale is something other than "C"
61#
62# Options for dealing with this:
63# * Don't set the PY_COERCE_C_LOCALE preprocessor definition on
64#   such platforms (e.g. it isn't set on Windows)
65# * Fix the test expectations to match the actual platform behaviour
66
# In order to get the warning messages to match up as expected, the candidate
# order here must match the target locale order in Python/pylifecycle.c
# Candidate target locales; setUpModule() probes them in this order
_C_UTF8_LOCALES = ("C.UTF-8", "C.utf8", "UTF-8")

# Locale alias lists can't be checked reliably across platforms, so the
# only way to find out which of the candidates work is to try each one
# with locale.setlocale(). That is done in a subprocess from setUpModule()
# below, so the locale of the test runner itself is never altered.
#
# Outside the platforms where the lookup is expected to always succeed,
# and provided the locale module exposes the required attributes, the
# probe also checks that `locale.nl_langinfo(locale.CODESET)` works: if it
# fails, the interpreter skips locale coercion for that target locale.
_check_nl_langinfo_CODESET = (
    sys.platform not in ("darwin", "linux")
    and all(hasattr(locale, attr) for attr in ("nl_langinfo", "CODESET"))
)
85
def _set_locale_in_subprocess(locale_name):
    """Report whether a child interpreter can set LC_CTYPE to *locale_name*.

    The child is started with an empty PYTHONCOERCECLOCALE and calls
    locale.setlocale(locale.LC_CTYPE, locale_name). When
    _check_nl_langinfo_CODESET is true, the child additionally exits with a
    non-zero status if locale.nl_langinfo(locale.CODESET) is falsy.

    Returns True if the child exited with status 0, False otherwise.
    """
    statements = [
        "import locale",
        "print(locale.setlocale(locale.LC_CTYPE, '{}'))",
    ]
    if _check_nl_langinfo_CODESET:
        # If there's no valid CODESET, we expect coercion to be skipped
        statements.append("import sys")
        statements.append("sys.exit(not locale.nl_langinfo(locale.CODESET))")
    cmd = "; ".join(statements).format(locale_name)
    result, _ = run_python_until_end("-c", cmd, PYTHONCOERCECLOCALE='')
    return result.rc == 0
94
95
96
_fields = "fsencoding stdin_info stdout_info stderr_info lang lc_ctype lc_all"
_EncodingDetails = namedtuple("EncodingDetails", _fields)


class EncodingDetails(_EncodingDetails):
    """Encoding and locale settings reported by, or expected from, a child."""

    # One statement per reported field, in the same order as ``_fields``
    # XXX (ncoghlan): Using JSON for child state reporting may be less fragile
    CHILD_PROCESS_SCRIPT = ";".join((
        "import sys, os, codecs",
        "print(codecs.lookup(sys.getfilesystemencoding()).name)",
        "print(codecs.lookup(sys.stdin.encoding).name + ':' + sys.stdin.errors)",
        "print(codecs.lookup(sys.stdout.encoding).name + ':' + sys.stdout.errors)",
        "print(codecs.lookup(sys.stderr.encoding).name + ':' + sys.stderr.errors)",
        "print(os.environ.get('LANG', 'not set'))",
        "print(os.environ.get('LC_CTYPE', 'not set'))",
        "print(os.environ.get('LC_ALL', 'not set'))",
    ))

    @classmethod
    def get_expected_details(cls, coercion_expected, fs_encoding, stream_encoding, env_vars):
        """Build the dict of details a child process is expected to report

        Parameters:
            coercion_expected: if true, LC_CTYPE is expected to be reported
              as CLI_COERCION_TARGET rather than the value from env_vars
            fs_encoding: expected filesystem encoding
            stream_encoding: expected encoding of all three standard streams
            env_vars: environment passed to the child; LANG, LC_CTYPE and
              LC_ALL default to "not set" when missing

        Returns a plain dict keyed by the EncodingDetails field names.
        """
        stream_template = stream_encoding + ":{}"
        # stdin and stdout should use surrogateescape either because the
        # coercion triggered, or because the C locale was detected, while
        # stderr should always use backslashreplace
        error_handlers = ("surrogateescape", "surrogateescape", "backslashreplace")
        stream_info = [stream_template.format(name) for name in error_handlers]

        lang = env_vars.get("LANG", "not set")
        lc_ctype = (CLI_COERCION_TARGET if coercion_expected
                    else env_vars.get("LC_CTYPE", "not set"))
        lc_all = env_vars.get("LC_ALL", "not set")

        details = cls(fs_encoding, *stream_info, lang, lc_ctype, lc_all)
        return dict(details._asdict())

    @classmethod
    def get_child_details(cls, env_vars):
        """Run CHILD_PROCESS_SCRIPT in a child process and collect its report

        The child is started through run_python_until_end() with
        ``-X utf8=0`` and with env_vars passed as keyword arguments. If it
        exits with a non-zero status, result.fail() is called with the
        command line.

        Returns (encoding_details, stderr_lines):

        - encoding_details: dict keyed by the EncodingDetails field names,
          filled from the lines the child wrote to stdout
        - stderr_lines: result of calling splitlines() on the stderr output
          after stripping trailing whitespace
        """
        result, py_cmd = run_python_until_end(
            "-X", "utf8=0",
            "-c", cls.CHILD_PROCESS_SCRIPT,
            **env_vars
        )
        if result.rc != 0:
            result.fail(py_cmd)
        # All subprocess outputs in this test case should be pure ASCII
        reported = cls(*result.out.decode("ascii").splitlines())
        child_encoding_details = dict(reported._asdict())
        stderr_text = result.err.decode("ascii")
        return child_encoding_details, stderr_text.rstrip().splitlines()
154
155
# Text of the shared library warning emitted at runtime
LEGACY_LOCALE_WARNING = (
    "Python runtime initialized with LC_CTYPE=C "
    "(a locale with default ASCII encoding), "
    "which may cause Unicode compatibility problems. "
    "Using C.UTF-8, C.utf8, or UTF-8 (if available) "
    "as alternative Unicode-compatible locales is recommended."
)

# Template of the CLI locale coercion warning emitted at runtime; the
# placeholder is filled with the name of the coercion target locale
CLI_COERCION_WARNING_FMT = (
    "Python detected LC_CTYPE=C: LC_CTYPE coerced to {} "
    "(set another locale or PYTHONCOERCECLOCALE=0 "
    "to disable this locale coercion behavior)."
)


# Populated by setUpModule(); None means "not initialised yet"
AVAILABLE_TARGETS = None     # list of usable target locales
CLI_COERCION_TARGET = None   # first entry of AVAILABLE_TARGETS, if any
CLI_COERCION_WARNING = None  # CLI_COERCION_WARNING_FMT for that target
174
def setUpModule():
    """Probe the candidate target locales and record the expected outcome

    Sets AVAILABLE_TARGETS to the entries of _C_UTF8_LOCALES that
    _set_locale_in_subprocess() accepts. If at least one is available,
    CLI_COERCION_TARGET is set to the first of them and CLI_COERCION_WARNING
    to the matching warning text. Calls made after AVAILABLE_TARGETS has
    been set return immediately.
    """
    global AVAILABLE_TARGETS
    global CLI_COERCION_TARGET
    global CLI_COERCION_WARNING

    if AVAILABLE_TARGETS is not None:
        # initialization already done
        return

    # Find the target locales available in the current system
    AVAILABLE_TARGETS = []
    AVAILABLE_TARGETS.extend(
        candidate for candidate in _C_UTF8_LOCALES
        if _set_locale_in_subprocess(candidate)
    )
    if not AVAILABLE_TARGETS:
        return

    # Coercion is expected to use the first available target locale
    CLI_COERCION_TARGET = AVAILABLE_TARGETS[0]
    CLI_COERCION_WARNING = CLI_COERCION_WARNING_FMT.format(CLI_COERCION_TARGET)
194
195
class _LocaleHandlingTestCase(unittest.TestCase):
    # Base class providing the shared child process comparison helper

    def _check_child_encoding_details(self,
                                      env_vars,
                                      expected_fs_encoding,
                                      expected_stream_encoding,
                                      expected_warnings,
                                      coercion_expected):
        """Compare a child's reported details and stderr with expectations

        Parameters:
            env_vars: environment variables to run the child process with
            expected_fs_encoding: expected sys.getfilesystemencoding() result
            expected_stream_encoding: expected encoding for standard streams
            expected_warnings: list of stderr lines to expect, or None when
              stderr is expected to be empty
            coercion_expected: whether LC_CTYPE is expected to be reported
              as the coercion target locale
        """
        actual_details, stderr_lines = EncodingDetails.get_child_details(env_vars)
        expected_details = EncodingDetails.get_expected_details(
            coercion_expected,
            expected_fs_encoding,
            expected_stream_encoding,
            env_vars,
        )
        self.assertEqual(actual_details, expected_details)
        warnings_to_expect = [] if expected_warnings is None else expected_warnings
        self.assertEqual(stderr_lines, warnings_to_expect)
224
225
class LocaleConfigurationTests(_LocaleHandlingTestCase):
    # Checks explicit external configuration via the process environment

    @classmethod
    def setUpClass(cls):
        # AVAILABLE_TARGETS is only filled in by setUpModule(), so this
        # can't be expressed with the @unittest.skipUnless decorator
        if AVAILABLE_TARGETS:
            return
        raise unittest.SkipTest("No C-with-UTF-8 locale available")

    def test_external_target_locale_configuration(self):
        # Explicitly setting a target locale should give the same behaviour as
        # is seen when implicitly coercing to that target locale
        self.maxDiff = None

        expected_encoding = "utf-8"  # for both the filesystem and the streams
        cleared_env = dict.fromkeys(
            ("LANG", "LC_CTYPE", "LC_ALL", "PYTHONCOERCECLOCALE"), ""
        )

        for env_var in ("LANG", "LC_CTYPE"):
            for locale_to_set in AVAILABLE_TARGETS:
                # XXX (ncoghlan): LANG=UTF-8 doesn't appear to work as
                #                 expected, so skip that combination for now
                # See https://bugs.python.org/issue30672 for discussion
                if (env_var, locale_to_set) == ("LANG", "UTF-8"):
                    continue

                with self.subTest(env_var=env_var,
                                  configured_locale=locale_to_set):
                    var_dict = {**cleared_env, env_var: locale_to_set}
                    self._check_child_encoding_details(var_dict,
                                                       expected_encoding,
                                                       expected_encoding,
                                                       expected_warnings=None,
                                                       coercion_expected=False)
268
269
270
@test.support.cpython_only
@unittest.skipUnless(sysconfig.get_config_var("PY_COERCE_C_LOCALE"),
                     "C locale coercion disabled at build time")
class LocaleCoercionTests(_LocaleHandlingTestCase):
    # Checks implicit reconfiguration of the environment during CLI startup

    def _check_c_locale_coercion(self,
                                 fs_encoding, stream_encoding,
                                 coerce_c_locale,
                                 expected_warnings=None,
                                 coercion_expected=True,
                                 **extra_vars):
        """Run the child process checks for one PYTHONCOERCECLOCALE setting

        The child is checked once with LANG and LC_CTYPE left empty, and
        then once for every combination of LANG / LC_CTYPE with each entry
        of EXPECTED_C_LOCALE_EQUIVALENTS.

        Parameters:
            fs_encoding: expected sys.getfilesystemencoding() result
            stream_encoding: expected encoding for standard streams
            coerce_c_locale: setting to use for PYTHONCOERCECLOCALE
              None: leave the variable at its empty default
              str: the value set in the child's environment
            expected_warnings: expected warning lines on stderr
            coercion_expected: whether LC_CTYPE is expected to be coerced
            extra_vars: additional environment variables to set in subprocess
        """
        self.maxDiff = None

        if not AVAILABLE_TARGETS:
            # Locale coercion is disabled when there aren't any target locales
            fs_encoding = EXPECTED_C_LOCALE_FS_ENCODING
            stream_encoding = EXPECTED_C_LOCALE_STREAM_ENCODING
            coercion_expected = False
            if expected_warnings:
                expected_warnings = [LEGACY_LOCALE_WARNING]

        child_env = {
            "LANG": "",
            "LC_CTYPE": "",
            "LC_ALL": "",
            "PYTHONCOERCECLOCALE": "",
            **extra_vars,
        }
        if coerce_c_locale is not None:
            child_env["PYTHONCOERCECLOCALE"] = coerce_c_locale

        # Check behaviour for the default locale
        with self.subTest(default_locale=True,
                          PYTHONCOERCECLOCALE=coerce_c_locale):
            if EXPECT_COERCION_IN_DEFAULT_LOCALE:
                default_warnings = expected_warnings
                default_coercion = coercion_expected
            else:
                default_warnings = None
                default_coercion = False
            # On Android CLI_COERCION_WARNING is not printed when all the
            # locale environment variables are undefined or empty. When
            # this code path is run with environ['LC_ALL'] == 'C', then
            # LEGACY_LOCALE_WARNING is printed.
            if (test.support.is_android and
                    default_warnings == [CLI_COERCION_WARNING]):
                default_warnings = None
            self._check_child_encoding_details(child_env,
                                               fs_encoding,
                                               stream_encoding,
                                               default_warnings,
                                               default_coercion)

        # Check behaviour for explicitly configured locales
        for locale_to_set in EXPECTED_C_LOCALE_EQUIVALENTS:
            for env_var in ("LANG", "LC_CTYPE"):
                with self.subTest(env_var=env_var,
                                  nominal_locale=locale_to_set,
                                  PYTHONCOERCECLOCALE=coerce_c_locale):
                    var_dict = {**child_env, env_var: locale_to_set}
                    # Check behaviour on successful coercion
                    self._check_child_encoding_details(var_dict,
                                                       fs_encoding,
                                                       stream_encoding,
                                                       expected_warnings,
                                                       coercion_expected)

    def test_PYTHONCOERCECLOCALE_not_set(self):
        # This should coerce to the first available target locale by default
        self._check_c_locale_coercion("utf-8", "utf-8", coerce_c_locale=None)

    def test_PYTHONCOERCECLOCALE_not_zero(self):
        # *Any* string other than "0" is considered "set" for our purposes
        # and hence should result in the locale coercion being enabled
        for setting in ("", "1", "true", "false"):
            self._check_c_locale_coercion("utf-8", "utf-8",
                                          coerce_c_locale=setting)

    def test_PYTHONCOERCECLOCALE_set_to_warn(self):
        # PYTHONCOERCECLOCALE=warn enables runtime warnings for legacy locales
        self._check_c_locale_coercion(
            "utf-8", "utf-8",
            coerce_c_locale="warn",
            expected_warnings=[CLI_COERCION_WARNING],
        )

    def test_PYTHONCOERCECLOCALE_set_to_zero(self):
        # The setting "0" should result in the locale coercion being disabled,
        # and setting LC_ALL=C shouldn't make any difference to the behaviour
        for extra_env in ({}, {"LC_ALL": "C"}):
            self._check_c_locale_coercion(EXPECTED_C_LOCALE_FS_ENCODING,
                                          EXPECTED_C_LOCALE_STREAM_ENCODING,
                                          coerce_c_locale="0",
                                          coercion_expected=False,
                                          **extra_env)

    def test_LC_ALL_set_to_C(self):
        # Setting LC_ALL should render the locale coercion ineffective
        self._check_c_locale_coercion(
            EXPECTED_C_LOCALE_FS_ENCODING,
            EXPECTED_C_LOCALE_STREAM_ENCODING,
            coerce_c_locale=None,
            coercion_expected=False,
            LC_ALL="C",
        )
        # And result in a warning about a lack of locale compatibility
        self._check_c_locale_coercion(
            EXPECTED_C_LOCALE_FS_ENCODING,
            EXPECTED_C_LOCALE_STREAM_ENCODING,
            coerce_c_locale="warn",
            expected_warnings=[LEGACY_LOCALE_WARNING],
            coercion_expected=False,
            LC_ALL="C",
        )

    def test_PYTHONCOERCECLOCALE_set_to_one(self):
        # skip the test if the LC_CTYPE locale is C or coerced
        saved_loc = locale.setlocale(locale.LC_CTYPE, None)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_loc)
        loc = locale.setlocale(locale.LC_CTYPE, "")
        if loc == "C":
            self.skipTest("test requires LC_CTYPE locale different than C")
        if loc in TARGET_LOCALES:
            self.skipTest("coerced LC_CTYPE locale: %s" % loc)

        # bpo-35336: PYTHONCOERCECLOCALE=1 must not coerce the LC_CTYPE locale
        # if it's not equal to "C"
        code = 'import locale; print(locale.setlocale(locale.LC_CTYPE, None))'
        child_env = {**os.environ, "PYTHONCOERCECLOCALE": "1"}
        proc = subprocess.run([sys.executable, '-c', code],
                              stdout=subprocess.PIPE,
                              env=child_env,
                              text=True)
        self.assertEqual(proc.stdout.rstrip(), loc)
415
416
def test_main():
    """Run both test classes, then call test.support.reap_children()."""
    test_classes = (LocaleConfigurationTests, LocaleCoercionTests)
    test.support.run_unittest(*test_classes)
    test.support.reap_children()


# Allow the file to be executed directly as a script
if __name__ == "__main__":
    test_main()
426