# Tests the attempted automatic coercion of the C locale to a UTF-8 locale

import locale
import os
import shutil
import subprocess
import sys
import sysconfig
import unittest
from collections import namedtuple

import test.support
from test.support.script_helper import (
    run_python_until_end,
    interpreter_requires_environment,
)

# Set the list of ways we expect to be able to ask for the "C" locale
EXPECTED_C_LOCALE_EQUIVALENTS = ["C", "invalid.ascii"]

# Set our expectation for the default encoding used in the C locale
# for the filesystem encoding and the standard streams
EXPECTED_C_LOCALE_STREAM_ENCODING = "ascii"
EXPECTED_C_LOCALE_FS_ENCODING = "ascii"

# Set our expectation for the default locale used when none is specified
EXPECT_COERCION_IN_DEFAULT_LOCALE = True

# Coercion target locales, in preference order. This is the single source of
# truth for the candidate list; _C_UTF8_LOCALES below is derived from it.
TARGET_LOCALES = ["C.UTF-8", "C.utf8", "UTF-8"]

# Apply some platform dependent overrides
if sys.platform.startswith("linux"):
    if test.support.is_android:
        # Android defaults to using UTF-8 for all system interfaces
        EXPECTED_C_LOCALE_STREAM_ENCODING = "utf-8"
        EXPECTED_C_LOCALE_FS_ENCODING = "utf-8"
    else:
        # Linux distros typically alias the POSIX locale directly to the C
        # locale.
        # TODO: Once https://bugs.python.org/issue30672 is addressed, we'll be
        #       able to check this case unconditionally
        EXPECTED_C_LOCALE_EQUIVALENTS.append("POSIX")
elif sys.platform.startswith("aix"):
    # AIX uses iso8859-1 in the C locale, other *nix platforms use ASCII
    EXPECTED_C_LOCALE_STREAM_ENCODING = "iso8859-1"
    EXPECTED_C_LOCALE_FS_ENCODING = "iso8859-1"
elif sys.platform == "darwin":
    # FS encoding is UTF-8 on macOS
    EXPECTED_C_LOCALE_FS_ENCODING = "utf-8"
elif sys.platform == "cygwin":
    # Cygwin defaults to using C.UTF-8
    # TODO: Work out a robust dynamic test for this that doesn't rely on
    #       CPython's own locale handling machinery
    EXPECT_COERCION_IN_DEFAULT_LOCALE = False

# Note that the above expectations are still wrong in some cases, such as:
# * Windows when PYTHONLEGACYWINDOWSFSENCODING is set
# * Any platform other than AIX that uses latin-1 in the C locale
# * Any Linux distro where POSIX isn't a simple alias for the C locale
# * Any Linux distro where the default locale is something other than "C"
#
# Options for dealing with this:
# * Don't set the PY_COERCE_C_LOCALE preprocessor definition on
#   such platforms (e.g. it isn't set on Windows)
# * Fix the test expectations to match the actual platform behaviour

# In order to get the warning messages to match up as expected, the candidate
# order here must match the target locale order in Python/pylifecycle.c.
# Derived from TARGET_LOCALES so the two sequences cannot drift apart.
_C_UTF8_LOCALES = tuple(TARGET_LOCALES)

# There's no reliable cross-platform way of checking locale alias
# lists, so the only way of knowing which of these locales will work
# is to try them with locale.setlocale(). We do that in a subprocess
# in setUpModule() below to avoid altering the locale of the test runner.
#
# If the relevant locale module attributes exist, and we're not on a platform
# where we expect it to always succeed, we also check that
# `locale.nl_langinfo(locale.CODESET)` works, as if it fails, the interpreter
# will skip locale coercion for that particular target locale
_check_nl_langinfo_CODESET = bool(
    sys.platform not in ("darwin", "linux") and
    hasattr(locale, "nl_langinfo") and
    hasattr(locale, "CODESET")
)


def _set_locale_in_subprocess(locale_name):
    """Return True if a child interpreter can switch LC_CTYPE to locale_name

    The check runs in a subprocess so the locale of the test runner itself
    is never modified. When _check_nl_langinfo_CODESET is set, the child
    additionally exits non-zero if nl_langinfo(CODESET) is empty.
    """
    cmd_fmt = "import locale; print(locale.setlocale(locale.LC_CTYPE, '{}'))"
    if _check_nl_langinfo_CODESET:
        # If there's no valid CODESET, we expect coercion to be skipped
        cmd_fmt += "; import sys; sys.exit(not locale.nl_langinfo(locale.CODESET))"
    cmd = cmd_fmt.format(locale_name)
    # Only the exit status matters here; the reconstructed command line
    # returned alongside it is not needed.
    result, _ = run_python_until_end("-c", cmd, PYTHONCOERCECLOCALE='')
    return result.rc == 0


_fields = "fsencoding stdin_info stdout_info stderr_info lang lc_ctype lc_all"
_EncodingDetails = namedtuple("EncodingDetails", _fields)


class EncodingDetails(_EncodingDetails):
    """Encoding and locale settings reported by (or expected of) a child"""

    # One print() per namedtuple field, in field order, so the child's stdout
    # lines can be passed positionally to the constructor.
    # XXX (ncoghlan): Using JSON for child state reporting may be less fragile
    CHILD_PROCESS_SCRIPT = ";".join([
        "import sys, os, codecs",
        "print(codecs.lookup(sys.getfilesystemencoding()).name)",
        "print(codecs.lookup(sys.stdin.encoding).name + ':' + sys.stdin.errors)",
        "print(codecs.lookup(sys.stdout.encoding).name + ':' + sys.stdout.errors)",
        "print(codecs.lookup(sys.stderr.encoding).name + ':' + sys.stderr.errors)",
        "print(os.environ.get('LANG', 'not set'))",
        "print(os.environ.get('LC_CTYPE', 'not set'))",
        "print(os.environ.get('LC_ALL', 'not set'))",
    ])

    @classmethod
    def get_expected_details(cls, coercion_expected, fs_encoding, stream_encoding, env_vars):
        """Returns expected child process details for a given encoding

        Parameters:
            coercion_expected: if true, the child's LC_CTYPE is expected to be
              CLI_COERCION_TARGET rather than the value given in env_vars
            fs_encoding: expected sys.getfilesystemencoding() result
            stream_encoding: expected encoding for the standard streams
            env_vars: environment mapping the child is started with

        Returns a plain dict keyed by the EncodingDetails field names.
        """
        _stream = stream_encoding + ":{}"
        # stdin and stdout should use surrogateescape either because the
        # coercion triggered, or because the C locale was detected
        stream_info = 2*[_stream.format("surrogateescape")]
        # stderr should always use backslashreplace
        stream_info.append(_stream.format("backslashreplace"))
        expected_lang = env_vars.get("LANG", "not set")
        if coercion_expected:
            expected_lc_ctype = CLI_COERCION_TARGET
        else:
            expected_lc_ctype = env_vars.get("LC_CTYPE", "not set")
        expected_lc_all = env_vars.get("LC_ALL", "not set")
        env_info = expected_lang, expected_lc_ctype, expected_lc_all
        return dict(cls(fs_encoding, *stream_info, *env_info)._asdict())

    @classmethod
    def get_child_details(cls, env_vars):
        """Retrieves fsencoding and standard stream details from a child process

        Returns (encoding_details, stderr_lines):

        - encoding_details: dict keyed by the EncodingDetails field names
        - stderr_lines: result of calling splitlines() on the stderr output

        The child runs CHILD_PROCESS_SCRIPT with env_vars applied to its
        environment. If it exits with a non-zero status, the failure is
        reported through result.fail().
        """
        # "-X utf8=0" explicitly disables the Python UTF-8 Mode in the child.
        result, py_cmd = run_python_until_end(
            "-X", "utf8=0", "-c", cls.CHILD_PROCESS_SCRIPT,
            **env_vars
        )
        if result.rc != 0:
            result.fail(py_cmd)
        # All subprocess outputs in this test case should be pure ASCII
        stdout_lines = result.out.decode("ascii").splitlines()
        child_encoding_details = dict(cls(*stdout_lines)._asdict())
        stderr_lines = result.err.decode("ascii").rstrip().splitlines()
        return child_encoding_details, stderr_lines


# Details of the shared library warning emitted at runtime
LEGACY_LOCALE_WARNING = (
    "Python runtime initialized with LC_CTYPE=C (a locale with default ASCII "
    "encoding), which may cause Unicode compatibility problems. Using C.UTF-8, "
    "C.utf8, or UTF-8 (if available) as alternative Unicode-compatible "
    "locales is recommended."
)

# Details of the CLI locale coercion warning emitted at runtime
CLI_COERCION_WARNING_FMT = (
    "Python detected LC_CTYPE=C: LC_CTYPE coerced to {} (set another locale "
    "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behavior)."
)


# Populated by setUpModule(); None means "not initialised yet"
AVAILABLE_TARGETS = None
CLI_COERCION_TARGET = None
CLI_COERCION_WARNING = None


def setUpModule():
    """Probe which target locales work and derive the expected warning"""
    global AVAILABLE_TARGETS
    global CLI_COERCION_TARGET
    global CLI_COERCION_WARNING

    if AVAILABLE_TARGETS is not None:
        # initialization already done
        return
    AVAILABLE_TARGETS = []

    # Find the target locales available in the current system
    for target_locale in _C_UTF8_LOCALES:
        if _set_locale_in_subprocess(target_locale):
            AVAILABLE_TARGETS.append(target_locale)

    if AVAILABLE_TARGETS:
        # Coercion is expected to use the first available target locale
        CLI_COERCION_TARGET = AVAILABLE_TARGETS[0]
        CLI_COERCION_WARNING = CLI_COERCION_WARNING_FMT.format(CLI_COERCION_TARGET)


class _LocaleHandlingTestCase(unittest.TestCase):
    # Base class to check expected locale handling behaviour

    def _check_child_encoding_details(self,
                                      env_vars,
                                      expected_fs_encoding,
                                      expected_stream_encoding,
                                      expected_warnings,
                                      coercion_expected):
        """Check the C locale handling for the given process environment

        Parameters:
            env_vars: environment variables to set in the child process
            expected_fs_encoding: expected sys.getfilesystemencoding() result
            expected_stream_encoding: expected encoding for standard streams
            expected_warnings: list of stderr lines to expect; None is
              treated as "no stderr output expected"
            coercion_expected: whether LC_CTYPE is expected to have been
              replaced by the coercion target in the child's environment
        """
        result = EncodingDetails.get_child_details(env_vars)
        encoding_details, stderr_lines = result
        expected_details = EncodingDetails.get_expected_details(
            coercion_expected,
            expected_fs_encoding,
            expected_stream_encoding,
            env_vars
        )
        self.assertEqual(encoding_details, expected_details)
        if expected_warnings is None:
            expected_warnings = []
        self.assertEqual(stderr_lines, expected_warnings)


class LocaleConfigurationTests(_LocaleHandlingTestCase):
    # Test explicit external configuration via the process environment

    @classmethod
    def setUpClass(cls):
        # This relies on setUpModule() having been run, so it can't be
        # handled via the @unittest.skipUnless decorator
        if not AVAILABLE_TARGETS:
            raise unittest.SkipTest("No C-with-UTF-8 locale available")

    def test_external_target_locale_configuration(self):

        # Explicitly setting a target locale should give the same behaviour as
        # is seen when implicitly coercing to that target locale
        self.maxDiff = None

        expected_fs_encoding = "utf-8"
        expected_stream_encoding = "utf-8"

        base_var_dict = {
            "LANG": "",
            "LC_CTYPE": "",
            "LC_ALL": "",
            "PYTHONCOERCECLOCALE": "",
        }
        for env_var in ("LANG", "LC_CTYPE"):
            for locale_to_set in AVAILABLE_TARGETS:
                # XXX (ncoghlan): LANG=UTF-8 doesn't appear to work as
                #                 expected, so skip that combination for now
                # See https://bugs.python.org/issue30672 for discussion
                if env_var == "LANG" and locale_to_set == "UTF-8":
                    continue

                with self.subTest(env_var=env_var,
                                  configured_locale=locale_to_set):
                    var_dict = base_var_dict.copy()
                    var_dict[env_var] = locale_to_set
                    self._check_child_encoding_details(var_dict,
                                                       expected_fs_encoding,
                                                       expected_stream_encoding,
                                                       expected_warnings=None,
                                                       coercion_expected=False)


@test.support.cpython_only
@unittest.skipUnless(sysconfig.get_config_var("PY_COERCE_C_LOCALE"),
                     "C locale coercion disabled at build time")
class LocaleCoercionTests(_LocaleHandlingTestCase):
    # Test implicit reconfiguration of the environment during CLI startup

    def _check_c_locale_coercion(self,
                                 fs_encoding, stream_encoding,
                                 coerce_c_locale,
                                 expected_warnings=None,
                                 coercion_expected=True,
                                 **extra_vars):
        """Check the C locale handling for various configurations

        Parameters:
            fs_encoding: expected sys.getfilesystemencoding() result
            stream_encoding: expected encoding for standard streams
            coerce_c_locale: setting to use for PYTHONCOERCECLOCALE
              None: don't set the variable at all
              str: the value set in the child's environment
            expected_warnings: expected warning lines on stderr
            coercion_expected: whether the child's LC_CTYPE is expected to be
              coerced to the first available target locale
            extra_vars: additional environment variables to set in subprocess
        """
        self.maxDiff = None

        if not AVAILABLE_TARGETS:
            # Locale coercion is disabled when there aren't any target locales
            fs_encoding = EXPECTED_C_LOCALE_FS_ENCODING
            stream_encoding = EXPECTED_C_LOCALE_STREAM_ENCODING
            coercion_expected = False
            if expected_warnings:
                expected_warnings = [LEGACY_LOCALE_WARNING]

        base_var_dict = {
            "LANG": "",
            "LC_CTYPE": "",
            "LC_ALL": "",
            "PYTHONCOERCECLOCALE": "",
        }
        base_var_dict.update(extra_vars)
        if coerce_c_locale is not None:
            base_var_dict["PYTHONCOERCECLOCALE"] = coerce_c_locale

        # Check behaviour for the default locale
        with self.subTest(default_locale=True,
                          PYTHONCOERCECLOCALE=coerce_c_locale):
            if EXPECT_COERCION_IN_DEFAULT_LOCALE:
                _expected_warnings = expected_warnings
                _coercion_expected = coercion_expected
            else:
                _expected_warnings = None
                _coercion_expected = False
            # On Android CLI_COERCION_WARNING is not printed when all the
            # locale environment variables are undefined or empty. When
            # this code path is run with environ['LC_ALL'] == 'C', then
            # LEGACY_LOCALE_WARNING is printed.
            if (test.support.is_android and
                    _expected_warnings == [CLI_COERCION_WARNING]):
                _expected_warnings = None
            self._check_child_encoding_details(base_var_dict,
                                               fs_encoding,
                                               stream_encoding,
                                               _expected_warnings,
                                               _coercion_expected)

        # Check behaviour for explicitly configured locales
        for locale_to_set in EXPECTED_C_LOCALE_EQUIVALENTS:
            for env_var in ("LANG", "LC_CTYPE"):
                with self.subTest(env_var=env_var,
                                  nominal_locale=locale_to_set,
                                  PYTHONCOERCECLOCALE=coerce_c_locale):
                    var_dict = base_var_dict.copy()
                    var_dict[env_var] = locale_to_set
                    # Check behaviour on successful coercion
                    self._check_child_encoding_details(var_dict,
                                                       fs_encoding,
                                                       stream_encoding,
                                                       expected_warnings,
                                                       coercion_expected)

    def test_PYTHONCOERCECLOCALE_not_set(self):
        # This should coerce to the first available target locale by default
        self._check_c_locale_coercion("utf-8", "utf-8", coerce_c_locale=None)

    def test_PYTHONCOERCECLOCALE_not_zero(self):
        # *Any* string other than "0" is considered "set" for our purposes
        # and hence should result in the locale coercion being enabled
        for setting in ("", "1", "true", "false"):
            self._check_c_locale_coercion("utf-8", "utf-8", coerce_c_locale=setting)

    def test_PYTHONCOERCECLOCALE_set_to_warn(self):
        # PYTHONCOERCECLOCALE=warn enables runtime warnings for legacy locales
        self._check_c_locale_coercion("utf-8", "utf-8",
                                      coerce_c_locale="warn",
                                      expected_warnings=[CLI_COERCION_WARNING])

    def test_PYTHONCOERCECLOCALE_set_to_zero(self):
        # The setting "0" should result in the locale coercion being disabled
        self._check_c_locale_coercion(EXPECTED_C_LOCALE_FS_ENCODING,
                                      EXPECTED_C_LOCALE_STREAM_ENCODING,
                                      coerce_c_locale="0",
                                      coercion_expected=False)
        # Setting LC_ALL=C shouldn't make any difference to the behaviour
        self._check_c_locale_coercion(EXPECTED_C_LOCALE_FS_ENCODING,
                                      EXPECTED_C_LOCALE_STREAM_ENCODING,
                                      coerce_c_locale="0",
                                      LC_ALL="C",
                                      coercion_expected=False)

    def test_LC_ALL_set_to_C(self):
        # Setting LC_ALL should render the locale coercion ineffective
        self._check_c_locale_coercion(EXPECTED_C_LOCALE_FS_ENCODING,
                                      EXPECTED_C_LOCALE_STREAM_ENCODING,
                                      coerce_c_locale=None,
                                      LC_ALL="C",
                                      coercion_expected=False)
        # And result in a warning about a lack of locale compatibility
        self._check_c_locale_coercion(EXPECTED_C_LOCALE_FS_ENCODING,
                                      EXPECTED_C_LOCALE_STREAM_ENCODING,
                                      coerce_c_locale="warn",
                                      LC_ALL="C",
                                      expected_warnings=[LEGACY_LOCALE_WARNING],
                                      coercion_expected=False)

    def test_PYTHONCOERCECLOCALE_set_to_one(self):
        # skip the test if the LC_CTYPE locale is C or coerced
        old_loc = locale.setlocale(locale.LC_CTYPE, None)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, old_loc)
        loc = locale.setlocale(locale.LC_CTYPE, "")
        if loc == "C":
            self.skipTest("test requires LC_CTYPE locale different than C")
        if loc in TARGET_LOCALES:
            self.skipTest("coerced LC_CTYPE locale: %s" % loc)

        # bpo-35336: PYTHONCOERCECLOCALE=1 must not coerce the LC_CTYPE locale
        # if it's not equal to "C"
        code = 'import locale; print(locale.setlocale(locale.LC_CTYPE, None))'
        env = dict(os.environ, PYTHONCOERCECLOCALE='1')
        cmd = subprocess.run([sys.executable, '-c', code],
                             stdout=subprocess.PIPE,
                             env=env,
                             text=True)
        self.assertEqual(cmd.stdout.rstrip(), loc)


def test_main():
    test.support.run_unittest(
        LocaleConfigurationTests,
        LocaleCoercionTests
    )
    test.support.reap_children()


if __name__ == "__main__":
    test_main()