# encoding.py - character transcoding support for Mercurial
#
# Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

from __future__ import absolute_import, print_function

import locale
import os
import re
import unicodedata

# NOTE: this deliberately shadows the builtin getattr with pycompat's
# wrapper (py2/py3 string compatibility).
from .pycompat import getattr
from . import (
    error,
    policy,
    pycompat,
)

from .pure import charencode as charencodepure

if pycompat.TYPE_CHECKING:
    from typing import (
        Any,
        Callable,
        List,
        Text,
        Type,
        TypeVar,
        Union,
    )

    # keep pyflakes happy
    for t in (Any, Callable, List, Text, Type, Union):
        assert t

    _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')

# charencode is loaded through the policy layer — presumably selecting the
# C implementation when available, with .pure.charencode as the fallback.
charencode = policy.importmod('charencode')

isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast

_sysstr = pycompat.sysstr

if pycompat.ispy3:
    # py3 has no unichr(); chr() is its equivalent
    unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
    b"206a 206b 206c 206d 206e 206f feff".split()
]
# verify the next function will work
assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)


def hfsignoreclean(s):
    # type: (bytes) -> bytes
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # cheap pre-check: every ignored codepoint's UTF-8 form starts with
    # 0xe2 or 0xef (asserted above), so most strings skip the loop
    if b"\xe2" in s or b"\xef" in s:
        for c in _ignore:
            s = s.replace(c, b'')
    return s


# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
if not pycompat.ispy3:
    environ = os.environ  # re-exports
elif _nativeenviron:
    environ = os.environb  # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = {
        k.encode('utf-8'): v.encode('utf-8')
        for k, v in os.environ.items()  # re-exports
    }

# normalize platform-reported encoding names to names Python's codec
# machinery accepts
_encodingrewrites = {
    b'646': b'ascii',
    b'ANSI_X3.4-1968': b'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows and not pycompat.ispy3:
    _encodingrewrites[b'cp65001'] = b'utf-8'

# Determine the local encoding: HGENCODING wins; otherwise the locale's
# preferred encoding (normalized through _encodingrewrites); ascii if the
# locale lookup itself fails.
try:
    encoding = environ.get(b"HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
        encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    encoding = b'ascii'
# codec error-handler name used when decoding local strings
# ('strict', 'replace' or 'ignore' — see fromlocal())
encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
# legacy encoding tried by tolocal() when repo data is not valid UTF-8
fallbackencoding = b'ISO-8859-1'


class localstr(bytes):
    """This class allows strings that are unmodified to be
    round-tripped to the local encoding and back"""

    def __new__(cls, u, l):
        # bytes content is the local-encoding form ``l``; the original
        # UTF-8 form ``u`` is cached on the instance for lossless recovery
        s = bytes.__new__(cls, l)
        s._utf8 = u
        return s

    if pycompat.TYPE_CHECKING:
        # pseudo implementation to help pytype see localstr() constructor
        def __init__(self, u, l):
            # type: (bytes, bytes) -> None
            super(localstr, self).__init__(l)
            self._utf8 = u

    def __hash__(self):
        return hash(self._utf8)  # avoid collisions in local string space


class safelocalstr(bytes):
    """Tagged string denoting it was previously an internal UTF-8 string,
    and can be converted back to UTF-8 losslessly

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """


def tolocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    if isasciistr(s):
        return s

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == b'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), "replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return safelocalstr(r)
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), "replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return safelocalstr(r)
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace")  # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), "replace")
    except LookupError as k:
        # the configured encoding name is unknown to Python's codec registry
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )


def fromlocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from the local character encoding to UTF-8

    We attempt to decode strings using the encoding mode set by
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
    characters will cause an error message. Other modes include
    'replace', which replaces unknown characters with a special
    Unicode character, and 'ignore', which drops the character.
    """

    # can we do a lossless round-trip?
    if isinstance(s, localstr):
        return s._utf8
    if isasciistr(s):
        return s

    try:
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return u.encode("utf-8")
    except UnicodeDecodeError as inst:
        # show a little context around the offending byte in the error
        sub = s[max(0, inst.start - 10) : inst.start + 10]
        raise error.Abort(
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
        )
    except LookupError as k:
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )


def unitolocal(u):
    # type: (Text) -> bytes
    """Convert a unicode string to a byte string of local encoding"""
    return tolocal(u.encode('utf-8'))


def unifromlocal(s):
    # type: (bytes) -> Text
    """Convert a byte string of local encoding to a unicode string"""
    return fromlocal(s).decode('utf-8')


def unimethod(bytesfunc):
    # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
    """Create a proxy method that forwards __unicode__() and __str__() of
    Python 3 to __bytes__()"""

    def unifunc(obj):
        return unifromlocal(bytesfunc(obj))

    return unifunc


# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
if pycompat.ispy3:
    strtolocal = unitolocal
    strfromlocal = unifromlocal
    strmethod = unimethod
else:
    # on Python 2 the native str type is already bytes, so the native-str
    # converters are identity functions

    def strtolocal(s):
        # type: (str) -> bytes
        return s  # pytype: disable=bad-return-type

    def strfromlocal(s):
        # type: (bytes) -> str
        return s  # pytype: disable=bad-return-type

    strmethod = pycompat.identity


def lower(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware case-folding of local string s"""
    try:
        # fast path for pure-ASCII input; non-ASCII falls through to the
        # encoding-aware path below
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        lu = u.lower()
        if u == lu:
            return s  # preserve localstring
        return lu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.lower()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )


def upper(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware case-folding of local string s"""
    try:
        # fast path for pure-ASCII input
        return asciiupper(s)
    except UnicodeDecodeError:
        return upperfallback(s)


def upperfallback(s):
    # type: (Any) -> Any
    """encoding-aware uppercasing for non-ASCII local strings"""
    try:
        if isinstance(s, localstr):
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        uu = u.upper()
        if u == uu:
            return s  # preserve localstring
        return uu.encode(_sysstr(encoding))
    except UnicodeError:
        return s.upper()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )


if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    if pycompat.iswindows and pycompat.ispy3:

        class WindowsEnviron(dict):
            """`os.environ` normalizes environment variables to uppercase on windows"""

            def get(self, key, default=None):
                # mirror that normalization for lookups on this bytes dict
                return super().get(upper(key), default)

        environ = WindowsEnviron()

    for k, v in os.environ.items():  # re-exports
        environ[tolocal(k.encode('utf-8'))] = tolocal(v.encode('utf-8'))


# matches a lowercase Windows drive-letter prefix such as b'c:'
DRIVE_RE = re.compile(b'^[a-z]:')

if pycompat.ispy3:
    # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
    # returns bytes.
    if pycompat.iswindows:
        # Python 3 on Windows issues a DeprecationWarning about using the bytes
        # API when os.getcwdb() is called.
        #
        # Additionally, py3.8+ uppercases the drive letter when calling
        # os.path.realpath(), which is used on ``repo.root``. Since those
        # strings are compared in various places as simple strings, also call
        # realpath here. See https://bugs.python.org/issue40368
        #
        # However this is not reliable, so lets explicitly make this drive
        # letter upper case.
        #
        # note: we should consider dropping realpath here since it seems to
        # change the semantic of `getcwd`.

        def getcwd():
            cwd = os.getcwd()  # re-exports
            cwd = os.path.realpath(cwd)
            cwd = strtolocal(cwd)
            if DRIVE_RE.match(cwd):
                cwd = cwd[0:1].upper() + cwd[1:]
            return cwd

    else:
        getcwd = os.getcwdb  # re-exports
else:
    getcwd = os.getcwd  # re-exports

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# east_asian_width categories counted as two columns: 'W'ide and
# 'F'ullwidth always; 'A'mbiguous too when HGENCODINGAMBIGUOUS=wide
_wide = _sysstr(
    environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
    and b"WFA"
    or b"WF"
)


def colwidth(s):
    # type: (bytes) -> int
    """Find the column width of a string for display in the local encoding"""
    return ucolwidth(s.decode(_sysstr(encoding), 'replace'))


def ucolwidth(d):
    # type: (Text) -> int
    """Find the column width of a Unicode string for display"""
    # east_asian_width may be missing on stripped-down unicodedata builds;
    # fall back to one column per character
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is not None:
        return sum([eaw(c) in _wide and 2 or 1 for c in d])
    return len(d)


def getcols(s, start, c):
    # type: (bytes, int, int) -> bytes
    """Use colwidth to find a c-column substring of s starting at byte
    index start"""
    for x in pycompat.xrange(start + c, len(s)):
        t = s[start:x]
        if colwidth(t) == c:
            return t
    raise ValueError('substring not found')


def trim(s, width, ellipsis=b'', leftside=False):
    # type: (bytes, int, bytes, bool) -> bytes
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> bprint(trim(t, 5, leftside=True))
    \xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8))
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> bprint(trim(t, 8, leftside=True))
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable input: fall back to plain byte-count trimming.
        # NOTE(review): here and below, width is reduced by len(ellipsis)
        # in bytes/characters, not display columns — fine for the usual
        # ASCII ellipsis; confirm if a wide ellipsis is ever passed.
        if len(s) <= width:  # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0:  # no enough room even for ellipsis
            return ellipsis[: width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width:  # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0:  # no enough room even for ellipsis
        return ellipsis[: width + len(ellipsis)]

    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda s: ellipsis + s
    else:
        uslice = lambda i: u[:-i]
        concat = lambda s: s + ellipsis
    # drop characters one at a time from the trimmed side until it fits
    for i in pycompat.xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(_sysstr(encoding)))
    return ellipsis  # no enough room for multi-column characters


class normcasespecs(object):
    """what a platform's normcase does to ASCII strings

    This is specified per platform, and should be consistent with what normcase
    on that platform actually does.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h."""

    lower = -1
    upper = 1
    other = 0


def jsonescape(s, paranoid=False):
    # type: (Any, Any) -> Any
    """returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    """

    u8chars = toutf8b(s)
    try:
        # try the fast (policy-selected) escaper first; it raises
        # ValueError when it cannot handle the input
        return _jsonescapeu8fast(u8chars, paranoid)
    except ValueError:
        pass
    return charencodepure.jsonescapeu8fallback(u8chars, paranoid)


# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
# bytes are mapped to that range.
if pycompat.ispy3:
    _utf8strict = r'surrogatepass'
else:
    _utf8strict = r'strict'

# UTF-8 sequence length, indexed by the high nibble of the first byte.
# 0x0-0x7 => 0 (ASCII fast path); 0x8-0xB => 1 (continuation byte as a
# sequence start — the attempted decode in getutf8char rejects it);
# 0xC-0xF => 2/2/3/4 byte sequences.
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]


def getutf8char(s, pos):
    # type: (bytes, int) -> bytes
    """get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    """

    # find how many bytes to attempt decoding from first nibble
    l = _utf8len[ord(s[pos : pos + 1]) >> 4]
    if not l:  # ascii
        return s[pos : pos + 1]

    c = s[pos : pos + l]
    # validate with attempted decode
    c.decode("utf-8", _utf8strict)
    return c


def toutf8b(s):
    # type: (bytes) -> bytes
    """convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can have
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    """

    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    elif isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legacy encoding, which
        # shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    elif isasciistr(s):
        return s
    if b"\xed" not in s:
        # no 0xed byte means no existing U+DCxx escapes; pass valid UTF-8
        # through untouched
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    s = pycompat.bytestr(s)
    r = b""
    pos = 0
    l = len(s)
    # walk the string one UTF-8 character at a time, escaping each byte
    # that doesn't start a valid character as U+DC00 + byte value
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
            pos += 1
        r += c
    return r


def fromutf8b(s):
    # type: (bytes) -> bytes
    """Given a UTF-8b string, return a local, possibly-binary string.

    return the original binary string. This
    is a round-trip process for strings like filenames, but metadata
    that was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    """

    if isasciistr(s):
        return s
    # fast path - look for uDxxx prefixes in s
    if b"\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    r = b""
    pos = 0
    l = len(s)
    while pos < l:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters
        if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
            # recover the original byte: low 8 bits of the surrogate
            c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
        r += c
    return r