1# encoding.py - character transcoding support for Mercurial
2#
3#  Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
4#
5# This software may be used and distributed according to the terms of the
6# GNU General Public License version 2 or any later version.
7
8from __future__ import absolute_import, print_function
9
10import locale
11import os
12import re
13import unicodedata
14
15from .pycompat import getattr
16from . import (
17    error,
18    policy,
19    pycompat,
20)
21
22from .pure import charencode as charencodepure
23
if pycompat.TYPE_CHECKING:
    from typing import (
        Any,
        Callable,
        List,
        Text,
        Type,
        TypeVar,
        Union,
    )

    # keep pyflakes happy
    for t in (Any, Callable, List, Text, Type, Union):
        assert t

    _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')

# importmod selects the preferred implementation (native or pure Python)
# of the charencode helpers
charencode = policy.importmod('charencode')

isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast

_sysstr = pycompat.sysstr

if pycompat.ispy3:
    unichr = chr  # unichr does not exist on Python 3

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
_ignore = [
    unichr(int(x, 16)).encode("utf-8")
    for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
    b"206a 206b 206c 206d 206e 206f feff".split()
]
# verify the next function will work: hfsignoreclean() relies on every
# ignored codepoint's UTF-8 form starting with one of these two bytes
assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
63
64
def hfsignoreclean(s):
    # type: (bytes) -> bytes
    """Remove codepoints ignored by HFS+ from s.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # Every ignored codepoint's UTF-8 encoding starts with 0xe2 or 0xef
    # (asserted above), so strings lacking both bytes need no scan.
    if b"\xe2" not in s and b"\xef" not in s:
        return s
    for junk in _ignore:
        s = s.replace(junk, b'')
    return s
78
79
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
_nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
if not pycompat.ispy3:
    environ = os.environ  # re-exports
elif _nativeenviron:
    environ = os.environb  # re-exports
else:
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
    # and recreate it once encoding is settled
    environ = {
        k.encode('utf-8'): v.encode('utf-8')
        for k, v in os.environ.items()  # re-exports
    }

# map nonstandard locale names to names Python's codec machinery accepts
_encodingrewrites = {
    b'646': b'ascii',
    b'ANSI_X3.4-1968': b'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if pycompat.iswindows and not pycompat.ispy3:
    _encodingrewrites[b'cp65001'] = b'utf-8'

# resolve the local encoding: HGENCODING wins, then the locale's preferred
# encoding (rewritten through the table above), with 'ascii' as last resort
try:
    encoding = environ.get(b"HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
        encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    encoding = b'ascii'
encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
fallbackencoding = b'ISO-8859-1'
114
115
class localstr(bytes):
    """This class allows strings that are unmodified to be
    round-tripped to the local encoding and back

    The byte content of the object is the local-encoding form; the
    original UTF-8 form is cached on the instance as ``_utf8`` so it
    can be recovered losslessly (see tolocal() and fromlocal()).
    """

    def __new__(cls, u, l):
        # u: UTF-8 bytes; l: their local-encoding counterpart, which
        # becomes the actual byte content of this object
        s = bytes.__new__(cls, l)
        s._utf8 = u
        return s

    if pycompat.TYPE_CHECKING:
        # pseudo implementation to help pytype see localstr() constructor
        def __init__(self, u, l):
            # type: (bytes, bytes) -> None
            super(localstr, self).__init__(l)
            self._utf8 = u

    def __hash__(self):
        return hash(self._utf8)  # avoid collisions in local string space
134
135
class safelocalstr(bytes):
    """Tagged string denoting it was previously an internal UTF-8 string,
    and can be converted back to UTF-8 losslessly

    Unlike localstr, this carries no cached UTF-8 form and compares and
    hashes exactly like plain bytes, so it can be mixed freely with
    untagged byte strings as dict keys:

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """
145
146
def tolocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from internal UTF-8 to local encoding

    All internal strings should be UTF-8 but some repos before the
    implementation of locale support may contain latin1 or possibly
    other character sets. We attempt to decode everything strictly
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
    replace unknown characters.

    The localstr class is used to cache the known UTF-8 encoding of
    strings next to their local representation to allow lossless
    round-trip conversion back to UTF-8.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    if isasciistr(s):
        # ASCII is valid as-is in any supported encoding
        return s

    try:
        try:
            # make sure string is actually stored in UTF-8
            u = s.decode('UTF-8')
            if encoding == b'UTF-8':
                # fast path
                return s
            r = u.encode(_sysstr(encoding), "replace")
            if u == r.decode(_sysstr(encoding)):
                # r is a safe, non-lossy encoding of s
                return safelocalstr(r)
            # lossy conversion: tag the result with the original UTF-8 bytes
            return localstr(s, r)
        except UnicodeDecodeError:
            # we should only get here if we're looking at an ancient changeset
            try:
                u = s.decode(_sysstr(fallbackencoding))
                r = u.encode(_sysstr(encoding), "replace")
                if u == r.decode(_sysstr(encoding)):
                    # r is a safe, non-lossy encoding of s
                    return safelocalstr(r)
                # cache the UTF-8 re-encoding of the latin-1 interpretation
                return localstr(u.encode('UTF-8'), r)
            except UnicodeDecodeError:
                u = s.decode("utf-8", "replace")  # last ditch
                # can't round-trip
                return u.encode(_sysstr(encoding), "replace")
    except LookupError as k:
        # the encoding name is unknown to Python's codec registry
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
214
215
def fromlocal(s):
    # type: (bytes) -> bytes
    """
    Convert a string from the local character encoding to UTF-8

    Decoding honors the mode set by HGENCODINGMODE, which defaults to
    'strict': unknown characters abort with an error message. The other
    modes are 'replace' (substitute a special Unicode character) and
    'ignore' (drop the character).
    """

    # localstr caches the original UTF-8 bytes; hand them back untouched
    if isinstance(s, localstr):
        return s._utf8
    # pure ASCII is already valid UTF-8
    if isasciistr(s):
        return s

    try:
        return s.decode(_sysstr(encoding), _sysstr(encodingmode)).encode(
            "utf-8"
        )
    except UnicodeDecodeError as inst:
        # show a small window of context around the offending byte
        sub = s[max(0, inst.start - 10) : inst.start + 10]
        raise error.Abort(
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
        )
    except LookupError as k:
        # the encoding name is unknown to Python's codec registry
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
246
247
def unitolocal(u):
    # type: (Text) -> bytes
    """Convert a unicode string to a byte string in the local encoding."""
    utf8 = u.encode('utf-8')
    return tolocal(utf8)
252
253
def unifromlocal(s):
    # type: (bytes) -> Text
    """Convert a byte string in the local encoding to a unicode string."""
    utf8 = fromlocal(s)
    return utf8.decode('utf-8')
258
259
def unimethod(bytesfunc):
    # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
    """Create a proxy method that forwards __unicode__() and __str__() of
    Python 3 to __bytes__()"""

    def proxy(obj):
        # run the byte-producing method, then lift the result to unicode
        return unifromlocal(bytesfunc(obj))

    return proxy
269
270
# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
if pycompat.ispy3:
    strtolocal = unitolocal
    strfromlocal = unifromlocal
    strmethod = unimethod
else:

    # on Python 2 the native str type is already a byte string, so these
    # conversions are identity functions
    def strtolocal(s):
        # type: (str) -> bytes
        return s  # pytype: disable=bad-return-type

    def strfromlocal(s):
        # type: (bytes) -> str
        return s  # pytype: disable=bad-return-type

    strmethod = pycompat.identity
289
290
def lower(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware case-folding of local string s"""
    try:
        # fast path for pure-ASCII strings
        return asciilower(s)
    except UnicodeDecodeError:
        pass
    try:
        if isinstance(s, localstr):
            # decode from the cached UTF-8 form, which is lossless
            u = s._utf8.decode("utf-8")
        else:
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        lu = u.lower()
        if u == lu:
            return s  # preserve localstring
        return lu.encode(_sysstr(encoding))
    except UnicodeError:
        # also catches UnicodeEncodeError from lu.encode() above
        return s.lower()  # we don't know how to fold this except in ASCII
    except LookupError as k:
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
314
315
def upper(s):
    # type: (bytes) -> bytes
    """best-effort encoding-aware case-folding of local string s"""
    try:
        # asciiupper() handles the common pure-ASCII case quickly
        return asciiupper(s)
    except UnicodeDecodeError:
        # non-ASCII bytes present: take the slower, encoding-aware path
        return upperfallback(s)
323
324
def upperfallback(s):
    # type: (Any) -> Any
    """Encoding-aware upper-casing used when s is not pure ASCII."""
    try:
        if isinstance(s, localstr):
            # decode from the cached UTF-8 form, which is lossless
            decoded = s._utf8.decode("utf-8")
        else:
            decoded = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        folded = decoded.upper()
        if decoded == folded:
            # nothing changed; keep the original (possibly tagged) object
            return s
        return folded.encode(_sysstr(encoding))
    except UnicodeError:
        # fold ASCII only; we don't know how to case-map the rest
        return s.upper()
    except LookupError as k:
        raise error.Abort(
            pycompat.bytestr(k), hint=b"please check your locale settings"
        )
343
344
if not _nativeenviron:
    # now encoding and helper functions are available, recreate the environ
    # dict to be exported to other modules
    if pycompat.iswindows and pycompat.ispy3:

        class WindowsEnviron(dict):
            """`os.environ` normalizes environment variables to uppercase on windows"""

            def get(self, key, default=None):
                # mimic os.environ by uppercasing the key before lookup
                return super().get(upper(key), default)

        environ = WindowsEnviron()

    for k, v in os.environ.items():  # re-exports
        environ[tolocal(k.encode('utf-8'))] = tolocal(v.encode('utf-8'))
360
361
# matches a lowercase Windows drive-letter prefix such as b'c:'
DRIVE_RE = re.compile(b'^[a-z]:')

if pycompat.ispy3:
    # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
    # returns bytes.
    if pycompat.iswindows:
        # Python 3 on Windows issues a DeprecationWarning about using the bytes
        # API when os.getcwdb() is called.
        #
        # Additionally, py3.8+ uppercases the drive letter when calling
        # os.path.realpath(), which is used on ``repo.root``.  Since those
        # strings are compared in various places as simple strings, also call
        # realpath here.  See https://bugs.python.org/issue40368
        #
        # However this is not reliable, so lets explicitly make this drive
        # letter upper case.
        #
        # note: we should consider dropping realpath here since it seems to
        # change the semantic of `getcwd`.

        def getcwd():
            cwd = os.getcwd()  # re-exports
            cwd = os.path.realpath(cwd)
            cwd = strtolocal(cwd)
            if DRIVE_RE.match(cwd):
                cwd = cwd[0:1].upper() + cwd[1:]
            return cwd

    else:
        getcwd = os.getcwdb  # re-exports
else:
    getcwd = os.getcwd  # re-exports

# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
# The letters are east_asian_width() categories counted as two columns:
# W(ide), F(ullwidth), and optionally A(mbiguous).
_wide = _sysstr(
    environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
    and b"WFA"
    or b"WF"
)
401
402
def colwidth(s):
    # type: (bytes) -> int
    """Find the column width of a string for display in the local encoding"""
    # undecodable bytes are replaced rather than aborting the measurement
    decoded = s.decode(_sysstr(encoding), 'replace')
    return ucolwidth(decoded)
407
408
def ucolwidth(d):
    # type: (Text) -> int
    """Find the column width of a Unicode string for display"""
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        # no east-asian-width data available; count every char as one column
        return len(d)
    return sum(2 if eaw(c) in _wide else 1 for c in d)
416
417
def getcols(s, start, c):
    # type: (bytes, int, int) -> bytes
    """Use colwidth to find a c-column substring of s starting at byte
    index start

    Raises ValueError if no prefix of s[start:] occupies exactly c
    columns.
    """
    # A c-column substring needs at least c bytes, so start probing there.
    # The bound must be len(s) + 1 so that the candidate ending at the very
    # end of s (x == len(s)) is also tried; stopping at len(s) would skip
    # it and wrongly raise ValueError when the match is the whole remainder.
    for x in pycompat.xrange(start + c, len(s) + 1):
        t = s[start:x]
        if colwidth(t) == c:
            return t
    raise ValueError('substring not found')
427
428
def trim(s, width, ellipsis=b'', leftside=False):
    # type: (bytes, int, bytes, bool) -> bytes
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> bprint(trim(t, 5, leftside=True))
    \xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8))
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> bprint(trim(t, 8, leftside=True))
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        u = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        # undecodable input: fall back to trimming by raw byte count
        if len(s) <= width:  # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0:  # not enough room even for ellipsis
            return ellipsis[: width + len(ellipsis)]
        if leftside:
            return ellipsis + s[-width:]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width:  # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0:  # not enough room even for ellipsis
        return ellipsis[: width + len(ellipsis)]

    if leftside:
        uslice = lambda i: u[i:]
        concat = lambda s: ellipsis + s
    else:
        uslice = lambda i: u[:-i]
        concat = lambda s: s + ellipsis
    # drop characters one at a time from the trimmed side until the rest
    # fits in the available columns
    for i in pycompat.xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(_sysstr(encoding)))
    return ellipsis  # not enough room for multi-column characters
525
526
class normcasespecs(object):
    """Describes what a platform's normcase() does to ASCII strings.

    Each platform declares one of these values, and the declaration must
    be consistent with what its normcase() implementation actually does:

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    This should be kept in sync with normcase_spec in util.h."""

    lower = -1
    upper = 1
    other = 0
542
543
def jsonescape(s, paranoid=False):
    # type: (Any, Any) -> Any
    """returns a string suitable for JSON

    JSON is problematic for us because it doesn't support non-Unicode
    bytes. To deal with this, we take the following approach:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    """

    # normalize to UTF-8b first so arbitrary bytes survive the escaping
    u8chars = toutf8b(s)
    try:
        # fast path; raises ValueError for input it cannot handle
        return _jsonescapeu8fast(u8chars, paranoid)
    except ValueError:
        pass
    # slow pure-Python fallback handles everything else
    return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
596
597
# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
# bytes are mapped to that range.
if pycompat.ispy3:
    # 'surrogatepass' lets lone surrogates round-trip through UTF-8
    _utf8strict = r'surrogatepass'
else:
    _utf8strict = r'strict'

# byte length of a UTF-8 sequence, indexed by the high nibble of its first
# byte; 0 means a single-byte ASCII character (nibbles 8-11 — continuation
# bytes, invalid as a lead — map to 1 so the validation decode fails fast)
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
606
607
def getutf8char(s, pos):
    # type: (bytes, int) -> bytes
    """get the next full utf-8 character in the given string, starting at pos

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    """

    # the high nibble of the lead byte determines the sequence length
    width = _utf8len[ord(s[pos : pos + 1]) >> 4]
    if not width:
        # single-byte ASCII character; no validation needed
        return s[pos : pos + 1]

    char = s[pos : pos + width]
    # decoding validates the sequence (raises UnicodeError when malformed)
    char.decode("utf-8", _utf8strict)
    return char
625
626
def toutf8b(s):
    # type: (bytes) -> bytes
    """convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    """

    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    elif isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legacy encoding, which
        # shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    elif isasciistr(s):
        return s
    if b"\xed" not in s:
        # fast path: no possible U+DCxx escapes present; if the whole
        # string is valid UTF-8 it passes through unmodified
        try:
            s.decode('utf-8', _utf8strict)
            return s
        except UnicodeDecodeError:
            pass

    # slow path: walk the string character by character, escaping invalid
    # bytes (and re-escaping pre-existing U+DCxx characters) as we go
    s = pycompat.bytestr(s)
    r = b""
    pos = 0
    l = len(s)
    while pos < l:
        try:
            c = getutf8char(s, pos)
            if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
                # have to re-escape existing U+DCxx characters
                c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
                pos += 1
            else:
                pos += len(c)
        except UnicodeDecodeError:
            # invalid byte: map it into the U+DCxx surrogate range
            c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
            pos += 1
        r += c
    return r
694
695
def fromutf8b(s):
    # type: (bytes) -> bytes
    """Given a UTF-8b string, return a local, possibly-binary string.

    Escaped U+DCxx characters are unescaped to recover the original
    binary bytes. This is a round-trip process for strings like
    filenames, but metadata that was passed through tolocal will remain
    in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    """

    if isasciistr(s):
        return s
    # fast path - look for uDxxx prefixes in s
    if b"\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    r = b""
    pos = 0
    l = len(s)
    while pos < l:
        c = getutf8char(s, pos)
        pos += len(c)
        # unescape U+DCxx characters
        if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
            c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
        r += c
    return r
743