1#!/usr/bin/python3 -OO
2# Copyright 2007-2021 The SABnzbd-Team <team@sabnzbd.org>
3#
4# This program is free software; you can redistribute it and/or
5# modify it under the terms of the GNU General Public License
6# as published by the Free Software Foundation; either version 2
7# of the License, or (at your option) any later version.
8#
9# This program is distributed in the hope that it will be useful,
10# but WITHOUT ANY WARRANTY; without even the implied warranty of
11# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12# GNU General Public License for more details.
13#
14# You should have received a copy of the GNU General Public License
15# along with this program; if not, write to the Free Software
16# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
17
18"""
19sabnzbd.encoding - Unicode/byte translation functions
20"""
21
22import locale
23import chardet
24from xml.sax.saxutils import escape
25from typing import AnyStr
26
# Preferred text encoding reported by the locale module, captured once at import time
CODEPAGE = locale.getpreferredencoding()
28
29
def utob(str_in: AnyStr) -> bytes:
    """Return the input as bytes: bytes pass through, anything else is UTF-8 encoded"""
    return str_in if isinstance(str_in, bytes) else str_in.encode("utf-8")
35
36
def ubtou(str_in: AnyStr) -> str:
    """Return the input as str: bytes are decoded as UTF-8, anything else passes through

    Raises UnicodeDecodeError when the bytes are not valid UTF-8.
    """
    if isinstance(str_in, bytes):
        return str_in.decode("utf-8")
    return str_in
42
43
def platform_btou(str_in: AnyStr) -> str:
    """Turn bytes into a Unicode string; non-bytes input is returned unchanged.

    Bytes are decoded as UTF-8 first. If that fails, they are decoded with the
    locale encoding (CODEPAGE), substituting undecodable bytes, and every "?"
    in the result is turned into "!".
    NOTE: Used for Popen because the universal_newlines/text parameter doesn't
    always work! We cannot use the encoding-parameter because it's Python 3.7+
    """
    # Nothing to decode if we did not get bytes
    if not isinstance(str_in, bytes):
        return str_in

    try:
        return ubtou(str_in)
    except UnicodeDecodeError:
        # Not valid UTF-8, fall back to the locale encoding
        # NOTE(review): decoding with errors="replace" inserts U+FFFD, not "?",
        # so the swap below hits question marks that were really in the data - confirm intent
        locale_decoded = str_in.decode(CODEPAGE, errors="replace")
        return locale_decoded.replace("?", "!")
56
57
def correct_unknown_encoding(str_or_bytes_in: AnyStr) -> str:
    """Repair a name whose encoding is unknown and return it as a string.

    Files created on Windows but unpacked/repaired on Linux can end up
    with invalid filenames. The input is brought back to bytes and then
    decoded again: first as UTF-8, then as ISO-8859-1, and as a last
    resort with whatever encoding the chardet package detects.
    """
    # Strings go back to bytes first, keeping surrogate-escaped bytes intact
    if isinstance(str_or_bytes_in, bytes):
        raw_bytes = str_or_bytes_in
    else:
        raw_bytes = str_or_bytes_in.encode("utf-8", "surrogateescape")

    # First attempt: plain UTF-8
    try:
        return ubtou(raw_bytes)
    except UnicodeDecodeError:
        pass

    # Second attempt: 8-bit encoding, in case it came from Windows
    # NOTE(review): ISO-8859-1 maps every byte value, so this decode is not
    # expected to fail and the chardet fallback looks unreachable - confirm
    try:
        return raw_bytes.decode("ISO-8859-1")
    except ValueError:
        # Last resort: let the slow chardet package guess the encoding
        detected = chardet.detect(raw_bytes)
        return raw_bytes.decode(detected["encoding"])
78
79
def xml_name(p):
    """Prepare name for use in HTML/XML context: stringify it and escape &, < and >"""
    as_text = str(p)
    return escape(as_text)
83