1try:
2    import unicodedata2 as unicodedata
3except ImportError:
4    import unicodedata  # type: ignore[no-redef]
5
6import importlib
7import logging
8from codecs import IncrementalDecoder
9from encodings.aliases import aliases
10from functools import lru_cache
11from re import findall
12from typing import List, Optional, Set, Tuple, Union
13
14from _multibytecodec import MultibyteIncrementalDecoder  # type: ignore
15
16from .constant import (
17    ENCODING_MARKS,
18    IANA_SUPPORTED_SIMILAR,
19    RE_POSSIBLE_ENCODING_INDICATION,
20    UNICODE_RANGES_COMBINED,
21    UNICODE_SECONDARY_RANGE_KEYWORD,
22    UTF8_MAXIMAL_ALLOCATION,
23)
24
25
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_accentuated(character: str) -> bool:
    """
    Return True when the character's official Unicode name indicates a common
    accent/diacritic (grave, acute, cedilla, diaeresis, circumflex, tilde,
    macron, ring above).
    """
    try:
        description = unicodedata.name(character)  # type: str
    except ValueError:
        # Unassigned/unnamed code points (e.g. some controls) have no name.
        return False
    # "WITH MACRON" and "WITH RING ABOVE" are included so characters such as
    # 'ā' and 'å' are correctly detected as accentuated.
    return any(
        mark in description
        for mark in (
            "WITH GRAVE",
            "WITH ACUTE",
            "WITH CEDILLA",
            "WITH DIAERESIS",
            "WITH CIRCUMFLEX",
            "WITH TILDE",
            "WITH MACRON",
            "WITH RING ABOVE",
        )
    )
40
41
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def remove_accent(character: str) -> str:
    """
    Return the base character of an accented one using its Unicode
    decomposition (e.g. 'é' -> 'e'). Characters without a decomposition are
    returned unchanged.
    """
    decomposed = unicodedata.decomposition(character)  # type: str
    if not decomposed:
        return character

    codes = decomposed.split(" ")  # type: List[str]

    # Compatibility decompositions start with a formatting tag such as
    # "<compat>" or "<noBreak>" (e.g. 'ﬁ' -> '<compat> 0066 0069');
    # int(tag, 16) would raise ValueError, so skip tag tokens and use the
    # first real code point.
    for code in codes:
        if code.startswith("<"):
            continue
        return chr(int(code, 16))

    return character
51
52
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def unicode_range(character: str) -> Optional[str]:
    """
    Retrieve the Unicode range official name from a single character.
    Returns None when the character belongs to no known range.
    """
    code_point = ord(character)  # type: int

    return next(
        (
            name
            for name, code_point_range in UNICODE_RANGES_COMBINED.items()
            if code_point in code_point_range
        ),
        None,
    )
65
66
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_latin(character: str) -> bool:
    """Return True when the character's Unicode name mentions LATIN."""
    try:
        return "LATIN" in unicodedata.name(character)
    except ValueError:
        # Unnamed code point.
        return False
74
75
def is_ascii(character: str) -> bool:
    """Return True when the given string is pure ASCII (encodable as such)."""
    try:
        character.encode("ascii")
    except UnicodeEncodeError:
        return False
    else:
        return True
82
83
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
    """
    Return True when the character is punctuation, either by Unicode
    category ("P*") or because its Unicode range name contains "Punctuation".
    """
    if "P" in unicodedata.category(character):
        return True

    detected_range = unicode_range(character)  # type: Optional[str]

    return detected_range is not None and "Punctuation" in detected_range
97
98
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_symbol(character: str) -> bool:
    """
    Return True when the character is a symbol or number sign, either by
    Unicode category ("S*"/"N*") or because its range name contains "Forms".
    """
    category = unicodedata.category(character)  # type: str

    if "S" in category or "N" in category:
        return True

    detected_range = unicode_range(character)  # type: Optional[str]

    return detected_range is not None and "Forms" in detected_range
112
113
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
    """Return True when the character lies in an "Emoticons" Unicode range."""
    detected_range = unicode_range(character)  # type: Optional[str]

    return detected_range is not None and "Emoticons" in detected_range
122
123
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
    """
    Return True for word/field separators: whitespace, a small set of ASCII
    delimiters, or any Unicode separator category ("Z*").
    """
    if character.isspace():
        return True

    if character in {"|", "+", ",", ";", "<", ">"}:
        return True

    return "Z" in unicodedata.category(character)
132
133
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
    """
    Return True when the character carries case information: islower() and
    isupper() disagree for cased letters, and agree (both False) otherwise.
    """
    lowered = character.islower()  # type: bool
    uppered = character.isupper()  # type: bool

    return lowered is not uppered
137
138
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_private_use_only(character: str) -> bool:
    """
    Return True when the character belongs to the Unicode Private Use
    category ("Co"). Cached like every other single-character predicate in
    this module, since the same code points are probed repeatedly.
    """
    character_category = unicodedata.category(character)  # type: str

    return character_category == "Co"
143
144
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
    """Return True when the character's Unicode name mentions CJK."""
    try:
        return "CJK" in unicodedata.name(character)
    except ValueError:
        # Unnamed code point.
        return False
153
154
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
    """Return True when the character's Unicode name mentions HIRAGANA."""
    try:
        return "HIRAGANA" in unicodedata.name(character)
    except ValueError:
        # Unnamed code point.
        return False
163
164
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
    """Return True when the character's Unicode name mentions KATAKANA."""
    try:
        return "KATAKANA" in unicodedata.name(character)
    except ValueError:
        # Unnamed code point.
        return False
173
174
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
    """Return True when the character's Unicode name mentions HANGUL."""
    try:
        return "HANGUL" in unicodedata.name(character)
    except ValueError:
        # Unnamed code point.
        return False
183
184
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
    """Return True when the character's Unicode name mentions THAI."""
    try:
        return "THAI" in unicodedata.name(character)
    except ValueError:
        # Unnamed code point.
        return False
193
194
@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
    """Return True when the range name contains any "secondary" keyword."""
    for keyword in UNICODE_SECONDARY_RANGE_KEYWORD:
        if keyword in range_name:
            return True

    return False
198
199
def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]:
    """
    Extract using ASCII-only decoder any specified encoding in the first n-bytes.
    Returns the IANA-normalized name of the first declared encoding that is a
    known Python codec alias, or None.
    """
    if not isinstance(sequence, bytes):
        raise TypeError

    zone_end = min(len(sequence), search_zone)  # type: int
    probe = sequence[:zone_end].decode("ascii", errors="ignore")  # type: str

    candidates = findall(RE_POSSIBLE_ENCODING_INDICATION, probe)  # type: List[str]

    for candidate in candidates:
        normalized = candidate.lower().replace("-", "_")

        for encoding_alias, encoding_iana in aliases.items():
            # Match either side of the alias table.
            if normalized in (encoding_alias, encoding_iana):
                return encoding_iana

    return None
227
228
@lru_cache(maxsize=128)
def is_multi_byte_encoding(name: str) -> bool:
    """
    Verify is a specific encoding is a multi byte one based on it IANA name.
    UTF variants are recognized directly; other codecs are probed through
    their IncrementalDecoder class.
    """
    if name in {
        "utf_8",
        "utf_8_sig",
        "utf_16",
        "utf_16_be",
        "utf_16_le",
        "utf_32",
        "utf_32_le",
        "utf_32_be",
        "utf_7",
    }:
        return True

    decoder = importlib.import_module("encodings.{}".format(name)).IncrementalDecoder  # type: ignore

    return issubclass(decoder, MultibyteIncrementalDecoder)
248
249
def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
    """
    Identify and extract SIG/BOM in given sequence.
    Returns (iana_encoding, mark_bytes) on a match, (None, b"") otherwise.
    """
    for iana_encoding, marks in ENCODING_MARKS.items():
        # A single mark may be stored bare; normalize to a list.
        candidate_marks = [marks] if isinstance(marks, bytes) else marks  # type: List[bytes]

        for mark in candidate_marks:
            if sequence.startswith(mark):
                return iana_encoding, mark

    return None, b""
266
267
def should_strip_sig_or_bom(iana_encoding: str) -> bool:
    """
    Return True unless the codec consumes its own BOM ("utf_16"/"utf_32").
    """
    return iana_encoding != "utf_16" and iana_encoding != "utf_32"
270
271
def iana_name(cp_name: str, strict: bool = True) -> str:
    """
    Normalize a codec name to its IANA form via the stdlib alias table.
    When no match is found: raise ValueError if strict, otherwise return the
    lowercased/underscored input unchanged.
    """
    normalized = cp_name.lower().replace("-", "_")

    for encoding_alias, encoding_iana in aliases.items():
        if normalized == encoding_alias or normalized == encoding_iana:
            return encoding_iana

    if not strict:
        return normalized

    raise ValueError("Unable to retrieve IANA for '{}'".format(normalized))
283
284
def range_scan(decoded_sequence: str) -> List[str]:
    """Return the distinct Unicode range names present in the given string."""
    discovered = set()  # type: Set[str]

    for character in decoded_sequence:
        detected_range = unicode_range(character)  # type: Optional[str]

        if detected_range is not None:
            discovered.add(detected_range)

    return list(discovered)
297
298
def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
    """
    Ratio of single byte values that two single-byte codecs decode to the
    same character (errors ignored). Any multi-byte codec yields 0.0.
    """
    if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
        return 0.0

    module_a = importlib.import_module("encodings.{}".format(iana_name_a))  # type: ignore
    module_b = importlib.import_module("encodings.{}".format(iana_name_b))  # type: ignore

    id_a = module_a.IncrementalDecoder(errors="ignore")  # type: IncrementalDecoder
    id_b = module_b.IncrementalDecoder(errors="ignore")  # type: IncrementalDecoder

    matches = 0  # type: int

    # NOTE(review): 255 byte values (0..254) are probed but the ratio divides
    # by 254. IANA_SUPPORTED_SIMILAR was generated with this exact formula,
    # so it is preserved as-is — confirm before "fixing" the apparent
    # off-by-one.
    for byte_value in range(255):
        probe = bytes([byte_value])  # type: bytes
        if id_a.decode(probe) == id_b.decode(probe):
            matches += 1

    return matches / 254
318
319
def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
    """
    Determine if two code page are at least 80% similar. IANA_SUPPORTED_SIMILAR dict was generated using
    the function cp_similarity.
    """
    if iana_name_a not in IANA_SUPPORTED_SIMILAR:
        return False

    return iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
329
330
def set_logging_handler(
    name: str = "charset_normalizer",
    level: int = logging.INFO,
    format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:
    """
    Attach a StreamHandler using the given format to the named logger and set
    its level. Note: each call adds a new handler to that logger.
    """
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter(format_string))

    target_logger = logging.getLogger(name)
    target_logger.setLevel(level)
    target_logger.addHandler(handler)
343