try:
    import unicodedata2 as unicodedata
except ImportError:
    import unicodedata  # type: ignore[no-redef]

import importlib
import logging
from codecs import IncrementalDecoder
from encodings.aliases import aliases
from functools import lru_cache
from re import findall
from typing import List, Optional, Set, Tuple, Union

from _multibytecodec import MultibyteIncrementalDecoder  # type: ignore

from .constant import (
    ENCODING_MARKS,
    IANA_SUPPORTED_SIMILAR,
    RE_POSSIBLE_ENCODING_INDICATION,
    UNICODE_RANGES_COMBINED,
    UNICODE_SECONDARY_RANGE_KEYWORD,
    UTF8_MAXIMAL_ALLOCATION,
)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_accentuated(character: str) -> bool:
    """
    Determine if a single character carries a common accent mark,
    based on its official Unicode name.
    """
    try:
        description = unicodedata.name(character)  # type: str
    except ValueError:
        # Unassigned/unnamed code points cannot be accentuated.
        return False
    return (
        "WITH GRAVE" in description
        or "WITH ACUTE" in description
        or "WITH CEDILLA" in description
        or "WITH DIAERESIS" in description
        or "WITH CIRCUMFLEX" in description
        or "WITH TILDE" in description
    )


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def remove_accent(character: str) -> str:
    """
    Return the base character with any accent stripped, using the first
    code point of the Unicode canonical decomposition. If the character
    does not decompose, it is returned unchanged.
    """
    decomposed = unicodedata.decomposition(character)  # type: str
    if not decomposed:
        return character

    codes = decomposed.split(" ")  # type: List[str]

    # First element of the decomposition is the base character (hex code point).
    return chr(int(codes[0], 16))


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def unicode_range(character: str) -> Optional[str]:
    """
    Retrieve the Unicode range official name from a single character.
    Returns None when the character falls in no known range.
    """
    character_ord = ord(character)  # type: int

    for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
        if character_ord in ord_range:
            return range_name

    return None


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_latin(character: str) -> bool:
    """Determine if a character belongs to the Latin script (per its Unicode name)."""
    try:
        description = unicodedata.name(character)  # type: str
    except ValueError:
        return False
    return "LATIN" in description


def is_ascii(character: str) -> bool:
    """Determine if a character is plain ASCII (encodable with the 'ascii' codec)."""
    try:
        character.encode("ascii")
    except UnicodeEncodeError:
        return False
    return True


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
    """
    Determine if a character is punctuation, either by its Unicode
    category ("P*") or because it lives in a "Punctuation" range.
    """
    character_category = unicodedata.category(character)  # type: str

    if "P" in character_category:
        return True

    character_range = unicode_range(character)  # type: Optional[str]

    if character_range is None:
        return False

    return "Punctuation" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_symbol(character: str) -> bool:
    """
    Determine if a character is a symbol: Unicode category "S*" or "N*",
    or membership in a "Forms" range.
    """
    character_category = unicodedata.category(character)  # type: str

    if "S" in character_category or "N" in character_category:
        return True

    character_range = unicode_range(character)  # type: Optional[str]

    if character_range is None:
        return False

    return "Forms" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
    """Determine if a character belongs to an "Emoticons" Unicode range."""
    character_range = unicode_range(character)  # type: Optional[str]

    if character_range is None:
        return False

    return "Emoticons" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
    """
    Determine if a character acts as a separator: whitespace, a small set
    of common delimiter characters, or Unicode category "Z*".
    """
    if character.isspace() or character in {"|", "+", ",", ";", "<", ">"}:
        return True

    character_category = unicodedata.category(character)  # type: str

    return "Z" in character_category


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
    """
    Determine if a character has distinct lower/upper case forms
    (islower() and isupper() disagree).
    """
    return character.islower() != character.isupper()


def is_private_use_only(character: str) -> bool:
    """Determine if a character sits in a Private Use Area (category "Co")."""
    character_category = unicodedata.category(character)  # type: str

    return character_category == "Co"


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
    """Determine if a character is CJK, based on its official Unicode name."""
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "CJK" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
    """Determine if a character is Hiragana, based on its official Unicode name."""
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "HIRAGANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
    """Determine if a character is Katakana, based on its official Unicode name."""
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "KATAKANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
    """Determine if a character is Hangul, based on its official Unicode name."""
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "HANGUL" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
    """Determine if a character is Thai, based on its official Unicode name."""
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "THAI" in character_name


@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
    """Determine if a Unicode range name matches any "secondary" keyword."""
    return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)


def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]:
    """
    Extract using ASCII-only decoder any specified encoding in the first n-bytes.

    Returns the matching IANA encoding name, or None when no declared
    encoding is found. Raises TypeError if `sequence` is not bytes.
    """
    if not isinstance(sequence, bytes):
        raise TypeError(
            "Expected object of type bytes, got {}".format(type(sequence))
        )

    seq_len = len(sequence)  # type: int

    results = findall(
        RE_POSSIBLE_ENCODING_INDICATION,
        sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
    )  # type: List[str]

    if len(results) == 0:
        return None

    for specified_encoding in results:
        # Normalize to the codecs-alias convention (lowercase, underscores).
        specified_encoding = specified_encoding.lower().replace("-", "_")

        for encoding_alias, encoding_iana in aliases.items():
            if specified_encoding in (encoding_alias, encoding_iana):
                return encoding_iana

    return None


@lru_cache(maxsize=128)
def is_multi_byte_encoding(name: str) -> bool:
    """
    Verify whether a specific encoding is a multi-byte one based on its IANA name.
    """
    return name in {
        "utf_8",
        "utf_8_sig",
        "utf_16",
        "utf_16_be",
        "utf_16_le",
        "utf_32",
        "utf_32_le",
        "utf_32_be",
        "utf_7",
    } or issubclass(
        importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,  # type: ignore
        MultibyteIncrementalDecoder,
    )


def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
    """
    Identify and extract SIG/BOM in given sequence.

    Returns a tuple of (IANA encoding name, matched mark bytes), or
    (None, b"") when no known mark prefixes the sequence.
    """

    for iana_encoding in ENCODING_MARKS:
        marks = ENCODING_MARKS[iana_encoding]  # type: Union[bytes, List[bytes]]

        # An encoding may declare one mark or several alternatives.
        if isinstance(marks, bytes):
            marks = [marks]

        for mark in marks:
            if sequence.startswith(mark):
                return iana_encoding, mark

    return None, b""


def should_strip_sig_or_bom(iana_encoding: str) -> bool:
    """
    Determine whether the SIG/BOM should be stripped before decoding.
    utf_16/utf_32 codecs consume the BOM themselves, so it is kept.
    """
    return iana_encoding not in {"utf_16", "utf_32"}


def iana_name(cp_name: str, strict: bool = True) -> str:
    """
    Resolve a code-page name (alias or canonical) to its IANA name.

    Raises ValueError in strict mode when the name is unknown; otherwise
    returns the normalized input unchanged.
    """
    cp_name = cp_name.lower().replace("-", "_")

    for encoding_alias, encoding_iana in aliases.items():
        if cp_name in [encoding_alias, encoding_iana]:
            return encoding_iana

    if strict:
        raise ValueError("Unable to retrieve IANA for '{}'".format(cp_name))

    return cp_name


def range_scan(decoded_sequence: str) -> List[str]:
    """Return the distinct Unicode range names covered by a decoded string."""
    ranges = set()  # type: Set[str]

    for character in decoded_sequence:
        character_range = unicode_range(character)  # type: Optional[str]

        if character_range is None:
            continue

        ranges.add(character_range)

    return list(ranges)


def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
    """
    Compute the fraction of single-byte values (0-254) that two single-byte
    code pages decode identically. Multi-byte encodings always score 0.0.
    """
    if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
        return 0.0

    decoder_a = importlib.import_module("encodings.{}".format(iana_name_a)).IncrementalDecoder  # type: ignore
    decoder_b = importlib.import_module("encodings.{}".format(iana_name_b)).IncrementalDecoder  # type: ignore

    id_a = decoder_a(errors="ignore")  # type: IncrementalDecoder
    id_b = decoder_b(errors="ignore")  # type: IncrementalDecoder

    character_match_count = 0  # type: int

    # NOTE(review): 255 byte values are compared (range(255)) but the ratio
    # divides by 254 — confirm the divisor is intended; kept as-is because
    # IANA_SUPPORTED_SIMILAR was generated with this exact formula.
    for i in range(255):
        to_be_decoded = bytes([i])  # type: bytes
        if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
            character_match_count += 1

    return character_match_count / 254


def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
    """
    Determine if two code page are at least 80% similar. IANA_SUPPORTED_SIMILAR dict was generated using
    the function cp_similarity.
    """
    return (
        iana_name_a in IANA_SUPPORTED_SIMILAR
        and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
    )


def set_logging_handler(
    name: str = "charset_normalizer",
    level: int = logging.INFO,
    format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:
    """
    Attach a stream handler with the given format to the named logger and
    set its level.

    NOTE(review): each call appends a new handler — calling this repeatedly
    duplicates log output; intended to be invoked once.
    """
    logger = logging.getLogger(name)
    logger.setLevel(level)

    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter(format_string))
    logger.addHandler(handler)