1# 2# Copyright (c), 2016-2020, SISSA (International School for Advanced Studies). 3# All rights reserved. 4# This file is distributed under the terms of the MIT License. 5# See the file 'LICENSE' in the root directory of the present 6# distribution, or http://opensource.org/licenses/MIT. 7# 8# @author Davide Brunato <brunato@sissa.it> 9# 10""" 11This module defines Unicode character categories and blocks. 12""" 13from sys import maxunicode 14from typing import cast, Iterable, Iterator, List, MutableSet, Union, Optional 15 16from .unicode_categories import RAW_UNICODE_CATEGORIES 17from .codepoints import CodePoint, code_point_order, code_point_repr, \ 18 iter_code_points, get_code_point_range 19 20CodePointsArgType = Union[str, 'UnicodeSubset', List[CodePoint], Iterable[CodePoint]] 21 22 23class RegexError(Exception): 24 """ 25 Error in a regular expression or in a character class specification. 26 This exception is derived from `Exception` base class and is raised 27 only by the regex subpackage. 28 """ 29 30 31def iterparse_character_subset(s: str, expand_ranges: bool = False) -> Iterator[CodePoint]: 32 """ 33 Parses a regex character subset, generating a sequence of code points 34 and code points ranges. An unescaped hyphen (-) that is not at the 35 start or at the and is interpreted as range specifier. 36 37 :param s: a string representing the character subset. 38 :param expand_ranges: if set to `True` then expands character ranges. 39 :return: yields integers or couples of integers. 40 """ 41 escaped = False 42 on_range = False 43 char = '' 44 length = len(s) 45 subset_index_iterator = iter(range(len(s))) 46 for k in subset_index_iterator: 47 if k == 0: 48 char = s[0] 49 if char == '\\': 50 escaped = True 51 elif char in r'[]' and length > 1: 52 raise RegexError("bad character %r at position 0" % char) 53 elif expand_ranges: 54 yield ord(char) 55 elif length <= 2 or s[1] != '-': 56 yield ord(char) 57 elif s[k] == '-': 58 if escaped or (k == length - 1): 59 char = s[k] 60 yield ord(char) 61 escaped = False 62 elif on_range: 63 char = s[k] 64 yield ord(char) 65 on_range = False 66 else: 67 # Parse character range 68 on_range = True 69 k = next(subset_index_iterator) 70 end_char = s[k] 71 if end_char == '\\' and (k < length - 1): 72 if s[k + 1] in r'-|.^?*+{}()[]': 73 k = next(subset_index_iterator) 74 end_char = s[k] 75 elif s[k + 1] in r'sSdDiIcCwWpP': 76 msg = "bad character range '%s-\\%s' at position %d: %r" 77 raise RegexError(msg % (char, s[k + 1], k - 2, s)) 78 79 if ord(char) > ord(end_char): 80 msg = "bad character range '%s-%s' at position %d: %r" 81 raise RegexError(msg % (char, end_char, k - 2, s)) 82 elif expand_ranges: 83 yield from range(ord(char) + 1, ord(end_char) + 1) 84 else: 85 yield ord(char), ord(end_char) + 1 86 87 elif s[k] in r'|.^?*+{}()': 88 if escaped: 89 escaped = False 90 on_range = False 91 char = s[k] 92 yield ord(char) 93 elif s[k] in r'[]': 94 if not escaped and length > 1: 95 raise RegexError("bad character %r at position %d" % (s[k], k)) 96 escaped = on_range = False 97 char = s[k] 98 if k >= length - 2 or s[k + 1] != '-': 99 yield ord(char) 100 elif s[k] == '\\': 101 if escaped: 102 escaped = on_range = False 103 char = '\\' 104 yield ord(char) 105 else: 106 escaped = True 107 else: 108 if escaped: 109 escaped = False 110 yield ord('\\') 111 on_range = False 112 char = s[k] 113 if k >= length - 2 or s[k + 1] != '-': 114 yield ord(char) 115 if escaped: 116 yield ord('\\') 117 118 119class UnicodeSubset(MutableSet[CodePoint]): 120 """ 121 Represents a subset of Unicode code points, implemented with an ordered list of 122 integer values and ranges. Codepoints can be added or discarded using sequences 123 of integer values and ranges or with strings equivalent to regex character set. 124 125 :param codepoints: a sequence of integer values and ranges, another UnicodeSubset \ 126 instance ora a string equivalent of a regex character set. 127 """ 128 __slots__ = '_codepoints', 129 _codepoints: List[CodePoint] 130 131 def __init__(self, codepoints: Optional[CodePointsArgType] = None) -> None: 132 if not codepoints: 133 self._codepoints = list() 134 elif isinstance(codepoints, list): 135 self._codepoints = sorted(codepoints, key=code_point_order) 136 elif isinstance(codepoints, UnicodeSubset): 137 self._codepoints = codepoints.codepoints.copy() 138 else: 139 self._codepoints = list() 140 self.update(codepoints) 141 142 @property 143 def codepoints(self) -> List[CodePoint]: 144 return self._codepoints 145 146 def __repr__(self) -> str: 147 return '%s(%r)' % (self.__class__.__name__, str(self)) 148 149 def __str__(self) -> str: 150 return ''.join(code_point_repr(cp) for cp in self._codepoints) 151 152 def copy(self) -> 'UnicodeSubset': 153 return self.__copy__() 154 155 def __copy__(self) -> 'UnicodeSubset': 156 return UnicodeSubset(self._codepoints) 157 158 def __reversed__(self) -> Iterator[int]: 159 for item in reversed(self._codepoints): 160 if isinstance(item, int): 161 yield item 162 else: 163 yield from reversed(range(item[0], item[1])) 164 165 def complement(self) -> Iterator[CodePoint]: 166 last_cp = 0 167 for cp in self._codepoints: 168 if isinstance(cp, int): 169 cp = cp, cp + 1 170 171 diff = cp[0] - last_cp 172 if diff > 2: 173 yield last_cp, cp[0] 174 elif diff == 2: 175 yield last_cp 176 yield last_cp + 1 177 elif diff == 1: 178 yield last_cp 179 elif diff: 180 raise ValueError("unordered code points found in {!r}".format(self)) 181 last_cp = cp[1] 182 183 if last_cp < maxunicode: 184 yield last_cp, maxunicode + 1 185 elif last_cp == maxunicode: 186 yield maxunicode 187 188 def iter_characters(self) -> Iterator[str]: 189 return map(chr, self.__iter__()) 190 191 # 192 # MutableSet's abstract methods implementation 193 def __contains__(self, value: object) -> bool: 194 if not isinstance(value, int): 195 try: 196 value = ord(value) # type: ignore[arg-type] 197 except TypeError: 198 return False 199 200 for cp in self._codepoints: 201 if not isinstance(cp, int): 202 if cp[0] > value: 203 return False 204 elif cp[1] <= value: 205 continue 206 else: 207 return True 208 elif cp > value: 209 return False 210 elif cp == value: 211 return True 212 return False 213 214 def __iter__(self) -> Iterator[int]: 215 for cp in self._codepoints: 216 if isinstance(cp, int): 217 yield cp 218 else: 219 yield from range(*cp) 220 221 def __len__(self) -> int: 222 k = 0 223 for _ in self: 224 k += 1 225 return k 226 227 def update(self, *others: Union[str, Iterable[CodePoint]]) -> None: 228 for value in others: 229 if isinstance(value, str): 230 for cp in iter_code_points(iterparse_character_subset(value), reverse=True): 231 self.add(cp) 232 else: 233 for cp in iter_code_points(value, reverse=True): 234 self.add(cp) 235 236 def add(self, value: CodePoint) -> None: 237 try: 238 start_value, end_value = get_code_point_range(value) # type: ignore[misc] 239 except TypeError: 240 raise ValueError("{!r} is not a Unicode code point value/range".format(value)) 241 242 code_points = self._codepoints 243 last_index = len(code_points) - 1 244 for k, cp in enumerate(code_points): 245 if isinstance(cp, int): 246 cp = cp, cp + 1 247 248 if end_value < cp[0]: 249 code_points.insert(k, value) 250 elif start_value > cp[1]: 251 continue 252 elif end_value > cp[1]: 253 if k == last_index: 254 code_points[k] = min(cp[0], start_value), end_value 255 else: 256 next_cp = code_points[k + 1] 257 higher_bound = next_cp if isinstance(next_cp, int) else next_cp[0] 258 if end_value <= higher_bound: 259 code_points[k] = min(cp[0], start_value), end_value 260 else: 261 code_points[k] = min(cp[0], start_value), higher_bound 262 start_value = higher_bound 263 continue 264 elif start_value < cp[0]: 265 code_points[k] = start_value, cp[1] 266 break 267 else: 268 self._codepoints.append(value) 269 270 def difference_update(self, *others: Union[str, Iterable[CodePoint]]) -> None: 271 for value in others: 272 if isinstance(value, str): 273 for cp in iter_code_points(iterparse_character_subset(value), reverse=True): 274 self.discard(cp) 275 else: 276 for cp in iter_code_points(value, reverse=True): 277 self.discard(cp) 278 279 def discard(self, value: CodePoint) -> None: 280 try: 281 start_cp, end_cp = get_code_point_range(value) # type: ignore[misc] 282 except TypeError: 283 raise ValueError("{!r} is not a Unicode code point value/range".format(value)) 284 285 code_points = self._codepoints 286 for k in reversed(range(len(code_points))): 287 cp = code_points[k] 288 if isinstance(cp, int): 289 cp = cp, cp + 1 290 291 if start_cp >= cp[1]: 292 break 293 elif end_cp >= cp[1]: 294 if start_cp <= cp[0]: 295 del code_points[k] 296 elif start_cp - cp[0] > 1: 297 code_points[k] = cp[0], start_cp 298 else: 299 code_points[k] = cp[0] 300 elif end_cp > cp[0]: 301 if start_cp <= cp[0]: 302 if cp[1] - end_cp > 1: 303 code_points[k] = end_cp, cp[1] 304 else: 305 code_points[k] = cp[1] - 1 306 else: 307 if cp[1] - end_cp > 1: 308 code_points.insert(k + 1, (end_cp, cp[1])) 309 else: 310 code_points.insert(k + 1, cp[1] - 1) 311 if start_cp - cp[0] > 1: 312 code_points[k] = cp[0], start_cp 313 else: 314 code_points[k] = cp[0] 315 316 # 317 # MutableSet's mixin methods override 318 def clear(self) -> None: 319 del self._codepoints[:] 320 321 def __eq__(self, other: object) -> bool: 322 if not isinstance(other, Iterable): 323 return NotImplemented 324 elif isinstance(other, UnicodeSubset): 325 return self._codepoints == other._codepoints 326 else: 327 return self._codepoints == other 328 329 def __ior__(self, other: object) -> 'UnicodeSubset': # type: ignore[override] 330 if not isinstance(other, Iterable): 331 return NotImplemented 332 elif isinstance(other, UnicodeSubset): 333 other = reversed(other._codepoints) 334 elif isinstance(other, str): 335 other = reversed(UnicodeSubset(other)._codepoints) 336 else: 337 other = iter_code_points(other, reverse=True) 338 339 for cp in other: 340 self.add(cp) 341 return self 342 343 def __or__(self, other: object) -> 'UnicodeSubset': 344 obj = self.copy() 345 return obj.__ior__(other) 346 347 def __isub__(self, other: object) -> 'UnicodeSubset': 348 if not isinstance(other, Iterable): 349 return NotImplemented 350 elif isinstance(other, UnicodeSubset): 351 other = reversed(other._codepoints) 352 elif isinstance(other, str): 353 other = reversed(UnicodeSubset(other)._codepoints) 354 else: 355 other = iter_code_points(other, reverse=True) 356 357 for cp in other: 358 self.discard(cp) 359 return self 360 361 def __sub__(self, other: object) -> 'UnicodeSubset': 362 obj = self.copy() 363 return obj.__isub__(other) 364 365 __rsub__ = __sub__ 366 367 def __iand__(self, other: object) -> 'UnicodeSubset': 368 if not isinstance(other, Iterable): 369 return NotImplemented 370 371 for value in (self - other): 372 self.discard(value) 373 return self 374 375 def __and__(self, other: object) -> 'UnicodeSubset': 376 obj = self.copy() 377 return obj.__iand__(other) 378 379 def __ixor__(self, other: object) -> 'UnicodeSubset': # type: ignore[override] 380 if other is self: 381 self.clear() 382 return self 383 elif not isinstance(other, Iterable): 384 return NotImplemented 385 elif not isinstance(other, UnicodeSubset): 386 other = UnicodeSubset(cast(Union[str, Iterable[CodePoint]], other)) 387 388 for value in other: 389 if value in self: 390 self.discard(value) 391 else: 392 self.add(value) 393 return self 394 395 def __xor__(self, other: object) -> 'UnicodeSubset': 396 obj = self.copy() 397 return obj.__ixor__(other) 398 399 400UNICODE_CATEGORIES = {k: UnicodeSubset(cast(List[CodePoint], v)) 401 for k, v in RAW_UNICODE_CATEGORIES.items()} 402 403 404# See http://www.unicode.org/Public/UNIDATA/Blocks.txt 405UNICODE_BLOCKS = { 406 'IsBasicLatin': UnicodeSubset('\u0000-\u007F'), 407 'IsLatin-1Supplement': UnicodeSubset('\u0080-\u00FF'), 408 'IsLatinExtended-A': UnicodeSubset('\u0100-\u017F'), 409 'IsLatinExtended-B': UnicodeSubset('\u0180-\u024F'), 410 'IsIPAExtensions': UnicodeSubset('\u0250-\u02AF'), 411 'IsSpacingModifierLetters': UnicodeSubset('\u02B0-\u02FF'), 412 'IsCombiningDiacriticalMarks': UnicodeSubset('\u0300-\u036F'), 413 'IsGreek': UnicodeSubset('\u0370-\u03FF'), 414 'IsCyrillic': UnicodeSubset('\u0400-\u04FF'), 415 'IsArmenian': UnicodeSubset('\u0530-\u058F'), 416 'IsHebrew': UnicodeSubset('\u0590-\u05FF'), 417 'IsArabic': UnicodeSubset('\u0600-\u06FF'), 418 'IsSyriac': UnicodeSubset('\u0700-\u074F'), 419 'IsThaana': UnicodeSubset('\u0780-\u07BF'), 420 'IsDevanagari': UnicodeSubset('\u0900-\u097F'), 421 'IsBengali': UnicodeSubset('\u0980-\u09FF'), 422 'IsGurmukhi': UnicodeSubset('\u0A00-\u0A7F'), 423 'IsGujarati': UnicodeSubset('\u0A80-\u0AFF'), 424 'IsOriya': UnicodeSubset('\u0B00-\u0B7F'), 425 'IsTamil': UnicodeSubset('\u0B80-\u0BFF'), 426 'IsTelugu': UnicodeSubset('\u0C00-\u0C7F'), 427 'IsKannada': UnicodeSubset('\u0C80-\u0CFF'), 428 'IsMalayalam': UnicodeSubset('\u0D00-\u0D7F'), 429 'IsSinhala': UnicodeSubset('\u0D80-\u0DFF'), 430 'IsThai': UnicodeSubset('\u0E00-\u0E7F'), 431 'IsLao': UnicodeSubset('\u0E80-\u0EFF'), 432 'IsTibetan': UnicodeSubset('\u0F00-\u0FFF'), 433 'IsMyanmar': UnicodeSubset('\u1000-\u109F'), 434 'IsGeorgian': UnicodeSubset('\u10A0-\u10FF'), 435 'IsHangulJamo': UnicodeSubset('\u1100-\u11FF'), 436 'IsEthiopic': UnicodeSubset('\u1200-\u137F'), 437 'IsCherokee': UnicodeSubset('\u13A0-\u13FF'), 438 'IsUnifiedCanadianAboriginalSyllabics': UnicodeSubset('\u1400-\u167F'), 439 'IsOgham': UnicodeSubset('\u1680-\u169F'), 440 'IsRunic': UnicodeSubset('\u16A0-\u16FF'), 441 'IsKhmer': UnicodeSubset('\u1780-\u17FF'), 442 'IsMongolian': UnicodeSubset('\u1800-\u18AF'), 443 'IsLatinExtendedAdditional': UnicodeSubset('\u1E00-\u1EFF'), 444 'IsGreekExtended': UnicodeSubset('\u1F00-\u1FFF'), 445 'IsGeneralPunctuation': UnicodeSubset('\u2000-\u206F'), 446 'IsSuperscriptsandSubscripts': UnicodeSubset('\u2070-\u209F'), 447 'IsCurrencySymbols': UnicodeSubset('\u20A0-\u20CF'), 448 'IsCombiningMarksforSymbols': UnicodeSubset('\u20D0-\u20FF'), 449 'IsLetterlikeSymbols': UnicodeSubset('\u2100-\u214F'), 450 'IsNumberForms': UnicodeSubset('\u2150-\u218F'), 451 'IsArrows': UnicodeSubset('\u2190-\u21FF'), 452 'IsMathematicalOperators': UnicodeSubset('\u2200-\u22FF'), 453 'IsMiscellaneousTechnical': UnicodeSubset('\u2300-\u23FF'), 454 'IsControlPictures': UnicodeSubset('\u2400-\u243F'), 455 'IsOpticalCharacterRecognition': UnicodeSubset('\u2440-\u245F'), 456 'IsEnclosedAlphanumerics': UnicodeSubset('\u2460-\u24FF'), 457 'IsBoxDrawing': UnicodeSubset('\u2500-\u257F'), 458 'IsBlockElements': UnicodeSubset('\u2580-\u259F'), 459 'IsGeometricShapes': UnicodeSubset('\u25A0-\u25FF'), 460 'IsMiscellaneousSymbols': UnicodeSubset('\u2600-\u26FF'), 461 'IsDingbats': UnicodeSubset('\u2700-\u27BF'), 462 'IsBraillePatterns': UnicodeSubset('\u2800-\u28FF'), 463 'IsCJKRadicalsSupplement': UnicodeSubset('\u2E80-\u2EFF'), 464 'IsKangxiRadicals': UnicodeSubset('\u2F00-\u2FDF'), 465 'IsIdeographicDescriptionCharacters': UnicodeSubset('\u2FF0-\u2FFF'), 466 'IsCJKSymbolsandPunctuation': UnicodeSubset('\u3000-\u303F'), 467 'IsHiragana': UnicodeSubset('\u3040-\u309F'), 468 'IsKatakana': UnicodeSubset('\u30A0-\u30FF'), 469 'IsBopomofo': UnicodeSubset('\u3100-\u312F'), 470 'IsHangulCompatibilityJamo': UnicodeSubset('\u3130-\u318F'), 471 'IsKanbun': UnicodeSubset('\u3190-\u319F'), 472 'IsBopomofoExtended': UnicodeSubset('\u31A0-\u31BF'), 473 'IsEnclosedCJKLettersandMonths': UnicodeSubset('\u3200-\u32FF'), 474 'IsCJKCompatibility': UnicodeSubset('\u3300-\u33FF'), 475 'IsCJKUnifiedIdeographsExtensionA': UnicodeSubset('\u3400-\u4DB5'), 476 'IsCJKUnifiedIdeographs': UnicodeSubset('\u4E00-\u9FFF'), 477 'IsYiSyllables': UnicodeSubset('\uA000-\uA48F'), 478 'IsYiRadicals': UnicodeSubset('\uA490-\uA4CF'), 479 'IsHangulSyllables': UnicodeSubset('\uAC00-\uD7A3'), 480 'IsHighSurrogates': UnicodeSubset('\uD800-\uDB7F'), 481 'IsHighPrivateUseSurrogates': UnicodeSubset('\uDB80-\uDBFF'), 482 'IsLowSurrogates': UnicodeSubset('\uDC00-\uDFFF'), 483 'IsPrivateUse': UnicodeSubset('\uE000-\uF8FF\U000F0000-\U000FFFFF\U00100000-\U0010FFFF'), 484 'IsCJKCompatibilityIdeographs': UnicodeSubset('\uF900-\uFAFF'), 485 'IsAlphabeticPresentationForms': UnicodeSubset('\uFB00-\uFB4F'), 486 'IsArabicPresentationForms-A': UnicodeSubset('\uFB50-\uFDFF'), 487 'IsCombiningHalfMarks': UnicodeSubset('\uFE20-\uFE2F'), 488 'IsCJKCompatibilityForms': UnicodeSubset('\uFE30-\uFE4F'), 489 'IsSmallFormVariants': UnicodeSubset('\uFE50-\uFE6F'), 490 'IsArabicPresentationForms-B': UnicodeSubset('\uFE70-\uFEFE'), 491 'IsSpecials': UnicodeSubset('\uFEFF\uFFF0-\uFFFD'), 492 'IsHalfwidthandFullwidthForms': UnicodeSubset('\uFF00-\uFFEF'), 493 'IsOldItalic': UnicodeSubset('\U00010300-\U0001032F'), 494 'IsGothic': UnicodeSubset('\U00010330-\U0001034F'), 495 'IsDeseret': UnicodeSubset('\U00010400-\U0001044F'), 496 'IsByzantineMusicalSymbols': UnicodeSubset('\U0001D000-\U0001D0FF'), 497 'IsMusicalSymbols': UnicodeSubset('\U0001D100-\U0001D1FF'), 498 'IsMathematicalAlphanumericSymbols': UnicodeSubset('\U0001D400-\U0001D7FF'), 499 'IsCJKUnifiedIdeographsExtensionB': UnicodeSubset('\U00020000-\U0002A6D6'), 500 'IsCJKCompatibilityIdeographsSupplement': UnicodeSubset('\U0002F800-\U0002FA1F'), 501 'IsTags': UnicodeSubset('\U000E0000-\U000E007F'), 502} 503 504UNICODE_BLOCKS['IsPrivateUse'].update('\U000F0000-\U0010FFFD') 505 506 507def unicode_subset(name: str) -> UnicodeSubset: 508 if name.startswith('Is'): 509 try: 510 return UNICODE_BLOCKS[name] 511 except KeyError: 512 raise RegexError("%r doesn't match to any Unicode block." % name) 513 else: 514 try: 515 return UNICODE_CATEGORIES[name] 516 except KeyError: 517 raise RegexError("%r doesn't match to any Unicode category." % name) 518