1#
2# Copyright (c), 2016-2020, SISSA (International School for Advanced Studies).
3# All rights reserved.
4# This file is distributed under the terms of the MIT License.
5# See the file 'LICENSE' in the root directory of the present
6# distribution, or http://opensource.org/licenses/MIT.
7#
8# @author Davide Brunato <brunato@sissa.it>
9#
10"""
11This module defines Unicode character categories and blocks.
12"""
13from sys import maxunicode
14from typing import cast, Iterable, Iterator, List, MutableSet, Union, Optional
15
16from .unicode_categories import RAW_UNICODE_CATEGORIES
17from .codepoints import CodePoint, code_point_order, code_point_repr, \
18    iter_code_points, get_code_point_range
19
20CodePointsArgType = Union[str, 'UnicodeSubset', List[CodePoint], Iterable[CodePoint]]
21
22
23class RegexError(Exception):
24    """
25    Error in a regular expression or in a character class specification.
26    This exception is derived from `Exception` base class and is raised
27    only by the regex subpackage.
28    """
29
30
31def iterparse_character_subset(s: str, expand_ranges: bool = False) -> Iterator[CodePoint]:
32    """
33    Parses a regex character subset, generating a sequence of code points
34    and code points ranges. An unescaped hyphen (-) that is not at the
35    start or at the and is interpreted as range specifier.
36
37    :param s: a string representing the character subset.
38    :param expand_ranges: if set to `True` then expands character ranges.
39    :return: yields integers or couples of integers.
40    """
41    escaped = False
42    on_range = False
43    char = ''
44    length = len(s)
45    subset_index_iterator = iter(range(len(s)))
46    for k in subset_index_iterator:
47        if k == 0:
48            char = s[0]
49            if char == '\\':
50                escaped = True
51            elif char in r'[]' and length > 1:
52                raise RegexError("bad character %r at position 0" % char)
53            elif expand_ranges:
54                yield ord(char)
55            elif length <= 2 or s[1] != '-':
56                yield ord(char)
57        elif s[k] == '-':
58            if escaped or (k == length - 1):
59                char = s[k]
60                yield ord(char)
61                escaped = False
62            elif on_range:
63                char = s[k]
64                yield ord(char)
65                on_range = False
66            else:
67                # Parse character range
68                on_range = True
69                k = next(subset_index_iterator)
70                end_char = s[k]
71                if end_char == '\\' and (k < length - 1):
72                    if s[k + 1] in r'-|.^?*+{}()[]':
73                        k = next(subset_index_iterator)
74                        end_char = s[k]
75                    elif s[k + 1] in r'sSdDiIcCwWpP':
76                        msg = "bad character range '%s-\\%s' at position %d: %r"
77                        raise RegexError(msg % (char, s[k + 1], k - 2, s))
78
79                if ord(char) > ord(end_char):
80                    msg = "bad character range '%s-%s' at position %d: %r"
81                    raise RegexError(msg % (char, end_char, k - 2, s))
82                elif expand_ranges:
83                    yield from range(ord(char) + 1, ord(end_char) + 1)
84                else:
85                    yield ord(char), ord(end_char) + 1
86
87        elif s[k] in r'|.^?*+{}()':
88            if escaped:
89                escaped = False
90            on_range = False
91            char = s[k]
92            yield ord(char)
93        elif s[k] in r'[]':
94            if not escaped and length > 1:
95                raise RegexError("bad character %r at position %d" % (s[k], k))
96            escaped = on_range = False
97            char = s[k]
98            if k >= length - 2 or s[k + 1] != '-':
99                yield ord(char)
100        elif s[k] == '\\':
101            if escaped:
102                escaped = on_range = False
103                char = '\\'
104                yield ord(char)
105            else:
106                escaped = True
107        else:
108            if escaped:
109                escaped = False
110                yield ord('\\')
111            on_range = False
112            char = s[k]
113            if k >= length - 2 or s[k + 1] != '-':
114                yield ord(char)
115    if escaped:
116        yield ord('\\')
117
118
119class UnicodeSubset(MutableSet[CodePoint]):
120    """
121    Represents a subset of Unicode code points, implemented with an ordered list of
122    integer values and ranges. Codepoints can be added or discarded using sequences
123    of integer values and ranges or with strings equivalent to regex character set.
124
125    :param codepoints: a sequence of integer values and ranges, another UnicodeSubset \
126    instance ora a string equivalent of a regex character set.
127    """
128    __slots__ = '_codepoints',
129    _codepoints: List[CodePoint]
130
131    def __init__(self, codepoints: Optional[CodePointsArgType] = None) -> None:
132        if not codepoints:
133            self._codepoints = list()
134        elif isinstance(codepoints, list):
135            self._codepoints = sorted(codepoints, key=code_point_order)
136        elif isinstance(codepoints, UnicodeSubset):
137            self._codepoints = codepoints.codepoints.copy()
138        else:
139            self._codepoints = list()
140            self.update(codepoints)
141
142    @property
143    def codepoints(self) -> List[CodePoint]:
144        return self._codepoints
145
146    def __repr__(self) -> str:
147        return '%s(%r)' % (self.__class__.__name__, str(self))
148
149    def __str__(self) -> str:
150        return ''.join(code_point_repr(cp) for cp in self._codepoints)
151
152    def copy(self) -> 'UnicodeSubset':
153        return self.__copy__()
154
155    def __copy__(self) -> 'UnicodeSubset':
156        return UnicodeSubset(self._codepoints)
157
158    def __reversed__(self) -> Iterator[int]:
159        for item in reversed(self._codepoints):
160            if isinstance(item, int):
161                yield item
162            else:
163                yield from reversed(range(item[0], item[1]))
164
165    def complement(self) -> Iterator[CodePoint]:
166        last_cp = 0
167        for cp in self._codepoints:
168            if isinstance(cp, int):
169                cp = cp, cp + 1
170
171            diff = cp[0] - last_cp
172            if diff > 2:
173                yield last_cp, cp[0]
174            elif diff == 2:
175                yield last_cp
176                yield last_cp + 1
177            elif diff == 1:
178                yield last_cp
179            elif diff:
180                raise ValueError("unordered code points found in {!r}".format(self))
181            last_cp = cp[1]
182
183        if last_cp < maxunicode:
184            yield last_cp, maxunicode + 1
185        elif last_cp == maxunicode:
186            yield maxunicode
187
188    def iter_characters(self) -> Iterator[str]:
189        return map(chr, self.__iter__())
190
191    #
192    # MutableSet's abstract methods implementation
193    def __contains__(self, value: object) -> bool:
194        if not isinstance(value, int):
195            try:
196                value = ord(value)  # type: ignore[arg-type]
197            except TypeError:
198                return False
199
200        for cp in self._codepoints:
201            if not isinstance(cp, int):
202                if cp[0] > value:
203                    return False
204                elif cp[1] <= value:
205                    continue
206                else:
207                    return True
208            elif cp > value:
209                return False
210            elif cp == value:
211                return True
212        return False
213
214    def __iter__(self) -> Iterator[int]:
215        for cp in self._codepoints:
216            if isinstance(cp, int):
217                yield cp
218            else:
219                yield from range(*cp)
220
221    def __len__(self) -> int:
222        k = 0
223        for _ in self:
224            k += 1
225        return k
226
227    def update(self, *others: Union[str, Iterable[CodePoint]]) -> None:
228        for value in others:
229            if isinstance(value, str):
230                for cp in iter_code_points(iterparse_character_subset(value), reverse=True):
231                    self.add(cp)
232            else:
233                for cp in iter_code_points(value, reverse=True):
234                    self.add(cp)
235
236    def add(self, value: CodePoint) -> None:
237        try:
238            start_value, end_value = get_code_point_range(value)  # type: ignore[misc]
239        except TypeError:
240            raise ValueError("{!r} is not a Unicode code point value/range".format(value))
241
242        code_points = self._codepoints
243        last_index = len(code_points) - 1
244        for k, cp in enumerate(code_points):
245            if isinstance(cp, int):
246                cp = cp, cp + 1
247
248            if end_value < cp[0]:
249                code_points.insert(k, value)
250            elif start_value > cp[1]:
251                continue
252            elif end_value > cp[1]:
253                if k == last_index:
254                    code_points[k] = min(cp[0], start_value), end_value
255                else:
256                    next_cp = code_points[k + 1]
257                    higher_bound = next_cp if isinstance(next_cp, int) else next_cp[0]
258                    if end_value <= higher_bound:
259                        code_points[k] = min(cp[0], start_value), end_value
260                    else:
261                        code_points[k] = min(cp[0], start_value), higher_bound
262                        start_value = higher_bound
263                        continue
264            elif start_value < cp[0]:
265                code_points[k] = start_value, cp[1]
266            break
267        else:
268            self._codepoints.append(value)
269
270    def difference_update(self, *others: Union[str, Iterable[CodePoint]]) -> None:
271        for value in others:
272            if isinstance(value, str):
273                for cp in iter_code_points(iterparse_character_subset(value), reverse=True):
274                    self.discard(cp)
275            else:
276                for cp in iter_code_points(value, reverse=True):
277                    self.discard(cp)
278
279    def discard(self, value: CodePoint) -> None:
280        try:
281            start_cp, end_cp = get_code_point_range(value)  # type: ignore[misc]
282        except TypeError:
283            raise ValueError("{!r} is not a Unicode code point value/range".format(value))
284
285        code_points = self._codepoints
286        for k in reversed(range(len(code_points))):
287            cp = code_points[k]
288            if isinstance(cp, int):
289                cp = cp, cp + 1
290
291            if start_cp >= cp[1]:
292                break
293            elif end_cp >= cp[1]:
294                if start_cp <= cp[0]:
295                    del code_points[k]
296                elif start_cp - cp[0] > 1:
297                    code_points[k] = cp[0], start_cp
298                else:
299                    code_points[k] = cp[0]
300            elif end_cp > cp[0]:
301                if start_cp <= cp[0]:
302                    if cp[1] - end_cp > 1:
303                        code_points[k] = end_cp, cp[1]
304                    else:
305                        code_points[k] = cp[1] - 1
306                else:
307                    if cp[1] - end_cp > 1:
308                        code_points.insert(k + 1, (end_cp, cp[1]))
309                    else:
310                        code_points.insert(k + 1, cp[1] - 1)
311                    if start_cp - cp[0] > 1:
312                        code_points[k] = cp[0], start_cp
313                    else:
314                        code_points[k] = cp[0]
315
316    #
317    # MutableSet's mixin methods override
318    def clear(self) -> None:
319        del self._codepoints[:]
320
321    def __eq__(self, other: object) -> bool:
322        if not isinstance(other, Iterable):
323            return NotImplemented
324        elif isinstance(other, UnicodeSubset):
325            return self._codepoints == other._codepoints
326        else:
327            return self._codepoints == other
328
329    def __ior__(self, other: object) -> 'UnicodeSubset':  # type: ignore[override]
330        if not isinstance(other, Iterable):
331            return NotImplemented
332        elif isinstance(other, UnicodeSubset):
333            other = reversed(other._codepoints)
334        elif isinstance(other, str):
335            other = reversed(UnicodeSubset(other)._codepoints)
336        else:
337            other = iter_code_points(other, reverse=True)
338
339        for cp in other:
340            self.add(cp)
341        return self
342
343    def __or__(self, other: object) -> 'UnicodeSubset':
344        obj = self.copy()
345        return obj.__ior__(other)
346
347    def __isub__(self, other: object) -> 'UnicodeSubset':
348        if not isinstance(other, Iterable):
349            return NotImplemented
350        elif isinstance(other, UnicodeSubset):
351            other = reversed(other._codepoints)
352        elif isinstance(other, str):
353            other = reversed(UnicodeSubset(other)._codepoints)
354        else:
355            other = iter_code_points(other, reverse=True)
356
357        for cp in other:
358            self.discard(cp)
359        return self
360
361    def __sub__(self, other: object) -> 'UnicodeSubset':
362        obj = self.copy()
363        return obj.__isub__(other)
364
365    __rsub__ = __sub__
366
367    def __iand__(self, other: object) -> 'UnicodeSubset':
368        if not isinstance(other, Iterable):
369            return NotImplemented
370
371        for value in (self - other):
372            self.discard(value)
373        return self
374
375    def __and__(self, other: object) -> 'UnicodeSubset':
376        obj = self.copy()
377        return obj.__iand__(other)
378
379    def __ixor__(self, other: object) -> 'UnicodeSubset':  # type: ignore[override]
380        if other is self:
381            self.clear()
382            return self
383        elif not isinstance(other, Iterable):
384            return NotImplemented
385        elif not isinstance(other, UnicodeSubset):
386            other = UnicodeSubset(cast(Union[str, Iterable[CodePoint]], other))
387
388        for value in other:
389            if value in self:
390                self.discard(value)
391            else:
392                self.add(value)
393        return self
394
395    def __xor__(self, other: object) -> 'UnicodeSubset':
396        obj = self.copy()
397        return obj.__ixor__(other)
398
399
400UNICODE_CATEGORIES = {k: UnicodeSubset(cast(List[CodePoint], v))
401                      for k, v in RAW_UNICODE_CATEGORIES.items()}
402
403
404# See http://www.unicode.org/Public/UNIDATA/Blocks.txt
405UNICODE_BLOCKS = {
406    'IsBasicLatin': UnicodeSubset('\u0000-\u007F'),
407    'IsLatin-1Supplement': UnicodeSubset('\u0080-\u00FF'),
408    'IsLatinExtended-A': UnicodeSubset('\u0100-\u017F'),
409    'IsLatinExtended-B': UnicodeSubset('\u0180-\u024F'),
410    'IsIPAExtensions': UnicodeSubset('\u0250-\u02AF'),
411    'IsSpacingModifierLetters': UnicodeSubset('\u02B0-\u02FF'),
412    'IsCombiningDiacriticalMarks': UnicodeSubset('\u0300-\u036F'),
413    'IsGreek': UnicodeSubset('\u0370-\u03FF'),
414    'IsCyrillic': UnicodeSubset('\u0400-\u04FF'),
415    'IsArmenian': UnicodeSubset('\u0530-\u058F'),
416    'IsHebrew': UnicodeSubset('\u0590-\u05FF'),
417    'IsArabic': UnicodeSubset('\u0600-\u06FF'),
418    'IsSyriac': UnicodeSubset('\u0700-\u074F'),
419    'IsThaana': UnicodeSubset('\u0780-\u07BF'),
420    'IsDevanagari': UnicodeSubset('\u0900-\u097F'),
421    'IsBengali': UnicodeSubset('\u0980-\u09FF'),
422    'IsGurmukhi': UnicodeSubset('\u0A00-\u0A7F'),
423    'IsGujarati': UnicodeSubset('\u0A80-\u0AFF'),
424    'IsOriya': UnicodeSubset('\u0B00-\u0B7F'),
425    'IsTamil': UnicodeSubset('\u0B80-\u0BFF'),
426    'IsTelugu': UnicodeSubset('\u0C00-\u0C7F'),
427    'IsKannada': UnicodeSubset('\u0C80-\u0CFF'),
428    'IsMalayalam': UnicodeSubset('\u0D00-\u0D7F'),
429    'IsSinhala': UnicodeSubset('\u0D80-\u0DFF'),
430    'IsThai': UnicodeSubset('\u0E00-\u0E7F'),
431    'IsLao': UnicodeSubset('\u0E80-\u0EFF'),
432    'IsTibetan': UnicodeSubset('\u0F00-\u0FFF'),
433    'IsMyanmar': UnicodeSubset('\u1000-\u109F'),
434    'IsGeorgian': UnicodeSubset('\u10A0-\u10FF'),
435    'IsHangulJamo': UnicodeSubset('\u1100-\u11FF'),
436    'IsEthiopic': UnicodeSubset('\u1200-\u137F'),
437    'IsCherokee': UnicodeSubset('\u13A0-\u13FF'),
438    'IsUnifiedCanadianAboriginalSyllabics': UnicodeSubset('\u1400-\u167F'),
439    'IsOgham': UnicodeSubset('\u1680-\u169F'),
440    'IsRunic': UnicodeSubset('\u16A0-\u16FF'),
441    'IsKhmer': UnicodeSubset('\u1780-\u17FF'),
442    'IsMongolian': UnicodeSubset('\u1800-\u18AF'),
443    'IsLatinExtendedAdditional': UnicodeSubset('\u1E00-\u1EFF'),
444    'IsGreekExtended': UnicodeSubset('\u1F00-\u1FFF'),
445    'IsGeneralPunctuation': UnicodeSubset('\u2000-\u206F'),
446    'IsSuperscriptsandSubscripts': UnicodeSubset('\u2070-\u209F'),
447    'IsCurrencySymbols': UnicodeSubset('\u20A0-\u20CF'),
448    'IsCombiningMarksforSymbols': UnicodeSubset('\u20D0-\u20FF'),
449    'IsLetterlikeSymbols': UnicodeSubset('\u2100-\u214F'),
450    'IsNumberForms': UnicodeSubset('\u2150-\u218F'),
451    'IsArrows': UnicodeSubset('\u2190-\u21FF'),
452    'IsMathematicalOperators': UnicodeSubset('\u2200-\u22FF'),
453    'IsMiscellaneousTechnical': UnicodeSubset('\u2300-\u23FF'),
454    'IsControlPictures': UnicodeSubset('\u2400-\u243F'),
455    'IsOpticalCharacterRecognition': UnicodeSubset('\u2440-\u245F'),
456    'IsEnclosedAlphanumerics': UnicodeSubset('\u2460-\u24FF'),
457    'IsBoxDrawing': UnicodeSubset('\u2500-\u257F'),
458    'IsBlockElements': UnicodeSubset('\u2580-\u259F'),
459    'IsGeometricShapes': UnicodeSubset('\u25A0-\u25FF'),
460    'IsMiscellaneousSymbols': UnicodeSubset('\u2600-\u26FF'),
461    'IsDingbats': UnicodeSubset('\u2700-\u27BF'),
462    'IsBraillePatterns': UnicodeSubset('\u2800-\u28FF'),
463    'IsCJKRadicalsSupplement': UnicodeSubset('\u2E80-\u2EFF'),
464    'IsKangxiRadicals': UnicodeSubset('\u2F00-\u2FDF'),
465    'IsIdeographicDescriptionCharacters': UnicodeSubset('\u2FF0-\u2FFF'),
466    'IsCJKSymbolsandPunctuation': UnicodeSubset('\u3000-\u303F'),
467    'IsHiragana': UnicodeSubset('\u3040-\u309F'),
468    'IsKatakana': UnicodeSubset('\u30A0-\u30FF'),
469    'IsBopomofo': UnicodeSubset('\u3100-\u312F'),
470    'IsHangulCompatibilityJamo': UnicodeSubset('\u3130-\u318F'),
471    'IsKanbun': UnicodeSubset('\u3190-\u319F'),
472    'IsBopomofoExtended': UnicodeSubset('\u31A0-\u31BF'),
473    'IsEnclosedCJKLettersandMonths': UnicodeSubset('\u3200-\u32FF'),
474    'IsCJKCompatibility': UnicodeSubset('\u3300-\u33FF'),
475    'IsCJKUnifiedIdeographsExtensionA': UnicodeSubset('\u3400-\u4DB5'),
476    'IsCJKUnifiedIdeographs': UnicodeSubset('\u4E00-\u9FFF'),
477    'IsYiSyllables': UnicodeSubset('\uA000-\uA48F'),
478    'IsYiRadicals': UnicodeSubset('\uA490-\uA4CF'),
479    'IsHangulSyllables': UnicodeSubset('\uAC00-\uD7A3'),
480    'IsHighSurrogates': UnicodeSubset('\uD800-\uDB7F'),
481    'IsHighPrivateUseSurrogates': UnicodeSubset('\uDB80-\uDBFF'),
482    'IsLowSurrogates': UnicodeSubset('\uDC00-\uDFFF'),
483    'IsPrivateUse': UnicodeSubset('\uE000-\uF8FF\U000F0000-\U000FFFFF\U00100000-\U0010FFFF'),
484    'IsCJKCompatibilityIdeographs': UnicodeSubset('\uF900-\uFAFF'),
485    'IsAlphabeticPresentationForms': UnicodeSubset('\uFB00-\uFB4F'),
486    'IsArabicPresentationForms-A': UnicodeSubset('\uFB50-\uFDFF'),
487    'IsCombiningHalfMarks': UnicodeSubset('\uFE20-\uFE2F'),
488    'IsCJKCompatibilityForms': UnicodeSubset('\uFE30-\uFE4F'),
489    'IsSmallFormVariants': UnicodeSubset('\uFE50-\uFE6F'),
490    'IsArabicPresentationForms-B': UnicodeSubset('\uFE70-\uFEFE'),
491    'IsSpecials': UnicodeSubset('\uFEFF\uFFF0-\uFFFD'),
492    'IsHalfwidthandFullwidthForms': UnicodeSubset('\uFF00-\uFFEF'),
493    'IsOldItalic': UnicodeSubset('\U00010300-\U0001032F'),
494    'IsGothic': UnicodeSubset('\U00010330-\U0001034F'),
495    'IsDeseret': UnicodeSubset('\U00010400-\U0001044F'),
496    'IsByzantineMusicalSymbols': UnicodeSubset('\U0001D000-\U0001D0FF'),
497    'IsMusicalSymbols': UnicodeSubset('\U0001D100-\U0001D1FF'),
498    'IsMathematicalAlphanumericSymbols': UnicodeSubset('\U0001D400-\U0001D7FF'),
499    'IsCJKUnifiedIdeographsExtensionB': UnicodeSubset('\U00020000-\U0002A6D6'),
500    'IsCJKCompatibilityIdeographsSupplement': UnicodeSubset('\U0002F800-\U0002FA1F'),
501    'IsTags': UnicodeSubset('\U000E0000-\U000E007F'),
502}
503
504UNICODE_BLOCKS['IsPrivateUse'].update('\U000F0000-\U0010FFFD')
505
506
507def unicode_subset(name: str) -> UnicodeSubset:
508    if name.startswith('Is'):
509        try:
510            return UNICODE_BLOCKS[name]
511        except KeyError:
512            raise RegexError("%r doesn't match to any Unicode block." % name)
513    else:
514        try:
515            return UNICODE_CATEGORIES[name]
516        except KeyError:
517            raise RegexError("%r doesn't match to any Unicode category." % name)
518