1# unicode.py
2
3import sys
4from itertools import filterfalse
5from typing import List, Tuple, Union
6
7
8class _lazyclassproperty:
9    def __init__(self, fn):
10        self.fn = fn
11        self.__doc__ = fn.__doc__
12        self.__name__ = fn.__name__
13
14    def __get__(self, obj, cls):
15        if cls is None:
16            cls = type(obj)
17        if not hasattr(cls, "_intern") or any(
18            cls._intern is getattr(superclass, "_intern", [])
19            for superclass in cls.__mro__[1:]
20        ):
21            cls._intern = {}
22        attrname = self.fn.__name__
23        if attrname not in cls._intern:
24            cls._intern[attrname] = self.fn(cls)
25        return cls._intern[attrname]
26
27
# Type of a ``_ranges`` list: each entry is a 2-tuple of ints or a 1-tuple
# holding a single int.
UnicodeRangeList = List[Union[Tuple[int, int], Tuple[int]]]
29
30
class unicode_set:
    """
    Base class for a named collection of Unicode characters, from which the
    language-specific ``alphas``, ``nums``, ``alphanums``, and ``printables``
    strings are derived.

    A subclass lists the code points it covers in the class attribute
    ``_ranges``, using 2-tuples ``(first, last)`` or 1-tuples ``(single,)``::

        _ranges = [
            (0x0020, 0x007e),
            (0x00a0, 0x00ff),
            (0x0100,),
            ]

    Both ends of a range are included; ``(x,)`` means the same as ``(x, x)``.

    Sets can also be combined by inheriting from several unicode sets::

        class CJK(Chinese, Japanese, Korean):
            pass
    """

    _ranges: UnicodeRangeList = []

    @_lazyclassproperty
    def _chars_for_ranges(cls):
        # Gather the code points declared by this class and by every base
        # that precedes unicode_set in the MRO, then return the matching
        # characters as a sorted, duplicate-free list.
        code_points = set()
        for klass in cls.__mro__:
            if klass is unicode_set:
                break
            for bounds in getattr(klass, "_ranges", ()):
                # For a 1-tuple, bounds[-1] is the same element as bounds[0].
                code_points.update(range(bounds[0], bounds[-1] + 1))
        return list(map(chr, sorted(code_points)))

    @_lazyclassproperty
    def printables(cls):
        "every character in this range that is not whitespace"
        return "".join(ch for ch in cls._chars_for_ranges if not ch.isspace())

    @_lazyclassproperty
    def alphas(cls):
        "every alphabetic character in this range"
        return "".join(ch for ch in cls._chars_for_ranges if ch.isalpha())

    @_lazyclassproperty
    def nums(cls):
        "every numeric digit character in this range"
        return "".join(ch for ch in cls._chars_for_ranges if ch.isdigit())

    @_lazyclassproperty
    def alphanums(cls):
        "every alphanumeric character in this range (alphas followed by nums)"
        return "".join((cls.alphas, cls.nums))

    @_lazyclassproperty
    def identchars(cls):
        "every character in this range that is a valid identifier character, plus underscore '_'"
        # Characters that are always included, whatever the set's own ranges.
        always_included = (
            "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzªµº"
            "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ"
            "_"
        )
        members = {ch for ch in cls._chars_for_ranges if ch.isidentifier()}
        members.update(always_included)
        return "".join(sorted(members))

    @_lazyclassproperty
    def identbodychars(cls):
        """
        every character in this range that is valid inside the body of an
        identifier, plus the digits 0-9
        """
        members = set(cls.identchars)
        members.update("0123456789")
        # A character is valid in an identifier body if it may follow "_".
        members.update(
            ch for ch in cls._chars_for_ranges if ("_" + ch).isidentifier()
        )
        return "".join(sorted(members))
116
117
class pyparsing_unicode(unicode_set):
    """
    A namespace class for defining common language unicode_sets.

    Each nested class is a ``unicode_set`` whose ``_ranges`` lists the
    inclusive code point ranges belonging to that language or script.
    """

    # The namespace itself covers every code point from space (32) upward.
    _ranges: UnicodeRangeList = [(32, sys.maxunicode)]

    class Latin1(unicode_set):
        "Unicode set for Latin-1 Unicode Character Range"
        _ranges: UnicodeRangeList = [
            (0x0020, 0x007E),
            (0x00A0, 0x00FF),
        ]

    class LatinA(unicode_set):
        "Unicode set for Latin-A Unicode Character Range"
        _ranges: UnicodeRangeList = [
            (0x0100, 0x017F),
        ]

    class LatinB(unicode_set):
        "Unicode set for Latin-B Unicode Character Range"
        _ranges: UnicodeRangeList = [
            (0x0180, 0x024F),
        ]

    class Greek(unicode_set):
        "Unicode set for Greek Unicode Character Ranges"
        _ranges: UnicodeRangeList = [
            (0x0342, 0x0345),
            (0x0370, 0x0377),
            (0x037A, 0x037F),
            (0x0384, 0x038A),
            (0x038C,),
            (0x038E, 0x03A1),
            (0x03A3, 0x03E1),
            (0x03F0, 0x03FF),
            (0x1D26, 0x1D2A),
            (0x1D5E,),
            (0x1D60,),
            (0x1D66, 0x1D6A),
            (0x1F00, 0x1F15),
            (0x1F18, 0x1F1D),
            (0x1F20, 0x1F45),
            (0x1F48, 0x1F4D),
            (0x1F50, 0x1F57),
            (0x1F59,),
            (0x1F5B,),
            (0x1F5D,),
            (0x1F5F, 0x1F7D),
            (0x1F80, 0x1FB4),
            (0x1FB6, 0x1FC4),
            (0x1FC6, 0x1FD3),
            (0x1FD6, 0x1FDB),
            (0x1FDD, 0x1FEF),
            (0x1FF2, 0x1FF4),
            (0x1FF6, 0x1FFE),
            (0x2129,),
            (0x2719, 0x271A),
            (0xAB65,),
            (0x10140, 0x1018D),
            (0x101A0,),
            (0x1D200, 0x1D245),
            (0x1F7A1, 0x1F7A7),
        ]

    class Cyrillic(unicode_set):
        "Unicode set for Cyrillic Unicode Character Range"
        _ranges: UnicodeRangeList = [
            (0x0400, 0x052F),
            (0x1C80, 0x1C88),
            (0x1D2B,),
            (0x1D78,),
            (0x2DE0, 0x2DFF),
            (0xA640, 0xA672),
            (0xA674, 0xA69F),
            (0xFE2E, 0xFE2F),
        ]

    class Chinese(unicode_set):
        "Unicode set for Chinese Unicode Character Range"
        _ranges: UnicodeRangeList = [
            (0x2E80, 0x2E99),
            (0x2E9B, 0x2EF3),
            (0x31C0, 0x31E3),
            (0x3400, 0x4DB5),
            (0x4E00, 0x9FEF),
            (0xA700, 0xA707),
            (0xF900, 0xFA6D),
            (0xFA70, 0xFAD9),
            (0x16FE2, 0x16FE3),
            (0x1F210, 0x1F212),
            (0x1F214, 0x1F23B),
            (0x1F240, 0x1F248),
            (0x20000, 0x2A6D6),
            (0x2A700, 0x2B734),
            (0x2B740, 0x2B81D),
            (0x2B820, 0x2CEA1),
            (0x2CEB0, 0x2EBE0),
            (0x2F800, 0x2FA1D),
        ]

    class Japanese(unicode_set):
        "Unicode set for Japanese Unicode Character Range, combining Kanji, Hiragana, and Katakana ranges"

        class Kanji(unicode_set):
            "Unicode set for Kanji Unicode Character Range"
            _ranges: UnicodeRangeList = [
                (0x4E00, 0x9FBF),
                (0x3000, 0x303F),
            ]

        class Hiragana(unicode_set):
            "Unicode set for Hiragana Unicode Character Range"
            _ranges: UnicodeRangeList = [
                (0x3041, 0x3096),
                (0x3099, 0x30A0),
                (0x30FC,),
                (0xFF70,),
                (0x1B001,),
                (0x1B150, 0x1B152),
                (0x1F200,),
            ]

        class Katakana(unicode_set):
            "Unicode set for Katakana Unicode Character Range"
            _ranges: UnicodeRangeList = [
                (0x3099, 0x309C),
                (0x30A0, 0x30FF),
                (0x31F0, 0x31FF),
                (0x32D0, 0x32FE),
                (0xFF65, 0xFF9F),
                (0x1B000,),
                (0x1B164, 0x1B167),
                (0x1F201, 0x1F202),
                (0x1F213,),
            ]

        # Derived from the nested script sets, so Japanese carries its full
        # ranges as soon as the class body finishes executing. The nested
        # class names are ordinary local names at this point in the body.
        _ranges: UnicodeRangeList = (
            Kanji._ranges + Hiragana._ranges + Katakana._ranges
        )

    class Hangul(unicode_set):
        "Unicode set for Hangul (Korean) Unicode Character Range"
        _ranges: UnicodeRangeList = [
            (0x1100, 0x11FF),
            (0x302E, 0x302F),
            (0x3131, 0x318E),
            (0x3200, 0x321C),
            (0x3260, 0x327B),
            (0x327E,),
            (0xA960, 0xA97C),
            (0xAC00, 0xD7A3),
            (0xD7B0, 0xD7C6),
            (0xD7CB, 0xD7FB),
            (0xFFA0, 0xFFBE),
            (0xFFC2, 0xFFC7),
            (0xFFCA, 0xFFCF),
            (0xFFD2, 0xFFD7),
            (0xFFDA, 0xFFDC),
        ]

    # Korean is another name for the same class object as Hangul.
    Korean = Hangul

    class CJK(Chinese, Japanese, Hangul):
        "Unicode set for combined Chinese, Japanese, and Korean (CJK) Unicode Character Range"

    class Thai(unicode_set):
        "Unicode set for Thai Unicode Character Range"
        _ranges: UnicodeRangeList = [(0x0E01, 0x0E3A), (0x0E3F, 0x0E5B)]

    class Arabic(unicode_set):
        "Unicode set for Arabic Unicode Character Range"
        _ranges: UnicodeRangeList = [
            (0x0600, 0x061B),
            (0x061E, 0x06FF),
            (0x0700, 0x077F),
        ]

    class Hebrew(unicode_set):
        "Unicode set for Hebrew Unicode Character Range"
        _ranges: UnicodeRangeList = [
            (0x0591, 0x05C7),
            (0x05D0, 0x05EA),
            (0x05EF, 0x05F4),
            (0xFB1D, 0xFB36),
            (0xFB38, 0xFB3C),
            (0xFB3E,),
            (0xFB40, 0xFB41),
            (0xFB43, 0xFB44),
            (0xFB46, 0xFB4F),
        ]

    class Devanagari(unicode_set):
        "Unicode set for Devanagari Unicode Character Range"
        _ranges: UnicodeRangeList = [(0x0900, 0x097F), (0xA8E0, 0xA8FF)]
312
313
# Japanese covers the combined ranges of its Kanji, Hiragana and Katakana sets,
# concatenated in that order into a new list.
pyparsing_unicode.Japanese._ranges = [
    *pyparsing_unicode.Japanese.Kanji._ranges,
    *pyparsing_unicode.Japanese.Hiragana._ranges,
    *pyparsing_unicode.Japanese.Katakana._ranges,
]
319
# Non-English attribute names for the character sets: each assignment binds an
# additional attribute to the same class object as the English-named one, on
# pyparsing_unicode itself or, for the three script sets, on
# pyparsing_unicode.Japanese.
pyparsing_unicode.العربية = pyparsing_unicode.Arabic
pyparsing_unicode.中文 = pyparsing_unicode.Chinese
pyparsing_unicode.кириллица = pyparsing_unicode.Cyrillic
pyparsing_unicode.Ελληνικά = pyparsing_unicode.Greek
pyparsing_unicode.עִברִית = pyparsing_unicode.Hebrew
pyparsing_unicode.日本語 = pyparsing_unicode.Japanese
pyparsing_unicode.Japanese.漢字 = pyparsing_unicode.Japanese.Kanji
pyparsing_unicode.Japanese.カタカナ = pyparsing_unicode.Japanese.Katakana
pyparsing_unicode.Japanese.ひらがな = pyparsing_unicode.Japanese.Hiragana
pyparsing_unicode.한국어 = pyparsing_unicode.Korean
pyparsing_unicode.ไทย = pyparsing_unicode.Thai
pyparsing_unicode.देवनागरी = pyparsing_unicode.Devanagari
333