# unicode.py

import sys
from itertools import filterfalse
from typing import List, Tuple, Union


class _lazyclassproperty:
    """
    Descriptor that evaluates ``fn(cls)`` once per class and caches the result.

    Results are stored in a per-class ``_intern`` dict keyed by the wrapped
    function's name, so each class in a hierarchy computes its own value.
    """

    def __init__(self, fn):
        self.fn = fn
        self.__doc__ = fn.__doc__
        self.__name__ = fn.__name__

    def __get__(self, obj, cls):
        if cls is None:
            cls = type(obj)
        # Give cls its own cache if it has none, or if the ``_intern`` it sees
        # is really the dict object inherited from one of its base classes
        # (sharing it would hand back the base class's cached values).
        if not hasattr(cls, "_intern") or any(
            cls._intern is getattr(superclass, "_intern", [])
            for superclass in cls.__mro__[1:]
        ):
            cls._intern = {}
        attrname = self.fn.__name__
        if attrname not in cls._intern:
            cls._intern[attrname] = self.fn(cls)
        return cls._intern[attrname]


UnicodeRangeList = List[Union[Tuple[int, int], Tuple[int]]]


class unicode_set:
    """
    A set of Unicode characters, for language-specific strings for
    ``alphas``, ``nums``, ``alphanums``, and ``printables``.
    A unicode_set is defined by a list of ranges in the Unicode character
    set, in a class attribute ``_ranges``. Ranges can be specified using
    2-tuples or a 1-tuple, such as::

        _ranges = [
            (0x0020, 0x007e),
            (0x00a0, 0x00ff),
            (0x0100,),
            ]

    Ranges are left- and right-inclusive. A 1-tuple of (x,) is treated as (x, x).

    A unicode set can also be defined using multiple inheritance of other unicode sets::

        class CJK(Chinese, Japanese, Korean):
            pass
    """

    _ranges: UnicodeRangeList = []

    @_lazyclassproperty
    def _chars_for_ranges(cls) -> List[str]:
        """
        sorted list of the distinct characters covered by the ``_ranges`` of
        this class and of every class ahead of ``unicode_set`` in its MRO
        """
        ret: List[int] = []
        for cc in cls.__mro__:
            # unicode_set itself (and anything after it in the MRO)
            # contributes no ranges
            if cc is unicode_set:
                break
            for rr in getattr(cc, "_ranges", ()):
                # rr[-1] is rr[0] for a 1-tuple, so (x,) behaves as (x, x)
                ret.extend(range(rr[0], rr[-1] + 1))
        return [chr(c) for c in sorted(set(ret))]

    @_lazyclassproperty
    def printables(cls) -> str:
        """all non-whitespace characters in this range"""
        return "".join(filterfalse(str.isspace, cls._chars_for_ranges))

    @_lazyclassproperty
    def alphas(cls) -> str:
        """all alphabetic characters in this range"""
        return "".join(filter(str.isalpha, cls._chars_for_ranges))

    @_lazyclassproperty
    def nums(cls) -> str:
        """all numeric digit characters in this range"""
        return "".join(filter(str.isdigit, cls._chars_for_ranges))

    @_lazyclassproperty
    def alphanums(cls) -> str:
        """all alphanumeric characters in this range"""
        return cls.alphas + cls.nums

    @_lazyclassproperty
    def identchars(cls) -> str:
        """
        all characters in this range that are valid identifier characters,
        plus underscore '_'; the ASCII and Latin-1 letters listed below are
        always included, whatever the ranges of this set
        """
        return "".join(
            sorted(
                set(
                    "".join(filter(str.isidentifier, cls._chars_for_ranges))
                    + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzªµº"
                    + "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ"
                    + "_"
                )
            )
        )

    @_lazyclassproperty
    def identbodychars(cls) -> str:
        """
        all characters in this range that are valid identifier body characters,
        plus the digits 0-9
        """
        return "".join(
            sorted(
                set(
                    cls.identchars
                    + "0123456789"
                    + "".join(
                        # prefixing "_" tests whether c may follow the first
                        # character of an identifier
                        c for c in cls._chars_for_ranges if ("_" + c).isidentifier()
                    )
                )
            )
        )


class pyparsing_unicode(unicode_set):
    """
    A namespace class for defining common language unicode_sets.
    """

    _ranges: UnicodeRangeList = [(32, sys.maxunicode)]

    class Latin1(unicode_set):
        "Unicode set for Latin-1 Unicode Character Range"
        _ranges: UnicodeRangeList = [
            (0x0020, 0x007E),
            (0x00A0, 0x00FF),
        ]

    class LatinA(unicode_set):
        "Unicode set for Latin-A Unicode Character Range"
        _ranges: UnicodeRangeList = [
            (0x0100, 0x017F),
        ]

    class LatinB(unicode_set):
        "Unicode set for Latin-B Unicode Character Range"
        _ranges: UnicodeRangeList = [
            (0x0180, 0x024F),
        ]

    class Greek(unicode_set):
        "Unicode set for Greek Unicode Character Ranges"
        _ranges: UnicodeRangeList = [
            (0x0342, 0x0345),
            (0x0370, 0x0377),
            (0x037A, 0x037F),
            (0x0384, 0x038A),
            (0x038C,),
            (0x038E, 0x03A1),
            (0x03A3, 0x03E1),
            (0x03F0, 0x03FF),
            (0x1D26, 0x1D2A),
            (0x1D5E,),
            (0x1D60,),
            (0x1D66, 0x1D6A),
            (0x1F00, 0x1F15),
            (0x1F18, 0x1F1D),
            (0x1F20, 0x1F45),
            (0x1F48, 0x1F4D),
            (0x1F50, 0x1F57),
            (0x1F59,),
            (0x1F5B,),
            (0x1F5D,),
            (0x1F5F, 0x1F7D),
            (0x1F80, 0x1FB4),
            (0x1FB6, 0x1FC4),
            (0x1FC6, 0x1FD3),
            (0x1FD6, 0x1FDB),
            (0x1FDD, 0x1FEF),
            (0x1FF2, 0x1FF4),
            (0x1FF6, 0x1FFE),
            (0x2129,),
            (0x2719, 0x271A),
            (0xAB65,),
            (0x10140, 0x1018D),
            (0x101A0,),
            (0x1D200, 0x1D245),
            (0x1F7A1, 0x1F7A7),
        ]

    class Cyrillic(unicode_set):
        "Unicode set for Cyrillic Unicode Character Range"
        _ranges: UnicodeRangeList = [
            (0x0400, 0x052F),
            (0x1C80, 0x1C88),
            (0x1D2B,),
            (0x1D78,),
            (0x2DE0, 0x2DFF),
            (0xA640, 0xA672),
            (0xA674, 0xA69F),
            (0xFE2E, 0xFE2F),
        ]

    class Chinese(unicode_set):
        "Unicode set for Chinese Unicode Character Range"
        _ranges: UnicodeRangeList = [
            (0x2E80, 0x2E99),
            (0x2E9B, 0x2EF3),
            (0x31C0, 0x31E3),
            (0x3400, 0x4DB5),
            (0x4E00, 0x9FEF),
            (0xA700, 0xA707),
            (0xF900, 0xFA6D),
            (0xFA70, 0xFAD9),
            (0x16FE2, 0x16FE3),
            (0x1F210, 0x1F212),
            (0x1F214, 0x1F23B),
            (0x1F240, 0x1F248),
            (0x20000, 0x2A6D6),
            (0x2A700, 0x2B734),
            (0x2B740, 0x2B81D),
            (0x2B820, 0x2CEA1),
            (0x2CEB0, 0x2EBE0),
            (0x2F800, 0x2FA1D),
        ]

    class Japanese(unicode_set):
        "Unicode set for Japanese Unicode Character Range, combining Kanji, Hiragana, and Katakana ranges"

        class Kanji(unicode_set):
            "Unicode set for Kanji Unicode Character Range"
            _ranges: UnicodeRangeList = [
                (0x4E00, 0x9FBF),
                (0x3000, 0x303F),
            ]

        class Hiragana(unicode_set):
            "Unicode set for Hiragana Unicode Character Range"
            _ranges: UnicodeRangeList = [
                (0x3041, 0x3096),
                (0x3099, 0x30A0),
                (0x30FC,),
                (0xFF70,),
                (0x1B001,),
                (0x1B150, 0x1B152),
                (0x1F200,),
            ]

        class Katakana(unicode_set):
            "Unicode set for Katakana Unicode Character Range"
            _ranges: UnicodeRangeList = [
                (0x3099, 0x309C),
                (0x30A0, 0x30FF),
                (0x31F0, 0x31FF),
                (0x32D0, 0x32FE),
                (0xFF65, 0xFF9F),
                (0x1B000,),
                (0x1B164, 0x1B167),
                (0x1F201, 0x1F202),
                (0x1F213,),
            ]

        # Composed here, after the three scripts are defined, so the class
        # never exists with an empty placeholder range list.
        _ranges: UnicodeRangeList = (
            Kanji._ranges + Hiragana._ranges + Katakana._ranges
        )

    class Hangul(unicode_set):
        "Unicode set for Hangul (Korean) Unicode Character Range"
        _ranges: UnicodeRangeList = [
            (0x1100, 0x11FF),
            (0x302E, 0x302F),
            (0x3131, 0x318E),
            (0x3200, 0x321C),
            (0x3260, 0x327B),
            (0x327E,),
            (0xA960, 0xA97C),
            (0xAC00, 0xD7A3),
            (0xD7B0, 0xD7C6),
            (0xD7CB, 0xD7FB),
            (0xFFA0, 0xFFBE),
            (0xFFC2, 0xFFC7),
            (0xFFCA, 0xFFCF),
            (0xFFD2, 0xFFD7),
            (0xFFDA, 0xFFDC),
        ]

    Korean = Hangul

    class CJK(Chinese, Japanese, Hangul):
        "Unicode set for combined Chinese, Japanese, and Korean (CJK) Unicode Character Range"
        pass

    class Thai(unicode_set):
        "Unicode set for Thai Unicode Character Range"
        _ranges: UnicodeRangeList = [(0x0E01, 0x0E3A), (0x0E3F, 0x0E5B)]

    class Arabic(unicode_set):
        "Unicode set for Arabic Unicode Character Range"
        _ranges: UnicodeRangeList = [
            (0x0600, 0x061B),
            (0x061E, 0x06FF),
            (0x0700, 0x077F),
        ]

    class Hebrew(unicode_set):
        "Unicode set for Hebrew Unicode Character Range"
        _ranges: UnicodeRangeList = [
            (0x0591, 0x05C7),
            (0x05D0, 0x05EA),
            (0x05EF, 0x05F4),
            (0xFB1D, 0xFB36),
            (0xFB38, 0xFB3C),
            (0xFB3E,),
            (0xFB40, 0xFB41),
            (0xFB43, 0xFB44),
            (0xFB46, 0xFB4F),
        ]

    class Devanagari(unicode_set):
        "Unicode set for Devanagari Unicode Character Range"
        _ranges: UnicodeRangeList = [(0x0900, 0x097F), (0xA8E0, 0xA8FF)]


# define ranges in language character sets
pyparsing_unicode.العربية = pyparsing_unicode.Arabic
pyparsing_unicode.中文 = pyparsing_unicode.Chinese
pyparsing_unicode.кириллица = pyparsing_unicode.Cyrillic
pyparsing_unicode.Ελληνικά = pyparsing_unicode.Greek
pyparsing_unicode.עִברִית = pyparsing_unicode.Hebrew
pyparsing_unicode.日本語 = pyparsing_unicode.Japanese
pyparsing_unicode.Japanese.漢字 = pyparsing_unicode.Japanese.Kanji
pyparsing_unicode.Japanese.カタカナ = pyparsing_unicode.Japanese.Katakana
pyparsing_unicode.Japanese.ひらがな = pyparsing_unicode.Japanese.Hiragana
pyparsing_unicode.한국어 = pyparsing_unicode.Korean
pyparsing_unicode.ไทย = pyparsing_unicode.Thai
pyparsing_unicode.देवनागरी = pyparsing_unicode.Devanagari