from fontTools.misc.textTools import byteord, tostr

import re
from bisect import bisect_right

try:
    # use unicodedata backport compatible with python2:
    # https://github.com/mikekap/unicodedata2
    from unicodedata2 import *
except ImportError:  # pragma: no cover
    # fall back to built-in unicodedata (possibly outdated)
    from unicodedata import *

from . import Blocks, Scripts, ScriptExtensions, OTTags


__all__ = [tostr(s) for s in (
    # names from built-in unicodedata module
    "lookup",
    "name",
    "decimal",
    "digit",
    "numeric",
    "category",
    "bidirectional",
    "combining",
    "east_asian_width",
    "mirrored",
    "decomposition",
    "normalize",
    "unidata_version",
    "ucd_3_2_0",
    # additional functions
    "block",
    "script",
    "script_extension",
    "script_name",
    "script_code",
    "script_horizontal_direction",
    "ot_tags_from_script",
    "ot_tag_to_script",
)]


def script(char):
    """ Look up the Unicode Script property of 'char' and return it as a
    four-letter script code string.

    >>> script("a")
    'Latn'
    >>> script(",")
    'Zyyy'
    >>> script(chr(0x10FFFF))
    'Zzzz'
    """
    codepoint = byteord(char)
    # 'Scripts.RANGES' is a sorted list holding only the starting breakpoint
    # of each range. 'bisect_right(a, x)' returns the insertion point located
    # after any entry equal to x, so that everything to its left is <= x and
    # everything to its right is > x. The range containing the codepoint is
    # the last one whose start is <= the codepoint, i.e. the entry just
    # before that insertion point; hence we subtract 1 from the index.
    range_index = bisect_right(Scripts.RANGES, codepoint) - 1
    return Scripts.VALUES[range_index]


def script_extension(char):
    """ Look up the Script_Extensions property of 'char' and return it as a
    set of four-letter script code strings.

    >>> script_extension("a") == {'Latn'}
    True
    >>> script_extension(chr(0x060C)) == {'Rohg', 'Syrc', 'Yezi', 'Arab', 'Thaa', 'Nkoo'}
    True
    >>> script_extension(chr(0x10FFFF)) == {'Zzzz'}
    True
    """
    codepoint = byteord(char)
    range_index = bisect_right(ScriptExtensions.RANGES, codepoint) - 1
    extensions = ScriptExtensions.VALUES[range_index]
    if extensions is not None:
        return extensions
    # A code point without an explicit Script Extensions entry takes its
    # Script property value as its (single-element) extension set.
    return {script(char)}


def script_name(code, default=KeyError):
    """ Translate a four-letter Unicode script code into its long,
    human-readable name (underscores are turned into spaces).

    An unknown code raises KeyError unless 'default' is given, in which
    case that fallback value (e.g. 'Unknown' or None) is returned instead.
    Passing KeyError, or a subclass of it, as 'default' keeps the raising
    behaviour.
    """
    try:
        long_name = Scripts.NAMES[code]
    except KeyError:
        wants_error = isinstance(default, type) and issubclass(default, KeyError)
        if not wants_error:
            return default
        # propagate the original lookup error untouched
        raise
    return str(long_name.replace("_", " "))


_normalize_re = re.compile(r"[-_ ]+")


def _normalize_property_name(string):
    """Drop spaces, '-' and '_' and lowercase, to allow loose matching."""
    without_separators = _normalize_re.sub("", string)
    return without_separators.lower()


# Maps the loosely-normalized long script name to its four-letter code.
_SCRIPT_CODES = dict(
    (_normalize_property_name(long_name), four_letter_code)
    for four_letter_code, long_name in Scripts.NAMES.items()
)


def script_code(script_name, default=KeyError):
    """Translate a long script name into its four-letter Unicode script code.

    The name is matched loosely: case, spaces, '-' and '_' are ignored.

    An unknown name raises KeyError unless 'default' is given, in which
    case that fallback value (e.g. 'Zzzz' or None) is returned instead.
    Passing KeyError, or a subclass of it, as 'default' keeps the raising
    behaviour.
    """
    lookup_key = _normalize_property_name(script_name)
    try:
        return _SCRIPT_CODES[lookup_key]
    except KeyError:
        wants_error = isinstance(default, type) and issubclass(default, KeyError)
        if not wants_error:
            return default
        # propagate the original lookup error untouched
        raise


# The data on script direction is taken from CLDR 37:
# https://github.com/unicode-org/cldr/blob/release-37/common/properties/scriptMetadata.txt
RTL_SCRIPTS = {
    # Unicode-1.1 additions
    'Arab',  # Arabic
    'Hebr',  # Hebrew

    # Unicode-3.0 additions
    'Syrc',  # Syriac
    'Thaa',  # Thaana

    # Unicode-4.0 additions
    'Cprt',  # Cypriot

    # Unicode-4.1 additions
    'Khar',  # Kharoshthi

    # Unicode-5.0 additions
    'Phnx',  # Phoenician
    'Nkoo',  # Nko

    # Unicode-5.1 additions
    'Lydi',  # Lydian

    # Unicode-5.2 additions
    'Avst',  # Avestan
    'Armi',  # Imperial Aramaic
    'Phli',  # Inscriptional Pahlavi
    'Prti',  # Inscriptional Parthian
    'Sarb',  # Old South Arabian
    'Orkh',  # Old Turkic
    'Samr',  # Samaritan

    # Unicode-6.0 additions
    'Mand',  # Mandaic

    # Unicode-6.1 additions
    'Merc',  # Meroitic Cursive
    'Mero',  # Meroitic Hieroglyphs

    # Unicode-7.0 additions
    'Mani',  # Manichaean
    'Mend',  # Mende Kikakui
    'Nbat',  # Nabataean
    'Narb',  # Old North Arabian
    'Palm',  # Palmyrene
    'Phlp',  # Psalter Pahlavi

    # Unicode-8.0 additions
    'Hatr',  # Hatran
    'Hung',  # Old Hungarian

    # Unicode-9.0 additions
    'Adlm',  # Adlam

    # Unicode-11.0 additions
    'Rohg',  # Hanifi Rohingya
    'Sogo',  # Old Sogdian
    'Sogd',  # Sogdian

    # Unicode-12.0 additions
    'Elym',  # Elymaic

    # Unicode-13.0 additions
    'Chrs',  # Chorasmian
    'Yezi',  # Yezidi
}


def script_horizontal_direction(script_code, default=KeyError):
    """ Return "RTL" for scripts that contain right-to-left characters
    according to the Bidi_Class property. Otherwise return "LTR".

    For a script code that is not a known Unicode script, 'default' decides
    the outcome: if it is KeyError or a subclass of it (the default), an
    instance of it is raised with the script code as argument; any other
    value is returned as the fallback.
    """
    if script_code in Scripts.NAMES:
        if script_code in RTL_SCRIPTS:
            return "RTL"
        return "LTR"

    if isinstance(default, type) and issubclass(default, KeyError):
        raise default(script_code)
    return default


def block(char):
    """ Look up the Unicode Block property of 'char' and return the block
    name as a string.

    >>> block("a")
    'Basic Latin'
    >>> block(chr(0x060C))
    'Arabic'
    >>> block(chr(0xEFFFF))
    'No_Block'
    """
    codepoint = byteord(char)
    # Same range lookup as in 'script': the containing range is the one
    # just before the insertion point returned by bisect_right.
    range_index = bisect_right(Blocks.RANGES, codepoint) - 1
    return Blocks.VALUES[range_index]


def ot_tags_from_script(script_code):
    """ Return the list of OpenType script tags that correspond to the given
    Unicode script code, newest tag first.
    Invalid/unknown script codes yield the ['DFLT'] script tag.
    """
    if script_code not in Scripts.NAMES:
        return [OTTags.DEFAULT_SCRIPT]

    # By default the OpenType tag is the script code with its first letter
    # lowercased; a few scripts have an explicit exception instead.
    algorithmic_tag = script_code[0].lower() + script_code[1:]
    primary_tag = OTTags.SCRIPT_EXCEPTIONS.get(script_code, algorithmic_tag)

    if script_code not in OTTags.NEW_SCRIPT_TAGS:
        return [primary_tag]

    # last in, first out: newer tags are listed before the original one
    oldest_first = [primary_tag] + list(OTTags.NEW_SCRIPT_TAGS[script_code])
    return oldest_first[::-1]


def ot_tag_to_script(tag):
    """ Return the Unicode script code for the given OpenType script tag, or
    None for "DFLT" tag or if there is no Unicode script associated with it.
    Raises ValueError if the tag is invalid (empty, longer than four
    characters, or containing a space once surrounding whitespace has been
    stripped).
    """
    tag = tostr(tag).strip()
    is_well_formed = bool(tag) and " " not in tag and len(tag) <= 4
    if not is_well_formed:
        raise ValueError("invalid OpenType tag: %r" % tag)

    # pad with spaces up to the canonical four-character length
    tag = tag.ljust(4)

    if tag == OTTags.DEFAULT_SCRIPT:
        # it's unclear which Unicode script the "DFLT" OpenType tag maps to,
        # so here we return None
        return None

    if tag in OTTags.NEW_SCRIPT_TAGS_REVERSED:
        return OTTags.NEW_SCRIPT_TAGS_REVERSED[tag]

    # This side of the conversion is fully algorithmic:
    # the first character is uppercased, and any space in the last two
    # positions is replaced by repeating the preceding letter.
    # Eg 'nko ' -> 'Nkoo'.
    candidate = tag[0].upper() + tag[1]
    for position in (2, 3):
        if tag[position] == " ":
            candidate += candidate[position - 1]
        else:
            candidate += tag[position]

    return candidate if candidate in Scripts.NAMES else None