1from fontTools.misc.textTools import byteord, tostr
2
3import re
4from bisect import bisect_right
5
6try:
7    # use unicodedata backport compatible with python2:
8    # https://github.com/mikekap/unicodedata2
9    from unicodedata2 import *
10except ImportError:  # pragma: no cover
11    # fall back to built-in unicodedata (possibly outdated)
12    from unicodedata import *
13
14from . import Blocks, Scripts, ScriptExtensions, OTTags
15
16
# Public API of this module. Every entry is passed through 'tostr' before
# being stored in the list.
__all__ = list(
    map(
        tostr,
        (
            # names from the built-in unicodedata module
            "lookup",
            "name",
            "decimal",
            "digit",
            "numeric",
            "category",
            "bidirectional",
            "combining",
            "east_asian_width",
            "mirrored",
            "decomposition",
            "normalize",
            "unidata_version",
            "ucd_3_2_0",
            # additional functions
            "block",
            "script",
            "script_extension",
            "script_name",
            "script_code",
            "script_horizontal_direction",
            "ot_tags_from_script",
            "ot_tag_to_script",
        ),
    )
)
43
44
def script(char):
    """ Look up the Unicode Script property of the character 'char' and
    return its four-letter script code as a string.

    >>> script("a")
    'Latn'
    >>> script(",")
    'Zyyy'
    >>> script(chr(0x10FFFF))
    'Zzzz'
    """
    codepoint = byteord(char)
    # 'Scripts.RANGES' is a sorted list holding only the starting breakpoint
    # of each range. 'bisect_right' gives the insertion point located after
    # every entry that is <= codepoint, hence the entry just before that
    # point is the start of the range containing the codepoint; that is why
    # 1 is subtracted from the returned index.
    range_index = bisect_right(Scripts.RANGES, codepoint) - 1
    return Scripts.VALUES[range_index]
68
69
def script_extension(char):
    """ Look up the Script Extensions property of the character 'char' and
    return it as a set of script code strings.

    >>> script_extension("a") == {'Latn'}
    True
    >>> script_extension(chr(0x060C)) == {'Rohg', 'Syrc', 'Yezi', 'Arab', 'Thaa', 'Nkoo'}
    True
    >>> script_extension(chr(0x10FFFF)) == {'Zzzz'}
    True
    """
    codepoint = byteord(char)
    # same range lookup as in 'script': the entry preceding the insertion
    # point is the range that contains the codepoint
    range_index = bisect_right(ScriptExtensions.RANGES, codepoint) - 1
    extensions = ScriptExtensions.VALUES[range_index]
    if extensions is not None:
        return extensions
    # a None entry means the code point is not explicitly listed for
    # Script Extensions: its value is then the corresponding Script
    # property value, wrapped in a one-element set
    return {script(char)}
89
90
def script_name(code, default=KeyError):
    """ Translate a four-letter Unicode script code into its long,
    human-readable script name (underscores are shown as spaces).

    When 'code' has no matching name, a KeyError is raised by default.

    Pass any other value as 'default' (e.g. 'Unknown' or None) to get that
    value back as a fallback instead of an exception.
    """
    try:
        readable = Scripts.NAMES[code].replace("_", " ")
        return str(readable)
    except KeyError:
        # 'default' left as KeyError (or set to a subclass of it) requests
        # the error itself; anything else is the fallback to hand back
        wants_error = isinstance(default, type) and issubclass(default, KeyError)
        if not wants_error:
            return default
        raise
106
107
# Matches any run of the separator characters ('-', '_' and space) that are
# ignored when property names are matched loosely.
_normalize_re = re.compile(r"[-_ ]+")


def _normalize_property_name(string):
    """Return 'string' lowercased and with every space, '-' and '_' removed,
    so that names can be compared loosely."""
    without_separators = _normalize_re.sub("", string)
    return without_separators.lower()
114
115
# Reverse lookup table built from Scripts.NAMES: maps the loosely-normalized
# form of each long script name back to its script code.
_SCRIPT_CODES = {
    _normalize_property_name(long_name): short_code
    for short_code, long_name in Scripts.NAMES.items()
}
118
119
def script_code(script_name, default=KeyError):
    """Translate a long script name into its four-letter Unicode script code.

    The name is matched loosely: case, spaces, '-' and '_' are ignored.

    When no script code matches, a KeyError is raised by default.

    Pass any other value as 'default' (e.g. 'Zzzz' or None) to get that
    value back as a fallback instead of an exception.
    """
    lookup_key = _normalize_property_name(script_name)
    try:
        found = _SCRIPT_CODES[lookup_key]
    except KeyError:
        # 'default' left as KeyError (or set to a subclass of it) requests
        # the error itself; anything else is the fallback to hand back
        wants_error = isinstance(default, type) and issubclass(default, KeyError)
        if not wants_error:
            return default
        raise
    return found
135
136
# Script codes of the scripts that are written right-to-left, grouped by the
# Unicode version in which each script was added.
# The script direction data comes from CLDR 37:
# https://github.com/unicode-org/cldr/blob/release-37/common/properties/scriptMetadata.txt
RTL_SCRIPTS = {
    # added in Unicode 1.1
    "Arab",  # Arabic
    "Hebr",  # Hebrew

    # added in Unicode 3.0
    "Syrc",  # Syriac
    "Thaa",  # Thaana

    # added in Unicode 4.0
    "Cprt",  # Cypriot

    # added in Unicode 4.1
    "Khar",  # Kharoshthi

    # added in Unicode 5.0
    "Phnx",  # Phoenician
    "Nkoo",  # Nko

    # added in Unicode 5.1
    "Lydi",  # Lydian

    # added in Unicode 5.2
    "Avst",  # Avestan
    "Armi",  # Imperial Aramaic
    "Phli",  # Inscriptional Pahlavi
    "Prti",  # Inscriptional Parthian
    "Sarb",  # Old South Arabian
    "Orkh",  # Old Turkic
    "Samr",  # Samaritan

    # added in Unicode 6.0
    "Mand",  # Mandaic

    # added in Unicode 6.1
    "Merc",  # Meroitic Cursive
    "Mero",  # Meroitic Hieroglyphs

    # added in Unicode 7.0
    "Mani",  # Manichaean
    "Mend",  # Mende Kikakui
    "Nbat",  # Nabataean
    "Narb",  # Old North Arabian
    "Palm",  # Palmyrene
    "Phlp",  # Psalter Pahlavi

    # added in Unicode 8.0
    "Hatr",  # Hatran
    "Hung",  # Old Hungarian

    # added in Unicode 9.0
    "Adlm",  # Adlam

    # added in Unicode 11.0
    "Rohg",  # Hanifi Rohingya
    "Sogo",  # Old Sogdian
    "Sogd",  # Sogdian

    # added in Unicode 12.0
    "Elym",  # Elymaic

    # added in Unicode 13.0
    "Chrs",  # Chorasmian
    "Yezi",  # Yezidi
}
204
def script_horizontal_direction(script_code, default=KeyError):
    """ Return "RTL" for scripts that contain right-to-left characters
    according to the Bidi_Class property (i.e. the script codes listed in
    RTL_SCRIPTS). Otherwise return "LTR".

    When 'script_code' is not a known script code, a KeyError carrying the
    code is raised by default.

    Pass any other value as 'default' to get that value back as a fallback
    instead of an exception.
    """
    if script_code in Scripts.NAMES:
        direction = "RTL" if script_code in RTL_SCRIPTS else "LTR"
        return str(direction)

    # unknown script code: 'default' left as KeyError (or set to a subclass
    # of it) requests an error; anything else is the fallback to hand back
    wants_error = isinstance(default, type) and issubclass(default, KeyError)
    if not wants_error:
        return default
    raise default(script_code)
214
215
def block(char):
    """ Look up the Block property of the Unicode character 'char' and
    return it as a string.

    >>> block("a")
    'Basic Latin'
    >>> block(chr(0x060C))
    'Arabic'
    >>> block(chr(0xEFFFF))
    'No_Block'
    """
    codepoint = byteord(char)
    # the entry just before the insertion point returned by 'bisect_right'
    # is the last one in Blocks.RANGES that is <= codepoint
    range_index = bisect_right(Blocks.RANGES, codepoint) - 1
    return Blocks.VALUES[range_index]
230
231
def ot_tags_from_script(script_code):
    """ Return the list of OpenType script tags associated with the given
    Unicode script code.

    For invalid/unknown script codes the list contains only the default
    script tag (OTTags.DEFAULT_SCRIPT, i.e. 'DFLT').

    When the script also has entries in OTTags.NEW_SCRIPT_TAGS, those tags
    are listed before the base tag, the last one coming first.
    """
    if script_code not in Scripts.NAMES:
        return [OTTags.DEFAULT_SCRIPT]

    # The base tag is the script code with a lowercase first letter, unless
    # OTTags.SCRIPT_EXCEPTIONS provides a special-cased tag for it.
    lowercased = script_code[0].lower() + script_code[1:]
    base_tag = OTTags.SCRIPT_EXCEPTIONS.get(script_code, lowercased)

    script_tags = [base_tag]
    if script_code in OTTags.NEW_SCRIPT_TAGS:
        script_tags += OTTags.NEW_SCRIPT_TAGS[script_code]
        script_tags = script_tags[::-1]  # last in, first out

    return script_tags
251
252
def ot_tag_to_script(tag):
    """ Return the Unicode script code for the given OpenType script tag, or
    None for the "DFLT" tag or if no Unicode script is associated with it.

    Surrounding whitespace is ignored, and tags shorter than four characters
    are padded with trailing spaces before the lookup.

    Raises ValueError if the tag is invalid, i.e. it is empty, longer than
    four characters, or contains a space in the middle.
    """
    tag = tostr(tag).strip()
    if not (0 < len(tag) <= 4) or " " in tag:
        raise ValueError("invalid OpenType tag: %r" % tag)

    # pad with spaces up to four characters
    tag = tag.ljust(4)

    if tag == OTTags.DEFAULT_SCRIPT:
        # it's unclear which Unicode script the "DFLT" OpenType tag maps to,
        # so here we return None
        return None

    new_tags_reversed = OTTags.NEW_SCRIPT_TAGS_REVERSED
    if tag in new_tags_reversed:
        return new_tags_reversed[tag]

    # From here on the conversion is fully algorithmic: the first character
    # is changed to uppercase, and any space in the last two positions is
    # replaced by repeating the preceding letter. Eg 'nko ' -> 'Nkoo'.
    candidate = tag[0].upper() + tag[1]
    for position in (2, 3):
        letter = tag[position]
        if letter == " ":
            letter = candidate[position - 1]
        candidate += letter

    return candidate if candidate in Scripts.NAMES else None
285