1import java.lang.Character
2try:
3    # import from jarjar-ed version
4    from org.python.icu.text import Normalizer
5    from org.python.icu.lang import UCharacter, UProperty
6    from org.python.icu.util import VersionInfo
7    from org.python.icu.lang.UCharacter import EastAsianWidth, DecompositionType
8    from org.python.icu.lang.UCharacterEnums import ECharacterCategory, ECharacterDirection
9except ImportError:
10    # development version of Jython, so use extlibs
11    from com.ibm.icu.text import Normalizer
12    from com.ibm.icu.lang import UCharacter, UProperty
13    from com.ibm.icu.util import VersionInfo
14    from com.ibm.icu.lang.UCharacter import EastAsianWidth, DecompositionType
15    from com.ibm.icu.lang.UCharacterEnums import ECharacterCategory, ECharacterDirection
16
17
# Public API of this module, mirroring the function names of CPython's
# unicodedata module.
__all__ = (
    "bidirectional",
    "category",
    "combining",
    "decimal",
    "decomposition",
    "digit",
    "east_asian_width",
    "lookup",
    "mirrored",
    "name",
    "normalize",
    "numeric",
    "unidata_version",
)


# Normalization form name -> ICU Normalizer mode constant of the same name.
_forms = {
    form_name: getattr(Normalizer, form_name)
    for form_name in ("NFC", "NFKC", "NFD", "NFKD")
}

# Sentinel for "no default supplied"; None cannot be used for that because
# None is itself a valid value for callers to pass as a default.
Nonesuch = object()
31
32
def _validate_unichr(unichr):
    """Check that *unichr* is a unicode string of exactly one character.

    Returns None when the argument is acceptable.

    Raises:
        TypeError: if *unichr* is not a ``unicode`` instance, or if its
            length is anything other than one.
    """
    if not isinstance(unichr, unicode):
        type_name = type(unichr).__name__
        raise TypeError("must be unicode, not {}".format(type_name))
    if len(unichr) != 1:
        raise TypeError("need a single Unicode character as parameter")
38
39
def _get_codepoint(unichr):
    """Return the integer code point of the single character *unichr*.

    The argument is first checked with _validate_unichr, so a TypeError
    propagates for anything that is not a one-character unicode string.
    """
    _validate_unichr(unichr)
    codepoint = ord(unichr)
    return codepoint
43
44
def name(unichr, default=Nonesuch):
    """Return the name that UCharacter.getName reports for *unichr*.

    If no name is available (getName returns None), *default* is returned
    when the caller supplied one; otherwise ValueError is raised.  Passing
    ``default=None`` explicitly is honoured, which is why the Nonesuch
    sentinel is used to detect "not supplied".

    Raises:
        TypeError: if *unichr* is not a single unicode character.
        ValueError: if the character has no name and no default was given.
    """
    char_name = UCharacter.getName(_get_codepoint(unichr))
    if char_name is not None:
        return char_name
    if default is Nonesuch:
        raise ValueError("no such name")
    return default
54
55
def lookup(name):
    """Return the unicode character whose character name is *name*.

    The lookup is delegated to UCharacter.getCharFromName, which signals
    "no such character" by returning -1.

    Raises:
        KeyError: if no character with that name is found.
    """
    codepoint = UCharacter.getCharFromName(name)
    if codepoint == -1:
        # The offending name is quoted on both sides, as CPython does.
        raise KeyError("undefined character name '{}'".format(name))
    # ``unichr`` here is the builtin (no parameter shadows it in this scope).
    return unichr(codepoint)
61
62
def digit(unichr, default=Nonesuch):
    """Return the value UCharacter.digit reports for *unichr*.

    UCharacter.digit answers -1 when it has no digit value for the
    character; in that case *default* is returned if one was supplied,
    otherwise ValueError is raised.

    Raises:
        TypeError: if *unichr* is not a single unicode character.
        ValueError: if there is no digit value and no default was given.
    """
    value = UCharacter.digit(_get_codepoint(unichr))
    if value != -1:
        return value
    if default is Nonesuch:
        raise ValueError("not a digit")
    return default
71
72
def decimal(unichr, default=Nonesuch):
    """Return the decimal value (0-9) of *unichr*.

    The value comes from UCharacter.getNumericValue; anything outside the
    range 0..9 is treated as "not a decimal".  In that case *default* is
    returned if one was supplied, otherwise ValueError is raised.

    Raises:
        TypeError: if *unichr* is not a single unicode character.
        ValueError: if the value is out of range and no default was given.
    """
    value = UCharacter.getNumericValue(_get_codepoint(unichr))
    if 0 <= value <= 9:
        return value
    if default is Nonesuch:
        raise ValueError("not a decimal")
    return default
81
82
def numeric(unichr, default=Nonesuch):
    """Return the numeric value UCharacter.getUnicodeNumericValue gives *unichr*.

    When ICU answers UCharacter.NO_NUMERIC_VALUE, *default* is returned if
    one was supplied, otherwise ValueError is raised.

    Raises:
        TypeError: if *unichr* is not a single unicode character.
        ValueError: if there is no numeric value and no default was given.
    """
    value = UCharacter.getUnicodeNumericValue(_get_codepoint(unichr))
    if value != UCharacter.NO_NUMERIC_VALUE:
        return value
    if default is Nonesuch:
        raise ValueError("not a numeric")
    return default
91
92
# ICU4J DecompositionType constant -> tag text used inside "<...>" by
# decomposition().  Every tag is simply the lower-cased constant name, except
# DecompositionType.NONE, which maps to None (no tag).
_decomp = dict(
    (
        getattr(DecompositionType, const_name),
        None if const_name == "NONE" else const_name.lower(),
    )
    for const_name in (
        "CANONICAL",
        "CIRCLE",
        "COMPAT",
        "FINAL",
        "FONT",
        "FRACTION",
        "INITIAL",
        "ISOLATED",
        "MEDIAL",
        "NARROW",
        "NOBREAK",
        "NONE",
        "SMALL",
        "SQUARE",
        "SUB",
        "SUPER",
        "VERTICAL",
        "WIDE",
    )
)
113
def _get_decomp_type(unichr):
    """Return the _decomp tag for *unichr*, or None if it has no tag.

    The tag is looked up from ICU's DECOMPOSITION_TYPE property, with one
    hard-wired exception for U+2044 FRACTION SLASH.
    """
    if unichr != u"\u2044":
        type_id = UCharacter.getIntPropertyValue(
            ord(unichr), UProperty.DECOMPOSITION_TYPE)
        return _decomp[type_id]
    # FRACTION SLASH is special-cased for CPython compatibility even though
    # it is reported as not being combining, eg, see
    # http://www.fileformat.info/info/unicode/char/2044/index.htm
    return "fraction"
121
def decomposition(unichr):
    """Return the decomposition mapping of *unichr* as a string.

    The result is a space-separated list of upper-case hexadecimal code
    points (at least four digits each).  When the character's decomposition
    type is a compatibility one, the list is prefixed with the type in angle
    brackets, e.g. "<fraction> 0031 2044 0034".  Canonical decompositions
    carry no prefix, and a character that does not decompose yields "".
    This is the format CPython's unicodedata.decomposition uses.

    Raises:
        TypeError: if *unichr* is not a single unicode character.
    """
    _validate_unichr(unichr)
    d = Normalizer.decompose(unichr, True)
    if d == unichr:
        # The character decomposes to itself, i.e. it has no mapping.  This
        # also stops U+2044 (special-cased in _get_decomp_type) from being
        # reported as "<fraction> 2044".
        return ""

    hexed = " ".join("{0:04X}".format(ord(c)) for c in d)

    # The tag is a property of the character being asked about, not of the
    # characters it decomposes into, so look it up on the input itself.
    decomp_type = _get_decomp_type(unichr)
    if decomp_type is None or decomp_type == "canonical":
        # Canonical mappings are written without a "<...>" prefix.
        return hexed
    # NOTE(review): Normalizer.decompose(..., True) presumably yields the
    # full (recursive) decomposition, whereas CPython reports only the
    # single-level mapping from UnicodeData.txt -- results for characters
    # with multi-level decompositions may still differ; confirm.
    return "<{}> {}".format(decomp_type, hexed)
141
142
# Mapping the ICU4J enumerations for category, bidirectional, and
# east_asian_width to the underlying property values that Python uses
# from UnicodeData.txt required manually cross-referencing the
# following two files:
#
# http://icu-project.org/apiref/icu4j/constant-values.html
# http://www.unicode.org/Public/6.3.0/ucd/PropertyValueAliases.txt
150
# ICU4J ECharacterCategory constant -> two-letter Unicode General_Category
# code, as returned by category().
_cat = {
    ECharacterCategory.COMBINING_SPACING_MARK: "Mc",
    ECharacterCategory.CONNECTOR_PUNCTUATION: "Pc",
    ECharacterCategory.CONTROL: "Cc",
    ECharacterCategory.CURRENCY_SYMBOL: "Sc",
    ECharacterCategory.DASH_PUNCTUATION: "Pd",
    ECharacterCategory.DECIMAL_DIGIT_NUMBER: "Nd",
    ECharacterCategory.ENCLOSING_MARK: "Me",
    ECharacterCategory.END_PUNCTUATION: "Pe",
    ECharacterCategory.FINAL_PUNCTUATION: "Pf",
    ECharacterCategory.FORMAT: "Cf",
    # per http://icu-project.org/apiref/icu4j/com/ibm/icu/lang/UCharacterEnums.ECharacterCategory.html#GENERAL_OTHER_TYPES
    # - no characters in [UnicodeData.txt] have this property.  It must map
    # to the plain code "Cn": any other text here is not a valid category,
    # and if this constant shares its value with UNASSIGNED (as the ICU4J
    # constant table appears to show -- confirm) the result would otherwise
    # depend on which of the two entries comes last in this literal.
    ECharacterCategory.GENERAL_OTHER_TYPES: "Cn",
    ECharacterCategory.INITIAL_PUNCTUATION: "Pi",
    ECharacterCategory.LETTER_NUMBER: "Nl",
    ECharacterCategory.LINE_SEPARATOR: "Zl",
    ECharacterCategory.LOWERCASE_LETTER: "Ll",
    ECharacterCategory.MATH_SYMBOL: "Sm",
    ECharacterCategory.MODIFIER_LETTER: "Lm",
    ECharacterCategory.MODIFIER_SYMBOL: "Sk",
    ECharacterCategory.NON_SPACING_MARK: "Mn",
    ECharacterCategory.OTHER_LETTER: "Lo",
    ECharacterCategory.OTHER_NUMBER: "No",
    ECharacterCategory.OTHER_PUNCTUATION: "Po",
    ECharacterCategory.OTHER_SYMBOL: "So",
    ECharacterCategory.PARAGRAPH_SEPARATOR: "Zp",
    ECharacterCategory.PRIVATE_USE: "Co",
    ECharacterCategory.SPACE_SEPARATOR: "Zs",
    ECharacterCategory.START_PUNCTUATION: "Ps",
    ECharacterCategory.SURROGATE: "Cs",
    ECharacterCategory.TITLECASE_LETTER: "Lt",
    ECharacterCategory.UNASSIGNED: "Cn",
    ECharacterCategory.UPPERCASE_LETTER: "Lu",
}
186
def category(unichr):
    """Return the general-category code that _cat holds for *unichr*.

    The ICU type id comes from UCharacter.getType.

    Raises:
        TypeError: if *unichr* is not a single unicode character.
    """
    icu_type = UCharacter.getType(_get_codepoint(unichr))
    return _cat[icu_type]
189
190
# ICU4J ECharacterDirection constant -> Unicode Bidi_Class short alias, as
# returned by bidirectional().  The aliases are the upper-case abbreviations
# listed in PropertyValueAliases.txt (e.g. "AN" for Arabic_Number).
_dir = {
    ECharacterDirection.ARABIC_NUMBER: "AN",
    ECharacterDirection.BLOCK_SEPARATOR: "B",
    ECharacterDirection.BOUNDARY_NEUTRAL: "BN",
    ECharacterDirection.COMMON_NUMBER_SEPARATOR: "CS",
    ECharacterDirection.DIR_NON_SPACING_MARK: "NSM",
    ECharacterDirection.EUROPEAN_NUMBER: "EN",
    ECharacterDirection.EUROPEAN_NUMBER_SEPARATOR: "ES",
    ECharacterDirection.EUROPEAN_NUMBER_TERMINATOR: "ET",
    ECharacterDirection.FIRST_STRONG_ISOLATE: "FSI",
    ECharacterDirection.LEFT_TO_RIGHT: "L",
    ECharacterDirection.LEFT_TO_RIGHT_EMBEDDING: "LRE",
    ECharacterDirection.LEFT_TO_RIGHT_ISOLATE: "LRI",
    ECharacterDirection.LEFT_TO_RIGHT_OVERRIDE: "LRO",
    ECharacterDirection.OTHER_NEUTRAL: "ON",
    ECharacterDirection.POP_DIRECTIONAL_FORMAT: "PDF",
    ECharacterDirection.POP_DIRECTIONAL_ISOLATE: "PDI",
    ECharacterDirection.RIGHT_TO_LEFT: "R",
    ECharacterDirection.RIGHT_TO_LEFT_ARABIC: "AL",
    ECharacterDirection.RIGHT_TO_LEFT_EMBEDDING: "RLE",
    ECharacterDirection.RIGHT_TO_LEFT_ISOLATE: "RLI",
    ECharacterDirection.RIGHT_TO_LEFT_OVERRIDE: "RLO",
    ECharacterDirection.SEGMENT_SEPARATOR: "S",
    ECharacterDirection.WHITE_SPACE_NEUTRAL: "WS",
}
216
def bidirectional(unichr):
    """Return the bidirectional-class string that _dir holds for *unichr*.

    The ICU direction id comes from UCharacter.getDirection.

    Raises:
        TypeError: if *unichr* is not a single unicode character.
    """
    icu_direction = UCharacter.getDirection(_get_codepoint(unichr))
    return _dir[icu_direction]
219
220
def combining(unichr):
    """Return the combining class UCharacter.getCombiningClass gives *unichr*.

    Raises:
        TypeError: if *unichr* is not a single unicode character.
    """
    codepoint = _get_codepoint(unichr)
    return UCharacter.getCombiningClass(codepoint)
223
224
def mirrored(unichr):
    """Return 1 if ICU reports *unichr* as a mirrored character, else 0.

    UCharacter.isMirrored answers with a boolean; it is converted to an
    integer because unicodedata.mirrored is specified to return an integer,
    so that e.g. ``str(mirrored(c))`` gives "1"/"0" as on CPython.

    Raises:
        TypeError: if *unichr* is not a single unicode character.
    """
    return int(UCharacter.isMirrored(_get_codepoint(unichr)))
227
228
# ICU4J EastAsianWidth constant -> East Asian Width code, as returned by
# east_asian_width().  See http://www.unicode.org/reports/tr11/
_eaw = dict([
    (EastAsianWidth.AMBIGUOUS, "A"),
    (EastAsianWidth.COUNT, "?"),  # apparently not used, see above TR
    (EastAsianWidth.FULLWIDTH, "F"),
    (EastAsianWidth.HALFWIDTH, "H"),
    (EastAsianWidth.NARROW, "Na"),
    (EastAsianWidth.NEUTRAL, "N"),
    (EastAsianWidth.WIDE, "W"),
])
239
def east_asian_width(unichr):
    """Return the East Asian Width code that _eaw holds for *unichr*.

    The ICU value is read from the UProperty.EAST_ASIAN_WIDTH property.

    Raises:
        TypeError: if *unichr* is not a single unicode character.
    """
    codepoint = _get_codepoint(unichr)
    width_id = UCharacter.getIntPropertyValue(codepoint, UProperty.EAST_ASIAN_WIDTH)
    return _eaw[width_id]
242
243
def normalize(form, unistr):
    """
    Normalize the Unicode string unistr to the normal form named by 'form'.

    'form' must be one of 'NFC', 'NFKC', 'NFD' or 'NFKD'; any other value
    raises ValueError.  The work itself is done by ICU's Normalizer.
    """
    if form not in _forms:
        raise ValueError('invalid normalization form')
    return Normalizer.normalize(unistr, _forms[form])
256
257
def get_icu_version():
    """Return the highest Unicode version VersionInfo exposes, as a string.

    Every attribute of VersionInfo whose name starts with "UNICODE_" is
    read as a version object; the largest (major, minor, milli) triple
    among them is rendered as "major.minor.milli".
    """
    candidates = [
        getattr(VersionInfo, attr)
        for attr in VersionInfo.__dict__.iterkeys()
        if attr.startswith("UNICODE_")
    ]
    newest = max([(v.getMajor(), v.getMinor(), v.getMilli()) for v in candidates])
    return ".".join(map(str, newest))


# Computed once at import time.
unidata_version = get_icu_version()
268