"""Jython implementation of the ``unicodedata`` module on top of ICU4J.

Each public function validates that it was given a single ``unicode``
character (where applicable), asks ICU4J's ``UCharacter`` / ``Normalizer``
for the property, and translates ICU's integer enumerations into the short
string codes that Python's ``unicodedata`` API exposes.
"""

import java.lang.Character
try:
    # import from jarjar-ed version
    from org.python.icu.text import Normalizer
    from org.python.icu.lang import UCharacter, UProperty
    from org.python.icu.util import VersionInfo
    from org.python.icu.lang.UCharacter import EastAsianWidth, DecompositionType
    from org.python.icu.lang.UCharacterEnums import ECharacterCategory, ECharacterDirection
except ImportError:
    # development version of Jython, so use extlibs
    from com.ibm.icu.text import Normalizer
    from com.ibm.icu.lang import UCharacter, UProperty
    from com.ibm.icu.util import VersionInfo
    from com.ibm.icu.lang.UCharacter import EastAsianWidth, DecompositionType
    from com.ibm.icu.lang.UCharacterEnums import ECharacterCategory, ECharacterDirection


__all__ = (
    "bidirectional", "category", "combining", "decimal", "decomposition", "digit", "east_asian_width",
    "lookup", "mirrored", "name", "normalize", "numeric", "unidata_version")


# Normalization form names accepted by normalize(), mapped to ICU modes.
_forms = {
    'NFC': Normalizer.NFC,
    'NFKC': Normalizer.NFKC,
    'NFD': Normalizer.NFD,
    'NFKD': Normalizer.NFKD
}

Nonesuch = object()  # to distinguish from None, which is a valid return value for some functions


def _validate_unichr(unichr):
    """Raise TypeError unless *unichr* is a ``unicode`` object of length 1."""
    if not isinstance(unichr, unicode):
        raise TypeError("must be unicode, not {}".format(type(unichr).__name__))
    if len(unichr) != 1:
        raise TypeError("need a single Unicode character as parameter")


def _get_codepoint(unichr):
    """Validate *unichr* and return its integer code point."""
    _validate_unichr(unichr)
    return ord(unichr)


def name(unichr, default=Nonesuch):
    """Return the name ICU reports for the character *unichr*.

    If ICU has no name for it, return *default* when one was supplied
    (``None`` is an acceptable default), otherwise raise ValueError.
    """
    n = UCharacter.getName(_get_codepoint(unichr))
    if n is None:
        if default is not Nonesuch:
            return default
        else:
            raise ValueError("no such name")
    return n


def lookup(name):
    """Return the character whose name is *name*.

    Raises KeyError when ICU reports no such character (code point -1).
    """
    codepoint = UCharacter.getCharFromName(name)
    if codepoint == -1:
        # The closing quote was missing from this message in the original.
        raise KeyError("undefined character name '{}'".format(name))
    return unichr(codepoint)


def digit(unichr, default=Nonesuch):
    """Return the digit value of *unichr* as reported by ``UCharacter.digit``.

    If ICU returns -1, return *default* when supplied, otherwise raise
    ValueError.
    """
    d = UCharacter.digit(_get_codepoint(unichr))
    if d == -1:
        if default is not Nonesuch:
            return default
        else:
            raise ValueError("not a digit")
    return d


def decimal(unichr, default=Nonesuch):
    """Return the decimal value (0-9) of *unichr*.

    The value comes from ``UCharacter.getNumericValue``; anything outside
    0..9 is treated as "not a decimal": *default* is returned when supplied,
    otherwise ValueError is raised.
    """
    d = UCharacter.getNumericValue(_get_codepoint(unichr))
    if d < 0 or d > 9:
        if default is not Nonesuch:
            return default
        else:
            raise ValueError("not a decimal")
    return d


def numeric(unichr, default=Nonesuch):
    """Return the numeric value of *unichr* from ``getUnicodeNumericValue``.

    If ICU returns ``UCharacter.NO_NUMERIC_VALUE``, return *default* when
    supplied, otherwise raise ValueError.
    """
    n = UCharacter.getUnicodeNumericValue(_get_codepoint(unichr))
    if n == UCharacter.NO_NUMERIC_VALUE:
        if default is not Nonesuch:
            return default
        else:
            raise ValueError("not a numeric")
    return n


# ICU decomposition-type enumeration -> tag used in decomposition() output.
# NONE maps to None, meaning "no tag".
_decomp = {
    DecompositionType.CANONICAL: "canonical",
    DecompositionType.CIRCLE: "circle",
    DecompositionType.COMPAT: "compat",
    DecompositionType.FINAL: "final",
    DecompositionType.FONT: "font",
    DecompositionType.FRACTION: "fraction",
    DecompositionType.INITIAL: "initial",
    DecompositionType.ISOLATED: "isolated",
    DecompositionType.MEDIAL: "medial",
    DecompositionType.NARROW: "narrow",
    DecompositionType.NOBREAK: "nobreak",
    DecompositionType.NONE: None,
    DecompositionType.SMALL: "small",
    DecompositionType.SQUARE: "square",
    DecompositionType.SUB: "sub",
    DecompositionType.SUPER: "super",
    DecompositionType.VERTICAL: "vertical",
    DecompositionType.WIDE: "wide"
}


def _get_decomp_type(unichr):
    """Return the decomposition tag for *unichr*, or None if it has none."""
    if unichr == u"\u2044":  # FRACTION SLASH
        # special case this for CPython compatibility even though this returns as not being combining, eg, see
        # http://www.fileformat.info/info/unicode/char/2044/index.htm
        return "fraction"
    else:
        return _decomp[UCharacter.getIntPropertyValue(ord(unichr), UProperty.DECOMPOSITION_TYPE)]


def decomposition(unichr):
    """Return the decomposition mapping of *unichr* as a string.

    The result is the space-separated, 4-digit (minimum) uppercase hex code
    points of ``Normalizer.decompose(unichr, True)``, prefixed with
    ``<tag> `` when a decomposition tag is found. A character that
    decomposes to a single code point and has no tag yields ``""``.
    """
    _validate_unichr(unichr)
    # Second argument requests ICU's compatibility decomposition
    # (per ICU4J's Normalizer.decompose(str, compat) signature).
    d = Normalizer.decompose(unichr, True)
    decomp_type = None
    if len(d) == 1:
        # Single-code-point result: the tag, if any, belongs to the input.
        decomp_type = _get_decomp_type(unichr)
    else:
        # Otherwise use the first tag carried by any decomposed character.
        for c in d:
            decomp_type = _get_decomp_type(c)
            if decomp_type is not None:
                break
    hexed = " ".join(("{0:04X}".format(ord(c)) for c in d))
    if decomp_type:
        return "<{}> {}".format(decomp_type, hexed)
    elif len(d) == 1:
        return ""
    else:
        return hexed


# To map from ICU4J enumerations for category, bidirection, and
# east_asian_width to the underlying property values that Python uses
# from UnicodeData.txt required a manual mapping between the following
# two files:
#
# http://icu-project.org/apiref/icu4j/constant-values.html
# http://www.unicode.org/Public/6.3.0/ucd/PropertyValueAliases.txt

_cat = {
    ECharacterCategory.COMBINING_SPACING_MARK: "Mc",
    ECharacterCategory.CONNECTOR_PUNCTUATION: "Pc",
    ECharacterCategory.CONTROL: "Cc",
    ECharacterCategory.CURRENCY_SYMBOL: "Sc",
    ECharacterCategory.DASH_PUNCTUATION: "Pd",
    ECharacterCategory.DECIMAL_DIGIT_NUMBER: "Nd",
    ECharacterCategory.ENCLOSING_MARK: "Me",
    ECharacterCategory.END_PUNCTUATION: "Pe",
    ECharacterCategory.FINAL_PUNCTUATION: "Pf",
    ECharacterCategory.FORMAT: "Cf",
    # per http://icu-project.org/apiref/icu4j/com/ibm/icu/lang/UCharacterEnums.ECharacterCategory.html#GENERAL_OTHER_TYPES
    # - no characters in [UnicodeData.txt] have this property
    # Mapped to the plain "Cn" code so category() can only ever return a
    # two-letter general category.
    # NOTE(review): this constant appears to share its value with UNASSIGNED
    # in ICU4J, in which case the UNASSIGNED entry below supplies the value
    # for that key anyway - confirm against the ICU4J constant values.
    ECharacterCategory.GENERAL_OTHER_TYPES: "Cn",
    ECharacterCategory.INITIAL_PUNCTUATION: "Pi",
    ECharacterCategory.LETTER_NUMBER: "Nl",
    ECharacterCategory.LINE_SEPARATOR: "Zl",
    ECharacterCategory.LOWERCASE_LETTER: "Ll",
    ECharacterCategory.MATH_SYMBOL: "Sm",
    ECharacterCategory.MODIFIER_LETTER: "Lm",
    ECharacterCategory.MODIFIER_SYMBOL: "Sk",
    ECharacterCategory.NON_SPACING_MARK: "Mn",
    ECharacterCategory.OTHER_LETTER: "Lo",
    ECharacterCategory.OTHER_NUMBER: "No",
    ECharacterCategory.OTHER_PUNCTUATION: "Po",
    ECharacterCategory.OTHER_SYMBOL: "So",
    ECharacterCategory.PARAGRAPH_SEPARATOR: "Zp",
    ECharacterCategory.PRIVATE_USE: "Co",
    ECharacterCategory.SPACE_SEPARATOR: "Zs",
    ECharacterCategory.START_PUNCTUATION: "Ps",
    ECharacterCategory.SURROGATE: "Cs",
    ECharacterCategory.TITLECASE_LETTER: "Lt",
    ECharacterCategory.UNASSIGNED: "Cn",
    ECharacterCategory.UPPERCASE_LETTER: "Lu",
}


def category(unichr):
    """Return the general category code (e.g. ``"Lu"``) of *unichr*."""
    return _cat[UCharacter.getType(_get_codepoint(unichr))]


_dir = {
    ECharacterDirection.ARABIC_NUMBER: "An",
    ECharacterDirection.BLOCK_SEPARATOR: "B",
    ECharacterDirection.BOUNDARY_NEUTRAL: "BN",
    ECharacterDirection.COMMON_NUMBER_SEPARATOR: "CS",
    ECharacterDirection.DIR_NON_SPACING_MARK: "NSM",
    ECharacterDirection.EUROPEAN_NUMBER: "EN",
    ECharacterDirection.EUROPEAN_NUMBER_SEPARATOR: "ES",
    ECharacterDirection.EUROPEAN_NUMBER_TERMINATOR: "ET",
    ECharacterDirection.FIRST_STRONG_ISOLATE: "FSI",
    ECharacterDirection.LEFT_TO_RIGHT: "L",
    ECharacterDirection.LEFT_TO_RIGHT_EMBEDDING: "LRE",
    ECharacterDirection.LEFT_TO_RIGHT_ISOLATE: "LRI",
    ECharacterDirection.LEFT_TO_RIGHT_OVERRIDE: "LRO",
    ECharacterDirection.OTHER_NEUTRAL: "ON",
    ECharacterDirection.POP_DIRECTIONAL_FORMAT: "PDF",
    ECharacterDirection.POP_DIRECTIONAL_ISOLATE: "PDI",
    ECharacterDirection.RIGHT_TO_LEFT: "R",
    ECharacterDirection.RIGHT_TO_LEFT_ARABIC: "AL",
    ECharacterDirection.RIGHT_TO_LEFT_EMBEDDING: "RLE",
    ECharacterDirection.RIGHT_TO_LEFT_ISOLATE: "RLI",
    ECharacterDirection.RIGHT_TO_LEFT_OVERRIDE: "RLO",
    ECharacterDirection.SEGMENT_SEPARATOR: "S",
    ECharacterDirection.WHITE_SPACE_NEUTRAL: "WS"
}


def bidirectional(unichr):
    """Return the bidirectional class code of *unichr* (see ``_dir``)."""
    return _dir[UCharacter.getDirection(_get_codepoint(unichr))]


def combining(unichr):
    """Return the combining class ICU reports for *unichr*."""
    return UCharacter.getCombiningClass(_get_codepoint(unichr))


def mirrored(unichr):
    """Return the result of ``UCharacter.isMirrored`` for *unichr*."""
    return UCharacter.isMirrored(_get_codepoint(unichr))


_eaw = {
    # http://www.unicode.org/reports/tr11/
    EastAsianWidth.AMBIGUOUS: "A",
    EastAsianWidth.COUNT: "?",  # apparently not used, see above TR
    EastAsianWidth.FULLWIDTH: "F",
    EastAsianWidth.HALFWIDTH: "H",
    EastAsianWidth.NARROW: "Na",
    EastAsianWidth.NEUTRAL: "N",
    EastAsianWidth.WIDE: "W"
}


def east_asian_width(unichr):
    """Return the East Asian width code of *unichr* (see ``_eaw``)."""
    return _eaw[UCharacter.getIntPropertyValue(_get_codepoint(unichr), UProperty.EAST_ASIAN_WIDTH)]


def normalize(form, unistr):
    """
    Return the normal form 'form' for the Unicode string unistr.  Valid
    values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.

    Raises ValueError for any other value of 'form'.
    """

    try:
        normalizer_form = _forms[form]
    except KeyError:
        raise ValueError('invalid normalization form')

    return Normalizer.normalize(unistr, normalizer_form)


def get_icu_version():
    """Return the highest ``UNICODE_*`` version on ``VersionInfo``.

    Every attribute of ``VersionInfo`` whose name starts with ``UNICODE_``
    is read as a (major, minor, milli) triple; the largest triple is
    returned as a dotted string ``"major.minor.milli"``.
    """
    versions = []
    for k in VersionInfo.__dict__.iterkeys():
        if k.startswith("UNICODE_"):
            v = getattr(VersionInfo, k)
            versions.append((v.getMajor(), v.getMinor(), v.getMilli()))
    return ".".join(str(x) for x in max(versions))


unidata_version = get_icu_version()