#!/usr/local/bin/python3.8
# vim:fileencoding=utf-8

__license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

# Setup code {{{
import codecs
import sys

from calibre.utils.config_base import tweaks, prefs
from calibre_extensions import icu as _icu
from polyglot.builtins import cmp

# Lazily created collator singletons; all of them are reset by change_locale()
_locale = _collator = _primary_collator = _sort_collator = _non_numeric_sort_collator = _numeric_collator = _case_sensitive_collator = None
cmp

# Incremented by change_locale() so that the closures built by
# make_sort_key_func()/make_two_arg_func() know when the collator method they
# cached belongs to a discarded collator and has to be looked up again.
_locale_generation = 0

_none = ''
_none2 = b''
_cmap = {}  # collator -> frozenset of its contractions, see contractions()

icu_unicode_version = getattr(_icu, 'unicode_version', None)
_nmodes = {m: getattr(_icu, m) for m in ('NFC', 'NFD', 'NFKC', 'NFKD')}

# Ensure that the python internal filesystem and default encodings are not ASCII


def is_ascii(name):
    """Return True if the codec called ``name`` is ASCII or cannot be looked up."""
    # NOTE(review): on Python 3 ``CodecInfo.name`` is a str, so comparing it
    # with a bytes literal looks like it can never be true. Left unchanged
    # because fixing it would start calling the _icu.set_*_encoding hooks
    # below, whose behaviour is not visible from this file -- confirm intent.
    try:
        return codecs.lookup(name).name == b'ascii'
    except (TypeError, LookupError):
        return True


# Best effort only: a failure here is reported but must not prevent the module
# from being imported.
try:
    if is_ascii(sys.getdefaultencoding()):
        _icu.set_default_encoding(b'utf-8')
except Exception:
    import traceback
    traceback.print_exc()

try:
    if is_ascii(sys.getfilesystemencoding()):
        _icu.set_filesystem_encoding(b'utf-8')
except Exception:
    import traceback
    traceback.print_exc()
del is_ascii


def collator():
    """Return the shared base collator, creating it on first use.

    The locale is taken from the ``locale_for_sorting`` tweak when that is
    set, otherwise from ``get_lang()``. If a collator cannot be created for
    that locale a message is printed and an English collator is used instead.
    """
    global _collator, _locale
    if _collator is None:
        if _locale is None:
            from calibre.utils.localization import get_lang
            if tweaks['locale_for_sorting']:
                _locale = tweaks['locale_for_sorting']
            else:
                _locale = get_lang()
        try:
            _collator = _icu.Collator(_locale)
        except Exception as e:
            print('Failed to load collator for locale: %r with error %r, using English' % (_locale, e))
            _collator = _icu.Collator('en')
    return _collator


def change_locale(locale=None):
    """Discard every cached collator and use ``locale`` from now on.

    Passing None makes the next call to collator() pick the locale again from
    the tweaks/interface language.
    """
    global _locale, _collator, _primary_collator, _sort_collator, _numeric_collator, _case_sensitive_collator, _non_numeric_sort_collator
    global _locale_generation
    _collator = _primary_collator = _sort_collator = _numeric_collator = _case_sensitive_collator = _non_numeric_sort_collator = None
    _locale = locale
    # Invalidate the collator methods cached inside sort_key(), strcmp(), etc.
    _locale_generation += 1


def primary_collator():
    'Ignores case differences and accented characters'
    global _primary_collator
    if _primary_collator is None:
        _primary_collator = collator().clone()
        _primary_collator.strength = _icu.UCOL_PRIMARY
    return _primary_collator


def sort_collator():
    'Ignores case differences and recognizes numbers in strings (if the tweak is set)'
    global _sort_collator
    if _sort_collator is None:
        _sort_collator = collator().clone()
        _sort_collator.strength = _icu.UCOL_SECONDARY
        _sort_collator.numeric = prefs['numeric_collation']
    return _sort_collator


def non_numeric_sort_collator():
    'Ignores case differences only'
    global _non_numeric_sort_collator
    if _non_numeric_sort_collator is None:
        _non_numeric_sort_collator = collator().clone()
        _non_numeric_sort_collator.strength = _icu.UCOL_SECONDARY
        _non_numeric_sort_collator.numeric = False
    return _non_numeric_sort_collator


def numeric_collator():
    'Uses natural sorting for numbers inside strings so something2 will sort before something10'
    global _numeric_collator
    if _numeric_collator is None:
        _numeric_collator = collator().clone()
        _numeric_collator.strength = _icu.UCOL_SECONDARY
        _numeric_collator.numeric = True
    return _numeric_collator


def case_sensitive_collator():
    'Always sorts upper case letter before lower case'
    global _case_sensitive_collator
    if _case_sensitive_collator is None:
        _case_sensitive_collator = collator().clone()
        _case_sensitive_collator.numeric = sort_collator().numeric
        _case_sensitive_collator.upper_first = True
    return _case_sensitive_collator


def make_sort_key_func(collator_function, func_name='sort_key'):
    """Build a one-argument function that calls a method of a lazy collator.

    :param collator_function: zero-argument callable returning the collator.
    :param func_name: name of the collator method to call.
    :return: a function ``f(a)``. If the collator method raises TypeError,
        bytes input is decoded with the default encoding and retried (the
        original bytes are returned when decoding fails); for any other input
        ``b''`` is returned.
    """
    func = None
    generation = -1

    def sort_key(a):
        nonlocal func, generation
        # The bound method is cached for speed, but it has to be refreshed
        # after change_locale(), otherwise the old locale would be used forever.
        if func is None or generation != _locale_generation:
            generation = _locale_generation
            func = getattr(collator_function(), func_name)

        try:
            return func(a)
        except TypeError:
            if isinstance(a, bytes):
                try:
                    a = a.decode(sys.getdefaultencoding())
                except ValueError:
                    return a
                return func(a)
        return b''

    return sort_key


def make_two_arg_func(collator_function, func_name='strcmp'):
    """Build a two-argument function that calls a method of a lazy collator.

    :param collator_function: zero-argument callable returning the collator.
    :param func_name: name of the collator method to call.
    :return: a function ``f(a, b)``. If the collator method raises TypeError,
        bytes arguments are decoded with the default encoding and None is
        replaced by ``''`` before retrying; when decoding fails the result of
        ``cmp(a, b)`` is returned instead.
    """
    func = None
    generation = -1

    def two_args(a, b):
        nonlocal func, generation
        # See make_sort_key_func() for why the cached method is refreshed.
        if func is None or generation != _locale_generation:
            generation = _locale_generation
            func = getattr(collator_function(), func_name)

        try:
            return func(a, b)
        except TypeError:
            if isinstance(a, bytes):
                try:
                    a = a.decode(sys.getdefaultencoding())
                except Exception:
                    return cmp(a, b)
            elif a is None:
                a = ''
            if isinstance(b, bytes):
                try:
                    b = b.decode(sys.getdefaultencoding())
                except Exception:
                    return cmp(a, b)
            elif b is None:
                b = ''
            return func(a, b)

    return two_args


def make_change_case_func(which):
    """Build a function that changes the case of a string via ``_icu.change_case``.

    :param which: the case constant passed through to ``_icu.change_case``.
    :return: a function ``f(x)``. Bytes input is decoded with the default
        encoding first when needed; undecodable bytes are returned unchanged.
        A TypeError for any other kind of input is re-raised.
    """

    def change_case(x):
        try:
            try:
                return _icu.change_case(x, which, _locale)
            except NotImplementedError:
                pass
            collator()  # sets _locale
            return _icu.change_case(x, which, _locale)
        except TypeError:
            if isinstance(x, bytes):
                try:
                    x = x.decode(sys.getdefaultencoding())
                except ValueError:
                    return x
                return _icu.change_case(x, which, _locale)
            raise
    return change_case
# }}}


# ################ The string functions ########################################
sort_key = make_sort_key_func(sort_collator)
numeric_sort_key = make_sort_key_func(numeric_collator)
primary_sort_key = make_sort_key_func(primary_collator)
case_sensitive_sort_key = make_sort_key_func(case_sensitive_collator)
collation_order = make_sort_key_func(sort_collator, 'collation_order')
collation_order_for_partitioning = make_sort_key_func(non_numeric_sort_collator, 'collation_order')

strcmp = make_two_arg_func(sort_collator)
case_sensitive_strcmp = make_two_arg_func(case_sensitive_collator)
primary_strcmp = make_two_arg_func(primary_collator)
upper = make_change_case_func(_icu.UPPER_CASE)
lower = make_change_case_func(_icu.LOWER_CASE)
title_case = make_change_case_func(_icu.TITLE_CASE)


def capitalize(x):
    """Return ``x`` with its first element upper-cased and the rest lower-cased.

    ``x`` is returned unchanged when it is empty or cannot be indexed/sliced.
    """
    try:
        return upper(x[0]) + lower(x[1:])
    except (IndexError, TypeError, AttributeError):
        return x


try:
    swapcase = _icu.swap_case
except AttributeError:  # For people running from source
    swapcase = lambda x:x.swapcase()

find = make_two_arg_func(collator, 'find')
primary_find = make_two_arg_func(primary_collator, 'find')
contains = make_two_arg_func(collator, 'contains')
primary_contains = make_two_arg_func(primary_collator, 'contains')
startswith = make_two_arg_func(collator, 'startswith')
primary_startswith = make_two_arg_func(primary_collator, 'startswith')
safe_chr = _icu.chr
ord_string = _icu.ord_string


def character_name(string):
    """Return the name ICU reports for ``string``, or None if there is none
    or the lookup fails."""
    try:
        return _icu.character_name(str(string)) or None
    except (TypeError, ValueError, KeyError):
        return None


def character_name_from_code(code):
    """Return the name ICU reports for the code ``code``, or ``''`` if there
    is none or the lookup fails."""
    try:
        return _icu.character_name_from_code(code) or ''
    except (TypeError, ValueError, KeyError):
        return ''


def normalize(text, mode='NFC'):
    """Normalize ``str(text)`` with ICU.

    :param mode: one of ``'NFC'``, ``'NFD'``, ``'NFKC'``, ``'NFKD'``; any other
        value raises KeyError.
    """
    # This is very slightly slower than using unicodedata.normalize, so stick with
    # that unless you have very good reasons not to. Also, its speed
    # decreases on wide python builds, where conversion to/from ICU's string
    # representation is slower.
    return _icu.normalize(_nmodes[mode], str(text))


def contractions(col=None):
    """Return the frozenset of non-empty contractions of a collator.

    :param col: the collator to query; defaults to the shared base collator.

    The result is cached per collator in ``_cmap``.
    """
    col = col or _collator
    if col is None:
        col = collator()
    # The cache is keyed by the collator object itself; looking it up by
    # anything else would recompute the contractions on every call.
    ans = _cmap.get(col)
    if ans is None:
        ans = frozenset(filter(None, col.contractions()))
        _cmap[col] = ans
    return ans


def partition_by_first_letter(items, reverse=False, key=lambda x:x):
    """Group ``items`` by the first letter of ``key(item)``.

    The items are sorted with sort_key() and a new group is started whenever
    the collation ordinal of the upper-cased key changes.

    :param items: iterable of items to group.
    :param reverse: sort in descending order.
    :param key: callable extracting the string to group by from an item.
    :return: an OrderedDict mapping a first letter to the list of its items,
        in sorted order.
    """
    # Build a list of 'equal' first letters by noticing changes
    # in ICU's 'ordinal' for the first letter.
    from collections import OrderedDict
    items = sorted(items, key=lambda x:sort_key(key(x)), reverse=reverse)
    ans = OrderedDict()
    last_c, last_ordnum = ' ', 0
    for item in items:
        # Use this module's own upper() rather than a name that is expected
        # to have been injected into builtins by other code.
        c = upper(key(item) or ' ')
        ordnum, ordlen = collation_order(c)
        if last_ordnum != ordnum:
            last_c = c[0:1]
            last_ordnum = ordnum
        try:
            ans[last_c].append(item)
        except KeyError:
            ans[last_c] = [item]
    return ans


# Return the number of unicode codepoints in a string
string_length = len

# Return the number of UTF-16 code units in a string
utf16_length = _icu.utf16_length

################################################################################

if __name__ == '__main__':
    from calibre.utils.icu_test import run
    run(verbosity=4)