1#!/usr/local/bin/python3.8
2# vim:fileencoding=utf-8
3
4__license__   = 'GPL v3'
5__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
6__docformat__ = 'restructuredtext en'
7
8# Setup code {{{
9import codecs
10import sys
11
12from calibre.utils.config_base import tweaks, prefs
13from calibre_extensions import icu as _icu
14from polyglot.builtins import cmp
15
# Lazily created module state: the active locale name plus one cached
# collator per flavour. Everything starts out unset and change_locale()
# puts all of these names back to None.
_locale = None
_collator = None
_primary_collator = None
_sort_collator = None
_non_numeric_sort_collator = None
_numeric_collator = None
_case_sensitive_collator = None
cmp  # bare reference to the imported name; it has no effect at runtime

_none, _none2 = '', b''
# Cache used by contractions()
_cmap = {}

# Unicode version reported by the ICU extension, or None when the extension
# does not expose that attribute
icu_unicode_version = getattr(_icu, 'unicode_version', None)
# Maps a normalization mode name to the attribute of the same name on the
# ICU extension
_nmodes = {
    mode_name: getattr(_icu, mode_name)
    for mode_name in ('NFC', 'NFD', 'NFKC', 'NFKD')
}
25
26# Ensure that the python internal filesystem and default encodings are not ASCII
27
28
def is_ascii(name):
    """Return True if the codec called *name* is the ASCII codec.

    The name is resolved with codecs.lookup(), so aliases of ASCII are
    recognised as well. A name that cannot be looked up at all (unknown codec,
    or not a string) is also reported as ASCII, which makes the callers treat
    it as an encoding that needs replacing.

    :param name: codec name to check
    :return: True for ASCII or an unresolvable name, False otherwise
    """
    try:
        # CodecInfo.name is a str on Python 3, so it has to be compared with a
        # str; a comparison against the bytes literal b'ascii' is never equal.
        return codecs.lookup(name).name == 'ascii'
    except (TypeError, LookupError):
        return True
34
35
# Best effort: when is_ascii() reports the default or filesystem encoding as
# ASCII, call the matching setter on the ICU extension with b'utf-8'. A
# failure is printed as a traceback and otherwise ignored so that importing
# this module does not fail here.
try:
    if is_ascii(sys.getdefaultencoding()):
        _icu.set_default_encoding(b'utf-8')
except Exception:  # not a bare except: KeyboardInterrupt/SystemExit must propagate
    import traceback
    traceback.print_exc()

try:
    if is_ascii(sys.getfilesystemencoding()):
        _icu.set_filesystem_encoding(b'utf-8')
except Exception:  # not a bare except: KeyboardInterrupt/SystemExit must propagate
    import traceback
    traceback.print_exc()
del is_ascii  # only needed for the two checks above
50
51
def collator():
    """Return the shared base collator, creating it on first use.

    When no locale has been chosen yet, the 'locale_for_sorting' tweak is used
    if it is set, otherwise the result of get_lang(). If a collator cannot be
    created for that locale, a message is printed and an English collator is
    used instead.
    """
    global _collator, _locale
    if _collator is not None:
        return _collator

    if _locale is None:
        from calibre.utils.localization import get_lang
        _locale = tweaks['locale_for_sorting'] or get_lang()

    try:
        _collator = _icu.Collator(_locale)
    except Exception as err:
        print('Failed to load collator for locale: %r with error %r, using English' % (_locale, err))
        _collator = _icu.Collator('en')
    return _collator
67
68
def change_locale(locale=None):
    """Record *locale* as the active locale and discard every cached collator.

    The collators are recreated on demand by their accessor functions. Passing
    None (the default) clears the locale as well, so the next call to
    collator() picks one again.
    """
    global _locale, _collator, _primary_collator, _sort_collator, _numeric_collator, _case_sensitive_collator, _non_numeric_sort_collator
    _locale = locale
    _collator = None
    _primary_collator = None
    _sort_collator = None
    _numeric_collator = None
    _case_sensitive_collator = None
    _non_numeric_sort_collator = None
73
74
def primary_collator():
    'Collator of primary strength: differences in case and accents are ignored'
    global _primary_collator
    if _primary_collator is not None:
        return _primary_collator
    # Work on a clone so the shared base collator keeps its own settings
    _primary_collator = clone = collator().clone()
    clone.strength = _icu.UCOL_PRIMARY
    return clone
82
83
def sort_collator():
    """Collator of secondary strength used for general sorting.

    Case differences are ignored. Whether numbers inside strings are
    recognized is taken from the 'numeric_collation' preference at the time
    the collator is created.
    """
    global _sort_collator
    if _sort_collator is not None:
        return _sort_collator
    # Work on a clone so the shared base collator keeps its own settings
    _sort_collator = clone = collator().clone()
    clone.strength = _icu.UCOL_SECONDARY
    clone.numeric = prefs['numeric_collation']
    return clone
92
93
def non_numeric_sort_collator():
    'Collator of secondary strength with numeric collation off: only case is ignored'
    global _non_numeric_sort_collator
    if _non_numeric_sort_collator is not None:
        return _non_numeric_sort_collator
    # Work on a clone so the shared base collator keeps its own settings
    _non_numeric_sort_collator = clone = collator().clone()
    clone.strength = _icu.UCOL_SECONDARY
    clone.numeric = False
    return clone
102
103
def numeric_collator():
    'Collator with natural number ordering, so something2 sorts before something10'
    global _numeric_collator
    if _numeric_collator is not None:
        return _numeric_collator
    # Work on a clone so the shared base collator keeps its own settings
    _numeric_collator = clone = collator().clone()
    clone.strength = _icu.UCOL_SECONDARY
    clone.numeric = True
    return clone
112
113
def case_sensitive_collator():
    """Collator that always sorts upper case letters before lower case ones.

    Its numeric setting is copied from sort_collator() when it is created; the
    strength of the cloned base collator is left as it is.
    """
    global _case_sensitive_collator
    if _case_sensitive_collator is not None:
        return _case_sensitive_collator
    # Work on a clone so the shared base collator keeps its own settings
    _case_sensitive_collator = clone = collator().clone()
    clone.numeric = sort_collator().numeric
    clone.upper_first = True
    return clone
122
123
def make_sort_key_func(collator_function, func_name='sort_key'):
    """Build a one-argument function that forwards to a collator method.

    :param collator_function: zero-argument callable returning the collator
        to use; it is called on every invocation of the returned function
    :param func_name: name of the collator method to call
    :return: a function ``sort_key(a)``. It returns the method's result for
        *a*. If the method raises TypeError and *a* is bytes, *a* is decoded
        with the default encoding and the call is retried; undecodable bytes
        are returned unchanged. For any other value that makes the method
        raise TypeError, ``b''`` is returned.
    """
    cached_collator = func = None

    def sort_key(a):
        nonlocal cached_collator, func
        # The bound method is cached, but only for as long as
        # collator_function() keeps returning the same object. Caching it
        # unconditionally would keep using a stale collator after the
        # collators have been reset (for example by a locale change).
        current = collator_function()
        if func is None or current is not cached_collator:
            func = getattr(current, func_name)
            cached_collator = current

        try:
            return func(a)
        except TypeError:
            if isinstance(a, bytes):
                try:
                    a = a.decode(sys.getdefaultencoding())
                except ValueError:
                    return a
                return func(a)
        return b''

    return sort_key
144
145
def make_two_arg_func(collator_function, func_name='strcmp'):
    """Build a two-argument function that forwards to a collator method.

    :param collator_function: zero-argument callable returning the collator
        to use; it is called on every invocation of the returned function
    :param func_name: name of the collator method to call
    :return: a function ``two_args(a, b)``. It returns the method's result for
        the two values. If the method raises TypeError, bytes arguments are
        decoded with the default encoding, None arguments are replaced by
        ``''`` and the call is retried. If decoding fails, the result of
        ``cmp(a, b)`` is returned instead.
    """
    cached_collator = func = None

    def two_args(a, b):
        nonlocal cached_collator, func
        # The bound method is cached, but only for as long as
        # collator_function() keeps returning the same object. Caching it
        # unconditionally would keep using a stale collator after the
        # collators have been reset (for example by a locale change).
        current = collator_function()
        if func is None or current is not cached_collator:
            func = getattr(current, func_name)
            cached_collator = current

        try:
            return func(a, b)
        except TypeError:
            if isinstance(a, bytes):
                try:
                    a = a.decode(sys.getdefaultencoding())
                except Exception:
                    return cmp(a, b)
            elif a is None:
                a = ''
            if isinstance(b, bytes):
                try:
                    b = b.decode(sys.getdefaultencoding())
                except Exception:
                    return cmp(a, b)
            elif b is None:
                b = ''
            return func(a, b)

    return two_args
174
175
def make_change_case_func(which):
    """Build a function that changes the case of a string via the ICU extension.

    :param which: case constant passed through to ``_icu.change_case``
    :return: a function ``change_case(x)``. bytes input is decoded with the
        default encoding first; bytes that cannot be decoded are returned
        unchanged. A TypeError for any other kind of input is re-raised.
    """

    def change_case(x):
        try:
            try:
                return _icu.change_case(x, which, _locale)
            except NotImplementedError:
                # Presumably raised while no locale has been chosen yet; the
                # call is retried below once collator() has set one.
                pass
            collator()  # sets _locale
            return _icu.change_case(x, which, _locale)
        except TypeError:
            if not isinstance(x, bytes):
                raise
            try:
                decoded = x.decode(sys.getdefaultencoding())
            except ValueError:
                return x
            return _icu.change_case(decoded, which, _locale)
    return change_case
195# }}}
196
197
198# ################ The string functions ########################################
# Case conversion, using the locale chosen for this module
upper = make_change_case_func(_icu.UPPER_CASE)
lower = make_change_case_func(_icu.LOWER_CASE)
title_case = make_change_case_func(_icu.TITLE_CASE)

# Sort keys, one per collator flavour
sort_key = make_sort_key_func(sort_collator, func_name='sort_key')
numeric_sort_key = make_sort_key_func(numeric_collator, func_name='sort_key')
primary_sort_key = make_sort_key_func(primary_collator, func_name='sort_key')
case_sensitive_sort_key = make_sort_key_func(case_sensitive_collator, func_name='sort_key')

# Collation order lookups; the partitioning variant uses the collator that
# has numeric collation switched off
collation_order = make_sort_key_func(sort_collator, func_name='collation_order')
collation_order_for_partitioning = make_sort_key_func(non_numeric_sort_collator, func_name='collation_order')

# Two-string comparisons
strcmp = make_two_arg_func(sort_collator, func_name='strcmp')
case_sensitive_strcmp = make_two_arg_func(case_sensitive_collator, func_name='strcmp')
primary_strcmp = make_two_arg_func(primary_collator, func_name='strcmp')
212
213
def capitalize(x):
    """Return *x* with its first character upper-cased and the rest lower-cased.

    Values this cannot be done for (an empty string, or something that cannot
    be indexed or case-converted) are returned unchanged.
    """
    try:
        first, remainder = x[0], x[1:]
        return upper(first) + lower(remainder)
    except (IndexError, TypeError, AttributeError):
        return x
219
220
# Use swap_case from the ICU extension when it has one. An extension without
# it (for people running from source) falls back to the value's own
# swapcase() method.
swapcase = getattr(_icu, 'swap_case', None)
if swapcase is None:
    swapcase = lambda x:x.swapcase()
225
# Searching with the base collator
find = make_two_arg_func(collator, func_name='find')
contains = make_two_arg_func(collator, func_name='contains')
startswith = make_two_arg_func(collator, func_name='startswith')

# The same searches with the primary strength collator
primary_find = make_two_arg_func(primary_collator, func_name='find')
primary_contains = make_two_arg_func(primary_collator, func_name='contains')
primary_startswith = make_two_arg_func(primary_collator, func_name='startswith')

# Helpers re-exported directly from the ICU extension
safe_chr = _icu.chr
ord_string = _icu.ord_string
234
235
def character_name(string):
    """Return the name the ICU extension reports for *string*, or None.

    None is returned both when the lookup yields an empty result and when it
    raises TypeError, ValueError or KeyError.
    """
    try:
        name = _icu.character_name(str(string))
    except (TypeError, ValueError, KeyError):
        return None
    return name or None
241
242
def character_name_from_code(code):
    """Return the name the ICU extension reports for *code*, or ''.

    The empty string is returned both when the lookup yields an empty result
    and when it raises TypeError, ValueError or KeyError.
    """
    try:
        name = _icu.character_name_from_code(code)
    except (TypeError, ValueError, KeyError):
        return ''
    return name or ''
248
249
def normalize(text, mode='NFC'):
    """Normalize *text* with the ICU extension.

    :param text: value to normalize; it is converted with str() first
    :param mode: one of 'NFC', 'NFD', 'NFKC' or 'NFKD'
    :raises KeyError: if *mode* is not one of those names
    """
    # This is very slightly slower than using unicodedata.normalize, so stick
    # with that unless you have very good reasons not to. Also, its speed
    # decreases on wide python builds, where conversion to/from ICU's string
    # representation is slower.
    form = _nmodes[mode]
    return _icu.normalize(form, str(text))
256
257
def contractions(col=None):
    """Return the contractions of a collator as a frozenset, cached per collator.

    :param col: collator to query. When omitted, the already created base
        collator is used, or a new one obtained from collator().
    :return: frozenset of the entries returned by ``col.contractions()`` with
        falsy entries removed
    """
    col = col or _collator
    if col is None:
        col = collator()
    # The cache is keyed by the collator object actually being queried.
    # Looking it up under the collator() function itself would never hit and
    # would recompute the set on every call.
    ans = _cmap.get(col)
    if ans is None:
        ans = _cmap[col] = frozenset(filter(None, col.contractions()))
    return ans
269
270
def partition_by_first_letter(items, reverse=False, key=lambda x:x):
    """Group *items* by first letter, in collation order.

    The items are sorted with sort_key() and then walked in order. A new group
    starts whenever the first value returned by collation_order() for the
    upper-cased key changes, so letters with the same collation order end up
    in one group.

    :param items: iterable of items to partition
    :param reverse: sort in descending instead of ascending order
    :param key: function returning the text of an item; an empty or None
        result is treated as a single space
    :return: OrderedDict mapping the first character that opened each group to
        the list of items in that group, in sorted order
    """
    from collections import OrderedDict
    items = sorted(items, key=lambda x:sort_key(key(x)), reverse=reverse)
    ans = OrderedDict()
    last_c, last_ordnum = ' ', 0
    for item in items:
        # Use this module's own upper(); the name icu_upper is not defined
        # in this module.
        c = upper(key(item) or ' ')
        ordnum, _ = collation_order(c)
        if last_ordnum != ordnum:
            last_c = c[0:1]
            last_ordnum = ordnum
        ans.setdefault(last_c, []).append(item)
    return ans
289
290
# Return the number of unicode codepoints in a string
string_length = len

# Return the number of UTF-16 code units in a string (computed by the ICU
# extension)
utf16_length = _icu.utf16_length
296
297################################################################################
298
# Executing this file directly calls run() from calibre.utils.icu_test with
# verbosity 4
if __name__ == '__main__':
    from calibre.utils.icu_test import run
    run(verbosity=4)
302