#!/usr/local/bin/python3.8
# vim:fileencoding=utf-8


__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'

import unittest, sys
from contextlib import contextmanager

import calibre.utils.icu as icu
from polyglot.builtins import iteritems, cmp


@contextmanager
def make_collation_func(name, locale, numeric=True, maker=icu.make_sort_key_func, func='strcmp'):
    '''
    Context manager that yields a collation callable for ``locale``.

    ``maker`` is called with a zero-argument factory, which builds an
    ``icu._icu.Collator`` for ``locale`` and sets its ``numeric`` attribute,
    and with the collator method name ``func``. Whatever ``maker`` returns is
    the value yielded to the ``with`` block.

    ``name`` is accepted for the benefit of callers but is not used here.
    '''
    def coll():
        ans = icu._icu.Collator(locale)
        ans.numeric = numeric
        return ans

    yield maker(coll, func)


class TestICU(unittest.TestCase):

    # Short aliases; they are looked up on the class, so ``self.ae(...)``
    # binds ``self`` exactly like ``self.assertEqual(...)`` does.
    ae = unittest.TestCase.assertEqual
    ane = unittest.TestCase.assertNotEqual

    def setUp(self):
        # Every test starts from the 'en' locale.
        icu.change_locale('en')

    def test_sorting(self):
        ' Test the various sorting APIs '
        german = '''Sonntag Montag Dienstag Januar Februar März Fuße Fluße Flusse flusse fluße flüße flüsse'''.split()
        german_good = '''Dienstag Februar flusse Flusse fluße Fluße flüsse flüße Fuße Januar März Montag Sonntag'''.split()
        french = '''dimanche lundi mardi janvier février mars déjà Meme deja même dejà bpef bœg Boef Mémé bœf boef bnef pêche pèché pêché pêche pêché'''.split()
        french_good = '''bnef boef Boef bœf bœg bpef deja dejà déjà dimanche février janvier lundi mardi mars Meme Mémé même pèché pêche pêche pêché pêché'''.split()  # noqa

        # Test corner cases
        sort_key = icu.sort_key
        s = '\U0001f431'
        self.ae(sort_key(s), sort_key(s.encode(sys.getdefaultencoding())), 'UTF-8 encoded object not correctly decoded to generate sort key')
        # NOTE(review): both operands are the same expression, so this
        # assertion can never fail. Judging by the message it was presumably
        # meant to compare against sort_key(s.encode('utf-16')) -- confirm
        # against icu.sort_key before changing it.
        self.ae(s.encode('utf-16'), s.encode('utf-16'), 'Undecodable bytestring not returned as itself')
        self.ae(b'', sort_key(None))
        self.ae(0, icu.strcmp(None, b''))
        self.ae(0, icu.strcmp(s, s.encode(sys.getdefaultencoding())))

        # Test locales: for each locale the sort-key function must produce
        # the expected order, and the two-argument comparison must agree with
        # comparing the sort keys pairwise.
        with make_collation_func('dsk', 'de', maker=icu.make_sort_key_func, func='sort_key') as dsk:
            self.ae(german_good, sorted(german, key=dsk))
            with make_collation_func('dcmp', 'de', maker=icu.make_two_arg_func, func='strcmp') as dcmp:
                for x in german:
                    for y in german:
                        self.ae(cmp(dsk(x), dsk(y)), dcmp(x, y))

        with make_collation_func('fsk', 'fr', maker=icu.make_sort_key_func, func='sort_key') as fsk:
            self.ae(french_good, sorted(french, key=fsk))
            with make_collation_func('fcmp', 'fr', maker=icu.make_two_arg_func) as fcmp:
                for x in french:
                    for y in french:
                        self.ae(cmp(fsk(x), fsk(y)), fcmp(x, y))

        with make_collation_func('ssk', 'es', maker=icu.make_sort_key_func, func='sort_key') as ssk:
            self.assertNotEqual(ssk('peña'), ssk('pena'))
            with make_collation_func('scmp', 'es', maker=icu.make_two_arg_func) as scmp:
                self.assertNotEqual(0, scmp('pena', 'peña'))

        for k, v in iteritems({'pèché': 'peche', 'flüße':'Flusse', 'Štepánek':'ŠtepaneK'}):
            self.ae(0, icu.primary_strcmp(k, v))

        # Test different types of collation
        self.ae(icu.primary_sort_key('Aä'), icu.primary_sort_key('aa'))
        self.assertLess(icu.numeric_sort_key('something 2'), icu.numeric_sort_key('something 11'))
        self.assertLess(icu.case_sensitive_sort_key('A'), icu.case_sensitive_sort_key('a'))
        self.ae(0, icu.strcmp('a', 'A'))
        self.ae(cmp('a', 'A'), icu.case_sensitive_strcmp('a', 'A'))
        self.ae(0, icu.primary_strcmp('ä', 'A'))

    def test_change_case(self):
        ' Test the various ways of changing the case '
        from calibre.utils.titlecase import titlecase
        # Test corner cases
        self.ae('A', icu.upper(b'a'))
        # Empty and non-string inputs are expected to come back unchanged.
        for x in ('', None, False, 1):
            self.ae(x, icu.capitalize(x))

        for x in ('a', 'Alice\'s code', 'macdonald\'s machIne', '02 the wars'):
            self.ae(icu.upper(x), x.upper())
            self.ae(icu.lower(x), x.lower())
            # ICU's title case algorithm is different from ours, when there are
            # capitals inside words
            self.ae(icu.title_case(x), titlecase(x).replace('machIne', 'Machine'))
            self.ae(icu.capitalize(x), x[0].upper() + x[1:].lower())
            self.ae(icu.swapcase(x), x.swapcase())

    def test_find(self):
        ' Test searching for substrings '
        # The expected values are (position, length) pairs.
        self.ae((1, 1), icu.find(b'a', b'1ab'))
        self.ae((1, 1), icu.find('\U0001f431', 'x\U0001f431x'))
        self.ae((1, 1), icu.find('y', '\U0001f431y'))
        self.ae((0, 4), icu.primary_find('pena', 'peña'))
        for k, v in iteritems({'pèché': 'peche', 'flüße':'Flusse', 'Štepánek':'ŠtepaneK'}):
            self.ae((1, len(k)), icu.primary_find(v, ' ' + k), 'Failed to find %s in %s' % (v, k))
        self.assertTrue(icu.startswith(b'abc', b'ab'))
        self.assertTrue(icu.startswith('abc', 'abc'))
        self.assertFalse(icu.startswith('xyz', 'a'))
        self.assertTrue(icu.startswith('xxx', ''))
        self.assertTrue(icu.primary_startswith('pena', 'peña'))
        self.assertTrue(icu.contains('\U0001f431', '\U0001f431'))
        self.assertTrue(icu.contains('something', 'some other something else'))
        self.assertTrue(icu.contains('', 'a'))
        self.assertTrue(icu.contains('', ''))
        self.assertFalse(icu.contains('xxx', 'xx'))
        self.assertTrue(icu.primary_contains('pena', 'peña'))

    def test_collation_order(self):
        'Testing collation ordering'
        # Each group pairs an assertion with a sequence of strings; the
        # assertion is applied to the collation order of every consecutive
        # pair of strings in the sequence.
        for assert_func, words in [
            (self.ae, ('Šaa', 'Smith', 'Solženicyn', 'Štepánek')),
            (self.ae, ('11', '011')),
            (self.ane, ('2', '1')),
            (self.ae, ('100 Smith', '0100 Smith')),
        ]:
            last = None
            for x in words:
                order, _ = icu.numeric_collator().collation_order(x)
                if last is not None:
                    assert_func(last, order, 'Order for %s not correct: %s != %s' % (x, last, order))
                last = order

        self.ae(dict(icu.partition_by_first_letter(['A1', '', 'a1', '\U0001f431', '\U0001f431x'])),
                {' ':[''], 'A':['A1', 'a1'], '\U0001f431':['\U0001f431', '\U0001f431x']})

    def test_collation_order_for_partitioning(self):
        'Testing collation ordering for partitioning'
        # Same scheme as test_collation_order, but using the non-numeric
        # sort collator, hence the different expectations for digit strings.
        for assert_func, words in [
            (self.ae, ('Smith', 'Šaa', 'Solženicyn', 'Štepánek')),
            (self.ane, ('11', '011')),
            (self.ae, ('102 Smith', '100 Smith')),
            (self.ane, ('100 Smith', '0100 Smith')),
        ]:
            last = None
            for x in words:
                order, _ = icu.non_numeric_sort_collator().collation_order(x)
                if last is not None:
                    assert_func(last, order, 'Order for %s not correct: %s != %s' % (x, last, order))
                last = order

        self.ae(dict(icu.partition_by_first_letter(['A1', '', 'a1', '\U0001f431', '\U0001f431x'])),
                {' ':[''], 'A':['A1', 'a1'], '\U0001f431':['\U0001f431', '\U0001f431x']})

    def test_roundtrip(self):
        ' Test roundtripping '
        for r in ('xxx\0\u2219\U0001f431xxx', '\0', '', 'simple'):
            self.ae(r, icu._icu.roundtrip(r))
        # Lone surrogates are expected to come back as U+FFFD.
        self.ae(icu._icu.roundtrip('\ud8e81'), '\ufffd1')
        self.ae(icu._icu.roundtrip('\udc01\ud8e8'), '\ufffd\ufffd')
        for x, l in [('', 0), ('a', 1), ('\U0001f431', 1)]:
            self.ae(icu._icu.string_length(x), l)
        # A non-BMP character counts as two UTF-16 code units.
        for x, l in [('', 0), ('a', 1), ('\U0001f431', 2)]:
            self.ae(icu._icu.utf16_length(x), l)
        self.ae(icu._icu.chr(0x1f431), '\U0001f431')
        self.ae(icu._icu.ord_string('abc'*100), tuple(map(ord, 'abc'*100)))
        self.ae(icu._icu.ord_string('\U0001f431'), (0x1f431,))

    def test_character_name(self):
        ' Test character naming '
        from calibre.utils.unicode_names import character_name_from_code
        for q, e in {
            '\U0001f431': 'CAT FACE'
        }.items():
            self.ae(icu.character_name(q), e)
            self.ae(character_name_from_code(icu.ord_string(q)[0]), e)

    def test_contractions(self):
        ' Test contractions '
        self.skipTest('Skipping as this depends too much on ICU version')
        # skipTest() raises, so the code below never runs; it is kept to
        # record what the test checked.
        c = icu._icu.Collator('cs')
        self.ae(icu.contractions(c), frozenset({'Z\u030c', 'z\u030c', 'Ch',
            'C\u030c', 'ch', 'cH', 'c\u030c', 's\u030c', 'r\u030c', 'CH',
            'S\u030c', 'R\u030c'}))

    def test_break_iterator(self):
        ' Test the break iterator '
        from calibre.spell.break_iterator import split_into_words as split, index_of, split_into_words_and_positions, count_words
        for q in ('one two three', ' one two three', 'one\ntwo three ', ):
            self.ae(split(str(q)), ['one', 'two', 'three'], 'Failed to split: %r' % q)
        self.ae(split('I I\'m'), ['I', "I'm"])
        self.ae(split('out-of-the-box'), ['out-of-the-box'])
        self.ae(split('-one two-'), ['-one', 'two-'])
        self.ae(split('-one a-b-c-d e'), ['-one', 'a-b-c-d', 'e'])
        self.ae(split('-one -a-b-c-d- e'), ['-one', '-a-b-c-d-', 'e'])
        self.ae(split_into_words_and_positions('one \U0001f431 three'), [(0, 3), (6, 5)])
        self.ae(count_words('a b c d e f'), 6)
        # (needle, haystack, expected index); -1 means "not found".
        for needle, haystack, pos in (
                ('word', 'a word b', 2),
                ('word', 'a word', 2),
                ('one-two', 'a one-two punch', 2),
                ('one-two', 'one-two punch', 0),
                ('one-two', 'one-two', 0),
                ('one', 'one-two one', 8),
                ('one-two', 'one-two-three one-two', 14),
                ('one', 'onet one', 5),
                ('two', 'one-two two', 8),
                ('two', 'two-one two', 8),
                ('-two', 'one-two -two', 8),
                ('-two', 'two', -1),
                ('i', 'i', 0),
                ('i', 'six i', 4),
                ('i', '', -1), ('', '', -1), ('', 'i', -1),
                ('i', 'six clicks', -1),
                ('i', '\U0001f431 i', 2),
                ('-a', 'b -a', 2),
                ('a-', 'a-b a- d', 4),
                ('-a-', 'b -a -a-', 5),
                ('-a-', '-a-', 0),
                ('-a-', 'a-', -1),
                ('-a-', '-a', -1),
                ('-a-', 'a', -1),
                ('a-', 'a-', 0),
                ('-a', '-a', 0),
                ('a-b-c-', 'a-b-c-d', -1),
                ('a-b-c-', 'a-b-c-.', 0),
                ('a-b-c-', 'a-b-c-d a-b-c- d', 8),
        ):
            fpos = index_of(needle, haystack)
            self.ae(pos, fpos, 'Failed to find index of %r in %r (%d != %d)' % (needle, haystack, pos, fpos))


def find_tests():
    ' Return a test suite containing every test in TestICU '
    return unittest.defaultTestLoader.loadTestsFromTestCase(TestICU)


class TestRunner(unittest.main):
    ' unittest.main variant that always runs the suite from find_tests() '

    def createTests(self, from_discovery=False, Loader=None):
        # The base-class method takes these two optional arguments (Python
        # 3.7+) and passes them on its discovery path. They are accepted so
        # the override stays call-compatible, and ignored because the suite
        # is fixed.
        self.test = find_tests()


def run(verbosity=4):
    ' Run the ICU tests without exiting the interpreter afterwards '
    TestRunner(verbosity=verbosity, exit=False)


def test_build():
    ' Run the ICU tests quietly; raise SystemExit(1) if any of them fail '
    # Only argv[0] is passed on, so the caller's command-line arguments are
    # not interpreted by unittest.
    result = TestRunner(verbosity=0, buffer=True, catchbreak=True, failfast=True, argv=sys.argv[:1], exit=False).result
    if not result.wasSuccessful():
        raise SystemExit(1)


if __name__ == '__main__':
    run(verbosity=4)