1# Copyright (C) 2010 Canonical Ltd
2#
3# This program is free software; you can redistribute it and/or modify
4# it under the terms of the GNU General Public License as published by
5# the Free Software Foundation; either version 2 of the License, or
6# (at your option) any later version.
7#
8# This program is distributed in the hope that it will be useful,
9# but WITHOUT ANY WARRANTY; without even the implied warranty of
10# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11# GNU General Public License for more details.
12#
13# You should have received a copy of the GNU General Public License
14# along with this program; if not, write to the Free Software
15# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16#
17
18"""Direct tests of the btree serializer extension"""
19
20import binascii
21import bisect
22
23from ... import tests
24
25from .test_btree_index import compiled_btreeparser_feature
26
27
28class TestBtreeSerializer(tests.TestCase):
29
30    _test_needs_features = [compiled_btreeparser_feature]
31
32    @property
33    def module(self):
34        return compiled_btreeparser_feature.module
35
36
37class TestHexAndUnhex(TestBtreeSerializer):
38
39    def assertHexlify(self, as_binary):
40        self.assertEqual(binascii.hexlify(as_binary),
41                         self.module._py_hexlify(as_binary))
42
43    def assertUnhexlify(self, as_hex):
44        ba_unhex = binascii.unhexlify(as_hex)
45        mod_unhex = self.module._py_unhexlify(as_hex)
46        if ba_unhex != mod_unhex:
47            if mod_unhex is None:
48                mod_hex = b'<None>'
49            else:
50                mod_hex = binascii.hexlify(mod_unhex)
51            self.fail('_py_unhexlify returned a different answer'
52                      ' from binascii:\n    %r\n != %r'
53                      % (binascii.hexlify(ba_unhex), mod_hex))
54
55    def assertFailUnhexlify(self, as_hex):
56        # Invalid hex content
57        self.assertIs(None, self.module._py_unhexlify(as_hex))
58
59    def test_to_hex(self):
60        raw_bytes = bytes(range(256))
61        for i in range(0, 240, 20):
62            self.assertHexlify(raw_bytes[i:i + 20])
63        self.assertHexlify(raw_bytes[240:] + raw_bytes[0:4])
64
65    def test_from_hex(self):
66        self.assertUnhexlify(b'0123456789abcdef0123456789abcdef01234567')
67        self.assertUnhexlify(b'123456789abcdef0123456789abcdef012345678')
68        self.assertUnhexlify(b'0123456789ABCDEF0123456789ABCDEF01234567')
69        self.assertUnhexlify(b'123456789ABCDEF0123456789ABCDEF012345678')
70        hex_chars = binascii.hexlify(bytes(range(256)))
71        for i in range(0, 480, 40):
72            self.assertUnhexlify(hex_chars[i:i + 40])
73        self.assertUnhexlify(hex_chars[480:] + hex_chars[0:8])
74
75    def test_from_invalid_hex(self):
76        self.assertFailUnhexlify(b'123456789012345678901234567890123456789X')
77        self.assertFailUnhexlify(b'12345678901234567890123456789012345678X9')
78
79    def test_bad_argument(self):
80        self.assertRaises(ValueError, self.module._py_unhexlify, u'1a')
81        self.assertRaises(ValueError, self.module._py_unhexlify, b'1b')
82
83
84_hex_form = b'123456789012345678901234567890abcdefabcd'
85
86
87class Test_KeyToSha1(TestBtreeSerializer):
88
89    def assertKeyToSha1(self, expected, key):
90        if expected is None:
91            expected_bin = None
92        else:
93            expected_bin = binascii.unhexlify(expected)
94        actual_sha1 = self.module._py_key_to_sha1(key)
95        if expected_bin != actual_sha1:
96            actual_hex_sha1 = None
97            if actual_sha1 is not None:
98                actual_hex_sha1 = binascii.hexlify(actual_sha1)
99            self.fail('_key_to_sha1 returned:\n    %s\n != %s'
100                      % (actual_sha1, expected))
101
102    def test_simple(self):
103        self.assertKeyToSha1(_hex_form, (b'sha1:' + _hex_form,))
104
105    def test_invalid_not_tuple(self):
106        self.assertKeyToSha1(None, _hex_form)
107        self.assertKeyToSha1(None, b'sha1:' + _hex_form)
108
109    def test_invalid_empty(self):
110        self.assertKeyToSha1(None, ())
111
112    def test_invalid_not_string(self):
113        self.assertKeyToSha1(None, (None,))
114        self.assertKeyToSha1(None, (list(_hex_form),))
115
116    def test_invalid_not_sha1(self):
117        self.assertKeyToSha1(None, (_hex_form,))
118        self.assertKeyToSha1(None, (b'sha2:' + _hex_form,))
119
120    def test_invalid_not_hex(self):
121        self.assertKeyToSha1(None,
122                             (b'sha1:abcdefghijklmnopqrstuvwxyz12345678901234',))
123
124
125class Test_Sha1ToKey(TestBtreeSerializer):
126
127    def assertSha1ToKey(self, hex_sha1):
128        bin_sha1 = binascii.unhexlify(hex_sha1)
129        key = self.module._py_sha1_to_key(bin_sha1)
130        self.assertEqual((b'sha1:' + hex_sha1,), key)
131
132    def test_simple(self):
133        self.assertSha1ToKey(_hex_form)
134
135
136_one_key_content = b"""type=leaf
137sha1:123456789012345678901234567890abcdefabcd\x00\x001 2 3 4
138"""
139
140_large_offsets = b"""type=leaf
141sha1:123456789012345678901234567890abcdefabcd\x00\x0012345678901 1234567890 0 1
142sha1:abcd123456789012345678901234567890abcdef\x00\x002147483648 2147483647 0 1
143sha1:abcdefabcd123456789012345678901234567890\x00\x004294967296 4294967295 4294967294 1
144"""
145
146_multi_key_content = b"""type=leaf
147sha1:c80c881d4a26984ddce795f6f71817c9cf4480e7\x00\x000 0 0 0
148sha1:c86f7e437faa5a7fce15d1ddcb9eaeaea377667b\x00\x001 1 1 1
149sha1:c8e240de74fb1ed08fa08d38063f6a6a91462a81\x00\x002 2 2 2
150sha1:cda39a3ee5e6b4b0d3255bfef95601890afd8070\x00\x003 3 3 3
151sha1:cdf51e37c269aa94d38f93e537bf6e2020b21406\x00\x004 4 4 4
152sha1:ce0c9035898dd52fc65c41454cec9c4d2611bfb3\x00\x005 5 5 5
153sha1:ce93b4e3c464ffd51732fbd6ded717e9efda28aa\x00\x006 6 6 6
154sha1:cf7a9e24777ec23212c54d7a350bc5bea5477fdb\x00\x007 7 7 7
155"""
156
157_multi_key_same_offset = b"""type=leaf
158sha1:080c881d4a26984ddce795f6f71817c9cf4480e7\x00\x000 0 0 0
159sha1:c86f7e437faa5a7fce15d1ddcb9eaeaea377667b\x00\x001 1 1 1
160sha1:cd0c9035898dd52fc65c41454cec9c4d2611bfb3\x00\x002 2 2 2
161sha1:cda39a3ee5e6b4b0d3255bfef95601890afd8070\x00\x003 3 3 3
162sha1:cde240de74fb1ed08fa08d38063f6a6a91462a81\x00\x004 4 4 4
163sha1:cdf51e37c269aa94d38f93e537bf6e2020b21406\x00\x005 5 5 5
164sha1:ce7a9e24777ec23212c54d7a350bc5bea5477fdb\x00\x006 6 6 6
165sha1:ce93b4e3c464ffd51732fbd6ded717e9efda28aa\x00\x007 7 7 7
166"""
167
168_common_32_bits = b"""type=leaf
169sha1:123456784a26984ddce795f6f71817c9cf4480e7\x00\x000 0 0 0
170sha1:1234567874fb1ed08fa08d38063f6a6a91462a81\x00\x001 1 1 1
171sha1:12345678777ec23212c54d7a350bc5bea5477fdb\x00\x002 2 2 2
172sha1:123456787faa5a7fce15d1ddcb9eaeaea377667b\x00\x003 3 3 3
173sha1:12345678898dd52fc65c41454cec9c4d2611bfb3\x00\x004 4 4 4
174sha1:12345678c269aa94d38f93e537bf6e2020b21406\x00\x005 5 5 5
175sha1:12345678c464ffd51732fbd6ded717e9efda28aa\x00\x006 6 6 6
176sha1:12345678e5e6b4b0d3255bfef95601890afd8070\x00\x007 7 7 7
177"""
178
179
180class TestGCCKHSHA1LeafNode(TestBtreeSerializer):
181
182    def assertInvalid(self, data):
183        """Ensure that we get a proper error when trying to parse invalid bytes.
184
185        (mostly this is testing that bad input doesn't cause us to segfault)
186        """
187        self.assertRaises(
188            (ValueError, TypeError), self.module._parse_into_chk, data, 1, 0)
189
190    def test_non_bytes(self):
191        self.assertInvalid(u'type=leaf\n')
192
193    def test_not_leaf(self):
194        self.assertInvalid(b'type=internal\n')
195
196    def test_empty_leaf(self):
197        leaf = self.module._parse_into_chk(b'type=leaf\n', 1, 0)
198        self.assertEqual(0, len(leaf))
199        self.assertEqual([], leaf.all_items())
200        self.assertEqual([], leaf.all_keys())
201        # It should allow any key to be queried
202        self.assertFalse(('key',) in leaf)
203
204    def test_one_key_leaf(self):
205        leaf = self.module._parse_into_chk(_one_key_content, 1, 0)
206        self.assertEqual(1, len(leaf))
207        sha_key = (b'sha1:' + _hex_form,)
208        self.assertEqual([sha_key], leaf.all_keys())
209        self.assertEqual([(sha_key, (b'1 2 3 4', ()))], leaf.all_items())
210        self.assertTrue(sha_key in leaf)
211
212    def test_large_offsets(self):
213        leaf = self.module._parse_into_chk(_large_offsets, 1, 0)
214        self.assertEqual([b'12345678901 1234567890 0 1',
215                          b'2147483648 2147483647 0 1',
216                          b'4294967296 4294967295 4294967294 1',
217                          ], [x[1][0] for x in leaf.all_items()])
218
219    def test_many_key_leaf(self):
220        leaf = self.module._parse_into_chk(_multi_key_content, 1, 0)
221        self.assertEqual(8, len(leaf))
222        all_keys = leaf.all_keys()
223        self.assertEqual(8, len(leaf.all_keys()))
224        for idx, key in enumerate(all_keys):
225            self.assertEqual(b'%d' % idx, leaf[key][0].split()[0])
226
227    def test_common_shift(self):
228        # The keys were deliberately chosen so that the first 5 bits all
229        # overlapped, it also happens that a later bit overlaps
230        # Note that by 'overlap' we mean that given bit is either on in all
231        # keys, or off in all keys
232        leaf = self.module._parse_into_chk(_multi_key_content, 1, 0)
233        self.assertEqual(19, leaf.common_shift)
234        # The interesting byte for each key is
235        # (defined as the 8-bits that come after the common prefix)
236        lst = [1, 13, 28, 180, 190, 193, 210, 239]
237        offsets = leaf._get_offsets()
238        self.assertEqual([bisect.bisect_left(lst, x) for x in range(0, 257)],
239                         offsets)
240        for idx, val in enumerate(lst):
241            self.assertEqual(idx, offsets[val])
242        for idx, key in enumerate(leaf.all_keys()):
243            self.assertEqual(b'%d' % idx, leaf[key][0].split()[0])
244
245    def test_multi_key_same_offset(self):
246        # there is no common prefix, though there are some common bits
247        leaf = self.module._parse_into_chk(_multi_key_same_offset, 1, 0)
248        self.assertEqual(24, leaf.common_shift)
249        offsets = leaf._get_offsets()
250        # The interesting byte is just the first 8-bits of the key
251        lst = [8, 200, 205, 205, 205, 205, 206, 206]
252        self.assertEqual([bisect.bisect_left(lst, x) for x in range(0, 257)],
253                         offsets)
254        for val in lst:
255            self.assertEqual(lst.index(val), offsets[val])
256        for idx, key in enumerate(leaf.all_keys()):
257            self.assertEqual(b'%d' % idx, leaf[key][0].split()[0])
258
259    def test_all_common_prefix(self):
260        # The first 32 bits of all hashes are the same. This is going to be
261        # pretty much impossible, but I don't want to fail because of this
262        leaf = self.module._parse_into_chk(_common_32_bits, 1, 0)
263        self.assertEqual(0, leaf.common_shift)
264        lst = [0x78] * 8
265        offsets = leaf._get_offsets()
266        self.assertEqual([bisect.bisect_left(lst, x) for x in range(0, 257)],
267                         offsets)
268        for val in lst:
269            self.assertEqual(lst.index(val), offsets[val])
270        for idx, key in enumerate(leaf.all_keys()):
271            self.assertEqual(b'%d' % idx, leaf[key][0].split()[0])
272
273    def test_many_entries(self):
274        # Again, this is almost impossible, but we should still work
275        # It would be hard to fit more that 120 entries in a 4k page, much less
276        # more than 256 of them. but hey, weird stuff happens sometimes
277        lines = [b'type=leaf\n']
278        for i in range(500):
279            key_str = b'sha1:%04x%s' % (i, _hex_form[:36])
280            key = (key_str,)
281            lines.append(b'%s\0\0%d %d %d %d\n' % (key_str, i, i, i, i))
282        data = b''.join(lines)
283        leaf = self.module._parse_into_chk(data, 1, 0)
284        self.assertEqual(24 - 7, leaf.common_shift)
285        offsets = leaf._get_offsets()
286        # This is the interesting bits for each entry
287        lst = [x // 2 for x in range(500)]
288        expected_offsets = [x * 2 for x in range(128)] + [255] * 129
289        self.assertEqual(expected_offsets, offsets)
290        # We truncate because offsets is an unsigned char. So the bisection
291        # will just say 'greater than the last one' for all the rest
292        lst = lst[:255]
293        self.assertEqual([bisect.bisect_left(lst, x) for x in range(0, 257)],
294                         offsets)
295        for val in lst:
296            self.assertEqual(lst.index(val), offsets[val])
297        for idx, key in enumerate(leaf.all_keys()):
298            self.assertEqual(b'%d' % idx, leaf[key][0].split()[0])
299
300    def test__sizeof__(self):
301        # We can't use the exact numbers because of platform variations, etc.
302        # But what we really care about is that it does get bigger with more
303        # content.
304        leaf0 = self.module._parse_into_chk(b'type=leaf\n', 1, 0)
305        leaf1 = self.module._parse_into_chk(_one_key_content, 1, 0)
306        leafN = self.module._parse_into_chk(_multi_key_content, 1, 0)
307        sizeof_1 = leaf1.__sizeof__() - leaf0.__sizeof__()
308        self.assertTrue(sizeof_1 > 0)
309        sizeof_N = leafN.__sizeof__() - leaf0.__sizeof__()
310        self.assertEqual(sizeof_1 * len(leafN), sizeof_N)
311