1 /* Copyright (C) 2007 Josh MacDonald */
2
3 #include "test.h"
4 #include <assert.h>
5 #include <list>
6 #include <vector>
7 #include <algorithm>
8
9 #include "../cpp-btree/btree_map.h"
10
11 extern "C" {
12 uint32_t xd3_large32_cksum_old (xd3_hash_cfg *cfg, const uint8_t *base, const usize_t look);
13 uint32_t xd3_large32_cksum_update_old (xd3_hash_cfg *cfg, uint32_t cksum,
14 const uint8_t *base, const usize_t look);
15
16 uint64_t xd3_large64_cksum_old (xd3_hash_cfg *cfg, const uint8_t *base, const usize_t look);
17 uint64_t xd3_large64_cksum_update_old (xd3_hash_cfg *cfg, uint64_t cksum,
18 const uint8_t *base, const usize_t look);
19 }
20
21 using btree::btree_map;
22 using std::list;
23 using std::vector;
24
25 // MLCG parameters
26 // a, a*
27 uint32_t good_32bit_values[] = {
28 1597334677U, // ...
29 741103597U, 887987685U,
30 };
31
32 // a, a*
33 uint64_t good_64bit_values[] = {
34 1181783497276652981ULL, 4292484099903637661ULL,
35 7664345821815920749ULL, // ...
36 };
37
print_header()38 void print_header() {
39 static int hdr_cnt = 0;
40 if (hdr_cnt++ % 20 == 0) {
41 printf("%-32sConf\t\tCount\tUniq\tFull\tCover\tColls"
42 "\tMB/s\tIters\t#Colls\n", "Name");
43 }
44 }
45
46 struct true_type { };
47 struct false_type { };
48
49 template <typename Word>
50 usize_t bitsof();
51
52 template<>
bitsof()53 usize_t bitsof<unsigned int>() {
54 return sizeof(unsigned int) * 8;
55 }
56
57 template<>
bitsof()58 usize_t bitsof<unsigned long>() {
59 return sizeof(unsigned long) * 8;
60 }
61
62 template<>
bitsof()63 usize_t bitsof<unsigned long long>() {
64 return sizeof(unsigned long long) * 8;
65 }
66
67 template <typename Word>
68 struct hhash { // shift "s" bits leaving the high bits as a hash value for
69 // this checksum, which are the most "distant" in terms of the
70 // spectral test for the rabin_karp MLCG. For short windows,
71 // the high bits aren't enough, XOR "mask" worth of these in.
operator ()hhash72 Word operator()(const Word t, const Word s, const Word mask) {
73 return (t >> s) ^ (t & mask);
74 }
75 };
76
77 template <typename Word>
78 Word good_word();
79
80 template<>
good_word()81 uint32_t good_word<uint32_t>() {
82 return good_32bit_values[0];
83 }
84
85 template<>
good_word()86 uint64_t good_word<uint64_t>() {
87 return good_64bit_values[0];
88 }
89
90 // CLASSES
91
92 #define SELF Word, CksumSize, CksumSkip, Hash, Compaction
93 #define MEMBER template <typename Word, \
94 int CksumSize, \
95 int CksumSkip, \
96 typename Hash, \
97 int Compaction>
98
99 MEMBER
100 struct cksum_params {
101 typedef Word word_type;
102 typedef Hash hash_type;
103
104 static const int cksum_size = CksumSize;
105 static const int cksum_skip = CksumSkip;
106 static const int compaction = Compaction;
107 };
108
109 MEMBER
110 struct rabin_karp : public cksum_params<SELF> {
111 // (a^cksum_size-1 c_0) + (a^cksum_size-2 c_1) ...
rabin_karprabin_karp112 rabin_karp()
113 : powers(make_powers()),
114 product(powers[0] * good_word<Word>()),
115 incr_state(0) { }
116
make_powersrabin_karp117 static Word* make_powers() {
118 Word *p = new Word[CksumSize];
119 p[CksumSize - 1] = 1;
120 for (int i = CksumSize - 2; i >= 0; i--) {
121 p[i] = p[i + 1] * good_word<Word>();
122 }
123 return p;
124 }
125
~rabin_karprabin_karp126 ~rabin_karp() {
127 delete [] powers;
128 }
129
steprabin_karp130 Word step(const uint8_t *ptr) {
131 Word h = 0;
132 for (int i = 0; i < CksumSize; i++) {
133 h += (ptr[i]) * powers[i];
134 }
135 return h;
136 }
137
state0rabin_karp138 Word state0(const uint8_t *ptr) {
139 incr_state = step(ptr);
140 return incr_state;
141 }
142
incrrabin_karp143 Word incr(const uint8_t *ptr) {
144 incr_state = good_word<Word>() * incr_state -
145 product * (ptr[-1]) + (ptr[CksumSize - 1]);
146 return incr_state;
147 }
148
149 const Word *const powers;
150 const Word product;
151 Word incr_state;
152 };
153
154 MEMBER
155 struct with_stream : public cksum_params<SELF> {
156 xd3_stream stream;
157
with_streamwith_stream158 with_stream()
159 {
160 xd3_config cfg;
161 memset (&stream, 0, sizeof (stream));
162 xd3_init_config (&cfg, 0);
163 cfg.smatch_cfg = XD3_SMATCH_SOFT;
164 cfg.smatcher_soft.large_look = CksumSize;
165 cfg.smatcher_soft.large_step = CksumSkip;
166 cfg.smatcher_soft.small_look = 4;
167 cfg.smatcher_soft.small_chain = 4;
168 cfg.smatcher_soft.small_lchain = 4;
169 cfg.smatcher_soft.max_lazy = 4;
170 cfg.smatcher_soft.long_enough = 4;
171 CHECK_EQ(0, xd3_config_stream (&stream, &cfg));
172
173 CHECK_EQ(0, xd3_size_hashtable (&stream,
174 1<<10 /* ignored */,
175 stream.smatcher.large_look,
176 & stream.large_hash));
177 }
~with_streamwith_stream178 ~with_stream()
179 {
180 xd3_free_stream (&stream);
181 }
182 };
183
184 MEMBER
185 struct large_cksum : public with_stream<SELF> {
steplarge_cksum186 Word step(const uint8_t *ptr) {
187 return xd3_large_cksum (&this->stream.large_hash, ptr, CksumSize);
188 }
189
state0large_cksum190 Word state0(const uint8_t *ptr) {
191 incr_state = step(ptr);
192 return incr_state;
193 }
194
incrlarge_cksum195 Word incr(const uint8_t *ptr) {
196 incr_state = xd3_large_cksum_update (&this->stream.large_hash,
197 incr_state, ptr - 1, CksumSize);
198 return incr_state;
199 }
200
201 Word incr_state;
202 };
203
204 #if SIZEOF_USIZE_T == 4
205 #define xd3_large_cksum_old xd3_large32_cksum_old
206 #define xd3_large_cksum_update_old xd3_large32_cksum_update_old
207 #elif SIZEOF_USIZE_T == 8
208 #define xd3_large_cksum_old xd3_large64_cksum_old
209 #define xd3_large_cksum_update_old xd3_large64_cksum_update_old
210 #endif
211
212 MEMBER
213 struct large_cksum_old : public with_stream<SELF> {
steplarge_cksum_old214 Word step(const uint8_t *ptr) {
215 return xd3_large_cksum_old (&this->stream.large_hash, ptr, CksumSize);
216 }
217
state0large_cksum_old218 Word state0(const uint8_t *ptr) {
219 incr_state = step(ptr);
220 return incr_state;
221 }
222
incrlarge_cksum_old223 Word incr(const uint8_t *ptr) {
224 incr_state = xd3_large_cksum_update_old (&this->stream.large_hash,
225 incr_state, ptr - 1, CksumSize);
226 return incr_state;
227 }
228
229 Word incr_state;
230 };
231
232 // TESTS
233
234 template <typename Word>
235 struct file_stats {
236 typedef const uint8_t* ptr_type;
237 typedef Word word_type;
238 typedef btree::btree_multimap<word_type, ptr_type> table_type;
239 typedef typename table_type::iterator table_iterator;
240
241 usize_t cksum_size;
242 usize_t cksum_skip;
243 usize_t unique;
244 usize_t unique_values;
245 usize_t count;
246 table_type table;
247
file_statsfile_stats248 file_stats(usize_t size, usize_t skip)
249 : cksum_size(size),
250 cksum_skip(skip),
251 unique(0),
252 unique_values(0),
253 count(0) {
254 }
255
resetfile_stats256 void reset() {
257 unique = 0;
258 unique_values = 0;
259 count = 0;
260 table.clear();
261 }
262
updatefile_stats263 void update(word_type word, ptr_type ptr) {
264 table_iterator t_i = table.find(word);
265
266 count++;
267 if (t_i != table.end()) {
268 int collisions = 0;
269 for (table_iterator p_i = t_i;
270 p_i != table.end() && p_i->first == word;
271 ++p_i) {
272 if (memcmp(p_i->second, ptr, cksum_size) == 0) {
273 return;
274 }
275 collisions++;
276 }
277 if (collisions >= 1000) {
278 fprintf(stderr, "Something is not right, lots of collisions=%d\n",
279 collisions);
280 abort();
281 }
282 } else {
283 unique_values++;
284 }
285 unique++;
286 table.insert(std::make_pair(word, ptr));
287 return;
288 }
289
freezefile_stats290 void freeze() {
291 table.clear();
292 }
293 };
294
295 struct test_result_base;
296
297 static vector<test_result_base*> all_tests;
298
299 struct test_result_base {
~test_result_basetest_result_base300 virtual ~test_result_base() {
301 }
302 virtual void reset() = 0;
303 virtual void print() = 0;
304 virtual void get(const uint8_t* buf, const size_t buf_size,
305 usize_t iters) = 0;
306 virtual void stat() = 0;
307 virtual usize_t count() = 0;
308 virtual usize_t dups() = 0;
309 virtual double uniqueness() = 0;
310 virtual double fullness() = 0;
311 virtual double collisions() = 0;
312 virtual double coverage() = 0;
313 virtual double compression() = 0;
314 virtual double time() = 0;
315 virtual double total_time() = 0;
316 virtual usize_t total_count() = 0;
317 virtual usize_t total_dups() = 0;
318 };
319
320 template <typename Checksum>
321 struct test_result : public test_result_base {
322 Checksum cksum;
323 const char *test_name;
324 file_stats<typename Checksum::word_type> fstats;
325 usize_t test_size;
326 usize_t n_steps;
327 usize_t n_incrs;
328 typename Checksum::word_type s_bits;
329 typename Checksum::word_type s_mask;
330 usize_t t_entries;
331 usize_t h_bits;
332 usize_t h_buckets_full;
333 char *hash_table;
334 long accum_millis;
335 usize_t accum_iters;
336
337 // These are not reset
338 double accum_time;
339 usize_t accum_count;
340 usize_t accum_dups;
341 usize_t accum_colls;
342 size_t accum_size;
343
test_resulttest_result344 test_result(const char *name)
345 : test_name(name),
346 fstats(Checksum::cksum_size, Checksum::cksum_skip),
347 hash_table(NULL),
348 accum_millis(0),
349 accum_iters(0),
350 accum_time(0.0),
351 accum_count(0),
352 accum_dups(0),
353 accum_colls(0),
354 accum_size(0) {
355 all_tests.push_back(this);
356 }
357
~test_resulttest_result358 ~test_result() {
359 reset();
360 }
361
resettest_result362 void reset() {
363 // size of file
364 test_size = 0;
365
366 // count
367 n_steps = 0;
368 n_incrs = 0;
369
370 // four values used by new_table()/summarize_table()
371 s_bits = 0;
372 s_mask = 0;
373 t_entries = 0;
374 h_bits = 0;
375 h_buckets_full = 0;
376
377 accum_millis = 0;
378 accum_iters = 0;
379
380 fstats.reset();
381
382 // temporary
383 if (hash_table) {
384 delete(hash_table);
385 hash_table = NULL;
386 }
387 }
388
counttest_result389 usize_t count() {
390 if (Checksum::cksum_skip == 1) {
391 return n_incrs;
392 } else {
393 return n_steps;
394 }
395 }
396
dupstest_result397 usize_t dups() {
398 return fstats.count - fstats.unique;
399 }
400
401 /* Fraction of distinct strings of length cksum_size which are not
402 * represented in the hash table. */
collisionstest_result403 double collisions() {
404 return (fstats.unique - fstats.unique_values) / (double) fstats.unique;
405 }
collstest_result406 usize_t colls() {
407 return (fstats.unique - fstats.unique_values);
408 }
409
uniquenesstest_result410 double uniqueness() {
411 return 1.0 - (double) dups() / count();
412 }
413
fullnesstest_result414 double fullness() {
415 return (double) h_buckets_full / (1 << h_bits);
416 }
417
coveragetest_result418 double coverage() {
419 return (double) h_buckets_full / uniqueness() / count();
420 }
421
compressiontest_result422 double compression() {
423 return 1.0 - coverage();
424 }
425
timetest_result426 double time() {
427 return (double) accum_millis / accum_iters;
428 }
429
total_timetest_result430 double total_time() {
431 return accum_time;
432 }
433
total_counttest_result434 usize_t total_count() {
435 return accum_count;
436 }
437
total_dupstest_result438 usize_t total_dups() {
439 return accum_dups;
440 }
441
total_collstest_result442 usize_t total_colls() {
443 return accum_dups;
444 }
445
stattest_result446 void stat() {
447 accum_time += time();
448 accum_count += count();
449 accum_dups += dups();
450 accum_colls += colls();
451 accum_size += test_size;
452 }
453
printtest_result454 void print() {
455 if (fstats.count != count()) {
456 fprintf(stderr, "internal error: %" W "d != %" W "d\n", fstats.count, count());
457 abort();
458 }
459 print_header();
460 printf("%-32s%d/%d 2^%" W "u\t%" W "u\t%0.4f\t%.4f\t%.4f\t%.1e\t%.2f\t"
461 "%" W "u\t%" W "u\n",
462 test_name,
463 Checksum::cksum_size,
464 Checksum::cksum_skip,
465 h_bits,
466 count(),
467 uniqueness(),
468 fullness(),
469 coverage(),
470 collisions(),
471 0.001 * accum_iters * test_size / accum_millis,
472 accum_iters,
473 colls());
474 }
475
size_log2test_result476 usize_t size_log2 (usize_t slots) {
477 usize_t bits = bitsof<typename Checksum::word_type>() - 1;
478 usize_t i;
479
480 for (i = 3; i <= bits; i += 1) {
481 if (slots <= (1U << i)) {
482 return i - Checksum::compaction;
483 }
484 }
485
486 return bits;
487 }
488
new_tabletest_result489 void new_table(usize_t entries) {
490 t_entries = entries;
491 h_bits = size_log2(entries);
492
493 usize_t n = 1 << h_bits;
494
495 s_bits = bitsof<typename Checksum::word_type>() - h_bits;
496 s_mask = n - 1U;
497
498 hash_table = new char[n / 8];
499 memset(hash_table, 0, n / 8);
500 }
501
get_table_bittest_result502 int get_table_bit(usize_t i) {
503 return hash_table[i/8] & (1 << i%8);
504 }
505
set_table_bittest_result506 int set_table_bit(usize_t i) {
507 return hash_table[i/8] |= (1 << i%8);
508 }
509
summarize_tabletest_result510 void summarize_table() {
511 usize_t n = 1 << h_bits;
512 usize_t f = 0;
513 for (usize_t i = 0; i < n; i++) {
514 if (get_table_bit(i)) {
515 f++;
516 }
517 }
518 h_buckets_full = f;
519 }
520
gettest_result521 void get(const uint8_t* buf, const size_t buf_size, usize_t test_iters) {
522 typename Checksum::hash_type hash;
523 const uint8_t *ptr;
524 const uint8_t *end;
525 usize_t periods;
526 int64_t last_offset;
527 int64_t stop;
528
529 test_size = buf_size;
530 last_offset = buf_size - Checksum::cksum_size;
531
532 if (last_offset < 0) {
533 periods = 0;
534 n_steps = 0;
535 n_incrs = 0;
536 stop = -Checksum::cksum_size;
537 } else {
538 periods = last_offset / Checksum::cksum_skip;
539 n_steps = periods + 1;
540 n_incrs = last_offset + 1;
541 stop = last_offset - (periods + 1) * Checksum::cksum_skip;
542 }
543
544 // Compute file stats once.
545 if (fstats.unique_values == 0) {
546 if (Checksum::cksum_skip == 1) {
547 for (size_t i = 0; i <= buf_size - Checksum::cksum_size; i++) {
548 fstats.update(hash(cksum.step(buf + i), s_bits, s_mask), buf + i);
549 }
550 } else {
551 ptr = buf + last_offset;
552 end = buf + stop;
553
554 for (; ptr != end; ptr -= Checksum::cksum_skip) {
555 fstats.update(hash(cksum.step(ptr), s_bits, s_mask), ptr);
556 }
557 }
558 fstats.freeze();
559 }
560
561 long start_test = get_millisecs_now();
562
563 if (Checksum::cksum_skip != 1) {
564 new_table(n_steps);
565
566 for (usize_t i = 0; i < test_iters; i++) {
567 ptr = buf + last_offset;
568 end = buf + stop;
569
570 for (; ptr != end; ptr -= Checksum::cksum_skip) {
571 set_table_bit(hash(cksum.step(ptr), s_bits, s_mask));
572 }
573 }
574
575 summarize_table();
576 }
577
578 stop = buf_size - Checksum::cksum_size + 1;
579 if (stop < 0) {
580 stop = 0;
581 }
582
583 if (Checksum::cksum_skip == 1) {
584 new_table(n_incrs);
585
586 for (usize_t i = 0; i < test_iters; i++) {
587 ptr = buf;
588 end = buf + stop;
589
590 if (ptr != end) {
591 set_table_bit(hash(cksum.state0(ptr++), s_bits, s_mask));
592 }
593
594 for (; ptr != end; ptr++) {
595 typename Checksum::word_type w = cksum.incr(ptr);
596 CHECK_EQ(w, cksum.step(ptr));
597 set_table_bit(hash(w, s_bits, s_mask));
598 }
599 }
600
601 summarize_table();
602 }
603
604 accum_iters += test_iters;
605 accum_millis += get_millisecs_now() - start_test;
606 }
607 };
608
read_whole_file(const char * name,uint8_t ** buf_ptr,size_t * buf_len)609 static int read_whole_file(const char *name,
610 uint8_t **buf_ptr,
611 size_t *buf_len) {
612 main_file file;
613 int ret;
614 xoff_t len;
615 size_t nread;
616 main_file_init(&file);
617 file.filename = name;
618 ret = main_file_open(&file, name, XO_READ);
619 if (ret != 0) {
620 fprintf(stderr, "open failed\n");
621 goto exit;
622 }
623 ret = main_file_stat(&file, &len);
624 if (ret != 0) {
625 fprintf(stderr, "stat failed\n");
626 goto exit;
627 }
628
629 (*buf_len) = (size_t)len;
630 (*buf_ptr) = (uint8_t*) main_malloc(*buf_len);
631 ret = main_file_read(&file, *buf_ptr, *buf_len, &nread,
632 "read failed");
633 if (ret == 0 && *buf_len == nread) {
634 ret = 0;
635 } else {
636 fprintf(stderr, "invalid read\n");
637 ret = XD3_INTERNAL;
638 }
639 exit:
640 main_file_cleanup(&file);
641 return ret;
642 }
643
main(int argc,char ** argv)644 int main(int argc, char** argv) {
645 int i;
646 uint8_t *buf = NULL;
647 size_t buf_len = 0;
648 int ret;
649
650 if (argc <= 1) {
651 fprintf(stderr, "usage: %s file ...\n", argv[0]);
652 return 1;
653 }
654
655 // TODO: The xdelta3-hash.h code is identical now; add sameness test.
656 // using rabin_karp<> template.
657 #define TEST(T,Z,S,C) \
658 test_result<large_cksum<T,Z,S,hhash<T>,C>> \
659 _xck_ ## T ## _ ## Z ## _ ## S ## _ ## C \
660 ("xck_" #T "_" #Z "_" #S "_" #C); \
661 test_result<large_cksum_old<T,Z,S,hhash<T>,C>> \
662 _old_ ## T ## _ ## Z ## _ ## S ## _ ## C \
663 ("old_" #T "_" #Z "_" #S "_" #C)
664
665 #define TESTS(SIZE, SKIP) \
666 TEST(usize_t, SIZE, SKIP, 1); \
667 TEST(usize_t, SIZE, SKIP, 2)
668
669 TESTS(5, 1);
670 TESTS(6, 1);
671 TESTS(7, 1);
672 TESTS(8, 1);
673 TESTS(9, 1);
674 TESTS(10, 1);
675 TESTS(11, 1);
676 TESTS(12, 1);
677 TESTS(13, 1);
678 TESTS(14, 1);
679 TESTS(15, 1);
680 TESTS(16, 1);
681 TESTS(17, 1);
682 TESTS(18, 1);
683 TESTS(19, 1);
684 TESTS(20, 1);
685 TESTS(21, 1);
686 TESTS(22, 1);
687 TESTS(23, 1);
688 TESTS(24, 1);
689 TESTS(25, 1);
690 TESTS(26, 1);
691 TESTS(27, 1);
692 TESTS(28, 1);
693 TESTS(29, 1);
694 TESTS(30, 1);
695 TESTS(31, 1);
696 TESTS(32, 1);
697 TESTS(33, 1);
698 TESTS(34, 1);
699 TESTS(35, 1);
700 TESTS(36, 1);
701 TESTS(37, 1);
702 TESTS(38, 1);
703 TESTS(39, 1);
704
705
706 for (i = 1; i < argc; i++) {
707 if ((ret = read_whole_file(argv[i],
708 & buf,
709 & buf_len))) {
710 return 1;
711 }
712
713 fprintf(stderr, "file %s is %zu bytes\n",
714 argv[i], buf_len);
715
716 double min_time = -1.0;
717 double min_compression = 0.0;
718
719 for (vector<test_result_base*>::iterator iter = all_tests.begin();
720 iter != all_tests.end(); ++iter) {
721 test_result_base *test = *iter;
722 test->reset();
723
724 usize_t iters = 1;
725 long start_test = get_millisecs_now();
726
727 do {
728 test->get(buf, buf_len, iters);
729 iters *= 3;
730 iters /= 2;
731 } while (get_millisecs_now() - start_test < 2000);
732
733 test->stat();
734
735 if (min_time < 0.0) {
736 min_compression = test->compression();
737 min_time = test->time();
738 }
739
740 if (min_time > test->time()) {
741 min_time = test->time();
742 }
743
744 if (min_compression > test->compression()) {
745 min_compression = test->compression();
746 }
747
748 test->print();
749 }
750
751 main_free(buf);
752 buf = NULL;
753 }
754
755 return 0;
756 }
757