1 /* Copyright (C) 2007 Josh MacDonald */
2 
3 #include "test.h"
4 #include <assert.h>
5 #include <list>
6 #include <vector>
7 #include <algorithm>
8 
9 #include "../cpp-btree/btree_map.h"
10 
11 extern "C" {
12 uint32_t xd3_large32_cksum_old (xd3_hash_cfg *cfg, const uint8_t *base, const usize_t look);
13 uint32_t xd3_large32_cksum_update_old (xd3_hash_cfg *cfg, uint32_t cksum,
14 				       const uint8_t *base, const usize_t look);
15 
16 uint64_t xd3_large64_cksum_old (xd3_hash_cfg *cfg, const uint8_t *base, const usize_t look);
17 uint64_t xd3_large64_cksum_update_old (xd3_hash_cfg *cfg, uint64_t cksum,
18 				       const uint8_t *base, const usize_t look);
19 }
20 
21 using btree::btree_map;
22 using std::list;
23 using std::vector;
24 
25 // MLCG parameters
26 // a, a*
27 uint32_t good_32bit_values[] = {
28   1597334677U, // ...
29   741103597U, 887987685U,
30 };
31 
32 // a, a*
33 uint64_t good_64bit_values[] = {
34   1181783497276652981ULL, 4292484099903637661ULL,
35   7664345821815920749ULL, // ...
36 };
37 
print_header()38 void print_header() {
39   static int hdr_cnt = 0;
40   if (hdr_cnt++ % 20 == 0) {
41     printf("%-32sConf\t\tCount\tUniq\tFull\tCover\tColls"
42 	   "\tMB/s\tIters\t#Colls\n", "Name");
43   }
44 }
45 
46 struct true_type { };
47 struct false_type { };
48 
49 template <typename Word>
50 usize_t bitsof();
51 
52 template<>
bitsof()53 usize_t bitsof<unsigned int>() {
54   return sizeof(unsigned int) * 8;
55 }
56 
57 template<>
bitsof()58 usize_t bitsof<unsigned long>() {
59   return sizeof(unsigned long) * 8;
60 }
61 
62 template<>
bitsof()63 usize_t bitsof<unsigned long long>() {
64   return sizeof(unsigned long long) * 8;
65 }
66 
67 template <typename Word>
68 struct hhash {  // shift "s" bits leaving the high bits as a hash value for
69 		// this checksum, which are the most "distant" in terms of the
70 		// spectral test for the rabin_karp MLCG.  For short windows,
71 		// the high bits aren't enough, XOR "mask" worth of these in.
operator ()hhash72   Word operator()(const Word t, const Word s, const Word mask) {
73     return (t >> s) ^ (t & mask);
74   }
75 };
76 
77 template <typename Word>
78 Word good_word();
79 
80 template<>
good_word()81 uint32_t good_word<uint32_t>() {
82   return good_32bit_values[0];
83 }
84 
85 template<>
good_word()86 uint64_t good_word<uint64_t>() {
87   return good_64bit_values[0];
88 }
89 
90 // CLASSES
91 
92 #define SELF Word, CksumSize, CksumSkip, Hash, Compaction
93 #define MEMBER template <typename Word,		\
94 			 int CksumSize,		\
95 			 int CksumSkip,		\
96 			 typename Hash,		\
97                          int Compaction>
98 
99 MEMBER
100 struct cksum_params {
101   typedef Word word_type;
102   typedef Hash hash_type;
103 
104   static const int cksum_size = CksumSize;
105   static const int cksum_skip = CksumSkip;
106   static const int compaction = Compaction;
107 };
108 
109 MEMBER
110 struct rabin_karp : public cksum_params<SELF> {
111   // (a^cksum_size-1 c_0) + (a^cksum_size-2 c_1) ...
rabin_karprabin_karp112   rabin_karp()
113     : powers(make_powers()),
114       product(powers[0] * good_word<Word>()),
115       incr_state(0) { }
116 
make_powersrabin_karp117   static Word* make_powers() {
118     Word *p = new Word[CksumSize];
119     p[CksumSize - 1] = 1;
120     for (int i = CksumSize - 2; i >= 0; i--) {
121       p[i] = p[i + 1] * good_word<Word>();
122     }
123     return p;
124   }
125 
~rabin_karprabin_karp126   ~rabin_karp() {
127     delete [] powers;
128   }
129 
steprabin_karp130   Word step(const uint8_t *ptr) {
131     Word h = 0;
132     for (int i = 0; i < CksumSize; i++) {
133       h += (ptr[i]) * powers[i];
134     }
135     return h;
136   }
137 
state0rabin_karp138   Word state0(const uint8_t *ptr) {
139     incr_state = step(ptr);
140     return incr_state;
141   }
142 
incrrabin_karp143   Word incr(const uint8_t *ptr) {
144     incr_state = good_word<Word>() * incr_state -
145       product * (ptr[-1]) + (ptr[CksumSize - 1]);
146     return incr_state;
147   }
148 
149   const Word *const powers;
150   const Word  product;
151   Word        incr_state;
152 };
153 
154 MEMBER
155 struct with_stream : public cksum_params<SELF> {
156   xd3_stream stream;
157 
with_streamwith_stream158   with_stream()
159   {
160     xd3_config cfg;
161     memset (&stream, 0, sizeof (stream));
162     xd3_init_config (&cfg, 0);
163     cfg.smatch_cfg = XD3_SMATCH_SOFT;
164     cfg.smatcher_soft.large_look = CksumSize;
165     cfg.smatcher_soft.large_step = CksumSkip;
166     cfg.smatcher_soft.small_look = 4;
167     cfg.smatcher_soft.small_chain = 4;
168     cfg.smatcher_soft.small_lchain = 4;
169     cfg.smatcher_soft.max_lazy = 4;
170     cfg.smatcher_soft.long_enough = 4;
171     CHECK_EQ(0, xd3_config_stream (&stream, &cfg));
172 
173     CHECK_EQ(0, xd3_size_hashtable (&stream,
174 				    1<<10 /* ignored */,
175 				    stream.smatcher.large_look,
176 				    & stream.large_hash));
177   }
~with_streamwith_stream178   ~with_stream()
179   {
180     xd3_free_stream (&stream);
181   }
182 };
183 
184 MEMBER
185 struct large_cksum : public with_stream<SELF> {
steplarge_cksum186   Word step(const uint8_t *ptr) {
187     return xd3_large_cksum (&this->stream.large_hash, ptr, CksumSize);
188   }
189 
state0large_cksum190   Word state0(const uint8_t *ptr) {
191     incr_state = step(ptr);
192     return incr_state;
193   }
194 
incrlarge_cksum195   Word incr(const uint8_t *ptr) {
196     incr_state = xd3_large_cksum_update (&this->stream.large_hash,
197 					 incr_state, ptr - 1, CksumSize);
198     return incr_state;
199   }
200 
201   Word incr_state;
202 };
203 
204 #if SIZEOF_USIZE_T == 4
205 #define xd3_large_cksum_old         xd3_large32_cksum_old
206 #define xd3_large_cksum_update_old  xd3_large32_cksum_update_old
207 #elif SIZEOF_USIZE_T == 8
208 #define xd3_large_cksum_old         xd3_large64_cksum_old
209 #define xd3_large_cksum_update_old  xd3_large64_cksum_update_old
210 #endif
211 
212 MEMBER
213 struct large_cksum_old : public with_stream<SELF> {
steplarge_cksum_old214   Word step(const uint8_t *ptr) {
215     return xd3_large_cksum_old (&this->stream.large_hash, ptr, CksumSize);
216   }
217 
state0large_cksum_old218   Word state0(const uint8_t *ptr) {
219     incr_state = step(ptr);
220     return incr_state;
221   }
222 
incrlarge_cksum_old223   Word incr(const uint8_t *ptr) {
224     incr_state = xd3_large_cksum_update_old (&this->stream.large_hash,
225 					     incr_state, ptr - 1, CksumSize);
226     return incr_state;
227   }
228 
229   Word incr_state;
230 };
231 
232 // TESTS
233 
234 template <typename Word>
235 struct file_stats {
236   typedef const uint8_t* ptr_type;
237   typedef Word word_type;
238   typedef btree::btree_multimap<word_type, ptr_type> table_type;
239   typedef typename table_type::iterator table_iterator;
240 
241   usize_t cksum_size;
242   usize_t cksum_skip;
243   usize_t unique;
244   usize_t unique_values;
245   usize_t count;
246   table_type table;
247 
file_statsfile_stats248   file_stats(usize_t size, usize_t skip)
249     : cksum_size(size),
250       cksum_skip(skip),
251       unique(0),
252       unique_values(0),
253       count(0) {
254   }
255 
resetfile_stats256   void reset() {
257     unique = 0;
258     unique_values = 0;
259     count = 0;
260     table.clear();
261   }
262 
updatefile_stats263   void update(word_type word, ptr_type ptr) {
264     table_iterator t_i = table.find(word);
265 
266     count++;
267     if (t_i != table.end()) {
268       int collisions = 0;
269       for (table_iterator p_i = t_i;
270 	   p_i != table.end() && p_i->first == word;
271 	   ++p_i) {
272 	if (memcmp(p_i->second, ptr, cksum_size) == 0) {
273 	  return;
274 	}
275 	collisions++;
276       }
277       if (collisions >= 1000) {
278 	fprintf(stderr, "Something is not right, lots of collisions=%d\n",
279 		collisions);
280 	abort();
281       }
282     } else {
283       unique_values++;
284     }
285     unique++;
286     table.insert(std::make_pair(word, ptr));
287     return;
288   }
289 
freezefile_stats290   void freeze() {
291     table.clear();
292   }
293 };
294 
295 struct test_result_base;
296 
297 static vector<test_result_base*> all_tests;
298 
299 struct test_result_base {
~test_result_basetest_result_base300   virtual ~test_result_base() {
301   }
302   virtual void reset() = 0;
303   virtual void print() = 0;
304   virtual void get(const uint8_t* buf, const size_t buf_size,
305 		   usize_t iters) = 0;
306   virtual void stat() = 0;
307   virtual usize_t count() = 0;
308   virtual usize_t dups() = 0;
309   virtual double uniqueness() = 0;
310   virtual double fullness() = 0;
311   virtual double collisions() = 0;
312   virtual double coverage() = 0;
313   virtual double compression() = 0;
314   virtual double time() = 0;
315   virtual double total_time() = 0;
316   virtual usize_t total_count() = 0;
317   virtual usize_t total_dups() = 0;
318 };
319 
320 template <typename Checksum>
321 struct test_result : public test_result_base {
322   Checksum cksum;
323   const char *test_name;
324   file_stats<typename Checksum::word_type> fstats;
325   usize_t test_size;
326   usize_t n_steps;
327   usize_t n_incrs;
328   typename Checksum::word_type s_bits;
329   typename Checksum::word_type s_mask;
330   usize_t t_entries;
331   usize_t h_bits;
332   usize_t h_buckets_full;
333   char *hash_table;
334   long accum_millis;
335   usize_t accum_iters;
336 
337   // These are not reset
338   double accum_time;
339   usize_t accum_count;
340   usize_t accum_dups;
341   usize_t accum_colls;
342   size_t accum_size;
343 
test_resulttest_result344   test_result(const char *name)
345     : test_name(name),
346       fstats(Checksum::cksum_size, Checksum::cksum_skip),
347       hash_table(NULL),
348       accum_millis(0),
349       accum_iters(0),
350       accum_time(0.0),
351       accum_count(0),
352       accum_dups(0),
353       accum_colls(0),
354       accum_size(0) {
355     all_tests.push_back(this);
356   }
357 
~test_resulttest_result358   ~test_result() {
359     reset();
360   }
361 
resettest_result362   void reset() {
363     // size of file
364     test_size = 0;
365 
366     // count
367     n_steps = 0;
368     n_incrs = 0;
369 
370     // four values used by new_table()/summarize_table()
371     s_bits = 0;
372     s_mask = 0;
373     t_entries = 0;
374     h_bits = 0;
375     h_buckets_full = 0;
376 
377     accum_millis = 0;
378     accum_iters = 0;
379 
380     fstats.reset();
381 
382     // temporary
383     if (hash_table) {
384       delete(hash_table);
385       hash_table = NULL;
386     }
387   }
388 
counttest_result389   usize_t count() {
390     if (Checksum::cksum_skip == 1) {
391       return n_incrs;
392     } else {
393       return n_steps;
394     }
395   }
396 
dupstest_result397   usize_t dups() {
398     return fstats.count - fstats.unique;
399   }
400 
401   /* Fraction of distinct strings of length cksum_size which are not
402    * represented in the hash table. */
collisionstest_result403   double collisions() {
404     return (fstats.unique - fstats.unique_values) / (double) fstats.unique;
405   }
collstest_result406   usize_t colls() {
407     return (fstats.unique - fstats.unique_values);
408   }
409 
uniquenesstest_result410   double uniqueness() {
411     return 1.0 - (double) dups() / count();
412   }
413 
fullnesstest_result414   double fullness() {
415     return (double) h_buckets_full / (1 << h_bits);
416   }
417 
coveragetest_result418   double coverage() {
419     return (double) h_buckets_full / uniqueness() / count();
420   }
421 
compressiontest_result422   double compression() {
423     return 1.0 - coverage();
424   }
425 
timetest_result426   double time() {
427     return (double) accum_millis / accum_iters;
428   }
429 
total_timetest_result430   double total_time() {
431     return accum_time;
432   }
433 
total_counttest_result434   usize_t total_count() {
435     return accum_count;
436   }
437 
total_dupstest_result438   usize_t total_dups() {
439     return accum_dups;
440   }
441 
total_collstest_result442   usize_t total_colls() {
443     return accum_dups;
444   }
445 
stattest_result446   void stat() {
447     accum_time += time();
448     accum_count += count();
449     accum_dups += dups();
450     accum_colls += colls();
451     accum_size += test_size;
452   }
453 
printtest_result454   void print() {
455     if (fstats.count != count()) {
456       fprintf(stderr, "internal error: %" W "d != %" W "d\n", fstats.count, count());
457       abort();
458     }
459     print_header();
460     printf("%-32s%d/%d 2^%" W "u\t%" W "u\t%0.4f\t%.4f\t%.4f\t%.1e\t%.2f\t"
461 	   "%" W "u\t%" W "u\n",
462 	   test_name,
463 	   Checksum::cksum_size,
464 	   Checksum::cksum_skip,
465 	   h_bits,
466 	   count(),
467 	   uniqueness(),
468 	   fullness(),
469 	   coverage(),
470 	   collisions(),
471 	   0.001 * accum_iters * test_size / accum_millis,
472 	   accum_iters,
473 	   colls());
474   }
475 
size_log2test_result476   usize_t size_log2 (usize_t slots) {
477     usize_t bits = bitsof<typename Checksum::word_type>() - 1;
478     usize_t i;
479 
480     for (i = 3; i <= bits; i += 1) {
481       if (slots <= (1U << i)) {
482 	return i - Checksum::compaction;
483       }
484     }
485 
486     return bits;
487   }
488 
new_tabletest_result489   void new_table(usize_t entries) {
490     t_entries = entries;
491     h_bits = size_log2(entries);
492 
493     usize_t n = 1 << h_bits;
494 
495     s_bits = bitsof<typename Checksum::word_type>() - h_bits;
496     s_mask = n - 1U;
497 
498     hash_table = new char[n / 8];
499     memset(hash_table, 0, n / 8);
500   }
501 
get_table_bittest_result502   int get_table_bit(usize_t i) {
503     return hash_table[i/8] & (1 << i%8);
504   }
505 
set_table_bittest_result506   int set_table_bit(usize_t i) {
507     return hash_table[i/8] |= (1 << i%8);
508   }
509 
summarize_tabletest_result510   void summarize_table() {
511     usize_t n = 1 << h_bits;
512     usize_t f = 0;
513     for (usize_t i = 0; i < n; i++) {
514       if (get_table_bit(i)) {
515 	f++;
516       }
517     }
518     h_buckets_full = f;
519   }
520 
gettest_result521   void get(const uint8_t* buf, const size_t buf_size, usize_t test_iters) {
522     typename Checksum::hash_type hash;
523     const uint8_t *ptr;
524     const uint8_t *end;
525     usize_t periods;
526     int64_t last_offset;
527     int64_t stop;
528 
529     test_size = buf_size;
530     last_offset = buf_size - Checksum::cksum_size;
531 
532     if (last_offset < 0) {
533       periods = 0;
534       n_steps = 0;
535       n_incrs = 0;
536       stop = -Checksum::cksum_size;
537     } else {
538       periods = last_offset / Checksum::cksum_skip;
539       n_steps = periods + 1;
540       n_incrs = last_offset + 1;
541       stop = last_offset - (periods + 1) * Checksum::cksum_skip;
542     }
543 
544     // Compute file stats once.
545     if (fstats.unique_values == 0) {
546       if (Checksum::cksum_skip == 1) {
547 	for (size_t i = 0; i <= buf_size - Checksum::cksum_size; i++) {
548 	  fstats.update(hash(cksum.step(buf + i), s_bits, s_mask), buf + i);
549 	}
550       } else {
551 	ptr = buf + last_offset;
552 	end = buf + stop;
553 
554 	for (; ptr != end; ptr -= Checksum::cksum_skip) {
555 	  fstats.update(hash(cksum.step(ptr), s_bits, s_mask), ptr);
556 	}
557       }
558       fstats.freeze();
559     }
560 
561     long start_test = get_millisecs_now();
562 
563     if (Checksum::cksum_skip != 1) {
564       new_table(n_steps);
565 
566       for (usize_t i = 0; i < test_iters; i++) {
567 	ptr = buf + last_offset;
568 	end = buf + stop;
569 
570 	for (; ptr != end; ptr -= Checksum::cksum_skip) {
571 	  set_table_bit(hash(cksum.step(ptr), s_bits, s_mask));
572 	}
573       }
574 
575       summarize_table();
576     }
577 
578     stop = buf_size - Checksum::cksum_size + 1;
579     if (stop < 0) {
580       stop = 0;
581     }
582 
583     if (Checksum::cksum_skip == 1) {
584       new_table(n_incrs);
585 
586       for (usize_t i = 0; i < test_iters; i++) {
587 	ptr = buf;
588 	end = buf + stop;
589 
590 	if (ptr != end) {
591 	  set_table_bit(hash(cksum.state0(ptr++), s_bits, s_mask));
592 	}
593 
594 	for (; ptr != end; ptr++) {
595 	  typename Checksum::word_type w = cksum.incr(ptr);
596 	  CHECK_EQ(w, cksum.step(ptr));
597 	  set_table_bit(hash(w, s_bits, s_mask));
598 	}
599       }
600 
601       summarize_table();
602     }
603 
604     accum_iters += test_iters;
605     accum_millis += get_millisecs_now() - start_test;
606   }
607 };
608 
read_whole_file(const char * name,uint8_t ** buf_ptr,size_t * buf_len)609 static int read_whole_file(const char *name,
610 			   uint8_t **buf_ptr,
611 			   size_t *buf_len) {
612   main_file file;
613   int ret;
614   xoff_t len;
615   size_t nread;
616   main_file_init(&file);
617   file.filename = name;
618   ret = main_file_open(&file, name, XO_READ);
619   if (ret != 0) {
620     fprintf(stderr, "open failed\n");
621     goto exit;
622   }
623   ret = main_file_stat(&file, &len);
624   if (ret != 0) {
625     fprintf(stderr, "stat failed\n");
626     goto exit;
627   }
628 
629   (*buf_len) = (size_t)len;
630   (*buf_ptr) = (uint8_t*) main_malloc(*buf_len);
631   ret = main_file_read(&file, *buf_ptr, *buf_len, &nread,
632 		       "read failed");
633   if (ret == 0 && *buf_len == nread) {
634     ret = 0;
635   } else {
636     fprintf(stderr, "invalid read\n");
637     ret = XD3_INTERNAL;
638   }
639  exit:
640   main_file_cleanup(&file);
641   return ret;
642 }
643 
main(int argc,char ** argv)644 int main(int argc, char** argv) {
645   int i;
646   uint8_t *buf = NULL;
647   size_t buf_len = 0;
648   int ret;
649 
650   if (argc <= 1) {
651     fprintf(stderr, "usage: %s file ...\n", argv[0]);
652     return 1;
653   }
654 
655 // TODO: The xdelta3-hash.h code is identical now; add sameness test.
656 // using rabin_karp<> template.
657 #define TEST(T,Z,S,C)					\
658   test_result<large_cksum<T,Z,S,hhash<T>,C>>		\
659     _xck_ ## T ## _ ## Z ## _ ## S ## _ ## C		\
660     ("xck_" #T "_" #Z "_" #S "_" #C);			\
661   test_result<large_cksum_old<T,Z,S,hhash<T>,C>>	\
662     _old_ ## T ## _ ## Z ## _ ## S ## _ ## C		\
663     ("old_" #T "_" #Z "_" #S "_" #C)
664 
665 #define TESTS(SIZE, SKIP)	 \
666   TEST(usize_t, SIZE, SKIP, 1);  \
667   TEST(usize_t, SIZE, SKIP, 2)
668 
669   TESTS(5, 1);
670   TESTS(6, 1);
671   TESTS(7, 1);
672   TESTS(8, 1);
673   TESTS(9, 1);
674   TESTS(10, 1);
675   TESTS(11, 1);
676   TESTS(12, 1);
677   TESTS(13, 1);
678   TESTS(14, 1);
679   TESTS(15, 1);
680   TESTS(16, 1);
681   TESTS(17, 1);
682   TESTS(18, 1);
683   TESTS(19, 1);
684   TESTS(20, 1);
685   TESTS(21, 1);
686   TESTS(22, 1);
687   TESTS(23, 1);
688   TESTS(24, 1);
689   TESTS(25, 1);
690   TESTS(26, 1);
691   TESTS(27, 1);
692   TESTS(28, 1);
693   TESTS(29, 1);
694   TESTS(30, 1);
695   TESTS(31, 1);
696   TESTS(32, 1);
697   TESTS(33, 1);
698   TESTS(34, 1);
699   TESTS(35, 1);
700   TESTS(36, 1);
701   TESTS(37, 1);
702   TESTS(38, 1);
703   TESTS(39, 1);
704 
705 
706   for (i = 1; i < argc; i++) {
707     if ((ret = read_whole_file(argv[i],
708 			       & buf,
709 			       & buf_len))) {
710       return 1;
711     }
712 
713     fprintf(stderr, "file %s is %zu bytes\n",
714 	    argv[i], buf_len);
715 
716     double min_time = -1.0;
717     double min_compression = 0.0;
718 
719     for (vector<test_result_base*>::iterator iter = all_tests.begin();
720 	 iter != all_tests.end(); ++iter) {
721       test_result_base *test = *iter;
722       test->reset();
723 
724       usize_t iters = 1;
725       long start_test = get_millisecs_now();
726 
727       do {
728 	test->get(buf, buf_len, iters);
729 	iters *= 3;
730 	iters /= 2;
731       } while (get_millisecs_now() - start_test < 2000);
732 
733       test->stat();
734 
735       if (min_time < 0.0) {
736 	min_compression = test->compression();
737 	min_time = test->time();
738       }
739 
740       if (min_time > test->time()) {
741 	min_time = test->time();
742       }
743 
744       if (min_compression > test->compression()) {
745 	min_compression = test->compression();
746       }
747 
748       test->print();
749     }
750 
751     main_free(buf);
752     buf = NULL;
753   }
754 
755   return 0;
756 }
757