1 /* xdelta3 - delta compression tools and library -*- Mode: C++ -*-
2    Copyright 2016 Joshua MacDonald
3 
4    Licensed under the Apache License, Version 2.0 (the "License");
5    you may not use this file except in compliance with the License.
6    You may obtain a copy of the License at
7 
8    http://www.apache.org/licenses/LICENSE-2.0
9 
10    Unless required by applicable law or agreed to in writing, software
11    distributed under the License is distributed on an "AS IS" BASIS,
12    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13    See the License for the specific language governing permissions and
14    limitations under the License.
15 */
16 
17 #include "test.h"
18 #include <assert.h>
19 #include <list>
20 #include <vector>
21 #include <algorithm>
22 
23 #include "../cpp-btree/btree_map.h"
24 
25 extern "C" {
26 uint32_t xd3_large32_cksum_old (xd3_hash_cfg *cfg, const uint8_t *base, const usize_t look);
27 uint32_t xd3_large32_cksum_update_old (xd3_hash_cfg *cfg, uint32_t cksum,
28 				       const uint8_t *base, const usize_t look);
29 
30 uint64_t xd3_large64_cksum_old (xd3_hash_cfg *cfg, const uint8_t *base, const usize_t look);
31 uint64_t xd3_large64_cksum_update_old (xd3_hash_cfg *cfg, uint64_t cksum,
32 				       const uint8_t *base, const usize_t look);
33 }
34 
35 using btree::btree_map;
36 using std::list;
37 using std::vector;
38 
39 // MLCG parameters
40 // a, a*
41 uint32_t good_32bit_values[] = {
42   1597334677U, // ...
43   741103597U, 887987685U,
44 };
45 
46 // a, a*
47 uint64_t good_64bit_values[] = {
48   1181783497276652981ULL, 4292484099903637661ULL,
49   7664345821815920749ULL, // ...
50 };
51 
print_header()52 void print_header() {
53   static int hdr_cnt = 0;
54   if (hdr_cnt++ % 20 == 0) {
55     printf("%-32sConf\t\tCount\tUniq\tFull\tCover\tColls"
56 	   "\tMB/s\tIters\t#Colls\n", "Name");
57   }
58 }
59 
60 struct true_type { };
61 struct false_type { };
62 
63 template <typename Word>
64 usize_t bitsof();
65 
66 template<>
bitsof()67 usize_t bitsof<unsigned int>() {
68   return sizeof(unsigned int) * 8;
69 }
70 
71 template<>
bitsof()72 usize_t bitsof<unsigned long>() {
73   return sizeof(unsigned long) * 8;
74 }
75 
76 template<>
bitsof()77 usize_t bitsof<unsigned long long>() {
78   return sizeof(unsigned long long) * 8;
79 }
80 
81 template <typename Word>
82 struct hhash {  // shift "s" bits leaving the high bits as a hash value for
83 		// this checksum, which are the most "distant" in terms of the
84 		// spectral test for the rabin_karp MLCG.  For short windows,
85 		// the high bits aren't enough, XOR "mask" worth of these in.
operator ()hhash86   Word operator()(const Word t, const Word s, const Word mask) {
87     return (t >> s) ^ (t & mask);
88   }
89 };
90 
91 template <typename Word>
92 Word good_word();
93 
94 template<>
good_word()95 uint32_t good_word<uint32_t>() {
96   return good_32bit_values[0];
97 }
98 
99 template<>
good_word()100 uint64_t good_word<uint64_t>() {
101   return good_64bit_values[0];
102 }
103 
104 // CLASSES
105 
106 #define SELF Word, CksumSize, CksumSkip, Hash, Compaction
107 #define MEMBER template <typename Word,		\
108 			 int CksumSize,		\
109 			 int CksumSkip,		\
110 			 typename Hash,		\
111                          int Compaction>
112 
113 MEMBER
114 struct cksum_params {
115   typedef Word word_type;
116   typedef Hash hash_type;
117 
118   static const int cksum_size = CksumSize;
119   static const int cksum_skip = CksumSkip;
120   static const int compaction = Compaction;
121 };
122 
123 MEMBER
124 struct rabin_karp : public cksum_params<SELF> {
125   // (a^cksum_size-1 c_0) + (a^cksum_size-2 c_1) ...
rabin_karprabin_karp126   rabin_karp()
127     : powers(make_powers()),
128       product(powers[0] * good_word<Word>()),
129       incr_state(0) { }
130 
make_powersrabin_karp131   static Word* make_powers() {
132     Word *p = new Word[CksumSize];
133     p[CksumSize - 1] = 1;
134     for (int i = CksumSize - 2; i >= 0; i--) {
135       p[i] = p[i + 1] * good_word<Word>();
136     }
137     return p;
138   }
139 
~rabin_karprabin_karp140   ~rabin_karp() {
141     delete [] powers;
142   }
143 
steprabin_karp144   Word step(const uint8_t *ptr) {
145     Word h = 0;
146     for (int i = 0; i < CksumSize; i++) {
147       h += (ptr[i]) * powers[i];
148     }
149     return h;
150   }
151 
state0rabin_karp152   Word state0(const uint8_t *ptr) {
153     incr_state = step(ptr);
154     return incr_state;
155   }
156 
incrrabin_karp157   Word incr(const uint8_t *ptr) {
158     incr_state = good_word<Word>() * incr_state -
159       product * (ptr[-1]) + (ptr[CksumSize - 1]);
160     return incr_state;
161   }
162 
163   const Word *const powers;
164   const Word  product;
165   Word        incr_state;
166 };
167 
168 MEMBER
169 struct with_stream : public cksum_params<SELF> {
170   xd3_stream stream;
171 
with_streamwith_stream172   with_stream()
173   {
174     xd3_config cfg;
175     memset (&stream, 0, sizeof (stream));
176     xd3_init_config (&cfg, 0);
177     cfg.smatch_cfg = XD3_SMATCH_SOFT;
178     cfg.smatcher_soft.large_look = CksumSize;
179     cfg.smatcher_soft.large_step = CksumSkip;
180     cfg.smatcher_soft.small_look = 4;
181     cfg.smatcher_soft.small_chain = 4;
182     cfg.smatcher_soft.small_lchain = 4;
183     cfg.smatcher_soft.max_lazy = 4;
184     cfg.smatcher_soft.long_enough = 4;
185     CHECK_EQ(0, xd3_config_stream (&stream, &cfg));
186 
187     CHECK_EQ(0, xd3_size_hashtable (&stream,
188 				    1<<10 /* ignored */,
189 				    stream.smatcher.large_look,
190 				    & stream.large_hash));
191   }
~with_streamwith_stream192   ~with_stream()
193   {
194     xd3_free_stream (&stream);
195   }
196 };
197 
198 MEMBER
199 struct large_cksum : public with_stream<SELF> {
steplarge_cksum200   Word step(const uint8_t *ptr) {
201     return xd3_large_cksum (&this->stream.large_hash, ptr, CksumSize);
202   }
203 
state0large_cksum204   Word state0(const uint8_t *ptr) {
205     incr_state = step(ptr);
206     return incr_state;
207   }
208 
incrlarge_cksum209   Word incr(const uint8_t *ptr) {
210     incr_state = xd3_large_cksum_update (&this->stream.large_hash,
211 					 incr_state, ptr - 1, CksumSize);
212     return incr_state;
213   }
214 
215   Word incr_state;
216 };
217 
218 #if SIZEOF_USIZE_T == 4
219 #define xd3_large_cksum_old         xd3_large32_cksum_old
220 #define xd3_large_cksum_update_old  xd3_large32_cksum_update_old
221 #elif SIZEOF_USIZE_T == 8
222 #define xd3_large_cksum_old         xd3_large64_cksum_old
223 #define xd3_large_cksum_update_old  xd3_large64_cksum_update_old
224 #endif
225 
226 MEMBER
227 struct large_cksum_old : public with_stream<SELF> {
steplarge_cksum_old228   Word step(const uint8_t *ptr) {
229     return xd3_large_cksum_old (&this->stream.large_hash, ptr, CksumSize);
230   }
231 
state0large_cksum_old232   Word state0(const uint8_t *ptr) {
233     incr_state = step(ptr);
234     return incr_state;
235   }
236 
incrlarge_cksum_old237   Word incr(const uint8_t *ptr) {
238     incr_state = xd3_large_cksum_update_old (&this->stream.large_hash,
239 					     incr_state, ptr - 1, CksumSize);
240     return incr_state;
241   }
242 
243   Word incr_state;
244 };
245 
246 // TESTS
247 
248 template <typename Word>
249 struct file_stats {
250   typedef const uint8_t* ptr_type;
251   typedef Word word_type;
252   typedef btree::btree_multimap<word_type, ptr_type> table_type;
253   typedef typename table_type::iterator table_iterator;
254 
255   usize_t cksum_size;
256   usize_t cksum_skip;
257   usize_t unique;
258   usize_t unique_values;
259   usize_t count;
260   table_type table;
261 
file_statsfile_stats262   file_stats(usize_t size, usize_t skip)
263     : cksum_size(size),
264       cksum_skip(skip),
265       unique(0),
266       unique_values(0),
267       count(0) {
268   }
269 
resetfile_stats270   void reset() {
271     unique = 0;
272     unique_values = 0;
273     count = 0;
274     table.clear();
275   }
276 
updatefile_stats277   void update(word_type word, ptr_type ptr) {
278     table_iterator t_i = table.find(word);
279 
280     count++;
281     if (t_i != table.end()) {
282       int collisions = 0;
283       for (table_iterator p_i = t_i;
284 	   p_i != table.end() && p_i->first == word;
285 	   ++p_i) {
286 	if (memcmp(p_i->second, ptr, cksum_size) == 0) {
287 	  return;
288 	}
289 	collisions++;
290       }
291       if (collisions >= 1000) {
292 	fprintf(stderr, "Something is not right, lots of collisions=%d\n",
293 		collisions);
294 	abort();
295       }
296     } else {
297       unique_values++;
298     }
299     unique++;
300     table.insert(std::make_pair(word, ptr));
301     return;
302   }
303 
freezefile_stats304   void freeze() {
305     table.clear();
306   }
307 };
308 
309 struct test_result_base;
310 
311 static vector<test_result_base*> all_tests;
312 
313 struct test_result_base {
~test_result_basetest_result_base314   virtual ~test_result_base() {
315   }
316   virtual void reset() = 0;
317   virtual void print() = 0;
318   virtual void get(const uint8_t* buf, const size_t buf_size,
319 		   usize_t iters) = 0;
320   virtual void stat() = 0;
321   virtual usize_t count() = 0;
322   virtual usize_t dups() = 0;
323   virtual double uniqueness() = 0;
324   virtual double fullness() = 0;
325   virtual double collisions() = 0;
326   virtual double coverage() = 0;
327   virtual double compression() = 0;
328   virtual double time() = 0;
329   virtual double total_time() = 0;
330   virtual usize_t total_count() = 0;
331   virtual usize_t total_dups() = 0;
332 };
333 
334 template <typename Checksum>
335 struct test_result : public test_result_base {
336   Checksum cksum;
337   const char *test_name;
338   file_stats<typename Checksum::word_type> fstats;
339   usize_t test_size;
340   usize_t n_steps;
341   usize_t n_incrs;
342   typename Checksum::word_type s_bits;
343   typename Checksum::word_type s_mask;
344   usize_t t_entries;
345   usize_t h_bits;
346   usize_t h_buckets_full;
347   char *hash_table;
348   long accum_millis;
349   usize_t accum_iters;
350 
351   // These are not reset
352   double accum_time;
353   usize_t accum_count;
354   usize_t accum_dups;
355   usize_t accum_colls;
356   size_t accum_size;
357 
test_resulttest_result358   test_result(const char *name)
359     : test_name(name),
360       fstats(Checksum::cksum_size, Checksum::cksum_skip),
361       hash_table(NULL),
362       accum_millis(0),
363       accum_iters(0),
364       accum_time(0.0),
365       accum_count(0),
366       accum_dups(0),
367       accum_colls(0),
368       accum_size(0) {
369     all_tests.push_back(this);
370   }
371 
~test_resulttest_result372   ~test_result() {
373     reset();
374   }
375 
resettest_result376   void reset() {
377     // size of file
378     test_size = 0;
379 
380     // count
381     n_steps = 0;
382     n_incrs = 0;
383 
384     // four values used by new_table()/summarize_table()
385     s_bits = 0;
386     s_mask = 0;
387     t_entries = 0;
388     h_bits = 0;
389     h_buckets_full = 0;
390 
391     accum_millis = 0;
392     accum_iters = 0;
393 
394     fstats.reset();
395 
396     // temporary
397     if (hash_table) {
398       delete(hash_table);
399       hash_table = NULL;
400     }
401   }
402 
counttest_result403   usize_t count() {
404     if (Checksum::cksum_skip == 1) {
405       return n_incrs;
406     } else {
407       return n_steps;
408     }
409   }
410 
dupstest_result411   usize_t dups() {
412     return fstats.count - fstats.unique;
413   }
414 
415   /* Fraction of distinct strings of length cksum_size which are not
416    * represented in the hash table. */
collisionstest_result417   double collisions() {
418     return (fstats.unique - fstats.unique_values) / (double) fstats.unique;
419   }
collstest_result420   usize_t colls() {
421     return (fstats.unique - fstats.unique_values);
422   }
423 
uniquenesstest_result424   double uniqueness() {
425     return 1.0 - (double) dups() / count();
426   }
427 
fullnesstest_result428   double fullness() {
429     return (double) h_buckets_full / (1 << h_bits);
430   }
431 
coveragetest_result432   double coverage() {
433     return (double) h_buckets_full / uniqueness() / count();
434   }
435 
compressiontest_result436   double compression() {
437     return 1.0 - coverage();
438   }
439 
timetest_result440   double time() {
441     return (double) accum_millis / accum_iters;
442   }
443 
total_timetest_result444   double total_time() {
445     return accum_time;
446   }
447 
total_counttest_result448   usize_t total_count() {
449     return accum_count;
450   }
451 
total_dupstest_result452   usize_t total_dups() {
453     return accum_dups;
454   }
455 
total_collstest_result456   usize_t total_colls() {
457     return accum_dups;
458   }
459 
stattest_result460   void stat() {
461     accum_time += time();
462     accum_count += count();
463     accum_dups += dups();
464     accum_colls += colls();
465     accum_size += test_size;
466   }
467 
printtest_result468   void print() {
469     if (fstats.count != count()) {
470       fprintf(stderr, "internal error: %" W "d != %" W "d\n", fstats.count, count());
471       abort();
472     }
473     print_header();
474     printf("%-32s%d/%d 2^%" W "u\t%" W "u\t%0.4f\t%.4f\t%.4f\t%.1e\t%.2f\t"
475 	   "%" W "u\t%" W "u\n",
476 	   test_name,
477 	   Checksum::cksum_size,
478 	   Checksum::cksum_skip,
479 	   h_bits,
480 	   count(),
481 	   uniqueness(),
482 	   fullness(),
483 	   coverage(),
484 	   collisions(),
485 	   0.001 * accum_iters * test_size / accum_millis,
486 	   accum_iters,
487 	   colls());
488   }
489 
size_log2test_result490   usize_t size_log2 (usize_t slots) {
491     usize_t bits = bitsof<typename Checksum::word_type>() - 1;
492     usize_t i;
493 
494     for (i = 3; i <= bits; i += 1) {
495       if (slots <= (1U << i)) {
496 	return i - Checksum::compaction;
497       }
498     }
499 
500     return bits;
501   }
502 
new_tabletest_result503   void new_table(usize_t entries) {
504     t_entries = entries;
505     h_bits = size_log2(entries);
506 
507     usize_t n = 1 << h_bits;
508 
509     s_bits = bitsof<typename Checksum::word_type>() - h_bits;
510     s_mask = n - 1U;
511 
512     hash_table = new char[n / 8];
513     memset(hash_table, 0, n / 8);
514   }
515 
get_table_bittest_result516   int get_table_bit(usize_t i) {
517     return hash_table[i/8] & (1 << i%8);
518   }
519 
set_table_bittest_result520   int set_table_bit(usize_t i) {
521     return hash_table[i/8] |= (1 << i%8);
522   }
523 
summarize_tabletest_result524   void summarize_table() {
525     usize_t n = 1 << h_bits;
526     usize_t f = 0;
527     for (usize_t i = 0; i < n; i++) {
528       if (get_table_bit(i)) {
529 	f++;
530       }
531     }
532     h_buckets_full = f;
533   }
534 
gettest_result535   void get(const uint8_t* buf, const size_t buf_size, usize_t test_iters) {
536     typename Checksum::hash_type hash;
537     const uint8_t *ptr;
538     const uint8_t *end;
539     usize_t periods;
540     int64_t last_offset;
541     int64_t stop;
542 
543     test_size = buf_size;
544     last_offset = buf_size - Checksum::cksum_size;
545 
546     if (last_offset < 0) {
547       periods = 0;
548       n_steps = 0;
549       n_incrs = 0;
550       stop = -Checksum::cksum_size;
551     } else {
552       periods = last_offset / Checksum::cksum_skip;
553       n_steps = periods + 1;
554       n_incrs = last_offset + 1;
555       stop = last_offset - (periods + 1) * Checksum::cksum_skip;
556     }
557 
558     // Compute file stats once.
559     if (fstats.unique_values == 0) {
560       if (Checksum::cksum_skip == 1) {
561 	for (size_t i = 0; i <= buf_size - Checksum::cksum_size; i++) {
562 	  fstats.update(hash(cksum.step(buf + i), s_bits, s_mask), buf + i);
563 	}
564       } else {
565 	ptr = buf + last_offset;
566 	end = buf + stop;
567 
568 	for (; ptr != end; ptr -= Checksum::cksum_skip) {
569 	  fstats.update(hash(cksum.step(ptr), s_bits, s_mask), ptr);
570 	}
571       }
572       fstats.freeze();
573     }
574 
575     long start_test = get_millisecs_now();
576 
577     if (Checksum::cksum_skip != 1) {
578       new_table(n_steps);
579 
580       for (usize_t i = 0; i < test_iters; i++) {
581 	ptr = buf + last_offset;
582 	end = buf + stop;
583 
584 	for (; ptr != end; ptr -= Checksum::cksum_skip) {
585 	  set_table_bit(hash(cksum.step(ptr), s_bits, s_mask));
586 	}
587       }
588 
589       summarize_table();
590     }
591 
592     stop = buf_size - Checksum::cksum_size + 1;
593     if (stop < 0) {
594       stop = 0;
595     }
596 
597     if (Checksum::cksum_skip == 1) {
598       new_table(n_incrs);
599 
600       for (usize_t i = 0; i < test_iters; i++) {
601 	ptr = buf;
602 	end = buf + stop;
603 
604 	if (ptr != end) {
605 	  set_table_bit(hash(cksum.state0(ptr++), s_bits, s_mask));
606 	}
607 
608 	for (; ptr != end; ptr++) {
609 	  typename Checksum::word_type w = cksum.incr(ptr);
610 	  CHECK_EQ(w, cksum.step(ptr));
611 	  set_table_bit(hash(w, s_bits, s_mask));
612 	}
613       }
614 
615       summarize_table();
616     }
617 
618     accum_iters += test_iters;
619     accum_millis += get_millisecs_now() - start_test;
620   }
621 };
622 
read_whole_file(const char * name,uint8_t ** buf_ptr,size_t * buf_len)623 static int read_whole_file(const char *name,
624 			   uint8_t **buf_ptr,
625 			   size_t *buf_len) {
626   main_file file;
627   int ret;
628   xoff_t len;
629   size_t nread;
630   main_file_init(&file);
631   file.filename = name;
632   ret = main_file_open(&file, name, XO_READ);
633   if (ret != 0) {
634     fprintf(stderr, "open failed\n");
635     goto exit;
636   }
637   ret = main_file_stat(&file, &len);
638   if (ret != 0) {
639     fprintf(stderr, "stat failed\n");
640     goto exit;
641   }
642 
643   (*buf_len) = (size_t)len;
644   (*buf_ptr) = (uint8_t*) main_malloc(*buf_len);
645   ret = main_file_read(&file, *buf_ptr, *buf_len, &nread,
646 		       "read failed");
647   if (ret == 0 && *buf_len == nread) {
648     ret = 0;
649   } else {
650     fprintf(stderr, "invalid read\n");
651     ret = XD3_INTERNAL;
652   }
653  exit:
654   main_file_cleanup(&file);
655   return ret;
656 }
657 
main(int argc,char ** argv)658 int main(int argc, char** argv) {
659   int i;
660   uint8_t *buf = NULL;
661   size_t buf_len = 0;
662   int ret;
663 
664   if (argc <= 1) {
665     fprintf(stderr, "usage: %s file ...\n", argv[0]);
666     return 1;
667   }
668 
669 // TODO: The xdelta3-hash.h code is identical now; add sameness test.
670 // using rabin_karp<> template.
671 #define TEST(T,Z,S,C)					\
672   test_result<large_cksum<T,Z,S,hhash<T>,C>>		\
673     _xck_ ## T ## _ ## Z ## _ ## S ## _ ## C		\
674     ("xck_" #T "_" #Z "_" #S "_" #C);			\
675   test_result<large_cksum_old<T,Z,S,hhash<T>,C>>	\
676     _old_ ## T ## _ ## Z ## _ ## S ## _ ## C		\
677     ("old_" #T "_" #Z "_" #S "_" #C)
678 
679 #define TESTS(SIZE, SKIP)	 \
680   TEST(usize_t, SIZE, SKIP, 1);  \
681   TEST(usize_t, SIZE, SKIP, 2)
682 
683   TESTS(5, 1);
684   TESTS(6, 1);
685   TESTS(7, 1);
686   TESTS(8, 1);
687   TESTS(9, 1);
688   TESTS(10, 1);
689   TESTS(11, 1);
690   TESTS(12, 1);
691   TESTS(13, 1);
692   TESTS(14, 1);
693   TESTS(15, 1);
694   TESTS(16, 1);
695   TESTS(17, 1);
696   TESTS(18, 1);
697   TESTS(19, 1);
698   TESTS(20, 1);
699   TESTS(21, 1);
700   TESTS(22, 1);
701   TESTS(23, 1);
702   TESTS(24, 1);
703   TESTS(25, 1);
704   TESTS(26, 1);
705   TESTS(27, 1);
706   TESTS(28, 1);
707   TESTS(29, 1);
708   TESTS(30, 1);
709   TESTS(31, 1);
710   TESTS(32, 1);
711   TESTS(33, 1);
712   TESTS(34, 1);
713   TESTS(35, 1);
714   TESTS(36, 1);
715   TESTS(37, 1);
716   TESTS(38, 1);
717   TESTS(39, 1);
718 
719 
720   for (i = 1; i < argc; i++) {
721     if ((ret = read_whole_file(argv[i],
722 			       & buf,
723 			       & buf_len))) {
724       return 1;
725     }
726 
727     fprintf(stderr, "file %s is %zu bytes\n",
728 	    argv[i], buf_len);
729 
730     double min_time = -1.0;
731     double min_compression = 0.0;
732 
733     for (vector<test_result_base*>::iterator iter = all_tests.begin();
734 	 iter != all_tests.end(); ++iter) {
735       test_result_base *test = *iter;
736       test->reset();
737 
738       usize_t iters = 1;
739       long start_test = get_millisecs_now();
740 
741       do {
742 	test->get(buf, buf_len, iters);
743 	iters *= 3;
744 	iters /= 2;
745       } while (get_millisecs_now() - start_test < 2000);
746 
747       test->stat();
748 
749       if (min_time < 0.0) {
750 	min_compression = test->compression();
751 	min_time = test->time();
752       }
753 
754       if (min_time > test->time()) {
755 	min_time = test->time();
756       }
757 
758       if (min_compression > test->compression()) {
759 	min_compression = test->compression();
760       }
761 
762       test->print();
763     }
764 
765     main_free(buf);
766     buf = NULL;
767   }
768 
769   return 0;
770 }
771