1 #pragma once
2 
3 #include "Types.h"
4 
5 #include <math.h>
6 #include <vector>
7 #include <map>
8 #include <algorithm>   // for std::sort
9 #include <string.h>    // for memset
10 #include <stdio.h>     // for printf
11 
12 double calcScore ( const int * bins, const int bincount, const int ballcount );
13 
14 void plot ( double n );
15 
ExpectedCollisions(double balls,double bins)16 inline double ExpectedCollisions ( double balls, double bins )
17 {
18   return balls - bins + bins * pow(1 - 1/bins,balls);
19 }
20 
21 double chooseK ( int b, int k );
22 double chooseUpToK ( int n, int k );
23 
24 //-----------------------------------------------------------------------------
25 
f3mix(uint32_t k)26 inline uint32_t f3mix ( uint32_t k )
27 {
28   k ^= k >> 16;
29   k *= 0x85ebca6b;
30   k ^= k >> 13;
31   k *= 0xc2b2ae35;
32   k ^= k >> 16;
33 
34   return k;
35 }
36 
37 //-----------------------------------------------------------------------------
38 // Sort the hash list, count the total number of collisions and return
39 // the first N collisions for further processing
40 
41 template< typename hashtype >
FindCollisions(std::vector<hashtype> & hashes,HashSet<hashtype> & collisions,int maxCollisions)42 int FindCollisions ( std::vector<hashtype> & hashes,
43                      HashSet<hashtype> & collisions,
44                      int maxCollisions )
45 {
46   int collcount = 0;
47 
48   std::sort(hashes.begin(),hashes.end());
49 
50   for(size_t i = 1; i < hashes.size(); i++)
51   {
52     if(hashes[i] == hashes[i-1])
53     {
54       collcount++;
55 
56       if((int)collisions.size() < maxCollisions)
57       {
58         collisions.insert(hashes[i]);
59       }
60     }
61   }
62 
63   return collcount;
64 }
65 
66 //-----------------------------------------------------------------------------
67 
68 template < class keytype, typename hashtype >
PrintCollisions(hashfunc<hashtype> hash,std::vector<keytype> & keys)69 int PrintCollisions ( hashfunc<hashtype> hash, std::vector<keytype> & keys )
70 {
71   int collcount = 0;
72 
73   typedef std::map<hashtype,keytype> htab;
74   htab tab;
75 
76   for(size_t i = 1; i < keys.size(); i++)
77   {
78     keytype & k1 = keys[i];
79 
80     hashtype h = hash(&k1,sizeof(keytype),0);
81 
82     typename htab::iterator it = tab.find(h);
83 
84     if(it != tab.end())
85     {
86       keytype & k2 = (*it).second;
87 
88       printf("A: ");
89       printbits(&k1,sizeof(keytype));
90       printf("B: ");
91       printbits(&k2,sizeof(keytype));
92     }
93     else
94     {
95       tab.insert( std::make_pair(h,k1) );
96     }
97   }
98 
99   return collcount;
100 }
101 
102 //----------------------------------------------------------------------------
103 // Measure the distribution "score" for each possible N-bit span up to 20 bits
104 
105 template< typename hashtype >
TestDistribution(std::vector<hashtype> & hashes,bool drawDiagram)106 double TestDistribution ( std::vector<hashtype> & hashes, bool drawDiagram )
107 {
108   printf("Testing distribution - ");
109 
110   if(drawDiagram) printf("\n");
111 
112   const int hashbits = sizeof(hashtype) * 8;
113 
114   int maxwidth = 20;
115 
116   // We need at least 5 keys per bin to reliably test distribution biases
117   // down to 1%, so don't bother to test sparser distributions than that
118 
119   while(double(hashes.size()) / double(1 << maxwidth) < 5.0)
120   {
121     maxwidth--;
122   }
123 
124   std::vector<int> bins;
125   bins.resize(1 << maxwidth);
126 
127   double worst = 0;
128   int worstStart = -1;
129   int worstWidth = -1;
130 
131   for(int start = 0; start < hashbits; start++)
132   {
133     int width = maxwidth;
134     int bincount = (1 << width);
135 
136     memset(&bins[0],0,sizeof(int)*bincount);
137 
138     for(size_t j = 0; j < hashes.size(); j++)
139     {
140       hashtype & hash = hashes[j];
141 
142       uint32_t index = window(&hash,sizeof(hash),start,width);
143 
144       bins[index]++;
145     }
146 
147     // Test the distribution, then fold the bins in half,
148     // repeat until we're down to 256 bins
149 
150     if(drawDiagram) printf("[");
151 
152     while(bincount >= 256)
153     {
154       double n = calcScore(&bins[0],bincount,(int)hashes.size());
155 
156       if(drawDiagram) plot(n);
157 
158       if(n > worst)
159       {
160         worst = n;
161         worstStart = start;
162         worstWidth = width;
163       }
164 
165       width--;
166       bincount /= 2;
167 
168       if(width < 8) break;
169 
170       for(int i = 0; i < bincount; i++)
171       {
172         bins[i] += bins[i+bincount];
173       }
174     }
175 
176     if(drawDiagram) printf("]\n");
177   }
178 
179   double pct = worst * 100.0;
180 
181   printf("Worst bias is the %3d-bit window at bit %3d - %5.3f%%",worstWidth,worstStart,pct);
182   if(pct >= 1.0) printf(" !!!!! ");
183   printf("\n");
184 
185   return worst;
186 }
187 
188 //----------------------------------------------------------------------------
189 
190 template < typename hashtype >
TestHashList(std::vector<hashtype> & hashes,std::vector<hashtype> & collisions,bool testDist,bool drawDiagram)191 bool TestHashList ( std::vector<hashtype> & hashes, std::vector<hashtype> & collisions, bool testDist, bool drawDiagram )
192 {
193   bool result = true;
194 
195   {
196     size_t count = hashes.size();
197 
198     double expected = (double(count) * double(count-1)) / pow(2.0,double(sizeof(hashtype) * 8 + 1));
199 
200     printf("Testing collisions   - Expected %8.2f, ",expected);
201 
202     double collcount = 0;
203 
204     HashSet<hashtype> collisions;
205 
206     collcount = FindCollisions(hashes,collisions,1000);
207 
208     printf("actual %8.2f (%5.2fx)",collcount, collcount / expected);
209 
210     if(sizeof(hashtype) == sizeof(uint32_t))
211     {
212     // 2x expected collisions = fail
213 
214     // #TODO - collision failure cutoff needs to be expressed as a standard deviation instead
215     // of a scale factor, otherwise we fail erroneously if there are a small expected number
216     // of collisions
217 
218     if(double(collcount) / double(expected) > 2.0)
219     {
220       printf(" !!!!! ");
221       result = false;
222     }
223     }
224     else
225     {
226       // For all hashes larger than 32 bits, _any_ collisions are a failure.
227 
228       if(collcount > 0)
229       {
230         printf(" !!!!! ");
231         result = false;
232       }
233     }
234 
235     printf("\n");
236   }
237 
238   //----------
239 
240   if(testDist)
241   {
242     TestDistribution(hashes,drawDiagram);
243   }
244 
245   return result;
246 }
247 
248 //----------
249 
250 template < typename hashtype >
TestHashList(std::vector<hashtype> & hashes,bool,bool testDist,bool drawDiagram)251 bool TestHashList ( std::vector<hashtype> & hashes, bool /*testColl*/, bool testDist, bool drawDiagram )
252 {
253   std::vector<hashtype> collisions;
254 
255   return TestHashList(hashes,collisions,testDist,drawDiagram);
256 }
257 
258 //-----------------------------------------------------------------------------
259 
260 template < class keytype, typename hashtype >
TestKeyList(hashfunc<hashtype> hash,std::vector<keytype> & keys,bool testColl,bool testDist,bool drawDiagram)261 bool TestKeyList ( hashfunc<hashtype> hash, std::vector<keytype> & keys, bool testColl, bool testDist, bool drawDiagram )
262 {
263   int keycount = (int)keys.size();
264 
265   std::vector<hashtype> hashes;
266 
267   hashes.resize(keycount);
268 
269   printf("Hashing");
270 
271   for(int i = 0; i < keycount; i++)
272   {
273     if(i % (keycount / 10) == 0) printf(".");
274 
275     keytype & k = keys[i];
276 
277     hash(&k,sizeof(k),0,&hashes[i]);
278   }
279 
280   printf("\n");
281 
282   bool result = TestHashList(hashes,testColl,testDist,drawDiagram);
283 
284   printf("\n");
285 
286   return result;
287 }
288 
289 //-----------------------------------------------------------------------------
290 // Bytepair test - generate 16-bit indices from all possible non-overlapping
291 // 8-bit sections of the hash value, check distribution on all of them.
292 
293 // This is a very good test for catching weak intercorrelations between bits -
294 // much harder to pass than the normal distribution test. However, it doesn't
295 // really model the normal usage of hash functions in hash table lookup, so
296 // I'm not sure it's that useful (and hash functions that fail this test but
297 // pass the normal distribution test still work well in practice)
298 
299 template < typename hashtype >
TestDistributionBytepairs(std::vector<hashtype> & hashes,bool drawDiagram)300 double TestDistributionBytepairs ( std::vector<hashtype> & hashes, bool drawDiagram )
301 {
302   const int nbytes = sizeof(hashtype);
303   const int hashbits = nbytes * 8;
304 
305   const int nbins = 65536;
306 
307   std::vector<int> bins(nbins,0);
308 
309   double worst = 0;
310 
311   for(int a = 0; a < hashbits; a++)
312   {
313     if(drawDiagram) if((a % 8 == 0) && (a > 0)) printf("\n");
314 
315     if(drawDiagram) printf("[");
316 
317     for(int b = 0; b < hashbits; b++)
318     {
319       if(drawDiagram) if((b % 8 == 0) && (b > 0)) printf(" ");
320 
321       bins.clear();
322       bins.resize(nbins,0);
323 
324       for(size_t i = 0; i < hashes.size(); i++)
325       {
326         hashtype & hash = hashes[i];
327 
328         uint32_t pa = window(&hash,sizeof(hash),a,8);
329         uint32_t pb = window(&hash,sizeof(hash),b,8);
330 
331         bins[pa | (pb << 8)]++;
332       }
333 
334       double s = calcScore(bins,bins.size(),hashes.size());
335 
336       if(drawDiagram) plot(s);
337 
338       if(s > worst)
339       {
340         worst = s;
341       }
342     }
343 
344     if(drawDiagram) printf("]\n");
345   }
346 
347   return worst;
348 }
349 
350 //-----------------------------------------------------------------------------
351 // Simplified test - only check 64k distributions, and only on byte boundaries
352 
353 template < typename hashtype >
TestDistributionFast(std::vector<hashtype> & hashes,double & dworst,double & davg)354 void TestDistributionFast ( std::vector<hashtype> & hashes, double & dworst, double & davg )
355 {
356   const int hashbits = sizeof(hashtype) * 8;
357   const int nbins = 65536;
358 
359   std::vector<int> bins(nbins,0);
360 
361   dworst = -1.0e90;
362   davg = 0;
363 
364   for(int start = 0; start < hashbits; start += 8)
365   {
366     bins.clear();
367     bins.resize(nbins,0);
368 
369     for(size_t j = 0; j < hashes.size(); j++)
370     {
371       hashtype & hash = hashes[j];
372 
373       uint32_t index = window(&hash,sizeof(hash),start,16);
374 
375       bins[index]++;
376     }
377 
378     double n = calcScore(&bins.front(),(int)bins.size(),(int)hashes.size());
379 
380     davg += n;
381 
382     if(n > dworst) dworst = n;
383   }
384 
385   davg /= double(hashbits/8);
386 }
387 
388 //-----------------------------------------------------------------------------
389