1 // components.cpp              Copyright (C)  A C Norman, October 2015-2021
2 
3 /**************************************************************************
4  * Copyright (C) 2021, Codemist.                         A C Norman       *
5  *                                                                        *
6  * Redistribution and use in source and binary forms, with or without     *
7  * modification, are permitted provided that the following conditions are *
8  * met:                                                                   *
9  *                                                                        *
10  *     * Redistributions of source code must retain the relevant          *
11  *       copyright notice, this list of conditions and the following      *
12  *       disclaimer.                                                      *
13  *     * Redistributions in binary form must reproduce the above          *
14  *       copyright notice, this list of conditions and the following      *
15  *       disclaimer in the documentation and/or other materials provided  *
16  *       with the distribution.                                           *
17  *                                                                        *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS    *
19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT      *
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS      *
21  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE         *
22  * COPYRIGHT OWNERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,   *
23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,   *
24  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS  *
25  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND *
26  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR  *
27  * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF     *
28  * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH   *
29  * DAMAGE.                                                                *
30  *************************************************************************/
31 
32 // $Id: components.cpp 5736 2021-03-16 10:41:22Z arthurcnorman $
33 
34 // The code here is for the offline creation of a hash table that will
35 // follow in the spirit of cuckoo hashing using 1, 2 or 3 probes (but
36 // never more) to retrieve information. It tries a potentially large
37 // number of hash functions and picks one that optimises table occupancy.
38 // I use this for building a hash table that holds metrics for characters,
39 // and (to my amazement) achieve around 99% table occupancy with a worst
40 // case access cost of 3 probes.
41 //
42 // A central part of the process of setting up the hash table involves
43 // finding matchings in bipartite graphs. The Hopcroft-Karp algorithm is
44 // used for this.
45 //
46 // This version uses threads to do some of its searching in parallel, and
47 // should work on Linux, Windows and OS/X, checking for the number of
48 // CPUs and using them all.
49 
50 #include <cstdio>
51 #include <cstdlib>
52 #include <cstdint>
53 #include <cinttypes>
54 #include <cstring>
55 #include <cstdarg>
56 #include <cerrno>
57 #include <cmath>
58 
59 #ifdef WIN32
60 #include "winsupport.h"
61 #else
62 #include <pthread.h>
63 #include <unistd.h>
64 #endif
65 
66 #include "cuckoo.h"
67 
68 #include "memorysize.cpp"
69 
70 // I now try to encapsulate the variations between thread support across
71 // Windows and the rest of the world...
72 // The macros here are not very syntactically safe, but I will be using
73 // them in very stylised manners so I am not too worried.
74 
75 static __thread int threadnumber = -1;
76 
77 #ifdef WIN32
78 
79 CRITICAL_SECTION critical_section, logmutex;
80 
number_of_processors()81 static int number_of_processors()
82 {   SYSTEM_INFO si;
83     GetSystemInfo(&si);
84     return static_cast<int>(si.dwNumberOfProcessors);
85 }
86 
87 // In very old versions of Windows one needed to use _beingthreadx rather
88 // than CreateThread here, but that is no longer the case (as best I can
89 // discern!).
90 
91 // The function that implements a thread will return a value THREAD_RESULT
92 // of type THREAD_RESULT_T.
93 
94 #define THREAD_RESULT_T DWORD __stdcall
95 #define THREAD_RESULT   0
96 
97 // I will have a limit on the maximum number of threads I will ever
98 // create - and at present I am not expecting to find that I have
99 // a shared-memory cpu with more than 32 CPUs.
100 
101 #define MAX_CPU_COUNT 32
102 static HANDLE thread[MAX_CPU_COUNT];
103 
104 #define CREATETHREAD_FAILED(i, p) \
105     (thread[i] = CreateThread(nullptr, 0, &p, \
106         reinterpret_cast<void *>(reinterpret_cast<std::intptr_t>(i)), 0, nullptr)) == nullptr
107 // On Windows I will use a time-limited wait for my first worker
108 // thread to terminate as my "sleep" operation - that means that as soon
109 // as that thread terminates I wake up... and I should find the
110 // thread_finished variable has been set.
111 #define SLEEP if (WaitForSingleObject(thread[0], 5000) == WAIT_OBJECT_0) \
112     {   CloseHandle(thread[0]); \
113         thread[0] = nullptr; \
114     }
115 #define JOINTHREAD_FAILED(i) \
116     (thread[i] != nullptr && \
117      (WaitForSingleObject(thread[i], INFINITE) != WAIT_OBJECT_0 || \
118       CloseHandle(thread[i]) == 0))
119 
120 #else // Now for the pthreads version for Linux, Unix, BSD, OS/X etc.
121 
122 pthread_mutex_t mutex     = PTHREAD_MUTEX_INITIALIZER,
123                 logmutex  = PTHREAD_MUTEX_INITIALIZER,
124                 condmutex = PTHREAD_MUTEX_INITIALIZER;
125 pthread_cond_t  cond      = PTHREAD_COND_INITIALIZER;
126 
127 #ifdef _SC_NPROCESSORS_ONLN
128 // _SC_NPROCESSORS_ONLN is not a proper POSIX enquiry to send to sysconf,
129 // but it is available on Linux, cygwin and OS/X and so is worth using here,
130 // perhaps especially since an #ifdef can easily check for it. I use ONLN
131 // not CONF because some processors may have many cores but leave some of
132 // then disabled - I see reports that in particular big/little ARM chips
133 // do this. Note that for hyperthreaded CPUs this returns the number of
134 // threads I can run in parallel, not the number of raw cores. I also ignore
135 // issues of load and permit my code to use all available resources.
136 
number_of_processors()137 static int number_of_processors()
138 {   return sysconf(_SC_NPROCESSORS_ONLN);
139 }
140 
141 #else // sysconf option to check CPU count
142 
number_of_processors()143 static int number_of_processors()
144 {   std::printf("_SC_NPROCESSORS_CONF not defined\n");
145     return 1;
146 }
147 
148 #endif // sysconf option to check CPU count
149 
150 #define THREAD_RESULT_T void *
151 #define THREAD_RESULT   nullptr
152 
153 #define MAX_CPU_COUNT 32
154 static pthread_t thread[MAX_CPU_COUNT];
155 
156 #define CREATETHREAD_FAILED(i, p) \
157     pthread_create(&thread[i], nullptr, &p, reinterpret_cast<void *>(reinterpret_cast<std::intptr_t>(i)))
158 #define SLEEP \
159     {   struct std::timespec ts; \
160         clock_gettime(CLOCK_REALTIME, &ts); \
161         ts.tv_sec += 5; \
162         pthread_cond_timedwait(&cond, &condmutex, &ts); \
163     }
164 #define JOINTHREAD_FAILED(i) \
165     pthread_join(thread[i], nullptr)
166 
167 #endif // Windows vs pthreads.
168 
169 
170 // logging in a way that is thread-safe
171 
172 #define DEBUG (0)
173 
printlog(const char * s,...)174 void printlog(const char *s, ...)
175 {   std::va_list a;
176     LOCKLOGMUTEX;
177     va_start(a, s);
178 #ifndef SEQUENTIAL
179     std::fprintf(stdout, "{%d:", threadnumber);
180 #endif
181     std::vfprintf(stdout, s, a);
182 #ifndef SEQUENTIAL
183     std::fprintf(stdout, ":%d}", threadnumber);
184 #endif
185     std::fflush(stdout);
186     va_end(a);
187     UNLOCKLOGMUTEX;
188 }
189 
190 
191 // See workfarm.c for a small program illustrating use of threads to attack
192 // a problem exploiting concurrency.
193 
194 // Because the table was passed as a "void *" and we do not have
195 // a single type that maps the entries it is not feasible to use direct
196 // array access (as in "table[n]") to retrieve and update items in it.
197 // So here I have two macros that use the explicitly passed item size
198 // and accress arithmetic to create a pointer to the nth item in the
199 // table, and then the user-supplied functions to deal with just the
200 // part of the thing there that represents the key. These are macros
201 // not functions because they use a number of values not explicitly passed
202 // to them as arguments.
203 
204 // I am expecting that insertion in these tables will not be done at
205 // all often, so having a somewhat clumsy-looking interface here does
206 // not worry me too much, and I will even accept code that is not
207 // terribly neat!
208 
209 
210 // I use the Hopcroft-Karp maximum matching algorithm to determine if there
211 // is any way of using a particular size of hash table. Once I know how large
212 // a hash table to use I can use the Hungarian algorithm to find an assignment
213 // to favours placing keys in positions that minimise the average number
214 // of probes that lookup will need.
215 
216 // The Hopcroft-Karp code I use is imported, but had been been placed under
217 // very flexible license terms. See the top of the source code for details.
218 
219 #include "hopkar.cpp"
220 
221 // Now an adapter that takes my hashing problem and maps it onto the
222 // an instance of a matching problem. The input is a set of keys, a table
223 // size and two parameters that control details of the hashing. It then has
224 // the hash table it is tryng to create, which it fills in if it can find
225 // a perfect way to do so. Here the table is whatevver final native-shape
226 // hash table will be required, and the procedures get_key and set_key
227 // access the keys in it.
228 
229 // This tries to insert all the keys in "items" into the hash-table "table"
230 // and it returns 0 if it fails. Because it is just finding a matching (ANY
231 // matching) it will be useful for discovering what the smallest possible
232 // table size is. But it will not take any steps at all to pick a matching
233 // that prefers first choice placements for keys over the other two
234 // possibilities. For that see the Hungarian algorith below which will be
235 // able to perform further optimisation but which is expected to be slower.
236 
237 #define LARGEST_MATCHING 20000
238 
239 static __thread int u_used[LARGEST_MATCHING+1];
240 static __thread int QQ[9*LARGEST_MATCHING+1];
241 static __thread int u_edge1[LARGEST_MATCHING];
242 static __thread int u_edge2[LARGEST_MATCHING];
243 static __thread int u_edge3[LARGEST_MATCHING];
244 static __thread int v_used[LARGEST_MATCHING];
245 static __thread int v_edgemap[LARGEST_MATCHING];
246 static __thread int v_edgecount[LARGEST_MATCHING];
247 static __thread int vu_edges[3*LARGEST_MATCHING];
248 
249 
find_any_assignment(std::uint32_t * items,int item_count,cuckoo_importance * importance,void * table,int hash_item_size,int table_size,cuckoo_get_key * get_key,cuckoo_set_key * set_key,std::uint32_t modulus2,std::uint32_t offset2,int noisy)250 int find_any_assignment(
251     std::uint32_t *items,
252     int item_count,
253     cuckoo_importance *importance,
254     void *table,
255     int hash_item_size,
256     int table_size,
257     cuckoo_get_key *get_key,
258     cuckoo_set_key *set_key,
259     std::uint32_t modulus2,
260     std::uint32_t offset2,
261     int noisy)
262 {   int i, j, k, l, Qin, Qout;
263     int ucount, vcount, uchain;
264     int uremaining = item_count, vremaining = table_size;
265     if (DEBUG)
266         printlog("find_any_assignment %d items in table of size %d\n",
267                  item_count, table_size);
268     if (DEBUG) printlog("modulus2 = %d offset2 = %d\n", modulus2,
269                             offset2);
270 // First I will construct a representation of the sparse graph;
271     for (i=0; i<item_count; i++)
272     {   u_edge1[i] = u_edge2[i] = u_edge3[i] = -1;
273         u_used[i] = -1;
274     }
275     u_used[item_count] = -1; // a sentinel
276     for (i=0; i<table_size; i++)
277     {   v_edgemap[i] = v_edgecount[i] = 0;
278         v_used[i] = -1;
279     }
280     for (i=0; i<item_count; i++)
281     {   std::uint32_t key = items[i];
282         std::uint32_t h1 = key % table_size;
283         std::uint32_t h2 = key % modulus2 + offset2;
284         std::uint32_t h3 = (h1 + h2) % table_size;
285         int imp = (*importance)(key);
286 // Get rid of double edges... if an entry in the table is "-1" that
287 // means "no edge here".
288         if (h3 == h2 || h3 == h1) h3 = -1;
289         if (h2 == h1)
290         {   h2 = h3;
291             h3 = -1;
292         }
293         switch (imp)
294     {       default:
295                 u_edge3[i] = static_cast<int>(h3);
296                 if (h3 != -1) v_edgecount[h3]++; // count edges out from h3
297             // drop through
298             case CUCKOO_IMPORTANT:
299                 u_edge2[i] = static_cast<int>(h2);
300                 if (h2 != -1) v_edgecount[h2]++;
301             // drop through
302             case CUCKOO_VITAL:
303                 u_edge1[i] = static_cast<int>(h1);
304                 if (h1 != -1) v_edgecount[h1]++;
305                 break;
306         }
307         if (DEBUG) printlog("%4d: (%5d)  %5d %5d %5d\n", i, items[i],
308                                 u_edge1[i], u_edge2[i], u_edge3[i]);
309     }
310 // I have counted how many edges are incident at each vertex in set V so
311 // I can now put in pointers to the general table of edges V to U.
312     j = 0;
313     for (i=0; i<table_size; i++)
314     {   v_edgemap[i] = j;
315         if (DEBUG)
316             printlog("table entry %d is pointed at by %d items at %d\n",
317                      i, v_edgecount[i], v_edgemap[i]);
318         j += v_edgecount[i];
319     }
320 // Fill in the edges.
321     for (i=0; i<item_count; i++)
322     {   if ((j = u_edge1[i]) >= 0) vu_edges[v_edgemap[j]++] = i;
323         if ((j = u_edge2[i]) >= 0) vu_edges[v_edgemap[j]++] = i;
324         if ((j = u_edge3[i]) >= 0) vu_edges[v_edgemap[j]++] = i;
325     }
326 // Restore the pointers to where they are.
327     j = 0;
328     for (i=0; i<table_size; i++)
329     {   v_edgemap[i] = j;
330         if (DEBUG) printlog("V[%d] ->", i);
331         if (DEBUG) for (k=0; k<v_edgecount[i];
332                             k++) printlog(" %d", vu_edges[j+k]);
333         if (DEBUG) printlog("\n");
334         j += v_edgecount[i];
335     }
336     i = 0;
337     while (i<item_count)
338     {
339 // Here I want to collect all the vertexes reachable from U[i]. I will
340 // need to know how many there are in U and how many in V, and only when I
341 // have got those counts can I try using Hopcroft-Karp to process them.
342 // I will chain up the locations in U that I find.
343         ucount = 0;
344         vcount = 0;
345         uchain = -2;   // a terminator for the chain.
346         Qin = Qout = 0;
347         QQ[Qin++] = i;
348         if (DEBUG)
349             printlog("Seed search for component with %d (Qin=%d Qout=%d)\n", i,
350                      Qin, Qout);
351         while (Qin != Qout)
352         {   j = QQ[Qout++];
353             if (u_used[j] != -1)
354             {   if (DEBUG) printlog("U[%d] already processed\n", j);
355                 continue; // already processed
356             }
357             if (DEBUG) printlog("spread from U[%d], chain to %d\n", j, uchain);
358 // Record U[j] as having been used.
359             u_used[j] = uchain;
360             uchain = j;
361             ucount++;
362             if (DEBUG) printlog("chain up U. Now have %d vertices in A\n",
363                                     ucount);
364 // Now try visiting each vertex in V that is joined to j...
365             if ((k = u_edge1[j]) != -1 && v_used[k] == -1)
366             {   v_used[k] = vcount++;
367                 if (DEBUG) printlog("visit V[%d] for first time and name it %d\n",
368                                         k, v_used[k]);
369                 for (l=0; l<v_edgecount[k]; l++)
370                 {   int m = vu_edges[v_edgemap[k] + l];
371                     if (DEBUG) printlog("edge goes back to %d\n", m);
372                     if (u_used[m] == -1) QQ[Qin++] = m;
373                 }
374             }
375             if ((k = u_edge2[j]) != -1 && v_used[k] == -1)
376             {   v_used[k] = vcount++;
377                 if (DEBUG) printlog("visit V[%d] for first time and name it %d\n",
378                                         k, v_used[k]);
379                 for (l=0; l<v_edgecount[k]; l++)
380                 {   int m = vu_edges[v_edgemap[k] + l];
381                     if (DEBUG) printlog("edge goes back to %d\n", m);
382                     if (u_used[m] == -1) QQ[Qin++] = m;
383                 }
384             }
385             if ((k = u_edge3[j]) != -1 && v_used[k] == -1)
386             {   v_used[k] = vcount++;
387                 if (DEBUG) printlog("visit V[%d] for first time and name it %d\n",
388                                         k, v_used[k]);
389                 for (l=0; l<v_edgecount[k]; l++)
390                 {   int m = vu_edges[v_edgemap[k] + l];
391                     if (DEBUG) printlog("edge goes back to %d\n", m);
392                     if (u_used[m] == -1) QQ[Qin++] = m;
393                 }
394             }
395             if (DEBUG) printlog("Now Qin=%d Qout=%d\n", Qin, Qout);
396         }
397         if (DEBUG) printlog("Component found with size %d by %d\n", ucount,
398                                 vcount);
399 // If there are too many items in set U in this component then report
400 // failure. I could also report failure if the number of vertices left over
401 // were then out of balance.
402         uremaining -= ucount;
403         vremaining -= vcount;
404         if (ucount > vcount ||
405             uremaining > vremaining)
406         {   if (DEBUG) printlog("Early exit\n");
407             return 0;
408         }
409 // If I have a string component with only 1 or 2 vertices in set B then
410 // there has to be a complete matching. If the component is larger I will
411 // use Hopcroft-Karp to assess it.
412         else if (vcount >= 3)
413         {   init(ucount, vcount);
414             i = 0;
415             while (uchain != -2)
416             {   if (DEBUG) printlog("i = %d uchain = %d\n", i, uchain);
417                 if ((j = u_edge1[uchain]) != -1) addEdge(i, v_used[j]);
418                 if ((j = u_edge2[uchain]) != -1) addEdge(i, v_used[j]);
419                 if ((j = u_edge3[uchain]) != -1) addEdge(i, v_used[j]);
420                 uchain = u_used[uchain];
421                 if (i++ > ucount)
422                 {   printlog("uchain too long\n");
423                     std::exit(1);
424                 }
425             }
426             if (maxMatching() != ucount) return 0;
427         }
428 // I have put a -1 just after the end of the useful data so that the
429 // loop here that skips to the next un-visited vertex is certain to
430 // terminate nicely for me.
431         while (u_used[i] != -1) i++;
432     }
433     return 1;
434 }
435 
436 // Once a table has been set up this function checks that each key is properly
437 // present and it computes a figure of merit for it... This code may in
438 // fact not be used!
439 
cuckoo_merit(std::uint32_t * items,int item_count,cuckoo_importance * importance,void * table,int hash_item_size,int table_size,cuckoo_get_key * get_key,std::uint32_t modulus2,std::uint32_t offset2)440 double cuckoo_merit(
441     std::uint32_t *items,
442     int item_count,
443     cuckoo_importance *importance,
444     void *table,
445     int hash_item_size,
446     int table_size,
447     cuckoo_get_key *get_key,
448     std::uint32_t modulus2,
449     std::uint32_t offset2)
450 {   int i;
451     int nvital = 0;
452     int nimportant = 0, nimp1 = 0, nimp2 = 0;
453     int nstandard = 0, nstand1 = 0, nstand2 = 0, nstand3 = 0;
454     for (i=0; i<item_count; i++)
455     {   std::uint32_t key = items[i];
456         std::uint32_t h1 = key % table_size;
457         std::uint32_t h2 = key % modulus2 + offset2;
458         std::uint32_t h3 = (h1 + h2) % table_size;
459         int imp = (*importance)(key);
460         switch (imp)
461     {       default:
462                 nstandard++;
463                 if (key == (*get_key)(h1*hash_item_size+reinterpret_cast<char *>
464                                       (table)))
465                 {   nstand1++;
466                     break;
467                 }
468                 else if (key == (*get_key)(h2*hash_item_size+reinterpret_cast<char *>
469                                            (table)))
470                 {   nstand2++;
471                     break;
472                 }
473                 else if (key == (*get_key)(h3*hash_item_size+reinterpret_cast<char *>
474                                            (table)))
475                 {   nstand3++;
476                     break;
477                 }
478                 else
479                 {   printlog("Failed to find item %d (%u/%x)\n", i, key, key);
480                     std::exit(1);
481                 }
482             case CUCKOO_IMPORTANT:
483                 nimportant++;
484                 if (key == (*get_key)(h1*hash_item_size+reinterpret_cast<char *>
485                                       (table)))
486                 {   nimp1++;
487                     break;
488                 }
489                 else if (key == (*get_key)(h2*hash_item_size+reinterpret_cast<char *>
490                                            (table)))
491                 {   nimp2++;
492                     break;
493                 }
494                 else
495                 {   printlog("Failed to find item %d (%u/%x)\n", i, key, key);
496                     std::exit(1);
497                 }
498             case CUCKOO_VITAL:
499                 nvital++;
500                 if (key == (*get_key)(h1*hash_item_size+reinterpret_cast<char *>
501                                       (table)))
502                     break;
503                 else
504                 {   printlog("Failed to find item %d (%u/%x)\n", i, key, key);
505                     std::exit(1);
506                 }
507         }
508     }
509 // The "figure of merit" is a weighted average that scores each VITAL key
510 // as worth 10, each IMPORTANT ones as 4 and the rest as worth 1 each. Smaller
511 // values are better.
512     return (10.0*nvital +
513             4.0*(nimp1 + 2.0*nimp2) +
514             (nstand1 + 2.0*nstand2 + 3.0*nstand3)) /
515            (10.0*nvital + 4.0*nimportant + nstandard);
516 }
517 
518 // As well as being able to find assignments I need to be able to find the
519 // BEST assignments, and the following imported code that implements the
520 // Hungarian algorith sorts that out. It uses a quite different interface
521 // from the Hopcroft-Karp code - which is reasonable since its source was
522 // different. I then need to write code to link it to the problems that I
523 // wish to solve.
524 
525 // The code I have imported deals with general matchings and represents the
526 // bipartite graph as a full matrix. In my case the graph that I wish to
527 // process is sparse - it has at most three edges for each vertex in the
528 // set A. I am reworking the code to use adjcacency lists to
529 // represent the sparse array. Predefine SPARSE to try that version.
530 
531 #ifdef SPARSE
532 #include "hunsparse.cpp"
533 #else
534 #include "hungarian.cpp"
535 #endif
536 
537 // Now an adapter that takes my hashing problem and maps it onto the
538 // calls to the hungarian problem solver. The input is a set of keys,
539 // a table size and two parameters that control details of the hashing.
540 // It then has the hash table it is trying to create, which it fills in
541 // if it can find a perfect way to do so. For the code here the hash table
542 // is merely an array of uint32_t values.
543 //
544 // This tries to insert all the keys in "items" into the hash-table "table"
545 // and it returns a positive "merit" value on success, or -1.0 on failure
546 //
547 
548 static __thread int **adjacency_matrix = nullptr;
549 
find_best_assignment(std::uint32_t * items,int item_count,cuckoo_importance * importance,std::uint32_t * table,int table_size,std::uint32_t modulus2,std::uint32_t offset2)550 double find_best_assignment(
551     std::uint32_t *items,
552     int item_count,
553     cuckoo_importance *importance,
554     std::uint32_t *table,
555     int table_size,
556     std::uint32_t modulus2,
557     std::uint32_t offset2)
558 {   int i, j;
559     int *mem;
560     hungarian_problem_t p;
561     printlog("starting find_best_assignment %d %d\n", item_count,
562              table_size);
563     printlog("m2=%d o2=%d\n", modulus2, offset2);
564     adjacency_matrix = new int *[table_size + table_size*item_count + 2];
565     hungarian_test_alloc(adjacency_matrix);
566     mem = reinterpret_cast<int *>(&adjacency_matrix[table_size]);
567     for (i=0; i<table_size; i++)
568     {   adjacency_matrix[i] = mem;
569         mem += item_count;
570 // I put in a value that is not "infinite" but is such that any assignment
571 // that uses an edge of this length will be pretty bad.
572         for (j=0; j<item_count; j++)
573             adjacency_matrix[i][j] = 0x00010000;
574     }
575     for (i=0; i<item_count; i++)
576     {   std::uint32_t key = items[i];
577         std::uint32_t h1 = key % table_size;
578         std::uint32_t h2 = key % modulus2 + offset2;
579         std::uint32_t h3 = (h1 + h2) % table_size;
580         int imp = (*importance)(key);
581 // I have thought a bit about the weights I use here, but to a large extent
582 // they are a bit of an arbitrary choice.
583         switch (imp)
584     {       default:
585                 adjacency_matrix[h1][i] = 0;
586                 adjacency_matrix[h2][i] = 1;
587                 adjacency_matrix[h3][i] = 2;
588                 break;
589             case CUCKOO_IMPORTANT:
590                 adjacency_matrix[h1][i] = 0;
591                 adjacency_matrix[h2][i] = 10;
592                 break;
593             case CUCKOO_VITAL:
594                 adjacency_matrix[h1][i] = 0;
595                 break;
596         }
597     }
598     hungarian_init(&p,
599                    adjacency_matrix,
600                    table_size,
601                    item_count,
602                    HUNGARIAN_MODE_MINIMIZE_COST);
603     printlog("call hungarian_solve\n");
604     hungarian_solve(&p);
605     printlog("hungarian_solve returned\n");
606 // Next I will extract the assignment found and put it in my hash table.
607 // While I am about it I will collect some statistics.
608     int nvital = 0;
609     int nimportant = 0, nimportant1 = 0, nimportant2 = 0;
610     int nstandard = 0, nstandard1 = 0, nstandard2 = 0, nstandard3 = 0;
611     for (i=0; i<item_count; i++)
612     {   std::uint32_t key = items[i];
613         std::uint32_t h1 = key % table_size;
614         std::uint32_t h2 = key % modulus2 + offset2;
615         std::uint32_t h3 = (h1 + h2) % table_size;
616         int imp = (*importance)(key);
617         switch (imp)
618         {   case CUCKOO_VITAL:
619                 if (p.assignment[h1][i])
620                 {   table[h1] = key;
621                     nvital++;
622                     break;
623                 }
624                 else
625                 {   hungarian_free(&p);
626                     std::free(adjacency_matrix);
627                     return -1.0;
628                 }
629             case CUCKOO_IMPORTANT:
630                 if (p.assignment[h1][i])
631                 {   table[h1] = key;
632                     nimportant++;
633                     nimportant1++;
634                     break;
635                 }
636                 else if (p.assignment[h2][i])
637                 {   table[h2] = key;
638                     nimportant++;
639                     nimportant2++;
640                     break;
641                 }
642                 else
643                 {   hungarian_free(&p);
644                     std::free(adjacency_matrix);
645                     return -1.0;
646                 }
647             case CUCKOO_STANDARD:
648                 if (p.assignment[h1][i])
649                 {   table[h1] = key;
650                     nstandard++;
651                     nstandard1++;
652                     break;
653                 }
654                 else if (p.assignment[h2][i])
655                 {   table[h2] = key;
656                     nstandard++;
657                     nstandard2++;
658                     break;
659                 }
660                 else if (p.assignment[h3][i])
661                 {   table[h3] = key;
662                     nstandard++;
663                     nstandard3++;
664                     break;
665                 }
666                 else
667                 {   hungarian_free(&p);
668                     std::free(adjacency_matrix);
669                     return -1.0;
670                 }
671         }
672     }
673     printlog("transferred to hash table\n");
674     double merit = (10.0*nvital +
675                     4.0*(nimportant1 + 2.0*nimportant2) +
676                     (nstandard1 + 2.0*nstandard2 + 3.0*nstandard3)) /
677                    (10.0*nvital + 4.0*nimportant + nstandard);
678 #define VERBOSE
679 #ifdef VERBOSE
680     double avimportant =
681         (nimportant1+2.0*nimportant2)/nimportant;
682     double avstandard =
683         (nstandard1+2.0*nstandard2+3.0*nstandard3)/nstandard;
684 // But I will also compute an average that shows the expected number of
685 // probes if all keys are accessed with equal probability.
686     double average =
687         (nvital + nimportant1 + 2.0*nimportant2 +
688          nstandard1 + 2.0*nstandard2 + 3.0*nstandard3) / item_count;
689     {   printlog("\ntable_size = %u modulus2 = %u offset2 = %u occupancy %.2f\n",
690                  table_size, modulus2, offset2,
691                  (100.0*item_count)/table_size);
692         if (nvital != 0)
693             printlog("VITAL:     %u      1.0 average probes\n", nvital);
694         if (nimportant1!=0 || nimportant2 != 0)
695             printlog("IMPORTANT: %u %u   %.3f average probes\n",
696                      nimportant1, nimportant2, avimportant);
697         if (nstandard1!=0 || nstandard2 != 0)
698             printlog("STANDARD:  %u %u %u  %.3f average probes\n",
699                      nstandard1, nstandard2, nstandard3, avstandard);
700         printlog("Figure of merit = %.4f flat average = %.3f\n", merit,
701                  average);
702     }
703 #endif
704     hungarian_free(&p);
705     std::free(adjacency_matrix);
706     return merit;
707 }
708 
int32_get_key(void * p)709 std::uint32_t int32_get_key(void *p)
710 {   return *reinterpret_cast<std::uint32_t *>(p);
711 }
712 
int32_set_key(void * p,std::uint32_t v)713 void int32_set_key(void *p, std::uint32_t v)
714 {   *reinterpret_cast<std::uint32_t *>(p) = v;
715 }
716 
717 // To perform a parallel search I need to be able to distribute cases
718 // that need analysis to the various threads. A test case is a pair
719 // of a value for modulus2 and one for offset2.
720 
721 typedef struct __mod_and_off
722 {   std::uint32_t modulus2;
723     std::uint32_t offset2;
724     int noisy;
725 } mod_and_off;
726 
727 
728 // I have some data that is common to all the threads....
729 
730 static int       shared_use_hungarian;
731 static std::uint32_t *shared_keys;
732 static int       shared_key_count;
733 static std::uint32_t *shared_table;
734 static int       shared_table_size;
735 static cuckoo_importance *shared_importance;
736 static std::uint32_t  shared_modulus2;
737 static std::uint32_t  shared_offset2;
738 static std::uint32_t  best_modulus2;
739 static std::uint32_t  best_offset2;
740 static double    best_merit;
741 static int       tries;
742 static int       successes;
743 
744 // This should return the next mod_and_off, or one with values (0,0)
745 // when there are no more left.
746 
hungarian_next_case()747 mod_and_off hungarian_next_case()
748 {   mod_and_off r;
749     LOCKMUTEX;
750     tries++;
751     r.modulus2 = shared_modulus2;
752     r.offset2 = shared_offset2;
753     if (shared_modulus2 != 0)
754     {   if (shared_modulus2 + shared_offset2 < shared_table_size)
755             shared_offset2++;
756         else
757         {   shared_modulus2++;
758             shared_offset2 = 0;
759             if (shared_modulus2 >= shared_table_size) shared_modulus2 = 0;
760         }
761     }
762     if (tries % 1000 == 0) r.noisy = 1;
763     else r.noisy = 0;
764     UNLOCKMUTEX;
765     return r;
766 }
767 
768 // Here is a function that implements what a thread should do. I have just one
769 // variable called thread_finished and if a thread is ending it will set it
770 // to 1. I will not worry about locks since this only needs to notice if at
771 // least one thread is done.
772 //
773 // If shared_use_hungarian is true this considers ALL the possible values for
774 // modulus2 and offset2 and constructs a proper hash-table allocation for the
775 // configuration that delivers the best figure of merit. Otherwise it merely
776 // seeks a valid assigment and arranges that all threads stop once one of
777 // them has found one. It does not return a figure of merit and does not
778 // fill in a detailed hash assignment (even though it could).
779 
780 static int thread_finished;
781 
hungarian_thread(void * arg)782 THREAD_RESULT_T hungarian_thread(void *arg)
783 {   mod_and_off mo;
784     std::uint32_t *table = reinterpret_cast<std::uint32_t *>(std)::calloc(
785                                shared_table_size, sizeof(std::uint32_t));
786     threadnumber = static_cast<int>(reinterpret_cast<std::intptr_t>(arg));
787 //  printlog("Thread %d starting\n", threadnumber);
788     while ((mo = hungarian_next_case()).modulus2 != 0)
789     {   double merit;
790 // Before I try the (possibly expensive) Hungarian method I will filter
791 // by using Hopcroft-Karp so that I at least know that there is some feasible
792 // allocation.
793         if (find_any_assignment(
794                 shared_keys,
795                 shared_key_count,
796                 shared_importance,
797                 table,
798                 sizeof(std::uint32_t),
799                 shared_table_size,
800                 int32_get_key,
801                 int32_set_key,
802                 mo.modulus2,
803                 mo.offset2,
804                 mo.noisy))
805         {   printlog("Found assignment for m2=%d : o2=%d for table size %d\n",
806                      mo.modulus2, mo.offset2, shared_table_size);
807             if (!shared_use_hungarian)
808             {   // printlog("Report back to owner and set quit flag\n");
809                 LOCKMUTEX;
810                 successes++;
811                 best_modulus2 = mo.modulus2;
812                 best_offset2 = mo.offset2;
813                 best_merit = 0.0;
814 // Arrange that the next use of hungarian_next_case reports no more cases
815 // need investigation, and so all other threads will soon terminate too.
816                 shared_modulus2 = 0;
817                 shared_offset2 = 0;
818                 UNLOCKMUTEX;
819                 break;
820             }
821             else if ((merit = find_best_assignment(
822                                   shared_keys,
823                                   shared_key_count,
824                                   shared_importance,
825                                   table,
826                                   shared_table_size,
827                                   mo.modulus2,
828                                   mo.offset2)) >= 0.0)
829             {   LOCKMUTEX;
830                 successes++;
831                 printlog("success count=%d: M=%d O=%d => %.4f\n", successes,
832                          mo.modulus2, mo.offset2, merit);
833                 if (merit < best_merit)
834                 {   int i;
835                     printlog("+++ New best\n");
836                     best_modulus2 = mo.modulus2;
837                     best_offset2 = mo.offset2;
838                     best_merit = merit;
839 // Each time I get a new best assignment I transcribe it into the main
840 // official hash-table.
841                     for (i=0; i<shared_table_size; i++)
842                         shared_table[i] = table[i];
843                 }
844                 UNLOCKMUTEX;
845             }
846         }
847     }
848     std::free(table);
849     printlog("Thread finishing\n");
850     LOCKMUTEX;
851     printlog("Thread finishing\n");
852     if (!thread_finished)
853     {
854 //@     printlog("This is the first thread to finish: lock condmutex\n");
855 #ifndef WIN32
856 // The first thread to finish will signal the main one to that effect.
857         pthread_mutex_lock(&condmutex);
858         thread_finished = 1;
859 //@     printlog("First thread to finish: broadcast\n");
860         pthread_cond_broadcast(&cond);
861 //@     printlog("First thread to finish: unlock\n");
862         pthread_mutex_unlock(&condmutex);
863 //@     printlog("First thread to finish: done\n");
864 #else
865         thread_finished = 1;
866 #endif
867     }
868     UNLOCKMUTEX;
869     return THREAD_RESULT;
870 }
871 
872 // This routine is used to analyse tables of some specified size. If its
873 // last argument is false it merely checks if an assignment to a table
874 // of that size is possible, and returns if it manages to find one. If
875 // on the other hand the final argument is true then it will run the
876 // Hungarian algorithm on ALL possible hash functions at that size and
877 // return information about the one that gives the best packing. It
878 // does all this using multiple threads so that the often-expensive
879 // search takes less real time than might otherwise have been the case
880 // when you have a multi-core computer.
881 //
882 
try_all_hash_functions(std::uint32_t * keys,int key_count,cuckoo_importance * importance,void * hash,int hash_item_size,int hashsize,cuckoo_set_key * set_key,int use_hungarian)883 cuckoo_parameters try_all_hash_functions(
884     std::uint32_t *keys,
885     int key_count,
886     cuckoo_importance *importance,
887     void *hash,
888     int hash_item_size,
889     int hashsize,
890     cuckoo_set_key *set_key,
891     int use_hungarian)
892 {   int cpu_count, i;
893     cuckoo_parameters r;
894     std::uint32_t *table = reinterpret_cast<std::uint32_t *>(std)::calloc(
895                                hashsize, sizeof(std::uint32_t));
896     std::uint64_t memsize;
897     hungarian_test_alloc(table);
898     shared_use_hungarian = use_hungarian;
899     shared_keys = keys;
900     shared_key_count = key_count;
901     shared_importance = importance;
902     shared_table = table;
903     shared_table_size = hashsize;
904     shared_modulus2 = 1;
905     if (hashsize > 7000) shared_modulus2 =
906             4500; // Start some way in (@@@)
907     shared_offset2 = 0;
908     best_modulus2 = 0;
909     best_offset2 = 0;
910     best_merit = 4.0;
911     tries = successes = 0;
912 
913     printlog("try_all with table size %d and use_h=%d\n",
914              hashsize, use_hungarian);
915 
916     cpu_count = number_of_processors();
917     printlog("About to try to read memory size\n");
918     memsize = static_cast<std::uint64_t>(getMemorySize());
919     printlog("Have %.2fGiB memory\n",
920              static_cast<double>(memsize)/(1024.0*1024.0*1024.0));
921 #ifdef SEQUENTIAL
922     cpu_count = 1;
923 #endif
924     if (memsize <= 8LU*1024L*1024L*1024L && cpu_count > 6) cpu_count = 6;
925     if (memsize <= 4LU*1024L*1024L*1024L && cpu_count > 3) cpu_count = 3;
926     if (memsize <= 2LU*1024L*1024L*1024L && cpu_count > 1) cpu_count = 1;
927     printlog("I will use %d processors\n", cpu_count);
928     if (cpu_count > MAX_CPU_COUNT) cpu_count = MAX_CPU_COUNT;
929     cpu_count = 1;
930     thread_finished = 0;
931     for (i=0; i<cpu_count; i++)
932     {   if (CREATETHREAD_FAILED(i, hungarian_thread))
933         {   std::fprintf(stderr, "Unable to create thread\n");
934             std::exit(1);
935         }
936     }
937 //  printlog("All threads created\n");
938 // I will do a wait loop here that arranges that once every 5 seconds
939 // it checks whether at least one thread has finished and displays a
940 // message that tries to give some idea of how far it has got through the
941 // search. My ESTIMATE is that for a full search the code may run for
942 // around 9 hours on an 8-core machine and so there would be of the order
943 // of 6000 lines of trace output - rather a lot but not enough to be
944 // a complete embarassment. Perhaps.
945     while (!thread_finished)
946     {   double ww;
947         SLEEP;
948         LOCKMUTEX;
949         ww = static_cast<double>(shared_table_size - shared_modulus2);
950         printlog("%d:%d => %.4f now at m2=%d:o2=%d %.2f%% (limit m2=%d)\n",
951                  best_modulus2, best_offset2, best_merit,
952                  shared_modulus2, shared_offset2,
953                  100.0-(((100.0*ww)*ww)/shared_table_size)/shared_table_size,
954                  shared_table_size);
955         UNLOCKMUTEX;
956     }
957 //  printlog("At least one thread complete\n");
958     for (i=0; i<cpu_count-1; i++)
959     {   // printlog("Try to join with %d\n", i);
960         if (JOINTHREAD_FAILED(i))
961         {   std::fprintf(stderr, "Unable to join thread\n");
962             std::exit(1);
963         }
964         // printlog("Join with %d OK\n", i);
965     }
966 //  printlog("All threads now finished\n");
967     printlog("finished: final result is %d : %d   %.4f\n",
968              best_modulus2, best_offset2, best_merit);
969     if (successes == 0) // Failure!
970     {   r.table_size = static_cast<std::uint32_t>(-1);
971         r.modulus2 = r.offset2 = 4;
972         r.merit = 4.0;
973         std::free(table);
974         return r;
975     }
976     printlog("%d assignments for %d tries (%.3f%%)\n",
977              successes, tries, (100.0*successes)/tries);
978     r.table_size = hashsize;
979     r.modulus2 = best_modulus2;
980     r.offset2 = best_offset2;
981     r.merit = best_merit;
982     if (use_hungarian)
983         for (i=0; i<r.table_size; i++)
984             (*set_key)(hash_item_size*i+reinterpret_cast<char *>(hash),
985                        shared_table[i]);
986     std::free(table);
987     return r;
988 }
989 
990 
991 
992 // This tries all table sized starting from initial_table_size up to
993 // max_table_size and returns information about the first one for which
994 // a valid assignment is possible. I provide this because the subsequent
995 // faster version that uses binary search is based on an expectation that
996 // there will be assignments usable for every hash table size larger than
997 // the smallest one where there is any assignment at all. That is plausible
998 // but not guaranteed! Having a larger hash table should reduce pressure
999 // and increase flexibility so in general it ought never to make life harder,
1000 // but it is imaginable that the very smallest usable hash table has an
1001 // assignment that is basically an accident...
1002 //
cuckoo_simple_search(int myid,std::uint32_t * items,int item_count,cuckoo_importance * importance,void * table,int hash_item_size,int initial_table_size,int max_table_size,cuckoo_get_key * get_key,cuckoo_set_key * set_key)1003 static cuckoo_parameters cuckoo_simple_search(
1004     int myid,
1005     std::uint32_t *items,
1006     int item_count,
1007     cuckoo_importance *importance,
1008     void *table,
1009     int hash_item_size,
1010     int initial_table_size,
1011     int max_table_size,
1012     cuckoo_get_key *get_key,
1013     cuckoo_set_key *set_key)
1014 {   cuckoo_parameters r = {(std::uint32_t)(-1), 0, 0, 4.0};
1015     int i, probe_count;
1016     std::uint32_t table_size, modulus2, offset2;
1017     for (table_size = initial_table_size; table_size<max_table_size;
1018          table_size++)
1019     {   printlog("Trial with table_size = %d\n", table_size);
1020         r = try_all_hash_functions(
1021                 items,
1022                 item_count,
1023                 importance,
1024                 table,
1025                 hash_item_size,
1026                 table_size,
1027                 set_key,
1028                 0);        // Merely check if there is an allocation
1029         if (r.table_size != static_cast<std::uint32_t>(-1))
1030         {   printlog("Success for %d items and table_size %d (%.2f%%)\n",
1031                      item_count, r.table_size, (100.0*item_count)/table_size);
1032             printlog("modulus2 = %d, offset2 = %d\n", r.modulus2, r.offset2);
1033             break;
1034         }
1035     }
1036     return r;
1037 }
1038 
1039 // This seeks a (near) optimal hash table using binary search to sort out the
1040 // size. Note again that this does not guarantee to find the best solution
1041 // because the mapping table_size -> number of assignments is not guaranteed
1042 // to be monotonic. However if you do not know what size is going to be best
1043 // this can be MUCH faster than a simple linear search, and it is really
1044 // improbable that it will get a very wrong answer!
1045 
1046 // Note that the search will be from [min..max), ie the minimum size quoted
1047 // will be tested by the largest acceptable size will be one lower than the
1048 // one given. This is so that the max can be passed as the size of the
1049 // hash table (and is because of zero-based array addresses).
1050 //
1051 // It will report failure if either the minimum specified table size has
1052 // a succesfull allocation or the maximum one does not.
1053 
1054 
cuckoo_binary_optimise(std::uint32_t * items,int item_count,cuckoo_importance * importance,void * table,int hash_item_size,int min_table_size,int max_table_size,cuckoo_get_key * get_key,cuckoo_set_key * set_key,double merit_target)1055 cuckoo_parameters cuckoo_binary_optimise(
1056     std::uint32_t *items,
1057     int item_count,
1058     cuckoo_importance *importance,
1059     void *table,
1060     int hash_item_size,
1061     int min_table_size,
1062     int max_table_size,
1063     cuckoo_get_key *get_key,
1064     cuckoo_set_key *set_key,
1065     double merit_target)
1066 {   cuckoo_parameters r, rhi;
1067     if (min_table_size >= max_table_size)
1068     {   r.table_size = static_cast<std::uint32_t>(-1);
1069         r.modulus2 = r.offset2 = 3;
1070         r.merit = 3.0;
1071         return r;
1072     }
1073 // First verify that the low limit does not lead to success. Doing this
1074 // is pretty cautious! I am going to ASSUME that the table of keys to be
1075 // inserted has no duplicates, and hence if there is more data than could
1076 // fit into the table there is no chance of success.
1077     if (min_table_size >= item_count)
1078     {   r = try_all_hash_functions(
1079                 items,
1080                 item_count,
1081                 importance,
1082                 table,
1083                 hash_item_size,
1084                 min_table_size,
1085                 set_key,
1086                 0);        // Merely check if there is an allocation
1087         if (r.table_size != static_cast<std::uint32_t>(-1))
1088         {   r.table_size = static_cast<std::uint32_t>(-1);
1089             printlog("Minimum size passed to binary search is too large\n");
1090             r.modulus2 = r.offset2 = 0;
1091             r.merit = 4.0;
1092             return r;
1093         }
1094     }
1095     else printlog("Low limit on table size smaller than item count\n");
1096 // Next confirm that the high limit does lead to success.
1097     rhi = try_all_hash_functions(
1098               items,
1099               item_count,
1100               importance,
1101               table,
1102               hash_item_size,
1103               max_table_size-1,
1104               set_key,
1105               0);        // Merely check if there is an allocation
1106     printlog("rhi: %d %d %d\n", rhi.table_size, rhi.modulus2,
1107              rhi.offset2);
1108     if (rhi.table_size == static_cast<std::uint32_t>(-1))
1109     {   printlog("Failed to fit into largest table size\n");
1110         return r;
1111     }
1112     while (max_table_size > min_table_size+1)
1113     {   int mid = (min_table_size + max_table_size)/2;
1114         r = try_all_hash_functions(
1115                 items,
1116                 item_count,
1117                 importance,
1118                 table,
1119                 hash_item_size,
1120                 mid,
1121                 set_key,
1122                 0);        // Merely check if there is an allocation
1123         if (r.table_size == static_cast<std::uint32_t>(-1)) min_table_size =
1124                 mid;
1125         else
1126         {   max_table_size = mid;
1127             rhi = r;
1128         }
1129     }
1130 // So far I have just identified the best size table to use, but I
1131 // have neither optimised moduls2/offset2 nor recorded a particular
1132 // allocation of keys. So here I use a more expensive search.
1133     printlog("Table will be of size %d\n", rhi.table_size);
1134     return try_all_hash_functions(
1135                items,
1136                item_count,
1137                importance,
1138                table,
1139                hash_item_size,
1140                rhi.table_size,
1141                set_key,
1142                1);        // Now do the full job!
1143 }
1144 
1145 // end of components.cpp
1146