1 // components.cpp Copyright (C) A C Norman, October 2015-2021
2
3 /**************************************************************************
4 * Copyright (C) 2021, Codemist. A C Norman *
5 * *
6 * Redistribution and use in source and binary forms, with or without *
7 * modification, are permitted provided that the following conditions are *
8 * met: *
9 * *
10 * * Redistributions of source code must retain the relevant *
11 * copyright notice, this list of conditions and the following *
12 * disclaimer. *
13 * * Redistributions in binary form must reproduce the above *
14 * copyright notice, this list of conditions and the following *
15 * disclaimer in the documentation and/or other materials provided *
16 * with the distribution. *
17 * *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS *
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT *
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS *
21 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE *
22 * COPYRIGHT OWNERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, *
23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, *
24 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS *
25 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND *
26 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR *
27 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF *
28 * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH *
29 * DAMAGE. *
30 *************************************************************************/
31
32 // $Id: components.cpp 5736 2021-03-16 10:41:22Z arthurcnorman $
33
34 // The code here is for the offline creation of a hash table that will
35 // follow in the spirit of cuckoo hashing using 1, 2 or 3 probes (but
36 // never more) to retrieve information. It tries a potentially large
37 // number of hash functions and picks one that optimises table occupancy.
38 // I use this for building a hash table that holds metrics for characters,
39 // and (to my amazement) achieve around 99% table occupancy with a worst
40 // case access cost of 3 probes.
41 //
42 // A central part of the process of setting up the hash table involves
43 // finding matchings in bipartite graphs. The Hopcroft-Karp algorithm is
44 // used for this.
45 //
46 // This version uses threads to do some of its searching in parallel, and
47 // should work on Linux, Windows and OS/X, checking for the number of
48 // CPUs and using them all.
49
50 #include <cstdio>
51 #include <cstdlib>
52 #include <cstdint>
53 #include <cinttypes>
54 #include <cstring>
55 #include <cstdarg>
56 #include <cerrno>
57 #include <cmath>
58
59 #ifdef WIN32
60 #include "winsupport.h"
61 #else
62 #include <pthread.h>
63 #include <unistd.h>
64 #endif
65
66 #include "cuckoo.h"
67
68 #include "memorysize.cpp"
69
70 // I now try to encapsulate the variations between thread support across
71 // Windows and the rest of the world...
72 // The macros here are not very syntactically safe, but I will be using
73 // them in very stylised manners so I am not too worried.
74
// Per-thread identity of a worker: each worker stores its own index here on
// startup (see hungarian_thread); -1 marks a thread that has not been given
// a number yet (e.g. the main thread).
static __thread int threadnumber = -1;
76
77 #ifdef WIN32
78
// Windows locks: critical_section guards the shared search state and
// logmutex serialises log output.  Presumably the LOCKMUTEX/LOCKLOGMUTEX
// macros used later map onto these - confirm in winsupport.h.
CRITICAL_SECTION critical_section, logmutex;
80
number_of_processors()81 static int number_of_processors()
82 { SYSTEM_INFO si;
83 GetSystemInfo(&si);
84 return static_cast<int>(si.dwNumberOfProcessors);
85 }
86
87 // In very old versions of Windows one needed to use _beingthreadx rather
88 // than CreateThread here, but that is no longer the case (as best I can
89 // discern!).
90
91 // The function that implements a thread will return a value THREAD_RESULT
92 // of type THREAD_RESULT_T.
93
// On Windows a thread function has result type DWORD __stdcall and this
// code always returns THREAD_RESULT (i.e. 0) from it.
#define THREAD_RESULT_T DWORD __stdcall
#define THREAD_RESULT 0

// I will have a limit on the maximum number of threads I will ever
// create - and at present I am not expecting to find that I have
// a shared-memory cpu with more than 32 CPUs.

#define MAX_CPU_COUNT 32
static HANDLE thread[MAX_CPU_COUNT];

// Evaluates to true if creation of worker thread i (running function p,
// handed its own index i as its argument) failed.  The handle is kept in
// thread[i] for later SLEEP/JOIN use.
#define CREATETHREAD_FAILED(i, p) \
    (thread[i] = CreateThread(nullptr, 0, &p, \
       reinterpret_cast<void *>(reinterpret_cast<std::intptr_t>(i)), 0, nullptr)) == nullptr
// On Windows I will use a time-limited wait for my first worker
// thread to terminate as my "sleep" operation - that means that as soon
// as that thread terminates I wake up... and I should find the
// thread_finished variable has been set.  Once the thread has ended its
// handle is closed and nulled so the later join treats it as done.
#define SLEEP if (WaitForSingleObject(thread[0], 5000) == WAIT_OBJECT_0) \
    { CloseHandle(thread[0]); \
      thread[0] = nullptr; \
    }
// Evaluates to true if waiting for (and closing) thread i failed.  A
// handle already closed by SLEEP (nullptr) joins trivially.
#define JOINTHREAD_FAILED(i) \
    (thread[i] != nullptr && \
     (WaitForSingleObject(thread[i], INFINITE) != WAIT_OBJECT_0 || \
      CloseHandle(thread[i]) == 0))
119
120 #else // Now for the pthreads version for Linux, Unix, BSD, OS/X etc.
121
// pthreads synchronisation objects: "mutex" guards the shared search state,
// "logmutex" serialises log output, and condmutex/cond let the first worker
// thread to finish wake the main thread (see SLEEP and hungarian_thread).
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER,
                logmutex = PTHREAD_MUTEX_INITIALIZER,
                condmutex = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
126
127 #ifdef _SC_NPROCESSORS_ONLN
128 // _SC_NPROCESSORS_ONLN is not a proper POSIX enquiry to send to sysconf,
129 // but it is available on Linux, cygwin and OS/X and so is worth using here,
130 // perhaps especially since an #ifdef can easily check for it. I use ONLN
131 // not CONF because some processors may have many cores but leave some of
132 // then disabled - I see reports that in particular big/little ARM chips
133 // do this. Note that for hyperthreaded CPUs this returns the number of
134 // threads I can run in parallel, not the number of raw cores. I also ignore
135 // issues of load and permit my code to use all available resources.
136
// Report the number of processors (hardware threads) currently online.
// sysconf() returns a long and reports -1 on failure, so the result is
// clamped to at least 1 and the narrowing to int is made explicit rather
// than silent as before.
static int number_of_processors()
{   long n = sysconf(_SC_NPROCESSORS_ONLN);
    return n < 1 ? 1 : static_cast<int>(n);
}
140
141 #else // sysconf option to check CPU count
142
// Fallback when sysconf cannot be asked for the online CPU count: report a
// single processor.  The message previously named _SC_NPROCESSORS_CONF even
// though the #ifdef that selects this branch tests _SC_NPROCESSORS_ONLN;
// the text now matches the actual check.
static int number_of_processors()
{   std::printf("_SC_NPROCESSORS_ONLN not defined\n");
    return 1;
}
147
148 #endif // sysconf option to check CPU count
149
// Under pthreads a thread function returns void * and this code always
// returns THREAD_RESULT (nullptr) from it.
#define THREAD_RESULT_T void *
#define THREAD_RESULT nullptr

// Upper limit on the number of worker threads that will ever be created.
#define MAX_CPU_COUNT 32
static pthread_t thread[MAX_CPU_COUNT];

// Evaluates to non-zero if creation of worker thread i (running function p,
// handed its own index i as its argument) failed.
#define CREATETHREAD_FAILED(i, p) \
    pthread_create(&thread[i], nullptr, &p, reinterpret_cast<void *>(reinterpret_cast<std::intptr_t>(i)))
// Wait for up to 5 seconds, or until a worker broadcasts on "cond" (the
// first thread to finish does so - see hungarian_thread).
// NOTE(review): pthread_cond_timedwait requires condmutex to be locked by
// the caller; the lock is not taken inside this macro, so confirm that the
// (unseen) call sites hold condmutex around SLEEP.
#define SLEEP \
    { struct std::timespec ts; \
      clock_gettime(CLOCK_REALTIME, &ts); \
      ts.tv_sec += 5; \
      pthread_cond_timedwait(&cond, &condmutex, &ts); \
    }
// Evaluates to non-zero if waiting for thread i to terminate failed.
#define JOINTHREAD_FAILED(i) \
    pthread_join(thread[i], nullptr)
166
167 #endif // Windows vs pthreads.
168
169
170 // logging in a way that is thread-safe
171
// Set DEBUG to a non-zero value to enable the very verbose printlog()
// tracing scattered through the matching code below.
#define DEBUG (0)
173
printlog(const char * s,...)174 void printlog(const char *s, ...)
175 { std::va_list a;
176 LOCKLOGMUTEX;
177 va_start(a, s);
178 #ifndef SEQUENTIAL
179 std::fprintf(stdout, "{%d:", threadnumber);
180 #endif
181 std::vfprintf(stdout, s, a);
182 #ifndef SEQUENTIAL
183 std::fprintf(stdout, ":%d}", threadnumber);
184 #endif
185 std::fflush(stdout);
186 va_end(a);
187 UNLOCKLOGMUTEX;
188 }
189
190
191 // See workfarm.c for a small program illustrating use of threads to attack
192 // a problem exploiting concurrency.
193
194 // Because the table was passed as a "void *" and we do not have
195 // a single type that maps the entries it is not feasible to use direct
196 // array access (as in "table[n]") to retrieve and update items in it.
197 // So here I have two macros that use the explicitly passed item size
198 // and accress arithmetic to create a pointer to the nth item in the
199 // table, and then the user-supplied functions to deal with just the
200 // part of the thing there that represents the key. These are macros
201 // not functions because they use a number of values not explicitly passed
202 // to them as arguments.
203
204 // I am expecting that insertion in these tables will not be done at
205 // all often, so having a somewhat clumsy-looking interface here does
206 // not worry me too much, and I will even accept code that is not
207 // terribly neat!
208
209
210 // I use the Hopcroft-Karp maximum matching algorithm to determine if there
211 // is any way of using a particular size of hash table. Once I know how large
212 // a hash table to use I can use the Hungarian algorithm to find an assignment
213 // to favours placing keys in positions that minimise the average number
214 // of probes that lookup will need.
215
216 // The Hopcroft-Karp code I use is imported, but had been been placed under
217 // very flexible license terms. See the top of the source code for details.
218
219 #include "hopkar.cpp"
220
221 // Now an adapter that takes my hashing problem and maps it onto the
222 // an instance of a matching problem. The input is a set of keys, a table
223 // size and two parameters that control details of the hashing. It then has
224 // the hash table it is tryng to create, which it fills in if it can find
225 // a perfect way to do so. Here the table is whatevver final native-shape
226 // hash table will be required, and the procedures get_key and set_key
227 // access the keys in it.
228
229 // This tries to insert all the keys in "items" into the hash-table "table"
230 // and it returns 0 if it fails. Because it is just finding a matching (ANY
231 // matching) it will be useful for discovering what the smallest possible
232 // table size is. But it will not take any steps at all to pick a matching
233 // that prefers first choice placements for keys over the other two
234 // possibilities. For that see the Hungarian algorith below which will be
235 // able to perform further optimisation but which is expected to be slower.
236
// Upper bound on the number of keys (and table slots) one matching problem
// may involve.  NOTE(review): find_any_assignment indexes the arrays below
// by item_count and table_size without any range check - confirm that all
// callers keep both within LARGEST_MATCHING.
#define LARGEST_MATCHING 20000

// Scratch space for find_any_assignment.  Thread-local so that parallel
// workers each get an independent copy.
static __thread int u_used[LARGEST_MATCHING+1];    // chain/visited marks for U (keys); +1 for a sentinel slot
static __thread int QQ[9*LARGEST_MATCHING+1];      // breadth-first search work queue
static __thread int u_edge1[LARGEST_MATCHING];     // first-choice table slot per key, -1 if none
static __thread int u_edge2[LARGEST_MATCHING];     // second-choice table slot per key, -1 if none
static __thread int u_edge3[LARGEST_MATCHING];     // third-choice table slot per key, -1 if none
static __thread int v_used[LARGEST_MATCHING];      // per slot: local name within current component, -1 if unvisited
static __thread int v_edgemap[LARGEST_MATCHING];   // per slot: start of its edge list within vu_edges
static __thread int v_edgecount[LARGEST_MATCHING]; // per slot: number of keys that can hash there
static __thread int vu_edges[3*LARGEST_MATCHING];  // flattened slot-to-key edge lists
248
249
// Decide whether ALL the keys in items[] can be placed in a hash table of
// table_size slots, where each key may only live at (up to) three probe
// positions:
//     h1 = key % table_size
//     h2 = key % modulus2 + offset2
//     h3 = (h1 + h2) % table_size
// Keys classified CUCKOO_VITAL may only use h1, and CUCKOO_IMPORTANT ones
// only h1 or h2.  Returns 1 if a perfect placement exists and 0 otherwise.
// Only feasibility is decided - there is no preference between probe
// positions - by running Hopcroft-Karp on each connected component of the
// bipartite graph whose vertex sets are the keys (U) and the slots (V).
// The arguments table, hash_item_size, get_key, set_key and noisy are not
// used in this function; they are part of the interface shared with the
// other assignment routines.
int find_any_assignment(
    std::uint32_t *items,            // the keys to be placed
    int item_count,                  // how many keys there are
    cuckoo_importance *importance,   // classifies each key (VITAL/IMPORTANT/other)
    void *table,                     // (unused here)
    int hash_item_size,              // (unused here)
    int table_size,                  // number of slots available
    cuckoo_get_key *get_key,         // (unused here)
    cuckoo_set_key *set_key,         // (unused here)
    std::uint32_t modulus2,          // second-hash modulus
    std::uint32_t offset2,           // second-hash offset
    int noisy)                       // (unused here)
{   int i, j, k, l, Qin, Qout;
    int ucount, vcount, uchain;
    int uremaining = item_count, vremaining = table_size;
    if (DEBUG)
        printlog("find_any_assignment %d items in table of size %d\n",
                 item_count, table_size);
    if (DEBUG) printlog("modulus2 = %d offset2 = %d\n", modulus2,
                        offset2);
// First I will construct a representation of the sparse graph.  -1 in any
// of the edge or "used" arrays means "no edge / not yet visited".
    for (i=0; i<item_count; i++)
    {   u_edge1[i] = u_edge2[i] = u_edge3[i] = -1;
        u_used[i] = -1;
    }
    u_used[item_count] = -1; // a sentinel, so the skip-loop at the end stops
    for (i=0; i<table_size; i++)
    {   v_edgemap[i] = v_edgecount[i] = 0;
        v_used[i] = -1;
    }
    for (i=0; i<item_count; i++)
    {   std::uint32_t key = items[i];
        std::uint32_t h1 = key % table_size;
        std::uint32_t h2 = key % modulus2 + offset2;
        std::uint32_t h3 = (h1 + h2) % table_size;
        int imp = (*importance)(key);
// Get rid of double edges... if an entry in the table is "-1" that
// means "no edge here".  (The -1 stored into the unsigned h2/h3 becomes
// 0xFFFFFFFF and converts back to -1 via the static_cast<int> below.)
        if (h3 == h2 || h3 == h1) h3 = -1;
        if (h2 == h1)
        {   h2 = h3;
            h3 = -1;
        }
// Deliberate fallthrough: ordinary keys get all three edges, IMPORTANT
// ones two and VITAL ones only the h1 edge.
        switch (imp)
        {   default:
                u_edge3[i] = static_cast<int>(h3);
                if (h3 != -1) v_edgecount[h3]++; // count edges out from h3
        // drop through
            case CUCKOO_IMPORTANT:
                u_edge2[i] = static_cast<int>(h2);
                if (h2 != -1) v_edgecount[h2]++;
        // drop through
            case CUCKOO_VITAL:
                u_edge1[i] = static_cast<int>(h1);
                if (h1 != -1) v_edgecount[h1]++;
                break;
        }
        if (DEBUG) printlog("%4d: (%5d) %5d %5d %5d\n", i, items[i],
                            u_edge1[i], u_edge2[i], u_edge3[i]);
    }
// I have counted how many edges are incident at each vertex in set V so
// I can now put in pointers (indices into vu_edges) to the general table
// of edges V to U.
    j = 0;
    for (i=0; i<table_size; i++)
    {   v_edgemap[i] = j;
        if (DEBUG)
            printlog("table entry %d is pointed at by %d items at %d\n",
                     i, v_edgecount[i], v_edgemap[i]);
        j += v_edgecount[i];
    }
// Fill in the edges.  Each insertion advances v_edgemap[j], which is why
// the pointers have to be rebuilt immediately afterwards.
    for (i=0; i<item_count; i++)
    {   if ((j = u_edge1[i]) >= 0) vu_edges[v_edgemap[j]++] = i;
        if ((j = u_edge2[i]) >= 0) vu_edges[v_edgemap[j]++] = i;
        if ((j = u_edge3[i]) >= 0) vu_edges[v_edgemap[j]++] = i;
    }
// Restore the v_edgemap pointers to the start of each slot's edge list.
    j = 0;
    for (i=0; i<table_size; i++)
    {   v_edgemap[i] = j;
        if (DEBUG) printlog("V[%d] ->", i);
        if (DEBUG) for (k=0; k<v_edgecount[i];
                        k++) printlog(" %d", vu_edges[j+k]);
        if (DEBUG) printlog("\n");
        j += v_edgecount[i];
    }
// Process one connected component of the graph per iteration, seeding each
// search from the lowest-numbered not-yet-visited key.
    i = 0;
    while (i<item_count)
    {
// Here I want to collect all the vertexes reachable from U[i]. I will
// need to know how many there are in U and how many in V, and only when I
// have got those counts can I try using Hopcroft-Karp to process them.
// I will chain up the locations in U that I find.
        ucount = 0;
        vcount = 0;
        uchain = -2; // a terminator for the chain.
        Qin = Qout = 0;
        QQ[Qin++] = i;
        if (DEBUG)
            printlog("Seed search for component with %d (Qin=%d Qout=%d)\n", i,
                     Qin, Qout);
// Breadth-first search alternating U -> V -> U until the queue empties.
        while (Qin != Qout)
        {   j = QQ[Qout++];
            if (u_used[j] != -1)
            {   if (DEBUG) printlog("U[%d] already processed\n", j);
                continue; // already processed
            }
            if (DEBUG) printlog("spread from U[%d], chain to %d\n", j, uchain);
// Record U[j] as having been used, linking it into the chain of component
// members (u_used doubles as the chain's "next" field).
            u_used[j] = uchain;
            uchain = j;
            ucount++;
            if (DEBUG) printlog("chain up U. Now have %d vertices in A\n",
                                ucount);
// Now try visiting each vertex in V that is joined to j...  A newly
// visited slot is given a small local name (vcount) for Hopcroft-Karp,
// and all keys that could hash to it are queued.
            if ((k = u_edge1[j]) != -1 && v_used[k] == -1)
            {   v_used[k] = vcount++;
                if (DEBUG) printlog("visit V[%d] for first time and name it %d\n",
                                    k, v_used[k]);
                for (l=0; l<v_edgecount[k]; l++)
                {   int m = vu_edges[v_edgemap[k] + l];
                    if (DEBUG) printlog("edge goes back to %d\n", m);
                    if (u_used[m] == -1) QQ[Qin++] = m;
                }
            }
            if ((k = u_edge2[j]) != -1 && v_used[k] == -1)
            {   v_used[k] = vcount++;
                if (DEBUG) printlog("visit V[%d] for first time and name it %d\n",
                                    k, v_used[k]);
                for (l=0; l<v_edgecount[k]; l++)
                {   int m = vu_edges[v_edgemap[k] + l];
                    if (DEBUG) printlog("edge goes back to %d\n", m);
                    if (u_used[m] == -1) QQ[Qin++] = m;
                }
            }
            if ((k = u_edge3[j]) != -1 && v_used[k] == -1)
            {   v_used[k] = vcount++;
                if (DEBUG) printlog("visit V[%d] for first time and name it %d\n",
                                    k, v_used[k]);
                for (l=0; l<v_edgecount[k]; l++)
                {   int m = vu_edges[v_edgemap[k] + l];
                    if (DEBUG) printlog("edge goes back to %d\n", m);
                    if (u_used[m] == -1) QQ[Qin++] = m;
                }
            }
            if (DEBUG) printlog("Now Qin=%d Qout=%d\n", Qin, Qout);
        }
        if (DEBUG) printlog("Component found with size %d by %d\n", ucount,
                            vcount);
// If there are too many items in set U in this component then report
// failure. I could also report failure if the number of vertices left over
// were then out of balance.
        uremaining -= ucount;
        vremaining -= vcount;
        if (ucount > vcount ||
            uremaining > vremaining)
        {   if (DEBUG) printlog("Early exit\n");
            return 0;
        }
// If I have a component with only 1 or 2 vertices in set V then (given the
// balance check above) there has to be a complete matching. If the
// component is larger I will use Hopcroft-Karp to assess it.
        else if (vcount >= 3)
        {   init(ucount, vcount);
// Walk the chain of U-vertices built above, handing their edges (using the
// local V names) to the Hopcroft-Karp solver.  Note that i is reused here
// as the solver-side index of each key.
            i = 0;
            while (uchain != -2)
            {   if (DEBUG) printlog("i = %d uchain = %d\n", i, uchain);
                if ((j = u_edge1[uchain]) != -1) addEdge(i, v_used[j]);
                if ((j = u_edge2[uchain]) != -1) addEdge(i, v_used[j]);
                if ((j = u_edge3[uchain]) != -1) addEdge(i, v_used[j]);
                uchain = u_used[uchain];
                if (i++ > ucount)
                {   printlog("uchain too long\n");
                    std::exit(1);
                }
            }
// A matching smaller than ucount means some key in this component cannot
// be placed, so the whole attempt fails.
            if (maxMatching() != ucount) return 0;
        }
// I have put a -1 just after the end of the useful data so that the
// loop here that skips to the next un-visited vertex is certain to
// terminate nicely for me.
        while (u_used[i] != -1) i++;
    }
    return 1;
}
435
436 // Once a table has been set up this function checks that each key is properly
437 // present and it computes a figure of merit for it... This code may in
438 // fact not be used!
439
// Check that every key in items[] can be retrieved from the (already built)
// hash table "table" by probing at h1, then h2, then h3 - restricted to the
// probes its importance class allows - and compute a weighted "figure of
// merit" for the layout (smaller is better; see the formula at the end).
// Exits the whole program if any key cannot be found.  Entries in the table
// are hash_item_size bytes apart and get_key extracts the key from one.
double cuckoo_merit(
    std::uint32_t *items,            // the keys that should all be present
    int item_count,                  // how many of them
    cuckoo_importance *importance,   // classifies each key
    void *table,                     // the populated hash table
    int hash_item_size,              // bytes per table entry
    int table_size,                  // number of entries in the table
    cuckoo_get_key *get_key,         // reads the key field of an entry
    std::uint32_t modulus2,          // second-hash modulus
    std::uint32_t offset2)           // second-hash offset
{   int i;
    // Counters: nXXXk counts keys of class XXX found at probe k.
    int nvital = 0;
    int nimportant = 0, nimp1 = 0, nimp2 = 0;
    int nstandard = 0, nstand1 = 0, nstand2 = 0, nstand3 = 0;
    for (i=0; i<item_count; i++)
    {   std::uint32_t key = items[i];
        std::uint32_t h1 = key % table_size;
        std::uint32_t h2 = key % modulus2 + offset2;
        std::uint32_t h3 = (h1 + h2) % table_size;
        int imp = (*importance)(key);
// Every branch of this switch either break-s out after counting a hit or
// terminates the program, so no case ever actually falls into the next.
        switch (imp)
        {   default:       // CUCKOO_STANDARD keys may live at h1, h2 or h3.
                nstandard++;
                if (key == (*get_key)(h1*hash_item_size+reinterpret_cast<char *>
                                      (table)))
                {   nstand1++;
                    break;
                }
                else if (key == (*get_key)(h2*hash_item_size+reinterpret_cast<char *>
                                           (table)))
                {   nstand2++;
                    break;
                }
                else if (key == (*get_key)(h3*hash_item_size+reinterpret_cast<char *>
                                           (table)))
                {   nstand3++;
                    break;
                }
                else
                {   printlog("Failed to find item %d (%u/%x)\n", i, key, key);
                    std::exit(1);
                }
            case CUCKOO_IMPORTANT:  // may live at h1 or h2 only.
                nimportant++;
                if (key == (*get_key)(h1*hash_item_size+reinterpret_cast<char *>
                                      (table)))
                {   nimp1++;
                    break;
                }
                else if (key == (*get_key)(h2*hash_item_size+reinterpret_cast<char *>
                                           (table)))
                {   nimp2++;
                    break;
                }
                else
                {   printlog("Failed to find item %d (%u/%x)\n", i, key, key);
                    std::exit(1);
                }
            case CUCKOO_VITAL:      // must live at h1.
                nvital++;
                if (key == (*get_key)(h1*hash_item_size+reinterpret_cast<char *>
                                      (table)))
                    break;
                else
                {   printlog("Failed to find item %d (%u/%x)\n", i, key, key);
                    std::exit(1);
                }
        }
    }
// The "figure of merit" is a weighted average that scores each VITAL key
// as worth 10, each IMPORTANT ones as 4 and the rest as worth 1 each,
// multiplied by the number of probes needed to reach the key. Smaller
// values are better; 1.0 would mean every key at its first choice.
    return (10.0*nvital +
            4.0*(nimp1 + 2.0*nimp2) +
            (nstand1 + 2.0*nstand2 + 3.0*nstand3)) /
           (10.0*nvital + 4.0*nimportant + nstandard);
}
517
518 // As well as being able to find assignments I need to be able to find the
519 // BEST assignments, and the following imported code that implements the
520 // Hungarian algorith sorts that out. It uses a quite different interface
521 // from the Hopcroft-Karp code - which is reasonable since its source was
522 // different. I then need to write code to link it to the problems that I
523 // wish to solve.
524
525 // The code I have imported deals with general matchings and represents the
526 // bipartite graph as a full matrix. In my case the graph that I wish to
527 // process is sparse - it has at most three edges for each vertex in the
528 // set A. I am reworking the code to use adjcacency lists to
529 // represent the sparse array. Predefine SPARSE to try that version.
530
531 #ifdef SPARSE
532 #include "hunsparse.cpp"
533 #else
534 #include "hungarian.cpp"
535 #endif
536
537 // Now an adapter that takes my hashing problem and maps it onto the
538 // calls to the hungarian problem solver. The input is a set of keys,
539 // a table size and two parameters that control details of the hashing.
540 // It then has the hash table it is trying to create, which it fills in
541 // if it can find a perfect way to do so. For the code here the hash table
542 // is merely an array of uint32_t values.
543 //
544 // This tries to insert all the keys in "items" into the hash-table "table"
545 // and it returns a positive "merit" value on success, or -1.0 on failure
546 //
547
// Cost matrix for the Hungarian solver, allocated afresh on each call to
// find_best_assignment.  Thread-local so concurrent searches do not clash.
static __thread int **adjacency_matrix = nullptr;
549
find_best_assignment(std::uint32_t * items,int item_count,cuckoo_importance * importance,std::uint32_t * table,int table_size,std::uint32_t modulus2,std::uint32_t offset2)550 double find_best_assignment(
551 std::uint32_t *items,
552 int item_count,
553 cuckoo_importance *importance,
554 std::uint32_t *table,
555 int table_size,
556 std::uint32_t modulus2,
557 std::uint32_t offset2)
558 { int i, j;
559 int *mem;
560 hungarian_problem_t p;
561 printlog("starting find_best_assignment %d %d\n", item_count,
562 table_size);
563 printlog("m2=%d o2=%d\n", modulus2, offset2);
564 adjacency_matrix = new int *[table_size + table_size*item_count + 2];
565 hungarian_test_alloc(adjacency_matrix);
566 mem = reinterpret_cast<int *>(&adjacency_matrix[table_size]);
567 for (i=0; i<table_size; i++)
568 { adjacency_matrix[i] = mem;
569 mem += item_count;
570 // I put in a value that is not "infinite" but is such that any assignment
571 // that uses an edge of this length will be pretty bad.
572 for (j=0; j<item_count; j++)
573 adjacency_matrix[i][j] = 0x00010000;
574 }
575 for (i=0; i<item_count; i++)
576 { std::uint32_t key = items[i];
577 std::uint32_t h1 = key % table_size;
578 std::uint32_t h2 = key % modulus2 + offset2;
579 std::uint32_t h3 = (h1 + h2) % table_size;
580 int imp = (*importance)(key);
581 // I have thought a bit about the weights I use here, but to a large extent
582 // they are a bit of an arbitrary choice.
583 switch (imp)
584 { default:
585 adjacency_matrix[h1][i] = 0;
586 adjacency_matrix[h2][i] = 1;
587 adjacency_matrix[h3][i] = 2;
588 break;
589 case CUCKOO_IMPORTANT:
590 adjacency_matrix[h1][i] = 0;
591 adjacency_matrix[h2][i] = 10;
592 break;
593 case CUCKOO_VITAL:
594 adjacency_matrix[h1][i] = 0;
595 break;
596 }
597 }
598 hungarian_init(&p,
599 adjacency_matrix,
600 table_size,
601 item_count,
602 HUNGARIAN_MODE_MINIMIZE_COST);
603 printlog("call hungarian_solve\n");
604 hungarian_solve(&p);
605 printlog("hungarian_solve returned\n");
606 // Next I will extract the assignment found and put it in my hash table.
607 // While I am about it I will collect some statistics.
608 int nvital = 0;
609 int nimportant = 0, nimportant1 = 0, nimportant2 = 0;
610 int nstandard = 0, nstandard1 = 0, nstandard2 = 0, nstandard3 = 0;
611 for (i=0; i<item_count; i++)
612 { std::uint32_t key = items[i];
613 std::uint32_t h1 = key % table_size;
614 std::uint32_t h2 = key % modulus2 + offset2;
615 std::uint32_t h3 = (h1 + h2) % table_size;
616 int imp = (*importance)(key);
617 switch (imp)
618 { case CUCKOO_VITAL:
619 if (p.assignment[h1][i])
620 { table[h1] = key;
621 nvital++;
622 break;
623 }
624 else
625 { hungarian_free(&p);
626 std::free(adjacency_matrix);
627 return -1.0;
628 }
629 case CUCKOO_IMPORTANT:
630 if (p.assignment[h1][i])
631 { table[h1] = key;
632 nimportant++;
633 nimportant1++;
634 break;
635 }
636 else if (p.assignment[h2][i])
637 { table[h2] = key;
638 nimportant++;
639 nimportant2++;
640 break;
641 }
642 else
643 { hungarian_free(&p);
644 std::free(adjacency_matrix);
645 return -1.0;
646 }
647 case CUCKOO_STANDARD:
648 if (p.assignment[h1][i])
649 { table[h1] = key;
650 nstandard++;
651 nstandard1++;
652 break;
653 }
654 else if (p.assignment[h2][i])
655 { table[h2] = key;
656 nstandard++;
657 nstandard2++;
658 break;
659 }
660 else if (p.assignment[h3][i])
661 { table[h3] = key;
662 nstandard++;
663 nstandard3++;
664 break;
665 }
666 else
667 { hungarian_free(&p);
668 std::free(adjacency_matrix);
669 return -1.0;
670 }
671 }
672 }
673 printlog("transferred to hash table\n");
674 double merit = (10.0*nvital +
675 4.0*(nimportant1 + 2.0*nimportant2) +
676 (nstandard1 + 2.0*nstandard2 + 3.0*nstandard3)) /
677 (10.0*nvital + 4.0*nimportant + nstandard);
678 #define VERBOSE
679 #ifdef VERBOSE
680 double avimportant =
681 (nimportant1+2.0*nimportant2)/nimportant;
682 double avstandard =
683 (nstandard1+2.0*nstandard2+3.0*nstandard3)/nstandard;
684 // But I will also compute an average that shows the expected number of
685 // probes if all keys are accessed with equal probability.
686 double average =
687 (nvital + nimportant1 + 2.0*nimportant2 +
688 nstandard1 + 2.0*nstandard2 + 3.0*nstandard3) / item_count;
689 { printlog("\ntable_size = %u modulus2 = %u offset2 = %u occupancy %.2f\n",
690 table_size, modulus2, offset2,
691 (100.0*item_count)/table_size);
692 if (nvital != 0)
693 printlog("VITAL: %u 1.0 average probes\n", nvital);
694 if (nimportant1!=0 || nimportant2 != 0)
695 printlog("IMPORTANT: %u %u %.3f average probes\n",
696 nimportant1, nimportant2, avimportant);
697 if (nstandard1!=0 || nstandard2 != 0)
698 printlog("STANDARD: %u %u %u %.3f average probes\n",
699 nstandard1, nstandard2, nstandard3, avstandard);
700 printlog("Figure of merit = %.4f flat average = %.3f\n", merit,
701 average);
702 }
703 #endif
704 hungarian_free(&p);
705 std::free(adjacency_matrix);
706 return merit;
707 }
708
// Fetch the 32-bit key stored at the start of a hash-table entry.
std::uint32_t int32_get_key(void *p)
{   return *static_cast<std::uint32_t *>(p);
}
712
// Store a 32-bit key at the start of a hash-table entry.
void int32_set_key(void *p, std::uint32_t v)
{   *static_cast<std::uint32_t *>(p) = v;
}
716
717 // To perform a parallel search I need to be able to distribute cases
718 // that need analysis to the various threads. A test case is a pair
719 // of a value for modulus2 and one for offset2.
720
// One unit of work for the search: a candidate configuration for the
// second hash function.  modulus2 == 0 is used as the "no more work" mark.
typedef struct __mod_and_off
{   std::uint32_t modulus2;   // modulus of the second hash function
    std::uint32_t offset2;    // offset added by the second hash function
    int noisy;                // set on every 1000th case handed out, so a
                              // worker may log progress for it
} mod_and_off;
726
727
728 // I have some data that is common to all the threads....
729
// State shared between the main thread and all workers.  The shared_*
// values describe the problem and hold the cursor through candidate hash
// parameters; the best_* values record the best table found so far.  All
// are read and written under the main mutex (LOCKMUTEX/UNLOCKMUTEX).
static int shared_use_hungarian;       // non-zero: optimise with the Hungarian algorithm
static std::uint32_t *shared_keys;     // the keys to be placed
static int shared_key_count;           // how many keys
static std::uint32_t *shared_table;    // the "official" output hash table
static int shared_table_size;          // slots in that table
static cuckoo_importance *shared_importance; // key classifier
static std::uint32_t shared_modulus2;  // next modulus2 to hand out; 0 => search over
static std::uint32_t shared_offset2;   // next offset2 to hand out
static std::uint32_t best_modulus2;    // parameters of the best table so far
static std::uint32_t best_offset2;
static double best_merit;              // its figure of merit (smaller is better)
static int tries;                      // number of cases handed out so far
static int successes;                  // number of cases that yielded a valid table
743
744 // This should return the next mod_and_off, or one with values (0,0)
745 // when there are no more left.
746
hungarian_next_case()747 mod_and_off hungarian_next_case()
748 { mod_and_off r;
749 LOCKMUTEX;
750 tries++;
751 r.modulus2 = shared_modulus2;
752 r.offset2 = shared_offset2;
753 if (shared_modulus2 != 0)
754 { if (shared_modulus2 + shared_offset2 < shared_table_size)
755 shared_offset2++;
756 else
757 { shared_modulus2++;
758 shared_offset2 = 0;
759 if (shared_modulus2 >= shared_table_size) shared_modulus2 = 0;
760 }
761 }
762 if (tries % 1000 == 0) r.noisy = 1;
763 else r.noisy = 0;
764 UNLOCKMUTEX;
765 return r;
766 }
767
768 // Here is a function that implements what a thread should do. I have just one
769 // variable called thread_finished and if a thread is ending it will set it
770 // to 1. I will not worry about locks since this only needs to notice if at
771 // least one thread is done.
772 //
773 // If shared_use_hungarian is true this considers ALL the possible values for
774 // modulus2 and offset2 and constructs a proper hash-table allocation for the
775 // configuration that delivers the best figure of merit. Otherwise it merely
776 // seeks a valid assigment and arranges that all threads stop once one of
777 // them has found one. It does not return a figure of merit and does not
778 // fill in a detailed hash assignment (even though it could).
779
// Set to 1 by the first worker to finish (under condmutex on pthreads, and
// with a cond broadcast) so the main thread's wait loop wakes up promptly.
static int thread_finished;
781
hungarian_thread(void * arg)782 THREAD_RESULT_T hungarian_thread(void *arg)
783 { mod_and_off mo;
784 std::uint32_t *table = reinterpret_cast<std::uint32_t *>(std)::calloc(
785 shared_table_size, sizeof(std::uint32_t));
786 threadnumber = static_cast<int>(reinterpret_cast<std::intptr_t>(arg));
787 // printlog("Thread %d starting\n", threadnumber);
788 while ((mo = hungarian_next_case()).modulus2 != 0)
789 { double merit;
790 // Before I try the (possibly expensive) Hungarian method I will filter
791 // by using Hopcroft-Karp so that I at least know that there is some feasible
792 // allocation.
793 if (find_any_assignment(
794 shared_keys,
795 shared_key_count,
796 shared_importance,
797 table,
798 sizeof(std::uint32_t),
799 shared_table_size,
800 int32_get_key,
801 int32_set_key,
802 mo.modulus2,
803 mo.offset2,
804 mo.noisy))
805 { printlog("Found assignment for m2=%d : o2=%d for table size %d\n",
806 mo.modulus2, mo.offset2, shared_table_size);
807 if (!shared_use_hungarian)
808 { // printlog("Report back to owner and set quit flag\n");
809 LOCKMUTEX;
810 successes++;
811 best_modulus2 = mo.modulus2;
812 best_offset2 = mo.offset2;
813 best_merit = 0.0;
814 // Arrange that the next use of hungarian_next_case reports no more cases
815 // need investigation, and so all other threads will soon terminate too.
816 shared_modulus2 = 0;
817 shared_offset2 = 0;
818 UNLOCKMUTEX;
819 break;
820 }
821 else if ((merit = find_best_assignment(
822 shared_keys,
823 shared_key_count,
824 shared_importance,
825 table,
826 shared_table_size,
827 mo.modulus2,
828 mo.offset2)) >= 0.0)
829 { LOCKMUTEX;
830 successes++;
831 printlog("success count=%d: M=%d O=%d => %.4f\n", successes,
832 mo.modulus2, mo.offset2, merit);
833 if (merit < best_merit)
834 { int i;
835 printlog("+++ New best\n");
836 best_modulus2 = mo.modulus2;
837 best_offset2 = mo.offset2;
838 best_merit = merit;
839 // Each time I get a new best assignment I transcribe it into the main
840 // official hash-table.
841 for (i=0; i<shared_table_size; i++)
842 shared_table[i] = table[i];
843 }
844 UNLOCKMUTEX;
845 }
846 }
847 }
848 std::free(table);
849 printlog("Thread finishing\n");
850 LOCKMUTEX;
851 printlog("Thread finishing\n");
852 if (!thread_finished)
853 {
854 //@ printlog("This is the first thread to finish: lock condmutex\n");
855 #ifndef WIN32
856 // The first thread to finish will signal the main one to that effect.
857 pthread_mutex_lock(&condmutex);
858 thread_finished = 1;
859 //@ printlog("First thread to finish: broadcast\n");
860 pthread_cond_broadcast(&cond);
861 //@ printlog("First thread to finish: unlock\n");
862 pthread_mutex_unlock(&condmutex);
863 //@ printlog("First thread to finish: done\n");
864 #else
865 thread_finished = 1;
866 #endif
867 }
868 UNLOCKMUTEX;
869 return THREAD_RESULT;
870 }
871
872 // This routine is used to analyse tables of some specified size. If its
873 // last argument is false it merely checks if an assignment to a table
874 // of that size is possible, and returns if it manages to find one. If
875 // on the other hand the final argument is true then it will run the
876 // Hungarian algorithm on ALL possible hash functions at that size and
877 // return information about the one that gives the best packing. It
878 // does all this using multiple threads so that the often-expensive
879 // search takes less real time than might otherwise have been the case
880 // when you have a multi-core computer.
881 //
882
try_all_hash_functions(std::uint32_t * keys,int key_count,cuckoo_importance * importance,void * hash,int hash_item_size,int hashsize,cuckoo_set_key * set_key,int use_hungarian)883 cuckoo_parameters try_all_hash_functions(
884 std::uint32_t *keys,
885 int key_count,
886 cuckoo_importance *importance,
887 void *hash,
888 int hash_item_size,
889 int hashsize,
890 cuckoo_set_key *set_key,
891 int use_hungarian)
892 { int cpu_count, i;
893 cuckoo_parameters r;
894 std::uint32_t *table = reinterpret_cast<std::uint32_t *>(std)::calloc(
895 hashsize, sizeof(std::uint32_t));
896 std::uint64_t memsize;
897 hungarian_test_alloc(table);
898 shared_use_hungarian = use_hungarian;
899 shared_keys = keys;
900 shared_key_count = key_count;
901 shared_importance = importance;
902 shared_table = table;
903 shared_table_size = hashsize;
904 shared_modulus2 = 1;
905 if (hashsize > 7000) shared_modulus2 =
906 4500; // Start some way in (@@@)
907 shared_offset2 = 0;
908 best_modulus2 = 0;
909 best_offset2 = 0;
910 best_merit = 4.0;
911 tries = successes = 0;
912
913 printlog("try_all with table size %d and use_h=%d\n",
914 hashsize, use_hungarian);
915
916 cpu_count = number_of_processors();
917 printlog("About to try to read memory size\n");
918 memsize = static_cast<std::uint64_t>(getMemorySize());
919 printlog("Have %.2fGiB memory\n",
920 static_cast<double>(memsize)/(1024.0*1024.0*1024.0));
921 #ifdef SEQUENTIAL
922 cpu_count = 1;
923 #endif
924 if (memsize <= 8LU*1024L*1024L*1024L && cpu_count > 6) cpu_count = 6;
925 if (memsize <= 4LU*1024L*1024L*1024L && cpu_count > 3) cpu_count = 3;
926 if (memsize <= 2LU*1024L*1024L*1024L && cpu_count > 1) cpu_count = 1;
927 printlog("I will use %d processors\n", cpu_count);
928 if (cpu_count > MAX_CPU_COUNT) cpu_count = MAX_CPU_COUNT;
929 cpu_count = 1;
930 thread_finished = 0;
931 for (i=0; i<cpu_count; i++)
932 { if (CREATETHREAD_FAILED(i, hungarian_thread))
933 { std::fprintf(stderr, "Unable to create thread\n");
934 std::exit(1);
935 }
936 }
937 // printlog("All threads created\n");
938 // I will do a wait loop here that arranges that once every 5 seconds
939 // it checks whether at least one thread has finished and displays a
940 // message that tries to give some idea of how far it has got through the
941 // search. My ESTIMATE is that for a full search the code may run for
942 // around 9 hours on an 8-core machine and so there would be of the order
943 // of 6000 lines of trace output - rather a lot but not enough to be
944 // a complete embarassment. Perhaps.
945 while (!thread_finished)
946 { double ww;
947 SLEEP;
948 LOCKMUTEX;
949 ww = static_cast<double>(shared_table_size - shared_modulus2);
950 printlog("%d:%d => %.4f now at m2=%d:o2=%d %.2f%% (limit m2=%d)\n",
951 best_modulus2, best_offset2, best_merit,
952 shared_modulus2, shared_offset2,
953 100.0-(((100.0*ww)*ww)/shared_table_size)/shared_table_size,
954 shared_table_size);
955 UNLOCKMUTEX;
956 }
957 // printlog("At least one thread complete\n");
958 for (i=0; i<cpu_count-1; i++)
959 { // printlog("Try to join with %d\n", i);
960 if (JOINTHREAD_FAILED(i))
961 { std::fprintf(stderr, "Unable to join thread\n");
962 std::exit(1);
963 }
964 // printlog("Join with %d OK\n", i);
965 }
966 // printlog("All threads now finished\n");
967 printlog("finished: final result is %d : %d %.4f\n",
968 best_modulus2, best_offset2, best_merit);
969 if (successes == 0) // Failure!
970 { r.table_size = static_cast<std::uint32_t>(-1);
971 r.modulus2 = r.offset2 = 4;
972 r.merit = 4.0;
973 std::free(table);
974 return r;
975 }
976 printlog("%d assignments for %d tries (%.3f%%)\n",
977 successes, tries, (100.0*successes)/tries);
978 r.table_size = hashsize;
979 r.modulus2 = best_modulus2;
980 r.offset2 = best_offset2;
981 r.merit = best_merit;
982 if (use_hungarian)
983 for (i=0; i<r.table_size; i++)
984 (*set_key)(hash_item_size*i+reinterpret_cast<char *>(hash),
985 shared_table[i]);
986 std::free(table);
987 return r;
988 }
989
990
991
// This tries all table sizes starting from initial_table_size up to
993 // max_table_size and returns information about the first one for which
994 // a valid assignment is possible. I provide this because the subsequent
995 // faster version that uses binary search is based on an expectation that
996 // there will be assignments usable for every hash table size larger than
997 // the smallest one where there is any assignment at all. That is plausible
998 // but not guaranteed! Having a larger hash table should reduce pressure
999 // and increase flexibility so in general it ought never to make life harder,
1000 // but it is imaginable that the very smallest usable hash table has an
1001 // assignment that is basically an accident...
1002 //
cuckoo_simple_search(int myid,std::uint32_t * items,int item_count,cuckoo_importance * importance,void * table,int hash_item_size,int initial_table_size,int max_table_size,cuckoo_get_key * get_key,cuckoo_set_key * set_key)1003 static cuckoo_parameters cuckoo_simple_search(
1004 int myid,
1005 std::uint32_t *items,
1006 int item_count,
1007 cuckoo_importance *importance,
1008 void *table,
1009 int hash_item_size,
1010 int initial_table_size,
1011 int max_table_size,
1012 cuckoo_get_key *get_key,
1013 cuckoo_set_key *set_key)
1014 { cuckoo_parameters r = {(std::uint32_t)(-1), 0, 0, 4.0};
1015 int i, probe_count;
1016 std::uint32_t table_size, modulus2, offset2;
1017 for (table_size = initial_table_size; table_size<max_table_size;
1018 table_size++)
1019 { printlog("Trial with table_size = %d\n", table_size);
1020 r = try_all_hash_functions(
1021 items,
1022 item_count,
1023 importance,
1024 table,
1025 hash_item_size,
1026 table_size,
1027 set_key,
1028 0); // Merely check if there is an allocation
1029 if (r.table_size != static_cast<std::uint32_t>(-1))
1030 { printlog("Success for %d items and table_size %d (%.2f%%)\n",
1031 item_count, r.table_size, (100.0*item_count)/table_size);
1032 printlog("modulus2 = %d, offset2 = %d\n", r.modulus2, r.offset2);
1033 break;
1034 }
1035 }
1036 return r;
1037 }
1038
1039 // This seeks a (near) optimal hash table using binary search to sort out the
1040 // size. Note again that this does not guarantee to find the best solution
1041 // because the mapping table_size -> number of assignments is not guaranteed
1042 // to be monotonic. However if you do not know what size is going to be best
1043 // this can be MUCH faster than a simple linear search, and it is really
1044 // improbable that it will get a very wrong answer!
1045
1046 // Note that the search will be from [min..max), ie the minimum size quoted
// will be tested but the largest acceptable size will be one lower than the
1048 // one given. This is so that the max can be passed as the size of the
1049 // hash table (and is because of zero-based array addresses).
1050 //
1051 // It will report failure if either the minimum specified table size has
// a successful allocation or the maximum one does not.
1053
1054
cuckoo_binary_optimise(std::uint32_t * items,int item_count,cuckoo_importance * importance,void * table,int hash_item_size,int min_table_size,int max_table_size,cuckoo_get_key * get_key,cuckoo_set_key * set_key,double merit_target)1055 cuckoo_parameters cuckoo_binary_optimise(
1056 std::uint32_t *items,
1057 int item_count,
1058 cuckoo_importance *importance,
1059 void *table,
1060 int hash_item_size,
1061 int min_table_size,
1062 int max_table_size,
1063 cuckoo_get_key *get_key,
1064 cuckoo_set_key *set_key,
1065 double merit_target)
1066 { cuckoo_parameters r, rhi;
1067 if (min_table_size >= max_table_size)
1068 { r.table_size = static_cast<std::uint32_t>(-1);
1069 r.modulus2 = r.offset2 = 3;
1070 r.merit = 3.0;
1071 return r;
1072 }
1073 // First verify that the low limit does not lead to success. Doing this
1074 // is pretty cautious! I am going to ASSUME that the table of keys to be
1075 // inserted has no duplicates, and hence if there is more data than could
1076 // fit into the table there is no chance of success.
1077 if (min_table_size >= item_count)
1078 { r = try_all_hash_functions(
1079 items,
1080 item_count,
1081 importance,
1082 table,
1083 hash_item_size,
1084 min_table_size,
1085 set_key,
1086 0); // Merely check if there is an allocation
1087 if (r.table_size != static_cast<std::uint32_t>(-1))
1088 { r.table_size = static_cast<std::uint32_t>(-1);
1089 printlog("Minimum size passed to binary search is too large\n");
1090 r.modulus2 = r.offset2 = 0;
1091 r.merit = 4.0;
1092 return r;
1093 }
1094 }
1095 else printlog("Low limit on table size smaller than item count\n");
1096 // Next confirm that the high limit does lead to success.
1097 rhi = try_all_hash_functions(
1098 items,
1099 item_count,
1100 importance,
1101 table,
1102 hash_item_size,
1103 max_table_size-1,
1104 set_key,
1105 0); // Merely check if there is an allocation
1106 printlog("rhi: %d %d %d\n", rhi.table_size, rhi.modulus2,
1107 rhi.offset2);
1108 if (rhi.table_size == static_cast<std::uint32_t>(-1))
1109 { printlog("Failed to fit into largest table size\n");
1110 return r;
1111 }
1112 while (max_table_size > min_table_size+1)
1113 { int mid = (min_table_size + max_table_size)/2;
1114 r = try_all_hash_functions(
1115 items,
1116 item_count,
1117 importance,
1118 table,
1119 hash_item_size,
1120 mid,
1121 set_key,
1122 0); // Merely check if there is an allocation
1123 if (r.table_size == static_cast<std::uint32_t>(-1)) min_table_size =
1124 mid;
1125 else
1126 { max_table_size = mid;
1127 rhi = r;
1128 }
1129 }
1130 // So far I have just identified the best size table to use, but I
1131 // have neither optimised moduls2/offset2 nor recorded a particular
1132 // allocation of keys. So here I use a more expensive search.
1133 printlog("Table will be of size %d\n", rhi.table_size);
1134 return try_all_hash_functions(
1135 items,
1136 item_count,
1137 importance,
1138 table,
1139 hash_item_size,
1140 rhi.table_size,
1141 set_key,
1142 1); // Now do the full job!
1143 }
1144
1145 // end of components.cpp
1146