/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 * See COPYRIGHT in top-level directory.
 */

/*
 * The original version of this code was contributed by Milind Chabbi
 * based on the work when he was at Rice University. It relies on the
 * HMCS lock description in [1] and the fast path described in [2].
 *
 * [1] Chabbi, Milind, Michael Fagan, and John Mellor-Crummey. "High
 * performance locks for multi-level NUMA systems." In Proceedings of
 * the ACM SIGPLAN Symposium on Principles and Practice of Parallel
 * Programming (PPoPP'15), ACM, 2015.
 *
 * [2] Chabbi, Milind, and John Mellor-Crummey. "Contention-conscious,
 * locality-preserving locks." In Proceedings of the 21st ACM SIGPLAN
 * Symposium on Principles and Practice of Parallel Programming
 * (PPoPP'16), p. 22. ACM, 2016.
 */

#include "lock/zm_lock_types.h"
#include <hwloc.h>

#ifndef DEFAULT_THRESHOLD
#define DEFAULT_THRESHOLD 256
#endif

#ifndef HMCS_DEFAULT_MAX_LEVELS
#define HMCS_DEFAULT_MAX_LEVELS 3
#endif

#define WAIT (0xffffffff)
#define COHORT_START (0x1)
#define ACQUIRE_PARENT (0xcffffffc)
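/* Status-word encoding: a qnode spins while its status is WAIT. A holder
 * passes the lock within its cohort by storing the successor's passing
 * count (COHORT_START, COHORT_START+1, ...). Once the count reaches the
 * level's threshold, the successor is instead told to climb the tree by
 * storing ACQUIRE_PARENT (see release_helper() below). */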

#ifndef TRUE
#define TRUE 1
#else
#error "TRUE already defined"
#endif

#ifndef FALSE
#define FALSE 0
#else
#error "FALSE already defined"
#endif

/* Atomic operation shorthands. The memory ordering defaults to:
 * 1- Acquire ordering for loads
 * 2- Release ordering for stores
 * 3- Acquire+Release ordering for read-modify-write operations
 */

#define LOAD(addr) zm_atomic_load(addr, zm_memord_acquire)
#define STORE(addr, val) zm_atomic_store(addr, val, zm_memord_release)
#define SWAP(addr, desire) zm_atomic_exchange_ptr(addr, desire, zm_memord_acq_rel)
#define CAS(addr, expect, desire) zm_atomic_compare_exchange_strong(addr,\
                                                                    expect,\
                                                                    desire,\
                                                                    zm_memord_acq_rel,\
                                                                    zm_memord_acquire)
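
/* A minimal sketch of how these shorthands pair up, using the handoff in
 * normal_mcs_release_with_value() below as the example:
 *
 *     zm_mcs_qnode_t *succ = (zm_mcs_qnode_t *)LOAD(&I->next); // acquire load
 *     if (succ)
 *         STORE(&succ->status, val); // release store publishes the handoff
 *
 * The release store pairs with the successor's acquire loads of its own
 * status word, so critical-section writes become visible to it. */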

struct hnode {
    unsigned threshold __attribute__((aligned(ZM_CACHELINE_SIZE)));
    struct hnode *parent __attribute__((aligned(ZM_CACHELINE_SIZE)));
    zm_atomic_ptr_t lock __attribute__((aligned(ZM_CACHELINE_SIZE)));
    zm_mcs_qnode_t node __attribute__((aligned(ZM_CACHELINE_SIZE)));
} __attribute__((aligned(ZM_CACHELINE_SIZE)));

struct leaf {
    struct hnode *cur_node;
    struct hnode *root_node;
    zm_mcs_qnode_t I;
    int curDepth;
    int took_fast_path;
};

struct lock {
    // Assumes tids range over [0.. max_threads)
    // Assumes that tid 0 is close to tid 1, and so on.
    struct leaf **leaf_nodes __attribute__((aligned(ZM_CACHELINE_SIZE)));
    hwloc_topology_t topo;
    int levels;
};

static zm_thread_local int tid = -1;
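/* tid caches this thread's hwloc PU logical index; -1 means "not yet
 * bound". It is set lazily on the first acquire/tryacq (see hmcs_acquire())
 * and is used to index lock->leaf_nodes. */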

/* TODO: automate hardware topology detection
 * instead of the below hard-coded method */

/* Check the actual affinity mask assigned to the thread */
static void check_affinity(hwloc_topology_t topo) {
    hwloc_cpuset_t cpuset = hwloc_bitmap_alloc();
    int set_length;
    hwloc_get_cpubind(topo, cpuset, HWLOC_CPUBIND_THREAD);
    set_length = hwloc_get_nbobjs_inside_cpuset_by_type(topo, cpuset, HWLOC_OBJ_PU);
    hwloc_bitmap_free(cpuset);

    if (set_length != 1) {
        printf("IZEM:HMCS:ERROR: thread bound to more than one HW thread!\n");
        exit(EXIT_FAILURE);
    }
}

static inline void reuse_qnode(zm_mcs_qnode_t *I) {
    STORE(&I->status, WAIT);
    STORE(&I->next, ZM_NULL);
}

static void* new_hnode() {
    int err;
    void *storage;
    err = posix_memalign(&storage, ZM_CACHELINE_SIZE, sizeof(struct hnode));
    if (err != 0) {
        printf("posix_memalign failed in HMCS : new_hnode \n");
        exit(EXIT_FAILURE);
    }
    return storage;
}

/* TODO: Macro or Template this for fast comparison */
static inline unsigned get_threshold(struct hnode *L) {
    return L->threshold;
}

static inline void normal_mcs_release_with_value(struct hnode *L, zm_mcs_qnode_t *I, unsigned val) {

    zm_mcs_qnode_t *succ = (zm_mcs_qnode_t *)LOAD(&I->next);
    if (succ) {
        STORE(&succ->status, val);
        return;
    }
    zm_mcs_qnode_t *tmp = I;
    if (CAS(&(L->lock), (zm_ptr_t*)&tmp, ZM_NULL))
        return;
    while (succ == NULL)
        succ = (zm_mcs_qnode_t *)LOAD(&I->next); /* SPIN */
    STORE(&succ->status, val);
    return;
}

static inline void acquire_root(struct hnode *L, zm_mcs_qnode_t *I) {
    // Prepare the node for use.
    reuse_qnode(I);
    zm_mcs_qnode_t *pred = (zm_mcs_qnode_t*) SWAP(&(L->lock), (zm_ptr_t)I);

    if (!pred) {
        // I am the first one at this level
        return;
    }

    STORE(&pred->next, I);
    while (LOAD(&I->status) == WAIT)
        ; /* SPIN */
    return;
}

static inline void tryacq_root(struct hnode *L, zm_mcs_qnode_t *I, int *success) {
    zm_ptr_t expected = ZM_NULL;
    // Prepare the node for use.
    reuse_qnode(I);
    *success = CAS(&(L->lock), &expected, (zm_ptr_t)I);

    return;
}

static inline void release_root(struct hnode *L, zm_mcs_qnode_t *I) {
    // Top-level release is the usual MCS release.
    // At the top-level MCS we always write COHORT_START since:
    // 1. It will release the lock
    // 2. The count will never grow large
    // 3. It avoids a read from I->status
    normal_mcs_release_with_value(L, I, COHORT_START);
}

static inline int nowaiters_root(struct hnode *L, zm_mcs_qnode_t *I) {
    return (LOAD(&I->next) == ZM_NULL);
}

static inline void acquire_helper(int level, struct hnode *L, zm_mcs_qnode_t *I) {
    // Trivial case = root level
    if (level == 1)
        acquire_root(L, I);
    else {
        // Prepare the node for use.
        reuse_qnode(I);
        zm_mcs_qnode_t *pred = (zm_mcs_qnode_t*)SWAP(&(L->lock), (zm_ptr_t)I);
        if (!pred) {
            // I am the first one at this level:
            // beginning of cohort
            STORE(&I->status, COHORT_START);
            // acquire at the next level since we are not at the top level
            acquire_helper(level - 1, L->parent, &(L->node));
            return;
        } else {
            STORE(&pred->next, I);
            for (;;) {
                unsigned myStatus = LOAD(&I->status);
                if (myStatus < ACQUIRE_PARENT) {
                    return;
                }
                if (myStatus == ACQUIRE_PARENT) {
                    // beginning of cohort
                    STORE(&I->status, COHORT_START);
                    // This level is acquired and we can start on the next level
                    acquire_helper(level - 1, L->parent, &(L->node));
                    return;
                }
                // spin back; (I->status == WAIT)
            }
        }
    }
}

static inline void release_helper(int level, struct hnode *L, zm_mcs_qnode_t *I) {
    // Trivial case = root level
    if (level == 1) {
        release_root(L, I);
    } else {
        unsigned cur_count = LOAD(&(I->status));
        zm_mcs_qnode_t *succ;

        // Lower-level releases
        if (cur_count == get_threshold(L)) {
            // NO KNOWN SUCCESSORS / DESCENDANTS
            // Reached the threshold: release to the next level
            release_helper(level - 1, L->parent, &(L->node));
            //COMMIT_ALL_WRITES();
            // Tap the successor at this level and ask it to spin-acquire the next-level lock
            normal_mcs_release_with_value(L, I, ACQUIRE_PARENT);
            return;
        }

        succ = (zm_mcs_qnode_t*)LOAD(&(I->next));
        // Threshold not reached; pass within the cohort if possible
        if (succ) {
            STORE(&succ->status, cur_count + 1);
            return; // released
        }
        // No known successor, so release to the next level
        release_helper(level - 1, L->parent, &(L->node));
        // Tap the successor at this level and ask it to spin-acquire the next-level lock
        normal_mcs_release_with_value(L, I, ACQUIRE_PARENT);
    }
}

static inline int nowaiters_helper(int level, struct hnode *L, zm_mcs_qnode_t *I) {
    if (level == 1) {
        return nowaiters_root(L, I);
    } else {
        if (LOAD(&I->next) != ZM_NULL)
            return FALSE;
        else
            return nowaiters_helper(level - 1, L->parent, &(L->node));
    }
}

static void* new_leaf(struct hnode *h, int depth) {
    int err;
    struct leaf *leaf;
    err = posix_memalign((void **) &leaf, ZM_CACHELINE_SIZE, sizeof(struct leaf));
    if (err != 0) {
        printf("posix_memalign failed in HMCS : new_leaf \n");
        exit(EXIT_FAILURE);
    }
    leaf->cur_node = h;
    leaf->curDepth = depth;
    leaf->took_fast_path = FALSE;
    struct hnode *tmp, *root_node;
    for (tmp = leaf->cur_node; tmp->parent != NULL; tmp = tmp->parent);
    root_node = tmp;
    leaf->root_node = root_node;
    return leaf;
}

static inline void acquire_from_leaf(int level, struct leaf *L) {
    if ((zm_ptr_t)L->cur_node->lock == ZM_NULL
     && (zm_ptr_t)L->root_node->lock == ZM_NULL) {
        // Both the local and root locks appear free: take the fast path
        L->took_fast_path = TRUE;
        acquire_root(L->root_node, &L->I);
        return;
    }
    acquire_helper(level, L->cur_node, &L->I);
    return;
}

static inline void tryacq_from_leaf(int level, struct leaf *L, int *success) {
    *success = 0;
    if ((zm_ptr_t)L->cur_node->lock == ZM_NULL
     && (zm_ptr_t)L->root_node->lock == ZM_NULL) {
        tryacq_root(L->root_node, &L->I, success);
        if (*success)
            L->took_fast_path = TRUE;
    }
    return;
}

static inline void release_from_leaf(int level, struct leaf *L) {
    if (L->took_fast_path) {
        release_root(L->root_node, &L->I);
        L->took_fast_path = FALSE;
        return;
    }
    release_helper(level, L->cur_node, &L->I);
    return;
}

static inline int nowaiters_from_leaf(int level, struct leaf *L) {
    // Shouldn't this be nowaiters(root_node, I)?
    if (L->took_fast_path) {
        return nowaiters_root(L->cur_node, &L->I);
    }

    return nowaiters_helper(level, L->cur_node, &L->I);
}

static int get_hwthread_id(hwloc_topology_t topo) {
    hwloc_cpuset_t cpuset = hwloc_bitmap_alloc();
    hwloc_obj_t obj;
    hwloc_get_cpubind(topo, cpuset, HWLOC_CPUBIND_THREAD);
    obj = hwloc_get_obj_inside_cpuset_by_type(topo, cpuset, HWLOC_OBJ_PU, 0);
    hwloc_bitmap_free(cpuset);
    return obj->logical_index;
}

static void set_hierarchy(struct lock *L, int *max_threads, int **particip_per_level) {
    int max_depth, levels = 0, max_levels = HMCS_DEFAULT_MAX_LEVELS, explicit_levels = 0;
    char tmp[64];
    char *s = getenv("ZM_HMCS_MAX_LEVELS");
    if (s != NULL)
        max_levels = atoi(s);
    int depths[max_levels];
    int idx = 0;
    /* Advice to users: run `hwloc-ls -s --no-io --no-icaches` and choose
     * depths of interest in ascending order. The first depth must be `0';
     * see the example below. */
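    /* For example, on a hypothetical two-socket machine where
     * `hwloc-ls -s` reports Machine at depth 0, Package at depth 1, and
     * Core at depth 5, setting
     *     ZM_HMCS_MAX_LEVELS=3 ZM_HMCS_EXPLICIT_LEVELS=0,1,5
     * builds a three-level tree (machine, socket, core). The depth values
     * here are illustrative; they vary across machines. */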

    s = getenv("ZM_HMCS_EXPLICIT_LEVELS");
    if (s != NULL) {
        strncpy(tmp, s, sizeof(tmp) - 1);
        tmp[sizeof(tmp) - 1] = '\0';
        explicit_levels = 1;
        char *token;
        token = strtok(tmp, ",");
        while (token != NULL) {
            depths[idx] = atoi(token);
            if (idx == 0)
                assert(depths[idx] == 0 && "the first depth must be machine level (i.e., depth 0); run `hwloc-ls -s --no-io --no-icaches` and choose appropriate depth values");
            idx++;
            token = strtok(NULL, ",");
        }
        assert(idx == max_levels);
    }

    hwloc_topology_init(&L->topo);
    hwloc_topology_load(L->topo);

    *max_threads = hwloc_get_nbobjs_by_type(L->topo, HWLOC_OBJ_PU);

    max_depth = hwloc_topology_get_depth(L->topo);
    assert(max_depth >= 2); /* At least Machine and Core levels exist */

    *particip_per_level = (int*) malloc(max_levels * sizeof(int));
    int prev_nobjs = -1;
    if (!explicit_levels) {
        for (int d = max_depth - 2; d > 1; d--) {
            int cur_nobjs = hwloc_get_nbobjs_by_depth(L->topo, d);
            /* Check whether this level has a hierarchical impact */
            if (cur_nobjs != prev_nobjs) {
                prev_nobjs = cur_nobjs;
                (*particip_per_level)[levels] = (*max_threads)/cur_nobjs;
                levels++;
                if (levels >= max_levels - 1)
                    break;
            }
        }
        (*particip_per_level)[levels] = *max_threads;
        levels++;
    } else {
        for (int i = max_levels - 1; i >= 0; i--) {
            int d = depths[i];
            int cur_nobjs = hwloc_get_nbobjs_by_depth(L->topo, d);
            /* Check whether this level has a hierarchical impact */
            if (cur_nobjs != prev_nobjs) {
                prev_nobjs = cur_nobjs;
                (*particip_per_level)[levels] = (*max_threads)/cur_nobjs;
                levels++;
            } else {
                assert(0 && "please choose levels that have a hierarchical impact");
            }
        }
    }

    L->levels = levels;
}

static void free_hierarchy(int *particip_per_level) {
    free(particip_per_level);
}

static void* new_lock() {

    struct lock *L;
    posix_memalign((void **) &L, ZM_CACHELINE_SIZE, sizeof(struct lock));

    int max_threads;
    int *participants_at_level;
    set_hierarchy(L, &max_threads, &participants_at_level);

    // Total locks needed = sum over all levels i of
    // (max_threads / participants_at_level[i]); the top level contributes
    // exactly one root lock since participants_at_level[levels-1] == max_threads.
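    // A hypothetical example: with max_threads = 72 PUs and
    // participants_at_level = {2, 36, 72} (HW threads sharing a core, a
    // socket, and the machine), this needs
    // 72/2 + 72/36 + 72/72 = 36 + 2 + 1 = 39 hnodes.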
    // Save affinity
    hwloc_cpuset_t cpuset = hwloc_bitmap_alloc();
    hwloc_get_cpubind(L->topo, cpuset, HWLOC_CPUBIND_THREAD);

    int total_locks_needed = 0;
    int levels = L->levels;

    for (int i = 0; i < levels; i++) {
        total_locks_needed += max_threads / participants_at_level[i];
    }
    struct hnode **lock_locations;
    posix_memalign((void **) &lock_locations, ZM_CACHELINE_SIZE, sizeof(struct hnode*) * total_locks_needed);
    struct leaf **leaf_nodes;
    posix_memalign((void **) &leaf_nodes, ZM_CACHELINE_SIZE, sizeof(struct leaf*) * max_threads);

    int threshold = DEFAULT_THRESHOLD;
    char *s = getenv("ZM_HMCS_THRESHOLD");
    if (s != NULL)
        threshold = atoi(s);

    hwloc_obj_t obj;
    for (int tid = 0; tid < max_threads; tid++) {
        // Pin me to hw-thread-id = tid
        obj = hwloc_get_obj_by_type(L->topo, HWLOC_OBJ_PU, tid);
        hwloc_set_cpubind(L->topo, obj->cpuset, HWLOC_CPUBIND_THREAD);
        int last_lock_location_end = 0;
        for (int cur_level = 0; cur_level < levels; cur_level++) {
            if (tid % participants_at_level[cur_level] == 0) {
                // master: initialize the lock
                int lock_location = last_lock_location_end + tid/participants_at_level[cur_level];
                last_lock_location_end += max_threads/participants_at_level[cur_level];
                struct hnode *cur_hnode = new_hnode();
                cur_hnode->threshold = threshold;
                cur_hnode->parent = NULL;
                cur_hnode->lock = ZM_NULL;
                lock_locations[lock_location] = cur_hnode;
            }
        }
    }

    // setup parents
    for (int tid = 0; tid < max_threads; tid++) {
        obj = hwloc_get_obj_by_type(L->topo, HWLOC_OBJ_PU, tid);
        hwloc_set_cpubind(L->topo, obj->cpuset, HWLOC_CPUBIND_THREAD);
        int last_lock_location_end = 0;
        for (int cur_level = 0; cur_level < levels - 1; cur_level++) {
            if (tid % participants_at_level[cur_level] == 0) {
                int lock_location = last_lock_location_end + tid/participants_at_level[cur_level];
                last_lock_location_end += max_threads/participants_at_level[cur_level];
                int parent_lock_location = last_lock_location_end + tid/participants_at_level[cur_level+1];
                lock_locations[lock_location]->parent = lock_locations[parent_lock_location];
            }
        }
        leaf_nodes[tid] = (struct leaf*)new_leaf(lock_locations[tid/participants_at_level[0]], levels);
    }
    free(lock_locations);
    free_hierarchy(participants_at_level);
    // Restore affinity
    hwloc_set_cpubind(L->topo, cpuset, HWLOC_CPUBIND_THREAD);
    L->leaf_nodes = leaf_nodes;

    hwloc_bitmap_free(cpuset);

    return L;
}

static void search_nodes_rec(struct hnode *node, struct hnode **nodes_to_free, int *num_ptrs, int max_threads) {
    int i;
    if (node != NULL) {
        for (i = 0; i < *num_ptrs; i++) {
            if (node == nodes_to_free[i])
                break; /* already marked to be free'd */
        }
        if (i == *num_ptrs) { /* newly encountered pointer */
            nodes_to_free[*num_ptrs] = node;
            (*num_ptrs)++;
            assert(*num_ptrs < 2*max_threads);
        }
        search_nodes_rec(node->parent, nodes_to_free, num_ptrs, max_threads);
    }
}

static void free_lock(struct lock *L) {
    int max_threads = hwloc_get_nbobjs_by_type(L->topo, HWLOC_OBJ_PU);
    int num_ptrs = 0;
    struct hnode **nodes_to_free = (struct hnode**) malloc(2*max_threads*sizeof(struct hnode*));
    for (int tid = 0; tid < max_threads; tid++) {
        search_nodes_rec(L->leaf_nodes[tid]->cur_node, nodes_to_free, &num_ptrs, max_threads);
        free(L->leaf_nodes[tid]);
    }
    free(L->leaf_nodes);
    for (int i = 0; i < num_ptrs; i++)
        free(nodes_to_free[i]);
    free(nodes_to_free);
    hwloc_topology_destroy(L->topo);
    free(L);
}

static inline void hmcs_acquire(struct lock *L) {
    if (zm_unlikely(tid == -1)) {
        check_affinity(L->topo);
        tid = get_hwthread_id(L->topo);
    }
    acquire_from_leaf(L->levels, L->leaf_nodes[tid]);
}

static inline void hmcs_tryacq(struct lock *L, int *success) {
    if (zm_unlikely(tid == -1)) {
        check_affinity(L->topo);
        tid = get_hwthread_id(L->topo);
    }
    tryacq_from_leaf(L->levels, L->leaf_nodes[tid], success);
}

static inline void hmcs_release(struct lock *L) {
    release_from_leaf(L->levels, L->leaf_nodes[tid]);
}

static inline int hmcs_nowaiters(struct lock *L) {
    return nowaiters_from_leaf(L->levels, L->leaf_nodes[tid]);
}

int zm_hmcs_init(zm_hmcs_t *handle) {
    void *p = new_lock();
    *handle = (zm_hmcs_t) p;
    return 0;
}

int zm_hmcs_destroy(zm_hmcs_t *L) {
    free_lock((struct lock*)(*L));
    return 0;
}

int zm_hmcs_acquire(zm_hmcs_t L) {
    hmcs_acquire((struct lock*)L);
    return 0;
}

int zm_hmcs_tryacq(zm_hmcs_t L, int *success) {
    hmcs_tryacq((struct lock*)L, success);
    return 0;
}

int zm_hmcs_release(zm_hmcs_t L) {
    hmcs_release((struct lock*)L);
    return 0;
}

int zm_hmcs_nowaiters(zm_hmcs_t L) {
    return hmcs_nowaiters((struct lock*)L);
}
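
/* A minimal usage sketch of the public API above (assuming the caller pins
 * each thread to a single hardware thread before first use, as
 * check_affinity() enforces):
 *
 *     zm_hmcs_t lock;
 *     zm_hmcs_init(&lock);    // builds the lock tree from the hwloc topology
 *     ...
 *     zm_hmcs_acquire(lock);  // first call binds this thread's leaf via its PU id
 *     ... critical section ...
 *     zm_hmcs_release(lock);
 *     ...
 *     zm_hmcs_destroy(&lock);
 */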