/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 * See COPYRIGHT in top-level directory.
 */

/*
 * The original version of this code was contributed by Milind Chabbi
 * based on the work when he was at Rice University. It relies on the
 * HMCS lock description in [1] and the fast path described in [2].
 *
 * [1] Chabbi, Milind, Michael Fagan, and John Mellor-Crummey. "High
 * performance locks for multi-level NUMA systems." In Proceedings of
 * the ACM SIGPLAN Symposium on Principles and Practice of Parallel
 * Programming (PPoPP'15), ACM, 2015.
 *
 * [2] Chabbi, Milind, and John Mellor-Crummey. "Contention-conscious,
 * locality-preserving locks." In Proceedings of the 21st ACM SIGPLAN
 * Symposium on Principles and Practice of Parallel Programming
 * (PPoPP'16), p. 22. ACM, 2016.
 */

#include "lock/zm_lock_types.h"
#include <hwloc.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

#ifndef DEFAULT_THRESHOLD
#define DEFAULT_THRESHOLD 256
#endif

#ifndef HMCS_DEFAULT_MAX_LEVELS
#define HMCS_DEFAULT_MAX_LEVELS 3
#endif

#define WAIT (0xffffffff)
#define COHORT_START (0x1)
#define ACQUIRE_PARENT (0xcffffffc)

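/* qnode status encoding (as used by acquire_helper/release_helper below):
 * - WAIT:           the lock has not been granted yet; the thread spins on it.
 * - COHORT_START..threshold: the lock was passed within the local cohort; the
 *                   value counts how many times it has been passed at this level.
 * - ACQUIRE_PARENT: the local level is owned, but the parent level must still
 *                   be acquired (the thread starts a new cohort). */
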
#ifndef TRUE
#define TRUE 1
#else
#error "TRUE already defined"
#endif

#ifndef FALSE
#define FALSE 0
#else
#error "FALSE already defined"
#endif

/* Atomic operation shorthands. The memory ordering defaults to:
 * 1- Acquire ordering for loads
 * 2- Release ordering for stores
 * 3- Acquire+Release ordering for read-modify-write operations
 * */

#define LOAD(addr)                  zm_atomic_load(addr, zm_memord_acquire)
#define STORE(addr, val)            zm_atomic_store(addr, val, zm_memord_release)
#define SWAP(addr, desire)          zm_atomic_exchange_ptr(addr, desire, zm_memord_acq_rel)
#define CAS(addr, expect, desire)   zm_atomic_compare_exchange_strong(addr,\
                                                                      expect,\
                                                                      desire,\
                                                                      zm_memord_acq_rel,\
                                                                      zm_memord_acquire)

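/* Structure relationships: each hnode below represents one lock level of the
 * NUMA hierarchy and links to its parent level (the root hnode has a NULL
 * parent). Each thread owns one leaf, which caches its lowest-level hnode
 * (cur_node), the root hnode (for the fast path), and the qnode it enqueues
 * at the leaf level. */
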
struct hnode{
    unsigned threshold __attribute__((aligned(ZM_CACHELINE_SIZE)));
    struct hnode * parent __attribute__((aligned(ZM_CACHELINE_SIZE)));
    zm_atomic_ptr_t lock __attribute__((aligned(ZM_CACHELINE_SIZE)));
    zm_mcs_qnode_t node __attribute__((aligned(ZM_CACHELINE_SIZE)));

}__attribute__((aligned(ZM_CACHELINE_SIZE)));

struct leaf{
    struct hnode * cur_node;
    struct hnode * root_node;
    zm_mcs_qnode_t I;
    int curDepth;
    int took_fast_path;
};

struct lock{
    // Assumes tids range from [0.. max_threads)
    // Assumes that tid 0 is close to tid 1, and so on.
    struct leaf ** leaf_nodes __attribute__((aligned(ZM_CACHELINE_SIZE)));
    hwloc_topology_t topo;
    int levels;
};

static zm_thread_local int tid = -1;

/* TODO: automate hardware topology detection
 * instead of the below hard-coded method */

/* Check the actual affinity mask assigned to the thread */
static void check_affinity(hwloc_topology_t topo) {
    hwloc_cpuset_t cpuset = hwloc_bitmap_alloc();
    int set_length;
    hwloc_get_cpubind(topo, cpuset, HWLOC_CPUBIND_THREAD);
    set_length = hwloc_get_nbobjs_inside_cpuset_by_type(topo, cpuset, HWLOC_OBJ_PU);
    hwloc_bitmap_free(cpuset);

    if(set_length != 1) {
        printf("IZEM:HMCS:ERROR: thread bound to more than one HW thread!\n");
        exit(EXIT_FAILURE);
    }
}

static inline void reuse_qnode(zm_mcs_qnode_t *I){
    STORE(&I->status, WAIT);
    STORE(&I->next, ZM_NULL);
}

static void* new_hnode() {
    int err;
    void *storage;
    err = posix_memalign(&storage, ZM_CACHELINE_SIZE, sizeof(struct hnode));
    if (err != 0) {
        printf("posix_memalign failed in HMCS : new_hnode \n");
        exit(EXIT_FAILURE);
    }
    return storage;
}

/* TODO: Macro or Template this for fast comparison */
static inline unsigned get_threshold(struct hnode *L) {
    return L->threshold;
}

static inline void normal_mcs_release_with_value(struct hnode * L, zm_mcs_qnode_t *I, unsigned val){

    zm_mcs_qnode_t *succ = (zm_mcs_qnode_t *)LOAD(&I->next);
    if(succ) {
        STORE(&succ->status, val);
        return;
    }
    zm_mcs_qnode_t *tmp = I;
    if (CAS(&(L->lock), (zm_ptr_t*)&tmp, ZM_NULL))
        return;
    while(succ == NULL)
        succ = (zm_mcs_qnode_t *)LOAD(&I->next); /* SPIN */
    STORE(&succ->status, val);
    return;
}

static inline void acquire_root(struct hnode * L, zm_mcs_qnode_t *I) {
    // Prepare the node for use.
    reuse_qnode(I);
    zm_mcs_qnode_t *pred = (zm_mcs_qnode_t*) SWAP(&(L->lock), (zm_ptr_t)I);

    if(!pred) {
        // I am the first one at this level
        return;
    }

    STORE(&pred->next, I);
    while(LOAD(&I->status) == WAIT)
        ; /* SPIN */
    return;
}

static inline void tryacq_root(struct hnode * L, zm_mcs_qnode_t *I, int *success) {
    zm_ptr_t expected = ZM_NULL;
    // Prepare the node for use.
    reuse_qnode(I);
    *success = CAS(&(L->lock), &expected, (zm_ptr_t)I);

    return;
}

static inline void release_root(struct hnode * L, zm_mcs_qnode_t *I) {
    // Top level release is the usual MCS release.
    // At the top level we always write COHORT_START since
    // 1. It will release the lock
    // 2. Will never grow large
    // 3. Avoids a read from I->status
    normal_mcs_release_with_value(L, I, COHORT_START);
}

static inline int nowaiters_root(struct hnode * L, zm_mcs_qnode_t *I) {
    return (LOAD(&I->next) == ZM_NULL);
}

static inline void acquire_helper(int level, struct hnode * L, zm_mcs_qnode_t *I) {
    // Trivial case = root level
    if (level == 1)
        acquire_root(L, I);
    else {
        // Prepare the node for use.
        reuse_qnode(I);
        zm_mcs_qnode_t* pred = (zm_mcs_qnode_t*)SWAP(&(L->lock), (zm_ptr_t)I);
        if(!pred) {
            // I am the first one at this level
            // beginning of cohort
            STORE(&I->status, COHORT_START);
            // acquire at next level if not at the top level
            acquire_helper(level - 1, L->parent, &(L->node));
            return;
        } else {
            STORE(&pred->next, I);
            for(;;){
                unsigned myStatus = LOAD(&I->status);
                if(myStatus < ACQUIRE_PARENT) {
                    return;
                }
                if(myStatus == ACQUIRE_PARENT) {
                    // beginning of cohort
                    STORE(&I->status, COHORT_START);
                    // This means this level is acquired and we can start the next level
                    acquire_helper(level - 1, L->parent, &(L->node));
                    return;
                }
                // spin back; (I->status == WAIT)
            }
        }
    }
}

static inline void release_helper(int level, struct hnode * L, zm_mcs_qnode_t *I) {
    // Trivial case = root level
    if (level == 1) {
        release_root(L, I);
    } else {
        unsigned cur_count = LOAD(&(I->status));
        zm_mcs_qnode_t * succ;

        // Lower level releases
        if(cur_count == get_threshold(L)) {
            // NO KNOWN SUCCESSORS / DESCENDANTS
            // reached threshold and have next level
            // release to next level
            release_helper(level - 1, L->parent, &(L->node));
            //COMMIT_ALL_WRITES();
            // Tap successor at this level and ask to spin acquire next level lock
            normal_mcs_release_with_value(L, I, ACQUIRE_PARENT);
            return;
        }

        succ = (zm_mcs_qnode_t*)LOAD(&(I->next));
        // Not reached threshold
        if(succ) {
            STORE(&succ->status, cur_count + 1);
            return; // released
        }
        // No known successor, so release
        release_helper(level - 1, L->parent, &(L->node));
        // Tap successor at this level and ask to spin acquire next level lock
        normal_mcs_release_with_value(L, I, ACQUIRE_PARENT);
    }
}

static inline int nowaiters_helper(int level, struct hnode * L, zm_mcs_qnode_t *I) {
    if (level == 1) {
        return nowaiters_root(L, I);
    } else {
        if(LOAD(&I->next) != ZM_NULL)
            return FALSE;
        else
            return nowaiters_helper(level - 1, L->parent, &(L->node));
    }
}

static void* new_leaf(struct hnode *h, int depth) {
    int err;
    struct leaf *leaf;
    err = posix_memalign((void **) &leaf, ZM_CACHELINE_SIZE, sizeof(struct leaf));
    if (err != 0) {
        printf("posix_memalign failed in HMCS : new_leaf \n");
        exit(EXIT_FAILURE);
    }
    leaf->cur_node = h;
    leaf->curDepth = depth;
    leaf->took_fast_path = FALSE;
    struct hnode *tmp, *root_node;
    for(tmp = leaf->cur_node; tmp->parent != NULL; tmp = tmp->parent);
    root_node = tmp;
    leaf->root_node = root_node;
    return leaf;
}

static inline void acquire_from_leaf(int level, struct leaf *L){
    if((zm_ptr_t)L->cur_node->lock == ZM_NULL
    && (zm_ptr_t)L->root_node->lock == ZM_NULL) {
        // both the local and the root locks look free: take the fast path
        L->took_fast_path = TRUE;
        acquire_root(L->root_node, &L->I);
        return;
    }
    acquire_helper(level, L->cur_node, &L->I);
    return;
}

static inline void tryacq_from_leaf(int level, struct leaf *L, int *success){
    *success = 0;
    if((zm_ptr_t)L->cur_node->lock == ZM_NULL
    && (zm_ptr_t)L->root_node->lock == ZM_NULL) {
        tryacq_root(L->root_node, &L->I, success);
        if (*success)
            L->took_fast_path = TRUE;
    }
    return;
}

static inline void release_from_leaf(int level, struct leaf *L){
    //myrelease(cur_node, I);
    if(L->took_fast_path) {
        release_root(L->root_node, &L->I);
        L->took_fast_path = FALSE;
        return;
    }
    release_helper(level, L->cur_node, &L->I);
    return;
}

static inline int nowaiters_from_leaf(int level, struct leaf *L){
    // Shouldn't this be nowaiters(root_node, I)?
    if(L->took_fast_path) {
        return nowaiters_root(L->cur_node, &L->I);
    }

    return nowaiters_helper(level, L->cur_node, &L->I);
}

static int get_hwthread_id(hwloc_topology_t topo){
    hwloc_cpuset_t cpuset = hwloc_bitmap_alloc();
    hwloc_obj_t obj;
    hwloc_get_cpubind(topo, cpuset, HWLOC_CPUBIND_THREAD);
    obj = hwloc_get_obj_inside_cpuset_by_type(topo, cpuset, HWLOC_OBJ_PU, 0);
    hwloc_bitmap_free(cpuset);
    return obj->logical_index;
}

static void set_hierarchy(struct lock *L, int *max_threads, int** particip_per_level) {
    int max_depth, levels = 0, max_levels = HMCS_DEFAULT_MAX_LEVELS, explicit_levels = 0;
    char tmp[20];
    char *s = getenv("ZM_HMCS_MAX_LEVELS");
    if (s != NULL)
        max_levels = atoi(s);
    int depths[max_levels];
    int idx = 0;
    /* advice to users: run `hwloc-ls -s --no-io --no-icaches` and choose
     * depths of interest in ascending order. The first depth must be `0' */

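    /* Example (depth values are machine-specific; shown for illustration only):
     *   ZM_HMCS_MAX_LEVELS=3 ZM_HMCS_EXPLICIT_LEVELS="0,2,4" ./app */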
    s = getenv("ZM_HMCS_EXPLICIT_LEVELS");
    if (s != NULL) {
        strncpy(tmp, s, sizeof(tmp) - 1);
        tmp[sizeof(tmp) - 1] = '\0';
        explicit_levels = 1;
        char* token;
        token = strtok(tmp,",");
        while(token != NULL) {
            depths[idx] = atoi(token);
            if (idx == 0)
                assert(depths[idx] == 0 && "the first depth must be machine level (i.e., depth 0), run `hwloc-ls -s --no-io --no-icaches` and choose appropriate depth values");
            idx++;
            token = strtok(NULL,",");
        }
        assert(idx == max_levels);
    }

    hwloc_topology_init(&L->topo);
    hwloc_topology_load(L->topo);

    *max_threads = hwloc_get_nbobjs_by_type(L->topo, HWLOC_OBJ_PU);

    max_depth = hwloc_topology_get_depth(L->topo);
    assert(max_depth >= 2); /* At least Machine and Core levels exist */

    *particip_per_level = (int*) malloc(max_levels * sizeof(int));
    int prev_nobjs = -1;
    if(!explicit_levels) {
        for (int d = max_depth - 2; d > 1; d--) {
            int cur_nobjs = hwloc_get_nbobjs_by_depth(L->topo, d);
            /* Check if this level has a hierarchical impact */
            if(cur_nobjs != prev_nobjs) {
                prev_nobjs = cur_nobjs;
                (*particip_per_level)[levels] = (*max_threads)/cur_nobjs;
                levels++;
                if(levels >= max_levels - 1)
                    break;
            }
        }
        (*particip_per_level)[levels] = *max_threads;
        levels++;
    } else {
        for(int i = max_levels - 1; i >= 0; i--){
            int d = depths[i];
            int cur_nobjs = hwloc_get_nbobjs_by_depth(L->topo, d);
            /* Check if this level has a hierarchical impact */
            if(cur_nobjs != prev_nobjs) {
                prev_nobjs = cur_nobjs;
                (*particip_per_level)[levels] = (*max_threads)/cur_nobjs;
                levels++;
            } else {
                assert(0 && "please choose levels that have a hierarchical impact");
            }
        }
    }

    L->levels = levels;
}

static void free_hierarchy(int* particip_per_level){
    free(particip_per_level);
}

static void* new_lock(){

    struct lock *L;
    posix_memalign((void **) &L, ZM_CACHELINE_SIZE, sizeof(struct lock));

    int max_threads;
    int *participants_at_level;
    set_hierarchy(L, &max_threads, &participants_at_level);

    // Total locks needed = sum over all levels of (max_threads / participants_at_level[i]);
    // the top level always contributes exactly one lock.
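    // For example, on a hypothetical machine with max_threads = 8 and
    // participants_at_level = {2, 8}, this adds up to 8/2 + 8/8 = 5 hnodes:
    // four leaf-level locks plus one root lock.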
    // Save affinity
    hwloc_cpuset_t cpuset = hwloc_bitmap_alloc();
    hwloc_get_cpubind(L->topo, cpuset, HWLOC_CPUBIND_THREAD);

    int total_locks_needed = 0;
    int levels = L->levels;

    for (int i=0; i < levels; i++) {
        total_locks_needed += max_threads / participants_at_level[i];
    }
    struct hnode ** lock_locations;
    posix_memalign((void **) &lock_locations, ZM_CACHELINE_SIZE, sizeof(struct hnode*) * total_locks_needed);
    struct leaf ** leaf_nodes;
    posix_memalign((void **) &leaf_nodes, ZM_CACHELINE_SIZE, sizeof(struct leaf*) * max_threads);

    int threshold = DEFAULT_THRESHOLD;
    char *s = getenv("ZM_HMCS_THRESHOLD");
    if (s != NULL)
        threshold = atoi(s);

    hwloc_obj_t obj;
    for(int tid = 0 ; tid < max_threads; tid ++){
        // Pin me to hw-thread-id = tid
        obj = hwloc_get_obj_by_type (L->topo, HWLOC_OBJ_PU, tid);
        hwloc_set_cpubind(L->topo, obj->cpuset, HWLOC_CPUBIND_THREAD);
        int last_lock_location_end = 0;
        for(int cur_level = 0 ; cur_level < levels; cur_level++){
            if (tid%participants_at_level[cur_level] == 0) {
                // master, initialize the lock
                int lock_location = last_lock_location_end + tid/participants_at_level[cur_level];
                last_lock_location_end += max_threads/participants_at_level[cur_level];
                struct hnode * cur_hnode = new_hnode();
                cur_hnode->threshold = threshold;
                cur_hnode->parent = NULL;
                cur_hnode->lock = ZM_NULL;
                lock_locations[lock_location] = cur_hnode;
            }
        }
    }

    // setup parents
    for(int tid = 0 ; tid < max_threads; tid ++){
        obj = hwloc_get_obj_by_type (L->topo, HWLOC_OBJ_PU, tid);
        hwloc_set_cpubind(L->topo, obj->cpuset, HWLOC_CPUBIND_THREAD);
        int last_lock_location_end = 0;
        for(int cur_level = 0 ; cur_level < levels - 1; cur_level++){
            if (tid%participants_at_level[cur_level] == 0) {
                int lock_location = last_lock_location_end + tid/participants_at_level[cur_level];
                last_lock_location_end += max_threads/participants_at_level[cur_level];
                int parentLockLocation = last_lock_location_end + tid/participants_at_level[cur_level+1];
                lock_locations[lock_location]->parent = lock_locations[parentLockLocation];
            }
        }
        leaf_nodes[tid] = (struct leaf*)new_leaf(lock_locations[tid/participants_at_level[0]], levels);
    }
    free(lock_locations);
    free_hierarchy(participants_at_level);
    // Restore affinity
    hwloc_set_cpubind(L->topo, cpuset, HWLOC_CPUBIND_THREAD);
    L->leaf_nodes = leaf_nodes;

    hwloc_bitmap_free(cpuset);

    return L;
}

static void search_nodes_rec(struct hnode *node, struct hnode **nodes_to_free, int *num_ptrs, int max_threads) {
    int i;
    if(node != NULL) {
        for(i = 0; i < *num_ptrs; i++) {
            if(node == nodes_to_free[i])
                break; /* already marked to be free'd */
        }
        if(i == *num_ptrs) { /* newly encountered pointer */
            nodes_to_free[*num_ptrs] = node;
            (*num_ptrs)++;
            assert(*num_ptrs < 2*max_threads);
        }
        search_nodes_rec(node->parent, nodes_to_free, num_ptrs, max_threads);
    }
}

static void free_lock(struct lock* L) {
    int max_threads = hwloc_get_nbobjs_by_type(L->topo, HWLOC_OBJ_PU);
    int num_ptrs = 0;
    struct hnode **nodes_to_free = (struct hnode**) malloc(2*max_threads*sizeof(struct hnode*));
    for (int tid = 0; tid < max_threads; tid++) {
        search_nodes_rec(L->leaf_nodes[tid]->cur_node, nodes_to_free, &num_ptrs, max_threads);
        free(L->leaf_nodes[tid]);
    }
    free(L->leaf_nodes);
    for(int i = 0; i < num_ptrs; i++)
        free(nodes_to_free[i]);
    free(nodes_to_free);
    hwloc_topology_destroy(L->topo);
    free(L);
}

static inline void hmcs_acquire(struct lock *L){
    if (zm_unlikely(tid == -1)) {
        check_affinity(L->topo);
        tid = get_hwthread_id(L->topo);
    }
    acquire_from_leaf(L->levels, L->leaf_nodes[tid]);
}

static inline void hmcs_tryacq(struct lock *L, int *success){
    if (zm_unlikely(tid == -1)) {
        check_affinity(L->topo);
        tid = get_hwthread_id(L->topo);
    }
    tryacq_from_leaf(L->levels, L->leaf_nodes[tid], success);
}

static inline void hmcs_release(struct lock *L){
    release_from_leaf(L->levels, L->leaf_nodes[tid]);
}

static inline int hmcs_nowaiters(struct lock *L){
    return nowaiters_from_leaf(L->levels, L->leaf_nodes[tid]);
}

int zm_hmcs_init(zm_hmcs_t * handle) {
    void *p = new_lock();
    *handle  = (zm_hmcs_t) p;
    return 0;
}

int zm_hmcs_destroy(zm_hmcs_t *L) {
    free_lock((struct lock*)(*L));
    return 0;
}

int zm_hmcs_acquire(zm_hmcs_t L){
    hmcs_acquire((struct lock*)L);
    return 0;
}

int zm_hmcs_tryacq(zm_hmcs_t L, int *success){
    hmcs_tryacq((struct lock*)L, success);
    return 0;
}

int zm_hmcs_release(zm_hmcs_t L){
    hmcs_release((struct lock*)L);
    return 0;
}

int zm_hmcs_nowaiters(zm_hmcs_t L){
    return hmcs_nowaiters((struct lock*)L);
}

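/*
 * Minimal usage sketch of the public API (illustrative only; error handling,
 * thread creation, and binding each thread to a single hardware thread before
 * its first acquire are omitted):
 *
 *   zm_hmcs_t lock;
 *   zm_hmcs_init(&lock);
 *   ...
 *   zm_hmcs_acquire(lock);
 *   // critical section
 *   zm_hmcs_release(lock);
 *   ...
 *   zm_hmcs_destroy(&lock);
 */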