/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved.                      *
* This file is part of the LIBXSMM library.                                   *
*                                                                             *
* For information on the license, see the LICENSE file.                       *
* Further information: https://github.com/hfp/libxsmm/                        *
* SPDX-License-Identifier: BSD-3-Clause                                       *
******************************************************************************/
/*  Hans Pabst, Alexander Heinecke (Intel Corp.)
******************************************************************************/
/* Lock primitives inspired by Karl Malbrain, Concurrency Kit, and TF/sync.
******************************************************************************/
#include "libxsmm_main.h"

#if !defined(LIBXSMM_SYNC_FUTEX) && defined(__linux__) && defined(__USE_GNU)
# define LIBXSMM_SYNC_FUTEX
#endif

#if defined(LIBXSMM_OFFLOAD_TARGET)
# pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET))
#endif
#include <stdint.h>
#if defined(_WIN32)
# include <process.h>
#else
# if defined(LIBXSMM_SYNC_FUTEX) && defined(__linux__) && defined(__USE_GNU)
#   include <linux/futex.h>
# endif
# include <unistd.h>
# include <time.h>
#endif
#if defined(LIBXSMM_OFFLOAD_TARGET)
# pragma offload_attribute(pop)
#endif

#if !defined(LIBXSMM_SYNC_RWLOCK_BITS)
# if defined(__MINGW32__)
#   define LIBXSMM_SYNC_RWLOCK_BITS 32
# else
#   define LIBXSMM_SYNC_RWLOCK_BITS 16
# endif
#endif

#if !defined(LIBXSMM_SYNC_GENERIC_PID) && 1
# define LIBXSMM_SYNC_GENERIC_PID
#endif


LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE internal_sync_core_tag { /* per-core */
  uint8_t id;
  volatile uint8_t core_sense;
  volatile uint8_t* thread_senses;
  volatile uint8_t* my_flags[2];
  uint8_t** partner_flags[2];
  uint8_t parity;
  uint8_t sense;
} internal_sync_core_tag;

LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE internal_sync_thread_tag { /* per-thread */
  int core_tid;
  internal_sync_core_tag *core;
} internal_sync_thread_tag;

struct LIBXSMM_RETARGETABLE libxsmm_barrier {
  internal_sync_core_tag** cores;
  internal_sync_thread_tag** threads;
  int ncores, nthreads_per_core;
  int nthreads, ncores_nbits; /* nbits(ncores) != log2(ncores) */
  /* internal counter type which is guaranteed to be atomic when using certain methods */
  volatile int threads_waiting;
  /* thread-safety during initialization */
  volatile uint8_t init_done;
};

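/* Overview (descriptive note): the barrier below combines core-local sense
 * reversal with a dissemination pattern across cores. Each core's thread 0
 * waits for its sibling threads via thread_senses, then exchanges one
 * cache-line-sized flag per round with a partner core at distance 2^i
 * (i = 0..ncores_nbits-1), and finally releases the siblings by flipping
 * core_sense; parity/sense select between the two flag sets per round trip. */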

LIBXSMM_API libxsmm_barrier* libxsmm_barrier_create(int ncores, int nthreads_per_core)
{
  libxsmm_barrier *const barrier = (libxsmm_barrier*)malloc(sizeof(libxsmm_barrier));
#if (0 == LIBXSMM_SYNC)
  LIBXSMM_UNUSED(ncores); LIBXSMM_UNUSED(nthreads_per_core);
#else
  if (NULL != barrier && 1 < ncores && 1 <= nthreads_per_core) {
    barrier->ncores = ncores;
    barrier->ncores_nbits = (int)LIBXSMM_NBITS(ncores);
    barrier->nthreads_per_core = nthreads_per_core;
    barrier->nthreads = ncores * nthreads_per_core;
    barrier->threads = (internal_sync_thread_tag**)libxsmm_aligned_malloc(
      barrier->nthreads * sizeof(internal_sync_thread_tag*), LIBXSMM_CACHELINE);
    barrier->cores = (internal_sync_core_tag**)libxsmm_aligned_malloc(
      barrier->ncores * sizeof(internal_sync_core_tag*), LIBXSMM_CACHELINE);
    barrier->threads_waiting = barrier->nthreads; /* atomic */
    barrier->init_done = 0; /* false */
  }
  else
#endif
  if (NULL != barrier) {
    barrier->nthreads = 1;
  }
  return barrier;
}


LIBXSMM_API void libxsmm_barrier_init(libxsmm_barrier* barrier, int tid)
{
#if (0 == LIBXSMM_SYNC)
  LIBXSMM_UNUSED(barrier); LIBXSMM_UNUSED(tid);
#else
  if (NULL != barrier && 1 < barrier->nthreads) {
    const int cid = tid / barrier->nthreads_per_core; /* this thread's core ID */
    internal_sync_core_tag* core = 0;
    int i;
    internal_sync_thread_tag* thread;

    /* we only initialize the barrier once */
    if (barrier->init_done == 2) {
      return;
    }

    /* allocate per-thread structure */
    thread = (internal_sync_thread_tag*)libxsmm_aligned_malloc(
      sizeof(internal_sync_thread_tag), LIBXSMM_CACHELINE);
    barrier->threads[tid] = thread;
    thread->core_tid = tid - (barrier->nthreads_per_core * cid); /* mod */

    /* each core's thread 0 does all the allocations */
    if (0 == thread->core_tid) {
      core = (internal_sync_core_tag*)libxsmm_aligned_malloc(
        sizeof(internal_sync_core_tag), LIBXSMM_CACHELINE);
      core->id = (uint8_t)cid;
      core->core_sense = 1;

      core->thread_senses = (uint8_t*)libxsmm_aligned_malloc(
        barrier->nthreads_per_core * sizeof(uint8_t), LIBXSMM_CACHELINE);
      for (i = 0; i < barrier->nthreads_per_core; ++i) core->thread_senses[i] = 1;

      for (i = 0; i < 2; ++i) {
        core->my_flags[i] = (uint8_t*)libxsmm_aligned_malloc(
          barrier->ncores_nbits * sizeof(uint8_t) * LIBXSMM_CACHELINE,
          LIBXSMM_CACHELINE);
        core->partner_flags[i] = (uint8_t**)libxsmm_aligned_malloc(
          barrier->ncores_nbits * sizeof(uint8_t*),
          LIBXSMM_CACHELINE);
      }

      core->parity = 0;
      core->sense = 1;
      barrier->cores[cid] = core;
    }

    /* barrier to let all the allocations complete */
    if (0 == LIBXSMM_ATOMIC_SUB_FETCH(&barrier->threads_waiting, 1, LIBXSMM_ATOMIC_RELAXED)) {
      barrier->threads_waiting = barrier->nthreads; /* atomic */
      barrier->init_done = 1; /* true */
    }
    else {
      while (0/*false*/ == barrier->init_done);
    }

    /* set required per-thread information */
    thread->core = barrier->cores[cid];

    /* each core's thread 0 completes setup */
    if (0 == thread->core_tid) {
      int di;
      for (i = di = 0; i < barrier->ncores_nbits; ++i, di += LIBXSMM_CACHELINE) {
        /* find dissemination partner and link to it */
        const int dissem_cid = (cid + (1 << i)) % barrier->ncores;
        assert(0 != core); /* initialized under the same condition; see above */
        core->my_flags[0][di] = core->my_flags[1][di] = 0;
        core->partner_flags[0][i] = (uint8_t*)&barrier->cores[dissem_cid]->my_flags[0][di];
        core->partner_flags[1][i] = (uint8_t*)&barrier->cores[dissem_cid]->my_flags[1][di];
      }
    }

    /* barrier to let initialization complete */
    if (0 == LIBXSMM_ATOMIC_SUB_FETCH(&barrier->threads_waiting, 1, LIBXSMM_ATOMIC_RELAXED)) {
      barrier->threads_waiting = barrier->nthreads; /* atomic */
      barrier->init_done = 2;
    }
    else {
      while (2 != barrier->init_done);
    }
  }
#endif
}


LIBXSMM_API LIBXSMM_INTRINSICS(LIBXSMM_X86_GENERIC)
void libxsmm_barrier_wait(libxsmm_barrier* barrier, int tid)
{
#if (0 == LIBXSMM_SYNC)
  LIBXSMM_UNUSED(barrier); LIBXSMM_UNUSED(tid);
#else
  if (NULL != barrier && 1 < barrier->nthreads) {
    internal_sync_thread_tag *const thread = barrier->threads[tid];
    internal_sync_core_tag *const core = thread->core;

    /* first let's execute a memory fence */
    LIBXSMM_ATOMIC_SYNC(LIBXSMM_ATOMIC_SEQ_CST);

    /* first signal this thread's arrival */
    core->thread_senses[thread->core_tid] = (uint8_t)(0 == core->thread_senses[thread->core_tid] ? 1 : 0);

    /* each core's thread 0 syncs across cores */
    if (0 == thread->core_tid) {
      int i;
      /* wait for the core's remaining threads */
      for (i = 1; i < barrier->nthreads_per_core; ++i) {
        uint8_t core_sense = core->core_sense, thread_sense = core->thread_senses[i];
        while (core_sense == thread_sense) { /* avoid evaluation in unspecified order */
          LIBXSMM_SYNC_PAUSE;
          core_sense = core->core_sense;
          thread_sense = core->thread_senses[i];
        }
      }

      if (1 < barrier->ncores) {
        int di;
# if defined(__MIC__)
        /* cannot use LIBXSMM_ALIGNED since attribute may not apply to local non-static arrays */
        uint8_t sendbuffer[LIBXSMM_CACHELINE+LIBXSMM_CACHELINE-1];
        uint8_t *const sendbuf = LIBXSMM_ALIGN(sendbuffer, LIBXSMM_CACHELINE);
        __m512d m512d;
        _mm_prefetch((const char*)core->partner_flags[core->parity][0], _MM_HINT_ET1);
        sendbuf[0] = core->sense;
        m512d = LIBXSMM_INTRINSICS_MM512_LOAD_PD(sendbuf);
# endif

        for (i = di = 0; i < barrier->ncores_nbits - 1; ++i, di += LIBXSMM_CACHELINE) {
# if defined(__MIC__)
          _mm_prefetch((const char*)core->partner_flags[core->parity][i+1], _MM_HINT_ET1);
          _mm512_storenrngo_pd(core->partner_flags[core->parity][i], m512d);
# else
          *core->partner_flags[core->parity][i] = core->sense;
# endif
          while (core->my_flags[core->parity][di] != core->sense) LIBXSMM_SYNC_PAUSE;
        }

# if defined(__MIC__)
        _mm512_storenrngo_pd(core->partner_flags[core->parity][i], m512d);
# else
        *core->partner_flags[core->parity][i] = core->sense;
# endif
        while (core->my_flags[core->parity][di] != core->sense) LIBXSMM_SYNC_PAUSE;
        if (1 == core->parity) {
          core->sense = (uint8_t)(0 == core->sense ? 1 : 0);
        }
        core->parity = (uint8_t)(1 - core->parity);
      }

      /* wake up the core's remaining threads */
      core->core_sense = core->thread_senses[0];
    }
    else { /* other threads wait for cross-core sync to complete */
      uint8_t core_sense = core->core_sense, thread_sense = core->thread_senses[thread->core_tid];
      while (core_sense != thread_sense) { /* avoid evaluation in unspecified order */
        LIBXSMM_SYNC_PAUSE;
        core_sense = core->core_sense;
        thread_sense = core->thread_senses[thread->core_tid];
      }
    }
  }
#endif
}


LIBXSMM_API void libxsmm_barrier_destroy(const libxsmm_barrier* barrier)
{
#if (0 != LIBXSMM_SYNC)
  if (NULL != barrier && 1 < barrier->nthreads) {
    if (2 == barrier->init_done) {
      int i;
      for (i = 0; i < barrier->ncores; ++i) {
        int j;
        libxsmm_free((const void*)barrier->cores[i]->thread_senses);
        for (j = 0; j < 2; ++j) {
          libxsmm_free((const void*)barrier->cores[i]->my_flags[j]);
          libxsmm_free(barrier->cores[i]->partner_flags[j]);
        }
        libxsmm_free(barrier->cores[i]);
      }
      for (i = 0; i < barrier->nthreads; ++i) {
        libxsmm_free(barrier->threads[i]);
      }
    }
    libxsmm_free(barrier->threads);
    libxsmm_free(barrier->cores);
  }
#endif
  free((libxsmm_barrier*)barrier);
}

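/* Illustrative usage sketch (not part of the library's code paths); assumes a
 * parallel region, e.g. OpenMP, where tid enumerates threads consecutively per
 * core, i.e., tid = core_id * nthreads_per_core + thread_within_core:
 *
 *   libxsmm_barrier *const barrier = libxsmm_barrier_create(ncores, nthreads_per_core);
 *   // inside the parallel region, every participating thread:
 *   libxsmm_barrier_init(barrier, tid);
 *   libxsmm_barrier_wait(barrier, tid);
 *   // after the parallel region:
 *   libxsmm_barrier_destroy(barrier);
 */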

#if (0 != LIBXSMM_SYNC)
enum {
  INTERNAL_SYNC_LOCK_FREE = 0,
  INTERNAL_SYNC_LOCK_LOCKED = 1,
  INTERNAL_SYNC_LOCK_CONTESTED = 2,
  INTERNAL_SYNC_RWLOCK_READINC = 0x10000/*(USHRT_MAX+1)*/,
  INTERNAL_SYNC_FUTEX = 202
};
#endif


typedef unsigned int libxsmm_spinlock_state;
struct LIBXSMM_RETARGETABLE libxsmm_spinlock {
  volatile libxsmm_spinlock_state state;
};


LIBXSMM_API libxsmm_spinlock* libxsmm_spinlock_create(void)
{
  libxsmm_spinlock *const result = (libxsmm_spinlock*)malloc(sizeof(libxsmm_spinlock));
#if (0 != LIBXSMM_SYNC)
  if (0 != result) {
    result->state = INTERNAL_SYNC_LOCK_FREE;
  }
#endif
  return result;
}


LIBXSMM_API void libxsmm_spinlock_destroy(const libxsmm_spinlock* spinlock)
{
  free((libxsmm_spinlock*)spinlock);
}


LIBXSMM_API int libxsmm_spinlock_trylock(libxsmm_spinlock* spinlock)
{
#if (0 != LIBXSMM_SYNC)
# if 0
  /*const*/ libxsmm_spinlock_state lock_free = INTERNAL_SYNC_LOCK_FREE;
  assert(0 != spinlock);
  return 0/*false*/ == LIBXSMM_ATOMIC_CMPSWP(&spinlock->state, lock_free, INTERNAL_SYNC_LOCK_LOCKED, LIBXSMM_ATOMIC_RELAXED)
    ? (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_SPINLOCK) + 1) /* not acquired */
    : (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_SPINLOCK));
# else
  return LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_SPINLOCK) + !LIBXSMM_ATOMIC_TRYLOCK(&spinlock->state, LIBXSMM_ATOMIC_RELAXED);
# endif
#else
  LIBXSMM_UNUSED(spinlock);
  return LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_SPINLOCK);
#endif
}

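/* Note: acquisition below succeeds when the atomic increment observes a
 * previously free lock (result 1); contenders spin until the holder resets
 * the state to INTERNAL_SYNC_LOCK_FREE in libxsmm_spinlock_release and then
 * retry the increment. */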
LIBXSMM_API void libxsmm_spinlock_acquire(libxsmm_spinlock* spinlock)
{
#if (0 != LIBXSMM_SYNC)
  assert(0 != spinlock);
  for (;;) {
    if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&spinlock->state, 1, LIBXSMM_ATOMIC_RELAXED)) break;
    LIBXSMM_SYNC_CYCLE(&spinlock->state, INTERNAL_SYNC_LOCK_FREE, LIBXSMM_SYNC_NPAUSE);
  }
  LIBXSMM_ATOMIC_SYNC(LIBXSMM_ATOMIC_SEQ_CST);
#else
  LIBXSMM_UNUSED(spinlock);
#endif
}


LIBXSMM_API void libxsmm_spinlock_release(libxsmm_spinlock* spinlock)
{
#if (0 != LIBXSMM_SYNC)
  assert(0 != spinlock);
  LIBXSMM_ATOMIC_SYNC(LIBXSMM_ATOMIC_SEQ_CST);
  spinlock->state = INTERNAL_SYNC_LOCK_FREE;
#else
  LIBXSMM_UNUSED(spinlock);
#endif
}


#if defined(LIBXSMM_SYNC_FUTEX) && defined(__linux__) && defined(__USE_GNU)
typedef int libxsmm_mutex_state;
#else
typedef char libxsmm_mutex_state;
#endif
struct LIBXSMM_RETARGETABLE libxsmm_mutex {
  volatile libxsmm_mutex_state state;
};


LIBXSMM_API libxsmm_mutex* libxsmm_mutex_create(void)
{
  libxsmm_mutex *const result = (libxsmm_mutex*)malloc(sizeof(libxsmm_mutex));
#if (0 != LIBXSMM_SYNC)
  if (0 != result) {
    result->state = INTERNAL_SYNC_LOCK_FREE;
  }
#endif
  return result;
}


LIBXSMM_API void libxsmm_mutex_destroy(const libxsmm_mutex* mutex)
{
  free((libxsmm_mutex*)mutex);
}


LIBXSMM_API int libxsmm_mutex_trylock(libxsmm_mutex* mutex)
{
#if (0 != LIBXSMM_SYNC)
  assert(0 != mutex);
  return LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_MUTEX) + !LIBXSMM_ATOMIC_TRYLOCK(&mutex->state, LIBXSMM_ATOMIC_RELAXED);
#else
  LIBXSMM_UNUSED(mutex);
  return LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_MUTEX);
#endif
}

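/* Note: on Linux with LIBXSMM_SYNC_FUTEX the acquire/release pair below uses
 * the common three-state futex protocol: FREE -> LOCKED on an uncontended
 * acquire, LOCKED -> CONTESTED once a waiter parks in FUTEX_WAIT, and the
 * release only issues FUTEX_WAKE when the previous state was CONTESTED;
 * without futex support the code falls back to spinning. */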
LIBXSMM_API void libxsmm_mutex_acquire(libxsmm_mutex* mutex)
{
#if (0 != LIBXSMM_SYNC)
# if defined(_WIN32)
  assert(0 != mutex);
  while (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_MUTEX) != libxsmm_mutex_trylock(mutex)) {
    LIBXSMM_SYNC_CYCLE(&mutex->state, 0/*free*/, LIBXSMM_SYNC_NPAUSE);
  }
# else
  libxsmm_mutex_state lock_free = INTERNAL_SYNC_LOCK_FREE, lock_state = INTERNAL_SYNC_LOCK_LOCKED;
  assert(0 != mutex);
  while (0/*false*/ == LIBXSMM_ATOMIC_CMPSWP(&mutex->state, lock_free, lock_state, LIBXSMM_ATOMIC_RELAXED)) {
    libxsmm_mutex_state state;
    /* coverity[unreachable] may be reachable more than once due to volatile state */
    for (state = mutex->state; INTERNAL_SYNC_LOCK_FREE != state; state = mutex->state) {
#     if defined(LIBXSMM_SYNC_FUTEX) && defined(__linux__)
      LIBXSMM_SYNC_CYCLE_ELSE(&mutex->state, INTERNAL_SYNC_LOCK_FREE, LIBXSMM_SYNC_NPAUSE, {
        /*const*/ libxsmm_mutex_state state_locked = INTERNAL_SYNC_LOCK_LOCKED;
        if (INTERNAL_SYNC_LOCK_LOCKED != state || LIBXSMM_ATOMIC_CMPSWP(&mutex->state,
          state_locked, INTERNAL_SYNC_LOCK_CONTESTED, LIBXSMM_ATOMIC_RELAXED))
        {
          syscall(INTERNAL_SYNC_FUTEX, &mutex->state, FUTEX_WAIT, INTERNAL_SYNC_LOCK_CONTESTED, NULL, NULL, 0);
          lock_state = INTERNAL_SYNC_LOCK_CONTESTED;
        }}
      );
      break;
#     else
      LIBXSMM_SYNC_CYCLE(&mutex->state, INTERNAL_SYNC_LOCK_FREE, LIBXSMM_SYNC_NPAUSE);
#     endif
    }
    lock_free = INTERNAL_SYNC_LOCK_FREE;
  }
# endif
#else
  LIBXSMM_UNUSED(mutex);
#endif
}


LIBXSMM_API void libxsmm_mutex_release(libxsmm_mutex* mutex)
{
#if (0 != LIBXSMM_SYNC)
  assert(0 != mutex);
  LIBXSMM_ATOMIC_SYNC(LIBXSMM_ATOMIC_SEQ_CST);
# if defined(LIBXSMM_SYNC_FUTEX) && defined(__linux__) && defined(__USE_GNU)
  if (INTERNAL_SYNC_LOCK_CONTESTED == LIBXSMM_ATOMIC_FETCH_SUB(&mutex->state, 1, LIBXSMM_ATOMIC_RELAXED)) {
    mutex->state = INTERNAL_SYNC_LOCK_FREE;
    syscall(INTERNAL_SYNC_FUTEX, &mutex->state, FUTEX_WAKE, 1, NULL, NULL, 0);
  }
# else
  mutex->state = INTERNAL_SYNC_LOCK_FREE;
# endif
#else
  LIBXSMM_UNUSED(mutex);
#endif
}


#if (0 != LIBXSMM_SYNC)
typedef LIBXSMM_CONCATENATE3(uint,LIBXSMM_SYNC_RWLOCK_BITS,_t) internal_sync_uint_t;
typedef LIBXSMM_CONCATENATE3(int,LIBXSMM_SYNC_RWLOCK_BITS,_t) internal_sync_int_t;
LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE internal_sync_counter {
  struct { internal_sync_uint_t writer, reader; } kind;
  uint32_t bits;
} internal_sync_counter;
#endif
LIBXSMM_EXTERN_C struct LIBXSMM_RETARGETABLE libxsmm_rwlock {
#if (0 != LIBXSMM_SYNC)
  volatile internal_sync_counter completions;
  volatile internal_sync_counter requests;
#else
  int dummy;
#endif
};
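/* Note: the rwlock is ticket-based; "requests" and "completions" each pack a
 * writer counter and a reader counter into one word. A writer draws a ticket
 * by incrementing kind.writer; a reader adds INTERNAL_SYNC_RWLOCK_READINC to
 * the packed word, which with the default 16-bit counters on little-endian
 * targets increments the reader half. Either side then waits until
 * "completions" catches up with the ticket it drew. */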


LIBXSMM_API libxsmm_rwlock* libxsmm_rwlock_create(void)
{
  libxsmm_rwlock *const result = (libxsmm_rwlock*)malloc(sizeof(libxsmm_rwlock));
  if (0 != result) {
#if (0 != LIBXSMM_SYNC)
    LIBXSMM_MEMZERO127(&result->completions);
    LIBXSMM_MEMZERO127(&result->requests);
#else
    LIBXSMM_MEMZERO127(result);
#endif
  }
  return result;
}


LIBXSMM_API void libxsmm_rwlock_destroy(const libxsmm_rwlock* rwlock)
{
  free((libxsmm_rwlock*)rwlock);
}


#if (0 != LIBXSMM_SYNC)
LIBXSMM_API_INLINE int internal_rwlock_trylock(libxsmm_rwlock* rwlock, internal_sync_counter* prev)
{
  internal_sync_counter next;
  assert(0 != rwlock && 0 != prev);
  do {
    prev->bits = rwlock->requests.bits;
    next.bits = prev->bits;
    ++next.kind.writer;
  }
  while (0/*false*/ == LIBXSMM_ATOMIC_CMPSWP(&rwlock->requests.bits, prev->bits, next.bits, LIBXSMM_ATOMIC_RELAXED));
  return rwlock->completions.bits != prev->bits
    ? (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK) + 1) /* not acquired */
    : (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK));
}
#endif


LIBXSMM_API int libxsmm_rwlock_trylock(libxsmm_rwlock* rwlock)
{
#if (0 != LIBXSMM_SYNC)
  internal_sync_counter prev;
  return internal_rwlock_trylock(rwlock, &prev);
#else
  LIBXSMM_UNUSED(rwlock);
  return LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK);
#endif
}


LIBXSMM_API void libxsmm_rwlock_acquire(libxsmm_rwlock* rwlock)
{
#if (0 != LIBXSMM_SYNC)
  internal_sync_counter prev;
  if (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK) != internal_rwlock_trylock(rwlock, &prev)) {
    while (rwlock->completions.bits != prev.bits) {
      LIBXSMM_SYNC_CYCLE(&rwlock->completions.bits, prev.bits, LIBXSMM_SYNC_NPAUSE);
    }
  }
#else
  LIBXSMM_UNUSED(rwlock);
#endif
}


LIBXSMM_API void libxsmm_rwlock_release(libxsmm_rwlock* rwlock)
{
#if (0 != LIBXSMM_SYNC)
  assert(0 != rwlock);
  LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_FETCH_ADD, LIBXSMM_SYNC_RWLOCK_BITS)(&rwlock->completions.kind.writer, 1, LIBXSMM_ATOMIC_SEQ_CST);
#else
  LIBXSMM_UNUSED(rwlock);
#endif
}


#if (0 != LIBXSMM_SYNC)
LIBXSMM_API_INLINE int internal_rwlock_tryread(libxsmm_rwlock* rwlock, internal_sync_counter* prev)
{
#if (0 != LIBXSMM_SYNC)
  assert(0 != rwlock && 0 != prev);
  prev->bits = LIBXSMM_ATOMIC_FETCH_ADD(&rwlock->requests.bits, INTERNAL_SYNC_RWLOCK_READINC, LIBXSMM_ATOMIC_SEQ_CST);
  return rwlock->completions.kind.writer != prev->kind.writer
    ? (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK) + 1) /* not acquired */
    : (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK));
#else
  LIBXSMM_UNUSED(rwlock); LIBXSMM_UNUSED(prev);
  return LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK);
#endif
}
#endif


LIBXSMM_API int libxsmm_rwlock_tryread(libxsmm_rwlock* rwlock)
{
#if (0 != LIBXSMM_SYNC)
  internal_sync_counter prev;
  return internal_rwlock_tryread(rwlock, &prev);
#else
  LIBXSMM_UNUSED(rwlock);
  return LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK);
#endif
}


LIBXSMM_API void libxsmm_rwlock_acqread(libxsmm_rwlock* rwlock)
{
#if (0 != LIBXSMM_SYNC)
  internal_sync_counter prev;
  if (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK) != internal_rwlock_tryread(rwlock, &prev)) {
    while (rwlock->completions.kind.writer != prev.kind.writer) {
      LIBXSMM_SYNC_CYCLE(&rwlock->completions.kind.writer, prev.kind.writer, LIBXSMM_SYNC_NPAUSE);
    }
  }
#else
  LIBXSMM_UNUSED(rwlock);
#endif
}


LIBXSMM_API void libxsmm_rwlock_relread(libxsmm_rwlock* rwlock)
{
#if (0 != LIBXSMM_SYNC)
  assert(0 != rwlock);
  LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_FETCH_ADD, LIBXSMM_SYNC_RWLOCK_BITS)(&rwlock->completions.kind.reader, 1, LIBXSMM_ATOMIC_SEQ_CST);
#else
  LIBXSMM_UNUSED(rwlock);
#endif
}


LIBXSMM_API unsigned int libxsmm_get_pid(void)
{
#if defined(_WIN32)
  return (unsigned int)_getpid();
#else
  return (unsigned int)getpid();
#endif
}

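/* Note: thread IDs are assigned by atomically incrementing a global counter
 * and reducing the result modulo LIBXSMM_NTHREADS_MAX (a power of two), i.e.,
 * IDs wrap around once the maximum number of threads is exceeded. */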
LIBXSMM_API_INTERN unsigned int internal_get_tid(void);
LIBXSMM_API_INTERN unsigned int internal_get_tid(void)
{
  const unsigned int nthreads = LIBXSMM_ATOMIC_ADD_FETCH(&libxsmm_thread_count, 1, LIBXSMM_ATOMIC_RELAXED);
#if !defined(NDEBUG)
  static int error_once = 0;
  if (LIBXSMM_NTHREADS_MAX < nthreads
    && 0 != libxsmm_verbosity /* library code is expected to be mute */
    && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
  {
    fprintf(stderr, "LIBXSMM ERROR: maximum number of threads is exhausted!\n");
  }
#endif
  LIBXSMM_ASSERT(LIBXSMM_NTHREADS_MAX == LIBXSMM_UP2POT(LIBXSMM_NTHREADS_MAX));
  return LIBXSMM_MOD2(nthreads - 1, LIBXSMM_NTHREADS_MAX);
}


LIBXSMM_API unsigned int libxsmm_get_tid(void)
{
#if (0 != LIBXSMM_SYNC)
# if defined(LIBXSMM_SYNC_GENERIC_PID)
  static LIBXSMM_TLS unsigned int tid = 0xFFFFFFFF;
  if (0xFFFFFFFF == tid) tid = internal_get_tid();
  return tid;
# else
  void* tls = LIBXSMM_TLS_GETVALUE(libxsmm_tlskey);
  if (NULL == tls) {
    static unsigned int tid[LIBXSMM_NTHREADS_MAX];
    const int i = internal_get_tid();
    tid[i] = i; tls = tid + i;
    /* coverity[check_return] */
    LIBXSMM_TLS_SETVALUE(libxsmm_tlskey, tls);
  }
  return *(unsigned int*)tls;
# endif
#else
  return 0;
#endif
}