1 /******************************************************************************
2 * Copyright (c) Intel Corporation - All rights reserved. *
3 * This file is part of the LIBXSMM library. *
4 * *
5 * For information on the license, see the LICENSE file. *
6 * Further information: https://github.com/hfp/libxsmm/ *
7 * SPDX-License-Identifier: BSD-3-Clause *
8 ******************************************************************************/
9 /* Hans Pabst, Alexander Heinecke (Intel Corp.)
10 ******************************************************************************/
11 /* Lock primitives inspired by Karl Malbrain, Concurrency Kit, and TF/sync.
12 ******************************************************************************/
13 #include "libxsmm_main.h"
14
15 #if !defined(LIBXSMM_SYNC_FUTEX) && defined(__linux__) && defined(__USE_GNU)
16 # define LIBXSMM_SYNC_FUTEX
17 #endif
18
19 #if defined(LIBXSMM_OFFLOAD_TARGET)
20 # pragma offload_attribute(push,target(LIBXSMM_OFFLOAD_TARGET))
21 #endif
22 #include <stdint.h>
23 #if defined(_WIN32)
24 # include <process.h>
25 #else
26 # if defined(LIBXSMM_SYNC_FUTEX) && defined(__linux__) && defined(__USE_GNU)
27 # include <linux/futex.h>
28 # endif
29 # include <unistd.h>
30 # include <time.h>
31 #endif
32 #if defined(LIBXSMM_OFFLOAD_TARGET)
33 # pragma offload_attribute(pop)
34 #endif
35
36 #if !defined(LIBXSMM_SYNC_RWLOCK_BITS)
37 # if defined(__MINGW32__)
38 # define LIBXSMM_SYNC_RWLOCK_BITS 32
39 # else
40 # define LIBXSMM_SYNC_RWLOCK_BITS 16
41 # endif
42 #endif
43
44 #if !defined(LIBXSMM_SYNC_GENERIC_PID) && 1
45 # define LIBXSMM_SYNC_GENERIC_PID
46 #endif
47
48
/* Per-core state of the barrier (one instance shared by a core's threads). */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE internal_sync_core_tag { /* per-core */
  uint8_t id; /* core ID assigned at initialization */
  volatile uint8_t core_sense; /* sense flag toggled by the core-master to release local waiters */
  volatile uint8_t* thread_senses; /* per-thread sense flags (nthreads_per_core entries) */
  volatile uint8_t* my_flags[2]; /* double-buffered flags written by partner cores (one cache line per round) */
  uint8_t** partner_flags[2]; /* pointers into each round-partner's my_flags */
  uint8_t parity; /* selects the active my_flags/partner_flags buffer (0 or 1) */
  uint8_t sense; /* this core's current sense value for cross-core signaling */
} internal_sync_core_tag;
58
/* Per-thread state of the barrier: the thread's index within its core and a
 * link to the shared per-core structure. */
LIBXSMM_EXTERN_C typedef struct LIBXSMM_RETARGETABLE internal_sync_thread_tag { /* per-thread */
  int core_tid; /* thread index within the owning core: [0, nthreads_per_core) */
  internal_sync_core_tag *core; /* shared state of the owning core */
} internal_sync_thread_tag;
63
/* Barrier object: sense-reversing synchronization within each core combined
 * with cross-core dissemination rounds. */
struct LIBXSMM_RETARGETABLE libxsmm_barrier {
  internal_sync_core_tag** cores; /* per-core state, indexed by core ID */
  internal_sync_thread_tag** threads; /* per-thread state, indexed by thread ID */
  int ncores, nthreads_per_core;
  int nthreads, ncores_nbits; /* nbits(ncores) != log2(ncores) */
  /* internal counter type which is guaranteed to be atomic when using certain methods */
  volatile int threads_waiting;
  /* thread-safety during initialization: 0=pending, 1=allocations done, 2=fully initialized */
  volatile uint8_t init_done;
};
74
75
libxsmm_barrier_create(int ncores,int nthreads_per_core)76 LIBXSMM_API libxsmm_barrier* libxsmm_barrier_create(int ncores, int nthreads_per_core)
77 {
78 libxsmm_barrier *const barrier = (libxsmm_barrier*)malloc(sizeof(libxsmm_barrier));
79 #if (0 == LIBXSMM_SYNC)
80 LIBXSMM_UNUSED(ncores); LIBXSMM_UNUSED(nthreads_per_core);
81 #else
82 if (NULL != barrier && 1 < ncores && 1 <= nthreads_per_core) {
83 barrier->ncores = ncores;
84 barrier->ncores_nbits = (int)LIBXSMM_NBITS(ncores);
85 barrier->nthreads_per_core = nthreads_per_core;
86 barrier->nthreads = ncores * nthreads_per_core;
87 barrier->threads = (internal_sync_thread_tag**)libxsmm_aligned_malloc(
88 barrier->nthreads * sizeof(internal_sync_thread_tag*), LIBXSMM_CACHELINE);
89 barrier->cores = (internal_sync_core_tag**)libxsmm_aligned_malloc(
90 barrier->ncores * sizeof(internal_sync_core_tag*), LIBXSMM_CACHELINE);
91 barrier->threads_waiting = barrier->nthreads; /* atomic */
92 barrier->init_done = 0; /* false */
93 }
94 else
95 #endif
96 if (NULL != barrier) {
97 barrier->nthreads = 1;
98 }
99 return barrier;
100 }
101
102
/* Initialize the calling thread's view of the barrier; every participating
 * thread (tid in [0, nthreads)) must call this before libxsmm_barrier_wait.
 * Each core's thread 0 performs the shared allocations; two internal
 * rendezvous points (threads_waiting/init_done) make setup thread-safe. */
LIBXSMM_API void libxsmm_barrier_init(libxsmm_barrier* barrier, int tid)
{
#if (0 == LIBXSMM_SYNC)
  LIBXSMM_UNUSED(barrier); LIBXSMM_UNUSED(tid);
#else
  if (NULL != barrier && 1 < barrier->nthreads) {
    const int cid = tid / barrier->nthreads_per_core; /* this thread's core ID */
    internal_sync_core_tag* core = 0;
    int i;
    internal_sync_thread_tag* thread;

    /* we only initialize the barrier once */
    if (barrier->init_done == 2) {
      return;
    }

    /* allocate per-thread structure */
    thread = (internal_sync_thread_tag*)libxsmm_aligned_malloc(
      sizeof(internal_sync_thread_tag), LIBXSMM_CACHELINE);
    barrier->threads[tid] = thread;
    thread->core_tid = tid - (barrier->nthreads_per_core * cid); /* mod */

    /* each core's thread 0 does all the allocations */
    if (0 == thread->core_tid) {
      core = (internal_sync_core_tag*)libxsmm_aligned_malloc(
        sizeof(internal_sync_core_tag), LIBXSMM_CACHELINE);
      core->id = (uint8_t)cid;
      core->core_sense = 1;

      core->thread_senses = (uint8_t*)libxsmm_aligned_malloc(
        barrier->nthreads_per_core * sizeof(uint8_t), LIBXSMM_CACHELINE);
      for (i = 0; i < barrier->nthreads_per_core; ++i) core->thread_senses[i] = 1;

      /* one cache line per round in my_flags avoids false sharing of flag bytes */
      for (i = 0; i < 2; ++i) {
        core->my_flags[i] = (uint8_t*)libxsmm_aligned_malloc(
          barrier->ncores_nbits * sizeof(uint8_t) * LIBXSMM_CACHELINE,
          LIBXSMM_CACHELINE);
        core->partner_flags[i] = (uint8_t**)libxsmm_aligned_malloc(
          barrier->ncores_nbits * sizeof(uint8_t*),
          LIBXSMM_CACHELINE);
      }

      core->parity = 0;
      core->sense = 1;
      barrier->cores[cid] = core;
    }

    /* barrier to let all the allocations complete */
    if (0 == LIBXSMM_ATOMIC_SUB_FETCH(&barrier->threads_waiting, 1, LIBXSMM_ATOMIC_RELAXED)) {
      barrier->threads_waiting = barrier->nthreads; /* atomic */
      barrier->init_done = 1; /* true */
    }
    else {
      while (0/*false*/ == barrier->init_done); /* spin until the last thread flips init_done */
    }

    /* set required per-thread information */
    thread->core = barrier->cores[cid];

    /* each core's thread 0 completes setup */
    if (0 == thread->core_tid) {
      int di;
      for (i = di = 0; i < barrier->ncores_nbits; ++i, di += LIBXSMM_CACHELINE) {
        /* find dissemination partner and link to it */
        const int dissem_cid = (cid + (1 << i)) % barrier->ncores;
        assert(0 != core); /* initialized under the same condition; see above */
        core->my_flags[0][di] = core->my_flags[1][di] = 0;
        core->partner_flags[0][i] = (uint8_t*)&barrier->cores[dissem_cid]->my_flags[0][di];
        core->partner_flags[1][i] = (uint8_t*)&barrier->cores[dissem_cid]->my_flags[1][di];
      }
    }

    /* barrier to let initialization complete */
    if (0 == LIBXSMM_ATOMIC_SUB_FETCH(&barrier->threads_waiting, 1, LIBXSMM_ATOMIC_RELAXED)) {
      barrier->threads_waiting = barrier->nthreads; /* atomic */
      barrier->init_done = 2;
    }
    else {
      while (2 != barrier->init_done); /* spin until setup is globally complete */
    }
  }
#endif
}
186
187
/* Block until all participating threads have arrived at the barrier.
 * Phase 1: each thread toggles its sense; the core-master (core_tid 0) waits
 * for its core's threads. Phase 2: core-masters run log2(ncores)
 * dissemination rounds against partner cores. Phase 3: core-masters release
 * their local threads by propagating the new core sense. */
LIBXSMM_API LIBXSMM_INTRINSICS(LIBXSMM_X86_GENERIC)
void libxsmm_barrier_wait(libxsmm_barrier* barrier, int tid)
{
#if (0 == LIBXSMM_SYNC)
  LIBXSMM_UNUSED(barrier); LIBXSMM_UNUSED(tid);
#else
  if (NULL != barrier && 1 < barrier->nthreads) {
    internal_sync_thread_tag *const thread = barrier->threads[tid];
    internal_sync_core_tag *const core = thread->core;

    /* first let's execute a memory fence */
    LIBXSMM_ATOMIC_SYNC(LIBXSMM_ATOMIC_SEQ_CST);

    /* first signal this thread's arrival */
    core->thread_senses[thread->core_tid] = (uint8_t)(0 == core->thread_senses[thread->core_tid] ? 1 : 0);

    /* each core's thread 0 syncs across cores */
    if (0 == thread->core_tid) {
      int i;
      /* wait for the core's remaining threads */
      for (i = 1; i < barrier->nthreads_per_core; ++i) {
        uint8_t core_sense = core->core_sense, thread_sense = core->thread_senses[i];
        while (core_sense == thread_sense) { /* avoid evaluation in unspecified order */
          LIBXSMM_SYNC_PAUSE;
          core_sense = core->core_sense;
          thread_sense = core->thread_senses[i];
        }
      }

      if (1 < barrier->ncores) {
        int di;
# if defined(__MIC__)
        /* cannot use LIBXSMM_ALIGNED since attribute may not apply to local non-static arrays */
        uint8_t sendbuffer[LIBXSMM_CACHELINE+LIBXSMM_CACHELINE-1];
        uint8_t *const sendbuf = LIBXSMM_ALIGN(sendbuffer, LIBXSMM_CACHELINE);
        __m512d m512d;
        _mm_prefetch((const char*)core->partner_flags[core->parity][0], _MM_HINT_ET1);
        sendbuf[0] = core->sense;
        m512d = LIBXSMM_INTRINSICS_MM512_LOAD_PD(sendbuf);
# endif

        /* dissemination rounds: signal round i's partner, then wait until this
         * core's own flag for that round carries the current sense */
        for (i = di = 0; i < barrier->ncores_nbits - 1; ++i, di += LIBXSMM_CACHELINE) {
# if defined(__MIC__)
          _mm_prefetch((const char*)core->partner_flags[core->parity][i+1], _MM_HINT_ET1);
          _mm512_storenrngo_pd(core->partner_flags[core->parity][i], m512d);
# else
          *core->partner_flags[core->parity][i] = core->sense;
# endif
          while (core->my_flags[core->parity][di] != core->sense) LIBXSMM_SYNC_PAUSE;
        }

        /* last round peeled off the loop (no next partner to prefetch) */
# if defined(__MIC__)
        _mm512_storenrngo_pd(core->partner_flags[core->parity][i], m512d);
# else
        *core->partner_flags[core->parity][i] = core->sense;
# endif
        while (core->my_flags[core->parity][di] != core->sense) LIBXSMM_SYNC_PAUSE;
        if (1 == core->parity) {
          /* both flag buffers consumed: reverse the sense for the next epoch */
          core->sense = (uint8_t)(0 == core->sense ? 1 : 0);
        }
        core->parity = (uint8_t)(1 - core->parity);
      }

      /* wake up the core's remaining threads */
      core->core_sense = core->thread_senses[0];
    }
    else { /* other threads wait for cross-core sync to complete */
      uint8_t core_sense = core->core_sense, thread_sense = core->thread_senses[thread->core_tid];
      while (core_sense != thread_sense) { /* avoid evaluation in unspecified order */
        LIBXSMM_SYNC_PAUSE;
        core_sense = core->core_sense;
        thread_sense = core->thread_senses[thread->core_tid];
      }
    }
  }
#endif
}
265
266
libxsmm_barrier_destroy(const libxsmm_barrier * barrier)267 LIBXSMM_API void libxsmm_barrier_destroy(const libxsmm_barrier* barrier)
268 {
269 #if (0 != LIBXSMM_SYNC)
270 if (NULL != barrier && 1 < barrier->nthreads) {
271 if (2 == barrier->init_done) {
272 int i;
273 for (i = 0; i < barrier->ncores; ++i) {
274 int j;
275 libxsmm_free((const void*)barrier->cores[i]->thread_senses);
276 for (j = 0; j < 2; ++j) {
277 libxsmm_free((const void*)barrier->cores[i]->my_flags[j]);
278 libxsmm_free(barrier->cores[i]->partner_flags[j]);
279 }
280 libxsmm_free(barrier->cores[i]);
281 }
282 for (i = 0; i < barrier->nthreads; ++i) {
283 libxsmm_free(barrier->threads[i]);
284 }
285 }
286 libxsmm_free(barrier->threads);
287 libxsmm_free(barrier->cores);
288 }
289 #endif
290 free((libxsmm_barrier*)barrier);
291 }
292
293
#if (0 != LIBXSMM_SYNC)
enum { /* states and constants shared by the lock implementations below */
  INTERNAL_SYNC_LOCK_FREE = 0, /* lock available */
  INTERNAL_SYNC_LOCK_LOCKED = 1, /* lock held, no known waiters */
  INTERNAL_SYNC_LOCK_CONTESTED = 2, /* lock held and at least one waiter sleeps/spins */
  INTERNAL_SYNC_RWLOCK_READINC = 0x10000/*(USHRT_MAX+1)*/, /* bumps the reader half of the packed 16+16-bit counter */
  INTERNAL_SYNC_FUTEX = 202 /* NOTE(review): hard-coded futex(2) syscall number; 202 is x86-64-specific — verify for other Linux targets */
};
#endif
303
304
/* Spin-based lock; state 0 (INTERNAL_SYNC_LOCK_FREE) means available, and the
 * acquire path also counts contending threads in the same word. */
typedef unsigned int libxsmm_spinlock_state;
struct LIBXSMM_RETARGETABLE libxsmm_spinlock {
  volatile libxsmm_spinlock_state state;
};
309
310
libxsmm_spinlock_create(void)311 LIBXSMM_API libxsmm_spinlock* libxsmm_spinlock_create(void)
312 {
313 libxsmm_spinlock *const result = (libxsmm_spinlock*)malloc(sizeof(libxsmm_spinlock));
314 #if (0 != LIBXSMM_SYNC)
315 if (0 != result) {
316 result->state = INTERNAL_SYNC_LOCK_FREE;
317 }
318 #endif
319 return result;
320 }
321
322
libxsmm_spinlock_destroy(const libxsmm_spinlock * spinlock)323 LIBXSMM_API void libxsmm_spinlock_destroy(const libxsmm_spinlock* spinlock)
324 {
325 free((libxsmm_spinlock*)spinlock);
326 }
327
328
/* Attempt to acquire the spinlock without blocking.
 * Returns LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_SPINLOCK) on success, and that
 * value plus one when the lock is already taken. */
LIBXSMM_API int libxsmm_spinlock_trylock(libxsmm_spinlock* spinlock)
{
#if (0 != LIBXSMM_SYNC)
# if 0 /* disabled alternative: explicit compare-and-swap instead of TRYLOCK */
  /*const*/ libxsmm_spinlock_state lock_free = INTERNAL_SYNC_LOCK_FREE;
  assert(0 != spinlock);
  return 0/*false*/ == LIBXSMM_ATOMIC_CMPSWP(&spinlock->state, lock_free, INTERNAL_SYNC_LOCK_LOCKED, LIBXSMM_ATOMIC_RELAXED)
    ? (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_SPINLOCK) + 1) /* not acquired */
    : (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_SPINLOCK));
# else
  /* TRYLOCK is non-zero on success; "+ !success" encodes failure as +1 */
  return LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_SPINLOCK) + !LIBXSMM_ATOMIC_TRYLOCK(&spinlock->state, LIBXSMM_ATOMIC_RELAXED);
# endif
#else
  LIBXSMM_UNUSED(spinlock);
  return LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_SPINLOCK);
#endif
}
346
347
/* Acquire the spinlock, spinning (with pause cycles) until it becomes free.
 * NOTE(review): acquisition is detected by ADD_FETCH returning exactly 1,
 * i.e. the state also accumulates contending increments rather than being a
 * pure 0/1 flag; release resets it to INTERNAL_SYNC_LOCK_FREE. */
LIBXSMM_API void libxsmm_spinlock_acquire(libxsmm_spinlock* spinlock)
{
#if (0 != LIBXSMM_SYNC)
  assert(0 != spinlock);
  for (;;) {
    if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&spinlock->state, 1, LIBXSMM_ATOMIC_RELAXED)) break;
    LIBXSMM_SYNC_CYCLE(&spinlock->state, INTERNAL_SYNC_LOCK_FREE, LIBXSMM_SYNC_NPAUSE);
  }
  LIBXSMM_ATOMIC_SYNC(LIBXSMM_ATOMIC_SEQ_CST); /* fence after acquisition */
#else
  LIBXSMM_UNUSED(spinlock);
#endif
}
361
362
/* Release the spinlock: fence first, then reset the state to "free". */
LIBXSMM_API void libxsmm_spinlock_release(libxsmm_spinlock* spinlock)
{
#if (0 != LIBXSMM_SYNC)
  assert(0 != spinlock);
  LIBXSMM_ATOMIC_SYNC(LIBXSMM_ATOMIC_SEQ_CST); /* publish critical-section writes before unlocking */
  spinlock->state = INTERNAL_SYNC_LOCK_FREE;
#else
  LIBXSMM_UNUSED(spinlock);
#endif
}
373
374
/* The futex-based mutex needs an int-sized state (the futex word); the
 * spin-based fallback gets by with a char. */
#if defined(LIBXSMM_SYNC_FUTEX) && defined(__linux__) && defined(__USE_GNU)
typedef int libxsmm_mutex_state;
#else
typedef char libxsmm_mutex_state;
#endif
struct LIBXSMM_RETARGETABLE libxsmm_mutex {
  volatile libxsmm_mutex_state state; /* INTERNAL_SYNC_LOCK_FREE/LOCKED/CONTESTED */
};
383
384
libxsmm_mutex_create(void)385 LIBXSMM_API libxsmm_mutex* libxsmm_mutex_create(void)
386 {
387 libxsmm_mutex *const result = (libxsmm_mutex*)malloc(sizeof(libxsmm_mutex));
388 #if (0 != LIBXSMM_SYNC)
389 if (0 != result) {
390 result->state = INTERNAL_SYNC_LOCK_FREE;
391 }
392 #endif
393 return result;
394 }
395
396
libxsmm_mutex_destroy(const libxsmm_mutex * mutex)397 LIBXSMM_API void libxsmm_mutex_destroy(const libxsmm_mutex* mutex)
398 {
399 free((libxsmm_mutex*)mutex);
400 }
401
402
libxsmm_mutex_trylock(libxsmm_mutex * mutex)403 LIBXSMM_API int libxsmm_mutex_trylock(libxsmm_mutex* mutex)
404 {
405 #if (0 != LIBXSMM_SYNC)
406 assert(0 != mutex);
407 return LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_MUTEX) + !LIBXSMM_ATOMIC_TRYLOCK(&mutex->state, LIBXSMM_ATOMIC_RELAXED);
408 #else
409 LIBXSMM_UNUSED(mutex);
410 return LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_MUTEX);
411 #endif
412 }
413
414
/* Acquire the mutex (blocking). Windows: spin on trylock. Otherwise a
 * CAS-based fast path with an optional futex(2) slow path: the state is
 * escalated LOCKED -> CONTESTED before sleeping so the releaser knows to
 * issue FUTEX_WAKE; without futex support the thread spins with pauses. */
LIBXSMM_API void libxsmm_mutex_acquire(libxsmm_mutex* mutex)
{
#if (0 != LIBXSMM_SYNC)
# if defined(_WIN32)
  assert(0 != mutex);
  while (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_MUTEX) != libxsmm_mutex_trylock(mutex)) {
    LIBXSMM_SYNC_CYCLE(&mutex->state, 0/*free*/, LIBXSMM_SYNC_NPAUSE);
  }
# else
  libxsmm_mutex_state lock_free = INTERNAL_SYNC_LOCK_FREE, lock_state = INTERNAL_SYNC_LOCK_LOCKED;
  assert(0 != mutex);
  while (0/*false*/ == LIBXSMM_ATOMIC_CMPSWP(&mutex->state, lock_free, lock_state, LIBXSMM_ATOMIC_RELAXED)) {
    libxsmm_mutex_state state;
    /* coverity[unreachable] may be reachable more than once due to volatile state */
    for (state = mutex->state; INTERNAL_SYNC_LOCK_FREE != state; state = mutex->state) {
# if defined(LIBXSMM_SYNC_FUTEX) && defined(__linux__)
      LIBXSMM_SYNC_CYCLE_ELSE(&mutex->state, INTERNAL_SYNC_LOCK_FREE, LIBXSMM_SYNC_NPAUSE, {
        /*const*/ libxsmm_mutex_state state_locked = INTERNAL_SYNC_LOCK_LOCKED;
        /* mark the lock contested (or observe it already is) before sleeping */
        if (INTERNAL_SYNC_LOCK_LOCKED != state || LIBXSMM_ATOMIC_CMPSWP(&mutex->state,
          state_locked, INTERNAL_SYNC_LOCK_CONTESTED, LIBXSMM_ATOMIC_RELAXED))
        {
          /* sleep until woken; the kernel re-checks state == CONTESTED atomically */
          syscall(INTERNAL_SYNC_FUTEX, &mutex->state, FUTEX_WAIT, INTERNAL_SYNC_LOCK_CONTESTED, NULL, NULL, 0);
          lock_state = INTERNAL_SYNC_LOCK_CONTESTED; /* re-acquire in contested state */
        }}
      );
      break;
# else
      LIBXSMM_SYNC_CYCLE(&mutex->state, INTERNAL_SYNC_LOCK_FREE, LIBXSMM_SYNC_NPAUSE);
# endif
    }
    lock_free = INTERNAL_SYNC_LOCK_FREE; /* CMPSWP may have overwritten the expected value */
  }
# endif
#else
  LIBXSMM_UNUSED(mutex);
#endif
}
452
453
/* Release the mutex. With futexes: decrement the state; if it was CONTESTED,
 * reset to "free" and wake one sleeping waiter. Otherwise a plain store of
 * the "free" state suffices. */
LIBXSMM_API void libxsmm_mutex_release(libxsmm_mutex* mutex)
{
#if (0 != LIBXSMM_SYNC)
  assert(0 != mutex);
  LIBXSMM_ATOMIC_SYNC(LIBXSMM_ATOMIC_SEQ_CST); /* publish critical-section writes */
# if defined(LIBXSMM_SYNC_FUTEX) && defined(__linux__) && defined(__USE_GNU)
  if (INTERNAL_SYNC_LOCK_CONTESTED == LIBXSMM_ATOMIC_FETCH_SUB(&mutex->state, 1, LIBXSMM_ATOMIC_RELAXED)) {
    mutex->state = INTERNAL_SYNC_LOCK_FREE;
    syscall(INTERNAL_SYNC_FUTEX, &mutex->state, FUTEX_WAKE, 1, NULL, NULL, 0); /* wake one waiter */
  }
# else
  mutex->state = INTERNAL_SYNC_LOCK_FREE;
# endif
#else
  LIBXSMM_UNUSED(mutex);
#endif
}
471
472
473 #if (0 != LIBXSMM_SYNC)
474 typedef LIBXSMM_CONCATENATE3(uint,LIBXSMM_SYNC_RWLOCK_BITS,_t) internal_sync_uint_t;
475 typedef LIBXSMM_CONCATENATE3(int,LIBXSMM_SYNC_RWLOCK_BITS,_t) internal_sync_int_t;
476 LIBXSMM_EXTERN_C typedef union LIBXSMM_RETARGETABLE internal_sync_counter {
477 struct { internal_sync_uint_t writer, reader; } kind;
478 uint32_t bits;
479 } internal_sync_counter;
480 #endif
481 LIBXSMM_EXTERN_C struct LIBXSMM_RETARGETABLE libxsmm_rwlock {
482 #if (0 != LIBXSMM_SYNC)
483 volatile internal_sync_counter completions;
484 volatile internal_sync_counter requests;
485 #else
486 int dummy;
487 #endif
488 };
489
490
libxsmm_rwlock_create(void)491 LIBXSMM_API libxsmm_rwlock* libxsmm_rwlock_create(void)
492 {
493 libxsmm_rwlock *const result = (libxsmm_rwlock*)malloc(sizeof(libxsmm_rwlock));
494 if (0 != result) {
495 #if (0 != LIBXSMM_SYNC)
496 LIBXSMM_MEMZERO127(&result->completions);
497 LIBXSMM_MEMZERO127(&result->requests);
498 #else
499 LIBXSMM_MEMZERO127(result);
500 #endif
501 }
502 return result;
503 }
504
505
libxsmm_rwlock_destroy(const libxsmm_rwlock * rwlock)506 LIBXSMM_API void libxsmm_rwlock_destroy(const libxsmm_rwlock* rwlock)
507 {
508 free((libxsmm_rwlock*)rwlock);
509 }
510
511
#if (0 != LIBXSMM_SYNC)
/* Draw a writer ticket: atomically increment the writer half of "requests".
 * "prev" receives the drawn ticket; the lock is acquired iff "completions"
 * already equals that ticket (otherwise the caller may wait on it). */
LIBXSMM_API_INLINE int internal_rwlock_trylock(libxsmm_rwlock* rwlock, internal_sync_counter* prev)
{
  internal_sync_counter next;
  assert(0 != rwlock && 0 != prev);
  do { /* retry until the ticket is drawn without interference */
    prev->bits = rwlock->requests.bits;
    next.bits = prev->bits;
    ++next.kind.writer;
  }
  while (0/*false*/ == LIBXSMM_ATOMIC_CMPSWP(&rwlock->requests.bits, prev->bits, next.bits, LIBXSMM_ATOMIC_RELAXED));
  return rwlock->completions.bits != prev->bits
    ? (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK) + 1) /* not acquired */
    : (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK));
}
#endif
528
529
libxsmm_rwlock_trylock(libxsmm_rwlock * rwlock)530 LIBXSMM_API int libxsmm_rwlock_trylock(libxsmm_rwlock* rwlock)
531 {
532 #if (0 != LIBXSMM_SYNC)
533 internal_sync_counter prev;
534 return internal_rwlock_trylock(rwlock, &prev);
535 #else
536 LIBXSMM_UNUSED(rwlock);
537 return LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK);
538 #endif
539 }
540
541
/* Acquire the rwlock for writing: draw a ticket and, unless served right
 * away, spin until "completions" catches up with the drawn ticket. */
LIBXSMM_API void libxsmm_rwlock_acquire(libxsmm_rwlock* rwlock)
{
#if (0 != LIBXSMM_SYNC)
  internal_sync_counter prev;
  if (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK) != internal_rwlock_trylock(rwlock, &prev)) {
    while (rwlock->completions.bits != prev.bits) {
      LIBXSMM_SYNC_CYCLE(&rwlock->completions.bits, prev.bits, LIBXSMM_SYNC_NPAUSE);
    }
  }
#else
  LIBXSMM_UNUSED(rwlock);
#endif
}
555
556
/* Release a writer: mark the writer ticket as served so the next pending
 * reader or writer can proceed. */
LIBXSMM_API void libxsmm_rwlock_release(libxsmm_rwlock* rwlock)
{
#if (0 != LIBXSMM_SYNC)
  assert(0 != rwlock);
  LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_FETCH_ADD, LIBXSMM_SYNC_RWLOCK_BITS)(&rwlock->completions.kind.writer, 1, LIBXSMM_ATOMIC_SEQ_CST);
#else
  LIBXSMM_UNUSED(rwlock);
#endif
}
566
567
#if (0 != LIBXSMM_SYNC)
/* Draw a reader ticket: atomically bump the reader half of "requests".
 * "prev" receives the drawn ticket; the read side is acquired iff no writer
 * ticket is pending (writer halves of completions and the ticket agree).
 * Fix: removed the nested LIBXSMM_SYNC preprocessor branch — it was dead
 * code since this definition is only compiled under the same condition
 * (now consistent with the sibling internal_rwlock_trylock). */
LIBXSMM_API_INLINE int internal_rwlock_tryread(libxsmm_rwlock* rwlock, internal_sync_counter* prev)
{
  assert(0 != rwlock && 0 != prev);
  prev->bits = LIBXSMM_ATOMIC_FETCH_ADD(&rwlock->requests.bits, INTERNAL_SYNC_RWLOCK_READINC, LIBXSMM_ATOMIC_SEQ_CST);
  return rwlock->completions.kind.writer != prev->kind.writer
    ? (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK) + 1) /* not acquired */
    : (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK));
}
#endif
583
584
libxsmm_rwlock_tryread(libxsmm_rwlock * rwlock)585 LIBXSMM_API int libxsmm_rwlock_tryread(libxsmm_rwlock* rwlock)
586 {
587 #if (0 != LIBXSMM_SYNC)
588 internal_sync_counter prev;
589 return internal_rwlock_tryread(rwlock, &prev);
590 #else
591 LIBXSMM_UNUSED(rwlock);
592 return LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK);
593 #endif
594 }
595
596
/* Acquire the rwlock for reading: draw a reader ticket and, if a writer is
 * pending, spin until the writer-completion counter reaches the ticket. */
LIBXSMM_API void libxsmm_rwlock_acqread(libxsmm_rwlock* rwlock)
{
#if (0 != LIBXSMM_SYNC)
  internal_sync_counter prev;
  if (LIBXSMM_LOCK_ACQUIRED(LIBXSMM_LOCK_RWLOCK) != internal_rwlock_tryread(rwlock, &prev)) {
    while (rwlock->completions.kind.writer != prev.kind.writer) {
      LIBXSMM_SYNC_CYCLE(&rwlock->completions.kind.writer, prev.kind.writer, LIBXSMM_SYNC_NPAUSE);
    }
  }
#else
  LIBXSMM_UNUSED(rwlock);
#endif
}
610
611
/* Release a reader: mark the reader ticket as served. */
LIBXSMM_API void libxsmm_rwlock_relread(libxsmm_rwlock* rwlock)
{
#if (0 != LIBXSMM_SYNC)
  assert(0 != rwlock);
  LIBXSMM_ATOMIC(LIBXSMM_ATOMIC_FETCH_ADD, LIBXSMM_SYNC_RWLOCK_BITS)(&rwlock->completions.kind.reader, 1, LIBXSMM_ATOMIC_SEQ_CST);
#else
  LIBXSMM_UNUSED(rwlock);
#endif
}
621
622
libxsmm_get_pid(void)623 LIBXSMM_API unsigned int libxsmm_get_pid(void)
624 {
625 #if defined(_WIN32)
626 return (unsigned int)_getpid();
627 #else
628 return (unsigned int)getpid();
629 #endif
630 }
631
632
LIBXSMM_API_INTERN unsigned int internal_get_tid(void);
/* Assign a fresh thread ID from the global thread counter, wrapped into
 * [0, LIBXSMM_NTHREADS_MAX); warns once (debug builds) when the pool of
 * unique IDs is exhausted. */
LIBXSMM_API_INTERN unsigned int internal_get_tid(void)
{
  const unsigned int nthreads = LIBXSMM_ATOMIC_ADD_FETCH(&libxsmm_thread_count, 1, LIBXSMM_ATOMIC_RELAXED);
#if !defined(NDEBUG)
  static int error_once = 0;
  if (LIBXSMM_NTHREADS_MAX < nthreads
    && 0 != libxsmm_verbosity /* library code is expected to be mute */
    && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
  {
    fprintf(stderr, "LIBXSMM ERROR: maximum number of threads is exhausted!\n");
  }
#endif
  LIBXSMM_ASSERT(LIBXSMM_NTHREADS_MAX == LIBXSMM_UP2POT(LIBXSMM_NTHREADS_MAX));
  /* power-of-two modulo keeps the ID in range even after the counter wraps */
  return LIBXSMM_MOD2(nthreads - 1, LIBXSMM_NTHREADS_MAX);
}
649
650
/* Return this thread's LIBXSMM thread ID, assigning one on first use.
 * Uses compiler-level TLS when LIBXSMM_SYNC_GENERIC_PID is enabled,
 * otherwise an OS TLS key backed by a static ID table. */
LIBXSMM_API unsigned int libxsmm_get_tid(void)
{
#if (0 != LIBXSMM_SYNC)
# if defined(LIBXSMM_SYNC_GENERIC_PID)
  static LIBXSMM_TLS unsigned int tid = 0xFFFFFFFF; /* sentinel: not yet assigned */
  if (0xFFFFFFFF == tid) tid = internal_get_tid();
  return tid;
# else
  void* tls = LIBXSMM_TLS_GETVALUE(libxsmm_tlskey);
  if (NULL == tls) { /* first call on this thread: assign and cache an ID */
    static unsigned int tid[LIBXSMM_NTHREADS_MAX];
    const int i = internal_get_tid();
    tid[i] = i; tls = tid + i;
    /* coverity[check_return] */
    LIBXSMM_TLS_SETVALUE(libxsmm_tlskey, tls);
  }
  return *(unsigned int*)tls;
# endif
#else
  return 0; /* serial build: single logical thread */
#endif
}
673
674