/*-
 * Copyright (c) 2013 Ed Schouten <ed@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/stdatomic.h>
#include <sys/types.h>

#include <machine/cpufunc.h>
#include <machine/sysarch.h>

/*
 * Executing statements with interrupts disabled.
 */

#if defined(_KERNEL) && !defined(SMP)
#define	WITHOUT_INTERRUPTS(s) do {					\
	register_t regs;						\
									\
	regs = intr_disable();						\
	do s while (0);							\
	intr_restore(regs);						\
} while (0)
#endif /* _KERNEL && !SMP */
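
/*
 * Usage sketch (illustrative only): WITHOUT_INTERRUPTS() brackets a
 * statement block with intr_disable()/intr_restore(), e.g.
 *
 *	WITHOUT_INTERRUPTS({
 *		ret = *mem;
 *		*mem += val;
 *	});
 *
 * which is exactly how the kernel-only fallbacks below use it to make
 * a read-modify-write appear atomic on a uniprocessor.
 */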

/*
 * Memory barriers.
 *
 * It turns out __sync_synchronize() does not emit any code when used
 * with GCC 4.2. Implement our own version that does work reliably.
 *
 * Although __sync_lock_test_and_set() should only perform an acquire
 * barrier, make it do a full barrier like the other functions. This
 * should make <stdatomic.h>'s atomic_exchange_explicit() work reliably.
 */

#if defined(_KERNEL) && !defined(SMP)
static inline void
do_sync(void)
{

	__asm volatile ("" : : : "memory");
}
#elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__)
static inline void
do_sync(void)
{

	__asm volatile ("dmb" : : : "memory");
}
#elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \
    defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || \
    defined(__ARM_ARCH_6ZK__)
static inline void
do_sync(void)
{

	__asm volatile ("mcr p15, 0, %0, c7, c10, 5" : : "r" (0) : "memory");
}
#endif
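
/*
 * Note: the CP15 operation above ("mcr p15, 0, Rt, c7, c10, 5") is the
 * ARMv6 encoding of a data memory barrier; ARMv7 provides the dedicated
 * "dmb" instruction used in the preceding branch. With a uniprocessor
 * kernel a plain compiler barrier suffices, as no other CPU can observe
 * intermediate memory ordering.
 */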

#if defined(__CLANG_ATOMICS) || defined(__GNUC_ATOMICS)

/*
 * New C11 __atomic_* API.
 */

#if defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \
    defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || \
    defined(__ARM_ARCH_6ZK__) || \
    defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__)

/* These systems should be supported by the compiler. */

#else /* __ARM_ARCH_5__ */

/* Clang doesn't allow us to reimplement builtins without this. */
#ifdef __clang__
#pragma redefine_extname __sync_synchronize_ext __sync_synchronize
#define __sync_synchronize __sync_synchronize_ext
#endif

void
__sync_synchronize(void)
{
}

#ifdef _KERNEL

#ifdef SMP
#error "On SMP systems we should have proper atomic operations."
#endif

/*
 * On uniprocessor systems, we can perform the atomic operations by
 * disabling interrupts.
 */

#define	EMIT_LOAD_N(N, uintN_t)						\
uintN_t									\
__atomic_load_##N(uintN_t *mem, int model __unused)			\
{									\
	uintN_t ret;							\
									\
	WITHOUT_INTERRUPTS({						\
		ret = *mem;						\
	});								\
	return (ret);							\
}

#define	EMIT_STORE_N(N, uintN_t)					\
void									\
__atomic_store_##N(uintN_t *mem, uintN_t val, int model __unused)	\
{									\
									\
	WITHOUT_INTERRUPTS({						\
		*mem = val;						\
	});								\
}

#define	EMIT_COMPARE_EXCHANGE_N(N, uintN_t)				\
_Bool									\
__atomic_compare_exchange_##N(uintN_t *mem, uintN_t *expected,		\
    uintN_t desired, int success __unused, int failure __unused)	\
{									\
	_Bool ret;							\
									\
	WITHOUT_INTERRUPTS({						\
		if (*mem == *expected) {				\
			*mem = desired;					\
			ret = 1;					\
		} else {						\
			*expected = *mem;				\
			ret = 0;					\
		}							\
	});								\
	return (ret);							\
}

#define	EMIT_FETCH_OP_N(N, uintN_t, name, op)				\
uintN_t									\
__atomic_##name##_##N(uintN_t *mem, uintN_t val, int model __unused)	\
{									\
	uintN_t ret;							\
									\
	WITHOUT_INTERRUPTS({						\
		ret = *mem;						\
		*mem op val;						\
	});								\
	return (ret);							\
}

#define	EMIT_ALL_OPS_N(N, uintN_t)					\
EMIT_LOAD_N(N, uintN_t)							\
EMIT_STORE_N(N, uintN_t)						\
EMIT_COMPARE_EXCHANGE_N(N, uintN_t)					\
EMIT_FETCH_OP_N(N, uintN_t, exchange, =)				\
EMIT_FETCH_OP_N(N, uintN_t, fetch_add, +=)				\
EMIT_FETCH_OP_N(N, uintN_t, fetch_and, &=)				\
EMIT_FETCH_OP_N(N, uintN_t, fetch_or, |=)				\
EMIT_FETCH_OP_N(N, uintN_t, fetch_sub, -=)				\
EMIT_FETCH_OP_N(N, uintN_t, fetch_xor, ^=)

EMIT_ALL_OPS_N(1, uint8_t)
EMIT_ALL_OPS_N(2, uint16_t)
EMIT_ALL_OPS_N(4, uint32_t)
EMIT_ALL_OPS_N(8, uint64_t)
#undef	EMIT_ALL_OPS_N

#else /* !_KERNEL */

/*
 * For userspace on uniprocessor systems, we can implement the atomic
 * operations by using a Restartable Atomic Sequence. This makes the
 * kernel restart the code from the beginning when interrupted.
 */
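
/*
 * Sketch of the sequence used by the inline assembly below: the start
 * address of the critical load/store pair is written to [ARM_RAS_START]
 * and its end address to [ARM_RAS_START + 4]; if the thread is
 * interrupted while executing inside that window, the kernel rewinds
 * the program counter to the start, so the sequence is re-run from the
 * load. The window is then torn down by storing 0 and 0xffffffff.
 */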

#define	EMIT_LOAD_N(N, uintN_t)						\
uintN_t									\
__atomic_load_##N(uintN_t *mem, int model __unused)			\
{									\
									\
	return (*mem);							\
}

#define	EMIT_STORE_N(N, uintN_t)					\
void									\
__atomic_store_##N(uintN_t *mem, uintN_t val, int model __unused)	\
{									\
									\
	*mem = val;							\
}

#define	EMIT_EXCHANGE_N(N, uintN_t, ldr, str)				\
uintN_t									\
__atomic_exchange_##N(uintN_t *mem, uintN_t val, int model __unused)	\
{									\
	uint32_t old, temp, ras_start;					\
									\
	ras_start = ARM_RAS_START;					\
	__asm volatile (						\
		/* Set up Restartable Atomic Sequence. */		\
		"1:"							\
		"\tadr   %2, 1b\n"					\
		"\tstr   %2, [%5]\n"					\
		"\tadr   %2, 2f\n"					\
		"\tstr   %2, [%5, #4]\n"				\
									\
		"\t"ldr" %0, %4\n"	/* Load old value. */		\
		"\t"str" %3, %1\n"	/* Store new value. */		\
									\
		/* Tear down Restartable Atomic Sequence. */		\
		"2:"							\
		"\tmov   %2, #0x00000000\n"				\
		"\tstr   %2, [%5]\n"					\
		"\tmov   %2, #0xffffffff\n"				\
		"\tstr   %2, [%5, #4]\n"				\
		: "=&r" (old), "=m" (*mem), "=&r" (temp)		\
		: "r" (val), "m" (*mem), "r" (ras_start));		\
	return (old);							\
}

#define	EMIT_COMPARE_EXCHANGE_N(N, uintN_t, ldr, streq)			\
_Bool									\
__atomic_compare_exchange_##N(uintN_t *mem, uintN_t *pexpected,		\
    uintN_t desired, int success __unused, int failure __unused)	\
{									\
	uint32_t expected, old, temp, ras_start;			\
									\
	expected = *pexpected;						\
	ras_start = ARM_RAS_START;					\
	__asm volatile (						\
		/* Set up Restartable Atomic Sequence. */		\
		"1:"							\
		"\tadr   %2, 1b\n"					\
		"\tstr   %2, [%6]\n"					\
		"\tadr   %2, 2f\n"					\
		"\tstr   %2, [%6, #4]\n"				\
									\
		"\t"ldr" %0, %5\n"	/* Load old value. */		\
		"\tcmp   %0, %3\n"	/* Compare to expected value. */\
		"\t"streq" %4, %1\n"	/* Store new value. */		\
									\
		/* Tear down Restartable Atomic Sequence. */		\
		"2:"							\
		"\tmov   %2, #0x00000000\n"				\
		"\tstr   %2, [%6]\n"					\
		"\tmov   %2, #0xffffffff\n"				\
		"\tstr   %2, [%6, #4]\n"				\
		: "=&r" (old), "=m" (*mem), "=&r" (temp)		\
		: "r" (expected), "r" (desired), "m" (*mem),		\
		  "r" (ras_start));					\
	if (old == expected) {						\
		return (1);						\
	} else {							\
		*pexpected = old;					\
		return (0);						\
	}								\
}

#define	EMIT_FETCH_OP_N(N, uintN_t, ldr, str, name, op)			\
uintN_t									\
__atomic_##name##_##N(uintN_t *mem, uintN_t val, int model __unused)	\
{									\
	uint32_t old, temp, ras_start;					\
									\
	ras_start = ARM_RAS_START;					\
	__asm volatile (						\
		/* Set up Restartable Atomic Sequence. */		\
		"1:"							\
		"\tadr   %2, 1b\n"					\
		"\tstr   %2, [%5]\n"					\
		"\tadr   %2, 2f\n"					\
		"\tstr   %2, [%5, #4]\n"				\
									\
		"\t"ldr" %0, %4\n"	/* Load old value. */		\
		"\t"op"  %2, %0, %3\n"	/* Calculate new value. */	\
		"\t"str" %2, %1\n"	/* Store new value. */		\
									\
		/* Tear down Restartable Atomic Sequence. */		\
		"2:"							\
		"\tmov   %2, #0x00000000\n"				\
		"\tstr   %2, [%5]\n"					\
		"\tmov   %2, #0xffffffff\n"				\
		"\tstr   %2, [%5, #4]\n"				\
		: "=&r" (old), "=m" (*mem), "=&r" (temp)		\
		: "r" (val), "m" (*mem), "r" (ras_start));		\
	return (old);							\
}

#define	EMIT_ALL_OPS_N(N, uintN_t, ldr, str, streq)			\
EMIT_LOAD_N(N, uintN_t)							\
EMIT_STORE_N(N, uintN_t)						\
EMIT_EXCHANGE_N(N, uintN_t, ldr, str)					\
EMIT_COMPARE_EXCHANGE_N(N, uintN_t, ldr, streq)				\
EMIT_FETCH_OP_N(N, uintN_t, ldr, str, fetch_add, "add")			\
EMIT_FETCH_OP_N(N, uintN_t, ldr, str, fetch_and, "and")			\
EMIT_FETCH_OP_N(N, uintN_t, ldr, str, fetch_or, "orr")			\
EMIT_FETCH_OP_N(N, uintN_t, ldr, str, fetch_sub, "sub")			\
EMIT_FETCH_OP_N(N, uintN_t, ldr, str, fetch_xor, "eor")

EMIT_ALL_OPS_N(1, uint8_t, "ldrb", "strb", "strbeq")
EMIT_ALL_OPS_N(2, uint16_t, "ldrh", "strh", "strheq")
EMIT_ALL_OPS_N(4, uint32_t, "ldr", "str", "streq")
#undef	EMIT_ALL_OPS_N

#endif /* _KERNEL */

#endif

#endif /* __CLANG_ATOMICS || __GNUC_ATOMICS */

#if defined(__SYNC_ATOMICS) || defined(EMIT_SYNC_ATOMICS)

#ifdef __clang__
#pragma redefine_extname __sync_lock_test_and_set_1_c __sync_lock_test_and_set_1
#pragma redefine_extname __sync_lock_test_and_set_2_c __sync_lock_test_and_set_2
#pragma redefine_extname __sync_lock_test_and_set_4_c __sync_lock_test_and_set_4
#pragma redefine_extname __sync_val_compare_and_swap_1_c __sync_val_compare_and_swap_1
#pragma redefine_extname __sync_val_compare_and_swap_2_c __sync_val_compare_and_swap_2
#pragma redefine_extname __sync_val_compare_and_swap_4_c __sync_val_compare_and_swap_4
#pragma redefine_extname __sync_fetch_and_add_1_c __sync_fetch_and_add_1
#pragma redefine_extname __sync_fetch_and_add_2_c __sync_fetch_and_add_2
#pragma redefine_extname __sync_fetch_and_add_4_c __sync_fetch_and_add_4
#pragma redefine_extname __sync_fetch_and_and_1_c __sync_fetch_and_and_1
#pragma redefine_extname __sync_fetch_and_and_2_c __sync_fetch_and_and_2
#pragma redefine_extname __sync_fetch_and_and_4_c __sync_fetch_and_and_4
#pragma redefine_extname __sync_fetch_and_or_1_c __sync_fetch_and_or_1
#pragma redefine_extname __sync_fetch_and_or_2_c __sync_fetch_and_or_2
#pragma redefine_extname __sync_fetch_and_or_4_c __sync_fetch_and_or_4
#pragma redefine_extname __sync_fetch_and_xor_1_c __sync_fetch_and_xor_1
#pragma redefine_extname __sync_fetch_and_xor_2_c __sync_fetch_and_xor_2
#pragma redefine_extname __sync_fetch_and_xor_4_c __sync_fetch_and_xor_4
#pragma redefine_extname __sync_fetch_and_sub_1_c __sync_fetch_and_sub_1
#pragma redefine_extname __sync_fetch_and_sub_2_c __sync_fetch_and_sub_2
#pragma redefine_extname __sync_fetch_and_sub_4_c __sync_fetch_and_sub_4
#endif

/*
 * Old __sync_* API.
 */

#if defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \
    defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || \
    defined(__ARM_ARCH_6ZK__) || \
    defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__)

/* Implementations for old GCC versions, lacking support for atomics. */

typedef union {
	uint8_t		v8[4];
	uint32_t	v32;
} reg_t;

/*
 * Given a memory address pointing to an 8-bit or 16-bit integer, return
 * the address of the 32-bit word containing it.
 */

static inline uint32_t *
round_to_word(void *ptr)
{

	return ((uint32_t *)((intptr_t)ptr & ~3));
}
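
/*
 * Example (illustrative): for a uint16_t located at address 0x1002,
 * round_to_word() yields (uint32_t *)0x1000 and the byte offset within
 * that word, (intptr_t)ptr & 3, is 2. The put_*()/get_*() helpers below
 * use this offset to place and extract the narrow value inside a reg_t.
 */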

/*
 * Utility functions for loading and storing 8-bit and 16-bit integers
 * in 32-bit words at an offset corresponding with the location of the
 * atomic variable.
 */

static inline void
put_1(reg_t *r, const uint8_t *offset_ptr, uint8_t val)
{
	size_t offset;

	offset = (intptr_t)offset_ptr & 3;
	r->v8[offset] = val;
}

static inline uint8_t
get_1(const reg_t *r, const uint8_t *offset_ptr)
{
	size_t offset;

	offset = (intptr_t)offset_ptr & 3;
	return (r->v8[offset]);
}

static inline void
put_2(reg_t *r, const uint16_t *offset_ptr, uint16_t val)
{
	size_t offset;
	union {
		uint16_t in;
		uint8_t out[2];
	} bytes;

	offset = (intptr_t)offset_ptr & 3;
	bytes.in = val;
	r->v8[offset] = bytes.out[0];
	r->v8[offset + 1] = bytes.out[1];
}

static inline uint16_t
get_2(const reg_t *r, const uint16_t *offset_ptr)
{
	size_t offset;
	union {
		uint8_t in[2];
		uint16_t out;
	} bytes;

	offset = (intptr_t)offset_ptr & 3;
	bytes.in[0] = r->v8[offset];
	bytes.in[1] = r->v8[offset + 1];
	return (bytes.out);
}

/*
 * 8-bit and 16-bit routines.
 *
 * These operations are not natively supported by the CPU, so we use
 * some shifting and bitmasking on top of the 32-bit instructions.
 */
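
/*
 * Worked example (illustrative; little-endian byte lanes assumed): an
 * 8-bit exchange of value 0x5a at byte offset 1 sets up
 *
 *	val32.v32   = 0x00005a00	(new byte in its lane, rest zero)
 *	negmask.v32 = 0xffff00ff	(all ones except the target lane)
 *
 * and the ldrex/strex loop computes
 *
 *	new = (old & negmask) | val32
 *
 * so only the addressed byte changes while the neighbouring bytes of
 * the containing 32-bit word are preserved.
 */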

#define	EMIT_LOCK_TEST_AND_SET_N(N, uintN_t)				\
uintN_t									\
__sync_lock_test_and_set_##N##_c(uintN_t *mem, uintN_t val)		\
{									\
	uint32_t *mem32;						\
	reg_t val32, negmask, old;					\
	uint32_t temp1, temp2;						\
									\
	mem32 = round_to_word(mem);					\
	val32.v32 = 0x00000000;						\
	put_##N(&val32, mem, val);					\
	negmask.v32 = 0xffffffff;					\
	put_##N(&negmask, mem, 0);					\
									\
	do_sync();							\
	__asm volatile (						\
		"1:"							\
		"\tldrex %0, %6\n"	/* Load old value. */		\
		"\tand   %2, %5, %0\n"	/* Remove the old value. */	\
		"\torr   %2, %2, %4\n"	/* Put in the new value. */	\
		"\tstrex %3, %2, %1\n"	/* Attempt to store. */		\
		"\tcmp   %3, #0\n"	/* Did it succeed? */		\
		"\tbne   1b\n"		/* Spin if failed. */		\
		: "=&r" (old.v32), "=m" (*mem32), "=&r" (temp1),	\
		  "=&r" (temp2)						\
		: "r" (val32.v32), "r" (negmask.v32), "m" (*mem32));	\
	return (get_##N(&old, mem));					\
}

EMIT_LOCK_TEST_AND_SET_N(1, uint8_t)
EMIT_LOCK_TEST_AND_SET_N(2, uint16_t)

#define	EMIT_VAL_COMPARE_AND_SWAP_N(N, uintN_t)				\
uintN_t									\
__sync_val_compare_and_swap_##N##_c(uintN_t *mem, uintN_t expected,	\
    uintN_t desired)							\
{									\
	uint32_t *mem32;						\
	reg_t expected32, desired32, posmask, old;			\
	uint32_t negmask, temp1, temp2;					\
									\
	mem32 = round_to_word(mem);					\
	expected32.v32 = 0x00000000;					\
	put_##N(&expected32, mem, expected);				\
	desired32.v32 = 0x00000000;					\
	put_##N(&desired32, mem, desired);				\
	posmask.v32 = 0x00000000;					\
	put_##N(&posmask, mem, ~0);					\
	negmask = ~posmask.v32;						\
									\
	do_sync();							\
	__asm volatile (						\
		"1:"							\
		"\tldrex %0, %8\n"	/* Load old value. */		\
		"\tand   %2, %6, %0\n"	/* Isolate the old value. */	\
		"\tcmp   %2, %4\n"	/* Compare to expected value. */\
		"\tbne   2f\n"		/* Values are unequal. */	\
		"\tand   %2, %7, %0\n"	/* Remove the old value. */	\
		"\torr   %2, %5\n"	/* Put in the new value. */	\
		"\tstrex %3, %2, %1\n"	/* Attempt to store. */		\
		"\tcmp   %3, #0\n"	/* Did it succeed? */		\
		"\tbne   1b\n"		/* Spin if failed. */		\
		"2:"							\
		: "=&r" (old), "=m" (*mem32), "=&r" (temp1),		\
		  "=&r" (temp2)						\
		: "r" (expected32.v32), "r" (desired32.v32),		\
		  "r" (posmask.v32), "r" (negmask), "m" (*mem32));	\
	return (get_##N(&old, mem));					\
}

EMIT_VAL_COMPARE_AND_SWAP_N(1, uint8_t)
EMIT_VAL_COMPARE_AND_SWAP_N(2, uint16_t)

#define	EMIT_ARITHMETIC_FETCH_AND_OP_N(N, uintN_t, name, op)		\
uintN_t									\
__sync_##name##_##N##_c(uintN_t *mem, uintN_t val)			\
{									\
	uint32_t *mem32;						\
	reg_t val32, posmask, old;					\
	uint32_t negmask, temp1, temp2;					\
									\
	mem32 = round_to_word(mem);					\
	val32.v32 = 0x00000000;						\
	put_##N(&val32, mem, val);					\
	posmask.v32 = 0x00000000;					\
	put_##N(&posmask, mem, ~0);					\
	negmask = ~posmask.v32;						\
									\
	do_sync();							\
	__asm volatile (						\
		"1:"							\
		"\tldrex %0, %7\n"	/* Load old value. */		\
		"\t"op"  %2, %0, %4\n"	/* Calculate new value. */	\
		"\tand   %2, %5\n"	/* Isolate the new value. */	\
		"\tand   %3, %6, %0\n"	/* Remove the old value. */	\
		"\torr   %2, %2, %3\n"	/* Put in the new value. */	\
		"\tstrex %3, %2, %1\n"	/* Attempt to store. */		\
		"\tcmp   %3, #0\n"	/* Did it succeed? */		\
		"\tbne   1b\n"		/* Spin if failed. */		\
		: "=&r" (old.v32), "=m" (*mem32), "=&r" (temp1),	\
		  "=&r" (temp2)						\
		: "r" (val32.v32), "r" (posmask.v32), "r" (negmask),	\
		  "m" (*mem32));					\
	return (get_##N(&old, mem));					\
}

EMIT_ARITHMETIC_FETCH_AND_OP_N(1, uint8_t, fetch_and_add, "add")
EMIT_ARITHMETIC_FETCH_AND_OP_N(1, uint8_t, fetch_and_sub, "sub")
EMIT_ARITHMETIC_FETCH_AND_OP_N(2, uint16_t, fetch_and_add, "add")
EMIT_ARITHMETIC_FETCH_AND_OP_N(2, uint16_t, fetch_and_sub, "sub")

#define	EMIT_BITWISE_FETCH_AND_OP_N(N, uintN_t, name, op, idempotence)	\
uintN_t									\
__sync_##name##_##N##_c(uintN_t *mem, uintN_t val)			\
{									\
	uint32_t *mem32;						\
	reg_t val32, old;						\
	uint32_t temp1, temp2;						\
									\
	mem32 = round_to_word(mem);					\
	val32.v32 = idempotence ? 0xffffffff : 0x00000000;		\
	put_##N(&val32, mem, val);					\
									\
	do_sync();							\
	__asm volatile (						\
		"1:"							\
		"\tldrex %0, %5\n"	/* Load old value. */		\
		"\t"op"  %2, %4, %0\n"	/* Calculate new value. */	\
		"\tstrex %3, %2, %1\n"	/* Attempt to store. */		\
		"\tcmp   %3, #0\n"	/* Did it succeed? */		\
		"\tbne   1b\n"		/* Spin if failed. */		\
		: "=&r" (old.v32), "=m" (*mem32), "=&r" (temp1),	\
		  "=&r" (temp2)						\
		: "r" (val32.v32), "m" (*mem32));			\
	return (get_##N(&old, mem));					\
}

EMIT_BITWISE_FETCH_AND_OP_N(1, uint8_t, fetch_and_and, "and", 1)
EMIT_BITWISE_FETCH_AND_OP_N(1, uint8_t, fetch_and_or, "orr", 0)
EMIT_BITWISE_FETCH_AND_OP_N(1, uint8_t, fetch_and_xor, "eor", 0)
EMIT_BITWISE_FETCH_AND_OP_N(2, uint16_t, fetch_and_and, "and", 1)
EMIT_BITWISE_FETCH_AND_OP_N(2, uint16_t, fetch_and_or, "orr", 0)
EMIT_BITWISE_FETCH_AND_OP_N(2, uint16_t, fetch_and_xor, "eor", 0)

/*
 * 32-bit routines.
 */
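
/*
 * In the loops above and below, ldrex marks the word for exclusive
 * access and strex writes 0 into its status operand on success (1 if
 * exclusivity was lost); the cmp/bne pair simply retries until the
 * store goes through.
 */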

uint32_t
__sync_lock_test_and_set_4_c(uint32_t *mem, uint32_t val)
{
	uint32_t old, temp;

	do_sync();
	__asm volatile (
		"1:"
		"\tldrex %0, %4\n"	/* Load old value. */
		"\tstrex %2, %3, %1\n"	/* Attempt to store. */
		"\tcmp   %2, #0\n"	/* Did it succeed? */
		"\tbne   1b\n"		/* Spin if failed. */
		: "=&r" (old), "=m" (*mem), "=&r" (temp)
		: "r" (val), "m" (*mem));
	return (old);
}

uint32_t
__sync_val_compare_and_swap_4_c(uint32_t *mem, uint32_t expected,
    uint32_t desired)
{
	uint32_t old, temp;

	do_sync();
	__asm volatile (
		"1:"
		"\tldrex %0, %5\n"	/* Load old value. */
		"\tcmp   %0, %3\n"	/* Compare to expected value. */
		"\tbne   2f\n"		/* Values are unequal. */
		"\tstrex %2, %4, %1\n"	/* Attempt to store. */
		"\tcmp   %2, #0\n"	/* Did it succeed? */
		"\tbne   1b\n"		/* Spin if failed. */
		"2:"
		: "=&r" (old), "=m" (*mem), "=&r" (temp)
		: "r" (expected), "r" (desired), "m" (*mem));
	return (old);
}

#define	EMIT_FETCH_AND_OP_4(name, op)					\
uint32_t								\
__sync_##name##_4##_c(uint32_t *mem, uint32_t val)			\
{									\
	uint32_t old, temp1, temp2;					\
									\
	do_sync();							\
	__asm volatile (						\
		"1:"							\
		"\tldrex %0, %5\n"	/* Load old value. */		\
		"\t"op"  %2, %0, %4\n"	/* Calculate new value. */	\
		"\tstrex %3, %2, %1\n"	/* Attempt to store. */		\
		"\tcmp   %3, #0\n"	/* Did it succeed? */		\
		"\tbne   1b\n"		/* Spin if failed. */		\
		: "=&r" (old), "=m" (*mem), "=&r" (temp1),		\
		  "=&r" (temp2)						\
		: "r" (val), "m" (*mem));				\
	return (old);							\
}

EMIT_FETCH_AND_OP_4(fetch_and_add, "add")
EMIT_FETCH_AND_OP_4(fetch_and_and, "and")
EMIT_FETCH_AND_OP_4(fetch_and_or, "orr")
EMIT_FETCH_AND_OP_4(fetch_and_sub, "sub")
EMIT_FETCH_AND_OP_4(fetch_and_xor, "eor")

#ifndef __clang__
__strong_reference(__sync_lock_test_and_set_1_c, __sync_lock_test_and_set_1);
__strong_reference(__sync_lock_test_and_set_2_c, __sync_lock_test_and_set_2);
__strong_reference(__sync_lock_test_and_set_4_c, __sync_lock_test_and_set_4);
__strong_reference(__sync_val_compare_and_swap_1_c, __sync_val_compare_and_swap_1);
__strong_reference(__sync_val_compare_and_swap_2_c, __sync_val_compare_and_swap_2);
__strong_reference(__sync_val_compare_and_swap_4_c, __sync_val_compare_and_swap_4);
__strong_reference(__sync_fetch_and_add_1_c, __sync_fetch_and_add_1);
__strong_reference(__sync_fetch_and_add_2_c, __sync_fetch_and_add_2);
__strong_reference(__sync_fetch_and_add_4_c, __sync_fetch_and_add_4);
__strong_reference(__sync_fetch_and_and_1_c, __sync_fetch_and_and_1);
__strong_reference(__sync_fetch_and_and_2_c, __sync_fetch_and_and_2);
__strong_reference(__sync_fetch_and_and_4_c, __sync_fetch_and_and_4);
__strong_reference(__sync_fetch_and_sub_1_c, __sync_fetch_and_sub_1);
__strong_reference(__sync_fetch_and_sub_2_c, __sync_fetch_and_sub_2);
__strong_reference(__sync_fetch_and_sub_4_c, __sync_fetch_and_sub_4);
__strong_reference(__sync_fetch_and_or_1_c, __sync_fetch_and_or_1);
__strong_reference(__sync_fetch_and_or_2_c, __sync_fetch_and_or_2);
__strong_reference(__sync_fetch_and_or_4_c, __sync_fetch_and_or_4);
__strong_reference(__sync_fetch_and_xor_1_c, __sync_fetch_and_xor_1);
__strong_reference(__sync_fetch_and_xor_2_c, __sync_fetch_and_xor_2);
__strong_reference(__sync_fetch_and_xor_4_c, __sync_fetch_and_xor_4);
#endif

#else /* __ARM_ARCH_5__ */

#ifdef _KERNEL

#ifdef SMP
#error "On SMP systems we should have proper atomic operations."
#endif

/*
 * On uniprocessor systems, we can perform the atomic operations by
 * disabling interrupts.
 */

#define	EMIT_VAL_COMPARE_AND_SWAP_N(N, uintN_t)				\
uintN_t									\
__sync_val_compare_and_swap_##N(uintN_t *mem, uintN_t expected,		\
    uintN_t desired)							\
{									\
	uintN_t ret;							\
									\
	WITHOUT_INTERRUPTS({						\
		ret = *mem;						\
		if (*mem == expected)					\
			*mem = desired;					\
	});								\
	return (ret);							\
}

#define	EMIT_FETCH_AND_OP_N(N, uintN_t, name, op)			\
uintN_t									\
__sync_##name##_##N(uintN_t *mem, uintN_t val)				\
{									\
	uintN_t ret;							\
									\
	WITHOUT_INTERRUPTS({						\
		ret = *mem;						\
		*mem op val;						\
	});								\
	return (ret);							\
}

#define	EMIT_ALL_OPS_N(N, uintN_t)					\
EMIT_VAL_COMPARE_AND_SWAP_N(N, uintN_t)					\
EMIT_FETCH_AND_OP_N(N, uintN_t, lock_test_and_set, =)			\
EMIT_FETCH_AND_OP_N(N, uintN_t, fetch_and_add, +=)			\
EMIT_FETCH_AND_OP_N(N, uintN_t, fetch_and_and, &=)			\
EMIT_FETCH_AND_OP_N(N, uintN_t, fetch_and_or, |=)			\
EMIT_FETCH_AND_OP_N(N, uintN_t, fetch_and_sub, -=)			\
EMIT_FETCH_AND_OP_N(N, uintN_t, fetch_and_xor, ^=)

EMIT_ALL_OPS_N(1, uint8_t)
EMIT_ALL_OPS_N(2, uint16_t)
EMIT_ALL_OPS_N(4, uint32_t)
EMIT_ALL_OPS_N(8, uint64_t)
#undef	EMIT_ALL_OPS_N

#else /* !_KERNEL */

/*
 * For userspace on uniprocessor systems, we can implement the atomic
 * operations by using a Restartable Atomic Sequence. This makes the
 * kernel restart the code from the beginning when interrupted.
 */

#define	EMIT_LOCK_TEST_AND_SET_N(N, uintN_t, ldr, str)			\
uintN_t									\
__sync_lock_test_and_set_##N##_c(uintN_t *mem, uintN_t val)		\
{									\
	uint32_t old, temp, ras_start;					\
									\
	ras_start = ARM_RAS_START;					\
	__asm volatile (						\
		/* Set up Restartable Atomic Sequence. */		\
		"1:"							\
		"\tadr   %2, 1b\n"					\
		"\tstr   %2, [%5]\n"					\
		"\tadr   %2, 2f\n"					\
		"\tstr   %2, [%5, #4]\n"				\
									\
		"\t"ldr" %0, %4\n"	/* Load old value. */		\
		"\t"str" %3, %1\n"	/* Store new value. */		\
									\
		/* Tear down Restartable Atomic Sequence. */		\
		"2:"							\
		"\tmov   %2, #0x00000000\n"				\
		"\tstr   %2, [%5]\n"					\
		"\tmov   %2, #0xffffffff\n"				\
		"\tstr   %2, [%5, #4]\n"				\
		: "=&r" (old), "=m" (*mem), "=&r" (temp)		\
		: "r" (val), "m" (*mem), "r" (ras_start));		\
	return (old);							\
}

#define	EMIT_VAL_COMPARE_AND_SWAP_N(N, uintN_t, ldr, streq)		\
uintN_t									\
__sync_val_compare_and_swap_##N##_c(uintN_t *mem, uintN_t expected,	\
    uintN_t desired)							\
{									\
	uint32_t old, temp, ras_start;					\
									\
	ras_start = ARM_RAS_START;					\
	__asm volatile (						\
		/* Set up Restartable Atomic Sequence. */		\
		"1:"							\
		"\tadr   %2, 1b\n"					\
		"\tstr   %2, [%6]\n"					\
		"\tadr   %2, 2f\n"					\
		"\tstr   %2, [%6, #4]\n"				\
									\
		"\t"ldr" %0, %5\n"	/* Load old value. */		\
		"\tcmp   %0, %3\n"	/* Compare to expected value. */\
		"\t"streq" %4, %1\n"	/* Store new value. */		\
									\
		/* Tear down Restartable Atomic Sequence. */		\
		"2:"							\
		"\tmov   %2, #0x00000000\n"				\
		"\tstr   %2, [%6]\n"					\
		"\tmov   %2, #0xffffffff\n"				\
		"\tstr   %2, [%6, #4]\n"				\
		: "=&r" (old), "=m" (*mem), "=&r" (temp)		\
		: "r" (expected), "r" (desired), "m" (*mem),		\
		  "r" (ras_start));					\
	return (old);							\
}

#define	EMIT_FETCH_AND_OP_N(N, uintN_t, ldr, str, name, op)		\
uintN_t									\
__sync_##name##_##N##_c(uintN_t *mem, uintN_t val)			\
{									\
	uint32_t old, temp, ras_start;					\
									\
	ras_start = ARM_RAS_START;					\
	__asm volatile (						\
		/* Set up Restartable Atomic Sequence. */		\
		"1:"							\
		"\tadr   %2, 1b\n"					\
		"\tstr   %2, [%5]\n"					\
		"\tadr   %2, 2f\n"					\
		"\tstr   %2, [%5, #4]\n"				\
									\
		"\t"ldr" %0, %4\n"	/* Load old value. */		\
		"\t"op"  %2, %0, %3\n"	/* Calculate new value. */	\
		"\t"str" %2, %1\n"	/* Store new value. */		\
									\
		/* Tear down Restartable Atomic Sequence. */		\
		"2:"							\
		"\tmov   %2, #0x00000000\n"				\
		"\tstr   %2, [%5]\n"					\
		"\tmov   %2, #0xffffffff\n"				\
		"\tstr   %2, [%5, #4]\n"				\
		: "=&r" (old), "=m" (*mem), "=&r" (temp)		\
		: "r" (val), "m" (*mem), "r" (ras_start));		\
	return (old);							\
}

#define	EMIT_ALL_OPS_N(N, uintN_t, ldr, str, streq)			\
EMIT_LOCK_TEST_AND_SET_N(N, uintN_t, ldr, str)				\
EMIT_VAL_COMPARE_AND_SWAP_N(N, uintN_t, ldr, streq)			\
EMIT_FETCH_AND_OP_N(N, uintN_t, ldr, str, fetch_and_add, "add")		\
EMIT_FETCH_AND_OP_N(N, uintN_t, ldr, str, fetch_and_and, "and")		\
EMIT_FETCH_AND_OP_N(N, uintN_t, ldr, str, fetch_and_or, "orr")		\
EMIT_FETCH_AND_OP_N(N, uintN_t, ldr, str, fetch_and_sub, "sub")		\
EMIT_FETCH_AND_OP_N(N, uintN_t, ldr, str, fetch_and_xor, "eor")

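/*
 * The #ifdef below only reflects assembler syntax, not behaviour:
 * clang's integrated assembler expects the unified (UAL) spelling with
 * the condition code last ("strbeq"/"strheq"), while the traditional
 * syntax accepted by older GCC toolchains places the condition before
 * the size qualifier ("streqb"/"streqh"). Both spellings assemble to
 * the same conditional store.
 */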
#ifdef __clang__
EMIT_ALL_OPS_N(1, uint8_t, "ldrb", "strb", "strbeq")
EMIT_ALL_OPS_N(2, uint16_t, "ldrh", "strh", "strheq")
#else
EMIT_ALL_OPS_N(1, uint8_t, "ldrb", "strb", "streqb")
EMIT_ALL_OPS_N(2, uint16_t, "ldrh", "strh", "streqh")
#endif
EMIT_ALL_OPS_N(4, uint32_t, "ldr", "str", "streq")

#ifndef __clang__
__strong_reference(__sync_lock_test_and_set_1_c, __sync_lock_test_and_set_1);
__strong_reference(__sync_lock_test_and_set_2_c, __sync_lock_test_and_set_2);
__strong_reference(__sync_lock_test_and_set_4_c, __sync_lock_test_and_set_4);
__strong_reference(__sync_val_compare_and_swap_1_c, __sync_val_compare_and_swap_1);
__strong_reference(__sync_val_compare_and_swap_2_c, __sync_val_compare_and_swap_2);
__strong_reference(__sync_val_compare_and_swap_4_c, __sync_val_compare_and_swap_4);
__strong_reference(__sync_fetch_and_add_1_c, __sync_fetch_and_add_1);
__strong_reference(__sync_fetch_and_add_2_c, __sync_fetch_and_add_2);
__strong_reference(__sync_fetch_and_add_4_c, __sync_fetch_and_add_4);
__strong_reference(__sync_fetch_and_and_1_c, __sync_fetch_and_and_1);
__strong_reference(__sync_fetch_and_and_2_c, __sync_fetch_and_and_2);
__strong_reference(__sync_fetch_and_and_4_c, __sync_fetch_and_and_4);
__strong_reference(__sync_fetch_and_sub_1_c, __sync_fetch_and_sub_1);
__strong_reference(__sync_fetch_and_sub_2_c, __sync_fetch_and_sub_2);
__strong_reference(__sync_fetch_and_sub_4_c, __sync_fetch_and_sub_4);
__strong_reference(__sync_fetch_and_or_1_c, __sync_fetch_and_or_1);
__strong_reference(__sync_fetch_and_or_2_c, __sync_fetch_and_or_2);
__strong_reference(__sync_fetch_and_or_4_c, __sync_fetch_and_or_4);
__strong_reference(__sync_fetch_and_xor_1_c, __sync_fetch_and_xor_1);
__strong_reference(__sync_fetch_and_xor_2_c, __sync_fetch_and_xor_2);
__strong_reference(__sync_fetch_and_xor_4_c, __sync_fetch_and_xor_4);
#endif

#endif /* _KERNEL */

#endif

#endif /* __SYNC_ATOMICS */