1 /* automatically generated by memory-auto.sh, do not edit! */
2 
3 /*
4  * Copyright (c) 2005, 2006 Matt Fredette
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *      This product includes software developed by Matt Fredette.
18  * 4. The name of the author may not be used to endorse or promote products
19  *    derived from this software without specific prior written permission.
20  *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24  * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
25  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
29  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 _TME_RCSID("$Id: memory-auto.sh,v 1.2 2010/02/15 15:16:28 fredette Exp $");
35 
36 /* macros: */
37 
38 /* the plain partial read internal macro: */
39 #define _tme_memory_read(type_whole, type_part, mem, offset)		\
40   (((type_whole)							\
41     *((_tme_const type_part *)						\
42       (_tme_cast_pointer_const(tme_uint8_t *, type_whole *, mem)	\
43        + (offset))))							\
44    << (8 * (TME_ENDIAN_NATIVE == TME_ENDIAN_BIG				\
45 	    ? (sizeof(type_whole)					\
46 	       - ((offset) + sizeof(type_part)))			\
47 	    : (offset))))
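
/* an illustrative sketch of the shift arithmetic above: when reading
   the tme_uint8_t part at offset 1 of a tme_uint16_t whole, the part
   is shifted into the bit position it occupies in the host's native
   byte order:

     big-endian host:    8 * (sizeof(tme_uint16_t) - (1 + sizeof(tme_uint8_t))) == 0
     little-endian host: 8 * 1 == 8

   so OR-ing the parts read at offsets 0 and 1 reassembles the whole
   16-bit value in host order. */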
48 
49 /* the plain partial write internal macro: */
50 #define _tme_memory_write(type_whole, type_part, mem, offset, x)	\
51   do {									\
52     *((type_part *)							\
53       (_tme_cast_pointer(tme_uint8_t *, type_whole *, mem)		\
54        + (offset)))							\
55       = (type_part)							\
56         (((type_whole) (x))						\
57 	 >> (8 * (TME_ENDIAN_NATIVE == TME_ENDIAN_BIG			\
58 		  ? (sizeof(type_whole)					\
59 		     - ((offset) + sizeof(type_part)))			\
60 		  : (offset))));					\
61   } while (/* CONSTCOND */ 0)
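
/* the partial write uses the complementary right shift to select the
   bits of x that belong in the part being stored; as an illustrative
   sketch, storing the tme_uint8_t part at offset 0 of a tme_uint16_t
   whole uses:

     big-endian host:    x >> (8 * (sizeof(tme_uint16_t) - (0 + sizeof(tme_uint8_t)))), i.e. x >> 8
     little-endian host: x >> (8 * 0), i.e. x >> 0

   so the byte parts at offsets 0 and 1 together store the whole value. */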
62 
63 /* this tests bits in a memory address: */
64 #define _tme_memory_address_test(mem, bits, align_min)			\
  (((bits) & ~((align_min) - 1)) & ((unsigned long) (mem)))
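
/* an illustrative sketch: with bits == 3 (testing for 4-byte
   alignment) and align_min == 2, the mask (3 & ~(2 - 1)) == 2, so
   only address bit 1 is tested; bit 0 is already known to be clear
   from the caller's minimum alignment.  a nonzero result means the
   address is not aligned to the (bits + 1)-byte boundary being
   tested. */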
66 
/* this returns a mask of all-bits-one in the given type: */
68 #define _tme_memory_type_mask(type, shift)				\
69   ((type) ((((type) 0) - ((type) 1)) shift))
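
/* an illustrative sketch: the shift argument is pasted into the
   expression, so, for example,

     _tme_memory_type_mask(tme_uint32_t, << 8)

   expands to an all-ones tme_uint32_t shifted left by eight bits,
   i.e. the mask 0xffffff00. */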
70 
71 
72 /* the bus 16-bit read slow function: */
73 tme_uint16_t tme_memory_bus_read16 _TME_P((_tme_const tme_shared tme_uint16_t *, tme_rwlock_t *, unsigned int, unsigned int));
74 
75 /* the bus 16-bit write slow function: */
76 void tme_memory_bus_write16 _TME_P((tme_shared tme_uint16_t *, tme_uint16_t, tme_rwlock_t *, unsigned int, unsigned int));
77 
78 /* the bus 32-bit read slow function: */
79 tme_uint32_t tme_memory_bus_read32 _TME_P((_tme_const tme_shared tme_uint32_t *, tme_rwlock_t *, unsigned int, unsigned int));
80 
81 /* the bus 32-bit write slow function: */
82 void tme_memory_bus_write32 _TME_P((tme_shared tme_uint32_t *, tme_uint32_t, tme_rwlock_t *, unsigned int, unsigned int));
83 
84 #ifdef TME_HAVE_INT64_T
85 
86 /* the bus 64-bit read slow function: */
87 tme_uint64_t tme_memory_bus_read64 _TME_P((_tme_const tme_shared tme_uint64_t *, tme_rwlock_t *, unsigned int, unsigned int));
88 
89 /* the bus 64-bit write slow function: */
90 void tme_memory_bus_write64 _TME_P((tme_shared tme_uint64_t *, tme_uint64_t, tme_rwlock_t *, unsigned int, unsigned int));
91 
92 #endif /* TME_HAVE_INT64_T */
93 
94 /* the bus read buffer function and default macro implementation: */
95 void tme_memory_bus_read_buffer _TME_P((_tme_const tme_shared tme_uint8_t *, tme_uint8_t *, unsigned long, tme_rwlock_t *, unsigned int, unsigned int));
96 #define tme_memory_bus_read_buffer(mem, buffer, count, rwlock, align_min, bus_boundary) \
97   do { \
98     if (TME_THREADS_COOPERATIVE) { \
99       memcpy((buffer), ((_tme_const tme_uint8_t *) (mem)), (count)); \
100     } \
101     else { \
102       tme_memory_bus_read_buffer(((_tme_const tme_shared tme_uint8_t *) (mem)), ((tme_uint8_t *) _tme_audit_pointer(buffer)), (count), (rwlock), (align_min), (bus_boundary)); \
103     } \
104   } while (/* CONSTCOND */ 0)
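
/* an illustrative use (device_mem, local_buf, and device_rwlock are
   hypothetical names): copying 64 bytes out of shared device memory
   with a one-byte minimum alignment and a 32-bit emulated bus
   boundary:

     tme_uint8_t local_buf[64];
     tme_memory_bus_read_buffer(device_mem, local_buf, sizeof(local_buf),
                                &device_rwlock, sizeof(tme_uint8_t),
                                sizeof(tme_uint32_t));

   when threads are cooperative this reduces to a plain memcpy();
   otherwise the call resolves to the out-of-line function declared
   above. */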
105 
106 /* the bus write buffer function and default macro implementation: */
107 void tme_memory_bus_write_buffer _TME_P((tme_shared tme_uint8_t *, _tme_const tme_uint8_t *, unsigned long, tme_rwlock_t *, unsigned int, unsigned int));
108 #define tme_memory_bus_write_buffer(mem, buffer, count, rwlock, align_min, bus_boundary) \
109   do { \
110     if (TME_THREADS_COOPERATIVE) { \
111       memcpy((tme_uint8_t *) (mem), (buffer), (count)); \
112     } \
113     else { \
114       tme_memory_bus_write_buffer(((tme_shared tme_uint8_t *) _tme_audit_pointer_shared(mem)), ((_tme_const tme_uint8_t *) _tme_audit_pointer_const(buffer)), (count), (rwlock), (align_min), (bus_boundary)); \
115     } \
116   } while (/* CONSTCOND */ 0)
117 
118 /* the 8-bit atomic operations: */
119 tme_uint8_t tme_memory_atomic_add8 _TME_P((tme_shared tme_uint8_t *, tme_uint8_t, tme_rwlock_t *, unsigned int));
120 tme_uint8_t tme_memory_atomic_sub8 _TME_P((tme_shared tme_uint8_t *, tme_uint8_t, tme_rwlock_t *, unsigned int));
121 tme_uint8_t tme_memory_atomic_mul8 _TME_P((tme_shared tme_uint8_t *, tme_uint8_t, tme_rwlock_t *, unsigned int));
122 tme_uint8_t tme_memory_atomic_div8 _TME_P((tme_shared tme_uint8_t *, tme_uint8_t, tme_rwlock_t *, unsigned int));
123 tme_uint8_t tme_memory_atomic_and8 _TME_P((tme_shared tme_uint8_t *, tme_uint8_t, tme_rwlock_t *, unsigned int));
124 tme_uint8_t tme_memory_atomic_or8 _TME_P((tme_shared tme_uint8_t *, tme_uint8_t, tme_rwlock_t *, unsigned int));
125 tme_uint8_t tme_memory_atomic_xor8 _TME_P((tme_shared tme_uint8_t *, tme_uint8_t, tme_rwlock_t *, unsigned int));
126 tme_uint8_t tme_memory_atomic_not8 _TME_P((tme_shared tme_uint8_t *, tme_rwlock_t *, unsigned int));
127 tme_uint8_t tme_memory_atomic_neg8 _TME_P((tme_shared tme_uint8_t *, tme_rwlock_t *, unsigned int));
128 tme_uint8_t tme_memory_atomic_xchg8 _TME_P((tme_shared tme_uint8_t *, tme_uint8_t, tme_rwlock_t *, unsigned int));
129 tme_uint8_t tme_memory_atomic_cx8 _TME_P((tme_shared tme_uint8_t *, tme_uint8_t, tme_uint8_t, tme_rwlock_t *, unsigned int));
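
/* an illustrative use (status_reg and status_rwlock are hypothetical,
   and this sketch assumes the returned tme_uint8_t is the register's
   prior contents): atomically setting an interrupt-pending bit in a
   shared 8-bit status register:

     tme_uint8_t status_before;
     status_before = tme_memory_atomic_or8(status_reg, 0x80,
                                           &status_rwlock, sizeof(tme_uint8_t));
*/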
130 
131 /* the default 16-bit memory plain read macro: */
132 #define tme_memory_read16(mem, align_min) \
133   ( \
134    /* if we know at compile time that the memory is aligned \
135       enough to read directly, do the single direct read. \
136    \
137       otherwise, if we know at compile time that the memory \
138       is less aligned than the smallest acceptable parts size, \
139       test if the memory is aligned enough to read directly, \
140       and do the single direct read if it is: */ \
141    (__tme_predict_true((_TME_ALIGNOF_INT16_T == 1 \
142                         || (align_min) >= _TME_ALIGNOF_INT16_T) \
143                        || ((align_min) < TME_MEMORY_ALIGNMENT_ACCEPT(tme_uint16_t) \
144                            && _tme_memory_address_test(mem, _TME_ALIGNOF_INT16_T - 1, align_min) == 0))) \
145    ? \
146      _tme_memory_read(tme_uint16_t, tme_uint16_t, mem, 0) \
147    : \
148      (_tme_memory_read(tme_uint16_t, tme_uint8_t, mem, (0 / 8)) \
149       | _tme_memory_read(tme_uint16_t, tme_uint8_t, mem, (8 / 8))) \
150   )
151 
152 /* the default 16-bit memory plain write macro: */
153 #define tme_memory_write16(mem, x, align_min) \
154   do { \
155     if \
156       /* if we know at compile time that the memory is aligned \
157          enough to write directly, do the single direct write. \
158       \
159          otherwise, if we know at compile time that the memory \
160          is less aligned than the smallest acceptable parts size, \
161          test if the memory is aligned enough to write directly, \
162          and do the single direct write if it is: */ \
163       (__tme_predict_true((_TME_ALIGNOF_INT16_T == 1 \
164                            || (align_min) >= _TME_ALIGNOF_INT16_T) \
165                           || ((align_min) < TME_MEMORY_ALIGNMENT_ACCEPT(tme_uint16_t) \
166                               && _tme_memory_address_test(mem, _TME_ALIGNOF_INT16_T - 1, align_min) == 0))) \
167       { \
168         _tme_memory_write(tme_uint16_t, tme_uint16_t, mem, 0, x); \
169       } \
170     else \
171       { \
172         _tme_memory_write(tme_uint16_t, tme_uint8_t, mem, (0 / 8), x); \
173         _tme_memory_write(tme_uint16_t, tme_uint8_t, mem, (8 / 8), x); \
174       } \
175   } while (/* CONSTCOND */ 0)
176 
177 /* the default 16-bit memory atomic read macro: */
178 #define tme_memory_atomic_read16(mem, lock, align_min) \
179   ( \
180    /* if threads are cooperative, do a plain read: */ \
181    (TME_THREADS_COOPERATIVE) \
182    ? \
183      tme_memory_read16((_tme_const tme_uint16_t *) _tme_audit_type(mem, tme_uint16_t *), align_min) \
184    /* otherwise, if we aren't locking for all memory accesses, and we can \
185       make direct 16-bit accesses, and this memory is aligned \
186       enough to make a single direct atomic access, do the single \
187       direct atomic read: */ \
188    : \
189    (__tme_predict_true(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) != 0 \
190                        && TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint16_t) != 0 \
191                        && _tme_memory_address_test(mem, TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint16_t) - 1, align_min) == 0)) \
192    ? \
193      (*_tme_audit_type(mem, tme_uint16_t *)) \
194    /* otherwise, we must do a slow indirect atomic read: */ \
195    : \
196      tme_memory_atomic_read16(mem, lock, align_min) \
197   )
198 
199 /* the default 16-bit memory atomic write macro: */
200 #define tme_memory_atomic_write16(mem, x, lock, align_min) \
201   do { \
202     if \
203       /* if threads are cooperative, do a plain write: */ \
204       (TME_THREADS_COOPERATIVE) \
205       { \
206         tme_memory_write16((tme_uint16_t *) _tme_cast_pointer_shared(tme_uint16_t *, tme_uint16_t *, mem), x, align_min); \
207       /* otherwise, if we aren't locking for all memory accesses, and we can \
208          make direct 16-bit accesses, and this memory is aligned \
209          enough to make a single direct atomic access, do the single \
210          direct atomic write: */ \
211       } \
212     else if \
213       (__tme_predict_true(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) != 0 \
214                           && TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint16_t) != 0 \
215                           && _tme_memory_address_test(mem, TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint16_t) - 1, align_min) == 0)) \
216       { \
217         (*_tme_audit_type(mem, tme_uint16_t *)) \
218           = (x); \
219       /* otherwise, we must do a slow indirect atomic write: */ \
220       } \
221     else \
222       { \
223         tme_memory_atomic_write16(mem, x, lock, align_min); \
224       } \
225   } while (/* CONSTCOND */ 0)
226 
227 /* the default 16-bit memory bus read macro: */
228 #define tme_memory_bus_read16(mem, lock, align_min, bus_boundary) \
229   ( \
230    /* if threads are cooperative, do a plain read: */ \
231    (TME_THREADS_COOPERATIVE) \
232    ? \
233      tme_memory_read16((_tme_const tme_uint16_t *) _tme_audit_type(mem, tme_uint16_t *), align_min) \
234    /* otherwise, if we aren't locking for all memory accesses, the \
235       host supports misaligned 16-bit accesses, the host's bus \
236       boundary is greater than or equal to the emulated bus \
237       boundary, and this memory is aligned enough, do a single \
238       direct bus read: */ \
239    : \
240    (__tme_predict_true(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) != 0 \
241                        && _TME_ALIGNOF_INT16_T < sizeof(tme_uint16_t) \
242                        && TME_MEMORY_BUS_BOUNDARY >= (bus_boundary) \
243                        && _tme_memory_address_test(mem, _TME_ALIGNOF_INT16_T - 1, align_min) == 0)) \
244    ? \
245      (*_tme_audit_type(mem, tme_uint16_t *)) \
246    /* otherwise, if we're locking for all memory accesses, or \
247       if this memory must cross at least one host bus boundary \
248       and the host bus boundary is less than the emulated bus \
249       boundary, do a slow indirect atomic read: */ \
250    : \
251    (__tme_predict_false(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0 \
252                         || (sizeof(tme_uint16_t) > TME_MEMORY_BUS_BOUNDARY \
253                             && TME_MEMORY_BUS_BOUNDARY < (bus_boundary)))) \
254    ? \
255      tme_memory_atomic_read16(mem, lock, align_min) \
256    /* otherwise, if the memory is not larger than the emulated \
257       bus boundary, or if size-alignment would mean an atomic \
258       host access and it is size-aligned, do a single atomic \
259       read, which may be direct or slow: */ \
260    : \
261    (__tme_predict_true((sizeof(tme_uint16_t) <= (bus_boundary) \
262                         || (TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint16_t) != 0 \
263                             && TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint16_t) <= sizeof(tme_uint16_t))) \
264                        && _tme_memory_address_test(mem, sizeof(tme_uint16_t) - 1, align_min) == 0)) \
265    ? \
266      tme_memory_atomic_read16(mem, lock, sizeof(tme_uint16_t)) \
267    /* otherwise, we must do a slow bus read: */ \
268    : \
269      tme_memory_bus_read16(mem, lock, align_min, bus_boundary) \
270   )
271 
272 /* the default 16-bit memory bus write macro: */
273 #define tme_memory_bus_write16(mem, x, lock, align_min, bus_boundary) \
274   do { \
275     if \
276       /* if threads are cooperative, do a plain write: */ \
277       (TME_THREADS_COOPERATIVE) \
278       { \
279         tme_memory_write16((tme_uint16_t *) _tme_cast_pointer_shared(tme_uint16_t *, tme_uint16_t *, mem), x, align_min); \
280       /* otherwise, if we aren't locking for all memory accesses, the \
281          host supports misaligned 16-bit accesses, the host's bus \
282          boundary is greater than or equal to the emulated bus \
283          boundary, and this memory is aligned enough, do a single \
284          direct bus write: */ \
285       } \
286     else if \
287       (__tme_predict_true(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) != 0 \
288                           && _TME_ALIGNOF_INT16_T < sizeof(tme_uint16_t) \
289                           && TME_MEMORY_BUS_BOUNDARY >= (bus_boundary) \
290                           && _tme_memory_address_test(mem, _TME_ALIGNOF_INT16_T - 1, align_min) == 0)) \
291       { \
292         (*_tme_audit_type(mem, tme_uint16_t *)) \
293           = (x); \
294       /* otherwise, if we're locking for all memory accesses, or \
295          if this memory must cross at least one host bus boundary \
296          and the host bus boundary is less than the emulated bus \
297          boundary, do a slow indirect atomic write: */ \
298       } \
299     else if \
300       (__tme_predict_false(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0 \
301                            || (sizeof(tme_uint16_t) > TME_MEMORY_BUS_BOUNDARY \
302                                && TME_MEMORY_BUS_BOUNDARY < (bus_boundary)))) \
303       { \
304         tme_memory_atomic_write16(mem, x, lock, align_min); \
305       /* otherwise, if the memory is not larger than the emulated \
306          bus boundary, or if size-alignment would mean an atomic \
307          host access and it is size-aligned, do a single atomic \
308          write, which may be direct or slow: */ \
309       } \
310     else if \
311       (__tme_predict_true((sizeof(tme_uint16_t) <= (bus_boundary) \
312                            || (TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint16_t) != 0 \
313                                && TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint16_t) <= sizeof(tme_uint16_t))) \
314                           && _tme_memory_address_test(mem, sizeof(tme_uint16_t) - 1, align_min) == 0)) \
315       { \
316         tme_memory_atomic_write16(mem, x, lock, sizeof(tme_uint16_t)); \
317       /* otherwise, we must do a slow bus write: */ \
318       } \
319     else \
320       { \
321         tme_memory_bus_write16(mem, x, lock, align_min, bus_boundary); \
322       } \
323   } while (/* CONSTCOND */ 0)
324 
325 /* the 16-bit atomic operations: */
326 tme_uint16_t tme_memory_atomic_add16 _TME_P((tme_shared tme_uint16_t *, tme_uint16_t, tme_rwlock_t *, unsigned int));
327 tme_uint16_t tme_memory_atomic_sub16 _TME_P((tme_shared tme_uint16_t *, tme_uint16_t, tme_rwlock_t *, unsigned int));
328 tme_uint16_t tme_memory_atomic_mul16 _TME_P((tme_shared tme_uint16_t *, tme_uint16_t, tme_rwlock_t *, unsigned int));
329 tme_uint16_t tme_memory_atomic_div16 _TME_P((tme_shared tme_uint16_t *, tme_uint16_t, tme_rwlock_t *, unsigned int));
330 tme_uint16_t tme_memory_atomic_and16 _TME_P((tme_shared tme_uint16_t *, tme_uint16_t, tme_rwlock_t *, unsigned int));
331 tme_uint16_t tme_memory_atomic_or16 _TME_P((tme_shared tme_uint16_t *, tme_uint16_t, tme_rwlock_t *, unsigned int));
332 tme_uint16_t tme_memory_atomic_xor16 _TME_P((tme_shared tme_uint16_t *, tme_uint16_t, tme_rwlock_t *, unsigned int));
333 tme_uint16_t tme_memory_atomic_not16 _TME_P((tme_shared tme_uint16_t *, tme_rwlock_t *, unsigned int));
334 tme_uint16_t tme_memory_atomic_neg16 _TME_P((tme_shared tme_uint16_t *, tme_rwlock_t *, unsigned int));
335 tme_uint16_t tme_memory_atomic_xchg16 _TME_P((tme_shared tme_uint16_t *, tme_uint16_t, tme_rwlock_t *, unsigned int));
336 tme_uint16_t tme_memory_atomic_cx16 _TME_P((tme_shared tme_uint16_t *, tme_uint16_t, tme_uint16_t, tme_rwlock_t *, unsigned int));
337 tme_uint16_t tme_memory_atomic_read16 _TME_P((_tme_const tme_shared tme_uint16_t *, tme_rwlock_t *, unsigned int));
338 void tme_memory_atomic_write16 _TME_P((tme_shared tme_uint16_t *, tme_uint16_t, tme_rwlock_t *, unsigned int));
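
/* an illustrative compare-and-exchange loop (counter and
   counter_rwlock are hypothetical, and this sketch assumes
   tme_memory_atomic_cx16() returns the prior contents, performing the
   exchange only when they equal the compare value): a saturating
   increment of a shared 16-bit counter:

     tme_uint16_t count_old, count_new;
     do {
       count_old = tme_memory_atomic_read16(counter, &counter_rwlock,
                                            sizeof(tme_uint16_t));
       count_new = (count_old == 0xffff ? count_old : count_old + 1);
     } while (tme_memory_atomic_cx16(counter, count_old, count_new,
                                     &counter_rwlock, sizeof(tme_uint16_t))
              != count_old);
*/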
339 
340 /* the default 32-bit memory plain read macro: */
341 #define tme_memory_read32(mem, align_min) \
342   ( \
343    /* if we know at compile time that the memory is aligned \
344       enough to read directly, do the single direct read. \
345    \
346       otherwise, if we know at compile time that the memory \
347       is less aligned than the smallest acceptable parts size, \
348       test if the memory is aligned enough to read directly, \
349       and do the single direct read if it is: */ \
350    (__tme_predict_true((_TME_ALIGNOF_INT32_T == 1 \
351                         || (align_min) >= _TME_ALIGNOF_INT32_T) \
352                        || ((align_min) < TME_MEMORY_ALIGNMENT_ACCEPT(tme_uint32_t) \
353                            && _tme_memory_address_test(mem, _TME_ALIGNOF_INT32_T - 1, align_min) == 0))) \
354    ? \
355      _tme_memory_read(tme_uint32_t, tme_uint32_t, mem, 0) \
356    : \
357    ((TME_MEMORY_ALIGNMENT_ACCEPT(tme_uint32_t) <= sizeof(tme_uint8_t)) \
358     && ((align_min) <= sizeof(tme_uint8_t))) \
359    ? \
360      (_tme_memory_read(tme_uint32_t, tme_uint8_t, mem, (0 / 8)) \
361       | _tme_memory_read(tme_uint32_t, tme_uint8_t, mem, (8 / 8)) \
362       | _tme_memory_read(tme_uint32_t, tme_uint8_t, mem, (16 / 8)) \
363       | _tme_memory_read(tme_uint32_t, tme_uint8_t, mem, (24 / 8))) \
364    : \
365    (_tme_memory_address_test(mem, sizeof(tme_uint8_t), align_min) != 0) \
366    ? \
367      (_tme_memory_read(tme_uint32_t, tme_uint8_t, mem, (0 / 8)) \
368       | _tme_memory_read(tme_uint32_t, tme_uint16_t, mem, (8 / 8)) \
369       | _tme_memory_read(tme_uint32_t, tme_uint8_t, mem, (24 / 8))) \
370    : \
371      (_tme_memory_read(tme_uint32_t, tme_uint16_t, mem, (0 / 8)) \
372       | _tme_memory_read(tme_uint32_t, tme_uint16_t, mem, (16 / 8))) \
373   )
374 
375 /* the default 32-bit memory plain write macro: */
376 #define tme_memory_write32(mem, x, align_min) \
377   do { \
378     if \
379       /* if we know at compile time that the memory is aligned \
380          enough to write directly, do the single direct write. \
381       \
382          otherwise, if we know at compile time that the memory \
383          is less aligned than the smallest acceptable parts size, \
384          test if the memory is aligned enough to write directly, \
385          and do the single direct write if it is: */ \
386       (__tme_predict_true((_TME_ALIGNOF_INT32_T == 1 \
387                            || (align_min) >= _TME_ALIGNOF_INT32_T) \
388                           || ((align_min) < TME_MEMORY_ALIGNMENT_ACCEPT(tme_uint32_t) \
389                               && _tme_memory_address_test(mem, _TME_ALIGNOF_INT32_T - 1, align_min) == 0))) \
390       { \
391         _tme_memory_write(tme_uint32_t, tme_uint32_t, mem, 0, x); \
392       } \
393     else if \
394       ((TME_MEMORY_ALIGNMENT_ACCEPT(tme_uint32_t) <= sizeof(tme_uint8_t)) \
395        && ((align_min) <= sizeof(tme_uint8_t))) \
396       { \
397         _tme_memory_write(tme_uint32_t, tme_uint8_t, mem, (0 / 8), x); \
398         _tme_memory_write(tme_uint32_t, tme_uint8_t, mem, (8 / 8), x); \
399         _tme_memory_write(tme_uint32_t, tme_uint8_t, mem, (16 / 8), x); \
400         _tme_memory_write(tme_uint32_t, tme_uint8_t, mem, (24 / 8), x); \
401       } \
402     else if \
403       (_tme_memory_address_test(mem, sizeof(tme_uint8_t), align_min) != 0) \
404       { \
405         _tme_memory_write(tme_uint32_t, tme_uint8_t, mem, (0 / 8), x); \
406         _tme_memory_write(tme_uint32_t, tme_uint16_t, mem, (8 / 8), x); \
407         _tme_memory_write(tme_uint32_t, tme_uint8_t, mem, (24 / 8), x); \
408       } \
409     else \
410       { \
411         _tme_memory_write(tme_uint32_t, tme_uint16_t, mem, (0 / 8), x); \
412         _tme_memory_write(tme_uint32_t, tme_uint16_t, mem, (16 / 8), x); \
413       } \
414   } while (/* CONSTCOND */ 0)
415 
416 /* the default 32-bit memory atomic read macro: */
417 #define tme_memory_atomic_read32(mem, lock, align_min) \
418   ( \
419    /* if threads are cooperative, do a plain read: */ \
420    (TME_THREADS_COOPERATIVE) \
421    ? \
422      tme_memory_read32((_tme_const tme_uint32_t *) _tme_audit_type(mem, tme_uint32_t *), align_min) \
423    /* otherwise, if we aren't locking for all memory accesses, and we can \
424       make direct 32-bit accesses, and this memory is aligned \
425       enough to make a single direct atomic access, do the single \
426       direct atomic read: */ \
427    : \
428    (__tme_predict_true(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) != 0 \
429                        && TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint32_t) != 0 \
430                        && _tme_memory_address_test(mem, TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint32_t) - 1, align_min) == 0)) \
431    ? \
432      (*_tme_audit_type(mem, tme_uint32_t *)) \
433    /* otherwise, we must do a slow indirect atomic read: */ \
434    : \
435      tme_memory_atomic_read32(mem, lock, align_min) \
436   )
437 
438 /* the default 32-bit memory atomic write macro: */
439 #define tme_memory_atomic_write32(mem, x, lock, align_min) \
440   do { \
441     if \
442       /* if threads are cooperative, do a plain write: */ \
443       (TME_THREADS_COOPERATIVE) \
444       { \
445         tme_memory_write32((tme_uint32_t *) _tme_cast_pointer_shared(tme_uint32_t *, tme_uint32_t *, mem), x, align_min); \
446       /* otherwise, if we aren't locking for all memory accesses, and we can \
447          make direct 32-bit accesses, and this memory is aligned \
448          enough to make a single direct atomic access, do the single \
449          direct atomic write: */ \
450       } \
451     else if \
452       (__tme_predict_true(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) != 0 \
453                           && TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint32_t) != 0 \
454                           && _tme_memory_address_test(mem, TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint32_t) - 1, align_min) == 0)) \
455       { \
456         (*_tme_audit_type(mem, tme_uint32_t *)) \
457           = (x); \
458       /* otherwise, we must do a slow indirect atomic write: */ \
459       } \
460     else \
461       { \
462         tme_memory_atomic_write32(mem, x, lock, align_min); \
463       } \
464   } while (/* CONSTCOND */ 0)
465 
466 /* the default 32-bit memory bus read macro: */
467 #define tme_memory_bus_read32(mem, lock, align_min, bus_boundary) \
468   ( \
469    /* if threads are cooperative, do a plain read: */ \
470    (TME_THREADS_COOPERATIVE) \
471    ? \
472      tme_memory_read32((_tme_const tme_uint32_t *) _tme_audit_type(mem, tme_uint32_t *), align_min) \
473    /* otherwise, if we aren't locking for all memory accesses, the \
474       host supports misaligned 32-bit accesses, the host's bus \
475       boundary is greater than or equal to the emulated bus \
476       boundary, and this memory is aligned enough, do a single \
477       direct bus read: */ \
478    : \
479    (__tme_predict_true(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) != 0 \
480                        && _TME_ALIGNOF_INT32_T < sizeof(tme_uint32_t) \
481                        && TME_MEMORY_BUS_BOUNDARY >= (bus_boundary) \
482                        && _tme_memory_address_test(mem, _TME_ALIGNOF_INT32_T - 1, align_min) == 0)) \
483    ? \
484      (*_tme_audit_type(mem, tme_uint32_t *)) \
485    /* otherwise, if we're locking for all memory accesses, or \
486       if this memory must cross at least one host bus boundary \
487       and the host bus boundary is less than the emulated bus \
488       boundary, do a slow indirect atomic read: */ \
489    : \
490    (__tme_predict_false(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0 \
491                         || (sizeof(tme_uint32_t) > TME_MEMORY_BUS_BOUNDARY \
492                             && TME_MEMORY_BUS_BOUNDARY < (bus_boundary)))) \
493    ? \
494      tme_memory_atomic_read32(mem, lock, align_min) \
495    /* otherwise, if the memory is not larger than the emulated \
496       bus boundary, or if size-alignment would mean an atomic \
497       host access and it is size-aligned, do a single atomic \
498       read, which may be direct or slow: */ \
499    : \
500    (__tme_predict_true((sizeof(tme_uint32_t) <= (bus_boundary) \
501                         || (TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint32_t) != 0 \
502                             && TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint32_t) <= sizeof(tme_uint32_t))) \
503                        && _tme_memory_address_test(mem, sizeof(tme_uint32_t) - 1, align_min) == 0)) \
504    ? \
505      tme_memory_atomic_read32(mem, lock, sizeof(tme_uint32_t)) \
506    /* otherwise, we must do a slow bus read: */ \
507    : \
508      tme_memory_bus_read32(mem, lock, align_min, bus_boundary) \
509   )
510 
511 /* the default 32-bit memory bus write macro: */
512 #define tme_memory_bus_write32(mem, x, lock, align_min, bus_boundary) \
513   do { \
514     if \
515       /* if threads are cooperative, do a plain write: */ \
516       (TME_THREADS_COOPERATIVE) \
517       { \
518         tme_memory_write32((tme_uint32_t *) _tme_cast_pointer_shared(tme_uint32_t *, tme_uint32_t *, mem), x, align_min); \
519       /* otherwise, if we aren't locking for all memory accesses, the \
520          host supports misaligned 32-bit accesses, the host's bus \
521          boundary is greater than or equal to the emulated bus \
522          boundary, and this memory is aligned enough, do a single \
523          direct bus write: */ \
524       } \
525     else if \
526       (__tme_predict_true(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) != 0 \
527                           && _TME_ALIGNOF_INT32_T < sizeof(tme_uint32_t) \
528                           && TME_MEMORY_BUS_BOUNDARY >= (bus_boundary) \
529                           && _tme_memory_address_test(mem, _TME_ALIGNOF_INT32_T - 1, align_min) == 0)) \
530       { \
531         (*_tme_audit_type(mem, tme_uint32_t *)) \
532           = (x); \
533       /* otherwise, if we're locking for all memory accesses, or \
534          if this memory must cross at least one host bus boundary \
535          and the host bus boundary is less than the emulated bus \
536          boundary, do a slow indirect atomic write: */ \
537       } \
538     else if \
539       (__tme_predict_false(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0 \
540                            || (sizeof(tme_uint32_t) > TME_MEMORY_BUS_BOUNDARY \
541                                && TME_MEMORY_BUS_BOUNDARY < (bus_boundary)))) \
542       { \
543         tme_memory_atomic_write32(mem, x, lock, align_min); \
544       /* otherwise, if the memory is not larger than the emulated \
545          bus boundary, or if size-alignment would mean an atomic \
546          host access and it is size-aligned, do a single atomic \
547          write, which may be direct or slow: */ \
548       } \
549     else if \
550       (__tme_predict_true((sizeof(tme_uint32_t) <= (bus_boundary) \
551                            || (TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint32_t) != 0 \
552                                && TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint32_t) <= sizeof(tme_uint32_t))) \
553                           && _tme_memory_address_test(mem, sizeof(tme_uint32_t) - 1, align_min) == 0)) \
554       { \
555         tme_memory_atomic_write32(mem, x, lock, sizeof(tme_uint32_t)); \
556       /* otherwise, we must do a slow bus write: */ \
557       } \
558     else \
559       { \
560         tme_memory_bus_write32(mem, x, lock, align_min, bus_boundary); \
561       } \
562   } while (/* CONSTCOND */ 0)
563 
564 /* the 32-bit atomic operations: */
565 tme_uint32_t tme_memory_atomic_add32 _TME_P((tme_shared tme_uint32_t *, tme_uint32_t, tme_rwlock_t *, unsigned int));
566 tme_uint32_t tme_memory_atomic_sub32 _TME_P((tme_shared tme_uint32_t *, tme_uint32_t, tme_rwlock_t *, unsigned int));
567 tme_uint32_t tme_memory_atomic_mul32 _TME_P((tme_shared tme_uint32_t *, tme_uint32_t, tme_rwlock_t *, unsigned int));
568 tme_uint32_t tme_memory_atomic_div32 _TME_P((tme_shared tme_uint32_t *, tme_uint32_t, tme_rwlock_t *, unsigned int));
569 tme_uint32_t tme_memory_atomic_and32 _TME_P((tme_shared tme_uint32_t *, tme_uint32_t, tme_rwlock_t *, unsigned int));
570 tme_uint32_t tme_memory_atomic_or32 _TME_P((tme_shared tme_uint32_t *, tme_uint32_t, tme_rwlock_t *, unsigned int));
571 tme_uint32_t tme_memory_atomic_xor32 _TME_P((tme_shared tme_uint32_t *, tme_uint32_t, tme_rwlock_t *, unsigned int));
572 tme_uint32_t tme_memory_atomic_not32 _TME_P((tme_shared tme_uint32_t *, tme_rwlock_t *, unsigned int));
573 tme_uint32_t tme_memory_atomic_neg32 _TME_P((tme_shared tme_uint32_t *, tme_rwlock_t *, unsigned int));
574 tme_uint32_t tme_memory_atomic_xchg32 _TME_P((tme_shared tme_uint32_t *, tme_uint32_t, tme_rwlock_t *, unsigned int));
575 tme_uint32_t tme_memory_atomic_cx32 _TME_P((tme_shared tme_uint32_t *, tme_uint32_t, tme_uint32_t, tme_rwlock_t *, unsigned int));
576 tme_uint32_t tme_memory_atomic_read32 _TME_P((_tme_const tme_shared tme_uint32_t *, tme_rwlock_t *, unsigned int));
577 void tme_memory_atomic_write32 _TME_P((tme_shared tme_uint32_t *, tme_uint32_t, tme_rwlock_t *, unsigned int));
578 
579 #ifdef TME_HAVE_INT64_T
580 
581 /* the default 64-bit memory plain read macro: */
582 #define tme_memory_read64(mem, align_min) \
583   ( \
584    /* if we know at compile time that the memory is aligned \
585       enough to read directly, do the single direct read. \
586    \
587       otherwise, if we know at compile time that the memory \
588       is less aligned than the smallest acceptable parts size, \
589       test if the memory is aligned enough to read directly, \
590       and do the single direct read if it is: */ \
591    (__tme_predict_true((_TME_ALIGNOF_INT64_T == 1 \
592                         || (align_min) >= _TME_ALIGNOF_INT64_T) \
593                        || ((align_min) < TME_MEMORY_ALIGNMENT_ACCEPT(tme_uint64_t) \
594                            && _tme_memory_address_test(mem, _TME_ALIGNOF_INT64_T - 1, align_min) == 0))) \
595    ? \
596      _tme_memory_read(tme_uint64_t, tme_uint64_t, mem, 0) \
597    : \
598    ((TME_MEMORY_ALIGNMENT_ACCEPT(tme_uint64_t) <= sizeof(tme_uint8_t)) \
599     && ((align_min) <= sizeof(tme_uint8_t))) \
600    ? \
601      (_tme_memory_read(tme_uint64_t, tme_uint8_t, mem, (0 / 8)) \
602       | _tme_memory_read(tme_uint64_t, tme_uint8_t, mem, (8 / 8)) \
603       | _tme_memory_read(tme_uint64_t, tme_uint8_t, mem, (16 / 8)) \
604       | _tme_memory_read(tme_uint64_t, tme_uint8_t, mem, (24 / 8)) \
605       | _tme_memory_read(tme_uint64_t, tme_uint8_t, mem, (32 / 8)) \
606       | _tme_memory_read(tme_uint64_t, tme_uint8_t, mem, (40 / 8)) \
607       | _tme_memory_read(tme_uint64_t, tme_uint8_t, mem, (48 / 8)) \
608       | _tme_memory_read(tme_uint64_t, tme_uint8_t, mem, (56 / 8))) \
609    : \
610    (_tme_memory_address_test(mem, sizeof(tme_uint8_t), align_min) != 0) \
611    ? \
612      (_tme_memory_read(tme_uint64_t, tme_uint8_t, mem, (0 / 8)) \
613       | _tme_memory_read(tme_uint64_t, tme_uint16_t, mem, (8 / 8)) \
614       | _tme_memory_read(tme_uint64_t, tme_uint16_t, mem, (24 / 8)) \
615       | _tme_memory_read(tme_uint64_t, tme_uint16_t, mem, (40 / 8)) \
616       | _tme_memory_read(tme_uint64_t, tme_uint8_t, mem, (56 / 8))) \
617    : \
618    ((TME_MEMORY_ALIGNMENT_ACCEPT(tme_uint64_t) <= sizeof(tme_uint16_t)) \
619     && ((align_min) <= sizeof(tme_uint16_t))) \
620    ? \
621      (_tme_memory_read(tme_uint64_t, tme_uint16_t, mem, (0 / 8)) \
622       | _tme_memory_read(tme_uint64_t, tme_uint16_t, mem, (16 / 8)) \
623       | _tme_memory_read(tme_uint64_t, tme_uint16_t, mem, (32 / 8)) \
624       | _tme_memory_read(tme_uint64_t, tme_uint16_t, mem, (48 / 8))) \
625    : \
626    (_tme_memory_address_test(mem, sizeof(tme_uint16_t), align_min) != 0) \
627    ? \
628      (_tme_memory_read(tme_uint64_t, tme_uint16_t, mem, (0 / 8)) \
629       | _tme_memory_read(tme_uint64_t, tme_uint32_t, mem, (16 / 8)) \
630       | _tme_memory_read(tme_uint64_t, tme_uint16_t, mem, (48 / 8))) \
631    : \
632      (_tme_memory_read(tme_uint64_t, tme_uint32_t, mem, (0 / 8)) \
633       | _tme_memory_read(tme_uint64_t, tme_uint32_t, mem, (32 / 8))) \
634   )
635 
636 /* the default 64-bit memory plain write macro: */
637 #define tme_memory_write64(mem, x, align_min) \
638   do { \
639     if \
640       /* if we know at compile time that the memory is aligned \
641          enough to write directly, do the single direct write. \
642       \
643          otherwise, if we know at compile time that the memory \
644          is less aligned than the smallest acceptable parts size, \
645          test if the memory is aligned enough to write directly, \
646          and do the single direct write if it is: */ \
647       (__tme_predict_true((_TME_ALIGNOF_INT64_T == 1 \
648                            || (align_min) >= _TME_ALIGNOF_INT64_T) \
649                           || ((align_min) < TME_MEMORY_ALIGNMENT_ACCEPT(tme_uint64_t) \
650                               && _tme_memory_address_test(mem, _TME_ALIGNOF_INT64_T - 1, align_min) == 0))) \
651       { \
652         _tme_memory_write(tme_uint64_t, tme_uint64_t, mem, 0, x); \
653       } \
654     else if \
655       ((TME_MEMORY_ALIGNMENT_ACCEPT(tme_uint64_t) <= sizeof(tme_uint8_t)) \
656        && ((align_min) <= sizeof(tme_uint8_t))) \
657       { \
658         _tme_memory_write(tme_uint64_t, tme_uint8_t, mem, (0 / 8), x); \
659         _tme_memory_write(tme_uint64_t, tme_uint8_t, mem, (8 / 8), x); \
660         _tme_memory_write(tme_uint64_t, tme_uint8_t, mem, (16 / 8), x); \
661         _tme_memory_write(tme_uint64_t, tme_uint8_t, mem, (24 / 8), x); \
662         _tme_memory_write(tme_uint64_t, tme_uint8_t, mem, (32 / 8), x); \
663         _tme_memory_write(tme_uint64_t, tme_uint8_t, mem, (40 / 8), x); \
664         _tme_memory_write(tme_uint64_t, tme_uint8_t, mem, (48 / 8), x); \
665         _tme_memory_write(tme_uint64_t, tme_uint8_t, mem, (56 / 8), x); \
666       } \
667     else if \
668       (_tme_memory_address_test(mem, sizeof(tme_uint8_t), align_min) != 0) \
669       { \
670         _tme_memory_write(tme_uint64_t, tme_uint8_t, mem, (0 / 8), x); \
671         _tme_memory_write(tme_uint64_t, tme_uint16_t, mem, (8 / 8), x); \
672         _tme_memory_write(tme_uint64_t, tme_uint16_t, mem, (24 / 8), x); \
673         _tme_memory_write(tme_uint64_t, tme_uint16_t, mem, (40 / 8), x); \
674         _tme_memory_write(tme_uint64_t, tme_uint8_t, mem, (56 / 8), x); \
675       } \
676     else if \
677       ((TME_MEMORY_ALIGNMENT_ACCEPT(tme_uint64_t) <= sizeof(tme_uint16_t)) \
678        && ((align_min) <= sizeof(tme_uint16_t))) \
679       { \
680         _tme_memory_write(tme_uint64_t, tme_uint16_t, mem, (0 / 8), x); \
681         _tme_memory_write(tme_uint64_t, tme_uint16_t, mem, (16 / 8), x); \
682         _tme_memory_write(tme_uint64_t, tme_uint16_t, mem, (32 / 8), x); \
683         _tme_memory_write(tme_uint64_t, tme_uint16_t, mem, (48 / 8), x); \
684       } \
685     else if \
686       (_tme_memory_address_test(mem, sizeof(tme_uint16_t), align_min) != 0) \
687       { \
688         _tme_memory_write(tme_uint64_t, tme_uint16_t, mem, (0 / 8), x); \
689         _tme_memory_write(tme_uint64_t, tme_uint32_t, mem, (16 / 8), x); \
690         _tme_memory_write(tme_uint64_t, tme_uint16_t, mem, (48 / 8), x); \
691       } \
692     else \
693       { \
694         _tme_memory_write(tme_uint64_t, tme_uint32_t, mem, (0 / 8), x); \
695         _tme_memory_write(tme_uint64_t, tme_uint32_t, mem, (32 / 8), x); \
696       } \
697   } while (/* CONSTCOND */ 0)
698 
699 /* the default 64-bit memory atomic read macro: */
700 #define tme_memory_atomic_read64(mem, lock, align_min) \
701   ( \
702    /* if threads are cooperative, do a plain read: */ \
703    (TME_THREADS_COOPERATIVE) \
704    ? \
705      tme_memory_read64((_tme_const tme_uint64_t *) _tme_audit_type(mem, tme_uint64_t *), align_min) \
706    /* otherwise, if we aren't locking for all memory accesses, and we can \
707       make direct 64-bit accesses, and this memory is aligned \
708       enough to make a single direct atomic access, do the single \
709       direct atomic read: */ \
710    : \
711    (__tme_predict_true(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) != 0 \
712                        && TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint64_t) != 0 \
713                        && _tme_memory_address_test(mem, TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint64_t) - 1, align_min) == 0)) \
714    ? \
715      (*_tme_audit_type(mem, tme_uint64_t *)) \
716    /* otherwise, we must do a slow indirect atomic read: */ \
717    : \
718      tme_memory_atomic_read64(mem, lock, align_min) \
719   )
720 
721 /* the default 64-bit memory atomic write macro: */
722 #define tme_memory_atomic_write64(mem, x, lock, align_min) \
723   do { \
724     if \
725       /* if threads are cooperative, do a plain write: */ \
726       (TME_THREADS_COOPERATIVE) \
727       { \
728         tme_memory_write64((tme_uint64_t *) _tme_cast_pointer_shared(tme_uint64_t *, tme_uint64_t *, mem), x, align_min); \
729       /* otherwise, if we aren't locking for all memory accesses, and we can \
730          make direct 64-bit accesses, and this memory is aligned \
731          enough to make a single direct atomic access, do the single \
732          direct atomic write: */ \
733       } \
734     else if \
735       (__tme_predict_true(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) != 0 \
736                           && TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint64_t) != 0 \
737                           && _tme_memory_address_test(mem, TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint64_t) - 1, align_min) == 0)) \
738       { \
739         (*_tme_audit_type(mem, tme_uint64_t *)) \
740           = (x); \
741       /* otherwise, we must do a slow indirect atomic write: */ \
742       } \
743     else \
744       { \
745         tme_memory_atomic_write64(mem, x, lock, align_min); \
746       } \
747   } while (/* CONSTCOND */ 0)
748 
749 /* the default 64-bit memory bus read macro: */
750 #define tme_memory_bus_read64(mem, lock, align_min, bus_boundary) \
751   ( \
752    /* if threads are cooperative, do a plain read: */ \
753    (TME_THREADS_COOPERATIVE) \
754    ? \
755      tme_memory_read64((_tme_const tme_uint64_t *) _tme_audit_type(mem, tme_uint64_t *), align_min) \
756    /* otherwise, if we aren't locking for all memory accesses, the \
757       host supports misaligned 64-bit accesses, the host's bus \
758       boundary is greater than or equal to the emulated bus \
759       boundary, and this memory is aligned enough, do a single \
760       direct bus read: */ \
761    : \
762    (__tme_predict_true(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) != 0 \
763                        && _TME_ALIGNOF_INT64_T < sizeof(tme_uint64_t) \
764                        && TME_MEMORY_BUS_BOUNDARY >= (bus_boundary) \
765                        && _tme_memory_address_test(mem, _TME_ALIGNOF_INT64_T - 1, align_min) == 0)) \
766    ? \
767      (*_tme_audit_type(mem, tme_uint64_t *)) \
768    /* otherwise, if we're locking for all memory accesses, or \
769       if this memory must cross at least one host bus boundary \
770       and the host bus boundary is less than the emulated bus \
771       boundary, do a slow indirect atomic read: */ \
772    : \
773    (__tme_predict_false(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0 \
774                         || (sizeof(tme_uint64_t) > TME_MEMORY_BUS_BOUNDARY \
775                             && TME_MEMORY_BUS_BOUNDARY < (bus_boundary)))) \
776    ? \
777      tme_memory_atomic_read64(mem, lock, align_min) \
778    /* otherwise, if the memory is not larger than the emulated \
779       bus boundary, or if size-alignment would mean an atomic \
780       host access and it is size-aligned, do a single atomic \
781       read, which may be direct or slow: */ \
782    : \
783    (__tme_predict_true((sizeof(tme_uint64_t) <= (bus_boundary) \
784                         || (TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint64_t) != 0 \
785                             && TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint64_t) <= sizeof(tme_uint64_t))) \
786                        && _tme_memory_address_test(mem, sizeof(tme_uint64_t) - 1, align_min) == 0)) \
787    ? \
788      tme_memory_atomic_read64(mem, lock, sizeof(tme_uint64_t)) \
789    /* otherwise, we must do a slow bus read: */ \
790    : \
791      tme_memory_bus_read64(mem, lock, align_min, bus_boundary) \
792   )
793 
794 /* the default 64-bit memory bus write macro: */
795 #define tme_memory_bus_write64(mem, x, lock, align_min, bus_boundary) \
796   do { \
797     if \
798       /* if threads are cooperative, do a plain write: */ \
799       (TME_THREADS_COOPERATIVE) \
800       { \
801         tme_memory_write64((tme_uint64_t *) _tme_cast_pointer_shared(tme_uint64_t *, tme_uint64_t *, mem), x, align_min); \
802       /* otherwise, if we aren't locking for all memory accesses, the \
803          host supports misaligned 64-bit accesses, the host's bus \
804          boundary is greater than or equal to the emulated bus \
805          boundary, and this memory is aligned enough, do a single \
806          direct bus write: */ \
807       } \
808     else if \
809       (__tme_predict_true(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) != 0 \
810                           && _TME_ALIGNOF_INT64_T < sizeof(tme_uint64_t) \
811                           && TME_MEMORY_BUS_BOUNDARY >= (bus_boundary) \
812                           && _tme_memory_address_test(mem, _TME_ALIGNOF_INT64_T - 1, align_min) == 0)) \
813       { \
814         (*_tme_audit_type(mem, tme_uint64_t *)) \
815           = (x); \
816       /* otherwise, if we're locking for all memory accesses, or \
817          if this memory must cross at least one host bus boundary \
818          and the host bus boundary is less than the emulated bus \
819          boundary, do a slow indirect atomic write: */ \
820       } \
821     else if \
822       (__tme_predict_false(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0 \
823                            || (sizeof(tme_uint64_t) > TME_MEMORY_BUS_BOUNDARY \
824                                && TME_MEMORY_BUS_BOUNDARY < (bus_boundary)))) \
825       { \
826         tme_memory_atomic_write64(mem, x, lock, align_min); \
827       /* otherwise, if the memory is not larger than the emulated \
828          bus boundary, or if size-alignment would mean an atomic \
829          host access and it is size-aligned, do a single atomic \
830          write, which may be direct or slow: */ \
831       } \
832     else if \
833       (__tme_predict_true((sizeof(tme_uint64_t) <= (bus_boundary) \
834                            || (TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint64_t) != 0 \
835                                && TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint64_t) <= sizeof(tme_uint64_t))) \
836                           && _tme_memory_address_test(mem, sizeof(tme_uint64_t) - 1, align_min) == 0)) \
837       { \
838         tme_memory_atomic_write64(mem, x, lock, sizeof(tme_uint64_t)); \
839       /* otherwise, we must do a slow bus write: */ \
840       } \
841     else \
842       { \
843         tme_memory_bus_write64(mem, x, lock, align_min, bus_boundary); \
844       } \
845   } while (/* CONSTCOND */ 0)
846 
847 /* the 64-bit atomic operations: */
848 tme_uint64_t tme_memory_atomic_add64 _TME_P((tme_shared tme_uint64_t *, tme_uint64_t, tme_rwlock_t *, unsigned int));
849 tme_uint64_t tme_memory_atomic_sub64 _TME_P((tme_shared tme_uint64_t *, tme_uint64_t, tme_rwlock_t *, unsigned int));
850 tme_uint64_t tme_memory_atomic_mul64 _TME_P((tme_shared tme_uint64_t *, tme_uint64_t, tme_rwlock_t *, unsigned int));
851 tme_uint64_t tme_memory_atomic_div64 _TME_P((tme_shared tme_uint64_t *, tme_uint64_t, tme_rwlock_t *, unsigned int));
852 tme_uint64_t tme_memory_atomic_and64 _TME_P((tme_shared tme_uint64_t *, tme_uint64_t, tme_rwlock_t *, unsigned int));
853 tme_uint64_t tme_memory_atomic_or64 _TME_P((tme_shared tme_uint64_t *, tme_uint64_t, tme_rwlock_t *, unsigned int));
854 tme_uint64_t tme_memory_atomic_xor64 _TME_P((tme_shared tme_uint64_t *, tme_uint64_t, tme_rwlock_t *, unsigned int));
855 tme_uint64_t tme_memory_atomic_not64 _TME_P((tme_shared tme_uint64_t *, tme_rwlock_t *, unsigned int));
856 tme_uint64_t tme_memory_atomic_neg64 _TME_P((tme_shared tme_uint64_t *, tme_rwlock_t *, unsigned int));
857 tme_uint64_t tme_memory_atomic_xchg64 _TME_P((tme_shared tme_uint64_t *, tme_uint64_t, tme_rwlock_t *, unsigned int));
858 tme_uint64_t tme_memory_atomic_cx64 _TME_P((tme_shared tme_uint64_t *, tme_uint64_t, tme_uint64_t, tme_rwlock_t *, unsigned int));
859 tme_uint64_t tme_memory_atomic_read64 _TME_P((_tme_const tme_shared tme_uint64_t *, tme_rwlock_t *, unsigned int));
860 void tme_memory_atomic_write64 _TME_P((tme_shared tme_uint64_t *, tme_uint64_t, tme_rwlock_t *, unsigned int));
861 
862 #endif /* TME_HAVE_INT64_T */
863