1 /* automatically generated by memory-auto.sh, do not edit! */
2
3 /*
4 * Copyright (c) 2005, 2006 Matt Fredette
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 3. All advertising materials mentioning features or use of this software
16 * must display the following acknowledgement:
17 * This product includes software developed by Matt Fredette.
18 * 4. The name of the author may not be used to endorse or promote products
19 * derived from this software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
25 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
29 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 /* includes: */
35 #include <tme/memory.h>
36
37
38 _TME_RCSID("$Id: memory-auto.sh,v 1.2 2010/02/15 15:16:28 fredette Exp $");
39
40 /* undefine the macro version of tme_memory_bus_read16: */
41 #undef tme_memory_bus_read16
42
43 /* the bus 16-bit read slow function: */
44 tme_uint16_t
45 tme_memory_bus_read16(_tme_const tme_shared tme_uint16_t *mem, tme_rwlock_t *rwlock, unsigned int align_min, unsigned int bus_boundary)
46 {
47 const unsigned int host_boundary = TME_MEMORY_BUS_BOUNDARY;
48 unsigned int size_skip;
49 unsigned int size_done;
50 tme_uint16_t x;
51 #ifdef TME_HAVE_INT64_T
52 _tme_const tme_shared tme_uint64_t *parts64;
53 tme_uint64_t part64;
54 #endif /* TME_HAVE_INT64_T */
55 _tme_const tme_shared tme_uint32_t *parts32;
56 tme_uint32_t part32;
57 _tme_const tme_shared tme_uint16_t *parts16;
58 tme_uint16_t part16;
59 _tme_const tme_shared tme_uint8_t *parts8;
60 tme_uint8_t part8;
61
62 assert (bus_boundary != 0 && bus_boundary <= host_boundary);
63
64 #ifdef TME_HAVE_INT64_T
65
66 if (host_boundary == sizeof(tme_uint64_t)) {
67
68 /* prepare to read the first 64-bit part of the memory: */
69 parts64 = (_tme_const tme_shared tme_uint64_t *) (((unsigned long) mem) & (((unsigned long) 0) - (64 / 8)));
70 size_skip = (((unsigned int) (unsigned long) mem) % (64 / 8)) * 8;
71 size_done = 0;
72
73 /* read the first 64-bit part of the memory: */
74 part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
75
76 /* on a little-endian host, we shift off the skip
77 data on the right, and shift the remaining data
78 up into position in the result: */
79 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
80 x = (((tme_uint16_t) (part64 >> size_skip)) << 0);
81 }
82
83 /* on a big-endian host, we shift off the skip data
84 on the left, and shift the remaining data down
85 into position in the result: */
86 else {
87 x = ((part64 << size_skip) >> ((64 - 16) + 0));
88 }
89 size_done = 64 - size_skip;
90
91 /* read at most one remaining 64-bit part of the memory: */
92 if (__tme_predict_false(size_done < 16)) {
93
94 /* make a boundary: */
95 tme_memory_barrier(mem, (16 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);
96
97 /* read the next 64-bit part of the memory: */
98 parts64++;
99 part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
100
101 /* on a little-endian host, we shift off the skip
102 data on the right, and shift the remaining data
103 up into position in the result: */
104 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
105 x |= (((tme_uint16_t) (part64 >> 0)) << size_done);
106 }
107
108 /* on a big-endian host, we shift off the skip data
109 on the left, and shift the remaining data down
110 into position in the result: */
111 else {
112 x |= ((part64 << 0) >> ((64 - 16) + size_done));
113 }
114 }
115 }
116
117 else
118
119 #endif /* TME_HAVE_INT64_T */
120
121 if (host_boundary == sizeof(tme_uint32_t)) {
122
123 /* prepare to read the first 32-bit part of the memory: */
124 parts32 = (_tme_const tme_shared tme_uint32_t *) (((unsigned long) mem) & (((unsigned long) 0) - (32 / 8)));
125 size_skip = (((unsigned int) (unsigned long) mem) % (32 / 8)) * 8;
126 size_done = 0;
127
128 /* read the first 32-bit part of the memory: */
129 part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
130
131 /* on a little-endian host, we shift off the skip
132 data on the right, and shift the remaining data
133 up into position in the result: */
134 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
135 x = (((tme_uint16_t) (part32 >> size_skip)) << 0);
136 }
137
138 /* on a big-endian host, we shift off the skip data
139 on the left, and shift the remaining data down
140 into position in the result: */
141 else {
142 x = ((part32 << size_skip) >> ((32 - 16) + 0));
143 }
144 size_done = 32 - size_skip;
145
146 /* read at most one remaining 32-bit part of the memory: */
147 if (__tme_predict_false(size_done < 16)) {
148
149 /* make a boundary: */
150 tme_memory_barrier(mem, (16 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);
151
152 /* read the next 32-bit part of the memory: */
153 parts32++;
154 part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
155
156 /* on a little-endian host, we shift off the skip
157 data on the right, and shift the remaining data
158 up into position in the result: */
159 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
160 x |= (((tme_uint16_t) (part32 >> 0)) << size_done);
161 }
162
163 /* on a big-endian host, we shift off the skip data
164 on the left, and shift the remaining data down
165 into position in the result: */
166 else {
167 x |= ((part32 << 0) >> ((32 - 16) + size_done));
168 }
169 }
170 }
171
172 else if (host_boundary == sizeof(tme_uint16_t)) {
173
174 /* prepare to read the first 16-bit part of the memory: */
175 parts16 = (_tme_const tme_shared tme_uint16_t *) (((unsigned long) mem) & (((unsigned long) 0) - (16 / 8)));
176 size_skip = (((unsigned int) (unsigned long) mem) % (16 / 8)) * 8;
177 size_done = 0;
178
179 /* read the first 16-bit part of the memory: */
180 part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
181
182 /* on a little-endian host, we shift off the skip
183 data on the right, and shift the remaining data
184 up into position in the result: */
185 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
186 x = (((tme_uint16_t) (part16 >> size_skip)) << 0);
187 }
188
189 /* on a big-endian host, we shift off the skip data
190 on the left, and shift the remaining data down
191 into position in the result: */
192 else {
193 x = ((((tme_uint16_t) part16) << ((16 - 16) + size_skip)) >> 0);
194 }
195 size_done = 16 - size_skip;
196
197 /* read at most one remaining 16-bit part of the memory: */
198 if (__tme_predict_false(size_done < 16)) {
199
200 /* make a boundary: */
201 tme_memory_barrier(mem, (16 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);
202
203 /* read the next 16-bit part of the memory: */
204 parts16++;
205 part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
206
207 /* on a little-endian host, we shift off the skip
208 data on the right, and shift the remaining data
209 up into position in the result: */
210 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
211 x |= (((tme_uint16_t) (part16 >> 0)) << size_done);
212 }
213
214 /* on a big-endian host, we shift off the skip data
215 on the left, and shift the remaining data down
216 into position in the result: */
217 else {
218 x |= ((((tme_uint16_t) part16) << ((16 - 16) + 0)) >> size_done);
219 }
220 }
221 }
222
223 else {
224
225 /* prepare to read the first 8-bit part of the memory: */
226 parts8 = (_tme_const tme_shared tme_uint8_t *) (((unsigned long) mem) & (((unsigned long) 0) - (8 / 8)));
227 size_skip = (((unsigned int) (unsigned long) mem) % (8 / 8)) * 8;
228 size_done = 0;
229
230 /* read the first 8-bit part of the memory: */
231 part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
232
233 /* on a little-endian host, we shift off the skip
234 data on the right, and shift the remaining data
235 up into position in the result: */
236 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
237 x = (((tme_uint16_t) (part8 >> size_skip)) << 0);
238 }
239
240 /* on a big-endian host, we shift off the skip data
241 on the left, and shift the remaining data down
242 into position in the result: */
243 else {
244 x = ((((tme_uint16_t) part8) << ((16 - 8) + size_skip)) >> 0);
245 }
246 size_done = 8 - size_skip;
247
248 /* read at most one remaining 8-bit part of the memory: */
249 if (__tme_predict_false(size_done < 16)) {
250
251 /* make a boundary: */
252 tme_memory_barrier(mem, (16 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);
253
254 /* read the next 8-bit part of the memory: */
255 parts8++;
256 part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
257
258 /* on a little-endian host, we shift off the skip
259 data on the right, and shift the remaining data
260 up into position in the result: */
261 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
262 x |= (((tme_uint16_t) (part8 >> 0)) << size_done);
263 }
264
265 /* on a big-endian host, we shift off the skip data
266 on the left, and shift the remaining data down
267 into position in the result: */
268 else {
269 x |= ((((tme_uint16_t) part8) << ((16 - 8) + 0)) >> size_done);
270 }
271 }
272 }
273
274 /* return the value read: */
275 return (x);
276 }
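/* illustrative sketch only, not generated by memory-auto.sh: a
   hypothetical caller might use the slow 16-bit bus read like this,
   assuming csr points into shared device memory covered by rwlock,
   that only byte alignment is guaranteed, that the emulated device bus
   is 16 bits wide, and that the host bus boundary is at least 16 bits
   (all names here are hypothetical): */
static tme_uint16_t
_tme_example_csr_read16(_tme_const tme_shared tme_uint16_t *csr,
                        tme_rwlock_t *rwlock)
{
  /* align_min of one byte, bus_boundary of two bytes: */
  return (tme_memory_bus_read16(csr, rwlock,
                                sizeof(tme_uint8_t),
                                sizeof(tme_uint16_t)));
}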
277
278 /* undefine the macro version of tme_memory_bus_write16: */
279 #undef tme_memory_bus_write16
280
281 /* the bus 16-bit write slow function: */
282 void
283 tme_memory_bus_write16(tme_shared tme_uint16_t *mem, tme_uint16_t x, tme_rwlock_t *rwlock, unsigned int align_min, unsigned int bus_boundary)
284 {
285 const unsigned int host_boundary = TME_MEMORY_BUS_BOUNDARY;
286 unsigned int size_skip;
287 unsigned int size_done;
288 #ifdef TME_HAVE_INT64_T
289 tme_shared tme_uint64_t *parts64;
290 tme_uint64_t part64;
291 tme_uint64_t part64_cmp;
292 #endif /* TME_HAVE_INT64_T */
293 tme_shared tme_uint32_t *parts32;
294 tme_uint32_t part32;
295 tme_uint32_t part32_cmp;
296 tme_shared tme_uint16_t *parts16;
297 tme_uint16_t part16;
298 tme_uint16_t part16_cmp;
299 tme_shared tme_uint8_t *parts8;
300 tme_uint8_t part8;
301 tme_uint8_t part8_cmp;
302
303 assert (bus_boundary != 0 && bus_boundary <= host_boundary);
304
305 #ifdef TME_HAVE_INT64_T
306
307 if (host_boundary == sizeof(tme_uint64_t)) {
308
309 /* prepare to write the first 64-bit part of the memory: */
310 parts64 = (tme_shared tme_uint64_t *) (((unsigned long) mem) & (((unsigned long) 0) - (64 / 8)));
311 size_skip = (((unsigned int) (unsigned long) mem) % (64 / 8)) * 8;
312 size_done = 0;
313
314 /* write the first 64-bit part of the memory: */
315 part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
316 do {
317 part64_cmp = part64;
318
319 /* on a little-endian host, we clear with zeroes
320 shifted up past the skip data, and then we
321 insert the data shifted up past the skip data: */
322 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
323 part64 &= (_tme_memory_type_mask(tme_uint64_t, + 0) ^ (((tme_uint64_t) _tme_memory_type_mask(tme_uint16_t, << 0)) << size_skip));
324 part64 |= (((tme_uint64_t) x) << size_skip);
325 }
326
327 /* on a big-endian host, we clear with zeroes
328 shifted down past the skip data, and then we
329 insert the data shifted down past the skip data: */
330 else {
331 part64 &= ~((((tme_uint64_t) _tme_memory_type_mask(tme_uint16_t, + 0)) << ((64 - 16) + 0)) >> size_skip);
332 part64 |= ((((tme_uint64_t) x) << (64 - 16)) >> size_skip);
333 }
334
335 /* loop until we can atomically update this part: */
336 part64 = tme_memory_atomic_cx64(parts64, part64_cmp, part64, rwlock, sizeof(tme_uint64_t));
337 } while (part64 != part64_cmp);
338 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
339 x >>= (64 - size_skip);
340 }
341 else {
342 x <<= (64 - size_skip);
343 }
344 size_done = 64 - size_skip;
345
346 /* write at most one remaining 64-bit part of the memory: */
347 if (__tme_predict_false(size_done < 16)) {
348
349 /* make a boundary: */
350 tme_memory_barrier(mem, (16 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
351
352 /* write the next 64-bit part of the memory: */
353 parts64++;
354 part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
355 do {
356 part64_cmp = part64;
357
358 /* on a little-endian host, we clear with zeroes
359 shifted up past the skip data, and then we
360 insert the data shifted up past the skip data: */
361 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
362 part64 &= (_tme_memory_type_mask(tme_uint64_t, + 0) ^ (((tme_uint64_t) _tme_memory_type_mask(tme_uint16_t, << size_done)) << 0));
363 part64 |= (((tme_uint64_t) x) << 0);
364 }
365
366 /* on a big-endian host, we clear with zeroes
367 shifted down past the skip data, and then we
368 insert the data shifted down past the skip data: */
369 else {
370 part64 &= ~((((tme_uint64_t) _tme_memory_type_mask(tme_uint16_t, + 0)) << ((64 - 16) + size_done)) >> 0);
371 part64 |= ((((tme_uint64_t) x) << (64 - 16)) >> 0);
372 }
373
374 /* loop until we can atomically update this part: */
375 part64 = tme_memory_atomic_cx64(parts64, part64_cmp, part64, rwlock, sizeof(tme_uint64_t));
376 } while (part64 != part64_cmp);
377 }
378 }
379
380 else
381
382 #endif /* TME_HAVE_INT64_T */
383
384 if (host_boundary == sizeof(tme_uint32_t)) {
385
386 /* prepare to write the first 32-bit part of the memory: */
387 parts32 = (tme_shared tme_uint32_t *) (((unsigned long) mem) & (((unsigned long) 0) - (32 / 8)));
388 size_skip = (((unsigned int) (unsigned long) mem) % (32 / 8)) * 8;
389 size_done = 0;
390
391 /* write the first 32-bit part of the memory: */
392 part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
393 do {
394 part32_cmp = part32;
395
396 /* on a little-endian host, we clear with zeroes
397 shifted up past the skip data, and then we
398 insert the data shifted up past the skip data: */
399 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
400 part32 &= (_tme_memory_type_mask(tme_uint32_t, + 0) ^ (((tme_uint32_t) _tme_memory_type_mask(tme_uint16_t, << 0)) << size_skip));
401 part32 |= (((tme_uint32_t) x) << size_skip);
402 }
403
404 /* on a big-endian host, we clear with zeroes
405 shifted down past the skip data, and then we
406 insert the data shifted down past the skip data: */
407 else {
408 part32 &= ~((((tme_uint32_t) _tme_memory_type_mask(tme_uint16_t, + 0)) << ((32 - 16) + 0)) >> size_skip);
409 part32 |= ((((tme_uint32_t) x) << (32 - 16)) >> size_skip);
410 }
411
412 /* loop until we can atomically update this part: */
413 part32 = tme_memory_atomic_cx32(parts32, part32_cmp, part32, rwlock, sizeof(tme_uint32_t));
414 } while (part32 != part32_cmp);
415 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
416 x >>= (32 - size_skip);
417 }
418 else {
419 x <<= (32 - size_skip);
420 }
421 size_done = 32 - size_skip;
422
423 /* write at most one remaining 32-bit part of the memory: */
424 if (__tme_predict_false(size_done < 16)) {
425
426 /* make a boundary: */
427 tme_memory_barrier(mem, (16 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
428
429 /* write the next 32-bit part of the memory: */
430 parts32++;
431 part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
432 do {
433 part32_cmp = part32;
434
435 /* on a little-endian host, we clear with zeroes
436 shifted up past the skip data, and then we
437 insert the data shifted up past the skip data: */
438 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
439 part32 &= (_tme_memory_type_mask(tme_uint32_t, + 0) ^ (((tme_uint32_t) _tme_memory_type_mask(tme_uint16_t, << size_done)) << 0));
440 part32 |= (((tme_uint32_t) x) << 0);
441 }
442
443 /* on a big-endian host, we clear with zeroes
444 shifted down past the skip data, and then we
445 insert the data shifted down past the skip data: */
446 else {
447 part32 &= ~((((tme_uint32_t) _tme_memory_type_mask(tme_uint16_t, + 0)) << ((32 - 16) + size_done)) >> 0);
448 part32 |= ((((tme_uint32_t) x) << (32 - 16)) >> 0);
449 }
450
451 /* loop until we can atomically update this part: */
452 part32 = tme_memory_atomic_cx32(parts32, part32_cmp, part32, rwlock, sizeof(tme_uint32_t));
453 } while (part32 != part32_cmp);
454 }
455 }
456
457 else if (host_boundary == sizeof(tme_uint16_t)) {
458
459 /* prepare to write the first 16-bit part of the memory: */
460 parts16 = (tme_shared tme_uint16_t *) (((unsigned long) mem) & (((unsigned long) 0) - (16 / 8)));
461 size_skip = (((unsigned int) (unsigned long) mem) % (16 / 8)) * 8;
462 size_done = 0;
463
464 /* write the first 16-bit part of the memory: */
465 part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
466 do {
467 part16_cmp = part16;
468
469 /* on a little-endian host, we clear with zeroes
470 shifted up past the skip data, and then we
471 insert the data shifted up past the skip data: */
472 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
473 part16 &= (_tme_memory_type_mask(tme_uint16_t, + 0) ^ (((tme_uint16_t) _tme_memory_type_mask(tme_uint16_t, << 0)) << size_skip));
474 part16 |= (((tme_uint16_t) x) << size_skip);
475 }
476
477 /* on a big-endian host, we clear with zeroes
478 shifted down past the skip data, and then we
479 insert the data shifted down past the skip data: */
480 else {
481 part16 &= ~(_tme_memory_type_mask(tme_uint16_t, << 0) >> size_skip);
482 part16 |= (x >> ((16 - 16) + size_skip));
483 }
484
485 /* loop until we can atomically update this part: */
486 part16 = tme_memory_atomic_cx16(parts16, part16_cmp, part16, rwlock, sizeof(tme_uint16_t));
487 } while (part16 != part16_cmp);
488 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
489 x >>= (16 - size_skip);
490 }
491 else {
492 x <<= (16 - size_skip);
493 }
494 size_done = 16 - size_skip;
495
496 /* write at most one remaining 16-bit part of the memory: */
497 if (__tme_predict_false(size_done < 16)) {
498
499 /* make a boundary: */
500 tme_memory_barrier(mem, (16 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
501
502 /* write the next 16-bit part of the memory: */
503 parts16++;
504 part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
505 do {
506 part16_cmp = part16;
507
508 /* on a little-endian host, we clear with zeroes
509 shifted up past the skip data, and then we
510 insert the data shifted up past the skip data: */
511 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
512 part16 &= (_tme_memory_type_mask(tme_uint16_t, + 0) ^ (((tme_uint16_t) _tme_memory_type_mask(tme_uint16_t, << size_done)) << 0));
513 part16 |= (((tme_uint16_t) x) << 0);
514 }
515
516 /* on a big-endian host, we clear with zeroes
517 shifted down past the skip data, and then we
518 insert the data shifted down past the skip data: */
519 else {
520 part16 &= ~(_tme_memory_type_mask(tme_uint16_t, << size_done) >> 0);
521 part16 |= (x >> ((16 - 16) + 0));
522 }
523
524 /* loop until we can atomically update this part: */
525 part16 = tme_memory_atomic_cx16(parts16, part16_cmp, part16, rwlock, sizeof(tme_uint16_t));
526 } while (part16 != part16_cmp);
527 }
528 }
529
530 else {
531
532 /* prepare to write the first 8-bit part of the memory: */
533 parts8 = (tme_shared tme_uint8_t *) (((unsigned long) mem) & (((unsigned long) 0) - (8 / 8)));
534 size_skip = (((unsigned int) (unsigned long) mem) % (8 / 8)) * 8;
535 size_done = 0;
536
537 /* write the first 8-bit part of the memory: */
538 part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
539 do {
540 part8_cmp = part8;
541
542 /* on a little-endian host, we clear with zeroes
543 shifted up past the skip data, and then we
544 insert the data shifted up past the skip data: */
545 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
546 part8 &= (_tme_memory_type_mask(tme_uint8_t, + 0) ^ (((tme_uint8_t) _tme_memory_type_mask(tme_uint16_t, << 0)) << size_skip));
547 part8 |= (((tme_uint8_t) x) << size_skip);
548 }
549
550 /* on a big-endian host, we clear with zeroes
551 shifted down past the skip data, and then we
552 insert the data shifted down past the skip data: */
553 else {
554 part8 &= ~(_tme_memory_type_mask(tme_uint8_t, << 0) >> size_skip);
555 part8 |= (x >> ((16 - 8) + size_skip));
556 }
557
558 /* loop until we can atomically update this part: */
559 part8 = tme_memory_atomic_cx8(parts8, part8_cmp, part8, rwlock, sizeof(tme_uint8_t));
560 } while (part8 != part8_cmp);
561 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
562 x >>= (8 - size_skip);
563 }
564 else {
565 x <<= (8 - size_skip);
566 }
567 size_done = 8 - size_skip;
568
569 /* write at most one remaining 8-bit part of the memory: */
570 if (__tme_predict_false(size_done < 16)) {
571
572 /* make a boundary: */
573 tme_memory_barrier(mem, (16 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
574
575 /* write the next 8-bit part of the memory: */
576 parts8++;
577 part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
578 do {
579 part8_cmp = part8;
580
581 /* on a little-endian host, we clear with zeroes
582 shifted up past the skip data, and then we
583 insert the data shifted up past the skip data: */
584 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
585 part8 &= (_tme_memory_type_mask(tme_uint8_t, + 0) ^ (((tme_uint8_t) _tme_memory_type_mask(tme_uint16_t, << size_done)) << 0));
586 part8 |= (((tme_uint8_t) x) << 0);
587 }
588
589 /* on a big-endian host, we clear with zeroes
590 shifted down past the skip data, and then we
591 insert the data shifted down past the skip data: */
592 else {
593 part8 &= ~(_tme_memory_type_mask(tme_uint8_t, << size_done) >> 0);
594 part8 |= (x >> ((16 - 8) + 0));
595 }
596
597 /* loop until we can atomically update this part: */
598 part8 = tme_memory_atomic_cx8(parts8, part8_cmp, part8, rwlock, sizeof(tme_uint8_t));
599 } while (part8 != part8_cmp);
600 }
601 }
602 }
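/* illustrative sketch only, not generated by memory-auto.sh: the
   matching hypothetical 16-bit write, with the same byte-alignment,
   16-bit-bus, and host-boundary assumptions as the read sketch above: */
static void
_tme_example_csr_write16(tme_shared tme_uint16_t *csr,
                         tme_uint16_t value,
                         tme_rwlock_t *rwlock)
{
  tme_memory_bus_write16(csr, value, rwlock,
                         sizeof(tme_uint8_t),
                         sizeof(tme_uint16_t));
}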
603
604 /* undefine the macro version of tme_memory_bus_read32: */
605 #undef tme_memory_bus_read32
606
607 /* the bus 32-bit read slow function: */
608 tme_uint32_t
609 tme_memory_bus_read32(_tme_const tme_shared tme_uint32_t *mem, tme_rwlock_t *rwlock, unsigned int align_min, unsigned int bus_boundary)
610 {
611 const unsigned int host_boundary = TME_MEMORY_BUS_BOUNDARY;
612 unsigned int size_skip;
613 unsigned int size_done;
614 tme_uint32_t x;
615 #ifdef TME_HAVE_INT64_T
616 _tme_const tme_shared tme_uint64_t *parts64;
617 tme_uint64_t part64;
618 #endif /* TME_HAVE_INT64_T */
619 _tme_const tme_shared tme_uint32_t *parts32;
620 tme_uint32_t part32;
621 _tme_const tme_shared tme_uint16_t *parts16;
622 tme_uint16_t part16;
623 _tme_const tme_shared tme_uint8_t *parts8;
624 tme_uint8_t part8;
625
626 assert (bus_boundary != 0 && bus_boundary <= host_boundary);
627
628 #ifdef TME_HAVE_INT64_T
629
630 if (host_boundary == sizeof(tme_uint64_t)) {
631
632 /* prepare to read the first 64-bit part of the memory: */
633 parts64 = (_tme_const tme_shared tme_uint64_t *) (((unsigned long) mem) & (((unsigned long) 0) - (64 / 8)));
634 size_skip = (((unsigned int) (unsigned long) mem) % (64 / 8)) * 8;
635 size_done = 0;
636
637 /* read the first 64-bit part of the memory: */
638 part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
639
640 /* on a little-endian host, we shift off the skip
641 data on the right, and shift the remaining data
642 up into position in the result: */
643 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
644 x = (((tme_uint32_t) (part64 >> size_skip)) << 0);
645 }
646
647 /* on a big-endian host, we shift off the skip data
648 on the left, and shift the remaining data down
649 into position in the result: */
650 else {
651 x = ((part64 << size_skip) >> ((64 - 32) + 0));
652 }
653 size_done = 64 - size_skip;
654
655 /* read at most one remaining 64-bit part of the memory: */
656 if (__tme_predict_false(size_done < 32)) {
657
658 /* make a boundary: */
659 tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);
660
661 /* read the next 64-bit part of the memory: */
662 parts64++;
663 part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
664
665 /* on a little-endian host, we shift off the skip
666 data on the right, and shift the remaining data
667 up into position in the result: */
668 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
669 x |= (((tme_uint32_t) (part64 >> 0)) << size_done);
670 }
671
672 /* on a big-endian host, we shift off the skip data
673 on the left, and shift the remaining data down
674 into position in the result: */
675 else {
676 x |= ((part64 << 0) >> ((64 - 32) + size_done));
677 }
678 }
679 }
680
681 else
682
683 #endif /* TME_HAVE_INT64_T */
684
685 if (host_boundary == sizeof(tme_uint32_t)) {
686
687 /* prepare to read the first 32-bit part of the memory: */
688 parts32 = (_tme_const tme_shared tme_uint32_t *) (((unsigned long) mem) & (((unsigned long) 0) - (32 / 8)));
689 size_skip = (((unsigned int) (unsigned long) mem) % (32 / 8)) * 8;
690 size_done = 0;
691
692 /* read the first 32-bit part of the memory: */
693 part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
694
695 /* on a little-endian host, we shift off the skip
696 data on the right, and shift the remaining data
697 up into position in the result: */
698 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
699 x = (((tme_uint32_t) (part32 >> size_skip)) << 0);
700 }
701
702 /* on a big-endian host, we shift off the skip data
703 on the left, and shift the remaining data down
704 into position in the result: */
705 else {
706 x = ((((tme_uint32_t) part32) << ((32 - 32) + size_skip)) >> 0);
707 }
708 size_done = 32 - size_skip;
709
710 /* read at most one remaining 32-bit part of the memory: */
711 if (__tme_predict_false(size_done < 32)) {
712
713 /* make a boundary: */
714 tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);
715
716 /* read the next 32-bit part of the memory: */
717 parts32++;
718 part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
719
720 /* on a little-endian host, we shift off the skip
721 data on the right, and shift the remaining data
722 up into position in the result: */
723 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
724 x |= (((tme_uint32_t) (part32 >> 0)) << size_done);
725 }
726
727 /* on a big-endian host, we shift off the skip data
728 on the left, and shift the remaining data down
729 into position in the result: */
730 else {
731 x |= ((((tme_uint32_t) part32) << ((32 - 32) + 0)) >> size_done);
732 }
733 }
734 }
735
736 else if (host_boundary == sizeof(tme_uint16_t)) {
737
738 /* prepare to read the first 16-bit part of the memory: */
739 parts16 = (_tme_const tme_shared tme_uint16_t *) (((unsigned long) mem) & (((unsigned long) 0) - (16 / 8)));
740 size_skip = (((unsigned int) (unsigned long) mem) % (16 / 8)) * 8;
741 size_done = 0;
742
743 /* read the first 16-bit part of the memory: */
744 part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
745
746 /* on a little-endian host, we shift off the skip
747 data on the right, and shift the remaining data
748 up into position in the result: */
749 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
750 x = (((tme_uint32_t) (part16 >> size_skip)) << 0);
751 }
752
753 /* on a big-endian host, we shift off the skip data
754 on the left, and shift the remaining data down
755 into position in the result: */
756 else {
757 x = ((((tme_uint32_t) part16) << ((32 - 16) + size_skip)) >> 0);
758 }
759 size_done = 16 - size_skip;
760
761 /* read any remaining 16-bit parts of the memory: */
762 for (; size_done < 32; size_done += 16) {
763
764 /* make a boundary: */
765 tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);
766
767 /* read the next 16-bit part of the memory: */
768 parts16++;
769 part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
770
771 /* on a little-endian host, we shift off the skip
772 data on the right, and shift the remaining data
773 up into position in the result: */
774 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
775 x |= (((tme_uint32_t) (part16 >> 0)) << size_done);
776 }
777
778 /* on a big-endian host, we shift off the skip data
779 on the left, and shift the remaining data down
780 into position in the result: */
781 else {
782 x |= ((((tme_uint32_t) part16) << ((32 - 16) + 0)) >> size_done);
783 }
784 }
785 }
786
787 else {
788
789 /* prepare to read the first 8-bit part of the memory: */
790 parts8 = (_tme_const tme_shared tme_uint8_t *) (((unsigned long) mem) & (((unsigned long) 0) - (8 / 8)));
791 size_skip = (((unsigned int) (unsigned long) mem) % (8 / 8)) * 8;
792 size_done = 0;
793
794 /* read the first 8-bit part of the memory: */
795 part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
796
797 /* on a little-endian host, we shift off the skip
798 data on the right, and shift the remaining data
799 up into position in the result: */
800 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
801 x = (((tme_uint32_t) (part8 >> size_skip)) << 0);
802 }
803
804 /* on a big-endian host, we shift off the skip data
805 on the left, and shift the remaining data down
806 into position in the result: */
807 else {
808 x = ((((tme_uint32_t) part8) << ((32 - 8) + size_skip)) >> 0);
809 }
810 size_done = 8 - size_skip;
811
812 /* read any remaining 8-bit parts of the memory: */
813 for (; size_done < 32; size_done += 8) {
814
815 /* make a boundary: */
816 tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);
817
818 /* read the next 8-bit part of the memory: */
819 parts8++;
820 part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
821
822 /* on a little-endian host, we shift off the skip
823 data on the right, and shift the remaining data
824 up into position in the result: */
825 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
826 x |= (((tme_uint32_t) (part8 >> 0)) << size_done);
827 }
828
829 /* on a big-endian host, we shift off the skip data
830 on the left, and shift the remaining data down
831 into position in the result: */
832 else {
833 x |= ((((tme_uint32_t) part8) << ((32 - 8) + 0)) >> size_done);
834 }
835 }
836 }
837
838 /* return the value read: */
839 return (x);
840 }
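/* illustrative sketch only, not generated by memory-auto.sh: the
   shift/mask arithmetic used by the little-endian branches above,
   reduced to a 32-bit host boundary.  given the two aligned 32-bit
   parts that straddle an unaligned 32-bit datum and the byte offset of
   the datum within the first part, this assembles the same value that
   the slow read assembles (all names here are hypothetical): */
static tme_uint32_t
_tme_example_assemble32_le(tme_uint32_t part_first,
                           tme_uint32_t part_second,
                           unsigned int byte_offset)
{
  unsigned int size_skip = byte_offset * 8;

  /* an aligned datum comes entirely from the first part; this also
     avoids an undefined 32-bit shift by 32 below, just as the
     size_done checks do in the functions above: */
  if (size_skip == 0) {
    return (part_first);
  }

  /* drop the skipped low-order bytes of the first part, then fill the
     high-order bytes of the result from the second part: */
  return ((part_first >> size_skip)
          | (part_second << (32 - size_skip)));
}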
841
842 /* undefine the macro version of tme_memory_bus_write32: */
843 #undef tme_memory_bus_write32
844
845 /* the bus 32-bit write slow function: */
846 void
847 tme_memory_bus_write32(tme_shared tme_uint32_t *mem, tme_uint32_t x, tme_rwlock_t *rwlock, unsigned int align_min, unsigned int bus_boundary)
848 {
849 const unsigned int host_boundary = TME_MEMORY_BUS_BOUNDARY;
850 unsigned int size_skip;
851 unsigned int size_done;
852 #ifdef TME_HAVE_INT64_T
853 tme_shared tme_uint64_t *parts64;
854 tme_uint64_t part64;
855 tme_uint64_t part64_cmp;
856 #endif /* TME_HAVE_INT64_T */
857 tme_shared tme_uint32_t *parts32;
858 tme_uint32_t part32;
859 tme_uint32_t part32_cmp;
860 tme_shared tme_uint16_t *parts16;
861 tme_uint16_t part16;
862 tme_uint16_t part16_cmp;
863 tme_shared tme_uint8_t *parts8;
864 tme_uint8_t part8;
865 tme_uint8_t part8_cmp;
866
867 assert (bus_boundary != 0 && bus_boundary <= host_boundary);
868
869 #ifdef TME_HAVE_INT64_T
870
871 if (host_boundary == sizeof(tme_uint64_t)) {
872
873 /* prepare to write the first 64-bit part of the memory: */
874 parts64 = (tme_shared tme_uint64_t *) (((unsigned long) mem) & (((unsigned long) 0) - (64 / 8)));
875 size_skip = (((unsigned int) (unsigned long) mem) % (64 / 8)) * 8;
876 size_done = 0;
877
878 /* write the first 64-bit part of the memory: */
879 part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
880 do {
881 part64_cmp = part64;
882
883 /* on a little-endian host, we clear with zeroes
884 shifted up past the skip data, and then we
885 insert the data shifted up past the skip data: */
886 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
887 part64 &= (_tme_memory_type_mask(tme_uint64_t, + 0) ^ (((tme_uint64_t) _tme_memory_type_mask(tme_uint32_t, << 0)) << size_skip));
888 part64 |= (((tme_uint64_t) x) << size_skip);
889 }
890
891 /* on a big-endian host, we clear with zeroes
892 shifted down past the skip data, and then we
893 insert the data shifted down past the skip data: */
894 else {
895 part64 &= ~((((tme_uint64_t) _tme_memory_type_mask(tme_uint32_t, + 0)) << ((64 - 32) + 0)) >> size_skip);
896 part64 |= ((((tme_uint64_t) x) << (64 - 32)) >> size_skip);
897 }
898
899 /* loop until we can atomically update this part: */
900 part64 = tme_memory_atomic_cx64(parts64, part64_cmp, part64, rwlock, sizeof(tme_uint64_t));
901 } while (part64 != part64_cmp);
902 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
903 x >>= (64 - size_skip);
904 }
905 else {
906 x <<= (64 - size_skip);
907 }
908 size_done = 64 - size_skip;
909
910 /* write at most one remaining 64-bit part of the memory: */
911 if (__tme_predict_false(size_done < 32)) {
912
913 /* make a boundary: */
914 tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
915
916 /* write the next 64-bit part of the memory: */
917 parts64++;
918 part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
919 do {
920 part64_cmp = part64;
921
922 /* on a little-endian host, we clear with zeroes
923 shifted up past the skip data, and then we
924 insert the data shifted up past the skip data: */
925 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
926 part64 &= (_tme_memory_type_mask(tme_uint64_t, + 0) ^ (((tme_uint64_t) _tme_memory_type_mask(tme_uint32_t, << size_done)) << 0));
927 part64 |= (((tme_uint64_t) x) << 0);
928 }
929
930 /* on a big-endian host, we clear with zeroes
931 shifted down past the skip data, and then we
932 insert the data shifted down past the skip data: */
933 else {
934 part64 &= ~((((tme_uint64_t) _tme_memory_type_mask(tme_uint32_t, + 0)) << ((64 - 32) + size_done)) >> 0);
935 part64 |= ((((tme_uint64_t) x) << (64 - 32)) >> 0);
936 }
937
938 /* loop until we can atomically update this part: */
939 part64 = tme_memory_atomic_cx64(parts64, part64_cmp, part64, rwlock, sizeof(tme_uint64_t));
940 } while (part64 != part64_cmp);
941 }
942 }
943
944 else
945
946 #endif /* TME_HAVE_INT64_T */
947
948 if (host_boundary == sizeof(tme_uint32_t)) {
949
950 /* prepare to write the first 32-bit part of the memory: */
951 parts32 = (tme_shared tme_uint32_t *) (((unsigned long) mem) & (((unsigned long) 0) - (32 / 8)));
952 size_skip = (((unsigned int) (unsigned long) mem) % (32 / 8)) * 8;
953 size_done = 0;
954
955 /* write the first 32-bit part of the memory: */
956 part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
957 do {
958 part32_cmp = part32;
959
960 /* on a little-endian host, we clear with zeroes
961 shifted up past the skip data, and then we
962 insert the data shifted up past the skip data: */
963 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
964 part32 &= (_tme_memory_type_mask(tme_uint32_t, + 0) ^ (((tme_uint32_t) _tme_memory_type_mask(tme_uint32_t, << 0)) << size_skip));
965 part32 |= (((tme_uint32_t) x) << size_skip);
966 }
967
968 /* on a big-endian host, we clear with zeroes
969 shifted down past the skip data, and then we
970 insert the data shifted down past the skip data: */
971 else {
972 part32 &= ~(_tme_memory_type_mask(tme_uint32_t, << 0) >> size_skip);
973 part32 |= (x >> ((32 - 32) + size_skip));
974 }
975
976 /* loop until we can atomically update this part: */
977 part32 = tme_memory_atomic_cx32(parts32, part32_cmp, part32, rwlock, sizeof(tme_uint32_t));
978 } while (part32 != part32_cmp);
979 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
980 x >>= (32 - size_skip);
981 }
982 else {
983 x <<= (32 - size_skip);
984 }
985 size_done = 32 - size_skip;
986
987 /* write at most one remaining 32-bit part of the memory: */
988 if (__tme_predict_false(size_done < 32)) {
989
990 /* make a boundary: */
991 tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
992
993 /* write the next 32-bit part of the memory: */
994 parts32++;
995 part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
996 do {
997 part32_cmp = part32;
998
999 /* on a little-endian host, we clear with zeroes
1000 shifted up past the skip data, and then we
1001 insert the data shifted up past the skip data: */
1002 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1003 part32 &= (_tme_memory_type_mask(tme_uint32_t, + 0) ^ (((tme_uint32_t) _tme_memory_type_mask(tme_uint32_t, << size_done)) << 0));
1004 part32 |= (((tme_uint32_t) x) << 0);
1005 }
1006
1007 /* on a big-endian host, we clear with zeroes
1008 shifted down past the skip data, and then we
1009 insert the data shifted down past the skip data: */
1010 else {
1011 part32 &= ~(_tme_memory_type_mask(tme_uint32_t, << size_done) >> 0);
1012 part32 |= (x >> ((32 - 32) + 0));
1013 }
1014
1015 /* loop until we can atomically update this part: */
1016 part32 = tme_memory_atomic_cx32(parts32, part32_cmp, part32, rwlock, sizeof(tme_uint32_t));
1017 } while (part32 != part32_cmp);
1018 }
1019 }
1020
1021 else if (host_boundary == sizeof(tme_uint16_t)) {
1022
1023 /* prepare to write the first 16-bit part of the memory: */
1024 parts16 = (tme_shared tme_uint16_t *) (((unsigned long) mem) & (((unsigned long) 0) - (16 / 8)));
1025 size_skip = (((unsigned int) (unsigned long) mem) % (16 / 8)) * 8;
1026 size_done = 0;
1027
1028 /* write the first 16-bit part of the memory: */
1029 part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
1030 do {
1031 part16_cmp = part16;
1032
1033 /* on a little-endian host, we clear with zeroes
1034 shifted up past the skip data, and then we
1035 insert the data shifted up past the skip data: */
1036 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1037 part16 &= (_tme_memory_type_mask(tme_uint16_t, + 0) ^ (((tme_uint16_t) _tme_memory_type_mask(tme_uint32_t, << 0)) << size_skip));
1038 part16 |= (((tme_uint16_t) x) << size_skip);
1039 }
1040
1041 /* on a big-endian host, we clear with zeroes
1042 shifted down past the skip data, and then we
1043 insert the data shifted down past the skip data: */
1044 else {
1045 part16 &= ~(_tme_memory_type_mask(tme_uint16_t, << 0) >> size_skip);
1046 part16 |= (x >> ((32 - 16) + size_skip));
1047 }
1048
1049 /* loop until we can atomically update this part: */
1050 part16 = tme_memory_atomic_cx16(parts16, part16_cmp, part16, rwlock, sizeof(tme_uint16_t));
1051 } while (part16 != part16_cmp);
1052 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1053 x >>= (16 - size_skip);
1054 }
1055 else {
1056 x <<= (16 - size_skip);
1057 }
1058 size_done = 16 - size_skip;
1059
1060 /* try to write one full 16-bit part of memory: */
1061 if (__tme_predict_true(size_done <= (32 - 16))) {
1062
1063 /* make a boundary: */
1064 tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
1065
1066 /* write a full 16-bit part of memory: */
1067 part16 = (x >> ((TME_ENDIAN_NATIVE == TME_ENDIAN_BIG) * (32 - 16)));
1068 parts16++;
1069 tme_memory_atomic_write16(parts16, part16, rwlock, sizeof(tme_uint16_t));
1070 size_done += 16;
1071 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1072 x >>= 16;
1073 }
1074 else {
1075 x <<= 16;
1076 }
1077 }
1078
1079 /* write at most one remaining 16-bit part of the memory: */
1080 if (__tme_predict_false(size_done < 32)) {
1081
1082 /* make a boundary: */
1083 tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
1084
1085 /* write the next 16-bit part of the memory: */
1086 parts16++;
1087 part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
1088 do {
1089 part16_cmp = part16;
1090
1091 /* on a little-endian host, we clear with zeroes
1092 shifted up past the skip data, and then we
1093 insert the data shifted up past the skip data: */
1094 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1095 part16 &= (_tme_memory_type_mask(tme_uint16_t, + 0) ^ (((tme_uint16_t) _tme_memory_type_mask(tme_uint32_t, << size_done)) << 0));
1096 part16 |= (((tme_uint16_t) x) << 0);
1097 }
1098
1099 /* on a big-endian host, we clear with zeroes
1100 shifted down past the skip data, and then we
1101 insert the data shifted down past the skip data: */
1102 else {
1103 part16 &= ~(_tme_memory_type_mask(tme_uint16_t, << size_done) >> 0);
1104 part16 |= (x >> ((32 - 16) + 0));
1105 }
1106
1107 /* loop until we can atomically update this part: */
1108 part16 = tme_memory_atomic_cx16(parts16, part16_cmp, part16, rwlock, sizeof(tme_uint16_t));
1109 } while (part16 != part16_cmp);
1110 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1111 x >>= (16 - 0);
1112 }
1113 else {
1114 x <<= (16 - 0);
1115 }
1116 }
1117 }
1118
1119 else {
1120
1121 /* prepare to write the first 8-bit part of the memory: */
1122 parts8 = (tme_shared tme_uint8_t *) (((unsigned long) mem) & (((unsigned long) 0) - (8 / 8)));
1123 size_skip = (((unsigned int) (unsigned long) mem) % (8 / 8)) * 8;
1124 size_done = 0;
1125
1126 /* write the first 8-bit part of the memory: */
1127 part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
1128 do {
1129 part8_cmp = part8;
1130
1131 /* on a little-endian host, we clear with zeroes
1132 shifted up past the skip data, and then we
1133 insert the data shifted up past the skip data: */
1134 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1135 part8 &= (_tme_memory_type_mask(tme_uint8_t, + 0) ^ (((tme_uint8_t) _tme_memory_type_mask(tme_uint32_t, << 0)) << size_skip));
1136 part8 |= (((tme_uint8_t) x) << size_skip);
1137 }
1138
1139 /* on a big-endian host, we clear with zeroes
1140 shifted down past the skip data, and then we
1141 insert the data shifted down past the skip data: */
1142 else {
1143 part8 &= ~(_tme_memory_type_mask(tme_uint8_t, << 0) >> size_skip);
1144 part8 |= (x >> ((32 - 8) + size_skip));
1145 }
1146
1147 /* loop until we can atomically update this part: */
1148 part8 = tme_memory_atomic_cx8(parts8, part8_cmp, part8, rwlock, sizeof(tme_uint8_t));
1149 } while (part8 != part8_cmp);
1150 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1151 x >>= (8 - size_skip);
1152 }
1153 else {
1154 x <<= (8 - size_skip);
1155 }
1156 size_done = 8 - size_skip;
1157
1158 /* write as many full 8-bit parts of the memory as we can: */
1159 for (; size_done <= (32 - 8); ) {
1160
1161 /* make a boundary: */
1162 tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
1163
1164 /* write a full 8-bit part of memory: */
1165 part8 = (x >> ((TME_ENDIAN_NATIVE == TME_ENDIAN_BIG) * (32 - 8)));
1166 parts8++;
1167 tme_memory_atomic_write8(parts8, part8, rwlock, sizeof(tme_uint8_t));
1168 size_done += 8;
1169 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1170 x >>= 8;
1171 }
1172 else {
1173 x <<= 8;
1174 }
1175 }
1176
1177 /* write at most one remaining 8-bit part of the memory: */
1178 if (__tme_predict_false(size_done < 32)) {
1179
1180 /* make a boundary: */
1181 tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
1182
1183 /* write the next 8-bit part of the memory: */
1184 parts8++;
1185 part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
1186 do {
1187 part8_cmp = part8;
1188
1189 /* on a little-endian host, we clear with zeroes
1190 shifted up past the skip data, and then we
1191 insert the data shifted up past the skip data: */
1192 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1193 part8 &= (_tme_memory_type_mask(tme_uint8_t, + 0) ^ (((tme_uint8_t) _tme_memory_type_mask(tme_uint32_t, << size_done)) << 0));
1194 part8 |= (((tme_uint8_t) x) << 0);
1195 }
1196
1197 /* on a big-endian host, we clear with zeroes
1198 shifted down past the skip data, and then we
1199 insert the data shifted down past the skip data: */
1200 else {
1201 part8 &= ~(_tme_memory_type_mask(tme_uint8_t, << size_done) >> 0);
1202 part8 |= (x >> ((32 - 8) + 0));
1203 }
1204
1205 /* loop until we can atomically update this part: */
1206 part8 = tme_memory_atomic_cx8(parts8, part8_cmp, part8, rwlock, sizeof(tme_uint8_t));
1207 } while (part8 != part8_cmp);
1208 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1209 x >>= (8 - 0);
1210 }
1211 else {
1212 x <<= (8 - 0);
1213 }
1214 }
1215 }
1216 }
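/* illustrative sketch only, not generated by memory-auto.sh: the
   read/compare-and-exchange retry pattern used by all of the slow
   writes above, reduced to a single aligned 32-bit part.  the clear
   mask selects the bits of the part that belong to the datum being
   written, and set holds the new data already shifted into position
   (all names here are hypothetical): */
static void
_tme_example_rmw32(tme_shared tme_uint32_t *part_ptr,
                   tme_uint32_t clear,
                   tme_uint32_t set,
                   tme_rwlock_t *rwlock)
{
  tme_uint32_t part;
  tme_uint32_t part_cmp;

  /* read the current value of the part: */
  part = tme_memory_atomic_read32(part_ptr, rwlock, sizeof(tme_uint32_t));
  do {

    /* make the updated value from the value last seen: */
    part_cmp = part;
    part = (part & ~clear) | set;

    /* try to install the updated value; tme_memory_atomic_cx32()
       returns the value that was in memory, which equals part_cmp
       only if the exchange succeeded: */
    part = tme_memory_atomic_cx32(part_ptr, part_cmp, part, rwlock, sizeof(tme_uint32_t));
  } while (part != part_cmp);
}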
1217
1218 #ifdef TME_HAVE_INT64_T
1219
1220 /* undefine the macro version of tme_memory_bus_read64: */
1221 #undef tme_memory_bus_read64
1222
1223 /* the bus 64-bit read slow function: */
1224 tme_uint64_t
1225 tme_memory_bus_read64(_tme_const tme_shared tme_uint64_t *mem, tme_rwlock_t *rwlock, unsigned int align_min, unsigned int bus_boundary)
1226 {
1227 const unsigned int host_boundary = TME_MEMORY_BUS_BOUNDARY;
1228 unsigned int size_skip;
1229 unsigned int size_done;
1230 tme_uint64_t x;
1231 #ifdef TME_HAVE_INT64_T
1232 _tme_const tme_shared tme_uint64_t *parts64;
1233 tme_uint64_t part64;
1234 #endif /* TME_HAVE_INT64_T */
1235 _tme_const tme_shared tme_uint32_t *parts32;
1236 tme_uint32_t part32;
1237 _tme_const tme_shared tme_uint16_t *parts16;
1238 tme_uint16_t part16;
1239 _tme_const tme_shared tme_uint8_t *parts8;
1240 tme_uint8_t part8;
1241
1242 assert (bus_boundary != 0 && bus_boundary <= host_boundary);
1243
1244 #ifdef TME_HAVE_INT64_T
1245
1246 if (host_boundary == sizeof(tme_uint64_t)) {
1247
1248 /* prepare to read the first 64-bit part of the memory: */
1249 parts64 = (_tme_const tme_shared tme_uint64_t *) (((unsigned long) mem) & (((unsigned long) 0) - (64 / 8)));
1250 size_skip = (((unsigned int) (unsigned long) mem) % (64 / 8)) * 8;
1251 size_done = 0;
1252
1253 /* read the first 64-bit part of the memory: */
1254 part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
1255
1256 /* on a little-endian host, we shift off the skip
1257 data on the right, and shift the remaining data
1258 up into position in the result: */
1259 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1260 x = (((tme_uint64_t) (part64 >> size_skip)) << 0);
1261 }
1262
1263 /* on a big-endian host, we shift off the skip data
1264 on the left, and shift the remaining data down
1265 into position in the result: */
1266 else {
1267 x = ((((tme_uint64_t) part64) << ((64 - 64) + size_skip)) >> 0);
1268 }
1269 size_done = 64 - size_skip;
1270
1271 /* read at most one remaining 64-bit part of the memory: */
1272 if (__tme_predict_false(size_done < 64)) {
1273
1274 /* make a boundary: */
1275 tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);
1276
1277 /* read the next 64-bit part of the memory: */
1278 parts64++;
1279 part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
1280
1281 /* on a little-endian host, we shift off the skip
1282 data on the right, and shift the remaining data
1283 up into position in the result: */
1284 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1285 x |= (((tme_uint64_t) (part64 >> 0)) << size_done);
1286 }
1287
1288 /* on a big-endian host, we shift off the skip data
1289 on the left, and shift the remaining data down
1290 into position in the result: */
1291 else {
1292 x |= ((((tme_uint64_t) part64) << ((64 - 64) + 0)) >> size_done);
1293 }
1294 }
1295 }
1296
1297 else
1298
1299 #endif /* TME_HAVE_INT64_T */
1300
1301 if (host_boundary == sizeof(tme_uint32_t)) {
1302
1303 /* prepare to read the first 32-bit part of the memory: */
1304 parts32 = (_tme_const tme_shared tme_uint32_t *) (((unsigned long) mem) & (((unsigned long) 0) - (32 / 8)));
1305 size_skip = (((unsigned int) (unsigned long) mem) % (32 / 8)) * 8;
1306 size_done = 0;
1307
1308 /* read the first 32-bit part of the memory: */
1309 part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
1310
1311 /* on a little-endian host, we shift off the skip
1312 data on the right, and shift the remaining data
1313 up into position in the result: */
1314 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1315 x = (((tme_uint64_t) (part32 >> size_skip)) << 0);
1316 }
1317
1318 /* on a big-endian host, we shift off the skip data
1319 on the left, and shift the remaining data down
1320 into position in the result: */
1321 else {
1322 x = ((((tme_uint64_t) part32) << ((64 - 32) + size_skip)) >> 0);
1323 }
1324 size_done = 32 - size_skip;
1325
1326 /* read any remaining 32-bit parts of the memory: */
1327 for (; size_done < 64; size_done += 32) {
1328
1329 /* make a boundary: */
1330 tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);
1331
1332 /* read the next 32-bit part of the memory: */
1333 parts32++;
1334 part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
1335
1336 /* on a little-endian host, we shift off the skip
1337 data on the right, and shift the remaining data
1338 up into position in the result: */
1339 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1340 x |= (((tme_uint64_t) (part32 >> 0)) << size_done);
1341 }
1342
1343 /* on a big-endian host, we shift off the skip data
1344 on the left, and shift the remaining data down
1345 into position in the result: */
1346 else {
1347 x |= ((((tme_uint64_t) part32) << ((64 - 32) + 0)) >> size_done);
1348 }
1349 }
1350 }
1351
1352 else if (host_boundary == sizeof(tme_uint16_t)) {
1353
1354 /* prepare to read the first 16-bit part of the memory: */
1355 parts16 = (_tme_const tme_shared tme_uint16_t *) (((unsigned long) mem) & (((unsigned long) 0) - (16 / 8)));
1356 size_skip = (((unsigned int) (unsigned long) mem) % (16 / 8)) * 8;
1357 size_done = 0;
1358
1359 /* read the first 16-bit part of the memory: */
1360 part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
1361
1362 /* on a little-endian host, we shift off the skip
1363 data on the right, and shift the remaining data
1364 up into position in the result: */
1365 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1366 x = (((tme_uint64_t) (part16 >> size_skip)) << 0);
1367 }
1368
1369 /* on a big-endian host, we shift off the skip data
1370 on the left, and shift the remaining data down
1371 into position in the result: */
1372 else {
1373 x = ((((tme_uint64_t) part16) << ((64 - 16) + size_skip)) >> 0);
1374 }
1375 size_done = 16 - size_skip;
1376
1377 /* read any remaining 16-bit parts of the memory: */
1378 for (; size_done < 64; size_done += 16) {
1379
1380 /* make a boundary: */
1381 tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);
1382
1383 /* read the next 16-bit part of the memory: */
1384 parts16++;
1385 part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
1386
1387 /* on a little-endian host, we shift off the skip
1388 data on the right, and shift the remaining data
1389 up into position in the result: */
1390 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1391 x |= (((tme_uint64_t) (part16 >> 0)) << size_done);
1392 }
1393
1394 /* on a big-endian host, we shift off the skip data
1395 on the left, and shift the remaining data down
1396 into position in the result: */
1397 else {
1398 x |= ((((tme_uint64_t) part16) << ((64 - 16) + 0)) >> size_done);
1399 }
1400 }
1401 }
1402
1403 else {
1404
1405 /* prepare to read the first 8-bit part of the memory: */
1406 parts8 = (_tme_const tme_shared tme_uint8_t *) (((unsigned long) mem) & (((unsigned long) 0) - (8 / 8)));
1407 size_skip = (((unsigned int) (unsigned long) mem) % (8 / 8)) * 8;
1408 size_done = 0;
1409
1410 /* read the first 8-bit part of the memory: */
1411 part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
1412
1413 /* on a little-endian host, we shift off the skip
1414 data on the right, and shift the remaining data
1415 up into position in the result: */
1416 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1417 x = (((tme_uint64_t) (part8 >> size_skip)) << 0);
1418 }
1419
1420 /* on a big-endian host, we shift off the skip data
1421 on the left, and shift the remaining data down
1422 into position in the result: */
1423 else {
1424 x = ((((tme_uint64_t) part8) << ((64 - 8) + size_skip)) >> 0);
1425 }
1426 size_done = 8 - size_skip;
1427
1428 /* read any remaining 8-bit parts of the memory: */
1429 for (; size_done < 64; size_done += 8) {
1430
1431 /* make a boundary: */
1432 tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);
1433
1434 /* read the next 8-bit part of the memory: */
1435 parts8++;
1436 part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
1437
1438 /* on a little-endian host, we shift off the skip
1439 data on the right, and shift the remaining data
1440 up into position in the result: */
1441 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1442 x |= (((tme_uint64_t) (part8 >> 0)) << size_done);
1443 }
1444
1445 /* on a big-endian host, we shift off the skip data
1446 on the left, and shift the remaining data down
1447 into position in the result: */
1448 else {
1449 x |= ((((tme_uint64_t) part8) << ((64 - 8) + 0)) >> size_done);
1450 }
1451 }
1452 }
1453
1454 /* return the value read: */
1455 return (x);
1456 }
1457
1458 /* undefine the macro version of tme_memory_bus_write64: */
1459 #undef tme_memory_bus_write64
1460
1461 /* the bus 64-bit write slow function: */
1462 void
1463 tme_memory_bus_write64(tme_shared tme_uint64_t *mem, tme_uint64_t x, tme_rwlock_t *rwlock, unsigned int align_min, unsigned int bus_boundary)
1464 {
1465 const unsigned int host_boundary = TME_MEMORY_BUS_BOUNDARY;
1466 unsigned int size_skip;
1467 unsigned int size_done;
1468 #ifdef TME_HAVE_INT64_T
1469 tme_shared tme_uint64_t *parts64;
1470 tme_uint64_t part64;
1471 tme_uint64_t part64_cmp;
1472 #endif /* TME_HAVE_INT64_T */
1473 tme_shared tme_uint32_t *parts32;
1474 tme_uint32_t part32;
1475 tme_uint32_t part32_cmp;
1476 tme_shared tme_uint16_t *parts16;
1477 tme_uint16_t part16;
1478 tme_uint16_t part16_cmp;
1479 tme_shared tme_uint8_t *parts8;
1480 tme_uint8_t part8;
1481 tme_uint8_t part8_cmp;
1482
1483 assert (bus_boundary != 0 && bus_boundary <= host_boundary);
1484
1485 #ifdef TME_HAVE_INT64_T
1486
1487 if (host_boundary == sizeof(tme_uint64_t)) {
1488
1489 /* prepare to write the first 64-bit part of the memory: */
1490 parts64 = (tme_shared tme_uint64_t *) (((unsigned long) mem) & (((unsigned long) 0) - (64 / 8)));
1491 size_skip = (((unsigned int) (unsigned long) mem) % (64 / 8)) * 8;
1492 size_done = 0;
1493
1494 /* write the first 64-bit part of the memory: */
1495 part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
1496 do {
1497 part64_cmp = part64;
1498
1499 /* on a little-endian host, we clear with zeroes
1500 shifted up past the skip data, and then we
1501 insert the data shifted up past the skip data: */
1502 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1503 part64 &= (_tme_memory_type_mask(tme_uint64_t, + 0) ^ (((tme_uint64_t) _tme_memory_type_mask(tme_uint64_t, << 0)) << size_skip));
1504 part64 |= (((tme_uint64_t) x) << size_skip);
1505 }
1506
1507 /* on a big-endian host, we clear with zeroes
1508 shifted down past the skip data, and then we
1509 insert the data shifted down past the skip data: */
1510 else {
1511 part64 &= ~(_tme_memory_type_mask(tme_uint64_t, << 0) >> size_skip);
1512 part64 |= (x >> ((64 - 64) + size_skip));
1513 }
1514
1515 /* loop until we can atomically update this part: */
1516 part64 = tme_memory_atomic_cx64(parts64, part64_cmp, part64, rwlock, sizeof(tme_uint64_t));
1517 } while (part64 != part64_cmp);
1518 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1519 x >>= (64 - size_skip);
1520 }
1521 else {
1522 x <<= (64 - size_skip);
1523 }
1524 size_done = 64 - size_skip;
1525
1526 /* write at most one remaining 64-bit part of the memory: */
1527 if (__tme_predict_false(size_done < 64)) {
1528
1529       /* make a barrier: */
1530 tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
1531
1532 /* write the next 64-bit part of the memory: */
1533 parts64++;
1534 part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
1535 do {
1536 part64_cmp = part64;
1537
1538 /* on a little-endian host, we clear with zeroes
1539 shifted up past the skip data, and then we
1540 insert the data shifted up past the skip data: */
1541 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1542 part64 &= (_tme_memory_type_mask(tme_uint64_t, + 0) ^ (((tme_uint64_t) _tme_memory_type_mask(tme_uint64_t, << size_done)) << 0));
1543 part64 |= (((tme_uint64_t) x) << 0);
1544 }
1545
1546 /* on a big-endian host, we clear with zeroes
1547 shifted down past the skip data, and then we
1548 insert the data shifted down past the skip data: */
1549 else {
1550 part64 &= ~(_tme_memory_type_mask(tme_uint64_t, << size_done) >> 0);
1551 part64 |= (x >> ((64 - 64) + 0));
1552 }
1553
1554 /* loop until we can atomically update this part: */
1555 part64 = tme_memory_atomic_cx64(parts64, part64_cmp, part64, rwlock, sizeof(tme_uint64_t));
1556 } while (part64 != part64_cmp);
1557 }
1558 }
1559
1560 else
1561
1562 #endif /* TME_HAVE_INT64_T */
1563
1564 if (host_boundary == sizeof(tme_uint32_t)) {
1565
1566 /* prepare to write the first 32-bit part of the memory: */
1567 parts32 = (tme_shared tme_uint32_t *) (((unsigned long) mem) & (((unsigned long) 0) - (32 / 8)));
1568 size_skip = (((unsigned int) (unsigned long) mem) % (32 / 8)) * 8;
1569 size_done = 0;
1570
1571 /* write the first 32-bit part of the memory: */
1572 part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
1573 do {
1574 part32_cmp = part32;
1575
1576 /* on a little-endian host, we clear with zeroes
1577 shifted up past the skip data, and then we
1578 insert the data shifted up past the skip data: */
1579 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1580 part32 &= (_tme_memory_type_mask(tme_uint32_t, + 0) ^ (((tme_uint32_t) _tme_memory_type_mask(tme_uint64_t, << 0)) << size_skip));
1581 part32 |= (((tme_uint32_t) x) << size_skip);
1582 }
1583
1584 /* on a big-endian host, we clear with zeroes
1585 shifted down past the skip data, and then we
1586 insert the data shifted down past the skip data: */
1587 else {
1588 part32 &= ~(_tme_memory_type_mask(tme_uint32_t, << 0) >> size_skip);
1589 part32 |= (x >> ((64 - 32) + size_skip));
1590 }
1591
1592 /* loop until we can atomically update this part: */
1593 part32 = tme_memory_atomic_cx32(parts32, part32_cmp, part32, rwlock, sizeof(tme_uint32_t));
1594 } while (part32 != part32_cmp);
1595 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1596 x >>= (32 - size_skip);
1597 }
1598 else {
1599 x <<= (32 - size_skip);
1600 }
1601 size_done = 32 - size_skip;
1602
1603 /* try to write one full 32-bit part of memory: */
1604 if (__tme_predict_true(size_done <= (64 - 32))) {
1605
1606       /* make a barrier: */
1607 tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
1608
1609 /* write a full 32-bit part of memory: */
1610 part32 = (x >> ((TME_ENDIAN_NATIVE == TME_ENDIAN_BIG) * (64 - 32)));
1611 parts32++;
1612 tme_memory_atomic_write32(parts32, part32, rwlock, sizeof(tme_uint32_t));
1613 size_done += 32;
1614 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1615 x >>= 32;
1616 }
1617 else {
1618 x <<= 32;
1619 }
1620 }
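    /* note that a part completely covered by the data can be stored
       with a plain atomic write, as above; only the partially covered
       first and last parts need the compare-and-exchange loops that
       preserve their other bytes: */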
1621
1622 /* write at most one remaining 32-bit part of the memory: */
1623 if (__tme_predict_false(size_done < 64)) {
1624
1625       /* make a barrier: */
1626 tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
1627
1628 /* write the next 32-bit part of the memory: */
1629 parts32++;
1630 part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
1631 do {
1632 part32_cmp = part32;
1633
1634 /* on a little-endian host, we clear with zeroes
1635 shifted up past the skip data, and then we
1636 insert the data shifted up past the skip data: */
1637 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1638 part32 &= (_tme_memory_type_mask(tme_uint32_t, + 0) ^ (((tme_uint32_t) _tme_memory_type_mask(tme_uint64_t, << size_done)) << 0));
1639 part32 |= (((tme_uint32_t) x) << 0);
1640 }
1641
1642 /* on a big-endian host, we clear with zeroes
1643 shifted down past the skip data, and then we
1644 insert the data shifted down past the skip data: */
1645 else {
1646 part32 &= ~(_tme_memory_type_mask(tme_uint32_t, << size_done) >> 0);
1647 part32 |= (x >> ((64 - 32) + 0));
1648 }
1649
1650 /* loop until we can atomically update this part: */
1651 part32 = tme_memory_atomic_cx32(parts32, part32_cmp, part32, rwlock, sizeof(tme_uint32_t));
1652 } while (part32 != part32_cmp);
1653 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1654 x >>= (32 - 0);
1655 }
1656 else {
1657 x <<= (32 - 0);
1658 }
1659 }
1660 }
1661
1662 else if (host_boundary == sizeof(tme_uint16_t)) {
1663
1664 /* prepare to write the first 16-bit part of the memory: */
1665 parts16 = (tme_shared tme_uint16_t *) (((unsigned long) mem) & (((unsigned long) 0) - (16 / 8)));
1666 size_skip = (((unsigned int) (unsigned long) mem) % (16 / 8)) * 8;
1667 size_done = 0;
1668
1669 /* write the first 16-bit part of the memory: */
1670 part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
1671 do {
1672 part16_cmp = part16;
1673
1674 /* on a little-endian host, we clear with zeroes
1675 shifted up past the skip data, and then we
1676 insert the data shifted up past the skip data: */
1677 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1678 part16 &= (_tme_memory_type_mask(tme_uint16_t, + 0) ^ (((tme_uint16_t) _tme_memory_type_mask(tme_uint64_t, << 0)) << size_skip));
1679 part16 |= (((tme_uint16_t) x) << size_skip);
1680 }
1681
1682 /* on a big-endian host, we clear with zeroes
1683 shifted down past the skip data, and then we
1684 insert the data shifted down past the skip data: */
1685 else {
1686 part16 &= ~(_tme_memory_type_mask(tme_uint16_t, << 0) >> size_skip);
1687 part16 |= (x >> ((64 - 16) + size_skip));
1688 }
1689
1690 /* loop until we can atomically update this part: */
1691 part16 = tme_memory_atomic_cx16(parts16, part16_cmp, part16, rwlock, sizeof(tme_uint16_t));
1692 } while (part16 != part16_cmp);
1693 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1694 x >>= (16 - size_skip);
1695 }
1696 else {
1697 x <<= (16 - size_skip);
1698 }
1699 size_done = 16 - size_skip;
1700
1701 /* write as many full 16-bit parts of the memory as we can: */
1702 for (; size_done <= (64 - 16); ) {
1703
1704       /* make a barrier: */
1705 tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
1706
1707 /* write a full 16-bit part of memory: */
1708 part16 = (x >> ((TME_ENDIAN_NATIVE == TME_ENDIAN_BIG) * (64 - 16)));
1709 parts16++;
1710 tme_memory_atomic_write16(parts16, part16, rwlock, sizeof(tme_uint16_t));
1711 size_done += 16;
1712 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1713 x >>= 16;
1714 }
1715 else {
1716 x <<= 16;
1717 }
1718 }
1719
1720 /* write at most one remaining 16-bit part of the memory: */
1721 if (__tme_predict_false(size_done < 64)) {
1722
1723       /* make a barrier: */
1724 tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
1725
1726 /* write the next 16-bit part of the memory: */
1727 parts16++;
1728 part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
1729 do {
1730 part16_cmp = part16;
1731
1732 /* on a little-endian host, we clear with zeroes
1733 shifted up past the skip data, and then we
1734 insert the data shifted up past the skip data: */
1735 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1736 part16 &= (_tme_memory_type_mask(tme_uint16_t, + 0) ^ (((tme_uint16_t) _tme_memory_type_mask(tme_uint64_t, << size_done)) << 0));
1737 part16 |= (((tme_uint16_t) x) << 0);
1738 }
1739
1740 /* on a big-endian host, we clear with zeroes
1741 shifted down past the skip data, and then we
1742 insert the data shifted down past the skip data: */
1743 else {
1744 part16 &= ~(_tme_memory_type_mask(tme_uint16_t, << size_done) >> 0);
1745 part16 |= (x >> ((64 - 16) + 0));
1746 }
1747
1748 /* loop until we can atomically update this part: */
1749 part16 = tme_memory_atomic_cx16(parts16, part16_cmp, part16, rwlock, sizeof(tme_uint16_t));
1750 } while (part16 != part16_cmp);
1751 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1752 x >>= (16 - 0);
1753 }
1754 else {
1755 x <<= (16 - 0);
1756 }
1757 }
1758 }
1759
1760 else {
1761
1762 /* prepare to write the first 8-bit part of the memory: */
1763 parts8 = (tme_shared tme_uint8_t *) (((unsigned long) mem) & (((unsigned long) 0) - (8 / 8)));
1764 size_skip = (((unsigned int) (unsigned long) mem) % (8 / 8)) * 8;
1765 size_done = 0;
1766
1767 /* write the first 8-bit part of the memory: */
1768 part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
1769 do {
1770 part8_cmp = part8;
1771
1772 /* on a little-endian host, we clear with zeroes
1773 shifted up past the skip data, and then we
1774 insert the data shifted up past the skip data: */
1775 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1776 part8 &= (_tme_memory_type_mask(tme_uint8_t, + 0) ^ (((tme_uint8_t) _tme_memory_type_mask(tme_uint64_t, << 0)) << size_skip));
1777 part8 |= (((tme_uint8_t) x) << size_skip);
1778 }
1779
1780 /* on a big-endian host, we clear with zeroes
1781 shifted down past the skip data, and then we
1782 insert the data shifted down past the skip data: */
1783 else {
1784 part8 &= ~(_tme_memory_type_mask(tme_uint8_t, << 0) >> size_skip);
1785 part8 |= (x >> ((64 - 8) + size_skip));
1786 }
1787
1788 /* loop until we can atomically update this part: */
1789 part8 = tme_memory_atomic_cx8(parts8, part8_cmp, part8, rwlock, sizeof(tme_uint8_t));
1790 } while (part8 != part8_cmp);
1791 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1792 x >>= (8 - size_skip);
1793 }
1794 else {
1795 x <<= (8 - size_skip);
1796 }
1797 size_done = 8 - size_skip;
1798
1799 /* write as many full 8-bit parts of the memory as we can: */
1800 for (; size_done <= (64 - 8); ) {
1801
1802       /* make a barrier: */
1803 tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
1804
1805 /* write a full 8-bit part of memory: */
1806 part8 = (x >> ((TME_ENDIAN_NATIVE == TME_ENDIAN_BIG) * (64 - 8)));
1807 parts8++;
1808 tme_memory_atomic_write8(parts8, part8, rwlock, sizeof(tme_uint8_t));
1809 size_done += 8;
1810 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1811 x >>= 8;
1812 }
1813 else {
1814 x <<= 8;
1815 }
1816 }
1817
1818 /* write at most one remaining 8-bit part of the memory: */
1819 if (__tme_predict_false(size_done < 64)) {
1820
1821       /* make a barrier: */
1822 tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
1823
1824 /* write the next 8-bit part of the memory: */
1825 parts8++;
1826 part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
1827 do {
1828 part8_cmp = part8;
1829
1830 /* on a little-endian host, we clear with zeroes
1831 shifted up past the skip data, and then we
1832 insert the data shifted up past the skip data: */
1833 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1834 part8 &= (_tme_memory_type_mask(tme_uint8_t, + 0) ^ (((tme_uint8_t) _tme_memory_type_mask(tme_uint64_t, << size_done)) << 0));
1835 part8 |= (((tme_uint8_t) x) << 0);
1836 }
1837
1838 /* on a big-endian host, we clear with zeroes
1839 shifted down past the skip data, and then we
1840 insert the data shifted down past the skip data: */
1841 else {
1842 part8 &= ~(_tme_memory_type_mask(tme_uint8_t, << size_done) >> 0);
1843 part8 |= (x >> ((64 - 8) + 0));
1844 }
1845
1846 /* loop until we can atomically update this part: */
1847 part8 = tme_memory_atomic_cx8(parts8, part8_cmp, part8, rwlock, sizeof(tme_uint8_t));
1848 } while (part8 != part8_cmp);
1849 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1850 x >>= (8 - 0);
1851 }
1852 else {
1853 x <<= (8 - 0);
1854 }
1855 }
1856 }
1857 }
1858
1859 #endif /* TME_HAVE_INT64_T */
1860
1861 /* undefine the macro version of tme_memory_bus_read_buffer: */
1862 #undef tme_memory_bus_read_buffer
1863
1864 /* the bus read buffer function: */
1865 void
1866 tme_memory_bus_read_buffer(_tme_const tme_shared tme_uint8_t *mem, tme_uint8_t *buffer, unsigned long count, tme_rwlock_t *rwlock, unsigned int align_min, unsigned int bus_boundary)
1867 {
1868 const unsigned int host_boundary = TME_MEMORY_BUS_BOUNDARY;
1869 _tme_const tme_uint8_t *part_buffer;
1870 unsigned int count_done;
1871 unsigned int count_misaligned;
1872 unsigned int bits_misaligned;
1873 #ifdef TME_HAVE_INT64_T
1874 _tme_const tme_shared tme_uint64_t *parts64;
1875 tme_uint64_t part64_buffer;
1876 tme_uint64_t part64;
1877 tme_uint64_t part64_next;
1878 #endif /* TME_HAVE_INT64_T */
1879 _tme_const tme_shared tme_uint32_t *parts32;
1880 tme_uint32_t part32_buffer;
1881 tme_uint32_t part32;
1882 tme_uint32_t part32_next;
1883 _tme_const tme_shared tme_uint16_t *parts16;
1884 tme_uint16_t part16_buffer;
1885 tme_uint16_t part16;
1886 tme_uint16_t part16_next;
1887 _tme_const tme_shared tme_uint8_t *parts8;
1888 tme_uint8_t part8_buffer;
1889 tme_uint8_t part8;
1890 tme_uint8_t part8_next;
1891
1892 assert (count != 0);
1893 assert (bus_boundary != 0);
1894
1895 /* if we are locking for all memory accesses, lock memory
1896 around a memcpy: */
1897 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
1898 tme_rwlock_rdlock(rwlock);
1899 memcpy((buffer), ((_tme_const tme_uint8_t *) (mem)), (count));
1900 tme_rwlock_unlock(rwlock);
1901 }
1902
1903 /* otherwise, if the emulated bus boundary is greater than the
1904 host's bus boundary, we are forced to stop all other threads
1905 around a memcpy: */
1906 else if (__tme_predict_false(bus_boundary == 0
1907 || bus_boundary > host_boundary)) {
1908 tme_thread_suspend_others();
1909 memcpy((buffer), ((_tme_const tme_uint8_t *) (mem)), (count) + (0 && align_min));
1910 tme_thread_resume_others();
1911 }
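  /* otherwise, the memory is read in host-boundary-sized atomic parts.
     the two memcpy cases above cover hosts where we are locking for
     all memory accesses (everything is done under the read lock) and
     emulated bus boundaries wider than the host boundary (all other
     threads are suspended around the copy): */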
1912
1913 #ifdef TME_HAVE_INT64_T
1914
1915 else if (host_boundary == sizeof(tme_uint64_t)) {
1916
1917 /* make a 64-bit pointer to the memory: */
1918 parts64 = (_tme_const tme_shared tme_uint64_t *) mem;
1919
1920 /* if this pointer is not 64-bit aligned: */
1921 if (__tme_predict_false((((unsigned long) parts64) % sizeof(tme_uint64_t)) != 0)) {
1922
1923 /* get the misalignment from the previous 64-bit boundary: */
1924 count_misaligned = ((unsigned long) parts64) % sizeof(tme_uint64_t);
1925
1926 /* truncate this pointer to the previous 64-bit boundary: */
1927 parts64 = (_tme_const tme_shared tme_uint64_t *) (((unsigned long) parts64) & (((unsigned long) 0) - sizeof(tme_uint64_t)));
1928
1929 /* get the number of bytes to read in the first 64-bit memory part: */
1930 count_done = sizeof(tme_uint64_t) - count_misaligned;
1931 if (__tme_predict_false(count_done > count)) {
1932 count_done = count;
1933 }
1934
1935 /* read the first 64-bit memory part: */
1936 part64_buffer = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
1937 parts64++;
1938
1939 /* copy to the buffer the bytes to read in the first
1940 64-bit memory part: */
1941 part_buffer = ((tme_uint8_t *) &part64_buffer) + count_misaligned;
1942 count -= count_done;
1943 do {
1944 *buffer = *part_buffer;
1945 part_buffer++;
1946 buffer++;
1947 } while (--count_done != 0);
1948 }
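    /* for illustration: if mem is 5 bytes past a 64-bit boundary and
       count is at least 3, count_misaligned is 5 and count_done is 3,
       and the loop above copies bytes 5..7 of the first 64-bit part
       into the start of the buffer: */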
1949
1950 /* if we have full 64-bit parts to read: */
1951 if (__tme_predict_true(count >= sizeof(tme_uint64_t))) {
1952
1953 /* if the buffer is 64-bit aligned: */
1954 if (__tme_predict_true((((unsigned long) buffer) % sizeof(tme_uint64_t)) == 0)) {
1955
1956 /* read full 64-bit parts without shifting: */
1957 do {
1958 part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
1959 tme_memory_write64((tme_uint64_t *) buffer, part64, sizeof(tme_uint64_t));
1960
1961 /* advance: */
1962 parts64++;
1963 buffer += sizeof(tme_uint64_t);
1964 count -= sizeof(tme_uint64_t);
1965 } while (count >= sizeof(tme_uint64_t));
1966 }
1967
1968 /* otherwise, the buffer is not 64-bit aligned: */
1969 else {
1970
1971 /* get the misalignment to the next 64-bit boundary: */
1972 count_misaligned = (sizeof(tme_uint64_t) - ((unsigned int) (unsigned long) buffer)) % sizeof(tme_uint64_t);
1973
1974 /* read the next 64-bit memory part: */
1975 part64_buffer = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
1976 parts64++;
1977
1978 /* copy to the buffer until it is aligned: */
1979 part_buffer = ((_tme_const tme_uint8_t *) &part64_buffer);
1980 count_done = count_misaligned;
1981 count -= count_misaligned;
1982 do {
1983 *buffer = *part_buffer;
1984 part_buffer++;
1985 buffer++;
1986 } while (--count_done != 0);
1987
1988 /* read full 64-bit words with shifting: */
1989 bits_misaligned = count_misaligned * 8;
1990 part64
1991 = (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
1992 ? (part64_buffer >> bits_misaligned)
1993 : (part64_buffer << bits_misaligned));
1994 for (; count >= sizeof(tme_uint64_t); ) {
1995 part64_next = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
1996 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1997 part64 |= (part64_next << (64 - bits_misaligned));
1998 tme_memory_write64((tme_uint64_t *) buffer, part64, sizeof(tme_uint64_t));
1999 part64 = (part64_next >> bits_misaligned);
2000 }
2001 else {
2002 part64 |= (part64_next >> (64 - bits_misaligned));
2003 tme_memory_write64((tme_uint64_t *) buffer, part64, sizeof(tme_uint64_t));
2004 part64 = (part64_next << bits_misaligned);
2005 }
2006
2007 /* advance: */
2008 parts64++;
2009 buffer += sizeof(tme_uint64_t);
2010 count -= sizeof(tme_uint64_t);
2011 }
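	/* for illustration, on a little-endian host: with the buffer 3
	   bytes short of a 64-bit boundary, bits_misaligned is 24, and
	   each aligned store above combines the upper 40 bits of the
	   current memory part with the low 24 bits of the next part,
	   i.e. the next eight consecutive bytes of memory: */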
2012
2013 /* calculate how many more bytes there are to read in this
2014 64-bit memory part: */
2015 count_done = sizeof(tme_uint64_t) - count_misaligned;
2016 part64_buffer = part64;
2017
2018 /* copy to the buffer the remaining bytes in this 64-bit part: */
2019 if (count_done > count) {
2020 count_done = count;
2021 }
2022 part_buffer = ((_tme_const tme_uint8_t *) &part64_buffer);
2023 count -= count_done;
2024 do {
2025 *buffer = *part_buffer;
2026 part_buffer++;
2027 buffer++;
2028 } while (--count_done != 0);
2029 }
2030 }
2031
2032 /* if we still have bytes to read: */
2033 if (__tme_predict_false(count > 0)) {
2034
2035 /* we must have less than a full 64-bit part to read: */
2036 assert (count < sizeof(tme_uint64_t));
2037
2038 /* read the last 64-bit memory part: */
2039 part64_buffer = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
2040
2041       /* copy to the buffer the bytes to read in the last
2042 64-bit memory part: */
2043 part_buffer = ((_tme_const tme_uint8_t *) &part64_buffer);
2044 count_done = count;
2045 do {
2046 *buffer = *part_buffer;
2047 part_buffer++;
2048 buffer++;
2049 } while (--count_done != 0);
2050 }
2051
2052 }
2053
2054 #endif /* TME_HAVE_INT64_T */
2055
2056 else if (host_boundary == sizeof(tme_uint32_t)) {
2057
2058 /* make a 32-bit pointer to the memory: */
2059 parts32 = (_tme_const tme_shared tme_uint32_t *) mem;
2060
2061 /* if this pointer is not 32-bit aligned: */
2062 if (__tme_predict_false((((unsigned long) parts32) % sizeof(tme_uint32_t)) != 0)) {
2063
2064 /* get the misalignment from the previous 32-bit boundary: */
2065 count_misaligned = ((unsigned long) parts32) % sizeof(tme_uint32_t);
2066
2067 /* truncate this pointer to the previous 32-bit boundary: */
2068 parts32 = (_tme_const tme_shared tme_uint32_t *) (((unsigned long) parts32) & (((unsigned long) 0) - sizeof(tme_uint32_t)));
2069
2070 /* get the number of bytes to read in the first 32-bit memory part: */
2071 count_done = sizeof(tme_uint32_t) - count_misaligned;
2072 if (__tme_predict_false(count_done > count)) {
2073 count_done = count;
2074 }
2075
2076 /* read the first 32-bit memory part: */
2077 part32_buffer = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
2078 parts32++;
2079
2080 /* copy to the buffer the bytes to read in the first
2081 32-bit memory part: */
2082 part_buffer = ((tme_uint8_t *) &part32_buffer) + count_misaligned;
2083 count -= count_done;
2084 do {
2085 *buffer = *part_buffer;
2086 part_buffer++;
2087 buffer++;
2088 } while (--count_done != 0);
2089 }
2090
2091 /* if we have full 32-bit parts to read: */
2092 if (__tme_predict_true(count >= sizeof(tme_uint32_t))) {
2093
2094 /* if the buffer is 32-bit aligned: */
2095 if (__tme_predict_true((((unsigned long) buffer) % sizeof(tme_uint32_t)) == 0)) {
2096
2097 /* read full 32-bit parts without shifting: */
2098 do {
2099 part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
2100 tme_memory_write32((tme_uint32_t *) buffer, part32, sizeof(tme_uint32_t));
2101
2102 /* advance: */
2103 parts32++;
2104 buffer += sizeof(tme_uint32_t);
2105 count -= sizeof(tme_uint32_t);
2106 } while (count >= sizeof(tme_uint32_t));
2107 }
2108
2109 /* otherwise, the buffer is not 32-bit aligned: */
2110 else {
2111
2112 /* get the misalignment to the next 32-bit boundary: */
2113 count_misaligned = (sizeof(tme_uint32_t) - ((unsigned int) (unsigned long) buffer)) % sizeof(tme_uint32_t);
2114
2115 /* read the next 32-bit memory part: */
2116 part32_buffer = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
2117 parts32++;
2118
2119 /* copy to the buffer until it is aligned: */
2120 part_buffer = ((_tme_const tme_uint8_t *) &part32_buffer);
2121 count_done = count_misaligned;
2122 count -= count_misaligned;
2123 do {
2124 *buffer = *part_buffer;
2125 part_buffer++;
2126 buffer++;
2127 } while (--count_done != 0);
2128
2129 /* read full 32-bit words with shifting: */
2130 bits_misaligned = count_misaligned * 8;
2131 part32
2132 = (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
2133 ? (part32_buffer >> bits_misaligned)
2134 : (part32_buffer << bits_misaligned));
2135 for (; count >= sizeof(tme_uint32_t); ) {
2136 part32_next = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
2137 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
2138 part32 |= (part32_next << (32 - bits_misaligned));
2139 tme_memory_write32((tme_uint32_t *) buffer, part32, sizeof(tme_uint32_t));
2140 part32 = (part32_next >> bits_misaligned);
2141 }
2142 else {
2143 part32 |= (part32_next >> (32 - bits_misaligned));
2144 tme_memory_write32((tme_uint32_t *) buffer, part32, sizeof(tme_uint32_t));
2145 part32 = (part32_next << bits_misaligned);
2146 }
2147
2148 /* advance: */
2149 parts32++;
2150 buffer += sizeof(tme_uint32_t);
2151 count -= sizeof(tme_uint32_t);
2152 }
2153
2154 /* calculate how many more bytes there are to read in this
2155 32-bit memory part: */
2156 count_done = sizeof(tme_uint32_t) - count_misaligned;
2157 part32_buffer = part32;
2158
2159 /* copy to the buffer the remaining bytes in this 32-bit part: */
2160 if (count_done > count) {
2161 count_done = count;
2162 }
2163 part_buffer = ((_tme_const tme_uint8_t *) &part32_buffer);
2164 count -= count_done;
2165 do {
2166 *buffer = *part_buffer;
2167 part_buffer++;
2168 buffer++;
2169 } while (--count_done != 0);
2170 }
2171 }
2172
2173 /* if we still have bytes to read: */
2174 if (__tme_predict_false(count > 0)) {
2175
2176 /* we must have less than a full 32-bit part to read: */
2177 assert (count < sizeof(tme_uint32_t));
2178
2179 /* read the last 32-bit memory part: */
2180 part32_buffer = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
2181
2182       /* copy to the buffer the bytes to read in the last
2183 32-bit memory part: */
2184 part_buffer = ((_tme_const tme_uint8_t *) &part32_buffer);
2185 count_done = count;
2186 do {
2187 *buffer = *part_buffer;
2188 part_buffer++;
2189 buffer++;
2190 } while (--count_done != 0);
2191 }
2192
2193 }
2194
2195 else if (host_boundary == sizeof(tme_uint16_t)) {
2196
2197 /* make a 16-bit pointer to the memory: */
2198 parts16 = (_tme_const tme_shared tme_uint16_t *) mem;
2199
2200 /* if this pointer is not 16-bit aligned: */
2201 if (__tme_predict_false((((unsigned long) parts16) % sizeof(tme_uint16_t)) != 0)) {
2202
2203 /* get the misalignment from the previous 16-bit boundary: */
2204 count_misaligned = ((unsigned long) parts16) % sizeof(tme_uint16_t);
2205
2206 /* truncate this pointer to the previous 16-bit boundary: */
2207 parts16 = (_tme_const tme_shared tme_uint16_t *) (((unsigned long) parts16) & (((unsigned long) 0) - sizeof(tme_uint16_t)));
2208
2209 /* get the number of bytes to read in the first 16-bit memory part: */
2210 count_done = sizeof(tme_uint16_t) - count_misaligned;
2211 if (__tme_predict_false(count_done > count)) {
2212 count_done = count;
2213 }
2214
2215 /* read the first 16-bit memory part: */
2216 part16_buffer = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
2217 parts16++;
2218
2219 /* copy to the buffer the bytes to read in the first
2220 16-bit memory part: */
2221 part_buffer = ((tme_uint8_t *) &part16_buffer) + count_misaligned;
2222 count -= count_done;
2223 do {
2224 *buffer = *part_buffer;
2225 part_buffer++;
2226 buffer++;
2227 } while (--count_done != 0);
2228 }
2229
2230 /* if we have full 16-bit parts to read: */
2231 if (__tme_predict_true(count >= sizeof(tme_uint16_t))) {
2232
2233 /* if the buffer is 16-bit aligned: */
2234 if (__tme_predict_true((((unsigned long) buffer) % sizeof(tme_uint16_t)) == 0)) {
2235
2236 /* read full 16-bit parts without shifting: */
2237 do {
2238 part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
2239 tme_memory_write16((tme_uint16_t *) buffer, part16, sizeof(tme_uint16_t));
2240
2241 /* advance: */
2242 parts16++;
2243 buffer += sizeof(tme_uint16_t);
2244 count -= sizeof(tme_uint16_t);
2245 } while (count >= sizeof(tme_uint16_t));
2246 }
2247
2248 /* otherwise, the buffer is not 16-bit aligned: */
2249 else {
2250
2251 /* get the misalignment to the next 16-bit boundary: */
2252 count_misaligned = (sizeof(tme_uint16_t) - ((unsigned int) (unsigned long) buffer)) % sizeof(tme_uint16_t);
2253
2254 /* read the next 16-bit memory part: */
2255 part16_buffer = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
2256 parts16++;
2257
2258 /* copy to the buffer until it is aligned: */
2259 part_buffer = ((_tme_const tme_uint8_t *) &part16_buffer);
2260 count_done = count_misaligned;
2261 count -= count_misaligned;
2262 do {
2263 *buffer = *part_buffer;
2264 part_buffer++;
2265 buffer++;
2266 } while (--count_done != 0);
2267
2268 /* read full 16-bit words with shifting: */
2269 bits_misaligned = count_misaligned * 8;
2270 part16
2271 = (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
2272 ? (part16_buffer >> bits_misaligned)
2273 : (part16_buffer << bits_misaligned));
2274 for (; count >= sizeof(tme_uint16_t); ) {
2275 part16_next = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
2276 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
2277 part16 |= (part16_next << (16 - bits_misaligned));
2278 tme_memory_write16((tme_uint16_t *) buffer, part16, sizeof(tme_uint16_t));
2279 part16 = (part16_next >> bits_misaligned);
2280 }
2281 else {
2282 part16 |= (part16_next >> (16 - bits_misaligned));
2283 tme_memory_write16((tme_uint16_t *) buffer, part16, sizeof(tme_uint16_t));
2284 part16 = (part16_next << bits_misaligned);
2285 }
2286
2287 /* advance: */
2288 parts16++;
2289 buffer += sizeof(tme_uint16_t);
2290 count -= sizeof(tme_uint16_t);
2291 }
2292
2293 /* calculate how many more bytes there are to read in this
2294 16-bit memory part: */
2295 count_done = sizeof(tme_uint16_t) - count_misaligned;
2296 part16_buffer = part16;
2297
2298 /* copy to the buffer the remaining bytes in this 16-bit part: */
2299 if (count_done > count) {
2300 count_done = count;
2301 }
2302 part_buffer = ((_tme_const tme_uint8_t *) &part16_buffer);
2303 count -= count_done;
2304 do {
2305 *buffer = *part_buffer;
2306 part_buffer++;
2307 buffer++;
2308 } while (--count_done != 0);
2309 }
2310 }
2311
2312 /* if we still have bytes to read: */
2313 if (__tme_predict_false(count > 0)) {
2314
2315 /* we must have less than a full 16-bit part to read: */
2316 assert (count < sizeof(tme_uint16_t));
2317
2318 /* read the last 16-bit memory part: */
2319 part16_buffer = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
2320
2321       /* copy to the buffer the bytes to read in the last
2322 16-bit memory part: */
2323 part_buffer = ((_tme_const tme_uint8_t *) &part16_buffer);
2324 count_done = count;
2325 do {
2326 *buffer = *part_buffer;
2327 part_buffer++;
2328 buffer++;
2329 } while (--count_done != 0);
2330 }
2331
2332 }
2333
2334 else {
2335
2336     /* make an 8-bit pointer to the memory: */
2337 parts8 = (_tme_const tme_shared tme_uint8_t *) mem;
2338
2339 /* if this pointer is not 8-bit aligned: */
2340 if (__tme_predict_false((((unsigned long) parts8) % sizeof(tme_uint8_t)) != 0)) {
2341
2342 /* get the misalignment from the previous 8-bit boundary: */
2343 count_misaligned = ((unsigned long) parts8) % sizeof(tme_uint8_t);
2344
2345 /* truncate this pointer to the previous 8-bit boundary: */
2346 parts8 = (_tme_const tme_shared tme_uint8_t *) (((unsigned long) parts8) & (((unsigned long) 0) - sizeof(tme_uint8_t)));
2347
2348 /* get the number of bytes to read in the first 8-bit memory part: */
2349 count_done = sizeof(tme_uint8_t) - count_misaligned;
2350 if (__tme_predict_false(count_done > count)) {
2351 count_done = count;
2352 }
2353
2354 /* read the first 8-bit memory part: */
2355 part8_buffer = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
2356 parts8++;
2357
2358 /* copy to the buffer the bytes to read in the first
2359 8-bit memory part: */
2360 part_buffer = ((tme_uint8_t *) &part8_buffer) + count_misaligned;
2361 count -= count_done;
2362 do {
2363 *buffer = *part_buffer;
2364 part_buffer++;
2365 buffer++;
2366 } while (--count_done != 0);
2367 }
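    /* since sizeof(tme_uint8_t) is one, the misalignment test above
       can never be true; the block is presumably generated only for
       uniformity with the wider host boundaries (the same holds for
       the 8-bit case of the bus write buffer function below): */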
2368
2369 /* if we have full 8-bit parts to read: */
2370 if (__tme_predict_true(count >= sizeof(tme_uint8_t))) {
2371
2372 /* if the buffer is 8-bit aligned: */
2373 if (__tme_predict_true((((unsigned long) buffer) % sizeof(tme_uint8_t)) == 0)) {
2374
2375 /* read full 8-bit parts without shifting: */
2376 do {
2377 part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
2378 tme_memory_write8((tme_uint8_t *) buffer, part8, sizeof(tme_uint8_t));
2379
2380 /* advance: */
2381 parts8++;
2382 buffer += sizeof(tme_uint8_t);
2383 count -= sizeof(tme_uint8_t);
2384 } while (count >= sizeof(tme_uint8_t));
2385 }
2386
2387 /* otherwise, the buffer is not 8-bit aligned: */
2388 else {
2389
2390 /* get the misalignment to the next 8-bit boundary: */
2391 count_misaligned = (sizeof(tme_uint8_t) - ((unsigned int) (unsigned long) buffer)) % sizeof(tme_uint8_t);
2392
2393 /* read the next 8-bit memory part: */
2394 part8_buffer = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
2395 parts8++;
2396
2397 /* copy to the buffer until it is aligned: */
2398 part_buffer = ((_tme_const tme_uint8_t *) &part8_buffer);
2399 count_done = count_misaligned;
2400 count -= count_misaligned;
2401 do {
2402 *buffer = *part_buffer;
2403 part_buffer++;
2404 buffer++;
2405 } while (--count_done != 0);
2406
2407 /* read full 8-bit words with shifting: */
2408 bits_misaligned = count_misaligned * 8;
2409 part8
2410 = (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
2411 ? (part8_buffer >> bits_misaligned)
2412 : (part8_buffer << bits_misaligned));
2413 for (; count >= sizeof(tme_uint8_t); ) {
2414 part8_next = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
2415 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
2416 part8 |= (part8_next << (8 - bits_misaligned));
2417 tme_memory_write8((tme_uint8_t *) buffer, part8, sizeof(tme_uint8_t));
2418 part8 = (part8_next >> bits_misaligned);
2419 }
2420 else {
2421 part8 |= (part8_next >> (8 - bits_misaligned));
2422 tme_memory_write8((tme_uint8_t *) buffer, part8, sizeof(tme_uint8_t));
2423 part8 = (part8_next << bits_misaligned);
2424 }
2425
2426 /* advance: */
2427 parts8++;
2428 buffer += sizeof(tme_uint8_t);
2429 count -= sizeof(tme_uint8_t);
2430 }
2431
2432 /* calculate how many more bytes there are to read in this
2433 8-bit memory part: */
2434 count_done = sizeof(tme_uint8_t) - count_misaligned;
2435 part8_buffer = part8;
2436
2437 /* copy to the buffer the remaining bytes in this 8-bit part: */
2438 if (count_done > count) {
2439 count_done = count;
2440 }
2441 part_buffer = ((_tme_const tme_uint8_t *) &part8_buffer);
2442 count -= count_done;
2443 do {
2444 *buffer = *part_buffer;
2445 part_buffer++;
2446 buffer++;
2447 } while (--count_done != 0);
2448 }
2449 }
2450
2451 /* if we still have bytes to read: */
2452 if (__tme_predict_false(count > 0)) {
2453
2454 /* we must have less than a full 8-bit part to read: */
2455 assert (count < sizeof(tme_uint8_t));
2456
2457 /* read the last 8-bit memory part: */
2458 part8_buffer = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
2459
2460       /* copy to the buffer the bytes to read in the last
2461 8-bit memory part: */
2462 part_buffer = ((_tme_const tme_uint8_t *) &part8_buffer);
2463 count_done = count;
2464 do {
2465 *buffer = *part_buffer;
2466 part_buffer++;
2467 buffer++;
2468 } while (--count_done != 0);
2469 }
2470
2471 }
2472 }
2473
2474 /* undefine the macro version of tme_memory_bus_write_buffer: */
2475 #undef tme_memory_bus_write_buffer
2476
2477 /* the bus write buffer function: */
2478 void
2479 tme_memory_bus_write_buffer(tme_shared tme_uint8_t *mem, _tme_const tme_uint8_t *buffer, unsigned long count, tme_rwlock_t *rwlock, unsigned int align_min, unsigned int bus_boundary)
2480 {
2481 const unsigned int host_boundary = TME_MEMORY_BUS_BOUNDARY;
2482 tme_uint8_t *part_buffer;
2483 unsigned int count_done;
2484 unsigned int count_misaligned;
2485 unsigned int bits_misaligned;
2486 #ifdef TME_HAVE_INT64_T
2487 tme_shared tme_uint64_t *parts64;
2488 tme_uint64_t part64_buffer;
2489 tme_uint64_t part64;
2490 tme_uint64_t part64_next;
2491 tme_uint64_t part64_mask;
2492 tme_uint64_t part64_cmp;
2493 #endif /* TME_HAVE_INT64_T */
2494 tme_shared tme_uint32_t *parts32;
2495 tme_uint32_t part32_buffer;
2496 tme_uint32_t part32;
2497 tme_uint32_t part32_next;
2498 tme_uint32_t part32_mask;
2499 tme_uint32_t part32_cmp;
2500 tme_shared tme_uint16_t *parts16;
2501 tme_uint16_t part16_buffer;
2502 tme_uint16_t part16;
2503 tme_uint16_t part16_next;
2504 tme_uint16_t part16_mask;
2505 tme_uint16_t part16_cmp;
2506 tme_shared tme_uint8_t *parts8;
2507 tme_uint8_t part8_buffer;
2508 tme_uint8_t part8;
2509 tme_uint8_t part8_next;
2510 tme_uint8_t part8_mask;
2511 tme_uint8_t part8_cmp;
2512
2513 assert (count != 0);
2514 assert (bus_boundary != 0);
2515
2516 /* if we are locking for all memory accesses, lock memory
2517 around a memcpy: */
2518 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
2519 tme_rwlock_wrlock(rwlock);
2520 memcpy((tme_uint8_t *) (mem), (buffer), (count));
2521 tme_rwlock_unlock(rwlock);
2522 }
2523
2524 /* otherwise, if the emulated bus boundary is greater than the
2525 host's bus boundary, we are forced to stop all other threads
2526 around a memcpy: */
2527 else if (__tme_predict_false(bus_boundary == 0
2528 || bus_boundary > host_boundary)) {
2529 tme_thread_suspend_others();
2530 memcpy((tme_uint8_t *) (mem), (buffer), (count) + (0 && align_min));
2531 tme_thread_resume_others();
2532 }
2533
2534 #ifdef TME_HAVE_INT64_T
2535
2536 else if (host_boundary == sizeof(tme_uint64_t)) {
2537
2538 /* make a 64-bit pointer to the memory: */
2539 parts64 = (tme_shared tme_uint64_t *) mem;
2540
2541 /* if this pointer is not 64-bit aligned: */
2542 if (__tme_predict_false((((unsigned long) parts64) % sizeof(tme_uint64_t)) != 0)) {
2543
2544 /* get the misalignment from the previous 64-bit boundary: */
2545 count_misaligned = ((unsigned long) parts64) % sizeof(tme_uint64_t);
2546
2547 /* truncate this pointer to the previous 64-bit boundary: */
2548 parts64 = (tme_shared tme_uint64_t *) (((unsigned long) parts64) & (((unsigned long) 0) - sizeof(tme_uint64_t)));
2549
2550 /* get the number of bytes to write in the first 64-bit memory part: */
2551 count_done = sizeof(tme_uint64_t) - count_misaligned;
2552 if (__tme_predict_false(count_done > count)) {
2553 count_done = count;
2554 }
2555
2556 /* make a mask that clears for the data to write in the
2557 first 64-bit memory part: */
2558 part64_mask = 1;
2559 part64_mask = (part64_mask << (count_done * 8)) - 1;
2560 part64_mask
2561 <<= (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
2562 ? (count_misaligned * 8)
2563 : (64 - ((count_misaligned + count_done) * 8)));
2564 part64_mask = ~part64_mask;
2565
2566 /* copy from the buffer the bytes to write in the first
2567 64-bit memory part: */
2568 part64_buffer = 0;
2569 part_buffer = ((tme_uint8_t *) &part64_buffer) + count_misaligned;
2570 count -= count_done;
2571 do {
2572 *part_buffer = *buffer;
2573 part_buffer++;
2574 buffer++;
2575 } while (--count_done != 0);
2576
2577 /* compare-and-exchange the first 64-bit memory part: */
2578       part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
2579 do {
2580 part64_cmp = part64;
2581 part64 = (part64 & part64_mask) | part64_buffer;
2582 part64 = tme_memory_atomic_cx64(parts64, part64_cmp, part64, rwlock, sizeof(tme_uint64_t));
2583 } while (part64 != part64_cmp);
2584 parts64++;
2585 }
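      /* for illustration, on a little-endian host: if mem is 5 bytes
	 past a 64-bit boundary and count is at least 3, count_misaligned
	 is 5 and count_done is 3, so part64_mask preserves bytes 0..4
	 and clears bytes 5..7 of the memory part, and the loop above
	 places the first three buffer bytes at byte offsets 5..7 of
	 part64_buffer before the exchange: */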
2586
2587 /* if we have full 64-bit parts to write: */
2588 if (__tme_predict_true(count >= sizeof(tme_uint64_t))) {
2589
2590 /* if the buffer is 64-bit aligned: */
2591 if (__tme_predict_true((((unsigned long) buffer) % sizeof(tme_uint64_t)) == 0)) {
2592
2593 /* write full 64-bit parts without shifting: */
2594 do {
2595 part64 = tme_memory_read64((const tme_uint64_t *) buffer, sizeof(tme_uint64_t));
2596 tme_memory_atomic_write64(parts64, part64, rwlock, sizeof(tme_uint64_t));
2597
2598 /* advance: */
2599 parts64++;
2600 buffer += sizeof(tme_uint64_t);
2601 count -= sizeof(tme_uint64_t);
2602 } while (count >= sizeof(tme_uint64_t));
2603 }
2604
2605 /* otherwise, the buffer is not 64-bit aligned: */
2606 else {
2607
2608 /* get the misalignment to the next 64-bit boundary: */
2609 count_misaligned = (sizeof(tme_uint64_t) - ((unsigned int) (unsigned long) buffer)) % sizeof(tme_uint64_t);
2610
2611 /* copy from the buffer until it is aligned: */
2612 part64_buffer = 0;
2613 part_buffer = ((tme_uint8_t *) &part64_buffer);
2614 count_done = count_misaligned;
2615 count -= count_misaligned;
2616 do {
2617 *part_buffer = *buffer;
2618 part_buffer++;
2619 buffer++;
2620 } while (--count_done != 0);
2621
2622 /* write full 64-bit words with shifting: */
2623 bits_misaligned = count_misaligned * 8;
2624 part64 = part64_buffer;
2625 for (; count >= sizeof(tme_uint64_t); ) {
2626 part64_next = tme_memory_read64((const tme_uint64_t *) buffer, sizeof(tme_uint64_t));
2627 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
2628 part64 |= (part64_next << bits_misaligned);
2629 tme_memory_atomic_write64(parts64, part64, rwlock, sizeof(tme_uint64_t));
2630 part64 = (part64_next >> (64 - bits_misaligned));
2631 }
2632 else {
2633 part64 |= (part64_next >> bits_misaligned);
2634 tme_memory_atomic_write64(parts64, part64, rwlock, sizeof(tme_uint64_t));
2635 part64 = (part64_next << (64 - bits_misaligned));
2636 }
2637
2638 /* advance: */
2639 parts64++;
2640 buffer += sizeof(tme_uint64_t);
2641 count -= sizeof(tme_uint64_t);
2642 }
2643
2644 /* calculate how many more bytes there are to write in this
2645 64-bit memory part: */
2646 count_done = sizeof(tme_uint64_t) - count_misaligned;
2647 part64_buffer = part64;
2648
2649 /* if we can't write one more full 64-bit memory part: */
2650 if (count_done > count) {
2651
2652 /* we will reread this data to write below: */
2653 buffer -= count_misaligned;
2654 count += count_misaligned;
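	  /* the bytes carried over from the last aligned buffer read
	     are simply re-read from the buffer by the final
	     partial-part code below, together with the remaining
	     count bytes: */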
2655 }
2656
2657 /* otherwise, we can write one more full 64-bit memory part: */
2658 else {
2659
2660 /* copy from the buffer until we have the full 64-bit part: */
2661 part_buffer = ((tme_uint8_t *) &part64_buffer) + count_misaligned;
2662 count -= count_done;
2663 do {
2664 *part_buffer = *buffer;
2665 part_buffer++;
2666 buffer++;
2667 } while (--count_done != 0);
2668
2669 /* write the last full 64-bit memory part: */
2670 part64 = part64_buffer;
2671 	  tme_memory_atomic_write64(parts64, part64, rwlock, sizeof(tme_uint64_t));
	  /* advance past the part just written, so that any remaining
	     bytes are exchanged into the following part: */
	  parts64++;
2672 }
2673 }
2674 }
2675
2676 /* if we still have bytes to write: */
2677 if (__tme_predict_false(count > 0)) {
2678
2679 /* we must have less than a full 64-bit part to write: */
2680 assert (count < sizeof(tme_uint64_t));
2681
2682 /* make a mask that clears for the data to write in the last
2683 64-bit memory part: */
2684 part64_mask
2685 = (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
2686 ? _tme_memory_type_mask(tme_uint64_t, << (count * 8))
2687 : _tme_memory_type_mask(tme_uint64_t, >> (count * 8)));
2688
2689 /* copy from the buffer the bytes to write in the last
2690 64-bit memory part: */
2691 part64_buffer = 0;
2692 part_buffer = ((tme_uint8_t *) &part64_buffer);
2693 count_done = count;
2694 do {
2695 *part_buffer = *buffer;
2696 part_buffer++;
2697 buffer++;
2698 } while (--count_done != 0);
2699
2700 /* compare-and-exchange the last 64-bit memory part: */
2701     part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
2702 do {
2703 part64_cmp = part64;
2704 part64 = (part64 & part64_mask) | part64_buffer;
2705 part64 = tme_memory_atomic_cx64(parts64, part64_cmp, part64, rwlock, sizeof(tme_uint64_t));
2706 } while (part64 != part64_cmp);
2707 }
2708
2709 }
2710
2711 #endif /* TME_HAVE_INT64_T */
2712
2713 else if (host_boundary == sizeof(tme_uint32_t)) {
2714
2715 /* make a 32-bit pointer to the memory: */
2716 parts32 = (tme_shared tme_uint32_t *) mem;
2717
2718 /* if this pointer is not 32-bit aligned: */
2719 if (__tme_predict_false((((unsigned long) parts32) % sizeof(tme_uint32_t)) != 0)) {
2720
2721 /* get the misalignment from the previous 32-bit boundary: */
2722 count_misaligned = ((unsigned long) parts32) % sizeof(tme_uint32_t);
2723
2724 /* truncate this pointer to the previous 32-bit boundary: */
2725 parts32 = (tme_shared tme_uint32_t *) (((unsigned long) parts32) & (((unsigned long) 0) - sizeof(tme_uint32_t)));
2726
2727 /* get the number of bytes to write in the first 32-bit memory part: */
2728 count_done = sizeof(tme_uint32_t) - count_misaligned;
2729 if (__tme_predict_false(count_done > count)) {
2730 count_done = count;
2731 }
2732
2733 /* make a mask that clears for the data to write in the
2734 first 32-bit memory part: */
2735 part32_mask = 1;
2736 part32_mask = (part32_mask << (count_done * 8)) - 1;
2737 part32_mask
2738 <<= (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
2739 ? (count_misaligned * 8)
2740 : (32 - ((count_misaligned + count_done) * 8)));
2741 part32_mask = ~part32_mask;
2742
2743 /* copy from the buffer the bytes to write in the first
2744 32-bit memory part: */
2745 part32_buffer = 0;
2746 part_buffer = ((tme_uint8_t *) &part32_buffer) + count_misaligned;
2747 count -= count_done;
2748 do {
2749 *part_buffer = *buffer;
2750 part_buffer++;
2751 buffer++;
2752 } while (--count_done != 0);
2753
2754 /* compare-and-exchange the first 32-bit memory part: */
2755       part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
2756 do {
2757 part32_cmp = part32;
2758 part32 = (part32 & part32_mask) | part32_buffer;
2759 part32 = tme_memory_atomic_cx32(parts32, part32_cmp, part32, rwlock, sizeof(tme_uint32_t));
2760 } while (part32 != part32_cmp);
2761 parts32++;
2762 }
2763
2764 /* if we have full 32-bit parts to write: */
2765 if (__tme_predict_true(count >= sizeof(tme_uint32_t))) {
2766
2767 /* if the buffer is 32-bit aligned: */
2768 if (__tme_predict_true((((unsigned long) buffer) % sizeof(tme_uint32_t)) == 0)) {
2769
2770 /* write full 32-bit parts without shifting: */
2771 do {
2772 part32 = tme_memory_read32((const tme_uint32_t *) buffer, sizeof(tme_uint32_t));
2773 tme_memory_atomic_write32(parts32, part32, rwlock, sizeof(tme_uint32_t));
2774
2775 /* advance: */
2776 parts32++;
2777 buffer += sizeof(tme_uint32_t);
2778 count -= sizeof(tme_uint32_t);
2779 } while (count >= sizeof(tme_uint32_t));
2780 }
2781
2782 /* otherwise, the buffer is not 32-bit aligned: */
2783 else {
2784
2785 /* get the misalignment to the next 32-bit boundary: */
2786 count_misaligned = (sizeof(tme_uint32_t) - ((unsigned int) (unsigned long) buffer)) % sizeof(tme_uint32_t);
2787
2788 /* copy from the buffer until it is aligned: */
2789 part32_buffer = 0;
2790 part_buffer = ((tme_uint8_t *) &part32_buffer);
2791 count_done = count_misaligned;
2792 count -= count_misaligned;
2793 do {
2794 *part_buffer = *buffer;
2795 part_buffer++;
2796 buffer++;
2797 } while (--count_done != 0);
2798
2799 /* write full 32-bit words with shifting: */
2800 bits_misaligned = count_misaligned * 8;
2801 part32 = part32_buffer;
2802 for (; count >= sizeof(tme_uint32_t); ) {
2803 part32_next = tme_memory_read32((const tme_uint32_t *) buffer, sizeof(tme_uint32_t));
2804 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
2805 part32 |= (part32_next << bits_misaligned);
2806 tme_memory_atomic_write32(parts32, part32, rwlock, sizeof(tme_uint32_t));
2807 part32 = (part32_next >> (32 - bits_misaligned));
2808 }
2809 else {
2810 part32 |= (part32_next >> bits_misaligned);
2811 tme_memory_atomic_write32(parts32, part32, rwlock, sizeof(tme_uint32_t));
2812 part32 = (part32_next << (32 - bits_misaligned));
2813 }
2814
2815 /* advance: */
2816 parts32++;
2817 buffer += sizeof(tme_uint32_t);
2818 count -= sizeof(tme_uint32_t);
2819 }
2820
2821 /* calculate how many more bytes there are to write in this
2822 32-bit memory part: */
2823 count_done = sizeof(tme_uint32_t) - count_misaligned;
2824 part32_buffer = part32;
2825
2826 /* if we can't write one more full 32-bit memory part: */
2827 if (count_done > count) {
2828
2829 /* we will reread this data to write below: */
2830 buffer -= count_misaligned;
2831 count += count_misaligned;
2832 }
2833
2834 /* otherwise, we can write one more full 32-bit memory part: */
2835 else {
2836
2837 /* copy from the buffer until we have the full 32-bit part: */
2838 part_buffer = ((tme_uint8_t *) &part32_buffer) + count_misaligned;
2839 count -= count_done;
2840 do {
2841 *part_buffer = *buffer;
2842 part_buffer++;
2843 buffer++;
2844 } while (--count_done != 0);
2845
2846 /* write the last full 32-bit memory part: */
2847 part32 = part32_buffer;
2848 	  tme_memory_atomic_write32(parts32, part32, rwlock, sizeof(tme_uint32_t));
	  /* advance past the part just written, so that any remaining
	     bytes are exchanged into the following part: */
	  parts32++;
2849 }
2850 }
2851 }
2852
2853 /* if we still have bytes to write: */
2854 if (__tme_predict_false(count > 0)) {
2855
2856 /* we must have less than a full 32-bit part to write: */
2857 assert (count < sizeof(tme_uint32_t));
2858
2859 /* make a mask that clears for the data to write in the last
2860 32-bit memory part: */
2861 part32_mask
2862 = (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
2863 ? _tme_memory_type_mask(tme_uint32_t, << (count * 8))
2864 : _tme_memory_type_mask(tme_uint32_t, >> (count * 8)));
2865
2866 /* copy from the buffer the bytes to write in the last
2867 32-bit memory part: */
2868 part32_buffer = 0;
2869 part_buffer = ((tme_uint8_t *) &part32_buffer);
2870 count_done = count;
2871 do {
2872 *part_buffer = *buffer;
2873 part_buffer++;
2874 buffer++;
2875 } while (--count_done != 0);
2876
2877 /* compare-and-exchange the last 32-bit memory part: */
2878     part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
2879 do {
2880 part32_cmp = part32;
2881 part32 = (part32 & part32_mask) | part32_buffer;
2882 part32 = tme_memory_atomic_cx32(parts32, part32_cmp, part32, rwlock, sizeof(tme_uint32_t));
2883 } while (part32 != part32_cmp);
2884 }
2885
2886 }
2887
2888 else if (host_boundary == sizeof(tme_uint16_t)) {
2889
2890 /* make a 16-bit pointer to the memory: */
2891 parts16 = (tme_shared tme_uint16_t *) mem;
2892
2893 /* if this pointer is not 16-bit aligned: */
2894 if (__tme_predict_false((((unsigned long) parts16) % sizeof(tme_uint16_t)) != 0)) {
2895
2896 /* get the misalignment from the previous 16-bit boundary: */
2897 count_misaligned = ((unsigned long) parts16) % sizeof(tme_uint16_t);
2898
2899 /* truncate this pointer to the previous 16-bit boundary: */
2900 parts16 = (tme_shared tme_uint16_t *) (((unsigned long) parts16) & (((unsigned long) 0) - sizeof(tme_uint16_t)));
2901
2902 /* get the number of bytes to write in the first 16-bit memory part: */
2903 count_done = sizeof(tme_uint16_t) - count_misaligned;
2904 if (__tme_predict_false(count_done > count)) {
2905 count_done = count;
2906 }
2907
2908 /* make a mask that clears for the data to write in the
2909 first 16-bit memory part: */
2910 part16_mask = 1;
2911 part16_mask = (part16_mask << (count_done * 8)) - 1;
2912 part16_mask
2913 <<= (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
2914 ? (count_misaligned * 8)
2915 : (16 - ((count_misaligned + count_done) * 8)));
2916 part16_mask = ~part16_mask;
2917
2918 /* copy from the buffer the bytes to write in the first
2919 16-bit memory part: */
2920 part16_buffer = 0;
2921 part_buffer = ((tme_uint8_t *) &part16_buffer) + count_misaligned;
2922 count -= count_done;
2923 do {
2924 *part_buffer = *buffer;
2925 part_buffer++;
2926 buffer++;
2927 } while (--count_done != 0);
2928
2929 /* compare-and-exchange the first 16-bit memory part: */
2930       part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
2931 do {
2932 part16_cmp = part16;
2933 part16 = (part16 & part16_mask) | part16_buffer;
2934 part16 = tme_memory_atomic_cx16(parts16, part16_cmp, part16, rwlock, sizeof(tme_uint16_t));
2935 } while (part16 != part16_cmp);
2936 parts16++;
2937 }
2938
2939 /* if we have full 16-bit parts to write: */
2940 if (__tme_predict_true(count >= sizeof(tme_uint16_t))) {
2941
2942 /* if the buffer is 16-bit aligned: */
2943 if (__tme_predict_true((((unsigned long) buffer) % sizeof(tme_uint16_t)) == 0)) {
2944
2945 /* write full 16-bit parts without shifting: */
2946 do {
2947 part16 = tme_memory_read16((const tme_uint16_t *) buffer, sizeof(tme_uint16_t));
2948 tme_memory_atomic_write16(parts16, part16, rwlock, sizeof(tme_uint16_t));
2949
2950 /* advance: */
2951 parts16++;
2952 buffer += sizeof(tme_uint16_t);
2953 count -= sizeof(tme_uint16_t);
2954 } while (count >= sizeof(tme_uint16_t));
2955 }
2956
2957 /* otherwise, the buffer is not 16-bit aligned: */
2958 else {
2959
2960 /* get the misalignment to the next 16-bit boundary: */
2961 count_misaligned = (sizeof(tme_uint16_t) - ((unsigned int) (unsigned long) buffer)) % sizeof(tme_uint16_t);
2962
2963 /* copy from the buffer until it is aligned: */
2964 part16_buffer = 0;
2965 part_buffer = ((tme_uint8_t *) &part16_buffer);
2966 count_done = count_misaligned;
2967 count -= count_misaligned;
2968 do {
2969 *part_buffer = *buffer;
2970 part_buffer++;
2971 buffer++;
2972 } while (--count_done != 0);
2973
2974 /* write full 16-bit words with shifting: */
2975 bits_misaligned = count_misaligned * 8;
2976 part16 = part16_buffer;
2977 for (; count >= sizeof(tme_uint16_t); ) {
2978 part16_next = tme_memory_read16((const tme_uint16_t *) buffer, sizeof(tme_uint16_t));
2979 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
2980 part16 |= (part16_next << bits_misaligned);
2981 tme_memory_atomic_write16(parts16, part16, rwlock, sizeof(tme_uint16_t));
2982 part16 = (part16_next >> (16 - bits_misaligned));
2983 }
2984 else {
2985 part16 |= (part16_next >> bits_misaligned);
2986 tme_memory_atomic_write16(parts16, part16, rwlock, sizeof(tme_uint16_t));
2987 part16 = (part16_next << (16 - bits_misaligned));
2988 }
2989
2990 /* advance: */
2991 parts16++;
2992 buffer += sizeof(tme_uint16_t);
2993 count -= sizeof(tme_uint16_t);
2994 }
2995
2996 /* calculate how many more bytes there are to write in this
2997 16-bit memory part: */
2998 count_done = sizeof(tme_uint16_t) - count_misaligned;
2999 part16_buffer = part16;
3000
3001 /* if we can't write one more full 16-bit memory part: */
3002 if (count_done > count) {
3003
3004 /* we will reread this data to write below: */
3005 buffer -= count_misaligned;
3006 count += count_misaligned;
3007 }
3008
3009 /* otherwise, we can write one more full 16-bit memory part: */
3010 else {
3011
3012 /* copy from the buffer until we have the full 16-bit part: */
3013 part_buffer = ((tme_uint8_t *) &part16_buffer) + count_misaligned;
3014 count -= count_done;
3015 do {
3016 *part_buffer = *buffer;
3017 part_buffer++;
3018 buffer++;
3019 } while (--count_done != 0);
3020
3021 /* write the last full 16-bit memory part: */
3022 part16 = part16_buffer;
3023 	  tme_memory_atomic_write16(parts16, part16, rwlock, sizeof(tme_uint16_t));
	  /* advance past the part just written, so that any remaining
	     bytes are exchanged into the following part: */
	  parts16++;
3024 }
3025 }
3026 }
3027
3028 /* if we still have bytes to write: */
3029 if (__tme_predict_false(count > 0)) {
3030
3031 /* we must have less than a full 16-bit part to write: */
3032 assert (count < sizeof(tme_uint16_t));
3033
3034 /* make a mask that clears for the data to write in the last
3035 16-bit memory part: */
3036 part16_mask
3037 = (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
3038 ? _tme_memory_type_mask(tme_uint16_t, << (count * 8))
3039 : _tme_memory_type_mask(tme_uint16_t, >> (count * 8)));
3040
3041 /* copy from the buffer the bytes to write in the last
3042 16-bit memory part: */
3043 part16_buffer = 0;
3044 part_buffer = ((tme_uint8_t *) &part16_buffer);
3045 count_done = count;
3046 do {
3047 *part_buffer = *buffer;
3048 part_buffer++;
3049 buffer++;
3050 } while (--count_done != 0);
3051
3052 /* compare-and-exchange the last 16-bit memory part: */
3053     part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
3054 do {
3055 part16_cmp = part16;
3056 part16 = (part16 & part16_mask) | part16_buffer;
3057 part16 = tme_memory_atomic_cx16(parts16, part16_cmp, part16, rwlock, sizeof(tme_uint16_t));
3058 } while (part16 != part16_cmp);
3059 }
3060
3061 }
3062
3063 else {
3064
3065     /* make an 8-bit pointer to the memory: */
3066 parts8 = (tme_shared tme_uint8_t *) mem;
3067
3068 /* if this pointer is not 8-bit aligned: */
3069 if (__tme_predict_false((((unsigned long) parts8) % sizeof(tme_uint8_t)) != 0)) {
3070
3071 /* get the misalignment from the previous 8-bit boundary: */
3072 count_misaligned = ((unsigned long) parts8) % sizeof(tme_uint8_t);
3073
3074 /* truncate this pointer to the previous 8-bit boundary: */
3075 parts8 = (tme_shared tme_uint8_t *) (((unsigned long) parts8) & (((unsigned long) 0) - sizeof(tme_uint8_t)));
3076
3077 /* get the number of bytes to write in the first 8-bit memory part: */
3078 count_done = sizeof(tme_uint8_t) - count_misaligned;
3079 if (__tme_predict_false(count_done > count)) {
3080 count_done = count;
3081 }
3082
3083 /* make a mask that clears for the data to write in the
3084 first 8-bit memory part: */
3085 part8_mask = 1;
3086 part8_mask = (part8_mask << (count_done * 8)) - 1;
3087 part8_mask
3088 <<= (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
3089 ? (count_misaligned * 8)
3090 : (8 - ((count_misaligned + count_done) * 8)));
3091 part8_mask = ~part8_mask;
3092
3093 /* copy from the buffer the bytes to write in the first
3094 8-bit memory part: */
3095 part8_buffer = 0;
3096 part_buffer = ((tme_uint8_t *) &part8_buffer) + count_misaligned;
3097 count -= count_done;
3098 do {
3099 *part_buffer = *buffer;
3100 part_buffer++;
3101 buffer++;
3102 } while (--count_done != 0);
3103
3104 /* compare-and-exchange the first 8-bit memory part: */
3105       part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
3106 do {
3107 part8_cmp = part8;
3108 part8 = (part8 & part8_mask) | part8_buffer;
3109 part8 = tme_memory_atomic_cx8(parts8, part8_cmp, part8, rwlock, sizeof(tme_uint8_t));
3110 } while (part8 != part8_cmp);
3111 parts8++;
3112 }
3113
3114 /* if we have full 8-bit parts to write: */
3115 if (__tme_predict_true(count >= sizeof(tme_uint8_t))) {
3116
3117 /* if the buffer is 8-bit aligned: */
3118 if (__tme_predict_true((((unsigned long) buffer) % sizeof(tme_uint8_t)) == 0)) {
3119
3120 /* write full 8-bit parts without shifting: */
3121 do {
3122 part8 = tme_memory_read8((const tme_uint8_t *) buffer, sizeof(tme_uint8_t));
3123 tme_memory_atomic_write8(parts8, part8, rwlock, sizeof(tme_uint8_t));
3124
3125 /* advance: */
3126 parts8++;
3127 buffer += sizeof(tme_uint8_t);
3128 count -= sizeof(tme_uint8_t);
3129 } while (count >= sizeof(tme_uint8_t));
3130 }
3131
3132 /* otherwise, the buffer is not 8-bit aligned: */
3133 else {
3134
3135 /* get the misalignment to the next 8-bit boundary: */
3136 count_misaligned = (sizeof(tme_uint8_t) - ((unsigned int) (unsigned long) buffer)) % sizeof(tme_uint8_t);
3137
3138 /* copy from the buffer until it is aligned: */
3139 part8_buffer = 0;
3140 part_buffer = ((tme_uint8_t *) &part8_buffer);
3141 count_done = count_misaligned;
3142 count -= count_misaligned;
3143 do {
3144 *part_buffer = *buffer;
3145 part_buffer++;
3146 buffer++;
3147 } while (--count_done != 0);
3148
3149         /* write full 8-bit parts with shifting: */
3150 bits_misaligned = count_misaligned * 8;
3151 part8 = part8_buffer;
3152 for (; count >= sizeof(tme_uint8_t); ) {
3153 part8_next = tme_memory_read8((const tme_uint8_t *) buffer, sizeof(tme_uint8_t));
3154 if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
3155 part8 |= (part8_next << bits_misaligned);
3156 tme_memory_atomic_write8(parts8, part8, rwlock, sizeof(tme_uint8_t));
3157 part8 = (part8_next >> (8 - bits_misaligned));
3158 }
3159 else {
3160 part8 |= (part8_next >> bits_misaligned);
3161 tme_memory_atomic_write8(parts8, part8, rwlock, sizeof(tme_uint8_t));
3162 part8 = (part8_next << (8 - bits_misaligned));
3163 }
3164
3165 /* advance: */
3166 parts8++;
3167 buffer += sizeof(tme_uint8_t);
3168 count -= sizeof(tme_uint8_t);
3169 }
3170
3171 /* calculate how many more bytes there are to write in this
3172 8-bit memory part: */
3173 count_done = sizeof(tme_uint8_t) - count_misaligned;
3174 part8_buffer = part8;
3175
3176 /* if we can't write one more full 8-bit memory part: */
3177 if (count_done > count) {
3178
3179 /* we will reread this data to write below: */
3180 buffer -= count_misaligned;
3181 count += count_misaligned;
3182 }
3183
3184 /* otherwise, we can write one more full 8-bit memory part: */
3185 else {
3186
3187 /* copy from the buffer until we have the full 8-bit part: */
3188 part_buffer = ((tme_uint8_t *) &part8_buffer) + count_misaligned;
3189 count -= count_done;
3190 do {
3191 *part_buffer = *buffer;
3192 part_buffer++;
3193 buffer++;
3194 } while (--count_done != 0);
3195
3196 /* write the last full 8-bit memory part: */
3197 part8 = part8_buffer;
3198 tme_memory_atomic_write8(parts8, part8, rwlock, sizeof(tme_uint8_t));
3199 }
3200 }
3201 }
3202
3203 /* if we still have bytes to write: */
3204 if (__tme_predict_false(count > 0)) {
3205
3206 /* we must have less than a full 8-bit part to write: */
3207 assert (count < sizeof(tme_uint8_t));
3208
3209       /* make a mask whose bits are clear where the data will be
3210          written in the last 8-bit memory part: */
3211 part8_mask
3212 = (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
3213 ? _tme_memory_type_mask(tme_uint8_t, << (count * 8))
3214 : _tme_memory_type_mask(tme_uint8_t, >> (count * 8)));
3215
3216 /* copy from the buffer the bytes to write in the last
3217 8-bit memory part: */
3218 part8_buffer = 0;
3219 part_buffer = ((tme_uint8_t *) &part8_buffer);
3220 count_done = count;
3221 do {
3222 *part_buffer = *buffer;
3223 part_buffer++;
3224 buffer++;
3225 } while (--count_done != 0);
3226
3227 /* compare-and-exchange the last 8-bit memory part: */
3228       part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
3229 do {
3230 part8_cmp = part8;
3231 part8 = (part8 & part8_mask) | part8_buffer;
3232 part8 = tme_memory_atomic_cx8(parts8, part8_cmp, part8, rwlock, sizeof(tme_uint8_t));
3233 } while (part8 != part8_cmp);
3234 }
3235
3236 }
3237 }
3238
3239 /* the 8-bit atomic operations: */
3240
3241 /* undefine any macro version of tme_memory_atomic_add8: */
3242 #undef tme_memory_atomic_add8
3243
3244 /* the 8-bit atomic add function: */
3245 tme_uint8_t
3246 tme_memory_atomic_add8(tme_shared tme_uint8_t *memory,
3247 tme_uint8_t operand,
3248 tme_rwlock_t *rwlock,
3249 unsigned int align_min)
3250 {
3251 tme_uint8_t value_read;
3252 tme_uint8_t value_written;
3253 tme_uint8_t value_read_verify;
3254
3255 /* if we can't make direct accesses at all, all atomic
3256 accesses must be done under lock. (when threads are
3257 cooperative the actual locking isn't needed): */
3258 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
3259 if (!TME_THREADS_COOPERATIVE) {
3260 tme_rwlock_wrlock(rwlock);
3261 }
3262 value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
3263 value_written = value_read + operand;
3264 tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
3265 if (!TME_THREADS_COOPERATIVE) {
3266 tme_rwlock_unlock(rwlock);
3267 }
3268 }
3269
3270 /* otherwise, threads are not cooperative and this host CPU
3271 can make atomic accesses to at least the most common memory
3272 size.
3273
3274 in that case, the only reason this function should get
3275 called is if the host CPU can't do an atomic 8-bit
3276 add at all, or if it can't do it at this alignment.
3277
3278 we emulate the atomic 8-bit add with a compare-and-exchange: */
3279 else {
3280
3281 /* do an atomic read of the memory: */
3282 value_read = tme_memory_atomic_read8(memory, rwlock, align_min);
3283
3284 /* spin the add in a compare-and-exchange loop: */
3285 for (;;) {
3286
3287 /* make the value to write: */
3288 value_written = value_read + operand;
3289
3290 /* try the compare-and-exchange: */
3291 value_read_verify = tme_memory_atomic_cx8(memory, value_read, value_written, rwlock, align_min);
3292
3293 /* if the compare-and-exchange failed: */
3294 if (__tme_predict_false(value_read_verify != value_read)) {
3295
3296 /* loop with the new value read from the memory: */
3297 value_read = value_read_verify;
3298 continue;
3299 }
3300
3301 /* stop now: */
3302 break;
3303 }
3304 }
3305
3306 /* return the value read: */
3307 return (value_read);
3308 }
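
/* illustrative usage sketch; the example_ names below are hypothetical.
   a caller can use tme_memory_atomic_add8 as a fetch-and-add, for
   example to bump a shared 8-bit event counter and observe the
   previous count: */
#if 0
static tme_uint8_t
example_count_event(tme_shared tme_uint8_t *example_counter,
                    tme_rwlock_t *example_rwlock)
{
  /* atomically add one to the counter; the value returned is the
     value read before the add: */
  return (tme_memory_atomic_add8(example_counter, 1, example_rwlock, sizeof(tme_uint8_t)));
}
#endif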
3309
3310 /* undefine any macro version of tme_memory_atomic_sub8: */
3311 #undef tme_memory_atomic_sub8
3312
3313 /* the 8-bit atomic sub function: */
3314 tme_uint8_t
3315 tme_memory_atomic_sub8(tme_shared tme_uint8_t *memory,
3316 tme_uint8_t operand,
3317 tme_rwlock_t *rwlock,
3318 unsigned int align_min)
3319 {
3320 tme_uint8_t value_read;
3321 tme_uint8_t value_written;
3322 tme_uint8_t value_read_verify;
3323
3324 /* if we can't make direct accesses at all, all atomic
3325 accesses must be done under lock. (when threads are
3326 cooperative the actual locking isn't needed): */
3327 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
3328 if (!TME_THREADS_COOPERATIVE) {
3329 tme_rwlock_wrlock(rwlock);
3330 }
3331 value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
3332 value_written = value_read - operand;
3333 tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
3334 if (!TME_THREADS_COOPERATIVE) {
3335 tme_rwlock_unlock(rwlock);
3336 }
3337 }
3338
3339 /* otherwise, threads are not cooperative and this host CPU
3340 can make atomic accesses to at least the most common memory
3341 size.
3342
3343 in that case, the only reason this function should get
3344 called is if the host CPU can't do an atomic 8-bit
3345 sub at all, or if it can't do it at this alignment.
3346
3347 we emulate the atomic 8-bit sub with a compare-and-exchange: */
3348 else {
3349
3350 /* do an atomic read of the memory: */
3351 value_read = tme_memory_atomic_read8(memory, rwlock, align_min);
3352
3353 /* spin the sub in a compare-and-exchange loop: */
3354 for (;;) {
3355
3356 /* make the value to write: */
3357 value_written = value_read - operand;
3358
3359 /* try the compare-and-exchange: */
3360 value_read_verify = tme_memory_atomic_cx8(memory, value_read, value_written, rwlock, align_min);
3361
3362 /* if the compare-and-exchange failed: */
3363 if (__tme_predict_false(value_read_verify != value_read)) {
3364
3365 /* loop with the new value read from the memory: */
3366 value_read = value_read_verify;
3367 continue;
3368 }
3369
3370 /* stop now: */
3371 break;
3372 }
3373 }
3374
3375 /* return the value read: */
3376 return (value_read);
3377 }
3378
3379 /* undefine any macro version of tme_memory_atomic_mul8: */
3380 #undef tme_memory_atomic_mul8
3381
3382 /* the 8-bit atomic mul function: */
3383 tme_uint8_t
3384 tme_memory_atomic_mul8(tme_shared tme_uint8_t *memory,
3385 tme_uint8_t operand,
3386 tme_rwlock_t *rwlock,
3387 unsigned int align_min)
3388 {
3389 tme_uint8_t value_read;
3390 tme_uint8_t value_written;
3391 tme_uint8_t value_read_verify;
3392
3393 /* if we can't make direct accesses at all, all atomic
3394 accesses must be done under lock. (when threads are
3395 cooperative the actual locking isn't needed): */
3396 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
3397 if (!TME_THREADS_COOPERATIVE) {
3398 tme_rwlock_wrlock(rwlock);
3399 }
3400 value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
3401 value_written = value_read * operand;
3402 tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
3403 if (!TME_THREADS_COOPERATIVE) {
3404 tme_rwlock_unlock(rwlock);
3405 }
3406 }
3407
3408 /* otherwise, threads are not cooperative and this host CPU
3409 can make atomic accesses to at least the most common memory
3410 size.
3411
3412 in that case, the only reason this function should get
3413 called is if the host CPU can't do an atomic 8-bit
3414 mul at all, or if it can't do it at this alignment.
3415
3416 we emulate the atomic 8-bit mul with a compare-and-exchange: */
3417 else {
3418
3419 /* do an atomic read of the memory: */
3420 value_read = tme_memory_atomic_read8(memory, rwlock, align_min);
3421
3422 /* spin the mul in a compare-and-exchange loop: */
3423 for (;;) {
3424
3425 /* make the value to write: */
3426 value_written = value_read * operand;
3427
3428 /* try the compare-and-exchange: */
3429 value_read_verify = tme_memory_atomic_cx8(memory, value_read, value_written, rwlock, align_min);
3430
3431 /* if the compare-and-exchange failed: */
3432 if (__tme_predict_false(value_read_verify != value_read)) {
3433
3434 /* loop with the new value read from the memory: */
3435 value_read = value_read_verify;
3436 continue;
3437 }
3438
3439 /* stop now: */
3440 break;
3441 }
3442 }
3443
3444 /* return the value read: */
3445 return (value_read);
3446 }
3447
3448 /* undefine any macro version of tme_memory_atomic_div8: */
3449 #undef tme_memory_atomic_div8
3450
3451 /* the 8-bit atomic div function: */
3452 tme_uint8_t
3453 tme_memory_atomic_div8(tme_shared tme_uint8_t *memory,
3454 tme_uint8_t operand,
3455 tme_rwlock_t *rwlock,
3456 unsigned int align_min)
3457 {
3458 tme_uint8_t value_read;
3459 tme_uint8_t value_written;
3460 tme_uint8_t value_read_verify;
3461
3462 /* if we can't make direct accesses at all, all atomic
3463 accesses must be done under lock. (when threads are
3464 cooperative the actual locking isn't needed): */
3465 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
3466 if (!TME_THREADS_COOPERATIVE) {
3467 tme_rwlock_wrlock(rwlock);
3468 }
3469 value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
3470 value_written = value_read / operand;
3471 tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
3472 if (!TME_THREADS_COOPERATIVE) {
3473 tme_rwlock_unlock(rwlock);
3474 }
3475 }
3476
3477 /* otherwise, threads are not cooperative and this host CPU
3478 can make atomic accesses to at least the most common memory
3479 size.
3480
3481 in that case, the only reason this function should get
3482 called is if the host CPU can't do an atomic 8-bit
3483 div at all, or if it can't do it at this alignment.
3484
3485 we emulate the atomic 8-bit div with a compare-and-exchange: */
3486 else {
3487
3488 /* do an atomic read of the memory: */
3489 value_read = tme_memory_atomic_read8(memory, rwlock, align_min);
3490
3491 /* spin the div in a compare-and-exchange loop: */
3492 for (;;) {
3493
3494 /* make the value to write: */
3495 value_written = value_read / operand;
3496
3497 /* try the compare-and-exchange: */
3498 value_read_verify = tme_memory_atomic_cx8(memory, value_read, value_written, rwlock, align_min);
3499
3500 /* if the compare-and-exchange failed: */
3501 if (__tme_predict_false(value_read_verify != value_read)) {
3502
3503 /* loop with the new value read from the memory: */
3504 value_read = value_read_verify;
3505 continue;
3506 }
3507
3508 /* stop now: */
3509 break;
3510 }
3511 }
3512
3513 /* return the value read: */
3514 return (value_read);
3515 }
3516
3517 /* undefine any macro version of tme_memory_atomic_and8: */
3518 #undef tme_memory_atomic_and8
3519
3520 /* the 8-bit atomic and function: */
3521 tme_uint8_t
3522 tme_memory_atomic_and8(tme_shared tme_uint8_t *memory,
3523 tme_uint8_t operand,
3524 tme_rwlock_t *rwlock,
3525 unsigned int align_min)
3526 {
3527 tme_uint8_t value_read;
3528 tme_uint8_t value_written;
3529 tme_uint8_t value_read_verify;
3530
3531 /* if we can't make direct accesses at all, all atomic
3532 accesses must be done under lock. (when threads are
3533 cooperative the actual locking isn't needed): */
3534 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
3535 if (!TME_THREADS_COOPERATIVE) {
3536 tme_rwlock_wrlock(rwlock);
3537 }
3538 value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
3539 value_written = value_read & operand;
3540 tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
3541 if (!TME_THREADS_COOPERATIVE) {
3542 tme_rwlock_unlock(rwlock);
3543 }
3544 }
3545
3546 /* otherwise, threads are not cooperative and this host CPU
3547 can make atomic accesses to at least the most common memory
3548 size.
3549
3550 in that case, the only reason this function should get
3551 called is if the host CPU can't do an atomic 8-bit
3552 and at all, or if it can't do it at this alignment.
3553
3554 we emulate the atomic 8-bit and with a compare-and-exchange: */
3555 else {
3556
3557 /* do an atomic read of the memory: */
3558 value_read = tme_memory_atomic_read8(memory, rwlock, align_min);
3559
3560 /* spin the and in a compare-and-exchange loop: */
3561 for (;;) {
3562
3563 /* make the value to write: */
3564 value_written = value_read & operand;
3565
3566 /* try the compare-and-exchange: */
3567 value_read_verify = tme_memory_atomic_cx8(memory, value_read, value_written, rwlock, align_min);
3568
3569 /* if the compare-and-exchange failed: */
3570 if (__tme_predict_false(value_read_verify != value_read)) {
3571
3572 /* loop with the new value read from the memory: */
3573 value_read = value_read_verify;
3574 continue;
3575 }
3576
3577 /* stop now: */
3578 break;
3579 }
3580 }
3581
3582 /* return the value read: */
3583 return (value_read);
3584 }
3585
3586 /* undefine any macro version of tme_memory_atomic_or8: */
3587 #undef tme_memory_atomic_or8
3588
3589 /* the 8-bit atomic or function: */
3590 tme_uint8_t
3591 tme_memory_atomic_or8(tme_shared tme_uint8_t *memory,
3592 tme_uint8_t operand,
3593 tme_rwlock_t *rwlock,
3594 unsigned int align_min)
3595 {
3596 tme_uint8_t value_read;
3597 tme_uint8_t value_written;
3598 tme_uint8_t value_read_verify;
3599
3600 /* if we can't make direct accesses at all, all atomic
3601 accesses must be done under lock. (when threads are
3602 cooperative the actual locking isn't needed): */
3603 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
3604 if (!TME_THREADS_COOPERATIVE) {
3605 tme_rwlock_wrlock(rwlock);
3606 }
3607 value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
3608 value_written = value_read | operand;
3609 tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
3610 if (!TME_THREADS_COOPERATIVE) {
3611 tme_rwlock_unlock(rwlock);
3612 }
3613 }
3614
3615 /* otherwise, threads are not cooperative and this host CPU
3616 can make atomic accesses to at least the most common memory
3617 size.
3618
3619 in that case, the only reason this function should get
3620 called is if the host CPU can't do an atomic 8-bit
3621 or at all, or if it can't do it at this alignment.
3622
3623 we emulate the atomic 8-bit or with a compare-and-exchange: */
3624 else {
3625
3626 /* do an atomic read of the memory: */
3627 value_read = tme_memory_atomic_read8(memory, rwlock, align_min);
3628
3629 /* spin the or in a compare-and-exchange loop: */
3630 for (;;) {
3631
3632 /* make the value to write: */
3633 value_written = value_read | operand;
3634
3635 /* try the compare-and-exchange: */
3636 value_read_verify = tme_memory_atomic_cx8(memory, value_read, value_written, rwlock, align_min);
3637
3638 /* if the compare-and-exchange failed: */
3639 if (__tme_predict_false(value_read_verify != value_read)) {
3640
3641 /* loop with the new value read from the memory: */
3642 value_read = value_read_verify;
3643 continue;
3644 }
3645
3646 /* stop now: */
3647 break;
3648 }
3649 }
3650
3651 /* return the value read: */
3652 return (value_read);
3653 }
3654
3655 /* undefine any macro version of tme_memory_atomic_xor8: */
3656 #undef tme_memory_atomic_xor8
3657
3658 /* the 8-bit atomic xor function: */
3659 tme_uint8_t
3660 tme_memory_atomic_xor8(tme_shared tme_uint8_t *memory,
3661 tme_uint8_t operand,
3662 tme_rwlock_t *rwlock,
3663 unsigned int align_min)
3664 {
3665 tme_uint8_t value_read;
3666 tme_uint8_t value_written;
3667 tme_uint8_t value_read_verify;
3668
3669 /* if we can't make direct accesses at all, all atomic
3670 accesses must be done under lock. (when threads are
3671 cooperative the actual locking isn't needed): */
3672 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
3673 if (!TME_THREADS_COOPERATIVE) {
3674 tme_rwlock_wrlock(rwlock);
3675 }
3676 value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
3677 value_written = value_read ^ operand;
3678 tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
3679 if (!TME_THREADS_COOPERATIVE) {
3680 tme_rwlock_unlock(rwlock);
3681 }
3682 }
3683
3684 /* otherwise, threads are not cooperative and this host CPU
3685 can make atomic accesses to at least the most common memory
3686 size.
3687
3688 in that case, the only reason this function should get
3689 called is if the host CPU can't do an atomic 8-bit
3690 xor at all, or if it can't do it at this alignment.
3691
3692 we emulate the atomic 8-bit xor with a compare-and-exchange: */
3693 else {
3694
3695 /* do an atomic read of the memory: */
3696 value_read = tme_memory_atomic_read8(memory, rwlock, align_min);
3697
3698 /* spin the xor in a compare-and-exchange loop: */
3699 for (;;) {
3700
3701 /* make the value to write: */
3702 value_written = value_read ^ operand;
3703
3704 /* try the compare-and-exchange: */
3705 value_read_verify = tme_memory_atomic_cx8(memory, value_read, value_written, rwlock, align_min);
3706
3707 /* if the compare-and-exchange failed: */
3708 if (__tme_predict_false(value_read_verify != value_read)) {
3709
3710 /* loop with the new value read from the memory: */
3711 value_read = value_read_verify;
3712 continue;
3713 }
3714
3715 /* stop now: */
3716 break;
3717 }
3718 }
3719
3720 /* return the value read: */
3721 return (value_read);
3722 }
3723
3724 /* undefine any macro version of tme_memory_atomic_not8: */
3725 #undef tme_memory_atomic_not8
3726
3727 /* the 8-bit atomic not function: */
3728 tme_uint8_t
3729 tme_memory_atomic_not8(tme_shared tme_uint8_t *memory,
3730 tme_rwlock_t *rwlock,
3731 unsigned int align_min)
3732 {
3733 tme_uint8_t value_read;
3734 tme_uint8_t value_written;
3735 tme_uint8_t value_read_verify;
3736
3737 /* if we can't make direct accesses at all, all atomic
3738 accesses must be done under lock. (when threads are
3739 cooperative the actual locking isn't needed): */
3740 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
3741 if (!TME_THREADS_COOPERATIVE) {
3742 tme_rwlock_wrlock(rwlock);
3743 }
3744 value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
3745 value_written = ~value_read;
3746 tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
3747 if (!TME_THREADS_COOPERATIVE) {
3748 tme_rwlock_unlock(rwlock);
3749 }
3750 }
3751
3752 /* otherwise, threads are not cooperative and this host CPU
3753 can make atomic accesses to at least the most common memory
3754 size.
3755
3756 in that case, the only reason this function should get
3757 called is if the host CPU can't do an atomic 8-bit
3758 not at all, or if it can't do it at this alignment.
3759
3760 we emulate the atomic 8-bit not with a compare-and-exchange: */
3761 else {
3762
3763 /* do an atomic read of the memory: */
3764 value_read = tme_memory_atomic_read8(memory, rwlock, align_min);
3765
3766 /* spin the not in a compare-and-exchange loop: */
3767 for (;;) {
3768
3769 /* make the value to write: */
3770 value_written = ~value_read;
3771
3772 /* try the compare-and-exchange: */
3773 value_read_verify = tme_memory_atomic_cx8(memory, value_read, value_written, rwlock, align_min);
3774
3775 /* if the compare-and-exchange failed: */
3776 if (__tme_predict_false(value_read_verify != value_read)) {
3777
3778 /* loop with the new value read from the memory: */
3779 value_read = value_read_verify;
3780 continue;
3781 }
3782
3783 /* stop now: */
3784 break;
3785 }
3786 }
3787
3788 /* return the value read: */
3789 return (value_read);
3790 }
3791
3792 /* undefine any macro version of tme_memory_atomic_neg8: */
3793 #undef tme_memory_atomic_neg8
3794
3795 /* the 8-bit atomic neg function: */
3796 tme_uint8_t
3797 tme_memory_atomic_neg8(tme_shared tme_uint8_t *memory,
3798 tme_rwlock_t *rwlock,
3799 unsigned int align_min)
3800 {
3801 tme_uint8_t value_read;
3802 tme_uint8_t value_written;
3803 tme_uint8_t value_read_verify;
3804
3805 /* if we can't make direct accesses at all, all atomic
3806 accesses must be done under lock. (when threads are
3807 cooperative the actual locking isn't needed): */
3808 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
3809 if (!TME_THREADS_COOPERATIVE) {
3810 tme_rwlock_wrlock(rwlock);
3811 }
3812 value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
3813 value_written = 0 - value_read;
3814 tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
3815 if (!TME_THREADS_COOPERATIVE) {
3816 tme_rwlock_unlock(rwlock);
3817 }
3818 }
3819
3820 /* otherwise, threads are not cooperative and this host CPU
3821 can make atomic accesses to at least the most common memory
3822 size.
3823
3824 in that case, the only reason this function should get
3825 called is if the host CPU can't do an atomic 8-bit
3826 neg at all, or if it can't do it at this alignment.
3827
3828 we emulate the atomic 8-bit neg with a compare-and-exchange: */
3829 else {
3830
3831 /* do an atomic read of the memory: */
3832 value_read = tme_memory_atomic_read8(memory, rwlock, align_min);
3833
3834 /* spin the neg in a compare-and-exchange loop: */
3835 for (;;) {
3836
3837 /* make the value to write: */
3838 value_written = 0 - value_read;
3839
3840 /* try the compare-and-exchange: */
3841 value_read_verify = tme_memory_atomic_cx8(memory, value_read, value_written, rwlock, align_min);
3842
3843 /* if the compare-and-exchange failed: */
3844 if (__tme_predict_false(value_read_verify != value_read)) {
3845
3846 /* loop with the new value read from the memory: */
3847 value_read = value_read_verify;
3848 continue;
3849 }
3850
3851 /* stop now: */
3852 break;
3853 }
3854 }
3855
3856 /* return the value read: */
3857 return (value_read);
3858 }
3859
3860 /* undefine any macro version of tme_memory_atomic_xchg8: */
3861 #undef tme_memory_atomic_xchg8
3862
3863 /* the 8-bit atomic xchg function: */
3864 tme_uint8_t
3865 tme_memory_atomic_xchg8(tme_shared tme_uint8_t *memory,
3866 tme_uint8_t value_written,
3867 tme_rwlock_t *rwlock,
3868 unsigned int align_min)
3869 {
3870 tme_uint8_t value_read;
3871 tme_uint8_t value_read_verify;
3872
3873 /* if we can't make direct accesses at all, all atomic
3874 accesses must be done under lock. (when threads are
3875 cooperative the actual locking isn't needed): */
3876 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
3877 if (!TME_THREADS_COOPERATIVE) {
3878 tme_rwlock_wrlock(rwlock);
3879 }
3880 value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
3881 tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
3882 if (!TME_THREADS_COOPERATIVE) {
3883 tme_rwlock_unlock(rwlock);
3884 }
3885 }
3886
3887 /* otherwise, threads are not cooperative and this host CPU
3888 can make atomic accesses to at least the most common memory
3889 size.
3890
3891 in that case, the only reason this function should get
3892 called is if the host CPU can't do an atomic 8-bit
3893 xchg at all, or if it can't do it at this alignment.
3894
3895 we emulate the atomic 8-bit xchg with a compare-and-exchange: */
3896 else {
3897
3898 /* do an atomic read of the memory: */
3899 value_read = tme_memory_atomic_read8(memory, rwlock, align_min);
3900
3901 /* spin the xchg in a compare-and-exchange loop: */
3902 for (;;) {
3903
3904 /* try the compare-and-exchange: */
3905 value_read_verify = tme_memory_atomic_cx8(memory, value_read, value_written, rwlock, align_min);
3906
3907 /* if the compare-and-exchange failed: */
3908 if (__tme_predict_false(value_read_verify != value_read)) {
3909
3910 /* loop with the new value read from the memory: */
3911 value_read = value_read_verify;
3912 continue;
3913 }
3914
3915 /* stop now: */
3916 break;
3917 }
3918 }
3919
3920 /* return the value read: */
3921 return (value_read);
3922 }
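
/* illustrative usage sketch; the example_ names below are hypothetical.
   a caller can use tme_memory_atomic_xchg8 to fetch a byte of pending
   flags and clear it in one atomic step: */
#if 0
static tme_uint8_t
example_take_flags(tme_shared tme_uint8_t *example_flags,
                   tme_rwlock_t *example_rwlock)
{
  /* swap in zero and return whatever flags were pending: */
  return (tme_memory_atomic_xchg8(example_flags, 0, example_rwlock, sizeof(tme_uint8_t)));
}
#endif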
3923
3924 /* undefine any macro version of tme_memory_atomic_cx8: */
3925 #undef tme_memory_atomic_cx8
3926
3927 /* the 8-bit atomic cx function: */
3928 tme_uint8_t
3929 tme_memory_atomic_cx8(tme_shared tme_uint8_t *memory,
3930 tme_uint8_t value_cmp,
3931 tme_uint8_t value_written,
3932 tme_rwlock_t *rwlock,
3933 unsigned int align_min)
3934 {
3935 tme_uint8_t value_read;
3936
3937 /* if we can't make direct accesses at all, all atomic
3938 accesses must be done under lock. (when threads are
3939 cooperative the actual locking isn't needed): */
3940 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
3941 if (!TME_THREADS_COOPERATIVE) {
3942 tme_rwlock_wrlock(rwlock);
3943 }
3944 value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
3945 if (value_read == value_cmp) {
3946 tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
3947 }
3948 if (!TME_THREADS_COOPERATIVE) {
3949 tme_rwlock_unlock(rwlock);
3950 }
3951 }
3952
3953 /* otherwise, threads are not cooperative and this host CPU
3954 can make atomic accesses to at least the most common memory
3955 size.
3956
3957 in that case, the only reason this function should get
3958 called is if the host CPU can't do an atomic 8-bit
3959 cx at all, or if it can't do it at this alignment.
3960
3961 we assume that these problematic atomic cxs are rare,
3962 and to emulate them we simply stop all other threads while
3963 doing the cx: */
3964 else {
3965 tme_thread_suspend_others();
3966 value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
3967 if (value_read == value_cmp) {
3968 tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
3969 }
3970 tme_thread_resume_others();
3971 }
3972
3973 /* return the value read: */
3974 return (value_read);
3975 }
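
/* illustrative usage sketch; the example_ names below are hypothetical.
   the compare-and-exchange primitive above can build read-modify-write
   operations beyond the ones in this file, for example an atomic 8-bit
   maximum, using the same retry-loop pattern as the functions above: */
#if 0
static tme_uint8_t
example_atomic_max8(tme_shared tme_uint8_t *example_memory,
                    tme_uint8_t operand,
                    tme_rwlock_t *example_rwlock)
{
  tme_uint8_t value_read;
  tme_uint8_t value_read_verify;

  /* read the current value, then retry the compare-and-exchange of
     the maximum until it succeeds: */
  value_read = tme_memory_atomic_read8(example_memory, example_rwlock, sizeof(tme_uint8_t));
  for (;;) {
    value_read_verify
      = tme_memory_atomic_cx8(example_memory,
                              value_read,
                              (operand > value_read ? operand : value_read),
                              example_rwlock,
                              sizeof(tme_uint8_t));
    if (value_read_verify == value_read) {
      break;
    }
    value_read = value_read_verify;
  }

  /* return the value read before the maximum was stored: */
  return (value_read);
}
#endif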
3976
3977 /* the 16-bit atomic operations: */
3978
3979 /* undefine any macro version of tme_memory_atomic_add16: */
3980 #undef tme_memory_atomic_add16
3981
3982 /* the 16-bit atomic add function: */
3983 tme_uint16_t
3984 tme_memory_atomic_add16(tme_shared tme_uint16_t *memory,
3985 tme_uint16_t operand,
3986 tme_rwlock_t *rwlock,
3987 unsigned int align_min)
3988 {
3989 tme_uint16_t value_read;
3990 tme_uint16_t value_written;
3991 tme_uint16_t value_read_verify;
3992
3993 /* if we can't make direct accesses at all, all atomic
3994 accesses must be done under lock. (when threads are
3995 cooperative the actual locking isn't needed): */
3996 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
3997 if (!TME_THREADS_COOPERATIVE) {
3998 tme_rwlock_wrlock(rwlock);
3999 }
4000 value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
4001 value_written = value_read + operand;
4002 tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
4003 if (!TME_THREADS_COOPERATIVE) {
4004 tme_rwlock_unlock(rwlock);
4005 }
4006 }
4007
4008 /* otherwise, threads are not cooperative and this host CPU
4009 can make atomic accesses to at least the most common memory
4010 size.
4011
4012 in that case, the only reason this function should get
4013 called is if the host CPU can't do an atomic 16-bit
4014 add at all, or if it can't do it at this alignment.
4015
4016 we emulate the atomic 16-bit add with a compare-and-exchange: */
4017 else {
4018
4019 /* do an atomic read of the memory: */
4020 value_read = tme_memory_atomic_read16(memory, rwlock, align_min);
4021
4022 /* spin the add in a compare-and-exchange loop: */
4023 for (;;) {
4024
4025 /* make the value to write: */
4026 value_written = value_read + operand;
4027
4028 /* try the compare-and-exchange: */
4029 value_read_verify = tme_memory_atomic_cx16(memory, value_read, value_written, rwlock, align_min);
4030
4031 /* if the compare-and-exchange failed: */
4032 if (__tme_predict_false(value_read_verify != value_read)) {
4033
4034 /* loop with the new value read from the memory: */
4035 value_read = value_read_verify;
4036 continue;
4037 }
4038
4039 /* stop now: */
4040 break;
4041 }
4042 }
4043
4044 /* return the value read: */
4045 return (value_read);
4046 }
4047
4048 /* undefine any macro version of tme_memory_atomic_sub16: */
4049 #undef tme_memory_atomic_sub16
4050
4051 /* the 16-bit atomic sub function: */
4052 tme_uint16_t
4053 tme_memory_atomic_sub16(tme_shared tme_uint16_t *memory,
4054 tme_uint16_t operand,
4055 tme_rwlock_t *rwlock,
4056 unsigned int align_min)
4057 {
4058 tme_uint16_t value_read;
4059 tme_uint16_t value_written;
4060 tme_uint16_t value_read_verify;
4061
4062 /* if we can't make direct accesses at all, all atomic
4063 accesses must be done under lock. (when threads are
4064 cooperative the actual locking isn't needed): */
4065 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
4066 if (!TME_THREADS_COOPERATIVE) {
4067 tme_rwlock_wrlock(rwlock);
4068 }
4069 value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
4070 value_written = value_read - operand;
4071 tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
4072 if (!TME_THREADS_COOPERATIVE) {
4073 tme_rwlock_unlock(rwlock);
4074 }
4075 }
4076
4077 /* otherwise, threads are not cooperative and this host CPU
4078 can make atomic accesses to at least the most common memory
4079 size.
4080
4081 in that case, the only reason this function should get
4082 called is if the host CPU can't do an atomic 16-bit
4083 sub at all, or if it can't do it at this alignment.
4084
4085 we emulate the atomic 16-bit sub with a compare-and-exchange: */
4086 else {
4087
4088 /* do an atomic read of the memory: */
4089 value_read = tme_memory_atomic_read16(memory, rwlock, align_min);
4090
4091 /* spin the sub in a compare-and-exchange loop: */
4092 for (;;) {
4093
4094 /* make the value to write: */
4095 value_written = value_read - operand;
4096
4097 /* try the compare-and-exchange: */
4098 value_read_verify = tme_memory_atomic_cx16(memory, value_read, value_written, rwlock, align_min);
4099
4100 /* if the compare-and-exchange failed: */
4101 if (__tme_predict_false(value_read_verify != value_read)) {
4102
4103 /* loop with the new value read from the memory: */
4104 value_read = value_read_verify;
4105 continue;
4106 }
4107
4108 /* stop now: */
4109 break;
4110 }
4111 }
4112
4113 /* return the value read: */
4114 return (value_read);
4115 }
4116
4117 /* undefine any macro version of tme_memory_atomic_mul16: */
4118 #undef tme_memory_atomic_mul16
4119
4120 /* the 16-bit atomic mul function: */
4121 tme_uint16_t
4122 tme_memory_atomic_mul16(tme_shared tme_uint16_t *memory,
4123 tme_uint16_t operand,
4124 tme_rwlock_t *rwlock,
4125 unsigned int align_min)
4126 {
4127 tme_uint16_t value_read;
4128 tme_uint16_t value_written;
4129 tme_uint16_t value_read_verify;
4130
4131 /* if we can't make direct accesses at all, all atomic
4132 accesses must be done under lock. (when threads are
4133 cooperative the actual locking isn't needed): */
4134 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
4135 if (!TME_THREADS_COOPERATIVE) {
4136 tme_rwlock_wrlock(rwlock);
4137 }
4138 value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
4139 value_written = value_read * operand;
4140 tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
4141 if (!TME_THREADS_COOPERATIVE) {
4142 tme_rwlock_unlock(rwlock);
4143 }
4144 }
4145
4146 /* otherwise, threads are not cooperative and this host CPU
4147 can make atomic accesses to at least the most common memory
4148 size.
4149
4150 in that case, the only reason this function should get
4151 called is if the host CPU can't do an atomic 16-bit
4152 mul at all, or if it can't do it at this alignment.
4153
4154 we emulate the atomic 16-bit mul with a compare-and-exchange: */
4155 else {
4156
4157 /* do an atomic read of the memory: */
4158 value_read = tme_memory_atomic_read16(memory, rwlock, align_min);
4159
4160 /* spin the mul in a compare-and-exchange loop: */
4161 for (;;) {
4162
4163 /* make the value to write: */
4164 value_written = value_read * operand;
4165
4166 /* try the compare-and-exchange: */
4167 value_read_verify = tme_memory_atomic_cx16(memory, value_read, value_written, rwlock, align_min);
4168
4169 /* if the compare-and-exchange failed: */
4170 if (__tme_predict_false(value_read_verify != value_read)) {
4171
4172 /* loop with the new value read from the memory: */
4173 value_read = value_read_verify;
4174 continue;
4175 }
4176
4177 /* stop now: */
4178 break;
4179 }
4180 }
4181
4182 /* return the value read: */
4183 return (value_read);
4184 }
4185
4186 /* undefine any macro version of tme_memory_atomic_div16: */
4187 #undef tme_memory_atomic_div16
4188
4189 /* the 16-bit atomic div function: */
4190 tme_uint16_t
4191 tme_memory_atomic_div16(tme_shared tme_uint16_t *memory,
4192 tme_uint16_t operand,
4193 tme_rwlock_t *rwlock,
4194 unsigned int align_min)
4195 {
4196 tme_uint16_t value_read;
4197 tme_uint16_t value_written;
4198 tme_uint16_t value_read_verify;
4199
4200 /* if we can't make direct accesses at all, all atomic
4201 accesses must be done under lock. (when threads are
4202 cooperative the actual locking isn't needed): */
4203 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
4204 if (!TME_THREADS_COOPERATIVE) {
4205 tme_rwlock_wrlock(rwlock);
4206 }
4207 value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
4208 value_written = value_read / operand;
4209 tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
4210 if (!TME_THREADS_COOPERATIVE) {
4211 tme_rwlock_unlock(rwlock);
4212 }
4213 }
4214
4215 /* otherwise, threads are not cooperative and this host CPU
4216 can make atomic accesses to at least the most common memory
4217 size.
4218
4219 in that case, the only reason this function should get
4220 called is if the host CPU can't do an atomic 16-bit
4221 div at all, or if it can't do it at this alignment.
4222
4223 we emulate the atomic 16-bit div with a compare-and-exchange: */
4224 else {
4225
4226 /* do an atomic read of the memory: */
4227 value_read = tme_memory_atomic_read16(memory, rwlock, align_min);
4228
4229 /* spin the div in a compare-and-exchange loop: */
4230 for (;;) {
4231
4232 /* make the value to write: */
4233 value_written = value_read / operand;
4234
4235 /* try the compare-and-exchange: */
4236 value_read_verify = tme_memory_atomic_cx16(memory, value_read, value_written, rwlock, align_min);
4237
4238 /* if the compare-and-exchange failed: */
4239 if (__tme_predict_false(value_read_verify != value_read)) {
4240
4241 /* loop with the new value read from the memory: */
4242 value_read = value_read_verify;
4243 continue;
4244 }
4245
4246 /* stop now: */
4247 break;
4248 }
4249 }
4250
4251 /* return the value read: */
4252 return (value_read);
4253 }
4254
4255 /* undefine any macro version of tme_memory_atomic_and16: */
4256 #undef tme_memory_atomic_and16
4257
4258 /* the 16-bit atomic and function: */
4259 tme_uint16_t
4260 tme_memory_atomic_and16(tme_shared tme_uint16_t *memory,
4261 tme_uint16_t operand,
4262 tme_rwlock_t *rwlock,
4263 unsigned int align_min)
4264 {
4265 tme_uint16_t value_read;
4266 tme_uint16_t value_written;
4267 tme_uint16_t value_read_verify;
4268
4269 /* if we can't make direct accesses at all, all atomic
4270 accesses must be done under lock. (when threads are
4271 cooperative the actual locking isn't needed): */
4272 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
4273 if (!TME_THREADS_COOPERATIVE) {
4274 tme_rwlock_wrlock(rwlock);
4275 }
4276 value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
4277 value_written = value_read & operand;
4278 tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
4279 if (!TME_THREADS_COOPERATIVE) {
4280 tme_rwlock_unlock(rwlock);
4281 }
4282 }
4283
4284 /* otherwise, threads are not cooperative and this host CPU
4285 can make atomic accesses to at least the most common memory
4286 size.
4287
4288 in that case, the only reason this function should get
4289 called is if the host CPU can't do an atomic 16-bit
4290 and at all, or if it can't do it at this alignment.
4291
4292 we emulate the atomic 16-bit and with a compare-and-exchange: */
4293 else {
4294
4295 /* do an atomic read of the memory: */
4296 value_read = tme_memory_atomic_read16(memory, rwlock, align_min);
4297
4298 /* spin the and in a compare-and-exchange loop: */
4299 for (;;) {
4300
4301 /* make the value to write: */
4302 value_written = value_read & operand;
4303
4304 /* try the compare-and-exchange: */
4305 value_read_verify = tme_memory_atomic_cx16(memory, value_read, value_written, rwlock, align_min);
4306
4307 /* if the compare-and-exchange failed: */
4308 if (__tme_predict_false(value_read_verify != value_read)) {
4309
4310 /* loop with the new value read from the memory: */
4311 value_read = value_read_verify;
4312 continue;
4313 }
4314
4315 /* stop now: */
4316 break;
4317 }
4318 }
4319
4320 /* return the value read: */
4321 return (value_read);
4322 }
4323
4324 /* undefine any macro version of tme_memory_atomic_or16: */
4325 #undef tme_memory_atomic_or16
4326
4327 /* the 16-bit atomic or function: */
4328 tme_uint16_t
4329 tme_memory_atomic_or16(tme_shared tme_uint16_t *memory,
4330 tme_uint16_t operand,
4331 tme_rwlock_t *rwlock,
4332 unsigned int align_min)
4333 {
4334 tme_uint16_t value_read;
4335 tme_uint16_t value_written;
4336 tme_uint16_t value_read_verify;
4337
4338 /* if we can't make direct accesses at all, all atomic
4339 accesses must be done under lock. (when threads are
4340 cooperative the actual locking isn't needed): */
4341 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
4342 if (!TME_THREADS_COOPERATIVE) {
4343 tme_rwlock_wrlock(rwlock);
4344 }
4345 value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
4346 value_written = value_read | operand;
4347 tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
4348 if (!TME_THREADS_COOPERATIVE) {
4349 tme_rwlock_unlock(rwlock);
4350 }
4351 }
4352
4353 /* otherwise, threads are not cooperative and this host CPU
4354 can make atomic accesses to at least the most common memory
4355 size.
4356
4357 in that case, the only reason this function should get
4358 called is if the host CPU can't do an atomic 16-bit
4359 or at all, or if it can't do it at this alignment.
4360
4361 we emulate the atomic 16-bit or with a compare-and-exchange: */
4362 else {
4363
4364 /* do an atomic read of the memory: */
4365 value_read = tme_memory_atomic_read16(memory, rwlock, align_min);
4366
4367 /* spin the or in a compare-and-exchange loop: */
4368 for (;;) {
4369
4370 /* make the value to write: */
4371 value_written = value_read | operand;
4372
4373 /* try the compare-and-exchange: */
4374 value_read_verify = tme_memory_atomic_cx16(memory, value_read, value_written, rwlock, align_min);
4375
4376 /* if the compare-and-exchange failed: */
4377 if (__tme_predict_false(value_read_verify != value_read)) {
4378
4379 /* loop with the new value read from the memory: */
4380 value_read = value_read_verify;
4381 continue;
4382 }
4383
4384 /* stop now: */
4385 break;
4386 }
4387 }
4388
4389 /* return the value read: */
4390 return (value_read);
4391 }
4392
4393 /* undefine any macro version of tme_memory_atomic_xor16: */
4394 #undef tme_memory_atomic_xor16
4395
4396 /* the 16-bit atomic xor function: */
4397 tme_uint16_t
4398 tme_memory_atomic_xor16(tme_shared tme_uint16_t *memory,
4399 tme_uint16_t operand,
4400 tme_rwlock_t *rwlock,
4401 unsigned int align_min)
4402 {
4403 tme_uint16_t value_read;
4404 tme_uint16_t value_written;
4405 tme_uint16_t value_read_verify;
4406
4407 /* if we can't make direct accesses at all, all atomic
4408 accesses must be done under lock. (when threads are
4409 cooperative the actual locking isn't needed): */
4410 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
4411 if (!TME_THREADS_COOPERATIVE) {
4412 tme_rwlock_wrlock(rwlock);
4413 }
4414 value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
4415 value_written = value_read ^ operand;
4416 tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
4417 if (!TME_THREADS_COOPERATIVE) {
4418 tme_rwlock_unlock(rwlock);
4419 }
4420 }
4421
4422 /* otherwise, threads are not cooperative and this host CPU
4423 can make atomic accesses to at least the most common memory
4424 size.
4425
4426 in that case, the only reason this function should get
4427 called is if the host CPU can't do an atomic 16-bit
4428 xor at all, or if it can't do it at this alignment.
4429
4430 we emulate the atomic 16-bit xor with a compare-and-exchange: */
4431 else {
4432
4433 /* do an atomic read of the memory: */
4434 value_read = tme_memory_atomic_read16(memory, rwlock, align_min);
4435
4436 /* spin the xor in a compare-and-exchange loop: */
4437 for (;;) {
4438
4439 /* make the value to write: */
4440 value_written = value_read ^ operand;
4441
4442 /* try the compare-and-exchange: */
4443 value_read_verify = tme_memory_atomic_cx16(memory, value_read, value_written, rwlock, align_min);
4444
4445 /* if the compare-and-exchange failed: */
4446 if (__tme_predict_false(value_read_verify != value_read)) {
4447
4448 /* loop with the new value read from the memory: */
4449 value_read = value_read_verify;
4450 continue;
4451 }
4452
4453 /* stop now: */
4454 break;
4455 }
4456 }
4457
4458 /* return the value read: */
4459 return (value_read);
4460 }
4461
4462 /* undefine any macro version of tme_memory_atomic_not16: */
4463 #undef tme_memory_atomic_not16
4464
4465 /* the 16-bit atomic not function: */
4466 tme_uint16_t
4467 tme_memory_atomic_not16(tme_shared tme_uint16_t *memory,
4468 tme_rwlock_t *rwlock,
4469 unsigned int align_min)
4470 {
4471 tme_uint16_t value_read;
4472 tme_uint16_t value_written;
4473 tme_uint16_t value_read_verify;
4474
4475 /* if we can't make direct accesses at all, all atomic
4476 accesses must be done under lock. (when threads are
4477 cooperative the actual locking isn't needed): */
4478 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
4479 if (!TME_THREADS_COOPERATIVE) {
4480 tme_rwlock_wrlock(rwlock);
4481 }
4482 value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
4483 value_written = ~value_read;
4484 tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
4485 if (!TME_THREADS_COOPERATIVE) {
4486 tme_rwlock_unlock(rwlock);
4487 }
4488 }
4489
4490 /* otherwise, threads are not cooperative and this host CPU
4491 can make atomic accesses to at least the most common memory
4492 size.
4493
4494 in that case, the only reason this function should get
4495 called is if the host CPU can't do an atomic 16-bit
4496 not at all, or if it can't do it at this alignment.
4497
4498 we emulate the atomic 16-bit not with a compare-and-exchange: */
4499 else {
4500
4501 /* do an atomic read of the memory: */
4502 value_read = tme_memory_atomic_read16(memory, rwlock, align_min);
4503
4504 /* spin the not in a compare-and-exchange loop: */
4505 for (;;) {
4506
4507 /* make the value to write: */
4508 value_written = ~value_read;
4509
4510 /* try the compare-and-exchange: */
4511 value_read_verify = tme_memory_atomic_cx16(memory, value_read, value_written, rwlock, align_min);
4512
4513 /* if the compare-and-exchange failed: */
4514 if (__tme_predict_false(value_read_verify != value_read)) {
4515
4516 /* loop with the new value read from the memory: */
4517 value_read = value_read_verify;
4518 continue;
4519 }
4520
4521 /* stop now: */
4522 break;
4523 }
4524 }
4525
4526 /* return the value read: */
4527 return (value_read);
4528 }
4529
4530 /* undefine any macro version of tme_memory_atomic_neg16: */
4531 #undef tme_memory_atomic_neg16
4532
4533 /* the 16-bit atomic neg function: */
4534 tme_uint16_t
4535 tme_memory_atomic_neg16(tme_shared tme_uint16_t *memory,
4536 tme_rwlock_t *rwlock,
4537 unsigned int align_min)
4538 {
4539 tme_uint16_t value_read;
4540 tme_uint16_t value_written;
4541 tme_uint16_t value_read_verify;
4542
4543 /* if we can't make direct accesses at all, all atomic
4544 accesses must be done under lock. (when threads are
4545 cooperative the actual locking isn't needed): */
4546 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
4547 if (!TME_THREADS_COOPERATIVE) {
4548 tme_rwlock_wrlock(rwlock);
4549 }
4550 value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
4551 value_written = 0 - value_read;
4552 tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
4553 if (!TME_THREADS_COOPERATIVE) {
4554 tme_rwlock_unlock(rwlock);
4555 }
4556 }
4557
4558 /* otherwise, threads are not cooperative and this host CPU
4559 can make atomic accesses to at least the most common memory
4560 size.
4561
4562 in that case, the only reason this function should get
4563 called is if the host CPU can't do an atomic 16-bit
4564 neg at all, or if it can't do it at this alignment.
4565
4566 we emulate the atomic 16-bit neg with a compare-and-exchange: */
4567 else {
4568
4569 /* do an atomic read of the memory: */
4570 value_read = tme_memory_atomic_read16(memory, rwlock, align_min);
4571
4572 /* spin the neg in a compare-and-exchange loop: */
4573 for (;;) {
4574
4575 /* make the value to write: */
4576 value_written = 0 - value_read;
4577
4578 /* try the compare-and-exchange: */
4579 value_read_verify = tme_memory_atomic_cx16(memory, value_read, value_written, rwlock, align_min);
4580
4581 /* if the compare-and-exchange failed: */
4582 if (__tme_predict_false(value_read_verify != value_read)) {
4583
4584 /* loop with the new value read from the memory: */
4585 value_read = value_read_verify;
4586 continue;
4587 }
4588
4589 /* stop now: */
4590 break;
4591 }
4592 }
4593
4594 /* return the value read: */
4595 return (value_read);
4596 }
4597
4598 /* undefine any macro version of tme_memory_atomic_xchg16: */
4599 #undef tme_memory_atomic_xchg16
4600
4601 /* the 16-bit atomic xchg function: */
4602 tme_uint16_t
4603 tme_memory_atomic_xchg16(tme_shared tme_uint16_t *memory,
4604 tme_uint16_t value_written,
4605 tme_rwlock_t *rwlock,
4606 unsigned int align_min)
4607 {
4608 tme_uint16_t value_read;
4609 tme_uint16_t value_read_verify;
4610
4611 /* if we can't make direct accesses at all, all atomic
4612 accesses must be done under lock. (when threads are
4613 cooperative the actual locking isn't needed): */
4614 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
4615 if (!TME_THREADS_COOPERATIVE) {
4616 tme_rwlock_wrlock(rwlock);
4617 }
4618 value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
4619 tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
4620 if (!TME_THREADS_COOPERATIVE) {
4621 tme_rwlock_unlock(rwlock);
4622 }
4623 }
4624
4625 /* otherwise, threads are not cooperative and this host CPU
4626 can make atomic accesses to at least the most common memory
4627 size.
4628
4629 in that case, the only reason this function should get
4630 called is if the host CPU can't do an atomic 16-bit
4631 xchg at all, or if it can't do it at this alignment.
4632
4633 we emulate the atomic 16-bit xchg with a compare-and-exchange: */
4634 else {
4635
4636 /* do an atomic read of the memory: */
4637 value_read = tme_memory_atomic_read16(memory, rwlock, align_min);
4638
4639 /* spin the xchg in a compare-and-exchange loop: */
4640 for (;;) {
4641
4642 /* try the compare-and-exchange: */
4643 value_read_verify = tme_memory_atomic_cx16(memory, value_read, value_written, rwlock, align_min);
4644
4645 /* if the compare-and-exchange failed: */
4646 if (__tme_predict_false(value_read_verify != value_read)) {
4647
4648 /* loop with the new value read from the memory: */
4649 value_read = value_read_verify;
4650 continue;
4651 }
4652
4653 /* stop now: */
4654 break;
4655 }
4656 }
4657
4658 /* return the value read: */
4659 return (value_read);
4660 }
4661
4662 /* undefine any macro version of tme_memory_atomic_cx16: */
4663 #undef tme_memory_atomic_cx16
4664
4665 /* the 16-bit atomic cx function: */
4666 tme_uint16_t
4667 tme_memory_atomic_cx16(tme_shared tme_uint16_t *memory,
4668 tme_uint16_t value_cmp,
4669 tme_uint16_t value_written,
4670 tme_rwlock_t *rwlock,
4671 unsigned int align_min)
4672 {
4673 tme_uint16_t value_read;
4674
4675 /* if we can't make direct accesses at all, all atomic
4676 accesses must be done under lock. (when threads are
4677 cooperative the actual locking isn't needed): */
4678 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
4679 if (!TME_THREADS_COOPERATIVE) {
4680 tme_rwlock_wrlock(rwlock);
4681 }
4682 value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
4683 if (value_read == value_cmp) {
4684 tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
4685 }
4686 if (!TME_THREADS_COOPERATIVE) {
4687 tme_rwlock_unlock(rwlock);
4688 }
4689 }
4690
4691 /* otherwise, threads are not cooperative and this host CPU
4692 can make atomic accesses to at least the most common memory
4693 size.
4694
4695 in that case, the only reason this function should get
4696 called is if the host CPU can't do an atomic 16-bit
4697 cx at all, or if it can't do it at this alignment.
4698
4699 we assume that these problematic atomic cxs are rare,
4700 and to emulate them we simply stop all other threads while
4701 doing the cx: */
4702 else {
4703 tme_thread_suspend_others();
4704 value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
4705 if (value_read == value_cmp) {
4706 tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
4707 }
4708 tme_thread_resume_others();
4709 }
4710
4711 /* return the value read: */
4712 return (value_read);
4713 }
4714
4715 /* undefine any macro version of tme_memory_atomic_read16: */
4716 #undef tme_memory_atomic_read16
4717
4718 /* the 16-bit atomic read function: */
4719 tme_uint16_t
4720 tme_memory_atomic_read16(_tme_const tme_shared tme_uint16_t *memory,
4721 tme_rwlock_t *rwlock,
4722 unsigned int align_min)
4723 {
4724 tme_uint16_t value_read;
4725
4726 /* if we can't make direct accesses at all, all atomic
4727 accesses must be done under lock. (when threads are
4728 cooperative the actual locking isn't needed): */
4729 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
4730 if (!TME_THREADS_COOPERATIVE) {
4731 tme_rwlock_rdlock(rwlock);
4732 }
4733 value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
4734 if (!TME_THREADS_COOPERATIVE) {
4735 tme_rwlock_unlock(rwlock);
4736 }
4737 }
4738
4739 /* otherwise, threads are not cooperative and this host CPU
4740 can make atomic accesses to at least the most common memory
4741 size.
4742
4743 in that case, the only reason this function should get
4744 called is if the host CPU can't do an atomic 16-bit
4745 read at all, or if it can't do it at this alignment.
4746
4747 we assume that these problematic atomic reads are rare,
4748 and to emulate them we simply stop all other threads while
4749 doing the read: */
4750 else {
4751 tme_thread_suspend_others();
4752 value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
4753 tme_thread_resume_others();
4754 }
4755
4756 /* return the value read: */
4757 return (value_read);
4758 }
4759
4760 /* undefine any macro version of tme_memory_atomic_write16: */
4761 #undef tme_memory_atomic_write16
4762
4763 /* the 16-bit atomic write function: */
4764 void
4765 tme_memory_atomic_write16(tme_shared tme_uint16_t *memory,
4766 tme_uint16_t value_written,
4767 tme_rwlock_t *rwlock,
4768 unsigned int align_min)
4769 {
4770
4771 /* if we can't make direct accesses at all, all atomic
4772 accesses must be done under lock. (when threads are
4773 cooperative the actual locking isn't needed): */
4774 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
4775 if (!TME_THREADS_COOPERATIVE) {
4776 tme_rwlock_wrlock(rwlock);
4777 }
4778 tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
4779 if (!TME_THREADS_COOPERATIVE) {
4780 tme_rwlock_unlock(rwlock);
4781 }
4782 }
4783
4784 /* otherwise, threads are not cooperative and this host CPU
4785 can make atomic accesses to at least the most common memory
4786 size.
4787
4788 in that case, the only reason this function should get
4789 called is if the host CPU can't do an atomic 16-bit
4790 write at all, or if it can't do it at this alignment.
4791
4792 we assume that these problematic atomic writes are rare,
4793 and to emulate them we simply stop all other threads while
4794 doing the write: */
4795 else {
4796 tme_thread_suspend_others();
4797 tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
4798 tme_thread_resume_others();
4799 }
4800 }
4801
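/* note: every atomic operation below follows the same shape.  the
   macro version (undefined just before each function) is expected to
   cover hosts that can perform the operation directly; these
   out-of-line functions handle the two fallback cases visible in
   their bodies: when no direct accesses are possible at all, the
   operation is done under the rwlock (with no actual locking when
   threads are cooperative), and otherwise it is emulated, either by
   suspending all other threads or by spinning on a
   compare-and-exchange: */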
4802 /* the 32-bit atomic operations: */
4803
4804 /* undefine any macro version of tme_memory_atomic_add32: */
4805 #undef tme_memory_atomic_add32
4806
4807 /* the 32-bit atomic add function: */
4808 tme_uint32_t
4809 tme_memory_atomic_add32(tme_shared tme_uint32_t *memory,
4810 tme_uint32_t operand,
4811 tme_rwlock_t *rwlock,
4812 unsigned int align_min)
4813 {
4814 tme_uint32_t value_read;
4815 tme_uint32_t value_written;
4816 tme_uint32_t value_read_verify;
4817
4818 /* if we can't make direct accesses at all, all atomic
4819 accesses must be done under lock. (when threads are
4820 cooperative the actual locking isn't needed): */
4821 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
4822 if (!TME_THREADS_COOPERATIVE) {
4823 tme_rwlock_wrlock(rwlock);
4824 }
4825 value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
4826 value_written = value_read + operand;
4827 tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
4828 if (!TME_THREADS_COOPERATIVE) {
4829 tme_rwlock_unlock(rwlock);
4830 }
4831 }
4832
4833 /* otherwise, threads are not cooperative and this host CPU
4834 can make atomic accesses to at least the most common memory
4835 size.
4836
4837 in that case, the only reason this function should get
4838 called is if the host CPU can't do an atomic 32-bit
4839 add at all, or if it can't do it at this alignment.
4840
4841 we emulate the atomic 32-bit add with a compare-and-exchange: */
4842 else {
4843
4844 /* do an atomic read of the memory: */
4845 value_read = tme_memory_atomic_read32(memory, rwlock, align_min);
4846
4847 /* spin the add in a compare-and-exchange loop: */
4848 for (;;) {
4849
4850 /* make the value to write: */
4851 value_written = value_read + operand;
4852
4853 /* try the compare-and-exchange: */
4854 value_read_verify = tme_memory_atomic_cx32(memory, value_read, value_written, rwlock, align_min);
4855
4856 /* if the compare-and-exchange failed: */
4857 if (__tme_predict_false(value_read_verify != value_read)) {
4858
4859 /* loop with the new value read from the memory: */
4860 value_read = value_read_verify;
4861 continue;
4862 }
4863
4864 /* stop now: */
4865 break;
4866 }
4867 }
4868
4869 /* return the value read: */
4870 return (value_read);
4871 }
4872
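/* the following is an illustrative sketch only, not generated code;
   the event counter and rwlock it uses are hypothetical, and the
   rwlock is assumed to have been initialized elsewhere.  it shows how
   a caller might bump a shared counter with tme_memory_atomic_add32,
   passing the rwlock that guards the memory and the natural alignment
   of the counter as the minimum guaranteed alignment: */
#if 0
static tme_shared tme_uint32_t _example_event_count;
static tme_rwlock_t _example_rwlock;

static void
_example_count_event(void)
{

  /* atomically add one to the shared counter.  the value returned is
     the counter value before the add, which this example ignores: */
  (void) tme_memory_atomic_add32(&_example_event_count,
                                 1,
                                 &_example_rwlock,
                                 sizeof(tme_uint32_t));
}
#endif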
4873 /* undefine any macro version of tme_memory_atomic_sub32: */
4874 #undef tme_memory_atomic_sub32
4875
4876 /* the 32-bit atomic sub function: */
4877 tme_uint32_t
4878 tme_memory_atomic_sub32(tme_shared tme_uint32_t *memory,
4879 tme_uint32_t operand,
4880 tme_rwlock_t *rwlock,
4881 unsigned int align_min)
4882 {
4883 tme_uint32_t value_read;
4884 tme_uint32_t value_written;
4885 tme_uint32_t value_read_verify;
4886
4887 /* if we can't make direct accesses at all, all atomic
4888 accesses must be done under lock. (when threads are
4889 cooperative the actual locking isn't needed): */
4890 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
4891 if (!TME_THREADS_COOPERATIVE) {
4892 tme_rwlock_wrlock(rwlock);
4893 }
4894 value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
4895 value_written = value_read - operand;
4896 tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
4897 if (!TME_THREADS_COOPERATIVE) {
4898 tme_rwlock_unlock(rwlock);
4899 }
4900 }
4901
4902 /* otherwise, threads are not cooperative and this host CPU
4903 can make atomic accesses to at least the most common memory
4904 size.
4905
4906 in that case, the only reason this function should get
4907 called is if the host CPU can't do an atomic 32-bit
4908 sub at all, or if it can't do it at this alignment.
4909
4910 we emulate the atomic 32-bit sub with a compare-and-exchange: */
4911 else {
4912
4913 /* do an atomic read of the memory: */
4914 value_read = tme_memory_atomic_read32(memory, rwlock, align_min);
4915
4916 /* spin the sub in a compare-and-exchange loop: */
4917 for (;;) {
4918
4919 /* make the value to write: */
4920 value_written = value_read - operand;
4921
4922 /* try the compare-and-exchange: */
4923 value_read_verify = tme_memory_atomic_cx32(memory, value_read, value_written, rwlock, align_min);
4924
4925 /* if the compare-and-exchange failed: */
4926 if (__tme_predict_false(value_read_verify != value_read)) {
4927
4928 /* loop with the new value read from the memory: */
4929 value_read = value_read_verify;
4930 continue;
4931 }
4932
4933 /* stop now: */
4934 break;
4935 }
4936 }
4937
4938 /* return the value read: */
4939 return (value_read);
4940 }
4941
4942 /* undefine any macro version of tme_memory_atomic_mul32: */
4943 #undef tme_memory_atomic_mul32
4944
4945 /* the 32-bit atomic mul function: */
4946 tme_uint32_t
4947 tme_memory_atomic_mul32(tme_shared tme_uint32_t *memory,
4948 tme_uint32_t operand,
4949 tme_rwlock_t *rwlock,
4950 unsigned int align_min)
4951 {
4952 tme_uint32_t value_read;
4953 tme_uint32_t value_written;
4954 tme_uint32_t value_read_verify;
4955
4956 /* if we can't make direct accesses at all, all atomic
4957 accesses must be done under lock. (when threads are
4958 cooperative the actual locking isn't needed): */
4959 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
4960 if (!TME_THREADS_COOPERATIVE) {
4961 tme_rwlock_wrlock(rwlock);
4962 }
4963 value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
4964 value_written = value_read * operand;
4965 tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
4966 if (!TME_THREADS_COOPERATIVE) {
4967 tme_rwlock_unlock(rwlock);
4968 }
4969 }
4970
4971 /* otherwise, threads are not cooperative and this host CPU
4972 can make atomic accesses to at least the most common memory
4973 size.
4974
4975 in that case, the only reason this function should get
4976 called is if the host CPU can't do an atomic 32-bit
4977 mul at all, or if it can't do it at this alignment.
4978
4979 we emulate the atomic 32-bit mul with a compare-and-exchange: */
4980 else {
4981
4982 /* do an atomic read of the memory: */
4983 value_read = tme_memory_atomic_read32(memory, rwlock, align_min);
4984
4985 /* spin the mul in a compare-and-exchange loop: */
4986 for (;;) {
4987
4988 /* make the value to write: */
4989 value_written = value_read * operand;
4990
4991 /* try the compare-and-exchange: */
4992 value_read_verify = tme_memory_atomic_cx32(memory, value_read, value_written, rwlock, align_min);
4993
4994 /* if the compare-and-exchange failed: */
4995 if (__tme_predict_false(value_read_verify != value_read)) {
4996
4997 /* loop with the new value read from the memory: */
4998 value_read = value_read_verify;
4999 continue;
5000 }
5001
5002 /* stop now: */
5003 break;
5004 }
5005 }
5006
5007 /* return the value read: */
5008 return (value_read);
5009 }
5010
5011 /* undefine any macro version of tme_memory_atomic_div32: */
5012 #undef tme_memory_atomic_div32
5013
5014 /* the 32-bit atomic div function: */
5015 tme_uint32_t
5016 tme_memory_atomic_div32(tme_shared tme_uint32_t *memory,
5017 tme_uint32_t operand,
5018 tme_rwlock_t *rwlock,
5019 unsigned int align_min)
5020 {
5021 tme_uint32_t value_read;
5022 tme_uint32_t value_written;
5023 tme_uint32_t value_read_verify;
5024
5025 /* if we can't make direct accesses at all, all atomic
5026 accesses must be done under lock. (when threads are
5027 cooperative the actual locking isn't needed): */
5028 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
5029 if (!TME_THREADS_COOPERATIVE) {
5030 tme_rwlock_wrlock(rwlock);
5031 }
5032 value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
5033 value_written = value_read / operand;
5034 tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
5035 if (!TME_THREADS_COOPERATIVE) {
5036 tme_rwlock_unlock(rwlock);
5037 }
5038 }
5039
5040 /* otherwise, threads are not cooperative and this host CPU
5041 can make atomic accesses to at least the most common memory
5042 size.
5043
5044 in that case, the only reason this function should get
5045 called is if the host CPU can't do an atomic 32-bit
5046 div at all, or if it can't do it at this alignment.
5047
5048 we emulate the atomic 32-bit div with a compare-and-exchange: */
5049 else {
5050
5051 /* do an atomic read of the memory: */
5052 value_read = tme_memory_atomic_read32(memory, rwlock, align_min);
5053
5054 /* spin the div in a compare-and-exchange loop: */
5055 for (;;) {
5056
5057 /* make the value to write: */
5058 value_written = value_read / operand;
5059
5060 /* try the compare-and-exchange: */
5061 value_read_verify = tme_memory_atomic_cx32(memory, value_read, value_written, rwlock, align_min);
5062
5063 /* if the compare-and-exchange failed: */
5064 if (__tme_predict_false(value_read_verify != value_read)) {
5065
5066 /* loop with the new value read from the memory: */
5067 value_read = value_read_verify;
5068 continue;
5069 }
5070
5071 /* stop now: */
5072 break;
5073 }
5074 }
5075
5076 /* return the value read: */
5077 return (value_read);
5078 }
5079
5080 /* undefine any macro version of tme_memory_atomic_and32: */
5081 #undef tme_memory_atomic_and32
5082
5083 /* the 32-bit atomic and function: */
5084 tme_uint32_t
5085 tme_memory_atomic_and32(tme_shared tme_uint32_t *memory,
5086 tme_uint32_t operand,
5087 tme_rwlock_t *rwlock,
5088 unsigned int align_min)
5089 {
5090 tme_uint32_t value_read;
5091 tme_uint32_t value_written;
5092 tme_uint32_t value_read_verify;
5093
5094 /* if we can't make direct accesses at all, all atomic
5095 accesses must be done under lock. (when threads are
5096 cooperative the actual locking isn't needed): */
5097 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
5098 if (!TME_THREADS_COOPERATIVE) {
5099 tme_rwlock_wrlock(rwlock);
5100 }
5101 value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
5102 value_written = value_read & operand;
5103 tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
5104 if (!TME_THREADS_COOPERATIVE) {
5105 tme_rwlock_unlock(rwlock);
5106 }
5107 }
5108
5109 /* otherwise, threads are not cooperative and this host CPU
5110 can make atomic accesses to at least the most common memory
5111 size.
5112
5113 in that case, the only reason this function should get
5114 called is if the host CPU can't do an atomic 32-bit
5115 and at all, or if it can't do it at this alignment.
5116
5117 we emulate the atomic 32-bit and with a compare-and-exchange: */
5118 else {
5119
5120 /* do an atomic read of the memory: */
5121 value_read = tme_memory_atomic_read32(memory, rwlock, align_min);
5122
5123 /* spin the and in a compare-and-exchange loop: */
5124 for (;;) {
5125
5126 /* make the value to write: */
5127 value_written = value_read & operand;
5128
5129 /* try the compare-and-exchange: */
5130 value_read_verify = tme_memory_atomic_cx32(memory, value_read, value_written, rwlock, align_min);
5131
5132 /* if the compare-and-exchange failed: */
5133 if (__tme_predict_false(value_read_verify != value_read)) {
5134
5135 /* loop with the new value read from the memory: */
5136 value_read = value_read_verify;
5137 continue;
5138 }
5139
5140 /* stop now: */
5141 break;
5142 }
5143 }
5144
5145 /* return the value read: */
5146 return (value_read);
5147 }
5148
5149 /* undefine any macro version of tme_memory_atomic_or32: */
5150 #undef tme_memory_atomic_or32
5151
5152 /* the 32-bit atomic or function: */
5153 tme_uint32_t
5154 tme_memory_atomic_or32(tme_shared tme_uint32_t *memory,
5155 tme_uint32_t operand,
5156 tme_rwlock_t *rwlock,
5157 unsigned int align_min)
5158 {
5159 tme_uint32_t value_read;
5160 tme_uint32_t value_written;
5161 tme_uint32_t value_read_verify;
5162
5163 /* if we can't make direct accesses at all, all atomic
5164 accesses must be done under lock. (when threads are
5165 cooperative the actual locking isn't needed): */
5166 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
5167 if (!TME_THREADS_COOPERATIVE) {
5168 tme_rwlock_wrlock(rwlock);
5169 }
5170 value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
5171 value_written = value_read | operand;
5172 tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
5173 if (!TME_THREADS_COOPERATIVE) {
5174 tme_rwlock_unlock(rwlock);
5175 }
5176 }
5177
5178 /* otherwise, threads are not cooperative and this host CPU
5179 can make atomic accesses to at least the most common memory
5180 size.
5181
5182 in that case, the only reason this function should get
5183 called is if the host CPU can't do an atomic 32-bit
5184 or at all, or if it can't do it at this alignment.
5185
5186 we emulate the atomic 32-bit or with a compare-and-exchange: */
5187 else {
5188
5189 /* do an atomic read of the memory: */
5190 value_read = tme_memory_atomic_read32(memory, rwlock, align_min);
5191
5192 /* spin the or in a compare-and-exchange loop: */
5193 for (;;) {
5194
5195 /* make the value to write: */
5196 value_written = value_read | operand;
5197
5198 /* try the compare-and-exchange: */
5199 value_read_verify = tme_memory_atomic_cx32(memory, value_read, value_written, rwlock, align_min);
5200
5201 /* if the compare-and-exchange failed: */
5202 if (__tme_predict_false(value_read_verify != value_read)) {
5203
5204 /* loop with the new value read from the memory: */
5205 value_read = value_read_verify;
5206 continue;
5207 }
5208
5209 /* stop now: */
5210 break;
5211 }
5212 }
5213
5214 /* return the value read: */
5215 return (value_read);
5216 }
5217
5218 /* undefine any macro version of tme_memory_atomic_xor32: */
5219 #undef tme_memory_atomic_xor32
5220
5221 /* the 32-bit atomic xor function: */
5222 tme_uint32_t
5223 tme_memory_atomic_xor32(tme_shared tme_uint32_t *memory,
5224 tme_uint32_t operand,
5225 tme_rwlock_t *rwlock,
5226 unsigned int align_min)
5227 {
5228 tme_uint32_t value_read;
5229 tme_uint32_t value_written;
5230 tme_uint32_t value_read_verify;
5231
5232 /* if we can't make direct accesses at all, all atomic
5233 accesses must be done under lock. (when threads are
5234 cooperative the actual locking isn't needed): */
5235 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
5236 if (!TME_THREADS_COOPERATIVE) {
5237 tme_rwlock_wrlock(rwlock);
5238 }
5239 value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
5240 value_written = value_read ^ operand;
5241 tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
5242 if (!TME_THREADS_COOPERATIVE) {
5243 tme_rwlock_unlock(rwlock);
5244 }
5245 }
5246
5247 /* otherwise, threads are not cooperative and this host CPU
5248 can make atomic accesses to at least the most common memory
5249 size.
5250
5251 in that case, the only reason this function should get
5252 called is if the host CPU can't do an atomic 32-bit
5253 xor at all, or if it can't do it at this alignment.
5254
5255 we emulate the atomic 32-bit xor with a compare-and-exchange: */
5256 else {
5257
5258 /* do an atomic read of the memory: */
5259 value_read = tme_memory_atomic_read32(memory, rwlock, align_min);
5260
5261 /* spin the xor in a compare-and-exchange loop: */
5262 for (;;) {
5263
5264 /* make the value to write: */
5265 value_written = value_read ^ operand;
5266
5267 /* try the compare-and-exchange: */
5268 value_read_verify = tme_memory_atomic_cx32(memory, value_read, value_written, rwlock, align_min);
5269
5270 /* if the compare-and-exchange failed: */
5271 if (__tme_predict_false(value_read_verify != value_read)) {
5272
5273 /* loop with the new value read from the memory: */
5274 value_read = value_read_verify;
5275 continue;
5276 }
5277
5278 /* stop now: */
5279 break;
5280 }
5281 }
5282
5283 /* return the value read: */
5284 return (value_read);
5285 }
5286
5287 /* undefine any macro version of tme_memory_atomic_not32: */
5288 #undef tme_memory_atomic_not32
5289
5290 /* the 32-bit atomic not function: */
5291 tme_uint32_t
5292 tme_memory_atomic_not32(tme_shared tme_uint32_t *memory,
5293 tme_rwlock_t *rwlock,
5294 unsigned int align_min)
5295 {
5296 tme_uint32_t value_read;
5297 tme_uint32_t value_written;
5298 tme_uint32_t value_read_verify;
5299
5300 /* if we can't make direct accesses at all, all atomic
5301 accesses must be done under lock. (when threads are
5302 cooperative the actual locking isn't needed): */
5303 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
5304 if (!TME_THREADS_COOPERATIVE) {
5305 tme_rwlock_wrlock(rwlock);
5306 }
5307 value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
5308 value_written = ~value_read;
5309 tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
5310 if (!TME_THREADS_COOPERATIVE) {
5311 tme_rwlock_unlock(rwlock);
5312 }
5313 }
5314
5315 /* otherwise, threads are not cooperative and this host CPU
5316 can make atomic accesses to at least the most common memory
5317 size.
5318
5319 in that case, the only reason this function should get
5320 called is if the host CPU can't do an atomic 32-bit
5321 not at all, or if it can't do it at this alignment.
5322
5323 we emulate the atomic 32-bit not with a compare-and-exchange: */
5324 else {
5325
5326 /* do an atomic read of the memory: */
5327 value_read = tme_memory_atomic_read32(memory, rwlock, align_min);
5328
5329 /* spin the not in a compare-and-exchange loop: */
5330 for (;;) {
5331
5332 /* make the value to write: */
5333 value_written = ~value_read;
5334
5335 /* try the compare-and-exchange: */
5336 value_read_verify = tme_memory_atomic_cx32(memory, value_read, value_written, rwlock, align_min);
5337
5338 /* if the compare-and-exchange failed: */
5339 if (__tme_predict_false(value_read_verify != value_read)) {
5340
5341 /* loop with the new value read from the memory: */
5342 value_read = value_read_verify;
5343 continue;
5344 }
5345
5346 /* stop now: */
5347 break;
5348 }
5349 }
5350
5351 /* return the value read: */
5352 return (value_read);
5353 }
5354
5355 /* undefine any macro version of tme_memory_atomic_neg32: */
5356 #undef tme_memory_atomic_neg32
5357
5358 /* the 32-bit atomic neg function: */
5359 tme_uint32_t
5360 tme_memory_atomic_neg32(tme_shared tme_uint32_t *memory,
5361 tme_rwlock_t *rwlock,
5362 unsigned int align_min)
5363 {
5364 tme_uint32_t value_read;
5365 tme_uint32_t value_written;
5366 tme_uint32_t value_read_verify;
5367
5368 /* if we can't make direct accesses at all, all atomic
5369 accesses must be done under lock. (when threads are
5370 cooperative the actual locking isn't needed): */
5371 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
5372 if (!TME_THREADS_COOPERATIVE) {
5373 tme_rwlock_wrlock(rwlock);
5374 }
5375 value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
5376 value_written = 0 - value_read;
5377 tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
5378 if (!TME_THREADS_COOPERATIVE) {
5379 tme_rwlock_unlock(rwlock);
5380 }
5381 }
5382
5383 /* otherwise, threads are not cooperative and this host CPU
5384 can make atomic accesses to at least the most common memory
5385 size.
5386
5387 in that case, the only reason this function should get
5388 called is if the host CPU can't do an atomic 32-bit
5389 neg at all, or if it can't do it at this alignment.
5390
5391 we emulate the atomic 32-bit neg with a compare-and-exchange: */
5392 else {
5393
5394 /* do an atomic read of the memory: */
5395 value_read = tme_memory_atomic_read32(memory, rwlock, align_min);
5396
5397 /* spin the neg in a compare-and-exchange loop: */
5398 for (;;) {
5399
5400 /* make the value to write: */
5401 value_written = 0 - value_read;
5402
5403 /* try the compare-and-exchange: */
5404 value_read_verify = tme_memory_atomic_cx32(memory, value_read, value_written, rwlock, align_min);
5405
5406 /* if the compare-and-exchange failed: */
5407 if (__tme_predict_false(value_read_verify != value_read)) {
5408
5409 /* loop with the new value read from the memory: */
5410 value_read = value_read_verify;
5411 continue;
5412 }
5413
5414 /* stop now: */
5415 break;
5416 }
5417 }
5418
5419 /* return the value read: */
5420 return (value_read);
5421 }
5422
5423 /* undefine any macro version of tme_memory_atomic_xchg32: */
5424 #undef tme_memory_atomic_xchg32
5425
5426 /* the 32-bit atomic xchg function: */
5427 tme_uint32_t
5428 tme_memory_atomic_xchg32(tme_shared tme_uint32_t *memory,
5429 tme_uint32_t value_written,
5430 tme_rwlock_t *rwlock,
5431 unsigned int align_min)
5432 {
5433 tme_uint32_t value_read;
5434 tme_uint32_t value_read_verify;
5435
5436 /* if we can't make direct accesses at all, all atomic
5437 accesses must be done under lock. (when threads are
5438 cooperative the actual locking isn't needed): */
5439 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
5440 if (!TME_THREADS_COOPERATIVE) {
5441 tme_rwlock_wrlock(rwlock);
5442 }
5443 value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
5444 tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
5445 if (!TME_THREADS_COOPERATIVE) {
5446 tme_rwlock_unlock(rwlock);
5447 }
5448 }
5449
5450 /* otherwise, threads are not cooperative and this host CPU
5451 can make atomic accesses to at least the most common memory
5452 size.
5453
5454 in that case, the only reason this function should get
5455 called is if the host CPU can't do an atomic 32-bit
5456 xchg at all, or if it can't do it at this alignment.
5457
5458 we emulate the atomic 32-bit xchg with a compare-and-exchange: */
5459 else {
5460
5461 /* do an atomic read of the memory: */
5462 value_read = tme_memory_atomic_read32(memory, rwlock, align_min);
5463
5464 /* spin the xchg in a compare-and-exchange loop: */
5465 for (;;) {
5466
5467 /* try the compare-and-exchange: */
5468 value_read_verify = tme_memory_atomic_cx32(memory, value_read, value_written, rwlock, align_min);
5469
5470 /* if the compare-and-exchange failed: */
5471 if (__tme_predict_false(value_read_verify != value_read)) {
5472
5473 /* loop with the new value read from the memory: */
5474 value_read = value_read_verify;
5475 continue;
5476 }
5477
5478 /* stop now: */
5479 break;
5480 }
5481 }
5482
5483 /* return the value read: */
5484 return (value_read);
5485 }
5486
5487 /* undefine any macro version of tme_memory_atomic_cx32: */
5488 #undef tme_memory_atomic_cx32
5489
5490 /* the 32-bit atomic cx function: */
5491 tme_uint32_t
5492 tme_memory_atomic_cx32(tme_shared tme_uint32_t *memory,
5493 tme_uint32_t value_cmp,
5494 tme_uint32_t value_written,
5495 tme_rwlock_t *rwlock,
5496 unsigned int align_min)
5497 {
5498 tme_uint32_t value_read;
5499
5500 /* if we can't make direct accesses at all, all atomic
5501 accesses must be done under lock. (when threads are
5502 cooperative the actual locking isn't needed): */
5503 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
5504 if (!TME_THREADS_COOPERATIVE) {
5505 tme_rwlock_wrlock(rwlock);
5506 }
5507 value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
5508 if (value_read == value_cmp) {
5509 tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
5510 }
5511 if (!TME_THREADS_COOPERATIVE) {
5512 tme_rwlock_unlock(rwlock);
5513 }
5514 }
5515
5516 /* otherwise, threads are not cooperative and this host CPU
5517 can make atomic accesses to at least the most common memory
5518 size.
5519
5520 in that case, the only reason this function should get
5521 called is if the host CPU can't do an atomic 32-bit
5522 cx at all, or if it can't do it at this alignment.
5523
5524 we assume that these problematic atomic cxs are rare,
5525 and to emulate them we simply stop all other threads while
5526 doing the cx: */
5527 else {
5528 tme_thread_suspend_others();
5529 value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
5530 if (value_read == value_cmp) {
5531 tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
5532 }
5533 tme_thread_resume_others();
5534 }
5535
5536 /* return the value read: */
5537 return (value_read);
5538 }
5539
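/* the following is an illustrative sketch only, not generated code.
   tme_memory_atomic_cx32 can be used to build read-modify-write
   operations beyond the ones defined in this file; this hypothetical
   helper atomically raises a shared 32-bit value to at least the
   given operand, using the same compare-and-exchange retry loop that
   the emulated operations above use: */
#if 0
static tme_uint32_t
_example_atomic_max32(tme_shared tme_uint32_t *memory,
                      tme_uint32_t operand,
                      tme_rwlock_t *rwlock,
                      unsigned int align_min)
{
  tme_uint32_t value_read;
  tme_uint32_t value_read_verify;

  /* do an atomic read of the memory: */
  value_read = tme_memory_atomic_read32(memory, rwlock, align_min);

  /* spin the max in a compare-and-exchange loop: */
  for (;;) {

    /* if the value in memory is already at least the operand,
       there is nothing to write: */
    if (value_read >= operand) {
      break;
    }

    /* try the compare-and-exchange: */
    value_read_verify = tme_memory_atomic_cx32(memory, value_read, operand, rwlock, align_min);

    /* if the compare-and-exchange failed: */
    if (__tme_predict_false(value_read_verify != value_read)) {

      /* loop with the new value read from the memory: */
      value_read = value_read_verify;
      continue;
    }

    /* stop now: */
    break;
  }

  /* return the value read: */
  return (value_read);
}
#endif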
5540 /* undefine any macro version of tme_memory_atomic_read32: */
5541 #undef tme_memory_atomic_read32
5542
5543 /* the 32-bit atomic read function: */
5544 tme_uint32_t
5545 tme_memory_atomic_read32(_tme_const tme_shared tme_uint32_t *memory,
5546 tme_rwlock_t *rwlock,
5547 unsigned int align_min)
5548 {
5549 tme_uint32_t value_read;
5550
5551 /* if we can't make direct accesses at all, all atomic
5552 accesses must be done under lock. (when threads are
5553 cooperative the actual locking isn't needed): */
5554 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
5555 if (!TME_THREADS_COOPERATIVE) {
5556 tme_rwlock_rdlock(rwlock);
5557 }
5558 value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
5559 if (!TME_THREADS_COOPERATIVE) {
5560 tme_rwlock_unlock(rwlock);
5561 }
5562 }
5563
5564 /* otherwise, threads are not cooperative and this host CPU
5565 can make atomic accesses to at least the most common memory
5566 size.
5567
5568 in that case, the only reason this function should get
5569 called is if the host CPU can't do an atomic 32-bit
5570 read at all, or if it can't do it at this alignment.
5571
5572 we assume that these problematic atomic reads are rare,
5573 and to emulate them we simply stop all other threads while
5574 doing the read: */
5575 else {
5576 tme_thread_suspend_others();
5577 value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
5578 tme_thread_resume_others();
5579 }
5580
5581 /* return the value read: */
5582 return (value_read);
5583 }
5584
5585 /* undefine any macro version of tme_memory_atomic_write32: */
5586 #undef tme_memory_atomic_write32
5587
5588 /* the 32-bit atomic write function: */
5589 void
5590 tme_memory_atomic_write32(tme_shared tme_uint32_t *memory,
5591 tme_uint32_t value_written,
5592 tme_rwlock_t *rwlock,
5593 unsigned int align_min)
5594 {
5595
5596 /* if we can't make direct accesses at all, all atomic
5597 accesses must be done under lock. (when threads are
5598 cooperative the actual locking isn't needed): */
5599 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
5600 if (!TME_THREADS_COOPERATIVE) {
5601 tme_rwlock_wrlock(rwlock);
5602 }
5603 tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
5604 if (!TME_THREADS_COOPERATIVE) {
5605 tme_rwlock_unlock(rwlock);
5606 }
5607 }
5608
5609 /* otherwise, threads are not cooperative and this host CPU
5610 can make atomic accesses to at least the most common memory
5611 size.
5612
5613 in that case, the only reason this function should get
5614 called is if the host CPU can't do an atomic 32-bit
5615 write at all, or if it can't do it at this alignment.
5616
5617 we assume that these problematic atomic writes are rare,
5618 and to emulate them we simply stop all other threads while
5619 doing the write: */
5620 else {
5621 tme_thread_suspend_others();
5622 tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
5623 tme_thread_resume_others();
5624 }
5625 }
5626
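/* note: the 64-bit atomic operations that follow mirror the 32-bit
   operations above exactly, and are compiled only when the host
   provides a 64-bit integer type (TME_HAVE_INT64_T): */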
5627 #ifdef TME_HAVE_INT64_T
5628
5629 /* the 64-bit atomic operations: */
5630
5631 /* undefine any macro version of tme_memory_atomic_add64: */
5632 #undef tme_memory_atomic_add64
5633
5634 /* the 64-bit atomic add function: */
5635 tme_uint64_t
5636 tme_memory_atomic_add64(tme_shared tme_uint64_t *memory,
5637 tme_uint64_t operand,
5638 tme_rwlock_t *rwlock,
5639 unsigned int align_min)
5640 {
5641 tme_uint64_t value_read;
5642 tme_uint64_t value_written;
5643 tme_uint64_t value_read_verify;
5644
5645 /* if we can't make direct accesses at all, all atomic
5646 accesses must be done under lock. (when threads are
5647 cooperative the actual locking isn't needed): */
5648 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
5649 if (!TME_THREADS_COOPERATIVE) {
5650 tme_rwlock_wrlock(rwlock);
5651 }
5652 value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
5653 value_written = value_read + operand;
5654 tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
5655 if (!TME_THREADS_COOPERATIVE) {
5656 tme_rwlock_unlock(rwlock);
5657 }
5658 }
5659
5660 /* otherwise, threads are not cooperative and this host CPU
5661 can make atomic accesses to at least the most common memory
5662 size.
5663
5664 in that case, the only reason this function should get
5665 called is if the host CPU can't do an atomic 64-bit
5666 add at all, or if it can't do it at this alignment.
5667
5668 we emulate the atomic 64-bit add with a compare-and-exchange: */
5669 else {
5670
5671 /* do an atomic read of the memory: */
5672 value_read = tme_memory_atomic_read64(memory, rwlock, align_min);
5673
5674 /* spin the add in a compare-and-exchange loop: */
5675 for (;;) {
5676
5677 /* make the value to write: */
5678 value_written = value_read + operand;
5679
5680 /* try the compare-and-exchange: */
5681 value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);
5682
5683 /* if the compare-and-exchange failed: */
5684 if (__tme_predict_false(value_read_verify != value_read)) {
5685
5686 /* loop with the new value read from the memory: */
5687 value_read = value_read_verify;
5688 continue;
5689 }
5690
5691 /* stop now: */
5692 break;
5693 }
5694 }
5695
5696 /* return the value read: */
5697 return (value_read);
5698 }
5699
5700 /* undefine any macro version of tme_memory_atomic_sub64: */
5701 #undef tme_memory_atomic_sub64
5702
5703 /* the 64-bit atomic sub function: */
5704 tme_uint64_t
5705 tme_memory_atomic_sub64(tme_shared tme_uint64_t *memory,
5706 tme_uint64_t operand,
5707 tme_rwlock_t *rwlock,
5708 unsigned int align_min)
5709 {
5710 tme_uint64_t value_read;
5711 tme_uint64_t value_written;
5712 tme_uint64_t value_read_verify;
5713
5714 /* if we can't make direct accesses at all, all atomic
5715 accesses must be done under lock. (when threads are
5716 cooperative the actual locking isn't needed): */
5717 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
5718 if (!TME_THREADS_COOPERATIVE) {
5719 tme_rwlock_wrlock(rwlock);
5720 }
5721 value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
5722 value_written = value_read - operand;
5723 tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
5724 if (!TME_THREADS_COOPERATIVE) {
5725 tme_rwlock_unlock(rwlock);
5726 }
5727 }
5728
5729 /* otherwise, threads are not cooperative and this host CPU
5730 can make atomic accesses to at least the most common memory
5731 size.
5732
5733 in that case, the only reason this function should get
5734 called is if the host CPU can't do an atomic 64-bit
5735 sub at all, or if it can't do it at this alignment.
5736
5737 we emulate the atomic 64-bit sub with a compare-and-exchange: */
5738 else {
5739
5740 /* do an atomic read of the memory: */
5741 value_read = tme_memory_atomic_read64(memory, rwlock, align_min);
5742
5743 /* spin the sub in a compare-and-exchange loop: */
5744 for (;;) {
5745
5746 /* make the value to write: */
5747 value_written = value_read - operand;
5748
5749 /* try the compare-and-exchange: */
5750 value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);
5751
5752 /* if the compare-and-exchange failed: */
5753 if (__tme_predict_false(value_read_verify != value_read)) {
5754
5755 /* loop with the new value read from the memory: */
5756 value_read = value_read_verify;
5757 continue;
5758 }
5759
5760 /* stop now: */
5761 break;
5762 }
5763 }
5764
5765 /* return the value read: */
5766 return (value_read);
5767 }
5768
5769 /* undefine any macro version of tme_memory_atomic_mul64: */
5770 #undef tme_memory_atomic_mul64
5771
5772 /* the 64-bit atomic mul function: */
5773 tme_uint64_t
5774 tme_memory_atomic_mul64(tme_shared tme_uint64_t *memory,
5775 tme_uint64_t operand,
5776 tme_rwlock_t *rwlock,
5777 unsigned int align_min)
5778 {
5779 tme_uint64_t value_read;
5780 tme_uint64_t value_written;
5781 tme_uint64_t value_read_verify;
5782
5783 /* if we can't make direct accesses at all, all atomic
5784 accesses must be done under lock. (when threads are
5785 cooperative the actual locking isn't needed): */
5786 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
5787 if (!TME_THREADS_COOPERATIVE) {
5788 tme_rwlock_wrlock(rwlock);
5789 }
5790 value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
5791 value_written = value_read * operand;
5792 tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
5793 if (!TME_THREADS_COOPERATIVE) {
5794 tme_rwlock_unlock(rwlock);
5795 }
5796 }
5797
5798 /* otherwise, threads are not cooperative and this host CPU
5799 can make atomic accesses to at least the most common memory
5800 size.
5801
5802 in that case, the only reason this function should get
5803 called is if the host CPU can't do an atomic 64-bit
5804 mul at all, or if it can't do it at this alignment.
5805
5806 we emulate the atomic 64-bit mul with a compare-and-exchange: */
5807 else {
5808
5809 /* do an atomic read of the memory: */
5810 value_read = tme_memory_atomic_read64(memory, rwlock, align_min);
5811
5812 /* spin the mul in a compare-and-exchange loop: */
5813 for (;;) {
5814
5815 /* make the value to write: */
5816 value_written = value_read * operand;
5817
5818 /* try the compare-and-exchange: */
5819 value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);
5820
5821 /* if the compare-and-exchange failed: */
5822 if (__tme_predict_false(value_read_verify != value_read)) {
5823
5824 /* loop with the new value read from the memory: */
5825 value_read = value_read_verify;
5826 continue;
5827 }
5828
5829 /* stop now: */
5830 break;
5831 }
5832 }
5833
5834 /* return the value read: */
5835 return (value_read);
5836 }
5837
5838 /* undefine any macro version of tme_memory_atomic_div64: */
5839 #undef tme_memory_atomic_div64
5840
5841 /* the 64-bit atomic div function: */
5842 tme_uint64_t
5843 tme_memory_atomic_div64(tme_shared tme_uint64_t *memory,
5844 tme_uint64_t operand,
5845 tme_rwlock_t *rwlock,
5846 unsigned int align_min)
5847 {
5848 tme_uint64_t value_read;
5849 tme_uint64_t value_written;
5850 tme_uint64_t value_read_verify;
5851
5852 /* if we can't make direct accesses at all, all atomic
5853 accesses must be done under lock. (when threads are
5854 cooperative the actual locking isn't needed): */
5855 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
5856 if (!TME_THREADS_COOPERATIVE) {
5857 tme_rwlock_wrlock(rwlock);
5858 }
5859 value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
5860 value_written = value_read / operand;
5861 tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
5862 if (!TME_THREADS_COOPERATIVE) {
5863 tme_rwlock_unlock(rwlock);
5864 }
5865 }
5866
5867 /* otherwise, threads are not cooperative and this host CPU
5868 can make atomic accesses to at least the most common memory
5869 size.
5870
5871 in that case, the only reason this function should get
5872 called is if the host CPU can't do an atomic 64-bit
5873 div at all, or if it can't do it at this alignment.
5874
5875 we emulate the atomic 64-bit div with a compare-and-exchange: */
5876 else {
5877
5878 /* do an atomic read of the memory: */
5879 value_read = tme_memory_atomic_read64(memory, rwlock, align_min);
5880
5881 /* spin the div in a compare-and-exchange loop: */
5882 for (;;) {
5883
5884 /* make the value to write: */
5885 value_written = value_read / operand;
5886
5887 /* try the compare-and-exchange: */
5888 value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);
5889
5890 /* if the compare-and-exchange failed: */
5891 if (__tme_predict_false(value_read_verify != value_read)) {
5892
5893 /* loop with the new value read from the memory: */
5894 value_read = value_read_verify;
5895 continue;
5896 }
5897
5898 /* stop now: */
5899 break;
5900 }
5901 }
5902
5903 /* return the value read: */
5904 return (value_read);
5905 }
5906
5907 /* undefine any macro version of tme_memory_atomic_and64: */
5908 #undef tme_memory_atomic_and64
5909
5910 /* the 64-bit atomic and function: */
5911 tme_uint64_t
5912 tme_memory_atomic_and64(tme_shared tme_uint64_t *memory,
5913 tme_uint64_t operand,
5914 tme_rwlock_t *rwlock,
5915 unsigned int align_min)
5916 {
5917 tme_uint64_t value_read;
5918 tme_uint64_t value_written;
5919 tme_uint64_t value_read_verify;
5920
5921 /* if we can't make direct accesses at all, all atomic
5922 accesses must be done under lock. (when threads are
5923 cooperative the actual locking isn't needed): */
5924 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
5925 if (!TME_THREADS_COOPERATIVE) {
5926 tme_rwlock_wrlock(rwlock);
5927 }
5928 value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
5929 value_written = value_read & operand;
5930 tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
5931 if (!TME_THREADS_COOPERATIVE) {
5932 tme_rwlock_unlock(rwlock);
5933 }
5934 }
5935
5936 /* otherwise, threads are not cooperative and this host CPU
5937 can make atomic accesses to at least the most common memory
5938 size.
5939
5940 in that case, the only reason this function should get
5941 called is if the host CPU can't do an atomic 64-bit
5942 and at all, or if it can't do it at this alignment.
5943
5944 we emulate the atomic 64-bit and with a compare-and-exchange: */
5945 else {
5946
5947 /* do an atomic read of the memory: */
5948 value_read = tme_memory_atomic_read64(memory, rwlock, align_min);
5949
5950 /* spin the and in a compare-and-exchange loop: */
5951 for (;;) {
5952
5953 /* make the value to write: */
5954 value_written = value_read & operand;
5955
5956 /* try the compare-and-exchange: */
5957 value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);
5958
5959 /* if the compare-and-exchange failed: */
5960 if (__tme_predict_false(value_read_verify != value_read)) {
5961
5962 /* loop with the new value read from the memory: */
5963 value_read = value_read_verify;
5964 continue;
5965 }
5966
5967 /* stop now: */
5968 break;
5969 }
5970 }
5971
5972 /* return the value read: */
5973 return (value_read);
5974 }
5975
5976 /* undefine any macro version of tme_memory_atomic_or64: */
5977 #undef tme_memory_atomic_or64
5978
5979 /* the 64-bit atomic or function: */
5980 tme_uint64_t
5981 tme_memory_atomic_or64(tme_shared tme_uint64_t *memory,
5982 tme_uint64_t operand,
5983 tme_rwlock_t *rwlock,
5984 unsigned int align_min)
5985 {
5986 tme_uint64_t value_read;
5987 tme_uint64_t value_written;
5988 tme_uint64_t value_read_verify;
5989
5990 /* if we can't make direct accesses at all, all atomic
5991 accesses must be done under lock. (when threads are
5992 cooperative the actual locking isn't needed): */
5993 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
5994 if (!TME_THREADS_COOPERATIVE) {
5995 tme_rwlock_wrlock(rwlock);
5996 }
5997 value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
5998 value_written = value_read | operand;
5999 tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
6000 if (!TME_THREADS_COOPERATIVE) {
6001 tme_rwlock_unlock(rwlock);
6002 }
6003 }
6004
6005 /* otherwise, threads are not cooperative and this host CPU
6006 can make atomic accesses to at least the most common memory
6007 size.
6008
6009 in that case, the only reason this function should get
6010 called is if the host CPU can't do an atomic 64-bit
6011 or at all, or if it can't do it at this alignment.
6012
6013 we emulate the atomic 64-bit or with a compare-and-exchange: */
6014 else {
6015
6016 /* do an atomic read of the memory: */
6017 value_read = tme_memory_atomic_read64(memory, rwlock, align_min);
6018
6019 /* spin the or in a compare-and-exchange loop: */
6020 for (;;) {
6021
6022 /* make the value to write: */
6023 value_written = value_read | operand;
6024
6025 /* try the compare-and-exchange: */
6026 value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);
6027
6028 /* if the compare-and-exchange failed: */
6029 if (__tme_predict_false(value_read_verify != value_read)) {
6030
6031 /* loop with the new value read from the memory: */
6032 value_read = value_read_verify;
6033 continue;
6034 }
6035
6036 /* stop now: */
6037 break;
6038 }
6039 }
6040
6041 /* return the value read: */
6042 return (value_read);
6043 }
6044
6045 /* undefine any macro version of tme_memory_atomic_xor64: */
6046 #undef tme_memory_atomic_xor64
6047
6048 /* the 64-bit atomic xor function: */
6049 tme_uint64_t
6050 tme_memory_atomic_xor64(tme_shared tme_uint64_t *memory,
6051 tme_uint64_t operand,
6052 tme_rwlock_t *rwlock,
6053 unsigned int align_min)
6054 {
6055 tme_uint64_t value_read;
6056 tme_uint64_t value_written;
6057 tme_uint64_t value_read_verify;
6058
6059 /* if we can't make direct accesses at all, all atomic
6060 accesses must be done under lock. (when threads are
6061 cooperative the actual locking isn't needed): */
6062 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
6063 if (!TME_THREADS_COOPERATIVE) {
6064 tme_rwlock_wrlock(rwlock);
6065 }
6066 value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
6067 value_written = value_read ^ operand;
6068 tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
6069 if (!TME_THREADS_COOPERATIVE) {
6070 tme_rwlock_unlock(rwlock);
6071 }
6072 }
6073
6074 /* otherwise, threads are not cooperative and this host CPU
6075 can make atomic accesses to at least the most common memory
6076 size.
6077
6078 in that case, the only reason this function should get
6079 called is if the host CPU can't do an atomic 64-bit
6080 xor at all, or if it can't do it at this alignment.
6081
6082 we emulate the atomic 64-bit xor with a compare-and-exchange: */
6083 else {
6084
6085 /* do an atomic read of the memory: */
6086 value_read = tme_memory_atomic_read64(memory, rwlock, align_min);
6087
6088 /* spin the xor in a compare-and-exchange loop: */
6089 for (;;) {
6090
6091 /* make the value to write: */
6092 value_written = value_read ^ operand;
6093
6094 /* try the compare-and-exchange: */
6095 value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);
6096
6097 /* if the compare-and-exchange failed: */
6098 if (__tme_predict_false(value_read_verify != value_read)) {
6099
6100 /* loop with the new value read from the memory: */
6101 value_read = value_read_verify;
6102 continue;
6103 }
6104
6105 /* stop now: */
6106 break;
6107 }
6108 }
6109
6110 /* return the value read: */
6111 return (value_read);
6112 }
6113
6114 /* undefine any macro version of tme_memory_atomic_not64: */
6115 #undef tme_memory_atomic_not64
6116
6117 /* the 64-bit atomic not function: */
6118 tme_uint64_t
6119 tme_memory_atomic_not64(tme_shared tme_uint64_t *memory,
6120 tme_rwlock_t *rwlock,
6121 unsigned int align_min)
6122 {
6123 tme_uint64_t value_read;
6124 tme_uint64_t value_written;
6125 tme_uint64_t value_read_verify;
6126
6127 /* if we can't make direct accesses at all, all atomic
6128 accesses must be done under lock. (when threads are
6129 cooperative the actual locking isn't needed): */
6130 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
6131 if (!TME_THREADS_COOPERATIVE) {
6132 tme_rwlock_wrlock(rwlock);
6133 }
6134 value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
6135 value_written = ~value_read;
6136 tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
6137 if (!TME_THREADS_COOPERATIVE) {
6138 tme_rwlock_unlock(rwlock);
6139 }
6140 }
6141
6142 /* otherwise, threads are not cooperative and this host CPU
6143 can make atomic accesses to at least the most common memory
6144 size.
6145
6146 in that case, the only reason this function should get
6147 called is if the host CPU can't do an atomic 64-bit
6148 not at all, or if it can't do it at this alignment.
6149
6150 we emulate the atomic 64-bit not with a compare-and-exchange: */
6151 else {
6152
6153 /* do an atomic read of the memory: */
6154 value_read = tme_memory_atomic_read64(memory, rwlock, align_min);
6155
6156 /* spin the not in a compare-and-exchange loop: */
6157 for (;;) {
6158
6159 /* make the value to write: */
6160 value_written = ~value_read;
6161
6162 /* try the compare-and-exchange: */
6163 value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);
6164
6165 /* if the compare-and-exchange failed: */
6166 if (__tme_predict_false(value_read_verify != value_read)) {
6167
6168 /* loop with the new value read from the memory: */
6169 value_read = value_read_verify;
6170 continue;
6171 }
6172
6173 /* stop now: */
6174 break;
6175 }
6176 }
6177
6178 /* return the value read: */
6179 return (value_read);
6180 }
6181
6182 /* undefine any macro version of tme_memory_atomic_neg64: */
6183 #undef tme_memory_atomic_neg64
6184
6185 /* the 64-bit atomic neg function: */
6186 tme_uint64_t
6187 tme_memory_atomic_neg64(tme_shared tme_uint64_t *memory,
6188 tme_rwlock_t *rwlock,
6189 unsigned int align_min)
6190 {
6191 tme_uint64_t value_read;
6192 tme_uint64_t value_written;
6193 tme_uint64_t value_read_verify;
6194
6195 /* if we can't make direct accesses at all, all atomic
6196 accesses must be done under lock. (when threads are
6197 cooperative the actual locking isn't needed): */
6198 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
6199 if (!TME_THREADS_COOPERATIVE) {
6200 tme_rwlock_wrlock(rwlock);
6201 }
6202 value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
6203 value_written = 0 - value_read;
6204 tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
6205 if (!TME_THREADS_COOPERATIVE) {
6206 tme_rwlock_unlock(rwlock);
6207 }
6208 }
6209
6210 /* otherwise, threads are not cooperative and this host CPU
6211 can make atomic accesses to at least the most common memory
6212 size.
6213
6214 in that case, the only reason this function should get
6215 called is if the host CPU can't do an atomic 64-bit
6216 neg at all, or if it can't do it at this alignment.
6217
6218 we emulate the atomic 64-bit neg with a compare-and-exchange: */
6219 else {
6220
6221 /* do an atomic read of the memory: */
6222 value_read = tme_memory_atomic_read64(memory, rwlock, align_min);
6223
6224 /* spin the neg in a compare-and-exchange loop: */
6225 for (;;) {
6226
6227 /* make the value to write: */
6228 value_written = 0 - value_read;
6229
6230 /* try the compare-and-exchange: */
6231 value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);
6232
6233 /* if the compare-and-exchange failed: */
6234 if (__tme_predict_false(value_read_verify != value_read)) {
6235
6236 /* loop with the new value read from the memory: */
6237 value_read = value_read_verify;
6238 continue;
6239 }
6240
6241 /* stop now: */
6242 break;
6243 }
6244 }
6245
6246 /* return the value read: */
6247 return (value_read);
6248 }
6249
6250 /* undefine any macro version of tme_memory_atomic_xchg64: */
6251 #undef tme_memory_atomic_xchg64
6252
6253 /* the 64-bit atomic xchg function: */
6254 tme_uint64_t
6255 tme_memory_atomic_xchg64(tme_shared tme_uint64_t *memory,
6256 tme_uint64_t value_written,
6257 tme_rwlock_t *rwlock,
6258 unsigned int align_min)
6259 {
6260 tme_uint64_t value_read;
6261 tme_uint64_t value_read_verify;
6262
6263 /* if we can't make direct accesses at all, all atomic
6264 accesses must be done under lock. (when threads are
6265 cooperative the actual locking isn't needed): */
6266 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
6267 if (!TME_THREADS_COOPERATIVE) {
6268 tme_rwlock_wrlock(rwlock);
6269 }
6270 value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
6271 tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
6272 if (!TME_THREADS_COOPERATIVE) {
6273 tme_rwlock_unlock(rwlock);
6274 }
6275 }
6276
6277 /* otherwise, threads are not cooperative and this host CPU
6278 can make atomic accesses to at least the most common memory
6279 size.
6280
6281 in that case, the only reason this function should get
6282 called is if the host CPU can't do an atomic 64-bit
6283 xchg at all, or if it can't do it at this alignment.
6284
6285 we emulate the atomic 64-bit xchg with a compare-and-exchange: */
6286 else {
6287
6288 /* do an atomic read of the memory: */
6289 value_read = tme_memory_atomic_read64(memory, rwlock, align_min);
6290
6291 /* spin the xchg in a compare-and-exchange loop: */
6292 for (;;) {
6293
6294 /* try the compare-and-exchange: */
6295 value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);
6296
6297 /* if the compare-and-exchange failed: */
6298 if (__tme_predict_false(value_read_verify != value_read)) {
6299
6300 /* loop with the new value read from the memory: */
6301 value_read = value_read_verify;
6302 continue;
6303 }
6304
6305 /* stop now: */
6306 break;
6307 }
6308 }
6309
6310 /* return the value read: */
6311 return (value_read);
6312 }
6313
6314 /* undefine any macro version of tme_memory_atomic_cx64: */
6315 #undef tme_memory_atomic_cx64
6316
6317 /* the 64-bit atomic cx function: */
6318 tme_uint64_t
6319 tme_memory_atomic_cx64(tme_shared tme_uint64_t *memory,
6320 tme_uint64_t value_cmp,
6321 tme_uint64_t value_written,
6322 tme_rwlock_t *rwlock,
6323 unsigned int align_min)
6324 {
6325 tme_uint64_t value_read;
6326
6327 /* if we can't make direct accesses at all, all atomic
6328 accesses must be done under lock. (when threads are
6329 cooperative the actual locking isn't needed): */
6330 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
6331 if (!TME_THREADS_COOPERATIVE) {
6332 tme_rwlock_wrlock(rwlock);
6333 }
6334 value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
6335 if (value_read == value_cmp) {
6336 tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
6337 }
6338 if (!TME_THREADS_COOPERATIVE) {
6339 tme_rwlock_unlock(rwlock);
6340 }
6341 }
6342
6343 /* otherwise, threads are not cooperative and this host CPU
6344 can make atomic accesses to at least the most common memory
6345 size.
6346
6347 in that case, the only reason this function should get
6348 called is if the host CPU can't do an atomic 64-bit
6349 cx at all, or if it can't do it at this alignment.
6350
6351 we assume that these problematic atomic cxs are rare,
6352 and to emulate them we simply stop all other threads while
6353 doing the cx: */
6354 else {
6355 tme_thread_suspend_others();
6356 value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
6357 if (value_read == value_cmp) {
6358 tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
6359 }
6360 tme_thread_resume_others();
6361 }
6362
6363 /* return the value read: */
6364 return (value_read);
6365 }
6366
6367 /* undefine any macro version of tme_memory_atomic_read64: */
6368 #undef tme_memory_atomic_read64
6369
6370 /* the 64-bit atomic read function: */
6371 tme_uint64_t
6372 tme_memory_atomic_read64(_tme_const tme_shared tme_uint64_t *memory,
6373 tme_rwlock_t *rwlock,
6374 unsigned int align_min)
6375 {
6376 tme_uint64_t value_read;
6377
6378 /* if we can't make direct accesses at all, all atomic
6379 accesses must be done under lock. (when threads are
6380 cooperative the actual locking isn't needed): */
6381 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
6382 if (!TME_THREADS_COOPERATIVE) {
6383 tme_rwlock_rdlock(rwlock);
6384 }
6385 value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
6386 if (!TME_THREADS_COOPERATIVE) {
6387 tme_rwlock_unlock(rwlock);
6388 }
6389 }
6390
6391 /* otherwise, threads are not cooperative and this host CPU
6392 can make atomic accesses to at least the most common memory
6393 size.
6394
6395 in that case, the only reason this function should get
6396 called is if the host CPU can't do an atomic 64-bit
6397 read at all, or if it can't do it at this alignment.
6398
6399 we assume that these problematic atomic reads are rare,
6400 and to emulate them we simply stop all other threads while
6401 doing the read: */
6402 else {
6403 tme_thread_suspend_others();
6404 value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
6405 tme_thread_resume_others();
6406 }
6407
6408 /* return the value read: */
6409 return (value_read);
6410 }
6411
6412 /* undefine any macro version of tme_memory_atomic_write64: */
6413 #undef tme_memory_atomic_write64
6414
6415 /* the 64-bit atomic write function: */
6416 void
6417 tme_memory_atomic_write64(tme_shared tme_uint64_t *memory,
6418 tme_uint64_t value_written,
6419 tme_rwlock_t *rwlock,
6420 unsigned int align_min)
6421 {
6422
6423 /* if we can't make direct accesses at all, all atomic
6424 accesses must be done under lock. (when threads are
6425 cooperative the actual locking isn't needed): */
6426 if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
6427 if (!TME_THREADS_COOPERATIVE) {
6428 tme_rwlock_wrlock(rwlock);
6429 }
6430 tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
6431 if (!TME_THREADS_COOPERATIVE) {
6432 tme_rwlock_unlock(rwlock);
6433 }
6434 }
6435
6436 /* otherwise, threads are not cooperative and this host CPU
6437 can make atomic accesses to at least the most common memory
6438 size.
6439
6440 in that case, the only reason this function should get
6441 called is if the host CPU can't do an atomic 64-bit
6442 write at all, or if it can't do it at this alignment.
6443
6444 we assume that these problematic atomic writes are rare,
6445 and to emulate them we simply stop all other threads while
6446 doing the write: */
6447 else {
6448 tme_thread_suspend_others();
6449 tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
6450 tme_thread_resume_others();
6451 }
6452 }
6453
6454 #endif /* TME_HAVE_INT64_T */
6455