1 /* automatically generated by memory-auto.sh, do not edit! */
2 
3 /*
4  * Copyright (c) 2005, 2006 Matt Fredette
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. All advertising materials mentioning features or use of this software
16  *    must display the following acknowledgement:
17  *      This product includes software developed by Matt Fredette.
18  * 4. The name of the author may not be used to endorse or promote products
19  *    derived from this software without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24  * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
25  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
29  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 /* includes: */
35 #include <tme/memory.h>
36 
37 
38 _TME_RCSID("$Id: memory-auto.sh,v 1.2 2010/02/15 15:16:28 fredette Exp $");
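
/* These are the non-macro ("slow") versions of the bus access
   functions.  Each one rounds the possibly misaligned address down to
   a host-boundary aligned part, accesses whole parts with the atomic
   read and compare-and-exchange primitives, and shifts the bytes of
   interest into or out of position, touching additional parts as
   needed when the access straddles a boundary: */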
39 
40 /* undefine the macro version of tme_memory_bus_read16: */
41 #undef tme_memory_bus_read16
42 
43 /* the bus 16-bit read slow function: */
44 tme_uint16_t
45 tme_memory_bus_read16(_tme_const tme_shared tme_uint16_t *mem, tme_rwlock_t *rwlock, unsigned int align_min, unsigned int bus_boundary)
46 {
47   const unsigned int host_boundary = TME_MEMORY_BUS_BOUNDARY;
48   unsigned int size_skip;
49   unsigned int size_done;
50   tme_uint16_t x;
51 #ifdef TME_HAVE_INT64_T
52   _tme_const tme_shared tme_uint64_t *parts64;
53   tme_uint64_t part64;
54 #endif /* TME_HAVE_INT64_T */
55   _tme_const tme_shared tme_uint32_t *parts32;
56   tme_uint32_t part32;
57   _tme_const tme_shared tme_uint16_t *parts16;
58   tme_uint16_t part16;
59   _tme_const tme_shared tme_uint8_t *parts8;
60   tme_uint8_t part8;
61 
62   assert (bus_boundary != 0 && bus_boundary <= host_boundary);
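
  /* bus_boundary must be nonzero and no wider than the host boundary,
     and host_boundary selects which of the part sizes below is used
     to access the memory: */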
63 
64 #ifdef TME_HAVE_INT64_T
65 
66   if (host_boundary == sizeof(tme_uint64_t)) {
67 
68     /* prepare to read the first 64-bit part of the memory: */
69     parts64 = (_tme_const tme_shared tme_uint64_t *) (((unsigned long) mem) & (((unsigned long) 0) - (64 / 8)));
70     size_skip = (((unsigned int) (unsigned long) mem) % (64 / 8)) * 8;
71     size_done = 0;
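
    /* for example, with a hypothetical mem of 0x1007, parts64 would be
       0x1007 rounded down to 0x1000 and size_skip would be
       (0x1007 % 8) * 8 == 56 bits: */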
72 
73     /* read the first 64-bit part of the memory: */
74     part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
75 
76     /* on a little-endian host, we shift off the skip
77        data on the right, and shift the remaining data
78        up into position in the result: */
79     if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
80       x = (((tme_uint16_t) (part64 >> size_skip)) << 0);
81     }
82 
83     /* on a big-endian host, we shift off the skip data
84        on the left, and shift the remaining data down
85        into position in the result: */
86     else {
87       x = ((part64 << size_skip) >> ((64 - 16) + 0));
88     }
89     size_done = 64 - size_skip;
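
    /* continuing the hypothetical 0x1007 example on a little-endian
       host: part64 >> 56 brings the byte at mem down into bits 0-7 of
       x, size_done is 8, and the high byte of x is filled in from the
       next 64-bit part below: */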
90 
91     /* read at most one remaining 64-bit part of the memory: */
92     if (__tme_predict_false(size_done < 16)) {
93 
94       /* make a boundary: */
95       tme_memory_barrier(mem, (16 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);
96 
97       /* read the next 64-bit part of the memory: */
98       parts64++;
99       part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
100 
101       /* on a little-endian host, we shift off the skip
102          data on the right, and shift the remaining data
103          up into position in the result: */
104       if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
105         x |= (((tme_uint16_t) (part64 >> 0)) << size_done);
106       }
107 
108       /* on a big-endian host, we shift off the skip data
109          on the left, and shift the remaining data down
110          into position in the result: */
111       else {
112         x |= ((part64 << 0) >> ((64 - 16) + size_done));
113       }
114     }
115   }
116 
117   else
118 
119 #endif /* TME_HAVE_INT64_T */
120 
121   if (host_boundary == sizeof(tme_uint32_t)) {
122 
123     /* prepare to read the first 32-bit part of the memory: */
124     parts32 = (_tme_const tme_shared tme_uint32_t *) (((unsigned long) mem) & (((unsigned long) 0) - (32 / 8)));
125     size_skip = (((unsigned int) (unsigned long) mem) % (32 / 8)) * 8;
126     size_done = 0;
127 
128     /* read the first 32-bit part of the memory: */
129     part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
130 
131     /* on a little-endian host, we shift off the skip
132        data on the right, and shift the remaining data
133        up into position in the result: */
134     if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
135       x = (((tme_uint16_t) (part32 >> size_skip)) << 0);
136     }
137 
138     /* on a big-endian host, we shift off the skip data
139        on the left, and shift the remaining data down
140        into position in the result: */
141     else {
142       x = ((part32 << size_skip) >> ((32 - 16) + 0));
143     }
144     size_done = 32 - size_skip;
145 
146     /* read at most one remaining 32-bit part of the memory: */
147     if (__tme_predict_false(size_done < 16)) {
148 
149       /* make a boundary: */
150       tme_memory_barrier(mem, (16 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);
151 
152       /* read the next 32-bit part of the memory: */
153       parts32++;
154       part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
155 
156       /* on a little-endian host, we shift off the skip
157          data on the right, and shift the remaining data
158          up into position in the result: */
159       if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
160         x |= (((tme_uint16_t) (part32 >> 0)) << size_done);
161       }
162 
163       /* on a big-endian host, we shift off the skip data
164          on the left, and shift the remaining data down
165          into position in the result: */
166       else {
167         x |= ((part32 << 0) >> ((32 - 16) + size_done));
168       }
169     }
170   }
171 
172   else if (host_boundary == sizeof(tme_uint16_t)) {
173 
174     /* prepare to read the first 16-bit part of the memory: */
175     parts16 = (_tme_const tme_shared tme_uint16_t *) (((unsigned long) mem) & (((unsigned long) 0) - (16 / 8)));
176     size_skip = (((unsigned int) (unsigned long) mem) % (16 / 8)) * 8;
177     size_done = 0;
178 
179     /* read the first 16-bit part of the memory: */
180     part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
181 
182     /* on a little-endian host, we shift off the skip
183        data on the right, and shift the remaining data
184        up into position in the result: */
185     if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
186       x = (((tme_uint16_t) (part16 >> size_skip)) << 0);
187     }
188 
189     /* on a big-endian host, we shift off the skip data
190        on the left, and shift the remaining data down
191        into position in the result: */
192     else {
193       x = ((((tme_uint16_t) part16) << ((16 - 16) + size_skip)) >> 0);
194     }
195     size_done = 16 - size_skip;
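
    /* size_skip here is either 0 or 8, so a second 16-bit part is
       needed only when the 16-bit value starts at an odd address: */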
196 
197     /* read at most one remaining 16-bit part of the memory: */
198     if (__tme_predict_false(size_done < 16)) {
199 
200       /* make a boundary: */
201       tme_memory_barrier(mem, (16 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);
202 
203       /* read the next 16-bit part of the memory: */
204       parts16++;
205       part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
206 
207       /* on a little-endian host, we shift off the skip
208          data on the right, and shift the remaining data
209          up into position in the result: */
210       if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
211         x |= (((tme_uint16_t) (part16 >> 0)) << size_done);
212       }
213 
214       /* on a big-endian host, we shift off the skip data
215          on the left, and shift the remaining data down
216          into position in the result: */
217       else {
218         x |= ((((tme_uint16_t) part16) << ((16 - 16) + 0)) >> size_done);
219       }
220     }
221   }
222 
223   else {
224 
225     /* prepare to read the first 8-bit part of the memory: */
226     parts8 = (_tme_const tme_shared tme_uint8_t *) (((unsigned long) mem) & (((unsigned long) 0) - (8 / 8)));
227     size_skip = (((unsigned int) (unsigned long) mem) % (8 / 8)) * 8;
228     size_done = 0;
229 
230     /* read the first 8-bit part of the memory: */
231     part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
232 
233     /* on a little-endian host, we shift off the skip
234        data on the right, and shift the remaining data
235        up into position in the result: */
236     if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
237       x = (((tme_uint16_t) (part8 >> size_skip)) << 0);
238     }
239 
240     /* on a big-endian host, we shift off the skip data
241        on the left, and shift the remaining data down
242        into position in the result: */
243     else {
244       x = ((((tme_uint16_t) part8) << ((16 - 8) + size_skip)) >> 0);
245     }
246     size_done = 8 - size_skip;
247 
248     /* read at most one remaining 8-bit part of the memory: */
249     if (__tme_predict_false(size_done < 16)) {
250 
251       /* make a boundary: */
252       tme_memory_barrier(mem, (16 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);
253 
254       /* read the next 8-bit part of the memory: */
255       parts8++;
256       part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
257 
258       /* on a little-endian host, we shift off the skip
259          data on the right, and shift the remaining data
260          up into position in the result: */
261       if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
262         x |= (((tme_uint16_t) (part8 >> 0)) << size_done);
263       }
264 
265       /* on a big-endian host, we shift off the skip data
266          on the left, and shift the remaining data down
267          into position in the result: */
268       else {
269         x |= ((((tme_uint16_t) part8) << ((16 - 8) + 0)) >> size_done);
270       }
271     }
272   }
273 
274   /* return the value read: */
275   return (x);
276 }
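
/* A minimal usage sketch, not part of the generated output; the
   buffer, lock, and offset names below are hypothetical, and a caller
   that can promise only byte alignment passes 1 for both align_min
   and bus_boundary: */
#if 0
static tme_uint16_t
example_read_reg16(_tme_const tme_shared tme_uint8_t *bus_memory,
                   tme_rwlock_t *bus_rwlock,
                   unsigned long offset)
{
  /* the 16-bit value may be misaligned, so use the slow bus read with
     single-byte alignment and an 8-bit bus boundary: */
  return (tme_memory_bus_read16((_tme_const tme_shared tme_uint16_t *) (bus_memory + offset),
                                bus_rwlock,
                                sizeof(tme_uint8_t),
                                sizeof(tme_uint8_t)));
}
#endif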
277 
278 /* undefine the macro version of tme_memory_bus_write16: */
279 #undef tme_memory_bus_write16
280 
281 /* the bus 16-bit write slow function: */
282 void
283 tme_memory_bus_write16(tme_shared tme_uint16_t *mem, tme_uint16_t x, tme_rwlock_t *rwlock, unsigned int align_min, unsigned int bus_boundary)
284 {
285   const unsigned int host_boundary = TME_MEMORY_BUS_BOUNDARY;
286   unsigned int size_skip;
287   unsigned int size_done;
288 #ifdef TME_HAVE_INT64_T
289   tme_shared tme_uint64_t *parts64;
290   tme_uint64_t part64;
291   tme_uint64_t part64_cmp;
292 #endif /* TME_HAVE_INT64_T */
293   tme_shared tme_uint32_t *parts32;
294   tme_uint32_t part32;
295   tme_uint32_t part32_cmp;
296   tme_shared tme_uint16_t *parts16;
297   tme_uint16_t part16;
298   tme_uint16_t part16_cmp;
299   tme_shared tme_uint8_t *parts8;
300   tme_uint8_t part8;
301   tme_uint8_t part8_cmp;
302 
303   assert (bus_boundary != 0 && bus_boundary <= host_boundary);
304 
305 #ifdef TME_HAVE_INT64_T
306 
307   if (host_boundary == sizeof(tme_uint64_t)) {
308 
309     /* prepare to write the first 64-bit part of the memory: */
310     parts64 = (tme_shared tme_uint64_t *) (((unsigned long) mem) & (((unsigned long) 0) - (64 / 8)));
311     size_skip = (((unsigned int) (unsigned long) mem) % (64 / 8)) * 8;
312     size_done = 0;
313 
314     /* write the first 64-bit part of the memory: */
315     part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
316     do {
317       part64_cmp = part64;
318 
319       /* on a little-endian host, we clear with zeroes
320          shifted up past the skip data, and then we
321          insert the data shifted up past the skip data: */
322       if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
323         part64 &= (_tme_memory_type_mask(tme_uint64_t, + 0) ^ (((tme_uint64_t) _tme_memory_type_mask(tme_uint16_t, << 0)) << size_skip));
324         part64 |= (((tme_uint64_t) x) << size_skip);
325       }
326 
327       /* on a big-endian host, we clear with zeroes
328          shifted down past the skip data, and then we
329          insert the data shifted down past the skip data: */
330       else {
331         part64 &= ~((((tme_uint64_t) _tme_memory_type_mask(tme_uint16_t, + 0)) << ((64 - 16) + 0)) >> size_skip);
332         part64 |= ((((tme_uint64_t) x) << (64 - 16)) >> size_skip);
333       }
334 
335       /* loop until we can atomically update this part: */
336       part64 = tme_memory_atomic_cx64(parts64, part64_cmp, part64, rwlock, sizeof(tme_uint64_t));
337     } while (part64 != part64_cmp);
338     if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
339       x >>= (64 - size_skip);
340     }
341     else {
342       x <<= (64 - size_skip);
343     }
344     size_done = 64 - size_skip;
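
    /* the shifts above discard the bits of x that were just stored,
       so if the value straddled a 64-bit boundary the leftover bits
       of x are already in position for the second part below: */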
345 
346     /* write at most one remaining 64-bit part of the memory: */
347     if (__tme_predict_false(size_done < 16)) {
348 
349       /* make a boundary: */
350       tme_memory_barrier(mem, (16 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
351 
352       /* write the next 64-bit part of the memory: */
353       parts64++;
354       part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
355       do {
356         part64_cmp = part64;
357 
358         /* on a little-endian host, we clear with zeroes
359            shifted up past the skip data, and then we
360            insert the data shifted up past the skip data: */
361         if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
362           part64 &= (_tme_memory_type_mask(tme_uint64_t, + 0) ^ (((tme_uint64_t) _tme_memory_type_mask(tme_uint16_t, << size_done)) << 0));
363           part64 |= (((tme_uint64_t) x) << 0);
364         }
365 
366         /* on a big-endian host, we clear with zeroes
367            shifted down past the skip data, and then we
368            insert the data shifted down past the skip data: */
369         else {
370           part64 &= ~((((tme_uint64_t) _tme_memory_type_mask(tme_uint16_t, + 0)) << ((64 - 16) + size_done)) >> 0);
371           part64 |= ((((tme_uint64_t) x) << (64 - 16)) >> 0);
372         }
373 
374         /* loop until we can atomically update this part: */
375         part64 = tme_memory_atomic_cx64(parts64, part64_cmp, part64, rwlock, sizeof(tme_uint64_t));
376       } while (part64 != part64_cmp);
377     }
378   }
379 
380   else
381 
382 #endif /* TME_HAVE_INT64_T */
383 
384   if (host_boundary == sizeof(tme_uint32_t)) {
385 
386     /* prepare to write the first 32-bit part of the memory: */
387     parts32 = (tme_shared tme_uint32_t *) (((unsigned long) mem) & (((unsigned long) 0) - (32 / 8)));
388     size_skip = (((unsigned int) (unsigned long) mem) % (32 / 8)) * 8;
389     size_done = 0;
390 
391     /* write the first 32-bit part of the memory: */
392     part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
393     do {
394       part32_cmp = part32;
395 
396       /* on a little-endian host, we clear with zeroes
397          shifted up past the skip data, and then we
398          insert the data shifted up past the skip data: */
399       if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
400         part32 &= (_tme_memory_type_mask(tme_uint32_t, + 0) ^ (((tme_uint32_t) _tme_memory_type_mask(tme_uint16_t, << 0)) << size_skip));
401         part32 |= (((tme_uint32_t) x) << size_skip);
402       }
403 
404       /* on a big-endian host, we clear with zeroes
405          shifted down past the skip data, and then we
406          insert the data shifted down past the skip data: */
407       else {
408         part32 &= ~((((tme_uint32_t) _tme_memory_type_mask(tme_uint16_t, + 0)) << ((32 - 16) + 0)) >> size_skip);
409         part32 |= ((((tme_uint32_t) x) << (32 - 16)) >> size_skip);
410       }
411 
412       /* loop until we can atomically update this part: */
413       part32 = tme_memory_atomic_cx32(parts32, part32_cmp, part32, rwlock, sizeof(tme_uint32_t));
414     } while (part32 != part32_cmp);
415     if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
416       x >>= (32 - size_skip);
417     }
418     else {
419       x <<= (32 - size_skip);
420     }
421     size_done = 32 - size_skip;
422 
423     /* write at most one remaining 32-bit part of the memory: */
424     if (__tme_predict_false(size_done < 16)) {
425 
426       /* make a boundary: */
427       tme_memory_barrier(mem, (16 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
428 
429       /* write the next 32-bit part of the memory: */
430       parts32++;
431       part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
432       do {
433         part32_cmp = part32;
434 
435         /* on a little-endian host, we clear with zeroes
436            shifted up past the skip data, and then we
437            insert the data shifted up past the skip data: */
438         if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
439           part32 &= (_tme_memory_type_mask(tme_uint32_t, + 0) ^ (((tme_uint32_t) _tme_memory_type_mask(tme_uint16_t, << size_done)) << 0));
440           part32 |= (((tme_uint32_t) x) << 0);
441         }
442 
443         /* on a big-endian host, we clear with zeroes
444            shifted down past the skip data, and then we
445            insert the data shifted down past the skip data: */
446         else {
447           part32 &= ~((((tme_uint32_t) _tme_memory_type_mask(tme_uint16_t, + 0)) << ((32 - 16) + size_done)) >> 0);
448           part32 |= ((((tme_uint32_t) x) << (32 - 16)) >> 0);
449         }
450 
451         /* loop until we can atomically update this part: */
452         part32 = tme_memory_atomic_cx32(parts32, part32_cmp, part32, rwlock, sizeof(tme_uint32_t));
453       } while (part32 != part32_cmp);
454     }
455   }
456 
457   else if (host_boundary == sizeof(tme_uint16_t)) {
458 
459     /* prepare to write the first 16-bit part of the memory: */
460     parts16 = (tme_shared tme_uint16_t *) (((unsigned long) mem) & (((unsigned long) 0) - (16 / 8)));
461     size_skip = (((unsigned int) (unsigned long) mem) % (16 / 8)) * 8;
462     size_done = 0;
463 
464     /* write the first 16-bit part of the memory: */
465     part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
466     do {
467       part16_cmp = part16;
468 
469       /* on a little-endian host, we clear with zeroes
470          shifted up past the skip data, and then we
471          insert the data shifted up past the skip data: */
472       if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
473         part16 &= (_tme_memory_type_mask(tme_uint16_t, + 0) ^ (((tme_uint16_t) _tme_memory_type_mask(tme_uint16_t, << 0)) << size_skip));
474         part16 |= (((tme_uint16_t) x) << size_skip);
475       }
476 
477       /* on a big-endian host, we clear with zeroes
478          shifted down past the skip data, and then we
479          insert the data shifted down past the skip data: */
480       else {
481         part16 &= ~(_tme_memory_type_mask(tme_uint16_t, << 0) >> size_skip);
482         part16 |= (x >> ((16 - 16) + size_skip));
483       }
484 
485       /* loop until we can atomically update this part: */
486       part16 = tme_memory_atomic_cx16(parts16, part16_cmp, part16, rwlock, sizeof(tme_uint16_t));
487     } while (part16 != part16_cmp);
488     if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
489       x >>= (16 - size_skip);
490     }
491     else {
492       x <<= (16 - size_skip);
493     }
494     size_done = 16 - size_skip;
495 
496     /* write at most one remaining 16-bit part of the memory: */
497     if (__tme_predict_false(size_done < 16)) {
498 
499       /* make a boundary: */
500       tme_memory_barrier(mem, (16 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
501 
502       /* write the next 16-bit part of the memory: */
503       parts16++;
504       part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
505       do {
506         part16_cmp = part16;
507 
508         /* on a little-endian host, we clear with zeroes
509            shifted up past the skip data, and then we
510            insert the data shifted up past the skip data: */
511         if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
512           part16 &= (_tme_memory_type_mask(tme_uint16_t, + 0) ^ (((tme_uint16_t) _tme_memory_type_mask(tme_uint16_t, << size_done)) << 0));
513           part16 |= (((tme_uint16_t) x) << 0);
514         }
515 
516         /* on a big-endian host, we clear with zeroes
517            shifted down past the skip data, and then we
518            insert the data shifted down past the skip data: */
519         else {
520           part16 &= ~(_tme_memory_type_mask(tme_uint16_t, << size_done) >> 0);
521           part16 |= (x >> ((16 - 16) + 0));
522         }
523 
524         /* loop until we can atomically update this part: */
525         part16 = tme_memory_atomic_cx16(parts16, part16_cmp, part16, rwlock, sizeof(tme_uint16_t));
526       } while (part16 != part16_cmp);
527     }
528   }
529 
530   else {
531 
532     /* prepare to write the first 8-bit part of the memory: */
533     parts8 = (tme_shared tme_uint8_t *) (((unsigned long) mem) & (((unsigned long) 0) - (8 / 8)));
534     size_skip = (((unsigned int) (unsigned long) mem) % (8 / 8)) * 8;
535     size_done = 0;
536 
537     /* write the first 8-bit part of the memory: */
538     part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
539     do {
540       part8_cmp = part8;
541 
542       /* on a little-endian host, we clear with zeroes
543          shifted up past the skip data, and then we
544          insert the data shifted up past the skip data: */
545       if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
546         part8 &= (_tme_memory_type_mask(tme_uint8_t, + 0) ^ (((tme_uint8_t) _tme_memory_type_mask(tme_uint16_t, << 0)) << size_skip));
547         part8 |= (((tme_uint8_t) x) << size_skip);
548       }
549 
550       /* on a big-endian host, we clear with zeroes
551          shifted down past the skip data, and then we
552          insert the data shifted down past the skip data: */
553       else {
554         part8 &= ~(_tme_memory_type_mask(tme_uint8_t, << 0) >> size_skip);
555         part8 |= (x >> ((16 - 8) + size_skip));
556       }
557 
558       /* loop until we can atomically update this part: */
559       part8 = tme_memory_atomic_cx8(parts8, part8_cmp, part8, rwlock, sizeof(tme_uint8_t));
560     } while (part8 != part8_cmp);
561     if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
562       x >>= (8 - size_skip);
563     }
564     else {
565       x <<= (8 - size_skip);
566     }
567     size_done = 8 - size_skip;
568 
569     /* write at most one remaining 8-bit part of the memory: */
570     if (__tme_predict_false(size_done < 16)) {
571 
572       /* make a boundary: */
573       tme_memory_barrier(mem, (16 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
574 
575       /* write the next 8-bit part of the memory: */
576       parts8++;
577       part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
578       do {
579         part8_cmp = part8;
580 
581         /* on a little-endian host, we clear with zeroes
582            shifted up past the skip data, and then we
583            insert the data shifted up past the skip data: */
584         if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
585           part8 &= (_tme_memory_type_mask(tme_uint8_t, + 0) ^ (((tme_uint8_t) _tme_memory_type_mask(tme_uint16_t, << size_done)) << 0));
586           part8 |= (((tme_uint8_t) x) << 0);
587         }
588 
589         /* on a big-endian host, we clear with zeroes
590            shifted down past the skip data, and then we
591            insert the data shifted down past the skip data: */
592         else {
593           part8 &= ~(_tme_memory_type_mask(tme_uint8_t, << size_done) >> 0);
594           part8 |= (x >> ((16 - 8) + 0));
595         }
596 
597         /* loop until we can atomically update this part: */
598         part8 = tme_memory_atomic_cx8(parts8, part8_cmp, part8, rwlock, sizeof(tme_uint8_t));
599       } while (part8 != part8_cmp);
600     }
601   }
602 }
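
/* Every write above uses the same read / modify / compare-and-exchange
   idiom.  A minimal sketch of that idiom in isolation (a hypothetical
   helper, not part of the generated output), atomically setting bits
   in a shared 32-bit word: */
#if 0
static void
example_atomic_or32(tme_shared tme_uint32_t *word,
                    tme_uint32_t bits,
                    tme_rwlock_t *rwlock)
{
  tme_uint32_t value;
  tme_uint32_t value_cmp;

  /* read the current value, then retry the compare-and-exchange until
     it returns the value that the update was based on: */
  value = tme_memory_atomic_read32(word, rwlock, sizeof(tme_uint32_t));
  do {
    value_cmp = value;
    value = tme_memory_atomic_cx32(word, value_cmp, value_cmp | bits, rwlock, sizeof(tme_uint32_t));
  } while (value != value_cmp);
}
#endif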
603 
604 /* undefine the macro version of tme_memory_bus_read32: */
605 #undef tme_memory_bus_read32
606 
607 /* the bus 32-bit read slow function: */
608 tme_uint32_t
609 tme_memory_bus_read32(_tme_const tme_shared tme_uint32_t *mem, tme_rwlock_t *rwlock, unsigned int align_min, unsigned int bus_boundary)
610 {
611   const unsigned int host_boundary = TME_MEMORY_BUS_BOUNDARY;
612   unsigned int size_skip;
613   unsigned int size_done;
614   tme_uint32_t x;
615 #ifdef TME_HAVE_INT64_T
616   _tme_const tme_shared tme_uint64_t *parts64;
617   tme_uint64_t part64;
618 #endif /* TME_HAVE_INT64_T */
619   _tme_const tme_shared tme_uint32_t *parts32;
620   tme_uint32_t part32;
621   _tme_const tme_shared tme_uint16_t *parts16;
622   tme_uint16_t part16;
623   _tme_const tme_shared tme_uint8_t *parts8;
624   tme_uint8_t part8;
625 
626   assert (bus_boundary != 0 && bus_boundary <= host_boundary);
627 
628 #ifdef TME_HAVE_INT64_T
629 
630   if (host_boundary == sizeof(tme_uint64_t)) {
631 
632     /* prepare to read the first 64-bit part of the memory: */
633     parts64 = (_tme_const tme_shared tme_uint64_t *) (((unsigned long) mem) & (((unsigned long) 0) - (64 / 8)));
634     size_skip = (((unsigned int) (unsigned long) mem) % (64 / 8)) * 8;
635     size_done = 0;
636 
637     /* read the first 64-bit part of the memory: */
638     part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
639 
640     /* on a little-endian host, we shift off the skip
641        data on the right, and shift the remaining data
642        up into position in the result: */
643     if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
644       x = (((tme_uint32_t) (part64 >> size_skip)) << 0);
645     }
646 
647     /* on a big-endian host, we shift off the skip data
648        on the left, and shift the remaining data down
649        into position in the result: */
650     else {
651       x = ((part64 << size_skip) >> ((64 - 32) + 0));
652     }
653     size_done = 64 - size_skip;
654 
655     /* read at most one remaining 64-bit part of the memory: */
656     if (__tme_predict_false(size_done < 32)) {
657 
658       /* make a boundary: */
659       tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);
660 
661       /* read the next 64-bit part of the memory: */
662       parts64++;
663       part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
664 
665       /* on a little-endian host, we shift off the skip
666          data on the right, and shift the remaining data
667          up into position in the result: */
668       if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
669         x |= (((tme_uint32_t) (part64 >> 0)) << size_done);
670       }
671 
672       /* on a big-endian host, we shift off the skip data
673          on the left, and shift the remaining data down
674          into position in the result: */
675       else {
676         x |= ((part64 << 0) >> ((64 - 32) + size_done));
677       }
678     }
679   }
680 
681   else
682 
683 #endif /* TME_HAVE_INT64_T */
684 
685   if (host_boundary == sizeof(tme_uint32_t)) {
686 
687     /* prepare to read the first 32-bit part of the memory: */
688     parts32 = (_tme_const tme_shared tme_uint32_t *) (((unsigned long) mem) & (((unsigned long) 0) - (32 / 8)));
689     size_skip = (((unsigned int) (unsigned long) mem) % (32 / 8)) * 8;
690     size_done = 0;
691 
692     /* read the first 32-bit part of the memory: */
693     part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
694 
695     /* on a little-endian host, we shift off the skip
696        data on the right, and shift the remaining data
697        up into position in the result: */
698     if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
699       x = (((tme_uint32_t) (part32 >> size_skip)) << 0);
700     }
701 
702     /* on a big-endian host, we shift off the skip data
703        on the left, and shift the remaining data down
704        into position in the result: */
705     else {
706       x = ((((tme_uint32_t) part32) << ((32 - 32) + size_skip)) >> 0);
707     }
708     size_done = 32 - size_skip;
709 
710     /* read at most one remaining 32-bit part of the memory: */
711     if (__tme_predict_false(size_done < 32)) {
712 
713       /* make a boundary: */
714       tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);
715 
716       /* read the next 32-bit part of the memory: */
717       parts32++;
718       part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
719 
720       /* on a little-endian host, we shift off the skip
721          data on the right, and shift the remaining data
722          up into position in the result: */
723       if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
724         x |= (((tme_uint32_t) (part32 >> 0)) << size_done);
725       }
726 
727       /* on a big-endian host, we shift off the skip data
728          on the left, and shift the remaining data down
729          into position in the result: */
730       else {
731         x |= ((((tme_uint32_t) part32) << ((32 - 32) + 0)) >> size_done);
732       }
733     }
734   }
735 
736   else if (host_boundary == sizeof(tme_uint16_t)) {
737 
738     /* prepare to read the first 16-bit part of the memory: */
739     parts16 = (_tme_const tme_shared tme_uint16_t *) (((unsigned long) mem) & (((unsigned long) 0) - (16 / 8)));
740     size_skip = (((unsigned int) (unsigned long) mem) % (16 / 8)) * 8;
741     size_done = 0;
742 
743     /* read the first 16-bit part of the memory: */
744     part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
745 
746     /* on a little-endian host, we shift off the skip
747        data on the right, and shift the remaining data
748        up into position in the result: */
749     if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
750       x = (((tme_uint32_t) (part16 >> size_skip)) << 0);
751     }
752 
753     /* on a big-endian host, we shift off the skip data
754        on the left, and shift the remaining data down
755        into position in the result: */
756     else {
757       x = ((((tme_uint32_t) part16) << ((32 - 16) + size_skip)) >> 0);
758     }
759     size_done = 16 - size_skip;
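
    /* size_done here is either 8 or 16, so the loop below reads one
       more 16-bit part for an aligned 32-bit value and two more parts
       when the value starts at an odd address: */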
760 
761     /* read any remaining 16-bit parts of the memory: */
762     for (; size_done < 32; size_done += 16) {
763 
764       /* make a boundary: */
765       tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);
766 
767       /* read the next 16-bit part of the memory: */
768       parts16++;
769       part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
770 
771       /* on a little-endian host, we shift off the skip
772          data on the right, and shift the remaining data
773          up into position in the result: */
774       if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
775         x |= (((tme_uint32_t) (part16 >> 0)) << size_done);
776       }
777 
778       /* on a big-endian host, we shift off the skip data
779          on the left, and shift the remaining data down
780          into position in the result: */
781       else {
782         x |= ((((tme_uint32_t) part16) << ((32 - 16) + 0)) >> size_done);
783       }
784     }
785   }
786 
787   else {
788 
789     /* prepare to read the first 8-bit part of the memory: */
790     parts8 = (_tme_const tme_shared tme_uint8_t *) (((unsigned long) mem) & (((unsigned long) 0) - (8 / 8)));
791     size_skip = (((unsigned int) (unsigned long) mem) % (8 / 8)) * 8;
792     size_done = 0;
793 
794     /* read the first 8-bit part of the memory: */
795     part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
796 
797     /* on a little-endian host, we shift off the skip
798        data on the right, and shift the remaining data
799        up into position in the result: */
800     if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
801       x = (((tme_uint32_t) (part8 >> size_skip)) << 0);
802     }
803 
804     /* on a big-endian host, we shift off the skip data
805        on the left, and shift the remaining data down
806        into position in the result: */
807     else {
808       x = ((((tme_uint32_t) part8) << ((32 - 8) + size_skip)) >> 0);
809     }
810     size_done = 8 - size_skip;
811 
812     /* read any remaining 8-bit parts of the memory: */
813     for (; size_done < 32; size_done += 8) {
814 
815       /* make a boundary: */
816       tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);
817 
818       /* read the next 8-bit part of the memory: */
819       parts8++;
820       part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
821 
822       /* on a little-endian host, we shift off the skip
823          data on the right, and shift the remaining data
824          up into position in the result: */
825       if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
826         x |= (((tme_uint32_t) (part8 >> 0)) << size_done);
827       }
828 
829       /* on a big-endian host, we shift off the skip data
830          on the left, and shift the remaining data down
831          into position in the result: */
832       else {
833         x |= ((((tme_uint32_t) part8) << ((32 - 8) + 0)) >> size_done);
834       }
835     }
836   }
837 
838   /* return the value read: */
839   return (x);
840 }
841 
842 /* undefine the macro version of tme_memory_bus_write32: */
843 #undef tme_memory_bus_write32
844 
845 /* the bus 32-bit write slow function: */
846 void
847 tme_memory_bus_write32(tme_shared tme_uint32_t *mem, tme_uint32_t x, tme_rwlock_t *rwlock, unsigned int align_min, unsigned int bus_boundary)
848 {
849   const unsigned int host_boundary = TME_MEMORY_BUS_BOUNDARY;
850   unsigned int size_skip;
851   unsigned int size_done;
852 #ifdef TME_HAVE_INT64_T
853   tme_shared tme_uint64_t *parts64;
854   tme_uint64_t part64;
855   tme_uint64_t part64_cmp;
856 #endif /* TME_HAVE_INT64_T */
857   tme_shared tme_uint32_t *parts32;
858   tme_uint32_t part32;
859   tme_uint32_t part32_cmp;
860   tme_shared tme_uint16_t *parts16;
861   tme_uint16_t part16;
862   tme_uint16_t part16_cmp;
863   tme_shared tme_uint8_t *parts8;
864   tme_uint8_t part8;
865   tme_uint8_t part8_cmp;
866 
867   assert (bus_boundary != 0 && bus_boundary <= host_boundary);
868 
869 #ifdef TME_HAVE_INT64_T
870 
871   if (host_boundary == sizeof(tme_uint64_t)) {
872 
873     /* prepare to write the first 64-bit part of the memory: */
874     parts64 = (tme_shared tme_uint64_t *) (((unsigned long) mem) & (((unsigned long) 0) - (64 / 8)));
875     size_skip = (((unsigned int) (unsigned long) mem) % (64 / 8)) * 8;
876     size_done = 0;
877 
878     /* write the first 64-bit part of the memory: */
879     part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
880     do {
881       part64_cmp = part64;
882 
883       /* on a little-endian host, we clear with zeroes
884          shifted up past the skip data, and then we
885          insert the data shifted up past the skip data: */
886       if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
887         part64 &= (_tme_memory_type_mask(tme_uint64_t, + 0) ^ (((tme_uint64_t) _tme_memory_type_mask(tme_uint32_t, << 0)) << size_skip));
888         part64 |= (((tme_uint64_t) x) << size_skip);
889       }
890 
891       /* on a big-endian host, we clear with zeroes
892          shifted down past the skip data, and then we
893          insert the data shifted down past the skip data: */
894       else {
895         part64 &= ~((((tme_uint64_t) _tme_memory_type_mask(tme_uint32_t, + 0)) << ((64 - 32) + 0)) >> size_skip);
896         part64 |= ((((tme_uint64_t) x) << (64 - 32)) >> size_skip);
897       }
898 
899       /* loop until we can atomically update this part: */
900       part64 = tme_memory_atomic_cx64(parts64, part64_cmp, part64, rwlock, sizeof(tme_uint64_t));
901     } while (part64 != part64_cmp);
902     if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
903       x >>= (64 - size_skip);
904     }
905     else {
906       x <<= (64 - size_skip);
907     }
908     size_done = 64 - size_skip;
909 
910     /* write at most one remaining 64-bit part of the memory: */
911     if (__tme_predict_false(size_done < 32)) {
912 
913       /* make a boundary: */
914       tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
915 
916       /* write the next 64-bit part of the memory: */
917       parts64++;
918       part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
919       do {
920         part64_cmp = part64;
921 
922         /* on a little-endian host, we clear with zeroes
923            shifted up past the skip data, and then we
924            insert the data shifted up past the skip data: */
925         if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
926           part64 &= (_tme_memory_type_mask(tme_uint64_t, + 0) ^ (((tme_uint64_t) _tme_memory_type_mask(tme_uint32_t, << size_done)) << 0));
927           part64 |= (((tme_uint64_t) x) << 0);
928         }
929 
930         /* on a big-endian host, we clear with zeroes
931            shifted down past the skip data, and then we
932            insert the data shifted down past the skip data: */
933         else {
934           part64 &= ~((((tme_uint64_t) _tme_memory_type_mask(tme_uint32_t, + 0)) << ((64 - 32) + size_done)) >> 0);
935           part64 |= ((((tme_uint64_t) x) << (64 - 32)) >> 0);
936         }
937 
938         /* loop until we can atomically update this part: */
939         part64 = tme_memory_atomic_cx64(parts64, part64_cmp, part64, rwlock, sizeof(tme_uint64_t));
940       } while (part64 != part64_cmp);
941     }
942   }
943 
944   else
945 
946 #endif /* TME_HAVE_INT64_T */
947 
948   if (host_boundary == sizeof(tme_uint32_t)) {
949 
950     /* prepare to write the first 32-bit part of the memory: */
951     parts32 = (tme_shared tme_uint32_t *) (((unsigned long) mem) & (((unsigned long) 0) - (32 / 8)));
952     size_skip = (((unsigned int) (unsigned long) mem) % (32 / 8)) * 8;
953     size_done = 0;
954 
955     /* write the first 32-bit part of the memory: */
956     part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
957     do {
958       part32_cmp = part32;
959 
960       /* on a little-endian host, we clear with zeroes
961          shifted up past the skip data, and then we
962          insert the data shifted up past the skip data: */
963       if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
964         part32 &= (_tme_memory_type_mask(tme_uint32_t, + 0) ^ (((tme_uint32_t) _tme_memory_type_mask(tme_uint32_t, << 0)) << size_skip));
965         part32 |= (((tme_uint32_t) x) << size_skip);
966       }
967 
968       /* on a big-endian host, we clear with zeroes
969          shifted down past the skip data, and then we
970          insert the data shifted down past the skip data: */
971       else {
972         part32 &= ~(_tme_memory_type_mask(tme_uint32_t, << 0) >> size_skip);
973         part32 |= (x >> ((32 - 32) + size_skip));
974       }
975 
976       /* loop until we can atomically update this part: */
977       part32 = tme_memory_atomic_cx32(parts32, part32_cmp, part32, rwlock, sizeof(tme_uint32_t));
978     } while (part32 != part32_cmp);
979     if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
980       x >>= (32 - size_skip);
981     }
982     else {
983       x <<= (32 - size_skip);
984     }
985     size_done = 32 - size_skip;
986 
987     /* write at most one remaining 32-bit part of the memory: */
988     if (__tme_predict_false(size_done < 32)) {
989 
990       /* make a boundary: */
991       tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
992 
993       /* write the next 32-bit part of the memory: */
994       parts32++;
995       part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
996       do {
997         part32_cmp = part32;
998 
999         /* on a little-endian host, we clear with zeroes
1000            shifted up past the skip data, and then we
1001            insert the data shifted up past the skip data: */
1002         if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1003           part32 &= (_tme_memory_type_mask(tme_uint32_t, + 0) ^ (((tme_uint32_t) _tme_memory_type_mask(tme_uint32_t, << size_done)) << 0));
1004           part32 |= (((tme_uint32_t) x) << 0);
1005         }
1006 
1007         /* on a big-endian host, we clear with zeroes
1008            shifted down past the skip data, and then we
1009            insert the data shifted down past the skip data: */
1010         else {
1011           part32 &= ~(_tme_memory_type_mask(tme_uint32_t, << size_done) >> 0);
1012           part32 |= (x >> ((32 - 32) + 0));
1013         }
1014 
1015         /* loop until we can atomically update this part: */
1016         part32 = tme_memory_atomic_cx32(parts32, part32_cmp, part32, rwlock, sizeof(tme_uint32_t));
1017       } while (part32 != part32_cmp);
1018     }
1019   }
1020 
1021   else if (host_boundary == sizeof(tme_uint16_t)) {
1022 
1023     /* prepare to write the first 16-bit part of the memory: */
1024     parts16 = (tme_shared tme_uint16_t *) (((unsigned long) mem) & (((unsigned long) 0) - (16 / 8)));
1025     size_skip = (((unsigned int) (unsigned long) mem) % (16 / 8)) * 8;
1026     size_done = 0;
1027 
1028     /* write the first 16-bit part of the memory: */
1029     part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
1030     do {
1031       part16_cmp = part16;
1032 
1033       /* on a little-endian host, we clear with zeroes
1034          shifted up past the skip data, and then we
1035          insert the data shifted up past the skip data: */
1036       if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1037         part16 &= (_tme_memory_type_mask(tme_uint16_t, + 0) ^ (((tme_uint16_t) _tme_memory_type_mask(tme_uint32_t, << 0)) << size_skip));
1038         part16 |= (((tme_uint16_t) x) << size_skip);
1039       }
1040 
1041       /* on a big-endian host, we clear with zeroes
1042          shifted down past the skip data, and then we
1043          insert the data shifted down past the skip data: */
1044       else {
1045         part16 &= ~(_tme_memory_type_mask(tme_uint16_t, << 0) >> size_skip);
1046         part16 |= (x >> ((32 - 16) + size_skip));
1047       }
1048 
1049       /* loop until we can atomically update this part: */
1050       part16 = tme_memory_atomic_cx16(parts16, part16_cmp, part16, rwlock, sizeof(tme_uint16_t));
1051     } while (part16 != part16_cmp);
1052     if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1053       x >>= (16 - size_skip);
1054     }
1055     else {
1056       x <<= (16 - size_skip);
1057     }
1058     size_done = 16 - size_skip;
1059 
1060     /* try to write one full 16-bit part of memory: */
1061     if (__tme_predict_true(size_done <= (32 - 16))) {
1062 
1063       /* make a boundary: */
1064       tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
1065 
1066       /* write a full 16-bit part of memory: */
1067       part16 = (x >> ((TME_ENDIAN_NATIVE == TME_ENDIAN_BIG) * (32 - 16)));
1068       parts16++;
1069       tme_memory_atomic_write16(parts16, part16, rwlock, sizeof(tme_uint16_t));
1070       size_done += 16;
1071       if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1072         x >>= 16;
1073       }
1074       else {
1075         x <<= 16;
1076       }
1077     }
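
    /* when the 32-bit value started at an odd address, the first
       read-modify-write covered 8 bits and the full 16-bit write
       above covered the middle 16, leaving a final 8 bits for one
       more read-modify-write below: */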
1078 
1079     /* write at most one remaining 16-bit part of the memory: */
1080     if (__tme_predict_false(size_done < 32)) {
1081 
1082       /* make a boundary: */
1083       tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
1084 
1085       /* write the next 16-bit part of the memory: */
1086       parts16++;
1087       part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
1088       do {
1089         part16_cmp = part16;
1090 
1091         /* on a little-endian host, we clear with zeroes
1092            shifted up past the skip data, and then we
1093            insert the data shifted up past the skip data: */
1094         if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1095           part16 &= (_tme_memory_type_mask(tme_uint16_t, + 0) ^ (((tme_uint16_t) _tme_memory_type_mask(tme_uint32_t, << size_done)) << 0));
1096           part16 |= (((tme_uint16_t) x) << 0);
1097         }
1098 
1099         /* on a big-endian host, we clear with zeroes
1100            shifted down past the skip data, and then we
1101            insert the data shifted down past the skip data: */
1102         else {
1103           part16 &= ~(_tme_memory_type_mask(tme_uint16_t, << size_done) >> 0);
1104           part16 |= (x >> ((32 - 16) + 0));
1105         }
1106 
1107         /* loop until we can atomically update this part: */
1108         part16 = tme_memory_atomic_cx16(parts16, part16_cmp, part16, rwlock, sizeof(tme_uint16_t));
1109       } while (part16 != part16_cmp);
1110       if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1111         x >>= (16 - 0);
1112       }
1113       else {
1114         x <<= (16 - 0);
1115       }
1116     }
1117   }
1118 
1119   else {
1120 
1121     /* prepare to write the first 8-bit part of the memory: */
1122     parts8 = (tme_shared tme_uint8_t *) (((unsigned long) mem) & (((unsigned long) 0) - (8 / 8)));
1123     size_skip = (((unsigned int) (unsigned long) mem) % (8 / 8)) * 8;
1124     size_done = 0;
1125 
1126     /* write the first 8-bit part of the memory: */
1127     part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
1128     do {
1129       part8_cmp = part8;
1130 
1131       /* on a little-endian host, we clear with zeroes
1132          shifted up past the skip data, and then we
1133          insert the data shifted up past the skip data: */
1134       if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1135         part8 &= (_tme_memory_type_mask(tme_uint8_t, + 0) ^ (((tme_uint8_t) _tme_memory_type_mask(tme_uint32_t, << 0)) << size_skip));
1136         part8 |= (((tme_uint8_t) x) << size_skip);
1137       }
1138 
1139       /* on a big-endian host, we clear with zeroes
1140          shifted down past the skip data, and then we
1141          insert the data shifted down past the skip data: */
1142       else {
1143         part8 &= ~(_tme_memory_type_mask(tme_uint8_t, << 0) >> size_skip);
1144         part8 |= (x >> ((32 - 8) + size_skip));
1145       }
1146 
1147       /* loop until we can atomically update this part: */
1148       part8 = tme_memory_atomic_cx8(parts8, part8_cmp, part8, rwlock, sizeof(tme_uint8_t));
1149     } while (part8 != part8_cmp);
1150     if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1151       x >>= (8 - size_skip);
1152     }
1153     else {
1154       x <<= (8 - size_skip);
1155     }
1156     size_done = 8 - size_skip;
1157 
1158     /* write as many full 8-bit parts of the memory as we can: */
1159     for (; size_done <= (32 - 8); ) {
1160 
1161       /* make a boundary: */
1162       tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
1163 
1164       /* write a full 8-bit part of memory: */
1165       part8 = (x >> ((TME_ENDIAN_NATIVE == TME_ENDIAN_BIG) * (32 - 8)));
1166       parts8++;
1167       tme_memory_atomic_write8(parts8, part8, rwlock, sizeof(tme_uint8_t));
1168       size_done += 8;
1169       if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1170         x >>= 8;
1171       }
1172       else {
1173         x <<= 8;
1174       }
1175     }
1176 
1177     /* write at most one remaining 8-bit part of the memory: */
1178     if (__tme_predict_false(size_done < 32)) {
1179 
1180       /* make a boundary: */
1181       tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
1182 
1183       /* write the next 8-bit part of the memory: */
1184       parts8++;
1185       part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
1186       do {
1187         part8_cmp = part8;
1188 
1189         /* on a little-endian host, we clear with zeroes
1190            shifted up past the skip data, and then we
1191            insert the data shifted up past the skip data: */
1192         if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1193           part8 &= (_tme_memory_type_mask(tme_uint8_t, + 0) ^ (((tme_uint8_t) _tme_memory_type_mask(tme_uint32_t, << size_done)) << 0));
1194           part8 |= (((tme_uint8_t) x) << 0);
1195         }
1196 
1197         /* on a big-endian host, we clear with zeroes
1198            shifted down past the skip data, and then we
1199            insert the data shifted down past the skip data: */
1200         else {
1201           part8 &= ~(_tme_memory_type_mask(tme_uint8_t, << size_done) >> 0);
1202           part8 |= (x >> ((32 - 8) + 0));
1203         }
1204 
1205         /* loop until we can atomically update this part: */
1206         part8 = tme_memory_atomic_cx8(parts8, part8_cmp, part8, rwlock, sizeof(tme_uint8_t));
1207       } while (part8 != part8_cmp);
1208       if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1209         x >>= (8 - 0);
1210       }
1211       else {
1212         x <<= (8 - 0);
1213       }
1214     }
1215   }
1216 }
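
/* A minimal usage sketch for the write side, not part of the generated
   output; the buffer, lock, and offset names below are hypothetical: */
#if 0
static void
example_write_reg32(tme_shared tme_uint8_t *bus_memory,
                    tme_rwlock_t *bus_rwlock,
                    unsigned long offset,
                    tme_uint32_t value)
{
  /* the 32-bit value may be misaligned, so use the slow bus write with
     single-byte alignment and an 8-bit bus boundary: */
  tme_memory_bus_write32((tme_shared tme_uint32_t *) (bus_memory + offset),
                         value,
                         bus_rwlock,
                         sizeof(tme_uint8_t),
                         sizeof(tme_uint8_t));
}
#endif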
1217 
1218 #ifdef TME_HAVE_INT64_T
1219 
1220 /* undefine the macro version of tme_memory_bus_read64: */
1221 #undef tme_memory_bus_read64
1222 
1223 /* the bus 64-bit read slow function: */
1224 tme_uint64_t
1225 tme_memory_bus_read64(_tme_const tme_shared tme_uint64_t *mem, tme_rwlock_t *rwlock, unsigned int align_min, unsigned int bus_boundary)
1226 {
1227   const unsigned int host_boundary = TME_MEMORY_BUS_BOUNDARY;
1228   unsigned int size_skip;
1229   unsigned int size_done;
1230   tme_uint64_t x;
1231 #ifdef TME_HAVE_INT64_T
1232   _tme_const tme_shared tme_uint64_t *parts64;
1233   tme_uint64_t part64;
1234 #endif /* TME_HAVE_INT64_T */
1235   _tme_const tme_shared tme_uint32_t *parts32;
1236   tme_uint32_t part32;
1237   _tme_const tme_shared tme_uint16_t *parts16;
1238   tme_uint16_t part16;
1239   _tme_const tme_shared tme_uint8_t *parts8;
1240   tme_uint8_t part8;
1241 
1242   assert (bus_boundary != 0 && bus_boundary <= host_boundary);
1243 
1244 #ifdef TME_HAVE_INT64_T
1245 
1246   if (host_boundary == sizeof(tme_uint64_t)) {
1247 
1248     /* prepare to read the first 64-bit part of the memory: */
1249     parts64 = (_tme_const tme_shared tme_uint64_t *) (((unsigned long) mem) & (((unsigned long) 0) - (64 / 8)));
1250     size_skip = (((unsigned int) (unsigned long) mem) % (64 / 8)) * 8;
1251     size_done = 0;
1252 
1253     /* read the first 64-bit part of the memory: */
1254     part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
1255 
1256     /* on a little-endian host, we shift off the skip
1257        data on the right, and shift the remaining data
1258        up into position in the result: */
1259     if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1260       x = (((tme_uint64_t) (part64 >> size_skip)) << 0);
1261     }
1262 
1263     /* on a big-endian host, we shift off the skip data
1264        on the left, and shift the remaining data down
1265        into position in the result: */
1266     else {
1267       x = ((((tme_uint64_t) part64) << ((64 - 64) + size_skip)) >> 0);
1268     }
1269     size_done = 64 - size_skip;
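
    /* when mem is already 64-bit aligned, size_skip is 0 and size_done
       is 64, so an aligned 64-bit read is just the single atomic read
       above: */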
1270 
1271     /* read at most one remaining 64-bit part of the memory: */
1272     if (__tme_predict_false(size_done < 64)) {
1273 
1274       /* make a boundary: */
1275       tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);
1276 
1277       /* read the next 64-bit part of the memory: */
1278       parts64++;
1279       part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
1280 
1281       /* on a little-endian host, we shift off the skip
1282          data on the right, and shift the remaining data
1283          up into position in the result: */
1284       if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1285         x |= (((tme_uint64_t) (part64 >> 0)) << size_done);
1286       }
1287 
1288       /* on a big-endian host, we shift off the skip data
1289          on the left, and shift the remaining data down
1290          into position in the result: */
1291       else {
1292         x |= ((((tme_uint64_t) part64) << ((64 - 64) + 0)) >> size_done);
1293       }
1294     }
1295   }
1296 
1297   else
1298 
1299 #endif /* TME_HAVE_INT64_T */
1300 
1301   if (host_boundary == sizeof(tme_uint32_t)) {
1302 
1303     /* prepare to read the first 32-bit part of the memory: */
1304     parts32 = (_tme_const tme_shared tme_uint32_t *) (((unsigned long) mem) & (((unsigned long) 0) - (32 / 8)));
1305     size_skip = (((unsigned int) (unsigned long) mem) % (32 / 8)) * 8;
1306     size_done = 0;
1307 
1308     /* read the first 32-bit part of the memory: */
1309     part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
1310 
1311     /* on a little-endian host, we shift off the skip
1312        data on the right, and shift the remaining data
1313        up into position in the result: */
1314     if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1315       x = (((tme_uint64_t) (part32 >> size_skip)) << 0);
1316     }
1317 
1318     /* on a big-endian host, we shift off the skip data
1319        on the left, and shift the remaining data down
1320        into position in the result: */
1321     else {
1322       x = ((((tme_uint64_t) part32) << ((64 - 32) + size_skip)) >> 0);
1323     }
1324     size_done = 32 - size_skip;
1325 
1326     /* read any remaining 32-bit parts of the memory: */
1327     for (; size_done < 64; size_done += 32) {
1328 
1329       /* make a boundary: */
1330       tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);
1331 
1332       /* read the next 32-bit part of the memory: */
1333       parts32++;
1334       part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
1335 
1336       /* on a little-endian host, we shift off the skip
1337          data on the right, and shift the remaining data
1338          up into position in the result: */
1339       if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1340         x |= (((tme_uint64_t) (part32 >> 0)) << size_done);
1341       }
1342 
1343       /* on a big-endian host, we shift off the skip data
1344          on the left, and shift the remaining data down
1345          into position in the result: */
1346       else {
1347         x |= ((((tme_uint64_t) part32) << ((64 - 32) + 0)) >> size_done);
1348       }
1349     }
1350   }
1351 
1352   else if (host_boundary == sizeof(tme_uint16_t)) {
1353 
1354     /* prepare to read the first 16-bit part of the memory: */
1355     parts16 = (_tme_const tme_shared tme_uint16_t *) (((unsigned long) mem) & (((unsigned long) 0) - (16 / 8)));
1356     size_skip = (((unsigned int) (unsigned long) mem) % (16 / 8)) * 8;
1357     size_done = 0;
1358 
1359     /* read the first 16-bit part of the memory: */
1360     part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
1361 
1362     /* on a little-endian host, we shift off the skip
1363        data on the right, and shift the remaining data
1364        up into position in the result: */
1365     if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1366       x = (((tme_uint64_t) (part16 >> size_skip)) << 0);
1367     }
1368 
1369     /* on a big-endian host, we shift off the skip data
1370        on the left, and shift the remaining data down
1371        into position in the result: */
1372     else {
1373       x = ((((tme_uint64_t) part16) << ((64 - 16) + size_skip)) >> 0);
1374     }
1375     size_done = 16 - size_skip;
1376 
1377     /* read any remaining 16-bit parts of the memory: */
1378     for (; size_done < 64; size_done += 16) {
1379 
1380       /* make a boundary: */
1381       tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);
1382 
1383       /* read the next 16-bit part of the memory: */
1384       parts16++;
1385       part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
1386 
1387       /* on a little-endian host, we shift off the skip
1388          data on the right, and shift the remaining data
1389          up into position in the result: */
1390       if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1391         x |= (((tme_uint64_t) (part16 >> 0)) << size_done);
1392       }
1393 
1394       /* on a big-endian host, we shift off the skip data
1395          on the left, and shift the remaining data down
1396          into position in the result: */
1397       else {
1398         x |= ((((tme_uint64_t) part16) << ((64 - 16) + 0)) >> size_done);
1399       }
1400     }
1401   }
1402 
1403   else {
1404 
1405     /* prepare to read the first 8-bit part of the memory: */
1406     parts8 = (_tme_const tme_shared tme_uint8_t *) (((unsigned long) mem) & (((unsigned long) 0) - (8 / 8)));
1407     size_skip = (((unsigned int) (unsigned long) mem) % (8 / 8)) * 8;
1408     size_done = 0;
1409 
1410     /* read the first 8-bit part of the memory: */
1411     part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
1412 
1413     /* on a little-endian host, we shift off the skip
1414        data on the right, and shift the remaining data
1415        up into position in the result: */
1416     if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1417       x = (((tme_uint64_t) (part8 >> size_skip)) << 0);
1418     }
1419 
1420     /* on a big-endian host, we shift off the skip data
1421        on the left, and shift the remaining data down
1422        into position in the result: */
1423     else {
1424       x = ((((tme_uint64_t) part8) << ((64 - 8) + size_skip)) >> 0);
1425     }
1426     size_done = 8 - size_skip;
1427 
1428     /* read any remaining 8-bit parts of the memory: */
1429     for (; size_done < 64; size_done += 8) {
1430 
1431       /* make a boundary: */
1432       tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);
1433 
1434       /* read the next 8-bit part of the memory: */
1435       parts8++;
1436       part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
1437 
1438       /* on a little-endian host, we shift off the skip
1439          data on the right, and shift the remaining data
1440          up into position in the result: */
1441       if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1442         x |= (((tme_uint64_t) (part8 >> 0)) << size_done);
1443       }
1444 
1445       /* on a big-endian host, we shift off the skip data
1446          on the left, and shift the remaining data down
1447          into position in the result: */
1448       else {
1449         x |= ((((tme_uint64_t) part8) << ((64 - 8) + 0)) >> size_done);
1450       }
1451     }
1452   }
1453 
1454   /* return the value read: */
1455   return (x);
1456 }
1457 
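/* the slow bus read above assembles an unaligned value from aligned
   atomic parts.  purely as a hedged illustration of the same
   skip/shift arithmetic, here is a minimal little-endian-only sketch
   using plain (non-atomic) loads; the function name is hypothetical
   and is not part of the generated interface: */
#if 0
static tme_uint64_t
_example_read64_unaligned_le(_tme_const tme_uint8_t *mem)
{
  _tme_const tme_uint64_t *parts;
  unsigned int size_skip;
  tme_uint64_t x;

  /* point at the aligned 64-bit word containing mem, and count the
     bits that precede the wanted data within that word: */
  parts = (_tme_const tme_uint64_t *) (((unsigned long) mem) & (((unsigned long) 0) - (64 / 8)));
  size_skip = (((unsigned int) (unsigned long) mem) % (64 / 8)) * 8;

  /* the first aligned word supplies the low (64 - size_skip) bits: */
  x = (parts[0] >> size_skip);

  /* if the value crosses into the next aligned word, that word
     supplies the remaining high bits: */
  if (size_skip != 0) {
    x |= (parts[1] << (64 - size_skip));
  }
  return (x);
}
#endif
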
1458 /* undefine the macro version of tme_memory_bus_write64: */
1459 #undef tme_memory_bus_write64
1460 
1461 /* the bus 64-bit write slow function: */
1462 void
1463 tme_memory_bus_write64(tme_shared tme_uint64_t *mem, tme_uint64_t x, tme_rwlock_t *rwlock, unsigned int align_min, unsigned int bus_boundary)
1464 {
1465   const unsigned int host_boundary = TME_MEMORY_BUS_BOUNDARY;
1466   unsigned int size_skip;
1467   unsigned int size_done;
1468 #ifdef TME_HAVE_INT64_T
1469   tme_shared tme_uint64_t *parts64;
1470   tme_uint64_t part64;
1471   tme_uint64_t part64_cmp;
1472 #endif /* TME_HAVE_INT64_T */
1473   tme_shared tme_uint32_t *parts32;
1474   tme_uint32_t part32;
1475   tme_uint32_t part32_cmp;
1476   tme_shared tme_uint16_t *parts16;
1477   tme_uint16_t part16;
1478   tme_uint16_t part16_cmp;
1479   tme_shared tme_uint8_t *parts8;
1480   tme_uint8_t part8;
1481   tme_uint8_t part8_cmp;
1482 
1483   assert (bus_boundary != 0 && bus_boundary <= host_boundary);
1484 
1485 #ifdef TME_HAVE_INT64_T
1486 
1487   if (host_boundary == sizeof(tme_uint64_t)) {
1488 
1489     /* prepare to write the first 64-bit part of the memory: */
1490     parts64 = (tme_shared tme_uint64_t *) (((unsigned long) mem) & (((unsigned long) 0) - (64 / 8)));
1491     size_skip = (((unsigned int) (unsigned long) mem) % (64 / 8)) * 8;
1492     size_done = 0;
1493 
1494     /* write the first 64-bit part of the memory: */
1495     part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
1496     do {
1497       part64_cmp = part64;
1498 
1499       /* on a little-endian host, we clear with zeroes
1500          shifted up past the skip data, and then we
1501          insert the data shifted up past the skip data: */
1502       if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1503         part64 &= (_tme_memory_type_mask(tme_uint64_t, + 0) ^ (((tme_uint64_t) _tme_memory_type_mask(tme_uint64_t, << 0)) << size_skip));
1504         part64 |= (((tme_uint64_t) x) << size_skip);
1505       }
1506 
1507       /* on a big-endian host, we clear with zeroes
1508          shifted down past the skip data, and then we
1509          insert the data shifted down past the skip data: */
1510       else {
1511         part64 &= ~(_tme_memory_type_mask(tme_uint64_t, << 0) >> size_skip);
1512         part64 |= (x >> ((64 - 64) + size_skip));
1513       }
1514 
1515       /* loop until we can atomically update this part: */
1516       part64 = tme_memory_atomic_cx64(parts64, part64_cmp, part64, rwlock, sizeof(tme_uint64_t));
1517     } while (part64 != part64_cmp);
1518     if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1519       x >>= (64 - size_skip);
1520     }
1521     else {
1522       x <<= (64 - size_skip);
1523     }
1524     size_done = 64 - size_skip;
1525 
1526     /* write at most one remaining 64-bit part of the memory: */
1527     if (__tme_predict_false(size_done < 64)) {
1528 
1529       /* make a boundary: */
1530       tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
1531 
1532       /* write the next 64-bit part of the memory: */
1533       parts64++;
1534       part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
1535       do {
1536         part64_cmp = part64;
1537 
1538         /* on a little-endian host, we clear with zeroes
1539            shifted up past the skip data, and then we
1540            insert the data shifted up past the skip data: */
1541         if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1542           part64 &= (_tme_memory_type_mask(tme_uint64_t, + 0) ^ (((tme_uint64_t) _tme_memory_type_mask(tme_uint64_t, << size_done)) << 0));
1543           part64 |= (((tme_uint64_t) x) << 0);
1544         }
1545 
1546         /* on a big-endian host, we clear with zeroes
1547            shifted down past the skip data, and then we
1548            insert the data shifted down past the skip data: */
1549         else {
1550           part64 &= ~(_tme_memory_type_mask(tme_uint64_t, << size_done) >> 0);
1551           part64 |= (x >> ((64 - 64) + 0));
1552         }
1553 
1554         /* loop until we can atomically update this part: */
1555         part64 = tme_memory_atomic_cx64(parts64, part64_cmp, part64, rwlock, sizeof(tme_uint64_t));
1556       } while (part64 != part64_cmp);
1557     }
1558   }
1559 
1560   else
1561 
1562 #endif /* TME_HAVE_INT64_T */
1563 
1564   if (host_boundary == sizeof(tme_uint32_t)) {
1565 
1566     /* prepare to write the first 32-bit part of the memory: */
1567     parts32 = (tme_shared tme_uint32_t *) (((unsigned long) mem) & (((unsigned long) 0) - (32 / 8)));
1568     size_skip = (((unsigned int) (unsigned long) mem) % (32 / 8)) * 8;
1569     size_done = 0;
1570 
1571     /* write the first 32-bit part of the memory: */
1572     part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
1573     do {
1574       part32_cmp = part32;
1575 
1576       /* on a little-endian host, we clear with zeroes
1577          shifted up past the skip data, and then we
1578          insert the data shifted up past the skip data: */
1579       if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1580         part32 &= (_tme_memory_type_mask(tme_uint32_t, + 0) ^ (((tme_uint32_t) _tme_memory_type_mask(tme_uint64_t, << 0)) << size_skip));
1581         part32 |= (((tme_uint32_t) x) << size_skip);
1582       }
1583 
1584       /* on a big-endian host, we clear with zeroes
1585          shifted down past the skip data, and then we
1586          insert the data shifted down past the skip data: */
1587       else {
1588         part32 &= ~(_tme_memory_type_mask(tme_uint32_t, << 0) >> size_skip);
1589         part32 |= (x >> ((64 - 32) + size_skip));
1590       }
1591 
1592       /* loop until we can atomically update this part: */
1593       part32 = tme_memory_atomic_cx32(parts32, part32_cmp, part32, rwlock, sizeof(tme_uint32_t));
1594     } while (part32 != part32_cmp);
1595     if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1596       x >>= (32 - size_skip);
1597     }
1598     else {
1599       x <<= (32 - size_skip);
1600     }
1601     size_done = 32 - size_skip;
1602 
1603     /* try to write one full 32-bit part of memory: */
1604     if (__tme_predict_true(size_done <= (64 - 32))) {
1605 
1606       /* make a boundary: */
1607       tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
1608 
1609       /* write a full 32-bit part of memory: */
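      /* the shift count below, ((TME_ENDIAN_NATIVE == TME_ENDIAN_BIG) * (64 - 32)),
         is 32 on a big-endian host, where the next data to store is in
         the high half of x, and zero on a little-endian host, where it
         is in the low half: */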
1610       part32 = (x >> ((TME_ENDIAN_NATIVE == TME_ENDIAN_BIG) * (64 - 32)));
1611       parts32++;
1612       tme_memory_atomic_write32(parts32, part32, rwlock, sizeof(tme_uint32_t));
1613       size_done += 32;
1614       if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1615         x >>= 32;
1616       }
1617       else {
1618         x <<= 32;
1619       }
1620     }
1621 
1622     /* write at most one remaining 32-bit part of the memory: */
1623     if (__tme_predict_false(size_done < 64)) {
1624 
1625       /* make a boundary: */
1626       tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
1627 
1628       /* write the next 32-bit part of the memory: */
1629       parts32++;
1630       part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
1631       do {
1632         part32_cmp = part32;
1633 
1634         /* on a little-endian host, we clear with zeroes
1635            shifted up past the skip data, and then we
1636            insert the data shifted up past the skip data: */
1637         if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1638           part32 &= (_tme_memory_type_mask(tme_uint32_t, + 0) ^ (((tme_uint32_t) _tme_memory_type_mask(tme_uint64_t, << size_done)) << 0));
1639           part32 |= (((tme_uint32_t) x) << 0);
1640         }
1641 
1642         /* on a big-endian host, we clear with zeroes
1643            shifted down past the skip data, and then we
1644            insert the data shifted down past the skip data: */
1645         else {
1646           part32 &= ~(_tme_memory_type_mask(tme_uint32_t, << size_done) >> 0);
1647           part32 |= (x >> ((64 - 32) + 0));
1648         }
1649 
1650         /* loop until we can atomically update this part: */
1651         part32 = tme_memory_atomic_cx32(parts32, part32_cmp, part32, rwlock, sizeof(tme_uint32_t));
1652       } while (part32 != part32_cmp);
1653       if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1654         x >>= (32 - 0);
1655       }
1656       else {
1657         x <<= (32 - 0);
1658       }
1659     }
1660   }
1661 
1662   else if (host_boundary == sizeof(tme_uint16_t)) {
1663 
1664     /* prepare to write the first 16-bit part of the memory: */
1665     parts16 = (tme_shared tme_uint16_t *) (((unsigned long) mem) & (((unsigned long) 0) - (16 / 8)));
1666     size_skip = (((unsigned int) (unsigned long) mem) % (16 / 8)) * 8;
1667     size_done = 0;
1668 
1669     /* write the first 16-bit part of the memory: */
1670     part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
1671     do {
1672       part16_cmp = part16;
1673 
1674       /* on a little-endian host, we clear with zeroes
1675          shifted up past the skip data, and then we
1676          insert the data shifted up past the skip data: */
1677       if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1678         part16 &= (_tme_memory_type_mask(tme_uint16_t, + 0) ^ (((tme_uint16_t) _tme_memory_type_mask(tme_uint64_t, << 0)) << size_skip));
1679         part16 |= (((tme_uint16_t) x) << size_skip);
1680       }
1681 
1682       /* on a big-endian host, we clear with zeroes
1683          shifted down past the skip data, and then we
1684          insert the data shifted down past the skip data: */
1685       else {
1686         part16 &= ~(_tme_memory_type_mask(tme_uint16_t, << 0) >> size_skip);
1687         part16 |= (x >> ((64 - 16) + size_skip));
1688       }
1689 
1690       /* loop until we can atomically update this part: */
1691       part16 = tme_memory_atomic_cx16(parts16, part16_cmp, part16, rwlock, sizeof(tme_uint16_t));
1692     } while (part16 != part16_cmp);
1693     if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1694       x >>= (16 - size_skip);
1695     }
1696     else {
1697       x <<= (16 - size_skip);
1698     }
1699     size_done = 16 - size_skip;
1700 
1701     /* write as many full 16-bit parts of the memory as we can: */
1702     for (; size_done <= (64 - 16); ) {
1703 
1704       /* make a boundary: */
1705       tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
1706 
1707       /* write a full 16-bit part of memory: */
1708       part16 = (x >> ((TME_ENDIAN_NATIVE == TME_ENDIAN_BIG) * (64 - 16)));
1709       parts16++;
1710       tme_memory_atomic_write16(parts16, part16, rwlock, sizeof(tme_uint16_t));
1711       size_done += 16;
1712       if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1713         x >>= 16;
1714       }
1715       else {
1716         x <<= 16;
1717       }
1718     }
1719 
1720     /* write at most one remaining 16-bit part of the memory: */
1721     if (__tme_predict_false(size_done < 64)) {
1722 
1723       /* make a boundary: */
1724       tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
1725 
1726       /* write the next 16-bit part of the memory: */
1727       parts16++;
1728       part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
1729       do {
1730         part16_cmp = part16;
1731 
1732         /* on a little-endian host, we clear with zeroes
1733            shifted up past the skip data, and then we
1734            insert the data shifted up past the skip data: */
1735         if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1736           part16 &= (_tme_memory_type_mask(tme_uint16_t, + 0) ^ (((tme_uint16_t) _tme_memory_type_mask(tme_uint64_t, << size_done)) << 0));
1737           part16 |= (((tme_uint16_t) x) << 0);
1738         }
1739 
1740         /* on a big-endian host, we clear with zeroes
1741            shifted down past the skip data, and then we
1742            insert the data shifted down past the skip data: */
1743         else {
1744           part16 &= ~(_tme_memory_type_mask(tme_uint16_t, << size_done) >> 0);
1745           part16 |= (x >> ((64 - 16) + 0));
1746         }
1747 
1748         /* loop until we can atomically update this part: */
1749         part16 = tme_memory_atomic_cx16(parts16, part16_cmp, part16, rwlock, sizeof(tme_uint16_t));
1750       } while (part16 != part16_cmp);
1751       if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1752         x >>= (16 - 0);
1753       }
1754       else {
1755         x <<= (16 - 0);
1756       }
1757     }
1758   }
1759 
1760   else {
1761 
1762     /* prepare to write the first 8-bit part of the memory: */
1763     parts8 = (tme_shared tme_uint8_t *) (((unsigned long) mem) & (((unsigned long) 0) - (8 / 8)));
1764     size_skip = (((unsigned int) (unsigned long) mem) % (8 / 8)) * 8;
1765     size_done = 0;
1766 
1767     /* write the first 8-bit part of the memory: */
1768     part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
1769     do {
1770       part8_cmp = part8;
1771 
1772       /* on a little-endian host, we clear with zeroes
1773          shifted up past the skip data, and then we
1774          insert the data shifted up past the skip data: */
1775       if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1776         part8 &= (_tme_memory_type_mask(tme_uint8_t, + 0) ^ (((tme_uint8_t) _tme_memory_type_mask(tme_uint64_t, << 0)) << size_skip));
1777         part8 |= (((tme_uint8_t) x) << size_skip);
1778       }
1779 
1780       /* on a big-endian host, we clear with zeroes
1781          shifted down past the skip data, and then we
1782          insert the data shifted down past the skip data: */
1783       else {
1784         part8 &= ~(_tme_memory_type_mask(tme_uint8_t, << 0) >> size_skip);
1785         part8 |= (x >> ((64 - 8) + size_skip));
1786       }
1787 
1788       /* loop until we can atomically update this part: */
1789       part8 = tme_memory_atomic_cx8(parts8, part8_cmp, part8, rwlock, sizeof(tme_uint8_t));
1790     } while (part8 != part8_cmp);
1791     if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1792       x >>= (8 - size_skip);
1793     }
1794     else {
1795       x <<= (8 - size_skip);
1796     }
1797     size_done = 8 - size_skip;
1798 
1799     /* write as many full 8-bit parts of the memory as we can: */
1800     for (; size_done <= (64 - 8); ) {
1801 
1802       /* make a boundary: */
1803       tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
1804 
1805       /* write a full 8-bit part of memory: */
1806       part8 = (x >> ((TME_ENDIAN_NATIVE == TME_ENDIAN_BIG) * (64 - 8)));
1807       parts8++;
1808       tme_memory_atomic_write8(parts8, part8, rwlock, sizeof(tme_uint8_t));
1809       size_done += 8;
1810       if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1811         x >>= 8;
1812       }
1813       else {
1814         x <<= 8;
1815       }
1816     }
1817 
1818     /* write at most one remaining 8-bit part of the memory: */
1819     if (__tme_predict_false(size_done < 64)) {
1820 
1821       /* make a boundary: */
1822       tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
1823 
1824       /* write the next 8-bit part of the memory: */
1825       parts8++;
1826       part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
1827       do {
1828         part8_cmp = part8;
1829 
1830         /* on a little-endian host, we clear with zeroes
1831            shifted up past the skip data, and then we
1832            insert the data shifted up past the skip data: */
1833         if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1834           part8 &= (_tme_memory_type_mask(tme_uint8_t, + 0) ^ (((tme_uint8_t) _tme_memory_type_mask(tme_uint64_t, << size_done)) << 0));
1835           part8 |= (((tme_uint8_t) x) << 0);
1836         }
1837 
1838         /* on a big-endian host, we clear with zeroes
1839            shifted down past the skip data, and then we
1840            insert the data shifted down past the skip data: */
1841         else {
1842           part8 &= ~(_tme_memory_type_mask(tme_uint8_t, << size_done) >> 0);
1843           part8 |= (x >> ((64 - 8) + 0));
1844         }
1845 
1846         /* loop until we can atomically update this part: */
1847         part8 = tme_memory_atomic_cx8(parts8, part8_cmp, part8, rwlock, sizeof(tme_uint8_t));
1848       } while (part8 != part8_cmp);
1849       if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1850         x >>= (8 - 0);
1851       }
1852       else {
1853         x <<= (8 - 0);
1854       }
1855     }
1856   }
1857 }
1858 
1859 #endif /* TME_HAVE_INT64_T */
1860 
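/* every partial write above (the first and last boundary parts) goes
   through the same read/modify/compare-and-exchange loop so that the
   bytes outside the written region are never disturbed.  a hedged
   sketch of that pattern, reusing the atomic calls already used
   above; the helper name and its keep_mask/new_bits parameters are
   hypothetical: */
#if 0
static void
_example_cx_merge64(tme_shared tme_uint64_t *part_pointer,
                    tme_uint64_t keep_mask,
                    tme_uint64_t new_bits,
                    tme_rwlock_t *rwlock)
{
  tme_uint64_t part;
  tme_uint64_t part_cmp;

  /* keep the bytes selected by keep_mask, insert new_bits elsewhere,
     and retry until no other thread has changed the part under us: */
  part = tme_memory_atomic_read64(part_pointer, rwlock, sizeof(tme_uint64_t));
  do {
    part_cmp = part;
    part = (part & keep_mask) | new_bits;
    part = tme_memory_atomic_cx64(part_pointer, part_cmp, part, rwlock, sizeof(tme_uint64_t));
  } while (part != part_cmp);
}
#endif
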
1861 /* undefine the macro version of tme_memory_bus_read_buffer: */
1862 #undef tme_memory_bus_read_buffer
1863 
1864 /* the bus read buffer function: */
1865 void
1866 tme_memory_bus_read_buffer(_tme_const tme_shared tme_uint8_t *mem, tme_uint8_t *buffer, unsigned long count, tme_rwlock_t *rwlock, unsigned int align_min, unsigned int bus_boundary)
1867 {
1868   const unsigned int host_boundary = TME_MEMORY_BUS_BOUNDARY;
1869   _tme_const tme_uint8_t *part_buffer;
1870   unsigned int count_done;
1871   unsigned int count_misaligned;
1872   unsigned int bits_misaligned;
1873 #ifdef TME_HAVE_INT64_T
1874   _tme_const tme_shared tme_uint64_t *parts64;
1875   tme_uint64_t part64_buffer;
1876   tme_uint64_t part64;
1877   tme_uint64_t part64_next;
1878 #endif /* TME_HAVE_INT64_T */
1879   _tme_const tme_shared tme_uint32_t *parts32;
1880   tme_uint32_t part32_buffer;
1881   tme_uint32_t part32;
1882   tme_uint32_t part32_next;
1883   _tme_const tme_shared tme_uint16_t *parts16;
1884   tme_uint16_t part16_buffer;
1885   tme_uint16_t part16;
1886   tme_uint16_t part16_next;
1887   _tme_const tme_shared tme_uint8_t *parts8;
1888   tme_uint8_t part8_buffer;
1889   tme_uint8_t part8;
1890   tme_uint8_t part8_next;
1891 
1892   assert (count != 0);
1893   assert (bus_boundary != 0);
1894 
1895   /* if we are locking for all memory accesses, lock memory
1896      around a memcpy: */
1897   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
1898     tme_rwlock_rdlock(rwlock);
1899     memcpy((buffer), ((_tme_const tme_uint8_t *) (mem)), (count));
1900     tme_rwlock_unlock(rwlock);
1901   }
1902 
1903   /* otherwise, if the emulated bus boundary is greater than the
1904      host's bus boundary, we are forced to stop all other threads
1905      around a memcpy: */
1906   else if (__tme_predict_false(bus_boundary == 0
1907                                || bus_boundary > host_boundary)) {
1908     tme_thread_suspend_others();
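    /* the "+ (0 && align_min)" term below adds zero; it is presumably
       only there so that align_min is referenced and does not trigger
       an unused-parameter warning: */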
1909     memcpy((buffer), ((_tme_const tme_uint8_t *) (mem)), (count) + (0 && align_min));
1910     tme_thread_resume_others();
1911   }
1912 
1913 #ifdef TME_HAVE_INT64_T
1914 
1915   else if (host_boundary == sizeof(tme_uint64_t)) {
1916 
1917     /* make a 64-bit pointer to the memory: */
1918     parts64 = (_tme_const tme_shared tme_uint64_t *) mem;
1919 
1920     /* if this pointer is not 64-bit aligned: */
1921     if (__tme_predict_false((((unsigned long) parts64) % sizeof(tme_uint64_t)) != 0)) {
1922 
1923       /* get the misalignment from the previous 64-bit boundary: */
1924       count_misaligned = ((unsigned long) parts64) % sizeof(tme_uint64_t);
1925 
1926       /* truncate this pointer to the previous 64-bit boundary: */
1927       parts64 = (_tme_const tme_shared tme_uint64_t *) (((unsigned long) parts64) & (((unsigned long) 0) - sizeof(tme_uint64_t)));
1928 
1929       /* get the number of bytes to read in the first 64-bit memory part: */
1930       count_done = sizeof(tme_uint64_t) - count_misaligned;
1931       if (__tme_predict_false(count_done > count)) {
1932         count_done = count;
1933       }
1934 
1935       /* read the first 64-bit memory part: */
1936       part64_buffer = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
1937       parts64++;
1938 
1939       /* copy to the buffer the bytes to read in the first
1940          64-bit memory part: */
1941       part_buffer = ((tme_uint8_t *) &part64_buffer) + count_misaligned;
1942       count -= count_done;
1943       do {
1944         *buffer = *part_buffer;
1945         part_buffer++;
1946         buffer++;
1947       } while (--count_done != 0);
1948     }
1949 
1950     /* if we have full 64-bit parts to read: */
1951     if (__tme_predict_true(count >= sizeof(tme_uint64_t))) {
1952 
1953       /* if the buffer is 64-bit aligned: */
1954       if (__tme_predict_true((((unsigned long) buffer) % sizeof(tme_uint64_t)) == 0)) {
1955 
1956         /* read full 64-bit parts without shifting: */
1957         do {
1958           part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
1959           tme_memory_write64((tme_uint64_t *) buffer, part64, sizeof(tme_uint64_t));
1960 
1961           /* advance: */
1962           parts64++;
1963           buffer += sizeof(tme_uint64_t);
1964           count -= sizeof(tme_uint64_t);
1965         } while (count >= sizeof(tme_uint64_t));
1966       }
1967 
1968       /* otherwise, the buffer is not 64-bit aligned: */
1969       else {
1970 
1971         /* get the misalignment to the next 64-bit boundary: */
1972         count_misaligned = (sizeof(tme_uint64_t) - ((unsigned int) (unsigned long) buffer)) % sizeof(tme_uint64_t);
1973 
1974         /* read the next 64-bit memory part: */
1975         part64_buffer = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
1976         parts64++;
1977 
1978         /* copy to the buffer until it is aligned: */
1979         part_buffer = ((_tme_const tme_uint8_t *) &part64_buffer);
1980         count_done = count_misaligned;
1981         count -= count_misaligned;
1982         do {
1983           *buffer = *part_buffer;
1984           part_buffer++;
1985           buffer++;
1986         } while (--count_done != 0);
1987 
1988         /* read full 64-bit words with shifting: */
1989         bits_misaligned = count_misaligned * 8;
1990         part64
1991           = (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
1992              ? (part64_buffer >> bits_misaligned)
1993              : (part64_buffer << bits_misaligned));
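        /* for example, with count_misaligned == 3 on a little-endian
           host, part64 now holds the five not-yet-stored bytes of the
           previous part in its low 40 bits; each pass below takes
           three more bytes from the next part to complete a full
           64-bit store: */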
1994         for (; count >= sizeof(tme_uint64_t); ) {
1995           part64_next = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
1996           if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
1997             part64 |= (part64_next << (64 - bits_misaligned));
1998             tme_memory_write64((tme_uint64_t *) buffer, part64, sizeof(tme_uint64_t));
1999             part64 = (part64_next >> bits_misaligned);
2000           }
2001           else {
2002             part64 |= (part64_next >> (64 - bits_misaligned));
2003             tme_memory_write64((tme_uint64_t *) buffer, part64, sizeof(tme_uint64_t));
2004             part64 = (part64_next << bits_misaligned);
2005           }
2006 
2007           /* advance: */
2008           parts64++;
2009           buffer += sizeof(tme_uint64_t);
2010           count -= sizeof(tme_uint64_t);
2011         }
2012 
2013         /* calculate how many more bytes there are to read in this
2014            64-bit memory part: */
2015         count_done = sizeof(tme_uint64_t) - count_misaligned;
2016         part64_buffer = part64;
2017 
2018         /* copy to the buffer the remaining bytes in this 64-bit part: */
2019         if (count_done > count) {
2020           count_done = count;
2021         }
2022         part_buffer = ((_tme_const tme_uint8_t *) &part64_buffer);
2023         count -= count_done;
2024         do {
2025           *buffer = *part_buffer;
2026           part_buffer++;
2027           buffer++;
2028         } while (--count_done != 0);
2029       }
2030     }
2031 
2032     /* if we still have bytes to read: */
2033     if (__tme_predict_false(count > 0)) {
2034 
2035       /* we must have less than a full 64-bit part to read: */
2036       assert (count < sizeof(tme_uint64_t));
2037 
2038       /* read the last 64-bit memory part: */
2039       part64_buffer = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
2040 
2041       /* copy to the buffer the bytes to read in the last
2042          64-bit memory part: */
2043       part_buffer = ((_tme_const tme_uint8_t *) &part64_buffer);
2044       count_done = count;
2045       do {
2046         *buffer = *part_buffer;
2047         part_buffer++;
2048         buffer++;
2049       } while (--count_done != 0);
2050     }
2051 
2052   }
2053 
2054 #endif /* TME_HAVE_INT64_T */
2055 
2056   else if (host_boundary == sizeof(tme_uint32_t)) {
2057 
2058     /* make a 32-bit pointer to the memory: */
2059     parts32 = (_tme_const tme_shared tme_uint32_t *) mem;
2060 
2061     /* if this pointer is not 32-bit aligned: */
2062     if (__tme_predict_false((((unsigned long) parts32) % sizeof(tme_uint32_t)) != 0)) {
2063 
2064       /* get the misalignment from the previous 32-bit boundary: */
2065       count_misaligned = ((unsigned long) parts32) % sizeof(tme_uint32_t);
2066 
2067       /* truncate this pointer to the previous 32-bit boundary: */
2068       parts32 = (_tme_const tme_shared tme_uint32_t *) (((unsigned long) parts32) & (((unsigned long) 0) - sizeof(tme_uint32_t)));
2069 
2070       /* get the number of bytes to read in the first 32-bit memory part: */
2071       count_done = sizeof(tme_uint32_t) - count_misaligned;
2072       if (__tme_predict_false(count_done > count)) {
2073         count_done = count;
2074       }
2075 
2076       /* read the first 32-bit memory part: */
2077       part32_buffer = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
2078       parts32++;
2079 
2080       /* copy to the buffer the bytes to read in the first
2081          32-bit memory part: */
2082       part_buffer = ((tme_uint8_t *) &part32_buffer) + count_misaligned;
2083       count -= count_done;
2084       do {
2085         *buffer = *part_buffer;
2086         part_buffer++;
2087         buffer++;
2088       } while (--count_done != 0);
2089     }
2090 
2091     /* if we have full 32-bit parts to read: */
2092     if (__tme_predict_true(count >= sizeof(tme_uint32_t))) {
2093 
2094       /* if the buffer is 32-bit aligned: */
2095       if (__tme_predict_true((((unsigned long) buffer) % sizeof(tme_uint32_t)) == 0)) {
2096 
2097         /* read full 32-bit parts without shifting: */
2098         do {
2099           part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
2100           tme_memory_write32((tme_uint32_t *) buffer, part32, sizeof(tme_uint32_t));
2101 
2102           /* advance: */
2103           parts32++;
2104           buffer += sizeof(tme_uint32_t);
2105           count -= sizeof(tme_uint32_t);
2106         } while (count >= sizeof(tme_uint32_t));
2107       }
2108 
2109       /* otherwise, the buffer is not 32-bit aligned: */
2110       else {
2111 
2112         /* get the misalignment to the next 32-bit boundary: */
2113         count_misaligned = (sizeof(tme_uint32_t) - ((unsigned int) (unsigned long) buffer)) % sizeof(tme_uint32_t);
2114 
2115         /* read the next 32-bit memory part: */
2116         part32_buffer = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
2117         parts32++;
2118 
2119         /* copy to the buffer until it is aligned: */
2120         part_buffer = ((_tme_const tme_uint8_t *) &part32_buffer);
2121         count_done = count_misaligned;
2122         count -= count_misaligned;
2123         do {
2124           *buffer = *part_buffer;
2125           part_buffer++;
2126           buffer++;
2127         } while (--count_done != 0);
2128 
2129         /* read full 32-bit words with shifting: */
2130         bits_misaligned = count_misaligned * 8;
2131         part32
2132           = (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
2133              ? (part32_buffer >> bits_misaligned)
2134              : (part32_buffer << bits_misaligned));
2135         for (; count >= sizeof(tme_uint32_t); ) {
2136           part32_next = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
2137           if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
2138             part32 |= (part32_next << (32 - bits_misaligned));
2139             tme_memory_write32((tme_uint32_t *) buffer, part32, sizeof(tme_uint32_t));
2140             part32 = (part32_next >> bits_misaligned);
2141           }
2142           else {
2143             part32 |= (part32_next >> (32 - bits_misaligned));
2144             tme_memory_write32((tme_uint32_t *) buffer, part32, sizeof(tme_uint32_t));
2145             part32 = (part32_next << bits_misaligned);
2146           }
2147 
2148           /* advance: */
2149           parts32++;
2150           buffer += sizeof(tme_uint32_t);
2151           count -= sizeof(tme_uint32_t);
2152         }
2153 
2154         /* calculate how many more bytes there are to read in this
2155            32-bit memory part: */
2156         count_done = sizeof(tme_uint32_t) - count_misaligned;
2157         part32_buffer = part32;
2158 
2159         /* copy to the buffer the remaining bytes in this 32-bit part: */
2160         if (count_done > count) {
2161           count_done = count;
2162         }
2163         part_buffer = ((_tme_const tme_uint8_t *) &part32_buffer);
2164         count -= count_done;
2165         do {
2166           *buffer = *part_buffer;
2167           part_buffer++;
2168           buffer++;
2169         } while (--count_done != 0);
2170       }
2171     }
2172 
2173     /* if we still have bytes to read: */
2174     if (__tme_predict_false(count > 0)) {
2175 
2176       /* we must have less than a full 32-bit part to read: */
2177       assert (count < sizeof(tme_uint32_t));
2178 
2179       /* read the last 32-bit memory part: */
2180       part32_buffer = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
2181 
2182       /* copy to the buffer the bytes to read in the last
2183          32-bit memory part: */
2184       part_buffer = ((_tme_const tme_uint8_t *) &part32_buffer);
2185       count_done = count;
2186       do {
2187         *buffer = *part_buffer;
2188         part_buffer++;
2189         buffer++;
2190       } while (--count_done != 0);
2191     }
2192 
2193   }
2194 
2195   else if (host_boundary == sizeof(tme_uint16_t)) {
2196 
2197     /* make a 16-bit pointer to the memory: */
2198     parts16 = (_tme_const tme_shared tme_uint16_t *) mem;
2199 
2200     /* if this pointer is not 16-bit aligned: */
2201     if (__tme_predict_false((((unsigned long) parts16) % sizeof(tme_uint16_t)) != 0)) {
2202 
2203       /* get the misalignment from the previous 16-bit boundary: */
2204       count_misaligned = ((unsigned long) parts16) % sizeof(tme_uint16_t);
2205 
2206       /* truncate this pointer to the previous 16-bit boundary: */
2207       parts16 = (_tme_const tme_shared tme_uint16_t *) (((unsigned long) parts16) & (((unsigned long) 0) - sizeof(tme_uint16_t)));
2208 
2209       /* get the number of bytes to read in the first 16-bit memory part: */
2210       count_done = sizeof(tme_uint16_t) - count_misaligned;
2211       if (__tme_predict_false(count_done > count)) {
2212         count_done = count;
2213       }
2214 
2215       /* read the first 16-bit memory part: */
2216       part16_buffer = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
2217       parts16++;
2218 
2219       /* copy to the buffer the bytes to read in the first
2220          16-bit memory part: */
2221       part_buffer = ((tme_uint8_t *) &part16_buffer) + count_misaligned;
2222       count -= count_done;
2223       do {
2224         *buffer = *part_buffer;
2225         part_buffer++;
2226         buffer++;
2227       } while (--count_done != 0);
2228     }
2229 
2230     /* if we have full 16-bit parts to read: */
2231     if (__tme_predict_true(count >= sizeof(tme_uint16_t))) {
2232 
2233       /* if the buffer is 16-bit aligned: */
2234       if (__tme_predict_true((((unsigned long) buffer) % sizeof(tme_uint16_t)) == 0)) {
2235 
2236         /* read full 16-bit parts without shifting: */
2237         do {
2238           part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
2239           tme_memory_write16((tme_uint16_t *) buffer, part16, sizeof(tme_uint16_t));
2240 
2241           /* advance: */
2242           parts16++;
2243           buffer += sizeof(tme_uint16_t);
2244           count -= sizeof(tme_uint16_t);
2245         } while (count >= sizeof(tme_uint16_t));
2246       }
2247 
2248       /* otherwise, the buffer is not 16-bit aligned: */
2249       else {
2250 
2251         /* get the misalignment to the next 16-bit boundary: */
2252         count_misaligned = (sizeof(tme_uint16_t) - ((unsigned int) (unsigned long) buffer)) % sizeof(tme_uint16_t);
2253 
2254         /* read the next 16-bit memory part: */
2255         part16_buffer = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
2256         parts16++;
2257 
2258         /* copy to the buffer until it is aligned: */
2259         part_buffer = ((_tme_const tme_uint8_t *) &part16_buffer);
2260         count_done = count_misaligned;
2261         count -= count_misaligned;
2262         do {
2263           *buffer = *part_buffer;
2264           part_buffer++;
2265           buffer++;
2266         } while (--count_done != 0);
2267 
2268         /* read full 16-bit words with shifting: */
2269         bits_misaligned = count_misaligned * 8;
2270         part16
2271           = (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
2272              ? (part16_buffer >> bits_misaligned)
2273              : (part16_buffer << bits_misaligned));
2274         for (; count >= sizeof(tme_uint16_t); ) {
2275           part16_next = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
2276           if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
2277             part16 |= (part16_next << (16 - bits_misaligned));
2278             tme_memory_write16((tme_uint16_t *) buffer, part16, sizeof(tme_uint16_t));
2279             part16 = (part16_next >> bits_misaligned);
2280           }
2281           else {
2282             part16 |= (part16_next >> (16 - bits_misaligned));
2283             tme_memory_write16((tme_uint16_t *) buffer, part16, sizeof(tme_uint16_t));
2284             part16 = (part16_next << bits_misaligned);
2285           }
2286 
2287           /* advance: */
2288           parts16++;
2289           buffer += sizeof(tme_uint16_t);
2290           count -= sizeof(tme_uint16_t);
2291         }
2292 
2293         /* calculate how many more bytes there are to read in this
2294            16-bit memory part: */
2295         count_done = sizeof(tme_uint16_t) - count_misaligned;
2296         part16_buffer = part16;
2297 
2298         /* copy to the buffer the remaining bytes in this 16-bit part: */
2299         if (count_done > count) {
2300           count_done = count;
2301         }
2302         part_buffer = ((_tme_const tme_uint8_t *) &part16_buffer);
2303         count -= count_done;
2304         do {
2305           *buffer = *part_buffer;
2306           part_buffer++;
2307           buffer++;
2308         } while (--count_done != 0);
2309       }
2310     }
2311 
2312     /* if we still have bytes to read: */
2313     if (__tme_predict_false(count > 0)) {
2314 
2315       /* we must have less than a full 16-bit part to read: */
2316       assert (count < sizeof(tme_uint16_t));
2317 
2318       /* read the last 16-bit memory part: */
2319       part16_buffer = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
2320 
2321       /* copy to the buffer the bytes to read in the last
2322          16-bit memory part: */
2323       part_buffer = ((_tme_const tme_uint8_t *) &part16_buffer);
2324       count_done = count;
2325       do {
2326         *buffer = *part_buffer;
2327         part_buffer++;
2328         buffer++;
2329       } while (--count_done != 0);
2330     }
2331 
2332   }
2333 
2334   else {
2335 
2336     /* make an 8-bit pointer to the memory: */
2337     parts8 = (_tme_const tme_shared tme_uint8_t *) mem;
2338 
2339     /* if this pointer is not 8-bit aligned: */
2340     if (__tme_predict_false((((unsigned long) parts8) % sizeof(tme_uint8_t)) != 0)) {
2341 
2342       /* get the misalignment from the previous 8-bit boundary: */
2343       count_misaligned = ((unsigned long) parts8) % sizeof(tme_uint8_t);
2344 
2345       /* truncate this pointer to the previous 8-bit boundary: */
2346       parts8 = (_tme_const tme_shared tme_uint8_t *) (((unsigned long) parts8) & (((unsigned long) 0) - sizeof(tme_uint8_t)));
2347 
2348       /* get the number of bytes to read in the first 8-bit memory part: */
2349       count_done = sizeof(tme_uint8_t) - count_misaligned;
2350       if (__tme_predict_false(count_done > count)) {
2351         count_done = count;
2352       }
2353 
2354       /* read the first 8-bit memory part: */
2355       part8_buffer = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
2356       parts8++;
2357 
2358       /* copy to the buffer the bytes to read in the first
2359          8-bit memory part: */
2360       part_buffer = ((tme_uint8_t *) &part8_buffer) + count_misaligned;
2361       count -= count_done;
2362       do {
2363         *buffer = *part_buffer;
2364         part_buffer++;
2365         buffer++;
2366       } while (--count_done != 0);
2367     }
2368 
2369     /* if we have full 8-bit parts to read: */
2370     if (__tme_predict_true(count >= sizeof(tme_uint8_t))) {
2371 
2372       /* if the buffer is 8-bit aligned: */
2373       if (__tme_predict_true((((unsigned long) buffer) % sizeof(tme_uint8_t)) == 0)) {
2374 
2375         /* read full 8-bit parts without shifting: */
2376         do {
2377           part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
2378           tme_memory_write8((tme_uint8_t *) buffer, part8, sizeof(tme_uint8_t));
2379 
2380           /* advance: */
2381           parts8++;
2382           buffer += sizeof(tme_uint8_t);
2383           count -= sizeof(tme_uint8_t);
2384         } while (count >= sizeof(tme_uint8_t));
2385       }
2386 
2387       /* otherwise, the buffer is not 8-bit aligned: */
2388       else {
2389 
2390         /* get the misalignment to the next 8-bit boundary: */
2391         count_misaligned = (sizeof(tme_uint8_t) - ((unsigned int) (unsigned long) buffer)) % sizeof(tme_uint8_t);
2392 
2393         /* read the next 8-bit memory part: */
2394         part8_buffer = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
2395         parts8++;
2396 
2397         /* copy to the buffer until it is aligned: */
2398         part_buffer = ((_tme_const tme_uint8_t *) &part8_buffer);
2399         count_done = count_misaligned;
2400         count -= count_misaligned;
2401         do {
2402           *buffer = *part_buffer;
2403           part_buffer++;
2404           buffer++;
2405         } while (--count_done != 0);
2406 
2407         /* read full 8-bit words with shifting: */
2408         bits_misaligned = count_misaligned * 8;
2409         part8
2410           = (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
2411              ? (part8_buffer >> bits_misaligned)
2412              : (part8_buffer << bits_misaligned));
2413         for (; count >= sizeof(tme_uint8_t); ) {
2414           part8_next = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
2415           if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
2416             part8 |= (part8_next << (8 - bits_misaligned));
2417             tme_memory_write8((tme_uint8_t *) buffer, part8, sizeof(tme_uint8_t));
2418             part8 = (part8_next >> bits_misaligned);
2419           }
2420           else {
2421             part8 |= (part8_next >> (8 - bits_misaligned));
2422             tme_memory_write8((tme_uint8_t *) buffer, part8, sizeof(tme_uint8_t));
2423             part8 = (part8_next << bits_misaligned);
2424           }
2425 
2426           /* advance: */
2427           parts8++;
2428           buffer += sizeof(tme_uint8_t);
2429           count -= sizeof(tme_uint8_t);
2430         }
2431 
2432         /* calculate how many more bytes there are to read in this
2433            8-bit memory part: */
2434         count_done = sizeof(tme_uint8_t) - count_misaligned;
2435         part8_buffer = part8;
2436 
2437         /* copy to the buffer the remaining bytes in this 8-bit part: */
2438         if (count_done > count) {
2439           count_done = count;
2440         }
2441         part_buffer = ((_tme_const tme_uint8_t *) &part8_buffer);
2442         count -= count_done;
2443         do {
2444           *buffer = *part_buffer;
2445           part_buffer++;
2446           buffer++;
2447         } while (--count_done != 0);
2448       }
2449     }
2450 
2451     /* if we still have bytes to read: */
2452     if (__tme_predict_false(count > 0)) {
2453 
2454       /* we must have less than a full 8-bit part to read: */
2455       assert (count < sizeof(tme_uint8_t));
2456 
2457       /* read the last 8-bit memory part: */
2458       part8_buffer = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
2459 
2460       /* copy to the buffer the bytes to read in the last
2461          8-bit memory part: */
2462       part_buffer = ((_tme_const tme_uint8_t *) &part8_buffer);
2463       count_done = count;
2464       do {
2465         *buffer = *part_buffer;
2466         part_buffer++;
2467         buffer++;
2468       } while (--count_done != 0);
2469     }
2470 
2471   }
2472 }
2473 
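/* when the destination buffer is not aligned to the memory parts, the
   loops above keep the leftover bytes of one aligned part and combine
   them with the next part before each full-width store.  a hedged
   little-endian-only sketch of that combine step for 32-bit parts,
   assuming buffer is already 32-bit aligned and bits_misaligned is 8,
   16 or 24; the helper name is hypothetical: */
#if 0
static void
_example_copy32_shifted_le(_tme_const tme_shared tme_uint32_t *parts32,
                           tme_uint8_t *buffer,
                           unsigned int bits_misaligned,
                           unsigned long count,
                           tme_rwlock_t *rwlock)
{
  tme_uint32_t part32;
  tme_uint32_t part32_next;

  /* part32 holds the not-yet-stored bytes of the previous part,
     right-justified: */
  part32 = (tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t)) >> bits_misaligned);
  for (; count >= sizeof(tme_uint32_t); ) {

    /* complete a full 32-bit store from the low bytes of the next
       part, then keep its remaining high bytes for the next pass: */
    parts32++;
    part32_next = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
    part32 |= (part32_next << (32 - bits_misaligned));
    tme_memory_write32((tme_uint32_t *) buffer, part32, sizeof(tme_uint32_t));
    part32 = (part32_next >> bits_misaligned);
    buffer += sizeof(tme_uint32_t);
    count -= sizeof(tme_uint32_t);
  }
}
#endif
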
2474 /* undefine the macro version of tme_memory_bus_write_buffer: */
2475 #undef tme_memory_bus_write_buffer
2476 
2477 /* the bus write buffer function: */
2478 void
2479 tme_memory_bus_write_buffer(tme_shared tme_uint8_t *mem, _tme_const tme_uint8_t *buffer, unsigned long count, tme_rwlock_t *rwlock, unsigned int align_min, unsigned int bus_boundary)
2480 {
2481   const unsigned int host_boundary = TME_MEMORY_BUS_BOUNDARY;
2482   tme_uint8_t *part_buffer;
2483   unsigned int count_done;
2484   unsigned int count_misaligned;
2485   unsigned int bits_misaligned;
2486 #ifdef TME_HAVE_INT64_T
2487   tme_shared tme_uint64_t *parts64;
2488   tme_uint64_t part64_buffer;
2489   tme_uint64_t part64;
2490   tme_uint64_t part64_next;
2491   tme_uint64_t part64_mask;
2492   tme_uint64_t part64_cmp;
2493 #endif /* TME_HAVE_INT64_T */
2494   tme_shared tme_uint32_t *parts32;
2495   tme_uint32_t part32_buffer;
2496   tme_uint32_t part32;
2497   tme_uint32_t part32_next;
2498   tme_uint32_t part32_mask;
2499   tme_uint32_t part32_cmp;
2500   tme_shared tme_uint16_t *parts16;
2501   tme_uint16_t part16_buffer;
2502   tme_uint16_t part16;
2503   tme_uint16_t part16_next;
2504   tme_uint16_t part16_mask;
2505   tme_uint16_t part16_cmp;
2506   tme_shared tme_uint8_t *parts8;
2507   tme_uint8_t part8_buffer;
2508   tme_uint8_t part8;
2509   tme_uint8_t part8_next;
2510   tme_uint8_t part8_mask;
2511   tme_uint8_t part8_cmp;
2512 
2513   assert (count != 0);
2514   assert (bus_boundary != 0);
2515 
2516   /* if we are locking for all memory accesses, lock memory
2517      around a memcpy: */
2518   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
2519     tme_rwlock_wrlock(rwlock);
2520     memcpy((tme_uint8_t *) (mem), (buffer), (count));
2521     tme_rwlock_unlock(rwlock);
2522   }
2523 
2524   /* otherwise, if the emulated bus boundary is greater than the
2525      host's bus boundary, we are forced to stop all other threads
2526      around a memcpy: */
2527   else if (__tme_predict_false(bus_boundary == 0
2528                                || bus_boundary > host_boundary)) {
2529     tme_thread_suspend_others();
2530     memcpy((tme_uint8_t *) (mem), (buffer), (count) + (0 && align_min));
2531     tme_thread_resume_others();
2532   }
2533 
2534 #ifdef TME_HAVE_INT64_T
2535 
2536   else if (host_boundary == sizeof(tme_uint64_t)) {
2537 
2538     /* make a 64-bit pointer to the memory: */
2539     parts64 = (tme_shared tme_uint64_t *) mem;
2540 
2541     /* if this pointer is not 64-bit aligned: */
2542     if (__tme_predict_false((((unsigned long) parts64) % sizeof(tme_uint64_t)) != 0)) {
2543 
2544       /* get the misalignment from the previous 64-bit boundary: */
2545       count_misaligned = ((unsigned long) parts64) % sizeof(tme_uint64_t);
2546 
2547       /* truncate this pointer to the previous 64-bit boundary: */
2548       parts64 = (tme_shared tme_uint64_t *) (((unsigned long) parts64) & (((unsigned long) 0) - sizeof(tme_uint64_t)));
2549 
2550       /* get the number of bytes to write in the first 64-bit memory part: */
2551       count_done = sizeof(tme_uint64_t) - count_misaligned;
2552       if (__tme_predict_false(count_done > count)) {
2553         count_done = count;
2554       }
2555 
2556       /* make a mask that clears for the data to write in the
2557          first 64-bit memory part: */
2558       part64_mask = 1;
2559       part64_mask = (part64_mask << (count_done * 8)) - 1;
2560       part64_mask
2561         <<= (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
2562              ? (count_misaligned * 8)
2563              : (64 - ((count_misaligned + count_done) * 8)));
2564       part64_mask = ~part64_mask;
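      /* for example, with count_misaligned == 3 and count_done == 5 on
         a little-endian host, the mask built above is
         ~((((tme_uint64_t) 1 << 40) - 1) << 24), i.e. 0x0000000000ffffff:
         the three bytes below the write are kept and the five bytes
         being written are cleared: */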
2565 
2566       /* copy from the buffer the bytes to write in the first
2567          64-bit memory part: */
2568       part64_buffer = 0;
2569       part_buffer = ((tme_uint8_t *) &part64_buffer) + count_misaligned;
2570       count -= count_done;
2571       do {
2572         *part_buffer = *buffer;
2573         part_buffer++;
2574         buffer++;
2575       } while (--count_done != 0);
2576 
2577       /* compare-and-exchange the first 64-bit memory part: */
2578       part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
2579       do {
2580         part64_cmp = part64;
2581         part64 = (part64 & part64_mask) | part64_buffer;
2582         part64 = tme_memory_atomic_cx64(parts64, part64_cmp, part64, rwlock, sizeof(tme_uint64_t));
2583       } while (part64 != part64_cmp);
2584       parts64++;
2585     }
2586 
2587     /* if we have full 64-bit parts to write: */
2588     if (__tme_predict_true(count >= sizeof(tme_uint64_t))) {
2589 
2590       /* if the buffer is 64-bit aligned: */
2591       if (__tme_predict_true((((unsigned long) buffer) % sizeof(tme_uint64_t)) == 0)) {
2592 
2593         /* write full 64-bit parts without shifting: */
2594         do {
2595           part64 = tme_memory_read64((const tme_uint64_t *) buffer, sizeof(tme_uint64_t));
2596           tme_memory_atomic_write64(parts64, part64, rwlock, sizeof(tme_uint64_t));
2597 
2598           /* advance: */
2599           parts64++;
2600           buffer += sizeof(tme_uint64_t);
2601           count -= sizeof(tme_uint64_t);
2602         } while (count >= sizeof(tme_uint64_t));
2603       }
2604 
2605       /* otherwise, the buffer is not 64-bit aligned: */
2606       else {
2607 
2608         /* get the misalignment to the next 64-bit boundary: */
2609         count_misaligned = (sizeof(tme_uint64_t) - ((unsigned int) (unsigned long) buffer)) % sizeof(tme_uint64_t);
2610 
2611         /* copy from the buffer until it is aligned: */
2612         part64_buffer = 0;
2613         part_buffer = ((tme_uint8_t *) &part64_buffer);
2614         count_done = count_misaligned;
2615         count -= count_misaligned;
2616         do {
2617           *part_buffer = *buffer;
2618           part_buffer++;
2619           buffer++;
2620         } while (--count_done != 0);
2621 
2622         /* write full 64-bit words with shifting: */
2623         bits_misaligned = count_misaligned * 8;
2624         part64 = part64_buffer;
2625         for (; count >= sizeof(tme_uint64_t); ) {
2626           part64_next = tme_memory_read64((const tme_uint64_t *) buffer, sizeof(tme_uint64_t));
2627           if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
2628             part64 |= (part64_next << bits_misaligned);
2629             tme_memory_atomic_write64(parts64, part64, rwlock, sizeof(tme_uint64_t));
2630             part64 = (part64_next >> (64 - bits_misaligned));
2631           }
2632           else {
2633             part64 |= (part64_next >> bits_misaligned);
2634             tme_memory_atomic_write64(parts64, part64, rwlock, sizeof(tme_uint64_t));
2635             part64 = (part64_next << (64 - bits_misaligned));
2636           }
2637 
2638           /* advance: */
2639           parts64++;
2640           buffer += sizeof(tme_uint64_t);
2641           count -= sizeof(tme_uint64_t);
2642         }
2643 
2644         /* calculate how many more bytes there are to write in this
2645            64-bit memory part: */
2646         count_done = sizeof(tme_uint64_t) - count_misaligned;
2647         part64_buffer = part64;
2648 
2649         /* if we can't write one more full 64-bit memory part: */
2650         if (count_done > count) {
2651 
2652           /* we will reread this data to write below: */
2653           buffer -= count_misaligned;
2654           count += count_misaligned;
2655         }
2656 
2657         /* otherwise, we can write one more full 64-bit memory part: */
2658         else {
2659 
2660           /* copy from the buffer until we have the full 64-bit part: */
2661           part_buffer = ((tme_uint8_t *) &part64_buffer) + count_misaligned;
2662           count -= count_done;
2663           do {
2664             *part_buffer = *buffer;
2665             part_buffer++;
2666             buffer++;
2667           } while (--count_done != 0);
2668 
2669           /* write the last full 64-bit memory part: */
2670           part64 = part64_buffer;
2671           tme_memory_atomic_write64(parts64, part64, rwlock, sizeof(tme_uint64_t));
2672         }
2673       }
2674     }
2675 
2676     /* if we still have bytes to write: */
2677     if (__tme_predict_false(count > 0)) {
2678 
2679       /* we must have less than a full 64-bit part to write: */
2680       assert (count < sizeof(tme_uint64_t));
2681 
2682       /* make a mask that clears for the data to write in the last
2683          64-bit memory part: */
2684       part64_mask
2685         = (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
2686            ? _tme_memory_type_mask(tme_uint64_t, << (count * 8))
2687            : _tme_memory_type_mask(tme_uint64_t, >> (count * 8)));
2688 
2689       /* copy from the buffer the bytes to write in the last
2690          64-bit memory part: */
2691       part64_buffer = 0;
2692       part_buffer = ((tme_uint8_t *) &part64_buffer);
2693       count_done = count;
2694       do {
2695         *part_buffer = *buffer;
2696         part_buffer++;
2697         buffer++;
2698       } while (--count_done != 0);
2699 
2700       /* compare-and-exchange the last 64-bit memory part: */
2701       part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
2702       do {
2703         part64_cmp = part64;
2704         part64 = (part64 & part64_mask) | part64_buffer;
2705         part64 = tme_memory_atomic_cx64(parts64, part64_cmp, part64, rwlock, sizeof(tme_uint64_t));
2706       } while (part64 != part64_cmp);
2707     }
2708 
2709   }
2710 
2711 #endif /* TME_HAVE_INT64_T */
2712 
2713   else if (host_boundary == sizeof(tme_uint32_t)) {
2714 
2715     /* make a 32-bit pointer to the memory: */
2716     parts32 = (tme_shared tme_uint32_t *) mem;
2717 
2718     /* if this pointer is not 32-bit aligned: */
2719     if (__tme_predict_false((((unsigned long) parts32) % sizeof(tme_uint32_t)) != 0)) {
2720 
2721       /* get the misalignment from the previous 32-bit boundary: */
2722       count_misaligned = ((unsigned long) parts32) % sizeof(tme_uint32_t);
2723 
2724       /* truncate this pointer to the previous 32-bit boundary: */
2725       parts32 = (tme_shared tme_uint32_t *) (((unsigned long) parts32) & (((unsigned long) 0) - sizeof(tme_uint32_t)));
2726 
2727       /* get the number of bytes to write in the first 32-bit memory part: */
2728       count_done = sizeof(tme_uint32_t) - count_misaligned;
2729       if (__tme_predict_false(count_done > count)) {
2730         count_done = count;
2731       }
2732 
2733       /* make a mask that clears for the data to write in the
2734          first 32-bit memory part: */
2735       part32_mask = 1;
2736       part32_mask = (part32_mask << (count_done * 8)) - 1;
2737       part32_mask
2738         <<= (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
2739              ? (count_misaligned * 8)
2740              : (32 - ((count_misaligned + count_done) * 8)));
2741       part32_mask = ~part32_mask;
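
      /* a worked example: with count_misaligned == 1 and count_done == 3,
         the mask starts as 0x00ffffff; on a little-endian host it is
         shifted left by 8 bits and inverted to give 0x000000ff, and on a
         big-endian host it is shifted left by 0 bits and inverted to give
         0xff000000.  in both cases the surviving one bits cover the single
         byte before the write region, which the compare-and-exchange below
         must preserve. */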
2742 
2743       /* copy from the buffer the bytes to write in the first
2744          32-bit memory part: */
2745       part32_buffer = 0;
2746       part_buffer = ((tme_uint8_t *) &part32_buffer) + count_misaligned;
2747       count -= count_done;
2748       do {
2749         *part_buffer = *buffer;
2750         part_buffer++;
2751         buffer++;
2752       } while (--count_done != 0);
2753 
2754       /* compare-and-exchange the first 32-bit memory part: */
2755       part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
2756       do {
2757         part32_cmp = part32;
2758         part32 = (part32 & part32_mask) | part32_buffer;
2759         part32 = tme_memory_atomic_cx32(parts32, part32_cmp, part32, rwlock, sizeof(tme_uint32_t));
2760       } while (part32 != part32_cmp);
2761       parts32++;
2762     }
2763 
2764     /* if we have full 32-bit parts to write: */
2765     if (__tme_predict_true(count >= sizeof(tme_uint32_t))) {
2766 
2767       /* if the buffer is 32-bit aligned: */
2768       if (__tme_predict_true((((unsigned long) buffer) % sizeof(tme_uint32_t)) == 0)) {
2769 
2770         /* write full 32-bit parts without shifting: */
2771         do {
2772           part32 = tme_memory_read32((const tme_uint32_t *) buffer, sizeof(tme_uint32_t));
2773           tme_memory_atomic_write32(parts32, part32, rwlock, sizeof(tme_uint32_t));
2774 
2775           /* advance: */
2776           parts32++;
2777           buffer += sizeof(tme_uint32_t);
2778           count -= sizeof(tme_uint32_t);
2779         } while (count >= sizeof(tme_uint32_t));
2780       }
2781 
2782       /* otherwise, the buffer is not 32-bit aligned: */
2783       else {
2784 
2785         /* get the misalignment to the next 32-bit boundary: */
2786         count_misaligned = (sizeof(tme_uint32_t) - ((unsigned int) (unsigned long) buffer)) % sizeof(tme_uint32_t);
2787 
2788         /* copy from the buffer until it is aligned: */
2789         part32_buffer = 0;
2790         part_buffer = ((tme_uint8_t *) &part32_buffer);
2791         count_done = count_misaligned;
2792         count -= count_misaligned;
2793         do {
2794           *part_buffer = *buffer;
2795           part_buffer++;
2796           buffer++;
2797         } while (--count_done != 0);
2798 
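        /* a sketch of the shifting done below: with count_misaligned == 1
           on a little-endian host, part32_buffer now holds the one byte
           already copied in its low byte.  each loop iteration reads the
           next aligned 32-bit word from the buffer, shifts it up by 8 bits
           to supply the remaining three bytes of the memory part being
           written, and carries that word's top byte down into the low byte
           of the following part. */
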
2799         /* write full 32-bit words with shifting: */
2800         bits_misaligned = count_misaligned * 8;
2801         part32 = part32_buffer;
2802         for (; count >= sizeof(tme_uint32_t); ) {
2803           part32_next = tme_memory_read32((const tme_uint32_t *) buffer, sizeof(tme_uint32_t));
2804           if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
2805             part32 |= (part32_next << bits_misaligned);
2806             tme_memory_atomic_write32(parts32, part32, rwlock, sizeof(tme_uint32_t));
2807             part32 = (part32_next >> (32 - bits_misaligned));
2808           }
2809           else {
2810             part32 |= (part32_next >> bits_misaligned);
2811             tme_memory_atomic_write32(parts32, part32, rwlock, sizeof(tme_uint32_t));
2812             part32 = (part32_next << (32 - bits_misaligned));
2813           }
2814 
2815           /* advance: */
2816           parts32++;
2817           buffer += sizeof(tme_uint32_t);
2818           count -= sizeof(tme_uint32_t);
2819         }
2820 
2821         /* calculate how many more bytes there are to write in this
2822            32-bit memory part: */
2823         count_done = sizeof(tme_uint32_t) - count_misaligned;
2824         part32_buffer = part32;
2825 
2826         /* if we can't write one more full 32-bit memory part: */
2827         if (count_done > count) {
2828 
2829           /* we will reread this data to write below: */
2830           buffer -= count_misaligned;
2831           count += count_misaligned;
2832         }
2833 
2834         /* otherwise, we can write one more full 32-bit memory part: */
2835         else {
2836 
2837           /* copy from the buffer until we have the full 32-bit part: */
2838           part_buffer = ((tme_uint8_t *) &part32_buffer) + count_misaligned;
2839           count -= count_done;
2840           do {
2841             *part_buffer = *buffer;
2842             part_buffer++;
2843             buffer++;
2844           } while (--count_done != 0);
2845 
2846           /* write the last full 32-bit memory part: */
2847           part32 = part32_buffer;
2848           tme_memory_atomic_write32(parts32, part32, rwlock, sizeof(tme_uint32_t));
2849         }
2850       }
2851     }
2852 
2853     /* if we still have bytes to write: */
2854     if (__tme_predict_false(count > 0)) {
2855 
2856       /* we must have less than a full 32-bit part to write: */
2857       assert (count < sizeof(tme_uint32_t));
2858 
2859       /* make a mask that is clear where the data to write goes in
2860          the last 32-bit memory part: */
2861       part32_mask
2862         = (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
2863            ? _tme_memory_type_mask(tme_uint32_t, << (count * 8))
2864            : _tme_memory_type_mask(tme_uint32_t, >> (count * 8)));
2865 
2866       /* copy from the buffer the bytes to write in the last
2867          32-bit memory part: */
2868       part32_buffer = 0;
2869       part_buffer = ((tme_uint8_t *) &part32_buffer);
2870       count_done = count;
2871       do {
2872         *part_buffer = *buffer;
2873         part_buffer++;
2874         buffer++;
2875       } while (--count_done != 0);
2876 
2877       /* compare-and-exchange the last 32-bit memory part: */
2878       part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
2879       do {
2880         part32_cmp = part32;
2881         part32 = (part32 & part32_mask) | part32_buffer;
2882         part32 = tme_memory_atomic_cx32(parts32, part32_cmp, part32, rwlock, sizeof(tme_uint32_t));
2883       } while (part32 != part32_cmp);
2884     }
2885 
2886   }
2887 
2888   else if (host_boundary == sizeof(tme_uint16_t)) {
2889 
2890     /* make a 16-bit pointer to the memory: */
2891     parts16 = (tme_shared tme_uint16_t *) mem;
2892 
2893     /* if this pointer is not 16-bit aligned: */
2894     if (__tme_predict_false((((unsigned long) parts16) % sizeof(tme_uint16_t)) != 0)) {
2895 
2896       /* get the misalignment from the previous 16-bit boundary: */
2897       count_misaligned = ((unsigned long) parts16) % sizeof(tme_uint16_t);
2898 
2899       /* truncate this pointer to the previous 16-bit boundary: */
2900       parts16 = (tme_shared tme_uint16_t *) (((unsigned long) parts16) & (((unsigned long) 0) - sizeof(tme_uint16_t)));
2901 
2902       /* get the number of bytes to write in the first 16-bit memory part: */
2903       count_done = sizeof(tme_uint16_t) - count_misaligned;
2904       if (__tme_predict_false(count_done > count)) {
2905         count_done = count;
2906       }
2907 
2908       /* make a mask that is clear where the data to write goes in
2909          the first 16-bit memory part: */
2910       part16_mask = 1;
2911       part16_mask = (part16_mask << (count_done * 8)) - 1;
2912       part16_mask
2913         <<= (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
2914              ? (count_misaligned * 8)
2915              : (16 - ((count_misaligned + count_done) * 8)));
2916       part16_mask = ~part16_mask;
2917 
2918       /* copy from the buffer the bytes to write in the first
2919          16-bit memory part: */
2920       part16_buffer = 0;
2921       part_buffer = ((tme_uint8_t *) &part16_buffer) + count_misaligned;
2922       count -= count_done;
2923       do {
2924         *part_buffer = *buffer;
2925         part_buffer++;
2926         buffer++;
2927       } while (--count_done != 0);
2928 
2929       /* compare-and-exchange the first 16-bit memory part: */
2930       part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
2931       do {
2932         part16_cmp = part16;
2933         part16 = (part16 & part16_mask) | part16_buffer;
2934         part16 = tme_memory_atomic_cx16(parts16, part16_cmp, part16, rwlock, sizeof(tme_uint16_t));
2935       } while (part16 != part16_cmp);
2936       parts16++;
2937     }
2938 
2939     /* if we have full 16-bit parts to write: */
2940     if (__tme_predict_true(count >= sizeof(tme_uint16_t))) {
2941 
2942       /* if the buffer is 16-bit aligned: */
2943       if (__tme_predict_true((((unsigned long) buffer) % sizeof(tme_uint16_t)) == 0)) {
2944 
2945         /* write full 16-bit parts without shifting: */
2946         do {
2947           part16 = tme_memory_read16((const tme_uint16_t *) buffer, sizeof(tme_uint16_t));
2948           tme_memory_atomic_write16(parts16, part16, rwlock, sizeof(tme_uint16_t));
2949 
2950           /* advance: */
2951           parts16++;
2952           buffer += sizeof(tme_uint16_t);
2953           count -= sizeof(tme_uint16_t);
2954         } while (count >= sizeof(tme_uint16_t));
2955       }
2956 
2957       /* otherwise, the buffer is not 16-bit aligned: */
2958       else {
2959 
2960         /* get the misalignment to the next 16-bit boundary: */
2961         count_misaligned = (sizeof(tme_uint16_t) - ((unsigned int) (unsigned long) buffer)) % sizeof(tme_uint16_t);
2962 
2963         /* copy from the buffer until it is aligned: */
2964         part16_buffer = 0;
2965         part_buffer = ((tme_uint8_t *) &part16_buffer);
2966         count_done = count_misaligned;
2967         count -= count_misaligned;
2968         do {
2969           *part_buffer = *buffer;
2970           part_buffer++;
2971           buffer++;
2972         } while (--count_done != 0);
2973 
2974         /* write full 16-bit words with shifting: */
2975         bits_misaligned = count_misaligned * 8;
2976         part16 = part16_buffer;
2977         for (; count >= sizeof(tme_uint16_t); ) {
2978           part16_next = tme_memory_read16((const tme_uint16_t *) buffer, sizeof(tme_uint16_t));
2979           if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
2980             part16 |= (part16_next << bits_misaligned);
2981             tme_memory_atomic_write16(parts16, part16, rwlock, sizeof(tme_uint16_t));
2982             part16 = (part16_next >> (16 - bits_misaligned));
2983           }
2984           else {
2985             part16 |= (part16_next >> bits_misaligned);
2986             tme_memory_atomic_write16(parts16, part16, rwlock, sizeof(tme_uint16_t));
2987             part16 = (part16_next << (16 - bits_misaligned));
2988           }
2989 
2990           /* advance: */
2991           parts16++;
2992           buffer += sizeof(tme_uint16_t);
2993           count -= sizeof(tme_uint16_t);
2994         }
2995 
2996         /* calculate how many more bytes there are to write in this
2997            16-bit memory part: */
2998         count_done = sizeof(tme_uint16_t) - count_misaligned;
2999         part16_buffer = part16;
3000 
3001         /* if we can't write one more full 16-bit memory part: */
3002         if (count_done > count) {
3003 
3004           /* we will reread this data to write below: */
3005           buffer -= count_misaligned;
3006           count += count_misaligned;
3007         }
3008 
3009         /* otherwise, we can write one more full 16-bit memory part: */
3010         else {
3011 
3012           /* copy from the buffer until we have the full 16-bit part: */
3013           part_buffer = ((tme_uint8_t *) &part16_buffer) + count_misaligned;
3014           count -= count_done;
3015           do {
3016             *part_buffer = *buffer;
3017             part_buffer++;
3018             buffer++;
3019           } while (--count_done != 0);
3020 
3021           /* write the last full 16-bit memory part: */
3022           part16 = part16_buffer;
3023           tme_memory_atomic_write16(parts16, part16, rwlock, sizeof(tme_uint16_t));
3024         }
3025       }
3026     }
3027 
3028     /* if we still have bytes to write: */
3029     if (__tme_predict_false(count > 0)) {
3030 
3031       /* we must have less than a full 16-bit part to write: */
3032       assert (count < sizeof(tme_uint16_t));
3033 
3034       /* make a mask that is clear where the data to write goes in
3035          the last 16-bit memory part: */
3036       part16_mask
3037         = (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
3038            ? _tme_memory_type_mask(tme_uint16_t, << (count * 8))
3039            : _tme_memory_type_mask(tme_uint16_t, >> (count * 8)));
3040 
3041       /* copy from the buffer the bytes to write in the last
3042          16-bit memory part: */
3043       part16_buffer = 0;
3044       part_buffer = ((tme_uint8_t *) &part16_buffer);
3045       count_done = count;
3046       do {
3047         *part_buffer = *buffer;
3048         part_buffer++;
3049         buffer++;
3050       } while (--count_done != 0);
3051 
3052       /* compare-and-exchange the last 16-bit memory part: */
3053       part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
3054       do {
3055         part16_cmp = part16;
3056         part16 = (part16 & part16_mask) | part16_buffer;
3057         part16 = tme_memory_atomic_cx16(parts16, part16_cmp, part16, rwlock, sizeof(tme_uint16_t));
3058       } while (part16 != part16_cmp);
3059     }
3060 
3061   }
3062 
3063   else {
3064 
3065     /* make an 8-bit pointer to the memory: */
3066     parts8 = (tme_shared tme_uint8_t *) mem;
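
    /* note that a byte pointer is always 8-bit aligned (any address
       modulo sizeof(tme_uint8_t) is zero), so the misalignment handling
       below can never actually be taken in this branch: */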
3067 
3068     /* if this pointer is not 8-bit aligned: */
3069     if (__tme_predict_false((((unsigned long) parts8) % sizeof(tme_uint8_t)) != 0)) {
3070 
3071       /* get the misalignment from the previous 8-bit boundary: */
3072       count_misaligned = ((unsigned long) parts8) % sizeof(tme_uint8_t);
3073 
3074       /* truncate this pointer to the previous 8-bit boundary: */
3075       parts8 = (tme_shared tme_uint8_t *) (((unsigned long) parts8) & (((unsigned long) 0) - sizeof(tme_uint8_t)));
3076 
3077       /* get the number of bytes to write in the first 8-bit memory part: */
3078       count_done = sizeof(tme_uint8_t) - count_misaligned;
3079       if (__tme_predict_false(count_done > count)) {
3080         count_done = count;
3081       }
3082 
3083       /* make a mask that is clear where the data to write goes in
3084          the first 8-bit memory part: */
3085       part8_mask = 1;
3086       part8_mask = (part8_mask << (count_done * 8)) - 1;
3087       part8_mask
3088         <<= (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
3089              ? (count_misaligned * 8)
3090              : (8 - ((count_misaligned + count_done) * 8)));
3091       part8_mask = ~part8_mask;
3092 
3093       /* copy from the buffer the bytes to write in the first
3094          8-bit memory part: */
3095       part8_buffer = 0;
3096       part_buffer = ((tme_uint8_t *) &part8_buffer) + count_misaligned;
3097       count -= count_done;
3098       do {
3099         *part_buffer = *buffer;
3100         part_buffer++;
3101         buffer++;
3102       } while (--count_done != 0);
3103 
3104       /* compare-and-exchange the first 8-bit memory part: */
3105       part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
3106       do {
3107         part8_cmp = part8;
3108         part8 = (part8 & part8_mask) | part8_buffer;
3109         part8 = tme_memory_atomic_cx8(parts8, part8_cmp, part8, rwlock, sizeof(tme_uint8_t));
3110       } while (part8 != part8_cmp);
3111       parts8++;
3112     }
3113 
3114     /* if we have full 8-bit parts to write: */
3115     if (__tme_predict_true(count >= sizeof(tme_uint8_t))) {
3116 
3117       /* if the buffer is 8-bit aligned: */
3118       if (__tme_predict_true((((unsigned long) buffer) % sizeof(tme_uint8_t)) == 0)) {
3119 
3120         /* write full 8-bit parts without shifting: */
3121         do {
3122           part8 = tme_memory_read8((const tme_uint8_t *) buffer, sizeof(tme_uint8_t));
3123           tme_memory_atomic_write8(parts8, part8, rwlock, sizeof(tme_uint8_t));
3124 
3125           /* advance: */
3126           parts8++;
3127           buffer += sizeof(tme_uint8_t);
3128           count -= sizeof(tme_uint8_t);
3129         } while (count >= sizeof(tme_uint8_t));
3130       }
3131 
3132       /* otherwise, the buffer is not 8-bit aligned: */
3133       else {
3134 
3135         /* get the misalignment to the next 8-bit boundary: */
3136         count_misaligned = (sizeof(tme_uint8_t) - ((unsigned int) (unsigned long) buffer)) % sizeof(tme_uint8_t);
3137 
3138         /* copy from the buffer until it is aligned: */
3139         part8_buffer = 0;
3140         part_buffer = ((tme_uint8_t *) &part8_buffer);
3141         count_done = count_misaligned;
3142         count -= count_misaligned;
3143         do {
3144           *part_buffer = *buffer;
3145           part_buffer++;
3146           buffer++;
3147         } while (--count_done != 0);
3148 
3149         /* write full 8-bit words with shifting: */
3150         bits_misaligned = count_misaligned * 8;
3151         part8 = part8_buffer;
3152         for (; count >= sizeof(tme_uint8_t); ) {
3153           part8_next = tme_memory_read8((const tme_uint8_t *) buffer, sizeof(tme_uint8_t));
3154           if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
3155             part8 |= (part8_next << bits_misaligned);
3156             tme_memory_atomic_write8(parts8, part8, rwlock, sizeof(tme_uint8_t));
3157             part8 = (part8_next >> (8 - bits_misaligned));
3158           }
3159           else {
3160             part8 |= (part8_next >> bits_misaligned);
3161             tme_memory_atomic_write8(parts8, part8, rwlock, sizeof(tme_uint8_t));
3162             part8 = (part8_next << (8 - bits_misaligned));
3163           }
3164 
3165           /* advance: */
3166           parts8++;
3167           buffer += sizeof(tme_uint8_t);
3168           count -= sizeof(tme_uint8_t);
3169         }
3170 
3171         /* calculate how many more bytes there are to write in this
3172            8-bit memory part: */
3173         count_done = sizeof(tme_uint8_t) - count_misaligned;
3174         part8_buffer = part8;
3175 
3176         /* if we can't write one more full 8-bit memory part: */
3177         if (count_done > count) {
3178 
3179           /* we will reread this data to write below: */
3180           buffer -= count_misaligned;
3181           count += count_misaligned;
3182         }
3183 
3184         /* otherwise, we can write one more full 8-bit memory part: */
3185         else {
3186 
3187           /* copy from the buffer until we have the full 8-bit part: */
3188           part_buffer = ((tme_uint8_t *) &part8_buffer) + count_misaligned;
3189           count -= count_done;
3190           do {
3191             *part_buffer = *buffer;
3192             part_buffer++;
3193             buffer++;
3194           } while (--count_done != 0);
3195 
3196           /* write the last full 8-bit memory part: */
3197           part8 = part8_buffer;
3198           tme_memory_atomic_write8(parts8, part8, rwlock, sizeof(tme_uint8_t));
3199         }
3200       }
3201     }
3202 
3203     /* if we still have bytes to write: */
3204     if (__tme_predict_false(count > 0)) {
3205 
3206       /* we must have less than a full 8-bit part to write: */
3207       assert (count < sizeof(tme_uint8_t));
3208 
3209       /* make a mask that is clear where the data to write goes in
3210          the last 8-bit memory part: */
3211       part8_mask
3212         = (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
3213            ? _tme_memory_type_mask(tme_uint8_t, << (count * 8))
3214            : _tme_memory_type_mask(tme_uint8_t, >> (count * 8)));
3215 
3216       /* copy from the buffer the bytes to write in the last
3217          8-bit memory part: */
3218       part8_buffer = 0;
3219       part_buffer = ((tme_uint8_t *) &part8_buffer);
3220       count_done = count;
3221       do {
3222         *part_buffer = *buffer;
3223         part_buffer++;
3224         buffer++;
3225       } while (--count_done != 0);
3226 
3227       /* compare-and-exchange the last 8-bit memory part: */
3228       part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
3229       do {
3230         part8_cmp = part8;
3231         part8 = (part8 & part8_mask) | part8_buffer;
3232         part8 = tme_memory_atomic_cx8(parts8, part8_cmp, part8, rwlock, sizeof(tme_uint8_t));
3233       } while (part8 != part8_cmp);
3234     }
3235 
3236   }
3237 }
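
/* to summarize the slow buffer-write path above: a leading partial memory
   part is merged with the existing memory contents using a
   compare-and-exchange, the aligned middle of the buffer is stored with
   plain atomic writes (shifting bytes across part boundaries when the
   source buffer itself is misaligned), and a trailing partial part is
   again merged with a compare-and-exchange, so that bytes outside the
   written region are never disturbed. */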
3238 
3239 /* the 8-bit atomic operations: */
3240 
3241 /* undefine any macro version of tme_memory_atomic_add8: */
3242 #undef tme_memory_atomic_add8
3243 
3244 /* the 8-bit atomic add function: */
3245 tme_uint8_t
3246 tme_memory_atomic_add8(tme_shared tme_uint8_t *memory,
3247                         tme_uint8_t operand,
3248                         tme_rwlock_t *rwlock,
3249                         unsigned int align_min)
3250 {
3251   tme_uint8_t value_read;
3252   tme_uint8_t value_written;
3253   tme_uint8_t value_read_verify;
3254 
3255   /* if we can't make direct accesses at all, all atomic
3256      accesses must be done under lock.  (when threads are
3257      cooperative the actual locking isn't needed): */
3258   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
3259     if (!TME_THREADS_COOPERATIVE) {
3260       tme_rwlock_wrlock(rwlock);
3261     }
3262     value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
3263     value_written = value_read + operand;
3264     tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
3265     if (!TME_THREADS_COOPERATIVE) {
3266       tme_rwlock_unlock(rwlock);
3267     }
3268   }
3269 
3270   /* otherwise, threads are not cooperative and this host CPU
3271      can make atomic accesses to at least the most common memory
3272      size.
3273 
3274      in that case, the only reason this function should get
3275      called is if the host CPU can't do an atomic 8-bit
3276      add at all, or if it can't do it at this alignment.
3277 
3278      we emulate the atomic 8-bit add with a compare-and-exchange: */
3279   else {
3280 
3281     /* do an atomic read of the memory: */
3282     value_read = tme_memory_atomic_read8(memory, rwlock, align_min);
3283 
3284     /* spin the add in a compare-and-exchange loop: */
3285     for (;;) {
3286 
3287       /* make the value to write: */
3288       value_written = value_read + operand;
3289 
3290       /* try the compare-and-exchange: */
3291       value_read_verify = tme_memory_atomic_cx8(memory, value_read, value_written, rwlock, align_min);
3292 
3293       /* if the compare-and-exchange failed: */
3294       if (__tme_predict_false(value_read_verify != value_read)) {
3295 
3296         /* loop with the new value read from the memory: */
3297         value_read = value_read_verify;
3298         continue;
3299       }
3300 
3301       /* stop now: */
3302       break;
3303     }
3304   }
3305 
3306   /* return the value read: */
3307   return (value_read);
3308 }
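
/* a usage sketch for the function above (the variable names here are
   hypothetical and for illustration only):

     tme_shared tme_uint8_t counter;
     tme_rwlock_t counter_rwlock;
     tme_uint8_t old_value;

     old_value = tme_memory_atomic_add8(&counter, 1, &counter_rwlock,
                                        sizeof(tme_uint8_t));

   as with the other read-modify-write operations in this file, the value
   returned is the value read from memory before the add. */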
3309 
3310 /* undefine any macro version of tme_memory_atomic_sub8: */
3311 #undef tme_memory_atomic_sub8
3312 
3313 /* the 8-bit atomic sub function: */
3314 tme_uint8_t
3315 tme_memory_atomic_sub8(tme_shared tme_uint8_t *memory,
3316                         tme_uint8_t operand,
3317                         tme_rwlock_t *rwlock,
3318                         unsigned int align_min)
3319 {
3320   tme_uint8_t value_read;
3321   tme_uint8_t value_written;
3322   tme_uint8_t value_read_verify;
3323 
3324   /* if we can't make direct accesses at all, all atomic
3325      accesses must be done under lock.  (when threads are
3326      cooperative the actual locking isn't needed): */
3327   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
3328     if (!TME_THREADS_COOPERATIVE) {
3329       tme_rwlock_wrlock(rwlock);
3330     }
3331     value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
3332     value_written = value_read - operand;
3333     tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
3334     if (!TME_THREADS_COOPERATIVE) {
3335       tme_rwlock_unlock(rwlock);
3336     }
3337   }
3338 
3339   /* otherwise, threads are not cooperative and this host CPU
3340      can make atomic accesses to at least the most common memory
3341      size.
3342 
3343      in that case, the only reason this function should get
3344      called is if the host CPU can't do an atomic 8-bit
3345      sub at all, or if it can't do it at this alignment.
3346 
3347      we emulate the atomic 8-bit sub with a compare-and-exchange: */
3348   else {
3349 
3350     /* do an atomic read of the memory: */
3351     value_read = tme_memory_atomic_read8(memory, rwlock, align_min);
3352 
3353     /* spin the sub in a compare-and-exchange loop: */
3354     for (;;) {
3355 
3356       /* make the value to write: */
3357       value_written = value_read - operand;
3358 
3359       /* try the compare-and-exchange: */
3360       value_read_verify = tme_memory_atomic_cx8(memory, value_read, value_written, rwlock, align_min);
3361 
3362       /* if the compare-and-exchange failed: */
3363       if (__tme_predict_false(value_read_verify != value_read)) {
3364 
3365         /* loop with the new value read from the memory: */
3366         value_read = value_read_verify;
3367         continue;
3368       }
3369 
3370       /* stop now: */
3371       break;
3372     }
3373   }
3374 
3375   /* return the value read: */
3376   return (value_read);
3377 }
3378 
3379 /* undefine any macro version of tme_memory_atomic_mul8: */
3380 #undef tme_memory_atomic_mul8
3381 
3382 /* the 8-bit atomic mul function: */
3383 tme_uint8_t
3384 tme_memory_atomic_mul8(tme_shared tme_uint8_t *memory,
3385                         tme_uint8_t operand,
3386                         tme_rwlock_t *rwlock,
3387                         unsigned int align_min)
3388 {
3389   tme_uint8_t value_read;
3390   tme_uint8_t value_written;
3391   tme_uint8_t value_read_verify;
3392 
3393   /* if we can't make direct accesses at all, all atomic
3394      accesses must be done under lock.  (when threads are
3395      cooperative the actual locking isn't needed): */
3396   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
3397     if (!TME_THREADS_COOPERATIVE) {
3398       tme_rwlock_wrlock(rwlock);
3399     }
3400     value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
3401     value_written = value_read * operand;
3402     tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
3403     if (!TME_THREADS_COOPERATIVE) {
3404       tme_rwlock_unlock(rwlock);
3405     }
3406   }
3407 
3408   /* otherwise, threads are not cooperative and this host CPU
3409      can make atomic accesses to at least the most common memory
3410      size.
3411 
3412      in that case, the only reason this function should get
3413      called is if the host CPU can't do an atomic 8-bit
3414      mul at all, or if it can't do it at this alignment.
3415 
3416      we emulate the atomic 8-bit mul with a compare-and-exchange: */
3417   else {
3418 
3419     /* do an atomic read of the memory: */
3420     value_read = tme_memory_atomic_read8(memory, rwlock, align_min);
3421 
3422     /* spin the mul in a compare-and-exchange loop: */
3423     for (;;) {
3424 
3425       /* make the value to write: */
3426       value_written = value_read * operand;
3427 
3428       /* try the compare-and-exchange: */
3429       value_read_verify = tme_memory_atomic_cx8(memory, value_read, value_written, rwlock, align_min);
3430 
3431       /* if the compare-and-exchange failed: */
3432       if (__tme_predict_false(value_read_verify != value_read)) {
3433 
3434         /* loop with the new value read from the memory: */
3435         value_read = value_read_verify;
3436         continue;
3437       }
3438 
3439       /* stop now: */
3440       break;
3441     }
3442   }
3443 
3444   /* return the value read: */
3445   return (value_read);
3446 }
3447 
3448 /* undefine any macro version of tme_memory_atomic_div8: */
3449 #undef tme_memory_atomic_div8
3450 
3451 /* the 8-bit atomic div function: */
3452 tme_uint8_t
3453 tme_memory_atomic_div8(tme_shared tme_uint8_t *memory,
3454                         tme_uint8_t operand,
3455                         tme_rwlock_t *rwlock,
3456                         unsigned int align_min)
3457 {
3458   tme_uint8_t value_read;
3459   tme_uint8_t value_written;
3460   tme_uint8_t value_read_verify;
3461 
3462   /* if we can't make direct accesses at all, all atomic
3463      accesses must be done under lock.  (when threads are
3464      cooperative the actual locking isn't needed): */
3465   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
3466     if (!TME_THREADS_COOPERATIVE) {
3467       tme_rwlock_wrlock(rwlock);
3468     }
3469     value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
3470     value_written = value_read / operand;
3471     tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
3472     if (!TME_THREADS_COOPERATIVE) {
3473       tme_rwlock_unlock(rwlock);
3474     }
3475   }
3476 
3477   /* otherwise, threads are not cooperative and this host CPU
3478      can make atomic accesses to at least the most common memory
3479      size.
3480 
3481      in that case, the only reason this function should get
3482      called is if the host CPU can't do an atomic 8-bit
3483      div at all, or if it can't do it at this alignment.
3484 
3485      we emulate the atomic 8-bit div with a compare-and-exchange: */
3486   else {
3487 
3488     /* do an atomic read of the memory: */
3489     value_read = tme_memory_atomic_read8(memory, rwlock, align_min);
3490 
3491     /* spin the div in a compare-and-exchange loop: */
3492     for (;;) {
3493 
3494       /* make the value to write: */
3495       value_written = value_read / operand;
3496 
3497       /* try the compare-and-exchange: */
3498       value_read_verify = tme_memory_atomic_cx8(memory, value_read, value_written, rwlock, align_min);
3499 
3500       /* if the compare-and-exchange failed: */
3501       if (__tme_predict_false(value_read_verify != value_read)) {
3502 
3503         /* loop with the new value read from the memory: */
3504         value_read = value_read_verify;
3505         continue;
3506       }
3507 
3508       /* stop now: */
3509       break;
3510     }
3511   }
3512 
3513   /* return the value read: */
3514   return (value_read);
3515 }
3516 
3517 /* undefine any macro version of tme_memory_atomic_and8: */
3518 #undef tme_memory_atomic_and8
3519 
3520 /* the 8-bit atomic and function: */
3521 tme_uint8_t
3522 tme_memory_atomic_and8(tme_shared tme_uint8_t *memory,
3523                         tme_uint8_t operand,
3524                         tme_rwlock_t *rwlock,
3525                         unsigned int align_min)
3526 {
3527   tme_uint8_t value_read;
3528   tme_uint8_t value_written;
3529   tme_uint8_t value_read_verify;
3530 
3531   /* if we can't make direct accesses at all, all atomic
3532      accesses must be done under lock.  (when threads are
3533      cooperative the actual locking isn't needed): */
3534   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
3535     if (!TME_THREADS_COOPERATIVE) {
3536       tme_rwlock_wrlock(rwlock);
3537     }
3538     value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
3539     value_written = value_read & operand;
3540     tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
3541     if (!TME_THREADS_COOPERATIVE) {
3542       tme_rwlock_unlock(rwlock);
3543     }
3544   }
3545 
3546   /* otherwise, threads are not cooperative and this host CPU
3547      can make atomic accesses to at least the most common memory
3548      size.
3549 
3550      in that case, the only reason this function should get
3551      called is if the host CPU can't do an atomic 8-bit
3552      and at all, or if it can't do it at this alignment.
3553 
3554      we emulate the atomic 8-bit and with a compare-and-exchange: */
3555   else {
3556 
3557     /* do an atomic read of the memory: */
3558     value_read = tme_memory_atomic_read8(memory, rwlock, align_min);
3559 
3560     /* spin the and in a compare-and-exchange loop: */
3561     for (;;) {
3562 
3563       /* make the value to write: */
3564       value_written = value_read & operand;
3565 
3566       /* try the compare-and-exchange: */
3567       value_read_verify = tme_memory_atomic_cx8(memory, value_read, value_written, rwlock, align_min);
3568 
3569       /* if the compare-and-exchange failed: */
3570       if (__tme_predict_false(value_read_verify != value_read)) {
3571 
3572         /* loop with the new value read from the memory: */
3573         value_read = value_read_verify;
3574         continue;
3575       }
3576 
3577       /* stop now: */
3578       break;
3579     }
3580   }
3581 
3582   /* return the value read: */
3583   return (value_read);
3584 }
3585 
3586 /* undefine any macro version of tme_memory_atomic_or8: */
3587 #undef tme_memory_atomic_or8
3588 
3589 /* the 8-bit atomic or function: */
3590 tme_uint8_t
3591 tme_memory_atomic_or8(tme_shared tme_uint8_t *memory,
3592                         tme_uint8_t operand,
3593                         tme_rwlock_t *rwlock,
3594                         unsigned int align_min)
3595 {
3596   tme_uint8_t value_read;
3597   tme_uint8_t value_written;
3598   tme_uint8_t value_read_verify;
3599 
3600   /* if we can't make direct accesses at all, all atomic
3601      accesses must be done under lock.  (when threads are
3602      cooperative the actual locking isn't needed): */
3603   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
3604     if (!TME_THREADS_COOPERATIVE) {
3605       tme_rwlock_wrlock(rwlock);
3606     }
3607     value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
3608     value_written = value_read | operand;
3609     tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
3610     if (!TME_THREADS_COOPERATIVE) {
3611       tme_rwlock_unlock(rwlock);
3612     }
3613   }
3614 
3615   /* otherwise, threads are not cooperative and this host CPU
3616      can make atomic accesses to at least the most common memory
3617      size.
3618 
3619      in that case, the only reason this function should get
3620      called is if the host CPU can't do an atomic 8-bit
3621      or at all, or if it can't do it at this alignment.
3622 
3623      we emulate the atomic 8-bit or with a compare-and-exchange: */
3624   else {
3625 
3626     /* do an atomic read of the memory: */
3627     value_read = tme_memory_atomic_read8(memory, rwlock, align_min);
3628 
3629     /* spin the or in a compare-and-exchange loop: */
3630     for (;;) {
3631 
3632       /* make the value to write: */
3633       value_written = value_read | operand;
3634 
3635       /* try the compare-and-exchange: */
3636       value_read_verify = tme_memory_atomic_cx8(memory, value_read, value_written, rwlock, align_min);
3637 
3638       /* if the compare-and-exchange failed: */
3639       if (__tme_predict_false(value_read_verify != value_read)) {
3640 
3641         /* loop with the new value read from the memory: */
3642         value_read = value_read_verify;
3643         continue;
3644       }
3645 
3646       /* stop now: */
3647       break;
3648     }
3649   }
3650 
3651   /* return the value read: */
3652   return (value_read);
3653 }
3654 
3655 /* undefine any macro version of tme_memory_atomic_xor8: */
3656 #undef tme_memory_atomic_xor8
3657 
3658 /* the 8-bit atomic xor function: */
3659 tme_uint8_t
3660 tme_memory_atomic_xor8(tme_shared tme_uint8_t *memory,
3661                         tme_uint8_t operand,
3662                         tme_rwlock_t *rwlock,
3663                         unsigned int align_min)
3664 {
3665   tme_uint8_t value_read;
3666   tme_uint8_t value_written;
3667   tme_uint8_t value_read_verify;
3668 
3669   /* if we can't make direct accesses at all, all atomic
3670      accesses must be done under lock.  (when threads are
3671      cooperative the actual locking isn't needed): */
3672   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
3673     if (!TME_THREADS_COOPERATIVE) {
3674       tme_rwlock_wrlock(rwlock);
3675     }
3676     value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
3677     value_written = value_read ^ operand;
3678     tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
3679     if (!TME_THREADS_COOPERATIVE) {
3680       tme_rwlock_unlock(rwlock);
3681     }
3682   }
3683 
3684   /* otherwise, threads are not cooperative and this host CPU
3685      can make atomic accesses to at least the most common memory
3686      size.
3687 
3688      in that case, the only reason this function should get
3689      called is if the host CPU can't do an atomic 8-bit
3690      xor at all, or if it can't do it at this alignment.
3691 
3692      we emulate the atomic 8-bit xor with a compare-and-exchange: */
3693   else {
3694 
3695     /* do an atomic read of the memory: */
3696     value_read = tme_memory_atomic_read8(memory, rwlock, align_min);
3697 
3698     /* spin the xor in a compare-and-exchange loop: */
3699     for (;;) {
3700 
3701       /* make the value to write: */
3702       value_written = value_read ^ operand;
3703 
3704       /* try the compare-and-exchange: */
3705       value_read_verify = tme_memory_atomic_cx8(memory, value_read, value_written, rwlock, align_min);
3706 
3707       /* if the compare-and-exchange failed: */
3708       if (__tme_predict_false(value_read_verify != value_read)) {
3709 
3710         /* loop with the new value read from the memory: */
3711         value_read = value_read_verify;
3712         continue;
3713       }
3714 
3715       /* stop now: */
3716       break;
3717     }
3718   }
3719 
3720   /* return the value read: */
3721   return (value_read);
3722 }
3723 
3724 /* undefine any macro version of tme_memory_atomic_not8: */
3725 #undef tme_memory_atomic_not8
3726 
3727 /* the 8-bit atomic not function: */
3728 tme_uint8_t
3729 tme_memory_atomic_not8(tme_shared tme_uint8_t *memory,
3730                         tme_rwlock_t *rwlock,
3731                         unsigned int align_min)
3732 {
3733   tme_uint8_t value_read;
3734   tme_uint8_t value_written;
3735   tme_uint8_t value_read_verify;
3736 
3737   /* if we can't make direct accesses at all, all atomic
3738      accesses must be done under lock.  (when threads are
3739      cooperative the actual locking isn't needed): */
3740   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
3741     if (!TME_THREADS_COOPERATIVE) {
3742       tme_rwlock_wrlock(rwlock);
3743     }
3744     value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
3745     value_written = ~value_read;
3746     tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
3747     if (!TME_THREADS_COOPERATIVE) {
3748       tme_rwlock_unlock(rwlock);
3749     }
3750   }
3751 
3752   /* otherwise, threads are not cooperative and this host CPU
3753      can make atomic accesses to at least the most common memory
3754      size.
3755 
3756      in that case, the only reason this function should get
3757      called is if the host CPU can't do an atomic 8-bit
3758      not at all, or if it can't do it at this alignment.
3759 
3760      we emulate the atomic 8-bit not with a compare-and-exchange: */
3761   else {
3762 
3763     /* do an atomic read of the memory: */
3764     value_read = tme_memory_atomic_read8(memory, rwlock, align_min);
3765 
3766     /* spin the not in a compare-and-exchange loop: */
3767     for (;;) {
3768 
3769       /* make the value to write: */
3770       value_written = ~value_read;
3771 
3772       /* try the compare-and-exchange: */
3773       value_read_verify = tme_memory_atomic_cx8(memory, value_read, value_written, rwlock, align_min);
3774 
3775       /* if the compare-and-exchange failed: */
3776       if (__tme_predict_false(value_read_verify != value_read)) {
3777 
3778         /* loop with the new value read from the memory: */
3779         value_read = value_read_verify;
3780         continue;
3781       }
3782 
3783       /* stop now: */
3784       break;
3785     }
3786   }
3787 
3788   /* return the value read: */
3789   return (value_read);
3790 }
3791 
3792 /* undefine any macro version of tme_memory_atomic_neg8: */
3793 #undef tme_memory_atomic_neg8
3794 
3795 /* the 8-bit atomic neg function: */
3796 tme_uint8_t
3797 tme_memory_atomic_neg8(tme_shared tme_uint8_t *memory,
3798                         tme_rwlock_t *rwlock,
3799                         unsigned int align_min)
3800 {
3801   tme_uint8_t value_read;
3802   tme_uint8_t value_written;
3803   tme_uint8_t value_read_verify;
3804 
3805   /* if we can't make direct accesses at all, all atomic
3806      accesses must be done under lock.  (when threads are
3807      cooperative the actual locking isn't needed): */
3808   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
3809     if (!TME_THREADS_COOPERATIVE) {
3810       tme_rwlock_wrlock(rwlock);
3811     }
3812     value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
3813     value_written = 0 - value_read;
3814     tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
3815     if (!TME_THREADS_COOPERATIVE) {
3816       tme_rwlock_unlock(rwlock);
3817     }
3818   }
3819 
3820   /* otherwise, threads are not cooperative and this host CPU
3821      can make atomic accesses to at least the most common memory
3822      size.
3823 
3824      in that case, the only reason this function should get
3825      called is if the host CPU can't do an atomic 8-bit
3826      neg at all, or if it can't do it at this alignment.
3827 
3828      we emulate the atomic 8-bit neg with a compare-and-exchange: */
3829   else {
3830 
3831     /* do an atomic read of the memory: */
3832     value_read = tme_memory_atomic_read8(memory, rwlock, align_min);
3833 
3834     /* spin the neg in a compare-and-exchange loop: */
3835     for (;;) {
3836 
3837       /* make the value to write: */
3838       value_written = 0 - value_read;
3839 
3840       /* try the compare-and-exchange: */
3841       value_read_verify = tme_memory_atomic_cx8(memory, value_read, value_written, rwlock, align_min);
3842 
3843       /* if the compare-and-exchange failed: */
3844       if (__tme_predict_false(value_read_verify != value_read)) {
3845 
3846         /* loop with the new value read from the memory: */
3847         value_read = value_read_verify;
3848         continue;
3849       }
3850 
3851       /* stop now: */
3852       break;
3853     }
3854   }
3855 
3856   /* return the value read: */
3857   return (value_read);
3858 }
3859 
3860 /* undefine any macro version of tme_memory_atomic_xchg8: */
3861 #undef tme_memory_atomic_xchg8
3862 
3863 /* the 8-bit atomic xchg function: */
3864 tme_uint8_t
3865 tme_memory_atomic_xchg8(tme_shared tme_uint8_t *memory,
3866                         tme_uint8_t value_written,
3867                         tme_rwlock_t *rwlock,
3868                         unsigned int align_min)
3869 {
3870   tme_uint8_t value_read;
3871   tme_uint8_t value_read_verify;
3872 
3873   /* if we can't make direct accesses at all, all atomic
3874      accesses must be done under lock.  (when threads are
3875      cooperative the actual locking isn't needed): */
3876   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
3877     if (!TME_THREADS_COOPERATIVE) {
3878       tme_rwlock_wrlock(rwlock);
3879     }
3880     value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
3881     tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
3882     if (!TME_THREADS_COOPERATIVE) {
3883       tme_rwlock_unlock(rwlock);
3884     }
3885   }
3886 
3887   /* otherwise, threads are not cooperative and this host CPU
3888      can make atomic accesses to at least the most common memory
3889      size.
3890 
3891      in that case, the only reason this function should get
3892      called is if the host CPU can't do an atomic 8-bit
3893      xchg at all, or if it can't do it at this alignment.
3894 
3895      we emulate the atomic 8-bit xchg with a compare-and-exchange: */
3896   else {
3897 
3898     /* do an atomic read of the memory: */
3899     value_read = tme_memory_atomic_read8(memory, rwlock, align_min);
3900 
3901     /* spin the xchg in a compare-and-exchange loop: */
3902     for (;;) {
3903 
3904       /* try the compare-and-exchange: */
3905       value_read_verify = tme_memory_atomic_cx8(memory, value_read, value_written, rwlock, align_min);
3906 
3907       /* if the compare-and-exchange failed: */
3908       if (__tme_predict_false(value_read_verify != value_read)) {
3909 
3910         /* loop with the new value read from the memory: */
3911         value_read = value_read_verify;
3912         continue;
3913       }
3914 
3915       /* stop now: */
3916       break;
3917     }
3918   }
3919 
3920   /* return the value read: */
3921   return (value_read);
3922 }
3923 
3924 /* undefine any macro version of tme_memory_atomic_cx8: */
3925 #undef tme_memory_atomic_cx8
3926 
3927 /* the 8-bit atomic cx function: */
3928 tme_uint8_t
3929 tme_memory_atomic_cx8(tme_shared tme_uint8_t *memory,
3930                         tme_uint8_t value_cmp,
3931                         tme_uint8_t value_written,
3932                         tme_rwlock_t *rwlock,
3933                         unsigned int align_min)
3934 {
3935   tme_uint8_t value_read;
3936 
3937   /* if we can't make direct accesses at all, all atomic
3938      accesses must be done under lock.  (when threads are
3939      cooperative the actual locking isn't needed): */
3940   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
3941     if (!TME_THREADS_COOPERATIVE) {
3942       tme_rwlock_wrlock(rwlock);
3943     }
3944     value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
3945     if (value_read == value_cmp) {
3946       tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
3947     }
3948     if (!TME_THREADS_COOPERATIVE) {
3949       tme_rwlock_unlock(rwlock);
3950     }
3951   }
3952 
3953   /* otherwise, threads are not cooperative and this host CPU
3954      can make atomic accesses to at least the most common memory
3955      size.
3956 
3957      in that case, the only reason this function should get
3958      called is if the host CPU can't do an atomic 8-bit
3959      cx at all, or if it can't do it at this alignment.
3960 
3961      we assume that these problematic atomic cxs are rare,
3962      and to emulate them we simply stop all other threads while
3963      doing the cx: */
3964   else {
3965     tme_thread_suspend_others();
3966     value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
3967     if (value_read == value_cmp) {
3968       tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
3969     }
3970     tme_thread_resume_others();
3971   }
3972 
3973   /* return the value read: */
3974   return (value_read);
3975 }
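
/* a sketch of how the compare-and-exchange above is used by callers: the
   value returned is the value actually read from memory, so the exchange
   succeeded exactly when that value equals value_cmp.  for example
   (hypothetical variables):

     value_read = tme_memory_atomic_cx8(memory, value_expected, value_new,
                                        rwlock, align_min);
     if (value_read != value_expected) {
       ... the write did not happen; memory held value_read, which can be
           used as the new expected value for a retry ...
     }

   the emulated atomic operations earlier in this file spin exactly this
   way until the compare-and-exchange succeeds. */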
3976 
3977 /* the 16-bit atomic operations: */
3978 
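/* the 16-bit operations below follow the same pattern as the 8-bit
   operations above: a locked read-modify-write when no direct atomic
   access is possible, and otherwise emulation with a 16-bit
   compare-and-exchange, widened to tme_uint16_t. */
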
3979 /* undefine any macro version of tme_memory_atomic_add16: */
3980 #undef tme_memory_atomic_add16
3981 
3982 /* the 16-bit atomic add function: */
3983 tme_uint16_t
3984 tme_memory_atomic_add16(tme_shared tme_uint16_t *memory,
3985                         tme_uint16_t operand,
3986                         tme_rwlock_t *rwlock,
3987                         unsigned int align_min)
3988 {
3989   tme_uint16_t value_read;
3990   tme_uint16_t value_written;
3991   tme_uint16_t value_read_verify;
3992 
3993   /* if we can't make direct accesses at all, all atomic
3994      accesses must be done under lock.  (when threads are
3995      cooperative the actual locking isn't needed): */
3996   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
3997     if (!TME_THREADS_COOPERATIVE) {
3998       tme_rwlock_wrlock(rwlock);
3999     }
4000     value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
4001     value_written = value_read + operand;
4002     tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
4003     if (!TME_THREADS_COOPERATIVE) {
4004       tme_rwlock_unlock(rwlock);
4005     }
4006   }
4007 
4008   /* otherwise, threads are not cooperative and this host CPU
4009      can make atomic accesses to at least the most common memory
4010      size.
4011 
4012      in that case, the only reason this function should get
4013      called is if the host CPU can't do an atomic 16-bit
4014      add at all, or if it can't do it at this alignment.
4015 
4016      we emulate the atomic 16-bit add with a compare-and-exchange: */
4017   else {
4018 
4019     /* do an atomic read of the memory: */
4020     value_read = tme_memory_atomic_read16(memory, rwlock, align_min);
4021 
4022     /* spin the add in a compare-and-exchange loop: */
4023     for (;;) {
4024 
4025       /* make the value to write: */
4026       value_written = value_read + operand;
4027 
4028       /* try the compare-and-exchange: */
4029       value_read_verify = tme_memory_atomic_cx16(memory, value_read, value_written, rwlock, align_min);
4030 
4031       /* if the compare-and-exchange failed: */
4032       if (__tme_predict_false(value_read_verify != value_read)) {
4033 
4034         /* loop with the new value read from the memory: */
4035         value_read = value_read_verify;
4036         continue;
4037       }
4038 
4039       /* stop now: */
4040       break;
4041     }
4042   }
4043 
4044   /* return the value read: */
4045   return (value_read);
4046 }
4047 
4048 /* undefine any macro version of tme_memory_atomic_sub16: */
4049 #undef tme_memory_atomic_sub16
4050 
4051 /* the 16-bit atomic sub function: */
4052 tme_uint16_t
4053 tme_memory_atomic_sub16(tme_shared tme_uint16_t *memory,
4054                         tme_uint16_t operand,
4055                         tme_rwlock_t *rwlock,
4056                         unsigned int align_min)
4057 {
4058   tme_uint16_t value_read;
4059   tme_uint16_t value_written;
4060   tme_uint16_t value_read_verify;
4061 
4062   /* if we can't make direct accesses at all, all atomic
4063      accesses must be done under lock.  (when threads are
4064      cooperative the actual locking isn't needed): */
4065   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
4066     if (!TME_THREADS_COOPERATIVE) {
4067       tme_rwlock_wrlock(rwlock);
4068     }
4069     value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
4070     value_written = value_read - operand;
4071     tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
4072     if (!TME_THREADS_COOPERATIVE) {
4073       tme_rwlock_unlock(rwlock);
4074     }
4075   }
4076 
4077   /* otherwise, threads are not cooperative and this host CPU
4078      can make atomic accesses to at least the most common memory
4079      size.
4080 
4081      in that case, the only reason this function should get
4082      called is if the host CPU can't do an atomic 16-bit
4083      sub at all, or if it can't do it at this alignment.
4084 
4085      we emulate the atomic 16-bit sub with a compare-and-exchange: */
4086   else {
4087 
4088     /* do an atomic read of the memory: */
4089     value_read = tme_memory_atomic_read16(memory, rwlock, align_min);
4090 
4091     /* spin the sub in a compare-and-exchange loop: */
4092     for (;;) {
4093 
4094       /* make the value to write: */
4095       value_written = value_read - operand;
4096 
4097       /* try the compare-and-exchange: */
4098       value_read_verify = tme_memory_atomic_cx16(memory, value_read, value_written, rwlock, align_min);
4099 
4100       /* if the compare-and-exchange failed: */
4101       if (__tme_predict_false(value_read_verify != value_read)) {
4102 
4103         /* loop with the new value read from the memory: */
4104         value_read = value_read_verify;
4105         continue;
4106       }
4107 
4108       /* stop now: */
4109       break;
4110     }
4111   }
4112 
4113   /* return the value read: */
4114   return (value_read);
4115 }
4116 
4117 /* undefine any macro version of tme_memory_atomic_mul16: */
4118 #undef tme_memory_atomic_mul16
4119 
4120 /* the 16-bit atomic mul function: */
4121 tme_uint16_t
4122 tme_memory_atomic_mul16(tme_shared tme_uint16_t *memory,
4123                         tme_uint16_t operand,
4124                         tme_rwlock_t *rwlock,
4125                         unsigned int align_min)
4126 {
4127   tme_uint16_t value_read;
4128   tme_uint16_t value_written;
4129   tme_uint16_t value_read_verify;
4130 
4131   /* if we can't make direct accesses at all, all atomic
4132      accesses must be done under lock.  (when threads are
4133      cooperative the actual locking isn't needed): */
4134   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
4135     if (!TME_THREADS_COOPERATIVE) {
4136       tme_rwlock_wrlock(rwlock);
4137     }
4138     value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
4139     value_written = value_read * operand;
4140     tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
4141     if (!TME_THREADS_COOPERATIVE) {
4142       tme_rwlock_unlock(rwlock);
4143     }
4144   }
4145 
4146   /* otherwise, threads are not cooperative and this host CPU
4147      can make atomic accesses to at least the most common memory
4148      size.
4149 
4150      in that case, the only reason this function should get
4151      called is if the host CPU can't do an atomic 16-bit
4152      mul at all, or if it can't do it at this alignment.
4153 
4154      we emulate the atomic 16-bit mul with a compare-and-exchange: */
4155   else {
4156 
4157     /* do an atomic read of the memory: */
4158     value_read = tme_memory_atomic_read16(memory, rwlock, align_min);
4159 
4160     /* spin the mul in a compare-and-exchange loop: */
4161     for (;;) {
4162 
4163       /* make the value to write: */
4164       value_written = value_read * operand;
4165 
4166       /* try the compare-and-exchange: */
4167       value_read_verify = tme_memory_atomic_cx16(memory, value_read, value_written, rwlock, align_min);
4168 
4169       /* if the compare-and-exchange failed: */
4170       if (__tme_predict_false(value_read_verify != value_read)) {
4171 
4172         /* loop with the new value read from the memory: */
4173         value_read = value_read_verify;
4174         continue;
4175       }
4176 
4177       /* stop now: */
4178       break;
4179     }
4180   }
4181 
4182   /* return the value read: */
4183   return (value_read);
4184 }
4185 
4186 /* undefine any macro version of tme_memory_atomic_div16: */
4187 #undef tme_memory_atomic_div16
4188 
4189 /* the 16-bit atomic div function: */
4190 tme_uint16_t
4191 tme_memory_atomic_div16(tme_shared tme_uint16_t *memory,
4192                         tme_uint16_t operand,
4193                         tme_rwlock_t *rwlock,
4194                         unsigned int align_min)
4195 {
4196   tme_uint16_t value_read;
4197   tme_uint16_t value_written;
4198   tme_uint16_t value_read_verify;
4199 
4200   /* if we can't make direct accesses at all, all atomic
4201      accesses must be done under lock.  (when threads are
4202      cooperative the actual locking isn't needed): */
4203   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
4204     if (!TME_THREADS_COOPERATIVE) {
4205       tme_rwlock_wrlock(rwlock);
4206     }
4207     value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
4208     value_written = value_read / operand;
4209     tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
4210     if (!TME_THREADS_COOPERATIVE) {
4211       tme_rwlock_unlock(rwlock);
4212     }
4213   }
4214 
4215   /* otherwise, threads are not cooperative and this host CPU
4216      can make atomic accesses to at least the most common memory
4217      size.
4218 
4219      in that case, the only reason this function should get
4220      called is if the host CPU can't do an atomic 16-bit
4221      div at all, or if it can't do it at this alignment.
4222 
4223      we emulate the atomic 16-bit div with a compare-and-exchange: */
4224   else {
4225 
4226     /* do an atomic read of the memory: */
4227     value_read = tme_memory_atomic_read16(memory, rwlock, align_min);
4228 
4229     /* spin the div in a compare-and-exchange loop: */
4230     for (;;) {
4231 
4232       /* make the value to write: */
4233       value_written = value_read / operand;
4234 
4235       /* try the compare-and-exchange: */
4236       value_read_verify = tme_memory_atomic_cx16(memory, value_read, value_written, rwlock, align_min);
4237 
4238       /* if the compare-and-exchange failed: */
4239       if (__tme_predict_false(value_read_verify != value_read)) {
4240 
4241         /* loop with the new value read from the memory: */
4242         value_read = value_read_verify;
4243         continue;
4244       }
4245 
4246       /* stop now: */
4247       break;
4248     }
4249   }
4250 
4251   /* return the value read: */
4252   return (value_read);
4253 }
4254 
4255 /* undefine any macro version of tme_memory_atomic_and16: */
4256 #undef tme_memory_atomic_and16
4257 
4258 /* the 16-bit atomic and function: */
4259 tme_uint16_t
4260 tme_memory_atomic_and16(tme_shared tme_uint16_t *memory,
4261                         tme_uint16_t operand,
4262                         tme_rwlock_t *rwlock,
4263                         unsigned int align_min)
4264 {
4265   tme_uint16_t value_read;
4266   tme_uint16_t value_written;
4267   tme_uint16_t value_read_verify;
4268 
4269   /* if we can't make direct accesses at all, all atomic
4270      accesses must be done under lock.  (when threads are
4271      cooperative the actual locking isn't needed): */
4272   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
4273     if (!TME_THREADS_COOPERATIVE) {
4274       tme_rwlock_wrlock(rwlock);
4275     }
4276     value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
4277     value_written = value_read & operand;
4278     tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
4279     if (!TME_THREADS_COOPERATIVE) {
4280       tme_rwlock_unlock(rwlock);
4281     }
4282   }
4283 
4284   /* otherwise, threads are not cooperative and this host CPU
4285      can make atomic accesses to at least the most common memory
4286      size.
4287 
4288      in that case, the only reason this function should get
4289      called is if the host CPU can't do an atomic 16-bit
4290      and at all, or if it can't do it at this alignment.
4291 
4292      we emulate the atomic 16-bit and with a compare-and-exchange: */
4293   else {
4294 
4295     /* do an atomic read of the memory: */
4296     value_read = tme_memory_atomic_read16(memory, rwlock, align_min);
4297 
4298     /* spin the and in a compare-and-exchange loop: */
4299     for (;;) {
4300 
4301       /* make the value to write: */
4302       value_written = value_read & operand;
4303 
4304       /* try the compare-and-exchange: */
4305       value_read_verify = tme_memory_atomic_cx16(memory, value_read, value_written, rwlock, align_min);
4306 
4307       /* if the compare-and-exchange failed: */
4308       if (__tme_predict_false(value_read_verify != value_read)) {
4309 
4310         /* loop with the new value read from the memory: */
4311         value_read = value_read_verify;
4312         continue;
4313       }
4314 
4315       /* stop now: */
4316       break;
4317     }
4318   }
4319 
4320   /* return the value read: */
4321   return (value_read);
4322 }
4323 
4324 /* undefine any macro version of tme_memory_atomic_or16: */
4325 #undef tme_memory_atomic_or16
4326 
4327 /* the 16-bit atomic or function: */
4328 tme_uint16_t
4329 tme_memory_atomic_or16(tme_shared tme_uint16_t *memory,
4330                         tme_uint16_t operand,
4331                         tme_rwlock_t *rwlock,
4332                         unsigned int align_min)
4333 {
4334   tme_uint16_t value_read;
4335   tme_uint16_t value_written;
4336   tme_uint16_t value_read_verify;
4337 
4338   /* if we can't make direct accesses at all, all atomic
4339      accesses must be done under lock.  (when threads are
4340      cooperative the actual locking isn't needed): */
4341   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
4342     if (!TME_THREADS_COOPERATIVE) {
4343       tme_rwlock_wrlock(rwlock);
4344     }
4345     value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
4346     value_written = value_read | operand;
4347     tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
4348     if (!TME_THREADS_COOPERATIVE) {
4349       tme_rwlock_unlock(rwlock);
4350     }
4351   }
4352 
4353   /* otherwise, threads are not cooperative and this host CPU
4354      can make atomic accesses to at least the most common memory
4355      size.
4356 
4357      in that case, the only reason this function should get
4358      called is if the host CPU can't do an atomic 16-bit
4359      or at all, or if it can't do it at this alignment.
4360 
4361      we emulate the atomic 16-bit or with a compare-and-exchange: */
4362   else {
4363 
4364     /* do an atomic read of the memory: */
4365     value_read = tme_memory_atomic_read16(memory, rwlock, align_min);
4366 
4367     /* spin the or in a compare-and-exchange loop: */
4368     for (;;) {
4369 
4370       /* make the value to write: */
4371       value_written = value_read | operand;
4372 
4373       /* try the compare-and-exchange: */
4374       value_read_verify = tme_memory_atomic_cx16(memory, value_read, value_written, rwlock, align_min);
4375 
4376       /* if the compare-and-exchange failed: */
4377       if (__tme_predict_false(value_read_verify != value_read)) {
4378 
4379         /* loop with the new value read from the memory: */
4380         value_read = value_read_verify;
4381         continue;
4382       }
4383 
4384       /* stop now: */
4385       break;
4386     }
4387   }
4388 
4389   /* return the value read: */
4390   return (value_read);
4391 }
4392 
4393 /* undefine any macro version of tme_memory_atomic_xor16: */
4394 #undef tme_memory_atomic_xor16
4395 
4396 /* the 16-bit atomic xor function: */
4397 tme_uint16_t
4398 tme_memory_atomic_xor16(tme_shared tme_uint16_t *memory,
4399                         tme_uint16_t operand,
4400                         tme_rwlock_t *rwlock,
4401                         unsigned int align_min)
4402 {
4403   tme_uint16_t value_read;
4404   tme_uint16_t value_written;
4405   tme_uint16_t value_read_verify;
4406 
4407   /* if we can't make direct accesses at all, all atomic
4408      accesses must be done under lock.  (when threads are
4409      cooperative the actual locking isn't needed): */
4410   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
4411     if (!TME_THREADS_COOPERATIVE) {
4412       tme_rwlock_wrlock(rwlock);
4413     }
4414     value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
4415     value_written = value_read ^ operand;
4416     tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
4417     if (!TME_THREADS_COOPERATIVE) {
4418       tme_rwlock_unlock(rwlock);
4419     }
4420   }
4421 
4422   /* otherwise, threads are not cooperative and this host CPU
4423      can make atomic accesses to at least the most common memory
4424      size.
4425 
4426      in that case, the only reason this function should get
4427      called is if the host CPU can't do an atomic 16-bit
4428      xor at all, or if it can't do it at this alignment.
4429 
4430      we emulate the atomic 16-bit xor with a compare-and-exchange: */
4431   else {
4432 
4433     /* do an atomic read of the memory: */
4434     value_read = tme_memory_atomic_read16(memory, rwlock, align_min);
4435 
4436     /* spin the xor in a compare-and-exchange loop: */
4437     for (;;) {
4438 
4439       /* make the value to write: */
4440       value_written = value_read ^ operand;
4441 
4442       /* try the compare-and-exchange: */
4443       value_read_verify = tme_memory_atomic_cx16(memory, value_read, value_written, rwlock, align_min);
4444 
4445       /* if the compare-and-exchange failed: */
4446       if (__tme_predict_false(value_read_verify != value_read)) {
4447 
4448         /* loop with the new value read from the memory: */
4449         value_read = value_read_verify;
4450         continue;
4451       }
4452 
4453       /* stop now: */
4454       break;
4455     }
4456   }
4457 
4458   /* return the value read: */
4459   return (value_read);
4460 }
4461 
4462 /* undefine any macro version of tme_memory_atomic_not16: */
4463 #undef tme_memory_atomic_not16
4464 
4465 /* the 16-bit atomic not function: */
4466 tme_uint16_t
4467 tme_memory_atomic_not16(tme_shared tme_uint16_t *memory,
4468                         tme_rwlock_t *rwlock,
4469                         unsigned int align_min)
4470 {
4471   tme_uint16_t value_read;
4472   tme_uint16_t value_written;
4473   tme_uint16_t value_read_verify;
4474 
4475   /* if we can't make direct accesses at all, all atomic
4476      accesses must be done under lock.  (when threads are
4477      cooperative the actual locking isn't needed): */
4478   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
4479     if (!TME_THREADS_COOPERATIVE) {
4480       tme_rwlock_wrlock(rwlock);
4481     }
4482     value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
4483     value_written = ~value_read;
4484     tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
4485     if (!TME_THREADS_COOPERATIVE) {
4486       tme_rwlock_unlock(rwlock);
4487     }
4488   }
4489 
4490   /* otherwise, threads are not cooperative and this host CPU
4491      can make atomic accesses to at least the most common memory
4492      size.
4493 
4494      in that case, the only reason this function should get
4495      called is if the host CPU can't do an atomic 16-bit
4496      not at all, or if it can't do it at this alignment.
4497 
4498      we emulate the atomic 16-bit not with a compare-and-exchange: */
4499   else {
4500 
4501     /* do an atomic read of the memory: */
4502     value_read = tme_memory_atomic_read16(memory, rwlock, align_min);
4503 
4504     /* spin the not in a compare-and-exchange loop: */
4505     for (;;) {
4506 
4507       /* make the value to write: */
4508       value_written = ~value_read;
4509 
4510       /* try the compare-and-exchange: */
4511       value_read_verify = tme_memory_atomic_cx16(memory, value_read, value_written, rwlock, align_min);
4512 
4513       /* if the compare-and-exchange failed: */
4514       if (__tme_predict_false(value_read_verify != value_read)) {
4515 
4516         /* loop with the new value read from the memory: */
4517         value_read = value_read_verify;
4518         continue;
4519       }
4520 
4521       /* stop now: */
4522       break;
4523     }
4524   }
4525 
4526   /* return the value read: */
4527   return (value_read);
4528 }
4529 
4530 /* undefine any macro version of tme_memory_atomic_neg16: */
4531 #undef tme_memory_atomic_neg16
4532 
4533 /* the 16-bit atomic neg function: */
4534 tme_uint16_t
4535 tme_memory_atomic_neg16(tme_shared tme_uint16_t *memory,
4536                         tme_rwlock_t *rwlock,
4537                         unsigned int align_min)
4538 {
4539   tme_uint16_t value_read;
4540   tme_uint16_t value_written;
4541   tme_uint16_t value_read_verify;
4542 
4543   /* if we can't make direct accesses at all, all atomic
4544      accesses must be done under lock.  (when threads are
4545      cooperative the actual locking isn't needed): */
4546   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
4547     if (!TME_THREADS_COOPERATIVE) {
4548       tme_rwlock_wrlock(rwlock);
4549     }
4550     value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
4551     value_written = 0 - value_read;
4552     tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
4553     if (!TME_THREADS_COOPERATIVE) {
4554       tme_rwlock_unlock(rwlock);
4555     }
4556   }
4557 
4558   /* otherwise, threads are not cooperative and this host CPU
4559      can make atomic accesses to at least the most common memory
4560      size.
4561 
4562      in that case, the only reason this function should get
4563      called is if the host CPU can't do an atomic 16-bit
4564      neg at all, or if it can't do it at this alignment.
4565 
4566      we emulate the atomic 16-bit neg with a compare-and-exchange: */
4567   else {
4568 
4569     /* do an atomic read of the memory: */
4570     value_read = tme_memory_atomic_read16(memory, rwlock, align_min);
4571 
4572     /* spin the neg in a compare-and-exchange loop: */
4573     for (;;) {
4574 
4575       /* make the value to write: */
4576       value_written = 0 - value_read;
4577 
4578       /* try the compare-and-exchange: */
4579       value_read_verify = tme_memory_atomic_cx16(memory, value_read, value_written, rwlock, align_min);
4580 
4581       /* if the compare-and-exchange failed: */
4582       if (__tme_predict_false(value_read_verify != value_read)) {
4583 
4584         /* loop with the new value read from the memory: */
4585         value_read = value_read_verify;
4586         continue;
4587       }
4588 
4589       /* stop now: */
4590       break;
4591     }
4592   }
4593 
4594   /* return the value read: */
4595   return (value_read);
4596 }
4597 
4598 /* undefine any macro version of tme_memory_atomic_xchg16: */
4599 #undef tme_memory_atomic_xchg16
4600 
4601 /* the 16-bit atomic xchg function: */
4602 tme_uint16_t
4603 tme_memory_atomic_xchg16(tme_shared tme_uint16_t *memory,
4604                         tme_uint16_t value_written,
4605                         tme_rwlock_t *rwlock,
4606                         unsigned int align_min)
4607 {
4608   tme_uint16_t value_read;
4609   tme_uint16_t value_read_verify;
4610 
4611   /* if we can't make direct accesses at all, all atomic
4612      accesses must be done under lock.  (when threads are
4613      cooperative the actual locking isn't needed): */
4614   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
4615     if (!TME_THREADS_COOPERATIVE) {
4616       tme_rwlock_wrlock(rwlock);
4617     }
4618     value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
4619     tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
4620     if (!TME_THREADS_COOPERATIVE) {
4621       tme_rwlock_unlock(rwlock);
4622     }
4623   }
4624 
4625   /* otherwise, threads are not cooperative and this host CPU
4626      can make atomic accesses to at least the most common memory
4627      size.
4628 
4629      in that case, the only reason this function should get
4630      called is if the host CPU can't do an atomic 16-bit
4631      xchg at all, or if it can't do it at this alignment.
4632 
4633      we emulate the atomic 16-bit xchg with a compare-and-exchange: */
4634   else {
4635 
4636     /* do an atomic read of the memory: */
4637     value_read = tme_memory_atomic_read16(memory, rwlock, align_min);
4638 
4639     /* spin the xchg in a compare-and-exchange loop: */
4640     for (;;) {
4641 
4642       /* try the compare-and-exchange: */
4643       value_read_verify = tme_memory_atomic_cx16(memory, value_read, value_written, rwlock, align_min);
4644 
4645       /* if the compare-and-exchange failed: */
4646       if (__tme_predict_false(value_read_verify != value_read)) {
4647 
4648         /* loop with the new value read from the memory: */
4649         value_read = value_read_verify;
4650         continue;
4651       }
4652 
4653       /* stop now: */
4654       break;
4655     }
4656   }
4657 
4658   /* return the value read: */
4659   return (value_read);
4660 }
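
/* an illustrative sketch, not part of the generated output: the atomic
   xchg doubles as a simple test-and-set.  the helper name below is
   hypothetical, and the block is excluded from compilation on purpose: */
#if 0
static int
_tme_example_try_lock16(tme_shared tme_uint16_t *flag,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  /* swap in a nonzero value; the caller acquired the flag only if the
     previous contents were zero: */
  return (tme_memory_atomic_xchg16(flag, 1, rwlock, align_min) == 0);
}
#endif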
4661 
4662 /* undefine any macro version of tme_memory_atomic_cx16: */
4663 #undef tme_memory_atomic_cx16
4664 
4665 /* the 16-bit atomic cx function: */
4666 tme_uint16_t
4667 tme_memory_atomic_cx16(tme_shared tme_uint16_t *memory,
4668                         tme_uint16_t value_cmp,
4669                         tme_uint16_t value_written,
4670                         tme_rwlock_t *rwlock,
4671                         unsigned int align_min)
4672 {
4673   tme_uint16_t value_read;
4674 
4675   /* if we can't make direct accesses at all, all atomic
4676      accesses must be done under lock.  (when threads are
4677      cooperative the actual locking isn't needed): */
4678   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
4679     if (!TME_THREADS_COOPERATIVE) {
4680       tme_rwlock_wrlock(rwlock);
4681     }
4682     value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
4683     if (value_read == value_cmp) {
4684       tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
4685     }
4686     if (!TME_THREADS_COOPERATIVE) {
4687       tme_rwlock_unlock(rwlock);
4688     }
4689   }
4690 
4691   /* otherwise, threads are not cooperative and this host CPU
4692      can make atomic accesses to at least the most common memory
4693      size.
4694 
4695      in that case, the only reason this function should get
4696      called is if the host CPU can't do an atomic 16-bit
4697      cx at all, or if it can't do it at this alignment.
4698 
4699      we assume that these problematic atomic cxs are rare,
4700      and to emulate them we simply stop all other threads while
4701      doing the cx: */
4702   else {
4703     tme_thread_suspend_others();
4704     value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
4705     if (value_read == value_cmp) {
4706       tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
4707     }
4708     tme_thread_resume_others();
4709   }
4710 
4711   /* return the value read: */
4712   return (value_read);
4713 }
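
/* an illustrative sketch, not part of the generated output: a caller
   knows that a compare-and-exchange succeeded when the value returned
   equals the value it expected.  the hypothetical helper below uses
   that to store a new maximum atomically; the block is excluded from
   compilation on purpose: */
#if 0
static tme_uint16_t
_tme_example_atomic_max16(tme_shared tme_uint16_t *memory,
                          tme_uint16_t value_new,
                          tme_rwlock_t *rwlock,
                          unsigned int align_min)
{
  tme_uint16_t value_read;
  tme_uint16_t value_read_verify;

  value_read = tme_memory_atomic_read16(memory, rwlock, align_min);
  while (value_new > value_read) {

    /* try to replace the current value with the new maximum: */
    value_read_verify = tme_memory_atomic_cx16(memory, value_read, value_new, rwlock, align_min);
    if (value_read_verify == value_read) {
      break;
    }

    /* another thread changed the memory; recheck against its value: */
    value_read = value_read_verify;
  }

  /* return the value seen before any update: */
  return (value_read);
}
#endif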
4714 
4715 /* undefine any macro version of tme_memory_atomic_read16: */
4716 #undef tme_memory_atomic_read16
4717 
4718 /* the 16-bit atomic read function: */
4719 tme_uint16_t
4720 tme_memory_atomic_read16(_tme_const tme_shared tme_uint16_t *memory,
4721                         tme_rwlock_t *rwlock,
4722                         unsigned int align_min)
4723 {
4724   tme_uint16_t value_read;
4725 
4726   /* if we can't make direct accesses at all, all atomic
4727      accesses must be done under lock.  (when threads are
4728      cooperative the actual locking isn't needed): */
4729   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
4730     if (!TME_THREADS_COOPERATIVE) {
4731       tme_rwlock_rdlock(rwlock);
4732     }
4733     value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
4734     if (!TME_THREADS_COOPERATIVE) {
4735       tme_rwlock_unlock(rwlock);
4736     }
4737   }
4738 
4739   /* otherwise, threads are not cooperative and this host CPU
4740      can make atomic accesses to at least the most common memory
4741      size.
4742 
4743      in that case, the only reason this function should get
4744      called is if the host CPU can't do an atomic 16-bit
4745      read at all, or if it can't do it at this alignment.
4746 
4747      we assume that these problematic atomic reads are rare,
4748      and to emulate them we simply stop all other threads while
4749      doing the read: */
4750   else {
4751     tme_thread_suspend_others();
4752     value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
4753     tme_thread_resume_others();
4754   }
4755 
4756   /* return the value read: */
4757   return (value_read);
4758 }
4759 
4760 /* undefine any macro version of tme_memory_atomic_write16: */
4761 #undef tme_memory_atomic_write16
4762 
4763 /* the 16-bit atomic write function: */
4764 void
4765 tme_memory_atomic_write16(tme_shared tme_uint16_t *memory,
4766                         tme_uint16_t value_written,
4767                         tme_rwlock_t *rwlock,
4768                         unsigned int align_min)
4769 {
4770 
4771   /* if we can't make direct accesses at all, all atomic
4772      accesses must be done under lock.  (when threads are
4773      cooperative the actual locking isn't needed): */
4774   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
4775     if (!TME_THREADS_COOPERATIVE) {
4776       tme_rwlock_wrlock(rwlock);
4777     }
4778     tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
4779     if (!TME_THREADS_COOPERATIVE) {
4780       tme_rwlock_unlock(rwlock);
4781     }
4782   }
4783 
4784   /* otherwise, threads are not cooperative and this host CPU
4785      can make atomic accesses to at least the most common memory
4786      size.
4787 
4788      in that case, the only reason this function should get
4789      called is if the host CPU can't do an atomic 16-bit
4790      write at all, or if it can't do it at this alignment.
4791 
4792      we assume that these problematic atomic writes are rare,
4793      and to emulate them we simply stop all other threads while
4794      doing the write: */
4795   else {
4796     tme_thread_suspend_others();
4797     tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
4798     tme_thread_resume_others();
4799   }
4800 }
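
/* an illustrative sketch, not part of the generated output: a plain
   atomic write is enough when one thread publishes a value that other
   threads only read.  the helper name below is hypothetical, and the
   block is excluded from compilation on purpose: */
#if 0
static void
_tme_example_publish_status16(tme_shared tme_uint16_t *status,
                              tme_uint16_t value,
                              tme_rwlock_t *rwlock,
                              unsigned int align_min)
{
  /* the whole 16-bit value becomes visible at once, so readers that
     use tme_memory_atomic_read16() never see a torn value: */
  tme_memory_atomic_write16(status, value, rwlock, align_min);
}
#endif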
4801 
4802 /* the 32-bit atomic operations: */
4803 
4804 /* undefine any macro version of tme_memory_atomic_add32: */
4805 #undef tme_memory_atomic_add32
4806 
4807 /* the 32-bit atomic add function: */
4808 tme_uint32_t
4809 tme_memory_atomic_add32(tme_shared tme_uint32_t *memory,
4810                         tme_uint32_t operand,
4811                         tme_rwlock_t *rwlock,
4812                         unsigned int align_min)
4813 {
4814   tme_uint32_t value_read;
4815   tme_uint32_t value_written;
4816   tme_uint32_t value_read_verify;
4817 
4818   /* if we can't make direct accesses at all, all atomic
4819      accesses must be done under lock.  (when threads are
4820      cooperative the actual locking isn't needed): */
4821   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
4822     if (!TME_THREADS_COOPERATIVE) {
4823       tme_rwlock_wrlock(rwlock);
4824     }
4825     value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
4826     value_written = value_read + operand;
4827     tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
4828     if (!TME_THREADS_COOPERATIVE) {
4829       tme_rwlock_unlock(rwlock);
4830     }
4831   }
4832 
4833   /* otherwise, threads are not cooperative and this host CPU
4834      can make atomic accesses to at least the most common memory
4835      size.
4836 
4837      in that case, the only reason this function should get
4838      called is if the host CPU can't do an atomic 32-bit
4839      add at all, or if it can't do it at this alignment.
4840 
4841      we emulate the atomic 32-bit add with a compare-and-exchange: */
4842   else {
4843 
4844     /* do an atomic read of the memory: */
4845     value_read = tme_memory_atomic_read32(memory, rwlock, align_min);
4846 
4847     /* spin the add in a compare-and-exchange loop: */
4848     for (;;) {
4849 
4850       /* make the value to write: */
4851       value_written = value_read + operand;
4852 
4853       /* try the compare-and-exchange: */
4854       value_read_verify = tme_memory_atomic_cx32(memory, value_read, value_written, rwlock, align_min);
4855 
4856       /* if the compare-and-exchange failed: */
4857       if (__tme_predict_false(value_read_verify != value_read)) {
4858 
4859         /* loop with the new value read from the memory: */
4860         value_read = value_read_verify;
4861         continue;
4862       }
4863 
4864       /* stop now: */
4865       break;
4866     }
4867   }
4868 
4869   /* return the value read: */
4870   return (value_read);
4871 }
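
/* an illustrative sketch, not part of the generated output: because
   the atomic add returns the value read before the add, a shared
   counter can hand out its previous value in one call.  the helper
   name below is hypothetical, and the block is excluded from
   compilation on purpose: */
#if 0
static tme_uint32_t
_tme_example_counter_next32(tme_shared tme_uint32_t *counter,
                            tme_rwlock_t *rwlock,
                            unsigned int align_min)
{
  /* the count before the increment comes back from the atomic add: */
  return (tme_memory_atomic_add32(counter, 1, rwlock, align_min));
}
#endif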
4872 
4873 /* undefine any macro version of tme_memory_atomic_sub32: */
4874 #undef tme_memory_atomic_sub32
4875 
4876 /* the 32-bit atomic sub function: */
4877 tme_uint32_t
4878 tme_memory_atomic_sub32(tme_shared tme_uint32_t *memory,
4879                         tme_uint32_t operand,
4880                         tme_rwlock_t *rwlock,
4881                         unsigned int align_min)
4882 {
4883   tme_uint32_t value_read;
4884   tme_uint32_t value_written;
4885   tme_uint32_t value_read_verify;
4886 
4887   /* if we can't make direct accesses at all, all atomic
4888      accesses must be done under lock.  (when threads are
4889      cooperative the actual locking isn't needed): */
4890   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
4891     if (!TME_THREADS_COOPERATIVE) {
4892       tme_rwlock_wrlock(rwlock);
4893     }
4894     value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
4895     value_written = value_read - operand;
4896     tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
4897     if (!TME_THREADS_COOPERATIVE) {
4898       tme_rwlock_unlock(rwlock);
4899     }
4900   }
4901 
4902   /* otherwise, threads are not cooperative and this host CPU
4903      can make atomic accesses to at least the most common memory
4904      size.
4905 
4906      in that case, the only reason this function should get
4907      called is if the host CPU can't do an atomic 32-bit
4908      sub at all, or if it can't do it at this alignment.
4909 
4910      we emulate the atomic 32-bit sub with a compare-and-exchange: */
4911   else {
4912 
4913     /* do an atomic read of the memory: */
4914     value_read = tme_memory_atomic_read32(memory, rwlock, align_min);
4915 
4916     /* spin the sub in a compare-and-exchange loop: */
4917     for (;;) {
4918 
4919       /* make the value to write: */
4920       value_written = value_read - operand;
4921 
4922       /* try the compare-and-exchange: */
4923       value_read_verify = tme_memory_atomic_cx32(memory, value_read, value_written, rwlock, align_min);
4924 
4925       /* if the compare-and-exchange failed: */
4926       if (__tme_predict_false(value_read_verify != value_read)) {
4927 
4928         /* loop with the new value read from the memory: */
4929         value_read = value_read_verify;
4930         continue;
4931       }
4932 
4933       /* stop now: */
4934       break;
4935     }
4936   }
4937 
4938   /* return the value read: */
4939   return (value_read);
4940 }
4941 
4942 /* undefine any macro version of tme_memory_atomic_mul32: */
4943 #undef tme_memory_atomic_mul32
4944 
4945 /* the 32-bit atomic mul function: */
4946 tme_uint32_t
4947 tme_memory_atomic_mul32(tme_shared tme_uint32_t *memory,
4948                         tme_uint32_t operand,
4949                         tme_rwlock_t *rwlock,
4950                         unsigned int align_min)
4951 {
4952   tme_uint32_t value_read;
4953   tme_uint32_t value_written;
4954   tme_uint32_t value_read_verify;
4955 
4956   /* if we can't make direct accesses at all, all atomic
4957      accesses must be done under lock.  (when threads are
4958      cooperative the actual locking isn't needed): */
4959   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
4960     if (!TME_THREADS_COOPERATIVE) {
4961       tme_rwlock_wrlock(rwlock);
4962     }
4963     value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
4964     value_written = value_read * operand;
4965     tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
4966     if (!TME_THREADS_COOPERATIVE) {
4967       tme_rwlock_unlock(rwlock);
4968     }
4969   }
4970 
4971   /* otherwise, threads are not cooperative and this host CPU
4972      can make atomic accesses to at least the most common memory
4973      size.
4974 
4975      in that case, the only reason this function should get
4976      called is if the host CPU can't do an atomic 32-bit
4977      mul at all, or if it can't do it at this alignment.
4978 
4979      we emulate the atomic 32-bit mul with a compare-and-exchange: */
4980   else {
4981 
4982     /* do an atomic read of the memory: */
4983     value_read = tme_memory_atomic_read32(memory, rwlock, align_min);
4984 
4985     /* spin the mul in a compare-and-exchange loop: */
4986     for (;;) {
4987 
4988       /* make the value to write: */
4989       value_written = value_read * operand;
4990 
4991       /* try the compare-and-exchange: */
4992       value_read_verify = tme_memory_atomic_cx32(memory, value_read, value_written, rwlock, align_min);
4993 
4994       /* if the compare-and-exchange failed: */
4995       if (__tme_predict_false(value_read_verify != value_read)) {
4996 
4997         /* loop with the new value read from the memory: */
4998         value_read = value_read_verify;
4999         continue;
5000       }
5001 
5002       /* stop now: */
5003       break;
5004     }
5005   }
5006 
5007   /* return the value read: */
5008   return (value_read);
5009 }
5010 
5011 /* undefine any macro version of tme_memory_atomic_div32: */
5012 #undef tme_memory_atomic_div32
5013 
5014 /* the 32-bit atomic div function: */
5015 tme_uint32_t
5016 tme_memory_atomic_div32(tme_shared tme_uint32_t *memory,
5017                         tme_uint32_t operand,
5018                         tme_rwlock_t *rwlock,
5019                         unsigned int align_min)
5020 {
5021   tme_uint32_t value_read;
5022   tme_uint32_t value_written;
5023   tme_uint32_t value_read_verify;
5024 
5025   /* if we can't make direct accesses at all, all atomic
5026      accesses must be done under lock.  (when threads are
5027      cooperative the actual locking isn't needed): */
5028   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
5029     if (!TME_THREADS_COOPERATIVE) {
5030       tme_rwlock_wrlock(rwlock);
5031     }
5032     value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
5033     value_written = value_read / operand;
5034     tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
5035     if (!TME_THREADS_COOPERATIVE) {
5036       tme_rwlock_unlock(rwlock);
5037     }
5038   }
5039 
5040   /* otherwise, threads are not cooperative and this host CPU
5041      can make atomic accesses to at least the most common memory
5042      size.
5043 
5044      in that case, the only reason this function should get
5045      called is if the host CPU can't do an atomic 32-bit
5046      div at all, or if it can't do it at this alignment.
5047 
5048      we emulate the atomic 32-bit div with a compare-and-exchange: */
5049   else {
5050 
5051     /* do an atomic read of the memory: */
5052     value_read = tme_memory_atomic_read32(memory, rwlock, align_min);
5053 
5054     /* spin the div in a compare-and-exchange loop: */
5055     for (;;) {
5056 
5057       /* make the value to write: */
5058       value_written = value_read / operand;
5059 
5060       /* try the compare-and-exchange: */
5061       value_read_verify = tme_memory_atomic_cx32(memory, value_read, value_written, rwlock, align_min);
5062 
5063       /* if the compare-and-exchange failed: */
5064       if (__tme_predict_false(value_read_verify != value_read)) {
5065 
5066         /* loop with the new value read from the memory: */
5067         value_read = value_read_verify;
5068         continue;
5069       }
5070 
5071       /* stop now: */
5072       break;
5073     }
5074   }
5075 
5076   /* return the value read: */
5077   return (value_read);
5078 }
5079 
5080 /* undefine any macro version of tme_memory_atomic_and32: */
5081 #undef tme_memory_atomic_and32
5082 
5083 /* the 32-bit atomic and function: */
5084 tme_uint32_t
5085 tme_memory_atomic_and32(tme_shared tme_uint32_t *memory,
5086                         tme_uint32_t operand,
5087                         tme_rwlock_t *rwlock,
5088                         unsigned int align_min)
5089 {
5090   tme_uint32_t value_read;
5091   tme_uint32_t value_written;
5092   tme_uint32_t value_read_verify;
5093 
5094   /* if we can't make direct accesses at all, all atomic
5095      accesses must be done under lock.  (when threads are
5096      cooperative the actual locking isn't needed): */
5097   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
5098     if (!TME_THREADS_COOPERATIVE) {
5099       tme_rwlock_wrlock(rwlock);
5100     }
5101     value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
5102     value_written = value_read & operand;
5103     tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
5104     if (!TME_THREADS_COOPERATIVE) {
5105       tme_rwlock_unlock(rwlock);
5106     }
5107   }
5108 
5109   /* otherwise, threads are not cooperative and this host CPU
5110      can make atomic accesses to at least the most common memory
5111      size.
5112 
5113      in that case, the only reason this function should get
5114      called is if the host CPU can't do an atomic 32-bit
5115      and at all, or if it can't do it at this alignment.
5116 
5117      we emulate the atomic 32-bit and with a compare-and-exchange: */
5118   else {
5119 
5120     /* do an atomic read of the memory: */
5121     value_read = tme_memory_atomic_read32(memory, rwlock, align_min);
5122 
5123     /* spin the and in a compare-and-exchange loop: */
5124     for (;;) {
5125 
5126       /* make the value to write: */
5127       value_written = value_read & operand;
5128 
5129       /* try the compare-and-exchange: */
5130       value_read_verify = tme_memory_atomic_cx32(memory, value_read, value_written, rwlock, align_min);
5131 
5132       /* if the compare-and-exchange failed: */
5133       if (__tme_predict_false(value_read_verify != value_read)) {
5134 
5135         /* loop with the new value read from the memory: */
5136         value_read = value_read_verify;
5137         continue;
5138       }
5139 
5140       /* stop now: */
5141       break;
5142     }
5143   }
5144 
5145   /* return the value read: */
5146   return (value_read);
5147 }
5148 
5149 /* undefine any macro version of tme_memory_atomic_or32: */
5150 #undef tme_memory_atomic_or32
5151 
5152 /* the 32-bit atomic or function: */
5153 tme_uint32_t
5154 tme_memory_atomic_or32(tme_shared tme_uint32_t *memory,
5155                         tme_uint32_t operand,
5156                         tme_rwlock_t *rwlock,
5157                         unsigned int align_min)
5158 {
5159   tme_uint32_t value_read;
5160   tme_uint32_t value_written;
5161   tme_uint32_t value_read_verify;
5162 
5163   /* if we can't make direct accesses at all, all atomic
5164      accesses must be done under lock.  (when threads are
5165      cooperative the actual locking isn't needed): */
5166   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
5167     if (!TME_THREADS_COOPERATIVE) {
5168       tme_rwlock_wrlock(rwlock);
5169     }
5170     value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
5171     value_written = value_read | operand;
5172     tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
5173     if (!TME_THREADS_COOPERATIVE) {
5174       tme_rwlock_unlock(rwlock);
5175     }
5176   }
5177 
5178   /* otherwise, threads are not cooperative and this host CPU
5179      can make atomic accesses to at least the most common memory
5180      size.
5181 
5182      in that case, the only reason this function should get
5183      called is if the host CPU can't do an atomic 32-bit
5184      or at all, or if it can't do it at this alignment.
5185 
5186      we emulate the atomic 32-bit or with a compare-and-exchange: */
5187   else {
5188 
5189     /* do an atomic read of the memory: */
5190     value_read = tme_memory_atomic_read32(memory, rwlock, align_min);
5191 
5192     /* spin the or in a compare-and-exchange loop: */
5193     for (;;) {
5194 
5195       /* make the value to write: */
5196       value_written = value_read | operand;
5197 
5198       /* try the compare-and-exchange: */
5199       value_read_verify = tme_memory_atomic_cx32(memory, value_read, value_written, rwlock, align_min);
5200 
5201       /* if the compare-and-exchange failed: */
5202       if (__tme_predict_false(value_read_verify != value_read)) {
5203 
5204         /* loop with the new value read from the memory: */
5205         value_read = value_read_verify;
5206         continue;
5207       }
5208 
5209       /* stop now: */
5210       break;
5211     }
5212   }
5213 
5214   /* return the value read: */
5215   return (value_read);
5216 }
5217 
5218 /* undefine any macro version of tme_memory_atomic_xor32: */
5219 #undef tme_memory_atomic_xor32
5220 
5221 /* the 32-bit atomic xor function: */
5222 tme_uint32_t
5223 tme_memory_atomic_xor32(tme_shared tme_uint32_t *memory,
5224                         tme_uint32_t operand,
5225                         tme_rwlock_t *rwlock,
5226                         unsigned int align_min)
5227 {
5228   tme_uint32_t value_read;
5229   tme_uint32_t value_written;
5230   tme_uint32_t value_read_verify;
5231 
5232   /* if we can't make direct accesses at all, all atomic
5233      accesses must be done under lock.  (when threads are
5234      cooperative the actual locking isn't needed): */
5235   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
5236     if (!TME_THREADS_COOPERATIVE) {
5237       tme_rwlock_wrlock(rwlock);
5238     }
5239     value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
5240     value_written = value_read ^ operand;
5241     tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
5242     if (!TME_THREADS_COOPERATIVE) {
5243       tme_rwlock_unlock(rwlock);
5244     }
5245   }
5246 
5247   /* otherwise, threads are not cooperative and this host CPU
5248      can make atomic accesses to at least the most common memory
5249      size.
5250 
5251      in that case, the only reason this function should get
5252      called is if the host CPU can't do an atomic 32-bit
5253      xor at all, or if it can't do it at this alignment.
5254 
5255      we emulate the atomic 32-bit xor with a compare-and-exchange: */
5256   else {
5257 
5258     /* do an atomic read of the memory: */
5259     value_read = tme_memory_atomic_read32(memory, rwlock, align_min);
5260 
5261     /* spin the xor in a compare-and-exchange loop: */
5262     for (;;) {
5263 
5264       /* make the value to write: */
5265       value_written = value_read ^ operand;
5266 
5267       /* try the compare-and-exchange: */
5268       value_read_verify = tme_memory_atomic_cx32(memory, value_read, value_written, rwlock, align_min);
5269 
5270       /* if the compare-and-exchange failed: */
5271       if (__tme_predict_false(value_read_verify != value_read)) {
5272 
5273         /* loop with the new value read from the memory: */
5274         value_read = value_read_verify;
5275         continue;
5276       }
5277 
5278       /* stop now: */
5279       break;
5280     }
5281   }
5282 
5283   /* return the value read: */
5284   return (value_read);
5285 }
5286 
5287 /* undefine any macro version of tme_memory_atomic_not32: */
5288 #undef tme_memory_atomic_not32
5289 
5290 /* the 32-bit atomic not function: */
5291 tme_uint32_t
5292 tme_memory_atomic_not32(tme_shared tme_uint32_t *memory,
5293                         tme_rwlock_t *rwlock,
5294                         unsigned int align_min)
5295 {
5296   tme_uint32_t value_read;
5297   tme_uint32_t value_written;
5298   tme_uint32_t value_read_verify;
5299 
5300   /* if we can't make direct accesses at all, all atomic
5301      accesses must be done under lock.  (when threads are
5302      cooperative the actual locking isn't needed): */
5303   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
5304     if (!TME_THREADS_COOPERATIVE) {
5305       tme_rwlock_wrlock(rwlock);
5306     }
5307     value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
5308     value_written = ~value_read;
5309     tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
5310     if (!TME_THREADS_COOPERATIVE) {
5311       tme_rwlock_unlock(rwlock);
5312     }
5313   }
5314 
5315   /* otherwise, threads are not cooperative and this host CPU
5316      can make atomic accesses to at least the most common memory
5317      size.
5318 
5319      in that case, the only reason this function should get
5320      called is if the host CPU can't do an atomic 32-bit
5321      not at all, or if it can't do it at this alignment.
5322 
5323      we emulate the atomic 32-bit not with a compare-and-exchange: */
5324   else {
5325 
5326     /* do an atomic read of the memory: */
5327     value_read = tme_memory_atomic_read32(memory, rwlock, align_min);
5328 
5329     /* spin the not in a compare-and-exchange loop: */
5330     for (;;) {
5331 
5332       /* make the value to write: */
5333       value_written = ~value_read;
5334 
5335       /* try the compare-and-exchange: */
5336       value_read_verify = tme_memory_atomic_cx32(memory, value_read, value_written, rwlock, align_min);
5337 
5338       /* if the compare-and-exchange failed: */
5339       if (__tme_predict_false(value_read_verify != value_read)) {
5340 
5341         /* loop with the new value read from the memory: */
5342         value_read = value_read_verify;
5343         continue;
5344       }
5345 
5346       /* stop now: */
5347       break;
5348     }
5349   }
5350 
5351   /* return the value read: */
5352   return (value_read);
5353 }
5354 
5355 /* undefine any macro version of tme_memory_atomic_neg32: */
5356 #undef tme_memory_atomic_neg32
5357 
5358 /* the 32-bit atomic neg function: */
5359 tme_uint32_t
5360 tme_memory_atomic_neg32(tme_shared tme_uint32_t *memory,
5361                         tme_rwlock_t *rwlock,
5362                         unsigned int align_min)
5363 {
5364   tme_uint32_t value_read;
5365   tme_uint32_t value_written;
5366   tme_uint32_t value_read_verify;
5367 
5368   /* if we can't make direct accesses at all, all atomic
5369      accesses must be done under lock.  (when threads are
5370      cooperative the actual locking isn't needed): */
5371   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
5372     if (!TME_THREADS_COOPERATIVE) {
5373       tme_rwlock_wrlock(rwlock);
5374     }
5375     value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
5376     value_written = 0 - value_read;
5377     tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
5378     if (!TME_THREADS_COOPERATIVE) {
5379       tme_rwlock_unlock(rwlock);
5380     }
5381   }
5382 
5383   /* otherwise, threads are not cooperative and this host CPU
5384      can make atomic accesses to at least the most common memory
5385      size.
5386 
5387      in that case, the only reason this function should get
5388      called is if the host CPU can't do an atomic 32-bit
5389      neg at all, or if it can't do it at this alignment.
5390 
5391      we emulate the atomic 32-bit neg with a compare-and-exchange: */
5392   else {
5393 
5394     /* do an atomic read of the memory: */
5395     value_read = tme_memory_atomic_read32(memory, rwlock, align_min);
5396 
5397     /* spin the neg in a compare-and-exchange loop: */
5398     for (;;) {
5399 
5400       /* make the value to write: */
5401       value_written = 0 - value_read;
5402 
5403       /* try the compare-and-exchange: */
5404       value_read_verify = tme_memory_atomic_cx32(memory, value_read, value_written, rwlock, align_min);
5405 
5406       /* if the compare-and-exchange failed: */
5407       if (__tme_predict_false(value_read_verify != value_read)) {
5408 
5409         /* loop with the new value read from the memory: */
5410         value_read = value_read_verify;
5411         continue;
5412       }
5413 
5414       /* stop now: */
5415       break;
5416     }
5417   }
5418 
5419   /* return the value read: */
5420   return (value_read);
5421 }
5422 
5423 /* undefine any macro version of tme_memory_atomic_xchg32: */
5424 #undef tme_memory_atomic_xchg32
5425 
5426 /* the 32-bit atomic xchg function: */
5427 tme_uint32_t
5428 tme_memory_atomic_xchg32(tme_shared tme_uint32_t *memory,
5429                         tme_uint32_t value_written,
5430                         tme_rwlock_t *rwlock,
5431                         unsigned int align_min)
5432 {
5433   tme_uint32_t value_read;
5434   tme_uint32_t value_read_verify;
5435 
5436   /* if we can't make direct accesses at all, all atomic
5437      accesses must be done under lock.  (when threads are
5438      cooperative the actual locking isn't needed): */
5439   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
5440     if (!TME_THREADS_COOPERATIVE) {
5441       tme_rwlock_wrlock(rwlock);
5442     }
5443     value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
5444     tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
5445     if (!TME_THREADS_COOPERATIVE) {
5446       tme_rwlock_unlock(rwlock);
5447     }
5448   }
5449 
5450   /* otherwise, threads are not cooperative and this host CPU
5451      can make atomic accesses to at least the most common memory
5452      size.
5453 
5454      in that case, the only reason this function should get
5455      called is if the host CPU can't do an atomic 32-bit
5456      xchg at all, or if it can't do it at this alignment.
5457 
5458      we emulate the atomic 32-bit xchg with a compare-and-exchange: */
5459   else {
5460 
5461     /* do an atomic read of the memory: */
5462     value_read = tme_memory_atomic_read32(memory, rwlock, align_min);
5463 
5464     /* spin the xchg in a compare-and-exchange loop: */
5465     for (;;) {
5466 
5467       /* try the compare-and-exchange: */
5468       value_read_verify = tme_memory_atomic_cx32(memory, value_read, value_written, rwlock, align_min);
5469 
5470       /* if the compare-and-exchange failed: */
5471       if (__tme_predict_false(value_read_verify != value_read)) {
5472 
5473         /* loop with the new value read from the memory: */
5474         value_read = value_read_verify;
5475         continue;
5476       }
5477 
5478       /* stop now: */
5479       break;
5480     }
5481   }
5482 
5483   /* return the value read: */
5484   return (value_read);
5485 }
5486 
5487 /* undefine any macro version of tme_memory_atomic_cx32: */
5488 #undef tme_memory_atomic_cx32
5489 
5490 /* the 32-bit atomic cx function: */
5491 tme_uint32_t
5492 tme_memory_atomic_cx32(tme_shared tme_uint32_t *memory,
5493                         tme_uint32_t value_cmp,
5494                         tme_uint32_t value_written,
5495                         tme_rwlock_t *rwlock,
5496                         unsigned int align_min)
5497 {
5498   tme_uint32_t value_read;
5499 
5500   /* if we can't make direct accesses at all, all atomic
5501      accesses must be done under lock.  (when threads are
5502      cooperative the actual locking isn't needed): */
5503   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
5504     if (!TME_THREADS_COOPERATIVE) {
5505       tme_rwlock_wrlock(rwlock);
5506     }
5507     value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
5508     if (value_read == value_cmp) {
5509       tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
5510     }
5511     if (!TME_THREADS_COOPERATIVE) {
5512       tme_rwlock_unlock(rwlock);
5513     }
5514   }
5515 
5516   /* otherwise, threads are not cooperative and this host CPU
5517      can make atomic accesses to at least the most common memory
5518      size.
5519 
5520      in that case, the only reason this function should get
5521      called is if the host CPU can't do an atomic 32-bit
5522      cx at all, or if it can't do it at this alignment.
5523 
5524      we assume that these problematic atomic cxs are rare,
5525      and to emulate them we simply stop all other threads while
5526      doing the cx: */
5527   else {
5528     tme_thread_suspend_others();
5529     value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
5530     if (value_read == value_cmp) {
5531       tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
5532     }
5533     tme_thread_resume_others();
5534   }
5535 
5536   /* return the value read: */
5537   return (value_read);
5538 }
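
/* an illustrative sketch, not part of the generated output: the
   compare-and-exchange also supports conditional updates, for example
   a reference count that must never be decremented below zero.  the
   helper name below is hypothetical, and the block is excluded from
   compilation on purpose: */
#if 0
static tme_uint32_t
_tme_example_atomic_dec_nonzero32(tme_shared tme_uint32_t *refcount,
                                  tme_rwlock_t *rwlock,
                                  unsigned int align_min)
{
  tme_uint32_t value_read;
  tme_uint32_t value_read_verify;

  value_read = tme_memory_atomic_read32(refcount, rwlock, align_min);
  while (value_read != 0) {

    /* try to decrement the count we last saw: */
    value_read_verify = tme_memory_atomic_cx32(refcount, value_read, value_read - 1, rwlock, align_min);
    if (value_read_verify == value_read) {
      break;
    }

    /* another thread changed the count; recheck its new value: */
    value_read = value_read_verify;
  }

  /* return the count as it was before any decrement (zero means no
     decrement happened): */
  return (value_read);
}
#endif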
5539 
5540 /* undefine any macro version of tme_memory_atomic_read32: */
5541 #undef tme_memory_atomic_read32
5542 
5543 /* the 32-bit atomic read function: */
5544 tme_uint32_t
5545 tme_memory_atomic_read32(_tme_const tme_shared tme_uint32_t *memory,
5546                         tme_rwlock_t *rwlock,
5547                         unsigned int align_min)
5548 {
5549   tme_uint32_t value_read;
5550 
5551   /* if we can't make direct accesses at all, all atomic
5552      accesses must be done under lock.  (when threads are
5553      cooperative the actual locking isn't needed): */
5554   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
5555     if (!TME_THREADS_COOPERATIVE) {
5556       tme_rwlock_rdlock(rwlock);
5557     }
5558     value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
5559     if (!TME_THREADS_COOPERATIVE) {
5560       tme_rwlock_unlock(rwlock);
5561     }
5562   }
5563 
5564   /* otherwise, threads are not cooperative and this host CPU
5565      can make atomic accesses to at least the most common memory
5566      size.
5567 
5568      in that case, the only reason this function should get
5569      called is if the host CPU can't do an atomic 32-bit
5570      read at all, or if it can't do it at this alignment.
5571 
5572      we assume that these problematic atomic reads are rare,
5573      and to emulate them we simply stop all other threads while
5574      doing the read: */
5575   else {
5576     tme_thread_suspend_others();
5577     value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
5578     tme_thread_resume_others();
5579   }
5580 
5581   /* return the value read: */
5582   return (value_read);
5583 }
5584 
5585 /* undefine any macro version of tme_memory_atomic_write32: */
5586 #undef tme_memory_atomic_write32
5587 
5588 /* the 32-bit atomic write function: */
5589 void
5590 tme_memory_atomic_write32(tme_shared tme_uint32_t *memory,
5591                         tme_uint32_t value_written,
5592                         tme_rwlock_t *rwlock,
5593                         unsigned int align_min)
5594 {
5595 
5596   /* if we can't make direct accesses at all, all atomic
5597      accesses must be done under lock.  (when threads are
5598      cooperative the actual locking isn't needed): */
5599   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
5600     if (!TME_THREADS_COOPERATIVE) {
5601       tme_rwlock_wrlock(rwlock);
5602     }
5603     tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
5604     if (!TME_THREADS_COOPERATIVE) {
5605       tme_rwlock_unlock(rwlock);
5606     }
5607   }
5608 
5609   /* otherwise, threads are not cooperative and this host CPU
5610      can make atomic accesses to at least the most common memory
5611      size.
5612 
5613      in that case, the only reason this function should get
5614      called is if the host CPU can't do an atomic 32-bit
5615      write at all, or if it can't do it at this alignment.
5616 
5617      we assume that these problematic atomic writes are rare,
5618      and to emulate them we simply stop all other threads while
5619      doing the write: */
5620   else {
5621     tme_thread_suspend_others();
5622     tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
5623     tme_thread_resume_others();
5624   }
5625 }
5626 
5627 #ifdef TME_HAVE_INT64_T
5628 
5629 /* the 64-bit atomic operations: */
5630 
5631 /* undefine any macro version of tme_memory_atomic_add64: */
5632 #undef tme_memory_atomic_add64
5633 
5634 /* the 64-bit atomic add function: */
5635 tme_uint64_t
5636 tme_memory_atomic_add64(tme_shared tme_uint64_t *memory,
5637                         tme_uint64_t operand,
5638                         tme_rwlock_t *rwlock,
5639                         unsigned int align_min)
5640 {
5641   tme_uint64_t value_read;
5642   tme_uint64_t value_written;
5643   tme_uint64_t value_read_verify;
5644 
5645   /* if we can't make direct accesses at all, all atomic
5646      accesses must be done under lock.  (when threads are
5647      cooperative the actual locking isn't needed): */
5648   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
5649     if (!TME_THREADS_COOPERATIVE) {
5650       tme_rwlock_wrlock(rwlock);
5651     }
5652     value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
5653     value_written = value_read + operand;
5654     tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
5655     if (!TME_THREADS_COOPERATIVE) {
5656       tme_rwlock_unlock(rwlock);
5657     }
5658   }
5659 
5660   /* otherwise, threads are not cooperative and this host CPU
5661      can make atomic accesses to at least the most common memory
5662      size.
5663 
5664      in that case, the only reason this function should get
5665      called is if the host CPU can't do an atomic 64-bit
5666      add at all, or if it can't do it at this alignment.
5667 
5668      we emulate the atomic 64-bit add with a compare-and-exchange: */
5669   else {
5670 
5671     /* do an atomic read of the memory: */
5672     value_read = tme_memory_atomic_read64(memory, rwlock, align_min);
5673 
5674     /* spin the add in a compare-and-exchange loop: */
5675     for (;;) {
5676 
5677       /* make the value to write: */
5678       value_written = value_read + operand;
5679 
5680       /* try the compare-and-exchange: */
5681       value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);
5682 
5683       /* if the compare-and-exchange failed: */
5684       if (__tme_predict_false(value_read_verify != value_read)) {
5685 
5686         /* loop with the new value read from the memory: */
5687         value_read = value_read_verify;
5688         continue;
5689       }
5690 
5691       /* stop now: */
5692       break;
5693     }
5694   }
5695 
5696   /* return the value read: */
5697   return (value_read);
5698 }
5699 
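/* an illustrative sketch (guarded out): in both branches above the
   function returns value_read, the contents of the memory from before
   the add, so it behaves as a fetch-and-add.  the cycle-counter names
   below are hypothetical: */
#if 0
static tme_uint64_t
_tme_example_cycles_advance(tme_shared tme_uint64_t *cycles,
                            tme_uint64_t delta,
                            tme_rwlock_t *cycles_rwlock)
{
  /* advance the shared counter and return its previous value: */
  return (tme_memory_atomic_add64(cycles, delta, cycles_rwlock, sizeof(tme_uint64_t)));
}
#endif
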
5700 /* undefine any macro version of tme_memory_atomic_sub64: */
5701 #undef tme_memory_atomic_sub64
5702 
5703 /* the 64-bit atomic sub function: */
5704 tme_uint64_t
5705 tme_memory_atomic_sub64(tme_shared tme_uint64_t *memory,
5706                         tme_uint64_t operand,
5707                         tme_rwlock_t *rwlock,
5708                         unsigned int align_min)
5709 {
5710   tme_uint64_t value_read;
5711   tme_uint64_t value_written;
5712   tme_uint64_t value_read_verify;
5713 
5714   /* if we can't make direct accesses at all, all atomic
5715      accesses must be done under lock.  (when threads are
5716      cooperative the actual locking isn't needed): */
5717   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
5718     if (!TME_THREADS_COOPERATIVE) {
5719       tme_rwlock_wrlock(rwlock);
5720     }
5721     value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
5722     value_written = value_read - operand;
5723     tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
5724     if (!TME_THREADS_COOPERATIVE) {
5725       tme_rwlock_unlock(rwlock);
5726     }
5727   }
5728 
5729   /* otherwise, threads are not cooperative and this host CPU
5730      can make atomic accesses to at least the most common memory
5731      size.
5732 
5733      in that case, the only reason this function should get
5734      called is if the host CPU can't do an atomic 64-bit
5735      sub at all, or if it can't do it at this alignment.
5736 
5737      we emulate the atomic 64-bit sub with a compare-and-exchange: */
5738   else {
5739 
5740     /* do an atomic read of the memory: */
5741     value_read = tme_memory_atomic_read64(memory, rwlock, align_min);
5742 
5743     /* spin the sub in a compare-and-exchange loop: */
5744     for (;;) {
5745 
5746       /* make the value to write: */
5747       value_written = value_read - operand;
5748 
5749       /* try the compare-and-exchange: */
5750       value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);
5751 
5752       /* if the compare-and-exchange failed: */
5753       if (__tme_predict_false(value_read_verify != value_read)) {
5754 
5755         /* loop with the new value read from the memory: */
5756         value_read = value_read_verify;
5757         continue;
5758       }
5759 
5760       /* stop now: */
5761       break;
5762     }
5763   }
5764 
5765   /* return the value read: */
5766   return (value_read);
5767 }
5768 
5769 /* undefine any macro version of tme_memory_atomic_mul64: */
5770 #undef tme_memory_atomic_mul64
5771 
5772 /* the 64-bit atomic mul function: */
5773 tme_uint64_t
5774 tme_memory_atomic_mul64(tme_shared tme_uint64_t *memory,
5775                         tme_uint64_t operand,
5776                         tme_rwlock_t *rwlock,
5777                         unsigned int align_min)
5778 {
5779   tme_uint64_t value_read;
5780   tme_uint64_t value_written;
5781   tme_uint64_t value_read_verify;
5782 
5783   /* if we can't make direct accesses at all, all atomic
5784      accesses must be done under lock.  (when threads are
5785      cooperative the actual locking isn't needed): */
5786   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
5787     if (!TME_THREADS_COOPERATIVE) {
5788       tme_rwlock_wrlock(rwlock);
5789     }
5790     value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
5791     value_written = value_read * operand;
5792     tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
5793     if (!TME_THREADS_COOPERATIVE) {
5794       tme_rwlock_unlock(rwlock);
5795     }
5796   }
5797 
5798   /* otherwise, threads are not cooperative and this host CPU
5799      can make atomic accesses to at least the most common memory
5800      size.
5801 
5802      in that case, the only reason this function should get
5803      called is if the host CPU can't do an atomic 64-bit
5804      mul at all, or if it can't do it at this alignment.
5805 
5806      we emulate the atomic 64-bit mul with a compare-and-exchange: */
5807   else {
5808 
5809     /* do an atomic read of the memory: */
5810     value_read = tme_memory_atomic_read64(memory, rwlock, align_min);
5811 
5812     /* spin the mul in a compare-and-exchange loop: */
5813     for (;;) {
5814 
5815       /* make the value to write: */
5816       value_written = value_read * operand;
5817 
5818       /* try the compare-and-exchange: */
5819       value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);
5820 
5821       /* if the compare-and-exchange failed: */
5822       if (__tme_predict_false(value_read_verify != value_read)) {
5823 
5824         /* loop with the new value read from the memory: */
5825         value_read = value_read_verify;
5826         continue;
5827       }
5828 
5829       /* stop now: */
5830       break;
5831     }
5832   }
5833 
5834   /* return the value read: */
5835   return (value_read);
5836 }
5837 
5838 /* undefine any macro version of tme_memory_atomic_div64: */
5839 #undef tme_memory_atomic_div64
5840 
5841 /* the 64-bit atomic div function: */
5842 tme_uint64_t
5843 tme_memory_atomic_div64(tme_shared tme_uint64_t *memory,
5844                         tme_uint64_t operand,
5845                         tme_rwlock_t *rwlock,
5846                         unsigned int align_min)
5847 {
5848   tme_uint64_t value_read;
5849   tme_uint64_t value_written;
5850   tme_uint64_t value_read_verify;
5851 
5852   /* if we can't make direct accesses at all, all atomic
5853      accesses must be done under lock.  (when threads are
5854      cooperative the actual locking isn't needed): */
5855   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
5856     if (!TME_THREADS_COOPERATIVE) {
5857       tme_rwlock_wrlock(rwlock);
5858     }
5859     value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
5860     value_written = value_read / operand;
5861     tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
5862     if (!TME_THREADS_COOPERATIVE) {
5863       tme_rwlock_unlock(rwlock);
5864     }
5865   }
5866 
5867   /* otherwise, threads are not cooperative and this host CPU
5868      can make atomic accesses to at least the most common memory
5869      size.
5870 
5871      in that case, the only reason this function should get
5872      called is if the host CPU can't do an atomic 64-bit
5873      div at all, or if it can't do it at this alignment.
5874 
5875      we emulate the atomic 64-bit div with a compare-and-exchange: */
5876   else {
5877 
5878     /* do an atomic read of the memory: */
5879     value_read = tme_memory_atomic_read64(memory, rwlock, align_min);
5880 
5881     /* spin the div in a compare-and-exchange loop: */
5882     for (;;) {
5883 
5884       /* make the value to write: */
5885       value_written = value_read / operand;
5886 
5887       /* try the compare-and-exchange: */
5888       value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);
5889 
5890       /* if the compare-and-exchange failed: */
5891       if (__tme_predict_false(value_read_verify != value_read)) {
5892 
5893         /* loop with the new value read from the memory: */
5894         value_read = value_read_verify;
5895         continue;
5896       }
5897 
5898       /* stop now: */
5899       break;
5900     }
5901   }
5902 
5903   /* return the value read: */
5904   return (value_read);
5905 }
5906 
5907 /* undefine any macro version of tme_memory_atomic_and64: */
5908 #undef tme_memory_atomic_and64
5909 
5910 /* the 64-bit atomic and function: */
5911 tme_uint64_t
5912 tme_memory_atomic_and64(tme_shared tme_uint64_t *memory,
5913                         tme_uint64_t operand,
5914                         tme_rwlock_t *rwlock,
5915                         unsigned int align_min)
5916 {
5917   tme_uint64_t value_read;
5918   tme_uint64_t value_written;
5919   tme_uint64_t value_read_verify;
5920 
5921   /* if we can't make direct accesses at all, all atomic
5922      accesses must be done under lock.  (when threads are
5923      cooperative the actual locking isn't needed): */
5924   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
5925     if (!TME_THREADS_COOPERATIVE) {
5926       tme_rwlock_wrlock(rwlock);
5927     }
5928     value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
5929     value_written = value_read & operand;
5930     tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
5931     if (!TME_THREADS_COOPERATIVE) {
5932       tme_rwlock_unlock(rwlock);
5933     }
5934   }
5935 
5936   /* otherwise, threads are not cooperative and this host CPU
5937      can make atomic accesses to at least the most common memory
5938      size.
5939 
5940      in that case, the only reason this function should get
5941      called is if the host CPU can't do an atomic 64-bit
5942      and at all, or if it can't do it at this alignment.
5943 
5944      we emulate the atomic 64-bit and with a compare-and-exchange: */
5945   else {
5946 
5947     /* do an atomic read of the memory: */
5948     value_read = tme_memory_atomic_read64(memory, rwlock, align_min);
5949 
5950     /* spin the and in a compare-and-exchange loop: */
5951     for (;;) {
5952 
5953       /* make the value to write: */
5954       value_written = value_read & operand;
5955 
5956       /* try the compare-and-exchange: */
5957       value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);
5958 
5959       /* if the compare-and-exchange failed: */
5960       if (__tme_predict_false(value_read_verify != value_read)) {
5961 
5962         /* loop with the new value read from the memory: */
5963         value_read = value_read_verify;
5964         continue;
5965       }
5966 
5967       /* stop now: */
5968       break;
5969     }
5970   }
5971 
5972   /* return the value read: */
5973   return (value_read);
5974 }
5975 
5976 /* undefine any macro version of tme_memory_atomic_or64: */
5977 #undef tme_memory_atomic_or64
5978 
5979 /* the 64-bit atomic or function: */
5980 tme_uint64_t
5981 tme_memory_atomic_or64(tme_shared tme_uint64_t *memory,
5982                         tme_uint64_t operand,
5983                         tme_rwlock_t *rwlock,
5984                         unsigned int align_min)
5985 {
5986   tme_uint64_t value_read;
5987   tme_uint64_t value_written;
5988   tme_uint64_t value_read_verify;
5989 
5990   /* if we can't make direct accesses at all, all atomic
5991      accesses must be done under lock.  (when threads are
5992      cooperative the actual locking isn't needed): */
5993   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
5994     if (!TME_THREADS_COOPERATIVE) {
5995       tme_rwlock_wrlock(rwlock);
5996     }
5997     value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
5998     value_written = value_read | operand;
5999     tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
6000     if (!TME_THREADS_COOPERATIVE) {
6001       tme_rwlock_unlock(rwlock);
6002     }
6003   }
6004 
6005   /* otherwise, threads are not cooperative and this host CPU
6006      can make atomic accesses to at least the most common memory
6007      size.
6008 
6009      in that case, the only reason this function should get
6010      called is if the host CPU can't do an atomic 64-bit
6011      or at all, or if it can't do it at this alignment.
6012 
6013      we emulate the atomic 64-bit or with a compare-and-exchange: */
6014   else {
6015 
6016     /* do an atomic read of the memory: */
6017     value_read = tme_memory_atomic_read64(memory, rwlock, align_min);
6018 
6019     /* spin the or in a compare-and-exchange loop: */
6020     for (;;) {
6021 
6022       /* make the value to write: */
6023       value_written = value_read | operand;
6024 
6025       /* try the compare-and-exchange: */
6026       value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);
6027 
6028       /* if the compare-and-exchange failed: */
6029       if (__tme_predict_false(value_read_verify != value_read)) {
6030 
6031         /* loop with the new value read from the memory: */
6032         value_read = value_read_verify;
6033         continue;
6034       }
6035 
6036       /* stop now: */
6037       break;
6038     }
6039   }
6040 
6041   /* return the value read: */
6042   return (value_read);
6043 }
6044 
6045 /* undefine any macro version of tme_memory_atomic_xor64: */
6046 #undef tme_memory_atomic_xor64
6047 
6048 /* the 64-bit atomic xor function: */
6049 tme_uint64_t
6050 tme_memory_atomic_xor64(tme_shared tme_uint64_t *memory,
6051                         tme_uint64_t operand,
6052                         tme_rwlock_t *rwlock,
6053                         unsigned int align_min)
6054 {
6055   tme_uint64_t value_read;
6056   tme_uint64_t value_written;
6057   tme_uint64_t value_read_verify;
6058 
6059   /* if we can't make direct accesses at all, all atomic
6060      accesses must be done under lock.  (when threads are
6061      cooperative the actual locking isn't needed): */
6062   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
6063     if (!TME_THREADS_COOPERATIVE) {
6064       tme_rwlock_wrlock(rwlock);
6065     }
6066     value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
6067     value_written = value_read ^ operand;
6068     tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
6069     if (!TME_THREADS_COOPERATIVE) {
6070       tme_rwlock_unlock(rwlock);
6071     }
6072   }
6073 
6074   /* otherwise, threads are not cooperative and this host CPU
6075      can make atomic accesses to at least the most common memory
6076      size.
6077 
6078      in that case, the only reason this function should get
6079      called is if the host CPU can't do an atomic 64-bit
6080      xor at all, or if it can't do it at this alignment.
6081 
6082      we emulate the atomic 64-bit xor with a compare-and-exchange: */
6083   else {
6084 
6085     /* do an atomic read of the memory: */
6086     value_read = tme_memory_atomic_read64(memory, rwlock, align_min);
6087 
6088     /* spin the xor in a compare-and-exchange loop: */
6089     for (;;) {
6090 
6091       /* make the value to write: */
6092       value_written = value_read ^ operand;
6093 
6094       /* try the compare-and-exchange: */
6095       value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);
6096 
6097       /* if the compare-and-exchange failed: */
6098       if (__tme_predict_false(value_read_verify != value_read)) {
6099 
6100         /* loop with the new value read from the memory: */
6101         value_read = value_read_verify;
6102         continue;
6103       }
6104 
6105       /* stop now: */
6106       break;
6107     }
6108   }
6109 
6110   /* return the value read: */
6111   return (value_read);
6112 }
6113 
6114 /* undefine any macro version of tme_memory_atomic_not64: */
6115 #undef tme_memory_atomic_not64
6116 
6117 /* the 64-bit atomic not function: */
6118 tme_uint64_t
6119 tme_memory_atomic_not64(tme_shared tme_uint64_t *memory,
6120                         tme_rwlock_t *rwlock,
6121                         unsigned int align_min)
6122 {
6123   tme_uint64_t value_read;
6124   tme_uint64_t value_written;
6125   tme_uint64_t value_read_verify;
6126 
6127   /* if we can't make direct accesses at all, all atomic
6128      accesses must be done under lock.  (when threads are
6129      cooperative the actual locking isn't needed): */
6130   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
6131     if (!TME_THREADS_COOPERATIVE) {
6132       tme_rwlock_wrlock(rwlock);
6133     }
6134     value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
6135     value_written = ~value_read;
6136     tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
6137     if (!TME_THREADS_COOPERATIVE) {
6138       tme_rwlock_unlock(rwlock);
6139     }
6140   }
6141 
6142   /* otherwise, threads are not cooperative and this host CPU
6143      can make atomic accesses to at least the most common memory
6144      size.
6145 
6146      in that case, the only reason this function should get
6147      called is if the host CPU can't do an atomic 64-bit
6148      not at all, or if it can't do it at this alignment.
6149 
6150      we emulate the atomic 64-bit not with a compare-and-exchange: */
6151   else {
6152 
6153     /* do an atomic read of the memory: */
6154     value_read = tme_memory_atomic_read64(memory, rwlock, align_min);
6155 
6156     /* spin the not in a compare-and-exchange loop: */
6157     for (;;) {
6158 
6159       /* make the value to write: */
6160       value_written = ~value_read;
6161 
6162       /* try the compare-and-exchange: */
6163       value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);
6164 
6165       /* if the compare-and-exchange failed: */
6166       if (__tme_predict_false(value_read_verify != value_read)) {
6167 
6168         /* loop with the new value read from the memory: */
6169         value_read = value_read_verify;
6170         continue;
6171       }
6172 
6173       /* stop now: */
6174       break;
6175     }
6176   }
6177 
6178   /* return the value read: */
6179   return (value_read);
6180 }
6181 
6182 /* undefine any macro version of tme_memory_atomic_neg64: */
6183 #undef tme_memory_atomic_neg64
6184 
6185 /* the 64-bit atomic neg function: */
6186 tme_uint64_t
6187 tme_memory_atomic_neg64(tme_shared tme_uint64_t *memory,
6188                         tme_rwlock_t *rwlock,
6189                         unsigned int align_min)
6190 {
6191   tme_uint64_t value_read;
6192   tme_uint64_t value_written;
6193   tme_uint64_t value_read_verify;
6194 
6195   /* if we can't make direct accesses at all, all atomic
6196      accesses must be done under lock.  (when threads are
6197      cooperative the actual locking isn't needed): */
6198   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
6199     if (!TME_THREADS_COOPERATIVE) {
6200       tme_rwlock_wrlock(rwlock);
6201     }
6202     value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
6203     value_written = 0 - value_read;
6204     tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
6205     if (!TME_THREADS_COOPERATIVE) {
6206       tme_rwlock_unlock(rwlock);
6207     }
6208   }
6209 
6210   /* otherwise, threads are not cooperative and this host CPU
6211      can make atomic accesses to at least the most common memory
6212      size.
6213 
6214      in that case, the only reason this function should get
6215      called is if the host CPU can't do an atomic 64-bit
6216      neg at all, or if it can't do it at this alignment.
6217 
6218      we emulate the atomic 64-bit neg with a compare-and-exchange: */
6219   else {
6220 
6221     /* do an atomic read of the memory: */
6222     value_read = tme_memory_atomic_read64(memory, rwlock, align_min);
6223 
6224     /* spin the neg in a compare-and-exchange loop: */
6225     for (;;) {
6226 
6227       /* make the value to write: */
6228       value_written = 0 - value_read;
6229 
6230       /* try the compare-and-exchange: */
6231       value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);
6232 
6233       /* if the compare-and-exchange failed: */
6234       if (__tme_predict_false(value_read_verify != value_read)) {
6235 
6236         /* loop with the new value read from the memory: */
6237         value_read = value_read_verify;
6238         continue;
6239       }
6240 
6241       /* stop now: */
6242       break;
6243     }
6244   }
6245 
6246   /* return the value read: */
6247   return (value_read);
6248 }
6249 
6250 /* undefine any macro version of tme_memory_atomic_xchg64: */
6251 #undef tme_memory_atomic_xchg64
6252 
6253 /* the 64-bit atomic xchg function: */
6254 tme_uint64_t
6255 tme_memory_atomic_xchg64(tme_shared tme_uint64_t *memory,
6256                         tme_uint64_t value_written,
6257                         tme_rwlock_t *rwlock,
6258                         unsigned int align_min)
6259 {
6260   tme_uint64_t value_read;
6261   tme_uint64_t value_read_verify;
6262 
6263   /* if we can't make direct accesses at all, all atomic
6264      accesses must be done under lock.  (when threads are
6265      cooperative the actual locking isn't needed): */
6266   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
6267     if (!TME_THREADS_COOPERATIVE) {
6268       tme_rwlock_wrlock(rwlock);
6269     }
6270     value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
6271     tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
6272     if (!TME_THREADS_COOPERATIVE) {
6273       tme_rwlock_unlock(rwlock);
6274     }
6275   }
6276 
6277   /* otherwise, threads are not cooperative and this host CPU
6278      can make atomic accesses to at least the most common memory
6279      size.
6280 
6281      in that case, the only reason this function should get
6282      called is if the host CPU can't do an atomic 64-bit
6283      xchg at all, or if it can't do it at this alignment.
6284 
6285      we emulate the atomic 64-bit xchg with a compare-and-exchange: */
6286   else {
6287 
6288     /* do an atomic read of the memory: */
6289     value_read = tme_memory_atomic_read64(memory, rwlock, align_min);
6290 
6291     /* spin the xchg in a compare-and-exchange loop: */
6292     for (;;) {
6293 
6294       /* try the compare-and-exchange: */
6295       value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);
6296 
6297       /* if the compare-and-exchange failed: */
6298       if (__tme_predict_false(value_read_verify != value_read)) {
6299 
6300         /* loop with the new value read from the memory: */
6301         value_read = value_read_verify;
6302         continue;
6303       }
6304 
6305       /* stop now: */
6306       break;
6307     }
6308   }
6309 
6310   /* return the value read: */
6311   return (value_read);
6312 }
6313 
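/* an illustrative sketch (guarded out): since the exchange returns the
   old contents of the memory, it can serve as a simple test-and-set.
   the flag names below are hypothetical: */
#if 0
static int
_tme_example_flag_try_acquire(tme_shared tme_uint64_t *flag,
                              tme_rwlock_t *flag_rwlock)
{
  /* swap in a one; the acquire succeeded only if a zero came out: */
  return (tme_memory_atomic_xchg64(flag, 1, flag_rwlock, sizeof(tme_uint64_t)) == 0);
}
#endif
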
6314 /* undefine any macro version of tme_memory_atomic_cx64: */
6315 #undef tme_memory_atomic_cx64
6316 
6317 /* the 64-bit atomic cx function: */
6318 tme_uint64_t
6319 tme_memory_atomic_cx64(tme_shared tme_uint64_t *memory,
6320                         tme_uint64_t value_cmp,
6321                         tme_uint64_t value_written,
6322                         tme_rwlock_t *rwlock,
6323                         unsigned int align_min)
6324 {
6325   tme_uint64_t value_read;
6326 
6327   /* if we can't make direct accesses at all, all atomic
6328      accesses must be done under lock.  (when threads are
6329      cooperative the actual locking isn't needed): */
6330   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
6331     if (!TME_THREADS_COOPERATIVE) {
6332       tme_rwlock_wrlock(rwlock);
6333     }
6334     value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
6335     if (value_read == value_cmp) {
6336       tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
6337     }
6338     if (!TME_THREADS_COOPERATIVE) {
6339       tme_rwlock_unlock(rwlock);
6340     }
6341   }
6342 
6343   /* otherwise, threads are not cooperative and this host CPU
6344      can make atomic accesses to at least the most common memory
6345      size.
6346 
6347      in that case, the only reason this function should get
6348      called is if the host CPU can't do an atomic 64-bit
6349      cx at all, or if it can't do it at this alignment.
6350 
6351      we assume that these problematic atomic cxs are rare,
6352      and to emulate them we simply stop all other threads while
6353      doing the cx: */
6354   else {
6355     tme_thread_suspend_others();
6356     value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
6357     if (value_read == value_cmp) {
6358       tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
6359     }
6360     tme_thread_resume_others();
6361   }
6362 
6363   /* return the value read: */
6364   return (value_read);
6365 }
6366 
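/* an illustrative sketch (guarded out): this compare-and-exchange is
   the building block behind the read-modify-write emulations above,
   and any other update can be spun the same way.  the hypothetical
   helper below keeps a running maximum in a shared 64-bit word: */
#if 0
static tme_uint64_t
_tme_example_max64(tme_shared tme_uint64_t *memory,
                   tme_uint64_t candidate,
                   tme_rwlock_t *rwlock,
                   unsigned int align_min)
{
  tme_uint64_t value_read;
  tme_uint64_t value_read_verify;

  /* do an atomic read of the memory: */
  value_read = tme_memory_atomic_read64(memory, rwlock, align_min);

  /* spin until the memory holds at least the candidate: */
  for (;;) {

    /* stop if the current value is already big enough: */
    if (value_read >= candidate) {
      break;
    }

    /* try the compare-and-exchange; it succeeded if the value it
       returns is the value we compared against: */
    value_read_verify = tme_memory_atomic_cx64(memory, value_read, candidate, rwlock, align_min);
    if (value_read_verify == value_read) {
      break;
    }

    /* loop with the new value read from the memory: */
    value_read = value_read_verify;
  }

  /* return the last value read: */
  return (value_read);
}
#endif
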
6367 /* undefine any macro version of tme_memory_atomic_read64: */
6368 #undef tme_memory_atomic_read64
6369 
6370 /* the 64-bit atomic read function: */
6371 tme_uint64_t
6372 tme_memory_atomic_read64(_tme_const tme_shared tme_uint64_t *memory,
6373                         tme_rwlock_t *rwlock,
6374                         unsigned int align_min)
6375 {
6376   tme_uint64_t value_read;
6377 
6378   /* if we can't make direct accesses at all, all atomic
6379      accesses must be done under lock.  (when threads are
6380      cooperative the actual locking isn't needed): */
6381   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
6382     if (!TME_THREADS_COOPERATIVE) {
6383       tme_rwlock_rdlock(rwlock);
6384     }
6385     value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
6386     if (!TME_THREADS_COOPERATIVE) {
6387       tme_rwlock_unlock(rwlock);
6388     }
6389   }
6390 
6391   /* otherwise, threads are not cooperative and this host CPU
6392      can make atomic accesses to at least the most common memory
6393      size.
6394 
6395      in that case, the only reason this function should get
6396      called is if the host CPU can't do an atomic 64-bit
6397      read at all, or if it can't do it at this alignment.
6398 
6399      we assume that these problematic atomic reads are rare,
6400      and to emulate them we simply stop all other threads while
6401      doing the read: */
6402   else {
6403     tme_thread_suspend_others();
6404     value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
6405     tme_thread_resume_others();
6406   }
6407 
6408   /* return the value read: */
6409   return (value_read);
6410 }
6411 
6412 /* undefine any macro version of tme_memory_atomic_write64: */
6413 #undef tme_memory_atomic_write64
6414 
6415 /* the 64-bit atomic write function: */
6416 void
6417 tme_memory_atomic_write64(tme_shared tme_uint64_t *memory,
6418                         tme_uint64_t value_written,
6419                         tme_rwlock_t *rwlock,
6420                         unsigned int align_min)
6421 {
6422 
6423   /* if we can't make direct accesses at all, all atomic
6424      accesses must be done under lock.  (when threads are
6425      cooperative the actual locking isn't needed): */
6426   if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
6427     if (!TME_THREADS_COOPERATIVE) {
6428       tme_rwlock_wrlock(rwlock);
6429     }
6430     tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
6431     if (!TME_THREADS_COOPERATIVE) {
6432       tme_rwlock_unlock(rwlock);
6433     }
6434   }
6435 
6436   /* otherwise, threads are not cooperative and this host CPU
6437      can make atomic accesses to at least the most common memory
6438      size.
6439 
6440      in that case, the only reason this function should get
6441      called is if the host CPU can't do an atomic 64-bit
6442      write at all, or if it can't do it at this alignment.
6443 
6444      we assume that these problematic atomic writes are rare,
6445      and to emulate them we simply stop all other threads while
6446      doing the write: */
6447   else {
6448     tme_thread_suspend_others();
6449     tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
6450     tme_thread_resume_others();
6451   }
6452 }
6453 
6454 #endif /* TME_HAVE_INT64_T */
6455