xref: /qemu/accel/tcg/ldst_atomicity.c.inc (revision 333c813b)
1/*
2 * Routines common to user and system emulation of load/store.
3 *
4 *  Copyright (c) 2022 Linaro, Ltd.
5 *
6 * SPDX-License-Identifier: GPL-2.0-or-later
7 *
8 * This work is licensed under the terms of the GNU GPL, version 2 or later.
9 * See the COPYING file in the top-level directory.
10 */
11
12#ifdef CONFIG_ATOMIC64
13# define HAVE_al8          true
14#else
15# define HAVE_al8          false
16#endif
17#define HAVE_al8_fast      (ATOMIC_REG_SIZE >= 8)
18
19#if defined(CONFIG_ATOMIC128)
20# define HAVE_al16_fast    true
21#else
22# define HAVE_al16_fast    false
23#endif
24#if defined(CONFIG_ATOMIC128) || defined(CONFIG_CMPXCHG128)
25# define HAVE_al16         true
26#else
27# define HAVE_al16         false
28#endif
29
30
31/**
32 * required_atomicity:
33 *
34 * Return the lg2 bytes of atomicity required by @memop for @p.
35 * If the operation must be split into two operations to be
36 * examined separately for atomicity, return -lg2.
37 */
38static int required_atomicity(CPUArchState *env, uintptr_t p, MemOp memop)
39{
40    MemOp atom = memop & MO_ATOM_MASK;
41    MemOp size = memop & MO_SIZE;
42    MemOp half = size ? size - 1 : 0;
43    unsigned tmp;
44    int atmax;
45
46    switch (atom) {
47    case MO_ATOM_NONE:
48        atmax = MO_8;
49        break;
50
51    case MO_ATOM_IFALIGN_PAIR:
52        size = half;
53        /* fall through */
54
55    case MO_ATOM_IFALIGN:
56        tmp = (1 << size) - 1;
57        atmax = p & tmp ? MO_8 : size;
58        break;
59
60    case MO_ATOM_WITHIN16:
61        tmp = p & 15;
62        atmax = (tmp + (1 << size) <= 16 ? size : MO_8);
63        break;
64
65    case MO_ATOM_WITHIN16_PAIR:
66        tmp = p & 15;
67        if (tmp + (1 << size) <= 16) {
68            atmax = size;
69        } else if (tmp + (1 << half) == 16) {
70            /*
71             * The pair exactly straddles the boundary.
72             * Both halves are naturally aligned and atomic.
73             */
74            atmax = half;
75        } else {
76            /*
77             * One of the pair crosses the boundary, and is non-atomic.
78             * The other of the pair does not cross, and is atomic.
79             */
80            atmax = -half;
81        }
82        break;
83
84    case MO_ATOM_SUBALIGN:
85        /*
86         * Examine the alignment of p to determine if there are subobjects
87         * that must be aligned.  Note that we only really need ctz4() --
88         * any more significant bits are discarded by the immediately
89         * following comparison.
90         */
91        tmp = ctz32(p);
92        atmax = MIN(size, tmp);
93        break;
94
95    default:
96        g_assert_not_reached();
97    }
98
99    /*
100     * Here we have the architectural atomicity of the operation.
101     * However, when executing in a serial context, we need no extra
102     * host atomicity in order to avoid racing.  This reduction
103     * avoids looping with cpu_loop_exit_atomic.
104     */
105    if (cpu_in_serial_context(env_cpu(env))) {
106        return MO_8;
107    }
108    return atmax;
109}
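
/*
 * Illustrative example (hypothetical offsets, MO_ATOM_WITHIN16_PAIR case
 * above): consider a 16-byte access, so size == MO_128 and half == MO_64.
 *   p % 16 == 0:  0 + 16 <= 16, so atmax = MO_128 (whole access atomic);
 *   p % 16 == 8:  8 + 8 == 16, so atmax = MO_64 (each 8-byte half is
 *                 naturally aligned and must be individually atomic);
 *   p % 16 == 4:  neither condition holds, so atmax = -MO_64 (one half
 *                 crosses the 16-byte line and is non-atomic; the caller
 *                 must examine the two halves separately).
 */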
110
111/**
112 * load_atomic2:
113 * @pv: host address
114 *
115 * Atomically load 2 aligned bytes from @pv.
116 */
117static inline uint16_t load_atomic2(void *pv)
118{
119    uint16_t *p = __builtin_assume_aligned(pv, 2);
120    return qatomic_read(p);
121}
122
123/**
124 * load_atomic4:
125 * @pv: host address
126 *
127 * Atomically load 4 aligned bytes from @pv.
128 */
129static inline uint32_t load_atomic4(void *pv)
130{
131    uint32_t *p = __builtin_assume_aligned(pv, 4);
132    return qatomic_read(p);
133}
134
135/**
136 * load_atomic8:
137 * @pv: host address
138 *
139 * Atomically load 8 aligned bytes from @pv.
140 */
141static inline uint64_t load_atomic8(void *pv)
142{
143    uint64_t *p = __builtin_assume_aligned(pv, 8);
144
145    qemu_build_assert(HAVE_al8);
146    return qatomic_read__nocheck(p);
147}
148
149/**
150 * load_atomic16:
151 * @pv: host address
152 *
153 * Atomically load 16 aligned bytes from @pv.
154 */
155static inline Int128 ATTRIBUTE_ATOMIC128_OPT
156load_atomic16(void *pv)
157{
158#ifdef CONFIG_ATOMIC128
159    __uint128_t *p = __builtin_assume_aligned(pv, 16);
160    Int128Alias r;
161
162    r.u = qatomic_read__nocheck(p);
163    return r.s;
164#else
165    qemu_build_not_reached();
166#endif
167}
168
169/**
170 * load_atomic8_or_exit:
171 * @env: cpu context
172 * @ra: host unwind address
173 * @pv: host address
174 *
175 * Atomically load 8 aligned bytes from @pv.
176 * If this is not possible, longjmp out to restart serially.
177 */
178static uint64_t load_atomic8_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
179{
180    if (HAVE_al8) {
181        return load_atomic8(pv);
182    }
183
184#ifdef CONFIG_USER_ONLY
185    /*
186     * If the page is not writable, then assume the value is immutable
187     * and requires no locking.  This ignores the case of MAP_SHARED with
188     * another process, because the fallback start_exclusive solution
189     * provides no protection across processes.
190     */
191    if (!page_check_range(h2g(pv), 8, PAGE_WRITE)) {
192        uint64_t *p = __builtin_assume_aligned(pv, 8);
193        return *p;
194    }
195#endif
196
197    /* Ultimate fallback: re-execute in serial context. */
198    cpu_loop_exit_atomic(env_cpu(env), ra);
199}
200
201/**
202 * load_atomic16_or_exit:
203 * @env: cpu context
204 * @ra: host unwind address
205 * @pv: host address
206 *
207 * Atomically load 16 aligned bytes from @pv.
208 * If this is not possible, longjmp out to restart serially.
209 */
210static Int128 load_atomic16_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
211{
212    Int128 *p = __builtin_assume_aligned(pv, 16);
213
214    if (HAVE_al16_fast) {
215        return load_atomic16(p);
216    }
217
218#ifdef CONFIG_USER_ONLY
219    /*
220     * We can only use cmpxchg to emulate a load if the page is writable.
221     * If the page is not writable, then assume the value is immutable
222     * and requires no locking.  This ignores the case of MAP_SHARED with
223     * another process, because the fallback start_exclusive solution
224     * provides no protection across processes.
225     */
226    if (!page_check_range(h2g(p), 16, PAGE_WRITE)) {
227        return *p;
228    }
229#endif
230
231    /*
232     * In system mode all guest pages are writable, and for user-only
233     * we have just checked writability.  Try cmpxchg.
234     */
235#if defined(CONFIG_CMPXCHG128)
236    /* Swap 0 with 0, with the side-effect of returning the old value. */
237    {
238        Int128Alias r;
239        r.u = __sync_val_compare_and_swap_16((__uint128_t *)p, 0, 0);
240        return r.s;
241    }
242#endif
243
244    /* Ultimate fallback: re-execute in serial context. */
245    cpu_loop_exit_atomic(env_cpu(env), ra);
246}
247
248/**
249 * load_atom_extract_al4x2:
250 * @pv: host address
251 *
252 * Load 4 bytes from @p, from two sequential atomic 4-byte loads.
253 */
254static uint32_t load_atom_extract_al4x2(void *pv)
255{
256    uintptr_t pi = (uintptr_t)pv;
257    int sh = (pi & 3) * 8;
258    uint32_t a, b;
259
260    pv = (void *)(pi & ~3);
261    a = load_atomic4(pv);
262    b = load_atomic4(pv + 4);
263
264    if (HOST_BIG_ENDIAN) {
265        return (a << sh) | (b >> (-sh & 31));
266    } else {
267        return (a >> sh) | (b << (-sh & 31));
268    }
269}
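
/*
 * Illustrative example (hypothetical offset): on a little-endian host with
 * pi % 4 == 1, sh == 8, so the result is (a >> 8) | (b << 24): the three
 * high-addressed bytes of the first aligned word plus the lowest-addressed
 * byte of the second, i.e. exactly the four bytes starting at @pv, each
 * obtained from an atomic aligned 4-byte load.
 */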
270
271/**
272 * load_atom_extract_al8x2:
273 * @pv: host address
274 *
275 * Load 8 bytes from @p, from two sequential atomic 8-byte loads.
276 */
277static uint64_t load_atom_extract_al8x2(void *pv)
278{
279    uintptr_t pi = (uintptr_t)pv;
280    int sh = (pi & 7) * 8;
281    uint64_t a, b;
282
283    pv = (void *)(pi & ~7);
284    a = load_atomic8(pv);
285    b = load_atomic8(pv + 8);
286
287    if (HOST_BIG_ENDIAN) {
288        return (a << sh) | (b >> (-sh & 63));
289    } else {
290        return (a >> sh) | (b << (-sh & 63));
291    }
292}
293
294/**
295 * load_atom_extract_al8_or_exit:
296 * @env: cpu context
297 * @ra: host unwind address
298 * @pv: host address
299 * @s: object size in bytes, @s <= 4.
300 *
301 * Atomically load @s bytes from @p, when p % s != 0, and [p, p+s-1] does
302 * not cross an 8-byte boundary.  This means that we can perform an atomic
303 * 8-byte load and extract.
304 * The value is returned in the low bits of a uint32_t.
305 */
306static uint32_t load_atom_extract_al8_or_exit(CPUArchState *env, uintptr_t ra,
307                                              void *pv, int s)
308{
309    uintptr_t pi = (uintptr_t)pv;
310    int o = pi & 7;
311    int shr = (HOST_BIG_ENDIAN ? 8 - s - o : o) * 8;
312
313    pv = (void *)(pi & ~7);
314    return load_atomic8_or_exit(env, ra, pv) >> shr;
315}
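
/*
 * Illustrative example (hypothetical offset): s == 2 with pi % 8 == 1.
 * We align down by one byte, perform one atomic 8-byte load, and shift
 * right by 8 bits (little-endian host) or 40 bits (big-endian host) so
 * that the two requested bytes land in the low 16 bits in host order.
 */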
316
317/**
318 * load_atom_extract_al16_or_exit:
319 * @env: cpu context
320 * @ra: host unwind address
321 * @p: host address
322 * @s: object size in bytes, @s <= 8.
323 *
324 * Atomically load @s bytes from @p, when p % 16 < 8
325 * and p % 16 + s > 8.  I.e. does not cross a 16-byte
326 * boundary, but *does* cross an 8-byte boundary.
327 * This is the slow version, so we must have eliminated
328 * any faster load_atom_extract_al8_or_exit case.
329 *
330 * If this is not possible, longjmp out to restart serially.
331 */
332static uint64_t load_atom_extract_al16_or_exit(CPUArchState *env, uintptr_t ra,
333                                               void *pv, int s)
334{
335    uintptr_t pi = (uintptr_t)pv;
336    int o = pi & 7;
337    int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
338    Int128 r;
339
340    /*
341     * Note constraints above: p & 8 must be clear.
342     * Provoke SIGBUS if possible otherwise.
343     */
344    pv = (void *)(pi & ~7);
345    r = load_atomic16_or_exit(env, ra, pv);
346
347    r = int128_urshift(r, shr);
348    return int128_getlo(r);
349}
350
351/**
352 * load_atom_extract_al16_or_al8:
353 * @p: host address
354 * @s: object size in bytes, @s <= 8.
355 *
356 * Load @s bytes from @p, when p % s != 0.  If [p, p+s-1] does not
357 * cross a 16-byte boundary then the access must be 16-byte atomic,
358 * otherwise the access must be 8-byte atomic.
359 */
360static inline uint64_t ATTRIBUTE_ATOMIC128_OPT
361load_atom_extract_al16_or_al8(void *pv, int s)
362{
363#if defined(CONFIG_ATOMIC128)
364    uintptr_t pi = (uintptr_t)pv;
365    int o = pi & 7;
366    int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
367    __uint128_t r;
368
369    pv = (void *)(pi & ~7);
370    if (pi & 8) {
371        uint64_t *p8 = __builtin_assume_aligned(pv, 16, 8);
372        uint64_t a = qatomic_read__nocheck(p8);
373        uint64_t b = qatomic_read__nocheck(p8 + 1);
374
375        if (HOST_BIG_ENDIAN) {
376            r = ((__uint128_t)a << 64) | b;
377        } else {
378            r = ((__uint128_t)b << 64) | a;
379        }
380    } else {
381        __uint128_t *p16 = __builtin_assume_aligned(pv, 16, 0);
382        r = qatomic_read__nocheck(p16);
383    }
384    return r >> shr;
385#else
386    qemu_build_not_reached();
387#endif
388}
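
/*
 * Illustrative example (hypothetical offsets), little-endian host, s == 8:
 *   pi % 16 == 5:  bit 3 of pi is clear, so a single atomic 16-byte load
 *                  at pi - 5 covers the request; shr == 40 extracts the
 *                  eight bytes starting at @pv.
 *   pi % 16 == 13: bit 3 of pi is set, so the request crosses the 16-byte
 *                  line; two atomic 8-byte loads at pi - 5 and pi + 3 are
 *                  combined before the same extraction, giving 8-byte
 *                  atomicity per half, which is all that is required here.
 */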
389
390/**
391 * load_atom_4_by_2:
392 * @pv: host address
393 *
394 * Load 4 bytes from @pv, with two 2-byte atomic loads.
395 */
396static inline uint32_t load_atom_4_by_2(void *pv)
397{
398    uint32_t a = load_atomic2(pv);
399    uint32_t b = load_atomic2(pv + 2);
400
401    if (HOST_BIG_ENDIAN) {
402        return (a << 16) | b;
403    } else {
404        return (b << 16) | a;
405    }
406}
407
408/**
409 * load_atom_8_by_2:
410 * @pv: host address
411 *
412 * Load 8 bytes from @pv, with four 2-byte atomic loads.
413 */
414static inline uint64_t load_atom_8_by_2(void *pv)
415{
416    uint32_t a = load_atom_4_by_2(pv);
417    uint32_t b = load_atom_4_by_2(pv + 4);
418
419    if (HOST_BIG_ENDIAN) {
420        return ((uint64_t)a << 32) | b;
421    } else {
422        return ((uint64_t)b << 32) | a;
423    }
424}
425
426/**
427 * load_atom_8_by_4:
428 * @pv: host address
429 *
430 * Load 8 bytes from @pv, with two 4-byte atomic loads.
431 */
432static inline uint64_t load_atom_8_by_4(void *pv)
433{
434    uint32_t a = load_atomic4(pv);
435    uint32_t b = load_atomic4(pv + 4);
436
437    if (HOST_BIG_ENDIAN) {
438        return ((uint64_t)a << 32) | b;
439    } else {
440        return ((uint64_t)b << 32) | a;
441    }
442}
443
444/**
445 * load_atom_8_by_8_or_4:
446 * @pv: host address
447 *
448 * Load 8 bytes from aligned @pv, with at least 4-byte atomicity.
449 */
450static inline uint64_t load_atom_8_by_8_or_4(void *pv)
451{
452    if (HAVE_al8_fast) {
453        return load_atomic8(pv);
454    } else {
455        return load_atom_8_by_4(pv);
456    }
457}
458
459/**
460 * load_atom_2:
461 * @p: host address
462 * @memop: the full memory op
463 *
464 * Load 2 bytes from @p, honoring the atomicity of @memop.
465 */
466static uint16_t load_atom_2(CPUArchState *env, uintptr_t ra,
467                            void *pv, MemOp memop)
468{
469    uintptr_t pi = (uintptr_t)pv;
470    int atmax;
471
472    if (likely((pi & 1) == 0)) {
473        return load_atomic2(pv);
474    }
475    if (HAVE_al16_fast) {
476        return load_atom_extract_al16_or_al8(pv, 2);
477    }
478
479    atmax = required_atomicity(env, pi, memop);
480    switch (atmax) {
481    case MO_8:
482        return lduw_he_p(pv);
483    case MO_16:
484        /* The only case remaining is MO_ATOM_WITHIN16. */
485        if (!HAVE_al8_fast && (pi & 3) == 1) {
486            /* Big or little endian, we want the middle two bytes. */
487            return load_atomic4(pv - 1) >> 8;
488        }
489        if ((pi & 15) != 7) {
490            return load_atom_extract_al8_or_exit(env, ra, pv, 2);
491        }
492        return load_atom_extract_al16_or_exit(env, ra, pv, 2);
493    default:
494        g_assert_not_reached();
495    }
496}
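
/*
 * Illustrative dispatch (hypothetical offsets) for MO_ATOM_WITHIN16 when
 * atmax == MO_16 above, on a host without HAVE_al16_fast:
 *   p % 16 == 1, !HAVE_al8_fast: one atomic 4-byte load at p - 1 covers
 *                both bytes; shifting right by 8 yields the middle two.
 *   p % 16 == 3: the two bytes lie within one aligned 8-byte word, so
 *                load_atom_extract_al8_or_exit suffices.
 *   p % 16 == 7: the two bytes straddle the 8-byte line but not the
 *                16-byte line, so load_atom_extract_al16_or_exit is used.
 */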
497
498/**
499 * load_atom_4:
500 * @p: host address
501 * @memop: the full memory op
502 *
503 * Load 4 bytes from @p, honoring the atomicity of @memop.
504 */
505static uint32_t load_atom_4(CPUArchState *env, uintptr_t ra,
506                            void *pv, MemOp memop)
507{
508    uintptr_t pi = (uintptr_t)pv;
509    int atmax;
510
511    if (likely((pi & 3) == 0)) {
512        return load_atomic4(pv);
513    }
514    if (HAVE_al16_fast) {
515        return load_atom_extract_al16_or_al8(pv, 4);
516    }
517
518    atmax = required_atomicity(env, pi, memop);
519    switch (atmax) {
520    case MO_8:
521    case MO_16:
522    case -MO_16:
523        /*
524         * For MO_ATOM_IFALIGN, this is more atomicity than required,
525         * but it's trivially supported on all hosts, better than 4
526         * individual byte loads (when the host requires alignment),
527         * and overlaps with the MO_ATOM_SUBALIGN case of p % 2 == 0.
528         */
529        return load_atom_extract_al4x2(pv);
530    case MO_32:
531        if (!(pi & 4)) {
532            return load_atom_extract_al8_or_exit(env, ra, pv, 4);
533        }
534        return load_atom_extract_al16_or_exit(env, ra, pv, 4);
535    default:
536        g_assert_not_reached();
537    }
538}
539
540/**
541 * load_atom_8:
542 * @p: host address
543 * @memop: the full memory op
544 *
545 * Load 8 bytes from @p, honoring the atomicity of @memop.
546 */
547static uint64_t load_atom_8(CPUArchState *env, uintptr_t ra,
548                            void *pv, MemOp memop)
549{
550    uintptr_t pi = (uintptr_t)pv;
551    int atmax;
552
553    /*
554     * If the host does not support 8-byte atomics, wait until we have
555     * examined the atomicity parameters below.
556     */
557    if (HAVE_al8 && likely((pi & 7) == 0)) {
558        return load_atomic8(pv);
559    }
560    if (HAVE_al16_fast) {
561        return load_atom_extract_al16_or_al8(pv, 8);
562    }
563
564    atmax = required_atomicity(env, pi, memop);
565    if (atmax == MO_64) {
566        if (!HAVE_al8 && (pi & 7) == 0) {
567            return load_atomic8_or_exit(env, ra, pv);
568        }
569        return load_atom_extract_al16_or_exit(env, ra, pv, 8);
570    }
571    if (HAVE_al8_fast) {
572        return load_atom_extract_al8x2(pv);
573    }
574    switch (atmax) {
575    case MO_8:
576        return ldq_he_p(pv);
577    case MO_16:
578        return load_atom_8_by_2(pv);
579    case MO_32:
580        return load_atom_8_by_4(pv);
581    case -MO_32:
582        if (HAVE_al8) {
583            return load_atom_extract_al8x2(pv);
584        }
585        cpu_loop_exit_atomic(env_cpu(env), ra);
586    default:
587        g_assert_not_reached();
588    }
589}
590
591/**
592 * load_atom_16:
593 * @p: host address
594 * @memop: the full memory op
595 *
596 * Load 16 bytes from @p, honoring the atomicity of @memop.
597 */
598static Int128 load_atom_16(CPUArchState *env, uintptr_t ra,
599                           void *pv, MemOp memop)
600{
601    uintptr_t pi = (uintptr_t)pv;
602    int atmax;
603    Int128 r;
604    uint64_t a, b;
605
606    /*
607     * If the host does not support 16-byte atomics, wait until we have
608     * examined the atomicity parameters below.
609     */
610    if (HAVE_al16_fast && likely((pi & 15) == 0)) {
611        return load_atomic16(pv);
612    }
613
614    atmax = required_atomicity(env, pi, memop);
615    switch (atmax) {
616    case MO_8:
617        memcpy(&r, pv, 16);
618        return r;
619    case MO_16:
620        a = load_atom_8_by_2(pv);
621        b = load_atom_8_by_2(pv + 8);
622        break;
623    case MO_32:
624        a = load_atom_8_by_4(pv);
625        b = load_atom_8_by_4(pv + 8);
626        break;
627    case MO_64:
628        if (!HAVE_al8) {
629            cpu_loop_exit_atomic(env_cpu(env), ra);
630        }
631        a = load_atomic8(pv);
632        b = load_atomic8(pv + 8);
633        break;
634    case -MO_64:
635        if (!HAVE_al8) {
636            cpu_loop_exit_atomic(env_cpu(env), ra);
637        }
638        a = load_atom_extract_al8x2(pv);
639        b = load_atom_extract_al8x2(pv + 8);
640        break;
641    case MO_128:
642        return load_atomic16_or_exit(env, ra, pv);
643    default:
644        g_assert_not_reached();
645    }
646    return int128_make128(HOST_BIG_ENDIAN ? b : a, HOST_BIG_ENDIAN ? a : b);
647}
648
649/**
650 * store_atomic2:
651 * @pv: host address
652 * @val: value to store
653 *
654 * Atomically store 2 aligned bytes to @pv.
655 */
656static inline void store_atomic2(void *pv, uint16_t val)
657{
658    uint16_t *p = __builtin_assume_aligned(pv, 2);
659    qatomic_set(p, val);
660}
661
662/**
663 * store_atomic4:
664 * @pv: host address
665 * @val: value to store
666 *
667 * Atomically store 4 aligned bytes to @pv.
668 */
669static inline void store_atomic4(void *pv, uint32_t val)
670{
671    uint32_t *p = __builtin_assume_aligned(pv, 4);
672    qatomic_set(p, val);
673}
674
675/**
676 * store_atomic8:
677 * @pv: host address
678 * @val: value to store
679 *
680 * Atomically store 8 aligned bytes to @pv.
681 */
682static inline void store_atomic8(void *pv, uint64_t val)
683{
684    uint64_t *p = __builtin_assume_aligned(pv, 8);
685
686    qemu_build_assert(HAVE_al8);
687    qatomic_set__nocheck(p, val);
688}
689
690/**
691 * store_atomic16:
692 * @pv: host address
693 * @val: value to store
694 *
695 * Atomically store 16 aligned bytes to @pv.
696 */
697static inline void ATTRIBUTE_ATOMIC128_OPT
698store_atomic16(void *pv, Int128Alias val)
699{
700#if defined(CONFIG_ATOMIC128)
701    __uint128_t *pu = __builtin_assume_aligned(pv, 16);
702    qatomic_set__nocheck(pu, val.u);
703#elif defined(CONFIG_CMPXCHG128)
704    __uint128_t *pu = __builtin_assume_aligned(pv, 16);
705    __uint128_t o;
706
707    /*
708     * Without CONFIG_ATOMIC128, __atomic_compare_exchange_n will always
709     * defer to libatomic, so we must use __sync_*_compare_and_swap_16
710     * and accept the sequential consistency that comes with it.
711     */
712    do {
713        o = *pu;
714    } while (!__sync_bool_compare_and_swap_16(pu, o, val.u));
715#else
716    qemu_build_not_reached();
717#endif
718}
719
720/**
721 * store_atom_4_by_2
722 */
723static inline void store_atom_4_by_2(void *pv, uint32_t val)
724{
725    store_atomic2(pv, val >> (HOST_BIG_ENDIAN ? 16 : 0));
726    store_atomic2(pv + 2, val >> (HOST_BIG_ENDIAN ? 0 : 16));
727}
728
729/**
730 * store_atom_8_by_2
731 */
732static inline void store_atom_8_by_2(void *pv, uint64_t val)
733{
734    store_atom_4_by_2(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0));
735    store_atom_4_by_2(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32));
736}
737
738/**
739 * store_atom_8_by_4
740 */
741static inline void store_atom_8_by_4(void *pv, uint64_t val)
742{
743    store_atomic4(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0));
744    store_atomic4(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32));
745}
746
747/**
748 * store_atom_insert_al4:
749 * @p: host address
750 * @val: shifted value to store
751 * @msk: mask for value to store
752 *
753 * Atomically store @val to @p, masked by @msk.
754 */
755static void store_atom_insert_al4(uint32_t *p, uint32_t val, uint32_t msk)
756{
757    uint32_t old, new;
758
759    p = __builtin_assume_aligned(p, 4);
760    old = qatomic_read(p);
761    do {
762        new = (old & ~msk) | val;
763    } while (!__atomic_compare_exchange_n(p, &old, new, true,
764                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
765}
766
767/**
768 * store_atom_insert_al8:
769 * @p: host address
770 * @val: shifted value to store
771 * @msk: mask for value to store
772 *
773 * Atomically store @val to @p masked by @msk.
774 */
775static void store_atom_insert_al8(uint64_t *p, uint64_t val, uint64_t msk)
776{
777    uint64_t old, new;
778
779    qemu_build_assert(HAVE_al8);
780    p = __builtin_assume_aligned(p, 8);
781    old = qatomic_read__nocheck(p);
782    do {
783        new = (old & ~msk) | val;
784    } while (!__atomic_compare_exchange_n(p, &old, new, true,
785                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
786}
787
788/**
789 * store_atom_insert_al16:
790 * @p: host address
791 * @val: shifted value to store
792 * @msk: mask for value to store
793 *
794 * Atomically store @val to @p masked by @msk.
795 */
796static void ATTRIBUTE_ATOMIC128_OPT
797store_atom_insert_al16(Int128 *ps, Int128Alias val, Int128Alias msk)
798{
799#if defined(CONFIG_ATOMIC128)
800    __uint128_t *pu, old, new;
801
802    /* With CONFIG_ATOMIC128, we can avoid the memory barriers. */
803    pu = __builtin_assume_aligned(ps, 16);
804    old = *pu;
805    do {
806        new = (old & ~msk.u) | val.u;
807    } while (!__atomic_compare_exchange_n(pu, &old, new, true,
808                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
809#elif defined(CONFIG_CMPXCHG128)
810    __uint128_t *pu, old, new;
811
812    /*
813     * Without CONFIG_ATOMIC128, __atomic_compare_exchange_n will always
814     * defer to libatomic, so we must use __sync_*_compare_and_swap_16
815     * and accept the sequential consistency that comes with it.
816     */
817    pu = __builtin_assume_aligned(ps, 16);
818    do {
819        old = *pu;
820        new = (old & ~msk.u) | val.u;
821    } while (!__sync_bool_compare_and_swap_16(pu, old, new));
822#else
823    qemu_build_not_reached();
824#endif
825}
826
827/**
828 * store_bytes_leN:
829 * @pv: host address
830 * @size: number of bytes to store
831 * @val_le: data to store
832 *
833 * Store @size bytes at @p.  The bytes to store are extracted in little-endian order
834 * from @val_le; return the bytes of @val_le beyond @size that have not been stored.
835 */
836static uint64_t store_bytes_leN(void *pv, int size, uint64_t val_le)
837{
838    uint8_t *p = pv;
839    for (int i = 0; i < size; i++, val_le >>= 8) {
840        p[i] = val_le;
841    }
842    return val_le;
843}
844
845/**
846 * store_parts_leN
847 * @pv: host address
848 * @size: number of bytes to store
849 * @val_le: data to store
850 *
851 * As store_bytes_leN, but atomically on each aligned part.
852 */
853G_GNUC_UNUSED
854static uint64_t store_parts_leN(void *pv, int size, uint64_t val_le)
855{
856    do {
857        int n;
858
859        /* Find minimum of alignment and size */
860        switch (((uintptr_t)pv | size) & 7) {
861        case 4:
862            store_atomic4(pv, le32_to_cpu(val_le));
863            val_le >>= 32;
864            n = 4;
865            break;
866        case 2:
867        case 6:
868            store_atomic2(pv, le16_to_cpu(val_le));
869            val_le >>= 16;
870            n = 2;
871            break;
872        default:
873            *(uint8_t *)pv = val_le;
874            val_le >>= 8;
875            n = 1;
876            break;
877        case 0:
878            g_assert_not_reached();
879        }
880        pv += n;
881        size -= n;
882    } while (size != 0);
883
884    return val_le;
885}
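
/*
 * Illustrative example (hypothetical offset): pv % 8 == 2 with size == 6.
 * The first iteration sees (2 | 6) & 7 == 6 and issues an atomic 2-byte
 * store; the second sees (4 | 4) & 7 == 4 and issues an atomic 4-byte
 * store, consuming @val_le in little-endian order throughout.
 */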
886
887/**
888 * store_whole_le4
889 * @pv: host address
890 * @size: number of bytes to store
891 * @val_le: data to store
892 *
893 * As store_bytes_leN, but atomically as a whole.
894 * Four aligned bytes are guaranteed to cover the store.
895 */
896static uint64_t store_whole_le4(void *pv, int size, uint64_t val_le)
897{
898    int sz = size * 8;
899    int o = (uintptr_t)pv & 3;
900    int sh = o * 8;
901    uint32_t m = MAKE_64BIT_MASK(0, sz);
902    uint32_t v;
903
904    if (HOST_BIG_ENDIAN) {
905        v = bswap32(val_le) >> sh;
906        m = bswap32(m) >> sh;
907    } else {
908        v = val_le << sh;
909        m <<= sh;
910    }
911    store_atom_insert_al4(pv - o, v, m);
912    return val_le >> sz;
913}
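
/*
 * Illustrative example (hypothetical offset): size == 3 at pv % 4 == 1,
 * as used by the -MO_16 case of store_atom_4 below.  Here sz == 24,
 * sh == 8 and m == 0xffffff, so on a little-endian host we insert
 * val_le << 8 under mask 0xffffff00 into the aligned word at pv - 1,
 * and return val_le >> 24, the one byte still to be stored.
 */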
914
915/**
916 * store_whole_le8
917 * @pv: host address
918 * @size: number of bytes to store
919 * @val_le: data to store
920 *
921 * As store_bytes_leN, but atomically as a whole.
922 * Eight aligned bytes are guaranteed to cover the store.
923 */
924static uint64_t store_whole_le8(void *pv, int size, uint64_t val_le)
925{
926    int sz = size * 8;
927    int o = (uintptr_t)pv & 7;
928    int sh = o * 8;
929    uint64_t m = MAKE_64BIT_MASK(0, sz);
930    uint64_t v;
931
932    qemu_build_assert(HAVE_al8);
933    if (HOST_BIG_ENDIAN) {
934        v = bswap64(val_le) >> sh;
935        m = bswap64(m) >> sh;
936    } else {
937        v = val_le << sh;
938        m <<= sh;
939    }
940    store_atom_insert_al8(pv - o, v, m);
941    return val_le >> sz;
942}
943
944/**
945 * store_whole_le16
946 * @pv: host address
947 * @size: number of bytes to store
948 * @val_le: data to store
949 *
950 * As store_bytes_leN, but atomically as a whole.
951 * 16 aligned bytes are guaranteed to cover the store.
952 */
953static uint64_t store_whole_le16(void *pv, int size, Int128 val_le)
954{
955    int sz = size * 8;
956    int o = (uintptr_t)pv & 15;
957    int sh = o * 8;
958    Int128 m, v;
959
960    qemu_build_assert(HAVE_al16);
961
962    /* Like MAKE_64BIT_MASK(0, sz), but larger. */
963    if (sz <= 64) {
964        m = int128_make64(MAKE_64BIT_MASK(0, sz));
965    } else {
966        m = int128_make128(-1, MAKE_64BIT_MASK(0, sz - 64));
967    }
968
969    if (HOST_BIG_ENDIAN) {
970        v = int128_urshift(bswap128(val_le), sh);
971        m = int128_urshift(bswap128(m), sh);
972    } else {
973        v = int128_lshift(val_le, sh);
974        m = int128_lshift(m, sh);
975    }
976    store_atom_insert_al16(pv - o, v, m);
977
978    /* Unused if sz <= 64. */
979    return int128_gethi(val_le) >> (sz - 64);
980}
981
982/**
983 * store_atom_2:
984 * @p: host address
985 * @val: the value to store
986 * @memop: the full memory op
987 *
988 * Store 2 bytes to @p, honoring the atomicity of @memop.
989 */
990static void store_atom_2(CPUArchState *env, uintptr_t ra,
991                         void *pv, MemOp memop, uint16_t val)
992{
993    uintptr_t pi = (uintptr_t)pv;
994    int atmax;
995
996    if (likely((pi & 1) == 0)) {
997        store_atomic2(pv, val);
998        return;
999    }
1000
1001    atmax = required_atomicity(env, pi, memop);
1002    if (atmax == MO_8) {
1003        stw_he_p(pv, val);
1004        return;
1005    }
1006
1007    /*
1008     * The only case remaining is MO_ATOM_WITHIN16.
1009     * Big or little endian, we want the middle two bytes in each test.
1010     */
1011    if ((pi & 3) == 1) {
1012        store_atom_insert_al4(pv - 1, (uint32_t)val << 8, MAKE_64BIT_MASK(8, 16));
1013        return;
1014    } else if ((pi & 7) == 3) {
1015        if (HAVE_al8) {
1016            store_atom_insert_al8(pv - 3, (uint64_t)val << 24, MAKE_64BIT_MASK(24, 16));
1017            return;
1018        }
1019    } else if ((pi & 15) == 7) {
1020        if (HAVE_al16) {
1021            Int128 v = int128_lshift(int128_make64(val), 56);
1022            Int128 m = int128_lshift(int128_make64(0xffff), 56);
1023            store_atom_insert_al16(pv - 7, v, m);
1024            return;
1025        }
1026    } else {
1027        g_assert_not_reached();
1028    }
1029
1030    cpu_loop_exit_atomic(env_cpu(env), ra);
1031}
1032
1033/**
1034 * store_atom_4:
1035 * @p: host address
1036 * @val: the value to store
1037 * @memop: the full memory op
1038 *
1039 * Store 4 bytes to @p, honoring the atomicity of @memop.
1040 */
1041static void store_atom_4(CPUArchState *env, uintptr_t ra,
1042                         void *pv, MemOp memop, uint32_t val)
1043{
1044    uintptr_t pi = (uintptr_t)pv;
1045    int atmax;
1046
1047    if (likely((pi & 3) == 0)) {
1048        store_atomic4(pv, val);
1049        return;
1050    }
1051
1052    atmax = required_atomicity(env, pi, memop);
1053    switch (atmax) {
1054    case MO_8:
1055        stl_he_p(pv, val);
1056        return;
1057    case MO_16:
1058        store_atom_4_by_2(pv, val);
1059        return;
1060    case -MO_16:
1061        {
1062            uint32_t val_le = cpu_to_le32(val);
1063            int s2 = pi & 3;
1064            int s1 = 4 - s2;
1065
1066            switch (s2) {
1067            case 1:
1068                val_le = store_whole_le4(pv, s1, val_le);
1069                *(uint8_t *)(pv + 3) = val_le;
1070                break;
1071            case 3:
1072                *(uint8_t *)pv = val_le;
1073                store_whole_le4(pv + 1, s2, val_le >> 8);
1074                break;
1075            case 0: /* aligned */
1076            case 2: /* atmax MO_16 */
1077            default:
1078                g_assert_not_reached();
1079            }
1080        }
1081        return;
1082    case MO_32:
1083        if ((pi & 7) < 4) {
1084            if (HAVE_al8) {
1085                store_whole_le8(pv, 4, cpu_to_le32(val));
1086                return;
1087            }
1088        } else {
1089            if (HAVE_al16) {
1090                store_whole_le16(pv, 4, int128_make64(cpu_to_le32(val)));
1091                return;
1092            }
1093        }
1094        cpu_loop_exit_atomic(env_cpu(env), ra);
1095    default:
1096        g_assert_not_reached();
1097    }
1098}
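
/*
 * Illustrative example for the -MO_16 case above (hypothetical offset):
 * a 4-byte store with MO_ATOM_WITHIN16_PAIR at p % 16 == 13.  The second
 * 2-byte half crosses the 16-byte line and need not be atomic, so s2 == 1
 * and s1 == 3: store_whole_le4 covers bytes 13..15 with one atomic insert
 * (which includes the whole first half), and the final byte at offset 16
 * is stored on its own.
 */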
1099
1100/**
1101 * store_atom_8:
1102 * @p: host address
1103 * @val: the value to store
1104 * @memop: the full memory op
1105 *
1106 * Store 8 bytes to @p, honoring the atomicity of @memop.
1107 */
1108static void store_atom_8(CPUArchState *env, uintptr_t ra,
1109                         void *pv, MemOp memop, uint64_t val)
1110{
1111    uintptr_t pi = (uintptr_t)pv;
1112    int atmax;
1113
1114    if (HAVE_al8 && likely((pi & 7) == 0)) {
1115        store_atomic8(pv, val);
1116        return;
1117    }
1118
1119    atmax = required_atomicity(env, pi, memop);
1120    switch (atmax) {
1121    case MO_8:
1122        stq_he_p(pv, val);
1123        return;
1124    case MO_16:
1125        store_atom_8_by_2(pv, val);
1126        return;
1127    case MO_32:
1128        store_atom_8_by_4(pv, val);
1129        return;
1130    case -MO_32:
1131        if (HAVE_al8) {
1132            uint64_t val_le = cpu_to_le64(val);
1133            int s2 = pi & 7;
1134            int s1 = 8 - s2;
1135
1136            switch (s2) {
1137            case 1 ... 3:
1138                val_le = store_whole_le8(pv, s1, val_le);
1139                store_bytes_leN(pv + s1, s2, val_le);
1140                break;
1141            case 5 ... 7:
1142                val_le = store_bytes_leN(pv, s1, val_le);
1143                store_whole_le8(pv + s1, s2, val_le);
1144                break;
1145            case 0: /* aligned */
1146            case 4: /* atmax MO_32 */
1147            default:
1148                g_assert_not_reached();
1149            }
1150            return;
1151        }
1152        break;
1153    case MO_64:
1154        if (HAVE_al16) {
1155            store_whole_le16(pv, 8, int128_make64(cpu_to_le64(val)));
1156            return;
1157        }
1158        break;
1159    default:
1160        g_assert_not_reached();
1161    }
1162    cpu_loop_exit_atomic(env_cpu(env), ra);
1163}
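
/*
 * Illustrative example for the -MO_32 case above (hypothetical offset):
 * an 8-byte store with MO_ATOM_WITHIN16_PAIR at p % 16 == 11, so s2 == 3
 * and s1 == 5.  store_whole_le8 inserts bytes 11..15 atomically into the
 * aligned 8-byte word at offset 8, which covers the first 4-byte half;
 * the remaining three bytes, belonging to the half that crosses the
 * 16-byte line, are then stored one at a time by store_bytes_leN.
 */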
1164
1165/**
1166 * store_atom_16:
1167 * @p: host address
1168 * @val: the value to store
1169 * @memop: the full memory op
1170 *
1171 * Store 16 bytes to @p, honoring the atomicity of @memop.
1172 */
1173static void store_atom_16(CPUArchState *env, uintptr_t ra,
1174                          void *pv, MemOp memop, Int128 val)
1175{
1176    uintptr_t pi = (uintptr_t)pv;
1177    uint64_t a, b;
1178    int atmax;
1179
1180    if (HAVE_al16_fast && likely((pi & 15) == 0)) {
1181        store_atomic16(pv, val);
1182        return;
1183    }
1184
1185    atmax = required_atomicity(env, pi, memop);
1186
1187    a = HOST_BIG_ENDIAN ? int128_gethi(val) : int128_getlo(val);
1188    b = HOST_BIG_ENDIAN ? int128_getlo(val) : int128_gethi(val);
1189    switch (atmax) {
1190    case MO_8:
1191        memcpy(pv, &val, 16);
1192        return;
1193    case MO_16:
1194        store_atom_8_by_2(pv, a);
1195        store_atom_8_by_2(pv + 8, b);
1196        return;
1197    case MO_32:
1198        store_atom_8_by_4(pv, a);
1199        store_atom_8_by_4(pv + 8, b);
1200        return;
1201    case MO_64:
1202        if (HAVE_al8) {
1203            store_atomic8(pv, a);
1204            store_atomic8(pv + 8, b);
1205            return;
1206        }
1207        break;
1208    case -MO_64:
1209        if (HAVE_al16) {
1210            uint64_t val_le;
1211            int s2 = pi & 15;
1212            int s1 = 16 - s2;
1213
1214            if (HOST_BIG_ENDIAN) {
1215                val = bswap128(val);
1216            }
1217            switch (s2) {
1218            case 1 ... 7:
1219                val_le = store_whole_le16(pv, s1, val);
1220                store_bytes_leN(pv + s1, s2, val_le);
1221                break;
1222            case 9 ... 15:
1223                store_bytes_leN(pv, s1, int128_getlo(val));
1224                val = int128_urshift(val, s1 * 8);
1225                store_whole_le16(pv + s1, s2, val);
1226                break;
1227            case 0: /* aligned */
1228            case 8: /* atmax MO_64 */
1229            default:
1230                g_assert_not_reached();
1231            }
1232            return;
1233        }
1234        break;
1235    case MO_128:
1236        if (HAVE_al16) {
1237            store_atomic16(pv, val);
1238            return;
1239        }
1240        break;
1241    default:
1242        g_assert_not_reached();
1243    }
1244    cpu_loop_exit_atomic(env_cpu(env), ra);
1245}
1246