xref: /qemu/accel/tcg/ldst_atomicity.c.inc (revision 2bfb10df)
/*
 * Routines common to user and system emulation of load/store.
 *
 *  Copyright (c) 2022 Linaro, Ltd.
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

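/*
 * HAVE_al8 notes whether the host can perform 8-byte atomic operations
 * at all (CONFIG_ATOMIC64); HAVE_al8_fast notes whether such operations
 * are also cheap, i.e. the host atomic register size is at least 8 bytes.
 */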
#ifdef CONFIG_ATOMIC64
# define HAVE_al8          true
#else
# define HAVE_al8          false
#endif
#define HAVE_al8_fast      (ATOMIC_REG_SIZE >= 8)

/**
 * required_atomicity:
 *
 * Return the lg2 bytes of atomicity required by @memop for @p.
 * If the operation must be split into two operations to be
 * examined separately for atomicity, return -lg2.
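 *
 * For example, with MO_ATOM_WITHIN16 a 4-byte operation at p % 16 == 12
 * must be 4-byte atomic (MO_32 is returned), while the same operation at
 * p % 16 == 14 crosses the 16-byte boundary and needs only byte
 * atomicity (MO_8 is returned).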
 */
static int required_atomicity(CPUArchState *env, uintptr_t p, MemOp memop)
{
    MemOp atom = memop & MO_ATOM_MASK;
    MemOp size = memop & MO_SIZE;
    MemOp half = size ? size - 1 : 0;
    unsigned tmp;
    int atmax;

    switch (atom) {
    case MO_ATOM_NONE:
        atmax = MO_8;
        break;

    case MO_ATOM_IFALIGN_PAIR:
        size = half;
        /* fall through */

    case MO_ATOM_IFALIGN:
        tmp = (1 << size) - 1;
        atmax = p & tmp ? MO_8 : size;
        break;

    case MO_ATOM_WITHIN16:
        tmp = p & 15;
        atmax = (tmp + (1 << size) <= 16 ? size : MO_8);
        break;

    case MO_ATOM_WITHIN16_PAIR:
        tmp = p & 15;
        if (tmp + (1 << size) <= 16) {
            atmax = size;
        } else if (tmp + (1 << half) == 16) {
            /*
             * The pair exactly straddles the boundary.
             * Both halves are naturally aligned and atomic.
             */
            atmax = half;
        } else {
            /*
             * One of the pair crosses the boundary, and is non-atomic.
             * The other of the pair does not cross, and is atomic.
             */
            atmax = -half;
        }
        break;

    case MO_ATOM_SUBALIGN:
        /*
         * Examine the alignment of p to determine if there are subobjects
         * that must be aligned.  Note that we only really need ctz4() --
         * any more significant bits are discarded by the immediately
         * following comparison.
         */
        tmp = ctz32(p);
        atmax = MIN(size, tmp);
        break;

    default:
        g_assert_not_reached();
    }

    /*
     * Here we have the architectural atomicity of the operation.
     * However, when executing in a serial context, we need no extra
     * host atomicity in order to avoid racing.  This reduction
     * avoids looping with cpu_loop_exit_atomic.
     */
    if (cpu_in_serial_context(env_cpu(env))) {
        return MO_8;
    }
    return atmax;
}

/**
 * load_atomic2:
 * @pv: host address
 *
 * Atomically load 2 aligned bytes from @pv.
 */
static inline uint16_t load_atomic2(void *pv)
{
    uint16_t *p = __builtin_assume_aligned(pv, 2);
    return qatomic_read(p);
}

/**
 * load_atomic4:
 * @pv: host address
 *
 * Atomically load 4 aligned bytes from @pv.
 */
static inline uint32_t load_atomic4(void *pv)
{
    uint32_t *p = __builtin_assume_aligned(pv, 4);
    return qatomic_read(p);
}

/**
 * load_atomic8:
 * @pv: host address
 *
 * Atomically load 8 aligned bytes from @pv.
 */
static inline uint64_t load_atomic8(void *pv)
{
    uint64_t *p = __builtin_assume_aligned(pv, 8);

    qemu_build_assert(HAVE_al8);
    return qatomic_read__nocheck(p);
}

/**
 * load_atomic8_or_exit:
 * @env: cpu context
 * @ra: host unwind address
 * @pv: host address
 *
 * Atomically load 8 aligned bytes from @pv.
 * If this is not possible, longjmp out to restart serially.
 */
static uint64_t load_atomic8_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
{
    if (HAVE_al8) {
        return load_atomic8(pv);
    }

#ifdef CONFIG_USER_ONLY
    /*
     * If the page is not writable, then assume the value is immutable
     * and requires no locking.  This ignores the case of MAP_SHARED with
     * another process, because the fallback start_exclusive solution
     * provides no protection across processes.
     */
    if (!page_check_range(h2g(pv), 8, PAGE_WRITE)) {
        uint64_t *p = __builtin_assume_aligned(pv, 8);
        return *p;
    }
#endif

    /* Ultimate fallback: re-execute in serial context. */
    cpu_loop_exit_atomic(env_cpu(env), ra);
}

/**
 * load_atomic16_or_exit:
 * @env: cpu context
 * @ra: host unwind address
 * @pv: host address
 *
 * Atomically load 16 aligned bytes from @pv.
 * If this is not possible, longjmp out to restart serially.
 */
static Int128 load_atomic16_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
{
    Int128 *p = __builtin_assume_aligned(pv, 16);

    if (HAVE_ATOMIC128_RO) {
        return atomic16_read_ro(p);
    }

#ifdef CONFIG_USER_ONLY
    /*
     * We can only use cmpxchg to emulate a load if the page is writable.
     * If the page is not writable, then assume the value is immutable
     * and requires no locking.  This ignores the case of MAP_SHARED with
     * another process, because the fallback start_exclusive solution
     * provides no protection across processes.
     */
    if (!page_check_range(h2g(p), 16, PAGE_WRITE)) {
        return *p;
    }
#endif

    /*
     * In system mode all guest pages are writable, and for user-only
     * we have just checked writability.  Try cmpxchg.
     */
    if (HAVE_ATOMIC128_RW) {
        return atomic16_read_rw(p);
    }

    /* Ultimate fallback: re-execute in serial context. */
    cpu_loop_exit_atomic(env_cpu(env), ra);
}

/**
 * load_atom_extract_al4x2:
 * @pv: host address
 *
 * Load 4 bytes from @pv, with two sequential atomic 4-byte loads.
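 * E.g. on a little-endian host with pi % 4 == 3 (sh == 24), the result
 * combines the high byte of the first aligned word with the low three
 * bytes of the following word.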
 */
static uint32_t load_atom_extract_al4x2(void *pv)
{
    uintptr_t pi = (uintptr_t)pv;
    int sh = (pi & 3) * 8;
    uint32_t a, b;

    pv = (void *)(pi & ~3);
    a = load_atomic4(pv);
    b = load_atomic4(pv + 4);

    if (HOST_BIG_ENDIAN) {
        return (a << sh) | (b >> (-sh & 31));
    } else {
        return (a >> sh) | (b << (-sh & 31));
    }
}

/**
 * load_atom_extract_al8x2:
 * @pv: host address
 *
 * Load 8 bytes from @pv, with two sequential atomic 8-byte loads.
 */
static uint64_t load_atom_extract_al8x2(void *pv)
{
    uintptr_t pi = (uintptr_t)pv;
    int sh = (pi & 7) * 8;
    uint64_t a, b;

    pv = (void *)(pi & ~7);
    a = load_atomic8(pv);
    b = load_atomic8(pv + 8);

    if (HOST_BIG_ENDIAN) {
        return (a << sh) | (b >> (-sh & 63));
    } else {
        return (a >> sh) | (b << (-sh & 63));
    }
}

/**
 * load_atom_extract_al8_or_exit:
 * @env: cpu context
 * @ra: host unwind address
 * @pv: host address
 * @s: object size in bytes, @s <= 4.
 *
 * Atomically load @s bytes from @p, when p % s != 0, and [p, p+s-1] does
 * not cross an 8-byte boundary.  This means that we can perform an atomic
 * 8-byte load and extract.
 * The value is returned in the low bits of a uint32_t.
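 *
 * E.g. a 2-byte load at p % 8 == 1 reads the containing aligned 8-byte
 * word and shifts right by 8 bits on a little-endian host, or by
 * (8 - 2 - 1) * 8 == 40 bits on a big-endian host.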
 */
static uint32_t load_atom_extract_al8_or_exit(CPUArchState *env, uintptr_t ra,
                                              void *pv, int s)
{
    uintptr_t pi = (uintptr_t)pv;
    int o = pi & 7;
    int shr = (HOST_BIG_ENDIAN ? 8 - s - o : o) * 8;

    pv = (void *)(pi & ~7);
    return load_atomic8_or_exit(env, ra, pv) >> shr;
}

/**
 * load_atom_extract_al16_or_exit:
 * @env: cpu context
 * @ra: host unwind address
 * @p: host address
 * @s: object size in bytes, @s <= 8.
 *
 * Atomically load @s bytes from @p, when p % 16 < 8
 * and p % 16 + s > 8.  I.e. does not cross a 16-byte
 * boundary, but *does* cross an 8-byte boundary.
 * This is the slow version, so we must have eliminated
 * any faster load_atom_extract_al8_or_exit case.
 *
 * If this is not possible, longjmp out to restart serially.
 */
static uint64_t load_atom_extract_al16_or_exit(CPUArchState *env, uintptr_t ra,
                                               void *pv, int s)
{
    uintptr_t pi = (uintptr_t)pv;
    int o = pi & 7;
    int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
    Int128 r;

    /*
     * Note constraints above: p & 8 must be clear.
     * Provoke SIGBUS if possible otherwise.
     */
    pv = (void *)(pi & ~7);
    r = load_atomic16_or_exit(env, ra, pv);

    r = int128_urshift(r, shr);
    return int128_getlo(r);
}

/**
 * load_atom_extract_al16_or_al8:
 * @p: host address
 * @s: object size in bytes, @s <= 8.
 *
 * Load @s bytes from @p, when p % s != 0.  If [p, p+s-1] does not
 * cross a 16-byte boundary then the access must be 16-byte atomic,
 * otherwise the access must be 8-byte atomic.
 */
static inline uint64_t ATTRIBUTE_ATOMIC128_OPT
load_atom_extract_al16_or_al8(void *pv, int s)
{
    uintptr_t pi = (uintptr_t)pv;
    int o = pi & 7;
    int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
    Int128 r;

    pv = (void *)(pi & ~7);
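    /*
     * With bit 3 of the address set, the aligned 8-byte word at pv is
     * the high half of its 16-byte block, so use two atomic 8-byte
     * loads; otherwise pv is 16-aligned and one atomic 16-byte load
     * covers the value.
     */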
    if (pi & 8) {
        uint64_t *p8 = __builtin_assume_aligned(pv, 16, 8);
        uint64_t a = qatomic_read__nocheck(p8);
        uint64_t b = qatomic_read__nocheck(p8 + 1);

        if (HOST_BIG_ENDIAN) {
            r = int128_make128(b, a);
        } else {
            r = int128_make128(a, b);
        }
    } else {
        r = atomic16_read_ro(pv);
    }
    return int128_getlo(int128_urshift(r, shr));
}

/**
 * load_atom_4_by_2:
 * @pv: host address
 *
 * Load 4 bytes from @pv, with two 2-byte atomic loads.
 */
static inline uint32_t load_atom_4_by_2(void *pv)
{
    uint32_t a = load_atomic2(pv);
    uint32_t b = load_atomic2(pv + 2);

    if (HOST_BIG_ENDIAN) {
        return (a << 16) | b;
    } else {
        return (b << 16) | a;
    }
}

/**
 * load_atom_8_by_2:
 * @pv: host address
 *
 * Load 8 bytes from @pv, with four 2-byte atomic loads.
 */
static inline uint64_t load_atom_8_by_2(void *pv)
{
    uint32_t a = load_atom_4_by_2(pv);
    uint32_t b = load_atom_4_by_2(pv + 4);

    if (HOST_BIG_ENDIAN) {
        return ((uint64_t)a << 32) | b;
    } else {
        return ((uint64_t)b << 32) | a;
    }
}

/**
 * load_atom_8_by_4:
 * @pv: host address
 *
 * Load 8 bytes from @pv, with two 4-byte atomic loads.
 */
static inline uint64_t load_atom_8_by_4(void *pv)
{
    uint32_t a = load_atomic4(pv);
    uint32_t b = load_atomic4(pv + 4);

    if (HOST_BIG_ENDIAN) {
        return ((uint64_t)a << 32) | b;
    } else {
        return ((uint64_t)b << 32) | a;
    }
}

/**
 * load_atom_8_by_8_or_4:
 * @pv: host address
 *
 * Load 8 bytes from aligned @pv, with at least 4-byte atomicity.
 */
static inline uint64_t load_atom_8_by_8_or_4(void *pv)
{
    if (HAVE_al8_fast) {
        return load_atomic8(pv);
    } else {
        return load_atom_8_by_4(pv);
    }
}

/**
 * load_atom_2:
 * @p: host address
 * @memop: the full memory op
 *
 * Load 2 bytes from @p, honoring the atomicity of @memop.
 */
static uint16_t load_atom_2(CPUArchState *env, uintptr_t ra,
                            void *pv, MemOp memop)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 1) == 0)) {
        return load_atomic2(pv);
    }
    if (HAVE_ATOMIC128_RO) {
        return load_atom_extract_al16_or_al8(pv, 2);
    }

    atmax = required_atomicity(env, pi, memop);
    switch (atmax) {
    case MO_8:
        return lduw_he_p(pv);
    case MO_16:
        /* The only case remaining is MO_ATOM_WITHIN16. */
        if (!HAVE_al8_fast && (pi & 3) == 1) {
            /* Big or little endian, we want the middle two bytes. */
            return load_atomic4(pv - 1) >> 8;
        }
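        /*
         * Within an aligned 16-byte block, only p % 16 == 7 leaves the
         * two bytes straddling an 8-byte boundary.
         */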
        if ((pi & 15) != 7) {
            return load_atom_extract_al8_or_exit(env, ra, pv, 2);
        }
        return load_atom_extract_al16_or_exit(env, ra, pv, 2);
    default:
        g_assert_not_reached();
    }
}

/**
 * load_atom_4:
 * @p: host address
 * @memop: the full memory op
 *
 * Load 4 bytes from @p, honoring the atomicity of @memop.
 */
static uint32_t load_atom_4(CPUArchState *env, uintptr_t ra,
                            void *pv, MemOp memop)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 3) == 0)) {
        return load_atomic4(pv);
    }
    if (HAVE_ATOMIC128_RO) {
        return load_atom_extract_al16_or_al8(pv, 4);
    }

    atmax = required_atomicity(env, pi, memop);
    switch (atmax) {
    case MO_8:
    case MO_16:
    case -MO_16:
        /*
         * For MO_ATOM_IFALIGN, this is more atomicity than required,
         * but it's trivially supported on all hosts, better than 4
         * individual byte loads (when the host requires alignment),
         * and overlaps with the MO_ATOM_SUBALIGN case of p % 2 == 0.
         */
        return load_atom_extract_al4x2(pv);
    case MO_32:
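        /*
         * A misaligned 4-byte value can only require MO_32 atomicity
         * via MO_ATOM_WITHIN16, so it lies within an aligned 16-byte
         * block; it is also within an aligned 8-byte word exactly when
         * bit 2 of the address is clear.
         */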
        if (!(pi & 4)) {
            return load_atom_extract_al8_or_exit(env, ra, pv, 4);
        }
        return load_atom_extract_al16_or_exit(env, ra, pv, 4);
    default:
        g_assert_not_reached();
    }
}

/**
 * load_atom_8:
 * @p: host address
 * @memop: the full memory op
 *
 * Load 8 bytes from @p, honoring the atomicity of @memop.
 */
static uint64_t load_atom_8(CPUArchState *env, uintptr_t ra,
                            void *pv, MemOp memop)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    /*
     * If the host does not support 8-byte atomics, wait until we have
     * examined the atomicity parameters below.
     */
    if (HAVE_al8 && likely((pi & 7) == 0)) {
        return load_atomic8(pv);
    }
    if (HAVE_ATOMIC128_RO) {
        return load_atom_extract_al16_or_al8(pv, 8);
    }

    atmax = required_atomicity(env, pi, memop);
    if (atmax == MO_64) {
        if (!HAVE_al8 && (pi & 7) == 0) {
            return load_atomic8_or_exit(env, ra, pv);
        }
        return load_atom_extract_al16_or_exit(env, ra, pv, 8);
    }
    if (HAVE_al8_fast) {
        return load_atom_extract_al8x2(pv);
    }
    switch (atmax) {
    case MO_8:
        return ldq_he_p(pv);
    case MO_16:
        return load_atom_8_by_2(pv);
    case MO_32:
        return load_atom_8_by_4(pv);
    case -MO_32:
        if (HAVE_al8) {
            return load_atom_extract_al8x2(pv);
        }
        cpu_loop_exit_atomic(env_cpu(env), ra);
    default:
        g_assert_not_reached();
    }
}

/**
 * load_atom_16:
 * @p: host address
 * @memop: the full memory op
 *
 * Load 16 bytes from @p, honoring the atomicity of @memop.
 */
static Int128 load_atom_16(CPUArchState *env, uintptr_t ra,
                           void *pv, MemOp memop)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;
    Int128 r;
    uint64_t a, b;

    /*
     * If the host does not support 16-byte atomics, wait until we have
     * examined the atomicity parameters below.
     */
    if (HAVE_ATOMIC128_RO && likely((pi & 15) == 0)) {
        return atomic16_read_ro(pv);
    }

    atmax = required_atomicity(env, pi, memop);
    switch (atmax) {
    case MO_8:
        memcpy(&r, pv, 16);
        return r;
    case MO_16:
        a = load_atom_8_by_2(pv);
        b = load_atom_8_by_2(pv + 8);
        break;
    case MO_32:
        a = load_atom_8_by_4(pv);
        b = load_atom_8_by_4(pv + 8);
        break;
    case MO_64:
        if (!HAVE_al8) {
            cpu_loop_exit_atomic(env_cpu(env), ra);
        }
        a = load_atomic8(pv);
        b = load_atomic8(pv + 8);
        break;
    case -MO_64:
        if (!HAVE_al8) {
            cpu_loop_exit_atomic(env_cpu(env), ra);
        }
        a = load_atom_extract_al8x2(pv);
        b = load_atom_extract_al8x2(pv + 8);
        break;
    case MO_128:
        return load_atomic16_or_exit(env, ra, pv);
    default:
        g_assert_not_reached();
    }
    return int128_make128(HOST_BIG_ENDIAN ? b : a, HOST_BIG_ENDIAN ? a : b);
}

/**
 * store_atomic2:
 * @pv: host address
 * @val: value to store
 *
 * Atomically store 2 aligned bytes to @pv.
 */
static inline void store_atomic2(void *pv, uint16_t val)
{
    uint16_t *p = __builtin_assume_aligned(pv, 2);
    qatomic_set(p, val);
}

/**
 * store_atomic4:
 * @pv: host address
 * @val: value to store
 *
 * Atomically store 4 aligned bytes to @pv.
 */
static inline void store_atomic4(void *pv, uint32_t val)
{
    uint32_t *p = __builtin_assume_aligned(pv, 4);
    qatomic_set(p, val);
}

/**
 * store_atomic8:
 * @pv: host address
 * @val: value to store
 *
 * Atomically store 8 aligned bytes to @pv.
 */
static inline void store_atomic8(void *pv, uint64_t val)
{
    uint64_t *p = __builtin_assume_aligned(pv, 8);

    qemu_build_assert(HAVE_al8);
    qatomic_set__nocheck(p, val);
}

/**
 * store_atom_4_by_2
 */
static inline void store_atom_4_by_2(void *pv, uint32_t val)
{
    store_atomic2(pv, val >> (HOST_BIG_ENDIAN ? 16 : 0));
    store_atomic2(pv + 2, val >> (HOST_BIG_ENDIAN ? 0 : 16));
}

/**
 * store_atom_8_by_2
 */
static inline void store_atom_8_by_2(void *pv, uint64_t val)
{
    store_atom_4_by_2(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0));
    store_atom_4_by_2(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32));
}

/**
 * store_atom_8_by_4
 */
static inline void store_atom_8_by_4(void *pv, uint64_t val)
{
    store_atomic4(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0));
    store_atomic4(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32));
}

/**
 * store_atom_insert_al4:
 * @p: host address
 * @val: shifted value to store
 * @msk: mask for value to store
 *
 * Atomically store @val to @p, masked by @msk.
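 *
 * E.g. with a 16-bit value v, store_atom_insert_al4(p, (uint32_t)v << 8,
 * MAKE_64BIT_MASK(8, 16)) replaces the middle two bytes of *p, as used
 * by store_atom_2() below.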
 */
static void store_atom_insert_al4(uint32_t *p, uint32_t val, uint32_t msk)
{
    uint32_t old, new;

    p = __builtin_assume_aligned(p, 4);
    old = qatomic_read(p);
    do {
        new = (old & ~msk) | val;
    } while (!__atomic_compare_exchange_n(p, &old, new, true,
                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
}

/**
 * store_atom_insert_al8:
 * @p: host address
 * @val: shifted value to store
 * @msk: mask for value to store
 *
 * Atomically store @val to @p masked by @msk.
 */
static void store_atom_insert_al8(uint64_t *p, uint64_t val, uint64_t msk)
{
    uint64_t old, new;

    qemu_build_assert(HAVE_al8);
    p = __builtin_assume_aligned(p, 8);
    old = qatomic_read__nocheck(p);
    do {
        new = (old & ~msk) | val;
    } while (!__atomic_compare_exchange_n(p, &old, new, true,
                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
}

/**
 * store_atom_insert_al16:
 * @p: host address
 * @val: shifted value to store
 * @msk: mask for value to store
 *
 * Atomically store @val to @p masked by @msk.
 */
static void ATTRIBUTE_ATOMIC128_OPT
store_atom_insert_al16(Int128 *ps, Int128Alias val, Int128Alias msk)
{
#if defined(CONFIG_ATOMIC128)
    __uint128_t *pu, old, new;

    /* With CONFIG_ATOMIC128, we can avoid the memory barriers. */
    pu = __builtin_assume_aligned(ps, 16);
    old = *pu;
    do {
        new = (old & ~msk.u) | val.u;
    } while (!__atomic_compare_exchange_n(pu, &old, new, true,
                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
#elif defined(CONFIG_CMPXCHG128)
    __uint128_t *pu, old, new;

    /*
     * Without CONFIG_ATOMIC128, __atomic_compare_exchange_n will always
     * defer to libatomic, so we must use __sync_*_compare_and_swap_16
     * and accept the sequential consistency that comes with it.
     */
    pu = __builtin_assume_aligned(ps, 16);
    do {
        old = *pu;
        new = (old & ~msk.u) | val.u;
    } while (!__sync_bool_compare_and_swap_16(pu, old, new));
#else
    qemu_build_not_reached();
#endif
}

/**
 * store_bytes_leN:
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * Store @size bytes at @p.  The bytes to store are extracted in little-endian order
 * from @val_le; return the bytes of @val_le beyond @size that have not been stored.
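 *
 * E.g. store_bytes_leN(p, 3, 0x0123456789abcdef) stores 0xef, 0xcd, 0xab
 * to p[0], p[1], p[2] and returns 0x0000000123456789.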
 */
static uint64_t store_bytes_leN(void *pv, int size, uint64_t val_le)
{
    uint8_t *p = pv;
    for (int i = 0; i < size; i++, val_le >>= 8) {
        p[i] = val_le;
    }
    return val_le;
}

/**
 * store_parts_leN
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically on each aligned part.
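 * E.g. with pv % 8 == 2 and size == 6, this issues an atomic 2-byte
 * store followed by an atomic 4-byte store.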
 */
G_GNUC_UNUSED
static uint64_t store_parts_leN(void *pv, int size, uint64_t val_le)
{
    do {
        int n;

        /* Find minimum of alignment and size */
        switch (((uintptr_t)pv | size) & 7) {
        case 4:
            store_atomic4(pv, le32_to_cpu(val_le));
            val_le >>= 32;
            n = 4;
            break;
        case 2:
        case 6:
            store_atomic2(pv, le16_to_cpu(val_le));
            val_le >>= 16;
            n = 2;
            break;
        default:
            *(uint8_t *)pv = val_le;
            val_le >>= 8;
            n = 1;
            break;
        case 0:
            g_assert_not_reached();
        }
        pv += n;
        size -= n;
    } while (size != 0);

    return val_le;
}

/**
 * store_whole_le4
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically as a whole.
 * Four aligned bytes are guaranteed to cover the store.
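 * E.g. a 3-byte store at pv % 4 == 1 becomes a masked atomic update of
 * bytes 1..3 of the containing aligned word, returning val_le >> 24.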
 */
static uint64_t store_whole_le4(void *pv, int size, uint64_t val_le)
{
    int sz = size * 8;
    int o = (uintptr_t)pv & 3;
    int sh = o * 8;
    uint32_t m = MAKE_64BIT_MASK(0, sz);
    uint32_t v;

    if (HOST_BIG_ENDIAN) {
        v = bswap32(val_le) >> sh;
        m = bswap32(m) >> sh;
    } else {
        v = val_le << sh;
        m <<= sh;
    }
    store_atom_insert_al4(pv - o, v, m);
    return val_le >> sz;
}

/**
 * store_whole_le8
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically as a whole.
 * Eight aligned bytes are guaranteed to cover the store.
 */
static uint64_t store_whole_le8(void *pv, int size, uint64_t val_le)
{
    int sz = size * 8;
    int o = (uintptr_t)pv & 7;
    int sh = o * 8;
    uint64_t m = MAKE_64BIT_MASK(0, sz);
    uint64_t v;

    qemu_build_assert(HAVE_al8);
    if (HOST_BIG_ENDIAN) {
        v = bswap64(val_le) >> sh;
        m = bswap64(m) >> sh;
    } else {
        v = val_le << sh;
        m <<= sh;
    }
    store_atom_insert_al8(pv - o, v, m);
    return val_le >> sz;
}

/**
 * store_whole_le16
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically as a whole.
 * 16 aligned bytes are guaranteed to cover the store.
 */
static uint64_t store_whole_le16(void *pv, int size, Int128 val_le)
{
    int sz = size * 8;
    int o = (uintptr_t)pv & 15;
    int sh = o * 8;
    Int128 m, v;

    qemu_build_assert(HAVE_ATOMIC128_RW);

    /* Like MAKE_64BIT_MASK(0, sz), but larger. */
    if (sz <= 64) {
        m = int128_make64(MAKE_64BIT_MASK(0, sz));
    } else {
        m = int128_make128(-1, MAKE_64BIT_MASK(0, sz - 64));
    }

    if (HOST_BIG_ENDIAN) {
        v = int128_urshift(bswap128(val_le), sh);
        m = int128_urshift(bswap128(m), sh);
    } else {
        v = int128_lshift(val_le, sh);
        m = int128_lshift(m, sh);
    }
    store_atom_insert_al16(pv - o, v, m);

    /* Unused if sz <= 64. */
    return int128_gethi(val_le) >> (sz - 64);
}

/**
 * store_atom_2:
 * @p: host address
 * @val: the value to store
 * @memop: the full memory op
 *
 * Store 2 bytes to @p, honoring the atomicity of @memop.
 */
static void store_atom_2(CPUArchState *env, uintptr_t ra,
                         void *pv, MemOp memop, uint16_t val)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 1) == 0)) {
        store_atomic2(pv, val);
        return;
    }

    atmax = required_atomicity(env, pi, memop);
    if (atmax == MO_8) {
        stw_he_p(pv, val);
        return;
    }

    /*
     * The only case remaining is MO_ATOM_WITHIN16.
     * Big or little endian, we want the middle two bytes in each test.
     */
    if ((pi & 3) == 1) {
        store_atom_insert_al4(pv - 1, (uint32_t)val << 8, MAKE_64BIT_MASK(8, 16));
        return;
    } else if ((pi & 7) == 3) {
        if (HAVE_al8) {
            store_atom_insert_al8(pv - 3, (uint64_t)val << 24, MAKE_64BIT_MASK(24, 16));
            return;
        }
    } else if ((pi & 15) == 7) {
        if (HAVE_ATOMIC128_RW) {
            Int128 v = int128_lshift(int128_make64(val), 56);
            Int128 m = int128_lshift(int128_make64(0xffff), 56);
            store_atom_insert_al16(pv - 7, v, m);
            return;
        }
    } else {
        g_assert_not_reached();
    }

    cpu_loop_exit_atomic(env_cpu(env), ra);
}

/**
 * store_atom_4:
 * @p: host address
 * @val: the value to store
 * @memop: the full memory op
 *
 * Store 4 bytes to @p, honoring the atomicity of @memop.
 */
static void store_atom_4(CPUArchState *env, uintptr_t ra,
                         void *pv, MemOp memop, uint32_t val)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 3) == 0)) {
        store_atomic4(pv, val);
        return;
    }

    atmax = required_atomicity(env, pi, memop);
    switch (atmax) {
    case MO_8:
        stl_he_p(pv, val);
        return;
    case MO_16:
        store_atom_4_by_2(pv, val);
        return;
    case -MO_16:
        {
            uint32_t val_le = cpu_to_le32(val);
            int s2 = pi & 3;
            int s1 = 4 - s2;

            switch (s2) {
            case 1:
                val_le = store_whole_le4(pv, s1, val_le);
                *(uint8_t *)(pv + 3) = val_le;
                break;
            case 3:
                *(uint8_t *)pv = val_le;
                store_whole_le4(pv + 1, s2, val_le >> 8);
                break;
            case 0: /* aligned */
            case 2: /* atmax MO_16 */
            default:
                g_assert_not_reached();
            }
        }
        return;
    case MO_32:
        if ((pi & 7) < 4) {
            if (HAVE_al8) {
                store_whole_le8(pv, 4, cpu_to_le32(val));
                return;
            }
        } else {
            if (HAVE_ATOMIC128_RW) {
                store_whole_le16(pv, 4, int128_make64(cpu_to_le32(val)));
                return;
            }
        }
        cpu_loop_exit_atomic(env_cpu(env), ra);
    default:
        g_assert_not_reached();
    }
}

/**
 * store_atom_8:
 * @p: host address
 * @val: the value to store
 * @memop: the full memory op
 *
 * Store 8 bytes to @p, honoring the atomicity of @memop.
 */
static void store_atom_8(CPUArchState *env, uintptr_t ra,
                         void *pv, MemOp memop, uint64_t val)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (HAVE_al8 && likely((pi & 7) == 0)) {
        store_atomic8(pv, val);
        return;
    }

    atmax = required_atomicity(env, pi, memop);
    switch (atmax) {
    case MO_8:
        stq_he_p(pv, val);
        return;
    case MO_16:
        store_atom_8_by_2(pv, val);
        return;
    case MO_32:
        store_atom_8_by_4(pv, val);
        return;
    case -MO_32:
        if (HAVE_al8) {
            uint64_t val_le = cpu_to_le64(val);
            int s2 = pi & 7;
            int s1 = 8 - s2;

            switch (s2) {
            case 1 ... 3:
                val_le = store_whole_le8(pv, s1, val_le);
                store_bytes_leN(pv + s1, s2, val_le);
                break;
            case 5 ... 7:
                val_le = store_bytes_leN(pv, s1, val_le);
                store_whole_le8(pv + s1, s2, val_le);
                break;
            case 0: /* aligned */
            case 4: /* atmax MO_32 */
            default:
                g_assert_not_reached();
            }
            return;
        }
        break;
    case MO_64:
        if (HAVE_ATOMIC128_RW) {
            store_whole_le16(pv, 8, int128_make64(cpu_to_le64(val)));
            return;
        }
        break;
    default:
        g_assert_not_reached();
    }
    cpu_loop_exit_atomic(env_cpu(env), ra);
}

/**
 * store_atom_16:
 * @p: host address
 * @val: the value to store
 * @memop: the full memory op
 *
 * Store 16 bytes to @p, honoring the atomicity of @memop.
 */
static void store_atom_16(CPUArchState *env, uintptr_t ra,
                          void *pv, MemOp memop, Int128 val)
{
    uintptr_t pi = (uintptr_t)pv;
    uint64_t a, b;
    int atmax;

    if (HAVE_ATOMIC128_RW && likely((pi & 15) == 0)) {
        atomic16_set(pv, val);
        return;
    }

    atmax = required_atomicity(env, pi, memop);

    a = HOST_BIG_ENDIAN ? int128_gethi(val) : int128_getlo(val);
    b = HOST_BIG_ENDIAN ? int128_getlo(val) : int128_gethi(val);
    switch (atmax) {
    case MO_8:
        memcpy(pv, &val, 16);
        return;
    case MO_16:
        store_atom_8_by_2(pv, a);
        store_atom_8_by_2(pv + 8, b);
        return;
    case MO_32:
        store_atom_8_by_4(pv, a);
        store_atom_8_by_4(pv + 8, b);
        return;
    case MO_64:
        if (HAVE_al8) {
            store_atomic8(pv, a);
            store_atomic8(pv + 8, b);
            return;
        }
        break;
    case -MO_64:
        if (HAVE_ATOMIC128_RW) {
            uint64_t val_le;
            int s2 = pi & 15;
            int s1 = 16 - s2;

            if (HOST_BIG_ENDIAN) {
                val = bswap128(val);
            }
            switch (s2) {
            case 1 ... 7:
                val_le = store_whole_le16(pv, s1, val);
                store_bytes_leN(pv + s1, s2, val_le);
                break;
            case 9 ... 15:
                store_bytes_leN(pv, s1, int128_getlo(val));
                val = int128_urshift(val, s1 * 8);
                store_whole_le16(pv + s1, s2, val);
                break;
            case 0: /* aligned */
            case 8: /* atmax MO_64 */
            default:
                g_assert_not_reached();
            }
            return;
        }
        break;
    case MO_128:
        if (HAVE_ATOMIC128_RW) {
            atomic16_set(pv, val);
            return;
        }
        break;
    default:
        g_assert_not_reached();
    }
    cpu_loop_exit_atomic(env_cpu(env), ra);
}
1174