xref: /qemu/accel/tcg/ldst_atomicity.c.inc (revision 10be627d)
1/*
2 * Routines common to user and system emulation of load/store.
3 *
4 *  Copyright (c) 2022 Linaro, Ltd.
5 *
6 * SPDX-License-Identifier: GPL-2.0-or-later
7 *
8 * This work is licensed under the terms of the GNU GPL, version 2 or later.
9 * See the COPYING file in the top-level directory.
10 */
11
12#include "host/load-extract-al16-al8.h"
13#include "host/store-insert-al16.h"
14
/*
 * HAVE_al8 mirrors CONFIG_ATOMIC64; the 8-byte atomic helpers in this
 * file build-assert on it.  HAVE_al8_fast is true when ATOMIC_REG_SIZE
 * is at least 8 bytes.
 */
#if defined(CONFIG_ATOMIC64)
# define HAVE_al8          true
#else
# define HAVE_al8          false
#endif
#define HAVE_al8_fast      (ATOMIC_REG_SIZE >= 8)
21
22/**
23 * required_atomicity:
24 *
25 * Return the lg2 bytes of atomicity required by @memop for @p.
26 * If the operation must be split into two operations to be
27 * examined separately for atomicity, return -lg2.
28 */
29static int required_atomicity(CPUArchState *env, uintptr_t p, MemOp memop)
30{
31    MemOp atom = memop & MO_ATOM_MASK;
32    MemOp size = memop & MO_SIZE;
33    MemOp half = size ? size - 1 : 0;
34    unsigned tmp;
35    int atmax;
36
37    switch (atom) {
38    case MO_ATOM_NONE:
39        atmax = MO_8;
40        break;
41
42    case MO_ATOM_IFALIGN_PAIR:
43        size = half;
44        /* fall through */
45
46    case MO_ATOM_IFALIGN:
47        tmp = (1 << size) - 1;
48        atmax = p & tmp ? MO_8 : size;
49        break;
50
51    case MO_ATOM_WITHIN16:
52        tmp = p & 15;
53        atmax = (tmp + (1 << size) <= 16 ? size : MO_8);
54        break;
55
56    case MO_ATOM_WITHIN16_PAIR:
57        tmp = p & 15;
58        if (tmp + (1 << size) <= 16) {
59            atmax = size;
60        } else if (tmp + (1 << half) == 16) {
61            /*
62             * The pair exactly straddles the boundary.
63             * Both halves are naturally aligned and atomic.
64             */
65            atmax = half;
66        } else {
67            /*
68             * One of the pair crosses the boundary, and is non-atomic.
69             * The other of the pair does not cross, and is atomic.
70             */
71            atmax = -half;
72        }
73        break;
74
75    case MO_ATOM_SUBALIGN:
76        /*
77         * Examine the alignment of p to determine if there are subobjects
78         * that must be aligned.  Note that we only really need ctz4() --
79         * any more sigificant bits are discarded by the immediately
80         * following comparison.
81         */
82        tmp = ctz32(p);
83        atmax = MIN(size, tmp);
84        break;
85
86    default:
87        g_assert_not_reached();
88    }
89
90    /*
91     * Here we have the architectural atomicity of the operation.
92     * However, when executing in a serial context, we need no extra
93     * host atomicity in order to avoid racing.  This reduction
94     * avoids looping with cpu_loop_exit_atomic.
95     */
96    if (cpu_in_serial_context(env_cpu(env))) {
97        return MO_8;
98    }
99    return atmax;
100}
101
/**
 * load_atomic2:
 * @pv: host address, 2-byte aligned
 *
 * Atomically load 2 aligned bytes from @pv.
 */
static inline uint16_t load_atomic2(void *pv)
{
    uint16_t *src = __builtin_assume_aligned(pv, 2);
    uint16_t val = qatomic_read(src);

    return val;
}
113
/**
 * load_atomic4:
 * @pv: host address, 4-byte aligned
 *
 * Atomically load 4 aligned bytes from @pv.
 */
static inline uint32_t load_atomic4(void *pv)
{
    uint32_t *src = __builtin_assume_aligned(pv, 4);
    uint32_t val = qatomic_read(src);

    return val;
}
125
126/**
127 * load_atomic8:
128 * @pv: host address
129 *
130 * Atomically load 8 aligned bytes from @pv.
131 */
132static inline uint64_t load_atomic8(void *pv)
133{
134    uint64_t *p = __builtin_assume_aligned(pv, 8);
135
136    qemu_build_assert(HAVE_al8);
137    return qatomic_read__nocheck(p);
138}
139
140/**
141 * load_atomic8_or_exit:
142 * @env: cpu context
143 * @ra: host unwind address
144 * @pv: host address
145 *
146 * Atomically load 8 aligned bytes from @pv.
147 * If this is not possible, longjmp out to restart serially.
148 */
149static uint64_t load_atomic8_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
150{
151    if (HAVE_al8) {
152        return load_atomic8(pv);
153    }
154
155#ifdef CONFIG_USER_ONLY
156    /*
157     * If the page is not writable, then assume the value is immutable
158     * and requires no locking.  This ignores the case of MAP_SHARED with
159     * another process, because the fallback start_exclusive solution
160     * provides no protection across processes.
161     */
162    WITH_MMAP_LOCK_GUARD() {
163        if (!page_check_range(h2g(pv), 8, PAGE_WRITE_ORG)) {
164            uint64_t *p = __builtin_assume_aligned(pv, 8);
165            return *p;
166        }
167    }
168#endif
169
170    /* Ultimate fallback: re-execute in serial context. */
171    cpu_loop_exit_atomic(env_cpu(env), ra);
172}
173
174/**
175 * load_atomic16_or_exit:
176 * @env: cpu context
177 * @ra: host unwind address
178 * @pv: host address
179 *
180 * Atomically load 16 aligned bytes from @pv.
181 * If this is not possible, longjmp out to restart serially.
182 */
183static Int128 load_atomic16_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
184{
185    Int128 *p = __builtin_assume_aligned(pv, 16);
186
187    if (HAVE_ATOMIC128_RO) {
188        return atomic16_read_ro(p);
189    }
190
191    /*
192     * We can only use cmpxchg to emulate a load if the page is writable.
193     * If the page is not writable, then assume the value is immutable
194     * and requires no locking.  This ignores the case of MAP_SHARED with
195     * another process, because the fallback start_exclusive solution
196     * provides no protection across processes.
197     *
198     * In system mode all guest pages are writable.  For user mode,
199     * we must take mmap_lock so that the query remains valid until
200     * the write is complete -- tests/tcg/multiarch/munmap-pthread.c
201     * is an example that can race.
202     */
203    WITH_MMAP_LOCK_GUARD() {
204#ifdef CONFIG_USER_ONLY
205        if (!page_check_range(h2g(p), 16, PAGE_WRITE_ORG)) {
206            return *p;
207        }
208#endif
209        if (HAVE_ATOMIC128_RW) {
210            return atomic16_read_rw(p);
211        }
212    }
213
214    /* Ultimate fallback: re-execute in serial context. */
215    cpu_loop_exit_atomic(env_cpu(env), ra);
216}
217
218/**
219 * load_atom_extract_al4x2:
220 * @pv: host address
221 *
222 * Load 4 bytes from @p, from two sequential atomic 4-byte loads.
223 */
224static uint32_t load_atom_extract_al4x2(void *pv)
225{
226    uintptr_t pi = (uintptr_t)pv;
227    int sh = (pi & 3) * 8;
228    uint32_t a, b;
229
230    pv = (void *)(pi & ~3);
231    a = load_atomic4(pv);
232    b = load_atomic4(pv + 4);
233
234    if (HOST_BIG_ENDIAN) {
235        return (a << sh) | (b >> (-sh & 31));
236    } else {
237        return (a >> sh) | (b << (-sh & 31));
238    }
239}
240
241/**
242 * load_atom_extract_al8x2:
243 * @pv: host address
244 *
245 * Load 8 bytes from @p, from two sequential atomic 8-byte loads.
246 */
247static uint64_t load_atom_extract_al8x2(void *pv)
248{
249    uintptr_t pi = (uintptr_t)pv;
250    int sh = (pi & 7) * 8;
251    uint64_t a, b;
252
253    pv = (void *)(pi & ~7);
254    a = load_atomic8(pv);
255    b = load_atomic8(pv + 8);
256
257    if (HOST_BIG_ENDIAN) {
258        return (a << sh) | (b >> (-sh & 63));
259    } else {
260        return (a >> sh) | (b << (-sh & 63));
261    }
262}
263
264/**
265 * load_atom_extract_al8_or_exit:
266 * @env: cpu context
267 * @ra: host unwind address
268 * @pv: host address
269 * @s: object size in bytes, @s <= 4.
270 *
271 * Atomically load @s bytes from @p, when p % s != 0, and [p, p+s-1] does
272 * not cross an 8-byte boundary.  This means that we can perform an atomic
273 * 8-byte load and extract.
274 * The value is returned in the low bits of a uint32_t.
275 */
276static uint32_t load_atom_extract_al8_or_exit(CPUArchState *env, uintptr_t ra,
277                                              void *pv, int s)
278{
279    uintptr_t pi = (uintptr_t)pv;
280    int o = pi & 7;
281    int shr = (HOST_BIG_ENDIAN ? 8 - s - o : o) * 8;
282
283    pv = (void *)(pi & ~7);
284    return load_atomic8_or_exit(env, ra, pv) >> shr;
285}
286
287/**
288 * load_atom_extract_al16_or_exit:
289 * @env: cpu context
290 * @ra: host unwind address
291 * @p: host address
292 * @s: object size in bytes, @s <= 8.
293 *
294 * Atomically load @s bytes from @p, when p % 16 < 8
295 * and p % 16 + s > 8.  I.e. does not cross a 16-byte
296 * boundary, but *does* cross an 8-byte boundary.
297 * This is the slow version, so we must have eliminated
298 * any faster load_atom_extract_al8_or_exit case.
299 *
300 * If this is not possible, longjmp out to restart serially.
301 */
302static uint64_t load_atom_extract_al16_or_exit(CPUArchState *env, uintptr_t ra,
303                                               void *pv, int s)
304{
305    uintptr_t pi = (uintptr_t)pv;
306    int o = pi & 7;
307    int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
308    Int128 r;
309
310    /*
311     * Note constraints above: p & 8 must be clear.
312     * Provoke SIGBUS if possible otherwise.
313     */
314    pv = (void *)(pi & ~7);
315    r = load_atomic16_or_exit(env, ra, pv);
316
317    r = int128_urshift(r, shr);
318    return int128_getlo(r);
319}
320
321/**
322 * load_atom_4_by_2:
323 * @pv: host address
324 *
325 * Load 4 bytes from @pv, with two 2-byte atomic loads.
326 */
327static inline uint32_t load_atom_4_by_2(void *pv)
328{
329    uint32_t a = load_atomic2(pv);
330    uint32_t b = load_atomic2(pv + 2);
331
332    if (HOST_BIG_ENDIAN) {
333        return (a << 16) | b;
334    } else {
335        return (b << 16) | a;
336    }
337}
338
339/**
340 * load_atom_8_by_2:
341 * @pv: host address
342 *
343 * Load 8 bytes from @pv, with four 2-byte atomic loads.
344 */
345static inline uint64_t load_atom_8_by_2(void *pv)
346{
347    uint32_t a = load_atom_4_by_2(pv);
348    uint32_t b = load_atom_4_by_2(pv + 4);
349
350    if (HOST_BIG_ENDIAN) {
351        return ((uint64_t)a << 32) | b;
352    } else {
353        return ((uint64_t)b << 32) | a;
354    }
355}
356
357/**
358 * load_atom_8_by_4:
359 * @pv: host address
360 *
361 * Load 8 bytes from @pv, with two 4-byte atomic loads.
362 */
363static inline uint64_t load_atom_8_by_4(void *pv)
364{
365    uint32_t a = load_atomic4(pv);
366    uint32_t b = load_atomic4(pv + 4);
367
368    if (HOST_BIG_ENDIAN) {
369        return ((uint64_t)a << 32) | b;
370    } else {
371        return ((uint64_t)b << 32) | a;
372    }
373}
374
375/**
376 * load_atom_8_by_8_or_4:
377 * @pv: host address
378 *
379 * Load 8 bytes from aligned @pv, with at least 4-byte atomicity.
380 */
381static inline uint64_t load_atom_8_by_8_or_4(void *pv)
382{
383    if (HAVE_al8_fast) {
384        return load_atomic8(pv);
385    } else {
386        return load_atom_8_by_4(pv);
387    }
388}
389
390/**
391 * load_atom_2:
392 * @p: host address
393 * @memop: the full memory op
394 *
395 * Load 2 bytes from @p, honoring the atomicity of @memop.
396 */
397static uint16_t load_atom_2(CPUArchState *env, uintptr_t ra,
398                            void *pv, MemOp memop)
399{
400    uintptr_t pi = (uintptr_t)pv;
401    int atmax;
402
403    if (likely((pi & 1) == 0)) {
404        return load_atomic2(pv);
405    }
406    if (HAVE_ATOMIC128_RO) {
407        return load_atom_extract_al16_or_al8(pv, 2);
408    }
409
410    atmax = required_atomicity(env, pi, memop);
411    switch (atmax) {
412    case MO_8:
413        return lduw_he_p(pv);
414    case MO_16:
415        /* The only case remaining is MO_ATOM_WITHIN16. */
416        if (!HAVE_al8_fast && (pi & 3) == 1) {
417            /* Big or little endian, we want the middle two bytes. */
418            return load_atomic4(pv - 1) >> 8;
419        }
420        if ((pi & 15) != 7) {
421            return load_atom_extract_al8_or_exit(env, ra, pv, 2);
422        }
423        return load_atom_extract_al16_or_exit(env, ra, pv, 2);
424    default:
425        g_assert_not_reached();
426    }
427}
428
429/**
430 * load_atom_4:
431 * @p: host address
432 * @memop: the full memory op
433 *
434 * Load 4 bytes from @p, honoring the atomicity of @memop.
435 */
436static uint32_t load_atom_4(CPUArchState *env, uintptr_t ra,
437                            void *pv, MemOp memop)
438{
439    uintptr_t pi = (uintptr_t)pv;
440    int atmax;
441
442    if (likely((pi & 3) == 0)) {
443        return load_atomic4(pv);
444    }
445    if (HAVE_ATOMIC128_RO) {
446        return load_atom_extract_al16_or_al8(pv, 4);
447    }
448
449    atmax = required_atomicity(env, pi, memop);
450    switch (atmax) {
451    case MO_8:
452    case MO_16:
453    case -MO_16:
454        /*
455         * For MO_ATOM_IFALIGN, this is more atomicity than required,
456         * but it's trivially supported on all hosts, better than 4
457         * individual byte loads (when the host requires alignment),
458         * and overlaps with the MO_ATOM_SUBALIGN case of p % 2 == 0.
459         */
460        return load_atom_extract_al4x2(pv);
461    case MO_32:
462        if (!(pi & 4)) {
463            return load_atom_extract_al8_or_exit(env, ra, pv, 4);
464        }
465        return load_atom_extract_al16_or_exit(env, ra, pv, 4);
466    default:
467        g_assert_not_reached();
468    }
469}
470
471/**
472 * load_atom_8:
473 * @p: host address
474 * @memop: the full memory op
475 *
476 * Load 8 bytes from @p, honoring the atomicity of @memop.
477 */
478static uint64_t load_atom_8(CPUArchState *env, uintptr_t ra,
479                            void *pv, MemOp memop)
480{
481    uintptr_t pi = (uintptr_t)pv;
482    int atmax;
483
484    /*
485     * If the host does not support 8-byte atomics, wait until we have
486     * examined the atomicity parameters below.
487     */
488    if (HAVE_al8 && likely((pi & 7) == 0)) {
489        return load_atomic8(pv);
490    }
491    if (HAVE_ATOMIC128_RO) {
492        return load_atom_extract_al16_or_al8(pv, 8);
493    }
494
495    atmax = required_atomicity(env, pi, memop);
496    if (atmax == MO_64) {
497        if (!HAVE_al8 && (pi & 7) == 0) {
498            load_atomic8_or_exit(env, ra, pv);
499        }
500        return load_atom_extract_al16_or_exit(env, ra, pv, 8);
501    }
502    if (HAVE_al8_fast) {
503        return load_atom_extract_al8x2(pv);
504    }
505    switch (atmax) {
506    case MO_8:
507        return ldq_he_p(pv);
508    case MO_16:
509        return load_atom_8_by_2(pv);
510    case MO_32:
511        return load_atom_8_by_4(pv);
512    case -MO_32:
513        if (HAVE_al8) {
514            return load_atom_extract_al8x2(pv);
515        }
516        cpu_loop_exit_atomic(env_cpu(env), ra);
517    default:
518        g_assert_not_reached();
519    }
520}
521
522/**
523 * load_atom_16:
524 * @p: host address
525 * @memop: the full memory op
526 *
527 * Load 16 bytes from @p, honoring the atomicity of @memop.
528 */
529static Int128 load_atom_16(CPUArchState *env, uintptr_t ra,
530                           void *pv, MemOp memop)
531{
532    uintptr_t pi = (uintptr_t)pv;
533    int atmax;
534    Int128 r;
535    uint64_t a, b;
536
537    /*
538     * If the host does not support 16-byte atomics, wait until we have
539     * examined the atomicity parameters below.
540     */
541    if (HAVE_ATOMIC128_RO && likely((pi & 15) == 0)) {
542        return atomic16_read_ro(pv);
543    }
544
545    atmax = required_atomicity(env, pi, memop);
546    switch (atmax) {
547    case MO_8:
548        memcpy(&r, pv, 16);
549        return r;
550    case MO_16:
551        a = load_atom_8_by_2(pv);
552        b = load_atom_8_by_2(pv + 8);
553        break;
554    case MO_32:
555        a = load_atom_8_by_4(pv);
556        b = load_atom_8_by_4(pv + 8);
557        break;
558    case MO_64:
559        if (!HAVE_al8) {
560            cpu_loop_exit_atomic(env_cpu(env), ra);
561        }
562        a = load_atomic8(pv);
563        b = load_atomic8(pv + 8);
564        break;
565    case -MO_64:
566        if (!HAVE_al8) {
567            cpu_loop_exit_atomic(env_cpu(env), ra);
568        }
569        a = load_atom_extract_al8x2(pv);
570        b = load_atom_extract_al8x2(pv + 8);
571        break;
572    case MO_128:
573        return load_atomic16_or_exit(env, ra, pv);
574    default:
575        g_assert_not_reached();
576    }
577    return int128_make128(HOST_BIG_ENDIAN ? b : a, HOST_BIG_ENDIAN ? a : b);
578}
579
/**
 * store_atomic2:
 * @pv: host address, 2-byte aligned
 * @val: value to store
 *
 * Atomically store 2 aligned bytes to @pv.
 */
static inline void store_atomic2(void *pv, uint16_t val)
{
    uint16_t *dst = __builtin_assume_aligned(pv, 2);

    qatomic_set(dst, val);
}
592
/**
 * store_atomic4:
 * @pv: host address, 4-byte aligned
 * @val: value to store
 *
 * Atomically store 4 aligned bytes to @pv.
 */
static inline void store_atomic4(void *pv, uint32_t val)
{
    uint32_t *dst = __builtin_assume_aligned(pv, 4);

    qatomic_set(dst, val);
}
605
606/**
607 * store_atomic8:
608 * @pv: host address
609 * @val: value to store
610 *
611 * Atomically store 8 aligned bytes to @pv.
612 */
613static inline void store_atomic8(void *pv, uint64_t val)
614{
615    uint64_t *p = __builtin_assume_aligned(pv, 8);
616
617    qemu_build_assert(HAVE_al8);
618    qatomic_set__nocheck(p, val);
619}
620
621/**
622 * store_atom_4x2
623 */
624static inline void store_atom_4_by_2(void *pv, uint32_t val)
625{
626    store_atomic2(pv, val >> (HOST_BIG_ENDIAN ? 16 : 0));
627    store_atomic2(pv + 2, val >> (HOST_BIG_ENDIAN ? 0 : 16));
628}
629
630/**
631 * store_atom_8_by_2
632 */
633static inline void store_atom_8_by_2(void *pv, uint64_t val)
634{
635    store_atom_4_by_2(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0));
636    store_atom_4_by_2(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32));
637}
638
639/**
640 * store_atom_8_by_4
641 */
642static inline void store_atom_8_by_4(void *pv, uint64_t val)
643{
644    store_atomic4(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0));
645    store_atomic4(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32));
646}
647
648/**
649 * store_atom_insert_al4:
650 * @p: host address
651 * @val: shifted value to store
652 * @msk: mask for value to store
653 *
654 * Atomically store @val to @p, masked by @msk.
655 */
656static void store_atom_insert_al4(uint32_t *p, uint32_t val, uint32_t msk)
657{
658    uint32_t old, new;
659
660    p = __builtin_assume_aligned(p, 4);
661    old = qatomic_read(p);
662    do {
663        new = (old & ~msk) | val;
664    } while (!__atomic_compare_exchange_n(p, &old, new, true,
665                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
666}
667
668/**
669 * store_atom_insert_al8:
670 * @p: host address
671 * @val: shifted value to store
672 * @msk: mask for value to store
673 *
674 * Atomically store @val to @p masked by @msk.
675 */
676static void store_atom_insert_al8(uint64_t *p, uint64_t val, uint64_t msk)
677{
678    uint64_t old, new;
679
680    qemu_build_assert(HAVE_al8);
681    p = __builtin_assume_aligned(p, 8);
682    old = qatomic_read__nocheck(p);
683    do {
684        new = (old & ~msk) | val;
685    } while (!__atomic_compare_exchange_n(p, &old, new, true,
686                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
687}
688
/**
 * store_bytes_leN:
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * Store @size bytes at @pv, one byte at a time.  The bytes are taken
 * from @val_le in little-endian order (least significant byte first).
 *
 * Returns: the bytes of @val_le beyond @size that have not been stored.
 */
static uint64_t store_bytes_leN(void *pv, int size, uint64_t val_le)
{
    uint8_t *dst = pv;
    int remaining = size;

    while (remaining > 0) {
        *dst++ = (uint8_t)val_le;
        val_le >>= 8;
        remaining--;
    }
    return val_le;
}
706
707/**
708 * store_parts_leN
709 * @pv: host address
710 * @size: number of bytes to store
711 * @val_le: data to store
712 *
713 * As store_bytes_leN, but atomically on each aligned part.
714 */
715G_GNUC_UNUSED
716static uint64_t store_parts_leN(void *pv, int size, uint64_t val_le)
717{
718    do {
719        int n;
720
721        /* Find minimum of alignment and size */
722        switch (((uintptr_t)pv | size) & 7) {
723        case 4:
724            store_atomic4(pv, le32_to_cpu(val_le));
725            val_le >>= 32;
726            n = 4;
727            break;
728        case 2:
729        case 6:
730            store_atomic2(pv, le16_to_cpu(val_le));
731            val_le >>= 16;
732            n = 2;
733            break;
734        default:
735            *(uint8_t *)pv = val_le;
736            val_le >>= 8;
737            n = 1;
738            break;
739        case 0:
740            g_assert_not_reached();
741        }
742        pv += n;
743        size -= n;
744    } while (size != 0);
745
746    return val_le;
747}
748
749/**
750 * store_whole_le4
751 * @pv: host address
752 * @size: number of bytes to store
753 * @val_le: data to store
754 *
755 * As store_bytes_leN, but atomically as a whole.
756 * Four aligned bytes are guaranteed to cover the store.
757 */
758static uint64_t store_whole_le4(void *pv, int size, uint64_t val_le)
759{
760    int sz = size * 8;
761    int o = (uintptr_t)pv & 3;
762    int sh = o * 8;
763    uint32_t m = MAKE_64BIT_MASK(0, sz);
764    uint32_t v;
765
766    if (HOST_BIG_ENDIAN) {
767        v = bswap32(val_le) >> sh;
768        m = bswap32(m) >> sh;
769    } else {
770        v = val_le << sh;
771        m <<= sh;
772    }
773    store_atom_insert_al4(pv - o, v, m);
774    return val_le >> sz;
775}
776
777/**
778 * store_whole_le8
779 * @pv: host address
780 * @size: number of bytes to store
781 * @val_le: data to store
782 *
783 * As store_bytes_leN, but atomically as a whole.
784 * Eight aligned bytes are guaranteed to cover the store.
785 */
786static uint64_t store_whole_le8(void *pv, int size, uint64_t val_le)
787{
788    int sz = size * 8;
789    int o = (uintptr_t)pv & 7;
790    int sh = o * 8;
791    uint64_t m = MAKE_64BIT_MASK(0, sz);
792    uint64_t v;
793
794    qemu_build_assert(HAVE_al8);
795    if (HOST_BIG_ENDIAN) {
796        v = bswap64(val_le) >> sh;
797        m = bswap64(m) >> sh;
798    } else {
799        v = val_le << sh;
800        m <<= sh;
801    }
802    store_atom_insert_al8(pv - o, v, m);
803    return val_le >> sz;
804}
805
806/**
807 * store_whole_le16
808 * @pv: host address
809 * @size: number of bytes to store
810 * @val_le: data to store
811 *
812 * As store_bytes_leN, but atomically as a whole.
813 * 16 aligned bytes are guaranteed to cover the store.
814 */
815static uint64_t store_whole_le16(void *pv, int size, Int128 val_le)
816{
817    int sz = size * 8;
818    int o = (uintptr_t)pv & 15;
819    int sh = o * 8;
820    Int128 m, v;
821
822    qemu_build_assert(HAVE_ATOMIC128_RW);
823
824    /* Like MAKE_64BIT_MASK(0, sz), but larger. */
825    if (sz <= 64) {
826        m = int128_make64(MAKE_64BIT_MASK(0, sz));
827    } else {
828        m = int128_make128(-1, MAKE_64BIT_MASK(0, sz - 64));
829    }
830
831    if (HOST_BIG_ENDIAN) {
832        v = int128_urshift(bswap128(val_le), sh);
833        m = int128_urshift(bswap128(m), sh);
834    } else {
835        v = int128_lshift(val_le, sh);
836        m = int128_lshift(m, sh);
837    }
838    store_atom_insert_al16(pv - o, v, m);
839
840    if (sz <= 64) {
841        return 0;
842    }
843    return int128_gethi(val_le) >> (sz - 64);
844}
845
846/**
847 * store_atom_2:
848 * @p: host address
849 * @val: the value to store
850 * @memop: the full memory op
851 *
852 * Store 2 bytes to @p, honoring the atomicity of @memop.
853 */
854static void store_atom_2(CPUArchState *env, uintptr_t ra,
855                         void *pv, MemOp memop, uint16_t val)
856{
857    uintptr_t pi = (uintptr_t)pv;
858    int atmax;
859
860    if (likely((pi & 1) == 0)) {
861        store_atomic2(pv, val);
862        return;
863    }
864
865    atmax = required_atomicity(env, pi, memop);
866    if (atmax == MO_8) {
867        stw_he_p(pv, val);
868        return;
869    }
870
871    /*
872     * The only case remaining is MO_ATOM_WITHIN16.
873     * Big or little endian, we want the middle two bytes in each test.
874     */
875    if ((pi & 3) == 1) {
876        store_atom_insert_al4(pv - 1, (uint32_t)val << 8, MAKE_64BIT_MASK(8, 16));
877        return;
878    } else if ((pi & 7) == 3) {
879        if (HAVE_al8) {
880            store_atom_insert_al8(pv - 3, (uint64_t)val << 24, MAKE_64BIT_MASK(24, 16));
881            return;
882        }
883    } else if ((pi & 15) == 7) {
884        if (HAVE_ATOMIC128_RW) {
885            Int128 v = int128_lshift(int128_make64(val), 56);
886            Int128 m = int128_lshift(int128_make64(0xffff), 56);
887            store_atom_insert_al16(pv - 7, v, m);
888            return;
889        }
890    } else {
891        g_assert_not_reached();
892    }
893
894    cpu_loop_exit_atomic(env_cpu(env), ra);
895}
896
897/**
898 * store_atom_4:
899 * @p: host address
900 * @val: the value to store
901 * @memop: the full memory op
902 *
903 * Store 4 bytes to @p, honoring the atomicity of @memop.
904 */
905static void store_atom_4(CPUArchState *env, uintptr_t ra,
906                         void *pv, MemOp memop, uint32_t val)
907{
908    uintptr_t pi = (uintptr_t)pv;
909    int atmax;
910
911    if (likely((pi & 3) == 0)) {
912        store_atomic4(pv, val);
913        return;
914    }
915
916    atmax = required_atomicity(env, pi, memop);
917    switch (atmax) {
918    case MO_8:
919        stl_he_p(pv, val);
920        return;
921    case MO_16:
922        store_atom_4_by_2(pv, val);
923        return;
924    case -MO_16:
925        {
926            uint32_t val_le = cpu_to_le32(val);
927            int s2 = pi & 3;
928            int s1 = 4 - s2;
929
930            switch (s2) {
931            case 1:
932                val_le = store_whole_le4(pv, s1, val_le);
933                *(uint8_t *)(pv + 3) = val_le;
934                break;
935            case 3:
936                *(uint8_t *)pv = val_le;
937                store_whole_le4(pv + 1, s2, val_le >> 8);
938                break;
939            case 0: /* aligned */
940            case 2: /* atmax MO_16 */
941            default:
942                g_assert_not_reached();
943            }
944        }
945        return;
946    case MO_32:
947        if ((pi & 7) < 4) {
948            if (HAVE_al8) {
949                store_whole_le8(pv, 4, cpu_to_le32(val));
950                return;
951            }
952        } else {
953            if (HAVE_ATOMIC128_RW) {
954                store_whole_le16(pv, 4, int128_make64(cpu_to_le32(val)));
955                return;
956            }
957        }
958        cpu_loop_exit_atomic(env_cpu(env), ra);
959    default:
960        g_assert_not_reached();
961    }
962}
963
964/**
965 * store_atom_8:
966 * @p: host address
967 * @val: the value to store
968 * @memop: the full memory op
969 *
970 * Store 8 bytes to @p, honoring the atomicity of @memop.
971 */
972static void store_atom_8(CPUArchState *env, uintptr_t ra,
973                         void *pv, MemOp memop, uint64_t val)
974{
975    uintptr_t pi = (uintptr_t)pv;
976    int atmax;
977
978    if (HAVE_al8 && likely((pi & 7) == 0)) {
979        store_atomic8(pv, val);
980        return;
981    }
982
983    atmax = required_atomicity(env, pi, memop);
984    switch (atmax) {
985    case MO_8:
986        stq_he_p(pv, val);
987        return;
988    case MO_16:
989        store_atom_8_by_2(pv, val);
990        return;
991    case MO_32:
992        store_atom_8_by_4(pv, val);
993        return;
994    case -MO_32:
995        if (HAVE_al8) {
996            uint64_t val_le = cpu_to_le64(val);
997            int s2 = pi & 7;
998            int s1 = 8 - s2;
999
1000            switch (s2) {
1001            case 1 ... 3:
1002                val_le = store_whole_le8(pv, s1, val_le);
1003                store_bytes_leN(pv + s1, s2, val_le);
1004                break;
1005            case 5 ... 7:
1006                val_le = store_bytes_leN(pv, s1, val_le);
1007                store_whole_le8(pv + s1, s2, val_le);
1008                break;
1009            case 0: /* aligned */
1010            case 4: /* atmax MO_32 */
1011            default:
1012                g_assert_not_reached();
1013            }
1014            return;
1015        }
1016        break;
1017    case MO_64:
1018        if (HAVE_ATOMIC128_RW) {
1019            store_whole_le16(pv, 8, int128_make64(cpu_to_le64(val)));
1020            return;
1021        }
1022        break;
1023    default:
1024        g_assert_not_reached();
1025    }
1026    cpu_loop_exit_atomic(env_cpu(env), ra);
1027}
1028
1029/**
1030 * store_atom_16:
1031 * @p: host address
1032 * @val: the value to store
1033 * @memop: the full memory op
1034 *
1035 * Store 16 bytes to @p, honoring the atomicity of @memop.
1036 */
1037static void store_atom_16(CPUArchState *env, uintptr_t ra,
1038                          void *pv, MemOp memop, Int128 val)
1039{
1040    uintptr_t pi = (uintptr_t)pv;
1041    uint64_t a, b;
1042    int atmax;
1043
1044    if (HAVE_ATOMIC128_RW && likely((pi & 15) == 0)) {
1045        atomic16_set(pv, val);
1046        return;
1047    }
1048
1049    atmax = required_atomicity(env, pi, memop);
1050
1051    a = HOST_BIG_ENDIAN ? int128_gethi(val) : int128_getlo(val);
1052    b = HOST_BIG_ENDIAN ? int128_getlo(val) : int128_gethi(val);
1053    switch (atmax) {
1054    case MO_8:
1055        memcpy(pv, &val, 16);
1056        return;
1057    case MO_16:
1058        store_atom_8_by_2(pv, a);
1059        store_atom_8_by_2(pv + 8, b);
1060        return;
1061    case MO_32:
1062        store_atom_8_by_4(pv, a);
1063        store_atom_8_by_4(pv + 8, b);
1064        return;
1065    case MO_64:
1066        if (HAVE_al8) {
1067            store_atomic8(pv, a);
1068            store_atomic8(pv + 8, b);
1069            return;
1070        }
1071        break;
1072    case -MO_64:
1073        if (HAVE_ATOMIC128_RW) {
1074            uint64_t val_le;
1075            int s2 = pi & 15;
1076            int s1 = 16 - s2;
1077
1078            if (HOST_BIG_ENDIAN) {
1079                val = bswap128(val);
1080            }
1081            switch (s2) {
1082            case 1 ... 7:
1083                val_le = store_whole_le16(pv, s1, val);
1084                store_bytes_leN(pv + s1, s2, val_le);
1085                break;
1086            case 9 ... 15:
1087                store_bytes_leN(pv, s1, int128_getlo(val));
1088                val = int128_urshift(val, s1 * 8);
1089                store_whole_le16(pv + s1, s2, val);
1090                break;
1091            case 0: /* aligned */
1092            case 8: /* atmax MO_64 */
1093            default:
1094                g_assert_not_reached();
1095            }
1096            return;
1097        }
1098        break;
1099    case MO_128:
1100        if (HAVE_ATOMIC128_RW) {
1101            atomic16_set(pv, val);
1102            return;
1103        }
1104        break;
1105    default:
1106        g_assert_not_reached();
1107    }
1108    cpu_loop_exit_atomic(env_cpu(env), ra);
1109}
1110