/*
 * Routines common to user and system emulation of load/store.
 *
 * Copyright (c) 2022 Linaro, Ltd.
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#ifdef CONFIG_ATOMIC64
# define HAVE_al8 true
#else
# define HAVE_al8 false
#endif
#define HAVE_al8_fast (ATOMIC_REG_SIZE >= 8)

#if defined(CONFIG_ATOMIC128)
# define HAVE_al16_fast true
#else
# define HAVE_al16_fast false
#endif
#if defined(CONFIG_ATOMIC128) || defined(CONFIG_CMPXCHG128)
# define HAVE_al16 true
#else
# define HAVE_al16 false
#endif


/**
 * required_atomicity:
 *
 * Return the lg2 bytes of atomicity required by @memop for @p.
 * If the operation must be split into two operations to be
 * examined separately for atomicity, return -lg2.
 */
static int required_atomicity(CPUArchState *env, uintptr_t p, MemOp memop)
{
    MemOp atom = memop & MO_ATOM_MASK;
    MemOp size = memop & MO_SIZE;
    MemOp half = size ? size - 1 : 0;
    unsigned tmp;
    int atmax;

    switch (atom) {
    case MO_ATOM_NONE:
        atmax = MO_8;
        break;

    case MO_ATOM_IFALIGN_PAIR:
        size = half;
        /* fall through */

    case MO_ATOM_IFALIGN:
        tmp = (1 << size) - 1;
        atmax = p & tmp ? MO_8 : size;
        break;

    case MO_ATOM_WITHIN16:
        tmp = p & 15;
        atmax = (tmp + (1 << size) <= 16 ? size : MO_8);
        break;

    case MO_ATOM_WITHIN16_PAIR:
        tmp = p & 15;
        if (tmp + (1 << size) <= 16) {
            atmax = size;
        } else if (tmp + (1 << half) == 16) {
            /*
             * The pair exactly straddles the boundary.
             * Both halves are naturally aligned and atomic.
             */
            atmax = half;
        } else {
            /*
             * One of the pair crosses the boundary, and is non-atomic.
             * The other of the pair does not cross, and is atomic.
             */
            atmax = -half;
        }
        break;

    case MO_ATOM_SUBALIGN:
        /*
         * Examine the alignment of p to determine if there are subobjects
         * that must be aligned.  Note that we only really need ctz4() --
         * any more significant bits are discarded by the immediately
         * following comparison.
         */
        tmp = ctz32(p);
        atmax = MIN(size, tmp);
        break;

    default:
        g_assert_not_reached();
    }

    /*
     * Here we have the architectural atomicity of the operation.
     * However, when executing in a serial context, we need no extra
     * host atomicity in order to avoid racing.  This reduction
     * avoids looping with cpu_loop_exit_atomic.
     */
    if (cpu_in_serial_context(env_cpu(env))) {
        return MO_8;
    }
    return atmax;
}
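
/*
 * For example, consider a 16-byte access with MO_ATOM_WITHIN16_PAIR
 * (size = MO_128, half = MO_64):
 *   - p % 16 == 0: the whole quantity fits within 16 bytes,
 *     so atmax = MO_128.
 *   - p % 16 == 8: the pair exactly straddles the boundary,
 *     so atmax = MO_64 and each 8-byte half must be atomic.
 *   - p % 16 == 4: the second half crosses the boundary,
 *     so atmax = -MO_64: the first half must be atomic,
 *     the second need not be.
 */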

/**
 * load_atomic2:
 * @pv: host address
 *
 * Atomically load 2 aligned bytes from @pv.
 */
static inline uint16_t load_atomic2(void *pv)
{
    uint16_t *p = __builtin_assume_aligned(pv, 2);
    return qatomic_read(p);
}

/**
 * load_atomic4:
 * @pv: host address
 *
 * Atomically load 4 aligned bytes from @pv.
 */
static inline uint32_t load_atomic4(void *pv)
{
    uint32_t *p = __builtin_assume_aligned(pv, 4);
    return qatomic_read(p);
}

/**
 * load_atomic8:
 * @pv: host address
 *
 * Atomically load 8 aligned bytes from @pv.
 */
static inline uint64_t load_atomic8(void *pv)
{
    uint64_t *p = __builtin_assume_aligned(pv, 8);

    qemu_build_assert(HAVE_al8);
    return qatomic_read__nocheck(p);
}

/**
 * load_atomic16:
 * @pv: host address
 *
 * Atomically load 16 aligned bytes from @pv.
 */
static inline Int128 ATTRIBUTE_ATOMIC128_OPT
load_atomic16(void *pv)
{
#ifdef CONFIG_ATOMIC128
    __uint128_t *p = __builtin_assume_aligned(pv, 16);
    Int128Alias r;

    r.u = qatomic_read__nocheck(p);
    return r.s;
#else
    qemu_build_not_reached();
#endif
}

/**
 * load_atomic8_or_exit:
 * @env: cpu context
 * @ra: host unwind address
 * @pv: host address
 *
 * Atomically load 8 aligned bytes from @pv.
 * If this is not possible, longjmp out to restart serially.
 */
static uint64_t load_atomic8_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
{
    if (HAVE_al8) {
        return load_atomic8(pv);
    }

#ifdef CONFIG_USER_ONLY
    /*
     * If the page is not writable, then assume the value is immutable
     * and requires no locking.  This ignores the case of MAP_SHARED with
     * another process, because the fallback start_exclusive solution
     * provides no protection across processes.
     */
    if (!page_check_range(h2g(pv), 8, PAGE_WRITE)) {
        uint64_t *p = __builtin_assume_aligned(pv, 8);
        return *p;
    }
#endif

    /* Ultimate fallback: re-execute in serial context. */
    cpu_loop_exit_atomic(env_cpu(env), ra);
}

/**
 * load_atomic16_or_exit:
 * @env: cpu context
 * @ra: host unwind address
 * @pv: host address
 *
 * Atomically load 16 aligned bytes from @pv.
 * If this is not possible, longjmp out to restart serially.
 */
static Int128 load_atomic16_or_exit(CPUArchState *env, uintptr_t ra, void *pv)
{
    Int128 *p = __builtin_assume_aligned(pv, 16);

    if (HAVE_al16_fast) {
        return load_atomic16(p);
    }

#ifdef CONFIG_USER_ONLY
    /*
     * We can only use cmpxchg to emulate a load if the page is writable.
     * If the page is not writable, then assume the value is immutable
     * and requires no locking.  This ignores the case of MAP_SHARED with
     * another process, because the fallback start_exclusive solution
     * provides no protection across processes.
     */
    if (!page_check_range(h2g(p), 16, PAGE_WRITE)) {
        return *p;
    }
#endif

    /*
     * In system mode all guest pages are writable, and for user-only
     * we have just checked writability.  Try cmpxchg.
     */
#if defined(CONFIG_CMPXCHG128)
    /* Swap 0 with 0, with the side-effect of returning the old value. */
    {
        Int128Alias r;
        r.u = __sync_val_compare_and_swap_16((__uint128_t *)p, 0, 0);
        return r.s;
    }
#endif

    /* Ultimate fallback: re-execute in serial context. */
    cpu_loop_exit_atomic(env_cpu(env), ra);
}
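
/*
 * The extract helpers below load a larger aligned quantity (or two
 * adjacent ones) and shift out the bytes requested.  E.g. on a
 * little-endian host, a 4-byte load at an address with pi % 4 == 1
 * uses sh = 8: the word at pi - 1 contributes its upper three bytes
 * via (a >> 8) and the word at pi + 3 contributes the final byte
 * via (b << 24).
 */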

/**
 * load_atom_extract_al4x2:
 * @pv: host address
 *
 * Load 4 bytes from @pv, from two sequential atomic 4-byte loads.
 */
static uint32_t load_atom_extract_al4x2(void *pv)
{
    uintptr_t pi = (uintptr_t)pv;
    int sh = (pi & 3) * 8;
    uint32_t a, b;

    pv = (void *)(pi & ~3);
    a = load_atomic4(pv);
    b = load_atomic4(pv + 4);

    if (HOST_BIG_ENDIAN) {
        return (a << sh) | (b >> (-sh & 31));
    } else {
        return (a >> sh) | (b << (-sh & 31));
    }
}

/**
 * load_atom_extract_al8x2:
 * @pv: host address
 *
 * Load 8 bytes from @pv, from two sequential atomic 8-byte loads.
 */
static uint64_t load_atom_extract_al8x2(void *pv)
{
    uintptr_t pi = (uintptr_t)pv;
    int sh = (pi & 7) * 8;
    uint64_t a, b;

    pv = (void *)(pi & ~7);
    a = load_atomic8(pv);
    b = load_atomic8(pv + 8);

    if (HOST_BIG_ENDIAN) {
        return (a << sh) | (b >> (-sh & 63));
    } else {
        return (a >> sh) | (b << (-sh & 63));
    }
}

/**
 * load_atom_extract_al8_or_exit:
 * @env: cpu context
 * @ra: host unwind address
 * @pv: host address
 * @s: object size in bytes, @s <= 4.
 *
 * Atomically load @s bytes from @pv, when pv % s != 0, and [pv, pv+s-1] does
 * not cross an 8-byte boundary.  This means that we can perform an atomic
 * 8-byte load and extract.
 * The value is returned in the low bits of a uint32_t.
 */
static uint32_t load_atom_extract_al8_or_exit(CPUArchState *env, uintptr_t ra,
                                              void *pv, int s)
{
    uintptr_t pi = (uintptr_t)pv;
    int o = pi & 7;
    int shr = (HOST_BIG_ENDIAN ? 8 - s - o : o) * 8;

    pv = (void *)(pi & ~7);
    return load_atomic8_or_exit(env, ra, pv) >> shr;
}

/**
 * load_atom_extract_al16_or_exit:
 * @env: cpu context
 * @ra: host unwind address
 * @pv: host address
 * @s: object size in bytes, @s <= 8.
 *
 * Atomically load @s bytes from @pv, when pv % 16 < 8
 * and pv % 16 + s > 8.  I.e. does not cross a 16-byte
 * boundary, but *does* cross an 8-byte boundary.
 * This is the slow version, so we must have eliminated
 * any faster load_atom_extract_al8_or_exit case.
 *
 * If this is not possible, longjmp out to restart serially.
 */
static uint64_t load_atom_extract_al16_or_exit(CPUArchState *env, uintptr_t ra,
                                               void *pv, int s)
{
    uintptr_t pi = (uintptr_t)pv;
    int o = pi & 7;
    int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
    Int128 r;

    /*
     * Note constraints above: pv & 8 must be clear.
     * Provoke SIGBUS if possible otherwise.
     */
    pv = (void *)(pi & ~7);
    r = load_atomic16_or_exit(env, ra, pv);

    r = int128_urshift(r, shr);
    return int128_getlo(r);
}
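
/*
 * E.g. loading 4 bytes at pi % 16 == 5 spans offsets 5..8, crossing
 * the 8-byte boundary but not the 16-byte one.  The aligned 16 bytes
 * are loaded and shifted right by o * 8 = 40 bits on a little-endian
 * host (or (16 - s - o) * 8 = 56 bits on a big-endian host), so that
 * the requested bytes land in the low bits of the result.
 */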

/**
 * load_atom_extract_al16_or_al8:
 * @pv: host address
 * @s: object size in bytes, @s <= 8.
 *
 * Load @s bytes from @pv, when pv % s != 0.  If [pv, pv+s-1] does not
 * cross a 16-byte boundary then the access must be 16-byte atomic,
 * otherwise the access must be 8-byte atomic.
 */
static inline uint64_t ATTRIBUTE_ATOMIC128_OPT
load_atom_extract_al16_or_al8(void *pv, int s)
{
#if defined(CONFIG_ATOMIC128)
    uintptr_t pi = (uintptr_t)pv;
    int o = pi & 7;
    int shr = (HOST_BIG_ENDIAN ? 16 - s - o : o) * 8;
    __uint128_t r;

    pv = (void *)(pi & ~7);
    if (pi & 8) {
        uint64_t *p8 = __builtin_assume_aligned(pv, 16, 8);
        uint64_t a = qatomic_read__nocheck(p8);
        uint64_t b = qatomic_read__nocheck(p8 + 1);

        if (HOST_BIG_ENDIAN) {
            r = ((__uint128_t)a << 64) | b;
        } else {
            r = ((__uint128_t)b << 64) | a;
        }
    } else {
        __uint128_t *p16 = __builtin_assume_aligned(pv, 16, 0);
        r = qatomic_read__nocheck(p16);
    }
    return r >> shr;
#else
    qemu_build_not_reached();
#endif
}

/**
 * load_atom_4_by_2:
 * @pv: host address
 *
 * Load 4 bytes from @pv, with two 2-byte atomic loads.
 */
static inline uint32_t load_atom_4_by_2(void *pv)
{
    uint32_t a = load_atomic2(pv);
    uint32_t b = load_atomic2(pv + 2);

    if (HOST_BIG_ENDIAN) {
        return (a << 16) | b;
    } else {
        return (b << 16) | a;
    }
}

/**
 * load_atom_8_by_2:
 * @pv: host address
 *
 * Load 8 bytes from @pv, with four 2-byte atomic loads.
 */
static inline uint64_t load_atom_8_by_2(void *pv)
{
    uint32_t a = load_atom_4_by_2(pv);
    uint32_t b = load_atom_4_by_2(pv + 4);

    if (HOST_BIG_ENDIAN) {
        return ((uint64_t)a << 32) | b;
    } else {
        return ((uint64_t)b << 32) | a;
    }
}

/**
 * load_atom_8_by_4:
 * @pv: host address
 *
 * Load 8 bytes from @pv, with two 4-byte atomic loads.
 */
static inline uint64_t load_atom_8_by_4(void *pv)
{
    uint32_t a = load_atomic4(pv);
    uint32_t b = load_atomic4(pv + 4);

    if (HOST_BIG_ENDIAN) {
        return ((uint64_t)a << 32) | b;
    } else {
        return ((uint64_t)b << 32) | a;
    }
}

/**
 * load_atom_8_by_8_or_4:
 * @pv: host address
 *
 * Load 8 bytes from aligned @pv, with at least 4-byte atomicity.
 */
static inline uint64_t load_atom_8_by_8_or_4(void *pv)
{
    if (HAVE_al8_fast) {
        return load_atomic8(pv);
    } else {
        return load_atom_8_by_4(pv);
    }
}

/**
 * load_atom_2:
 * @pv: host address
 * @memop: the full memory op
 *
 * Load 2 bytes from @pv, honoring the atomicity of @memop.
 */
static uint16_t load_atom_2(CPUArchState *env, uintptr_t ra,
                            void *pv, MemOp memop)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 1) == 0)) {
        return load_atomic2(pv);
    }
    if (HAVE_al16_fast) {
        return load_atom_extract_al16_or_al8(pv, 2);
    }

    atmax = required_atomicity(env, pi, memop);
    switch (atmax) {
    case MO_8:
        return lduw_he_p(pv);
    case MO_16:
        /* The only case remaining is MO_ATOM_WITHIN16. */
        if (!HAVE_al8_fast && (pi & 3) == 1) {
            /* Big or little endian, we want the middle two bytes. */
            return load_atomic4(pv - 1) >> 8;
        }
        if ((pi & 15) != 7) {
            return load_atom_extract_al8_or_exit(env, ra, pv, 2);
        }
        return load_atom_extract_al16_or_exit(env, ra, pv, 2);
    default:
        g_assert_not_reached();
    }
}
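
/*
 * In load_atom_2 above, the only odd offset at which an atomic 2-byte
 * load cannot be extracted from one aligned 8-byte load is
 * pi % 16 == 7, where the two bytes straddle the 8-byte boundary while
 * still being within 16 bytes; that case falls back to the 16-byte
 * extraction.
 */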

/**
 * load_atom_4:
 * @pv: host address
 * @memop: the full memory op
 *
 * Load 4 bytes from @pv, honoring the atomicity of @memop.
 */
static uint32_t load_atom_4(CPUArchState *env, uintptr_t ra,
                            void *pv, MemOp memop)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 3) == 0)) {
        return load_atomic4(pv);
    }
    if (HAVE_al16_fast) {
        return load_atom_extract_al16_or_al8(pv, 4);
    }

    atmax = required_atomicity(env, pi, memop);
    switch (atmax) {
    case MO_8:
    case MO_16:
    case -MO_16:
        /*
         * For MO_ATOM_IFALIGN, this is more atomicity than required,
         * but it's trivially supported on all hosts, better than 4
         * individual byte loads (when the host requires alignment),
         * and overlaps with the MO_ATOM_SUBALIGN case of p % 2 == 0.
         */
        return load_atom_extract_al4x2(pv);
    case MO_32:
        if (!(pi & 4)) {
            return load_atom_extract_al8_or_exit(env, ra, pv, 4);
        }
        return load_atom_extract_al16_or_exit(env, ra, pv, 4);
    default:
        g_assert_not_reached();
    }
}

/**
 * load_atom_8:
 * @pv: host address
 * @memop: the full memory op
 *
 * Load 8 bytes from @pv, honoring the atomicity of @memop.
 */
static uint64_t load_atom_8(CPUArchState *env, uintptr_t ra,
                            void *pv, MemOp memop)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    /*
     * If the host does not support 8-byte atomics, wait until we have
     * examined the atomicity parameters below.
     */
    if (HAVE_al8 && likely((pi & 7) == 0)) {
        return load_atomic8(pv);
    }
    if (HAVE_al16_fast) {
        return load_atom_extract_al16_or_al8(pv, 8);
    }

    atmax = required_atomicity(env, pi, memop);
    if (atmax == MO_64) {
        if (!HAVE_al8 && (pi & 7) == 0) {
            load_atomic8_or_exit(env, ra, pv);
        }
        return load_atom_extract_al16_or_exit(env, ra, pv, 8);
    }
    if (HAVE_al8_fast) {
        return load_atom_extract_al8x2(pv);
    }
    switch (atmax) {
    case MO_8:
        return ldq_he_p(pv);
    case MO_16:
        return load_atom_8_by_2(pv);
    case MO_32:
        return load_atom_8_by_4(pv);
    case -MO_32:
        if (HAVE_al8) {
            return load_atom_extract_al8x2(pv);
        }
        cpu_loop_exit_atomic(env_cpu(env), ra);
    default:
        g_assert_not_reached();
    }
}

/**
 * load_atom_16:
 * @pv: host address
 * @memop: the full memory op
 *
 * Load 16 bytes from @pv, honoring the atomicity of @memop.
 */
static Int128 load_atom_16(CPUArchState *env, uintptr_t ra,
                           void *pv, MemOp memop)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;
    Int128 r;
    uint64_t a, b;

    /*
     * If the host does not support 16-byte atomics, wait until we have
     * examined the atomicity parameters below.
     */
    if (HAVE_al16_fast && likely((pi & 15) == 0)) {
        return load_atomic16(pv);
    }

    atmax = required_atomicity(env, pi, memop);
    switch (atmax) {
    case MO_8:
        memcpy(&r, pv, 16);
        return r;
    case MO_16:
        a = load_atom_8_by_2(pv);
        b = load_atom_8_by_2(pv + 8);
        break;
    case MO_32:
        a = load_atom_8_by_4(pv);
        b = load_atom_8_by_4(pv + 8);
        break;
    case MO_64:
        if (!HAVE_al8) {
            cpu_loop_exit_atomic(env_cpu(env), ra);
        }
        a = load_atomic8(pv);
        b = load_atomic8(pv + 8);
        break;
    case -MO_64:
        if (!HAVE_al8) {
            cpu_loop_exit_atomic(env_cpu(env), ra);
        }
        a = load_atom_extract_al8x2(pv);
        b = load_atom_extract_al8x2(pv + 8);
        break;
    case MO_128:
        return load_atomic16_or_exit(env, ra, pv);
    default:
        g_assert_not_reached();
    }
    return int128_make128(HOST_BIG_ENDIAN ? b : a, HOST_BIG_ENDIAN ? a : b);
}
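
/*
 * A negative atmax (e.g. -MO_64 in load_atom_16 above) identifies a
 * pair whose halves are examined separately: the half that stays
 * within the 16-byte boundary must be atomic, while the half that
 * crosses it carries no atomicity requirement.
 */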

/**
 * store_atomic2:
 * @pv: host address
 * @val: value to store
 *
 * Atomically store 2 aligned bytes to @pv.
 */
static inline void store_atomic2(void *pv, uint16_t val)
{
    uint16_t *p = __builtin_assume_aligned(pv, 2);
    qatomic_set(p, val);
}

/**
 * store_atomic4:
 * @pv: host address
 * @val: value to store
 *
 * Atomically store 4 aligned bytes to @pv.
 */
static inline void store_atomic4(void *pv, uint32_t val)
{
    uint32_t *p = __builtin_assume_aligned(pv, 4);
    qatomic_set(p, val);
}

/**
 * store_atomic8:
 * @pv: host address
 * @val: value to store
 *
 * Atomically store 8 aligned bytes to @pv.
 */
static inline void store_atomic8(void *pv, uint64_t val)
{
    uint64_t *p = __builtin_assume_aligned(pv, 8);

    qemu_build_assert(HAVE_al8);
    qatomic_set__nocheck(p, val);
}

/**
 * store_atomic16:
 * @pv: host address
 * @val: value to store
 *
 * Atomically store 16 aligned bytes to @pv.
 */
static inline void ATTRIBUTE_ATOMIC128_OPT
store_atomic16(void *pv, Int128Alias val)
{
#if defined(CONFIG_ATOMIC128)
    __uint128_t *pu = __builtin_assume_aligned(pv, 16);
    qatomic_set__nocheck(pu, val.u);
#elif defined(CONFIG_CMPXCHG128)
    __uint128_t *pu = __builtin_assume_aligned(pv, 16);
    __uint128_t o;

    /*
     * Without CONFIG_ATOMIC128, __atomic_compare_exchange_n will always
     * defer to libatomic, so we must use __sync_*_compare_and_swap_16
     * and accept the sequential consistency that comes with it.
     */
    do {
        o = *pu;
    } while (!__sync_bool_compare_and_swap_16(pu, o, val.u));
#else
    qemu_build_not_reached();
#endif
}

/**
 * store_atom_4_by_2:
 * @pv: host address
 * @val: value to store
 *
 * Store 4 bytes to @pv, with two 2-byte atomic stores.
 */
static inline void store_atom_4_by_2(void *pv, uint32_t val)
{
    store_atomic2(pv, val >> (HOST_BIG_ENDIAN ? 16 : 0));
    store_atomic2(pv + 2, val >> (HOST_BIG_ENDIAN ? 0 : 16));
}

/**
 * store_atom_8_by_2:
 * @pv: host address
 * @val: value to store
 *
 * Store 8 bytes to @pv, with four 2-byte atomic stores.
 */
static inline void store_atom_8_by_2(void *pv, uint64_t val)
{
    store_atom_4_by_2(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0));
    store_atom_4_by_2(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32));
}

/**
 * store_atom_8_by_4:
 * @pv: host address
 * @val: value to store
 *
 * Store 8 bytes to @pv, with two 4-byte atomic stores.
 */
static inline void store_atom_8_by_4(void *pv, uint64_t val)
{
    store_atomic4(pv, val >> (HOST_BIG_ENDIAN ? 32 : 0));
    store_atomic4(pv + 4, val >> (HOST_BIG_ENDIAN ? 0 : 32));
}

/**
 * store_atom_insert_al4:
 * @p: host address
 * @val: shifted value to store
 * @msk: mask for value to store
 *
 * Atomically store @val to @p, masked by @msk.
 */
static void store_atom_insert_al4(uint32_t *p, uint32_t val, uint32_t msk)
{
    uint32_t old, new;

    p = __builtin_assume_aligned(p, 4);
    old = qatomic_read(p);
    do {
        new = (old & ~msk) | val;
    } while (!__atomic_compare_exchange_n(p, &old, new, true,
                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
}
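
/*
 * E.g. store_atom_2 below uses store_atom_insert_al4 to store a
 * 2-byte value at pi % 4 == 1: the value is shifted into bits [23:8]
 * of the containing aligned word and the mask MAKE_64BIT_MASK(8, 16)
 * selects just those bits, leaving the neighbouring bytes untouched.
 */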

/**
 * store_atom_insert_al8:
 * @p: host address
 * @val: shifted value to store
 * @msk: mask for value to store
 *
 * Atomically store @val to @p masked by @msk.
 */
static void store_atom_insert_al8(uint64_t *p, uint64_t val, uint64_t msk)
{
    uint64_t old, new;

    qemu_build_assert(HAVE_al8);
    p = __builtin_assume_aligned(p, 8);
    old = qatomic_read__nocheck(p);
    do {
        new = (old & ~msk) | val;
    } while (!__atomic_compare_exchange_n(p, &old, new, true,
                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
}

/**
 * store_atom_insert_al16:
 * @ps: host address
 * @val: shifted value to store
 * @msk: mask for value to store
 *
 * Atomically store @val to @ps masked by @msk.
 */
static void ATTRIBUTE_ATOMIC128_OPT
store_atom_insert_al16(Int128 *ps, Int128Alias val, Int128Alias msk)
{
#if defined(CONFIG_ATOMIC128)
    __uint128_t *pu, old, new;

    /* With CONFIG_ATOMIC128, we can avoid the memory barriers. */
    pu = __builtin_assume_aligned(ps, 16);
    old = *pu;
    do {
        new = (old & ~msk.u) | val.u;
    } while (!__atomic_compare_exchange_n(pu, &old, new, true,
                                          __ATOMIC_RELAXED, __ATOMIC_RELAXED));
#elif defined(CONFIG_CMPXCHG128)
    __uint128_t *pu, old, new;

    /*
     * Without CONFIG_ATOMIC128, __atomic_compare_exchange_n will always
     * defer to libatomic, so we must use __sync_*_compare_and_swap_16
     * and accept the sequential consistency that comes with it.
     */
    pu = __builtin_assume_aligned(ps, 16);
    do {
        old = *pu;
        new = (old & ~msk.u) | val.u;
    } while (!__sync_bool_compare_and_swap_16(pu, old, new));
#else
    qemu_build_not_reached();
#endif
}

/**
 * store_bytes_leN:
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * Store @size bytes at @pv.  The bytes to store are extracted in
 * little-endian order from @val_le; return the bytes of @val_le
 * beyond @size that have not been stored.
 */
static uint64_t store_bytes_leN(void *pv, int size, uint64_t val_le)
{
    uint8_t *p = pv;

    for (int i = 0; i < size; i++, val_le >>= 8) {
        p[i] = val_le;
    }
    return val_le;
}

/**
 * store_parts_leN:
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically on each aligned part.
 */
G_GNUC_UNUSED
static uint64_t store_parts_leN(void *pv, int size, uint64_t val_le)
{
    do {
        int n;

        /* Find minimum of alignment and size */
        switch (((uintptr_t)pv | size) & 7) {
        case 4:
            store_atomic4(pv, le32_to_cpu(val_le));
            val_le >>= 32;
            n = 4;
            break;
        case 2:
        case 6:
            store_atomic2(pv, le16_to_cpu(val_le));
            val_le >>= 16;
            n = 2;
            break;
        default:
            *(uint8_t *)pv = val_le;
            val_le >>= 8;
            n = 1;
            break;
        case 0:
            g_assert_not_reached();
        }
        pv += n;
        size -= n;
    } while (size != 0);

    return val_le;
}
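
/*
 * E.g. for store_parts_leN with pv % 8 == 1 and size == 5, the loop
 * issues a 1-byte store (advancing to an even address), then two
 * 2-byte atomic stores: each step uses the largest power of two
 * permitted by both the current alignment and the bytes remaining.
 */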

/**
 * store_whole_le4:
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically as a whole.
 * Four aligned bytes are guaranteed to cover the store.
 */
static uint64_t store_whole_le4(void *pv, int size, uint64_t val_le)
{
    int sz = size * 8;
    int o = (uintptr_t)pv & 3;
    int sh = o * 8;
    uint32_t m = MAKE_64BIT_MASK(0, sz);
    uint32_t v;

    if (HOST_BIG_ENDIAN) {
        v = bswap32(val_le) >> sh;
        m = bswap32(m) >> sh;
    } else {
        v = val_le << sh;
        m <<= sh;
    }
    store_atom_insert_al4(pv - o, v, m);
    return val_le >> sz;
}

/**
 * store_whole_le8:
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically as a whole.
 * Eight aligned bytes are guaranteed to cover the store.
 */
static uint64_t store_whole_le8(void *pv, int size, uint64_t val_le)
{
    int sz = size * 8;
    int o = (uintptr_t)pv & 7;
    int sh = o * 8;
    uint64_t m = MAKE_64BIT_MASK(0, sz);
    uint64_t v;

    qemu_build_assert(HAVE_al8);
    if (HOST_BIG_ENDIAN) {
        v = bswap64(val_le) >> sh;
        m = bswap64(m) >> sh;
    } else {
        v = val_le << sh;
        m <<= sh;
    }
    store_atom_insert_al8(pv - o, v, m);
    return val_le >> sz;
}

/**
 * store_whole_le16:
 * @pv: host address
 * @size: number of bytes to store
 * @val_le: data to store
 *
 * As store_bytes_leN, but atomically as a whole.
 * 16 aligned bytes are guaranteed to cover the store.
 */
static uint64_t store_whole_le16(void *pv, int size, Int128 val_le)
{
    int sz = size * 8;
    int o = (uintptr_t)pv & 15;
    int sh = o * 8;
    Int128 m, v;

    qemu_build_assert(HAVE_al16);

    /* Like MAKE_64BIT_MASK(0, sz), but larger. */
    if (sz <= 64) {
        m = int128_make64(MAKE_64BIT_MASK(0, sz));
    } else {
        m = int128_make128(-1, MAKE_64BIT_MASK(0, sz - 64));
    }

    if (HOST_BIG_ENDIAN) {
        v = int128_urshift(bswap128(val_le), sh);
        m = int128_urshift(bswap128(m), sh);
    } else {
        v = int128_lshift(val_le, sh);
        m = int128_lshift(m, sh);
    }
    store_atom_insert_al16(pv - o, v, m);

    /* Unused if sz <= 64. */
    return int128_gethi(val_le) >> (sz - 64);
}

/**
 * store_atom_2:
 * @pv: host address
 * @val: the value to store
 * @memop: the full memory op
 *
 * Store 2 bytes to @pv, honoring the atomicity of @memop.
 */
static void store_atom_2(CPUArchState *env, uintptr_t ra,
                         void *pv, MemOp memop, uint16_t val)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 1) == 0)) {
        store_atomic2(pv, val);
        return;
    }

    atmax = required_atomicity(env, pi, memop);
    if (atmax == MO_8) {
        stw_he_p(pv, val);
        return;
    }

    /*
     * The only case remaining is MO_ATOM_WITHIN16.
     * Big or little endian, we want the middle two bytes in each test.
     */
    if ((pi & 3) == 1) {
        store_atom_insert_al4(pv - 1, (uint32_t)val << 8, MAKE_64BIT_MASK(8, 16));
        return;
    } else if ((pi & 7) == 3) {
        if (HAVE_al8) {
            store_atom_insert_al8(pv - 3, (uint64_t)val << 24,
                                  MAKE_64BIT_MASK(24, 16));
            return;
        }
    } else if ((pi & 15) == 7) {
        if (HAVE_al16) {
            Int128 v = int128_lshift(int128_make64(val), 56);
            Int128 m = int128_lshift(int128_make64(0xffff), 56);
            store_atom_insert_al16(pv - 7, v, m);
            return;
        }
    } else {
        g_assert_not_reached();
    }

    cpu_loop_exit_atomic(env_cpu(env), ra);
}
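
/*
 * Worked example for the -MO_16 case of store_atom_4 below: with
 * pi % 16 == 13 the first 2-byte half (offsets 13..14) must be atomic
 * while the second crosses the 16-byte boundary and need not be, so
 * store_whole_le4 covers bytes 13..15 with one masked atomic update
 * and the final byte is stored on its own.
 */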

/**
 * store_atom_4:
 * @pv: host address
 * @val: the value to store
 * @memop: the full memory op
 *
 * Store 4 bytes to @pv, honoring the atomicity of @memop.
 */
static void store_atom_4(CPUArchState *env, uintptr_t ra,
                         void *pv, MemOp memop, uint32_t val)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (likely((pi & 3) == 0)) {
        store_atomic4(pv, val);
        return;
    }

    atmax = required_atomicity(env, pi, memop);
    switch (atmax) {
    case MO_8:
        stl_he_p(pv, val);
        return;
    case MO_16:
        store_atom_4_by_2(pv, val);
        return;
    case -MO_16:
        {
            uint32_t val_le = cpu_to_le32(val);
            int s2 = pi & 3;
            int s1 = 4 - s2;

            switch (s2) {
            case 1:
                val_le = store_whole_le4(pv, s1, val_le);
                *(uint8_t *)(pv + 3) = val_le;
                break;
            case 3:
                *(uint8_t *)pv = val_le;
                store_whole_le4(pv + 1, s2, val_le >> 8);
                break;
            case 0: /* aligned */
            case 2: /* atmax MO_16 */
            default:
                g_assert_not_reached();
            }
        }
        return;
    case MO_32:
        if ((pi & 7) < 4) {
            if (HAVE_al8) {
                store_whole_le8(pv, 4, cpu_to_le32(val));
                return;
            }
        } else {
            if (HAVE_al16) {
                store_whole_le16(pv, 4, int128_make64(cpu_to_le32(val)));
                return;
            }
        }
        cpu_loop_exit_atomic(env_cpu(env), ra);
    default:
        g_assert_not_reached();
    }
}

/**
 * store_atom_8:
 * @pv: host address
 * @val: the value to store
 * @memop: the full memory op
 *
 * Store 8 bytes to @pv, honoring the atomicity of @memop.
 */
static void store_atom_8(CPUArchState *env, uintptr_t ra,
                         void *pv, MemOp memop, uint64_t val)
{
    uintptr_t pi = (uintptr_t)pv;
    int atmax;

    if (HAVE_al8 && likely((pi & 7) == 0)) {
        store_atomic8(pv, val);
        return;
    }

    atmax = required_atomicity(env, pi, memop);
    switch (atmax) {
    case MO_8:
        stq_he_p(pv, val);
        return;
    case MO_16:
        store_atom_8_by_2(pv, val);
        return;
    case MO_32:
        store_atom_8_by_4(pv, val);
        return;
    case -MO_32:
        if (HAVE_al8) {
            uint64_t val_le = cpu_to_le64(val);
            int s2 = pi & 7;
            int s1 = 8 - s2;

            switch (s2) {
            case 1 ... 3:
                val_le = store_whole_le8(pv, s1, val_le);
                store_bytes_leN(pv + s1, s2, val_le);
                break;
            case 5 ... 7:
                val_le = store_bytes_leN(pv, s1, val_le);
                store_whole_le8(pv + s1, s2, val_le);
                break;
            case 0: /* aligned */
            case 4: /* atmax MO_32 */
            default:
                g_assert_not_reached();
            }
            return;
        }
        break;
    case MO_64:
        if (HAVE_al16) {
            store_whole_le16(pv, 8, int128_make64(cpu_to_le64(val)));
            return;
        }
        break;
    default:
        g_assert_not_reached();
    }
    cpu_loop_exit_atomic(env_cpu(env), ra);
}
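
/*
 * Worked example for the -MO_64 case of store_atom_16 below: with
 * pi % 16 == 5, the leading 11 bytes (offsets 5..15) are stored with
 * one masked atomic 16-byte update, which covers the 8-byte half that
 * must be atomic (offsets 5..12); the remaining 5 bytes spill into the
 * next 16-byte block and may be stored byte by byte.
 */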

/**
 * store_atom_16:
 * @pv: host address
 * @val: the value to store
 * @memop: the full memory op
 *
 * Store 16 bytes to @pv, honoring the atomicity of @memop.
 */
static void store_atom_16(CPUArchState *env, uintptr_t ra,
                          void *pv, MemOp memop, Int128 val)
{
    uintptr_t pi = (uintptr_t)pv;
    uint64_t a, b;
    int atmax;

    if (HAVE_al16_fast && likely((pi & 15) == 0)) {
        store_atomic16(pv, val);
        return;
    }

    atmax = required_atomicity(env, pi, memop);

    a = HOST_BIG_ENDIAN ? int128_gethi(val) : int128_getlo(val);
    b = HOST_BIG_ENDIAN ? int128_getlo(val) : int128_gethi(val);
    switch (atmax) {
    case MO_8:
        memcpy(pv, &val, 16);
        return;
    case MO_16:
        store_atom_8_by_2(pv, a);
        store_atom_8_by_2(pv + 8, b);
        return;
    case MO_32:
        store_atom_8_by_4(pv, a);
        store_atom_8_by_4(pv + 8, b);
        return;
    case MO_64:
        if (HAVE_al8) {
            store_atomic8(pv, a);
            store_atomic8(pv + 8, b);
            return;
        }
        break;
    case -MO_64:
        if (HAVE_al16) {
            uint64_t val_le;
            int s2 = pi & 15;
            int s1 = 16 - s2;

            if (HOST_BIG_ENDIAN) {
                val = bswap128(val);
            }
            switch (s2) {
            case 1 ... 7:
                val_le = store_whole_le16(pv, s1, val);
                store_bytes_leN(pv + s1, s2, val_le);
                break;
            case 9 ... 15:
                store_bytes_leN(pv, s1, int128_getlo(val));
                val = int128_urshift(val, s1 * 8);
                store_whole_le16(pv + s1, s2, val);
                break;
            case 0: /* aligned */
            case 8: /* atmax MO_64 */
            default:
                g_assert_not_reached();
            }
            return;
        }
        break;
    case MO_128:
        if (HAVE_al16) {
            store_atomic16(pv, val);
            return;
        }
        break;
    default:
        g_assert_not_reached();
    }
    cpu_loop_exit_atomic(env_cpu(env), ra);
}