; RUN: llc < %s -march=x86-64 -verify-machineinstrs | FileCheck %s --check-prefix X64
; RUN: llc < %s -march=x86 -verify-machineinstrs | FileCheck %s --check-prefix X32
; RUN: llc < %s -march=x86-64 -mattr=slow-incdec -verify-machineinstrs | FileCheck %s --check-prefix SLOW_INC

; This file checks that atomic (non-seq_cst) stores of immediate values are
; done in one mov instruction and not two. More precisely, it makes sure that
; the immediate is not first copied uselessly into a register.

; Similarly, it checks that a binary operation of an immediate with an atomic
; variable that is stored back in that variable is done as a single instruction.
; For example: x.store(42 + x.load(memory_order_acquire), memory_order_release)
; should be just an add instruction, instead of loading x into a register, doing
; an add and storing the result back.
; The binary operations currently supported are add, and, or, and xor.
; sub is not supported because it is translated into an addition of the
; negated immediate.
; Finally, we also check the same kind of pattern for inc/dec.

; seq_cst stores are left as (lock) xchgl, but we try to check every other
; attribute at least once.

; Please note that these operations do not require the lock prefix: only
; sequentially consistent stores require this kind of protection on X86.
; And even for seq_cst operations, LLVM uses the xchg instruction, which has
; an implicit lock prefix, so making it explicit is not required.

define void @store_atomic_imm_8(i8* %p) {
; X64-LABEL: store_atomic_imm_8
; X64: movb
; X64-NOT: movb
; X32-LABEL: store_atomic_imm_8
; X32: movb
; X32-NOT: movb
  store atomic i8 42, i8* %p release, align 1
  ret void
}

define void @store_atomic_imm_16(i16* %p) {
; X64-LABEL: store_atomic_imm_16
; X64: movw
; X64-NOT: movw
; X32-LABEL: store_atomic_imm_16
; X32: movw
; X32-NOT: movw
  store atomic i16 42, i16* %p monotonic, align 2
  ret void
}

define void @store_atomic_imm_32(i32* %p) {
; X64-LABEL: store_atomic_imm_32
; X64: movl
; X64-NOT: movl
; On 32-bit targets, there is an extra movl for each of these functions
; (probably for alignment reasons).
; X32-LABEL: store_atomic_imm_32
; X32: movl 4(%esp), %eax
; X32: movl
; X32-NOT: movl
  store atomic i32 42, i32* %p release, align 4
  ret void
}

define void @store_atomic_imm_64(i64* %p) {
; X64-LABEL: store_atomic_imm_64
; X64: movq
; X64-NOT: movq
; These are implemented with a CAS loop on 32-bit architectures, and thus
; cannot be optimized in the same way as the others.
; X32-LABEL: store_atomic_imm_64
; X32: cmpxchg8b
  store atomic i64 42, i64* %p release, align 8
  ret void
}

; If an immediate is too big to fit in 32 bits, it cannot be stored in one mov:
; even on X64, one must use movabsq, which can only target a register.
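; As an illustration (not checked by FileCheck, and the exact register chosen
; by the allocator is only an assumption), the expected X64 sequence for the
; function below is roughly:
;   movabsq $100000000000, %rax
;   movq %rax, (%rdi)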
define void @store_atomic_imm_64_big(i64* %p) {
; X64-LABEL: store_atomic_imm_64_big
; X64: movabsq
; X64: movq
  store atomic i64 100000000000, i64* %p monotonic, align 8
  ret void
}

; It would be incorrect to replace a lock xchgl with a movl.
define void @store_atomic_imm_32_seq_cst(i32* %p) {
; X64-LABEL: store_atomic_imm_32_seq_cst
; X64: xchgl
; X32-LABEL: store_atomic_imm_32_seq_cst
; X32: xchgl
  store atomic i32 42, i32* %p seq_cst, align 4
  ret void
}

; ----- ADD -----

define void @add_8(i8* %p) {
; X64-LABEL: add_8
; X64-NOT: lock
; X64: addb
; X64-NOT: movb
; X32-LABEL: add_8
; X32-NOT: lock
; X32: addb
; X32-NOT: movb
  %1 = load atomic i8* %p seq_cst, align 1
  %2 = add i8 %1, 2
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @add_16(i16* %p) {
; Currently the transformation is not done on 16-bit accesses, as the backend
; treats 16-bit arithmetic as expensive on X86/X86_64.
; X64-LABEL: add_16
; X64-NOT: addw
; X32-LABEL: add_16
; X32-NOT: addw
  %1 = load atomic i16* %p acquire, align 2
  %2 = add i16 %1, 2
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @add_32(i32* %p) {
; X64-LABEL: add_32
; X64-NOT: lock
; X64: addl
; X64-NOT: movl
; X32-LABEL: add_32
; X32-NOT: lock
; X32: addl
; X32-NOT: movl
  %1 = load atomic i32* %p acquire, align 4
  %2 = add i32 %1, 2
  store atomic i32 %2, i32* %p monotonic, align 4
  ret void
}

define void @add_64(i64* %p) {
; X64-LABEL: add_64
; X64-NOT: lock
; X64: addq
; X64-NOT: movq
; We do not check X86-32 as it cannot do 'addq'.
; X32-LABEL: add_64
  %1 = load atomic i64* %p acquire, align 8
  %2 = add i64 %1, 2
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @add_32_seq_cst(i32* %p) {
; X64-LABEL: add_32_seq_cst
; X64: xchgl
; X32-LABEL: add_32_seq_cst
; X32: xchgl
  %1 = load atomic i32* %p monotonic, align 4
  %2 = add i32 %1, 2
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

; ----- AND -----

define void @and_8(i8* %p) {
; X64-LABEL: and_8
; X64-NOT: lock
; X64: andb
; X64-NOT: movb
; X32-LABEL: and_8
; X32-NOT: lock
; X32: andb
; X32-NOT: movb
  %1 = load atomic i8* %p monotonic, align 1
  %2 = and i8 %1, 2
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @and_16(i16* %p) {
; Currently the transformation is not done on 16-bit accesses, as the backend
; treats 16-bit arithmetic as expensive on X86/X86_64.
; X64-LABEL: and_16
; X64-NOT: andw
; X32-LABEL: and_16
; X32-NOT: andw
  %1 = load atomic i16* %p acquire, align 2
  %2 = and i16 %1, 2
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @and_32(i32* %p) {
; X64-LABEL: and_32
; X64-NOT: lock
; X64: andl
; X64-NOT: movl
; X32-LABEL: and_32
; X32-NOT: lock
; X32: andl
; X32-NOT: movl
  %1 = load atomic i32* %p acquire, align 4
  %2 = and i32 %1, 2
  store atomic i32 %2, i32* %p release, align 4
  ret void
}

define void @and_64(i64* %p) {
; X64-LABEL: and_64
; X64-NOT: lock
; X64: andq
; X64-NOT: movq
; We do not check X86-32 as it cannot do 'andq'.
; X32-LABEL: and_64
  %1 = load atomic i64* %p acquire, align 8
  %2 = and i64 %1, 2
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @and_32_seq_cst(i32* %p) {
; X64-LABEL: and_32_seq_cst
; X64: xchgl
; X32-LABEL: and_32_seq_cst
; X32: xchgl
  %1 = load atomic i32* %p monotonic, align 4
  %2 = and i32 %1, 2
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

; ----- OR -----

define void @or_8(i8* %p) {
; X64-LABEL: or_8
; X64-NOT: lock
; X64: orb
; X64-NOT: movb
; X32-LABEL: or_8
; X32-NOT: lock
; X32: orb
; X32-NOT: movb
  %1 = load atomic i8* %p acquire, align 1
  %2 = or i8 %1, 2
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @or_16(i16* %p) {
; X64-LABEL: or_16
; X64-NOT: orw
; X32-LABEL: or_16
; X32-NOT: orw
  %1 = load atomic i16* %p acquire, align 2
  %2 = or i16 %1, 2
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @or_32(i32* %p) {
; X64-LABEL: or_32
; X64-NOT: lock
; X64: orl
; X64-NOT: movl
; X32-LABEL: or_32
; X32-NOT: lock
; X32: orl
; X32-NOT: movl
  %1 = load atomic i32* %p acquire, align 4
  %2 = or i32 %1, 2
  store atomic i32 %2, i32* %p release, align 4
  ret void
}

define void @or_64(i64* %p) {
; X64-LABEL: or_64
; X64-NOT: lock
; X64: orq
; X64-NOT: movq
; We do not check X86-32 as it cannot do 'orq'.
; X32-LABEL: or_64
  %1 = load atomic i64* %p acquire, align 8
  %2 = or i64 %1, 2
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @or_32_seq_cst(i32* %p) {
; X64-LABEL: or_32_seq_cst
; X64: xchgl
; X32-LABEL: or_32_seq_cst
; X32: xchgl
  %1 = load atomic i32* %p monotonic, align 4
  %2 = or i32 %1, 2
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

; ----- XOR -----

define void @xor_8(i8* %p) {
; X64-LABEL: xor_8
; X64-NOT: lock
; X64: xorb
; X64-NOT: movb
; X32-LABEL: xor_8
; X32-NOT: lock
; X32: xorb
; X32-NOT: movb
  %1 = load atomic i8* %p acquire, align 1
  %2 = xor i8 %1, 2
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @xor_16(i16* %p) {
; X64-LABEL: xor_16
; X64-NOT: xorw
; X32-LABEL: xor_16
; X32-NOT: xorw
  %1 = load atomic i16* %p acquire, align 2
  %2 = xor i16 %1, 2
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @xor_32(i32* %p) {
; X64-LABEL: xor_32
; X64-NOT: lock
; X64: xorl
; X64-NOT: movl
; X32-LABEL: xor_32
; X32-NOT: lock
; X32: xorl
; X32-NOT: movl
  %1 = load atomic i32* %p acquire, align 4
  %2 = xor i32 %1, 2
  store atomic i32 %2, i32* %p release, align 4
  ret void
}

define void @xor_64(i64* %p) {
; X64-LABEL: xor_64
; X64-NOT: lock
; X64: xorq
; X64-NOT: movq
; We do not check X86-32 as it cannot do 'xorq'.
; X32-LABEL: xor_64
  %1 = load atomic i64* %p acquire, align 8
  %2 = xor i64 %1, 2
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @xor_32_seq_cst(i32* %p) {
; X64-LABEL: xor_32_seq_cst
; X64: xchgl
; X32-LABEL: xor_32_seq_cst
; X32: xchgl
  %1 = load atomic i32* %p monotonic, align 4
  %2 = xor i32 %1, 2
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

; ----- INC -----

define void @inc_8(i8* %p) {
; X64-LABEL: inc_8
; X64-NOT: lock
; X64: incb
; X64-NOT: movb
; X32-LABEL: inc_8
; X32-NOT: lock
; X32: incb
; X32-NOT: movb
; SLOW_INC-LABEL: inc_8
; SLOW_INC-NOT: incb
; SLOW_INC-NOT: movb
  %1 = load atomic i8* %p seq_cst, align 1
  %2 = add i8 %1, 1
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @inc_16(i16* %p) {
; Currently the transformation is not done on 16-bit accesses, as the backend
; treats 16-bit arithmetic as expensive on X86/X86_64.
; X64-LABEL: inc_16
; X64-NOT: incw
; X32-LABEL: inc_16
; X32-NOT: incw
; SLOW_INC-LABEL: inc_16
; SLOW_INC-NOT: incw
  %1 = load atomic i16* %p acquire, align 2
  %2 = add i16 %1, 1
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @inc_32(i32* %p) {
; X64-LABEL: inc_32
; X64-NOT: lock
; X64: incl
; X64-NOT: movl
; X32-LABEL: inc_32
; X32-NOT: lock
; X32: incl
; X32-NOT: movl
; SLOW_INC-LABEL: inc_32
; SLOW_INC-NOT: incl
; SLOW_INC-NOT: movl
  %1 = load atomic i32* %p acquire, align 4
  %2 = add i32 %1, 1
  store atomic i32 %2, i32* %p monotonic, align 4
  ret void
}

define void @inc_64(i64* %p) {
; X64-LABEL: inc_64
; X64-NOT: lock
; X64: incq
; X64-NOT: movq
; We do not check X86-32 as it cannot do 'incq'.
; X32-LABEL: inc_64
; SLOW_INC-LABEL: inc_64
; SLOW_INC-NOT: incq
; SLOW_INC-NOT: movq
  %1 = load atomic i64* %p acquire, align 8
  %2 = add i64 %1, 1
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @inc_32_seq_cst(i32* %p) {
; X64-LABEL: inc_32_seq_cst
; X64: xchgl
; X32-LABEL: inc_32_seq_cst
; X32: xchgl
  %1 = load atomic i32* %p monotonic, align 4
  %2 = add i32 %1, 1
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

; ----- DEC -----

define void @dec_8(i8* %p) {
; X64-LABEL: dec_8
; X64-NOT: lock
; X64: decb
; X64-NOT: movb
; X32-LABEL: dec_8
; X32-NOT: lock
; X32: decb
; X32-NOT: movb
; SLOW_INC-LABEL: dec_8
; SLOW_INC-NOT: decb
; SLOW_INC-NOT: movb
  %1 = load atomic i8* %p seq_cst, align 1
  %2 = sub i8 %1, 1
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @dec_16(i16* %p) {
; Currently the transformation is not done on 16-bit accesses, as the backend
; treats 16-bit arithmetic as expensive on X86/X86_64.
; X64-LABEL: dec_16
; X64-NOT: decw
; X32-LABEL: dec_16
; X32-NOT: decw
; SLOW_INC-LABEL: dec_16
; SLOW_INC-NOT: decw
  %1 = load atomic i16* %p acquire, align 2
  %2 = sub i16 %1, 1
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @dec_32(i32* %p) {
; X64-LABEL: dec_32
; X64-NOT: lock
; X64: decl
; X64-NOT: movl
; X32-LABEL: dec_32
; X32-NOT: lock
; X32: decl
; X32-NOT: movl
; SLOW_INC-LABEL: dec_32
; SLOW_INC-NOT: decl
; SLOW_INC-NOT: movl
  %1 = load atomic i32* %p acquire, align 4
  %2 = sub i32 %1, 1
  store atomic i32 %2, i32* %p monotonic, align 4
  ret void
}

define void @dec_64(i64* %p) {
; X64-LABEL: dec_64
; X64-NOT: lock
; X64: decq
; X64-NOT: movq
; We do not check X86-32 as it cannot do 'decq'.
; X32-LABEL: dec_64
; SLOW_INC-LABEL: dec_64
; SLOW_INC-NOT: decq
; SLOW_INC-NOT: movq
  %1 = load atomic i64* %p acquire, align 8
  %2 = sub i64 %1, 1
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @dec_32_seq_cst(i32* %p) {
; X64-LABEL: dec_32_seq_cst
; X64: xchgl
; X32-LABEL: dec_32_seq_cst
; X32: xchgl
  %1 = load atomic i32* %p monotonic, align 4
  %2 = sub i32 %1, 1
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}
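
; For reference, a C++-level sketch of the pattern these tests cover (assuming
; C++11 std::atomic; the variable name is illustrative):
;   std::atomic<int> x;
;   x.store(2 + x.load(std::memory_order_acquire), std::memory_order_release);
; The expected lowering is a single memory-destination instruction such as
; 'addl $2, (%rdi)' rather than a separate load, add, and store.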