; RUN: llc < %s -march=x86-64 -verify-machineinstrs | FileCheck %s --check-prefix X64
; RUN: llc < %s -march=x86 -verify-machineinstrs | FileCheck %s --check-prefix X32
; RUN: llc < %s -march=x86-64 -mattr=slow-incdec -verify-machineinstrs | FileCheck %s --check-prefix SLOW_INC

; This file checks that atomic (non-seq_cst) stores of immediate values are
; done in a single mov instruction and not two. More precisely, it makes sure
; that the immediate is not first uselessly copied into a register.

; Similarly, it checks that a binary operation of an immediate with an atomic
; variable that is stored back into that variable is done as a single instruction.
; For example: x.store(42 + x.load(memory_order_acquire), memory_order_release)
; should be just an add instruction, instead of loading x into a register,
; doing an add, and storing the result back (see the sketch at the end of this
; comment block).
; The binary operations currently supported are add, and, or, and xor.
; sub is not supported because it is translated into an addition of the
; negated immediate.
; Finally, we also check the same kind of pattern for inc/dec.

; seq_cst stores are left as (lock) xchgl, but we try to check every other
; attribute at least once.

; Please note that these operations do not require the lock prefix: only
; sequentially consistent stores require this kind of protection on X86.
; Even for seq_cst operations, LLVM uses the xchg instruction, which has an
; implicit lock prefix, so making it explicit is not required.

define void @store_atomic_imm_8(i8* %p) {
; X64-LABEL: store_atomic_imm_8
; X64: movb
; X64-NOT: movb
; X32-LABEL: store_atomic_imm_8
; X32: movb
; X32-NOT: movb
  store atomic i8 42, i8* %p release, align 1
  ret void
}

define void @store_atomic_imm_16(i16* %p) {
; X64-LABEL: store_atomic_imm_16
; X64: movw
; X64-NOT: movw
; X32-LABEL: store_atomic_imm_16
; X32: movw
; X32-NOT: movw
  store atomic i16 42, i16* %p monotonic, align 2
  ret void
}

define void @store_atomic_imm_32(i32* %p) {
; X64-LABEL: store_atomic_imm_32
; X64: movl
; X64-NOT: movl
;   On 32-bit targets, there is an extra movl for each of these functions,
;   which loads the pointer argument from the stack.
; X32-LABEL: store_atomic_imm_32
; X32: movl 4(%esp), %eax
; X32: movl
; X32-NOT: movl
  store atomic i32 42, i32* %p release, align 4
  ret void
}

define void @store_atomic_imm_64(i64* %p) {
; X64-LABEL: store_atomic_imm_64
; X64: movq
; X64-NOT: movq
;   These are implemented with a CAS loop on 32-bit architectures, and thus
;   cannot be optimized in the same way as the others.
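;   (A rough sketch of that lowering, not checked in detail here: the new value
;   is placed in ecx:ebx, the current value is loaded into edx:eax, and a loop
;   retries lock cmpxchg8b until the exchange succeeds.)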
; X32-LABEL: store_atomic_imm_64
; X32: cmpxchg8b
  store atomic i64 42, i64* %p release, align 8
  ret void
}

; If an immediate is too big to fit in 32 bits, it cannot be stored in one mov:
; even on X64, one must use movabsq, which can only target a register.
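; A rough sketch of the expected pair (with the pointer in %rdi and an
; arbitrary scratch register, here %rax, chosen purely for illustration):
;   movabsq $100000000000, %rax
;   movq %rax, (%rdi)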
define void @store_atomic_imm_64_big(i64* %p) {
; X64-LABEL: store_atomic_imm_64_big
; X64: movabsq
; X64: movq
  store atomic i64 100000000000, i64* %p monotonic, align 8
  ret void
}

; It would be incorrect to replace a lock xchgl with a plain movl.
define void @store_atomic_imm_32_seq_cst(i32* %p) {
; X64-LABEL: store_atomic_imm_32_seq_cst
; X64: xchgl
; X32-LABEL: store_atomic_imm_32_seq_cst
; X32: xchgl
  store atomic i32 42, i32* %p seq_cst, align 4
  ret void
}

; ----- ADD -----

define void @add_8(i8* %p) {
; X64-LABEL: add_8
; X64-NOT: lock
; X64: addb
; X64-NOT: movb
; X32-LABEL: add_8
; X32-NOT: lock
; X32: addb
; X32-NOT: movb
  %1 = load atomic i8* %p seq_cst, align 1
  %2 = add i8 %1, 2
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @add_16(i16* %p) {
;   Currently the transformation is not done on 16-bit accesses, as the backend
;   treats 16-bit arithmetic as expensive on X86/X86_64.
; X64-LABEL: add_16
; X64-NOT: addw
; X32-LABEL: add_16
; X32-NOT: addw
  %1 = load atomic i16* %p acquire, align 2
  %2 = add i16 %1, 2
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @add_32(i32* %p) {
; X64-LABEL: add_32
; X64-NOT: lock
; X64: addl
; X64-NOT: movl
; X32-LABEL: add_32
; X32-NOT: lock
; X32: addl
; X32-NOT: movl
  %1 = load atomic i32* %p acquire, align 4
  %2 = add i32 %1, 2
  store atomic i32 %2, i32* %p monotonic, align 4
  ret void
}

define void @add_64(i64* %p) {
; X64-LABEL: add_64
; X64-NOT: lock
; X64: addq
; X64-NOT: movq
;   We do not check X86-32 as it cannot do 'addq'.
; X32-LABEL: add_64
  %1 = load atomic i64* %p acquire, align 8
  %2 = add i64 %1, 2
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @add_32_seq_cst(i32* %p) {
; X64-LABEL: add_32_seq_cst
; X64: xchgl
; X32-LABEL: add_32_seq_cst
; X32: xchgl
  %1 = load atomic i32* %p monotonic, align 4
  %2 = add i32 %1, 2
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

; ----- AND -----

define void @and_8(i8* %p) {
; X64-LABEL: and_8
; X64-NOT: lock
; X64: andb
; X64-NOT: movb
; X32-LABEL: and_8
; X32-NOT: lock
; X32: andb
; X32-NOT: movb
  %1 = load atomic i8* %p monotonic, align 1
  %2 = and i8 %1, 2
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @and_16(i16* %p) {
;   Currently the transformation is not done on 16-bit accesses, as the backend
;   treats 16-bit arithmetic as expensive on X86/X86_64.
; X64-LABEL: and_16
; X64-NOT: andw
; X32-LABEL: and_16
; X32-NOT: andw
  %1 = load atomic i16* %p acquire, align 2
  %2 = and i16 %1, 2
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @and_32(i32* %p) {
; X64-LABEL: and_32
; X64-NOT: lock
; X64: andl
; X64-NOT: movl
; X32-LABEL: and_32
; X32-NOT: lock
; X32: andl
; X32-NOT: movl
  %1 = load atomic i32* %p acquire, align 4
  %2 = and i32 %1, 2
  store atomic i32 %2, i32* %p release, align 4
  ret void
}

define void @and_64(i64* %p) {
; X64-LABEL: and_64
; X64-NOT: lock
; X64: andq
; X64-NOT: movq
;   We do not check X86-32 as it cannot do 'andq'.
; X32-LABEL: and_64
  %1 = load atomic i64* %p acquire, align 8
  %2 = and i64 %1, 2
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @and_32_seq_cst(i32* %p) {
; X64-LABEL: and_32_seq_cst
; X64: xchgl
; X32-LABEL: and_32_seq_cst
; X32: xchgl
  %1 = load atomic i32* %p monotonic, align 4
  %2 = and i32 %1, 2
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

; ----- OR -----

define void @or_8(i8* %p) {
; X64-LABEL: or_8
; X64-NOT: lock
; X64: orb
; X64-NOT: movb
; X32-LABEL: or_8
; X32-NOT: lock
; X32: orb
; X32-NOT: movb
  %1 = load atomic i8* %p acquire, align 1
  %2 = or i8 %1, 2
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @or_16(i16* %p) {
; X64-LABEL: or_16
; X64-NOT: orw
; X32-LABEL: or_16
; X32-NOT: orw
  %1 = load atomic i16* %p acquire, align 2
  %2 = or i16 %1, 2
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @or_32(i32* %p) {
; X64-LABEL: or_32
; X64-NOT: lock
; X64: orl
; X64-NOT: movl
; X32-LABEL: or_32
; X32-NOT: lock
; X32: orl
; X32-NOT: movl
  %1 = load atomic i32* %p acquire, align 4
  %2 = or i32 %1, 2
  store atomic i32 %2, i32* %p release, align 4
  ret void
}

define void @or_64(i64* %p) {
; X64-LABEL: or_64
; X64-NOT: lock
; X64: orq
; X64-NOT: movq
;   We do not check X86-32 as it cannot do 'orq'.
; X32-LABEL: or_64
  %1 = load atomic i64* %p acquire, align 8
  %2 = or i64 %1, 2
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @or_32_seq_cst(i32* %p) {
; X64-LABEL: or_32_seq_cst
; X64: xchgl
; X32-LABEL: or_32_seq_cst
; X32: xchgl
  %1 = load atomic i32* %p monotonic, align 4
  %2 = or i32 %1, 2
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

; ----- XOR -----

define void @xor_8(i8* %p) {
; X64-LABEL: xor_8
; X64-NOT: lock
; X64: xorb
; X64-NOT: movb
; X32-LABEL: xor_8
; X32-NOT: lock
; X32: xorb
; X32-NOT: movb
  %1 = load atomic i8* %p acquire, align 1
  %2 = xor i8 %1, 2
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @xor_16(i16* %p) {
; X64-LABEL: xor_16
; X64-NOT: xorw
; X32-LABEL: xor_16
; X32-NOT: xorw
  %1 = load atomic i16* %p acquire, align 2
  %2 = xor i16 %1, 2
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @xor_32(i32* %p) {
; X64-LABEL: xor_32
; X64-NOT: lock
; X64: xorl
; X64-NOT: movl
; X32-LABEL: xor_32
; X32-NOT: lock
; X32: xorl
; X32-NOT: movl
  %1 = load atomic i32* %p acquire, align 4
  %2 = xor i32 %1, 2
  store atomic i32 %2, i32* %p release, align 4
  ret void
}

define void @xor_64(i64* %p) {
; X64-LABEL: xor_64
; X64-NOT: lock
; X64: xorq
; X64-NOT: movq
;   We do not check X86-32 as it cannot do 'xorq'.
; X32-LABEL: xor_64
  %1 = load atomic i64* %p acquire, align 8
  %2 = xor i64 %1, 2
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @xor_32_seq_cst(i32* %p) {
; X64-LABEL: xor_32_seq_cst
; X64: xchgl
; X32-LABEL: xor_32_seq_cst
; X32: xchgl
  %1 = load atomic i32* %p monotonic, align 4
  %2 = xor i32 %1, 2
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

; ----- INC -----
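; A note on the SLOW_INC prefix used below: on CPUs with the slow-incdec
; attribute, the backend is expected to avoid inc/dec and presumably emits the
; equivalent add/sub of 1 instead, e.g. something like
;   addb $1, (%rdi)
; on x86-64, which is why the SLOW_INC checks in the inc/dec tests only verify
; that the inc/dec instructions (and, where checked, extra moves) do not appear.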

define void @inc_8(i8* %p) {
; X64-LABEL: inc_8
; X64-NOT: lock
; X64: incb
; X64-NOT: movb
; X32-LABEL: inc_8
; X32-NOT: lock
; X32: incb
; X32-NOT: movb
; SLOW_INC-LABEL: inc_8
; SLOW_INC-NOT: incb
; SLOW_INC-NOT: movb
  %1 = load atomic i8* %p seq_cst, align 1
  %2 = add i8 %1, 1
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @inc_16(i16* %p) {
;   Currently the transformation is not done on 16-bit accesses, as the backend
;   treats 16-bit arithmetic as expensive on X86/X86_64.
; X64-LABEL: inc_16
; X64-NOT: incw
; X32-LABEL: inc_16
; X32-NOT: incw
; SLOW_INC-LABEL: inc_16
; SLOW_INC-NOT: incw
  %1 = load atomic i16* %p acquire, align 2
  %2 = add i16 %1, 1
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @inc_32(i32* %p) {
; X64-LABEL: inc_32
; X64-NOT: lock
; X64: incl
; X64-NOT: movl
; X32-LABEL: inc_32
; X32-NOT: lock
; X32: incl
; X32-NOT: movl
; SLOW_INC-LABEL: inc_32
; SLOW_INC-NOT: incl
; SLOW_INC-NOT: movl
  %1 = load atomic i32* %p acquire, align 4
  %2 = add i32 %1, 1
  store atomic i32 %2, i32* %p monotonic, align 4
  ret void
}

define void @inc_64(i64* %p) {
; X64-LABEL: inc_64
; X64-NOT: lock
; X64: incq
; X64-NOT: movq
;   We do not check X86-32 as it cannot do 'incq'.
; X32-LABEL: inc_64
; SLOW_INC-LABEL: inc_64
; SLOW_INC-NOT: incq
; SLOW_INC-NOT: movq
  %1 = load atomic i64* %p acquire, align 8
  %2 = add i64 %1, 1
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @inc_32_seq_cst(i32* %p) {
; X64-LABEL: inc_32_seq_cst
; X64: xchgl
; X32-LABEL: inc_32_seq_cst
; X32: xchgl
  %1 = load atomic i32* %p monotonic, align 4
  %2 = add i32 %1, 1
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}

; ----- DEC -----

define void @dec_8(i8* %p) {
; X64-LABEL: dec_8
; X64-NOT: lock
; X64: decb
; X64-NOT: movb
; X32-LABEL: dec_8
; X32-NOT: lock
; X32: decb
; X32-NOT: movb
; SLOW_INC-LABEL: dec_8
; SLOW_INC-NOT: decb
; SLOW_INC-NOT: movb
  %1 = load atomic i8* %p seq_cst, align 1
  %2 = sub i8 %1, 1
  store atomic i8 %2, i8* %p release, align 1
  ret void
}

define void @dec_16(i16* %p) {
;   Currently the transformation is not done on 16-bit accesses, as the backend
;   treats 16-bit arithmetic as expensive on X86/X86_64.
; X64-LABEL: dec_16
; X64-NOT: decw
; X32-LABEL: dec_16
; X32-NOT: decw
; SLOW_INC-LABEL: dec_16
; SLOW_INC-NOT: decw
  %1 = load atomic i16* %p acquire, align 2
  %2 = sub i16 %1, 1
  store atomic i16 %2, i16* %p release, align 2
  ret void
}

define void @dec_32(i32* %p) {
; X64-LABEL: dec_32
; X64-NOT: lock
; X64: decl
; X64-NOT: movl
; X32-LABEL: dec_32
; X32-NOT: lock
; X32: decl
; X32-NOT: movl
; SLOW_INC-LABEL: dec_32
; SLOW_INC-NOT: decl
; SLOW_INC-NOT: movl
  %1 = load atomic i32* %p acquire, align 4
  %2 = sub i32 %1, 1
  store atomic i32 %2, i32* %p monotonic, align 4
  ret void
}

define void @dec_64(i64* %p) {
; X64-LABEL: dec_64
; X64-NOT: lock
; X64: decq
; X64-NOT: movq
;   We do not check X86-32 as it cannot do 'decq'.
; X32-LABEL: dec_64
; SLOW_INC-LABEL: dec_64
; SLOW_INC-NOT: decq
; SLOW_INC-NOT: movq
  %1 = load atomic i64* %p acquire, align 8
  %2 = sub i64 %1, 1
  store atomic i64 %2, i64* %p release, align 8
  ret void
}

define void @dec_32_seq_cst(i32* %p) {
; X64-LABEL: dec_32_seq_cst
; X64: xchgl
; X32-LABEL: dec_32_seq_cst
; X32: xchgl
  %1 = load atomic i32* %p monotonic, align 4
  %2 = sub i32 %1, 1
  store atomic i32 %2, i32* %p seq_cst, align 4
  ret void
}