1; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
3; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
4; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
5
6;===------------------------------------------------------------------------===;
7; Global Address Space
8;===------------------------------------------------------------------------===;
9; FUNC-LABEL: {{^}}store_i1:
10; EG: MEM_RAT MSKOR
11; SI: buffer_store_byte
12define void @store_i1(i1 addrspace(1)* %out) { ; i1 store to addrspace(1); the only argument is the destination pointer
13entry:
14  store i1 true, i1 addrspace(1)* %out ; the stored value is the constant true, so no value argument is needed
15  ret void
16}
17
18; i8 store
19; EG-LABEL: {{^}}store_i8:
20; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
21
22; IG 0: Get the byte index and truncate the value
23; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
24; EG: LSHL T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
25; EG: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.y
26; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43)
27
28
29; IG 1: Truncate the calculated shift amount for the mask
30
31; IG 2: Shift the value and the mask
32; EG: LSHL T[[RW_GPR]].X, PS, PV.[[SHIFT_CHAN]]
33; EG: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]]
34; EG-NEXT: 255
35; IG 3: Initialize the Y and Z channels to zero
36;       XXX: An optimal scheduler should merge this into one of the previous IGs.
37; EG: MOV T[[RW_GPR]].Y, 0.0
38; EG: MOV * T[[RW_GPR]].Z, 0.0
39
40; SI-LABEL: {{^}}store_i8:
41; SI: buffer_store_byte
42
43define void @store_i8(i8 addrspace(1)* %out, i8 %in) { ; scalar i8 store to an addrspace(1) pointer
44entry:
45  store i8 %in, i8 addrspace(1)* %out ; the argument is stored as-is; no explicit alignment is specified
46  ret void
47}
48
49; i16 store
50; EG-LABEL: {{^}}store_i16:
51; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
52
53; IG 0: Get the byte index and truncate the value
54
55
56; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
57; EG-NEXT: 3(4.203895e-45),
58
59; EG: LSHL T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
60; EG: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.y
61
62; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
63; IG 1: Truncate the calculated shift amount for the mask
64
65; IG 2: Shift the value and the mask
66; EG: LSHL T[[RW_GPR]].X, PS, PV.[[SHIFT_CHAN]]
67; EG: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]]
68; EG-NEXT: 65535
69; IG 3: Initialize the Y and Z channels to zero
70;       XXX: An optimal scheduler should merge this into one of the previous IGs.
71; EG: MOV T[[RW_GPR]].Y, 0.0
72; EG: MOV * T[[RW_GPR]].Z, 0.0
73
74; SI-LABEL: {{^}}store_i16:
75; SI: buffer_store_short
76define void @store_i16(i16 addrspace(1)* %out, i16 %in) { ; scalar i16 store to an addrspace(1) pointer
77entry:
78  store i16 %in, i16 addrspace(1)* %out ; the argument is stored as-is; no explicit alignment is specified
79  ret void
80}
81
82; EG-LABEL: {{^}}store_v2i8:
83; EG: MEM_RAT MSKOR
84; EG-NOT: MEM_RAT MSKOR
85; SI-LABEL: {{^}}store_v2i8:
86; SI: buffer_store_byte
87; SI: buffer_store_byte
88define void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) { ; narrows a <2 x i32> argument and stores it as <2 x i8> to addrspace(1)
89entry:
90  %0 = trunc <2 x i32> %in to <2 x i8> ; element-wise truncation: keeps the low 8 bits of each element
91  store <2 x i8> %0, <2 x i8> addrspace(1)* %out ; 2-byte vector store of the truncated value
92  ret void
93}
94
95
96; EG-LABEL: {{^}}store_v2i16:
97; EG: MEM_RAT_CACHELESS STORE_RAW
98; CM-LABEL: {{^}}store_v2i16:
99; CM: MEM_RAT_CACHELESS STORE_DWORD
100; SI-LABEL: {{^}}store_v2i16:
101; SI: buffer_store_short
102; SI: buffer_store_short
103define void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) { ; narrows a <2 x i32> argument and stores it as <2 x i16> to addrspace(1)
104entry:
105  %0 = trunc <2 x i32> %in to <2 x i16> ; element-wise truncation: keeps the low 16 bits of each element
106  store <2 x i16> %0, <2 x i16> addrspace(1)* %out ; 4-byte vector store of the truncated value
107  ret void
108}
109
110; EG-LABEL: {{^}}store_v4i8:
111; EG: MEM_RAT_CACHELESS STORE_RAW
112; CM-LABEL: {{^}}store_v4i8:
113; CM: MEM_RAT_CACHELESS STORE_DWORD
114; SI-LABEL: {{^}}store_v4i8:
115; SI: buffer_store_byte
116; SI: buffer_store_byte
117; SI: buffer_store_byte
118; SI: buffer_store_byte
119define void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) { ; narrows a <4 x i32> argument and stores it as <4 x i8> to addrspace(1)
120entry:
121  %0 = trunc <4 x i32> %in to <4 x i8> ; element-wise truncation: keeps the low 8 bits of each element
122  store <4 x i8> %0, <4 x i8> addrspace(1)* %out ; 4-byte vector store of the truncated value
123  ret void
124}
125
126; floating-point store
127; EG-LABEL: {{^}}store_f32:
128; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.X, T[0-9]+\.X}}, 1
129; CM-LABEL: {{^}}store_f32:
130; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+\.X, T[0-9]+\.X}}
131; SI-LABEL: {{^}}store_f32:
132; SI: buffer_store_dword
133
134define void @store_f32(float addrspace(1)* %out, float %in) { ; scalar 32-bit float store to an addrspace(1) pointer
135  store float %in, float addrspace(1)* %out ; the argument is stored as-is; no explicit alignment is specified
136  ret void
137}
138
139; EG-LABEL: {{^}}store_v4i16:
140; EG: MEM_RAT MSKOR
141; EG: MEM_RAT MSKOR
142; EG: MEM_RAT MSKOR
143; EG: MEM_RAT MSKOR
144; EG-NOT: MEM_RAT MSKOR
145; SI-LABEL: {{^}}store_v4i16:
146; SI: buffer_store_short
147; SI: buffer_store_short
148; SI: buffer_store_short
149; SI: buffer_store_short
150; SI-NOT: buffer_store_byte
151define void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) { ; narrows a <4 x i32> argument and stores it as <4 x i16> to addrspace(1)
152entry:
153  %0 = trunc <4 x i32> %in to <4 x i16> ; element-wise truncation: keeps the low 16 bits of each element
154  store <4 x i16> %0, <4 x i16> addrspace(1)* %out ; 8-byte vector store of the truncated value
155  ret void
156}
157
158; vec2 floating-point stores
159; EG-LABEL: {{^}}store_v2f32:
160; EG: MEM_RAT_CACHELESS STORE_RAW
161; CM-LABEL: {{^}}store_v2f32:
162; CM: MEM_RAT_CACHELESS STORE_DWORD
163; SI-LABEL: {{^}}store_v2f32:
164; SI: buffer_store_dwordx2
165
166define void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) { ; packs two scalar floats into a <2 x float> and stores it to addrspace(1)
167entry:
168  %0 = insertelement <2 x float> <float 0.0, float 0.0>, float %a, i32 0 ; element 0 = %a; the zero vector is only a placeholder
169  %1 = insertelement <2 x float> %0, float %b, i32 1 ; element 1 = %b, so both placeholder zeros are overwritten
170  store <2 x float> %1, <2 x float> addrspace(1)* %out ; single 64-bit vector store of <%a, %b>
171  ret void
172}
173
174; EG-LABEL: {{^}}store_v4i32:
175; EG: MEM_RAT_CACHELESS STORE_RAW
176; EG-NOT: MEM_RAT_CACHELESS STORE_RAW
177; CM-LABEL: {{^}}store_v4i32:
178; CM: MEM_RAT_CACHELESS STORE_DWORD
179; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD
180; SI-LABEL: {{^}}store_v4i32:
181; SI: buffer_store_dwordx4
182define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %in) { ; full-width <4 x i32> (128-bit) store to an addrspace(1) pointer
183entry:
184  store <4 x i32> %in, <4 x i32> addrspace(1)* %out ; the argument is stored as-is; no truncation and no explicit alignment
185  ret void
186}
187
188; FUNC-LABEL: {{^}}store_i64_i8:
189; EG: MEM_RAT MSKOR
190; SI: buffer_store_byte
191define void @store_i64_i8(i8 addrspace(1)* %out, i64 %in) { ; stores the low byte of a 64-bit argument to addrspace(1)
192entry:
193  %0 = trunc i64 %in to i8 ; discard the upper 56 bits of %in
194  store i8 %0, i8 addrspace(1)* %out ; i8 store of the truncated value
195  ret void
196}
197
198; FUNC-LABEL: {{^}}store_i64_i16:
199; EG: MEM_RAT MSKOR
200; SI: buffer_store_short
201define void @store_i64_i16(i16 addrspace(1)* %out, i64 %in) { ; stores the low 16 bits of a 64-bit argument to addrspace(1)
202entry:
203  %0 = trunc i64 %in to i16 ; discard the upper 48 bits of %in
204  store i16 %0, i16 addrspace(1)* %out ; i16 store of the truncated value
205  ret void
206}
207
208;===------------------------------------------------------------------------===;
209; Local Address Space
210;===------------------------------------------------------------------------===;
211
212; FUNC-LABEL: {{^}}store_local_i1:
213; EG: LDS_BYTE_WRITE
214; SI: ds_write_b8
215define void @store_local_i1(i1 addrspace(3)* %out) { ; i1 store to addrspace(3); the only argument is the destination pointer
216entry:
217  store i1 true, i1 addrspace(3)* %out ; the stored value is the constant true, so no value argument is needed
218  ret void
219}
220
221; EG-LABEL: {{^}}store_local_i8:
222; EG: LDS_BYTE_WRITE
223; SI-LABEL: {{^}}store_local_i8:
224; SI: ds_write_b8
225define void @store_local_i8(i8 addrspace(3)* %out, i8 %in) { ; scalar i8 store to an addrspace(3) pointer
226  store i8 %in, i8 addrspace(3)* %out ; the argument is stored as-is; no explicit alignment is specified
227  ret void
228}
229
230; EG-LABEL: {{^}}store_local_i16:
231; EG: LDS_SHORT_WRITE
232; SI-LABEL: {{^}}store_local_i16:
233; SI: ds_write_b16
234define void @store_local_i16(i16 addrspace(3)* %out, i16 %in) { ; scalar i16 store to an addrspace(3) pointer
235  store i16 %in, i16 addrspace(3)* %out ; the argument is stored as-is; no explicit alignment is specified
236  ret void
237}
238
239; EG-LABEL: {{^}}store_local_v2i16:
240; EG: LDS_WRITE
241; CM-LABEL: {{^}}store_local_v2i16:
242; CM: LDS_WRITE
243; SI-LABEL: {{^}}store_local_v2i16:
244; SI: ds_write_b16
245; SI: ds_write_b16
246define void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) { ; <2 x i16> store to addrspace(3); the argument already has the stored type
247entry:
248  store <2 x i16> %in, <2 x i16> addrspace(3)* %out ; 4-byte vector store with no preceding truncation
249  ret void
250}
251
252; EG-LABEL: {{^}}store_local_v4i8:
253; EG: LDS_WRITE
254; CM-LABEL: {{^}}store_local_v4i8:
255; CM: LDS_WRITE
256; SI-LABEL: {{^}}store_local_v4i8:
257; SI: ds_write_b8
258; SI: ds_write_b8
259; SI: ds_write_b8
260; SI: ds_write_b8
261define void @store_local_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> %in) { ; <4 x i8> store to addrspace(3); the argument already has the stored type
262entry:
263  store <4 x i8> %in, <4 x i8> addrspace(3)* %out ; 4-byte vector store with no preceding truncation
264  ret void
265}
266
267; EG-LABEL: {{^}}store_local_v2i32:
268; EG: LDS_WRITE
269; EG: LDS_WRITE
270; CM-LABEL: {{^}}store_local_v2i32:
271; CM: LDS_WRITE
272; CM: LDS_WRITE
273; SI-LABEL: {{^}}store_local_v2i32:
274; SI: ds_write_b64
275define void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) { ; <2 x i32> (64-bit) store to an addrspace(3) pointer
276entry:
277  store <2 x i32> %in, <2 x i32> addrspace(3)* %out ; the argument is stored as-is; no explicit alignment is specified
278  ret void
279}
280
281; EG-LABEL: {{^}}store_local_v4i32:
282; EG: LDS_WRITE
283; EG: LDS_WRITE
284; EG: LDS_WRITE
285; EG: LDS_WRITE
286; CM-LABEL: {{^}}store_local_v4i32:
287; CM: LDS_WRITE
288; CM: LDS_WRITE
289; CM: LDS_WRITE
290; CM: LDS_WRITE
291; SI-LABEL: {{^}}store_local_v4i32:
292; SI: ds_write_b32
293; SI: ds_write_b32
294; SI: ds_write_b32
295; SI: ds_write_b32
296define void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) { ; <4 x i32> (128-bit) store to an addrspace(3) pointer
297entry:
298  store <4 x i32> %in, <4 x i32> addrspace(3)* %out ; the argument is stored as-is; no explicit alignment is specified
299  ret void
300}
301
302; FUNC-LABEL: {{^}}store_local_i64_i8:
303; EG: LDS_BYTE_WRITE
304; SI: ds_write_b8
305define void @store_local_i64_i8(i8 addrspace(3)* %out, i64 %in) { ; stores the low byte of a 64-bit argument to addrspace(3)
306entry:
307  %0 = trunc i64 %in to i8 ; discard the upper 56 bits of %in
308  store i8 %0, i8 addrspace(3)* %out ; i8 store of the truncated value
309  ret void
310}
311
312; FUNC-LABEL: {{^}}store_local_i64_i16:
313; EG: LDS_SHORT_WRITE
314; SI: ds_write_b16
315define void @store_local_i64_i16(i16 addrspace(3)* %out, i64 %in) { ; stores the low 16 bits of a 64-bit argument to addrspace(3)
316entry:
317  %0 = trunc i64 %in to i16 ; discard the upper 48 bits of %in
318  store i16 %0, i16 addrspace(3)* %out ; i16 store of the truncated value
319  ret void
320}
321
322; The stores in this function are combined by the optimizer to create a
323; 64-bit store with 32-bit alignment.  This is legal for SI and the legalizer
324; should not try to split the 64-bit store back into 2 32-bit stores.
325;
326; Evergreen / Northern Islands don't support 64-bit stores yet, so there should
327; be two 32-bit stores.
328
329; EG-LABEL: {{^}}vecload2:
330; EG: MEM_RAT_CACHELESS STORE_RAW
331; CM-LABEL: {{^}}vecload2:
332; CM: MEM_RAT_CACHELESS STORE_DWORD
333; SI-LABEL: {{^}}vecload2:
334; SI: buffer_store_dwordx2
335define void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 { ; copies two consecutive i32 values from %mem (addrspace 2) to %out (addrspace 1)
336entry:
337  %0 = load i32 addrspace(2)* %mem, align 4 ; first source element, %mem[0]
338  %arrayidx1.i = getelementptr inbounds i32 addrspace(2)* %mem, i64 1 ; address of %mem[1]
339  %1 = load i32 addrspace(2)* %arrayidx1.i, align 4 ; second source element, %mem[1]
340  store i32 %0, i32 addrspace(1)* %out, align 4 ; %out[0] = %mem[0], only 4-byte aligned
341  %arrayidx1 = getelementptr inbounds i32 addrspace(1)* %out, i64 1 ; address of %out[1], directly after the first store
342  store i32 %1, i32 addrspace(1)* %arrayidx1, align 4 ; %out[1] = %mem[1]; adjacent to the previous store, which is the pair the comment before this function describes being combined
343  ret void
344}
345
346attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } ; attribute group #0, referenced by @vecload2
347
348; When i128 was a legal type, this program generated "cannot select" errors:
349
350; FUNC-LABEL: {{^}}"i128-const-store":
351; FIXME: We should be able to do this with one store instruction
352; EG: STORE_RAW
353; EG: STORE_RAW
354; EG: STORE_RAW
355; EG: STORE_RAW
356; CM: STORE_DWORD
357; CM: STORE_DWORD
358; CM: STORE_DWORD
359; CM: STORE_DWORD
360; SI: buffer_store_dwordx2
361; SI: buffer_store_dwordx2
362define void @i128-const-store(i32 addrspace(1)* %out) { ; writes four constant i32 values (1, 1, 2, 2) to consecutive elements of %out, 128 bits in total
363entry:
364  store i32 1, i32 addrspace(1)* %out, align 4 ; %out[0] = 1
365  %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %out, i64 1 ; address of %out[1]
366  store i32 1, i32 addrspace(1)* %arrayidx2, align 4 ; %out[1] = 1
367  %arrayidx4 = getelementptr inbounds i32 addrspace(1)* %out, i64 2 ; address of %out[2]
368  store i32 2, i32 addrspace(1)* %arrayidx4, align 4 ; %out[2] = 2
369  %arrayidx6 = getelementptr inbounds i32 addrspace(1)* %out, i64 3 ; address of %out[3]
370  store i32 2, i32 addrspace(1)* %arrayidx6, align 4 ; %out[3] = 2; all four stores are contiguous and only 4-byte aligned
371  ret void
372}
373