; RUN: llc -amdgpu-load-store-vectorizer=0 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s

; Intrinsics exercised by this test. The two permlane intrinsics reference
; attribute group #0 (nounwind readnone convergent), defined at the end of
; the file.
declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1) #0
declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1) #0
; Work-item id queries. The checks below expect their results in v0 (x) and
; v1 (y).
declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.workitem.id.y()

; permlane16 where the third and fourth operands (%src1, %src2) are kernel
; arguments. The checks expect both to be emitted as SGPR operands, with no
; v_readfirstlane_b32 before the instruction and nothing after the last operand.
; GCN-LABEL: {{^}}v_permlane16_b32_vss:
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
define amdgpu_kernel void @v_permlane16_b32_vss(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; permlane16 where the third and fourth operands are the small constants 1
; and 2. The checks expect them to be encoded directly in the instruction,
; with no v_readfirstlane_b32 emitted.
; GCN-LABEL: {{^}}v_permlane16_b32_vii:
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2{{$}}
define amdgpu_kernel void @v_permlane16_b32_vii(i32 addrspace(1)* %out, i32 %src0) #1 {
  %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 1, i32 2, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; permlane16 where the third and fourth operands are the constants 4660
; (0x1234) and 49617 (0xc1d1). The checks currently expect each constant to
; be moved into an SGPR first (in either order) and those SGPRs to be used as
; the operands; see the FIXME below.
; GCN-LABEL: {{^}}v_permlane16_b32_vll:
; FIXME-GFX10: It is allowed to have both immediates as literals
; GFX10-DAG: s_movk_i32 [[SRC1:s[0-9]+]], 0x1234
; GFX10-DAG: s_mov_b32 [[SRC2:s[0-9]+]], 0xc1d1
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], [[SRC2]]{{$}}
define amdgpu_kernel void @v_permlane16_b32_vll(i32 addrspace(1)* %out, i32 %src0) #1 {
  %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; permlane16 where the third and fourth operands are the work-item ids x and
; y. The checks expect each to be copied from v0 / v1 into an SGPR with
; v_readfirstlane_b32 (in either order) and those SGPRs to be used as the
; operands.
; GCN-LABEL: {{^}}v_permlane16_b32_vvv:
; GFX10-DAG: v_readfirstlane_b32 [[SRC1:s[0-9]+]], v0
; GFX10-DAG: v_readfirstlane_b32 [[SRC2:s[0-9]+]], v1
; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], [[SRC2]]{{$}}
define amdgpu_kernel void @v_permlane16_b32_vvv(i32 addrspace(1)* %out, i32 %src0) #1 {
  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
  %tidy = call i32 @llvm.amdgcn.workitem.id.y()
  %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %tidx, i32 %tidy, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; permlane16 where the third operand is work-item id x and the fourth is the
; kernel argument %src2. The checks expect exactly one v_readfirstlane_b32
; (from v0) feeding the third operand, and an SGPR for the fourth.
; GCN-LABEL: {{^}}v_permlane16_b32_vvs:
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_readfirstlane_b32 [[SRC1:s[0-9]+]], v0
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], s{{[0-9]+}}{{$}}
define amdgpu_kernel void @v_permlane16_b32_vvs(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #1 {
  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
  %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; permlane16 where the third operand is the kernel argument %src1 and the
; fourth is work-item id y. The checks expect exactly one v_readfirstlane_b32
; (from v1) feeding the fourth operand, and an SGPR for the third.
; GCN-LABEL: {{^}}v_permlane16_b32_vsv:
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_readfirstlane_b32 [[SRC2:s[0-9]+]], v1
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, [[SRC2]]{{$}}
define amdgpu_kernel void @v_permlane16_b32_vsv(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
  %tidy = call i32 @llvm.amdgcn.workitem.id.y()
  %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; permlane16 with SGPR third/fourth operands and the fifth (i1) argument set.
; The checks expect the instruction to carry op_sel:[1,0] and no
; v_readfirstlane_b32 to be emitted.
; GCN-LABEL: {{^}}v_permlane16_b32_vss_fi:
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,0]{{$}}
define amdgpu_kernel void @v_permlane16_b32_vss_fi(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 1, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; permlane16 with SGPR third/fourth operands and the sixth (i1) argument set.
; The checks expect the instruction to carry op_sel:[0,1] and no
; v_readfirstlane_b32 to be emitted.
; GCN-LABEL: {{^}}v_permlane16_b32_vss_bc:
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[0,1]{{$}}
define amdgpu_kernel void @v_permlane16_b32_vss_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 0, i1 1)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; permlane16 with SGPR third/fourth operands and both trailing i1 arguments
; set. The checks expect the instruction to carry op_sel:[1,1] and no
; v_readfirstlane_b32 to be emitted.
; GCN-LABEL: {{^}}v_permlane16_b32_vss_fi_bc:
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,1]{{$}}
define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 1, i1 1)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; permlanex16 where the third and fourth operands (%src1, %src2) are kernel
; arguments. The checks expect both to be emitted as SGPR operands, with no
; v_readfirstlane_b32 before the instruction and nothing after the last operand.
; GCN-LABEL: {{^}}v_permlanex16_b32_vss:
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
define amdgpu_kernel void @v_permlanex16_b32_vss(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; permlanex16 where the third and fourth operands are the small constants 1
; and 2. The checks expect them to be encoded directly in the instruction,
; with no v_readfirstlane_b32 emitted.
; GCN-LABEL: {{^}}v_permlanex16_b32_vii:
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2{{$}}
define amdgpu_kernel void @v_permlanex16_b32_vii(i32 addrspace(1)* %out, i32 %src0) #1 {
  %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 1, i32 2, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; permlanex16 where the third and fourth operands are the constants 4660
; (0x1234) and 49617 (0xc1d1). The checks currently expect each constant to
; be moved into an SGPR first (in either order) and those SGPRs to be used as
; the operands; see the FIXME below.
; GCN-LABEL: {{^}}v_permlanex16_b32_vll:
; FIXME-GFX10: It is allowed to have both immediates as literals
; GFX10-DAG: s_movk_i32 [[SRC1:s[0-9]+]], 0x1234
; GFX10-DAG: s_mov_b32 [[SRC2:s[0-9]+]], 0xc1d1
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], [[SRC2]]{{$}}
define amdgpu_kernel void @v_permlanex16_b32_vll(i32 addrspace(1)* %out, i32 %src0) #1 {
  %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; permlanex16 where the third and fourth operands are the work-item ids x and
; y. The checks expect each to be copied from v0 / v1 into an SGPR with
; v_readfirstlane_b32 (in either order) and those SGPRs to be used as the
; operands.
; GCN-LABEL: {{^}}v_permlanex16_b32_vvv:
; GFX10-DAG: v_readfirstlane_b32 [[SRC1:s[0-9]+]], v0
; GFX10-DAG: v_readfirstlane_b32 [[SRC2:s[0-9]+]], v1
; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], [[SRC2]]{{$}}
define amdgpu_kernel void @v_permlanex16_b32_vvv(i32 addrspace(1)* %out, i32 %src0) #1 {
  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
  %tidy = call i32 @llvm.amdgcn.workitem.id.y()
  %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %tidx, i32 %tidy, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; permlanex16 where the third operand is work-item id x and the fourth is the
; kernel argument %src2. The checks expect exactly one v_readfirstlane_b32
; (from v0) feeding the third operand, and an SGPR for the fourth.
; GCN-LABEL: {{^}}v_permlanex16_b32_vvs:
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_readfirstlane_b32 [[SRC1:s[0-9]+]], v0
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], s{{[0-9]+}}{{$}}
define amdgpu_kernel void @v_permlanex16_b32_vvs(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #1 {
  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
  %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; permlanex16 where the third operand is the kernel argument %src1 and the
; fourth is work-item id y. The checks expect exactly one v_readfirstlane_b32
; (from v1) feeding the fourth operand, and an SGPR for the third.
; GCN-LABEL: {{^}}v_permlanex16_b32_vsv:
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_readfirstlane_b32 [[SRC2:s[0-9]+]], v1
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, [[SRC2]]{{$}}
define amdgpu_kernel void @v_permlanex16_b32_vsv(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
  %tidy = call i32 @llvm.amdgcn.workitem.id.y()
  %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; permlanex16 with SGPR third/fourth operands and the fifth (i1) argument set.
; The checks expect the instruction to carry op_sel:[1,0] and no
; v_readfirstlane_b32 to be emitted.
; GCN-LABEL: {{^}}v_permlanex16_b32_vss_fi:
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,0]{{$}}
define amdgpu_kernel void @v_permlanex16_b32_vss_fi(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 1, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; permlanex16 with SGPR third/fourth operands and the sixth (i1) argument set.
; The checks expect the instruction to carry op_sel:[0,1] and no
; v_readfirstlane_b32 to be emitted.
; GCN-LABEL: {{^}}v_permlanex16_b32_vss_bc:
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[0,1]{{$}}
define amdgpu_kernel void @v_permlanex16_b32_vss_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 0, i1 1)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; permlanex16 with SGPR third/fourth operands and both trailing i1 arguments
; set. The checks expect the instruction to carry op_sel:[1,1] and no
; v_readfirstlane_b32 to be emitted.
; GCN-LABEL: {{^}}v_permlanex16_b32_vss_fi_bc:
; GFX10-NOT: v_readfirstlane_b32
; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,1]{{$}}
define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 1, i1 1)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; permlane16 where work-item id x is passed as both the first and the second
; operand. The checks expect v0 to be used as both destination and source of
; the instruction.
; GCN-LABEL: {{^}}v_permlane16_b32_tid_tid:
; GFX10: v_permlane16_b32 v0, v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
define amdgpu_kernel void @v_permlane16_b32_tid_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
  %v = call i32 @llvm.amdgcn.permlane16(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; permlane16 where the first operand is undef and the second is work-item id
; x. The checks accept any destination VGPR and expect v0 as the source.
; GCN-LABEL: {{^}}v_permlane16_b32_undef_tid:
; GFX10: v_permlane16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
define amdgpu_kernel void @v_permlane16_b32_undef_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
  %v = call i32 @llvm.amdgcn.permlane16(i32 undef, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; permlane16 where the first operand is the constant 12345 (0x3039) and the
; second is work-item id x. The checks expect the constant to be moved into a
; VGPR and that same VGPR to be the destination of the instruction.
; GCN-LABEL: {{^}}v_permlane16_b32_i_tid:
; GFX10: v_mov_b32_e32 [[OLD:v[0-9]+]], 0x3039
; GFX10: v_permlane16_b32 [[OLD]], v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
define amdgpu_kernel void @v_permlane16_b32_i_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
  %v = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; Same call as v_permlane16_b32_i_tid but with the fifth (i1) argument set.
; The checks expect the constant 0x3039 not to appear before the instruction,
; and the instruction to carry op_sel:[1,0].
; GCN-LABEL: {{^}}v_permlane16_b32_i_tid_fi:
; GFX10-NOT: 0x3039
; GFX10: v_permlane16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,0]{{$}}
define amdgpu_kernel void @v_permlane16_b32_i_tid_fi(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
  %v = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 1, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; Same call as v_permlane16_b32_i_tid but with the sixth (i1) argument set.
; The checks expect the constant 0x3039 not to appear before the instruction,
; and the instruction to carry op_sel:[0,1].
; GCN-LABEL: {{^}}v_permlane16_b32_i_tid_bc:
; GFX10-NOT: 0x3039
; GFX10: v_permlane16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[0,1]{{$}}
define amdgpu_kernel void @v_permlane16_b32_i_tid_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
  %v = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 1)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; Same call as v_permlane16_b32_i_tid but with both trailing i1 arguments set.
; The checks expect the constant 0x3039 not to appear before the instruction,
; and the instruction to carry op_sel:[1,1].
; GCN-LABEL: {{^}}v_permlane16_b32_i_tid_fi_bc:
; GFX10-NOT: 0x3039
; GFX10: v_permlane16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,1]{{$}}
define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
  %v = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 1, i1 1)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; permlanex16 where work-item id x is passed as both the first and the second
; operand. The checks expect v0 to be used as both destination and source of
; the instruction.
; GCN-LABEL: {{^}}v_permlanex16_b32_tid_tid:
; GFX10: v_permlanex16_b32 v0, v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
define amdgpu_kernel void @v_permlanex16_b32_tid_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
  %v = call i32 @llvm.amdgcn.permlanex16(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; permlanex16 where the first operand is undef and the second is work-item id
; x. The checks accept any destination VGPR and expect v0 as the source.
; GCN-LABEL: {{^}}v_permlanex16_b32_undef_tid:
; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
define amdgpu_kernel void @v_permlanex16_b32_undef_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
  %v = call i32 @llvm.amdgcn.permlanex16(i32 undef, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; permlanex16 where the first operand is the constant 12345 (0x3039) and the
; second is work-item id x. The checks expect the constant to be moved into a
; VGPR and that same VGPR to be the destination of the instruction.
; GCN-LABEL: {{^}}v_permlanex16_b32_i_tid:
; GFX10: v_mov_b32_e32 [[OLD:v[0-9]+]], 0x3039
; GFX10: v_permlanex16_b32 [[OLD]], v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
define amdgpu_kernel void @v_permlanex16_b32_i_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
  %v = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; Same call as v_permlanex16_b32_i_tid but with the fifth (i1) argument set.
; The checks expect the constant 0x3039 not to appear before the instruction,
; and the instruction to carry op_sel:[1,0].
; GCN-LABEL: {{^}}v_permlanex16_b32_i_tid_fi:
; GFX10-NOT: 0x3039
; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,0]{{$}}
define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
  %v = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 1, i1 0)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; Same call as v_permlanex16_b32_i_tid but with the sixth (i1) argument set.
; The checks expect the constant 0x3039 not to appear before the instruction,
; and the instruction to carry op_sel:[0,1].
; GCN-LABEL: {{^}}v_permlanex16_b32_i_tid_bc:
; GFX10-NOT: 0x3039
; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[0,1]{{$}}
define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
  %v = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 1)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; Same call as v_permlanex16_b32_i_tid but with both trailing i1 arguments
; set. The checks expect the constant 0x3039 not to appear before the
; instruction, and the instruction to carry op_sel:[1,1].
; GCN-LABEL: {{^}}v_permlanex16_b32_i_tid_fi_bc:
; GFX10-NOT: 0x3039
; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,1]{{$}}
define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
  %v = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 1, i1 1)
  store i32 %v, i32 addrspace(1)* %out
  ret void
}

; Attribute groups referenced as #N elsewhere in this file.
; #0: does not unwind, accesses no memory, and is convergent.
attributes #0 = { nounwind readnone convergent }
; #1: does not unwind; attached to every kernel definition in this file.
attributes #1 = { nounwind }