1; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2
3declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
4
; Make sure we don't turn the 32-bit argument load into a 16-bit
; load. There aren't extending scalar loads, so that would require
; using a buffer_load instruction.
8
9; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i16:
10; SI: s_load_dword s
11; SI: buffer_store_short v
; Kernel: narrows the 32-bit argument %arg to i16 and writes it to %out.
define amdgpu_kernel void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i32 %arg) nounwind {
  %lo16 = trunc i32 %arg to i16
  store i16 %lo16, i16 addrspace(1)* %out
  ret void
}
17
18; It should be OK (and probably performance neutral) to reduce this,
19; but we don't know if the load is uniform yet.
20
21; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i16:
22; SI: buffer_load_dword v
23; SI: buffer_store_short v
; Kernel: reads the i32 at index workitem.id.x of %in, truncates it to i16,
; and writes the result to the same index of %out.
define amdgpu_kernel void @truncate_buffer_load_i32_to_i16(i16 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
  %idx = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %src = getelementptr i32, i32 addrspace(1)* %in, i32 %idx
  %dst = getelementptr i16, i16 addrspace(1)* %out, i32 %idx
  %wide = load i32, i32 addrspace(1)* %src
  %lo16 = trunc i32 %wide to i16
  store i16 %lo16, i16 addrspace(1)* %dst
  ret void
}
33
34; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i8:
35; SI: s_load_dword s
36; SI: buffer_store_byte v
; Kernel: narrows the 32-bit argument %arg to i8 and writes it to %out.
define amdgpu_kernel void @truncate_kernarg_i32_to_i8(i8 addrspace(1)* %out, i32 %arg) nounwind {
  %lo8 = trunc i32 %arg to i8
  store i8 %lo8, i8 addrspace(1)* %out
  ret void
}
42
43; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i8:
44; SI: buffer_load_dword v
45; SI: buffer_store_byte v
; Kernel: reads the i32 at index workitem.id.x of %in, truncates it to i8,
; and writes the result to the same index of %out.
define amdgpu_kernel void @truncate_buffer_load_i32_to_i8(i8 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
  %idx = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %src = getelementptr i32, i32 addrspace(1)* %in, i32 %idx
  %dst = getelementptr i8, i8 addrspace(1)* %out, i32 %idx
  %wide = load i32, i32 addrspace(1)* %src
  %lo8 = trunc i32 %wide to i8
  store i8 %lo8, i8 addrspace(1)* %dst
  ret void
}
55
56; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i1:
57; SI: s_load_dword s
58; SI: buffer_store_byte v
; Kernel: narrows the 32-bit argument %arg to i1 (its lowest bit) and
; writes it to %out.
define amdgpu_kernel void @truncate_kernarg_i32_to_i1(i1 addrspace(1)* %out, i32 %arg) nounwind {
  %bit0 = trunc i32 %arg to i1
  store i1 %bit0, i1 addrspace(1)* %out
  ret void
}
64
65; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i1:
66; SI: buffer_load_dword v
67; SI: buffer_store_byte v
; Kernel: reads the i32 at index workitem.id.x of %in, truncates it to i1,
; and writes the result to the same index of %out.
define amdgpu_kernel void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
  %idx = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %src = getelementptr i32, i32 addrspace(1)* %in, i32 %idx
  %dst = getelementptr i1, i1 addrspace(1)* %out, i32 %idx
  %wide = load i32, i32 addrspace(1)* %src
  %bit0 = trunc i32 %wide to i1
  store i1 %bit0, i1 addrspace(1)* %dst
  ret void
}
77
78; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i32:
79; SI: s_load_dword s
80; SI: buffer_store_dword v
; Kernel: narrows the 64-bit argument %arg to i32 and writes it to %out.
; An unnamed [8 x i32] argument sits between %out and %arg (presumably
; kernarg padding -- confirm).
define amdgpu_kernel void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind {
  %lo32 = trunc i64 %arg to i32
  store i32 %lo32, i32 addrspace(1)* %out
  ret void
}
86
87; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i32:
88; SI: buffer_load_dword v
89; SI: buffer_store_dword v
; Kernel: reads the i64 at index workitem.id.x of %in, truncates it to i32,
; and writes the result to the same index of %out.
define amdgpu_kernel void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
  %idx = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %src = getelementptr i64, i64 addrspace(1)* %in, i32 %idx
  %dst = getelementptr i32, i32 addrspace(1)* %out, i32 %idx
  %wide = load i64, i64 addrspace(1)* %src
  %lo32 = trunc i64 %wide to i32
  store i32 %lo32, i32 addrspace(1)* %dst
  ret void
}
99
100; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i32:
101; SI: s_load_dword s
102; SI: buffer_store_dword v
; Kernel: shifts the 64-bit argument %arg right by 32 and truncates to i32,
; i.e. writes the upper 32 bits of %arg to %out.
; An unnamed [8 x i32] argument sits between %out and %arg (presumably
; kernarg padding -- confirm).
define amdgpu_kernel void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind {
  %shifted = lshr i64 %arg, 32
  %hi32 = trunc i64 %shifted to i32
  store i32 %hi32, i32 addrspace(1)* %out
  ret void
}
109
110; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i32:
111; SI: buffer_load_dword v
112; SI: buffer_store_dword v
; Kernel: reads the i64 at index workitem.id.x of %in, shifts it right by 32,
; truncates to i32 (the upper 32 bits of the loaded value), and writes the
; result to the same index of %out.
define amdgpu_kernel void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
  %idx = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %src = getelementptr i64, i64 addrspace(1)* %in, i32 %idx
  %dst = getelementptr i32, i32 addrspace(1)* %out, i32 %idx
  %wide = load i64, i64 addrspace(1)* %src
  %shifted = lshr i64 %wide, 32
  %hi32 = trunc i64 %shifted to i32
  store i32 %hi32, i32 addrspace(1)* %dst
  ret void
}
123
124; Might as well reduce to 8-bit loads.
125; FUNC-LABEL: {{^}}truncate_kernarg_i16_to_i8:
126; SI: s_load_dword s
127; SI: buffer_store_byte v
; Kernel: narrows the 16-bit argument %arg to i8 and writes it to %out.
define amdgpu_kernel void @truncate_kernarg_i16_to_i8(i8 addrspace(1)* %out, i16 %arg) nounwind {
  %lo8 = trunc i16 %arg to i8
  store i8 %lo8, i8 addrspace(1)* %out
  ret void
}
133
134; FUNC-LABEL: {{^}}truncate_buffer_load_i16_to_i8:
135; SI: buffer_load_ubyte v
136; SI: buffer_store_byte v
; Kernel: reads the i16 at index workitem.id.x of %in, truncates it to i8,
; and writes the result to the same index of %out.
define amdgpu_kernel void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
  %idx = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %src = getelementptr i16, i16 addrspace(1)* %in, i32 %idx
  %dst = getelementptr i8, i8 addrspace(1)* %out, i32 %idx
  %half = load i16, i16 addrspace(1)* %src
  %lo8 = trunc i16 %half to i8
  store i8 %lo8, i8 addrspace(1)* %dst
  ret void
}
146
147; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i8:
148; SI: s_load_dword s
149; SI: buffer_store_byte v
; Kernel: shifts the 64-bit argument %arg right by 32 and truncates to i8,
; i.e. writes bits 32..39 of %arg to %out.
; An unnamed [8 x i32] argument sits between %out and %arg (presumably
; kernarg padding -- confirm).
define amdgpu_kernel void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind {
  %shifted = lshr i64 %arg, 32
  %byte = trunc i64 %shifted to i8
  store i8 %byte, i8 addrspace(1)* %out
  ret void
}
156
157; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i8:
158; SI: buffer_load_dword v
159; SI: buffer_store_byte v
; Kernel: reads the i64 at index workitem.id.x of %in, shifts it right by 32,
; truncates to i8 (bits 32..39 of the loaded value), and writes the result
; to the same index of %out.
define amdgpu_kernel void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
  %idx = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %src = getelementptr i64, i64 addrspace(1)* %in, i32 %idx
  %dst = getelementptr i8, i8 addrspace(1)* %out, i32 %idx
  %wide = load i64, i64 addrspace(1)* %src
  %shifted = lshr i64 %wide, 32
  %byte = trunc i64 %shifted to i8
  store i8 %byte, i8 addrspace(1)* %dst
  ret void
}
170
171; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i8:
172; SI: s_load_dword s
173; SI: buffer_store_byte v
; Kernel: narrows the 64-bit argument %arg to i8 and writes it to %out.
; An unnamed [8 x i32] argument sits between %out and %arg (presumably
; kernarg padding -- confirm).
define amdgpu_kernel void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind {
  %lo8 = trunc i64 %arg to i8
  store i8 %lo8, i8 addrspace(1)* %out
  ret void
}
179
180; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i8:
181; SI: buffer_load_dword v
182; SI: buffer_store_byte v
; Kernel: reads the i64 at index workitem.id.x of %in, truncates it to i8,
; and writes the result to the same index of %out.
define amdgpu_kernel void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
  %idx = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %src = getelementptr i64, i64 addrspace(1)* %in, i32 %idx
  %dst = getelementptr i8, i8 addrspace(1)* %out, i32 %idx
  %wide = load i64, i64 addrspace(1)* %src
  %lo8 = trunc i64 %wide to i8
  store i8 %lo8, i8 addrspace(1)* %dst
  ret void
}
192
; The label pattern ends in ':' like every other label in this file, so it
; cannot also match a longer symbol that merely starts with this name.
; FUNC-LABEL: {{^}}smrd_mask_i32_to_i16:
; SI: s_load_dword [[LOAD:s[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0x0
; SI: s_waitcnt lgkmcnt(0)
; SI: s_and_b32 s{{[0-9]+}}, [[LOAD]], 0xffff

; Kernel: loads an i32 through the addrspace(4) pointer %in, masks it with
; 65535 (0xffff) to keep the low 16 bits, and stores the i32 result to %out.
; The checks above expect the load to stay a full dword with the mask
; applied afterwards as a separate scalar AND.
define amdgpu_kernel void @smrd_mask_i32_to_i16(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
entry:
  %val = load i32, i32 addrspace(4)* %in
  %mask = and i32 %val, 65535
  store i32 %mask, i32 addrspace(1)* %out
  ret void
}
204
205; FUNC-LABEL: {{^}}extract_hi_i64_bitcast_v2i32:
206; SI: buffer_load_dword v
207; SI: buffer_store_dword v
; Kernel: loads a <2 x i32> from %in, reinterprets it as an i64, shifts right
; by 32 and truncates to i32 (the upper 32 bits of the bitcast value), then
; stores that i32 to %out.
define amdgpu_kernel void @extract_hi_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
  %vec = load <2 x i32>, <2 x i32> addrspace(1)* %in
  %as.i64 = bitcast <2 x i32> %vec to i64
  %shifted = lshr i64 %as.i64, 32
  %hi32 = trunc i64 %shifted to i32
  store i32 %hi32, i32 addrspace(1)* %out
  ret void
}
216