1; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SI,SICIVI,FUNC %s
2; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SICIVI,GFX89,FUNC %s
3; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,GFX89,FUNC %s
4; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s
5
6; Testing for ds_read/write_b128
7; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=CIVI,FUNC %s
8; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=CIVI,FUNC %s
9
10; FUNC-LABEL: {{^}}local_load_i16:
11; GFX9-NOT: m0
12; SICIVI: s_mov_b32 m0
13
14; GCN: ds_read_u16 v{{[0-9]+}}
15
16; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
17; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
18; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
19; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
20; EG: LDS_SHORT_WRITE {{\*?}} [[TO]], [[DATA]]
; Copy kernel - reads one i16 through %in and writes it unchanged through %out.
; Both pointers are in addrspace(3).
define amdgpu_kernel void @local_load_i16(i16 addrspace(3)* %out, i16 addrspace(3)* %in) {
entry:
  %val = load i16, i16 addrspace(3)* %in
  store i16 %val, i16 addrspace(3)* %out
  ret void
}
27
28; FUNC-LABEL: {{^}}local_load_v2i16:
29; GFX9-NOT: m0
30; SICIVI: s_mov_b32 m0
31
32; GCN: ds_read_b32
33
34; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
35; EG: LDS_READ_RET {{.*}} [[FROM]]
36; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
37; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
38; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
; Copy kernel - moves a <2 x i16> vector from %in to %out (both addrspace(3)).
define amdgpu_kernel void @local_load_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) {
entry:
  %vec = load <2 x i16>, <2 x i16> addrspace(3)* %in
  store <2 x i16> %vec, <2 x i16> addrspace(3)* %out
  ret void
}
45
46; FUNC-LABEL: {{^}}local_load_v3i16:
47; GFX9-NOT: m0
48; SICIVI: s_mov_b32 m0
49
50; GCN: ds_read_b64
51; GCN-DAG: ds_write_b32
52; GCN-DAG: ds_write_b16
53
54; EG-DAG: LDS_USHORT_READ_RET
55; EG-DAG: LDS_USHORT_READ_RET
; Copy kernel - moves a <3 x i16> vector from %in to %out (both addrspace(3)).
define amdgpu_kernel void @local_load_v3i16(<3 x i16> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
entry:
  %vec = load <3 x i16>, <3 x i16> addrspace(3)* %in
  store <3 x i16> %vec, <3 x i16> addrspace(3)* %out
  ret void
}
62
63; FUNC-LABEL: {{^}}local_load_v4i16:
64; GFX9-NOT: m0
65; SICIVI: s_mov_b32 m0
66
67; GCN: ds_read_b64
68
69; EG: LDS_READ_RET
70; EG: LDS_READ_RET
; Copy kernel - moves a <4 x i16> vector from %in to %out (both addrspace(3)).
define amdgpu_kernel void @local_load_v4i16(<4 x i16> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) {
entry:
  %vec = load <4 x i16>, <4 x i16> addrspace(3)* %in
  store <4 x i16> %vec, <4 x i16> addrspace(3)* %out
  ret void
}
77
78; FUNC-LABEL: {{^}}local_load_v8i16:
79; GFX9-NOT: m0
80; SICIVI: s_mov_b32 m0
81
82; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
83
84; EG: LDS_READ_RET
85; EG: LDS_READ_RET
86; EG: LDS_READ_RET
87; EG: LDS_READ_RET
; Copy kernel - moves an <8 x i16> vector from %in to %out (both addrspace(3)).
; No explicit alignment is given on either memory access.
define amdgpu_kernel void @local_load_v8i16(<8 x i16> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) {
entry:
  %vec = load <8 x i16>, <8 x i16> addrspace(3)* %in
  store <8 x i16> %vec, <8 x i16> addrspace(3)* %out
  ret void
}
94
95; FUNC-LABEL: {{^}}local_load_v16i16:
96; GFX9-NOT: m0
97; SICIVI: s_mov_b32 m0
98
99; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
100; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
101
102
103; EG: LDS_READ_RET
104; EG: LDS_READ_RET
105; EG: LDS_READ_RET
106; EG: LDS_READ_RET
107
108; EG: LDS_READ_RET
109; EG: LDS_READ_RET
110; EG: LDS_READ_RET
111; EG: LDS_READ_RET
; Copy kernel - moves a <16 x i16> vector from %in to %out (both addrspace(3)).
define amdgpu_kernel void @local_load_v16i16(<16 x i16> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) {
entry:
  %vec = load <16 x i16>, <16 x i16> addrspace(3)* %in
  store <16 x i16> %vec, <16 x i16> addrspace(3)* %out
  ret void
}
118
119; FUNC-LABEL: {{^}}local_zextload_i16_to_i32:
120; GFX9-NOT: m0
121; SICIVI: s_mov_b32 m0
122
123; GCN: ds_read_u16
124; GCN: ds_write_b32
125
126; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
127; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
128; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
129; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
130; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
; Loads an i16 from %in, zero-extends it to i32 and stores the result to %out.
define amdgpu_kernel void @local_zextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
  %narrow = load i16, i16 addrspace(3)* %in
  %wide = zext i16 %narrow to i32
  store i32 %wide, i32 addrspace(3)* %out
  ret void
}
137
138; FUNC-LABEL: {{^}}local_sextload_i16_to_i32:
139; GCN-NOT: s_wqm_b64
140
141; GFX9-NOT: m0
142; SICIVI: s_mov_b32 m0
143
144; GCN: ds_read_i16
145
146; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
147; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
148; EG-DAG: MOV {{[* ]*}}[[TMP:T[0-9]+\.[XYZW]]], OQAP
149; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
150; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal
151; EG: 16
152; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
; Loads an i16 from %in, sign-extends it to i32 and stores the result to %out.
define amdgpu_kernel void @local_sextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
  %narrow = load i16, i16 addrspace(3)* %in
  %wide = sext i16 %narrow to i32
  store i32 %wide, i32 addrspace(3)* %out
  ret void
}
159
160; FUNC-LABEL: {{^}}local_zextload_v1i16_to_v1i32:
161; GFX9-NOT: m0
162; SICIVI: s_mov_b32 m0
163
164; GCN: ds_read_u16
165
166; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
167; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
168; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
169; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
170; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
; Single-element vector variant - zero-extends <1 x i16> to <1 x i32>.
define amdgpu_kernel void @local_zextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
  %narrow = load <1 x i16>, <1 x i16> addrspace(3)* %in
  %wide = zext <1 x i16> %narrow to <1 x i32>
  store <1 x i32> %wide, <1 x i32> addrspace(3)* %out
  ret void
}
177
178; FUNC-LABEL: {{^}}local_sextload_v1i16_to_v1i32:
179; GFX9-NOT: m0
180; SICIVI: s_mov_b32 m0
181
182; GCN: ds_read_i16
183
184; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
185; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
186; EG-DAG: MOV {{[* ]*}}[[TMP:T[0-9]+\.[XYZW]]], OQAP
187; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
188; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal
189; EG: 16
190; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
; Single-element vector variant - sign-extends <1 x i16> to <1 x i32>.
define amdgpu_kernel void @local_sextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
  %narrow = load <1 x i16>, <1 x i16> addrspace(3)* %in
  %wide = sext <1 x i16> %narrow to <1 x i32>
  store <1 x i32> %wide, <1 x i32> addrspace(3)* %out
  ret void
}
197
198; FUNC-LABEL: {{^}}local_zextload_v2i16_to_v2i32:
199; GCN-NOT: s_wqm_b64
200; GFX9-NOT: m0
201; SICIVI: s_mov_b32 m0
202
203; GCN: ds_read_b32
204
205; EG: LDS_READ_RET
; Loads <2 x i16> from %in, zero-extends each lane to i32, stores to %out.
define amdgpu_kernel void @local_zextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
  %narrow = load <2 x i16>, <2 x i16> addrspace(3)* %in
  %wide = zext <2 x i16> %narrow to <2 x i32>
  store <2 x i32> %wide, <2 x i32> addrspace(3)* %out
  ret void
}
212
213; FUNC-LABEL: {{^}}local_sextload_v2i16_to_v2i32:
214; GCN-NOT: s_wqm_b64
215; GFX9-NOT: m0
216; SICIVI: s_mov_b32 m0
217
218; GCN: ds_read_b32
219
220; EG: LDS_READ_RET
221; EG: BFE_INT
222; EG: BFE_INT
; Loads <2 x i16> from %in, sign-extends each lane to i32, stores to %out.
define amdgpu_kernel void @local_sextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
  %narrow = load <2 x i16>, <2 x i16> addrspace(3)* %in
  %wide = sext <2 x i16> %narrow to <2 x i32>
  store <2 x i32> %wide, <2 x i32> addrspace(3)* %out
  ret void
}
229
230; FUNC-LABEL: {{^}}local_local_zextload_v3i16_to_v3i32:
231; GFX9-NOT: m0
232; SICIVI: s_mov_b32 m0
233
234; GCN: ds_read_b64
235; SI-DAG: ds_write_b32
236; SI-DAG: ds_write_b64
237; CIVI-DAG: ds_write_b96
238; GFX9-DAG: ds_write_b96
239
240; EG: LDS_USHORT_READ_RET
241; EG: LDS_USHORT_READ_RET
242; EG: LDS_USHORT_READ_RET
; Loads <3 x i16> from %in, zero-extends each lane to i32, stores to %out.
define amdgpu_kernel void @local_local_zextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
entry:
  %narrow = load <3 x i16>, <3 x i16> addrspace(3)* %in
  %wide = zext <3 x i16> %narrow to <3 x i32>
  store <3 x i32> %wide, <3 x i32> addrspace(3)* %out
  ret void
}
250
251; FUNC-LABEL: {{^}}local_local_sextload_v3i16_to_v3i32:
252; GFX9-NOT: m0
253; SICIVI: s_mov_b32 m0
254
255; GCN: ds_read_b64
256; SI-DAG: ds_write_b32
257; SI-DAG: ds_write_b64
258; CIVI-DAG: ds_write_b96
259; GFX9-DAG: ds_write_b96
260
261; EG: LDS_USHORT_READ_RET
262; EG: LDS_USHORT_READ_RET
263; EG: LDS_USHORT_READ_RET
264; EG-DAG: BFE_INT
265; EG-DAG: BFE_INT
266; EG-DAG: BFE_INT
; Loads <3 x i16> from %in, sign-extends each lane to i32, stores to %out.
define amdgpu_kernel void @local_local_sextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
entry:
  %narrow = load <3 x i16>, <3 x i16> addrspace(3)* %in
  %wide = sext <3 x i16> %narrow to <3 x i32>
  store <3 x i32> %wide, <3 x i32> addrspace(3)* %out
  ret void
}
274
275; FUNC-LABEL: {{^}}local_local_zextload_v4i16_to_v4i32:
276; GCN-NOT: s_wqm_b64
277; GFX9-NOT: m0
278; SICIVI: s_mov_b32 m0
279
280; GCN: ds_read_b64
281
282; EG: LDS_READ_RET
283; EG: LDS_READ_RET
; Loads <4 x i16> from %in, zero-extends each lane to i32, stores to %out.
define amdgpu_kernel void @local_local_zextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
  %narrow = load <4 x i16>, <4 x i16> addrspace(3)* %in
  %wide = zext <4 x i16> %narrow to <4 x i32>
  store <4 x i32> %wide, <4 x i32> addrspace(3)* %out
  ret void
}
290
291; FUNC-LABEL: {{^}}local_sextload_v4i16_to_v4i32:
292; GCN-NOT: s_wqm_b64
293; GFX9-NOT: m0
294; SICIVI: s_mov_b32 m0
295
296; GCN: ds_read_b64
297
298; EG: LDS_READ_RET
299; EG: LDS_READ_RET
300; EG-DAG: BFE_INT
301; EG-DAG: BFE_INT
302; EG-DAG: BFE_INT
303; EG-DAG: BFE_INT
; Loads <4 x i16> from %in, sign-extends each lane to i32, stores to %out.
define amdgpu_kernel void @local_sextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
  %narrow = load <4 x i16>, <4 x i16> addrspace(3)* %in
  %wide = sext <4 x i16> %narrow to <4 x i32>
  store <4 x i32> %wide, <4 x i32> addrspace(3)* %out
  ret void
}
310
311; FUNC-LABEL: {{^}}local_zextload_v8i16_to_v8i32:
312; GFX9-NOT: m0
313; SICIVI: s_mov_b32 m0
314
315; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
316
317; EG: LDS_READ_RET
318; EG: LDS_READ_RET
319; EG: LDS_READ_RET
320; EG: LDS_READ_RET
; Loads <8 x i16> from %in, zero-extends each lane to i32, stores to %out.
define amdgpu_kernel void @local_zextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
  %narrow = load <8 x i16>, <8 x i16> addrspace(3)* %in
  %wide = zext <8 x i16> %narrow to <8 x i32>
  store <8 x i32> %wide, <8 x i32> addrspace(3)* %out
  ret void
}
327
328; FUNC-LABEL: {{^}}local_sextload_v8i16_to_v8i32:
329; GFX9-NOT: m0
330; SICIVI: s_mov_b32 m0
331
332; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
333
334; EG: LDS_READ_RET
335; EG: LDS_READ_RET
336; EG: LDS_READ_RET
337; EG: LDS_READ_RET
338; EG-DAG: BFE_INT
339; EG-DAG: BFE_INT
340; EG-DAG: BFE_INT
341; EG-DAG: BFE_INT
342; EG-DAG: BFE_INT
343; EG-DAG: BFE_INT
344; EG-DAG: BFE_INT
345; EG-DAG: BFE_INT
; Loads <8 x i16> from %in, sign-extends each lane to i32, stores to %out.
define amdgpu_kernel void @local_sextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
  %narrow = load <8 x i16>, <8 x i16> addrspace(3)* %in
  %wide = sext <8 x i16> %narrow to <8 x i32>
  store <8 x i32> %wide, <8 x i32> addrspace(3)* %out
  ret void
}
352
353; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i32:
354; GFX9-NOT: m0
355; SICIVI: s_mov_b32 m0
356
357; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
358; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
359
360; GCN: ds_write2_b64
361; GCN: ds_write2_b64
362; GCN: ds_write2_b64
363; GCN: ds_write2_b64
364
365; EG: LDS_READ_RET
366; EG: LDS_READ_RET
367; EG: LDS_READ_RET
368; EG: LDS_READ_RET
369; EG: LDS_READ_RET
370; EG: LDS_READ_RET
371; EG: LDS_READ_RET
372; EG: LDS_READ_RET
; Loads <16 x i16> from %in, zero-extends each lane to i32, stores to %out.
define amdgpu_kernel void @local_zextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
  %narrow = load <16 x i16>, <16 x i16> addrspace(3)* %in
  %wide = zext <16 x i16> %narrow to <16 x i32>
  store <16 x i32> %wide, <16 x i32> addrspace(3)* %out
  ret void
}
379
380; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i32:
381; GFX9-NOT: m0
382; SICIVI: s_mov_b32 m0
383
384
385; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
386; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
387
388; EG: LDS_READ_RET
389; EG: LDS_READ_RET
390; EG: LDS_READ_RET
391; EG: LDS_READ_RET
392; EG: LDS_READ_RET
393; EG: LDS_READ_RET
394; EG: LDS_READ_RET
395; EG: LDS_READ_RET
396; EG-DAG: BFE_INT
397; EG-DAG: BFE_INT
398; EG-DAG: BFE_INT
399; EG-DAG: BFE_INT
400; EG-DAG: BFE_INT
401; EG-DAG: BFE_INT
402; EG-DAG: BFE_INT
403; EG-DAG: BFE_INT
404; EG-DAG: BFE_INT
405; EG-DAG: BFE_INT
406; EG-DAG: BFE_INT
407; EG-DAG: BFE_INT
408; EG-DAG: BFE_INT
409; EG-DAG: BFE_INT
410; EG-DAG: BFE_INT
411; EG-DAG: BFE_INT
; Loads <16 x i16> from %in, sign-extends each lane to i32, stores to %out.
define amdgpu_kernel void @local_sextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
  %narrow = load <16 x i16>, <16 x i16> addrspace(3)* %in
  %wide = sext <16 x i16> %narrow to <16 x i32>
  store <16 x i32> %wide, <16 x i32> addrspace(3)* %out
  ret void
}
418
419; FUNC-LABEL: {{^}}local_zextload_v32i16_to_v32i32:
420; GFX9-NOT: m0
421; SICIVI: s_mov_b32 m0
422
423; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
424; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3
425; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
426; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
427
428; EG: LDS_READ_RET
429; EG: LDS_READ_RET
430; EG: LDS_READ_RET
431; EG: LDS_READ_RET
432; EG: LDS_READ_RET
433; EG: LDS_READ_RET
434; EG: LDS_READ_RET
435; EG: LDS_READ_RET
436; EG: LDS_READ_RET
437; EG: LDS_READ_RET
438; EG: LDS_READ_RET
439; EG: LDS_READ_RET
440; EG: LDS_READ_RET
441; EG: LDS_READ_RET
442; EG: LDS_READ_RET
443; EG: LDS_READ_RET
; Loads <32 x i16> from %in, zero-extends each lane to i32, stores to %out.
define amdgpu_kernel void @local_zextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
  %narrow = load <32 x i16>, <32 x i16> addrspace(3)* %in
  %wide = zext <32 x i16> %narrow to <32 x i32>
  store <32 x i32> %wide, <32 x i32> addrspace(3)* %out
  ret void
}
450
451; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i32:
452; GFX9-NOT: m0
453; SICIVI: s_mov_b32 m0
454
455; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
456; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
457; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
458; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
459; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15
460; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:12 offset1:13
461; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:10 offset1:11
462; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:8 offset1:9
463; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7
464; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5
465; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
466; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1
467
468; EG: LDS_READ_RET
469; EG: LDS_READ_RET
470; EG: LDS_READ_RET
471; EG: LDS_READ_RET
472; EG: LDS_READ_RET
473; EG: LDS_READ_RET
474; EG: LDS_READ_RET
475; EG: LDS_READ_RET
476; EG: LDS_READ_RET
477; EG: LDS_READ_RET
478; EG: LDS_READ_RET
479; EG: LDS_READ_RET
480; EG: LDS_READ_RET
481; EG: LDS_READ_RET
482; EG: LDS_READ_RET
483; EG: LDS_READ_RET
; Loads <32 x i16> from %in, sign-extends each lane to i32, stores to %out.
define amdgpu_kernel void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
  %narrow = load <32 x i16>, <32 x i16> addrspace(3)* %in
  %wide = sext <32 x i16> %narrow to <32 x i32>
  store <32 x i32> %wide, <32 x i32> addrspace(3)* %out
  ret void
}
490
491; FUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i32:
492; GFX9-NOT: m0
493; SICIVI: s_mov_b32 m0
494
495; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:14 offset1:15
496; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
497; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3
498; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
499; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
500; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:8 offset1:9
501; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:12 offset1:13
502; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:10 offset1:11
503; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31
504; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29
505; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:26 offset1:27
506; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:24 offset1:25
507; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:22 offset1:23
508; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:20 offset1:21
509; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:18 offset1:19
510; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:16 offset1:17
511; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15
512; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:12 offset1:13
513; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:10 offset1:11
514; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:8 offset1:9
515; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7
516; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5
517; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
518; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1
519
520; EG: LDS_READ_RET
521; EG: LDS_READ_RET
522; EG: LDS_READ_RET
523; EG: LDS_READ_RET
524; EG: LDS_READ_RET
525; EG: LDS_READ_RET
526; EG: LDS_READ_RET
527; EG: LDS_READ_RET
528; EG: LDS_READ_RET
529; EG: LDS_READ_RET
530; EG: LDS_READ_RET
531; EG: LDS_READ_RET
532; EG: LDS_READ_RET
533; EG: LDS_READ_RET
534; EG: LDS_READ_RET
535; EG: LDS_READ_RET
536; EG: LDS_READ_RET
537; EG: LDS_READ_RET
538; EG: LDS_READ_RET
539; EG: LDS_READ_RET
540; EG: LDS_READ_RET
541; EG: LDS_READ_RET
542; EG: LDS_READ_RET
543; EG: LDS_READ_RET
544; EG: LDS_READ_RET
545; EG: LDS_READ_RET
546; EG: LDS_READ_RET
547; EG: LDS_READ_RET
548; EG: LDS_READ_RET
549; EG: LDS_READ_RET
550; EG: LDS_READ_RET
551; EG: LDS_READ_RET
; Loads <64 x i16> from %in, zero-extends each lane to i32, stores to %out.
define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
  %narrow = load <64 x i16>, <64 x i16> addrspace(3)* %in
  %wide = zext <64 x i16> %narrow to <64 x i32>
  store <64 x i32> %wide, <64 x i32> addrspace(3)* %out
  ret void
}
558
559; FUNC-LABEL: {{^}}local_sextload_v64i16_to_v64i32:
560; GFX9-NOT: m0
561; SICIVI: s_mov_b32 m0
562
563; EG: LDS_READ_RET
564; EG: LDS_READ_RET
565; EG: LDS_READ_RET
566; EG: LDS_READ_RET
567; EG: LDS_READ_RET
568; EG: LDS_READ_RET
569; EG: LDS_READ_RET
570; EG: LDS_READ_RET
571; EG: LDS_READ_RET
572; EG: LDS_READ_RET
573; EG: LDS_READ_RET
574; EG: LDS_READ_RET
575; EG: LDS_READ_RET
576; EG: LDS_READ_RET
577; EG: LDS_READ_RET
578; EG: LDS_READ_RET
579; EG: LDS_READ_RET
580; EG: LDS_READ_RET
581; EG: LDS_READ_RET
582; EG: LDS_READ_RET
583; EG: LDS_READ_RET
584; EG: LDS_READ_RET
585; EG: LDS_READ_RET
586; EG: LDS_READ_RET
587; EG: LDS_READ_RET
588; EG: LDS_READ_RET
589; EG: LDS_READ_RET
590; EG: LDS_READ_RET
591; EG: LDS_READ_RET
592; EG: LDS_READ_RET
593; EG: LDS_READ_RET
594; EG: LDS_READ_RET
; Loads <64 x i16> from %in, sign-extends each lane to i32, stores to %out.
define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
  %narrow = load <64 x i16>, <64 x i16> addrspace(3)* %in
  %wide = sext <64 x i16> %narrow to <64 x i32>
  store <64 x i32> %wide, <64 x i32> addrspace(3)* %out
  ret void
}
601
602; FUNC-LABEL: {{^}}local_zextload_i16_to_i64:
603; GFX9-NOT: m0
604; SICIVI: s_mov_b32 m0
605
606; GCN-DAG: ds_read_u16 v[[LO:[0-9]+]],
607; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
608
609; GCN: ds_write_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]]
610
611; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
612; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
613; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
614; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
615; EG-DAG: LDS_WRITE
; Loads an i16 from %in, zero-extends it to i64 and stores the result to %out.
define amdgpu_kernel void @local_zextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
  %narrow = load i16, i16 addrspace(3)* %in
  %wide = zext i16 %narrow to i64
  store i64 %wide, i64 addrspace(3)* %out
  ret void
}
622
623; FUNC-LABEL: {{^}}local_sextload_i16_to_i64:
624; GFX9-NOT: m0
625; SICIVI: s_mov_b32 m0
626
627; FIXME: Need to optimize this sequence to avoid an extra shift.
628;  t25: i32,ch = load<LD2[%in(addrspace=3)], anyext from i16> t12, t10, undef:i32
629;          t28: i64 = any_extend t25
630;        t30: i64 = sign_extend_inreg t28, ValueType:ch:i16
631; SI: ds_read_i16 v[[LO:[0-9]+]],
632; GFX89: ds_read_u16 v[[ULO:[0-9]+]]
633; GFX89: v_bfe_i32 v[[LO:[0-9]+]], v[[ULO]], 0, 16
634; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
635
636; GCN: ds_write_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]]
637
638; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
639; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
640; EG-DAG: MOV {{[* ]*}}[[TMP:T[0-9]+\.[XYZW]]], OQAP
641; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
642; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal
643; EG-DAG: LDS_WRITE
644; EG-DAG: 16
645; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
; Loads an i16 from %in, sign-extends it to i64 and stores the result to %out.
define amdgpu_kernel void @local_sextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
  %narrow = load i16, i16 addrspace(3)* %in
  %wide = sext i16 %narrow to i64
  store i64 %wide, i64 addrspace(3)* %out
  ret void
}
652
653; FUNC-LABEL: {{^}}local_zextload_v1i16_to_v1i64:
654; GFX9-NOT: m0
655; SICIVI: s_mov_b32 m0
656
657
658; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
659; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
660; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
661; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
662; EG-DAG: LDS_WRITE
; Single-element vector variant - zero-extends <1 x i16> to <1 x i64>.
define amdgpu_kernel void @local_zextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
  %narrow = load <1 x i16>, <1 x i16> addrspace(3)* %in
  %wide = zext <1 x i16> %narrow to <1 x i64>
  store <1 x i64> %wide, <1 x i64> addrspace(3)* %out
  ret void
}
669
670; FUNC-LABEL: {{^}}local_sextload_v1i16_to_v1i64:
671; GFX9-NOT: m0
672; SICIVI: s_mov_b32 m0
673
674
675; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
676; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
677; EG-DAG: MOV {{[* ]*}}[[TMP:T[0-9]+\.[XYZW]]], OQAP
678; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
679; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal
680; EG-DAG: LDS_WRITE
681; EG-DAG: 16
682; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
; Single-element vector variant - sign-extends <1 x i16> to <1 x i64>.
define amdgpu_kernel void @local_sextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
  %narrow = load <1 x i16>, <1 x i16> addrspace(3)* %in
  %wide = sext <1 x i16> %narrow to <1 x i64>
  store <1 x i64> %wide, <1 x i64> addrspace(3)* %out
  ret void
}
689
690; FUNC-LABEL: {{^}}local_zextload_v2i16_to_v2i64:
691; GFX9-NOT: m0
692; SICIVI: s_mov_b32 m0
693
694
695; EG: LDS_READ_RET
; Loads <2 x i16> from %in, zero-extends each lane to i64, stores to %out.
define amdgpu_kernel void @local_zextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
  %narrow = load <2 x i16>, <2 x i16> addrspace(3)* %in
  %wide = zext <2 x i16> %narrow to <2 x i64>
  store <2 x i64> %wide, <2 x i64> addrspace(3)* %out
  ret void
}
702
703; FUNC-LABEL: {{^}}local_sextload_v2i16_to_v2i64:
704; GFX9-NOT: m0
705; SICIVI: s_mov_b32 m0
706
707
708; EG: LDS_READ_RET
709; EG-DAG: BFE_INT
710; EG-DAG: ASHR
; Loads <2 x i16> from %in, sign-extends each lane to i64, stores to %out.
define amdgpu_kernel void @local_sextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
  %narrow = load <2 x i16>, <2 x i16> addrspace(3)* %in
  %wide = sext <2 x i16> %narrow to <2 x i64>
  store <2 x i64> %wide, <2 x i64> addrspace(3)* %out
  ret void
}
717
718; FUNC-LABEL: {{^}}local_zextload_v4i16_to_v4i64:
719; GFX9-NOT: m0
720; SICIVI: s_mov_b32 m0
721
722
723; EG: LDS_READ_RET
724; EG: LDS_READ_RET
; Loads <4 x i16> from %in, zero-extends each lane to i64, stores to %out.
define amdgpu_kernel void @local_zextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
  %narrow = load <4 x i16>, <4 x i16> addrspace(3)* %in
  %wide = zext <4 x i16> %narrow to <4 x i64>
  store <4 x i64> %wide, <4 x i64> addrspace(3)* %out
  ret void
}
731
732; FUNC-LABEL: {{^}}local_sextload_v4i16_to_v4i64:
733; GFX9-NOT: m0
734; SICIVI: s_mov_b32 m0
735
736
737; EG: LDS_READ_RET
738; EG: LDS_READ_RET
739; EG-DAG: BFE_INT
740; EG-DAG: BFE_INT
741; EG-DAG: ASHR
742; EG-DAG: ASHR
; Loads <4 x i16> from %in, sign-extends each lane to i64, stores to %out.
define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
  %narrow = load <4 x i16>, <4 x i16> addrspace(3)* %in
  %wide = sext <4 x i16> %narrow to <4 x i64>
  store <4 x i64> %wide, <4 x i64> addrspace(3)* %out
  ret void
}
749
750; FUNC-LABEL: {{^}}local_zextload_v8i16_to_v8i64:
751; GFX9-NOT: m0
752; SICIVI: s_mov_b32 m0
753
754
755; EG: LDS_READ_RET
756; EG: LDS_READ_RET
757; EG: LDS_READ_RET
758; EG: LDS_READ_RET
; Loads <8 x i16> from %in, zero-extends each lane to i64, stores to %out.
define amdgpu_kernel void @local_zextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
  %narrow = load <8 x i16>, <8 x i16> addrspace(3)* %in
  %wide = zext <8 x i16> %narrow to <8 x i64>
  store <8 x i64> %wide, <8 x i64> addrspace(3)* %out
  ret void
}
765
766; FUNC-LABEL: {{^}}local_sextload_v8i16_to_v8i64:
767; GFX9-NOT: m0
768; SICIVI: s_mov_b32 m0
769
770
771; EG: LDS_READ_RET
772; EG: LDS_READ_RET
773; EG: LDS_READ_RET
774; EG: LDS_READ_RET
775; EG-DAG: BFE_INT
776; EG-DAG: BFE_INT
777; EG-DAG: ASHR
778; EG-DAG: ASHR
779; EG-DAG: BFE_INT
780; EG-DAG: BFE_INT
781; EG-DAG: ASHR
782; EG-DAG: ASHR
; Loads <8 x i16> from %in, sign-extends each lane to i64, stores to %out.
define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
  %narrow = load <8 x i16>, <8 x i16> addrspace(3)* %in
  %wide = sext <8 x i16> %narrow to <8 x i64>
  store <8 x i64> %wide, <8 x i64> addrspace(3)* %out
  ret void
}
789
790; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i64:
791; GFX9-NOT: m0
792; SICIVI: s_mov_b32 m0
793
794
795; EG: LDS_READ_RET
796; EG: LDS_READ_RET
797; EG: LDS_READ_RET
798; EG: LDS_READ_RET
799; EG: LDS_READ_RET
800; EG: LDS_READ_RET
801; EG: LDS_READ_RET
802; EG: LDS_READ_RET
; Loads <16 x i16> from %in, zero-extends each lane to i64, stores to %out.
define amdgpu_kernel void @local_zextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
  %narrow = load <16 x i16>, <16 x i16> addrspace(3)* %in
  %wide = zext <16 x i16> %narrow to <16 x i64>
  store <16 x i64> %wide, <16 x i64> addrspace(3)* %out
  ret void
}
809
810; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i64:
811; GFX9-NOT: m0
812; SICIVI: s_mov_b32 m0
813
814
815; EG: LDS_READ_RET
816; EG: LDS_READ_RET
817; EG: LDS_READ_RET
818; EG: LDS_READ_RET
819; EG: LDS_READ_RET
820; EG: LDS_READ_RET
821; EG: LDS_READ_RET
822; EG: LDS_READ_RET
823; EG-DAG: BFE_INT
824; EG-DAG: BFE_INT
825; EG-DAG: ASHR
826; EG-DAG: ASHR
827; EG-DAG: BFE_INT
828; EG-DAG: BFE_INT
829; EG-DAG: ASHR
830; EG-DAG: ASHR
831; EG-DAG: BFE_INT
832; EG-DAG: BFE_INT
833; EG-DAG: ASHR
834; EG-DAG: ASHR
835; EG-DAG: BFE_INT
836; EG-DAG: BFE_INT
837; EG-DAG: ASHR
838; EG-DAG: ASHR
; Loads <16 x i16> from %in, sign-extends each lane to i64, stores to %out.
define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
  %narrow = load <16 x i16>, <16 x i16> addrspace(3)* %in
  %wide = sext <16 x i16> %narrow to <16 x i64>
  store <16 x i64> %wide, <16 x i64> addrspace(3)* %out
  ret void
}
845
846; FUNC-LABEL: {{^}}local_zextload_v32i16_to_v32i64:
847; GFX9-NOT: m0
848; SICIVI: s_mov_b32 m0
849
850
851; EG: LDS_READ_RET
852; EG: LDS_READ_RET
853; EG: LDS_READ_RET
854; EG: LDS_READ_RET
855; EG: LDS_READ_RET
856; EG: LDS_READ_RET
857; EG: LDS_READ_RET
858; EG: LDS_READ_RET
859; EG: LDS_READ_RET
860; EG: LDS_READ_RET
861; EG: LDS_READ_RET
862; EG: LDS_READ_RET
863; EG: LDS_READ_RET
864; EG: LDS_READ_RET
865; EG: LDS_READ_RET
866; EG: LDS_READ_RET
; Loads <32 x i16> from %in, zero-extends each lane to i64, stores to %out.
define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
  %narrow = load <32 x i16>, <32 x i16> addrspace(3)* %in
  %wide = zext <32 x i16> %narrow to <32 x i64>
  store <32 x i64> %wide, <32 x i64> addrspace(3)* %out
  ret void
}
873
874; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i64:
875; GFX9-NOT: m0
876; SICIVI: s_mov_b32 m0
877
878
879; EG: LDS_READ_RET
880; EG: LDS_READ_RET
881; EG: LDS_READ_RET
882; EG: LDS_READ_RET
883; EG: LDS_READ_RET
884; EG: LDS_READ_RET
885; EG: LDS_READ_RET
886; EG: LDS_READ_RET
887; EG: LDS_READ_RET
888; EG: LDS_READ_RET
889; EG: LDS_READ_RET
890; EG: LDS_READ_RET
891; EG: LDS_READ_RET
892; EG: LDS_READ_RET
893; EG: LDS_READ_RET
894; EG: LDS_READ_RET
895; EG-DAG: BFE_INT
896; EG-DAG: BFE_INT
897; EG-DAG: ASHR
898; EG-DAG: ASHR
899; EG-DAG: BFE_INT
900; EG-DAG: BFE_INT
901; EG-DAG: ASHR
902; EG-DAG: ASHR
903; EG-DAG: BFE_INT
904; EG-DAG: BFE_INT
905; EG-DAG: ASHR
906; EG-DAG: ASHR
907; EG-DAG: BFE_INT
908; EG-DAG: BFE_INT
909; EG-DAG: ASHR
910; EG-DAG: ASHR
911; EG-DAG: BFE_INT
912; EG-DAG: BFE_INT
913; EG-DAG: ASHR
914; EG-DAG: ASHR
915; EG-DAG: BFE_INT
916; EG-DAG: BFE_INT
917; EG-DAG: ASHR
918; EG-DAG: ASHR
919; EG-DAG: BFE_INT
920; EG-DAG: BFE_INT
921; EG-DAG: ASHR
922; EG-DAG: ASHR
923; EG-DAG: BFE_INT
924; EG-DAG: BFE_INT
925; EG-DAG: ASHR
926; EG-DAG: ASHR
; Loads <32 x i16> from %in, sign-extends each lane to i64, stores to %out.
define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
  %narrow = load <32 x i16>, <32 x i16> addrspace(3)* %in
  %wide = sext <32 x i16> %narrow to <32 x i64>
  store <32 x i64> %wide, <32 x i64> addrspace(3)* %out
  ret void
}
933
934; ; XFUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i64:
935; define amdgpu_kernel void @local_zextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
936;   %load = load <64 x i16>, <64 x i16> addrspace(3)* %in
937;   %ext = zext <64 x i16> %load to <64 x i64>
938;   store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
939;   ret void
940; }
941
942; ; XFUNC-LABEL: {{^}}local_sextload_v64i16_to_v64i64:
943; define amdgpu_kernel void @local_sextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
944;   %load = load <64 x i16>, <64 x i16> addrspace(3)* %in
945;   %ext = sext <64 x i16> %load to <64 x i64>
946;   store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
947;   ret void
948; }
949
; Tests that ds_read_b128/ds_write_b128 are generated for a 16-byte-aligned load and store.
951; FUNC-LABEL: {{^}}local_v8i16_to_128:
952
953; SI-NOT: ds_read_b128
954; SI-NOT: ds_write_b128
955
956; CIVI: ds_read_b128
957; CIVI: ds_write_b128
958
959; EG: LDS_READ_RET
960; EG: LDS_READ_RET
961; EG: LDS_READ_RET
962; EG: LDS_READ_RET
; Copy kernel - moves an <8 x i16> vector from %in to %out (both addrspace(3)).
; Both the load and the store carry an explicit 16-byte alignment.
define amdgpu_kernel void @local_v8i16_to_128(<8 x i16> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) {
  %vec = load <8 x i16>, <8 x i16> addrspace(3)* %in, align 16
  store <8 x i16> %vec, <8 x i16> addrspace(3)* %out, align 16
  ret void
}
968
; Attribute group #0 - marks every kernel declared with #0 in this file as nounwind.
attributes #0 = { nounwind }
970