1; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX900 %s
2; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX906,NO-D16-HI %s
3; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX803,NO-D16-HI %s
4
5; GCN-LABEL: {{^}}store_global_hi_v2i16:
6; GCN: s_waitcnt
7
8; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off
9
10; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
11; GFX803-NEXT: flat_store_short v[0:1], v2
12; GFX906-NEXT: global_store_short v[0:1], v2, off
13
14; GCN-NEXT: s_waitcnt
15; GCN-NEXT: s_setpc_b64
; Stores lane 1 of %arg (viewed as <2 x i16>) through the addrspace(1) pointer
; %out with no address offset.
16define void @store_global_hi_v2i16(i16 addrspace(1)* %out, i32 %arg) #0 {
17entry:
18  ; FIXME: ABI for pre-gfx9
  ; Reinterpret the 32-bit argument as two packed 16-bit lanes.
19  %value = bitcast i32 %arg to <2 x i16>
  ; Lane 1 is the half the non-d16 targets are expected to reach via a 16-bit right shift.
20  %hi = extractelement <2 x i16> %value, i32 1
21  store i16 %hi, i16 addrspace(1)* %out
22  ret void
23}
24
25; GCN-LABEL: {{^}}store_global_hi_v2f16:
26; GCN: s_waitcnt
27
28; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off
29
30; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
31; GFX803-NEXT: flat_store_short v[0:1], v2
32; GFX906-NEXT: global_store_short v[0:1], v2, off
33
34; GCN-NEXT: s_waitcnt
35; GCN-NEXT: s_setpc_b64
; Half-precision variant: stores lane 1 of %arg (viewed as <2 x half>) through
; the addrspace(1) pointer %out with no address offset.
36define void @store_global_hi_v2f16(half addrspace(1)* %out, i32 %arg) #0 {
37entry:
38  ; FIXME: ABI for pre-gfx9
  ; Same bit pattern as the integer case, but typed as two packed halves.
39  %value = bitcast i32 %arg to <2 x half>
40  %hi = extractelement <2 x half> %value, i32 1
41  store half %hi, half addrspace(1)* %out
42  ret void
43}
44
45; GCN-LABEL: {{^}}store_global_hi_i32_shift:
46; GCN: s_waitcnt
47
48; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off
49
50; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
51; GFX803-NEXT: flat_store_short v[0:1], v2
52; GFX906-NEXT: global_store_short v[0:1], v2, off
53
54; GCN-NEXT: s_waitcnt
55; GCN-NEXT: s_setpc_b64
; Stores bits 31:16 of %value as an i16 through the addrspace(1) pointer %out,
; expressing the high-half extraction as shift + truncate instead of a vector extract.
56define void @store_global_hi_i32_shift(i16 addrspace(1)* %out, i32 %value) #0 {
57entry:
  ; Move the upper 16 bits down to bits 15:0, then narrow to i16.
58  %hi32 = lshr i32 %value, 16
59  %hi = trunc i32 %hi32 to i16
60  store i16 %hi, i16 addrspace(1)* %out
61  ret void
62}
63
64; GCN-LABEL: {{^}}store_global_hi_v2i16_i8:
65; GCN: s_waitcnt
66
67; GFX900-NEXT: global_store_byte_d16_hi v[0:1], v2, off
68
69; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
70; GFX803-NEXT: flat_store_byte v[0:1], v2
71; GFX906-NEXT: global_store_byte v[0:1], v2, off
72
73; GCN-NEXT: s_waitcnt
74; GCN-NEXT: s_setpc_b64
; Stores the low byte of lane 1 of %arg (viewed as <2 x i16>) through the
; addrspace(1) pointer %out with no address offset.
75define void @store_global_hi_v2i16_i8(i8 addrspace(1)* %out, i32 %arg) #0 {
76entry:
77  %value = bitcast i32 %arg to <2 x i16>
78  %hi = extractelement <2 x i16> %value, i32 1
  ; Keep only the low 8 bits of the extracted lane so the store is byte-sized.
79  %trunc = trunc i16 %hi to i8
80  store i8 %trunc, i8 addrspace(1)* %out
81  ret void
82}
83
84; GCN-LABEL: {{^}}store_global_hi_i8_shift:
85; GCN: s_waitcnt
86
87; GFX900-NEXT: global_store_byte_d16_hi v[0:1], v2, off
88
89; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
90; GFX803-NEXT: flat_store_byte v[0:1], v2
91; GFX906-NEXT: global_store_byte v[0:1], v2, off
92
93; GCN-NEXT: s_waitcnt
94; GCN-NEXT: s_setpc_b64
; Stores bits 23:16 of %value as an i8 through the addrspace(1) pointer %out,
; using shift + truncate rather than a vector extract.
95define void @store_global_hi_i8_shift(i8 addrspace(1)* %out, i32 %value) #0 {
96entry:
  ; After the 16-bit right shift, truncating to i8 keeps original bits 23:16.
97  %hi32 = lshr i32 %value, 16
98  %hi = trunc i32 %hi32 to i8
99  store i8 %hi, i8 addrspace(1)* %out
100  ret void
101}
102
103; GCN-LABEL: {{^}}store_global_hi_v2i16_max_offset:
104; GCN: s_waitcnt
105; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off offset:4094
106
107; GFX803-DAG: v_add_u32_e32
108; GFX803-DAG: v_addc_u32_e32
109; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
110; GFX803: flat_store_short v[0:1], v2{{$}}
111
112; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
113; GFX906-NEXT: global_store_short v[0:1], v2, off
114
115; GCN-NEXT: s_waitcnt
116; GCN-NEXT: s_setpc_b64
; Stores lane 1 of %arg (viewed as <2 x i16>) to %out plus a positive constant
; offset in addrspace(1).
117define void @store_global_hi_v2i16_max_offset(i16 addrspace(1)* %out, i32 %arg) #0 {
118entry:
119  ; FIXME: ABI for pre-gfx9
120  %value = bitcast i32 %arg to <2 x i16>
121  %hi = extractelement <2 x i16> %value, i32 1
  ; 2047 i16 elements = 4094 bytes, the offset the gfx900 check expects on the store itself.
122  %gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 2047
123  store i16 %hi, i16 addrspace(1)* %gep
124  ret void
125}
126
127; GCN-LABEL: {{^}}store_global_hi_v2i16_min_offset:
128; GCN: s_waitcnt
129; GFX900-NEXT: global_store_short_d16_hi v[0:1], v2, off offset:-4096{{$}}
130
131; GFX803-DAG: v_add_u32_e32
132; GFX803-DAG: v_addc_u32_e32
133; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
134; GFX803: flat_store_short v[0:1], v{{[0-9]$}}
135
136; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
137; GFX906-NEXT: global_store_short v[0:1], v2, off
138
139; GCN-NEXT: s_waitcnt
140; GCN-NEXT: s_setpc_b64
; Stores lane 1 of %arg (viewed as <2 x i16>) to %out plus a negative constant
; offset in addrspace(1).
141define void @store_global_hi_v2i16_min_offset(i16 addrspace(1)* %out, i32 %arg) #0 {
142entry:
143  %value = bitcast i32 %arg to <2 x i16>
144  %hi = extractelement <2 x i16> %value, i32 1
  ; -2048 i16 elements = -4096 bytes, the offset the gfx900 check expects on the store itself.
145  %gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 -2048
146  store i16 %hi, i16 addrspace(1)* %gep
147  ret void
148}
149
150; GCN-LABEL: {{^}}store_global_hi_v2i16_i8_max_offset:
151; GCN: s_waitcnt
152; GFX900-NEXT: global_store_byte_d16_hi v[0:1], v2, off offset:4095
153
154; GFX803-DAG: v_add_u32_e32
155; GFX803-DAG: v_addc_u32_e32
156; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
157; GFX803: flat_store_byte v[0:1], v{{[0-9]$}}
158
159; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
160; GFX906-NEXT: global_store_byte v[0:1], v2, off
161
162; GCN-NEXT: s_waitcnt
163; GCN-NEXT: s_setpc_b64
; Stores the low byte of lane 1 of %arg (viewed as <2 x i16>) to %out plus a
; positive constant byte offset in addrspace(1).
164define void @store_global_hi_v2i16_i8_max_offset(i8 addrspace(1)* %out, i32 %arg) #0 {
165entry:
166  %value = bitcast i32 %arg to <2 x i16>
167  %hi = extractelement <2 x i16> %value, i32 1
168  %trunc = trunc i16 %hi to i8
  ; i8 elements, so the index is the byte offset (4095) the gfx900 check expects on the store.
169  %gep = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 4095
170  store i8 %trunc, i8 addrspace(1)* %gep
171  ret void
172}
173
174; GCN-LABEL: {{^}}store_global_hi_v2i16_i8_min_offset:
175; GCN: s_waitcnt
176; GFX900-NEXT: global_store_byte_d16_hi v[0:1], v2, off offset:-4095
177
178; GFX803-DAG: v_add_u32_e32
179; GFX803-DAG: v_addc_u32_e32
180; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
181; GFX803: flat_store_byte v[0:1], v{{[0-9]$}}
182
183; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
184; GFX906-NEXT: global_store_byte v[0:1], v2, off
185
186; GCN-NEXT: s_waitcnt
187; GCN-NEXT: s_setpc_b64
; Stores the low byte of lane 1 of %arg (viewed as <2 x i16>) to %out plus a
; negative constant byte offset in addrspace(1).
188define void @store_global_hi_v2i16_i8_min_offset(i8 addrspace(1)* %out, i32 %arg) #0 {
189entry:
190  %value = bitcast i32 %arg to <2 x i16>
191  %hi = extractelement <2 x i16> %value, i32 1
192  %trunc = trunc i16 %hi to i8
  ; Byte offset -4095, matching the offset in the gfx900 check.
  ; NOTE(review): the i16 min-offset test reaches -4096 bytes; confirm -4095 is the intended bound here.
193  %gep = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 -4095
194  store i8 %trunc, i8 addrspace(1)* %gep
195  ret void
196}
197
198; GCN-LABEL: {{^}}store_flat_hi_v2i16:
199; GCN: s_waitcnt
200
201; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}}
202
203; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
204; NO-D16-HI-NEXT: flat_store_short v[0:1], v2
205
206; GCN-NEXT: s_waitcnt
207; GCN-NEXT: s_setpc_b64
; Stores lane 1 of %arg (viewed as <2 x i16>) through %out, a pointer with no
; addrspace qualifier; the checks for this function expect flat_store instructions.
208define void @store_flat_hi_v2i16(i16* %out, i32 %arg) #0 {
209entry:
210  %value = bitcast i32 %arg to <2 x i16>
  ; Lane 1 is the half the non-d16 targets are expected to reach via a 16-bit right shift.
211  %hi = extractelement <2 x i16> %value, i32 1
212  store i16 %hi, i16* %out
213  ret void
214}
215
216; GCN-LABEL: {{^}}store_flat_hi_v2f16:
217; GCN: s_waitcnt
218
219; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}}
220
221; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
222; NO-D16-HI-NEXT: flat_store_short v[0:1], v2
223
224; GCN-NEXT: s_waitcnt
225; GCN-NEXT: s_setpc_b64
; Half-precision variant: stores lane 1 of %arg (viewed as <2 x half>) through
; %out, a pointer with no addrspace qualifier.
226define void @store_flat_hi_v2f16(half* %out, i32 %arg) #0 {
227entry:
  ; Same bit pattern as the integer case, but typed as two packed halves.
228  %value = bitcast i32 %arg to <2 x half>
229  %hi = extractelement <2 x half> %value, i32 1
230  store half %hi, half* %out
231  ret void
232}
233
234; GCN-LABEL: {{^}}store_flat_hi_i32_shift:
235; GCN: s_waitcnt
236
237; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}}
238
239; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
240; NO-D16-HI-NEXT: flat_store_short v[0:1], v2
241
242; GCN-NEXT: s_waitcnt
243; GCN-NEXT: s_setpc_b64
; Stores bits 31:16 of %value as an i16 through %out (no addrspace qualifier),
; expressing the high-half extraction as shift + truncate.
244define void @store_flat_hi_i32_shift(i16* %out, i32 %value) #0 {
245entry:
  ; Move the upper 16 bits down to bits 15:0, then narrow to i16.
246  %hi32 = lshr i32 %value, 16
247  %hi = trunc i32 %hi32 to i16
248  store i16 %hi, i16* %out
249  ret void
250}
251
252; GCN-LABEL: {{^}}store_flat_hi_v2i16_i8:
253; GCN: s_waitcnt
254
255; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}}
256
257; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
258; NO-D16-HI-NEXT: flat_store_byte v[0:1], v2
259
260; GCN-NEXT: s_waitcnt
261; GCN-NEXT: s_setpc_b64
; Stores the low byte of lane 1 of %arg (viewed as <2 x i16>) through %out,
; a pointer with no addrspace qualifier.
262define void @store_flat_hi_v2i16_i8(i8* %out, i32 %arg) #0 {
263entry:
264  %value = bitcast i32 %arg to <2 x i16>
265  %hi = extractelement <2 x i16> %value, i32 1
  ; Keep only the low 8 bits of the extracted lane so the store is byte-sized.
266  %trunc = trunc i16 %hi to i8
267  store i8 %trunc, i8* %out
268  ret void
269}
270
271; GCN-LABEL: {{^}}store_flat_hi_i8_shift:
272; GCN: s_waitcnt
273
274; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}}
275
276; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
277; NO-D16-HI-NEXT: flat_store_byte v[0:1], v2
278
279; GCN-NEXT: s_waitcnt
280; GCN-NEXT: s_setpc_b64
; Stores bits 23:16 of %value as an i8 through %out (no addrspace qualifier),
; using shift + truncate rather than a vector extract.
281define void @store_flat_hi_i8_shift(i8* %out, i32 %value) #0 {
282entry:
  ; After the 16-bit right shift, truncating to i8 keeps original bits 23:16.
283  %hi32 = lshr i32 %value, 16
284  %hi = trunc i32 %hi32 to i8
285  store i8 %hi, i8* %out
286  ret void
287}
288
289; GCN-LABEL: {{^}}store_flat_hi_v2i16_max_offset:
290; GCN: s_waitcnt
291; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2 offset:4094{{$}}
292
293; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
294; GFX906-NEXT: flat_store_short v[0:1], v2 offset:4094
295
296; GFX803-DAG: v_add_u32_e32
297; GFX803-DAG: v_addc_u32_e32
298; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
299; GFX803: flat_store_short v[0:1], v2{{$}}
300
301; GCN-NEXT: s_waitcnt
302; GCN-NEXT: s_setpc_b64
; Stores lane 1 of %arg (viewed as <2 x i16>) to %out plus a positive constant
; offset, through a pointer with no addrspace qualifier.
303define void @store_flat_hi_v2i16_max_offset(i16* %out, i32 %arg) #0 {
304entry:
305  %value = bitcast i32 %arg to <2 x i16>
306  %hi = extractelement <2 x i16> %value, i32 1
  ; 2047 i16 elements = 4094 bytes; the gfx900 and gfx906 checks expect this as the store's
  ; immediate offset, while the gfx803 checks expect separate address adds and no offset.
307  %gep = getelementptr inbounds i16, i16* %out, i64 2047
308  store i16 %hi, i16* %gep
309  ret void
310}
311
312; GCN-LABEL: {{^}}store_flat_hi_v2i16_neg_offset:
313; GCN: s_waitcnt
314; GCN: v_add{{(_co)?}}_{{i|u}}32_e32
315
316; GFX803: v_addc_u32_e32
317; GFX900: v_addc_co_u32_e32
318
319; GFX906-NEXT: v_lshrrev_b32_e32
320; GFX906-NEXT: v_addc_co_u32_e32
321; GFX906: flat_store_short v[0:1], v2
322
323; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}}
324; GFX803: flat_store_short v[0:1], v2{{$}}
325; GCN-NEXT: s_waitcnt
326; GCN-NEXT: s_setpc_b64
; Stores lane 1 of %arg (viewed as <2 x i16>) to %out plus a negative constant
; offset, through a pointer with no addrspace qualifier.
327define void @store_flat_hi_v2i16_neg_offset(i16* %out, i32 %arg) #0 {
328entry:
329  %value = bitcast i32 %arg to <2 x i16>
330  %hi = extractelement <2 x i16> %value, i32 1
  ; -1023 i16 elements = -2046 bytes; the checks expect an add / add-with-carry pair on the
  ; address and a store without an immediate offset.
331  %gep = getelementptr inbounds i16, i16* %out, i64 -1023
332  store i16 %hi, i16* %gep
333  ret void
334}
335
336; GCN-LABEL: {{^}}store_flat_hi_v2i16_i8_max_offset:
337; GCN: s_waitcnt
338; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2 offset:4095{{$}}
339
340; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
341; GFX803-DAG: v_add_u32_e32
342; GFX803-DAG: v_addc_u32_e32
343; GFX803: flat_store_byte v[0:1], v2{{$}}
344
345; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
346; GFX906-NEXT: flat_store_byte v[0:1], v2 offset:4095{{$}}
347
348; GCN-NEXT: s_waitcnt
349; GCN-NEXT: s_setpc_b64
; Stores the low byte of lane 1 of %arg (viewed as <2 x i16>) to %out plus a
; positive constant byte offset, through a pointer with no addrspace qualifier.
350define void @store_flat_hi_v2i16_i8_max_offset(i8* %out, i32 %arg) #0 {
351entry:
352  %value = bitcast i32 %arg to <2 x i16>
353  %hi = extractelement <2 x i16> %value, i32 1
354  %trunc = trunc i16 %hi to i8
  ; Byte offset 4095; the gfx900 and gfx906 checks expect it on the store, the gfx803
  ; checks expect separate address adds and no offset.
355  %gep = getelementptr inbounds i8, i8* %out, i64 4095
356  store i8 %trunc, i8* %gep
357  ret void
358}
359
360; GCN-LABEL: {{^}}store_flat_hi_v2i16_i8_neg_offset:
361; GCN: s_waitcnt
362; GCN-DAG: v_add{{(_co)?}}_{{i|u}}32_e32
363
364; GFX803-DAG: v_addc_u32_e32
365; GFX900-DAG: v_addc_co_u32_e32
366; GFX906-DAG: v_add_co_u32_e32
367
368; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}}
369
370; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
371; GFX906-NEXT: v_addc_co_u32_e32
372; GFX906-NEXT: flat_store_byte v[0:1], v2{{$}}
373
374; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
375; GFX803: flat_store_byte v[0:1], v2{{$}}
376
377; GCN-NEXT: s_waitcnt
378; GCN-NEXT: s_setpc_b64
; Stores the low byte of lane 1 of %arg (viewed as <2 x i16>) to %out plus a
; negative constant byte offset, through a pointer with no addrspace qualifier.
379define void @store_flat_hi_v2i16_i8_neg_offset(i8* %out, i32 %arg) #0 {
380entry:
381  %value = bitcast i32 %arg to <2 x i16>
382  %hi = extractelement <2 x i16> %value, i32 1
383  %trunc = trunc i16 %hi to i8
  ; Byte offset -4095; the checks expect explicit address adds and a store with no
  ; immediate offset on every target.
384  %gep = getelementptr inbounds i8, i8* %out, i64 -4095
385  store i8 %trunc, i8* %gep
386  ret void
387}
388
389; GCN-LABEL: {{^}}store_private_hi_v2i16:
390; GCN: s_waitcnt
391
392; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], s4 offen{{$}}
393
394; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
395; NO-D16-HI: buffer_store_short v1, v0, s[0:3], s4 offen{{$}}
396
397; GCN-NEXT: s_waitcnt
398; GCN-NEXT: s_setpc_b64
; Stores lane 1 of %arg (viewed as <2 x i16>) through the addrspace(5) pointer
; %out; the checks for this function expect buffer_store instructions using offen.
399define void @store_private_hi_v2i16(i16 addrspace(5)* %out, i32 %arg) #0 {
400entry:
401  ; FIXME: ABI for pre-gfx9
402  %value = bitcast i32 %arg to <2 x i16>
  ; Lane 1 is the half the non-d16 targets are expected to reach via a 16-bit right shift.
403  %hi = extractelement <2 x i16> %value, i32 1
404  store i16 %hi, i16 addrspace(5)* %out
405  ret void
406}
407
408; GCN-LABEL: {{^}}store_private_hi_v2f16:
409; GCN: s_waitcnt
410
411; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], s4 offen{{$}}
412
413; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
414; NO-D16-HI: buffer_store_short v1, v0, s[0:3], s4 offen{{$}}
415
416; GCN-NEXT: s_waitcnt
417; GCN-NEXT: s_setpc_b64
; Half-precision variant: stores lane 1 of %arg (viewed as <2 x half>) through
; the addrspace(5) pointer %out.
418define void @store_private_hi_v2f16(half addrspace(5)* %out, i32 %arg) #0 {
419entry:
420  ; FIXME: ABI for pre-gfx9
  ; Same bit pattern as the integer case, but typed as two packed halves.
421  %value = bitcast i32 %arg to <2 x half>
422  %hi = extractelement <2 x half> %value, i32 1
423  store half %hi, half addrspace(5)* %out
424  ret void
425}
426
427; GCN-LABEL: {{^}}store_private_hi_i32_shift:
428; GCN: s_waitcnt
429
430; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], s4 offen{{$}}
431
432; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
433; NO-D16-HI-NEXT: buffer_store_short v1, v0, s[0:3], s4 offen{{$}}
434
435; GCN-NEXT: s_waitcnt
436; GCN-NEXT: s_setpc_b64
; Stores bits 31:16 of %value as an i16 through the addrspace(5) pointer %out,
; expressing the high-half extraction as shift + truncate.
437define void @store_private_hi_i32_shift(i16 addrspace(5)* %out, i32 %value) #0 {
438entry:
  ; Move the upper 16 bits down to bits 15:0, then narrow to i16.
439  %hi32 = lshr i32 %value, 16
440  %hi = trunc i32 %hi32 to i16
441  store i16 %hi, i16 addrspace(5)* %out
442  ret void
443}
444
445; GCN-LABEL: {{^}}store_private_hi_v2i16_i8:
446; GCN: s_waitcnt
447
448; GFX900-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], s4 offen{{$}}
449
450; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
451; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], s4 offen{{$}}
452
453; GCN-NEXT: s_waitcnt
454; GCN-NEXT: s_setpc_b64
; Stores the low byte of lane 1 of %arg (viewed as <2 x i16>) through the
; addrspace(5) pointer %out.
455define void @store_private_hi_v2i16_i8(i8 addrspace(5)* %out, i32 %arg) #0 {
456entry:
457  %value = bitcast i32 %arg to <2 x i16>
458  %hi = extractelement <2 x i16> %value, i32 1
  ; Keep only the low 8 bits of the extracted lane so the store is byte-sized.
459  %trunc = trunc i16 %hi to i8
460  store i8 %trunc, i8 addrspace(5)* %out
461  ret void
462}
463
464; GCN-LABEL: {{^}}store_private_hi_i8_shift:
465; GCN: s_waitcnt
466
467; GFX900-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], s4 offen{{$}}
468
469; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
470; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], s4 offen{{$}}
471
472; GCN-NEXT: s_waitcnt
473; GCN-NEXT: s_setpc_b64
; Stores bits 23:16 of %value as an i8 through the addrspace(5) pointer %out,
; using shift + truncate rather than a vector extract.
474define void @store_private_hi_i8_shift(i8 addrspace(5)* %out, i32 %value) #0 {
475entry:
  ; After the 16-bit right shift, truncating to i8 keeps original bits 23:16.
476  %hi32 = lshr i32 %value, 16
477  %hi = trunc i32 %hi32 to i8
478  store i8 %hi, i8 addrspace(5)* %out
479  ret void
480}
481
482; GCN-LABEL: {{^}}store_private_hi_v2i16_max_offset:
483; GCN: s_waitcnt
484; GFX900: buffer_store_short_d16_hi v0, off, s[0:3], s5 offset:4094{{$}}
485
486; NO-D16-HI: v_lshrrev_b32_e32 v0, 16, v0
487; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], s5 offset:4094{{$}}
488
489; GCN-NEXT: s_waitcnt
490; GCN-NEXT: s_setpc_b64
; Stores lane 1 of %arg (viewed as <2 x i16>) at a constant offset from %out,
; an addrspace(5) pointer passed byval.
491define void @store_private_hi_v2i16_max_offset(i16 addrspace(5)* byval %out, i32 %arg) #0 {
492entry:
493  %value = bitcast i32 %arg to <2 x i16>
494  %hi = extractelement <2 x i16> %value, i32 1
  ; 2045 i16 elements = 4090 bytes, while the checks expect offset 4094 with no address register.
  ; Presumably the byval object sits 4 bytes into the frame - TODO confirm.
495  %gep = getelementptr inbounds i16, i16 addrspace(5)* %out, i64 2045
496  store i16 %hi, i16 addrspace(5)* %gep
497  ret void
498}
499
500
501
502; GCN-LABEL: {{^}}store_private_hi_v2i16_nooff:
503; GCN: s_waitcnt
504
505; GFX900-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s4{{$}}
506
507; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
508; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], s4{{$}}
509
510; GCN-NEXT: s_waitcnt
511; GCN-NEXT: s_setpc_b64
; Stores lane 1 of %arg (viewed as <2 x i16>) to the null addrspace(5) pointer;
; the checks expect a buffer store with neither an address register nor an offset.
512define void @store_private_hi_v2i16_nooff(i32 %arg) #0 {
513entry:
514  ; FIXME: ABI for pre-gfx9
515  %value = bitcast i32 %arg to <2 x i16>
516  %hi = extractelement <2 x i16> %value, i32 1
  ; Presumably volatile keeps the store to a null address from being dropped - TODO confirm.
517  store volatile i16 %hi, i16 addrspace(5)* null
518  ret void
519}
520
521
522; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_nooff:
523; GCN: s_waitcnt
524
525; GFX900-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s4{{$}}
526
527; NO-D16-HI: v_lshrrev_b32_e32 v0, 16, v0
528; NO-D16-HI: buffer_store_byte v0, off, s[0:3], s4{{$}}
529
530; GCN-NEXT: s_waitcnt
531; GCN-NEXT: s_setpc_b64
; Stores the low byte of lane 1 of %arg (viewed as <2 x i16>) to the null
; addrspace(5) pointer; the checks expect a buffer store with no address register or offset.
532define void @store_private_hi_v2i16_i8_nooff(i32 %arg) #0 {
533entry:
534  %value = bitcast i32 %arg to <2 x i16>
535  %hi = extractelement <2 x i16> %value, i32 1
536  %trunc = trunc i16 %hi to i8
  ; Presumably volatile keeps the store to a null address from being dropped - TODO confirm.
537  store volatile i8 %trunc, i8 addrspace(5)* null
538  ret void
539}
540
541; GCN-LABEL: {{^}}store_local_hi_v2i16:
542; GCN: s_waitcnt
543
544; GFX900-NEXT: ds_write_b16_d16_hi v0, v1{{$}}
545
546; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
547; NO-D16-HI: ds_write_b16 v0, v1
548
549; GCN-NEXT: s_waitcnt
550; GCN-NEXT: s_setpc_b64
; Stores lane 1 of %arg (viewed as <2 x i16>) through the addrspace(3) pointer
; %out; the checks for this function expect ds_write instructions.
551define void @store_local_hi_v2i16(i16 addrspace(3)* %out, i32 %arg) #0 {
552entry:
553  ; FIXME: ABI for pre-gfx9
554  %value = bitcast i32 %arg to <2 x i16>
  ; Lane 1 is the half the non-d16 targets are expected to reach via a 16-bit right shift.
555  %hi = extractelement <2 x i16> %value, i32 1
556  store i16 %hi, i16 addrspace(3)* %out
557  ret void
558}
559
560; GCN-LABEL: {{^}}store_local_hi_v2f16:
561; GCN: s_waitcnt
562
563; GFX900-NEXT: ds_write_b16_d16_hi v0, v1{{$}}
564
565; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
566; NO-D16-HI: ds_write_b16 v0, v1
567
568; GCN-NEXT: s_waitcnt
569; GCN-NEXT: s_setpc_b64
; Half-precision variant: stores lane 1 of %arg (viewed as <2 x half>) through
; the addrspace(3) pointer %out.
570define void @store_local_hi_v2f16(half addrspace(3)* %out, i32 %arg) #0 {
571entry:
572  ; FIXME: ABI for pre-gfx9
  ; Same bit pattern as the integer case, but typed as two packed halves.
573  %value = bitcast i32 %arg to <2 x half>
574  %hi = extractelement <2 x half> %value, i32 1
575  store half %hi, half addrspace(3)* %out
576  ret void
577}
578
579; GCN-LABEL: {{^}}store_local_hi_i32_shift:
580; GCN: s_waitcnt
581
582; GFX900-NEXT: ds_write_b16_d16_hi v0, v1{{$}}
583
584; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
585; NO-D16-HI: ds_write_b16 v0, v1
586
587; GCN-NEXT: s_waitcnt
588; GCN-NEXT: s_setpc_b64
; Stores bits 31:16 of %value as an i16 through the addrspace(3) pointer %out,
; expressing the high-half extraction as shift + truncate.
589define void @store_local_hi_i32_shift(i16 addrspace(3)* %out, i32 %value) #0 {
590entry:
  ; Move the upper 16 bits down to bits 15:0, then narrow to i16.
591  %hi32 = lshr i32 %value, 16
592  %hi = trunc i32 %hi32 to i16
593  store i16 %hi, i16 addrspace(3)* %out
594  ret void
595}
596
597; GCN-LABEL: {{^}}store_local_hi_v2i16_i8:
598; GCN: s_waitcnt
599
600; GFX900-NEXT: ds_write_b8_d16_hi v0, v1{{$}}
601
602; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
603; NO-D16-HI: ds_write_b8 v0, v1
604
605; GCN-NEXT: s_waitcnt
606; GCN-NEXT: s_setpc_b64
; Stores the low byte of lane 1 of %arg (viewed as <2 x i16>) through the
; addrspace(3) pointer %out.
607define void @store_local_hi_v2i16_i8(i8 addrspace(3)* %out, i32 %arg) #0 {
608entry:
609  %value = bitcast i32 %arg to <2 x i16>
610  %hi = extractelement <2 x i16> %value, i32 1
  ; Keep only the low 8 bits of the extracted lane so the store is byte-sized.
611  %trunc = trunc i16 %hi to i8
612  store i8 %trunc, i8 addrspace(3)* %out
613  ret void
614}
615
616; GCN-LABEL: {{^}}store_local_hi_v2i16_max_offset:
617; GCN: s_waitcnt
618; GFX900-NEXT: ds_write_b16_d16_hi v0, v1 offset:65534{{$}}
619
620; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1
621; NO-D16-HI: ds_write_b16 v0, v1 offset:65534{{$}}
622
623; GCN-NEXT: s_waitcnt
624; GCN-NEXT: s_setpc_b64
; Stores lane 1 of %arg (viewed as <2 x i16>) to %out plus a positive constant
; offset in addrspace(3).
625define void @store_local_hi_v2i16_max_offset(i16 addrspace(3)* %out, i32 %arg) #0 {
626entry:
627  ; FIXME: ABI for pre-gfx9
628  %value = bitcast i32 %arg to <2 x i16>
629  %hi = extractelement <2 x i16> %value, i32 1
  ; 32767 i16 elements = 65534 bytes, the immediate offset every target's check expects.
630  %gep = getelementptr inbounds i16, i16 addrspace(3)* %out, i64 32767
631  store i16 %hi, i16 addrspace(3)* %gep
632  ret void
633}
634
635; GCN-LABEL: {{^}}store_private_hi_v2i16_to_offset:
636; GCN: s_waitcnt
637; GFX900: buffer_store_dword
638; GFX900-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s5 offset:4094
; Stores lane 1 of %arg (viewed as <2 x i16>) into a function-local addrspace(5)
; array that is allocated after another stack object.
639define void @store_private_hi_v2i16_to_offset(i32 %arg) #0 {
640entry:
  ; Two stack objects: a 40-byte i32 array, then the 8192-byte i16 array that receives the store.
641  %obj0 = alloca [10 x i32], align 4, addrspace(5)
642  %obj1 = alloca [4096 x i16], align 2, addrspace(5)
643  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  ; Volatile write to the first element of %obj0; the gfx900 check expects a dword store for it.
644  store volatile i32 123, i32 addrspace(5)* %bc
645  %value = bitcast i32 %arg to <2 x i16>
646  %hi = extractelement <2 x i16> %value, i32 1
  ; Element 2025 is 4050 bytes into %obj1, while the gfx900 check expects frame offset 4094.
  ; Presumably the other 44 bytes are %obj0 (40) plus a 4-byte frame base - TODO confirm.
647  %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2025
648  store i16 %hi, i16 addrspace(5)* %gep
649  ret void
650}
651
652; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_to_offset:
653; GCN: s_waitcnt
654; GFX900: buffer_store_dword
655; GFX900-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s5 offset:4095
; Stores the low byte of lane 1 of %arg (viewed as <2 x i16>) into a
; function-local addrspace(5) byte array allocated after another stack object.
656define void @store_private_hi_v2i16_i8_to_offset(i32 %arg) #0 {
657entry:
  ; Two stack objects: a 40-byte i32 array, then the 4096-byte i8 array that receives the store.
658  %obj0 = alloca [10 x i32], align 4, addrspace(5)
659  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
660  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  ; Volatile write to the first element of %obj0; the gfx900 check expects a dword store for it.
661  store volatile i32 123, i32 addrspace(5)* %bc
662  %value = bitcast i32 %arg to <2 x i16>
663  %hi = extractelement <2 x i16> %value, i32 1
  ; Element 4051 is 4051 bytes into %obj1, while the gfx900 check expects frame offset 4095.
  ; Presumably the other 44 bytes are %obj0 (40) plus a 4-byte frame base - TODO confirm.
664  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051
665  %trunc = trunc i16 %hi to i8
666  store i8 %trunc, i8 addrspace(5)* %gep
667  ret void
668}
669
; Attribute group referenced as #0 by the function definitions in this file; it marks them nounwind.
670attributes #0 = { nounwind }
671