1; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
2; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
3
4; How the replacement of i64 stores with v2i32 stores resulted in
5; breaking other users of the bitcast if they already existed
6
7; GCN-LABEL: {{^}}extract_vector_elt_select_error:
8; GCN: buffer_store_dword
9; GCN: buffer_store_dword
10; GCN: buffer_store_dwordx2
11define amdgpu_kernel void @extract_vector_elt_select_error(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %val) #0 {
12  %vec = bitcast i64 %val to <2 x i32>
13  %elt0 = extractelement <2 x i32> %vec, i32 0
14  %elt1 = extractelement <2 x i32> %vec, i32 1
15
16  store volatile i32 %elt0, i32 addrspace(1)* %out
17  store volatile i32 %elt1, i32 addrspace(1)* %out
18  store volatile i64 %val, i64 addrspace(1)* %in
19  ret void
20}
21
22; GCN-LABEL: {{^}}extract_vector_elt_v2i64:
23define amdgpu_kernel void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo) #0 {
24  %p0 = extractelement <2 x i64> %foo, i32 0
25  %p1 = extractelement <2 x i64> %foo, i32 1
26  %out1 = getelementptr i64, i64 addrspace(1)* %out, i32 1
27  store volatile i64 %p1, i64 addrspace(1)* %out
28  store volatile i64 %p0, i64 addrspace(1)* %out1
29  ret void
30}
31
32; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64:
33; GCN-NOT: buffer_load
34; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
35; SI-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
36; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
37; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
38; SI: store_dwordx2 v[{{[0-9:]+}}]
39; VI: s_cselect_b64 s{{\[}}[[S_LO:[0-9]+]]:[[S_HI:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]
40; VI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]]
41; VI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[S_HI]]
42; VI: store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
43define amdgpu_kernel void @dyn_extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo, i32 %elt) #0 {
44  %dynelt = extractelement <2 x i64> %foo, i32 %elt
45  store volatile i64 %dynelt, i64 addrspace(1)* %out
46  ret void
47}
48
49; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64_2:
50; GCN:     buffer_load_dwordx4
51; GCN-NOT: buffer_load
52; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
53; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
54; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
55; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
56; GCN: store_dwordx2 v[{{[0-9:]+}}]
57define amdgpu_kernel void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %foo, i32 %elt, <2 x i64> %arst) #0 {
58  %load = load volatile <2 x i64>, <2 x i64> addrspace(1)* %foo
59  %or = or <2 x i64> %load, %arst
60  %dynelt = extractelement <2 x i64> %or, i32 %elt
61  store volatile i64 %dynelt, i64 addrspace(1)* %out
62  ret void
63}
64
65; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3i64:
66; SI-NOT: buffer_load
67; SI-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
68; SI-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
69; SI-DAG: s_cmp_eq_u32 [[IDX]], 2
70; SI-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
71; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
72; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
73; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
74; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
75; SI: store_dwordx2 v[{{[0-9:]+}}]
76; VI: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
77; VI: s_cselect_b64 s{{\[}}[[T0LO:[0-9]+]]:[[T0HI:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]
78; VI: s_cmp_eq_u32 [[IDX:s[0-9]+]], 2
79; VI: s_cselect_b64 s{{\[}}[[T1LO:[0-9]+]]:[[T1HI:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], s{{\[}}[[T0LO]]:[[T0HI]]{{\]}}
80; VI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[T1LO]]
81; VI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[T1HI]]
82; VI: store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
83define amdgpu_kernel void @dyn_extract_vector_elt_v3i64(i64 addrspace(1)* %out, <3 x i64> %foo, i32 %elt) #0 {
84  %dynelt = extractelement <3 x i64> %foo, i32 %elt
85  store volatile i64 %dynelt, i64 addrspace(1)* %out
86  ret void
87}
88
89; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4i64:
90; GCN-NOT: buffer_load
91; SI-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
92; SI-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
93; SI-DAG: s_cmp_eq_u32 [[IDX]], 2
94; SI-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
95; SI-DAG: s_cmp_eq_u32 [[IDX]], 3
96; SI-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
97; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
98; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
99; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
100; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
101; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]]
102; SI-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C3]]
103; SI: store_dwordx2 v[{{[0-9:]+}}]
104; VI: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
105; VI: s_cselect_b64 s{{\[}}[[T0LO:[0-9]+]]:[[T0HI:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]
106; VI: s_cmp_eq_u32 [[IDX:s[0-9]+]], 2
107; VI: s_cselect_b64 s{{\[}}[[T1LO:[0-9]+]]:[[T1HI:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], s{{\[}}[[T0LO]]:[[T0HI]]{{\]}}
108; VI: s_cmp_eq_u32 [[IDX:s[0-9]+]], 3
109; VI: s_cselect_b64 s{{\[}}[[T2LO:[0-9]+]]:[[T2HI:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], s{{\[}}[[T1LO]]:[[T1HI]]{{\]}}
110; VI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[T2LO]]
111; VI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[T2HI]]
112; VI: store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
113define amdgpu_kernel void @dyn_extract_vector_elt_v4i64(i64 addrspace(1)* %out, <4 x i64> %foo, i32 %elt) #0 {
114  %dynelt = extractelement <4 x i64> %foo, i32 %elt
115  store volatile i64 %dynelt, i64 addrspace(1)* %out
116  ret void
117}
118
119attributes #0 = { nounwind }
120