; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
3
; Callable function: i128 shl where both the value and the shift amount are
; function arguments. The checks expect three 64-bit shifts whose results are
; combined with v_cndmask selects on (64 > amount) and (amount == 0).
define i128 @v_shl_i128_vv(i128 %lhs, i128 %rhs) {
; GCN-LABEL: v_shl_i128_vv:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_lshl_b64 v[5:6], v[2:3], v4
; GCN-NEXT:    v_sub_i32_e32 v9, vcc, 64, v4
; GCN-NEXT:    v_subrev_i32_e32 v11, vcc, 64, v4
; GCN-NEXT:    v_lshl_b64 v[7:8], v[0:1], v4
; GCN-NEXT:    v_lshr_b64 v[9:10], v[0:1], v9
; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], v11
; GCN-NEXT:    v_or_b32_e32 v6, v6, v10
; GCN-NEXT:    v_or_b32_e32 v5, v5, v9
; GCN-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
; GCN-NEXT:    v_cndmask_b32_e32 v6, v1, v6, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v8, vcc
; GCN-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v4
; GCN-NEXT:    v_cndmask_b32_e64 v3, v6, v3, s[6:7]
; GCN-NEXT:    v_cndmask_b32_e64 v2, v0, v2, s[6:7]
; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v7, vcc
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %shl = shl i128 %lhs, %rhs
  ret i128 %shl
}
28
; Callable function: i128 lshr where both the value and the shift amount are
; function arguments. The checks mirror the shl case with the 64-bit shift
; directions swapped and the same (64 > amount) / (amount == 0) selects.
define i128 @v_lshr_i128_vv(i128 %lhs, i128 %rhs) {
; GCN-LABEL: v_lshr_i128_vv:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_lshr_b64 v[5:6], v[0:1], v4
; GCN-NEXT:    v_sub_i32_e32 v9, vcc, 64, v4
; GCN-NEXT:    v_subrev_i32_e32 v11, vcc, 64, v4
; GCN-NEXT:    v_lshr_b64 v[7:8], v[2:3], v4
; GCN-NEXT:    v_lshl_b64 v[9:10], v[2:3], v9
; GCN-NEXT:    v_lshr_b64 v[2:3], v[2:3], v11
; GCN-NEXT:    v_or_b32_e32 v6, v6, v10
; GCN-NEXT:    v_or_b32_e32 v5, v5, v9
; GCN-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
; GCN-NEXT:    v_cndmask_b32_e32 v6, v3, v6, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v8, vcc
; GCN-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v4
; GCN-NEXT:    v_cndmask_b32_e64 v1, v6, v1, s[6:7]
; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[6:7]
; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v7, vcc
; GCN-NEXT:    s_setpc_b64 s[30:31]

  %shift = lshr i128 %lhs, %rhs
  ret i128 %shift
}
54
; Callable function: i128 ashr where both the value and the shift amount are
; function arguments. The checks expect a sign word (v3 shifted right by 31)
; to be one of the select inputs for the upper result registers.
define i128 @v_ashr_i128_vv(i128 %lhs, i128 %rhs) {
; GCN-LABEL: v_ashr_i128_vv:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
; GCN-NEXT:    v_ashr_i64 v[5:6], v[2:3], v4
; GCN-NEXT:    v_lshr_b64 v[7:8], v[0:1], v4
; GCN-NEXT:    v_sub_i32_e32 v10, vcc, 64, v4
; GCN-NEXT:    v_subrev_i32_e32 v11, vcc, 64, v4
; GCN-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
; GCN-NEXT:    v_cndmask_b32_e32 v6, v9, v6, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc
; GCN-NEXT:    v_lshl_b64 v[9:10], v[2:3], v10
; GCN-NEXT:    v_ashr_i64 v[2:3], v[2:3], v11
; GCN-NEXT:    v_or_b32_e32 v8, v8, v10
; GCN-NEXT:    v_or_b32_e32 v7, v7, v9
; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
; GCN-NEXT:    v_mov_b32_e32 v2, v5
; GCN-NEXT:    v_mov_b32_e32 v3, v6
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %shift = ashr i128 %lhs, %rhs
  ret i128 %shift
}
82
83
; Callable function: i128 shl of an argument by the constant 17. The checks
; expect two 64-bit shifts by 17 plus a 32-bit right shift by 15 that is
; OR-ed into v2.
define i128 @v_shl_i128_vk(i128 %lhs) {
; GCN-LABEL: v_shl_i128_vk:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_lshl_b64 v[2:3], v[2:3], 17
; GCN-NEXT:    v_lshrrev_b32_e32 v4, 15, v1
; GCN-NEXT:    v_or_b32_e32 v2, v2, v4
; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], 17
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %shl = shl i128 %lhs, 17
  ret i128 %shl
}
96
; Callable function: i128 lshr of an argument by the constant 65. The checks
; expect a single 64-bit shift of v[2:3] by 1 into v[0:1], with v2 and v3
; then set to zero.
define i128 @v_lshr_i128_vk(i128 %lhs) {
; GCN-LABEL: v_lshr_i128_vk:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_lshr_b64 v[0:1], v[2:3], 1
; GCN-NEXT:    v_mov_b32_e32 v2, 0
; GCN-NEXT:    v_mov_b32_e32 v3, 0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %shift = lshr i128 %lhs, 65
  ret i128 %shift
}
108
; Callable function: i128 ashr of an argument by the constant 33. The checks
; expect v[2:3] to be shifted left by 31 and OR-ed with v1 shifted right by 1,
; plus a 64-bit arithmetic shift of v[2:3] by 33.
define i128 @v_ashr_i128_vk(i128 %lhs) {
; GCN-LABEL: v_ashr_i128_vk:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_lshl_b64 v[4:5], v[2:3], 31
; GCN-NEXT:    v_lshrrev_b32_e32 v0, 1, v1
; GCN-NEXT:    v_or_b32_e32 v4, v0, v4
; GCN-NEXT:    v_ashr_i64 v[2:3], v[2:3], 33
; GCN-NEXT:    v_mov_b32_e32 v0, v4
; GCN-NEXT:    v_mov_b32_e32 v1, v5
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %shift = ashr i128 %lhs, 33
  ret i128 %shift
}
123
; Callable function: the constant 17 shifted left by a variable i128 amount.
; The checks expect the constant to appear as an inline operand of the 64-bit
; shifts, followed by selects on (64 > amount) and (amount != 0).
define i128 @v_shl_i128_kv(i128 %rhs) {
; GCN-LABEL: v_shl_i128_kv:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_sub_i32_e32 v1, vcc, 64, v0
; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, 64, v0
; GCN-NEXT:    v_lshl_b64 v[4:5], 17, v0
; GCN-NEXT:    v_lshr_b64 v[1:2], 17, v1
; GCN-NEXT:    v_lshl_b64 v[6:7], 17, v3
; GCN-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v0
; GCN-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v6, v6, v1, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
; GCN-NEXT:    v_cmp_ne_u32_e64 s[6:7], 0, v0
; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, v2, s[6:7]
; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, v6, s[6:7]
; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %shl = shl i128 17, %rhs
  ret i128 %shl
}
145
; Callable function: the constant 65 shifted right (logical) by a variable
; i128 amount. The checks expect 0x41 to be materialized in s[6:7] and v3,
; one 64-bit shift, and v2/v3 of the result set to zero.
define i128 @v_lshr_i128_kv(i128 %rhs) {
; GCN-LABEL: v_lshr_i128_kv:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s7, 0
; GCN-NEXT:    s_movk_i32 s6, 0x41
; GCN-NEXT:    v_mov_b32_e32 v3, 0x41
; GCN-NEXT:    v_lshr_b64 v[1:2], s[6:7], v0
; GCN-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v0
; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v4, 0, v1, vcc
; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
; GCN-NEXT:    v_mov_b32_e32 v2, 0
; GCN-NEXT:    v_mov_b32_e32 v3, 0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %shift = lshr i128 65, %rhs
  ret i128 %shift
}
166
; Callable function: the constant 33 shifted right (arithmetic) by a variable
; i128 amount. The expected code uses v_lshr_b64 with 33 as an inline operand
; and sets v2/v3 of the result to zero.
define i128 @v_ashr_i128_kv(i128 %rhs) {
; GCN-LABEL: v_ashr_i128_kv:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_lshr_b64 v[1:2], 33, v0
; GCN-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v0
; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v1, vcc
; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v0, 33, v3, vcc
; GCN-NEXT:    v_mov_b32_e32 v2, 0
; GCN-NEXT:    v_mov_b32_e32 v3, 0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %shift = ashr i128 33, %rhs
  ret i128 %shift
}
184
; Kernel: i128 shl where both operands are kernel arguments; the result is
; stored to a null addrspace(1) pointer. The checks cover the complete
; .amd_kernel_code_t header, one s_load_dwordx8 of the arguments, scalar
; 64-bit shifts, and v_cndmask selects feeding flat_store_dwordx4.
define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) {
; GCN-LABEL: s_shl_i128_ss:
; GCN:         .amd_kernel_code_t
; GCN-NEXT:     amd_code_version_major = 1
; GCN-NEXT:     amd_code_version_minor = 2
; GCN-NEXT:     amd_machine_kind = 1
; GCN-NEXT:     amd_machine_version_major = 7
; GCN-NEXT:     amd_machine_version_minor = 0
; GCN-NEXT:     amd_machine_version_stepping = 0
; GCN-NEXT:     kernel_code_entry_byte_offset = 256
; GCN-NEXT:     kernel_code_prefetch_byte_size = 0
; GCN-NEXT:     granulated_workitem_vgpr_count = 1
; GCN-NEXT:     granulated_wavefront_sgpr_count = 1
; GCN-NEXT:     priority = 0
; GCN-NEXT:     float_mode = 192
; GCN-NEXT:     priv = 0
; GCN-NEXT:     enable_dx10_clamp = 1
; GCN-NEXT:     debug_mode = 0
; GCN-NEXT:     enable_ieee_mode = 1
; GCN-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
; GCN-NEXT:     user_sgpr_count = 6
; GCN-NEXT:     enable_trap_handler = 0
; GCN-NEXT:     enable_sgpr_workgroup_id_x = 1
; GCN-NEXT:     enable_sgpr_workgroup_id_y = 0
; GCN-NEXT:     enable_sgpr_workgroup_id_z = 0
; GCN-NEXT:     enable_sgpr_workgroup_info = 0
; GCN-NEXT:     enable_vgpr_workitem_id = 0
; GCN-NEXT:     enable_exception_msb = 0
; GCN-NEXT:     granulated_lds_size = 0
; GCN-NEXT:     enable_exception = 0
; GCN-NEXT:     enable_sgpr_private_segment_buffer = 1
; GCN-NEXT:     enable_sgpr_dispatch_ptr = 0
; GCN-NEXT:     enable_sgpr_queue_ptr = 0
; GCN-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
; GCN-NEXT:     enable_sgpr_dispatch_id = 0
; GCN-NEXT:     enable_sgpr_flat_scratch_init = 0
; GCN-NEXT:     enable_sgpr_private_segment_size = 0
; GCN-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
; GCN-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
; GCN-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
; GCN-NEXT:     enable_ordered_append_gds = 0
; GCN-NEXT:     private_element_size = 1
; GCN-NEXT:     is_ptr64 = 1
; GCN-NEXT:     is_dynamic_callstack = 0
; GCN-NEXT:     is_debug_enabled = 0
; GCN-NEXT:     is_xnack_enabled = 0
; GCN-NEXT:     workitem_private_segment_byte_size = 0
; GCN-NEXT:     workgroup_group_segment_byte_size = 0
; GCN-NEXT:     gds_segment_byte_size = 0
; GCN-NEXT:     kernarg_segment_byte_size = 32
; GCN-NEXT:     workgroup_fbarrier_count = 0
; GCN-NEXT:     wavefront_sgpr_count = 15
; GCN-NEXT:     workitem_vgpr_count = 8
; GCN-NEXT:     reserved_vgpr_first = 0
; GCN-NEXT:     reserved_vgpr_count = 0
; GCN-NEXT:     reserved_sgpr_first = 0
; GCN-NEXT:     reserved_sgpr_count = 0
; GCN-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
; GCN-NEXT:     debug_private_segment_buffer_sgpr = 0
; GCN-NEXT:     kernarg_segment_alignment = 4
; GCN-NEXT:     group_segment_alignment = 4
; GCN-NEXT:     private_segment_alignment = 4
; GCN-NEXT:     wavefront_size = 6
; GCN-NEXT:     call_convention = -1
; GCN-NEXT:     runtime_loader_kernel_symbol = 0
; GCN-NEXT:    .end_amd_kernel_code_t
; GCN-NEXT:  ; %bb.0:
; GCN-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_lshl_b64 s[6:7], s[2:3], s4
; GCN-NEXT:    s_sub_i32 s5, 64, s4
; GCN-NEXT:    s_sub_i32 s12, s4, 64
; GCN-NEXT:    s_lshl_b64 s[8:9], s[0:1], s4
; GCN-NEXT:    s_lshr_b64 s[10:11], s[0:1], s5
; GCN-NEXT:    s_lshl_b64 s[0:1], s[0:1], s12
; GCN-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
; GCN-NEXT:    v_mov_b32_e32 v0, s3
; GCN-NEXT:    v_mov_b32_e32 v2, s2
; GCN-NEXT:    v_mov_b32_e32 v1, s9
; GCN-NEXT:    v_mov_b32_e32 v4, s8
; GCN-NEXT:    v_mov_b32_e32 v3, s1
; GCN-NEXT:    v_mov_b32_e32 v5, s0
; GCN-NEXT:    v_mov_b32_e32 v6, s7
; GCN-NEXT:    v_cmp_lt_u32_e64 vcc, s4, 64
; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
; GCN-NEXT:    v_mov_b32_e32 v6, s6
; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 0
; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v0, s[0:1]
; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v4, 0
; GCN-NEXT:    v_mov_b32_e32 v5, 0
; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT:    s_endpgm
  %shift = shl i128 %lhs, %rhs
  store i128 %shift, i128 addrspace(1)* null
  ret void
}
285
; Kernel: i128 lshr where both operands are kernel arguments; the result is
; stored to a null addrspace(1) pointer. The checks cover the complete
; .amd_kernel_code_t header, one s_load_dwordx8 of the arguments, scalar
; 64-bit shifts, and v_cndmask selects feeding flat_store_dwordx4.
define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) {
; GCN-LABEL: s_lshr_i128_ss:
; GCN:         .amd_kernel_code_t
; GCN-NEXT:     amd_code_version_major = 1
; GCN-NEXT:     amd_code_version_minor = 2
; GCN-NEXT:     amd_machine_kind = 1
; GCN-NEXT:     amd_machine_version_major = 7
; GCN-NEXT:     amd_machine_version_minor = 0
; GCN-NEXT:     amd_machine_version_stepping = 0
; GCN-NEXT:     kernel_code_entry_byte_offset = 256
; GCN-NEXT:     kernel_code_prefetch_byte_size = 0
; GCN-NEXT:     granulated_workitem_vgpr_count = 1
; GCN-NEXT:     granulated_wavefront_sgpr_count = 1
; GCN-NEXT:     priority = 0
; GCN-NEXT:     float_mode = 192
; GCN-NEXT:     priv = 0
; GCN-NEXT:     enable_dx10_clamp = 1
; GCN-NEXT:     debug_mode = 0
; GCN-NEXT:     enable_ieee_mode = 1
; GCN-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
; GCN-NEXT:     user_sgpr_count = 6
; GCN-NEXT:     enable_trap_handler = 0
; GCN-NEXT:     enable_sgpr_workgroup_id_x = 1
; GCN-NEXT:     enable_sgpr_workgroup_id_y = 0
; GCN-NEXT:     enable_sgpr_workgroup_id_z = 0
; GCN-NEXT:     enable_sgpr_workgroup_info = 0
; GCN-NEXT:     enable_vgpr_workitem_id = 0
; GCN-NEXT:     enable_exception_msb = 0
; GCN-NEXT:     granulated_lds_size = 0
; GCN-NEXT:     enable_exception = 0
; GCN-NEXT:     enable_sgpr_private_segment_buffer = 1
; GCN-NEXT:     enable_sgpr_dispatch_ptr = 0
; GCN-NEXT:     enable_sgpr_queue_ptr = 0
; GCN-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
; GCN-NEXT:     enable_sgpr_dispatch_id = 0
; GCN-NEXT:     enable_sgpr_flat_scratch_init = 0
; GCN-NEXT:     enable_sgpr_private_segment_size = 0
; GCN-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
; GCN-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
; GCN-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
; GCN-NEXT:     enable_ordered_append_gds = 0
; GCN-NEXT:     private_element_size = 1
; GCN-NEXT:     is_ptr64 = 1
; GCN-NEXT:     is_dynamic_callstack = 0
; GCN-NEXT:     is_debug_enabled = 0
; GCN-NEXT:     is_xnack_enabled = 0
; GCN-NEXT:     workitem_private_segment_byte_size = 0
; GCN-NEXT:     workgroup_group_segment_byte_size = 0
; GCN-NEXT:     gds_segment_byte_size = 0
; GCN-NEXT:     kernarg_segment_byte_size = 32
; GCN-NEXT:     workgroup_fbarrier_count = 0
; GCN-NEXT:     wavefront_sgpr_count = 15
; GCN-NEXT:     workitem_vgpr_count = 8
; GCN-NEXT:     reserved_vgpr_first = 0
; GCN-NEXT:     reserved_vgpr_count = 0
; GCN-NEXT:     reserved_sgpr_first = 0
; GCN-NEXT:     reserved_sgpr_count = 0
; GCN-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
; GCN-NEXT:     debug_private_segment_buffer_sgpr = 0
; GCN-NEXT:     kernarg_segment_alignment = 4
; GCN-NEXT:     group_segment_alignment = 4
; GCN-NEXT:     private_segment_alignment = 4
; GCN-NEXT:     wavefront_size = 6
; GCN-NEXT:     call_convention = -1
; GCN-NEXT:     runtime_loader_kernel_symbol = 0
; GCN-NEXT:    .end_amd_kernel_code_t
; GCN-NEXT:  ; %bb.0:
; GCN-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_lshr_b64 s[6:7], s[0:1], s4
; GCN-NEXT:    s_sub_i32 s5, 64, s4
; GCN-NEXT:    s_sub_i32 s12, s4, 64
; GCN-NEXT:    s_lshr_b64 s[8:9], s[2:3], s4
; GCN-NEXT:    s_lshl_b64 s[10:11], s[2:3], s5
; GCN-NEXT:    s_lshr_b64 s[2:3], s[2:3], s12
; GCN-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
; GCN-NEXT:    v_mov_b32_e32 v0, s1
; GCN-NEXT:    v_mov_b32_e32 v4, s0
; GCN-NEXT:    v_mov_b32_e32 v2, s9
; GCN-NEXT:    v_mov_b32_e32 v5, s8
; GCN-NEXT:    v_mov_b32_e32 v1, s3
; GCN-NEXT:    v_mov_b32_e32 v3, s2
; GCN-NEXT:    v_mov_b32_e32 v6, s7
; GCN-NEXT:    v_cmp_lt_u32_e64 vcc, s4, 64
; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
; GCN-NEXT:    v_mov_b32_e32 v6, s6
; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 0
; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[0:1]
; GCN-NEXT:    v_cndmask_b32_e32 v0, v3, v6, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v2, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v4, 0
; GCN-NEXT:    v_mov_b32_e32 v5, 0
; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT:    s_endpgm
  %shift = lshr i128 %lhs, %rhs
  store i128 %shift, i128 addrspace(1)* null
  ret void
}
386
; Kernel: i128 ashr where both operands are kernel arguments; the result is
; stored to a null addrspace(1) pointer. The checks cover the complete
; .amd_kernel_code_t header, one s_load_dwordx8 of the arguments, scalar
; shifts including an s_ashr_i32 of s3 by 31, and v_cndmask selects feeding
; flat_store_dwordx4.
define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) {
; GCN-LABEL: s_ashr_i128_ss:
; GCN:         .amd_kernel_code_t
; GCN-NEXT:     amd_code_version_major = 1
; GCN-NEXT:     amd_code_version_minor = 2
; GCN-NEXT:     amd_machine_kind = 1
; GCN-NEXT:     amd_machine_version_major = 7
; GCN-NEXT:     amd_machine_version_minor = 0
; GCN-NEXT:     amd_machine_version_stepping = 0
; GCN-NEXT:     kernel_code_entry_byte_offset = 256
; GCN-NEXT:     kernel_code_prefetch_byte_size = 0
; GCN-NEXT:     granulated_workitem_vgpr_count = 1
; GCN-NEXT:     granulated_wavefront_sgpr_count = 1
; GCN-NEXT:     priority = 0
; GCN-NEXT:     float_mode = 192
; GCN-NEXT:     priv = 0
; GCN-NEXT:     enable_dx10_clamp = 1
; GCN-NEXT:     debug_mode = 0
; GCN-NEXT:     enable_ieee_mode = 1
; GCN-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
; GCN-NEXT:     user_sgpr_count = 6
; GCN-NEXT:     enable_trap_handler = 0
; GCN-NEXT:     enable_sgpr_workgroup_id_x = 1
; GCN-NEXT:     enable_sgpr_workgroup_id_y = 0
; GCN-NEXT:     enable_sgpr_workgroup_id_z = 0
; GCN-NEXT:     enable_sgpr_workgroup_info = 0
; GCN-NEXT:     enable_vgpr_workitem_id = 0
; GCN-NEXT:     enable_exception_msb = 0
; GCN-NEXT:     granulated_lds_size = 0
; GCN-NEXT:     enable_exception = 0
; GCN-NEXT:     enable_sgpr_private_segment_buffer = 1
; GCN-NEXT:     enable_sgpr_dispatch_ptr = 0
; GCN-NEXT:     enable_sgpr_queue_ptr = 0
; GCN-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
; GCN-NEXT:     enable_sgpr_dispatch_id = 0
; GCN-NEXT:     enable_sgpr_flat_scratch_init = 0
; GCN-NEXT:     enable_sgpr_private_segment_size = 0
; GCN-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
; GCN-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
; GCN-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
; GCN-NEXT:     enable_ordered_append_gds = 0
; GCN-NEXT:     private_element_size = 1
; GCN-NEXT:     is_ptr64 = 1
; GCN-NEXT:     is_dynamic_callstack = 0
; GCN-NEXT:     is_debug_enabled = 0
; GCN-NEXT:     is_xnack_enabled = 0
; GCN-NEXT:     workitem_private_segment_byte_size = 0
; GCN-NEXT:     workgroup_group_segment_byte_size = 0
; GCN-NEXT:     gds_segment_byte_size = 0
; GCN-NEXT:     kernarg_segment_byte_size = 32
; GCN-NEXT:     workgroup_fbarrier_count = 0
; GCN-NEXT:     wavefront_sgpr_count = 16
; GCN-NEXT:     workitem_vgpr_count = 8
; GCN-NEXT:     reserved_vgpr_first = 0
; GCN-NEXT:     reserved_vgpr_count = 0
; GCN-NEXT:     reserved_sgpr_first = 0
; GCN-NEXT:     reserved_sgpr_count = 0
; GCN-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
; GCN-NEXT:     debug_private_segment_buffer_sgpr = 0
; GCN-NEXT:     kernarg_segment_alignment = 4
; GCN-NEXT:     group_segment_alignment = 4
; GCN-NEXT:     private_segment_alignment = 4
; GCN-NEXT:     wavefront_size = 6
; GCN-NEXT:     call_convention = -1
; GCN-NEXT:     runtime_loader_kernel_symbol = 0
; GCN-NEXT:    .end_amd_kernel_code_t
; GCN-NEXT:  ; %bb.0:
; GCN-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_lshr_b64 s[6:7], s[0:1], s4
; GCN-NEXT:    s_sub_i32 s5, 64, s4
; GCN-NEXT:    s_sub_i32 s12, s4, 64
; GCN-NEXT:    s_ashr_i64 s[8:9], s[2:3], s4
; GCN-NEXT:    s_ashr_i32 s13, s3, 31
; GCN-NEXT:    s_lshl_b64 s[10:11], s[2:3], s5
; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], s12
; GCN-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
; GCN-NEXT:    v_mov_b32_e32 v0, s1
; GCN-NEXT:    v_mov_b32_e32 v4, s0
; GCN-NEXT:    v_mov_b32_e32 v2, s13
; GCN-NEXT:    v_mov_b32_e32 v3, s9
; GCN-NEXT:    v_mov_b32_e32 v5, s8
; GCN-NEXT:    v_mov_b32_e32 v1, s3
; GCN-NEXT:    v_mov_b32_e32 v6, s2
; GCN-NEXT:    v_mov_b32_e32 v7, s7
; GCN-NEXT:    v_cmp_lt_u32_e64 vcc, s4, 64
; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
; GCN-NEXT:    v_mov_b32_e32 v7, s6
; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 0
; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v0, s[0:1]
; GCN-NEXT:    v_cndmask_b32_e32 v0, v6, v7, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v3, v2, v3, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v4, 0
; GCN-NEXT:    v_mov_b32_e32 v5, 0
; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT:    s_endpgm
  %shift = ashr i128 %lhs, %rhs
  store i128 %shift, i128 addrspace(1)* null
  ret void
}
489
; Callable function: element-wise shl of <2 x i128> by <2 x i128>. The checks
; expect each element to be expanded separately, with 64-bit compares
; (v_cmp_eq_u64 / v_cmp_gt_u64) on the halves of each shift amount driving the
; selects.
define <2 x i128> @v_shl_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-LABEL: v_shl_v2i128_vv:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_lshl_b64 v[16:17], v[2:3], v8
; GCN-NEXT:    v_sub_i32_e32 v18, vcc, 64, v8
; GCN-NEXT:    v_lshr_b64 v[18:19], v[0:1], v18
; GCN-NEXT:    v_or_b32_e32 v20, v17, v19
; GCN-NEXT:    v_or_b32_e32 v21, v16, v18
; GCN-NEXT:    v_sub_i32_e32 v16, vcc, 64, v12
; GCN-NEXT:    v_lshr_b64 v[16:17], v[4:5], v16
; GCN-NEXT:    v_lshl_b64 v[18:19], v[6:7], v12
; GCN-NEXT:    v_or_b32_e32 v17, v19, v17
; GCN-NEXT:    v_or_b32_e32 v16, v18, v16
; GCN-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
; GCN-NEXT:    v_or_b32_e32 v11, v9, v11
; GCN-NEXT:    v_or_b32_e32 v10, v8, v10
; GCN-NEXT:    v_cmp_eq_u64_e64 s[8:9], 0, v[14:15]
; GCN-NEXT:    v_or_b32_e32 v15, v13, v15
; GCN-NEXT:    v_or_b32_e32 v14, v12, v14
; GCN-NEXT:    v_cmp_gt_u64_e64 s[10:11], 64, v[8:9]
; GCN-NEXT:    v_subrev_i32_e32 v18, vcc, 64, v8
; GCN-NEXT:    v_lshl_b64 v[8:9], v[0:1], v8
; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], v18
; GCN-NEXT:    v_cmp_gt_u64_e64 s[12:13], 64, v[12:13]
; GCN-NEXT:    v_subrev_i32_e32 v18, vcc, 64, v12
; GCN-NEXT:    v_lshl_b64 v[12:13], v[4:5], v12
; GCN-NEXT:    v_lshl_b64 v[4:5], v[4:5], v18
; GCN-NEXT:    s_and_b64 vcc, s[6:7], s[10:11]
; GCN-NEXT:    v_cndmask_b32_e32 v18, v1, v20, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v19, v0, v21, vcc
; GCN-NEXT:    s_and_b64 s[6:7], s[8:9], s[12:13]
; GCN-NEXT:    v_cndmask_b32_e64 v17, v5, v17, s[6:7]
; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v16, s[6:7]
; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v9, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v8, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, v13, s[6:7]
; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
; GCN-NEXT:    v_cndmask_b32_e32 v3, v18, v3, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v2, v19, v2, vcc
; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
; GCN-NEXT:    v_cndmask_b32_e32 v7, v17, v7, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v6, v4, v6, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, v12, s[6:7]
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %shl = shl <2 x i128> %lhs, %rhs
  ret <2 x i128> %shl
}
538
; Callable function: element-wise lshr of <2 x i128> by <2 x i128>. The
; checks expect each element to be expanded separately, with 64-bit compares
; (v_cmp_eq_u64 / v_cmp_gt_u64) on the halves of each shift amount driving the
; selects.
define <2 x i128> @v_lshr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-LABEL: v_lshr_v2i128_vv:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_lshr_b64 v[16:17], v[0:1], v8
; GCN-NEXT:    v_sub_i32_e32 v18, vcc, 64, v8
; GCN-NEXT:    v_lshl_b64 v[18:19], v[2:3], v18
; GCN-NEXT:    v_or_b32_e32 v20, v17, v19
; GCN-NEXT:    v_or_b32_e32 v21, v16, v18
; GCN-NEXT:    v_sub_i32_e32 v16, vcc, 64, v12
; GCN-NEXT:    v_lshl_b64 v[16:17], v[6:7], v16
; GCN-NEXT:    v_lshr_b64 v[18:19], v[4:5], v12
; GCN-NEXT:    v_or_b32_e32 v17, v19, v17
; GCN-NEXT:    v_or_b32_e32 v16, v18, v16
; GCN-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
; GCN-NEXT:    v_or_b32_e32 v11, v9, v11
; GCN-NEXT:    v_or_b32_e32 v10, v8, v10
; GCN-NEXT:    v_cmp_eq_u64_e64 s[8:9], 0, v[14:15]
; GCN-NEXT:    v_or_b32_e32 v15, v13, v15
; GCN-NEXT:    v_or_b32_e32 v14, v12, v14
; GCN-NEXT:    v_cmp_gt_u64_e64 s[10:11], 64, v[8:9]
; GCN-NEXT:    v_subrev_i32_e32 v18, vcc, 64, v8
; GCN-NEXT:    v_lshr_b64 v[8:9], v[2:3], v8
; GCN-NEXT:    v_lshr_b64 v[2:3], v[2:3], v18
; GCN-NEXT:    v_cmp_gt_u64_e64 s[12:13], 64, v[12:13]
; GCN-NEXT:    v_subrev_i32_e32 v18, vcc, 64, v12
; GCN-NEXT:    v_lshr_b64 v[12:13], v[6:7], v12
; GCN-NEXT:    v_lshr_b64 v[6:7], v[6:7], v18
; GCN-NEXT:    s_and_b64 vcc, s[6:7], s[10:11]
; GCN-NEXT:    v_cndmask_b32_e32 v18, v3, v20, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v19, v2, v21, vcc
; GCN-NEXT:    s_and_b64 s[6:7], s[8:9], s[12:13]
; GCN-NEXT:    v_cndmask_b32_e64 v17, v7, v17, s[6:7]
; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v16, s[6:7]
; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, v13, s[6:7]
; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
; GCN-NEXT:    v_cndmask_b32_e32 v1, v18, v1, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v0, v19, v0, vcc
; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
; GCN-NEXT:    v_cndmask_b32_e32 v5, v17, v5, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, v12, s[6:7]
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %shift = lshr <2 x i128> %lhs, %rhs
  ret <2 x i128> %shift
}
587
; Callable function: element-wise ashr of <2 x i128> by <2 x i128>. The
; checks expect each element to be expanded separately, with per-element sign
; words (v3 and v7 shifted right by 31) among the select inputs for the upper
; result registers.
define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-LABEL: v_ashr_v2i128_vv:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_lshr_b64 v[16:17], v[0:1], v8
; GCN-NEXT:    v_sub_i32_e32 v18, vcc, 64, v8
; GCN-NEXT:    v_lshl_b64 v[18:19], v[2:3], v18
; GCN-NEXT:    v_or_b32_e32 v20, v17, v19
; GCN-NEXT:    v_or_b32_e32 v21, v16, v18
; GCN-NEXT:    v_sub_i32_e32 v16, vcc, 64, v12
; GCN-NEXT:    v_lshl_b64 v[16:17], v[6:7], v16
; GCN-NEXT:    v_lshr_b64 v[18:19], v[4:5], v12
; GCN-NEXT:    v_or_b32_e32 v19, v19, v17
; GCN-NEXT:    v_or_b32_e32 v18, v18, v16
; GCN-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
; GCN-NEXT:    v_or_b32_e32 v11, v9, v11
; GCN-NEXT:    v_or_b32_e32 v10, v8, v10
; GCN-NEXT:    v_cmp_eq_u64_e64 s[8:9], 0, v[14:15]
; GCN-NEXT:    v_or_b32_e32 v15, v13, v15
; GCN-NEXT:    v_or_b32_e32 v14, v12, v14
; GCN-NEXT:    v_cmp_gt_u64_e64 s[10:11], 64, v[8:9]
; GCN-NEXT:    v_subrev_i32_e32 v9, vcc, 64, v8
; GCN-NEXT:    v_ashr_i64 v[16:17], v[2:3], v9
; GCN-NEXT:    s_and_b64 s[6:7], s[6:7], s[10:11]
; GCN-NEXT:    v_cndmask_b32_e64 v17, v17, v20, s[6:7]
; GCN-NEXT:    v_cndmask_b32_e64 v16, v16, v21, s[6:7]
; GCN-NEXT:    v_cmp_gt_u64_e64 s[10:11], 64, v[12:13]
; GCN-NEXT:    v_ashr_i64 v[8:9], v[2:3], v8
; GCN-NEXT:    v_ashrrev_i32_e32 v20, 31, v3
; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v12
; GCN-NEXT:    v_ashr_i64 v[12:13], v[6:7], v12
; GCN-NEXT:    v_ashrrev_i32_e32 v21, 31, v7
; GCN-NEXT:    v_ashr_i64 v[2:3], v[6:7], v2
; GCN-NEXT:    s_and_b64 vcc, s[8:9], s[10:11]
; GCN-NEXT:    v_cndmask_b32_e32 v6, v3, v19, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v18, v2, v18, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v3, v20, v9, s[6:7]
; GCN-NEXT:    v_cndmask_b32_e64 v2, v20, v8, s[6:7]
; GCN-NEXT:    v_cndmask_b32_e32 v7, v21, v13, vcc
; GCN-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
; GCN-NEXT:    v_cndmask_b32_e64 v1, v17, v1, s[6:7]
; GCN-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s[6:7]
; GCN-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
; GCN-NEXT:    v_cndmask_b32_e64 v5, v6, v5, s[6:7]
; GCN-NEXT:    v_cndmask_b32_e64 v4, v18, v4, s[6:7]
; GCN-NEXT:    v_cndmask_b32_e32 v6, v21, v12, vcc
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %shift = ashr <2 x i128> %lhs, %rhs
  ret <2 x i128> %shift
}
638
639define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) {
640; GCN-LABEL: s_shl_v2i128ss:
641; GCN:         .amd_kernel_code_t
642; GCN-NEXT:     amd_code_version_major = 1
643; GCN-NEXT:     amd_code_version_minor = 2
644; GCN-NEXT:     amd_machine_kind = 1
645; GCN-NEXT:     amd_machine_version_major = 7
646; GCN-NEXT:     amd_machine_version_minor = 0
647; GCN-NEXT:     amd_machine_version_stepping = 0
648; GCN-NEXT:     kernel_code_entry_byte_offset = 256
649; GCN-NEXT:     kernel_code_prefetch_byte_size = 0
650; GCN-NEXT:     granulated_workitem_vgpr_count = 3
651; GCN-NEXT:     granulated_wavefront_sgpr_count = 4
652; GCN-NEXT:     priority = 0
653; GCN-NEXT:     float_mode = 192
654; GCN-NEXT:     priv = 0
655; GCN-NEXT:     enable_dx10_clamp = 1
656; GCN-NEXT:     debug_mode = 0
657; GCN-NEXT:     enable_ieee_mode = 1
658; GCN-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
659; GCN-NEXT:     user_sgpr_count = 6
660; GCN-NEXT:     enable_trap_handler = 0
661; GCN-NEXT:     enable_sgpr_workgroup_id_x = 1
662; GCN-NEXT:     enable_sgpr_workgroup_id_y = 0
663; GCN-NEXT:     enable_sgpr_workgroup_id_z = 0
664; GCN-NEXT:     enable_sgpr_workgroup_info = 0
665; GCN-NEXT:     enable_vgpr_workitem_id = 0
666; GCN-NEXT:     enable_exception_msb = 0
667; GCN-NEXT:     granulated_lds_size = 0
668; GCN-NEXT:     enable_exception = 0
669; GCN-NEXT:     enable_sgpr_private_segment_buffer = 1
670; GCN-NEXT:     enable_sgpr_dispatch_ptr = 0
671; GCN-NEXT:     enable_sgpr_queue_ptr = 0
672; GCN-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
673; GCN-NEXT:     enable_sgpr_dispatch_id = 0
674; GCN-NEXT:     enable_sgpr_flat_scratch_init = 0
675; GCN-NEXT:     enable_sgpr_private_segment_size = 0
676; GCN-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
677; GCN-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
678; GCN-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
679; GCN-NEXT:     enable_ordered_append_gds = 0
680; GCN-NEXT:     private_element_size = 1
681; GCN-NEXT:     is_ptr64 = 1
682; GCN-NEXT:     is_dynamic_callstack = 0
683; GCN-NEXT:     is_debug_enabled = 0
684; GCN-NEXT:     is_xnack_enabled = 0
685; GCN-NEXT:     workitem_private_segment_byte_size = 0
686; GCN-NEXT:     workgroup_group_segment_byte_size = 0
687; GCN-NEXT:     gds_segment_byte_size = 0
688; GCN-NEXT:     kernarg_segment_byte_size = 64
689; GCN-NEXT:     workgroup_fbarrier_count = 0
690; GCN-NEXT:     wavefront_sgpr_count = 36
691; GCN-NEXT:     workitem_vgpr_count = 16
692; GCN-NEXT:     reserved_vgpr_first = 0
693; GCN-NEXT:     reserved_vgpr_count = 0
694; GCN-NEXT:     reserved_sgpr_first = 0
695; GCN-NEXT:     reserved_sgpr_count = 0
696; GCN-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
697; GCN-NEXT:     debug_private_segment_buffer_sgpr = 0
698; GCN-NEXT:     kernarg_segment_alignment = 5
699; GCN-NEXT:     group_segment_alignment = 4
700; GCN-NEXT:     private_segment_alignment = 4
701; GCN-NEXT:     wavefront_size = 6
702; GCN-NEXT:     call_convention = -1
703; GCN-NEXT:     runtime_loader_kernel_symbol = 0
704; GCN-NEXT:    .end_amd_kernel_code_t
705; GCN-NEXT:  ; %bb.0:
706; GCN-NEXT:    s_load_dwordx8 s[12:19], s[4:5], 0x8
707; GCN-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
708; GCN-NEXT:    s_waitcnt lgkmcnt(0)
709; GCN-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[12:13], 64
710; GCN-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[14:15], 0
711; GCN-NEXT:    s_lshl_b64 s[20:21], s[4:5], s12
712; GCN-NEXT:    s_lshl_b64 s[22:23], s[6:7], s12
713; GCN-NEXT:    s_sub_i32 s30, 64, s12
714; GCN-NEXT:    s_sub_i32 s31, s12, 64
715; GCN-NEXT:    s_sub_i32 s32, 64, s16
716; GCN-NEXT:    s_sub_i32 s33, s16, 64
717; GCN-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
718; GCN-NEXT:    v_cmp_lt_u64_e64 s[14:15], s[16:17], 64
719; GCN-NEXT:    v_cmp_eq_u64_e64 s[24:25], s[18:19], 0
720; GCN-NEXT:    s_lshl_b64 s[26:27], s[8:9], s16
721; GCN-NEXT:    s_lshl_b64 s[28:29], s[10:11], s16
722; GCN-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
723; GCN-NEXT:    v_mov_b32_e32 v0, s21
724; GCN-NEXT:    v_mov_b32_e32 v2, s20
725; GCN-NEXT:    s_and_b64 vcc, s[2:3], s[0:1]
726; GCN-NEXT:    v_mov_b32_e32 v8, 0
727; GCN-NEXT:    v_mov_b32_e32 v9, 0
728; GCN-NEXT:    v_mov_b32_e32 v10, 16
729; GCN-NEXT:    v_mov_b32_e32 v11, 0
730; GCN-NEXT:    v_mov_b32_e32 v3, s7
731; GCN-NEXT:    v_mov_b32_e32 v6, s6
732; GCN-NEXT:    v_mov_b32_e32 v7, s11
733; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
734; GCN-NEXT:    v_mov_b32_e32 v0, s27
735; GCN-NEXT:    s_and_b64 s[0:1], s[24:25], s[14:15]
736; GCN-NEXT:    s_lshr_b64 s[2:3], s[4:5], s30
737; GCN-NEXT:    s_lshl_b64 s[4:5], s[4:5], s31
738; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, v0, s[0:1]
739; GCN-NEXT:    v_mov_b32_e32 v4, s26
740; GCN-NEXT:    s_lshr_b64 s[6:7], s[8:9], s32
741; GCN-NEXT:    s_lshl_b64 s[8:9], s[8:9], s33
742; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
743; GCN-NEXT:    s_or_b64 s[2:3], s[22:23], s[2:3]
744; GCN-NEXT:    v_mov_b32_e32 v2, s5
745; GCN-NEXT:    v_mov_b32_e32 v12, s4
746; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[0:1]
747; GCN-NEXT:    s_or_b64 s[4:5], s[28:29], s[6:7]
748; GCN-NEXT:    v_mov_b32_e32 v13, s9
749; GCN-NEXT:    v_mov_b32_e32 v14, s8
750; GCN-NEXT:    v_mov_b32_e32 v15, s3
751; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v15, vcc
752; GCN-NEXT:    v_mov_b32_e32 v15, s2
753; GCN-NEXT:    v_cndmask_b32_e32 v12, v12, v15, vcc
754; GCN-NEXT:    v_mov_b32_e32 v15, s5
755; GCN-NEXT:    v_cndmask_b32_e64 v13, v13, v15, s[0:1]
756; GCN-NEXT:    v_mov_b32_e32 v15, s4
757; GCN-NEXT:    v_cndmask_b32_e64 v14, v14, v15, s[0:1]
758; GCN-NEXT:    v_mov_b32_e32 v15, s10
759; GCN-NEXT:    v_cmp_eq_u64_e64 vcc, s[12:13], 0
760; GCN-NEXT:    v_cndmask_b32_e32 v3, v2, v3, vcc
761; GCN-NEXT:    v_cndmask_b32_e32 v2, v12, v6, vcc
762; GCN-NEXT:    v_cmp_eq_u64_e64 vcc, s[16:17], 0
763; GCN-NEXT:    v_cndmask_b32_e32 v7, v13, v7, vcc
764; GCN-NEXT:    v_cndmask_b32_e32 v6, v14, v15, vcc
765; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
766; GCN-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
767; GCN-NEXT:    s_endpgm
768  %shift = shl <2 x i128> %lhs, %rhs
769  store <2 x i128> %shift, <2 x i128> addrspace(1)* null
770  ret void
771}
772
; Codegen test: an amdgpu_kernel that logically shifts a <2 x i128> vector
; right by a per-element <2 x i128> amount. Both operands are kernel
; arguments, and the result is stored to the null pointer in addrspace(1)
; (presumably only so the shift has a user and is not deleted as dead code).
; The check lines inside the function pin the full expected llc output: the
; amd_kernel_code_t header followed by the instruction sequence. Per the NOTE
; at the top of the file they are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
773define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
774; GCN-LABEL: s_lshr_v2i128_ss:
775; GCN:         .amd_kernel_code_t
776; GCN-NEXT:     amd_code_version_major = 1
777; GCN-NEXT:     amd_code_version_minor = 2
778; GCN-NEXT:     amd_machine_kind = 1
779; GCN-NEXT:     amd_machine_version_major = 7
780; GCN-NEXT:     amd_machine_version_minor = 0
781; GCN-NEXT:     amd_machine_version_stepping = 0
782; GCN-NEXT:     kernel_code_entry_byte_offset = 256
783; GCN-NEXT:     kernel_code_prefetch_byte_size = 0
784; GCN-NEXT:     granulated_workitem_vgpr_count = 4
785; GCN-NEXT:     granulated_wavefront_sgpr_count = 4
786; GCN-NEXT:     priority = 0
787; GCN-NEXT:     float_mode = 192
788; GCN-NEXT:     priv = 0
789; GCN-NEXT:     enable_dx10_clamp = 1
790; GCN-NEXT:     debug_mode = 0
791; GCN-NEXT:     enable_ieee_mode = 1
792; GCN-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
793; GCN-NEXT:     user_sgpr_count = 6
794; GCN-NEXT:     enable_trap_handler = 0
795; GCN-NEXT:     enable_sgpr_workgroup_id_x = 1
796; GCN-NEXT:     enable_sgpr_workgroup_id_y = 0
797; GCN-NEXT:     enable_sgpr_workgroup_id_z = 0
798; GCN-NEXT:     enable_sgpr_workgroup_info = 0
799; GCN-NEXT:     enable_vgpr_workitem_id = 0
800; GCN-NEXT:     enable_exception_msb = 0
801; GCN-NEXT:     granulated_lds_size = 0
802; GCN-NEXT:     enable_exception = 0
803; GCN-NEXT:     enable_sgpr_private_segment_buffer = 1
804; GCN-NEXT:     enable_sgpr_dispatch_ptr = 0
805; GCN-NEXT:     enable_sgpr_queue_ptr = 0
806; GCN-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
807; GCN-NEXT:     enable_sgpr_dispatch_id = 0
808; GCN-NEXT:     enable_sgpr_flat_scratch_init = 0
809; GCN-NEXT:     enable_sgpr_private_segment_size = 0
810; GCN-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
811; GCN-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
812; GCN-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
813; GCN-NEXT:     enable_ordered_append_gds = 0
814; GCN-NEXT:     private_element_size = 1
815; GCN-NEXT:     is_ptr64 = 1
816; GCN-NEXT:     is_dynamic_callstack = 0
817; GCN-NEXT:     is_debug_enabled = 0
818; GCN-NEXT:     is_xnack_enabled = 0
819; GCN-NEXT:     workitem_private_segment_byte_size = 0
820; GCN-NEXT:     workgroup_group_segment_byte_size = 0
821; GCN-NEXT:     gds_segment_byte_size = 0
822; GCN-NEXT:     kernarg_segment_byte_size = 64
823; GCN-NEXT:     workgroup_fbarrier_count = 0
824; GCN-NEXT:     wavefront_sgpr_count = 36
825; GCN-NEXT:     workitem_vgpr_count = 17
826; GCN-NEXT:     reserved_vgpr_first = 0
827; GCN-NEXT:     reserved_vgpr_count = 0
828; GCN-NEXT:     reserved_sgpr_first = 0
829; GCN-NEXT:     reserved_sgpr_count = 0
830; GCN-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
831; GCN-NEXT:     debug_private_segment_buffer_sgpr = 0
832; GCN-NEXT:     kernarg_segment_alignment = 5
833; GCN-NEXT:     group_segment_alignment = 4
834; GCN-NEXT:     private_segment_alignment = 4
835; GCN-NEXT:     wavefront_size = 6
836; GCN-NEXT:     call_convention = -1
837; GCN-NEXT:     runtime_loader_kernel_symbol = 0
838; GCN-NEXT:    .end_amd_kernel_code_t
839; GCN-NEXT:  ; %bb.0:
840; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
841; GCN-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x0
842; GCN-NEXT:    s_waitcnt lgkmcnt(0)
843; GCN-NEXT:    v_cmp_lt_u64_e64 s[16:17], s[8:9], 64
844; GCN-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[10:11], 0
845; GCN-NEXT:    s_lshr_b64 s[20:21], s[2:3], s8
846; GCN-NEXT:    s_lshr_b64 s[22:23], s[0:1], s8
847; GCN-NEXT:    s_sub_i32 s30, 64, s8
848; GCN-NEXT:    s_sub_i32 s31, s8, 64
849; GCN-NEXT:    s_sub_i32 s32, 64, s12
850; GCN-NEXT:    s_sub_i32 s33, s12, 64
851; GCN-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
852; GCN-NEXT:    v_cmp_lt_u64_e64 s[10:11], s[12:13], 64
853; GCN-NEXT:    v_cmp_eq_u64_e64 s[24:25], s[14:15], 0
854; GCN-NEXT:    s_lshr_b64 s[26:27], s[6:7], s12
855; GCN-NEXT:    s_lshr_b64 s[28:29], s[4:5], s12
856; GCN-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
857; GCN-NEXT:    v_mov_b32_e32 v0, s21
858; GCN-NEXT:    v_mov_b32_e32 v1, s20
859; GCN-NEXT:    s_and_b64 vcc, s[18:19], s[16:17]
860; GCN-NEXT:    v_mov_b32_e32 v8, 0
861; GCN-NEXT:    v_mov_b32_e32 v9, 0
862; GCN-NEXT:    v_mov_b32_e32 v10, 16
863; GCN-NEXT:    v_mov_b32_e32 v11, 0
864; GCN-NEXT:    v_mov_b32_e32 v4, s1
865; GCN-NEXT:    v_mov_b32_e32 v5, s0
866; GCN-NEXT:    v_mov_b32_e32 v12, s5
867; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v0, vcc
868; GCN-NEXT:    v_mov_b32_e32 v0, s27
869; GCN-NEXT:    s_and_b64 s[0:1], s[24:25], s[10:11]
870; GCN-NEXT:    s_lshl_b64 s[10:11], s[2:3], s30
871; GCN-NEXT:    s_lshr_b64 s[2:3], s[2:3], s31
872; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, v0, s[0:1]
873; GCN-NEXT:    v_mov_b32_e32 v0, s26
874; GCN-NEXT:    s_lshl_b64 s[14:15], s[6:7], s32
875; GCN-NEXT:    s_lshr_b64 s[6:7], s[6:7], s33
876; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v1, vcc
877; GCN-NEXT:    s_or_b64 s[10:11], s[22:23], s[10:11]
878; GCN-NEXT:    v_mov_b32_e32 v1, s3
879; GCN-NEXT:    v_mov_b32_e32 v13, s2
880; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, v0, s[0:1]
881; GCN-NEXT:    s_or_b64 s[2:3], s[28:29], s[14:15]
882; GCN-NEXT:    v_mov_b32_e32 v0, s7
883; GCN-NEXT:    v_mov_b32_e32 v14, s6
884; GCN-NEXT:    v_mov_b32_e32 v15, s11
885; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v15, vcc
886; GCN-NEXT:    v_mov_b32_e32 v15, s10
887; GCN-NEXT:    v_cndmask_b32_e32 v13, v13, v15, vcc
888; GCN-NEXT:    v_mov_b32_e32 v15, s3
889; GCN-NEXT:    v_cndmask_b32_e64 v15, v0, v15, s[0:1]
890; GCN-NEXT:    v_mov_b32_e32 v0, s2
891; GCN-NEXT:    v_cndmask_b32_e64 v14, v14, v0, s[0:1]
892; GCN-NEXT:    v_mov_b32_e32 v16, s4
893; GCN-NEXT:    v_cmp_eq_u64_e64 vcc, s[8:9], 0
894; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
895; GCN-NEXT:    v_cndmask_b32_e32 v0, v13, v5, vcc
896; GCN-NEXT:    v_cmp_eq_u64_e64 vcc, s[12:13], 0
897; GCN-NEXT:    v_cndmask_b32_e32 v5, v15, v12, vcc
898; GCN-NEXT:    v_cndmask_b32_e32 v4, v14, v16, vcc
899; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
900; GCN-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
901; GCN-NEXT:    s_endpgm
  ; IR under test: one vector lshr, whose result is written to
  ; addrspace(1) null. The expected output above shows this as two
  ; flat_store_dwordx4, at addresses 0 and 16.
902  %shift = lshr <2 x i128> %lhs, %rhs
903  store <2 x i128> %shift, <2 x i128> addrspace(1)* null
904  ret void
905}
906
; Codegen test: an amdgpu_kernel that arithmetically shifts a <2 x i128>
; vector right by a per-element <2 x i128> amount. Both operands are kernel
; arguments, and the result is stored to the null pointer in addrspace(1)
; (presumably only so the shift has a user and is not deleted as dead code).
; The check lines inside the function pin the full expected llc output: the
; amd_kernel_code_t header followed by the instruction sequence. The expected
; code applies `s_ashr_i32 ..., 31` to the top dword of each element (s3 and
; s7); NOTE(review): this looks like the sign-fill word selected when the
; shift amount is 64 or more -- confirm against the ISA documentation.
; Per the NOTE at the top of the file the check lines are autogenerated;
; regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.
907define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
908; GCN-LABEL: s_ashr_v2i128_ss:
909; GCN:         .amd_kernel_code_t
910; GCN-NEXT:     amd_code_version_major = 1
911; GCN-NEXT:     amd_code_version_minor = 2
912; GCN-NEXT:     amd_machine_kind = 1
913; GCN-NEXT:     amd_machine_version_major = 7
914; GCN-NEXT:     amd_machine_version_minor = 0
915; GCN-NEXT:     amd_machine_version_stepping = 0
916; GCN-NEXT:     kernel_code_entry_byte_offset = 256
917; GCN-NEXT:     kernel_code_prefetch_byte_size = 0
918; GCN-NEXT:     granulated_workitem_vgpr_count = 4
919; GCN-NEXT:     granulated_wavefront_sgpr_count = 4
920; GCN-NEXT:     priority = 0
921; GCN-NEXT:     float_mode = 192
922; GCN-NEXT:     priv = 0
923; GCN-NEXT:     enable_dx10_clamp = 1
924; GCN-NEXT:     debug_mode = 0
925; GCN-NEXT:     enable_ieee_mode = 1
926; GCN-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
927; GCN-NEXT:     user_sgpr_count = 6
928; GCN-NEXT:     enable_trap_handler = 0
929; GCN-NEXT:     enable_sgpr_workgroup_id_x = 1
930; GCN-NEXT:     enable_sgpr_workgroup_id_y = 0
931; GCN-NEXT:     enable_sgpr_workgroup_id_z = 0
932; GCN-NEXT:     enable_sgpr_workgroup_info = 0
933; GCN-NEXT:     enable_vgpr_workitem_id = 0
934; GCN-NEXT:     enable_exception_msb = 0
935; GCN-NEXT:     granulated_lds_size = 0
936; GCN-NEXT:     enable_exception = 0
937; GCN-NEXT:     enable_sgpr_private_segment_buffer = 1
938; GCN-NEXT:     enable_sgpr_dispatch_ptr = 0
939; GCN-NEXT:     enable_sgpr_queue_ptr = 0
940; GCN-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
941; GCN-NEXT:     enable_sgpr_dispatch_id = 0
942; GCN-NEXT:     enable_sgpr_flat_scratch_init = 0
943; GCN-NEXT:     enable_sgpr_private_segment_size = 0
944; GCN-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
945; GCN-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
946; GCN-NEXT:     enable_sgpr_grid_workgroup_count_z = 0
947; GCN-NEXT:     enable_ordered_append_gds = 0
948; GCN-NEXT:     private_element_size = 1
949; GCN-NEXT:     is_ptr64 = 1
950; GCN-NEXT:     is_dynamic_callstack = 0
951; GCN-NEXT:     is_debug_enabled = 0
952; GCN-NEXT:     is_xnack_enabled = 0
953; GCN-NEXT:     workitem_private_segment_byte_size = 0
954; GCN-NEXT:     workgroup_group_segment_byte_size = 0
955; GCN-NEXT:     gds_segment_byte_size = 0
956; GCN-NEXT:     kernarg_segment_byte_size = 64
957; GCN-NEXT:     workgroup_fbarrier_count = 0
958; GCN-NEXT:     wavefront_sgpr_count = 37
959; GCN-NEXT:     workitem_vgpr_count = 17
960; GCN-NEXT:     reserved_vgpr_first = 0
961; GCN-NEXT:     reserved_vgpr_count = 0
962; GCN-NEXT:     reserved_sgpr_first = 0
963; GCN-NEXT:     reserved_sgpr_count = 0
964; GCN-NEXT:     debug_wavefront_private_segment_offset_sgpr = 0
965; GCN-NEXT:     debug_private_segment_buffer_sgpr = 0
966; GCN-NEXT:     kernarg_segment_alignment = 5
967; GCN-NEXT:     group_segment_alignment = 4
968; GCN-NEXT:     private_segment_alignment = 4
969; GCN-NEXT:     wavefront_size = 6
970; GCN-NEXT:     call_convention = -1
971; GCN-NEXT:     runtime_loader_kernel_symbol = 0
972; GCN-NEXT:    .end_amd_kernel_code_t
973; GCN-NEXT:  ; %bb.0:
974; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
975; GCN-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x0
976; GCN-NEXT:    s_waitcnt lgkmcnt(0)
977; GCN-NEXT:    v_cmp_lt_u64_e64 s[16:17], s[8:9], 64
978; GCN-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[10:11], 0
979; GCN-NEXT:    s_ashr_i64 s[20:21], s[2:3], s8
980; GCN-NEXT:    s_ashr_i32 s30, s3, 31
981; GCN-NEXT:    s_lshr_b64 s[22:23], s[0:1], s8
982; GCN-NEXT:    s_sub_i32 s31, 64, s8
983; GCN-NEXT:    s_sub_i32 s32, s8, 64
984; GCN-NEXT:    s_sub_i32 s33, 64, s12
985; GCN-NEXT:    s_sub_i32 s34, s12, 64
986; GCN-NEXT:    s_or_b64 s[8:9], s[8:9], s[10:11]
987; GCN-NEXT:    v_cmp_lt_u64_e64 s[10:11], s[12:13], 64
988; GCN-NEXT:    v_cmp_eq_u64_e64 s[24:25], s[14:15], 0
989; GCN-NEXT:    s_ashr_i64 s[26:27], s[6:7], s12
990; GCN-NEXT:    s_lshr_b64 s[28:29], s[4:5], s12
991; GCN-NEXT:    s_or_b64 s[12:13], s[12:13], s[14:15]
992; GCN-NEXT:    v_mov_b32_e32 v0, s21
993; GCN-NEXT:    v_mov_b32_e32 v1, s20
994; GCN-NEXT:    v_mov_b32_e32 v2, s30
995; GCN-NEXT:    s_ashr_i32 s14, s7, 31
996; GCN-NEXT:    s_and_b64 vcc, s[18:19], s[16:17]
997; GCN-NEXT:    v_mov_b32_e32 v4, s14
998; GCN-NEXT:    v_mov_b32_e32 v8, 0
999; GCN-NEXT:    v_mov_b32_e32 v9, 0
1000; GCN-NEXT:    v_mov_b32_e32 v10, 16
1001; GCN-NEXT:    v_mov_b32_e32 v11, 0
1002; GCN-NEXT:    v_mov_b32_e32 v5, s1
1003; GCN-NEXT:    v_mov_b32_e32 v12, s0
1004; GCN-NEXT:    v_mov_b32_e32 v13, s5
1005; GCN-NEXT:    v_cndmask_b32_e32 v3, v2, v0, vcc
1006; GCN-NEXT:    v_mov_b32_e32 v0, s27
1007; GCN-NEXT:    s_and_b64 s[0:1], s[24:25], s[10:11]
1008; GCN-NEXT:    s_lshl_b64 s[10:11], s[2:3], s31
1009; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], s32
1010; GCN-NEXT:    v_cndmask_b32_e64 v7, v4, v0, s[0:1]
1011; GCN-NEXT:    v_mov_b32_e32 v0, s26
1012; GCN-NEXT:    s_lshl_b64 s[14:15], s[6:7], s33
1013; GCN-NEXT:    s_ashr_i64 s[6:7], s[6:7], s34
1014; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v1, vcc
1015; GCN-NEXT:    s_or_b64 s[10:11], s[22:23], s[10:11]
1016; GCN-NEXT:    v_mov_b32_e32 v1, s3
1017; GCN-NEXT:    v_mov_b32_e32 v14, s2
1018; GCN-NEXT:    v_cndmask_b32_e64 v6, v4, v0, s[0:1]
1019; GCN-NEXT:    s_or_b64 s[2:3], s[28:29], s[14:15]
1020; GCN-NEXT:    v_mov_b32_e32 v0, s7
1021; GCN-NEXT:    v_mov_b32_e32 v4, s6
1022; GCN-NEXT:    v_mov_b32_e32 v15, s11
1023; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v15, vcc
1024; GCN-NEXT:    v_mov_b32_e32 v15, s10
1025; GCN-NEXT:    v_cndmask_b32_e32 v14, v14, v15, vcc
1026; GCN-NEXT:    v_mov_b32_e32 v15, s3
1027; GCN-NEXT:    v_cndmask_b32_e64 v15, v0, v15, s[0:1]
1028; GCN-NEXT:    v_mov_b32_e32 v0, s2
1029; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v0, s[0:1]
1030; GCN-NEXT:    v_mov_b32_e32 v16, s4
1031; GCN-NEXT:    v_cmp_eq_u64_e64 vcc, s[8:9], 0
1032; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
1033; GCN-NEXT:    v_cndmask_b32_e32 v0, v14, v12, vcc
1034; GCN-NEXT:    v_cmp_eq_u64_e64 vcc, s[12:13], 0
1035; GCN-NEXT:    v_cndmask_b32_e32 v5, v15, v13, vcc
1036; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v16, vcc
1037; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
1038; GCN-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
1039; GCN-NEXT:    s_endpgm
  ; IR under test: one vector ashr, whose result is written to
  ; addrspace(1) null. The expected output above shows this as two
  ; flat_store_dwordx4, at addresses 0 and 16.
1040  %shift = ashr <2 x i128> %lhs, %rhs
1041  store <2 x i128> %shift, <2 x i128> addrspace(1)* null
1042  ret void
1043}
1044
1045