1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -amdgpu-bypass-slow-div=0 -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s
3; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -amdgpu-bypass-slow-div=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
4; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -amdgpu-bypass-slow-div=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
5
; sdivrem_i32: kernel that stores `sdiv i32 %x, %y` to %out0 and
; `srem i32 %x, %y` to %out1, i.e. the signed quotient and remainder of the
; same operand pair.
;
; The per-target check lines between the signature and the IR body are
; autogenerated (see the note on the first line of this file); regenerate
; them with utils/update_llc_test_checks.py instead of editing them by hand.
;
; NOTE(review): each expected sequence contains a single v_rcp_iflag_f32
; feeding both stores, so this presumably verifies that the division and the
; remainder share one expansion -- confirm against the GlobalISel legalizer.
6define amdgpu_kernel void @sdivrem_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) {
7; GFX8-LABEL: sdivrem_i32:
8; GFX8:       ; %bb.0:
9; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
10; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
11; GFX8-NEXT:    s_ashr_i32 s6, s1, 31
12; GFX8-NEXT:    s_add_i32 s1, s1, s6
13; GFX8-NEXT:    s_xor_b32 s7, s1, s6
14; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s7
15; GFX8-NEXT:    s_sub_i32 s1, 0, s7
16; GFX8-NEXT:    s_ashr_i32 s8, s0, 31
17; GFX8-NEXT:    s_add_i32 s0, s0, s8
18; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
19; GFX8-NEXT:    s_xor_b32 s9, s0, s8
20; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
21; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
22; GFX8-NEXT:    v_mul_lo_u32 v1, s1, v0
23; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
24; GFX8-NEXT:    s_xor_b32 s4, s8, s6
25; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
26; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
27; GFX8-NEXT:    v_mul_hi_u32 v2, s9, v0
28; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
29; GFX8-NEXT:    v_mov_b32_e32 v0, s0
30; GFX8-NEXT:    v_mov_b32_e32 v1, s1
31; GFX8-NEXT:    v_mul_lo_u32 v3, v2, s7
32; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
33; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s9, v3
34; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
35; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
36; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s7, v3
37; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
38; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
39; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
40; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
41; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s7, v3
42; GFX8-NEXT:    v_xor_b32_e32 v2, s4, v2
43; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
44; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s4, v2
45; GFX8-NEXT:    v_xor_b32_e32 v3, s8, v3
46; GFX8-NEXT:    flat_store_dword v[0:1], v2
47; GFX8-NEXT:    v_mov_b32_e32 v0, s2
48; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s8, v3
49; GFX8-NEXT:    v_mov_b32_e32 v1, s3
50; GFX8-NEXT:    flat_store_dword v[0:1], v3
51; GFX8-NEXT:    s_endpgm
52;
53; GFX9-LABEL: sdivrem_i32:
54; GFX9:       ; %bb.0:
55; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
56; GFX9-NEXT:    v_mov_b32_e32 v2, 0
57; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
58; GFX9-NEXT:    s_ashr_i32 s6, s1, 31
59; GFX9-NEXT:    s_add_i32 s1, s1, s6
60; GFX9-NEXT:    s_xor_b32 s7, s1, s6
61; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
62; GFX9-NEXT:    s_sub_i32 s1, 0, s7
63; GFX9-NEXT:    s_ashr_i32 s8, s0, 31
64; GFX9-NEXT:    s_add_i32 s0, s0, s8
65; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
66; GFX9-NEXT:    s_xor_b32 s9, s0, s8
67; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
68; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
69; GFX9-NEXT:    v_mul_lo_u32 v1, s1, v0
70; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
71; GFX9-NEXT:    s_xor_b32 s4, s8, s6
72; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
73; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
74; GFX9-NEXT:    v_mul_hi_u32 v0, s9, v0
75; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s7
76; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
77; GFX9-NEXT:    v_sub_u32_e32 v1, s9, v1
78; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
79; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
80; GFX9-NEXT:    v_subrev_u32_e32 v3, s7, v1
81; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
82; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
83; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
84; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
85; GFX9-NEXT:    v_subrev_u32_e32 v3, s7, v1
86; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
87; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
88; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
89; GFX9-NEXT:    v_xor_b32_e32 v1, s8, v1
90; GFX9-NEXT:    v_subrev_u32_e32 v1, s8, v1
91; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
92; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
93; GFX9-NEXT:    global_store_dword v2, v1, s[2:3]
94; GFX9-NEXT:    s_endpgm
95;
96; GFX10-LABEL: sdivrem_i32:
97; GFX10:       ; %bb.0:
98; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
99; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
100; GFX10-NEXT:    s_ashr_i32 s6, s1, 31
101; GFX10-NEXT:    s_ashr_i32 s8, s0, 31
102; GFX10-NEXT:    s_add_i32 s1, s1, s6
103; GFX10-NEXT:    s_add_i32 s0, s0, s8
104; GFX10-NEXT:    s_xor_b32 s7, s1, s6
105; GFX10-NEXT:    s_xor_b32 s0, s0, s8
106; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s7
107; GFX10-NEXT:    s_sub_i32 s1, 0, s7
108; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
109; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
110; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
111; GFX10-NEXT:    v_mul_lo_u32 v1, s1, v0
112; GFX10-NEXT:    v_mul_hi_u32 v1, v0, v1
113; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
114; GFX10-NEXT:    v_mul_hi_u32 v0, s0, v0
115; GFX10-NEXT:    v_mul_lo_u32 v1, v0, s7
116; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
117; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s0, v1
118; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
119; GFX10-NEXT:    s_xor_b32 s4, s8, s6
120; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s7, v1
121; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s7, v1
122; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
123; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
124; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
125; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s7, v1
126; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s7, v1
127; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
128; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
129; GFX10-NEXT:    v_mov_b32_e32 v2, 0
130; GFX10-NEXT:    v_xor_b32_e32 v0, s4, v0
131; GFX10-NEXT:    v_xor_b32_e32 v1, s8, v1
132; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, s4, v0
133; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, s8, v1
134; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
135; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
136; GFX10-NEXT:    global_store_dword v2, v1, s[2:3]
137; GFX10-NEXT:    s_endpgm
  ; Quotient and remainder are both taken from the same (%x, %y) operands;
  ; the quotient is stored through %out0 and the remainder through %out1.
138  %div = sdiv i32 %x, %y
139  store i32 %div, i32 addrspace(1)* %out0
140  %rem = srem i32 %x, %y
141  store i32 %rem, i32 addrspace(1)* %out1
142  ret void
143}
144
; sdivrem_i64: 64-bit counterpart of the i32 case. The kernel stores
; `sdiv i64 %x, %y` to %out0 and `srem i64 %x, %y` to %out1, i.e. the signed
; quotient and remainder of the same operand pair.
;
; The per-target check lines between the signature and the IR body are
; autogenerated (see the note on the first line of this file); regenerate
; them with utils/update_llc_test_checks.py instead of editing them by hand.
;
; NOTE(review): each expected sequence contains a single v_rcp_iflag_f32 and
; ends with two dwordx2 stores, so this presumably verifies that the 64-bit
; division and remainder share one expansion -- confirm against the
; GlobalISel legalizer.
145define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)* %out1, i64 %x, i64 %y) {
146; GFX8-LABEL: sdivrem_i64:
147; GFX8:       ; %bb.0:
148; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
149; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
150; GFX8-NEXT:    s_ashr_i32 s2, s9, 31
151; GFX8-NEXT:    s_ashr_i32 s12, s11, 31
152; GFX8-NEXT:    s_add_u32 s0, s8, s2
153; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
154; GFX8-NEXT:    s_and_b32 s1, s1, 1
155; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
156; GFX8-NEXT:    s_addc_u32 s1, s9, s2
157; GFX8-NEXT:    s_add_u32 s8, s10, s12
158; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
159; GFX8-NEXT:    s_and_b32 s3, s3, 1
160; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
161; GFX8-NEXT:    s_mov_b32 s13, s12
162; GFX8-NEXT:    s_addc_u32 s9, s11, s12
163; GFX8-NEXT:    s_xor_b64 s[8:9], s[8:9], s[12:13]
164; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s9
165; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s8
166; GFX8-NEXT:    s_mov_b32 s3, s2
167; GFX8-NEXT:    s_xor_b64 s[10:11], s[0:1], s[2:3]
168; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
169; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
170; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
171; GFX8-NEXT:    s_sub_u32 s14, 0, s8
172; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
173; GFX8-NEXT:    s_and_b32 s0, s0, 1
174; GFX8-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
175; GFX8-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
176; GFX8-NEXT:    v_trunc_f32_e32 v1, v1
177; GFX8-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v1
178; GFX8-NEXT:    v_add_f32_e32 v0, v2, v0
179; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
180; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
181; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
182; GFX8-NEXT:    s_subb_u32 s15, 0, s9
183; GFX8-NEXT:    v_mul_lo_u32 v2, s14, v1
184; GFX8-NEXT:    v_mul_lo_u32 v3, s15, v0
185; GFX8-NEXT:    v_mul_hi_u32 v5, s14, v0
186; GFX8-NEXT:    v_mul_lo_u32 v4, s14, v0
187; GFX8-NEXT:    v_mov_b32_e32 v6, s9
188; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
189; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
190; GFX8-NEXT:    v_mul_lo_u32 v3, v1, v4
191; GFX8-NEXT:    v_mul_lo_u32 v5, v0, v2
192; GFX8-NEXT:    v_mul_hi_u32 v7, v0, v4
193; GFX8-NEXT:    v_mul_hi_u32 v4, v1, v4
194; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
195; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
196; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v7
197; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
198; GFX8-NEXT:    v_mul_lo_u32 v7, v1, v2
199; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
200; GFX8-NEXT:    v_mul_hi_u32 v5, v0, v2
201; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v7, v4
202; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
203; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
204; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
205; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v7, v5
206; GFX8-NEXT:    v_mul_hi_u32 v2, v1, v2
207; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
208; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
209; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v5, v4
210; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
211; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
212; GFX8-NEXT:    v_addc_u32_e64 v3, s[0:1], v1, v2, vcc
213; GFX8-NEXT:    v_mul_lo_u32 v4, s15, v0
214; GFX8-NEXT:    v_mul_lo_u32 v5, s14, v3
215; GFX8-NEXT:    v_mul_hi_u32 v8, s14, v0
216; GFX8-NEXT:    v_mul_lo_u32 v7, s14, v0
217; GFX8-NEXT:    v_add_u32_e64 v1, s[0:1], v1, v2
218; GFX8-NEXT:    v_add_u32_e64 v4, s[0:1], v4, v5
219; GFX8-NEXT:    v_add_u32_e64 v4, s[0:1], v4, v8
220; GFX8-NEXT:    v_mul_lo_u32 v5, v3, v7
221; GFX8-NEXT:    v_mul_lo_u32 v8, v0, v4
222; GFX8-NEXT:    v_mul_hi_u32 v2, v0, v7
223; GFX8-NEXT:    v_mul_hi_u32 v7, v3, v7
224; GFX8-NEXT:    v_add_u32_e64 v5, s[0:1], v5, v8
225; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
226; GFX8-NEXT:    v_add_u32_e64 v2, s[0:1], v5, v2
227; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
228; GFX8-NEXT:    v_mul_lo_u32 v5, v3, v4
229; GFX8-NEXT:    v_add_u32_e64 v2, s[0:1], v8, v2
230; GFX8-NEXT:    v_mul_hi_u32 v8, v0, v4
231; GFX8-NEXT:    v_add_u32_e64 v5, s[0:1], v5, v7
232; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[0:1]
233; GFX8-NEXT:    v_add_u32_e64 v5, s[0:1], v5, v8
234; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
235; GFX8-NEXT:    v_add_u32_e64 v7, s[0:1], v7, v8
236; GFX8-NEXT:    v_mul_hi_u32 v3, v3, v4
237; GFX8-NEXT:    v_add_u32_e64 v2, s[0:1], v5, v2
238; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
239; GFX8-NEXT:    v_add_u32_e64 v4, s[0:1], v7, v5
240; GFX8-NEXT:    v_add_u32_e64 v3, s[0:1], v3, v4
241; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
242; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
243; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
244; GFX8-NEXT:    v_mul_lo_u32 v2, s11, v0
245; GFX8-NEXT:    v_mul_lo_u32 v3, s10, v1
246; GFX8-NEXT:    v_mul_hi_u32 v5, s10, v0
247; GFX8-NEXT:    v_mul_hi_u32 v0, s11, v0
248; GFX8-NEXT:    v_mov_b32_e32 v4, s11
249; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
250; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
251; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
252; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
253; GFX8-NEXT:    v_mul_lo_u32 v5, s11, v1
254; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
255; GFX8-NEXT:    v_mul_hi_u32 v3, s10, v1
256; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v5, v0
257; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
258; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
259; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
260; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
261; GFX8-NEXT:    v_mul_hi_u32 v1, s11, v1
262; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
263; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
264; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
265; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
266; GFX8-NEXT:    v_mul_lo_u32 v2, s9, v0
267; GFX8-NEXT:    v_mul_lo_u32 v3, s8, v1
268; GFX8-NEXT:    v_mul_hi_u32 v7, s8, v0
269; GFX8-NEXT:    v_mul_lo_u32 v5, s8, v0
270; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
271; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v7
272; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s10, v5
273; GFX8-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v2, vcc
274; GFX8-NEXT:    v_sub_u32_e64 v2, s[0:1], s11, v2
275; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v4
276; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
277; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v3
278; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
279; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v4
280; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v2, v6, vcc
281; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v7, s[0:1]
282; GFX8-NEXT:    v_subrev_u32_e32 v7, vcc, s8, v3
283; GFX8-NEXT:    v_subbrev_u32_e64 v8, s[0:1], 0, v2, vcc
284; GFX8-NEXT:    v_add_u32_e64 v9, s[0:1], 1, v0
285; GFX8-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v1, s[0:1]
286; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v8
287; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
288; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v7
289; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v2, v6, vcc
290; GFX8-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
291; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v8
292; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s8, v7
293; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[0:1]
294; GFX8-NEXT:    v_add_u32_e64 v12, s[0:1], 1, v9
295; GFX8-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
296; GFX8-NEXT:    v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1]
297; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
298; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v12, vcc
299; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
300; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
301; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc
302; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
303; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[0:1]
304; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s[0:1]
305; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
306; GFX8-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[0:1]
307; GFX8-NEXT:    s_xor_b64 s[0:1], s[2:3], s[12:13]
308; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v0
309; GFX8-NEXT:    v_xor_b32_e32 v1, s1, v1
310; GFX8-NEXT:    v_mov_b32_e32 v4, s1
311; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
312; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
313; GFX8-NEXT:    v_xor_b32_e32 v3, s2, v3
314; GFX8-NEXT:    v_xor_b32_e32 v4, s2, v2
315; GFX8-NEXT:    v_mov_b32_e32 v5, s2
316; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s2, v3
317; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v4, v5, vcc
318; GFX8-NEXT:    v_mov_b32_e32 v4, s4
319; GFX8-NEXT:    v_mov_b32_e32 v5, s5
320; GFX8-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
321; GFX8-NEXT:    v_mov_b32_e32 v0, s6
322; GFX8-NEXT:    v_mov_b32_e32 v1, s7
323; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
324; GFX8-NEXT:    s_endpgm
325;
326; GFX9-LABEL: sdivrem_i64:
327; GFX9:       ; %bb.0:
328; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
329; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
330; GFX9-NEXT:    s_ashr_i32 s2, s9, 31
331; GFX9-NEXT:    s_ashr_i32 s12, s11, 31
332; GFX9-NEXT:    s_add_u32 s0, s8, s2
333; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
334; GFX9-NEXT:    s_and_b32 s1, s1, 1
335; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
336; GFX9-NEXT:    s_addc_u32 s1, s9, s2
337; GFX9-NEXT:    s_add_u32 s8, s10, s12
338; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
339; GFX9-NEXT:    s_and_b32 s3, s3, 1
340; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
341; GFX9-NEXT:    s_mov_b32 s13, s12
342; GFX9-NEXT:    s_addc_u32 s9, s11, s12
343; GFX9-NEXT:    s_xor_b64 s[8:9], s[8:9], s[12:13]
344; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s9
345; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s8
346; GFX9-NEXT:    s_mov_b32 s3, s2
347; GFX9-NEXT:    s_xor_b64 s[10:11], s[0:1], s[2:3]
348; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
349; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
350; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
351; GFX9-NEXT:    s_sub_u32 s14, 0, s8
352; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
353; GFX9-NEXT:    s_and_b32 s0, s0, 1
354; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
355; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
356; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
357; GFX9-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v1
358; GFX9-NEXT:    v_add_f32_e32 v0, v2, v0
359; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
360; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
361; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
362; GFX9-NEXT:    s_subb_u32 s15, 0, s9
363; GFX9-NEXT:    v_mul_lo_u32 v2, s14, v1
364; GFX9-NEXT:    v_mul_lo_u32 v3, s15, v0
365; GFX9-NEXT:    v_mul_hi_u32 v4, s14, v0
366; GFX9-NEXT:    v_mul_lo_u32 v5, s14, v0
367; GFX9-NEXT:    v_mov_b32_e32 v8, s11
368; GFX9-NEXT:    v_add3_u32 v2, v3, v2, v4
369; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v5
370; GFX9-NEXT:    v_mul_lo_u32 v4, v0, v2
371; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v5
372; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v5
373; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
374; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
375; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
376; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
377; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v2
378; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
379; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v2
380; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v2
381; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
382; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
383; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
384; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
385; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
386; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
387; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
388; GFX9-NEXT:    v_add3_u32 v2, v5, v4, v2
389; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
390; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[0:1], v1, v2, vcc
391; GFX9-NEXT:    v_mul_lo_u32 v4, s15, v0
392; GFX9-NEXT:    v_mul_lo_u32 v5, s14, v3
393; GFX9-NEXT:    v_mul_hi_u32 v6, s14, v0
394; GFX9-NEXT:    v_mul_lo_u32 v7, s14, v0
395; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
396; GFX9-NEXT:    v_add3_u32 v4, v4, v5, v6
397; GFX9-NEXT:    v_mul_lo_u32 v5, v3, v7
398; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v4
399; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v7
400; GFX9-NEXT:    v_mul_hi_u32 v7, v3, v7
401; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], v5, v6
402; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
403; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], v5, v2
404; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
405; GFX9-NEXT:    v_mul_lo_u32 v5, v3, v4
406; GFX9-NEXT:    v_add_u32_e32 v2, v6, v2
407; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v4
408; GFX9-NEXT:    v_mul_hi_u32 v3, v3, v4
409; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], v5, v7
410; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[0:1]
411; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], v5, v6
412; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
413; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], v5, v2
414; GFX9-NEXT:    v_add_u32_e32 v6, v7, v6
415; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
416; GFX9-NEXT:    v_add3_u32 v3, v6, v4, v3
417; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
418; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
419; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
420; GFX9-NEXT:    v_mul_lo_u32 v2, s11, v0
421; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v1
422; GFX9-NEXT:    v_mul_hi_u32 v5, s10, v0
423; GFX9-NEXT:    v_mul_hi_u32 v0, s11, v0
424; GFX9-NEXT:    v_mov_b32_e32 v4, s9
425; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
426; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
427; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
428; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
429; GFX9-NEXT:    v_mul_lo_u32 v5, s11, v1
430; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
431; GFX9-NEXT:    v_mul_hi_u32 v3, s10, v1
432; GFX9-NEXT:    v_mul_hi_u32 v1, s11, v1
433; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v5, v0
434; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
435; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
436; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
437; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
438; GFX9-NEXT:    v_add_u32_e32 v3, v5, v3
439; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
440; GFX9-NEXT:    v_add3_u32 v1, v3, v2, v1
441; GFX9-NEXT:    v_mul_lo_u32 v2, s9, v0
442; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v1
443; GFX9-NEXT:    v_mul_hi_u32 v5, s8, v0
444; GFX9-NEXT:    v_mul_lo_u32 v7, s8, v0
445; GFX9-NEXT:    v_mov_b32_e32 v6, 0
446; GFX9-NEXT:    v_add3_u32 v2, v2, v3, v5
447; GFX9-NEXT:    v_sub_co_u32_e32 v3, vcc, s10, v7
448; GFX9-NEXT:    v_subb_co_u32_e64 v5, s[0:1], v8, v2, vcc
449; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v5
450; GFX9-NEXT:    v_sub_u32_e32 v2, s11, v2
451; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
452; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v3
453; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
454; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v5
455; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v4, vcc
456; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[0:1]
457; GFX9-NEXT:    v_subrev_co_u32_e32 v8, vcc, s8, v3
458; GFX9-NEXT:    v_subbrev_co_u32_e64 v9, s[0:1], 0, v2, vcc
459; GFX9-NEXT:    v_add_co_u32_e64 v10, s[0:1], 1, v0
460; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[0:1], 0, v1, s[0:1]
461; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v9
462; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
463; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v8
464; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v4, vcc
465; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[0:1]
466; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v9
467; GFX9-NEXT:    v_subrev_co_u32_e32 v4, vcc, s8, v8
468; GFX9-NEXT:    v_cndmask_b32_e64 v12, v12, v13, s[0:1]
469; GFX9-NEXT:    v_add_co_u32_e64 v13, s[0:1], 1, v10
470; GFX9-NEXT:    v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc
471; GFX9-NEXT:    v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1]
472; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
473; GFX9-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
474; GFX9-NEXT:    v_cndmask_b32_e32 v11, v11, v14, vcc
475; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
476; GFX9-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
477; GFX9-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
478; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v10, s[0:1]
479; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v11, s[0:1]
480; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
481; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
482; GFX9-NEXT:    s_xor_b64 s[0:1], s[2:3], s[12:13]
483; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
484; GFX9-NEXT:    v_xor_b32_e32 v1, s1, v1
485; GFX9-NEXT:    v_mov_b32_e32 v4, s1
486; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
487; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v4, vcc
488; GFX9-NEXT:    v_xor_b32_e32 v3, s2, v3
489; GFX9-NEXT:    v_xor_b32_e32 v4, s2, v2
490; GFX9-NEXT:    v_mov_b32_e32 v5, s2
491; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s2, v3
492; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v4, v5, vcc
493; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[4:5]
494; GFX9-NEXT:    global_store_dwordx2 v6, v[2:3], s[6:7]
495; GFX9-NEXT:    s_endpgm
496;
497; GFX10-LABEL: sdivrem_i64:
498; GFX10:       ; %bb.0:
499; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
500; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
501; GFX10-NEXT:    s_ashr_i32 s2, s9, 31
502; GFX10-NEXT:    s_ashr_i32 s12, s11, 31
503; GFX10-NEXT:    s_add_u32 s0, s8, s2
504; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
505; GFX10-NEXT:    s_mov_b32 s13, s12
506; GFX10-NEXT:    s_and_b32 s1, s1, 1
507; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
508; GFX10-NEXT:    s_addc_u32 s1, s9, s2
509; GFX10-NEXT:    s_add_u32 s8, s10, s12
510; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
511; GFX10-NEXT:    s_and_b32 s3, s3, 1
512; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
513; GFX10-NEXT:    s_mov_b32 s3, s2
514; GFX10-NEXT:    s_addc_u32 s9, s11, s12
515; GFX10-NEXT:    s_xor_b64 s[10:11], s[0:1], s[2:3]
516; GFX10-NEXT:    s_xor_b64 s[8:9], s[8:9], s[12:13]
517; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s9
518; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s8
519; GFX10-NEXT:    s_sub_u32 s1, 0, s8
520; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
521; GFX10-NEXT:    s_and_b32 s0, s0, 1
522; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
523; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
524; GFX10-NEXT:    s_subb_u32 s14, 0, s9
525; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
526; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
527; GFX10-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
528; GFX10-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
529; GFX10-NEXT:    v_trunc_f32_e32 v1, v1
530; GFX10-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v1
531; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
532; GFX10-NEXT:    v_add_f32_e32 v0, v2, v0
533; GFX10-NEXT:    v_mul_lo_u32 v2, s1, v1
534; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
535; GFX10-NEXT:    v_mul_lo_u32 v3, s14, v0
536; GFX10-NEXT:    v_mul_hi_u32 v4, s1, v0
537; GFX10-NEXT:    v_mul_lo_u32 v5, s1, v0
538; GFX10-NEXT:    v_add3_u32 v2, v3, v2, v4
539; GFX10-NEXT:    v_mul_lo_u32 v3, v1, v5
540; GFX10-NEXT:    v_mul_hi_u32 v6, v1, v5
541; GFX10-NEXT:    v_mul_hi_u32 v5, v0, v5
542; GFX10-NEXT:    v_mul_lo_u32 v4, v0, v2
543; GFX10-NEXT:    v_mul_lo_u32 v7, v1, v2
544; GFX10-NEXT:    v_mul_hi_u32 v8, v0, v2
545; GFX10-NEXT:    v_mul_hi_u32 v2, v1, v2
546; GFX10-NEXT:    v_add_co_u32 v3, s0, v3, v4
547; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
548; GFX10-NEXT:    v_add_co_u32 v6, s0, v7, v6
549; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
550; GFX10-NEXT:    v_add_co_u32 v3, s0, v3, v5
551; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
552; GFX10-NEXT:    v_add_co_u32 v5, s0, v6, v8
553; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
554; GFX10-NEXT:    v_add_nc_u32_e32 v3, v4, v3
555; GFX10-NEXT:    v_add_nc_u32_e32 v4, v7, v6
556; GFX10-NEXT:    v_add_co_u32 v3, s0, v5, v3
557; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
558; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v3
559; GFX10-NEXT:    v_add3_u32 v2, v4, v5, v2
560; GFX10-NEXT:    v_mul_lo_u32 v4, s14, v0
561; GFX10-NEXT:    v_mul_hi_u32 v5, s1, v0
562; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v1, v2, vcc_lo
563; GFX10-NEXT:    v_mul_lo_u32 v7, s1, v0
564; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v2
565; GFX10-NEXT:    v_mul_lo_u32 v6, s1, v3
566; GFX10-NEXT:    v_mul_hi_u32 v8, v3, v7
567; GFX10-NEXT:    v_add3_u32 v4, v4, v6, v5
568; GFX10-NEXT:    v_mul_lo_u32 v5, v3, v7
569; GFX10-NEXT:    v_mul_hi_u32 v7, v0, v7
570; GFX10-NEXT:    v_mul_lo_u32 v6, v0, v4
571; GFX10-NEXT:    v_mul_lo_u32 v9, v3, v4
572; GFX10-NEXT:    v_mul_hi_u32 v10, v0, v4
573; GFX10-NEXT:    v_mul_hi_u32 v3, v3, v4
574; GFX10-NEXT:    v_add_co_u32 v5, s0, v5, v6
575; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
576; GFX10-NEXT:    v_add_co_u32 v8, s0, v9, v8
577; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s0
578; GFX10-NEXT:    v_add_co_u32 v5, s0, v5, v7
579; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
580; GFX10-NEXT:    v_add_co_u32 v7, s0, v8, v10
581; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s0
582; GFX10-NEXT:    v_add_nc_u32_e32 v5, v6, v5
583; GFX10-NEXT:    v_add_nc_u32_e32 v4, v9, v8
584; GFX10-NEXT:    v_add_co_u32 v5, s0, v7, v5
585; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
586; GFX10-NEXT:    v_add3_u32 v2, v4, v6, v3
587; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
588; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v5
589; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
590; GFX10-NEXT:    v_mul_lo_u32 v2, s11, v0
591; GFX10-NEXT:    v_mul_hi_u32 v4, s11, v0
592; GFX10-NEXT:    v_mul_hi_u32 v0, s10, v0
593; GFX10-NEXT:    v_mul_lo_u32 v3, s10, v1
594; GFX10-NEXT:    v_mul_lo_u32 v5, s11, v1
595; GFX10-NEXT:    v_mul_hi_u32 v6, s10, v1
596; GFX10-NEXT:    v_mul_hi_u32 v1, s11, v1
597; GFX10-NEXT:    v_add_co_u32 v2, s0, v2, v3
598; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
599; GFX10-NEXT:    v_add_co_u32 v4, s0, v5, v4
600; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
601; GFX10-NEXT:    v_add_co_u32 v0, s0, v2, v0
602; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
603; GFX10-NEXT:    v_add_co_u32 v2, s0, v4, v6
604; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
605; GFX10-NEXT:    v_add_nc_u32_e32 v0, v3, v0
606; GFX10-NEXT:    v_add_nc_u32_e32 v3, v5, v4
607; GFX10-NEXT:    v_add_co_u32 v0, s0, v2, v0
608; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
609; GFX10-NEXT:    v_mul_lo_u32 v5, s8, v0
610; GFX10-NEXT:    v_add3_u32 v1, v3, v2, v1
611; GFX10-NEXT:    v_mul_lo_u32 v2, s9, v0
612; GFX10-NEXT:    v_mul_hi_u32 v3, s8, v0
613; GFX10-NEXT:    v_mul_lo_u32 v4, s8, v1
614; GFX10-NEXT:    v_add3_u32 v2, v2, v4, v3
615; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, v0, 1
616; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
617; GFX10-NEXT:    v_sub_nc_u32_e32 v6, s11, v2
618; GFX10-NEXT:    v_sub_co_u32 v5, vcc_lo, s10, v5
619; GFX10-NEXT:    v_sub_co_ci_u32_e64 v2, s0, s11, v2, vcc_lo
620; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v6, vcc_lo, s9, v6, vcc_lo
621; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s8, v5
622; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc_lo
623; GFX10-NEXT:    v_sub_co_u32 v8, vcc_lo, v5, s8
624; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v9, s0, 0, v6, vcc_lo
625; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s9, v2
626; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v6, vcc_lo, s9, v6, vcc_lo
627; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s0
628; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v8
629; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s0
630; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s9, v9
631; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s0
632; GFX10-NEXT:    v_add_co_u32 v13, s0, v3, 1
633; GFX10-NEXT:    v_add_co_ci_u32_e64 v14, s0, 0, v4, s0
634; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v9
635; GFX10-NEXT:    v_cndmask_b32_e64 v11, v12, v11, s0
636; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v2
637; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
638; GFX10-NEXT:    v_cndmask_b32_e64 v7, v10, v7, s0
639; GFX10-NEXT:    v_sub_co_u32 v10, s0, v8, s8
640; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v6, s0, 0, v6, s0
641; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v13, vcc_lo
642; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v7
643; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v14, vcc_lo
644; GFX10-NEXT:    v_cndmask_b32_e32 v7, v8, v10, vcc_lo
645; GFX10-NEXT:    v_cndmask_b32_e32 v6, v9, v6, vcc_lo
646; GFX10-NEXT:    s_xor_b64 s[8:9], s[2:3], s[12:13]
647; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s0
648; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s0
649; GFX10-NEXT:    v_cndmask_b32_e64 v3, v5, v7, s0
650; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
651; GFX10-NEXT:    v_mov_b32_e32 v4, 0
652; GFX10-NEXT:    v_xor_b32_e32 v0, s8, v0
653; GFX10-NEXT:    v_xor_b32_e32 v1, s9, v1
654; GFX10-NEXT:    v_xor_b32_e32 v3, s2, v3
655; GFX10-NEXT:    v_xor_b32_e32 v5, s2, v2
656; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, s8
657; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s9, v1, vcc_lo
658; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v3, s2
659; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, s2, v5, vcc_lo
660; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
661; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7]
662; GFX10-NEXT:    s_endpgm
  ; Quotient and remainder are both taken from the same (%x, %y) operands;
  ; the quotient is stored through %out0 and the remainder through %out1.
663  %div = sdiv i64 %x, %y
664  store i64 %div, i64 addrspace(1)* %out0
665  %rem = srem i64 %x, %y
666  store i64 %rem, i64 addrspace(1)* %out1
667  ret void
668}
669
; sdivrem_v2i32: signed division and signed remainder of the same <2 x i32>
; operands inside one kernel. The quotient is stored through %out0 and the
; remainder through %out1.
;
; What the expected output below shows:
;  * every target contains exactly two v_rcp_iflag_f32 instructions, one per
;    vector lane, so the sdiv and srem of a lane appear to share a single
;    reciprocal-based expansion rather than being expanded twice;
;  * operands are made non-negative first (s_ashr_i32 by 31, add, xor) and the
;    sign is re-applied at the end with an xor/subtract pair: the quotient uses
;    the xor of both operand signs, the remainder uses the dividend's sign.
;
; The check lines are autogenerated by utils/update_llc_test_checks.py (see the
; first line of this file). Regenerate them with that script instead of editing
; them by hand.
670define amdgpu_kernel void @sdivrem_v2i32(<2 x i32> addrspace(1)* %out0, <2 x i32> addrspace(1)* %out1, <2 x i32> %x, <2 x i32> %y) {
671; GFX8-LABEL: sdivrem_v2i32:
672; GFX8:       ; %bb.0:
673; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x18
674; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x10
675; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
676; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
677; GFX8-NEXT:    s_ashr_i32 s8, s0, 31
678; GFX8-NEXT:    s_add_i32 s0, s0, s8
679; GFX8-NEXT:    s_xor_b32 s9, s0, s8
680; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s9
681; GFX8-NEXT:    s_ashr_i32 s11, s1, 31
682; GFX8-NEXT:    s_add_i32 s0, s1, s11
683; GFX8-NEXT:    s_sub_i32 s1, 0, s9
684; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
685; GFX8-NEXT:    s_xor_b32 s12, s0, s11
686; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, s12
687; GFX8-NEXT:    s_ashr_i32 s10, s2, 31
688; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
689; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
690; GFX8-NEXT:    s_add_i32 s0, s2, s10
691; GFX8-NEXT:    s_xor_b32 s0, s0, s10
692; GFX8-NEXT:    v_rcp_iflag_f32_e32 v2, v2
693; GFX8-NEXT:    v_mul_lo_u32 v1, s1, v0
694; GFX8-NEXT:    s_ashr_i32 s2, s3, 31
695; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
696; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
697; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
698; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
699; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
700; GFX8-NEXT:    v_mul_lo_u32 v2, v0, s9
701; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v0
702; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v2
703; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
704; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
705; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s9, v2
706; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
707; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v0
708; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
709; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
710; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s9, v2
711; GFX8-NEXT:    s_sub_i32 s0, 0, s12
712; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
713; GFX8-NEXT:    v_mul_lo_u32 v3, s0, v1
714; GFX8-NEXT:    s_add_i32 s1, s3, s2
715; GFX8-NEXT:    s_xor_b32 s1, s1, s2
716; GFX8-NEXT:    s_xor_b32 s0, s10, s8
717; GFX8-NEXT:    v_mul_hi_u32 v3, v1, v3
718; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v0
719; GFX8-NEXT:    v_xor_b32_e32 v2, s10, v2
720; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
721; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
722; GFX8-NEXT:    v_mul_hi_u32 v1, s1, v1
723; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s10, v2
724; GFX8-NEXT:    v_mul_lo_u32 v3, v1, s12
725; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
726; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s1, v3
727; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
728; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
729; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s12, v3
730; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
731; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
732; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
733; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
734; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s12, v3
735; GFX8-NEXT:    s_xor_b32 s0, s2, s11
736; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
737; GFX8-NEXT:    v_xor_b32_e32 v1, s0, v1
738; GFX8-NEXT:    v_mov_b32_e32 v4, s4
739; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, s0, v1
740; GFX8-NEXT:    v_mov_b32_e32 v5, s5
741; GFX8-NEXT:    v_xor_b32_e32 v3, s2, v3
742; GFX8-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
743; GFX8-NEXT:    v_mov_b32_e32 v0, s6
744; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s2, v3
745; GFX8-NEXT:    v_mov_b32_e32 v1, s7
746; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
747; GFX8-NEXT:    s_endpgm
748;
749; GFX9-LABEL: sdivrem_v2i32:
750; GFX9:       ; %bb.0:
751; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x18
752; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
753; GFX9-NEXT:    s_ashr_i32 s10, s6, 31
754; GFX9-NEXT:    s_add_i32 s0, s6, s10
755; GFX9-NEXT:    s_xor_b32 s6, s0, s10
756; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s6
757; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
758; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x10
759; GFX9-NEXT:    s_ashr_i32 s5, s7, 31
760; GFX9-NEXT:    s_add_i32 s7, s7, s5
761; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
762; GFX9-NEXT:    s_xor_b32 s7, s7, s5
763; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s7
764; GFX9-NEXT:    s_sub_i32 s11, 0, s6
765; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
766; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
767; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
768; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
769; GFX9-NEXT:    s_ashr_i32 s4, s8, 31
770; GFX9-NEXT:    s_add_i32 s8, s8, s4
771; GFX9-NEXT:    v_mul_lo_u32 v2, s11, v0
772; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
773; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
774; GFX9-NEXT:    s_xor_b32 s8, s8, s4
775; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
776; GFX9-NEXT:    s_sub_i32 s12, 0, s7
777; GFX9-NEXT:    s_ashr_i32 s11, s9, 31
778; GFX9-NEXT:    s_add_i32 s9, s9, s11
779; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
780; GFX9-NEXT:    v_mul_hi_u32 v0, s8, v0
781; GFX9-NEXT:    v_mul_lo_u32 v2, s12, v1
782; GFX9-NEXT:    s_xor_b32 s9, s9, s11
783; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s6
784; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v2
785; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
786; GFX9-NEXT:    v_sub_u32_e32 v3, s8, v3
787; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
788; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
789; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
790; GFX9-NEXT:    v_subrev_u32_e32 v4, s6, v3
791; GFX9-NEXT:    v_mul_hi_u32 v1, s9, v1
792; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
793; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
794; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
795; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
796; GFX9-NEXT:    v_subrev_u32_e32 v4, s6, v3
797; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
798; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s7
799; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
800; GFX9-NEXT:    v_xor_b32_e32 v2, s4, v2
801; GFX9-NEXT:    s_xor_b32 s6, s4, s10
802; GFX9-NEXT:    v_sub_u32_e32 v3, s9, v3
803; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
804; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
805; GFX9-NEXT:    v_subrev_u32_e32 v4, s7, v3
806; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
807; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
808; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
809; GFX9-NEXT:    v_subrev_u32_e32 v2, s4, v2
810; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
811; GFX9-NEXT:    v_subrev_u32_e32 v4, s7, v3
812; GFX9-NEXT:    s_xor_b32 s4, s11, s5
813; GFX9-NEXT:    v_xor_b32_e32 v0, s6, v0
814; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
815; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v1
816; GFX9-NEXT:    v_subrev_u32_e32 v0, s6, v0
817; GFX9-NEXT:    v_subrev_u32_e32 v1, s4, v1
818; GFX9-NEXT:    v_xor_b32_e32 v3, s11, v3
819; GFX9-NEXT:    v_mov_b32_e32 v4, 0
820; GFX9-NEXT:    v_subrev_u32_e32 v3, s11, v3
821; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
822; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3]
823; GFX9-NEXT:    s_endpgm
824;
825; GFX10-LABEL: sdivrem_v2i32:
826; GFX10:       ; %bb.0:
827; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x18
828; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
829; GFX10-NEXT:    s_ashr_i32 s2, s0, 31
830; GFX10-NEXT:    s_ashr_i32 s3, s1, 31
831; GFX10-NEXT:    s_add_i32 s0, s0, s2
832; GFX10-NEXT:    s_add_i32 s1, s1, s3
833; GFX10-NEXT:    s_xor_b32 s8, s0, s2
834; GFX10-NEXT:    s_xor_b32 s9, s1, s3
835; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s8
836; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s9
837; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
838; GFX10-NEXT:    s_sub_i32 s6, 0, s8
839; GFX10-NEXT:    s_sub_i32 s7, 0, s9
840; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
841; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
842; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
843; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
844; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
845; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
846; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
847; GFX10-NEXT:    s_ashr_i32 s10, s0, 31
848; GFX10-NEXT:    s_ashr_i32 s11, s1, 31
849; GFX10-NEXT:    s_add_i32 s0, s0, s10
850; GFX10-NEXT:    v_mul_lo_u32 v2, s6, v0
851; GFX10-NEXT:    v_mul_lo_u32 v3, s7, v1
852; GFX10-NEXT:    s_add_i32 s1, s1, s11
853; GFX10-NEXT:    s_xor_b32 s0, s0, s10
854; GFX10-NEXT:    s_xor_b32 s1, s1, s11
855; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
856; GFX10-NEXT:    v_mul_hi_u32 v2, v0, v2
857; GFX10-NEXT:    v_mul_hi_u32 v3, v1, v3
858; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
859; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v3
860; GFX10-NEXT:    v_mul_hi_u32 v0, s0, v0
861; GFX10-NEXT:    v_mul_hi_u32 v1, s1, v1
862; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s8
863; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s9
864; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
865; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
866; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s0, v2
867; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s1, v3
868; GFX10-NEXT:    s_xor_b32 s1, s10, s2
869; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s8, v2
870; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s9, v3
871; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v2
872; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s9, v3
873; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
874; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
875; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
876; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
877; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
878; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
879; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v2
880; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s9, v3
881; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s8, v2
882; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s9, v3
883; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
884; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
885; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
886; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
887; GFX10-NEXT:    s_xor_b32 s0, s11, s3
888; GFX10-NEXT:    v_xor_b32_e32 v0, s1, v0
889; GFX10-NEXT:    v_xor_b32_e32 v1, s0, v1
890; GFX10-NEXT:    v_xor_b32_e32 v2, s10, v2
891; GFX10-NEXT:    v_xor_b32_e32 v3, s11, v3
892; GFX10-NEXT:    v_mov_b32_e32 v4, 0
893; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, s1, v0
894; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, s0, v1
895; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, s10, v2
896; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s11, v3
897; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
898; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
899; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7]
900; GFX10-NEXT:    s_endpgm
  ; The sdiv and srem below take identical operands (%x, %y); each result is
  ; stored to its own output pointer.
901  %div = sdiv <2 x i32> %x, %y
902  store <2 x i32> %div, <2 x i32> addrspace(1)* %out0
903  %rem = srem <2 x i32> %x, %y
904  store <2 x i32> %rem, <2 x i32> addrspace(1)* %out1
905  ret void
906}
907
; sdivrem_v4i32: signed division and signed remainder of the same <4 x i32>
; operands inside one kernel. The quotient is stored through %out0 and the
; remainder through %out1.
;
; What the expected output below shows:
;  * every target contains exactly four v_rcp_iflag_f32 instructions, one per
;    vector lane, so the sdiv and srem of a lane appear to share a single
;    reciprocal-based expansion rather than being expanded twice;
;  * the scale constant 0x4f7ffffe is materialised once into a VGPR
;    (v_mov_b32) and reused by later lanes, while the first lane still uses it
;    as an inline literal;
;  * each result vector is written with a single dwordx4 store.
;
; The check lines are autogenerated by utils/update_llc_test_checks.py (see the
; first line of this file). Regenerate them with that script instead of editing
; them by hand.
908define amdgpu_kernel void @sdivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, <4 x i32> %x, <4 x i32> %y) {
909; GFX8-LABEL: sdivrem_v4i32:
910; GFX8:       ; %bb.0:
911; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x20
912; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x0
913; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
914; GFX8-NEXT:    v_mov_b32_e32 v3, 0x4f7ffffe
915; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
916; GFX8-NEXT:    s_ashr_i32 s12, s0, 31
917; GFX8-NEXT:    s_add_i32 s0, s0, s12
918; GFX8-NEXT:    s_xor_b32 s13, s0, s12
919; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s13
920; GFX8-NEXT:    s_ashr_i32 s15, s1, 31
921; GFX8-NEXT:    s_add_i32 s0, s1, s15
922; GFX8-NEXT:    s_sub_i32 s1, 0, s13
923; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
924; GFX8-NEXT:    s_xor_b32 s16, s0, s15
925; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, s16
926; GFX8-NEXT:    s_ashr_i32 s14, s4, 31
927; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
928; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
929; GFX8-NEXT:    s_add_i32 s0, s4, s14
930; GFX8-NEXT:    s_xor_b32 s0, s0, s14
931; GFX8-NEXT:    v_rcp_iflag_f32_e32 v2, v2
932; GFX8-NEXT:    v_mul_lo_u32 v1, s1, v0
933; GFX8-NEXT:    s_ashr_i32 s4, s5, 31
934; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
935; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
936; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
937; GFX8-NEXT:    v_mul_f32_e32 v1, v2, v3
938; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
939; GFX8-NEXT:    v_mul_lo_u32 v2, v0, s13
940; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v0
941; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v2
942; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
943; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
944; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s13, v2
945; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
946; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v0
947; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
948; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
949; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s13, v2
950; GFX8-NEXT:    s_sub_i32 s0, 0, s16
951; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
952; GFX8-NEXT:    v_mul_lo_u32 v4, s0, v1
953; GFX8-NEXT:    s_add_i32 s1, s5, s4
954; GFX8-NEXT:    s_xor_b32 s1, s1, s4
955; GFX8-NEXT:    s_xor_b32 s0, s14, s12
956; GFX8-NEXT:    v_mul_hi_u32 v4, v1, v4
957; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v0
958; GFX8-NEXT:    v_xor_b32_e32 v2, s14, v2
959; GFX8-NEXT:    s_ashr_i32 s5, s2, 31
960; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v4
961; GFX8-NEXT:    v_mul_hi_u32 v1, s1, v1
962; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
963; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s14, v2
964; GFX8-NEXT:    v_mul_lo_u32 v5, v1, s16
965; GFX8-NEXT:    s_add_i32 s0, s2, s5
966; GFX8-NEXT:    s_xor_b32 s2, s0, s5
967; GFX8-NEXT:    s_ashr_i32 s12, s6, 31
968; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s1, v5
969; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 1, v1
970; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s16, v2
971; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
972; GFX8-NEXT:    v_cvt_f32_u32_e32 v5, s2
973; GFX8-NEXT:    v_subrev_u32_e64 v6, s[0:1], s16, v2
974; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
975; GFX8-NEXT:    v_rcp_iflag_f32_e32 v5, v5
976; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 1, v1
977; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s16, v2
978; GFX8-NEXT:    v_mul_f32_e32 v5, v5, v3
979; GFX8-NEXT:    v_cvt_u32_f32_e32 v5, v5
980; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
981; GFX8-NEXT:    v_subrev_u32_e64 v6, s[0:1], s16, v2
982; GFX8-NEXT:    s_sub_i32 s0, 0, s2
983; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
984; GFX8-NEXT:    v_mul_lo_u32 v6, s0, v5
985; GFX8-NEXT:    s_add_i32 s1, s6, s12
986; GFX8-NEXT:    s_xor_b32 s1, s1, s12
987; GFX8-NEXT:    s_xor_b32 s0, s4, s15
988; GFX8-NEXT:    v_mul_hi_u32 v6, v5, v6
989; GFX8-NEXT:    v_xor_b32_e32 v2, s4, v2
990; GFX8-NEXT:    v_xor_b32_e32 v1, s0, v1
991; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, s0, v1
992; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v6
993; GFX8-NEXT:    v_mul_hi_u32 v6, s1, v5
994; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, s4, v2
995; GFX8-NEXT:    s_ashr_i32 s4, s3, 31
996; GFX8-NEXT:    v_mul_lo_u32 v7, v6, s2
997; GFX8-NEXT:    s_add_i32 s0, s3, s4
998; GFX8-NEXT:    s_xor_b32 s3, s0, s4
999; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s1, v7
1000; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 1, v6
1001; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
1002; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
1003; GFX8-NEXT:    v_cvt_f32_u32_e32 v7, s3
1004; GFX8-NEXT:    v_subrev_u32_e64 v8, s[0:1], s2, v2
1005; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
1006; GFX8-NEXT:    v_rcp_iflag_f32_e32 v7, v7
1007; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 1, v6
1008; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
1009; GFX8-NEXT:    v_mul_f32_e32 v3, v7, v3
1010; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
1011; GFX8-NEXT:    v_subrev_u32_e64 v7, s[0:1], s2, v2
1012; GFX8-NEXT:    s_sub_i32 s0, 0, s3
1013; GFX8-NEXT:    v_cndmask_b32_e32 v7, v2, v7, vcc
1014; GFX8-NEXT:    v_mul_lo_u32 v2, s0, v3
1015; GFX8-NEXT:    s_ashr_i32 s2, s7, 31
1016; GFX8-NEXT:    s_add_i32 s1, s7, s2
1017; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
1018; GFX8-NEXT:    v_mul_hi_u32 v2, v3, v2
1019; GFX8-NEXT:    s_xor_b32 s1, s1, s2
1020; GFX8-NEXT:    s_xor_b32 s0, s12, s5
1021; GFX8-NEXT:    v_xor_b32_e32 v6, s0, v6
1022; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
1023; GFX8-NEXT:    v_mul_hi_u32 v3, s1, v2
1024; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s0, v6
1025; GFX8-NEXT:    v_xor_b32_e32 v6, s12, v7
1026; GFX8-NEXT:    v_mul_lo_u32 v7, v3, s3
1027; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s12, v6
1028; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 1, v3
1029; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, s1, v7
1030; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v7
1031; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
1032; GFX8-NEXT:    v_subrev_u32_e64 v8, s[0:1], s3, v7
1033; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
1034; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 1, v3
1035; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v7
1036; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
1037; GFX8-NEXT:    v_subrev_u32_e64 v8, s[0:1], s3, v7
1038; GFX8-NEXT:    s_xor_b32 s0, s2, s4
1039; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
1040; GFX8-NEXT:    v_xor_b32_e32 v3, s0, v3
1041; GFX8-NEXT:    v_mov_b32_e32 v8, s8
1042; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s0, v3
1043; GFX8-NEXT:    v_mov_b32_e32 v9, s9
1044; GFX8-NEXT:    v_xor_b32_e32 v7, s2, v7
1045; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
1046; GFX8-NEXT:    v_subrev_u32_e32 v7, vcc, s2, v7
1047; GFX8-NEXT:    v_mov_b32_e32 v0, s10
1048; GFX8-NEXT:    v_mov_b32_e32 v1, s11
1049; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
1050; GFX8-NEXT:    s_endpgm
1051;
1052; GFX9-LABEL: sdivrem_v4i32:
1053; GFX9:       ; %bb.0:
1054; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x20
1055; GFX9-NEXT:    v_mov_b32_e32 v2, 0x4f7ffffe
1056; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1057; GFX9-NEXT:    s_ashr_i32 s6, s12, 31
1058; GFX9-NEXT:    s_add_i32 s0, s12, s6
1059; GFX9-NEXT:    s_xor_b32 s7, s0, s6
1060; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
1061; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1062; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
1063; GFX9-NEXT:    s_ashr_i32 s4, s13, 31
1064; GFX9-NEXT:    s_add_i32 s5, s13, s4
1065; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1066; GFX9-NEXT:    s_sub_i32 s12, 0, s7
1067; GFX9-NEXT:    s_xor_b32 s5, s5, s4
1068; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s5
1069; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
1070; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
1071; GFX9-NEXT:    s_sub_i32 s13, 0, s5
1072; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1073; GFX9-NEXT:    v_mul_lo_u32 v3, s12, v0
1074; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1075; GFX9-NEXT:    s_ashr_i32 s12, s8, 31
1076; GFX9-NEXT:    s_add_i32 s8, s8, s12
1077; GFX9-NEXT:    s_xor_b32 s8, s8, s12
1078; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v3
1079; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v2
1080; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
1081; GFX9-NEXT:    s_xor_b32 s6, s12, s6
1082; GFX9-NEXT:    v_add_u32_e32 v0, v0, v3
1083; GFX9-NEXT:    v_mul_hi_u32 v0, s8, v0
1084; GFX9-NEXT:    v_mul_lo_u32 v3, s13, v1
1085; GFX9-NEXT:    s_ashr_i32 s13, s9, 31
1086; GFX9-NEXT:    s_add_i32 s9, s9, s13
1087; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s7
1088; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
1089; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
1090; GFX9-NEXT:    s_xor_b32 s4, s13, s4
1091; GFX9-NEXT:    v_sub_u32_e32 v4, s8, v4
1092; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v4
1093; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
1094; GFX9-NEXT:    v_subrev_u32_e32 v5, s7, v4
1095; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
1096; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
1097; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v4
1098; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
1099; GFX9-NEXT:    v_subrev_u32_e32 v5, s7, v4
1100; GFX9-NEXT:    s_xor_b32 s7, s9, s13
1101; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
1102; GFX9-NEXT:    v_mul_hi_u32 v1, s7, v1
1103; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
1104; GFX9-NEXT:    v_xor_b32_e32 v0, s6, v0
1105; GFX9-NEXT:    v_subrev_u32_e32 v0, s6, v0
1106; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s5
1107; GFX9-NEXT:    v_xor_b32_e32 v3, s12, v3
1108; GFX9-NEXT:    s_ashr_i32 s6, s14, 31
1109; GFX9-NEXT:    v_subrev_u32_e32 v4, s12, v3
1110; GFX9-NEXT:    v_sub_u32_e32 v3, s7, v5
1111; GFX9-NEXT:    s_add_i32 s7, s14, s6
1112; GFX9-NEXT:    s_xor_b32 s7, s7, s6
1113; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s7
1114; GFX9-NEXT:    v_add_u32_e32 v6, 1, v1
1115; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
1116; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
1117; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v5
1118; GFX9-NEXT:    v_subrev_u32_e32 v6, s5, v3
1119; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
1120; GFX9-NEXT:    v_add_u32_e32 v6, 1, v1
1121; GFX9-NEXT:    v_mul_f32_e32 v5, v5, v2
1122; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
1123; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
1124; GFX9-NEXT:    s_sub_i32 s8, 0, s7
1125; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
1126; GFX9-NEXT:    v_mul_lo_u32 v6, s8, v5
1127; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v1
1128; GFX9-NEXT:    v_subrev_u32_e32 v1, s4, v1
1129; GFX9-NEXT:    s_ashr_i32 s4, s15, 31
1130; GFX9-NEXT:    s_add_i32 s9, s15, s4
1131; GFX9-NEXT:    v_mul_hi_u32 v6, v5, v6
1132; GFX9-NEXT:    s_xor_b32 s9, s9, s4
1133; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, s9
1134; GFX9-NEXT:    v_subrev_u32_e32 v7, s5, v3
1135; GFX9-NEXT:    s_ashr_i32 s5, s10, 31
1136; GFX9-NEXT:    s_add_i32 s8, s10, s5
1137; GFX9-NEXT:    s_xor_b32 s8, s8, s5
1138; GFX9-NEXT:    v_add_u32_e32 v5, v5, v6
1139; GFX9-NEXT:    v_mul_hi_u32 v6, s8, v5
1140; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v8
1141; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
1142; GFX9-NEXT:    v_xor_b32_e32 v3, s13, v3
1143; GFX9-NEXT:    v_mul_lo_u32 v7, v6, s7
1144; GFX9-NEXT:    v_mul_f32_e32 v2, v8, v2
1145; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
1146; GFX9-NEXT:    v_subrev_u32_e32 v5, s13, v3
1147; GFX9-NEXT:    v_sub_u32_e32 v3, s8, v7
1148; GFX9-NEXT:    s_sub_i32 s8, 0, s9
1149; GFX9-NEXT:    v_mul_lo_u32 v8, s8, v2
1150; GFX9-NEXT:    v_add_u32_e32 v7, 1, v6
1151; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
1152; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
1153; GFX9-NEXT:    v_subrev_u32_e32 v7, s7, v3
1154; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
1155; GFX9-NEXT:    v_mul_hi_u32 v8, v2, v8
1156; GFX9-NEXT:    v_add_u32_e32 v7, 1, v6
1157; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
1158; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
1159; GFX9-NEXT:    v_subrev_u32_e32 v7, s7, v3
1160; GFX9-NEXT:    s_ashr_i32 s7, s11, 31
1161; GFX9-NEXT:    s_add_i32 s8, s11, s7
1162; GFX9-NEXT:    s_xor_b32 s8, s8, s7
1163; GFX9-NEXT:    v_add_u32_e32 v2, v2, v8
1164; GFX9-NEXT:    v_mul_hi_u32 v8, s8, v2
1165; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
1166; GFX9-NEXT:    s_xor_b32 s6, s5, s6
1167; GFX9-NEXT:    v_xor_b32_e32 v3, s5, v3
1168; GFX9-NEXT:    v_mul_lo_u32 v7, v8, s9
1169; GFX9-NEXT:    v_xor_b32_e32 v2, s6, v6
1170; GFX9-NEXT:    v_subrev_u32_e32 v6, s5, v3
1171; GFX9-NEXT:    s_xor_b32 s4, s7, s4
1172; GFX9-NEXT:    v_sub_u32_e32 v3, s8, v7
1173; GFX9-NEXT:    v_add_u32_e32 v7, 1, v8
1174; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
1175; GFX9-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
1176; GFX9-NEXT:    v_subrev_u32_e32 v8, s9, v3
1177; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
1178; GFX9-NEXT:    v_add_u32_e32 v8, 1, v7
1179; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
1180; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
1181; GFX9-NEXT:    v_subrev_u32_e32 v8, s9, v3
1182; GFX9-NEXT:    v_cndmask_b32_e32 v8, v3, v8, vcc
1183; GFX9-NEXT:    v_xor_b32_e32 v3, s4, v7
1184; GFX9-NEXT:    v_subrev_u32_e32 v2, s6, v2
1185; GFX9-NEXT:    v_subrev_u32_e32 v3, s4, v3
1186; GFX9-NEXT:    v_xor_b32_e32 v7, s7, v8
1187; GFX9-NEXT:    v_mov_b32_e32 v8, 0
1188; GFX9-NEXT:    v_subrev_u32_e32 v7, s7, v7
1189; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
1190; GFX9-NEXT:    global_store_dwordx4 v8, v[4:7], s[2:3]
1191; GFX9-NEXT:    s_endpgm
1192;
1193; GFX10-LABEL: sdivrem_v4i32:
1194; GFX10:       ; %bb.0:
1195; GFX10-NEXT:    s_clause 0x1
1196; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x20
1197; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
1198; GFX10-NEXT:    v_mov_b32_e32 v4, 0x4f7ffffe
1199; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1200; GFX10-NEXT:    s_ashr_i32 s12, s8, 31
1201; GFX10-NEXT:    s_ashr_i32 s14, s10, 31
1202; GFX10-NEXT:    s_add_i32 s6, s8, s12
1203; GFX10-NEXT:    s_add_i32 s8, s10, s14
1204; GFX10-NEXT:    s_xor_b32 s10, s6, s12
1205; GFX10-NEXT:    s_ashr_i32 s13, s9, 31
1206; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s10
1207; GFX10-NEXT:    s_ashr_i32 s15, s11, 31
1208; GFX10-NEXT:    s_add_i32 s7, s9, s13
1209; GFX10-NEXT:    s_add_i32 s9, s11, s15
1210; GFX10-NEXT:    s_xor_b32 s11, s7, s13
1211; GFX10-NEXT:    s_xor_b32 s8, s8, s14
1212; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1213; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s11
1214; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, s8
1215; GFX10-NEXT:    s_xor_b32 s9, s9, s15
1216; GFX10-NEXT:    s_sub_i32 s6, 0, s10
1217; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, s9
1218; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1219; GFX10-NEXT:    v_rcp_iflag_f32_e32 v2, v2
1220; GFX10-NEXT:    s_sub_i32 s7, 0, s11
1221; GFX10-NEXT:    s_sub_i32 s19, 0, s8
1222; GFX10-NEXT:    v_rcp_iflag_f32_e32 v3, v3
1223; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
1224; GFX10-NEXT:    s_ashr_i32 s16, s0, 31
1225; GFX10-NEXT:    s_ashr_i32 s17, s1, 31
1226; GFX10-NEXT:    s_add_i32 s0, s0, s16
1227; GFX10-NEXT:    s_ashr_i32 s18, s2, 31
1228; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
1229; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v4
1230; GFX10-NEXT:    v_mul_f32_e32 v2, v2, v4
1231; GFX10-NEXT:    s_xor_b32 s0, s0, s16
1232; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v4
1233; GFX10-NEXT:    v_mul_lo_u32 v4, s6, v0
1234; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
1235; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
1236; GFX10-NEXT:    s_sub_i32 s6, 0, s9
1237; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v3
1238; GFX10-NEXT:    s_add_i32 s1, s1, s17
1239; GFX10-NEXT:    v_mul_lo_u32 v5, s7, v1
1240; GFX10-NEXT:    v_mul_lo_u32 v6, s19, v2
1241; GFX10-NEXT:    v_mul_hi_u32 v4, v0, v4
1242; GFX10-NEXT:    v_mul_lo_u32 v7, s6, v3
1243; GFX10-NEXT:    s_add_i32 s2, s2, s18
1244; GFX10-NEXT:    s_ashr_i32 s19, s3, 31
1245; GFX10-NEXT:    s_xor_b32 s1, s1, s17
1246; GFX10-NEXT:    s_xor_b32 s2, s2, s18
1247; GFX10-NEXT:    v_mul_hi_u32 v5, v1, v5
1248; GFX10-NEXT:    v_mul_hi_u32 v6, v2, v6
1249; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v4
1250; GFX10-NEXT:    v_mul_hi_u32 v7, v3, v7
1251; GFX10-NEXT:    s_add_i32 s3, s3, s19
1252; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
1253; GFX10-NEXT:    s_xor_b32 s3, s3, s19
1254; GFX10-NEXT:    v_mul_hi_u32 v0, s0, v0
1255; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v5
1256; GFX10-NEXT:    v_add_nc_u32_e32 v2, v2, v6
1257; GFX10-NEXT:    s_xor_b32 s12, s16, s12
1258; GFX10-NEXT:    v_add_nc_u32_e32 v3, v3, v7
1259; GFX10-NEXT:    s_xor_b32 s13, s17, s13
1260; GFX10-NEXT:    v_mul_hi_u32 v1, s1, v1
1261; GFX10-NEXT:    v_mul_hi_u32 v2, s2, v2
1262; GFX10-NEXT:    v_mul_lo_u32 v4, v0, s10
1263; GFX10-NEXT:    v_mul_hi_u32 v3, s3, v3
1264; GFX10-NEXT:    v_add_nc_u32_e32 v8, 1, v0
1265; GFX10-NEXT:    s_xor_b32 s14, s18, s14
1266; GFX10-NEXT:    v_mul_lo_u32 v5, v1, s11
1267; GFX10-NEXT:    v_mul_lo_u32 v6, v2, s8
1268; GFX10-NEXT:    v_sub_nc_u32_e32 v4, s0, v4
1269; GFX10-NEXT:    v_mul_lo_u32 v7, v3, s9
1270; GFX10-NEXT:    v_add_nc_u32_e32 v9, 1, v1
1271; GFX10-NEXT:    v_add_nc_u32_e32 v10, 1, v2
1272; GFX10-NEXT:    v_add_nc_u32_e32 v11, 1, v3
1273; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s10, v4
1274; GFX10-NEXT:    v_sub_nc_u32_e32 v5, s1, v5
1275; GFX10-NEXT:    v_sub_nc_u32_e32 v6, s2, v6
1276; GFX10-NEXT:    v_sub_nc_u32_e32 v7, s3, v7
1277; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
1278; GFX10-NEXT:    v_subrev_nc_u32_e32 v8, s10, v4
1279; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s11, v5
1280; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s8, v6
1281; GFX10-NEXT:    v_cmp_le_u32_e64 s2, s9, v7
1282; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc_lo
1283; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s0
1284; GFX10-NEXT:    v_subrev_nc_u32_e32 v9, s11, v5
1285; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s1
1286; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, s8, v6
1287; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s2
1288; GFX10-NEXT:    v_subrev_nc_u32_e32 v11, s9, v7
1289; GFX10-NEXT:    v_add_nc_u32_e32 v8, 1, v0
1290; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s10, v4
1291; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s0
1292; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v10, s1
1293; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v11, s2
1294; GFX10-NEXT:    v_add_nc_u32_e32 v9, 1, v1
1295; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
1296; GFX10-NEXT:    v_subrev_nc_u32_e32 v8, s10, v4
1297; GFX10-NEXT:    v_add_nc_u32_e32 v10, 1, v2
1298; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s11, v5
1299; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s8, v6
1300; GFX10-NEXT:    v_add_nc_u32_e32 v11, 1, v3
1301; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc_lo
1302; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s9, v7
1303; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s0
1304; GFX10-NEXT:    v_subrev_nc_u32_e32 v9, s11, v5
1305; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s1
1306; GFX10-NEXT:    v_subrev_nc_u32_e32 v10, s8, v6
1307; GFX10-NEXT:    v_subrev_nc_u32_e32 v12, s9, v7
1308; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
1309; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s0
1310; GFX10-NEXT:    s_xor_b32 s0, s19, s15
1311; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v10, s1
1312; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v12, vcc_lo
1313; GFX10-NEXT:    v_xor_b32_e32 v0, s12, v0
1314; GFX10-NEXT:    v_xor_b32_e32 v1, s13, v1
1315; GFX10-NEXT:    v_xor_b32_e32 v2, s14, v2
1316; GFX10-NEXT:    v_xor_b32_e32 v3, s0, v3
1317; GFX10-NEXT:    v_xor_b32_e32 v4, s16, v4
1318; GFX10-NEXT:    v_xor_b32_e32 v5, s17, v5
1319; GFX10-NEXT:    v_xor_b32_e32 v6, s18, v6
1320; GFX10-NEXT:    v_xor_b32_e32 v7, s19, v7
1321; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, s12, v0
1322; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, s13, v1
1323; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, s14, v2
1324; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s0, v3
1325; GFX10-NEXT:    v_mov_b32_e32 v8, 0
1326; GFX10-NEXT:    v_subrev_nc_u32_e32 v4, s16, v4
1327; GFX10-NEXT:    v_subrev_nc_u32_e32 v5, s17, v5
1328; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s18, v6
1329; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s19, v7
1330; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1331; GFX10-NEXT:    global_store_dwordx4 v8, v[0:3], s[4:5]
1332; GFX10-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
1333; GFX10-NEXT:    s_endpgm
  ; The sdiv and srem below take identical operands (%x, %y); each result is
  ; stored to its own output pointer.
1334  %div = sdiv <4 x i32> %x, %y
1335  store <4 x i32> %div, <4 x i32> addrspace(1)* %out0
1336  %rem = srem <4 x i32> %x, %y
1337  store <4 x i32> %rem, <4 x i32> addrspace(1)* %out1
1338  ret void
1339}
1340
1341define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64> addrspace(1)* %out1, <2 x i64> %x, <2 x i64> %y) {
1342; GFX8-LABEL: sdivrem_v2i64:
1343; GFX8:       ; %bb.0:
1344; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
1345; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x20
1346; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1347; GFX8-NEXT:    s_ashr_i32 s6, s9, 31
1348; GFX8-NEXT:    s_ashr_i32 s12, s1, 31
1349; GFX8-NEXT:    s_add_u32 s14, s8, s6
1350; GFX8-NEXT:    s_cselect_b32 s7, 1, 0
1351; GFX8-NEXT:    s_and_b32 s7, s7, 1
1352; GFX8-NEXT:    s_cmp_lg_u32 s7, 0
1353; GFX8-NEXT:    s_addc_u32 s15, s9, s6
1354; GFX8-NEXT:    s_add_u32 s0, s0, s12
1355; GFX8-NEXT:    s_cselect_b32 s7, 1, 0
1356; GFX8-NEXT:    s_and_b32 s7, s7, 1
1357; GFX8-NEXT:    s_cmp_lg_u32 s7, 0
1358; GFX8-NEXT:    s_mov_b32 s13, s12
1359; GFX8-NEXT:    s_addc_u32 s1, s1, s12
1360; GFX8-NEXT:    s_xor_b64 s[8:9], s[0:1], s[12:13]
1361; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s9
1362; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s8
1363; GFX8-NEXT:    s_mov_b32 s7, s6
1364; GFX8-NEXT:    s_xor_b64 s[14:15], s[14:15], s[6:7]
1365; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
1366; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
1367; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1368; GFX8-NEXT:    s_sub_u32 s16, 0, s8
1369; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
1370; GFX8-NEXT:    s_and_b32 s0, s0, 1
1371; GFX8-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
1372; GFX8-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
1373; GFX8-NEXT:    v_trunc_f32_e32 v1, v1
1374; GFX8-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v1
1375; GFX8-NEXT:    v_add_f32_e32 v0, v2, v0
1376; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
1377; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
1378; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
1379; GFX8-NEXT:    s_subb_u32 s17, 0, s9
1380; GFX8-NEXT:    v_mul_lo_u32 v2, s16, v1
1381; GFX8-NEXT:    v_mul_lo_u32 v3, s17, v0
1382; GFX8-NEXT:    v_mul_hi_u32 v5, s16, v0
1383; GFX8-NEXT:    v_mul_lo_u32 v4, s16, v0
1384; GFX8-NEXT:    v_mov_b32_e32 v6, s9
1385; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
1386; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
1387; GFX8-NEXT:    v_mul_lo_u32 v3, v1, v4
1388; GFX8-NEXT:    v_mul_lo_u32 v5, v0, v2
1389; GFX8-NEXT:    v_mul_hi_u32 v7, v0, v4
1390; GFX8-NEXT:    v_mul_hi_u32 v4, v1, v4
1391; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
1392; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
1393; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v7
1394; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
1395; GFX8-NEXT:    v_mul_lo_u32 v7, v1, v2
1396; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
1397; GFX8-NEXT:    v_mul_hi_u32 v5, v0, v2
1398; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v7, v4
1399; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
1400; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
1401; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
1402; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v7, v5
1403; GFX8-NEXT:    v_mul_hi_u32 v2, v1, v2
1404; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
1405; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
1406; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v5, v4
1407; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
1408; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
1409; GFX8-NEXT:    v_addc_u32_e64 v3, s[0:1], v1, v2, vcc
1410; GFX8-NEXT:    v_mul_lo_u32 v4, s17, v0
1411; GFX8-NEXT:    v_mul_lo_u32 v5, s16, v3
1412; GFX8-NEXT:    v_mul_hi_u32 v8, s16, v0
1413; GFX8-NEXT:    v_mul_lo_u32 v7, s16, v0
1414; GFX8-NEXT:    v_add_u32_e64 v1, s[0:1], v1, v2
1415; GFX8-NEXT:    v_add_u32_e64 v4, s[0:1], v4, v5
1416; GFX8-NEXT:    v_add_u32_e64 v4, s[0:1], v4, v8
1417; GFX8-NEXT:    v_mul_lo_u32 v5, v3, v7
1418; GFX8-NEXT:    v_mul_lo_u32 v8, v0, v4
1419; GFX8-NEXT:    v_mul_hi_u32 v2, v0, v7
1420; GFX8-NEXT:    v_mul_hi_u32 v7, v3, v7
1421; GFX8-NEXT:    v_add_u32_e64 v5, s[0:1], v5, v8
1422; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
1423; GFX8-NEXT:    v_add_u32_e64 v2, s[0:1], v5, v2
1424; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
1425; GFX8-NEXT:    v_mul_lo_u32 v5, v3, v4
1426; GFX8-NEXT:    v_add_u32_e64 v2, s[0:1], v8, v2
1427; GFX8-NEXT:    v_mul_hi_u32 v8, v0, v4
1428; GFX8-NEXT:    v_add_u32_e64 v5, s[0:1], v5, v7
1429; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[0:1]
1430; GFX8-NEXT:    v_add_u32_e64 v5, s[0:1], v5, v8
1431; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
1432; GFX8-NEXT:    v_add_u32_e64 v7, s[0:1], v7, v8
1433; GFX8-NEXT:    v_mul_hi_u32 v3, v3, v4
1434; GFX8-NEXT:    v_add_u32_e64 v2, s[0:1], v5, v2
1435; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
1436; GFX8-NEXT:    v_add_u32_e64 v4, s[0:1], v7, v5
1437; GFX8-NEXT:    v_add_u32_e64 v3, s[0:1], v3, v4
1438; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
1439; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
1440; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1441; GFX8-NEXT:    v_mul_lo_u32 v2, s15, v0
1442; GFX8-NEXT:    v_mul_lo_u32 v3, s14, v1
1443; GFX8-NEXT:    v_mul_hi_u32 v5, s14, v0
1444; GFX8-NEXT:    v_mul_hi_u32 v0, s15, v0
1445; GFX8-NEXT:    v_mov_b32_e32 v4, s15
1446; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
1447; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
1448; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
1449; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
1450; GFX8-NEXT:    v_mul_lo_u32 v5, s15, v1
1451; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
1452; GFX8-NEXT:    v_mul_hi_u32 v3, s14, v1
1453; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v5, v0
1454; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
1455; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
1456; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
1457; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
1458; GFX8-NEXT:    v_mul_hi_u32 v1, s15, v1
1459; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
1460; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
1461; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
1462; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
1463; GFX8-NEXT:    v_mul_lo_u32 v2, s9, v0
1464; GFX8-NEXT:    v_mul_lo_u32 v3, s8, v1
1465; GFX8-NEXT:    v_mul_hi_u32 v7, s8, v0
1466; GFX8-NEXT:    v_mul_lo_u32 v5, s8, v0
1467; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
1468; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v7
1469; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s14, v5
1470; GFX8-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v2, vcc
1471; GFX8-NEXT:    v_sub_u32_e64 v2, s[0:1], s15, v2
1472; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v4
1473; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
1474; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v3
1475; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
1476; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v4
1477; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v2, v6, vcc
1478; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v7, s[0:1]
1479; GFX8-NEXT:    v_subrev_u32_e32 v7, vcc, s8, v3
1480; GFX8-NEXT:    v_subbrev_u32_e64 v8, s[0:1], 0, v2, vcc
1481; GFX8-NEXT:    v_add_u32_e64 v9, s[0:1], 1, v0
1482; GFX8-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v1, s[0:1]
1483; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v8
1484; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
1485; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v7
1486; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v2, v6, vcc
1487; GFX8-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
1488; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v8
1489; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s8, v7
1490; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[0:1]
1491; GFX8-NEXT:    v_add_u32_e64 v12, s[0:1], 1, v9
1492; GFX8-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
1493; GFX8-NEXT:    v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1]
1494; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
1495; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v12, vcc
1496; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
1497; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
1498; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc
1499; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
1500; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[0:1]
1501; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s[0:1]
1502; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
1503; GFX8-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[0:1]
1504; GFX8-NEXT:    s_xor_b64 s[0:1], s[6:7], s[12:13]
1505; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v0
1506; GFX8-NEXT:    s_ashr_i32 s8, s11, 31
1507; GFX8-NEXT:    s_ashr_i32 s12, s3, 31
1508; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
1509; GFX8-NEXT:    s_add_u32 s0, s10, s8
1510; GFX8-NEXT:    v_xor_b32_e32 v1, s1, v1
1511; GFX8-NEXT:    v_mov_b32_e32 v4, s1
1512; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
1513; GFX8-NEXT:    s_and_b32 s1, s1, 1
1514; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
1515; GFX8-NEXT:    s_addc_u32 s1, s11, s8
1516; GFX8-NEXT:    s_add_u32 s2, s2, s12
1517; GFX8-NEXT:    s_cselect_b32 s7, 1, 0
1518; GFX8-NEXT:    s_and_b32 s7, s7, 1
1519; GFX8-NEXT:    s_cmp_lg_u32 s7, 0
1520; GFX8-NEXT:    s_mov_b32 s13, s12
1521; GFX8-NEXT:    s_addc_u32 s3, s3, s12
1522; GFX8-NEXT:    s_xor_b64 s[2:3], s[2:3], s[12:13]
1523; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
1524; GFX8-NEXT:    v_cvt_f32_u32_e32 v4, s3
1525; GFX8-NEXT:    v_cvt_f32_u32_e32 v5, s2
1526; GFX8-NEXT:    v_xor_b32_e32 v3, s6, v3
1527; GFX8-NEXT:    v_xor_b32_e32 v2, s6, v2
1528; GFX8-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v4
1529; GFX8-NEXT:    v_add_f32_e32 v4, v4, v5
1530; GFX8-NEXT:    v_rcp_iflag_f32_e32 v7, v4
1531; GFX8-NEXT:    v_mov_b32_e32 v6, s6
1532; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s6, v3
1533; GFX8-NEXT:    v_subb_u32_e32 v5, vcc, v2, v6, vcc
1534; GFX8-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v7
1535; GFX8-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
1536; GFX8-NEXT:    s_mov_b32 s9, s8
1537; GFX8-NEXT:    v_trunc_f32_e32 v3, v3
1538; GFX8-NEXT:    s_xor_b64 s[6:7], s[0:1], s[8:9]
1539; GFX8-NEXT:    v_mul_f32_e32 v6, 0xcf800000, v3
1540; GFX8-NEXT:    v_add_f32_e32 v2, v6, v2
1541; GFX8-NEXT:    s_sub_u32 s10, 0, s2
1542; GFX8-NEXT:    v_cvt_u32_f32_e32 v2, v2
1543; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
1544; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
1545; GFX8-NEXT:    s_and_b32 s0, s0, 1
1546; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
1547; GFX8-NEXT:    s_subb_u32 s11, 0, s3
1548; GFX8-NEXT:    v_mul_lo_u32 v6, s11, v2
1549; GFX8-NEXT:    v_mul_lo_u32 v7, s10, v3
1550; GFX8-NEXT:    v_mul_hi_u32 v9, s10, v2
1551; GFX8-NEXT:    v_mul_lo_u32 v8, s10, v2
1552; GFX8-NEXT:    v_mov_b32_e32 v10, s3
1553; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v7
1554; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v9
1555; GFX8-NEXT:    v_mul_lo_u32 v7, v3, v8
1556; GFX8-NEXT:    v_mul_lo_u32 v9, v2, v6
1557; GFX8-NEXT:    v_mul_hi_u32 v11, v2, v8
1558; GFX8-NEXT:    v_mul_hi_u32 v8, v3, v8
1559; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v9
1560; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
1561; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v11
1562; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
1563; GFX8-NEXT:    v_mul_lo_u32 v11, v3, v6
1564; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v9, v7
1565; GFX8-NEXT:    v_mul_hi_u32 v9, v2, v6
1566; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v11, v8
1567; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
1568; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v9
1569; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
1570; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v11, v9
1571; GFX8-NEXT:    v_mul_hi_u32 v6, v3, v6
1572; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v8, v7
1573; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
1574; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v9, v8
1575; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v8
1576; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v7
1577; GFX8-NEXT:    v_addc_u32_e64 v7, s[0:1], v3, v6, vcc
1578; GFX8-NEXT:    v_mul_lo_u32 v8, s11, v2
1579; GFX8-NEXT:    v_mul_lo_u32 v9, s10, v7
1580; GFX8-NEXT:    v_mul_hi_u32 v12, s10, v2
1581; GFX8-NEXT:    v_mul_lo_u32 v11, s10, v2
1582; GFX8-NEXT:    v_add_u32_e64 v3, s[0:1], v3, v6
1583; GFX8-NEXT:    v_add_u32_e64 v8, s[0:1], v8, v9
1584; GFX8-NEXT:    v_add_u32_e64 v8, s[0:1], v8, v12
1585; GFX8-NEXT:    v_mul_lo_u32 v9, v7, v11
1586; GFX8-NEXT:    v_mul_lo_u32 v12, v2, v8
1587; GFX8-NEXT:    v_mul_hi_u32 v6, v2, v11
1588; GFX8-NEXT:    v_mul_hi_u32 v11, v7, v11
1589; GFX8-NEXT:    v_add_u32_e64 v9, s[0:1], v9, v12
1590; GFX8-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[0:1]
1591; GFX8-NEXT:    v_add_u32_e64 v6, s[0:1], v9, v6
1592; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
1593; GFX8-NEXT:    v_mul_lo_u32 v9, v7, v8
1594; GFX8-NEXT:    v_add_u32_e64 v6, s[0:1], v12, v6
1595; GFX8-NEXT:    v_mul_hi_u32 v12, v2, v8
1596; GFX8-NEXT:    v_add_u32_e64 v9, s[0:1], v9, v11
1597; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[0:1]
1598; GFX8-NEXT:    v_add_u32_e64 v9, s[0:1], v9, v12
1599; GFX8-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[0:1]
1600; GFX8-NEXT:    v_add_u32_e64 v11, s[0:1], v11, v12
1601; GFX8-NEXT:    v_mul_hi_u32 v7, v7, v8
1602; GFX8-NEXT:    v_add_u32_e64 v6, s[0:1], v9, v6
1603; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[0:1]
1604; GFX8-NEXT:    v_add_u32_e64 v8, s[0:1], v11, v9
1605; GFX8-NEXT:    v_add_u32_e64 v7, s[0:1], v7, v8
1606; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v3, v7, vcc
1607; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
1608; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1609; GFX8-NEXT:    v_mul_lo_u32 v6, s7, v2
1610; GFX8-NEXT:    v_mul_lo_u32 v7, s6, v3
1611; GFX8-NEXT:    v_mul_hi_u32 v9, s6, v2
1612; GFX8-NEXT:    v_mul_hi_u32 v2, s7, v2
1613; GFX8-NEXT:    v_mov_b32_e32 v8, s7
1614; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v7
1615; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
1616; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v9
1617; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
1618; GFX8-NEXT:    v_mul_lo_u32 v9, s7, v3
1619; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v7, v6
1620; GFX8-NEXT:    v_mul_hi_u32 v7, s6, v3
1621; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v9, v2
1622; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
1623; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v7
1624; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
1625; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v9, v7
1626; GFX8-NEXT:    v_mul_hi_u32 v3, s7, v3
1627; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
1628; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
1629; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v7, v6
1630; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v6
1631; GFX8-NEXT:    v_mul_lo_u32 v6, s3, v2
1632; GFX8-NEXT:    v_mul_lo_u32 v7, s2, v3
1633; GFX8-NEXT:    v_mul_hi_u32 v11, s2, v2
1634; GFX8-NEXT:    v_mul_lo_u32 v9, s2, v2
1635; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v7
1636; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v11
1637; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, s6, v9
1638; GFX8-NEXT:    v_subb_u32_e64 v8, s[0:1], v8, v6, vcc
1639; GFX8-NEXT:    v_sub_u32_e64 v6, s[0:1], s7, v6
1640; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v8
1641; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[0:1]
1642; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v7
1643; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
1644; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v8
1645; GFX8-NEXT:    v_subb_u32_e32 v6, vcc, v6, v10, vcc
1646; GFX8-NEXT:    v_cndmask_b32_e64 v9, v9, v11, s[0:1]
1647; GFX8-NEXT:    v_subrev_u32_e32 v11, vcc, s2, v7
1648; GFX8-NEXT:    v_subbrev_u32_e64 v12, s[0:1], 0, v6, vcc
1649; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v12
1650; GFX8-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[0:1]
1651; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v11
1652; GFX8-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[0:1]
1653; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v12
1654; GFX8-NEXT:    v_cndmask_b32_e64 v13, v13, v14, s[0:1]
1655; GFX8-NEXT:    v_add_u32_e64 v14, s[0:1], 1, v2
1656; GFX8-NEXT:    v_subb_u32_e32 v6, vcc, v6, v10, vcc
1657; GFX8-NEXT:    v_addc_u32_e64 v15, s[0:1], 0, v3, s[0:1]
1658; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 1, v14
1659; GFX8-NEXT:    v_addc_u32_e32 v16, vcc, 0, v15, vcc
1660; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
1661; GFX8-NEXT:    v_subrev_u32_e64 v13, s[0:1], s2, v11
1662; GFX8-NEXT:    v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1]
1663; GFX8-NEXT:    v_cndmask_b32_e32 v10, v14, v10, vcc
1664; GFX8-NEXT:    v_cndmask_b32_e32 v14, v15, v16, vcc
1665; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v9
1666; GFX8-NEXT:    v_cndmask_b32_e32 v9, v11, v13, vcc
1667; GFX8-NEXT:    v_cndmask_b32_e32 v6, v12, v6, vcc
1668; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
1669; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[0:1]
1670; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v14, s[0:1]
1671; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[0:1]
1672; GFX8-NEXT:    v_cndmask_b32_e64 v6, v8, v6, s[0:1]
1673; GFX8-NEXT:    s_xor_b64 s[0:1], s[8:9], s[12:13]
1674; GFX8-NEXT:    v_xor_b32_e32 v2, s0, v2
1675; GFX8-NEXT:    v_xor_b32_e32 v3, s1, v3
1676; GFX8-NEXT:    v_mov_b32_e32 v8, s1
1677; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s0, v2
1678; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v3, v8, vcc
1679; GFX8-NEXT:    v_xor_b32_e32 v7, s8, v7
1680; GFX8-NEXT:    v_xor_b32_e32 v8, s8, v6
1681; GFX8-NEXT:    v_mov_b32_e32 v9, s8
1682; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s8, v7
1683; GFX8-NEXT:    v_subb_u32_e32 v7, vcc, v8, v9, vcc
1684; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1685; GFX8-NEXT:    v_mov_b32_e32 v9, s5
1686; GFX8-NEXT:    v_mov_b32_e32 v8, s4
1687; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
1688; GFX8-NEXT:    s_nop 0
1689; GFX8-NEXT:    v_mov_b32_e32 v0, s6
1690; GFX8-NEXT:    v_mov_b32_e32 v1, s7
1691; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
1692; GFX8-NEXT:    s_endpgm
1693;
1694; GFX9-LABEL: sdivrem_v2i64:
1695; GFX9:       ; %bb.0:
1696; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
1697; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x20
1698; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1699; GFX9-NEXT:    s_ashr_i32 s6, s9, 31
1700; GFX9-NEXT:    s_ashr_i32 s12, s1, 31
1701; GFX9-NEXT:    s_add_u32 s14, s8, s6
1702; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
1703; GFX9-NEXT:    s_and_b32 s7, s7, 1
1704; GFX9-NEXT:    s_cmp_lg_u32 s7, 0
1705; GFX9-NEXT:    s_addc_u32 s15, s9, s6
1706; GFX9-NEXT:    s_add_u32 s0, s0, s12
1707; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
1708; GFX9-NEXT:    s_and_b32 s7, s7, 1
1709; GFX9-NEXT:    s_cmp_lg_u32 s7, 0
1710; GFX9-NEXT:    s_mov_b32 s13, s12
1711; GFX9-NEXT:    s_addc_u32 s1, s1, s12
1712; GFX9-NEXT:    s_xor_b64 s[8:9], s[0:1], s[12:13]
1713; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s9
1714; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s8
1715; GFX9-NEXT:    s_mov_b32 s7, s6
1716; GFX9-NEXT:    s_xor_b64 s[14:15], s[14:15], s[6:7]
1717; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
1718; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
1719; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1720; GFX9-NEXT:    s_sub_u32 s16, 0, s8
1721; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
1722; GFX9-NEXT:    s_and_b32 s0, s0, 1
1723; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
1724; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
1725; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
1726; GFX9-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v1
1727; GFX9-NEXT:    v_add_f32_e32 v0, v2, v0
1728; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
1729; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
1730; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
1731; GFX9-NEXT:    s_subb_u32 s17, 0, s9
1732; GFX9-NEXT:    v_mul_lo_u32 v2, s16, v1
1733; GFX9-NEXT:    v_mul_lo_u32 v3, s17, v0
1734; GFX9-NEXT:    v_mul_hi_u32 v4, s16, v0
1735; GFX9-NEXT:    v_mul_lo_u32 v5, s16, v0
1736; GFX9-NEXT:    v_add3_u32 v2, v3, v2, v4
1737; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v5
1738; GFX9-NEXT:    v_mul_lo_u32 v4, v0, v2
1739; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v5
1740; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v5
1741; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
1742; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
1743; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
1744; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
1745; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v2
1746; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
1747; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v2
1748; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v2
1749; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
1750; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
1751; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
1752; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
1753; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
1754; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
1755; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
1756; GFX9-NEXT:    v_add3_u32 v2, v5, v4, v2
1757; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
1758; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[0:1], v1, v2, vcc
1759; GFX9-NEXT:    v_mul_lo_u32 v4, s17, v0
1760; GFX9-NEXT:    v_mul_lo_u32 v5, s16, v3
1761; GFX9-NEXT:    v_mul_hi_u32 v6, s16, v0
1762; GFX9-NEXT:    v_mul_lo_u32 v7, s16, v0
1763; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
1764; GFX9-NEXT:    v_add3_u32 v4, v4, v5, v6
1765; GFX9-NEXT:    v_mul_lo_u32 v5, v3, v7
1766; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v4
1767; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v7
1768; GFX9-NEXT:    v_mul_hi_u32 v7, v3, v7
1769; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], v5, v6
1770; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
1771; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], v5, v2
1772; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
1773; GFX9-NEXT:    v_mul_lo_u32 v5, v3, v4
1774; GFX9-NEXT:    v_add_u32_e32 v2, v6, v2
1775; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v4
1776; GFX9-NEXT:    v_mul_hi_u32 v3, v3, v4
1777; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], v5, v7
1778; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[0:1]
1779; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], v5, v6
1780; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
1781; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], v5, v2
1782; GFX9-NEXT:    v_add_u32_e32 v6, v7, v6
1783; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
1784; GFX9-NEXT:    v_add3_u32 v3, v6, v4, v3
1785; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
1786; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
1787; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1788; GFX9-NEXT:    v_mul_lo_u32 v2, s15, v0
1789; GFX9-NEXT:    v_mul_lo_u32 v3, s14, v1
1790; GFX9-NEXT:    v_mul_hi_u32 v4, s14, v0
1791; GFX9-NEXT:    v_mul_hi_u32 v0, s15, v0
1792; GFX9-NEXT:    v_mov_b32_e32 v7, s15
1793; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
1794; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
1795; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
1796; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
1797; GFX9-NEXT:    v_mul_lo_u32 v4, s15, v1
1798; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
1799; GFX9-NEXT:    v_mul_hi_u32 v3, s14, v1
1800; GFX9-NEXT:    v_mul_hi_u32 v1, s15, v1
1801; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v0
1802; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
1803; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
1804; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
1805; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
1806; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
1807; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
1808; GFX9-NEXT:    v_add3_u32 v1, v3, v2, v1
1809; GFX9-NEXT:    v_mul_lo_u32 v2, s9, v0
1810; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v1
1811; GFX9-NEXT:    v_mul_hi_u32 v4, s8, v0
1812; GFX9-NEXT:    v_mul_lo_u32 v6, s8, v0
1813; GFX9-NEXT:    v_mov_b32_e32 v5, s9
1814; GFX9-NEXT:    v_add3_u32 v2, v2, v3, v4
1815; GFX9-NEXT:    v_sub_co_u32_e32 v3, vcc, s14, v6
1816; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v7, v2, vcc
1817; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v4
1818; GFX9-NEXT:    v_sub_u32_e32 v2, s15, v2
1819; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
1820; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v3
1821; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
1822; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v4
1823; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v5, vcc
1824; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[0:1]
1825; GFX9-NEXT:    v_subrev_co_u32_e32 v7, vcc, s8, v3
1826; GFX9-NEXT:    v_subbrev_co_u32_e64 v8, s[0:1], 0, v2, vcc
1827; GFX9-NEXT:    v_add_co_u32_e64 v9, s[0:1], 1, v0
1828; GFX9-NEXT:    v_addc_co_u32_e64 v10, s[0:1], 0, v1, s[0:1]
1829; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v8
1830; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
1831; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v7
1832; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v5, vcc
1833; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
1834; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v8
1835; GFX9-NEXT:    v_subrev_co_u32_e32 v5, vcc, s8, v7
1836; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[0:1]
1837; GFX9-NEXT:    v_add_co_u32_e64 v12, s[0:1], 1, v9
1838; GFX9-NEXT:    v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc
1839; GFX9-NEXT:    v_addc_co_u32_e64 v13, s[0:1], 0, v10, s[0:1]
1840; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
1841; GFX9-NEXT:    v_cndmask_b32_e32 v9, v9, v12, vcc
1842; GFX9-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
1843; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
1844; GFX9-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
1845; GFX9-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
1846; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[0:1]
1847; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s[0:1]
1848; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
1849; GFX9-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[0:1]
1850; GFX9-NEXT:    s_xor_b64 s[0:1], s[6:7], s[12:13]
1851; GFX9-NEXT:    s_ashr_i32 s8, s11, 31
1852; GFX9-NEXT:    s_ashr_i32 s12, s3, 31
1853; GFX9-NEXT:    s_add_u32 s10, s10, s8
1854; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
1855; GFX9-NEXT:    s_and_b32 s7, s7, 1
1856; GFX9-NEXT:    s_cmp_lg_u32 s7, 0
1857; GFX9-NEXT:    s_addc_u32 s11, s11, s8
1858; GFX9-NEXT:    s_add_u32 s2, s2, s12
1859; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
1860; GFX9-NEXT:    s_and_b32 s7, s7, 1
1861; GFX9-NEXT:    s_cmp_lg_u32 s7, 0
1862; GFX9-NEXT:    s_mov_b32 s13, s12
1863; GFX9-NEXT:    s_addc_u32 s3, s3, s12
1864; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[12:13]
1865; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s3
1866; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s2
1867; GFX9-NEXT:    s_mov_b32 s9, s8
1868; GFX9-NEXT:    s_xor_b64 s[10:11], s[10:11], s[8:9]
1869; GFX9-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v4
1870; GFX9-NEXT:    v_add_f32_e32 v4, v4, v5
1871; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v4
1872; GFX9-NEXT:    s_sub_u32 s7, 0, s2
1873; GFX9-NEXT:    v_xor_b32_e32 v1, s1, v1
1874; GFX9-NEXT:    v_mov_b32_e32 v5, s1
1875; GFX9-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
1876; GFX9-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v4
1877; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
1878; GFX9-NEXT:    v_mul_f32_e32 v7, 0xcf800000, v6
1879; GFX9-NEXT:    v_add_f32_e32 v4, v7, v4
1880; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
1881; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v6
1882; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
1883; GFX9-NEXT:    s_and_b32 s1, s1, 1
1884; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
1885; GFX9-NEXT:    s_subb_u32 s14, 0, s3
1886; GFX9-NEXT:    v_mul_lo_u32 v8, s14, v4
1887; GFX9-NEXT:    v_mul_lo_u32 v9, s7, v6
1888; GFX9-NEXT:    v_mul_hi_u32 v10, s7, v4
1889; GFX9-NEXT:    v_mul_lo_u32 v7, s7, v4
1890; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
1891; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
1892; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
1893; GFX9-NEXT:    v_add3_u32 v5, v8, v9, v10
1894; GFX9-NEXT:    v_mul_lo_u32 v8, v6, v7
1895; GFX9-NEXT:    v_mul_lo_u32 v9, v4, v5
1896; GFX9-NEXT:    v_mul_hi_u32 v10, v4, v7
1897; GFX9-NEXT:    v_mul_hi_u32 v7, v6, v7
1898; GFX9-NEXT:    v_xor_b32_e32 v3, s6, v3
1899; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v9
1900; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
1901; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v10
1902; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
1903; GFX9-NEXT:    v_mul_lo_u32 v10, v6, v5
1904; GFX9-NEXT:    v_add_u32_e32 v8, v9, v8
1905; GFX9-NEXT:    v_mul_hi_u32 v9, v4, v5
1906; GFX9-NEXT:    v_mul_hi_u32 v5, v6, v5
1907; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v10, v7
1908; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
1909; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v9
1910; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
1911; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v8
1912; GFX9-NEXT:    v_add_u32_e32 v9, v10, v9
1913; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
1914; GFX9-NEXT:    v_add3_u32 v5, v9, v8, v5
1915; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v7
1916; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[0:1], v6, v5, vcc
1917; GFX9-NEXT:    v_mul_lo_u32 v8, s14, v4
1918; GFX9-NEXT:    v_mul_lo_u32 v9, s7, v7
1919; GFX9-NEXT:    v_mul_hi_u32 v10, s7, v4
1920; GFX9-NEXT:    v_mul_lo_u32 v11, s7, v4
1921; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
1922; GFX9-NEXT:    v_xor_b32_e32 v2, s6, v2
1923; GFX9-NEXT:    v_add3_u32 v8, v8, v9, v10
1924; GFX9-NEXT:    v_mul_lo_u32 v9, v7, v11
1925; GFX9-NEXT:    v_mul_lo_u32 v10, v4, v8
1926; GFX9-NEXT:    v_mul_hi_u32 v6, v4, v11
1927; GFX9-NEXT:    v_mul_hi_u32 v11, v7, v11
1928; GFX9-NEXT:    v_mov_b32_e32 v12, s6
1929; GFX9-NEXT:    v_add_co_u32_e64 v9, s[0:1], v9, v10
1930; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[0:1]
1931; GFX9-NEXT:    v_add_co_u32_e64 v6, s[0:1], v9, v6
1932; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
1933; GFX9-NEXT:    v_mul_lo_u32 v9, v7, v8
1934; GFX9-NEXT:    v_add_u32_e32 v6, v10, v6
1935; GFX9-NEXT:    v_mul_hi_u32 v10, v4, v8
1936; GFX9-NEXT:    v_mul_hi_u32 v7, v7, v8
1937; GFX9-NEXT:    v_add_co_u32_e64 v9, s[0:1], v9, v11
1938; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[0:1]
1939; GFX9-NEXT:    v_add_co_u32_e64 v9, s[0:1], v9, v10
1940; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[0:1]
1941; GFX9-NEXT:    v_add_co_u32_e64 v6, s[0:1], v9, v6
1942; GFX9-NEXT:    v_add_u32_e32 v10, v11, v10
1943; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
1944; GFX9-NEXT:    v_add3_u32 v7, v10, v8, v7
1945; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
1946; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v4, v6
1947; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v5, vcc
1948; GFX9-NEXT:    v_mul_lo_u32 v8, s11, v6
1949; GFX9-NEXT:    v_mul_lo_u32 v9, s10, v7
1950; GFX9-NEXT:    v_subrev_co_u32_e32 v4, vcc, s6, v3
1951; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v2, v12, vcc
1952; GFX9-NEXT:    v_mul_hi_u32 v2, s10, v6
1953; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v8, v9
1954; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
1955; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
1956; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
1957; GFX9-NEXT:    v_mul_lo_u32 v3, s11, v7
1958; GFX9-NEXT:    v_mul_hi_u32 v6, s11, v6
1959; GFX9-NEXT:    v_add_u32_e32 v2, v8, v2
1960; GFX9-NEXT:    v_mul_hi_u32 v8, s10, v7
1961; GFX9-NEXT:    v_mul_hi_u32 v7, s11, v7
1962; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
1963; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
1964; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v8
1965; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
1966; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
1967; GFX9-NEXT:    v_add_u32_e32 v6, v6, v8
1968; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
1969; GFX9-NEXT:    v_add3_u32 v3, v6, v3, v7
1970; GFX9-NEXT:    v_mul_lo_u32 v6, s3, v2
1971; GFX9-NEXT:    v_mul_lo_u32 v7, s2, v3
1972; GFX9-NEXT:    v_mul_hi_u32 v8, s2, v2
1973; GFX9-NEXT:    v_mul_lo_u32 v10, s2, v2
1974; GFX9-NEXT:    v_mov_b32_e32 v11, s11
1975; GFX9-NEXT:    v_mov_b32_e32 v9, s3
1976; GFX9-NEXT:    v_add3_u32 v6, v6, v7, v8
1977; GFX9-NEXT:    v_sub_co_u32_e32 v7, vcc, s10, v10
1978; GFX9-NEXT:    v_subb_co_u32_e64 v8, s[0:1], v11, v6, vcc
1979; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v8
1980; GFX9-NEXT:    v_sub_u32_e32 v6, s11, v6
1981; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[0:1]
1982; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v7
1983; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
1984; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v8
1985; GFX9-NEXT:    v_subb_co_u32_e32 v6, vcc, v6, v9, vcc
1986; GFX9-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[0:1]
1987; GFX9-NEXT:    v_subrev_co_u32_e32 v11, vcc, s2, v7
1988; GFX9-NEXT:    v_subbrev_co_u32_e64 v12, s[0:1], 0, v6, vcc
1989; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v12
1990; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[0:1]
1991; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v11
1992; GFX9-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[0:1]
1993; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v12
1994; GFX9-NEXT:    v_cndmask_b32_e64 v13, v13, v14, s[0:1]
1995; GFX9-NEXT:    v_add_co_u32_e64 v14, s[0:1], 1, v2
1996; GFX9-NEXT:    v_subb_co_u32_e32 v6, vcc, v6, v9, vcc
1997; GFX9-NEXT:    v_addc_co_u32_e64 v15, s[0:1], 0, v3, s[0:1]
1998; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, 1, v14
1999; GFX9-NEXT:    v_addc_co_u32_e32 v16, vcc, 0, v15, vcc
2000; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
2001; GFX9-NEXT:    v_cndmask_b32_e32 v9, v14, v9, vcc
2002; GFX9-NEXT:    v_cndmask_b32_e32 v14, v15, v16, vcc
2003; GFX9-NEXT:    v_subrev_co_u32_e64 v15, s[0:1], s2, v11
2004; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[0:1], 0, v6, s[0:1]
2005; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v10
2006; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
2007; GFX9-NEXT:    v_cndmask_b32_e32 v9, v11, v15, vcc
2008; GFX9-NEXT:    v_cndmask_b32_e32 v6, v12, v6, vcc
2009; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
2010; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v14, s[0:1]
2011; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[0:1]
2012; GFX9-NEXT:    v_cndmask_b32_e64 v6, v8, v6, s[0:1]
2013; GFX9-NEXT:    s_xor_b64 s[0:1], s[8:9], s[12:13]
2014; GFX9-NEXT:    v_xor_b32_e32 v2, s0, v2
2015; GFX9-NEXT:    v_xor_b32_e32 v3, s1, v3
2016; GFX9-NEXT:    v_mov_b32_e32 v8, s1
2017; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v2
2018; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v8, vcc
2019; GFX9-NEXT:    v_xor_b32_e32 v7, s8, v7
2020; GFX9-NEXT:    v_mov_b32_e32 v13, 0
2021; GFX9-NEXT:    v_xor_b32_e32 v8, s8, v6
2022; GFX9-NEXT:    v_mov_b32_e32 v9, s8
2023; GFX9-NEXT:    v_subrev_co_u32_e32 v6, vcc, s8, v7
2024; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, v8, v9, vcc
2025; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2026; GFX9-NEXT:    global_store_dwordx4 v13, v[0:3], s[4:5]
2027; GFX9-NEXT:    global_store_dwordx4 v13, v[4:7], s[6:7]
2028; GFX9-NEXT:    s_endpgm
2029;
2030; GFX10-LABEL: sdivrem_v2i64:
2031; GFX10:       ; %bb.0:
2032; GFX10-NEXT:    s_clause 0x1
2033; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
2034; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x20
2035; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2036; GFX10-NEXT:    s_ashr_i32 s12, s9, 31
2037; GFX10-NEXT:    s_ashr_i32 s6, s1, 31
2038; GFX10-NEXT:    s_add_u32 s14, s8, s12
2039; GFX10-NEXT:    s_cselect_b32 s7, 1, 0
2040; GFX10-NEXT:    s_mov_b32 s13, s12
2041; GFX10-NEXT:    s_and_b32 s7, s7, 1
2042; GFX10-NEXT:    s_cmp_lg_u32 s7, 0
2043; GFX10-NEXT:    s_addc_u32 s15, s9, s12
2044; GFX10-NEXT:    s_add_u32 s0, s0, s6
2045; GFX10-NEXT:    s_cselect_b32 s7, 1, 0
2046; GFX10-NEXT:    s_and_b32 s8, s7, 1
2047; GFX10-NEXT:    s_mov_b32 s7, s6
2048; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
2049; GFX10-NEXT:    s_addc_u32 s1, s1, s6
2050; GFX10-NEXT:    s_xor_b64 s[14:15], s[14:15], s[12:13]
2051; GFX10-NEXT:    s_xor_b64 s[8:9], s[0:1], s[6:7]
2052; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s9
2053; GFX10-NEXT:    s_sub_u32 s22, 0, s8
2054; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
2055; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s8
2056; GFX10-NEXT:    s_and_b32 s0, s0, 1
2057; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v1
2058; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
2059; GFX10-NEXT:    s_subb_u32 s23, 0, s9
2060; GFX10-NEXT:    s_ashr_i32 s16, s11, 31
2061; GFX10-NEXT:    s_xor_b64 s[20:21], s[12:13], s[6:7]
2062; GFX10-NEXT:    s_ashr_i32 s18, s3, 31
2063; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
2064; GFX10-NEXT:    s_add_u32 s0, s10, s16
2065; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
2066; GFX10-NEXT:    s_mov_b32 s19, s18
2067; GFX10-NEXT:    s_and_b32 s1, s1, 1
2068; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2069; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
2070; GFX10-NEXT:    s_mov_b32 s17, s16
2071; GFX10-NEXT:    s_addc_u32 s1, s11, s16
2072; GFX10-NEXT:    s_add_u32 s2, s2, s18
2073; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
2074; GFX10-NEXT:    s_and_b32 s6, s6, 1
2075; GFX10-NEXT:    s_cmp_lg_u32 s6, 0
2076; GFX10-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
2077; GFX10-NEXT:    s_addc_u32 s3, s3, s18
2078; GFX10-NEXT:    s_xor_b64 s[10:11], s[0:1], s[16:17]
2079; GFX10-NEXT:    s_xor_b64 s[2:3], s[2:3], s[18:19]
2080; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s3
2081; GFX10-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
2082; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, s2
2083; GFX10-NEXT:    s_sub_u32 s6, 0, s2
2084; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
2085; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v1
2086; GFX10-NEXT:    v_trunc_f32_e32 v2, v2
2087; GFX10-NEXT:    s_and_b32 s0, s0, 1
2088; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
2089; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
2090; GFX10-NEXT:    v_mul_f32_e32 v3, 0xcf800000, v2
2091; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
2092; GFX10-NEXT:    s_subb_u32 s7, 0, s3
2093; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
2094; GFX10-NEXT:    v_add_f32_e32 v0, v3, v0
2095; GFX10-NEXT:    v_mul_lo_u32 v3, s22, v2
2096; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
2097; GFX10-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
2098; GFX10-NEXT:    v_mul_lo_u32 v4, s23, v0
2099; GFX10-NEXT:    v_mul_hi_u32 v5, s22, v0
2100; GFX10-NEXT:    v_mul_lo_u32 v6, s22, v0
2101; GFX10-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v1
2102; GFX10-NEXT:    v_add3_u32 v3, v4, v3, v5
2103; GFX10-NEXT:    v_trunc_f32_e32 v4, v7
2104; GFX10-NEXT:    v_mul_lo_u32 v5, v2, v6
2105; GFX10-NEXT:    v_mul_hi_u32 v7, v0, v6
2106; GFX10-NEXT:    v_mul_hi_u32 v6, v2, v6
2107; GFX10-NEXT:    v_mul_lo_u32 v8, v0, v3
2108; GFX10-NEXT:    v_mul_lo_u32 v10, v2, v3
2109; GFX10-NEXT:    v_mul_f32_e32 v9, 0xcf800000, v4
2110; GFX10-NEXT:    v_mul_hi_u32 v11, v0, v3
2111; GFX10-NEXT:    v_cvt_u32_f32_e32 v4, v4
2112; GFX10-NEXT:    v_mul_hi_u32 v3, v2, v3
2113; GFX10-NEXT:    v_add_f32_e32 v1, v9, v1
2114; GFX10-NEXT:    v_add_co_u32 v5, s0, v5, v8
2115; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s0
2116; GFX10-NEXT:    v_add_co_u32 v6, s0, v10, v6
2117; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s0
2118; GFX10-NEXT:    v_add_co_u32 v5, s0, v5, v7
2119; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
2120; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
2121; GFX10-NEXT:    v_mul_lo_u32 v9, s6, v4
2122; GFX10-NEXT:    v_add_co_u32 v6, s0, v6, v11
2123; GFX10-NEXT:    v_mul_lo_u32 v12, s7, v1
2124; GFX10-NEXT:    v_mul_hi_u32 v13, s6, v1
2125; GFX10-NEXT:    v_add_nc_u32_e32 v5, v8, v5
2126; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
2127; GFX10-NEXT:    v_mul_lo_u32 v11, s6, v1
2128; GFX10-NEXT:    v_add_co_u32 v5, s0, v6, v5
2129; GFX10-NEXT:    v_add_nc_u32_e32 v7, v10, v7
2130; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
2131; GFX10-NEXT:    v_add3_u32 v8, v12, v9, v13
2132; GFX10-NEXT:    v_mul_lo_u32 v9, v4, v11
2133; GFX10-NEXT:    v_mul_hi_u32 v10, v1, v11
2134; GFX10-NEXT:    v_mul_hi_u32 v11, v4, v11
2135; GFX10-NEXT:    v_add3_u32 v3, v7, v6, v3
2136; GFX10-NEXT:    v_mul_lo_u32 v6, v1, v8
2137; GFX10-NEXT:    v_mul_lo_u32 v7, v4, v8
2138; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v5
2139; GFX10-NEXT:    v_add_co_ci_u32_e64 v12, s0, v2, v3, vcc_lo
2140; GFX10-NEXT:    v_mul_hi_u32 v5, v1, v8
2141; GFX10-NEXT:    v_mul_lo_u32 v14, s23, v0
2142; GFX10-NEXT:    v_add_co_u32 v6, s0, v9, v6
2143; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s0
2144; GFX10-NEXT:    v_add_co_u32 v7, s0, v7, v11
2145; GFX10-NEXT:    v_mul_hi_u32 v15, s22, v0
2146; GFX10-NEXT:    v_mul_lo_u32 v16, s22, v12
2147; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s0
2148; GFX10-NEXT:    v_add_co_u32 v6, s0, v6, v10
2149; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
2150; GFX10-NEXT:    v_mul_lo_u32 v13, s22, v0
2151; GFX10-NEXT:    v_add_co_u32 v5, s0, v7, v5
2152; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
2153; GFX10-NEXT:    v_add_nc_u32_e32 v6, v9, v6
2154; GFX10-NEXT:    v_add3_u32 v14, v14, v16, v15
2155; GFX10-NEXT:    v_mul_hi_u32 v8, v4, v8
2156; GFX10-NEXT:    v_add_nc_u32_e32 v2, v2, v3
2157; GFX10-NEXT:    v_mul_lo_u32 v10, v12, v13
2158; GFX10-NEXT:    v_add_nc_u32_e32 v7, v11, v7
2159; GFX10-NEXT:    v_add_co_u32 v5, s0, v5, v6
2160; GFX10-NEXT:    v_mul_lo_u32 v11, v0, v14
2161; GFX10-NEXT:    v_mul_hi_u32 v9, v0, v13
2162; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
2163; GFX10-NEXT:    v_mul_hi_u32 v13, v12, v13
2164; GFX10-NEXT:    v_mul_lo_u32 v15, v12, v14
2165; GFX10-NEXT:    v_add_co_u32 v1, s0, v1, v5
2166; GFX10-NEXT:    v_add3_u32 v6, v7, v6, v8
2167; GFX10-NEXT:    v_add_co_u32 v5, s1, v10, v11
2168; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s1
2169; GFX10-NEXT:    v_mul_hi_u32 v16, v0, v14
2170; GFX10-NEXT:    v_add_co_u32 v8, s1, v15, v13
2171; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s1
2172; GFX10-NEXT:    v_add_co_ci_u32_e64 v11, s1, v4, v6, s0
2173; GFX10-NEXT:    v_add_co_u32 v5, s1, v5, v9
2174; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s1
2175; GFX10-NEXT:    v_add_co_u32 v8, s1, v8, v16
2176; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s1
2177; GFX10-NEXT:    v_mul_lo_u32 v13, s7, v1
2178; GFX10-NEXT:    v_add_nc_u32_e32 v5, v7, v5
2179; GFX10-NEXT:    v_mul_hi_u32 v15, s6, v1
2180; GFX10-NEXT:    v_mul_lo_u32 v9, s6, v11
2181; GFX10-NEXT:    v_mul_hi_u32 v7, v12, v14
2182; GFX10-NEXT:    v_add_nc_u32_e32 v10, v10, v16
2183; GFX10-NEXT:    v_add_co_u32 v5, s1, v8, v5
2184; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s1
2185; GFX10-NEXT:    v_mul_lo_u32 v3, s6, v1
2186; GFX10-NEXT:    v_add_nc_u32_e32 v4, v4, v6
2187; GFX10-NEXT:    v_add3_u32 v9, v13, v9, v15
2188; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
2189; GFX10-NEXT:    v_add3_u32 v7, v10, v8, v7
2190; GFX10-NEXT:    v_mul_lo_u32 v14, v1, v9
2191; GFX10-NEXT:    v_mul_lo_u32 v12, v11, v3
2192; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v7, vcc_lo
2193; GFX10-NEXT:    v_mul_hi_u32 v13, v1, v3
2194; GFX10-NEXT:    v_mul_hi_u32 v3, v11, v3
2195; GFX10-NEXT:    v_mul_lo_u32 v8, v11, v9
2196; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v5
2197; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
2198; GFX10-NEXT:    v_mul_hi_u32 v10, v1, v9
2199; GFX10-NEXT:    v_add_co_u32 v7, s1, v12, v14
2200; GFX10-NEXT:    v_mul_hi_u32 v9, v11, v9
2201; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s1
2202; GFX10-NEXT:    v_add_co_u32 v3, s1, v8, v3
2203; GFX10-NEXT:    v_mul_lo_u32 v8, s15, v0
2204; GFX10-NEXT:    v_mul_lo_u32 v14, s14, v2
2205; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s1
2206; GFX10-NEXT:    v_mul_hi_u32 v12, s14, v0
2207; GFX10-NEXT:    v_mul_hi_u32 v0, s15, v0
2208; GFX10-NEXT:    v_add_co_u32 v7, s1, v7, v13
2209; GFX10-NEXT:    v_mul_lo_u32 v13, s15, v2
2210; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s1
2211; GFX10-NEXT:    v_add_co_u32 v3, s1, v3, v10
2212; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s1
2213; GFX10-NEXT:    v_add_co_u32 v8, s1, v8, v14
2214; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s1
2215; GFX10-NEXT:    v_add_co_u32 v0, s1, v13, v0
2216; GFX10-NEXT:    v_mul_hi_u32 v15, s14, v2
2217; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s1
2218; GFX10-NEXT:    v_add_co_u32 v8, s1, v8, v12
2219; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s1
2220; GFX10-NEXT:    v_add_nc_u32_e32 v7, v11, v7
2221; GFX10-NEXT:    v_mul_hi_u32 v2, s15, v2
2222; GFX10-NEXT:    v_add_nc_u32_e32 v5, v5, v10
2223; GFX10-NEXT:    v_add_co_u32 v0, s1, v0, v15
2224; GFX10-NEXT:    v_add_nc_u32_e32 v8, v14, v8
2225; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s1
2226; GFX10-NEXT:    v_add_co_u32 v0, s1, v0, v8
2227; GFX10-NEXT:    v_add_nc_u32_e32 v10, v13, v12
2228; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s1
2229; GFX10-NEXT:    v_add_co_u32 v3, s1, v3, v7
2230; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s1
2231; GFX10-NEXT:    v_mul_lo_u32 v6, s9, v0
2232; GFX10-NEXT:    v_add3_u32 v2, v10, v8, v2
2233; GFX10-NEXT:    v_add3_u32 v5, v5, v7, v9
2234; GFX10-NEXT:    v_mul_hi_u32 v7, s8, v0
2235; GFX10-NEXT:    v_mul_lo_u32 v8, s8, v2
2236; GFX10-NEXT:    v_mov_b32_e32 v9, 0
2237; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, vcc_lo, v4, v5, s0
2238; GFX10-NEXT:    v_mul_lo_u32 v5, s8, v0
2239; GFX10-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v3
2240; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v4, vcc_lo
2241; GFX10-NEXT:    v_add3_u32 v4, v6, v8, v7
2242; GFX10-NEXT:    v_mul_lo_u32 v6, s11, v1
2243; GFX10-NEXT:    v_mul_hi_u32 v7, s11, v1
2244; GFX10-NEXT:    v_sub_co_u32 v5, vcc_lo, s14, v5
2245; GFX10-NEXT:    v_sub_nc_u32_e32 v8, s15, v4
2246; GFX10-NEXT:    v_sub_co_ci_u32_e64 v4, s0, s15, v4, vcc_lo
2247; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v5
2248; GFX10-NEXT:    v_mul_lo_u32 v14, s10, v3
2249; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v8, vcc_lo, s9, v8, vcc_lo
2250; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s9, v4
2251; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s0
2252; GFX10-NEXT:    v_mul_lo_u32 v15, s11, v3
2253; GFX10-NEXT:    v_mul_hi_u32 v1, s10, v1
2254; GFX10-NEXT:    v_mul_hi_u32 v17, s10, v3
2255; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc_lo
2256; GFX10-NEXT:    v_sub_co_u32 v12, vcc_lo, v5, s8
2257; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v13, s0, 0, v8, vcc_lo
2258; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v4
2259; GFX10-NEXT:    v_mul_hi_u32 v3, s11, v3
2260; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v8, vcc_lo, s9, v8, vcc_lo
2261; GFX10-NEXT:    v_cndmask_b32_e64 v10, v11, v10, s0
2262; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s9, v13
2263; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v10
2264; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s0
2265; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v12
2266; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s0
2267; GFX10-NEXT:    v_add_co_u32 v6, s0, v6, v14
2268; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s0
2269; GFX10-NEXT:    v_add_co_u32 v7, s0, v15, v7
2270; GFX10-NEXT:    v_add_co_u32 v1, s1, v6, v1
2271; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
2272; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
2273; GFX10-NEXT:    v_add_co_u32 v7, s0, v7, v17
2274; GFX10-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s0
2275; GFX10-NEXT:    v_add_co_u32 v17, s0, v0, 1
2276; GFX10-NEXT:    v_add_co_ci_u32_e64 v18, s0, 0, v2, s0
2277; GFX10-NEXT:    v_add_nc_u32_e32 v1, v14, v1
2278; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v13
2279; GFX10-NEXT:    v_add_nc_u32_e32 v6, v6, v15
2280; GFX10-NEXT:    v_cndmask_b32_e64 v11, v11, v16, s0
2281; GFX10-NEXT:    v_add_co_u32 v7, s0, v7, v1
2282; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
2283; GFX10-NEXT:    v_add_co_u32 v14, s0, v17, 1
2284; GFX10-NEXT:    v_add_co_ci_u32_e64 v15, s0, 0, v18, s0
2285; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v11
2286; GFX10-NEXT:    v_add3_u32 v3, v6, v1, v3
2287; GFX10-NEXT:    v_mul_lo_u32 v10, s3, v7
2288; GFX10-NEXT:    v_mul_lo_u32 v16, s2, v7
2289; GFX10-NEXT:    v_cndmask_b32_e64 v6, v18, v15, s0
2290; GFX10-NEXT:    v_mul_lo_u32 v11, s2, v3
2291; GFX10-NEXT:    v_mul_hi_u32 v15, s2, v7
2292; GFX10-NEXT:    v_cndmask_b32_e64 v1, v17, v14, s0
2293; GFX10-NEXT:    v_sub_co_u32 v14, s1, v12, s8
2294; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v8, s1, 0, v8, s1
2295; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
2296; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v6, vcc_lo
2297; GFX10-NEXT:    v_add3_u32 v6, v10, v11, v15
2298; GFX10-NEXT:    v_cndmask_b32_e64 v12, v12, v14, s0
2299; GFX10-NEXT:    v_cndmask_b32_e64 v2, v13, v8, s0
2300; GFX10-NEXT:    v_sub_co_u32 v8, s0, s10, v16
2301; GFX10-NEXT:    v_sub_co_ci_u32_e64 v10, s1, s11, v6, s0
2302; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v12, vcc_lo
2303; GFX10-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
2304; GFX10-NEXT:    v_sub_nc_u32_e32 v4, s11, v6
2305; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s3, v10
2306; GFX10-NEXT:    v_xor_b32_e32 v0, s20, v0
2307; GFX10-NEXT:    v_xor_b32_e32 v1, s21, v1
2308; GFX10-NEXT:    v_xor_b32_e32 v5, s12, v5
2309; GFX10-NEXT:    v_xor_b32_e32 v2, s12, v2
2310; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc_lo
2311; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v4, vcc_lo, s3, v4, s0
2312; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v8
2313; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc_lo
2314; GFX10-NEXT:    v_sub_co_u32 v12, vcc_lo, v8, s2
2315; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v13, s0, 0, v4, vcc_lo
2316; GFX10-NEXT:    v_sub_co_u32 v0, s0, v0, s20
2317; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v1, s0, s21, v1, s0
2318; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s3, v10
2319; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v4, vcc_lo, s3, v4, vcc_lo
2320; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v11, s0
2321; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s3, v13
2322; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s0
2323; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s2, v12
2324; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s0
2325; GFX10-NEXT:    v_add_co_u32 v15, s0, v7, 1
2326; GFX10-NEXT:    v_add_co_ci_u32_e64 v16, s0, 0, v3, s0
2327; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s3, v13
2328; GFX10-NEXT:    v_cndmask_b32_e64 v11, v11, v14, s0
2329; GFX10-NEXT:    v_add_co_u32 v14, s0, v15, 1
2330; GFX10-NEXT:    v_add_co_ci_u32_e64 v17, s0, 0, v16, s0
2331; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
2332; GFX10-NEXT:    v_sub_co_u32 v11, s0, v12, s2
2333; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v4, s0, 0, v4, s0
2334; GFX10-NEXT:    v_cndmask_b32_e32 v14, v15, v14, vcc_lo
2335; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v6
2336; GFX10-NEXT:    v_cndmask_b32_e32 v15, v16, v17, vcc_lo
2337; GFX10-NEXT:    v_cndmask_b32_e32 v6, v12, v11, vcc_lo
2338; GFX10-NEXT:    v_cndmask_b32_e32 v4, v13, v4, vcc_lo
2339; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v14, s0
2340; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v15, s0
2341; GFX10-NEXT:    v_cndmask_b32_e64 v6, v8, v6, s0
2342; GFX10-NEXT:    v_cndmask_b32_e64 v8, v10, v4, s0
2343; GFX10-NEXT:    s_xor_b64 s[0:1], s[16:17], s[18:19]
2344; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v5, s12
2345; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v5, vcc_lo, s12, v2, vcc_lo
2346; GFX10-NEXT:    v_xor_b32_e32 v2, s0, v7
2347; GFX10-NEXT:    v_xor_b32_e32 v3, s1, v3
2348; GFX10-NEXT:    v_xor_b32_e32 v6, s16, v6
2349; GFX10-NEXT:    v_xor_b32_e32 v7, s16, v8
2350; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v2, s0
2351; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v3, vcc_lo
2352; GFX10-NEXT:    v_sub_co_u32 v6, vcc_lo, v6, s16
2353; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, s16, v7, vcc_lo
2354; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2355; GFX10-NEXT:    global_store_dwordx4 v9, v[0:3], s[4:5]
2356; GFX10-NEXT:    global_store_dwordx4 v9, v[4:7], s[6:7]
2357; GFX10-NEXT:    s_endpgm
2358  %div = sdiv <2 x i64> %x, %y
2359  store <2 x i64> %div, <2 x i64> addrspace(1)* %out0
2360  %rem = srem <2 x i64> %x, %y
2361  store <2 x i64> %rem, <2 x i64> addrspace(1)* %out1
2362  ret void
2363}
2364
; Test: signed i8 division and remainder of the same operands in one kernel.
; %x and %y are i8 kernel arguments; the sdiv result is stored through %out0
; and the srem result through %out1 (both addrspace(1) pointers).
; The GFX8/GFX9/GFX10 check lines below are the autogenerated expectations for
; the three RUN lines at the top of the file; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
; For each prefix the expected output contains a single v_rcp_iflag_f32 and
; two byte stores.
; NOTE(review): the name says "sdiv" although the body also computes srem,
; unlike the sdivrem_* tests around it - possibly meant to be sdivrem_i8;
; confirm before renaming, since the -LABEL lines depend on the name.
2365define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out0, i8 addrspace(1)* %out1, i8 %x, i8 %y) {
2366; GFX8-LABEL: sdiv_i8:
2367; GFX8:       ; %bb.0:
2368; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x10
2369; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2370; GFX8-NEXT:    s_bfe_i32 s1, s0, 0x80008
2371; GFX8-NEXT:    s_ashr_i32 s6, s1, 31
2372; GFX8-NEXT:    s_add_i32 s1, s1, s6
2373; GFX8-NEXT:    s_xor_b32 s7, s1, s6
2374; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s7
2375; GFX8-NEXT:    s_sub_i32 s1, 0, s7
2376; GFX8-NEXT:    s_sext_i32_i8 s0, s0
2377; GFX8-NEXT:    s_ashr_i32 s8, s0, 31
2378; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2379; GFX8-NEXT:    s_add_i32 s0, s0, s8
2380; GFX8-NEXT:    s_xor_b32 s9, s0, s8
2381; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2382; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
2383; GFX8-NEXT:    v_mul_lo_u32 v1, s1, v0
2384; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2385; GFX8-NEXT:    s_xor_b32 s4, s8, s6
2386; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
2387; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
2388; GFX8-NEXT:    v_mul_hi_u32 v2, s9, v0
2389; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2390; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2391; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2392; GFX8-NEXT:    v_mul_lo_u32 v3, v2, s7
2393; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
2394; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s9, v3
2395; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
2396; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
2397; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s7, v3
2398; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2399; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
2400; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
2401; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
2402; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s7, v3
2403; GFX8-NEXT:    v_xor_b32_e32 v2, s4, v2
2404; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2405; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s4, v2
2406; GFX8-NEXT:    v_xor_b32_e32 v3, s8, v3
2407; GFX8-NEXT:    flat_store_byte v[0:1], v2
2408; GFX8-NEXT:    v_mov_b32_e32 v0, s2
2409; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s8, v3
2410; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2411; GFX8-NEXT:    flat_store_byte v[0:1], v3
2412; GFX8-NEXT:    s_endpgm
2413;
2414; GFX9-LABEL: sdiv_i8:
2415; GFX9:       ; %bb.0:
2416; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x10
2417; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2418; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2419; GFX9-NEXT:    s_bfe_i32 s1, s0, 0x80008
2420; GFX9-NEXT:    s_ashr_i32 s6, s1, 31
2421; GFX9-NEXT:    s_add_i32 s1, s1, s6
2422; GFX9-NEXT:    s_xor_b32 s7, s1, s6
2423; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
2424; GFX9-NEXT:    s_sub_i32 s1, 0, s7
2425; GFX9-NEXT:    s_sext_i32_i8 s0, s0
2426; GFX9-NEXT:    s_ashr_i32 s8, s0, 31
2427; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2428; GFX9-NEXT:    s_add_i32 s0, s0, s8
2429; GFX9-NEXT:    s_xor_b32 s9, s0, s8
2430; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2431; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
2432; GFX9-NEXT:    v_mul_lo_u32 v1, s1, v0
2433; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2434; GFX9-NEXT:    s_xor_b32 s4, s8, s6
2435; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
2436; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
2437; GFX9-NEXT:    v_mul_hi_u32 v0, s9, v0
2438; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s7
2439; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
2440; GFX9-NEXT:    v_sub_u32_e32 v1, s9, v1
2441; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
2442; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
2443; GFX9-NEXT:    v_subrev_u32_e32 v3, s7, v1
2444; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
2445; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
2446; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
2447; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
2448; GFX9-NEXT:    v_subrev_u32_e32 v3, s7, v1
2449; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
2450; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
2451; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
2452; GFX9-NEXT:    v_xor_b32_e32 v1, s8, v1
2453; GFX9-NEXT:    v_subrev_u32_e32 v1, s8, v1
2454; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2455; GFX9-NEXT:    global_store_byte v2, v0, s[0:1]
2456; GFX9-NEXT:    global_store_byte v2, v1, s[2:3]
2457; GFX9-NEXT:    s_endpgm
2458;
2459; GFX10-LABEL: sdiv_i8:
2460; GFX10:       ; %bb.0:
2461; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x10
2462; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2463; GFX10-NEXT:    s_bfe_i32 s1, s0, 0x80008
2464; GFX10-NEXT:    s_sext_i32_i8 s0, s0
2465; GFX10-NEXT:    s_ashr_i32 s6, s1, 31
2466; GFX10-NEXT:    s_ashr_i32 s8, s0, 31
2467; GFX10-NEXT:    s_add_i32 s1, s1, s6
2468; GFX10-NEXT:    s_add_i32 s0, s0, s8
2469; GFX10-NEXT:    s_xor_b32 s7, s1, s6
2470; GFX10-NEXT:    s_xor_b32 s0, s0, s8
2471; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s7
2472; GFX10-NEXT:    s_sub_i32 s1, 0, s7
2473; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2474; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2475; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
2476; GFX10-NEXT:    v_mul_lo_u32 v1, s1, v0
2477; GFX10-NEXT:    v_mul_hi_u32 v1, v0, v1
2478; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
2479; GFX10-NEXT:    v_mul_hi_u32 v0, s0, v0
2480; GFX10-NEXT:    v_mul_lo_u32 v1, v0, s7
2481; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
2482; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s0, v1
2483; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2484; GFX10-NEXT:    s_xor_b32 s4, s8, s6
2485; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s7, v1
2486; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s7, v1
2487; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
2488; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
2489; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
2490; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s7, v1
2491; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s7, v1
2492; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
2493; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
2494; GFX10-NEXT:    v_mov_b32_e32 v2, 0
2495; GFX10-NEXT:    v_xor_b32_e32 v0, s4, v0
2496; GFX10-NEXT:    v_xor_b32_e32 v1, s8, v1
2497; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, s4, v0
2498; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, s8, v1
2499; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2500; GFX10-NEXT:    global_store_byte v2, v0, s[0:1]
2501; GFX10-NEXT:    global_store_byte v2, v1, s[2:3]
2502; GFX10-NEXT:    s_endpgm
; Signed quotient of %x by %y, written to the first output pointer.
2503  %div = sdiv i8 %x, %y
2504  store i8 %div, i8 addrspace(1)* %out0
; Signed remainder of the same operands, written to the second output pointer.
2505  %rem = srem i8 %x, %y
2506  store i8 %rem, i8 addrspace(1)* %out1
2507  ret void
2508}
2509
2510define amdgpu_kernel void @sdivrem_v2i8(<2 x i8> addrspace(1)* %out0, <2 x i8> addrspace(1)* %out1, <2 x i8> %x, <2 x i8> %y) {
2511; GFX8-LABEL: sdivrem_v2i8:
2512; GFX8:       ; %bb.0:
2513; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x10
2514; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2515; GFX8-NEXT:    s_bfe_i32 s0, s2, 0x80010
2516; GFX8-NEXT:    s_ashr_i32 s3, s0, 31
2517; GFX8-NEXT:    s_add_i32 s0, s0, s3
2518; GFX8-NEXT:    s_xor_b32 s8, s0, s3
2519; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s8
2520; GFX8-NEXT:    s_sub_i32 s6, 0, s8
2521; GFX8-NEXT:    s_bfe_i32 s1, s2, 0x80018
2522; GFX8-NEXT:    s_ashr_i32 s10, s1, 31
2523; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2524; GFX8-NEXT:    s_add_i32 s1, s1, s10
2525; GFX8-NEXT:    s_xor_b32 s11, s1, s10
2526; GFX8-NEXT:    s_sext_i32_i8 s0, s2
2527; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2528; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
2529; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, s11
2530; GFX8-NEXT:    s_ashr_i32 s9, s0, 31
2531; GFX8-NEXT:    s_add_i32 s0, s0, s9
2532; GFX8-NEXT:    v_mul_lo_u32 v1, s6, v0
2533; GFX8-NEXT:    s_xor_b32 s0, s0, s9
2534; GFX8-NEXT:    v_rcp_iflag_f32_e32 v2, v2
2535; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
2536; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
2537; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
2538; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
2539; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
2540; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
2541; GFX8-NEXT:    v_mul_lo_u32 v2, v0, s8
2542; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v0
2543; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v2
2544; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
2545; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
2546; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s8, v2
2547; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
2548; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v0
2549; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
2550; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
2551; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s8, v2
2552; GFX8-NEXT:    s_sub_i32 s1, 0, s11
2553; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
2554; GFX8-NEXT:    v_mul_lo_u32 v3, s1, v1
2555; GFX8-NEXT:    s_bfe_i32 s1, s2, 0x80008
2556; GFX8-NEXT:    s_ashr_i32 s2, s1, 31
2557; GFX8-NEXT:    s_add_i32 s1, s1, s2
2558; GFX8-NEXT:    v_mul_hi_u32 v3, v1, v3
2559; GFX8-NEXT:    s_xor_b32 s1, s1, s2
2560; GFX8-NEXT:    s_xor_b32 s0, s9, s3
2561; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v0
2562; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
2563; GFX8-NEXT:    v_mul_hi_u32 v1, s1, v1
2564; GFX8-NEXT:    v_xor_b32_e32 v2, s9, v2
2565; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
2566; GFX8-NEXT:    v_mul_lo_u32 v3, v1, s11
2567; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s9, v2
2568; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
2569; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s1, v3
2570; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
2571; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
2572; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s11, v3
2573; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2574; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
2575; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
2576; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
2577; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s11, v3
2578; GFX8-NEXT:    s_xor_b32 s0, s2, s10
2579; GFX8-NEXT:    v_xor_b32_e32 v1, s0, v1
2580; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2581; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, s0, v1
2582; GFX8-NEXT:    s_movk_i32 s0, 0xff
2583; GFX8-NEXT:    v_and_b32_e32 v1, s0, v1
2584; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
2585; GFX8-NEXT:    v_xor_b32_e32 v3, s2, v3
2586; GFX8-NEXT:    v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2587; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2588; GFX8-NEXT:    v_mov_b32_e32 v0, s4
2589; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s2, v3
2590; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2591; GFX8-NEXT:    flat_store_short v[0:1], v4
2592; GFX8-NEXT:    v_and_b32_e32 v0, s0, v3
2593; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
2594; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2595; GFX8-NEXT:    v_mov_b32_e32 v0, s6
2596; GFX8-NEXT:    v_mov_b32_e32 v1, s7
2597; GFX8-NEXT:    flat_store_short v[0:1], v2
2598; GFX8-NEXT:    s_endpgm
2599;
2600; GFX9-LABEL: sdivrem_v2i8:
2601; GFX9:       ; %bb.0:
2602; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
2603; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2604; GFX9-NEXT:    s_bfe_i32 s0, s6, 0x80010
2605; GFX9-NEXT:    s_ashr_i32 s7, s0, 31
2606; GFX9-NEXT:    s_add_i32 s0, s0, s7
2607; GFX9-NEXT:    s_xor_b32 s8, s0, s7
2608; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
2609; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2610; GFX9-NEXT:    s_bfe_i32 s5, s6, 0x80018
2611; GFX9-NEXT:    s_ashr_i32 s9, s5, 31
2612; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2613; GFX9-NEXT:    s_add_i32 s5, s5, s9
2614; GFX9-NEXT:    s_xor_b32 s5, s5, s9
2615; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s5
2616; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2617; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
2618; GFX9-NEXT:    s_sub_i32 s10, 0, s8
2619; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
2620; GFX9-NEXT:    s_sext_i32_i8 s4, s6
2621; GFX9-NEXT:    v_mul_lo_u32 v2, s10, v0
2622; GFX9-NEXT:    s_ashr_i32 s10, s4, 31
2623; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
2624; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
2625; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
2626; GFX9-NEXT:    s_add_i32 s4, s4, s10
2627; GFX9-NEXT:    s_xor_b32 s4, s4, s10
2628; GFX9-NEXT:    s_sub_i32 s11, 0, s5
2629; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
2630; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
2631; GFX9-NEXT:    v_mul_lo_u32 v2, s11, v1
2632; GFX9-NEXT:    s_bfe_i32 s6, s6, 0x80008
2633; GFX9-NEXT:    s_ashr_i32 s11, s6, 31
2634; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s8
2635; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v2
2636; GFX9-NEXT:    s_add_i32 s6, s6, s11
2637; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
2638; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
2639; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
2640; GFX9-NEXT:    s_xor_b32 s4, s6, s11
2641; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
2642; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
2643; GFX9-NEXT:    v_subrev_u32_e32 v4, s8, v3
2644; GFX9-NEXT:    v_mul_hi_u32 v1, s4, v1
2645; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2646; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
2647; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
2648; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
2649; GFX9-NEXT:    v_subrev_u32_e32 v4, s8, v3
2650; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
2651; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s5
2652; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
2653; GFX9-NEXT:    s_xor_b32 s6, s10, s7
2654; GFX9-NEXT:    v_xor_b32_e32 v0, s6, v0
2655; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
2656; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
2657; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
2658; GFX9-NEXT:    v_subrev_u32_e32 v4, s5, v3
2659; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2660; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
2661; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
2662; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
2663; GFX9-NEXT:    s_xor_b32 s4, s11, s9
2664; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v1
2665; GFX9-NEXT:    v_subrev_u32_e32 v4, s5, v3
2666; GFX9-NEXT:    v_subrev_u32_e32 v1, s4, v1
2667; GFX9-NEXT:    s_movk_i32 s4, 0xff
2668; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2669; GFX9-NEXT:    v_and_b32_e32 v1, s4, v1
2670; GFX9-NEXT:    v_subrev_u32_e32 v0, s6, v0
2671; GFX9-NEXT:    v_xor_b32_e32 v3, s11, v3
2672; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
2673; GFX9-NEXT:    v_subrev_u32_e32 v3, s11, v3
2674; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2675; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2676; GFX9-NEXT:    v_xor_b32_e32 v2, s10, v2
2677; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2678; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
2679; GFX9-NEXT:    v_and_b32_e32 v0, s4, v3
2680; GFX9-NEXT:    v_subrev_u32_e32 v2, s10, v2
2681; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
2682; GFX9-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2683; GFX9-NEXT:    global_store_short v1, v0, s[2:3]
2684; GFX9-NEXT:    s_endpgm
2685;
2686; GFX10-LABEL: sdivrem_v2i8:
2687; GFX10:       ; %bb.0:
2688; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x10
2689; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2690; GFX10-NEXT:    s_bfe_i32 s1, s0, 0x80018
2691; GFX10-NEXT:    s_bfe_i32 s2, s0, 0x80010
2692; GFX10-NEXT:    s_ashr_i32 s3, s1, 31
2693; GFX10-NEXT:    s_ashr_i32 s8, s2, 31
2694; GFX10-NEXT:    s_add_i32 s1, s1, s3
2695; GFX10-NEXT:    s_add_i32 s2, s2, s8
2696; GFX10-NEXT:    s_xor_b32 s1, s1, s3
2697; GFX10-NEXT:    s_xor_b32 s2, s2, s8
2698; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s1
2699; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s2
2700; GFX10-NEXT:    s_sub_i32 s6, 0, s1
2701; GFX10-NEXT:    s_sub_i32 s7, 0, s2
2702; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2703; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
2704; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2705; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
2706; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
2707; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
2708; GFX10-NEXT:    v_mul_lo_u32 v2, s6, v0
2709; GFX10-NEXT:    v_mul_lo_u32 v3, s7, v1
2710; GFX10-NEXT:    s_sext_i32_i8 s6, s0
2711; GFX10-NEXT:    s_bfe_i32 s0, s0, 0x80008
2712; GFX10-NEXT:    s_ashr_i32 s9, s6, 31
2713; GFX10-NEXT:    s_ashr_i32 s10, s0, 31
2714; GFX10-NEXT:    s_add_i32 s6, s6, s9
2715; GFX10-NEXT:    s_add_i32 s0, s0, s10
2716; GFX10-NEXT:    v_mul_hi_u32 v2, v0, v2
2717; GFX10-NEXT:    v_mul_hi_u32 v3, v1, v3
2718; GFX10-NEXT:    s_xor_b32 s0, s0, s10
2719; GFX10-NEXT:    s_xor_b32 s6, s6, s9
2720; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
2721; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v3
2722; GFX10-NEXT:    v_mul_hi_u32 v0, s0, v0
2723; GFX10-NEXT:    v_mul_hi_u32 v1, s6, v1
2724; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s1
2725; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s2
2726; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v0
2727; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v1
2728; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s0, v2
2729; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s6, v3
2730; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
2731; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v2
2732; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s1, v2
2733; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s2, v3
2734; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s2, v3
2735; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc_lo
2736; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
2737; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s0
2738; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s0
2739; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v0
2740; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v2
2741; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s1, v2
2742; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v1
2743; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s2, v3
2744; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s2, v3
2745; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc_lo
2746; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
2747; GFX10-NEXT:    s_xor_b32 s1, s10, s3
2748; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s0
2749; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s0
2750; GFX10-NEXT:    v_xor_b32_e32 v0, s1, v0
2751; GFX10-NEXT:    v_xor_b32_e32 v2, s10, v2
2752; GFX10-NEXT:    s_xor_b32 s0, s9, s8
2753; GFX10-NEXT:    v_xor_b32_e32 v1, s0, v1
2754; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, s1, v0
2755; GFX10-NEXT:    v_xor_b32_e32 v3, s9, v3
2756; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, s10, v2
2757; GFX10-NEXT:    s_movk_i32 s1, 0xff
2758; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, s0, v1
2759; GFX10-NEXT:    v_and_b32_sdwa v0, v0, s1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2760; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s9, v3
2761; GFX10-NEXT:    v_and_b32_sdwa v2, v2, s1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2762; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2763; GFX10-NEXT:    v_mov_b32_e32 v1, 0
2764; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2765; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2766; GFX10-NEXT:    global_store_short v1, v0, s[4:5]
2767; GFX10-NEXT:    global_store_short v1, v2, s[6:7]
2768; GFX10-NEXT:    s_endpgm
2769  %div = sdiv <2 x i8> %x, %y
2770  store <2 x i8> %div, <2 x i8> addrspace(1)* %out0
2771  %rem = srem <2 x i8> %x, %y
2772  store <2 x i8> %rem, <2 x i8> addrspace(1)* %out1
2773  ret void
2774}
2775
; Scalar i16 signed division/remainder kernel: stores `sdiv i16 %x, %y` to
; %out0 and `srem i16 %x, %y` to %out1 (both pointers are addrspace(1)).
; Despite the "sdiv_" name, this covers both operations, like the sdivrem_*
; tests next to it.
; For each prefix (GFX8/GFX9/GFX10, one per RUN line at the top of the file)
; the expected code contains exactly one v_rcp_iflag_f32-based division
; sequence followed by two short stores, which suggests the quotient and the
; remainder are produced by one shared expansion.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
2776define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out0, i16 addrspace(1)* %out1, i16 %x, i16 %y) {
2777; GFX8-LABEL: sdiv_i16:
2778; GFX8:       ; %bb.0:
2779; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x10
2780; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2781; GFX8-NEXT:    s_bfe_i32 s1, s0, 0x100010
2782; GFX8-NEXT:    s_ashr_i32 s6, s1, 31
2783; GFX8-NEXT:    s_add_i32 s1, s1, s6
2784; GFX8-NEXT:    s_xor_b32 s7, s1, s6
2785; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s7
2786; GFX8-NEXT:    s_sub_i32 s1, 0, s7
2787; GFX8-NEXT:    s_sext_i32_i16 s0, s0
2788; GFX8-NEXT:    s_ashr_i32 s8, s0, 31
2789; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2790; GFX8-NEXT:    s_add_i32 s0, s0, s8
2791; GFX8-NEXT:    s_xor_b32 s9, s0, s8
2792; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2793; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
2794; GFX8-NEXT:    v_mul_lo_u32 v1, s1, v0
2795; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2796; GFX8-NEXT:    s_xor_b32 s4, s8, s6
2797; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
2798; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
2799; GFX8-NEXT:    v_mul_hi_u32 v2, s9, v0
2800; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2801; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2802; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2803; GFX8-NEXT:    v_mul_lo_u32 v3, v2, s7
2804; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
2805; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s9, v3
2806; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
2807; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
2808; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s7, v3
2809; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2810; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
2811; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
2812; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
2813; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s7, v3
2814; GFX8-NEXT:    v_xor_b32_e32 v2, s4, v2
2815; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2816; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s4, v2
2817; GFX8-NEXT:    v_xor_b32_e32 v3, s8, v3
2818; GFX8-NEXT:    flat_store_short v[0:1], v2
2819; GFX8-NEXT:    v_mov_b32_e32 v0, s2
2820; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s8, v3
2821; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2822; GFX8-NEXT:    flat_store_short v[0:1], v3
2823; GFX8-NEXT:    s_endpgm
2824;
2825; GFX9-LABEL: sdiv_i16:
2826; GFX9:       ; %bb.0:
2827; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x10
2828; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2829; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2830; GFX9-NEXT:    s_bfe_i32 s1, s0, 0x100010
2831; GFX9-NEXT:    s_ashr_i32 s6, s1, 31
2832; GFX9-NEXT:    s_add_i32 s1, s1, s6
2833; GFX9-NEXT:    s_xor_b32 s7, s1, s6
2834; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
2835; GFX9-NEXT:    s_sub_i32 s1, 0, s7
2836; GFX9-NEXT:    s_sext_i32_i16 s0, s0
2837; GFX9-NEXT:    s_ashr_i32 s8, s0, 31
2838; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2839; GFX9-NEXT:    s_add_i32 s0, s0, s8
2840; GFX9-NEXT:    s_xor_b32 s9, s0, s8
2841; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2842; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
2843; GFX9-NEXT:    v_mul_lo_u32 v1, s1, v0
2844; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2845; GFX9-NEXT:    s_xor_b32 s4, s8, s6
2846; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
2847; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
2848; GFX9-NEXT:    v_mul_hi_u32 v0, s9, v0
2849; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s7
2850; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
2851; GFX9-NEXT:    v_sub_u32_e32 v1, s9, v1
2852; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
2853; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
2854; GFX9-NEXT:    v_subrev_u32_e32 v3, s7, v1
2855; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
2856; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
2857; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
2858; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
2859; GFX9-NEXT:    v_subrev_u32_e32 v3, s7, v1
2860; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
2861; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
2862; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
2863; GFX9-NEXT:    v_xor_b32_e32 v1, s8, v1
2864; GFX9-NEXT:    v_subrev_u32_e32 v1, s8, v1
2865; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2866; GFX9-NEXT:    global_store_short v2, v0, s[0:1]
2867; GFX9-NEXT:    global_store_short v2, v1, s[2:3]
2868; GFX9-NEXT:    s_endpgm
2869;
2870; GFX10-LABEL: sdiv_i16:
2871; GFX10:       ; %bb.0:
2872; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x10
2873; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2874; GFX10-NEXT:    s_bfe_i32 s1, s0, 0x100010
2875; GFX10-NEXT:    s_sext_i32_i16 s0, s0
2876; GFX10-NEXT:    s_ashr_i32 s6, s1, 31
2877; GFX10-NEXT:    s_ashr_i32 s8, s0, 31
2878; GFX10-NEXT:    s_add_i32 s1, s1, s6
2879; GFX10-NEXT:    s_add_i32 s0, s0, s8
2880; GFX10-NEXT:    s_xor_b32 s7, s1, s6
2881; GFX10-NEXT:    s_xor_b32 s0, s0, s8
2882; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s7
2883; GFX10-NEXT:    s_sub_i32 s1, 0, s7
2884; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2885; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2886; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
2887; GFX10-NEXT:    v_mul_lo_u32 v1, s1, v0
2888; GFX10-NEXT:    v_mul_hi_u32 v1, v0, v1
2889; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
2890; GFX10-NEXT:    v_mul_hi_u32 v0, s0, v0
2891; GFX10-NEXT:    v_mul_lo_u32 v1, v0, s7
2892; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
2893; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s0, v1
2894; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
2895; GFX10-NEXT:    s_xor_b32 s4, s8, s6
2896; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s7, v1
2897; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s7, v1
2898; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
2899; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
2900; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
2901; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s7, v1
2902; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s7, v1
2903; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
2904; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
2905; GFX10-NEXT:    v_mov_b32_e32 v2, 0
2906; GFX10-NEXT:    v_xor_b32_e32 v0, s4, v0
2907; GFX10-NEXT:    v_xor_b32_e32 v1, s8, v1
2908; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, s4, v0
2909; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, s8, v1
2910; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2911; GFX10-NEXT:    global_store_short v2, v0, s[0:1]
2912; GFX10-NEXT:    global_store_short v2, v1, s[2:3]
2913; GFX10-NEXT:    s_endpgm
2914  %div = sdiv i16 %x, %y
2915  store i16 %div, i16 addrspace(1)* %out0
2916  %rem = srem i16 %x, %y
2917  store i16 %rem, i16 addrspace(1)* %out1
2918  ret void
2919}
2920
; <2 x i16> signed division/remainder kernel: stores
; `sdiv <2 x i16> %x, %y` to %out0 and `srem <2 x i16> %x, %y` to %out1
; (both pointers are addrspace(1)).
; For each prefix (GFX8/GFX9/GFX10, one per RUN line at the top of the file)
; the expected code contains two v_rcp_iflag_f32-based division sequences
; (presumably one per vector lane) and then packs the two 16-bit results of
; each operation into one dword before a single dword store per output:
;   - GFX8 masks with 0xffff, shifts left by 16 and combines with
;     v_or_b32_sdwa, then uses flat_store_dword;
;   - GFX9/GFX10 write the second result with an SDWA subtract using
;     dst_sel:WORD_1 and merge with v_and_or_b32, then use global_store_dword.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
2921define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> %x, <2 x i16> %y) {
2922; GFX8-LABEL: sdivrem_v2i16:
2923; GFX8:       ; %bb.0:
2924; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x14
2925; GFX8-NEXT:    s_load_dword s8, s[4:5], 0x10
2926; GFX8-NEXT:    s_mov_b32 s9, 0x100010
2927; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2928; GFX8-NEXT:    s_sext_i32_i16 s1, s0
2929; GFX8-NEXT:    s_ashr_i32 s2, s1, 31
2930; GFX8-NEXT:    s_add_i32 s1, s1, s2
2931; GFX8-NEXT:    s_xor_b32 s3, s1, s2
2932; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s3
2933; GFX8-NEXT:    s_sub_i32 s6, 0, s3
2934; GFX8-NEXT:    s_sext_i32_i16 s1, s8
2935; GFX8-NEXT:    s_bfe_i32 s0, s0, s9
2936; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2937; GFX8-NEXT:    s_ashr_i32 s10, s1, 31
2938; GFX8-NEXT:    s_ashr_i32 s11, s0, 31
2939; GFX8-NEXT:    s_add_i32 s1, s1, s10
2940; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
2941; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
2942; GFX8-NEXT:    s_add_i32 s0, s0, s11
2943; GFX8-NEXT:    s_xor_b32 s1, s1, s10
2944; GFX8-NEXT:    s_xor_b32 s12, s0, s11
2945; GFX8-NEXT:    v_mul_lo_u32 v1, s6, v0
2946; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, s12
2947; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
2948; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
2949; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
2950; GFX8-NEXT:    v_mul_hi_u32 v0, s1, v0
2951; GFX8-NEXT:    v_rcp_iflag_f32_e32 v1, v2
2952; GFX8-NEXT:    v_mul_lo_u32 v2, v0, s3
2953; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v0
2954; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
2955; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s1, v2
2956; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
2957; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
2958; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s3, v2
2959; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
2960; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 1, v0
2961; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
2962; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
2963; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
2964; GFX8-NEXT:    v_subrev_u32_e64 v3, s[0:1], s3, v2
2965; GFX8-NEXT:    s_sub_i32 s1, 0, s12
2966; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
2967; GFX8-NEXT:    v_mul_lo_u32 v3, s1, v1
2968; GFX8-NEXT:    s_bfe_i32 s1, s8, s9
2969; GFX8-NEXT:    s_xor_b32 s0, s10, s2
2970; GFX8-NEXT:    s_ashr_i32 s2, s1, 31
2971; GFX8-NEXT:    v_mul_hi_u32 v3, v1, v3
2972; GFX8-NEXT:    s_add_i32 s1, s1, s2
2973; GFX8-NEXT:    s_xor_b32 s1, s1, s2
2974; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v0
2975; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
2976; GFX8-NEXT:    v_mul_hi_u32 v1, s1, v1
2977; GFX8-NEXT:    v_xor_b32_e32 v2, s10, v2
2978; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
2979; GFX8-NEXT:    v_mul_lo_u32 v3, v1, s12
2980; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s10, v2
2981; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
2982; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s1, v3
2983; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
2984; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
2985; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s12, v3
2986; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2987; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
2988; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
2989; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
2990; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s12, v3
2991; GFX8-NEXT:    s_xor_b32 s0, s2, s11
2992; GFX8-NEXT:    v_xor_b32_e32 v1, s0, v1
2993; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2994; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, s0, v1
2995; GFX8-NEXT:    s_mov_b32 s0, 0xffff
2996; GFX8-NEXT:    v_xor_b32_e32 v3, s2, v3
2997; GFX8-NEXT:    v_and_b32_e32 v1, s0, v1
2998; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s2, v3
2999; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3000; GFX8-NEXT:    v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3001; GFX8-NEXT:    v_and_b32_e32 v0, s0, v3
3002; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
3003; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3004; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3005; GFX8-NEXT:    v_mov_b32_e32 v0, s4
3006; GFX8-NEXT:    v_mov_b32_e32 v1, s5
3007; GFX8-NEXT:    flat_store_dword v[0:1], v4
3008; GFX8-NEXT:    v_mov_b32_e32 v0, s6
3009; GFX8-NEXT:    v_mov_b32_e32 v1, s7
3010; GFX8-NEXT:    flat_store_dword v[0:1], v2
3011; GFX8-NEXT:    s_endpgm
3012;
3013; GFX9-LABEL: sdivrem_v2i16:
3014; GFX9:       ; %bb.0:
3015; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x14
3016; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3017; GFX9-NEXT:    s_sext_i32_i16 s0, s6
3018; GFX9-NEXT:    s_ashr_i32 s7, s0, 31
3019; GFX9-NEXT:    s_add_i32 s0, s0, s7
3020; GFX9-NEXT:    s_xor_b32 s8, s0, s7
3021; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
3022; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3023; GFX9-NEXT:    s_load_dword s9, s[4:5], 0x10
3024; GFX9-NEXT:    s_mov_b32 s4, 0x100010
3025; GFX9-NEXT:    s_bfe_i32 s6, s6, s4
3026; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
3027; GFX9-NEXT:    s_ashr_i32 s10, s6, 31
3028; GFX9-NEXT:    s_add_i32 s6, s6, s10
3029; GFX9-NEXT:    s_sub_i32 s11, 0, s8
3030; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
3031; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
3032; GFX9-NEXT:    s_xor_b32 s6, s6, s10
3033; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s6
3034; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3035; GFX9-NEXT:    s_sext_i32_i16 s5, s9
3036; GFX9-NEXT:    v_mul_lo_u32 v1, s11, v0
3037; GFX9-NEXT:    s_ashr_i32 s11, s5, 31
3038; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
3039; GFX9-NEXT:    s_add_i32 s5, s5, s11
3040; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
3041; GFX9-NEXT:    s_xor_b32 s5, s5, s11
3042; GFX9-NEXT:    s_bfe_i32 s4, s9, s4
3043; GFX9-NEXT:    s_sub_i32 s9, 0, s6
3044; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
3045; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
3046; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
3047; GFX9-NEXT:    v_mul_hi_u32 v0, s5, v0
3048; GFX9-NEXT:    s_xor_b32 s7, s11, s7
3049; GFX9-NEXT:    v_mul_lo_u32 v3, s9, v1
3050; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s8
3051; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
3052; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
3053; GFX9-NEXT:    v_sub_u32_e32 v2, s5, v2
3054; GFX9-NEXT:    s_ashr_i32 s5, s4, 31
3055; GFX9-NEXT:    s_add_i32 s4, s4, s5
3056; GFX9-NEXT:    s_xor_b32 s4, s4, s5
3057; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
3058; GFX9-NEXT:    v_mul_hi_u32 v1, s4, v1
3059; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
3060; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
3061; GFX9-NEXT:    v_subrev_u32_e32 v4, s8, v2
3062; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s6
3063; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
3064; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
3065; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
3066; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
3067; GFX9-NEXT:    v_subrev_u32_e32 v4, s8, v2
3068; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
3069; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
3070; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
3071; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
3072; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
3073; GFX9-NEXT:    v_subrev_u32_e32 v4, s6, v3
3074; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
3075; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
3076; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
3077; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
3078; GFX9-NEXT:    v_subrev_u32_e32 v4, s6, v3
3079; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
3080; GFX9-NEXT:    s_xor_b32 s4, s5, s10
3081; GFX9-NEXT:    v_xor_b32_e32 v0, s7, v0
3082; GFX9-NEXT:    v_xor_b32_e32 v2, s11, v2
3083; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v1
3084; GFX9-NEXT:    v_xor_b32_e32 v3, s5, v3
3085; GFX9-NEXT:    v_subrev_u32_e32 v0, s7, v0
3086; GFX9-NEXT:    v_subrev_u32_e32 v2, s11, v2
3087; GFX9-NEXT:    v_sub_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3088; GFX9-NEXT:    v_sub_u32_sdwa v3, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3089; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
3090; GFX9-NEXT:    v_and_or_b32 v0, v0, v4, v1
3091; GFX9-NEXT:    v_and_or_b32 v1, v2, v4, v3
3092; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3093; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
3094; GFX9-NEXT:    global_store_dword v2, v1, s[2:3]
3095; GFX9-NEXT:    s_endpgm
3096;
3097; GFX10-LABEL: sdivrem_v2i16:
3098; GFX10:       ; %bb.0:
3099; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x14
3100; GFX10-NEXT:    s_mov_b32 s1, 0x100010
3101; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3102; GFX10-NEXT:    s_sext_i32_i16 s2, s0
3103; GFX10-NEXT:    s_bfe_i32 s0, s0, s1
3104; GFX10-NEXT:    s_ashr_i32 s3, s2, 31
3105; GFX10-NEXT:    s_ashr_i32 s8, s0, 31
3106; GFX10-NEXT:    s_add_i32 s2, s2, s3
3107; GFX10-NEXT:    s_add_i32 s0, s0, s8
3108; GFX10-NEXT:    s_xor_b32 s2, s2, s3
3109; GFX10-NEXT:    s_xor_b32 s9, s0, s8
3110; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s2
3111; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s9
3112; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x10
3113; GFX10-NEXT:    s_sub_i32 s6, 0, s2
3114; GFX10-NEXT:    s_sub_i32 s7, 0, s9
3115; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
3116; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
3117; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
3118; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
3119; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
3120; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
3121; GFX10-NEXT:    v_mul_lo_u32 v2, s6, v0
3122; GFX10-NEXT:    v_mul_lo_u32 v3, s7, v1
3123; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3124; GFX10-NEXT:    s_sext_i32_i16 s6, s0
3125; GFX10-NEXT:    s_bfe_i32 s0, s0, s1
3126; GFX10-NEXT:    s_ashr_i32 s1, s6, 31
3127; GFX10-NEXT:    s_ashr_i32 s10, s0, 31
3128; GFX10-NEXT:    s_add_i32 s6, s6, s1
3129; GFX10-NEXT:    s_add_i32 s0, s0, s10
3130; GFX10-NEXT:    v_mul_hi_u32 v2, v0, v2
3131; GFX10-NEXT:    v_mul_hi_u32 v3, v1, v3
3132; GFX10-NEXT:    s_xor_b32 s6, s6, s1
3133; GFX10-NEXT:    s_xor_b32 s0, s0, s10
3134; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
3135; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v3
3136; GFX10-NEXT:    v_mul_hi_u32 v0, s6, v0
3137; GFX10-NEXT:    v_mul_hi_u32 v1, s0, v1
3138; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s2
3139; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s9
3140; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
3141; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
3142; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s6, v2
3143; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s0, v3
3144; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
3145; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s2, v2
3146; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s9, v3
3147; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s2, v2
3148; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s9, v3
3149; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
3150; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
3151; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
3152; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
3153; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
3154; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
3155; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s2, v2
3156; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s9, v3
3157; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s2, v2
3158; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s9, v3
3159; GFX10-NEXT:    s_xor_b32 s2, s1, s3
3160; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
3161; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
3162; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
3163; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
3164; GFX10-NEXT:    s_xor_b32 s0, s10, s8
3165; GFX10-NEXT:    v_xor_b32_e32 v0, s2, v0
3166; GFX10-NEXT:    v_xor_b32_e32 v1, s0, v1
3167; GFX10-NEXT:    v_xor_b32_e32 v2, s1, v2
3168; GFX10-NEXT:    v_xor_b32_e32 v3, s10, v3
3169; GFX10-NEXT:    v_mov_b32_e32 v4, 0xffff
3170; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, s2, v0
3171; GFX10-NEXT:    v_sub_nc_u32_sdwa v1, v1, s0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3172; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, s1, v2
3173; GFX10-NEXT:    v_sub_nc_u32_sdwa v3, v3, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3174; GFX10-NEXT:    v_and_or_b32 v0, v0, v4, v1
3175; GFX10-NEXT:    v_mov_b32_e32 v1, 0
3176; GFX10-NEXT:    v_and_or_b32 v2, v2, v4, v3
3177; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3178; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
3179; GFX10-NEXT:    global_store_dword v1, v2, s[6:7]
3180; GFX10-NEXT:    s_endpgm
3181  %div = sdiv <2 x i16> %x, %y
3182  store <2 x i16> %div, <2 x i16> addrspace(1)* %out0
3183  %rem = srem <2 x i16> %x, %y
3184  store <2 x i16> %rem, <2 x i16> addrspace(1)* %out1
3185  ret void
3186}
3187
; Odd-width (i3) signed division/remainder kernel: stores `sdiv i3 %x, %y`
; to %out0 and `srem i3 %x, %y` to %out1 (both pointers are addrspace(1)).
; For each prefix (GFX8/GFX9/GFX10, one per RUN line at the top of the file)
; the expected code:
;   - extracts both operands from one loaded dword with s_bfe_i32
;     (immediates 0x30000 and 0x30008);
;   - runs a single v_rcp_iflag_f32-based division sequence whose outputs
;     feed both stores;
;   - masks each result with 7 (v_and_b32_e32) before storing it as a byte
;     (flat_store_byte on GFX8, global_store_byte on GFX9/GFX10).
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
3188define amdgpu_kernel void @sdivrem_i3(i3 addrspace(1)* %out0, i3 addrspace(1)* %out1, i3 %x, i3 %y) {
3189; GFX8-LABEL: sdivrem_i3:
3190; GFX8:       ; %bb.0:
3191; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x10
3192; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3193; GFX8-NEXT:    s_bfe_i32 s1, s0, 0x30008
3194; GFX8-NEXT:    s_ashr_i32 s6, s1, 31
3195; GFX8-NEXT:    s_add_i32 s1, s1, s6
3196; GFX8-NEXT:    s_xor_b32 s7, s1, s6
3197; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s7
3198; GFX8-NEXT:    s_sub_i32 s1, 0, s7
3199; GFX8-NEXT:    s_bfe_i32 s0, s0, 0x30000
3200; GFX8-NEXT:    s_ashr_i32 s8, s0, 31
3201; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
3202; GFX8-NEXT:    s_add_i32 s0, s0, s8
3203; GFX8-NEXT:    s_xor_b32 s9, s0, s8
3204; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
3205; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
3206; GFX8-NEXT:    v_mul_lo_u32 v1, s1, v0
3207; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3208; GFX8-NEXT:    s_xor_b32 s4, s8, s6
3209; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
3210; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
3211; GFX8-NEXT:    v_mul_hi_u32 v2, s9, v0
3212; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3213; GFX8-NEXT:    v_mov_b32_e32 v0, s0
3214; GFX8-NEXT:    v_mov_b32_e32 v1, s1
3215; GFX8-NEXT:    v_mul_lo_u32 v3, v2, s7
3216; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
3217; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s9, v3
3218; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
3219; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
3220; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s7, v3
3221; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
3222; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
3223; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
3224; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
3225; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s7, v3
3226; GFX8-NEXT:    v_xor_b32_e32 v2, s4, v2
3227; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
3228; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s4, v2
3229; GFX8-NEXT:    v_xor_b32_e32 v3, s8, v3
3230; GFX8-NEXT:    v_and_b32_e32 v2, 7, v2
3231; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s8, v3
3232; GFX8-NEXT:    flat_store_byte v[0:1], v2
3233; GFX8-NEXT:    v_mov_b32_e32 v0, s2
3234; GFX8-NEXT:    v_and_b32_e32 v2, 7, v3
3235; GFX8-NEXT:    v_mov_b32_e32 v1, s3
3236; GFX8-NEXT:    flat_store_byte v[0:1], v2
3237; GFX8-NEXT:    s_endpgm
3238;
3239; GFX9-LABEL: sdivrem_i3:
3240; GFX9:       ; %bb.0:
3241; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x10
3242; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3243; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3244; GFX9-NEXT:    s_bfe_i32 s1, s0, 0x30008
3245; GFX9-NEXT:    s_ashr_i32 s6, s1, 31
3246; GFX9-NEXT:    s_add_i32 s1, s1, s6
3247; GFX9-NEXT:    s_xor_b32 s7, s1, s6
3248; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
3249; GFX9-NEXT:    s_sub_i32 s1, 0, s7
3250; GFX9-NEXT:    s_bfe_i32 s0, s0, 0x30000
3251; GFX9-NEXT:    s_ashr_i32 s8, s0, 31
3252; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
3253; GFX9-NEXT:    s_add_i32 s0, s0, s8
3254; GFX9-NEXT:    s_xor_b32 s9, s0, s8
3255; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
3256; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
3257; GFX9-NEXT:    v_mul_lo_u32 v1, s1, v0
3258; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3259; GFX9-NEXT:    s_xor_b32 s4, s8, s6
3260; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
3261; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
3262; GFX9-NEXT:    v_mul_hi_u32 v0, s9, v0
3263; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s7
3264; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
3265; GFX9-NEXT:    v_sub_u32_e32 v1, s9, v1
3266; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
3267; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
3268; GFX9-NEXT:    v_subrev_u32_e32 v3, s7, v1
3269; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
3270; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
3271; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
3272; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
3273; GFX9-NEXT:    v_subrev_u32_e32 v3, s7, v1
3274; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
3275; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
3276; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
3277; GFX9-NEXT:    v_xor_b32_e32 v1, s8, v1
3278; GFX9-NEXT:    v_subrev_u32_e32 v1, s8, v1
3279; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
3280; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3281; GFX9-NEXT:    global_store_byte v2, v0, s[0:1]
3282; GFX9-NEXT:    v_and_b32_e32 v0, 7, v1
3283; GFX9-NEXT:    global_store_byte v2, v0, s[2:3]
3284; GFX9-NEXT:    s_endpgm
3285;
3286; GFX10-LABEL: sdivrem_i3:
3287; GFX10:       ; %bb.0:
3288; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x10
3289; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3290; GFX10-NEXT:    s_bfe_i32 s1, s0, 0x30008
3291; GFX10-NEXT:    s_bfe_i32 s0, s0, 0x30000
3292; GFX10-NEXT:    s_ashr_i32 s6, s1, 31
3293; GFX10-NEXT:    s_ashr_i32 s7, s0, 31
3294; GFX10-NEXT:    s_add_i32 s1, s1, s6
3295; GFX10-NEXT:    s_add_i32 s0, s0, s7
3296; GFX10-NEXT:    s_xor_b32 s1, s1, s6
3297; GFX10-NEXT:    s_xor_b32 s0, s0, s7
3298; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s1
3299; GFX10-NEXT:    s_sub_i32 s2, 0, s1
3300; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
3301; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
3302; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
3303; GFX10-NEXT:    v_mul_lo_u32 v1, s2, v0
3304; GFX10-NEXT:    v_mul_hi_u32 v1, v0, v1
3305; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
3306; GFX10-NEXT:    v_mul_hi_u32 v0, s0, v0
3307; GFX10-NEXT:    v_mul_lo_u32 v1, v0, s1
3308; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
3309; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s0, v1
3310; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s1, v1
3311; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v1
3312; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
3313; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
3314; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
3315; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v1
3316; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s1, v1
3317; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3318; GFX10-NEXT:    s_xor_b32 s4, s7, s6
3319; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
3320; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
3321; GFX10-NEXT:    v_mov_b32_e32 v2, 0
3322; GFX10-NEXT:    v_xor_b32_e32 v0, s4, v0
3323; GFX10-NEXT:    v_xor_b32_e32 v1, s7, v1
3324; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, s4, v0
3325; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, s7, v1
3326; GFX10-NEXT:    v_and_b32_e32 v0, 7, v0
3327; GFX10-NEXT:    v_and_b32_e32 v1, 7, v1
3328; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3329; GFX10-NEXT:    global_store_byte v2, v0, s[0:1]
3330; GFX10-NEXT:    global_store_byte v2, v1, s[2:3]
3331; GFX10-NEXT:    s_endpgm
3332  %div = sdiv i3 %x, %y
3333  store i3 %div, i3 addrspace(1)* %out0
3334  %rem = srem i3 %x, %y
3335  store i3 %rem, i3 addrspace(1)* %out1
3336  ret void
3337}
3338
; sdivrem_i27: signed division and signed remainder of the same pair of i27
; kernel arguments (%x, %y). The quotient is stored through %out0 and the
; remainder through %out1.
;
; The GFX8/GFX9/GFX10 blocks below are the expected llc output for the three
; RUN lines at the top of the file. In each of them both operands first go
; through s_bfe_i32 with 0x1b0000 (presumably a sign-extension from 27 bits --
; confirm against the ISA documentation), and both results are ANDed with
; 0x7ffffff (2^27 - 1) before being written with dword stores.
;
; NOTE: the check lines are autogenerated (see the note on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
3339define amdgpu_kernel void @sdivrem_i27(i27 addrspace(1)* %out0, i27 addrspace(1)* %out1, i27 %x, i27 %y) {
3340; GFX8-LABEL: sdivrem_i27:
3341; GFX8:       ; %bb.0:
3342; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
3343; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
3344; GFX8-NEXT:    s_mov_b32 s9, 0x7ffffff
3345; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3346; GFX8-NEXT:    s_bfe_i32 s1, s1, 0x1b0000
3347; GFX8-NEXT:    s_ashr_i32 s2, s1, 31
3348; GFX8-NEXT:    s_add_i32 s1, s1, s2
3349; GFX8-NEXT:    s_xor_b32 s3, s1, s2
3350; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s3
3351; GFX8-NEXT:    s_sub_i32 s1, 0, s3
3352; GFX8-NEXT:    s_bfe_i32 s0, s0, 0x1b0000
3353; GFX8-NEXT:    s_ashr_i32 s8, s0, 31
3354; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
3355; GFX8-NEXT:    s_add_i32 s0, s0, s8
3356; GFX8-NEXT:    s_xor_b32 s0, s0, s8
3357; GFX8-NEXT:    s_xor_b32 s2, s8, s2
3358; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
3359; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
3360; GFX8-NEXT:    v_mul_lo_u32 v1, s1, v0
3361; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
3362; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
3363; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
3364; GFX8-NEXT:    v_mul_lo_u32 v1, v0, s3
3365; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
3366; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, s0, v1
3367; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
3368; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3369; GFX8-NEXT:    v_subrev_u32_e64 v2, s[0:1], s3, v1
3370; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3371; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
3372; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
3373; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3374; GFX8-NEXT:    v_subrev_u32_e64 v2, s[0:1], s3, v1
3375; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3376; GFX8-NEXT:    v_xor_b32_e32 v0, s2, v0
3377; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s2, v0
3378; GFX8-NEXT:    v_xor_b32_e32 v1, s8, v1
3379; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s8, v1
3380; GFX8-NEXT:    v_and_b32_e32 v3, s9, v0
3381; GFX8-NEXT:    v_mov_b32_e32 v0, s4
3382; GFX8-NEXT:    v_mov_b32_e32 v1, s5
3383; GFX8-NEXT:    flat_store_dword v[0:1], v3
3384; GFX8-NEXT:    v_mov_b32_e32 v0, s6
3385; GFX8-NEXT:    v_and_b32_e32 v2, s9, v2
3386; GFX8-NEXT:    v_mov_b32_e32 v1, s7
3387; GFX8-NEXT:    flat_store_dword v[0:1], v2
3388; GFX8-NEXT:    s_endpgm
3389;
3390; GFX9-LABEL: sdivrem_i27:
3391; GFX9:       ; %bb.0:
3392; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
3393; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3394; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3395; GFX9-NEXT:    s_bfe_i32 s1, s1, 0x1b0000
3396; GFX9-NEXT:    s_ashr_i32 s6, s1, 31
3397; GFX9-NEXT:    s_add_i32 s1, s1, s6
3398; GFX9-NEXT:    s_xor_b32 s7, s1, s6
3399; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
3400; GFX9-NEXT:    s_sub_i32 s1, 0, s7
3401; GFX9-NEXT:    s_bfe_i32 s0, s0, 0x1b0000
3402; GFX9-NEXT:    s_ashr_i32 s8, s0, 31
3403; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
3404; GFX9-NEXT:    s_add_i32 s0, s0, s8
3405; GFX9-NEXT:    s_xor_b32 s9, s0, s8
3406; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
3407; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
3408; GFX9-NEXT:    v_mul_lo_u32 v1, s1, v0
3409; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3410; GFX9-NEXT:    s_xor_b32 s5, s8, s6
3411; GFX9-NEXT:    s_mov_b32 s4, 0x7ffffff
3412; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
3413; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
3414; GFX9-NEXT:    v_mul_hi_u32 v0, s9, v0
3415; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s7
3416; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
3417; GFX9-NEXT:    v_sub_u32_e32 v1, s9, v1
3418; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
3419; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
3420; GFX9-NEXT:    v_subrev_u32_e32 v3, s7, v1
3421; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
3422; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
3423; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
3424; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
3425; GFX9-NEXT:    v_subrev_u32_e32 v3, s7, v1
3426; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
3427; GFX9-NEXT:    v_xor_b32_e32 v0, s5, v0
3428; GFX9-NEXT:    v_subrev_u32_e32 v0, s5, v0
3429; GFX9-NEXT:    v_xor_b32_e32 v1, s8, v1
3430; GFX9-NEXT:    v_subrev_u32_e32 v1, s8, v1
3431; GFX9-NEXT:    v_and_b32_e32 v0, s4, v0
3432; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3433; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
3434; GFX9-NEXT:    v_and_b32_e32 v0, s4, v1
3435; GFX9-NEXT:    global_store_dword v2, v0, s[2:3]
3436; GFX9-NEXT:    s_endpgm
3437;
3438; GFX10-LABEL: sdivrem_i27:
3439; GFX10:       ; %bb.0:
3440; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x10
3441; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3442; GFX10-NEXT:    s_bfe_i32 s1, s1, 0x1b0000
3443; GFX10-NEXT:    s_bfe_i32 s0, s0, 0x1b0000
3444; GFX10-NEXT:    s_ashr_i32 s6, s1, 31
3445; GFX10-NEXT:    s_ashr_i32 s7, s0, 31
3446; GFX10-NEXT:    s_add_i32 s1, s1, s6
3447; GFX10-NEXT:    s_add_i32 s0, s0, s7
3448; GFX10-NEXT:    s_xor_b32 s1, s1, s6
3449; GFX10-NEXT:    s_xor_b32 s0, s0, s7
3450; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s1
3451; GFX10-NEXT:    s_sub_i32 s2, 0, s1
3452; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
3453; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
3454; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
3455; GFX10-NEXT:    v_mul_lo_u32 v1, s2, v0
3456; GFX10-NEXT:    v_mul_hi_u32 v1, v0, v1
3457; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
3458; GFX10-NEXT:    v_mul_hi_u32 v0, s0, v0
3459; GFX10-NEXT:    v_mul_lo_u32 v1, v0, s1
3460; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
3461; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s0, v1
3462; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s1, v1
3463; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v1
3464; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
3465; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
3466; GFX10-NEXT:    v_add_nc_u32_e32 v2, 1, v0
3467; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v1
3468; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s1, v1
3469; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
3470; GFX10-NEXT:    s_xor_b32 s4, s7, s6
3471; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
3472; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
3473; GFX10-NEXT:    v_mov_b32_e32 v2, 0
3474; GFX10-NEXT:    v_xor_b32_e32 v0, s4, v0
3475; GFX10-NEXT:    v_xor_b32_e32 v1, s7, v1
3476; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, s4, v0
3477; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, s7, v1
3478; GFX10-NEXT:    s_mov_b32 s4, 0x7ffffff
3479; GFX10-NEXT:    v_and_b32_e32 v0, s4, v0
3480; GFX10-NEXT:    v_and_b32_e32 v1, s4, v1
3481; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3482; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
3483; GFX10-NEXT:    global_store_dword v2, v1, s[2:3]
3484; GFX10-NEXT:    s_endpgm
; Both operations use the same %x and %y; the quotient is stored first
; (to %out0), then the remainder (to %out1).
3485  %div = sdiv i27 %x, %y
3486  store i27 %div, i27 addrspace(1)* %out0
3487  %rem = srem i27 %x, %y
3488  store i27 %rem, i27 addrspace(1)* %out1
3489  ret void
3490}
3491