1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
3; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
4; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
5; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s
6
; v_ssubsat_i7: signed saturating subtract of two i7 values through
; @llvm.ssub.sat.i7, using the default calling convention. The expected code
; below takes the operands in v0/v1 and leaves the result in v0.
; Expected lowering per check prefix:
;   GFX6:    both operands are shifted left by 25, the clamp is built from
;            32-bit max/min/sub, and the result is shifted back with an
;            arithmetic right shift by 25.
;   GFX8:    same shape, but with 16-bit ops and a shift amount of 9.
;   GFX9/10: shift by 9 around a single 16-bit subtract with the clamp bit.
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
7define i7 @v_ssubsat_i7(i7 %lhs, i7 %rhs) {
8; GFX6-LABEL: v_ssubsat_i7:
9; GFX6:       ; %bb.0:
10; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 25, v0
12; GFX6-NEXT:    v_max_i32_e32 v2, -1, v0
13; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 25, v1
14; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2
15; GFX6-NEXT:    v_min_i32_e32 v3, -1, v0
16; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 0x80000000, v3
17; GFX6-NEXT:    v_max_i32_e32 v1, v2, v1
18; GFX6-NEXT:    v_min_i32_e32 v1, v1, v3
19; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
20; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 25, v0
21; GFX6-NEXT:    s_setpc_b64 s[30:31]
22;
23; GFX8-LABEL: v_ssubsat_i7:
24; GFX8:       ; %bb.0:
25; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 9, v0
27; GFX8-NEXT:    v_max_i16_e32 v2, -1, v0
28; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 9, v1
29; GFX8-NEXT:    v_subrev_u16_e32 v2, 0x7fff, v2
30; GFX8-NEXT:    v_min_i16_e32 v3, -1, v0
31; GFX8-NEXT:    v_subrev_u16_e32 v3, 0x8000, v3
32; GFX8-NEXT:    v_max_i16_e32 v1, v2, v1
33; GFX8-NEXT:    v_min_i16_e32 v1, v1, v3
34; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
35; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 9, v0
36; GFX8-NEXT:    s_setpc_b64 s[30:31]
37;
38; GFX9-LABEL: v_ssubsat_i7:
39; GFX9:       ; %bb.0:
40; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 9, v0
42; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 9, v1
43; GFX9-NEXT:    v_sub_i16 v0, v0, v1 clamp
44; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 9, v0
45; GFX9-NEXT:    s_setpc_b64 s[30:31]
46;
47; GFX10-LABEL: v_ssubsat_i7:
48; GFX10:       ; %bb.0:
49; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
50; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
51; GFX10-NEXT:    v_lshlrev_b16 v0, 9, v0
52; GFX10-NEXT:    v_lshlrev_b16 v1, 9, v1
53; GFX10-NEXT:    v_sub_nc_i16 v0, v0, v1 clamp
54; GFX10-NEXT:    v_ashrrev_i16 v0, 9, v0
55; GFX10-NEXT:    s_setpc_b64 s[30:31]
56  %result = call i7 @llvm.ssub.sat.i7(i7 %lhs, i7 %rhs)
57  ret i7 %result
58}
59
; s_ssubsat_i7: same operation as the i7 VGPR variant (@llvm.ssub.sat.i7), but
; declared amdgpu_ps with both arguments marked inreg. The expected code below
; reads the operands from s0/s1 and leaves the result in s0.
; Expected lowering per check prefix:
;   GFX6:    scalar shift left by 25, clamp built from s_max/s_min/s_sub,
;            arithmetic shift right by 25.
;   GFX8:    the shift amount 9 is materialized into s2 with s_bfe_u32, and
;            the 32-bit scalar min/max/sub steps are interleaved with
;            s_sext_i32_i16 sign extensions.
;   GFX9/10: operands are shifted on the scalar unit, the clamped 16-bit
;            subtract and the final shift run as vector instructions, and
;            v_readfirstlane_b32 copies the result back to s0.
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
60define amdgpu_ps i7 @s_ssubsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
61; GFX6-LABEL: s_ssubsat_i7:
62; GFX6:       ; %bb.0:
63; GFX6-NEXT:    s_lshl_b32 s0, s0, 25
64; GFX6-NEXT:    s_max_i32 s2, s0, -1
65; GFX6-NEXT:    s_lshl_b32 s1, s1, 25
66; GFX6-NEXT:    s_sub_i32 s2, s2, 0x7fffffff
67; GFX6-NEXT:    s_min_i32 s3, s0, -1
68; GFX6-NEXT:    s_sub_i32 s3, s3, 0x80000000
69; GFX6-NEXT:    s_max_i32 s1, s2, s1
70; GFX6-NEXT:    s_min_i32 s1, s1, s3
71; GFX6-NEXT:    s_sub_i32 s0, s0, s1
72; GFX6-NEXT:    s_ashr_i32 s0, s0, 25
73; GFX6-NEXT:    ; return to shader part epilog
74;
75; GFX8-LABEL: s_ssubsat_i7:
76; GFX8:       ; %bb.0:
77; GFX8-NEXT:    s_bfe_u32 s2, 9, 0x100000
78; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
79; GFX8-NEXT:    s_sext_i32_i16 s3, s0
80; GFX8-NEXT:    s_sext_i32_i16 s4, -1
81; GFX8-NEXT:    s_max_i32 s5, s3, s4
82; GFX8-NEXT:    s_lshl_b32 s1, s1, s2
83; GFX8-NEXT:    s_sub_i32 s5, s5, 0x7fff
84; GFX8-NEXT:    s_min_i32 s3, s3, s4
85; GFX8-NEXT:    s_sext_i32_i16 s4, s5
86; GFX8-NEXT:    s_sext_i32_i16 s1, s1
87; GFX8-NEXT:    s_sub_i32 s3, s3, 0xffff8000
88; GFX8-NEXT:    s_max_i32 s1, s4, s1
89; GFX8-NEXT:    s_sext_i32_i16 s1, s1
90; GFX8-NEXT:    s_sext_i32_i16 s3, s3
91; GFX8-NEXT:    s_min_i32 s1, s1, s3
92; GFX8-NEXT:    s_sub_i32 s0, s0, s1
93; GFX8-NEXT:    s_sext_i32_i16 s0, s0
94; GFX8-NEXT:    s_ashr_i32 s0, s0, s2
95; GFX8-NEXT:    ; return to shader part epilog
96;
97; GFX9-LABEL: s_ssubsat_i7:
98; GFX9:       ; %bb.0:
99; GFX9-NEXT:    s_bfe_u32 s2, 9, 0x100000
100; GFX9-NEXT:    s_lshl_b32 s1, s1, s2
101; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
102; GFX9-NEXT:    v_mov_b32_e32 v0, s1
103; GFX9-NEXT:    v_sub_i16 v0, s0, v0 clamp
104; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 9, v0
105; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
106; GFX9-NEXT:    ; return to shader part epilog
107;
108; GFX10-LABEL: s_ssubsat_i7:
109; GFX10:       ; %bb.0:
110; GFX10-NEXT:    s_bfe_u32 s2, 9, 0x100000
111; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
112; GFX10-NEXT:    s_lshl_b32 s1, s1, s2
113; GFX10-NEXT:    v_sub_nc_i16 v0, s0, s1 clamp
114; GFX10-NEXT:    v_ashrrev_i16 v0, 9, v0
115; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
116; GFX10-NEXT:    ; return to shader part epilog
117  %result = call i7 @llvm.ssub.sat.i7(i7 %lhs, i7 %rhs)
118  ret i7 %result
119}
120
; v_ssubsat_i8: signed saturating subtract of two i8 values through
; @llvm.ssub.sat.i8, using the default calling convention. The expected code
; below takes the operands in v0/v1 and leaves the result in v0.
; Expected lowering per check prefix:
;   GFX6:    both operands are shifted left by 24, the clamp is built from
;            32-bit max/min/sub, and the result is shifted back with an
;            arithmetic right shift by 24.
;   GFX8:    same shape, but with 16-bit ops and a shift amount of 8.
;   GFX9/10: shift by 8 around a single 16-bit subtract with the clamp bit.
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
121define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) {
122; GFX6-LABEL: v_ssubsat_i8:
123; GFX6:       ; %bb.0:
124; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
125; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
126; GFX6-NEXT:    v_max_i32_e32 v2, -1, v0
127; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
128; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2
129; GFX6-NEXT:    v_min_i32_e32 v3, -1, v0
130; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 0x80000000, v3
131; GFX6-NEXT:    v_max_i32_e32 v1, v2, v1
132; GFX6-NEXT:    v_min_i32_e32 v1, v1, v3
133; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
134; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
135; GFX6-NEXT:    s_setpc_b64 s[30:31]
136;
137; GFX8-LABEL: v_ssubsat_i8:
138; GFX8:       ; %bb.0:
139; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
140; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
141; GFX8-NEXT:    v_max_i16_e32 v2, -1, v0
142; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
143; GFX8-NEXT:    v_subrev_u16_e32 v2, 0x7fff, v2
144; GFX8-NEXT:    v_min_i16_e32 v3, -1, v0
145; GFX8-NEXT:    v_subrev_u16_e32 v3, 0x8000, v3
146; GFX8-NEXT:    v_max_i16_e32 v1, v2, v1
147; GFX8-NEXT:    v_min_i16_e32 v1, v1, v3
148; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
149; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 8, v0
150; GFX8-NEXT:    s_setpc_b64 s[30:31]
151;
152; GFX9-LABEL: v_ssubsat_i8:
153; GFX9:       ; %bb.0:
154; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
155; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
156; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
157; GFX9-NEXT:    v_sub_i16 v0, v0, v1 clamp
158; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 8, v0
159; GFX9-NEXT:    s_setpc_b64 s[30:31]
160;
161; GFX10-LABEL: v_ssubsat_i8:
162; GFX10:       ; %bb.0:
163; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
164; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
165; GFX10-NEXT:    v_lshlrev_b16 v0, 8, v0
166; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v1
167; GFX10-NEXT:    v_sub_nc_i16 v0, v0, v1 clamp
168; GFX10-NEXT:    v_ashrrev_i16 v0, 8, v0
169; GFX10-NEXT:    s_setpc_b64 s[30:31]
170  %result = call i8 @llvm.ssub.sat.i8(i8 %lhs, i8 %rhs)
171  ret i8 %result
172}
173
; s_ssubsat_i8: same operation as the i8 VGPR variant (@llvm.ssub.sat.i8), but
; declared amdgpu_ps with both arguments marked inreg. The expected code below
; reads the operands from s0/s1 and leaves the result in s0.
; Expected lowering per check prefix:
;   GFX6:    scalar shift left by 24, clamp built from s_max/s_min/s_sub,
;            arithmetic shift right by 24.
;   GFX8:    the shift amount 8 is materialized into s2 with s_bfe_u32, and
;            the 32-bit scalar min/max/sub steps are interleaved with
;            s_sext_i32_i16 sign extensions.
;   GFX9/10: operands are shifted on the scalar unit, the clamped 16-bit
;            subtract and the final shift run as vector instructions, and
;            v_readfirstlane_b32 copies the result back to s0.
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
174define amdgpu_ps i8 @s_ssubsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
175; GFX6-LABEL: s_ssubsat_i8:
176; GFX6:       ; %bb.0:
177; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
178; GFX6-NEXT:    s_max_i32 s2, s0, -1
179; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
180; GFX6-NEXT:    s_sub_i32 s2, s2, 0x7fffffff
181; GFX6-NEXT:    s_min_i32 s3, s0, -1
182; GFX6-NEXT:    s_sub_i32 s3, s3, 0x80000000
183; GFX6-NEXT:    s_max_i32 s1, s2, s1
184; GFX6-NEXT:    s_min_i32 s1, s1, s3
185; GFX6-NEXT:    s_sub_i32 s0, s0, s1
186; GFX6-NEXT:    s_ashr_i32 s0, s0, 24
187; GFX6-NEXT:    ; return to shader part epilog
188;
189; GFX8-LABEL: s_ssubsat_i8:
190; GFX8:       ; %bb.0:
191; GFX8-NEXT:    s_bfe_u32 s2, 8, 0x100000
192; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
193; GFX8-NEXT:    s_sext_i32_i16 s3, s0
194; GFX8-NEXT:    s_sext_i32_i16 s4, -1
195; GFX8-NEXT:    s_max_i32 s5, s3, s4
196; GFX8-NEXT:    s_lshl_b32 s1, s1, s2
197; GFX8-NEXT:    s_sub_i32 s5, s5, 0x7fff
198; GFX8-NEXT:    s_min_i32 s3, s3, s4
199; GFX8-NEXT:    s_sext_i32_i16 s4, s5
200; GFX8-NEXT:    s_sext_i32_i16 s1, s1
201; GFX8-NEXT:    s_sub_i32 s3, s3, 0xffff8000
202; GFX8-NEXT:    s_max_i32 s1, s4, s1
203; GFX8-NEXT:    s_sext_i32_i16 s1, s1
204; GFX8-NEXT:    s_sext_i32_i16 s3, s3
205; GFX8-NEXT:    s_min_i32 s1, s1, s3
206; GFX8-NEXT:    s_sub_i32 s0, s0, s1
207; GFX8-NEXT:    s_sext_i32_i16 s0, s0
208; GFX8-NEXT:    s_ashr_i32 s0, s0, s2
209; GFX8-NEXT:    ; return to shader part epilog
210;
211; GFX9-LABEL: s_ssubsat_i8:
212; GFX9:       ; %bb.0:
213; GFX9-NEXT:    s_bfe_u32 s2, 8, 0x100000
214; GFX9-NEXT:    s_lshl_b32 s1, s1, s2
215; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
216; GFX9-NEXT:    v_mov_b32_e32 v0, s1
217; GFX9-NEXT:    v_sub_i16 v0, s0, v0 clamp
218; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 8, v0
219; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
220; GFX9-NEXT:    ; return to shader part epilog
221;
222; GFX10-LABEL: s_ssubsat_i8:
223; GFX10:       ; %bb.0:
224; GFX10-NEXT:    s_bfe_u32 s2, 8, 0x100000
225; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
226; GFX10-NEXT:    s_lshl_b32 s1, s1, s2
227; GFX10-NEXT:    v_sub_nc_i16 v0, s0, s1 clamp
228; GFX10-NEXT:    v_ashrrev_i16 v0, 8, v0
229; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
230; GFX10-NEXT:    ; return to shader part epilog
231  %result = call i8 @llvm.ssub.sat.i8(i8 %lhs, i8 %rhs)
232  ret i8 %result
233}
234
; v_ssubsat_v2i8: signed saturating subtract on <2 x i8>. Both operands arrive
; as i16 values, are bitcast to <2 x i8>, passed to @llvm.ssub.sat.v2i8, and
; the vector result is bitcast back to i16 for the return. Default calling
; convention; the expected code takes the operands in v0/v1 and leaves the
; result in v0.
; Expected lowering per check prefix:
;   GFX6:    each byte is handled separately with the 32-bit shift-by-24
;            min/max/sub sequence; the two results are masked with 0xff and
;            recombined with a shift by 8 and an or.
;   GFX8:    16-bit min/max/sub per byte, with SDWA forms used to extract the
;            high byte and to mask/position the results before the final or.
;   GFX9/10: the bytes are spread into the two 16-bit halves of a register and
;            processed together with packed ops (v_pk_lshlrev_b16,
;            v_pk_sub_i16 ... clamp, v_pk_ashrrev_i16), then repacked via SDWA.
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
235define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
236; GFX6-LABEL: v_ssubsat_v2i8:
237; GFX6:       ; %bb.0:
238; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
239; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
240; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
241; GFX6-NEXT:    s_brev_b32 s4, -2
242; GFX6-NEXT:    v_max_i32_e32 v4, -1, v0
243; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
244; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
245; GFX6-NEXT:    s_brev_b32 s5, 1
246; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v4
247; GFX6-NEXT:    v_min_i32_e32 v5, -1, v0
248; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s5, v5
249; GFX6-NEXT:    v_max_i32_e32 v1, v4, v1
250; GFX6-NEXT:    v_min_i32_e32 v1, v1, v5
251; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
252; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
253; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
254; GFX6-NEXT:    v_max_i32_e32 v3, -1, v1
255; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v3
256; GFX6-NEXT:    v_min_i32_e32 v4, -1, v1
257; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s5, v4
258; GFX6-NEXT:    v_max_i32_e32 v2, v3, v2
259; GFX6-NEXT:    v_min_i32_e32 v2, v2, v4
260; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
261; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 24, v1
262; GFX6-NEXT:    v_mov_b32_e32 v2, 0xff
263; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
264; GFX6-NEXT:    v_and_b32_e32 v1, v1, v2
265; GFX6-NEXT:    v_and_b32_e32 v0, v0, v2
266; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
267; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
268; GFX6-NEXT:    s_setpc_b64 s[30:31]
269;
270; GFX8-LABEL: v_ssubsat_v2i8:
271; GFX8:       ; %bb.0:
272; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
273; GFX8-NEXT:    v_mov_b32_e32 v2, 8
274; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
275; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
276; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
277; GFX8-NEXT:    v_max_i16_e32 v4, -1, v0
278; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
279; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
280; GFX8-NEXT:    s_movk_i32 s5, 0x8000
281; GFX8-NEXT:    v_subrev_u16_e32 v4, s4, v4
282; GFX8-NEXT:    v_min_i16_e32 v5, -1, v0
283; GFX8-NEXT:    v_subrev_u16_e32 v5, s5, v5
284; GFX8-NEXT:    v_max_i16_e32 v1, v4, v1
285; GFX8-NEXT:    v_min_i16_e32 v1, v1, v5
286; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
287; GFX8-NEXT:    v_max_i16_e32 v1, -1, v3
288; GFX8-NEXT:    v_subrev_u16_e32 v1, s4, v1
289; GFX8-NEXT:    v_min_i16_e32 v4, -1, v3
290; GFX8-NEXT:    v_subrev_u16_e32 v4, s5, v4
291; GFX8-NEXT:    v_max_i16_e32 v1, v1, v2
292; GFX8-NEXT:    v_min_i16_e32 v1, v1, v4
293; GFX8-NEXT:    v_sub_u16_e32 v1, v3, v1
294; GFX8-NEXT:    v_mov_b32_e32 v2, 0xff
295; GFX8-NEXT:    v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
296; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
297; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
298; GFX8-NEXT:    s_setpc_b64 s[30:31]
299;
300; GFX9-LABEL: v_ssubsat_v2i8:
301; GFX9:       ; %bb.0:
302; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
303; GFX9-NEXT:    s_mov_b32 s4, 8
304; GFX9-NEXT:    v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
305; GFX9-NEXT:    v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
306; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
307; GFX9-NEXT:    v_and_or_b32 v0, v0, v4, v2
308; GFX9-NEXT:    v_and_or_b32 v1, v1, v4, v3
309; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
310; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
311; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1 clamp
312; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
313; GFX9-NEXT:    s_movk_i32 s4, 0xff
314; GFX9-NEXT:    v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
315; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
316; GFX9-NEXT:    s_setpc_b64 s[30:31]
317;
318; GFX10-LABEL: v_ssubsat_v2i8:
319; GFX10:       ; %bb.0:
320; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
321; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
322; GFX10-NEXT:    s_mov_b32 s4, 8
323; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff
324; GFX10-NEXT:    v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
325; GFX10-NEXT:    v_lshrrev_b32_sdwa v4, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
326; GFX10-NEXT:    s_movk_i32 s4, 0xff
327; GFX10-NEXT:    v_and_or_b32 v0, v0, v2, v3
328; GFX10-NEXT:    v_and_or_b32 v1, v1, v2, v4
329; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
330; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
331; GFX10-NEXT:    v_pk_sub_i16 v0, v0, v1 clamp
332; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
333; GFX10-NEXT:    v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
334; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
335; GFX10-NEXT:    s_setpc_b64 s[30:31]
336  %lhs = bitcast i16 %lhs.arg to <2 x i8>
337  %rhs = bitcast i16 %rhs.arg to <2 x i8>
338  %result = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs)
339  %cast.result = bitcast <2 x i8> %result to i16
340  ret i16 %cast.result
341}
342
; s_ssubsat_v2i8: same operation as the <2 x i8> VGPR variant (i16 arguments
; bitcast to <2 x i8>, @llvm.ssub.sat.v2i8, result bitcast back to i16), but
; declared amdgpu_ps with both arguments marked inreg. The expected code below
; reads the operands from s0/s1 and leaves the result in s0.
; Expected lowering per check prefix:
;   GFX6:    fully scalar; each byte goes through the 32-bit shift-by-24
;            s_max/s_min/s_sub sequence, then the results are masked with
;            0xff and recombined with s_lshl_b32 by 8 and s_or_b32.
;   GFX8:    fully scalar; like GFX6 but with the shift amount 8 materialized
;            by s_bfe_u32 and s_sext_i32_i16 sign extensions between steps.
;   GFX9/10: bytes are packed into 16-bit halves with s_pack_ll_b32_b16 and
;            shifted on the scalar unit; the clamped packed subtract, the
;            packed shift and the SDWA repack run as vector instructions, and
;            v_readfirstlane_b32 copies the result back to s0.
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
343define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
344; GFX6-LABEL: s_ssubsat_v2i8:
345; GFX6:       ; %bb.0:
346; GFX6-NEXT:    s_lshr_b32 s2, s0, 8
347; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
348; GFX6-NEXT:    s_brev_b32 s4, -2
349; GFX6-NEXT:    s_max_i32 s6, s0, -1
350; GFX6-NEXT:    s_lshr_b32 s3, s1, 8
351; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
352; GFX6-NEXT:    s_brev_b32 s5, 1
353; GFX6-NEXT:    s_sub_i32 s6, s6, s4
354; GFX6-NEXT:    s_min_i32 s7, s0, -1
355; GFX6-NEXT:    s_sub_i32 s7, s7, s5
356; GFX6-NEXT:    s_max_i32 s1, s6, s1
357; GFX6-NEXT:    s_min_i32 s1, s1, s7
358; GFX6-NEXT:    s_sub_i32 s0, s0, s1
359; GFX6-NEXT:    s_lshl_b32 s1, s2, 24
360; GFX6-NEXT:    s_lshl_b32 s2, s3, 24
361; GFX6-NEXT:    s_max_i32 s3, s1, -1
362; GFX6-NEXT:    s_sub_i32 s3, s3, s4
363; GFX6-NEXT:    s_min_i32 s4, s1, -1
364; GFX6-NEXT:    s_sub_i32 s4, s4, s5
365; GFX6-NEXT:    s_max_i32 s2, s3, s2
366; GFX6-NEXT:    s_min_i32 s2, s2, s4
367; GFX6-NEXT:    s_sub_i32 s1, s1, s2
368; GFX6-NEXT:    s_ashr_i32 s1, s1, 24
369; GFX6-NEXT:    s_movk_i32 s2, 0xff
370; GFX6-NEXT:    s_ashr_i32 s0, s0, 24
371; GFX6-NEXT:    s_and_b32 s1, s1, s2
372; GFX6-NEXT:    s_and_b32 s0, s0, s2
373; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
374; GFX6-NEXT:    s_or_b32 s0, s0, s1
375; GFX6-NEXT:    ; return to shader part epilog
376;
377; GFX8-LABEL: s_ssubsat_v2i8:
378; GFX8:       ; %bb.0:
379; GFX8-NEXT:    s_bfe_u32 s4, 8, 0x100000
380; GFX8-NEXT:    s_lshr_b32 s2, s0, 8
381; GFX8-NEXT:    s_lshl_b32 s0, s0, s4
382; GFX8-NEXT:    s_sext_i32_i16 s7, s0
383; GFX8-NEXT:    s_sext_i32_i16 s8, -1
384; GFX8-NEXT:    s_movk_i32 s5, 0x7fff
385; GFX8-NEXT:    s_max_i32 s9, s7, s8
386; GFX8-NEXT:    s_lshr_b32 s3, s1, 8
387; GFX8-NEXT:    s_lshl_b32 s1, s1, s4
388; GFX8-NEXT:    s_sub_i32 s9, s9, s5
389; GFX8-NEXT:    s_movk_i32 s6, 0x8000
390; GFX8-NEXT:    s_min_i32 s7, s7, s8
391; GFX8-NEXT:    s_sext_i32_i16 s9, s9
392; GFX8-NEXT:    s_sext_i32_i16 s1, s1
393; GFX8-NEXT:    s_sub_i32 s7, s7, s6
394; GFX8-NEXT:    s_max_i32 s1, s9, s1
395; GFX8-NEXT:    s_sext_i32_i16 s1, s1
396; GFX8-NEXT:    s_sext_i32_i16 s7, s7
397; GFX8-NEXT:    s_min_i32 s1, s1, s7
398; GFX8-NEXT:    s_sub_i32 s0, s0, s1
399; GFX8-NEXT:    s_lshl_b32 s1, s2, s4
400; GFX8-NEXT:    s_lshl_b32 s2, s3, s4
401; GFX8-NEXT:    s_sext_i32_i16 s3, s1
402; GFX8-NEXT:    s_max_i32 s7, s3, s8
403; GFX8-NEXT:    s_sub_i32 s5, s7, s5
404; GFX8-NEXT:    s_min_i32 s3, s3, s8
405; GFX8-NEXT:    s_sext_i32_i16 s5, s5
406; GFX8-NEXT:    s_sext_i32_i16 s2, s2
407; GFX8-NEXT:    s_sub_i32 s3, s3, s6
408; GFX8-NEXT:    s_max_i32 s2, s5, s2
409; GFX8-NEXT:    s_sext_i32_i16 s2, s2
410; GFX8-NEXT:    s_sext_i32_i16 s3, s3
411; GFX8-NEXT:    s_min_i32 s2, s2, s3
412; GFX8-NEXT:    s_sub_i32 s1, s1, s2
413; GFX8-NEXT:    s_sext_i32_i16 s1, s1
414; GFX8-NEXT:    s_sext_i32_i16 s0, s0
415; GFX8-NEXT:    s_ashr_i32 s1, s1, s4
416; GFX8-NEXT:    s_movk_i32 s2, 0xff
417; GFX8-NEXT:    s_ashr_i32 s0, s0, s4
418; GFX8-NEXT:    s_and_b32 s1, s1, s2
419; GFX8-NEXT:    s_and_b32 s0, s0, s2
420; GFX8-NEXT:    s_lshl_b32 s1, s1, s4
421; GFX8-NEXT:    s_or_b32 s0, s0, s1
422; GFX8-NEXT:    ; return to shader part epilog
423;
424; GFX9-LABEL: s_ssubsat_v2i8:
425; GFX9:       ; %bb.0:
426; GFX9-NEXT:    s_lshr_b32 s2, s0, 8
427; GFX9-NEXT:    s_lshr_b32 s3, s1, 8
428; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
429; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
430; GFX9-NEXT:    s_mov_b32 s2, 0x80008
431; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
432; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
433; GFX9-NEXT:    s_lshl_b32 s3, s3, 8
434; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
435; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
436; GFX9-NEXT:    s_lshl_b32 s1, s1, s2
437; GFX9-NEXT:    s_lshl_b32 s2, s3, 8
438; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
439; GFX9-NEXT:    v_mov_b32_e32 v0, s1
440; GFX9-NEXT:    v_pk_sub_i16 v0, s0, v0 clamp
441; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
442; GFX9-NEXT:    s_movk_i32 s0, 0xff
443; GFX9-NEXT:    v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
444; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
445; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
446; GFX9-NEXT:    ; return to shader part epilog
447;
448; GFX10-LABEL: s_ssubsat_v2i8:
449; GFX10:       ; %bb.0:
450; GFX10-NEXT:    s_lshr_b32 s2, s0, 8
451; GFX10-NEXT:    s_lshr_b32 s3, s1, 8
452; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
453; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
454; GFX10-NEXT:    s_mov_b32 s2, 0x80008
455; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
456; GFX10-NEXT:    s_lshr_b32 s4, s1, 16
457; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
458; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
459; GFX10-NEXT:    s_lshl_b32 s1, s1, s2
460; GFX10-NEXT:    s_lshl_b32 s2, s4, 8
461; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
462; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
463; GFX10-NEXT:    v_pk_sub_i16 v0, s0, s1 clamp
464; GFX10-NEXT:    s_movk_i32 s0, 0xff
465; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
466; GFX10-NEXT:    v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
467; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
468; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
469; GFX10-NEXT:    ; return to shader part epilog
470  %lhs = bitcast i16 %lhs.arg to <2 x i8>
471  %rhs = bitcast i16 %rhs.arg to <2 x i8>
472  %result = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs)
473  %cast.result = bitcast <2 x i8> %result to i16
474  ret i16 %cast.result
475}
476
; v_ssubsat_v4i8: signed saturating subtract on <4 x i8>. Both operands arrive
; as i32 values, are bitcast to <4 x i8>, passed to @llvm.ssub.sat.v4i8, and
; the vector result is bitcast back to i32 for the return. Default calling
; convention; the expected code takes the operands in v0/v1 and leaves the
; result in v0.
; Expected lowering per check prefix:
;   GFX6:    the four bytes are extracted with right shifts by 8/16/24 and
;            each goes through the 32-bit shift-by-24 min/max/sub sequence;
;            results are masked with 0xff and merged with shifts and ors.
;   GFX8:    16-bit min/max/sub per byte, with SDWA forms used for the byte-1
;            extraction and for masking/positioning results before the ors.
;   GFX9/10: the bytes are arranged as two registers of two 16-bit lanes each
;            and handled by two v_pk_sub_i16 ... clamp instructions between
;            packed shifts by 8; the result is reassembled with SDWA shifts,
;            v_and_or_b32 and v_or3_b32.
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
477define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
478; GFX6-LABEL: v_ssubsat_v4i8:
479; GFX6:       ; %bb.0:
480; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
481; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
482; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
483; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
484; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
485; GFX6-NEXT:    s_brev_b32 s4, -2
486; GFX6-NEXT:    v_max_i32_e32 v8, -1, v0
487; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
488; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
489; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
490; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
491; GFX6-NEXT:    s_brev_b32 s5, 1
492; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, s4, v8
493; GFX6-NEXT:    v_min_i32_e32 v10, -1, v0
494; GFX6-NEXT:    v_subrev_i32_e32 v10, vcc, s5, v10
495; GFX6-NEXT:    v_max_i32_e32 v1, v8, v1
496; GFX6-NEXT:    v_min_i32_e32 v1, v1, v10
497; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
498; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
499; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v5
500; GFX6-NEXT:    v_max_i32_e32 v5, -1, v1
501; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s4, v5
502; GFX6-NEXT:    v_min_i32_e32 v8, -1, v1
503; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, s5, v8
504; GFX6-NEXT:    v_max_i32_e32 v2, v5, v2
505; GFX6-NEXT:    v_min_i32_e32 v2, v2, v8
506; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
507; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
508; GFX6-NEXT:    v_bfrev_b32_e32 v9, -2
509; GFX6-NEXT:    v_max_i32_e32 v5, -1, v2
510; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
511; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v9
512; GFX6-NEXT:    v_min_i32_e32 v6, -1, v2
513; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, s5, v6
514; GFX6-NEXT:    v_max_i32_e32 v3, v5, v3
515; GFX6-NEXT:    v_min_i32_e32 v3, v3, v6
516; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
517; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
518; GFX6-NEXT:    v_max_i32_e32 v5, -1, v3
519; GFX6-NEXT:    v_bfrev_b32_e32 v11, 1
520; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 24, v1
521; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
522; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v9
523; GFX6-NEXT:    v_min_i32_e32 v6, -1, v3
524; GFX6-NEXT:    s_movk_i32 s4, 0xff
525; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
526; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v11
527; GFX6-NEXT:    v_max_i32_e32 v4, v5, v4
528; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
529; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
530; GFX6-NEXT:    v_min_i32_e32 v4, v4, v6
531; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
532; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
533; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v4
534; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
535; GFX6-NEXT:    v_and_b32_e32 v1, s4, v2
536; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 24, v3
537; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
538; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
539; GFX6-NEXT:    v_and_b32_e32 v1, s4, v3
540; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
541; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
542; GFX6-NEXT:    s_setpc_b64 s[30:31]
543;
544; GFX8-LABEL: v_ssubsat_v4i8:
545; GFX8:       ; %bb.0:
546; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
547; GFX8-NEXT:    v_mov_b32_e32 v2, 8
548; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
549; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
550; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
551; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
552; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
553; GFX8-NEXT:    v_max_i16_e32 v8, -1, v0
554; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
555; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
556; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
557; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
558; GFX8-NEXT:    s_movk_i32 s5, 0x8000
559; GFX8-NEXT:    v_subrev_u16_e32 v8, s4, v8
560; GFX8-NEXT:    v_min_i16_e32 v10, -1, v0
561; GFX8-NEXT:    v_subrev_u16_e32 v10, s5, v10
562; GFX8-NEXT:    v_max_i16_e32 v1, v8, v1
563; GFX8-NEXT:    v_min_i16_e32 v1, v1, v10
564; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
565; GFX8-NEXT:    v_max_i16_e32 v1, -1, v3
566; GFX8-NEXT:    v_subrev_u16_e32 v1, s4, v1
567; GFX8-NEXT:    v_min_i16_e32 v8, -1, v3
568; GFX8-NEXT:    v_subrev_u16_e32 v8, s5, v8
569; GFX8-NEXT:    v_max_i16_e32 v1, v1, v2
570; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 8, v4
571; GFX8-NEXT:    v_mov_b32_e32 v9, 0x7fff
572; GFX8-NEXT:    v_min_i16_e32 v1, v1, v8
573; GFX8-NEXT:    v_max_i16_e32 v4, -1, v2
574; GFX8-NEXT:    v_sub_u16_e32 v1, v3, v1
575; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 8, v6
576; GFX8-NEXT:    v_sub_u16_e32 v4, v4, v9
577; GFX8-NEXT:    v_min_i16_e32 v6, -1, v2
578; GFX8-NEXT:    v_subrev_u16_e32 v6, s5, v6
579; GFX8-NEXT:    v_max_i16_e32 v3, v4, v3
580; GFX8-NEXT:    v_min_i16_e32 v3, v3, v6
581; GFX8-NEXT:    v_sub_u16_e32 v2, v2, v3
582; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 8, v5
583; GFX8-NEXT:    v_max_i16_e32 v5, -1, v3
584; GFX8-NEXT:    v_lshlrev_b16_e32 v4, 8, v7
585; GFX8-NEXT:    v_sub_u16_e32 v5, v5, v9
586; GFX8-NEXT:    v_min_i16_e32 v6, -1, v3
587; GFX8-NEXT:    v_subrev_u16_e32 v6, 0x8000, v6
588; GFX8-NEXT:    v_max_i16_e32 v4, v5, v4
589; GFX8-NEXT:    v_min_i16_e32 v4, v4, v6
590; GFX8-NEXT:    v_sub_u16_e32 v3, v3, v4
591; GFX8-NEXT:    v_mov_b32_e32 v4, 0xff
592; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v1), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
593; GFX8-NEXT:    v_and_b32_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
594; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
595; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
596; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v2), v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
597; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
598; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v3), v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
599; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
600; GFX8-NEXT:    s_setpc_b64 s[30:31]
601;
602; GFX9-LABEL: v_ssubsat_v4i8:
603; GFX9:       ; %bb.0:
604; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
605; GFX9-NEXT:    s_mov_b32 s4, 8
606; GFX9-NEXT:    v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
607; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
608; GFX9-NEXT:    v_mov_b32_e32 v8, 0xffff
609; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
610; GFX9-NEXT:    v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
611; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
612; GFX9-NEXT:    v_and_or_b32 v0, v0, v8, v2
613; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
614; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
615; GFX9-NEXT:    v_and_or_b32 v2, v3, v8, v2
616; GFX9-NEXT:    v_and_or_b32 v1, v1, v8, v5
617; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
618; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
619; GFX9-NEXT:    v_and_or_b32 v3, v6, v8, v3
620; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
621; GFX9-NEXT:    v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
622; GFX9-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
623; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1 clamp
624; GFX9-NEXT:    v_pk_sub_i16 v1, v2, v3 clamp
625; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
626; GFX9-NEXT:    v_mov_b32_e32 v2, 8
627; GFX9-NEXT:    v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
628; GFX9-NEXT:    s_movk_i32 s4, 0xff
629; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
630; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v2
631; GFX9-NEXT:    v_and_b32_e32 v2, s4, v1
632; GFX9-NEXT:    v_mov_b32_e32 v3, 24
633; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
634; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
635; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
636; GFX9-NEXT:    s_setpc_b64 s[30:31]
637;
638; GFX10-LABEL: v_ssubsat_v4i8:
639; GFX10:       ; %bb.0:
640; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
641; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
642; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
643; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
644; GFX10-NEXT:    s_mov_b32 s4, 8
645; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
646; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
647; GFX10-NEXT:    v_lshrrev_b32_sdwa v6, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
648; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
649; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
650; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
651; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
652; GFX10-NEXT:    s_movk_i32 s4, 0xff
653; GFX10-NEXT:    v_and_or_b32 v0, v0, v7, v2
654; GFX10-NEXT:    v_and_or_b32 v1, v1, v7, v6
655; GFX10-NEXT:    v_and_or_b32 v2, v3, v7, v4
656; GFX10-NEXT:    v_and_or_b32 v3, v8, v7, v5
657; GFX10-NEXT:    v_mov_b32_e32 v4, 24
658; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
659; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
660; GFX10-NEXT:    v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
661; GFX10-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
662; GFX10-NEXT:    v_pk_sub_i16 v0, v0, v1 clamp
663; GFX10-NEXT:    v_pk_sub_i16 v1, v2, v3 clamp
664; GFX10-NEXT:    v_mov_b32_e32 v2, 8
665; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
666; GFX10-NEXT:    v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
667; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
668; GFX10-NEXT:    v_and_b32_e32 v3, s4, v1
669; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
670; GFX10-NEXT:    v_and_or_b32 v0, v0, s4, v2
671; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
672; GFX10-NEXT:    v_or3_b32 v0, v0, v2, v1
673; GFX10-NEXT:    s_setpc_b64 s[30:31]
674  %lhs = bitcast i32 %lhs.arg to <4 x i8>
675  %rhs = bitcast i32 %rhs.arg to <4 x i8>
676  %result = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs)
677  %cast.result = bitcast <4 x i8> %result to i32
678  ret i32 %cast.result
679}
680
; Signed saturating subtract of <4 x i8> in an amdgpu_ps function whose two
; operands are inreg i32 values (checked in s0 and s1). The IR bitcasts each
; i32 to <4 x i8>, calls llvm.ssub.sat.v4i8, and bitcasts the result back to
; i32 for the return. The tahiti and fiji checks handle the four bytes one at
; a time; the gfx900 and gfx1010 checks repack the bytes into 16-bit pairs and
; use v_pk_sub_i16 with clamp, then v_readfirstlane_b32 into s0.
; The check lines are autogenerated; regenerate them instead of hand-editing.
681define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
682; GFX6-LABEL: s_ssubsat_v4i8:
683; GFX6:       ; %bb.0:
684; GFX6-NEXT:    s_lshr_b32 s2, s0, 8
685; GFX6-NEXT:    s_lshr_b32 s3, s0, 16
686; GFX6-NEXT:    s_lshr_b32 s4, s0, 24
687; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
688; GFX6-NEXT:    s_brev_b32 s8, -2
689; GFX6-NEXT:    s_max_i32 s10, s0, -1
690; GFX6-NEXT:    s_lshr_b32 s5, s1, 8
691; GFX6-NEXT:    s_lshr_b32 s6, s1, 16
692; GFX6-NEXT:    s_lshr_b32 s7, s1, 24
693; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
694; GFX6-NEXT:    s_brev_b32 s9, 1
695; GFX6-NEXT:    s_sub_i32 s10, s10, s8
696; GFX6-NEXT:    s_min_i32 s11, s0, -1
697; GFX6-NEXT:    s_sub_i32 s11, s11, s9
698; GFX6-NEXT:    s_max_i32 s1, s10, s1
699; GFX6-NEXT:    s_min_i32 s1, s1, s11
700; GFX6-NEXT:    s_sub_i32 s0, s0, s1
701; GFX6-NEXT:    s_lshl_b32 s1, s2, 24
702; GFX6-NEXT:    s_lshl_b32 s2, s5, 24
703; GFX6-NEXT:    s_max_i32 s5, s1, -1
704; GFX6-NEXT:    s_sub_i32 s5, s5, s8
705; GFX6-NEXT:    s_min_i32 s10, s1, -1
706; GFX6-NEXT:    s_sub_i32 s10, s10, s9
707; GFX6-NEXT:    s_max_i32 s2, s5, s2
708; GFX6-NEXT:    s_min_i32 s2, s2, s10
709; GFX6-NEXT:    s_sub_i32 s1, s1, s2
710; GFX6-NEXT:    s_lshl_b32 s2, s3, 24
711; GFX6-NEXT:    s_max_i32 s5, s2, -1
712; GFX6-NEXT:    s_lshl_b32 s3, s6, 24
713; GFX6-NEXT:    s_sub_i32 s5, s5, s8
714; GFX6-NEXT:    s_min_i32 s6, s2, -1
715; GFX6-NEXT:    s_sub_i32 s6, s6, s9
716; GFX6-NEXT:    s_max_i32 s3, s5, s3
717; GFX6-NEXT:    s_min_i32 s3, s3, s6
718; GFX6-NEXT:    s_sub_i32 s2, s2, s3
719; GFX6-NEXT:    s_lshl_b32 s3, s4, 24
720; GFX6-NEXT:    s_max_i32 s5, s3, -1
721; GFX6-NEXT:    s_lshl_b32 s4, s7, 24
722; GFX6-NEXT:    s_sub_i32 s5, s5, s8
723; GFX6-NEXT:    s_min_i32 s6, s3, -1
724; GFX6-NEXT:    s_sub_i32 s6, s6, s9
725; GFX6-NEXT:    s_max_i32 s4, s5, s4
726; GFX6-NEXT:    s_min_i32 s4, s4, s6
727; GFX6-NEXT:    s_ashr_i32 s1, s1, 24
728; GFX6-NEXT:    s_sub_i32 s3, s3, s4
729; GFX6-NEXT:    s_movk_i32 s4, 0xff
730; GFX6-NEXT:    s_ashr_i32 s0, s0, 24
731; GFX6-NEXT:    s_and_b32 s1, s1, s4
732; GFX6-NEXT:    s_ashr_i32 s2, s2, 24
733; GFX6-NEXT:    s_and_b32 s0, s0, s4
734; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
735; GFX6-NEXT:    s_or_b32 s0, s0, s1
736; GFX6-NEXT:    s_and_b32 s1, s2, s4
737; GFX6-NEXT:    s_ashr_i32 s3, s3, 24
738; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
739; GFX6-NEXT:    s_or_b32 s0, s0, s1
740; GFX6-NEXT:    s_and_b32 s1, s3, s4
741; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
742; GFX6-NEXT:    s_or_b32 s0, s0, s1
743; GFX6-NEXT:    ; return to shader part epilog
744;
745; GFX8-LABEL: s_ssubsat_v4i8:
746; GFX8:       ; %bb.0:
747; GFX8-NEXT:    s_bfe_u32 s8, 8, 0x100000
748; GFX8-NEXT:    s_lshr_b32 s2, s0, 8
749; GFX8-NEXT:    s_lshr_b32 s3, s0, 16
750; GFX8-NEXT:    s_lshr_b32 s4, s0, 24
751; GFX8-NEXT:    s_lshl_b32 s0, s0, s8
752; GFX8-NEXT:    s_sext_i32_i16 s11, s0
753; GFX8-NEXT:    s_sext_i32_i16 s12, -1
754; GFX8-NEXT:    s_movk_i32 s9, 0x7fff
755; GFX8-NEXT:    s_max_i32 s13, s11, s12
756; GFX8-NEXT:    s_lshr_b32 s5, s1, 8
757; GFX8-NEXT:    s_lshr_b32 s6, s1, 16
758; GFX8-NEXT:    s_lshr_b32 s7, s1, 24
759; GFX8-NEXT:    s_lshl_b32 s1, s1, s8
760; GFX8-NEXT:    s_sub_i32 s13, s13, s9
761; GFX8-NEXT:    s_movk_i32 s10, 0x8000
762; GFX8-NEXT:    s_min_i32 s11, s11, s12
763; GFX8-NEXT:    s_sext_i32_i16 s13, s13
764; GFX8-NEXT:    s_sext_i32_i16 s1, s1
765; GFX8-NEXT:    s_sub_i32 s11, s11, s10
766; GFX8-NEXT:    s_max_i32 s1, s13, s1
767; GFX8-NEXT:    s_sext_i32_i16 s1, s1
768; GFX8-NEXT:    s_sext_i32_i16 s11, s11
769; GFX8-NEXT:    s_min_i32 s1, s1, s11
770; GFX8-NEXT:    s_sub_i32 s0, s0, s1
771; GFX8-NEXT:    s_lshl_b32 s1, s2, s8
772; GFX8-NEXT:    s_lshl_b32 s2, s5, s8
773; GFX8-NEXT:    s_sext_i32_i16 s5, s1
774; GFX8-NEXT:    s_max_i32 s11, s5, s12
775; GFX8-NEXT:    s_sub_i32 s11, s11, s9
776; GFX8-NEXT:    s_min_i32 s5, s5, s12
777; GFX8-NEXT:    s_sext_i32_i16 s11, s11
778; GFX8-NEXT:    s_sext_i32_i16 s2, s2
779; GFX8-NEXT:    s_sub_i32 s5, s5, s10
780; GFX8-NEXT:    s_max_i32 s2, s11, s2
781; GFX8-NEXT:    s_sext_i32_i16 s2, s2
782; GFX8-NEXT:    s_sext_i32_i16 s5, s5
783; GFX8-NEXT:    s_min_i32 s2, s2, s5
784; GFX8-NEXT:    s_sub_i32 s1, s1, s2
785; GFX8-NEXT:    s_lshl_b32 s2, s3, s8
786; GFX8-NEXT:    s_sext_i32_i16 s5, s2
787; GFX8-NEXT:    s_lshl_b32 s3, s6, s8
788; GFX8-NEXT:    s_max_i32 s6, s5, s12
789; GFX8-NEXT:    s_sub_i32 s6, s6, s9
790; GFX8-NEXT:    s_min_i32 s5, s5, s12
791; GFX8-NEXT:    s_sext_i32_i16 s6, s6
792; GFX8-NEXT:    s_sext_i32_i16 s3, s3
793; GFX8-NEXT:    s_sub_i32 s5, s5, s10
794; GFX8-NEXT:    s_max_i32 s3, s6, s3
795; GFX8-NEXT:    s_sext_i32_i16 s3, s3
796; GFX8-NEXT:    s_sext_i32_i16 s5, s5
797; GFX8-NEXT:    s_min_i32 s3, s3, s5
798; GFX8-NEXT:    s_sub_i32 s2, s2, s3
799; GFX8-NEXT:    s_lshl_b32 s3, s4, s8
800; GFX8-NEXT:    s_sext_i32_i16 s5, s3
801; GFX8-NEXT:    s_max_i32 s6, s5, s12
802; GFX8-NEXT:    s_lshl_b32 s4, s7, s8
803; GFX8-NEXT:    s_sub_i32 s6, s6, s9
804; GFX8-NEXT:    s_min_i32 s5, s5, s12
805; GFX8-NEXT:    s_sext_i32_i16 s6, s6
806; GFX8-NEXT:    s_sext_i32_i16 s4, s4
807; GFX8-NEXT:    s_sub_i32 s5, s5, s10
808; GFX8-NEXT:    s_max_i32 s4, s6, s4
809; GFX8-NEXT:    s_sext_i32_i16 s4, s4
810; GFX8-NEXT:    s_sext_i32_i16 s5, s5
811; GFX8-NEXT:    s_sext_i32_i16 s1, s1
812; GFX8-NEXT:    s_min_i32 s4, s4, s5
813; GFX8-NEXT:    s_sext_i32_i16 s0, s0
814; GFX8-NEXT:    s_ashr_i32 s1, s1, s8
815; GFX8-NEXT:    s_sub_i32 s3, s3, s4
816; GFX8-NEXT:    s_movk_i32 s4, 0xff
817; GFX8-NEXT:    s_ashr_i32 s0, s0, s8
818; GFX8-NEXT:    s_sext_i32_i16 s2, s2
819; GFX8-NEXT:    s_and_b32 s1, s1, s4
820; GFX8-NEXT:    s_ashr_i32 s2, s2, s8
821; GFX8-NEXT:    s_and_b32 s0, s0, s4
822; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
823; GFX8-NEXT:    s_sext_i32_i16 s3, s3
824; GFX8-NEXT:    s_or_b32 s0, s0, s1
825; GFX8-NEXT:    s_and_b32 s1, s2, s4
826; GFX8-NEXT:    s_ashr_i32 s3, s3, s8
827; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
828; GFX8-NEXT:    s_or_b32 s0, s0, s1
829; GFX8-NEXT:    s_and_b32 s1, s3, s4
830; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
831; GFX8-NEXT:    s_or_b32 s0, s0, s1
832; GFX8-NEXT:    ; return to shader part epilog
833;
834; GFX9-LABEL: s_ssubsat_v4i8:
835; GFX9:       ; %bb.0:
836; GFX9-NEXT:    s_lshr_b32 s3, s0, 8
837; GFX9-NEXT:    s_lshr_b32 s4, s0, 16
838; GFX9-NEXT:    s_lshr_b32 s6, s0, 24
839; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
840; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s4, s6
841; GFX9-NEXT:    s_mov_b32 s4, 0x80008
842; GFX9-NEXT:    s_lshr_b32 s6, s0, 16
843; GFX9-NEXT:    s_lshr_b32 s7, s1, 8
844; GFX9-NEXT:    s_lshl_b32 s0, s0, s4
845; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
846; GFX9-NEXT:    s_lshr_b32 s8, s1, 16
847; GFX9-NEXT:    s_lshr_b32 s9, s1, 24
848; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s6
849; GFX9-NEXT:    s_lshr_b32 s6, s3, 16
850; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s7
851; GFX9-NEXT:    s_lshl_b32 s3, s3, s4
852; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
853; GFX9-NEXT:    s_lshr_b32 s7, s1, 16
854; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s6
855; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s8, s9
856; GFX9-NEXT:    s_lshl_b32 s1, s1, s4
857; GFX9-NEXT:    s_lshl_b32 s7, s7, 8
858; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s7
859; GFX9-NEXT:    s_lshr_b32 s7, s6, 16
860; GFX9-NEXT:    s_lshl_b32 s4, s6, s4
861; GFX9-NEXT:    s_lshl_b32 s6, s7, 8
862; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s6
863; GFX9-NEXT:    v_mov_b32_e32 v0, s1
864; GFX9-NEXT:    v_pk_sub_i16 v0, s0, v0 clamp
865; GFX9-NEXT:    v_mov_b32_e32 v1, s4
866; GFX9-NEXT:    s_mov_b32 s2, 8
867; GFX9-NEXT:    v_pk_sub_i16 v1, s3, v1 clamp
868; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
869; GFX9-NEXT:    v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
870; GFX9-NEXT:    s_movk_i32 s0, 0xff
871; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
872; GFX9-NEXT:    s_mov_b32 s5, 24
873; GFX9-NEXT:    v_and_or_b32 v0, v0, s0, v2
874; GFX9-NEXT:    v_and_b32_e32 v2, s0, v1
875; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
876; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
877; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
878; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
879; GFX9-NEXT:    ; return to shader part epilog
880;
881; GFX10-LABEL: s_ssubsat_v4i8:
882; GFX10:       ; %bb.0:
883; GFX10-NEXT:    s_lshr_b32 s2, s0, 8
884; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
885; GFX10-NEXT:    s_lshr_b32 s4, s0, 24
886; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
887; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s3, s4
888; GFX10-NEXT:    s_mov_b32 s3, 0x80008
889; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
890; GFX10-NEXT:    s_lshr_b32 s5, s1, 8
891; GFX10-NEXT:    s_lshr_b32 s6, s1, 16
892; GFX10-NEXT:    s_lshr_b32 s7, s1, 24
893; GFX10-NEXT:    s_lshl_b32 s0, s0, s3
894; GFX10-NEXT:    s_lshl_b32 s4, s4, 8
895; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
896; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
897; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s6, s7
898; GFX10-NEXT:    s_lshr_b32 s8, s2, 16
899; GFX10-NEXT:    s_lshr_b32 s5, s1, 16
900; GFX10-NEXT:    s_lshr_b32 s6, s4, 16
901; GFX10-NEXT:    s_lshl_b32 s2, s2, s3
902; GFX10-NEXT:    s_lshl_b32 s8, s8, 8
903; GFX10-NEXT:    s_lshl_b32 s1, s1, s3
904; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
905; GFX10-NEXT:    s_lshl_b32 s3, s4, s3
906; GFX10-NEXT:    s_lshl_b32 s4, s6, 8
907; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s8
908; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
909; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
910; GFX10-NEXT:    v_pk_sub_i16 v0, s0, s1 clamp
911; GFX10-NEXT:    v_pk_sub_i16 v1, s2, s3 clamp
912; GFX10-NEXT:    s_mov_b32 s0, 8
913; GFX10-NEXT:    s_movk_i32 s1, 0xff
914; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
915; GFX10-NEXT:    v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
916; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
917; GFX10-NEXT:    v_and_b32_e32 v3, s1, v1
918; GFX10-NEXT:    s_mov_b32 s0, 24
919; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
920; GFX10-NEXT:    v_and_or_b32 v0, v0, s1, v2
921; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
922; GFX10-NEXT:    v_or3_b32 v0, v0, v2, v1
923; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
924; GFX10-NEXT:    ; return to shader part epilog
925  %lhs = bitcast i32 %lhs.arg to <4 x i8>
926  %rhs = bitcast i32 %rhs.arg to <4 x i8>
927  %result = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs)
928  %cast.result = bitcast <4 x i8> %result to i32
929  ret i32 %cast.result
930}
931
; Signed saturating subtract on i24 with no calling convention specified;
; the operands are checked in v0 and v1. The tahiti, gfx900 and gfx1010
; checks shift both operands left by 8, do a 32-bit saturating subtract, and
; shift the result right arithmetically by 8. The fiji checks instead
; subtract first and then use 24-bit v_bfe_i32 extracts, compares and a
; v_cndmask_b32 select.
; The check lines are autogenerated; regenerate them instead of hand-editing.
932define i24 @v_ssubsat_i24(i24 %lhs, i24 %rhs) {
933; GFX6-LABEL: v_ssubsat_i24:
934; GFX6:       ; %bb.0:
935; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
936; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
937; GFX6-NEXT:    v_max_i32_e32 v2, -1, v0
938; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
939; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2
940; GFX6-NEXT:    v_min_i32_e32 v3, -1, v0
941; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 0x80000000, v3
942; GFX6-NEXT:    v_max_i32_e32 v1, v2, v1
943; GFX6-NEXT:    v_min_i32_e32 v1, v1, v3
944; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
945; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 8, v0
946; GFX6-NEXT:    s_setpc_b64 s[30:31]
947;
948; GFX8-LABEL: v_ssubsat_i24:
949; GFX8:       ; %bb.0:
950; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
951; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v0, v1
952; GFX8-NEXT:    v_bfe_i32 v3, v2, 0, 24
953; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 24
954; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v0
955; GFX8-NEXT:    v_bfe_i32 v0, v1, 0, 24
956; GFX8-NEXT:    v_cmp_lt_i32_e64 s[6:7], 0, v0
957; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 23, v3
958; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xff800000, v0
959; GFX8-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
960; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
961; GFX8-NEXT:    s_setpc_b64 s[30:31]
962;
963; GFX9-LABEL: v_ssubsat_i24:
964; GFX9:       ; %bb.0:
965; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
966; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
967; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
968; GFX9-NEXT:    v_sub_i32 v0, v0, v1 clamp
969; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 8, v0
970; GFX9-NEXT:    s_setpc_b64 s[30:31]
971;
972; GFX10-LABEL: v_ssubsat_i24:
973; GFX10:       ; %bb.0:
974; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
975; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
976; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
977; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
978; GFX10-NEXT:    v_sub_nc_i32 v0, v0, v1 clamp
979; GFX10-NEXT:    v_ashrrev_i32_e32 v0, 8, v0
980; GFX10-NEXT:    s_setpc_b64 s[30:31]
981  %result = call i24 @llvm.ssub.sat.i24(i24 %lhs, i24 %rhs)
982  ret i24 %result
983}
984
; Signed saturating subtract on i24 in an amdgpu_ps function with both
; operands marked inreg (checked in s0 and s1). The tahiti checks shift the
; operands left by 8 around a scalar max/min/sub expansion; the fiji checks
; use s_bfe_i32 24-bit extracts with compare/select; the gfx900 and gfx1010
; checks use a VALU clamped subtract followed by v_readfirstlane_b32 into s0.
; The check lines are autogenerated; regenerate them instead of hand-editing.
985define amdgpu_ps i24 @s_ssubsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
986; GFX6-LABEL: s_ssubsat_i24:
987; GFX6:       ; %bb.0:
988; GFX6-NEXT:    s_lshl_b32 s0, s0, 8
989; GFX6-NEXT:    s_max_i32 s2, s0, -1
990; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
991; GFX6-NEXT:    s_sub_i32 s2, s2, 0x7fffffff
992; GFX6-NEXT:    s_min_i32 s3, s0, -1
993; GFX6-NEXT:    s_sub_i32 s3, s3, 0x80000000
994; GFX6-NEXT:    s_max_i32 s1, s2, s1
995; GFX6-NEXT:    s_min_i32 s1, s1, s3
996; GFX6-NEXT:    s_sub_i32 s0, s0, s1
997; GFX6-NEXT:    s_ashr_i32 s0, s0, 8
998; GFX6-NEXT:    ; return to shader part epilog
999;
1000; GFX8-LABEL: s_ssubsat_i24:
1001; GFX8:       ; %bb.0:
1002; GFX8-NEXT:    s_sub_i32 s2, s0, s1
1003; GFX8-NEXT:    s_bfe_i32 s3, s2, 0x180000
1004; GFX8-NEXT:    s_bfe_i32 s0, s0, 0x180000
1005; GFX8-NEXT:    s_cmp_lt_i32 s3, s0
1006; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
1007; GFX8-NEXT:    s_bfe_i32 s1, s1, 0x180000
1008; GFX8-NEXT:    s_cmp_gt_i32 s1, 0
1009; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
1010; GFX8-NEXT:    s_xor_b32 s0, s1, s0
1011; GFX8-NEXT:    s_ashr_i32 s1, s3, 23
1012; GFX8-NEXT:    s_add_i32 s1, s1, 0xff800000
1013; GFX8-NEXT:    s_and_b32 s0, s0, 1
1014; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
1015; GFX8-NEXT:    s_cselect_b32 s0, s1, s2
1016; GFX8-NEXT:    ; return to shader part epilog
1017;
1018; GFX9-LABEL: s_ssubsat_i24:
1019; GFX9:       ; %bb.0:
1020; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
1021; GFX9-NEXT:    s_lshl_b32 s0, s0, 8
1022; GFX9-NEXT:    v_mov_b32_e32 v0, s1
1023; GFX9-NEXT:    v_sub_i32 v0, s0, v0 clamp
1024; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 8, v0
1025; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1026; GFX9-NEXT:    ; return to shader part epilog
1027;
1028; GFX10-LABEL: s_ssubsat_i24:
1029; GFX10:       ; %bb.0:
1030; GFX10-NEXT:    s_lshl_b32 s0, s0, 8
1031; GFX10-NEXT:    s_lshl_b32 s1, s1, 8
1032; GFX10-NEXT:    v_sub_nc_i32 v0, s0, s1 clamp
1033; GFX10-NEXT:    v_ashrrev_i32_e32 v0, 8, v0
1034; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
1035; GFX10-NEXT:    ; return to shader part epilog
1036  %result = call i24 @llvm.ssub.sat.i24(i24 %lhs, i24 %rhs)
1037  ret i24 %result
1038}
1039
; Signed saturating subtract on i32 with operands checked in v0 and v1.
; The tahiti and fiji checks expect a max/min clamp of the right-hand operand
; followed by a plain subtract; the gfx900 and gfx1010 checks expect a single
; subtract instruction with the clamp modifier.
; The check lines are autogenerated; regenerate them instead of hand-editing.
1040define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) {
1041; GFX6-LABEL: v_ssubsat_i32:
1042; GFX6:       ; %bb.0:
1043; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1044; GFX6-NEXT:    v_max_i32_e32 v2, -1, v0
1045; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2
1046; GFX6-NEXT:    v_min_i32_e32 v3, -1, v0
1047; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 0x80000000, v3
1048; GFX6-NEXT:    v_max_i32_e32 v1, v2, v1
1049; GFX6-NEXT:    v_min_i32_e32 v1, v1, v3
1050; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
1051; GFX6-NEXT:    s_setpc_b64 s[30:31]
1052;
1053; GFX8-LABEL: v_ssubsat_i32:
1054; GFX8:       ; %bb.0:
1055; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1056; GFX8-NEXT:    v_max_i32_e32 v2, -1, v0
1057; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, 0x7fffffff, v2
1058; GFX8-NEXT:    v_min_i32_e32 v3, -1, v0
1059; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 0x80000000, v3
1060; GFX8-NEXT:    v_max_i32_e32 v1, v2, v1
1061; GFX8-NEXT:    v_min_i32_e32 v1, v1, v3
1062; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
1063; GFX8-NEXT:    s_setpc_b64 s[30:31]
1064;
1065; GFX9-LABEL: v_ssubsat_i32:
1066; GFX9:       ; %bb.0:
1067; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1068; GFX9-NEXT:    v_sub_i32 v0, v0, v1 clamp
1069; GFX9-NEXT:    s_setpc_b64 s[30:31]
1070;
1071; GFX10-LABEL: v_ssubsat_i32:
1072; GFX10:       ; %bb.0:
1073; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1074; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1075; GFX10-NEXT:    v_sub_nc_i32 v0, v0, v1 clamp
1076; GFX10-NEXT:    s_setpc_b64 s[30:31]
1077  %result = call i32 @llvm.ssub.sat.i32(i32 %lhs, i32 %rhs)
1078  ret i32 %result
1079}
1080
; Signed saturating subtract on i32 in an amdgpu_ps function with both
; operands marked inreg (checked in s0 and s1). The tahiti and fiji checks
; stay entirely on scalar max/min/sub instructions; the gfx900 and gfx1010
; checks use a VALU clamped subtract and then v_readfirstlane_b32 into s0.
; The check lines are autogenerated; regenerate them instead of hand-editing.
1081define amdgpu_ps i32 @s_ssubsat_i32(i32 inreg %lhs, i32 inreg %rhs) {
1082; GFX6-LABEL: s_ssubsat_i32:
1083; GFX6:       ; %bb.0:
1084; GFX6-NEXT:    s_max_i32 s2, s0, -1
1085; GFX6-NEXT:    s_sub_i32 s2, s2, 0x7fffffff
1086; GFX6-NEXT:    s_min_i32 s3, s0, -1
1087; GFX6-NEXT:    s_sub_i32 s3, s3, 0x80000000
1088; GFX6-NEXT:    s_max_i32 s1, s2, s1
1089; GFX6-NEXT:    s_min_i32 s1, s1, s3
1090; GFX6-NEXT:    s_sub_i32 s0, s0, s1
1091; GFX6-NEXT:    ; return to shader part epilog
1092;
1093; GFX8-LABEL: s_ssubsat_i32:
1094; GFX8:       ; %bb.0:
1095; GFX8-NEXT:    s_max_i32 s2, s0, -1
1096; GFX8-NEXT:    s_sub_i32 s2, s2, 0x7fffffff
1097; GFX8-NEXT:    s_min_i32 s3, s0, -1
1098; GFX8-NEXT:    s_sub_i32 s3, s3, 0x80000000
1099; GFX8-NEXT:    s_max_i32 s1, s2, s1
1100; GFX8-NEXT:    s_min_i32 s1, s1, s3
1101; GFX8-NEXT:    s_sub_i32 s0, s0, s1
1102; GFX8-NEXT:    ; return to shader part epilog
1103;
1104; GFX9-LABEL: s_ssubsat_i32:
1105; GFX9:       ; %bb.0:
1106; GFX9-NEXT:    v_mov_b32_e32 v0, s1
1107; GFX9-NEXT:    v_sub_i32 v0, s0, v0 clamp
1108; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1109; GFX9-NEXT:    ; return to shader part epilog
1110;
1111; GFX10-LABEL: s_ssubsat_i32:
1112; GFX10:       ; %bb.0:
1113; GFX10-NEXT:    v_sub_nc_i32 v0, s0, s1 clamp
1114; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
1115; GFX10-NEXT:    ; return to shader part epilog
1116  %result = call i32 @llvm.ssub.sat.i32(i32 %lhs, i32 %rhs)
1117  ret i32 %result
1118}
1119
; Mixed-operand case: the left-hand operand is inreg (checked in s0) and the
; right-hand operand is not (checked in v0). The i32 result is bitcast to
; float for the return; the checks show it left in v0 with no
; v_readfirstlane_b32.
; The check lines are autogenerated; regenerate them instead of hand-editing.
1120define amdgpu_ps float @ssubsat_i32_sv(i32 inreg %lhs, i32 %rhs) {
1121; GFX6-LABEL: ssubsat_i32_sv:
1122; GFX6:       ; %bb.0:
1123; GFX6-NEXT:    s_max_i32 s1, s0, -1
1124; GFX6-NEXT:    s_sub_i32 s1, s1, 0x7fffffff
1125; GFX6-NEXT:    s_min_i32 s2, s0, -1
1126; GFX6-NEXT:    s_sub_i32 s2, s2, 0x80000000
1127; GFX6-NEXT:    v_max_i32_e32 v0, s1, v0
1128; GFX6-NEXT:    v_min_i32_e32 v0, s2, v0
1129; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
1130; GFX6-NEXT:    ; return to shader part epilog
1131;
1132; GFX8-LABEL: ssubsat_i32_sv:
1133; GFX8:       ; %bb.0:
1134; GFX8-NEXT:    s_max_i32 s1, s0, -1
1135; GFX8-NEXT:    s_sub_i32 s1, s1, 0x7fffffff
1136; GFX8-NEXT:    s_min_i32 s2, s0, -1
1137; GFX8-NEXT:    s_sub_i32 s2, s2, 0x80000000
1138; GFX8-NEXT:    v_max_i32_e32 v0, s1, v0
1139; GFX8-NEXT:    v_min_i32_e32 v0, s2, v0
1140; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
1141; GFX8-NEXT:    ; return to shader part epilog
1142;
1143; GFX9-LABEL: ssubsat_i32_sv:
1144; GFX9:       ; %bb.0:
1145; GFX9-NEXT:    v_sub_i32 v0, s0, v0 clamp
1146; GFX9-NEXT:    ; return to shader part epilog
1147;
1148; GFX10-LABEL: ssubsat_i32_sv:
1149; GFX10:       ; %bb.0:
1150; GFX10-NEXT:    v_sub_nc_i32 v0, s0, v0 clamp
1151; GFX10-NEXT:    ; return to shader part epilog
1152  %result = call i32 @llvm.ssub.sat.i32(i32 %lhs, i32 %rhs)
1153  %cast = bitcast i32 %result to float
1154  ret float %cast
1155}
1156
; Mixed-operand case: the left-hand operand is not inreg (checked in v0) and
; the right-hand operand is inreg (checked in s0). The i32 result is bitcast
; to float for the return; the checks show it left in v0 with no
; v_readfirstlane_b32.
; The check lines are autogenerated; regenerate them instead of hand-editing.
1157define amdgpu_ps float @ssubsat_i32_vs(i32 %lhs, i32 inreg %rhs) {
1158; GFX6-LABEL: ssubsat_i32_vs:
1159; GFX6:       ; %bb.0:
1160; GFX6-NEXT:    v_max_i32_e32 v1, -1, v0
1161; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, 0x7fffffff, v1
1162; GFX6-NEXT:    v_min_i32_e32 v2, -1, v0
1163; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 0x80000000, v2
1164; GFX6-NEXT:    v_max_i32_e32 v1, s0, v1
1165; GFX6-NEXT:    v_min_i32_e32 v1, v1, v2
1166; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
1167; GFX6-NEXT:    ; return to shader part epilog
1168;
1169; GFX8-LABEL: ssubsat_i32_vs:
1170; GFX8:       ; %bb.0:
1171; GFX8-NEXT:    v_max_i32_e32 v1, -1, v0
1172; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, 0x7fffffff, v1
1173; GFX8-NEXT:    v_min_i32_e32 v2, -1, v0
1174; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, 0x80000000, v2
1175; GFX8-NEXT:    v_max_i32_e32 v1, s0, v1
1176; GFX8-NEXT:    v_min_i32_e32 v1, v1, v2
1177; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
1178; GFX8-NEXT:    ; return to shader part epilog
1179;
1180; GFX9-LABEL: ssubsat_i32_vs:
1181; GFX9:       ; %bb.0:
1182; GFX9-NEXT:    v_sub_i32 v0, v0, s0 clamp
1183; GFX9-NEXT:    ; return to shader part epilog
1184;
1185; GFX10-LABEL: ssubsat_i32_vs:
1186; GFX10:       ; %bb.0:
1187; GFX10-NEXT:    v_sub_nc_i32 v0, v0, s0 clamp
1188; GFX10-NEXT:    ; return to shader part epilog
1189  %result = call i32 @llvm.ssub.sat.i32(i32 %lhs, i32 %rhs)
1190  %cast = bitcast i32 %result to float
1191  ret float %cast
1192}
1193
; Signed saturating subtract on <2 x i32>; the checks place the left-hand
; elements in v0-v1 and the right-hand elements in v2-v3. The tahiti and fiji
; checks repeat the max/min/sub expansion per element, with the two bounds
; materialized once into s4 and s5 via s_brev_b32; the gfx900 and gfx1010
; checks use one clamped subtract per element.
; The check lines are autogenerated; regenerate them instead of hand-editing.
1194define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
1195; GFX6-LABEL: v_ssubsat_v2i32:
1196; GFX6:       ; %bb.0:
1197; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1198; GFX6-NEXT:    s_brev_b32 s4, -2
1199; GFX6-NEXT:    v_max_i32_e32 v4, -1, v0
1200; GFX6-NEXT:    s_brev_b32 s5, 1
1201; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v4
1202; GFX6-NEXT:    v_min_i32_e32 v5, -1, v0
1203; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s5, v5
1204; GFX6-NEXT:    v_max_i32_e32 v2, v4, v2
1205; GFX6-NEXT:    v_min_i32_e32 v2, v2, v5
1206; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
1207; GFX6-NEXT:    v_max_i32_e32 v2, -1, v1
1208; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s4, v2
1209; GFX6-NEXT:    v_min_i32_e32 v4, -1, v1
1210; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s5, v4
1211; GFX6-NEXT:    v_max_i32_e32 v2, v2, v3
1212; GFX6-NEXT:    v_min_i32_e32 v2, v2, v4
1213; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
1214; GFX6-NEXT:    s_setpc_b64 s[30:31]
1215;
1216; GFX8-LABEL: v_ssubsat_v2i32:
1217; GFX8:       ; %bb.0:
1218; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1219; GFX8-NEXT:    s_brev_b32 s4, -2
1220; GFX8-NEXT:    v_max_i32_e32 v4, -1, v0
1221; GFX8-NEXT:    s_brev_b32 s5, 1
1222; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s4, v4
1223; GFX8-NEXT:    v_min_i32_e32 v5, -1, v0
1224; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, s5, v5
1225; GFX8-NEXT:    v_max_i32_e32 v2, v4, v2
1226; GFX8-NEXT:    v_min_i32_e32 v2, v2, v5
1227; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
1228; GFX8-NEXT:    v_max_i32_e32 v2, -1, v1
1229; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s4, v2
1230; GFX8-NEXT:    v_min_i32_e32 v4, -1, v1
1231; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s5, v4
1232; GFX8-NEXT:    v_max_i32_e32 v2, v2, v3
1233; GFX8-NEXT:    v_min_i32_e32 v2, v2, v4
1234; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v1, v2
1235; GFX8-NEXT:    s_setpc_b64 s[30:31]
1236;
1237; GFX9-LABEL: v_ssubsat_v2i32:
1238; GFX9:       ; %bb.0:
1239; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1240; GFX9-NEXT:    v_sub_i32 v0, v0, v2 clamp
1241; GFX9-NEXT:    v_sub_i32 v1, v1, v3 clamp
1242; GFX9-NEXT:    s_setpc_b64 s[30:31]
1243;
1244; GFX10-LABEL: v_ssubsat_v2i32:
1245; GFX10:       ; %bb.0:
1246; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1247; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1248; GFX10-NEXT:    v_sub_nc_i32 v0, v0, v2 clamp
1249; GFX10-NEXT:    v_sub_nc_i32 v1, v1, v3 clamp
1250; GFX10-NEXT:    s_setpc_b64 s[30:31]
1251  %result = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
1252  ret <2 x i32> %result
1253}
1254
; Signed saturating subtract on <2 x i32> in an amdgpu_ps function with both
; operands marked inreg; the checks place the left-hand elements in s0-s1 and
; the right-hand elements in s2-s3. The tahiti and fiji checks use the scalar
; max/min/sub expansion per element; the gfx900 and gfx1010 checks use VALU
; clamped subtracts followed by v_readfirstlane_b32 into s0 and s1.
; The check lines are autogenerated; regenerate them instead of hand-editing.
1255define amdgpu_ps <2 x i32> @s_ssubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inreg %rhs) {
1256; GFX6-LABEL: s_ssubsat_v2i32:
1257; GFX6:       ; %bb.0:
1258; GFX6-NEXT:    s_brev_b32 s4, -2
1259; GFX6-NEXT:    s_max_i32 s6, s0, -1
1260; GFX6-NEXT:    s_brev_b32 s5, 1
1261; GFX6-NEXT:    s_sub_i32 s6, s6, s4
1262; GFX6-NEXT:    s_min_i32 s7, s0, -1
1263; GFX6-NEXT:    s_sub_i32 s7, s7, s5
1264; GFX6-NEXT:    s_max_i32 s2, s6, s2
1265; GFX6-NEXT:    s_min_i32 s2, s2, s7
1266; GFX6-NEXT:    s_sub_i32 s0, s0, s2
1267; GFX6-NEXT:    s_max_i32 s2, s1, -1
1268; GFX6-NEXT:    s_sub_i32 s2, s2, s4
1269; GFX6-NEXT:    s_min_i32 s4, s1, -1
1270; GFX6-NEXT:    s_sub_i32 s4, s4, s5
1271; GFX6-NEXT:    s_max_i32 s2, s2, s3
1272; GFX6-NEXT:    s_min_i32 s2, s2, s4
1273; GFX6-NEXT:    s_sub_i32 s1, s1, s2
1274; GFX6-NEXT:    ; return to shader part epilog
1275;
1276; GFX8-LABEL: s_ssubsat_v2i32:
1277; GFX8:       ; %bb.0:
1278; GFX8-NEXT:    s_brev_b32 s4, -2
1279; GFX8-NEXT:    s_max_i32 s6, s0, -1
1280; GFX8-NEXT:    s_brev_b32 s5, 1
1281; GFX8-NEXT:    s_sub_i32 s6, s6, s4
1282; GFX8-NEXT:    s_min_i32 s7, s0, -1
1283; GFX8-NEXT:    s_sub_i32 s7, s7, s5
1284; GFX8-NEXT:    s_max_i32 s2, s6, s2
1285; GFX8-NEXT:    s_min_i32 s2, s2, s7
1286; GFX8-NEXT:    s_sub_i32 s0, s0, s2
1287; GFX8-NEXT:    s_max_i32 s2, s1, -1
1288; GFX8-NEXT:    s_sub_i32 s2, s2, s4
1289; GFX8-NEXT:    s_min_i32 s4, s1, -1
1290; GFX8-NEXT:    s_sub_i32 s4, s4, s5
1291; GFX8-NEXT:    s_max_i32 s2, s2, s3
1292; GFX8-NEXT:    s_min_i32 s2, s2, s4
1293; GFX8-NEXT:    s_sub_i32 s1, s1, s2
1294; GFX8-NEXT:    ; return to shader part epilog
1295;
1296; GFX9-LABEL: s_ssubsat_v2i32:
1297; GFX9:       ; %bb.0:
1298; GFX9-NEXT:    v_mov_b32_e32 v0, s2
1299; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1300; GFX9-NEXT:    v_sub_i32 v0, s0, v0 clamp
1301; GFX9-NEXT:    v_sub_i32 v1, s1, v1 clamp
1302; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1303; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
1304; GFX9-NEXT:    ; return to shader part epilog
1305;
1306; GFX10-LABEL: s_ssubsat_v2i32:
1307; GFX10:       ; %bb.0:
1308; GFX10-NEXT:    v_sub_nc_i32 v0, s0, s2 clamp
1309; GFX10-NEXT:    v_sub_nc_i32 v1, s1, s3 clamp
1310; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
1311; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
1312; GFX10-NEXT:    ; return to shader part epilog
1313  %result = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
1314  ret <2 x i32> %result
1315}
1316
; Signed saturating subtract on <3 x i32>; the checks place the left-hand
; elements in v0-v2 and the right-hand elements in v3-v5. The tahiti and fiji
; checks repeat the max/min/sub expansion per element with the bounds held in
; s4 and s5; the gfx900 and gfx1010 checks use one clamped subtract per
; element.
; The check lines are autogenerated; regenerate them instead of hand-editing.
1317define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
1318; GFX6-LABEL: v_ssubsat_v3i32:
1319; GFX6:       ; %bb.0:
1320; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1321; GFX6-NEXT:    s_brev_b32 s4, -2
1322; GFX6-NEXT:    v_max_i32_e32 v6, -1, v0
1323; GFX6-NEXT:    s_brev_b32 s5, 1
1324; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, s4, v6
1325; GFX6-NEXT:    v_min_i32_e32 v7, -1, v0
1326; GFX6-NEXT:    v_subrev_i32_e32 v7, vcc, s5, v7
1327; GFX6-NEXT:    v_max_i32_e32 v3, v6, v3
1328; GFX6-NEXT:    v_min_i32_e32 v3, v3, v7
1329; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
1330; GFX6-NEXT:    v_max_i32_e32 v3, -1, v1
1331; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v3
1332; GFX6-NEXT:    v_min_i32_e32 v6, -1, v1
1333; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, s5, v6
1334; GFX6-NEXT:    v_max_i32_e32 v3, v3, v4
1335; GFX6-NEXT:    v_min_i32_e32 v3, v3, v6
1336; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
1337; GFX6-NEXT:    v_max_i32_e32 v3, -1, v2
1338; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v3
1339; GFX6-NEXT:    v_min_i32_e32 v4, -1, v2
1340; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s5, v4
1341; GFX6-NEXT:    v_max_i32_e32 v3, v3, v5
1342; GFX6-NEXT:    v_min_i32_e32 v3, v3, v4
1343; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
1344; GFX6-NEXT:    s_setpc_b64 s[30:31]
1345;
1346; GFX8-LABEL: v_ssubsat_v3i32:
1347; GFX8:       ; %bb.0:
1348; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1349; GFX8-NEXT:    s_brev_b32 s4, -2
1350; GFX8-NEXT:    v_max_i32_e32 v6, -1, v0
1351; GFX8-NEXT:    s_brev_b32 s5, 1
1352; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s4, v6
1353; GFX8-NEXT:    v_min_i32_e32 v7, -1, v0
1354; GFX8-NEXT:    v_subrev_u32_e32 v7, vcc, s5, v7
1355; GFX8-NEXT:    v_max_i32_e32 v3, v6, v3
1356; GFX8-NEXT:    v_min_i32_e32 v3, v3, v7
1357; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v3
1358; GFX8-NEXT:    v_max_i32_e32 v3, -1, v1
1359; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s4, v3
1360; GFX8-NEXT:    v_min_i32_e32 v6, -1, v1
1361; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s5, v6
1362; GFX8-NEXT:    v_max_i32_e32 v3, v3, v4
1363; GFX8-NEXT:    v_min_i32_e32 v3, v3, v6
1364; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v1, v3
1365; GFX8-NEXT:    v_max_i32_e32 v3, -1, v2
1366; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s4, v3
1367; GFX8-NEXT:    v_min_i32_e32 v4, -1, v2
1368; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s5, v4
1369; GFX8-NEXT:    v_max_i32_e32 v3, v3, v5
1370; GFX8-NEXT:    v_min_i32_e32 v3, v3, v4
1371; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
1372; GFX8-NEXT:    s_setpc_b64 s[30:31]
1373;
1374; GFX9-LABEL: v_ssubsat_v3i32:
1375; GFX9:       ; %bb.0:
1376; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1377; GFX9-NEXT:    v_sub_i32 v0, v0, v3 clamp
1378; GFX9-NEXT:    v_sub_i32 v1, v1, v4 clamp
1379; GFX9-NEXT:    v_sub_i32 v2, v2, v5 clamp
1380; GFX9-NEXT:    s_setpc_b64 s[30:31]
1381;
1382; GFX10-LABEL: v_ssubsat_v3i32:
1383; GFX10:       ; %bb.0:
1384; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1385; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1386; GFX10-NEXT:    v_sub_nc_i32 v0, v0, v3 clamp
1387; GFX10-NEXT:    v_sub_nc_i32 v1, v1, v4 clamp
1388; GFX10-NEXT:    v_sub_nc_i32 v2, v2, v5 clamp
1389; GFX10-NEXT:    s_setpc_b64 s[30:31]
1390  %result = call <3 x i32> @llvm.ssub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
1391  ret <3 x i32> %result
1392}
1393
; Signed saturating subtract on <3 x i32> in an amdgpu_ps function with both
; operands marked inreg; the checks place the left-hand elements in s0-s2 and
; the right-hand elements in s3-s5. The tahiti and fiji checks use the scalar
; max/min/sub expansion per element with the bounds held in s6 and s7; the
; gfx900 and gfx1010 checks use VALU clamped subtracts followed by
; v_readfirstlane_b32 into s0-s2.
; The check lines are autogenerated; regenerate them instead of hand-editing.
1394define amdgpu_ps <3 x i32> @s_ssubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inreg %rhs) {
1395; GFX6-LABEL: s_ssubsat_v3i32:
1396; GFX6:       ; %bb.0:
1397; GFX6-NEXT:    s_brev_b32 s6, -2
1398; GFX6-NEXT:    s_max_i32 s8, s0, -1
1399; GFX6-NEXT:    s_brev_b32 s7, 1
1400; GFX6-NEXT:    s_sub_i32 s8, s8, s6
1401; GFX6-NEXT:    s_min_i32 s9, s0, -1
1402; GFX6-NEXT:    s_sub_i32 s9, s9, s7
1403; GFX6-NEXT:    s_max_i32 s3, s8, s3
1404; GFX6-NEXT:    s_min_i32 s3, s3, s9
1405; GFX6-NEXT:    s_sub_i32 s0, s0, s3
1406; GFX6-NEXT:    s_max_i32 s3, s1, -1
1407; GFX6-NEXT:    s_sub_i32 s3, s3, s6
1408; GFX6-NEXT:    s_min_i32 s8, s1, -1
1409; GFX6-NEXT:    s_sub_i32 s8, s8, s7
1410; GFX6-NEXT:    s_max_i32 s3, s3, s4
1411; GFX6-NEXT:    s_min_i32 s3, s3, s8
1412; GFX6-NEXT:    s_sub_i32 s1, s1, s3
1413; GFX6-NEXT:    s_max_i32 s3, s2, -1
1414; GFX6-NEXT:    s_sub_i32 s3, s3, s6
1415; GFX6-NEXT:    s_min_i32 s4, s2, -1
1416; GFX6-NEXT:    s_sub_i32 s4, s4, s7
1417; GFX6-NEXT:    s_max_i32 s3, s3, s5
1418; GFX6-NEXT:    s_min_i32 s3, s3, s4
1419; GFX6-NEXT:    s_sub_i32 s2, s2, s3
1420; GFX6-NEXT:    ; return to shader part epilog
1421;
1422; GFX8-LABEL: s_ssubsat_v3i32:
1423; GFX8:       ; %bb.0:
1424; GFX8-NEXT:    s_brev_b32 s6, -2
1425; GFX8-NEXT:    s_max_i32 s8, s0, -1
1426; GFX8-NEXT:    s_brev_b32 s7, 1
1427; GFX8-NEXT:    s_sub_i32 s8, s8, s6
1428; GFX8-NEXT:    s_min_i32 s9, s0, -1
1429; GFX8-NEXT:    s_sub_i32 s9, s9, s7
1430; GFX8-NEXT:    s_max_i32 s3, s8, s3
1431; GFX8-NEXT:    s_min_i32 s3, s3, s9
1432; GFX8-NEXT:    s_sub_i32 s0, s0, s3
1433; GFX8-NEXT:    s_max_i32 s3, s1, -1
1434; GFX8-NEXT:    s_sub_i32 s3, s3, s6
1435; GFX8-NEXT:    s_min_i32 s8, s1, -1
1436; GFX8-NEXT:    s_sub_i32 s8, s8, s7
1437; GFX8-NEXT:    s_max_i32 s3, s3, s4
1438; GFX8-NEXT:    s_min_i32 s3, s3, s8
1439; GFX8-NEXT:    s_sub_i32 s1, s1, s3
1440; GFX8-NEXT:    s_max_i32 s3, s2, -1
1441; GFX8-NEXT:    s_sub_i32 s3, s3, s6
1442; GFX8-NEXT:    s_min_i32 s4, s2, -1
1443; GFX8-NEXT:    s_sub_i32 s4, s4, s7
1444; GFX8-NEXT:    s_max_i32 s3, s3, s5
1445; GFX8-NEXT:    s_min_i32 s3, s3, s4
1446; GFX8-NEXT:    s_sub_i32 s2, s2, s3
1447; GFX8-NEXT:    ; return to shader part epilog
1448;
1449; GFX9-LABEL: s_ssubsat_v3i32:
1450; GFX9:       ; %bb.0:
1451; GFX9-NEXT:    v_mov_b32_e32 v0, s3
1452; GFX9-NEXT:    v_mov_b32_e32 v1, s4
1453; GFX9-NEXT:    v_mov_b32_e32 v2, s5
1454; GFX9-NEXT:    v_sub_i32 v0, s0, v0 clamp
1455; GFX9-NEXT:    v_sub_i32 v1, s1, v1 clamp
1456; GFX9-NEXT:    v_sub_i32 v2, s2, v2 clamp
1457; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1458; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
1459; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
1460; GFX9-NEXT:    ; return to shader part epilog
1461;
1462; GFX10-LABEL: s_ssubsat_v3i32:
1463; GFX10:       ; %bb.0:
1464; GFX10-NEXT:    v_sub_nc_i32 v0, s0, s3 clamp
1465; GFX10-NEXT:    v_sub_nc_i32 v1, s1, s4 clamp
1466; GFX10-NEXT:    v_sub_nc_i32 v2, s2, s5 clamp
1467; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
1468; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
1469; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
1470; GFX10-NEXT:    ; return to shader part epilog
1471  %result = call <3 x i32> @llvm.ssub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
1472  ret <3 x i32> %result
1473}
1474
; Signed saturating subtract (llvm.ssub.sat) of <4 x i32> using the default
; calling convention. The GFX6 and GFX8 checks expect a per-element
; max/min/sub expansion; the GFX9 and GFX10 checks expect one clamped
; v_sub_i32 / v_sub_nc_i32 per element.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file), so regenerate them instead of editing them by hand.
1475define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
1476; GFX6-LABEL: v_ssubsat_v4i32:
1477; GFX6:       ; %bb.0:
1478; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1479; GFX6-NEXT:    s_brev_b32 s4, -2
1480; GFX6-NEXT:    v_max_i32_e32 v8, -1, v0
1481; GFX6-NEXT:    s_brev_b32 s5, 1
1482; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, s4, v8
1483; GFX6-NEXT:    v_min_i32_e32 v9, -1, v0
1484; GFX6-NEXT:    v_subrev_i32_e32 v9, vcc, s5, v9
1485; GFX6-NEXT:    v_max_i32_e32 v4, v8, v4
1486; GFX6-NEXT:    v_min_i32_e32 v4, v4, v9
1487; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
1488; GFX6-NEXT:    v_max_i32_e32 v4, -1, v1
1489; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v4
1490; GFX6-NEXT:    v_min_i32_e32 v8, -1, v1
1491; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, s5, v8
1492; GFX6-NEXT:    v_max_i32_e32 v4, v4, v5
1493; GFX6-NEXT:    v_min_i32_e32 v4, v4, v8
1494; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
1495; GFX6-NEXT:    v_max_i32_e32 v4, -1, v2
1496; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v4
1497; GFX6-NEXT:    v_min_i32_e32 v5, -1, v2
1498; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s5, v5
1499; GFX6-NEXT:    v_max_i32_e32 v4, v4, v6
1500; GFX6-NEXT:    v_min_i32_e32 v4, v4, v5
1501; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v4
1502; GFX6-NEXT:    v_max_i32_e32 v4, -1, v3
1503; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, 0x7fffffff, v4
1504; GFX6-NEXT:    v_min_i32_e32 v5, -1, v3
1505; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, 0x80000000, v5
1506; GFX6-NEXT:    v_max_i32_e32 v4, v4, v7
1507; GFX6-NEXT:    v_min_i32_e32 v4, v4, v5
1508; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v4
1509; GFX6-NEXT:    s_setpc_b64 s[30:31]
1510;
1511; GFX8-LABEL: v_ssubsat_v4i32:
1512; GFX8:       ; %bb.0:
1513; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1514; GFX8-NEXT:    s_brev_b32 s4, -2
1515; GFX8-NEXT:    v_max_i32_e32 v8, -1, v0
1516; GFX8-NEXT:    s_brev_b32 s5, 1
1517; GFX8-NEXT:    v_subrev_u32_e32 v8, vcc, s4, v8
1518; GFX8-NEXT:    v_min_i32_e32 v9, -1, v0
1519; GFX8-NEXT:    v_subrev_u32_e32 v9, vcc, s5, v9
1520; GFX8-NEXT:    v_max_i32_e32 v4, v8, v4
1521; GFX8-NEXT:    v_min_i32_e32 v4, v4, v9
1522; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v4
1523; GFX8-NEXT:    v_max_i32_e32 v4, -1, v1
1524; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s4, v4
1525; GFX8-NEXT:    v_min_i32_e32 v8, -1, v1
1526; GFX8-NEXT:    v_subrev_u32_e32 v8, vcc, s5, v8
1527; GFX8-NEXT:    v_max_i32_e32 v4, v4, v5
1528; GFX8-NEXT:    v_min_i32_e32 v4, v4, v8
1529; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v1, v4
1530; GFX8-NEXT:    v_max_i32_e32 v4, -1, v2
1531; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s4, v4
1532; GFX8-NEXT:    v_min_i32_e32 v5, -1, v2
1533; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, s5, v5
1534; GFX8-NEXT:    v_max_i32_e32 v4, v4, v6
1535; GFX8-NEXT:    v_min_i32_e32 v4, v4, v5
1536; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v4
1537; GFX8-NEXT:    v_max_i32_e32 v4, -1, v3
1538; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, 0x7fffffff, v4
1539; GFX8-NEXT:    v_min_i32_e32 v5, -1, v3
1540; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, 0x80000000, v5
1541; GFX8-NEXT:    v_max_i32_e32 v4, v4, v7
1542; GFX8-NEXT:    v_min_i32_e32 v4, v4, v5
1543; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, v3, v4
1544; GFX8-NEXT:    s_setpc_b64 s[30:31]
1545;
1546; GFX9-LABEL: v_ssubsat_v4i32:
1547; GFX9:       ; %bb.0:
1548; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1549; GFX9-NEXT:    v_sub_i32 v0, v0, v4 clamp
1550; GFX9-NEXT:    v_sub_i32 v1, v1, v5 clamp
1551; GFX9-NEXT:    v_sub_i32 v2, v2, v6 clamp
1552; GFX9-NEXT:    v_sub_i32 v3, v3, v7 clamp
1553; GFX9-NEXT:    s_setpc_b64 s[30:31]
1554;
1555; GFX10-LABEL: v_ssubsat_v4i32:
1556; GFX10:       ; %bb.0:
1557; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1558; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1559; GFX10-NEXT:    v_sub_nc_i32 v0, v0, v4 clamp
1560; GFX10-NEXT:    v_sub_nc_i32 v1, v1, v5 clamp
1561; GFX10-NEXT:    v_sub_nc_i32 v2, v2, v6 clamp
1562; GFX10-NEXT:    v_sub_nc_i32 v3, v3, v7 clamp
1563; GFX10-NEXT:    s_setpc_b64 s[30:31]
1564  %result = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
1565  ret <4 x i32> %result
1566}
1567
; amdgpu_ps variant of the <4 x i32> llvm.ssub.sat test with both operands
; marked inreg. The GFX6 and GFX8 checks expect a scalar s_max/s_min/s_sub
; expansion per element. The GFX9 checks expect the rhs to be copied into
; VGPRs, a clamped v_sub_i32, and v_readfirstlane_b32 moving each result back
; into an SGPR; the GFX10 checks expect v_sub_nc_i32 to read both SGPR
; operands directly before the same v_readfirstlane_b32 copies.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file), so regenerate them instead of editing them by hand.
1568define amdgpu_ps <4 x i32> @s_ssubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inreg %rhs) {
1569; GFX6-LABEL: s_ssubsat_v4i32:
1570; GFX6:       ; %bb.0:
1571; GFX6-NEXT:    s_brev_b32 s8, -2
1572; GFX6-NEXT:    s_max_i32 s10, s0, -1
1573; GFX6-NEXT:    s_brev_b32 s9, 1
1574; GFX6-NEXT:    s_sub_i32 s10, s10, s8
1575; GFX6-NEXT:    s_min_i32 s11, s0, -1
1576; GFX6-NEXT:    s_sub_i32 s11, s11, s9
1577; GFX6-NEXT:    s_max_i32 s4, s10, s4
1578; GFX6-NEXT:    s_min_i32 s4, s4, s11
1579; GFX6-NEXT:    s_sub_i32 s0, s0, s4
1580; GFX6-NEXT:    s_max_i32 s4, s1, -1
1581; GFX6-NEXT:    s_sub_i32 s4, s4, s8
1582; GFX6-NEXT:    s_min_i32 s10, s1, -1
1583; GFX6-NEXT:    s_sub_i32 s10, s10, s9
1584; GFX6-NEXT:    s_max_i32 s4, s4, s5
1585; GFX6-NEXT:    s_min_i32 s4, s4, s10
1586; GFX6-NEXT:    s_sub_i32 s1, s1, s4
1587; GFX6-NEXT:    s_max_i32 s4, s2, -1
1588; GFX6-NEXT:    s_sub_i32 s4, s4, s8
1589; GFX6-NEXT:    s_min_i32 s5, s2, -1
1590; GFX6-NEXT:    s_sub_i32 s5, s5, s9
1591; GFX6-NEXT:    s_max_i32 s4, s4, s6
1592; GFX6-NEXT:    s_min_i32 s4, s4, s5
1593; GFX6-NEXT:    s_sub_i32 s2, s2, s4
1594; GFX6-NEXT:    s_max_i32 s4, s3, -1
1595; GFX6-NEXT:    s_sub_i32 s4, s4, s8
1596; GFX6-NEXT:    s_min_i32 s5, s3, -1
1597; GFX6-NEXT:    s_sub_i32 s5, s5, s9
1598; GFX6-NEXT:    s_max_i32 s4, s4, s7
1599; GFX6-NEXT:    s_min_i32 s4, s4, s5
1600; GFX6-NEXT:    s_sub_i32 s3, s3, s4
1601; GFX6-NEXT:    ; return to shader part epilog
1602;
1603; GFX8-LABEL: s_ssubsat_v4i32:
1604; GFX8:       ; %bb.0:
1605; GFX8-NEXT:    s_brev_b32 s8, -2
1606; GFX8-NEXT:    s_max_i32 s10, s0, -1
1607; GFX8-NEXT:    s_brev_b32 s9, 1
1608; GFX8-NEXT:    s_sub_i32 s10, s10, s8
1609; GFX8-NEXT:    s_min_i32 s11, s0, -1
1610; GFX8-NEXT:    s_sub_i32 s11, s11, s9
1611; GFX8-NEXT:    s_max_i32 s4, s10, s4
1612; GFX8-NEXT:    s_min_i32 s4, s4, s11
1613; GFX8-NEXT:    s_sub_i32 s0, s0, s4
1614; GFX8-NEXT:    s_max_i32 s4, s1, -1
1615; GFX8-NEXT:    s_sub_i32 s4, s4, s8
1616; GFX8-NEXT:    s_min_i32 s10, s1, -1
1617; GFX8-NEXT:    s_sub_i32 s10, s10, s9
1618; GFX8-NEXT:    s_max_i32 s4, s4, s5
1619; GFX8-NEXT:    s_min_i32 s4, s4, s10
1620; GFX8-NEXT:    s_sub_i32 s1, s1, s4
1621; GFX8-NEXT:    s_max_i32 s4, s2, -1
1622; GFX8-NEXT:    s_sub_i32 s4, s4, s8
1623; GFX8-NEXT:    s_min_i32 s5, s2, -1
1624; GFX8-NEXT:    s_sub_i32 s5, s5, s9
1625; GFX8-NEXT:    s_max_i32 s4, s4, s6
1626; GFX8-NEXT:    s_min_i32 s4, s4, s5
1627; GFX8-NEXT:    s_sub_i32 s2, s2, s4
1628; GFX8-NEXT:    s_max_i32 s4, s3, -1
1629; GFX8-NEXT:    s_sub_i32 s4, s4, s8
1630; GFX8-NEXT:    s_min_i32 s5, s3, -1
1631; GFX8-NEXT:    s_sub_i32 s5, s5, s9
1632; GFX8-NEXT:    s_max_i32 s4, s4, s7
1633; GFX8-NEXT:    s_min_i32 s4, s4, s5
1634; GFX8-NEXT:    s_sub_i32 s3, s3, s4
1635; GFX8-NEXT:    ; return to shader part epilog
1636;
1637; GFX9-LABEL: s_ssubsat_v4i32:
1638; GFX9:       ; %bb.0:
1639; GFX9-NEXT:    v_mov_b32_e32 v0, s4
1640; GFX9-NEXT:    v_mov_b32_e32 v1, s5
1641; GFX9-NEXT:    v_mov_b32_e32 v2, s6
1642; GFX9-NEXT:    v_mov_b32_e32 v3, s7
1643; GFX9-NEXT:    v_sub_i32 v0, s0, v0 clamp
1644; GFX9-NEXT:    v_sub_i32 v1, s1, v1 clamp
1645; GFX9-NEXT:    v_sub_i32 v2, s2, v2 clamp
1646; GFX9-NEXT:    v_sub_i32 v3, s3, v3 clamp
1647; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1648; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
1649; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
1650; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
1651; GFX9-NEXT:    ; return to shader part epilog
1652;
1653; GFX10-LABEL: s_ssubsat_v4i32:
1654; GFX10:       ; %bb.0:
1655; GFX10-NEXT:    v_sub_nc_i32 v0, s0, s4 clamp
1656; GFX10-NEXT:    v_sub_nc_i32 v1, s1, s5 clamp
1657; GFX10-NEXT:    v_sub_nc_i32 v2, s2, s6 clamp
1658; GFX10-NEXT:    v_sub_nc_i32 v3, s3, s7 clamp
1659; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
1660; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
1661; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
1662; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
1663; GFX10-NEXT:    ; return to shader part epilog
1664  %result = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
1665  ret <4 x i32> %result
1666}
1667
; Signed saturating subtract (llvm.ssub.sat) of <5 x i32> using the default
; calling convention. The GFX6 and GFX8 checks expect a per-element
; max/min/sub expansion; the first three elements subtract the constants held
; in s4/s5 (s_brev_b32), while the last two subtract copies materialized in
; v11/v13 (v_bfrev_b32). The GFX9 and GFX10 checks expect one clamped
; v_sub_i32 / v_sub_nc_i32 per element.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file), so regenerate them instead of editing them by hand.
1668define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
1669; GFX6-LABEL: v_ssubsat_v5i32:
1670; GFX6:       ; %bb.0:
1671; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1672; GFX6-NEXT:    s_brev_b32 s4, -2
1673; GFX6-NEXT:    v_max_i32_e32 v10, -1, v0
1674; GFX6-NEXT:    s_brev_b32 s5, 1
1675; GFX6-NEXT:    v_subrev_i32_e32 v10, vcc, s4, v10
1676; GFX6-NEXT:    v_min_i32_e32 v12, -1, v0
1677; GFX6-NEXT:    v_subrev_i32_e32 v12, vcc, s5, v12
1678; GFX6-NEXT:    v_max_i32_e32 v5, v10, v5
1679; GFX6-NEXT:    v_min_i32_e32 v5, v5, v12
1680; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
1681; GFX6-NEXT:    v_max_i32_e32 v5, -1, v1
1682; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s4, v5
1683; GFX6-NEXT:    v_min_i32_e32 v10, -1, v1
1684; GFX6-NEXT:    v_subrev_i32_e32 v10, vcc, s5, v10
1685; GFX6-NEXT:    v_max_i32_e32 v5, v5, v6
1686; GFX6-NEXT:    v_min_i32_e32 v5, v5, v10
1687; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v5
1688; GFX6-NEXT:    v_max_i32_e32 v5, -1, v2
1689; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s4, v5
1690; GFX6-NEXT:    v_min_i32_e32 v6, -1, v2
1691; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, s5, v6
1692; GFX6-NEXT:    v_max_i32_e32 v5, v5, v7
1693; GFX6-NEXT:    v_min_i32_e32 v5, v5, v6
1694; GFX6-NEXT:    v_bfrev_b32_e32 v11, -2
1695; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v5
1696; GFX6-NEXT:    v_max_i32_e32 v5, -1, v3
1697; GFX6-NEXT:    v_bfrev_b32_e32 v13, 1
1698; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v11
1699; GFX6-NEXT:    v_min_i32_e32 v6, -1, v3
1700; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v13
1701; GFX6-NEXT:    v_max_i32_e32 v5, v5, v8
1702; GFX6-NEXT:    v_min_i32_e32 v5, v5, v6
1703; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v5
1704; GFX6-NEXT:    v_max_i32_e32 v5, -1, v4
1705; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v11
1706; GFX6-NEXT:    v_min_i32_e32 v6, -1, v4
1707; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v13
1708; GFX6-NEXT:    v_max_i32_e32 v5, v5, v9
1709; GFX6-NEXT:    v_min_i32_e32 v5, v5, v6
1710; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v5
1711; GFX6-NEXT:    s_setpc_b64 s[30:31]
1712;
1713; GFX8-LABEL: v_ssubsat_v5i32:
1714; GFX8:       ; %bb.0:
1715; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1716; GFX8-NEXT:    s_brev_b32 s4, -2
1717; GFX8-NEXT:    v_max_i32_e32 v10, -1, v0
1718; GFX8-NEXT:    s_brev_b32 s5, 1
1719; GFX8-NEXT:    v_subrev_u32_e32 v10, vcc, s4, v10
1720; GFX8-NEXT:    v_min_i32_e32 v12, -1, v0
1721; GFX8-NEXT:    v_subrev_u32_e32 v12, vcc, s5, v12
1722; GFX8-NEXT:    v_max_i32_e32 v5, v10, v5
1723; GFX8-NEXT:    v_min_i32_e32 v5, v5, v12
1724; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v5
1725; GFX8-NEXT:    v_max_i32_e32 v5, -1, v1
1726; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, s4, v5
1727; GFX8-NEXT:    v_min_i32_e32 v10, -1, v1
1728; GFX8-NEXT:    v_subrev_u32_e32 v10, vcc, s5, v10
1729; GFX8-NEXT:    v_max_i32_e32 v5, v5, v6
1730; GFX8-NEXT:    v_min_i32_e32 v5, v5, v10
1731; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v1, v5
1732; GFX8-NEXT:    v_max_i32_e32 v5, -1, v2
1733; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, s4, v5
1734; GFX8-NEXT:    v_min_i32_e32 v6, -1, v2
1735; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s5, v6
1736; GFX8-NEXT:    v_max_i32_e32 v5, v5, v7
1737; GFX8-NEXT:    v_min_i32_e32 v5, v5, v6
1738; GFX8-NEXT:    v_bfrev_b32_e32 v11, -2
1739; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v5
1740; GFX8-NEXT:    v_max_i32_e32 v5, -1, v3
1741; GFX8-NEXT:    v_bfrev_b32_e32 v13, 1
1742; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, v5, v11
1743; GFX8-NEXT:    v_min_i32_e32 v6, -1, v3
1744; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, v6, v13
1745; GFX8-NEXT:    v_max_i32_e32 v5, v5, v8
1746; GFX8-NEXT:    v_min_i32_e32 v5, v5, v6
1747; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, v3, v5
1748; GFX8-NEXT:    v_max_i32_e32 v5, -1, v4
1749; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, v5, v11
1750; GFX8-NEXT:    v_min_i32_e32 v6, -1, v4
1751; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, v6, v13
1752; GFX8-NEXT:    v_max_i32_e32 v5, v5, v9
1753; GFX8-NEXT:    v_min_i32_e32 v5, v5, v6
1754; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v4, v5
1755; GFX8-NEXT:    s_setpc_b64 s[30:31]
1756;
1757; GFX9-LABEL: v_ssubsat_v5i32:
1758; GFX9:       ; %bb.0:
1759; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1760; GFX9-NEXT:    v_sub_i32 v0, v0, v5 clamp
1761; GFX9-NEXT:    v_sub_i32 v1, v1, v6 clamp
1762; GFX9-NEXT:    v_sub_i32 v2, v2, v7 clamp
1763; GFX9-NEXT:    v_sub_i32 v3, v3, v8 clamp
1764; GFX9-NEXT:    v_sub_i32 v4, v4, v9 clamp
1765; GFX9-NEXT:    s_setpc_b64 s[30:31]
1766;
1767; GFX10-LABEL: v_ssubsat_v5i32:
1768; GFX10:       ; %bb.0:
1769; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1770; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1771; GFX10-NEXT:    v_sub_nc_i32 v0, v0, v5 clamp
1772; GFX10-NEXT:    v_sub_nc_i32 v1, v1, v6 clamp
1773; GFX10-NEXT:    v_sub_nc_i32 v2, v2, v7 clamp
1774; GFX10-NEXT:    v_sub_nc_i32 v3, v3, v8 clamp
1775; GFX10-NEXT:    v_sub_nc_i32 v4, v4, v9 clamp
1776; GFX10-NEXT:    s_setpc_b64 s[30:31]
1777  %result = call <5 x i32> @llvm.ssub.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs)
1778  ret <5 x i32> %result
1779}
1780
; amdgpu_ps variant of the <5 x i32> llvm.ssub.sat test with both operands
; marked inreg. The GFX6 and GFX8 checks expect a scalar s_max/s_min/s_sub
; expansion per element. The GFX9 checks expect the rhs to be copied into
; VGPRs, a clamped v_sub_i32, and v_readfirstlane_b32 moving each result back
; into an SGPR; the GFX10 checks expect v_sub_nc_i32 to read both SGPR
; operands directly before the same v_readfirstlane_b32 copies.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file), so regenerate them instead of editing them by hand.
1781define amdgpu_ps <5 x i32> @s_ssubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inreg %rhs) {
1782; GFX6-LABEL: s_ssubsat_v5i32:
1783; GFX6:       ; %bb.0:
1784; GFX6-NEXT:    s_brev_b32 s10, -2
1785; GFX6-NEXT:    s_max_i32 s12, s0, -1
1786; GFX6-NEXT:    s_brev_b32 s11, 1
1787; GFX6-NEXT:    s_sub_i32 s12, s12, s10
1788; GFX6-NEXT:    s_min_i32 s13, s0, -1
1789; GFX6-NEXT:    s_sub_i32 s13, s13, s11
1790; GFX6-NEXT:    s_max_i32 s5, s12, s5
1791; GFX6-NEXT:    s_min_i32 s5, s5, s13
1792; GFX6-NEXT:    s_sub_i32 s0, s0, s5
1793; GFX6-NEXT:    s_max_i32 s5, s1, -1
1794; GFX6-NEXT:    s_sub_i32 s5, s5, s10
1795; GFX6-NEXT:    s_min_i32 s12, s1, -1
1796; GFX6-NEXT:    s_sub_i32 s12, s12, s11
1797; GFX6-NEXT:    s_max_i32 s5, s5, s6
1798; GFX6-NEXT:    s_min_i32 s5, s5, s12
1799; GFX6-NEXT:    s_sub_i32 s1, s1, s5
1800; GFX6-NEXT:    s_max_i32 s5, s2, -1
1801; GFX6-NEXT:    s_sub_i32 s5, s5, s10
1802; GFX6-NEXT:    s_min_i32 s6, s2, -1
1803; GFX6-NEXT:    s_sub_i32 s6, s6, s11
1804; GFX6-NEXT:    s_max_i32 s5, s5, s7
1805; GFX6-NEXT:    s_min_i32 s5, s5, s6
1806; GFX6-NEXT:    s_sub_i32 s2, s2, s5
1807; GFX6-NEXT:    s_max_i32 s5, s3, -1
1808; GFX6-NEXT:    s_sub_i32 s5, s5, s10
1809; GFX6-NEXT:    s_min_i32 s6, s3, -1
1810; GFX6-NEXT:    s_sub_i32 s6, s6, s11
1811; GFX6-NEXT:    s_max_i32 s5, s5, s8
1812; GFX6-NEXT:    s_min_i32 s5, s5, s6
1813; GFX6-NEXT:    s_sub_i32 s3, s3, s5
1814; GFX6-NEXT:    s_max_i32 s5, s4, -1
1815; GFX6-NEXT:    s_sub_i32 s5, s5, s10
1816; GFX6-NEXT:    s_min_i32 s6, s4, -1
1817; GFX6-NEXT:    s_sub_i32 s6, s6, s11
1818; GFX6-NEXT:    s_max_i32 s5, s5, s9
1819; GFX6-NEXT:    s_min_i32 s5, s5, s6
1820; GFX6-NEXT:    s_sub_i32 s4, s4, s5
1821; GFX6-NEXT:    ; return to shader part epilog
1822;
1823; GFX8-LABEL: s_ssubsat_v5i32:
1824; GFX8:       ; %bb.0:
1825; GFX8-NEXT:    s_brev_b32 s10, -2
1826; GFX8-NEXT:    s_max_i32 s12, s0, -1
1827; GFX8-NEXT:    s_brev_b32 s11, 1
1828; GFX8-NEXT:    s_sub_i32 s12, s12, s10
1829; GFX8-NEXT:    s_min_i32 s13, s0, -1
1830; GFX8-NEXT:    s_sub_i32 s13, s13, s11
1831; GFX8-NEXT:    s_max_i32 s5, s12, s5
1832; GFX8-NEXT:    s_min_i32 s5, s5, s13
1833; GFX8-NEXT:    s_sub_i32 s0, s0, s5
1834; GFX8-NEXT:    s_max_i32 s5, s1, -1
1835; GFX8-NEXT:    s_sub_i32 s5, s5, s10
1836; GFX8-NEXT:    s_min_i32 s12, s1, -1
1837; GFX8-NEXT:    s_sub_i32 s12, s12, s11
1838; GFX8-NEXT:    s_max_i32 s5, s5, s6
1839; GFX8-NEXT:    s_min_i32 s5, s5, s12
1840; GFX8-NEXT:    s_sub_i32 s1, s1, s5
1841; GFX8-NEXT:    s_max_i32 s5, s2, -1
1842; GFX8-NEXT:    s_sub_i32 s5, s5, s10
1843; GFX8-NEXT:    s_min_i32 s6, s2, -1
1844; GFX8-NEXT:    s_sub_i32 s6, s6, s11
1845; GFX8-NEXT:    s_max_i32 s5, s5, s7
1846; GFX8-NEXT:    s_min_i32 s5, s5, s6
1847; GFX8-NEXT:    s_sub_i32 s2, s2, s5
1848; GFX8-NEXT:    s_max_i32 s5, s3, -1
1849; GFX8-NEXT:    s_sub_i32 s5, s5, s10
1850; GFX8-NEXT:    s_min_i32 s6, s3, -1
1851; GFX8-NEXT:    s_sub_i32 s6, s6, s11
1852; GFX8-NEXT:    s_max_i32 s5, s5, s8
1853; GFX8-NEXT:    s_min_i32 s5, s5, s6
1854; GFX8-NEXT:    s_sub_i32 s3, s3, s5
1855; GFX8-NEXT:    s_max_i32 s5, s4, -1
1856; GFX8-NEXT:    s_sub_i32 s5, s5, s10
1857; GFX8-NEXT:    s_min_i32 s6, s4, -1
1858; GFX8-NEXT:    s_sub_i32 s6, s6, s11
1859; GFX8-NEXT:    s_max_i32 s5, s5, s9
1860; GFX8-NEXT:    s_min_i32 s5, s5, s6
1861; GFX8-NEXT:    s_sub_i32 s4, s4, s5
1862; GFX8-NEXT:    ; return to shader part epilog
1863;
1864; GFX9-LABEL: s_ssubsat_v5i32:
1865; GFX9:       ; %bb.0:
1866; GFX9-NEXT:    v_mov_b32_e32 v0, s5
1867; GFX9-NEXT:    v_mov_b32_e32 v1, s6
1868; GFX9-NEXT:    v_mov_b32_e32 v2, s7
1869; GFX9-NEXT:    v_mov_b32_e32 v3, s8
1870; GFX9-NEXT:    v_mov_b32_e32 v4, s9
1871; GFX9-NEXT:    v_sub_i32 v0, s0, v0 clamp
1872; GFX9-NEXT:    v_sub_i32 v1, s1, v1 clamp
1873; GFX9-NEXT:    v_sub_i32 v2, s2, v2 clamp
1874; GFX9-NEXT:    v_sub_i32 v3, s3, v3 clamp
1875; GFX9-NEXT:    v_sub_i32 v4, s4, v4 clamp
1876; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1877; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
1878; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
1879; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
1880; GFX9-NEXT:    v_readfirstlane_b32 s4, v4
1881; GFX9-NEXT:    ; return to shader part epilog
1882;
1883; GFX10-LABEL: s_ssubsat_v5i32:
1884; GFX10:       ; %bb.0:
1885; GFX10-NEXT:    v_sub_nc_i32 v0, s0, s5 clamp
1886; GFX10-NEXT:    v_sub_nc_i32 v1, s1, s6 clamp
1887; GFX10-NEXT:    v_sub_nc_i32 v2, s2, s7 clamp
1888; GFX10-NEXT:    v_sub_nc_i32 v3, s3, s8 clamp
1889; GFX10-NEXT:    v_sub_nc_i32 v4, s4, s9 clamp
1890; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
1891; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
1892; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
1893; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
1894; GFX10-NEXT:    v_readfirstlane_b32 s4, v4
1895; GFX10-NEXT:    ; return to shader part epilog
1896  %result = call <5 x i32> @llvm.ssub.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs)
1897  ret <5 x i32> %result
1898}
1899
; Signed saturating subtract (llvm.ssub.sat) of <16 x i32> using the default
; calling convention. The GFX6 and GFX8 checks expect a per-element
; max/min/sub expansion; the first three elements subtract the constants held
; in s4/s5 (s_brev_b32), while the remaining elements subtract copies
; materialized in v16/v18 (v_bfrev_b32). The GFX9 and GFX10 checks expect one
; clamped v_sub_i32 / v_sub_nc_i32 per element.
; The assertions below are autogenerated (see the NOTE on the first line of
; this file), so regenerate them instead of editing them by hand.
1900define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
1901; GFX6-LABEL: v_ssubsat_v16i32:
1902; GFX6:       ; %bb.0:
1903; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1904; GFX6-NEXT:    s_brev_b32 s4, -2
1905; GFX6-NEXT:    v_max_i32_e32 v32, -1, v0
1906; GFX6-NEXT:    v_subrev_i32_e32 v32, vcc, s4, v32
1907; GFX6-NEXT:    v_max_i32_e32 v16, v32, v16
1908; GFX6-NEXT:    s_brev_b32 s5, 1
1909; GFX6-NEXT:    v_min_i32_e32 v32, -1, v0
1910; GFX6-NEXT:    v_subrev_i32_e32 v32, vcc, s5, v32
1911; GFX6-NEXT:    v_min_i32_e32 v16, v16, v32
1912; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v16
1913; GFX6-NEXT:    v_max_i32_e32 v16, -1, v1
1914; GFX6-NEXT:    v_subrev_i32_e32 v16, vcc, s4, v16
1915; GFX6-NEXT:    v_max_i32_e32 v16, v16, v17
1916; GFX6-NEXT:    v_min_i32_e32 v17, -1, v1
1917; GFX6-NEXT:    v_subrev_i32_e32 v17, vcc, s5, v17
1918; GFX6-NEXT:    v_min_i32_e32 v16, v16, v17
1919; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v16
1920; GFX6-NEXT:    v_max_i32_e32 v16, -1, v2
1921; GFX6-NEXT:    v_subrev_i32_e32 v16, vcc, s4, v16
1922; GFX6-NEXT:    v_min_i32_e32 v17, -1, v2
1923; GFX6-NEXT:    v_max_i32_e32 v16, v16, v18
1924; GFX6-NEXT:    v_subrev_i32_e32 v17, vcc, s5, v17
1925; GFX6-NEXT:    v_min_i32_e32 v16, v16, v17
1926; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v16
1927; GFX6-NEXT:    v_bfrev_b32_e32 v16, -2
1928; GFX6-NEXT:    v_max_i32_e32 v17, -1, v3
1929; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v16
1930; GFX6-NEXT:    v_max_i32_e32 v17, v17, v19
1931; GFX6-NEXT:    v_bfrev_b32_e32 v18, 1
1932; GFX6-NEXT:    v_min_i32_e32 v19, -1, v3
1933; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v18
1934; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
1935; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v17
1936; GFX6-NEXT:    v_max_i32_e32 v17, -1, v4
1937; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v16
1938; GFX6-NEXT:    v_min_i32_e32 v19, -1, v4
1939; GFX6-NEXT:    v_max_i32_e32 v17, v17, v20
1940; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v18
1941; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
1942; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v17
1943; GFX6-NEXT:    v_max_i32_e32 v17, -1, v5
1944; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v16
1945; GFX6-NEXT:    v_min_i32_e32 v19, -1, v5
1946; GFX6-NEXT:    v_max_i32_e32 v17, v17, v21
1947; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v18
1948; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
1949; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v17
1950; GFX6-NEXT:    v_max_i32_e32 v17, -1, v6
1951; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v16
1952; GFX6-NEXT:    v_min_i32_e32 v19, -1, v6
1953; GFX6-NEXT:    v_max_i32_e32 v17, v17, v22
1954; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v18
1955; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
1956; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v17
1957; GFX6-NEXT:    v_max_i32_e32 v17, -1, v7
1958; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v16
1959; GFX6-NEXT:    v_min_i32_e32 v19, -1, v7
1960; GFX6-NEXT:    v_max_i32_e32 v17, v17, v23
1961; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v18
1962; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
1963; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v17
1964; GFX6-NEXT:    v_max_i32_e32 v17, -1, v8
1965; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v16
1966; GFX6-NEXT:    v_min_i32_e32 v19, -1, v8
1967; GFX6-NEXT:    v_max_i32_e32 v17, v17, v24
1968; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v18
1969; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
1970; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v8, v17
1971; GFX6-NEXT:    v_max_i32_e32 v17, -1, v9
1972; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v16
1973; GFX6-NEXT:    v_min_i32_e32 v19, -1, v9
1974; GFX6-NEXT:    v_max_i32_e32 v17, v17, v25
1975; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v18
1976; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
1977; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v17
1978; GFX6-NEXT:    v_max_i32_e32 v17, -1, v10
1979; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v16
1980; GFX6-NEXT:    v_min_i32_e32 v19, -1, v10
1981; GFX6-NEXT:    v_max_i32_e32 v17, v17, v26
1982; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v18
1983; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
1984; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v17
1985; GFX6-NEXT:    v_max_i32_e32 v17, -1, v11
1986; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v16
1987; GFX6-NEXT:    v_min_i32_e32 v19, -1, v11
1988; GFX6-NEXT:    v_max_i32_e32 v17, v17, v27
1989; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v18
1990; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
1991; GFX6-NEXT:    v_sub_i32_e32 v11, vcc, v11, v17
1992; GFX6-NEXT:    v_max_i32_e32 v17, -1, v12
1993; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v16
1994; GFX6-NEXT:    v_min_i32_e32 v19, -1, v12
1995; GFX6-NEXT:    v_max_i32_e32 v17, v17, v28
1996; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v18
1997; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
1998; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, v12, v17
1999; GFX6-NEXT:    v_max_i32_e32 v17, -1, v13
2000; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v16
2001; GFX6-NEXT:    v_min_i32_e32 v19, -1, v13
2002; GFX6-NEXT:    v_max_i32_e32 v17, v17, v29
2003; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v18
2004; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
2005; GFX6-NEXT:    v_sub_i32_e32 v13, vcc, v13, v17
2006; GFX6-NEXT:    v_max_i32_e32 v17, -1, v14
2007; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v16
2008; GFX6-NEXT:    v_min_i32_e32 v19, -1, v14
2009; GFX6-NEXT:    v_max_i32_e32 v17, v17, v30
2010; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v18
2011; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
2012; GFX6-NEXT:    v_sub_i32_e32 v14, vcc, v14, v17
2013; GFX6-NEXT:    v_max_i32_e32 v17, -1, v15
2014; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, v17, v16
2015; GFX6-NEXT:    v_min_i32_e32 v17, -1, v15
2016; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v18
2017; GFX6-NEXT:    v_max_i32_e32 v16, v16, v31
2018; GFX6-NEXT:    v_min_i32_e32 v16, v16, v17
2019; GFX6-NEXT:    v_sub_i32_e32 v15, vcc, v15, v16
2020; GFX6-NEXT:    s_setpc_b64 s[30:31]
2021;
2022; GFX8-LABEL: v_ssubsat_v16i32:
2023; GFX8:       ; %bb.0:
2024; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2025; GFX8-NEXT:    s_brev_b32 s4, -2
2026; GFX8-NEXT:    v_max_i32_e32 v32, -1, v0
2027; GFX8-NEXT:    v_subrev_u32_e32 v32, vcc, s4, v32
2028; GFX8-NEXT:    v_max_i32_e32 v16, v32, v16
2029; GFX8-NEXT:    s_brev_b32 s5, 1
2030; GFX8-NEXT:    v_min_i32_e32 v32, -1, v0
2031; GFX8-NEXT:    v_subrev_u32_e32 v32, vcc, s5, v32
2032; GFX8-NEXT:    v_min_i32_e32 v16, v16, v32
2033; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v16
2034; GFX8-NEXT:    v_max_i32_e32 v16, -1, v1
2035; GFX8-NEXT:    v_subrev_u32_e32 v16, vcc, s4, v16
2036; GFX8-NEXT:    v_max_i32_e32 v16, v16, v17
2037; GFX8-NEXT:    v_min_i32_e32 v17, -1, v1
2038; GFX8-NEXT:    v_subrev_u32_e32 v17, vcc, s5, v17
2039; GFX8-NEXT:    v_min_i32_e32 v16, v16, v17
2040; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v1, v16
2041; GFX8-NEXT:    v_max_i32_e32 v16, -1, v2
2042; GFX8-NEXT:    v_subrev_u32_e32 v16, vcc, s4, v16
2043; GFX8-NEXT:    v_min_i32_e32 v17, -1, v2
2044; GFX8-NEXT:    v_max_i32_e32 v16, v16, v18
2045; GFX8-NEXT:    v_subrev_u32_e32 v17, vcc, s5, v17
2046; GFX8-NEXT:    v_min_i32_e32 v16, v16, v17
2047; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v16
2048; GFX8-NEXT:    v_bfrev_b32_e32 v16, -2
2049; GFX8-NEXT:    v_max_i32_e32 v17, -1, v3
2050; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v16
2051; GFX8-NEXT:    v_max_i32_e32 v17, v17, v19
2052; GFX8-NEXT:    v_bfrev_b32_e32 v18, 1
2053; GFX8-NEXT:    v_min_i32_e32 v19, -1, v3
2054; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v18
2055; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
2056; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, v3, v17
2057; GFX8-NEXT:    v_max_i32_e32 v17, -1, v4
2058; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v16
2059; GFX8-NEXT:    v_min_i32_e32 v19, -1, v4
2060; GFX8-NEXT:    v_max_i32_e32 v17, v17, v20
2061; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v18
2062; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
2063; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v4, v17
2064; GFX8-NEXT:    v_max_i32_e32 v17, -1, v5
2065; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v16
2066; GFX8-NEXT:    v_min_i32_e32 v19, -1, v5
2067; GFX8-NEXT:    v_max_i32_e32 v17, v17, v21
2068; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v18
2069; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
2070; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, v5, v17
2071; GFX8-NEXT:    v_max_i32_e32 v17, -1, v6
2072; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v16
2073; GFX8-NEXT:    v_min_i32_e32 v19, -1, v6
2074; GFX8-NEXT:    v_max_i32_e32 v17, v17, v22
2075; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v18
2076; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
2077; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, v6, v17
2078; GFX8-NEXT:    v_max_i32_e32 v17, -1, v7
2079; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v16
2080; GFX8-NEXT:    v_min_i32_e32 v19, -1, v7
2081; GFX8-NEXT:    v_max_i32_e32 v17, v17, v23
2082; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v18
2083; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
2084; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, v7, v17
2085; GFX8-NEXT:    v_max_i32_e32 v17, -1, v8
2086; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v16
2087; GFX8-NEXT:    v_min_i32_e32 v19, -1, v8
2088; GFX8-NEXT:    v_max_i32_e32 v17, v17, v24
2089; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v18
2090; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
2091; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, v8, v17
2092; GFX8-NEXT:    v_max_i32_e32 v17, -1, v9
2093; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v16
2094; GFX8-NEXT:    v_min_i32_e32 v19, -1, v9
2095; GFX8-NEXT:    v_max_i32_e32 v17, v17, v25
2096; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v18
2097; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
2098; GFX8-NEXT:    v_sub_u32_e32 v9, vcc, v9, v17
2099; GFX8-NEXT:    v_max_i32_e32 v17, -1, v10
2100; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v16
2101; GFX8-NEXT:    v_min_i32_e32 v19, -1, v10
2102; GFX8-NEXT:    v_max_i32_e32 v17, v17, v26
2103; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v18
2104; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
2105; GFX8-NEXT:    v_sub_u32_e32 v10, vcc, v10, v17
2106; GFX8-NEXT:    v_max_i32_e32 v17, -1, v11
2107; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v16
2108; GFX8-NEXT:    v_min_i32_e32 v19, -1, v11
2109; GFX8-NEXT:    v_max_i32_e32 v17, v17, v27
2110; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v18
2111; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
2112; GFX8-NEXT:    v_sub_u32_e32 v11, vcc, v11, v17
2113; GFX8-NEXT:    v_max_i32_e32 v17, -1, v12
2114; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v16
2115; GFX8-NEXT:    v_min_i32_e32 v19, -1, v12
2116; GFX8-NEXT:    v_max_i32_e32 v17, v17, v28
2117; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v18
2118; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
2119; GFX8-NEXT:    v_sub_u32_e32 v12, vcc, v12, v17
2120; GFX8-NEXT:    v_max_i32_e32 v17, -1, v13
2121; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v16
2122; GFX8-NEXT:    v_min_i32_e32 v19, -1, v13
2123; GFX8-NEXT:    v_max_i32_e32 v17, v17, v29
2124; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v18
2125; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
2126; GFX8-NEXT:    v_sub_u32_e32 v13, vcc, v13, v17
2127; GFX8-NEXT:    v_max_i32_e32 v17, -1, v14
2128; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v16
2129; GFX8-NEXT:    v_min_i32_e32 v19, -1, v14
2130; GFX8-NEXT:    v_max_i32_e32 v17, v17, v30
2131; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v18
2132; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
2133; GFX8-NEXT:    v_sub_u32_e32 v14, vcc, v14, v17
2134; GFX8-NEXT:    v_max_i32_e32 v17, -1, v15
2135; GFX8-NEXT:    v_sub_u32_e32 v16, vcc, v17, v16
2136; GFX8-NEXT:    v_min_i32_e32 v17, -1, v15
2137; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v18
2138; GFX8-NEXT:    v_max_i32_e32 v16, v16, v31
2139; GFX8-NEXT:    v_min_i32_e32 v16, v16, v17
2140; GFX8-NEXT:    v_sub_u32_e32 v15, vcc, v15, v16
2141; GFX8-NEXT:    s_setpc_b64 s[30:31]
2142;
2143; GFX9-LABEL: v_ssubsat_v16i32:
2144; GFX9:       ; %bb.0:
2145; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2146; GFX9-NEXT:    v_sub_i32 v0, v0, v16 clamp
2147; GFX9-NEXT:    v_sub_i32 v1, v1, v17 clamp
2148; GFX9-NEXT:    v_sub_i32 v2, v2, v18 clamp
2149; GFX9-NEXT:    v_sub_i32 v3, v3, v19 clamp
2150; GFX9-NEXT:    v_sub_i32 v4, v4, v20 clamp
2151; GFX9-NEXT:    v_sub_i32 v5, v5, v21 clamp
2152; GFX9-NEXT:    v_sub_i32 v6, v6, v22 clamp
2153; GFX9-NEXT:    v_sub_i32 v7, v7, v23 clamp
2154; GFX9-NEXT:    v_sub_i32 v8, v8, v24 clamp
2155; GFX9-NEXT:    v_sub_i32 v9, v9, v25 clamp
2156; GFX9-NEXT:    v_sub_i32 v10, v10, v26 clamp
2157; GFX9-NEXT:    v_sub_i32 v11, v11, v27 clamp
2158; GFX9-NEXT:    v_sub_i32 v12, v12, v28 clamp
2159; GFX9-NEXT:    v_sub_i32 v13, v13, v29 clamp
2160; GFX9-NEXT:    v_sub_i32 v14, v14, v30 clamp
2161; GFX9-NEXT:    v_sub_i32 v15, v15, v31 clamp
2162; GFX9-NEXT:    s_setpc_b64 s[30:31]
2163;
2164; GFX10-LABEL: v_ssubsat_v16i32:
2165; GFX10:       ; %bb.0:
2166; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2167; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2168; GFX10-NEXT:    v_sub_nc_i32 v0, v0, v16 clamp
2169; GFX10-NEXT:    v_sub_nc_i32 v1, v1, v17 clamp
2170; GFX10-NEXT:    v_sub_nc_i32 v2, v2, v18 clamp
2171; GFX10-NEXT:    v_sub_nc_i32 v3, v3, v19 clamp
2172; GFX10-NEXT:    v_sub_nc_i32 v4, v4, v20 clamp
2173; GFX10-NEXT:    v_sub_nc_i32 v5, v5, v21 clamp
2174; GFX10-NEXT:    v_sub_nc_i32 v6, v6, v22 clamp
2175; GFX10-NEXT:    v_sub_nc_i32 v7, v7, v23 clamp
2176; GFX10-NEXT:    v_sub_nc_i32 v8, v8, v24 clamp
2177; GFX10-NEXT:    v_sub_nc_i32 v9, v9, v25 clamp
2178; GFX10-NEXT:    v_sub_nc_i32 v10, v10, v26 clamp
2179; GFX10-NEXT:    v_sub_nc_i32 v11, v11, v27 clamp
2180; GFX10-NEXT:    v_sub_nc_i32 v12, v12, v28 clamp
2181; GFX10-NEXT:    v_sub_nc_i32 v13, v13, v29 clamp
2182; GFX10-NEXT:    v_sub_nc_i32 v14, v14, v30 clamp
2183; GFX10-NEXT:    v_sub_nc_i32 v15, v15, v31 clamp
2184; GFX10-NEXT:    s_setpc_b64 s[30:31]
2185  %result = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
2186  ret <16 x i32> %result
2187}
2188
; Scalar signed saturating subtract of <16 x i32>: both operands are marked
; inreg in an amdgpu_ps function and the result comes from
; @llvm.ssub.sat.v16i32.
;
; Expected code, as recorded in the autogenerated checks below:
;  - GFX6 and GFX8 checks: no clamped subtract instruction is used; every
;    element is expanded with scalar ops as
;        lhs - min(max(rhs, max(lhs, -1) - s32), min(lhs, -1) - s33)
;    where s32 = s_brev_b32(-2), i.e. 0x7fffffff, and s33 = s_brev_b32(1),
;    i.e. 0x80000000, are set up once and reused for all 16 elements.
;  - GFX9 checks: the rhs SGPRs s16-s31 are copied to VGPRs, v_sub_i32 with
;    the clamp modifier does the subtract, and v_readfirstlane_b32 moves each
;    result back into s0-s15.
;  - GFX10 checks: v_sub_nc_i32 with clamp reads both SGPR operands directly,
;    so only the v_readfirstlane_b32 copies remain.
;
; Do not edit the check lines by hand; they are produced by
; utils/update_llc_test_checks.py (see the NOTE on the first line of the file).
2189define amdgpu_ps <16 x i32> @s_ssubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> inreg %rhs) {
2190; GFX6-LABEL: s_ssubsat_v16i32:
2191; GFX6:       ; %bb.0:
2192; GFX6-NEXT:    s_brev_b32 s32, -2
2193; GFX6-NEXT:    s_max_i32 s34, s0, -1
2194; GFX6-NEXT:    s_brev_b32 s33, 1
2195; GFX6-NEXT:    s_sub_i32 s34, s34, s32
2196; GFX6-NEXT:    s_min_i32 s35, s0, -1
2197; GFX6-NEXT:    s_sub_i32 s35, s35, s33
2198; GFX6-NEXT:    s_max_i32 s16, s34, s16
2199; GFX6-NEXT:    s_min_i32 s16, s16, s35
2200; GFX6-NEXT:    s_sub_i32 s0, s0, s16
2201; GFX6-NEXT:    s_max_i32 s16, s1, -1
2202; GFX6-NEXT:    s_sub_i32 s16, s16, s32
2203; GFX6-NEXT:    s_min_i32 s34, s1, -1
2204; GFX6-NEXT:    s_sub_i32 s34, s34, s33
2205; GFX6-NEXT:    s_max_i32 s16, s16, s17
2206; GFX6-NEXT:    s_min_i32 s16, s16, s34
2207; GFX6-NEXT:    s_sub_i32 s1, s1, s16
2208; GFX6-NEXT:    s_max_i32 s16, s2, -1
2209; GFX6-NEXT:    s_sub_i32 s16, s16, s32
2210; GFX6-NEXT:    s_min_i32 s17, s2, -1
2211; GFX6-NEXT:    s_sub_i32 s17, s17, s33
2212; GFX6-NEXT:    s_max_i32 s16, s16, s18
2213; GFX6-NEXT:    s_min_i32 s16, s16, s17
2214; GFX6-NEXT:    s_sub_i32 s2, s2, s16
2215; GFX6-NEXT:    s_max_i32 s16, s3, -1
2216; GFX6-NEXT:    s_sub_i32 s16, s16, s32
2217; GFX6-NEXT:    s_min_i32 s17, s3, -1
2218; GFX6-NEXT:    s_sub_i32 s17, s17, s33
2219; GFX6-NEXT:    s_max_i32 s16, s16, s19
2220; GFX6-NEXT:    s_min_i32 s16, s16, s17
2221; GFX6-NEXT:    s_sub_i32 s3, s3, s16
2222; GFX6-NEXT:    s_max_i32 s16, s4, -1
2223; GFX6-NEXT:    s_sub_i32 s16, s16, s32
2224; GFX6-NEXT:    s_min_i32 s17, s4, -1
2225; GFX6-NEXT:    s_sub_i32 s17, s17, s33
2226; GFX6-NEXT:    s_max_i32 s16, s16, s20
2227; GFX6-NEXT:    s_min_i32 s16, s16, s17
2228; GFX6-NEXT:    s_sub_i32 s4, s4, s16
2229; GFX6-NEXT:    s_max_i32 s16, s5, -1
2230; GFX6-NEXT:    s_sub_i32 s16, s16, s32
2231; GFX6-NEXT:    s_min_i32 s17, s5, -1
2232; GFX6-NEXT:    s_sub_i32 s17, s17, s33
2233; GFX6-NEXT:    s_max_i32 s16, s16, s21
2234; GFX6-NEXT:    s_min_i32 s16, s16, s17
2235; GFX6-NEXT:    s_sub_i32 s5, s5, s16
2236; GFX6-NEXT:    s_max_i32 s16, s6, -1
2237; GFX6-NEXT:    s_sub_i32 s16, s16, s32
2238; GFX6-NEXT:    s_min_i32 s17, s6, -1
2239; GFX6-NEXT:    s_sub_i32 s17, s17, s33
2240; GFX6-NEXT:    s_max_i32 s16, s16, s22
2241; GFX6-NEXT:    s_min_i32 s16, s16, s17
2242; GFX6-NEXT:    s_sub_i32 s6, s6, s16
2243; GFX6-NEXT:    s_max_i32 s16, s7, -1
2244; GFX6-NEXT:    s_sub_i32 s16, s16, s32
2245; GFX6-NEXT:    s_min_i32 s17, s7, -1
2246; GFX6-NEXT:    s_sub_i32 s17, s17, s33
2247; GFX6-NEXT:    s_max_i32 s16, s16, s23
2248; GFX6-NEXT:    s_min_i32 s16, s16, s17
2249; GFX6-NEXT:    s_sub_i32 s7, s7, s16
2250; GFX6-NEXT:    s_max_i32 s16, s8, -1
2251; GFX6-NEXT:    s_sub_i32 s16, s16, s32
2252; GFX6-NEXT:    s_min_i32 s17, s8, -1
2253; GFX6-NEXT:    s_sub_i32 s17, s17, s33
2254; GFX6-NEXT:    s_max_i32 s16, s16, s24
2255; GFX6-NEXT:    s_min_i32 s16, s16, s17
2256; GFX6-NEXT:    s_sub_i32 s8, s8, s16
2257; GFX6-NEXT:    s_max_i32 s16, s9, -1
2258; GFX6-NEXT:    s_sub_i32 s16, s16, s32
2259; GFX6-NEXT:    s_min_i32 s17, s9, -1
2260; GFX6-NEXT:    s_sub_i32 s17, s17, s33
2261; GFX6-NEXT:    s_max_i32 s16, s16, s25
2262; GFX6-NEXT:    s_min_i32 s16, s16, s17
2263; GFX6-NEXT:    s_sub_i32 s9, s9, s16
2264; GFX6-NEXT:    s_max_i32 s16, s10, -1
2265; GFX6-NEXT:    s_sub_i32 s16, s16, s32
2266; GFX6-NEXT:    s_min_i32 s17, s10, -1
2267; GFX6-NEXT:    s_sub_i32 s17, s17, s33
2268; GFX6-NEXT:    s_max_i32 s16, s16, s26
2269; GFX6-NEXT:    s_min_i32 s16, s16, s17
2270; GFX6-NEXT:    s_sub_i32 s10, s10, s16
2271; GFX6-NEXT:    s_max_i32 s16, s11, -1
2272; GFX6-NEXT:    s_sub_i32 s16, s16, s32
2273; GFX6-NEXT:    s_min_i32 s17, s11, -1
2274; GFX6-NEXT:    s_sub_i32 s17, s17, s33
2275; GFX6-NEXT:    s_max_i32 s16, s16, s27
2276; GFX6-NEXT:    s_min_i32 s16, s16, s17
2277; GFX6-NEXT:    s_sub_i32 s11, s11, s16
2278; GFX6-NEXT:    s_max_i32 s16, s12, -1
2279; GFX6-NEXT:    s_sub_i32 s16, s16, s32
2280; GFX6-NEXT:    s_min_i32 s17, s12, -1
2281; GFX6-NEXT:    s_sub_i32 s17, s17, s33
2282; GFX6-NEXT:    s_max_i32 s16, s16, s28
2283; GFX6-NEXT:    s_min_i32 s16, s16, s17
2284; GFX6-NEXT:    s_sub_i32 s12, s12, s16
2285; GFX6-NEXT:    s_max_i32 s16, s13, -1
2286; GFX6-NEXT:    s_sub_i32 s16, s16, s32
2287; GFX6-NEXT:    s_min_i32 s17, s13, -1
2288; GFX6-NEXT:    s_sub_i32 s17, s17, s33
2289; GFX6-NEXT:    s_max_i32 s16, s16, s29
2290; GFX6-NEXT:    s_min_i32 s16, s16, s17
2291; GFX6-NEXT:    s_sub_i32 s13, s13, s16
2292; GFX6-NEXT:    s_max_i32 s16, s14, -1
2293; GFX6-NEXT:    s_sub_i32 s16, s16, s32
2294; GFX6-NEXT:    s_min_i32 s17, s14, -1
2295; GFX6-NEXT:    s_sub_i32 s17, s17, s33
2296; GFX6-NEXT:    s_max_i32 s16, s16, s30
2297; GFX6-NEXT:    s_min_i32 s16, s16, s17
2298; GFX6-NEXT:    s_sub_i32 s14, s14, s16
2299; GFX6-NEXT:    s_max_i32 s16, s15, -1
2300; GFX6-NEXT:    s_sub_i32 s16, s16, s32
2301; GFX6-NEXT:    s_min_i32 s17, s15, -1
2302; GFX6-NEXT:    s_sub_i32 s17, s17, s33
2303; GFX6-NEXT:    s_max_i32 s16, s16, s31
2304; GFX6-NEXT:    s_min_i32 s16, s16, s17
2305; GFX6-NEXT:    s_sub_i32 s15, s15, s16
2306; GFX6-NEXT:    ; return to shader part epilog
2307;
2308; GFX8-LABEL: s_ssubsat_v16i32:
2309; GFX8:       ; %bb.0:
2310; GFX8-NEXT:    s_brev_b32 s32, -2
2311; GFX8-NEXT:    s_max_i32 s34, s0, -1
2312; GFX8-NEXT:    s_brev_b32 s33, 1
2313; GFX8-NEXT:    s_sub_i32 s34, s34, s32
2314; GFX8-NEXT:    s_min_i32 s35, s0, -1
2315; GFX8-NEXT:    s_sub_i32 s35, s35, s33
2316; GFX8-NEXT:    s_max_i32 s16, s34, s16
2317; GFX8-NEXT:    s_min_i32 s16, s16, s35
2318; GFX8-NEXT:    s_sub_i32 s0, s0, s16
2319; GFX8-NEXT:    s_max_i32 s16, s1, -1
2320; GFX8-NEXT:    s_sub_i32 s16, s16, s32
2321; GFX8-NEXT:    s_min_i32 s34, s1, -1
2322; GFX8-NEXT:    s_sub_i32 s34, s34, s33
2323; GFX8-NEXT:    s_max_i32 s16, s16, s17
2324; GFX8-NEXT:    s_min_i32 s16, s16, s34
2325; GFX8-NEXT:    s_sub_i32 s1, s1, s16
2326; GFX8-NEXT:    s_max_i32 s16, s2, -1
2327; GFX8-NEXT:    s_sub_i32 s16, s16, s32
2328; GFX8-NEXT:    s_min_i32 s17, s2, -1
2329; GFX8-NEXT:    s_sub_i32 s17, s17, s33
2330; GFX8-NEXT:    s_max_i32 s16, s16, s18
2331; GFX8-NEXT:    s_min_i32 s16, s16, s17
2332; GFX8-NEXT:    s_sub_i32 s2, s2, s16
2333; GFX8-NEXT:    s_max_i32 s16, s3, -1
2334; GFX8-NEXT:    s_sub_i32 s16, s16, s32
2335; GFX8-NEXT:    s_min_i32 s17, s3, -1
2336; GFX8-NEXT:    s_sub_i32 s17, s17, s33
2337; GFX8-NEXT:    s_max_i32 s16, s16, s19
2338; GFX8-NEXT:    s_min_i32 s16, s16, s17
2339; GFX8-NEXT:    s_sub_i32 s3, s3, s16
2340; GFX8-NEXT:    s_max_i32 s16, s4, -1
2341; GFX8-NEXT:    s_sub_i32 s16, s16, s32
2342; GFX8-NEXT:    s_min_i32 s17, s4, -1
2343; GFX8-NEXT:    s_sub_i32 s17, s17, s33
2344; GFX8-NEXT:    s_max_i32 s16, s16, s20
2345; GFX8-NEXT:    s_min_i32 s16, s16, s17
2346; GFX8-NEXT:    s_sub_i32 s4, s4, s16
2347; GFX8-NEXT:    s_max_i32 s16, s5, -1
2348; GFX8-NEXT:    s_sub_i32 s16, s16, s32
2349; GFX8-NEXT:    s_min_i32 s17, s5, -1
2350; GFX8-NEXT:    s_sub_i32 s17, s17, s33
2351; GFX8-NEXT:    s_max_i32 s16, s16, s21
2352; GFX8-NEXT:    s_min_i32 s16, s16, s17
2353; GFX8-NEXT:    s_sub_i32 s5, s5, s16
2354; GFX8-NEXT:    s_max_i32 s16, s6, -1
2355; GFX8-NEXT:    s_sub_i32 s16, s16, s32
2356; GFX8-NEXT:    s_min_i32 s17, s6, -1
2357; GFX8-NEXT:    s_sub_i32 s17, s17, s33
2358; GFX8-NEXT:    s_max_i32 s16, s16, s22
2359; GFX8-NEXT:    s_min_i32 s16, s16, s17
2360; GFX8-NEXT:    s_sub_i32 s6, s6, s16
2361; GFX8-NEXT:    s_max_i32 s16, s7, -1
2362; GFX8-NEXT:    s_sub_i32 s16, s16, s32
2363; GFX8-NEXT:    s_min_i32 s17, s7, -1
2364; GFX8-NEXT:    s_sub_i32 s17, s17, s33
2365; GFX8-NEXT:    s_max_i32 s16, s16, s23
2366; GFX8-NEXT:    s_min_i32 s16, s16, s17
2367; GFX8-NEXT:    s_sub_i32 s7, s7, s16
2368; GFX8-NEXT:    s_max_i32 s16, s8, -1
2369; GFX8-NEXT:    s_sub_i32 s16, s16, s32
2370; GFX8-NEXT:    s_min_i32 s17, s8, -1
2371; GFX8-NEXT:    s_sub_i32 s17, s17, s33
2372; GFX8-NEXT:    s_max_i32 s16, s16, s24
2373; GFX8-NEXT:    s_min_i32 s16, s16, s17
2374; GFX8-NEXT:    s_sub_i32 s8, s8, s16
2375; GFX8-NEXT:    s_max_i32 s16, s9, -1
2376; GFX8-NEXT:    s_sub_i32 s16, s16, s32
2377; GFX8-NEXT:    s_min_i32 s17, s9, -1
2378; GFX8-NEXT:    s_sub_i32 s17, s17, s33
2379; GFX8-NEXT:    s_max_i32 s16, s16, s25
2380; GFX8-NEXT:    s_min_i32 s16, s16, s17
2381; GFX8-NEXT:    s_sub_i32 s9, s9, s16
2382; GFX8-NEXT:    s_max_i32 s16, s10, -1
2383; GFX8-NEXT:    s_sub_i32 s16, s16, s32
2384; GFX8-NEXT:    s_min_i32 s17, s10, -1
2385; GFX8-NEXT:    s_sub_i32 s17, s17, s33
2386; GFX8-NEXT:    s_max_i32 s16, s16, s26
2387; GFX8-NEXT:    s_min_i32 s16, s16, s17
2388; GFX8-NEXT:    s_sub_i32 s10, s10, s16
2389; GFX8-NEXT:    s_max_i32 s16, s11, -1
2390; GFX8-NEXT:    s_sub_i32 s16, s16, s32
2391; GFX8-NEXT:    s_min_i32 s17, s11, -1
2392; GFX8-NEXT:    s_sub_i32 s17, s17, s33
2393; GFX8-NEXT:    s_max_i32 s16, s16, s27
2394; GFX8-NEXT:    s_min_i32 s16, s16, s17
2395; GFX8-NEXT:    s_sub_i32 s11, s11, s16
2396; GFX8-NEXT:    s_max_i32 s16, s12, -1
2397; GFX8-NEXT:    s_sub_i32 s16, s16, s32
2398; GFX8-NEXT:    s_min_i32 s17, s12, -1
2399; GFX8-NEXT:    s_sub_i32 s17, s17, s33
2400; GFX8-NEXT:    s_max_i32 s16, s16, s28
2401; GFX8-NEXT:    s_min_i32 s16, s16, s17
2402; GFX8-NEXT:    s_sub_i32 s12, s12, s16
2403; GFX8-NEXT:    s_max_i32 s16, s13, -1
2404; GFX8-NEXT:    s_sub_i32 s16, s16, s32
2405; GFX8-NEXT:    s_min_i32 s17, s13, -1
2406; GFX8-NEXT:    s_sub_i32 s17, s17, s33
2407; GFX8-NEXT:    s_max_i32 s16, s16, s29
2408; GFX8-NEXT:    s_min_i32 s16, s16, s17
2409; GFX8-NEXT:    s_sub_i32 s13, s13, s16
2410; GFX8-NEXT:    s_max_i32 s16, s14, -1
2411; GFX8-NEXT:    s_sub_i32 s16, s16, s32
2412; GFX8-NEXT:    s_min_i32 s17, s14, -1
2413; GFX8-NEXT:    s_sub_i32 s17, s17, s33
2414; GFX8-NEXT:    s_max_i32 s16, s16, s30
2415; GFX8-NEXT:    s_min_i32 s16, s16, s17
2416; GFX8-NEXT:    s_sub_i32 s14, s14, s16
2417; GFX8-NEXT:    s_max_i32 s16, s15, -1
2418; GFX8-NEXT:    s_sub_i32 s16, s16, s32
2419; GFX8-NEXT:    s_min_i32 s17, s15, -1
2420; GFX8-NEXT:    s_sub_i32 s17, s17, s33
2421; GFX8-NEXT:    s_max_i32 s16, s16, s31
2422; GFX8-NEXT:    s_min_i32 s16, s16, s17
2423; GFX8-NEXT:    s_sub_i32 s15, s15, s16
2424; GFX8-NEXT:    ; return to shader part epilog
2425;
2426; GFX9-LABEL: s_ssubsat_v16i32:
2427; GFX9:       ; %bb.0:
2428; GFX9-NEXT:    v_mov_b32_e32 v0, s16
2429; GFX9-NEXT:    v_mov_b32_e32 v1, s17
2430; GFX9-NEXT:    v_mov_b32_e32 v2, s18
2431; GFX9-NEXT:    v_mov_b32_e32 v3, s19
2432; GFX9-NEXT:    v_mov_b32_e32 v4, s20
2433; GFX9-NEXT:    v_mov_b32_e32 v5, s21
2434; GFX9-NEXT:    v_mov_b32_e32 v6, s22
2435; GFX9-NEXT:    v_mov_b32_e32 v7, s23
2436; GFX9-NEXT:    v_mov_b32_e32 v8, s24
2437; GFX9-NEXT:    v_mov_b32_e32 v9, s25
2438; GFX9-NEXT:    v_mov_b32_e32 v10, s26
2439; GFX9-NEXT:    v_mov_b32_e32 v11, s27
2440; GFX9-NEXT:    v_mov_b32_e32 v12, s28
2441; GFX9-NEXT:    v_mov_b32_e32 v13, s29
2442; GFX9-NEXT:    v_mov_b32_e32 v14, s30
2443; GFX9-NEXT:    v_mov_b32_e32 v15, s31
2444; GFX9-NEXT:    v_sub_i32 v0, s0, v0 clamp
2445; GFX9-NEXT:    v_sub_i32 v1, s1, v1 clamp
2446; GFX9-NEXT:    v_sub_i32 v2, s2, v2 clamp
2447; GFX9-NEXT:    v_sub_i32 v3, s3, v3 clamp
2448; GFX9-NEXT:    v_sub_i32 v4, s4, v4 clamp
2449; GFX9-NEXT:    v_sub_i32 v5, s5, v5 clamp
2450; GFX9-NEXT:    v_sub_i32 v6, s6, v6 clamp
2451; GFX9-NEXT:    v_sub_i32 v7, s7, v7 clamp
2452; GFX9-NEXT:    v_sub_i32 v8, s8, v8 clamp
2453; GFX9-NEXT:    v_sub_i32 v9, s9, v9 clamp
2454; GFX9-NEXT:    v_sub_i32 v10, s10, v10 clamp
2455; GFX9-NEXT:    v_sub_i32 v11, s11, v11 clamp
2456; GFX9-NEXT:    v_sub_i32 v12, s12, v12 clamp
2457; GFX9-NEXT:    v_sub_i32 v13, s13, v13 clamp
2458; GFX9-NEXT:    v_sub_i32 v14, s14, v14 clamp
2459; GFX9-NEXT:    v_sub_i32 v15, s15, v15 clamp
2460; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
2461; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
2462; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
2463; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
2464; GFX9-NEXT:    v_readfirstlane_b32 s4, v4
2465; GFX9-NEXT:    v_readfirstlane_b32 s5, v5
2466; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
2467; GFX9-NEXT:    v_readfirstlane_b32 s7, v7
2468; GFX9-NEXT:    v_readfirstlane_b32 s8, v8
2469; GFX9-NEXT:    v_readfirstlane_b32 s9, v9
2470; GFX9-NEXT:    v_readfirstlane_b32 s10, v10
2471; GFX9-NEXT:    v_readfirstlane_b32 s11, v11
2472; GFX9-NEXT:    v_readfirstlane_b32 s12, v12
2473; GFX9-NEXT:    v_readfirstlane_b32 s13, v13
2474; GFX9-NEXT:    v_readfirstlane_b32 s14, v14
2475; GFX9-NEXT:    v_readfirstlane_b32 s15, v15
2476; GFX9-NEXT:    ; return to shader part epilog
2477;
2478; GFX10-LABEL: s_ssubsat_v16i32:
2479; GFX10:       ; %bb.0:
2480; GFX10-NEXT:    v_sub_nc_i32 v0, s0, s16 clamp
2481; GFX10-NEXT:    v_sub_nc_i32 v1, s1, s17 clamp
2482; GFX10-NEXT:    v_sub_nc_i32 v2, s2, s18 clamp
2483; GFX10-NEXT:    v_sub_nc_i32 v3, s3, s19 clamp
2484; GFX10-NEXT:    v_sub_nc_i32 v4, s4, s20 clamp
2485; GFX10-NEXT:    v_sub_nc_i32 v5, s5, s21 clamp
2486; GFX10-NEXT:    v_sub_nc_i32 v6, s6, s22 clamp
2487; GFX10-NEXT:    v_sub_nc_i32 v7, s7, s23 clamp
2488; GFX10-NEXT:    v_sub_nc_i32 v8, s8, s24 clamp
2489; GFX10-NEXT:    v_sub_nc_i32 v9, s9, s25 clamp
2490; GFX10-NEXT:    v_sub_nc_i32 v10, s10, s26 clamp
2491; GFX10-NEXT:    v_sub_nc_i32 v11, s11, s27 clamp
2492; GFX10-NEXT:    v_sub_nc_i32 v12, s12, s28 clamp
2493; GFX10-NEXT:    v_sub_nc_i32 v13, s13, s29 clamp
2494; GFX10-NEXT:    v_sub_nc_i32 v14, s14, s30 clamp
2495; GFX10-NEXT:    v_sub_nc_i32 v15, s15, s31 clamp
2496; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
2497; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
2498; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
2499; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
2500; GFX10-NEXT:    v_readfirstlane_b32 s4, v4
2501; GFX10-NEXT:    v_readfirstlane_b32 s5, v5
2502; GFX10-NEXT:    v_readfirstlane_b32 s6, v6
2503; GFX10-NEXT:    v_readfirstlane_b32 s7, v7
2504; GFX10-NEXT:    v_readfirstlane_b32 s8, v8
2505; GFX10-NEXT:    v_readfirstlane_b32 s9, v9
2506; GFX10-NEXT:    v_readfirstlane_b32 s10, v10
2507; GFX10-NEXT:    v_readfirstlane_b32 s11, v11
2508; GFX10-NEXT:    v_readfirstlane_b32 s12, v12
2509; GFX10-NEXT:    v_readfirstlane_b32 s13, v13
2510; GFX10-NEXT:    v_readfirstlane_b32 s14, v14
2511; GFX10-NEXT:    v_readfirstlane_b32 s15, v15
2512; GFX10-NEXT:    ; return to shader part epilog
2513  %result = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
2514  ret <16 x i32> %result
2515}
2516
; Signed saturating subtract of a single i16 via @llvm.ssub.sat.i16 in a
; function with no explicit calling convention; the checks show both operands
; and the result in VGPRs (v0, v1 -> v0).
;
; Expected code per the autogenerated checks below:
;  - GFX6 checks: both operands are shifted left by 16 so the i16 occupies the
;    high half of a 32-bit register, the 32-bit min/max clamp expansion runs
;    with 0x7fffffff / 0x80000000 as the limits, and the result is brought
;    back with an arithmetic right shift by 16.
;  - GFX8 checks: the same min/max expansion using 16-bit VALU instructions
;    with 0x7fff / 0x8000 as the limits; no shifts appear.
;  - GFX9 checks: a single v_sub_i16 with the clamp modifier.
;  - GFX10 checks: a single v_sub_nc_i16 with the clamp modifier.
2517define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) {
2518; GFX6-LABEL: v_ssubsat_i16:
2519; GFX6:       ; %bb.0:
2520; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2521; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2522; GFX6-NEXT:    v_max_i32_e32 v2, -1, v0
2523; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2524; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2
2525; GFX6-NEXT:    v_min_i32_e32 v3, -1, v0
2526; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 0x80000000, v3
2527; GFX6-NEXT:    v_max_i32_e32 v1, v2, v1
2528; GFX6-NEXT:    v_min_i32_e32 v1, v1, v3
2529; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
2530; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2531; GFX6-NEXT:    s_setpc_b64 s[30:31]
2532;
2533; GFX8-LABEL: v_ssubsat_i16:
2534; GFX8:       ; %bb.0:
2535; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2536; GFX8-NEXT:    v_max_i16_e32 v2, -1, v0
2537; GFX8-NEXT:    v_subrev_u16_e32 v2, 0x7fff, v2
2538; GFX8-NEXT:    v_min_i16_e32 v3, -1, v0
2539; GFX8-NEXT:    v_subrev_u16_e32 v3, 0x8000, v3
2540; GFX8-NEXT:    v_max_i16_e32 v1, v2, v1
2541; GFX8-NEXT:    v_min_i16_e32 v1, v1, v3
2542; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
2543; GFX8-NEXT:    s_setpc_b64 s[30:31]
2544;
2545; GFX9-LABEL: v_ssubsat_i16:
2546; GFX9:       ; %bb.0:
2547; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2548; GFX9-NEXT:    v_sub_i16 v0, v0, v1 clamp
2549; GFX9-NEXT:    s_setpc_b64 s[30:31]
2550;
2551; GFX10-LABEL: v_ssubsat_i16:
2552; GFX10:       ; %bb.0:
2553; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2554; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2555; GFX10-NEXT:    v_sub_nc_i16 v0, v0, v1 clamp
2556; GFX10-NEXT:    s_setpc_b64 s[30:31]
2557  %result = call i16 @llvm.ssub.sat.i16(i16 %lhs, i16 %rhs)
2558  ret i16 %result
2559}
2560
; Scalar variant of the i16 saturating subtract: both operands are inreg in an
; amdgpu_ps function, and the checks show the result returned in s0.
;
; Expected code per the autogenerated checks below:
;  - GFX6 checks: operands are shifted left by 16, the 32-bit scalar min/max
;    clamp expansion runs with 0x7fffffff / 0x80000000, and the result is
;    shifted back with s_ashr_i32 by 16.
;  - GFX8 checks: values are sign-extended with s_sext_i32_i16 and clamped
;    with 32-bit scalar min/max using 0x7fff and 0xffff8000; intermediate
;    results are re-extended before each comparison.
;  - GFX9 checks: rhs is copied to a VGPR, v_sub_i16 with clamp does the
;    subtract, and v_readfirstlane_b32 moves the result back to s0.
;  - GFX10 checks: v_sub_nc_i16 with clamp reads both SGPRs directly, followed
;    by v_readfirstlane_b32.
2561define amdgpu_ps i16 @s_ssubsat_i16(i16 inreg %lhs, i16 inreg %rhs) {
2562; GFX6-LABEL: s_ssubsat_i16:
2563; GFX6:       ; %bb.0:
2564; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
2565; GFX6-NEXT:    s_max_i32 s2, s0, -1
2566; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
2567; GFX6-NEXT:    s_sub_i32 s2, s2, 0x7fffffff
2568; GFX6-NEXT:    s_min_i32 s3, s0, -1
2569; GFX6-NEXT:    s_sub_i32 s3, s3, 0x80000000
2570; GFX6-NEXT:    s_max_i32 s1, s2, s1
2571; GFX6-NEXT:    s_min_i32 s1, s1, s3
2572; GFX6-NEXT:    s_sub_i32 s0, s0, s1
2573; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
2574; GFX6-NEXT:    ; return to shader part epilog
2575;
2576; GFX8-LABEL: s_ssubsat_i16:
2577; GFX8:       ; %bb.0:
2578; GFX8-NEXT:    s_sext_i32_i16 s2, s0
2579; GFX8-NEXT:    s_sext_i32_i16 s3, -1
2580; GFX8-NEXT:    s_max_i32 s4, s2, s3
2581; GFX8-NEXT:    s_sub_i32 s4, s4, 0x7fff
2582; GFX8-NEXT:    s_min_i32 s2, s2, s3
2583; GFX8-NEXT:    s_sext_i32_i16 s3, s4
2584; GFX8-NEXT:    s_sext_i32_i16 s1, s1
2585; GFX8-NEXT:    s_sub_i32 s2, s2, 0xffff8000
2586; GFX8-NEXT:    s_max_i32 s1, s3, s1
2587; GFX8-NEXT:    s_sext_i32_i16 s1, s1
2588; GFX8-NEXT:    s_sext_i32_i16 s2, s2
2589; GFX8-NEXT:    s_min_i32 s1, s1, s2
2590; GFX8-NEXT:    s_sub_i32 s0, s0, s1
2591; GFX8-NEXT:    ; return to shader part epilog
2592;
2593; GFX9-LABEL: s_ssubsat_i16:
2594; GFX9:       ; %bb.0:
2595; GFX9-NEXT:    v_mov_b32_e32 v0, s1
2596; GFX9-NEXT:    v_sub_i16 v0, s0, v0 clamp
2597; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
2598; GFX9-NEXT:    ; return to shader part epilog
2599;
2600; GFX10-LABEL: s_ssubsat_i16:
2601; GFX10:       ; %bb.0:
2602; GFX10-NEXT:    v_sub_nc_i16 v0, s0, s1 clamp
2603; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
2604; GFX10-NEXT:    ; return to shader part epilog
2605  %result = call i16 @llvm.ssub.sat.i16(i16 %lhs, i16 %rhs)
2606  ret i16 %result
2607}
2608
; Mixed-operand i16 saturating subtract: lhs is inreg (s0 in the checks) and
; rhs is a plain argument (v0 in the checks). The i16 result is bitcast to half
; for the return, and the checks show it left in v0.
;
; Expected code per the autogenerated checks below:
;  - GFX6 checks: the two clamp bounds depend only on lhs, so they are computed
;    with scalar ops on the shifted lhs; the max/min against rhs and the final
;    subtract are VALU ops, followed by an arithmetic shift right by 16.
;  - GFX8 checks: bounds are computed with 32-bit scalar ops after
;    s_sext_i32_i16, then v_max_i16 / v_min_i16 / v_sub_u16 finish the job.
;  - GFX9 and GFX10 checks: one clamped subtract (v_sub_i16 / v_sub_nc_i16)
;    with s0 as the first source operand.
2609define amdgpu_ps half @ssubsat_i16_sv(i16 inreg %lhs, i16 %rhs) {
2610; GFX6-LABEL: ssubsat_i16_sv:
2611; GFX6:       ; %bb.0:
2612; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
2613; GFX6-NEXT:    s_max_i32 s1, s0, -1
2614; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2615; GFX6-NEXT:    s_sub_i32 s1, s1, 0x7fffffff
2616; GFX6-NEXT:    s_min_i32 s2, s0, -1
2617; GFX6-NEXT:    s_sub_i32 s2, s2, 0x80000000
2618; GFX6-NEXT:    v_max_i32_e32 v0, s1, v0
2619; GFX6-NEXT:    v_min_i32_e32 v0, s2, v0
2620; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
2621; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2622; GFX6-NEXT:    ; return to shader part epilog
2623;
2624; GFX8-LABEL: ssubsat_i16_sv:
2625; GFX8:       ; %bb.0:
2626; GFX8-NEXT:    s_sext_i32_i16 s1, s0
2627; GFX8-NEXT:    s_sext_i32_i16 s2, -1
2628; GFX8-NEXT:    s_max_i32 s3, s1, s2
2629; GFX8-NEXT:    s_sub_i32 s3, s3, 0x7fff
2630; GFX8-NEXT:    s_min_i32 s1, s1, s2
2631; GFX8-NEXT:    s_sub_i32 s1, s1, 0xffff8000
2632; GFX8-NEXT:    v_max_i16_e32 v0, s3, v0
2633; GFX8-NEXT:    v_min_i16_e32 v0, s1, v0
2634; GFX8-NEXT:    v_sub_u16_e32 v0, s0, v0
2635; GFX8-NEXT:    ; return to shader part epilog
2636;
2637; GFX9-LABEL: ssubsat_i16_sv:
2638; GFX9:       ; %bb.0:
2639; GFX9-NEXT:    v_sub_i16 v0, s0, v0 clamp
2640; GFX9-NEXT:    ; return to shader part epilog
2641;
2642; GFX10-LABEL: ssubsat_i16_sv:
2643; GFX10:       ; %bb.0:
2644; GFX10-NEXT:    v_sub_nc_i16 v0, s0, v0 clamp
2645; GFX10-NEXT:    ; return to shader part epilog
2646  %result = call i16 @llvm.ssub.sat.i16(i16 %lhs, i16 %rhs)
2647  %cast = bitcast i16 %result to half
2648  ret half %cast
2649}
2650
; Mixed-operand i16 saturating subtract with the operand kinds swapped: lhs is
; a plain argument (v0 in the checks) and rhs is inreg (s0 in the checks). The
; i16 result is bitcast to half for the return, and the checks show it in v0.
;
; Expected code per the autogenerated checks below:
;  - GFX6 checks: the clamp bounds depend on lhs, so the whole expansion is
;    VALU code on the shifted lhs; rhs is shifted with a scalar op and only
;    appears as the SGPR source of v_max_i32.
;  - GFX8 checks: the same shape with 16-bit VALU ops and 0x7fff / 0x8000 as
;    the limits; s0 is read directly by v_max_i16.
;  - GFX9 and GFX10 checks: one clamped subtract (v_sub_i16 / v_sub_nc_i16)
;    with s0 as the second source operand.
2651define amdgpu_ps half @ssubsat_i16_vs(i16 %lhs, i16 inreg %rhs) {
2652; GFX6-LABEL: ssubsat_i16_vs:
2653; GFX6:       ; %bb.0:
2654; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2655; GFX6-NEXT:    v_max_i32_e32 v1, -1, v0
2656; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
2657; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, 0x7fffffff, v1
2658; GFX6-NEXT:    v_min_i32_e32 v2, -1, v0
2659; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 0x80000000, v2
2660; GFX6-NEXT:    v_max_i32_e32 v1, s0, v1
2661; GFX6-NEXT:    v_min_i32_e32 v1, v1, v2
2662; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
2663; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2664; GFX6-NEXT:    ; return to shader part epilog
2665;
2666; GFX8-LABEL: ssubsat_i16_vs:
2667; GFX8:       ; %bb.0:
2668; GFX8-NEXT:    v_max_i16_e32 v1, -1, v0
2669; GFX8-NEXT:    v_subrev_u16_e32 v1, 0x7fff, v1
2670; GFX8-NEXT:    v_min_i16_e32 v2, -1, v0
2671; GFX8-NEXT:    v_subrev_u16_e32 v2, 0x8000, v2
2672; GFX8-NEXT:    v_max_i16_e32 v1, s0, v1
2673; GFX8-NEXT:    v_min_i16_e32 v1, v1, v2
2674; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v1
2675; GFX8-NEXT:    ; return to shader part epilog
2676;
2677; GFX9-LABEL: ssubsat_i16_vs:
2678; GFX9:       ; %bb.0:
2679; GFX9-NEXT:    v_sub_i16 v0, v0, s0 clamp
2680; GFX9-NEXT:    ; return to shader part epilog
2681;
2682; GFX10-LABEL: ssubsat_i16_vs:
2683; GFX10:       ; %bb.0:
2684; GFX10-NEXT:    v_sub_nc_i16 v0, v0, s0 clamp
2685; GFX10-NEXT:    ; return to shader part epilog
2686  %result = call i16 @llvm.ssub.sat.i16(i16 %lhs, i16 %rhs)
2687  %cast = bitcast i16 %result to half
2688  ret half %cast
2689}
2690
; Signed saturating subtract of <2 x i16> via @llvm.ssub.sat.v2i16 in a
; function with no explicit calling convention (VGPR operands in the checks).
;
; Expected code per the autogenerated checks below:
;  - GFX6 checks: the two elements are handled in separate 32-bit registers
;    (lhs in v0/v1, rhs in v2/v3). Each is shifted left by 16, clamped with the
;    32-bit min/max expansion using s4 = s_brev_b32(-2) and s5 = s_brev_b32(1)
;    as the limits, and shifted back with an arithmetic right shift; the two
;    results stay in v0 and v1.
;  - GFX8 checks: each operand is one packed register. The high half of lhs is
;    extracted with v_lshrrev_b32, the high half of rhs is selected through
;    SDWA (src1_sel:WORD_1), and the two 16-bit results are recombined with an
;    SDWA subtract writing WORD_1 followed by v_or_b32.
;  - GFX9 and GFX10 checks: a single v_pk_sub_i16 with the clamp modifier.
2691define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
2692; GFX6-LABEL: v_ssubsat_v2i16:
2693; GFX6:       ; %bb.0:
2694; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2695; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2696; GFX6-NEXT:    s_brev_b32 s4, -2
2697; GFX6-NEXT:    v_max_i32_e32 v4, -1, v0
2698; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2699; GFX6-NEXT:    s_brev_b32 s5, 1
2700; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v4
2701; GFX6-NEXT:    v_min_i32_e32 v5, -1, v0
2702; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s5, v5
2703; GFX6-NEXT:    v_max_i32_e32 v2, v4, v2
2704; GFX6-NEXT:    v_min_i32_e32 v2, v2, v5
2705; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2706; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
2707; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
2708; GFX6-NEXT:    v_max_i32_e32 v3, -1, v1
2709; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v3
2710; GFX6-NEXT:    v_min_i32_e32 v4, -1, v1
2711; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s5, v4
2712; GFX6-NEXT:    v_max_i32_e32 v2, v3, v2
2713; GFX6-NEXT:    v_min_i32_e32 v2, v2, v4
2714; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
2715; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2716; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
2717; GFX6-NEXT:    s_setpc_b64 s[30:31]
2718;
2719; GFX8-LABEL: v_ssubsat_v2i16:
2720; GFX8:       ; %bb.0:
2721; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2722; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
2723; GFX8-NEXT:    v_max_i16_e32 v3, -1, v0
2724; GFX8-NEXT:    s_movk_i32 s5, 0x8000
2725; GFX8-NEXT:    v_subrev_u16_e32 v3, s4, v3
2726; GFX8-NEXT:    v_min_i16_e32 v4, -1, v0
2727; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
2728; GFX8-NEXT:    v_subrev_u16_e32 v4, s5, v4
2729; GFX8-NEXT:    v_max_i16_e32 v3, v3, v1
2730; GFX8-NEXT:    v_min_i16_e32 v3, v3, v4
2731; GFX8-NEXT:    v_max_i16_e32 v4, -1, v2
2732; GFX8-NEXT:    v_subrev_u16_e32 v4, s4, v4
2733; GFX8-NEXT:    v_min_i16_e32 v5, -1, v2
2734; GFX8-NEXT:    v_subrev_u16_e32 v5, s5, v5
2735; GFX8-NEXT:    v_max_i16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2736; GFX8-NEXT:    v_min_i16_e32 v1, v1, v5
2737; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v3
2738; GFX8-NEXT:    v_sub_u16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2739; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
2740; GFX8-NEXT:    s_setpc_b64 s[30:31]
2741;
2742; GFX9-LABEL: v_ssubsat_v2i16:
2743; GFX9:       ; %bb.0:
2744; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2745; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1 clamp
2746; GFX9-NEXT:    s_setpc_b64 s[30:31]
2747;
2748; GFX10-LABEL: v_ssubsat_v2i16:
2749; GFX10:       ; %bb.0:
2750; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2751; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2752; GFX10-NEXT:    v_pk_sub_i16 v0, v0, v1 clamp
2753; GFX10-NEXT:    s_setpc_b64 s[30:31]
2754  %result = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
2755  ret <2 x i16> %result
2756}
2757
; Scalar <2 x i16> saturating subtract: both operands are inreg in an
; amdgpu_ps function and the <2 x i16> result is bitcast to i32 for the
; return (s0 in the checks).
;
; Expected code per the autogenerated checks below:
;  - GFX6 checks: the elements are handled in separate SGPRs (lhs in s0/s1, rhs
;    in s2/s3). Each is shifted left by 16 and clamped with the 32-bit scalar
;    min/max expansion; the results are shifted back, masked with 0xffff and
;    packed into s0 with s_lshl_b32 / s_or_b32.
;  - GFX8 checks: each operand is one packed SGPR. High halves are extracted
;    with s_lshr_b32, values are sign-extended with s_sext_i32_i16 around the
;    32-bit min/max ops, and the two results go through s_bfe_u32 before being
;    packed with s_lshl_b32 / s_or_b32.
;  - GFX9 checks: rhs is copied to a VGPR, v_pk_sub_i16 with clamp does the
;    subtract, and v_readfirstlane_b32 returns the result to s0.
;  - GFX10 checks: v_pk_sub_i16 with clamp reads both SGPRs directly, followed
;    by v_readfirstlane_b32.
2758define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) {
2759; GFX6-LABEL: s_ssubsat_v2i16:
2760; GFX6:       ; %bb.0:
2761; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
2762; GFX6-NEXT:    s_brev_b32 s4, -2
2763; GFX6-NEXT:    s_max_i32 s6, s0, -1
2764; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
2765; GFX6-NEXT:    s_brev_b32 s5, 1
2766; GFX6-NEXT:    s_sub_i32 s6, s6, s4
2767; GFX6-NEXT:    s_min_i32 s7, s0, -1
2768; GFX6-NEXT:    s_sub_i32 s7, s7, s5
2769; GFX6-NEXT:    s_max_i32 s2, s6, s2
2770; GFX6-NEXT:    s_min_i32 s2, s2, s7
2771; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
2772; GFX6-NEXT:    s_sub_i32 s0, s0, s2
2773; GFX6-NEXT:    s_lshl_b32 s2, s3, 16
2774; GFX6-NEXT:    s_max_i32 s3, s1, -1
2775; GFX6-NEXT:    s_sub_i32 s3, s3, s4
2776; GFX6-NEXT:    s_min_i32 s4, s1, -1
2777; GFX6-NEXT:    s_sub_i32 s4, s4, s5
2778; GFX6-NEXT:    s_max_i32 s2, s3, s2
2779; GFX6-NEXT:    s_min_i32 s2, s2, s4
2780; GFX6-NEXT:    s_sub_i32 s1, s1, s2
2781; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
2782; GFX6-NEXT:    s_mov_b32 s2, 0xffff
2783; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
2784; GFX6-NEXT:    s_and_b32 s1, s1, s2
2785; GFX6-NEXT:    s_and_b32 s0, s0, s2
2786; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
2787; GFX6-NEXT:    s_or_b32 s0, s0, s1
2788; GFX6-NEXT:    ; return to shader part epilog
2789;
2790; GFX8-LABEL: s_ssubsat_v2i16:
2791; GFX8:       ; %bb.0:
2792; GFX8-NEXT:    s_sext_i32_i16 s6, s0
2793; GFX8-NEXT:    s_sext_i32_i16 s7, -1
2794; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
2795; GFX8-NEXT:    s_max_i32 s8, s6, s7
2796; GFX8-NEXT:    s_sub_i32 s8, s8, s4
2797; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
2798; GFX8-NEXT:    s_movk_i32 s5, 0x8000
2799; GFX8-NEXT:    s_min_i32 s6, s6, s7
2800; GFX8-NEXT:    s_sext_i32_i16 s8, s8
2801; GFX8-NEXT:    s_sext_i32_i16 s1, s1
2802; GFX8-NEXT:    s_sub_i32 s6, s6, s5
2803; GFX8-NEXT:    s_max_i32 s1, s8, s1
2804; GFX8-NEXT:    s_sext_i32_i16 s1, s1
2805; GFX8-NEXT:    s_sext_i32_i16 s6, s6
2806; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
2807; GFX8-NEXT:    s_min_i32 s1, s1, s6
2808; GFX8-NEXT:    s_sub_i32 s0, s0, s1
2809; GFX8-NEXT:    s_sext_i32_i16 s1, s2
2810; GFX8-NEXT:    s_max_i32 s6, s1, s7
2811; GFX8-NEXT:    s_sub_i32 s4, s6, s4
2812; GFX8-NEXT:    s_min_i32 s1, s1, s7
2813; GFX8-NEXT:    s_sext_i32_i16 s4, s4
2814; GFX8-NEXT:    s_sext_i32_i16 s3, s3
2815; GFX8-NEXT:    s_sub_i32 s1, s1, s5
2816; GFX8-NEXT:    s_max_i32 s3, s4, s3
2817; GFX8-NEXT:    s_sext_i32_i16 s3, s3
2818; GFX8-NEXT:    s_sext_i32_i16 s1, s1
2819; GFX8-NEXT:    s_min_i32 s1, s3, s1
2820; GFX8-NEXT:    s_sub_i32 s1, s2, s1
2821; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
2822; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
2823; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
2824; GFX8-NEXT:    s_or_b32 s0, s0, s1
2825; GFX8-NEXT:    ; return to shader part epilog
2826;
2827; GFX9-LABEL: s_ssubsat_v2i16:
2828; GFX9:       ; %bb.0:
2829; GFX9-NEXT:    v_mov_b32_e32 v0, s1
2830; GFX9-NEXT:    v_pk_sub_i16 v0, s0, v0 clamp
2831; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
2832; GFX9-NEXT:    ; return to shader part epilog
2833;
2834; GFX10-LABEL: s_ssubsat_v2i16:
2835; GFX10:       ; %bb.0:
2836; GFX10-NEXT:    v_pk_sub_i16 v0, s0, s1 clamp
2837; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
2838; GFX10-NEXT:    ; return to shader part epilog
2839  %result = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
2840  %cast = bitcast <2 x i16> %result to i32
2841  ret i32 %cast
2842}
2843
; Mixed-operand <2 x i16> saturating subtract: lhs is inreg (SGPRs in the
; checks) and rhs is a plain argument (VGPRs in the checks). The result is
; bitcast to float for the return, and the checks show it in v0.
;
; Expected code per the autogenerated checks below:
;  - GFX6 checks: elements are handled separately (lhs in s0/s1, rhs in v0/v1).
;    The clamp bounds are computed with scalar ops on the shifted lhs, the
;    max/min/subtract are VALU ops, and the two results are shifted back,
;    masked with 0xffff and packed into v0.
;  - GFX8 checks: operands are packed. Bounds for both halves are computed
;    with scalar ops after s_sext_i32_i16; the high half of rhs is selected via
;    SDWA (src1_sel:WORD_1) and the high result is written with dst_sel:WORD_1
;    before v_or_b32 merges the halves.
;  - GFX9 and GFX10 checks: a single v_pk_sub_i16 with clamp, s0 as the first
;    source operand.
2844define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
2845; GFX6-LABEL: ssubsat_v2i16_sv:
2846; GFX6:       ; %bb.0:
2847; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
2848; GFX6-NEXT:    s_brev_b32 s2, -2
2849; GFX6-NEXT:    s_max_i32 s4, s0, -1
2850; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2851; GFX6-NEXT:    s_brev_b32 s3, 1
2852; GFX6-NEXT:    s_sub_i32 s4, s4, s2
2853; GFX6-NEXT:    s_min_i32 s5, s0, -1
2854; GFX6-NEXT:    s_sub_i32 s5, s5, s3
2855; GFX6-NEXT:    v_max_i32_e32 v0, s4, v0
2856; GFX6-NEXT:    v_min_i32_e32 v0, s5, v0
2857; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
2858; GFX6-NEXT:    s_lshl_b32 s0, s1, 16
2859; GFX6-NEXT:    s_max_i32 s1, s0, -1
2860; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2861; GFX6-NEXT:    s_sub_i32 s1, s1, s2
2862; GFX6-NEXT:    s_min_i32 s2, s0, -1
2863; GFX6-NEXT:    s_sub_i32 s2, s2, s3
2864; GFX6-NEXT:    v_max_i32_e32 v1, s1, v1
2865; GFX6-NEXT:    v_min_i32_e32 v1, s2, v1
2866; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s0, v1
2867; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
2868; GFX6-NEXT:    s_mov_b32 s0, 0xffff
2869; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2870; GFX6-NEXT:    v_and_b32_e32 v1, s0, v1
2871; GFX6-NEXT:    v_and_b32_e32 v0, s0, v0
2872; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2873; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
2874; GFX6-NEXT:    ; return to shader part epilog
2875;
2876; GFX8-LABEL: ssubsat_v2i16_sv:
2877; GFX8:       ; %bb.0:
2878; GFX8-NEXT:    s_sext_i32_i16 s4, s0
2879; GFX8-NEXT:    s_sext_i32_i16 s5, -1
2880; GFX8-NEXT:    s_movk_i32 s2, 0x7fff
2881; GFX8-NEXT:    s_max_i32 s6, s4, s5
2882; GFX8-NEXT:    s_movk_i32 s3, 0x8000
2883; GFX8-NEXT:    s_sub_i32 s6, s6, s2
2884; GFX8-NEXT:    s_min_i32 s4, s4, s5
2885; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
2886; GFX8-NEXT:    s_sub_i32 s4, s4, s3
2887; GFX8-NEXT:    v_max_i16_e32 v1, s6, v0
2888; GFX8-NEXT:    v_min_i16_e32 v1, s4, v1
2889; GFX8-NEXT:    s_sext_i32_i16 s4, s1
2890; GFX8-NEXT:    s_max_i32 s6, s4, s5
2891; GFX8-NEXT:    s_sub_i32 s2, s6, s2
2892; GFX8-NEXT:    s_min_i32 s4, s4, s5
2893; GFX8-NEXT:    v_mov_b32_e32 v2, s2
2894; GFX8-NEXT:    s_sub_i32 s3, s4, s3
2895; GFX8-NEXT:    v_max_i16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2896; GFX8-NEXT:    v_min_i16_e32 v0, s3, v0
2897; GFX8-NEXT:    v_mov_b32_e32 v2, s1
2898; GFX8-NEXT:    v_sub_u16_e32 v1, s0, v1
2899; GFX8-NEXT:    v_sub_u16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2900; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
2901; GFX8-NEXT:    ; return to shader part epilog
2902;
2903; GFX9-LABEL: ssubsat_v2i16_sv:
2904; GFX9:       ; %bb.0:
2905; GFX9-NEXT:    v_pk_sub_i16 v0, s0, v0 clamp
2906; GFX9-NEXT:    ; return to shader part epilog
2907;
2908; GFX10-LABEL: ssubsat_v2i16_sv:
2909; GFX10:       ; %bb.0:
2910; GFX10-NEXT:    v_pk_sub_i16 v0, s0, v0 clamp
2911; GFX10-NEXT:    ; return to shader part epilog
2912  %result = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
2913  %cast = bitcast <2 x i16> %result to float
2914  ret float %cast
2915}
2916
; Mixed-operand <2 x i16> saturating subtract with the operand kinds swapped:
; lhs is a plain argument (VGPRs in the checks) and rhs is inreg (SGPRs in the
; checks). The result is bitcast to float for the return, and the checks show
; it in v0.
;
; Expected code per the autogenerated checks below:
;  - GFX6 checks: elements are handled separately (lhs in v0/v1, rhs in s0/s1).
;    The clamp bounds depend on lhs, so the expansion is VALU code; rhs is
;    only shifted with scalar ops and read as the SGPR source of v_max_i32.
;    The two results are shifted back, masked with 0xffff and packed into v0.
;  - GFX8 checks: operands are packed. The high halves are extracted with
;    v_lshrrev_b32 (lhs) and s_lshr_b32 (rhs); the high result is written with
;    an SDWA subtract (dst_sel:WORD_1) and merged with v_or_b32.
;  - GFX9 and GFX10 checks: a single v_pk_sub_i16 with clamp, s0 as the second
;    source operand.
2917define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
2918; GFX6-LABEL: ssubsat_v2i16_vs:
2919; GFX6:       ; %bb.0:
2920; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2921; GFX6-NEXT:    s_brev_b32 s2, -2
2922; GFX6-NEXT:    v_max_i32_e32 v2, -1, v0
2923; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
2924; GFX6-NEXT:    s_brev_b32 s3, 1
2925; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s2, v2
2926; GFX6-NEXT:    v_min_i32_e32 v3, -1, v0
2927; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s3, v3
2928; GFX6-NEXT:    v_max_i32_e32 v2, s0, v2
2929; GFX6-NEXT:    v_min_i32_e32 v2, v2, v3
2930; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2931; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
2932; GFX6-NEXT:    v_max_i32_e32 v2, -1, v1
2933; GFX6-NEXT:    s_lshl_b32 s0, s1, 16
2934; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s2, v2
2935; GFX6-NEXT:    v_min_i32_e32 v3, -1, v1
2936; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s3, v3
2937; GFX6-NEXT:    v_max_i32_e32 v2, s0, v2
2938; GFX6-NEXT:    v_min_i32_e32 v2, v2, v3
2939; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
2940; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
2941; GFX6-NEXT:    s_mov_b32 s0, 0xffff
2942; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2943; GFX6-NEXT:    v_and_b32_e32 v1, s0, v1
2944; GFX6-NEXT:    v_and_b32_e32 v0, s0, v0
2945; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2946; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
2947; GFX6-NEXT:    ; return to shader part epilog
2948;
2949; GFX8-LABEL: ssubsat_v2i16_vs:
2950; GFX8:       ; %bb.0:
2951; GFX8-NEXT:    s_movk_i32 s2, 0x7fff
2952; GFX8-NEXT:    v_max_i16_e32 v2, -1, v0
2953; GFX8-NEXT:    s_movk_i32 s3, 0x8000
2954; GFX8-NEXT:    v_subrev_u16_e32 v2, s2, v2
2955; GFX8-NEXT:    v_min_i16_e32 v3, -1, v0
2956; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
2957; GFX8-NEXT:    v_subrev_u16_e32 v3, s3, v3
2958; GFX8-NEXT:    v_max_i16_e32 v2, s0, v2
2959; GFX8-NEXT:    v_min_i16_e32 v2, v2, v3
2960; GFX8-NEXT:    v_max_i16_e32 v3, -1, v1
2961; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
2962; GFX8-NEXT:    v_subrev_u16_e32 v3, s2, v3
2963; GFX8-NEXT:    v_min_i16_e32 v4, -1, v1
2964; GFX8-NEXT:    v_subrev_u16_e32 v4, s3, v4
2965; GFX8-NEXT:    v_max_i16_e32 v3, s1, v3
2966; GFX8-NEXT:    v_min_i16_e32 v3, v3, v4
2967; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v2
2968; GFX8-NEXT:    v_sub_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2969; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
2970; GFX8-NEXT:    ; return to shader part epilog
2971;
2972; GFX9-LABEL: ssubsat_v2i16_vs:
2973; GFX9:       ; %bb.0:
2974; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s0 clamp
2975; GFX9-NEXT:    ; return to shader part epilog
2976;
2977; GFX10-LABEL: ssubsat_v2i16_vs:
2978; GFX10:       ; %bb.0:
2979; GFX10-NEXT:    v_pk_sub_i16 v0, v0, s0 clamp
2980; GFX10-NEXT:    ; return to shader part epilog
2981  %result = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
2982  %cast = bitcast <2 x i16> %result to float
2983  ret float %cast
2984}
2985
; FIXME: v3i16 insert/extract; the two tests below stay commented out until
; that is resolved.
; define <3 x i16> @v_ssubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
;   %result = call <3 x i16> @llvm.ssub.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
;   ret <3 x i16> %result
; }

; define amdgpu_ps <3 x i16> @s_ssubsat_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs) {
;   %result = call <3 x i16> @llvm.ssub.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
;   ret <3 x i16> %result
; }
2996
; Signed saturating subtraction of two <4 x i16> vectors in a function with
; the default calling convention; every operand in the expected output is a
; VGPR and the function returns through s_setpc_b64.
; GFX6 and GFX8 are expected to clamp each element with a max/min sequence
; before subtracting, while GFX9 and GFX10 are expected to use two
; v_pk_sub_i16 instructions with the clamp modifier.
define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
; GFX6-LABEL: v_ssubsat_v4i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT:    s_brev_b32 s4, -2
; GFX6-NEXT:    v_max_i32_e32 v8, -1, v0
; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
; GFX6-NEXT:    s_brev_b32 s5, 1
; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, s4, v8
; GFX6-NEXT:    v_min_i32_e32 v10, -1, v0
; GFX6-NEXT:    v_subrev_i32_e32 v10, vcc, s5, v10
; GFX6-NEXT:    v_max_i32_e32 v4, v8, v4
; GFX6-NEXT:    v_min_i32_e32 v4, v4, v10
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
; GFX6-NEXT:    v_max_i32_e32 v5, -1, v1
; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s4, v5
; GFX6-NEXT:    v_min_i32_e32 v8, -1, v1
; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, s5, v8
; GFX6-NEXT:    v_max_i32_e32 v4, v5, v4
; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT:    v_bfrev_b32_e32 v9, -2
; GFX6-NEXT:    v_min_i32_e32 v4, v4, v8
; GFX6-NEXT:    v_max_i32_e32 v5, -1, v2
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v9
; GFX6-NEXT:    v_min_i32_e32 v6, -1, v2
; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, s5, v6
; GFX6-NEXT:    v_max_i32_e32 v4, v5, v4
; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT:    v_min_i32_e32 v4, v4, v6
; GFX6-NEXT:    v_max_i32_e32 v5, -1, v3
; GFX6-NEXT:    v_bfrev_b32_e32 v11, 1
; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v4
; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v9
; GFX6-NEXT:    v_min_i32_e32 v6, -1, v3
; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v11
; GFX6-NEXT:    v_max_i32_e32 v4, v5, v4
; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
; GFX6-NEXT:    v_min_i32_e32 v4, v4, v6
; GFX6-NEXT:    s_mov_b32 s4, 0xffff
; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v4
; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX6-NEXT:    v_and_b32_e32 v1, s4, v2
; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ssubsat_v4i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
; GFX8-NEXT:    v_max_i16_e32 v6, -1, v0
; GFX8-NEXT:    s_movk_i32 s5, 0x8000
; GFX8-NEXT:    v_subrev_u16_e32 v6, s4, v6
; GFX8-NEXT:    v_min_i16_e32 v7, -1, v0
; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
; GFX8-NEXT:    v_subrev_u16_e32 v7, s5, v7
; GFX8-NEXT:    v_max_i16_e32 v6, v6, v2
; GFX8-NEXT:    v_min_i16_e32 v6, v6, v7
; GFX8-NEXT:    v_max_i16_e32 v7, -1, v4
; GFX8-NEXT:    v_subrev_u16_e32 v7, s4, v7
; GFX8-NEXT:    v_min_i16_e32 v8, -1, v4
; GFX8-NEXT:    v_subrev_u16_e32 v8, s5, v8
; GFX8-NEXT:    v_max_i16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT:    v_max_i16_e32 v7, -1, v1
; GFX8-NEXT:    v_min_i16_e32 v2, v2, v8
; GFX8-NEXT:    v_subrev_u16_e32 v7, s4, v7
; GFX8-NEXT:    v_min_i16_e32 v8, -1, v1
; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; GFX8-NEXT:    v_subrev_u16_e32 v8, s5, v8
; GFX8-NEXT:    v_max_i16_e32 v7, v7, v3
; GFX8-NEXT:    v_min_i16_e32 v7, v7, v8
; GFX8-NEXT:    v_max_i16_e32 v8, -1, v5
; GFX8-NEXT:    v_subrev_u16_e32 v8, s4, v8
; GFX8-NEXT:    v_min_i16_e32 v9, -1, v5
; GFX8-NEXT:    v_subrev_u16_e32 v9, s5, v9
; GFX8-NEXT:    v_max_i16_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT:    v_min_i16_e32 v3, v3, v9
; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v6
; GFX8-NEXT:    v_sub_u16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
; GFX8-NEXT:    v_sub_u16_e32 v1, v1, v7
; GFX8-NEXT:    v_sub_u16_sdwa v2, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_ssubsat_v4i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v2 clamp
; GFX9-NEXT:    v_pk_sub_i16 v1, v1, v3 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_ssubsat_v4i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_pk_sub_i16 v0, v0, v2 clamp
; GFX10-NEXT:    v_pk_sub_i16 v1, v1, v3 clamp
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
  ; Return the <4 x i16> result reinterpreted as two 32-bit float values.
  %cast = bitcast <4 x i16> %result to <2 x float>
  ret <2 x float> %cast
}
3113
; Signed saturating subtraction of two <4 x i16> vectors in an amdgpu_ps
; function where both operands are marked inreg; the expected output reads
; both operands from SGPRs and leaves the result in s0 and s1.
; GFX6 and GFX8 are expected to stay on scalar instructions, clamping each
; element with s_max_i32/s_min_i32. GFX9 and GFX10 are expected to use
; v_pk_sub_i16 with the clamp modifier and move the result back to SGPRs
; with v_readfirstlane_b32.
define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs) {
; GFX6-LABEL: s_ssubsat_v4i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
; GFX6-NEXT:    s_brev_b32 s8, -2
; GFX6-NEXT:    s_max_i32 s10, s0, -1
; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
; GFX6-NEXT:    s_brev_b32 s9, 1
; GFX6-NEXT:    s_sub_i32 s10, s10, s8
; GFX6-NEXT:    s_min_i32 s11, s0, -1
; GFX6-NEXT:    s_sub_i32 s11, s11, s9
; GFX6-NEXT:    s_max_i32 s4, s10, s4
; GFX6-NEXT:    s_min_i32 s4, s4, s11
; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
; GFX6-NEXT:    s_sub_i32 s0, s0, s4
; GFX6-NEXT:    s_lshl_b32 s4, s5, 16
; GFX6-NEXT:    s_max_i32 s5, s1, -1
; GFX6-NEXT:    s_sub_i32 s5, s5, s8
; GFX6-NEXT:    s_min_i32 s10, s1, -1
; GFX6-NEXT:    s_sub_i32 s10, s10, s9
; GFX6-NEXT:    s_max_i32 s4, s5, s4
; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
; GFX6-NEXT:    s_min_i32 s4, s4, s10
; GFX6-NEXT:    s_max_i32 s5, s2, -1
; GFX6-NEXT:    s_sub_i32 s1, s1, s4
; GFX6-NEXT:    s_lshl_b32 s4, s6, 16
; GFX6-NEXT:    s_sub_i32 s5, s5, s8
; GFX6-NEXT:    s_min_i32 s6, s2, -1
; GFX6-NEXT:    s_sub_i32 s6, s6, s9
; GFX6-NEXT:    s_max_i32 s4, s5, s4
; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
; GFX6-NEXT:    s_min_i32 s4, s4, s6
; GFX6-NEXT:    s_max_i32 s5, s3, -1
; GFX6-NEXT:    s_sub_i32 s2, s2, s4
; GFX6-NEXT:    s_lshl_b32 s4, s7, 16
; GFX6-NEXT:    s_sub_i32 s5, s5, s8
; GFX6-NEXT:    s_min_i32 s6, s3, -1
; GFX6-NEXT:    s_sub_i32 s6, s6, s9
; GFX6-NEXT:    s_max_i32 s4, s5, s4
; GFX6-NEXT:    s_min_i32 s4, s4, s6
; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
; GFX6-NEXT:    s_sub_i32 s3, s3, s4
; GFX6-NEXT:    s_mov_b32 s4, 0xffff
; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
; GFX6-NEXT:    s_and_b32 s1, s1, s4
; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
; GFX6-NEXT:    s_and_b32 s0, s0, s4
; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
; GFX6-NEXT:    s_or_b32 s0, s0, s1
; GFX6-NEXT:    s_and_b32 s1, s2, s4
; GFX6-NEXT:    s_and_b32 s2, s3, s4
; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
; GFX6-NEXT:    s_or_b32 s1, s1, s2
; GFX6-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_ssubsat_v4i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_sext_i32_i16 s10, s0
; GFX8-NEXT:    s_sext_i32_i16 s11, -1
; GFX8-NEXT:    s_movk_i32 s8, 0x7fff
; GFX8-NEXT:    s_max_i32 s12, s10, s11
; GFX8-NEXT:    s_sub_i32 s12, s12, s8
; GFX8-NEXT:    s_lshr_b32 s6, s2, 16
; GFX8-NEXT:    s_movk_i32 s9, 0x8000
; GFX8-NEXT:    s_min_i32 s10, s10, s11
; GFX8-NEXT:    s_sext_i32_i16 s12, s12
; GFX8-NEXT:    s_sext_i32_i16 s2, s2
; GFX8-NEXT:    s_sub_i32 s10, s10, s9
; GFX8-NEXT:    s_max_i32 s2, s12, s2
; GFX8-NEXT:    s_sext_i32_i16 s2, s2
; GFX8-NEXT:    s_sext_i32_i16 s10, s10
; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
; GFX8-NEXT:    s_min_i32 s2, s2, s10
; GFX8-NEXT:    s_sub_i32 s0, s0, s2
; GFX8-NEXT:    s_sext_i32_i16 s2, s4
; GFX8-NEXT:    s_max_i32 s10, s2, s11
; GFX8-NEXT:    s_sub_i32 s10, s10, s8
; GFX8-NEXT:    s_min_i32 s2, s2, s11
; GFX8-NEXT:    s_sext_i32_i16 s10, s10
; GFX8-NEXT:    s_sext_i32_i16 s6, s6
; GFX8-NEXT:    s_sub_i32 s2, s2, s9
; GFX8-NEXT:    s_max_i32 s6, s10, s6
; GFX8-NEXT:    s_sext_i32_i16 s6, s6
; GFX8-NEXT:    s_sext_i32_i16 s2, s2
; GFX8-NEXT:    s_min_i32 s2, s6, s2
; GFX8-NEXT:    s_sub_i32 s2, s4, s2
; GFX8-NEXT:    s_sext_i32_i16 s4, s1
; GFX8-NEXT:    s_max_i32 s6, s4, s11
; GFX8-NEXT:    s_sub_i32 s6, s6, s8
; GFX8-NEXT:    s_lshr_b32 s7, s3, 16
; GFX8-NEXT:    s_min_i32 s4, s4, s11
; GFX8-NEXT:    s_sext_i32_i16 s6, s6
; GFX8-NEXT:    s_sext_i32_i16 s3, s3
; GFX8-NEXT:    s_sub_i32 s4, s4, s9
; GFX8-NEXT:    s_max_i32 s3, s6, s3
; GFX8-NEXT:    s_sext_i32_i16 s3, s3
; GFX8-NEXT:    s_sext_i32_i16 s4, s4
; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
; GFX8-NEXT:    s_min_i32 s3, s3, s4
; GFX8-NEXT:    s_sub_i32 s1, s1, s3
; GFX8-NEXT:    s_sext_i32_i16 s3, s5
; GFX8-NEXT:    s_max_i32 s4, s3, s11
; GFX8-NEXT:    s_sub_i32 s4, s4, s8
; GFX8-NEXT:    s_min_i32 s3, s3, s11
; GFX8-NEXT:    s_sext_i32_i16 s4, s4
; GFX8-NEXT:    s_sext_i32_i16 s6, s7
; GFX8-NEXT:    s_sub_i32 s3, s3, s9
; GFX8-NEXT:    s_max_i32 s4, s4, s6
; GFX8-NEXT:    s_sext_i32_i16 s4, s4
; GFX8-NEXT:    s_sext_i32_i16 s3, s3
; GFX8-NEXT:    s_min_i32 s3, s4, s3
; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
; GFX8-NEXT:    s_sub_i32 s3, s5, s3
; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
; GFX8-NEXT:    s_or_b32 s0, s0, s2
; GFX8-NEXT:    s_bfe_u32 s2, s3, 0x100000
; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
; GFX8-NEXT:    s_or_b32 s1, s1, s2
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_ssubsat_v4i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_pk_sub_i16 v0, s0, v0 clamp
; GFX9-NEXT:    v_pk_sub_i16 v1, s1, v1 clamp
; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_ssubsat_v4i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_pk_sub_i16 v0, s0, s2 clamp
; GFX10-NEXT:    v_pk_sub_i16 v1, s1, s3 clamp
; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
; GFX10-NEXT:    ; return to shader part epilog
  %result = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
  ; Return the <4 x i16> result reinterpreted as two 32-bit integers.
  %cast = bitcast <4 x i16> %result to <2 x i32>
  ret <2 x i32> %cast
}
3258
; FIXME: The two v5i16 tests below are commented out and still need to be
; enabled.
; define <5 x i16> @v_ssubsat_v5i16(<5 x i16> %lhs, <5 x i16> %rhs) {
;   %result = call <5 x i16> @llvm.ssub.sat.v5i16(<5 x i16> %lhs, <5 x i16> %rhs)
;   ret <5 x i16> %result
; }

; define amdgpu_ps <5 x i16> @s_ssubsat_v5i16(<5 x i16> inreg %lhs, <5 x i16> inreg %rhs) {
;   %result = call <5 x i16> @llvm.ssub.sat.v5i16(<5 x i16> %lhs, <5 x i16> %rhs)
;   ret <5 x i16> %result
; }
3269
; Signed saturating subtraction of two <6 x i16> vectors in a function with
; the default calling convention; every operand in the expected output is a
; VGPR and the function returns through s_setpc_b64.
; GFX6 and GFX8 are expected to clamp each element with a max/min sequence
; before subtracting, while GFX9 and GFX10 are expected to use three
; v_pk_sub_i16 instructions with the clamp modifier.
define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
; GFX6-LABEL: v_ssubsat_v6i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT:    s_brev_b32 s4, -2
; GFX6-NEXT:    v_max_i32_e32 v12, -1, v0
; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
; GFX6-NEXT:    s_brev_b32 s5, 1
; GFX6-NEXT:    v_subrev_i32_e32 v12, vcc, s4, v12
; GFX6-NEXT:    v_min_i32_e32 v14, -1, v0
; GFX6-NEXT:    v_subrev_i32_e32 v14, vcc, s5, v14
; GFX6-NEXT:    v_max_i32_e32 v6, v12, v6
; GFX6-NEXT:    v_min_i32_e32 v6, v6, v14
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
; GFX6-NEXT:    v_max_i32_e32 v7, -1, v1
; GFX6-NEXT:    v_subrev_i32_e32 v7, vcc, s4, v7
; GFX6-NEXT:    v_min_i32_e32 v12, -1, v1
; GFX6-NEXT:    v_subrev_i32_e32 v12, vcc, s5, v12
; GFX6-NEXT:    v_max_i32_e32 v6, v7, v6
; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT:    v_bfrev_b32_e32 v13, -2
; GFX6-NEXT:    v_min_i32_e32 v6, v6, v12
; GFX6-NEXT:    v_max_i32_e32 v7, -1, v2
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v6
; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v8
; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v13
; GFX6-NEXT:    v_min_i32_e32 v8, -1, v2
; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, s5, v8
; GFX6-NEXT:    v_max_i32_e32 v6, v7, v6
; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT:    v_min_i32_e32 v6, v6, v8
; GFX6-NEXT:    v_max_i32_e32 v7, -1, v3
; GFX6-NEXT:    v_bfrev_b32_e32 v15, 1
; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v9
; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v13
; GFX6-NEXT:    v_min_i32_e32 v8, -1, v3
; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v8, v15
; GFX6-NEXT:    v_max_i32_e32 v6, v7, v6
; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
; GFX6-NEXT:    v_min_i32_e32 v6, v6, v8
; GFX6-NEXT:    v_max_i32_e32 v7, -1, v4
; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v6
; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v10
; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v13
; GFX6-NEXT:    v_min_i32_e32 v8, -1, v4
; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v8, v15
; GFX6-NEXT:    v_max_i32_e32 v6, v7, v6
; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
; GFX6-NEXT:    v_min_i32_e32 v6, v6, v8
; GFX6-NEXT:    v_max_i32_e32 v7, -1, v5
; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v6
; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v13
; GFX6-NEXT:    v_min_i32_e32 v8, -1, v5
; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v8, v15
; GFX6-NEXT:    v_max_i32_e32 v6, v7, v6
; GFX6-NEXT:    s_mov_b32 s4, 0xffff
; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
; GFX6-NEXT:    v_min_i32_e32 v6, v6, v8
; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v6
; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 16, v5
; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX6-NEXT:    v_and_b32_e32 v1, s4, v2
; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 16, v4
; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT:    v_and_b32_e32 v3, s4, v5
; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX6-NEXT:    v_and_b32_e32 v2, s4, v4
; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ssubsat_v6i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
; GFX8-NEXT:    v_max_i16_e32 v9, -1, v0
; GFX8-NEXT:    s_movk_i32 s5, 0x8000
; GFX8-NEXT:    v_subrev_u16_e32 v9, s4, v9
; GFX8-NEXT:    v_min_i16_e32 v11, -1, v0
; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
; GFX8-NEXT:    v_subrev_u16_e32 v11, s5, v11
; GFX8-NEXT:    v_max_i16_e32 v9, v9, v3
; GFX8-NEXT:    v_min_i16_e32 v9, v9, v11
; GFX8-NEXT:    v_max_i16_e32 v11, -1, v6
; GFX8-NEXT:    v_subrev_u16_e32 v11, s4, v11
; GFX8-NEXT:    v_min_i16_e32 v13, -1, v6
; GFX8-NEXT:    v_subrev_u16_e32 v13, s5, v13
; GFX8-NEXT:    v_max_i16_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT:    v_max_i16_e32 v11, -1, v1
; GFX8-NEXT:    v_min_i16_e32 v3, v3, v13
; GFX8-NEXT:    v_subrev_u16_e32 v11, s4, v11
; GFX8-NEXT:    v_min_i16_e32 v13, -1, v1
; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
; GFX8-NEXT:    v_subrev_u16_e32 v13, s5, v13
; GFX8-NEXT:    v_max_i16_e32 v11, v11, v4
; GFX8-NEXT:    v_min_i16_e32 v11, v11, v13
; GFX8-NEXT:    v_max_i16_e32 v13, -1, v7
; GFX8-NEXT:    v_subrev_u16_e32 v13, s4, v13
; GFX8-NEXT:    v_min_i16_e32 v14, -1, v7
; GFX8-NEXT:    v_mov_b32_e32 v10, 0x7fff
; GFX8-NEXT:    v_subrev_u16_e32 v14, s5, v14
; GFX8-NEXT:    v_max_i16_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT:    v_max_i16_e32 v13, -1, v2
; GFX8-NEXT:    v_mov_b32_e32 v12, 0xffff8000
; GFX8-NEXT:    v_min_i16_e32 v4, v4, v14
; GFX8-NEXT:    v_sub_u16_e32 v13, v13, v10
; GFX8-NEXT:    v_min_i16_e32 v14, -1, v2
; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
; GFX8-NEXT:    v_sub_u16_e32 v14, v14, v12
; GFX8-NEXT:    v_max_i16_e32 v13, v13, v5
; GFX8-NEXT:    v_min_i16_e32 v13, v13, v14
; GFX8-NEXT:    v_max_i16_e32 v14, -1, v8
; GFX8-NEXT:    v_sub_u16_e32 v10, v14, v10
; GFX8-NEXT:    v_min_i16_e32 v14, -1, v8
; GFX8-NEXT:    v_sub_u16_e32 v12, v14, v12
; GFX8-NEXT:    v_max_i16_sdwa v5, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v9
; GFX8-NEXT:    v_sub_u16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT:    v_min_i16_e32 v5, v5, v12
; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
; GFX8-NEXT:    v_sub_u16_e32 v1, v1, v11
; GFX8-NEXT:    v_sub_u16_sdwa v3, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
; GFX8-NEXT:    v_sub_u16_e32 v2, v2, v13
; GFX8-NEXT:    v_sub_u16_sdwa v3, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_ssubsat_v6i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v3 clamp
; GFX9-NEXT:    v_pk_sub_i16 v1, v1, v4 clamp
; GFX9-NEXT:    v_pk_sub_i16 v2, v2, v5 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_ssubsat_v6i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_pk_sub_i16 v0, v0, v3 clamp
; GFX10-NEXT:    v_pk_sub_i16 v1, v1, v4 clamp
; GFX10-NEXT:    v_pk_sub_i16 v2, v2, v5 clamp
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %result = call <6 x i16> @llvm.ssub.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs)
  ; Return the <6 x i16> result reinterpreted as three 32-bit float values.
  %cast = bitcast <6 x i16> %result to <3 x float>
  ret <3 x float> %cast
}
3430
; Signed saturating subtraction of two <6 x i16> vectors in an amdgpu_ps
; function where both operands are marked inreg; the expected output reads
; both operands from SGPRs and leaves the result in s0, s1 and s2.
; GFX6 and GFX8 are expected to stay on scalar instructions, clamping each
; element with s_max_i32/s_min_i32. GFX9 and GFX10 are expected to use
; v_pk_sub_i16 with the clamp modifier and move the result back to SGPRs
; with v_readfirstlane_b32.
define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inreg %rhs) {
; GFX6-LABEL: s_ssubsat_v6i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
; GFX6-NEXT:    s_brev_b32 s12, -2
; GFX6-NEXT:    s_max_i32 s14, s0, -1
; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
; GFX6-NEXT:    s_brev_b32 s13, 1
; GFX6-NEXT:    s_sub_i32 s14, s14, s12
; GFX6-NEXT:    s_min_i32 s15, s0, -1
; GFX6-NEXT:    s_sub_i32 s15, s15, s13
; GFX6-NEXT:    s_max_i32 s6, s14, s6
; GFX6-NEXT:    s_min_i32 s6, s6, s15
; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
; GFX6-NEXT:    s_sub_i32 s0, s0, s6
; GFX6-NEXT:    s_lshl_b32 s6, s7, 16
; GFX6-NEXT:    s_max_i32 s7, s1, -1
; GFX6-NEXT:    s_sub_i32 s7, s7, s12
; GFX6-NEXT:    s_min_i32 s14, s1, -1
; GFX6-NEXT:    s_sub_i32 s14, s14, s13
; GFX6-NEXT:    s_max_i32 s6, s7, s6
; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
; GFX6-NEXT:    s_min_i32 s6, s6, s14
; GFX6-NEXT:    s_max_i32 s7, s2, -1
; GFX6-NEXT:    s_sub_i32 s1, s1, s6
; GFX6-NEXT:    s_lshl_b32 s6, s8, 16
; GFX6-NEXT:    s_sub_i32 s7, s7, s12
; GFX6-NEXT:    s_min_i32 s8, s2, -1
; GFX6-NEXT:    s_sub_i32 s8, s8, s13
; GFX6-NEXT:    s_max_i32 s6, s7, s6
; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
; GFX6-NEXT:    s_min_i32 s6, s6, s8
; GFX6-NEXT:    s_max_i32 s7, s3, -1
; GFX6-NEXT:    s_sub_i32 s2, s2, s6
; GFX6-NEXT:    s_lshl_b32 s6, s9, 16
; GFX6-NEXT:    s_sub_i32 s7, s7, s12
; GFX6-NEXT:    s_min_i32 s8, s3, -1
; GFX6-NEXT:    s_sub_i32 s8, s8, s13
; GFX6-NEXT:    s_max_i32 s6, s7, s6
; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
; GFX6-NEXT:    s_min_i32 s6, s6, s8
; GFX6-NEXT:    s_max_i32 s7, s4, -1
; GFX6-NEXT:    s_sub_i32 s3, s3, s6
; GFX6-NEXT:    s_lshl_b32 s6, s10, 16
; GFX6-NEXT:    s_sub_i32 s7, s7, s12
; GFX6-NEXT:    s_min_i32 s8, s4, -1
; GFX6-NEXT:    s_sub_i32 s8, s8, s13
; GFX6-NEXT:    s_max_i32 s6, s7, s6
; GFX6-NEXT:    s_lshl_b32 s5, s5, 16
; GFX6-NEXT:    s_min_i32 s6, s6, s8
; GFX6-NEXT:    s_max_i32 s7, s5, -1
; GFX6-NEXT:    s_sub_i32 s4, s4, s6
; GFX6-NEXT:    s_lshl_b32 s6, s11, 16
; GFX6-NEXT:    s_sub_i32 s7, s7, s12
; GFX6-NEXT:    s_min_i32 s8, s5, -1
; GFX6-NEXT:    s_sub_i32 s8, s8, s13
; GFX6-NEXT:    s_max_i32 s6, s7, s6
; GFX6-NEXT:    s_min_i32 s6, s6, s8
; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
; GFX6-NEXT:    s_sub_i32 s5, s5, s6
; GFX6-NEXT:    s_mov_b32 s6, 0xffff
; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
; GFX6-NEXT:    s_and_b32 s1, s1, s6
; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
; GFX6-NEXT:    s_and_b32 s0, s0, s6
; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
; GFX6-NEXT:    s_or_b32 s0, s0, s1
; GFX6-NEXT:    s_and_b32 s1, s2, s6
; GFX6-NEXT:    s_and_b32 s2, s3, s6
; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
; GFX6-NEXT:    s_and_b32 s3, s5, s6
; GFX6-NEXT:    s_or_b32 s1, s1, s2
; GFX6-NEXT:    s_and_b32 s2, s4, s6
; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
; GFX6-NEXT:    s_or_b32 s2, s2, s3
; GFX6-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_ssubsat_v6i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_sext_i32_i16 s14, s0
; GFX8-NEXT:    s_sext_i32_i16 s15, -1
; GFX8-NEXT:    s_movk_i32 s12, 0x7fff
; GFX8-NEXT:    s_max_i32 s16, s14, s15
; GFX8-NEXT:    s_sub_i32 s16, s16, s12
; GFX8-NEXT:    s_lshr_b32 s9, s3, 16
; GFX8-NEXT:    s_movk_i32 s13, 0x8000
; GFX8-NEXT:    s_min_i32 s14, s14, s15
; GFX8-NEXT:    s_sext_i32_i16 s16, s16
; GFX8-NEXT:    s_sext_i32_i16 s3, s3
; GFX8-NEXT:    s_sub_i32 s14, s14, s13
; GFX8-NEXT:    s_max_i32 s3, s16, s3
; GFX8-NEXT:    s_sext_i32_i16 s3, s3
; GFX8-NEXT:    s_sext_i32_i16 s14, s14
; GFX8-NEXT:    s_lshr_b32 s6, s0, 16
; GFX8-NEXT:    s_min_i32 s3, s3, s14
; GFX8-NEXT:    s_sub_i32 s0, s0, s3
; GFX8-NEXT:    s_sext_i32_i16 s3, s6
; GFX8-NEXT:    s_max_i32 s14, s3, s15
; GFX8-NEXT:    s_sub_i32 s14, s14, s12
; GFX8-NEXT:    s_min_i32 s3, s3, s15
; GFX8-NEXT:    s_sext_i32_i16 s14, s14
; GFX8-NEXT:    s_sext_i32_i16 s9, s9
; GFX8-NEXT:    s_sub_i32 s3, s3, s13
; GFX8-NEXT:    s_max_i32 s9, s14, s9
; GFX8-NEXT:    s_sext_i32_i16 s9, s9
; GFX8-NEXT:    s_sext_i32_i16 s3, s3
; GFX8-NEXT:    s_min_i32 s3, s9, s3
; GFX8-NEXT:    s_sub_i32 s3, s6, s3
; GFX8-NEXT:    s_sext_i32_i16 s6, s1
; GFX8-NEXT:    s_max_i32 s9, s6, s15
; GFX8-NEXT:    s_sub_i32 s9, s9, s12
; GFX8-NEXT:    s_lshr_b32 s10, s4, 16
; GFX8-NEXT:    s_min_i32 s6, s6, s15
; GFX8-NEXT:    s_sext_i32_i16 s9, s9
; GFX8-NEXT:    s_sext_i32_i16 s4, s4
; GFX8-NEXT:    s_sub_i32 s6, s6, s13
; GFX8-NEXT:    s_max_i32 s4, s9, s4
; GFX8-NEXT:    s_sext_i32_i16 s4, s4
; GFX8-NEXT:    s_sext_i32_i16 s6, s6
; GFX8-NEXT:    s_lshr_b32 s7, s1, 16
; GFX8-NEXT:    s_min_i32 s4, s4, s6
; GFX8-NEXT:    s_sub_i32 s1, s1, s4
; GFX8-NEXT:    s_sext_i32_i16 s4, s7
; GFX8-NEXT:    s_max_i32 s6, s4, s15
; GFX8-NEXT:    s_sub_i32 s6, s6, s12
; GFX8-NEXT:    s_min_i32 s4, s4, s15
; GFX8-NEXT:    s_sext_i32_i16 s6, s6
; GFX8-NEXT:    s_sext_i32_i16 s9, s10
; GFX8-NEXT:    s_sub_i32 s4, s4, s13
; GFX8-NEXT:    s_max_i32 s6, s6, s9
; GFX8-NEXT:    s_sext_i32_i16 s6, s6
; GFX8-NEXT:    s_sext_i32_i16 s4, s4
; GFX8-NEXT:    s_min_i32 s4, s6, s4
; GFX8-NEXT:    s_sext_i32_i16 s6, s2
; GFX8-NEXT:    s_sub_i32 s4, s7, s4
; GFX8-NEXT:    s_max_i32 s7, s6, s15
; GFX8-NEXT:    s_sub_i32 s7, s7, s12
; GFX8-NEXT:    s_lshr_b32 s11, s5, 16
; GFX8-NEXT:    s_min_i32 s6, s6, s15
; GFX8-NEXT:    s_sext_i32_i16 s7, s7
; GFX8-NEXT:    s_sext_i32_i16 s5, s5
; GFX8-NEXT:    s_sub_i32 s6, s6, s13
; GFX8-NEXT:    s_max_i32 s5, s7, s5
; GFX8-NEXT:    s_sext_i32_i16 s5, s5
; GFX8-NEXT:    s_sext_i32_i16 s6, s6
; GFX8-NEXT:    s_lshr_b32 s8, s2, 16
; GFX8-NEXT:    s_min_i32 s5, s5, s6
; GFX8-NEXT:    s_sub_i32 s2, s2, s5
; GFX8-NEXT:    s_sext_i32_i16 s5, s8
; GFX8-NEXT:    s_max_i32 s6, s5, s15
; GFX8-NEXT:    s_sub_i32 s6, s6, s12
; GFX8-NEXT:    s_min_i32 s5, s5, s15
; GFX8-NEXT:    s_sext_i32_i16 s6, s6
; GFX8-NEXT:    s_sext_i32_i16 s7, s11
; GFX8-NEXT:    s_sub_i32 s5, s5, s13
; GFX8-NEXT:    s_max_i32 s6, s6, s7
; GFX8-NEXT:    s_bfe_u32 s3, s3, 0x100000
; GFX8-NEXT:    s_sext_i32_i16 s6, s6
; GFX8-NEXT:    s_sext_i32_i16 s5, s5
; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
; GFX8-NEXT:    s_lshl_b32 s3, s3, 16
; GFX8-NEXT:    s_min_i32 s5, s6, s5
; GFX8-NEXT:    s_or_b32 s0, s0, s3
; GFX8-NEXT:    s_bfe_u32 s3, s4, 0x100000
; GFX8-NEXT:    s_sub_i32 s5, s8, s5
; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
; GFX8-NEXT:    s_lshl_b32 s3, s3, 16
; GFX8-NEXT:    s_or_b32 s1, s1, s3
; GFX8-NEXT:    s_bfe_u32 s3, s5, 0x100000
; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
; GFX8-NEXT:    s_lshl_b32 s3, s3, 16
; GFX8-NEXT:    s_or_b32 s2, s2, s3
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_ssubsat_v6i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v0, s3
; GFX9-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-NEXT:    v_mov_b32_e32 v2, s5
; GFX9-NEXT:    v_pk_sub_i16 v0, s0, v0 clamp
; GFX9-NEXT:    v_pk_sub_i16 v1, s1, v1 clamp
; GFX9-NEXT:    v_pk_sub_i16 v2, s2, v2 clamp
; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: s_ssubsat_v6i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_pk_sub_i16 v0, s0, s3 clamp
; GFX10-NEXT:    v_pk_sub_i16 v1, s1, s4 clamp
; GFX10-NEXT:    v_pk_sub_i16 v2, s2, s5 clamp
; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
; GFX10-NEXT:    ; return to shader part epilog
  %result = call <6 x i16> @llvm.ssub.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs)
  ; Return the <6 x i16> result reinterpreted as three 32-bit integers.
  %cast = bitcast <6 x i16> %result to <3 x i32>
  ret <3 x i32> %cast
}
3634
; Signed saturating subtract of two <8 x i16> vectors passed as ordinary
; (non-inreg) arguments, with the result bitcast to <4 x float> for the return.
; The GFX6 and GFX8 checks expect each element to be handled separately by
; clamping the rhs with a max/min pair and then subtracting, while the GFX9 and
; GFX10 checks expect one v_pk_sub_i16 with clamp per 32-bit register.
3635define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
3636; GFX6-LABEL: v_ssubsat_v8i16:
3637; GFX6:       ; %bb.0:
3638; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3639; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
3640; GFX6-NEXT:    s_brev_b32 s4, -2
3641; GFX6-NEXT:    v_max_i32_e32 v16, -1, v0
3642; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
3643; GFX6-NEXT:    s_brev_b32 s5, 1
3644; GFX6-NEXT:    v_subrev_i32_e32 v16, vcc, s4, v16
3645; GFX6-NEXT:    v_min_i32_e32 v18, -1, v0
3646; GFX6-NEXT:    v_subrev_i32_e32 v18, vcc, s5, v18
3647; GFX6-NEXT:    v_max_i32_e32 v8, v16, v8
3648; GFX6-NEXT:    v_min_i32_e32 v8, v8, v18
3649; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3650; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
3651; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
3652; GFX6-NEXT:    v_max_i32_e32 v9, -1, v1
3653; GFX6-NEXT:    v_subrev_i32_e32 v9, vcc, s4, v9
3654; GFX6-NEXT:    v_min_i32_e32 v16, -1, v1
3655; GFX6-NEXT:    v_subrev_i32_e32 v16, vcc, s5, v16
3656; GFX6-NEXT:    v_max_i32_e32 v8, v9, v8
3657; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3658; GFX6-NEXT:    v_bfrev_b32_e32 v17, -2
3659; GFX6-NEXT:    v_min_i32_e32 v8, v8, v16
3660; GFX6-NEXT:    v_max_i32_e32 v9, -1, v2
3661; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v8
3662; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
3663; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v17
3664; GFX6-NEXT:    v_min_i32_e32 v10, -1, v2
3665; GFX6-NEXT:    v_subrev_i32_e32 v10, vcc, s5, v10
3666; GFX6-NEXT:    v_max_i32_e32 v8, v9, v8
3667; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3668; GFX6-NEXT:    v_min_i32_e32 v8, v8, v10
3669; GFX6-NEXT:    v_max_i32_e32 v9, -1, v3
3670; GFX6-NEXT:    v_bfrev_b32_e32 v19, 1
3671; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
3672; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
3673; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v17
3674; GFX6-NEXT:    v_min_i32_e32 v10, -1, v3
3675; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v19
3676; GFX6-NEXT:    v_max_i32_e32 v8, v9, v8
3677; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
3678; GFX6-NEXT:    v_min_i32_e32 v8, v8, v10
3679; GFX6-NEXT:    v_max_i32_e32 v9, -1, v4
3680; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v8
3681; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
3682; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v17
3683; GFX6-NEXT:    v_min_i32_e32 v10, -1, v4
3684; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v19
3685; GFX6-NEXT:    v_max_i32_e32 v8, v9, v8
3686; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
3687; GFX6-NEXT:    v_min_i32_e32 v8, v8, v10
3688; GFX6-NEXT:    v_max_i32_e32 v9, -1, v5
3689; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v8
3690; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
3691; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v17
3692; GFX6-NEXT:    v_min_i32_e32 v10, -1, v5
3693; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v19
3694; GFX6-NEXT:    v_max_i32_e32 v8, v9, v8
3695; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
3696; GFX6-NEXT:    v_min_i32_e32 v8, v8, v10
3697; GFX6-NEXT:    v_max_i32_e32 v9, -1, v6
3698; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v8
3699; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v14
3700; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v17
3701; GFX6-NEXT:    v_min_i32_e32 v10, -1, v6
3702; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v19
3703; GFX6-NEXT:    v_max_i32_e32 v8, v9, v8
3704; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
3705; GFX6-NEXT:    v_min_i32_e32 v8, v8, v10
3706; GFX6-NEXT:    v_max_i32_e32 v9, -1, v7
3707; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
3708; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v8
3709; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
3710; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v17
3711; GFX6-NEXT:    v_min_i32_e32 v10, -1, v7
3712; GFX6-NEXT:    s_mov_b32 s4, 0xffff
3713; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
3714; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v19
3715; GFX6-NEXT:    v_max_i32_e32 v8, v9, v8
3716; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
3717; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
3718; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
3719; GFX6-NEXT:    v_min_i32_e32 v8, v8, v10
3720; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
3721; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3722; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 16, v5
3723; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v8
3724; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
3725; GFX6-NEXT:    v_and_b32_e32 v1, s4, v2
3726; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
3727; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 16, v4
3728; GFX6-NEXT:    v_ashrrev_i32_e32 v7, 16, v7
3729; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3730; GFX6-NEXT:    v_and_b32_e32 v3, s4, v5
3731; GFX6-NEXT:    v_ashrrev_i32_e32 v6, 16, v6
3732; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
3733; GFX6-NEXT:    v_and_b32_e32 v2, s4, v4
3734; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3735; GFX6-NEXT:    v_and_b32_e32 v4, s4, v7
3736; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
3737; GFX6-NEXT:    v_and_b32_e32 v3, s4, v6
3738; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
3739; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
3740; GFX6-NEXT:    s_setpc_b64 s[30:31]
3741;
3742; GFX8-LABEL: v_ssubsat_v8i16:
3743; GFX8:       ; %bb.0:
3744; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3745; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
3746; GFX8-NEXT:    v_max_i16_e32 v12, -1, v0
3747; GFX8-NEXT:    s_movk_i32 s5, 0x8000
3748; GFX8-NEXT:    v_subrev_u16_e32 v12, s4, v12
3749; GFX8-NEXT:    v_min_i16_e32 v14, -1, v0
3750; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
3751; GFX8-NEXT:    v_subrev_u16_e32 v14, s5, v14
3752; GFX8-NEXT:    v_max_i16_e32 v12, v12, v4
3753; GFX8-NEXT:    v_min_i16_e32 v12, v12, v14
3754; GFX8-NEXT:    v_max_i16_e32 v14, -1, v8
3755; GFX8-NEXT:    v_subrev_u16_e32 v14, s4, v14
3756; GFX8-NEXT:    v_min_i16_e32 v16, -1, v8
3757; GFX8-NEXT:    v_subrev_u16_e32 v16, s5, v16
3758; GFX8-NEXT:    v_max_i16_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3759; GFX8-NEXT:    v_max_i16_e32 v14, -1, v1
3760; GFX8-NEXT:    v_min_i16_e32 v4, v4, v16
3761; GFX8-NEXT:    v_subrev_u16_e32 v14, s4, v14
3762; GFX8-NEXT:    v_min_i16_e32 v16, -1, v1
3763; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
3764; GFX8-NEXT:    v_subrev_u16_e32 v16, s5, v16
3765; GFX8-NEXT:    v_max_i16_e32 v14, v14, v5
3766; GFX8-NEXT:    v_min_i16_e32 v14, v14, v16
3767; GFX8-NEXT:    v_max_i16_e32 v16, -1, v9
3768; GFX8-NEXT:    v_subrev_u16_e32 v16, s4, v16
3769; GFX8-NEXT:    v_min_i16_e32 v17, -1, v9
3770; GFX8-NEXT:    v_mov_b32_e32 v13, 0x7fff
3771; GFX8-NEXT:    v_subrev_u16_e32 v17, s5, v17
3772; GFX8-NEXT:    v_max_i16_sdwa v5, v16, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3773; GFX8-NEXT:    v_max_i16_e32 v16, -1, v2
3774; GFX8-NEXT:    v_mov_b32_e32 v15, 0xffff8000
3775; GFX8-NEXT:    v_min_i16_e32 v5, v5, v17
3776; GFX8-NEXT:    v_sub_u16_e32 v16, v16, v13
3777; GFX8-NEXT:    v_min_i16_e32 v17, -1, v2
3778; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
3779; GFX8-NEXT:    v_sub_u16_e32 v17, v17, v15
3780; GFX8-NEXT:    v_max_i16_e32 v16, v16, v6
3781; GFX8-NEXT:    v_min_i16_e32 v16, v16, v17
3782; GFX8-NEXT:    v_max_i16_e32 v17, -1, v10
3783; GFX8-NEXT:    v_sub_u16_e32 v17, v17, v13
3784; GFX8-NEXT:    v_min_i16_e32 v18, -1, v10
3785; GFX8-NEXT:    v_sub_u16_e32 v18, v18, v15
3786; GFX8-NEXT:    v_max_i16_sdwa v6, v17, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3787; GFX8-NEXT:    v_max_i16_e32 v17, -1, v3
3788; GFX8-NEXT:    v_min_i16_e32 v6, v6, v18
3789; GFX8-NEXT:    v_sub_u16_e32 v17, v17, v13
3790; GFX8-NEXT:    v_min_i16_e32 v18, -1, v3
3791; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
3792; GFX8-NEXT:    v_sub_u16_e32 v18, v18, v15
3793; GFX8-NEXT:    v_max_i16_e32 v17, v17, v7
3794; GFX8-NEXT:    v_min_i16_e32 v17, v17, v18
3795; GFX8-NEXT:    v_max_i16_e32 v18, -1, v11
3796; GFX8-NEXT:    v_sub_u16_e32 v13, v18, v13
3797; GFX8-NEXT:    v_min_i16_e32 v18, -1, v11
3798; GFX8-NEXT:    v_sub_u16_e32 v0, v0, v12
3799; GFX8-NEXT:    v_sub_u16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3800; GFX8-NEXT:    v_sub_u16_e32 v15, v18, v15
3801; GFX8-NEXT:    v_max_i16_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3802; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
3803; GFX8-NEXT:    v_sub_u16_e32 v1, v1, v14
3804; GFX8-NEXT:    v_sub_u16_sdwa v4, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3805; GFX8-NEXT:    v_min_i16_e32 v7, v7, v15
3806; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
3807; GFX8-NEXT:    v_sub_u16_e32 v2, v2, v16
3808; GFX8-NEXT:    v_sub_u16_sdwa v4, v10, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3809; GFX8-NEXT:    v_or_b32_e32 v2, v2, v4
3810; GFX8-NEXT:    v_sub_u16_e32 v3, v3, v17
3811; GFX8-NEXT:    v_sub_u16_sdwa v4, v11, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3812; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
3813; GFX8-NEXT:    s_setpc_b64 s[30:31]
3814;
3815; GFX9-LABEL: v_ssubsat_v8i16:
3816; GFX9:       ; %bb.0:
3817; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3818; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v4 clamp
3819; GFX9-NEXT:    v_pk_sub_i16 v1, v1, v5 clamp
3820; GFX9-NEXT:    v_pk_sub_i16 v2, v2, v6 clamp
3821; GFX9-NEXT:    v_pk_sub_i16 v3, v3, v7 clamp
3822; GFX9-NEXT:    s_setpc_b64 s[30:31]
3823;
3824; GFX10-LABEL: v_ssubsat_v8i16:
3825; GFX10:       ; %bb.0:
3826; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3827; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3828; GFX10-NEXT:    v_pk_sub_i16 v0, v0, v4 clamp
3829; GFX10-NEXT:    v_pk_sub_i16 v1, v1, v5 clamp
3830; GFX10-NEXT:    v_pk_sub_i16 v2, v2, v6 clamp
3831; GFX10-NEXT:    v_pk_sub_i16 v3, v3, v7 clamp
3832; GFX10-NEXT:    s_setpc_b64 s[30:31]
3833  %result = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
3834  %cast = bitcast <8 x i16> %result to <4 x float>
3835  ret <4 x float> %cast
3836}
3837
; Signed saturating subtract of two <8 x i16> vectors that are both marked
; inreg in an amdgpu_ps function, with the result bitcast to <4 x i32> for the
; return. The GFX6 and GFX8 checks expect the whole clamp-and-subtract sequence
; to use scalar (s_) instructions, while the GFX9 and GFX10 checks expect
; v_pk_sub_i16 with clamp followed by v_readfirstlane_b32 into s0 through s3.
3838define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inreg %rhs) {
3839; GFX6-LABEL: s_ssubsat_v8i16:
3840; GFX6:       ; %bb.0:
3841; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
3842; GFX6-NEXT:    s_brev_b32 s16, -2
3843; GFX6-NEXT:    s_max_i32 s18, s0, -1
3844; GFX6-NEXT:    s_lshl_b32 s8, s8, 16
3845; GFX6-NEXT:    s_brev_b32 s17, 1
3846; GFX6-NEXT:    s_sub_i32 s18, s18, s16
3847; GFX6-NEXT:    s_min_i32 s19, s0, -1
3848; GFX6-NEXT:    s_sub_i32 s19, s19, s17
3849; GFX6-NEXT:    s_max_i32 s8, s18, s8
3850; GFX6-NEXT:    s_min_i32 s8, s8, s19
3851; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
3852; GFX6-NEXT:    s_sub_i32 s0, s0, s8
3853; GFX6-NEXT:    s_lshl_b32 s8, s9, 16
3854; GFX6-NEXT:    s_max_i32 s9, s1, -1
3855; GFX6-NEXT:    s_sub_i32 s9, s9, s16
3856; GFX6-NEXT:    s_min_i32 s18, s1, -1
3857; GFX6-NEXT:    s_sub_i32 s18, s18, s17
3858; GFX6-NEXT:    s_max_i32 s8, s9, s8
3859; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
3860; GFX6-NEXT:    s_min_i32 s8, s8, s18
3861; GFX6-NEXT:    s_max_i32 s9, s2, -1
3862; GFX6-NEXT:    s_sub_i32 s1, s1, s8
3863; GFX6-NEXT:    s_lshl_b32 s8, s10, 16
3864; GFX6-NEXT:    s_sub_i32 s9, s9, s16
3865; GFX6-NEXT:    s_min_i32 s10, s2, -1
3866; GFX6-NEXT:    s_sub_i32 s10, s10, s17
3867; GFX6-NEXT:    s_max_i32 s8, s9, s8
3868; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
3869; GFX6-NEXT:    s_min_i32 s8, s8, s10
3870; GFX6-NEXT:    s_max_i32 s9, s3, -1
3871; GFX6-NEXT:    s_sub_i32 s2, s2, s8
3872; GFX6-NEXT:    s_lshl_b32 s8, s11, 16
3873; GFX6-NEXT:    s_sub_i32 s9, s9, s16
3874; GFX6-NEXT:    s_min_i32 s10, s3, -1
3875; GFX6-NEXT:    s_sub_i32 s10, s10, s17
3876; GFX6-NEXT:    s_max_i32 s8, s9, s8
3877; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
3878; GFX6-NEXT:    s_min_i32 s8, s8, s10
3879; GFX6-NEXT:    s_max_i32 s9, s4, -1
3880; GFX6-NEXT:    s_sub_i32 s3, s3, s8
3881; GFX6-NEXT:    s_lshl_b32 s8, s12, 16
3882; GFX6-NEXT:    s_sub_i32 s9, s9, s16
3883; GFX6-NEXT:    s_min_i32 s10, s4, -1
3884; GFX6-NEXT:    s_sub_i32 s10, s10, s17
3885; GFX6-NEXT:    s_max_i32 s8, s9, s8
3886; GFX6-NEXT:    s_lshl_b32 s5, s5, 16
3887; GFX6-NEXT:    s_min_i32 s8, s8, s10
3888; GFX6-NEXT:    s_max_i32 s9, s5, -1
3889; GFX6-NEXT:    s_sub_i32 s4, s4, s8
3890; GFX6-NEXT:    s_lshl_b32 s8, s13, 16
3891; GFX6-NEXT:    s_sub_i32 s9, s9, s16
3892; GFX6-NEXT:    s_min_i32 s10, s5, -1
3893; GFX6-NEXT:    s_sub_i32 s10, s10, s17
3894; GFX6-NEXT:    s_max_i32 s8, s9, s8
3895; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
3896; GFX6-NEXT:    s_min_i32 s8, s8, s10
3897; GFX6-NEXT:    s_max_i32 s9, s6, -1
3898; GFX6-NEXT:    s_sub_i32 s5, s5, s8
3899; GFX6-NEXT:    s_lshl_b32 s8, s14, 16
3900; GFX6-NEXT:    s_sub_i32 s9, s9, s16
3901; GFX6-NEXT:    s_min_i32 s10, s6, -1
3902; GFX6-NEXT:    s_sub_i32 s10, s10, s17
3903; GFX6-NEXT:    s_max_i32 s8, s9, s8
3904; GFX6-NEXT:    s_lshl_b32 s7, s7, 16
3905; GFX6-NEXT:    s_min_i32 s8, s8, s10
3906; GFX6-NEXT:    s_max_i32 s9, s7, -1
3907; GFX6-NEXT:    s_sub_i32 s6, s6, s8
3908; GFX6-NEXT:    s_lshl_b32 s8, s15, 16
3909; GFX6-NEXT:    s_sub_i32 s9, s9, s16
3910; GFX6-NEXT:    s_min_i32 s10, s7, -1
3911; GFX6-NEXT:    s_sub_i32 s10, s10, s17
3912; GFX6-NEXT:    s_max_i32 s8, s9, s8
3913; GFX6-NEXT:    s_min_i32 s8, s8, s10
3914; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
3915; GFX6-NEXT:    s_sub_i32 s7, s7, s8
3916; GFX6-NEXT:    s_mov_b32 s8, 0xffff
3917; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
3918; GFX6-NEXT:    s_and_b32 s1, s1, s8
3919; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
3920; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
3921; GFX6-NEXT:    s_and_b32 s0, s0, s8
3922; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
3923; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
3924; GFX6-NEXT:    s_or_b32 s0, s0, s1
3925; GFX6-NEXT:    s_and_b32 s1, s2, s8
3926; GFX6-NEXT:    s_and_b32 s2, s3, s8
3927; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
3928; GFX6-NEXT:    s_ashr_i32 s7, s7, 16
3929; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
3930; GFX6-NEXT:    s_and_b32 s3, s5, s8
3931; GFX6-NEXT:    s_ashr_i32 s6, s6, 16
3932; GFX6-NEXT:    s_or_b32 s1, s1, s2
3933; GFX6-NEXT:    s_and_b32 s2, s4, s8
3934; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
3935; GFX6-NEXT:    s_and_b32 s4, s7, s8
3936; GFX6-NEXT:    s_or_b32 s2, s2, s3
3937; GFX6-NEXT:    s_and_b32 s3, s6, s8
3938; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
3939; GFX6-NEXT:    s_or_b32 s3, s3, s4
3940; GFX6-NEXT:    ; return to shader part epilog
3941;
3942; GFX8-LABEL: s_ssubsat_v8i16:
3943; GFX8:       ; %bb.0:
3944; GFX8-NEXT:    s_sext_i32_i16 s18, s0
3945; GFX8-NEXT:    s_sext_i32_i16 s19, -1
3946; GFX8-NEXT:    s_movk_i32 s16, 0x7fff
3947; GFX8-NEXT:    s_max_i32 s20, s18, s19
3948; GFX8-NEXT:    s_sub_i32 s20, s20, s16
3949; GFX8-NEXT:    s_lshr_b32 s12, s4, 16
3950; GFX8-NEXT:    s_movk_i32 s17, 0x8000
3951; GFX8-NEXT:    s_min_i32 s18, s18, s19
3952; GFX8-NEXT:    s_sext_i32_i16 s20, s20
3953; GFX8-NEXT:    s_sext_i32_i16 s4, s4
3954; GFX8-NEXT:    s_sub_i32 s18, s18, s17
3955; GFX8-NEXT:    s_max_i32 s4, s20, s4
3956; GFX8-NEXT:    s_sext_i32_i16 s4, s4
3957; GFX8-NEXT:    s_sext_i32_i16 s18, s18
3958; GFX8-NEXT:    s_lshr_b32 s8, s0, 16
3959; GFX8-NEXT:    s_min_i32 s4, s4, s18
3960; GFX8-NEXT:    s_sub_i32 s0, s0, s4
3961; GFX8-NEXT:    s_sext_i32_i16 s4, s8
3962; GFX8-NEXT:    s_max_i32 s18, s4, s19
3963; GFX8-NEXT:    s_sub_i32 s18, s18, s16
3964; GFX8-NEXT:    s_min_i32 s4, s4, s19
3965; GFX8-NEXT:    s_sext_i32_i16 s18, s18
3966; GFX8-NEXT:    s_sext_i32_i16 s12, s12
3967; GFX8-NEXT:    s_sub_i32 s4, s4, s17
3968; GFX8-NEXT:    s_max_i32 s12, s18, s12
3969; GFX8-NEXT:    s_sext_i32_i16 s12, s12
3970; GFX8-NEXT:    s_sext_i32_i16 s4, s4
3971; GFX8-NEXT:    s_min_i32 s4, s12, s4
3972; GFX8-NEXT:    s_sub_i32 s4, s8, s4
3973; GFX8-NEXT:    s_sext_i32_i16 s8, s1
3974; GFX8-NEXT:    s_max_i32 s12, s8, s19
3975; GFX8-NEXT:    s_sub_i32 s12, s12, s16
3976; GFX8-NEXT:    s_lshr_b32 s13, s5, 16
3977; GFX8-NEXT:    s_min_i32 s8, s8, s19
3978; GFX8-NEXT:    s_sext_i32_i16 s12, s12
3979; GFX8-NEXT:    s_sext_i32_i16 s5, s5
3980; GFX8-NEXT:    s_sub_i32 s8, s8, s17
3981; GFX8-NEXT:    s_max_i32 s5, s12, s5
3982; GFX8-NEXT:    s_sext_i32_i16 s5, s5
3983; GFX8-NEXT:    s_sext_i32_i16 s8, s8
3984; GFX8-NEXT:    s_lshr_b32 s9, s1, 16
3985; GFX8-NEXT:    s_min_i32 s5, s5, s8
3986; GFX8-NEXT:    s_sub_i32 s1, s1, s5
3987; GFX8-NEXT:    s_sext_i32_i16 s5, s9
3988; GFX8-NEXT:    s_max_i32 s8, s5, s19
3989; GFX8-NEXT:    s_sub_i32 s8, s8, s16
3990; GFX8-NEXT:    s_min_i32 s5, s5, s19
3991; GFX8-NEXT:    s_sext_i32_i16 s8, s8
3992; GFX8-NEXT:    s_sext_i32_i16 s12, s13
3993; GFX8-NEXT:    s_sub_i32 s5, s5, s17
3994; GFX8-NEXT:    s_max_i32 s8, s8, s12
3995; GFX8-NEXT:    s_sext_i32_i16 s8, s8
3996; GFX8-NEXT:    s_sext_i32_i16 s5, s5
3997; GFX8-NEXT:    s_min_i32 s5, s8, s5
3998; GFX8-NEXT:    s_sext_i32_i16 s8, s2
3999; GFX8-NEXT:    s_sub_i32 s5, s9, s5
4000; GFX8-NEXT:    s_max_i32 s9, s8, s19
4001; GFX8-NEXT:    s_sub_i32 s9, s9, s16
4002; GFX8-NEXT:    s_lshr_b32 s14, s6, 16
4003; GFX8-NEXT:    s_min_i32 s8, s8, s19
4004; GFX8-NEXT:    s_sext_i32_i16 s9, s9
4005; GFX8-NEXT:    s_sext_i32_i16 s6, s6
4006; GFX8-NEXT:    s_sub_i32 s8, s8, s17
4007; GFX8-NEXT:    s_max_i32 s6, s9, s6
4008; GFX8-NEXT:    s_sext_i32_i16 s6, s6
4009; GFX8-NEXT:    s_sext_i32_i16 s8, s8
4010; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
4011; GFX8-NEXT:    s_min_i32 s6, s6, s8
4012; GFX8-NEXT:    s_sub_i32 s2, s2, s6
4013; GFX8-NEXT:    s_sext_i32_i16 s6, s10
4014; GFX8-NEXT:    s_max_i32 s8, s6, s19
4015; GFX8-NEXT:    s_sub_i32 s8, s8, s16
4016; GFX8-NEXT:    s_min_i32 s6, s6, s19
4017; GFX8-NEXT:    s_sext_i32_i16 s8, s8
4018; GFX8-NEXT:    s_sext_i32_i16 s9, s14
4019; GFX8-NEXT:    s_sub_i32 s6, s6, s17
4020; GFX8-NEXT:    s_max_i32 s8, s8, s9
4021; GFX8-NEXT:    s_sext_i32_i16 s8, s8
4022; GFX8-NEXT:    s_sext_i32_i16 s6, s6
4023; GFX8-NEXT:    s_min_i32 s6, s8, s6
4024; GFX8-NEXT:    s_sext_i32_i16 s8, s3
4025; GFX8-NEXT:    s_max_i32 s9, s8, s19
4026; GFX8-NEXT:    s_sub_i32 s9, s9, s16
4027; GFX8-NEXT:    s_lshr_b32 s15, s7, 16
4028; GFX8-NEXT:    s_min_i32 s8, s8, s19
4029; GFX8-NEXT:    s_sext_i32_i16 s9, s9
4030; GFX8-NEXT:    s_sext_i32_i16 s7, s7
4031; GFX8-NEXT:    s_sub_i32 s8, s8, s17
4032; GFX8-NEXT:    s_max_i32 s7, s9, s7
4033; GFX8-NEXT:    s_sext_i32_i16 s7, s7
4034; GFX8-NEXT:    s_sext_i32_i16 s8, s8
4035; GFX8-NEXT:    s_lshr_b32 s11, s3, 16
4036; GFX8-NEXT:    s_min_i32 s7, s7, s8
4037; GFX8-NEXT:    s_sub_i32 s3, s3, s7
4038; GFX8-NEXT:    s_sext_i32_i16 s7, s11
4039; GFX8-NEXT:    s_max_i32 s8, s7, s19
4040; GFX8-NEXT:    s_sub_i32 s8, s8, s16
4041; GFX8-NEXT:    s_bfe_u32 s4, s4, 0x100000
4042; GFX8-NEXT:    s_min_i32 s7, s7, s19
4043; GFX8-NEXT:    s_sext_i32_i16 s8, s8
4044; GFX8-NEXT:    s_sext_i32_i16 s9, s15
4045; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
4046; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
4047; GFX8-NEXT:    s_sub_i32 s7, s7, s17
4048; GFX8-NEXT:    s_max_i32 s8, s8, s9
4049; GFX8-NEXT:    s_or_b32 s0, s0, s4
4050; GFX8-NEXT:    s_bfe_u32 s4, s5, 0x100000
4051; GFX8-NEXT:    s_sub_i32 s6, s10, s6
4052; GFX8-NEXT:    s_sext_i32_i16 s8, s8
4053; GFX8-NEXT:    s_sext_i32_i16 s7, s7
4054; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
4055; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
4056; GFX8-NEXT:    s_min_i32 s7, s8, s7
4057; GFX8-NEXT:    s_or_b32 s1, s1, s4
4058; GFX8-NEXT:    s_bfe_u32 s4, s6, 0x100000
4059; GFX8-NEXT:    s_sub_i32 s7, s11, s7
4060; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
4061; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
4062; GFX8-NEXT:    s_or_b32 s2, s2, s4
4063; GFX8-NEXT:    s_bfe_u32 s4, s7, 0x100000
4064; GFX8-NEXT:    s_bfe_u32 s3, s3, 0x100000
4065; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
4066; GFX8-NEXT:    s_or_b32 s3, s3, s4
4067; GFX8-NEXT:    ; return to shader part epilog
4068;
4069; GFX9-LABEL: s_ssubsat_v8i16:
4070; GFX9:       ; %bb.0:
4071; GFX9-NEXT:    v_mov_b32_e32 v0, s4
4072; GFX9-NEXT:    v_mov_b32_e32 v1, s5
4073; GFX9-NEXT:    v_mov_b32_e32 v2, s6
4074; GFX9-NEXT:    v_mov_b32_e32 v3, s7
4075; GFX9-NEXT:    v_pk_sub_i16 v0, s0, v0 clamp
4076; GFX9-NEXT:    v_pk_sub_i16 v1, s1, v1 clamp
4077; GFX9-NEXT:    v_pk_sub_i16 v2, s2, v2 clamp
4078; GFX9-NEXT:    v_pk_sub_i16 v3, s3, v3 clamp
4079; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
4080; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
4081; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
4082; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
4083; GFX9-NEXT:    ; return to shader part epilog
4084;
4085; GFX10-LABEL: s_ssubsat_v8i16:
4086; GFX10:       ; %bb.0:
4087; GFX10-NEXT:    v_pk_sub_i16 v0, s0, s4 clamp
4088; GFX10-NEXT:    v_pk_sub_i16 v1, s1, s5 clamp
4089; GFX10-NEXT:    v_pk_sub_i16 v2, s2, s6 clamp
4090; GFX10-NEXT:    v_pk_sub_i16 v3, s3, s7 clamp
4091; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
4092; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
4093; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
4094; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
4095; GFX10-NEXT:    ; return to shader part epilog
4096  %result = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
4097  %cast = bitcast <8 x i16> %result to <4 x i32>
4098  ret <4 x i32> %cast
4099}
4100
4101; FIXME: The i48 tests below are disabled because i48 is broken (i48 add is broken).
4102; define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) {
4103;   %result = call i48 @llvm.ssub.sat.i48(i48 %lhs, i48 %rhs)
4104;   ret i48 %result
4105; }
4106
4107; define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
4108;   %result = call i48 @llvm.ssub.sat.i48(i48 %lhs, i48 %rhs)
4109;   ret i48 %result
4110; }
4111
4112; define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
4113;   %result = call i48 @llvm.ssub.sat.i48(i48 %lhs, i48 %rhs)
4114;   %ext.result = zext i48 %result to i64
4115;   %cast = bitcast i64 %ext.result to <2 x float>
4116;   ret <2 x float> %cast
4117; }
4118
4119; define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
4120;   %result = call i48 @llvm.ssub.sat.i48(i48 %lhs, i48 %rhs)
4121;   %ext.result = zext i48 %result to i64
4122;   %cast = bitcast i64 %ext.result to <2 x float>
4123;   ret <2 x float> %cast
4124; }
4125
; Signed saturating subtract of two i64 values passed as ordinary (non-inreg)
; arguments. For every target the checks expect a 64-bit subtract with borrow,
; two signed 64-bit compares combined with xor into a select mask, and a pair
; of v_cndmask_b32 choosing between the raw difference and a replacement value
; built from the arithmetic-shifted high half of the difference.
4126define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
4127; GFX6-LABEL: v_ssubsat_i64:
4128; GFX6:       ; %bb.0:
4129; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4130; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v0, v2
4131; GFX6-NEXT:    v_subb_u32_e32 v5, vcc, v1, v3, vcc
4132; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
4133; GFX6-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[2:3]
4134; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
4135; GFX6-NEXT:    v_bfrev_b32_e32 v1, 1
4136; GFX6-NEXT:    v_add_i32_e64 v2, s[6:7], 0, v0
4137; GFX6-NEXT:    v_addc_u32_e64 v1, s[6:7], v0, v1, s[6:7]
4138; GFX6-NEXT:    s_xor_b64 vcc, s[4:5], vcc
4139; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
4140; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
4141; GFX6-NEXT:    s_setpc_b64 s[30:31]
4142;
4143; GFX8-LABEL: v_ssubsat_i64:
4144; GFX8:       ; %bb.0:
4145; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4146; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v0, v2
4147; GFX8-NEXT:    v_subb_u32_e32 v5, vcc, v1, v3, vcc
4148; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
4149; GFX8-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[2:3]
4150; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
4151; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
4152; GFX8-NEXT:    v_add_u32_e64 v2, s[6:7], 0, v0
4153; GFX8-NEXT:    v_addc_u32_e64 v1, s[6:7], v0, v1, s[6:7]
4154; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
4155; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
4156; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
4157; GFX8-NEXT:    s_setpc_b64 s[30:31]
4158;
4159; GFX9-LABEL: v_ssubsat_i64:
4160; GFX9:       ; %bb.0:
4161; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4162; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v0, v2
4163; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
4164; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
4165; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[2:3]
4166; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
4167; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
4168; GFX9-NEXT:    v_add_co_u32_e64 v2, s[6:7], 0, v0
4169; GFX9-NEXT:    v_addc_co_u32_e64 v1, s[6:7], v0, v1, s[6:7]
4170; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
4171; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
4172; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
4173; GFX9-NEXT:    s_setpc_b64 s[30:31]
4174;
4175; GFX10-LABEL: v_ssubsat_i64:
4176; GFX10:       ; %bb.0:
4177; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4178; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4179; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v0, v2
4180; GFX10-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
4181; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
4182; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
4183; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, v[4:5], v[0:1]
4184; GFX10-NEXT:    v_add_co_u32 v0, s5, v6, 0
4185; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s5, 0x80000000, v6, s5
4186; GFX10-NEXT:    s_xor_b32 vcc_lo, vcc_lo, s4
4187; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
4188; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
4189; GFX10-NEXT:    s_setpc_b64 s[30:31]
4190  %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs)
4191  ret i64 %result
4192}
4193
; Signed saturating subtract of two i64 values that are both marked inreg in an
; amdgpu_ps function. The checks expect the subtract and the replacement value
; to be computed with scalar s_sub_u32/s_subb_u32 and s_add_u32/s_addc_u32,
; the two 64-bit compares to be done with v_cmp instructions, and the final
; select to go through v_cndmask_b32 followed by v_readfirstlane_b32 into s0
; and s1.
4194define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
4195; GFX6-LABEL: s_ssubsat_i64:
4196; GFX6:       ; %bb.0:
4197; GFX6-NEXT:    s_sub_u32 s4, s0, s2
4198; GFX6-NEXT:    s_cselect_b32 s5, 1, 0
4199; GFX6-NEXT:    s_and_b32 s5, s5, 1
4200; GFX6-NEXT:    s_cmp_lg_u32 s5, 0
4201; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4202; GFX6-NEXT:    s_subb_u32 s5, s1, s3
4203; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4204; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4205; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
4206; GFX6-NEXT:    s_ashr_i32 s2, s5, 31
4207; GFX6-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4208; GFX6-NEXT:    s_add_u32 s0, s2, 0
4209; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
4210; GFX6-NEXT:    s_and_b32 s1, s1, 1
4211; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
4212; GFX6-NEXT:    s_addc_u32 s1, s2, 0x80000000
4213; GFX6-NEXT:    v_mov_b32_e32 v0, s4
4214; GFX6-NEXT:    v_mov_b32_e32 v1, s0
4215; GFX6-NEXT:    v_mov_b32_e32 v2, s1
4216; GFX6-NEXT:    v_mov_b32_e32 v3, s5
4217; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
4218; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
4219; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
4220; GFX6-NEXT:    v_readfirstlane_b32 s1, v1
4221; GFX6-NEXT:    ; return to shader part epilog
4222;
4223; GFX8-LABEL: s_ssubsat_i64:
4224; GFX8:       ; %bb.0:
4225; GFX8-NEXT:    s_sub_u32 s4, s0, s2
4226; GFX8-NEXT:    s_cselect_b32 s5, 1, 0
4227; GFX8-NEXT:    s_and_b32 s5, s5, 1
4228; GFX8-NEXT:    s_cmp_lg_u32 s5, 0
4229; GFX8-NEXT:    v_mov_b32_e32 v0, s0
4230; GFX8-NEXT:    s_subb_u32 s5, s1, s3
4231; GFX8-NEXT:    v_mov_b32_e32 v1, s1
4232; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4233; GFX8-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
4234; GFX8-NEXT:    s_ashr_i32 s2, s5, 31
4235; GFX8-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4236; GFX8-NEXT:    s_add_u32 s0, s2, 0
4237; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
4238; GFX8-NEXT:    s_and_b32 s1, s1, 1
4239; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
4240; GFX8-NEXT:    s_addc_u32 s1, s2, 0x80000000
4241; GFX8-NEXT:    v_mov_b32_e32 v0, s4
4242; GFX8-NEXT:    v_mov_b32_e32 v1, s0
4243; GFX8-NEXT:    v_mov_b32_e32 v2, s1
4244; GFX8-NEXT:    v_mov_b32_e32 v3, s5
4245; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
4246; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
4247; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
4248; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
4249; GFX8-NEXT:    ; return to shader part epilog
4250;
4251; GFX9-LABEL: s_ssubsat_i64:
4252; GFX9:       ; %bb.0:
4253; GFX9-NEXT:    s_sub_u32 s4, s0, s2
4254; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
4255; GFX9-NEXT:    s_and_b32 s5, s5, 1
4256; GFX9-NEXT:    s_cmp_lg_u32 s5, 0
4257; GFX9-NEXT:    v_mov_b32_e32 v0, s0
4258; GFX9-NEXT:    s_subb_u32 s5, s1, s3
4259; GFX9-NEXT:    v_mov_b32_e32 v1, s1
4260; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4261; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
4262; GFX9-NEXT:    s_ashr_i32 s2, s5, 31
4263; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4264; GFX9-NEXT:    s_add_u32 s0, s2, 0
4265; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
4266; GFX9-NEXT:    s_and_b32 s1, s1, 1
4267; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
4268; GFX9-NEXT:    s_addc_u32 s1, s2, 0x80000000
4269; GFX9-NEXT:    v_mov_b32_e32 v0, s4
4270; GFX9-NEXT:    v_mov_b32_e32 v1, s0
4271; GFX9-NEXT:    v_mov_b32_e32 v2, s1
4272; GFX9-NEXT:    v_mov_b32_e32 v3, s5
4273; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
4274; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
4275; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
4276; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
4277; GFX9-NEXT:    ; return to shader part epilog
4278;
4279; GFX10-LABEL: s_ssubsat_i64:
4280; GFX10:       ; %bb.0:
4281; GFX10-NEXT:    s_sub_u32 s4, s0, s2
4282; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
4283; GFX10-NEXT:    v_mov_b32_e32 v0, s4
4284; GFX10-NEXT:    s_and_b32 s5, s5, 1
4285; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
4286; GFX10-NEXT:    s_subb_u32 s5, s1, s3
4287; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
4288; GFX10-NEXT:    v_cmp_gt_i64_e64 s1, s[2:3], 0
4289; GFX10-NEXT:    s_ashr_i32 s2, s5, 31
4290; GFX10-NEXT:    v_mov_b32_e32 v1, s5
4291; GFX10-NEXT:    s_xor_b32 s3, s1, s0
4292; GFX10-NEXT:    s_add_u32 s0, s2, 0
4293; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
4294; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s0, s3
4295; GFX10-NEXT:    s_and_b32 s1, s1, 1
4296; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
4297; GFX10-NEXT:    s_addc_u32 s1, s2, 0x80000000
4298; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
4299; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s1, s3
4300; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
4301; GFX10-NEXT:    ; return to shader part epilog
4302  %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs)
4303  ret i64 %result
4304}
4305
; Mixed-operand variant of the i64 signed saturating subtract in an amdgpu_ps
; function where only the lhs is marked inreg and the rhs is an ordinary
; argument. The result is bitcast to <2 x float> for the return. The checks
; expect the lhs to be read from s0 and s1 and the rhs from v0 and v1, with the
; subtract, compares and select all done with vector instructions.
4306define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
4307; GFX6-LABEL: ssubsat_i64_sv:
4308; GFX6:       ; %bb.0:
4309; GFX6-NEXT:    v_mov_b32_e32 v3, s1
4310; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s0, v0
4311; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v3, v1, vcc
4312; GFX6-NEXT:    v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3]
4313; GFX6-NEXT:    v_cmp_lt_i64_e64 s[0:1], 0, v[0:1]
4314; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4315; GFX6-NEXT:    v_bfrev_b32_e32 v1, 1
4316; GFX6-NEXT:    v_add_i32_e64 v4, s[2:3], 0, v0
4317; GFX6-NEXT:    v_addc_u32_e64 v1, s[2:3], v0, v1, s[2:3]
4318; GFX6-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4319; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
4320; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4321; GFX6-NEXT:    ; return to shader part epilog
4322;
4323; GFX8-LABEL: ssubsat_i64_sv:
4324; GFX8:       ; %bb.0:
4325; GFX8-NEXT:    v_mov_b32_e32 v3, s1
4326; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v0
4327; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v3, v1, vcc
4328; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3]
4329; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], 0, v[0:1]
4330; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4331; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
4332; GFX8-NEXT:    v_add_u32_e64 v4, s[2:3], 0, v0
4333; GFX8-NEXT:    v_addc_u32_e64 v1, s[2:3], v0, v1, s[2:3]
4334; GFX8-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4335; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
4336; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4337; GFX8-NEXT:    ; return to shader part epilog
4338;
4339; GFX9-LABEL: ssubsat_i64_sv:
4340; GFX9:       ; %bb.0:
4341; GFX9-NEXT:    v_mov_b32_e32 v3, s1
4342; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s0, v0
4343; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v1, vcc
4344; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3]
4345; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], 0, v[0:1]
4346; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4347; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
4348; GFX9-NEXT:    v_add_co_u32_e64 v4, s[2:3], 0, v0
4349; GFX9-NEXT:    v_addc_co_u32_e64 v1, s[2:3], v0, v1, s[2:3]
4350; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4351; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
4352; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4353; GFX9-NEXT:    ; return to shader part epilog
4354;
4355; GFX10-LABEL: ssubsat_i64_sv:
4356; GFX10:       ; %bb.0:
4357; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, s0, v0
4358; GFX10-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4359; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, 0, v[0:1]
4360; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
4361; GFX10-NEXT:    v_cmp_gt_i64_e64 s0, s[0:1], v[2:3]
4362; GFX10-NEXT:    v_add_co_u32 v0, s1, v4, 0
4363; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s1, 0x80000000, v4, s1
4364; GFX10-NEXT:    s_xor_b32 vcc_lo, vcc_lo, s0
4365; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
4366; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
4367; GFX10-NEXT:    ; return to shader part epilog
4368  %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs)
4369  %cast = bitcast i64 %result to <2 x float>
4370  ret <2 x float> %cast
4371}
4372
; ssubsat_i64_vs: llvm.ssub.sat.i64 in an amdgpu_ps function where %lhs is a
; plain argument and %rhs is marked inreg. The generated checks below read
; %lhs from v[0:1] and %rhs from s[0:1], and leave the result in v0/v1.
; In the checked sequences, overflow is computed as (diff < lhs) xor (rhs > 0);
; when it is set, the difference is replaced by the 64-bit sign fill of the
; difference (arithmetic shift right by 31 of the high dword) plus
; 0x8000000000000000.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
4373define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
4374; GFX6-LABEL: ssubsat_i64_vs:
4375; GFX6:       ; %bb.0:
4376; GFX6-NEXT:    v_mov_b32_e32 v3, s1
4377; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s0, v0
4378; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v1, v3, vcc
4379; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1]
4380; GFX6-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[0:1], 0
4381; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4382; GFX6-NEXT:    v_bfrev_b32_e32 v1, 1
4383; GFX6-NEXT:    v_add_i32_e64 v4, s[0:1], 0, v0
4384; GFX6-NEXT:    v_addc_u32_e64 v1, s[0:1], v0, v1, s[0:1]
4385; GFX6-NEXT:    s_xor_b64 vcc, s[2:3], vcc
4386; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
4387; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4388; GFX6-NEXT:    ; return to shader part epilog
4389;
4390; GFX8-LABEL: ssubsat_i64_vs:
4391; GFX8:       ; %bb.0:
4392; GFX8-NEXT:    v_mov_b32_e32 v3, s1
4393; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s0, v0
4394; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v1, v3, vcc
4395; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1]
4396; GFX8-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[0:1], 0
4397; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4398; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
4399; GFX8-NEXT:    v_add_u32_e64 v4, s[0:1], 0, v0
4400; GFX8-NEXT:    v_addc_u32_e64 v1, s[0:1], v0, v1, s[0:1]
4401; GFX8-NEXT:    s_xor_b64 vcc, s[2:3], vcc
4402; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
4403; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4404; GFX8-NEXT:    ; return to shader part epilog
4405;
4406; GFX9-LABEL: ssubsat_i64_vs:
4407; GFX9:       ; %bb.0:
4408; GFX9-NEXT:    v_mov_b32_e32 v3, s1
4409; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v0
4410; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
4411; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1]
4412; GFX9-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[0:1], 0
4413; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4414; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
4415; GFX9-NEXT:    v_add_co_u32_e64 v4, s[0:1], 0, v0
4416; GFX9-NEXT:    v_addc_co_u32_e64 v1, s[0:1], v0, v1, s[0:1]
4417; GFX9-NEXT:    s_xor_b64 vcc, s[2:3], vcc
4418; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
4419; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4420; GFX9-NEXT:    ; return to shader part epilog
4421;
4422; GFX10-LABEL: ssubsat_i64_vs:
4423; GFX10:       ; %bb.0:
4424; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v0, s0
4425; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4426; GFX10-NEXT:    v_cmp_gt_i64_e64 s1, s[0:1], 0
4427; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
4428; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
4429; GFX10-NEXT:    v_add_co_u32 v0, s0, v4, 0
4430; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, 0x80000000, v4, s0
4431; GFX10-NEXT:    s_xor_b32 vcc_lo, s1, vcc_lo
4432; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
4433; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
4434; GFX10-NEXT:    ; return to shader part epilog
; NOTE(review): the bitcast of the i64 result to <2 x float> presumably exists
; so the shader returns the value in VGPRs - confirm against the amdgpu_ps
; return convention.
4435  %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs)
4436  %cast = bitcast i64 %result to <2 x float>
4437  ret <2 x float> %cast
4438}
4439
; v_ssubsat_v2i64: llvm.ssub.sat.v2i64 in a function with the default calling
; convention; both vector operands arrive in VGPRs (v[0:3] and v[4:7] in the
; checks below) and the checked code returns with s_setpc_b64 s[30:31].
; The checks show each 64-bit element handled on its own: a sub/subb pair, a
; (diff < lhs) compare, a (0 < rhs) compare, an xor of the two conditions and
; a pair of selects against the saturated value. In the first three check
; groups the 0x80000000 high-word constant is materialized once
; (v_bfrev_b32 v10, 1) and reused for both elements; the last group uses a
; literal 0x80000000 operand instead.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
4440define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
4441; GFX6-LABEL: v_ssubsat_v2i64:
4442; GFX6:       ; %bb.0:
4443; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4444; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v0, v4
4445; GFX6-NEXT:    v_subb_u32_e32 v9, vcc, v1, v5, vcc
4446; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1]
4447; GFX6-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[4:5]
4448; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v9
4449; GFX6-NEXT:    v_bfrev_b32_e32 v10, 1
4450; GFX6-NEXT:    v_add_i32_e64 v1, s[6:7], 0, v0
4451; GFX6-NEXT:    v_addc_u32_e64 v4, s[6:7], v0, v10, s[6:7]
4452; GFX6-NEXT:    s_xor_b64 vcc, s[4:5], vcc
4453; GFX6-NEXT:    v_cndmask_b32_e32 v0, v8, v1, vcc
4454; GFX6-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
4455; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v2, v6
4456; GFX6-NEXT:    v_subb_u32_e32 v5, vcc, v3, v7, vcc
4457; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3]
4458; GFX6-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[6:7]
4459; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v5
4460; GFX6-NEXT:    v_add_i32_e64 v3, s[6:7], 0, v2
4461; GFX6-NEXT:    v_addc_u32_e64 v6, s[6:7], v2, v10, s[6:7]
4462; GFX6-NEXT:    s_xor_b64 vcc, s[4:5], vcc
4463; GFX6-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
4464; GFX6-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
4465; GFX6-NEXT:    s_setpc_b64 s[30:31]
4466;
4467; GFX8-LABEL: v_ssubsat_v2i64:
4468; GFX8:       ; %bb.0:
4469; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4470; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, v0, v4
4471; GFX8-NEXT:    v_subb_u32_e32 v9, vcc, v1, v5, vcc
4472; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1]
4473; GFX8-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[4:5]
4474; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v9
4475; GFX8-NEXT:    v_bfrev_b32_e32 v10, 1
4476; GFX8-NEXT:    v_add_u32_e64 v1, s[6:7], 0, v0
4477; GFX8-NEXT:    v_addc_u32_e64 v4, s[6:7], v0, v10, s[6:7]
4478; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
4479; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v1, vcc
4480; GFX8-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
4481; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v2, v6
4482; GFX8-NEXT:    v_subb_u32_e32 v5, vcc, v3, v7, vcc
4483; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3]
4484; GFX8-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[6:7]
4485; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v5
4486; GFX8-NEXT:    v_add_u32_e64 v3, s[6:7], 0, v2
4487; GFX8-NEXT:    v_addc_u32_e64 v6, s[6:7], v2, v10, s[6:7]
4488; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
4489; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
4490; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
4491; GFX8-NEXT:    s_setpc_b64 s[30:31]
4492;
4493; GFX9-LABEL: v_ssubsat_v2i64:
4494; GFX9:       ; %bb.0:
4495; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4496; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, v0, v4
4497; GFX9-NEXT:    v_subb_co_u32_e32 v9, vcc, v1, v5, vcc
4498; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1]
4499; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[4:5]
4500; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v9
4501; GFX9-NEXT:    v_bfrev_b32_e32 v10, 1
4502; GFX9-NEXT:    v_add_co_u32_e64 v1, s[6:7], 0, v0
4503; GFX9-NEXT:    v_addc_co_u32_e64 v4, s[6:7], v0, v10, s[6:7]
4504; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
4505; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v1, vcc
4506; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
4507; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, v2, v6
4508; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v3, v7, vcc
4509; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3]
4510; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], 0, v[6:7]
4511; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v5
4512; GFX9-NEXT:    v_add_co_u32_e64 v3, s[6:7], 0, v2
4513; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[6:7], v2, v10, s[6:7]
4514; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
4515; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
4516; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
4517; GFX9-NEXT:    s_setpc_b64 s[30:31]
4518;
4519; GFX10-LABEL: v_ssubsat_v2i64:
4520; GFX10:       ; %bb.0:
4521; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4522; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4523; GFX10-NEXT:    v_sub_co_u32 v8, vcc_lo, v0, v4
4524; GFX10-NEXT:    v_sub_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo
4525; GFX10-NEXT:    v_sub_co_u32 v10, vcc_lo, v2, v6
4526; GFX10-NEXT:    v_sub_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo
4527; GFX10-NEXT:    v_ashrrev_i32_e32 v12, 31, v9
4528; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1]
4529; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, 0, v[4:5]
4530; GFX10-NEXT:    v_ashrrev_i32_e32 v0, 31, v11
4531; GFX10-NEXT:    v_cmp_lt_i64_e64 s6, 0, v[6:7]
4532; GFX10-NEXT:    v_add_co_u32 v1, s5, v12, 0
4533; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s5, 0x80000000, v12, s5
4534; GFX10-NEXT:    v_cmp_lt_i64_e64 s5, v[10:11], v[2:3]
4535; GFX10-NEXT:    v_add_co_u32 v2, s7, v0, 0
4536; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s7, 0x80000000, v0, s7
4537; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
4538; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v1, vcc_lo
4539; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc_lo
4540; GFX10-NEXT:    s_xor_b32 vcc_lo, s6, s5
4541; GFX10-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc_lo
4542; GFX10-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc_lo
4543; GFX10-NEXT:    s_setpc_b64 s[30:31]
4544  %result = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
4545  ret <2 x i64> %result
4546}
4547
; s_ssubsat_v2i64: llvm.ssub.sat.v2i64 in an amdgpu_ps function with both
; vector operands marked inreg (s[0:3] and s[4:7] in the checks below).
; In the checked sequences the subtraction and the saturated-value carry chain
; use scalar s_sub_u32/s_subb_u32 and s_add_u32/s_addc_u32, while the overflow
; compares (v_cmp_*) and the selects (v_cndmask_*) are vector instructions;
; the four result dwords are then copied into s0-s3 with v_readfirstlane_b32.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
4548define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs) {
4549; GFX6-LABEL: s_ssubsat_v2i64:
4550; GFX6:       ; %bb.0:
4551; GFX6-NEXT:    s_sub_u32 s8, s0, s4
4552; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
4553; GFX6-NEXT:    s_and_b32 s9, s9, 1
4554; GFX6-NEXT:    s_cmp_lg_u32 s9, 0
4555; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4556; GFX6-NEXT:    s_subb_u32 s9, s1, s5
4557; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4558; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
4559; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
4560; GFX6-NEXT:    s_ashr_i32 s4, s9, 31
4561; GFX6-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4562; GFX6-NEXT:    s_add_u32 s0, s4, 0
4563; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
4564; GFX6-NEXT:    s_and_b32 s1, s1, 1
4565; GFX6-NEXT:    s_brev_b32 s5, 1
4566; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
4567; GFX6-NEXT:    s_addc_u32 s1, s4, s5
4568; GFX6-NEXT:    v_mov_b32_e32 v1, s0
4569; GFX6-NEXT:    s_sub_u32 s0, s2, s6
4570; GFX6-NEXT:    v_mov_b32_e32 v2, s1
4571; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
4572; GFX6-NEXT:    v_mov_b32_e32 v0, s8
4573; GFX6-NEXT:    s_and_b32 s1, s1, 1
4574; GFX6-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
4575; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
4576; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4577; GFX6-NEXT:    v_mov_b32_e32 v3, s9
4578; GFX6-NEXT:    s_subb_u32 s1, s3, s7
4579; GFX6-NEXT:    v_mov_b32_e32 v1, s3
4580; GFX6-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
4581; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
4582; GFX6-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
4583; GFX6-NEXT:    s_ashr_i32 s4, s1, 31
4584; GFX6-NEXT:    s_xor_b64 vcc, s[2:3], vcc
4585; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4586; GFX6-NEXT:    s_add_u32 s0, s4, 0
4587; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
4588; GFX6-NEXT:    s_and_b32 s2, s2, 1
4589; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
4590; GFX6-NEXT:    s_addc_u32 s3, s4, s5
4591; GFX6-NEXT:    v_mov_b32_e32 v1, s0
4592; GFX6-NEXT:    v_mov_b32_e32 v3, s3
4593; GFX6-NEXT:    v_mov_b32_e32 v5, s1
4594; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
4595; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
4596; GFX6-NEXT:    v_readfirstlane_b32 s0, v4
4597; GFX6-NEXT:    v_readfirstlane_b32 s1, v2
4598; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
4599; GFX6-NEXT:    v_readfirstlane_b32 s3, v1
4600; GFX6-NEXT:    ; return to shader part epilog
4601;
4602; GFX8-LABEL: s_ssubsat_v2i64:
4603; GFX8:       ; %bb.0:
4604; GFX8-NEXT:    s_sub_u32 s8, s0, s4
4605; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
4606; GFX8-NEXT:    s_and_b32 s9, s9, 1
4607; GFX8-NEXT:    s_cmp_lg_u32 s9, 0
4608; GFX8-NEXT:    v_mov_b32_e32 v0, s0
4609; GFX8-NEXT:    s_subb_u32 s9, s1, s5
4610; GFX8-NEXT:    v_mov_b32_e32 v1, s1
4611; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
4612; GFX8-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
4613; GFX8-NEXT:    s_ashr_i32 s4, s9, 31
4614; GFX8-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4615; GFX8-NEXT:    s_add_u32 s0, s4, 0
4616; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
4617; GFX8-NEXT:    s_and_b32 s1, s1, 1
4618; GFX8-NEXT:    s_brev_b32 s5, 1
4619; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
4620; GFX8-NEXT:    s_addc_u32 s1, s4, s5
4621; GFX8-NEXT:    v_mov_b32_e32 v1, s0
4622; GFX8-NEXT:    s_sub_u32 s0, s2, s6
4623; GFX8-NEXT:    v_mov_b32_e32 v2, s1
4624; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
4625; GFX8-NEXT:    v_mov_b32_e32 v0, s8
4626; GFX8-NEXT:    s_and_b32 s1, s1, 1
4627; GFX8-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
4628; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
4629; GFX8-NEXT:    v_mov_b32_e32 v0, s2
4630; GFX8-NEXT:    v_mov_b32_e32 v3, s9
4631; GFX8-NEXT:    s_subb_u32 s1, s3, s7
4632; GFX8-NEXT:    v_mov_b32_e32 v1, s3
4633; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
4634; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
4635; GFX8-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
4636; GFX8-NEXT:    s_ashr_i32 s4, s1, 31
4637; GFX8-NEXT:    s_xor_b64 vcc, s[2:3], vcc
4638; GFX8-NEXT:    v_mov_b32_e32 v0, s0
4639; GFX8-NEXT:    s_add_u32 s0, s4, 0
4640; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
4641; GFX8-NEXT:    s_and_b32 s2, s2, 1
4642; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
4643; GFX8-NEXT:    s_addc_u32 s3, s4, s5
4644; GFX8-NEXT:    v_mov_b32_e32 v1, s0
4645; GFX8-NEXT:    v_mov_b32_e32 v3, s3
4646; GFX8-NEXT:    v_mov_b32_e32 v5, s1
4647; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
4648; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
4649; GFX8-NEXT:    v_readfirstlane_b32 s0, v4
4650; GFX8-NEXT:    v_readfirstlane_b32 s1, v2
4651; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4652; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
4653; GFX8-NEXT:    ; return to shader part epilog
4654;
4655; GFX9-LABEL: s_ssubsat_v2i64:
4656; GFX9:       ; %bb.0:
4657; GFX9-NEXT:    s_sub_u32 s8, s0, s4
4658; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
4659; GFX9-NEXT:    s_and_b32 s9, s9, 1
4660; GFX9-NEXT:    s_cmp_lg_u32 s9, 0
4661; GFX9-NEXT:    v_mov_b32_e32 v0, s0
4662; GFX9-NEXT:    s_subb_u32 s9, s1, s5
4663; GFX9-NEXT:    v_mov_b32_e32 v1, s1
4664; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
4665; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
4666; GFX9-NEXT:    s_ashr_i32 s4, s9, 31
4667; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4668; GFX9-NEXT:    s_add_u32 s0, s4, 0
4669; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
4670; GFX9-NEXT:    s_and_b32 s1, s1, 1
4671; GFX9-NEXT:    s_brev_b32 s5, 1
4672; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
4673; GFX9-NEXT:    s_addc_u32 s1, s4, s5
4674; GFX9-NEXT:    v_mov_b32_e32 v1, s0
4675; GFX9-NEXT:    s_sub_u32 s0, s2, s6
4676; GFX9-NEXT:    v_mov_b32_e32 v2, s1
4677; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
4678; GFX9-NEXT:    v_mov_b32_e32 v0, s8
4679; GFX9-NEXT:    s_and_b32 s1, s1, 1
4680; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
4681; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
4682; GFX9-NEXT:    v_mov_b32_e32 v0, s2
4683; GFX9-NEXT:    v_mov_b32_e32 v3, s9
4684; GFX9-NEXT:    s_subb_u32 s1, s3, s7
4685; GFX9-NEXT:    v_mov_b32_e32 v1, s3
4686; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
4687; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
4688; GFX9-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
4689; GFX9-NEXT:    s_ashr_i32 s4, s1, 31
4690; GFX9-NEXT:    s_xor_b64 vcc, s[2:3], vcc
4691; GFX9-NEXT:    v_mov_b32_e32 v0, s0
4692; GFX9-NEXT:    s_add_u32 s0, s4, 0
4693; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
4694; GFX9-NEXT:    s_and_b32 s2, s2, 1
4695; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
4696; GFX9-NEXT:    s_addc_u32 s3, s4, s5
4697; GFX9-NEXT:    v_mov_b32_e32 v1, s0
4698; GFX9-NEXT:    v_mov_b32_e32 v3, s3
4699; GFX9-NEXT:    v_mov_b32_e32 v5, s1
4700; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
4701; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
4702; GFX9-NEXT:    v_readfirstlane_b32 s0, v4
4703; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
4704; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4705; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
4706; GFX9-NEXT:    ; return to shader part epilog
4707;
4708; GFX10-LABEL: s_ssubsat_v2i64:
4709; GFX10:       ; %bb.0:
4710; GFX10-NEXT:    s_sub_u32 s8, s0, s4
4711; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
4712; GFX10-NEXT:    v_cmp_gt_i64_e64 s4, s[4:5], 0
4713; GFX10-NEXT:    s_and_b32 s9, s9, 1
4714; GFX10-NEXT:    v_mov_b32_e32 v0, s8
4715; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
4716; GFX10-NEXT:    s_brev_b32 s10, 1
4717; GFX10-NEXT:    s_subb_u32 s9, s1, s5
4718; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[8:9], s[0:1]
4719; GFX10-NEXT:    s_ashr_i32 s1, s9, 31
4720; GFX10-NEXT:    v_mov_b32_e32 v1, s9
4721; GFX10-NEXT:    s_xor_b32 s8, s4, s0
4722; GFX10-NEXT:    s_add_u32 s0, s1, 0
4723; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
4724; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s0, s8
4725; GFX10-NEXT:    s_and_b32 s4, s4, 1
4726; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
4727; GFX10-NEXT:    s_addc_u32 s1, s1, s10
4728; GFX10-NEXT:    s_sub_u32 s4, s2, s6
4729; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
4730; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s1, s8
4731; GFX10-NEXT:    s_and_b32 s5, s5, 1
4732; GFX10-NEXT:    v_mov_b32_e32 v2, s4
4733; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
4734; GFX10-NEXT:    s_subb_u32 s5, s3, s7
4735; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[4:5], s[2:3]
4736; GFX10-NEXT:    v_cmp_gt_i64_e64 s3, s[6:7], 0
4737; GFX10-NEXT:    s_ashr_i32 s1, s5, 31
4738; GFX10-NEXT:    v_mov_b32_e32 v3, s5
4739; GFX10-NEXT:    s_xor_b32 s2, s3, s2
4740; GFX10-NEXT:    s_add_u32 s0, s1, 0
4741; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
4742; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s0, s2
4743; GFX10-NEXT:    s_and_b32 s3, s3, 1
4744; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
4745; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
4746; GFX10-NEXT:    s_addc_u32 s1, s1, s10
4747; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s1, s2
4748; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
4749; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
4750; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
4751; GFX10-NEXT:    ; return to shader part epilog
4752  %result = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
4753  ret <2 x i64> %result
4754}
4755
; s_ssubsat_i128: llvm.ssub.sat.i128 in an amdgpu_ps function with both i128
; operands marked inreg (s[0:3] and s[4:7] in the checks below).
; In the checked sequences:
;  - the 128-bit difference is built in s[8:11] by s_sub_u32 followed by three
;    s_subb_u32, with the carry re-materialized between steps via
;    s_cselect_b32 / s_and_b32 / s_cmp_lg_u32;
;  - each 128-bit comparison (diff < lhs, rhs > 0) is assembled from an
;    unsigned compare of the low 64 bits and a signed compare of the high
;    64 bits, taking the low-half result when the high halves compare equal;
;  - the saturated value is the sign word of the difference (s_ashr_i32 by 31)
;    added through a four-dword carry chain to 0, 0, 0, 0x80000000;
;  - the selected dwords are copied into s0-s3 with v_readfirstlane_b32.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
4756define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
4757; GFX6-LABEL: s_ssubsat_i128:
4758; GFX6:       ; %bb.0:
4759; GFX6-NEXT:    s_sub_u32 s8, s0, s4
4760; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
4761; GFX6-NEXT:    s_and_b32 s9, s9, 1
4762; GFX6-NEXT:    s_cmp_lg_u32 s9, 0
4763; GFX6-NEXT:    s_subb_u32 s9, s1, s5
4764; GFX6-NEXT:    s_cselect_b32 s10, 1, 0
4765; GFX6-NEXT:    s_and_b32 s10, s10, 1
4766; GFX6-NEXT:    s_cmp_lg_u32 s10, 0
4767; GFX6-NEXT:    s_subb_u32 s10, s2, s6
4768; GFX6-NEXT:    s_cselect_b32 s11, 1, 0
4769; GFX6-NEXT:    v_mov_b32_e32 v3, s1
4770; GFX6-NEXT:    s_and_b32 s11, s11, 1
4771; GFX6-NEXT:    v_mov_b32_e32 v2, s0
4772; GFX6-NEXT:    s_cmp_lg_u32 s11, 0
4773; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4774; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3]
4775; GFX6-NEXT:    s_subb_u32 s11, s3, s7
4776; GFX6-NEXT:    v_mov_b32_e32 v1, s3
4777; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
4778; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1]
4779; GFX6-NEXT:    v_cmp_gt_u64_e64 s[0:1], s[4:5], 0
4780; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
4781; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[10:11], v[0:1]
4782; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
4783; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[6:7], 0
4784; GFX6-NEXT:    s_ashr_i32 s3, s11, 31
4785; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
4786; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
4787; GFX6-NEXT:    s_add_u32 s0, s3, 0
4788; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
4789; GFX6-NEXT:    s_and_b32 s1, s1, 1
4790; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
4791; GFX6-NEXT:    s_addc_u32 s1, s3, 0
4792; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
4793; GFX6-NEXT:    s_and_b32 s2, s2, 1
4794; GFX6-NEXT:    v_cmp_eq_u64_e64 vcc, s[6:7], 0
4795; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
4796; GFX6-NEXT:    s_addc_u32 s2, s3, 0
4797; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
4798; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
4799; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
4800; GFX6-NEXT:    s_and_b32 s4, s4, 1
4801; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
4802; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
4803; GFX6-NEXT:    s_addc_u32 s3, s3, 0x80000000
4804; GFX6-NEXT:    v_mov_b32_e32 v1, s0
4805; GFX6-NEXT:    v_mov_b32_e32 v2, s1
4806; GFX6-NEXT:    v_mov_b32_e32 v3, s8
4807; GFX6-NEXT:    v_mov_b32_e32 v4, s9
4808; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
4809; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
4810; GFX6-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
4811; GFX6-NEXT:    v_mov_b32_e32 v2, s2
4812; GFX6-NEXT:    v_mov_b32_e32 v3, s3
4813; GFX6-NEXT:    v_mov_b32_e32 v4, s10
4814; GFX6-NEXT:    v_mov_b32_e32 v5, s11
4815; GFX6-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
4816; GFX6-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
4817; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
4818; GFX6-NEXT:    v_readfirstlane_b32 s1, v1
4819; GFX6-NEXT:    v_readfirstlane_b32 s2, v2
4820; GFX6-NEXT:    v_readfirstlane_b32 s3, v3
4821; GFX6-NEXT:    ; return to shader part epilog
4822;
4823; GFX8-LABEL: s_ssubsat_i128:
4824; GFX8:       ; %bb.0:
4825; GFX8-NEXT:    s_sub_u32 s8, s0, s4
4826; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
4827; GFX8-NEXT:    s_and_b32 s9, s9, 1
4828; GFX8-NEXT:    s_cmp_lg_u32 s9, 0
4829; GFX8-NEXT:    s_subb_u32 s9, s1, s5
4830; GFX8-NEXT:    s_cselect_b32 s10, 1, 0
4831; GFX8-NEXT:    s_and_b32 s10, s10, 1
4832; GFX8-NEXT:    s_cmp_lg_u32 s10, 0
4833; GFX8-NEXT:    s_subb_u32 s10, s2, s6
4834; GFX8-NEXT:    s_cselect_b32 s11, 1, 0
4835; GFX8-NEXT:    s_and_b32 s11, s11, 1
4836; GFX8-NEXT:    v_mov_b32_e32 v3, s1
4837; GFX8-NEXT:    s_cmp_lg_u32 s11, 0
4838; GFX8-NEXT:    v_mov_b32_e32 v2, s0
4839; GFX8-NEXT:    s_subb_u32 s11, s3, s7
4840; GFX8-NEXT:    v_mov_b32_e32 v0, s2
4841; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3]
4842; GFX8-NEXT:    v_mov_b32_e32 v1, s3
4843; GFX8-NEXT:    s_cmp_eq_u64 s[10:11], s[2:3]
4844; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
4845; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
4846; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1]
4847; GFX8-NEXT:    s_and_b32 s0, 1, s2
4848; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
4849; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
4850; GFX8-NEXT:    v_cmp_gt_u64_e64 s[0:1], s[4:5], 0
4851; GFX8-NEXT:    s_cmp_eq_u64 s[6:7], 0
4852; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
4853; GFX8-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[6:7], 0
4854; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
4855; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4856; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
4857; GFX8-NEXT:    s_and_b32 s0, 1, s2
4858; GFX8-NEXT:    s_ashr_i32 s3, s11, 31
4859; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
4860; GFX8-NEXT:    s_add_u32 s0, s3, 0
4861; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
4862; GFX8-NEXT:    s_and_b32 s1, s1, 1
4863; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
4864; GFX8-NEXT:    s_addc_u32 s1, s3, 0
4865; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
4866; GFX8-NEXT:    s_and_b32 s2, s2, 1
4867; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
4868; GFX8-NEXT:    s_addc_u32 s2, s3, 0
4869; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
4870; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
4871; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
4872; GFX8-NEXT:    s_and_b32 s4, s4, 1
4873; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
4874; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
4875; GFX8-NEXT:    s_addc_u32 s3, s3, 0x80000000
4876; GFX8-NEXT:    v_mov_b32_e32 v1, s0
4877; GFX8-NEXT:    v_mov_b32_e32 v2, s1
4878; GFX8-NEXT:    v_mov_b32_e32 v3, s8
4879; GFX8-NEXT:    v_mov_b32_e32 v4, s9
4880; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
4881; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
4882; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
4883; GFX8-NEXT:    v_mov_b32_e32 v2, s2
4884; GFX8-NEXT:    v_mov_b32_e32 v3, s3
4885; GFX8-NEXT:    v_mov_b32_e32 v4, s10
4886; GFX8-NEXT:    v_mov_b32_e32 v5, s11
4887; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
4888; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
4889; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
4890; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
4891; GFX8-NEXT:    v_readfirstlane_b32 s2, v2
4892; GFX8-NEXT:    v_readfirstlane_b32 s3, v3
4893; GFX8-NEXT:    ; return to shader part epilog
4894;
4895; GFX9-LABEL: s_ssubsat_i128:
4896; GFX9:       ; %bb.0:
4897; GFX9-NEXT:    s_sub_u32 s8, s0, s4
4898; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
4899; GFX9-NEXT:    s_and_b32 s9, s9, 1
4900; GFX9-NEXT:    s_cmp_lg_u32 s9, 0
4901; GFX9-NEXT:    s_subb_u32 s9, s1, s5
4902; GFX9-NEXT:    s_cselect_b32 s10, 1, 0
4903; GFX9-NEXT:    s_and_b32 s10, s10, 1
4904; GFX9-NEXT:    s_cmp_lg_u32 s10, 0
4905; GFX9-NEXT:    s_subb_u32 s10, s2, s6
4906; GFX9-NEXT:    s_cselect_b32 s11, 1, 0
4907; GFX9-NEXT:    s_and_b32 s11, s11, 1
4908; GFX9-NEXT:    v_mov_b32_e32 v3, s1
4909; GFX9-NEXT:    s_cmp_lg_u32 s11, 0
4910; GFX9-NEXT:    v_mov_b32_e32 v2, s0
4911; GFX9-NEXT:    s_subb_u32 s11, s3, s7
4912; GFX9-NEXT:    v_mov_b32_e32 v0, s2
4913; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3]
4914; GFX9-NEXT:    v_mov_b32_e32 v1, s3
4915; GFX9-NEXT:    s_cmp_eq_u64 s[10:11], s[2:3]
4916; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
4917; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
4918; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1]
4919; GFX9-NEXT:    s_and_b32 s0, 1, s2
4920; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
4921; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
4922; GFX9-NEXT:    v_cmp_gt_u64_e64 s[0:1], s[4:5], 0
4923; GFX9-NEXT:    s_cmp_eq_u64 s[6:7], 0
4924; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
4925; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[6:7], 0
4926; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
4927; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4928; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
4929; GFX9-NEXT:    s_and_b32 s0, 1, s2
4930; GFX9-NEXT:    s_ashr_i32 s3, s11, 31
4931; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
4932; GFX9-NEXT:    s_add_u32 s0, s3, 0
4933; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
4934; GFX9-NEXT:    s_and_b32 s1, s1, 1
4935; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
4936; GFX9-NEXT:    s_addc_u32 s1, s3, 0
4937; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
4938; GFX9-NEXT:    s_and_b32 s2, s2, 1
4939; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
4940; GFX9-NEXT:    s_addc_u32 s2, s3, 0
4941; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
4942; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
4943; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
4944; GFX9-NEXT:    s_and_b32 s4, s4, 1
4945; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
4946; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
4947; GFX9-NEXT:    s_addc_u32 s3, s3, 0x80000000
4948; GFX9-NEXT:    v_mov_b32_e32 v1, s0
4949; GFX9-NEXT:    v_mov_b32_e32 v2, s1
4950; GFX9-NEXT:    v_mov_b32_e32 v3, s8
4951; GFX9-NEXT:    v_mov_b32_e32 v4, s9
4952; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
4953; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
4954; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
4955; GFX9-NEXT:    v_mov_b32_e32 v2, s2
4956; GFX9-NEXT:    v_mov_b32_e32 v3, s3
4957; GFX9-NEXT:    v_mov_b32_e32 v4, s10
4958; GFX9-NEXT:    v_mov_b32_e32 v5, s11
4959; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
4960; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
4961; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
4962; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
4963; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
4964; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
4965; GFX9-NEXT:    ; return to shader part epilog
4966;
4967; GFX10-LABEL: s_ssubsat_i128:
4968; GFX10:       ; %bb.0:
4969; GFX10-NEXT:    s_sub_u32 s8, s0, s4
4970; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
4971; GFX10-NEXT:    s_and_b32 s9, s9, 1
4972; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
4973; GFX10-NEXT:    s_subb_u32 s9, s1, s5
4974; GFX10-NEXT:    s_cselect_b32 s10, 1, 0
4975; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[8:9], s[0:1]
4976; GFX10-NEXT:    s_and_b32 s10, s10, 1
4977; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
4978; GFX10-NEXT:    s_subb_u32 s10, s2, s6
4979; GFX10-NEXT:    s_cselect_b32 s11, 1, 0
4980; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
4981; GFX10-NEXT:    s_and_b32 s11, s11, 1
4982; GFX10-NEXT:    v_mov_b32_e32 v3, s10
4983; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
4984; GFX10-NEXT:    s_subb_u32 s11, s3, s7
4985; GFX10-NEXT:    s_cmp_eq_u64 s[10:11], s[2:3]
4986; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[10:11], s[2:3]
4987; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
4988; GFX10-NEXT:    v_mov_b32_e32 v4, s11
4989; GFX10-NEXT:    s_and_b32 s0, 1, s0
4990; GFX10-NEXT:    s_cmp_eq_u64 s[6:7], 0
4991; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
4992; GFX10-NEXT:    v_cmp_gt_u64_e64 s0, s[4:5], 0
4993; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
4994; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
4995; GFX10-NEXT:    s_ashr_i32 s3, s11, 31
4996; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
4997; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
4998; GFX10-NEXT:    v_cmp_gt_i64_e64 s0, s[6:7], 0
4999; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
5000; GFX10-NEXT:    s_and_b32 s0, 1, s1
5001; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
5002; GFX10-NEXT:    s_add_u32 s0, s3, 0
5003; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
5004; GFX10-NEXT:    s_and_b32 s1, s1, 1
5005; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
5006; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
5007; GFX10-NEXT:    v_mov_b32_e32 v2, s9
5008; GFX10-NEXT:    s_addc_u32 s1, s3, 0
5009; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
5010; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
5011; GFX10-NEXT:    s_and_b32 s2, s2, 1
5012; GFX10-NEXT:    v_mov_b32_e32 v1, s8
5013; GFX10-NEXT:    s_cmp_lg_u32 s2, 0
5014; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
5015; GFX10-NEXT:    s_addc_u32 s2, s3, 0
5016; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
5017; GFX10-NEXT:    s_and_b32 s4, s4, 1
5018; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
5019; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
5020; GFX10-NEXT:    s_addc_u32 s3, s3, 0x80000000
5021; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, s0, vcc_lo
5022; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, s1, vcc_lo
5023; GFX10-NEXT:    v_cndmask_b32_e64 v2, v3, s2, vcc_lo
5024; GFX10-NEXT:    v_cndmask_b32_e64 v3, v4, s3, vcc_lo
5025; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
5026; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
5027; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
5028; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
5029; GFX10-NEXT:    ; return to shader part epilog
5030  %result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs)
5031  ret i128 %result
5032}
5033
5034define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
5035; GFX6-LABEL: ssubsat_i128_sv:
5036; GFX6:       ; %bb.0:
5037; GFX6-NEXT:    v_mov_b32_e32 v5, s1
5038; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s0, v0
5039; GFX6-NEXT:    v_subb_u32_e32 v5, vcc, v5, v1, vcc
5040; GFX6-NEXT:    v_mov_b32_e32 v6, s2
5041; GFX6-NEXT:    v_mov_b32_e32 v7, s3
5042; GFX6-NEXT:    v_subb_u32_e32 v6, vcc, v6, v2, vcc
5043; GFX6-NEXT:    v_subb_u32_e32 v7, vcc, v7, v3, vcc
5044; GFX6-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
5045; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
5046; GFX6-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[6:7]
5047; GFX6-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
5048; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[6:7]
5049; GFX6-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
5050; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[0:1]
5051; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5052; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, 0, v[2:3]
5053; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5054; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
5055; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5056; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v7
5057; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 0, v1
5058; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
5059; GFX6-NEXT:    v_xor_b32_e32 v0, v0, v8
5060; GFX6-NEXT:    v_bfrev_b32_e32 v8, 1
5061; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
5062; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v1, v8, vcc
5063; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
5064; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5065; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
5066; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
5067; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v9, vcc
5068; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc
5069; GFX6-NEXT:    ; return to shader part epilog
5070;
5071; GFX8-LABEL: ssubsat_i128_sv:
5072; GFX8:       ; %bb.0:
5073; GFX8-NEXT:    v_mov_b32_e32 v5, s1
5074; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, s0, v0
5075; GFX8-NEXT:    v_subb_u32_e32 v5, vcc, v5, v1, vcc
5076; GFX8-NEXT:    v_mov_b32_e32 v6, s2
5077; GFX8-NEXT:    v_mov_b32_e32 v7, s3
5078; GFX8-NEXT:    v_subb_u32_e32 v6, vcc, v6, v2, vcc
5079; GFX8-NEXT:    v_subb_u32_e32 v7, vcc, v7, v3, vcc
5080; GFX8-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
5081; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
5082; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[6:7]
5083; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
5084; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[6:7]
5085; GFX8-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
5086; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[0:1]
5087; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5088; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, 0, v[2:3]
5089; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5090; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
5091; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5092; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v7
5093; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0, v1
5094; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
5095; GFX8-NEXT:    v_xor_b32_e32 v0, v0, v8
5096; GFX8-NEXT:    v_bfrev_b32_e32 v8, 1
5097; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
5098; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, v1, v8, vcc
5099; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
5100; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5101; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
5102; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
5103; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v9, vcc
5104; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc
5105; GFX8-NEXT:    ; return to shader part epilog
5106;
5107; GFX9-LABEL: ssubsat_i128_sv:
5108; GFX9:       ; %bb.0:
5109; GFX9-NEXT:    v_mov_b32_e32 v5, s1
5110; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, s0, v0
5111; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v5, v1, vcc
5112; GFX9-NEXT:    v_mov_b32_e32 v6, s2
5113; GFX9-NEXT:    v_mov_b32_e32 v7, s3
5114; GFX9-NEXT:    v_subb_co_u32_e32 v6, vcc, v6, v2, vcc
5115; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v3, vcc
5116; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
5117; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
5118; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[6:7]
5119; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
5120; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[6:7]
5121; GFX9-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
5122; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[0:1]
5123; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5124; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, 0, v[2:3]
5125; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5126; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
5127; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5128; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v7
5129; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 0, v1
5130; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
5131; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v8
5132; GFX9-NEXT:    v_bfrev_b32_e32 v8, 1
5133; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v1, vcc
5134; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v1, v8, vcc
5135; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
5136; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5137; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
5138; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
5139; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v9, vcc
5140; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc
5141; GFX9-NEXT:    ; return to shader part epilog
5142;
5143; GFX10-LABEL: ssubsat_i128_sv:
5144; GFX10:       ; %bb.0:
5145; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, s0, v0
5146; GFX10-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
5147; GFX10-NEXT:    v_sub_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
5148; GFX10-NEXT:    v_sub_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
5149; GFX10-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[4:5]
5150; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
5151; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[6:7]
5152; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
5153; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, 0, v[0:1]
5154; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
5155; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
5156; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5157; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[6:7]
5158; GFX10-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc_lo
5159; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3]
5160; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5161; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v7
5162; GFX10-NEXT:    v_xor_b32_e32 v0, v0, v8
5163; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v1, 0
5164; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
5165; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
5166; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0, v1, vcc_lo
5167; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0x80000000, v1, vcc_lo
5168; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v0
5169; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, v2, s0
5170; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, v3, s0
5171; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, v8, s0
5172; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
5173; GFX10-NEXT:    ; return to shader part epilog
5174  %result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs)
5175  %cast = bitcast i128 %result to <4 x float>
5176  ret <4 x float> %cast
5177}
5178
; Test: signed saturating subtraction of two i128 values in an amdgpu_ps
; function where %lhs is an ordinary argument and %rhs is marked inreg.
; In the checked output below the lhs is read from v[0:3] and the rhs from
; s[0:3]. The i128 result is bitcast to <4 x float> for the return value.
; The check lines cover the GFX6, GFX8, GFX9 and GFX10 prefixes from the RUN
; lines; they are autogenerated (see the NOTE at the top of the file), so
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
5179define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
5180; GFX6-LABEL: ssubsat_i128_vs:
5181; GFX6:       ; %bb.0:
5182; GFX6-NEXT:    v_mov_b32_e32 v5, s1
5183; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s0, v0
5184; GFX6-NEXT:    v_subb_u32_e32 v5, vcc, v1, v5, vcc
5185; GFX6-NEXT:    v_mov_b32_e32 v6, s2
5186; GFX6-NEXT:    v_mov_b32_e32 v7, s3
5187; GFX6-NEXT:    v_subb_u32_e32 v6, vcc, v2, v6, vcc
5188; GFX6-NEXT:    v_subb_u32_e32 v7, vcc, v3, v7, vcc
5189; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
5190; GFX6-NEXT:    v_cmp_gt_u64_e64 s[0:1], s[0:1], 0
5191; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5192; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
5193; GFX6-NEXT:    v_bfrev_b32_e32 v8, 1
5194; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5195; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
5196; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5197; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5198; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
5199; GFX6-NEXT:    v_cmp_eq_u64_e64 vcc, s[2:3], 0
5200; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
5201; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
5202; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
5203; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v7
5204; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 0, v1
5205; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
5206; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
5207; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v1, v8, vcc
5208; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
5209; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5210; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
5211; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
5212; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v9, vcc
5213; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc
5214; GFX6-NEXT:    ; return to shader part epilog
5215;
5216; GFX8-LABEL: ssubsat_i128_vs:
5217; GFX8:       ; %bb.0:
5218; GFX8-NEXT:    v_mov_b32_e32 v5, s1
5219; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s0, v0
5220; GFX8-NEXT:    v_subb_u32_e32 v5, vcc, v1, v5, vcc
5221; GFX8-NEXT:    v_mov_b32_e32 v6, s2
5222; GFX8-NEXT:    v_mov_b32_e32 v7, s3
5223; GFX8-NEXT:    v_subb_u32_e32 v6, vcc, v2, v6, vcc
5224; GFX8-NEXT:    v_subb_u32_e32 v7, vcc, v3, v7, vcc
5225; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
5226; GFX8-NEXT:    v_cmp_gt_u64_e64 s[0:1], s[0:1], 0
5227; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5228; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
5229; GFX8-NEXT:    s_cmp_eq_u64 s[2:3], 0
5230; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5231; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
5232; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
5233; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5234; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5235; GFX8-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
5236; GFX8-NEXT:    v_bfrev_b32_e32 v8, 1
5237; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
5238; GFX8-NEXT:    s_and_b32 s0, 1, s4
5239; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
5240; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
5241; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
5242; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v7
5243; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0, v1
5244; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
5245; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
5246; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, v1, v8, vcc
5247; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
5248; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5249; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
5250; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
5251; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v9, vcc
5252; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc
5253; GFX8-NEXT:    ; return to shader part epilog
5254;
5255; GFX9-LABEL: ssubsat_i128_vs:
5256; GFX9:       ; %bb.0:
5257; GFX9-NEXT:    v_mov_b32_e32 v5, s1
5258; GFX9-NEXT:    v_subrev_co_u32_e32 v4, vcc, s0, v0
5259; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v1, v5, vcc
5260; GFX9-NEXT:    v_mov_b32_e32 v6, s2
5261; GFX9-NEXT:    v_mov_b32_e32 v7, s3
5262; GFX9-NEXT:    v_subb_co_u32_e32 v6, vcc, v2, v6, vcc
5263; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, v3, v7, vcc
5264; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
5265; GFX9-NEXT:    v_cmp_gt_u64_e64 s[0:1], s[0:1], 0
5266; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5267; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
5268; GFX9-NEXT:    s_cmp_eq_u64 s[2:3], 0
5269; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5270; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
5271; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
5272; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5273; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5274; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
5275; GFX9-NEXT:    v_bfrev_b32_e32 v8, 1
5276; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
5277; GFX9-NEXT:    s_and_b32 s0, 1, s4
5278; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
5279; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
5280; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
5281; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v7
5282; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 0, v1
5283; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
5284; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v1, vcc
5285; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v1, v8, vcc
5286; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
5287; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5288; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
5289; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
5290; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v9, vcc
5291; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc
5292; GFX9-NEXT:    ; return to shader part epilog
5293;
5294; GFX10-LABEL: ssubsat_i128_vs:
5295; GFX10:       ; %bb.0:
5296; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v0, s0
5297; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
5298; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
5299; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
5300; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1]
5301; GFX10-NEXT:    v_cmp_gt_u64_e64 s0, s[0:1], 0
5302; GFX10-NEXT:    s_cmp_eq_u64 s[2:3], 0
5303; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
5304; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
5305; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3]
5306; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s0
5307; GFX10-NEXT:    v_cmp_gt_i64_e64 s0, s[2:3], 0
5308; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5309; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
5310; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s0
5311; GFX10-NEXT:    s_and_b32 s0, 1, s4
5312; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5313; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
5314; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc_lo
5315; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
5316; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v7
5317; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
5318; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v1, 0
5319; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
5320; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0, v1, vcc_lo
5321; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v0
5322; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0x80000000, v1, vcc_lo
5323; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, v2, s0
5324; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, v3, s0
5325; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, v8, s0
5326; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
5327; GFX10-NEXT:    ; return to shader part epilog
; IR under test: one llvm.ssub.sat.i128 call, then a bitcast of the i128
; result to the <4 x float> return type.
5328  %result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs)
5329  %cast = bitcast i128 %result to <4 x float>
5330  ret <4 x float> %cast
5331}
5332
; Test: signed saturating subtraction on <2 x i128> vectors in a function with
; the default calling convention and no inreg arguments. In the checked output
; below each i128 element is processed separately: the first uses v[0:3] minus
; v[8:11], the second v[4:7] minus v[12:15], and each result dword is picked
; with v_cndmask between the raw difference and a value derived from the sign
; of the difference's top dword. The check lines cover the GFX6, GFX8, GFX9 and
; GFX10 prefixes from the RUN lines; they are autogenerated (see the NOTE at
; the top of the file), so regenerate them with utils/update_llc_test_checks.py
; instead of hand-editing.
5333define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
5334; GFX6-LABEL: v_ssubsat_v2i128:
5335; GFX6:       ; %bb.0:
5336; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5337; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, v0, v8
5338; GFX6-NEXT:    v_subb_u32_e32 v17, vcc, v1, v9, vcc
5339; GFX6-NEXT:    v_subb_u32_e32 v18, vcc, v2, v10, vcc
5340; GFX6-NEXT:    v_subb_u32_e32 v19, vcc, v3, v11, vcc
5341; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, v[16:17], v[0:1]
5342; GFX6-NEXT:    v_bfrev_b32_e32 v20, 1
5343; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5344; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[18:19], v[2:3]
5345; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5346; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[18:19], v[2:3]
5347; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5348; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[8:9]
5349; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5350; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, 0, v[10:11]
5351; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
5352; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
5353; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
5354; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
5355; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v19
5356; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 0, v1
5357; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
5358; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
5359; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v1, v20, vcc
5360; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
5361; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5362; GFX6-NEXT:    v_cndmask_b32_e32 v0, v16, v2, vcc
5363; GFX6-NEXT:    v_cndmask_b32_e32 v1, v17, v3, vcc
5364; GFX6-NEXT:    v_cndmask_b32_e32 v2, v18, v8, vcc
5365; GFX6-NEXT:    v_cndmask_b32_e32 v3, v19, v9, vcc
5366; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v4, v12
5367; GFX6-NEXT:    v_subb_u32_e32 v9, vcc, v5, v13, vcc
5368; GFX6-NEXT:    v_subb_u32_e32 v10, vcc, v6, v14, vcc
5369; GFX6-NEXT:    v_subb_u32_e32 v11, vcc, v7, v15, vcc
5370; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
5371; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
5372; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
5373; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
5374; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
5375; GFX6-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
5376; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[12:13]
5377; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
5378; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, 0, v[14:15]
5379; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
5380; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
5381; GFX6-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
5382; GFX6-NEXT:    v_xor_b32_e32 v4, v5, v4
5383; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 31, v11
5384; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 0, v5
5385; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v5, vcc
5386; GFX6-NEXT:    v_addc_u32_e32 v12, vcc, 0, v5, vcc
5387; GFX6-NEXT:    v_addc_u32_e32 v13, vcc, v5, v20, vcc
5388; GFX6-NEXT:    v_and_b32_e32 v4, 1, v4
5389; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
5390; GFX6-NEXT:    v_cndmask_b32_e32 v4, v8, v6, vcc
5391; GFX6-NEXT:    v_cndmask_b32_e32 v5, v9, v7, vcc
5392; GFX6-NEXT:    v_cndmask_b32_e32 v6, v10, v12, vcc
5393; GFX6-NEXT:    v_cndmask_b32_e32 v7, v11, v13, vcc
5394; GFX6-NEXT:    s_setpc_b64 s[30:31]
5395;
5396; GFX8-LABEL: v_ssubsat_v2i128:
5397; GFX8:       ; %bb.0:
5398; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5399; GFX8-NEXT:    v_sub_u32_e32 v16, vcc, v0, v8
5400; GFX8-NEXT:    v_subb_u32_e32 v17, vcc, v1, v9, vcc
5401; GFX8-NEXT:    v_subb_u32_e32 v18, vcc, v2, v10, vcc
5402; GFX8-NEXT:    v_subb_u32_e32 v19, vcc, v3, v11, vcc
5403; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, v[16:17], v[0:1]
5404; GFX8-NEXT:    v_bfrev_b32_e32 v20, 1
5405; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5406; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[18:19], v[2:3]
5407; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5408; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[18:19], v[2:3]
5409; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5410; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[8:9]
5411; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5412; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, 0, v[10:11]
5413; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
5414; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
5415; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
5416; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
5417; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v19
5418; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0, v1
5419; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
5420; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
5421; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, v1, v20, vcc
5422; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
5423; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5424; GFX8-NEXT:    v_cndmask_b32_e32 v0, v16, v2, vcc
5425; GFX8-NEXT:    v_cndmask_b32_e32 v1, v17, v3, vcc
5426; GFX8-NEXT:    v_cndmask_b32_e32 v2, v18, v8, vcc
5427; GFX8-NEXT:    v_cndmask_b32_e32 v3, v19, v9, vcc
5428; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, v4, v12
5429; GFX8-NEXT:    v_subb_u32_e32 v9, vcc, v5, v13, vcc
5430; GFX8-NEXT:    v_subb_u32_e32 v10, vcc, v6, v14, vcc
5431; GFX8-NEXT:    v_subb_u32_e32 v11, vcc, v7, v15, vcc
5432; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
5433; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
5434; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
5435; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
5436; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
5437; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
5438; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[12:13]
5439; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
5440; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, 0, v[14:15]
5441; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
5442; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
5443; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
5444; GFX8-NEXT:    v_xor_b32_e32 v4, v5, v4
5445; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v11
5446; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0, v5
5447; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, 0, v5, vcc
5448; GFX8-NEXT:    v_addc_u32_e32 v12, vcc, 0, v5, vcc
5449; GFX8-NEXT:    v_addc_u32_e32 v13, vcc, v5, v20, vcc
5450; GFX8-NEXT:    v_and_b32_e32 v4, 1, v4
5451; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
5452; GFX8-NEXT:    v_cndmask_b32_e32 v4, v8, v6, vcc
5453; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v7, vcc
5454; GFX8-NEXT:    v_cndmask_b32_e32 v6, v10, v12, vcc
5455; GFX8-NEXT:    v_cndmask_b32_e32 v7, v11, v13, vcc
5456; GFX8-NEXT:    s_setpc_b64 s[30:31]
5457;
5458; GFX9-LABEL: v_ssubsat_v2i128:
5459; GFX9:       ; %bb.0:
5460; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5461; GFX9-NEXT:    v_sub_co_u32_e32 v16, vcc, v0, v8
5462; GFX9-NEXT:    v_subb_co_u32_e32 v17, vcc, v1, v9, vcc
5463; GFX9-NEXT:    v_subb_co_u32_e32 v18, vcc, v2, v10, vcc
5464; GFX9-NEXT:    v_subb_co_u32_e32 v19, vcc, v3, v11, vcc
5465; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[16:17], v[0:1]
5466; GFX9-NEXT:    v_bfrev_b32_e32 v20, 1
5467; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5468; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[18:19], v[2:3]
5469; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5470; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[18:19], v[2:3]
5471; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5472; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[8:9]
5473; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5474; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, 0, v[10:11]
5475; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
5476; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
5477; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
5478; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
5479; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v19
5480; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 0, v1
5481; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
5482; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
5483; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v1, v20, vcc
5484; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
5485; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5486; GFX9-NEXT:    v_cndmask_b32_e32 v0, v16, v2, vcc
5487; GFX9-NEXT:    v_cndmask_b32_e32 v1, v17, v3, vcc
5488; GFX9-NEXT:    v_cndmask_b32_e32 v2, v18, v8, vcc
5489; GFX9-NEXT:    v_cndmask_b32_e32 v3, v19, v9, vcc
5490; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, v4, v12
5491; GFX9-NEXT:    v_subb_co_u32_e32 v9, vcc, v5, v13, vcc
5492; GFX9-NEXT:    v_subb_co_u32_e32 v10, vcc, v6, v14, vcc
5493; GFX9-NEXT:    v_subb_co_u32_e32 v11, vcc, v7, v15, vcc
5494; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
5495; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
5496; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
5497; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
5498; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
5499; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
5500; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[12:13]
5501; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
5502; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, 0, v[14:15]
5503; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
5504; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
5505; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
5506; GFX9-NEXT:    v_xor_b32_e32 v4, v5, v4
5507; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v11
5508; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, 0, v5
5509; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v5, vcc
5510; GFX9-NEXT:    v_addc_co_u32_e32 v12, vcc, 0, v5, vcc
5511; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, v5, v20, vcc
5512; GFX9-NEXT:    v_and_b32_e32 v4, 1, v4
5513; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
5514; GFX9-NEXT:    v_cndmask_b32_e32 v4, v8, v6, vcc
5515; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v7, vcc
5516; GFX9-NEXT:    v_cndmask_b32_e32 v6, v10, v12, vcc
5517; GFX9-NEXT:    v_cndmask_b32_e32 v7, v11, v13, vcc
5518; GFX9-NEXT:    s_setpc_b64 s[30:31]
5519;
5520; GFX10-LABEL: v_ssubsat_v2i128:
5521; GFX10:       ; %bb.0:
5522; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5523; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
5524; GFX10-NEXT:    v_sub_co_u32 v16, vcc_lo, v0, v8
5525; GFX10-NEXT:    v_sub_co_ci_u32_e32 v17, vcc_lo, v1, v9, vcc_lo
5526; GFX10-NEXT:    v_sub_co_ci_u32_e32 v18, vcc_lo, v2, v10, vcc_lo
5527; GFX10-NEXT:    v_sub_co_ci_u32_e32 v19, vcc_lo, v3, v11, vcc_lo
5528; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[16:17], v[0:1]
5529; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
5530; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[2:3]
5531; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5532; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, 0, v[8:9]
5533; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
5534; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, 0, v[10:11]
5535; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
5536; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[2:3]
5537; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5538; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11]
5539; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc_lo
5540; GFX10-NEXT:    v_sub_co_u32 v8, vcc_lo, v4, v12
5541; GFX10-NEXT:    v_sub_co_ci_u32_e32 v9, vcc_lo, v5, v13, vcc_lo
5542; GFX10-NEXT:    v_sub_co_ci_u32_e32 v10, vcc_lo, v6, v14, vcc_lo
5543; GFX10-NEXT:    v_sub_co_ci_u32_e32 v11, vcc_lo, v7, v15, vcc_lo
5544; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, v[8:9], v[4:5]
5545; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
5546; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v19
5547; GFX10-NEXT:    v_cmp_eq_u64_e64 s5, v[10:11], v[6:7]
5548; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s4
5549; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, v[10:11], v[6:7]
5550; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
5551; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v1, 0
5552; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
5553; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s4
5554; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, 0, v[12:13]
5555; GFX10-NEXT:    v_ashrrev_i32_e32 v7, 31, v11
5556; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s4
5557; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, 0, v[14:15]
5558; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s4
5559; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, v0
5560; GFX10-NEXT:    v_cndmask_b32_e64 v0, v5, v4, s5
5561; GFX10-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[14:15]
5562; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
5563; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, 0x80000000, v1, vcc_lo
5564; GFX10-NEXT:    v_cndmask_b32_e64 v1, v17, v3, s4
5565; GFX10-NEXT:    v_cndmask_b32_e64 v4, v13, v12, s5
5566; GFX10-NEXT:    v_xor_b32_e32 v4, v4, v0
5567; GFX10-NEXT:    v_cndmask_b32_e64 v0, v16, v2, s4
5568; GFX10-NEXT:    v_cndmask_b32_e64 v2, v18, v5, s4
5569; GFX10-NEXT:    v_and_b32_e32 v3, 1, v4
5570; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v7, 0
5571; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v7, vcc_lo
5572; GFX10-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, 0, v7, vcc_lo
5573; GFX10-NEXT:    v_cmp_ne_u32_e64 s5, 0, v3
5574; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0x80000000, v7, vcc_lo
5575; GFX10-NEXT:    v_cndmask_b32_e64 v3, v19, v6, s4
5576; GFX10-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s5
5577; GFX10-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s5
5578; GFX10-NEXT:    v_cndmask_b32_e64 v6, v10, v12, s5
5579; GFX10-NEXT:    v_cndmask_b32_e64 v7, v11, v7, s5
5580; GFX10-NEXT:    s_setpc_b64 s[30:31]
; IR under test: one llvm.ssub.sat.v2i128 call whose result is returned
; directly as <2 x i128>.
5581  %result = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
5582  ret <2 x i128> %result
5583}
5584
5585define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs) {
5586; GFX6-LABEL: s_ssubsat_v2i128:
5587; GFX6:       ; %bb.0:
5588; GFX6-NEXT:    s_sub_u32 s16, s0, s8
5589; GFX6-NEXT:    s_cselect_b32 s17, 1, 0
5590; GFX6-NEXT:    s_and_b32 s17, s17, 1
5591; GFX6-NEXT:    s_cmp_lg_u32 s17, 0
5592; GFX6-NEXT:    s_subb_u32 s17, s1, s9
5593; GFX6-NEXT:    s_cselect_b32 s18, 1, 0
5594; GFX6-NEXT:    s_and_b32 s18, s18, 1
5595; GFX6-NEXT:    s_cmp_lg_u32 s18, 0
5596; GFX6-NEXT:    s_subb_u32 s18, s2, s10
5597; GFX6-NEXT:    s_cselect_b32 s19, 1, 0
5598; GFX6-NEXT:    v_mov_b32_e32 v3, s1
5599; GFX6-NEXT:    s_and_b32 s19, s19, 1
5600; GFX6-NEXT:    v_mov_b32_e32 v2, s0
5601; GFX6-NEXT:    s_cmp_lg_u32 s19, 0
5602; GFX6-NEXT:    v_mov_b32_e32 v0, s2
5603; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3]
5604; GFX6-NEXT:    s_subb_u32 s19, s3, s11
5605; GFX6-NEXT:    v_mov_b32_e32 v1, s3
5606; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
5607; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[18:19], v[0:1]
5608; GFX6-NEXT:    v_cmp_gt_u64_e64 s[0:1], s[8:9], 0
5609; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
5610; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[18:19], v[0:1]
5611; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5612; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[10:11], 0
5613; GFX6-NEXT:    s_ashr_i32 s3, s19, 31
5614; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
5615; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
5616; GFX6-NEXT:    s_add_u32 s0, s3, 0
5617; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
5618; GFX6-NEXT:    s_and_b32 s1, s1, 1
5619; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
5620; GFX6-NEXT:    s_addc_u32 s1, s3, 0
5621; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
5622; GFX6-NEXT:    s_and_b32 s2, s2, 1
5623; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
5624; GFX6-NEXT:    s_addc_u32 s2, s3, 0
5625; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
5626; GFX6-NEXT:    v_cmp_eq_u64_e64 vcc, s[10:11], 0
5627; GFX6-NEXT:    s_and_b32 s9, s9, 1
5628; GFX6-NEXT:    s_brev_b32 s8, 1
5629; GFX6-NEXT:    s_cmp_lg_u32 s9, 0
5630; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
5631; GFX6-NEXT:    s_addc_u32 s3, s3, s8
5632; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
5633; GFX6-NEXT:    v_mov_b32_e32 v1, s0
5634; GFX6-NEXT:    s_sub_u32 s0, s4, s12
5635; GFX6-NEXT:    v_mov_b32_e32 v2, s1
5636; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
5637; GFX6-NEXT:    s_and_b32 s1, s1, 1
5638; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
5639; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
5640; GFX6-NEXT:    s_subb_u32 s1, s5, s13
5641; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5642; GFX6-NEXT:    v_mov_b32_e32 v0, s2
5643; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
5644; GFX6-NEXT:    s_and_b32 s2, s2, 1
5645; GFX6-NEXT:    v_mov_b32_e32 v3, s16
5646; GFX6-NEXT:    v_mov_b32_e32 v4, s17
5647; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
5648; GFX6-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
5649; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
5650; GFX6-NEXT:    v_mov_b32_e32 v1, s3
5651; GFX6-NEXT:    v_mov_b32_e32 v2, s18
5652; GFX6-NEXT:    v_mov_b32_e32 v3, s19
5653; GFX6-NEXT:    s_subb_u32 s2, s6, s14
5654; GFX6-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc
5655; GFX6-NEXT:    v_cndmask_b32_e32 v7, v3, v1, vcc
5656; GFX6-NEXT:    s_cselect_b32 s3, 1, 0
5657; GFX6-NEXT:    v_mov_b32_e32 v2, s4
5658; GFX6-NEXT:    s_and_b32 s3, s3, 1
5659; GFX6-NEXT:    v_mov_b32_e32 v3, s5
5660; GFX6-NEXT:    s_cmp_lg_u32 s3, 0
5661; GFX6-NEXT:    v_mov_b32_e32 v0, s6
5662; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
5663; GFX6-NEXT:    s_subb_u32 s3, s7, s15
5664; GFX6-NEXT:    v_mov_b32_e32 v1, s7
5665; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
5666; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
5667; GFX6-NEXT:    v_cmp_gt_u64_e64 s[4:5], s[12:13], 0
5668; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
5669; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1]
5670; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
5671; GFX6-NEXT:    v_cmp_gt_i64_e64 s[4:5], s[14:15], 0
5672; GFX6-NEXT:    s_ashr_i32 s7, s3, 31
5673; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
5674; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
5675; GFX6-NEXT:    s_add_u32 s4, s7, 0
5676; GFX6-NEXT:    s_cselect_b32 s5, 1, 0
5677; GFX6-NEXT:    s_and_b32 s5, s5, 1
5678; GFX6-NEXT:    s_cmp_lg_u32 s5, 0
5679; GFX6-NEXT:    s_addc_u32 s5, s7, 0
5680; GFX6-NEXT:    s_cselect_b32 s6, 1, 0
5681; GFX6-NEXT:    s_and_b32 s6, s6, 1
5682; GFX6-NEXT:    v_cmp_eq_u64_e64 vcc, s[14:15], 0
5683; GFX6-NEXT:    s_cmp_lg_u32 s6, 0
5684; GFX6-NEXT:    s_addc_u32 s6, s7, 0
5685; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
5686; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
5687; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
5688; GFX6-NEXT:    s_and_b32 s9, s9, 1
5689; GFX6-NEXT:    s_cmp_lg_u32 s9, 0
5690; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
5691; GFX6-NEXT:    s_addc_u32 s7, s7, s8
5692; GFX6-NEXT:    v_mov_b32_e32 v1, s4
5693; GFX6-NEXT:    v_mov_b32_e32 v2, s5
5694; GFX6-NEXT:    v_mov_b32_e32 v3, s0
5695; GFX6-NEXT:    v_mov_b32_e32 v8, s1
5696; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5697; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
5698; GFX6-NEXT:    v_cndmask_b32_e32 v1, v8, v2, vcc
5699; GFX6-NEXT:    v_mov_b32_e32 v2, s6
5700; GFX6-NEXT:    v_mov_b32_e32 v3, s7
5701; GFX6-NEXT:    v_mov_b32_e32 v8, s2
5702; GFX6-NEXT:    v_mov_b32_e32 v9, s3
5703; GFX6-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
5704; GFX6-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
5705; GFX6-NEXT:    v_readfirstlane_b32 s0, v5
5706; GFX6-NEXT:    v_readfirstlane_b32 s1, v4
5707; GFX6-NEXT:    v_readfirstlane_b32 s2, v6
5708; GFX6-NEXT:    v_readfirstlane_b32 s3, v7
5709; GFX6-NEXT:    v_readfirstlane_b32 s4, v0
5710; GFX6-NEXT:    v_readfirstlane_b32 s5, v1
5711; GFX6-NEXT:    v_readfirstlane_b32 s6, v2
5712; GFX6-NEXT:    v_readfirstlane_b32 s7, v3
5713; GFX6-NEXT:    ; return to shader part epilog
5714;
5715; GFX8-LABEL: s_ssubsat_v2i128:
5716; GFX8:       ; %bb.0:
5717; GFX8-NEXT:    s_sub_u32 s16, s0, s8
5718; GFX8-NEXT:    s_cselect_b32 s17, 1, 0
5719; GFX8-NEXT:    s_and_b32 s17, s17, 1
5720; GFX8-NEXT:    s_cmp_lg_u32 s17, 0
5721; GFX8-NEXT:    s_subb_u32 s17, s1, s9
5722; GFX8-NEXT:    s_cselect_b32 s18, 1, 0
5723; GFX8-NEXT:    s_and_b32 s18, s18, 1
5724; GFX8-NEXT:    s_cmp_lg_u32 s18, 0
5725; GFX8-NEXT:    s_subb_u32 s18, s2, s10
5726; GFX8-NEXT:    s_cselect_b32 s19, 1, 0
5727; GFX8-NEXT:    s_and_b32 s19, s19, 1
5728; GFX8-NEXT:    v_mov_b32_e32 v3, s1
5729; GFX8-NEXT:    s_cmp_lg_u32 s19, 0
5730; GFX8-NEXT:    v_mov_b32_e32 v2, s0
5731; GFX8-NEXT:    s_subb_u32 s19, s3, s11
5732; GFX8-NEXT:    v_mov_b32_e32 v0, s2
5733; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3]
5734; GFX8-NEXT:    v_mov_b32_e32 v1, s3
5735; GFX8-NEXT:    s_cmp_eq_u64 s[18:19], s[2:3]
5736; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
5737; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
5738; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[18:19], v[0:1]
5739; GFX8-NEXT:    s_and_b32 s0, 1, s2
5740; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5741; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
5742; GFX8-NEXT:    v_cmp_gt_u64_e64 s[0:1], s[8:9], 0
5743; GFX8-NEXT:    s_cmp_eq_u64 s[10:11], 0
5744; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5745; GFX8-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[10:11], 0
5746; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
5747; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5748; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
5749; GFX8-NEXT:    s_and_b32 s0, 1, s2
5750; GFX8-NEXT:    s_ashr_i32 s3, s19, 31
5751; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
5752; GFX8-NEXT:    s_add_u32 s0, s3, 0
5753; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
5754; GFX8-NEXT:    s_and_b32 s1, s1, 1
5755; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
5756; GFX8-NEXT:    s_addc_u32 s1, s3, 0
5757; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
5758; GFX8-NEXT:    s_and_b32 s2, s2, 1
5759; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
5760; GFX8-NEXT:    s_addc_u32 s2, s3, 0
5761; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
5762; GFX8-NEXT:    s_and_b32 s9, s9, 1
5763; GFX8-NEXT:    s_brev_b32 s8, 1
5764; GFX8-NEXT:    s_cmp_lg_u32 s9, 0
5765; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
5766; GFX8-NEXT:    s_addc_u32 s3, s3, s8
5767; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
5768; GFX8-NEXT:    v_mov_b32_e32 v1, s0
5769; GFX8-NEXT:    s_sub_u32 s0, s4, s12
5770; GFX8-NEXT:    v_mov_b32_e32 v2, s1
5771; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
5772; GFX8-NEXT:    s_and_b32 s1, s1, 1
5773; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
5774; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
5775; GFX8-NEXT:    s_subb_u32 s1, s5, s13
5776; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5777; GFX8-NEXT:    v_mov_b32_e32 v0, s2
5778; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
5779; GFX8-NEXT:    s_and_b32 s2, s2, 1
5780; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
5781; GFX8-NEXT:    v_mov_b32_e32 v3, s16
5782; GFX8-NEXT:    v_mov_b32_e32 v4, s17
5783; GFX8-NEXT:    s_subb_u32 s2, s6, s14
5784; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
5785; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
5786; GFX8-NEXT:    v_mov_b32_e32 v1, s3
5787; GFX8-NEXT:    v_mov_b32_e32 v2, s18
5788; GFX8-NEXT:    v_mov_b32_e32 v3, s19
5789; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
5790; GFX8-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc
5791; GFX8-NEXT:    v_cndmask_b32_e32 v7, v3, v1, vcc
5792; GFX8-NEXT:    s_and_b32 s3, s3, 1
5793; GFX8-NEXT:    v_mov_b32_e32 v2, s4
5794; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
5795; GFX8-NEXT:    v_mov_b32_e32 v3, s5
5796; GFX8-NEXT:    s_subb_u32 s3, s7, s15
5797; GFX8-NEXT:    v_mov_b32_e32 v0, s6
5798; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
5799; GFX8-NEXT:    v_mov_b32_e32 v1, s7
5800; GFX8-NEXT:    s_cmp_eq_u64 s[2:3], s[6:7]
5801; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
5802; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
5803; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
5804; GFX8-NEXT:    s_and_b32 s4, 1, s6
5805; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5806; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
5807; GFX8-NEXT:    v_cmp_gt_u64_e64 s[4:5], s[12:13], 0
5808; GFX8-NEXT:    s_cmp_eq_u64 s[14:15], 0
5809; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
5810; GFX8-NEXT:    v_cmp_gt_i64_e64 s[4:5], s[14:15], 0
5811; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
5812; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5813; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
5814; GFX8-NEXT:    s_and_b32 s4, 1, s6
5815; GFX8-NEXT:    s_ashr_i32 s7, s3, 31
5816; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
5817; GFX8-NEXT:    s_add_u32 s4, s7, 0
5818; GFX8-NEXT:    s_cselect_b32 s5, 1, 0
5819; GFX8-NEXT:    s_and_b32 s5, s5, 1
5820; GFX8-NEXT:    s_cmp_lg_u32 s5, 0
5821; GFX8-NEXT:    s_addc_u32 s5, s7, 0
5822; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
5823; GFX8-NEXT:    s_and_b32 s6, s6, 1
5824; GFX8-NEXT:    s_cmp_lg_u32 s6, 0
5825; GFX8-NEXT:    s_addc_u32 s6, s7, 0
5826; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
5827; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
5828; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
5829; GFX8-NEXT:    s_and_b32 s9, s9, 1
5830; GFX8-NEXT:    s_cmp_lg_u32 s9, 0
5831; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
5832; GFX8-NEXT:    s_addc_u32 s7, s7, s8
5833; GFX8-NEXT:    v_mov_b32_e32 v1, s4
5834; GFX8-NEXT:    v_mov_b32_e32 v2, s5
5835; GFX8-NEXT:    v_mov_b32_e32 v3, s0
5836; GFX8-NEXT:    v_mov_b32_e32 v8, s1
5837; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5838; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
5839; GFX8-NEXT:    v_cndmask_b32_e32 v1, v8, v2, vcc
5840; GFX8-NEXT:    v_mov_b32_e32 v2, s6
5841; GFX8-NEXT:    v_mov_b32_e32 v3, s7
5842; GFX8-NEXT:    v_mov_b32_e32 v8, s2
5843; GFX8-NEXT:    v_mov_b32_e32 v9, s3
5844; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
5845; GFX8-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
5846; GFX8-NEXT:    v_readfirstlane_b32 s0, v5
5847; GFX8-NEXT:    v_readfirstlane_b32 s1, v4
5848; GFX8-NEXT:    v_readfirstlane_b32 s2, v6
5849; GFX8-NEXT:    v_readfirstlane_b32 s3, v7
5850; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
5851; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
5852; GFX8-NEXT:    v_readfirstlane_b32 s6, v2
5853; GFX8-NEXT:    v_readfirstlane_b32 s7, v3
5854; GFX8-NEXT:    ; return to shader part epilog
5855;
5856; GFX9-LABEL: s_ssubsat_v2i128:
5857; GFX9:       ; %bb.0:
5858; GFX9-NEXT:    s_sub_u32 s16, s0, s8
5859; GFX9-NEXT:    s_cselect_b32 s17, 1, 0
5860; GFX9-NEXT:    s_and_b32 s17, s17, 1
5861; GFX9-NEXT:    s_cmp_lg_u32 s17, 0
5862; GFX9-NEXT:    s_subb_u32 s17, s1, s9
5863; GFX9-NEXT:    s_cselect_b32 s18, 1, 0
5864; GFX9-NEXT:    s_and_b32 s18, s18, 1
5865; GFX9-NEXT:    s_cmp_lg_u32 s18, 0
5866; GFX9-NEXT:    s_subb_u32 s18, s2, s10
5867; GFX9-NEXT:    s_cselect_b32 s19, 1, 0
5868; GFX9-NEXT:    s_and_b32 s19, s19, 1
5869; GFX9-NEXT:    v_mov_b32_e32 v3, s1
5870; GFX9-NEXT:    s_cmp_lg_u32 s19, 0
5871; GFX9-NEXT:    v_mov_b32_e32 v2, s0
5872; GFX9-NEXT:    s_subb_u32 s19, s3, s11
5873; GFX9-NEXT:    v_mov_b32_e32 v0, s2
5874; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3]
5875; GFX9-NEXT:    v_mov_b32_e32 v1, s3
5876; GFX9-NEXT:    s_cmp_eq_u64 s[18:19], s[2:3]
5877; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
5878; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
5879; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[18:19], v[0:1]
5880; GFX9-NEXT:    s_and_b32 s0, 1, s2
5881; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5882; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
5883; GFX9-NEXT:    v_cmp_gt_u64_e64 s[0:1], s[8:9], 0
5884; GFX9-NEXT:    s_cmp_eq_u64 s[10:11], 0
5885; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5886; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[10:11], 0
5887; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
5888; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5889; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
5890; GFX9-NEXT:    s_and_b32 s0, 1, s2
5891; GFX9-NEXT:    s_ashr_i32 s3, s19, 31
5892; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
5893; GFX9-NEXT:    s_add_u32 s0, s3, 0
5894; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
5895; GFX9-NEXT:    s_and_b32 s1, s1, 1
5896; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
5897; GFX9-NEXT:    s_addc_u32 s1, s3, 0
5898; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
5899; GFX9-NEXT:    s_and_b32 s2, s2, 1
5900; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
5901; GFX9-NEXT:    s_addc_u32 s2, s3, 0
5902; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
5903; GFX9-NEXT:    s_and_b32 s9, s9, 1
5904; GFX9-NEXT:    s_brev_b32 s8, 1
5905; GFX9-NEXT:    s_cmp_lg_u32 s9, 0
5906; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
5907; GFX9-NEXT:    s_addc_u32 s3, s3, s8
5908; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
5909; GFX9-NEXT:    v_mov_b32_e32 v1, s0
5910; GFX9-NEXT:    s_sub_u32 s0, s4, s12
5911; GFX9-NEXT:    v_mov_b32_e32 v2, s1
5912; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
5913; GFX9-NEXT:    s_and_b32 s1, s1, 1
5914; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
5915; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
5916; GFX9-NEXT:    s_subb_u32 s1, s5, s13
5917; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5918; GFX9-NEXT:    v_mov_b32_e32 v0, s2
5919; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
5920; GFX9-NEXT:    s_and_b32 s2, s2, 1
5921; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
5922; GFX9-NEXT:    v_mov_b32_e32 v3, s16
5923; GFX9-NEXT:    v_mov_b32_e32 v4, s17
5924; GFX9-NEXT:    s_subb_u32 s2, s6, s14
5925; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
5926; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
5927; GFX9-NEXT:    v_mov_b32_e32 v1, s3
5928; GFX9-NEXT:    v_mov_b32_e32 v2, s18
5929; GFX9-NEXT:    v_mov_b32_e32 v3, s19
5930; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
5931; GFX9-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc
5932; GFX9-NEXT:    v_cndmask_b32_e32 v7, v3, v1, vcc
5933; GFX9-NEXT:    s_and_b32 s3, s3, 1
5934; GFX9-NEXT:    v_mov_b32_e32 v2, s4
5935; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
5936; GFX9-NEXT:    v_mov_b32_e32 v3, s5
5937; GFX9-NEXT:    s_subb_u32 s3, s7, s15
5938; GFX9-NEXT:    v_mov_b32_e32 v0, s6
5939; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
5940; GFX9-NEXT:    v_mov_b32_e32 v1, s7
5941; GFX9-NEXT:    s_cmp_eq_u64 s[2:3], s[6:7]
5942; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
5943; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
5944; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
5945; GFX9-NEXT:    s_and_b32 s4, 1, s6
5946; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5947; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
5948; GFX9-NEXT:    v_cmp_gt_u64_e64 s[4:5], s[12:13], 0
5949; GFX9-NEXT:    s_cmp_eq_u64 s[14:15], 0
5950; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
5951; GFX9-NEXT:    v_cmp_gt_i64_e64 s[4:5], s[14:15], 0
5952; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
5953; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5954; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
5955; GFX9-NEXT:    s_and_b32 s4, 1, s6
5956; GFX9-NEXT:    s_ashr_i32 s7, s3, 31
5957; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
5958; GFX9-NEXT:    s_add_u32 s4, s7, 0
5959; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
5960; GFX9-NEXT:    s_and_b32 s5, s5, 1
5961; GFX9-NEXT:    s_cmp_lg_u32 s5, 0
5962; GFX9-NEXT:    s_addc_u32 s5, s7, 0
5963; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
5964; GFX9-NEXT:    s_and_b32 s6, s6, 1
5965; GFX9-NEXT:    s_cmp_lg_u32 s6, 0
5966; GFX9-NEXT:    s_addc_u32 s6, s7, 0
5967; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
5968; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
5969; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
5970; GFX9-NEXT:    s_and_b32 s9, s9, 1
5971; GFX9-NEXT:    s_cmp_lg_u32 s9, 0
5972; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
5973; GFX9-NEXT:    s_addc_u32 s7, s7, s8
5974; GFX9-NEXT:    v_mov_b32_e32 v1, s4
5975; GFX9-NEXT:    v_mov_b32_e32 v2, s5
5976; GFX9-NEXT:    v_mov_b32_e32 v3, s0
5977; GFX9-NEXT:    v_mov_b32_e32 v8, s1
5978; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5979; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
5980; GFX9-NEXT:    v_cndmask_b32_e32 v1, v8, v2, vcc
5981; GFX9-NEXT:    v_mov_b32_e32 v2, s6
5982; GFX9-NEXT:    v_mov_b32_e32 v3, s7
5983; GFX9-NEXT:    v_mov_b32_e32 v8, s2
5984; GFX9-NEXT:    v_mov_b32_e32 v9, s3
5985; GFX9-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
5986; GFX9-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
5987; GFX9-NEXT:    v_readfirstlane_b32 s0, v5
5988; GFX9-NEXT:    v_readfirstlane_b32 s1, v4
5989; GFX9-NEXT:    v_readfirstlane_b32 s2, v6
5990; GFX9-NEXT:    v_readfirstlane_b32 s3, v7
5991; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
5992; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
5993; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
5994; GFX9-NEXT:    v_readfirstlane_b32 s7, v3
5995; GFX9-NEXT:    ; return to shader part epilog
5996;
5997; GFX10-LABEL: s_ssubsat_v2i128:
5998; GFX10:       ; %bb.0:
5999; GFX10-NEXT:    s_sub_u32 s16, s0, s8
6000; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
6001; GFX10-NEXT:    s_and_b32 s17, s17, 1
6002; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
6003; GFX10-NEXT:    s_subb_u32 s17, s1, s9
6004; GFX10-NEXT:    s_cselect_b32 s18, 1, 0
6005; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[16:17], s[0:1]
6006; GFX10-NEXT:    s_and_b32 s18, s18, 1
6007; GFX10-NEXT:    v_cmp_gt_u64_e64 s1, s[8:9], 0
6008; GFX10-NEXT:    s_cmp_lg_u32 s18, 0
6009; GFX10-NEXT:    s_subb_u32 s18, s2, s10
6010; GFX10-NEXT:    s_cselect_b32 s19, 1, 0
6011; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
6012; GFX10-NEXT:    s_and_b32 s19, s19, 1
6013; GFX10-NEXT:    s_cmp_lg_u32 s19, 0
6014; GFX10-NEXT:    s_subb_u32 s19, s3, s11
6015; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[18:19], s[2:3]
6016; GFX10-NEXT:    s_cmp_eq_u64 s[18:19], s[2:3]
6017; GFX10-NEXT:    v_mov_b32_e32 v3, s19
6018; GFX10-NEXT:    s_cselect_b32 s20, 1, 0
6019; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
6020; GFX10-NEXT:    s_and_b32 s0, 1, s20
6021; GFX10-NEXT:    s_cmp_eq_u64 s[10:11], 0
6022; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
6023; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
6024; GFX10-NEXT:    s_ashr_i32 s3, s19, 31
6025; GFX10-NEXT:    s_and_b32 s0, 1, s0
6026; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
6027; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
6028; GFX10-NEXT:    v_cmp_gt_i64_e64 s1, s[10:11], 0
6029; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
6030; GFX10-NEXT:    s_add_u32 s0, s3, 0
6031; GFX10-NEXT:    s_brev_b32 s10, 1
6032; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s1
6033; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
6034; GFX10-NEXT:    s_and_b32 s1, s1, 1
6035; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
6036; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
6037; GFX10-NEXT:    v_mov_b32_e32 v2, s17
6038; GFX10-NEXT:    s_addc_u32 s1, s3, 0
6039; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
6040; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
6041; GFX10-NEXT:    s_and_b32 s2, s2, 1
6042; GFX10-NEXT:    v_mov_b32_e32 v1, s16
6043; GFX10-NEXT:    s_cmp_lg_u32 s2, 0
6044; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
6045; GFX10-NEXT:    s_addc_u32 s2, s3, 0
6046; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
6047; GFX10-NEXT:    s_and_b32 s8, s8, 1
6048; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
6049; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
6050; GFX10-NEXT:    s_addc_u32 s3, s3, s10
6051; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, s0, vcc_lo
6052; GFX10-NEXT:    s_sub_u32 s0, s4, s12
6053; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
6054; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, s1, vcc_lo
6055; GFX10-NEXT:    s_and_b32 s8, s8, 1
6056; GFX10-NEXT:    v_mov_b32_e32 v2, s18
6057; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
6058; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s3, vcc_lo
6059; GFX10-NEXT:    s_subb_u32 s1, s5, s13
6060; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
6061; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
6062; GFX10-NEXT:    s_and_b32 s8, s8, 1
6063; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s2, vcc_lo
6064; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
6065; GFX10-NEXT:    v_cmp_gt_u64_e64 s3, s[12:13], 0
6066; GFX10-NEXT:    s_subb_u32 s8, s6, s14
6067; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
6068; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s4
6069; GFX10-NEXT:    s_and_b32 s9, s9, 1
6070; GFX10-NEXT:    v_mov_b32_e32 v7, s8
6071; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
6072; GFX10-NEXT:    s_subb_u32 s9, s7, s15
6073; GFX10-NEXT:    s_cmp_eq_u64 s[8:9], s[6:7]
6074; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, s[8:9], s[6:7]
6075; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
6076; GFX10-NEXT:    v_mov_b32_e32 v8, s9
6077; GFX10-NEXT:    s_and_b32 s2, 1, s2
6078; GFX10-NEXT:    s_cmp_eq_u64 s[14:15], 0
6079; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
6080; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s4
6081; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
6082; GFX10-NEXT:    s_ashr_i32 s5, s9, 31
6083; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc_lo
6084; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s3
6085; GFX10-NEXT:    v_cmp_gt_i64_e64 s3, s[14:15], 0
6086; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s3
6087; GFX10-NEXT:    s_and_b32 s3, 1, s2
6088; GFX10-NEXT:    s_add_u32 s2, s5, 0
6089; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s3
6090; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
6091; GFX10-NEXT:    s_and_b32 s4, s4, 1
6092; GFX10-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc_lo
6093; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
6094; GFX10-NEXT:    v_mov_b32_e32 v6, s1
6095; GFX10-NEXT:    s_addc_u32 s3, s5, 0
6096; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
6097; GFX10-NEXT:    v_xor_b32_e32 v4, v5, v4
6098; GFX10-NEXT:    s_and_b32 s4, s4, 1
6099; GFX10-NEXT:    v_mov_b32_e32 v5, s0
6100; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
6101; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
6102; GFX10-NEXT:    v_and_b32_e32 v4, 1, v4
6103; GFX10-NEXT:    s_addc_u32 s4, s5, 0
6104; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
6105; GFX10-NEXT:    s_and_b32 s6, s6, 1
6106; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
6107; GFX10-NEXT:    s_cmp_lg_u32 s6, 0
6108; GFX10-NEXT:    s_addc_u32 s1, s5, s10
6109; GFX10-NEXT:    v_cndmask_b32_e64 v4, v5, s2, vcc_lo
6110; GFX10-NEXT:    v_cndmask_b32_e64 v5, v6, s3, vcc_lo
6111; GFX10-NEXT:    v_cndmask_b32_e64 v6, v7, s4, vcc_lo
6112; GFX10-NEXT:    v_cndmask_b32_e64 v7, v8, s1, vcc_lo
6113; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
6114; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
6115; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
6116; GFX10-NEXT:    v_readfirstlane_b32 s4, v4
6117; GFX10-NEXT:    v_readfirstlane_b32 s5, v5
6118; GFX10-NEXT:    v_readfirstlane_b32 s6, v6
6119; GFX10-NEXT:    v_readfirstlane_b32 s7, v7
6120; GFX10-NEXT:    ; return to shader part epilog
6121  %result = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
6122  ret <2 x i128> %result
6123}
6124
; Declarations of the llvm.ssub.sat intrinsic overloads, one per operand type,
; covering scalar widths from i7 to i128 and several vector shapes. Each
; overload takes two operands of the same type and returns that type, and
; every declaration carries attribute group #0 (defined at the end).
; 7-bit and 8-bit element types.
6125declare i7 @llvm.ssub.sat.i7(i7, i7) #0
6126declare i8 @llvm.ssub.sat.i8(i8, i8) #0
6127declare <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8>, <2 x i8>) #0
6128declare <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8>, <4 x i8>) #0
6129
; 16-bit element types, including the odd vector lengths 3 and 5.
6130declare i16 @llvm.ssub.sat.i16(i16, i16) #0
6131declare <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16>, <2 x i16>) #0
6132declare <3 x i16> @llvm.ssub.sat.v3i16(<3 x i16>, <3 x i16>) #0
6133declare <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16>, <4 x i16>) #0
6134declare <5 x i16> @llvm.ssub.sat.v5i16(<5 x i16>, <5 x i16>) #0
6135declare <6 x i16> @llvm.ssub.sat.v6i16(<6 x i16>, <6 x i16>) #0
6136declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16>, <8 x i16>) #0
6137
; 24-bit scalar.
6138declare i24 @llvm.ssub.sat.i24(i24, i24) #0
6139
; 32-bit element types.
6140declare i32 @llvm.ssub.sat.i32(i32, i32) #0
6141declare <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32>, <2 x i32>) #0
6142declare <3 x i32> @llvm.ssub.sat.v3i32(<3 x i32>, <3 x i32>) #0
6143declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>) #0
6144declare <5 x i32> @llvm.ssub.sat.v5i32(<5 x i32>, <5 x i32>) #0
6145declare <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32>, <16 x i32>) #0
6146
; 48-bit scalar.
6147declare i48 @llvm.ssub.sat.i48(i48, i48) #0
6148
; 64-bit element types.
6149declare i64 @llvm.ssub.sat.i64(i64, i64) #0
6150declare <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64>, <2 x i64>) #0
6151
; 128-bit element types.
6152declare i128 @llvm.ssub.sat.i128(i128, i128) #0
6153declare <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128>, <2 x i128>) #0
6154
; Attribute group referenced by every intrinsic declaration above.
6155attributes #0 = { nounwind readnone speculatable willreturn }
6156