1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
3; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
4; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
5; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s
6
; Signed saturating add of two i7 values via @llvm.sadd.sat.i7, with plain
; (non-inreg) arguments. In the checks below the operands are shifted left
; by 25 (GFX6) or 9 (GFX8/GFX9/GFX10) before the add, and the result is
; arithmetic-shifted right by the same amount afterwards. GFX6 and GFX8
; expect a min/max/sub/add sequence; GFX9 and GFX10 expect a single add with
; the clamp modifier.
7define i7 @v_saddsat_i7(i7 %lhs, i7 %rhs) {
8; GFX6-LABEL: v_saddsat_i7:
9; GFX6:       ; %bb.0:
10; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 25, v0
12; GFX6-NEXT:    v_min_i32_e32 v3, 0, v0
13; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 25, v1
14; GFX6-NEXT:    v_max_i32_e32 v2, 0, v0
15; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0x80000000, v3
16; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 0x7fffffff, v2
17; GFX6-NEXT:    v_max_i32_e32 v1, v3, v1
18; GFX6-NEXT:    v_min_i32_e32 v1, v1, v2
19; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
20; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 25, v0
21; GFX6-NEXT:    s_setpc_b64 s[30:31]
22;
23; GFX8-LABEL: v_saddsat_i7:
24; GFX8:       ; %bb.0:
25; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 9, v0
27; GFX8-NEXT:    v_min_i16_e32 v3, 0, v0
28; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 9, v1
29; GFX8-NEXT:    v_max_i16_e32 v2, 0, v0
30; GFX8-NEXT:    v_sub_u16_e32 v3, 0x8000, v3
31; GFX8-NEXT:    v_sub_u16_e32 v2, 0x7fff, v2
32; GFX8-NEXT:    v_max_i16_e32 v1, v3, v1
33; GFX8-NEXT:    v_min_i16_e32 v1, v1, v2
34; GFX8-NEXT:    v_add_u16_e32 v0, v0, v1
35; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 9, v0
36; GFX8-NEXT:    s_setpc_b64 s[30:31]
37;
38; GFX9-LABEL: v_saddsat_i7:
39; GFX9:       ; %bb.0:
40; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 9, v0
42; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 9, v1
43; GFX9-NEXT:    v_add_i16 v0, v0, v1 clamp
44; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 9, v0
45; GFX9-NEXT:    s_setpc_b64 s[30:31]
46;
47; GFX10-LABEL: v_saddsat_i7:
48; GFX10:       ; %bb.0:
49; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
50; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
51; GFX10-NEXT:    v_lshlrev_b16 v0, 9, v0
52; GFX10-NEXT:    v_lshlrev_b16 v1, 9, v1
53; GFX10-NEXT:    v_add_nc_i16 v0, v0, v1 clamp
54; GFX10-NEXT:    v_ashrrev_i16 v0, 9, v0
55; GFX10-NEXT:    s_setpc_b64 s[30:31]
56  %result = call i7 @llvm.sadd.sat.i7(i7 %lhs, i7 %rhs)
57  ret i7 %result
58}
59
; Signed saturating add of two i7 values via @llvm.sadd.sat.i7, declared
; amdgpu_ps with inreg arguments. The GFX6 and GFX8 checks expect an
; all-scalar (s_*) min/max/sub/add sequence. The GFX9 and GFX10 checks expect
; scalar shifts, a vector add with the clamp modifier, and a
; v_readfirstlane_b32 that copies the result into s0.
60define amdgpu_ps i7 @s_saddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
61; GFX6-LABEL: s_saddsat_i7:
62; GFX6:       ; %bb.0:
63; GFX6-NEXT:    s_lshl_b32 s0, s0, 25
64; GFX6-NEXT:    s_min_i32 s3, s0, 0
65; GFX6-NEXT:    s_lshl_b32 s1, s1, 25
66; GFX6-NEXT:    s_max_i32 s2, s0, 0
67; GFX6-NEXT:    s_sub_i32 s3, 0x80000000, s3
68; GFX6-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
69; GFX6-NEXT:    s_max_i32 s1, s3, s1
70; GFX6-NEXT:    s_min_i32 s1, s1, s2
71; GFX6-NEXT:    s_add_i32 s0, s0, s1
72; GFX6-NEXT:    s_ashr_i32 s0, s0, 25
73; GFX6-NEXT:    ; return to shader part epilog
74;
75; GFX8-LABEL: s_saddsat_i7:
76; GFX8:       ; %bb.0:
77; GFX8-NEXT:    s_bfe_u32 s2, 9, 0x100000
78; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
79; GFX8-NEXT:    s_sext_i32_i16 s3, s0
80; GFX8-NEXT:    s_sext_i32_i16 s4, 0
81; GFX8-NEXT:    s_max_i32 s5, s3, s4
82; GFX8-NEXT:    s_min_i32 s3, s3, s4
83; GFX8-NEXT:    s_lshl_b32 s1, s1, s2
84; GFX8-NEXT:    s_sub_i32 s3, 0xffff8000, s3
85; GFX8-NEXT:    s_sext_i32_i16 s3, s3
86; GFX8-NEXT:    s_sext_i32_i16 s1, s1
87; GFX8-NEXT:    s_sub_i32 s5, 0x7fff, s5
88; GFX8-NEXT:    s_max_i32 s1, s3, s1
89; GFX8-NEXT:    s_sext_i32_i16 s1, s1
90; GFX8-NEXT:    s_sext_i32_i16 s3, s5
91; GFX8-NEXT:    s_min_i32 s1, s1, s3
92; GFX8-NEXT:    s_add_i32 s0, s0, s1
93; GFX8-NEXT:    s_sext_i32_i16 s0, s0
94; GFX8-NEXT:    s_ashr_i32 s0, s0, s2
95; GFX8-NEXT:    ; return to shader part epilog
96;
97; GFX9-LABEL: s_saddsat_i7:
98; GFX9:       ; %bb.0:
99; GFX9-NEXT:    s_bfe_u32 s2, 9, 0x100000
100; GFX9-NEXT:    s_lshl_b32 s1, s1, s2
101; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
102; GFX9-NEXT:    v_mov_b32_e32 v0, s1
103; GFX9-NEXT:    v_add_i16 v0, s0, v0 clamp
104; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 9, v0
105; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
106; GFX9-NEXT:    ; return to shader part epilog
107;
108; GFX10-LABEL: s_saddsat_i7:
109; GFX10:       ; %bb.0:
110; GFX10-NEXT:    s_bfe_u32 s2, 9, 0x100000
111; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
112; GFX10-NEXT:    s_lshl_b32 s1, s1, s2
113; GFX10-NEXT:    v_add_nc_i16 v0, s0, s1 clamp
114; GFX10-NEXT:    v_ashrrev_i16 v0, 9, v0
115; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
116; GFX10-NEXT:    ; return to shader part epilog
117  %result = call i7 @llvm.sadd.sat.i7(i7 %lhs, i7 %rhs)
118  ret i7 %result
119}
120
; Signed saturating add of two i8 values via @llvm.sadd.sat.i8, with plain
; (non-inreg) arguments. In the checks below the operands are shifted left
; by 24 (GFX6) or 8 (GFX8/GFX9/GFX10) before the add, and the result is
; arithmetic-shifted right by the same amount afterwards. GFX6 and GFX8
; expect a min/max/sub/add sequence; GFX9 and GFX10 expect a single add with
; the clamp modifier.
121define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) {
122; GFX6-LABEL: v_saddsat_i8:
123; GFX6:       ; %bb.0:
124; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
125; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
126; GFX6-NEXT:    v_min_i32_e32 v3, 0, v0
127; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
128; GFX6-NEXT:    v_max_i32_e32 v2, 0, v0
129; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0x80000000, v3
130; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 0x7fffffff, v2
131; GFX6-NEXT:    v_max_i32_e32 v1, v3, v1
132; GFX6-NEXT:    v_min_i32_e32 v1, v1, v2
133; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
134; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
135; GFX6-NEXT:    s_setpc_b64 s[30:31]
136;
137; GFX8-LABEL: v_saddsat_i8:
138; GFX8:       ; %bb.0:
139; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
140; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
141; GFX8-NEXT:    v_min_i16_e32 v3, 0, v0
142; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
143; GFX8-NEXT:    v_max_i16_e32 v2, 0, v0
144; GFX8-NEXT:    v_sub_u16_e32 v3, 0x8000, v3
145; GFX8-NEXT:    v_sub_u16_e32 v2, 0x7fff, v2
146; GFX8-NEXT:    v_max_i16_e32 v1, v3, v1
147; GFX8-NEXT:    v_min_i16_e32 v1, v1, v2
148; GFX8-NEXT:    v_add_u16_e32 v0, v0, v1
149; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 8, v0
150; GFX8-NEXT:    s_setpc_b64 s[30:31]
151;
152; GFX9-LABEL: v_saddsat_i8:
153; GFX9:       ; %bb.0:
154; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
155; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
156; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
157; GFX9-NEXT:    v_add_i16 v0, v0, v1 clamp
158; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 8, v0
159; GFX9-NEXT:    s_setpc_b64 s[30:31]
160;
161; GFX10-LABEL: v_saddsat_i8:
162; GFX10:       ; %bb.0:
163; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
164; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
165; GFX10-NEXT:    v_lshlrev_b16 v0, 8, v0
166; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v1
167; GFX10-NEXT:    v_add_nc_i16 v0, v0, v1 clamp
168; GFX10-NEXT:    v_ashrrev_i16 v0, 8, v0
169; GFX10-NEXT:    s_setpc_b64 s[30:31]
170  %result = call i8 @llvm.sadd.sat.i8(i8 %lhs, i8 %rhs)
171  ret i8 %result
172}
173
; Signed saturating add of two i8 values via @llvm.sadd.sat.i8, declared
; amdgpu_ps with inreg arguments. The GFX6 and GFX8 checks expect an
; all-scalar (s_*) min/max/sub/add sequence. The GFX9 and GFX10 checks expect
; scalar shifts, a vector add with the clamp modifier, and a
; v_readfirstlane_b32 that copies the result into s0.
174define amdgpu_ps i8 @s_saddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
175; GFX6-LABEL: s_saddsat_i8:
176; GFX6:       ; %bb.0:
177; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
178; GFX6-NEXT:    s_min_i32 s3, s0, 0
179; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
180; GFX6-NEXT:    s_max_i32 s2, s0, 0
181; GFX6-NEXT:    s_sub_i32 s3, 0x80000000, s3
182; GFX6-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
183; GFX6-NEXT:    s_max_i32 s1, s3, s1
184; GFX6-NEXT:    s_min_i32 s1, s1, s2
185; GFX6-NEXT:    s_add_i32 s0, s0, s1
186; GFX6-NEXT:    s_ashr_i32 s0, s0, 24
187; GFX6-NEXT:    ; return to shader part epilog
188;
189; GFX8-LABEL: s_saddsat_i8:
190; GFX8:       ; %bb.0:
191; GFX8-NEXT:    s_bfe_u32 s2, 8, 0x100000
192; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
193; GFX8-NEXT:    s_sext_i32_i16 s3, s0
194; GFX8-NEXT:    s_sext_i32_i16 s4, 0
195; GFX8-NEXT:    s_max_i32 s5, s3, s4
196; GFX8-NEXT:    s_min_i32 s3, s3, s4
197; GFX8-NEXT:    s_lshl_b32 s1, s1, s2
198; GFX8-NEXT:    s_sub_i32 s3, 0xffff8000, s3
199; GFX8-NEXT:    s_sext_i32_i16 s3, s3
200; GFX8-NEXT:    s_sext_i32_i16 s1, s1
201; GFX8-NEXT:    s_sub_i32 s5, 0x7fff, s5
202; GFX8-NEXT:    s_max_i32 s1, s3, s1
203; GFX8-NEXT:    s_sext_i32_i16 s1, s1
204; GFX8-NEXT:    s_sext_i32_i16 s3, s5
205; GFX8-NEXT:    s_min_i32 s1, s1, s3
206; GFX8-NEXT:    s_add_i32 s0, s0, s1
207; GFX8-NEXT:    s_sext_i32_i16 s0, s0
208; GFX8-NEXT:    s_ashr_i32 s0, s0, s2
209; GFX8-NEXT:    ; return to shader part epilog
210;
211; GFX9-LABEL: s_saddsat_i8:
212; GFX9:       ; %bb.0:
213; GFX9-NEXT:    s_bfe_u32 s2, 8, 0x100000
214; GFX9-NEXT:    s_lshl_b32 s1, s1, s2
215; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
216; GFX9-NEXT:    v_mov_b32_e32 v0, s1
217; GFX9-NEXT:    v_add_i16 v0, s0, v0 clamp
218; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 8, v0
219; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
220; GFX9-NEXT:    ; return to shader part epilog
221;
222; GFX10-LABEL: s_saddsat_i8:
223; GFX10:       ; %bb.0:
224; GFX10-NEXT:    s_bfe_u32 s2, 8, 0x100000
225; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
226; GFX10-NEXT:    s_lshl_b32 s1, s1, s2
227; GFX10-NEXT:    v_add_nc_i16 v0, s0, s1 clamp
228; GFX10-NEXT:    v_ashrrev_i16 v0, 8, v0
229; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
230; GFX10-NEXT:    ; return to shader part epilog
231  %result = call i8 @llvm.sadd.sat.i8(i8 %lhs, i8 %rhs)
232  ret i8 %result
233}
234
; Signed saturating add on <2 x i8> via @llvm.sadd.sat.v2i8. The vector
; operands are passed and returned as i16 and bitcast to/from <2 x i8> inside
; the body. The GFX6 and GFX8 checks expect each byte to be handled with its
; own min/max/sub/add sequence before the results are masked with 0xff and
; OR-ed together. The GFX9 and GFX10 checks expect one packed
; v_pk_add_i16 with the clamp modifier.
235define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
236; GFX6-LABEL: v_saddsat_v2i8:
237; GFX6:       ; %bb.0:
238; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
239; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
240; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
241; GFX6-NEXT:    s_brev_b32 s5, 1
242; GFX6-NEXT:    v_min_i32_e32 v5, 0, v0
243; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
244; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
245; GFX6-NEXT:    s_brev_b32 s4, -2
246; GFX6-NEXT:    v_max_i32_e32 v4, 0, v0
247; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s5, v5
248; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s4, v4
249; GFX6-NEXT:    v_max_i32_e32 v1, v5, v1
250; GFX6-NEXT:    v_min_i32_e32 v1, v1, v4
251; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
252; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
253; GFX6-NEXT:    v_min_i32_e32 v4, 0, v1
254; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
255; GFX6-NEXT:    v_max_i32_e32 v3, 0, v1
256; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s5, v4
257; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s4, v3
258; GFX6-NEXT:    v_max_i32_e32 v2, v4, v2
259; GFX6-NEXT:    v_min_i32_e32 v2, v2, v3
260; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
261; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 24, v1
262; GFX6-NEXT:    v_mov_b32_e32 v2, 0xff
263; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
264; GFX6-NEXT:    v_and_b32_e32 v1, v1, v2
265; GFX6-NEXT:    v_and_b32_e32 v0, v0, v2
266; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
267; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
268; GFX6-NEXT:    s_setpc_b64 s[30:31]
269;
270; GFX8-LABEL: v_saddsat_v2i8:
271; GFX8:       ; %bb.0:
272; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
273; GFX8-NEXT:    v_mov_b32_e32 v2, 8
274; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
275; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
276; GFX8-NEXT:    s_movk_i32 s5, 0x8000
277; GFX8-NEXT:    v_min_i16_e32 v5, 0, v0
278; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
279; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
280; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
281; GFX8-NEXT:    v_max_i16_e32 v4, 0, v0
282; GFX8-NEXT:    v_sub_u16_e32 v5, s5, v5
283; GFX8-NEXT:    v_sub_u16_e32 v4, s4, v4
284; GFX8-NEXT:    v_max_i16_e32 v1, v5, v1
285; GFX8-NEXT:    v_min_i16_e32 v1, v1, v4
286; GFX8-NEXT:    v_min_i16_e32 v4, 0, v3
287; GFX8-NEXT:    v_add_u16_e32 v0, v0, v1
288; GFX8-NEXT:    v_max_i16_e32 v1, 0, v3
289; GFX8-NEXT:    v_sub_u16_e32 v4, s5, v4
290; GFX8-NEXT:    v_sub_u16_e32 v1, s4, v1
291; GFX8-NEXT:    v_max_i16_e32 v2, v4, v2
292; GFX8-NEXT:    v_min_i16_e32 v1, v2, v1
293; GFX8-NEXT:    v_add_u16_e32 v1, v3, v1
294; GFX8-NEXT:    v_mov_b32_e32 v2, 0xff
295; GFX8-NEXT:    v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
296; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
297; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
298; GFX8-NEXT:    s_setpc_b64 s[30:31]
299;
300; GFX9-LABEL: v_saddsat_v2i8:
301; GFX9:       ; %bb.0:
302; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
303; GFX9-NEXT:    s_mov_b32 s4, 8
304; GFX9-NEXT:    v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
305; GFX9-NEXT:    v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
306; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
307; GFX9-NEXT:    v_and_or_b32 v0, v0, v4, v2
308; GFX9-NEXT:    v_and_or_b32 v1, v1, v4, v3
309; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
310; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
311; GFX9-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
312; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
313; GFX9-NEXT:    s_movk_i32 s4, 0xff
314; GFX9-NEXT:    v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
315; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
316; GFX9-NEXT:    s_setpc_b64 s[30:31]
317;
318; GFX10-LABEL: v_saddsat_v2i8:
319; GFX10:       ; %bb.0:
320; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
321; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
322; GFX10-NEXT:    s_mov_b32 s4, 8
323; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff
324; GFX10-NEXT:    v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
325; GFX10-NEXT:    v_lshrrev_b32_sdwa v4, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
326; GFX10-NEXT:    s_movk_i32 s4, 0xff
327; GFX10-NEXT:    v_and_or_b32 v0, v0, v2, v3
328; GFX10-NEXT:    v_and_or_b32 v1, v1, v2, v4
329; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
330; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
331; GFX10-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
332; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
333; GFX10-NEXT:    v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
334; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
335; GFX10-NEXT:    s_setpc_b64 s[30:31]
336  %lhs = bitcast i16 %lhs.arg to <2 x i8>
337  %rhs = bitcast i16 %rhs.arg to <2 x i8>
338  %result = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs)
339  %cast.result = bitcast <2 x i8> %result to i16
340  ret i16 %cast.result
341}
342
; Signed saturating add on <2 x i8> via @llvm.sadd.sat.v2i8, declared
; amdgpu_ps with inreg i16 arguments that are bitcast to/from <2 x i8> inside
; the body. The GFX6 and GFX8 checks expect an all-scalar (s_*) sequence that
; handles each byte separately. The GFX9 and GFX10 checks expect scalar
; packing (s_pack_ll_b32_b16), one v_pk_add_i16 with the clamp modifier, and
; a v_readfirstlane_b32 that copies the result into s0.
343define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
344; GFX6-LABEL: s_saddsat_v2i8:
345; GFX6:       ; %bb.0:
346; GFX6-NEXT:    s_lshr_b32 s2, s0, 8
347; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
348; GFX6-NEXT:    s_brev_b32 s5, 1
349; GFX6-NEXT:    s_min_i32 s7, s0, 0
350; GFX6-NEXT:    s_lshr_b32 s3, s1, 8
351; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
352; GFX6-NEXT:    s_brev_b32 s4, -2
353; GFX6-NEXT:    s_max_i32 s6, s0, 0
354; GFX6-NEXT:    s_sub_i32 s7, s5, s7
355; GFX6-NEXT:    s_sub_i32 s6, s4, s6
356; GFX6-NEXT:    s_max_i32 s1, s7, s1
357; GFX6-NEXT:    s_min_i32 s1, s1, s6
358; GFX6-NEXT:    s_add_i32 s0, s0, s1
359; GFX6-NEXT:    s_lshl_b32 s1, s2, 24
360; GFX6-NEXT:    s_lshl_b32 s2, s3, 24
361; GFX6-NEXT:    s_max_i32 s3, s1, 0
362; GFX6-NEXT:    s_sub_i32 s3, s4, s3
363; GFX6-NEXT:    s_min_i32 s4, s1, 0
364; GFX6-NEXT:    s_sub_i32 s4, s5, s4
365; GFX6-NEXT:    s_max_i32 s2, s4, s2
366; GFX6-NEXT:    s_min_i32 s2, s2, s3
367; GFX6-NEXT:    s_add_i32 s1, s1, s2
368; GFX6-NEXT:    s_ashr_i32 s1, s1, 24
369; GFX6-NEXT:    s_movk_i32 s2, 0xff
370; GFX6-NEXT:    s_ashr_i32 s0, s0, 24
371; GFX6-NEXT:    s_and_b32 s1, s1, s2
372; GFX6-NEXT:    s_and_b32 s0, s0, s2
373; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
374; GFX6-NEXT:    s_or_b32 s0, s0, s1
375; GFX6-NEXT:    ; return to shader part epilog
376;
377; GFX8-LABEL: s_saddsat_v2i8:
378; GFX8:       ; %bb.0:
379; GFX8-NEXT:    s_bfe_u32 s4, 8, 0x100000
380; GFX8-NEXT:    s_lshr_b32 s2, s0, 8
381; GFX8-NEXT:    s_lshl_b32 s0, s0, s4
382; GFX8-NEXT:    s_sext_i32_i16 s7, s0
383; GFX8-NEXT:    s_sext_i32_i16 s8, 0
384; GFX8-NEXT:    s_movk_i32 s6, 0x8000
385; GFX8-NEXT:    s_max_i32 s9, s7, s8
386; GFX8-NEXT:    s_min_i32 s7, s7, s8
387; GFX8-NEXT:    s_lshr_b32 s3, s1, 8
388; GFX8-NEXT:    s_lshl_b32 s1, s1, s4
389; GFX8-NEXT:    s_sub_i32 s7, s6, s7
390; GFX8-NEXT:    s_movk_i32 s5, 0x7fff
391; GFX8-NEXT:    s_sext_i32_i16 s7, s7
392; GFX8-NEXT:    s_sext_i32_i16 s1, s1
393; GFX8-NEXT:    s_sub_i32 s9, s5, s9
394; GFX8-NEXT:    s_max_i32 s1, s7, s1
395; GFX8-NEXT:    s_sext_i32_i16 s1, s1
396; GFX8-NEXT:    s_sext_i32_i16 s7, s9
397; GFX8-NEXT:    s_min_i32 s1, s1, s7
398; GFX8-NEXT:    s_add_i32 s0, s0, s1
399; GFX8-NEXT:    s_lshl_b32 s1, s2, s4
400; GFX8-NEXT:    s_lshl_b32 s2, s3, s4
401; GFX8-NEXT:    s_sext_i32_i16 s3, s1
402; GFX8-NEXT:    s_max_i32 s7, s3, s8
403; GFX8-NEXT:    s_min_i32 s3, s3, s8
404; GFX8-NEXT:    s_sub_i32 s3, s6, s3
405; GFX8-NEXT:    s_sext_i32_i16 s3, s3
406; GFX8-NEXT:    s_sext_i32_i16 s2, s2
407; GFX8-NEXT:    s_sub_i32 s5, s5, s7
408; GFX8-NEXT:    s_max_i32 s2, s3, s2
409; GFX8-NEXT:    s_sext_i32_i16 s2, s2
410; GFX8-NEXT:    s_sext_i32_i16 s3, s5
411; GFX8-NEXT:    s_min_i32 s2, s2, s3
412; GFX8-NEXT:    s_add_i32 s1, s1, s2
413; GFX8-NEXT:    s_sext_i32_i16 s1, s1
414; GFX8-NEXT:    s_sext_i32_i16 s0, s0
415; GFX8-NEXT:    s_ashr_i32 s1, s1, s4
416; GFX8-NEXT:    s_movk_i32 s2, 0xff
417; GFX8-NEXT:    s_ashr_i32 s0, s0, s4
418; GFX8-NEXT:    s_and_b32 s1, s1, s2
419; GFX8-NEXT:    s_and_b32 s0, s0, s2
420; GFX8-NEXT:    s_lshl_b32 s1, s1, s4
421; GFX8-NEXT:    s_or_b32 s0, s0, s1
422; GFX8-NEXT:    ; return to shader part epilog
423;
424; GFX9-LABEL: s_saddsat_v2i8:
425; GFX9:       ; %bb.0:
426; GFX9-NEXT:    s_lshr_b32 s2, s0, 8
427; GFX9-NEXT:    s_lshr_b32 s3, s1, 8
428; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
429; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
430; GFX9-NEXT:    s_mov_b32 s2, 0x80008
431; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
432; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
433; GFX9-NEXT:    s_lshl_b32 s3, s3, 8
434; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
435; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
436; GFX9-NEXT:    s_lshl_b32 s1, s1, s2
437; GFX9-NEXT:    s_lshl_b32 s2, s3, 8
438; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
439; GFX9-NEXT:    v_mov_b32_e32 v0, s1
440; GFX9-NEXT:    v_pk_add_i16 v0, s0, v0 clamp
441; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
442; GFX9-NEXT:    s_movk_i32 s0, 0xff
443; GFX9-NEXT:    v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
444; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
445; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
446; GFX9-NEXT:    ; return to shader part epilog
447;
448; GFX10-LABEL: s_saddsat_v2i8:
449; GFX10:       ; %bb.0:
450; GFX10-NEXT:    s_lshr_b32 s2, s0, 8
451; GFX10-NEXT:    s_lshr_b32 s3, s1, 8
452; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
453; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
454; GFX10-NEXT:    s_mov_b32 s2, 0x80008
455; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
456; GFX10-NEXT:    s_lshr_b32 s4, s1, 16
457; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
458; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
459; GFX10-NEXT:    s_lshl_b32 s1, s1, s2
460; GFX10-NEXT:    s_lshl_b32 s2, s4, 8
461; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
462; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
463; GFX10-NEXT:    v_pk_add_i16 v0, s0, s1 clamp
464; GFX10-NEXT:    s_movk_i32 s0, 0xff
465; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
466; GFX10-NEXT:    v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
467; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
468; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
469; GFX10-NEXT:    ; return to shader part epilog
470  %lhs = bitcast i16 %lhs.arg to <2 x i8>
471  %rhs = bitcast i16 %rhs.arg to <2 x i8>
472  %result = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs)
473  %cast.result = bitcast <2 x i8> %result to i16
474  ret i16 %cast.result
475}
476
; Signed saturating add on <4 x i8> via @llvm.sadd.sat.v4i8. The vector
; operands are passed and returned as i32 and bitcast to/from <4 x i8> inside
; the body. The GFX6 and GFX8 checks expect four separate per-byte
; min/max/sub/add sequences whose results are masked with 0xff and OR-ed
; back into one dword. The GFX9 and GFX10 checks expect two packed
; v_pk_add_i16 instructions with the clamp modifier.
477define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
478; GFX6-LABEL: v_saddsat_v4i8:
479; GFX6:       ; %bb.0:
480; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
481; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
482; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
483; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
484; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
485; GFX6-NEXT:    s_brev_b32 s5, 1
486; GFX6-NEXT:    v_min_i32_e32 v10, 0, v0
487; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
488; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
489; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
490; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
491; GFX6-NEXT:    s_brev_b32 s4, -2
492; GFX6-NEXT:    v_max_i32_e32 v8, 0, v0
493; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, s5, v10
494; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s4, v8
495; GFX6-NEXT:    v_max_i32_e32 v1, v10, v1
496; GFX6-NEXT:    v_min_i32_e32 v1, v1, v8
497; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
498; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
499; GFX6-NEXT:    v_min_i32_e32 v8, 0, v1
500; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v5
501; GFX6-NEXT:    v_max_i32_e32 v5, 0, v1
502; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s5, v8
503; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s4, v5
504; GFX6-NEXT:    v_max_i32_e32 v2, v8, v2
505; GFX6-NEXT:    v_min_i32_e32 v2, v2, v5
506; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
507; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
508; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
509; GFX6-NEXT:    v_min_i32_e32 v6, 0, v2
510; GFX6-NEXT:    v_bfrev_b32_e32 v9, -2
511; GFX6-NEXT:    v_max_i32_e32 v5, 0, v2
512; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s5, v6
513; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v9, v5
514; GFX6-NEXT:    v_max_i32_e32 v3, v6, v3
515; GFX6-NEXT:    v_min_i32_e32 v3, v3, v5
516; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
517; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
518; GFX6-NEXT:    v_bfrev_b32_e32 v11, 1
519; GFX6-NEXT:    v_min_i32_e32 v6, 0, v3
520; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 24, v1
521; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
522; GFX6-NEXT:    v_max_i32_e32 v5, 0, v3
523; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v11, v6
524; GFX6-NEXT:    s_movk_i32 s4, 0xff
525; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
526; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v9, v5
527; GFX6-NEXT:    v_max_i32_e32 v4, v6, v4
528; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
529; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
530; GFX6-NEXT:    v_min_i32_e32 v4, v4, v5
531; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
532; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
533; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
534; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
535; GFX6-NEXT:    v_and_b32_e32 v1, s4, v2
536; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 24, v3
537; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
538; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
539; GFX6-NEXT:    v_and_b32_e32 v1, s4, v3
540; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
541; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
542; GFX6-NEXT:    s_setpc_b64 s[30:31]
543;
544; GFX8-LABEL: v_saddsat_v4i8:
545; GFX8:       ; %bb.0:
546; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
547; GFX8-NEXT:    v_mov_b32_e32 v2, 8
548; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
549; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
550; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
551; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
552; GFX8-NEXT:    s_movk_i32 s5, 0x8000
553; GFX8-NEXT:    v_min_i16_e32 v10, 0, v0
554; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
555; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
556; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
557; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
558; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
559; GFX8-NEXT:    v_max_i16_e32 v8, 0, v0
560; GFX8-NEXT:    v_sub_u16_e32 v10, s5, v10
561; GFX8-NEXT:    v_sub_u16_e32 v8, s4, v8
562; GFX8-NEXT:    v_max_i16_e32 v1, v10, v1
563; GFX8-NEXT:    v_min_i16_e32 v1, v1, v8
564; GFX8-NEXT:    v_min_i16_e32 v8, 0, v3
565; GFX8-NEXT:    v_add_u16_e32 v0, v0, v1
566; GFX8-NEXT:    v_max_i16_e32 v1, 0, v3
567; GFX8-NEXT:    v_sub_u16_e32 v8, s5, v8
568; GFX8-NEXT:    v_sub_u16_e32 v1, s4, v1
569; GFX8-NEXT:    v_max_i16_e32 v2, v8, v2
570; GFX8-NEXT:    v_min_i16_e32 v1, v2, v1
571; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 8, v4
572; GFX8-NEXT:    v_add_u16_e32 v1, v3, v1
573; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 8, v6
574; GFX8-NEXT:    v_min_i16_e32 v6, 0, v2
575; GFX8-NEXT:    v_mov_b32_e32 v9, 0x7fff
576; GFX8-NEXT:    v_max_i16_e32 v4, 0, v2
577; GFX8-NEXT:    v_sub_u16_e32 v6, s5, v6
578; GFX8-NEXT:    v_sub_u16_e32 v4, v9, v4
579; GFX8-NEXT:    v_max_i16_e32 v3, v6, v3
580; GFX8-NEXT:    v_min_i16_e32 v3, v3, v4
581; GFX8-NEXT:    v_add_u16_e32 v2, v2, v3
582; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 8, v5
583; GFX8-NEXT:    v_min_i16_e32 v6, 0, v3
584; GFX8-NEXT:    v_lshlrev_b16_e32 v4, 8, v7
585; GFX8-NEXT:    v_max_i16_e32 v5, 0, v3
586; GFX8-NEXT:    v_sub_u16_e32 v6, 0x8000, v6
587; GFX8-NEXT:    v_sub_u16_e32 v5, v9, v5
588; GFX8-NEXT:    v_max_i16_e32 v4, v6, v4
589; GFX8-NEXT:    v_min_i16_e32 v4, v4, v5
590; GFX8-NEXT:    v_add_u16_e32 v3, v3, v4
591; GFX8-NEXT:    v_mov_b32_e32 v4, 0xff
592; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v1), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
593; GFX8-NEXT:    v_and_b32_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
594; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
595; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
596; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v2), v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
597; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
598; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v3), v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
599; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
600; GFX8-NEXT:    s_setpc_b64 s[30:31]
601;
602; GFX9-LABEL: v_saddsat_v4i8:
603; GFX9:       ; %bb.0:
604; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
605; GFX9-NEXT:    s_mov_b32 s4, 8
606; GFX9-NEXT:    v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
607; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
608; GFX9-NEXT:    v_mov_b32_e32 v8, 0xffff
609; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
610; GFX9-NEXT:    v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
611; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
612; GFX9-NEXT:    v_and_or_b32 v0, v0, v8, v2
613; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
614; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
615; GFX9-NEXT:    v_and_or_b32 v2, v3, v8, v2
616; GFX9-NEXT:    v_and_or_b32 v1, v1, v8, v5
617; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
618; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
619; GFX9-NEXT:    v_and_or_b32 v3, v6, v8, v3
620; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
621; GFX9-NEXT:    v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
622; GFX9-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
623; GFX9-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
624; GFX9-NEXT:    v_pk_add_i16 v1, v2, v3 clamp
625; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
626; GFX9-NEXT:    v_mov_b32_e32 v2, 8
627; GFX9-NEXT:    v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
628; GFX9-NEXT:    s_movk_i32 s4, 0xff
629; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
630; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v2
631; GFX9-NEXT:    v_and_b32_e32 v2, s4, v1
632; GFX9-NEXT:    v_mov_b32_e32 v3, 24
633; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
634; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
635; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
636; GFX9-NEXT:    s_setpc_b64 s[30:31]
637;
638; GFX10-LABEL: v_saddsat_v4i8:
639; GFX10:       ; %bb.0:
640; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
641; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
642; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
643; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
644; GFX10-NEXT:    s_mov_b32 s4, 8
645; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
646; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
647; GFX10-NEXT:    v_lshrrev_b32_sdwa v6, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
648; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
649; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
650; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
651; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
652; GFX10-NEXT:    s_movk_i32 s4, 0xff
653; GFX10-NEXT:    v_and_or_b32 v0, v0, v7, v2
654; GFX10-NEXT:    v_and_or_b32 v1, v1, v7, v6
655; GFX10-NEXT:    v_and_or_b32 v2, v3, v7, v4
656; GFX10-NEXT:    v_and_or_b32 v3, v8, v7, v5
657; GFX10-NEXT:    v_mov_b32_e32 v4, 24
658; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
659; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
660; GFX10-NEXT:    v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
661; GFX10-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
662; GFX10-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
663; GFX10-NEXT:    v_pk_add_i16 v1, v2, v3 clamp
664; GFX10-NEXT:    v_mov_b32_e32 v2, 8
665; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
666; GFX10-NEXT:    v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
667; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
668; GFX10-NEXT:    v_and_b32_e32 v3, s4, v1
669; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
670; GFX10-NEXT:    v_and_or_b32 v0, v0, s4, v2
671; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
672; GFX10-NEXT:    v_or3_b32 v0, v0, v2, v1
673; GFX10-NEXT:    s_setpc_b64 s[30:31]
674  %lhs = bitcast i32 %lhs.arg to <4 x i8>
675  %rhs = bitcast i32 %rhs.arg to <4 x i8>
676  %result = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs)
677  %cast.result = bitcast <4 x i8> %result to i32
678  ret i32 %cast.result
679}
680
; Scalar <4 x i8> signed saturating add in an amdgpu_ps function. Both operands
; arrive as inreg i32 values, are bitcast to <4 x i8>, added with
; llvm.sadd.sat.v4i8, and the result is bitcast back to i32 for the return.
; The GFX6 and GFX8 checks process the four bytes one at a time with scalar
; (s_*) instructions and then reassemble the bytes into s0. The GFX9 and GFX10
; checks pack the bytes into 16-bit lanes, use two "v_pk_add_i16 ... clamp"
; instructions, and copy the repacked result to s0 with v_readfirstlane_b32.
681define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
682; GFX6-LABEL: s_saddsat_v4i8:
683; GFX6:       ; %bb.0:
684; GFX6-NEXT:    s_lshr_b32 s2, s0, 8
685; GFX6-NEXT:    s_lshr_b32 s3, s0, 16
686; GFX6-NEXT:    s_lshr_b32 s4, s0, 24
687; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
688; GFX6-NEXT:    s_brev_b32 s9, 1
689; GFX6-NEXT:    s_min_i32 s11, s0, 0
690; GFX6-NEXT:    s_lshr_b32 s5, s1, 8
691; GFX6-NEXT:    s_lshr_b32 s6, s1, 16
692; GFX6-NEXT:    s_lshr_b32 s7, s1, 24
693; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
694; GFX6-NEXT:    s_brev_b32 s8, -2
695; GFX6-NEXT:    s_max_i32 s10, s0, 0
696; GFX6-NEXT:    s_sub_i32 s11, s9, s11
697; GFX6-NEXT:    s_sub_i32 s10, s8, s10
698; GFX6-NEXT:    s_max_i32 s1, s11, s1
699; GFX6-NEXT:    s_min_i32 s1, s1, s10
700; GFX6-NEXT:    s_add_i32 s0, s0, s1
701; GFX6-NEXT:    s_lshl_b32 s1, s2, 24
702; GFX6-NEXT:    s_min_i32 s10, s1, 0
703; GFX6-NEXT:    s_lshl_b32 s2, s5, 24
704; GFX6-NEXT:    s_max_i32 s5, s1, 0
705; GFX6-NEXT:    s_sub_i32 s10, s9, s10
706; GFX6-NEXT:    s_sub_i32 s5, s8, s5
707; GFX6-NEXT:    s_max_i32 s2, s10, s2
708; GFX6-NEXT:    s_min_i32 s2, s2, s5
709; GFX6-NEXT:    s_add_i32 s1, s1, s2
710; GFX6-NEXT:    s_lshl_b32 s2, s3, 24
711; GFX6-NEXT:    s_lshl_b32 s3, s6, 24
712; GFX6-NEXT:    s_min_i32 s6, s2, 0
713; GFX6-NEXT:    s_max_i32 s5, s2, 0
714; GFX6-NEXT:    s_sub_i32 s6, s9, s6
715; GFX6-NEXT:    s_sub_i32 s5, s8, s5
716; GFX6-NEXT:    s_max_i32 s3, s6, s3
717; GFX6-NEXT:    s_min_i32 s3, s3, s5
718; GFX6-NEXT:    s_add_i32 s2, s2, s3
719; GFX6-NEXT:    s_lshl_b32 s3, s4, 24
720; GFX6-NEXT:    s_min_i32 s6, s3, 0
721; GFX6-NEXT:    s_lshl_b32 s4, s7, 24
722; GFX6-NEXT:    s_max_i32 s5, s3, 0
723; GFX6-NEXT:    s_sub_i32 s6, s9, s6
724; GFX6-NEXT:    s_sub_i32 s5, s8, s5
725; GFX6-NEXT:    s_max_i32 s4, s6, s4
726; GFX6-NEXT:    s_min_i32 s4, s4, s5
727; GFX6-NEXT:    s_ashr_i32 s1, s1, 24
728; GFX6-NEXT:    s_add_i32 s3, s3, s4
729; GFX6-NEXT:    s_movk_i32 s4, 0xff
730; GFX6-NEXT:    s_ashr_i32 s0, s0, 24
731; GFX6-NEXT:    s_and_b32 s1, s1, s4
732; GFX6-NEXT:    s_ashr_i32 s2, s2, 24
733; GFX6-NEXT:    s_and_b32 s0, s0, s4
734; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
735; GFX6-NEXT:    s_or_b32 s0, s0, s1
736; GFX6-NEXT:    s_and_b32 s1, s2, s4
737; GFX6-NEXT:    s_ashr_i32 s3, s3, 24
738; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
739; GFX6-NEXT:    s_or_b32 s0, s0, s1
740; GFX6-NEXT:    s_and_b32 s1, s3, s4
741; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
742; GFX6-NEXT:    s_or_b32 s0, s0, s1
743; GFX6-NEXT:    ; return to shader part epilog
744;
745; GFX8-LABEL: s_saddsat_v4i8:
746; GFX8:       ; %bb.0:
747; GFX8-NEXT:    s_bfe_u32 s8, 8, 0x100000
748; GFX8-NEXT:    s_lshr_b32 s2, s0, 8
749; GFX8-NEXT:    s_lshr_b32 s3, s0, 16
750; GFX8-NEXT:    s_lshr_b32 s4, s0, 24
751; GFX8-NEXT:    s_lshl_b32 s0, s0, s8
752; GFX8-NEXT:    s_sext_i32_i16 s11, s0
753; GFX8-NEXT:    s_sext_i32_i16 s12, 0
754; GFX8-NEXT:    s_movk_i32 s10, 0x8000
755; GFX8-NEXT:    s_max_i32 s13, s11, s12
756; GFX8-NEXT:    s_min_i32 s11, s11, s12
757; GFX8-NEXT:    s_lshr_b32 s5, s1, 8
758; GFX8-NEXT:    s_lshr_b32 s6, s1, 16
759; GFX8-NEXT:    s_lshr_b32 s7, s1, 24
760; GFX8-NEXT:    s_lshl_b32 s1, s1, s8
761; GFX8-NEXT:    s_sub_i32 s11, s10, s11
762; GFX8-NEXT:    s_movk_i32 s9, 0x7fff
763; GFX8-NEXT:    s_sext_i32_i16 s11, s11
764; GFX8-NEXT:    s_sext_i32_i16 s1, s1
765; GFX8-NEXT:    s_sub_i32 s13, s9, s13
766; GFX8-NEXT:    s_max_i32 s1, s11, s1
767; GFX8-NEXT:    s_sext_i32_i16 s1, s1
768; GFX8-NEXT:    s_sext_i32_i16 s11, s13
769; GFX8-NEXT:    s_min_i32 s1, s1, s11
770; GFX8-NEXT:    s_add_i32 s0, s0, s1
771; GFX8-NEXT:    s_lshl_b32 s1, s2, s8
772; GFX8-NEXT:    s_lshl_b32 s2, s5, s8
773; GFX8-NEXT:    s_sext_i32_i16 s5, s1
774; GFX8-NEXT:    s_max_i32 s11, s5, s12
775; GFX8-NEXT:    s_min_i32 s5, s5, s12
776; GFX8-NEXT:    s_sub_i32 s5, s10, s5
777; GFX8-NEXT:    s_sext_i32_i16 s5, s5
778; GFX8-NEXT:    s_sext_i32_i16 s2, s2
779; GFX8-NEXT:    s_sub_i32 s11, s9, s11
780; GFX8-NEXT:    s_max_i32 s2, s5, s2
781; GFX8-NEXT:    s_sext_i32_i16 s2, s2
782; GFX8-NEXT:    s_sext_i32_i16 s5, s11
783; GFX8-NEXT:    s_min_i32 s2, s2, s5
784; GFX8-NEXT:    s_add_i32 s1, s1, s2
785; GFX8-NEXT:    s_lshl_b32 s2, s3, s8
786; GFX8-NEXT:    s_sext_i32_i16 s5, s2
787; GFX8-NEXT:    s_lshl_b32 s3, s6, s8
788; GFX8-NEXT:    s_max_i32 s6, s5, s12
789; GFX8-NEXT:    s_min_i32 s5, s5, s12
790; GFX8-NEXT:    s_sub_i32 s5, s10, s5
791; GFX8-NEXT:    s_sext_i32_i16 s5, s5
792; GFX8-NEXT:    s_sext_i32_i16 s3, s3
793; GFX8-NEXT:    s_sub_i32 s6, s9, s6
794; GFX8-NEXT:    s_max_i32 s3, s5, s3
795; GFX8-NEXT:    s_sext_i32_i16 s3, s3
796; GFX8-NEXT:    s_sext_i32_i16 s5, s6
797; GFX8-NEXT:    s_min_i32 s3, s3, s5
798; GFX8-NEXT:    s_add_i32 s2, s2, s3
799; GFX8-NEXT:    s_lshl_b32 s3, s4, s8
800; GFX8-NEXT:    s_sext_i32_i16 s5, s3
801; GFX8-NEXT:    s_max_i32 s6, s5, s12
802; GFX8-NEXT:    s_min_i32 s5, s5, s12
803; GFX8-NEXT:    s_lshl_b32 s4, s7, s8
804; GFX8-NEXT:    s_sub_i32 s5, s10, s5
805; GFX8-NEXT:    s_sext_i32_i16 s5, s5
806; GFX8-NEXT:    s_sext_i32_i16 s4, s4
807; GFX8-NEXT:    s_sub_i32 s6, s9, s6
808; GFX8-NEXT:    s_max_i32 s4, s5, s4
809; GFX8-NEXT:    s_sext_i32_i16 s4, s4
810; GFX8-NEXT:    s_sext_i32_i16 s5, s6
811; GFX8-NEXT:    s_sext_i32_i16 s1, s1
812; GFX8-NEXT:    s_min_i32 s4, s4, s5
813; GFX8-NEXT:    s_sext_i32_i16 s0, s0
814; GFX8-NEXT:    s_ashr_i32 s1, s1, s8
815; GFX8-NEXT:    s_add_i32 s3, s3, s4
816; GFX8-NEXT:    s_movk_i32 s4, 0xff
817; GFX8-NEXT:    s_ashr_i32 s0, s0, s8
818; GFX8-NEXT:    s_sext_i32_i16 s2, s2
819; GFX8-NEXT:    s_and_b32 s1, s1, s4
820; GFX8-NEXT:    s_ashr_i32 s2, s2, s8
821; GFX8-NEXT:    s_and_b32 s0, s0, s4
822; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
823; GFX8-NEXT:    s_sext_i32_i16 s3, s3
824; GFX8-NEXT:    s_or_b32 s0, s0, s1
825; GFX8-NEXT:    s_and_b32 s1, s2, s4
826; GFX8-NEXT:    s_ashr_i32 s3, s3, s8
827; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
828; GFX8-NEXT:    s_or_b32 s0, s0, s1
829; GFX8-NEXT:    s_and_b32 s1, s3, s4
830; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
831; GFX8-NEXT:    s_or_b32 s0, s0, s1
832; GFX8-NEXT:    ; return to shader part epilog
833;
834; GFX9-LABEL: s_saddsat_v4i8:
835; GFX9:       ; %bb.0:
836; GFX9-NEXT:    s_lshr_b32 s3, s0, 8
837; GFX9-NEXT:    s_lshr_b32 s4, s0, 16
838; GFX9-NEXT:    s_lshr_b32 s6, s0, 24
839; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
840; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s4, s6
841; GFX9-NEXT:    s_mov_b32 s4, 0x80008
842; GFX9-NEXT:    s_lshr_b32 s6, s0, 16
843; GFX9-NEXT:    s_lshr_b32 s7, s1, 8
844; GFX9-NEXT:    s_lshl_b32 s0, s0, s4
845; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
846; GFX9-NEXT:    s_lshr_b32 s8, s1, 16
847; GFX9-NEXT:    s_lshr_b32 s9, s1, 24
848; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s6
849; GFX9-NEXT:    s_lshr_b32 s6, s3, 16
850; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s7
851; GFX9-NEXT:    s_lshl_b32 s3, s3, s4
852; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
853; GFX9-NEXT:    s_lshr_b32 s7, s1, 16
854; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s6
855; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s8, s9
856; GFX9-NEXT:    s_lshl_b32 s1, s1, s4
857; GFX9-NEXT:    s_lshl_b32 s7, s7, 8
858; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s7
859; GFX9-NEXT:    s_lshr_b32 s7, s6, 16
860; GFX9-NEXT:    s_lshl_b32 s4, s6, s4
861; GFX9-NEXT:    s_lshl_b32 s6, s7, 8
862; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s6
863; GFX9-NEXT:    v_mov_b32_e32 v0, s1
864; GFX9-NEXT:    v_pk_add_i16 v0, s0, v0 clamp
865; GFX9-NEXT:    v_mov_b32_e32 v1, s4
866; GFX9-NEXT:    s_mov_b32 s2, 8
867; GFX9-NEXT:    v_pk_add_i16 v1, s3, v1 clamp
868; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
869; GFX9-NEXT:    v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
870; GFX9-NEXT:    s_movk_i32 s0, 0xff
871; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
872; GFX9-NEXT:    s_mov_b32 s5, 24
873; GFX9-NEXT:    v_and_or_b32 v0, v0, s0, v2
874; GFX9-NEXT:    v_and_b32_e32 v2, s0, v1
875; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
876; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
877; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
878; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
879; GFX9-NEXT:    ; return to shader part epilog
880;
881; GFX10-LABEL: s_saddsat_v4i8:
882; GFX10:       ; %bb.0:
883; GFX10-NEXT:    s_lshr_b32 s2, s0, 8
884; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
885; GFX10-NEXT:    s_lshr_b32 s4, s0, 24
886; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
887; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s3, s4
888; GFX10-NEXT:    s_mov_b32 s3, 0x80008
889; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
890; GFX10-NEXT:    s_lshr_b32 s5, s1, 8
891; GFX10-NEXT:    s_lshr_b32 s6, s1, 16
892; GFX10-NEXT:    s_lshr_b32 s7, s1, 24
893; GFX10-NEXT:    s_lshl_b32 s0, s0, s3
894; GFX10-NEXT:    s_lshl_b32 s4, s4, 8
895; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
896; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
897; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s6, s7
898; GFX10-NEXT:    s_lshr_b32 s8, s2, 16
899; GFX10-NEXT:    s_lshr_b32 s5, s1, 16
900; GFX10-NEXT:    s_lshr_b32 s6, s4, 16
901; GFX10-NEXT:    s_lshl_b32 s2, s2, s3
902; GFX10-NEXT:    s_lshl_b32 s8, s8, 8
903; GFX10-NEXT:    s_lshl_b32 s1, s1, s3
904; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
905; GFX10-NEXT:    s_lshl_b32 s3, s4, s3
906; GFX10-NEXT:    s_lshl_b32 s4, s6, 8
907; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s8
908; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
909; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
910; GFX10-NEXT:    v_pk_add_i16 v0, s0, s1 clamp
911; GFX10-NEXT:    v_pk_add_i16 v1, s2, s3 clamp
912; GFX10-NEXT:    s_mov_b32 s0, 8
913; GFX10-NEXT:    s_movk_i32 s1, 0xff
914; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
915; GFX10-NEXT:    v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
916; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
917; GFX10-NEXT:    v_and_b32_e32 v3, s1, v1
918; GFX10-NEXT:    s_mov_b32 s0, 24
919; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
920; GFX10-NEXT:    v_and_or_b32 v0, v0, s1, v2
921; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
922; GFX10-NEXT:    v_or3_b32 v0, v0, v2, v1
923; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
924; GFX10-NEXT:    ; return to shader part epilog
925  %lhs = bitcast i32 %lhs.arg to <4 x i8>
926  %rhs = bitcast i32 %rhs.arg to <4 x i8>
927  %result = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs)
928  %cast.result = bitcast <4 x i8> %result to i32
929  ret i32 %cast.result
930}
931
; i24 signed saturating add (llvm.sadd.sat.i24) with plain, non-inreg
; arguments; the checked code operates on v0/v1 and returns via s_setpc_b64.
; The GFX6, GFX9 and GFX10 checks shift both operands left by 8, do a 32-bit
; saturating add, and shift the result back with an arithmetic right shift by
; 8. The GFX8 checks instead expect a plain add followed by 24-bit
; sign-extracts, two compares, and a v_cndmask_b32 select.
932define i24 @v_saddsat_i24(i24 %lhs, i24 %rhs) {
933; GFX6-LABEL: v_saddsat_i24:
934; GFX6:       ; %bb.0:
935; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
936; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
937; GFX6-NEXT:    v_min_i32_e32 v3, 0, v0
938; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
939; GFX6-NEXT:    v_max_i32_e32 v2, 0, v0
940; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0x80000000, v3
941; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 0x7fffffff, v2
942; GFX6-NEXT:    v_max_i32_e32 v1, v3, v1
943; GFX6-NEXT:    v_min_i32_e32 v1, v1, v2
944; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
945; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 8, v0
946; GFX6-NEXT:    s_setpc_b64 s[30:31]
947;
948; GFX8-LABEL: v_saddsat_i24:
949; GFX8:       ; %bb.0:
950; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
951; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v0, v1
952; GFX8-NEXT:    v_bfe_i32 v3, v2, 0, 24
953; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 24
954; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v0
955; GFX8-NEXT:    v_bfe_i32 v0, v1, 0, 24
956; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v0
957; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 23, v3
958; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xff800000, v0
959; GFX8-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
960; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
961; GFX8-NEXT:    s_setpc_b64 s[30:31]
962;
963; GFX9-LABEL: v_saddsat_i24:
964; GFX9:       ; %bb.0:
965; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
966; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
967; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
968; GFX9-NEXT:    v_add_i32 v0, v0, v1 clamp
969; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 8, v0
970; GFX9-NEXT:    s_setpc_b64 s[30:31]
971;
972; GFX10-LABEL: v_saddsat_i24:
973; GFX10:       ; %bb.0:
974; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
975; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
976; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
977; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
978; GFX10-NEXT:    v_add_nc_i32 v0, v0, v1 clamp
979; GFX10-NEXT:    v_ashrrev_i32_e32 v0, 8, v0
980; GFX10-NEXT:    s_setpc_b64 s[30:31]
981  %result = call i24 @llvm.sadd.sat.i24(i24 %lhs, i24 %rhs)
982  ret i24 %result
983}
984
; i24 signed saturating add (llvm.sadd.sat.i24) in an amdgpu_ps function with
; both arguments marked inreg; the checked code reads them from s0/s1 and
; leaves the result in s0.
; The GFX6 checks stay entirely in scalar (s_*) instructions using the
; shift-by-8 / clamp / add / shift-back sequence. The GFX8 checks expect an add
; followed by compares and s_cselect_b32. The GFX9 and GFX10 checks use a
; vector add with clamp and copy the result back with v_readfirstlane_b32.
985define amdgpu_ps i24 @s_saddsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
986; GFX6-LABEL: s_saddsat_i24:
987; GFX6:       ; %bb.0:
988; GFX6-NEXT:    s_lshl_b32 s0, s0, 8
989; GFX6-NEXT:    s_min_i32 s3, s0, 0
990; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
991; GFX6-NEXT:    s_max_i32 s2, s0, 0
992; GFX6-NEXT:    s_sub_i32 s3, 0x80000000, s3
993; GFX6-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
994; GFX6-NEXT:    s_max_i32 s1, s3, s1
995; GFX6-NEXT:    s_min_i32 s1, s1, s2
996; GFX6-NEXT:    s_add_i32 s0, s0, s1
997; GFX6-NEXT:    s_ashr_i32 s0, s0, 8
998; GFX6-NEXT:    ; return to shader part epilog
999;
1000; GFX8-LABEL: s_saddsat_i24:
1001; GFX8:       ; %bb.0:
1002; GFX8-NEXT:    s_add_i32 s2, s0, s1
1003; GFX8-NEXT:    s_bfe_i32 s3, s2, 0x180000
1004; GFX8-NEXT:    s_bfe_i32 s0, s0, 0x180000
1005; GFX8-NEXT:    s_cmp_lt_i32 s3, s0
1006; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
1007; GFX8-NEXT:    s_bfe_i32 s1, s1, 0x180000
1008; GFX8-NEXT:    s_cmp_lt_i32 s1, 0
1009; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
1010; GFX8-NEXT:    s_xor_b32 s0, s1, s0
1011; GFX8-NEXT:    s_ashr_i32 s1, s3, 23
1012; GFX8-NEXT:    s_add_i32 s1, s1, 0xff800000
1013; GFX8-NEXT:    s_and_b32 s0, s0, 1
1014; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
1015; GFX8-NEXT:    s_cselect_b32 s0, s1, s2
1016; GFX8-NEXT:    ; return to shader part epilog
1017;
1018; GFX9-LABEL: s_saddsat_i24:
1019; GFX9:       ; %bb.0:
1020; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
1021; GFX9-NEXT:    s_lshl_b32 s0, s0, 8
1022; GFX9-NEXT:    v_mov_b32_e32 v0, s1
1023; GFX9-NEXT:    v_add_i32 v0, s0, v0 clamp
1024; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 8, v0
1025; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1026; GFX9-NEXT:    ; return to shader part epilog
1027;
1028; GFX10-LABEL: s_saddsat_i24:
1029; GFX10:       ; %bb.0:
1030; GFX10-NEXT:    s_lshl_b32 s0, s0, 8
1031; GFX10-NEXT:    s_lshl_b32 s1, s1, 8
1032; GFX10-NEXT:    v_add_nc_i32 v0, s0, s1 clamp
1033; GFX10-NEXT:    v_ashrrev_i32_e32 v0, 8, v0
1034; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
1035; GFX10-NEXT:    ; return to shader part epilog
1036  %result = call i24 @llvm.sadd.sat.i24(i24 %lhs, i24 %rhs)
1037  ret i24 %result
1038}
1039
; i32 signed saturating add (llvm.sadd.sat.i32) with plain, non-inreg
; arguments; the checked code operates on v0/v1.
; The GFX6 and GFX8 checks clamp rhs into the range
; [0x80000000 - min(lhs, 0), 0x7fffffff - max(lhs, 0)] with max/min and then do
; an ordinary add. The GFX9 and GFX10 checks expect a single add carrying the
; clamp modifier (v_add_i32 / v_add_nc_i32).
1040define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) {
1041; GFX6-LABEL: v_saddsat_i32:
1042; GFX6:       ; %bb.0:
1043; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1044; GFX6-NEXT:    v_min_i32_e32 v3, 0, v0
1045; GFX6-NEXT:    v_max_i32_e32 v2, 0, v0
1046; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0x80000000, v3
1047; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 0x7fffffff, v2
1048; GFX6-NEXT:    v_max_i32_e32 v1, v3, v1
1049; GFX6-NEXT:    v_min_i32_e32 v1, v1, v2
1050; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
1051; GFX6-NEXT:    s_setpc_b64 s[30:31]
1052;
1053; GFX8-LABEL: v_saddsat_i32:
1054; GFX8:       ; %bb.0:
1055; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1056; GFX8-NEXT:    v_min_i32_e32 v3, 0, v0
1057; GFX8-NEXT:    v_max_i32_e32 v2, 0, v0
1058; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 0x80000000, v3
1059; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 0x7fffffff, v2
1060; GFX8-NEXT:    v_max_i32_e32 v1, v3, v1
1061; GFX8-NEXT:    v_min_i32_e32 v1, v1, v2
1062; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
1063; GFX8-NEXT:    s_setpc_b64 s[30:31]
1064;
1065; GFX9-LABEL: v_saddsat_i32:
1066; GFX9:       ; %bb.0:
1067; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1068; GFX9-NEXT:    v_add_i32 v0, v0, v1 clamp
1069; GFX9-NEXT:    s_setpc_b64 s[30:31]
1070;
1071; GFX10-LABEL: v_saddsat_i32:
1072; GFX10:       ; %bb.0:
1073; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1074; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1075; GFX10-NEXT:    v_add_nc_i32 v0, v0, v1 clamp
1076; GFX10-NEXT:    s_setpc_b64 s[30:31]
1077  %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs)
1078  ret i32 %result
1079}
1080
; i32 signed saturating add (llvm.sadd.sat.i32) in an amdgpu_ps function with
; both arguments marked inreg; the checked code reads them from s0/s1 and
; leaves the result in s0.
; Assertions are kept only for the GFX6, GFX8, GFX9 and GFX10 prefixes, which
; are the ones the RUN lines at the top of the file pass to FileCheck.
; The GFX6 and GFX8 checks clamp rhs into
; [0x80000000 - min(lhs, 0), 0x7fffffff - max(lhs, 0)] with s_max_i32/s_min_i32
; and then add. The GFX9 and GFX10 checks use a vector add with clamp and copy
; the result back with v_readfirstlane_b32.
1081define amdgpu_ps i32 @s_saddsat_i32(i32 inreg %lhs, i32 inreg %rhs) {
1096; GFX6-LABEL: s_saddsat_i32:
1097; GFX6:       ; %bb.0:
1098; GFX6-NEXT:    s_min_i32 s3, s0, 0
1099; GFX6-NEXT:    s_max_i32 s2, s0, 0
1100; GFX6-NEXT:    s_sub_i32 s3, 0x80000000, s3
1101; GFX6-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
1102; GFX6-NEXT:    s_max_i32 s1, s3, s1
1103; GFX6-NEXT:    s_min_i32 s1, s1, s2
1104; GFX6-NEXT:    s_add_i32 s0, s0, s1
1105; GFX6-NEXT:    ; return to shader part epilog
1106;
1107; GFX8-LABEL: s_saddsat_i32:
1108; GFX8:       ; %bb.0:
1109; GFX8-NEXT:    s_min_i32 s3, s0, 0
1110; GFX8-NEXT:    s_max_i32 s2, s0, 0
1111; GFX8-NEXT:    s_sub_i32 s3, 0x80000000, s3
1112; GFX8-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
1113; GFX8-NEXT:    s_max_i32 s1, s3, s1
1114; GFX8-NEXT:    s_min_i32 s1, s1, s2
1115; GFX8-NEXT:    s_add_i32 s0, s0, s1
1116; GFX8-NEXT:    ; return to shader part epilog
1117;
1118; GFX9-LABEL: s_saddsat_i32:
1119; GFX9:       ; %bb.0:
1120; GFX9-NEXT:    v_mov_b32_e32 v0, s1
1121; GFX9-NEXT:    v_add_i32 v0, s0, v0 clamp
1122; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1123; GFX9-NEXT:    ; return to shader part epilog
1124;
1125; GFX10-LABEL: s_saddsat_i32:
1126; GFX10:       ; %bb.0:
1127; GFX10-NEXT:    v_add_nc_i32 v0, s0, s1 clamp
1128; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
1129; GFX10-NEXT:    ; return to shader part epilog
1130  %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs)
1131  ret i32 %result
1132}
1133
; Mixed-operand i32 signed saturating add in an amdgpu_ps function: %lhs is
; marked inreg (checked code reads it from s0) and %rhs is not (read from v0).
; The i32 result is bitcast to float for the return, and the checked code
; leaves it in v0.
; The GFX6 and GFX8 checks compute the clamp bounds from lhs with scalar
; instructions and apply them to rhs with v_max_i32/v_min_i32 before the add.
; The GFX9 and GFX10 checks expect a single add with the clamp modifier.
1134define amdgpu_ps float @saddsat_i32_sv(i32 inreg %lhs, i32 %rhs) {
1135; GFX6-LABEL: saddsat_i32_sv:
1136; GFX6:       ; %bb.0:
1137; GFX6-NEXT:    s_min_i32 s2, s0, 0
1138; GFX6-NEXT:    s_max_i32 s1, s0, 0
1139; GFX6-NEXT:    s_sub_i32 s2, 0x80000000, s2
1140; GFX6-NEXT:    s_sub_i32 s1, 0x7fffffff, s1
1141; GFX6-NEXT:    v_max_i32_e32 v0, s2, v0
1142; GFX6-NEXT:    v_min_i32_e32 v0, s1, v0
1143; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
1144; GFX6-NEXT:    ; return to shader part epilog
1145;
1146; GFX8-LABEL: saddsat_i32_sv:
1147; GFX8:       ; %bb.0:
1148; GFX8-NEXT:    s_min_i32 s2, s0, 0
1149; GFX8-NEXT:    s_max_i32 s1, s0, 0
1150; GFX8-NEXT:    s_sub_i32 s2, 0x80000000, s2
1151; GFX8-NEXT:    s_sub_i32 s1, 0x7fffffff, s1
1152; GFX8-NEXT:    v_max_i32_e32 v0, s2, v0
1153; GFX8-NEXT:    v_min_i32_e32 v0, s1, v0
1154; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1155; GFX8-NEXT:    ; return to shader part epilog
1156;
1157; GFX9-LABEL: saddsat_i32_sv:
1158; GFX9:       ; %bb.0:
1159; GFX9-NEXT:    v_add_i32 v0, s0, v0 clamp
1160; GFX9-NEXT:    ; return to shader part epilog
1161;
1162; GFX10-LABEL: saddsat_i32_sv:
1163; GFX10:       ; %bb.0:
1164; GFX10-NEXT:    v_add_nc_i32 v0, s0, v0 clamp
1165; GFX10-NEXT:    ; return to shader part epilog
1166  %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs)
1167  %cast = bitcast i32 %result to float
1168  ret float %cast
1169}
1170
; Mixed-operand i32 signed saturating add in an amdgpu_ps function, with the
; operand kinds swapped relative to saddsat_i32_sv: %lhs is not inreg (checked
; code reads it from v0) and %rhs is marked inreg (read from s0).
; The i32 result is bitcast to float for the return, and the checked code
; leaves it in v0.
; The GFX6 and GFX8 checks compute the clamp bounds from lhs with vector
; instructions, clamp rhs with v_max_i32/v_min_i32, then add. The GFX9 and
; GFX10 checks expect a single add with the clamp modifier.
1171define amdgpu_ps float @saddsat_i32_vs(i32 %lhs, i32 inreg %rhs) {
1172; GFX6-LABEL: saddsat_i32_vs:
1173; GFX6:       ; %bb.0:
1174; GFX6-NEXT:    v_min_i32_e32 v2, 0, v0
1175; GFX6-NEXT:    v_max_i32_e32 v1, 0, v0
1176; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 0x80000000, v2
1177; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, 0x7fffffff, v1
1178; GFX6-NEXT:    v_max_i32_e32 v2, s0, v2
1179; GFX6-NEXT:    v_min_i32_e32 v1, v2, v1
1180; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
1181; GFX6-NEXT:    ; return to shader part epilog
1182;
1183; GFX8-LABEL: saddsat_i32_vs:
1184; GFX8:       ; %bb.0:
1185; GFX8-NEXT:    v_min_i32_e32 v2, 0, v0
1186; GFX8-NEXT:    v_max_i32_e32 v1, 0, v0
1187; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 0x80000000, v2
1188; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, 0x7fffffff, v1
1189; GFX8-NEXT:    v_max_i32_e32 v2, s0, v2
1190; GFX8-NEXT:    v_min_i32_e32 v1, v2, v1
1191; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
1192; GFX8-NEXT:    ; return to shader part epilog
1193;
1194; GFX9-LABEL: saddsat_i32_vs:
1195; GFX9:       ; %bb.0:
1196; GFX9-NEXT:    v_add_i32 v0, v0, s0 clamp
1197; GFX9-NEXT:    ; return to shader part epilog
1198;
1199; GFX10-LABEL: saddsat_i32_vs:
1200; GFX10:       ; %bb.0:
1201; GFX10-NEXT:    v_add_nc_i32 v0, v0, s0 clamp
1202; GFX10-NEXT:    ; return to shader part epilog
1203  %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs)
1204  %cast = bitcast i32 %result to float
1205  ret float %cast
1206}
1207
; <2 x i32> signed saturating add (llvm.sadd.sat.v2i32) with plain, non-inreg
; arguments; the checked code takes lhs in v0-v1 and rhs in v2-v3 and returns
; the result in v0-v1.
; The GFX6 and GFX8 checks repeat the min/max clamp-then-add sequence once per
; element, with the two bound constants materialized in s5 and s4 by
; s_brev_b32. The GFX9 and GFX10 checks expect one clamped add per element.
1208define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
1209; GFX6-LABEL: v_saddsat_v2i32:
1210; GFX6:       ; %bb.0:
1211; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1212; GFX6-NEXT:    s_brev_b32 s5, 1
1213; GFX6-NEXT:    v_min_i32_e32 v5, 0, v0
1214; GFX6-NEXT:    s_brev_b32 s4, -2
1215; GFX6-NEXT:    v_max_i32_e32 v4, 0, v0
1216; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s5, v5
1217; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s4, v4
1218; GFX6-NEXT:    v_max_i32_e32 v2, v5, v2
1219; GFX6-NEXT:    v_min_i32_e32 v2, v2, v4
1220; GFX6-NEXT:    v_min_i32_e32 v4, 0, v1
1221; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
1222; GFX6-NEXT:    v_max_i32_e32 v2, 0, v1
1223; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s5, v4
1224; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s4, v2
1225; GFX6-NEXT:    v_max_i32_e32 v3, v4, v3
1226; GFX6-NEXT:    v_min_i32_e32 v2, v3, v2
1227; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
1228; GFX6-NEXT:    s_setpc_b64 s[30:31]
1229;
1230; GFX8-LABEL: v_saddsat_v2i32:
1231; GFX8:       ; %bb.0:
1232; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1233; GFX8-NEXT:    s_brev_b32 s5, 1
1234; GFX8-NEXT:    v_min_i32_e32 v5, 0, v0
1235; GFX8-NEXT:    s_brev_b32 s4, -2
1236; GFX8-NEXT:    v_max_i32_e32 v4, 0, v0
1237; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, s5, v5
1238; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, s4, v4
1239; GFX8-NEXT:    v_max_i32_e32 v2, v5, v2
1240; GFX8-NEXT:    v_min_i32_e32 v2, v2, v4
1241; GFX8-NEXT:    v_min_i32_e32 v4, 0, v1
1242; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
1243; GFX8-NEXT:    v_max_i32_e32 v2, 0, v1
1244; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, s5, v4
1245; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s4, v2
1246; GFX8-NEXT:    v_max_i32_e32 v3, v4, v3
1247; GFX8-NEXT:    v_min_i32_e32 v2, v3, v2
1248; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
1249; GFX8-NEXT:    s_setpc_b64 s[30:31]
1250;
1251; GFX9-LABEL: v_saddsat_v2i32:
1252; GFX9:       ; %bb.0:
1253; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1254; GFX9-NEXT:    v_add_i32 v0, v0, v2 clamp
1255; GFX9-NEXT:    v_add_i32 v1, v1, v3 clamp
1256; GFX9-NEXT:    s_setpc_b64 s[30:31]
1257;
1258; GFX10-LABEL: v_saddsat_v2i32:
1259; GFX10:       ; %bb.0:
1260; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1261; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1262; GFX10-NEXT:    v_add_nc_i32 v0, v0, v2 clamp
1263; GFX10-NEXT:    v_add_nc_i32 v1, v1, v3 clamp
1264; GFX10-NEXT:    s_setpc_b64 s[30:31]
1265  %result = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
1266  ret <2 x i32> %result
1267}
1268
; <2 x i32> signed saturating add (llvm.sadd.sat.v2i32) in an amdgpu_ps
; function with both arguments marked inreg; the checked code takes lhs in
; s0-s1 and rhs in s2-s3 and leaves the result in s0-s1.
; The GFX6 and GFX8 checks run the min/max clamp-then-add sequence per element
; using only scalar (s_*) instructions. The GFX9 and GFX10 checks use one
; clamped vector add per element followed by v_readfirstlane_b32 for each
; result register.
1269define amdgpu_ps <2 x i32> @s_saddsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inreg %rhs) {
1270; GFX6-LABEL: s_saddsat_v2i32:
1271; GFX6:       ; %bb.0:
1272; GFX6-NEXT:    s_brev_b32 s5, 1
1273; GFX6-NEXT:    s_min_i32 s7, s0, 0
1274; GFX6-NEXT:    s_brev_b32 s4, -2
1275; GFX6-NEXT:    s_max_i32 s6, s0, 0
1276; GFX6-NEXT:    s_sub_i32 s7, s5, s7
1277; GFX6-NEXT:    s_sub_i32 s6, s4, s6
1278; GFX6-NEXT:    s_max_i32 s2, s7, s2
1279; GFX6-NEXT:    s_min_i32 s2, s2, s6
1280; GFX6-NEXT:    s_add_i32 s0, s0, s2
1281; GFX6-NEXT:    s_max_i32 s2, s1, 0
1282; GFX6-NEXT:    s_sub_i32 s2, s4, s2
1283; GFX6-NEXT:    s_min_i32 s4, s1, 0
1284; GFX6-NEXT:    s_sub_i32 s4, s5, s4
1285; GFX6-NEXT:    s_max_i32 s3, s4, s3
1286; GFX6-NEXT:    s_min_i32 s2, s3, s2
1287; GFX6-NEXT:    s_add_i32 s1, s1, s2
1288; GFX6-NEXT:    ; return to shader part epilog
1289;
1290; GFX8-LABEL: s_saddsat_v2i32:
1291; GFX8:       ; %bb.0:
1292; GFX8-NEXT:    s_brev_b32 s5, 1
1293; GFX8-NEXT:    s_min_i32 s7, s0, 0
1294; GFX8-NEXT:    s_brev_b32 s4, -2
1295; GFX8-NEXT:    s_max_i32 s6, s0, 0
1296; GFX8-NEXT:    s_sub_i32 s7, s5, s7
1297; GFX8-NEXT:    s_sub_i32 s6, s4, s6
1298; GFX8-NEXT:    s_max_i32 s2, s7, s2
1299; GFX8-NEXT:    s_min_i32 s2, s2, s6
1300; GFX8-NEXT:    s_add_i32 s0, s0, s2
1301; GFX8-NEXT:    s_max_i32 s2, s1, 0
1302; GFX8-NEXT:    s_sub_i32 s2, s4, s2
1303; GFX8-NEXT:    s_min_i32 s4, s1, 0
1304; GFX8-NEXT:    s_sub_i32 s4, s5, s4
1305; GFX8-NEXT:    s_max_i32 s3, s4, s3
1306; GFX8-NEXT:    s_min_i32 s2, s3, s2
1307; GFX8-NEXT:    s_add_i32 s1, s1, s2
1308; GFX8-NEXT:    ; return to shader part epilog
1309;
1310; GFX9-LABEL: s_saddsat_v2i32:
1311; GFX9:       ; %bb.0:
1312; GFX9-NEXT:    v_mov_b32_e32 v0, s2
1313; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1314; GFX9-NEXT:    v_add_i32 v0, s0, v0 clamp
1315; GFX9-NEXT:    v_add_i32 v1, s1, v1 clamp
1316; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1317; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
1318; GFX9-NEXT:    ; return to shader part epilog
1319;
1320; GFX10-LABEL: s_saddsat_v2i32:
1321; GFX10:       ; %bb.0:
1322; GFX10-NEXT:    v_add_nc_i32 v0, s0, s2 clamp
1323; GFX10-NEXT:    v_add_nc_i32 v1, s1, s3 clamp
1324; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
1325; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
1326; GFX10-NEXT:    ; return to shader part epilog
1327  %result = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
1328  ret <2 x i32> %result
1329}
1330
; <3 x i32> signed saturating add (llvm.sadd.sat.v3i32) with plain, non-inreg
; arguments; the checked code takes lhs in v0-v2 and rhs in v3-v5 and returns
; the result in v0-v2.
; The GFX6 and GFX8 checks repeat the min/max clamp-then-add sequence once per
; element, sharing the two bound constants held in s5 and s4. The GFX9 and
; GFX10 checks expect one clamped add per element.
1331define <3 x i32> @v_saddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
1332; GFX6-LABEL: v_saddsat_v3i32:
1333; GFX6:       ; %bb.0:
1334; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1335; GFX6-NEXT:    s_brev_b32 s5, 1
1336; GFX6-NEXT:    v_min_i32_e32 v7, 0, v0
1337; GFX6-NEXT:    s_brev_b32 s4, -2
1338; GFX6-NEXT:    v_max_i32_e32 v6, 0, v0
1339; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, s5, v7
1340; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s4, v6
1341; GFX6-NEXT:    v_max_i32_e32 v3, v7, v3
1342; GFX6-NEXT:    v_min_i32_e32 v3, v3, v6
1343; GFX6-NEXT:    v_min_i32_e32 v6, 0, v1
1344; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
1345; GFX6-NEXT:    v_max_i32_e32 v3, 0, v1
1346; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s5, v6
1347; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s4, v3
1348; GFX6-NEXT:    v_max_i32_e32 v4, v6, v4
1349; GFX6-NEXT:    v_min_i32_e32 v3, v4, v3
1350; GFX6-NEXT:    v_min_i32_e32 v4, 0, v2
1351; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
1352; GFX6-NEXT:    v_max_i32_e32 v3, 0, v2
1353; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s5, v4
1354; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s4, v3
1355; GFX6-NEXT:    v_max_i32_e32 v4, v4, v5
1356; GFX6-NEXT:    v_min_i32_e32 v3, v4, v3
1357; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
1358; GFX6-NEXT:    s_setpc_b64 s[30:31]
1359;
1360; GFX8-LABEL: v_saddsat_v3i32:
1361; GFX8:       ; %bb.0:
1362; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1363; GFX8-NEXT:    s_brev_b32 s5, 1
1364; GFX8-NEXT:    v_min_i32_e32 v7, 0, v0
1365; GFX8-NEXT:    s_brev_b32 s4, -2
1366; GFX8-NEXT:    v_max_i32_e32 v6, 0, v0
1367; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, s5, v7
1368; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, s4, v6
1369; GFX8-NEXT:    v_max_i32_e32 v3, v7, v3
1370; GFX8-NEXT:    v_min_i32_e32 v3, v3, v6
1371; GFX8-NEXT:    v_min_i32_e32 v6, 0, v1
1372; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
1373; GFX8-NEXT:    v_max_i32_e32 v3, 0, v1
1374; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, s5, v6
1375; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s4, v3
1376; GFX8-NEXT:    v_max_i32_e32 v4, v6, v4
1377; GFX8-NEXT:    v_min_i32_e32 v3, v4, v3
1378; GFX8-NEXT:    v_min_i32_e32 v4, 0, v2
1379; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
1380; GFX8-NEXT:    v_max_i32_e32 v3, 0, v2
1381; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, s5, v4
1382; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s4, v3
1383; GFX8-NEXT:    v_max_i32_e32 v4, v4, v5
1384; GFX8-NEXT:    v_min_i32_e32 v3, v4, v3
1385; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
1386; GFX8-NEXT:    s_setpc_b64 s[30:31]
1387;
1388; GFX9-LABEL: v_saddsat_v3i32:
1389; GFX9:       ; %bb.0:
1390; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1391; GFX9-NEXT:    v_add_i32 v0, v0, v3 clamp
1392; GFX9-NEXT:    v_add_i32 v1, v1, v4 clamp
1393; GFX9-NEXT:    v_add_i32 v2, v2, v5 clamp
1394; GFX9-NEXT:    s_setpc_b64 s[30:31]
1395;
1396; GFX10-LABEL: v_saddsat_v3i32:
1397; GFX10:       ; %bb.0:
1398; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1399; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1400; GFX10-NEXT:    v_add_nc_i32 v0, v0, v3 clamp
1401; GFX10-NEXT:    v_add_nc_i32 v1, v1, v4 clamp
1402; GFX10-NEXT:    v_add_nc_i32 v2, v2, v5 clamp
1403; GFX10-NEXT:    s_setpc_b64 s[30:31]
1404  %result = call <3 x i32> @llvm.sadd.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
1405  ret <3 x i32> %result
1406}
1407
; <3 x i32> signed saturating add (llvm.sadd.sat.v3i32) in an amdgpu_ps
; function with both arguments marked inreg; the checked code takes lhs in
; s0-s2 and rhs in s3-s5 and leaves the result in s0-s2.
; The GFX6 and GFX8 checks run the min/max clamp-then-add sequence per element
; using only scalar (s_*) instructions. The GFX9 and GFX10 checks use one
; clamped vector add per element followed by v_readfirstlane_b32 for each
; result register.
1408define amdgpu_ps <3 x i32> @s_saddsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inreg %rhs) {
1409; GFX6-LABEL: s_saddsat_v3i32:
1410; GFX6:       ; %bb.0:
1411; GFX6-NEXT:    s_brev_b32 s7, 1
1412; GFX6-NEXT:    s_min_i32 s9, s0, 0
1413; GFX6-NEXT:    s_brev_b32 s6, -2
1414; GFX6-NEXT:    s_max_i32 s8, s0, 0
1415; GFX6-NEXT:    s_sub_i32 s9, s7, s9
1416; GFX6-NEXT:    s_sub_i32 s8, s6, s8
1417; GFX6-NEXT:    s_max_i32 s3, s9, s3
1418; GFX6-NEXT:    s_min_i32 s3, s3, s8
1419; GFX6-NEXT:    s_min_i32 s8, s1, 0
1420; GFX6-NEXT:    s_add_i32 s0, s0, s3
1421; GFX6-NEXT:    s_max_i32 s3, s1, 0
1422; GFX6-NEXT:    s_sub_i32 s8, s7, s8
1423; GFX6-NEXT:    s_sub_i32 s3, s6, s3
1424; GFX6-NEXT:    s_max_i32 s4, s8, s4
1425; GFX6-NEXT:    s_min_i32 s3, s4, s3
1426; GFX6-NEXT:    s_min_i32 s4, s2, 0
1427; GFX6-NEXT:    s_add_i32 s1, s1, s3
1428; GFX6-NEXT:    s_max_i32 s3, s2, 0
1429; GFX6-NEXT:    s_sub_i32 s4, s7, s4
1430; GFX6-NEXT:    s_sub_i32 s3, s6, s3
1431; GFX6-NEXT:    s_max_i32 s4, s4, s5
1432; GFX6-NEXT:    s_min_i32 s3, s4, s3
1433; GFX6-NEXT:    s_add_i32 s2, s2, s3
1434; GFX6-NEXT:    ; return to shader part epilog
1435;
1436; GFX8-LABEL: s_saddsat_v3i32:
1437; GFX8:       ; %bb.0:
1438; GFX8-NEXT:    s_brev_b32 s7, 1
1439; GFX8-NEXT:    s_min_i32 s9, s0, 0
1440; GFX8-NEXT:    s_brev_b32 s6, -2
1441; GFX8-NEXT:    s_max_i32 s8, s0, 0
1442; GFX8-NEXT:    s_sub_i32 s9, s7, s9
1443; GFX8-NEXT:    s_sub_i32 s8, s6, s8
1444; GFX8-NEXT:    s_max_i32 s3, s9, s3
1445; GFX8-NEXT:    s_min_i32 s3, s3, s8
1446; GFX8-NEXT:    s_min_i32 s8, s1, 0
1447; GFX8-NEXT:    s_add_i32 s0, s0, s3
1448; GFX8-NEXT:    s_max_i32 s3, s1, 0
1449; GFX8-NEXT:    s_sub_i32 s8, s7, s8
1450; GFX8-NEXT:    s_sub_i32 s3, s6, s3
1451; GFX8-NEXT:    s_max_i32 s4, s8, s4
1452; GFX8-NEXT:    s_min_i32 s3, s4, s3
1453; GFX8-NEXT:    s_min_i32 s4, s2, 0
1454; GFX8-NEXT:    s_add_i32 s1, s1, s3
1455; GFX8-NEXT:    s_max_i32 s3, s2, 0
1456; GFX8-NEXT:    s_sub_i32 s4, s7, s4
1457; GFX8-NEXT:    s_sub_i32 s3, s6, s3
1458; GFX8-NEXT:    s_max_i32 s4, s4, s5
1459; GFX8-NEXT:    s_min_i32 s3, s4, s3
1460; GFX8-NEXT:    s_add_i32 s2, s2, s3
1461; GFX8-NEXT:    ; return to shader part epilog
1462;
1463; GFX9-LABEL: s_saddsat_v3i32:
1464; GFX9:       ; %bb.0:
1465; GFX9-NEXT:    v_mov_b32_e32 v0, s3
1466; GFX9-NEXT:    v_mov_b32_e32 v1, s4
1467; GFX9-NEXT:    v_mov_b32_e32 v2, s5
1468; GFX9-NEXT:    v_add_i32 v0, s0, v0 clamp
1469; GFX9-NEXT:    v_add_i32 v1, s1, v1 clamp
1470; GFX9-NEXT:    v_add_i32 v2, s2, v2 clamp
1471; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1472; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
1473; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
1474; GFX9-NEXT:    ; return to shader part epilog
1475;
1476; GFX10-LABEL: s_saddsat_v3i32:
1477; GFX10:       ; %bb.0:
1478; GFX10-NEXT:    v_add_nc_i32 v0, s0, s3 clamp
1479; GFX10-NEXT:    v_add_nc_i32 v1, s1, s4 clamp
1480; GFX10-NEXT:    v_add_nc_i32 v2, s2, s5 clamp
1481; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
1482; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
1483; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
1484; GFX10-NEXT:    ; return to shader part epilog
1485  %result = call <3 x i32> @llvm.sadd.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
1486  ret <3 x i32> %result
1487}
1488
; Signed saturating add of two <4 x i32> vectors; the recorded output keeps
; every lane in VGPRs (lhs in v0-v3, rhs in v4-v7).
; The tahiti and fiji runs record a per-lane min/max/sub clamp of the rhs
; followed by a plain add; lanes 0-2 take the bounds from s5/s4 while the last
; lane uses the literals 0x80000000 / 0x7fffffff directly.
; The gfx900 and gfx1010 runs record one clamped add per lane.
1489define <4 x i32> @v_saddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
1490; GFX6-LABEL: v_saddsat_v4i32:
1491; GFX6:       ; %bb.0:
1492; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1493; GFX6-NEXT:    s_brev_b32 s5, 1
1494; GFX6-NEXT:    v_min_i32_e32 v9, 0, v0
1495; GFX6-NEXT:    s_brev_b32 s4, -2
1496; GFX6-NEXT:    v_max_i32_e32 v8, 0, v0
1497; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, s5, v9
1498; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s4, v8
1499; GFX6-NEXT:    v_max_i32_e32 v4, v9, v4
1500; GFX6-NEXT:    v_min_i32_e32 v4, v4, v8
1501; GFX6-NEXT:    v_min_i32_e32 v8, 0, v1
1502; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
1503; GFX6-NEXT:    v_max_i32_e32 v4, 0, v1
1504; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s5, v8
1505; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s4, v4
1506; GFX6-NEXT:    v_max_i32_e32 v5, v8, v5
1507; GFX6-NEXT:    v_min_i32_e32 v4, v5, v4
1508; GFX6-NEXT:    v_min_i32_e32 v5, 0, v2
1509; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
1510; GFX6-NEXT:    v_max_i32_e32 v4, 0, v2
1511; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s5, v5
1512; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s4, v4
1513; GFX6-NEXT:    v_max_i32_e32 v5, v5, v6
1514; GFX6-NEXT:    v_min_i32_e32 v4, v5, v4
1515; GFX6-NEXT:    v_min_i32_e32 v5, 0, v3
1516; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
1517; GFX6-NEXT:    v_max_i32_e32 v4, 0, v3
1518; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, 0x80000000, v5
1519; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 0x7fffffff, v4
1520; GFX6-NEXT:    v_max_i32_e32 v5, v5, v7
1521; GFX6-NEXT:    v_min_i32_e32 v4, v5, v4
1522; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
1523; GFX6-NEXT:    s_setpc_b64 s[30:31]
1524;
1525; GFX8-LABEL: v_saddsat_v4i32:
1526; GFX8:       ; %bb.0:
1527; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1528; GFX8-NEXT:    s_brev_b32 s5, 1
1529; GFX8-NEXT:    v_min_i32_e32 v9, 0, v0
1530; GFX8-NEXT:    s_brev_b32 s4, -2
1531; GFX8-NEXT:    v_max_i32_e32 v8, 0, v0
1532; GFX8-NEXT:    v_sub_u32_e32 v9, vcc, s5, v9
1533; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, s4, v8
1534; GFX8-NEXT:    v_max_i32_e32 v4, v9, v4
1535; GFX8-NEXT:    v_min_i32_e32 v4, v4, v8
1536; GFX8-NEXT:    v_min_i32_e32 v8, 0, v1
1537; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v4
1538; GFX8-NEXT:    v_max_i32_e32 v4, 0, v1
1539; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, s5, v8
1540; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, s4, v4
1541; GFX8-NEXT:    v_max_i32_e32 v5, v8, v5
1542; GFX8-NEXT:    v_min_i32_e32 v4, v5, v4
1543; GFX8-NEXT:    v_min_i32_e32 v5, 0, v2
1544; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v4
1545; GFX8-NEXT:    v_max_i32_e32 v4, 0, v2
1546; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, s5, v5
1547; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, s4, v4
1548; GFX8-NEXT:    v_max_i32_e32 v5, v5, v6
1549; GFX8-NEXT:    v_min_i32_e32 v4, v5, v4
1550; GFX8-NEXT:    v_min_i32_e32 v5, 0, v3
1551; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
1552; GFX8-NEXT:    v_max_i32_e32 v4, 0, v3
1553; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 0x80000000, v5
1554; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 0x7fffffff, v4
1555; GFX8-NEXT:    v_max_i32_e32 v5, v5, v7
1556; GFX8-NEXT:    v_min_i32_e32 v4, v5, v4
1557; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
1558; GFX8-NEXT:    s_setpc_b64 s[30:31]
1559;
1560; GFX9-LABEL: v_saddsat_v4i32:
1561; GFX9:       ; %bb.0:
1562; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1563; GFX9-NEXT:    v_add_i32 v0, v0, v4 clamp
1564; GFX9-NEXT:    v_add_i32 v1, v1, v5 clamp
1565; GFX9-NEXT:    v_add_i32 v2, v2, v6 clamp
1566; GFX9-NEXT:    v_add_i32 v3, v3, v7 clamp
1567; GFX9-NEXT:    s_setpc_b64 s[30:31]
1568;
1569; GFX10-LABEL: v_saddsat_v4i32:
1570; GFX10:       ; %bb.0:
1571; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1572; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1573; GFX10-NEXT:    v_add_nc_i32 v0, v0, v4 clamp
1574; GFX10-NEXT:    v_add_nc_i32 v1, v1, v5 clamp
1575; GFX10-NEXT:    v_add_nc_i32 v2, v2, v6 clamp
1576; GFX10-NEXT:    v_add_nc_i32 v3, v3, v7 clamp
1577; GFX10-NEXT:    s_setpc_b64 s[30:31]
1578  %result = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
1579  ret <4 x i32> %result
1580}
1581
; Signed saturating add of two <4 x i32> vectors passed as inreg arguments to
; an amdgpu_ps function; the recorded output has lhs in s0-s3, rhs in s4-s7.
; The tahiti and fiji runs stay entirely on scalar instructions: a per-lane
; s_min/s_max/s_sub clamp of the rhs against the bounds held in s9/s8, then
; s_add.
; The gfx900 run copies the rhs into VGPRs, does a clamped vector add and reads
; each lane back with v_readfirstlane_b32; the gfx1010 run feeds both SGPR
; operands to the clamped add directly.
1582define amdgpu_ps <4 x i32> @s_saddsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inreg %rhs) {
1583; GFX6-LABEL: s_saddsat_v4i32:
1584; GFX6:       ; %bb.0:
1585; GFX6-NEXT:    s_brev_b32 s9, 1
1586; GFX6-NEXT:    s_min_i32 s11, s0, 0
1587; GFX6-NEXT:    s_brev_b32 s8, -2
1588; GFX6-NEXT:    s_max_i32 s10, s0, 0
1589; GFX6-NEXT:    s_sub_i32 s11, s9, s11
1590; GFX6-NEXT:    s_sub_i32 s10, s8, s10
1591; GFX6-NEXT:    s_max_i32 s4, s11, s4
1592; GFX6-NEXT:    s_min_i32 s4, s4, s10
1593; GFX6-NEXT:    s_min_i32 s10, s1, 0
1594; GFX6-NEXT:    s_add_i32 s0, s0, s4
1595; GFX6-NEXT:    s_max_i32 s4, s1, 0
1596; GFX6-NEXT:    s_sub_i32 s10, s9, s10
1597; GFX6-NEXT:    s_sub_i32 s4, s8, s4
1598; GFX6-NEXT:    s_max_i32 s5, s10, s5
1599; GFX6-NEXT:    s_min_i32 s4, s5, s4
1600; GFX6-NEXT:    s_min_i32 s5, s2, 0
1601; GFX6-NEXT:    s_add_i32 s1, s1, s4
1602; GFX6-NEXT:    s_max_i32 s4, s2, 0
1603; GFX6-NEXT:    s_sub_i32 s5, s9, s5
1604; GFX6-NEXT:    s_sub_i32 s4, s8, s4
1605; GFX6-NEXT:    s_max_i32 s5, s5, s6
1606; GFX6-NEXT:    s_min_i32 s4, s5, s4
1607; GFX6-NEXT:    s_min_i32 s5, s3, 0
1608; GFX6-NEXT:    s_add_i32 s2, s2, s4
1609; GFX6-NEXT:    s_max_i32 s4, s3, 0
1610; GFX6-NEXT:    s_sub_i32 s5, s9, s5
1611; GFX6-NEXT:    s_sub_i32 s4, s8, s4
1612; GFX6-NEXT:    s_max_i32 s5, s5, s7
1613; GFX6-NEXT:    s_min_i32 s4, s5, s4
1614; GFX6-NEXT:    s_add_i32 s3, s3, s4
1615; GFX6-NEXT:    ; return to shader part epilog
1616;
1617; GFX8-LABEL: s_saddsat_v4i32:
1618; GFX8:       ; %bb.0:
1619; GFX8-NEXT:    s_brev_b32 s9, 1
1620; GFX8-NEXT:    s_min_i32 s11, s0, 0
1621; GFX8-NEXT:    s_brev_b32 s8, -2
1622; GFX8-NEXT:    s_max_i32 s10, s0, 0
1623; GFX8-NEXT:    s_sub_i32 s11, s9, s11
1624; GFX8-NEXT:    s_sub_i32 s10, s8, s10
1625; GFX8-NEXT:    s_max_i32 s4, s11, s4
1626; GFX8-NEXT:    s_min_i32 s4, s4, s10
1627; GFX8-NEXT:    s_min_i32 s10, s1, 0
1628; GFX8-NEXT:    s_add_i32 s0, s0, s4
1629; GFX8-NEXT:    s_max_i32 s4, s1, 0
1630; GFX8-NEXT:    s_sub_i32 s10, s9, s10
1631; GFX8-NEXT:    s_sub_i32 s4, s8, s4
1632; GFX8-NEXT:    s_max_i32 s5, s10, s5
1633; GFX8-NEXT:    s_min_i32 s4, s5, s4
1634; GFX8-NEXT:    s_min_i32 s5, s2, 0
1635; GFX8-NEXT:    s_add_i32 s1, s1, s4
1636; GFX8-NEXT:    s_max_i32 s4, s2, 0
1637; GFX8-NEXT:    s_sub_i32 s5, s9, s5
1638; GFX8-NEXT:    s_sub_i32 s4, s8, s4
1639; GFX8-NEXT:    s_max_i32 s5, s5, s6
1640; GFX8-NEXT:    s_min_i32 s4, s5, s4
1641; GFX8-NEXT:    s_min_i32 s5, s3, 0
1642; GFX8-NEXT:    s_add_i32 s2, s2, s4
1643; GFX8-NEXT:    s_max_i32 s4, s3, 0
1644; GFX8-NEXT:    s_sub_i32 s5, s9, s5
1645; GFX8-NEXT:    s_sub_i32 s4, s8, s4
1646; GFX8-NEXT:    s_max_i32 s5, s5, s7
1647; GFX8-NEXT:    s_min_i32 s4, s5, s4
1648; GFX8-NEXT:    s_add_i32 s3, s3, s4
1649; GFX8-NEXT:    ; return to shader part epilog
1650;
1651; GFX9-LABEL: s_saddsat_v4i32:
1652; GFX9:       ; %bb.0:
1653; GFX9-NEXT:    v_mov_b32_e32 v0, s4
1654; GFX9-NEXT:    v_mov_b32_e32 v1, s5
1655; GFX9-NEXT:    v_mov_b32_e32 v2, s6
1656; GFX9-NEXT:    v_mov_b32_e32 v3, s7
1657; GFX9-NEXT:    v_add_i32 v0, s0, v0 clamp
1658; GFX9-NEXT:    v_add_i32 v1, s1, v1 clamp
1659; GFX9-NEXT:    v_add_i32 v2, s2, v2 clamp
1660; GFX9-NEXT:    v_add_i32 v3, s3, v3 clamp
1661; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1662; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
1663; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
1664; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
1665; GFX9-NEXT:    ; return to shader part epilog
1666;
1667; GFX10-LABEL: s_saddsat_v4i32:
1668; GFX10:       ; %bb.0:
1669; GFX10-NEXT:    v_add_nc_i32 v0, s0, s4 clamp
1670; GFX10-NEXT:    v_add_nc_i32 v1, s1, s5 clamp
1671; GFX10-NEXT:    v_add_nc_i32 v2, s2, s6 clamp
1672; GFX10-NEXT:    v_add_nc_i32 v3, s3, s7 clamp
1673; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
1674; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
1675; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
1676; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
1677; GFX10-NEXT:    ; return to shader part epilog
1678  %result = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
1679  ret <4 x i32> %result
1680}
1681
; Signed saturating add of two <5 x i32> vectors (an odd lane count); the
; recorded output has lhs in v0-v4 and rhs in v5-v9.
; The tahiti and fiji runs record a per-lane min/max/sub clamp of the rhs
; followed by a plain add; lanes 0-2 take the bounds from s5/s4, lanes 3-4 take
; them from v13/v11, which are set up with v_bfrev_b32.
; The gfx900 and gfx1010 runs record one clamped add per lane.
1682define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
1683; GFX6-LABEL: v_saddsat_v5i32:
1684; GFX6:       ; %bb.0:
1685; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1686; GFX6-NEXT:    s_brev_b32 s5, 1
1687; GFX6-NEXT:    v_min_i32_e32 v12, 0, v0
1688; GFX6-NEXT:    s_brev_b32 s4, -2
1689; GFX6-NEXT:    v_max_i32_e32 v10, 0, v0
1690; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, s5, v12
1691; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, s4, v10
1692; GFX6-NEXT:    v_max_i32_e32 v5, v12, v5
1693; GFX6-NEXT:    v_min_i32_e32 v5, v5, v10
1694; GFX6-NEXT:    v_min_i32_e32 v10, 0, v1
1695; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v5
1696; GFX6-NEXT:    v_max_i32_e32 v5, 0, v1
1697; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, s5, v10
1698; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s4, v5
1699; GFX6-NEXT:    v_max_i32_e32 v6, v10, v6
1700; GFX6-NEXT:    v_min_i32_e32 v5, v6, v5
1701; GFX6-NEXT:    v_min_i32_e32 v6, 0, v2
1702; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
1703; GFX6-NEXT:    v_max_i32_e32 v5, 0, v2
1704; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s5, v6
1705; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s4, v5
1706; GFX6-NEXT:    v_max_i32_e32 v6, v6, v7
1707; GFX6-NEXT:    v_bfrev_b32_e32 v13, 1
1708; GFX6-NEXT:    v_min_i32_e32 v5, v6, v5
1709; GFX6-NEXT:    v_min_i32_e32 v6, 0, v3
1710; GFX6-NEXT:    v_bfrev_b32_e32 v11, -2
1711; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
1712; GFX6-NEXT:    v_max_i32_e32 v5, 0, v3
1713; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v13, v6
1714; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v11, v5
1715; GFX6-NEXT:    v_max_i32_e32 v6, v6, v8
1716; GFX6-NEXT:    v_min_i32_e32 v5, v6, v5
1717; GFX6-NEXT:    v_min_i32_e32 v6, 0, v4
1718; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
1719; GFX6-NEXT:    v_max_i32_e32 v5, 0, v4
1720; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v13, v6
1721; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v11, v5
1722; GFX6-NEXT:    v_max_i32_e32 v6, v6, v9
1723; GFX6-NEXT:    v_min_i32_e32 v5, v6, v5
1724; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
1725; GFX6-NEXT:    s_setpc_b64 s[30:31]
1726;
1727; GFX8-LABEL: v_saddsat_v5i32:
1728; GFX8:       ; %bb.0:
1729; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1730; GFX8-NEXT:    s_brev_b32 s5, 1
1731; GFX8-NEXT:    v_min_i32_e32 v12, 0, v0
1732; GFX8-NEXT:    s_brev_b32 s4, -2
1733; GFX8-NEXT:    v_max_i32_e32 v10, 0, v0
1734; GFX8-NEXT:    v_sub_u32_e32 v12, vcc, s5, v12
1735; GFX8-NEXT:    v_sub_u32_e32 v10, vcc, s4, v10
1736; GFX8-NEXT:    v_max_i32_e32 v5, v12, v5
1737; GFX8-NEXT:    v_min_i32_e32 v5, v5, v10
1738; GFX8-NEXT:    v_min_i32_e32 v10, 0, v1
1739; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v5
1740; GFX8-NEXT:    v_max_i32_e32 v5, 0, v1
1741; GFX8-NEXT:    v_sub_u32_e32 v10, vcc, s5, v10
1742; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, s4, v5
1743; GFX8-NEXT:    v_max_i32_e32 v6, v10, v6
1744; GFX8-NEXT:    v_min_i32_e32 v5, v6, v5
1745; GFX8-NEXT:    v_min_i32_e32 v6, 0, v2
1746; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v5
1747; GFX8-NEXT:    v_max_i32_e32 v5, 0, v2
1748; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, s5, v6
1749; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, s4, v5
1750; GFX8-NEXT:    v_max_i32_e32 v6, v6, v7
1751; GFX8-NEXT:    v_bfrev_b32_e32 v13, 1
1752; GFX8-NEXT:    v_min_i32_e32 v5, v6, v5
1753; GFX8-NEXT:    v_min_i32_e32 v6, 0, v3
1754; GFX8-NEXT:    v_bfrev_b32_e32 v11, -2
1755; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
1756; GFX8-NEXT:    v_max_i32_e32 v5, 0, v3
1757; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, v13, v6
1758; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, v11, v5
1759; GFX8-NEXT:    v_max_i32_e32 v6, v6, v8
1760; GFX8-NEXT:    v_min_i32_e32 v5, v6, v5
1761; GFX8-NEXT:    v_min_i32_e32 v6, 0, v4
1762; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
1763; GFX8-NEXT:    v_max_i32_e32 v5, 0, v4
1764; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, v13, v6
1765; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, v11, v5
1766; GFX8-NEXT:    v_max_i32_e32 v6, v6, v9
1767; GFX8-NEXT:    v_min_i32_e32 v5, v6, v5
1768; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
1769; GFX8-NEXT:    s_setpc_b64 s[30:31]
1770;
1771; GFX9-LABEL: v_saddsat_v5i32:
1772; GFX9:       ; %bb.0:
1773; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1774; GFX9-NEXT:    v_add_i32 v0, v0, v5 clamp
1775; GFX9-NEXT:    v_add_i32 v1, v1, v6 clamp
1776; GFX9-NEXT:    v_add_i32 v2, v2, v7 clamp
1777; GFX9-NEXT:    v_add_i32 v3, v3, v8 clamp
1778; GFX9-NEXT:    v_add_i32 v4, v4, v9 clamp
1779; GFX9-NEXT:    s_setpc_b64 s[30:31]
1780;
1781; GFX10-LABEL: v_saddsat_v5i32:
1782; GFX10:       ; %bb.0:
1783; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1784; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1785; GFX10-NEXT:    v_add_nc_i32 v0, v0, v5 clamp
1786; GFX10-NEXT:    v_add_nc_i32 v1, v1, v6 clamp
1787; GFX10-NEXT:    v_add_nc_i32 v2, v2, v7 clamp
1788; GFX10-NEXT:    v_add_nc_i32 v3, v3, v8 clamp
1789; GFX10-NEXT:    v_add_nc_i32 v4, v4, v9 clamp
1790; GFX10-NEXT:    s_setpc_b64 s[30:31]
1791  %result = call <5 x i32> @llvm.sadd.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs)
1792  ret <5 x i32> %result
1793}
1794
; Signed saturating add of two <5 x i32> vectors passed as inreg arguments to
; an amdgpu_ps function; the recorded output has lhs in s0-s4, rhs in s5-s9.
; The tahiti and fiji runs stay entirely on scalar instructions: a per-lane
; s_min/s_max/s_sub clamp of the rhs against the bounds held in s11/s10, then
; s_add.
; The gfx900 run copies the rhs into VGPRs, does a clamped vector add and reads
; each lane back with v_readfirstlane_b32; the gfx1010 run feeds both SGPR
; operands to the clamped add directly.
1795define amdgpu_ps <5 x i32> @s_saddsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inreg %rhs) {
1796; GFX6-LABEL: s_saddsat_v5i32:
1797; GFX6:       ; %bb.0:
1798; GFX6-NEXT:    s_brev_b32 s11, 1
1799; GFX6-NEXT:    s_min_i32 s13, s0, 0
1800; GFX6-NEXT:    s_brev_b32 s10, -2
1801; GFX6-NEXT:    s_max_i32 s12, s0, 0
1802; GFX6-NEXT:    s_sub_i32 s13, s11, s13
1803; GFX6-NEXT:    s_sub_i32 s12, s10, s12
1804; GFX6-NEXT:    s_max_i32 s5, s13, s5
1805; GFX6-NEXT:    s_min_i32 s5, s5, s12
1806; GFX6-NEXT:    s_min_i32 s12, s1, 0
1807; GFX6-NEXT:    s_add_i32 s0, s0, s5
1808; GFX6-NEXT:    s_max_i32 s5, s1, 0
1809; GFX6-NEXT:    s_sub_i32 s12, s11, s12
1810; GFX6-NEXT:    s_sub_i32 s5, s10, s5
1811; GFX6-NEXT:    s_max_i32 s6, s12, s6
1812; GFX6-NEXT:    s_min_i32 s5, s6, s5
1813; GFX6-NEXT:    s_min_i32 s6, s2, 0
1814; GFX6-NEXT:    s_add_i32 s1, s1, s5
1815; GFX6-NEXT:    s_max_i32 s5, s2, 0
1816; GFX6-NEXT:    s_sub_i32 s6, s11, s6
1817; GFX6-NEXT:    s_sub_i32 s5, s10, s5
1818; GFX6-NEXT:    s_max_i32 s6, s6, s7
1819; GFX6-NEXT:    s_min_i32 s5, s6, s5
1820; GFX6-NEXT:    s_min_i32 s6, s3, 0
1821; GFX6-NEXT:    s_add_i32 s2, s2, s5
1822; GFX6-NEXT:    s_max_i32 s5, s3, 0
1823; GFX6-NEXT:    s_sub_i32 s6, s11, s6
1824; GFX6-NEXT:    s_sub_i32 s5, s10, s5
1825; GFX6-NEXT:    s_max_i32 s6, s6, s8
1826; GFX6-NEXT:    s_min_i32 s5, s6, s5
1827; GFX6-NEXT:    s_min_i32 s6, s4, 0
1828; GFX6-NEXT:    s_add_i32 s3, s3, s5
1829; GFX6-NEXT:    s_max_i32 s5, s4, 0
1830; GFX6-NEXT:    s_sub_i32 s6, s11, s6
1831; GFX6-NEXT:    s_sub_i32 s5, s10, s5
1832; GFX6-NEXT:    s_max_i32 s6, s6, s9
1833; GFX6-NEXT:    s_min_i32 s5, s6, s5
1834; GFX6-NEXT:    s_add_i32 s4, s4, s5
1835; GFX6-NEXT:    ; return to shader part epilog
1836;
1837; GFX8-LABEL: s_saddsat_v5i32:
1838; GFX8:       ; %bb.0:
1839; GFX8-NEXT:    s_brev_b32 s11, 1
1840; GFX8-NEXT:    s_min_i32 s13, s0, 0
1841; GFX8-NEXT:    s_brev_b32 s10, -2
1842; GFX8-NEXT:    s_max_i32 s12, s0, 0
1843; GFX8-NEXT:    s_sub_i32 s13, s11, s13
1844; GFX8-NEXT:    s_sub_i32 s12, s10, s12
1845; GFX8-NEXT:    s_max_i32 s5, s13, s5
1846; GFX8-NEXT:    s_min_i32 s5, s5, s12
1847; GFX8-NEXT:    s_min_i32 s12, s1, 0
1848; GFX8-NEXT:    s_add_i32 s0, s0, s5
1849; GFX8-NEXT:    s_max_i32 s5, s1, 0
1850; GFX8-NEXT:    s_sub_i32 s12, s11, s12
1851; GFX8-NEXT:    s_sub_i32 s5, s10, s5
1852; GFX8-NEXT:    s_max_i32 s6, s12, s6
1853; GFX8-NEXT:    s_min_i32 s5, s6, s5
1854; GFX8-NEXT:    s_min_i32 s6, s2, 0
1855; GFX8-NEXT:    s_add_i32 s1, s1, s5
1856; GFX8-NEXT:    s_max_i32 s5, s2, 0
1857; GFX8-NEXT:    s_sub_i32 s6, s11, s6
1858; GFX8-NEXT:    s_sub_i32 s5, s10, s5
1859; GFX8-NEXT:    s_max_i32 s6, s6, s7
1860; GFX8-NEXT:    s_min_i32 s5, s6, s5
1861; GFX8-NEXT:    s_min_i32 s6, s3, 0
1862; GFX8-NEXT:    s_add_i32 s2, s2, s5
1863; GFX8-NEXT:    s_max_i32 s5, s3, 0
1864; GFX8-NEXT:    s_sub_i32 s6, s11, s6
1865; GFX8-NEXT:    s_sub_i32 s5, s10, s5
1866; GFX8-NEXT:    s_max_i32 s6, s6, s8
1867; GFX8-NEXT:    s_min_i32 s5, s6, s5
1868; GFX8-NEXT:    s_min_i32 s6, s4, 0
1869; GFX8-NEXT:    s_add_i32 s3, s3, s5
1870; GFX8-NEXT:    s_max_i32 s5, s4, 0
1871; GFX8-NEXT:    s_sub_i32 s6, s11, s6
1872; GFX8-NEXT:    s_sub_i32 s5, s10, s5
1873; GFX8-NEXT:    s_max_i32 s6, s6, s9
1874; GFX8-NEXT:    s_min_i32 s5, s6, s5
1875; GFX8-NEXT:    s_add_i32 s4, s4, s5
1876; GFX8-NEXT:    ; return to shader part epilog
1877;
1878; GFX9-LABEL: s_saddsat_v5i32:
1879; GFX9:       ; %bb.0:
1880; GFX9-NEXT:    v_mov_b32_e32 v0, s5
1881; GFX9-NEXT:    v_mov_b32_e32 v1, s6
1882; GFX9-NEXT:    v_mov_b32_e32 v2, s7
1883; GFX9-NEXT:    v_mov_b32_e32 v3, s8
1884; GFX9-NEXT:    v_mov_b32_e32 v4, s9
1885; GFX9-NEXT:    v_add_i32 v0, s0, v0 clamp
1886; GFX9-NEXT:    v_add_i32 v1, s1, v1 clamp
1887; GFX9-NEXT:    v_add_i32 v2, s2, v2 clamp
1888; GFX9-NEXT:    v_add_i32 v3, s3, v3 clamp
1889; GFX9-NEXT:    v_add_i32 v4, s4, v4 clamp
1890; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1891; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
1892; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
1893; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
1894; GFX9-NEXT:    v_readfirstlane_b32 s4, v4
1895; GFX9-NEXT:    ; return to shader part epilog
1896;
1897; GFX10-LABEL: s_saddsat_v5i32:
1898; GFX10:       ; %bb.0:
1899; GFX10-NEXT:    v_add_nc_i32 v0, s0, s5 clamp
1900; GFX10-NEXT:    v_add_nc_i32 v1, s1, s6 clamp
1901; GFX10-NEXT:    v_add_nc_i32 v2, s2, s7 clamp
1902; GFX10-NEXT:    v_add_nc_i32 v3, s3, s8 clamp
1903; GFX10-NEXT:    v_add_nc_i32 v4, s4, s9 clamp
1904; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
1905; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
1906; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
1907; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
1908; GFX10-NEXT:    v_readfirstlane_b32 s4, v4
1909; GFX10-NEXT:    ; return to shader part epilog
1910  %result = call <5 x i32> @llvm.sadd.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs)
1911  ret <5 x i32> %result
1912}
1913
; Signed saturating add of two <16 x i32> vectors; the recorded output has lhs
; in v0-v15 and rhs in v16-v31.
; The tahiti and fiji runs record a per-lane min/max/sub clamp of the rhs
; followed by a plain add. Lanes 0-2 take the bounds from s4/s5; from lane 3
; on the bounds live in v16/v18 (set up with v_bfrev_b32 once rhs lanes 0 and 2
; have been consumed), with v17/v19 as per-lane temporaries.
; The gfx900 and gfx1010 runs record one clamped add per lane.
1914define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
1915; GFX6-LABEL: v_saddsat_v16i32:
1916; GFX6:       ; %bb.0:
1917; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1918; GFX6-NEXT:    s_brev_b32 s4, 1
1919; GFX6-NEXT:    v_min_i32_e32 v32, 0, v0
1920; GFX6-NEXT:    v_sub_i32_e32 v32, vcc, s4, v32
1921; GFX6-NEXT:    v_max_i32_e32 v16, v32, v16
1922; GFX6-NEXT:    s_brev_b32 s5, -2
1923; GFX6-NEXT:    v_max_i32_e32 v32, 0, v0
1924; GFX6-NEXT:    v_sub_i32_e32 v32, vcc, s5, v32
1925; GFX6-NEXT:    v_min_i32_e32 v16, v16, v32
1926; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v16
1927; GFX6-NEXT:    v_min_i32_e32 v16, 0, v1
1928; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, s4, v16
1929; GFX6-NEXT:    v_max_i32_e32 v16, v16, v17
1930; GFX6-NEXT:    v_max_i32_e32 v17, 0, v1
1931; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, s5, v17
1932; GFX6-NEXT:    v_min_i32_e32 v16, v16, v17
1933; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v16
1934; GFX6-NEXT:    v_min_i32_e32 v16, 0, v2
1935; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, s4, v16
1936; GFX6-NEXT:    v_max_i32_e32 v17, 0, v2
1937; GFX6-NEXT:    v_max_i32_e32 v16, v16, v18
1938; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, s5, v17
1939; GFX6-NEXT:    v_min_i32_e32 v16, v16, v17
1940; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v16
1941; GFX6-NEXT:    v_bfrev_b32_e32 v16, 1
1942; GFX6-NEXT:    v_min_i32_e32 v17, 0, v3
1943; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
1944; GFX6-NEXT:    v_max_i32_e32 v17, v17, v19
1945; GFX6-NEXT:    v_bfrev_b32_e32 v18, -2
1946; GFX6-NEXT:    v_max_i32_e32 v19, 0, v3
1947; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v18, v19
1948; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
1949; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v17
1950; GFX6-NEXT:    v_min_i32_e32 v17, 0, v4
1951; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
1952; GFX6-NEXT:    v_max_i32_e32 v19, 0, v4
1953; GFX6-NEXT:    v_max_i32_e32 v17, v17, v20
1954; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v18, v19
1955; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
1956; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v17
1957; GFX6-NEXT:    v_min_i32_e32 v17, 0, v5
1958; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
1959; GFX6-NEXT:    v_max_i32_e32 v19, 0, v5
1960; GFX6-NEXT:    v_max_i32_e32 v17, v17, v21
1961; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v18, v19
1962; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
1963; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v17
1964; GFX6-NEXT:    v_min_i32_e32 v17, 0, v6
1965; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
1966; GFX6-NEXT:    v_max_i32_e32 v19, 0, v6
1967; GFX6-NEXT:    v_max_i32_e32 v17, v17, v22
1968; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v18, v19
1969; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
1970; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v17
1971; GFX6-NEXT:    v_min_i32_e32 v17, 0, v7
1972; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
1973; GFX6-NEXT:    v_max_i32_e32 v19, 0, v7
1974; GFX6-NEXT:    v_max_i32_e32 v17, v17, v23
1975; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v18, v19
1976; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
1977; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v7, v17
1978; GFX6-NEXT:    v_min_i32_e32 v17, 0, v8
1979; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
1980; GFX6-NEXT:    v_max_i32_e32 v19, 0, v8
1981; GFX6-NEXT:    v_max_i32_e32 v17, v17, v24
1982; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v18, v19
1983; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
1984; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v8, v17
1985; GFX6-NEXT:    v_min_i32_e32 v17, 0, v9
1986; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
1987; GFX6-NEXT:    v_max_i32_e32 v19, 0, v9
1988; GFX6-NEXT:    v_max_i32_e32 v17, v17, v25
1989; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v18, v19
1990; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
1991; GFX6-NEXT:    v_add_i32_e32 v9, vcc, v9, v17
1992; GFX6-NEXT:    v_min_i32_e32 v17, 0, v10
1993; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
1994; GFX6-NEXT:    v_max_i32_e32 v19, 0, v10
1995; GFX6-NEXT:    v_max_i32_e32 v17, v17, v26
1996; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v18, v19
1997; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
1998; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v10, v17
1999; GFX6-NEXT:    v_min_i32_e32 v17, 0, v11
2000; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
2001; GFX6-NEXT:    v_max_i32_e32 v19, 0, v11
2002; GFX6-NEXT:    v_max_i32_e32 v17, v17, v27
2003; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v18, v19
2004; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
2005; GFX6-NEXT:    v_add_i32_e32 v11, vcc, v11, v17
2006; GFX6-NEXT:    v_min_i32_e32 v17, 0, v12
2007; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
2008; GFX6-NEXT:    v_max_i32_e32 v19, 0, v12
2009; GFX6-NEXT:    v_max_i32_e32 v17, v17, v28
2010; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v18, v19
2011; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
2012; GFX6-NEXT:    v_add_i32_e32 v12, vcc, v12, v17
2013; GFX6-NEXT:    v_min_i32_e32 v17, 0, v13
2014; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
2015; GFX6-NEXT:    v_max_i32_e32 v19, 0, v13
2016; GFX6-NEXT:    v_max_i32_e32 v17, v17, v29
2017; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v18, v19
2018; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
2019; GFX6-NEXT:    v_add_i32_e32 v13, vcc, v13, v17
2020; GFX6-NEXT:    v_min_i32_e32 v17, 0, v14
2021; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
2022; GFX6-NEXT:    v_max_i32_e32 v19, 0, v14
2023; GFX6-NEXT:    v_max_i32_e32 v17, v17, v30
2024; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v18, v19
2025; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
2026; GFX6-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
2027; GFX6-NEXT:    v_max_i32_e32 v17, 0, v15
2028; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v18, v17
2029; GFX6-NEXT:    v_min_i32_e32 v18, 0, v15
2030; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, v16, v18
2031; GFX6-NEXT:    v_max_i32_e32 v16, v16, v31
2032; GFX6-NEXT:    v_min_i32_e32 v16, v16, v17
2033; GFX6-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
2034; GFX6-NEXT:    s_setpc_b64 s[30:31]
2035;
2036; GFX8-LABEL: v_saddsat_v16i32:
2037; GFX8:       ; %bb.0:
2038; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2039; GFX8-NEXT:    s_brev_b32 s4, 1
2040; GFX8-NEXT:    v_min_i32_e32 v32, 0, v0
2041; GFX8-NEXT:    v_sub_u32_e32 v32, vcc, s4, v32
2042; GFX8-NEXT:    v_max_i32_e32 v16, v32, v16
2043; GFX8-NEXT:    s_brev_b32 s5, -2
2044; GFX8-NEXT:    v_max_i32_e32 v32, 0, v0
2045; GFX8-NEXT:    v_sub_u32_e32 v32, vcc, s5, v32
2046; GFX8-NEXT:    v_min_i32_e32 v16, v16, v32
2047; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v16
2048; GFX8-NEXT:    v_min_i32_e32 v16, 0, v1
2049; GFX8-NEXT:    v_sub_u32_e32 v16, vcc, s4, v16
2050; GFX8-NEXT:    v_max_i32_e32 v16, v16, v17
2051; GFX8-NEXT:    v_max_i32_e32 v17, 0, v1
2052; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, s5, v17
2053; GFX8-NEXT:    v_min_i32_e32 v16, v16, v17
2054; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v16
2055; GFX8-NEXT:    v_min_i32_e32 v16, 0, v2
2056; GFX8-NEXT:    v_sub_u32_e32 v16, vcc, s4, v16
2057; GFX8-NEXT:    v_max_i32_e32 v17, 0, v2
2058; GFX8-NEXT:    v_max_i32_e32 v16, v16, v18
2059; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, s5, v17
2060; GFX8-NEXT:    v_min_i32_e32 v16, v16, v17
2061; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v16
2062; GFX8-NEXT:    v_bfrev_b32_e32 v16, 1
2063; GFX8-NEXT:    v_min_i32_e32 v17, 0, v3
2064; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v16, v17
2065; GFX8-NEXT:    v_max_i32_e32 v17, v17, v19
2066; GFX8-NEXT:    v_bfrev_b32_e32 v18, -2
2067; GFX8-NEXT:    v_max_i32_e32 v19, 0, v3
2068; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v18, v19
2069; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
2070; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v17
2071; GFX8-NEXT:    v_min_i32_e32 v17, 0, v4
2072; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v16, v17
2073; GFX8-NEXT:    v_max_i32_e32 v19, 0, v4
2074; GFX8-NEXT:    v_max_i32_e32 v17, v17, v20
2075; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v18, v19
2076; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
2077; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v17
2078; GFX8-NEXT:    v_min_i32_e32 v17, 0, v5
2079; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v16, v17
2080; GFX8-NEXT:    v_max_i32_e32 v19, 0, v5
2081; GFX8-NEXT:    v_max_i32_e32 v17, v17, v21
2082; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v18, v19
2083; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
2084; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v17
2085; GFX8-NEXT:    v_min_i32_e32 v17, 0, v6
2086; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v16, v17
2087; GFX8-NEXT:    v_max_i32_e32 v19, 0, v6
2088; GFX8-NEXT:    v_max_i32_e32 v17, v17, v22
2089; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v18, v19
2090; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
2091; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v17
2092; GFX8-NEXT:    v_min_i32_e32 v17, 0, v7
2093; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v16, v17
2094; GFX8-NEXT:    v_max_i32_e32 v19, 0, v7
2095; GFX8-NEXT:    v_max_i32_e32 v17, v17, v23
2096; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v18, v19
2097; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
2098; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v17
2099; GFX8-NEXT:    v_min_i32_e32 v17, 0, v8
2100; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v16, v17
2101; GFX8-NEXT:    v_max_i32_e32 v19, 0, v8
2102; GFX8-NEXT:    v_max_i32_e32 v17, v17, v24
2103; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v18, v19
2104; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
2105; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v17
2106; GFX8-NEXT:    v_min_i32_e32 v17, 0, v9
2107; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v16, v17
2108; GFX8-NEXT:    v_max_i32_e32 v19, 0, v9
2109; GFX8-NEXT:    v_max_i32_e32 v17, v17, v25
2110; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v18, v19
2111; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
2112; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v17
2113; GFX8-NEXT:    v_min_i32_e32 v17, 0, v10
2114; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v16, v17
2115; GFX8-NEXT:    v_max_i32_e32 v19, 0, v10
2116; GFX8-NEXT:    v_max_i32_e32 v17, v17, v26
2117; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v18, v19
2118; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
2119; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v17
2120; GFX8-NEXT:    v_min_i32_e32 v17, 0, v11
2121; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v16, v17
2122; GFX8-NEXT:    v_max_i32_e32 v19, 0, v11
2123; GFX8-NEXT:    v_max_i32_e32 v17, v17, v27
2124; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v18, v19
2125; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
2126; GFX8-NEXT:    v_add_u32_e32 v11, vcc, v11, v17
2127; GFX8-NEXT:    v_min_i32_e32 v17, 0, v12
2128; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v16, v17
2129; GFX8-NEXT:    v_max_i32_e32 v19, 0, v12
2130; GFX8-NEXT:    v_max_i32_e32 v17, v17, v28
2131; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v18, v19
2132; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
2133; GFX8-NEXT:    v_add_u32_e32 v12, vcc, v12, v17
2134; GFX8-NEXT:    v_min_i32_e32 v17, 0, v13
2135; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v16, v17
2136; GFX8-NEXT:    v_max_i32_e32 v19, 0, v13
2137; GFX8-NEXT:    v_max_i32_e32 v17, v17, v29
2138; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v18, v19
2139; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
2140; GFX8-NEXT:    v_add_u32_e32 v13, vcc, v13, v17
2141; GFX8-NEXT:    v_min_i32_e32 v17, 0, v14
2142; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v16, v17
2143; GFX8-NEXT:    v_max_i32_e32 v19, 0, v14
2144; GFX8-NEXT:    v_max_i32_e32 v17, v17, v30
2145; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v18, v19
2146; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
2147; GFX8-NEXT:    v_add_u32_e32 v14, vcc, v14, v17
2148; GFX8-NEXT:    v_max_i32_e32 v17, 0, v15
2149; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v18, v17
2150; GFX8-NEXT:    v_min_i32_e32 v18, 0, v15
2151; GFX8-NEXT:    v_sub_u32_e32 v16, vcc, v16, v18
2152; GFX8-NEXT:    v_max_i32_e32 v16, v16, v31
2153; GFX8-NEXT:    v_min_i32_e32 v16, v16, v17
2154; GFX8-NEXT:    v_add_u32_e32 v15, vcc, v15, v16
2155; GFX8-NEXT:    s_setpc_b64 s[30:31]
2156;
2157; GFX9-LABEL: v_saddsat_v16i32:
2158; GFX9:       ; %bb.0:
2159; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2160; GFX9-NEXT:    v_add_i32 v0, v0, v16 clamp
2161; GFX9-NEXT:    v_add_i32 v1, v1, v17 clamp
2162; GFX9-NEXT:    v_add_i32 v2, v2, v18 clamp
2163; GFX9-NEXT:    v_add_i32 v3, v3, v19 clamp
2164; GFX9-NEXT:    v_add_i32 v4, v4, v20 clamp
2165; GFX9-NEXT:    v_add_i32 v5, v5, v21 clamp
2166; GFX9-NEXT:    v_add_i32 v6, v6, v22 clamp
2167; GFX9-NEXT:    v_add_i32 v7, v7, v23 clamp
2168; GFX9-NEXT:    v_add_i32 v8, v8, v24 clamp
2169; GFX9-NEXT:    v_add_i32 v9, v9, v25 clamp
2170; GFX9-NEXT:    v_add_i32 v10, v10, v26 clamp
2171; GFX9-NEXT:    v_add_i32 v11, v11, v27 clamp
2172; GFX9-NEXT:    v_add_i32 v12, v12, v28 clamp
2173; GFX9-NEXT:    v_add_i32 v13, v13, v29 clamp
2174; GFX9-NEXT:    v_add_i32 v14, v14, v30 clamp
2175; GFX9-NEXT:    v_add_i32 v15, v15, v31 clamp
2176; GFX9-NEXT:    s_setpc_b64 s[30:31]
2177;
2178; GFX10-LABEL: v_saddsat_v16i32:
2179; GFX10:       ; %bb.0:
2180; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2181; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2182; GFX10-NEXT:    v_add_nc_i32 v0, v0, v16 clamp
2183; GFX10-NEXT:    v_add_nc_i32 v1, v1, v17 clamp
2184; GFX10-NEXT:    v_add_nc_i32 v2, v2, v18 clamp
2185; GFX10-NEXT:    v_add_nc_i32 v3, v3, v19 clamp
2186; GFX10-NEXT:    v_add_nc_i32 v4, v4, v20 clamp
2187; GFX10-NEXT:    v_add_nc_i32 v5, v5, v21 clamp
2188; GFX10-NEXT:    v_add_nc_i32 v6, v6, v22 clamp
2189; GFX10-NEXT:    v_add_nc_i32 v7, v7, v23 clamp
2190; GFX10-NEXT:    v_add_nc_i32 v8, v8, v24 clamp
2191; GFX10-NEXT:    v_add_nc_i32 v9, v9, v25 clamp
2192; GFX10-NEXT:    v_add_nc_i32 v10, v10, v26 clamp
2193; GFX10-NEXT:    v_add_nc_i32 v11, v11, v27 clamp
2194; GFX10-NEXT:    v_add_nc_i32 v12, v12, v28 clamp
2195; GFX10-NEXT:    v_add_nc_i32 v13, v13, v29 clamp
2196; GFX10-NEXT:    v_add_nc_i32 v14, v14, v30 clamp
2197; GFX10-NEXT:    v_add_nc_i32 v15, v15, v31 clamp
2198; GFX10-NEXT:    s_setpc_b64 s[30:31]
2199  %result = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
2200  ret <16 x i32> %result
2201}
2202
; <16 x i32> signed saturating add in an amdgpu_ps function with both operands
; marked inreg; the checks show lhs in s0-s15 and rhs in s16-s31.
; On GFX6 and GFX8 the expected output handles each element with an
; s_min_i32/s_max_i32/s_sub_i32 clamp of the rhs followed by s_add_i32.
; On GFX9 the rhs is first copied into VGPRs, then one clamped v_add_i32 per
; element is expected; on GFX10 v_add_nc_i32 takes both SGPR operands directly.
; Both of the latter copy the results back to s0-s15 with v_readfirstlane_b32.
2203define amdgpu_ps <16 x i32> @s_saddsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> inreg %rhs) {
2204; GFX6-LABEL: s_saddsat_v16i32:
2205; GFX6:       ; %bb.0:
2206; GFX6-NEXT:    s_brev_b32 s33, 1
2207; GFX6-NEXT:    s_min_i32 s35, s0, 0
2208; GFX6-NEXT:    s_brev_b32 s32, -2
2209; GFX6-NEXT:    s_max_i32 s34, s0, 0
2210; GFX6-NEXT:    s_sub_i32 s35, s33, s35
2211; GFX6-NEXT:    s_sub_i32 s34, s32, s34
2212; GFX6-NEXT:    s_max_i32 s16, s35, s16
2213; GFX6-NEXT:    s_min_i32 s16, s16, s34
2214; GFX6-NEXT:    s_min_i32 s34, s1, 0
2215; GFX6-NEXT:    s_add_i32 s0, s0, s16
2216; GFX6-NEXT:    s_max_i32 s16, s1, 0
2217; GFX6-NEXT:    s_sub_i32 s34, s33, s34
2218; GFX6-NEXT:    s_sub_i32 s16, s32, s16
2219; GFX6-NEXT:    s_max_i32 s17, s34, s17
2220; GFX6-NEXT:    s_min_i32 s16, s17, s16
2221; GFX6-NEXT:    s_min_i32 s17, s2, 0
2222; GFX6-NEXT:    s_add_i32 s1, s1, s16
2223; GFX6-NEXT:    s_max_i32 s16, s2, 0
2224; GFX6-NEXT:    s_sub_i32 s17, s33, s17
2225; GFX6-NEXT:    s_sub_i32 s16, s32, s16
2226; GFX6-NEXT:    s_max_i32 s17, s17, s18
2227; GFX6-NEXT:    s_min_i32 s16, s17, s16
2228; GFX6-NEXT:    s_min_i32 s17, s3, 0
2229; GFX6-NEXT:    s_add_i32 s2, s2, s16
2230; GFX6-NEXT:    s_max_i32 s16, s3, 0
2231; GFX6-NEXT:    s_sub_i32 s17, s33, s17
2232; GFX6-NEXT:    s_sub_i32 s16, s32, s16
2233; GFX6-NEXT:    s_max_i32 s17, s17, s19
2234; GFX6-NEXT:    s_min_i32 s16, s17, s16
2235; GFX6-NEXT:    s_min_i32 s17, s4, 0
2236; GFX6-NEXT:    s_add_i32 s3, s3, s16
2237; GFX6-NEXT:    s_max_i32 s16, s4, 0
2238; GFX6-NEXT:    s_sub_i32 s17, s33, s17
2239; GFX6-NEXT:    s_sub_i32 s16, s32, s16
2240; GFX6-NEXT:    s_max_i32 s17, s17, s20
2241; GFX6-NEXT:    s_min_i32 s16, s17, s16
2242; GFX6-NEXT:    s_min_i32 s17, s5, 0
2243; GFX6-NEXT:    s_add_i32 s4, s4, s16
2244; GFX6-NEXT:    s_max_i32 s16, s5, 0
2245; GFX6-NEXT:    s_sub_i32 s17, s33, s17
2246; GFX6-NEXT:    s_sub_i32 s16, s32, s16
2247; GFX6-NEXT:    s_max_i32 s17, s17, s21
2248; GFX6-NEXT:    s_min_i32 s16, s17, s16
2249; GFX6-NEXT:    s_min_i32 s17, s6, 0
2250; GFX6-NEXT:    s_add_i32 s5, s5, s16
2251; GFX6-NEXT:    s_max_i32 s16, s6, 0
2252; GFX6-NEXT:    s_sub_i32 s17, s33, s17
2253; GFX6-NEXT:    s_sub_i32 s16, s32, s16
2254; GFX6-NEXT:    s_max_i32 s17, s17, s22
2255; GFX6-NEXT:    s_min_i32 s16, s17, s16
2256; GFX6-NEXT:    s_min_i32 s17, s7, 0
2257; GFX6-NEXT:    s_add_i32 s6, s6, s16
2258; GFX6-NEXT:    s_max_i32 s16, s7, 0
2259; GFX6-NEXT:    s_sub_i32 s17, s33, s17
2260; GFX6-NEXT:    s_sub_i32 s16, s32, s16
2261; GFX6-NEXT:    s_max_i32 s17, s17, s23
2262; GFX6-NEXT:    s_min_i32 s16, s17, s16
2263; GFX6-NEXT:    s_min_i32 s17, s8, 0
2264; GFX6-NEXT:    s_add_i32 s7, s7, s16
2265; GFX6-NEXT:    s_max_i32 s16, s8, 0
2266; GFX6-NEXT:    s_sub_i32 s17, s33, s17
2267; GFX6-NEXT:    s_sub_i32 s16, s32, s16
2268; GFX6-NEXT:    s_max_i32 s17, s17, s24
2269; GFX6-NEXT:    s_min_i32 s16, s17, s16
2270; GFX6-NEXT:    s_min_i32 s17, s9, 0
2271; GFX6-NEXT:    s_add_i32 s8, s8, s16
2272; GFX6-NEXT:    s_max_i32 s16, s9, 0
2273; GFX6-NEXT:    s_sub_i32 s17, s33, s17
2274; GFX6-NEXT:    s_sub_i32 s16, s32, s16
2275; GFX6-NEXT:    s_max_i32 s17, s17, s25
2276; GFX6-NEXT:    s_min_i32 s16, s17, s16
2277; GFX6-NEXT:    s_min_i32 s17, s10, 0
2278; GFX6-NEXT:    s_add_i32 s9, s9, s16
2279; GFX6-NEXT:    s_max_i32 s16, s10, 0
2280; GFX6-NEXT:    s_sub_i32 s17, s33, s17
2281; GFX6-NEXT:    s_sub_i32 s16, s32, s16
2282; GFX6-NEXT:    s_max_i32 s17, s17, s26
2283; GFX6-NEXT:    s_min_i32 s16, s17, s16
2284; GFX6-NEXT:    s_min_i32 s17, s11, 0
2285; GFX6-NEXT:    s_add_i32 s10, s10, s16
2286; GFX6-NEXT:    s_max_i32 s16, s11, 0
2287; GFX6-NEXT:    s_sub_i32 s17, s33, s17
2288; GFX6-NEXT:    s_sub_i32 s16, s32, s16
2289; GFX6-NEXT:    s_max_i32 s17, s17, s27
2290; GFX6-NEXT:    s_min_i32 s16, s17, s16
2291; GFX6-NEXT:    s_min_i32 s17, s12, 0
2292; GFX6-NEXT:    s_add_i32 s11, s11, s16
2293; GFX6-NEXT:    s_max_i32 s16, s12, 0
2294; GFX6-NEXT:    s_sub_i32 s17, s33, s17
2295; GFX6-NEXT:    s_sub_i32 s16, s32, s16
2296; GFX6-NEXT:    s_max_i32 s17, s17, s28
2297; GFX6-NEXT:    s_min_i32 s16, s17, s16
2298; GFX6-NEXT:    s_min_i32 s17, s13, 0
2299; GFX6-NEXT:    s_add_i32 s12, s12, s16
2300; GFX6-NEXT:    s_max_i32 s16, s13, 0
2301; GFX6-NEXT:    s_sub_i32 s17, s33, s17
2302; GFX6-NEXT:    s_sub_i32 s16, s32, s16
2303; GFX6-NEXT:    s_max_i32 s17, s17, s29
2304; GFX6-NEXT:    s_min_i32 s16, s17, s16
2305; GFX6-NEXT:    s_min_i32 s17, s14, 0
2306; GFX6-NEXT:    s_add_i32 s13, s13, s16
2307; GFX6-NEXT:    s_max_i32 s16, s14, 0
2308; GFX6-NEXT:    s_sub_i32 s17, s33, s17
2309; GFX6-NEXT:    s_sub_i32 s16, s32, s16
2310; GFX6-NEXT:    s_max_i32 s17, s17, s30
2311; GFX6-NEXT:    s_min_i32 s16, s17, s16
2312; GFX6-NEXT:    s_min_i32 s17, s15, 0
2313; GFX6-NEXT:    s_add_i32 s14, s14, s16
2314; GFX6-NEXT:    s_max_i32 s16, s15, 0
2315; GFX6-NEXT:    s_sub_i32 s17, s33, s17
2316; GFX6-NEXT:    s_sub_i32 s16, s32, s16
2317; GFX6-NEXT:    s_max_i32 s17, s17, s31
2318; GFX6-NEXT:    s_min_i32 s16, s17, s16
2319; GFX6-NEXT:    s_add_i32 s15, s15, s16
2320; GFX6-NEXT:    ; return to shader part epilog
2321;
2322; GFX8-LABEL: s_saddsat_v16i32:
2323; GFX8:       ; %bb.0:
2324; GFX8-NEXT:    s_brev_b32 s33, 1
2325; GFX8-NEXT:    s_min_i32 s35, s0, 0
2326; GFX8-NEXT:    s_brev_b32 s32, -2
2327; GFX8-NEXT:    s_max_i32 s34, s0, 0
2328; GFX8-NEXT:    s_sub_i32 s35, s33, s35
2329; GFX8-NEXT:    s_sub_i32 s34, s32, s34
2330; GFX8-NEXT:    s_max_i32 s16, s35, s16
2331; GFX8-NEXT:    s_min_i32 s16, s16, s34
2332; GFX8-NEXT:    s_min_i32 s34, s1, 0
2333; GFX8-NEXT:    s_add_i32 s0, s0, s16
2334; GFX8-NEXT:    s_max_i32 s16, s1, 0
2335; GFX8-NEXT:    s_sub_i32 s34, s33, s34
2336; GFX8-NEXT:    s_sub_i32 s16, s32, s16
2337; GFX8-NEXT:    s_max_i32 s17, s34, s17
2338; GFX8-NEXT:    s_min_i32 s16, s17, s16
2339; GFX8-NEXT:    s_min_i32 s17, s2, 0
2340; GFX8-NEXT:    s_add_i32 s1, s1, s16
2341; GFX8-NEXT:    s_max_i32 s16, s2, 0
2342; GFX8-NEXT:    s_sub_i32 s17, s33, s17
2343; GFX8-NEXT:    s_sub_i32 s16, s32, s16
2344; GFX8-NEXT:    s_max_i32 s17, s17, s18
2345; GFX8-NEXT:    s_min_i32 s16, s17, s16
2346; GFX8-NEXT:    s_min_i32 s17, s3, 0
2347; GFX8-NEXT:    s_add_i32 s2, s2, s16
2348; GFX8-NEXT:    s_max_i32 s16, s3, 0
2349; GFX8-NEXT:    s_sub_i32 s17, s33, s17
2350; GFX8-NEXT:    s_sub_i32 s16, s32, s16
2351; GFX8-NEXT:    s_max_i32 s17, s17, s19
2352; GFX8-NEXT:    s_min_i32 s16, s17, s16
2353; GFX8-NEXT:    s_min_i32 s17, s4, 0
2354; GFX8-NEXT:    s_add_i32 s3, s3, s16
2355; GFX8-NEXT:    s_max_i32 s16, s4, 0
2356; GFX8-NEXT:    s_sub_i32 s17, s33, s17
2357; GFX8-NEXT:    s_sub_i32 s16, s32, s16
2358; GFX8-NEXT:    s_max_i32 s17, s17, s20
2359; GFX8-NEXT:    s_min_i32 s16, s17, s16
2360; GFX8-NEXT:    s_min_i32 s17, s5, 0
2361; GFX8-NEXT:    s_add_i32 s4, s4, s16
2362; GFX8-NEXT:    s_max_i32 s16, s5, 0
2363; GFX8-NEXT:    s_sub_i32 s17, s33, s17
2364; GFX8-NEXT:    s_sub_i32 s16, s32, s16
2365; GFX8-NEXT:    s_max_i32 s17, s17, s21
2366; GFX8-NEXT:    s_min_i32 s16, s17, s16
2367; GFX8-NEXT:    s_min_i32 s17, s6, 0
2368; GFX8-NEXT:    s_add_i32 s5, s5, s16
2369; GFX8-NEXT:    s_max_i32 s16, s6, 0
2370; GFX8-NEXT:    s_sub_i32 s17, s33, s17
2371; GFX8-NEXT:    s_sub_i32 s16, s32, s16
2372; GFX8-NEXT:    s_max_i32 s17, s17, s22
2373; GFX8-NEXT:    s_min_i32 s16, s17, s16
2374; GFX8-NEXT:    s_min_i32 s17, s7, 0
2375; GFX8-NEXT:    s_add_i32 s6, s6, s16
2376; GFX8-NEXT:    s_max_i32 s16, s7, 0
2377; GFX8-NEXT:    s_sub_i32 s17, s33, s17
2378; GFX8-NEXT:    s_sub_i32 s16, s32, s16
2379; GFX8-NEXT:    s_max_i32 s17, s17, s23
2380; GFX8-NEXT:    s_min_i32 s16, s17, s16
2381; GFX8-NEXT:    s_min_i32 s17, s8, 0
2382; GFX8-NEXT:    s_add_i32 s7, s7, s16
2383; GFX8-NEXT:    s_max_i32 s16, s8, 0
2384; GFX8-NEXT:    s_sub_i32 s17, s33, s17
2385; GFX8-NEXT:    s_sub_i32 s16, s32, s16
2386; GFX8-NEXT:    s_max_i32 s17, s17, s24
2387; GFX8-NEXT:    s_min_i32 s16, s17, s16
2388; GFX8-NEXT:    s_min_i32 s17, s9, 0
2389; GFX8-NEXT:    s_add_i32 s8, s8, s16
2390; GFX8-NEXT:    s_max_i32 s16, s9, 0
2391; GFX8-NEXT:    s_sub_i32 s17, s33, s17
2392; GFX8-NEXT:    s_sub_i32 s16, s32, s16
2393; GFX8-NEXT:    s_max_i32 s17, s17, s25
2394; GFX8-NEXT:    s_min_i32 s16, s17, s16
2395; GFX8-NEXT:    s_min_i32 s17, s10, 0
2396; GFX8-NEXT:    s_add_i32 s9, s9, s16
2397; GFX8-NEXT:    s_max_i32 s16, s10, 0
2398; GFX8-NEXT:    s_sub_i32 s17, s33, s17
2399; GFX8-NEXT:    s_sub_i32 s16, s32, s16
2400; GFX8-NEXT:    s_max_i32 s17, s17, s26
2401; GFX8-NEXT:    s_min_i32 s16, s17, s16
2402; GFX8-NEXT:    s_min_i32 s17, s11, 0
2403; GFX8-NEXT:    s_add_i32 s10, s10, s16
2404; GFX8-NEXT:    s_max_i32 s16, s11, 0
2405; GFX8-NEXT:    s_sub_i32 s17, s33, s17
2406; GFX8-NEXT:    s_sub_i32 s16, s32, s16
2407; GFX8-NEXT:    s_max_i32 s17, s17, s27
2408; GFX8-NEXT:    s_min_i32 s16, s17, s16
2409; GFX8-NEXT:    s_min_i32 s17, s12, 0
2410; GFX8-NEXT:    s_add_i32 s11, s11, s16
2411; GFX8-NEXT:    s_max_i32 s16, s12, 0
2412; GFX8-NEXT:    s_sub_i32 s17, s33, s17
2413; GFX8-NEXT:    s_sub_i32 s16, s32, s16
2414; GFX8-NEXT:    s_max_i32 s17, s17, s28
2415; GFX8-NEXT:    s_min_i32 s16, s17, s16
2416; GFX8-NEXT:    s_min_i32 s17, s13, 0
2417; GFX8-NEXT:    s_add_i32 s12, s12, s16
2418; GFX8-NEXT:    s_max_i32 s16, s13, 0
2419; GFX8-NEXT:    s_sub_i32 s17, s33, s17
2420; GFX8-NEXT:    s_sub_i32 s16, s32, s16
2421; GFX8-NEXT:    s_max_i32 s17, s17, s29
2422; GFX8-NEXT:    s_min_i32 s16, s17, s16
2423; GFX8-NEXT:    s_min_i32 s17, s14, 0
2424; GFX8-NEXT:    s_add_i32 s13, s13, s16
2425; GFX8-NEXT:    s_max_i32 s16, s14, 0
2426; GFX8-NEXT:    s_sub_i32 s17, s33, s17
2427; GFX8-NEXT:    s_sub_i32 s16, s32, s16
2428; GFX8-NEXT:    s_max_i32 s17, s17, s30
2429; GFX8-NEXT:    s_min_i32 s16, s17, s16
2430; GFX8-NEXT:    s_min_i32 s17, s15, 0
2431; GFX8-NEXT:    s_add_i32 s14, s14, s16
2432; GFX8-NEXT:    s_max_i32 s16, s15, 0
2433; GFX8-NEXT:    s_sub_i32 s17, s33, s17
2434; GFX8-NEXT:    s_sub_i32 s16, s32, s16
2435; GFX8-NEXT:    s_max_i32 s17, s17, s31
2436; GFX8-NEXT:    s_min_i32 s16, s17, s16
2437; GFX8-NEXT:    s_add_i32 s15, s15, s16
2438; GFX8-NEXT:    ; return to shader part epilog
2439;
2440; GFX9-LABEL: s_saddsat_v16i32:
2441; GFX9:       ; %bb.0:
2442; GFX9-NEXT:    v_mov_b32_e32 v0, s16
2443; GFX9-NEXT:    v_mov_b32_e32 v1, s17
2444; GFX9-NEXT:    v_mov_b32_e32 v2, s18
2445; GFX9-NEXT:    v_mov_b32_e32 v3, s19
2446; GFX9-NEXT:    v_mov_b32_e32 v4, s20
2447; GFX9-NEXT:    v_mov_b32_e32 v5, s21
2448; GFX9-NEXT:    v_mov_b32_e32 v6, s22
2449; GFX9-NEXT:    v_mov_b32_e32 v7, s23
2450; GFX9-NEXT:    v_mov_b32_e32 v8, s24
2451; GFX9-NEXT:    v_mov_b32_e32 v9, s25
2452; GFX9-NEXT:    v_mov_b32_e32 v10, s26
2453; GFX9-NEXT:    v_mov_b32_e32 v11, s27
2454; GFX9-NEXT:    v_mov_b32_e32 v12, s28
2455; GFX9-NEXT:    v_mov_b32_e32 v13, s29
2456; GFX9-NEXT:    v_mov_b32_e32 v14, s30
2457; GFX9-NEXT:    v_mov_b32_e32 v15, s31
2458; GFX9-NEXT:    v_add_i32 v0, s0, v0 clamp
2459; GFX9-NEXT:    v_add_i32 v1, s1, v1 clamp
2460; GFX9-NEXT:    v_add_i32 v2, s2, v2 clamp
2461; GFX9-NEXT:    v_add_i32 v3, s3, v3 clamp
2462; GFX9-NEXT:    v_add_i32 v4, s4, v4 clamp
2463; GFX9-NEXT:    v_add_i32 v5, s5, v5 clamp
2464; GFX9-NEXT:    v_add_i32 v6, s6, v6 clamp
2465; GFX9-NEXT:    v_add_i32 v7, s7, v7 clamp
2466; GFX9-NEXT:    v_add_i32 v8, s8, v8 clamp
2467; GFX9-NEXT:    v_add_i32 v9, s9, v9 clamp
2468; GFX9-NEXT:    v_add_i32 v10, s10, v10 clamp
2469; GFX9-NEXT:    v_add_i32 v11, s11, v11 clamp
2470; GFX9-NEXT:    v_add_i32 v12, s12, v12 clamp
2471; GFX9-NEXT:    v_add_i32 v13, s13, v13 clamp
2472; GFX9-NEXT:    v_add_i32 v14, s14, v14 clamp
2473; GFX9-NEXT:    v_add_i32 v15, s15, v15 clamp
2474; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
2475; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
2476; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
2477; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
2478; GFX9-NEXT:    v_readfirstlane_b32 s4, v4
2479; GFX9-NEXT:    v_readfirstlane_b32 s5, v5
2480; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
2481; GFX9-NEXT:    v_readfirstlane_b32 s7, v7
2482; GFX9-NEXT:    v_readfirstlane_b32 s8, v8
2483; GFX9-NEXT:    v_readfirstlane_b32 s9, v9
2484; GFX9-NEXT:    v_readfirstlane_b32 s10, v10
2485; GFX9-NEXT:    v_readfirstlane_b32 s11, v11
2486; GFX9-NEXT:    v_readfirstlane_b32 s12, v12
2487; GFX9-NEXT:    v_readfirstlane_b32 s13, v13
2488; GFX9-NEXT:    v_readfirstlane_b32 s14, v14
2489; GFX9-NEXT:    v_readfirstlane_b32 s15, v15
2490; GFX9-NEXT:    ; return to shader part epilog
2491;
2492; GFX10-LABEL: s_saddsat_v16i32:
2493; GFX10:       ; %bb.0:
2494; GFX10-NEXT:    v_add_nc_i32 v0, s0, s16 clamp
2495; GFX10-NEXT:    v_add_nc_i32 v1, s1, s17 clamp
2496; GFX10-NEXT:    v_add_nc_i32 v2, s2, s18 clamp
2497; GFX10-NEXT:    v_add_nc_i32 v3, s3, s19 clamp
2498; GFX10-NEXT:    v_add_nc_i32 v4, s4, s20 clamp
2499; GFX10-NEXT:    v_add_nc_i32 v5, s5, s21 clamp
2500; GFX10-NEXT:    v_add_nc_i32 v6, s6, s22 clamp
2501; GFX10-NEXT:    v_add_nc_i32 v7, s7, s23 clamp
2502; GFX10-NEXT:    v_add_nc_i32 v8, s8, s24 clamp
2503; GFX10-NEXT:    v_add_nc_i32 v9, s9, s25 clamp
2504; GFX10-NEXT:    v_add_nc_i32 v10, s10, s26 clamp
2505; GFX10-NEXT:    v_add_nc_i32 v11, s11, s27 clamp
2506; GFX10-NEXT:    v_add_nc_i32 v12, s12, s28 clamp
2507; GFX10-NEXT:    v_add_nc_i32 v13, s13, s29 clamp
2508; GFX10-NEXT:    v_add_nc_i32 v14, s14, s30 clamp
2509; GFX10-NEXT:    v_add_nc_i32 v15, s15, s31 clamp
2510; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
2511; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
2512; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
2513; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
2514; GFX10-NEXT:    v_readfirstlane_b32 s4, v4
2515; GFX10-NEXT:    v_readfirstlane_b32 s5, v5
2516; GFX10-NEXT:    v_readfirstlane_b32 s6, v6
2517; GFX10-NEXT:    v_readfirstlane_b32 s7, v7
2518; GFX10-NEXT:    v_readfirstlane_b32 s8, v8
2519; GFX10-NEXT:    v_readfirstlane_b32 s9, v9
2520; GFX10-NEXT:    v_readfirstlane_b32 s10, v10
2521; GFX10-NEXT:    v_readfirstlane_b32 s11, v11
2522; GFX10-NEXT:    v_readfirstlane_b32 s12, v12
2523; GFX10-NEXT:    v_readfirstlane_b32 s13, v13
2524; GFX10-NEXT:    v_readfirstlane_b32 s14, v14
2525; GFX10-NEXT:    v_readfirstlane_b32 s15, v15
2526; GFX10-NEXT:    ; return to shader part epilog
2527  %result = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
2528  ret <16 x i32> %result
2529}
2530
; i16 signed saturating add with the default calling convention; the checks
; show lhs in v0 and rhs in v1.
; On GFX6 the expected output shifts both operands left by 16, clamps and adds
; in 32 bits, then shifts the sum back with an arithmetic right shift.
; On GFX8 the clamp is expected in 16-bit min/max/sub instructions, and on
; GFX9 and GFX10 a single clamped 16-bit add is expected.
2531define i16 @v_saddsat_i16(i16 %lhs, i16 %rhs) {
2532; GFX6-LABEL: v_saddsat_i16:
2533; GFX6:       ; %bb.0:
2534; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2535; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2536; GFX6-NEXT:    v_min_i32_e32 v3, 0, v0
2537; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2538; GFX6-NEXT:    v_max_i32_e32 v2, 0, v0
2539; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0x80000000, v3
2540; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 0x7fffffff, v2
2541; GFX6-NEXT:    v_max_i32_e32 v1, v3, v1
2542; GFX6-NEXT:    v_min_i32_e32 v1, v1, v2
2543; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
2544; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2545; GFX6-NEXT:    s_setpc_b64 s[30:31]
2546;
2547; GFX8-LABEL: v_saddsat_i16:
2548; GFX8:       ; %bb.0:
2549; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2550; GFX8-NEXT:    v_min_i16_e32 v3, 0, v0
2551; GFX8-NEXT:    v_max_i16_e32 v2, 0, v0
2552; GFX8-NEXT:    v_sub_u16_e32 v3, 0x8000, v3
2553; GFX8-NEXT:    v_sub_u16_e32 v2, 0x7fff, v2
2554; GFX8-NEXT:    v_max_i16_e32 v1, v3, v1
2555; GFX8-NEXT:    v_min_i16_e32 v1, v1, v2
2556; GFX8-NEXT:    v_add_u16_e32 v0, v0, v1
2557; GFX8-NEXT:    s_setpc_b64 s[30:31]
2558;
2559; GFX9-LABEL: v_saddsat_i16:
2560; GFX9:       ; %bb.0:
2561; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2562; GFX9-NEXT:    v_add_i16 v0, v0, v1 clamp
2563; GFX9-NEXT:    s_setpc_b64 s[30:31]
2564;
2565; GFX10-LABEL: v_saddsat_i16:
2566; GFX10:       ; %bb.0:
2567; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2568; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2569; GFX10-NEXT:    v_add_nc_i16 v0, v0, v1 clamp
2570; GFX10-NEXT:    s_setpc_b64 s[30:31]
2571  %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs)
2572  ret i16 %result
2573}
2574
; i16 signed saturating add in an amdgpu_ps function with both operands marked
; inreg; the checks show lhs in s0 and rhs in s1.
; On GFX6 the expected output shifts the operands left by 16 and clamps with
; 32-bit scalar min/max/sub before shifting the sum back.
; On GFX8 the operands are sign-extended with s_sext_i32_i16 and clamped with
; 32-bit scalar min/max.
; On GFX9 the rhs is copied to v0 for a clamped v_add_i16; on GFX10
; v_add_nc_i16 takes both SGPRs directly. Both read the result back into s0
; with v_readfirstlane_b32.
2575define amdgpu_ps i16 @s_saddsat_i16(i16 inreg %lhs, i16 inreg %rhs) {
2576; GFX6-LABEL: s_saddsat_i16:
2577; GFX6:       ; %bb.0:
2578; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
2579; GFX6-NEXT:    s_min_i32 s3, s0, 0
2580; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
2581; GFX6-NEXT:    s_max_i32 s2, s0, 0
2582; GFX6-NEXT:    s_sub_i32 s3, 0x80000000, s3
2583; GFX6-NEXT:    s_sub_i32 s2, 0x7fffffff, s2
2584; GFX6-NEXT:    s_max_i32 s1, s3, s1
2585; GFX6-NEXT:    s_min_i32 s1, s1, s2
2586; GFX6-NEXT:    s_add_i32 s0, s0, s1
2587; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
2588; GFX6-NEXT:    ; return to shader part epilog
2589;
2590; GFX8-LABEL: s_saddsat_i16:
2591; GFX8:       ; %bb.0:
2592; GFX8-NEXT:    s_sext_i32_i16 s2, s0
2593; GFX8-NEXT:    s_sext_i32_i16 s3, 0
2594; GFX8-NEXT:    s_max_i32 s4, s2, s3
2595; GFX8-NEXT:    s_min_i32 s2, s2, s3
2596; GFX8-NEXT:    s_sub_i32 s2, 0xffff8000, s2
2597; GFX8-NEXT:    s_sext_i32_i16 s2, s2
2598; GFX8-NEXT:    s_sext_i32_i16 s1, s1
2599; GFX8-NEXT:    s_sub_i32 s4, 0x7fff, s4
2600; GFX8-NEXT:    s_max_i32 s1, s2, s1
2601; GFX8-NEXT:    s_sext_i32_i16 s1, s1
2602; GFX8-NEXT:    s_sext_i32_i16 s2, s4
2603; GFX8-NEXT:    s_min_i32 s1, s1, s2
2604; GFX8-NEXT:    s_add_i32 s0, s0, s1
2605; GFX8-NEXT:    ; return to shader part epilog
2606;
2607; GFX9-LABEL: s_saddsat_i16:
2608; GFX9:       ; %bb.0:
2609; GFX9-NEXT:    v_mov_b32_e32 v0, s1
2610; GFX9-NEXT:    v_add_i16 v0, s0, v0 clamp
2611; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
2612; GFX9-NEXT:    ; return to shader part epilog
2613;
2614; GFX10-LABEL: s_saddsat_i16:
2615; GFX10:       ; %bb.0:
2616; GFX10-NEXT:    v_add_nc_i16 v0, s0, s1 clamp
2617; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
2618; GFX10-NEXT:    ; return to shader part epilog
2619  %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs)
2620  ret i16 %result
2621}
2622
; i16 signed saturating add with an inreg lhs and a plain rhs; the checks
; show lhs in s0 and rhs in v0.
; The i16 result is bitcast to half for the return, and the checks show the
; final value being produced in v0.
; On GFX6 and GFX8 the expected output computes the clamp bounds from the lhs
; with scalar instructions and applies them to the rhs with vector min/max;
; on GFX9 and GFX10 a single clamped 16-bit add is expected.
2623define amdgpu_ps half @saddsat_i16_sv(i16 inreg %lhs, i16 %rhs) {
2624; GFX6-LABEL: saddsat_i16_sv:
2625; GFX6:       ; %bb.0:
2626; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
2627; GFX6-NEXT:    s_min_i32 s2, s0, 0
2628; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2629; GFX6-NEXT:    s_max_i32 s1, s0, 0
2630; GFX6-NEXT:    s_sub_i32 s2, 0x80000000, s2
2631; GFX6-NEXT:    s_sub_i32 s1, 0x7fffffff, s1
2632; GFX6-NEXT:    v_max_i32_e32 v0, s2, v0
2633; GFX6-NEXT:    v_min_i32_e32 v0, s1, v0
2634; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
2635; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2636; GFX6-NEXT:    ; return to shader part epilog
2637;
2638; GFX8-LABEL: saddsat_i16_sv:
2639; GFX8:       ; %bb.0:
2640; GFX8-NEXT:    s_sext_i32_i16 s1, s0
2641; GFX8-NEXT:    s_sext_i32_i16 s2, 0
2642; GFX8-NEXT:    s_max_i32 s3, s1, s2
2643; GFX8-NEXT:    s_min_i32 s1, s1, s2
2644; GFX8-NEXT:    s_sub_i32 s1, 0xffff8000, s1
2645; GFX8-NEXT:    s_sub_i32 s3, 0x7fff, s3
2646; GFX8-NEXT:    v_max_i16_e32 v0, s1, v0
2647; GFX8-NEXT:    v_min_i16_e32 v0, s3, v0
2648; GFX8-NEXT:    v_add_u16_e32 v0, s0, v0
2649; GFX8-NEXT:    ; return to shader part epilog
2650;
2651; GFX9-LABEL: saddsat_i16_sv:
2652; GFX9:       ; %bb.0:
2653; GFX9-NEXT:    v_add_i16 v0, s0, v0 clamp
2654; GFX9-NEXT:    ; return to shader part epilog
2655;
2656; GFX10-LABEL: saddsat_i16_sv:
2657; GFX10:       ; %bb.0:
2658; GFX10-NEXT:    v_add_nc_i16 v0, s0, v0 clamp
2659; GFX10-NEXT:    ; return to shader part epilog
2660  %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs)
2661  %cast = bitcast i16 %result to half
2662  ret half %cast
2663}
2664
; i16 signed saturating add with a plain lhs and an inreg rhs; the checks
; show lhs in v0 and rhs in s0.
; The i16 result is bitcast to half for the return, and the checks show the
; final value being produced in v0.
; On GFX6 and GFX8 the expected output computes the clamp bounds from the lhs
; with vector instructions and uses the SGPR rhs as a min/max operand; on GFX9
; and GFX10 a single clamped 16-bit add is expected.
2665define amdgpu_ps half @saddsat_i16_vs(i16 %lhs, i16 inreg %rhs) {
2666; GFX6-LABEL: saddsat_i16_vs:
2667; GFX6:       ; %bb.0:
2668; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2669; GFX6-NEXT:    v_min_i32_e32 v2, 0, v0
2670; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
2671; GFX6-NEXT:    v_max_i32_e32 v1, 0, v0
2672; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 0x80000000, v2
2673; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, 0x7fffffff, v1
2674; GFX6-NEXT:    v_max_i32_e32 v2, s0, v2
2675; GFX6-NEXT:    v_min_i32_e32 v1, v2, v1
2676; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
2677; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2678; GFX6-NEXT:    ; return to shader part epilog
2679;
2680; GFX8-LABEL: saddsat_i16_vs:
2681; GFX8:       ; %bb.0:
2682; GFX8-NEXT:    v_min_i16_e32 v2, 0, v0
2683; GFX8-NEXT:    v_max_i16_e32 v1, 0, v0
2684; GFX8-NEXT:    v_sub_u16_e32 v2, 0x8000, v2
2685; GFX8-NEXT:    v_sub_u16_e32 v1, 0x7fff, v1
2686; GFX8-NEXT:    v_max_i16_e32 v2, s0, v2
2687; GFX8-NEXT:    v_min_i16_e32 v1, v2, v1
2688; GFX8-NEXT:    v_add_u16_e32 v0, v0, v1
2689; GFX8-NEXT:    ; return to shader part epilog
2690;
2691; GFX9-LABEL: saddsat_i16_vs:
2692; GFX9:       ; %bb.0:
2693; GFX9-NEXT:    v_add_i16 v0, v0, s0 clamp
2694; GFX9-NEXT:    ; return to shader part epilog
2695;
2696; GFX10-LABEL: saddsat_i16_vs:
2697; GFX10:       ; %bb.0:
2698; GFX10-NEXT:    v_add_nc_i16 v0, v0, s0 clamp
2699; GFX10-NEXT:    ; return to shader part epilog
2700  %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs)
2701  %cast = bitcast i16 %result to half
2702  ret half %cast
2703}
2704
; <2 x i16> signed saturating add with the default calling convention.
; On GFX6 the checks show each element in its own register (lhs in v0 and v1,
; rhs in v2 and v3), each clamped in 32 bits after a shift left by 16.
; On GFX8 the checks show lhs in v0 and rhs in v1; the high half is extracted
; with a shift and SDWA operand selects, and the result is rejoined with v_or_b32.
; On GFX9 and GFX10 a single clamped v_pk_add_i16 is expected.
2705define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
2706; GFX6-LABEL: v_saddsat_v2i16:
2707; GFX6:       ; %bb.0:
2708; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2709; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2710; GFX6-NEXT:    s_brev_b32 s5, 1
2711; GFX6-NEXT:    v_min_i32_e32 v5, 0, v0
2712; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2713; GFX6-NEXT:    s_brev_b32 s4, -2
2714; GFX6-NEXT:    v_max_i32_e32 v4, 0, v0
2715; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s5, v5
2716; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s4, v4
2717; GFX6-NEXT:    v_max_i32_e32 v2, v5, v2
2718; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2719; GFX6-NEXT:    v_min_i32_e32 v2, v2, v4
2720; GFX6-NEXT:    v_min_i32_e32 v4, 0, v1
2721; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
2722; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
2723; GFX6-NEXT:    v_max_i32_e32 v3, 0, v1
2724; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s5, v4
2725; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s4, v3
2726; GFX6-NEXT:    v_max_i32_e32 v2, v4, v2
2727; GFX6-NEXT:    v_min_i32_e32 v2, v2, v3
2728; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
2729; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2730; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
2731; GFX6-NEXT:    s_setpc_b64 s[30:31]
2732;
2733; GFX8-LABEL: v_saddsat_v2i16:
2734; GFX8:       ; %bb.0:
2735; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2736; GFX8-NEXT:    s_movk_i32 s5, 0x8000
2737; GFX8-NEXT:    v_min_i16_e32 v4, 0, v0
2738; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
2739; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
2740; GFX8-NEXT:    v_max_i16_e32 v3, 0, v0
2741; GFX8-NEXT:    v_sub_u16_e32 v4, s5, v4
2742; GFX8-NEXT:    v_sub_u16_e32 v3, s4, v3
2743; GFX8-NEXT:    v_max_i16_e32 v4, v4, v1
2744; GFX8-NEXT:    v_min_i16_e32 v5, 0, v2
2745; GFX8-NEXT:    v_min_i16_e32 v3, v4, v3
2746; GFX8-NEXT:    v_max_i16_e32 v4, 0, v2
2747; GFX8-NEXT:    v_sub_u16_e32 v5, s5, v5
2748; GFX8-NEXT:    v_sub_u16_e32 v4, s4, v4
2749; GFX8-NEXT:    v_max_i16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2750; GFX8-NEXT:    v_min_i16_e32 v1, v1, v4
2751; GFX8-NEXT:    v_add_u16_e32 v0, v0, v3
2752; GFX8-NEXT:    v_add_u16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2753; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
2754; GFX8-NEXT:    s_setpc_b64 s[30:31]
2755;
2756; GFX9-LABEL: v_saddsat_v2i16:
2757; GFX9:       ; %bb.0:
2758; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2759; GFX9-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
2760; GFX9-NEXT:    s_setpc_b64 s[30:31]
2761;
2762; GFX10-LABEL: v_saddsat_v2i16:
2763; GFX10:       ; %bb.0:
2764; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2765; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2766; GFX10-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
2767; GFX10-NEXT:    s_setpc_b64 s[30:31]
2768  %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
2769  ret <2 x i16> %result
2770}
2771
; <2 x i16> signed saturating add in an amdgpu_ps function with both operands
; marked inreg; the vector result is bitcast to i32 for the return.
; On GFX6 the checks show each element in its own SGPR (lhs in s0 and s1, rhs
; in s2 and s3); the two sums are masked with 0xffff and merged into s0.
; On GFX8 the checks show lhs in s0 and rhs in s1; the high halves are split
; off with s_lshr_b32, each half is sign-extended and clamped in 32 bits, and
; the halves are recombined with s_bfe_u32, s_lshl_b32 and s_or_b32.
; On GFX9 and GFX10 a clamped v_pk_add_i16 followed by v_readfirstlane_b32 is
; expected; GFX9 first copies the rhs into v0.
2772define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) {
2773; GFX6-LABEL: s_saddsat_v2i16:
2774; GFX6:       ; %bb.0:
2775; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
2776; GFX6-NEXT:    s_brev_b32 s5, 1
2777; GFX6-NEXT:    s_min_i32 s7, s0, 0
2778; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
2779; GFX6-NEXT:    s_brev_b32 s4, -2
2780; GFX6-NEXT:    s_max_i32 s6, s0, 0
2781; GFX6-NEXT:    s_sub_i32 s7, s5, s7
2782; GFX6-NEXT:    s_sub_i32 s6, s4, s6
2783; GFX6-NEXT:    s_max_i32 s2, s7, s2
2784; GFX6-NEXT:    s_min_i32 s2, s2, s6
2785; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
2786; GFX6-NEXT:    s_add_i32 s0, s0, s2
2787; GFX6-NEXT:    s_lshl_b32 s2, s3, 16
2788; GFX6-NEXT:    s_max_i32 s3, s1, 0
2789; GFX6-NEXT:    s_sub_i32 s3, s4, s3
2790; GFX6-NEXT:    s_min_i32 s4, s1, 0
2791; GFX6-NEXT:    s_sub_i32 s4, s5, s4
2792; GFX6-NEXT:    s_max_i32 s2, s4, s2
2793; GFX6-NEXT:    s_min_i32 s2, s2, s3
2794; GFX6-NEXT:    s_add_i32 s1, s1, s2
2795; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
2796; GFX6-NEXT:    s_mov_b32 s2, 0xffff
2797; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
2798; GFX6-NEXT:    s_and_b32 s1, s1, s2
2799; GFX6-NEXT:    s_and_b32 s0, s0, s2
2800; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
2801; GFX6-NEXT:    s_or_b32 s0, s0, s1
2802; GFX6-NEXT:    ; return to shader part epilog
2803;
2804; GFX8-LABEL: s_saddsat_v2i16:
2805; GFX8:       ; %bb.0:
2806; GFX8-NEXT:    s_sext_i32_i16 s6, s0
2807; GFX8-NEXT:    s_sext_i32_i16 s7, 0
2808; GFX8-NEXT:    s_movk_i32 s5, 0x8000
2809; GFX8-NEXT:    s_max_i32 s8, s6, s7
2810; GFX8-NEXT:    s_min_i32 s6, s6, s7
2811; GFX8-NEXT:    s_sub_i32 s6, s5, s6
2812; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
2813; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
2814; GFX8-NEXT:    s_sext_i32_i16 s6, s6
2815; GFX8-NEXT:    s_sext_i32_i16 s1, s1
2816; GFX8-NEXT:    s_sub_i32 s8, s4, s8
2817; GFX8-NEXT:    s_max_i32 s1, s6, s1
2818; GFX8-NEXT:    s_sext_i32_i16 s1, s1
2819; GFX8-NEXT:    s_sext_i32_i16 s6, s8
2820; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
2821; GFX8-NEXT:    s_min_i32 s1, s1, s6
2822; GFX8-NEXT:    s_add_i32 s0, s0, s1
2823; GFX8-NEXT:    s_sext_i32_i16 s1, s2
2824; GFX8-NEXT:    s_max_i32 s6, s1, s7
2825; GFX8-NEXT:    s_min_i32 s1, s1, s7
2826; GFX8-NEXT:    s_sub_i32 s1, s5, s1
2827; GFX8-NEXT:    s_sext_i32_i16 s1, s1
2828; GFX8-NEXT:    s_sext_i32_i16 s3, s3
2829; GFX8-NEXT:    s_sub_i32 s4, s4, s6
2830; GFX8-NEXT:    s_max_i32 s1, s1, s3
2831; GFX8-NEXT:    s_sext_i32_i16 s1, s1
2832; GFX8-NEXT:    s_sext_i32_i16 s3, s4
2833; GFX8-NEXT:    s_min_i32 s1, s1, s3
2834; GFX8-NEXT:    s_add_i32 s2, s2, s1
2835; GFX8-NEXT:    s_bfe_u32 s1, s2, 0x100000
2836; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
2837; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
2838; GFX8-NEXT:    s_or_b32 s0, s0, s1
2839; GFX8-NEXT:    ; return to shader part epilog
2840;
2841; GFX9-LABEL: s_saddsat_v2i16:
2842; GFX9:       ; %bb.0:
2843; GFX9-NEXT:    v_mov_b32_e32 v0, s1
2844; GFX9-NEXT:    v_pk_add_i16 v0, s0, v0 clamp
2845; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
2846; GFX9-NEXT:    ; return to shader part epilog
2847;
2848; GFX10-LABEL: s_saddsat_v2i16:
2849; GFX10:       ; %bb.0:
2850; GFX10-NEXT:    v_pk_add_i16 v0, s0, s1 clamp
2851; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
2852; GFX10-NEXT:    ; return to shader part epilog
2853  %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
2854  %cast = bitcast <2 x i16> %result to i32
2855  ret i32 %cast
2856}
2857
; <2 x i16> signed saturating add with an inreg lhs and a plain rhs; the
; vector result is bitcast to float for the return, and the checks show the
; final value being produced in v0.
; On GFX6 the checks show lhs elements in s0 and s1 and rhs elements in v0 and
; v1; the clamp bounds are computed with scalar instructions, applied with
; vector min/max, and the two sums are masked and merged into v0.
; On GFX8 the checks show lhs in s0 and rhs in v0, with SDWA operand selects
; handling the high half before the final v_or_b32.
; On GFX9 and GFX10 a single clamped v_pk_add_i16 is expected.
2858define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
2859; GFX6-LABEL: saddsat_v2i16_sv:
2860; GFX6:       ; %bb.0:
2861; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
2862; GFX6-NEXT:    s_brev_b32 s3, 1
2863; GFX6-NEXT:    s_min_i32 s5, s0, 0
2864; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2865; GFX6-NEXT:    s_brev_b32 s2, -2
2866; GFX6-NEXT:    s_max_i32 s4, s0, 0
2867; GFX6-NEXT:    s_sub_i32 s5, s3, s5
2868; GFX6-NEXT:    s_sub_i32 s4, s2, s4
2869; GFX6-NEXT:    v_max_i32_e32 v0, s5, v0
2870; GFX6-NEXT:    v_min_i32_e32 v0, s4, v0
2871; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
2872; GFX6-NEXT:    s_lshl_b32 s0, s1, 16
2873; GFX6-NEXT:    s_max_i32 s1, s0, 0
2874; GFX6-NEXT:    s_sub_i32 s1, s2, s1
2875; GFX6-NEXT:    s_min_i32 s2, s0, 0
2876; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2877; GFX6-NEXT:    s_sub_i32 s2, s3, s2
2878; GFX6-NEXT:    v_max_i32_e32 v1, s2, v1
2879; GFX6-NEXT:    v_min_i32_e32 v1, s1, v1
2880; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s0, v1
2881; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
2882; GFX6-NEXT:    s_mov_b32 s0, 0xffff
2883; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2884; GFX6-NEXT:    v_and_b32_e32 v1, s0, v1
2885; GFX6-NEXT:    v_and_b32_e32 v0, s0, v0
2886; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2887; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
2888; GFX6-NEXT:    ; return to shader part epilog
2889;
2890; GFX8-LABEL: saddsat_v2i16_sv:
2891; GFX8:       ; %bb.0:
2892; GFX8-NEXT:    s_sext_i32_i16 s4, s0
2893; GFX8-NEXT:    s_sext_i32_i16 s5, 0
2894; GFX8-NEXT:    s_movk_i32 s3, 0x8000
2895; GFX8-NEXT:    s_max_i32 s6, s4, s5
2896; GFX8-NEXT:    s_min_i32 s4, s4, s5
2897; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
2898; GFX8-NEXT:    s_movk_i32 s2, 0x7fff
2899; GFX8-NEXT:    s_sub_i32 s4, s3, s4
2900; GFX8-NEXT:    s_sub_i32 s6, s2, s6
2901; GFX8-NEXT:    v_max_i16_e32 v1, s4, v0
2902; GFX8-NEXT:    s_sext_i32_i16 s4, s1
2903; GFX8-NEXT:    v_min_i16_e32 v1, s6, v1
2904; GFX8-NEXT:    s_max_i32 s6, s4, s5
2905; GFX8-NEXT:    s_min_i32 s4, s4, s5
2906; GFX8-NEXT:    s_sub_i32 s3, s3, s4
2907; GFX8-NEXT:    v_mov_b32_e32 v2, s3
2908; GFX8-NEXT:    s_sub_i32 s2, s2, s6
2909; GFX8-NEXT:    v_max_i16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2910; GFX8-NEXT:    v_min_i16_e32 v0, s2, v0
2911; GFX8-NEXT:    v_mov_b32_e32 v2, s1
2912; GFX8-NEXT:    v_add_u16_e32 v1, s0, v1
2913; GFX8-NEXT:    v_add_u16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2914; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
2915; GFX8-NEXT:    ; return to shader part epilog
2916;
2917; GFX9-LABEL: saddsat_v2i16_sv:
2918; GFX9:       ; %bb.0:
2919; GFX9-NEXT:    v_pk_add_i16 v0, s0, v0 clamp
2920; GFX9-NEXT:    ; return to shader part epilog
2921;
2922; GFX10-LABEL: saddsat_v2i16_sv:
2923; GFX10:       ; %bb.0:
2924; GFX10-NEXT:    v_pk_add_i16 v0, s0, v0 clamp
2925; GFX10-NEXT:    ; return to shader part epilog
2926  %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
2927  %cast = bitcast <2 x i16> %result to float
2928  ret float %cast
2929}
2930
; <2 x i16> signed saturating add with a plain lhs and an inreg rhs; the
; vector result is bitcast to float for the return, and the checks show the
; final value being produced in v0.
; On GFX6 the checks show lhs elements in v0 and v1 and rhs elements in s0 and
; s1; the clamp bounds are computed with vector instructions, the shifted SGPR
; rhs is used as a v_max_i32 operand, and the sums are masked and merged into v0.
; On GFX8 the checks show lhs in v0 and rhs in s0, with the rhs high half
; extracted by s_lshr_b32 and the halves rejoined with an SDWA add and v_or_b32.
; On GFX9 and GFX10 a single clamped v_pk_add_i16 is expected.
2931define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
2932; GFX6-LABEL: saddsat_v2i16_vs:
2933; GFX6:       ; %bb.0:
2934; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2935; GFX6-NEXT:    s_brev_b32 s3, 1
2936; GFX6-NEXT:    v_min_i32_e32 v3, 0, v0
2937; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
2938; GFX6-NEXT:    s_brev_b32 s2, -2
2939; GFX6-NEXT:    v_max_i32_e32 v2, 0, v0
2940; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s3, v3
2941; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
2942; GFX6-NEXT:    v_max_i32_e32 v3, s0, v3
2943; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2944; GFX6-NEXT:    v_min_i32_e32 v2, v3, v2
2945; GFX6-NEXT:    v_min_i32_e32 v3, 0, v1
2946; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
2947; GFX6-NEXT:    s_lshl_b32 s0, s1, 16
2948; GFX6-NEXT:    v_max_i32_e32 v2, 0, v1
2949; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s3, v3
2950; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
2951; GFX6-NEXT:    v_max_i32_e32 v3, s0, v3
2952; GFX6-NEXT:    v_min_i32_e32 v2, v3, v2
2953; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
2954; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
2955; GFX6-NEXT:    s_mov_b32 s0, 0xffff
2956; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2957; GFX6-NEXT:    v_and_b32_e32 v1, s0, v1
2958; GFX6-NEXT:    v_and_b32_e32 v0, s0, v0
2959; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2960; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
2961; GFX6-NEXT:    ; return to shader part epilog
2962;
2963; GFX8-LABEL: saddsat_v2i16_vs:
2964; GFX8:       ; %bb.0:
2965; GFX8-NEXT:    s_movk_i32 s3, 0x8000
2966; GFX8-NEXT:    v_min_i16_e32 v3, 0, v0
2967; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
2968; GFX8-NEXT:    s_movk_i32 s2, 0x7fff
2969; GFX8-NEXT:    v_max_i16_e32 v2, 0, v0
2970; GFX8-NEXT:    v_sub_u16_e32 v3, s3, v3
2971; GFX8-NEXT:    v_sub_u16_e32 v2, s2, v2
2972; GFX8-NEXT:    v_max_i16_e32 v3, s0, v3
2973; GFX8-NEXT:    v_min_i16_e32 v4, 0, v1
2974; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
2975; GFX8-NEXT:    v_min_i16_e32 v2, v3, v2
2976; GFX8-NEXT:    v_max_i16_e32 v3, 0, v1
2977; GFX8-NEXT:    v_sub_u16_e32 v4, s3, v4
2978; GFX8-NEXT:    v_sub_u16_e32 v3, s2, v3
2979; GFX8-NEXT:    v_max_i16_e32 v4, s1, v4
2980; GFX8-NEXT:    v_min_i16_e32 v3, v4, v3
2981; GFX8-NEXT:    v_add_u16_e32 v0, v0, v2
2982; GFX8-NEXT:    v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2983; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
2984; GFX8-NEXT:    ; return to shader part epilog
2985;
2986; GFX9-LABEL: saddsat_v2i16_vs:
2987; GFX9:       ; %bb.0:
2988; GFX9-NEXT:    v_pk_add_i16 v0, v0, s0 clamp
2989; GFX9-NEXT:    ; return to shader part epilog
2990;
2991; GFX10-LABEL: saddsat_v2i16_vs:
2992; GFX10:       ; %bb.0:
2993; GFX10-NEXT:    v_pk_add_i16 v0, v0, s0 clamp
2994; GFX10-NEXT:    ; return to shader part epilog
2995  %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
2996  %cast = bitcast <2 x i16> %result to float
2997  ret float %cast
2998}
2999
3000; FIXME: v3i16 insert/extract
3001; define <3 x i16> @v_saddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
3002;   %result = call <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
3003;   ret <3 x i16> %result
3004; }
3005
3006; define amdgpu_ps <3 x i16> @s_saddsat_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs) {
3007;   %result = call <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
3008;   ret <3 x i16> %result
3009; }
3010
; Signed saturating add of <4 x i16> using the default calling convention;
; the result is bitcast to <2 x float>. The expected output reads both
; operands from VGPRs and returns with s_setpc_b64 s[30:31]. tahiti handles
; the four lanes one at a time in 32-bit registers (shift left by 16, clamp
; with min/max, add, arithmetic shift right, repack), fiji works on 16-bit
; halves with SDWA selecting the high word, and gfx900/gfx1010 need only
; two v_pk_add_i16 instructions with the clamp modifier.
3011define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
3012; GFX6-LABEL: v_saddsat_v4i16:
3013; GFX6:       ; %bb.0:
3014; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3015; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
3016; GFX6-NEXT:    s_brev_b32 s5, 1
3017; GFX6-NEXT:    v_min_i32_e32 v10, 0, v0
3018; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
3019; GFX6-NEXT:    s_brev_b32 s4, -2
3020; GFX6-NEXT:    v_max_i32_e32 v8, 0, v0
3021; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, s5, v10
3022; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s4, v8
3023; GFX6-NEXT:    v_max_i32_e32 v4, v10, v4
3024; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3025; GFX6-NEXT:    v_min_i32_e32 v4, v4, v8
3026; GFX6-NEXT:    v_min_i32_e32 v8, 0, v1
3027; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
3028; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
3029; GFX6-NEXT:    v_max_i32_e32 v5, 0, v1
3030; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s5, v8
3031; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s4, v5
3032; GFX6-NEXT:    v_max_i32_e32 v4, v8, v4
3033; GFX6-NEXT:    v_min_i32_e32 v4, v4, v5
3034; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3035; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
3036; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
3037; GFX6-NEXT:    v_min_i32_e32 v6, 0, v2
3038; GFX6-NEXT:    v_bfrev_b32_e32 v9, -2
3039; GFX6-NEXT:    v_max_i32_e32 v5, 0, v2
3040; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s5, v6
3041; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v9, v5
3042; GFX6-NEXT:    v_max_i32_e32 v4, v6, v4
3043; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3044; GFX6-NEXT:    v_bfrev_b32_e32 v11, 1
3045; GFX6-NEXT:    v_min_i32_e32 v4, v4, v5
3046; GFX6-NEXT:    v_min_i32_e32 v6, 0, v3
3047; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
3048; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
3049; GFX6-NEXT:    v_max_i32_e32 v5, 0, v3
3050; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v11, v6
3051; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v9, v5
3052; GFX6-NEXT:    v_max_i32_e32 v4, v6, v4
3053; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
3054; GFX6-NEXT:    v_min_i32_e32 v4, v4, v5
3055; GFX6-NEXT:    s_mov_b32 s4, 0xffff
3056; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
3057; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
3058; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
3059; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
3060; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
3061; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
3062; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3063; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
3064; GFX6-NEXT:    v_and_b32_e32 v1, s4, v2
3065; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
3066; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3067; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
3068; GFX6-NEXT:    s_setpc_b64 s[30:31]
3069;
3070; GFX8-LABEL: v_saddsat_v4i16:
3071; GFX8:       ; %bb.0:
3072; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3073; GFX8-NEXT:    s_movk_i32 s5, 0x8000
3074; GFX8-NEXT:    v_min_i16_e32 v7, 0, v0
3075; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
3076; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
3077; GFX8-NEXT:    v_max_i16_e32 v6, 0, v0
3078; GFX8-NEXT:    v_sub_u16_e32 v7, s5, v7
3079; GFX8-NEXT:    v_sub_u16_e32 v6, s4, v6
3080; GFX8-NEXT:    v_max_i16_e32 v7, v7, v2
3081; GFX8-NEXT:    v_min_i16_e32 v8, 0, v4
3082; GFX8-NEXT:    v_min_i16_e32 v6, v7, v6
3083; GFX8-NEXT:    v_max_i16_e32 v7, 0, v4
3084; GFX8-NEXT:    v_sub_u16_e32 v8, s5, v8
3085; GFX8-NEXT:    v_sub_u16_e32 v7, s4, v7
3086; GFX8-NEXT:    v_max_i16_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3087; GFX8-NEXT:    v_min_i16_e32 v8, 0, v1
3088; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
3089; GFX8-NEXT:    v_min_i16_e32 v2, v2, v7
3090; GFX8-NEXT:    v_max_i16_e32 v7, 0, v1
3091; GFX8-NEXT:    v_sub_u16_e32 v8, s5, v8
3092; GFX8-NEXT:    v_sub_u16_e32 v7, s4, v7
3093; GFX8-NEXT:    v_max_i16_e32 v8, v8, v3
3094; GFX8-NEXT:    v_min_i16_e32 v9, 0, v5
3095; GFX8-NEXT:    v_min_i16_e32 v7, v8, v7
3096; GFX8-NEXT:    v_max_i16_e32 v8, 0, v5
3097; GFX8-NEXT:    v_sub_u16_e32 v9, s5, v9
3098; GFX8-NEXT:    v_sub_u16_e32 v8, s4, v8
3099; GFX8-NEXT:    v_max_i16_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3100; GFX8-NEXT:    v_min_i16_e32 v3, v3, v8
3101; GFX8-NEXT:    v_add_u16_e32 v0, v0, v6
3102; GFX8-NEXT:    v_add_u16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3103; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
3104; GFX8-NEXT:    v_add_u16_e32 v1, v1, v7
3105; GFX8-NEXT:    v_add_u16_sdwa v2, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3106; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
3107; GFX8-NEXT:    s_setpc_b64 s[30:31]
3108;
3109; GFX9-LABEL: v_saddsat_v4i16:
3110; GFX9:       ; %bb.0:
3111; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3112; GFX9-NEXT:    v_pk_add_i16 v0, v0, v2 clamp
3113; GFX9-NEXT:    v_pk_add_i16 v1, v1, v3 clamp
3114; GFX9-NEXT:    s_setpc_b64 s[30:31]
3115;
3116; GFX10-LABEL: v_saddsat_v4i16:
3117; GFX10:       ; %bb.0:
3118; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3119; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3120; GFX10-NEXT:    v_pk_add_i16 v0, v0, v2 clamp
3121; GFX10-NEXT:    v_pk_add_i16 v1, v1, v3 clamp
3122; GFX10-NEXT:    s_setpc_b64 s[30:31]
; The <4 x i16> result is bitcast to <2 x float>; the expected sequences
; above leave the two packed halves in v0 and v1.
3123  %result = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
3124  %cast = bitcast <4 x i16> %result to <2 x float>
3125  ret <2 x float> %cast
3126}
3127
; Pixel-shader variant of the <4 x i16> signed saturating add with both
; operands marked inreg; the result is bitcast to <2 x i32> and the expected
; output leaves it in s0/s1. tahiti and fiji stay entirely on scalar
; min/max/sub/add instructions (fiji sign-extends the 16-bit values with
; s_sext_i32_i16 first and repacks with s_bfe_u32). gfx900 copies %rhs into
; VGPRs before v_pk_add_i16, gfx1010 passes both SGPR operands directly, and
; both read the result back to SGPRs with v_readfirstlane_b32.
3128define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs) {
3129; GFX6-LABEL: s_saddsat_v4i16:
3130; GFX6:       ; %bb.0:
3131; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
3132; GFX6-NEXT:    s_brev_b32 s9, 1
3133; GFX6-NEXT:    s_min_i32 s11, s0, 0
3134; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
3135; GFX6-NEXT:    s_brev_b32 s8, -2
3136; GFX6-NEXT:    s_max_i32 s10, s0, 0
3137; GFX6-NEXT:    s_sub_i32 s11, s9, s11
3138; GFX6-NEXT:    s_sub_i32 s10, s8, s10
3139; GFX6-NEXT:    s_max_i32 s4, s11, s4
3140; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
3141; GFX6-NEXT:    s_min_i32 s4, s4, s10
3142; GFX6-NEXT:    s_min_i32 s10, s1, 0
3143; GFX6-NEXT:    s_add_i32 s0, s0, s4
3144; GFX6-NEXT:    s_lshl_b32 s4, s5, 16
3145; GFX6-NEXT:    s_max_i32 s5, s1, 0
3146; GFX6-NEXT:    s_sub_i32 s10, s9, s10
3147; GFX6-NEXT:    s_sub_i32 s5, s8, s5
3148; GFX6-NEXT:    s_max_i32 s4, s10, s4
3149; GFX6-NEXT:    s_min_i32 s4, s4, s5
3150; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
3151; GFX6-NEXT:    s_add_i32 s1, s1, s4
3152; GFX6-NEXT:    s_lshl_b32 s4, s6, 16
3153; GFX6-NEXT:    s_min_i32 s6, s2, 0
3154; GFX6-NEXT:    s_max_i32 s5, s2, 0
3155; GFX6-NEXT:    s_sub_i32 s6, s9, s6
3156; GFX6-NEXT:    s_sub_i32 s5, s8, s5
3157; GFX6-NEXT:    s_max_i32 s4, s6, s4
3158; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
3159; GFX6-NEXT:    s_min_i32 s4, s4, s5
3160; GFX6-NEXT:    s_min_i32 s6, s3, 0
3161; GFX6-NEXT:    s_add_i32 s2, s2, s4
3162; GFX6-NEXT:    s_lshl_b32 s4, s7, 16
3163; GFX6-NEXT:    s_max_i32 s5, s3, 0
3164; GFX6-NEXT:    s_sub_i32 s6, s9, s6
3165; GFX6-NEXT:    s_sub_i32 s5, s8, s5
3166; GFX6-NEXT:    s_max_i32 s4, s6, s4
3167; GFX6-NEXT:    s_min_i32 s4, s4, s5
3168; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
3169; GFX6-NEXT:    s_add_i32 s3, s3, s4
3170; GFX6-NEXT:    s_mov_b32 s4, 0xffff
3171; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
3172; GFX6-NEXT:    s_and_b32 s1, s1, s4
3173; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
3174; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
3175; GFX6-NEXT:    s_and_b32 s0, s0, s4
3176; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
3177; GFX6-NEXT:    s_or_b32 s0, s0, s1
3178; GFX6-NEXT:    s_and_b32 s1, s2, s4
3179; GFX6-NEXT:    s_and_b32 s2, s3, s4
3180; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
3181; GFX6-NEXT:    s_or_b32 s1, s1, s2
3182; GFX6-NEXT:    ; return to shader part epilog
3183;
3184; GFX8-LABEL: s_saddsat_v4i16:
3185; GFX8:       ; %bb.0:
3186; GFX8-NEXT:    s_sext_i32_i16 s10, s0
3187; GFX8-NEXT:    s_sext_i32_i16 s11, 0
3188; GFX8-NEXT:    s_movk_i32 s9, 0x8000
3189; GFX8-NEXT:    s_max_i32 s12, s10, s11
3190; GFX8-NEXT:    s_min_i32 s10, s10, s11
3191; GFX8-NEXT:    s_sub_i32 s10, s9, s10
3192; GFX8-NEXT:    s_lshr_b32 s6, s2, 16
3193; GFX8-NEXT:    s_movk_i32 s8, 0x7fff
3194; GFX8-NEXT:    s_sext_i32_i16 s10, s10
3195; GFX8-NEXT:    s_sext_i32_i16 s2, s2
3196; GFX8-NEXT:    s_sub_i32 s12, s8, s12
3197; GFX8-NEXT:    s_max_i32 s2, s10, s2
3198; GFX8-NEXT:    s_sext_i32_i16 s2, s2
3199; GFX8-NEXT:    s_sext_i32_i16 s10, s12
3200; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
3201; GFX8-NEXT:    s_min_i32 s2, s2, s10
3202; GFX8-NEXT:    s_add_i32 s0, s0, s2
3203; GFX8-NEXT:    s_sext_i32_i16 s2, s4
3204; GFX8-NEXT:    s_max_i32 s10, s2, s11
3205; GFX8-NEXT:    s_min_i32 s2, s2, s11
3206; GFX8-NEXT:    s_sub_i32 s2, s9, s2
3207; GFX8-NEXT:    s_sext_i32_i16 s2, s2
3208; GFX8-NEXT:    s_sext_i32_i16 s6, s6
3209; GFX8-NEXT:    s_sub_i32 s10, s8, s10
3210; GFX8-NEXT:    s_max_i32 s2, s2, s6
3211; GFX8-NEXT:    s_sext_i32_i16 s2, s2
3212; GFX8-NEXT:    s_sext_i32_i16 s6, s10
3213; GFX8-NEXT:    s_min_i32 s2, s2, s6
3214; GFX8-NEXT:    s_add_i32 s4, s4, s2
3215; GFX8-NEXT:    s_sext_i32_i16 s2, s1
3216; GFX8-NEXT:    s_max_i32 s6, s2, s11
3217; GFX8-NEXT:    s_min_i32 s2, s2, s11
3218; GFX8-NEXT:    s_sub_i32 s2, s9, s2
3219; GFX8-NEXT:    s_lshr_b32 s7, s3, 16
3220; GFX8-NEXT:    s_sext_i32_i16 s2, s2
3221; GFX8-NEXT:    s_sext_i32_i16 s3, s3
3222; GFX8-NEXT:    s_sub_i32 s6, s8, s6
3223; GFX8-NEXT:    s_max_i32 s2, s2, s3
3224; GFX8-NEXT:    s_sext_i32_i16 s2, s2
3225; GFX8-NEXT:    s_sext_i32_i16 s3, s6
3226; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
3227; GFX8-NEXT:    s_min_i32 s2, s2, s3
3228; GFX8-NEXT:    s_add_i32 s1, s1, s2
3229; GFX8-NEXT:    s_sext_i32_i16 s2, s5
3230; GFX8-NEXT:    s_max_i32 s3, s2, s11
3231; GFX8-NEXT:    s_min_i32 s2, s2, s11
3232; GFX8-NEXT:    s_sub_i32 s2, s9, s2
3233; GFX8-NEXT:    s_sext_i32_i16 s2, s2
3234; GFX8-NEXT:    s_sext_i32_i16 s6, s7
3235; GFX8-NEXT:    s_sub_i32 s3, s8, s3
3236; GFX8-NEXT:    s_max_i32 s2, s2, s6
3237; GFX8-NEXT:    s_sext_i32_i16 s2, s2
3238; GFX8-NEXT:    s_sext_i32_i16 s3, s3
3239; GFX8-NEXT:    s_min_i32 s2, s2, s3
3240; GFX8-NEXT:    s_add_i32 s5, s5, s2
3241; GFX8-NEXT:    s_bfe_u32 s2, s4, 0x100000
3242; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
3243; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
3244; GFX8-NEXT:    s_or_b32 s0, s0, s2
3245; GFX8-NEXT:    s_bfe_u32 s2, s5, 0x100000
3246; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
3247; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
3248; GFX8-NEXT:    s_or_b32 s1, s1, s2
3249; GFX8-NEXT:    ; return to shader part epilog
3250;
3251; GFX9-LABEL: s_saddsat_v4i16:
3252; GFX9:       ; %bb.0:
3253; GFX9-NEXT:    v_mov_b32_e32 v0, s2
3254; GFX9-NEXT:    v_mov_b32_e32 v1, s3
3255; GFX9-NEXT:    v_pk_add_i16 v0, s0, v0 clamp
3256; GFX9-NEXT:    v_pk_add_i16 v1, s1, v1 clamp
3257; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
3258; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
3259; GFX9-NEXT:    ; return to shader part epilog
3260;
3261; GFX10-LABEL: s_saddsat_v4i16:
3262; GFX10:       ; %bb.0:
3263; GFX10-NEXT:    v_pk_add_i16 v0, s0, s2 clamp
3264; GFX10-NEXT:    v_pk_add_i16 v1, s1, s3 clamp
3265; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
3266; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
3267; GFX10-NEXT:    ; return to shader part epilog
; The <4 x i16> result is bitcast to <2 x i32>; the expected sequences above
; finish with the two packed halves in s0 and s1.
3268  %result = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
3269  %cast = bitcast <4 x i16> %result to <2 x i32>
3270  ret <2 x i32> %cast
3271}
3272
3273; FIXME: the v5i16 cases below are commented out (disabled).
3274; define <5 x i16> @v_saddsat_v5i16(<5 x i16> %lhs, <5 x i16> %rhs) {
3275;   %result = call <5 x i16> @llvm.sadd.sat.v5i16(<5 x i16> %lhs, <5 x i16> %rhs)
3276;   ret <5 x i16> %result
3277; }
3278
3279; define amdgpu_ps <5 x i16> @s_saddsat_v5i16(<5 x i16> inreg %lhs, <5 x i16> inreg %rhs) {
3280;   %result = call <5 x i16> @llvm.sadd.sat.v5i16(<5 x i16> %lhs, <5 x i16> %rhs)
3281;   ret <5 x i16> %result
3282; }
3283
; Signed saturating add of <6 x i16> using the default calling convention;
; the result is bitcast to <3 x float>. The expected output reads both
; operands from VGPRs and returns with s_setpc_b64 s[30:31]. tahiti handles
; the six lanes one at a time in 32-bit registers, fiji works on 16-bit
; halves with SDWA selecting the high word, and gfx900/gfx1010 need only
; three v_pk_add_i16 instructions with the clamp modifier.
3284define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
3285; GFX6-LABEL: v_saddsat_v6i16:
3286; GFX6:       ; %bb.0:
3287; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3288; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
3289; GFX6-NEXT:    s_brev_b32 s5, 1
3290; GFX6-NEXT:    v_min_i32_e32 v14, 0, v0
3291; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
3292; GFX6-NEXT:    s_brev_b32 s4, -2
3293; GFX6-NEXT:    v_max_i32_e32 v12, 0, v0
3294; GFX6-NEXT:    v_sub_i32_e32 v14, vcc, s5, v14
3295; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, s4, v12
3296; GFX6-NEXT:    v_max_i32_e32 v6, v14, v6
3297; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3298; GFX6-NEXT:    v_min_i32_e32 v6, v6, v12
3299; GFX6-NEXT:    v_min_i32_e32 v12, 0, v1
3300; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
3301; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
3302; GFX6-NEXT:    v_max_i32_e32 v7, 0, v1
3303; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, s5, v12
3304; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, s4, v7
3305; GFX6-NEXT:    v_max_i32_e32 v6, v12, v6
3306; GFX6-NEXT:    v_min_i32_e32 v6, v6, v7
3307; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3308; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v6
3309; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v8
3310; GFX6-NEXT:    v_min_i32_e32 v8, 0, v2
3311; GFX6-NEXT:    v_bfrev_b32_e32 v13, -2
3312; GFX6-NEXT:    v_max_i32_e32 v7, 0, v2
3313; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s5, v8
3314; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v13, v7
3315; GFX6-NEXT:    v_max_i32_e32 v6, v8, v6
3316; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3317; GFX6-NEXT:    v_bfrev_b32_e32 v15, 1
3318; GFX6-NEXT:    v_min_i32_e32 v6, v6, v7
3319; GFX6-NEXT:    v_min_i32_e32 v8, 0, v3
3320; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
3321; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v9
3322; GFX6-NEXT:    v_max_i32_e32 v7, 0, v3
3323; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v15, v8
3324; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v13, v7
3325; GFX6-NEXT:    v_max_i32_e32 v6, v8, v6
3326; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
3327; GFX6-NEXT:    v_min_i32_e32 v6, v6, v7
3328; GFX6-NEXT:    v_min_i32_e32 v8, 0, v4
3329; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
3330; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v10
3331; GFX6-NEXT:    v_max_i32_e32 v7, 0, v4
3332; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v15, v8
3333; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v13, v7
3334; GFX6-NEXT:    v_max_i32_e32 v6, v8, v6
3335; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
3336; GFX6-NEXT:    v_min_i32_e32 v6, v6, v7
3337; GFX6-NEXT:    v_min_i32_e32 v8, 0, v5
3338; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
3339; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
3340; GFX6-NEXT:    v_max_i32_e32 v7, 0, v5
3341; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v15, v8
3342; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
3343; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v13, v7
3344; GFX6-NEXT:    v_max_i32_e32 v6, v8, v6
3345; GFX6-NEXT:    s_mov_b32 s4, 0xffff
3346; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
3347; GFX6-NEXT:    v_min_i32_e32 v6, v6, v7
3348; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
3349; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
3350; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
3351; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
3352; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
3353; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3354; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 16, v5
3355; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
3356; GFX6-NEXT:    v_and_b32_e32 v1, s4, v2
3357; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
3358; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 16, v4
3359; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3360; GFX6-NEXT:    v_and_b32_e32 v3, s4, v5
3361; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
3362; GFX6-NEXT:    v_and_b32_e32 v2, s4, v4
3363; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3364; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
3365; GFX6-NEXT:    s_setpc_b64 s[30:31]
3366;
3367; GFX8-LABEL: v_saddsat_v6i16:
3368; GFX8:       ; %bb.0:
3369; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3370; GFX8-NEXT:    s_movk_i32 s5, 0x8000
3371; GFX8-NEXT:    v_min_i16_e32 v11, 0, v0
3372; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
3373; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
3374; GFX8-NEXT:    v_max_i16_e32 v9, 0, v0
3375; GFX8-NEXT:    v_sub_u16_e32 v11, s5, v11
3376; GFX8-NEXT:    v_sub_u16_e32 v9, s4, v9
3377; GFX8-NEXT:    v_max_i16_e32 v11, v11, v3
3378; GFX8-NEXT:    v_min_i16_e32 v13, 0, v6
3379; GFX8-NEXT:    v_min_i16_e32 v9, v11, v9
3380; GFX8-NEXT:    v_max_i16_e32 v11, 0, v6
3381; GFX8-NEXT:    v_sub_u16_e32 v13, s5, v13
3382; GFX8-NEXT:    v_sub_u16_e32 v11, s4, v11
3383; GFX8-NEXT:    v_max_i16_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3384; GFX8-NEXT:    v_min_i16_e32 v13, 0, v1
3385; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
3386; GFX8-NEXT:    v_min_i16_e32 v3, v3, v11
3387; GFX8-NEXT:    v_max_i16_e32 v11, 0, v1
3388; GFX8-NEXT:    v_sub_u16_e32 v13, s5, v13
3389; GFX8-NEXT:    v_sub_u16_e32 v11, s4, v11
3390; GFX8-NEXT:    v_max_i16_e32 v13, v13, v4
3391; GFX8-NEXT:    v_min_i16_e32 v14, 0, v7
3392; GFX8-NEXT:    v_min_i16_e32 v11, v13, v11
3393; GFX8-NEXT:    v_max_i16_e32 v13, 0, v7
3394; GFX8-NEXT:    v_sub_u16_e32 v14, s5, v14
3395; GFX8-NEXT:    v_mov_b32_e32 v12, 0xffff8000
3396; GFX8-NEXT:    v_sub_u16_e32 v13, s4, v13
3397; GFX8-NEXT:    v_max_i16_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3398; GFX8-NEXT:    v_min_i16_e32 v14, 0, v2
3399; GFX8-NEXT:    v_mov_b32_e32 v10, 0x7fff
3400; GFX8-NEXT:    v_min_i16_e32 v4, v4, v13
3401; GFX8-NEXT:    v_max_i16_e32 v13, 0, v2
3402; GFX8-NEXT:    v_sub_u16_e32 v14, v12, v14
3403; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
3404; GFX8-NEXT:    v_sub_u16_e32 v13, v10, v13
3405; GFX8-NEXT:    v_max_i16_e32 v14, v14, v5
3406; GFX8-NEXT:    v_min_i16_e32 v13, v14, v13
3407; GFX8-NEXT:    v_max_i16_e32 v14, 0, v8
3408; GFX8-NEXT:    v_sub_u16_e32 v10, v10, v14
3409; GFX8-NEXT:    v_min_i16_e32 v14, 0, v8
3410; GFX8-NEXT:    v_sub_u16_e32 v12, v12, v14
3411; GFX8-NEXT:    v_max_i16_sdwa v5, v12, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3412; GFX8-NEXT:    v_add_u16_e32 v0, v0, v9
3413; GFX8-NEXT:    v_add_u16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3414; GFX8-NEXT:    v_min_i16_e32 v5, v5, v10
3415; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
3416; GFX8-NEXT:    v_add_u16_e32 v1, v1, v11
3417; GFX8-NEXT:    v_add_u16_sdwa v3, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3418; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
3419; GFX8-NEXT:    v_add_u16_e32 v2, v2, v13
3420; GFX8-NEXT:    v_add_u16_sdwa v3, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3421; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
3422; GFX8-NEXT:    s_setpc_b64 s[30:31]
3423;
3424; GFX9-LABEL: v_saddsat_v6i16:
3425; GFX9:       ; %bb.0:
3426; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3427; GFX9-NEXT:    v_pk_add_i16 v0, v0, v3 clamp
3428; GFX9-NEXT:    v_pk_add_i16 v1, v1, v4 clamp
3429; GFX9-NEXT:    v_pk_add_i16 v2, v2, v5 clamp
3430; GFX9-NEXT:    s_setpc_b64 s[30:31]
3431;
3432; GFX10-LABEL: v_saddsat_v6i16:
3433; GFX10:       ; %bb.0:
3434; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3435; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3436; GFX10-NEXT:    v_pk_add_i16 v0, v0, v3 clamp
3437; GFX10-NEXT:    v_pk_add_i16 v1, v1, v4 clamp
3438; GFX10-NEXT:    v_pk_add_i16 v2, v2, v5 clamp
3439; GFX10-NEXT:    s_setpc_b64 s[30:31]
; The <6 x i16> result is bitcast to <3 x float>; the expected sequences
; above leave the three packed pairs in v0, v1 and v2.
3440  %result = call <6 x i16> @llvm.sadd.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs)
3441  %cast = bitcast <6 x i16> %result to <3 x float>
3442  ret <3 x float> %cast
3443}
3444
; Pixel-shader variant of the <6 x i16> signed saturating add with both
; operands marked inreg; the result is bitcast to <3 x i32> and the expected
; output leaves it in s0..s2. tahiti and fiji stay entirely on scalar
; min/max/sub/add instructions (fiji sign-extends the 16-bit values with
; s_sext_i32_i16 first and repacks with s_bfe_u32). gfx900 copies %rhs into
; VGPRs before v_pk_add_i16, gfx1010 passes both SGPR operands directly, and
; both read the result back to SGPRs with v_readfirstlane_b32.
3445define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inreg %rhs) {
3446; GFX6-LABEL: s_saddsat_v6i16:
3447; GFX6:       ; %bb.0:
3448; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
3449; GFX6-NEXT:    s_brev_b32 s13, 1
3450; GFX6-NEXT:    s_min_i32 s15, s0, 0
3451; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
3452; GFX6-NEXT:    s_brev_b32 s12, -2
3453; GFX6-NEXT:    s_max_i32 s14, s0, 0
3454; GFX6-NEXT:    s_sub_i32 s15, s13, s15
3455; GFX6-NEXT:    s_sub_i32 s14, s12, s14
3456; GFX6-NEXT:    s_max_i32 s6, s15, s6
3457; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
3458; GFX6-NEXT:    s_min_i32 s6, s6, s14
3459; GFX6-NEXT:    s_min_i32 s14, s1, 0
3460; GFX6-NEXT:    s_add_i32 s0, s0, s6
3461; GFX6-NEXT:    s_lshl_b32 s6, s7, 16
3462; GFX6-NEXT:    s_max_i32 s7, s1, 0
3463; GFX6-NEXT:    s_sub_i32 s14, s13, s14
3464; GFX6-NEXT:    s_sub_i32 s7, s12, s7
3465; GFX6-NEXT:    s_max_i32 s6, s14, s6
3466; GFX6-NEXT:    s_min_i32 s6, s6, s7
3467; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
3468; GFX6-NEXT:    s_add_i32 s1, s1, s6
3469; GFX6-NEXT:    s_lshl_b32 s6, s8, 16
3470; GFX6-NEXT:    s_min_i32 s8, s2, 0
3471; GFX6-NEXT:    s_max_i32 s7, s2, 0
3472; GFX6-NEXT:    s_sub_i32 s8, s13, s8
3473; GFX6-NEXT:    s_sub_i32 s7, s12, s7
3474; GFX6-NEXT:    s_max_i32 s6, s8, s6
3475; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
3476; GFX6-NEXT:    s_min_i32 s6, s6, s7
3477; GFX6-NEXT:    s_min_i32 s8, s3, 0
3478; GFX6-NEXT:    s_add_i32 s2, s2, s6
3479; GFX6-NEXT:    s_lshl_b32 s6, s9, 16
3480; GFX6-NEXT:    s_max_i32 s7, s3, 0
3481; GFX6-NEXT:    s_sub_i32 s8, s13, s8
3482; GFX6-NEXT:    s_sub_i32 s7, s12, s7
3483; GFX6-NEXT:    s_max_i32 s6, s8, s6
3484; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
3485; GFX6-NEXT:    s_min_i32 s6, s6, s7
3486; GFX6-NEXT:    s_min_i32 s8, s4, 0
3487; GFX6-NEXT:    s_add_i32 s3, s3, s6
3488; GFX6-NEXT:    s_lshl_b32 s6, s10, 16
3489; GFX6-NEXT:    s_max_i32 s7, s4, 0
3490; GFX6-NEXT:    s_sub_i32 s8, s13, s8
3491; GFX6-NEXT:    s_sub_i32 s7, s12, s7
3492; GFX6-NEXT:    s_max_i32 s6, s8, s6
3493; GFX6-NEXT:    s_lshl_b32 s5, s5, 16
3494; GFX6-NEXT:    s_min_i32 s6, s6, s7
3495; GFX6-NEXT:    s_min_i32 s8, s5, 0
3496; GFX6-NEXT:    s_add_i32 s4, s4, s6
3497; GFX6-NEXT:    s_lshl_b32 s6, s11, 16
3498; GFX6-NEXT:    s_max_i32 s7, s5, 0
3499; GFX6-NEXT:    s_sub_i32 s8, s13, s8
3500; GFX6-NEXT:    s_sub_i32 s7, s12, s7
3501; GFX6-NEXT:    s_max_i32 s6, s8, s6
3502; GFX6-NEXT:    s_min_i32 s6, s6, s7
3503; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
3504; GFX6-NEXT:    s_add_i32 s5, s5, s6
3505; GFX6-NEXT:    s_mov_b32 s6, 0xffff
3506; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
3507; GFX6-NEXT:    s_and_b32 s1, s1, s6
3508; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
3509; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
3510; GFX6-NEXT:    s_and_b32 s0, s0, s6
3511; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
3512; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
3513; GFX6-NEXT:    s_or_b32 s0, s0, s1
3514; GFX6-NEXT:    s_and_b32 s1, s2, s6
3515; GFX6-NEXT:    s_and_b32 s2, s3, s6
3516; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
3517; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
3518; GFX6-NEXT:    s_and_b32 s3, s5, s6
3519; GFX6-NEXT:    s_or_b32 s1, s1, s2
3520; GFX6-NEXT:    s_and_b32 s2, s4, s6
3521; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
3522; GFX6-NEXT:    s_or_b32 s2, s2, s3
3523; GFX6-NEXT:    ; return to shader part epilog
3524;
3525; GFX8-LABEL: s_saddsat_v6i16:
3526; GFX8:       ; %bb.0:
3527; GFX8-NEXT:    s_sext_i32_i16 s14, s0
3528; GFX8-NEXT:    s_sext_i32_i16 s15, 0
3529; GFX8-NEXT:    s_movk_i32 s13, 0x8000
3530; GFX8-NEXT:    s_max_i32 s16, s14, s15
3531; GFX8-NEXT:    s_min_i32 s14, s14, s15
3532; GFX8-NEXT:    s_sub_i32 s14, s13, s14
3533; GFX8-NEXT:    s_lshr_b32 s9, s3, 16
3534; GFX8-NEXT:    s_movk_i32 s12, 0x7fff
3535; GFX8-NEXT:    s_sext_i32_i16 s14, s14
3536; GFX8-NEXT:    s_sext_i32_i16 s3, s3
3537; GFX8-NEXT:    s_sub_i32 s16, s12, s16
3538; GFX8-NEXT:    s_max_i32 s3, s14, s3
3539; GFX8-NEXT:    s_sext_i32_i16 s3, s3
3540; GFX8-NEXT:    s_sext_i32_i16 s14, s16
3541; GFX8-NEXT:    s_lshr_b32 s6, s0, 16
3542; GFX8-NEXT:    s_min_i32 s3, s3, s14
3543; GFX8-NEXT:    s_add_i32 s0, s0, s3
3544; GFX8-NEXT:    s_sext_i32_i16 s3, s6
3545; GFX8-NEXT:    s_max_i32 s14, s3, s15
3546; GFX8-NEXT:    s_min_i32 s3, s3, s15
3547; GFX8-NEXT:    s_sub_i32 s3, s13, s3
3548; GFX8-NEXT:    s_sext_i32_i16 s3, s3
3549; GFX8-NEXT:    s_sext_i32_i16 s9, s9
3550; GFX8-NEXT:    s_sub_i32 s14, s12, s14
3551; GFX8-NEXT:    s_max_i32 s3, s3, s9
3552; GFX8-NEXT:    s_sext_i32_i16 s3, s3
3553; GFX8-NEXT:    s_sext_i32_i16 s9, s14
3554; GFX8-NEXT:    s_min_i32 s3, s3, s9
3555; GFX8-NEXT:    s_add_i32 s6, s6, s3
3556; GFX8-NEXT:    s_sext_i32_i16 s3, s1
3557; GFX8-NEXT:    s_max_i32 s9, s3, s15
3558; GFX8-NEXT:    s_min_i32 s3, s3, s15
3559; GFX8-NEXT:    s_sub_i32 s3, s13, s3
3560; GFX8-NEXT:    s_lshr_b32 s10, s4, 16
3561; GFX8-NEXT:    s_sext_i32_i16 s3, s3
3562; GFX8-NEXT:    s_sext_i32_i16 s4, s4
3563; GFX8-NEXT:    s_sub_i32 s9, s12, s9
3564; GFX8-NEXT:    s_max_i32 s3, s3, s4
3565; GFX8-NEXT:    s_sext_i32_i16 s3, s3
3566; GFX8-NEXT:    s_sext_i32_i16 s4, s9
3567; GFX8-NEXT:    s_lshr_b32 s7, s1, 16
3568; GFX8-NEXT:    s_min_i32 s3, s3, s4
3569; GFX8-NEXT:    s_add_i32 s1, s1, s3
3570; GFX8-NEXT:    s_sext_i32_i16 s3, s7
3571; GFX8-NEXT:    s_max_i32 s4, s3, s15
3572; GFX8-NEXT:    s_min_i32 s3, s3, s15
3573; GFX8-NEXT:    s_sub_i32 s3, s13, s3
3574; GFX8-NEXT:    s_sext_i32_i16 s3, s3
3575; GFX8-NEXT:    s_sext_i32_i16 s9, s10
3576; GFX8-NEXT:    s_sub_i32 s4, s12, s4
3577; GFX8-NEXT:    s_max_i32 s3, s3, s9
3578; GFX8-NEXT:    s_sext_i32_i16 s3, s3
3579; GFX8-NEXT:    s_sext_i32_i16 s4, s4
3580; GFX8-NEXT:    s_min_i32 s3, s3, s4
3581; GFX8-NEXT:    s_add_i32 s7, s7, s3
3582; GFX8-NEXT:    s_sext_i32_i16 s3, s2
3583; GFX8-NEXT:    s_max_i32 s4, s3, s15
3584; GFX8-NEXT:    s_min_i32 s3, s3, s15
3585; GFX8-NEXT:    s_sub_i32 s3, s13, s3
3586; GFX8-NEXT:    s_lshr_b32 s11, s5, 16
3587; GFX8-NEXT:    s_sext_i32_i16 s3, s3
3588; GFX8-NEXT:    s_sext_i32_i16 s5, s5
3589; GFX8-NEXT:    s_sub_i32 s4, s12, s4
3590; GFX8-NEXT:    s_max_i32 s3, s3, s5
3591; GFX8-NEXT:    s_sext_i32_i16 s3, s3
3592; GFX8-NEXT:    s_sext_i32_i16 s4, s4
3593; GFX8-NEXT:    s_lshr_b32 s8, s2, 16
3594; GFX8-NEXT:    s_min_i32 s3, s3, s4
3595; GFX8-NEXT:    s_add_i32 s2, s2, s3
3596; GFX8-NEXT:    s_sext_i32_i16 s3, s8
3597; GFX8-NEXT:    s_max_i32 s4, s3, s15
3598; GFX8-NEXT:    s_min_i32 s3, s3, s15
3599; GFX8-NEXT:    s_sub_i32 s3, s13, s3
3600; GFX8-NEXT:    s_sext_i32_i16 s3, s3
3601; GFX8-NEXT:    s_sext_i32_i16 s5, s11
3602; GFX8-NEXT:    s_sub_i32 s4, s12, s4
3603; GFX8-NEXT:    s_max_i32 s3, s3, s5
3604; GFX8-NEXT:    s_sext_i32_i16 s3, s3
3605; GFX8-NEXT:    s_sext_i32_i16 s4, s4
3606; GFX8-NEXT:    s_min_i32 s3, s3, s4
3607; GFX8-NEXT:    s_add_i32 s8, s8, s3
3608; GFX8-NEXT:    s_bfe_u32 s3, s6, 0x100000
3609; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
3610; GFX8-NEXT:    s_lshl_b32 s3, s3, 16
3611; GFX8-NEXT:    s_or_b32 s0, s0, s3
3612; GFX8-NEXT:    s_bfe_u32 s3, s7, 0x100000
3613; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
3614; GFX8-NEXT:    s_lshl_b32 s3, s3, 16
3615; GFX8-NEXT:    s_or_b32 s1, s1, s3
3616; GFX8-NEXT:    s_bfe_u32 s3, s8, 0x100000
3617; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
3618; GFX8-NEXT:    s_lshl_b32 s3, s3, 16
3619; GFX8-NEXT:    s_or_b32 s2, s2, s3
3620; GFX8-NEXT:    ; return to shader part epilog
3621;
3622; GFX9-LABEL: s_saddsat_v6i16:
3623; GFX9:       ; %bb.0:
3624; GFX9-NEXT:    v_mov_b32_e32 v0, s3
3625; GFX9-NEXT:    v_mov_b32_e32 v1, s4
3626; GFX9-NEXT:    v_mov_b32_e32 v2, s5
3627; GFX9-NEXT:    v_pk_add_i16 v0, s0, v0 clamp
3628; GFX9-NEXT:    v_pk_add_i16 v1, s1, v1 clamp
3629; GFX9-NEXT:    v_pk_add_i16 v2, s2, v2 clamp
3630; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
3631; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
3632; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
3633; GFX9-NEXT:    ; return to shader part epilog
3634;
3635; GFX10-LABEL: s_saddsat_v6i16:
3636; GFX10:       ; %bb.0:
3637; GFX10-NEXT:    v_pk_add_i16 v0, s0, s3 clamp
3638; GFX10-NEXT:    v_pk_add_i16 v1, s1, s4 clamp
3639; GFX10-NEXT:    v_pk_add_i16 v2, s2, s5 clamp
3640; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
3641; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
3642; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
3643; GFX10-NEXT:    ; return to shader part epilog
; The <6 x i16> result is bitcast to <3 x i32>; the expected sequences above
; finish with the three packed pairs in s0, s1 and s2.
3644  %result = call <6 x i16> @llvm.sadd.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs)
3645  %cast = bitcast <6 x i16> %result to <3 x i32>
3646  ret <3 x i32> %cast
3647}
3648
; Signed saturating add of <8 x i16> with both operands passed as ordinary
; (non-inreg) arguments; the result is returned bitcast to <4 x float>.
; Expected code per RUN line:
;  * tahiti (GFX6 checks) moves every element into the top half of a dword
;    (shl 16), clamps the rhs to [0x80000000 - min(lhs, 0),
;    0x7fffffff - max(lhs, 0)], adds, shifts back (ashr 16) and repacks pairs
;    with and/shl/or.
;  * fiji (GFX8 checks) does the same clamp with 16-bit min/max/sub and uses
;    SDWA operands for the high halves.
;  * gfx900 and gfx1010 emit one v_pk_add_i16 with clamp per result dword.
3649define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
3650; GFX6-LABEL: v_saddsat_v8i16:
3651; GFX6:       ; %bb.0:
3652; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3653; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
3654; GFX6-NEXT:    s_brev_b32 s5, 1
3655; GFX6-NEXT:    v_min_i32_e32 v18, 0, v0
3656; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
3657; GFX6-NEXT:    s_brev_b32 s4, -2
3658; GFX6-NEXT:    v_max_i32_e32 v16, 0, v0
3659; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, s5, v18
3660; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, s4, v16
3661; GFX6-NEXT:    v_max_i32_e32 v8, v18, v8
3662; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3663; GFX6-NEXT:    v_min_i32_e32 v8, v8, v16
3664; GFX6-NEXT:    v_min_i32_e32 v16, 0, v1
3665; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
3666; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
3667; GFX6-NEXT:    v_max_i32_e32 v9, 0, v1
3668; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, s5, v16
3669; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, s4, v9
3670; GFX6-NEXT:    v_max_i32_e32 v8, v16, v8
3671; GFX6-NEXT:    v_min_i32_e32 v8, v8, v9
3672; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3673; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v8
3674; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
3675; GFX6-NEXT:    v_min_i32_e32 v10, 0, v2
3676; GFX6-NEXT:    v_bfrev_b32_e32 v17, -2
3677; GFX6-NEXT:    v_max_i32_e32 v9, 0, v2
3678; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, s5, v10
3679; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v17, v9
3680; GFX6-NEXT:    v_max_i32_e32 v8, v10, v8
3681; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3682; GFX6-NEXT:    v_bfrev_b32_e32 v19, 1
3683; GFX6-NEXT:    v_min_i32_e32 v8, v8, v9
3684; GFX6-NEXT:    v_min_i32_e32 v10, 0, v3
3685; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
3686; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
3687; GFX6-NEXT:    v_max_i32_e32 v9, 0, v3
3688; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v19, v10
3689; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v17, v9
3690; GFX6-NEXT:    v_max_i32_e32 v8, v10, v8
3691; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
3692; GFX6-NEXT:    v_min_i32_e32 v8, v8, v9
3693; GFX6-NEXT:    v_min_i32_e32 v10, 0, v4
3694; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v8
3695; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
3696; GFX6-NEXT:    v_max_i32_e32 v9, 0, v4
3697; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v19, v10
3698; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v17, v9
3699; GFX6-NEXT:    v_max_i32_e32 v8, v10, v8
3700; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
3701; GFX6-NEXT:    v_min_i32_e32 v8, v8, v9
3702; GFX6-NEXT:    v_min_i32_e32 v10, 0, v5
3703; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
3704; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
3705; GFX6-NEXT:    v_max_i32_e32 v9, 0, v5
3706; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v19, v10
3707; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v17, v9
3708; GFX6-NEXT:    v_max_i32_e32 v8, v10, v8
3709; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
3710; GFX6-NEXT:    v_min_i32_e32 v8, v8, v9
3711; GFX6-NEXT:    v_min_i32_e32 v10, 0, v6
3712; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
3713; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v14
3714; GFX6-NEXT:    v_max_i32_e32 v9, 0, v6
3715; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v19, v10
3716; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v17, v9
3717; GFX6-NEXT:    v_max_i32_e32 v8, v10, v8
3718; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
3719; GFX6-NEXT:    v_min_i32_e32 v8, v8, v9
3720; GFX6-NEXT:    v_min_i32_e32 v10, 0, v7
3721; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
3722; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
3723; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
3724; GFX6-NEXT:    v_max_i32_e32 v9, 0, v7
3725; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v19, v10
3726; GFX6-NEXT:    s_mov_b32 s4, 0xffff
3727; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
3728; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v17, v9
3729; GFX6-NEXT:    v_max_i32_e32 v8, v10, v8
3730; GFX6-NEXT:    v_and_b32_e32 v1, s4, v1
3731; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
3732; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
3733; GFX6-NEXT:    v_min_i32_e32 v8, v8, v9
3734; GFX6-NEXT:    v_and_b32_e32 v0, s4, v0
3735; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3736; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 16, v5
3737; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
3738; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
3739; GFX6-NEXT:    v_and_b32_e32 v1, s4, v2
3740; GFX6-NEXT:    v_and_b32_e32 v2, s4, v3
3741; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 16, v4
3742; GFX6-NEXT:    v_ashrrev_i32_e32 v7, 16, v7
3743; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3744; GFX6-NEXT:    v_and_b32_e32 v3, s4, v5
3745; GFX6-NEXT:    v_ashrrev_i32_e32 v6, 16, v6
3746; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
3747; GFX6-NEXT:    v_and_b32_e32 v2, s4, v4
3748; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
3749; GFX6-NEXT:    v_and_b32_e32 v4, s4, v7
3750; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
3751; GFX6-NEXT:    v_and_b32_e32 v3, s4, v6
3752; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
3753; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
3754; GFX6-NEXT:    s_setpc_b64 s[30:31]
3755;
3756; GFX8-LABEL: v_saddsat_v8i16:
3757; GFX8:       ; %bb.0:
3758; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3759; GFX8-NEXT:    s_movk_i32 s5, 0x8000
3760; GFX8-NEXT:    v_min_i16_e32 v14, 0, v0
3761; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
3762; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
3763; GFX8-NEXT:    v_max_i16_e32 v12, 0, v0
3764; GFX8-NEXT:    v_sub_u16_e32 v14, s5, v14
3765; GFX8-NEXT:    v_sub_u16_e32 v12, s4, v12
3766; GFX8-NEXT:    v_max_i16_e32 v14, v14, v4
3767; GFX8-NEXT:    v_min_i16_e32 v16, 0, v8
3768; GFX8-NEXT:    v_min_i16_e32 v12, v14, v12
3769; GFX8-NEXT:    v_max_i16_e32 v14, 0, v8
3770; GFX8-NEXT:    v_sub_u16_e32 v16, s5, v16
3771; GFX8-NEXT:    v_sub_u16_e32 v14, s4, v14
3772; GFX8-NEXT:    v_max_i16_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3773; GFX8-NEXT:    v_min_i16_e32 v16, 0, v1
3774; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
3775; GFX8-NEXT:    v_min_i16_e32 v4, v4, v14
3776; GFX8-NEXT:    v_max_i16_e32 v14, 0, v1
3777; GFX8-NEXT:    v_sub_u16_e32 v16, s5, v16
3778; GFX8-NEXT:    v_sub_u16_e32 v14, s4, v14
3779; GFX8-NEXT:    v_max_i16_e32 v16, v16, v5
3780; GFX8-NEXT:    v_min_i16_e32 v17, 0, v9
3781; GFX8-NEXT:    v_min_i16_e32 v14, v16, v14
3782; GFX8-NEXT:    v_max_i16_e32 v16, 0, v9
3783; GFX8-NEXT:    v_sub_u16_e32 v17, s5, v17
3784; GFX8-NEXT:    v_mov_b32_e32 v15, 0xffff8000
3785; GFX8-NEXT:    v_sub_u16_e32 v16, s4, v16
3786; GFX8-NEXT:    v_max_i16_sdwa v5, v17, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3787; GFX8-NEXT:    v_min_i16_e32 v17, 0, v2
3788; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
3789; GFX8-NEXT:    v_mov_b32_e32 v13, 0x7fff
3790; GFX8-NEXT:    v_min_i16_e32 v5, v5, v16
3791; GFX8-NEXT:    v_max_i16_e32 v16, 0, v2
3792; GFX8-NEXT:    v_sub_u16_e32 v17, v15, v17
3793; GFX8-NEXT:    v_sub_u16_e32 v16, v13, v16
3794; GFX8-NEXT:    v_max_i16_e32 v17, v17, v6
3795; GFX8-NEXT:    v_min_i16_e32 v18, 0, v10
3796; GFX8-NEXT:    v_min_i16_e32 v16, v17, v16
3797; GFX8-NEXT:    v_max_i16_e32 v17, 0, v10
3798; GFX8-NEXT:    v_sub_u16_e32 v18, v15, v18
3799; GFX8-NEXT:    v_sub_u16_e32 v17, v13, v17
3800; GFX8-NEXT:    v_max_i16_sdwa v6, v18, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3801; GFX8-NEXT:    v_min_i16_e32 v18, 0, v3
3802; GFX8-NEXT:    v_min_i16_e32 v6, v6, v17
3803; GFX8-NEXT:    v_max_i16_e32 v17, 0, v3
3804; GFX8-NEXT:    v_sub_u16_e32 v18, v15, v18
3805; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
3806; GFX8-NEXT:    v_sub_u16_e32 v17, v13, v17
3807; GFX8-NEXT:    v_max_i16_e32 v18, v18, v7
3808; GFX8-NEXT:    v_min_i16_e32 v17, v18, v17
3809; GFX8-NEXT:    v_max_i16_e32 v18, 0, v11
3810; GFX8-NEXT:    v_sub_u16_e32 v13, v13, v18
3811; GFX8-NEXT:    v_min_i16_e32 v18, 0, v11
3812; GFX8-NEXT:    v_sub_u16_e32 v15, v15, v18
3813; GFX8-NEXT:    v_add_u16_e32 v0, v0, v12
3814; GFX8-NEXT:    v_add_u16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3815; GFX8-NEXT:    v_max_i16_sdwa v7, v15, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3816; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
3817; GFX8-NEXT:    v_add_u16_e32 v1, v1, v14
3818; GFX8-NEXT:    v_add_u16_sdwa v4, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3819; GFX8-NEXT:    v_min_i16_e32 v7, v7, v13
3820; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
3821; GFX8-NEXT:    v_add_u16_e32 v2, v2, v16
3822; GFX8-NEXT:    v_add_u16_sdwa v4, v10, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3823; GFX8-NEXT:    v_or_b32_e32 v2, v2, v4
3824; GFX8-NEXT:    v_add_u16_e32 v3, v3, v17
3825; GFX8-NEXT:    v_add_u16_sdwa v4, v11, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3826; GFX8-NEXT:    v_or_b32_e32 v3, v3, v4
3827; GFX8-NEXT:    s_setpc_b64 s[30:31]
3828;
3829; GFX9-LABEL: v_saddsat_v8i16:
3830; GFX9:       ; %bb.0:
3831; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3832; GFX9-NEXT:    v_pk_add_i16 v0, v0, v4 clamp
3833; GFX9-NEXT:    v_pk_add_i16 v1, v1, v5 clamp
3834; GFX9-NEXT:    v_pk_add_i16 v2, v2, v6 clamp
3835; GFX9-NEXT:    v_pk_add_i16 v3, v3, v7 clamp
3836; GFX9-NEXT:    s_setpc_b64 s[30:31]
3837;
3838; GFX10-LABEL: v_saddsat_v8i16:
3839; GFX10:       ; %bb.0:
3840; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3841; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3842; GFX10-NEXT:    v_pk_add_i16 v0, v0, v4 clamp
3843; GFX10-NEXT:    v_pk_add_i16 v1, v1, v5 clamp
3844; GFX10-NEXT:    v_pk_add_i16 v2, v2, v6 clamp
3845; GFX10-NEXT:    v_pk_add_i16 v3, v3, v7 clamp
3846; GFX10-NEXT:    s_setpc_b64 s[30:31]
3847  %result = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
3848  %cast = bitcast <8 x i16> %result to <4 x float>
3849  ret <4 x float> %cast
3850}
3851
; Signed saturating add of <8 x i16> in an amdgpu_ps function with both
; operands marked inreg; the result is returned bitcast to <4 x i32>.
; Expected code per RUN line:
;  * tahiti (GFX6 checks) and fiji (GFX8 checks) use only scalar
;    instructions: per element the rhs is clamped with s_max_i32/s_min_i32
;    against limits computed from the lhs, added, and the halves are repacked.
;    fiji additionally sign-extends the 16-bit intermediates with
;    s_sext_i32_i16 before each 32-bit min/max.
;  * gfx900 and gfx1010 use v_pk_add_i16 with clamp and copy every result
;    dword back to an SGPR with v_readfirstlane_b32. gfx900 first moves the
;    rhs dwords into VGPRs; gfx1010 passes both SGPR operands directly.
3852define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inreg %rhs) {
3853; GFX6-LABEL: s_saddsat_v8i16:
3854; GFX6:       ; %bb.0:
3855; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
3856; GFX6-NEXT:    s_brev_b32 s17, 1
3857; GFX6-NEXT:    s_min_i32 s19, s0, 0
3858; GFX6-NEXT:    s_lshl_b32 s8, s8, 16
3859; GFX6-NEXT:    s_brev_b32 s16, -2
3860; GFX6-NEXT:    s_max_i32 s18, s0, 0
3861; GFX6-NEXT:    s_sub_i32 s19, s17, s19
3862; GFX6-NEXT:    s_sub_i32 s18, s16, s18
3863; GFX6-NEXT:    s_max_i32 s8, s19, s8
3864; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
3865; GFX6-NEXT:    s_min_i32 s8, s8, s18
3866; GFX6-NEXT:    s_min_i32 s18, s1, 0
3867; GFX6-NEXT:    s_add_i32 s0, s0, s8
3868; GFX6-NEXT:    s_lshl_b32 s8, s9, 16
3869; GFX6-NEXT:    s_max_i32 s9, s1, 0
3870; GFX6-NEXT:    s_sub_i32 s18, s17, s18
3871; GFX6-NEXT:    s_sub_i32 s9, s16, s9
3872; GFX6-NEXT:    s_max_i32 s8, s18, s8
3873; GFX6-NEXT:    s_min_i32 s8, s8, s9
3874; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
3875; GFX6-NEXT:    s_add_i32 s1, s1, s8
3876; GFX6-NEXT:    s_lshl_b32 s8, s10, 16
3877; GFX6-NEXT:    s_min_i32 s10, s2, 0
3878; GFX6-NEXT:    s_max_i32 s9, s2, 0
3879; GFX6-NEXT:    s_sub_i32 s10, s17, s10
3880; GFX6-NEXT:    s_sub_i32 s9, s16, s9
3881; GFX6-NEXT:    s_max_i32 s8, s10, s8
3882; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
3883; GFX6-NEXT:    s_min_i32 s8, s8, s9
3884; GFX6-NEXT:    s_min_i32 s10, s3, 0
3885; GFX6-NEXT:    s_add_i32 s2, s2, s8
3886; GFX6-NEXT:    s_lshl_b32 s8, s11, 16
3887; GFX6-NEXT:    s_max_i32 s9, s3, 0
3888; GFX6-NEXT:    s_sub_i32 s10, s17, s10
3889; GFX6-NEXT:    s_sub_i32 s9, s16, s9
3890; GFX6-NEXT:    s_max_i32 s8, s10, s8
3891; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
3892; GFX6-NEXT:    s_min_i32 s8, s8, s9
3893; GFX6-NEXT:    s_min_i32 s10, s4, 0
3894; GFX6-NEXT:    s_add_i32 s3, s3, s8
3895; GFX6-NEXT:    s_lshl_b32 s8, s12, 16
3896; GFX6-NEXT:    s_max_i32 s9, s4, 0
3897; GFX6-NEXT:    s_sub_i32 s10, s17, s10
3898; GFX6-NEXT:    s_sub_i32 s9, s16, s9
3899; GFX6-NEXT:    s_max_i32 s8, s10, s8
3900; GFX6-NEXT:    s_lshl_b32 s5, s5, 16
3901; GFX6-NEXT:    s_min_i32 s8, s8, s9
3902; GFX6-NEXT:    s_min_i32 s10, s5, 0
3903; GFX6-NEXT:    s_add_i32 s4, s4, s8
3904; GFX6-NEXT:    s_lshl_b32 s8, s13, 16
3905; GFX6-NEXT:    s_max_i32 s9, s5, 0
3906; GFX6-NEXT:    s_sub_i32 s10, s17, s10
3907; GFX6-NEXT:    s_sub_i32 s9, s16, s9
3908; GFX6-NEXT:    s_max_i32 s8, s10, s8
3909; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
3910; GFX6-NEXT:    s_min_i32 s8, s8, s9
3911; GFX6-NEXT:    s_min_i32 s10, s6, 0
3912; GFX6-NEXT:    s_add_i32 s5, s5, s8
3913; GFX6-NEXT:    s_lshl_b32 s8, s14, 16
3914; GFX6-NEXT:    s_max_i32 s9, s6, 0
3915; GFX6-NEXT:    s_sub_i32 s10, s17, s10
3916; GFX6-NEXT:    s_sub_i32 s9, s16, s9
3917; GFX6-NEXT:    s_max_i32 s8, s10, s8
3918; GFX6-NEXT:    s_lshl_b32 s7, s7, 16
3919; GFX6-NEXT:    s_min_i32 s8, s8, s9
3920; GFX6-NEXT:    s_min_i32 s10, s7, 0
3921; GFX6-NEXT:    s_add_i32 s6, s6, s8
3922; GFX6-NEXT:    s_lshl_b32 s8, s15, 16
3923; GFX6-NEXT:    s_max_i32 s9, s7, 0
3924; GFX6-NEXT:    s_sub_i32 s10, s17, s10
3925; GFX6-NEXT:    s_sub_i32 s9, s16, s9
3926; GFX6-NEXT:    s_max_i32 s8, s10, s8
3927; GFX6-NEXT:    s_min_i32 s8, s8, s9
3928; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
3929; GFX6-NEXT:    s_add_i32 s7, s7, s8
3930; GFX6-NEXT:    s_mov_b32 s8, 0xffff
3931; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
3932; GFX6-NEXT:    s_and_b32 s1, s1, s8
3933; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
3934; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
3935; GFX6-NEXT:    s_and_b32 s0, s0, s8
3936; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
3937; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
3938; GFX6-NEXT:    s_or_b32 s0, s0, s1
3939; GFX6-NEXT:    s_and_b32 s1, s2, s8
3940; GFX6-NEXT:    s_and_b32 s2, s3, s8
3941; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
3942; GFX6-NEXT:    s_ashr_i32 s7, s7, 16
3943; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
3944; GFX6-NEXT:    s_and_b32 s3, s5, s8
3945; GFX6-NEXT:    s_ashr_i32 s6, s6, 16
3946; GFX6-NEXT:    s_or_b32 s1, s1, s2
3947; GFX6-NEXT:    s_and_b32 s2, s4, s8
3948; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
3949; GFX6-NEXT:    s_and_b32 s4, s7, s8
3950; GFX6-NEXT:    s_or_b32 s2, s2, s3
3951; GFX6-NEXT:    s_and_b32 s3, s6, s8
3952; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
3953; GFX6-NEXT:    s_or_b32 s3, s3, s4
3954; GFX6-NEXT:    ; return to shader part epilog
3955;
3956; GFX8-LABEL: s_saddsat_v8i16:
3957; GFX8:       ; %bb.0:
3958; GFX8-NEXT:    s_sext_i32_i16 s18, s0
3959; GFX8-NEXT:    s_sext_i32_i16 s19, 0
3960; GFX8-NEXT:    s_movk_i32 s17, 0x8000
3961; GFX8-NEXT:    s_max_i32 s20, s18, s19
3962; GFX8-NEXT:    s_min_i32 s18, s18, s19
3963; GFX8-NEXT:    s_sub_i32 s18, s17, s18
3964; GFX8-NEXT:    s_lshr_b32 s12, s4, 16
3965; GFX8-NEXT:    s_movk_i32 s16, 0x7fff
3966; GFX8-NEXT:    s_sext_i32_i16 s18, s18
3967; GFX8-NEXT:    s_sext_i32_i16 s4, s4
3968; GFX8-NEXT:    s_sub_i32 s20, s16, s20
3969; GFX8-NEXT:    s_max_i32 s4, s18, s4
3970; GFX8-NEXT:    s_sext_i32_i16 s4, s4
3971; GFX8-NEXT:    s_sext_i32_i16 s18, s20
3972; GFX8-NEXT:    s_lshr_b32 s8, s0, 16
3973; GFX8-NEXT:    s_min_i32 s4, s4, s18
3974; GFX8-NEXT:    s_add_i32 s0, s0, s4
3975; GFX8-NEXT:    s_sext_i32_i16 s4, s8
3976; GFX8-NEXT:    s_max_i32 s18, s4, s19
3977; GFX8-NEXT:    s_min_i32 s4, s4, s19
3978; GFX8-NEXT:    s_sub_i32 s4, s17, s4
3979; GFX8-NEXT:    s_sext_i32_i16 s4, s4
3980; GFX8-NEXT:    s_sext_i32_i16 s12, s12
3981; GFX8-NEXT:    s_sub_i32 s18, s16, s18
3982; GFX8-NEXT:    s_max_i32 s4, s4, s12
3983; GFX8-NEXT:    s_sext_i32_i16 s4, s4
3984; GFX8-NEXT:    s_sext_i32_i16 s12, s18
3985; GFX8-NEXT:    s_min_i32 s4, s4, s12
3986; GFX8-NEXT:    s_add_i32 s8, s8, s4
3987; GFX8-NEXT:    s_sext_i32_i16 s4, s1
3988; GFX8-NEXT:    s_max_i32 s12, s4, s19
3989; GFX8-NEXT:    s_min_i32 s4, s4, s19
3990; GFX8-NEXT:    s_sub_i32 s4, s17, s4
3991; GFX8-NEXT:    s_lshr_b32 s13, s5, 16
3992; GFX8-NEXT:    s_sext_i32_i16 s4, s4
3993; GFX8-NEXT:    s_sext_i32_i16 s5, s5
3994; GFX8-NEXT:    s_sub_i32 s12, s16, s12
3995; GFX8-NEXT:    s_max_i32 s4, s4, s5
3996; GFX8-NEXT:    s_sext_i32_i16 s4, s4
3997; GFX8-NEXT:    s_sext_i32_i16 s5, s12
3998; GFX8-NEXT:    s_lshr_b32 s9, s1, 16
3999; GFX8-NEXT:    s_min_i32 s4, s4, s5
4000; GFX8-NEXT:    s_add_i32 s1, s1, s4
4001; GFX8-NEXT:    s_sext_i32_i16 s4, s9
4002; GFX8-NEXT:    s_max_i32 s5, s4, s19
4003; GFX8-NEXT:    s_min_i32 s4, s4, s19
4004; GFX8-NEXT:    s_sub_i32 s4, s17, s4
4005; GFX8-NEXT:    s_sext_i32_i16 s4, s4
4006; GFX8-NEXT:    s_sext_i32_i16 s12, s13
4007; GFX8-NEXT:    s_sub_i32 s5, s16, s5
4008; GFX8-NEXT:    s_max_i32 s4, s4, s12
4009; GFX8-NEXT:    s_sext_i32_i16 s4, s4
4010; GFX8-NEXT:    s_sext_i32_i16 s5, s5
4011; GFX8-NEXT:    s_min_i32 s4, s4, s5
4012; GFX8-NEXT:    s_add_i32 s9, s9, s4
4013; GFX8-NEXT:    s_sext_i32_i16 s4, s2
4014; GFX8-NEXT:    s_max_i32 s5, s4, s19
4015; GFX8-NEXT:    s_min_i32 s4, s4, s19
4016; GFX8-NEXT:    s_sub_i32 s4, s17, s4
4017; GFX8-NEXT:    s_lshr_b32 s14, s6, 16
4018; GFX8-NEXT:    s_sext_i32_i16 s4, s4
4019; GFX8-NEXT:    s_sext_i32_i16 s6, s6
4020; GFX8-NEXT:    s_sub_i32 s5, s16, s5
4021; GFX8-NEXT:    s_max_i32 s4, s4, s6
4022; GFX8-NEXT:    s_sext_i32_i16 s4, s4
4023; GFX8-NEXT:    s_sext_i32_i16 s5, s5
4024; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
4025; GFX8-NEXT:    s_min_i32 s4, s4, s5
4026; GFX8-NEXT:    s_add_i32 s2, s2, s4
4027; GFX8-NEXT:    s_sext_i32_i16 s4, s10
4028; GFX8-NEXT:    s_max_i32 s5, s4, s19
4029; GFX8-NEXT:    s_min_i32 s4, s4, s19
4030; GFX8-NEXT:    s_sub_i32 s4, s17, s4
4031; GFX8-NEXT:    s_sext_i32_i16 s4, s4
4032; GFX8-NEXT:    s_sext_i32_i16 s6, s14
4033; GFX8-NEXT:    s_sub_i32 s5, s16, s5
4034; GFX8-NEXT:    s_max_i32 s4, s4, s6
4035; GFX8-NEXT:    s_sext_i32_i16 s4, s4
4036; GFX8-NEXT:    s_sext_i32_i16 s5, s5
4037; GFX8-NEXT:    s_min_i32 s4, s4, s5
4038; GFX8-NEXT:    s_add_i32 s10, s10, s4
4039; GFX8-NEXT:    s_sext_i32_i16 s4, s3
4040; GFX8-NEXT:    s_max_i32 s5, s4, s19
4041; GFX8-NEXT:    s_min_i32 s4, s4, s19
4042; GFX8-NEXT:    s_sub_i32 s4, s17, s4
4043; GFX8-NEXT:    s_sext_i32_i16 s4, s4
4044; GFX8-NEXT:    s_sext_i32_i16 s6, s7
4045; GFX8-NEXT:    s_sub_i32 s5, s16, s5
4046; GFX8-NEXT:    s_max_i32 s4, s4, s6
4047; GFX8-NEXT:    s_sext_i32_i16 s4, s4
4048; GFX8-NEXT:    s_sext_i32_i16 s5, s5
4049; GFX8-NEXT:    s_lshr_b32 s11, s3, 16
4050; GFX8-NEXT:    s_min_i32 s4, s4, s5
4051; GFX8-NEXT:    s_add_i32 s3, s3, s4
4052; GFX8-NEXT:    s_sext_i32_i16 s4, s11
4053; GFX8-NEXT:    s_max_i32 s5, s4, s19
4054; GFX8-NEXT:    s_min_i32 s4, s4, s19
4055; GFX8-NEXT:    s_lshr_b32 s15, s7, 16
4056; GFX8-NEXT:    s_sub_i32 s4, s17, s4
4057; GFX8-NEXT:    s_sext_i32_i16 s4, s4
4058; GFX8-NEXT:    s_sext_i32_i16 s6, s15
4059; GFX8-NEXT:    s_sub_i32 s5, s16, s5
4060; GFX8-NEXT:    s_max_i32 s4, s4, s6
4061; GFX8-NEXT:    s_sext_i32_i16 s4, s4
4062; GFX8-NEXT:    s_sext_i32_i16 s5, s5
4063; GFX8-NEXT:    s_min_i32 s4, s4, s5
4064; GFX8-NEXT:    s_add_i32 s11, s11, s4
4065; GFX8-NEXT:    s_bfe_u32 s4, s8, 0x100000
4066; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x100000
4067; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
4068; GFX8-NEXT:    s_or_b32 s0, s0, s4
4069; GFX8-NEXT:    s_bfe_u32 s4, s9, 0x100000
4070; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x100000
4071; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
4072; GFX8-NEXT:    s_or_b32 s1, s1, s4
4073; GFX8-NEXT:    s_bfe_u32 s4, s10, 0x100000
4074; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
4075; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
4076; GFX8-NEXT:    s_or_b32 s2, s2, s4
4077; GFX8-NEXT:    s_bfe_u32 s4, s11, 0x100000
4078; GFX8-NEXT:    s_bfe_u32 s3, s3, 0x100000
4079; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
4080; GFX8-NEXT:    s_or_b32 s3, s3, s4
4081; GFX8-NEXT:    ; return to shader part epilog
4082;
4083; GFX9-LABEL: s_saddsat_v8i16:
4084; GFX9:       ; %bb.0:
4085; GFX9-NEXT:    v_mov_b32_e32 v0, s4
4086; GFX9-NEXT:    v_mov_b32_e32 v1, s5
4087; GFX9-NEXT:    v_mov_b32_e32 v2, s6
4088; GFX9-NEXT:    v_mov_b32_e32 v3, s7
4089; GFX9-NEXT:    v_pk_add_i16 v0, s0, v0 clamp
4090; GFX9-NEXT:    v_pk_add_i16 v1, s1, v1 clamp
4091; GFX9-NEXT:    v_pk_add_i16 v2, s2, v2 clamp
4092; GFX9-NEXT:    v_pk_add_i16 v3, s3, v3 clamp
4093; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
4094; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
4095; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
4096; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
4097; GFX9-NEXT:    ; return to shader part epilog
4098;
4099; GFX10-LABEL: s_saddsat_v8i16:
4100; GFX10:       ; %bb.0:
4101; GFX10-NEXT:    v_pk_add_i16 v0, s0, s4 clamp
4102; GFX10-NEXT:    v_pk_add_i16 v1, s1, s5 clamp
4103; GFX10-NEXT:    v_pk_add_i16 v2, s2, s6 clamp
4104; GFX10-NEXT:    v_pk_add_i16 v3, s3, s7 clamp
4105; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
4106; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
4107; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
4108; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
4109; GFX10-NEXT:    ; return to shader part epilog
4110  %result = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
4111  %cast = bitcast <8 x i16> %result to <4 x i32>
4112  ret <4 x i32> %cast
4113}
4114
4115; FIXME: The i48 tests below are commented out because i48 add is broken.
4116; define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) {
4117;   %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs)
4118;   ret i48 %result
4119; }
4120
4121; define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
4122;   %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs)
4123;   ret i48 %result
4124; }
4125
4126; define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
4127;   %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs)
4128;   %ext.result = zext i48 %result to i64
4129;   %cast = bitcast i64 %ext.result to <2 x float>
4130;   ret <2 x float> %cast
4131; }
4132
4133; define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
4134;   %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs)
4135;   %ext.result = zext i48 %result to i64
4136;   %cast = bitcast i64 %ext.result to <2 x float>
4137;   ret <2 x float> %cast
4138; }
4139
; Signed saturating add of i64 with both operands passed as ordinary
; (non-inreg) arguments.
; Expected code on all four targets has the same shape: a 64-bit add built
; from an add plus add-with-carry, overflow computed as
; (sum < lhs) xor (rhs < 0), and two v_cndmask_b32 that pick either the sum
; or a replacement value. The replacement is formed from the sign fill of the
; sum's high dword (ashr 31): low dword = fill + 0, high dword = fill +
; 0x80000000 + carry. gfx1010 uses the 0x80000000 literal directly where the
; other targets materialize it with v_bfrev_b32.
4140define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
4141; GFX6-LABEL: v_saddsat_i64:
4142; GFX6:       ; %bb.0:
4143; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4144; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v0, v2
4145; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v1, v3, vcc
4146; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
4147; GFX6-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
4148; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
4149; GFX6-NEXT:    v_bfrev_b32_e32 v1, 1
4150; GFX6-NEXT:    v_add_i32_e64 v2, s[6:7], 0, v0
4151; GFX6-NEXT:    v_addc_u32_e64 v1, s[6:7], v0, v1, s[6:7]
4152; GFX6-NEXT:    s_xor_b64 vcc, s[4:5], vcc
4153; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
4154; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
4155; GFX6-NEXT:    s_setpc_b64 s[30:31]
4156;
4157; GFX8-LABEL: v_saddsat_i64:
4158; GFX8:       ; %bb.0:
4159; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4160; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v0, v2
4161; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v1, v3, vcc
4162; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
4163; GFX8-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
4164; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
4165; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
4166; GFX8-NEXT:    v_add_u32_e64 v2, s[6:7], 0, v0
4167; GFX8-NEXT:    v_addc_u32_e64 v1, s[6:7], v0, v1, s[6:7]
4168; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
4169; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
4170; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
4171; GFX8-NEXT:    s_setpc_b64 s[30:31]
4172;
4173; GFX9-LABEL: v_saddsat_i64:
4174; GFX9:       ; %bb.0:
4175; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4176; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v0, v2
4177; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
4178; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
4179; GFX9-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
4180; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
4181; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
4182; GFX9-NEXT:    v_add_co_u32_e64 v2, s[6:7], 0, v0
4183; GFX9-NEXT:    v_addc_co_u32_e64 v1, s[6:7], v0, v1, s[6:7]
4184; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
4185; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
4186; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
4187; GFX9-NEXT:    s_setpc_b64 s[30:31]
4188;
4189; GFX10-LABEL: v_saddsat_i64:
4190; GFX10:       ; %bb.0:
4191; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4192; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4193; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v2
4194; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
4195; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
4196; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
4197; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, v[4:5], v[0:1]
4198; GFX10-NEXT:    v_add_co_u32 v0, s5, v6, 0
4199; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s5, 0x80000000, v6, s5
4200; GFX10-NEXT:    s_xor_b32 vcc_lo, vcc_lo, s4
4201; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
4202; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
4203; GFX10-NEXT:    s_setpc_b64 s[30:31]
4204  %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs)
4205  ret i64 %result
4206}
4207
; Signed saturating add of i64 in an amdgpu_ps function with both operands
; marked inreg.
; Expected code: the 64-bit sum and the replacement value are computed with
; scalar s_add_u32/s_addc_u32 (the carry is round-tripped through
; s_cselect_b32 / s_and_b32 / s_cmp_lg_u32), but the two 64-bit signed
; compares and the final selects are vector instructions (v_cmp_lt_i64,
; v_cndmask_b32), so the result is copied back to SGPRs with
; v_readfirstlane_b32. tahiti, fiji and gfx900 expect identical sequences;
; gfx1010 differs in scheduling and feeds SGPR operands straight into the
; compares and selects.
4208define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
4209; GFX6-LABEL: s_saddsat_i64:
4210; GFX6:       ; %bb.0:
4211; GFX6-NEXT:    s_add_u32 s4, s0, s2
4212; GFX6-NEXT:    s_cselect_b32 s5, 1, 0
4213; GFX6-NEXT:    s_and_b32 s5, s5, 1
4214; GFX6-NEXT:    s_cmp_lg_u32 s5, 0
4215; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4216; GFX6-NEXT:    s_addc_u32 s5, s1, s3
4217; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4218; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4219; GFX6-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
4220; GFX6-NEXT:    s_ashr_i32 s2, s5, 31
4221; GFX6-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4222; GFX6-NEXT:    s_add_u32 s0, s2, 0
4223; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
4224; GFX6-NEXT:    s_and_b32 s1, s1, 1
4225; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
4226; GFX6-NEXT:    s_addc_u32 s1, s2, 0x80000000
4227; GFX6-NEXT:    v_mov_b32_e32 v0, s4
4228; GFX6-NEXT:    v_mov_b32_e32 v1, s0
4229; GFX6-NEXT:    v_mov_b32_e32 v2, s1
4230; GFX6-NEXT:    v_mov_b32_e32 v3, s5
4231; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
4232; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
4233; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
4234; GFX6-NEXT:    v_readfirstlane_b32 s1, v1
4235; GFX6-NEXT:    ; return to shader part epilog
4236;
4237; GFX8-LABEL: s_saddsat_i64:
4238; GFX8:       ; %bb.0:
4239; GFX8-NEXT:    s_add_u32 s4, s0, s2
4240; GFX8-NEXT:    s_cselect_b32 s5, 1, 0
4241; GFX8-NEXT:    s_and_b32 s5, s5, 1
4242; GFX8-NEXT:    s_cmp_lg_u32 s5, 0
4243; GFX8-NEXT:    v_mov_b32_e32 v0, s0
4244; GFX8-NEXT:    s_addc_u32 s5, s1, s3
4245; GFX8-NEXT:    v_mov_b32_e32 v1, s1
4246; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4247; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
4248; GFX8-NEXT:    s_ashr_i32 s2, s5, 31
4249; GFX8-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4250; GFX8-NEXT:    s_add_u32 s0, s2, 0
4251; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
4252; GFX8-NEXT:    s_and_b32 s1, s1, 1
4253; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
4254; GFX8-NEXT:    s_addc_u32 s1, s2, 0x80000000
4255; GFX8-NEXT:    v_mov_b32_e32 v0, s4
4256; GFX8-NEXT:    v_mov_b32_e32 v1, s0
4257; GFX8-NEXT:    v_mov_b32_e32 v2, s1
4258; GFX8-NEXT:    v_mov_b32_e32 v3, s5
4259; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
4260; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
4261; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
4262; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
4263; GFX8-NEXT:    ; return to shader part epilog
4264;
4265; GFX9-LABEL: s_saddsat_i64:
4266; GFX9:       ; %bb.0:
4267; GFX9-NEXT:    s_add_u32 s4, s0, s2
4268; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
4269; GFX9-NEXT:    s_and_b32 s5, s5, 1
4270; GFX9-NEXT:    s_cmp_lg_u32 s5, 0
4271; GFX9-NEXT:    v_mov_b32_e32 v0, s0
4272; GFX9-NEXT:    s_addc_u32 s5, s1, s3
4273; GFX9-NEXT:    v_mov_b32_e32 v1, s1
4274; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
4275; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
4276; GFX9-NEXT:    s_ashr_i32 s2, s5, 31
4277; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4278; GFX9-NEXT:    s_add_u32 s0, s2, 0
4279; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
4280; GFX9-NEXT:    s_and_b32 s1, s1, 1
4281; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
4282; GFX9-NEXT:    s_addc_u32 s1, s2, 0x80000000
4283; GFX9-NEXT:    v_mov_b32_e32 v0, s4
4284; GFX9-NEXT:    v_mov_b32_e32 v1, s0
4285; GFX9-NEXT:    v_mov_b32_e32 v2, s1
4286; GFX9-NEXT:    v_mov_b32_e32 v3, s5
4287; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
4288; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
4289; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
4290; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
4291; GFX9-NEXT:    ; return to shader part epilog
4292;
4293; GFX10-LABEL: s_saddsat_i64:
4294; GFX10:       ; %bb.0:
4295; GFX10-NEXT:    s_add_u32 s4, s0, s2
4296; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
4297; GFX10-NEXT:    v_mov_b32_e32 v0, s4
4298; GFX10-NEXT:    s_and_b32 s5, s5, 1
4299; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
4300; GFX10-NEXT:    s_addc_u32 s5, s1, s3
4301; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
4302; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[2:3], 0
4303; GFX10-NEXT:    s_ashr_i32 s2, s5, 31
4304; GFX10-NEXT:    v_mov_b32_e32 v1, s5
4305; GFX10-NEXT:    s_xor_b32 s3, s1, s0
4306; GFX10-NEXT:    s_add_u32 s0, s2, 0
4307; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
4308; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s0, s3
4309; GFX10-NEXT:    s_and_b32 s1, s1, 1
4310; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
4311; GFX10-NEXT:    s_addc_u32 s1, s2, 0x80000000
4312; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
4313; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s1, s3
4314; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
4315; GFX10-NEXT:    ; return to shader part epilog
4316  %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs)
4317  ret i64 %result
4318}
4319
; Signed saturating add of i64 in an amdgpu_ps function with a mixed operand
; placement: %lhs is marked inreg, %rhs is not. The result is returned
; bitcast to <2 x float>.
; Expected code is all vector ALU: add plus add-with-carry taking the lhs
; from s[0:1], overflow computed as (lhs > sum) xor (0 > rhs), and two
; v_cndmask_b32 selecting between the sum and the replacement value built
; from the sum's sign fill plus 0x80000000 in the high dword. No
; v_readfirstlane_b32 is expected; the result stays in v0/v1.
4320define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
4321; GFX6-LABEL: saddsat_i64_sv:
4322; GFX6:       ; %bb.0:
4323; GFX6-NEXT:    v_mov_b32_e32 v3, s1
4324; GFX6-NEXT:    v_add_i32_e32 v2, vcc, s0, v0
4325; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v1, vcc
4326; GFX6-NEXT:    v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3]
4327; GFX6-NEXT:    v_cmp_gt_i64_e64 s[0:1], 0, v[0:1]
4328; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4329; GFX6-NEXT:    v_bfrev_b32_e32 v1, 1
4330; GFX6-NEXT:    v_add_i32_e64 v4, s[2:3], 0, v0
4331; GFX6-NEXT:    v_addc_u32_e64 v1, s[2:3], v0, v1, s[2:3]
4332; GFX6-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4333; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
4334; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4335; GFX6-NEXT:    ; return to shader part epilog
4336;
4337; GFX8-LABEL: saddsat_i64_sv:
4338; GFX8:       ; %bb.0:
4339; GFX8-NEXT:    v_mov_b32_e32 v3, s1
4340; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v0
4341; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v3, v1, vcc
4342; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3]
4343; GFX8-NEXT:    v_cmp_gt_i64_e64 s[0:1], 0, v[0:1]
4344; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4345; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
4346; GFX8-NEXT:    v_add_u32_e64 v4, s[2:3], 0, v0
4347; GFX8-NEXT:    v_addc_u32_e64 v1, s[2:3], v0, v1, s[2:3]
4348; GFX8-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4349; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
4350; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4351; GFX8-NEXT:    ; return to shader part epilog
4352;
4353; GFX9-LABEL: saddsat_i64_sv:
4354; GFX9:       ; %bb.0:
4355; GFX9-NEXT:    v_mov_b32_e32 v3, s1
4356; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
4357; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v1, vcc
4358; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3]
4359; GFX9-NEXT:    v_cmp_gt_i64_e64 s[0:1], 0, v[0:1]
4360; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4361; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
4362; GFX9-NEXT:    v_add_co_u32_e64 v4, s[2:3], 0, v0
4363; GFX9-NEXT:    v_addc_co_u32_e64 v1, s[2:3], v0, v1, s[2:3]
4364; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4365; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
4366; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4367; GFX9-NEXT:    ; return to shader part epilog
4368;
4369; GFX10-LABEL: saddsat_i64_sv:
4370; GFX10:       ; %bb.0:
4371; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, s0, v0
4372; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4373; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[0:1]
4374; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
4375; GFX10-NEXT:    v_cmp_gt_i64_e64 s0, s[0:1], v[2:3]
4376; GFX10-NEXT:    v_add_co_u32 v0, s1, v4, 0
4377; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s1, 0x80000000, v4, s1
4378; GFX10-NEXT:    s_xor_b32 vcc_lo, vcc_lo, s0
4379; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
4380; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
4381; GFX10-NEXT:    ; return to shader part epilog
4382  %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs)
4383  %cast = bitcast i64 %result to <2 x float>
4384  ret <2 x float> %cast
4385}
4386
; i64 signed saturating add in an amdgpu_ps function where %lhs is a plain
; argument and %rhs is marked inreg. The expected code below reads %lhs from
; v[0:1] and %rhs from s[0:1], and leaves the <2 x float> result in v0/v1.
; The expected sequence selects between the wrapped sum and a saturated value
; using (sum < lhs) xor (rhs < 0) as the condition; the saturated value is
; built from the sign of the sum's high half plus 0x80000000.
4387define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
4388; GFX6-LABEL: saddsat_i64_vs:
4389; GFX6:       ; %bb.0:
4390; GFX6-NEXT:    v_mov_b32_e32 v3, s1
4391; GFX6-NEXT:    v_add_i32_e32 v2, vcc, s0, v0
4392; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v1, v3, vcc
4393; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1]
4394; GFX6-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[0:1], 0
4395; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4396; GFX6-NEXT:    v_bfrev_b32_e32 v1, 1
4397; GFX6-NEXT:    v_add_i32_e64 v4, s[0:1], 0, v0
4398; GFX6-NEXT:    v_addc_u32_e64 v1, s[0:1], v0, v1, s[0:1]
4399; GFX6-NEXT:    s_xor_b64 vcc, s[2:3], vcc
4400; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
4401; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4402; GFX6-NEXT:    ; return to shader part epilog
4403;
4404; GFX8-LABEL: saddsat_i64_vs:
4405; GFX8:       ; %bb.0:
4406; GFX8-NEXT:    v_mov_b32_e32 v3, s1
4407; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v0
4408; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v1, v3, vcc
4409; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1]
4410; GFX8-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[0:1], 0
4411; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4412; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
4413; GFX8-NEXT:    v_add_u32_e64 v4, s[0:1], 0, v0
4414; GFX8-NEXT:    v_addc_u32_e64 v1, s[0:1], v0, v1, s[0:1]
4415; GFX8-NEXT:    s_xor_b64 vcc, s[2:3], vcc
4416; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
4417; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4418; GFX8-NEXT:    ; return to shader part epilog
4419;
4420; GFX9-LABEL: saddsat_i64_vs:
4421; GFX9:       ; %bb.0:
4422; GFX9-NEXT:    v_mov_b32_e32 v3, s1
4423; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
4424; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
4425; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1]
4426; GFX9-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[0:1], 0
4427; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
4428; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
4429; GFX9-NEXT:    v_add_co_u32_e64 v4, s[0:1], 0, v0
4430; GFX9-NEXT:    v_addc_co_u32_e64 v1, s[0:1], v0, v1, s[0:1]
4431; GFX9-NEXT:    s_xor_b64 vcc, s[2:3], vcc
4432; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
4433; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
4434; GFX9-NEXT:    ; return to shader part epilog
4435;
4436; GFX10-LABEL: saddsat_i64_vs:
4437; GFX10:       ; %bb.0:
4438; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v0, s0
4439; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
4440; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[0:1], 0
4441; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
4442; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
4443; GFX10-NEXT:    v_add_co_u32 v0, s0, v4, 0
4444; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, 0x80000000, v4, s0
4445; GFX10-NEXT:    s_xor_b32 vcc_lo, s1, vcc_lo
4446; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
4447; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
4448; GFX10-NEXT:    ; return to shader part epilog
; The intrinsic call is the operation under test; the bitcast only changes the
; type in which the 64-bit result is returned.
4449  %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs)
4450  %cast = bitcast i64 %result to <2 x float>
4451  ret <2 x float> %cast
4452}
4453
; <2 x i64> signed saturating add in a function with no explicit calling
; convention and no inreg arguments. The expected code below takes %lhs in
; v[0:3] and %rhs in v[4:7], returns the result in v[0:3], and ends with
; s_setpc_b64 s[30:31]. Each 64-bit element gets the same add / compare / xor /
; select sequence. On the tahiti, fiji and gfx900 runs the 0x80000000 constant
; is materialized once (v_bfrev_b32 into v10) and reused for both elements;
; the gfx1010 run uses it as a literal operand and interleaves the two
; elements' instructions.
4454define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
4455; GFX6-LABEL: v_saddsat_v2i64:
4456; GFX6:       ; %bb.0:
4457; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4458; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v0, v4
4459; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v1, v5, vcc
4460; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1]
4461; GFX6-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[4:5]
4462; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v9
4463; GFX6-NEXT:    v_bfrev_b32_e32 v10, 1
4464; GFX6-NEXT:    v_add_i32_e64 v1, s[6:7], 0, v0
4465; GFX6-NEXT:    v_addc_u32_e64 v4, s[6:7], v0, v10, s[6:7]
4466; GFX6-NEXT:    s_xor_b64 vcc, s[4:5], vcc
4467; GFX6-NEXT:    v_cndmask_b32_e32 v0, v8, v1, vcc
4468; GFX6-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
4469; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v2, v6
4470; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v3, v7, vcc
4471; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3]
4472; GFX6-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[6:7]
4473; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v5
4474; GFX6-NEXT:    v_add_i32_e64 v3, s[6:7], 0, v2
4475; GFX6-NEXT:    v_addc_u32_e64 v6, s[6:7], v2, v10, s[6:7]
4476; GFX6-NEXT:    s_xor_b64 vcc, s[4:5], vcc
4477; GFX6-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
4478; GFX6-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
4479; GFX6-NEXT:    s_setpc_b64 s[30:31]
4480;
4481; GFX8-LABEL: v_saddsat_v2i64:
4482; GFX8:       ; %bb.0:
4483; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4484; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v0, v4
4485; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, v1, v5, vcc
4486; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1]
4487; GFX8-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[4:5]
4488; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v9
4489; GFX8-NEXT:    v_bfrev_b32_e32 v10, 1
4490; GFX8-NEXT:    v_add_u32_e64 v1, s[6:7], 0, v0
4491; GFX8-NEXT:    v_addc_u32_e64 v4, s[6:7], v0, v10, s[6:7]
4492; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
4493; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v1, vcc
4494; GFX8-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
4495; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v2, v6
4496; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v3, v7, vcc
4497; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3]
4498; GFX8-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[6:7]
4499; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v5
4500; GFX8-NEXT:    v_add_u32_e64 v3, s[6:7], 0, v2
4501; GFX8-NEXT:    v_addc_u32_e64 v6, s[6:7], v2, v10, s[6:7]
4502; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
4503; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
4504; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
4505; GFX8-NEXT:    s_setpc_b64 s[30:31]
4506;
4507; GFX9-LABEL: v_saddsat_v2i64:
4508; GFX9:       ; %bb.0:
4509; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4510; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v0, v4
4511; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v1, v5, vcc
4512; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1]
4513; GFX9-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[4:5]
4514; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v9
4515; GFX9-NEXT:    v_bfrev_b32_e32 v10, 1
4516; GFX9-NEXT:    v_add_co_u32_e64 v1, s[6:7], 0, v0
4517; GFX9-NEXT:    v_addc_co_u32_e64 v4, s[6:7], v0, v10, s[6:7]
4518; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
4519; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v1, vcc
4520; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
4521; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v2, v6
4522; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v3, v7, vcc
4523; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3]
4524; GFX9-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[6:7]
4525; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 31, v5
4526; GFX9-NEXT:    v_add_co_u32_e64 v3, s[6:7], 0, v2
4527; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[6:7], v2, v10, s[6:7]
4528; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
4529; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
4530; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
4531; GFX9-NEXT:    s_setpc_b64 s[30:31]
4532;
4533; GFX10-LABEL: v_saddsat_v2i64:
4534; GFX10:       ; %bb.0:
4535; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4536; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4537; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v0, v4
4538; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo
4539; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, v2, v6
4540; GFX10-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo
4541; GFX10-NEXT:    v_ashrrev_i32_e32 v12, 31, v9
4542; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1]
4543; GFX10-NEXT:    v_cmp_gt_i64_e64 s4, 0, v[4:5]
4544; GFX10-NEXT:    v_ashrrev_i32_e32 v0, 31, v11
4545; GFX10-NEXT:    v_cmp_gt_i64_e64 s6, 0, v[6:7]
4546; GFX10-NEXT:    v_add_co_u32 v1, s5, v12, 0
4547; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s5, 0x80000000, v12, s5
4548; GFX10-NEXT:    v_cmp_lt_i64_e64 s5, v[10:11], v[2:3]
4549; GFX10-NEXT:    v_add_co_u32 v2, s7, v0, 0
4550; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s7, 0x80000000, v0, s7
4551; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
4552; GFX10-NEXT:    v_cndmask_b32_e32 v0, v8, v1, vcc_lo
4553; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc_lo
4554; GFX10-NEXT:    s_xor_b32 vcc_lo, s6, s5
4555; GFX10-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc_lo
4556; GFX10-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc_lo
4557; GFX10-NEXT:    s_setpc_b64 s[30:31]
; A single vector intrinsic call is the whole body; the result is returned
; directly with no cast.
4558  %result = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
4559  ret <2 x i64> %result
4560}
4561
; <2 x i64> signed saturating add in an amdgpu_ps function with both operands
; marked inreg. The expected code below takes %lhs in s[0:3] and %rhs in
; s[4:7] and returns the result in s0-s3. The additions use the scalar
; s_add_u32 / s_addc_u32 pair, with the carry passed through an
; s_cselect / s_and / s_cmp_lg sequence. The 64-bit compares and the final
; selects use vector instructions (v_cmp_lt_i64, v_cndmask_b32), and the
; selected values are moved back to scalar registers with
; v_readfirstlane_b32. The gfx1010 run compares scalar register pairs
; directly, whereas the other runs first copy %lhs into vector registers.
4562define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs) {
4563; GFX6-LABEL: s_saddsat_v2i64:
4564; GFX6:       ; %bb.0:
4565; GFX6-NEXT:    s_add_u32 s8, s0, s4
4566; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
4567; GFX6-NEXT:    s_and_b32 s9, s9, 1
4568; GFX6-NEXT:    s_cmp_lg_u32 s9, 0
4569; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4570; GFX6-NEXT:    s_addc_u32 s9, s1, s5
4571; GFX6-NEXT:    v_mov_b32_e32 v1, s1
4572; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
4573; GFX6-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
4574; GFX6-NEXT:    s_ashr_i32 s4, s9, 31
4575; GFX6-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4576; GFX6-NEXT:    s_add_u32 s0, s4, 0
4577; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
4578; GFX6-NEXT:    s_and_b32 s1, s1, 1
4579; GFX6-NEXT:    s_brev_b32 s5, 1
4580; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
4581; GFX6-NEXT:    s_addc_u32 s1, s4, s5
4582; GFX6-NEXT:    v_mov_b32_e32 v1, s0
4583; GFX6-NEXT:    s_add_u32 s0, s2, s6
4584; GFX6-NEXT:    v_mov_b32_e32 v2, s1
4585; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
4586; GFX6-NEXT:    v_mov_b32_e32 v0, s8
4587; GFX6-NEXT:    s_and_b32 s1, s1, 1
4588; GFX6-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
4589; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
4590; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4591; GFX6-NEXT:    v_mov_b32_e32 v3, s9
4592; GFX6-NEXT:    s_addc_u32 s1, s3, s7
4593; GFX6-NEXT:    v_mov_b32_e32 v1, s3
4594; GFX6-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
4595; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
4596; GFX6-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
4597; GFX6-NEXT:    s_ashr_i32 s4, s1, 31
4598; GFX6-NEXT:    s_xor_b64 vcc, s[2:3], vcc
4599; GFX6-NEXT:    v_mov_b32_e32 v0, s0
4600; GFX6-NEXT:    s_add_u32 s0, s4, 0
4601; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
4602; GFX6-NEXT:    s_and_b32 s2, s2, 1
4603; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
4604; GFX6-NEXT:    s_addc_u32 s3, s4, s5
4605; GFX6-NEXT:    v_mov_b32_e32 v1, s0
4606; GFX6-NEXT:    v_mov_b32_e32 v3, s3
4607; GFX6-NEXT:    v_mov_b32_e32 v5, s1
4608; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
4609; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
4610; GFX6-NEXT:    v_readfirstlane_b32 s0, v4
4611; GFX6-NEXT:    v_readfirstlane_b32 s1, v2
4612; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
4613; GFX6-NEXT:    v_readfirstlane_b32 s3, v1
4614; GFX6-NEXT:    ; return to shader part epilog
4615;
4616; GFX8-LABEL: s_saddsat_v2i64:
4617; GFX8:       ; %bb.0:
4618; GFX8-NEXT:    s_add_u32 s8, s0, s4
4619; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
4620; GFX8-NEXT:    s_and_b32 s9, s9, 1
4621; GFX8-NEXT:    s_cmp_lg_u32 s9, 0
4622; GFX8-NEXT:    v_mov_b32_e32 v0, s0
4623; GFX8-NEXT:    s_addc_u32 s9, s1, s5
4624; GFX8-NEXT:    v_mov_b32_e32 v1, s1
4625; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
4626; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
4627; GFX8-NEXT:    s_ashr_i32 s4, s9, 31
4628; GFX8-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4629; GFX8-NEXT:    s_add_u32 s0, s4, 0
4630; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
4631; GFX8-NEXT:    s_and_b32 s1, s1, 1
4632; GFX8-NEXT:    s_brev_b32 s5, 1
4633; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
4634; GFX8-NEXT:    s_addc_u32 s1, s4, s5
4635; GFX8-NEXT:    v_mov_b32_e32 v1, s0
4636; GFX8-NEXT:    s_add_u32 s0, s2, s6
4637; GFX8-NEXT:    v_mov_b32_e32 v2, s1
4638; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
4639; GFX8-NEXT:    v_mov_b32_e32 v0, s8
4640; GFX8-NEXT:    s_and_b32 s1, s1, 1
4641; GFX8-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
4642; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
4643; GFX8-NEXT:    v_mov_b32_e32 v0, s2
4644; GFX8-NEXT:    v_mov_b32_e32 v3, s9
4645; GFX8-NEXT:    s_addc_u32 s1, s3, s7
4646; GFX8-NEXT:    v_mov_b32_e32 v1, s3
4647; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
4648; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
4649; GFX8-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
4650; GFX8-NEXT:    s_ashr_i32 s4, s1, 31
4651; GFX8-NEXT:    s_xor_b64 vcc, s[2:3], vcc
4652; GFX8-NEXT:    v_mov_b32_e32 v0, s0
4653; GFX8-NEXT:    s_add_u32 s0, s4, 0
4654; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
4655; GFX8-NEXT:    s_and_b32 s2, s2, 1
4656; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
4657; GFX8-NEXT:    s_addc_u32 s3, s4, s5
4658; GFX8-NEXT:    v_mov_b32_e32 v1, s0
4659; GFX8-NEXT:    v_mov_b32_e32 v3, s3
4660; GFX8-NEXT:    v_mov_b32_e32 v5, s1
4661; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
4662; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
4663; GFX8-NEXT:    v_readfirstlane_b32 s0, v4
4664; GFX8-NEXT:    v_readfirstlane_b32 s1, v2
4665; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
4666; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
4667; GFX8-NEXT:    ; return to shader part epilog
4668;
4669; GFX9-LABEL: s_saddsat_v2i64:
4670; GFX9:       ; %bb.0:
4671; GFX9-NEXT:    s_add_u32 s8, s0, s4
4672; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
4673; GFX9-NEXT:    s_and_b32 s9, s9, 1
4674; GFX9-NEXT:    s_cmp_lg_u32 s9, 0
4675; GFX9-NEXT:    v_mov_b32_e32 v0, s0
4676; GFX9-NEXT:    s_addc_u32 s9, s1, s5
4677; GFX9-NEXT:    v_mov_b32_e32 v1, s1
4678; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
4679; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
4680; GFX9-NEXT:    s_ashr_i32 s4, s9, 31
4681; GFX9-NEXT:    s_xor_b64 vcc, s[0:1], vcc
4682; GFX9-NEXT:    s_add_u32 s0, s4, 0
4683; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
4684; GFX9-NEXT:    s_and_b32 s1, s1, 1
4685; GFX9-NEXT:    s_brev_b32 s5, 1
4686; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
4687; GFX9-NEXT:    s_addc_u32 s1, s4, s5
4688; GFX9-NEXT:    v_mov_b32_e32 v1, s0
4689; GFX9-NEXT:    s_add_u32 s0, s2, s6
4690; GFX9-NEXT:    v_mov_b32_e32 v2, s1
4691; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
4692; GFX9-NEXT:    v_mov_b32_e32 v0, s8
4693; GFX9-NEXT:    s_and_b32 s1, s1, 1
4694; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
4695; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
4696; GFX9-NEXT:    v_mov_b32_e32 v0, s2
4697; GFX9-NEXT:    v_mov_b32_e32 v3, s9
4698; GFX9-NEXT:    s_addc_u32 s1, s3, s7
4699; GFX9-NEXT:    v_mov_b32_e32 v1, s3
4700; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
4701; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
4702; GFX9-NEXT:    v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
4703; GFX9-NEXT:    s_ashr_i32 s4, s1, 31
4704; GFX9-NEXT:    s_xor_b64 vcc, s[2:3], vcc
4705; GFX9-NEXT:    v_mov_b32_e32 v0, s0
4706; GFX9-NEXT:    s_add_u32 s0, s4, 0
4707; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
4708; GFX9-NEXT:    s_and_b32 s2, s2, 1
4709; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
4710; GFX9-NEXT:    s_addc_u32 s3, s4, s5
4711; GFX9-NEXT:    v_mov_b32_e32 v1, s0
4712; GFX9-NEXT:    v_mov_b32_e32 v3, s3
4713; GFX9-NEXT:    v_mov_b32_e32 v5, s1
4714; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
4715; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
4716; GFX9-NEXT:    v_readfirstlane_b32 s0, v4
4717; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
4718; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
4719; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
4720; GFX9-NEXT:    ; return to shader part epilog
4721;
4722; GFX10-LABEL: s_saddsat_v2i64:
4723; GFX10:       ; %bb.0:
4724; GFX10-NEXT:    s_add_u32 s8, s0, s4
4725; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
4726; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, s[4:5], 0
4727; GFX10-NEXT:    s_and_b32 s9, s9, 1
4728; GFX10-NEXT:    v_mov_b32_e32 v0, s8
4729; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
4730; GFX10-NEXT:    s_brev_b32 s10, 1
4731; GFX10-NEXT:    s_addc_u32 s9, s1, s5
4732; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[8:9], s[0:1]
4733; GFX10-NEXT:    s_ashr_i32 s1, s9, 31
4734; GFX10-NEXT:    v_mov_b32_e32 v1, s9
4735; GFX10-NEXT:    s_xor_b32 s8, s4, s0
4736; GFX10-NEXT:    s_add_u32 s0, s1, 0
4737; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
4738; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s0, s8
4739; GFX10-NEXT:    s_and_b32 s4, s4, 1
4740; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
4741; GFX10-NEXT:    s_addc_u32 s1, s1, s10
4742; GFX10-NEXT:    s_add_u32 s4, s2, s6
4743; GFX10-NEXT:    s_cselect_b32 s5, 1, 0
4744; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s1, s8
4745; GFX10-NEXT:    s_and_b32 s5, s5, 1
4746; GFX10-NEXT:    v_mov_b32_e32 v2, s4
4747; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
4748; GFX10-NEXT:    s_addc_u32 s5, s3, s7
4749; GFX10-NEXT:    v_cmp_lt_i64_e64 s2, s[4:5], s[2:3]
4750; GFX10-NEXT:    v_cmp_lt_i64_e64 s3, s[6:7], 0
4751; GFX10-NEXT:    s_ashr_i32 s1, s5, 31
4752; GFX10-NEXT:    v_mov_b32_e32 v3, s5
4753; GFX10-NEXT:    s_xor_b32 s2, s3, s2
4754; GFX10-NEXT:    s_add_u32 s0, s1, 0
4755; GFX10-NEXT:    s_cselect_b32 s3, 1, 0
4756; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s0, s2
4757; GFX10-NEXT:    s_and_b32 s3, s3, 1
4758; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
4759; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
4760; GFX10-NEXT:    s_addc_u32 s1, s1, s10
4761; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s1, s2
4762; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
4763; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
4764; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
4765; GFX10-NEXT:    ; return to shader part epilog
; Same intrinsic as the VGPR variant; only the calling convention and the
; inreg attributes on the arguments differ.
4766  %result = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
4767  ret <2 x i64> %result
4768}
4769
; i128 signed saturating add in an amdgpu_ps function with both operands
; marked inreg. The expected code below takes %lhs in s[0:3] and %rhs in
; s[4:7] and returns the result in s0-s3 via v_readfirstlane_b32.
; Shape of the expected sequence:
;   * The sum is a four-dword s_add_u32 / s_addc_u32 chain; its low half lands
;     in s[4:5] and its high half in s[8:9].
;   * "sum < lhs" is assembled from an unsigned compare of the low halves and a
;     signed compare of the high halves, picked by equality of the high halves.
;   * A bit derived from compares of %rhs's high half against zero is xor'ed
;     with it; the low bit of the xor is the select condition.
;   * The alternative value is a second four-dword add chain seeded with
;     s_ashr_i32 of the sum's top dword by 31, adding 0, 0, 0 and 0x80000000.
; The tahiti run does the 64-bit equality compares with v_cmp_eq_u64; the
; other runs use s_cmp_eq_u64.
4770define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
4771; GFX6-LABEL: s_saddsat_i128:
4772; GFX6:       ; %bb.0:
4773; GFX6-NEXT:    s_add_u32 s4, s0, s4
4774; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
4775; GFX6-NEXT:    s_and_b32 s8, s8, 1
4776; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
4777; GFX6-NEXT:    s_addc_u32 s5, s1, s5
4778; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
4779; GFX6-NEXT:    s_and_b32 s8, s8, 1
4780; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
4781; GFX6-NEXT:    s_addc_u32 s8, s2, s6
4782; GFX6-NEXT:    s_cselect_b32 s9, 1, 0
4783; GFX6-NEXT:    v_mov_b32_e32 v3, s1
4784; GFX6-NEXT:    s_and_b32 s9, s9, 1
4785; GFX6-NEXT:    v_mov_b32_e32 v2, s0
4786; GFX6-NEXT:    s_cmp_lg_u32 s9, 0
4787; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4788; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
4789; GFX6-NEXT:    s_addc_u32 s9, s3, s7
4790; GFX6-NEXT:    v_mov_b32_e32 v1, s3
4791; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
4792; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
4793; GFX6-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[6:7], 0
4794; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
4795; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
4796; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
4797; GFX6-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], 0
4798; GFX6-NEXT:    s_ashr_i32 s3, s9, 31
4799; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
4800; GFX6-NEXT:    s_add_u32 s0, s3, 0
4801; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
4802; GFX6-NEXT:    s_and_b32 s1, s1, 1
4803; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
4804; GFX6-NEXT:    s_addc_u32 s1, s3, 0
4805; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
4806; GFX6-NEXT:    s_and_b32 s2, s2, 1
4807; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
4808; GFX6-NEXT:    s_addc_u32 s2, s3, 0
4809; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
4810; GFX6-NEXT:    s_cselect_b32 s6, 1, 0
4811; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
4812; GFX6-NEXT:    s_and_b32 s6, s6, 1
4813; GFX6-NEXT:    s_cmp_lg_u32 s6, 0
4814; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
4815; GFX6-NEXT:    s_addc_u32 s3, s3, 0x80000000
4816; GFX6-NEXT:    v_mov_b32_e32 v1, s0
4817; GFX6-NEXT:    v_mov_b32_e32 v2, s1
4818; GFX6-NEXT:    v_mov_b32_e32 v3, s4
4819; GFX6-NEXT:    v_mov_b32_e32 v4, s5
4820; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
4821; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
4822; GFX6-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
4823; GFX6-NEXT:    v_mov_b32_e32 v2, s2
4824; GFX6-NEXT:    v_mov_b32_e32 v3, s3
4825; GFX6-NEXT:    v_mov_b32_e32 v4, s8
4826; GFX6-NEXT:    v_mov_b32_e32 v5, s9
4827; GFX6-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
4828; GFX6-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
4829; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
4830; GFX6-NEXT:    v_readfirstlane_b32 s1, v1
4831; GFX6-NEXT:    v_readfirstlane_b32 s2, v2
4832; GFX6-NEXT:    v_readfirstlane_b32 s3, v3
4833; GFX6-NEXT:    ; return to shader part epilog
4834;
4835; GFX8-LABEL: s_saddsat_i128:
4836; GFX8:       ; %bb.0:
4837; GFX8-NEXT:    s_add_u32 s4, s0, s4
4838; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
4839; GFX8-NEXT:    s_and_b32 s8, s8, 1
4840; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
4841; GFX8-NEXT:    s_addc_u32 s5, s1, s5
4842; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
4843; GFX8-NEXT:    s_and_b32 s8, s8, 1
4844; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
4845; GFX8-NEXT:    s_addc_u32 s8, s2, s6
4846; GFX8-NEXT:    s_cselect_b32 s9, 1, 0
4847; GFX8-NEXT:    s_and_b32 s9, s9, 1
4848; GFX8-NEXT:    v_mov_b32_e32 v3, s1
4849; GFX8-NEXT:    s_cmp_lg_u32 s9, 0
4850; GFX8-NEXT:    v_mov_b32_e32 v2, s0
4851; GFX8-NEXT:    s_addc_u32 s9, s3, s7
4852; GFX8-NEXT:    v_mov_b32_e32 v0, s2
4853; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
4854; GFX8-NEXT:    v_mov_b32_e32 v1, s3
4855; GFX8-NEXT:    s_cmp_eq_u64 s[8:9], s[2:3]
4856; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
4857; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
4858; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
4859; GFX8-NEXT:    s_and_b32 s0, 1, s2
4860; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
4861; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
4862; GFX8-NEXT:    s_cmp_eq_u64 s[6:7], 0
4863; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[6:7], 0
4864; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
4865; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
4866; GFX8-NEXT:    s_and_b32 s0, 1, s2
4867; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
4868; GFX8-NEXT:    s_ashr_i32 s3, s9, 31
4869; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
4870; GFX8-NEXT:    s_add_u32 s0, s3, 0
4871; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
4872; GFX8-NEXT:    s_and_b32 s1, s1, 1
4873; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
4874; GFX8-NEXT:    s_addc_u32 s1, s3, 0
4875; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
4876; GFX8-NEXT:    s_and_b32 s2, s2, 1
4877; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
4878; GFX8-NEXT:    s_addc_u32 s2, s3, 0
4879; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4880; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
4881; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
4882; GFX8-NEXT:    s_and_b32 s6, s6, 1
4883; GFX8-NEXT:    s_cmp_lg_u32 s6, 0
4884; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
4885; GFX8-NEXT:    s_addc_u32 s3, s3, 0x80000000
4886; GFX8-NEXT:    v_mov_b32_e32 v1, s0
4887; GFX8-NEXT:    v_mov_b32_e32 v2, s1
4888; GFX8-NEXT:    v_mov_b32_e32 v3, s4
4889; GFX8-NEXT:    v_mov_b32_e32 v4, s5
4890; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
4891; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
4892; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
4893; GFX8-NEXT:    v_mov_b32_e32 v2, s2
4894; GFX8-NEXT:    v_mov_b32_e32 v3, s3
4895; GFX8-NEXT:    v_mov_b32_e32 v4, s8
4896; GFX8-NEXT:    v_mov_b32_e32 v5, s9
4897; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
4898; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
4899; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
4900; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
4901; GFX8-NEXT:    v_readfirstlane_b32 s2, v2
4902; GFX8-NEXT:    v_readfirstlane_b32 s3, v3
4903; GFX8-NEXT:    ; return to shader part epilog
4904;
4905; GFX9-LABEL: s_saddsat_i128:
4906; GFX9:       ; %bb.0:
4907; GFX9-NEXT:    s_add_u32 s4, s0, s4
4908; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
4909; GFX9-NEXT:    s_and_b32 s8, s8, 1
4910; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
4911; GFX9-NEXT:    s_addc_u32 s5, s1, s5
4912; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
4913; GFX9-NEXT:    s_and_b32 s8, s8, 1
4914; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
4915; GFX9-NEXT:    s_addc_u32 s8, s2, s6
4916; GFX9-NEXT:    s_cselect_b32 s9, 1, 0
4917; GFX9-NEXT:    s_and_b32 s9, s9, 1
4918; GFX9-NEXT:    v_mov_b32_e32 v3, s1
4919; GFX9-NEXT:    s_cmp_lg_u32 s9, 0
4920; GFX9-NEXT:    v_mov_b32_e32 v2, s0
4921; GFX9-NEXT:    s_addc_u32 s9, s3, s7
4922; GFX9-NEXT:    v_mov_b32_e32 v0, s2
4923; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
4924; GFX9-NEXT:    v_mov_b32_e32 v1, s3
4925; GFX9-NEXT:    s_cmp_eq_u64 s[8:9], s[2:3]
4926; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
4927; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
4928; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
4929; GFX9-NEXT:    s_and_b32 s0, 1, s2
4930; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
4931; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
4932; GFX9-NEXT:    s_cmp_eq_u64 s[6:7], 0
4933; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[6:7], 0
4934; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
4935; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
4936; GFX9-NEXT:    s_and_b32 s0, 1, s2
4937; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
4938; GFX9-NEXT:    s_ashr_i32 s3, s9, 31
4939; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
4940; GFX9-NEXT:    s_add_u32 s0, s3, 0
4941; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
4942; GFX9-NEXT:    s_and_b32 s1, s1, 1
4943; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
4944; GFX9-NEXT:    s_addc_u32 s1, s3, 0
4945; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
4946; GFX9-NEXT:    s_and_b32 s2, s2, 1
4947; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
4948; GFX9-NEXT:    s_addc_u32 s2, s3, 0
4949; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4950; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
4951; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
4952; GFX9-NEXT:    s_and_b32 s6, s6, 1
4953; GFX9-NEXT:    s_cmp_lg_u32 s6, 0
4954; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
4955; GFX9-NEXT:    s_addc_u32 s3, s3, 0x80000000
4956; GFX9-NEXT:    v_mov_b32_e32 v1, s0
4957; GFX9-NEXT:    v_mov_b32_e32 v2, s1
4958; GFX9-NEXT:    v_mov_b32_e32 v3, s4
4959; GFX9-NEXT:    v_mov_b32_e32 v4, s5
4960; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
4961; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
4962; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
4963; GFX9-NEXT:    v_mov_b32_e32 v2, s2
4964; GFX9-NEXT:    v_mov_b32_e32 v3, s3
4965; GFX9-NEXT:    v_mov_b32_e32 v4, s8
4966; GFX9-NEXT:    v_mov_b32_e32 v5, s9
4967; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
4968; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
4969; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
4970; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
4971; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
4972; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
4973; GFX9-NEXT:    ; return to shader part epilog
4974;
4975; GFX10-LABEL: s_saddsat_i128:
4976; GFX10:       ; %bb.0:
4977; GFX10-NEXT:    s_add_u32 s4, s0, s4
4978; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
4979; GFX10-NEXT:    s_and_b32 s8, s8, 1
4980; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
4981; GFX10-NEXT:    s_addc_u32 s5, s1, s5
4982; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
4983; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[4:5], s[0:1]
4984; GFX10-NEXT:    s_and_b32 s8, s8, 1
4985; GFX10-NEXT:    v_mov_b32_e32 v2, s5
4986; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
4987; GFX10-NEXT:    s_addc_u32 s8, s2, s6
4988; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
4989; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
4990; GFX10-NEXT:    s_and_b32 s9, s9, 1
4991; GFX10-NEXT:    v_mov_b32_e32 v3, s8
4992; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
4993; GFX10-NEXT:    s_addc_u32 s9, s3, s7
4994; GFX10-NEXT:    s_cmp_eq_u64 s[8:9], s[2:3]
4995; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[8:9], s[2:3]
4996; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
4997; GFX10-NEXT:    v_mov_b32_e32 v4, s9
4998; GFX10-NEXT:    s_and_b32 s0, 1, s0
4999; GFX10-NEXT:    s_cmp_eq_u64 s[6:7], 0
5000; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
5001; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[6:7], 0
5002; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
5003; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
5004; GFX10-NEXT:    s_ashr_i32 s3, s9, 31
5005; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5006; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
5007; GFX10-NEXT:    s_and_b32 s0, 1, s1
5008; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
5009; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s0
5010; GFX10-NEXT:    s_add_u32 s0, s3, 0
5011; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
5012; GFX10-NEXT:    s_and_b32 s1, s1, 1
5013; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
5014; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
5015; GFX10-NEXT:    v_mov_b32_e32 v1, s4
5016; GFX10-NEXT:    s_addc_u32 s1, s3, 0
5017; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
5018; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
5019; GFX10-NEXT:    s_and_b32 s2, s2, 1
5020; GFX10-NEXT:    s_cmp_lg_u32 s2, 0
5021; GFX10-NEXT:    s_addc_u32 s2, s3, 0
5022; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
5023; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
5024; GFX10-NEXT:    s_and_b32 s4, s4, 1
5025; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
5026; GFX10-NEXT:    s_addc_u32 s3, s3, 0x80000000
5027; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, s0, vcc_lo
5028; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, s1, vcc_lo
5029; GFX10-NEXT:    v_cndmask_b32_e64 v2, v3, s2, vcc_lo
5030; GFX10-NEXT:    v_cndmask_b32_e64 v3, v4, s3, vcc_lo
5031; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
5032; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
5033; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
5034; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
5035; GFX10-NEXT:    ; return to shader part epilog
; The 128-bit intrinsic call is the whole body; the i128 result is returned
; directly with no cast.
5036  %result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs)
5037  ret i128 %result
5038}
5039
5040define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
5041; GFX6-LABEL: saddsat_i128_sv:
5042; GFX6:       ; %bb.0:
5043; GFX6-NEXT:    v_mov_b32_e32 v4, s1
5044; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
5045; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v4, v1, vcc
5046; GFX6-NEXT:    v_mov_b32_e32 v4, s2
5047; GFX6-NEXT:    v_mov_b32_e32 v5, s3
5048; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v4, v2, vcc
5049; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v5, v3, vcc
5050; GFX6-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
5051; GFX6-NEXT:    v_bfrev_b32_e32 v8, 1
5052; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
5053; GFX6-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[4:5]
5054; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
5055; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[4:5]
5056; GFX6-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
5057; GFX6-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
5058; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
5059; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
5060; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
5061; GFX6-NEXT:    v_cndmask_b32_e64 v2, v7, 0, vcc
5062; GFX6-NEXT:    v_xor_b32_e32 v2, v2, v6
5063; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 0, v3
5064; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v3, vcc
5065; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, 0, v3, vcc
5066; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v8, vcc
5067; GFX6-NEXT:    v_and_b32_e32 v2, 1, v2
5068; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
5069; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
5070; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
5071; GFX6-NEXT:    v_cndmask_b32_e32 v2, v4, v9, vcc
5072; GFX6-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
5073; GFX6-NEXT:    ; return to shader part epilog
5074;
5075; GFX8-LABEL: saddsat_i128_sv:
5076; GFX8:       ; %bb.0:
5077; GFX8-NEXT:    v_mov_b32_e32 v4, s1
5078; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
5079; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v4, v1, vcc
5080; GFX8-NEXT:    v_mov_b32_e32 v4, s2
5081; GFX8-NEXT:    v_mov_b32_e32 v5, s3
5082; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v4, v2, vcc
5083; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v5, v3, vcc
5084; GFX8-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
5085; GFX8-NEXT:    v_bfrev_b32_e32 v8, 1
5086; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
5087; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[4:5]
5088; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
5089; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[4:5]
5090; GFX8-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
5091; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
5092; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
5093; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
5094; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
5095; GFX8-NEXT:    v_cndmask_b32_e64 v2, v7, 0, vcc
5096; GFX8-NEXT:    v_xor_b32_e32 v2, v2, v6
5097; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0, v3
5098; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, 0, v3, vcc
5099; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, 0, v3, vcc
5100; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v3, v8, vcc
5101; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
5102; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
5103; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
5104; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
5105; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v9, vcc
5106; GFX8-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
5107; GFX8-NEXT:    ; return to shader part epilog
5108;
5109; GFX9-LABEL: saddsat_i128_sv:
5110; GFX9:       ; %bb.0:
5111; GFX9-NEXT:    v_mov_b32_e32 v4, s1
5112; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
5113; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
5114; GFX9-NEXT:    v_mov_b32_e32 v4, s2
5115; GFX9-NEXT:    v_mov_b32_e32 v5, s3
5116; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v2, vcc
5117; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v3, vcc
5118; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
5119; GFX9-NEXT:    v_bfrev_b32_e32 v8, 1
5120; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
5121; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[4:5]
5122; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
5123; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[4:5]
5124; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
5125; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
5126; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
5127; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
5128; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
5129; GFX9-NEXT:    v_cndmask_b32_e64 v2, v7, 0, vcc
5130; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v6
5131; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, 0, v3
5132; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v3, vcc
5133; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v3, vcc
5134; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v8, vcc
5135; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
5136; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
5137; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
5138; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
5139; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v9, vcc
5140; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
5141; GFX9-NEXT:    ; return to shader part epilog
5142;
5143; GFX10-LABEL: saddsat_i128_sv:
5144; GFX10:       ; %bb.0:
5145; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, s0, v0
5146; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
5147; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, s2, v2, vcc_lo
5148; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, s3, v3, vcc_lo
5149; GFX10-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1]
5150; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc_lo
5151; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[4:5]
5152; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
5153; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
5154; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
5155; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[4:5]
5156; GFX10-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc_lo
5157; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3]
5158; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
5159; GFX10-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc_lo
5160; GFX10-NEXT:    v_xor_b32_e32 v2, v2, v6
5161; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v3, 0
5162; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v3, vcc_lo
5163; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
5164; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v2
5165; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v3, vcc_lo
5166; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0x80000000, v3, vcc_lo
5167; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s0
5168; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s0
5169; GFX10-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s0
5170; GFX10-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s0
5171; GFX10-NEXT:    ; return to shader part epilog
5172  %result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs)
5173  %cast = bitcast i128 %result to <4 x float>
5174  ret <4 x float> %cast
5175}
5176
; saddsat_i128_vs: signed saturating add of two i128 values in an amdgpu_ps
; function where %lhs is a plain argument and %rhs is marked inreg. The result
; of llvm.sadd.sat.i128 is bitcast to <4 x float> and returned.
;
; In the expected output below, %lhs is read from v[0:3] and %rhs from s[0:3].
; The wrapped 128-bit sum is built with a 4-step add/add-with-carry chain, an
; overflow bit is derived from compares on the 64-bit halves, and v_cndmask
; picks between the wrapped sum and a saturation value formed from the sign of
; the top word (arithmetic shift by 31) with 0x80000000 added into the top word.
;
; The check lines are autogenerated (see the header of this file); regenerate
; them with the update script instead of editing them by hand.
5177define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
5178; GFX6-LABEL: saddsat_i128_vs:
5179; GFX6:       ; %bb.0:
5180; GFX6-NEXT:    v_mov_b32_e32 v5, s1
5181; GFX6-NEXT:    v_add_i32_e32 v4, vcc, s0, v0
5182; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v1, v5, vcc
5183; GFX6-NEXT:    v_mov_b32_e32 v6, s2
5184; GFX6-NEXT:    v_mov_b32_e32 v7, s3
5185; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, v2, v6, vcc
5186; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v3, v7, vcc
5187; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
5188; GFX6-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
5189; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5190; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
5191; GFX6-NEXT:    v_bfrev_b32_e32 v8, 1
5192; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5193; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
5194; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5195; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5196; GFX6-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[2:3], 0
5197; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
5198; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
5199; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v7
5200; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 0, v1
5201; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
5202; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
5203; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v1, v8, vcc
5204; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
5205; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5206; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
5207; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
5208; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v9, vcc
5209; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc
5210; GFX6-NEXT:    ; return to shader part epilog
5211;
5212; GFX8-LABEL: saddsat_i128_vs:
5213; GFX8:       ; %bb.0:
5214; GFX8-NEXT:    v_mov_b32_e32 v5, s1
5215; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s0, v0
5216; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v1, v5, vcc
5217; GFX8-NEXT:    v_mov_b32_e32 v6, s2
5218; GFX8-NEXT:    v_mov_b32_e32 v7, s3
5219; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, v2, v6, vcc
5220; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, v3, v7, vcc
5221; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
5222; GFX8-NEXT:    s_cmp_eq_u64 s[2:3], 0
5223; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5224; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
5225; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
5226; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5227; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
5228; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
5229; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5230; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5231; GFX8-NEXT:    s_and_b32 s0, 1, s4
5232; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
5233; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
5234; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
5235; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v7
5236; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0, v1
5237; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
5238; GFX8-NEXT:    v_bfrev_b32_e32 v8, 1
5239; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
5240; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, v1, v8, vcc
5241; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
5242; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5243; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
5244; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
5245; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v9, vcc
5246; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc
5247; GFX8-NEXT:    ; return to shader part epilog
5248;
5249; GFX9-LABEL: saddsat_i128_vs:
5250; GFX9:       ; %bb.0:
5251; GFX9-NEXT:    v_mov_b32_e32 v5, s1
5252; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s0, v0
5253; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v5, vcc
5254; GFX9-NEXT:    v_mov_b32_e32 v6, s2
5255; GFX9-NEXT:    v_mov_b32_e32 v7, s3
5256; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v2, v6, vcc
5257; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v3, v7, vcc
5258; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
5259; GFX9-NEXT:    s_cmp_eq_u64 s[2:3], 0
5260; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5261; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3]
5262; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
5263; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5264; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
5265; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
5266; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5267; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5268; GFX9-NEXT:    s_and_b32 s0, 1, s4
5269; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
5270; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
5271; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
5272; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v7
5273; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 0, v1
5274; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
5275; GFX9-NEXT:    v_bfrev_b32_e32 v8, 1
5276; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v1, vcc
5277; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v1, v8, vcc
5278; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
5279; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5280; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
5281; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
5282; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v9, vcc
5283; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc
5284; GFX9-NEXT:    ; return to shader part epilog
5285;
5286; GFX10-LABEL: saddsat_i128_vs:
5287; GFX10:       ; %bb.0:
5288; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, s0
5289; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
5290; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
5291; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
5292; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1]
5293; GFX10-NEXT:    s_cmp_eq_u64 s[2:3], 0
5294; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[2:3], 0
5295; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
5296; GFX10-NEXT:    s_and_b32 s0, 1, s0
5297; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
5298; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3]
5299; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s1
5300; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
5301; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5302; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
5303; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5304; GFX10-NEXT:    v_cndmask_b32_e64 v1, v8, 0, s0
5305; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
5306; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v7
5307; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
5308; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v1, 0
5309; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
5310; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0, v1, vcc_lo
5311; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v0
5312; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0x80000000, v1, vcc_lo
5313; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, v2, s0
5314; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, v3, s0
5315; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, v8, s0
5316; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
5317; GFX10-NEXT:    ; return to shader part epilog
; IR under test: one saturating-add intrinsic call; the bitcast only reshapes
; the i128 result into the <4 x float> return type.
5318  %result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs)
5319  %cast = bitcast i128 %result to <4 x float>
5320  ret <4 x float> %cast
5321}
5322
; v_saddsat_v2i128: signed saturating add of two <2 x i128> vectors in a
; function with no explicit calling convention and no inreg arguments. The
; result of llvm.sadd.sat.v2i128 is returned directly.
;
; In the expected output below, the vector op is expanded into two independent
; 128-bit saturating adds: element 0 is v[0:3] + v[8:11] with the result in
; v[0:3], and element 1 is v[4:7] + v[12:15] with the result in v[4:7]. Each
; element follows the same shape: a 4-step add/add-with-carry chain, an
; overflow bit derived from compares on the 64-bit halves, then v_cndmask
; between the wrapped sum and the saturation value. For the tahiti, fiji and
; gfx900 runs the two elements are emitted back to back; for the gfx1010 run
; the instructions of the two elements are interleaved.
;
; The check lines are autogenerated (see the header of this file); regenerate
; them with the update script instead of editing them by hand.
5323define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
5324; GFX6-LABEL: v_saddsat_v2i128:
5325; GFX6:       ; %bb.0:
5326; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5327; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v0, v8
5328; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v1, v9, vcc
5329; GFX6-NEXT:    v_addc_u32_e32 v16, vcc, v2, v10, vcc
5330; GFX6-NEXT:    v_addc_u32_e32 v17, vcc, v3, v11, vcc
5331; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[0:1]
5332; GFX6-NEXT:    v_bfrev_b32_e32 v18, 1
5333; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5334; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[16:17], v[2:3]
5335; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5336; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[16:17], v[2:3]
5337; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5338; GFX6-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[10:11]
5339; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5340; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
5341; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
5342; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
5343; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v17
5344; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 0, v1
5345; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
5346; GFX6-NEXT:    v_addc_u32_e32 v10, vcc, 0, v1, vcc
5347; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, v1, v18, vcc
5348; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
5349; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5350; GFX6-NEXT:    v_cndmask_b32_e32 v0, v8, v2, vcc
5351; GFX6-NEXT:    v_cndmask_b32_e32 v1, v9, v3, vcc
5352; GFX6-NEXT:    v_cndmask_b32_e32 v2, v16, v10, vcc
5353; GFX6-NEXT:    v_cndmask_b32_e32 v3, v17, v11, vcc
5354; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v4, v12
5355; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v5, v13, vcc
5356; GFX6-NEXT:    v_addc_u32_e32 v10, vcc, v6, v14, vcc
5357; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, v7, v15, vcc
5358; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
5359; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
5360; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
5361; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
5362; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
5363; GFX6-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
5364; GFX6-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[14:15]
5365; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
5366; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
5367; GFX6-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
5368; GFX6-NEXT:    v_xor_b32_e32 v4, v5, v4
5369; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 31, v11
5370; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 0, v5
5371; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v5, vcc
5372; GFX6-NEXT:    v_addc_u32_e32 v12, vcc, 0, v5, vcc
5373; GFX6-NEXT:    v_addc_u32_e32 v13, vcc, v5, v18, vcc
5374; GFX6-NEXT:    v_and_b32_e32 v4, 1, v4
5375; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
5376; GFX6-NEXT:    v_cndmask_b32_e32 v4, v8, v6, vcc
5377; GFX6-NEXT:    v_cndmask_b32_e32 v5, v9, v7, vcc
5378; GFX6-NEXT:    v_cndmask_b32_e32 v6, v10, v12, vcc
5379; GFX6-NEXT:    v_cndmask_b32_e32 v7, v11, v13, vcc
5380; GFX6-NEXT:    s_setpc_b64 s[30:31]
5381;
5382; GFX8-LABEL: v_saddsat_v2i128:
5383; GFX8:       ; %bb.0:
5384; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5385; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v0, v8
5386; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, v1, v9, vcc
5387; GFX8-NEXT:    v_addc_u32_e32 v16, vcc, v2, v10, vcc
5388; GFX8-NEXT:    v_addc_u32_e32 v17, vcc, v3, v11, vcc
5389; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[0:1]
5390; GFX8-NEXT:    v_bfrev_b32_e32 v18, 1
5391; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5392; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[16:17], v[2:3]
5393; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5394; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[16:17], v[2:3]
5395; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5396; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[10:11]
5397; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5398; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
5399; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
5400; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
5401; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v17
5402; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0, v1
5403; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
5404; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, 0, v1, vcc
5405; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, v1, v18, vcc
5406; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
5407; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5408; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v2, vcc
5409; GFX8-NEXT:    v_cndmask_b32_e32 v1, v9, v3, vcc
5410; GFX8-NEXT:    v_cndmask_b32_e32 v2, v16, v10, vcc
5411; GFX8-NEXT:    v_cndmask_b32_e32 v3, v17, v11, vcc
5412; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v4, v12
5413; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, v5, v13, vcc
5414; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, v6, v14, vcc
5415; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, v7, v15, vcc
5416; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
5417; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
5418; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
5419; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
5420; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
5421; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
5422; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[14:15]
5423; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
5424; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
5425; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
5426; GFX8-NEXT:    v_xor_b32_e32 v4, v5, v4
5427; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v11
5428; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0, v5
5429; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, 0, v5, vcc
5430; GFX8-NEXT:    v_addc_u32_e32 v12, vcc, 0, v5, vcc
5431; GFX8-NEXT:    v_addc_u32_e32 v13, vcc, v5, v18, vcc
5432; GFX8-NEXT:    v_and_b32_e32 v4, 1, v4
5433; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
5434; GFX8-NEXT:    v_cndmask_b32_e32 v4, v8, v6, vcc
5435; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v7, vcc
5436; GFX8-NEXT:    v_cndmask_b32_e32 v6, v10, v12, vcc
5437; GFX8-NEXT:    v_cndmask_b32_e32 v7, v11, v13, vcc
5438; GFX8-NEXT:    s_setpc_b64 s[30:31]
5439;
5440; GFX9-LABEL: v_saddsat_v2i128:
5441; GFX9:       ; %bb.0:
5442; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5443; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v0, v8
5444; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v1, v9, vcc
5445; GFX9-NEXT:    v_addc_co_u32_e32 v16, vcc, v2, v10, vcc
5446; GFX9-NEXT:    v_addc_co_u32_e32 v17, vcc, v3, v11, vcc
5447; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[0:1]
5448; GFX9-NEXT:    v_bfrev_b32_e32 v18, 1
5449; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5450; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[16:17], v[2:3]
5451; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5452; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[16:17], v[2:3]
5453; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
5454; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[10:11]
5455; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
5456; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
5457; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
5458; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
5459; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v17
5460; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 0, v1
5461; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
5462; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, 0, v1, vcc
5463; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v1, v18, vcc
5464; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
5465; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5466; GFX9-NEXT:    v_cndmask_b32_e32 v0, v8, v2, vcc
5467; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v3, vcc
5468; GFX9-NEXT:    v_cndmask_b32_e32 v2, v16, v10, vcc
5469; GFX9-NEXT:    v_cndmask_b32_e32 v3, v17, v11, vcc
5470; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v4, v12
5471; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v5, v13, vcc
5472; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v6, v14, vcc
5473; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v7, v15, vcc
5474; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5]
5475; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
5476; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7]
5477; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
5478; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7]
5479; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
5480; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[14:15]
5481; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
5482; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
5483; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
5484; GFX9-NEXT:    v_xor_b32_e32 v4, v5, v4
5485; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v11
5486; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, 0, v5
5487; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v5, vcc
5488; GFX9-NEXT:    v_addc_co_u32_e32 v12, vcc, 0, v5, vcc
5489; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, v5, v18, vcc
5490; GFX9-NEXT:    v_and_b32_e32 v4, 1, v4
5491; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
5492; GFX9-NEXT:    v_cndmask_b32_e32 v4, v8, v6, vcc
5493; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v7, vcc
5494; GFX9-NEXT:    v_cndmask_b32_e32 v6, v10, v12, vcc
5495; GFX9-NEXT:    v_cndmask_b32_e32 v7, v11, v13, vcc
5496; GFX9-NEXT:    s_setpc_b64 s[30:31]
5497;
5498; GFX10-LABEL: v_saddsat_v2i128:
5499; GFX10:       ; %bb.0:
5500; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5501; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
5502; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v0, v8
5503; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v1, v9, vcc_lo
5504; GFX10-NEXT:    v_add_co_ci_u32_e32 v16, vcc_lo, v2, v10, vcc_lo
5505; GFX10-NEXT:    v_add_co_ci_u32_e32 v17, vcc_lo, v3, v11, vcc_lo
5506; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[0:1]
5507; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
5508; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[16:17], v[2:3]
5509; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
5510; GFX10-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[10:11]
5511; GFX10-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc_lo
5512; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[16:17], v[2:3]
5513; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5514; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11]
5515; GFX10-NEXT:    v_cndmask_b32_e64 v1, v18, 0, vcc_lo
5516; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, v4, v12
5517; GFX10-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v5, v13, vcc_lo
5518; GFX10-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, v6, v14, vcc_lo
5519; GFX10-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, v7, v15, vcc_lo
5520; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, v[10:11], v[4:5]
5521; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
5522; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v17
5523; GFX10-NEXT:    v_cmp_eq_u64_e64 s5, v[12:13], v[6:7]
5524; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s4
5525; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, v[12:13], v[6:7]
5526; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
5527; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v1, 0
5528; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
5529; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s4
5530; GFX10-NEXT:    v_cmp_gt_i64_e64 s4, 0, v[14:15]
5531; GFX10-NEXT:    v_ashrrev_i32_e32 v7, 31, v13
5532; GFX10-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s4
5533; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, v0
5534; GFX10-NEXT:    v_cndmask_b32_e64 v0, v5, v4, s5
5535; GFX10-NEXT:    v_cmp_eq_u64_e64 s5, 0, v[14:15]
5536; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
5537; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, 0x80000000, v1, vcc_lo
5538; GFX10-NEXT:    v_cndmask_b32_e64 v1, v9, v3, s4
5539; GFX10-NEXT:    v_cndmask_b32_e64 v4, v18, 0, s5
5540; GFX10-NEXT:    v_xor_b32_e32 v4, v4, v0
5541; GFX10-NEXT:    v_cndmask_b32_e64 v0, v8, v2, s4
5542; GFX10-NEXT:    v_cndmask_b32_e64 v2, v16, v5, s4
5543; GFX10-NEXT:    v_and_b32_e32 v3, 1, v4
5544; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v7, 0
5545; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v7, vcc_lo
5546; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0, v7, vcc_lo
5547; GFX10-NEXT:    v_cmp_ne_u32_e64 s5, 0, v3
5548; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0x80000000, v7, vcc_lo
5549; GFX10-NEXT:    v_cndmask_b32_e64 v3, v17, v6, s4
5550; GFX10-NEXT:    v_cndmask_b32_e64 v4, v10, v4, s5
5551; GFX10-NEXT:    v_cndmask_b32_e64 v5, v11, v5, s5
5552; GFX10-NEXT:    v_cndmask_b32_e64 v6, v12, v8, s5
5553; GFX10-NEXT:    v_cndmask_b32_e64 v7, v13, v7, s5
5554; GFX10-NEXT:    s_setpc_b64 s[30:31]
; IR under test: a single vector saturating-add intrinsic call whose result is
; returned unchanged.
5555  %result = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
5556  ret <2 x i128> %result
5557}
5558
5559define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs) {
5560; GFX6-LABEL: s_saddsat_v2i128:
5561; GFX6:       ; %bb.0:
5562; GFX6-NEXT:    s_add_u32 s8, s0, s8
5563; GFX6-NEXT:    s_cselect_b32 s16, 1, 0
5564; GFX6-NEXT:    s_and_b32 s16, s16, 1
5565; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
5566; GFX6-NEXT:    s_addc_u32 s9, s1, s9
5567; GFX6-NEXT:    s_cselect_b32 s16, 1, 0
5568; GFX6-NEXT:    s_and_b32 s16, s16, 1
5569; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
5570; GFX6-NEXT:    s_addc_u32 s16, s2, s10
5571; GFX6-NEXT:    s_cselect_b32 s17, 1, 0
5572; GFX6-NEXT:    v_mov_b32_e32 v3, s1
5573; GFX6-NEXT:    s_and_b32 s17, s17, 1
5574; GFX6-NEXT:    v_mov_b32_e32 v2, s0
5575; GFX6-NEXT:    s_cmp_lg_u32 s17, 0
5576; GFX6-NEXT:    v_mov_b32_e32 v0, s2
5577; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3]
5578; GFX6-NEXT:    s_addc_u32 s17, s3, s11
5579; GFX6-NEXT:    v_mov_b32_e32 v1, s3
5580; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
5581; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[16:17], v[0:1]
5582; GFX6-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[10:11], 0
5583; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
5584; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
5585; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5586; GFX6-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[10:11], 0
5587; GFX6-NEXT:    s_ashr_i32 s3, s17, 31
5588; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
5589; GFX6-NEXT:    s_add_u32 s0, s3, 0
5590; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
5591; GFX6-NEXT:    s_and_b32 s1, s1, 1
5592; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
5593; GFX6-NEXT:    s_addc_u32 s1, s3, 0
5594; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
5595; GFX6-NEXT:    s_and_b32 s2, s2, 1
5596; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
5597; GFX6-NEXT:    s_addc_u32 s2, s3, 0
5598; GFX6-NEXT:    s_cselect_b32 s11, 1, 0
5599; GFX6-NEXT:    s_and_b32 s11, s11, 1
5600; GFX6-NEXT:    s_brev_b32 s10, 1
5601; GFX6-NEXT:    s_cmp_lg_u32 s11, 0
5602; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
5603; GFX6-NEXT:    s_addc_u32 s3, s3, s10
5604; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
5605; GFX6-NEXT:    v_mov_b32_e32 v1, s0
5606; GFX6-NEXT:    s_add_u32 s0, s4, s12
5607; GFX6-NEXT:    v_mov_b32_e32 v2, s1
5608; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
5609; GFX6-NEXT:    s_and_b32 s1, s1, 1
5610; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
5611; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
5612; GFX6-NEXT:    s_addc_u32 s1, s5, s13
5613; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5614; GFX6-NEXT:    v_mov_b32_e32 v0, s2
5615; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
5616; GFX6-NEXT:    s_and_b32 s2, s2, 1
5617; GFX6-NEXT:    v_mov_b32_e32 v3, s8
5618; GFX6-NEXT:    v_mov_b32_e32 v4, s9
5619; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
5620; GFX6-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
5621; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
5622; GFX6-NEXT:    v_mov_b32_e32 v1, s3
5623; GFX6-NEXT:    v_mov_b32_e32 v2, s16
5624; GFX6-NEXT:    v_mov_b32_e32 v3, s17
5625; GFX6-NEXT:    s_addc_u32 s2, s6, s14
5626; GFX6-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc
5627; GFX6-NEXT:    v_cndmask_b32_e32 v7, v3, v1, vcc
5628; GFX6-NEXT:    s_cselect_b32 s3, 1, 0
5629; GFX6-NEXT:    v_mov_b32_e32 v2, s4
5630; GFX6-NEXT:    s_and_b32 s3, s3, 1
5631; GFX6-NEXT:    v_mov_b32_e32 v3, s5
5632; GFX6-NEXT:    s_cmp_lg_u32 s3, 0
5633; GFX6-NEXT:    v_mov_b32_e32 v0, s6
5634; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
5635; GFX6-NEXT:    s_addc_u32 s3, s7, s15
5636; GFX6-NEXT:    v_mov_b32_e32 v1, s7
5637; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
5638; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
5639; GFX6-NEXT:    v_cmp_lt_i64_e64 s[4:5], s[14:15], 0
5640; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
5641; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1]
5642; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
5643; GFX6-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[14:15], 0
5644; GFX6-NEXT:    s_ashr_i32 s7, s3, 31
5645; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[4:5]
5646; GFX6-NEXT:    s_add_u32 s4, s7, 0
5647; GFX6-NEXT:    s_cselect_b32 s5, 1, 0
5648; GFX6-NEXT:    s_and_b32 s5, s5, 1
5649; GFX6-NEXT:    s_cmp_lg_u32 s5, 0
5650; GFX6-NEXT:    s_addc_u32 s5, s7, 0
5651; GFX6-NEXT:    s_cselect_b32 s6, 1, 0
5652; GFX6-NEXT:    s_and_b32 s6, s6, 1
5653; GFX6-NEXT:    s_cmp_lg_u32 s6, 0
5654; GFX6-NEXT:    s_addc_u32 s6, s7, 0
5655; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
5656; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
5657; GFX6-NEXT:    v_xor_b32_e32 v0, v1, v0
5658; GFX6-NEXT:    s_and_b32 s8, s8, 1
5659; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
5660; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
5661; GFX6-NEXT:    s_addc_u32 s7, s7, s10
5662; GFX6-NEXT:    v_mov_b32_e32 v1, s4
5663; GFX6-NEXT:    v_mov_b32_e32 v2, s5
5664; GFX6-NEXT:    v_mov_b32_e32 v3, s0
5665; GFX6-NEXT:    v_mov_b32_e32 v8, s1
5666; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5667; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
5668; GFX6-NEXT:    v_cndmask_b32_e32 v1, v8, v2, vcc
5669; GFX6-NEXT:    v_mov_b32_e32 v2, s6
5670; GFX6-NEXT:    v_mov_b32_e32 v3, s7
5671; GFX6-NEXT:    v_mov_b32_e32 v8, s2
5672; GFX6-NEXT:    v_mov_b32_e32 v9, s3
5673; GFX6-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
5674; GFX6-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
5675; GFX6-NEXT:    v_readfirstlane_b32 s0, v5
5676; GFX6-NEXT:    v_readfirstlane_b32 s1, v4
5677; GFX6-NEXT:    v_readfirstlane_b32 s2, v6
5678; GFX6-NEXT:    v_readfirstlane_b32 s3, v7
5679; GFX6-NEXT:    v_readfirstlane_b32 s4, v0
5680; GFX6-NEXT:    v_readfirstlane_b32 s5, v1
5681; GFX6-NEXT:    v_readfirstlane_b32 s6, v2
5682; GFX6-NEXT:    v_readfirstlane_b32 s7, v3
5683; GFX6-NEXT:    ; return to shader part epilog
5684;
5685; GFX8-LABEL: s_saddsat_v2i128:
5686; GFX8:       ; %bb.0:
5687; GFX8-NEXT:    s_add_u32 s8, s0, s8
5688; GFX8-NEXT:    s_cselect_b32 s16, 1, 0
5689; GFX8-NEXT:    s_and_b32 s16, s16, 1
5690; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
5691; GFX8-NEXT:    s_addc_u32 s9, s1, s9
5692; GFX8-NEXT:    s_cselect_b32 s16, 1, 0
5693; GFX8-NEXT:    s_and_b32 s16, s16, 1
5694; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
5695; GFX8-NEXT:    s_addc_u32 s16, s2, s10
5696; GFX8-NEXT:    s_cselect_b32 s17, 1, 0
5697; GFX8-NEXT:    s_and_b32 s17, s17, 1
5698; GFX8-NEXT:    v_mov_b32_e32 v3, s1
5699; GFX8-NEXT:    s_cmp_lg_u32 s17, 0
5700; GFX8-NEXT:    v_mov_b32_e32 v2, s0
5701; GFX8-NEXT:    s_addc_u32 s17, s3, s11
5702; GFX8-NEXT:    v_mov_b32_e32 v0, s2
5703; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3]
5704; GFX8-NEXT:    v_mov_b32_e32 v1, s3
5705; GFX8-NEXT:    s_cmp_eq_u64 s[16:17], s[2:3]
5706; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
5707; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
5708; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[16:17], v[0:1]
5709; GFX8-NEXT:    s_and_b32 s0, 1, s2
5710; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5711; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
5712; GFX8-NEXT:    s_cmp_eq_u64 s[10:11], 0
5713; GFX8-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[10:11], 0
5714; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
5715; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5716; GFX8-NEXT:    s_and_b32 s0, 1, s2
5717; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
5718; GFX8-NEXT:    s_ashr_i32 s3, s17, 31
5719; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
5720; GFX8-NEXT:    s_add_u32 s0, s3, 0
5721; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
5722; GFX8-NEXT:    s_and_b32 s1, s1, 1
5723; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
5724; GFX8-NEXT:    s_addc_u32 s1, s3, 0
5725; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
5726; GFX8-NEXT:    s_and_b32 s2, s2, 1
5727; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
5728; GFX8-NEXT:    s_addc_u32 s2, s3, 0
5729; GFX8-NEXT:    s_cselect_b32 s11, 1, 0
5730; GFX8-NEXT:    s_and_b32 s11, s11, 1
5731; GFX8-NEXT:    s_brev_b32 s10, 1
5732; GFX8-NEXT:    s_cmp_lg_u32 s11, 0
5733; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5734; GFX8-NEXT:    s_addc_u32 s3, s3, s10
5735; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
5736; GFX8-NEXT:    v_mov_b32_e32 v1, s0
5737; GFX8-NEXT:    s_add_u32 s0, s4, s12
5738; GFX8-NEXT:    v_mov_b32_e32 v2, s1
5739; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
5740; GFX8-NEXT:    s_and_b32 s1, s1, 1
5741; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
5742; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
5743; GFX8-NEXT:    s_addc_u32 s1, s5, s13
5744; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5745; GFX8-NEXT:    v_mov_b32_e32 v0, s2
5746; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
5747; GFX8-NEXT:    s_and_b32 s2, s2, 1
5748; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
5749; GFX8-NEXT:    v_mov_b32_e32 v3, s8
5750; GFX8-NEXT:    v_mov_b32_e32 v4, s9
5751; GFX8-NEXT:    s_addc_u32 s2, s6, s14
5752; GFX8-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
5753; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
5754; GFX8-NEXT:    v_mov_b32_e32 v1, s3
5755; GFX8-NEXT:    v_mov_b32_e32 v2, s16
5756; GFX8-NEXT:    v_mov_b32_e32 v3, s17
5757; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
5758; GFX8-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc
5759; GFX8-NEXT:    v_cndmask_b32_e32 v7, v3, v1, vcc
5760; GFX8-NEXT:    s_and_b32 s3, s3, 1
5761; GFX8-NEXT:    v_mov_b32_e32 v2, s4
5762; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
5763; GFX8-NEXT:    v_mov_b32_e32 v3, s5
5764; GFX8-NEXT:    s_addc_u32 s3, s7, s15
5765; GFX8-NEXT:    v_mov_b32_e32 v0, s6
5766; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
5767; GFX8-NEXT:    v_mov_b32_e32 v1, s7
5768; GFX8-NEXT:    s_cmp_eq_u64 s[2:3], s[6:7]
5769; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
5770; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
5771; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
5772; GFX8-NEXT:    s_and_b32 s4, 1, s6
5773; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5774; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
5775; GFX8-NEXT:    s_cmp_eq_u64 s[14:15], 0
5776; GFX8-NEXT:    v_cmp_lt_i64_e64 s[4:5], s[14:15], 0
5777; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
5778; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
5779; GFX8-NEXT:    s_and_b32 s4, 1, s6
5780; GFX8-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, s4
5781; GFX8-NEXT:    s_ashr_i32 s7, s3, 31
5782; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[4:5]
5783; GFX8-NEXT:    s_add_u32 s4, s7, 0
5784; GFX8-NEXT:    s_cselect_b32 s5, 1, 0
5785; GFX8-NEXT:    s_and_b32 s5, s5, 1
5786; GFX8-NEXT:    s_cmp_lg_u32 s5, 0
5787; GFX8-NEXT:    s_addc_u32 s5, s7, 0
5788; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
5789; GFX8-NEXT:    s_and_b32 s6, s6, 1
5790; GFX8-NEXT:    s_cmp_lg_u32 s6, 0
5791; GFX8-NEXT:    s_addc_u32 s6, s7, 0
5792; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5793; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
5794; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v0
5795; GFX8-NEXT:    s_and_b32 s8, s8, 1
5796; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
5797; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
5798; GFX8-NEXT:    s_addc_u32 s7, s7, s10
5799; GFX8-NEXT:    v_mov_b32_e32 v1, s4
5800; GFX8-NEXT:    v_mov_b32_e32 v2, s5
5801; GFX8-NEXT:    v_mov_b32_e32 v3, s0
5802; GFX8-NEXT:    v_mov_b32_e32 v8, s1
5803; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5804; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
5805; GFX8-NEXT:    v_cndmask_b32_e32 v1, v8, v2, vcc
5806; GFX8-NEXT:    v_mov_b32_e32 v2, s6
5807; GFX8-NEXT:    v_mov_b32_e32 v3, s7
5808; GFX8-NEXT:    v_mov_b32_e32 v8, s2
5809; GFX8-NEXT:    v_mov_b32_e32 v9, s3
5810; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
5811; GFX8-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
5812; GFX8-NEXT:    v_readfirstlane_b32 s0, v5
5813; GFX8-NEXT:    v_readfirstlane_b32 s1, v4
5814; GFX8-NEXT:    v_readfirstlane_b32 s2, v6
5815; GFX8-NEXT:    v_readfirstlane_b32 s3, v7
5816; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
5817; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
5818; GFX8-NEXT:    v_readfirstlane_b32 s6, v2
5819; GFX8-NEXT:    v_readfirstlane_b32 s7, v3
5820; GFX8-NEXT:    ; return to shader part epilog
5821;
5822; GFX9-LABEL: s_saddsat_v2i128:
5823; GFX9:       ; %bb.0:
5824; GFX9-NEXT:    s_add_u32 s8, s0, s8
5825; GFX9-NEXT:    s_cselect_b32 s16, 1, 0
5826; GFX9-NEXT:    s_and_b32 s16, s16, 1
5827; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
5828; GFX9-NEXT:    s_addc_u32 s9, s1, s9
5829; GFX9-NEXT:    s_cselect_b32 s16, 1, 0
5830; GFX9-NEXT:    s_and_b32 s16, s16, 1
5831; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
5832; GFX9-NEXT:    s_addc_u32 s16, s2, s10
5833; GFX9-NEXT:    s_cselect_b32 s17, 1, 0
5834; GFX9-NEXT:    s_and_b32 s17, s17, 1
5835; GFX9-NEXT:    v_mov_b32_e32 v3, s1
5836; GFX9-NEXT:    s_cmp_lg_u32 s17, 0
5837; GFX9-NEXT:    v_mov_b32_e32 v2, s0
5838; GFX9-NEXT:    s_addc_u32 s17, s3, s11
5839; GFX9-NEXT:    v_mov_b32_e32 v0, s2
5840; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3]
5841; GFX9-NEXT:    v_mov_b32_e32 v1, s3
5842; GFX9-NEXT:    s_cmp_eq_u64 s[16:17], s[2:3]
5843; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
5844; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
5845; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[16:17], v[0:1]
5846; GFX9-NEXT:    s_and_b32 s0, 1, s2
5847; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5848; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
5849; GFX9-NEXT:    s_cmp_eq_u64 s[10:11], 0
5850; GFX9-NEXT:    v_cmp_lt_i64_e64 s[0:1], s[10:11], 0
5851; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
5852; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
5853; GFX9-NEXT:    s_and_b32 s0, 1, s2
5854; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
5855; GFX9-NEXT:    s_ashr_i32 s3, s17, 31
5856; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[0:1]
5857; GFX9-NEXT:    s_add_u32 s0, s3, 0
5858; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
5859; GFX9-NEXT:    s_and_b32 s1, s1, 1
5860; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
5861; GFX9-NEXT:    s_addc_u32 s1, s3, 0
5862; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
5863; GFX9-NEXT:    s_and_b32 s2, s2, 1
5864; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
5865; GFX9-NEXT:    s_addc_u32 s2, s3, 0
5866; GFX9-NEXT:    s_cselect_b32 s11, 1, 0
5867; GFX9-NEXT:    s_and_b32 s11, s11, 1
5868; GFX9-NEXT:    s_brev_b32 s10, 1
5869; GFX9-NEXT:    s_cmp_lg_u32 s11, 0
5870; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5871; GFX9-NEXT:    s_addc_u32 s3, s3, s10
5872; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
5873; GFX9-NEXT:    v_mov_b32_e32 v1, s0
5874; GFX9-NEXT:    s_add_u32 s0, s4, s12
5875; GFX9-NEXT:    v_mov_b32_e32 v2, s1
5876; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
5877; GFX9-NEXT:    s_and_b32 s1, s1, 1
5878; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
5879; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
5880; GFX9-NEXT:    s_addc_u32 s1, s5, s13
5881; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5882; GFX9-NEXT:    v_mov_b32_e32 v0, s2
5883; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
5884; GFX9-NEXT:    s_and_b32 s2, s2, 1
5885; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
5886; GFX9-NEXT:    v_mov_b32_e32 v3, s8
5887; GFX9-NEXT:    v_mov_b32_e32 v4, s9
5888; GFX9-NEXT:    s_addc_u32 s2, s6, s14
5889; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v1, vcc
5890; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
5891; GFX9-NEXT:    v_mov_b32_e32 v1, s3
5892; GFX9-NEXT:    v_mov_b32_e32 v2, s16
5893; GFX9-NEXT:    v_mov_b32_e32 v3, s17
5894; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
5895; GFX9-NEXT:    v_cndmask_b32_e32 v6, v2, v0, vcc
5896; GFX9-NEXT:    v_cndmask_b32_e32 v7, v3, v1, vcc
5897; GFX9-NEXT:    s_and_b32 s3, s3, 1
5898; GFX9-NEXT:    v_mov_b32_e32 v2, s4
5899; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
5900; GFX9-NEXT:    v_mov_b32_e32 v3, s5
5901; GFX9-NEXT:    s_addc_u32 s3, s7, s15
5902; GFX9-NEXT:    v_mov_b32_e32 v0, s6
5903; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
5904; GFX9-NEXT:    v_mov_b32_e32 v1, s7
5905; GFX9-NEXT:    s_cmp_eq_u64 s[2:3], s[6:7]
5906; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
5907; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
5908; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
5909; GFX9-NEXT:    s_and_b32 s4, 1, s6
5910; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
5911; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
5912; GFX9-NEXT:    s_cmp_eq_u64 s[14:15], 0
5913; GFX9-NEXT:    v_cmp_lt_i64_e64 s[4:5], s[14:15], 0
5914; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
5915; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
5916; GFX9-NEXT:    s_and_b32 s4, 1, s6
5917; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, s4
5918; GFX9-NEXT:    s_ashr_i32 s7, s3, 31
5919; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s[4:5]
5920; GFX9-NEXT:    s_add_u32 s4, s7, 0
5921; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
5922; GFX9-NEXT:    s_and_b32 s5, s5, 1
5923; GFX9-NEXT:    s_cmp_lg_u32 s5, 0
5924; GFX9-NEXT:    s_addc_u32 s5, s7, 0
5925; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
5926; GFX9-NEXT:    s_and_b32 s6, s6, 1
5927; GFX9-NEXT:    s_cmp_lg_u32 s6, 0
5928; GFX9-NEXT:    s_addc_u32 s6, s7, 0
5929; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5930; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
5931; GFX9-NEXT:    v_xor_b32_e32 v0, v1, v0
5932; GFX9-NEXT:    s_and_b32 s8, s8, 1
5933; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
5934; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
5935; GFX9-NEXT:    s_addc_u32 s7, s7, s10
5936; GFX9-NEXT:    v_mov_b32_e32 v1, s4
5937; GFX9-NEXT:    v_mov_b32_e32 v2, s5
5938; GFX9-NEXT:    v_mov_b32_e32 v3, s0
5939; GFX9-NEXT:    v_mov_b32_e32 v8, s1
5940; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
5941; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
5942; GFX9-NEXT:    v_cndmask_b32_e32 v1, v8, v2, vcc
5943; GFX9-NEXT:    v_mov_b32_e32 v2, s6
5944; GFX9-NEXT:    v_mov_b32_e32 v3, s7
5945; GFX9-NEXT:    v_mov_b32_e32 v8, s2
5946; GFX9-NEXT:    v_mov_b32_e32 v9, s3
5947; GFX9-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
5948; GFX9-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
5949; GFX9-NEXT:    v_readfirstlane_b32 s0, v5
5950; GFX9-NEXT:    v_readfirstlane_b32 s1, v4
5951; GFX9-NEXT:    v_readfirstlane_b32 s2, v6
5952; GFX9-NEXT:    v_readfirstlane_b32 s3, v7
5953; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
5954; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
5955; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
5956; GFX9-NEXT:    v_readfirstlane_b32 s7, v3
5957; GFX9-NEXT:    ; return to shader part epilog
5958;
5959; GFX10-LABEL: s_saddsat_v2i128:
5960; GFX10:       ; %bb.0:
5961; GFX10-NEXT:    s_add_u32 s8, s0, s8
5962; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
5963; GFX10-NEXT:    s_and_b32 s16, s16, 1
5964; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
5965; GFX10-NEXT:    s_addc_u32 s9, s1, s9
5966; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
5967; GFX10-NEXT:    v_cmp_lt_u64_e64 s0, s[8:9], s[0:1]
5968; GFX10-NEXT:    s_and_b32 s16, s16, 1
5969; GFX10-NEXT:    v_cmp_lt_i64_e64 s1, s[10:11], 0
5970; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
5971; GFX10-NEXT:    v_mov_b32_e32 v2, s9
5972; GFX10-NEXT:    s_addc_u32 s16, s2, s10
5973; GFX10-NEXT:    s_cselect_b32 s17, 1, 0
5974; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
5975; GFX10-NEXT:    s_and_b32 s17, s17, 1
5976; GFX10-NEXT:    s_cmp_lg_u32 s17, 0
5977; GFX10-NEXT:    s_addc_u32 s17, s3, s11
5978; GFX10-NEXT:    v_cmp_lt_i64_e64 s0, s[16:17], s[2:3]
5979; GFX10-NEXT:    s_cmp_eq_u64 s[16:17], s[2:3]
5980; GFX10-NEXT:    v_mov_b32_e32 v3, s17
5981; GFX10-NEXT:    s_cselect_b32 s18, 1, 0
5982; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
5983; GFX10-NEXT:    s_and_b32 s0, 1, s18
5984; GFX10-NEXT:    s_cmp_eq_u64 s[10:11], 0
5985; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
5986; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
5987; GFX10-NEXT:    s_ashr_i32 s3, s17, 31
5988; GFX10-NEXT:    s_and_b32 s0, 1, s0
5989; GFX10-NEXT:    s_brev_b32 s10, 1
5990; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
5991; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
5992; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
5993; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, 0, s0
5994; GFX10-NEXT:    s_add_u32 s0, s3, 0
5995; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
5996; GFX10-NEXT:    s_and_b32 s1, s1, 1
5997; GFX10-NEXT:    v_xor_b32_e32 v0, v1, v0
5998; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
5999; GFX10-NEXT:    v_mov_b32_e32 v1, s8
6000; GFX10-NEXT:    s_addc_u32 s1, s3, 0
6001; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
6002; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
6003; GFX10-NEXT:    s_and_b32 s2, s2, 1
6004; GFX10-NEXT:    s_cmp_lg_u32 s2, 0
6005; GFX10-NEXT:    s_addc_u32 s2, s3, 0
6006; GFX10-NEXT:    s_cselect_b32 s11, 1, 0
6007; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
6008; GFX10-NEXT:    s_and_b32 s11, s11, 1
6009; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
6010; GFX10-NEXT:    s_addc_u32 s3, s3, s10
6011; GFX10-NEXT:    v_cndmask_b32_e64 v0, v1, s0, vcc_lo
6012; GFX10-NEXT:    s_add_u32 s0, s4, s12
6013; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
6014; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, s1, vcc_lo
6015; GFX10-NEXT:    s_and_b32 s8, s8, 1
6016; GFX10-NEXT:    v_mov_b32_e32 v2, s16
6017; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
6018; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, s3, vcc_lo
6019; GFX10-NEXT:    s_addc_u32 s1, s5, s13
6020; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
6021; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
6022; GFX10-NEXT:    s_and_b32 s8, s8, 1
6023; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s2, vcc_lo
6024; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
6025; GFX10-NEXT:    v_cmp_lt_i64_e64 s3, s[14:15], 0
6026; GFX10-NEXT:    s_addc_u32 s8, s6, s14
6027; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
6028; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s4
6029; GFX10-NEXT:    s_and_b32 s9, s9, 1
6030; GFX10-NEXT:    v_mov_b32_e32 v6, s1
6031; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
6032; GFX10-NEXT:    v_mov_b32_e32 v7, s8
6033; GFX10-NEXT:    s_addc_u32 s9, s7, s15
6034; GFX10-NEXT:    s_cmp_eq_u64 s[8:9], s[6:7]
6035; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, s[8:9], s[6:7]
6036; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
6037; GFX10-NEXT:    v_mov_b32_e32 v8, s9
6038; GFX10-NEXT:    s_and_b32 s2, 1, s2
6039; GFX10-NEXT:    s_cmp_eq_u64 s[14:15], 0
6040; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
6041; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s4
6042; GFX10-NEXT:    s_cselect_b32 s2, 1, 0
6043; GFX10-NEXT:    s_ashr_i32 s5, s9, 31
6044; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc_lo
6045; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s3
6046; GFX10-NEXT:    s_and_b32 s3, 1, s2
6047; GFX10-NEXT:    s_add_u32 s2, s5, 0
6048; GFX10-NEXT:    v_cmp_ne_u32_e64 s3, 0, s3
6049; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
6050; GFX10-NEXT:    s_and_b32 s4, s4, 1
6051; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, 0, s3
6052; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
6053; GFX10-NEXT:    s_addc_u32 s3, s5, 0
6054; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
6055; GFX10-NEXT:    v_xor_b32_e32 v4, v5, v4
6056; GFX10-NEXT:    s_and_b32 s4, s4, 1
6057; GFX10-NEXT:    v_mov_b32_e32 v5, s0
6058; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
6059; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
6060; GFX10-NEXT:    v_and_b32_e32 v4, 1, v4
6061; GFX10-NEXT:    s_addc_u32 s4, s5, 0
6062; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
6063; GFX10-NEXT:    s_and_b32 s6, s6, 1
6064; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
6065; GFX10-NEXT:    s_cmp_lg_u32 s6, 0
6066; GFX10-NEXT:    s_addc_u32 s1, s5, s10
6067; GFX10-NEXT:    v_cndmask_b32_e64 v4, v5, s2, vcc_lo
6068; GFX10-NEXT:    v_cndmask_b32_e64 v5, v6, s3, vcc_lo
6069; GFX10-NEXT:    v_cndmask_b32_e64 v6, v7, s4, vcc_lo
6070; GFX10-NEXT:    v_cndmask_b32_e64 v7, v8, s1, vcc_lo
6071; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
6072; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
6073; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
6074; GFX10-NEXT:    v_readfirstlane_b32 s4, v4
6075; GFX10-NEXT:    v_readfirstlane_b32 s5, v5
6076; GFX10-NEXT:    v_readfirstlane_b32 s6, v6
6077; GFX10-NEXT:    v_readfirstlane_b32 s7, v7
6078; GFX10-NEXT:    ; return to shader part epilog
6079  %result = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
6080  ret <2 x i128> %result
6081}
6082
; Declarations of the llvm.sadd.sat (signed saturating add, per the LLVM
; Language Reference) intrinsic overloads, grouped below by element bit width.
; Each declaration references attribute group #0, which is defined after the
; last declaration.

; Overloads with i7 and i8 elements.
6083declare i7 @llvm.sadd.sat.i7(i7, i7) #0
6084declare i8 @llvm.sadd.sat.i8(i8, i8) #0
6085declare <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8>, <2 x i8>) #0
6086declare <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8>, <4 x i8>) #0
6087
; Overloads with i16 elements: scalar plus 2-, 3-, 4-, 5-, 6- and 8-element
; vectors.
6088declare i16 @llvm.sadd.sat.i16(i16, i16) #0
6089declare <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16>, <2 x i16>) #0
6090declare <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16>, <3 x i16>) #0
6091declare <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16>, <4 x i16>) #0
6092declare <5 x i16> @llvm.sadd.sat.v5i16(<5 x i16>, <5 x i16>) #0
6093declare <6 x i16> @llvm.sadd.sat.v6i16(<6 x i16>, <6 x i16>) #0
6094declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>) #0
6095
; Scalar i24 overload.
6096declare i24 @llvm.sadd.sat.i24(i24, i24) #0
6097
; Overloads with i32 elements: scalar plus 2-, 3-, 4-, 5- and 16-element
; vectors.
6098declare i32 @llvm.sadd.sat.i32(i32, i32) #0
6099declare <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32>, <2 x i32>) #0
6100declare <3 x i32> @llvm.sadd.sat.v3i32(<3 x i32>, <3 x i32>) #0
6101declare <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32>, <4 x i32>) #0
6102declare <5 x i32> @llvm.sadd.sat.v5i32(<5 x i32>, <5 x i32>) #0
6103declare <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32>, <16 x i32>) #0
6104
; Scalar i48 overload.
6105declare i48 @llvm.sadd.sat.i48(i48, i48) #0
6106
; Overloads with i64 elements: scalar and 2-element vector.
6107declare i64 @llvm.sadd.sat.i64(i64, i64) #0
6108declare <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64>, <2 x i64>) #0
6109
; Overloads with i128 elements: scalar and 2-element vector.
6110declare i128 @llvm.sadd.sat.i128(i128, i128) #0
6111declare <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128>, <2 x i128>) #0
6112
; Attribute group referenced as #0 by every intrinsic declaration above.
6113attributes #0 = { nounwind readnone speculatable willreturn }
6114