; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s

; Signed saturating add of two i8 values.
; GFX6 sign-extends each byte to 32 bits (v_bfe_i32), adds, then clamps to
; [-128, 127] with v_min/v_max. GFX8 uses an SDWA sext add plus 16-bit min/max.
; GFX9 shifts the bytes into the high half and uses the hardware 'clamp'
; modifier on v_add_i16, then shifts back.
define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) {
; GFX6-LABEL: v_saddsat_i8:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 8
; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 8
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT:    v_min_i32_e32 v0, 0x7f, v0
; GFX6-NEXT:    v_max_i32_e32 v0, 0xffffff80, v0
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_i8:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_u16_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX8-NEXT:    v_min_i16_e32 v0, 0x7f, v0
; GFX8-NEXT:    v_max_i16_e32 v0, 0xff80, v0
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_i8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
; GFX9-NEXT:    v_add_i16 v0, v0, v1 clamp
; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 8, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %result = call i8 @llvm.sadd.sat.i8(i8 %lhs, i8 %rhs)
  ret i8 %result
}

; Signed saturating add of two i16 values.
; GFX6 sign-extends to 32 bits, adds, and clamps to [0xffff8000, 0x7fff].
; GFX8 detects signed overflow via compare/xor/cndmask and selects the
; saturated bound. GFX9 uses v_add_i16 with the hardware 'clamp' modifier.
define i16 @v_saddsat_i16(i16 %lhs, i16 %rhs) {
; GFX6-LABEL: v_saddsat_i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT:    v_min_i32_e32 v0, 0x7fff, v0
; GFX6-NEXT:    v_max_i32_e32 v0, 0xffff8000, v0
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v1
; GFX8-NEXT:    v_add_u16_e32 v1, v0, v1
; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, 0xffff8000
; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7fff
; GFX8-NEXT:    v_cmp_gt_i16_e64 s[6:7], 0, v1
; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[6:7]
; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_i16 v0, v0, v1 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs)
  ret i16 %result
}

; Signed saturating add of two i32 values.
; GFX6/GFX8 detect signed overflow with compare/xor and select INT32_MIN or
; INT32_MAX (both built cheaply via v_bfrev_b32 of 1 and -2) based on the
; sign of the wrapped sum. GFX9 uses v_add_i32 with the 'clamp' modifier.
define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) {
; GFX6-LABEL: v_saddsat_i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
; GFX6-NEXT:    v_add_i32_e64 v1, s[4:5], v0, v1
; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v1, v0
; GFX6-NEXT:    v_bfrev_b32_e32 v0, 1
; GFX6-NEXT:    v_bfrev_b32_e32 v2, -2
; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v1
; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[6:7]
; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
; GFX8-NEXT:    v_add_u32_e64 v1, s[4:5], v0, v1
; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v1, v0
; GFX8-NEXT:    v_bfrev_b32_e32 v0, 1
; GFX8-NEXT:    v_bfrev_b32_e32 v2, -2
; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v1
; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[6:7]
; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_i32 v0, v0, v1 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs)
  ret i32 %result
}

; Signed saturating add of <2 x i16>.
; GFX6 scalarizes: sign-extend each lane to 32 bits, add, min/max clamp,
; then repack into a single dword. GFX8 handles each 16-bit half with the
; compare/xor/cndmask overflow sequence and merges via v_or_b32_sdwa.
; GFX9 uses the packed v_pk_add_i16 with the 'clamp' modifier.
define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX6-LABEL: v_saddsat_v2i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 16
; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT:    v_bfe_i32 v3, v3, 0, 16
; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
; GFX6-NEXT:    s_movk_i32 s4, 0x7fff
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT:    v_min_i32_e32 v1, s4, v1
; GFX6-NEXT:    s_movk_i32 s5, 0x8000
; GFX6-NEXT:    v_min_i32_e32 v0, s4, v0
; GFX6-NEXT:    v_max_i32_e32 v1, s5, v1
; GFX6-NEXT:    v_max_i32_e32 v0, s5, v0
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_v2i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; GFX8-NEXT:    v_add_u16_e32 v4, v3, v2
; GFX8-NEXT:    v_mov_b32_e32 v5, 0xffff8000
; GFX8-NEXT:    v_mov_b32_e32 v6, 0x7fff
; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v4
; GFX8-NEXT:    v_cndmask_b32_e32 v7, v5, v6, vcc
; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v4, v3
; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v2
; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v7, vcc
; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v1
; GFX8-NEXT:    v_add_u16_e32 v1, v0, v1
; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v1, v0
; GFX8-NEXT:    v_cmp_gt_i16_e64 s[6:7], 0, v1
; GFX8-NEXT:    v_cndmask_b32_e64 v0, v5, v6, s[6:7]
; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
  ret <2 x i16> %result
}

; Signed saturating add of <3 x i16> (non-power-of-two vector legalization).
; GFX6 scalarizes all three lanes with sext/add/min/max and repacks.
; GFX8 processes three 16-bit lanes with the compare/xor/cndmask sequence.
; GFX9 legalizes to two packed v_pk_add_i16 'clamp' ops (the fourth lane
; is undefined padding).
define <3 x i16> @v_saddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
; GFX6-LABEL: v_saddsat_v3i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_bfe_i32 v3, v3, 0, 16
; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT:    v_bfe_i32 v4, v4, 0, 16
; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT:    v_bfe_i32 v5, v5, 0, 16
; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 16
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
; GFX6-NEXT:    s_movk_i32 s4, 0x7fff
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
; GFX6-NEXT:    v_min_i32_e32 v1, s4, v1
; GFX6-NEXT:    s_movk_i32 s5, 0x8000
; GFX6-NEXT:    v_min_i32_e32 v0, s4, v0
; GFX6-NEXT:    v_max_i32_e32 v1, s5, v1
; GFX6-NEXT:    v_max_i32_e32 v0, s5, v0
; GFX6-NEXT:    v_min_i32_e32 v2, s4, v2
; GFX6-NEXT:    v_max_i32_e32 v3, s5, v2
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX6-NEXT:    v_or_b32_e32 v2, 0xffff0000, v3
; GFX6-NEXT:    v_alignbit_b32 v1, v3, v1, 16
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_v3i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
; GFX8-NEXT:    v_add_u16_e32 v6, v5, v4
; GFX8-NEXT:    v_mov_b32_e32 v7, 0xffff8000
; GFX8-NEXT:    v_mov_b32_e32 v8, 0x7fff
; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v6
; GFX8-NEXT:    v_cndmask_b32_e32 v9, v7, v8, vcc
; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v6, v5
; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v4
; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v9, vcc
; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v3
; GFX8-NEXT:    v_add_u16_e32 v3, v1, v3
; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v3, v1
; GFX8-NEXT:    v_cmp_gt_i16_e64 s[6:7], 0, v3
; GFX8-NEXT:    v_cndmask_b32_e64 v1, v7, v8, s[6:7]
; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v2
; GFX8-NEXT:    v_add_u16_e32 v2, v0, v2
; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v2, v0
; GFX8-NEXT:    v_cmp_gt_i16_e64 s[6:7], 0, v2
; GFX8-NEXT:    v_cndmask_b32_e64 v0, v7, v8, s[6:7]
; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_v3i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_add_i16 v1, v1, v3 clamp
; GFX9-NEXT:    v_pk_add_i16 v0, v0, v2 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %result = call <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
  ret <3 x i16> %result
}

; Signed saturating add of <4 x i16>; the result is bitcast to <2 x float>
; so the return type is a pair of 32-bit registers regardless of how the
; i16 vector would be legalized. GFX6 scalarizes and repacks; GFX8 handles
; four 16-bit lanes with the compare/xor/cndmask sequence; GFX9 emits two
; packed v_pk_add_i16 'clamp' instructions.
define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
; GFX6-LABEL: v_saddsat_v4i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_bfe_i32 v4, v4, 0, 16
; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT:    v_bfe_i32 v5, v5, 0, 16
; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
; GFX6-NEXT:    s_movk_i32 s4, 0x7fff
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
; GFX6-NEXT:    v_min_i32_e32 v1, s4, v1
; GFX6-NEXT:    s_movk_i32 s5, 0x8000
; GFX6-NEXT:    v_min_i32_e32 v0, s4, v0
; GFX6-NEXT:    v_max_i32_e32 v1, s5, v1
; GFX6-NEXT:    v_max_i32_e32 v0, s5, v0
; GFX6-NEXT:    s_mov_b32 s6, 0xffff
; GFX6-NEXT:    v_bfe_i32 v6, v6, 0, 16
; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 16
; GFX6-NEXT:    v_bfe_i32 v7, v7, 0, 16
; GFX6-NEXT:    v_bfe_i32 v3, v3, 0, 16
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    v_and_b32_e32 v0, s6, v0
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v7
; GFX6-NEXT:    v_min_i32_e32 v1, s4, v1
; GFX6-NEXT:    v_min_i32_e32 v2, s4, v2
; GFX6-NEXT:    v_max_i32_e32 v1, s5, v1
; GFX6-NEXT:    v_max_i32_e32 v2, s5, v2
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    v_and_b32_e32 v2, s6, v2
; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_v4i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
; GFX8-NEXT:    v_add_u16_e32 v6, v5, v4
; GFX8-NEXT:    v_mov_b32_e32 v7, 0xffff8000
; GFX8-NEXT:    v_mov_b32_e32 v8, 0x7fff
; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v6
; GFX8-NEXT:    v_cndmask_b32_e32 v9, v7, v8, vcc
; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v6, v5
; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v4
; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v9, vcc
; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v2
; GFX8-NEXT:    v_add_u16_e32 v2, v0, v2
; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v2, v0
; GFX8-NEXT:    v_cmp_gt_i16_e64 s[6:7], 0, v2
; GFX8-NEXT:    v_cndmask_b32_e64 v0, v7, v8, s[6:7]
; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; GFX8-NEXT:    v_add_u16_e32 v5, v4, v2
; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v5
; GFX8-NEXT:    v_cndmask_b32_e32 v6, v7, v8, vcc
; GFX8-NEXT:    v_cmp_lt_i16_e32 vcc, v5, v4
; GFX8-NEXT:    v_cmp_gt_i16_e64 s[4:5], 0, v2
; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
; GFX8-NEXT:    v_cmp_gt_i16_e32 vcc, 0, v3
; GFX8-NEXT:    v_add_u16_e32 v3, v1, v3
; GFX8-NEXT:    v_cmp_lt_i16_e64 s[4:5], v3, v1
; GFX8-NEXT:    v_cmp_gt_i16_e64 s[6:7], 0, v3
; GFX8-NEXT:    v_cndmask_b32_e64 v1, v7, v8, s[6:7]
; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_v4i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_add_i16 v0, v0, v2 clamp
; GFX9-NEXT:    v_pk_add_i16 v1, v1, v3 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %result = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
  %cast = bitcast <4 x i16> %result to <2 x float>
  ret <2 x float> %cast
}

; Signed saturating add of <2 x i32> — fully scalarized on all targets:
; each lane gets the same overflow-detect/select sequence as the scalar
; i32 case (GFX6/GFX8), or a v_add_i32 'clamp' per lane (GFX9).
define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
; GFX6-LABEL: v_saddsat_v2i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
; GFX6-NEXT:    v_add_i32_e64 v2, s[4:5], v0, v2
; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v0
; GFX6-NEXT:    v_bfrev_b32_e32 v4, 1
; GFX6-NEXT:    v_bfrev_b32_e32 v5, -2
; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v2
; GFX6-NEXT:    v_cndmask_b32_e64 v0, v4, v5, s[6:7]
; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX6-NEXT:    v_add_i32_e64 v2, s[4:5], v1, v3
; GFX6-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v1
; GFX6-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v2
; GFX6-NEXT:    v_cndmask_b32_e64 v1, v4, v5, s[6:7]
; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_v2i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v2
; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], v0, v2
; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v0
; GFX8-NEXT:    v_bfrev_b32_e32 v4, 1
; GFX8-NEXT:    v_bfrev_b32_e32 v5, -2
; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v2
; GFX8-NEXT:    v_cndmask_b32_e64 v0, v4, v5, s[6:7]
; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], v1, v3
; GFX8-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v1
; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v2
; GFX8-NEXT:    v_cndmask_b32_e64 v1, v4, v5, s[6:7]
; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_i32 v0, v0, v2 clamp
; GFX9-NEXT:    v_add_i32 v1, v1, v3 clamp
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %result = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
  ret <2 x i32> %result
}

; Signed saturating add of two i64 values.
; All targets emit a 64-bit add-with-carry, detect signed overflow by
; XOR-ing "sum < lhs" with "rhs < 0", and on overflow select the saturated
; value: low word from the sign-shift of the high result (v_ashrrev_i32),
; high word INT32_MIN/INT32_MAX chosen by the sum's sign. Targets differ
; only in the add opcode names (GFX9 uses the _co_ carry-out forms).
define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
; GFX6-LABEL: v_saddsat_i64:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v0, v2
; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v1, v3, vcc
; GFX6-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
; GFX6-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
; GFX6-NEXT:    v_bfrev_b32_e32 v1, 1
; GFX6-NEXT:    s_xor_b64 vcc, s[4:5], vcc
; GFX6-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[4:5]
; GFX6-NEXT:    v_bfrev_b32_e32 v2, -2
; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[4:5]
; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_i64:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v0, v2
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v1, v3, vcc
; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
; GFX8-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
; GFX8-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[4:5]
; GFX8-NEXT:    v_bfrev_b32_e32 v2, -2
; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[4:5]
; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_saddsat_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v0, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
; GFX9-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
; GFX9-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[4:5]
; GFX9-NEXT:    v_bfrev_b32_e32 v2, -2
; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[4:5]
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs)
  ret i64 %result
}

; Intrinsic declarations for all tested llvm.sadd.sat overloads.
; NOTE(review): attribute group #0 is not visible in this chunk — presumably
; defined later in the file (typically nounwind/readnone/speculatable); confirm.
declare i8 @llvm.sadd.sat.i8(i8, i8) #0
declare i16 @llvm.sadd.sat.i16(i16, i16) #0
declare <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16>, <2 x i16>) #0
declare <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16>, <3 x i16>) #0
declare <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16>, <4 x i16>) #0
declare i32 @llvm.sadd.sat.i32(i32, i32) #0
declare <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32>, <2 x i32>) #0
declare i64 @llvm.sadd.sat.i64(i64, i64) #0
