1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
3; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s
4; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s
5; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s
6
; Unsigned saturating add of two i7 values with the default calling
; convention; the expected code reads the operands from v0/v1 and leaves the
; result in v0.
; Expected lowering, per the checks below:
;  - tahiti: both operands are shifted left by 25 so the 7-bit value sits in
;    the top bits of a 32-bit register, the sum is formed as
;    lhs + umin(~lhs, rhs), and the result is shifted back right by 25.
;  - fiji, gfx900, gfx1010: operands are shifted left by 9 with 16-bit shifts,
;    added with a 16-bit add carrying the clamp modifier, then shifted right
;    by 9.
7define i7 @v_uaddsat_i7(i7 %lhs, i7 %rhs) {
8; GFX6-LABEL: v_uaddsat_i7:
9; GFX6:       ; %bb.0:
10; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 25, v0
12; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 25, v1
13; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v0
14; GFX6-NEXT:    v_min_u32_e32 v1, v2, v1
15; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
16; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 25, v0
17; GFX6-NEXT:    s_setpc_b64 s[30:31]
18;
19; GFX8-LABEL: v_uaddsat_i7:
20; GFX8:       ; %bb.0:
21; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 9, v0
23; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 9, v1
24; GFX8-NEXT:    v_add_u16_e64 v0, v0, v1 clamp
25; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 9, v0
26; GFX8-NEXT:    s_setpc_b64 s[30:31]
27;
28; GFX9-LABEL: v_uaddsat_i7:
29; GFX9:       ; %bb.0:
30; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 9, v0
32; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 9, v1
33; GFX9-NEXT:    v_add_u16_e64 v0, v0, v1 clamp
34; GFX9-NEXT:    v_lshrrev_b16_e32 v0, 9, v0
35; GFX9-NEXT:    s_setpc_b64 s[30:31]
36;
37; GFX10-LABEL: v_uaddsat_i7:
38; GFX10:       ; %bb.0:
39; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
41; GFX10-NEXT:    v_lshlrev_b16 v0, 9, v0
42; GFX10-NEXT:    v_lshlrev_b16 v1, 9, v1
43; GFX10-NEXT:    v_add_nc_u16 v0, v0, v1 clamp
44; GFX10-NEXT:    v_lshrrev_b16 v0, 9, v0
45; GFX10-NEXT:    s_setpc_b64 s[30:31]
46  %result = call i7 @llvm.uadd.sat.i7(i7 %lhs, i7 %rhs)
47  ret i7 %result
48}
49
; Unsigned saturating add of two i7 values passed as `inreg` arguments to an
; amdgpu_ps function; the expected code reads the operands from s0/s1 and
; leaves the result in s0.
; Expected lowering, per the checks below:
;  - tahiti: entirely scalar. Operands are shifted left by 25, the sum is
;    formed as lhs + umin(~lhs, rhs), and the result is shifted right by 25.
;  - fiji, gfx900: the shift amount 9 is produced by an s_bfe_u32, both
;    operands are shifted left with scalar shifts, one operand is copied to a
;    VGPR, a 16-bit vector add with clamp does the saturating add, and the
;    result is shifted right by 9 and moved back with v_readfirstlane_b32.
;  - gfx1010: same shape, but both scalar operands feed the clamped vector
;    add directly (no v_mov_b32 copy is expected).
50define amdgpu_ps i7 @s_uaddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
51; GFX6-LABEL: s_uaddsat_i7:
52; GFX6:       ; %bb.0:
53; GFX6-NEXT:    s_lshl_b32 s0, s0, 25
54; GFX6-NEXT:    s_lshl_b32 s1, s1, 25
55; GFX6-NEXT:    s_not_b32 s2, s0
56; GFX6-NEXT:    s_min_u32 s1, s2, s1
57; GFX6-NEXT:    s_add_i32 s0, s0, s1
58; GFX6-NEXT:    s_lshr_b32 s0, s0, 25
59; GFX6-NEXT:    ; return to shader part epilog
60;
61; GFX8-LABEL: s_uaddsat_i7:
62; GFX8:       ; %bb.0:
63; GFX8-NEXT:    s_bfe_u32 s2, 9, 0x100000
64; GFX8-NEXT:    s_lshl_b32 s1, s1, s2
65; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
66; GFX8-NEXT:    v_mov_b32_e32 v0, s1
67; GFX8-NEXT:    v_add_u16_e64 v0, s0, v0 clamp
68; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 9, v0
69; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
70; GFX8-NEXT:    ; return to shader part epilog
71;
72; GFX9-LABEL: s_uaddsat_i7:
73; GFX9:       ; %bb.0:
74; GFX9-NEXT:    s_bfe_u32 s2, 9, 0x100000
75; GFX9-NEXT:    s_lshl_b32 s1, s1, s2
76; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
77; GFX9-NEXT:    v_mov_b32_e32 v0, s1
78; GFX9-NEXT:    v_add_u16_e64 v0, s0, v0 clamp
79; GFX9-NEXT:    v_lshrrev_b16_e32 v0, 9, v0
80; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
81; GFX9-NEXT:    ; return to shader part epilog
82;
83; GFX10-LABEL: s_uaddsat_i7:
84; GFX10:       ; %bb.0:
85; GFX10-NEXT:    s_bfe_u32 s2, 9, 0x100000
86; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
87; GFX10-NEXT:    s_lshl_b32 s1, s1, s2
88; GFX10-NEXT:    v_add_nc_u16 v0, s0, s1 clamp
89; GFX10-NEXT:    v_lshrrev_b16 v0, 9, v0
90; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
91; GFX10-NEXT:    ; return to shader part epilog
92  %result = call i7 @llvm.uadd.sat.i7(i7 %lhs, i7 %rhs)
93  ret i7 %result
94}
95
; Unsigned saturating add of two i8 values with the default calling
; convention; the expected code reads the operands from v0/v1 and leaves the
; result in v0.
; Expected lowering, per the checks below:
;  - tahiti: both operands are shifted left by 24 into the top byte of a
;    32-bit register, the sum is formed as lhs + umin(~lhs, rhs), and the
;    result is shifted back right by 24.
;  - fiji, gfx900, gfx1010: operands are shifted left by 8 with 16-bit shifts,
;    added with a 16-bit add carrying the clamp modifier, then shifted right
;    by 8.
96define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) {
97; GFX6-LABEL: v_uaddsat_i8:
98; GFX6:       ; %bb.0:
99; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
100; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
101; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
102; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v0
103; GFX6-NEXT:    v_min_u32_e32 v1, v2, v1
104; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
105; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
106; GFX6-NEXT:    s_setpc_b64 s[30:31]
107;
108; GFX8-LABEL: v_uaddsat_i8:
109; GFX8:       ; %bb.0:
110; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
111; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
112; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
113; GFX8-NEXT:    v_add_u16_e64 v0, v0, v1 clamp
114; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
115; GFX8-NEXT:    s_setpc_b64 s[30:31]
116;
117; GFX9-LABEL: v_uaddsat_i8:
118; GFX9:       ; %bb.0:
119; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
120; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
121; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
122; GFX9-NEXT:    v_add_u16_e64 v0, v0, v1 clamp
123; GFX9-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
124; GFX9-NEXT:    s_setpc_b64 s[30:31]
125;
126; GFX10-LABEL: v_uaddsat_i8:
127; GFX10:       ; %bb.0:
128; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
129; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
130; GFX10-NEXT:    v_lshlrev_b16 v0, 8, v0
131; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v1
132; GFX10-NEXT:    v_add_nc_u16 v0, v0, v1 clamp
133; GFX10-NEXT:    v_lshrrev_b16 v0, 8, v0
134; GFX10-NEXT:    s_setpc_b64 s[30:31]
135  %result = call i8 @llvm.uadd.sat.i8(i8 %lhs, i8 %rhs)
136  ret i8 %result
137}
138
; Unsigned saturating add of two i8 values passed as `inreg` arguments to an
; amdgpu_ps function; the expected code reads the operands from s0/s1 and
; leaves the result in s0.
; Expected lowering, per the checks below:
;  - tahiti: entirely scalar. Operands are shifted left by 24, the sum is
;    formed as lhs + umin(~lhs, rhs), and the result is shifted right by 24.
;  - fiji, gfx900: the shift amount 8 is produced by an s_bfe_u32, both
;    operands are shifted left with scalar shifts, one operand is copied to a
;    VGPR, a 16-bit vector add with clamp does the saturating add, and the
;    result is shifted right by 8 and moved back with v_readfirstlane_b32.
;  - gfx1010: same shape, but both scalar operands feed the clamped vector
;    add directly (no v_mov_b32 copy is expected).
139define amdgpu_ps i8 @s_uaddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
140; GFX6-LABEL: s_uaddsat_i8:
141; GFX6:       ; %bb.0:
142; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
143; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
144; GFX6-NEXT:    s_not_b32 s2, s0
145; GFX6-NEXT:    s_min_u32 s1, s2, s1
146; GFX6-NEXT:    s_add_i32 s0, s0, s1
147; GFX6-NEXT:    s_lshr_b32 s0, s0, 24
148; GFX6-NEXT:    ; return to shader part epilog
149;
150; GFX8-LABEL: s_uaddsat_i8:
151; GFX8:       ; %bb.0:
152; GFX8-NEXT:    s_bfe_u32 s2, 8, 0x100000
153; GFX8-NEXT:    s_lshl_b32 s1, s1, s2
154; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
155; GFX8-NEXT:    v_mov_b32_e32 v0, s1
156; GFX8-NEXT:    v_add_u16_e64 v0, s0, v0 clamp
157; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
158; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
159; GFX8-NEXT:    ; return to shader part epilog
160;
161; GFX9-LABEL: s_uaddsat_i8:
162; GFX9:       ; %bb.0:
163; GFX9-NEXT:    s_bfe_u32 s2, 8, 0x100000
164; GFX9-NEXT:    s_lshl_b32 s1, s1, s2
165; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
166; GFX9-NEXT:    v_mov_b32_e32 v0, s1
167; GFX9-NEXT:    v_add_u16_e64 v0, s0, v0 clamp
168; GFX9-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
169; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
170; GFX9-NEXT:    ; return to shader part epilog
171;
172; GFX10-LABEL: s_uaddsat_i8:
173; GFX10:       ; %bb.0:
174; GFX10-NEXT:    s_bfe_u32 s2, 8, 0x100000
175; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
176; GFX10-NEXT:    s_lshl_b32 s1, s1, s2
177; GFX10-NEXT:    v_add_nc_u16 v0, s0, s1 clamp
178; GFX10-NEXT:    v_lshrrev_b16 v0, 8, v0
179; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
180; GFX10-NEXT:    ; return to shader part epilog
181  %result = call i8 @llvm.uadd.sat.i8(i8 %lhs, i8 %rhs)
182  ret i8 %result
183}
184
; Unsigned saturating add of two <2 x i8> vectors with the default calling
; convention. The vectors are passed and returned as i16 and bitcast to and
; from <2 x i8> in the body, so the expected code has to split the bytes out
; of v0/v1 and repack the result into v0.
; Expected lowering, per the checks below:
;  - tahiti: each byte is handled on its own with the 32-bit sequence
;    (shift left by 24, lhs + umin(~lhs, rhs), shift right by 24); the high
;    byte result is shifted left by 8 and OR-ed with the low byte result.
;  - fiji: one 16-bit clamped add per byte, with SDWA shifts and an SDWA OR
;    used to position and recombine the bytes.
;  - gfx900, gfx1010: the two bytes are placed in the two 16-bit halves of a
;    register, shifted with packed 16-bit shifts by 8, added with a single
;    v_pk_add_u16 carrying clamp, shifted back, and repacked with SDWA and/or.
185define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
186; GFX6-LABEL: v_uaddsat_v2i8:
187; GFX6:       ; %bb.0:
188; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
189; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
190; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
191; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
192; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
193; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v0
194; GFX6-NEXT:    v_min_u32_e32 v1, v4, v1
195; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
196; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
197; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
198; GFX6-NEXT:    v_xor_b32_e32 v3, -1, v1
199; GFX6-NEXT:    v_min_u32_e32 v2, v3, v2
200; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
201; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
202; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
203; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
204; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
205; GFX6-NEXT:    s_setpc_b64 s[30:31]
206;
207; GFX8-LABEL: v_uaddsat_v2i8:
208; GFX8:       ; %bb.0:
209; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
210; GFX8-NEXT:    v_mov_b32_e32 v2, 8
211; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
212; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
213; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
214; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
215; GFX8-NEXT:    v_add_u16_e64 v0, v0, v1 clamp
216; GFX8-NEXT:    v_add_u16_e64 v1, v3, v2 clamp
217; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 8, v1
218; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
219; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
220; GFX8-NEXT:    s_setpc_b64 s[30:31]
221;
222; GFX9-LABEL: v_uaddsat_v2i8:
223; GFX9:       ; %bb.0:
224; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
225; GFX9-NEXT:    s_mov_b32 s4, 8
226; GFX9-NEXT:    v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
227; GFX9-NEXT:    v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
228; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
229; GFX9-NEXT:    v_and_or_b32 v0, v0, v4, v2
230; GFX9-NEXT:    v_and_or_b32 v1, v1, v4, v3
231; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
232; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
233; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1 clamp
234; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
235; GFX9-NEXT:    s_movk_i32 s4, 0xff
236; GFX9-NEXT:    v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
237; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
238; GFX9-NEXT:    s_setpc_b64 s[30:31]
239;
240; GFX10-LABEL: v_uaddsat_v2i8:
241; GFX10:       ; %bb.0:
242; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
243; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
244; GFX10-NEXT:    s_mov_b32 s4, 8
245; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff
246; GFX10-NEXT:    v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
247; GFX10-NEXT:    v_lshrrev_b32_sdwa v4, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
248; GFX10-NEXT:    s_movk_i32 s4, 0xff
249; GFX10-NEXT:    v_and_or_b32 v0, v0, v2, v3
250; GFX10-NEXT:    v_and_or_b32 v1, v1, v2, v4
251; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
252; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
253; GFX10-NEXT:    v_pk_add_u16 v0, v0, v1 clamp
254; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
255; GFX10-NEXT:    v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
256; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
257; GFX10-NEXT:    s_setpc_b64 s[30:31]
258  %lhs = bitcast i16 %lhs.arg to <2 x i8>
259  %rhs = bitcast i16 %rhs.arg to <2 x i8>
260  %result = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs)
261  %cast.result = bitcast <2 x i8> %result to i16
262  ret i16 %cast.result
263}
264
; Unsigned saturating add of two <2 x i8> vectors passed as `inreg` i16
; arguments to an amdgpu_ps function. The body bitcasts the i16 arguments to
; <2 x i8> and the result back to i16; the expected code reads s0/s1 and
; leaves the result in s0.
; Expected lowering, per the checks below:
;  - tahiti: entirely scalar. Each byte goes through shift left by 24,
;    lhs + umin(~lhs, rhs), shift right by 24; the high byte result is then
;    shifted left by 8 and OR-ed with the low byte result.
;  - fiji: bytes are separated and shifted with scalar instructions, each
;    byte is added with a 16-bit clamped vector add, the result is recombined
;    with an SDWA OR and moved back with v_readfirstlane_b32.
;  - gfx900, gfx1010: bytes are packed into 16-bit halves with
;    s_pack_ll_b32_b16, shifted left by 8 per half, added with a single
;    v_pk_add_u16 carrying clamp, shifted back with a packed shift, repacked
;    with SDWA and/or, and moved back with v_readfirstlane_b32.
265define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
266; GFX6-LABEL: s_uaddsat_v2i8:
267; GFX6:       ; %bb.0:
268; GFX6-NEXT:    s_lshr_b32 s2, s0, 8
269; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
270; GFX6-NEXT:    s_lshr_b32 s3, s1, 8
271; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
272; GFX6-NEXT:    s_not_b32 s4, s0
273; GFX6-NEXT:    s_min_u32 s1, s4, s1
274; GFX6-NEXT:    s_add_i32 s0, s0, s1
275; GFX6-NEXT:    s_lshl_b32 s1, s2, 24
276; GFX6-NEXT:    s_lshl_b32 s2, s3, 24
277; GFX6-NEXT:    s_not_b32 s3, s1
278; GFX6-NEXT:    s_min_u32 s2, s3, s2
279; GFX6-NEXT:    s_add_i32 s1, s1, s2
280; GFX6-NEXT:    s_lshr_b32 s1, s1, 24
281; GFX6-NEXT:    s_lshr_b32 s0, s0, 24
282; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
283; GFX6-NEXT:    s_or_b32 s0, s0, s1
284; GFX6-NEXT:    ; return to shader part epilog
285;
286; GFX8-LABEL: s_uaddsat_v2i8:
287; GFX8:       ; %bb.0:
288; GFX8-NEXT:    s_bfe_u32 s4, 8, 0x100000
289; GFX8-NEXT:    s_lshr_b32 s3, s1, 8
290; GFX8-NEXT:    s_lshl_b32 s1, s1, s4
291; GFX8-NEXT:    s_lshr_b32 s2, s0, 8
292; GFX8-NEXT:    s_lshl_b32 s0, s0, s4
293; GFX8-NEXT:    v_mov_b32_e32 v0, s1
294; GFX8-NEXT:    s_lshl_b32 s1, s3, s4
295; GFX8-NEXT:    v_add_u16_e64 v0, s0, v0 clamp
296; GFX8-NEXT:    s_lshl_b32 s0, s2, s4
297; GFX8-NEXT:    v_mov_b32_e32 v1, s1
298; GFX8-NEXT:    v_add_u16_e64 v1, s0, v1 clamp
299; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 8, v1
300; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
301; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
302; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
303; GFX8-NEXT:    ; return to shader part epilog
304;
305; GFX9-LABEL: s_uaddsat_v2i8:
306; GFX9:       ; %bb.0:
307; GFX9-NEXT:    s_lshr_b32 s2, s0, 8
308; GFX9-NEXT:    s_lshr_b32 s3, s1, 8
309; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
310; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
311; GFX9-NEXT:    s_mov_b32 s2, 0x80008
312; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
313; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
314; GFX9-NEXT:    s_lshl_b32 s3, s3, 8
315; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
316; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
317; GFX9-NEXT:    s_lshl_b32 s1, s1, s2
318; GFX9-NEXT:    s_lshl_b32 s2, s3, 8
319; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
320; GFX9-NEXT:    v_mov_b32_e32 v0, s1
321; GFX9-NEXT:    v_pk_add_u16 v0, s0, v0 clamp
322; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
323; GFX9-NEXT:    s_movk_i32 s0, 0xff
324; GFX9-NEXT:    v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
325; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
326; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
327; GFX9-NEXT:    ; return to shader part epilog
328;
329; GFX10-LABEL: s_uaddsat_v2i8:
330; GFX10:       ; %bb.0:
331; GFX10-NEXT:    s_lshr_b32 s2, s0, 8
332; GFX10-NEXT:    s_lshr_b32 s3, s1, 8
333; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
334; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
335; GFX10-NEXT:    s_mov_b32 s2, 0x80008
336; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
337; GFX10-NEXT:    s_lshr_b32 s4, s1, 16
338; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
339; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
340; GFX10-NEXT:    s_lshl_b32 s1, s1, s2
341; GFX10-NEXT:    s_lshl_b32 s2, s4, 8
342; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
343; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
344; GFX10-NEXT:    v_pk_add_u16 v0, s0, s1 clamp
345; GFX10-NEXT:    s_movk_i32 s0, 0xff
346; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
347; GFX10-NEXT:    v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
348; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
349; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
350; GFX10-NEXT:    ; return to shader part epilog
351  %lhs = bitcast i16 %lhs.arg to <2 x i8>
352  %rhs = bitcast i16 %rhs.arg to <2 x i8>
353  %result = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs)
354  %cast.result = bitcast <2 x i8> %result to i16
355  ret i16 %cast.result
356}
357
; Unsigned saturating add of two <4 x i8> vectors with the default calling
; convention. The vectors are passed and returned as i32 and bitcast to and
; from <4 x i8> in the body, so the expected code has to split the four bytes
; out of v0/v1 and repack the result into v0.
; Expected lowering, per the checks below:
;  - tahiti: each of the four bytes goes through the 32-bit sequence (shift
;    left by 24, lhs + umin(~lhs, rhs), shift right by 24); the results are
;    shifted to bit positions 0, 8, 16 and 24 and OR-ed together.
;  - fiji: four 16-bit clamped adds, one per byte; the results are masked with
;    0xff through SDWA ands and OR-ed together.
;  - gfx900, gfx1010: the bytes are arranged as two registers of two 16-bit
;    lanes, shifted with packed shifts by 8, added with two v_pk_add_u16
;    carrying clamp, shifted back, and merged with and_or / or3.
358define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
359; GFX6-LABEL: v_uaddsat_v4i8:
360; GFX6:       ; %bb.0:
361; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
362; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
363; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
364; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
365; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
366; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
367; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
368; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
369; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
370; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v0
371; GFX6-NEXT:    v_min_u32_e32 v1, v8, v1
372; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
373; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
374; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v5
375; GFX6-NEXT:    v_xor_b32_e32 v5, -1, v1
376; GFX6-NEXT:    v_min_u32_e32 v2, v5, v2
377; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
378; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
379; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
380; GFX6-NEXT:    v_xor_b32_e32 v5, -1, v2
381; GFX6-NEXT:    v_min_u32_e32 v3, v5, v3
382; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
383; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
384; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
385; GFX6-NEXT:    v_xor_b32_e32 v5, -1, v3
386; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
387; GFX6-NEXT:    v_min_u32_e32 v4, v5, v4
388; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
389; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
390; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
391; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
392; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
393; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
394; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
395; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
396; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
397; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
398; GFX6-NEXT:    s_setpc_b64 s[30:31]
399;
400; GFX8-LABEL: v_uaddsat_v4i8:
401; GFX8:       ; %bb.0:
402; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
403; GFX8-NEXT:    v_mov_b32_e32 v2, 8
404; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
405; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
406; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
407; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
408; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
409; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
410; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
411; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
412; GFX8-NEXT:    v_add_u16_e64 v0, v0, v1 clamp
413; GFX8-NEXT:    v_add_u16_e64 v1, v3, v2 clamp
414; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 8, v4
415; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 8, v6
416; GFX8-NEXT:    v_add_u16_e64 v2, v2, v3 clamp
417; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 8, v5
418; GFX8-NEXT:    v_lshlrev_b16_e32 v4, 8, v7
419; GFX8-NEXT:    v_add_u16_e64 v3, v3, v4 clamp
420; GFX8-NEXT:    v_mov_b32_e32 v4, 0xff
421; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
422; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
423; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
424; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
425; GFX8-NEXT:    v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
426; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
427; GFX8-NEXT:    v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
428; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
429; GFX8-NEXT:    s_setpc_b64 s[30:31]
430;
431; GFX9-LABEL: v_uaddsat_v4i8:
432; GFX9:       ; %bb.0:
433; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
434; GFX9-NEXT:    s_mov_b32 s4, 8
435; GFX9-NEXT:    v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
436; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
437; GFX9-NEXT:    v_mov_b32_e32 v8, 0xffff
438; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
439; GFX9-NEXT:    v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
440; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
441; GFX9-NEXT:    v_and_or_b32 v0, v0, v8, v2
442; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
443; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
444; GFX9-NEXT:    v_and_or_b32 v2, v3, v8, v2
445; GFX9-NEXT:    v_and_or_b32 v1, v1, v8, v5
446; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
447; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
448; GFX9-NEXT:    v_and_or_b32 v3, v6, v8, v3
449; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
450; GFX9-NEXT:    v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
451; GFX9-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
452; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1 clamp
453; GFX9-NEXT:    v_pk_add_u16 v1, v2, v3 clamp
454; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
455; GFX9-NEXT:    v_mov_b32_e32 v2, 8
456; GFX9-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
457; GFX9-NEXT:    s_movk_i32 s4, 0xff
458; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
459; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v2
460; GFX9-NEXT:    v_and_b32_e32 v2, s4, v1
461; GFX9-NEXT:    v_mov_b32_e32 v3, 24
462; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
463; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
464; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
465; GFX9-NEXT:    s_setpc_b64 s[30:31]
466;
467; GFX10-LABEL: v_uaddsat_v4i8:
468; GFX10:       ; %bb.0:
469; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
470; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
471; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
472; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
473; GFX10-NEXT:    s_mov_b32 s4, 8
474; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
475; GFX10-NEXT:    v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
476; GFX10-NEXT:    v_lshrrev_b32_sdwa v6, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
477; GFX10-NEXT:    v_mov_b32_e32 v7, 0xffff
478; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
479; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
480; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
481; GFX10-NEXT:    s_movk_i32 s4, 0xff
482; GFX10-NEXT:    v_and_or_b32 v0, v0, v7, v2
483; GFX10-NEXT:    v_and_or_b32 v1, v1, v7, v6
484; GFX10-NEXT:    v_and_or_b32 v2, v3, v7, v4
485; GFX10-NEXT:    v_and_or_b32 v3, v8, v7, v5
486; GFX10-NEXT:    v_mov_b32_e32 v4, 24
487; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
488; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
489; GFX10-NEXT:    v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
490; GFX10-NEXT:    v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
491; GFX10-NEXT:    v_pk_add_u16 v0, v0, v1 clamp
492; GFX10-NEXT:    v_pk_add_u16 v1, v2, v3 clamp
493; GFX10-NEXT:    v_mov_b32_e32 v2, 8
494; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
495; GFX10-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
496; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
497; GFX10-NEXT:    v_and_b32_e32 v3, s4, v1
498; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
499; GFX10-NEXT:    v_and_or_b32 v0, v0, s4, v2
500; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
501; GFX10-NEXT:    v_or3_b32 v0, v0, v2, v1
502; GFX10-NEXT:    s_setpc_b64 s[30:31]
503  %lhs = bitcast i32 %lhs.arg to <4 x i8>
504  %rhs = bitcast i32 %rhs.arg to <4 x i8>
505  %result = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs)
506  %cast.result = bitcast <4 x i8> %result to i32
507  ret i32 %cast.result
508}
509
; Unsigned saturating add of two <4 x i8> vectors passed as `inreg` i32
; arguments to an amdgpu_ps function. The body bitcasts the i32 arguments to
; <4 x i8> and the result back to i32; the expected code reads s0/s1 and
; leaves the result in s0.
; Expected lowering, per the checks below:
;  - tahiti: entirely scalar. Each of the four bytes goes through shift left
;    by 24, lhs + umin(~lhs, rhs), shift right by 24; the results are shifted
;    to bit positions 0, 8, 16 and 24 and OR-ed together.
;  - fiji: bytes are separated and shifted with scalar instructions, each
;    byte is added with a 16-bit clamped vector add, the results are masked
;    with 0xff through SDWA ands, OR-ed together, and moved back with
;    v_readfirstlane_b32.
;  - gfx900, gfx1010: bytes are packed into 16-bit halves with
;    s_pack_ll_b32_b16, shifted left by 8 per half, added with two
;    v_pk_add_u16 carrying clamp, shifted back with packed shifts, merged with
;    and_or / or3, and moved back with v_readfirstlane_b32.
510define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
511; GFX6-LABEL: s_uaddsat_v4i8:
512; GFX6:       ; %bb.0:
513; GFX6-NEXT:    s_lshr_b32 s2, s0, 8
514; GFX6-NEXT:    s_lshr_b32 s3, s0, 16
515; GFX6-NEXT:    s_lshr_b32 s4, s0, 24
516; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
517; GFX6-NEXT:    s_lshr_b32 s5, s1, 8
518; GFX6-NEXT:    s_lshr_b32 s6, s1, 16
519; GFX6-NEXT:    s_lshr_b32 s7, s1, 24
520; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
521; GFX6-NEXT:    s_not_b32 s8, s0
522; GFX6-NEXT:    s_min_u32 s1, s8, s1
523; GFX6-NEXT:    s_add_i32 s0, s0, s1
524; GFX6-NEXT:    s_lshl_b32 s1, s2, 24
525; GFX6-NEXT:    s_lshl_b32 s2, s5, 24
526; GFX6-NEXT:    s_not_b32 s5, s1
527; GFX6-NEXT:    s_min_u32 s2, s5, s2
528; GFX6-NEXT:    s_add_i32 s1, s1, s2
529; GFX6-NEXT:    s_lshl_b32 s2, s3, 24
530; GFX6-NEXT:    s_lshl_b32 s3, s6, 24
531; GFX6-NEXT:    s_not_b32 s5, s2
532; GFX6-NEXT:    s_min_u32 s3, s5, s3
533; GFX6-NEXT:    s_add_i32 s2, s2, s3
534; GFX6-NEXT:    s_lshl_b32 s3, s4, 24
535; GFX6-NEXT:    s_lshl_b32 s4, s7, 24
536; GFX6-NEXT:    s_not_b32 s5, s3
537; GFX6-NEXT:    s_lshr_b32 s1, s1, 24
538; GFX6-NEXT:    s_min_u32 s4, s5, s4
539; GFX6-NEXT:    s_lshr_b32 s0, s0, 24
540; GFX6-NEXT:    s_lshr_b32 s2, s2, 24
541; GFX6-NEXT:    s_add_i32 s3, s3, s4
542; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
543; GFX6-NEXT:    s_lshr_b32 s3, s3, 24
544; GFX6-NEXT:    s_or_b32 s0, s0, s1
545; GFX6-NEXT:    s_lshl_b32 s1, s2, 16
546; GFX6-NEXT:    s_or_b32 s0, s0, s1
547; GFX6-NEXT:    s_lshl_b32 s1, s3, 24
548; GFX6-NEXT:    s_or_b32 s0, s0, s1
549; GFX6-NEXT:    ; return to shader part epilog
550;
551; GFX8-LABEL: s_uaddsat_v4i8:
552; GFX8:       ; %bb.0:
553; GFX8-NEXT:    s_bfe_u32 s8, 8, 0x100000
554; GFX8-NEXT:    s_lshr_b32 s5, s1, 8
555; GFX8-NEXT:    s_lshr_b32 s6, s1, 16
556; GFX8-NEXT:    s_lshr_b32 s7, s1, 24
557; GFX8-NEXT:    s_lshl_b32 s1, s1, s8
558; GFX8-NEXT:    s_lshr_b32 s2, s0, 8
559; GFX8-NEXT:    s_lshr_b32 s3, s0, 16
560; GFX8-NEXT:    s_lshr_b32 s4, s0, 24
561; GFX8-NEXT:    s_lshl_b32 s0, s0, s8
562; GFX8-NEXT:    v_mov_b32_e32 v0, s1
563; GFX8-NEXT:    s_lshl_b32 s1, s5, s8
564; GFX8-NEXT:    v_add_u16_e64 v0, s0, v0 clamp
565; GFX8-NEXT:    s_lshl_b32 s0, s2, s8
566; GFX8-NEXT:    v_mov_b32_e32 v1, s1
567; GFX8-NEXT:    v_add_u16_e64 v1, s0, v1 clamp
568; GFX8-NEXT:    s_lshl_b32 s1, s6, s8
569; GFX8-NEXT:    v_mov_b32_e32 v4, 0xff
570; GFX8-NEXT:    s_lshl_b32 s0, s3, s8
571; GFX8-NEXT:    v_mov_b32_e32 v2, s1
572; GFX8-NEXT:    s_lshl_b32 s1, s7, s8
573; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
574; GFX8-NEXT:    v_add_u16_e64 v2, s0, v2 clamp
575; GFX8-NEXT:    s_lshl_b32 s0, s4, s8
576; GFX8-NEXT:    v_mov_b32_e32 v3, s1
577; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
578; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
579; GFX8-NEXT:    v_add_u16_e64 v3, s0, v3 clamp
580; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
581; GFX8-NEXT:    v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
582; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
583; GFX8-NEXT:    v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
584; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
585; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
586; GFX8-NEXT:    ; return to shader part epilog
587;
588; GFX9-LABEL: s_uaddsat_v4i8:
589; GFX9:       ; %bb.0:
590; GFX9-NEXT:    s_lshr_b32 s3, s0, 8
591; GFX9-NEXT:    s_lshr_b32 s4, s0, 16
592; GFX9-NEXT:    s_lshr_b32 s6, s0, 24
593; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
594; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s4, s6
595; GFX9-NEXT:    s_mov_b32 s4, 0x80008
596; GFX9-NEXT:    s_lshr_b32 s6, s0, 16
597; GFX9-NEXT:    s_lshr_b32 s7, s1, 8
598; GFX9-NEXT:    s_lshl_b32 s0, s0, s4
599; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
600; GFX9-NEXT:    s_lshr_b32 s8, s1, 16
601; GFX9-NEXT:    s_lshr_b32 s9, s1, 24
602; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s6
603; GFX9-NEXT:    s_lshr_b32 s6, s3, 16
604; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s7
605; GFX9-NEXT:    s_lshl_b32 s3, s3, s4
606; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
607; GFX9-NEXT:    s_lshr_b32 s7, s1, 16
608; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s6
609; GFX9-NEXT:    s_pack_ll_b32_b16 s6, s8, s9
610; GFX9-NEXT:    s_lshl_b32 s1, s1, s4
611; GFX9-NEXT:    s_lshl_b32 s7, s7, 8
612; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s7
613; GFX9-NEXT:    s_lshr_b32 s7, s6, 16
614; GFX9-NEXT:    s_lshl_b32 s4, s6, s4
615; GFX9-NEXT:    s_lshl_b32 s6, s7, 8
616; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s6
617; GFX9-NEXT:    v_mov_b32_e32 v0, s1
618; GFX9-NEXT:    v_pk_add_u16 v0, s0, v0 clamp
619; GFX9-NEXT:    v_mov_b32_e32 v1, s4
620; GFX9-NEXT:    s_mov_b32 s2, 8
621; GFX9-NEXT:    v_pk_add_u16 v1, s3, v1 clamp
622; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
623; GFX9-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
624; GFX9-NEXT:    s_movk_i32 s0, 0xff
625; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
626; GFX9-NEXT:    s_mov_b32 s5, 24
627; GFX9-NEXT:    v_and_or_b32 v0, v0, s0, v2
628; GFX9-NEXT:    v_and_b32_e32 v2, s0, v1
629; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
630; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
631; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
632; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
633; GFX9-NEXT:    ; return to shader part epilog
634;
635; GFX10-LABEL: s_uaddsat_v4i8:
636; GFX10:       ; %bb.0:
637; GFX10-NEXT:    s_lshr_b32 s2, s0, 8
638; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
639; GFX10-NEXT:    s_lshr_b32 s4, s0, 24
640; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
641; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s3, s4
642; GFX10-NEXT:    s_mov_b32 s3, 0x80008
643; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
644; GFX10-NEXT:    s_lshr_b32 s5, s1, 8
645; GFX10-NEXT:    s_lshr_b32 s6, s1, 16
646; GFX10-NEXT:    s_lshr_b32 s7, s1, 24
647; GFX10-NEXT:    s_lshl_b32 s0, s0, s3
648; GFX10-NEXT:    s_lshl_b32 s4, s4, 8
649; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
650; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
651; GFX10-NEXT:    s_pack_ll_b32_b16 s4, s6, s7
652; GFX10-NEXT:    s_lshr_b32 s8, s2, 16
653; GFX10-NEXT:    s_lshr_b32 s5, s1, 16
654; GFX10-NEXT:    s_lshr_b32 s6, s4, 16
655; GFX10-NEXT:    s_lshl_b32 s2, s2, s3
656; GFX10-NEXT:    s_lshl_b32 s8, s8, 8
657; GFX10-NEXT:    s_lshl_b32 s1, s1, s3
658; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
659; GFX10-NEXT:    s_lshl_b32 s3, s4, s3
660; GFX10-NEXT:    s_lshl_b32 s4, s6, 8
661; GFX10-NEXT:    s_pack_ll_b32_b16 s2, s2, s8
662; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
663; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
664; GFX10-NEXT:    v_pk_add_u16 v0, s0, s1 clamp
665; GFX10-NEXT:    v_pk_add_u16 v1, s2, s3 clamp
666; GFX10-NEXT:    s_mov_b32 s0, 8
667; GFX10-NEXT:    s_movk_i32 s1, 0xff
668; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
669; GFX10-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
670; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
671; GFX10-NEXT:    v_and_b32_e32 v3, s1, v1
672; GFX10-NEXT:    s_mov_b32 s0, 24
673; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
674; GFX10-NEXT:    v_and_or_b32 v0, v0, s1, v2
675; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
676; GFX10-NEXT:    v_or3_b32 v0, v0, v2, v1
677; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
678; GFX10-NEXT:    ; return to shader part epilog
679  %lhs = bitcast i32 %lhs.arg to <4 x i8>
680  %rhs = bitcast i32 %rhs.arg to <4 x i8>
681  %result = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs)
682  %cast.result = bitcast <4 x i8> %result to i32
683  ret i32 %cast.result
684}
685
; Unsigned saturating add of two i24 values with the default calling
; convention; the expected code reads the operands from v0/v1 and leaves the
; result in v0.
; Expected lowering, per the checks below (all targets shift both operands
; left by 8 first and shift the result right by 8 afterwards):
;  - tahiti: the sum is formed as lhs + umin(~lhs, rhs) with a plain 32-bit
;    add.
;  - fiji: a v_add_u32_e64 with the clamp modifier, which also names an SGPR
;    pair (s[4:5]) as a second destination operand.
;  - gfx900: a v_add_u32_e64 with the clamp modifier and no SGPR-pair operand.
;  - gfx1010: a v_add_nc_u32_e64 with the clamp modifier.
686define i24 @v_uaddsat_i24(i24 %lhs, i24 %rhs) {
687; GFX6-LABEL: v_uaddsat_i24:
688; GFX6:       ; %bb.0:
689; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
690; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
691; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
692; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v0
693; GFX6-NEXT:    v_min_u32_e32 v1, v2, v1
694; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
695; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
696; GFX6-NEXT:    s_setpc_b64 s[30:31]
697;
698; GFX8-LABEL: v_uaddsat_i24:
699; GFX8:       ; %bb.0:
700; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
701; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
702; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
703; GFX8-NEXT:    v_add_u32_e64 v0, s[4:5], v0, v1 clamp
704; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
705; GFX8-NEXT:    s_setpc_b64 s[30:31]
706;
707; GFX9-LABEL: v_uaddsat_i24:
708; GFX9:       ; %bb.0:
709; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
710; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
711; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
712; GFX9-NEXT:    v_add_u32_e64 v0, v0, v1 clamp
713; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
714; GFX9-NEXT:    s_setpc_b64 s[30:31]
715;
716; GFX10-LABEL: v_uaddsat_i24:
717; GFX10:       ; %bb.0:
718; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
719; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
720; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
721; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
722; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, v1 clamp
723; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
724; GFX10-NEXT:    s_setpc_b64 s[30:31]
725  %result = call i24 @llvm.uadd.sat.i24(i24 %lhs, i24 %rhs)
726  ret i24 %result
727}
728
; Unsigned saturating add of two i24 values passed inreg to an amdgpu_ps
; function (operands arrive in s0/s1). GFX6 stays entirely on the scalar unit
; (shift, not/min/add, shift back). GFX8, GFX9 and GFX10 do the shifts on the
; scalar unit, perform the clamped add and the final shift with vector
; instructions, and move the result back to s0 with v_readfirstlane_b32.
729define amdgpu_ps i24 @s_uaddsat_i24(i24 inreg %lhs, i24 inreg %rhs) {
730; GFX6-LABEL: s_uaddsat_i24:
731; GFX6:       ; %bb.0:
732; GFX6-NEXT:    s_lshl_b32 s0, s0, 8
733; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
734; GFX6-NEXT:    s_not_b32 s2, s0
735; GFX6-NEXT:    s_min_u32 s1, s2, s1
736; GFX6-NEXT:    s_add_i32 s0, s0, s1
737; GFX6-NEXT:    s_lshr_b32 s0, s0, 8
738; GFX6-NEXT:    ; return to shader part epilog
739;
740; GFX8-LABEL: s_uaddsat_i24:
741; GFX8:       ; %bb.0:
742; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
743; GFX8-NEXT:    s_lshl_b32 s0, s0, 8
744; GFX8-NEXT:    v_mov_b32_e32 v0, s1
745; GFX8-NEXT:    v_add_u32_e64 v0, s[0:1], s0, v0 clamp
746; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
747; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
748; GFX8-NEXT:    ; return to shader part epilog
749;
750; GFX9-LABEL: s_uaddsat_i24:
751; GFX9:       ; %bb.0:
752; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
753; GFX9-NEXT:    s_lshl_b32 s0, s0, 8
754; GFX9-NEXT:    v_mov_b32_e32 v0, s1
755; GFX9-NEXT:    v_add_u32_e64 v0, s0, v0 clamp
756; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
757; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
758; GFX9-NEXT:    ; return to shader part epilog
759;
760; GFX10-LABEL: s_uaddsat_i24:
761; GFX10:       ; %bb.0:
762; GFX10-NEXT:    s_lshl_b32 s0, s0, 8
763; GFX10-NEXT:    s_lshl_b32 s1, s1, 8
764; GFX10-NEXT:    v_add_nc_u32_e64 v0, s0, s1 clamp
765; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
766; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
767; GFX10-NEXT:    ; return to shader part epilog
768  %result = call i24 @llvm.uadd.sat.i24(i24 %lhs, i24 %rhs)
769  ret i24 %result
770}
771
; Unsigned saturating add of two i32 values passed with the default calling
; convention (operands in v0/v1). GFX6 expands to xor -1 / min_u32 / add;
; GFX8, GFX9 and GFX10 each select a single vector add with the clamp
; modifier.
772define i32 @v_uaddsat_i32(i32 %lhs, i32 %rhs) {
773; GFX6-LABEL: v_uaddsat_i32:
774; GFX6:       ; %bb.0:
775; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
776; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v0
777; GFX6-NEXT:    v_min_u32_e32 v1, v2, v1
778; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
779; GFX6-NEXT:    s_setpc_b64 s[30:31]
780;
781; GFX8-LABEL: v_uaddsat_i32:
782; GFX8:       ; %bb.0:
783; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
784; GFX8-NEXT:    v_add_u32_e64 v0, s[4:5], v0, v1 clamp
785; GFX8-NEXT:    s_setpc_b64 s[30:31]
786;
787; GFX9-LABEL: v_uaddsat_i32:
788; GFX9:       ; %bb.0:
789; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
790; GFX9-NEXT:    v_add_u32_e64 v0, v0, v1 clamp
791; GFX9-NEXT:    s_setpc_b64 s[30:31]
792;
793; GFX10-LABEL: v_uaddsat_i32:
794; GFX10:       ; %bb.0:
795; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
796; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
797; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, v1 clamp
798; GFX10-NEXT:    s_setpc_b64 s[30:31]
799  %result = call i32 @llvm.uadd.sat.i32(i32 %lhs, i32 %rhs)
800  ret i32 %result
801}
802
; Unsigned saturating add of two i32 values passed inreg to an amdgpu_ps
; function (operands in s0/s1). GFX6 uses s_not / s_min_u32 / s_add_i32.
; GFX8 and GFX9 copy the rhs into v0 before the clamped vector add; GFX10
; feeds both scalar registers to the add directly. The non-GFX6 targets
; return the value to s0 with v_readfirstlane_b32.
803define amdgpu_ps i32 @s_uaddsat_i32(i32 inreg %lhs, i32 inreg %rhs) {
804; GFX6-LABEL: s_uaddsat_i32:
805; GFX6:       ; %bb.0:
806; GFX6-NEXT:    s_not_b32 s2, s0
807; GFX6-NEXT:    s_min_u32 s1, s2, s1
808; GFX6-NEXT:    s_add_i32 s0, s0, s1
809; GFX6-NEXT:    ; return to shader part epilog
810;
811; GFX8-LABEL: s_uaddsat_i32:
812; GFX8:       ; %bb.0:
813; GFX8-NEXT:    v_mov_b32_e32 v0, s1
814; GFX8-NEXT:    v_add_u32_e64 v0, s[0:1], s0, v0 clamp
815; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
816; GFX8-NEXT:    ; return to shader part epilog
817;
818; GFX9-LABEL: s_uaddsat_i32:
819; GFX9:       ; %bb.0:
820; GFX9-NEXT:    v_mov_b32_e32 v0, s1
821; GFX9-NEXT:    v_add_u32_e64 v0, s0, v0 clamp
822; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
823; GFX9-NEXT:    ; return to shader part epilog
824;
825; GFX10-LABEL: s_uaddsat_i32:
826; GFX10:       ; %bb.0:
827; GFX10-NEXT:    v_add_nc_u32_e64 v0, s0, s1 clamp
828; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
829; GFX10-NEXT:    ; return to shader part epilog
830  %result = call i32 @llvm.uadd.sat.i32(i32 %lhs, i32 %rhs)
831  ret i32 %result
832}
833
; Mixed-operand case: lhs is inreg (s0) and rhs is a regular argument (v0).
; The i32 result is bitcast to float for the return, and the expected code
; leaves it in v0 with no v_readfirstlane_b32. GFX6 computes the complement
; of lhs on the scalar unit and then uses vector min/add; the other targets
; emit one clamped vector add with s0 as the first source.
834define amdgpu_ps float @uaddsat_i32_sv(i32 inreg %lhs, i32 %rhs) {
835; GFX6-LABEL: uaddsat_i32_sv:
836; GFX6:       ; %bb.0:
837; GFX6-NEXT:    s_not_b32 s1, s0
838; GFX6-NEXT:    v_min_u32_e32 v0, s1, v0
839; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
840; GFX6-NEXT:    ; return to shader part epilog
841;
842; GFX8-LABEL: uaddsat_i32_sv:
843; GFX8:       ; %bb.0:
844; GFX8-NEXT:    v_add_u32_e64 v0, s[0:1], s0, v0 clamp
845; GFX8-NEXT:    ; return to shader part epilog
846;
847; GFX9-LABEL: uaddsat_i32_sv:
848; GFX9:       ; %bb.0:
849; GFX9-NEXT:    v_add_u32_e64 v0, s0, v0 clamp
850; GFX9-NEXT:    ; return to shader part epilog
851;
852; GFX10-LABEL: uaddsat_i32_sv:
853; GFX10:       ; %bb.0:
854; GFX10-NEXT:    v_add_nc_u32_e64 v0, s0, v0 clamp
855; GFX10-NEXT:    ; return to shader part epilog
856  %result = call i32 @llvm.uadd.sat.i32(i32 %lhs, i32 %rhs)
857  %cast = bitcast i32 %result to float
858  ret float %cast
859}
860
; Mixed-operand case with the roles swapped: lhs is a regular argument (v0)
; and rhs is inreg (s0). The i32 result is bitcast to float for the return
; and stays in v0. GFX6 complements lhs with v_xor_b32, takes the unsigned
; min against s0 and adds; the other targets emit one clamped vector add with
; s0 as the second source.
861define amdgpu_ps float @uaddsat_i32_vs(i32 %lhs, i32 inreg %rhs) {
862; GFX6-LABEL: uaddsat_i32_vs:
863; GFX6:       ; %bb.0:
864; GFX6-NEXT:    v_xor_b32_e32 v1, -1, v0
865; GFX6-NEXT:    v_min_u32_e32 v1, s0, v1
866; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
867; GFX6-NEXT:    ; return to shader part epilog
868;
869; GFX8-LABEL: uaddsat_i32_vs:
870; GFX8:       ; %bb.0:
871; GFX8-NEXT:    v_add_u32_e64 v0, s[0:1], v0, s0 clamp
872; GFX8-NEXT:    ; return to shader part epilog
873;
874; GFX9-LABEL: uaddsat_i32_vs:
875; GFX9:       ; %bb.0:
876; GFX9-NEXT:    v_add_u32_e64 v0, v0, s0 clamp
877; GFX9-NEXT:    ; return to shader part epilog
878;
879; GFX10-LABEL: uaddsat_i32_vs:
880; GFX10:       ; %bb.0:
881; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, s0 clamp
882; GFX10-NEXT:    ; return to shader part epilog
883  %result = call i32 @llvm.uadd.sat.i32(i32 %lhs, i32 %rhs)
884  %cast = bitcast i32 %result to float
885  ret float %cast
886}
887
; Unsigned saturating add of <2 x i32> vectors passed with the default calling
; convention (lhs in v0-v1, rhs in v2-v3). The operation is handled one
; element at a time: GFX6 emits the xor/min/add triple per element, the other
; targets one clamped vector add per element.
888define <2 x i32> @v_uaddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
889; GFX6-LABEL: v_uaddsat_v2i32:
890; GFX6:       ; %bb.0:
891; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
892; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v0
893; GFX6-NEXT:    v_min_u32_e32 v2, v4, v2
894; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
895; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v1
896; GFX6-NEXT:    v_min_u32_e32 v2, v2, v3
897; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
898; GFX6-NEXT:    s_setpc_b64 s[30:31]
899;
900; GFX8-LABEL: v_uaddsat_v2i32:
901; GFX8:       ; %bb.0:
902; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
903; GFX8-NEXT:    v_add_u32_e64 v0, s[4:5], v0, v2 clamp
904; GFX8-NEXT:    v_add_u32_e64 v1, s[4:5], v1, v3 clamp
905; GFX8-NEXT:    s_setpc_b64 s[30:31]
906;
907; GFX9-LABEL: v_uaddsat_v2i32:
908; GFX9:       ; %bb.0:
909; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
910; GFX9-NEXT:    v_add_u32_e64 v0, v0, v2 clamp
911; GFX9-NEXT:    v_add_u32_e64 v1, v1, v3 clamp
912; GFX9-NEXT:    s_setpc_b64 s[30:31]
913;
914; GFX10-LABEL: v_uaddsat_v2i32:
915; GFX10:       ; %bb.0:
916; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
917; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
918; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, v2 clamp
919; GFX10-NEXT:    v_add_nc_u32_e64 v1, v1, v3 clamp
920; GFX10-NEXT:    s_setpc_b64 s[30:31]
921  %result = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
922  ret <2 x i32> %result
923}
924
; Unsigned saturating add of <2 x i32> vectors passed inreg to an amdgpu_ps
; function (lhs in s0-s1, rhs in s2-s3). GFX6 keeps each element on the scalar
; unit (not/min/add). GFX8 and GFX9 copy the rhs elements into VGPRs before
; the clamped vector adds; GFX10 uses the scalar operands directly. The
; non-GFX6 targets read each result back with v_readfirstlane_b32.
925define amdgpu_ps <2 x i32> @s_uaddsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inreg %rhs) {
926; GFX6-LABEL: s_uaddsat_v2i32:
927; GFX6:       ; %bb.0:
928; GFX6-NEXT:    s_not_b32 s4, s0
929; GFX6-NEXT:    s_min_u32 s2, s4, s2
930; GFX6-NEXT:    s_add_i32 s0, s0, s2
931; GFX6-NEXT:    s_not_b32 s2, s1
932; GFX6-NEXT:    s_min_u32 s2, s2, s3
933; GFX6-NEXT:    s_add_i32 s1, s1, s2
934; GFX6-NEXT:    ; return to shader part epilog
935;
936; GFX8-LABEL: s_uaddsat_v2i32:
937; GFX8:       ; %bb.0:
938; GFX8-NEXT:    v_mov_b32_e32 v0, s2
939; GFX8-NEXT:    v_mov_b32_e32 v1, s3
940; GFX8-NEXT:    v_add_u32_e64 v0, s[4:5], s0, v0 clamp
941; GFX8-NEXT:    v_add_u32_e64 v1, s[0:1], s1, v1 clamp
942; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
943; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
944; GFX8-NEXT:    ; return to shader part epilog
945;
946; GFX9-LABEL: s_uaddsat_v2i32:
947; GFX9:       ; %bb.0:
948; GFX9-NEXT:    v_mov_b32_e32 v0, s2
949; GFX9-NEXT:    v_mov_b32_e32 v1, s3
950; GFX9-NEXT:    v_add_u32_e64 v0, s0, v0 clamp
951; GFX9-NEXT:    v_add_u32_e64 v1, s1, v1 clamp
952; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
953; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
954; GFX9-NEXT:    ; return to shader part epilog
955;
956; GFX10-LABEL: s_uaddsat_v2i32:
957; GFX10:       ; %bb.0:
958; GFX10-NEXT:    v_add_nc_u32_e64 v0, s0, s2 clamp
959; GFX10-NEXT:    v_add_nc_u32_e64 v1, s1, s3 clamp
960; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
961; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
962; GFX10-NEXT:    ; return to shader part epilog
963  %result = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
964  ret <2 x i32> %result
965}
966
; Unsigned saturating add of <3 x i32> vectors passed with the default calling
; convention (lhs in v0-v2, rhs in v3-v5). Handled element by element: the
; xor/min/add triple per element on GFX6, one clamped vector add per element
; on the other targets.
967define <3 x i32> @v_uaddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
968; GFX6-LABEL: v_uaddsat_v3i32:
969; GFX6:       ; %bb.0:
970; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
971; GFX6-NEXT:    v_xor_b32_e32 v6, -1, v0
972; GFX6-NEXT:    v_min_u32_e32 v3, v6, v3
973; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
974; GFX6-NEXT:    v_xor_b32_e32 v3, -1, v1
975; GFX6-NEXT:    v_min_u32_e32 v3, v3, v4
976; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
977; GFX6-NEXT:    v_xor_b32_e32 v3, -1, v2
978; GFX6-NEXT:    v_min_u32_e32 v3, v3, v5
979; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
980; GFX6-NEXT:    s_setpc_b64 s[30:31]
981;
982; GFX8-LABEL: v_uaddsat_v3i32:
983; GFX8:       ; %bb.0:
984; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
985; GFX8-NEXT:    v_add_u32_e64 v0, s[4:5], v0, v3 clamp
986; GFX8-NEXT:    v_add_u32_e64 v1, s[4:5], v1, v4 clamp
987; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], v2, v5 clamp
988; GFX8-NEXT:    s_setpc_b64 s[30:31]
989;
990; GFX9-LABEL: v_uaddsat_v3i32:
991; GFX9:       ; %bb.0:
992; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
993; GFX9-NEXT:    v_add_u32_e64 v0, v0, v3 clamp
994; GFX9-NEXT:    v_add_u32_e64 v1, v1, v4 clamp
995; GFX9-NEXT:    v_add_u32_e64 v2, v2, v5 clamp
996; GFX9-NEXT:    s_setpc_b64 s[30:31]
997;
998; GFX10-LABEL: v_uaddsat_v3i32:
999; GFX10:       ; %bb.0:
1000; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1001; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1002; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, v3 clamp
1003; GFX10-NEXT:    v_add_nc_u32_e64 v1, v1, v4 clamp
1004; GFX10-NEXT:    v_add_nc_u32_e64 v2, v2, v5 clamp
1005; GFX10-NEXT:    s_setpc_b64 s[30:31]
1006  %result = call <3 x i32> @llvm.uadd.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
1007  ret <3 x i32> %result
1008}
1009
; Unsigned saturating add of <3 x i32> vectors passed inreg to an amdgpu_ps
; function (lhs in s0-s2, rhs in s3-s5). GFX6 keeps each element on the scalar
; unit. GFX8 and GFX9 copy the rhs elements into VGPRs before the clamped
; vector adds; GFX10 uses the scalar operands directly. The non-GFX6 targets
; read each result back with v_readfirstlane_b32.
1010define amdgpu_ps <3 x i32> @s_uaddsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inreg %rhs) {
1011; GFX6-LABEL: s_uaddsat_v3i32:
1012; GFX6:       ; %bb.0:
1013; GFX6-NEXT:    s_not_b32 s6, s0
1014; GFX6-NEXT:    s_min_u32 s3, s6, s3
1015; GFX6-NEXT:    s_add_i32 s0, s0, s3
1016; GFX6-NEXT:    s_not_b32 s3, s1
1017; GFX6-NEXT:    s_min_u32 s3, s3, s4
1018; GFX6-NEXT:    s_add_i32 s1, s1, s3
1019; GFX6-NEXT:    s_not_b32 s3, s2
1020; GFX6-NEXT:    s_min_u32 s3, s3, s5
1021; GFX6-NEXT:    s_add_i32 s2, s2, s3
1022; GFX6-NEXT:    ; return to shader part epilog
1023;
1024; GFX8-LABEL: s_uaddsat_v3i32:
1025; GFX8:       ; %bb.0:
1026; GFX8-NEXT:    v_mov_b32_e32 v0, s3
1027; GFX8-NEXT:    v_mov_b32_e32 v1, s4
1028; GFX8-NEXT:    v_mov_b32_e32 v2, s5
1029; GFX8-NEXT:    v_add_u32_e64 v0, s[6:7], s0, v0 clamp
1030; GFX8-NEXT:    v_add_u32_e64 v1, s[0:1], s1, v1 clamp
1031; GFX8-NEXT:    v_add_u32_e64 v2, s[0:1], s2, v2 clamp
1032; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
1033; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
1034; GFX8-NEXT:    v_readfirstlane_b32 s2, v2
1035; GFX8-NEXT:    ; return to shader part epilog
1036;
1037; GFX9-LABEL: s_uaddsat_v3i32:
1038; GFX9:       ; %bb.0:
1039; GFX9-NEXT:    v_mov_b32_e32 v0, s3
1040; GFX9-NEXT:    v_mov_b32_e32 v1, s4
1041; GFX9-NEXT:    v_mov_b32_e32 v2, s5
1042; GFX9-NEXT:    v_add_u32_e64 v0, s0, v0 clamp
1043; GFX9-NEXT:    v_add_u32_e64 v1, s1, v1 clamp
1044; GFX9-NEXT:    v_add_u32_e64 v2, s2, v2 clamp
1045; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1046; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
1047; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
1048; GFX9-NEXT:    ; return to shader part epilog
1049;
1050; GFX10-LABEL: s_uaddsat_v3i32:
1051; GFX10:       ; %bb.0:
1052; GFX10-NEXT:    v_add_nc_u32_e64 v0, s0, s3 clamp
1053; GFX10-NEXT:    v_add_nc_u32_e64 v1, s1, s4 clamp
1054; GFX10-NEXT:    v_add_nc_u32_e64 v2, s2, s5 clamp
1055; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
1056; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
1057; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
1058; GFX10-NEXT:    ; return to shader part epilog
1059  %result = call <3 x i32> @llvm.uadd.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs)
1060  ret <3 x i32> %result
1061}
1062
; Unsigned saturating add of <4 x i32> vectors passed with the default calling
; convention (lhs in v0-v3, rhs in v4-v7). Handled element by element: the
; xor/min/add triple per element on GFX6, one clamped vector add per element
; on the other targets.
1063define <4 x i32> @v_uaddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
1064; GFX6-LABEL: v_uaddsat_v4i32:
1065; GFX6:       ; %bb.0:
1066; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1067; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v0
1068; GFX6-NEXT:    v_min_u32_e32 v4, v8, v4
1069; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
1070; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v1
1071; GFX6-NEXT:    v_min_u32_e32 v4, v4, v5
1072; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
1073; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v2
1074; GFX6-NEXT:    v_min_u32_e32 v4, v4, v6
1075; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
1076; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v3
1077; GFX6-NEXT:    v_min_u32_e32 v4, v4, v7
1078; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
1079; GFX6-NEXT:    s_setpc_b64 s[30:31]
1080;
1081; GFX8-LABEL: v_uaddsat_v4i32:
1082; GFX8:       ; %bb.0:
1083; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1084; GFX8-NEXT:    v_add_u32_e64 v0, s[4:5], v0, v4 clamp
1085; GFX8-NEXT:    v_add_u32_e64 v1, s[4:5], v1, v5 clamp
1086; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], v2, v6 clamp
1087; GFX8-NEXT:    v_add_u32_e64 v3, s[4:5], v3, v7 clamp
1088; GFX8-NEXT:    s_setpc_b64 s[30:31]
1089;
1090; GFX9-LABEL: v_uaddsat_v4i32:
1091; GFX9:       ; %bb.0:
1092; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1093; GFX9-NEXT:    v_add_u32_e64 v0, v0, v4 clamp
1094; GFX9-NEXT:    v_add_u32_e64 v1, v1, v5 clamp
1095; GFX9-NEXT:    v_add_u32_e64 v2, v2, v6 clamp
1096; GFX9-NEXT:    v_add_u32_e64 v3, v3, v7 clamp
1097; GFX9-NEXT:    s_setpc_b64 s[30:31]
1098;
1099; GFX10-LABEL: v_uaddsat_v4i32:
1100; GFX10:       ; %bb.0:
1101; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1102; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1103; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, v4 clamp
1104; GFX10-NEXT:    v_add_nc_u32_e64 v1, v1, v5 clamp
1105; GFX10-NEXT:    v_add_nc_u32_e64 v2, v2, v6 clamp
1106; GFX10-NEXT:    v_add_nc_u32_e64 v3, v3, v7 clamp
1107; GFX10-NEXT:    s_setpc_b64 s[30:31]
1108  %result = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
1109  ret <4 x i32> %result
1110}
1111
; Unsigned saturating add of <4 x i32> vectors passed inreg to an amdgpu_ps
; function (lhs in s0-s3, rhs in s4-s7). GFX6 keeps each element on the scalar
; unit. GFX8 and GFX9 copy the rhs elements into VGPRs before the clamped
; vector adds; GFX10 uses the scalar operands directly. The non-GFX6 targets
; read each result back with v_readfirstlane_b32.
1112define amdgpu_ps <4 x i32> @s_uaddsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inreg %rhs) {
1113; GFX6-LABEL: s_uaddsat_v4i32:
1114; GFX6:       ; %bb.0:
1115; GFX6-NEXT:    s_not_b32 s8, s0
1116; GFX6-NEXT:    s_min_u32 s4, s8, s4
1117; GFX6-NEXT:    s_add_i32 s0, s0, s4
1118; GFX6-NEXT:    s_not_b32 s4, s1
1119; GFX6-NEXT:    s_min_u32 s4, s4, s5
1120; GFX6-NEXT:    s_add_i32 s1, s1, s4
1121; GFX6-NEXT:    s_not_b32 s4, s2
1122; GFX6-NEXT:    s_min_u32 s4, s4, s6
1123; GFX6-NEXT:    s_add_i32 s2, s2, s4
1124; GFX6-NEXT:    s_not_b32 s4, s3
1125; GFX6-NEXT:    s_min_u32 s4, s4, s7
1126; GFX6-NEXT:    s_add_i32 s3, s3, s4
1127; GFX6-NEXT:    ; return to shader part epilog
1128;
1129; GFX8-LABEL: s_uaddsat_v4i32:
1130; GFX8:       ; %bb.0:
1131; GFX8-NEXT:    v_mov_b32_e32 v0, s4
1132; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1133; GFX8-NEXT:    v_mov_b32_e32 v2, s6
1134; GFX8-NEXT:    v_mov_b32_e32 v3, s7
1135; GFX8-NEXT:    v_add_u32_e64 v0, s[8:9], s0, v0 clamp
1136; GFX8-NEXT:    v_add_u32_e64 v1, s[0:1], s1, v1 clamp
1137; GFX8-NEXT:    v_add_u32_e64 v2, s[0:1], s2, v2 clamp
1138; GFX8-NEXT:    v_add_u32_e64 v3, s[0:1], s3, v3 clamp
1139; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
1140; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
1141; GFX8-NEXT:    v_readfirstlane_b32 s2, v2
1142; GFX8-NEXT:    v_readfirstlane_b32 s3, v3
1143; GFX8-NEXT:    ; return to shader part epilog
1144;
1145; GFX9-LABEL: s_uaddsat_v4i32:
1146; GFX9:       ; %bb.0:
1147; GFX9-NEXT:    v_mov_b32_e32 v0, s4
1148; GFX9-NEXT:    v_mov_b32_e32 v1, s5
1149; GFX9-NEXT:    v_mov_b32_e32 v2, s6
1150; GFX9-NEXT:    v_mov_b32_e32 v3, s7
1151; GFX9-NEXT:    v_add_u32_e64 v0, s0, v0 clamp
1152; GFX9-NEXT:    v_add_u32_e64 v1, s1, v1 clamp
1153; GFX9-NEXT:    v_add_u32_e64 v2, s2, v2 clamp
1154; GFX9-NEXT:    v_add_u32_e64 v3, s3, v3 clamp
1155; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1156; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
1157; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
1158; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
1159; GFX9-NEXT:    ; return to shader part epilog
1160;
1161; GFX10-LABEL: s_uaddsat_v4i32:
1162; GFX10:       ; %bb.0:
1163; GFX10-NEXT:    v_add_nc_u32_e64 v0, s0, s4 clamp
1164; GFX10-NEXT:    v_add_nc_u32_e64 v1, s1, s5 clamp
1165; GFX10-NEXT:    v_add_nc_u32_e64 v2, s2, s6 clamp
1166; GFX10-NEXT:    v_add_nc_u32_e64 v3, s3, s7 clamp
1167; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
1168; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
1169; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
1170; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
1171; GFX10-NEXT:    ; return to shader part epilog
1172  %result = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
1173  ret <4 x i32> %result
1174}
1175
; Unsigned saturating add of <5 x i32> vectors passed with the default calling
; convention (lhs in v0-v4, rhs in v5-v9). Handled element by element: the
; xor/min/add triple per element on GFX6, one clamped vector add per element
; on the other targets.
1176define <5 x i32> @v_uaddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
1177; GFX6-LABEL: v_uaddsat_v5i32:
1178; GFX6:       ; %bb.0:
1179; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1180; GFX6-NEXT:    v_xor_b32_e32 v10, -1, v0
1181; GFX6-NEXT:    v_min_u32_e32 v5, v10, v5
1182; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v5
1183; GFX6-NEXT:    v_xor_b32_e32 v5, -1, v1
1184; GFX6-NEXT:    v_min_u32_e32 v5, v5, v6
1185; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
1186; GFX6-NEXT:    v_xor_b32_e32 v5, -1, v2
1187; GFX6-NEXT:    v_min_u32_e32 v5, v5, v7
1188; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
1189; GFX6-NEXT:    v_xor_b32_e32 v5, -1, v3
1190; GFX6-NEXT:    v_min_u32_e32 v5, v5, v8
1191; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
1192; GFX6-NEXT:    v_xor_b32_e32 v5, -1, v4
1193; GFX6-NEXT:    v_min_u32_e32 v5, v5, v9
1194; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
1195; GFX6-NEXT:    s_setpc_b64 s[30:31]
1196;
1197; GFX8-LABEL: v_uaddsat_v5i32:
1198; GFX8:       ; %bb.0:
1199; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1200; GFX8-NEXT:    v_add_u32_e64 v0, s[4:5], v0, v5 clamp
1201; GFX8-NEXT:    v_add_u32_e64 v1, s[4:5], v1, v6 clamp
1202; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], v2, v7 clamp
1203; GFX8-NEXT:    v_add_u32_e64 v3, s[4:5], v3, v8 clamp
1204; GFX8-NEXT:    v_add_u32_e64 v4, s[4:5], v4, v9 clamp
1205; GFX8-NEXT:    s_setpc_b64 s[30:31]
1206;
1207; GFX9-LABEL: v_uaddsat_v5i32:
1208; GFX9:       ; %bb.0:
1209; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1210; GFX9-NEXT:    v_add_u32_e64 v0, v0, v5 clamp
1211; GFX9-NEXT:    v_add_u32_e64 v1, v1, v6 clamp
1212; GFX9-NEXT:    v_add_u32_e64 v2, v2, v7 clamp
1213; GFX9-NEXT:    v_add_u32_e64 v3, v3, v8 clamp
1214; GFX9-NEXT:    v_add_u32_e64 v4, v4, v9 clamp
1215; GFX9-NEXT:    s_setpc_b64 s[30:31]
1216;
1217; GFX10-LABEL: v_uaddsat_v5i32:
1218; GFX10:       ; %bb.0:
1219; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1220; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1221; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, v5 clamp
1222; GFX10-NEXT:    v_add_nc_u32_e64 v1, v1, v6 clamp
1223; GFX10-NEXT:    v_add_nc_u32_e64 v2, v2, v7 clamp
1224; GFX10-NEXT:    v_add_nc_u32_e64 v3, v3, v8 clamp
1225; GFX10-NEXT:    v_add_nc_u32_e64 v4, v4, v9 clamp
1226; GFX10-NEXT:    s_setpc_b64 s[30:31]
1227  %result = call <5 x i32> @llvm.uadd.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs)
1228  ret <5 x i32> %result
1229}
1230
; Unsigned saturating add of <5 x i32> vectors passed inreg to an amdgpu_ps
; function (lhs in s0-s4, rhs in s5-s9). GFX6 keeps each element on the scalar
; unit. GFX8 and GFX9 copy the rhs elements into VGPRs before the clamped
; vector adds; GFX10 uses the scalar operands directly. The non-GFX6 targets
; read each result back with v_readfirstlane_b32.
1231define amdgpu_ps <5 x i32> @s_uaddsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inreg %rhs) {
1232; GFX6-LABEL: s_uaddsat_v5i32:
1233; GFX6:       ; %bb.0:
1234; GFX6-NEXT:    s_not_b32 s10, s0
1235; GFX6-NEXT:    s_min_u32 s5, s10, s5
1236; GFX6-NEXT:    s_add_i32 s0, s0, s5
1237; GFX6-NEXT:    s_not_b32 s5, s1
1238; GFX6-NEXT:    s_min_u32 s5, s5, s6
1239; GFX6-NEXT:    s_add_i32 s1, s1, s5
1240; GFX6-NEXT:    s_not_b32 s5, s2
1241; GFX6-NEXT:    s_min_u32 s5, s5, s7
1242; GFX6-NEXT:    s_add_i32 s2, s2, s5
1243; GFX6-NEXT:    s_not_b32 s5, s3
1244; GFX6-NEXT:    s_min_u32 s5, s5, s8
1245; GFX6-NEXT:    s_add_i32 s3, s3, s5
1246; GFX6-NEXT:    s_not_b32 s5, s4
1247; GFX6-NEXT:    s_min_u32 s5, s5, s9
1248; GFX6-NEXT:    s_add_i32 s4, s4, s5
1249; GFX6-NEXT:    ; return to shader part epilog
1250;
1251; GFX8-LABEL: s_uaddsat_v5i32:
1252; GFX8:       ; %bb.0:
1253; GFX8-NEXT:    v_mov_b32_e32 v0, s5
1254; GFX8-NEXT:    v_mov_b32_e32 v1, s6
1255; GFX8-NEXT:    v_mov_b32_e32 v2, s7
1256; GFX8-NEXT:    v_mov_b32_e32 v3, s8
1257; GFX8-NEXT:    v_mov_b32_e32 v4, s9
1258; GFX8-NEXT:    v_add_u32_e64 v0, s[10:11], s0, v0 clamp
1259; GFX8-NEXT:    v_add_u32_e64 v1, s[0:1], s1, v1 clamp
1260; GFX8-NEXT:    v_add_u32_e64 v2, s[0:1], s2, v2 clamp
1261; GFX8-NEXT:    v_add_u32_e64 v3, s[0:1], s3, v3 clamp
1262; GFX8-NEXT:    v_add_u32_e64 v4, s[0:1], s4, v4 clamp
1263; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
1264; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
1265; GFX8-NEXT:    v_readfirstlane_b32 s2, v2
1266; GFX8-NEXT:    v_readfirstlane_b32 s3, v3
1267; GFX8-NEXT:    v_readfirstlane_b32 s4, v4
1268; GFX8-NEXT:    ; return to shader part epilog
1269;
1270; GFX9-LABEL: s_uaddsat_v5i32:
1271; GFX9:       ; %bb.0:
1272; GFX9-NEXT:    v_mov_b32_e32 v0, s5
1273; GFX9-NEXT:    v_mov_b32_e32 v1, s6
1274; GFX9-NEXT:    v_mov_b32_e32 v2, s7
1275; GFX9-NEXT:    v_mov_b32_e32 v3, s8
1276; GFX9-NEXT:    v_mov_b32_e32 v4, s9
1277; GFX9-NEXT:    v_add_u32_e64 v0, s0, v0 clamp
1278; GFX9-NEXT:    v_add_u32_e64 v1, s1, v1 clamp
1279; GFX9-NEXT:    v_add_u32_e64 v2, s2, v2 clamp
1280; GFX9-NEXT:    v_add_u32_e64 v3, s3, v3 clamp
1281; GFX9-NEXT:    v_add_u32_e64 v4, s4, v4 clamp
1282; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1283; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
1284; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
1285; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
1286; GFX9-NEXT:    v_readfirstlane_b32 s4, v4
1287; GFX9-NEXT:    ; return to shader part epilog
1288;
1289; GFX10-LABEL: s_uaddsat_v5i32:
1290; GFX10:       ; %bb.0:
1291; GFX10-NEXT:    v_add_nc_u32_e64 v0, s0, s5 clamp
1292; GFX10-NEXT:    v_add_nc_u32_e64 v1, s1, s6 clamp
1293; GFX10-NEXT:    v_add_nc_u32_e64 v2, s2, s7 clamp
1294; GFX10-NEXT:    v_add_nc_u32_e64 v3, s3, s8 clamp
1295; GFX10-NEXT:    v_add_nc_u32_e64 v4, s4, s9 clamp
1296; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
1297; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
1298; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
1299; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
1300; GFX10-NEXT:    v_readfirstlane_b32 s4, v4
1301; GFX10-NEXT:    ; return to shader part epilog
1302  %result = call <5 x i32> @llvm.uadd.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs)
1303  ret <5 x i32> %result
1304}
1305
; Unsigned saturating add of <16 x i32> vectors passed with the default
; calling convention (lhs in v0-v15, rhs in v16-v31). Handled element by
; element: the xor/min/add triple per element on GFX6 (v32 holds the first
; complement, v16 is reused as the temporary afterwards), one clamped vector
; add per element on the other targets.
1306define <16 x i32> @v_uaddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
1307; GFX6-LABEL: v_uaddsat_v16i32:
1308; GFX6:       ; %bb.0:
1309; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1310; GFX6-NEXT:    v_xor_b32_e32 v32, -1, v0
1311; GFX6-NEXT:    v_min_u32_e32 v16, v32, v16
1312; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v16
1313; GFX6-NEXT:    v_xor_b32_e32 v16, -1, v1
1314; GFX6-NEXT:    v_min_u32_e32 v16, v16, v17
1315; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v16
1316; GFX6-NEXT:    v_xor_b32_e32 v16, -1, v2
1317; GFX6-NEXT:    v_min_u32_e32 v16, v16, v18
1318; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v16
1319; GFX6-NEXT:    v_xor_b32_e32 v16, -1, v3
1320; GFX6-NEXT:    v_min_u32_e32 v16, v16, v19
1321; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v16
1322; GFX6-NEXT:    v_xor_b32_e32 v16, -1, v4
1323; GFX6-NEXT:    v_min_u32_e32 v16, v16, v20
1324; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v16
1325; GFX6-NEXT:    v_xor_b32_e32 v16, -1, v5
1326; GFX6-NEXT:    v_min_u32_e32 v16, v16, v21
1327; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v16
1328; GFX6-NEXT:    v_xor_b32_e32 v16, -1, v6
1329; GFX6-NEXT:    v_min_u32_e32 v16, v16, v22
1330; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v16
1331; GFX6-NEXT:    v_xor_b32_e32 v16, -1, v7
1332; GFX6-NEXT:    v_min_u32_e32 v16, v16, v23
1333; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v7, v16
1334; GFX6-NEXT:    v_xor_b32_e32 v16, -1, v8
1335; GFX6-NEXT:    v_min_u32_e32 v16, v16, v24
1336; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v8, v16
1337; GFX6-NEXT:    v_xor_b32_e32 v16, -1, v9
1338; GFX6-NEXT:    v_min_u32_e32 v16, v16, v25
1339; GFX6-NEXT:    v_add_i32_e32 v9, vcc, v9, v16
1340; GFX6-NEXT:    v_xor_b32_e32 v16, -1, v10
1341; GFX6-NEXT:    v_min_u32_e32 v16, v16, v26
1342; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v10, v16
1343; GFX6-NEXT:    v_xor_b32_e32 v16, -1, v11
1344; GFX6-NEXT:    v_min_u32_e32 v16, v16, v27
1345; GFX6-NEXT:    v_add_i32_e32 v11, vcc, v11, v16
1346; GFX6-NEXT:    v_xor_b32_e32 v16, -1, v12
1347; GFX6-NEXT:    v_min_u32_e32 v16, v16, v28
1348; GFX6-NEXT:    v_add_i32_e32 v12, vcc, v12, v16
1349; GFX6-NEXT:    v_xor_b32_e32 v16, -1, v13
1350; GFX6-NEXT:    v_min_u32_e32 v16, v16, v29
1351; GFX6-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
1352; GFX6-NEXT:    v_xor_b32_e32 v16, -1, v14
1353; GFX6-NEXT:    v_min_u32_e32 v16, v16, v30
1354; GFX6-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
1355; GFX6-NEXT:    v_xor_b32_e32 v16, -1, v15
1356; GFX6-NEXT:    v_min_u32_e32 v16, v16, v31
1357; GFX6-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
1358; GFX6-NEXT:    s_setpc_b64 s[30:31]
1359;
1360; GFX8-LABEL: v_uaddsat_v16i32:
1361; GFX8:       ; %bb.0:
1362; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1363; GFX8-NEXT:    v_add_u32_e64 v0, s[4:5], v0, v16 clamp
1364; GFX8-NEXT:    v_add_u32_e64 v1, s[4:5], v1, v17 clamp
1365; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], v2, v18 clamp
1366; GFX8-NEXT:    v_add_u32_e64 v3, s[4:5], v3, v19 clamp
1367; GFX8-NEXT:    v_add_u32_e64 v4, s[4:5], v4, v20 clamp
1368; GFX8-NEXT:    v_add_u32_e64 v5, s[4:5], v5, v21 clamp
1369; GFX8-NEXT:    v_add_u32_e64 v6, s[4:5], v6, v22 clamp
1370; GFX8-NEXT:    v_add_u32_e64 v7, s[4:5], v7, v23 clamp
1371; GFX8-NEXT:    v_add_u32_e64 v8, s[4:5], v8, v24 clamp
1372; GFX8-NEXT:    v_add_u32_e64 v9, s[4:5], v9, v25 clamp
1373; GFX8-NEXT:    v_add_u32_e64 v10, s[4:5], v10, v26 clamp
1374; GFX8-NEXT:    v_add_u32_e64 v11, s[4:5], v11, v27 clamp
1375; GFX8-NEXT:    v_add_u32_e64 v12, s[4:5], v12, v28 clamp
1376; GFX8-NEXT:    v_add_u32_e64 v13, s[4:5], v13, v29 clamp
1377; GFX8-NEXT:    v_add_u32_e64 v14, s[4:5], v14, v30 clamp
1378; GFX8-NEXT:    v_add_u32_e64 v15, s[4:5], v15, v31 clamp
1379; GFX8-NEXT:    s_setpc_b64 s[30:31]
1380;
1381; GFX9-LABEL: v_uaddsat_v16i32:
1382; GFX9:       ; %bb.0:
1383; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1384; GFX9-NEXT:    v_add_u32_e64 v0, v0, v16 clamp
1385; GFX9-NEXT:    v_add_u32_e64 v1, v1, v17 clamp
1386; GFX9-NEXT:    v_add_u32_e64 v2, v2, v18 clamp
1387; GFX9-NEXT:    v_add_u32_e64 v3, v3, v19 clamp
1388; GFX9-NEXT:    v_add_u32_e64 v4, v4, v20 clamp
1389; GFX9-NEXT:    v_add_u32_e64 v5, v5, v21 clamp
1390; GFX9-NEXT:    v_add_u32_e64 v6, v6, v22 clamp
1391; GFX9-NEXT:    v_add_u32_e64 v7, v7, v23 clamp
1392; GFX9-NEXT:    v_add_u32_e64 v8, v8, v24 clamp
1393; GFX9-NEXT:    v_add_u32_e64 v9, v9, v25 clamp
1394; GFX9-NEXT:    v_add_u32_e64 v10, v10, v26 clamp
1395; GFX9-NEXT:    v_add_u32_e64 v11, v11, v27 clamp
1396; GFX9-NEXT:    v_add_u32_e64 v12, v12, v28 clamp
1397; GFX9-NEXT:    v_add_u32_e64 v13, v13, v29 clamp
1398; GFX9-NEXT:    v_add_u32_e64 v14, v14, v30 clamp
1399; GFX9-NEXT:    v_add_u32_e64 v15, v15, v31 clamp
1400; GFX9-NEXT:    s_setpc_b64 s[30:31]
1401;
1402; GFX10-LABEL: v_uaddsat_v16i32:
1403; GFX10:       ; %bb.0:
1404; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1405; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1406; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, v16 clamp
1407; GFX10-NEXT:    v_add_nc_u32_e64 v1, v1, v17 clamp
1408; GFX10-NEXT:    v_add_nc_u32_e64 v2, v2, v18 clamp
1409; GFX10-NEXT:    v_add_nc_u32_e64 v3, v3, v19 clamp
1410; GFX10-NEXT:    v_add_nc_u32_e64 v4, v4, v20 clamp
1411; GFX10-NEXT:    v_add_nc_u32_e64 v5, v5, v21 clamp
1412; GFX10-NEXT:    v_add_nc_u32_e64 v6, v6, v22 clamp
1413; GFX10-NEXT:    v_add_nc_u32_e64 v7, v7, v23 clamp
1414; GFX10-NEXT:    v_add_nc_u32_e64 v8, v8, v24 clamp
1415; GFX10-NEXT:    v_add_nc_u32_e64 v9, v9, v25 clamp
1416; GFX10-NEXT:    v_add_nc_u32_e64 v10, v10, v26 clamp
1417; GFX10-NEXT:    v_add_nc_u32_e64 v11, v11, v27 clamp
1418; GFX10-NEXT:    v_add_nc_u32_e64 v12, v12, v28 clamp
1419; GFX10-NEXT:    v_add_nc_u32_e64 v13, v13, v29 clamp
1420; GFX10-NEXT:    v_add_nc_u32_e64 v14, v14, v30 clamp
1421; GFX10-NEXT:    v_add_nc_u32_e64 v15, v15, v31 clamp
1422; GFX10-NEXT:    s_setpc_b64 s[30:31]
1423  %result = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
1424  ret <16 x i32> %result
1425}
1426
; <16 x i32> unsigned saturating add with the amdgpu_ps calling convention and
; both operands marked inreg. Per the checks, lhs arrives in s0-s15, rhs in
; s16-s31, and the result is left in s0-s15.
; The GFX6 checks expect a scalar not / min_u32 / add sequence per element.
; The GFX8 and GFX9 checks expect rhs copied into VGPRs, a clamped VALU add per
; element, and v_readfirstlane_b32 to bring each result back to an SGPR.
; The GFX10 checks expect the clamped add to take both SGPR operands directly,
; followed by the same v_readfirstlane_b32 copies.
1427define amdgpu_ps <16 x i32> @s_uaddsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> inreg %rhs) {
1428; GFX6-LABEL: s_uaddsat_v16i32:
1429; GFX6:       ; %bb.0:
1430; GFX6-NEXT:    s_not_b32 s32, s0
1431; GFX6-NEXT:    s_min_u32 s16, s32, s16
1432; GFX6-NEXT:    s_add_i32 s0, s0, s16
1433; GFX6-NEXT:    s_not_b32 s16, s1
1434; GFX6-NEXT:    s_min_u32 s16, s16, s17
1435; GFX6-NEXT:    s_add_i32 s1, s1, s16
1436; GFX6-NEXT:    s_not_b32 s16, s2
1437; GFX6-NEXT:    s_min_u32 s16, s16, s18
1438; GFX6-NEXT:    s_add_i32 s2, s2, s16
1439; GFX6-NEXT:    s_not_b32 s16, s3
1440; GFX6-NEXT:    s_min_u32 s16, s16, s19
1441; GFX6-NEXT:    s_add_i32 s3, s3, s16
1442; GFX6-NEXT:    s_not_b32 s16, s4
1443; GFX6-NEXT:    s_min_u32 s16, s16, s20
1444; GFX6-NEXT:    s_add_i32 s4, s4, s16
1445; GFX6-NEXT:    s_not_b32 s16, s5
1446; GFX6-NEXT:    s_min_u32 s16, s16, s21
1447; GFX6-NEXT:    s_add_i32 s5, s5, s16
1448; GFX6-NEXT:    s_not_b32 s16, s6
1449; GFX6-NEXT:    s_min_u32 s16, s16, s22
1450; GFX6-NEXT:    s_add_i32 s6, s6, s16
1451; GFX6-NEXT:    s_not_b32 s16, s7
1452; GFX6-NEXT:    s_min_u32 s16, s16, s23
1453; GFX6-NEXT:    s_add_i32 s7, s7, s16
1454; GFX6-NEXT:    s_not_b32 s16, s8
1455; GFX6-NEXT:    s_min_u32 s16, s16, s24
1456; GFX6-NEXT:    s_add_i32 s8, s8, s16
1457; GFX6-NEXT:    s_not_b32 s16, s9
1458; GFX6-NEXT:    s_min_u32 s16, s16, s25
1459; GFX6-NEXT:    s_add_i32 s9, s9, s16
1460; GFX6-NEXT:    s_not_b32 s16, s10
1461; GFX6-NEXT:    s_min_u32 s16, s16, s26
1462; GFX6-NEXT:    s_add_i32 s10, s10, s16
1463; GFX6-NEXT:    s_not_b32 s16, s11
1464; GFX6-NEXT:    s_min_u32 s16, s16, s27
1465; GFX6-NEXT:    s_add_i32 s11, s11, s16
1466; GFX6-NEXT:    s_not_b32 s16, s12
1467; GFX6-NEXT:    s_min_u32 s16, s16, s28
1468; GFX6-NEXT:    s_add_i32 s12, s12, s16
1469; GFX6-NEXT:    s_not_b32 s16, s13
1470; GFX6-NEXT:    s_min_u32 s16, s16, s29
1471; GFX6-NEXT:    s_add_i32 s13, s13, s16
1472; GFX6-NEXT:    s_not_b32 s16, s14
1473; GFX6-NEXT:    s_min_u32 s16, s16, s30
1474; GFX6-NEXT:    s_add_i32 s14, s14, s16
1475; GFX6-NEXT:    s_not_b32 s16, s15
1476; GFX6-NEXT:    s_min_u32 s16, s16, s31
1477; GFX6-NEXT:    s_add_i32 s15, s15, s16
1478; GFX6-NEXT:    ; return to shader part epilog
1479;
1480; GFX8-LABEL: s_uaddsat_v16i32:
1481; GFX8:       ; %bb.0:
1482; GFX8-NEXT:    v_mov_b32_e32 v0, s16
1483; GFX8-NEXT:    v_mov_b32_e32 v1, s17
1484; GFX8-NEXT:    v_mov_b32_e32 v2, s18
1485; GFX8-NEXT:    v_mov_b32_e32 v3, s19
1486; GFX8-NEXT:    v_mov_b32_e32 v4, s20
1487; GFX8-NEXT:    v_mov_b32_e32 v5, s21
1488; GFX8-NEXT:    v_mov_b32_e32 v6, s22
1489; GFX8-NEXT:    v_mov_b32_e32 v7, s23
1490; GFX8-NEXT:    v_mov_b32_e32 v8, s24
1491; GFX8-NEXT:    v_mov_b32_e32 v9, s25
1492; GFX8-NEXT:    v_mov_b32_e32 v10, s26
1493; GFX8-NEXT:    v_mov_b32_e32 v11, s27
1494; GFX8-NEXT:    v_mov_b32_e32 v12, s28
1495; GFX8-NEXT:    v_mov_b32_e32 v13, s29
1496; GFX8-NEXT:    v_mov_b32_e32 v14, s30
1497; GFX8-NEXT:    v_mov_b32_e32 v15, s31
1498; GFX8-NEXT:    v_add_u32_e64 v0, s[32:33], s0, v0 clamp
1499; GFX8-NEXT:    v_add_u32_e64 v1, s[16:17], s1, v1 clamp
1500; GFX8-NEXT:    v_add_u32_e64 v2, s[16:17], s2, v2 clamp
1501; GFX8-NEXT:    v_add_u32_e64 v3, s[2:3], s3, v3 clamp
1502; GFX8-NEXT:    v_add_u32_e64 v4, s[2:3], s4, v4 clamp
1503; GFX8-NEXT:    v_add_u32_e64 v5, s[2:3], s5, v5 clamp
1504; GFX8-NEXT:    v_add_u32_e64 v6, s[2:3], s6, v6 clamp
1505; GFX8-NEXT:    v_add_u32_e64 v7, s[2:3], s7, v7 clamp
1506; GFX8-NEXT:    v_add_u32_e64 v8, s[2:3], s8, v8 clamp
1507; GFX8-NEXT:    v_add_u32_e64 v9, s[2:3], s9, v9 clamp
1508; GFX8-NEXT:    v_add_u32_e64 v10, s[2:3], s10, v10 clamp
1509; GFX8-NEXT:    v_add_u32_e64 v11, s[2:3], s11, v11 clamp
1510; GFX8-NEXT:    v_add_u32_e64 v12, s[2:3], s12, v12 clamp
1511; GFX8-NEXT:    v_add_u32_e64 v13, s[2:3], s13, v13 clamp
1512; GFX8-NEXT:    v_add_u32_e64 v14, s[2:3], s14, v14 clamp
1513; GFX8-NEXT:    v_add_u32_e64 v15, s[2:3], s15, v15 clamp
1514; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
1515; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
1516; GFX8-NEXT:    v_readfirstlane_b32 s2, v2
1517; GFX8-NEXT:    v_readfirstlane_b32 s3, v3
1518; GFX8-NEXT:    v_readfirstlane_b32 s4, v4
1519; GFX8-NEXT:    v_readfirstlane_b32 s5, v5
1520; GFX8-NEXT:    v_readfirstlane_b32 s6, v6
1521; GFX8-NEXT:    v_readfirstlane_b32 s7, v7
1522; GFX8-NEXT:    v_readfirstlane_b32 s8, v8
1523; GFX8-NEXT:    v_readfirstlane_b32 s9, v9
1524; GFX8-NEXT:    v_readfirstlane_b32 s10, v10
1525; GFX8-NEXT:    v_readfirstlane_b32 s11, v11
1526; GFX8-NEXT:    v_readfirstlane_b32 s12, v12
1527; GFX8-NEXT:    v_readfirstlane_b32 s13, v13
1528; GFX8-NEXT:    v_readfirstlane_b32 s14, v14
1529; GFX8-NEXT:    v_readfirstlane_b32 s15, v15
1530; GFX8-NEXT:    ; return to shader part epilog
1531;
1532; GFX9-LABEL: s_uaddsat_v16i32:
1533; GFX9:       ; %bb.0:
1534; GFX9-NEXT:    v_mov_b32_e32 v0, s16
1535; GFX9-NEXT:    v_mov_b32_e32 v1, s17
1536; GFX9-NEXT:    v_mov_b32_e32 v2, s18
1537; GFX9-NEXT:    v_mov_b32_e32 v3, s19
1538; GFX9-NEXT:    v_mov_b32_e32 v4, s20
1539; GFX9-NEXT:    v_mov_b32_e32 v5, s21
1540; GFX9-NEXT:    v_mov_b32_e32 v6, s22
1541; GFX9-NEXT:    v_mov_b32_e32 v7, s23
1542; GFX9-NEXT:    v_mov_b32_e32 v8, s24
1543; GFX9-NEXT:    v_mov_b32_e32 v9, s25
1544; GFX9-NEXT:    v_mov_b32_e32 v10, s26
1545; GFX9-NEXT:    v_mov_b32_e32 v11, s27
1546; GFX9-NEXT:    v_mov_b32_e32 v12, s28
1547; GFX9-NEXT:    v_mov_b32_e32 v13, s29
1548; GFX9-NEXT:    v_mov_b32_e32 v14, s30
1549; GFX9-NEXT:    v_mov_b32_e32 v15, s31
1550; GFX9-NEXT:    v_add_u32_e64 v0, s0, v0 clamp
1551; GFX9-NEXT:    v_add_u32_e64 v1, s1, v1 clamp
1552; GFX9-NEXT:    v_add_u32_e64 v2, s2, v2 clamp
1553; GFX9-NEXT:    v_add_u32_e64 v3, s3, v3 clamp
1554; GFX9-NEXT:    v_add_u32_e64 v4, s4, v4 clamp
1555; GFX9-NEXT:    v_add_u32_e64 v5, s5, v5 clamp
1556; GFX9-NEXT:    v_add_u32_e64 v6, s6, v6 clamp
1557; GFX9-NEXT:    v_add_u32_e64 v7, s7, v7 clamp
1558; GFX9-NEXT:    v_add_u32_e64 v8, s8, v8 clamp
1559; GFX9-NEXT:    v_add_u32_e64 v9, s9, v9 clamp
1560; GFX9-NEXT:    v_add_u32_e64 v10, s10, v10 clamp
1561; GFX9-NEXT:    v_add_u32_e64 v11, s11, v11 clamp
1562; GFX9-NEXT:    v_add_u32_e64 v12, s12, v12 clamp
1563; GFX9-NEXT:    v_add_u32_e64 v13, s13, v13 clamp
1564; GFX9-NEXT:    v_add_u32_e64 v14, s14, v14 clamp
1565; GFX9-NEXT:    v_add_u32_e64 v15, s15, v15 clamp
1566; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1567; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
1568; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
1569; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
1570; GFX9-NEXT:    v_readfirstlane_b32 s4, v4
1571; GFX9-NEXT:    v_readfirstlane_b32 s5, v5
1572; GFX9-NEXT:    v_readfirstlane_b32 s6, v6
1573; GFX9-NEXT:    v_readfirstlane_b32 s7, v7
1574; GFX9-NEXT:    v_readfirstlane_b32 s8, v8
1575; GFX9-NEXT:    v_readfirstlane_b32 s9, v9
1576; GFX9-NEXT:    v_readfirstlane_b32 s10, v10
1577; GFX9-NEXT:    v_readfirstlane_b32 s11, v11
1578; GFX9-NEXT:    v_readfirstlane_b32 s12, v12
1579; GFX9-NEXT:    v_readfirstlane_b32 s13, v13
1580; GFX9-NEXT:    v_readfirstlane_b32 s14, v14
1581; GFX9-NEXT:    v_readfirstlane_b32 s15, v15
1582; GFX9-NEXT:    ; return to shader part epilog
1583;
1584; GFX10-LABEL: s_uaddsat_v16i32:
1585; GFX10:       ; %bb.0:
1586; GFX10-NEXT:    v_add_nc_u32_e64 v0, s0, s16 clamp
1587; GFX10-NEXT:    v_add_nc_u32_e64 v1, s1, s17 clamp
1588; GFX10-NEXT:    v_add_nc_u32_e64 v2, s2, s18 clamp
1589; GFX10-NEXT:    v_add_nc_u32_e64 v3, s3, s19 clamp
1590; GFX10-NEXT:    v_add_nc_u32_e64 v4, s4, s20 clamp
1591; GFX10-NEXT:    v_add_nc_u32_e64 v5, s5, s21 clamp
1592; GFX10-NEXT:    v_add_nc_u32_e64 v6, s6, s22 clamp
1593; GFX10-NEXT:    v_add_nc_u32_e64 v7, s7, s23 clamp
1594; GFX10-NEXT:    v_add_nc_u32_e64 v8, s8, s24 clamp
1595; GFX10-NEXT:    v_add_nc_u32_e64 v9, s9, s25 clamp
1596; GFX10-NEXT:    v_add_nc_u32_e64 v10, s10, s26 clamp
1597; GFX10-NEXT:    v_add_nc_u32_e64 v11, s11, s27 clamp
1598; GFX10-NEXT:    v_add_nc_u32_e64 v12, s12, s28 clamp
1599; GFX10-NEXT:    v_add_nc_u32_e64 v13, s13, s29 clamp
1600; GFX10-NEXT:    v_add_nc_u32_e64 v14, s14, s30 clamp
1601; GFX10-NEXT:    v_add_nc_u32_e64 v15, s15, s31 clamp
1602; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
1603; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
1604; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
1605; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
1606; GFX10-NEXT:    v_readfirstlane_b32 s4, v4
1607; GFX10-NEXT:    v_readfirstlane_b32 s5, v5
1608; GFX10-NEXT:    v_readfirstlane_b32 s6, v6
1609; GFX10-NEXT:    v_readfirstlane_b32 s7, v7
1610; GFX10-NEXT:    v_readfirstlane_b32 s8, v8
1611; GFX10-NEXT:    v_readfirstlane_b32 s9, v9
1612; GFX10-NEXT:    v_readfirstlane_b32 s10, v10
1613; GFX10-NEXT:    v_readfirstlane_b32 s11, v11
1614; GFX10-NEXT:    v_readfirstlane_b32 s12, v12
1615; GFX10-NEXT:    v_readfirstlane_b32 s13, v13
1616; GFX10-NEXT:    v_readfirstlane_b32 s14, v14
1617; GFX10-NEXT:    v_readfirstlane_b32 s15, v15
1618; GFX10-NEXT:    ; return to shader part epilog
1619  %result = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs)
1620  ret <16 x i32> %result
1621}
1622
; i16 unsigned saturating add with the default calling convention; per the
; checks the operands arrive in v0 (lhs) and v1 (rhs) and the result is in v0.
; The GFX6 checks expect both operands shifted left by 16, a 32-bit
; xor -1 / min_u32 / add sequence, and the sum shifted back right by 16.
; The GFX8, GFX9 and GFX10 checks expect a single 16-bit add with the clamp
; modifier.
1623define i16 @v_uaddsat_i16(i16 %lhs, i16 %rhs) {
1624; GFX6-LABEL: v_uaddsat_i16:
1625; GFX6:       ; %bb.0:
1626; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1627; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1628; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1629; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v0
1630; GFX6-NEXT:    v_min_u32_e32 v1, v2, v1
1631; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
1632; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1633; GFX6-NEXT:    s_setpc_b64 s[30:31]
1634;
1635; GFX8-LABEL: v_uaddsat_i16:
1636; GFX8:       ; %bb.0:
1637; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1638; GFX8-NEXT:    v_add_u16_e64 v0, v0, v1 clamp
1639; GFX8-NEXT:    s_setpc_b64 s[30:31]
1640;
1641; GFX9-LABEL: v_uaddsat_i16:
1642; GFX9:       ; %bb.0:
1643; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1644; GFX9-NEXT:    v_add_u16_e64 v0, v0, v1 clamp
1645; GFX9-NEXT:    s_setpc_b64 s[30:31]
1646;
1647; GFX10-LABEL: v_uaddsat_i16:
1648; GFX10:       ; %bb.0:
1649; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1650; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1651; GFX10-NEXT:    v_add_nc_u16 v0, v0, v1 clamp
1652; GFX10-NEXT:    s_setpc_b64 s[30:31]
1653  %result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs)
1654  ret i16 %result
1655}
1656
; i16 unsigned saturating add with the amdgpu_ps calling convention and both
; operands marked inreg; per the checks they arrive in s0 (lhs) and s1 (rhs)
; and the result is left in s0.
; The GFX6 checks expect an all-scalar shift-left-16 / not / min_u32 / add /
; shift-right-16 sequence. The GFX8 and GFX9 checks expect rhs copied to v0, a
; clamped v_add_u16, and v_readfirstlane_b32 back to s0. The GFX10 checks
; expect the clamped add to read s0 and s1 directly before the readfirstlane.
1657define amdgpu_ps i16 @s_uaddsat_i16(i16 inreg %lhs, i16 inreg %rhs) {
1658; GFX6-LABEL: s_uaddsat_i16:
1659; GFX6:       ; %bb.0:
1660; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
1661; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
1662; GFX6-NEXT:    s_not_b32 s2, s0
1663; GFX6-NEXT:    s_min_u32 s1, s2, s1
1664; GFX6-NEXT:    s_add_i32 s0, s0, s1
1665; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
1666; GFX6-NEXT:    ; return to shader part epilog
1667;
1668; GFX8-LABEL: s_uaddsat_i16:
1669; GFX8:       ; %bb.0:
1670; GFX8-NEXT:    v_mov_b32_e32 v0, s1
1671; GFX8-NEXT:    v_add_u16_e64 v0, s0, v0 clamp
1672; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
1673; GFX8-NEXT:    ; return to shader part epilog
1674;
1675; GFX9-LABEL: s_uaddsat_i16:
1676; GFX9:       ; %bb.0:
1677; GFX9-NEXT:    v_mov_b32_e32 v0, s1
1678; GFX9-NEXT:    v_add_u16_e64 v0, s0, v0 clamp
1679; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1680; GFX9-NEXT:    ; return to shader part epilog
1681;
1682; GFX10-LABEL: s_uaddsat_i16:
1683; GFX10:       ; %bb.0:
1684; GFX10-NEXT:    v_add_nc_u16 v0, s0, s1 clamp
1685; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
1686; GFX10-NEXT:    ; return to shader part epilog
1687  %result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs)
1688  ret i16 %result
1689}
1690
; Mixed-operand i16 unsigned saturating add (amdgpu_ps): lhs is inreg and
; arrives in s0 per the checks, rhs arrives in v0. The i16 result is bitcast to
; half for the return and is left in v0.
; The GFX6 checks expect the shift / not / min_u32 / add / shift expansion with
; the lhs half done on the scalar unit. The GFX8, GFX9 and GFX10 checks expect
; one clamped 16-bit add with s0 as the first source.
1691define amdgpu_ps half @uaddsat_i16_sv(i16 inreg %lhs, i16 %rhs) {
1692; GFX6-LABEL: uaddsat_i16_sv:
1693; GFX6:       ; %bb.0:
1694; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
1695; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1696; GFX6-NEXT:    s_not_b32 s1, s0
1697; GFX6-NEXT:    v_min_u32_e32 v0, s1, v0
1698; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
1699; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1700; GFX6-NEXT:    ; return to shader part epilog
1701;
1702; GFX8-LABEL: uaddsat_i16_sv:
1703; GFX8:       ; %bb.0:
1704; GFX8-NEXT:    v_add_u16_e64 v0, s0, v0 clamp
1705; GFX8-NEXT:    ; return to shader part epilog
1706;
1707; GFX9-LABEL: uaddsat_i16_sv:
1708; GFX9:       ; %bb.0:
1709; GFX9-NEXT:    v_add_u16_e64 v0, s0, v0 clamp
1710; GFX9-NEXT:    ; return to shader part epilog
1711;
1712; GFX10-LABEL: uaddsat_i16_sv:
1713; GFX10:       ; %bb.0:
1714; GFX10-NEXT:    v_add_nc_u16 v0, s0, v0 clamp
1715; GFX10-NEXT:    ; return to shader part epilog
1716  %result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs)
1717  %cast = bitcast i16 %result to half
1718  ret half %cast
1719}
1720
; Mixed-operand i16 unsigned saturating add (amdgpu_ps), mirror of the _sv
; case: lhs arrives in v0 per the checks and rhs is inreg, arriving in s0. The
; i16 result is bitcast to half for the return and is left in v0.
; The GFX6 checks expect the shift / xor -1 / min_u32 / add / shift expansion
; with only the rhs shift on the scalar unit. The GFX8, GFX9 and GFX10 checks
; expect one clamped 16-bit add with s0 as the second source.
1721define amdgpu_ps half @uaddsat_i16_vs(i16 %lhs, i16 inreg %rhs) {
1722; GFX6-LABEL: uaddsat_i16_vs:
1723; GFX6:       ; %bb.0:
1724; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1725; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
1726; GFX6-NEXT:    v_xor_b32_e32 v1, -1, v0
1727; GFX6-NEXT:    v_min_u32_e32 v1, s0, v1
1728; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
1729; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1730; GFX6-NEXT:    ; return to shader part epilog
1731;
1732; GFX8-LABEL: uaddsat_i16_vs:
1733; GFX8:       ; %bb.0:
1734; GFX8-NEXT:    v_add_u16_e64 v0, v0, s0 clamp
1735; GFX8-NEXT:    ; return to shader part epilog
1736;
1737; GFX9-LABEL: uaddsat_i16_vs:
1738; GFX9:       ; %bb.0:
1739; GFX9-NEXT:    v_add_u16_e64 v0, v0, s0 clamp
1740; GFX9-NEXT:    ; return to shader part epilog
1741;
1742; GFX10-LABEL: uaddsat_i16_vs:
1743; GFX10:       ; %bb.0:
1744; GFX10-NEXT:    v_add_nc_u16 v0, v0, s0 clamp
1745; GFX10-NEXT:    ; return to shader part epilog
1746  %result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs)
1747  %cast = bitcast i16 %result to half
1748  ret half %cast
1749}
1750
; <2 x i16> unsigned saturating add with the default calling convention.
; The GFX6 checks show each element in its own 32-bit VGPR (lhs in v0/v1, rhs
; in v2/v3) and expect the shift / xor -1 / min_u32 / add / shift expansion per
; element. The GFX8 checks show the vector packed in one VGPR per operand and
; expect a clamped v_add_u16 for the low half, an SDWA clamped add selecting
; WORD_1 for the high half, and a shift/or to repack. The GFX9 and GFX10 checks
; expect a single v_pk_add_u16 with clamp.
1751define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
1752; GFX6-LABEL: v_uaddsat_v2i16:
1753; GFX6:       ; %bb.0:
1754; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1755; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1756; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
1757; GFX6-NEXT:    v_xor_b32_e32 v4, -1, v0
1758; GFX6-NEXT:    v_min_u32_e32 v2, v4, v2
1759; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1760; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
1761; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
1762; GFX6-NEXT:    v_xor_b32_e32 v3, -1, v1
1763; GFX6-NEXT:    v_min_u32_e32 v2, v3, v2
1764; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
1765; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1766; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1767; GFX6-NEXT:    s_setpc_b64 s[30:31]
1768;
1769; GFX8-LABEL: v_uaddsat_v2i16:
1770; GFX8:       ; %bb.0:
1771; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1772; GFX8-NEXT:    v_add_u16_e64 v2, v0, v1 clamp
1773; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1774; GFX8-NEXT:    v_mov_b32_e32 v1, 16
1775; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1776; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1777; GFX8-NEXT:    s_setpc_b64 s[30:31]
1778;
1779; GFX9-LABEL: v_uaddsat_v2i16:
1780; GFX9:       ; %bb.0:
1781; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1782; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1 clamp
1783; GFX9-NEXT:    s_setpc_b64 s[30:31]
1784;
1785; GFX10-LABEL: v_uaddsat_v2i16:
1786; GFX10:       ; %bb.0:
1787; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1788; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1789; GFX10-NEXT:    v_pk_add_u16 v0, v0, v1 clamp
1790; GFX10-NEXT:    s_setpc_b64 s[30:31]
1791  %result = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
1792  ret <2 x i16> %result
1793}
1794
; <2 x i16> unsigned saturating add with the amdgpu_ps calling convention and
; both operands marked inreg; the result is bitcast to i32 and left in s0.
; The GFX6 checks show one SGPR per element (lhs in s0/s1, rhs in s2/s3) and
; expect the scalar expansion per element followed by a shift/or repack.
; The GFX8 checks show packed operands in s0 and s1, and expect the high halves
; extracted with s_lshr_b32, two clamped v_add_u16, an SDWA shift/or repack,
; and v_readfirstlane_b32. The GFX9 checks expect rhs copied to a VGPR and one
; clamped v_pk_add_u16; the GFX10 checks expect v_pk_add_u16 to read s0 and s1
; directly.
1795define amdgpu_ps i32 @s_uaddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) {
1796; GFX6-LABEL: s_uaddsat_v2i16:
1797; GFX6:       ; %bb.0:
1798; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
1799; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
1800; GFX6-NEXT:    s_not_b32 s4, s0
1801; GFX6-NEXT:    s_min_u32 s2, s4, s2
1802; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
1803; GFX6-NEXT:    s_add_i32 s0, s0, s2
1804; GFX6-NEXT:    s_lshl_b32 s2, s3, 16
1805; GFX6-NEXT:    s_not_b32 s3, s1
1806; GFX6-NEXT:    s_min_u32 s2, s3, s2
1807; GFX6-NEXT:    s_add_i32 s1, s1, s2
1808; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
1809; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
1810; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
1811; GFX6-NEXT:    s_or_b32 s0, s0, s1
1812; GFX6-NEXT:    ; return to shader part epilog
1813;
1814; GFX8-LABEL: s_uaddsat_v2i16:
1815; GFX8:       ; %bb.0:
1816; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
1817; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
1818; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1819; GFX8-NEXT:    v_mov_b32_e32 v0, s1
1820; GFX8-NEXT:    v_add_u16_e64 v1, s2, v1 clamp
1821; GFX8-NEXT:    v_mov_b32_e32 v2, 16
1822; GFX8-NEXT:    v_add_u16_e64 v0, s0, v0 clamp
1823; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1824; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1825; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
1826; GFX8-NEXT:    ; return to shader part epilog
1827;
1828; GFX9-LABEL: s_uaddsat_v2i16:
1829; GFX9:       ; %bb.0:
1830; GFX9-NEXT:    v_mov_b32_e32 v0, s1
1831; GFX9-NEXT:    v_pk_add_u16 v0, s0, v0 clamp
1832; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1833; GFX9-NEXT:    ; return to shader part epilog
1834;
1835; GFX10-LABEL: s_uaddsat_v2i16:
1836; GFX10:       ; %bb.0:
1837; GFX10-NEXT:    v_pk_add_u16 v0, s0, s1 clamp
1838; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
1839; GFX10-NEXT:    ; return to shader part epilog
1840  %result = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
1841  %cast = bitcast <2 x i16> %result to i32
1842  ret i32 %cast
1843}
1844
; Mixed-operand <2 x i16> unsigned saturating add (amdgpu_ps): lhs is inreg
; (SGPR) and rhs is in VGPRs; the result is bitcast to float and left in v0.
; The GFX6 checks show one register per element (lhs in s0/s1, rhs in v0/v1)
; and expect the per-element expansion followed by a shift/or repack.
; The GFX8 checks expect the lhs high half extracted with s_lshr_b32 and copied
; to a VGPR, one plain and one SDWA clamped v_add_u16, and a shift/or repack.
; The GFX9 and GFX10 checks expect one clamped v_pk_add_u16 with s0 as the
; first source.
1845define amdgpu_ps float @uaddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
1846; GFX6-LABEL: uaddsat_v2i16_sv:
1847; GFX6:       ; %bb.0:
1848; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
1849; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1850; GFX6-NEXT:    s_not_b32 s2, s0
1851; GFX6-NEXT:    v_min_u32_e32 v0, s2, v0
1852; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
1853; GFX6-NEXT:    s_lshl_b32 s0, s1, 16
1854; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1855; GFX6-NEXT:    s_not_b32 s1, s0
1856; GFX6-NEXT:    v_min_u32_e32 v1, s1, v1
1857; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s0, v1
1858; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1859; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1860; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1861; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
1862; GFX6-NEXT:    ; return to shader part epilog
1863;
1864; GFX8-LABEL: uaddsat_v2i16_sv:
1865; GFX8:       ; %bb.0:
1866; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
1867; GFX8-NEXT:    v_mov_b32_e32 v2, s1
1868; GFX8-NEXT:    v_add_u16_e64 v1, s0, v0 clamp
1869; GFX8-NEXT:    v_add_u16_sdwa v0, v2, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1870; GFX8-NEXT:    v_mov_b32_e32 v2, 16
1871; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1872; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1873; GFX8-NEXT:    ; return to shader part epilog
1874;
1875; GFX9-LABEL: uaddsat_v2i16_sv:
1876; GFX9:       ; %bb.0:
1877; GFX9-NEXT:    v_pk_add_u16 v0, s0, v0 clamp
1878; GFX9-NEXT:    ; return to shader part epilog
1879;
1880; GFX10-LABEL: uaddsat_v2i16_sv:
1881; GFX10:       ; %bb.0:
1882; GFX10-NEXT:    v_pk_add_u16 v0, s0, v0 clamp
1883; GFX10-NEXT:    ; return to shader part epilog
1884  %result = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
1885  %cast = bitcast <2 x i16> %result to float
1886  ret float %cast
1887}
1888
; Mixed-operand <2 x i16> unsigned saturating add (amdgpu_ps), mirror of the
; _sv case: lhs is in VGPRs and rhs is inreg (SGPR); the result is bitcast to
; float and left in v0.
; The GFX6 checks show one register per element (lhs in v0/v1, rhs in s0/s1)
; and expect the per-element expansion followed by a shift/or repack.
; The GFX8 checks expect the rhs high half extracted with s_lshr_b32 and copied
; to a VGPR, one plain and one SDWA clamped v_add_u16, and a shift/or repack.
; The GFX9 and GFX10 checks expect one clamped v_pk_add_u16 with s0 as the
; second source.
1889define amdgpu_ps float @uaddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
1890; GFX6-LABEL: uaddsat_v2i16_vs:
1891; GFX6:       ; %bb.0:
1892; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1893; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
1894; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v0
1895; GFX6-NEXT:    v_min_u32_e32 v2, s0, v2
1896; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1897; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
1898; GFX6-NEXT:    s_lshl_b32 s0, s1, 16
1899; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v1
1900; GFX6-NEXT:    v_min_u32_e32 v2, s0, v2
1901; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
1902; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1903; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1904; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1905; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
1906; GFX6-NEXT:    ; return to shader part epilog
1907;
1908; GFX8-LABEL: uaddsat_v2i16_vs:
1909; GFX8:       ; %bb.0:
1910; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
1911; GFX8-NEXT:    v_mov_b32_e32 v2, s1
1912; GFX8-NEXT:    v_add_u16_e64 v1, v0, s0 clamp
1913; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1914; GFX8-NEXT:    v_mov_b32_e32 v2, 16
1915; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1916; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1917; GFX8-NEXT:    ; return to shader part epilog
1918;
1919; GFX9-LABEL: uaddsat_v2i16_vs:
1920; GFX9:       ; %bb.0:
1921; GFX9-NEXT:    v_pk_add_u16 v0, v0, s0 clamp
1922; GFX9-NEXT:    ; return to shader part epilog
1923;
1924; GFX10-LABEL: uaddsat_v2i16_vs:
1925; GFX10:       ; %bb.0:
1926; GFX10-NEXT:    v_pk_add_u16 v0, v0, s0 clamp
1927; GFX10-NEXT:    ; return to shader part epilog
1928  %result = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs)
1929  %cast = bitcast <2 x i16> %result to float
1930  ret float %cast
1931}
1932
1933; FIXME: v3i16 insert/extract; the <3 x i16> tests below stay commented out until this is addressed.
1934; define <3 x i16> @v_uaddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
1935;   %result = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
1936;   ret <3 x i16> %result
1937; }
1938
1939; define amdgpu_ps <3 x i16> @s_uaddsat_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs) {
1940;   %result = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs)
1941;   ret <3 x i16> %result
1942; }
1943
; <4 x i16> unsigned saturating add with the default calling convention; the
; result is bitcast to <2 x float> and returned in v0/v1.
; The GFX6 checks show one VGPR per element (lhs in v0-v3, rhs in v4-v7) and
; expect the shift / xor -1 / min_u32 / add / shift expansion per element,
; followed by shift/or repacking into v0 and v1.
; The GFX8 checks show packed operands (lhs in v0/v1, rhs in v2/v3) and expect
; four clamped 16-bit adds (the high halves via SDWA WORD_1 selects) plus
; SDWA shift/or repacking. The GFX9 and GFX10 checks expect two clamped
; v_pk_add_u16 instructions.
1944define <2 x float> @v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
1945; GFX6-LABEL: v_uaddsat_v4i16:
1946; GFX6:       ; %bb.0:
1947; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1948; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1949; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
1950; GFX6-NEXT:    v_xor_b32_e32 v8, -1, v0
1951; GFX6-NEXT:    v_min_u32_e32 v4, v8, v4
1952; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1953; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
1954; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
1955; GFX6-NEXT:    v_xor_b32_e32 v5, -1, v1
1956; GFX6-NEXT:    v_min_u32_e32 v4, v5, v4
1957; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
1958; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
1959; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
1960; GFX6-NEXT:    v_xor_b32_e32 v5, -1, v2
1961; GFX6-NEXT:    v_min_u32_e32 v4, v5, v4
1962; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1963; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
1964; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
1965; GFX6-NEXT:    v_xor_b32_e32 v5, -1, v3
1966; GFX6-NEXT:    v_min_u32_e32 v4, v5, v4
1967; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1968; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
1969; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1970; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
1971; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1972; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1973; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
1974; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
1975; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
1976; GFX6-NEXT:    s_setpc_b64 s[30:31]
1977;
1978; GFX8-LABEL: v_uaddsat_v4i16:
1979; GFX8:       ; %bb.0:
1980; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1981; GFX8-NEXT:    v_add_u16_e64 v4, v0, v2 clamp
1982; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1983; GFX8-NEXT:    v_add_u16_e64 v2, v1, v3 clamp
1984; GFX8-NEXT:    v_add_u16_sdwa v1, v1, v3 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1985; GFX8-NEXT:    v_mov_b32_e32 v3, 16
1986; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1987; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1988; GFX8-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1989; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1990; GFX8-NEXT:    s_setpc_b64 s[30:31]
1991;
1992; GFX9-LABEL: v_uaddsat_v4i16:
1993; GFX9:       ; %bb.0:
1994; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1995; GFX9-NEXT:    v_pk_add_u16 v0, v0, v2 clamp
1996; GFX9-NEXT:    v_pk_add_u16 v1, v1, v3 clamp
1997; GFX9-NEXT:    s_setpc_b64 s[30:31]
1998;
1999; GFX10-LABEL: v_uaddsat_v4i16:
2000; GFX10:       ; %bb.0:
2001; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2002; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2003; GFX10-NEXT:    v_pk_add_u16 v0, v0, v2 clamp
2004; GFX10-NEXT:    v_pk_add_u16 v1, v1, v3 clamp
2005; GFX10-NEXT:    s_setpc_b64 s[30:31]
2006  %result = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
2007  %cast = bitcast <4 x i16> %result to <2 x float>
2008  ret <2 x float> %cast
2009}
2010
; <4 x i16> unsigned saturating add with the amdgpu_ps calling convention and
; both operands marked inreg; the result is bitcast to <2 x i32> and left in
; s0/s1.
; The GFX6 checks show one SGPR per element (lhs in s0-s3, rhs in s4-s7) and
; expect the scalar shift / not / min_u32 / add / shift expansion per element,
; followed by shift/or repacking.
; The GFX8 checks show packed operands (lhs in s0/s1, rhs in s2/s3) and expect
; the high halves extracted with s_lshr_b32, four clamped v_add_u16, SDWA
; shift/or repacking, and v_readfirstlane_b32 back to SGPRs. The GFX9 checks
; expect rhs copied to VGPRs and two clamped v_pk_add_u16; the GFX10 checks
; expect v_pk_add_u16 to read both SGPR operands directly.
2011define amdgpu_ps <2 x i32> @s_uaddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs) {
2012; GFX6-LABEL: s_uaddsat_v4i16:
2013; GFX6:       ; %bb.0:
2014; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
2015; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
2016; GFX6-NEXT:    s_not_b32 s8, s0
2017; GFX6-NEXT:    s_min_u32 s4, s8, s4
2018; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
2019; GFX6-NEXT:    s_add_i32 s0, s0, s4
2020; GFX6-NEXT:    s_lshl_b32 s4, s5, 16
2021; GFX6-NEXT:    s_not_b32 s5, s1
2022; GFX6-NEXT:    s_min_u32 s4, s5, s4
2023; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
2024; GFX6-NEXT:    s_add_i32 s1, s1, s4
2025; GFX6-NEXT:    s_lshl_b32 s4, s6, 16
2026; GFX6-NEXT:    s_not_b32 s5, s2
2027; GFX6-NEXT:    s_min_u32 s4, s5, s4
2028; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
2029; GFX6-NEXT:    s_add_i32 s2, s2, s4
2030; GFX6-NEXT:    s_lshl_b32 s4, s7, 16
2031; GFX6-NEXT:    s_not_b32 s5, s3
2032; GFX6-NEXT:    s_min_u32 s4, s5, s4
2033; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
2034; GFX6-NEXT:    s_add_i32 s3, s3, s4
2035; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
2036; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
2037; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
2038; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
2039; GFX6-NEXT:    s_or_b32 s0, s0, s1
2040; GFX6-NEXT:    s_lshl_b32 s1, s3, 16
2041; GFX6-NEXT:    s_or_b32 s1, s2, s1
2042; GFX6-NEXT:    ; return to shader part epilog
2043;
2044; GFX8-LABEL: s_uaddsat_v4i16:
2045; GFX8:       ; %bb.0:
2046; GFX8-NEXT:    s_lshr_b32 s6, s2, 16
2047; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
2048; GFX8-NEXT:    s_lshr_b32 s7, s3, 16
2049; GFX8-NEXT:    v_mov_b32_e32 v1, s6
2050; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
2051; GFX8-NEXT:    v_mov_b32_e32 v0, s2
2052; GFX8-NEXT:    v_add_u16_e64 v1, s4, v1 clamp
2053; GFX8-NEXT:    v_mov_b32_e32 v3, s7
2054; GFX8-NEXT:    v_mov_b32_e32 v4, 16
2055; GFX8-NEXT:    v_add_u16_e64 v0, s0, v0 clamp
2056; GFX8-NEXT:    v_mov_b32_e32 v2, s3
2057; GFX8-NEXT:    v_add_u16_e64 v3, s5, v3 clamp
2058; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2059; GFX8-NEXT:    v_add_u16_e64 v2, s1, v2 clamp
2060; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2061; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2062; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2063; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
2064; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
2065; GFX8-NEXT:    ; return to shader part epilog
2066;
2067; GFX9-LABEL: s_uaddsat_v4i16:
2068; GFX9:       ; %bb.0:
2069; GFX9-NEXT:    v_mov_b32_e32 v0, s2
2070; GFX9-NEXT:    v_mov_b32_e32 v1, s3
2071; GFX9-NEXT:    v_pk_add_u16 v0, s0, v0 clamp
2072; GFX9-NEXT:    v_pk_add_u16 v1, s1, v1 clamp
2073; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
2074; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
2075; GFX9-NEXT:    ; return to shader part epilog
2076;
2077; GFX10-LABEL: s_uaddsat_v4i16:
2078; GFX10:       ; %bb.0:
2079; GFX10-NEXT:    v_pk_add_u16 v0, s0, s2 clamp
2080; GFX10-NEXT:    v_pk_add_u16 v1, s1, s3 clamp
2081; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
2082; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
2083; GFX10-NEXT:    ; return to shader part epilog
2084  %result = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
2085  %cast = bitcast <4 x i16> %result to <2 x i32>
2086  ret <2 x i32> %cast
2087}
2088
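2089; FIXME: the <5 x i16> tests below are commented out; re-enable them once v5i16 is handled (reason not recorded here).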
2090; define <5 x i16> @v_uaddsat_v5i16(<5 x i16> %lhs, <5 x i16> %rhs) {
2091;   %result = call <5 x i16> @llvm.uadd.sat.v5i16(<5 x i16> %lhs, <5 x i16> %rhs)
2092;   ret <5 x i16> %result
2093; }
2094
2095; define amdgpu_ps <5 x i16> @s_uaddsat_v5i16(<5 x i16> inreg %lhs, <5 x i16> inreg %rhs) {
2096;   %result = call <5 x i16> @llvm.uadd.sat.v5i16(<5 x i16> %lhs, <5 x i16> %rhs)
2097;   ret <5 x i16> %result
2098; }
2099
; <6 x i16> unsigned saturating add with the default calling convention; the
; result is bitcast to <3 x float> and returned in v0-v2.
; The GFX6 checks show one VGPR per element (lhs in v0-v5, rhs in v6-v11) and
; expect the shift / xor -1 / min_u32 / add / shift expansion per element,
; followed by shift/or repacking into v0-v2.
; The GFX8 checks show packed operands (lhs in v0-v2, rhs in v3-v5) and expect
; six clamped 16-bit adds (the high halves via SDWA WORD_1 selects) plus SDWA
; shift/or repacking. The GFX9 and GFX10 checks expect three clamped
; v_pk_add_u16 instructions.
2100define <3 x float> @v_uaddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
2101; GFX6-LABEL: v_uaddsat_v6i16:
2102; GFX6:       ; %bb.0:
2103; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2104; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2105; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
2106; GFX6-NEXT:    v_xor_b32_e32 v12, -1, v0
2107; GFX6-NEXT:    v_min_u32_e32 v6, v12, v6
2108; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2109; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
2110; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
2111; GFX6-NEXT:    v_xor_b32_e32 v7, -1, v1
2112; GFX6-NEXT:    v_min_u32_e32 v6, v7, v6
2113; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2114; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v6
2115; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v8
2116; GFX6-NEXT:    v_xor_b32_e32 v7, -1, v2
2117; GFX6-NEXT:    v_min_u32_e32 v6, v7, v6
2118; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2119; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
2120; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v9
2121; GFX6-NEXT:    v_xor_b32_e32 v7, -1, v3
2122; GFX6-NEXT:    v_min_u32_e32 v6, v7, v6
2123; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
2124; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
2125; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v10
2126; GFX6-NEXT:    v_xor_b32_e32 v7, -1, v4
2127; GFX6-NEXT:    v_min_u32_e32 v6, v7, v6
2128; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
2129; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
2130; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
2131; GFX6-NEXT:    v_xor_b32_e32 v7, -1, v5
2132; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
2133; GFX6-NEXT:    v_min_u32_e32 v6, v7, v6
2134; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2135; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
2136; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
2137; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2138; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
2139; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
2140; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
2141; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
2142; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
2143; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
2144; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
2145; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
2146; GFX6-NEXT:    s_setpc_b64 s[30:31]
2147;
2148; GFX8-LABEL: v_uaddsat_v6i16:
2149; GFX8:       ; %bb.0:
2150; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2151; GFX8-NEXT:    v_add_u16_e64 v6, v0, v3 clamp
2152; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v3 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2153; GFX8-NEXT:    v_add_u16_e64 v3, v1, v4 clamp
2154; GFX8-NEXT:    v_add_u16_sdwa v1, v1, v4 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2155; GFX8-NEXT:    v_add_u16_e64 v4, v2, v5 clamp
2156; GFX8-NEXT:    v_add_u16_sdwa v2, v2, v5 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2157; GFX8-NEXT:    v_mov_b32_e32 v5, 16
2158; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2159; GFX8-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2160; GFX8-NEXT:    v_mov_b32_e32 v3, 16
2161; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2162; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2163; GFX8-NEXT:    v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2164; GFX8-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2165; GFX8-NEXT:    s_setpc_b64 s[30:31]
2166;
2167; GFX9-LABEL: v_uaddsat_v6i16:
2168; GFX9:       ; %bb.0:
2169; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2170; GFX9-NEXT:    v_pk_add_u16 v0, v0, v3 clamp
2171; GFX9-NEXT:    v_pk_add_u16 v1, v1, v4 clamp
2172; GFX9-NEXT:    v_pk_add_u16 v2, v2, v5 clamp
2173; GFX9-NEXT:    s_setpc_b64 s[30:31]
2174;
2175; GFX10-LABEL: v_uaddsat_v6i16:
2176; GFX10:       ; %bb.0:
2177; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2178; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2179; GFX10-NEXT:    v_pk_add_u16 v0, v0, v3 clamp
2180; GFX10-NEXT:    v_pk_add_u16 v1, v1, v4 clamp
2181; GFX10-NEXT:    v_pk_add_u16 v2, v2, v5 clamp
2182; GFX10-NEXT:    s_setpc_b64 s[30:31]
2183  %result = call <6 x i16> @llvm.uadd.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs)
2184  %cast = bitcast <6 x i16> %result to <3 x float>
2185  ret <3 x float> %cast
2186}
2187
; Unsigned saturating add of two <6 x i16> vectors in an amdgpu_ps function
; with both operands marked inreg; the result is returned bitcast to <3 x i32>.
; Expected code per run line, as recorded by the autogenerated checks below:
;   - tahiti (GFX6 prefix): every element is shifted left by 16, combined with
;     s_not / s_min_u32 / s_add_i32, then shifted back and repacked in pairs.
;   - fiji (GFX8 prefix): per-element v_add_u16 with clamp, SDWA shift/or to
;     repack, and v_readfirstlane_b32 to return the values in SGPRs.
;   - gfx900 and gfx1010: one v_pk_add_u16 with clamp per dword, followed by
;     v_readfirstlane_b32.
2188define amdgpu_ps <3 x i32> @s_uaddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inreg %rhs) {
2189; GFX6-LABEL: s_uaddsat_v6i16:
2190; GFX6:       ; %bb.0:
2191; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
2192; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
2193; GFX6-NEXT:    s_not_b32 s12, s0
2194; GFX6-NEXT:    s_min_u32 s6, s12, s6
2195; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
2196; GFX6-NEXT:    s_add_i32 s0, s0, s6
2197; GFX6-NEXT:    s_lshl_b32 s6, s7, 16
2198; GFX6-NEXT:    s_not_b32 s7, s1
2199; GFX6-NEXT:    s_min_u32 s6, s7, s6
2200; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
2201; GFX6-NEXT:    s_add_i32 s1, s1, s6
2202; GFX6-NEXT:    s_lshl_b32 s6, s8, 16
2203; GFX6-NEXT:    s_not_b32 s7, s2
2204; GFX6-NEXT:    s_min_u32 s6, s7, s6
2205; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
2206; GFX6-NEXT:    s_add_i32 s2, s2, s6
2207; GFX6-NEXT:    s_lshl_b32 s6, s9, 16
2208; GFX6-NEXT:    s_not_b32 s7, s3
2209; GFX6-NEXT:    s_min_u32 s6, s7, s6
2210; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
2211; GFX6-NEXT:    s_add_i32 s3, s3, s6
2212; GFX6-NEXT:    s_lshl_b32 s6, s10, 16
2213; GFX6-NEXT:    s_not_b32 s7, s4
2214; GFX6-NEXT:    s_min_u32 s6, s7, s6
2215; GFX6-NEXT:    s_lshl_b32 s5, s5, 16
2216; GFX6-NEXT:    s_add_i32 s4, s4, s6
2217; GFX6-NEXT:    s_lshl_b32 s6, s11, 16
2218; GFX6-NEXT:    s_not_b32 s7, s5
2219; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
2220; GFX6-NEXT:    s_min_u32 s6, s7, s6
2221; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
2222; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
2223; GFX6-NEXT:    s_add_i32 s5, s5, s6
2224; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
2225; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
2226; GFX6-NEXT:    s_lshr_b32 s5, s5, 16
2227; GFX6-NEXT:    s_or_b32 s0, s0, s1
2228; GFX6-NEXT:    s_lshl_b32 s1, s3, 16
2229; GFX6-NEXT:    s_lshr_b32 s4, s4, 16
2230; GFX6-NEXT:    s_or_b32 s1, s2, s1
2231; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
2232; GFX6-NEXT:    s_or_b32 s2, s4, s2
2233; GFX6-NEXT:    ; return to shader part epilog
2234;
2235; GFX8-LABEL: s_uaddsat_v6i16:
2236; GFX8:       ; %bb.0:
2237; GFX8-NEXT:    s_lshr_b32 s9, s3, 16
2238; GFX8-NEXT:    s_lshr_b32 s6, s0, 16
2239; GFX8-NEXT:    s_lshr_b32 s10, s4, 16
2240; GFX8-NEXT:    v_mov_b32_e32 v1, s9
2241; GFX8-NEXT:    s_lshr_b32 s7, s1, 16
2242; GFX8-NEXT:    s_lshr_b32 s11, s5, 16
2243; GFX8-NEXT:    v_mov_b32_e32 v0, s3
2244; GFX8-NEXT:    v_add_u16_e64 v1, s6, v1 clamp
2245; GFX8-NEXT:    v_mov_b32_e32 v3, s10
2246; GFX8-NEXT:    v_mov_b32_e32 v6, 16
2247; GFX8-NEXT:    s_lshr_b32 s8, s2, 16
2248; GFX8-NEXT:    v_add_u16_e64 v0, s0, v0 clamp
2249; GFX8-NEXT:    v_mov_b32_e32 v2, s4
2250; GFX8-NEXT:    v_add_u16_e64 v3, s7, v3 clamp
2251; GFX8-NEXT:    v_mov_b32_e32 v5, s11
2252; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2253; GFX8-NEXT:    v_add_u16_e64 v2, s1, v2 clamp
2254; GFX8-NEXT:    v_mov_b32_e32 v4, s5
2255; GFX8-NEXT:    v_add_u16_e64 v5, s8, v5 clamp
2256; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2257; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2258; GFX8-NEXT:    v_add_u16_e64 v4, s2, v4 clamp
2259; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2260; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2261; GFX8-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2262; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
2263; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
2264; GFX8-NEXT:    v_readfirstlane_b32 s2, v2
2265; GFX8-NEXT:    ; return to shader part epilog
2266;
2267; GFX9-LABEL: s_uaddsat_v6i16:
2268; GFX9:       ; %bb.0:
2269; GFX9-NEXT:    v_mov_b32_e32 v0, s3
2270; GFX9-NEXT:    v_mov_b32_e32 v1, s4
2271; GFX9-NEXT:    v_mov_b32_e32 v2, s5
2272; GFX9-NEXT:    v_pk_add_u16 v0, s0, v0 clamp
2273; GFX9-NEXT:    v_pk_add_u16 v1, s1, v1 clamp
2274; GFX9-NEXT:    v_pk_add_u16 v2, s2, v2 clamp
2275; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
2276; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
2277; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
2278; GFX9-NEXT:    ; return to shader part epilog
2279;
2280; GFX10-LABEL: s_uaddsat_v6i16:
2281; GFX10:       ; %bb.0:
2282; GFX10-NEXT:    v_pk_add_u16 v0, s0, s3 clamp
2283; GFX10-NEXT:    v_pk_add_u16 v1, s1, s4 clamp
2284; GFX10-NEXT:    v_pk_add_u16 v2, s2, s5 clamp
2285; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
2286; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
2287; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
2288; GFX10-NEXT:    ; return to shader part epilog
; IR under test: the saturating-add intrinsic, then a bitcast to the return type.
2289  %result = call <6 x i16> @llvm.uadd.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs)
2290  %cast = bitcast <6 x i16> %result to <3 x i32>
2291  ret <3 x i32> %cast
2292}
2293
; Unsigned saturating add of two <8 x i16> vectors in a function with the
; default calling convention; the result is returned bitcast to <4 x float>.
; Expected code per run line, as recorded by the autogenerated checks below:
;   - tahiti (GFX6 prefix): each element is shifted left by 16, combined with
;     v_xor -1 / v_min_u32 / v_add_i32, then shifted back and repacked in pairs.
;   - fiji (GFX8 prefix): v_add_u16 with clamp on the low halves and the SDWA
;     form on the high halves, then SDWA shift/or to repack.
;   - gfx900 and gfx1010: four v_pk_add_u16 instructions with clamp.
2294define <4 x float> @v_uaddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
2295; GFX6-LABEL: v_uaddsat_v8i16:
2296; GFX6:       ; %bb.0:
2297; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2298; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2299; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
2300; GFX6-NEXT:    v_xor_b32_e32 v16, -1, v0
2301; GFX6-NEXT:    v_min_u32_e32 v8, v16, v8
2302; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2303; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
2304; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
2305; GFX6-NEXT:    v_xor_b32_e32 v9, -1, v1
2306; GFX6-NEXT:    v_min_u32_e32 v8, v9, v8
2307; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2308; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v8
2309; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
2310; GFX6-NEXT:    v_xor_b32_e32 v9, -1, v2
2311; GFX6-NEXT:    v_min_u32_e32 v8, v9, v8
2312; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2313; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
2314; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
2315; GFX6-NEXT:    v_xor_b32_e32 v9, -1, v3
2316; GFX6-NEXT:    v_min_u32_e32 v8, v9, v8
2317; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
2318; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v8
2319; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v12
2320; GFX6-NEXT:    v_xor_b32_e32 v9, -1, v4
2321; GFX6-NEXT:    v_min_u32_e32 v8, v9, v8
2322; GFX6-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
2323; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
2324; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
2325; GFX6-NEXT:    v_xor_b32_e32 v9, -1, v5
2326; GFX6-NEXT:    v_min_u32_e32 v8, v9, v8
2327; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
2328; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
2329; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v14
2330; GFX6-NEXT:    v_xor_b32_e32 v9, -1, v6
2331; GFX6-NEXT:    v_min_u32_e32 v8, v9, v8
2332; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
2333; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
2334; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
2335; GFX6-NEXT:    v_xor_b32_e32 v9, -1, v7
2336; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
2337; GFX6-NEXT:    v_min_u32_e32 v8, v9, v8
2338; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2339; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
2340; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
2341; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2342; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
2343; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
2344; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
2345; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
2346; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
2347; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
2348; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
2349; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
2350; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
2351; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
2352; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
2353; GFX6-NEXT:    v_or_b32_e32 v3, v6, v3
2354; GFX6-NEXT:    s_setpc_b64 s[30:31]
2355;
2356; GFX8-LABEL: v_uaddsat_v8i16:
2357; GFX8:       ; %bb.0:
2358; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2359; GFX8-NEXT:    v_add_u16_e64 v8, v0, v4 clamp
2360; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v4 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2361; GFX8-NEXT:    v_add_u16_e64 v4, v1, v5 clamp
2362; GFX8-NEXT:    v_add_u16_sdwa v1, v1, v5 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2363; GFX8-NEXT:    v_add_u16_e64 v5, v2, v6 clamp
2364; GFX8-NEXT:    v_add_u16_sdwa v2, v2, v6 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2365; GFX8-NEXT:    v_add_u16_e64 v6, v3, v7 clamp
2366; GFX8-NEXT:    v_add_u16_sdwa v3, v3, v7 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2367; GFX8-NEXT:    v_mov_b32_e32 v7, 16
2368; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2369; GFX8-NEXT:    v_mov_b32_e32 v7, 16
2370; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2371; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2372; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2373; GFX8-NEXT:    v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2374; GFX8-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2375; GFX8-NEXT:    v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2376; GFX8-NEXT:    v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2377; GFX8-NEXT:    s_setpc_b64 s[30:31]
2378;
2379; GFX9-LABEL: v_uaddsat_v8i16:
2380; GFX9:       ; %bb.0:
2381; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2382; GFX9-NEXT:    v_pk_add_u16 v0, v0, v4 clamp
2383; GFX9-NEXT:    v_pk_add_u16 v1, v1, v5 clamp
2384; GFX9-NEXT:    v_pk_add_u16 v2, v2, v6 clamp
2385; GFX9-NEXT:    v_pk_add_u16 v3, v3, v7 clamp
2386; GFX9-NEXT:    s_setpc_b64 s[30:31]
2387;
2388; GFX10-LABEL: v_uaddsat_v8i16:
2389; GFX10:       ; %bb.0:
2390; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2391; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2392; GFX10-NEXT:    v_pk_add_u16 v0, v0, v4 clamp
2393; GFX10-NEXT:    v_pk_add_u16 v1, v1, v5 clamp
2394; GFX10-NEXT:    v_pk_add_u16 v2, v2, v6 clamp
2395; GFX10-NEXT:    v_pk_add_u16 v3, v3, v7 clamp
2396; GFX10-NEXT:    s_setpc_b64 s[30:31]
; IR under test: the saturating-add intrinsic, then a bitcast to the return type.
2397  %result = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
2398  %cast = bitcast <8 x i16> %result to <4 x float>
2399  ret <4 x float> %cast
2400}
2401
; Unsigned saturating add of two <8 x i16> vectors in an amdgpu_ps function
; with both operands marked inreg; the result is returned bitcast to <4 x i32>.
; Expected code per run line, as recorded by the autogenerated checks below:
;   - tahiti (GFX6 prefix): every element is shifted left by 16, combined with
;     s_not / s_min_u32 / s_add_i32, then shifted back and repacked in pairs.
;   - fiji (GFX8 prefix): per-element v_add_u16 with clamp, SDWA shift/or to
;     repack, and v_readfirstlane_b32 to return the values in SGPRs.
;   - gfx900 and gfx1010: one v_pk_add_u16 with clamp per dword, followed by
;     v_readfirstlane_b32.
2402define amdgpu_ps <4 x i32> @s_uaddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inreg %rhs) {
2403; GFX6-LABEL: s_uaddsat_v8i16:
2404; GFX6:       ; %bb.0:
2405; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
2406; GFX6-NEXT:    s_lshl_b32 s8, s8, 16
2407; GFX6-NEXT:    s_not_b32 s16, s0
2408; GFX6-NEXT:    s_min_u32 s8, s16, s8
2409; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
2410; GFX6-NEXT:    s_add_i32 s0, s0, s8
2411; GFX6-NEXT:    s_lshl_b32 s8, s9, 16
2412; GFX6-NEXT:    s_not_b32 s9, s1
2413; GFX6-NEXT:    s_min_u32 s8, s9, s8
2414; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
2415; GFX6-NEXT:    s_add_i32 s1, s1, s8
2416; GFX6-NEXT:    s_lshl_b32 s8, s10, 16
2417; GFX6-NEXT:    s_not_b32 s9, s2
2418; GFX6-NEXT:    s_min_u32 s8, s9, s8
2419; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
2420; GFX6-NEXT:    s_add_i32 s2, s2, s8
2421; GFX6-NEXT:    s_lshl_b32 s8, s11, 16
2422; GFX6-NEXT:    s_not_b32 s9, s3
2423; GFX6-NEXT:    s_min_u32 s8, s9, s8
2424; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
2425; GFX6-NEXT:    s_add_i32 s3, s3, s8
2426; GFX6-NEXT:    s_lshl_b32 s8, s12, 16
2427; GFX6-NEXT:    s_not_b32 s9, s4
2428; GFX6-NEXT:    s_min_u32 s8, s9, s8
2429; GFX6-NEXT:    s_lshl_b32 s5, s5, 16
2430; GFX6-NEXT:    s_add_i32 s4, s4, s8
2431; GFX6-NEXT:    s_lshl_b32 s8, s13, 16
2432; GFX6-NEXT:    s_not_b32 s9, s5
2433; GFX6-NEXT:    s_min_u32 s8, s9, s8
2434; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
2435; GFX6-NEXT:    s_add_i32 s5, s5, s8
2436; GFX6-NEXT:    s_lshl_b32 s8, s14, 16
2437; GFX6-NEXT:    s_not_b32 s9, s6
2438; GFX6-NEXT:    s_min_u32 s8, s9, s8
2439; GFX6-NEXT:    s_lshl_b32 s7, s7, 16
2440; GFX6-NEXT:    s_add_i32 s6, s6, s8
2441; GFX6-NEXT:    s_lshl_b32 s8, s15, 16
2442; GFX6-NEXT:    s_not_b32 s9, s7
2443; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
2444; GFX6-NEXT:    s_min_u32 s8, s9, s8
2445; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
2446; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
2447; GFX6-NEXT:    s_add_i32 s7, s7, s8
2448; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
2449; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
2450; GFX6-NEXT:    s_lshr_b32 s5, s5, 16
2451; GFX6-NEXT:    s_lshr_b32 s7, s7, 16
2452; GFX6-NEXT:    s_or_b32 s0, s0, s1
2453; GFX6-NEXT:    s_lshl_b32 s1, s3, 16
2454; GFX6-NEXT:    s_lshr_b32 s4, s4, 16
2455; GFX6-NEXT:    s_lshr_b32 s6, s6, 16
2456; GFX6-NEXT:    s_or_b32 s1, s2, s1
2457; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
2458; GFX6-NEXT:    s_lshl_b32 s3, s7, 16
2459; GFX6-NEXT:    s_or_b32 s2, s4, s2
2460; GFX6-NEXT:    s_or_b32 s3, s6, s3
2461; GFX6-NEXT:    ; return to shader part epilog
2462;
2463; GFX8-LABEL: s_uaddsat_v8i16:
2464; GFX8:       ; %bb.0:
2465; GFX8-NEXT:    s_lshr_b32 s12, s4, 16
2466; GFX8-NEXT:    s_lshr_b32 s8, s0, 16
2467; GFX8-NEXT:    s_lshr_b32 s13, s5, 16
2468; GFX8-NEXT:    v_mov_b32_e32 v1, s12
2469; GFX8-NEXT:    s_lshr_b32 s9, s1, 16
2470; GFX8-NEXT:    s_lshr_b32 s14, s6, 16
2471; GFX8-NEXT:    s_lshr_b32 s15, s7, 16
2472; GFX8-NEXT:    v_mov_b32_e32 v0, s4
2473; GFX8-NEXT:    v_add_u16_e64 v1, s8, v1 clamp
2474; GFX8-NEXT:    v_mov_b32_e32 v3, s13
2475; GFX8-NEXT:    v_mov_b32_e32 v8, 16
2476; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
2477; GFX8-NEXT:    s_lshr_b32 s11, s3, 16
2478; GFX8-NEXT:    v_add_u16_e64 v0, s0, v0 clamp
2479; GFX8-NEXT:    v_mov_b32_e32 v2, s5
2480; GFX8-NEXT:    v_add_u16_e64 v3, s9, v3 clamp
2481; GFX8-NEXT:    v_mov_b32_e32 v5, s14
2482; GFX8-NEXT:    v_mov_b32_e32 v7, s15
2483; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2484; GFX8-NEXT:    v_add_u16_e64 v2, s1, v2 clamp
2485; GFX8-NEXT:    v_mov_b32_e32 v4, s6
2486; GFX8-NEXT:    v_add_u16_e64 v5, s10, v5 clamp
2487; GFX8-NEXT:    v_mov_b32_e32 v6, s7
2488; GFX8-NEXT:    v_add_u16_e64 v7, s11, v7 clamp
2489; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2490; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2491; GFX8-NEXT:    v_add_u16_e64 v4, s2, v4 clamp
2492; GFX8-NEXT:    v_add_u16_e64 v6, s3, v6 clamp
2493; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2494; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2495; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2496; GFX8-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2497; GFX8-NEXT:    v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2498; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
2499; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
2500; GFX8-NEXT:    v_readfirstlane_b32 s2, v2
2501; GFX8-NEXT:    v_readfirstlane_b32 s3, v3
2502; GFX8-NEXT:    ; return to shader part epilog
2503;
2504; GFX9-LABEL: s_uaddsat_v8i16:
2505; GFX9:       ; %bb.0:
2506; GFX9-NEXT:    v_mov_b32_e32 v0, s4
2507; GFX9-NEXT:    v_mov_b32_e32 v1, s5
2508; GFX9-NEXT:    v_mov_b32_e32 v2, s6
2509; GFX9-NEXT:    v_mov_b32_e32 v3, s7
2510; GFX9-NEXT:    v_pk_add_u16 v0, s0, v0 clamp
2511; GFX9-NEXT:    v_pk_add_u16 v1, s1, v1 clamp
2512; GFX9-NEXT:    v_pk_add_u16 v2, s2, v2 clamp
2513; GFX9-NEXT:    v_pk_add_u16 v3, s3, v3 clamp
2514; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
2515; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
2516; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
2517; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
2518; GFX9-NEXT:    ; return to shader part epilog
2519;
2520; GFX10-LABEL: s_uaddsat_v8i16:
2521; GFX10:       ; %bb.0:
2522; GFX10-NEXT:    v_pk_add_u16 v0, s0, s4 clamp
2523; GFX10-NEXT:    v_pk_add_u16 v1, s1, s5 clamp
2524; GFX10-NEXT:    v_pk_add_u16 v2, s2, s6 clamp
2525; GFX10-NEXT:    v_pk_add_u16 v3, s3, s7 clamp
2526; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
2527; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
2528; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
2529; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
2530; GFX10-NEXT:    ; return to shader part epilog
; IR under test: the saturating-add intrinsic, then a bitcast to the return type.
2531  %result = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
2532  %cast = bitcast <8 x i16> %result to <4 x i32>
2533  ret <4 x i32> %cast
2534}
2535
2536; FIXME: The i48 tests below are disabled because i48 add is broken.
2537; define i48 @v_uaddsat_i48(i48 %lhs, i48 %rhs) {
2538;   %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs)
2539;   ret i48 %result
2540; }
2541
2542; define amdgpu_ps i48 @s_uaddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
2543;   %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs)
2544;   ret i48 %result
2545; }
2546
2547; define amdgpu_ps <2 x float> @uaddsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
2548;   %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs)
2549;   %ext.result = zext i48 %result to i64
2550;   %cast = bitcast i64 %ext.result to <2 x float>
2551;   ret <2 x float> %cast
2552; }
2553
2554; define amdgpu_ps <2 x float> @uaddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
2555;   %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs)
2556;   %ext.result = zext i48 %result to i64
2557;   %cast = bitcast i64 %ext.result to <2 x float>
2558;   ret <2 x float> %cast
2559; }
2560
; Unsigned saturating add of two i64 values in a function with the default
; calling convention.
; On every run line the autogenerated checks below expect the same shape:
; a 32-bit add producing a carry, an add-with-carry for the high half, a
; v_cmp_lt_u64 of the sum against the second operand, and two v_cndmask_b32
; instructions that select -1 when the compare is true.
2561define i64 @v_uaddsat_i64(i64 %lhs, i64 %rhs) {
2562; GFX6-LABEL: v_uaddsat_i64:
2563; GFX6:       ; %bb.0:
2564; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2565; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
2566; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
2567; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
2568; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
2569; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
2570; GFX6-NEXT:    s_setpc_b64 s[30:31]
2571;
2572; GFX8-LABEL: v_uaddsat_i64:
2573; GFX8:       ; %bb.0:
2574; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2575; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
2576; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
2577; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
2578; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
2579; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
2580; GFX8-NEXT:    s_setpc_b64 s[30:31]
2581;
2582; GFX9-LABEL: v_uaddsat_i64:
2583; GFX9:       ; %bb.0:
2584; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2585; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
2586; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
2587; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
2588; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
2589; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
2590; GFX9-NEXT:    s_setpc_b64 s[30:31]
2591;
2592; GFX10-LABEL: v_uaddsat_i64:
2593; GFX10:       ; %bb.0:
2594; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2595; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2596; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
2597; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
2598; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
2599; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
2600; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc_lo
2601; GFX10-NEXT:    s_setpc_b64 s[30:31]
; IR under test: a single call to the i64 saturating-add intrinsic.
2602  %result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs)
2603  ret i64 %result
2604}
2605
; Unsigned saturating add of two i64 values in an amdgpu_ps function with both
; operands marked inreg.
; The autogenerated checks below expect the add itself to use scalar
; instructions (s_add_u32 / s_addc_u32), while the overflow compare and the
; select of -1 use v_cmp_lt_u64 and v_cndmask_b32; the result is then moved
; back with v_readfirstlane_b32. The gfx1010 checks feed SGPR operands to the
; vector instructions directly instead of first copying them with v_mov_b32.
2606define amdgpu_ps i64 @s_uaddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
2607; GFX6-LABEL: s_uaddsat_i64:
2608; GFX6:       ; %bb.0:
2609; GFX6-NEXT:    s_add_u32 s0, s0, s2
2610; GFX6-NEXT:    s_cselect_b32 s4, 1, 0
2611; GFX6-NEXT:    s_and_b32 s4, s4, 1
2612; GFX6-NEXT:    s_cmp_lg_u32 s4, 0
2613; GFX6-NEXT:    v_mov_b32_e32 v0, s2
2614; GFX6-NEXT:    s_addc_u32 s1, s1, s3
2615; GFX6-NEXT:    v_mov_b32_e32 v1, s3
2616; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
2617; GFX6-NEXT:    v_mov_b32_e32 v2, s0
2618; GFX6-NEXT:    v_mov_b32_e32 v3, s1
2619; GFX6-NEXT:    v_cndmask_b32_e64 v0, v2, -1, vcc
2620; GFX6-NEXT:    v_cndmask_b32_e64 v1, v3, -1, vcc
2621; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
2622; GFX6-NEXT:    v_readfirstlane_b32 s1, v1
2623; GFX6-NEXT:    ; return to shader part epilog
2624;
2625; GFX8-LABEL: s_uaddsat_i64:
2626; GFX8:       ; %bb.0:
2627; GFX8-NEXT:    s_add_u32 s0, s0, s2
2628; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
2629; GFX8-NEXT:    s_and_b32 s4, s4, 1
2630; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
2631; GFX8-NEXT:    v_mov_b32_e32 v0, s2
2632; GFX8-NEXT:    s_addc_u32 s1, s1, s3
2633; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2634; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
2635; GFX8-NEXT:    v_mov_b32_e32 v2, s0
2636; GFX8-NEXT:    v_mov_b32_e32 v3, s1
2637; GFX8-NEXT:    v_cndmask_b32_e64 v0, v2, -1, vcc
2638; GFX8-NEXT:    v_cndmask_b32_e64 v1, v3, -1, vcc
2639; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
2640; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
2641; GFX8-NEXT:    ; return to shader part epilog
2642;
2643; GFX9-LABEL: s_uaddsat_i64:
2644; GFX9:       ; %bb.0:
2645; GFX9-NEXT:    s_add_u32 s0, s0, s2
2646; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
2647; GFX9-NEXT:    s_and_b32 s4, s4, 1
2648; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
2649; GFX9-NEXT:    v_mov_b32_e32 v0, s2
2650; GFX9-NEXT:    s_addc_u32 s1, s1, s3
2651; GFX9-NEXT:    v_mov_b32_e32 v1, s3
2652; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
2653; GFX9-NEXT:    v_mov_b32_e32 v2, s0
2654; GFX9-NEXT:    v_mov_b32_e32 v3, s1
2655; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, -1, vcc
2656; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, -1, vcc
2657; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
2658; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
2659; GFX9-NEXT:    ; return to shader part epilog
2660;
2661; GFX10-LABEL: s_uaddsat_i64:
2662; GFX10:       ; %bb.0:
2663; GFX10-NEXT:    s_add_u32 s0, s0, s2
2664; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
2665; GFX10-NEXT:    s_and_b32 s4, s4, 1
2666; GFX10-NEXT:    s_cmp_lg_u32 s4, 0
2667; GFX10-NEXT:    s_addc_u32 s1, s1, s3
2668; GFX10-NEXT:    v_cmp_lt_u64_e64 s2, s[0:1], s[2:3]
2669; GFX10-NEXT:    v_cndmask_b32_e64 v0, s0, -1, s2
2670; GFX10-NEXT:    v_cndmask_b32_e64 v1, s1, -1, s2
2671; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
2672; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
2673; GFX10-NEXT:    ; return to shader part epilog
; IR under test: a single call to the i64 saturating-add intrinsic.
2674  %result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs)
2675  ret i64 %result
2676}
2677
; Unsigned saturating add of two i64 values in an amdgpu_ps function where
; only %lhs is marked inreg ("sv" variant); the result is returned bitcast to
; <2 x float>.
; The autogenerated checks below expect a vector add / add-with-carry into
; v[2:3], a v_cmp_lt_u64 of that sum against %rhs in v[0:1], and two
; v_cndmask_b32 instructions selecting -1 when the compare is true.
2678define amdgpu_ps <2 x float> @uaddsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
2679; GFX6-LABEL: uaddsat_i64_sv:
2680; GFX6:       ; %bb.0:
2681; GFX6-NEXT:    v_mov_b32_e32 v3, s1
2682; GFX6-NEXT:    v_add_i32_e32 v2, vcc, s0, v0
2683; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v1, vcc
2684; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
2685; GFX6-NEXT:    v_cndmask_b32_e64 v0, v2, -1, vcc
2686; GFX6-NEXT:    v_cndmask_b32_e64 v1, v3, -1, vcc
2687; GFX6-NEXT:    ; return to shader part epilog
2688;
2689; GFX8-LABEL: uaddsat_i64_sv:
2690; GFX8:       ; %bb.0:
2691; GFX8-NEXT:    v_mov_b32_e32 v3, s1
2692; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v0
2693; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v3, v1, vcc
2694; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
2695; GFX8-NEXT:    v_cndmask_b32_e64 v0, v2, -1, vcc
2696; GFX8-NEXT:    v_cndmask_b32_e64 v1, v3, -1, vcc
2697; GFX8-NEXT:    ; return to shader part epilog
2698;
2699; GFX9-LABEL: uaddsat_i64_sv:
2700; GFX9:       ; %bb.0:
2701; GFX9-NEXT:    v_mov_b32_e32 v3, s1
2702; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v0
2703; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v1, vcc
2704; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
2705; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, -1, vcc
2706; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, -1, vcc
2707; GFX9-NEXT:    ; return to shader part epilog
2708;
2709; GFX10-LABEL: uaddsat_i64_sv:
2710; GFX10:       ; %bb.0:
2711; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, s0, v0
2712; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
2713; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1]
2714; GFX10-NEXT:    v_cndmask_b32_e64 v0, v2, -1, vcc_lo
2715; GFX10-NEXT:    v_cndmask_b32_e64 v1, v3, -1, vcc_lo
2716; GFX10-NEXT:    ; return to shader part epilog
; IR under test: the saturating-add intrinsic, then a bitcast to the return type.
2717  %result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs)
2718  %cast = bitcast i64 %result to <2 x float>
2719  ret <2 x float> %cast
2720}
2721
; Unsigned saturating add of two i64 values in an amdgpu_ps function where
; only %rhs is marked inreg ("vs" variant); the result is returned bitcast to
; <2 x float>.
; The autogenerated checks below expect a vector add / add-with-carry into
; v[0:1], followed by v_cmp_gt_u64 with the SGPR pair s[0:1] as the first
; operand and the sum as the second, and two v_cndmask_b32 instructions
; selecting -1 when the compare is true.
2722define amdgpu_ps <2 x float> @uaddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
2723; GFX6-LABEL: uaddsat_i64_vs:
2724; GFX6:       ; %bb.0:
2725; GFX6-NEXT:    v_mov_b32_e32 v2, s1
2726; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
2727; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
2728; GFX6-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
2729; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
2730; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
2731; GFX6-NEXT:    ; return to shader part epilog
2732;
2733; GFX8-LABEL: uaddsat_i64_vs:
2734; GFX8:       ; %bb.0:
2735; GFX8-NEXT:    v_mov_b32_e32 v2, s1
2736; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
2737; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
2738; GFX8-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
2739; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
2740; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
2741; GFX8-NEXT:    ; return to shader part epilog
2742;
2743; GFX9-LABEL: uaddsat_i64_vs:
2744; GFX9:       ; %bb.0:
2745; GFX9-NEXT:    v_mov_b32_e32 v2, s1
2746; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
2747; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
2748; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
2749; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
2750; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
2751; GFX9-NEXT:    ; return to shader part epilog
2752;
2753; GFX10-LABEL: uaddsat_i64_vs:
2754; GFX10:       ; %bb.0:
2755; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, s0
2756; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
2757; GFX10-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1]
2758; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
2759; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc_lo
2760; GFX10-NEXT:    ; return to shader part epilog
; IR under test: the saturating-add intrinsic, then a bitcast to the return type.
2761  %result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs)
2762  %cast = bitcast i64 %result to <2 x float>
2763  ret <2 x float> %cast
2764}
2765
; Unsigned saturating add of two <2 x i64> vectors in a function with the
; default calling convention.
; The autogenerated checks below expect, for each 64-bit element, an add /
; add-with-carry pair, a v_cmp_lt_u64 of the sum against the matching element
; of the second operand, and two v_cndmask_b32 instructions selecting -1.
; On tahiti, fiji and gfx900 the two elements are handled one after the other
; through vcc; on gfx1010 both adds come first and the second compare writes
; its result to s4.
2766define <2 x i64> @v_uaddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
2767; GFX6-LABEL: v_uaddsat_v2i64:
2768; GFX6:       ; %bb.0:
2769; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2770; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
2771; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v5, vcc
2772; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5]
2773; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
2774; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
2775; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
2776; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v7, vcc
2777; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[6:7]
2778; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
2779; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, -1, vcc
2780; GFX6-NEXT:    s_setpc_b64 s[30:31]
2781;
2782; GFX8-LABEL: v_uaddsat_v2i64:
2783; GFX8:       ; %bb.0:
2784; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2785; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v4
2786; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v5, vcc
2787; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5]
2788; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
2789; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
2790; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
2791; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v3, v7, vcc
2792; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[6:7]
2793; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
2794; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, -1, vcc
2795; GFX8-NEXT:    s_setpc_b64 s[30:31]
2796;
2797; GFX9-LABEL: v_uaddsat_v2i64:
2798; GFX9:       ; %bb.0:
2799; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2800; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
2801; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v5, vcc
2802; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5]
2803; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
2804; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
2805; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v6
2806; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
2807; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[6:7]
2808; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
2809; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, -1, vcc
2810; GFX9-NEXT:    s_setpc_b64 s[30:31]
2811;
2812; GFX10-LABEL: v_uaddsat_v2i64:
2813; GFX10:       ; %bb.0:
2814; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2815; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2816; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
2817; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
2818; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v6
2819; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
2820; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5]
2821; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, v[2:3], v[6:7]
2822; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
2823; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc_lo
2824; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, -1, s4
2825; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, -1, s4
2826; GFX10-NEXT:    s_setpc_b64 s[30:31]
; IR under test: a single call to the <2 x i64> saturating-add intrinsic.
2827  %result = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
2828  ret <2 x i64> %result
2829}
2830
; Test: llvm.uadd.sat on <2 x i64> in an amdgpu_ps function where both operands
; are marked inreg. In the expected assembly the operands arrive in s0-s7, each
; 64-bit element is added with an s_add_u32/s_addc_u32 pair, the sum is compared
; (unsigned less-than) against the matching rhs element, v_cndmask selects -1
; when that compare is true, and v_readfirstlane copies the result to s0-s3.
; The check lines below are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing them by hand.
2831define amdgpu_ps <2 x i64> @s_uaddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs) {
2832; GFX6-LABEL: s_uaddsat_v2i64:
2833; GFX6:       ; %bb.0:
2834; GFX6-NEXT:    s_add_u32 s0, s0, s4
2835; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
2836; GFX6-NEXT:    s_and_b32 s8, s8, 1
2837; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
2838; GFX6-NEXT:    v_mov_b32_e32 v0, s4
2839; GFX6-NEXT:    s_addc_u32 s1, s1, s5
2840; GFX6-NEXT:    v_mov_b32_e32 v1, s5
2841; GFX6-NEXT:    v_mov_b32_e32 v2, s0
2842; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
2843; GFX6-NEXT:    s_add_u32 s0, s2, s6
2844; GFX6-NEXT:    v_mov_b32_e32 v3, s1
2845; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
2846; GFX6-NEXT:    s_and_b32 s1, s1, 1
2847; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
2848; GFX6-NEXT:    v_mov_b32_e32 v0, s6
2849; GFX6-NEXT:    s_addc_u32 s1, s3, s7
2850; GFX6-NEXT:    v_mov_b32_e32 v1, s7
2851; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
2852; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, -1, vcc
2853; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
2854; GFX6-NEXT:    v_mov_b32_e32 v4, s0
2855; GFX6-NEXT:    v_mov_b32_e32 v5, s1
2856; GFX6-NEXT:    v_cndmask_b32_e64 v0, v4, -1, vcc
2857; GFX6-NEXT:    v_cndmask_b32_e64 v1, v5, -1, vcc
2858; GFX6-NEXT:    v_readfirstlane_b32 s0, v2
2859; GFX6-NEXT:    v_readfirstlane_b32 s1, v3
2860; GFX6-NEXT:    v_readfirstlane_b32 s2, v0
2861; GFX6-NEXT:    v_readfirstlane_b32 s3, v1
2862; GFX6-NEXT:    ; return to shader part epilog
2863;
2864; GFX8-LABEL: s_uaddsat_v2i64:
2865; GFX8:       ; %bb.0:
2866; GFX8-NEXT:    s_add_u32 s0, s0, s4
2867; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
2868; GFX8-NEXT:    s_and_b32 s8, s8, 1
2869; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
2870; GFX8-NEXT:    v_mov_b32_e32 v0, s4
2871; GFX8-NEXT:    s_addc_u32 s1, s1, s5
2872; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2873; GFX8-NEXT:    v_mov_b32_e32 v2, s0
2874; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
2875; GFX8-NEXT:    s_add_u32 s0, s2, s6
2876; GFX8-NEXT:    v_mov_b32_e32 v3, s1
2877; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
2878; GFX8-NEXT:    s_and_b32 s1, s1, 1
2879; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
2880; GFX8-NEXT:    v_mov_b32_e32 v0, s6
2881; GFX8-NEXT:    s_addc_u32 s1, s3, s7
2882; GFX8-NEXT:    v_mov_b32_e32 v1, s7
2883; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
2884; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, -1, vcc
2885; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
2886; GFX8-NEXT:    v_mov_b32_e32 v4, s0
2887; GFX8-NEXT:    v_mov_b32_e32 v5, s1
2888; GFX8-NEXT:    v_cndmask_b32_e64 v0, v4, -1, vcc
2889; GFX8-NEXT:    v_cndmask_b32_e64 v1, v5, -1, vcc
2890; GFX8-NEXT:    v_readfirstlane_b32 s0, v2
2891; GFX8-NEXT:    v_readfirstlane_b32 s1, v3
2892; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
2893; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
2894; GFX8-NEXT:    ; return to shader part epilog
2895;
2896; GFX9-LABEL: s_uaddsat_v2i64:
2897; GFX9:       ; %bb.0:
2898; GFX9-NEXT:    s_add_u32 s0, s0, s4
2899; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
2900; GFX9-NEXT:    s_and_b32 s8, s8, 1
2901; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
2902; GFX9-NEXT:    v_mov_b32_e32 v0, s4
2903; GFX9-NEXT:    s_addc_u32 s1, s1, s5
2904; GFX9-NEXT:    v_mov_b32_e32 v1, s5
2905; GFX9-NEXT:    v_mov_b32_e32 v2, s0
2906; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
2907; GFX9-NEXT:    s_add_u32 s0, s2, s6
2908; GFX9-NEXT:    v_mov_b32_e32 v3, s1
2909; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
2910; GFX9-NEXT:    s_and_b32 s1, s1, 1
2911; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
2912; GFX9-NEXT:    v_mov_b32_e32 v0, s6
2913; GFX9-NEXT:    s_addc_u32 s1, s3, s7
2914; GFX9-NEXT:    v_mov_b32_e32 v1, s7
2915; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
2916; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, -1, vcc
2917; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
2918; GFX9-NEXT:    v_mov_b32_e32 v4, s0
2919; GFX9-NEXT:    v_mov_b32_e32 v5, s1
2920; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, -1, vcc
2921; GFX9-NEXT:    v_cndmask_b32_e64 v1, v5, -1, vcc
2922; GFX9-NEXT:    v_readfirstlane_b32 s0, v2
2923; GFX9-NEXT:    v_readfirstlane_b32 s1, v3
2924; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
2925; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
2926; GFX9-NEXT:    ; return to shader part epilog
2927;
2928; GFX10-LABEL: s_uaddsat_v2i64:
2929; GFX10:       ; %bb.0:
2930; GFX10-NEXT:    s_add_u32 s0, s0, s4
2931; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
2932; GFX10-NEXT:    s_and_b32 s8, s8, 1
2933; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
2934; GFX10-NEXT:    s_addc_u32 s1, s1, s5
2935; GFX10-NEXT:    s_add_u32 s2, s2, s6
2936; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
2937; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
2938; GFX10-NEXT:    s_and_b32 s8, s8, 1
2939; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
2940; GFX10-NEXT:    s_addc_u32 s3, s3, s7
2941; GFX10-NEXT:    v_cndmask_b32_e64 v0, s0, -1, s4
2942; GFX10-NEXT:    v_cmp_lt_u64_e64 s5, s[2:3], s[6:7]
2943; GFX10-NEXT:    v_cndmask_b32_e64 v1, s1, -1, s4
2944; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
2945; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
2946; GFX10-NEXT:    v_cndmask_b32_e64 v2, s2, -1, s5
2947; GFX10-NEXT:    v_cndmask_b32_e64 v3, s3, -1, s5
2948; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
2949; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
2950; GFX10-NEXT:    ; return to shader part epilog
2951  %result = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
2952  ret <2 x i64> %result
2953}
2954
; Test: llvm.uadd.sat on a scalar i128 in an amdgpu_ps function where both
; operands are marked inreg. The expected assembly performs the 128-bit add as a
; four-step s_add_u32/s_addc_u32 carry chain over s0-s3 and s4-s7. The overflow
; condition is assembled from two 64-bit unsigned less-than compares (low half
; and high half of the sum versus rhs), with the high-half equality compare
; choosing between them; when it holds, all four result dwords are replaced by
; -1 before being copied back to s0-s3 with v_readfirstlane.
; The check lines below are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing them by hand.
2955define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
2956; GFX6-LABEL: s_uaddsat_i128:
2957; GFX6:       ; %bb.0:
2958; GFX6-NEXT:    s_add_u32 s0, s0, s4
2959; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
2960; GFX6-NEXT:    s_and_b32 s8, s8, 1
2961; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
2962; GFX6-NEXT:    s_addc_u32 s1, s1, s5
2963; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
2964; GFX6-NEXT:    s_and_b32 s8, s8, 1
2965; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
2966; GFX6-NEXT:    s_addc_u32 s2, s2, s6
2967; GFX6-NEXT:    s_cselect_b32 s8, 1, 0
2968; GFX6-NEXT:    v_mov_b32_e32 v2, s4
2969; GFX6-NEXT:    s_and_b32 s8, s8, 1
2970; GFX6-NEXT:    v_mov_b32_e32 v3, s5
2971; GFX6-NEXT:    s_cmp_lg_u32 s8, 0
2972; GFX6-NEXT:    v_mov_b32_e32 v0, s6
2973; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
2974; GFX6-NEXT:    s_addc_u32 s3, s3, s7
2975; GFX6-NEXT:    v_mov_b32_e32 v1, s7
2976; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
2977; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
2978; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
2979; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1]
2980; GFX6-NEXT:    v_mov_b32_e32 v1, s0
2981; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
2982; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
2983; GFX6-NEXT:    v_mov_b32_e32 v2, s1
2984; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
2985; GFX6-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc
2986; GFX6-NEXT:    v_cndmask_b32_e64 v1, v2, -1, vcc
2987; GFX6-NEXT:    v_mov_b32_e32 v2, s2
2988; GFX6-NEXT:    v_mov_b32_e32 v3, s3
2989; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
2990; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, -1, vcc
2991; GFX6-NEXT:    v_readfirstlane_b32 s0, v0
2992; GFX6-NEXT:    v_readfirstlane_b32 s1, v1
2993; GFX6-NEXT:    v_readfirstlane_b32 s2, v2
2994; GFX6-NEXT:    v_readfirstlane_b32 s3, v3
2995; GFX6-NEXT:    ; return to shader part epilog
2996;
2997; GFX8-LABEL: s_uaddsat_i128:
2998; GFX8:       ; %bb.0:
2999; GFX8-NEXT:    s_add_u32 s0, s0, s4
3000; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
3001; GFX8-NEXT:    s_and_b32 s8, s8, 1
3002; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
3003; GFX8-NEXT:    s_addc_u32 s1, s1, s5
3004; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
3005; GFX8-NEXT:    s_and_b32 s8, s8, 1
3006; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
3007; GFX8-NEXT:    s_addc_u32 s2, s2, s6
3008; GFX8-NEXT:    s_cselect_b32 s8, 1, 0
3009; GFX8-NEXT:    s_and_b32 s8, s8, 1
3010; GFX8-NEXT:    v_mov_b32_e32 v2, s4
3011; GFX8-NEXT:    s_cmp_lg_u32 s8, 0
3012; GFX8-NEXT:    v_mov_b32_e32 v3, s5
3013; GFX8-NEXT:    s_addc_u32 s3, s3, s7
3014; GFX8-NEXT:    v_mov_b32_e32 v0, s6
3015; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
3016; GFX8-NEXT:    v_mov_b32_e32 v1, s7
3017; GFX8-NEXT:    s_cmp_eq_u64 s[2:3], s[6:7]
3018; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
3019; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
3020; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
3021; GFX8-NEXT:    s_and_b32 s4, 1, s6
3022; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
3023; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
3024; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3025; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
3026; GFX8-NEXT:    v_mov_b32_e32 v1, s0
3027; GFX8-NEXT:    v_mov_b32_e32 v2, s1
3028; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
3029; GFX8-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc
3030; GFX8-NEXT:    v_cndmask_b32_e64 v1, v2, -1, vcc
3031; GFX8-NEXT:    v_mov_b32_e32 v2, s2
3032; GFX8-NEXT:    v_mov_b32_e32 v3, s3
3033; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
3034; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, -1, vcc
3035; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
3036; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
3037; GFX8-NEXT:    v_readfirstlane_b32 s2, v2
3038; GFX8-NEXT:    v_readfirstlane_b32 s3, v3
3039; GFX8-NEXT:    ; return to shader part epilog
3040;
3041; GFX9-LABEL: s_uaddsat_i128:
3042; GFX9:       ; %bb.0:
3043; GFX9-NEXT:    s_add_u32 s0, s0, s4
3044; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
3045; GFX9-NEXT:    s_and_b32 s8, s8, 1
3046; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
3047; GFX9-NEXT:    s_addc_u32 s1, s1, s5
3048; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
3049; GFX9-NEXT:    s_and_b32 s8, s8, 1
3050; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
3051; GFX9-NEXT:    s_addc_u32 s2, s2, s6
3052; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
3053; GFX9-NEXT:    s_and_b32 s8, s8, 1
3054; GFX9-NEXT:    v_mov_b32_e32 v2, s4
3055; GFX9-NEXT:    s_cmp_lg_u32 s8, 0
3056; GFX9-NEXT:    v_mov_b32_e32 v3, s5
3057; GFX9-NEXT:    s_addc_u32 s3, s3, s7
3058; GFX9-NEXT:    v_mov_b32_e32 v0, s6
3059; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
3060; GFX9-NEXT:    v_mov_b32_e32 v1, s7
3061; GFX9-NEXT:    s_cmp_eq_u64 s[2:3], s[6:7]
3062; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
3063; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
3064; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
3065; GFX9-NEXT:    s_and_b32 s4, 1, s6
3066; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
3067; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
3068; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3069; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
3070; GFX9-NEXT:    v_mov_b32_e32 v1, s0
3071; GFX9-NEXT:    v_mov_b32_e32 v2, s1
3072; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
3073; GFX9-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc
3074; GFX9-NEXT:    v_cndmask_b32_e64 v1, v2, -1, vcc
3075; GFX9-NEXT:    v_mov_b32_e32 v2, s2
3076; GFX9-NEXT:    v_mov_b32_e32 v3, s3
3077; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
3078; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, -1, vcc
3079; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
3080; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
3081; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
3082; GFX9-NEXT:    v_readfirstlane_b32 s3, v3
3083; GFX9-NEXT:    ; return to shader part epilog
3084;
3085; GFX10-LABEL: s_uaddsat_i128:
3086; GFX10:       ; %bb.0:
3087; GFX10-NEXT:    s_add_u32 s0, s0, s4
3088; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
3089; GFX10-NEXT:    s_and_b32 s8, s8, 1
3090; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
3091; GFX10-NEXT:    s_addc_u32 s1, s1, s5
3092; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
3093; GFX10-NEXT:    v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
3094; GFX10-NEXT:    s_and_b32 s8, s8, 1
3095; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
3096; GFX10-NEXT:    s_addc_u32 s2, s2, s6
3097; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
3098; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
3099; GFX10-NEXT:    s_and_b32 s8, s8, 1
3100; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
3101; GFX10-NEXT:    s_addc_u32 s3, s3, s7
3102; GFX10-NEXT:    s_cmp_eq_u64 s[2:3], s[6:7]
3103; GFX10-NEXT:    v_cmp_lt_u64_e64 s5, s[2:3], s[6:7]
3104; GFX10-NEXT:    s_cselect_b32 s4, 1, 0
3105; GFX10-NEXT:    s_and_b32 s4, 1, s4
3106; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
3107; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s5
3108; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
3109; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
3110; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
3111; GFX10-NEXT:    v_cndmask_b32_e64 v0, s0, -1, vcc_lo
3112; GFX10-NEXT:    v_cndmask_b32_e64 v1, s1, -1, vcc_lo
3113; GFX10-NEXT:    v_cndmask_b32_e64 v2, s2, -1, vcc_lo
3114; GFX10-NEXT:    v_cndmask_b32_e64 v3, s3, -1, vcc_lo
3115; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
3116; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
3117; GFX10-NEXT:    v_readfirstlane_b32 s2, v2
3118; GFX10-NEXT:    v_readfirstlane_b32 s3, v3
3119; GFX10-NEXT:    ; return to shader part epilog
3120  %result = call i128 @llvm.uadd.sat.i128(i128 %lhs, i128 %rhs)
3121  ret i128 %result
3122}
3123
; Test: llvm.uadd.sat on i128 with mixed operands: %lhs is marked inreg and
; %rhs is not. In the expected assembly %lhs is read from s0-s3 and %rhs from
; v0-v3; the add is a VALU carry chain into v4-v7, overflow is detected by
; comparing the sum against the rhs registers, and -1 is selected on overflow.
; The result is bitcast to <4 x float> for the return, and the checks show it
; left in v0-v3 with no v_readfirstlane.
; The check lines below are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing them by hand.
3124define amdgpu_ps <4 x float> @uaddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
3125; GFX6-LABEL: uaddsat_i128_sv:
3126; GFX6:       ; %bb.0:
3127; GFX6-NEXT:    v_mov_b32_e32 v5, s1
3128; GFX6-NEXT:    v_add_i32_e32 v4, vcc, s0, v0
3129; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v5, v1, vcc
3130; GFX6-NEXT:    v_mov_b32_e32 v6, s2
3131; GFX6-NEXT:    v_mov_b32_e32 v7, s3
3132; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, v6, v2, vcc
3133; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v7, v3, vcc
3134; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
3135; GFX6-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
3136; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
3137; GFX6-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
3138; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
3139; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
3140; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
3141; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
3142; GFX6-NEXT:    v_cndmask_b32_e64 v0, v4, -1, vcc
3143; GFX6-NEXT:    v_cndmask_b32_e64 v1, v5, -1, vcc
3144; GFX6-NEXT:    v_cndmask_b32_e64 v2, v6, -1, vcc
3145; GFX6-NEXT:    v_cndmask_b32_e64 v3, v7, -1, vcc
3146; GFX6-NEXT:    ; return to shader part epilog
3147;
3148; GFX8-LABEL: uaddsat_i128_sv:
3149; GFX8:       ; %bb.0:
3150; GFX8-NEXT:    v_mov_b32_e32 v5, s1
3151; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s0, v0
3152; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v5, v1, vcc
3153; GFX8-NEXT:    v_mov_b32_e32 v6, s2
3154; GFX8-NEXT:    v_mov_b32_e32 v7, s3
3155; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, v6, v2, vcc
3156; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, v7, v3, vcc
3157; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
3158; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
3159; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
3160; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
3161; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
3162; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
3163; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
3164; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
3165; GFX8-NEXT:    v_cndmask_b32_e64 v0, v4, -1, vcc
3166; GFX8-NEXT:    v_cndmask_b32_e64 v1, v5, -1, vcc
3167; GFX8-NEXT:    v_cndmask_b32_e64 v2, v6, -1, vcc
3168; GFX8-NEXT:    v_cndmask_b32_e64 v3, v7, -1, vcc
3169; GFX8-NEXT:    ; return to shader part epilog
3170;
3171; GFX9-LABEL: uaddsat_i128_sv:
3172; GFX9:       ; %bb.0:
3173; GFX9-NEXT:    v_mov_b32_e32 v5, s1
3174; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, s0, v0
3175; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v1, vcc
3176; GFX9-NEXT:    v_mov_b32_e32 v6, s2
3177; GFX9-NEXT:    v_mov_b32_e32 v7, s3
3178; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v2, vcc
3179; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v3, vcc
3180; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1]
3181; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
3182; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
3183; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
3184; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
3185; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
3186; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
3187; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
3188; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, -1, vcc
3189; GFX9-NEXT:    v_cndmask_b32_e64 v1, v5, -1, vcc
3190; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, -1, vcc
3191; GFX9-NEXT:    v_cndmask_b32_e64 v3, v7, -1, vcc
3192; GFX9-NEXT:    ; return to shader part epilog
3193;
3194; GFX10-LABEL: uaddsat_i128_sv:
3195; GFX10:       ; %bb.0:
3196; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, s0, v0
3197; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
3198; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
3199; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
3200; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1]
3201; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
3202; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[6:7], v[2:3]
3203; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
3204; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
3205; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
3206; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
3207; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
3208; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, -1, vcc_lo
3209; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, -1, vcc_lo
3210; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, -1, vcc_lo
3211; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, -1, vcc_lo
3212; GFX10-NEXT:    ; return to shader part epilog
3213  %result = call i128 @llvm.uadd.sat.i128(i128 %lhs, i128 %rhs)
3214  %cast = bitcast i128 %result to <4 x float>
3215  ret <4 x float> %cast
3216}
3217
; Test: llvm.uadd.sat on i128 with mixed operands, mirror image of the
; uaddsat_i128_sv case: %lhs is not marked inreg and %rhs is. In the expected
; assembly %lhs is read from v0-v3 and %rhs from s0-s3; the sum is accumulated
; in place in v0-v3, and overflow is detected with unsigned greater-than
; compares that take the rhs s-register pair as the first source and the sum as
; the second. The result is bitcast to <4 x float> for the return.
; The check lines below are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing them by hand.
3218define amdgpu_ps <4 x float> @uaddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
3219; GFX6-LABEL: uaddsat_i128_vs:
3220; GFX6:       ; %bb.0:
3221; GFX6-NEXT:    v_mov_b32_e32 v4, s1
3222; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
3223; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
3224; GFX6-NEXT:    v_mov_b32_e32 v4, s2
3225; GFX6-NEXT:    v_mov_b32_e32 v5, s3
3226; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
3227; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
3228; GFX6-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
3229; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
3230; GFX6-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
3231; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
3232; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3]
3233; GFX6-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
3234; GFX6-NEXT:    v_and_b32_e32 v4, 1, v4
3235; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
3236; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
3237; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
3238; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
3239; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, -1, vcc
3240; GFX6-NEXT:    ; return to shader part epilog
3241;
3242; GFX8-LABEL: uaddsat_i128_vs:
3243; GFX8:       ; %bb.0:
3244; GFX8-NEXT:    v_mov_b32_e32 v4, s1
3245; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
3246; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
3247; GFX8-NEXT:    v_mov_b32_e32 v4, s2
3248; GFX8-NEXT:    v_mov_b32_e32 v5, s3
3249; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
3250; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
3251; GFX8-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
3252; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
3253; GFX8-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
3254; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
3255; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3]
3256; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
3257; GFX8-NEXT:    v_and_b32_e32 v4, 1, v4
3258; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
3259; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
3260; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
3261; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
3262; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, -1, vcc
3263; GFX8-NEXT:    ; return to shader part epilog
3264;
3265; GFX9-LABEL: uaddsat_i128_vs:
3266; GFX9:       ; %bb.0:
3267; GFX9-NEXT:    v_mov_b32_e32 v4, s1
3268; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
3269; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
3270; GFX9-NEXT:    v_mov_b32_e32 v4, s2
3271; GFX9-NEXT:    v_mov_b32_e32 v5, s3
3272; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v4, vcc
3273; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
3274; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
3275; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
3276; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
3277; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
3278; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3]
3279; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
3280; GFX9-NEXT:    v_and_b32_e32 v4, 1, v4
3281; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
3282; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
3283; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
3284; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
3285; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, -1, vcc
3286; GFX9-NEXT:    ; return to shader part epilog
3287;
3288; GFX10-LABEL: uaddsat_i128_vs:
3289; GFX10:       ; %bb.0:
3290; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, s0
3291; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
3292; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo
3293; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo
3294; GFX10-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1]
3295; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
3296; GFX10-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[2:3]
3297; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
3298; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[2:3]
3299; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc_lo
3300; GFX10-NEXT:    v_and_b32_e32 v4, 1, v4
3301; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
3302; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
3303; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc_lo
3304; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc_lo
3305; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, -1, vcc_lo
3306; GFX10-NEXT:    ; return to shader part epilog
3307  %result = call i128 @llvm.uadd.sat.i128(i128 %lhs, i128 %rhs)
3308  %cast = bitcast i128 %result to <4 x float>
3309  ret <4 x float> %cast
3310}
3311
; Test: llvm.uadd.sat on <2 x i128> in a function with no explicit calling
; convention and no inreg operands. In the expected assembly %lhs occupies
; v0-v7 and %rhs v8-v15; each 128-bit element gets its own four-instruction
; carry chain, an overflow test built from two 64-bit less-than compares chosen
; between by a high-half equality compare, and four v_cndmask selects of -1.
; The function returns through s_setpc_b64 s[30:31]. In the GFX10 checks the
; two elements' instruction sequences are interleaved and the second element
; uses s4 as its select mask instead of vcc_lo.
; The check lines below are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing them by hand.
3312define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
3313; GFX6-LABEL: v_uaddsat_v2i128:
3314; GFX6:       ; %bb.0:
3315; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3316; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
3317; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v9, vcc
3318; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v2, v10, vcc
3319; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v11, vcc
3320; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, v[0:1], v[8:9]
3321; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
3322; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[10:11]
3323; GFX6-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
3324; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11]
3325; GFX6-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
3326; GFX6-NEXT:    v_and_b32_e32 v8, 1, v8
3327; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
3328; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
3329; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
3330; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
3331; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, -1, vcc
3332; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
3333; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v5, v13, vcc
3334; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, v6, v14, vcc
3335; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v7, v15, vcc
3336; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, v[4:5], v[12:13]
3337; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
3338; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, v[6:7], v[14:15]
3339; GFX6-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
3340; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[14:15]
3341; GFX6-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
3342; GFX6-NEXT:    v_and_b32_e32 v8, 1, v8
3343; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
3344; GFX6-NEXT:    v_cndmask_b32_e64 v4, v4, -1, vcc
3345; GFX6-NEXT:    v_cndmask_b32_e64 v5, v5, -1, vcc
3346; GFX6-NEXT:    v_cndmask_b32_e64 v6, v6, -1, vcc
3347; GFX6-NEXT:    v_cndmask_b32_e64 v7, v7, -1, vcc
3348; GFX6-NEXT:    s_setpc_b64 s[30:31]
3349;
3350; GFX8-LABEL: v_uaddsat_v2i128:
3351; GFX8:       ; %bb.0:
3352; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3353; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v8
3354; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v9, vcc
3355; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, v2, v10, vcc
3356; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v3, v11, vcc
3357; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, v[0:1], v[8:9]
3358; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
3359; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[10:11]
3360; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
3361; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11]
3362; GFX8-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
3363; GFX8-NEXT:    v_and_b32_e32 v8, 1, v8
3364; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
3365; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
3366; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
3367; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
3368; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, -1, vcc
3369; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v12
3370; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v5, v13, vcc
3371; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, v6, v14, vcc
3372; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, v7, v15, vcc
3373; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, v[4:5], v[12:13]
3374; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
3375; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, v[6:7], v[14:15]
3376; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
3377; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[14:15]
3378; GFX8-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
3379; GFX8-NEXT:    v_and_b32_e32 v8, 1, v8
3380; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
3381; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, -1, vcc
3382; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, -1, vcc
3383; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, -1, vcc
3384; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, -1, vcc
3385; GFX8-NEXT:    s_setpc_b64 s[30:31]
3386;
3387; GFX9-LABEL: v_uaddsat_v2i128:
3388; GFX9:       ; %bb.0:
3389; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3390; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v8
3391; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v9, vcc
3392; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v10, vcc
3393; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v11, vcc
3394; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[0:1], v[8:9]
3395; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
3396; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[10:11]
3397; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
3398; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11]
3399; GFX9-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
3400; GFX9-NEXT:    v_and_b32_e32 v8, 1, v8
3401; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
3402; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc
3403; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc
3404; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
3405; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, -1, vcc
3406; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v12
3407; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v13, vcc
3408; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v14, vcc
3409; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v15, vcc
3410; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[4:5], v[12:13]
3411; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
3412; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[6:7], v[14:15]
3413; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
3414; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[14:15]
3415; GFX9-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
3416; GFX9-NEXT:    v_and_b32_e32 v8, 1, v8
3417; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
3418; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, -1, vcc
3419; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, -1, vcc
3420; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, -1, vcc
3421; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, -1, vcc
3422; GFX9-NEXT:    s_setpc_b64 s[30:31]
3423;
3424; GFX10-LABEL: v_uaddsat_v2i128:
3425; GFX10:       ; %bb.0:
3426; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3427; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3428; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v8
3429; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
3430; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo
3431; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo
3432; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[8:9]
3433; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
3434; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v12
3435; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v13, vcc_lo
3436; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, v6, v14, vcc_lo
3437; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, v7, v15, vcc_lo
3438; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[10:11]
3439; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
3440; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[12:13]
3441; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc_lo
3442; GFX10-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[6:7], v[14:15]
3443; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc_lo
3444; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[10:11]
3445; GFX10-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc_lo
3446; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[14:15]
3447; GFX10-NEXT:    v_and_b32_e32 v8, 1, v8
3448; GFX10-NEXT:    v_cndmask_b32_e32 v9, v13, v12, vcc_lo
3449; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
3450; GFX10-NEXT:    v_and_b32_e32 v9, 1, v9
3451; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
3452; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, -1, vcc_lo
3453; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, v9
3454; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc_lo
3455; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, -1, vcc_lo
3456; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, -1, s4
3457; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, -1, s4
3458; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, -1, s4
3459; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, -1, s4
3460; GFX10-NEXT:    s_setpc_b64 s[30:31]
3461  %result = call <2 x i128> @llvm.uadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
3462  ret <2 x i128> %result
3463}
3464
3465define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs) {
3466; GFX6-LABEL: s_uaddsat_v2i128:
3467; GFX6:       ; %bb.0:
3468; GFX6-NEXT:    s_add_u32 s0, s0, s8
3469; GFX6-NEXT:    s_cselect_b32 s16, 1, 0
3470; GFX6-NEXT:    s_and_b32 s16, s16, 1
3471; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
3472; GFX6-NEXT:    s_addc_u32 s1, s1, s9
3473; GFX6-NEXT:    s_cselect_b32 s16, 1, 0
3474; GFX6-NEXT:    s_and_b32 s16, s16, 1
3475; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
3476; GFX6-NEXT:    s_addc_u32 s2, s2, s10
3477; GFX6-NEXT:    s_cselect_b32 s16, 1, 0
3478; GFX6-NEXT:    v_mov_b32_e32 v2, s8
3479; GFX6-NEXT:    s_and_b32 s16, s16, 1
3480; GFX6-NEXT:    v_mov_b32_e32 v3, s9
3481; GFX6-NEXT:    s_cmp_lg_u32 s16, 0
3482; GFX6-NEXT:    v_mov_b32_e32 v0, s10
3483; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
3484; GFX6-NEXT:    s_addc_u32 s3, s3, s11
3485; GFX6-NEXT:    v_mov_b32_e32 v1, s11
3486; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
3487; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
3488; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
3489; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1]
3490; GFX6-NEXT:    v_mov_b32_e32 v1, s0
3491; GFX6-NEXT:    s_add_u32 s0, s4, s12
3492; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
3493; GFX6-NEXT:    v_mov_b32_e32 v2, s1
3494; GFX6-NEXT:    s_cselect_b32 s1, 1, 0
3495; GFX6-NEXT:    s_and_b32 s1, s1, 1
3496; GFX6-NEXT:    s_cmp_lg_u32 s1, 0
3497; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
3498; GFX6-NEXT:    s_addc_u32 s1, s5, s13
3499; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
3500; GFX6-NEXT:    v_mov_b32_e32 v0, s2
3501; GFX6-NEXT:    s_cselect_b32 s2, 1, 0
3502; GFX6-NEXT:    s_and_b32 s2, s2, 1
3503; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
3504; GFX6-NEXT:    s_addc_u32 s2, s6, s14
3505; GFX6-NEXT:    v_cndmask_b32_e64 v4, v1, -1, vcc
3506; GFX6-NEXT:    v_cndmask_b32_e64 v5, v2, -1, vcc
3507; GFX6-NEXT:    v_mov_b32_e32 v1, s3
3508; GFX6-NEXT:    s_cselect_b32 s3, 1, 0
3509; GFX6-NEXT:    v_mov_b32_e32 v2, s12
3510; GFX6-NEXT:    s_and_b32 s3, s3, 1
3511; GFX6-NEXT:    v_mov_b32_e32 v3, s13
3512; GFX6-NEXT:    v_cndmask_b32_e64 v6, v0, -1, vcc
3513; GFX6-NEXT:    v_cndmask_b32_e64 v7, v1, -1, vcc
3514; GFX6-NEXT:    s_cmp_lg_u32 s3, 0
3515; GFX6-NEXT:    v_mov_b32_e32 v0, s14
3516; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
3517; GFX6-NEXT:    s_addc_u32 s3, s7, s15
3518; GFX6-NEXT:    v_mov_b32_e32 v1, s15
3519; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
3520; GFX6-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
3521; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
3522; GFX6-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1]
3523; GFX6-NEXT:    v_mov_b32_e32 v1, s0
3524; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
3525; GFX6-NEXT:    v_and_b32_e32 v0, 1, v0
3526; GFX6-NEXT:    v_mov_b32_e32 v2, s1
3527; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
3528; GFX6-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc
3529; GFX6-NEXT:    v_cndmask_b32_e64 v1, v2, -1, vcc
3530; GFX6-NEXT:    v_mov_b32_e32 v2, s2
3531; GFX6-NEXT:    v_mov_b32_e32 v3, s3
3532; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
3533; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, -1, vcc
3534; GFX6-NEXT:    v_readfirstlane_b32 s0, v4
3535; GFX6-NEXT:    v_readfirstlane_b32 s1, v5
3536; GFX6-NEXT:    v_readfirstlane_b32 s2, v6
3537; GFX6-NEXT:    v_readfirstlane_b32 s3, v7
3538; GFX6-NEXT:    v_readfirstlane_b32 s4, v0
3539; GFX6-NEXT:    v_readfirstlane_b32 s5, v1
3540; GFX6-NEXT:    v_readfirstlane_b32 s6, v2
3541; GFX6-NEXT:    v_readfirstlane_b32 s7, v3
3542; GFX6-NEXT:    ; return to shader part epilog
3543;
3544; GFX8-LABEL: s_uaddsat_v2i128:
3545; GFX8:       ; %bb.0:
3546; GFX8-NEXT:    s_add_u32 s0, s0, s8
3547; GFX8-NEXT:    s_cselect_b32 s16, 1, 0
3548; GFX8-NEXT:    s_and_b32 s16, s16, 1
3549; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
3550; GFX8-NEXT:    s_addc_u32 s1, s1, s9
3551; GFX8-NEXT:    s_cselect_b32 s16, 1, 0
3552; GFX8-NEXT:    s_and_b32 s16, s16, 1
3553; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
3554; GFX8-NEXT:    s_addc_u32 s2, s2, s10
3555; GFX8-NEXT:    s_cselect_b32 s16, 1, 0
3556; GFX8-NEXT:    s_and_b32 s16, s16, 1
3557; GFX8-NEXT:    v_mov_b32_e32 v2, s8
3558; GFX8-NEXT:    s_cmp_lg_u32 s16, 0
3559; GFX8-NEXT:    v_mov_b32_e32 v3, s9
3560; GFX8-NEXT:    s_addc_u32 s3, s3, s11
3561; GFX8-NEXT:    v_mov_b32_e32 v0, s10
3562; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
3563; GFX8-NEXT:    v_mov_b32_e32 v1, s11
3564; GFX8-NEXT:    s_cmp_eq_u64 s[2:3], s[10:11]
3565; GFX8-NEXT:    s_cselect_b32 s10, 1, 0
3566; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
3567; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
3568; GFX8-NEXT:    s_and_b32 s8, 1, s10
3569; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
3570; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s8
3571; GFX8-NEXT:    v_mov_b32_e32 v1, s0
3572; GFX8-NEXT:    s_add_u32 s0, s4, s12
3573; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3574; GFX8-NEXT:    v_mov_b32_e32 v2, s1
3575; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
3576; GFX8-NEXT:    s_and_b32 s1, s1, 1
3577; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
3578; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
3579; GFX8-NEXT:    s_addc_u32 s1, s5, s13
3580; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
3581; GFX8-NEXT:    v_mov_b32_e32 v0, s2
3582; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
3583; GFX8-NEXT:    s_and_b32 s2, s2, 1
3584; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
3585; GFX8-NEXT:    s_addc_u32 s2, s6, s14
3586; GFX8-NEXT:    v_cndmask_b32_e64 v4, v1, -1, vcc
3587; GFX8-NEXT:    v_mov_b32_e32 v1, s3
3588; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
3589; GFX8-NEXT:    v_cndmask_b32_e64 v5, v2, -1, vcc
3590; GFX8-NEXT:    s_and_b32 s3, s3, 1
3591; GFX8-NEXT:    v_mov_b32_e32 v2, s12
3592; GFX8-NEXT:    s_cmp_lg_u32 s3, 0
3593; GFX8-NEXT:    v_mov_b32_e32 v3, s13
3594; GFX8-NEXT:    v_cndmask_b32_e64 v6, v0, -1, vcc
3595; GFX8-NEXT:    v_cndmask_b32_e64 v7, v1, -1, vcc
3596; GFX8-NEXT:    s_addc_u32 s3, s7, s15
3597; GFX8-NEXT:    v_mov_b32_e32 v0, s14
3598; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
3599; GFX8-NEXT:    v_mov_b32_e32 v1, s15
3600; GFX8-NEXT:    s_cmp_eq_u64 s[2:3], s[14:15]
3601; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
3602; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
3603; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
3604; GFX8-NEXT:    s_and_b32 s4, 1, s4
3605; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
3606; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
3607; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3608; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
3609; GFX8-NEXT:    v_mov_b32_e32 v1, s0
3610; GFX8-NEXT:    v_mov_b32_e32 v2, s1
3611; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
3612; GFX8-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc
3613; GFX8-NEXT:    v_cndmask_b32_e64 v1, v2, -1, vcc
3614; GFX8-NEXT:    v_mov_b32_e32 v2, s2
3615; GFX8-NEXT:    v_mov_b32_e32 v3, s3
3616; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
3617; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, -1, vcc
3618; GFX8-NEXT:    v_readfirstlane_b32 s0, v4
3619; GFX8-NEXT:    v_readfirstlane_b32 s1, v5
3620; GFX8-NEXT:    v_readfirstlane_b32 s2, v6
3621; GFX8-NEXT:    v_readfirstlane_b32 s3, v7
3622; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
3623; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
3624; GFX8-NEXT:    v_readfirstlane_b32 s6, v2
3625; GFX8-NEXT:    v_readfirstlane_b32 s7, v3
3626; GFX8-NEXT:    ; return to shader part epilog
3627;
3628; GFX9-LABEL: s_uaddsat_v2i128:
3629; GFX9:       ; %bb.0:
3630; GFX9-NEXT:    s_add_u32 s0, s0, s8
3631; GFX9-NEXT:    s_cselect_b32 s16, 1, 0
3632; GFX9-NEXT:    s_and_b32 s16, s16, 1
3633; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
3634; GFX9-NEXT:    s_addc_u32 s1, s1, s9
3635; GFX9-NEXT:    s_cselect_b32 s16, 1, 0
3636; GFX9-NEXT:    s_and_b32 s16, s16, 1
3637; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
3638; GFX9-NEXT:    s_addc_u32 s2, s2, s10
3639; GFX9-NEXT:    s_cselect_b32 s16, 1, 0
3640; GFX9-NEXT:    s_and_b32 s16, s16, 1
3641; GFX9-NEXT:    v_mov_b32_e32 v2, s8
3642; GFX9-NEXT:    s_cmp_lg_u32 s16, 0
3643; GFX9-NEXT:    v_mov_b32_e32 v3, s9
3644; GFX9-NEXT:    s_addc_u32 s3, s3, s11
3645; GFX9-NEXT:    v_mov_b32_e32 v0, s10
3646; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
3647; GFX9-NEXT:    v_mov_b32_e32 v1, s11
3648; GFX9-NEXT:    s_cmp_eq_u64 s[2:3], s[10:11]
3649; GFX9-NEXT:    s_cselect_b32 s10, 1, 0
3650; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
3651; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
3652; GFX9-NEXT:    s_and_b32 s8, 1, s10
3653; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
3654; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s8
3655; GFX9-NEXT:    v_mov_b32_e32 v1, s0
3656; GFX9-NEXT:    s_add_u32 s0, s4, s12
3657; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3658; GFX9-NEXT:    v_mov_b32_e32 v2, s1
3659; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
3660; GFX9-NEXT:    s_and_b32 s1, s1, 1
3661; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
3662; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
3663; GFX9-NEXT:    s_addc_u32 s1, s5, s13
3664; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
3665; GFX9-NEXT:    v_mov_b32_e32 v0, s2
3666; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
3667; GFX9-NEXT:    s_and_b32 s2, s2, 1
3668; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
3669; GFX9-NEXT:    s_addc_u32 s2, s6, s14
3670; GFX9-NEXT:    v_cndmask_b32_e64 v4, v1, -1, vcc
3671; GFX9-NEXT:    v_mov_b32_e32 v1, s3
3672; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
3673; GFX9-NEXT:    v_cndmask_b32_e64 v5, v2, -1, vcc
3674; GFX9-NEXT:    s_and_b32 s3, s3, 1
3675; GFX9-NEXT:    v_mov_b32_e32 v2, s12
3676; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
3677; GFX9-NEXT:    v_mov_b32_e32 v3, s13
3678; GFX9-NEXT:    v_cndmask_b32_e64 v6, v0, -1, vcc
3679; GFX9-NEXT:    v_cndmask_b32_e64 v7, v1, -1, vcc
3680; GFX9-NEXT:    s_addc_u32 s3, s7, s15
3681; GFX9-NEXT:    v_mov_b32_e32 v0, s14
3682; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
3683; GFX9-NEXT:    v_mov_b32_e32 v1, s15
3684; GFX9-NEXT:    s_cmp_eq_u64 s[2:3], s[14:15]
3685; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
3686; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
3687; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
3688; GFX9-NEXT:    s_and_b32 s4, 1, s4
3689; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
3690; GFX9-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s4
3691; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
3692; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
3693; GFX9-NEXT:    v_mov_b32_e32 v1, s0
3694; GFX9-NEXT:    v_mov_b32_e32 v2, s1
3695; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
3696; GFX9-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc
3697; GFX9-NEXT:    v_cndmask_b32_e64 v1, v2, -1, vcc
3698; GFX9-NEXT:    v_mov_b32_e32 v2, s2
3699; GFX9-NEXT:    v_mov_b32_e32 v3, s3
3700; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, -1, vcc
3701; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, -1, vcc
3702; GFX9-NEXT:    v_readfirstlane_b32 s0, v4
3703; GFX9-NEXT:    v_readfirstlane_b32 s1, v5
3704; GFX9-NEXT:    v_readfirstlane_b32 s2, v6
3705; GFX9-NEXT:    v_readfirstlane_b32 s3, v7
3706; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
3707; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
3708; GFX9-NEXT:    v_readfirstlane_b32 s6, v2
3709; GFX9-NEXT:    v_readfirstlane_b32 s7, v3
3710; GFX9-NEXT:    ; return to shader part epilog
3711;
3712; GFX10-LABEL: s_uaddsat_v2i128:
3713; GFX10:       ; %bb.0:
3714; GFX10-NEXT:    s_add_u32 s0, s0, s8
3715; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
3716; GFX10-NEXT:    s_and_b32 s16, s16, 1
3717; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
3718; GFX10-NEXT:    s_addc_u32 s1, s1, s9
3719; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
3720; GFX10-NEXT:    v_cmp_lt_u64_e64 s8, s[0:1], s[8:9]
3721; GFX10-NEXT:    s_and_b32 s16, s16, 1
3722; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
3723; GFX10-NEXT:    s_addc_u32 s2, s2, s10
3724; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
3725; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s8
3726; GFX10-NEXT:    s_and_b32 s16, s16, 1
3727; GFX10-NEXT:    s_cmp_lg_u32 s16, 0
3728; GFX10-NEXT:    s_addc_u32 s3, s3, s11
3729; GFX10-NEXT:    s_cmp_eq_u64 s[2:3], s[10:11]
3730; GFX10-NEXT:    v_cmp_lt_u64_e64 s10, s[2:3], s[10:11]
3731; GFX10-NEXT:    s_cselect_b32 s16, 1, 0
3732; GFX10-NEXT:    s_and_b32 s8, 1, s16
3733; GFX10-NEXT:    s_add_u32 s4, s4, s12
3734; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
3735; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s8
3736; GFX10-NEXT:    s_and_b32 s9, s9, 1
3737; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s10
3738; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
3739; GFX10-NEXT:    s_addc_u32 s5, s5, s13
3740; GFX10-NEXT:    s_cselect_b32 s9, 1, 0
3741; GFX10-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
3742; GFX10-NEXT:    s_and_b32 s9, s9, 1
3743; GFX10-NEXT:    s_cmp_lg_u32 s9, 0
3744; GFX10-NEXT:    v_cmp_lt_u64_e64 s9, s[4:5], s[12:13]
3745; GFX10-NEXT:    s_addc_u32 s6, s6, s14
3746; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
3747; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
3748; GFX10-NEXT:    s_and_b32 s8, s8, 1
3749; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s9
3750; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
3751; GFX10-NEXT:    s_addc_u32 s7, s7, s15
3752; GFX10-NEXT:    s_cmp_eq_u64 s[6:7], s[14:15]
3753; GFX10-NEXT:    v_cmp_lt_u64_e64 s9, s[6:7], s[14:15]
3754; GFX10-NEXT:    s_cselect_b32 s8, 1, 0
3755; GFX10-NEXT:    s_and_b32 s8, 1, s8
3756; GFX10-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s8
3757; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s9
3758; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
3759; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
3760; GFX10-NEXT:    v_and_b32_e32 v0, 1, v1
3761; GFX10-NEXT:    v_cndmask_b32_e64 v1, s0, -1, vcc_lo
3762; GFX10-NEXT:    v_cndmask_b32_e64 v2, s1, -1, vcc_lo
3763; GFX10-NEXT:    v_cndmask_b32_e64 v3, s2, -1, vcc_lo
3764; GFX10-NEXT:    v_cndmask_b32_e64 v4, s3, -1, vcc_lo
3765; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
3766; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
3767; GFX10-NEXT:    v_readfirstlane_b32 s1, v2
3768; GFX10-NEXT:    v_readfirstlane_b32 s2, v3
3769; GFX10-NEXT:    v_readfirstlane_b32 s3, v4
3770; GFX10-NEXT:    v_cndmask_b32_e64 v0, s4, -1, vcc_lo
3771; GFX10-NEXT:    v_cndmask_b32_e64 v1, s5, -1, vcc_lo
3772; GFX10-NEXT:    v_cndmask_b32_e64 v2, s6, -1, vcc_lo
3773; GFX10-NEXT:    v_cndmask_b32_e64 v3, s7, -1, vcc_lo
3774; GFX10-NEXT:    v_readfirstlane_b32 s4, v0
3775; GFX10-NEXT:    v_readfirstlane_b32 s5, v1
3776; GFX10-NEXT:    v_readfirstlane_b32 s6, v2
3777; GFX10-NEXT:    v_readfirstlane_b32 s7, v3
3778; GFX10-NEXT:    ; return to shader part epilog
3779  %result = call <2 x i128> @llvm.uadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
3780  ret <2 x i128> %result
3781}
3782
3783declare i7 @llvm.uadd.sat.i7(i7, i7) #0
3784declare i8 @llvm.uadd.sat.i8(i8, i8) #0
3785declare <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8>, <2 x i8>) #0
3786declare <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8>, <4 x i8>) #0
3787
3788declare i16 @llvm.uadd.sat.i16(i16, i16) #0
3789declare <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16>, <2 x i16>) #0
3790declare <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16>, <3 x i16>) #0
3791declare <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16>, <4 x i16>) #0
3792declare <5 x i16> @llvm.uadd.sat.v5i16(<5 x i16>, <5 x i16>) #0
3793declare <6 x i16> @llvm.uadd.sat.v6i16(<6 x i16>, <6 x i16>) #0
3794declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16>, <8 x i16>) #0
3795
3796declare i24 @llvm.uadd.sat.i24(i24, i24) #0
3797
3798declare i32 @llvm.uadd.sat.i32(i32, i32) #0
3799declare <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32>, <2 x i32>) #0
3800declare <3 x i32> @llvm.uadd.sat.v3i32(<3 x i32>, <3 x i32>) #0
3801declare <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32>, <4 x i32>) #0
3802declare <5 x i32> @llvm.uadd.sat.v5i32(<5 x i32>, <5 x i32>) #0
3803declare <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32>, <16 x i32>) #0
3804
3805declare i48 @llvm.uadd.sat.i48(i48, i48) #0
3806
3807declare i64 @llvm.uadd.sat.i64(i64, i64) #0
3808declare <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64>, <2 x i64>) #0
3809
3810declare i128 @llvm.uadd.sat.i128(i128, i128) #0
3811declare <2 x i128> @llvm.uadd.sat.v2i128(<2 x i128>, <2 x i128>) #0
3812
3813attributes #0 = { nounwind readnone speculatable willreturn }
3814