1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s 3; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s 4; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s 5; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s 6 7define i7 @v_uaddsat_i7(i7 %lhs, i7 %rhs) { 8; GFX6-LABEL: v_uaddsat_i7: 9; GFX6: ; %bb.0: 10; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11; GFX6-NEXT: v_lshlrev_b32_e32 v0, 25, v0 12; GFX6-NEXT: v_lshlrev_b32_e32 v1, 25, v1 13; GFX6-NEXT: v_xor_b32_e32 v2, -1, v0 14; GFX6-NEXT: v_min_u32_e32 v1, v2, v1 15; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 16; GFX6-NEXT: v_lshrrev_b32_e32 v0, 25, v0 17; GFX6-NEXT: s_setpc_b64 s[30:31] 18; 19; GFX8-LABEL: v_uaddsat_i7: 20; GFX8: ; %bb.0: 21; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 22; GFX8-NEXT: v_lshlrev_b16_e32 v0, 9, v0 23; GFX8-NEXT: v_lshlrev_b16_e32 v1, 9, v1 24; GFX8-NEXT: v_add_u16_e64 v0, v0, v1 clamp 25; GFX8-NEXT: v_lshrrev_b16_e32 v0, 9, v0 26; GFX8-NEXT: s_setpc_b64 s[30:31] 27; 28; GFX9-LABEL: v_uaddsat_i7: 29; GFX9: ; %bb.0: 30; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 31; GFX9-NEXT: v_lshlrev_b16_e32 v0, 9, v0 32; GFX9-NEXT: v_lshlrev_b16_e32 v1, 9, v1 33; GFX9-NEXT: v_add_u16_e64 v0, v0, v1 clamp 34; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0 35; GFX9-NEXT: s_setpc_b64 s[30:31] 36; 37; GFX10-LABEL: v_uaddsat_i7: 38; GFX10: ; %bb.0: 39; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 40; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 41; GFX10-NEXT: v_lshlrev_b16 v0, 9, v0 42; GFX10-NEXT: v_lshlrev_b16 v1, 9, v1 43; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 clamp 44; GFX10-NEXT: v_lshrrev_b16 v0, 9, v0 45; GFX10-NEXT: s_setpc_b64 s[30:31] 46 %result = call i7 @llvm.uadd.sat.i7(i7 %lhs, i7 %rhs) 
47 ret i7 %result 48} 49 50define amdgpu_ps i7 @s_uaddsat_i7(i7 inreg %lhs, i7 inreg %rhs) { 51; GFX6-LABEL: s_uaddsat_i7: 52; GFX6: ; %bb.0: 53; GFX6-NEXT: s_lshl_b32 s0, s0, 25 54; GFX6-NEXT: s_lshl_b32 s1, s1, 25 55; GFX6-NEXT: s_not_b32 s2, s0 56; GFX6-NEXT: s_min_u32 s1, s2, s1 57; GFX6-NEXT: s_add_i32 s0, s0, s1 58; GFX6-NEXT: s_lshr_b32 s0, s0, 25 59; GFX6-NEXT: ; return to shader part epilog 60; 61; GFX8-LABEL: s_uaddsat_i7: 62; GFX8: ; %bb.0: 63; GFX8-NEXT: s_bfe_u32 s2, 9, 0x100000 64; GFX8-NEXT: s_lshl_b32 s1, s1, s2 65; GFX8-NEXT: s_lshl_b32 s0, s0, s2 66; GFX8-NEXT: v_mov_b32_e32 v0, s1 67; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp 68; GFX8-NEXT: v_lshrrev_b16_e32 v0, 9, v0 69; GFX8-NEXT: v_readfirstlane_b32 s0, v0 70; GFX8-NEXT: ; return to shader part epilog 71; 72; GFX9-LABEL: s_uaddsat_i7: 73; GFX9: ; %bb.0: 74; GFX9-NEXT: s_bfe_u32 s2, 9, 0x100000 75; GFX9-NEXT: s_lshl_b32 s1, s1, s2 76; GFX9-NEXT: s_lshl_b32 s0, s0, s2 77; GFX9-NEXT: v_mov_b32_e32 v0, s1 78; GFX9-NEXT: v_add_u16_e64 v0, s0, v0 clamp 79; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0 80; GFX9-NEXT: v_readfirstlane_b32 s0, v0 81; GFX9-NEXT: ; return to shader part epilog 82; 83; GFX10-LABEL: s_uaddsat_i7: 84; GFX10: ; %bb.0: 85; GFX10-NEXT: s_bfe_u32 s2, 9, 0x100000 86; GFX10-NEXT: s_lshl_b32 s0, s0, s2 87; GFX10-NEXT: s_lshl_b32 s1, s1, s2 88; GFX10-NEXT: v_add_nc_u16 v0, s0, s1 clamp 89; GFX10-NEXT: v_lshrrev_b16 v0, 9, v0 90; GFX10-NEXT: v_readfirstlane_b32 s0, v0 91; GFX10-NEXT: ; return to shader part epilog 92 %result = call i7 @llvm.uadd.sat.i7(i7 %lhs, i7 %rhs) 93 ret i7 %result 94} 95 96define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) { 97; GFX6-LABEL: v_uaddsat_i8: 98; GFX6: ; %bb.0: 99; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 100; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 101; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 102; GFX6-NEXT: v_xor_b32_e32 v2, -1, v0 103; GFX6-NEXT: v_min_u32_e32 v1, v2, v1 104; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 105; GFX6-NEXT: v_lshrrev_b32_e32 
v0, 24, v0 106; GFX6-NEXT: s_setpc_b64 s[30:31] 107; 108; GFX8-LABEL: v_uaddsat_i8: 109; GFX8: ; %bb.0: 110; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 111; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 112; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 113; GFX8-NEXT: v_add_u16_e64 v0, v0, v1 clamp 114; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0 115; GFX8-NEXT: s_setpc_b64 s[30:31] 116; 117; GFX9-LABEL: v_uaddsat_i8: 118; GFX9: ; %bb.0: 119; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 120; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 121; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 122; GFX9-NEXT: v_add_u16_e64 v0, v0, v1 clamp 123; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0 124; GFX9-NEXT: s_setpc_b64 s[30:31] 125; 126; GFX10-LABEL: v_uaddsat_i8: 127; GFX10: ; %bb.0: 128; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 129; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 130; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 131; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 132; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 clamp 133; GFX10-NEXT: v_lshrrev_b16 v0, 8, v0 134; GFX10-NEXT: s_setpc_b64 s[30:31] 135 %result = call i8 @llvm.uadd.sat.i8(i8 %lhs, i8 %rhs) 136 ret i8 %result 137} 138 139define amdgpu_ps i8 @s_uaddsat_i8(i8 inreg %lhs, i8 inreg %rhs) { 140; GFX6-LABEL: s_uaddsat_i8: 141; GFX6: ; %bb.0: 142; GFX6-NEXT: s_lshl_b32 s0, s0, 24 143; GFX6-NEXT: s_lshl_b32 s1, s1, 24 144; GFX6-NEXT: s_not_b32 s2, s0 145; GFX6-NEXT: s_min_u32 s1, s2, s1 146; GFX6-NEXT: s_add_i32 s0, s0, s1 147; GFX6-NEXT: s_lshr_b32 s0, s0, 24 148; GFX6-NEXT: ; return to shader part epilog 149; 150; GFX8-LABEL: s_uaddsat_i8: 151; GFX8: ; %bb.0: 152; GFX8-NEXT: s_bfe_u32 s2, 8, 0x100000 153; GFX8-NEXT: s_lshl_b32 s1, s1, s2 154; GFX8-NEXT: s_lshl_b32 s0, s0, s2 155; GFX8-NEXT: v_mov_b32_e32 v0, s1 156; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp 157; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0 158; GFX8-NEXT: v_readfirstlane_b32 s0, v0 159; GFX8-NEXT: ; return to shader part epilog 160; 161; GFX9-LABEL: s_uaddsat_i8: 162; GFX9: ; %bb.0: 163; 
GFX9-NEXT: s_bfe_u32 s2, 8, 0x100000 164; GFX9-NEXT: s_lshl_b32 s1, s1, s2 165; GFX9-NEXT: s_lshl_b32 s0, s0, s2 166; GFX9-NEXT: v_mov_b32_e32 v0, s1 167; GFX9-NEXT: v_add_u16_e64 v0, s0, v0 clamp 168; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0 169; GFX9-NEXT: v_readfirstlane_b32 s0, v0 170; GFX9-NEXT: ; return to shader part epilog 171; 172; GFX10-LABEL: s_uaddsat_i8: 173; GFX10: ; %bb.0: 174; GFX10-NEXT: s_bfe_u32 s2, 8, 0x100000 175; GFX10-NEXT: s_lshl_b32 s0, s0, s2 176; GFX10-NEXT: s_lshl_b32 s1, s1, s2 177; GFX10-NEXT: v_add_nc_u16 v0, s0, s1 clamp 178; GFX10-NEXT: v_lshrrev_b16 v0, 8, v0 179; GFX10-NEXT: v_readfirstlane_b32 s0, v0 180; GFX10-NEXT: ; return to shader part epilog 181 %result = call i8 @llvm.uadd.sat.i8(i8 %lhs, i8 %rhs) 182 ret i8 %result 183} 184 185define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { 186; GFX6-LABEL: v_uaddsat_v2i8: 187; GFX6: ; %bb.0: 188; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 189; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0 190; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 191; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v1 192; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 193; GFX6-NEXT: v_xor_b32_e32 v4, -1, v0 194; GFX6-NEXT: v_min_u32_e32 v1, v4, v1 195; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 196; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2 197; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 198; GFX6-NEXT: v_xor_b32_e32 v3, -1, v1 199; GFX6-NEXT: v_min_u32_e32 v2, v3, v2 200; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 201; GFX6-NEXT: v_lshrrev_b32_e32 v1, 24, v1 202; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0 203; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 204; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 205; GFX6-NEXT: s_setpc_b64 s[30:31] 206; 207; GFX8-LABEL: v_uaddsat_v2i8: 208; GFX8: ; %bb.0: 209; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 210; GFX8-NEXT: v_mov_b32_e32 v2, 8 211; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 212; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 
dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 213; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 214; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 215; GFX8-NEXT: v_add_u16_e64 v0, v0, v1 clamp 216; GFX8-NEXT: v_add_u16_e64 v1, v3, v2 clamp 217; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1 218; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 219; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 220; GFX8-NEXT: s_setpc_b64 s[30:31] 221; 222; GFX9-LABEL: v_uaddsat_v2i8: 223; GFX9: ; %bb.0: 224; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 225; GFX9-NEXT: s_mov_b32 s4, 8 226; GFX9-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 227; GFX9-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 228; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff 229; GFX9-NEXT: v_and_or_b32 v0, v0, v4, v2 230; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v3 231; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] 232; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] 233; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 clamp 234; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] 235; GFX9-NEXT: s_movk_i32 s4, 0xff 236; GFX9-NEXT: v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 237; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 238; GFX9-NEXT: s_setpc_b64 s[30:31] 239; 240; GFX10-LABEL: v_uaddsat_v2i8: 241; GFX10: ; %bb.0: 242; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 243; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 244; GFX10-NEXT: s_mov_b32 s4, 8 245; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff 246; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 247; GFX10-NEXT: v_lshrrev_b32_sdwa v4, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 248; GFX10-NEXT: s_movk_i32 s4, 0xff 
249; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3 250; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v4 251; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] 252; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] 253; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 clamp 254; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] 255; GFX10-NEXT: v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 256; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 257; GFX10-NEXT: s_setpc_b64 s[30:31] 258 %lhs = bitcast i16 %lhs.arg to <2 x i8> 259 %rhs = bitcast i16 %rhs.arg to <2 x i8> 260 %result = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs) 261 %cast.result = bitcast <2 x i8> %result to i16 262 ret i16 %cast.result 263} 264 265define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { 266; GFX6-LABEL: s_uaddsat_v2i8: 267; GFX6: ; %bb.0: 268; GFX6-NEXT: s_lshr_b32 s2, s0, 8 269; GFX6-NEXT: s_lshl_b32 s0, s0, 24 270; GFX6-NEXT: s_lshr_b32 s3, s1, 8 271; GFX6-NEXT: s_lshl_b32 s1, s1, 24 272; GFX6-NEXT: s_not_b32 s4, s0 273; GFX6-NEXT: s_min_u32 s1, s4, s1 274; GFX6-NEXT: s_add_i32 s0, s0, s1 275; GFX6-NEXT: s_lshl_b32 s1, s2, 24 276; GFX6-NEXT: s_lshl_b32 s2, s3, 24 277; GFX6-NEXT: s_not_b32 s3, s1 278; GFX6-NEXT: s_min_u32 s2, s3, s2 279; GFX6-NEXT: s_add_i32 s1, s1, s2 280; GFX6-NEXT: s_lshr_b32 s1, s1, 24 281; GFX6-NEXT: s_lshr_b32 s0, s0, 24 282; GFX6-NEXT: s_lshl_b32 s1, s1, 8 283; GFX6-NEXT: s_or_b32 s0, s0, s1 284; GFX6-NEXT: ; return to shader part epilog 285; 286; GFX8-LABEL: s_uaddsat_v2i8: 287; GFX8: ; %bb.0: 288; GFX8-NEXT: s_bfe_u32 s4, 8, 0x100000 289; GFX8-NEXT: s_lshr_b32 s3, s1, 8 290; GFX8-NEXT: s_lshl_b32 s1, s1, s4 291; GFX8-NEXT: s_lshr_b32 s2, s0, 8 292; GFX8-NEXT: s_lshl_b32 s0, s0, s4 293; GFX8-NEXT: v_mov_b32_e32 v0, s1 294; GFX8-NEXT: s_lshl_b32 s1, s3, s4 295; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp 296; GFX8-NEXT: s_lshl_b32 s0, 
s2, s4 297; GFX8-NEXT: v_mov_b32_e32 v1, s1 298; GFX8-NEXT: v_add_u16_e64 v1, s0, v1 clamp 299; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1 300; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 301; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 302; GFX8-NEXT: v_readfirstlane_b32 s0, v0 303; GFX8-NEXT: ; return to shader part epilog 304; 305; GFX9-LABEL: s_uaddsat_v2i8: 306; GFX9: ; %bb.0: 307; GFX9-NEXT: s_lshr_b32 s2, s0, 8 308; GFX9-NEXT: s_lshr_b32 s3, s1, 8 309; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 310; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 311; GFX9-NEXT: s_mov_b32 s2, 0x80008 312; GFX9-NEXT: s_lshr_b32 s3, s0, 16 313; GFX9-NEXT: s_lshl_b32 s0, s0, s2 314; GFX9-NEXT: s_lshl_b32 s3, s3, 8 315; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 316; GFX9-NEXT: s_lshr_b32 s3, s1, 16 317; GFX9-NEXT: s_lshl_b32 s1, s1, s2 318; GFX9-NEXT: s_lshl_b32 s2, s3, 8 319; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 320; GFX9-NEXT: v_mov_b32_e32 v0, s1 321; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp 322; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] 323; GFX9-NEXT: s_movk_i32 s0, 0xff 324; GFX9-NEXT: v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 325; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 326; GFX9-NEXT: v_readfirstlane_b32 s0, v0 327; GFX9-NEXT: ; return to shader part epilog 328; 329; GFX10-LABEL: s_uaddsat_v2i8: 330; GFX10: ; %bb.0: 331; GFX10-NEXT: s_lshr_b32 s2, s0, 8 332; GFX10-NEXT: s_lshr_b32 s3, s1, 8 333; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 334; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 335; GFX10-NEXT: s_mov_b32 s2, 0x80008 336; GFX10-NEXT: s_lshr_b32 s3, s0, 16 337; GFX10-NEXT: s_lshr_b32 s4, s1, 16 338; GFX10-NEXT: s_lshl_b32 s0, s0, s2 339; GFX10-NEXT: s_lshl_b32 s3, s3, 8 340; GFX10-NEXT: s_lshl_b32 s1, s1, s2 341; GFX10-NEXT: s_lshl_b32 s2, s4, 8 342; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 343; 
GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2 344; GFX10-NEXT: v_pk_add_u16 v0, s0, s1 clamp 345; GFX10-NEXT: s_movk_i32 s0, 0xff 346; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] 347; GFX10-NEXT: v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 348; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 349; GFX10-NEXT: v_readfirstlane_b32 s0, v0 350; GFX10-NEXT: ; return to shader part epilog 351 %lhs = bitcast i16 %lhs.arg to <2 x i8> 352 %rhs = bitcast i16 %rhs.arg to <2 x i8> 353 %result = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs) 354 %cast.result = bitcast <2 x i8> %result to i16 355 ret i16 %cast.result 356} 357 358define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { 359; GFX6-LABEL: v_uaddsat_v4i8: 360; GFX6: ; %bb.0: 361; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 362; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0 363; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 364; GFX6-NEXT: v_lshrrev_b32_e32 v4, 24, v0 365; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 366; GFX6-NEXT: v_lshrrev_b32_e32 v5, 8, v1 367; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v1 368; GFX6-NEXT: v_lshrrev_b32_e32 v7, 24, v1 369; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 370; GFX6-NEXT: v_xor_b32_e32 v8, -1, v0 371; GFX6-NEXT: v_min_u32_e32 v1, v8, v1 372; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 373; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2 374; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v5 375; GFX6-NEXT: v_xor_b32_e32 v5, -1, v1 376; GFX6-NEXT: v_min_u32_e32 v2, v5, v2 377; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 378; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 379; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v6 380; GFX6-NEXT: v_xor_b32_e32 v5, -1, v2 381; GFX6-NEXT: v_min_u32_e32 v3, v5, v3 382; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 383; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v4 384; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v7 385; GFX6-NEXT: v_xor_b32_e32 v5, -1, v3 386; GFX6-NEXT: 
v_lshrrev_b32_e32 v1, 24, v1 387; GFX6-NEXT: v_min_u32_e32 v4, v5, v4 388; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0 389; GFX6-NEXT: v_lshrrev_b32_e32 v2, 24, v2 390; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 391; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 392; GFX6-NEXT: v_lshrrev_b32_e32 v3, 24, v3 393; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 394; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 395; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 396; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v3 397; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 398; GFX6-NEXT: s_setpc_b64 s[30:31] 399; 400; GFX8-LABEL: v_uaddsat_v4i8: 401; GFX8: ; %bb.0: 402; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 403; GFX8-NEXT: v_mov_b32_e32 v2, 8 404; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 405; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 406; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 407; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 408; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1 409; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 410; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 411; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 412; GFX8-NEXT: v_add_u16_e64 v0, v0, v1 clamp 413; GFX8-NEXT: v_add_u16_e64 v1, v3, v2 clamp 414; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v4 415; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v6 416; GFX8-NEXT: v_add_u16_e64 v2, v2, v3 clamp 417; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v5 418; GFX8-NEXT: v_lshlrev_b16_e32 v4, 8, v7 419; GFX8-NEXT: v_add_u16_e64 v3, v3, v4 clamp 420; GFX8-NEXT: v_mov_b32_e32 v4, 0xff 421; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 422; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 423; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 424; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 425; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 
src1_sel:DWORD 426; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 427; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 428; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 429; GFX8-NEXT: s_setpc_b64 s[30:31] 430; 431; GFX9-LABEL: v_uaddsat_v4i8: 432; GFX9: ; %bb.0: 433; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 434; GFX9-NEXT: s_mov_b32 s4, 8 435; GFX9-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 436; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 437; GFX9-NEXT: v_mov_b32_e32 v8, 0xffff 438; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 439; GFX9-NEXT: v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 440; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 441; GFX9-NEXT: v_and_or_b32 v0, v0, v8, v2 442; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4 443; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 444; GFX9-NEXT: v_and_or_b32 v2, v3, v8, v2 445; GFX9-NEXT: v_and_or_b32 v1, v1, v8, v5 446; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v7 447; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] 448; GFX9-NEXT: v_and_or_b32 v3, v6, v8, v3 449; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] 450; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] 451; GFX9-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] 452; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 clamp 453; GFX9-NEXT: v_pk_add_u16 v1, v2, v3 clamp 454; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] 455; GFX9-NEXT: v_mov_b32_e32 v2, 8 456; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] 457; GFX9-NEXT: s_movk_i32 s4, 0xff 458; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 459; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v2 460; GFX9-NEXT: v_and_b32_e32 v2, s4, v1 461; GFX9-NEXT: v_mov_b32_e32 v3, 24 462; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 463; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_2 464; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 465; GFX9-NEXT: s_setpc_b64 s[30:31] 466; 467; GFX10-LABEL: v_uaddsat_v4i8: 468; GFX10: ; %bb.0: 469; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 470; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 471; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 472; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 473; GFX10-NEXT: s_mov_b32 s4, 8 474; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v0 475; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 476; GFX10-NEXT: v_lshrrev_b32_sdwa v6, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 477; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff 478; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1 479; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 480; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 481; GFX10-NEXT: s_movk_i32 s4, 0xff 482; GFX10-NEXT: v_and_or_b32 v0, v0, v7, v2 483; GFX10-NEXT: v_and_or_b32 v1, v1, v7, v6 484; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4 485; GFX10-NEXT: v_and_or_b32 v3, v8, v7, v5 486; GFX10-NEXT: v_mov_b32_e32 v4, 24 487; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] 488; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] 489; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] 490; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] 491; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 clamp 492; GFX10-NEXT: v_pk_add_u16 v1, v2, v3 clamp 493; GFX10-NEXT: v_mov_b32_e32 v2, 8 494; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] 495; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] 496; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 497; GFX10-NEXT: v_and_b32_e32 v3, s4, v1 498; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 499; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v2 500; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 501; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 502; GFX10-NEXT: 
s_setpc_b64 s[30:31] 503 %lhs = bitcast i32 %lhs.arg to <4 x i8> 504 %rhs = bitcast i32 %rhs.arg to <4 x i8> 505 %result = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs) 506 %cast.result = bitcast <4 x i8> %result to i32 507 ret i32 %cast.result 508} 509 510define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { 511; GFX6-LABEL: s_uaddsat_v4i8: 512; GFX6: ; %bb.0: 513; GFX6-NEXT: s_lshr_b32 s2, s0, 8 514; GFX6-NEXT: s_lshr_b32 s3, s0, 16 515; GFX6-NEXT: s_lshr_b32 s4, s0, 24 516; GFX6-NEXT: s_lshl_b32 s0, s0, 24 517; GFX6-NEXT: s_lshr_b32 s5, s1, 8 518; GFX6-NEXT: s_lshr_b32 s6, s1, 16 519; GFX6-NEXT: s_lshr_b32 s7, s1, 24 520; GFX6-NEXT: s_lshl_b32 s1, s1, 24 521; GFX6-NEXT: s_not_b32 s8, s0 522; GFX6-NEXT: s_min_u32 s1, s8, s1 523; GFX6-NEXT: s_add_i32 s0, s0, s1 524; GFX6-NEXT: s_lshl_b32 s1, s2, 24 525; GFX6-NEXT: s_lshl_b32 s2, s5, 24 526; GFX6-NEXT: s_not_b32 s5, s1 527; GFX6-NEXT: s_min_u32 s2, s5, s2 528; GFX6-NEXT: s_add_i32 s1, s1, s2 529; GFX6-NEXT: s_lshl_b32 s2, s3, 24 530; GFX6-NEXT: s_lshl_b32 s3, s6, 24 531; GFX6-NEXT: s_not_b32 s5, s2 532; GFX6-NEXT: s_min_u32 s3, s5, s3 533; GFX6-NEXT: s_add_i32 s2, s2, s3 534; GFX6-NEXT: s_lshl_b32 s3, s4, 24 535; GFX6-NEXT: s_lshl_b32 s4, s7, 24 536; GFX6-NEXT: s_not_b32 s5, s3 537; GFX6-NEXT: s_lshr_b32 s1, s1, 24 538; GFX6-NEXT: s_min_u32 s4, s5, s4 539; GFX6-NEXT: s_lshr_b32 s0, s0, 24 540; GFX6-NEXT: s_lshr_b32 s2, s2, 24 541; GFX6-NEXT: s_add_i32 s3, s3, s4 542; GFX6-NEXT: s_lshl_b32 s1, s1, 8 543; GFX6-NEXT: s_lshr_b32 s3, s3, 24 544; GFX6-NEXT: s_or_b32 s0, s0, s1 545; GFX6-NEXT: s_lshl_b32 s1, s2, 16 546; GFX6-NEXT: s_or_b32 s0, s0, s1 547; GFX6-NEXT: s_lshl_b32 s1, s3, 24 548; GFX6-NEXT: s_or_b32 s0, s0, s1 549; GFX6-NEXT: ; return to shader part epilog 550; 551; GFX8-LABEL: s_uaddsat_v4i8: 552; GFX8: ; %bb.0: 553; GFX8-NEXT: s_bfe_u32 s8, 8, 0x100000 554; GFX8-NEXT: s_lshr_b32 s5, s1, 8 555; GFX8-NEXT: s_lshr_b32 s6, s1, 16 556; GFX8-NEXT: s_lshr_b32 s7, s1, 24 
557; GFX8-NEXT: s_lshl_b32 s1, s1, s8 558; GFX8-NEXT: s_lshr_b32 s2, s0, 8 559; GFX8-NEXT: s_lshr_b32 s3, s0, 16 560; GFX8-NEXT: s_lshr_b32 s4, s0, 24 561; GFX8-NEXT: s_lshl_b32 s0, s0, s8 562; GFX8-NEXT: v_mov_b32_e32 v0, s1 563; GFX8-NEXT: s_lshl_b32 s1, s5, s8 564; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp 565; GFX8-NEXT: s_lshl_b32 s0, s2, s8 566; GFX8-NEXT: v_mov_b32_e32 v1, s1 567; GFX8-NEXT: v_add_u16_e64 v1, s0, v1 clamp 568; GFX8-NEXT: s_lshl_b32 s1, s6, s8 569; GFX8-NEXT: v_mov_b32_e32 v4, 0xff 570; GFX8-NEXT: s_lshl_b32 s0, s3, s8 571; GFX8-NEXT: v_mov_b32_e32 v2, s1 572; GFX8-NEXT: s_lshl_b32 s1, s7, s8 573; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 574; GFX8-NEXT: v_add_u16_e64 v2, s0, v2 clamp 575; GFX8-NEXT: s_lshl_b32 s0, s4, s8 576; GFX8-NEXT: v_mov_b32_e32 v3, s1 577; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 578; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 579; GFX8-NEXT: v_add_u16_e64 v3, s0, v3 clamp 580; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 581; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 582; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 583; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 584; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 585; GFX8-NEXT: v_readfirstlane_b32 s0, v0 586; GFX8-NEXT: ; return to shader part epilog 587; 588; GFX9-LABEL: s_uaddsat_v4i8: 589; GFX9: ; %bb.0: 590; GFX9-NEXT: s_lshr_b32 s3, s0, 8 591; GFX9-NEXT: s_lshr_b32 s4, s0, 16 592; GFX9-NEXT: s_lshr_b32 s6, s0, 24 593; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 594; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s6 595; GFX9-NEXT: s_mov_b32 s4, 0x80008 596; GFX9-NEXT: s_lshr_b32 s6, s0, 16 597; GFX9-NEXT: s_lshr_b32 s7, s1, 8 598; GFX9-NEXT: s_lshl_b32 s0, s0, s4 599; GFX9-NEXT: s_lshl_b32 s6, s6, 8 600; GFX9-NEXT: s_lshr_b32 s8, s1, 16 601; GFX9-NEXT: s_lshr_b32 
s9, s1, 24 602; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 603; GFX9-NEXT: s_lshr_b32 s6, s3, 16 604; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 605; GFX9-NEXT: s_lshl_b32 s3, s3, s4 606; GFX9-NEXT: s_lshl_b32 s6, s6, 8 607; GFX9-NEXT: s_lshr_b32 s7, s1, 16 608; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6 609; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s9 610; GFX9-NEXT: s_lshl_b32 s1, s1, s4 611; GFX9-NEXT: s_lshl_b32 s7, s7, 8 612; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 613; GFX9-NEXT: s_lshr_b32 s7, s6, 16 614; GFX9-NEXT: s_lshl_b32 s4, s6, s4 615; GFX9-NEXT: s_lshl_b32 s6, s7, 8 616; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s6 617; GFX9-NEXT: v_mov_b32_e32 v0, s1 618; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp 619; GFX9-NEXT: v_mov_b32_e32 v1, s4 620; GFX9-NEXT: s_mov_b32 s2, 8 621; GFX9-NEXT: v_pk_add_u16 v1, s3, v1 clamp 622; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] 623; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] 624; GFX9-NEXT: s_movk_i32 s0, 0xff 625; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 626; GFX9-NEXT: s_mov_b32 s5, 24 627; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v2 628; GFX9-NEXT: v_and_b32_e32 v2, s0, v1 629; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 630; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 631; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 632; GFX9-NEXT: v_readfirstlane_b32 s0, v0 633; GFX9-NEXT: ; return to shader part epilog 634; 635; GFX10-LABEL: s_uaddsat_v4i8: 636; GFX10: ; %bb.0: 637; GFX10-NEXT: s_lshr_b32 s2, s0, 8 638; GFX10-NEXT: s_lshr_b32 s3, s0, 16 639; GFX10-NEXT: s_lshr_b32 s4, s0, 24 640; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 641; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 642; GFX10-NEXT: s_mov_b32 s3, 0x80008 643; GFX10-NEXT: s_lshr_b32 s4, s0, 16 644; GFX10-NEXT: s_lshr_b32 s5, s1, 8 645; GFX10-NEXT: s_lshr_b32 s6, s1, 16 646; GFX10-NEXT: s_lshr_b32 s7, s1, 24 647; GFX10-NEXT: s_lshl_b32 s0, s0, s3 648; 
GFX10-NEXT: s_lshl_b32 s4, s4, 8 649; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 650; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 651; GFX10-NEXT: s_pack_ll_b32_b16 s4, s6, s7 652; GFX10-NEXT: s_lshr_b32 s8, s2, 16 653; GFX10-NEXT: s_lshr_b32 s5, s1, 16 654; GFX10-NEXT: s_lshr_b32 s6, s4, 16 655; GFX10-NEXT: s_lshl_b32 s2, s2, s3 656; GFX10-NEXT: s_lshl_b32 s8, s8, 8 657; GFX10-NEXT: s_lshl_b32 s1, s1, s3 658; GFX10-NEXT: s_lshl_b32 s5, s5, 8 659; GFX10-NEXT: s_lshl_b32 s3, s4, s3 660; GFX10-NEXT: s_lshl_b32 s4, s6, 8 661; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s8 662; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 663; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4 664; GFX10-NEXT: v_pk_add_u16 v0, s0, s1 clamp 665; GFX10-NEXT: v_pk_add_u16 v1, s2, s3 clamp 666; GFX10-NEXT: s_mov_b32 s0, 8 667; GFX10-NEXT: s_movk_i32 s1, 0xff 668; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] 669; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] 670; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 671; GFX10-NEXT: v_and_b32_e32 v3, s1, v1 672; GFX10-NEXT: s_mov_b32 s0, 24 673; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 674; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2 675; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 676; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 677; GFX10-NEXT: v_readfirstlane_b32 s0, v0 678; GFX10-NEXT: ; return to shader part epilog 679 %lhs = bitcast i32 %lhs.arg to <4 x i8> 680 %rhs = bitcast i32 %rhs.arg to <4 x i8> 681 %result = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs) 682 %cast.result = bitcast <4 x i8> %result to i32 683 ret i32 %cast.result 684} 685 686define i24 @v_uaddsat_i24(i24 %lhs, i24 %rhs) { 687; GFX6-LABEL: v_uaddsat_i24: 688; GFX6: ; %bb.0: 689; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 690; GFX6-NEXT: v_lshlrev_b32_e32 v0, 8, v0 691; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 692; GFX6-NEXT: v_xor_b32_e32 
v2, -1, v0 693; GFX6-NEXT: v_min_u32_e32 v1, v2, v1 694; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 695; GFX6-NEXT: v_lshrrev_b32_e32 v0, 8, v0 696; GFX6-NEXT: s_setpc_b64 s[30:31] 697; 698; GFX8-LABEL: v_uaddsat_i24: 699; GFX8: ; %bb.0: 700; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 701; GFX8-NEXT: v_lshlrev_b32_e32 v0, 8, v0 702; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 703; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v1 clamp 704; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 705; GFX8-NEXT: s_setpc_b64 s[30:31] 706; 707; GFX9-LABEL: v_uaddsat_i24: 708; GFX9: ; %bb.0: 709; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 710; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 711; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 712; GFX9-NEXT: v_add_u32_e64 v0, v0, v1 clamp 713; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 714; GFX9-NEXT: s_setpc_b64 s[30:31] 715; 716; GFX10-LABEL: v_uaddsat_i24: 717; GFX10: ; %bb.0: 718; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 719; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 720; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 721; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1 722; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, v1 clamp 723; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 724; GFX10-NEXT: s_setpc_b64 s[30:31] 725 %result = call i24 @llvm.uadd.sat.i24(i24 %lhs, i24 %rhs) 726 ret i24 %result 727} 728 729define amdgpu_ps i24 @s_uaddsat_i24(i24 inreg %lhs, i24 inreg %rhs) { 730; GFX6-LABEL: s_uaddsat_i24: 731; GFX6: ; %bb.0: 732; GFX6-NEXT: s_lshl_b32 s0, s0, 8 733; GFX6-NEXT: s_lshl_b32 s1, s1, 8 734; GFX6-NEXT: s_not_b32 s2, s0 735; GFX6-NEXT: s_min_u32 s1, s2, s1 736; GFX6-NEXT: s_add_i32 s0, s0, s1 737; GFX6-NEXT: s_lshr_b32 s0, s0, 8 738; GFX6-NEXT: ; return to shader part epilog 739; 740; GFX8-LABEL: s_uaddsat_i24: 741; GFX8: ; %bb.0: 742; GFX8-NEXT: s_lshl_b32 s1, s1, 8 743; GFX8-NEXT: s_lshl_b32 s0, s0, 8 744; GFX8-NEXT: v_mov_b32_e32 v0, s1 745; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 clamp 746; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 747; GFX8-NEXT: 
v_readfirstlane_b32 s0, v0 748; GFX8-NEXT: ; return to shader part epilog 749; 750; GFX9-LABEL: s_uaddsat_i24: 751; GFX9: ; %bb.0: 752; GFX9-NEXT: s_lshl_b32 s1, s1, 8 753; GFX9-NEXT: s_lshl_b32 s0, s0, 8 754; GFX9-NEXT: v_mov_b32_e32 v0, s1 755; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp 756; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 757; GFX9-NEXT: v_readfirstlane_b32 s0, v0 758; GFX9-NEXT: ; return to shader part epilog 759; 760; GFX10-LABEL: s_uaddsat_i24: 761; GFX10: ; %bb.0: 762; GFX10-NEXT: s_lshl_b32 s0, s0, 8 763; GFX10-NEXT: s_lshl_b32 s1, s1, 8 764; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, s1 clamp 765; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 766; GFX10-NEXT: v_readfirstlane_b32 s0, v0 767; GFX10-NEXT: ; return to shader part epilog 768 %result = call i24 @llvm.uadd.sat.i24(i24 %lhs, i24 %rhs) 769 ret i24 %result 770} 771 772define i32 @v_uaddsat_i32(i32 %lhs, i32 %rhs) { 773; GFX6-LABEL: v_uaddsat_i32: 774; GFX6: ; %bb.0: 775; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 776; GFX6-NEXT: v_xor_b32_e32 v2, -1, v0 777; GFX6-NEXT: v_min_u32_e32 v1, v2, v1 778; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 779; GFX6-NEXT: s_setpc_b64 s[30:31] 780; 781; GFX8-LABEL: v_uaddsat_i32: 782; GFX8: ; %bb.0: 783; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 784; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v1 clamp 785; GFX8-NEXT: s_setpc_b64 s[30:31] 786; 787; GFX9-LABEL: v_uaddsat_i32: 788; GFX9: ; %bb.0: 789; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 790; GFX9-NEXT: v_add_u32_e64 v0, v0, v1 clamp 791; GFX9-NEXT: s_setpc_b64 s[30:31] 792; 793; GFX10-LABEL: v_uaddsat_i32: 794; GFX10: ; %bb.0: 795; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 796; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 797; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, v1 clamp 798; GFX10-NEXT: s_setpc_b64 s[30:31] 799 %result = call i32 @llvm.uadd.sat.i32(i32 %lhs, i32 %rhs) 800 ret i32 %result 801} 802 803define amdgpu_ps i32 @s_uaddsat_i32(i32 inreg %lhs, i32 inreg %rhs) { 804; GFX6-LABEL: 
s_uaddsat_i32: 805; GFX6: ; %bb.0: 806; GFX6-NEXT: s_not_b32 s2, s0 807; GFX6-NEXT: s_min_u32 s1, s2, s1 808; GFX6-NEXT: s_add_i32 s0, s0, s1 809; GFX6-NEXT: ; return to shader part epilog 810; 811; GFX8-LABEL: s_uaddsat_i32: 812; GFX8: ; %bb.0: 813; GFX8-NEXT: v_mov_b32_e32 v0, s1 814; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 clamp 815; GFX8-NEXT: v_readfirstlane_b32 s0, v0 816; GFX8-NEXT: ; return to shader part epilog 817; 818; GFX9-LABEL: s_uaddsat_i32: 819; GFX9: ; %bb.0: 820; GFX9-NEXT: v_mov_b32_e32 v0, s1 821; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp 822; GFX9-NEXT: v_readfirstlane_b32 s0, v0 823; GFX9-NEXT: ; return to shader part epilog 824; 825; GFX10-LABEL: s_uaddsat_i32: 826; GFX10: ; %bb.0: 827; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, s1 clamp 828; GFX10-NEXT: v_readfirstlane_b32 s0, v0 829; GFX10-NEXT: ; return to shader part epilog 830 %result = call i32 @llvm.uadd.sat.i32(i32 %lhs, i32 %rhs) 831 ret i32 %result 832} 833 834define amdgpu_ps float @uaddsat_i32_sv(i32 inreg %lhs, i32 %rhs) { 835; GFX6-LABEL: uaddsat_i32_sv: 836; GFX6: ; %bb.0: 837; GFX6-NEXT: s_not_b32 s1, s0 838; GFX6-NEXT: v_min_u32_e32 v0, s1, v0 839; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 840; GFX6-NEXT: ; return to shader part epilog 841; 842; GFX8-LABEL: uaddsat_i32_sv: 843; GFX8: ; %bb.0: 844; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], s0, v0 clamp 845; GFX8-NEXT: ; return to shader part epilog 846; 847; GFX9-LABEL: uaddsat_i32_sv: 848; GFX9: ; %bb.0: 849; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp 850; GFX9-NEXT: ; return to shader part epilog 851; 852; GFX10-LABEL: uaddsat_i32_sv: 853; GFX10: ; %bb.0: 854; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, v0 clamp 855; GFX10-NEXT: ; return to shader part epilog 856 %result = call i32 @llvm.uadd.sat.i32(i32 %lhs, i32 %rhs) 857 %cast = bitcast i32 %result to float 858 ret float %cast 859} 860 861define amdgpu_ps float @uaddsat_i32_vs(i32 %lhs, i32 inreg %rhs) { 862; GFX6-LABEL: uaddsat_i32_vs: 863; GFX6: ; %bb.0: 864; GFX6-NEXT: v_xor_b32_e32 
v1, -1, v0 865; GFX6-NEXT: v_min_u32_e32 v1, s0, v1 866; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 867; GFX6-NEXT: ; return to shader part epilog 868; 869; GFX8-LABEL: uaddsat_i32_vs: 870; GFX8: ; %bb.0: 871; GFX8-NEXT: v_add_u32_e64 v0, s[0:1], v0, s0 clamp 872; GFX8-NEXT: ; return to shader part epilog 873; 874; GFX9-LABEL: uaddsat_i32_vs: 875; GFX9: ; %bb.0: 876; GFX9-NEXT: v_add_u32_e64 v0, v0, s0 clamp 877; GFX9-NEXT: ; return to shader part epilog 878; 879; GFX10-LABEL: uaddsat_i32_vs: 880; GFX10: ; %bb.0: 881; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, s0 clamp 882; GFX10-NEXT: ; return to shader part epilog 883 %result = call i32 @llvm.uadd.sat.i32(i32 %lhs, i32 %rhs) 884 %cast = bitcast i32 %result to float 885 ret float %cast 886} 887 888define <2 x i32> @v_uaddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { 889; GFX6-LABEL: v_uaddsat_v2i32: 890; GFX6: ; %bb.0: 891; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 892; GFX6-NEXT: v_xor_b32_e32 v4, -1, v0 893; GFX6-NEXT: v_min_u32_e32 v2, v4, v2 894; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 895; GFX6-NEXT: v_xor_b32_e32 v2, -1, v1 896; GFX6-NEXT: v_min_u32_e32 v2, v2, v3 897; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 898; GFX6-NEXT: s_setpc_b64 s[30:31] 899; 900; GFX8-LABEL: v_uaddsat_v2i32: 901; GFX8: ; %bb.0: 902; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 903; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v2 clamp 904; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v3 clamp 905; GFX8-NEXT: s_setpc_b64 s[30:31] 906; 907; GFX9-LABEL: v_uaddsat_v2i32: 908; GFX9: ; %bb.0: 909; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 910; GFX9-NEXT: v_add_u32_e64 v0, v0, v2 clamp 911; GFX9-NEXT: v_add_u32_e64 v1, v1, v3 clamp 912; GFX9-NEXT: s_setpc_b64 s[30:31] 913; 914; GFX10-LABEL: v_uaddsat_v2i32: 915; GFX10: ; %bb.0: 916; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 917; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 918; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, v2 clamp 919; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, v3 clamp 920; 
GFX10-NEXT: s_setpc_b64 s[30:31] 921 %result = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) 922 ret <2 x i32> %result 923} 924 925define amdgpu_ps <2 x i32> @s_uaddsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inreg %rhs) { 926; GFX6-LABEL: s_uaddsat_v2i32: 927; GFX6: ; %bb.0: 928; GFX6-NEXT: s_not_b32 s4, s0 929; GFX6-NEXT: s_min_u32 s2, s4, s2 930; GFX6-NEXT: s_add_i32 s0, s0, s2 931; GFX6-NEXT: s_not_b32 s2, s1 932; GFX6-NEXT: s_min_u32 s2, s2, s3 933; GFX6-NEXT: s_add_i32 s1, s1, s2 934; GFX6-NEXT: ; return to shader part epilog 935; 936; GFX8-LABEL: s_uaddsat_v2i32: 937; GFX8: ; %bb.0: 938; GFX8-NEXT: v_mov_b32_e32 v0, s2 939; GFX8-NEXT: v_mov_b32_e32 v1, s3 940; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], s0, v0 clamp 941; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], s1, v1 clamp 942; GFX8-NEXT: v_readfirstlane_b32 s0, v0 943; GFX8-NEXT: v_readfirstlane_b32 s1, v1 944; GFX8-NEXT: ; return to shader part epilog 945; 946; GFX9-LABEL: s_uaddsat_v2i32: 947; GFX9: ; %bb.0: 948; GFX9-NEXT: v_mov_b32_e32 v0, s2 949; GFX9-NEXT: v_mov_b32_e32 v1, s3 950; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp 951; GFX9-NEXT: v_add_u32_e64 v1, s1, v1 clamp 952; GFX9-NEXT: v_readfirstlane_b32 s0, v0 953; GFX9-NEXT: v_readfirstlane_b32 s1, v1 954; GFX9-NEXT: ; return to shader part epilog 955; 956; GFX10-LABEL: s_uaddsat_v2i32: 957; GFX10: ; %bb.0: 958; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, s2 clamp 959; GFX10-NEXT: v_add_nc_u32_e64 v1, s1, s3 clamp 960; GFX10-NEXT: v_readfirstlane_b32 s0, v0 961; GFX10-NEXT: v_readfirstlane_b32 s1, v1 962; GFX10-NEXT: ; return to shader part epilog 963 %result = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) 964 ret <2 x i32> %result 965} 966 967define <3 x i32> @v_uaddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { 968; GFX6-LABEL: v_uaddsat_v3i32: 969; GFX6: ; %bb.0: 970; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 971; GFX6-NEXT: v_xor_b32_e32 v6, -1, v0 972; GFX6-NEXT: v_min_u32_e32 v3, v6, v3 973; GFX6-NEXT: 
v_add_i32_e32 v0, vcc, v0, v3 974; GFX6-NEXT: v_xor_b32_e32 v3, -1, v1 975; GFX6-NEXT: v_min_u32_e32 v3, v3, v4 976; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 977; GFX6-NEXT: v_xor_b32_e32 v3, -1, v2 978; GFX6-NEXT: v_min_u32_e32 v3, v3, v5 979; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 980; GFX6-NEXT: s_setpc_b64 s[30:31] 981; 982; GFX8-LABEL: v_uaddsat_v3i32: 983; GFX8: ; %bb.0: 984; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 985; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v3 clamp 986; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v4 clamp 987; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v2, v5 clamp 988; GFX8-NEXT: s_setpc_b64 s[30:31] 989; 990; GFX9-LABEL: v_uaddsat_v3i32: 991; GFX9: ; %bb.0: 992; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 993; GFX9-NEXT: v_add_u32_e64 v0, v0, v3 clamp 994; GFX9-NEXT: v_add_u32_e64 v1, v1, v4 clamp 995; GFX9-NEXT: v_add_u32_e64 v2, v2, v5 clamp 996; GFX9-NEXT: s_setpc_b64 s[30:31] 997; 998; GFX10-LABEL: v_uaddsat_v3i32: 999; GFX10: ; %bb.0: 1000; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1001; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1002; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, v3 clamp 1003; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, v4 clamp 1004; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, v5 clamp 1005; GFX10-NEXT: s_setpc_b64 s[30:31] 1006 %result = call <3 x i32> @llvm.uadd.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs) 1007 ret <3 x i32> %result 1008} 1009 1010define amdgpu_ps <3 x i32> @s_uaddsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inreg %rhs) { 1011; GFX6-LABEL: s_uaddsat_v3i32: 1012; GFX6: ; %bb.0: 1013; GFX6-NEXT: s_not_b32 s6, s0 1014; GFX6-NEXT: s_min_u32 s3, s6, s3 1015; GFX6-NEXT: s_add_i32 s0, s0, s3 1016; GFX6-NEXT: s_not_b32 s3, s1 1017; GFX6-NEXT: s_min_u32 s3, s3, s4 1018; GFX6-NEXT: s_add_i32 s1, s1, s3 1019; GFX6-NEXT: s_not_b32 s3, s2 1020; GFX6-NEXT: s_min_u32 s3, s3, s5 1021; GFX6-NEXT: s_add_i32 s2, s2, s3 1022; GFX6-NEXT: ; return to shader part epilog 1023; 1024; GFX8-LABEL: s_uaddsat_v3i32: 1025; GFX8: ; 
%bb.0: 1026; GFX8-NEXT: v_mov_b32_e32 v0, s3 1027; GFX8-NEXT: v_mov_b32_e32 v1, s4 1028; GFX8-NEXT: v_mov_b32_e32 v2, s5 1029; GFX8-NEXT: v_add_u32_e64 v0, s[6:7], s0, v0 clamp 1030; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], s1, v1 clamp 1031; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], s2, v2 clamp 1032; GFX8-NEXT: v_readfirstlane_b32 s0, v0 1033; GFX8-NEXT: v_readfirstlane_b32 s1, v1 1034; GFX8-NEXT: v_readfirstlane_b32 s2, v2 1035; GFX8-NEXT: ; return to shader part epilog 1036; 1037; GFX9-LABEL: s_uaddsat_v3i32: 1038; GFX9: ; %bb.0: 1039; GFX9-NEXT: v_mov_b32_e32 v0, s3 1040; GFX9-NEXT: v_mov_b32_e32 v1, s4 1041; GFX9-NEXT: v_mov_b32_e32 v2, s5 1042; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp 1043; GFX9-NEXT: v_add_u32_e64 v1, s1, v1 clamp 1044; GFX9-NEXT: v_add_u32_e64 v2, s2, v2 clamp 1045; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1046; GFX9-NEXT: v_readfirstlane_b32 s1, v1 1047; GFX9-NEXT: v_readfirstlane_b32 s2, v2 1048; GFX9-NEXT: ; return to shader part epilog 1049; 1050; GFX10-LABEL: s_uaddsat_v3i32: 1051; GFX10: ; %bb.0: 1052; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, s3 clamp 1053; GFX10-NEXT: v_add_nc_u32_e64 v1, s1, s4 clamp 1054; GFX10-NEXT: v_add_nc_u32_e64 v2, s2, s5 clamp 1055; GFX10-NEXT: v_readfirstlane_b32 s0, v0 1056; GFX10-NEXT: v_readfirstlane_b32 s1, v1 1057; GFX10-NEXT: v_readfirstlane_b32 s2, v2 1058; GFX10-NEXT: ; return to shader part epilog 1059 %result = call <3 x i32> @llvm.uadd.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs) 1060 ret <3 x i32> %result 1061} 1062 1063define <4 x i32> @v_uaddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { 1064; GFX6-LABEL: v_uaddsat_v4i32: 1065; GFX6: ; %bb.0: 1066; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1067; GFX6-NEXT: v_xor_b32_e32 v8, -1, v0 1068; GFX6-NEXT: v_min_u32_e32 v4, v8, v4 1069; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 1070; GFX6-NEXT: v_xor_b32_e32 v4, -1, v1 1071; GFX6-NEXT: v_min_u32_e32 v4, v4, v5 1072; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 1073; GFX6-NEXT: v_xor_b32_e32 v4, -1, v2 1074; GFX6-NEXT: 
v_min_u32_e32 v4, v4, v6 1075; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 1076; GFX6-NEXT: v_xor_b32_e32 v4, -1, v3 1077; GFX6-NEXT: v_min_u32_e32 v4, v4, v7 1078; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 1079; GFX6-NEXT: s_setpc_b64 s[30:31] 1080; 1081; GFX8-LABEL: v_uaddsat_v4i32: 1082; GFX8: ; %bb.0: 1083; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1084; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v4 clamp 1085; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v5 clamp 1086; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v2, v6 clamp 1087; GFX8-NEXT: v_add_u32_e64 v3, s[4:5], v3, v7 clamp 1088; GFX8-NEXT: s_setpc_b64 s[30:31] 1089; 1090; GFX9-LABEL: v_uaddsat_v4i32: 1091; GFX9: ; %bb.0: 1092; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1093; GFX9-NEXT: v_add_u32_e64 v0, v0, v4 clamp 1094; GFX9-NEXT: v_add_u32_e64 v1, v1, v5 clamp 1095; GFX9-NEXT: v_add_u32_e64 v2, v2, v6 clamp 1096; GFX9-NEXT: v_add_u32_e64 v3, v3, v7 clamp 1097; GFX9-NEXT: s_setpc_b64 s[30:31] 1098; 1099; GFX10-LABEL: v_uaddsat_v4i32: 1100; GFX10: ; %bb.0: 1101; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1102; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1103; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, v4 clamp 1104; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, v5 clamp 1105; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, v6 clamp 1106; GFX10-NEXT: v_add_nc_u32_e64 v3, v3, v7 clamp 1107; GFX10-NEXT: s_setpc_b64 s[30:31] 1108 %result = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) 1109 ret <4 x i32> %result 1110} 1111 1112define amdgpu_ps <4 x i32> @s_uaddsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inreg %rhs) { 1113; GFX6-LABEL: s_uaddsat_v4i32: 1114; GFX6: ; %bb.0: 1115; GFX6-NEXT: s_not_b32 s8, s0 1116; GFX6-NEXT: s_min_u32 s4, s8, s4 1117; GFX6-NEXT: s_add_i32 s0, s0, s4 1118; GFX6-NEXT: s_not_b32 s4, s1 1119; GFX6-NEXT: s_min_u32 s4, s4, s5 1120; GFX6-NEXT: s_add_i32 s1, s1, s4 1121; GFX6-NEXT: s_not_b32 s4, s2 1122; GFX6-NEXT: s_min_u32 s4, s4, s6 1123; GFX6-NEXT: s_add_i32 s2, s2, s4 1124; 
GFX6-NEXT: s_not_b32 s4, s3 1125; GFX6-NEXT: s_min_u32 s4, s4, s7 1126; GFX6-NEXT: s_add_i32 s3, s3, s4 1127; GFX6-NEXT: ; return to shader part epilog 1128; 1129; GFX8-LABEL: s_uaddsat_v4i32: 1130; GFX8: ; %bb.0: 1131; GFX8-NEXT: v_mov_b32_e32 v0, s4 1132; GFX8-NEXT: v_mov_b32_e32 v1, s5 1133; GFX8-NEXT: v_mov_b32_e32 v2, s6 1134; GFX8-NEXT: v_mov_b32_e32 v3, s7 1135; GFX8-NEXT: v_add_u32_e64 v0, s[8:9], s0, v0 clamp 1136; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], s1, v1 clamp 1137; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], s2, v2 clamp 1138; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], s3, v3 clamp 1139; GFX8-NEXT: v_readfirstlane_b32 s0, v0 1140; GFX8-NEXT: v_readfirstlane_b32 s1, v1 1141; GFX8-NEXT: v_readfirstlane_b32 s2, v2 1142; GFX8-NEXT: v_readfirstlane_b32 s3, v3 1143; GFX8-NEXT: ; return to shader part epilog 1144; 1145; GFX9-LABEL: s_uaddsat_v4i32: 1146; GFX9: ; %bb.0: 1147; GFX9-NEXT: v_mov_b32_e32 v0, s4 1148; GFX9-NEXT: v_mov_b32_e32 v1, s5 1149; GFX9-NEXT: v_mov_b32_e32 v2, s6 1150; GFX9-NEXT: v_mov_b32_e32 v3, s7 1151; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp 1152; GFX9-NEXT: v_add_u32_e64 v1, s1, v1 clamp 1153; GFX9-NEXT: v_add_u32_e64 v2, s2, v2 clamp 1154; GFX9-NEXT: v_add_u32_e64 v3, s3, v3 clamp 1155; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1156; GFX9-NEXT: v_readfirstlane_b32 s1, v1 1157; GFX9-NEXT: v_readfirstlane_b32 s2, v2 1158; GFX9-NEXT: v_readfirstlane_b32 s3, v3 1159; GFX9-NEXT: ; return to shader part epilog 1160; 1161; GFX10-LABEL: s_uaddsat_v4i32: 1162; GFX10: ; %bb.0: 1163; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, s4 clamp 1164; GFX10-NEXT: v_add_nc_u32_e64 v1, s1, s5 clamp 1165; GFX10-NEXT: v_add_nc_u32_e64 v2, s2, s6 clamp 1166; GFX10-NEXT: v_add_nc_u32_e64 v3, s3, s7 clamp 1167; GFX10-NEXT: v_readfirstlane_b32 s0, v0 1168; GFX10-NEXT: v_readfirstlane_b32 s1, v1 1169; GFX10-NEXT: v_readfirstlane_b32 s2, v2 1170; GFX10-NEXT: v_readfirstlane_b32 s3, v3 1171; GFX10-NEXT: ; return to shader part epilog 1172 %result = call <4 x i32> @llvm.uadd.sat.v4i32(<4 
x i32> %lhs, <4 x i32> %rhs) 1173 ret <4 x i32> %result 1174} 1175 1176define <5 x i32> @v_uaddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { 1177; GFX6-LABEL: v_uaddsat_v5i32: 1178; GFX6: ; %bb.0: 1179; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1180; GFX6-NEXT: v_xor_b32_e32 v10, -1, v0 1181; GFX6-NEXT: v_min_u32_e32 v5, v10, v5 1182; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v5 1183; GFX6-NEXT: v_xor_b32_e32 v5, -1, v1 1184; GFX6-NEXT: v_min_u32_e32 v5, v5, v6 1185; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5 1186; GFX6-NEXT: v_xor_b32_e32 v5, -1, v2 1187; GFX6-NEXT: v_min_u32_e32 v5, v5, v7 1188; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 1189; GFX6-NEXT: v_xor_b32_e32 v5, -1, v3 1190; GFX6-NEXT: v_min_u32_e32 v5, v5, v8 1191; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 1192; GFX6-NEXT: v_xor_b32_e32 v5, -1, v4 1193; GFX6-NEXT: v_min_u32_e32 v5, v5, v9 1194; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 1195; GFX6-NEXT: s_setpc_b64 s[30:31] 1196; 1197; GFX8-LABEL: v_uaddsat_v5i32: 1198; GFX8: ; %bb.0: 1199; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1200; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v5 clamp 1201; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v6 clamp 1202; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v2, v7 clamp 1203; GFX8-NEXT: v_add_u32_e64 v3, s[4:5], v3, v8 clamp 1204; GFX8-NEXT: v_add_u32_e64 v4, s[4:5], v4, v9 clamp 1205; GFX8-NEXT: s_setpc_b64 s[30:31] 1206; 1207; GFX9-LABEL: v_uaddsat_v5i32: 1208; GFX9: ; %bb.0: 1209; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1210; GFX9-NEXT: v_add_u32_e64 v0, v0, v5 clamp 1211; GFX9-NEXT: v_add_u32_e64 v1, v1, v6 clamp 1212; GFX9-NEXT: v_add_u32_e64 v2, v2, v7 clamp 1213; GFX9-NEXT: v_add_u32_e64 v3, v3, v8 clamp 1214; GFX9-NEXT: v_add_u32_e64 v4, v4, v9 clamp 1215; GFX9-NEXT: s_setpc_b64 s[30:31] 1216; 1217; GFX10-LABEL: v_uaddsat_v5i32: 1218; GFX10: ; %bb.0: 1219; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1220; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1221; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, v5 clamp 
1222; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, v6 clamp 1223; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, v7 clamp 1224; GFX10-NEXT: v_add_nc_u32_e64 v3, v3, v8 clamp 1225; GFX10-NEXT: v_add_nc_u32_e64 v4, v4, v9 clamp 1226; GFX10-NEXT: s_setpc_b64 s[30:31] 1227 %result = call <5 x i32> @llvm.uadd.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs) 1228 ret <5 x i32> %result 1229} 1230 1231define amdgpu_ps <5 x i32> @s_uaddsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inreg %rhs) { 1232; GFX6-LABEL: s_uaddsat_v5i32: 1233; GFX6: ; %bb.0: 1234; GFX6-NEXT: s_not_b32 s10, s0 1235; GFX6-NEXT: s_min_u32 s5, s10, s5 1236; GFX6-NEXT: s_add_i32 s0, s0, s5 1237; GFX6-NEXT: s_not_b32 s5, s1 1238; GFX6-NEXT: s_min_u32 s5, s5, s6 1239; GFX6-NEXT: s_add_i32 s1, s1, s5 1240; GFX6-NEXT: s_not_b32 s5, s2 1241; GFX6-NEXT: s_min_u32 s5, s5, s7 1242; GFX6-NEXT: s_add_i32 s2, s2, s5 1243; GFX6-NEXT: s_not_b32 s5, s3 1244; GFX6-NEXT: s_min_u32 s5, s5, s8 1245; GFX6-NEXT: s_add_i32 s3, s3, s5 1246; GFX6-NEXT: s_not_b32 s5, s4 1247; GFX6-NEXT: s_min_u32 s5, s5, s9 1248; GFX6-NEXT: s_add_i32 s4, s4, s5 1249; GFX6-NEXT: ; return to shader part epilog 1250; 1251; GFX8-LABEL: s_uaddsat_v5i32: 1252; GFX8: ; %bb.0: 1253; GFX8-NEXT: v_mov_b32_e32 v0, s5 1254; GFX8-NEXT: v_mov_b32_e32 v1, s6 1255; GFX8-NEXT: v_mov_b32_e32 v2, s7 1256; GFX8-NEXT: v_mov_b32_e32 v3, s8 1257; GFX8-NEXT: v_mov_b32_e32 v4, s9 1258; GFX8-NEXT: v_add_u32_e64 v0, s[10:11], s0, v0 clamp 1259; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], s1, v1 clamp 1260; GFX8-NEXT: v_add_u32_e64 v2, s[0:1], s2, v2 clamp 1261; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], s3, v3 clamp 1262; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], s4, v4 clamp 1263; GFX8-NEXT: v_readfirstlane_b32 s0, v0 1264; GFX8-NEXT: v_readfirstlane_b32 s1, v1 1265; GFX8-NEXT: v_readfirstlane_b32 s2, v2 1266; GFX8-NEXT: v_readfirstlane_b32 s3, v3 1267; GFX8-NEXT: v_readfirstlane_b32 s4, v4 1268; GFX8-NEXT: ; return to shader part epilog 1269; 1270; GFX9-LABEL: s_uaddsat_v5i32: 1271; GFX9: ; %bb.0: 1272; GFX9-NEXT: 
v_mov_b32_e32 v0, s5 1273; GFX9-NEXT: v_mov_b32_e32 v1, s6 1274; GFX9-NEXT: v_mov_b32_e32 v2, s7 1275; GFX9-NEXT: v_mov_b32_e32 v3, s8 1276; GFX9-NEXT: v_mov_b32_e32 v4, s9 1277; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp 1278; GFX9-NEXT: v_add_u32_e64 v1, s1, v1 clamp 1279; GFX9-NEXT: v_add_u32_e64 v2, s2, v2 clamp 1280; GFX9-NEXT: v_add_u32_e64 v3, s3, v3 clamp 1281; GFX9-NEXT: v_add_u32_e64 v4, s4, v4 clamp 1282; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1283; GFX9-NEXT: v_readfirstlane_b32 s1, v1 1284; GFX9-NEXT: v_readfirstlane_b32 s2, v2 1285; GFX9-NEXT: v_readfirstlane_b32 s3, v3 1286; GFX9-NEXT: v_readfirstlane_b32 s4, v4 1287; GFX9-NEXT: ; return to shader part epilog 1288; 1289; GFX10-LABEL: s_uaddsat_v5i32: 1290; GFX10: ; %bb.0: 1291; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, s5 clamp 1292; GFX10-NEXT: v_add_nc_u32_e64 v1, s1, s6 clamp 1293; GFX10-NEXT: v_add_nc_u32_e64 v2, s2, s7 clamp 1294; GFX10-NEXT: v_add_nc_u32_e64 v3, s3, s8 clamp 1295; GFX10-NEXT: v_add_nc_u32_e64 v4, s4, s9 clamp 1296; GFX10-NEXT: v_readfirstlane_b32 s0, v0 1297; GFX10-NEXT: v_readfirstlane_b32 s1, v1 1298; GFX10-NEXT: v_readfirstlane_b32 s2, v2 1299; GFX10-NEXT: v_readfirstlane_b32 s3, v3 1300; GFX10-NEXT: v_readfirstlane_b32 s4, v4 1301; GFX10-NEXT: ; return to shader part epilog 1302 %result = call <5 x i32> @llvm.uadd.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs) 1303 ret <5 x i32> %result 1304} 1305 1306define <16 x i32> @v_uaddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { 1307; GFX6-LABEL: v_uaddsat_v16i32: 1308; GFX6: ; %bb.0: 1309; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1310; GFX6-NEXT: v_xor_b32_e32 v32, -1, v0 1311; GFX6-NEXT: v_min_u32_e32 v16, v32, v16 1312; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v16 1313; GFX6-NEXT: v_xor_b32_e32 v16, -1, v1 1314; GFX6-NEXT: v_min_u32_e32 v16, v16, v17 1315; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v16 1316; GFX6-NEXT: v_xor_b32_e32 v16, -1, v2 1317; GFX6-NEXT: v_min_u32_e32 v16, v16, v18 1318; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v16 
1319; GFX6-NEXT: v_xor_b32_e32 v16, -1, v3 1320; GFX6-NEXT: v_min_u32_e32 v16, v16, v19 1321; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v16 1322; GFX6-NEXT: v_xor_b32_e32 v16, -1, v4 1323; GFX6-NEXT: v_min_u32_e32 v16, v16, v20 1324; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v16 1325; GFX6-NEXT: v_xor_b32_e32 v16, -1, v5 1326; GFX6-NEXT: v_min_u32_e32 v16, v16, v21 1327; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v16 1328; GFX6-NEXT: v_xor_b32_e32 v16, -1, v6 1329; GFX6-NEXT: v_min_u32_e32 v16, v16, v22 1330; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v16 1331; GFX6-NEXT: v_xor_b32_e32 v16, -1, v7 1332; GFX6-NEXT: v_min_u32_e32 v16, v16, v23 1333; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v16 1334; GFX6-NEXT: v_xor_b32_e32 v16, -1, v8 1335; GFX6-NEXT: v_min_u32_e32 v16, v16, v24 1336; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v16 1337; GFX6-NEXT: v_xor_b32_e32 v16, -1, v9 1338; GFX6-NEXT: v_min_u32_e32 v16, v16, v25 1339; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v16 1340; GFX6-NEXT: v_xor_b32_e32 v16, -1, v10 1341; GFX6-NEXT: v_min_u32_e32 v16, v16, v26 1342; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v16 1343; GFX6-NEXT: v_xor_b32_e32 v16, -1, v11 1344; GFX6-NEXT: v_min_u32_e32 v16, v16, v27 1345; GFX6-NEXT: v_add_i32_e32 v11, vcc, v11, v16 1346; GFX6-NEXT: v_xor_b32_e32 v16, -1, v12 1347; GFX6-NEXT: v_min_u32_e32 v16, v16, v28 1348; GFX6-NEXT: v_add_i32_e32 v12, vcc, v12, v16 1349; GFX6-NEXT: v_xor_b32_e32 v16, -1, v13 1350; GFX6-NEXT: v_min_u32_e32 v16, v16, v29 1351; GFX6-NEXT: v_add_i32_e32 v13, vcc, v13, v16 1352; GFX6-NEXT: v_xor_b32_e32 v16, -1, v14 1353; GFX6-NEXT: v_min_u32_e32 v16, v16, v30 1354; GFX6-NEXT: v_add_i32_e32 v14, vcc, v14, v16 1355; GFX6-NEXT: v_xor_b32_e32 v16, -1, v15 1356; GFX6-NEXT: v_min_u32_e32 v16, v16, v31 1357; GFX6-NEXT: v_add_i32_e32 v15, vcc, v15, v16 1358; GFX6-NEXT: s_setpc_b64 s[30:31] 1359; 1360; GFX8-LABEL: v_uaddsat_v16i32: 1361; GFX8: ; %bb.0: 1362; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1363; GFX8-NEXT: v_add_u32_e64 v0, s[4:5], v0, v16 
clamp 1364; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v1, v17 clamp 1365; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v2, v18 clamp 1366; GFX8-NEXT: v_add_u32_e64 v3, s[4:5], v3, v19 clamp 1367; GFX8-NEXT: v_add_u32_e64 v4, s[4:5], v4, v20 clamp 1368; GFX8-NEXT: v_add_u32_e64 v5, s[4:5], v5, v21 clamp 1369; GFX8-NEXT: v_add_u32_e64 v6, s[4:5], v6, v22 clamp 1370; GFX8-NEXT: v_add_u32_e64 v7, s[4:5], v7, v23 clamp 1371; GFX8-NEXT: v_add_u32_e64 v8, s[4:5], v8, v24 clamp 1372; GFX8-NEXT: v_add_u32_e64 v9, s[4:5], v9, v25 clamp 1373; GFX8-NEXT: v_add_u32_e64 v10, s[4:5], v10, v26 clamp 1374; GFX8-NEXT: v_add_u32_e64 v11, s[4:5], v11, v27 clamp 1375; GFX8-NEXT: v_add_u32_e64 v12, s[4:5], v12, v28 clamp 1376; GFX8-NEXT: v_add_u32_e64 v13, s[4:5], v13, v29 clamp 1377; GFX8-NEXT: v_add_u32_e64 v14, s[4:5], v14, v30 clamp 1378; GFX8-NEXT: v_add_u32_e64 v15, s[4:5], v15, v31 clamp 1379; GFX8-NEXT: s_setpc_b64 s[30:31] 1380; 1381; GFX9-LABEL: v_uaddsat_v16i32: 1382; GFX9: ; %bb.0: 1383; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1384; GFX9-NEXT: v_add_u32_e64 v0, v0, v16 clamp 1385; GFX9-NEXT: v_add_u32_e64 v1, v1, v17 clamp 1386; GFX9-NEXT: v_add_u32_e64 v2, v2, v18 clamp 1387; GFX9-NEXT: v_add_u32_e64 v3, v3, v19 clamp 1388; GFX9-NEXT: v_add_u32_e64 v4, v4, v20 clamp 1389; GFX9-NEXT: v_add_u32_e64 v5, v5, v21 clamp 1390; GFX9-NEXT: v_add_u32_e64 v6, v6, v22 clamp 1391; GFX9-NEXT: v_add_u32_e64 v7, v7, v23 clamp 1392; GFX9-NEXT: v_add_u32_e64 v8, v8, v24 clamp 1393; GFX9-NEXT: v_add_u32_e64 v9, v9, v25 clamp 1394; GFX9-NEXT: v_add_u32_e64 v10, v10, v26 clamp 1395; GFX9-NEXT: v_add_u32_e64 v11, v11, v27 clamp 1396; GFX9-NEXT: v_add_u32_e64 v12, v12, v28 clamp 1397; GFX9-NEXT: v_add_u32_e64 v13, v13, v29 clamp 1398; GFX9-NEXT: v_add_u32_e64 v14, v14, v30 clamp 1399; GFX9-NEXT: v_add_u32_e64 v15, v15, v31 clamp 1400; GFX9-NEXT: s_setpc_b64 s[30:31] 1401; 1402; GFX10-LABEL: v_uaddsat_v16i32: 1403; GFX10: ; %bb.0: 1404; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1405; 
GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1406; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, v16 clamp 1407; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, v17 clamp 1408; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, v18 clamp 1409; GFX10-NEXT: v_add_nc_u32_e64 v3, v3, v19 clamp 1410; GFX10-NEXT: v_add_nc_u32_e64 v4, v4, v20 clamp 1411; GFX10-NEXT: v_add_nc_u32_e64 v5, v5, v21 clamp 1412; GFX10-NEXT: v_add_nc_u32_e64 v6, v6, v22 clamp 1413; GFX10-NEXT: v_add_nc_u32_e64 v7, v7, v23 clamp 1414; GFX10-NEXT: v_add_nc_u32_e64 v8, v8, v24 clamp 1415; GFX10-NEXT: v_add_nc_u32_e64 v9, v9, v25 clamp 1416; GFX10-NEXT: v_add_nc_u32_e64 v10, v10, v26 clamp 1417; GFX10-NEXT: v_add_nc_u32_e64 v11, v11, v27 clamp 1418; GFX10-NEXT: v_add_nc_u32_e64 v12, v12, v28 clamp 1419; GFX10-NEXT: v_add_nc_u32_e64 v13, v13, v29 clamp 1420; GFX10-NEXT: v_add_nc_u32_e64 v14, v14, v30 clamp 1421; GFX10-NEXT: v_add_nc_u32_e64 v15, v15, v31 clamp 1422; GFX10-NEXT: s_setpc_b64 s[30:31] 1423 %result = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) 1424 ret <16 x i32> %result 1425} 1426 1427define amdgpu_ps <16 x i32> @s_uaddsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> inreg %rhs) { 1428; GFX6-LABEL: s_uaddsat_v16i32: 1429; GFX6: ; %bb.0: 1430; GFX6-NEXT: s_not_b32 s32, s0 1431; GFX6-NEXT: s_min_u32 s16, s32, s16 1432; GFX6-NEXT: s_add_i32 s0, s0, s16 1433; GFX6-NEXT: s_not_b32 s16, s1 1434; GFX6-NEXT: s_min_u32 s16, s16, s17 1435; GFX6-NEXT: s_add_i32 s1, s1, s16 1436; GFX6-NEXT: s_not_b32 s16, s2 1437; GFX6-NEXT: s_min_u32 s16, s16, s18 1438; GFX6-NEXT: s_add_i32 s2, s2, s16 1439; GFX6-NEXT: s_not_b32 s16, s3 1440; GFX6-NEXT: s_min_u32 s16, s16, s19 1441; GFX6-NEXT: s_add_i32 s3, s3, s16 1442; GFX6-NEXT: s_not_b32 s16, s4 1443; GFX6-NEXT: s_min_u32 s16, s16, s20 1444; GFX6-NEXT: s_add_i32 s4, s4, s16 1445; GFX6-NEXT: s_not_b32 s16, s5 1446; GFX6-NEXT: s_min_u32 s16, s16, s21 1447; GFX6-NEXT: s_add_i32 s5, s5, s16 1448; GFX6-NEXT: s_not_b32 s16, s6 1449; GFX6-NEXT: s_min_u32 s16, s16, s22 1450; 
GFX6-NEXT: s_add_i32 s6, s6, s16 1451; GFX6-NEXT: s_not_b32 s16, s7 1452; GFX6-NEXT: s_min_u32 s16, s16, s23 1453; GFX6-NEXT: s_add_i32 s7, s7, s16 1454; GFX6-NEXT: s_not_b32 s16, s8 1455; GFX6-NEXT: s_min_u32 s16, s16, s24 1456; GFX6-NEXT: s_add_i32 s8, s8, s16 1457; GFX6-NEXT: s_not_b32 s16, s9 1458; GFX6-NEXT: s_min_u32 s16, s16, s25 1459; GFX6-NEXT: s_add_i32 s9, s9, s16 1460; GFX6-NEXT: s_not_b32 s16, s10 1461; GFX6-NEXT: s_min_u32 s16, s16, s26 1462; GFX6-NEXT: s_add_i32 s10, s10, s16 1463; GFX6-NEXT: s_not_b32 s16, s11 1464; GFX6-NEXT: s_min_u32 s16, s16, s27 1465; GFX6-NEXT: s_add_i32 s11, s11, s16 1466; GFX6-NEXT: s_not_b32 s16, s12 1467; GFX6-NEXT: s_min_u32 s16, s16, s28 1468; GFX6-NEXT: s_add_i32 s12, s12, s16 1469; GFX6-NEXT: s_not_b32 s16, s13 1470; GFX6-NEXT: s_min_u32 s16, s16, s29 1471; GFX6-NEXT: s_add_i32 s13, s13, s16 1472; GFX6-NEXT: s_not_b32 s16, s14 1473; GFX6-NEXT: s_min_u32 s16, s16, s30 1474; GFX6-NEXT: s_add_i32 s14, s14, s16 1475; GFX6-NEXT: s_not_b32 s16, s15 1476; GFX6-NEXT: s_min_u32 s16, s16, s31 1477; GFX6-NEXT: s_add_i32 s15, s15, s16 1478; GFX6-NEXT: ; return to shader part epilog 1479; 1480; GFX8-LABEL: s_uaddsat_v16i32: 1481; GFX8: ; %bb.0: 1482; GFX8-NEXT: v_mov_b32_e32 v0, s16 1483; GFX8-NEXT: v_mov_b32_e32 v1, s17 1484; GFX8-NEXT: v_mov_b32_e32 v2, s18 1485; GFX8-NEXT: v_mov_b32_e32 v3, s19 1486; GFX8-NEXT: v_mov_b32_e32 v4, s20 1487; GFX8-NEXT: v_mov_b32_e32 v5, s21 1488; GFX8-NEXT: v_mov_b32_e32 v6, s22 1489; GFX8-NEXT: v_mov_b32_e32 v7, s23 1490; GFX8-NEXT: v_mov_b32_e32 v8, s24 1491; GFX8-NEXT: v_mov_b32_e32 v9, s25 1492; GFX8-NEXT: v_mov_b32_e32 v10, s26 1493; GFX8-NEXT: v_mov_b32_e32 v11, s27 1494; GFX8-NEXT: v_mov_b32_e32 v12, s28 1495; GFX8-NEXT: v_mov_b32_e32 v13, s29 1496; GFX8-NEXT: v_mov_b32_e32 v14, s30 1497; GFX8-NEXT: v_mov_b32_e32 v15, s31 1498; GFX8-NEXT: v_add_u32_e64 v0, s[32:33], s0, v0 clamp 1499; GFX8-NEXT: v_add_u32_e64 v1, s[16:17], s1, v1 clamp 1500; GFX8-NEXT: v_add_u32_e64 v2, s[16:17], s2, v2 
clamp 1501; GFX8-NEXT: v_add_u32_e64 v3, s[2:3], s3, v3 clamp 1502; GFX8-NEXT: v_add_u32_e64 v4, s[2:3], s4, v4 clamp 1503; GFX8-NEXT: v_add_u32_e64 v5, s[2:3], s5, v5 clamp 1504; GFX8-NEXT: v_add_u32_e64 v6, s[2:3], s6, v6 clamp 1505; GFX8-NEXT: v_add_u32_e64 v7, s[2:3], s7, v7 clamp 1506; GFX8-NEXT: v_add_u32_e64 v8, s[2:3], s8, v8 clamp 1507; GFX8-NEXT: v_add_u32_e64 v9, s[2:3], s9, v9 clamp 1508; GFX8-NEXT: v_add_u32_e64 v10, s[2:3], s10, v10 clamp 1509; GFX8-NEXT: v_add_u32_e64 v11, s[2:3], s11, v11 clamp 1510; GFX8-NEXT: v_add_u32_e64 v12, s[2:3], s12, v12 clamp 1511; GFX8-NEXT: v_add_u32_e64 v13, s[2:3], s13, v13 clamp 1512; GFX8-NEXT: v_add_u32_e64 v14, s[2:3], s14, v14 clamp 1513; GFX8-NEXT: v_add_u32_e64 v15, s[2:3], s15, v15 clamp 1514; GFX8-NEXT: v_readfirstlane_b32 s0, v0 1515; GFX8-NEXT: v_readfirstlane_b32 s1, v1 1516; GFX8-NEXT: v_readfirstlane_b32 s2, v2 1517; GFX8-NEXT: v_readfirstlane_b32 s3, v3 1518; GFX8-NEXT: v_readfirstlane_b32 s4, v4 1519; GFX8-NEXT: v_readfirstlane_b32 s5, v5 1520; GFX8-NEXT: v_readfirstlane_b32 s6, v6 1521; GFX8-NEXT: v_readfirstlane_b32 s7, v7 1522; GFX8-NEXT: v_readfirstlane_b32 s8, v8 1523; GFX8-NEXT: v_readfirstlane_b32 s9, v9 1524; GFX8-NEXT: v_readfirstlane_b32 s10, v10 1525; GFX8-NEXT: v_readfirstlane_b32 s11, v11 1526; GFX8-NEXT: v_readfirstlane_b32 s12, v12 1527; GFX8-NEXT: v_readfirstlane_b32 s13, v13 1528; GFX8-NEXT: v_readfirstlane_b32 s14, v14 1529; GFX8-NEXT: v_readfirstlane_b32 s15, v15 1530; GFX8-NEXT: ; return to shader part epilog 1531; 1532; GFX9-LABEL: s_uaddsat_v16i32: 1533; GFX9: ; %bb.0: 1534; GFX9-NEXT: v_mov_b32_e32 v0, s16 1535; GFX9-NEXT: v_mov_b32_e32 v1, s17 1536; GFX9-NEXT: v_mov_b32_e32 v2, s18 1537; GFX9-NEXT: v_mov_b32_e32 v3, s19 1538; GFX9-NEXT: v_mov_b32_e32 v4, s20 1539; GFX9-NEXT: v_mov_b32_e32 v5, s21 1540; GFX9-NEXT: v_mov_b32_e32 v6, s22 1541; GFX9-NEXT: v_mov_b32_e32 v7, s23 1542; GFX9-NEXT: v_mov_b32_e32 v8, s24 1543; GFX9-NEXT: v_mov_b32_e32 v9, s25 1544; GFX9-NEXT: v_mov_b32_e32 
v10, s26 1545; GFX9-NEXT: v_mov_b32_e32 v11, s27 1546; GFX9-NEXT: v_mov_b32_e32 v12, s28 1547; GFX9-NEXT: v_mov_b32_e32 v13, s29 1548; GFX9-NEXT: v_mov_b32_e32 v14, s30 1549; GFX9-NEXT: v_mov_b32_e32 v15, s31 1550; GFX9-NEXT: v_add_u32_e64 v0, s0, v0 clamp 1551; GFX9-NEXT: v_add_u32_e64 v1, s1, v1 clamp 1552; GFX9-NEXT: v_add_u32_e64 v2, s2, v2 clamp 1553; GFX9-NEXT: v_add_u32_e64 v3, s3, v3 clamp 1554; GFX9-NEXT: v_add_u32_e64 v4, s4, v4 clamp 1555; GFX9-NEXT: v_add_u32_e64 v5, s5, v5 clamp 1556; GFX9-NEXT: v_add_u32_e64 v6, s6, v6 clamp 1557; GFX9-NEXT: v_add_u32_e64 v7, s7, v7 clamp 1558; GFX9-NEXT: v_add_u32_e64 v8, s8, v8 clamp 1559; GFX9-NEXT: v_add_u32_e64 v9, s9, v9 clamp 1560; GFX9-NEXT: v_add_u32_e64 v10, s10, v10 clamp 1561; GFX9-NEXT: v_add_u32_e64 v11, s11, v11 clamp 1562; GFX9-NEXT: v_add_u32_e64 v12, s12, v12 clamp 1563; GFX9-NEXT: v_add_u32_e64 v13, s13, v13 clamp 1564; GFX9-NEXT: v_add_u32_e64 v14, s14, v14 clamp 1565; GFX9-NEXT: v_add_u32_e64 v15, s15, v15 clamp 1566; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1567; GFX9-NEXT: v_readfirstlane_b32 s1, v1 1568; GFX9-NEXT: v_readfirstlane_b32 s2, v2 1569; GFX9-NEXT: v_readfirstlane_b32 s3, v3 1570; GFX9-NEXT: v_readfirstlane_b32 s4, v4 1571; GFX9-NEXT: v_readfirstlane_b32 s5, v5 1572; GFX9-NEXT: v_readfirstlane_b32 s6, v6 1573; GFX9-NEXT: v_readfirstlane_b32 s7, v7 1574; GFX9-NEXT: v_readfirstlane_b32 s8, v8 1575; GFX9-NEXT: v_readfirstlane_b32 s9, v9 1576; GFX9-NEXT: v_readfirstlane_b32 s10, v10 1577; GFX9-NEXT: v_readfirstlane_b32 s11, v11 1578; GFX9-NEXT: v_readfirstlane_b32 s12, v12 1579; GFX9-NEXT: v_readfirstlane_b32 s13, v13 1580; GFX9-NEXT: v_readfirstlane_b32 s14, v14 1581; GFX9-NEXT: v_readfirstlane_b32 s15, v15 1582; GFX9-NEXT: ; return to shader part epilog 1583; 1584; GFX10-LABEL: s_uaddsat_v16i32: 1585; GFX10: ; %bb.0: 1586; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, s16 clamp 1587; GFX10-NEXT: v_add_nc_u32_e64 v1, s1, s17 clamp 1588; GFX10-NEXT: v_add_nc_u32_e64 v2, s2, s18 clamp 1589; 
GFX10-NEXT: v_add_nc_u32_e64 v3, s3, s19 clamp 1590; GFX10-NEXT: v_add_nc_u32_e64 v4, s4, s20 clamp 1591; GFX10-NEXT: v_add_nc_u32_e64 v5, s5, s21 clamp 1592; GFX10-NEXT: v_add_nc_u32_e64 v6, s6, s22 clamp 1593; GFX10-NEXT: v_add_nc_u32_e64 v7, s7, s23 clamp 1594; GFX10-NEXT: v_add_nc_u32_e64 v8, s8, s24 clamp 1595; GFX10-NEXT: v_add_nc_u32_e64 v9, s9, s25 clamp 1596; GFX10-NEXT: v_add_nc_u32_e64 v10, s10, s26 clamp 1597; GFX10-NEXT: v_add_nc_u32_e64 v11, s11, s27 clamp 1598; GFX10-NEXT: v_add_nc_u32_e64 v12, s12, s28 clamp 1599; GFX10-NEXT: v_add_nc_u32_e64 v13, s13, s29 clamp 1600; GFX10-NEXT: v_add_nc_u32_e64 v14, s14, s30 clamp 1601; GFX10-NEXT: v_add_nc_u32_e64 v15, s15, s31 clamp 1602; GFX10-NEXT: v_readfirstlane_b32 s0, v0 1603; GFX10-NEXT: v_readfirstlane_b32 s1, v1 1604; GFX10-NEXT: v_readfirstlane_b32 s2, v2 1605; GFX10-NEXT: v_readfirstlane_b32 s3, v3 1606; GFX10-NEXT: v_readfirstlane_b32 s4, v4 1607; GFX10-NEXT: v_readfirstlane_b32 s5, v5 1608; GFX10-NEXT: v_readfirstlane_b32 s6, v6 1609; GFX10-NEXT: v_readfirstlane_b32 s7, v7 1610; GFX10-NEXT: v_readfirstlane_b32 s8, v8 1611; GFX10-NEXT: v_readfirstlane_b32 s9, v9 1612; GFX10-NEXT: v_readfirstlane_b32 s10, v10 1613; GFX10-NEXT: v_readfirstlane_b32 s11, v11 1614; GFX10-NEXT: v_readfirstlane_b32 s12, v12 1615; GFX10-NEXT: v_readfirstlane_b32 s13, v13 1616; GFX10-NEXT: v_readfirstlane_b32 s14, v14 1617; GFX10-NEXT: v_readfirstlane_b32 s15, v15 1618; GFX10-NEXT: ; return to shader part epilog 1619 %result = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) 1620 ret <16 x i32> %result 1621} 1622 1623define i16 @v_uaddsat_i16(i16 %lhs, i16 %rhs) { 1624; GFX6-LABEL: v_uaddsat_i16: 1625; GFX6: ; %bb.0: 1626; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1627; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1628; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1629; GFX6-NEXT: v_xor_b32_e32 v2, -1, v0 1630; GFX6-NEXT: v_min_u32_e32 v1, v2, v1 1631; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 1632; 
GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1633; GFX6-NEXT: s_setpc_b64 s[30:31] 1634; 1635; GFX8-LABEL: v_uaddsat_i16: 1636; GFX8: ; %bb.0: 1637; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1638; GFX8-NEXT: v_add_u16_e64 v0, v0, v1 clamp 1639; GFX8-NEXT: s_setpc_b64 s[30:31] 1640; 1641; GFX9-LABEL: v_uaddsat_i16: 1642; GFX9: ; %bb.0: 1643; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1644; GFX9-NEXT: v_add_u16_e64 v0, v0, v1 clamp 1645; GFX9-NEXT: s_setpc_b64 s[30:31] 1646; 1647; GFX10-LABEL: v_uaddsat_i16: 1648; GFX10: ; %bb.0: 1649; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1650; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1651; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 clamp 1652; GFX10-NEXT: s_setpc_b64 s[30:31] 1653 %result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs) 1654 ret i16 %result 1655} 1656 1657define amdgpu_ps i16 @s_uaddsat_i16(i16 inreg %lhs, i16 inreg %rhs) { 1658; GFX6-LABEL: s_uaddsat_i16: 1659; GFX6: ; %bb.0: 1660; GFX6-NEXT: s_lshl_b32 s0, s0, 16 1661; GFX6-NEXT: s_lshl_b32 s1, s1, 16 1662; GFX6-NEXT: s_not_b32 s2, s0 1663; GFX6-NEXT: s_min_u32 s1, s2, s1 1664; GFX6-NEXT: s_add_i32 s0, s0, s1 1665; GFX6-NEXT: s_lshr_b32 s0, s0, 16 1666; GFX6-NEXT: ; return to shader part epilog 1667; 1668; GFX8-LABEL: s_uaddsat_i16: 1669; GFX8: ; %bb.0: 1670; GFX8-NEXT: v_mov_b32_e32 v0, s1 1671; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp 1672; GFX8-NEXT: v_readfirstlane_b32 s0, v0 1673; GFX8-NEXT: ; return to shader part epilog 1674; 1675; GFX9-LABEL: s_uaddsat_i16: 1676; GFX9: ; %bb.0: 1677; GFX9-NEXT: v_mov_b32_e32 v0, s1 1678; GFX9-NEXT: v_add_u16_e64 v0, s0, v0 clamp 1679; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1680; GFX9-NEXT: ; return to shader part epilog 1681; 1682; GFX10-LABEL: s_uaddsat_i16: 1683; GFX10: ; %bb.0: 1684; GFX10-NEXT: v_add_nc_u16 v0, s0, s1 clamp 1685; GFX10-NEXT: v_readfirstlane_b32 s0, v0 1686; GFX10-NEXT: ; return to shader part epilog 1687 %result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs) 1688 ret i16 %result 
1689} 1690 1691define amdgpu_ps half @uaddsat_i16_sv(i16 inreg %lhs, i16 %rhs) { 1692; GFX6-LABEL: uaddsat_i16_sv: 1693; GFX6: ; %bb.0: 1694; GFX6-NEXT: s_lshl_b32 s0, s0, 16 1695; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1696; GFX6-NEXT: s_not_b32 s1, s0 1697; GFX6-NEXT: v_min_u32_e32 v0, s1, v0 1698; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 1699; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1700; GFX6-NEXT: ; return to shader part epilog 1701; 1702; GFX8-LABEL: uaddsat_i16_sv: 1703; GFX8: ; %bb.0: 1704; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp 1705; GFX8-NEXT: ; return to shader part epilog 1706; 1707; GFX9-LABEL: uaddsat_i16_sv: 1708; GFX9: ; %bb.0: 1709; GFX9-NEXT: v_add_u16_e64 v0, s0, v0 clamp 1710; GFX9-NEXT: ; return to shader part epilog 1711; 1712; GFX10-LABEL: uaddsat_i16_sv: 1713; GFX10: ; %bb.0: 1714; GFX10-NEXT: v_add_nc_u16 v0, s0, v0 clamp 1715; GFX10-NEXT: ; return to shader part epilog 1716 %result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs) 1717 %cast = bitcast i16 %result to half 1718 ret half %cast 1719} 1720 1721define amdgpu_ps half @uaddsat_i16_vs(i16 %lhs, i16 inreg %rhs) { 1722; GFX6-LABEL: uaddsat_i16_vs: 1723; GFX6: ; %bb.0: 1724; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1725; GFX6-NEXT: s_lshl_b32 s0, s0, 16 1726; GFX6-NEXT: v_xor_b32_e32 v1, -1, v0 1727; GFX6-NEXT: v_min_u32_e32 v1, s0, v1 1728; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 1729; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1730; GFX6-NEXT: ; return to shader part epilog 1731; 1732; GFX8-LABEL: uaddsat_i16_vs: 1733; GFX8: ; %bb.0: 1734; GFX8-NEXT: v_add_u16_e64 v0, v0, s0 clamp 1735; GFX8-NEXT: ; return to shader part epilog 1736; 1737; GFX9-LABEL: uaddsat_i16_vs: 1738; GFX9: ; %bb.0: 1739; GFX9-NEXT: v_add_u16_e64 v0, v0, s0 clamp 1740; GFX9-NEXT: ; return to shader part epilog 1741; 1742; GFX10-LABEL: uaddsat_i16_vs: 1743; GFX10: ; %bb.0: 1744; GFX10-NEXT: v_add_nc_u16 v0, v0, s0 clamp 1745; GFX10-NEXT: ; return to shader part epilog 1746 %result = call i16 
@llvm.uadd.sat.i16(i16 %lhs, i16 %rhs) 1747 %cast = bitcast i16 %result to half 1748 ret half %cast 1749} 1750 1751define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { 1752; GFX6-LABEL: v_uaddsat_v2i16: 1753; GFX6: ; %bb.0: 1754; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1755; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1756; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 1757; GFX6-NEXT: v_xor_b32_e32 v4, -1, v0 1758; GFX6-NEXT: v_min_u32_e32 v2, v4, v2 1759; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1760; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 1761; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 1762; GFX6-NEXT: v_xor_b32_e32 v3, -1, v1 1763; GFX6-NEXT: v_min_u32_e32 v2, v3, v2 1764; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 1765; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1766; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1767; GFX6-NEXT: s_setpc_b64 s[30:31] 1768; 1769; GFX8-LABEL: v_uaddsat_v2i16: 1770; GFX8: ; %bb.0: 1771; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1772; GFX8-NEXT: v_add_u16_e64 v2, v0, v1 clamp 1773; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 1774; GFX8-NEXT: v_mov_b32_e32 v1, 16 1775; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 1776; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1777; GFX8-NEXT: s_setpc_b64 s[30:31] 1778; 1779; GFX9-LABEL: v_uaddsat_v2i16: 1780; GFX9: ; %bb.0: 1781; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1782; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 clamp 1783; GFX9-NEXT: s_setpc_b64 s[30:31] 1784; 1785; GFX10-LABEL: v_uaddsat_v2i16: 1786; GFX10: ; %bb.0: 1787; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1788; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1789; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 clamp 1790; GFX10-NEXT: s_setpc_b64 s[30:31] 1791 %result = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) 1792 
ret <2 x i16> %result 1793} 1794 1795define amdgpu_ps i32 @s_uaddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) { 1796; GFX6-LABEL: s_uaddsat_v2i16: 1797; GFX6: ; %bb.0: 1798; GFX6-NEXT: s_lshl_b32 s0, s0, 16 1799; GFX6-NEXT: s_lshl_b32 s2, s2, 16 1800; GFX6-NEXT: s_not_b32 s4, s0 1801; GFX6-NEXT: s_min_u32 s2, s4, s2 1802; GFX6-NEXT: s_lshl_b32 s1, s1, 16 1803; GFX6-NEXT: s_add_i32 s0, s0, s2 1804; GFX6-NEXT: s_lshl_b32 s2, s3, 16 1805; GFX6-NEXT: s_not_b32 s3, s1 1806; GFX6-NEXT: s_min_u32 s2, s3, s2 1807; GFX6-NEXT: s_add_i32 s1, s1, s2 1808; GFX6-NEXT: s_lshr_b32 s1, s1, 16 1809; GFX6-NEXT: s_lshr_b32 s0, s0, 16 1810; GFX6-NEXT: s_lshl_b32 s1, s1, 16 1811; GFX6-NEXT: s_or_b32 s0, s0, s1 1812; GFX6-NEXT: ; return to shader part epilog 1813; 1814; GFX8-LABEL: s_uaddsat_v2i16: 1815; GFX8: ; %bb.0: 1816; GFX8-NEXT: s_lshr_b32 s3, s1, 16 1817; GFX8-NEXT: s_lshr_b32 s2, s0, 16 1818; GFX8-NEXT: v_mov_b32_e32 v1, s3 1819; GFX8-NEXT: v_mov_b32_e32 v0, s1 1820; GFX8-NEXT: v_add_u16_e64 v1, s2, v1 clamp 1821; GFX8-NEXT: v_mov_b32_e32 v2, 16 1822; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp 1823; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 1824; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1825; GFX8-NEXT: v_readfirstlane_b32 s0, v0 1826; GFX8-NEXT: ; return to shader part epilog 1827; 1828; GFX9-LABEL: s_uaddsat_v2i16: 1829; GFX9: ; %bb.0: 1830; GFX9-NEXT: v_mov_b32_e32 v0, s1 1831; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp 1832; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1833; GFX9-NEXT: ; return to shader part epilog 1834; 1835; GFX10-LABEL: s_uaddsat_v2i16: 1836; GFX10: ; %bb.0: 1837; GFX10-NEXT: v_pk_add_u16 v0, s0, s1 clamp 1838; GFX10-NEXT: v_readfirstlane_b32 s0, v0 1839; GFX10-NEXT: ; return to shader part epilog 1840 %result = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) 1841 %cast = bitcast <2 x i16> %result to i32 1842 
ret i32 %cast 1843} 1844 1845define amdgpu_ps float @uaddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { 1846; GFX6-LABEL: uaddsat_v2i16_sv: 1847; GFX6: ; %bb.0: 1848; GFX6-NEXT: s_lshl_b32 s0, s0, 16 1849; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1850; GFX6-NEXT: s_not_b32 s2, s0 1851; GFX6-NEXT: v_min_u32_e32 v0, s2, v0 1852; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 1853; GFX6-NEXT: s_lshl_b32 s0, s1, 16 1854; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1855; GFX6-NEXT: s_not_b32 s1, s0 1856; GFX6-NEXT: v_min_u32_e32 v1, s1, v1 1857; GFX6-NEXT: v_add_i32_e32 v1, vcc, s0, v1 1858; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1859; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1860; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1861; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 1862; GFX6-NEXT: ; return to shader part epilog 1863; 1864; GFX8-LABEL: uaddsat_v2i16_sv: 1865; GFX8: ; %bb.0: 1866; GFX8-NEXT: s_lshr_b32 s1, s0, 16 1867; GFX8-NEXT: v_mov_b32_e32 v2, s1 1868; GFX8-NEXT: v_add_u16_e64 v1, s0, v0 clamp 1869; GFX8-NEXT: v_add_u16_sdwa v0, v2, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1870; GFX8-NEXT: v_mov_b32_e32 v2, 16 1871; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 1872; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1873; GFX8-NEXT: ; return to shader part epilog 1874; 1875; GFX9-LABEL: uaddsat_v2i16_sv: 1876; GFX9: ; %bb.0: 1877; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp 1878; GFX9-NEXT: ; return to shader part epilog 1879; 1880; GFX10-LABEL: uaddsat_v2i16_sv: 1881; GFX10: ; %bb.0: 1882; GFX10-NEXT: v_pk_add_u16 v0, s0, v0 clamp 1883; GFX10-NEXT: ; return to shader part epilog 1884 %result = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) 1885 %cast = bitcast <2 x i16> %result to float 1886 ret float %cast 1887} 1888 1889define amdgpu_ps float @uaddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { 1890; 
GFX6-LABEL: uaddsat_v2i16_vs: 1891; GFX6: ; %bb.0: 1892; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1893; GFX6-NEXT: s_lshl_b32 s0, s0, 16 1894; GFX6-NEXT: v_xor_b32_e32 v2, -1, v0 1895; GFX6-NEXT: v_min_u32_e32 v2, s0, v2 1896; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1897; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 1898; GFX6-NEXT: s_lshl_b32 s0, s1, 16 1899; GFX6-NEXT: v_xor_b32_e32 v2, -1, v1 1900; GFX6-NEXT: v_min_u32_e32 v2, s0, v2 1901; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 1902; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1903; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1904; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1905; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 1906; GFX6-NEXT: ; return to shader part epilog 1907; 1908; GFX8-LABEL: uaddsat_v2i16_vs: 1909; GFX8: ; %bb.0: 1910; GFX8-NEXT: s_lshr_b32 s1, s0, 16 1911; GFX8-NEXT: v_mov_b32_e32 v2, s1 1912; GFX8-NEXT: v_add_u16_e64 v1, v0, s0 clamp 1913; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1914; GFX8-NEXT: v_mov_b32_e32 v2, 16 1915; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 1916; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1917; GFX8-NEXT: ; return to shader part epilog 1918; 1919; GFX9-LABEL: uaddsat_v2i16_vs: 1920; GFX9: ; %bb.0: 1921; GFX9-NEXT: v_pk_add_u16 v0, v0, s0 clamp 1922; GFX9-NEXT: ; return to shader part epilog 1923; 1924; GFX10-LABEL: uaddsat_v2i16_vs: 1925; GFX10: ; %bb.0: 1926; GFX10-NEXT: v_pk_add_u16 v0, v0, s0 clamp 1927; GFX10-NEXT: ; return to shader part epilog 1928 %result = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) 1929 %cast = bitcast <2 x i16> %result to float 1930 ret float %cast 1931} 1932 1933; FIXME: v3i16 insert/extract 1934; define <3 x i16> @v_uaddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) { 1935; %result = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs) 
1936; ret <3 x i16> %result 1937; } 1938 1939; define amdgpu_ps <3 x i16> @s_uaddsat_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs) { 1940; %result = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs) 1941; ret <3 x i16> %result 1942; } 1943 1944define <2 x float> @v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { 1945; GFX6-LABEL: v_uaddsat_v4i16: 1946; GFX6: ; %bb.0: 1947; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1948; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1949; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 1950; GFX6-NEXT: v_xor_b32_e32 v8, -1, v0 1951; GFX6-NEXT: v_min_u32_e32 v4, v8, v4 1952; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1953; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 1954; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 1955; GFX6-NEXT: v_xor_b32_e32 v5, -1, v1 1956; GFX6-NEXT: v_min_u32_e32 v4, v5, v4 1957; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 1958; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 1959; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 1960; GFX6-NEXT: v_xor_b32_e32 v5, -1, v2 1961; GFX6-NEXT: v_min_u32_e32 v4, v5, v4 1962; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 1963; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 1964; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 1965; GFX6-NEXT: v_xor_b32_e32 v5, -1, v3 1966; GFX6-NEXT: v_min_u32_e32 v4, v5, v4 1967; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1968; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 1969; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1970; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 1971; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1972; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1973; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 1974; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 1975; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 1976; GFX6-NEXT: s_setpc_b64 s[30:31] 1977; 1978; GFX8-LABEL: v_uaddsat_v4i16: 1979; GFX8: ; %bb.0: 1980; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1981; GFX8-NEXT: v_add_u16_e64 v4, v0, v2 clamp 1982; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:WORD_1 1983; GFX8-NEXT: v_add_u16_e64 v2, v1, v3 clamp 1984; GFX8-NEXT: v_add_u16_sdwa v1, v1, v3 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 1985; GFX8-NEXT: v_mov_b32_e32 v3, 16 1986; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 1987; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 1988; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1989; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1990; GFX8-NEXT: s_setpc_b64 s[30:31] 1991; 1992; GFX9-LABEL: v_uaddsat_v4i16: 1993; GFX9: ; %bb.0: 1994; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1995; GFX9-NEXT: v_pk_add_u16 v0, v0, v2 clamp 1996; GFX9-NEXT: v_pk_add_u16 v1, v1, v3 clamp 1997; GFX9-NEXT: s_setpc_b64 s[30:31] 1998; 1999; GFX10-LABEL: v_uaddsat_v4i16: 2000; GFX10: ; %bb.0: 2001; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2002; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2003; GFX10-NEXT: v_pk_add_u16 v0, v0, v2 clamp 2004; GFX10-NEXT: v_pk_add_u16 v1, v1, v3 clamp 2005; GFX10-NEXT: s_setpc_b64 s[30:31] 2006 %result = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) 2007 %cast = bitcast <4 x i16> %result to <2 x float> 2008 ret <2 x float> %cast 2009} 2010 2011define amdgpu_ps <2 x i32> @s_uaddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs) { 2012; GFX6-LABEL: s_uaddsat_v4i16: 2013; GFX6: ; %bb.0: 2014; GFX6-NEXT: s_lshl_b32 s0, s0, 16 2015; GFX6-NEXT: s_lshl_b32 s4, s4, 16 2016; GFX6-NEXT: s_not_b32 s8, s0 2017; GFX6-NEXT: s_min_u32 s4, s8, s4 2018; GFX6-NEXT: s_lshl_b32 s1, s1, 16 2019; GFX6-NEXT: s_add_i32 s0, s0, s4 2020; GFX6-NEXT: s_lshl_b32 s4, s5, 16 2021; GFX6-NEXT: s_not_b32 s5, s1 2022; GFX6-NEXT: s_min_u32 s4, s5, s4 2023; GFX6-NEXT: s_lshl_b32 s2, s2, 16 2024; GFX6-NEXT: s_add_i32 
s1, s1, s4 2025; GFX6-NEXT: s_lshl_b32 s4, s6, 16 2026; GFX6-NEXT: s_not_b32 s5, s2 2027; GFX6-NEXT: s_min_u32 s4, s5, s4 2028; GFX6-NEXT: s_lshl_b32 s3, s3, 16 2029; GFX6-NEXT: s_add_i32 s2, s2, s4 2030; GFX6-NEXT: s_lshl_b32 s4, s7, 16 2031; GFX6-NEXT: s_not_b32 s5, s3 2032; GFX6-NEXT: s_min_u32 s4, s5, s4 2033; GFX6-NEXT: s_lshr_b32 s1, s1, 16 2034; GFX6-NEXT: s_add_i32 s3, s3, s4 2035; GFX6-NEXT: s_lshr_b32 s0, s0, 16 2036; GFX6-NEXT: s_lshr_b32 s3, s3, 16 2037; GFX6-NEXT: s_lshl_b32 s1, s1, 16 2038; GFX6-NEXT: s_lshr_b32 s2, s2, 16 2039; GFX6-NEXT: s_or_b32 s0, s0, s1 2040; GFX6-NEXT: s_lshl_b32 s1, s3, 16 2041; GFX6-NEXT: s_or_b32 s1, s2, s1 2042; GFX6-NEXT: ; return to shader part epilog 2043; 2044; GFX8-LABEL: s_uaddsat_v4i16: 2045; GFX8: ; %bb.0: 2046; GFX8-NEXT: s_lshr_b32 s6, s2, 16 2047; GFX8-NEXT: s_lshr_b32 s4, s0, 16 2048; GFX8-NEXT: s_lshr_b32 s7, s3, 16 2049; GFX8-NEXT: v_mov_b32_e32 v1, s6 2050; GFX8-NEXT: s_lshr_b32 s5, s1, 16 2051; GFX8-NEXT: v_mov_b32_e32 v0, s2 2052; GFX8-NEXT: v_add_u16_e64 v1, s4, v1 clamp 2053; GFX8-NEXT: v_mov_b32_e32 v3, s7 2054; GFX8-NEXT: v_mov_b32_e32 v4, 16 2055; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp 2056; GFX8-NEXT: v_mov_b32_e32 v2, s3 2057; GFX8-NEXT: v_add_u16_e64 v3, s5, v3 clamp 2058; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 2059; GFX8-NEXT: v_add_u16_e64 v2, s1, v2 clamp 2060; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2061; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 2062; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2063; GFX8-NEXT: v_readfirstlane_b32 s0, v0 2064; GFX8-NEXT: v_readfirstlane_b32 s1, v1 2065; GFX8-NEXT: ; return to shader part epilog 2066; 2067; GFX9-LABEL: s_uaddsat_v4i16: 2068; GFX9: ; %bb.0: 2069; GFX9-NEXT: v_mov_b32_e32 v0, s2 2070; 
GFX9-NEXT: v_mov_b32_e32 v1, s3 2071; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp 2072; GFX9-NEXT: v_pk_add_u16 v1, s1, v1 clamp 2073; GFX9-NEXT: v_readfirstlane_b32 s0, v0 2074; GFX9-NEXT: v_readfirstlane_b32 s1, v1 2075; GFX9-NEXT: ; return to shader part epilog 2076; 2077; GFX10-LABEL: s_uaddsat_v4i16: 2078; GFX10: ; %bb.0: 2079; GFX10-NEXT: v_pk_add_u16 v0, s0, s2 clamp 2080; GFX10-NEXT: v_pk_add_u16 v1, s1, s3 clamp 2081; GFX10-NEXT: v_readfirstlane_b32 s0, v0 2082; GFX10-NEXT: v_readfirstlane_b32 s1, v1 2083; GFX10-NEXT: ; return to shader part epilog 2084 %result = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) 2085 %cast = bitcast <4 x i16> %result to <2 x i32> 2086 ret <2 x i32> %cast 2087} 2088 2089; FIXME 2090; define <5 x i16> @v_uaddsat_v5i16(<5 x i16> %lhs, <5 x i16> %rhs) { 2091; %result = call <5 x i16> @llvm.uadd.sat.v5i16(<5 x i16> %lhs, <5 x i16> %rhs) 2092; ret <5 x i16> %result 2093; } 2094 2095; define amdgpu_ps <5 x i16> @s_uaddsat_v5i16(<5 x i16> inreg %lhs, <5 x i16> inreg %rhs) { 2096; %result = call <5 x i16> @llvm.uadd.sat.v5i16(<5 x i16> %lhs, <5 x i16> %rhs) 2097; ret <5 x i16> %result 2098; } 2099 2100define <3 x float> @v_uaddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { 2101; GFX6-LABEL: v_uaddsat_v6i16: 2102; GFX6: ; %bb.0: 2103; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2104; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2105; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 2106; GFX6-NEXT: v_xor_b32_e32 v12, -1, v0 2107; GFX6-NEXT: v_min_u32_e32 v6, v12, v6 2108; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2109; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v6 2110; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v7 2111; GFX6-NEXT: v_xor_b32_e32 v7, -1, v1 2112; GFX6-NEXT: v_min_u32_e32 v6, v7, v6 2113; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2114; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v6 2115; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v8 2116; GFX6-NEXT: v_xor_b32_e32 v7, -1, v2 2117; GFX6-NEXT: v_min_u32_e32 v6, v7, v6 2118; GFX6-NEXT: 
v_lshlrev_b32_e32 v3, 16, v3 2119; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 2120; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v9 2121; GFX6-NEXT: v_xor_b32_e32 v7, -1, v3 2122; GFX6-NEXT: v_min_u32_e32 v6, v7, v6 2123; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 2124; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 2125; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v10 2126; GFX6-NEXT: v_xor_b32_e32 v7, -1, v4 2127; GFX6-NEXT: v_min_u32_e32 v6, v7, v6 2128; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 2129; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 2130; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v11 2131; GFX6-NEXT: v_xor_b32_e32 v7, -1, v5 2132; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 2133; GFX6-NEXT: v_min_u32_e32 v6, v7, v6 2134; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 2135; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 2136; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 2137; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2138; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 2139; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 2140; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 2141; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 2142; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 2143; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 2144; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 2145; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 2146; GFX6-NEXT: s_setpc_b64 s[30:31] 2147; 2148; GFX8-LABEL: v_uaddsat_v6i16: 2149; GFX8: ; %bb.0: 2150; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2151; GFX8-NEXT: v_add_u16_e64 v6, v0, v3 clamp 2152; GFX8-NEXT: v_add_u16_sdwa v0, v0, v3 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2153; GFX8-NEXT: v_add_u16_e64 v3, v1, v4 clamp 2154; GFX8-NEXT: v_add_u16_sdwa v1, v1, v4 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2155; GFX8-NEXT: v_add_u16_e64 v4, v2, v5 clamp 2156; GFX8-NEXT: v_add_u16_sdwa v2, v2, v5 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2157; GFX8-NEXT: v_mov_b32_e32 v5, 16 2158; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 2159; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2160; GFX8-NEXT: v_mov_b32_e32 v3, 16 2161; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 2162; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 2163; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2164; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2165; GFX8-NEXT: s_setpc_b64 s[30:31] 2166; 2167; GFX9-LABEL: v_uaddsat_v6i16: 2168; GFX9: ; %bb.0: 2169; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2170; GFX9-NEXT: v_pk_add_u16 v0, v0, v3 clamp 2171; GFX9-NEXT: v_pk_add_u16 v1, v1, v4 clamp 2172; GFX9-NEXT: v_pk_add_u16 v2, v2, v5 clamp 2173; GFX9-NEXT: s_setpc_b64 s[30:31] 2174; 2175; GFX10-LABEL: v_uaddsat_v6i16: 2176; GFX10: ; %bb.0: 2177; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2178; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2179; GFX10-NEXT: v_pk_add_u16 v0, v0, v3 clamp 2180; GFX10-NEXT: v_pk_add_u16 v1, v1, v4 clamp 2181; GFX10-NEXT: v_pk_add_u16 v2, v2, v5 clamp 2182; GFX10-NEXT: s_setpc_b64 s[30:31] 2183 %result = call <6 x i16> @llvm.uadd.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs) 2184 %cast = bitcast <6 x i16> %result to <3 x float> 2185 ret <3 x float> %cast 2186} 2187 2188define amdgpu_ps <3 x i32> @s_uaddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inreg %rhs) { 2189; GFX6-LABEL: s_uaddsat_v6i16: 2190; GFX6: ; %bb.0: 2191; GFX6-NEXT: s_lshl_b32 s0, s0, 16 2192; GFX6-NEXT: s_lshl_b32 s6, s6, 16 2193; GFX6-NEXT: s_not_b32 s12, s0 2194; GFX6-NEXT: s_min_u32 s6, s12, s6 2195; GFX6-NEXT: s_lshl_b32 s1, s1, 16 2196; GFX6-NEXT: s_add_i32 s0, s0, s6 2197; GFX6-NEXT: s_lshl_b32 s6, s7, 16 2198; GFX6-NEXT: s_not_b32 s7, s1 2199; GFX6-NEXT: s_min_u32 s6, s7, s6 
2200; GFX6-NEXT: s_lshl_b32 s2, s2, 16 2201; GFX6-NEXT: s_add_i32 s1, s1, s6 2202; GFX6-NEXT: s_lshl_b32 s6, s8, 16 2203; GFX6-NEXT: s_not_b32 s7, s2 2204; GFX6-NEXT: s_min_u32 s6, s7, s6 2205; GFX6-NEXT: s_lshl_b32 s3, s3, 16 2206; GFX6-NEXT: s_add_i32 s2, s2, s6 2207; GFX6-NEXT: s_lshl_b32 s6, s9, 16 2208; GFX6-NEXT: s_not_b32 s7, s3 2209; GFX6-NEXT: s_min_u32 s6, s7, s6 2210; GFX6-NEXT: s_lshl_b32 s4, s4, 16 2211; GFX6-NEXT: s_add_i32 s3, s3, s6 2212; GFX6-NEXT: s_lshl_b32 s6, s10, 16 2213; GFX6-NEXT: s_not_b32 s7, s4 2214; GFX6-NEXT: s_min_u32 s6, s7, s6 2215; GFX6-NEXT: s_lshl_b32 s5, s5, 16 2216; GFX6-NEXT: s_add_i32 s4, s4, s6 2217; GFX6-NEXT: s_lshl_b32 s6, s11, 16 2218; GFX6-NEXT: s_not_b32 s7, s5 2219; GFX6-NEXT: s_lshr_b32 s1, s1, 16 2220; GFX6-NEXT: s_min_u32 s6, s7, s6 2221; GFX6-NEXT: s_lshr_b32 s0, s0, 16 2222; GFX6-NEXT: s_lshr_b32 s3, s3, 16 2223; GFX6-NEXT: s_add_i32 s5, s5, s6 2224; GFX6-NEXT: s_lshl_b32 s1, s1, 16 2225; GFX6-NEXT: s_lshr_b32 s2, s2, 16 2226; GFX6-NEXT: s_lshr_b32 s5, s5, 16 2227; GFX6-NEXT: s_or_b32 s0, s0, s1 2228; GFX6-NEXT: s_lshl_b32 s1, s3, 16 2229; GFX6-NEXT: s_lshr_b32 s4, s4, 16 2230; GFX6-NEXT: s_or_b32 s1, s2, s1 2231; GFX6-NEXT: s_lshl_b32 s2, s5, 16 2232; GFX6-NEXT: s_or_b32 s2, s4, s2 2233; GFX6-NEXT: ; return to shader part epilog 2234; 2235; GFX8-LABEL: s_uaddsat_v6i16: 2236; GFX8: ; %bb.0: 2237; GFX8-NEXT: s_lshr_b32 s9, s3, 16 2238; GFX8-NEXT: s_lshr_b32 s6, s0, 16 2239; GFX8-NEXT: s_lshr_b32 s10, s4, 16 2240; GFX8-NEXT: v_mov_b32_e32 v1, s9 2241; GFX8-NEXT: s_lshr_b32 s7, s1, 16 2242; GFX8-NEXT: s_lshr_b32 s11, s5, 16 2243; GFX8-NEXT: v_mov_b32_e32 v0, s3 2244; GFX8-NEXT: v_add_u16_e64 v1, s6, v1 clamp 2245; GFX8-NEXT: v_mov_b32_e32 v3, s10 2246; GFX8-NEXT: v_mov_b32_e32 v6, 16 2247; GFX8-NEXT: s_lshr_b32 s8, s2, 16 2248; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp 2249; GFX8-NEXT: v_mov_b32_e32 v2, s4 2250; GFX8-NEXT: v_add_u16_e64 v3, s7, v3 clamp 2251; GFX8-NEXT: v_mov_b32_e32 v5, s11 2252; GFX8-NEXT: 
v_lshlrev_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 2253; GFX8-NEXT: v_add_u16_e64 v2, s1, v2 clamp 2254; GFX8-NEXT: v_mov_b32_e32 v4, s5 2255; GFX8-NEXT: v_add_u16_e64 v5, s8, v5 clamp 2256; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2257; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 2258; GFX8-NEXT: v_add_u16_e64 v4, s2, v4 clamp 2259; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2260; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 2261; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2262; GFX8-NEXT: v_readfirstlane_b32 s0, v0 2263; GFX8-NEXT: v_readfirstlane_b32 s1, v1 2264; GFX8-NEXT: v_readfirstlane_b32 s2, v2 2265; GFX8-NEXT: ; return to shader part epilog 2266; 2267; GFX9-LABEL: s_uaddsat_v6i16: 2268; GFX9: ; %bb.0: 2269; GFX9-NEXT: v_mov_b32_e32 v0, s3 2270; GFX9-NEXT: v_mov_b32_e32 v1, s4 2271; GFX9-NEXT: v_mov_b32_e32 v2, s5 2272; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp 2273; GFX9-NEXT: v_pk_add_u16 v1, s1, v1 clamp 2274; GFX9-NEXT: v_pk_add_u16 v2, s2, v2 clamp 2275; GFX9-NEXT: v_readfirstlane_b32 s0, v0 2276; GFX9-NEXT: v_readfirstlane_b32 s1, v1 2277; GFX9-NEXT: v_readfirstlane_b32 s2, v2 2278; GFX9-NEXT: ; return to shader part epilog 2279; 2280; GFX10-LABEL: s_uaddsat_v6i16: 2281; GFX10: ; %bb.0: 2282; GFX10-NEXT: v_pk_add_u16 v0, s0, s3 clamp 2283; GFX10-NEXT: v_pk_add_u16 v1, s1, s4 clamp 2284; GFX10-NEXT: v_pk_add_u16 v2, s2, s5 clamp 2285; GFX10-NEXT: v_readfirstlane_b32 s0, v0 2286; GFX10-NEXT: v_readfirstlane_b32 s1, v1 2287; GFX10-NEXT: v_readfirstlane_b32 s2, v2 2288; GFX10-NEXT: ; return to shader part epilog 2289 %result = call <6 x i16> @llvm.uadd.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs) 2290 %cast = 
bitcast <6 x i16> %result to <3 x i32> 2291 ret <3 x i32> %cast 2292} 2293 2294define <4 x float> @v_uaddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { 2295; GFX6-LABEL: v_uaddsat_v8i16: 2296; GFX6: ; %bb.0: 2297; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2298; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2299; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8 2300; GFX6-NEXT: v_xor_b32_e32 v16, -1, v0 2301; GFX6-NEXT: v_min_u32_e32 v8, v16, v8 2302; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2303; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v8 2304; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v9 2305; GFX6-NEXT: v_xor_b32_e32 v9, -1, v1 2306; GFX6-NEXT: v_min_u32_e32 v8, v9, v8 2307; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2308; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v8 2309; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v10 2310; GFX6-NEXT: v_xor_b32_e32 v9, -1, v2 2311; GFX6-NEXT: v_min_u32_e32 v8, v9, v8 2312; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 2313; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v8 2314; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v11 2315; GFX6-NEXT: v_xor_b32_e32 v9, -1, v3 2316; GFX6-NEXT: v_min_u32_e32 v8, v9, v8 2317; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 2318; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v8 2319; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v12 2320; GFX6-NEXT: v_xor_b32_e32 v9, -1, v4 2321; GFX6-NEXT: v_min_u32_e32 v8, v9, v8 2322; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 2323; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v8 2324; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v13 2325; GFX6-NEXT: v_xor_b32_e32 v9, -1, v5 2326; GFX6-NEXT: v_min_u32_e32 v8, v9, v8 2327; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 2328; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v8 2329; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v14 2330; GFX6-NEXT: v_xor_b32_e32 v9, -1, v6 2331; GFX6-NEXT: v_min_u32_e32 v8, v9, v8 2332; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 2333; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 2334; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15 2335; GFX6-NEXT: v_xor_b32_e32 v9, -1, v7 2336; GFX6-NEXT: v_lshrrev_b32_e32 
v1, 16, v1 2337; GFX6-NEXT: v_min_u32_e32 v8, v9, v8 2338; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 2339; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 2340; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v8 2341; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2342; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 2343; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 2344; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v7 2345; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 2346; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 2347; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 2348; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v6 2349; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 2350; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 2351; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 2352; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 2353; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 2354; GFX6-NEXT: s_setpc_b64 s[30:31] 2355; 2356; GFX8-LABEL: v_uaddsat_v8i16: 2357; GFX8: ; %bb.0: 2358; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2359; GFX8-NEXT: v_add_u16_e64 v8, v0, v4 clamp 2360; GFX8-NEXT: v_add_u16_sdwa v0, v0, v4 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2361; GFX8-NEXT: v_add_u16_e64 v4, v1, v5 clamp 2362; GFX8-NEXT: v_add_u16_sdwa v1, v1, v5 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2363; GFX8-NEXT: v_add_u16_e64 v5, v2, v6 clamp 2364; GFX8-NEXT: v_add_u16_sdwa v2, v2, v6 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2365; GFX8-NEXT: v_add_u16_e64 v6, v3, v7 clamp 2366; GFX8-NEXT: v_add_u16_sdwa v3, v3, v7 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2367; GFX8-NEXT: v_mov_b32_e32 v7, 16 2368; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 2369; GFX8-NEXT: v_mov_b32_e32 v7, 16 2370; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 2371; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_0 2372; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 2373; GFX8-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2374; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2375; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2376; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2377; GFX8-NEXT: s_setpc_b64 s[30:31] 2378; 2379; GFX9-LABEL: v_uaddsat_v8i16: 2380; GFX9: ; %bb.0: 2381; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2382; GFX9-NEXT: v_pk_add_u16 v0, v0, v4 clamp 2383; GFX9-NEXT: v_pk_add_u16 v1, v1, v5 clamp 2384; GFX9-NEXT: v_pk_add_u16 v2, v2, v6 clamp 2385; GFX9-NEXT: v_pk_add_u16 v3, v3, v7 clamp 2386; GFX9-NEXT: s_setpc_b64 s[30:31] 2387; 2388; GFX10-LABEL: v_uaddsat_v8i16: 2389; GFX10: ; %bb.0: 2390; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2391; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2392; GFX10-NEXT: v_pk_add_u16 v0, v0, v4 clamp 2393; GFX10-NEXT: v_pk_add_u16 v1, v1, v5 clamp 2394; GFX10-NEXT: v_pk_add_u16 v2, v2, v6 clamp 2395; GFX10-NEXT: v_pk_add_u16 v3, v3, v7 clamp 2396; GFX10-NEXT: s_setpc_b64 s[30:31] 2397 %result = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) 2398 %cast = bitcast <8 x i16> %result to <4 x float> 2399 ret <4 x float> %cast 2400} 2401; llvm.uadd.sat on <8 x i16> in an amdgpu_ps function with both operands inreg; the result is bitcast to <4 x i32> for the return. 2402define amdgpu_ps <4 x i32> @s_uaddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inreg %rhs) { 2403; GFX6-LABEL: s_uaddsat_v8i16: 2404; GFX6: ; %bb.0: 2405; GFX6-NEXT: s_lshl_b32 s0, s0, 16 2406; GFX6-NEXT: s_lshl_b32 s8, s8, 16 2407; GFX6-NEXT: s_not_b32 s16, s0 2408; GFX6-NEXT: s_min_u32 s8, s16, s8 2409; GFX6-NEXT: s_lshl_b32 s1, s1, 16 2410; GFX6-NEXT: s_add_i32 s0, s0, s8 2411; GFX6-NEXT: s_lshl_b32 s8, s9, 16 2412; GFX6-NEXT: s_not_b32 s9, s1 2413; GFX6-NEXT: s_min_u32
s8, s9, s8 2414; GFX6-NEXT: s_lshl_b32 s2, s2, 16 2415; GFX6-NEXT: s_add_i32 s1, s1, s8 2416; GFX6-NEXT: s_lshl_b32 s8, s10, 16 2417; GFX6-NEXT: s_not_b32 s9, s2 2418; GFX6-NEXT: s_min_u32 s8, s9, s8 2419; GFX6-NEXT: s_lshl_b32 s3, s3, 16 2420; GFX6-NEXT: s_add_i32 s2, s2, s8 2421; GFX6-NEXT: s_lshl_b32 s8, s11, 16 2422; GFX6-NEXT: s_not_b32 s9, s3 2423; GFX6-NEXT: s_min_u32 s8, s9, s8 2424; GFX6-NEXT: s_lshl_b32 s4, s4, 16 2425; GFX6-NEXT: s_add_i32 s3, s3, s8 2426; GFX6-NEXT: s_lshl_b32 s8, s12, 16 2427; GFX6-NEXT: s_not_b32 s9, s4 2428; GFX6-NEXT: s_min_u32 s8, s9, s8 2429; GFX6-NEXT: s_lshl_b32 s5, s5, 16 2430; GFX6-NEXT: s_add_i32 s4, s4, s8 2431; GFX6-NEXT: s_lshl_b32 s8, s13, 16 2432; GFX6-NEXT: s_not_b32 s9, s5 2433; GFX6-NEXT: s_min_u32 s8, s9, s8 2434; GFX6-NEXT: s_lshl_b32 s6, s6, 16 2435; GFX6-NEXT: s_add_i32 s5, s5, s8 2436; GFX6-NEXT: s_lshl_b32 s8, s14, 16 2437; GFX6-NEXT: s_not_b32 s9, s6 2438; GFX6-NEXT: s_min_u32 s8, s9, s8 2439; GFX6-NEXT: s_lshl_b32 s7, s7, 16 2440; GFX6-NEXT: s_add_i32 s6, s6, s8 2441; GFX6-NEXT: s_lshl_b32 s8, s15, 16 2442; GFX6-NEXT: s_not_b32 s9, s7 2443; GFX6-NEXT: s_lshr_b32 s1, s1, 16 2444; GFX6-NEXT: s_min_u32 s8, s9, s8 2445; GFX6-NEXT: s_lshr_b32 s0, s0, 16 2446; GFX6-NEXT: s_lshr_b32 s3, s3, 16 2447; GFX6-NEXT: s_add_i32 s7, s7, s8 2448; GFX6-NEXT: s_lshl_b32 s1, s1, 16 2449; GFX6-NEXT: s_lshr_b32 s2, s2, 16 2450; GFX6-NEXT: s_lshr_b32 s5, s5, 16 2451; GFX6-NEXT: s_lshr_b32 s7, s7, 16 2452; GFX6-NEXT: s_or_b32 s0, s0, s1 2453; GFX6-NEXT: s_lshl_b32 s1, s3, 16 2454; GFX6-NEXT: s_lshr_b32 s4, s4, 16 2455; GFX6-NEXT: s_lshr_b32 s6, s6, 16 2456; GFX6-NEXT: s_or_b32 s1, s2, s1 2457; GFX6-NEXT: s_lshl_b32 s2, s5, 16 2458; GFX6-NEXT: s_lshl_b32 s3, s7, 16 2459; GFX6-NEXT: s_or_b32 s2, s4, s2 2460; GFX6-NEXT: s_or_b32 s3, s6, s3 2461; GFX6-NEXT: ; return to shader part epilog 2462; 2463; GFX8-LABEL: s_uaddsat_v8i16: 2464; GFX8: ; %bb.0: 2465; GFX8-NEXT: s_lshr_b32 s12, s4, 16 2466; GFX8-NEXT: s_lshr_b32 s8, s0, 16 2467;
GFX8-NEXT: s_lshr_b32 s13, s5, 16 2468; GFX8-NEXT: v_mov_b32_e32 v1, s12 2469; GFX8-NEXT: s_lshr_b32 s9, s1, 16 2470; GFX8-NEXT: s_lshr_b32 s14, s6, 16 2471; GFX8-NEXT: s_lshr_b32 s15, s7, 16 2472; GFX8-NEXT: v_mov_b32_e32 v0, s4 2473; GFX8-NEXT: v_add_u16_e64 v1, s8, v1 clamp 2474; GFX8-NEXT: v_mov_b32_e32 v3, s13 2475; GFX8-NEXT: v_mov_b32_e32 v8, 16 2476; GFX8-NEXT: s_lshr_b32 s10, s2, 16 2477; GFX8-NEXT: s_lshr_b32 s11, s3, 16 2478; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp 2479; GFX8-NEXT: v_mov_b32_e32 v2, s5 2480; GFX8-NEXT: v_add_u16_e64 v3, s9, v3 clamp 2481; GFX8-NEXT: v_mov_b32_e32 v5, s14 2482; GFX8-NEXT: v_mov_b32_e32 v7, s15 2483; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 2484; GFX8-NEXT: v_add_u16_e64 v2, s1, v2 clamp 2485; GFX8-NEXT: v_mov_b32_e32 v4, s6 2486; GFX8-NEXT: v_add_u16_e64 v5, s10, v5 clamp 2487; GFX8-NEXT: v_mov_b32_e32 v6, s7 2488; GFX8-NEXT: v_add_u16_e64 v7, s11, v7 clamp 2489; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2490; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 2491; GFX8-NEXT: v_add_u16_e64 v4, s2, v4 clamp 2492; GFX8-NEXT: v_add_u16_e64 v6, s3, v6 clamp 2493; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2494; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 2495; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 2496; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2497; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2498; GFX8-NEXT: v_readfirstlane_b32 s0, v0 2499; GFX8-NEXT: v_readfirstlane_b32 s1, v1 2500; GFX8-NEXT: v_readfirstlane_b32 s2, v2 2501;
GFX8-NEXT: v_readfirstlane_b32 s3, v3 2502; GFX8-NEXT: ; return to shader part epilog 2503; 2504; GFX9-LABEL: s_uaddsat_v8i16: 2505; GFX9: ; %bb.0: 2506; GFX9-NEXT: v_mov_b32_e32 v0, s4 2507; GFX9-NEXT: v_mov_b32_e32 v1, s5 2508; GFX9-NEXT: v_mov_b32_e32 v2, s6 2509; GFX9-NEXT: v_mov_b32_e32 v3, s7 2510; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp 2511; GFX9-NEXT: v_pk_add_u16 v1, s1, v1 clamp 2512; GFX9-NEXT: v_pk_add_u16 v2, s2, v2 clamp 2513; GFX9-NEXT: v_pk_add_u16 v3, s3, v3 clamp 2514; GFX9-NEXT: v_readfirstlane_b32 s0, v0 2515; GFX9-NEXT: v_readfirstlane_b32 s1, v1 2516; GFX9-NEXT: v_readfirstlane_b32 s2, v2 2517; GFX9-NEXT: v_readfirstlane_b32 s3, v3 2518; GFX9-NEXT: ; return to shader part epilog 2519; 2520; GFX10-LABEL: s_uaddsat_v8i16: 2521; GFX10: ; %bb.0: 2522; GFX10-NEXT: v_pk_add_u16 v0, s0, s4 clamp 2523; GFX10-NEXT: v_pk_add_u16 v1, s1, s5 clamp 2524; GFX10-NEXT: v_pk_add_u16 v2, s2, s6 clamp 2525; GFX10-NEXT: v_pk_add_u16 v3, s3, s7 clamp 2526; GFX10-NEXT: v_readfirstlane_b32 s0, v0 2527; GFX10-NEXT: v_readfirstlane_b32 s1, v1 2528; GFX10-NEXT: v_readfirstlane_b32 s2, v2 2529; GFX10-NEXT: v_readfirstlane_b32 s3, v3 2530; GFX10-NEXT: ; return to shader part epilog 2531 %result = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) 2532 %cast = bitcast <8 x i16> %result to <4 x i32> 2533 ret <4 x i32> %cast 2534} 2535 2536; FIXME: The i48 tests below are commented out because i48 add is broken. 2537; define i48 @v_uaddsat_i48(i48 %lhs, i48 %rhs) { 2538; %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs) 2539; ret i48 %result 2540; } 2541 2542; define amdgpu_ps i48 @s_uaddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { 2543; %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs) 2544; ret i48 %result 2545; } 2546 2547; define amdgpu_ps <2 x float> @uaddsat_i48_sv(i48 inreg %lhs, i48 %rhs) { 2548; %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs) 2549; %ext.result = zext i48 %result to i64 2550; %cast = bitcast i64 %ext.result to <2 x float> 2551; ret <2 x
float> %cast 2552; } 2553 2554; define amdgpu_ps <2 x float> @uaddsat_i48_vs(i48 %lhs, i48 inreg %rhs) { 2555; %result = call i48 @llvm.uadd.sat.i48(i48 %lhs, i48 %rhs) 2556; %ext.result = zext i48 %result to i64 2557; %cast = bitcast i64 %ext.result to <2 x float> 2558; ret <2 x float> %cast 2559; } 2560; llvm.uadd.sat.i64 with both operands as plain (non-inreg) arguments; no calling convention is specified. 2561define i64 @v_uaddsat_i64(i64 %lhs, i64 %rhs) { 2562; GFX6-LABEL: v_uaddsat_i64: 2563; GFX6: ; %bb.0: 2564; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2565; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 2566; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 2567; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] 2568; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc 2569; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc 2570; GFX6-NEXT: s_setpc_b64 s[30:31] 2571; 2572; GFX8-LABEL: v_uaddsat_i64: 2573; GFX8: ; %bb.0: 2574; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2575; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 2576; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 2577; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] 2578; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc 2579; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc 2580; GFX8-NEXT: s_setpc_b64 s[30:31] 2581; 2582; GFX9-LABEL: v_uaddsat_i64: 2583; GFX9: ; %bb.0: 2584; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2585; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 2586; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 2587; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] 2588; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc 2589; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc 2590; GFX9-NEXT: s_setpc_b64 s[30:31] 2591; 2592; GFX10-LABEL: v_uaddsat_i64: 2593; GFX10: ; %bb.0: 2594; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2595; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2596; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 2597; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo 2598; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] 2599; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo 2600;
GFX10-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo 2601; GFX10-NEXT: s_setpc_b64 s[30:31] 2602 %result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs) 2603 ret i64 %result 2604} 2605; llvm.uadd.sat.i64 in an amdgpu_ps function with both operands inreg. 2606define amdgpu_ps i64 @s_uaddsat_i64(i64 inreg %lhs, i64 inreg %rhs) { 2607; GFX6-LABEL: s_uaddsat_i64: 2608; GFX6: ; %bb.0: 2609; GFX6-NEXT: s_add_u32 s0, s0, s2 2610; GFX6-NEXT: s_cselect_b32 s4, 1, 0 2611; GFX6-NEXT: s_and_b32 s4, s4, 1 2612; GFX6-NEXT: s_cmp_lg_u32 s4, 0 2613; GFX6-NEXT: v_mov_b32_e32 v0, s2 2614; GFX6-NEXT: s_addc_u32 s1, s1, s3 2615; GFX6-NEXT: v_mov_b32_e32 v1, s3 2616; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] 2617; GFX6-NEXT: v_mov_b32_e32 v2, s0 2618; GFX6-NEXT: v_mov_b32_e32 v3, s1 2619; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc 2620; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc 2621; GFX6-NEXT: v_readfirstlane_b32 s0, v0 2622; GFX6-NEXT: v_readfirstlane_b32 s1, v1 2623; GFX6-NEXT: ; return to shader part epilog 2624; 2625; GFX8-LABEL: s_uaddsat_i64: 2626; GFX8: ; %bb.0: 2627; GFX8-NEXT: s_add_u32 s0, s0, s2 2628; GFX8-NEXT: s_cselect_b32 s4, 1, 0 2629; GFX8-NEXT: s_and_b32 s4, s4, 1 2630; GFX8-NEXT: s_cmp_lg_u32 s4, 0 2631; GFX8-NEXT: v_mov_b32_e32 v0, s2 2632; GFX8-NEXT: s_addc_u32 s1, s1, s3 2633; GFX8-NEXT: v_mov_b32_e32 v1, s3 2634; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] 2635; GFX8-NEXT: v_mov_b32_e32 v2, s0 2636; GFX8-NEXT: v_mov_b32_e32 v3, s1 2637; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc 2638; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc 2639; GFX8-NEXT: v_readfirstlane_b32 s0, v0 2640; GFX8-NEXT: v_readfirstlane_b32 s1, v1 2641; GFX8-NEXT: ; return to shader part epilog 2642; 2643; GFX9-LABEL: s_uaddsat_i64: 2644; GFX9: ; %bb.0: 2645; GFX9-NEXT: s_add_u32 s0, s0, s2 2646; GFX9-NEXT: s_cselect_b32 s4, 1, 0 2647; GFX9-NEXT: s_and_b32 s4, s4, 1 2648; GFX9-NEXT: s_cmp_lg_u32 s4, 0 2649; GFX9-NEXT: v_mov_b32_e32 v0, s2 2650; GFX9-NEXT: s_addc_u32 s1, s1, s3 2651; GFX9-NEXT: v_mov_b32_e32 v1, s3 2652; GFX9-NEXT: v_cmp_lt_u64_e32
vcc, s[0:1], v[0:1] 2653; GFX9-NEXT: v_mov_b32_e32 v2, s0 2654; GFX9-NEXT: v_mov_b32_e32 v3, s1 2655; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc 2656; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc 2657; GFX9-NEXT: v_readfirstlane_b32 s0, v0 2658; GFX9-NEXT: v_readfirstlane_b32 s1, v1 2659; GFX9-NEXT: ; return to shader part epilog 2660; 2661; GFX10-LABEL: s_uaddsat_i64: 2662; GFX10: ; %bb.0: 2663; GFX10-NEXT: s_add_u32 s0, s0, s2 2664; GFX10-NEXT: s_cselect_b32 s4, 1, 0 2665; GFX10-NEXT: s_and_b32 s4, s4, 1 2666; GFX10-NEXT: s_cmp_lg_u32 s4, 0 2667; GFX10-NEXT: s_addc_u32 s1, s1, s3 2668; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[0:1], s[2:3] 2669; GFX10-NEXT: v_cndmask_b32_e64 v0, s0, -1, s2 2670; GFX10-NEXT: v_cndmask_b32_e64 v1, s1, -1, s2 2671; GFX10-NEXT: v_readfirstlane_b32 s0, v0 2672; GFX10-NEXT: v_readfirstlane_b32 s1, v1 2673; GFX10-NEXT: ; return to shader part epilog 2674 %result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs) 2675 ret i64 %result 2676} 2677; Mixed operands: inreg %lhs and non-inreg %rhs; the i64 result is bitcast to <2 x float> for the return. 2678define amdgpu_ps <2 x float> @uaddsat_i64_sv(i64 inreg %lhs, i64 %rhs) { 2679; GFX6-LABEL: uaddsat_i64_sv: 2680; GFX6: ; %bb.0: 2681; GFX6-NEXT: v_mov_b32_e32 v3, s1 2682; GFX6-NEXT: v_add_i32_e32 v2, vcc, s0, v0 2683; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc 2684; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] 2685; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc 2686; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc 2687; GFX6-NEXT: ; return to shader part epilog 2688; 2689; GFX8-LABEL: uaddsat_i64_sv: 2690; GFX8: ; %bb.0: 2691; GFX8-NEXT: v_mov_b32_e32 v3, s1 2692; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 2693; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc 2694; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] 2695; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc 2696; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc 2697; GFX8-NEXT: ; return to shader part epilog 2698; 2699; GFX9-LABEL: uaddsat_i64_sv: 2700; GFX9: ; %bb.0: 2701; GFX9-NEXT: v_mov_b32_e32 v3, s1 2702; GFX9-NEXT: v_add_co_u32_e32 v2, vcc,
s0, v0 2703; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc 2704; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] 2705; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc 2706; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc 2707; GFX9-NEXT: ; return to shader part epilog 2708; 2709; GFX10-LABEL: uaddsat_i64_sv: 2710; GFX10: ; %bb.0: 2711; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, s0, v0 2712; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo 2713; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1] 2714; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc_lo 2715; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, -1, vcc_lo 2716; GFX10-NEXT: ; return to shader part epilog 2717 %result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs) 2718 %cast = bitcast i64 %result to <2 x float> 2719 ret <2 x float> %cast 2720} 2721; Mixed operands the other way round: non-inreg %lhs and inreg %rhs; the i64 result is bitcast to <2 x float> for the return. 2722define amdgpu_ps <2 x float> @uaddsat_i64_vs(i64 %lhs, i64 inreg %rhs) { 2723; GFX6-LABEL: uaddsat_i64_vs: 2724; GFX6: ; %bb.0: 2725; GFX6-NEXT: v_mov_b32_e32 v2, s1 2726; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 2727; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc 2728; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] 2729; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc 2730; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc 2731; GFX6-NEXT: ; return to shader part epilog 2732; 2733; GFX8-LABEL: uaddsat_i64_vs: 2734; GFX8: ; %bb.0: 2735; GFX8-NEXT: v_mov_b32_e32 v2, s1 2736; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 2737; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc 2738; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] 2739; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc 2740; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc 2741; GFX8-NEXT: ; return to shader part epilog 2742; 2743; GFX9-LABEL: uaddsat_i64_vs: 2744; GFX9: ; %bb.0: 2745; GFX9-NEXT: v_mov_b32_e32 v2, s1 2746; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 2747; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc 2748; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] 2749; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
2750; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc 2751; GFX9-NEXT: ; return to shader part epilog 2752; 2753; GFX10-LABEL: uaddsat_i64_vs: 2754; GFX10: ; %bb.0: 2755; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, s0 2756; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo 2757; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1] 2758; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo 2759; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo 2760; GFX10-NEXT: ; return to shader part epilog 2761 %result = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs) 2762 %cast = bitcast i64 %result to <2 x float> 2763 ret <2 x float> %cast 2764} 2765; llvm.uadd.sat on <2 x i64> with both operands as plain (non-inreg) arguments; no calling convention is specified. 2766define <2 x i64> @v_uaddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { 2767; GFX6-LABEL: v_uaddsat_v2i64: 2768; GFX6: ; %bb.0: 2769; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2770; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 2771; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc 2772; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] 2773; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc 2774; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc 2775; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 2776; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc 2777; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[6:7] 2778; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc 2779; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc 2780; GFX6-NEXT: s_setpc_b64 s[30:31] 2781; 2782; GFX8-LABEL: v_uaddsat_v2i64: 2783; GFX8: ; %bb.0: 2784; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2785; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4 2786; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc 2787; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] 2788; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc 2789; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc 2790; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 2791; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc 2792; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[6:7] 2793; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc 2794; GFX8-NEXT: v_cndmask_b32_e64 v3,
v3, -1, vcc 2795; GFX8-NEXT: s_setpc_b64 s[30:31] 2796; 2797; GFX9-LABEL: v_uaddsat_v2i64: 2798; GFX9: ; %bb.0: 2799; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2800; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 2801; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc 2802; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] 2803; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc 2804; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc 2805; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 2806; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc 2807; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[6:7] 2808; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc 2809; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc 2810; GFX9-NEXT: s_setpc_b64 s[30:31] 2811; 2812; GFX10-LABEL: v_uaddsat_v2i64: 2813; GFX10: ; %bb.0: 2814; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2815; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2816; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 2817; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo 2818; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6 2819; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo 2820; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5] 2821; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[2:3], v[6:7] 2822; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo 2823; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo 2824; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, -1, s4 2825; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, -1, s4 2826; GFX10-NEXT: s_setpc_b64 s[30:31] 2827 %result = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) 2828 ret <2 x i64> %result 2829} 2830; llvm.uadd.sat on <2 x i64> in an amdgpu_ps function with both operands inreg. 2831define amdgpu_ps <2 x i64> @s_uaddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs) { 2832; GFX6-LABEL: s_uaddsat_v2i64: 2833; GFX6: ; %bb.0: 2834; GFX6-NEXT: s_add_u32 s0, s0, s4 2835; GFX6-NEXT: s_cselect_b32 s8, 1, 0 2836; GFX6-NEXT: s_and_b32 s8, s8, 1 2837; GFX6-NEXT: s_cmp_lg_u32 s8, 0 2838; GFX6-NEXT: v_mov_b32_e32 v0, s4 2839; GFX6-NEXT: s_addc_u32 s1, s1, s5 2840;
GFX6-NEXT: v_mov_b32_e32 v1, s5 2841; GFX6-NEXT: v_mov_b32_e32 v2, s0 2842; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] 2843; GFX6-NEXT: s_add_u32 s0, s2, s6 2844; GFX6-NEXT: v_mov_b32_e32 v3, s1 2845; GFX6-NEXT: s_cselect_b32 s1, 1, 0 2846; GFX6-NEXT: s_and_b32 s1, s1, 1 2847; GFX6-NEXT: s_cmp_lg_u32 s1, 0 2848; GFX6-NEXT: v_mov_b32_e32 v0, s6 2849; GFX6-NEXT: s_addc_u32 s1, s3, s7 2850; GFX6-NEXT: v_mov_b32_e32 v1, s7 2851; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc 2852; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc 2853; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] 2854; GFX6-NEXT: v_mov_b32_e32 v4, s0 2855; GFX6-NEXT: v_mov_b32_e32 v5, s1 2856; GFX6-NEXT: v_cndmask_b32_e64 v0, v4, -1, vcc 2857; GFX6-NEXT: v_cndmask_b32_e64 v1, v5, -1, vcc 2858; GFX6-NEXT: v_readfirstlane_b32 s0, v2 2859; GFX6-NEXT: v_readfirstlane_b32 s1, v3 2860; GFX6-NEXT: v_readfirstlane_b32 s2, v0 2861; GFX6-NEXT: v_readfirstlane_b32 s3, v1 2862; GFX6-NEXT: ; return to shader part epilog 2863; 2864; GFX8-LABEL: s_uaddsat_v2i64: 2865; GFX8: ; %bb.0: 2866; GFX8-NEXT: s_add_u32 s0, s0, s4 2867; GFX8-NEXT: s_cselect_b32 s8, 1, 0 2868; GFX8-NEXT: s_and_b32 s8, s8, 1 2869; GFX8-NEXT: s_cmp_lg_u32 s8, 0 2870; GFX8-NEXT: v_mov_b32_e32 v0, s4 2871; GFX8-NEXT: s_addc_u32 s1, s1, s5 2872; GFX8-NEXT: v_mov_b32_e32 v1, s5 2873; GFX8-NEXT: v_mov_b32_e32 v2, s0 2874; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] 2875; GFX8-NEXT: s_add_u32 s0, s2, s6 2876; GFX8-NEXT: v_mov_b32_e32 v3, s1 2877; GFX8-NEXT: s_cselect_b32 s1, 1, 0 2878; GFX8-NEXT: s_and_b32 s1, s1, 1 2879; GFX8-NEXT: s_cmp_lg_u32 s1, 0 2880; GFX8-NEXT: v_mov_b32_e32 v0, s6 2881; GFX8-NEXT: s_addc_u32 s1, s3, s7 2882; GFX8-NEXT: v_mov_b32_e32 v1, s7 2883; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc 2884; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc 2885; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] 2886; GFX8-NEXT: v_mov_b32_e32 v4, s0 2887; GFX8-NEXT: v_mov_b32_e32 v5, s1 2888; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, -1, vcc 2889; 
GFX8-NEXT: v_cndmask_b32_e64 v1, v5, -1, vcc 2890; GFX8-NEXT: v_readfirstlane_b32 s0, v2 2891; GFX8-NEXT: v_readfirstlane_b32 s1, v3 2892; GFX8-NEXT: v_readfirstlane_b32 s2, v0 2893; GFX8-NEXT: v_readfirstlane_b32 s3, v1 2894; GFX8-NEXT: ; return to shader part epilog 2895; 2896; GFX9-LABEL: s_uaddsat_v2i64: 2897; GFX9: ; %bb.0: 2898; GFX9-NEXT: s_add_u32 s0, s0, s4 2899; GFX9-NEXT: s_cselect_b32 s8, 1, 0 2900; GFX9-NEXT: s_and_b32 s8, s8, 1 2901; GFX9-NEXT: s_cmp_lg_u32 s8, 0 2902; GFX9-NEXT: v_mov_b32_e32 v0, s4 2903; GFX9-NEXT: s_addc_u32 s1, s1, s5 2904; GFX9-NEXT: v_mov_b32_e32 v1, s5 2905; GFX9-NEXT: v_mov_b32_e32 v2, s0 2906; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] 2907; GFX9-NEXT: s_add_u32 s0, s2, s6 2908; GFX9-NEXT: v_mov_b32_e32 v3, s1 2909; GFX9-NEXT: s_cselect_b32 s1, 1, 0 2910; GFX9-NEXT: s_and_b32 s1, s1, 1 2911; GFX9-NEXT: s_cmp_lg_u32 s1, 0 2912; GFX9-NEXT: v_mov_b32_e32 v0, s6 2913; GFX9-NEXT: s_addc_u32 s1, s3, s7 2914; GFX9-NEXT: v_mov_b32_e32 v1, s7 2915; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc 2916; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc 2917; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] 2918; GFX9-NEXT: v_mov_b32_e32 v4, s0 2919; GFX9-NEXT: v_mov_b32_e32 v5, s1 2920; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, -1, vcc 2921; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, -1, vcc 2922; GFX9-NEXT: v_readfirstlane_b32 s0, v2 2923; GFX9-NEXT: v_readfirstlane_b32 s1, v3 2924; GFX9-NEXT: v_readfirstlane_b32 s2, v0 2925; GFX9-NEXT: v_readfirstlane_b32 s3, v1 2926; GFX9-NEXT: ; return to shader part epilog 2927; 2928; GFX10-LABEL: s_uaddsat_v2i64: 2929; GFX10: ; %bb.0: 2930; GFX10-NEXT: s_add_u32 s0, s0, s4 2931; GFX10-NEXT: s_cselect_b32 s8, 1, 0 2932; GFX10-NEXT: s_and_b32 s8, s8, 1 2933; GFX10-NEXT: s_cmp_lg_u32 s8, 0 2934; GFX10-NEXT: s_addc_u32 s1, s1, s5 2935; GFX10-NEXT: s_add_u32 s2, s2, s6 2936; GFX10-NEXT: s_cselect_b32 s8, 1, 0 2937; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5] 2938; GFX10-NEXT: s_and_b32 s8, s8, 1 2939; GFX10-NEXT: 
s_cmp_lg_u32 s8, 0 2940; GFX10-NEXT: s_addc_u32 s3, s3, s7 2941; GFX10-NEXT: v_cndmask_b32_e64 v0, s0, -1, s4 2942; GFX10-NEXT: v_cmp_lt_u64_e64 s5, s[2:3], s[6:7] 2943; GFX10-NEXT: v_cndmask_b32_e64 v1, s1, -1, s4 2944; GFX10-NEXT: v_readfirstlane_b32 s0, v0 2945; GFX10-NEXT: v_readfirstlane_b32 s1, v1 2946; GFX10-NEXT: v_cndmask_b32_e64 v2, s2, -1, s5 2947; GFX10-NEXT: v_cndmask_b32_e64 v3, s3, -1, s5 2948; GFX10-NEXT: v_readfirstlane_b32 s2, v2 2949; GFX10-NEXT: v_readfirstlane_b32 s3, v3 2950; GFX10-NEXT: ; return to shader part epilog 2951 %result = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) 2952 ret <2 x i64> %result 2953} 2954; llvm.uadd.sat.i128 in an amdgpu_ps function with both operands inreg. 2955define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { 2956; GFX6-LABEL: s_uaddsat_i128: 2957; GFX6: ; %bb.0: 2958; GFX6-NEXT: s_add_u32 s0, s0, s4 2959; GFX6-NEXT: s_cselect_b32 s8, 1, 0 2960; GFX6-NEXT: s_and_b32 s8, s8, 1 2961; GFX6-NEXT: s_cmp_lg_u32 s8, 0 2962; GFX6-NEXT: s_addc_u32 s1, s1, s5 2963; GFX6-NEXT: s_cselect_b32 s8, 1, 0 2964; GFX6-NEXT: s_and_b32 s8, s8, 1 2965; GFX6-NEXT: s_cmp_lg_u32 s8, 0 2966; GFX6-NEXT: s_addc_u32 s2, s2, s6 2967; GFX6-NEXT: s_cselect_b32 s8, 1, 0 2968; GFX6-NEXT: v_mov_b32_e32 v2, s4 2969; GFX6-NEXT: s_and_b32 s8, s8, 1 2970; GFX6-NEXT: v_mov_b32_e32 v3, s5 2971; GFX6-NEXT: s_cmp_lg_u32 s8, 0 2972; GFX6-NEXT: v_mov_b32_e32 v0, s6 2973; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] 2974; GFX6-NEXT: s_addc_u32 s3, s3, s7 2975; GFX6-NEXT: v_mov_b32_e32 v1, s7 2976; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc 2977; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] 2978; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc 2979; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1] 2980; GFX6-NEXT: v_mov_b32_e32 v1, s0 2981; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc 2982; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 2983; GFX6-NEXT: v_mov_b32_e32 v2, s1 2984; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 2985; GFX6-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc 2986; GFX6-NEXT:
v_cndmask_b32_e64 v1, v2, -1, vcc 2987; GFX6-NEXT: v_mov_b32_e32 v2, s2 2988; GFX6-NEXT: v_mov_b32_e32 v3, s3 2989; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc 2990; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc 2991; GFX6-NEXT: v_readfirstlane_b32 s0, v0 2992; GFX6-NEXT: v_readfirstlane_b32 s1, v1 2993; GFX6-NEXT: v_readfirstlane_b32 s2, v2 2994; GFX6-NEXT: v_readfirstlane_b32 s3, v3 2995; GFX6-NEXT: ; return to shader part epilog 2996; 2997; GFX8-LABEL: s_uaddsat_i128: 2998; GFX8: ; %bb.0: 2999; GFX8-NEXT: s_add_u32 s0, s0, s4 3000; GFX8-NEXT: s_cselect_b32 s8, 1, 0 3001; GFX8-NEXT: s_and_b32 s8, s8, 1 3002; GFX8-NEXT: s_cmp_lg_u32 s8, 0 3003; GFX8-NEXT: s_addc_u32 s1, s1, s5 3004; GFX8-NEXT: s_cselect_b32 s8, 1, 0 3005; GFX8-NEXT: s_and_b32 s8, s8, 1 3006; GFX8-NEXT: s_cmp_lg_u32 s8, 0 3007; GFX8-NEXT: s_addc_u32 s2, s2, s6 3008; GFX8-NEXT: s_cselect_b32 s8, 1, 0 3009; GFX8-NEXT: s_and_b32 s8, s8, 1 3010; GFX8-NEXT: v_mov_b32_e32 v2, s4 3011; GFX8-NEXT: s_cmp_lg_u32 s8, 0 3012; GFX8-NEXT: v_mov_b32_e32 v3, s5 3013; GFX8-NEXT: s_addc_u32 s3, s3, s7 3014; GFX8-NEXT: v_mov_b32_e32 v0, s6 3015; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] 3016; GFX8-NEXT: v_mov_b32_e32 v1, s7 3017; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] 3018; GFX8-NEXT: s_cselect_b32 s6, 1, 0 3019; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc 3020; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] 3021; GFX8-NEXT: s_and_b32 s4, 1, s6 3022; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 3023; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 3024; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3025; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 3026; GFX8-NEXT: v_mov_b32_e32 v1, s0 3027; GFX8-NEXT: v_mov_b32_e32 v2, s1 3028; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 3029; GFX8-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc 3030; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc 3031; GFX8-NEXT: v_mov_b32_e32 v2, s2 3032; GFX8-NEXT: v_mov_b32_e32 v3, s3 3033; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc 3034; GFX8-NEXT: v_cndmask_b32_e64 v3,
v3, -1, vcc 3035; GFX8-NEXT: v_readfirstlane_b32 s0, v0 3036; GFX8-NEXT: v_readfirstlane_b32 s1, v1 3037; GFX8-NEXT: v_readfirstlane_b32 s2, v2 3038; GFX8-NEXT: v_readfirstlane_b32 s3, v3 3039; GFX8-NEXT: ; return to shader part epilog 3040; 3041; GFX9-LABEL: s_uaddsat_i128: 3042; GFX9: ; %bb.0: 3043; GFX9-NEXT: s_add_u32 s0, s0, s4 3044; GFX9-NEXT: s_cselect_b32 s8, 1, 0 3045; GFX9-NEXT: s_and_b32 s8, s8, 1 3046; GFX9-NEXT: s_cmp_lg_u32 s8, 0 3047; GFX9-NEXT: s_addc_u32 s1, s1, s5 3048; GFX9-NEXT: s_cselect_b32 s8, 1, 0 3049; GFX9-NEXT: s_and_b32 s8, s8, 1 3050; GFX9-NEXT: s_cmp_lg_u32 s8, 0 3051; GFX9-NEXT: s_addc_u32 s2, s2, s6 3052; GFX9-NEXT: s_cselect_b32 s8, 1, 0 3053; GFX9-NEXT: s_and_b32 s8, s8, 1 3054; GFX9-NEXT: v_mov_b32_e32 v2, s4 3055; GFX9-NEXT: s_cmp_lg_u32 s8, 0 3056; GFX9-NEXT: v_mov_b32_e32 v3, s5 3057; GFX9-NEXT: s_addc_u32 s3, s3, s7 3058; GFX9-NEXT: v_mov_b32_e32 v0, s6 3059; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] 3060; GFX9-NEXT: v_mov_b32_e32 v1, s7 3061; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] 3062; GFX9-NEXT: s_cselect_b32 s6, 1, 0 3063; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc 3064; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] 3065; GFX9-NEXT: s_and_b32 s4, 1, s6 3066; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 3067; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 3068; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3069; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 3070; GFX9-NEXT: v_mov_b32_e32 v1, s0 3071; GFX9-NEXT: v_mov_b32_e32 v2, s1 3072; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 3073; GFX9-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc 3074; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc 3075; GFX9-NEXT: v_mov_b32_e32 v2, s2 3076; GFX9-NEXT: v_mov_b32_e32 v3, s3 3077; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc 3078; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc 3079; GFX9-NEXT: v_readfirstlane_b32 s0, v0 3080; GFX9-NEXT: v_readfirstlane_b32 s1, v1 3081; GFX9-NEXT: v_readfirstlane_b32 s2, v2 3082; GFX9-NEXT: v_readfirstlane_b32 s3, v3 3083;
GFX9-NEXT: ; return to shader part epilog 3084; 3085; GFX10-LABEL: s_uaddsat_i128: 3086; GFX10: ; %bb.0: 3087; GFX10-NEXT: s_add_u32 s0, s0, s4 3088; GFX10-NEXT: s_cselect_b32 s8, 1, 0 3089; GFX10-NEXT: s_and_b32 s8, s8, 1 3090; GFX10-NEXT: s_cmp_lg_u32 s8, 0 3091; GFX10-NEXT: s_addc_u32 s1, s1, s5 3092; GFX10-NEXT: s_cselect_b32 s8, 1, 0 3093; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5] 3094; GFX10-NEXT: s_and_b32 s8, s8, 1 3095; GFX10-NEXT: s_cmp_lg_u32 s8, 0 3096; GFX10-NEXT: s_addc_u32 s2, s2, s6 3097; GFX10-NEXT: s_cselect_b32 s8, 1, 0 3098; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 3099; GFX10-NEXT: s_and_b32 s8, s8, 1 3100; GFX10-NEXT: s_cmp_lg_u32 s8, 0 3101; GFX10-NEXT: s_addc_u32 s3, s3, s7 3102; GFX10-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] 3103; GFX10-NEXT: v_cmp_lt_u64_e64 s5, s[2:3], s[6:7] 3104; GFX10-NEXT: s_cselect_b32 s4, 1, 0 3105; GFX10-NEXT: s_and_b32 s4, 1, s4 3106; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 3107; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s5 3108; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo 3109; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 3110; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 3111; GFX10-NEXT: v_cndmask_b32_e64 v0, s0, -1, vcc_lo 3112; GFX10-NEXT: v_cndmask_b32_e64 v1, s1, -1, vcc_lo 3113; GFX10-NEXT: v_cndmask_b32_e64 v2, s2, -1, vcc_lo 3114; GFX10-NEXT: v_cndmask_b32_e64 v3, s3, -1, vcc_lo 3115; GFX10-NEXT: v_readfirstlane_b32 s0, v0 3116; GFX10-NEXT: v_readfirstlane_b32 s1, v1 3117; GFX10-NEXT: v_readfirstlane_b32 s2, v2 3118; GFX10-NEXT: v_readfirstlane_b32 s3, v3 3119; GFX10-NEXT: ; return to shader part epilog 3120 %result = call i128 @llvm.uadd.sat.i128(i128 %lhs, i128 %rhs) 3121 ret i128 %result 3122} 3123; Mixed operands: inreg %lhs and non-inreg %rhs; the i128 result is bitcast to <4 x float> for the return. 3124define amdgpu_ps <4 x float> @uaddsat_i128_sv(i128 inreg %lhs, i128 %rhs) { 3125; GFX6-LABEL: uaddsat_i128_sv: 3126; GFX6: ; %bb.0: 3127; GFX6-NEXT: v_mov_b32_e32 v5, s1 3128; GFX6-NEXT: v_add_i32_e32 v4, vcc, s0, v0 3129; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc 3130; GFX6-NEXT: v_mov_b32_e32 v6, s2
3131; GFX6-NEXT: v_mov_b32_e32 v7, s3 3132; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v6, v2, vcc 3133; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc 3134; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1] 3135; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 3136; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] 3137; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 3138; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] 3139; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 3140; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 3141; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 3142; GFX6-NEXT: v_cndmask_b32_e64 v0, v4, -1, vcc 3143; GFX6-NEXT: v_cndmask_b32_e64 v1, v5, -1, vcc 3144; GFX6-NEXT: v_cndmask_b32_e64 v2, v6, -1, vcc 3145; GFX6-NEXT: v_cndmask_b32_e64 v3, v7, -1, vcc 3146; GFX6-NEXT: ; return to shader part epilog 3147; 3148; GFX8-LABEL: uaddsat_i128_sv: 3149; GFX8: ; %bb.0: 3150; GFX8-NEXT: v_mov_b32_e32 v5, s1 3151; GFX8-NEXT: v_add_u32_e32 v4, vcc, s0, v0 3152; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc 3153; GFX8-NEXT: v_mov_b32_e32 v6, s2 3154; GFX8-NEXT: v_mov_b32_e32 v7, s3 3155; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v6, v2, vcc 3156; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc 3157; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1] 3158; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 3159; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] 3160; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 3161; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] 3162; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 3163; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 3164; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 3165; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, -1, vcc 3166; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, -1, vcc 3167; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, -1, vcc 3168; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, -1, vcc 3169; GFX8-NEXT: ; return to shader part epilog 3170; 3171; GFX9-LABEL: uaddsat_i128_sv: 3172; GFX9: ; %bb.0: 3173; GFX9-NEXT: v_mov_b32_e32 v5, s1 3174; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v0 3175; GFX9-NEXT: 
v_addc_co_u32_e32 v5, vcc, v5, v1, vcc 3176; GFX9-NEXT: v_mov_b32_e32 v6, s2 3177; GFX9-NEXT: v_mov_b32_e32 v7, s3 3178; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v2, vcc 3179; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v3, vcc 3180; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1] 3181; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 3182; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] 3183; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 3184; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] 3185; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 3186; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 3187; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 3188; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, -1, vcc 3189; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, -1, vcc 3190; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, -1, vcc 3191; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, -1, vcc 3192; GFX9-NEXT: ; return to shader part epilog 3193; 3194; GFX10-LABEL: uaddsat_i128_sv: 3195; GFX10: ; %bb.0: 3196; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, s0, v0 3197; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo 3198; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo 3199; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo 3200; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1] 3201; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 3202; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[6:7], v[2:3] 3203; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo 3204; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] 3205; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo 3206; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 3207; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 3208; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, -1, vcc_lo 3209; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, -1, vcc_lo 3210; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, -1, vcc_lo 3211; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, -1, vcc_lo 3212; GFX10-NEXT: ; return to shader part epilog 3213 %result = call i128 @llvm.uadd.sat.i128(i128 %lhs, i128 %rhs) 3214 %cast = bitcast i128 %result to <4 x float> 
  ret <4 x float> %cast
}

; i128 llvm.uadd.sat in an amdgpu_ps function where only %rhs is marked inreg.
; The expected code below keeps %lhs in v0-v3 and reads %rhs from s0-s3. On
; every target it is a four-dword add-with-carry chain, an unsigned compare of
; the sum against %rhs (low and high 64-bit halves merged through an equality
; select) and four v_cndmask selects that yield -1 when the compare fires.
; The result is bitcast to <4 x float> for the return value.
define amdgpu_ps <4 x float> @uaddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX6-LABEL: uaddsat_i128_vs:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_mov_b32_e32 v4, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
; GFX6-NEXT: v_mov_b32_e32 v4, s2
; GFX6-NEXT: v_mov_b32_e32 v5, s3
; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc
; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3]
; GFX6-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
; GFX6-NEXT: v_and_b32_e32 v4, 1, v4
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: uaddsat_i128_vs:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v4, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: uaddsat_i128_vs:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v4, s1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
; GFX9-NEXT: v_mov_b32_e32 v4, s2
; GFX9-NEXT: v_mov_b32_e32 v5, s3
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v4, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[2:3]
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: uaddsat_i128_vs:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, s0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo
; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1]
; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[2:3]
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[2:3]
; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo
; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc_lo
; GFX10-NEXT: ; return to shader part epilog
  %result = call i128 @llvm.uadd.sat.i128(i128 %lhs, i128 %rhs)
  %cast = bitcast i128 %result to <4 x float>
  ret <4 x float> %cast
}

; <2 x i128> llvm.uadd.sat with the default calling convention and no inreg
; arguments. The expected code has %lhs in v0-v7 and %rhs in v8-v15 and applies
; the add / compare-against-%rhs / select -1 pattern once per i128 element.
; The tahiti, fiji and gfx900 output finishes element 0 before starting
; element 1. The gfx1010 output interleaves the two elements and holds the
; second select mask in s4.
define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX6-LABEL: v_uaddsat_v2i128:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v8
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v2, v10, vcc
; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[8:9]
; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[10:11]
; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11]
; GFX6-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
; GFX6-NEXT: v_and_b32_e32 v8, 1, v8
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v12
; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v13, vcc
; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v6, v14, vcc
; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v7, v15, vcc
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[12:13]
; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[14:15]
; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[14:15]
; GFX6-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
; GFX6-NEXT: v_and_b32_e32 v8, 1, v8
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
; GFX6-NEXT: v_cndmask_b32_e64 v4, v4, -1, vcc
; GFX6-NEXT: v_cndmask_b32_e64 v5, v5, -1, vcc
; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, -1, vcc
; GFX6-NEXT: v_cndmask_b32_e64 v7, v7, -1, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uaddsat_v2i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v2, v10, vcc
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[10:11]
; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11]
; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
; GFX8-NEXT: v_and_b32_e32 v8, 1, v8
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v12
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v13, vcc
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v6, v14, vcc
; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v15, vcc
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[12:13]
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[14:15]
; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[14:15]
; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
; GFX8-NEXT: v_and_b32_e32 v8, 1, v8
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, -1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uaddsat_v2i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v9, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v10, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v11, vcc
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[10:11]
; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11]
; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
; GFX9-NEXT: v_and_b32_e32 v8, 1, v8
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v12
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v13, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v14, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v15, vcc
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[12:13]
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[14:15]
; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[14:15]
; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
; GFX9-NEXT: v_and_b32_e32 v8, 1, v8
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, -1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uaddsat_v2i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v8
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[8:9]
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v4, v12
; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v13, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v6, v14, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v15, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[10:11]
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[12:13]
; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[6:7], v[14:15]
; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[10:11]
; GFX10-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[14:15]
; GFX10-NEXT: v_and_b32_e32 v8, 1, v8
; GFX10-NEXT: v_cndmask_b32_e32 v9, v13, v12, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
; GFX10-NEXT: v_and_b32_e32 v9, 1, v9
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v9
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, -1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, -1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, -1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, -1, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %result = call <2 x i128> @llvm.uadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
  ret <2 x i128> %result
}

; <2 x i128> llvm.uadd.sat in an amdgpu_ps function with both operands inreg.
; The expected code has %lhs in s0-s7 and %rhs in s8-s15. The 128-bit adds use
; s_add_u32 / s_addc_u32, the unsigned less-than compares and the -1 selects
; use VALU instructions, and v_readfirstlane_b32 moves the selected dwords back
; into s0-s7 for the return. The tahiti output also performs the high-half
; equality compare with v_cmp_eq_u64, while the other three targets use
; s_cmp_eq_u64 for it.
define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs) {
; GFX6-LABEL: s_uaddsat_v2i128:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_add_u32 s0, s0, s8
; GFX6-NEXT: s_cselect_b32 s16, 1, 0
; GFX6-NEXT: s_and_b32 s16, s16, 1
; GFX6-NEXT: s_cmp_lg_u32 s16, 0
; GFX6-NEXT: s_addc_u32 s1, s1, s9
; GFX6-NEXT: s_cselect_b32 s16, 1, 0
; GFX6-NEXT: s_and_b32 s16, s16, 1
; GFX6-NEXT: s_cmp_lg_u32 s16, 0
; GFX6-NEXT: s_addc_u32 s2, s2, s10
; GFX6-NEXT: s_cselect_b32 s16, 1, 0
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: s_and_b32 s16, s16, 1
; GFX6-NEXT: v_mov_b32_e32 v3, s9
; GFX6-NEXT: s_cmp_lg_u32 s16, 0
; GFX6-NEXT: v_mov_b32_e32 v0, s10
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
; GFX6-NEXT: s_addc_u32 s3, s3, s11
; GFX6-NEXT: v_mov_b32_e32 v1, s11
; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1]
; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: s_add_u32 s0, s4, s12
; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; GFX6-NEXT: v_mov_b32_e32 v2, s1
; GFX6-NEXT: s_cselect_b32 s1, 1, 0
; GFX6-NEXT: s_and_b32 s1, s1, 1
; GFX6-NEXT: s_cmp_lg_u32 s1, 0
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
; GFX6-NEXT: s_addc_u32 s1, s5, s13
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: s_cselect_b32 s2, 1, 0
; GFX6-NEXT: s_and_b32 s2, s2, 1
; GFX6-NEXT: s_cmp_lg_u32 s2, 0
; GFX6-NEXT: s_addc_u32 s2, s6, s14
; GFX6-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc
; GFX6-NEXT: v_cndmask_b32_e64 v5, v2, -1, vcc
; GFX6-NEXT: v_mov_b32_e32 v1, s3
; GFX6-NEXT: s_cselect_b32 s3, 1, 0
; GFX6-NEXT: v_mov_b32_e32 v2, s12
; GFX6-NEXT: s_and_b32 s3, s3, 1
; GFX6-NEXT: v_mov_b32_e32 v3, s13
; GFX6-NEXT: v_cndmask_b32_e64 v6, v0, -1, vcc
; GFX6-NEXT: v_cndmask_b32_e64 v7, v1, -1, vcc
; GFX6-NEXT: s_cmp_lg_u32 s3, 0
; GFX6-NEXT: v_mov_b32_e32 v0, s14
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
; GFX6-NEXT: s_addc_u32 s3, s7, s15
; GFX6-NEXT: v_mov_b32_e32 v1, s15
; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1]
; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
; GFX6-NEXT: v_mov_b32_e32 v2, s1
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX6-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc
; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc
; GFX6-NEXT: v_mov_b32_e32 v2, s2
; GFX6-NEXT: v_mov_b32_e32 v3, s3
; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
; GFX6-NEXT: v_readfirstlane_b32 s0, v4
; GFX6-NEXT: v_readfirstlane_b32 s1, v5
; GFX6-NEXT: v_readfirstlane_b32 s2, v6
; GFX6-NEXT: v_readfirstlane_b32 s3, v7
; GFX6-NEXT: v_readfirstlane_b32 s4, v0
; GFX6-NEXT: v_readfirstlane_b32 s5, v1
; GFX6-NEXT: v_readfirstlane_b32 s6, v2
; GFX6-NEXT: v_readfirstlane_b32 s7, v3
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_uaddsat_v2i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_add_u32 s0, s0, s8
; GFX8-NEXT: s_cselect_b32 s16, 1, 0
; GFX8-NEXT: s_and_b32 s16, s16, 1
; GFX8-NEXT: s_cmp_lg_u32 s16, 0
; GFX8-NEXT: s_addc_u32 s1, s1, s9
; GFX8-NEXT: s_cselect_b32 s16, 1, 0
; GFX8-NEXT: s_and_b32 s16, s16, 1
; GFX8-NEXT: s_cmp_lg_u32 s16, 0
; GFX8-NEXT: s_addc_u32 s2, s2, s10
; GFX8-NEXT: s_cselect_b32 s16, 1, 0
; GFX8-NEXT: s_and_b32 s16, s16, 1
; GFX8-NEXT: v_mov_b32_e32 v2, s8
; GFX8-NEXT: s_cmp_lg_u32 s16, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s9
; GFX8-NEXT: s_addc_u32 s3, s3, s11
; GFX8-NEXT: v_mov_b32_e32 v0, s10
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[10:11]
; GFX8-NEXT: s_cselect_b32 s10, 1, 0
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
; GFX8-NEXT: s_and_b32 s8, 1, s10
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s8
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: s_add_u32 s0, s4, s12
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: s_cselect_b32 s1, 1, 0
; GFX8-NEXT: s_and_b32 s1, s1, 1
; GFX8-NEXT: s_cmp_lg_u32 s1, 0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: s_addc_u32 s1, s5, s13
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: s_cselect_b32 s2, 1, 0
; GFX8-NEXT: s_and_b32 s2, s2, 1
; GFX8-NEXT: s_cmp_lg_u32 s2, 0
; GFX8-NEXT: s_addc_u32 s2, s6, s14
; GFX8-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_cselect_b32 s3, 1, 0
; GFX8-NEXT: v_cndmask_b32_e64 v5, v2, -1, vcc
; GFX8-NEXT: s_and_b32 s3, s3, 1
; GFX8-NEXT: v_mov_b32_e32 v2, s12
; GFX8-NEXT: s_cmp_lg_u32 s3, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s13
; GFX8-NEXT: v_cndmask_b32_e64 v6, v0, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v7, v1, -1, vcc
; GFX8-NEXT: s_addc_u32 s3, s7, s15
; GFX8-NEXT: v_mov_b32_e32 v0, s14
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v1, s15
; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[14:15]
; GFX8-NEXT: s_cselect_b32 s4, 1, 0
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
; GFX8-NEXT: s_and_b32 s4, 1, s4
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX8-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
; GFX8-NEXT: v_readfirstlane_b32 s0, v4
; GFX8-NEXT: v_readfirstlane_b32 s1, v5
; GFX8-NEXT: v_readfirstlane_b32 s2, v6
; GFX8-NEXT: v_readfirstlane_b32 s3, v7
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_readfirstlane_b32 s6, v2
; GFX8-NEXT: v_readfirstlane_b32 s7, v3
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_uaddsat_v2i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s0, s0, s8
; GFX9-NEXT: s_cselect_b32 s16, 1, 0
; GFX9-NEXT: s_and_b32 s16, s16, 1
; GFX9-NEXT: s_cmp_lg_u32 s16, 0
; GFX9-NEXT: s_addc_u32 s1, s1, s9
; GFX9-NEXT: s_cselect_b32 s16, 1, 0
; GFX9-NEXT: s_and_b32 s16, s16, 1
; GFX9-NEXT: s_cmp_lg_u32 s16, 0
; GFX9-NEXT: s_addc_u32 s2, s2, s10
; GFX9-NEXT: s_cselect_b32 s16, 1, 0
; GFX9-NEXT: s_and_b32 s16, s16, 1
; GFX9-NEXT: v_mov_b32_e32 v2, s8
; GFX9-NEXT: s_cmp_lg_u32 s16, 0
; GFX9-NEXT: v_mov_b32_e32 v3, s9
; GFX9-NEXT: s_addc_u32 s3, s3, s11
; GFX9-NEXT: v_mov_b32_e32 v0, s10
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v1, s11
; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[10:11]
; GFX9-NEXT: s_cselect_b32 s10, 1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
; GFX9-NEXT: s_and_b32 s8, 1, s10
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s8
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: s_add_u32 s0, s4, s12
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: s_cselect_b32 s1, 1, 0
; GFX9-NEXT: s_and_b32 s1, s1, 1
; GFX9-NEXT: s_cmp_lg_u32 s1, 0
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: s_addc_u32 s1, s5, s13
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: s_cselect_b32 s2, 1, 0
; GFX9-NEXT: s_and_b32 s2, s2, 1
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-NEXT: s_addc_u32 s2, s6, s14
; GFX9-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: s_cselect_b32 s3, 1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v5, v2, -1, vcc
; GFX9-NEXT: s_and_b32 s3, s3, 1
; GFX9-NEXT: v_mov_b32_e32 v2, s12
; GFX9-NEXT: s_cmp_lg_u32 s3, 0
; GFX9-NEXT: v_mov_b32_e32 v3, s13
; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v7, v1, -1, vcc
; GFX9-NEXT: s_addc_u32 s3, s7, s15
; GFX9-NEXT: v_mov_b32_e32 v0, s14
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v1, s15
; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[14:15]
; GFX9-NEXT: s_cselect_b32 s4, 1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
; GFX9-NEXT: s_and_b32 s4, 1, s4
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc
; GFX9-NEXT: v_readfirstlane_b32 s0, v4
; GFX9-NEXT: v_readfirstlane_b32 s1, v5
; GFX9-NEXT: v_readfirstlane_b32 s2, v6
; GFX9-NEXT: v_readfirstlane_b32 s3, v7
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_readfirstlane_b32 s6, v2
; GFX9-NEXT: v_readfirstlane_b32 s7, v3
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_uaddsat_v2i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s8
; GFX10-NEXT: s_cselect_b32 s16, 1, 0
; GFX10-NEXT: s_and_b32 s16, s16, 1
; GFX10-NEXT: s_cmp_lg_u32 s16, 0
; GFX10-NEXT: s_addc_u32 s1, s1, s9
; GFX10-NEXT: s_cselect_b32 s16, 1, 0
; GFX10-NEXT: v_cmp_lt_u64_e64 s8, s[0:1], s[8:9]
; GFX10-NEXT: s_and_b32 s16, s16, 1
; GFX10-NEXT: s_cmp_lg_u32 s16, 0
; GFX10-NEXT: s_addc_u32 s2, s2, s10
; GFX10-NEXT: s_cselect_b32 s16, 1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s8
; GFX10-NEXT: s_and_b32 s16, s16, 1
; GFX10-NEXT: s_cmp_lg_u32 s16, 0
; GFX10-NEXT: s_addc_u32 s3, s3, s11
; GFX10-NEXT: s_cmp_eq_u64 s[2:3], s[10:11]
; GFX10-NEXT: v_cmp_lt_u64_e64 s10, s[2:3], s[10:11]
; GFX10-NEXT: s_cselect_b32 s16, 1, 0
; GFX10-NEXT: s_and_b32 s8, 1, s16
; GFX10-NEXT: s_add_u32 s4, s4, s12
; GFX10-NEXT: s_cselect_b32 s9, 1, 0
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8
; GFX10-NEXT: s_and_b32 s9, s9, 1
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s10
; GFX10-NEXT: s_cmp_lg_u32 s9, 0
; GFX10-NEXT: s_addc_u32 s5, s5, s13
; GFX10-NEXT: s_cselect_b32 s9, 1, 0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX10-NEXT: s_and_b32 s9, s9, 1
; GFX10-NEXT: s_cmp_lg_u32 s9, 0
; GFX10-NEXT: v_cmp_lt_u64_e64 s9, s[4:5], s[12:13]
; GFX10-NEXT: s_addc_u32 s6, s6, s14
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: s_and_b32 s8, s8, 1
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s9
; GFX10-NEXT: s_cmp_lg_u32 s8, 0
; GFX10-NEXT: s_addc_u32 s7, s7, s15
; GFX10-NEXT: s_cmp_eq_u64 s[6:7], s[14:15]
; GFX10-NEXT: v_cmp_lt_u64_e64 s9, s[6:7], s[14:15]
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
; GFX10-NEXT: s_and_b32 s8, 1, s8
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s9
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v1
; GFX10-NEXT: v_cndmask_b32_e64 v1, s0, -1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, s1, -1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v3, s2, -1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v4, s3, -1, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: v_readfirstlane_b32 s0, v1
; GFX10-NEXT: v_readfirstlane_b32 s1, v2
; GFX10-NEXT: v_readfirstlane_b32 s2, v3
; GFX10-NEXT: v_readfirstlane_b32 s3, v4
; GFX10-NEXT: v_cndmask_b32_e64 v0, s4, -1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, s5, -1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, s6, -1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v3, s7, -1, vcc_lo
; GFX10-NEXT: v_readfirstlane_b32 s4, v0
; GFX10-NEXT: v_readfirstlane_b32 s5, v1
; GFX10-NEXT: v_readfirstlane_b32 s6, v2
; GFX10-NEXT: v_readfirstlane_b32 s7, v3
; GFX10-NEXT: ; return to shader part epilog
  %result = call <2 x i128> @llvm.uadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
  ret <2 x i128> %result
}

; Declarations of the llvm.uadd.sat overloads, grouped by element width. Every
; declaration uses attribute group #0.
declare i7 @llvm.uadd.sat.i7(i7, i7) #0
declare i8 @llvm.uadd.sat.i8(i8, i8) #0
declare <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8>, <2 x i8>) #0
declare <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8>, <4 x i8>) #0

declare i16 @llvm.uadd.sat.i16(i16, i16) #0
declare <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16>, <2 x i16>) #0
declare <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16>, <3 x i16>) #0
declare <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16>, <4 x i16>) #0
declare <5 x i16> @llvm.uadd.sat.v5i16(<5 x i16>, <5 x i16>) #0
declare <6 x i16> @llvm.uadd.sat.v6i16(<6 x i16>, <6 x i16>) #0
declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16>, <8 x i16>) #0

declare i24 @llvm.uadd.sat.i24(i24, i24) #0

declare i32 @llvm.uadd.sat.i32(i32, i32) #0
declare <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32>, <2 x i32>) #0
declare <3 x i32> @llvm.uadd.sat.v3i32(<3 x i32>, <3 x i32>) #0
declare <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32>, <4 x i32>) #0
declare <5 x i32> @llvm.uadd.sat.v5i32(<5 x i32>, <5 x i32>) #0
declare <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32>, <16 x i32>) #0

declare i48 @llvm.uadd.sat.i48(i48, i48) #0

declare i64 @llvm.uadd.sat.i64(i64, i64) #0
declare <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64>, <2 x i64>) #0

declare i128 @llvm.uadd.sat.i128(i128, i128) #0
declare <2 x i128> @llvm.uadd.sat.v2i128(<2 x i128>, <2 x i128>) #0

; Attribute group referenced by the intrinsic declarations above.
attributes #0 = { nounwind readnone speculatable willreturn }