1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s 3; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s 4; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s 5; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s 6 7define i7 @v_saddsat_i7(i7 %lhs, i7 %rhs) { 8; GFX6-LABEL: v_saddsat_i7: 9; GFX6: ; %bb.0: 10; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11; GFX6-NEXT: v_lshlrev_b32_e32 v0, 25, v0 12; GFX6-NEXT: v_min_i32_e32 v3, 0, v0 13; GFX6-NEXT: v_lshlrev_b32_e32 v1, 25, v1 14; GFX6-NEXT: v_max_i32_e32 v2, 0, v0 15; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3 16; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2 17; GFX6-NEXT: v_max_i32_e32 v1, v3, v1 18; GFX6-NEXT: v_min_i32_e32 v1, v1, v2 19; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 20; GFX6-NEXT: v_ashrrev_i32_e32 v0, 25, v0 21; GFX6-NEXT: s_setpc_b64 s[30:31] 22; 23; GFX8-LABEL: v_saddsat_i7: 24; GFX8: ; %bb.0: 25; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 26; GFX8-NEXT: v_lshlrev_b16_e32 v0, 9, v0 27; GFX8-NEXT: v_min_i16_e32 v3, 0, v0 28; GFX8-NEXT: v_lshlrev_b16_e32 v1, 9, v1 29; GFX8-NEXT: v_max_i16_e32 v2, 0, v0 30; GFX8-NEXT: v_sub_u16_e32 v3, 0x8000, v3 31; GFX8-NEXT: v_sub_u16_e32 v2, 0x7fff, v2 32; GFX8-NEXT: v_max_i16_e32 v1, v3, v1 33; GFX8-NEXT: v_min_i16_e32 v1, v1, v2 34; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 35; GFX8-NEXT: v_ashrrev_i16_e32 v0, 9, v0 36; GFX8-NEXT: s_setpc_b64 s[30:31] 37; 38; GFX9-LABEL: v_saddsat_i7: 39; GFX9: ; %bb.0: 40; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 41; GFX9-NEXT: v_lshlrev_b16_e32 v0, 9, v0 42; GFX9-NEXT: v_lshlrev_b16_e32 v1, 9, v1 43; GFX9-NEXT: v_add_i16 v0, v0, v1 clamp 44; GFX9-NEXT: v_ashrrev_i16_e32 v0, 9, v0 45; GFX9-NEXT: s_setpc_b64 
s[30:31] 46; 47; GFX10-LABEL: v_saddsat_i7: 48; GFX10: ; %bb.0: 49; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 50; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 51; GFX10-NEXT: v_lshlrev_b16 v0, 9, v0 52; GFX10-NEXT: v_lshlrev_b16 v1, 9, v1 53; GFX10-NEXT: v_add_nc_i16 v0, v0, v1 clamp 54; GFX10-NEXT: v_ashrrev_i16 v0, 9, v0 55; GFX10-NEXT: s_setpc_b64 s[30:31] 56 %result = call i7 @llvm.sadd.sat.i7(i7 %lhs, i7 %rhs) 57 ret i7 %result 58} 59 60define amdgpu_ps i7 @s_saddsat_i7(i7 inreg %lhs, i7 inreg %rhs) { 61; GFX6-LABEL: s_saddsat_i7: 62; GFX6: ; %bb.0: 63; GFX6-NEXT: s_lshl_b32 s0, s0, 25 64; GFX6-NEXT: s_min_i32 s3, s0, 0 65; GFX6-NEXT: s_lshl_b32 s1, s1, 25 66; GFX6-NEXT: s_max_i32 s2, s0, 0 67; GFX6-NEXT: s_sub_i32 s3, 0x80000000, s3 68; GFX6-NEXT: s_sub_i32 s2, 0x7fffffff, s2 69; GFX6-NEXT: s_max_i32 s1, s3, s1 70; GFX6-NEXT: s_min_i32 s1, s1, s2 71; GFX6-NEXT: s_add_i32 s0, s0, s1 72; GFX6-NEXT: s_ashr_i32 s0, s0, 25 73; GFX6-NEXT: ; return to shader part epilog 74; 75; GFX8-LABEL: s_saddsat_i7: 76; GFX8: ; %bb.0: 77; GFX8-NEXT: s_bfe_u32 s2, 9, 0x100000 78; GFX8-NEXT: s_lshl_b32 s0, s0, s2 79; GFX8-NEXT: s_sext_i32_i16 s3, s0 80; GFX8-NEXT: s_sext_i32_i16 s4, 0 81; GFX8-NEXT: s_max_i32 s5, s3, s4 82; GFX8-NEXT: s_min_i32 s3, s3, s4 83; GFX8-NEXT: s_lshl_b32 s1, s1, s2 84; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3 85; GFX8-NEXT: s_sext_i32_i16 s3, s3 86; GFX8-NEXT: s_sext_i32_i16 s1, s1 87; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 88; GFX8-NEXT: s_max_i32 s1, s3, s1 89; GFX8-NEXT: s_sext_i32_i16 s1, s1 90; GFX8-NEXT: s_sext_i32_i16 s3, s5 91; GFX8-NEXT: s_min_i32 s1, s1, s3 92; GFX8-NEXT: s_add_i32 s0, s0, s1 93; GFX8-NEXT: s_sext_i32_i16 s0, s0 94; GFX8-NEXT: s_ashr_i32 s0, s0, s2 95; GFX8-NEXT: ; return to shader part epilog 96; 97; GFX9-LABEL: s_saddsat_i7: 98; GFX9: ; %bb.0: 99; GFX9-NEXT: s_bfe_u32 s2, 9, 0x100000 100; GFX9-NEXT: s_lshl_b32 s1, s1, s2 101; GFX9-NEXT: s_lshl_b32 s0, s0, s2 102; GFX9-NEXT: v_mov_b32_e32 v0, s1 103; GFX9-NEXT: v_add_i16 v0, 
s0, v0 clamp 104; GFX9-NEXT: v_ashrrev_i16_e32 v0, 9, v0 105; GFX9-NEXT: v_readfirstlane_b32 s0, v0 106; GFX9-NEXT: ; return to shader part epilog 107; 108; GFX10-LABEL: s_saddsat_i7: 109; GFX10: ; %bb.0: 110; GFX10-NEXT: s_bfe_u32 s2, 9, 0x100000 111; GFX10-NEXT: s_lshl_b32 s0, s0, s2 112; GFX10-NEXT: s_lshl_b32 s1, s1, s2 113; GFX10-NEXT: v_add_nc_i16 v0, s0, s1 clamp 114; GFX10-NEXT: v_ashrrev_i16 v0, 9, v0 115; GFX10-NEXT: v_readfirstlane_b32 s0, v0 116; GFX10-NEXT: ; return to shader part epilog 117 %result = call i7 @llvm.sadd.sat.i7(i7 %lhs, i7 %rhs) 118 ret i7 %result 119} 120 121define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) { 122; GFX6-LABEL: v_saddsat_i8: 123; GFX6: ; %bb.0: 124; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 125; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 126; GFX6-NEXT: v_min_i32_e32 v3, 0, v0 127; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 128; GFX6-NEXT: v_max_i32_e32 v2, 0, v0 129; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3 130; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2 131; GFX6-NEXT: v_max_i32_e32 v1, v3, v1 132; GFX6-NEXT: v_min_i32_e32 v1, v1, v2 133; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 134; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0 135; GFX6-NEXT: s_setpc_b64 s[30:31] 136; 137; GFX8-LABEL: v_saddsat_i8: 138; GFX8: ; %bb.0: 139; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 140; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 141; GFX8-NEXT: v_min_i16_e32 v3, 0, v0 142; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 143; GFX8-NEXT: v_max_i16_e32 v2, 0, v0 144; GFX8-NEXT: v_sub_u16_e32 v3, 0x8000, v3 145; GFX8-NEXT: v_sub_u16_e32 v2, 0x7fff, v2 146; GFX8-NEXT: v_max_i16_e32 v1, v3, v1 147; GFX8-NEXT: v_min_i16_e32 v1, v1, v2 148; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 149; GFX8-NEXT: v_ashrrev_i16_e32 v0, 8, v0 150; GFX8-NEXT: s_setpc_b64 s[30:31] 151; 152; GFX9-LABEL: v_saddsat_i8: 153; GFX9: ; %bb.0: 154; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 155; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 156; GFX9-NEXT: v_lshlrev_b16_e32 
v1, 8, v1 157; GFX9-NEXT: v_add_i16 v0, v0, v1 clamp 158; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0 159; GFX9-NEXT: s_setpc_b64 s[30:31] 160; 161; GFX10-LABEL: v_saddsat_i8: 162; GFX10: ; %bb.0: 163; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 164; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 165; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 166; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 167; GFX10-NEXT: v_add_nc_i16 v0, v0, v1 clamp 168; GFX10-NEXT: v_ashrrev_i16 v0, 8, v0 169; GFX10-NEXT: s_setpc_b64 s[30:31] 170 %result = call i8 @llvm.sadd.sat.i8(i8 %lhs, i8 %rhs) 171 ret i8 %result 172} 173 174define amdgpu_ps i8 @s_saddsat_i8(i8 inreg %lhs, i8 inreg %rhs) { 175; GFX6-LABEL: s_saddsat_i8: 176; GFX6: ; %bb.0: 177; GFX6-NEXT: s_lshl_b32 s0, s0, 24 178; GFX6-NEXT: s_min_i32 s3, s0, 0 179; GFX6-NEXT: s_lshl_b32 s1, s1, 24 180; GFX6-NEXT: s_max_i32 s2, s0, 0 181; GFX6-NEXT: s_sub_i32 s3, 0x80000000, s3 182; GFX6-NEXT: s_sub_i32 s2, 0x7fffffff, s2 183; GFX6-NEXT: s_max_i32 s1, s3, s1 184; GFX6-NEXT: s_min_i32 s1, s1, s2 185; GFX6-NEXT: s_add_i32 s0, s0, s1 186; GFX6-NEXT: s_ashr_i32 s0, s0, 24 187; GFX6-NEXT: ; return to shader part epilog 188; 189; GFX8-LABEL: s_saddsat_i8: 190; GFX8: ; %bb.0: 191; GFX8-NEXT: s_bfe_u32 s2, 8, 0x100000 192; GFX8-NEXT: s_lshl_b32 s0, s0, s2 193; GFX8-NEXT: s_sext_i32_i16 s3, s0 194; GFX8-NEXT: s_sext_i32_i16 s4, 0 195; GFX8-NEXT: s_max_i32 s5, s3, s4 196; GFX8-NEXT: s_min_i32 s3, s3, s4 197; GFX8-NEXT: s_lshl_b32 s1, s1, s2 198; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3 199; GFX8-NEXT: s_sext_i32_i16 s3, s3 200; GFX8-NEXT: s_sext_i32_i16 s1, s1 201; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 202; GFX8-NEXT: s_max_i32 s1, s3, s1 203; GFX8-NEXT: s_sext_i32_i16 s1, s1 204; GFX8-NEXT: s_sext_i32_i16 s3, s5 205; GFX8-NEXT: s_min_i32 s1, s1, s3 206; GFX8-NEXT: s_add_i32 s0, s0, s1 207; GFX8-NEXT: s_sext_i32_i16 s0, s0 208; GFX8-NEXT: s_ashr_i32 s0, s0, s2 209; GFX8-NEXT: ; return to shader part epilog 210; 211; GFX9-LABEL: s_saddsat_i8: 212; GFX9: ; %bb.0: 213; 
GFX9-NEXT: s_bfe_u32 s2, 8, 0x100000 214; GFX9-NEXT: s_lshl_b32 s1, s1, s2 215; GFX9-NEXT: s_lshl_b32 s0, s0, s2 216; GFX9-NEXT: v_mov_b32_e32 v0, s1 217; GFX9-NEXT: v_add_i16 v0, s0, v0 clamp 218; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0 219; GFX9-NEXT: v_readfirstlane_b32 s0, v0 220; GFX9-NEXT: ; return to shader part epilog 221; 222; GFX10-LABEL: s_saddsat_i8: 223; GFX10: ; %bb.0: 224; GFX10-NEXT: s_bfe_u32 s2, 8, 0x100000 225; GFX10-NEXT: s_lshl_b32 s0, s0, s2 226; GFX10-NEXT: s_lshl_b32 s1, s1, s2 227; GFX10-NEXT: v_add_nc_i16 v0, s0, s1 clamp 228; GFX10-NEXT: v_ashrrev_i16 v0, 8, v0 229; GFX10-NEXT: v_readfirstlane_b32 s0, v0 230; GFX10-NEXT: ; return to shader part epilog 231 %result = call i8 @llvm.sadd.sat.i8(i8 %lhs, i8 %rhs) 232 ret i8 %result 233} 234 235define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { 236; GFX6-LABEL: v_saddsat_v2i8: 237; GFX6: ; %bb.0: 238; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 239; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0 240; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 241; GFX6-NEXT: s_brev_b32 s5, 1 242; GFX6-NEXT: v_min_i32_e32 v5, 0, v0 243; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v1 244; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 245; GFX6-NEXT: s_brev_b32 s4, -2 246; GFX6-NEXT: v_max_i32_e32 v4, 0, v0 247; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5 248; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4 249; GFX6-NEXT: v_max_i32_e32 v1, v5, v1 250; GFX6-NEXT: v_min_i32_e32 v1, v1, v4 251; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 252; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2 253; GFX6-NEXT: v_min_i32_e32 v4, 0, v1 254; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 255; GFX6-NEXT: v_max_i32_e32 v3, 0, v1 256; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v4 257; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 258; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 259; GFX6-NEXT: v_min_i32_e32 v2, v2, v3 260; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 261; GFX6-NEXT: v_ashrrev_i32_e32 v1, 24, v1 262; GFX6-NEXT: v_mov_b32_e32 v2, 0xff 263; GFX6-NEXT: v_ashrrev_i32_e32 v0, 
24, v0 264; GFX6-NEXT: v_and_b32_e32 v1, v1, v2 265; GFX6-NEXT: v_and_b32_e32 v0, v0, v2 266; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 267; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 268; GFX6-NEXT: s_setpc_b64 s[30:31] 269; 270; GFX8-LABEL: v_saddsat_v2i8: 271; GFX8: ; %bb.0: 272; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 273; GFX8-NEXT: v_mov_b32_e32 v2, 8 274; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 275; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 276; GFX8-NEXT: s_movk_i32 s5, 0x8000 277; GFX8-NEXT: v_min_i16_e32 v5, 0, v0 278; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 279; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 280; GFX8-NEXT: s_movk_i32 s4, 0x7fff 281; GFX8-NEXT: v_max_i16_e32 v4, 0, v0 282; GFX8-NEXT: v_sub_u16_e32 v5, s5, v5 283; GFX8-NEXT: v_sub_u16_e32 v4, s4, v4 284; GFX8-NEXT: v_max_i16_e32 v1, v5, v1 285; GFX8-NEXT: v_min_i16_e32 v1, v1, v4 286; GFX8-NEXT: v_min_i16_e32 v4, 0, v3 287; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 288; GFX8-NEXT: v_max_i16_e32 v1, 0, v3 289; GFX8-NEXT: v_sub_u16_e32 v4, s5, v4 290; GFX8-NEXT: v_sub_u16_e32 v1, s4, v1 291; GFX8-NEXT: v_max_i16_e32 v2, v4, v2 292; GFX8-NEXT: v_min_i16_e32 v1, v2, v1 293; GFX8-NEXT: v_add_u16_e32 v1, v3, v1 294; GFX8-NEXT: v_mov_b32_e32 v2, 0xff 295; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 296; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 297; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 298; GFX8-NEXT: s_setpc_b64 s[30:31] 299; 300; GFX9-LABEL: v_saddsat_v2i8: 301; GFX9: ; %bb.0: 302; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 303; GFX9-NEXT: s_mov_b32 s4, 8 304; GFX9-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 305; GFX9-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 306; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff 307; GFX9-NEXT: v_and_or_b32 v0, v0, v4, v2 308; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v3 309; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] 310; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] 311; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp 312; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] 313; GFX9-NEXT: s_movk_i32 s4, 0xff 314; GFX9-NEXT: v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 315; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 316; GFX9-NEXT: s_setpc_b64 s[30:31] 317; 318; GFX10-LABEL: v_saddsat_v2i8: 319; GFX10: ; %bb.0: 320; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 321; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 322; GFX10-NEXT: s_mov_b32 s4, 8 323; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff 324; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 325; GFX10-NEXT: v_lshrrev_b32_sdwa v4, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 326; GFX10-NEXT: s_movk_i32 s4, 0xff 327; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3 328; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v4 329; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] 330; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] 331; GFX10-NEXT: v_pk_add_i16 v0, v0, v1 clamp 332; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] 333; GFX10-NEXT: v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 334; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 335; GFX10-NEXT: s_setpc_b64 s[30:31] 336 %lhs = bitcast i16 %lhs.arg to <2 x i8> 337 %rhs = bitcast i16 %rhs.arg to <2 x i8> 338 %result = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs) 339 %cast.result = bitcast <2 x i8> %result to i16 340 ret i16 
%cast.result 341} 342 343define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { 344; GFX6-LABEL: s_saddsat_v2i8: 345; GFX6: ; %bb.0: 346; GFX6-NEXT: s_lshr_b32 s2, s0, 8 347; GFX6-NEXT: s_lshl_b32 s0, s0, 24 348; GFX6-NEXT: s_brev_b32 s5, 1 349; GFX6-NEXT: s_min_i32 s7, s0, 0 350; GFX6-NEXT: s_lshr_b32 s3, s1, 8 351; GFX6-NEXT: s_lshl_b32 s1, s1, 24 352; GFX6-NEXT: s_brev_b32 s4, -2 353; GFX6-NEXT: s_max_i32 s6, s0, 0 354; GFX6-NEXT: s_sub_i32 s7, s5, s7 355; GFX6-NEXT: s_sub_i32 s6, s4, s6 356; GFX6-NEXT: s_max_i32 s1, s7, s1 357; GFX6-NEXT: s_min_i32 s1, s1, s6 358; GFX6-NEXT: s_add_i32 s0, s0, s1 359; GFX6-NEXT: s_lshl_b32 s1, s2, 24 360; GFX6-NEXT: s_lshl_b32 s2, s3, 24 361; GFX6-NEXT: s_max_i32 s3, s1, 0 362; GFX6-NEXT: s_sub_i32 s3, s4, s3 363; GFX6-NEXT: s_min_i32 s4, s1, 0 364; GFX6-NEXT: s_sub_i32 s4, s5, s4 365; GFX6-NEXT: s_max_i32 s2, s4, s2 366; GFX6-NEXT: s_min_i32 s2, s2, s3 367; GFX6-NEXT: s_add_i32 s1, s1, s2 368; GFX6-NEXT: s_ashr_i32 s1, s1, 24 369; GFX6-NEXT: s_movk_i32 s2, 0xff 370; GFX6-NEXT: s_ashr_i32 s0, s0, 24 371; GFX6-NEXT: s_and_b32 s1, s1, s2 372; GFX6-NEXT: s_and_b32 s0, s0, s2 373; GFX6-NEXT: s_lshl_b32 s1, s1, 8 374; GFX6-NEXT: s_or_b32 s0, s0, s1 375; GFX6-NEXT: ; return to shader part epilog 376; 377; GFX8-LABEL: s_saddsat_v2i8: 378; GFX8: ; %bb.0: 379; GFX8-NEXT: s_bfe_u32 s4, 8, 0x100000 380; GFX8-NEXT: s_lshr_b32 s2, s0, 8 381; GFX8-NEXT: s_lshl_b32 s0, s0, s4 382; GFX8-NEXT: s_sext_i32_i16 s7, s0 383; GFX8-NEXT: s_sext_i32_i16 s8, 0 384; GFX8-NEXT: s_movk_i32 s6, 0x8000 385; GFX8-NEXT: s_max_i32 s9, s7, s8 386; GFX8-NEXT: s_min_i32 s7, s7, s8 387; GFX8-NEXT: s_lshr_b32 s3, s1, 8 388; GFX8-NEXT: s_lshl_b32 s1, s1, s4 389; GFX8-NEXT: s_sub_i32 s7, s6, s7 390; GFX8-NEXT: s_movk_i32 s5, 0x7fff 391; GFX8-NEXT: s_sext_i32_i16 s7, s7 392; GFX8-NEXT: s_sext_i32_i16 s1, s1 393; GFX8-NEXT: s_sub_i32 s9, s5, s9 394; GFX8-NEXT: s_max_i32 s1, s7, s1 395; GFX8-NEXT: s_sext_i32_i16 s1, s1 396; GFX8-NEXT: 
s_sext_i32_i16 s7, s9 397; GFX8-NEXT: s_min_i32 s1, s1, s7 398; GFX8-NEXT: s_add_i32 s0, s0, s1 399; GFX8-NEXT: s_lshl_b32 s1, s2, s4 400; GFX8-NEXT: s_lshl_b32 s2, s3, s4 401; GFX8-NEXT: s_sext_i32_i16 s3, s1 402; GFX8-NEXT: s_max_i32 s7, s3, s8 403; GFX8-NEXT: s_min_i32 s3, s3, s8 404; GFX8-NEXT: s_sub_i32 s3, s6, s3 405; GFX8-NEXT: s_sext_i32_i16 s3, s3 406; GFX8-NEXT: s_sext_i32_i16 s2, s2 407; GFX8-NEXT: s_sub_i32 s5, s5, s7 408; GFX8-NEXT: s_max_i32 s2, s3, s2 409; GFX8-NEXT: s_sext_i32_i16 s2, s2 410; GFX8-NEXT: s_sext_i32_i16 s3, s5 411; GFX8-NEXT: s_min_i32 s2, s2, s3 412; GFX8-NEXT: s_add_i32 s1, s1, s2 413; GFX8-NEXT: s_sext_i32_i16 s1, s1 414; GFX8-NEXT: s_sext_i32_i16 s0, s0 415; GFX8-NEXT: s_ashr_i32 s1, s1, s4 416; GFX8-NEXT: s_movk_i32 s2, 0xff 417; GFX8-NEXT: s_ashr_i32 s0, s0, s4 418; GFX8-NEXT: s_and_b32 s1, s1, s2 419; GFX8-NEXT: s_and_b32 s0, s0, s2 420; GFX8-NEXT: s_lshl_b32 s1, s1, s4 421; GFX8-NEXT: s_or_b32 s0, s0, s1 422; GFX8-NEXT: ; return to shader part epilog 423; 424; GFX9-LABEL: s_saddsat_v2i8: 425; GFX9: ; %bb.0: 426; GFX9-NEXT: s_lshr_b32 s2, s0, 8 427; GFX9-NEXT: s_lshr_b32 s3, s1, 8 428; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 429; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 430; GFX9-NEXT: s_mov_b32 s2, 0x80008 431; GFX9-NEXT: s_lshr_b32 s3, s0, 16 432; GFX9-NEXT: s_lshl_b32 s0, s0, s2 433; GFX9-NEXT: s_lshl_b32 s3, s3, 8 434; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 435; GFX9-NEXT: s_lshr_b32 s3, s1, 16 436; GFX9-NEXT: s_lshl_b32 s1, s1, s2 437; GFX9-NEXT: s_lshl_b32 s2, s3, 8 438; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 439; GFX9-NEXT: v_mov_b32_e32 v0, s1 440; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp 441; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] 442; GFX9-NEXT: s_movk_i32 s0, 0xff 443; GFX9-NEXT: v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 444; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 445; GFX9-NEXT: 
v_readfirstlane_b32 s0, v0 446; GFX9-NEXT: ; return to shader part epilog 447; 448; GFX10-LABEL: s_saddsat_v2i8: 449; GFX10: ; %bb.0: 450; GFX10-NEXT: s_lshr_b32 s2, s0, 8 451; GFX10-NEXT: s_lshr_b32 s3, s1, 8 452; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 453; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 454; GFX10-NEXT: s_mov_b32 s2, 0x80008 455; GFX10-NEXT: s_lshr_b32 s3, s0, 16 456; GFX10-NEXT: s_lshr_b32 s4, s1, 16 457; GFX10-NEXT: s_lshl_b32 s0, s0, s2 458; GFX10-NEXT: s_lshl_b32 s3, s3, 8 459; GFX10-NEXT: s_lshl_b32 s1, s1, s2 460; GFX10-NEXT: s_lshl_b32 s2, s4, 8 461; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 462; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2 463; GFX10-NEXT: v_pk_add_i16 v0, s0, s1 clamp 464; GFX10-NEXT: s_movk_i32 s0, 0xff 465; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] 466; GFX10-NEXT: v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 467; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 468; GFX10-NEXT: v_readfirstlane_b32 s0, v0 469; GFX10-NEXT: ; return to shader part epilog 470 %lhs = bitcast i16 %lhs.arg to <2 x i8> 471 %rhs = bitcast i16 %rhs.arg to <2 x i8> 472 %result = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs) 473 %cast.result = bitcast <2 x i8> %result to i16 474 ret i16 %cast.result 475} 476 477define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { 478; GFX6-LABEL: v_saddsat_v4i8: 479; GFX6: ; %bb.0: 480; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 481; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0 482; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 483; GFX6-NEXT: v_lshrrev_b32_e32 v4, 24, v0 484; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 485; GFX6-NEXT: s_brev_b32 s5, 1 486; GFX6-NEXT: v_min_i32_e32 v10, 0, v0 487; GFX6-NEXT: v_lshrrev_b32_e32 v5, 8, v1 488; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v1 489; GFX6-NEXT: v_lshrrev_b32_e32 v7, 24, v1 490; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 491; GFX6-NEXT: s_brev_b32 
s4, -2 492; GFX6-NEXT: v_max_i32_e32 v8, 0, v0 493; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s5, v10 494; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s4, v8 495; GFX6-NEXT: v_max_i32_e32 v1, v10, v1 496; GFX6-NEXT: v_min_i32_e32 v1, v1, v8 497; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 498; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2 499; GFX6-NEXT: v_min_i32_e32 v8, 0, v1 500; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v5 501; GFX6-NEXT: v_max_i32_e32 v5, 0, v1 502; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s5, v8 503; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v5 504; GFX6-NEXT: v_max_i32_e32 v2, v8, v2 505; GFX6-NEXT: v_min_i32_e32 v2, v2, v5 506; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 507; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 508; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v6 509; GFX6-NEXT: v_min_i32_e32 v6, 0, v2 510; GFX6-NEXT: v_bfrev_b32_e32 v9, -2 511; GFX6-NEXT: v_max_i32_e32 v5, 0, v2 512; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s5, v6 513; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 514; GFX6-NEXT: v_max_i32_e32 v3, v6, v3 515; GFX6-NEXT: v_min_i32_e32 v3, v3, v5 516; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 517; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v4 518; GFX6-NEXT: v_bfrev_b32_e32 v11, 1 519; GFX6-NEXT: v_min_i32_e32 v6, 0, v3 520; GFX6-NEXT: v_ashrrev_i32_e32 v1, 24, v1 521; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v7 522; GFX6-NEXT: v_max_i32_e32 v5, 0, v3 523; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v11, v6 524; GFX6-NEXT: s_movk_i32 s4, 0xff 525; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0 526; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 527; GFX6-NEXT: v_max_i32_e32 v4, v6, v4 528; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 529; GFX6-NEXT: v_ashrrev_i32_e32 v2, 24, v2 530; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 531; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 532; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 533; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 534; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 535; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 536; GFX6-NEXT: v_ashrrev_i32_e32 v3, 24, v3 537; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 538; 
GFX6-NEXT: v_or_b32_e32 v0, v0, v1 539; GFX6-NEXT: v_and_b32_e32 v1, s4, v3 540; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 541; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 542; GFX6-NEXT: s_setpc_b64 s[30:31] 543; 544; GFX8-LABEL: v_saddsat_v4i8: 545; GFX8: ; %bb.0: 546; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 547; GFX8-NEXT: v_mov_b32_e32 v2, 8 548; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 549; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 550; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 551; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 552; GFX8-NEXT: s_movk_i32 s5, 0x8000 553; GFX8-NEXT: v_min_i16_e32 v10, 0, v0 554; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 555; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1 556; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 557; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 558; GFX8-NEXT: s_movk_i32 s4, 0x7fff 559; GFX8-NEXT: v_max_i16_e32 v8, 0, v0 560; GFX8-NEXT: v_sub_u16_e32 v10, s5, v10 561; GFX8-NEXT: v_sub_u16_e32 v8, s4, v8 562; GFX8-NEXT: v_max_i16_e32 v1, v10, v1 563; GFX8-NEXT: v_min_i16_e32 v1, v1, v8 564; GFX8-NEXT: v_min_i16_e32 v8, 0, v3 565; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 566; GFX8-NEXT: v_max_i16_e32 v1, 0, v3 567; GFX8-NEXT: v_sub_u16_e32 v8, s5, v8 568; GFX8-NEXT: v_sub_u16_e32 v1, s4, v1 569; GFX8-NEXT: v_max_i16_e32 v2, v8, v2 570; GFX8-NEXT: v_min_i16_e32 v1, v2, v1 571; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v4 572; GFX8-NEXT: v_add_u16_e32 v1, v3, v1 573; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v6 574; GFX8-NEXT: v_min_i16_e32 v6, 0, v2 575; GFX8-NEXT: v_mov_b32_e32 v9, 0x7fff 576; GFX8-NEXT: v_max_i16_e32 v4, 0, v2 577; GFX8-NEXT: v_sub_u16_e32 v6, s5, v6 578; GFX8-NEXT: v_sub_u16_e32 v4, v9, v4 579; GFX8-NEXT: v_max_i16_e32 v3, v6, v3 580; GFX8-NEXT: v_min_i16_e32 v3, v3, v4 581; GFX8-NEXT: v_add_u16_e32 v2, v2, v3 582; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v5 583; GFX8-NEXT: v_min_i16_e32 v6, 0, v3 584; 
GFX8-NEXT: v_lshlrev_b16_e32 v4, 8, v7 585; GFX8-NEXT: v_max_i16_e32 v5, 0, v3 586; GFX8-NEXT: v_sub_u16_e32 v6, 0x8000, v6 587; GFX8-NEXT: v_sub_u16_e32 v5, v9, v5 588; GFX8-NEXT: v_max_i16_e32 v4, v6, v4 589; GFX8-NEXT: v_min_i16_e32 v4, v4, v5 590; GFX8-NEXT: v_add_u16_e32 v3, v3, v4 591; GFX8-NEXT: v_mov_b32_e32 v4, 0xff 592; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 593; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 594; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 595; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 596; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 597; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 598; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 599; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 600; GFX8-NEXT: s_setpc_b64 s[30:31] 601; 602; GFX9-LABEL: v_saddsat_v4i8: 603; GFX9: ; %bb.0: 604; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 605; GFX9-NEXT: s_mov_b32 s4, 8 606; GFX9-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 607; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 608; GFX9-NEXT: v_mov_b32_e32 v8, 0xffff 609; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 610; GFX9-NEXT: v_lshrrev_b32_sdwa v5, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 611; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 612; GFX9-NEXT: v_and_or_b32 v0, v0, v8, v2 613; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4 614; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 615; GFX9-NEXT: v_and_or_b32 v2, v3, v8, v2 616; GFX9-NEXT: v_and_or_b32 v1, v1, v8, v5 617; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v7 618; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] 619; GFX9-NEXT: v_and_or_b32 v3, v6, v8, v3 620; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] 621; GFX9-NEXT: v_pk_lshlrev_b16 
v2, 8, v2 op_sel_hi:[0,1] 622; GFX9-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] 623; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp 624; GFX9-NEXT: v_pk_add_i16 v1, v2, v3 clamp 625; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] 626; GFX9-NEXT: v_mov_b32_e32 v2, 8 627; GFX9-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] 628; GFX9-NEXT: s_movk_i32 s4, 0xff 629; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 630; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v2 631; GFX9-NEXT: v_and_b32_e32 v2, s4, v1 632; GFX9-NEXT: v_mov_b32_e32 v3, 24 633; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 634; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 635; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 636; GFX9-NEXT: s_setpc_b64 s[30:31] 637; 638; GFX10-LABEL: v_saddsat_v4i8: 639; GFX10: ; %bb.0: 640; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 641; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 642; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 643; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 644; GFX10-NEXT: s_mov_b32 s4, 8 645; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v0 646; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 647; GFX10-NEXT: v_lshrrev_b32_sdwa v6, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 648; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff 649; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1 650; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 651; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 652; GFX10-NEXT: s_movk_i32 s4, 0xff 653; GFX10-NEXT: v_and_or_b32 v0, v0, v7, v2 654; GFX10-NEXT: v_and_or_b32 v1, v1, v7, v6 655; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4 656; GFX10-NEXT: v_and_or_b32 v3, v8, v7, v5 657; GFX10-NEXT: v_mov_b32_e32 v4, 24 658; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] 659; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] 660; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 
op_sel_hi:[0,1] 661; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] 662; GFX10-NEXT: v_pk_add_i16 v0, v0, v1 clamp 663; GFX10-NEXT: v_pk_add_i16 v1, v2, v3 clamp 664; GFX10-NEXT: v_mov_b32_e32 v2, 8 665; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] 666; GFX10-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] 667; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 668; GFX10-NEXT: v_and_b32_e32 v3, s4, v1 669; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 670; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v2 671; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 672; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 673; GFX10-NEXT: s_setpc_b64 s[30:31] 674 %lhs = bitcast i32 %lhs.arg to <4 x i8> 675 %rhs = bitcast i32 %rhs.arg to <4 x i8> 676 %result = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs) 677 %cast.result = bitcast <4 x i8> %result to i32 678 ret i32 %cast.result 679} 680 681define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { 682; GFX6-LABEL: s_saddsat_v4i8: 683; GFX6: ; %bb.0: 684; GFX6-NEXT: s_lshr_b32 s2, s0, 8 685; GFX6-NEXT: s_lshr_b32 s3, s0, 16 686; GFX6-NEXT: s_lshr_b32 s4, s0, 24 687; GFX6-NEXT: s_lshl_b32 s0, s0, 24 688; GFX6-NEXT: s_brev_b32 s9, 1 689; GFX6-NEXT: s_min_i32 s11, s0, 0 690; GFX6-NEXT: s_lshr_b32 s5, s1, 8 691; GFX6-NEXT: s_lshr_b32 s6, s1, 16 692; GFX6-NEXT: s_lshr_b32 s7, s1, 24 693; GFX6-NEXT: s_lshl_b32 s1, s1, 24 694; GFX6-NEXT: s_brev_b32 s8, -2 695; GFX6-NEXT: s_max_i32 s10, s0, 0 696; GFX6-NEXT: s_sub_i32 s11, s9, s11 697; GFX6-NEXT: s_sub_i32 s10, s8, s10 698; GFX6-NEXT: s_max_i32 s1, s11, s1 699; GFX6-NEXT: s_min_i32 s1, s1, s10 700; GFX6-NEXT: s_add_i32 s0, s0, s1 701; GFX6-NEXT: s_lshl_b32 s1, s2, 24 702; GFX6-NEXT: s_min_i32 s10, s1, 0 703; GFX6-NEXT: s_lshl_b32 s2, s5, 24 704; GFX6-NEXT: s_max_i32 s5, s1, 0 705; GFX6-NEXT: s_sub_i32 s10, s9, s10 706; GFX6-NEXT: 
s_sub_i32 s5, s8, s5 707; GFX6-NEXT: s_max_i32 s2, s10, s2 708; GFX6-NEXT: s_min_i32 s2, s2, s5 709; GFX6-NEXT: s_add_i32 s1, s1, s2 710; GFX6-NEXT: s_lshl_b32 s2, s3, 24 711; GFX6-NEXT: s_lshl_b32 s3, s6, 24 712; GFX6-NEXT: s_min_i32 s6, s2, 0 713; GFX6-NEXT: s_max_i32 s5, s2, 0 714; GFX6-NEXT: s_sub_i32 s6, s9, s6 715; GFX6-NEXT: s_sub_i32 s5, s8, s5 716; GFX6-NEXT: s_max_i32 s3, s6, s3 717; GFX6-NEXT: s_min_i32 s3, s3, s5 718; GFX6-NEXT: s_add_i32 s2, s2, s3 719; GFX6-NEXT: s_lshl_b32 s3, s4, 24 720; GFX6-NEXT: s_min_i32 s6, s3, 0 721; GFX6-NEXT: s_lshl_b32 s4, s7, 24 722; GFX6-NEXT: s_max_i32 s5, s3, 0 723; GFX6-NEXT: s_sub_i32 s6, s9, s6 724; GFX6-NEXT: s_sub_i32 s5, s8, s5 725; GFX6-NEXT: s_max_i32 s4, s6, s4 726; GFX6-NEXT: s_min_i32 s4, s4, s5 727; GFX6-NEXT: s_ashr_i32 s1, s1, 24 728; GFX6-NEXT: s_add_i32 s3, s3, s4 729; GFX6-NEXT: s_movk_i32 s4, 0xff 730; GFX6-NEXT: s_ashr_i32 s0, s0, 24 731; GFX6-NEXT: s_and_b32 s1, s1, s4 732; GFX6-NEXT: s_ashr_i32 s2, s2, 24 733; GFX6-NEXT: s_and_b32 s0, s0, s4 734; GFX6-NEXT: s_lshl_b32 s1, s1, 8 735; GFX6-NEXT: s_or_b32 s0, s0, s1 736; GFX6-NEXT: s_and_b32 s1, s2, s4 737; GFX6-NEXT: s_ashr_i32 s3, s3, 24 738; GFX6-NEXT: s_lshl_b32 s1, s1, 16 739; GFX6-NEXT: s_or_b32 s0, s0, s1 740; GFX6-NEXT: s_and_b32 s1, s3, s4 741; GFX6-NEXT: s_lshl_b32 s1, s1, 24 742; GFX6-NEXT: s_or_b32 s0, s0, s1 743; GFX6-NEXT: ; return to shader part epilog 744; 745; GFX8-LABEL: s_saddsat_v4i8: 746; GFX8: ; %bb.0: 747; GFX8-NEXT: s_bfe_u32 s8, 8, 0x100000 748; GFX8-NEXT: s_lshr_b32 s2, s0, 8 749; GFX8-NEXT: s_lshr_b32 s3, s0, 16 750; GFX8-NEXT: s_lshr_b32 s4, s0, 24 751; GFX8-NEXT: s_lshl_b32 s0, s0, s8 752; GFX8-NEXT: s_sext_i32_i16 s11, s0 753; GFX8-NEXT: s_sext_i32_i16 s12, 0 754; GFX8-NEXT: s_movk_i32 s10, 0x8000 755; GFX8-NEXT: s_max_i32 s13, s11, s12 756; GFX8-NEXT: s_min_i32 s11, s11, s12 757; GFX8-NEXT: s_lshr_b32 s5, s1, 8 758; GFX8-NEXT: s_lshr_b32 s6, s1, 16 759; GFX8-NEXT: s_lshr_b32 s7, s1, 24 760; GFX8-NEXT: s_lshl_b32 s1, s1, 
s8 761; GFX8-NEXT: s_sub_i32 s11, s10, s11 762; GFX8-NEXT: s_movk_i32 s9, 0x7fff 763; GFX8-NEXT: s_sext_i32_i16 s11, s11 764; GFX8-NEXT: s_sext_i32_i16 s1, s1 765; GFX8-NEXT: s_sub_i32 s13, s9, s13 766; GFX8-NEXT: s_max_i32 s1, s11, s1 767; GFX8-NEXT: s_sext_i32_i16 s1, s1 768; GFX8-NEXT: s_sext_i32_i16 s11, s13 769; GFX8-NEXT: s_min_i32 s1, s1, s11 770; GFX8-NEXT: s_add_i32 s0, s0, s1 771; GFX8-NEXT: s_lshl_b32 s1, s2, s8 772; GFX8-NEXT: s_lshl_b32 s2, s5, s8 773; GFX8-NEXT: s_sext_i32_i16 s5, s1 774; GFX8-NEXT: s_max_i32 s11, s5, s12 775; GFX8-NEXT: s_min_i32 s5, s5, s12 776; GFX8-NEXT: s_sub_i32 s5, s10, s5 777; GFX8-NEXT: s_sext_i32_i16 s5, s5 778; GFX8-NEXT: s_sext_i32_i16 s2, s2 779; GFX8-NEXT: s_sub_i32 s11, s9, s11 780; GFX8-NEXT: s_max_i32 s2, s5, s2 781; GFX8-NEXT: s_sext_i32_i16 s2, s2 782; GFX8-NEXT: s_sext_i32_i16 s5, s11 783; GFX8-NEXT: s_min_i32 s2, s2, s5 784; GFX8-NEXT: s_add_i32 s1, s1, s2 785; GFX8-NEXT: s_lshl_b32 s2, s3, s8 786; GFX8-NEXT: s_sext_i32_i16 s5, s2 787; GFX8-NEXT: s_lshl_b32 s3, s6, s8 788; GFX8-NEXT: s_max_i32 s6, s5, s12 789; GFX8-NEXT: s_min_i32 s5, s5, s12 790; GFX8-NEXT: s_sub_i32 s5, s10, s5 791; GFX8-NEXT: s_sext_i32_i16 s5, s5 792; GFX8-NEXT: s_sext_i32_i16 s3, s3 793; GFX8-NEXT: s_sub_i32 s6, s9, s6 794; GFX8-NEXT: s_max_i32 s3, s5, s3 795; GFX8-NEXT: s_sext_i32_i16 s3, s3 796; GFX8-NEXT: s_sext_i32_i16 s5, s6 797; GFX8-NEXT: s_min_i32 s3, s3, s5 798; GFX8-NEXT: s_add_i32 s2, s2, s3 799; GFX8-NEXT: s_lshl_b32 s3, s4, s8 800; GFX8-NEXT: s_sext_i32_i16 s5, s3 801; GFX8-NEXT: s_max_i32 s6, s5, s12 802; GFX8-NEXT: s_min_i32 s5, s5, s12 803; GFX8-NEXT: s_lshl_b32 s4, s7, s8 804; GFX8-NEXT: s_sub_i32 s5, s10, s5 805; GFX8-NEXT: s_sext_i32_i16 s5, s5 806; GFX8-NEXT: s_sext_i32_i16 s4, s4 807; GFX8-NEXT: s_sub_i32 s6, s9, s6 808; GFX8-NEXT: s_max_i32 s4, s5, s4 809; GFX8-NEXT: s_sext_i32_i16 s4, s4 810; GFX8-NEXT: s_sext_i32_i16 s5, s6 811; GFX8-NEXT: s_sext_i32_i16 s1, s1 812; GFX8-NEXT: s_min_i32 s4, s4, s5 813; GFX8-NEXT: 
s_sext_i32_i16 s0, s0 814; GFX8-NEXT: s_ashr_i32 s1, s1, s8 815; GFX8-NEXT: s_add_i32 s3, s3, s4 816; GFX8-NEXT: s_movk_i32 s4, 0xff 817; GFX8-NEXT: s_ashr_i32 s0, s0, s8 818; GFX8-NEXT: s_sext_i32_i16 s2, s2 819; GFX8-NEXT: s_and_b32 s1, s1, s4 820; GFX8-NEXT: s_ashr_i32 s2, s2, s8 821; GFX8-NEXT: s_and_b32 s0, s0, s4 822; GFX8-NEXT: s_lshl_b32 s1, s1, 8 823; GFX8-NEXT: s_sext_i32_i16 s3, s3 824; GFX8-NEXT: s_or_b32 s0, s0, s1 825; GFX8-NEXT: s_and_b32 s1, s2, s4 826; GFX8-NEXT: s_ashr_i32 s3, s3, s8 827; GFX8-NEXT: s_lshl_b32 s1, s1, 16 828; GFX8-NEXT: s_or_b32 s0, s0, s1 829; GFX8-NEXT: s_and_b32 s1, s3, s4 830; GFX8-NEXT: s_lshl_b32 s1, s1, 24 831; GFX8-NEXT: s_or_b32 s0, s0, s1 832; GFX8-NEXT: ; return to shader part epilog 833; 834; GFX9-LABEL: s_saddsat_v4i8: 835; GFX9: ; %bb.0: 836; GFX9-NEXT: s_lshr_b32 s3, s0, 8 837; GFX9-NEXT: s_lshr_b32 s4, s0, 16 838; GFX9-NEXT: s_lshr_b32 s6, s0, 24 839; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 840; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s6 841; GFX9-NEXT: s_mov_b32 s4, 0x80008 842; GFX9-NEXT: s_lshr_b32 s6, s0, 16 843; GFX9-NEXT: s_lshr_b32 s7, s1, 8 844; GFX9-NEXT: s_lshl_b32 s0, s0, s4 845; GFX9-NEXT: s_lshl_b32 s6, s6, 8 846; GFX9-NEXT: s_lshr_b32 s8, s1, 16 847; GFX9-NEXT: s_lshr_b32 s9, s1, 24 848; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 849; GFX9-NEXT: s_lshr_b32 s6, s3, 16 850; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 851; GFX9-NEXT: s_lshl_b32 s3, s3, s4 852; GFX9-NEXT: s_lshl_b32 s6, s6, 8 853; GFX9-NEXT: s_lshr_b32 s7, s1, 16 854; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6 855; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s9 856; GFX9-NEXT: s_lshl_b32 s1, s1, s4 857; GFX9-NEXT: s_lshl_b32 s7, s7, 8 858; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 859; GFX9-NEXT: s_lshr_b32 s7, s6, 16 860; GFX9-NEXT: s_lshl_b32 s4, s6, s4 861; GFX9-NEXT: s_lshl_b32 s6, s7, 8 862; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s6 863; GFX9-NEXT: v_mov_b32_e32 v0, s1 864; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp 865; GFX9-NEXT: v_mov_b32_e32 v1, s4 866; 
GFX9-NEXT: s_mov_b32 s2, 8 867; GFX9-NEXT: v_pk_add_i16 v1, s3, v1 clamp 868; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] 869; GFX9-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] 870; GFX9-NEXT: s_movk_i32 s0, 0xff 871; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 872; GFX9-NEXT: s_mov_b32 s5, 24 873; GFX9-NEXT: v_and_or_b32 v0, v0, s0, v2 874; GFX9-NEXT: v_and_b32_e32 v2, s0, v1 875; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 876; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 877; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 878; GFX9-NEXT: v_readfirstlane_b32 s0, v0 879; GFX9-NEXT: ; return to shader part epilog 880; 881; GFX10-LABEL: s_saddsat_v4i8: 882; GFX10: ; %bb.0: 883; GFX10-NEXT: s_lshr_b32 s2, s0, 8 884; GFX10-NEXT: s_lshr_b32 s3, s0, 16 885; GFX10-NEXT: s_lshr_b32 s4, s0, 24 886; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 887; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 888; GFX10-NEXT: s_mov_b32 s3, 0x80008 889; GFX10-NEXT: s_lshr_b32 s4, s0, 16 890; GFX10-NEXT: s_lshr_b32 s5, s1, 8 891; GFX10-NEXT: s_lshr_b32 s6, s1, 16 892; GFX10-NEXT: s_lshr_b32 s7, s1, 24 893; GFX10-NEXT: s_lshl_b32 s0, s0, s3 894; GFX10-NEXT: s_lshl_b32 s4, s4, 8 895; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 896; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 897; GFX10-NEXT: s_pack_ll_b32_b16 s4, s6, s7 898; GFX10-NEXT: s_lshr_b32 s8, s2, 16 899; GFX10-NEXT: s_lshr_b32 s5, s1, 16 900; GFX10-NEXT: s_lshr_b32 s6, s4, 16 901; GFX10-NEXT: s_lshl_b32 s2, s2, s3 902; GFX10-NEXT: s_lshl_b32 s8, s8, 8 903; GFX10-NEXT: s_lshl_b32 s1, s1, s3 904; GFX10-NEXT: s_lshl_b32 s5, s5, 8 905; GFX10-NEXT: s_lshl_b32 s3, s4, s3 906; GFX10-NEXT: s_lshl_b32 s4, s6, 8 907; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s8 908; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 909; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4 910; GFX10-NEXT: v_pk_add_i16 v0, s0, s1 clamp 911; GFX10-NEXT: v_pk_add_i16 v1, s2, s3 
clamp 912; GFX10-NEXT: s_mov_b32 s0, 8 913; GFX10-NEXT: s_movk_i32 s1, 0xff 914; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] 915; GFX10-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] 916; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 917; GFX10-NEXT: v_and_b32_e32 v3, s1, v1 918; GFX10-NEXT: s_mov_b32 s0, 24 919; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 920; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2 921; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 922; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 923; GFX10-NEXT: v_readfirstlane_b32 s0, v0 924; GFX10-NEXT: ; return to shader part epilog 925 %lhs = bitcast i32 %lhs.arg to <4 x i8> 926 %rhs = bitcast i32 %rhs.arg to <4 x i8> 927 %result = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs) 928 %cast.result = bitcast <4 x i8> %result to i32 929 ret i32 %cast.result 930} 931 932define i24 @v_saddsat_i24(i24 %lhs, i24 %rhs) { 933; GFX6-LABEL: v_saddsat_i24: 934; GFX6: ; %bb.0: 935; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 936; GFX6-NEXT: v_lshlrev_b32_e32 v0, 8, v0 937; GFX6-NEXT: v_min_i32_e32 v3, 0, v0 938; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 939; GFX6-NEXT: v_max_i32_e32 v2, 0, v0 940; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3 941; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2 942; GFX6-NEXT: v_max_i32_e32 v1, v3, v1 943; GFX6-NEXT: v_min_i32_e32 v1, v1, v2 944; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 945; GFX6-NEXT: v_ashrrev_i32_e32 v0, 8, v0 946; GFX6-NEXT: s_setpc_b64 s[30:31] 947; 948; GFX8-LABEL: v_saddsat_i24: 949; GFX8: ; %bb.0: 950; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 951; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1 952; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 24 953; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 24 954; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v0 955; GFX8-NEXT: v_bfe_i32 v0, v1, 0, 24 956; GFX8-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v0 957; 
GFX8-NEXT: v_ashrrev_i32_e32 v0, 23, v3 958; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xff800000, v0 959; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] 960; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 961; GFX8-NEXT: s_setpc_b64 s[30:31] 962; 963; GFX9-LABEL: v_saddsat_i24: 964; GFX9: ; %bb.0: 965; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 966; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 967; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 968; GFX9-NEXT: v_add_i32 v0, v0, v1 clamp 969; GFX9-NEXT: v_ashrrev_i32_e32 v0, 8, v0 970; GFX9-NEXT: s_setpc_b64 s[30:31] 971; 972; GFX10-LABEL: v_saddsat_i24: 973; GFX10: ; %bb.0: 974; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 975; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 976; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 977; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1 978; GFX10-NEXT: v_add_nc_i32 v0, v0, v1 clamp 979; GFX10-NEXT: v_ashrrev_i32_e32 v0, 8, v0 980; GFX10-NEXT: s_setpc_b64 s[30:31] 981 %result = call i24 @llvm.sadd.sat.i24(i24 %lhs, i24 %rhs) 982 ret i24 %result 983} 984 985define amdgpu_ps i24 @s_saddsat_i24(i24 inreg %lhs, i24 inreg %rhs) { 986; GFX6-LABEL: s_saddsat_i24: 987; GFX6: ; %bb.0: 988; GFX6-NEXT: s_lshl_b32 s0, s0, 8 989; GFX6-NEXT: s_min_i32 s3, s0, 0 990; GFX6-NEXT: s_lshl_b32 s1, s1, 8 991; GFX6-NEXT: s_max_i32 s2, s0, 0 992; GFX6-NEXT: s_sub_i32 s3, 0x80000000, s3 993; GFX6-NEXT: s_sub_i32 s2, 0x7fffffff, s2 994; GFX6-NEXT: s_max_i32 s1, s3, s1 995; GFX6-NEXT: s_min_i32 s1, s1, s2 996; GFX6-NEXT: s_add_i32 s0, s0, s1 997; GFX6-NEXT: s_ashr_i32 s0, s0, 8 998; GFX6-NEXT: ; return to shader part epilog 999; 1000; GFX8-LABEL: s_saddsat_i24: 1001; GFX8: ; %bb.0: 1002; GFX8-NEXT: s_add_i32 s2, s0, s1 1003; GFX8-NEXT: s_bfe_i32 s3, s2, 0x180000 1004; GFX8-NEXT: s_bfe_i32 s0, s0, 0x180000 1005; GFX8-NEXT: s_cmp_lt_i32 s3, s0 1006; GFX8-NEXT: s_cselect_b32 s0, 1, 0 1007; GFX8-NEXT: s_bfe_i32 s1, s1, 0x180000 1008; GFX8-NEXT: s_cmp_lt_i32 s1, 0 1009; GFX8-NEXT: s_cselect_b32 s1, 1, 0 1010; GFX8-NEXT: s_xor_b32 s0, s1, s0 
1011; GFX8-NEXT: s_ashr_i32 s1, s3, 23 1012; GFX8-NEXT: s_add_i32 s1, s1, 0xff800000 1013; GFX8-NEXT: s_and_b32 s0, s0, 1 1014; GFX8-NEXT: s_cmp_lg_u32 s0, 0 1015; GFX8-NEXT: s_cselect_b32 s0, s1, s2 1016; GFX8-NEXT: ; return to shader part epilog 1017; 1018; GFX9-LABEL: s_saddsat_i24: 1019; GFX9: ; %bb.0: 1020; GFX9-NEXT: s_lshl_b32 s1, s1, 8 1021; GFX9-NEXT: s_lshl_b32 s0, s0, 8 1022; GFX9-NEXT: v_mov_b32_e32 v0, s1 1023; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp 1024; GFX9-NEXT: v_ashrrev_i32_e32 v0, 8, v0 1025; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1026; GFX9-NEXT: ; return to shader part epilog 1027; 1028; GFX10-LABEL: s_saddsat_i24: 1029; GFX10: ; %bb.0: 1030; GFX10-NEXT: s_lshl_b32 s0, s0, 8 1031; GFX10-NEXT: s_lshl_b32 s1, s1, 8 1032; GFX10-NEXT: v_add_nc_i32 v0, s0, s1 clamp 1033; GFX10-NEXT: v_ashrrev_i32_e32 v0, 8, v0 1034; GFX10-NEXT: v_readfirstlane_b32 s0, v0 1035; GFX10-NEXT: ; return to shader part epilog 1036 %result = call i24 @llvm.sadd.sat.i24(i24 %lhs, i24 %rhs) 1037 ret i24 %result 1038} 1039 1040define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) { 1041; GFX6-LABEL: v_saddsat_i32: 1042; GFX6: ; %bb.0: 1043; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1044; GFX6-NEXT: v_min_i32_e32 v3, 0, v0 1045; GFX6-NEXT: v_max_i32_e32 v2, 0, v0 1046; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3 1047; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2 1048; GFX6-NEXT: v_max_i32_e32 v1, v3, v1 1049; GFX6-NEXT: v_min_i32_e32 v1, v1, v2 1050; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 1051; GFX6-NEXT: s_setpc_b64 s[30:31] 1052; 1053; GFX8-LABEL: v_saddsat_i32: 1054; GFX8: ; %bb.0: 1055; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1056; GFX8-NEXT: v_min_i32_e32 v3, 0, v0 1057; GFX8-NEXT: v_max_i32_e32 v2, 0, v0 1058; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 0x80000000, v3 1059; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 0x7fffffff, v2 1060; GFX8-NEXT: v_max_i32_e32 v1, v3, v1 1061; GFX8-NEXT: v_min_i32_e32 v1, v1, v2 1062; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 1063; 
GFX8-NEXT: s_setpc_b64 s[30:31] 1064; 1065; GFX9-LABEL: v_saddsat_i32: 1066; GFX9: ; %bb.0: 1067; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1068; GFX9-NEXT: v_add_i32 v0, v0, v1 clamp 1069; GFX9-NEXT: s_setpc_b64 s[30:31] 1070; 1071; GFX10-LABEL: v_saddsat_i32: 1072; GFX10: ; %bb.0: 1073; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1074; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1075; GFX10-NEXT: v_add_nc_i32 v0, v0, v1 clamp 1076; GFX10-NEXT: s_setpc_b64 s[30:31] 1077 %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs) 1078 ret i32 %result 1079} 1080; sadd.sat.i32 with both operands passed inreg (the checks below expect them in s0 and s1). 1081define amdgpu_ps i32 @s_saddsat_i32(i32 inreg %lhs, i32 inreg %rhs) { 1096; GFX6-LABEL: s_saddsat_i32: 1097; GFX6: ; %bb.0: 1098; GFX6-NEXT: s_min_i32 s3, s0, 0 1099; GFX6-NEXT: s_max_i32 s2, s0, 0 1100; GFX6-NEXT: s_sub_i32 s3, 0x80000000, s3 1101; GFX6-NEXT: s_sub_i32 s2, 0x7fffffff, s2 1102; GFX6-NEXT: s_max_i32 s1, s3, s1 1103; GFX6-NEXT: s_min_i32 s1, s1, s2 1104; GFX6-NEXT: s_add_i32 s0, s0, s1 1105; GFX6-NEXT: ; return to shader part epilog 1106; 1107; GFX8-LABEL: s_saddsat_i32: 1108; GFX8: ; %bb.0: 1109; GFX8-NEXT: s_min_i32 s3, s0, 0 1110; GFX8-NEXT: s_max_i32 s2, s0, 0 1111; GFX8-NEXT: s_sub_i32 s3, 0x80000000, s3 1112; GFX8-NEXT: s_sub_i32 s2, 0x7fffffff, s2 1113; GFX8-NEXT: s_max_i32 s1, s3, s1 1114; GFX8-NEXT: s_min_i32 s1, s1, s2 1115; GFX8-NEXT: s_add_i32 s0, s0, s1 1116; GFX8-NEXT: ; return to shader part epilog 1117; 1118; GFX9-LABEL: s_saddsat_i32: 1119; 
GFX9: ; %bb.0: 1120; GFX9-NEXT: v_mov_b32_e32 v0, s1 1121; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp 1122; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1123; GFX9-NEXT: ; return to shader part epilog 1124; 1125; GFX10-LABEL: s_saddsat_i32: 1126; GFX10: ; %bb.0: 1127; GFX10-NEXT: v_add_nc_i32 v0, s0, s1 clamp 1128; GFX10-NEXT: v_readfirstlane_b32 s0, v0 1129; GFX10-NEXT: ; return to shader part epilog 1130 %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs) 1131 ret i32 %result 1132} 1133 1134define amdgpu_ps float @saddsat_i32_sv(i32 inreg %lhs, i32 %rhs) { 1135; GFX6-LABEL: saddsat_i32_sv: 1136; GFX6: ; %bb.0: 1137; GFX6-NEXT: s_min_i32 s2, s0, 0 1138; GFX6-NEXT: s_max_i32 s1, s0, 0 1139; GFX6-NEXT: s_sub_i32 s2, 0x80000000, s2 1140; GFX6-NEXT: s_sub_i32 s1, 0x7fffffff, s1 1141; GFX6-NEXT: v_max_i32_e32 v0, s2, v0 1142; GFX6-NEXT: v_min_i32_e32 v0, s1, v0 1143; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 1144; GFX6-NEXT: ; return to shader part epilog 1145; 1146; GFX8-LABEL: saddsat_i32_sv: 1147; GFX8: ; %bb.0: 1148; GFX8-NEXT: s_min_i32 s2, s0, 0 1149; GFX8-NEXT: s_max_i32 s1, s0, 0 1150; GFX8-NEXT: s_sub_i32 s2, 0x80000000, s2 1151; GFX8-NEXT: s_sub_i32 s1, 0x7fffffff, s1 1152; GFX8-NEXT: v_max_i32_e32 v0, s2, v0 1153; GFX8-NEXT: v_min_i32_e32 v0, s1, v0 1154; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1155; GFX8-NEXT: ; return to shader part epilog 1156; 1157; GFX9-LABEL: saddsat_i32_sv: 1158; GFX9: ; %bb.0: 1159; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp 1160; GFX9-NEXT: ; return to shader part epilog 1161; 1162; GFX10-LABEL: saddsat_i32_sv: 1163; GFX10: ; %bb.0: 1164; GFX10-NEXT: v_add_nc_i32 v0, s0, v0 clamp 1165; GFX10-NEXT: ; return to shader part epilog 1166 %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs) 1167 %cast = bitcast i32 %result to float 1168 ret float %cast 1169} 1170 1171define amdgpu_ps float @saddsat_i32_vs(i32 %lhs, i32 inreg %rhs) { 1172; GFX6-LABEL: saddsat_i32_vs: 1173; GFX6: ; %bb.0: 1174; GFX6-NEXT: v_min_i32_e32 v2, 0, v0 1175; GFX6-NEXT: 
v_max_i32_e32 v1, 0, v0 1176; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x80000000, v2 1177; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0x7fffffff, v1 1178; GFX6-NEXT: v_max_i32_e32 v2, s0, v2 1179; GFX6-NEXT: v_min_i32_e32 v1, v2, v1 1180; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 1181; GFX6-NEXT: ; return to shader part epilog 1182; 1183; GFX8-LABEL: saddsat_i32_vs: 1184; GFX8: ; %bb.0: 1185; GFX8-NEXT: v_min_i32_e32 v2, 0, v0 1186; GFX8-NEXT: v_max_i32_e32 v1, 0, v0 1187; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 0x80000000, v2 1188; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 0x7fffffff, v1 1189; GFX8-NEXT: v_max_i32_e32 v2, s0, v2 1190; GFX8-NEXT: v_min_i32_e32 v1, v2, v1 1191; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 1192; GFX8-NEXT: ; return to shader part epilog 1193; 1194; GFX9-LABEL: saddsat_i32_vs: 1195; GFX9: ; %bb.0: 1196; GFX9-NEXT: v_add_i32 v0, v0, s0 clamp 1197; GFX9-NEXT: ; return to shader part epilog 1198; 1199; GFX10-LABEL: saddsat_i32_vs: 1200; GFX10: ; %bb.0: 1201; GFX10-NEXT: v_add_nc_i32 v0, v0, s0 clamp 1202; GFX10-NEXT: ; return to shader part epilog 1203 %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs) 1204 %cast = bitcast i32 %result to float 1205 ret float %cast 1206} 1207 1208define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { 1209; GFX6-LABEL: v_saddsat_v2i32: 1210; GFX6: ; %bb.0: 1211; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1212; GFX6-NEXT: s_brev_b32 s5, 1 1213; GFX6-NEXT: v_min_i32_e32 v5, 0, v0 1214; GFX6-NEXT: s_brev_b32 s4, -2 1215; GFX6-NEXT: v_max_i32_e32 v4, 0, v0 1216; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5 1217; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4 1218; GFX6-NEXT: v_max_i32_e32 v2, v5, v2 1219; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 1220; GFX6-NEXT: v_min_i32_e32 v4, 0, v1 1221; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 1222; GFX6-NEXT: v_max_i32_e32 v2, 0, v1 1223; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v4 1224; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s4, v2 1225; GFX6-NEXT: v_max_i32_e32 v3, v4, v3 1226; GFX6-NEXT: 
v_min_i32_e32 v2, v3, v2 1227; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 1228; GFX6-NEXT: s_setpc_b64 s[30:31] 1229; 1230; GFX8-LABEL: v_saddsat_v2i32: 1231; GFX8: ; %bb.0: 1232; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1233; GFX8-NEXT: s_brev_b32 s5, 1 1234; GFX8-NEXT: v_min_i32_e32 v5, 0, v0 1235; GFX8-NEXT: s_brev_b32 s4, -2 1236; GFX8-NEXT: v_max_i32_e32 v4, 0, v0 1237; GFX8-NEXT: v_sub_u32_e32 v5, vcc, s5, v5 1238; GFX8-NEXT: v_sub_u32_e32 v4, vcc, s4, v4 1239; GFX8-NEXT: v_max_i32_e32 v2, v5, v2 1240; GFX8-NEXT: v_min_i32_e32 v2, v2, v4 1241; GFX8-NEXT: v_min_i32_e32 v4, 0, v1 1242; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 1243; GFX8-NEXT: v_max_i32_e32 v2, 0, v1 1244; GFX8-NEXT: v_sub_u32_e32 v4, vcc, s5, v4 1245; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s4, v2 1246; GFX8-NEXT: v_max_i32_e32 v3, v4, v3 1247; GFX8-NEXT: v_min_i32_e32 v2, v3, v2 1248; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 1249; GFX8-NEXT: s_setpc_b64 s[30:31] 1250; 1251; GFX9-LABEL: v_saddsat_v2i32: 1252; GFX9: ; %bb.0: 1253; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1254; GFX9-NEXT: v_add_i32 v0, v0, v2 clamp 1255; GFX9-NEXT: v_add_i32 v1, v1, v3 clamp 1256; GFX9-NEXT: s_setpc_b64 s[30:31] 1257; 1258; GFX10-LABEL: v_saddsat_v2i32: 1259; GFX10: ; %bb.0: 1260; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1261; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1262; GFX10-NEXT: v_add_nc_i32 v0, v0, v2 clamp 1263; GFX10-NEXT: v_add_nc_i32 v1, v1, v3 clamp 1264; GFX10-NEXT: s_setpc_b64 s[30:31] 1265 %result = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) 1266 ret <2 x i32> %result 1267} 1268 1269define amdgpu_ps <2 x i32> @s_saddsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inreg %rhs) { 1270; GFX6-LABEL: s_saddsat_v2i32: 1271; GFX6: ; %bb.0: 1272; GFX6-NEXT: s_brev_b32 s5, 1 1273; GFX6-NEXT: s_min_i32 s7, s0, 0 1274; GFX6-NEXT: s_brev_b32 s4, -2 1275; GFX6-NEXT: s_max_i32 s6, s0, 0 1276; GFX6-NEXT: s_sub_i32 s7, s5, s7 1277; GFX6-NEXT: s_sub_i32 s6, s4, s6 1278; 
GFX6-NEXT: s_max_i32 s2, s7, s2 1279; GFX6-NEXT: s_min_i32 s2, s2, s6 1280; GFX6-NEXT: s_add_i32 s0, s0, s2 1281; GFX6-NEXT: s_max_i32 s2, s1, 0 1282; GFX6-NEXT: s_sub_i32 s2, s4, s2 1283; GFX6-NEXT: s_min_i32 s4, s1, 0 1284; GFX6-NEXT: s_sub_i32 s4, s5, s4 1285; GFX6-NEXT: s_max_i32 s3, s4, s3 1286; GFX6-NEXT: s_min_i32 s2, s3, s2 1287; GFX6-NEXT: s_add_i32 s1, s1, s2 1288; GFX6-NEXT: ; return to shader part epilog 1289; 1290; GFX8-LABEL: s_saddsat_v2i32: 1291; GFX8: ; %bb.0: 1292; GFX8-NEXT: s_brev_b32 s5, 1 1293; GFX8-NEXT: s_min_i32 s7, s0, 0 1294; GFX8-NEXT: s_brev_b32 s4, -2 1295; GFX8-NEXT: s_max_i32 s6, s0, 0 1296; GFX8-NEXT: s_sub_i32 s7, s5, s7 1297; GFX8-NEXT: s_sub_i32 s6, s4, s6 1298; GFX8-NEXT: s_max_i32 s2, s7, s2 1299; GFX8-NEXT: s_min_i32 s2, s2, s6 1300; GFX8-NEXT: s_add_i32 s0, s0, s2 1301; GFX8-NEXT: s_max_i32 s2, s1, 0 1302; GFX8-NEXT: s_sub_i32 s2, s4, s2 1303; GFX8-NEXT: s_min_i32 s4, s1, 0 1304; GFX8-NEXT: s_sub_i32 s4, s5, s4 1305; GFX8-NEXT: s_max_i32 s3, s4, s3 1306; GFX8-NEXT: s_min_i32 s2, s3, s2 1307; GFX8-NEXT: s_add_i32 s1, s1, s2 1308; GFX8-NEXT: ; return to shader part epilog 1309; 1310; GFX9-LABEL: s_saddsat_v2i32: 1311; GFX9: ; %bb.0: 1312; GFX9-NEXT: v_mov_b32_e32 v0, s2 1313; GFX9-NEXT: v_mov_b32_e32 v1, s3 1314; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp 1315; GFX9-NEXT: v_add_i32 v1, s1, v1 clamp 1316; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1317; GFX9-NEXT: v_readfirstlane_b32 s1, v1 1318; GFX9-NEXT: ; return to shader part epilog 1319; 1320; GFX10-LABEL: s_saddsat_v2i32: 1321; GFX10: ; %bb.0: 1322; GFX10-NEXT: v_add_nc_i32 v0, s0, s2 clamp 1323; GFX10-NEXT: v_add_nc_i32 v1, s1, s3 clamp 1324; GFX10-NEXT: v_readfirstlane_b32 s0, v0 1325; GFX10-NEXT: v_readfirstlane_b32 s1, v1 1326; GFX10-NEXT: ; return to shader part epilog 1327 %result = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) 1328 ret <2 x i32> %result 1329} 1330 1331define <3 x i32> @v_saddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { 1332; GFX6-LABEL: 
v_saddsat_v3i32: 1333; GFX6: ; %bb.0: 1334; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1335; GFX6-NEXT: s_brev_b32 s5, 1 1336; GFX6-NEXT: v_min_i32_e32 v7, 0, v0 1337; GFX6-NEXT: s_brev_b32 s4, -2 1338; GFX6-NEXT: v_max_i32_e32 v6, 0, v0 1339; GFX6-NEXT: v_sub_i32_e32 v7, vcc, s5, v7 1340; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s4, v6 1341; GFX6-NEXT: v_max_i32_e32 v3, v7, v3 1342; GFX6-NEXT: v_min_i32_e32 v3, v3, v6 1343; GFX6-NEXT: v_min_i32_e32 v6, 0, v1 1344; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3 1345; GFX6-NEXT: v_max_i32_e32 v3, 0, v1 1346; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s5, v6 1347; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 1348; GFX6-NEXT: v_max_i32_e32 v4, v6, v4 1349; GFX6-NEXT: v_min_i32_e32 v3, v4, v3 1350; GFX6-NEXT: v_min_i32_e32 v4, 0, v2 1351; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 1352; GFX6-NEXT: v_max_i32_e32 v3, 0, v2 1353; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v4 1354; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 1355; GFX6-NEXT: v_max_i32_e32 v4, v4, v5 1356; GFX6-NEXT: v_min_i32_e32 v3, v4, v3 1357; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 1358; GFX6-NEXT: s_setpc_b64 s[30:31] 1359; 1360; GFX8-LABEL: v_saddsat_v3i32: 1361; GFX8: ; %bb.0: 1362; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1363; GFX8-NEXT: s_brev_b32 s5, 1 1364; GFX8-NEXT: v_min_i32_e32 v7, 0, v0 1365; GFX8-NEXT: s_brev_b32 s4, -2 1366; GFX8-NEXT: v_max_i32_e32 v6, 0, v0 1367; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s5, v7 1368; GFX8-NEXT: v_sub_u32_e32 v6, vcc, s4, v6 1369; GFX8-NEXT: v_max_i32_e32 v3, v7, v3 1370; GFX8-NEXT: v_min_i32_e32 v3, v3, v6 1371; GFX8-NEXT: v_min_i32_e32 v6, 0, v1 1372; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 1373; GFX8-NEXT: v_max_i32_e32 v3, 0, v1 1374; GFX8-NEXT: v_sub_u32_e32 v6, vcc, s5, v6 1375; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 1376; GFX8-NEXT: v_max_i32_e32 v4, v6, v4 1377; GFX8-NEXT: v_min_i32_e32 v3, v4, v3 1378; GFX8-NEXT: v_min_i32_e32 v4, 0, v2 1379; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 1380; GFX8-NEXT: v_max_i32_e32 
v3, 0, v2 1381; GFX8-NEXT: v_sub_u32_e32 v4, vcc, s5, v4 1382; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 1383; GFX8-NEXT: v_max_i32_e32 v4, v4, v5 1384; GFX8-NEXT: v_min_i32_e32 v3, v4, v3 1385; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 1386; GFX8-NEXT: s_setpc_b64 s[30:31] 1387; 1388; GFX9-LABEL: v_saddsat_v3i32: 1389; GFX9: ; %bb.0: 1390; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1391; GFX9-NEXT: v_add_i32 v0, v0, v3 clamp 1392; GFX9-NEXT: v_add_i32 v1, v1, v4 clamp 1393; GFX9-NEXT: v_add_i32 v2, v2, v5 clamp 1394; GFX9-NEXT: s_setpc_b64 s[30:31] 1395; 1396; GFX10-LABEL: v_saddsat_v3i32: 1397; GFX10: ; %bb.0: 1398; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1399; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1400; GFX10-NEXT: v_add_nc_i32 v0, v0, v3 clamp 1401; GFX10-NEXT: v_add_nc_i32 v1, v1, v4 clamp 1402; GFX10-NEXT: v_add_nc_i32 v2, v2, v5 clamp 1403; GFX10-NEXT: s_setpc_b64 s[30:31] 1404 %result = call <3 x i32> @llvm.sadd.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs) 1405 ret <3 x i32> %result 1406} 1407 1408define amdgpu_ps <3 x i32> @s_saddsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inreg %rhs) { 1409; GFX6-LABEL: s_saddsat_v3i32: 1410; GFX6: ; %bb.0: 1411; GFX6-NEXT: s_brev_b32 s7, 1 1412; GFX6-NEXT: s_min_i32 s9, s0, 0 1413; GFX6-NEXT: s_brev_b32 s6, -2 1414; GFX6-NEXT: s_max_i32 s8, s0, 0 1415; GFX6-NEXT: s_sub_i32 s9, s7, s9 1416; GFX6-NEXT: s_sub_i32 s8, s6, s8 1417; GFX6-NEXT: s_max_i32 s3, s9, s3 1418; GFX6-NEXT: s_min_i32 s3, s3, s8 1419; GFX6-NEXT: s_min_i32 s8, s1, 0 1420; GFX6-NEXT: s_add_i32 s0, s0, s3 1421; GFX6-NEXT: s_max_i32 s3, s1, 0 1422; GFX6-NEXT: s_sub_i32 s8, s7, s8 1423; GFX6-NEXT: s_sub_i32 s3, s6, s3 1424; GFX6-NEXT: s_max_i32 s4, s8, s4 1425; GFX6-NEXT: s_min_i32 s3, s4, s3 1426; GFX6-NEXT: s_min_i32 s4, s2, 0 1427; GFX6-NEXT: s_add_i32 s1, s1, s3 1428; GFX6-NEXT: s_max_i32 s3, s2, 0 1429; GFX6-NEXT: s_sub_i32 s4, s7, s4 1430; GFX6-NEXT: s_sub_i32 s3, s6, s3 1431; GFX6-NEXT: s_max_i32 s4, s4, s5 1432; GFX6-NEXT: s_min_i32 s3, 
s4, s3 1433; GFX6-NEXT: s_add_i32 s2, s2, s3 1434; GFX6-NEXT: ; return to shader part epilog 1435; 1436; GFX8-LABEL: s_saddsat_v3i32: 1437; GFX8: ; %bb.0: 1438; GFX8-NEXT: s_brev_b32 s7, 1 1439; GFX8-NEXT: s_min_i32 s9, s0, 0 1440; GFX8-NEXT: s_brev_b32 s6, -2 1441; GFX8-NEXT: s_max_i32 s8, s0, 0 1442; GFX8-NEXT: s_sub_i32 s9, s7, s9 1443; GFX8-NEXT: s_sub_i32 s8, s6, s8 1444; GFX8-NEXT: s_max_i32 s3, s9, s3 1445; GFX8-NEXT: s_min_i32 s3, s3, s8 1446; GFX8-NEXT: s_min_i32 s8, s1, 0 1447; GFX8-NEXT: s_add_i32 s0, s0, s3 1448; GFX8-NEXT: s_max_i32 s3, s1, 0 1449; GFX8-NEXT: s_sub_i32 s8, s7, s8 1450; GFX8-NEXT: s_sub_i32 s3, s6, s3 1451; GFX8-NEXT: s_max_i32 s4, s8, s4 1452; GFX8-NEXT: s_min_i32 s3, s4, s3 1453; GFX8-NEXT: s_min_i32 s4, s2, 0 1454; GFX8-NEXT: s_add_i32 s1, s1, s3 1455; GFX8-NEXT: s_max_i32 s3, s2, 0 1456; GFX8-NEXT: s_sub_i32 s4, s7, s4 1457; GFX8-NEXT: s_sub_i32 s3, s6, s3 1458; GFX8-NEXT: s_max_i32 s4, s4, s5 1459; GFX8-NEXT: s_min_i32 s3, s4, s3 1460; GFX8-NEXT: s_add_i32 s2, s2, s3 1461; GFX8-NEXT: ; return to shader part epilog 1462; 1463; GFX9-LABEL: s_saddsat_v3i32: 1464; GFX9: ; %bb.0: 1465; GFX9-NEXT: v_mov_b32_e32 v0, s3 1466; GFX9-NEXT: v_mov_b32_e32 v1, s4 1467; GFX9-NEXT: v_mov_b32_e32 v2, s5 1468; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp 1469; GFX9-NEXT: v_add_i32 v1, s1, v1 clamp 1470; GFX9-NEXT: v_add_i32 v2, s2, v2 clamp 1471; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1472; GFX9-NEXT: v_readfirstlane_b32 s1, v1 1473; GFX9-NEXT: v_readfirstlane_b32 s2, v2 1474; GFX9-NEXT: ; return to shader part epilog 1475; 1476; GFX10-LABEL: s_saddsat_v3i32: 1477; GFX10: ; %bb.0: 1478; GFX10-NEXT: v_add_nc_i32 v0, s0, s3 clamp 1479; GFX10-NEXT: v_add_nc_i32 v1, s1, s4 clamp 1480; GFX10-NEXT: v_add_nc_i32 v2, s2, s5 clamp 1481; GFX10-NEXT: v_readfirstlane_b32 s0, v0 1482; GFX10-NEXT: v_readfirstlane_b32 s1, v1 1483; GFX10-NEXT: v_readfirstlane_b32 s2, v2 1484; GFX10-NEXT: ; return to shader part epilog 1485 %result = call <3 x i32> @llvm.sadd.sat.v3i32(<3 x 
i32> %lhs, <3 x i32> %rhs) 1486 ret <3 x i32> %result 1487} 1488 1489define <4 x i32> @v_saddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { 1490; GFX6-LABEL: v_saddsat_v4i32: 1491; GFX6: ; %bb.0: 1492; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1493; GFX6-NEXT: s_brev_b32 s5, 1 1494; GFX6-NEXT: v_min_i32_e32 v9, 0, v0 1495; GFX6-NEXT: s_brev_b32 s4, -2 1496; GFX6-NEXT: v_max_i32_e32 v8, 0, v0 1497; GFX6-NEXT: v_sub_i32_e32 v9, vcc, s5, v9 1498; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s4, v8 1499; GFX6-NEXT: v_max_i32_e32 v4, v9, v4 1500; GFX6-NEXT: v_min_i32_e32 v4, v4, v8 1501; GFX6-NEXT: v_min_i32_e32 v8, 0, v1 1502; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 1503; GFX6-NEXT: v_max_i32_e32 v4, 0, v1 1504; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s5, v8 1505; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4 1506; GFX6-NEXT: v_max_i32_e32 v5, v8, v5 1507; GFX6-NEXT: v_min_i32_e32 v4, v5, v4 1508; GFX6-NEXT: v_min_i32_e32 v5, 0, v2 1509; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 1510; GFX6-NEXT: v_max_i32_e32 v4, 0, v2 1511; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5 1512; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4 1513; GFX6-NEXT: v_max_i32_e32 v5, v5, v6 1514; GFX6-NEXT: v_min_i32_e32 v4, v5, v4 1515; GFX6-NEXT: v_min_i32_e32 v5, 0, v3 1516; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 1517; GFX6-NEXT: v_max_i32_e32 v4, 0, v3 1518; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0x80000000, v5 1519; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0x7fffffff, v4 1520; GFX6-NEXT: v_max_i32_e32 v5, v5, v7 1521; GFX6-NEXT: v_min_i32_e32 v4, v5, v4 1522; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 1523; GFX6-NEXT: s_setpc_b64 s[30:31] 1524; 1525; GFX8-LABEL: v_saddsat_v4i32: 1526; GFX8: ; %bb.0: 1527; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1528; GFX8-NEXT: s_brev_b32 s5, 1 1529; GFX8-NEXT: v_min_i32_e32 v9, 0, v0 1530; GFX8-NEXT: s_brev_b32 s4, -2 1531; GFX8-NEXT: v_max_i32_e32 v8, 0, v0 1532; GFX8-NEXT: v_sub_u32_e32 v9, vcc, s5, v9 1533; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s4, v8 1534; GFX8-NEXT: v_max_i32_e32 v4, 
v9, v4 1535; GFX8-NEXT: v_min_i32_e32 v4, v4, v8 1536; GFX8-NEXT: v_min_i32_e32 v8, 0, v1 1537; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4 1538; GFX8-NEXT: v_max_i32_e32 v4, 0, v1 1539; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s5, v8 1540; GFX8-NEXT: v_sub_u32_e32 v4, vcc, s4, v4 1541; GFX8-NEXT: v_max_i32_e32 v5, v8, v5 1542; GFX8-NEXT: v_min_i32_e32 v4, v5, v4 1543; GFX8-NEXT: v_min_i32_e32 v5, 0, v2 1544; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v4 1545; GFX8-NEXT: v_max_i32_e32 v4, 0, v2 1546; GFX8-NEXT: v_sub_u32_e32 v5, vcc, s5, v5 1547; GFX8-NEXT: v_sub_u32_e32 v4, vcc, s4, v4 1548; GFX8-NEXT: v_max_i32_e32 v5, v5, v6 1549; GFX8-NEXT: v_min_i32_e32 v4, v5, v4 1550; GFX8-NEXT: v_min_i32_e32 v5, 0, v3 1551; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 1552; GFX8-NEXT: v_max_i32_e32 v4, 0, v3 1553; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 0x80000000, v5 1554; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0x7fffffff, v4 1555; GFX8-NEXT: v_max_i32_e32 v5, v5, v7 1556; GFX8-NEXT: v_min_i32_e32 v4, v5, v4 1557; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 1558; GFX8-NEXT: s_setpc_b64 s[30:31] 1559; 1560; GFX9-LABEL: v_saddsat_v4i32: 1561; GFX9: ; %bb.0: 1562; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1563; GFX9-NEXT: v_add_i32 v0, v0, v4 clamp 1564; GFX9-NEXT: v_add_i32 v1, v1, v5 clamp 1565; GFX9-NEXT: v_add_i32 v2, v2, v6 clamp 1566; GFX9-NEXT: v_add_i32 v3, v3, v7 clamp 1567; GFX9-NEXT: s_setpc_b64 s[30:31] 1568; 1569; GFX10-LABEL: v_saddsat_v4i32: 1570; GFX10: ; %bb.0: 1571; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1572; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1573; GFX10-NEXT: v_add_nc_i32 v0, v0, v4 clamp 1574; GFX10-NEXT: v_add_nc_i32 v1, v1, v5 clamp 1575; GFX10-NEXT: v_add_nc_i32 v2, v2, v6 clamp 1576; GFX10-NEXT: v_add_nc_i32 v3, v3, v7 clamp 1577; GFX10-NEXT: s_setpc_b64 s[30:31] 1578 %result = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) 1579 ret <4 x i32> %result 1580} 1581 1582define amdgpu_ps <4 x i32> @s_saddsat_v4i32(<4 x i32> inreg %lhs, <4 x 
i32> inreg %rhs) { 1583; GFX6-LABEL: s_saddsat_v4i32: 1584; GFX6: ; %bb.0: 1585; GFX6-NEXT: s_brev_b32 s9, 1 1586; GFX6-NEXT: s_min_i32 s11, s0, 0 1587; GFX6-NEXT: s_brev_b32 s8, -2 1588; GFX6-NEXT: s_max_i32 s10, s0, 0 1589; GFX6-NEXT: s_sub_i32 s11, s9, s11 1590; GFX6-NEXT: s_sub_i32 s10, s8, s10 1591; GFX6-NEXT: s_max_i32 s4, s11, s4 1592; GFX6-NEXT: s_min_i32 s4, s4, s10 1593; GFX6-NEXT: s_min_i32 s10, s1, 0 1594; GFX6-NEXT: s_add_i32 s0, s0, s4 1595; GFX6-NEXT: s_max_i32 s4, s1, 0 1596; GFX6-NEXT: s_sub_i32 s10, s9, s10 1597; GFX6-NEXT: s_sub_i32 s4, s8, s4 1598; GFX6-NEXT: s_max_i32 s5, s10, s5 1599; GFX6-NEXT: s_min_i32 s4, s5, s4 1600; GFX6-NEXT: s_min_i32 s5, s2, 0 1601; GFX6-NEXT: s_add_i32 s1, s1, s4 1602; GFX6-NEXT: s_max_i32 s4, s2, 0 1603; GFX6-NEXT: s_sub_i32 s5, s9, s5 1604; GFX6-NEXT: s_sub_i32 s4, s8, s4 1605; GFX6-NEXT: s_max_i32 s5, s5, s6 1606; GFX6-NEXT: s_min_i32 s4, s5, s4 1607; GFX6-NEXT: s_min_i32 s5, s3, 0 1608; GFX6-NEXT: s_add_i32 s2, s2, s4 1609; GFX6-NEXT: s_max_i32 s4, s3, 0 1610; GFX6-NEXT: s_sub_i32 s5, s9, s5 1611; GFX6-NEXT: s_sub_i32 s4, s8, s4 1612; GFX6-NEXT: s_max_i32 s5, s5, s7 1613; GFX6-NEXT: s_min_i32 s4, s5, s4 1614; GFX6-NEXT: s_add_i32 s3, s3, s4 1615; GFX6-NEXT: ; return to shader part epilog 1616; 1617; GFX8-LABEL: s_saddsat_v4i32: 1618; GFX8: ; %bb.0: 1619; GFX8-NEXT: s_brev_b32 s9, 1 1620; GFX8-NEXT: s_min_i32 s11, s0, 0 1621; GFX8-NEXT: s_brev_b32 s8, -2 1622; GFX8-NEXT: s_max_i32 s10, s0, 0 1623; GFX8-NEXT: s_sub_i32 s11, s9, s11 1624; GFX8-NEXT: s_sub_i32 s10, s8, s10 1625; GFX8-NEXT: s_max_i32 s4, s11, s4 1626; GFX8-NEXT: s_min_i32 s4, s4, s10 1627; GFX8-NEXT: s_min_i32 s10, s1, 0 1628; GFX8-NEXT: s_add_i32 s0, s0, s4 1629; GFX8-NEXT: s_max_i32 s4, s1, 0 1630; GFX8-NEXT: s_sub_i32 s10, s9, s10 1631; GFX8-NEXT: s_sub_i32 s4, s8, s4 1632; GFX8-NEXT: s_max_i32 s5, s10, s5 1633; GFX8-NEXT: s_min_i32 s4, s5, s4 1634; GFX8-NEXT: s_min_i32 s5, s2, 0 1635; GFX8-NEXT: s_add_i32 s1, s1, s4 1636; GFX8-NEXT: s_max_i32 s4, 
s2, 0 1637; GFX8-NEXT: s_sub_i32 s5, s9, s5 1638; GFX8-NEXT: s_sub_i32 s4, s8, s4 1639; GFX8-NEXT: s_max_i32 s5, s5, s6 1640; GFX8-NEXT: s_min_i32 s4, s5, s4 1641; GFX8-NEXT: s_min_i32 s5, s3, 0 1642; GFX8-NEXT: s_add_i32 s2, s2, s4 1643; GFX8-NEXT: s_max_i32 s4, s3, 0 1644; GFX8-NEXT: s_sub_i32 s5, s9, s5 1645; GFX8-NEXT: s_sub_i32 s4, s8, s4 1646; GFX8-NEXT: s_max_i32 s5, s5, s7 1647; GFX8-NEXT: s_min_i32 s4, s5, s4 1648; GFX8-NEXT: s_add_i32 s3, s3, s4 1649; GFX8-NEXT: ; return to shader part epilog 1650; 1651; GFX9-LABEL: s_saddsat_v4i32: 1652; GFX9: ; %bb.0: 1653; GFX9-NEXT: v_mov_b32_e32 v0, s4 1654; GFX9-NEXT: v_mov_b32_e32 v1, s5 1655; GFX9-NEXT: v_mov_b32_e32 v2, s6 1656; GFX9-NEXT: v_mov_b32_e32 v3, s7 1657; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp 1658; GFX9-NEXT: v_add_i32 v1, s1, v1 clamp 1659; GFX9-NEXT: v_add_i32 v2, s2, v2 clamp 1660; GFX9-NEXT: v_add_i32 v3, s3, v3 clamp 1661; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1662; GFX9-NEXT: v_readfirstlane_b32 s1, v1 1663; GFX9-NEXT: v_readfirstlane_b32 s2, v2 1664; GFX9-NEXT: v_readfirstlane_b32 s3, v3 1665; GFX9-NEXT: ; return to shader part epilog 1666; 1667; GFX10-LABEL: s_saddsat_v4i32: 1668; GFX10: ; %bb.0: 1669; GFX10-NEXT: v_add_nc_i32 v0, s0, s4 clamp 1670; GFX10-NEXT: v_add_nc_i32 v1, s1, s5 clamp 1671; GFX10-NEXT: v_add_nc_i32 v2, s2, s6 clamp 1672; GFX10-NEXT: v_add_nc_i32 v3, s3, s7 clamp 1673; GFX10-NEXT: v_readfirstlane_b32 s0, v0 1674; GFX10-NEXT: v_readfirstlane_b32 s1, v1 1675; GFX10-NEXT: v_readfirstlane_b32 s2, v2 1676; GFX10-NEXT: v_readfirstlane_b32 s3, v3 1677; GFX10-NEXT: ; return to shader part epilog 1678 %result = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) 1679 ret <4 x i32> %result 1680} 1681 1682define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { 1683; GFX6-LABEL: v_saddsat_v5i32: 1684; GFX6: ; %bb.0: 1685; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1686; GFX6-NEXT: s_brev_b32 s5, 1 1687; GFX6-NEXT: v_min_i32_e32 v12, 0, v0 1688; 
GFX6-NEXT: s_brev_b32 s4, -2 1689; GFX6-NEXT: v_max_i32_e32 v10, 0, v0 1690; GFX6-NEXT: v_sub_i32_e32 v12, vcc, s5, v12 1691; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s4, v10 1692; GFX6-NEXT: v_max_i32_e32 v5, v12, v5 1693; GFX6-NEXT: v_min_i32_e32 v5, v5, v10 1694; GFX6-NEXT: v_min_i32_e32 v10, 0, v1 1695; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v5 1696; GFX6-NEXT: v_max_i32_e32 v5, 0, v1 1697; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s5, v10 1698; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v5 1699; GFX6-NEXT: v_max_i32_e32 v6, v10, v6 1700; GFX6-NEXT: v_min_i32_e32 v5, v6, v5 1701; GFX6-NEXT: v_min_i32_e32 v6, 0, v2 1702; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5 1703; GFX6-NEXT: v_max_i32_e32 v5, 0, v2 1704; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s5, v6 1705; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v5 1706; GFX6-NEXT: v_max_i32_e32 v6, v6, v7 1707; GFX6-NEXT: v_bfrev_b32_e32 v13, 1 1708; GFX6-NEXT: v_min_i32_e32 v5, v6, v5 1709; GFX6-NEXT: v_min_i32_e32 v6, 0, v3 1710; GFX6-NEXT: v_bfrev_b32_e32 v11, -2 1711; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 1712; GFX6-NEXT: v_max_i32_e32 v5, 0, v3 1713; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v13, v6 1714; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v11, v5 1715; GFX6-NEXT: v_max_i32_e32 v6, v6, v8 1716; GFX6-NEXT: v_min_i32_e32 v5, v6, v5 1717; GFX6-NEXT: v_min_i32_e32 v6, 0, v4 1718; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 1719; GFX6-NEXT: v_max_i32_e32 v5, 0, v4 1720; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v13, v6 1721; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v11, v5 1722; GFX6-NEXT: v_max_i32_e32 v6, v6, v9 1723; GFX6-NEXT: v_min_i32_e32 v5, v6, v5 1724; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 1725; GFX6-NEXT: s_setpc_b64 s[30:31] 1726; 1727; GFX8-LABEL: v_saddsat_v5i32: 1728; GFX8: ; %bb.0: 1729; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1730; GFX8-NEXT: s_brev_b32 s5, 1 1731; GFX8-NEXT: v_min_i32_e32 v12, 0, v0 1732; GFX8-NEXT: s_brev_b32 s4, -2 1733; GFX8-NEXT: v_max_i32_e32 v10, 0, v0 1734; GFX8-NEXT: v_sub_u32_e32 v12, vcc, s5, v12 1735; GFX8-NEXT: 
v_sub_u32_e32 v10, vcc, s4, v10 1736; GFX8-NEXT: v_max_i32_e32 v5, v12, v5 1737; GFX8-NEXT: v_min_i32_e32 v5, v5, v10 1738; GFX8-NEXT: v_min_i32_e32 v10, 0, v1 1739; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 1740; GFX8-NEXT: v_max_i32_e32 v5, 0, v1 1741; GFX8-NEXT: v_sub_u32_e32 v10, vcc, s5, v10 1742; GFX8-NEXT: v_sub_u32_e32 v5, vcc, s4, v5 1743; GFX8-NEXT: v_max_i32_e32 v6, v10, v6 1744; GFX8-NEXT: v_min_i32_e32 v5, v6, v5 1745; GFX8-NEXT: v_min_i32_e32 v6, 0, v2 1746; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v5 1747; GFX8-NEXT: v_max_i32_e32 v5, 0, v2 1748; GFX8-NEXT: v_sub_u32_e32 v6, vcc, s5, v6 1749; GFX8-NEXT: v_sub_u32_e32 v5, vcc, s4, v5 1750; GFX8-NEXT: v_max_i32_e32 v6, v6, v7 1751; GFX8-NEXT: v_bfrev_b32_e32 v13, 1 1752; GFX8-NEXT: v_min_i32_e32 v5, v6, v5 1753; GFX8-NEXT: v_min_i32_e32 v6, 0, v3 1754; GFX8-NEXT: v_bfrev_b32_e32 v11, -2 1755; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 1756; GFX8-NEXT: v_max_i32_e32 v5, 0, v3 1757; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v13, v6 1758; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v11, v5 1759; GFX8-NEXT: v_max_i32_e32 v6, v6, v8 1760; GFX8-NEXT: v_min_i32_e32 v5, v6, v5 1761; GFX8-NEXT: v_min_i32_e32 v6, 0, v4 1762; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 1763; GFX8-NEXT: v_max_i32_e32 v5, 0, v4 1764; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v13, v6 1765; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v11, v5 1766; GFX8-NEXT: v_max_i32_e32 v6, v6, v9 1767; GFX8-NEXT: v_min_i32_e32 v5, v6, v5 1768; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 1769; GFX8-NEXT: s_setpc_b64 s[30:31] 1770; 1771; GFX9-LABEL: v_saddsat_v5i32: 1772; GFX9: ; %bb.0: 1773; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1774; GFX9-NEXT: v_add_i32 v0, v0, v5 clamp 1775; GFX9-NEXT: v_add_i32 v1, v1, v6 clamp 1776; GFX9-NEXT: v_add_i32 v2, v2, v7 clamp 1777; GFX9-NEXT: v_add_i32 v3, v3, v8 clamp 1778; GFX9-NEXT: v_add_i32 v4, v4, v9 clamp 1779; GFX9-NEXT: s_setpc_b64 s[30:31] 1780; 1781; GFX10-LABEL: v_saddsat_v5i32: 1782; GFX10: ; %bb.0: 1783; GFX10-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 1784; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1785; GFX10-NEXT: v_add_nc_i32 v0, v0, v5 clamp 1786; GFX10-NEXT: v_add_nc_i32 v1, v1, v6 clamp 1787; GFX10-NEXT: v_add_nc_i32 v2, v2, v7 clamp 1788; GFX10-NEXT: v_add_nc_i32 v3, v3, v8 clamp 1789; GFX10-NEXT: v_add_nc_i32 v4, v4, v9 clamp 1790; GFX10-NEXT: s_setpc_b64 s[30:31] 1791 %result = call <5 x i32> @llvm.sadd.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs) 1792 ret <5 x i32> %result 1793} 1794 1795define amdgpu_ps <5 x i32> @s_saddsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inreg %rhs) { 1796; GFX6-LABEL: s_saddsat_v5i32: 1797; GFX6: ; %bb.0: 1798; GFX6-NEXT: s_brev_b32 s11, 1 1799; GFX6-NEXT: s_min_i32 s13, s0, 0 1800; GFX6-NEXT: s_brev_b32 s10, -2 1801; GFX6-NEXT: s_max_i32 s12, s0, 0 1802; GFX6-NEXT: s_sub_i32 s13, s11, s13 1803; GFX6-NEXT: s_sub_i32 s12, s10, s12 1804; GFX6-NEXT: s_max_i32 s5, s13, s5 1805; GFX6-NEXT: s_min_i32 s5, s5, s12 1806; GFX6-NEXT: s_min_i32 s12, s1, 0 1807; GFX6-NEXT: s_add_i32 s0, s0, s5 1808; GFX6-NEXT: s_max_i32 s5, s1, 0 1809; GFX6-NEXT: s_sub_i32 s12, s11, s12 1810; GFX6-NEXT: s_sub_i32 s5, s10, s5 1811; GFX6-NEXT: s_max_i32 s6, s12, s6 1812; GFX6-NEXT: s_min_i32 s5, s6, s5 1813; GFX6-NEXT: s_min_i32 s6, s2, 0 1814; GFX6-NEXT: s_add_i32 s1, s1, s5 1815; GFX6-NEXT: s_max_i32 s5, s2, 0 1816; GFX6-NEXT: s_sub_i32 s6, s11, s6 1817; GFX6-NEXT: s_sub_i32 s5, s10, s5 1818; GFX6-NEXT: s_max_i32 s6, s6, s7 1819; GFX6-NEXT: s_min_i32 s5, s6, s5 1820; GFX6-NEXT: s_min_i32 s6, s3, 0 1821; GFX6-NEXT: s_add_i32 s2, s2, s5 1822; GFX6-NEXT: s_max_i32 s5, s3, 0 1823; GFX6-NEXT: s_sub_i32 s6, s11, s6 1824; GFX6-NEXT: s_sub_i32 s5, s10, s5 1825; GFX6-NEXT: s_max_i32 s6, s6, s8 1826; GFX6-NEXT: s_min_i32 s5, s6, s5 1827; GFX6-NEXT: s_min_i32 s6, s4, 0 1828; GFX6-NEXT: s_add_i32 s3, s3, s5 1829; GFX6-NEXT: s_max_i32 s5, s4, 0 1830; GFX6-NEXT: s_sub_i32 s6, s11, s6 1831; GFX6-NEXT: s_sub_i32 s5, s10, s5 1832; GFX6-NEXT: s_max_i32 s6, s6, s9 1833; GFX6-NEXT: s_min_i32 s5, s6, s5 1834; 
GFX6-NEXT: s_add_i32 s4, s4, s5 1835; GFX6-NEXT: ; return to shader part epilog 1836; 1837; GFX8-LABEL: s_saddsat_v5i32: 1838; GFX8: ; %bb.0: 1839; GFX8-NEXT: s_brev_b32 s11, 1 1840; GFX8-NEXT: s_min_i32 s13, s0, 0 1841; GFX8-NEXT: s_brev_b32 s10, -2 1842; GFX8-NEXT: s_max_i32 s12, s0, 0 1843; GFX8-NEXT: s_sub_i32 s13, s11, s13 1844; GFX8-NEXT: s_sub_i32 s12, s10, s12 1845; GFX8-NEXT: s_max_i32 s5, s13, s5 1846; GFX8-NEXT: s_min_i32 s5, s5, s12 1847; GFX8-NEXT: s_min_i32 s12, s1, 0 1848; GFX8-NEXT: s_add_i32 s0, s0, s5 1849; GFX8-NEXT: s_max_i32 s5, s1, 0 1850; GFX8-NEXT: s_sub_i32 s12, s11, s12 1851; GFX8-NEXT: s_sub_i32 s5, s10, s5 1852; GFX8-NEXT: s_max_i32 s6, s12, s6 1853; GFX8-NEXT: s_min_i32 s5, s6, s5 1854; GFX8-NEXT: s_min_i32 s6, s2, 0 1855; GFX8-NEXT: s_add_i32 s1, s1, s5 1856; GFX8-NEXT: s_max_i32 s5, s2, 0 1857; GFX8-NEXT: s_sub_i32 s6, s11, s6 1858; GFX8-NEXT: s_sub_i32 s5, s10, s5 1859; GFX8-NEXT: s_max_i32 s6, s6, s7 1860; GFX8-NEXT: s_min_i32 s5, s6, s5 1861; GFX8-NEXT: s_min_i32 s6, s3, 0 1862; GFX8-NEXT: s_add_i32 s2, s2, s5 1863; GFX8-NEXT: s_max_i32 s5, s3, 0 1864; GFX8-NEXT: s_sub_i32 s6, s11, s6 1865; GFX8-NEXT: s_sub_i32 s5, s10, s5 1866; GFX8-NEXT: s_max_i32 s6, s6, s8 1867; GFX8-NEXT: s_min_i32 s5, s6, s5 1868; GFX8-NEXT: s_min_i32 s6, s4, 0 1869; GFX8-NEXT: s_add_i32 s3, s3, s5 1870; GFX8-NEXT: s_max_i32 s5, s4, 0 1871; GFX8-NEXT: s_sub_i32 s6, s11, s6 1872; GFX8-NEXT: s_sub_i32 s5, s10, s5 1873; GFX8-NEXT: s_max_i32 s6, s6, s9 1874; GFX8-NEXT: s_min_i32 s5, s6, s5 1875; GFX8-NEXT: s_add_i32 s4, s4, s5 1876; GFX8-NEXT: ; return to shader part epilog 1877; 1878; GFX9-LABEL: s_saddsat_v5i32: 1879; GFX9: ; %bb.0: 1880; GFX9-NEXT: v_mov_b32_e32 v0, s5 1881; GFX9-NEXT: v_mov_b32_e32 v1, s6 1882; GFX9-NEXT: v_mov_b32_e32 v2, s7 1883; GFX9-NEXT: v_mov_b32_e32 v3, s8 1884; GFX9-NEXT: v_mov_b32_e32 v4, s9 1885; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp 1886; GFX9-NEXT: v_add_i32 v1, s1, v1 clamp 1887; GFX9-NEXT: v_add_i32 v2, s2, v2 clamp 1888; 
GFX9-NEXT: v_add_i32 v3, s3, v3 clamp 1889; GFX9-NEXT: v_add_i32 v4, s4, v4 clamp 1890; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1891; GFX9-NEXT: v_readfirstlane_b32 s1, v1 1892; GFX9-NEXT: v_readfirstlane_b32 s2, v2 1893; GFX9-NEXT: v_readfirstlane_b32 s3, v3 1894; GFX9-NEXT: v_readfirstlane_b32 s4, v4 1895; GFX9-NEXT: ; return to shader part epilog 1896; 1897; GFX10-LABEL: s_saddsat_v5i32: 1898; GFX10: ; %bb.0: 1899; GFX10-NEXT: v_add_nc_i32 v0, s0, s5 clamp 1900; GFX10-NEXT: v_add_nc_i32 v1, s1, s6 clamp 1901; GFX10-NEXT: v_add_nc_i32 v2, s2, s7 clamp 1902; GFX10-NEXT: v_add_nc_i32 v3, s3, s8 clamp 1903; GFX10-NEXT: v_add_nc_i32 v4, s4, s9 clamp 1904; GFX10-NEXT: v_readfirstlane_b32 s0, v0 1905; GFX10-NEXT: v_readfirstlane_b32 s1, v1 1906; GFX10-NEXT: v_readfirstlane_b32 s2, v2 1907; GFX10-NEXT: v_readfirstlane_b32 s3, v3 1908; GFX10-NEXT: v_readfirstlane_b32 s4, v4 1909; GFX10-NEXT: ; return to shader part epilog 1910 %result = call <5 x i32> @llvm.sadd.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs) 1911 ret <5 x i32> %result 1912} 1913 1914define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { 1915; GFX6-LABEL: v_saddsat_v16i32: 1916; GFX6: ; %bb.0: 1917; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1918; GFX6-NEXT: s_brev_b32 s4, 1 1919; GFX6-NEXT: v_min_i32_e32 v32, 0, v0 1920; GFX6-NEXT: v_sub_i32_e32 v32, vcc, s4, v32 1921; GFX6-NEXT: v_max_i32_e32 v16, v32, v16 1922; GFX6-NEXT: s_brev_b32 s5, -2 1923; GFX6-NEXT: v_max_i32_e32 v32, 0, v0 1924; GFX6-NEXT: v_sub_i32_e32 v32, vcc, s5, v32 1925; GFX6-NEXT: v_min_i32_e32 v16, v16, v32 1926; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v16 1927; GFX6-NEXT: v_min_i32_e32 v16, 0, v1 1928; GFX6-NEXT: v_sub_i32_e32 v16, vcc, s4, v16 1929; GFX6-NEXT: v_max_i32_e32 v16, v16, v17 1930; GFX6-NEXT: v_max_i32_e32 v17, 0, v1 1931; GFX6-NEXT: v_sub_i32_e32 v17, vcc, s5, v17 1932; GFX6-NEXT: v_min_i32_e32 v16, v16, v17 1933; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v16 1934; GFX6-NEXT: v_min_i32_e32 v16, 0, v2 1935; 
GFX6-NEXT: v_sub_i32_e32 v16, vcc, s4, v16 1936; GFX6-NEXT: v_max_i32_e32 v17, 0, v2 1937; GFX6-NEXT: v_max_i32_e32 v16, v16, v18 1938; GFX6-NEXT: v_sub_i32_e32 v17, vcc, s5, v17 1939; GFX6-NEXT: v_min_i32_e32 v16, v16, v17 1940; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v16 1941; GFX6-NEXT: v_bfrev_b32_e32 v16, 1 1942; GFX6-NEXT: v_min_i32_e32 v17, 0, v3 1943; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17 1944; GFX6-NEXT: v_max_i32_e32 v17, v17, v19 1945; GFX6-NEXT: v_bfrev_b32_e32 v18, -2 1946; GFX6-NEXT: v_max_i32_e32 v19, 0, v3 1947; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v18, v19 1948; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 1949; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v17 1950; GFX6-NEXT: v_min_i32_e32 v17, 0, v4 1951; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17 1952; GFX6-NEXT: v_max_i32_e32 v19, 0, v4 1953; GFX6-NEXT: v_max_i32_e32 v17, v17, v20 1954; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v18, v19 1955; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 1956; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v17 1957; GFX6-NEXT: v_min_i32_e32 v17, 0, v5 1958; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17 1959; GFX6-NEXT: v_max_i32_e32 v19, 0, v5 1960; GFX6-NEXT: v_max_i32_e32 v17, v17, v21 1961; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v18, v19 1962; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 1963; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v17 1964; GFX6-NEXT: v_min_i32_e32 v17, 0, v6 1965; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17 1966; GFX6-NEXT: v_max_i32_e32 v19, 0, v6 1967; GFX6-NEXT: v_max_i32_e32 v17, v17, v22 1968; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v18, v19 1969; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 1970; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v17 1971; GFX6-NEXT: v_min_i32_e32 v17, 0, v7 1972; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17 1973; GFX6-NEXT: v_max_i32_e32 v19, 0, v7 1974; GFX6-NEXT: v_max_i32_e32 v17, v17, v23 1975; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v18, v19 1976; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 1977; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v17 1978; GFX6-NEXT: v_min_i32_e32 v17, 
0, v8 1979; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17 1980; GFX6-NEXT: v_max_i32_e32 v19, 0, v8 1981; GFX6-NEXT: v_max_i32_e32 v17, v17, v24 1982; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v18, v19 1983; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 1984; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v17 1985; GFX6-NEXT: v_min_i32_e32 v17, 0, v9 1986; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17 1987; GFX6-NEXT: v_max_i32_e32 v19, 0, v9 1988; GFX6-NEXT: v_max_i32_e32 v17, v17, v25 1989; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v18, v19 1990; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 1991; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17 1992; GFX6-NEXT: v_min_i32_e32 v17, 0, v10 1993; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17 1994; GFX6-NEXT: v_max_i32_e32 v19, 0, v10 1995; GFX6-NEXT: v_max_i32_e32 v17, v17, v26 1996; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v18, v19 1997; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 1998; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v17 1999; GFX6-NEXT: v_min_i32_e32 v17, 0, v11 2000; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17 2001; GFX6-NEXT: v_max_i32_e32 v19, 0, v11 2002; GFX6-NEXT: v_max_i32_e32 v17, v17, v27 2003; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v18, v19 2004; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 2005; GFX6-NEXT: v_add_i32_e32 v11, vcc, v11, v17 2006; GFX6-NEXT: v_min_i32_e32 v17, 0, v12 2007; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17 2008; GFX6-NEXT: v_max_i32_e32 v19, 0, v12 2009; GFX6-NEXT: v_max_i32_e32 v17, v17, v28 2010; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v18, v19 2011; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 2012; GFX6-NEXT: v_add_i32_e32 v12, vcc, v12, v17 2013; GFX6-NEXT: v_min_i32_e32 v17, 0, v13 2014; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v16, v17 2015; GFX6-NEXT: v_max_i32_e32 v19, 0, v13 2016; GFX6-NEXT: v_max_i32_e32 v17, v17, v29 2017; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v18, v19 2018; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 2019; GFX6-NEXT: v_add_i32_e32 v13, vcc, v13, v17 2020; GFX6-NEXT: v_min_i32_e32 v17, 0, v14 2021; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 
v16, v17 2022; GFX6-NEXT: v_max_i32_e32 v19, 0, v14 2023; GFX6-NEXT: v_max_i32_e32 v17, v17, v30 2024; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v18, v19 2025; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 2026; GFX6-NEXT: v_add_i32_e32 v14, vcc, v14, v17 2027; GFX6-NEXT: v_max_i32_e32 v17, 0, v15 2028; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v18, v17 2029; GFX6-NEXT: v_min_i32_e32 v18, 0, v15 2030; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v18 2031; GFX6-NEXT: v_max_i32_e32 v16, v16, v31 2032; GFX6-NEXT: v_min_i32_e32 v16, v16, v17 2033; GFX6-NEXT: v_add_i32_e32 v15, vcc, v15, v16 2034; GFX6-NEXT: s_setpc_b64 s[30:31] 2035; 2036; GFX8-LABEL: v_saddsat_v16i32: 2037; GFX8: ; %bb.0: 2038; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2039; GFX8-NEXT: s_brev_b32 s4, 1 2040; GFX8-NEXT: v_min_i32_e32 v32, 0, v0 2041; GFX8-NEXT: v_sub_u32_e32 v32, vcc, s4, v32 2042; GFX8-NEXT: v_max_i32_e32 v16, v32, v16 2043; GFX8-NEXT: s_brev_b32 s5, -2 2044; GFX8-NEXT: v_max_i32_e32 v32, 0, v0 2045; GFX8-NEXT: v_sub_u32_e32 v32, vcc, s5, v32 2046; GFX8-NEXT: v_min_i32_e32 v16, v16, v32 2047; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v16 2048; GFX8-NEXT: v_min_i32_e32 v16, 0, v1 2049; GFX8-NEXT: v_sub_u32_e32 v16, vcc, s4, v16 2050; GFX8-NEXT: v_max_i32_e32 v16, v16, v17 2051; GFX8-NEXT: v_max_i32_e32 v17, 0, v1 2052; GFX8-NEXT: v_sub_u32_e32 v17, vcc, s5, v17 2053; GFX8-NEXT: v_min_i32_e32 v16, v16, v17 2054; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v16 2055; GFX8-NEXT: v_min_i32_e32 v16, 0, v2 2056; GFX8-NEXT: v_sub_u32_e32 v16, vcc, s4, v16 2057; GFX8-NEXT: v_max_i32_e32 v17, 0, v2 2058; GFX8-NEXT: v_max_i32_e32 v16, v16, v18 2059; GFX8-NEXT: v_sub_u32_e32 v17, vcc, s5, v17 2060; GFX8-NEXT: v_min_i32_e32 v16, v16, v17 2061; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v16 2062; GFX8-NEXT: v_bfrev_b32_e32 v16, 1 2063; GFX8-NEXT: v_min_i32_e32 v17, 0, v3 2064; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17 2065; GFX8-NEXT: v_max_i32_e32 v17, v17, v19 2066; GFX8-NEXT: v_bfrev_b32_e32 v18, -2 2067; GFX8-NEXT: 
v_max_i32_e32 v19, 0, v3 2068; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v18, v19 2069; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 2070; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v17 2071; GFX8-NEXT: v_min_i32_e32 v17, 0, v4 2072; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17 2073; GFX8-NEXT: v_max_i32_e32 v19, 0, v4 2074; GFX8-NEXT: v_max_i32_e32 v17, v17, v20 2075; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v18, v19 2076; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 2077; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v17 2078; GFX8-NEXT: v_min_i32_e32 v17, 0, v5 2079; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17 2080; GFX8-NEXT: v_max_i32_e32 v19, 0, v5 2081; GFX8-NEXT: v_max_i32_e32 v17, v17, v21 2082; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v18, v19 2083; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 2084; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v17 2085; GFX8-NEXT: v_min_i32_e32 v17, 0, v6 2086; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17 2087; GFX8-NEXT: v_max_i32_e32 v19, 0, v6 2088; GFX8-NEXT: v_max_i32_e32 v17, v17, v22 2089; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v18, v19 2090; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 2091; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v17 2092; GFX8-NEXT: v_min_i32_e32 v17, 0, v7 2093; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17 2094; GFX8-NEXT: v_max_i32_e32 v19, 0, v7 2095; GFX8-NEXT: v_max_i32_e32 v17, v17, v23 2096; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v18, v19 2097; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 2098; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v17 2099; GFX8-NEXT: v_min_i32_e32 v17, 0, v8 2100; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17 2101; GFX8-NEXT: v_max_i32_e32 v19, 0, v8 2102; GFX8-NEXT: v_max_i32_e32 v17, v17, v24 2103; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v18, v19 2104; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 2105; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v17 2106; GFX8-NEXT: v_min_i32_e32 v17, 0, v9 2107; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17 2108; GFX8-NEXT: v_max_i32_e32 v19, 0, v9 2109; GFX8-NEXT: v_max_i32_e32 v17, v17, v25 2110; GFX8-NEXT: v_sub_u32_e32 v19, vcc, 
v18, v19 2111; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 2112; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v17 2113; GFX8-NEXT: v_min_i32_e32 v17, 0, v10 2114; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17 2115; GFX8-NEXT: v_max_i32_e32 v19, 0, v10 2116; GFX8-NEXT: v_max_i32_e32 v17, v17, v26 2117; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v18, v19 2118; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 2119; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v17 2120; GFX8-NEXT: v_min_i32_e32 v17, 0, v11 2121; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17 2122; GFX8-NEXT: v_max_i32_e32 v19, 0, v11 2123; GFX8-NEXT: v_max_i32_e32 v17, v17, v27 2124; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v18, v19 2125; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 2126; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v17 2127; GFX8-NEXT: v_min_i32_e32 v17, 0, v12 2128; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17 2129; GFX8-NEXT: v_max_i32_e32 v19, 0, v12 2130; GFX8-NEXT: v_max_i32_e32 v17, v17, v28 2131; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v18, v19 2132; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 2133; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v17 2134; GFX8-NEXT: v_min_i32_e32 v17, 0, v13 2135; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17 2136; GFX8-NEXT: v_max_i32_e32 v19, 0, v13 2137; GFX8-NEXT: v_max_i32_e32 v17, v17, v29 2138; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v18, v19 2139; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 2140; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v17 2141; GFX8-NEXT: v_min_i32_e32 v17, 0, v14 2142; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v16, v17 2143; GFX8-NEXT: v_max_i32_e32 v19, 0, v14 2144; GFX8-NEXT: v_max_i32_e32 v17, v17, v30 2145; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v18, v19 2146; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 2147; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v17 2148; GFX8-NEXT: v_max_i32_e32 v17, 0, v15 2149; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v18, v17 2150; GFX8-NEXT: v_min_i32_e32 v18, 0, v15 2151; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v16, v18 2152; GFX8-NEXT: v_max_i32_e32 v16, v16, v31 2153; GFX8-NEXT: v_min_i32_e32 v16, 
v16, v17 2154; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v16 2155; GFX8-NEXT: s_setpc_b64 s[30:31] 2156; 2157; GFX9-LABEL: v_saddsat_v16i32: 2158; GFX9: ; %bb.0: 2159; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2160; GFX9-NEXT: v_add_i32 v0, v0, v16 clamp 2161; GFX9-NEXT: v_add_i32 v1, v1, v17 clamp 2162; GFX9-NEXT: v_add_i32 v2, v2, v18 clamp 2163; GFX9-NEXT: v_add_i32 v3, v3, v19 clamp 2164; GFX9-NEXT: v_add_i32 v4, v4, v20 clamp 2165; GFX9-NEXT: v_add_i32 v5, v5, v21 clamp 2166; GFX9-NEXT: v_add_i32 v6, v6, v22 clamp 2167; GFX9-NEXT: v_add_i32 v7, v7, v23 clamp 2168; GFX9-NEXT: v_add_i32 v8, v8, v24 clamp 2169; GFX9-NEXT: v_add_i32 v9, v9, v25 clamp 2170; GFX9-NEXT: v_add_i32 v10, v10, v26 clamp 2171; GFX9-NEXT: v_add_i32 v11, v11, v27 clamp 2172; GFX9-NEXT: v_add_i32 v12, v12, v28 clamp 2173; GFX9-NEXT: v_add_i32 v13, v13, v29 clamp 2174; GFX9-NEXT: v_add_i32 v14, v14, v30 clamp 2175; GFX9-NEXT: v_add_i32 v15, v15, v31 clamp 2176; GFX9-NEXT: s_setpc_b64 s[30:31] 2177; 2178; GFX10-LABEL: v_saddsat_v16i32: 2179; GFX10: ; %bb.0: 2180; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2181; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2182; GFX10-NEXT: v_add_nc_i32 v0, v0, v16 clamp 2183; GFX10-NEXT: v_add_nc_i32 v1, v1, v17 clamp 2184; GFX10-NEXT: v_add_nc_i32 v2, v2, v18 clamp 2185; GFX10-NEXT: v_add_nc_i32 v3, v3, v19 clamp 2186; GFX10-NEXT: v_add_nc_i32 v4, v4, v20 clamp 2187; GFX10-NEXT: v_add_nc_i32 v5, v5, v21 clamp 2188; GFX10-NEXT: v_add_nc_i32 v6, v6, v22 clamp 2189; GFX10-NEXT: v_add_nc_i32 v7, v7, v23 clamp 2190; GFX10-NEXT: v_add_nc_i32 v8, v8, v24 clamp 2191; GFX10-NEXT: v_add_nc_i32 v9, v9, v25 clamp 2192; GFX10-NEXT: v_add_nc_i32 v10, v10, v26 clamp 2193; GFX10-NEXT: v_add_nc_i32 v11, v11, v27 clamp 2194; GFX10-NEXT: v_add_nc_i32 v12, v12, v28 clamp 2195; GFX10-NEXT: v_add_nc_i32 v13, v13, v29 clamp 2196; GFX10-NEXT: v_add_nc_i32 v14, v14, v30 clamp 2197; GFX10-NEXT: v_add_nc_i32 v15, v15, v31 clamp 2198; GFX10-NEXT: s_setpc_b64 s[30:31] 2199 
%result = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) 2200 ret <16 x i32> %result 2201} 2202 2203define amdgpu_ps <16 x i32> @s_saddsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> inreg %rhs) { 2204; GFX6-LABEL: s_saddsat_v16i32: 2205; GFX6: ; %bb.0: 2206; GFX6-NEXT: s_brev_b32 s33, 1 2207; GFX6-NEXT: s_min_i32 s35, s0, 0 2208; GFX6-NEXT: s_brev_b32 s32, -2 2209; GFX6-NEXT: s_max_i32 s34, s0, 0 2210; GFX6-NEXT: s_sub_i32 s35, s33, s35 2211; GFX6-NEXT: s_sub_i32 s34, s32, s34 2212; GFX6-NEXT: s_max_i32 s16, s35, s16 2213; GFX6-NEXT: s_min_i32 s16, s16, s34 2214; GFX6-NEXT: s_min_i32 s34, s1, 0 2215; GFX6-NEXT: s_add_i32 s0, s0, s16 2216; GFX6-NEXT: s_max_i32 s16, s1, 0 2217; GFX6-NEXT: s_sub_i32 s34, s33, s34 2218; GFX6-NEXT: s_sub_i32 s16, s32, s16 2219; GFX6-NEXT: s_max_i32 s17, s34, s17 2220; GFX6-NEXT: s_min_i32 s16, s17, s16 2221; GFX6-NEXT: s_min_i32 s17, s2, 0 2222; GFX6-NEXT: s_add_i32 s1, s1, s16 2223; GFX6-NEXT: s_max_i32 s16, s2, 0 2224; GFX6-NEXT: s_sub_i32 s17, s33, s17 2225; GFX6-NEXT: s_sub_i32 s16, s32, s16 2226; GFX6-NEXT: s_max_i32 s17, s17, s18 2227; GFX6-NEXT: s_min_i32 s16, s17, s16 2228; GFX6-NEXT: s_min_i32 s17, s3, 0 2229; GFX6-NEXT: s_add_i32 s2, s2, s16 2230; GFX6-NEXT: s_max_i32 s16, s3, 0 2231; GFX6-NEXT: s_sub_i32 s17, s33, s17 2232; GFX6-NEXT: s_sub_i32 s16, s32, s16 2233; GFX6-NEXT: s_max_i32 s17, s17, s19 2234; GFX6-NEXT: s_min_i32 s16, s17, s16 2235; GFX6-NEXT: s_min_i32 s17, s4, 0 2236; GFX6-NEXT: s_add_i32 s3, s3, s16 2237; GFX6-NEXT: s_max_i32 s16, s4, 0 2238; GFX6-NEXT: s_sub_i32 s17, s33, s17 2239; GFX6-NEXT: s_sub_i32 s16, s32, s16 2240; GFX6-NEXT: s_max_i32 s17, s17, s20 2241; GFX6-NEXT: s_min_i32 s16, s17, s16 2242; GFX6-NEXT: s_min_i32 s17, s5, 0 2243; GFX6-NEXT: s_add_i32 s4, s4, s16 2244; GFX6-NEXT: s_max_i32 s16, s5, 0 2245; GFX6-NEXT: s_sub_i32 s17, s33, s17 2246; GFX6-NEXT: s_sub_i32 s16, s32, s16 2247; GFX6-NEXT: s_max_i32 s17, s17, s21 2248; GFX6-NEXT: s_min_i32 s16, s17, s16 2249; GFX6-NEXT: 
s_min_i32 s17, s6, 0 2250; GFX6-NEXT: s_add_i32 s5, s5, s16 2251; GFX6-NEXT: s_max_i32 s16, s6, 0 2252; GFX6-NEXT: s_sub_i32 s17, s33, s17 2253; GFX6-NEXT: s_sub_i32 s16, s32, s16 2254; GFX6-NEXT: s_max_i32 s17, s17, s22 2255; GFX6-NEXT: s_min_i32 s16, s17, s16 2256; GFX6-NEXT: s_min_i32 s17, s7, 0 2257; GFX6-NEXT: s_add_i32 s6, s6, s16 2258; GFX6-NEXT: s_max_i32 s16, s7, 0 2259; GFX6-NEXT: s_sub_i32 s17, s33, s17 2260; GFX6-NEXT: s_sub_i32 s16, s32, s16 2261; GFX6-NEXT: s_max_i32 s17, s17, s23 2262; GFX6-NEXT: s_min_i32 s16, s17, s16 2263; GFX6-NEXT: s_min_i32 s17, s8, 0 2264; GFX6-NEXT: s_add_i32 s7, s7, s16 2265; GFX6-NEXT: s_max_i32 s16, s8, 0 2266; GFX6-NEXT: s_sub_i32 s17, s33, s17 2267; GFX6-NEXT: s_sub_i32 s16, s32, s16 2268; GFX6-NEXT: s_max_i32 s17, s17, s24 2269; GFX6-NEXT: s_min_i32 s16, s17, s16 2270; GFX6-NEXT: s_min_i32 s17, s9, 0 2271; GFX6-NEXT: s_add_i32 s8, s8, s16 2272; GFX6-NEXT: s_max_i32 s16, s9, 0 2273; GFX6-NEXT: s_sub_i32 s17, s33, s17 2274; GFX6-NEXT: s_sub_i32 s16, s32, s16 2275; GFX6-NEXT: s_max_i32 s17, s17, s25 2276; GFX6-NEXT: s_min_i32 s16, s17, s16 2277; GFX6-NEXT: s_min_i32 s17, s10, 0 2278; GFX6-NEXT: s_add_i32 s9, s9, s16 2279; GFX6-NEXT: s_max_i32 s16, s10, 0 2280; GFX6-NEXT: s_sub_i32 s17, s33, s17 2281; GFX6-NEXT: s_sub_i32 s16, s32, s16 2282; GFX6-NEXT: s_max_i32 s17, s17, s26 2283; GFX6-NEXT: s_min_i32 s16, s17, s16 2284; GFX6-NEXT: s_min_i32 s17, s11, 0 2285; GFX6-NEXT: s_add_i32 s10, s10, s16 2286; GFX6-NEXT: s_max_i32 s16, s11, 0 2287; GFX6-NEXT: s_sub_i32 s17, s33, s17 2288; GFX6-NEXT: s_sub_i32 s16, s32, s16 2289; GFX6-NEXT: s_max_i32 s17, s17, s27 2290; GFX6-NEXT: s_min_i32 s16, s17, s16 2291; GFX6-NEXT: s_min_i32 s17, s12, 0 2292; GFX6-NEXT: s_add_i32 s11, s11, s16 2293; GFX6-NEXT: s_max_i32 s16, s12, 0 2294; GFX6-NEXT: s_sub_i32 s17, s33, s17 2295; GFX6-NEXT: s_sub_i32 s16, s32, s16 2296; GFX6-NEXT: s_max_i32 s17, s17, s28 2297; GFX6-NEXT: s_min_i32 s16, s17, s16 2298; GFX6-NEXT: s_min_i32 s17, s13, 0 2299; 
GFX6-NEXT: s_add_i32 s12, s12, s16 2300; GFX6-NEXT: s_max_i32 s16, s13, 0 2301; GFX6-NEXT: s_sub_i32 s17, s33, s17 2302; GFX6-NEXT: s_sub_i32 s16, s32, s16 2303; GFX6-NEXT: s_max_i32 s17, s17, s29 2304; GFX6-NEXT: s_min_i32 s16, s17, s16 2305; GFX6-NEXT: s_min_i32 s17, s14, 0 2306; GFX6-NEXT: s_add_i32 s13, s13, s16 2307; GFX6-NEXT: s_max_i32 s16, s14, 0 2308; GFX6-NEXT: s_sub_i32 s17, s33, s17 2309; GFX6-NEXT: s_sub_i32 s16, s32, s16 2310; GFX6-NEXT: s_max_i32 s17, s17, s30 2311; GFX6-NEXT: s_min_i32 s16, s17, s16 2312; GFX6-NEXT: s_min_i32 s17, s15, 0 2313; GFX6-NEXT: s_add_i32 s14, s14, s16 2314; GFX6-NEXT: s_max_i32 s16, s15, 0 2315; GFX6-NEXT: s_sub_i32 s17, s33, s17 2316; GFX6-NEXT: s_sub_i32 s16, s32, s16 2317; GFX6-NEXT: s_max_i32 s17, s17, s31 2318; GFX6-NEXT: s_min_i32 s16, s17, s16 2319; GFX6-NEXT: s_add_i32 s15, s15, s16 2320; GFX6-NEXT: ; return to shader part epilog 2321; 2322; GFX8-LABEL: s_saddsat_v16i32: 2323; GFX8: ; %bb.0: 2324; GFX8-NEXT: s_brev_b32 s33, 1 2325; GFX8-NEXT: s_min_i32 s35, s0, 0 2326; GFX8-NEXT: s_brev_b32 s32, -2 2327; GFX8-NEXT: s_max_i32 s34, s0, 0 2328; GFX8-NEXT: s_sub_i32 s35, s33, s35 2329; GFX8-NEXT: s_sub_i32 s34, s32, s34 2330; GFX8-NEXT: s_max_i32 s16, s35, s16 2331; GFX8-NEXT: s_min_i32 s16, s16, s34 2332; GFX8-NEXT: s_min_i32 s34, s1, 0 2333; GFX8-NEXT: s_add_i32 s0, s0, s16 2334; GFX8-NEXT: s_max_i32 s16, s1, 0 2335; GFX8-NEXT: s_sub_i32 s34, s33, s34 2336; GFX8-NEXT: s_sub_i32 s16, s32, s16 2337; GFX8-NEXT: s_max_i32 s17, s34, s17 2338; GFX8-NEXT: s_min_i32 s16, s17, s16 2339; GFX8-NEXT: s_min_i32 s17, s2, 0 2340; GFX8-NEXT: s_add_i32 s1, s1, s16 2341; GFX8-NEXT: s_max_i32 s16, s2, 0 2342; GFX8-NEXT: s_sub_i32 s17, s33, s17 2343; GFX8-NEXT: s_sub_i32 s16, s32, s16 2344; GFX8-NEXT: s_max_i32 s17, s17, s18 2345; GFX8-NEXT: s_min_i32 s16, s17, s16 2346; GFX8-NEXT: s_min_i32 s17, s3, 0 2347; GFX8-NEXT: s_add_i32 s2, s2, s16 2348; GFX8-NEXT: s_max_i32 s16, s3, 0 2349; GFX8-NEXT: s_sub_i32 s17, s33, s17 2350; GFX8-NEXT: 
s_sub_i32 s16, s32, s16 2351; GFX8-NEXT: s_max_i32 s17, s17, s19 2352; GFX8-NEXT: s_min_i32 s16, s17, s16 2353; GFX8-NEXT: s_min_i32 s17, s4, 0 2354; GFX8-NEXT: s_add_i32 s3, s3, s16 2355; GFX8-NEXT: s_max_i32 s16, s4, 0 2356; GFX8-NEXT: s_sub_i32 s17, s33, s17 2357; GFX8-NEXT: s_sub_i32 s16, s32, s16 2358; GFX8-NEXT: s_max_i32 s17, s17, s20 2359; GFX8-NEXT: s_min_i32 s16, s17, s16 2360; GFX8-NEXT: s_min_i32 s17, s5, 0 2361; GFX8-NEXT: s_add_i32 s4, s4, s16 2362; GFX8-NEXT: s_max_i32 s16, s5, 0 2363; GFX8-NEXT: s_sub_i32 s17, s33, s17 2364; GFX8-NEXT: s_sub_i32 s16, s32, s16 2365; GFX8-NEXT: s_max_i32 s17, s17, s21 2366; GFX8-NEXT: s_min_i32 s16, s17, s16 2367; GFX8-NEXT: s_min_i32 s17, s6, 0 2368; GFX8-NEXT: s_add_i32 s5, s5, s16 2369; GFX8-NEXT: s_max_i32 s16, s6, 0 2370; GFX8-NEXT: s_sub_i32 s17, s33, s17 2371; GFX8-NEXT: s_sub_i32 s16, s32, s16 2372; GFX8-NEXT: s_max_i32 s17, s17, s22 2373; GFX8-NEXT: s_min_i32 s16, s17, s16 2374; GFX8-NEXT: s_min_i32 s17, s7, 0 2375; GFX8-NEXT: s_add_i32 s6, s6, s16 2376; GFX8-NEXT: s_max_i32 s16, s7, 0 2377; GFX8-NEXT: s_sub_i32 s17, s33, s17 2378; GFX8-NEXT: s_sub_i32 s16, s32, s16 2379; GFX8-NEXT: s_max_i32 s17, s17, s23 2380; GFX8-NEXT: s_min_i32 s16, s17, s16 2381; GFX8-NEXT: s_min_i32 s17, s8, 0 2382; GFX8-NEXT: s_add_i32 s7, s7, s16 2383; GFX8-NEXT: s_max_i32 s16, s8, 0 2384; GFX8-NEXT: s_sub_i32 s17, s33, s17 2385; GFX8-NEXT: s_sub_i32 s16, s32, s16 2386; GFX8-NEXT: s_max_i32 s17, s17, s24 2387; GFX8-NEXT: s_min_i32 s16, s17, s16 2388; GFX8-NEXT: s_min_i32 s17, s9, 0 2389; GFX8-NEXT: s_add_i32 s8, s8, s16 2390; GFX8-NEXT: s_max_i32 s16, s9, 0 2391; GFX8-NEXT: s_sub_i32 s17, s33, s17 2392; GFX8-NEXT: s_sub_i32 s16, s32, s16 2393; GFX8-NEXT: s_max_i32 s17, s17, s25 2394; GFX8-NEXT: s_min_i32 s16, s17, s16 2395; GFX8-NEXT: s_min_i32 s17, s10, 0 2396; GFX8-NEXT: s_add_i32 s9, s9, s16 2397; GFX8-NEXT: s_max_i32 s16, s10, 0 2398; GFX8-NEXT: s_sub_i32 s17, s33, s17 2399; GFX8-NEXT: s_sub_i32 s16, s32, s16 2400; GFX8-NEXT: 
s_max_i32 s17, s17, s26 2401; GFX8-NEXT: s_min_i32 s16, s17, s16 2402; GFX8-NEXT: s_min_i32 s17, s11, 0 2403; GFX8-NEXT: s_add_i32 s10, s10, s16 2404; GFX8-NEXT: s_max_i32 s16, s11, 0 2405; GFX8-NEXT: s_sub_i32 s17, s33, s17 2406; GFX8-NEXT: s_sub_i32 s16, s32, s16 2407; GFX8-NEXT: s_max_i32 s17, s17, s27 2408; GFX8-NEXT: s_min_i32 s16, s17, s16 2409; GFX8-NEXT: s_min_i32 s17, s12, 0 2410; GFX8-NEXT: s_add_i32 s11, s11, s16 2411; GFX8-NEXT: s_max_i32 s16, s12, 0 2412; GFX8-NEXT: s_sub_i32 s17, s33, s17 2413; GFX8-NEXT: s_sub_i32 s16, s32, s16 2414; GFX8-NEXT: s_max_i32 s17, s17, s28 2415; GFX8-NEXT: s_min_i32 s16, s17, s16 2416; GFX8-NEXT: s_min_i32 s17, s13, 0 2417; GFX8-NEXT: s_add_i32 s12, s12, s16 2418; GFX8-NEXT: s_max_i32 s16, s13, 0 2419; GFX8-NEXT: s_sub_i32 s17, s33, s17 2420; GFX8-NEXT: s_sub_i32 s16, s32, s16 2421; GFX8-NEXT: s_max_i32 s17, s17, s29 2422; GFX8-NEXT: s_min_i32 s16, s17, s16 2423; GFX8-NEXT: s_min_i32 s17, s14, 0 2424; GFX8-NEXT: s_add_i32 s13, s13, s16 2425; GFX8-NEXT: s_max_i32 s16, s14, 0 2426; GFX8-NEXT: s_sub_i32 s17, s33, s17 2427; GFX8-NEXT: s_sub_i32 s16, s32, s16 2428; GFX8-NEXT: s_max_i32 s17, s17, s30 2429; GFX8-NEXT: s_min_i32 s16, s17, s16 2430; GFX8-NEXT: s_min_i32 s17, s15, 0 2431; GFX8-NEXT: s_add_i32 s14, s14, s16 2432; GFX8-NEXT: s_max_i32 s16, s15, 0 2433; GFX8-NEXT: s_sub_i32 s17, s33, s17 2434; GFX8-NEXT: s_sub_i32 s16, s32, s16 2435; GFX8-NEXT: s_max_i32 s17, s17, s31 2436; GFX8-NEXT: s_min_i32 s16, s17, s16 2437; GFX8-NEXT: s_add_i32 s15, s15, s16 2438; GFX8-NEXT: ; return to shader part epilog 2439; 2440; GFX9-LABEL: s_saddsat_v16i32: 2441; GFX9: ; %bb.0: 2442; GFX9-NEXT: v_mov_b32_e32 v0, s16 2443; GFX9-NEXT: v_mov_b32_e32 v1, s17 2444; GFX9-NEXT: v_mov_b32_e32 v2, s18 2445; GFX9-NEXT: v_mov_b32_e32 v3, s19 2446; GFX9-NEXT: v_mov_b32_e32 v4, s20 2447; GFX9-NEXT: v_mov_b32_e32 v5, s21 2448; GFX9-NEXT: v_mov_b32_e32 v6, s22 2449; GFX9-NEXT: v_mov_b32_e32 v7, s23 2450; GFX9-NEXT: v_mov_b32_e32 v8, s24 2451; GFX9-NEXT: 
v_mov_b32_e32 v9, s25 2452; GFX9-NEXT: v_mov_b32_e32 v10, s26 2453; GFX9-NEXT: v_mov_b32_e32 v11, s27 2454; GFX9-NEXT: v_mov_b32_e32 v12, s28 2455; GFX9-NEXT: v_mov_b32_e32 v13, s29 2456; GFX9-NEXT: v_mov_b32_e32 v14, s30 2457; GFX9-NEXT: v_mov_b32_e32 v15, s31 2458; GFX9-NEXT: v_add_i32 v0, s0, v0 clamp 2459; GFX9-NEXT: v_add_i32 v1, s1, v1 clamp 2460; GFX9-NEXT: v_add_i32 v2, s2, v2 clamp 2461; GFX9-NEXT: v_add_i32 v3, s3, v3 clamp 2462; GFX9-NEXT: v_add_i32 v4, s4, v4 clamp 2463; GFX9-NEXT: v_add_i32 v5, s5, v5 clamp 2464; GFX9-NEXT: v_add_i32 v6, s6, v6 clamp 2465; GFX9-NEXT: v_add_i32 v7, s7, v7 clamp 2466; GFX9-NEXT: v_add_i32 v8, s8, v8 clamp 2467; GFX9-NEXT: v_add_i32 v9, s9, v9 clamp 2468; GFX9-NEXT: v_add_i32 v10, s10, v10 clamp 2469; GFX9-NEXT: v_add_i32 v11, s11, v11 clamp 2470; GFX9-NEXT: v_add_i32 v12, s12, v12 clamp 2471; GFX9-NEXT: v_add_i32 v13, s13, v13 clamp 2472; GFX9-NEXT: v_add_i32 v14, s14, v14 clamp 2473; GFX9-NEXT: v_add_i32 v15, s15, v15 clamp 2474; GFX9-NEXT: v_readfirstlane_b32 s0, v0 2475; GFX9-NEXT: v_readfirstlane_b32 s1, v1 2476; GFX9-NEXT: v_readfirstlane_b32 s2, v2 2477; GFX9-NEXT: v_readfirstlane_b32 s3, v3 2478; GFX9-NEXT: v_readfirstlane_b32 s4, v4 2479; GFX9-NEXT: v_readfirstlane_b32 s5, v5 2480; GFX9-NEXT: v_readfirstlane_b32 s6, v6 2481; GFX9-NEXT: v_readfirstlane_b32 s7, v7 2482; GFX9-NEXT: v_readfirstlane_b32 s8, v8 2483; GFX9-NEXT: v_readfirstlane_b32 s9, v9 2484; GFX9-NEXT: v_readfirstlane_b32 s10, v10 2485; GFX9-NEXT: v_readfirstlane_b32 s11, v11 2486; GFX9-NEXT: v_readfirstlane_b32 s12, v12 2487; GFX9-NEXT: v_readfirstlane_b32 s13, v13 2488; GFX9-NEXT: v_readfirstlane_b32 s14, v14 2489; GFX9-NEXT: v_readfirstlane_b32 s15, v15 2490; GFX9-NEXT: ; return to shader part epilog 2491; 2492; GFX10-LABEL: s_saddsat_v16i32: 2493; GFX10: ; %bb.0: 2494; GFX10-NEXT: v_add_nc_i32 v0, s0, s16 clamp 2495; GFX10-NEXT: v_add_nc_i32 v1, s1, s17 clamp 2496; GFX10-NEXT: v_add_nc_i32 v2, s2, s18 clamp 2497; GFX10-NEXT: v_add_nc_i32 v3, s3, 
s19 clamp 2498; GFX10-NEXT: v_add_nc_i32 v4, s4, s20 clamp 2499; GFX10-NEXT: v_add_nc_i32 v5, s5, s21 clamp 2500; GFX10-NEXT: v_add_nc_i32 v6, s6, s22 clamp 2501; GFX10-NEXT: v_add_nc_i32 v7, s7, s23 clamp 2502; GFX10-NEXT: v_add_nc_i32 v8, s8, s24 clamp 2503; GFX10-NEXT: v_add_nc_i32 v9, s9, s25 clamp 2504; GFX10-NEXT: v_add_nc_i32 v10, s10, s26 clamp 2505; GFX10-NEXT: v_add_nc_i32 v11, s11, s27 clamp 2506; GFX10-NEXT: v_add_nc_i32 v12, s12, s28 clamp 2507; GFX10-NEXT: v_add_nc_i32 v13, s13, s29 clamp 2508; GFX10-NEXT: v_add_nc_i32 v14, s14, s30 clamp 2509; GFX10-NEXT: v_add_nc_i32 v15, s15, s31 clamp 2510; GFX10-NEXT: v_readfirstlane_b32 s0, v0 2511; GFX10-NEXT: v_readfirstlane_b32 s1, v1 2512; GFX10-NEXT: v_readfirstlane_b32 s2, v2 2513; GFX10-NEXT: v_readfirstlane_b32 s3, v3 2514; GFX10-NEXT: v_readfirstlane_b32 s4, v4 2515; GFX10-NEXT: v_readfirstlane_b32 s5, v5 2516; GFX10-NEXT: v_readfirstlane_b32 s6, v6 2517; GFX10-NEXT: v_readfirstlane_b32 s7, v7 2518; GFX10-NEXT: v_readfirstlane_b32 s8, v8 2519; GFX10-NEXT: v_readfirstlane_b32 s9, v9 2520; GFX10-NEXT: v_readfirstlane_b32 s10, v10 2521; GFX10-NEXT: v_readfirstlane_b32 s11, v11 2522; GFX10-NEXT: v_readfirstlane_b32 s12, v12 2523; GFX10-NEXT: v_readfirstlane_b32 s13, v13 2524; GFX10-NEXT: v_readfirstlane_b32 s14, v14 2525; GFX10-NEXT: v_readfirstlane_b32 s15, v15 2526; GFX10-NEXT: ; return to shader part epilog 2527 %result = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) 2528 ret <16 x i32> %result 2529} 2530 2531define i16 @v_saddsat_i16(i16 %lhs, i16 %rhs) { 2532; GFX6-LABEL: v_saddsat_i16: 2533; GFX6: ; %bb.0: 2534; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2535; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2536; GFX6-NEXT: v_min_i32_e32 v3, 0, v0 2537; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2538; GFX6-NEXT: v_max_i32_e32 v2, 0, v0 2539; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0x80000000, v3 2540; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x7fffffff, v2 2541; GFX6-NEXT: v_max_i32_e32 v1, 
v3, v1 2542; GFX6-NEXT: v_min_i32_e32 v1, v1, v2 2543; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 2544; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 2545; GFX6-NEXT: s_setpc_b64 s[30:31] 2546; 2547; GFX8-LABEL: v_saddsat_i16: 2548; GFX8: ; %bb.0: 2549; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2550; GFX8-NEXT: v_min_i16_e32 v3, 0, v0 2551; GFX8-NEXT: v_max_i16_e32 v2, 0, v0 2552; GFX8-NEXT: v_sub_u16_e32 v3, 0x8000, v3 2553; GFX8-NEXT: v_sub_u16_e32 v2, 0x7fff, v2 2554; GFX8-NEXT: v_max_i16_e32 v1, v3, v1 2555; GFX8-NEXT: v_min_i16_e32 v1, v1, v2 2556; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 2557; GFX8-NEXT: s_setpc_b64 s[30:31] 2558; 2559; GFX9-LABEL: v_saddsat_i16: 2560; GFX9: ; %bb.0: 2561; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2562; GFX9-NEXT: v_add_i16 v0, v0, v1 clamp 2563; GFX9-NEXT: s_setpc_b64 s[30:31] 2564; 2565; GFX10-LABEL: v_saddsat_i16: 2566; GFX10: ; %bb.0: 2567; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2568; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2569; GFX10-NEXT: v_add_nc_i16 v0, v0, v1 clamp 2570; GFX10-NEXT: s_setpc_b64 s[30:31] 2571 %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs) 2572 ret i16 %result 2573} 2574 2575define amdgpu_ps i16 @s_saddsat_i16(i16 inreg %lhs, i16 inreg %rhs) { 2576; GFX6-LABEL: s_saddsat_i16: 2577; GFX6: ; %bb.0: 2578; GFX6-NEXT: s_lshl_b32 s0, s0, 16 2579; GFX6-NEXT: s_min_i32 s3, s0, 0 2580; GFX6-NEXT: s_lshl_b32 s1, s1, 16 2581; GFX6-NEXT: s_max_i32 s2, s0, 0 2582; GFX6-NEXT: s_sub_i32 s3, 0x80000000, s3 2583; GFX6-NEXT: s_sub_i32 s2, 0x7fffffff, s2 2584; GFX6-NEXT: s_max_i32 s1, s3, s1 2585; GFX6-NEXT: s_min_i32 s1, s1, s2 2586; GFX6-NEXT: s_add_i32 s0, s0, s1 2587; GFX6-NEXT: s_ashr_i32 s0, s0, 16 2588; GFX6-NEXT: ; return to shader part epilog 2589; 2590; GFX8-LABEL: s_saddsat_i16: 2591; GFX8: ; %bb.0: 2592; GFX8-NEXT: s_sext_i32_i16 s2, s0 2593; GFX8-NEXT: s_sext_i32_i16 s3, 0 2594; GFX8-NEXT: s_max_i32 s4, s2, s3 2595; GFX8-NEXT: s_min_i32 s2, s2, s3 2596; GFX8-NEXT: s_sub_i32 s2, 
0xffff8000, s2 2597; GFX8-NEXT: s_sext_i32_i16 s2, s2 2598; GFX8-NEXT: s_sext_i32_i16 s1, s1 2599; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 2600; GFX8-NEXT: s_max_i32 s1, s2, s1 2601; GFX8-NEXT: s_sext_i32_i16 s1, s1 2602; GFX8-NEXT: s_sext_i32_i16 s2, s4 2603; GFX8-NEXT: s_min_i32 s1, s1, s2 2604; GFX8-NEXT: s_add_i32 s0, s0, s1 2605; GFX8-NEXT: ; return to shader part epilog 2606; 2607; GFX9-LABEL: s_saddsat_i16: 2608; GFX9: ; %bb.0: 2609; GFX9-NEXT: v_mov_b32_e32 v0, s1 2610; GFX9-NEXT: v_add_i16 v0, s0, v0 clamp 2611; GFX9-NEXT: v_readfirstlane_b32 s0, v0 2612; GFX9-NEXT: ; return to shader part epilog 2613; 2614; GFX10-LABEL: s_saddsat_i16: 2615; GFX10: ; %bb.0: 2616; GFX10-NEXT: v_add_nc_i16 v0, s0, s1 clamp 2617; GFX10-NEXT: v_readfirstlane_b32 s0, v0 2618; GFX10-NEXT: ; return to shader part epilog 2619 %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs) 2620 ret i16 %result 2621} 2622 2623define amdgpu_ps half @saddsat_i16_sv(i16 inreg %lhs, i16 %rhs) { 2624; GFX6-LABEL: saddsat_i16_sv: 2625; GFX6: ; %bb.0: 2626; GFX6-NEXT: s_lshl_b32 s0, s0, 16 2627; GFX6-NEXT: s_min_i32 s2, s0, 0 2628; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2629; GFX6-NEXT: s_max_i32 s1, s0, 0 2630; GFX6-NEXT: s_sub_i32 s2, 0x80000000, s2 2631; GFX6-NEXT: s_sub_i32 s1, 0x7fffffff, s1 2632; GFX6-NEXT: v_max_i32_e32 v0, s2, v0 2633; GFX6-NEXT: v_min_i32_e32 v0, s1, v0 2634; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 2635; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 2636; GFX6-NEXT: ; return to shader part epilog 2637; 2638; GFX8-LABEL: saddsat_i16_sv: 2639; GFX8: ; %bb.0: 2640; GFX8-NEXT: s_sext_i32_i16 s1, s0 2641; GFX8-NEXT: s_sext_i32_i16 s2, 0 2642; GFX8-NEXT: s_max_i32 s3, s1, s2 2643; GFX8-NEXT: s_min_i32 s1, s1, s2 2644; GFX8-NEXT: s_sub_i32 s1, 0xffff8000, s1 2645; GFX8-NEXT: s_sub_i32 s3, 0x7fff, s3 2646; GFX8-NEXT: v_max_i16_e32 v0, s1, v0 2647; GFX8-NEXT: v_min_i16_e32 v0, s3, v0 2648; GFX8-NEXT: v_add_u16_e32 v0, s0, v0 2649; GFX8-NEXT: ; return to shader part epilog 2650; 2651; 
GFX9-LABEL: saddsat_i16_sv: 2652; GFX9: ; %bb.0: 2653; GFX9-NEXT: v_add_i16 v0, s0, v0 clamp 2654; GFX9-NEXT: ; return to shader part epilog 2655; 2656; GFX10-LABEL: saddsat_i16_sv: 2657; GFX10: ; %bb.0: 2658; GFX10-NEXT: v_add_nc_i16 v0, s0, v0 clamp 2659; GFX10-NEXT: ; return to shader part epilog 2660 %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs) 2661 %cast = bitcast i16 %result to half 2662 ret half %cast 2663} 2664 2665define amdgpu_ps half @saddsat_i16_vs(i16 %lhs, i16 inreg %rhs) { 2666; GFX6-LABEL: saddsat_i16_vs: 2667; GFX6: ; %bb.0: 2668; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2669; GFX6-NEXT: v_min_i32_e32 v2, 0, v0 2670; GFX6-NEXT: s_lshl_b32 s0, s0, 16 2671; GFX6-NEXT: v_max_i32_e32 v1, 0, v0 2672; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0x80000000, v2 2673; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0x7fffffff, v1 2674; GFX6-NEXT: v_max_i32_e32 v2, s0, v2 2675; GFX6-NEXT: v_min_i32_e32 v1, v2, v1 2676; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 2677; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 2678; GFX6-NEXT: ; return to shader part epilog 2679; 2680; GFX8-LABEL: saddsat_i16_vs: 2681; GFX8: ; %bb.0: 2682; GFX8-NEXT: v_min_i16_e32 v2, 0, v0 2683; GFX8-NEXT: v_max_i16_e32 v1, 0, v0 2684; GFX8-NEXT: v_sub_u16_e32 v2, 0x8000, v2 2685; GFX8-NEXT: v_sub_u16_e32 v1, 0x7fff, v1 2686; GFX8-NEXT: v_max_i16_e32 v2, s0, v2 2687; GFX8-NEXT: v_min_i16_e32 v1, v2, v1 2688; GFX8-NEXT: v_add_u16_e32 v0, v0, v1 2689; GFX8-NEXT: ; return to shader part epilog 2690; 2691; GFX9-LABEL: saddsat_i16_vs: 2692; GFX9: ; %bb.0: 2693; GFX9-NEXT: v_add_i16 v0, v0, s0 clamp 2694; GFX9-NEXT: ; return to shader part epilog 2695; 2696; GFX10-LABEL: saddsat_i16_vs: 2697; GFX10: ; %bb.0: 2698; GFX10-NEXT: v_add_nc_i16 v0, v0, s0 clamp 2699; GFX10-NEXT: ; return to shader part epilog 2700 %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs) 2701 %cast = bitcast i16 %result to half 2702 ret half %cast 2703} 2704 2705define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { 2706; 
GFX6-LABEL: v_saddsat_v2i16: 2707; GFX6: ; %bb.0: 2708; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2709; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2710; GFX6-NEXT: s_brev_b32 s5, 1 2711; GFX6-NEXT: v_min_i32_e32 v5, 0, v0 2712; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2713; GFX6-NEXT: s_brev_b32 s4, -2 2714; GFX6-NEXT: v_max_i32_e32 v4, 0, v0 2715; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v5 2716; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4 2717; GFX6-NEXT: v_max_i32_e32 v2, v5, v2 2718; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2719; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 2720; GFX6-NEXT: v_min_i32_e32 v4, 0, v1 2721; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 2722; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 2723; GFX6-NEXT: v_max_i32_e32 v3, 0, v1 2724; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v4 2725; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 2726; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 2727; GFX6-NEXT: v_min_i32_e32 v2, v2, v3 2728; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 2729; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 2730; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 2731; GFX6-NEXT: s_setpc_b64 s[30:31] 2732; 2733; GFX8-LABEL: v_saddsat_v2i16: 2734; GFX8: ; %bb.0: 2735; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2736; GFX8-NEXT: s_movk_i32 s5, 0x8000 2737; GFX8-NEXT: v_min_i16_e32 v4, 0, v0 2738; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 2739; GFX8-NEXT: s_movk_i32 s4, 0x7fff 2740; GFX8-NEXT: v_max_i16_e32 v3, 0, v0 2741; GFX8-NEXT: v_sub_u16_e32 v4, s5, v4 2742; GFX8-NEXT: v_sub_u16_e32 v3, s4, v3 2743; GFX8-NEXT: v_max_i16_e32 v4, v4, v1 2744; GFX8-NEXT: v_min_i16_e32 v5, 0, v2 2745; GFX8-NEXT: v_min_i16_e32 v3, v4, v3 2746; GFX8-NEXT: v_max_i16_e32 v4, 0, v2 2747; GFX8-NEXT: v_sub_u16_e32 v5, s5, v5 2748; GFX8-NEXT: v_sub_u16_e32 v4, s4, v4 2749; GFX8-NEXT: v_max_i16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2750; GFX8-NEXT: v_min_i16_e32 v1, v1, v4 2751; GFX8-NEXT: v_add_u16_e32 v0, v0, v3 2752; GFX8-NEXT: v_add_u16_sdwa v1, v2, v1 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2753; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 2754; GFX8-NEXT: s_setpc_b64 s[30:31] 2755; 2756; GFX9-LABEL: v_saddsat_v2i16: 2757; GFX9: ; %bb.0: 2758; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2759; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp 2760; GFX9-NEXT: s_setpc_b64 s[30:31] 2761; 2762; GFX10-LABEL: v_saddsat_v2i16: 2763; GFX10: ; %bb.0: 2764; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2765; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2766; GFX10-NEXT: v_pk_add_i16 v0, v0, v1 clamp 2767; GFX10-NEXT: s_setpc_b64 s[30:31] 2768 %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) 2769 ret <2 x i16> %result 2770} 2771 2772define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) { 2773; GFX6-LABEL: s_saddsat_v2i16: 2774; GFX6: ; %bb.0: 2775; GFX6-NEXT: s_lshl_b32 s0, s0, 16 2776; GFX6-NEXT: s_brev_b32 s5, 1 2777; GFX6-NEXT: s_min_i32 s7, s0, 0 2778; GFX6-NEXT: s_lshl_b32 s2, s2, 16 2779; GFX6-NEXT: s_brev_b32 s4, -2 2780; GFX6-NEXT: s_max_i32 s6, s0, 0 2781; GFX6-NEXT: s_sub_i32 s7, s5, s7 2782; GFX6-NEXT: s_sub_i32 s6, s4, s6 2783; GFX6-NEXT: s_max_i32 s2, s7, s2 2784; GFX6-NEXT: s_min_i32 s2, s2, s6 2785; GFX6-NEXT: s_lshl_b32 s1, s1, 16 2786; GFX6-NEXT: s_add_i32 s0, s0, s2 2787; GFX6-NEXT: s_lshl_b32 s2, s3, 16 2788; GFX6-NEXT: s_max_i32 s3, s1, 0 2789; GFX6-NEXT: s_sub_i32 s3, s4, s3 2790; GFX6-NEXT: s_min_i32 s4, s1, 0 2791; GFX6-NEXT: s_sub_i32 s4, s5, s4 2792; GFX6-NEXT: s_max_i32 s2, s4, s2 2793; GFX6-NEXT: s_min_i32 s2, s2, s3 2794; GFX6-NEXT: s_add_i32 s1, s1, s2 2795; GFX6-NEXT: s_ashr_i32 s1, s1, 16 2796; GFX6-NEXT: s_mov_b32 s2, 0xffff 2797; GFX6-NEXT: s_ashr_i32 s0, s0, 16 2798; GFX6-NEXT: s_and_b32 s1, s1, s2 2799; GFX6-NEXT: s_and_b32 s0, s0, s2 2800; GFX6-NEXT: s_lshl_b32 s1, s1, 16 2801; GFX6-NEXT: s_or_b32 s0, s0, s1 2802; GFX6-NEXT: ; return to shader part epilog 2803; 2804; GFX8-LABEL: s_saddsat_v2i16: 2805; GFX8: ; %bb.0: 
2806; GFX8-NEXT: s_sext_i32_i16 s6, s0 2807; GFX8-NEXT: s_sext_i32_i16 s7, 0 2808; GFX8-NEXT: s_movk_i32 s5, 0x8000 2809; GFX8-NEXT: s_max_i32 s8, s6, s7 2810; GFX8-NEXT: s_min_i32 s6, s6, s7 2811; GFX8-NEXT: s_sub_i32 s6, s5, s6 2812; GFX8-NEXT: s_lshr_b32 s3, s1, 16 2813; GFX8-NEXT: s_movk_i32 s4, 0x7fff 2814; GFX8-NEXT: s_sext_i32_i16 s6, s6 2815; GFX8-NEXT: s_sext_i32_i16 s1, s1 2816; GFX8-NEXT: s_sub_i32 s8, s4, s8 2817; GFX8-NEXT: s_max_i32 s1, s6, s1 2818; GFX8-NEXT: s_sext_i32_i16 s1, s1 2819; GFX8-NEXT: s_sext_i32_i16 s6, s8 2820; GFX8-NEXT: s_lshr_b32 s2, s0, 16 2821; GFX8-NEXT: s_min_i32 s1, s1, s6 2822; GFX8-NEXT: s_add_i32 s0, s0, s1 2823; GFX8-NEXT: s_sext_i32_i16 s1, s2 2824; GFX8-NEXT: s_max_i32 s6, s1, s7 2825; GFX8-NEXT: s_min_i32 s1, s1, s7 2826; GFX8-NEXT: s_sub_i32 s1, s5, s1 2827; GFX8-NEXT: s_sext_i32_i16 s1, s1 2828; GFX8-NEXT: s_sext_i32_i16 s3, s3 2829; GFX8-NEXT: s_sub_i32 s4, s4, s6 2830; GFX8-NEXT: s_max_i32 s1, s1, s3 2831; GFX8-NEXT: s_sext_i32_i16 s1, s1 2832; GFX8-NEXT: s_sext_i32_i16 s3, s4 2833; GFX8-NEXT: s_min_i32 s1, s1, s3 2834; GFX8-NEXT: s_add_i32 s2, s2, s1 2835; GFX8-NEXT: s_bfe_u32 s1, s2, 0x100000 2836; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 2837; GFX8-NEXT: s_lshl_b32 s1, s1, 16 2838; GFX8-NEXT: s_or_b32 s0, s0, s1 2839; GFX8-NEXT: ; return to shader part epilog 2840; 2841; GFX9-LABEL: s_saddsat_v2i16: 2842; GFX9: ; %bb.0: 2843; GFX9-NEXT: v_mov_b32_e32 v0, s1 2844; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp 2845; GFX9-NEXT: v_readfirstlane_b32 s0, v0 2846; GFX9-NEXT: ; return to shader part epilog 2847; 2848; GFX10-LABEL: s_saddsat_v2i16: 2849; GFX10: ; %bb.0: 2850; GFX10-NEXT: v_pk_add_i16 v0, s0, s1 clamp 2851; GFX10-NEXT: v_readfirstlane_b32 s0, v0 2852; GFX10-NEXT: ; return to shader part epilog 2853 %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) 2854 %cast = bitcast <2 x i16> %result to i32 2855 ret i32 %cast 2856} 2857 2858define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, 
<2 x i16> %rhs) { 2859; GFX6-LABEL: saddsat_v2i16_sv: 2860; GFX6: ; %bb.0: 2861; GFX6-NEXT: s_lshl_b32 s0, s0, 16 2862; GFX6-NEXT: s_brev_b32 s3, 1 2863; GFX6-NEXT: s_min_i32 s5, s0, 0 2864; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2865; GFX6-NEXT: s_brev_b32 s2, -2 2866; GFX6-NEXT: s_max_i32 s4, s0, 0 2867; GFX6-NEXT: s_sub_i32 s5, s3, s5 2868; GFX6-NEXT: s_sub_i32 s4, s2, s4 2869; GFX6-NEXT: v_max_i32_e32 v0, s5, v0 2870; GFX6-NEXT: v_min_i32_e32 v0, s4, v0 2871; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 2872; GFX6-NEXT: s_lshl_b32 s0, s1, 16 2873; GFX6-NEXT: s_max_i32 s1, s0, 0 2874; GFX6-NEXT: s_sub_i32 s1, s2, s1 2875; GFX6-NEXT: s_min_i32 s2, s0, 0 2876; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2877; GFX6-NEXT: s_sub_i32 s2, s3, s2 2878; GFX6-NEXT: v_max_i32_e32 v1, s2, v1 2879; GFX6-NEXT: v_min_i32_e32 v1, s1, v1 2880; GFX6-NEXT: v_add_i32_e32 v1, vcc, s0, v1 2881; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 2882; GFX6-NEXT: s_mov_b32 s0, 0xffff 2883; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 2884; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 2885; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 2886; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2887; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 2888; GFX6-NEXT: ; return to shader part epilog 2889; 2890; GFX8-LABEL: saddsat_v2i16_sv: 2891; GFX8: ; %bb.0: 2892; GFX8-NEXT: s_sext_i32_i16 s4, s0 2893; GFX8-NEXT: s_sext_i32_i16 s5, 0 2894; GFX8-NEXT: s_movk_i32 s3, 0x8000 2895; GFX8-NEXT: s_max_i32 s6, s4, s5 2896; GFX8-NEXT: s_min_i32 s4, s4, s5 2897; GFX8-NEXT: s_lshr_b32 s1, s0, 16 2898; GFX8-NEXT: s_movk_i32 s2, 0x7fff 2899; GFX8-NEXT: s_sub_i32 s4, s3, s4 2900; GFX8-NEXT: s_sub_i32 s6, s2, s6 2901; GFX8-NEXT: v_max_i16_e32 v1, s4, v0 2902; GFX8-NEXT: s_sext_i32_i16 s4, s1 2903; GFX8-NEXT: v_min_i16_e32 v1, s6, v1 2904; GFX8-NEXT: s_max_i32 s6, s4, s5 2905; GFX8-NEXT: s_min_i32 s4, s4, s5 2906; GFX8-NEXT: s_sub_i32 s3, s3, s4 2907; GFX8-NEXT: v_mov_b32_e32 v2, s3 2908; GFX8-NEXT: s_sub_i32 s2, s2, s6 2909; GFX8-NEXT: v_max_i16_sdwa v0, v2, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2910; GFX8-NEXT: v_min_i16_e32 v0, s2, v0 2911; GFX8-NEXT: v_mov_b32_e32 v2, s1 2912; GFX8-NEXT: v_add_u16_e32 v1, s0, v1 2913; GFX8-NEXT: v_add_u16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2914; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 2915; GFX8-NEXT: ; return to shader part epilog 2916; 2917; GFX9-LABEL: saddsat_v2i16_sv: 2918; GFX9: ; %bb.0: 2919; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp 2920; GFX9-NEXT: ; return to shader part epilog 2921; 2922; GFX10-LABEL: saddsat_v2i16_sv: 2923; GFX10: ; %bb.0: 2924; GFX10-NEXT: v_pk_add_i16 v0, s0, v0 clamp 2925; GFX10-NEXT: ; return to shader part epilog 2926 %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) 2927 %cast = bitcast <2 x i16> %result to float 2928 ret float %cast 2929} 2930 2931define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { 2932; GFX6-LABEL: saddsat_v2i16_vs: 2933; GFX6: ; %bb.0: 2934; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2935; GFX6-NEXT: s_brev_b32 s3, 1 2936; GFX6-NEXT: v_min_i32_e32 v3, 0, v0 2937; GFX6-NEXT: s_lshl_b32 s0, s0, 16 2938; GFX6-NEXT: s_brev_b32 s2, -2 2939; GFX6-NEXT: v_max_i32_e32 v2, 0, v0 2940; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s3, v3 2941; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 2942; GFX6-NEXT: v_max_i32_e32 v3, s0, v3 2943; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2944; GFX6-NEXT: v_min_i32_e32 v2, v3, v2 2945; GFX6-NEXT: v_min_i32_e32 v3, 0, v1 2946; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 2947; GFX6-NEXT: s_lshl_b32 s0, s1, 16 2948; GFX6-NEXT: v_max_i32_e32 v2, 0, v1 2949; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s3, v3 2950; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 2951; GFX6-NEXT: v_max_i32_e32 v3, s0, v3 2952; GFX6-NEXT: v_min_i32_e32 v2, v3, v2 2953; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 2954; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 2955; GFX6-NEXT: s_mov_b32 s0, 0xffff 2956; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 
2957; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 2958; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 2959; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2960; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 2961; GFX6-NEXT: ; return to shader part epilog 2962; 2963; GFX8-LABEL: saddsat_v2i16_vs: 2964; GFX8: ; %bb.0: 2965; GFX8-NEXT: s_movk_i32 s3, 0x8000 2966; GFX8-NEXT: v_min_i16_e32 v3, 0, v0 2967; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 2968; GFX8-NEXT: s_movk_i32 s2, 0x7fff 2969; GFX8-NEXT: v_max_i16_e32 v2, 0, v0 2970; GFX8-NEXT: v_sub_u16_e32 v3, s3, v3 2971; GFX8-NEXT: v_sub_u16_e32 v2, s2, v2 2972; GFX8-NEXT: v_max_i16_e32 v3, s0, v3 2973; GFX8-NEXT: v_min_i16_e32 v4, 0, v1 2974; GFX8-NEXT: s_lshr_b32 s1, s0, 16 2975; GFX8-NEXT: v_min_i16_e32 v2, v3, v2 2976; GFX8-NEXT: v_max_i16_e32 v3, 0, v1 2977; GFX8-NEXT: v_sub_u16_e32 v4, s3, v4 2978; GFX8-NEXT: v_sub_u16_e32 v3, s2, v3 2979; GFX8-NEXT: v_max_i16_e32 v4, s1, v4 2980; GFX8-NEXT: v_min_i16_e32 v3, v4, v3 2981; GFX8-NEXT: v_add_u16_e32 v0, v0, v2 2982; GFX8-NEXT: v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2983; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 2984; GFX8-NEXT: ; return to shader part epilog 2985; 2986; GFX9-LABEL: saddsat_v2i16_vs: 2987; GFX9: ; %bb.0: 2988; GFX9-NEXT: v_pk_add_i16 v0, v0, s0 clamp 2989; GFX9-NEXT: ; return to shader part epilog 2990; 2991; GFX10-LABEL: saddsat_v2i16_vs: 2992; GFX10: ; %bb.0: 2993; GFX10-NEXT: v_pk_add_i16 v0, v0, s0 clamp 2994; GFX10-NEXT: ; return to shader part epilog 2995 %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) 2996 %cast = bitcast <2 x i16> %result to float 2997 ret float %cast 2998} 2999 3000; FIXME: v3i16 insert/extract 3001; define <3 x i16> @v_saddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) { 3002; %result = call <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs) 3003; ret <3 x i16> %result 3004; } 3005 3006; define amdgpu_ps <3 x i16> @s_saddsat_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs) { 
3007; %result = call <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs) 3008; ret <3 x i16> %result 3009; } 3010 3011define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { 3012; GFX6-LABEL: v_saddsat_v4i16: 3013; GFX6: ; %bb.0: 3014; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3015; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 3016; GFX6-NEXT: s_brev_b32 s5, 1 3017; GFX6-NEXT: v_min_i32_e32 v10, 0, v0 3018; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 3019; GFX6-NEXT: s_brev_b32 s4, -2 3020; GFX6-NEXT: v_max_i32_e32 v8, 0, v0 3021; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s5, v10 3022; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s4, v8 3023; GFX6-NEXT: v_max_i32_e32 v4, v10, v4 3024; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3025; GFX6-NEXT: v_min_i32_e32 v4, v4, v8 3026; GFX6-NEXT: v_min_i32_e32 v8, 0, v1 3027; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 3028; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 3029; GFX6-NEXT: v_max_i32_e32 v5, 0, v1 3030; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s5, v8 3031; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v5 3032; GFX6-NEXT: v_max_i32_e32 v4, v8, v4 3033; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 3034; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3035; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 3036; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 3037; GFX6-NEXT: v_min_i32_e32 v6, 0, v2 3038; GFX6-NEXT: v_bfrev_b32_e32 v9, -2 3039; GFX6-NEXT: v_max_i32_e32 v5, 0, v2 3040; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s5, v6 3041; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 3042; GFX6-NEXT: v_max_i32_e32 v4, v6, v4 3043; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 3044; GFX6-NEXT: v_bfrev_b32_e32 v11, 1 3045; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 3046; GFX6-NEXT: v_min_i32_e32 v6, 0, v3 3047; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 3048; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 3049; GFX6-NEXT: v_max_i32_e32 v5, 0, v3 3050; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v11, v6 3051; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 3052; GFX6-NEXT: v_max_i32_e32 v4, v6, v4 3053; GFX6-NEXT: 
v_ashrrev_i32_e32 v1, 16, v1 3054; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 3055; GFX6-NEXT: s_mov_b32 s4, 0xffff 3056; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 3057; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 3058; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 3059; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 3060; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 3061; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 3062; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3063; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 3064; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 3065; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 3066; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3067; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 3068; GFX6-NEXT: s_setpc_b64 s[30:31] 3069; 3070; GFX8-LABEL: v_saddsat_v4i16: 3071; GFX8: ; %bb.0: 3072; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3073; GFX8-NEXT: s_movk_i32 s5, 0x8000 3074; GFX8-NEXT: v_min_i16_e32 v7, 0, v0 3075; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 3076; GFX8-NEXT: s_movk_i32 s4, 0x7fff 3077; GFX8-NEXT: v_max_i16_e32 v6, 0, v0 3078; GFX8-NEXT: v_sub_u16_e32 v7, s5, v7 3079; GFX8-NEXT: v_sub_u16_e32 v6, s4, v6 3080; GFX8-NEXT: v_max_i16_e32 v7, v7, v2 3081; GFX8-NEXT: v_min_i16_e32 v8, 0, v4 3082; GFX8-NEXT: v_min_i16_e32 v6, v7, v6 3083; GFX8-NEXT: v_max_i16_e32 v7, 0, v4 3084; GFX8-NEXT: v_sub_u16_e32 v8, s5, v8 3085; GFX8-NEXT: v_sub_u16_e32 v7, s4, v7 3086; GFX8-NEXT: v_max_i16_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 3087; GFX8-NEXT: v_min_i16_e32 v8, 0, v1 3088; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 3089; GFX8-NEXT: v_min_i16_e32 v2, v2, v7 3090; GFX8-NEXT: v_max_i16_e32 v7, 0, v1 3091; GFX8-NEXT: v_sub_u16_e32 v8, s5, v8 3092; GFX8-NEXT: v_sub_u16_e32 v7, s4, v7 3093; GFX8-NEXT: v_max_i16_e32 v8, v8, v3 3094; GFX8-NEXT: v_min_i16_e32 v9, 0, v5 3095; GFX8-NEXT: v_min_i16_e32 v7, v8, v7 3096; GFX8-NEXT: v_max_i16_e32 v8, 0, v5 3097; GFX8-NEXT: v_sub_u16_e32 v9, s5, v9 3098; GFX8-NEXT: v_sub_u16_e32 v8, s4, v8 3099; GFX8-NEXT: v_max_i16_sdwa v3, v9, v3 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 3100; GFX8-NEXT: v_min_i16_e32 v3, v3, v8 3101; GFX8-NEXT: v_add_u16_e32 v0, v0, v6 3102; GFX8-NEXT: v_add_u16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 3103; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 3104; GFX8-NEXT: v_add_u16_e32 v1, v1, v7 3105; GFX8-NEXT: v_add_u16_sdwa v2, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 3106; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 3107; GFX8-NEXT: s_setpc_b64 s[30:31] 3108; 3109; GFX9-LABEL: v_saddsat_v4i16: 3110; GFX9: ; %bb.0: 3111; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3112; GFX9-NEXT: v_pk_add_i16 v0, v0, v2 clamp 3113; GFX9-NEXT: v_pk_add_i16 v1, v1, v3 clamp 3114; GFX9-NEXT: s_setpc_b64 s[30:31] 3115; 3116; GFX10-LABEL: v_saddsat_v4i16: 3117; GFX10: ; %bb.0: 3118; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3119; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3120; GFX10-NEXT: v_pk_add_i16 v0, v0, v2 clamp 3121; GFX10-NEXT: v_pk_add_i16 v1, v1, v3 clamp 3122; GFX10-NEXT: s_setpc_b64 s[30:31] 3123 %result = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) 3124 %cast = bitcast <4 x i16> %result to <2 x float> 3125 ret <2 x float> %cast 3126} 3127 3128define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs) { 3129; GFX6-LABEL: s_saddsat_v4i16: 3130; GFX6: ; %bb.0: 3131; GFX6-NEXT: s_lshl_b32 s0, s0, 16 3132; GFX6-NEXT: s_brev_b32 s9, 1 3133; GFX6-NEXT: s_min_i32 s11, s0, 0 3134; GFX6-NEXT: s_lshl_b32 s4, s4, 16 3135; GFX6-NEXT: s_brev_b32 s8, -2 3136; GFX6-NEXT: s_max_i32 s10, s0, 0 3137; GFX6-NEXT: s_sub_i32 s11, s9, s11 3138; GFX6-NEXT: s_sub_i32 s10, s8, s10 3139; GFX6-NEXT: s_max_i32 s4, s11, s4 3140; GFX6-NEXT: s_lshl_b32 s1, s1, 16 3141; GFX6-NEXT: s_min_i32 s4, s4, s10 3142; GFX6-NEXT: s_min_i32 s10, s1, 0 3143; GFX6-NEXT: s_add_i32 s0, s0, s4 3144; GFX6-NEXT: s_lshl_b32 s4, s5, 16 3145; GFX6-NEXT: s_max_i32 s5, s1, 0 3146; GFX6-NEXT: 
s_sub_i32 s10, s9, s10 3147; GFX6-NEXT: s_sub_i32 s5, s8, s5 3148; GFX6-NEXT: s_max_i32 s4, s10, s4 3149; GFX6-NEXT: s_min_i32 s4, s4, s5 3150; GFX6-NEXT: s_lshl_b32 s2, s2, 16 3151; GFX6-NEXT: s_add_i32 s1, s1, s4 3152; GFX6-NEXT: s_lshl_b32 s4, s6, 16 3153; GFX6-NEXT: s_min_i32 s6, s2, 0 3154; GFX6-NEXT: s_max_i32 s5, s2, 0 3155; GFX6-NEXT: s_sub_i32 s6, s9, s6 3156; GFX6-NEXT: s_sub_i32 s5, s8, s5 3157; GFX6-NEXT: s_max_i32 s4, s6, s4 3158; GFX6-NEXT: s_lshl_b32 s3, s3, 16 3159; GFX6-NEXT: s_min_i32 s4, s4, s5 3160; GFX6-NEXT: s_min_i32 s6, s3, 0 3161; GFX6-NEXT: s_add_i32 s2, s2, s4 3162; GFX6-NEXT: s_lshl_b32 s4, s7, 16 3163; GFX6-NEXT: s_max_i32 s5, s3, 0 3164; GFX6-NEXT: s_sub_i32 s6, s9, s6 3165; GFX6-NEXT: s_sub_i32 s5, s8, s5 3166; GFX6-NEXT: s_max_i32 s4, s6, s4 3167; GFX6-NEXT: s_min_i32 s4, s4, s5 3168; GFX6-NEXT: s_ashr_i32 s1, s1, 16 3169; GFX6-NEXT: s_add_i32 s3, s3, s4 3170; GFX6-NEXT: s_mov_b32 s4, 0xffff 3171; GFX6-NEXT: s_ashr_i32 s0, s0, 16 3172; GFX6-NEXT: s_and_b32 s1, s1, s4 3173; GFX6-NEXT: s_ashr_i32 s2, s2, 16 3174; GFX6-NEXT: s_ashr_i32 s3, s3, 16 3175; GFX6-NEXT: s_and_b32 s0, s0, s4 3176; GFX6-NEXT: s_lshl_b32 s1, s1, 16 3177; GFX6-NEXT: s_or_b32 s0, s0, s1 3178; GFX6-NEXT: s_and_b32 s1, s2, s4 3179; GFX6-NEXT: s_and_b32 s2, s3, s4 3180; GFX6-NEXT: s_lshl_b32 s2, s2, 16 3181; GFX6-NEXT: s_or_b32 s1, s1, s2 3182; GFX6-NEXT: ; return to shader part epilog 3183; 3184; GFX8-LABEL: s_saddsat_v4i16: 3185; GFX8: ; %bb.0: 3186; GFX8-NEXT: s_sext_i32_i16 s10, s0 3187; GFX8-NEXT: s_sext_i32_i16 s11, 0 3188; GFX8-NEXT: s_movk_i32 s9, 0x8000 3189; GFX8-NEXT: s_max_i32 s12, s10, s11 3190; GFX8-NEXT: s_min_i32 s10, s10, s11 3191; GFX8-NEXT: s_sub_i32 s10, s9, s10 3192; GFX8-NEXT: s_lshr_b32 s6, s2, 16 3193; GFX8-NEXT: s_movk_i32 s8, 0x7fff 3194; GFX8-NEXT: s_sext_i32_i16 s10, s10 3195; GFX8-NEXT: s_sext_i32_i16 s2, s2 3196; GFX8-NEXT: s_sub_i32 s12, s8, s12 3197; GFX8-NEXT: s_max_i32 s2, s10, s2 3198; GFX8-NEXT: s_sext_i32_i16 s2, s2 3199; 
GFX8-NEXT: s_sext_i32_i16 s10, s12 3200; GFX8-NEXT: s_lshr_b32 s4, s0, 16 3201; GFX8-NEXT: s_min_i32 s2, s2, s10 3202; GFX8-NEXT: s_add_i32 s0, s0, s2 3203; GFX8-NEXT: s_sext_i32_i16 s2, s4 3204; GFX8-NEXT: s_max_i32 s10, s2, s11 3205; GFX8-NEXT: s_min_i32 s2, s2, s11 3206; GFX8-NEXT: s_sub_i32 s2, s9, s2 3207; GFX8-NEXT: s_sext_i32_i16 s2, s2 3208; GFX8-NEXT: s_sext_i32_i16 s6, s6 3209; GFX8-NEXT: s_sub_i32 s10, s8, s10 3210; GFX8-NEXT: s_max_i32 s2, s2, s6 3211; GFX8-NEXT: s_sext_i32_i16 s2, s2 3212; GFX8-NEXT: s_sext_i32_i16 s6, s10 3213; GFX8-NEXT: s_min_i32 s2, s2, s6 3214; GFX8-NEXT: s_add_i32 s4, s4, s2 3215; GFX8-NEXT: s_sext_i32_i16 s2, s1 3216; GFX8-NEXT: s_max_i32 s6, s2, s11 3217; GFX8-NEXT: s_min_i32 s2, s2, s11 3218; GFX8-NEXT: s_sub_i32 s2, s9, s2 3219; GFX8-NEXT: s_lshr_b32 s7, s3, 16 3220; GFX8-NEXT: s_sext_i32_i16 s2, s2 3221; GFX8-NEXT: s_sext_i32_i16 s3, s3 3222; GFX8-NEXT: s_sub_i32 s6, s8, s6 3223; GFX8-NEXT: s_max_i32 s2, s2, s3 3224; GFX8-NEXT: s_sext_i32_i16 s2, s2 3225; GFX8-NEXT: s_sext_i32_i16 s3, s6 3226; GFX8-NEXT: s_lshr_b32 s5, s1, 16 3227; GFX8-NEXT: s_min_i32 s2, s2, s3 3228; GFX8-NEXT: s_add_i32 s1, s1, s2 3229; GFX8-NEXT: s_sext_i32_i16 s2, s5 3230; GFX8-NEXT: s_max_i32 s3, s2, s11 3231; GFX8-NEXT: s_min_i32 s2, s2, s11 3232; GFX8-NEXT: s_sub_i32 s2, s9, s2 3233; GFX8-NEXT: s_sext_i32_i16 s2, s2 3234; GFX8-NEXT: s_sext_i32_i16 s6, s7 3235; GFX8-NEXT: s_sub_i32 s3, s8, s3 3236; GFX8-NEXT: s_max_i32 s2, s2, s6 3237; GFX8-NEXT: s_sext_i32_i16 s2, s2 3238; GFX8-NEXT: s_sext_i32_i16 s3, s3 3239; GFX8-NEXT: s_min_i32 s2, s2, s3 3240; GFX8-NEXT: s_add_i32 s5, s5, s2 3241; GFX8-NEXT: s_bfe_u32 s2, s4, 0x100000 3242; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 3243; GFX8-NEXT: s_lshl_b32 s2, s2, 16 3244; GFX8-NEXT: s_or_b32 s0, s0, s2 3245; GFX8-NEXT: s_bfe_u32 s2, s5, 0x100000 3246; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 3247; GFX8-NEXT: s_lshl_b32 s2, s2, 16 3248; GFX8-NEXT: s_or_b32 s1, s1, s2 3249; GFX8-NEXT: ; return to shader part epilog 
3250; 3251; GFX9-LABEL: s_saddsat_v4i16: 3252; GFX9: ; %bb.0: 3253; GFX9-NEXT: v_mov_b32_e32 v0, s2 3254; GFX9-NEXT: v_mov_b32_e32 v1, s3 3255; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp 3256; GFX9-NEXT: v_pk_add_i16 v1, s1, v1 clamp 3257; GFX9-NEXT: v_readfirstlane_b32 s0, v0 3258; GFX9-NEXT: v_readfirstlane_b32 s1, v1 3259; GFX9-NEXT: ; return to shader part epilog 3260; 3261; GFX10-LABEL: s_saddsat_v4i16: 3262; GFX10: ; %bb.0: 3263; GFX10-NEXT: v_pk_add_i16 v0, s0, s2 clamp 3264; GFX10-NEXT: v_pk_add_i16 v1, s1, s3 clamp 3265; GFX10-NEXT: v_readfirstlane_b32 s0, v0 3266; GFX10-NEXT: v_readfirstlane_b32 s1, v1 3267; GFX10-NEXT: ; return to shader part epilog 3268 %result = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) 3269 %cast = bitcast <4 x i16> %result to <2 x i32> 3270 ret <2 x i32> %cast 3271} 3272 3273; FIXME 3274; define <5 x i16> @v_saddsat_v5i16(<5 x i16> %lhs, <5 x i16> %rhs) { 3275; %result = call <5 x i16> @llvm.sadd.sat.v5i16(<5 x i16> %lhs, <5 x i16> %rhs) 3276; ret <5 x i16> %result 3277; } 3278 3279; define amdgpu_ps <5 x i16> @s_saddsat_v5i16(<5 x i16> inreg %lhs, <5 x i16> inreg %rhs) { 3280; %result = call <5 x i16> @llvm.sadd.sat.v5i16(<5 x i16> %lhs, <5 x i16> %rhs) 3281; ret <5 x i16> %result 3282; } 3283 3284define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { 3285; GFX6-LABEL: v_saddsat_v6i16: 3286; GFX6: ; %bb.0: 3287; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3288; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 3289; GFX6-NEXT: s_brev_b32 s5, 1 3290; GFX6-NEXT: v_min_i32_e32 v14, 0, v0 3291; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 3292; GFX6-NEXT: s_brev_b32 s4, -2 3293; GFX6-NEXT: v_max_i32_e32 v12, 0, v0 3294; GFX6-NEXT: v_sub_i32_e32 v14, vcc, s5, v14 3295; GFX6-NEXT: v_sub_i32_e32 v12, vcc, s4, v12 3296; GFX6-NEXT: v_max_i32_e32 v6, v14, v6 3297; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3298; GFX6-NEXT: v_min_i32_e32 v6, v6, v12 3299; GFX6-NEXT: v_min_i32_e32 v12, 0, v1 3300; GFX6-NEXT: 
v_add_i32_e32 v0, vcc, v0, v6 3301; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v7 3302; GFX6-NEXT: v_max_i32_e32 v7, 0, v1 3303; GFX6-NEXT: v_sub_i32_e32 v12, vcc, s5, v12 3304; GFX6-NEXT: v_sub_i32_e32 v7, vcc, s4, v7 3305; GFX6-NEXT: v_max_i32_e32 v6, v12, v6 3306; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 3307; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3308; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v6 3309; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v8 3310; GFX6-NEXT: v_min_i32_e32 v8, 0, v2 3311; GFX6-NEXT: v_bfrev_b32_e32 v13, -2 3312; GFX6-NEXT: v_max_i32_e32 v7, 0, v2 3313; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s5, v8 3314; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v13, v7 3315; GFX6-NEXT: v_max_i32_e32 v6, v8, v6 3316; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 3317; GFX6-NEXT: v_bfrev_b32_e32 v15, 1 3318; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 3319; GFX6-NEXT: v_min_i32_e32 v8, 0, v3 3320; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 3321; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v9 3322; GFX6-NEXT: v_max_i32_e32 v7, 0, v3 3323; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v15, v8 3324; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v13, v7 3325; GFX6-NEXT: v_max_i32_e32 v6, v8, v6 3326; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 3327; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 3328; GFX6-NEXT: v_min_i32_e32 v8, 0, v4 3329; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 3330; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v10 3331; GFX6-NEXT: v_max_i32_e32 v7, 0, v4 3332; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v15, v8 3333; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v13, v7 3334; GFX6-NEXT: v_max_i32_e32 v6, v8, v6 3335; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 3336; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 3337; GFX6-NEXT: v_min_i32_e32 v8, 0, v5 3338; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 3339; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v11 3340; GFX6-NEXT: v_max_i32_e32 v7, 0, v5 3341; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v15, v8 3342; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 3343; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v13, v7 3344; GFX6-NEXT: v_max_i32_e32 v6, v8, v6 3345; 
GFX6-NEXT: s_mov_b32 s4, 0xffff 3346; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 3347; GFX6-NEXT: v_min_i32_e32 v6, v6, v7 3348; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 3349; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 3350; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 3351; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 3352; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 3353; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3354; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 3355; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 3356; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 3357; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 3358; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 3359; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3360; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 3361; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 3362; GFX6-NEXT: v_and_b32_e32 v2, s4, v4 3363; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 3364; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 3365; GFX6-NEXT: s_setpc_b64 s[30:31] 3366; 3367; GFX8-LABEL: v_saddsat_v6i16: 3368; GFX8: ; %bb.0: 3369; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3370; GFX8-NEXT: s_movk_i32 s5, 0x8000 3371; GFX8-NEXT: v_min_i16_e32 v11, 0, v0 3372; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 3373; GFX8-NEXT: s_movk_i32 s4, 0x7fff 3374; GFX8-NEXT: v_max_i16_e32 v9, 0, v0 3375; GFX8-NEXT: v_sub_u16_e32 v11, s5, v11 3376; GFX8-NEXT: v_sub_u16_e32 v9, s4, v9 3377; GFX8-NEXT: v_max_i16_e32 v11, v11, v3 3378; GFX8-NEXT: v_min_i16_e32 v13, 0, v6 3379; GFX8-NEXT: v_min_i16_e32 v9, v11, v9 3380; GFX8-NEXT: v_max_i16_e32 v11, 0, v6 3381; GFX8-NEXT: v_sub_u16_e32 v13, s5, v13 3382; GFX8-NEXT: v_sub_u16_e32 v11, s4, v11 3383; GFX8-NEXT: v_max_i16_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 3384; GFX8-NEXT: v_min_i16_e32 v13, 0, v1 3385; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1 3386; GFX8-NEXT: v_min_i16_e32 v3, v3, v11 3387; GFX8-NEXT: v_max_i16_e32 v11, 0, v1 3388; GFX8-NEXT: v_sub_u16_e32 v13, s5, v13 3389; GFX8-NEXT: v_sub_u16_e32 v11, s4, v11 3390; GFX8-NEXT: v_max_i16_e32 v13, v13, v4 3391; 
GFX8-NEXT: v_min_i16_e32 v14, 0, v7 3392; GFX8-NEXT: v_min_i16_e32 v11, v13, v11 3393; GFX8-NEXT: v_max_i16_e32 v13, 0, v7 3394; GFX8-NEXT: v_sub_u16_e32 v14, s5, v14 3395; GFX8-NEXT: v_mov_b32_e32 v12, 0xffff8000 3396; GFX8-NEXT: v_sub_u16_e32 v13, s4, v13 3397; GFX8-NEXT: v_max_i16_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 3398; GFX8-NEXT: v_min_i16_e32 v14, 0, v2 3399; GFX8-NEXT: v_mov_b32_e32 v10, 0x7fff 3400; GFX8-NEXT: v_min_i16_e32 v4, v4, v13 3401; GFX8-NEXT: v_max_i16_e32 v13, 0, v2 3402; GFX8-NEXT: v_sub_u16_e32 v14, v12, v14 3403; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2 3404; GFX8-NEXT: v_sub_u16_e32 v13, v10, v13 3405; GFX8-NEXT: v_max_i16_e32 v14, v14, v5 3406; GFX8-NEXT: v_min_i16_e32 v13, v14, v13 3407; GFX8-NEXT: v_max_i16_e32 v14, 0, v8 3408; GFX8-NEXT: v_sub_u16_e32 v10, v10, v14 3409; GFX8-NEXT: v_min_i16_e32 v14, 0, v8 3410; GFX8-NEXT: v_sub_u16_e32 v12, v12, v14 3411; GFX8-NEXT: v_max_i16_sdwa v5, v12, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 3412; GFX8-NEXT: v_add_u16_e32 v0, v0, v9 3413; GFX8-NEXT: v_add_u16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 3414; GFX8-NEXT: v_min_i16_e32 v5, v5, v10 3415; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 3416; GFX8-NEXT: v_add_u16_e32 v1, v1, v11 3417; GFX8-NEXT: v_add_u16_sdwa v3, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 3418; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 3419; GFX8-NEXT: v_add_u16_e32 v2, v2, v13 3420; GFX8-NEXT: v_add_u16_sdwa v3, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 3421; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 3422; GFX8-NEXT: s_setpc_b64 s[30:31] 3423; 3424; GFX9-LABEL: v_saddsat_v6i16: 3425; GFX9: ; %bb.0: 3426; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3427; GFX9-NEXT: v_pk_add_i16 v0, v0, v3 clamp 3428; GFX9-NEXT: v_pk_add_i16 v1, v1, v4 clamp 3429; GFX9-NEXT: v_pk_add_i16 v2, v2, v5 clamp 3430; GFX9-NEXT: s_setpc_b64 
s[30:31] 3431; 3432; GFX10-LABEL: v_saddsat_v6i16: 3433; GFX10: ; %bb.0: 3434; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3435; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3436; GFX10-NEXT: v_pk_add_i16 v0, v0, v3 clamp 3437; GFX10-NEXT: v_pk_add_i16 v1, v1, v4 clamp 3438; GFX10-NEXT: v_pk_add_i16 v2, v2, v5 clamp 3439; GFX10-NEXT: s_setpc_b64 s[30:31] 3440 %result = call <6 x i16> @llvm.sadd.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs) 3441 %cast = bitcast <6 x i16> %result to <3 x float> 3442 ret <3 x float> %cast 3443} 3444 3445define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inreg %rhs) { 3446; GFX6-LABEL: s_saddsat_v6i16: 3447; GFX6: ; %bb.0: 3448; GFX6-NEXT: s_lshl_b32 s0, s0, 16 3449; GFX6-NEXT: s_brev_b32 s13, 1 3450; GFX6-NEXT: s_min_i32 s15, s0, 0 3451; GFX6-NEXT: s_lshl_b32 s6, s6, 16 3452; GFX6-NEXT: s_brev_b32 s12, -2 3453; GFX6-NEXT: s_max_i32 s14, s0, 0 3454; GFX6-NEXT: s_sub_i32 s15, s13, s15 3455; GFX6-NEXT: s_sub_i32 s14, s12, s14 3456; GFX6-NEXT: s_max_i32 s6, s15, s6 3457; GFX6-NEXT: s_lshl_b32 s1, s1, 16 3458; GFX6-NEXT: s_min_i32 s6, s6, s14 3459; GFX6-NEXT: s_min_i32 s14, s1, 0 3460; GFX6-NEXT: s_add_i32 s0, s0, s6 3461; GFX6-NEXT: s_lshl_b32 s6, s7, 16 3462; GFX6-NEXT: s_max_i32 s7, s1, 0 3463; GFX6-NEXT: s_sub_i32 s14, s13, s14 3464; GFX6-NEXT: s_sub_i32 s7, s12, s7 3465; GFX6-NEXT: s_max_i32 s6, s14, s6 3466; GFX6-NEXT: s_min_i32 s6, s6, s7 3467; GFX6-NEXT: s_lshl_b32 s2, s2, 16 3468; GFX6-NEXT: s_add_i32 s1, s1, s6 3469; GFX6-NEXT: s_lshl_b32 s6, s8, 16 3470; GFX6-NEXT: s_min_i32 s8, s2, 0 3471; GFX6-NEXT: s_max_i32 s7, s2, 0 3472; GFX6-NEXT: s_sub_i32 s8, s13, s8 3473; GFX6-NEXT: s_sub_i32 s7, s12, s7 3474; GFX6-NEXT: s_max_i32 s6, s8, s6 3475; GFX6-NEXT: s_lshl_b32 s3, s3, 16 3476; GFX6-NEXT: s_min_i32 s6, s6, s7 3477; GFX6-NEXT: s_min_i32 s8, s3, 0 3478; GFX6-NEXT: s_add_i32 s2, s2, s6 3479; GFX6-NEXT: s_lshl_b32 s6, s9, 16 3480; GFX6-NEXT: s_max_i32 s7, s3, 0 3481; GFX6-NEXT: s_sub_i32 s8, s13, s8 3482; 
GFX6-NEXT: s_sub_i32 s7, s12, s7 3483; GFX6-NEXT: s_max_i32 s6, s8, s6 3484; GFX6-NEXT: s_lshl_b32 s4, s4, 16 3485; GFX6-NEXT: s_min_i32 s6, s6, s7 3486; GFX6-NEXT: s_min_i32 s8, s4, 0 3487; GFX6-NEXT: s_add_i32 s3, s3, s6 3488; GFX6-NEXT: s_lshl_b32 s6, s10, 16 3489; GFX6-NEXT: s_max_i32 s7, s4, 0 3490; GFX6-NEXT: s_sub_i32 s8, s13, s8 3491; GFX6-NEXT: s_sub_i32 s7, s12, s7 3492; GFX6-NEXT: s_max_i32 s6, s8, s6 3493; GFX6-NEXT: s_lshl_b32 s5, s5, 16 3494; GFX6-NEXT: s_min_i32 s6, s6, s7 3495; GFX6-NEXT: s_min_i32 s8, s5, 0 3496; GFX6-NEXT: s_add_i32 s4, s4, s6 3497; GFX6-NEXT: s_lshl_b32 s6, s11, 16 3498; GFX6-NEXT: s_max_i32 s7, s5, 0 3499; GFX6-NEXT: s_sub_i32 s8, s13, s8 3500; GFX6-NEXT: s_sub_i32 s7, s12, s7 3501; GFX6-NEXT: s_max_i32 s6, s8, s6 3502; GFX6-NEXT: s_min_i32 s6, s6, s7 3503; GFX6-NEXT: s_ashr_i32 s1, s1, 16 3504; GFX6-NEXT: s_add_i32 s5, s5, s6 3505; GFX6-NEXT: s_mov_b32 s6, 0xffff 3506; GFX6-NEXT: s_ashr_i32 s0, s0, 16 3507; GFX6-NEXT: s_and_b32 s1, s1, s6 3508; GFX6-NEXT: s_ashr_i32 s2, s2, 16 3509; GFX6-NEXT: s_ashr_i32 s3, s3, 16 3510; GFX6-NEXT: s_and_b32 s0, s0, s6 3511; GFX6-NEXT: s_lshl_b32 s1, s1, 16 3512; GFX6-NEXT: s_ashr_i32 s5, s5, 16 3513; GFX6-NEXT: s_or_b32 s0, s0, s1 3514; GFX6-NEXT: s_and_b32 s1, s2, s6 3515; GFX6-NEXT: s_and_b32 s2, s3, s6 3516; GFX6-NEXT: s_ashr_i32 s4, s4, 16 3517; GFX6-NEXT: s_lshl_b32 s2, s2, 16 3518; GFX6-NEXT: s_and_b32 s3, s5, s6 3519; GFX6-NEXT: s_or_b32 s1, s1, s2 3520; GFX6-NEXT: s_and_b32 s2, s4, s6 3521; GFX6-NEXT: s_lshl_b32 s3, s3, 16 3522; GFX6-NEXT: s_or_b32 s2, s2, s3 3523; GFX6-NEXT: ; return to shader part epilog 3524; 3525; GFX8-LABEL: s_saddsat_v6i16: 3526; GFX8: ; %bb.0: 3527; GFX8-NEXT: s_sext_i32_i16 s14, s0 3528; GFX8-NEXT: s_sext_i32_i16 s15, 0 3529; GFX8-NEXT: s_movk_i32 s13, 0x8000 3530; GFX8-NEXT: s_max_i32 s16, s14, s15 3531; GFX8-NEXT: s_min_i32 s14, s14, s15 3532; GFX8-NEXT: s_sub_i32 s14, s13, s14 3533; GFX8-NEXT: s_lshr_b32 s9, s3, 16 3534; GFX8-NEXT: s_movk_i32 s12, 0x7fff 
3535; GFX8-NEXT: s_sext_i32_i16 s14, s14 3536; GFX8-NEXT: s_sext_i32_i16 s3, s3 3537; GFX8-NEXT: s_sub_i32 s16, s12, s16 3538; GFX8-NEXT: s_max_i32 s3, s14, s3 3539; GFX8-NEXT: s_sext_i32_i16 s3, s3 3540; GFX8-NEXT: s_sext_i32_i16 s14, s16 3541; GFX8-NEXT: s_lshr_b32 s6, s0, 16 3542; GFX8-NEXT: s_min_i32 s3, s3, s14 3543; GFX8-NEXT: s_add_i32 s0, s0, s3 3544; GFX8-NEXT: s_sext_i32_i16 s3, s6 3545; GFX8-NEXT: s_max_i32 s14, s3, s15 3546; GFX8-NEXT: s_min_i32 s3, s3, s15 3547; GFX8-NEXT: s_sub_i32 s3, s13, s3 3548; GFX8-NEXT: s_sext_i32_i16 s3, s3 3549; GFX8-NEXT: s_sext_i32_i16 s9, s9 3550; GFX8-NEXT: s_sub_i32 s14, s12, s14 3551; GFX8-NEXT: s_max_i32 s3, s3, s9 3552; GFX8-NEXT: s_sext_i32_i16 s3, s3 3553; GFX8-NEXT: s_sext_i32_i16 s9, s14 3554; GFX8-NEXT: s_min_i32 s3, s3, s9 3555; GFX8-NEXT: s_add_i32 s6, s6, s3 3556; GFX8-NEXT: s_sext_i32_i16 s3, s1 3557; GFX8-NEXT: s_max_i32 s9, s3, s15 3558; GFX8-NEXT: s_min_i32 s3, s3, s15 3559; GFX8-NEXT: s_sub_i32 s3, s13, s3 3560; GFX8-NEXT: s_lshr_b32 s10, s4, 16 3561; GFX8-NEXT: s_sext_i32_i16 s3, s3 3562; GFX8-NEXT: s_sext_i32_i16 s4, s4 3563; GFX8-NEXT: s_sub_i32 s9, s12, s9 3564; GFX8-NEXT: s_max_i32 s3, s3, s4 3565; GFX8-NEXT: s_sext_i32_i16 s3, s3 3566; GFX8-NEXT: s_sext_i32_i16 s4, s9 3567; GFX8-NEXT: s_lshr_b32 s7, s1, 16 3568; GFX8-NEXT: s_min_i32 s3, s3, s4 3569; GFX8-NEXT: s_add_i32 s1, s1, s3 3570; GFX8-NEXT: s_sext_i32_i16 s3, s7 3571; GFX8-NEXT: s_max_i32 s4, s3, s15 3572; GFX8-NEXT: s_min_i32 s3, s3, s15 3573; GFX8-NEXT: s_sub_i32 s3, s13, s3 3574; GFX8-NEXT: s_sext_i32_i16 s3, s3 3575; GFX8-NEXT: s_sext_i32_i16 s9, s10 3576; GFX8-NEXT: s_sub_i32 s4, s12, s4 3577; GFX8-NEXT: s_max_i32 s3, s3, s9 3578; GFX8-NEXT: s_sext_i32_i16 s3, s3 3579; GFX8-NEXT: s_sext_i32_i16 s4, s4 3580; GFX8-NEXT: s_min_i32 s3, s3, s4 3581; GFX8-NEXT: s_add_i32 s7, s7, s3 3582; GFX8-NEXT: s_sext_i32_i16 s3, s2 3583; GFX8-NEXT: s_max_i32 s4, s3, s15 3584; GFX8-NEXT: s_min_i32 s3, s3, s15 3585; GFX8-NEXT: s_sub_i32 s3, s13, s3 3586; 
GFX8-NEXT: s_lshr_b32 s11, s5, 16 3587; GFX8-NEXT: s_sext_i32_i16 s3, s3 3588; GFX8-NEXT: s_sext_i32_i16 s5, s5 3589; GFX8-NEXT: s_sub_i32 s4, s12, s4 3590; GFX8-NEXT: s_max_i32 s3, s3, s5 3591; GFX8-NEXT: s_sext_i32_i16 s3, s3 3592; GFX8-NEXT: s_sext_i32_i16 s4, s4 3593; GFX8-NEXT: s_lshr_b32 s8, s2, 16 3594; GFX8-NEXT: s_min_i32 s3, s3, s4 3595; GFX8-NEXT: s_add_i32 s2, s2, s3 3596; GFX8-NEXT: s_sext_i32_i16 s3, s8 3597; GFX8-NEXT: s_max_i32 s4, s3, s15 3598; GFX8-NEXT: s_min_i32 s3, s3, s15 3599; GFX8-NEXT: s_sub_i32 s3, s13, s3 3600; GFX8-NEXT: s_sext_i32_i16 s3, s3 3601; GFX8-NEXT: s_sext_i32_i16 s5, s11 3602; GFX8-NEXT: s_sub_i32 s4, s12, s4 3603; GFX8-NEXT: s_max_i32 s3, s3, s5 3604; GFX8-NEXT: s_sext_i32_i16 s3, s3 3605; GFX8-NEXT: s_sext_i32_i16 s4, s4 3606; GFX8-NEXT: s_min_i32 s3, s3, s4 3607; GFX8-NEXT: s_add_i32 s8, s8, s3 3608; GFX8-NEXT: s_bfe_u32 s3, s6, 0x100000 3609; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 3610; GFX8-NEXT: s_lshl_b32 s3, s3, 16 3611; GFX8-NEXT: s_or_b32 s0, s0, s3 3612; GFX8-NEXT: s_bfe_u32 s3, s7, 0x100000 3613; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 3614; GFX8-NEXT: s_lshl_b32 s3, s3, 16 3615; GFX8-NEXT: s_or_b32 s1, s1, s3 3616; GFX8-NEXT: s_bfe_u32 s3, s8, 0x100000 3617; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 3618; GFX8-NEXT: s_lshl_b32 s3, s3, 16 3619; GFX8-NEXT: s_or_b32 s2, s2, s3 3620; GFX8-NEXT: ; return to shader part epilog 3621; 3622; GFX9-LABEL: s_saddsat_v6i16: 3623; GFX9: ; %bb.0: 3624; GFX9-NEXT: v_mov_b32_e32 v0, s3 3625; GFX9-NEXT: v_mov_b32_e32 v1, s4 3626; GFX9-NEXT: v_mov_b32_e32 v2, s5 3627; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp 3628; GFX9-NEXT: v_pk_add_i16 v1, s1, v1 clamp 3629; GFX9-NEXT: v_pk_add_i16 v2, s2, v2 clamp 3630; GFX9-NEXT: v_readfirstlane_b32 s0, v0 3631; GFX9-NEXT: v_readfirstlane_b32 s1, v1 3632; GFX9-NEXT: v_readfirstlane_b32 s2, v2 3633; GFX9-NEXT: ; return to shader part epilog 3634; 3635; GFX10-LABEL: s_saddsat_v6i16: 3636; GFX10: ; %bb.0: 3637; GFX10-NEXT: v_pk_add_i16 v0, s0, s3 clamp 
3638; GFX10-NEXT: v_pk_add_i16 v1, s1, s4 clamp 3639; GFX10-NEXT: v_pk_add_i16 v2, s2, s5 clamp 3640; GFX10-NEXT: v_readfirstlane_b32 s0, v0 3641; GFX10-NEXT: v_readfirstlane_b32 s1, v1 3642; GFX10-NEXT: v_readfirstlane_b32 s2, v2 3643; GFX10-NEXT: ; return to shader part epilog 3644 %result = call <6 x i16> @llvm.sadd.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs) 3645 %cast = bitcast <6 x i16> %result to <3 x i32> 3646 ret <3 x i32> %cast 3647} 3648 3649define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { 3650; GFX6-LABEL: v_saddsat_v8i16: 3651; GFX6: ; %bb.0: 3652; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3653; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 3654; GFX6-NEXT: s_brev_b32 s5, 1 3655; GFX6-NEXT: v_min_i32_e32 v18, 0, v0 3656; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8 3657; GFX6-NEXT: s_brev_b32 s4, -2 3658; GFX6-NEXT: v_max_i32_e32 v16, 0, v0 3659; GFX6-NEXT: v_sub_i32_e32 v18, vcc, s5, v18 3660; GFX6-NEXT: v_sub_i32_e32 v16, vcc, s4, v16 3661; GFX6-NEXT: v_max_i32_e32 v8, v18, v8 3662; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3663; GFX6-NEXT: v_min_i32_e32 v8, v8, v16 3664; GFX6-NEXT: v_min_i32_e32 v16, 0, v1 3665; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v8 3666; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v9 3667; GFX6-NEXT: v_max_i32_e32 v9, 0, v1 3668; GFX6-NEXT: v_sub_i32_e32 v16, vcc, s5, v16 3669; GFX6-NEXT: v_sub_i32_e32 v9, vcc, s4, v9 3670; GFX6-NEXT: v_max_i32_e32 v8, v16, v8 3671; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 3672; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3673; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v8 3674; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v10 3675; GFX6-NEXT: v_min_i32_e32 v10, 0, v2 3676; GFX6-NEXT: v_bfrev_b32_e32 v17, -2 3677; GFX6-NEXT: v_max_i32_e32 v9, 0, v2 3678; GFX6-NEXT: v_sub_i32_e32 v10, vcc, s5, v10 3679; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 3680; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 3681; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 3682; GFX6-NEXT: v_bfrev_b32_e32 v19, 1 3683; GFX6-NEXT: v_min_i32_e32 v8, v8, 
v9 3684; GFX6-NEXT: v_min_i32_e32 v10, 0, v3 3685; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v8 3686; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v11 3687; GFX6-NEXT: v_max_i32_e32 v9, 0, v3 3688; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 3689; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 3690; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 3691; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 3692; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 3693; GFX6-NEXT: v_min_i32_e32 v10, 0, v4 3694; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v8 3695; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v12 3696; GFX6-NEXT: v_max_i32_e32 v9, 0, v4 3697; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 3698; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 3699; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 3700; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 3701; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 3702; GFX6-NEXT: v_min_i32_e32 v10, 0, v5 3703; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v8 3704; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v13 3705; GFX6-NEXT: v_max_i32_e32 v9, 0, v5 3706; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 3707; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 3708; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 3709; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 3710; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 3711; GFX6-NEXT: v_min_i32_e32 v10, 0, v6 3712; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v8 3713; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v14 3714; GFX6-NEXT: v_max_i32_e32 v9, 0, v6 3715; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 3716; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 3717; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 3718; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 3719; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 3720; GFX6-NEXT: v_min_i32_e32 v10, 0, v7 3721; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 3722; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 3723; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15 3724; GFX6-NEXT: v_max_i32_e32 v9, 0, v7 3725; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v19, v10 3726; GFX6-NEXT: s_mov_b32 s4, 0xffff 3727; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 3728; 
GFX6-NEXT: v_sub_i32_e32 v9, vcc, v17, v9 3729; GFX6-NEXT: v_max_i32_e32 v8, v10, v8 3730; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 3731; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 3732; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 3733; GFX6-NEXT: v_min_i32_e32 v8, v8, v9 3734; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 3735; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3736; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 3737; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v8 3738; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 3739; GFX6-NEXT: v_and_b32_e32 v1, s4, v2 3740; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 3741; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 3742; GFX6-NEXT: v_ashrrev_i32_e32 v7, 16, v7 3743; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3744; GFX6-NEXT: v_and_b32_e32 v3, s4, v5 3745; GFX6-NEXT: v_ashrrev_i32_e32 v6, 16, v6 3746; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 3747; GFX6-NEXT: v_and_b32_e32 v2, s4, v4 3748; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 3749; GFX6-NEXT: v_and_b32_e32 v4, s4, v7 3750; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 3751; GFX6-NEXT: v_and_b32_e32 v3, s4, v6 3752; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 3753; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 3754; GFX6-NEXT: s_setpc_b64 s[30:31] 3755; 3756; GFX8-LABEL: v_saddsat_v8i16: 3757; GFX8: ; %bb.0: 3758; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3759; GFX8-NEXT: s_movk_i32 s5, 0x8000 3760; GFX8-NEXT: v_min_i16_e32 v14, 0, v0 3761; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v0 3762; GFX8-NEXT: s_movk_i32 s4, 0x7fff 3763; GFX8-NEXT: v_max_i16_e32 v12, 0, v0 3764; GFX8-NEXT: v_sub_u16_e32 v14, s5, v14 3765; GFX8-NEXT: v_sub_u16_e32 v12, s4, v12 3766; GFX8-NEXT: v_max_i16_e32 v14, v14, v4 3767; GFX8-NEXT: v_min_i16_e32 v16, 0, v8 3768; GFX8-NEXT: v_min_i16_e32 v12, v14, v12 3769; GFX8-NEXT: v_max_i16_e32 v14, 0, v8 3770; GFX8-NEXT: v_sub_u16_e32 v16, s5, v16 3771; GFX8-NEXT: v_sub_u16_e32 v14, s4, v14 3772; GFX8-NEXT: v_max_i16_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 3773; GFX8-NEXT: v_min_i16_e32 v16, 0, 
v1 3774; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v1 3775; GFX8-NEXT: v_min_i16_e32 v4, v4, v14 3776; GFX8-NEXT: v_max_i16_e32 v14, 0, v1 3777; GFX8-NEXT: v_sub_u16_e32 v16, s5, v16 3778; GFX8-NEXT: v_sub_u16_e32 v14, s4, v14 3779; GFX8-NEXT: v_max_i16_e32 v16, v16, v5 3780; GFX8-NEXT: v_min_i16_e32 v17, 0, v9 3781; GFX8-NEXT: v_min_i16_e32 v14, v16, v14 3782; GFX8-NEXT: v_max_i16_e32 v16, 0, v9 3783; GFX8-NEXT: v_sub_u16_e32 v17, s5, v17 3784; GFX8-NEXT: v_mov_b32_e32 v15, 0xffff8000 3785; GFX8-NEXT: v_sub_u16_e32 v16, s4, v16 3786; GFX8-NEXT: v_max_i16_sdwa v5, v17, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 3787; GFX8-NEXT: v_min_i16_e32 v17, 0, v2 3788; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v2 3789; GFX8-NEXT: v_mov_b32_e32 v13, 0x7fff 3790; GFX8-NEXT: v_min_i16_e32 v5, v5, v16 3791; GFX8-NEXT: v_max_i16_e32 v16, 0, v2 3792; GFX8-NEXT: v_sub_u16_e32 v17, v15, v17 3793; GFX8-NEXT: v_sub_u16_e32 v16, v13, v16 3794; GFX8-NEXT: v_max_i16_e32 v17, v17, v6 3795; GFX8-NEXT: v_min_i16_e32 v18, 0, v10 3796; GFX8-NEXT: v_min_i16_e32 v16, v17, v16 3797; GFX8-NEXT: v_max_i16_e32 v17, 0, v10 3798; GFX8-NEXT: v_sub_u16_e32 v18, v15, v18 3799; GFX8-NEXT: v_sub_u16_e32 v17, v13, v17 3800; GFX8-NEXT: v_max_i16_sdwa v6, v18, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 3801; GFX8-NEXT: v_min_i16_e32 v18, 0, v3 3802; GFX8-NEXT: v_min_i16_e32 v6, v6, v17 3803; GFX8-NEXT: v_max_i16_e32 v17, 0, v3 3804; GFX8-NEXT: v_sub_u16_e32 v18, v15, v18 3805; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v3 3806; GFX8-NEXT: v_sub_u16_e32 v17, v13, v17 3807; GFX8-NEXT: v_max_i16_e32 v18, v18, v7 3808; GFX8-NEXT: v_min_i16_e32 v17, v18, v17 3809; GFX8-NEXT: v_max_i16_e32 v18, 0, v11 3810; GFX8-NEXT: v_sub_u16_e32 v13, v13, v18 3811; GFX8-NEXT: v_min_i16_e32 v18, 0, v11 3812; GFX8-NEXT: v_sub_u16_e32 v15, v15, v18 3813; GFX8-NEXT: v_add_u16_e32 v0, v0, v12 3814; GFX8-NEXT: v_add_u16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD 3815; GFX8-NEXT: v_max_i16_sdwa v7, v15, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 3816; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 3817; GFX8-NEXT: v_add_u16_e32 v1, v1, v14 3818; GFX8-NEXT: v_add_u16_sdwa v4, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 3819; GFX8-NEXT: v_min_i16_e32 v7, v7, v13 3820; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 3821; GFX8-NEXT: v_add_u16_e32 v2, v2, v16 3822; GFX8-NEXT: v_add_u16_sdwa v4, v10, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 3823; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 3824; GFX8-NEXT: v_add_u16_e32 v3, v3, v17 3825; GFX8-NEXT: v_add_u16_sdwa v4, v11, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 3826; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 3827; GFX8-NEXT: s_setpc_b64 s[30:31] 3828; 3829; GFX9-LABEL: v_saddsat_v8i16: 3830; GFX9: ; %bb.0: 3831; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3832; GFX9-NEXT: v_pk_add_i16 v0, v0, v4 clamp 3833; GFX9-NEXT: v_pk_add_i16 v1, v1, v5 clamp 3834; GFX9-NEXT: v_pk_add_i16 v2, v2, v6 clamp 3835; GFX9-NEXT: v_pk_add_i16 v3, v3, v7 clamp 3836; GFX9-NEXT: s_setpc_b64 s[30:31] 3837; 3838; GFX10-LABEL: v_saddsat_v8i16: 3839; GFX10: ; %bb.0: 3840; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3841; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3842; GFX10-NEXT: v_pk_add_i16 v0, v0, v4 clamp 3843; GFX10-NEXT: v_pk_add_i16 v1, v1, v5 clamp 3844; GFX10-NEXT: v_pk_add_i16 v2, v2, v6 clamp 3845; GFX10-NEXT: v_pk_add_i16 v3, v3, v7 clamp 3846; GFX10-NEXT: s_setpc_b64 s[30:31] 3847 %result = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) 3848 %cast = bitcast <8 x i16> %result to <4 x float> 3849 ret <4 x float> %cast 3850} 3851 3852define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inreg %rhs) { 3853; GFX6-LABEL: s_saddsat_v8i16: 3854; GFX6: ; %bb.0: 3855; GFX6-NEXT: s_lshl_b32 s0, s0, 16 3856; GFX6-NEXT: s_brev_b32 s17, 1 3857; GFX6-NEXT: s_min_i32 
s19, s0, 0 3858; GFX6-NEXT: s_lshl_b32 s8, s8, 16 3859; GFX6-NEXT: s_brev_b32 s16, -2 3860; GFX6-NEXT: s_max_i32 s18, s0, 0 3861; GFX6-NEXT: s_sub_i32 s19, s17, s19 3862; GFX6-NEXT: s_sub_i32 s18, s16, s18 3863; GFX6-NEXT: s_max_i32 s8, s19, s8 3864; GFX6-NEXT: s_lshl_b32 s1, s1, 16 3865; GFX6-NEXT: s_min_i32 s8, s8, s18 3866; GFX6-NEXT: s_min_i32 s18, s1, 0 3867; GFX6-NEXT: s_add_i32 s0, s0, s8 3868; GFX6-NEXT: s_lshl_b32 s8, s9, 16 3869; GFX6-NEXT: s_max_i32 s9, s1, 0 3870; GFX6-NEXT: s_sub_i32 s18, s17, s18 3871; GFX6-NEXT: s_sub_i32 s9, s16, s9 3872; GFX6-NEXT: s_max_i32 s8, s18, s8 3873; GFX6-NEXT: s_min_i32 s8, s8, s9 3874; GFX6-NEXT: s_lshl_b32 s2, s2, 16 3875; GFX6-NEXT: s_add_i32 s1, s1, s8 3876; GFX6-NEXT: s_lshl_b32 s8, s10, 16 3877; GFX6-NEXT: s_min_i32 s10, s2, 0 3878; GFX6-NEXT: s_max_i32 s9, s2, 0 3879; GFX6-NEXT: s_sub_i32 s10, s17, s10 3880; GFX6-NEXT: s_sub_i32 s9, s16, s9 3881; GFX6-NEXT: s_max_i32 s8, s10, s8 3882; GFX6-NEXT: s_lshl_b32 s3, s3, 16 3883; GFX6-NEXT: s_min_i32 s8, s8, s9 3884; GFX6-NEXT: s_min_i32 s10, s3, 0 3885; GFX6-NEXT: s_add_i32 s2, s2, s8 3886; GFX6-NEXT: s_lshl_b32 s8, s11, 16 3887; GFX6-NEXT: s_max_i32 s9, s3, 0 3888; GFX6-NEXT: s_sub_i32 s10, s17, s10 3889; GFX6-NEXT: s_sub_i32 s9, s16, s9 3890; GFX6-NEXT: s_max_i32 s8, s10, s8 3891; GFX6-NEXT: s_lshl_b32 s4, s4, 16 3892; GFX6-NEXT: s_min_i32 s8, s8, s9 3893; GFX6-NEXT: s_min_i32 s10, s4, 0 3894; GFX6-NEXT: s_add_i32 s3, s3, s8 3895; GFX6-NEXT: s_lshl_b32 s8, s12, 16 3896; GFX6-NEXT: s_max_i32 s9, s4, 0 3897; GFX6-NEXT: s_sub_i32 s10, s17, s10 3898; GFX6-NEXT: s_sub_i32 s9, s16, s9 3899; GFX6-NEXT: s_max_i32 s8, s10, s8 3900; GFX6-NEXT: s_lshl_b32 s5, s5, 16 3901; GFX6-NEXT: s_min_i32 s8, s8, s9 3902; GFX6-NEXT: s_min_i32 s10, s5, 0 3903; GFX6-NEXT: s_add_i32 s4, s4, s8 3904; GFX6-NEXT: s_lshl_b32 s8, s13, 16 3905; GFX6-NEXT: s_max_i32 s9, s5, 0 3906; GFX6-NEXT: s_sub_i32 s10, s17, s10 3907; GFX6-NEXT: s_sub_i32 s9, s16, s9 3908; GFX6-NEXT: s_max_i32 s8, s10, s8 3909; 
GFX6-NEXT: s_lshl_b32 s6, s6, 16 3910; GFX6-NEXT: s_min_i32 s8, s8, s9 3911; GFX6-NEXT: s_min_i32 s10, s6, 0 3912; GFX6-NEXT: s_add_i32 s5, s5, s8 3913; GFX6-NEXT: s_lshl_b32 s8, s14, 16 3914; GFX6-NEXT: s_max_i32 s9, s6, 0 3915; GFX6-NEXT: s_sub_i32 s10, s17, s10 3916; GFX6-NEXT: s_sub_i32 s9, s16, s9 3917; GFX6-NEXT: s_max_i32 s8, s10, s8 3918; GFX6-NEXT: s_lshl_b32 s7, s7, 16 3919; GFX6-NEXT: s_min_i32 s8, s8, s9 3920; GFX6-NEXT: s_min_i32 s10, s7, 0 3921; GFX6-NEXT: s_add_i32 s6, s6, s8 3922; GFX6-NEXT: s_lshl_b32 s8, s15, 16 3923; GFX6-NEXT: s_max_i32 s9, s7, 0 3924; GFX6-NEXT: s_sub_i32 s10, s17, s10 3925; GFX6-NEXT: s_sub_i32 s9, s16, s9 3926; GFX6-NEXT: s_max_i32 s8, s10, s8 3927; GFX6-NEXT: s_min_i32 s8, s8, s9 3928; GFX6-NEXT: s_ashr_i32 s1, s1, 16 3929; GFX6-NEXT: s_add_i32 s7, s7, s8 3930; GFX6-NEXT: s_mov_b32 s8, 0xffff 3931; GFX6-NEXT: s_ashr_i32 s0, s0, 16 3932; GFX6-NEXT: s_and_b32 s1, s1, s8 3933; GFX6-NEXT: s_ashr_i32 s2, s2, 16 3934; GFX6-NEXT: s_ashr_i32 s3, s3, 16 3935; GFX6-NEXT: s_and_b32 s0, s0, s8 3936; GFX6-NEXT: s_lshl_b32 s1, s1, 16 3937; GFX6-NEXT: s_ashr_i32 s5, s5, 16 3938; GFX6-NEXT: s_or_b32 s0, s0, s1 3939; GFX6-NEXT: s_and_b32 s1, s2, s8 3940; GFX6-NEXT: s_and_b32 s2, s3, s8 3941; GFX6-NEXT: s_ashr_i32 s4, s4, 16 3942; GFX6-NEXT: s_ashr_i32 s7, s7, 16 3943; GFX6-NEXT: s_lshl_b32 s2, s2, 16 3944; GFX6-NEXT: s_and_b32 s3, s5, s8 3945; GFX6-NEXT: s_ashr_i32 s6, s6, 16 3946; GFX6-NEXT: s_or_b32 s1, s1, s2 3947; GFX6-NEXT: s_and_b32 s2, s4, s8 3948; GFX6-NEXT: s_lshl_b32 s3, s3, 16 3949; GFX6-NEXT: s_and_b32 s4, s7, s8 3950; GFX6-NEXT: s_or_b32 s2, s2, s3 3951; GFX6-NEXT: s_and_b32 s3, s6, s8 3952; GFX6-NEXT: s_lshl_b32 s4, s4, 16 3953; GFX6-NEXT: s_or_b32 s3, s3, s4 3954; GFX6-NEXT: ; return to shader part epilog 3955; 3956; GFX8-LABEL: s_saddsat_v8i16: 3957; GFX8: ; %bb.0: 3958; GFX8-NEXT: s_sext_i32_i16 s18, s0 3959; GFX8-NEXT: s_sext_i32_i16 s19, 0 3960; GFX8-NEXT: s_movk_i32 s17, 0x8000 3961; GFX8-NEXT: s_max_i32 s20, s18, s19 
3962; GFX8-NEXT: s_min_i32 s18, s18, s19 3963; GFX8-NEXT: s_sub_i32 s18, s17, s18 3964; GFX8-NEXT: s_lshr_b32 s12, s4, 16 3965; GFX8-NEXT: s_movk_i32 s16, 0x7fff 3966; GFX8-NEXT: s_sext_i32_i16 s18, s18 3967; GFX8-NEXT: s_sext_i32_i16 s4, s4 3968; GFX8-NEXT: s_sub_i32 s20, s16, s20 3969; GFX8-NEXT: s_max_i32 s4, s18, s4 3970; GFX8-NEXT: s_sext_i32_i16 s4, s4 3971; GFX8-NEXT: s_sext_i32_i16 s18, s20 3972; GFX8-NEXT: s_lshr_b32 s8, s0, 16 3973; GFX8-NEXT: s_min_i32 s4, s4, s18 3974; GFX8-NEXT: s_add_i32 s0, s0, s4 3975; GFX8-NEXT: s_sext_i32_i16 s4, s8 3976; GFX8-NEXT: s_max_i32 s18, s4, s19 3977; GFX8-NEXT: s_min_i32 s4, s4, s19 3978; GFX8-NEXT: s_sub_i32 s4, s17, s4 3979; GFX8-NEXT: s_sext_i32_i16 s4, s4 3980; GFX8-NEXT: s_sext_i32_i16 s12, s12 3981; GFX8-NEXT: s_sub_i32 s18, s16, s18 3982; GFX8-NEXT: s_max_i32 s4, s4, s12 3983; GFX8-NEXT: s_sext_i32_i16 s4, s4 3984; GFX8-NEXT: s_sext_i32_i16 s12, s18 3985; GFX8-NEXT: s_min_i32 s4, s4, s12 3986; GFX8-NEXT: s_add_i32 s8, s8, s4 3987; GFX8-NEXT: s_sext_i32_i16 s4, s1 3988; GFX8-NEXT: s_max_i32 s12, s4, s19 3989; GFX8-NEXT: s_min_i32 s4, s4, s19 3990; GFX8-NEXT: s_sub_i32 s4, s17, s4 3991; GFX8-NEXT: s_lshr_b32 s13, s5, 16 3992; GFX8-NEXT: s_sext_i32_i16 s4, s4 3993; GFX8-NEXT: s_sext_i32_i16 s5, s5 3994; GFX8-NEXT: s_sub_i32 s12, s16, s12 3995; GFX8-NEXT: s_max_i32 s4, s4, s5 3996; GFX8-NEXT: s_sext_i32_i16 s4, s4 3997; GFX8-NEXT: s_sext_i32_i16 s5, s12 3998; GFX8-NEXT: s_lshr_b32 s9, s1, 16 3999; GFX8-NEXT: s_min_i32 s4, s4, s5 4000; GFX8-NEXT: s_add_i32 s1, s1, s4 4001; GFX8-NEXT: s_sext_i32_i16 s4, s9 4002; GFX8-NEXT: s_max_i32 s5, s4, s19 4003; GFX8-NEXT: s_min_i32 s4, s4, s19 4004; GFX8-NEXT: s_sub_i32 s4, s17, s4 4005; GFX8-NEXT: s_sext_i32_i16 s4, s4 4006; GFX8-NEXT: s_sext_i32_i16 s12, s13 4007; GFX8-NEXT: s_sub_i32 s5, s16, s5 4008; GFX8-NEXT: s_max_i32 s4, s4, s12 4009; GFX8-NEXT: s_sext_i32_i16 s4, s4 4010; GFX8-NEXT: s_sext_i32_i16 s5, s5 4011; GFX8-NEXT: s_min_i32 s4, s4, s5 4012; GFX8-NEXT: s_add_i32 
s9, s9, s4 4013; GFX8-NEXT: s_sext_i32_i16 s4, s2 4014; GFX8-NEXT: s_max_i32 s5, s4, s19 4015; GFX8-NEXT: s_min_i32 s4, s4, s19 4016; GFX8-NEXT: s_sub_i32 s4, s17, s4 4017; GFX8-NEXT: s_lshr_b32 s14, s6, 16 4018; GFX8-NEXT: s_sext_i32_i16 s4, s4 4019; GFX8-NEXT: s_sext_i32_i16 s6, s6 4020; GFX8-NEXT: s_sub_i32 s5, s16, s5 4021; GFX8-NEXT: s_max_i32 s4, s4, s6 4022; GFX8-NEXT: s_sext_i32_i16 s4, s4 4023; GFX8-NEXT: s_sext_i32_i16 s5, s5 4024; GFX8-NEXT: s_lshr_b32 s10, s2, 16 4025; GFX8-NEXT: s_min_i32 s4, s4, s5 4026; GFX8-NEXT: s_add_i32 s2, s2, s4 4027; GFX8-NEXT: s_sext_i32_i16 s4, s10 4028; GFX8-NEXT: s_max_i32 s5, s4, s19 4029; GFX8-NEXT: s_min_i32 s4, s4, s19 4030; GFX8-NEXT: s_sub_i32 s4, s17, s4 4031; GFX8-NEXT: s_sext_i32_i16 s4, s4 4032; GFX8-NEXT: s_sext_i32_i16 s6, s14 4033; GFX8-NEXT: s_sub_i32 s5, s16, s5 4034; GFX8-NEXT: s_max_i32 s4, s4, s6 4035; GFX8-NEXT: s_sext_i32_i16 s4, s4 4036; GFX8-NEXT: s_sext_i32_i16 s5, s5 4037; GFX8-NEXT: s_min_i32 s4, s4, s5 4038; GFX8-NEXT: s_add_i32 s10, s10, s4 4039; GFX8-NEXT: s_sext_i32_i16 s4, s3 4040; GFX8-NEXT: s_max_i32 s5, s4, s19 4041; GFX8-NEXT: s_min_i32 s4, s4, s19 4042; GFX8-NEXT: s_sub_i32 s4, s17, s4 4043; GFX8-NEXT: s_sext_i32_i16 s4, s4 4044; GFX8-NEXT: s_sext_i32_i16 s6, s7 4045; GFX8-NEXT: s_sub_i32 s5, s16, s5 4046; GFX8-NEXT: s_max_i32 s4, s4, s6 4047; GFX8-NEXT: s_sext_i32_i16 s4, s4 4048; GFX8-NEXT: s_sext_i32_i16 s5, s5 4049; GFX8-NEXT: s_lshr_b32 s11, s3, 16 4050; GFX8-NEXT: s_min_i32 s4, s4, s5 4051; GFX8-NEXT: s_add_i32 s3, s3, s4 4052; GFX8-NEXT: s_sext_i32_i16 s4, s11 4053; GFX8-NEXT: s_max_i32 s5, s4, s19 4054; GFX8-NEXT: s_min_i32 s4, s4, s19 4055; GFX8-NEXT: s_lshr_b32 s15, s7, 16 4056; GFX8-NEXT: s_sub_i32 s4, s17, s4 4057; GFX8-NEXT: s_sext_i32_i16 s4, s4 4058; GFX8-NEXT: s_sext_i32_i16 s6, s15 4059; GFX8-NEXT: s_sub_i32 s5, s16, s5 4060; GFX8-NEXT: s_max_i32 s4, s4, s6 4061; GFX8-NEXT: s_sext_i32_i16 s4, s4 4062; GFX8-NEXT: s_sext_i32_i16 s5, s5 4063; GFX8-NEXT: s_min_i32 s4, s4, s5 
4064; GFX8-NEXT: s_add_i32 s11, s11, s4 4065; GFX8-NEXT: s_bfe_u32 s4, s8, 0x100000 4066; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 4067; GFX8-NEXT: s_lshl_b32 s4, s4, 16 4068; GFX8-NEXT: s_or_b32 s0, s0, s4 4069; GFX8-NEXT: s_bfe_u32 s4, s9, 0x100000 4070; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 4071; GFX8-NEXT: s_lshl_b32 s4, s4, 16 4072; GFX8-NEXT: s_or_b32 s1, s1, s4 4073; GFX8-NEXT: s_bfe_u32 s4, s10, 0x100000 4074; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 4075; GFX8-NEXT: s_lshl_b32 s4, s4, 16 4076; GFX8-NEXT: s_or_b32 s2, s2, s4 4077; GFX8-NEXT: s_bfe_u32 s4, s11, 0x100000 4078; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 4079; GFX8-NEXT: s_lshl_b32 s4, s4, 16 4080; GFX8-NEXT: s_or_b32 s3, s3, s4 4081; GFX8-NEXT: ; return to shader part epilog 4082; 4083; GFX9-LABEL: s_saddsat_v8i16: 4084; GFX9: ; %bb.0: 4085; GFX9-NEXT: v_mov_b32_e32 v0, s4 4086; GFX9-NEXT: v_mov_b32_e32 v1, s5 4087; GFX9-NEXT: v_mov_b32_e32 v2, s6 4088; GFX9-NEXT: v_mov_b32_e32 v3, s7 4089; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp 4090; GFX9-NEXT: v_pk_add_i16 v1, s1, v1 clamp 4091; GFX9-NEXT: v_pk_add_i16 v2, s2, v2 clamp 4092; GFX9-NEXT: v_pk_add_i16 v3, s3, v3 clamp 4093; GFX9-NEXT: v_readfirstlane_b32 s0, v0 4094; GFX9-NEXT: v_readfirstlane_b32 s1, v1 4095; GFX9-NEXT: v_readfirstlane_b32 s2, v2 4096; GFX9-NEXT: v_readfirstlane_b32 s3, v3 4097; GFX9-NEXT: ; return to shader part epilog 4098; 4099; GFX10-LABEL: s_saddsat_v8i16: 4100; GFX10: ; %bb.0: 4101; GFX10-NEXT: v_pk_add_i16 v0, s0, s4 clamp 4102; GFX10-NEXT: v_pk_add_i16 v1, s1, s5 clamp 4103; GFX10-NEXT: v_pk_add_i16 v2, s2, s6 clamp 4104; GFX10-NEXT: v_pk_add_i16 v3, s3, s7 clamp 4105; GFX10-NEXT: v_readfirstlane_b32 s0, v0 4106; GFX10-NEXT: v_readfirstlane_b32 s1, v1 4107; GFX10-NEXT: v_readfirstlane_b32 s2, v2 4108; GFX10-NEXT: v_readfirstlane_b32 s3, v3 4109; GFX10-NEXT: ; return to shader part epilog 4110 %result = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) 4111 %cast = bitcast <8 x i16> %result to <4 x i32> 4112 
ret <4 x i32> %cast 4113} 4114 4115; FIXME: i48 is broken because i48 add is broken; the i48 tests below are disabled until that is fixed. 4116; define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) { 4117; %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs) 4118; ret i48 %result 4119; } 4120 4121; define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { 4122; %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs) 4123; ret i48 %result 4124; } 4125 4126; define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) { 4127; %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs) 4128; %ext.result = zext i48 %result to i64 4129; %cast = bitcast i64 %ext.result to <2 x float> 4130; ret <2 x float> %cast 4131; } 4132 4133; define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) { 4134; %result = call i48 @llvm.sadd.sat.i48(i48 %lhs, i48 %rhs) 4135; %ext.result = zext i48 %result to i64 4136; %cast = bitcast i64 %ext.result to <2 x float> 4137; ret <2 x float> %cast 4138; } 4139 4140define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { 4141; GFX6-LABEL: v_saddsat_i64: 4142; GFX6: ; %bb.0: 4143; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4144; GFX6-NEXT: v_add_i32_e32 v4, vcc, v0, v2 4145; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc 4146; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] 4147; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] 4148; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v5 4149; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 4150; GFX6-NEXT: v_add_i32_e64 v2, s[6:7], 0, v0 4151; GFX6-NEXT: v_addc_u32_e64 v1, s[6:7], v0, v1, s[6:7] 4152; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc 4153; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc 4154; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 4155; GFX6-NEXT: s_setpc_b64 s[30:31] 4156; 4157; GFX8-LABEL: v_saddsat_i64: 4158; GFX8: ; %bb.0: 4159; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4160; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 4161; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc 4162; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] 4163; 
GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] 4164; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v5 4165; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 4166; GFX8-NEXT: v_add_u32_e64 v2, s[6:7], 0, v0 4167; GFX8-NEXT: v_addc_u32_e64 v1, s[6:7], v0, v1, s[6:7] 4168; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc 4169; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc 4170; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 4171; GFX8-NEXT: s_setpc_b64 s[30:31] 4172; 4173; GFX9-LABEL: v_saddsat_i64: 4174; GFX9: ; %bb.0: 4175; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4176; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2 4177; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc 4178; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] 4179; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] 4180; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5 4181; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 4182; GFX9-NEXT: v_add_co_u32_e64 v2, s[6:7], 0, v0 4183; GFX9-NEXT: v_addc_co_u32_e64 v1, s[6:7], v0, v1, s[6:7] 4184; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc 4185; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc 4186; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 4187; GFX9-NEXT: s_setpc_b64 s[30:31] 4188; 4189; GFX10-LABEL: v_saddsat_i64: 4190; GFX10: ; %bb.0: 4191; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4192; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4193; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 4194; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo 4195; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] 4196; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 4197; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[4:5], v[0:1] 4198; GFX10-NEXT: v_add_co_u32 v0, s5, v6, 0 4199; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s5, 0x80000000, v6, s5 4200; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4 4201; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo 4202; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo 4203; GFX10-NEXT: s_setpc_b64 s[30:31] 4204 %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs) 4205 ret i64 %result 4206} 4207 4208define amdgpu_ps i64 
@s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) { 4209; GFX6-LABEL: s_saddsat_i64: 4210; GFX6: ; %bb.0: 4211; GFX6-NEXT: s_add_u32 s4, s0, s2 4212; GFX6-NEXT: s_cselect_b32 s5, 1, 0 4213; GFX6-NEXT: s_and_b32 s5, s5, 1 4214; GFX6-NEXT: s_cmp_lg_u32 s5, 0 4215; GFX6-NEXT: v_mov_b32_e32 v0, s0 4216; GFX6-NEXT: s_addc_u32 s5, s1, s3 4217; GFX6-NEXT: v_mov_b32_e32 v1, s1 4218; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4219; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 4220; GFX6-NEXT: s_ashr_i32 s2, s5, 31 4221; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc 4222; GFX6-NEXT: s_add_u32 s0, s2, 0 4223; GFX6-NEXT: s_cselect_b32 s1, 1, 0 4224; GFX6-NEXT: s_and_b32 s1, s1, 1 4225; GFX6-NEXT: s_cmp_lg_u32 s1, 0 4226; GFX6-NEXT: s_addc_u32 s1, s2, 0x80000000 4227; GFX6-NEXT: v_mov_b32_e32 v0, s4 4228; GFX6-NEXT: v_mov_b32_e32 v1, s0 4229; GFX6-NEXT: v_mov_b32_e32 v2, s1 4230; GFX6-NEXT: v_mov_b32_e32 v3, s5 4231; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 4232; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc 4233; GFX6-NEXT: v_readfirstlane_b32 s0, v0 4234; GFX6-NEXT: v_readfirstlane_b32 s1, v1 4235; GFX6-NEXT: ; return to shader part epilog 4236; 4237; GFX8-LABEL: s_saddsat_i64: 4238; GFX8: ; %bb.0: 4239; GFX8-NEXT: s_add_u32 s4, s0, s2 4240; GFX8-NEXT: s_cselect_b32 s5, 1, 0 4241; GFX8-NEXT: s_and_b32 s5, s5, 1 4242; GFX8-NEXT: s_cmp_lg_u32 s5, 0 4243; GFX8-NEXT: v_mov_b32_e32 v0, s0 4244; GFX8-NEXT: s_addc_u32 s5, s1, s3 4245; GFX8-NEXT: v_mov_b32_e32 v1, s1 4246; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4247; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 4248; GFX8-NEXT: s_ashr_i32 s2, s5, 31 4249; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc 4250; GFX8-NEXT: s_add_u32 s0, s2, 0 4251; GFX8-NEXT: s_cselect_b32 s1, 1, 0 4252; GFX8-NEXT: s_and_b32 s1, s1, 1 4253; GFX8-NEXT: s_cmp_lg_u32 s1, 0 4254; GFX8-NEXT: s_addc_u32 s1, s2, 0x80000000 4255; GFX8-NEXT: v_mov_b32_e32 v0, s4 4256; GFX8-NEXT: v_mov_b32_e32 v1, s0 4257; GFX8-NEXT: v_mov_b32_e32 v2, s1 4258; GFX8-NEXT: 
v_mov_b32_e32 v3, s5 4259; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 4260; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc 4261; GFX8-NEXT: v_readfirstlane_b32 s0, v0 4262; GFX8-NEXT: v_readfirstlane_b32 s1, v1 4263; GFX8-NEXT: ; return to shader part epilog 4264; 4265; GFX9-LABEL: s_saddsat_i64: 4266; GFX9: ; %bb.0: 4267; GFX9-NEXT: s_add_u32 s4, s0, s2 4268; GFX9-NEXT: s_cselect_b32 s5, 1, 0 4269; GFX9-NEXT: s_and_b32 s5, s5, 1 4270; GFX9-NEXT: s_cmp_lg_u32 s5, 0 4271; GFX9-NEXT: v_mov_b32_e32 v0, s0 4272; GFX9-NEXT: s_addc_u32 s5, s1, s3 4273; GFX9-NEXT: v_mov_b32_e32 v1, s1 4274; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4275; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 4276; GFX9-NEXT: s_ashr_i32 s2, s5, 31 4277; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc 4278; GFX9-NEXT: s_add_u32 s0, s2, 0 4279; GFX9-NEXT: s_cselect_b32 s1, 1, 0 4280; GFX9-NEXT: s_and_b32 s1, s1, 1 4281; GFX9-NEXT: s_cmp_lg_u32 s1, 0 4282; GFX9-NEXT: s_addc_u32 s1, s2, 0x80000000 4283; GFX9-NEXT: v_mov_b32_e32 v0, s4 4284; GFX9-NEXT: v_mov_b32_e32 v1, s0 4285; GFX9-NEXT: v_mov_b32_e32 v2, s1 4286; GFX9-NEXT: v_mov_b32_e32 v3, s5 4287; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 4288; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc 4289; GFX9-NEXT: v_readfirstlane_b32 s0, v0 4290; GFX9-NEXT: v_readfirstlane_b32 s1, v1 4291; GFX9-NEXT: ; return to shader part epilog 4292; 4293; GFX10-LABEL: s_saddsat_i64: 4294; GFX10: ; %bb.0: 4295; GFX10-NEXT: s_add_u32 s4, s0, s2 4296; GFX10-NEXT: s_cselect_b32 s5, 1, 0 4297; GFX10-NEXT: v_mov_b32_e32 v0, s4 4298; GFX10-NEXT: s_and_b32 s5, s5, 1 4299; GFX10-NEXT: s_cmp_lg_u32 s5, 0 4300; GFX10-NEXT: s_addc_u32 s5, s1, s3 4301; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1] 4302; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0 4303; GFX10-NEXT: s_ashr_i32 s2, s5, 31 4304; GFX10-NEXT: v_mov_b32_e32 v1, s5 4305; GFX10-NEXT: s_xor_b32 s3, s1, s0 4306; GFX10-NEXT: s_add_u32 s0, s2, 0 4307; GFX10-NEXT: s_cselect_b32 s1, 1, 0 4308; GFX10-NEXT: v_cndmask_b32_e64 v0, 
v0, s0, s3 4309; GFX10-NEXT: s_and_b32 s1, s1, 1 4310; GFX10-NEXT: s_cmp_lg_u32 s1, 0 4311; GFX10-NEXT: s_addc_u32 s1, s2, 0x80000000 4312; GFX10-NEXT: v_readfirstlane_b32 s0, v0 4313; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s3 4314; GFX10-NEXT: v_readfirstlane_b32 s1, v1 4315; GFX10-NEXT: ; return to shader part epilog 4316 %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs) 4317 ret i64 %result 4318} 4319 4320define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) { 4321; GFX6-LABEL: saddsat_i64_sv: 4322; GFX6: ; %bb.0: 4323; GFX6-NEXT: v_mov_b32_e32 v3, s1 4324; GFX6-NEXT: v_add_i32_e32 v2, vcc, s0, v0 4325; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc 4326; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3] 4327; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[0:1] 4328; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 4329; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 4330; GFX6-NEXT: v_add_i32_e64 v4, s[2:3], 0, v0 4331; GFX6-NEXT: v_addc_u32_e64 v1, s[2:3], v0, v1, s[2:3] 4332; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc 4333; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc 4334; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 4335; GFX6-NEXT: ; return to shader part epilog 4336; 4337; GFX8-LABEL: saddsat_i64_sv: 4338; GFX8: ; %bb.0: 4339; GFX8-NEXT: v_mov_b32_e32 v3, s1 4340; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 4341; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc 4342; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3] 4343; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[0:1] 4344; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 4345; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 4346; GFX8-NEXT: v_add_u32_e64 v4, s[2:3], 0, v0 4347; GFX8-NEXT: v_addc_u32_e64 v1, s[2:3], v0, v1, s[2:3] 4348; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc 4349; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc 4350; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 4351; GFX8-NEXT: ; return to shader part epilog 4352; 4353; GFX9-LABEL: saddsat_i64_sv: 4354; GFX9: ; %bb.0: 4355; GFX9-NEXT: v_mov_b32_e32 v3, s1 4356; GFX9-NEXT: 
v_add_co_u32_e32 v2, vcc, s0, v0 4357; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc 4358; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3] 4359; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[0:1] 4360; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 4361; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 4362; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], 0, v0 4363; GFX9-NEXT: v_addc_co_u32_e64 v1, s[2:3], v0, v1, s[2:3] 4364; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc 4365; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc 4366; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 4367; GFX9-NEXT: ; return to shader part epilog 4368; 4369; GFX10-LABEL: saddsat_i64_sv: 4370; GFX10: ; %bb.0: 4371; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, s0, v0 4372; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo 4373; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[0:1] 4374; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 4375; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], v[2:3] 4376; GFX10-NEXT: v_add_co_u32 v0, s1, v4, 0 4377; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s1, 0x80000000, v4, s1 4378; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0 4379; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo 4380; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo 4381; GFX10-NEXT: ; return to shader part epilog 4382 %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs) 4383 %cast = bitcast i64 %result to <2 x float> 4384 ret <2 x float> %cast 4385} 4386 4387define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) { 4388; GFX6-LABEL: saddsat_i64_vs: 4389; GFX6: ; %bb.0: 4390; GFX6-NEXT: v_mov_b32_e32 v3, s1 4391; GFX6-NEXT: v_add_i32_e32 v2, vcc, s0, v0 4392; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc 4393; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1] 4394; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], s[0:1], 0 4395; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 4396; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 4397; GFX6-NEXT: v_add_i32_e64 v4, s[0:1], 0, v0 4398; GFX6-NEXT: v_addc_u32_e64 v1, s[0:1], v0, v1, s[0:1] 4399; GFX6-NEXT: s_xor_b64 vcc, s[2:3], vcc 
4400; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc 4401; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 4402; GFX6-NEXT: ; return to shader part epilog 4403; 4404; GFX8-LABEL: saddsat_i64_vs: 4405; GFX8: ; %bb.0: 4406; GFX8-NEXT: v_mov_b32_e32 v3, s1 4407; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 4408; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc 4409; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1] 4410; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], s[0:1], 0 4411; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 4412; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 4413; GFX8-NEXT: v_add_u32_e64 v4, s[0:1], 0, v0 4414; GFX8-NEXT: v_addc_u32_e64 v1, s[0:1], v0, v1, s[0:1] 4415; GFX8-NEXT: s_xor_b64 vcc, s[2:3], vcc 4416; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc 4417; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 4418; GFX8-NEXT: ; return to shader part epilog 4419; 4420; GFX9-LABEL: saddsat_i64_vs: 4421; GFX9: ; %bb.0: 4422; GFX9-NEXT: v_mov_b32_e32 v3, s1 4423; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 4424; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc 4425; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1] 4426; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[0:1], 0 4427; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 4428; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 4429; GFX9-NEXT: v_add_co_u32_e64 v4, s[0:1], 0, v0 4430; GFX9-NEXT: v_addc_co_u32_e64 v1, s[0:1], v0, v1, s[0:1] 4431; GFX9-NEXT: s_xor_b64 vcc, s[2:3], vcc 4432; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc 4433; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 4434; GFX9-NEXT: ; return to shader part epilog 4435; 4436; GFX10-LABEL: saddsat_i64_vs: 4437; GFX10: ; %bb.0: 4438; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, s0 4439; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo 4440; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[0:1], 0 4441; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 4442; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] 4443; GFX10-NEXT: v_add_co_u32 v0, s0, v4, 0 4444; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0x80000000, v4, s0 4445; 
GFX10-NEXT: s_xor_b32 vcc_lo, s1, vcc_lo 4446; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo 4447; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo 4448; GFX10-NEXT: ; return to shader part epilog 4449 %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs) 4450 %cast = bitcast i64 %result to <2 x float> 4451 ret <2 x float> %cast 4452} 4453 4454define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { 4455; GFX6-LABEL: v_saddsat_v2i64: 4456; GFX6: ; %bb.0: 4457; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4458; GFX6-NEXT: v_add_i32_e32 v8, vcc, v0, v4 4459; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v1, v5, vcc 4460; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1] 4461; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5] 4462; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v9 4463; GFX6-NEXT: v_bfrev_b32_e32 v10, 1 4464; GFX6-NEXT: v_add_i32_e64 v1, s[6:7], 0, v0 4465; GFX6-NEXT: v_addc_u32_e64 v4, s[6:7], v0, v10, s[6:7] 4466; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc 4467; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc 4468; GFX6-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc 4469; GFX6-NEXT: v_add_i32_e32 v4, vcc, v2, v6 4470; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v3, v7, vcc 4471; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3] 4472; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7] 4473; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v5 4474; GFX6-NEXT: v_add_i32_e64 v3, s[6:7], 0, v2 4475; GFX6-NEXT: v_addc_u32_e64 v6, s[6:7], v2, v10, s[6:7] 4476; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc 4477; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc 4478; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc 4479; GFX6-NEXT: s_setpc_b64 s[30:31] 4480; 4481; GFX8-LABEL: v_saddsat_v2i64: 4482; GFX8: ; %bb.0: 4483; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4484; GFX8-NEXT: v_add_u32_e32 v8, vcc, v0, v4 4485; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v1, v5, vcc 4486; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1] 4487; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5] 4488; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v9 
4489; GFX8-NEXT: v_bfrev_b32_e32 v10, 1 4490; GFX8-NEXT: v_add_u32_e64 v1, s[6:7], 0, v0 4491; GFX8-NEXT: v_addc_u32_e64 v4, s[6:7], v0, v10, s[6:7] 4492; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc 4493; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc 4494; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc 4495; GFX8-NEXT: v_add_u32_e32 v4, vcc, v2, v6 4496; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v3, v7, vcc 4497; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3] 4498; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7] 4499; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v5 4500; GFX8-NEXT: v_add_u32_e64 v3, s[6:7], 0, v2 4501; GFX8-NEXT: v_addc_u32_e64 v6, s[6:7], v2, v10, s[6:7] 4502; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc 4503; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc 4504; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc 4505; GFX8-NEXT: s_setpc_b64 s[30:31] 4506; 4507; GFX9-LABEL: v_saddsat_v2i64: 4508; GFX9: ; %bb.0: 4509; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4510; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v4 4511; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v1, v5, vcc 4512; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1] 4513; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5] 4514; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v9 4515; GFX9-NEXT: v_bfrev_b32_e32 v10, 1 4516; GFX9-NEXT: v_add_co_u32_e64 v1, s[6:7], 0, v0 4517; GFX9-NEXT: v_addc_co_u32_e64 v4, s[6:7], v0, v10, s[6:7] 4518; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc 4519; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc 4520; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc 4521; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v6 4522; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v3, v7, vcc 4523; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3] 4524; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7] 4525; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v5 4526; GFX9-NEXT: v_add_co_u32_e64 v3, s[6:7], 0, v2 4527; GFX9-NEXT: v_addc_co_u32_e64 v6, s[6:7], v2, v10, s[6:7] 4528; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc 4529; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc 4530; 
GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc 4531; GFX9-NEXT: s_setpc_b64 s[30:31] 4532; 4533; GFX10-LABEL: v_saddsat_v2i64: 4534; GFX10: ; %bb.0: 4535; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4536; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4537; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v0, v4 4538; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo 4539; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v2, v6 4540; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo 4541; GFX10-NEXT: v_ashrrev_i32_e32 v12, 31, v9 4542; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1] 4543; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[4:5] 4544; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v11 4545; GFX10-NEXT: v_cmp_gt_i64_e64 s6, 0, v[6:7] 4546; GFX10-NEXT: v_add_co_u32 v1, s5, v12, 0 4547; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s5, 0x80000000, v12, s5 4548; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[2:3] 4549; GFX10-NEXT: v_add_co_u32 v2, s7, v0, 0 4550; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s7, 0x80000000, v0, s7 4551; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo 4552; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc_lo 4553; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc_lo 4554; GFX10-NEXT: s_xor_b32 vcc_lo, s6, s5 4555; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo 4556; GFX10-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc_lo 4557; GFX10-NEXT: s_setpc_b64 s[30:31] 4558 %result = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) 4559 ret <2 x i64> %result 4560} 4561 4562define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs) { 4563; GFX6-LABEL: s_saddsat_v2i64: 4564; GFX6: ; %bb.0: 4565; GFX6-NEXT: s_add_u32 s8, s0, s4 4566; GFX6-NEXT: s_cselect_b32 s9, 1, 0 4567; GFX6-NEXT: s_and_b32 s9, s9, 1 4568; GFX6-NEXT: s_cmp_lg_u32 s9, 0 4569; GFX6-NEXT: v_mov_b32_e32 v0, s0 4570; GFX6-NEXT: s_addc_u32 s9, s1, s5 4571; GFX6-NEXT: v_mov_b32_e32 v1, s1 4572; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] 4573; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], 
s[4:5], 0 4574; GFX6-NEXT: s_ashr_i32 s4, s9, 31 4575; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc 4576; GFX6-NEXT: s_add_u32 s0, s4, 0 4577; GFX6-NEXT: s_cselect_b32 s1, 1, 0 4578; GFX6-NEXT: s_and_b32 s1, s1, 1 4579; GFX6-NEXT: s_brev_b32 s5, 1 4580; GFX6-NEXT: s_cmp_lg_u32 s1, 0 4581; GFX6-NEXT: s_addc_u32 s1, s4, s5 4582; GFX6-NEXT: v_mov_b32_e32 v1, s0 4583; GFX6-NEXT: s_add_u32 s0, s2, s6 4584; GFX6-NEXT: v_mov_b32_e32 v2, s1 4585; GFX6-NEXT: s_cselect_b32 s1, 1, 0 4586; GFX6-NEXT: v_mov_b32_e32 v0, s8 4587; GFX6-NEXT: s_and_b32 s1, s1, 1 4588; GFX6-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc 4589; GFX6-NEXT: s_cmp_lg_u32 s1, 0 4590; GFX6-NEXT: v_mov_b32_e32 v0, s2 4591; GFX6-NEXT: v_mov_b32_e32 v3, s9 4592; GFX6-NEXT: s_addc_u32 s1, s3, s7 4593; GFX6-NEXT: v_mov_b32_e32 v1, s3 4594; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc 4595; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] 4596; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 4597; GFX6-NEXT: s_ashr_i32 s4, s1, 31 4598; GFX6-NEXT: s_xor_b64 vcc, s[2:3], vcc 4599; GFX6-NEXT: v_mov_b32_e32 v0, s0 4600; GFX6-NEXT: s_add_u32 s0, s4, 0 4601; GFX6-NEXT: s_cselect_b32 s2, 1, 0 4602; GFX6-NEXT: s_and_b32 s2, s2, 1 4603; GFX6-NEXT: s_cmp_lg_u32 s2, 0 4604; GFX6-NEXT: s_addc_u32 s3, s4, s5 4605; GFX6-NEXT: v_mov_b32_e32 v1, s0 4606; GFX6-NEXT: v_mov_b32_e32 v3, s3 4607; GFX6-NEXT: v_mov_b32_e32 v5, s1 4608; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 4609; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc 4610; GFX6-NEXT: v_readfirstlane_b32 s0, v4 4611; GFX6-NEXT: v_readfirstlane_b32 s1, v2 4612; GFX6-NEXT: v_readfirstlane_b32 s2, v0 4613; GFX6-NEXT: v_readfirstlane_b32 s3, v1 4614; GFX6-NEXT: ; return to shader part epilog 4615; 4616; GFX8-LABEL: s_saddsat_v2i64: 4617; GFX8: ; %bb.0: 4618; GFX8-NEXT: s_add_u32 s8, s0, s4 4619; GFX8-NEXT: s_cselect_b32 s9, 1, 0 4620; GFX8-NEXT: s_and_b32 s9, s9, 1 4621; GFX8-NEXT: s_cmp_lg_u32 s9, 0 4622; GFX8-NEXT: v_mov_b32_e32 v0, s0 4623; GFX8-NEXT: s_addc_u32 s9, s1, s5 4624; GFX8-NEXT: 
v_mov_b32_e32 v1, s1 4625; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] 4626; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 4627; GFX8-NEXT: s_ashr_i32 s4, s9, 31 4628; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc 4629; GFX8-NEXT: s_add_u32 s0, s4, 0 4630; GFX8-NEXT: s_cselect_b32 s1, 1, 0 4631; GFX8-NEXT: s_and_b32 s1, s1, 1 4632; GFX8-NEXT: s_brev_b32 s5, 1 4633; GFX8-NEXT: s_cmp_lg_u32 s1, 0 4634; GFX8-NEXT: s_addc_u32 s1, s4, s5 4635; GFX8-NEXT: v_mov_b32_e32 v1, s0 4636; GFX8-NEXT: s_add_u32 s0, s2, s6 4637; GFX8-NEXT: v_mov_b32_e32 v2, s1 4638; GFX8-NEXT: s_cselect_b32 s1, 1, 0 4639; GFX8-NEXT: v_mov_b32_e32 v0, s8 4640; GFX8-NEXT: s_and_b32 s1, s1, 1 4641; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc 4642; GFX8-NEXT: s_cmp_lg_u32 s1, 0 4643; GFX8-NEXT: v_mov_b32_e32 v0, s2 4644; GFX8-NEXT: v_mov_b32_e32 v3, s9 4645; GFX8-NEXT: s_addc_u32 s1, s3, s7 4646; GFX8-NEXT: v_mov_b32_e32 v1, s3 4647; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc 4648; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] 4649; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 4650; GFX8-NEXT: s_ashr_i32 s4, s1, 31 4651; GFX8-NEXT: s_xor_b64 vcc, s[2:3], vcc 4652; GFX8-NEXT: v_mov_b32_e32 v0, s0 4653; GFX8-NEXT: s_add_u32 s0, s4, 0 4654; GFX8-NEXT: s_cselect_b32 s2, 1, 0 4655; GFX8-NEXT: s_and_b32 s2, s2, 1 4656; GFX8-NEXT: s_cmp_lg_u32 s2, 0 4657; GFX8-NEXT: s_addc_u32 s3, s4, s5 4658; GFX8-NEXT: v_mov_b32_e32 v1, s0 4659; GFX8-NEXT: v_mov_b32_e32 v3, s3 4660; GFX8-NEXT: v_mov_b32_e32 v5, s1 4661; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 4662; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc 4663; GFX8-NEXT: v_readfirstlane_b32 s0, v4 4664; GFX8-NEXT: v_readfirstlane_b32 s1, v2 4665; GFX8-NEXT: v_readfirstlane_b32 s2, v0 4666; GFX8-NEXT: v_readfirstlane_b32 s3, v1 4667; GFX8-NEXT: ; return to shader part epilog 4668; 4669; GFX9-LABEL: s_saddsat_v2i64: 4670; GFX9: ; %bb.0: 4671; GFX9-NEXT: s_add_u32 s8, s0, s4 4672; GFX9-NEXT: s_cselect_b32 s9, 1, 0 4673; GFX9-NEXT: s_and_b32 s9, s9, 1 4674; 
GFX9-NEXT: s_cmp_lg_u32 s9, 0 4675; GFX9-NEXT: v_mov_b32_e32 v0, s0 4676; GFX9-NEXT: s_addc_u32 s9, s1, s5 4677; GFX9-NEXT: v_mov_b32_e32 v1, s1 4678; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] 4679; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 4680; GFX9-NEXT: s_ashr_i32 s4, s9, 31 4681; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc 4682; GFX9-NEXT: s_add_u32 s0, s4, 0 4683; GFX9-NEXT: s_cselect_b32 s1, 1, 0 4684; GFX9-NEXT: s_and_b32 s1, s1, 1 4685; GFX9-NEXT: s_brev_b32 s5, 1 4686; GFX9-NEXT: s_cmp_lg_u32 s1, 0 4687; GFX9-NEXT: s_addc_u32 s1, s4, s5 4688; GFX9-NEXT: v_mov_b32_e32 v1, s0 4689; GFX9-NEXT: s_add_u32 s0, s2, s6 4690; GFX9-NEXT: v_mov_b32_e32 v2, s1 4691; GFX9-NEXT: s_cselect_b32 s1, 1, 0 4692; GFX9-NEXT: v_mov_b32_e32 v0, s8 4693; GFX9-NEXT: s_and_b32 s1, s1, 1 4694; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc 4695; GFX9-NEXT: s_cmp_lg_u32 s1, 0 4696; GFX9-NEXT: v_mov_b32_e32 v0, s2 4697; GFX9-NEXT: v_mov_b32_e32 v3, s9 4698; GFX9-NEXT: s_addc_u32 s1, s3, s7 4699; GFX9-NEXT: v_mov_b32_e32 v1, s3 4700; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc 4701; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] 4702; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 4703; GFX9-NEXT: s_ashr_i32 s4, s1, 31 4704; GFX9-NEXT: s_xor_b64 vcc, s[2:3], vcc 4705; GFX9-NEXT: v_mov_b32_e32 v0, s0 4706; GFX9-NEXT: s_add_u32 s0, s4, 0 4707; GFX9-NEXT: s_cselect_b32 s2, 1, 0 4708; GFX9-NEXT: s_and_b32 s2, s2, 1 4709; GFX9-NEXT: s_cmp_lg_u32 s2, 0 4710; GFX9-NEXT: s_addc_u32 s3, s4, s5 4711; GFX9-NEXT: v_mov_b32_e32 v1, s0 4712; GFX9-NEXT: v_mov_b32_e32 v3, s3 4713; GFX9-NEXT: v_mov_b32_e32 v5, s1 4714; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 4715; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc 4716; GFX9-NEXT: v_readfirstlane_b32 s0, v4 4717; GFX9-NEXT: v_readfirstlane_b32 s1, v2 4718; GFX9-NEXT: v_readfirstlane_b32 s2, v0 4719; GFX9-NEXT: v_readfirstlane_b32 s3, v1 4720; GFX9-NEXT: ; return to shader part epilog 4721; 4722; GFX10-LABEL: s_saddsat_v2i64: 4723; GFX10: ; %bb.0: 
4724; GFX10-NEXT: s_add_u32 s8, s0, s4 4725; GFX10-NEXT: s_cselect_b32 s9, 1, 0 4726; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[4:5], 0 4727; GFX10-NEXT: s_and_b32 s9, s9, 1 4728; GFX10-NEXT: v_mov_b32_e32 v0, s8 4729; GFX10-NEXT: s_cmp_lg_u32 s9, 0 4730; GFX10-NEXT: s_brev_b32 s10, 1 4731; GFX10-NEXT: s_addc_u32 s9, s1, s5 4732; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[0:1] 4733; GFX10-NEXT: s_ashr_i32 s1, s9, 31 4734; GFX10-NEXT: v_mov_b32_e32 v1, s9 4735; GFX10-NEXT: s_xor_b32 s8, s4, s0 4736; GFX10-NEXT: s_add_u32 s0, s1, 0 4737; GFX10-NEXT: s_cselect_b32 s4, 1, 0 4738; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s8 4739; GFX10-NEXT: s_and_b32 s4, s4, 1 4740; GFX10-NEXT: s_cmp_lg_u32 s4, 0 4741; GFX10-NEXT: s_addc_u32 s1, s1, s10 4742; GFX10-NEXT: s_add_u32 s4, s2, s6 4743; GFX10-NEXT: s_cselect_b32 s5, 1, 0 4744; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s8 4745; GFX10-NEXT: s_and_b32 s5, s5, 1 4746; GFX10-NEXT: v_mov_b32_e32 v2, s4 4747; GFX10-NEXT: s_cmp_lg_u32 s5, 0 4748; GFX10-NEXT: s_addc_u32 s5, s3, s7 4749; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[4:5], s[2:3] 4750; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[6:7], 0 4751; GFX10-NEXT: s_ashr_i32 s1, s5, 31 4752; GFX10-NEXT: v_mov_b32_e32 v3, s5 4753; GFX10-NEXT: s_xor_b32 s2, s3, s2 4754; GFX10-NEXT: s_add_u32 s0, s1, 0 4755; GFX10-NEXT: s_cselect_b32 s3, 1, 0 4756; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s0, s2 4757; GFX10-NEXT: s_and_b32 s3, s3, 1 4758; GFX10-NEXT: v_readfirstlane_b32 s0, v0 4759; GFX10-NEXT: s_cmp_lg_u32 s3, 0 4760; GFX10-NEXT: s_addc_u32 s1, s1, s10 4761; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s1, s2 4762; GFX10-NEXT: v_readfirstlane_b32 s1, v1 4763; GFX10-NEXT: v_readfirstlane_b32 s2, v2 4764; GFX10-NEXT: v_readfirstlane_b32 s3, v3 4765; GFX10-NEXT: ; return to shader part epilog 4766 %result = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) 4767 ret <2 x i64> %result 4768} 4769 4770define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { 4771; GFX6-LABEL: 
s_saddsat_i128: 4772; GFX6: ; %bb.0: 4773; GFX6-NEXT: s_add_u32 s4, s0, s4 4774; GFX6-NEXT: s_cselect_b32 s8, 1, 0 4775; GFX6-NEXT: s_and_b32 s8, s8, 1 4776; GFX6-NEXT: s_cmp_lg_u32 s8, 0 4777; GFX6-NEXT: s_addc_u32 s5, s1, s5 4778; GFX6-NEXT: s_cselect_b32 s8, 1, 0 4779; GFX6-NEXT: s_and_b32 s8, s8, 1 4780; GFX6-NEXT: s_cmp_lg_u32 s8, 0 4781; GFX6-NEXT: s_addc_u32 s8, s2, s6 4782; GFX6-NEXT: s_cselect_b32 s9, 1, 0 4783; GFX6-NEXT: v_mov_b32_e32 v3, s1 4784; GFX6-NEXT: s_and_b32 s9, s9, 1 4785; GFX6-NEXT: v_mov_b32_e32 v2, s0 4786; GFX6-NEXT: s_cmp_lg_u32 s9, 0 4787; GFX6-NEXT: v_mov_b32_e32 v0, s2 4788; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] 4789; GFX6-NEXT: s_addc_u32 s9, s3, s7 4790; GFX6-NEXT: v_mov_b32_e32 v1, s3 4791; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc 4792; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] 4793; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], 0 4794; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc 4795; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 4796; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] 4797; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], 0 4798; GFX6-NEXT: s_ashr_i32 s3, s9, 31 4799; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] 4800; GFX6-NEXT: s_add_u32 s0, s3, 0 4801; GFX6-NEXT: s_cselect_b32 s1, 1, 0 4802; GFX6-NEXT: s_and_b32 s1, s1, 1 4803; GFX6-NEXT: s_cmp_lg_u32 s1, 0 4804; GFX6-NEXT: s_addc_u32 s1, s3, 0 4805; GFX6-NEXT: s_cselect_b32 s2, 1, 0 4806; GFX6-NEXT: s_and_b32 s2, s2, 1 4807; GFX6-NEXT: s_cmp_lg_u32 s2, 0 4808; GFX6-NEXT: s_addc_u32 s2, s3, 0 4809; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc 4810; GFX6-NEXT: s_cselect_b32 s6, 1, 0 4811; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 4812; GFX6-NEXT: s_and_b32 s6, s6, 1 4813; GFX6-NEXT: s_cmp_lg_u32 s6, 0 4814; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 4815; GFX6-NEXT: s_addc_u32 s3, s3, 0x80000000 4816; GFX6-NEXT: v_mov_b32_e32 v1, s0 4817; GFX6-NEXT: v_mov_b32_e32 v2, s1 4818; GFX6-NEXT: v_mov_b32_e32 v3, s4 4819; GFX6-NEXT: v_mov_b32_e32 v4, s5 4820; GFX6-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v0 4821; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc 4822; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc 4823; GFX6-NEXT: v_mov_b32_e32 v2, s2 4824; GFX6-NEXT: v_mov_b32_e32 v3, s3 4825; GFX6-NEXT: v_mov_b32_e32 v4, s8 4826; GFX6-NEXT: v_mov_b32_e32 v5, s9 4827; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 4828; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 4829; GFX6-NEXT: v_readfirstlane_b32 s0, v0 4830; GFX6-NEXT: v_readfirstlane_b32 s1, v1 4831; GFX6-NEXT: v_readfirstlane_b32 s2, v2 4832; GFX6-NEXT: v_readfirstlane_b32 s3, v3 4833; GFX6-NEXT: ; return to shader part epilog 4834; 4835; GFX8-LABEL: s_saddsat_i128: 4836; GFX8: ; %bb.0: 4837; GFX8-NEXT: s_add_u32 s4, s0, s4 4838; GFX8-NEXT: s_cselect_b32 s8, 1, 0 4839; GFX8-NEXT: s_and_b32 s8, s8, 1 4840; GFX8-NEXT: s_cmp_lg_u32 s8, 0 4841; GFX8-NEXT: s_addc_u32 s5, s1, s5 4842; GFX8-NEXT: s_cselect_b32 s8, 1, 0 4843; GFX8-NEXT: s_and_b32 s8, s8, 1 4844; GFX8-NEXT: s_cmp_lg_u32 s8, 0 4845; GFX8-NEXT: s_addc_u32 s8, s2, s6 4846; GFX8-NEXT: s_cselect_b32 s9, 1, 0 4847; GFX8-NEXT: s_and_b32 s9, s9, 1 4848; GFX8-NEXT: v_mov_b32_e32 v3, s1 4849; GFX8-NEXT: s_cmp_lg_u32 s9, 0 4850; GFX8-NEXT: v_mov_b32_e32 v2, s0 4851; GFX8-NEXT: s_addc_u32 s9, s3, s7 4852; GFX8-NEXT: v_mov_b32_e32 v0, s2 4853; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] 4854; GFX8-NEXT: v_mov_b32_e32 v1, s3 4855; GFX8-NEXT: s_cmp_eq_u64 s[8:9], s[2:3] 4856; GFX8-NEXT: s_cselect_b32 s2, 1, 0 4857; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc 4858; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] 4859; GFX8-NEXT: s_and_b32 s0, 1, s2 4860; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 4861; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 4862; GFX8-NEXT: s_cmp_eq_u64 s[6:7], 0 4863; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], 0 4864; GFX8-NEXT: s_cselect_b32 s2, 1, 0 4865; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] 4866; GFX8-NEXT: s_and_b32 s0, 1, s2 4867; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 4868; GFX8-NEXT: s_ashr_i32 s3, s9, 31 
4869; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] 4870; GFX8-NEXT: s_add_u32 s0, s3, 0 4871; GFX8-NEXT: s_cselect_b32 s1, 1, 0 4872; GFX8-NEXT: s_and_b32 s1, s1, 1 4873; GFX8-NEXT: s_cmp_lg_u32 s1, 0 4874; GFX8-NEXT: s_addc_u32 s1, s3, 0 4875; GFX8-NEXT: s_cselect_b32 s2, 1, 0 4876; GFX8-NEXT: s_and_b32 s2, s2, 1 4877; GFX8-NEXT: s_cmp_lg_u32 s2, 0 4878; GFX8-NEXT: s_addc_u32 s2, s3, 0 4879; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4880; GFX8-NEXT: s_cselect_b32 s6, 1, 0 4881; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 4882; GFX8-NEXT: s_and_b32 s6, s6, 1 4883; GFX8-NEXT: s_cmp_lg_u32 s6, 0 4884; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 4885; GFX8-NEXT: s_addc_u32 s3, s3, 0x80000000 4886; GFX8-NEXT: v_mov_b32_e32 v1, s0 4887; GFX8-NEXT: v_mov_b32_e32 v2, s1 4888; GFX8-NEXT: v_mov_b32_e32 v3, s4 4889; GFX8-NEXT: v_mov_b32_e32 v4, s5 4890; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 4891; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc 4892; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc 4893; GFX8-NEXT: v_mov_b32_e32 v2, s2 4894; GFX8-NEXT: v_mov_b32_e32 v3, s3 4895; GFX8-NEXT: v_mov_b32_e32 v4, s8 4896; GFX8-NEXT: v_mov_b32_e32 v5, s9 4897; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 4898; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 4899; GFX8-NEXT: v_readfirstlane_b32 s0, v0 4900; GFX8-NEXT: v_readfirstlane_b32 s1, v1 4901; GFX8-NEXT: v_readfirstlane_b32 s2, v2 4902; GFX8-NEXT: v_readfirstlane_b32 s3, v3 4903; GFX8-NEXT: ; return to shader part epilog 4904; 4905; GFX9-LABEL: s_saddsat_i128: 4906; GFX9: ; %bb.0: 4907; GFX9-NEXT: s_add_u32 s4, s0, s4 4908; GFX9-NEXT: s_cselect_b32 s8, 1, 0 4909; GFX9-NEXT: s_and_b32 s8, s8, 1 4910; GFX9-NEXT: s_cmp_lg_u32 s8, 0 4911; GFX9-NEXT: s_addc_u32 s5, s1, s5 4912; GFX9-NEXT: s_cselect_b32 s8, 1, 0 4913; GFX9-NEXT: s_and_b32 s8, s8, 1 4914; GFX9-NEXT: s_cmp_lg_u32 s8, 0 4915; GFX9-NEXT: s_addc_u32 s8, s2, s6 4916; GFX9-NEXT: s_cselect_b32 s9, 1, 0 4917; GFX9-NEXT: s_and_b32 s9, s9, 1 4918; GFX9-NEXT: v_mov_b32_e32 v3, s1 4919; GFX9-NEXT: 
s_cmp_lg_u32 s9, 0 4920; GFX9-NEXT: v_mov_b32_e32 v2, s0 4921; GFX9-NEXT: s_addc_u32 s9, s3, s7 4922; GFX9-NEXT: v_mov_b32_e32 v0, s2 4923; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] 4924; GFX9-NEXT: v_mov_b32_e32 v1, s3 4925; GFX9-NEXT: s_cmp_eq_u64 s[8:9], s[2:3] 4926; GFX9-NEXT: s_cselect_b32 s2, 1, 0 4927; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc 4928; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] 4929; GFX9-NEXT: s_and_b32 s0, 1, s2 4930; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 4931; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 4932; GFX9-NEXT: s_cmp_eq_u64 s[6:7], 0 4933; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], 0 4934; GFX9-NEXT: s_cselect_b32 s2, 1, 0 4935; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] 4936; GFX9-NEXT: s_and_b32 s0, 1, s2 4937; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 4938; GFX9-NEXT: s_ashr_i32 s3, s9, 31 4939; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] 4940; GFX9-NEXT: s_add_u32 s0, s3, 0 4941; GFX9-NEXT: s_cselect_b32 s1, 1, 0 4942; GFX9-NEXT: s_and_b32 s1, s1, 1 4943; GFX9-NEXT: s_cmp_lg_u32 s1, 0 4944; GFX9-NEXT: s_addc_u32 s1, s3, 0 4945; GFX9-NEXT: s_cselect_b32 s2, 1, 0 4946; GFX9-NEXT: s_and_b32 s2, s2, 1 4947; GFX9-NEXT: s_cmp_lg_u32 s2, 0 4948; GFX9-NEXT: s_addc_u32 s2, s3, 0 4949; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4950; GFX9-NEXT: s_cselect_b32 s6, 1, 0 4951; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 4952; GFX9-NEXT: s_and_b32 s6, s6, 1 4953; GFX9-NEXT: s_cmp_lg_u32 s6, 0 4954; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 4955; GFX9-NEXT: s_addc_u32 s3, s3, 0x80000000 4956; GFX9-NEXT: v_mov_b32_e32 v1, s0 4957; GFX9-NEXT: v_mov_b32_e32 v2, s1 4958; GFX9-NEXT: v_mov_b32_e32 v3, s4 4959; GFX9-NEXT: v_mov_b32_e32 v4, s5 4960; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 4961; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc 4962; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc 4963; GFX9-NEXT: v_mov_b32_e32 v2, s2 4964; GFX9-NEXT: v_mov_b32_e32 v3, s3 4965; GFX9-NEXT: v_mov_b32_e32 v4, s8 4966; GFX9-NEXT: v_mov_b32_e32 v5, s9 4967; 
GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 4968; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 4969; GFX9-NEXT: v_readfirstlane_b32 s0, v0 4970; GFX9-NEXT: v_readfirstlane_b32 s1, v1 4971; GFX9-NEXT: v_readfirstlane_b32 s2, v2 4972; GFX9-NEXT: v_readfirstlane_b32 s3, v3 4973; GFX9-NEXT: ; return to shader part epilog 4974; 4975; GFX10-LABEL: s_saddsat_i128: 4976; GFX10: ; %bb.0: 4977; GFX10-NEXT: s_add_u32 s4, s0, s4 4978; GFX10-NEXT: s_cselect_b32 s8, 1, 0 4979; GFX10-NEXT: s_and_b32 s8, s8, 1 4980; GFX10-NEXT: s_cmp_lg_u32 s8, 0 4981; GFX10-NEXT: s_addc_u32 s5, s1, s5 4982; GFX10-NEXT: s_cselect_b32 s8, 1, 0 4983; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], s[0:1] 4984; GFX10-NEXT: s_and_b32 s8, s8, 1 4985; GFX10-NEXT: v_mov_b32_e32 v2, s5 4986; GFX10-NEXT: s_cmp_lg_u32 s8, 0 4987; GFX10-NEXT: s_addc_u32 s8, s2, s6 4988; GFX10-NEXT: s_cselect_b32 s9, 1, 0 4989; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 4990; GFX10-NEXT: s_and_b32 s9, s9, 1 4991; GFX10-NEXT: v_mov_b32_e32 v3, s8 4992; GFX10-NEXT: s_cmp_lg_u32 s9, 0 4993; GFX10-NEXT: s_addc_u32 s9, s3, s7 4994; GFX10-NEXT: s_cmp_eq_u64 s[8:9], s[2:3] 4995; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[8:9], s[2:3] 4996; GFX10-NEXT: s_cselect_b32 s0, 1, 0 4997; GFX10-NEXT: v_mov_b32_e32 v4, s9 4998; GFX10-NEXT: s_and_b32 s0, 1, s0 4999; GFX10-NEXT: s_cmp_eq_u64 s[6:7], 0 5000; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 5001; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[6:7], 0 5002; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 5003; GFX10-NEXT: s_cselect_b32 s1, 1, 0 5004; GFX10-NEXT: s_ashr_i32 s3, s9, 31 5005; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo 5006; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 5007; GFX10-NEXT: s_and_b32 s0, 1, s1 5008; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 5009; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, s0 5010; GFX10-NEXT: s_add_u32 s0, s3, 0 5011; GFX10-NEXT: s_cselect_b32 s1, 1, 0 5012; GFX10-NEXT: s_and_b32 s1, s1, 1 5013; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 5014; GFX10-NEXT: s_cmp_lg_u32 s1, 0 5015; 
GFX10-NEXT: v_mov_b32_e32 v1, s4 5016; GFX10-NEXT: s_addc_u32 s1, s3, 0 5017; GFX10-NEXT: s_cselect_b32 s2, 1, 0 5018; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 5019; GFX10-NEXT: s_and_b32 s2, s2, 1 5020; GFX10-NEXT: s_cmp_lg_u32 s2, 0 5021; GFX10-NEXT: s_addc_u32 s2, s3, 0 5022; GFX10-NEXT: s_cselect_b32 s4, 1, 0 5023; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 5024; GFX10-NEXT: s_and_b32 s4, s4, 1 5025; GFX10-NEXT: s_cmp_lg_u32 s4, 0 5026; GFX10-NEXT: s_addc_u32 s3, s3, 0x80000000 5027; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo 5028; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo 5029; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, s2, vcc_lo 5030; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, s3, vcc_lo 5031; GFX10-NEXT: v_readfirstlane_b32 s0, v0 5032; GFX10-NEXT: v_readfirstlane_b32 s1, v1 5033; GFX10-NEXT: v_readfirstlane_b32 s2, v2 5034; GFX10-NEXT: v_readfirstlane_b32 s3, v3 5035; GFX10-NEXT: ; return to shader part epilog 5036 %result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs) 5037 ret i128 %result 5038} 5039 5040define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) { 5041; GFX6-LABEL: saddsat_i128_sv: 5042; GFX6: ; %bb.0: 5043; GFX6-NEXT: v_mov_b32_e32 v4, s1 5044; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 5045; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc 5046; GFX6-NEXT: v_mov_b32_e32 v4, s2 5047; GFX6-NEXT: v_mov_b32_e32 v5, s3 5048; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v2, vcc 5049; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v3, vcc 5050; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] 5051; GFX6-NEXT: v_bfrev_b32_e32 v8, 1 5052; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc 5053; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[4:5] 5054; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc 5055; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[4:5] 5056; GFX6-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc 5057; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] 5058; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc 5059; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] 5060; 
GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v5 5061; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, 0, vcc 5062; GFX6-NEXT: v_xor_b32_e32 v2, v2, v6 5063; GFX6-NEXT: v_add_i32_e32 v6, vcc, 0, v3 5064; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc 5065; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc 5066; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc 5067; GFX6-NEXT: v_and_b32_e32 v2, 1, v2 5068; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 5069; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc 5070; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc 5071; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v9, vcc 5072; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 5073; GFX6-NEXT: ; return to shader part epilog 5074; 5075; GFX8-LABEL: saddsat_i128_sv: 5076; GFX8: ; %bb.0: 5077; GFX8-NEXT: v_mov_b32_e32 v4, s1 5078; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 5079; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc 5080; GFX8-NEXT: v_mov_b32_e32 v4, s2 5081; GFX8-NEXT: v_mov_b32_e32 v5, s3 5082; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v2, vcc 5083; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v3, vcc 5084; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] 5085; GFX8-NEXT: v_bfrev_b32_e32 v8, 1 5086; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc 5087; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[4:5] 5088; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc 5089; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[4:5] 5090; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc 5091; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] 5092; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc 5093; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] 5094; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v5 5095; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, 0, vcc 5096; GFX8-NEXT: v_xor_b32_e32 v2, v2, v6 5097; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0, v3 5098; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc 5099; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc 5100; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc 5101; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 5102; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 5103; GFX8-NEXT: 
v_cndmask_b32_e32 v0, v0, v6, vcc 5104; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc 5105; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v9, vcc 5106; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 5107; GFX8-NEXT: ; return to shader part epilog 5108; 5109; GFX9-LABEL: saddsat_i128_sv: 5110; GFX9: ; %bb.0: 5111; GFX9-NEXT: v_mov_b32_e32 v4, s1 5112; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 5113; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc 5114; GFX9-NEXT: v_mov_b32_e32 v4, s2 5115; GFX9-NEXT: v_mov_b32_e32 v5, s3 5116; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v2, vcc 5117; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v3, vcc 5118; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] 5119; GFX9-NEXT: v_bfrev_b32_e32 v8, 1 5120; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc 5121; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[4:5] 5122; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc 5123; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[4:5] 5124; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc 5125; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] 5126; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc 5127; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] 5128; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v5 5129; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, 0, vcc 5130; GFX9-NEXT: v_xor_b32_e32 v2, v2, v6 5131; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 0, v3 5132; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v3, vcc 5133; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v3, vcc 5134; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc 5135; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 5136; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 5137; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc 5138; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc 5139; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v9, vcc 5140; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 5141; GFX9-NEXT: ; return to shader part epilog 5142; 5143; GFX10-LABEL: saddsat_i128_sv: 5144; GFX10: ; %bb.0: 5145; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 5146; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo 
5147; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s2, v2, vcc_lo 5148; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s3, v3, vcc_lo 5149; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1] 5150; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo 5151; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[4:5] 5152; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo 5153; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] 5154; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo 5155; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[4:5] 5156; GFX10-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc_lo 5157; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] 5158; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v5 5159; GFX10-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc_lo 5160; GFX10-NEXT: v_xor_b32_e32 v2, v2, v6 5161; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v3, 0 5162; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v3, vcc_lo 5163; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 5164; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v2 5165; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v3, vcc_lo 5166; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0x80000000, v3, vcc_lo 5167; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s0 5168; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s0 5169; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, v2, s0 5170; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v3, s0 5171; GFX10-NEXT: ; return to shader part epilog 5172 %result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs) 5173 %cast = bitcast i128 %result to <4 x float> 5174 ret <4 x float> %cast 5175} 5176 5177define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) { 5178; GFX6-LABEL: saddsat_i128_vs: 5179; GFX6: ; %bb.0: 5180; GFX6-NEXT: v_mov_b32_e32 v5, s1 5181; GFX6-NEXT: v_add_i32_e32 v4, vcc, s0, v0 5182; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v1, v5, vcc 5183; GFX6-NEXT: v_mov_b32_e32 v6, s2 5184; GFX6-NEXT: v_mov_b32_e32 v7, s3 5185; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v2, v6, vcc 5186; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v3, v7, vcc 5187; GFX6-NEXT: 
v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1] 5188; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 5189; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 5190; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3] 5191; GFX6-NEXT: v_bfrev_b32_e32 v8, 1 5192; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5193; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] 5194; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 5195; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] 5196; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[2:3], 0 5197; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] 5198; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 5199; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v7 5200; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0, v1 5201; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 5202; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc 5203; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v1, v8, vcc 5204; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 5205; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5206; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc 5207; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc 5208; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc 5209; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc 5210; GFX6-NEXT: ; return to shader part epilog 5211; 5212; GFX8-LABEL: saddsat_i128_vs: 5213; GFX8: ; %bb.0: 5214; GFX8-NEXT: v_mov_b32_e32 v5, s1 5215; GFX8-NEXT: v_add_u32_e32 v4, vcc, s0, v0 5216; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v1, v5, vcc 5217; GFX8-NEXT: v_mov_b32_e32 v6, s2 5218; GFX8-NEXT: v_mov_b32_e32 v7, s3 5219; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v2, v6, vcc 5220; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v3, v7, vcc 5221; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1] 5222; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 5223; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 5224; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3] 5225; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 5226; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5227; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] 5228; GFX8-NEXT: s_cselect_b32 s4, 1, 0 5229; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 
5230; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] 5231; GFX8-NEXT: s_and_b32 s0, 1, s4 5232; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 5233; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] 5234; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 5235; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v7 5236; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0, v1 5237; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 5238; GFX8-NEXT: v_bfrev_b32_e32 v8, 1 5239; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc 5240; GFX8-NEXT: v_addc_u32_e32 v8, vcc, v1, v8, vcc 5241; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 5242; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5243; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc 5244; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc 5245; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc 5246; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc 5247; GFX8-NEXT: ; return to shader part epilog 5248; 5249; GFX9-LABEL: saddsat_i128_vs: 5250; GFX9: ; %bb.0: 5251; GFX9-NEXT: v_mov_b32_e32 v5, s1 5252; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v0 5253; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v5, vcc 5254; GFX9-NEXT: v_mov_b32_e32 v6, s2 5255; GFX9-NEXT: v_mov_b32_e32 v7, s3 5256; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v2, v6, vcc 5257; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v3, v7, vcc 5258; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1] 5259; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 5260; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 5261; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3] 5262; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 5263; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5264; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] 5265; GFX9-NEXT: s_cselect_b32 s4, 1, 0 5266; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 5267; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] 5268; GFX9-NEXT: s_and_b32 s0, 1, s4 5269; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 5270; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] 5271; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 5272; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v7 5273; GFX9-NEXT: 
v_add_co_u32_e32 v2, vcc, 0, v1 5274; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc 5275; GFX9-NEXT: v_bfrev_b32_e32 v8, 1 5276; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc 5277; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v1, v8, vcc 5278; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 5279; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5280; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc 5281; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc 5282; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc 5283; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc 5284; GFX9-NEXT: ; return to shader part epilog 5285; 5286; GFX10-LABEL: saddsat_i128_vs: 5287; GFX10: ; %bb.0: 5288; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, s0 5289; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo 5290; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo 5291; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo 5292; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1] 5293; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0 5294; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0 5295; GFX10-NEXT: s_cselect_b32 s0, 1, 0 5296; GFX10-NEXT: s_and_b32 s0, 1, s0 5297; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 5298; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3] 5299; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s1 5300; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 5301; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo 5302; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] 5303; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo 5304; GFX10-NEXT: v_cndmask_b32_e64 v1, v8, 0, s0 5305; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 5306; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v7 5307; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 5308; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v1, 0 5309; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo 5310; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v1, vcc_lo 5311; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 5312; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0x80000000, v1, vcc_lo 5313; GFX10-NEXT: v_cndmask_b32_e64 v0, 
v4, v2, s0 5314; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v3, s0 5315; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v8, s0 5316; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 5317; GFX10-NEXT: ; return to shader part epilog 5318 %result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs) 5319 %cast = bitcast i128 %result to <4 x float> 5320 ret <4 x float> %cast 5321} 5322 5323define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { 5324; GFX6-LABEL: v_saddsat_v2i128: 5325; GFX6: ; %bb.0: 5326; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5327; GFX6-NEXT: v_add_i32_e32 v8, vcc, v0, v8 5328; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v1, v9, vcc 5329; GFX6-NEXT: v_addc_u32_e32 v16, vcc, v2, v10, vcc 5330; GFX6-NEXT: v_addc_u32_e32 v17, vcc, v3, v11, vcc 5331; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[0:1] 5332; GFX6-NEXT: v_bfrev_b32_e32 v18, 1 5333; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 5334; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[16:17], v[2:3] 5335; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5336; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[16:17], v[2:3] 5337; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 5338; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11] 5339; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5340; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] 5341; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc 5342; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 5343; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v17 5344; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0, v1 5345; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 5346; GFX6-NEXT: v_addc_u32_e32 v10, vcc, 0, v1, vcc 5347; GFX6-NEXT: v_addc_u32_e32 v11, vcc, v1, v18, vcc 5348; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 5349; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5350; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc 5351; GFX6-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc 5352; GFX6-NEXT: v_cndmask_b32_e32 v2, v16, v10, vcc 5353; GFX6-NEXT: v_cndmask_b32_e32 v3, v17, v11, vcc 5354; GFX6-NEXT: v_add_i32_e32 v8, vcc, v4, v12 5355; GFX6-NEXT: v_addc_u32_e32 v9, 
vcc, v5, v13, vcc 5356; GFX6-NEXT: v_addc_u32_e32 v10, vcc, v6, v14, vcc 5357; GFX6-NEXT: v_addc_u32_e32 v11, vcc, v7, v15, vcc 5358; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] 5359; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc 5360; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7] 5361; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc 5362; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7] 5363; GFX6-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc 5364; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[14:15] 5365; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc 5366; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] 5367; GFX6-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc 5368; GFX6-NEXT: v_xor_b32_e32 v4, v5, v4 5369; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v11 5370; GFX6-NEXT: v_add_i32_e32 v6, vcc, 0, v5 5371; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc 5372; GFX6-NEXT: v_addc_u32_e32 v12, vcc, 0, v5, vcc 5373; GFX6-NEXT: v_addc_u32_e32 v13, vcc, v5, v18, vcc 5374; GFX6-NEXT: v_and_b32_e32 v4, 1, v4 5375; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 5376; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc 5377; GFX6-NEXT: v_cndmask_b32_e32 v5, v9, v7, vcc 5378; GFX6-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc 5379; GFX6-NEXT: v_cndmask_b32_e32 v7, v11, v13, vcc 5380; GFX6-NEXT: s_setpc_b64 s[30:31] 5381; 5382; GFX8-LABEL: v_saddsat_v2i128: 5383; GFX8: ; %bb.0: 5384; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5385; GFX8-NEXT: v_add_u32_e32 v8, vcc, v0, v8 5386; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v1, v9, vcc 5387; GFX8-NEXT: v_addc_u32_e32 v16, vcc, v2, v10, vcc 5388; GFX8-NEXT: v_addc_u32_e32 v17, vcc, v3, v11, vcc 5389; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[0:1] 5390; GFX8-NEXT: v_bfrev_b32_e32 v18, 1 5391; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 5392; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[16:17], v[2:3] 5393; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5394; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[16:17], v[2:3] 5395; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 5396; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, 0, 
v[10:11] 5397; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5398; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] 5399; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc 5400; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 5401; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v17 5402; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0, v1 5403; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 5404; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v1, vcc 5405; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v1, v18, vcc 5406; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 5407; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5408; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc 5409; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc 5410; GFX8-NEXT: v_cndmask_b32_e32 v2, v16, v10, vcc 5411; GFX8-NEXT: v_cndmask_b32_e32 v3, v17, v11, vcc 5412; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v12 5413; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v5, v13, vcc 5414; GFX8-NEXT: v_addc_u32_e32 v10, vcc, v6, v14, vcc 5415; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v7, v15, vcc 5416; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] 5417; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc 5418; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7] 5419; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc 5420; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7] 5421; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc 5422; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[14:15] 5423; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc 5424; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] 5425; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc 5426; GFX8-NEXT: v_xor_b32_e32 v4, v5, v4 5427; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v11 5428; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0, v5 5429; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc 5430; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v5, vcc 5431; GFX8-NEXT: v_addc_u32_e32 v13, vcc, v5, v18, vcc 5432; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 5433; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 5434; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc 5435; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v7, vcc 5436; GFX8-NEXT: v_cndmask_b32_e32 v6, 
v10, v12, vcc 5437; GFX8-NEXT: v_cndmask_b32_e32 v7, v11, v13, vcc 5438; GFX8-NEXT: s_setpc_b64 s[30:31] 5439; 5440; GFX9-LABEL: v_saddsat_v2i128: 5441; GFX9: ; %bb.0: 5442; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5443; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v8 5444; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v1, v9, vcc 5445; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, v2, v10, vcc 5446; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, v3, v11, vcc 5447; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[0:1] 5448; GFX9-NEXT: v_bfrev_b32_e32 v18, 1 5449; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 5450; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[16:17], v[2:3] 5451; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5452; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[16:17], v[2:3] 5453; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 5454; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11] 5455; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5456; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] 5457; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc 5458; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 5459; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v17 5460; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0, v1 5461; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc 5462; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v1, vcc 5463; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v1, v18, vcc 5464; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 5465; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5466; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc 5467; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc 5468; GFX9-NEXT: v_cndmask_b32_e32 v2, v16, v10, vcc 5469; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v11, vcc 5470; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v4, v12 5471; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v5, v13, vcc 5472; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v6, v14, vcc 5473; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v7, v15, vcc 5474; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] 5475; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc 5476; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7] 5477; 
GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc 5478; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7] 5479; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc 5480; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[14:15] 5481; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc 5482; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] 5483; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc 5484; GFX9-NEXT: v_xor_b32_e32 v4, v5, v4 5485; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v11 5486; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 0, v5 5487; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v5, vcc 5488; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v5, vcc 5489; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v5, v18, vcc 5490; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 5491; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 5492; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc 5493; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v7, vcc 5494; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v12, vcc 5495; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v13, vcc 5496; GFX9-NEXT: s_setpc_b64 s[30:31] 5497; 5498; GFX10-LABEL: v_saddsat_v2i128: 5499; GFX10: ; %bb.0: 5500; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5501; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 5502; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v0, v8 5503; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v9, vcc_lo 5504; GFX10-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, v2, v10, vcc_lo 5505; GFX10-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, v3, v11, vcc_lo 5506; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[0:1] 5507; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 5508; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[16:17], v[2:3] 5509; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo 5510; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[10:11] 5511; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc_lo 5512; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[16:17], v[2:3] 5513; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo 5514; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11] 5515; GFX10-NEXT: v_cndmask_b32_e64 v1, v18, 0, vcc_lo 5516; GFX10-NEXT: v_add_co_u32 v10, 
vcc_lo, v4, v12 5517; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v5, v13, vcc_lo 5518; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v6, v14, vcc_lo 5519; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, v7, v15, vcc_lo 5520; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[10:11], v[4:5] 5521; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 5522; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v17 5523; GFX10-NEXT: v_cmp_eq_u64_e64 s5, v[12:13], v[6:7] 5524; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 5525; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[12:13], v[6:7] 5526; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 5527; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v1, 0 5528; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo 5529; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 5530; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[14:15] 5531; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v13 5532; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, s4 5533; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v0 5534; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v4, s5 5535; GFX10-NEXT: v_cmp_eq_u64_e64 s5, 0, v[14:15] 5536; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo 5537; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0x80000000, v1, vcc_lo 5538; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, v3, s4 5539; GFX10-NEXT: v_cndmask_b32_e64 v4, v18, 0, s5 5540; GFX10-NEXT: v_xor_b32_e32 v4, v4, v0 5541; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v2, s4 5542; GFX10-NEXT: v_cndmask_b32_e64 v2, v16, v5, s4 5543; GFX10-NEXT: v_and_b32_e32 v3, 1, v4 5544; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v7, 0 5545; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v7, vcc_lo 5546; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v7, vcc_lo 5547; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v3 5548; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0x80000000, v7, vcc_lo 5549; GFX10-NEXT: v_cndmask_b32_e64 v3, v17, v6, s4 5550; GFX10-NEXT: v_cndmask_b32_e64 v4, v10, v4, s5 5551; GFX10-NEXT: v_cndmask_b32_e64 v5, v11, v5, s5 5552; GFX10-NEXT: v_cndmask_b32_e64 v6, v12, v8, s5 5553; GFX10-NEXT: v_cndmask_b32_e64 v7, v13, 
v7, s5 5554; GFX10-NEXT: s_setpc_b64 s[30:31] 5555 %result = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs) 5556 ret <2 x i128> %result 5557} 5558 5559define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs) { 5560; GFX6-LABEL: s_saddsat_v2i128: 5561; GFX6: ; %bb.0: 5562; GFX6-NEXT: s_add_u32 s8, s0, s8 5563; GFX6-NEXT: s_cselect_b32 s16, 1, 0 5564; GFX6-NEXT: s_and_b32 s16, s16, 1 5565; GFX6-NEXT: s_cmp_lg_u32 s16, 0 5566; GFX6-NEXT: s_addc_u32 s9, s1, s9 5567; GFX6-NEXT: s_cselect_b32 s16, 1, 0 5568; GFX6-NEXT: s_and_b32 s16, s16, 1 5569; GFX6-NEXT: s_cmp_lg_u32 s16, 0 5570; GFX6-NEXT: s_addc_u32 s16, s2, s10 5571; GFX6-NEXT: s_cselect_b32 s17, 1, 0 5572; GFX6-NEXT: v_mov_b32_e32 v3, s1 5573; GFX6-NEXT: s_and_b32 s17, s17, 1 5574; GFX6-NEXT: v_mov_b32_e32 v2, s0 5575; GFX6-NEXT: s_cmp_lg_u32 s17, 0 5576; GFX6-NEXT: v_mov_b32_e32 v0, s2 5577; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3] 5578; GFX6-NEXT: s_addc_u32 s17, s3, s11 5579; GFX6-NEXT: v_mov_b32_e32 v1, s3 5580; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc 5581; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[16:17], v[0:1] 5582; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[10:11], 0 5583; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc 5584; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] 5585; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] 5586; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[10:11], 0 5587; GFX6-NEXT: s_ashr_i32 s3, s17, 31 5588; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] 5589; GFX6-NEXT: s_add_u32 s0, s3, 0 5590; GFX6-NEXT: s_cselect_b32 s1, 1, 0 5591; GFX6-NEXT: s_and_b32 s1, s1, 1 5592; GFX6-NEXT: s_cmp_lg_u32 s1, 0 5593; GFX6-NEXT: s_addc_u32 s1, s3, 0 5594; GFX6-NEXT: s_cselect_b32 s2, 1, 0 5595; GFX6-NEXT: s_and_b32 s2, s2, 1 5596; GFX6-NEXT: s_cmp_lg_u32 s2, 0 5597; GFX6-NEXT: s_addc_u32 s2, s3, 0 5598; GFX6-NEXT: s_cselect_b32 s11, 1, 0 5599; GFX6-NEXT: s_and_b32 s11, s11, 1 5600; GFX6-NEXT: s_brev_b32 s10, 1 5601; GFX6-NEXT: s_cmp_lg_u32 s11, 0 
5602; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc 5603; GFX6-NEXT: s_addc_u32 s3, s3, s10 5604; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 5605; GFX6-NEXT: v_mov_b32_e32 v1, s0 5606; GFX6-NEXT: s_add_u32 s0, s4, s12 5607; GFX6-NEXT: v_mov_b32_e32 v2, s1 5608; GFX6-NEXT: s_cselect_b32 s1, 1, 0 5609; GFX6-NEXT: s_and_b32 s1, s1, 1 5610; GFX6-NEXT: s_cmp_lg_u32 s1, 0 5611; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 5612; GFX6-NEXT: s_addc_u32 s1, s5, s13 5613; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5614; GFX6-NEXT: v_mov_b32_e32 v0, s2 5615; GFX6-NEXT: s_cselect_b32 s2, 1, 0 5616; GFX6-NEXT: s_and_b32 s2, s2, 1 5617; GFX6-NEXT: v_mov_b32_e32 v3, s8 5618; GFX6-NEXT: v_mov_b32_e32 v4, s9 5619; GFX6-NEXT: s_cmp_lg_u32 s2, 0 5620; GFX6-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc 5621; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc 5622; GFX6-NEXT: v_mov_b32_e32 v1, s3 5623; GFX6-NEXT: v_mov_b32_e32 v2, s16 5624; GFX6-NEXT: v_mov_b32_e32 v3, s17 5625; GFX6-NEXT: s_addc_u32 s2, s6, s14 5626; GFX6-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc 5627; GFX6-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc 5628; GFX6-NEXT: s_cselect_b32 s3, 1, 0 5629; GFX6-NEXT: v_mov_b32_e32 v2, s4 5630; GFX6-NEXT: s_and_b32 s3, s3, 1 5631; GFX6-NEXT: v_mov_b32_e32 v3, s5 5632; GFX6-NEXT: s_cmp_lg_u32 s3, 0 5633; GFX6-NEXT: v_mov_b32_e32 v0, s6 5634; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] 5635; GFX6-NEXT: s_addc_u32 s3, s7, s15 5636; GFX6-NEXT: v_mov_b32_e32 v1, s7 5637; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc 5638; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] 5639; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], s[14:15], 0 5640; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc 5641; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1] 5642; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] 5643; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[14:15], 0 5644; GFX6-NEXT: s_ashr_i32 s7, s3, 31 5645; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[4:5] 5646; GFX6-NEXT: s_add_u32 s4, s7, 0 5647; GFX6-NEXT: s_cselect_b32 s5, 1, 0 5648; GFX6-NEXT: s_and_b32 s5, 
s5, 1 5649; GFX6-NEXT: s_cmp_lg_u32 s5, 0 5650; GFX6-NEXT: s_addc_u32 s5, s7, 0 5651; GFX6-NEXT: s_cselect_b32 s6, 1, 0 5652; GFX6-NEXT: s_and_b32 s6, s6, 1 5653; GFX6-NEXT: s_cmp_lg_u32 s6, 0 5654; GFX6-NEXT: s_addc_u32 s6, s7, 0 5655; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc 5656; GFX6-NEXT: s_cselect_b32 s8, 1, 0 5657; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 5658; GFX6-NEXT: s_and_b32 s8, s8, 1 5659; GFX6-NEXT: s_cmp_lg_u32 s8, 0 5660; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 5661; GFX6-NEXT: s_addc_u32 s7, s7, s10 5662; GFX6-NEXT: v_mov_b32_e32 v1, s4 5663; GFX6-NEXT: v_mov_b32_e32 v2, s5 5664; GFX6-NEXT: v_mov_b32_e32 v3, s0 5665; GFX6-NEXT: v_mov_b32_e32 v8, s1 5666; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5667; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc 5668; GFX6-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc 5669; GFX6-NEXT: v_mov_b32_e32 v2, s6 5670; GFX6-NEXT: v_mov_b32_e32 v3, s7 5671; GFX6-NEXT: v_mov_b32_e32 v8, s2 5672; GFX6-NEXT: v_mov_b32_e32 v9, s3 5673; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc 5674; GFX6-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc 5675; GFX6-NEXT: v_readfirstlane_b32 s0, v5 5676; GFX6-NEXT: v_readfirstlane_b32 s1, v4 5677; GFX6-NEXT: v_readfirstlane_b32 s2, v6 5678; GFX6-NEXT: v_readfirstlane_b32 s3, v7 5679; GFX6-NEXT: v_readfirstlane_b32 s4, v0 5680; GFX6-NEXT: v_readfirstlane_b32 s5, v1 5681; GFX6-NEXT: v_readfirstlane_b32 s6, v2 5682; GFX6-NEXT: v_readfirstlane_b32 s7, v3 5683; GFX6-NEXT: ; return to shader part epilog 5684; 5685; GFX8-LABEL: s_saddsat_v2i128: 5686; GFX8: ; %bb.0: 5687; GFX8-NEXT: s_add_u32 s8, s0, s8 5688; GFX8-NEXT: s_cselect_b32 s16, 1, 0 5689; GFX8-NEXT: s_and_b32 s16, s16, 1 5690; GFX8-NEXT: s_cmp_lg_u32 s16, 0 5691; GFX8-NEXT: s_addc_u32 s9, s1, s9 5692; GFX8-NEXT: s_cselect_b32 s16, 1, 0 5693; GFX8-NEXT: s_and_b32 s16, s16, 1 5694; GFX8-NEXT: s_cmp_lg_u32 s16, 0 5695; GFX8-NEXT: s_addc_u32 s16, s2, s10 5696; GFX8-NEXT: s_cselect_b32 s17, 1, 0 5697; GFX8-NEXT: s_and_b32 s17, s17, 1 5698; GFX8-NEXT: v_mov_b32_e32 
v3, s1 5699; GFX8-NEXT: s_cmp_lg_u32 s17, 0 5700; GFX8-NEXT: v_mov_b32_e32 v2, s0 5701; GFX8-NEXT: s_addc_u32 s17, s3, s11 5702; GFX8-NEXT: v_mov_b32_e32 v0, s2 5703; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3] 5704; GFX8-NEXT: v_mov_b32_e32 v1, s3 5705; GFX8-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] 5706; GFX8-NEXT: s_cselect_b32 s2, 1, 0 5707; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc 5708; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[16:17], v[0:1] 5709; GFX8-NEXT: s_and_b32 s0, 1, s2 5710; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 5711; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 5712; GFX8-NEXT: s_cmp_eq_u64 s[10:11], 0 5713; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[10:11], 0 5714; GFX8-NEXT: s_cselect_b32 s2, 1, 0 5715; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] 5716; GFX8-NEXT: s_and_b32 s0, 1, s2 5717; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 5718; GFX8-NEXT: s_ashr_i32 s3, s17, 31 5719; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] 5720; GFX8-NEXT: s_add_u32 s0, s3, 0 5721; GFX8-NEXT: s_cselect_b32 s1, 1, 0 5722; GFX8-NEXT: s_and_b32 s1, s1, 1 5723; GFX8-NEXT: s_cmp_lg_u32 s1, 0 5724; GFX8-NEXT: s_addc_u32 s1, s3, 0 5725; GFX8-NEXT: s_cselect_b32 s2, 1, 0 5726; GFX8-NEXT: s_and_b32 s2, s2, 1 5727; GFX8-NEXT: s_cmp_lg_u32 s2, 0 5728; GFX8-NEXT: s_addc_u32 s2, s3, 0 5729; GFX8-NEXT: s_cselect_b32 s11, 1, 0 5730; GFX8-NEXT: s_and_b32 s11, s11, 1 5731; GFX8-NEXT: s_brev_b32 s10, 1 5732; GFX8-NEXT: s_cmp_lg_u32 s11, 0 5733; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5734; GFX8-NEXT: s_addc_u32 s3, s3, s10 5735; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 5736; GFX8-NEXT: v_mov_b32_e32 v1, s0 5737; GFX8-NEXT: s_add_u32 s0, s4, s12 5738; GFX8-NEXT: v_mov_b32_e32 v2, s1 5739; GFX8-NEXT: s_cselect_b32 s1, 1, 0 5740; GFX8-NEXT: s_and_b32 s1, s1, 1 5741; GFX8-NEXT: s_cmp_lg_u32 s1, 0 5742; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 5743; GFX8-NEXT: s_addc_u32 s1, s5, s13 5744; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5745; GFX8-NEXT: v_mov_b32_e32 v0, s2 5746; GFX8-NEXT: s_cselect_b32 s2, 1, 0 
5747; GFX8-NEXT: s_and_b32 s2, s2, 1 5748; GFX8-NEXT: s_cmp_lg_u32 s2, 0 5749; GFX8-NEXT: v_mov_b32_e32 v3, s8 5750; GFX8-NEXT: v_mov_b32_e32 v4, s9 5751; GFX8-NEXT: s_addc_u32 s2, s6, s14 5752; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc 5753; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc 5754; GFX8-NEXT: v_mov_b32_e32 v1, s3 5755; GFX8-NEXT: v_mov_b32_e32 v2, s16 5756; GFX8-NEXT: v_mov_b32_e32 v3, s17 5757; GFX8-NEXT: s_cselect_b32 s3, 1, 0 5758; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc 5759; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc 5760; GFX8-NEXT: s_and_b32 s3, s3, 1 5761; GFX8-NEXT: v_mov_b32_e32 v2, s4 5762; GFX8-NEXT: s_cmp_lg_u32 s3, 0 5763; GFX8-NEXT: v_mov_b32_e32 v3, s5 5764; GFX8-NEXT: s_addc_u32 s3, s7, s15 5765; GFX8-NEXT: v_mov_b32_e32 v0, s6 5766; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] 5767; GFX8-NEXT: v_mov_b32_e32 v1, s7 5768; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] 5769; GFX8-NEXT: s_cselect_b32 s6, 1, 0 5770; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc 5771; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] 5772; GFX8-NEXT: s_and_b32 s4, 1, s6 5773; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 5774; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 5775; GFX8-NEXT: s_cmp_eq_u64 s[14:15], 0 5776; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], s[14:15], 0 5777; GFX8-NEXT: s_cselect_b32 s6, 1, 0 5778; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] 5779; GFX8-NEXT: s_and_b32 s4, 1, s6 5780; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, s4 5781; GFX8-NEXT: s_ashr_i32 s7, s3, 31 5782; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[4:5] 5783; GFX8-NEXT: s_add_u32 s4, s7, 0 5784; GFX8-NEXT: s_cselect_b32 s5, 1, 0 5785; GFX8-NEXT: s_and_b32 s5, s5, 1 5786; GFX8-NEXT: s_cmp_lg_u32 s5, 0 5787; GFX8-NEXT: s_addc_u32 s5, s7, 0 5788; GFX8-NEXT: s_cselect_b32 s6, 1, 0 5789; GFX8-NEXT: s_and_b32 s6, s6, 1 5790; GFX8-NEXT: s_cmp_lg_u32 s6, 0 5791; GFX8-NEXT: s_addc_u32 s6, s7, 0 5792; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5793; GFX8-NEXT: s_cselect_b32 s8, 1, 0 5794; GFX8-NEXT: 
v_xor_b32_e32 v0, v1, v0 5795; GFX8-NEXT: s_and_b32 s8, s8, 1 5796; GFX8-NEXT: s_cmp_lg_u32 s8, 0 5797; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 5798; GFX8-NEXT: s_addc_u32 s7, s7, s10 5799; GFX8-NEXT: v_mov_b32_e32 v1, s4 5800; GFX8-NEXT: v_mov_b32_e32 v2, s5 5801; GFX8-NEXT: v_mov_b32_e32 v3, s0 5802; GFX8-NEXT: v_mov_b32_e32 v8, s1 5803; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5804; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc 5805; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc 5806; GFX8-NEXT: v_mov_b32_e32 v2, s6 5807; GFX8-NEXT: v_mov_b32_e32 v3, s7 5808; GFX8-NEXT: v_mov_b32_e32 v8, s2 5809; GFX8-NEXT: v_mov_b32_e32 v9, s3 5810; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc 5811; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc 5812; GFX8-NEXT: v_readfirstlane_b32 s0, v5 5813; GFX8-NEXT: v_readfirstlane_b32 s1, v4 5814; GFX8-NEXT: v_readfirstlane_b32 s2, v6 5815; GFX8-NEXT: v_readfirstlane_b32 s3, v7 5816; GFX8-NEXT: v_readfirstlane_b32 s4, v0 5817; GFX8-NEXT: v_readfirstlane_b32 s5, v1 5818; GFX8-NEXT: v_readfirstlane_b32 s6, v2 5819; GFX8-NEXT: v_readfirstlane_b32 s7, v3 5820; GFX8-NEXT: ; return to shader part epilog 5821; 5822; GFX9-LABEL: s_saddsat_v2i128: 5823; GFX9: ; %bb.0: 5824; GFX9-NEXT: s_add_u32 s8, s0, s8 5825; GFX9-NEXT: s_cselect_b32 s16, 1, 0 5826; GFX9-NEXT: s_and_b32 s16, s16, 1 5827; GFX9-NEXT: s_cmp_lg_u32 s16, 0 5828; GFX9-NEXT: s_addc_u32 s9, s1, s9 5829; GFX9-NEXT: s_cselect_b32 s16, 1, 0 5830; GFX9-NEXT: s_and_b32 s16, s16, 1 5831; GFX9-NEXT: s_cmp_lg_u32 s16, 0 5832; GFX9-NEXT: s_addc_u32 s16, s2, s10 5833; GFX9-NEXT: s_cselect_b32 s17, 1, 0 5834; GFX9-NEXT: s_and_b32 s17, s17, 1 5835; GFX9-NEXT: v_mov_b32_e32 v3, s1 5836; GFX9-NEXT: s_cmp_lg_u32 s17, 0 5837; GFX9-NEXT: v_mov_b32_e32 v2, s0 5838; GFX9-NEXT: s_addc_u32 s17, s3, s11 5839; GFX9-NEXT: v_mov_b32_e32 v0, s2 5840; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3] 5841; GFX9-NEXT: v_mov_b32_e32 v1, s3 5842; GFX9-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] 5843; GFX9-NEXT: s_cselect_b32 s2, 1, 0 
5844; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc 5845; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[16:17], v[0:1] 5846; GFX9-NEXT: s_and_b32 s0, 1, s2 5847; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 5848; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 5849; GFX9-NEXT: s_cmp_eq_u64 s[10:11], 0 5850; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[10:11], 0 5851; GFX9-NEXT: s_cselect_b32 s2, 1, 0 5852; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] 5853; GFX9-NEXT: s_and_b32 s0, 1, s2 5854; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 5855; GFX9-NEXT: s_ashr_i32 s3, s17, 31 5856; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] 5857; GFX9-NEXT: s_add_u32 s0, s3, 0 5858; GFX9-NEXT: s_cselect_b32 s1, 1, 0 5859; GFX9-NEXT: s_and_b32 s1, s1, 1 5860; GFX9-NEXT: s_cmp_lg_u32 s1, 0 5861; GFX9-NEXT: s_addc_u32 s1, s3, 0 5862; GFX9-NEXT: s_cselect_b32 s2, 1, 0 5863; GFX9-NEXT: s_and_b32 s2, s2, 1 5864; GFX9-NEXT: s_cmp_lg_u32 s2, 0 5865; GFX9-NEXT: s_addc_u32 s2, s3, 0 5866; GFX9-NEXT: s_cselect_b32 s11, 1, 0 5867; GFX9-NEXT: s_and_b32 s11, s11, 1 5868; GFX9-NEXT: s_brev_b32 s10, 1 5869; GFX9-NEXT: s_cmp_lg_u32 s11, 0 5870; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5871; GFX9-NEXT: s_addc_u32 s3, s3, s10 5872; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 5873; GFX9-NEXT: v_mov_b32_e32 v1, s0 5874; GFX9-NEXT: s_add_u32 s0, s4, s12 5875; GFX9-NEXT: v_mov_b32_e32 v2, s1 5876; GFX9-NEXT: s_cselect_b32 s1, 1, 0 5877; GFX9-NEXT: s_and_b32 s1, s1, 1 5878; GFX9-NEXT: s_cmp_lg_u32 s1, 0 5879; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 5880; GFX9-NEXT: s_addc_u32 s1, s5, s13 5881; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5882; GFX9-NEXT: v_mov_b32_e32 v0, s2 5883; GFX9-NEXT: s_cselect_b32 s2, 1, 0 5884; GFX9-NEXT: s_and_b32 s2, s2, 1 5885; GFX9-NEXT: s_cmp_lg_u32 s2, 0 5886; GFX9-NEXT: v_mov_b32_e32 v3, s8 5887; GFX9-NEXT: v_mov_b32_e32 v4, s9 5888; GFX9-NEXT: s_addc_u32 s2, s6, s14 5889; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc 5890; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc 5891; GFX9-NEXT: v_mov_b32_e32 v1, s3 5892; 
GFX9-NEXT: v_mov_b32_e32 v2, s16 5893; GFX9-NEXT: v_mov_b32_e32 v3, s17 5894; GFX9-NEXT: s_cselect_b32 s3, 1, 0 5895; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc 5896; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc 5897; GFX9-NEXT: s_and_b32 s3, s3, 1 5898; GFX9-NEXT: v_mov_b32_e32 v2, s4 5899; GFX9-NEXT: s_cmp_lg_u32 s3, 0 5900; GFX9-NEXT: v_mov_b32_e32 v3, s5 5901; GFX9-NEXT: s_addc_u32 s3, s7, s15 5902; GFX9-NEXT: v_mov_b32_e32 v0, s6 5903; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] 5904; GFX9-NEXT: v_mov_b32_e32 v1, s7 5905; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] 5906; GFX9-NEXT: s_cselect_b32 s6, 1, 0 5907; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc 5908; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] 5909; GFX9-NEXT: s_and_b32 s4, 1, s6 5910; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 5911; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 5912; GFX9-NEXT: s_cmp_eq_u64 s[14:15], 0 5913; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], s[14:15], 0 5914; GFX9-NEXT: s_cselect_b32 s6, 1, 0 5915; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] 5916; GFX9-NEXT: s_and_b32 s4, 1, s6 5917; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, s4 5918; GFX9-NEXT: s_ashr_i32 s7, s3, 31 5919; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[4:5] 5920; GFX9-NEXT: s_add_u32 s4, s7, 0 5921; GFX9-NEXT: s_cselect_b32 s5, 1, 0 5922; GFX9-NEXT: s_and_b32 s5, s5, 1 5923; GFX9-NEXT: s_cmp_lg_u32 s5, 0 5924; GFX9-NEXT: s_addc_u32 s5, s7, 0 5925; GFX9-NEXT: s_cselect_b32 s6, 1, 0 5926; GFX9-NEXT: s_and_b32 s6, s6, 1 5927; GFX9-NEXT: s_cmp_lg_u32 s6, 0 5928; GFX9-NEXT: s_addc_u32 s6, s7, 0 5929; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5930; GFX9-NEXT: s_cselect_b32 s8, 1, 0 5931; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 5932; GFX9-NEXT: s_and_b32 s8, s8, 1 5933; GFX9-NEXT: s_cmp_lg_u32 s8, 0 5934; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 5935; GFX9-NEXT: s_addc_u32 s7, s7, s10 5936; GFX9-NEXT: v_mov_b32_e32 v1, s4 5937; GFX9-NEXT: v_mov_b32_e32 v2, s5 5938; GFX9-NEXT: v_mov_b32_e32 v3, s0 5939; GFX9-NEXT: v_mov_b32_e32 v8, s1 
5940; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5941; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc 5942; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc 5943; GFX9-NEXT: v_mov_b32_e32 v2, s6 5944; GFX9-NEXT: v_mov_b32_e32 v3, s7 5945; GFX9-NEXT: v_mov_b32_e32 v8, s2 5946; GFX9-NEXT: v_mov_b32_e32 v9, s3 5947; GFX9-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc 5948; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc 5949; GFX9-NEXT: v_readfirstlane_b32 s0, v5 5950; GFX9-NEXT: v_readfirstlane_b32 s1, v4 5951; GFX9-NEXT: v_readfirstlane_b32 s2, v6 5952; GFX9-NEXT: v_readfirstlane_b32 s3, v7 5953; GFX9-NEXT: v_readfirstlane_b32 s4, v0 5954; GFX9-NEXT: v_readfirstlane_b32 s5, v1 5955; GFX9-NEXT: v_readfirstlane_b32 s6, v2 5956; GFX9-NEXT: v_readfirstlane_b32 s7, v3 5957; GFX9-NEXT: ; return to shader part epilog 5958; 5959; GFX10-LABEL: s_saddsat_v2i128: 5960; GFX10: ; %bb.0: 5961; GFX10-NEXT: s_add_u32 s8, s0, s8 5962; GFX10-NEXT: s_cselect_b32 s16, 1, 0 5963; GFX10-NEXT: s_and_b32 s16, s16, 1 5964; GFX10-NEXT: s_cmp_lg_u32 s16, 0 5965; GFX10-NEXT: s_addc_u32 s9, s1, s9 5966; GFX10-NEXT: s_cselect_b32 s16, 1, 0 5967; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1] 5968; GFX10-NEXT: s_and_b32 s16, s16, 1 5969; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[10:11], 0 5970; GFX10-NEXT: s_cmp_lg_u32 s16, 0 5971; GFX10-NEXT: v_mov_b32_e32 v2, s9 5972; GFX10-NEXT: s_addc_u32 s16, s2, s10 5973; GFX10-NEXT: s_cselect_b32 s17, 1, 0 5974; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 5975; GFX10-NEXT: s_and_b32 s17, s17, 1 5976; GFX10-NEXT: s_cmp_lg_u32 s17, 0 5977; GFX10-NEXT: s_addc_u32 s17, s3, s11 5978; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[16:17], s[2:3] 5979; GFX10-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] 5980; GFX10-NEXT: v_mov_b32_e32 v3, s17 5981; GFX10-NEXT: s_cselect_b32 s18, 1, 0 5982; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 5983; GFX10-NEXT: s_and_b32 s0, 1, s18 5984; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 5985; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 5986; GFX10-NEXT: s_cselect_b32 s0, 1, 0 5987; 
GFX10-NEXT: s_ashr_i32 s3, s17, 31 5988; GFX10-NEXT: s_and_b32 s0, 1, s0 5989; GFX10-NEXT: s_brev_b32 s10, 1 5990; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo 5991; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 5992; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 5993; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, s0 5994; GFX10-NEXT: s_add_u32 s0, s3, 0 5995; GFX10-NEXT: s_cselect_b32 s1, 1, 0 5996; GFX10-NEXT: s_and_b32 s1, s1, 1 5997; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 5998; GFX10-NEXT: s_cmp_lg_u32 s1, 0 5999; GFX10-NEXT: v_mov_b32_e32 v1, s8 6000; GFX10-NEXT: s_addc_u32 s1, s3, 0 6001; GFX10-NEXT: s_cselect_b32 s2, 1, 0 6002; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 6003; GFX10-NEXT: s_and_b32 s2, s2, 1 6004; GFX10-NEXT: s_cmp_lg_u32 s2, 0 6005; GFX10-NEXT: s_addc_u32 s2, s3, 0 6006; GFX10-NEXT: s_cselect_b32 s11, 1, 0 6007; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 6008; GFX10-NEXT: s_and_b32 s11, s11, 1 6009; GFX10-NEXT: s_cmp_lg_u32 s11, 0 6010; GFX10-NEXT: s_addc_u32 s3, s3, s10 6011; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo 6012; GFX10-NEXT: s_add_u32 s0, s4, s12 6013; GFX10-NEXT: s_cselect_b32 s8, 1, 0 6014; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo 6015; GFX10-NEXT: s_and_b32 s8, s8, 1 6016; GFX10-NEXT: v_mov_b32_e32 v2, s16 6017; GFX10-NEXT: s_cmp_lg_u32 s8, 0 6018; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo 6019; GFX10-NEXT: s_addc_u32 s1, s5, s13 6020; GFX10-NEXT: s_cselect_b32 s8, 1, 0 6021; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5] 6022; GFX10-NEXT: s_and_b32 s8, s8, 1 6023; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo 6024; GFX10-NEXT: s_cmp_lg_u32 s8, 0 6025; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[14:15], 0 6026; GFX10-NEXT: s_addc_u32 s8, s6, s14 6027; GFX10-NEXT: s_cselect_b32 s9, 1, 0 6028; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 6029; GFX10-NEXT: s_and_b32 s9, s9, 1 6030; GFX10-NEXT: v_mov_b32_e32 v6, s1 6031; GFX10-NEXT: s_cmp_lg_u32 s9, 0 6032; GFX10-NEXT: v_mov_b32_e32 v7, s8 6033; GFX10-NEXT: s_addc_u32 s9, s7, s15 6034; 
GFX10-NEXT: s_cmp_eq_u64 s[8:9], s[6:7] 6035; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[6:7] 6036; GFX10-NEXT: s_cselect_b32 s2, 1, 0 6037; GFX10-NEXT: v_mov_b32_e32 v8, s9 6038; GFX10-NEXT: s_and_b32 s2, 1, s2 6039; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0 6040; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 6041; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 6042; GFX10-NEXT: s_cselect_b32 s2, 1, 0 6043; GFX10-NEXT: s_ashr_i32 s5, s9, 31 6044; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo 6045; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3 6046; GFX10-NEXT: s_and_b32 s3, 1, s2 6047; GFX10-NEXT: s_add_u32 s2, s5, 0 6048; GFX10-NEXT: v_cmp_ne_u32_e64 s3, 0, s3 6049; GFX10-NEXT: s_cselect_b32 s4, 1, 0 6050; GFX10-NEXT: s_and_b32 s4, s4, 1 6051; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, 0, s3 6052; GFX10-NEXT: s_cmp_lg_u32 s4, 0 6053; GFX10-NEXT: s_addc_u32 s3, s5, 0 6054; GFX10-NEXT: s_cselect_b32 s4, 1, 0 6055; GFX10-NEXT: v_xor_b32_e32 v4, v5, v4 6056; GFX10-NEXT: s_and_b32 s4, s4, 1 6057; GFX10-NEXT: v_mov_b32_e32 v5, s0 6058; GFX10-NEXT: s_cmp_lg_u32 s4, 0 6059; GFX10-NEXT: v_readfirstlane_b32 s0, v0 6060; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 6061; GFX10-NEXT: s_addc_u32 s4, s5, 0 6062; GFX10-NEXT: s_cselect_b32 s6, 1, 0 6063; GFX10-NEXT: s_and_b32 s6, s6, 1 6064; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 6065; GFX10-NEXT: s_cmp_lg_u32 s6, 0 6066; GFX10-NEXT: s_addc_u32 s1, s5, s10 6067; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s2, vcc_lo 6068; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, s3, vcc_lo 6069; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, s4, vcc_lo 6070; GFX10-NEXT: v_cndmask_b32_e64 v7, v8, s1, vcc_lo 6071; GFX10-NEXT: v_readfirstlane_b32 s1, v1 6072; GFX10-NEXT: v_readfirstlane_b32 s2, v2 6073; GFX10-NEXT: v_readfirstlane_b32 s3, v3 6074; GFX10-NEXT: v_readfirstlane_b32 s4, v4 6075; GFX10-NEXT: v_readfirstlane_b32 s5, v5 6076; GFX10-NEXT: v_readfirstlane_b32 s6, v6 6077; GFX10-NEXT: v_readfirstlane_b32 s7, v7 6078; GFX10-NEXT: ; return to shader part epilog 6079 %result = call 
<2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs) 6080 ret <2 x i128> %result 6081} 6082 6083declare i7 @llvm.sadd.sat.i7(i7, i7) #0 6084declare i8 @llvm.sadd.sat.i8(i8, i8) #0 6085declare <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8>, <2 x i8>) #0 6086declare <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8>, <4 x i8>) #0 6087 6088declare i16 @llvm.sadd.sat.i16(i16, i16) #0 6089declare <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16>, <2 x i16>) #0 6090declare <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16>, <3 x i16>) #0 6091declare <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16>, <4 x i16>) #0 6092declare <5 x i16> @llvm.sadd.sat.v5i16(<5 x i16>, <5 x i16>) #0 6093declare <6 x i16> @llvm.sadd.sat.v6i16(<6 x i16>, <6 x i16>) #0 6094declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>) #0 6095 6096declare i24 @llvm.sadd.sat.i24(i24, i24) #0 6097 6098declare i32 @llvm.sadd.sat.i32(i32, i32) #0 6099declare <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32>, <2 x i32>) #0 6100declare <3 x i32> @llvm.sadd.sat.v3i32(<3 x i32>, <3 x i32>) #0 6101declare <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32>, <4 x i32>) #0 6102declare <5 x i32> @llvm.sadd.sat.v5i32(<5 x i32>, <5 x i32>) #0 6103declare <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32>, <16 x i32>) #0 6104 6105declare i48 @llvm.sadd.sat.i48(i48, i48) #0 6106 6107declare i64 @llvm.sadd.sat.i64(i64, i64) #0 6108declare <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64>, <2 x i64>) #0 6109 6110declare i128 @llvm.sadd.sat.i128(i128, i128) #0 6111declare <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128>, <2 x i128>) #0 6112 6113attributes #0 = { nounwind readnone speculatable willreturn } 6114