; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX908 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s

; GlobalISel selection tests for the llvm.amdgcn.sdot2 intrinsic.
; Each function below feeds the intrinsic a different operand shape and the
; check lines pin the v_dot2_i32_i16 instruction expected for every -mcpu
; value used by the llc invocations above.

; All three operands are plain function arguments; the final i1 operand is false.
define i32 @v_sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_sdot2:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 false)
  ret i32 %r
}

; Same as v_sdot2 but with the final i1 operand set to true; the checks expect
; the "clamp" modifier on the selected instruction.
define i32 @v_sdot2_clamp(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_sdot2_clamp:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 clamp
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_clamp:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 clamp
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_clamp:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 clamp
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 true)
  ret i32 %r
}

; amdgpu_ps entry with all operands marked inreg. The gfx906/gfx908 checks copy
; s1 and s2 into VGPRs first; the gfx10 checks use s0 and s1 directly and only
; copy s2.
define amdgpu_ps float @v_sdot2_sgpr_sgpr_sgpr(<2 x i16> inreg %a, <2 x i16> inreg %b, i32 inreg %c) {
; GFX906-LABEL: v_sdot2_sgpr_sgpr_sgpr:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    v_mov_b32_e32 v0, s1
; GFX906-NEXT:    v_mov_b32_e32 v1, s2
; GFX906-NEXT:    v_dot2_i32_i16 v0, s0, v0, v1
; GFX906-NEXT:    ; return to shader part epilog
;
; GFX908-LABEL: v_sdot2_sgpr_sgpr_sgpr:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    v_mov_b32_e32 v0, s1
; GFX908-NEXT:    v_mov_b32_e32 v1, s2
; GFX908-NEXT:    v_dot2_i32_i16 v0, s0, v0, v1
; GFX908-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: v_sdot2_sgpr_sgpr_sgpr:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_dot2_i32_i16 v0, s0, s1, v0
; GFX10-NEXT:    ; return to shader part epilog
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 false)
  %cast = bitcast i32 %r to float
  ret float %cast
}

; First operand is the constant vector <4, 4>; the checks expect it to be
; built with s_pack_ll_b32_b16 and passed as an SGPR.
define i32 @v_sdot2_inline_literal_a(<2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_sdot2_inline_literal_a:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    s_pack_ll_b32_b16 s4, 4, 4
; GFX906-NEXT:    v_dot2_i32_i16 v0, s4, v0, v1
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_inline_literal_a:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    s_pack_ll_b32_b16 s4, 4, 4
; GFX908-NEXT:    v_dot2_i32_i16 v0, s4, v0, v1
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_inline_literal_a:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_pack_ll_b32_b16 s4, 4, 4
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_dot2_i32_i16 v0, s4, v0, v1
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> <i16 4, i16 4>, <2 x i16> %b, i32 %c, i1 false)
  ret i32 %r
}

; Second operand is the constant vector <4, 4>.
define i32 @v_sdot2_inline_literal_b(<2 x i16> %a, i32 %c) {
; GFX906-LABEL: v_sdot2_inline_literal_b:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    s_pack_ll_b32_b16 s4, 4, 4
; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, s4, v1
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_inline_literal_b:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    s_pack_ll_b32_b16 s4, 4, 4
; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, s4, v1
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_inline_literal_b:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_pack_ll_b32_b16 s4, 4, 4
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, s4, v1
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false)
  ret i32 %r
}

; Both vector operands are constants (<8, 8> and <4, 4>); the %a argument is
; unused. The gfx906/gfx908 checks copy one packed constant into a VGPR, the
; gfx10 checks pass both as SGPRs.
define i32 @v_sdot2_inline_literal_a_b(<2 x i16> %a, i32 %c) {
; GFX906-LABEL: v_sdot2_inline_literal_a_b:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    s_pack_ll_b32_b16 s5, 4, 4
; GFX906-NEXT:    s_pack_ll_b32_b16 s4, 8, 8
; GFX906-NEXT:    v_mov_b32_e32 v0, s5
; GFX906-NEXT:    v_dot2_i32_i16 v0, s4, v0, v1
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_inline_literal_a_b:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    s_pack_ll_b32_b16 s5, 4, 4
; GFX908-NEXT:    s_pack_ll_b32_b16 s4, 8, 8
; GFX908-NEXT:    v_mov_b32_e32 v0, s5
; GFX908-NEXT:    v_dot2_i32_i16 v0, s4, v0, v1
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_inline_literal_a_b:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_pack_ll_b32_b16 s4, 8, 8
; GFX10-NEXT:    s_pack_ll_b32_b16 s5, 4, 4
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_dot2_i32_i16 v0, s4, s5, v1
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false)
  ret i32 %r
}

; All three data operands are constants; the checks expect the i32 accumulator
; 8 to appear directly as an immediate operand.
define i32 @v_sdot2_inline_literal_a_b_c() {
; GFX906-LABEL: v_sdot2_inline_literal_a_b_c:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    s_pack_ll_b32_b16 s5, 4, 4
; GFX906-NEXT:    s_pack_ll_b32_b16 s4, 8, 8
; GFX906-NEXT:    v_mov_b32_e32 v0, s5
; GFX906-NEXT:    v_dot2_i32_i16 v0, s4, v0, 8
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_inline_literal_a_b_c:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    s_pack_ll_b32_b16 s5, 4, 4
; GFX908-NEXT:    s_pack_ll_b32_b16 s4, 8, 8
; GFX908-NEXT:    v_mov_b32_e32 v0, s5
; GFX908-NEXT:    v_dot2_i32_i16 v0, s4, v0, 8
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_inline_literal_a_b_c:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_pack_ll_b32_b16 s4, 8, 8
; GFX10-NEXT:    s_pack_ll_b32_b16 s5, 4, 4
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_dot2_i32_i16 v0, s4, s5, 8
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 8, i1 false)
  ret i32 %r
}

; Only the i32 accumulator is a constant (7); the checks expect it as an
; immediate operand.
define i32 @v_sdot2_inline_literal_c(<2 x i16> %a, <2 x i16> %b) {
; GFX906-LABEL: v_sdot2_inline_literal_c:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, 7
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_inline_literal_c:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, 7
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_inline_literal_c:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, 7
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 7, i1 false)
  ret i32 %r
}

; First operand is an fneg of a <2 x half> value bitcast to <2 x i16>. The
; checks expect the negation folded into neg_lo/neg_hi on source 0.
; NOTE(review): this folds a floating-point sign flip into an integer dot
; product's source modifiers - confirm that is the intended selection.
define i32 @v_sdot2_fneg_a(<2 x half> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_sdot2_fneg_a:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_fneg_a:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_fneg_a:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %neg.a = fneg <2 x half> %a
  %cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16>
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %cast.neg.a, <2 x i16> %b, i32 %c, i1 false)
  ret i32 %r
}

; Second operand is an fneg of a <2 x half> value bitcast to <2 x i16>. The
; checks expect the negation folded into neg_lo/neg_hi on source 1.
; NOTE(review): same folding question as in v_sdot2_fneg_a - confirm.
define i32 @v_sdot2_fneg_b(<2 x i16> %a, <2 x half> %b, i32 %c) {
; GFX906-LABEL: v_sdot2_fneg_b:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_fneg_b:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_fneg_b:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %neg.b = fneg <2 x half> %b
  %cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16>
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %cast.neg.b, i32 %c, i1 false)
  ret i32 %r
}

; Accumulator is an fneg of a float bitcast to i32. Unlike the a/b cases, the
; checks expect an explicit v_xor with 0x80000000 and no source modifier.
define i32 @v_sdot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) {
; GFX906-LABEL: v_sdot2_fnegf32_c:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_fnegf32_c:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_fnegf32_c:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %neg.c = fneg float %c
  %cast.neg.c = bitcast float %neg.c to i32
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %cast.neg.c, i1 false)
  ret i32 %r
}

; Accumulator is an fneg of a <2 x half> value bitcast to i32. The checks
; expect an explicit v_xor with 0x80008000 and no source modifier.
define i32 @v_sdot2_fnegv2f16_c(<2 x i16> %a, <2 x i16> %b, <2 x half> %c) {
; GFX906-LABEL: v_sdot2_fnegv2f16_c:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_fnegv2f16_c:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_fnegv2f16_c:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %neg.c = fneg <2 x half> %c
  %cast.neg.c = bitcast <2 x half> %neg.c to i32
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %cast.neg.c, i1 false)
  ret i32 %r
}

; First operand has its two i16 lanes swapped by a shufflevector <1, 0>; the
; checks expect the swap as a separate v_alignbit_b32 by 16.
define i32 @v_sdot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_sdot2_shuffle10_a:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_alignbit_b32 v0, v0, v0, 16
; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_shuffle10_a:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_alignbit_b32 v0, v0, v0, 16
; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_shuffle10_a:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_alignbit_b32 v0, v0, v0, 16
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %shuf.a = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false)
  ret i32 %r
}

; Second operand has its two i16 lanes swapped by a shufflevector <1, 0>; the
; checks expect the swap as a separate v_alignbit_b32 by 16.
define i32 @v_sdot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_sdot2_shuffle10_b:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_alignbit_b32 v1, v1, v1, 16
; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_shuffle10_b:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_alignbit_b32 v1, v1, v1, 16
; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_shuffle10_b:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_alignbit_b32 v1, v1, v1, 16
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %shuf.b = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false)
  ret i32 %r
}

; Intrinsic under test; the trailing i1 must be an immediate (immarg).
declare i32 @llvm.amdgcn.sdot2(<2 x i16>, <2 x i16>, i32, i1 immarg) #0

attributes #0 = { nounwind readnone speculatable }