; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s

; Each kernel in this file performs one vector `shl` on 16-bit elements and
; stores the result. The expected assembly is checked separately for each of
; the four llc invocations above (gfx900, tonga, bonaire, gfx1010). The check
; lines are generated output; regenerate them with the script named in the
; first line rather than editing them by hand.

; <2 x i16> shl where both the shifted value (%lhs) and the shift amount
; (%rhs) are kernel arguments. The gfx900 and gfx1010 expectations contain a
; single v_pk_lshlrev_b16; the tonga and bonaire expectations shift the two
; 16-bit halves with separate s_lshl_b32 instructions and merge them with
; s_or_b32.
define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
; GFX9-LABEL: s_shl_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_load_dword s3, s[0:1], 0x30
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_pk_lshlrev_b16 v0, s3, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: s_shl_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dword s0, s[0:1], 0x30
; VI-NEXT: s_mov_b32 s3, 0xffff
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s2, 16
; VI-NEXT: s_lshr_b32 s8, s0, 16
; VI-NEXT: s_and_b32 s2, s2, s3
; VI-NEXT: s_and_b32 s0, s0, s3
; VI-NEXT: s_lshl_b32 s0, s2, s0
; VI-NEXT: s_lshl_b32 s1, s1, s8
; VI-NEXT: s_lshl_b32 s1, s1, 16
; VI-NEXT: s_and_b32 s0, s0, s3
; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; CI-LABEL: s_shl_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; CI-NEXT: s_load_dword s2, s[0:1], 0xb
; CI-NEXT: s_load_dword s0, s[0:1], 0xc
; CI-NEXT: s_mov_b32 s3, 0xffff
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s1, s2, 16
; CI-NEXT: s_and_b32 s8, s0, s3
; CI-NEXT: s_lshr_b32 s0, s0, 16
; CI-NEXT: s_lshl_b32 s0, s1, s0
; CI-NEXT: s_lshl_b32 s1, s2, s8
; CI-NEXT: s_lshl_b32 s0, s0, 16
; CI-NEXT: s_and_b32 s1, s1, s3
; CI-NEXT: s_or_b32 s0, s1, s0
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; CI-NEXT: s_endpgm
;
; GFX10-LABEL: s_shl_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX10-NEXT: s_load_dword s3, s[0:1], 0x30
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT: s_mov_b32 s7, 0x31016000
; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v0, s3, s2
; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
  %result = shl <2 x i16> %lhs, %rhs
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out
  ret void
}

; <2 x i16> shl where both operands are loaded from memory: the value comes
; from element %tid of %in and the shift amount from the element right after
; it. The result is stored to element %tid of %out.
define amdgpu_kernel void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: v_shl_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v1, v2, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_shl_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v5
; VI-NEXT: v_lshlrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_shl_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4
; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_and_b32_e32 v5, s0, v3
; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; CI-NEXT: v_lshl_b32_e32 v3, v4, v3
; CI-NEXT: v_lshl_b32_e32 v2, v2, v5
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_and_b32_e32 v2, s0, v2
; CI-NEXT: v_or_b32_e32 v2, v2, v3
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX10-LABEL: v_shl_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v1, v2, v1
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in.gep, i32 1
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
  %result = shl <2 x i16> %a, %b
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}

; <2 x i16> shl of a value loaded from element %tid of %in by a shift amount
; passed as the kernel argument %sgpr.
define amdgpu_kernel void @shl_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
; GFX9-LABEL: shl_v_s_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v1, s2, v1
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: shl_v_s_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v4, s0, v3
; VI-NEXT: v_lshlrev_b16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v2, v4, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: shl_v_s_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_load_dword s8, s[0:1], 0xd
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: s_lshr_b32 s1, s8, 16
; CI-NEXT: s_and_b32 s8, s8, s0
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; CI-NEXT: v_lshlrev_b32_e32 v2, s8, v2
; CI-NEXT: v_lshlrev_b32_e32 v3, s1, v3
; CI-NEXT: v_and_b32_e32 v2, s0, v2
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_or_b32_e32 v2, v2, v3
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX10-LABEL: shl_v_s_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v1, s0, v1
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %result = shl <2 x i16> %vgpr, %sgpr
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}

; <2 x i16> shl of the kernel argument %sgpr by a shift amount loaded from
; element %tid of %in (the operand roles are swapped relative to a kernel
; that shifts the loaded value by the argument).
define amdgpu_kernel void @shl_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
; GFX9-LABEL: shl_s_v_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: shl_s_v_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e64 v4, v3, s0
; VI-NEXT: v_lshlrev_b16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v4, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: shl_s_v_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_load_dword s8, s[0:1], 0xd
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: s_lshr_b32 s1, s8, 16
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_and_b32_e32 v3, s0, v2
; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; CI-NEXT: v_lshl_b32_e32 v2, s1, v2
; CI-NEXT: v_lshl_b32_e32 v3, s8, v3
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT: v_and_b32_e32 v3, s0, v3
; CI-NEXT: v_or_b32_e32 v2, v3, v2
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX10-LABEL: shl_s_v_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, s0
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %result = shl <2 x i16> %sgpr, %vgpr
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}

; <2 x i16> shl of the constant vector <8, 8> by a shift amount loaded from
; element %tid of %in.
define amdgpu_kernel void @shl_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: shl_imm_v_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: shl_imm_v_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: v_mov_b32_e32 v4, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e64 v2, v3, 8
; VI-NEXT: v_lshlrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: shl_imm_v_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_and_b32_e32 v3, 0xffff, v2
; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; CI-NEXT: v_lshl_b32_e32 v2, 8, v2
; CI-NEXT: v_lshl_b32_e32 v3, 8, v3
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT: v_and_b32_e32 v3, 0xfff8, v3
; CI-NEXT: v_or_b32_e32 v2, v3, v2
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX10-LABEL: shl_imm_v_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0]
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %result = shl <2 x i16> <i16 8, i16 8>, %vgpr
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}

; <2 x i16> shl of a value loaded from element %tid of %in by the constant 8
; in both lanes. The bonaire expectation is a single 32-bit shift followed by
; an AND with 0xff00ff00 instead of per-lane shifts.
define amdgpu_kernel void @shl_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: shl_v_imm_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: shl_v_imm_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
; VI-NEXT: v_and_b32_e32 v2, 0xff000000, v2
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: shl_v_imm_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX10-LABEL: shl_v_imm_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %result = shl <2 x i16> %vgpr, <i16 8, i16 8>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}

; <4 x i16> shl where both operands are loaded from memory: the value comes
; from element %tid of %in and the shift amount from the element right after
; it. The gfx900 and gfx1010 expectations use two v_pk_lshlrev_b16
; instructions, one per 32-bit half of the vector.
define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: v_shl_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v1, v3, v1
; GFX9-NEXT: v_pk_lshlrev_b16 v0, v2, v0
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_shl_v4i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v6, v3, v1
; VI-NEXT: v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v0
; VI-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v1, v6, v1
; VI-NEXT: v_or_b32_e32 v0, v3, v0
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_shl_v4i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8
; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_and_b32_e32 v8, s0, v4
; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; CI-NEXT: v_and_b32_e32 v9, s0, v5
; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3
; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; CI-NEXT: v_lshl_b32_e32 v5, v7, v5
; CI-NEXT: v_lshl_b32_e32 v3, v3, v9
; CI-NEXT: v_lshl_b32_e32 v4, v6, v4
; CI-NEXT: v_lshl_b32_e32 v2, v2, v8
; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; CI-NEXT: v_and_b32_e32 v3, s0, v3
; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; CI-NEXT: v_and_b32_e32 v2, s0, v2
; CI-NEXT: v_or_b32_e32 v3, v3, v5
; CI-NEXT: v_or_b32_e32 v2, v2, v4
; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX10-LABEL: v_shl_v4i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v1, v3, v1
; GFX10-NEXT: v_pk_lshlrev_b16 v0, v2, v0
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
  %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in.gep, i32 1
  %a = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
  %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
  %result = shl <4 x i16> %a, %b
  store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
  ret void
}

; <4 x i16> shl of a value loaded from element %tid of %in by the constant 8
; in all four lanes.
define amdgpu_kernel void @shl_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: shl_v_imm_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: shl_v_imm_v4i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: s_mov_b32 s2, 0xff000000
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v1
; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; VI-NEXT: v_and_b32_e32 v0, s2, v0
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: v_and_b32_e32 v4, s2, v4
; VI-NEXT: v_or_b32_e32 v1, v1, v4
; VI-NEXT: v_or_b32_e32 v0, v5, v0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; CI-LABEL: shl_v_imm_v4i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b32 s0, 0xff00
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v4, 8, v3
; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; CI-NEXT: v_and_b32_e32 v4, s0, v4
; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; CI-NEXT: v_and_b32_e32 v3, s0, v3
; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; CI-NEXT: v_or_b32_e32 v3, v3, v4
; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2
; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
;
; GFX10-LABEL: shl_v_imm_v4i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
  %result = shl <4 x i16> %vgpr, <i16 8, i16 8, i16 8, i16 8>
  store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
  ret void
}

; Intrinsic called by the kernels above to obtain the value they use as the
; element index into %in and %out.
declare i32 @llvm.amdgcn.workitem.id.x() #1

; #0 is attached to every kernel definition; #1 to the intrinsic declaration.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }