; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -global-isel -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 -global-isel -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
;
; This file tests GlobalISel code generation of flat-scratch memory
; instructions (scratch_load_dword / scratch_store_dword and their multi-dword
; forms) for private (addrspace(5)) accesses on gfx900 and gfx1030.
; -mattr=-promote-alloca keeps the allocas in scratch memory so the stores and
; loads below actually hit the private address space, and every access is
; volatile so none of them can be folded away.
; The checked output is regenerated by update_llc_test_checks.py; do not edit
; the GFX9 / GFX10 check lines by hand.

; Scalar (uniform SGPR) index into a stack array: the address arithmetic is
; done entirely in SGPRs and fed to the instruction's SGPR offset operand.
define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX9-LABEL: store_load_sindex_kernel:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
; GFX9-NEXT: s_and_b32 s0, s0, 15
; GFX9-NEXT: s_add_i32 s1, s1, 4
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
; GFX9-NEXT: scratch_store_dword off, v0, s1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_add_i32 s0, s0, 4
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_sindex_kernel:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s2, s2, s5
; GFX10-NEXT: s_addc_u32 s3, s3, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 15
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_and_b32 s1, s0, 15
; GFX10-NEXT: s_lshl_b32 s0, s0, 2
; GFX10-NEXT: s_lshl_b32 s1, s1, 2
; GFX10-NEXT: s_add_i32 s0, s0, 4
; GFX10-NEXT: s_add_i32 s1, s1, 4
; GFX10-NEXT: scratch_store_dword off, v0, s0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_endpgm
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Divergent (per-lane VGPR) index from workitem.id.x: the address lives in a
; VGPR and the load folds the constant part into an immediate offset.
define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX9-LABEL: store_load_vindex_kernel:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 4
; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
; GFX9-NEXT: v_mov_b32_e32 v3, 15
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: scratch_store_dword v1, v3, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v0, v2, v0
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_vindex_kernel:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 4
; GFX10-NEXT: v_mov_b32_e32 v3, 15
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0
; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1
; GFX10-NEXT: scratch_store_dword v0, v3, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_endpgm
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i3 = zext i32 %i2 to i64
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = sub nsw i32 31, %i2
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Same divergent-index pattern in a non-kernel function: the frame base comes
; from the stack pointer s32 instead of a constant offset.
define void @store_load_vindex_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_foo:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v2, s32
; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
; GFX9-NEXT: v_mov_b32_e32 v3, 15
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: scratch_store_dword v1, v3, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v0, v2, v0
; GFX9-NEXT: scratch_load_dword v0, v0, off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_foo:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_and_b32_e32 v1, 15, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v2, s32
; GFX10-NEXT: v_mov_b32_e32 v3, 15
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0
; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1
; GFX10-NEXT: scratch_store_dword v0, v3, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Store through an incoming private pointer argument: the small GEP distance
; becomes the instruction's immediate offset (offset:4).
define void @private_ptr_foo(float addrspace(5)* nocapture %arg) {
; GFX9-LABEL: private_ptr_foo:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, 0x41200000
; GFX9-NEXT: scratch_store_dword v0, v1, off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: private_ptr_foo:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v1, 0x41200000
; GFX10-NEXT: scratch_store_dword v0, v1, off offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1
  store float 1.000000e+01, float addrspace(5)* %gep, align 4
  ret void
}

; As store_load_sindex_kernel, but with a 64-dword padding alloca in front so
; the accessed array sits at a small (0x104) non-zero frame offset.
define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX9-LABEL: store_load_sindex_small_offset_kernel:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
; GFX9-NEXT: s_and_b32 s0, s0, 15
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: s_addk_i32 s1, 0x104
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
; GFX9-NEXT: scratch_store_dword off, v0, s1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_addk_i32 s0, 0x104
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_sindex_small_offset_kernel:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s2, s2, s5
; GFX10-NEXT: s_addc_u32 s3, s3, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 15
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_and_b32 s1, s0, 15
; GFX10-NEXT: s_lshl_b32 s0, s0, 2
; GFX10-NEXT: s_lshl_b32 s1, s1, 2
; GFX10-NEXT: s_addk_i32 s0, 0x104
; GFX10-NEXT: s_addk_i32 s1, 0x104
; GFX10-NEXT: scratch_store_dword off, v0, s0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_endpgm
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Divergent index with the same small (0x104) padding offset.
define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX9-LABEL: store_load_vindex_small_offset_kernel:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0x104
; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
; GFX9-NEXT: v_mov_b32_e32 v3, 15
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: scratch_store_dword v1, v3, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v0, v2, v0
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_vindex_small_offset_kernel:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 0x104
; GFX10-NEXT: v_mov_b32_e32 v3, 15
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0
; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1
; GFX10-NEXT: scratch_load_dword v2, off, off offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: scratch_store_dword v0, v3, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_endpgm
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i3 = zext i32 %i2 to i64
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = sub nsw i32 31, %i2
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Non-kernel version: base is s32 plus a 0x100 materialized offset.
define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_small_offset_foo:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: scratch_load_dword v1, off, s32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x100
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v2, vcc_hi
; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
; GFX9-NEXT: v_mov_b32_e32 v3, 15
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: scratch_store_dword v1, v3, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v0, v2, v0
; GFX9-NEXT: scratch_load_dword v0, v0, off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_small_offset_foo:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_and_b32_e32 v1, 15, v0
; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v2, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v3, 15
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0
; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1
; GFX10-NEXT: scratch_load_dword v2, off, s32 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: scratch_store_dword v0, v3, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Scalar index with a 4096-dword padding alloca: the 0x4004 frame offset is
; too large for the 13-bit immediate and must be added in registers.
define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
; GFX9-LABEL: store_load_sindex_large_offset_kernel:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
; GFX9-NEXT: s_and_b32 s0, s0, 15
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: s_addk_i32 s1, 0x4004
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
; GFX9-NEXT: scratch_store_dword off, v0, s1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_addk_i32 s0, 0x4004
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_sindex_large_offset_kernel:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s2, s2, s5
; GFX10-NEXT: s_addc_u32 s3, s3, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 15
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_and_b32 s1, s0, 15
; GFX10-NEXT: s_lshl_b32 s0, s0, 2
; GFX10-NEXT: s_lshl_b32 s1, s1, 2
; GFX10-NEXT: s_addk_i32 s0, 0x4004
; GFX10-NEXT: s_addk_i32 s1, 0x4004
; GFX10-NEXT: scratch_store_dword off, v0, s0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_endpgm
bb:
  %padding = alloca [4096 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Divergent index with the same large (0x4004) frame offset.
define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
; GFX9-LABEL: store_load_vindex_large_offset_kernel:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0x4004
; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
; GFX9-NEXT: v_mov_b32_e32 v3, 15
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: scratch_store_dword v1, v3, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v0, v2, v0
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_vindex_large_offset_kernel:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 0x4004
; GFX10-NEXT: v_mov_b32_e32 v3, 15
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0
; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1
; GFX10-NEXT: scratch_load_dword v2, off, off offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: scratch_store_dword v0, v3, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_endpgm
bb:
  %padding = alloca [4096 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i3 = zext i32 %i2 to i64
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = sub nsw i32 31, %i2
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Non-kernel version with the large padding: base is s32 + 0x4000.
define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_large_offset_foo:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: scratch_load_dword v1, off, s32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4000
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v2, vcc_hi
; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
; GFX9-NEXT: v_mov_b32_e32 v3, 15
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: scratch_store_dword v1, v3, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v0, v2, v0
; GFX9-NEXT: scratch_load_dword v0, v0, off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_large_offset_foo:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_and_b32_e32 v1, 15, v0
; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4000
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v2, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v3, 15
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0
; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1
; GFX10-NEXT: scratch_load_dword v2, off, s32 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: scratch_store_dword v0, v3, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
bb:
  %padding = alloca [4096 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
  ret void
}

; Purely constant (immediate) addresses: element 4000 (byte 0x3e80) is beyond
; the immediate-offset range and is materialized with s_movk_i32.
define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX9-LABEL: store_load_large_imm_offset_kernel:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: v_mov_b32_e32 v0, 13
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: s_movk_i32 s0, 0x3e80
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: s_add_i32 s0, s0, 4
; GFX9-NEXT: scratch_store_dword off, v0, s0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_large_imm_offset_kernel:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_mov_b32_e32 v0, 13
; GFX10-NEXT: v_mov_b32_e32 v1, 15
; GFX10-NEXT: s_movk_i32 s0, 0x3e80
; GFX10-NEXT: s_add_i32 s0, s0, 4
; GFX10-NEXT: scratch_store_dword off, v0, off offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_store_dword off, v1, s0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_endpgm
bb:
  %i = alloca [4096 x i32], align 4, addrspace(5)
  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
  store volatile i32 13, i32 addrspace(5)* %i1, align 4
  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  store volatile i32 15, i32 addrspace(5)* %i7, align 4
  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
  ret void
}

; Same constant-address pattern in a non-kernel function (offset added to s32).
define void @store_load_large_imm_offset_foo() {
; GFX9-LABEL: store_load_large_imm_offset_foo:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 13
; GFX9-NEXT: s_movk_i32 s0, 0x3e80
; GFX9-NEXT: scratch_store_dword off, v0, s32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: s_add_i32 s0, s0, s32
; GFX9-NEXT: scratch_store_dword off, v0, s0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_large_imm_offset_foo:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 13
; GFX10-NEXT: v_mov_b32_e32 v1, 15
; GFX10-NEXT: s_movk_i32 s0, 0x3e80
; GFX10-NEXT: s_add_i32 s0, s0, s32
; GFX10-NEXT: scratch_store_dword off, v0, s32
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_store_dword off, v1, s0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
bb:
  %i = alloca [4096 x i32], align 4, addrspace(5)
  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
  store volatile i32 13, i32 addrspace(5)* %i1, align 4
  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  store volatile i32 15, i32 addrspace(5)* %i7, align 4
  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
  ret void
}

; Mixed VGPR + SGPR index plus constant 256: combined with v_add_lshl_u32 and
; the constant part folded into the instruction's offset:1024.
define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX9-LABEL: store_load_vidx_sidx_offset:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 15
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_lshl_u32 v0, s0, v0, 2
; GFX9-NEXT: v_add_u32_e32 v0, 4, v0
; GFX9-NEXT: scratch_store_dword v0, v1, off offset:1024
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:1024 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_vidx_sidx_offset:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s2, s2, s5
; GFX10-NEXT: s_addc_u32 s3, s3, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 15
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_lshl_u32 v0, s0, v0, 2
; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0
; GFX10-NEXT: scratch_store_dword v0, v1, off offset:1024
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_endpgm
bb:
  %alloca = alloca [32 x i32], align 4, addrspace(5)
  %vidx = tail call i32 @llvm.amdgcn.workitem.id.x()
  %add1 = add nsw i32 %sidx, %vidx
  %add2 = add nsw i32 %add1, 256
  %gep = getelementptr inbounds [32 x i32], [32 x i32] addrspace(5)* %alloca, i32 0, i32 %add2
  store volatile i32 15, i32 addrspace(5)* %gep, align 4
  %load = load volatile i32, i32 addrspace(5)* %gep, align 4
  ret void
}

; Naturally aligned i64 access: selected as a single dwordx2 scratch op.
define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i64_aligned:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, 15
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i64_aligned:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v1, 15
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
bb:
  store volatile i64 15, i64 addrspace(5)* %arg, align 8
  %load = load volatile i64, i64 addrspace(5)* %arg, align 8
  ret void
}

; Byte-aligned i64 access: flat-scratch tolerates unaligned addresses, so the
; same dwordx2 selection is expected as in the aligned case.
define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i64_unaligned:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, 15
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i64_unaligned:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v1, 15
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
bb:
  store volatile i64 15, i64 addrspace(5)* %arg, align 1
  %load = load volatile i64, i64 addrspace(5)* %arg, align 1
  ret void
}

; Byte-aligned <3 x i32>: selected as dwordx3 scratch operations.
define void @store_load_v3i32_unaligned(<3 x i32> addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_v3i32_unaligned:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s2, 3
; GFX9-NEXT: s_mov_b32 s1, 2
; GFX9-NEXT: s_mov_b32 s0, 1
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: scratch_store_dwordx3 v0, v[1:3], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_v3i32_unaligned:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_mov_b32 s2, 3
; GFX10-NEXT: s_mov_b32 s1, 2
; GFX10-NEXT: s_mov_b32 s0, 1
; GFX10-NEXT: v_mov_b32_e32 v3, s2
; GFX10-NEXT: v_mov_b32_e32 v2, s1
; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: scratch_store_dwordx3 v0, v[1:3], off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
bb:
  store volatile <3 x i32> <i32 1, i32 2, i32 3>, <3 x i32> addrspace(5)* %arg, align 1
  %load = load volatile <3 x i32>, <3 x i32> addrspace(5)* %arg, align 1
  ret void
}

; Byte-aligned <4 x i32>: selected as dwordx4 scratch operations.
define void @store_load_v4i32_unaligned(<4 x i32> addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_v4i32_unaligned:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s3, 4
; GFX9-NEXT: s_mov_b32 s2, 3
; GFX9-NEXT: s_mov_b32 s1, 2
; GFX9-NEXT: s_mov_b32 s0, 1
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: scratch_store_dwordx4 v0, v[1:4], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_v4i32_unaligned:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_mov_b32 s3, 4
; GFX10-NEXT: s_mov_b32 s2, 3
; GFX10-NEXT: s_mov_b32 s1, 2
; GFX10-NEXT: s_mov_b32 s0, 1
; GFX10-NEXT: v_mov_b32_e32 v4, s3
; GFX10-NEXT: v_mov_b32_e32 v3, s2
; GFX10-NEXT: v_mov_b32_e32 v2, s1
; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: scratch_store_dwordx4 v0, v[1:4], off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
bb:
  store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %arg, align 1
  %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %arg, align 1
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()