1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s 3; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s 4; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9-PAL %s 5; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1010-PAL %s 6; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1030-PAL %s 7 8define amdgpu_kernel void @zero_init_kernel() { 9; GFX9-LABEL: zero_init_kernel: 10; GFX9: ; %bb.0: 11; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 12; GFX9-NEXT: s_mov_b32 s0, 0 13; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 14; GFX9-NEXT: s_mov_b32 s1, s0 15; GFX9-NEXT: s_mov_b32 s2, s0 16; GFX9-NEXT: s_mov_b32 s3, s0 17; GFX9-NEXT: v_mov_b32_e32 v0, s0 18; GFX9-NEXT: v_mov_b32_e32 v1, s1 19; GFX9-NEXT: v_mov_b32_e32 v2, s2 20; GFX9-NEXT: v_mov_b32_e32 v3, s3 21; GFX9-NEXT: s_mov_b32 vcc_hi, 0 22; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64 23; GFX9-NEXT: s_mov_b32 vcc_hi, 0 24; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 25; GFX9-NEXT: s_mov_b32 vcc_hi, 0 26; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 27; GFX9-NEXT: s_mov_b32 vcc_hi, 0 28; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 29; GFX9-NEXT: s_endpgm 30; 31; GFX10-LABEL: zero_init_kernel: 32; GFX10: ; %bb.0: 33; GFX10-NEXT: s_add_u32 s0, s0, s3 34; GFX10-NEXT: s_addc_u32 s1, s1, 0 35; GFX10-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s0 36; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 37; GFX10-NEXT: s_mov_b32 s0, 0 38; GFX10-NEXT: s_mov_b32 s1, s0 39; GFX10-NEXT: s_mov_b32 s2, s0 40; GFX10-NEXT: s_mov_b32 s3, s0 41; GFX10-NEXT: v_mov_b32_e32 v0, s0 42; GFX10-NEXT: v_mov_b32_e32 v1, s1 43; GFX10-NEXT: v_mov_b32_e32 v2, s2 44; GFX10-NEXT: v_mov_b32_e32 v3, s3 45; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:64 46; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:48 47; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32 48; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:16 49; GFX10-NEXT: s_endpgm 50; 51; GFX9-PAL-LABEL: zero_init_kernel: 52; GFX9-PAL: ; %bb.0: 53; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 54; GFX9-PAL-NEXT: s_mov_b32 s2, s0 55; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 56; GFX9-PAL-NEXT: s_mov_b32 s0, 0 57; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 58; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 59; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 60; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 61; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 62; GFX9-PAL-NEXT: s_mov_b32 s1, s0 63; GFX9-PAL-NEXT: s_mov_b32 s2, s0 64; GFX9-PAL-NEXT: s_mov_b32 s3, s0 65; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 66; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 67; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 68; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 69; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64 70; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 71; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 72; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 73; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 74; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 75; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 76; GFX9-PAL-NEXT: s_endpgm 77; 78; GFX1010-PAL-LABEL: zero_init_kernel: 79; GFX1010-PAL: ; %bb.0: 80; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 81; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 82; GFX1010-PAL-NEXT: s_load_dwordx2 
s[2:3], s[2:3], 0x0 83; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 84; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 85; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 86; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 87; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 88; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 89; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 90; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 91; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 92; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 93; GFX1010-PAL-NEXT: s_mov_b32 s3, s0 94; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0 95; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1 96; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 97; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 98; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:64 99; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 100; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 101; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 102; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 103; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 104; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 105; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 106; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 107; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 108; GFX1010-PAL-NEXT: s_endpgm 109; 110; GFX1030-PAL-LABEL: zero_init_kernel: 111; GFX1030-PAL: ; %bb.0: 112; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 113; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 114; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 115; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 116; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 117; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 118; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 119; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 120; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 121; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 122; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 123; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 124; GFX1030-PAL-NEXT: s_mov_b32 s3, s0 125; 
GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0 126; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1 127; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 128; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3 129; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:64 130; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:48 131; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32 132; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:16 133; GFX1030-PAL-NEXT: s_endpgm 134 %alloca = alloca [32 x i16], align 2, addrspace(5) 135 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* 136 call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) 137 ret void 138} 139 140define void @zero_init_foo() { 141; GFX9-LABEL: zero_init_foo: 142; GFX9: ; %bb.0: 143; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 144; GFX9-NEXT: s_mov_b32 s0, 0 145; GFX9-NEXT: s_mov_b32 s1, s0 146; GFX9-NEXT: s_mov_b32 s2, s0 147; GFX9-NEXT: s_mov_b32 s3, s0 148; GFX9-NEXT: v_mov_b32_e32 v0, s0 149; GFX9-NEXT: v_mov_b32_e32 v1, s1 150; GFX9-NEXT: v_mov_b32_e32 v2, s2 151; GFX9-NEXT: v_mov_b32_e32 v3, s3 152; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48 153; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 154; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 155; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 156; GFX9-NEXT: s_waitcnt vmcnt(0) 157; GFX9-NEXT: s_setpc_b64 s[30:31] 158; 159; GFX10-LABEL: zero_init_foo: 160; GFX10: ; %bb.0: 161; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 162; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 163; GFX10-NEXT: s_mov_b32 s0, 0 164; GFX10-NEXT: s_mov_b32 s1, s0 165; GFX10-NEXT: s_mov_b32 s2, s0 166; GFX10-NEXT: s_mov_b32 s3, s0 167; GFX10-NEXT: v_mov_b32_e32 v0, s0 168; GFX10-NEXT: v_mov_b32_e32 v1, s1 169; GFX10-NEXT: v_mov_b32_e32 v2, s2 170; GFX10-NEXT: v_mov_b32_e32 v3, s3 171; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 
offset:48 172; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 173; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 174; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 175; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 176; GFX10-NEXT: s_setpc_b64 s[30:31] 177; 178; GFX9-PAL-LABEL: zero_init_foo: 179; GFX9-PAL: ; %bb.0: 180; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 181; GFX9-PAL-NEXT: s_mov_b32 s0, 0 182; GFX9-PAL-NEXT: s_mov_b32 s1, s0 183; GFX9-PAL-NEXT: s_mov_b32 s2, s0 184; GFX9-PAL-NEXT: s_mov_b32 s3, s0 185; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 186; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 187; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 188; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 189; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48 190; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 191; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 192; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 193; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 194; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 195; 196; GFX10-PAL-LABEL: zero_init_foo: 197; GFX10-PAL: ; %bb.0: 198; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 199; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 200; GFX10-PAL-NEXT: s_mov_b32 s0, 0 201; GFX10-PAL-NEXT: s_mov_b32 s1, s0 202; GFX10-PAL-NEXT: s_mov_b32 s2, s0 203; GFX10-PAL-NEXT: s_mov_b32 s3, s0 204; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 205; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 206; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 207; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3 208; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48 209; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 210; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 211; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 212; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 213; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 214 %alloca = alloca [32 x i16], align 2, addrspace(5) 215 %cast = bitcast [32 x i16] 
addrspace(5)* %alloca to i8 addrspace(5)* 216 call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) 217 ret void 218} 219 220define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { 221; GFX9-LABEL: store_load_sindex_kernel: 222; GFX9: ; %bb.0: ; %bb 223; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 224; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 225; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 226; GFX9-NEXT: v_mov_b32_e32 v0, 15 227; GFX9-NEXT: s_waitcnt lgkmcnt(0) 228; GFX9-NEXT: s_lshl_b32 s1, s0, 2 229; GFX9-NEXT: s_and_b32 s0, s0, 15 230; GFX9-NEXT: s_lshl_b32 s0, s0, 2 231; GFX9-NEXT: s_add_i32 s1, s1, 4 232; GFX9-NEXT: scratch_store_dword off, v0, s1 233; GFX9-NEXT: s_waitcnt vmcnt(0) 234; GFX9-NEXT: s_add_i32 s0, s0, 4 235; GFX9-NEXT: scratch_load_dword v0, off, s0 glc 236; GFX9-NEXT: s_waitcnt vmcnt(0) 237; GFX9-NEXT: s_endpgm 238; 239; GFX10-LABEL: store_load_sindex_kernel: 240; GFX10: ; %bb.0: ; %bb 241; GFX10-NEXT: s_add_u32 s2, s2, s5 242; GFX10-NEXT: s_addc_u32 s3, s3, 0 243; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 244; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 245; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 246; GFX10-NEXT: v_mov_b32_e32 v0, 15 247; GFX10-NEXT: s_waitcnt lgkmcnt(0) 248; GFX10-NEXT: s_and_b32 s1, s0, 15 249; GFX10-NEXT: s_lshl_b32 s0, s0, 2 250; GFX10-NEXT: s_lshl_b32 s1, s1, 2 251; GFX10-NEXT: s_add_i32 s0, s0, 4 252; GFX10-NEXT: s_add_i32 s1, s1, 4 253; GFX10-NEXT: scratch_store_dword off, v0, s0 254; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 255; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc 256; GFX10-NEXT: s_waitcnt vmcnt(0) 257; GFX10-NEXT: s_endpgm 258; 259; GFX9-PAL-LABEL: store_load_sindex_kernel: 260; GFX9-PAL: ; %bb.0: ; %bb 261; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] 262; GFX9-PAL-NEXT: s_mov_b32 s4, s0 263; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 264; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 265; GFX9-PAL-NEXT: s_load_dword s0, 
s[0:1], 0x24 266; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 267; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff 268; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 269; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 270; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 271; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 272; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 273; GFX9-PAL-NEXT: s_add_i32 s1, s1, 4 274; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 275; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 276; GFX9-PAL-NEXT: s_add_i32 s0, s0, 4 277; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc 278; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 279; GFX9-PAL-NEXT: s_endpgm 280; 281; GFX10-PAL-LABEL: store_load_sindex_kernel: 282; GFX10-PAL: ; %bb.0: ; %bb 283; GFX10-PAL-NEXT: s_getpc_b64 s[4:5] 284; GFX10-PAL-NEXT: s_mov_b32 s4, s0 285; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 286; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 287; GFX10-PAL-NEXT: s_and_b32 s5, s5, 0xffff 288; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3 289; GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0 290; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 291; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 292; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 293; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 294; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 295; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 296; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 297; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 298; GFX10-PAL-NEXT: s_add_i32 s0, s0, 4 299; GFX10-PAL-NEXT: s_add_i32 s1, s1, 4 300; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 301; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 302; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 303; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 304; GFX10-PAL-NEXT: s_endpgm 305bb: 306 %i = alloca [32 x float], align 4, addrspace(5) 307 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 308 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 309 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 
310 store volatile i32 15, i32 addrspace(5)* %i8, align 4 311 %i9 = and i32 %idx, 15 312 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 313 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 314 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 315 ret void 316} 317 318define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { 319; GFX9-LABEL: store_load_sindex_foo: 320; GFX9: ; %bb.0: ; %bb 321; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 322; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 323; GFX9-NEXT: s_lshl_b32 s0, s2, 2 324; GFX9-NEXT: s_add_i32 s0, s0, 4 325; GFX9-NEXT: v_mov_b32_e32 v0, 15 326; GFX9-NEXT: scratch_store_dword off, v0, s0 327; GFX9-NEXT: s_waitcnt vmcnt(0) 328; GFX9-NEXT: s_and_b32 s0, s2, 15 329; GFX9-NEXT: s_lshl_b32 s0, s0, 2 330; GFX9-NEXT: s_add_i32 s0, s0, 4 331; GFX9-NEXT: scratch_load_dword v0, off, s0 glc 332; GFX9-NEXT: s_waitcnt vmcnt(0) 333; GFX9-NEXT: s_endpgm 334; 335; GFX10-LABEL: store_load_sindex_foo: 336; GFX10: ; %bb.0: ; %bb 337; GFX10-NEXT: s_add_u32 s0, s0, s3 338; GFX10-NEXT: s_addc_u32 s1, s1, 0 339; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 340; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 341; GFX10-NEXT: s_and_b32 s0, s2, 15 342; GFX10-NEXT: v_mov_b32_e32 v0, 15 343; GFX10-NEXT: s_lshl_b32 s1, s2, 2 344; GFX10-NEXT: s_lshl_b32 s0, s0, 2 345; GFX10-NEXT: s_add_i32 s1, s1, 4 346; GFX10-NEXT: s_add_i32 s0, s0, 4 347; GFX10-NEXT: scratch_store_dword off, v0, s1 348; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 349; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc 350; GFX10-NEXT: s_waitcnt vmcnt(0) 351; GFX10-NEXT: s_endpgm 352; 353; GFX9-PAL-LABEL: store_load_sindex_foo: 354; GFX9-PAL: ; %bb.0: ; %bb 355; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 356; GFX9-PAL-NEXT: s_mov_b32 s2, s0 357; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 358; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 359; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 360; GFX9-PAL-NEXT: 
s_and_b32 s3, s3, 0xffff 361; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 362; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 363; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 364; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 365; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 366; GFX9-PAL-NEXT: s_add_i32 s1, s1, 4 367; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 368; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 369; GFX9-PAL-NEXT: s_add_i32 s0, s0, 4 370; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc 371; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 372; GFX9-PAL-NEXT: s_endpgm 373; 374; GFX10-PAL-LABEL: store_load_sindex_foo: 375; GFX10-PAL: ; %bb.0: ; %bb 376; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] 377; GFX10-PAL-NEXT: s_mov_b32 s2, s0 378; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 379; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 380; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff 381; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 382; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 383; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 384; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 385; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 386; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 387; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 388; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 389; GFX10-PAL-NEXT: s_add_i32 s0, s0, 4 390; GFX10-PAL-NEXT: s_add_i32 s1, s1, 4 391; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 392; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 393; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 394; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 395; GFX10-PAL-NEXT: s_endpgm 396bb: 397 %i = alloca [32 x float], align 4, addrspace(5) 398 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 399 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 400 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 401 store volatile i32 15, i32 addrspace(5)* %i8, align 4 402 %i9 = and i32 %idx, 15 403 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, 
i32 %i9 404 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 405 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 406 ret void 407} 408 409define amdgpu_kernel void @store_load_vindex_kernel() { 410; GFX9-LABEL: store_load_vindex_kernel: 411; GFX9: ; %bb.0: ; %bb 412; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 413; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 414; GFX9-NEXT: v_mov_b32_e32 v1, 4 415; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 416; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 417; GFX9-NEXT: v_mov_b32_e32 v3, 15 418; GFX9-NEXT: scratch_store_dword v2, v3, off 419; GFX9-NEXT: s_waitcnt vmcnt(0) 420; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 421; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc 422; GFX9-NEXT: s_waitcnt vmcnt(0) 423; GFX9-NEXT: s_endpgm 424; 425; GFX10-LABEL: store_load_vindex_kernel: 426; GFX10: ; %bb.0: ; %bb 427; GFX10-NEXT: s_add_u32 s0, s0, s3 428; GFX10-NEXT: s_addc_u32 s1, s1, 0 429; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 430; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 431; GFX10-NEXT: v_mov_b32_e32 v1, 4 432; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 433; GFX10-NEXT: v_mov_b32_e32 v3, 15 434; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0 435; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0 436; GFX10-NEXT: scratch_store_dword v2, v3, off 437; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 438; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 439; GFX10-NEXT: s_waitcnt vmcnt(0) 440; GFX10-NEXT: s_endpgm 441; 442; GFX9-PAL-LABEL: store_load_vindex_kernel: 443; GFX9-PAL: ; %bb.0: ; %bb 444; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 445; GFX9-PAL-NEXT: s_mov_b32 s2, s0 446; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 447; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 448; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4 449; GFX9-PAL-NEXT: v_add_u32_e32 v2, v1, v0 450; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 451; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 452; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 453; GFX9-PAL-NEXT: s_add_u32 
flat_scratch_lo, s2, s1 454; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 455; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off 456; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 457; GFX9-PAL-NEXT: v_sub_u32_e32 v0, v1, v0 458; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc 459; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 460; GFX9-PAL-NEXT: s_endpgm 461; 462; GFX10-PAL-LABEL: store_load_vindex_kernel: 463; GFX10-PAL: ; %bb.0: ; %bb 464; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] 465; GFX10-PAL-NEXT: s_mov_b32 s2, s0 466; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 467; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 468; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff 469; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 470; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 471; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 472; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 473; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 4 474; GFX10-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 475; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15 476; GFX10-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 477; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 478; GFX10-PAL-NEXT: scratch_store_dword v2, v3, off 479; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 480; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 481; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 482; GFX10-PAL-NEXT: s_endpgm 483bb: 484 %i = alloca [32 x float], align 4, addrspace(5) 485 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 486 %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() 487 %i3 = zext i32 %i2 to i64 488 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2 489 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 490 store volatile i32 15, i32 addrspace(5)* %i8, align 4 491 %i9 = sub nsw i32 31, %i2 492 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 493 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 494 %i12 = load volatile i32, i32 addrspace(5)* 
%i11, align 4 495 ret void 496} 497 498define void @store_load_vindex_foo(i32 %idx) { 499; GFX9-LABEL: store_load_vindex_foo: 500; GFX9: ; %bb.0: ; %bb 501; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 502; GFX9-NEXT: v_mov_b32_e32 v1, s32 503; GFX9-NEXT: v_mov_b32_e32 v3, 15 504; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 505; GFX9-NEXT: v_and_b32_e32 v0, v0, v3 506; GFX9-NEXT: scratch_store_dword v2, v3, off 507; GFX9-NEXT: s_waitcnt vmcnt(0) 508; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 509; GFX9-NEXT: scratch_load_dword v0, v0, off glc 510; GFX9-NEXT: s_waitcnt vmcnt(0) 511; GFX9-NEXT: s_setpc_b64 s[30:31] 512; 513; GFX10-LABEL: store_load_vindex_foo: 514; GFX10: ; %bb.0: ; %bb 515; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 516; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 517; GFX10-NEXT: v_mov_b32_e32 v1, 15 518; GFX10-NEXT: v_mov_b32_e32 v2, s32 519; GFX10-NEXT: v_and_b32_e32 v3, v0, v1 520; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v2 521; GFX10-NEXT: v_lshl_add_u32 v2, v3, 2, v2 522; GFX10-NEXT: scratch_store_dword v0, v1, off 523; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 524; GFX10-NEXT: scratch_load_dword v0, v2, off glc dlc 525; GFX10-NEXT: s_waitcnt vmcnt(0) 526; GFX10-NEXT: s_setpc_b64 s[30:31] 527; 528; GFX9-PAL-LABEL: store_load_vindex_foo: 529; GFX9-PAL: ; %bb.0: ; %bb 530; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 531; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s32 532; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 533; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 534; GFX9-PAL-NEXT: v_and_b32_e32 v0, v0, v3 535; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off 536; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 537; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 538; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc 539; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 540; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 541; 542; GFX10-PAL-LABEL: store_load_vindex_foo: 543; GFX10-PAL: ; %bb.0: ; %bb 544; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 545; GFX10-PAL-NEXT: s_waitcnt_vscnt 
null, 0x0 546; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 547; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s32 548; GFX10-PAL-NEXT: v_and_b32_e32 v3, v0, v1 549; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v2 550; GFX10-PAL-NEXT: v_lshl_add_u32 v2, v3, 2, v2 551; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off 552; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 553; GFX10-PAL-NEXT: scratch_load_dword v0, v2, off glc dlc 554; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 555; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 556bb: 557 %i = alloca [32 x float], align 4, addrspace(5) 558 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 559 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 560 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 561 store volatile i32 15, i32 addrspace(5)* %i8, align 4 562 %i9 = and i32 %idx, 15 563 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 564 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 565 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 566 ret void 567} 568 569define void @private_ptr_foo(float addrspace(5)* nocapture %arg) { 570; GFX9-LABEL: private_ptr_foo: 571; GFX9: ; %bb.0: 572; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 573; GFX9-NEXT: v_mov_b32_e32 v1, 0x41200000 574; GFX9-NEXT: scratch_store_dword v0, v1, off offset:4 575; GFX9-NEXT: s_waitcnt vmcnt(0) 576; GFX9-NEXT: s_setpc_b64 s[30:31] 577; 578; GFX10-LABEL: private_ptr_foo: 579; GFX10: ; %bb.0: 580; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 581; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 582; GFX10-NEXT: v_mov_b32_e32 v1, 0x41200000 583; GFX10-NEXT: scratch_store_dword v0, v1, off offset:4 584; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 585; GFX10-NEXT: s_setpc_b64 s[30:31] 586; 587; GFX9-PAL-LABEL: private_ptr_foo: 588; GFX9-PAL: ; %bb.0: 589; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 590; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0x41200000 591; GFX9-PAL-NEXT: 
scratch_store_dword v0, v1, off offset:4 592; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 593; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 594; 595; GFX10-PAL-LABEL: private_ptr_foo: 596; GFX10-PAL: ; %bb.0: 597; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 598; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 599; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 0x41200000 600; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off offset:4 601; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 602; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 603 %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1 604 store float 1.000000e+01, float addrspace(5)* %gep, align 4 605 ret void 606} 607 608define amdgpu_kernel void @zero_init_small_offset_kernel() { 609; GFX9-LABEL: zero_init_small_offset_kernel: 610; GFX9: ; %bb.0: 611; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 612; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 613; GFX9-NEXT: s_mov_b32 vcc_hi, 0 614; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 615; GFX9-NEXT: s_waitcnt vmcnt(0) 616; GFX9-NEXT: s_mov_b32 s0, 0 617; GFX9-NEXT: s_mov_b32 s1, s0 618; GFX9-NEXT: s_mov_b32 s2, s0 619; GFX9-NEXT: s_mov_b32 s3, s0 620; GFX9-NEXT: v_mov_b32_e32 v0, s0 621; GFX9-NEXT: v_mov_b32_e32 v1, s1 622; GFX9-NEXT: v_mov_b32_e32 v2, s2 623; GFX9-NEXT: v_mov_b32_e32 v3, s3 624; GFX9-NEXT: s_mov_b32 vcc_hi, 0 625; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272 626; GFX9-NEXT: s_mov_b32 vcc_hi, 0 627; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288 628; GFX9-NEXT: s_mov_b32 vcc_hi, 0 629; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304 630; GFX9-NEXT: s_mov_b32 vcc_hi, 0 631; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320 632; GFX9-NEXT: s_endpgm 633; 634; GFX10-LABEL: zero_init_small_offset_kernel: 635; GFX10: ; %bb.0: 636; GFX10-NEXT: s_add_u32 s0, s0, s3 637; GFX10-NEXT: s_addc_u32 s1, s1, 0 638; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 639; GFX10-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s1 640; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 641; GFX10-NEXT: s_waitcnt vmcnt(0) 642; GFX10-NEXT: s_mov_b32 s0, 0 643; GFX10-NEXT: s_mov_b32 s1, s0 644; GFX10-NEXT: s_mov_b32 s2, s0 645; GFX10-NEXT: s_mov_b32 s3, s0 646; GFX10-NEXT: v_mov_b32_e32 v0, s0 647; GFX10-NEXT: v_mov_b32_e32 v1, s1 648; GFX10-NEXT: v_mov_b32_e32 v2, s2 649; GFX10-NEXT: v_mov_b32_e32 v3, s3 650; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272 651; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288 652; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304 653; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:320 654; GFX10-NEXT: s_endpgm 655; 656; GFX9-PAL-LABEL: zero_init_small_offset_kernel: 657; GFX9-PAL: ; %bb.0: 658; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 659; GFX9-PAL-NEXT: s_mov_b32 s2, s0 660; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 661; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 662; GFX9-PAL-NEXT: s_mov_b32 s0, 0 663; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 664; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 665; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 666; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 667; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 668; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 669; GFX9-PAL-NEXT: s_mov_b32 s1, s0 670; GFX9-PAL-NEXT: s_mov_b32 s2, s0 671; GFX9-PAL-NEXT: s_mov_b32 s3, s0 672; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 673; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 674; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 675; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 676; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 677; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272 678; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 679; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288 680; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 681; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304 682; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 683; GFX9-PAL-NEXT: scratch_store_dwordx4 
off, v[0:3], vcc_hi offset:320 684; GFX9-PAL-NEXT: s_endpgm 685; 686; GFX1010-PAL-LABEL: zero_init_small_offset_kernel: 687; GFX1010-PAL: ; %bb.0: 688; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 689; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 690; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 691; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 692; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 693; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 694; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 695; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 696; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 697; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 698; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 699; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc 700; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 701; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 702; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 703; GFX1010-PAL-NEXT: s_mov_b32 s3, s0 704; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0 705; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1 706; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 707; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 708; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 709; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:272 710; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 711; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 712; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:288 713; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 714; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 715; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:304 716; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 717; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 718; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:320 719; GFX1010-PAL-NEXT: s_endpgm 720; 721; GFX1030-PAL-LABEL: zero_init_small_offset_kernel: 722; GFX1030-PAL: ; %bb.0: 723; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 724; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 725; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 726; GFX1030-PAL-NEXT: 
; Test case @zero_init_small_offset_foo (its `define` appears part-way through the next line).
; A function with the default calling convention that allocates a [64 x i32] %padding and a
; [32 x i16] %alloca in addrspace(5), performs one volatile load from %padding, and then
; memsets 64 zero bytes into %alloca.
; For every run configuration the expected code zero-fills the array with four
; scratch_store_dwordx4 instructions addressed from s32 at offsets 256, 272, 288 and 304,
; which matches the 256-byte size of %padding (64 x 4 bytes).
s_waitcnt lgkmcnt(0) 727; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 728; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 729; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 730; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 731; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 732; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 733; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 734; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 735; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 736; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 737; GFX1030-PAL-NEXT: s_mov_b32 s3, s0 738; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0 739; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1 740; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 741; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3 742; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272 743; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288 744; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304 745; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:320 746; GFX1030-PAL-NEXT: s_endpgm 747 %padding = alloca [64 x i32], align 4, addrspace(5) 748 %alloca = alloca [32 x i16], align 2, addrspace(5) 749 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 750 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 751 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* 752 call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) 753 ret void 754} 755 756define void @zero_init_small_offset_foo() { 757; GFX9-LABEL: zero_init_small_offset_foo: 758; GFX9: ; %bb.0: 759; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 760; GFX9-NEXT: scratch_load_dword v0, off, s32 glc 761; GFX9-NEXT: s_waitcnt vmcnt(0) 762; GFX9-NEXT: s_mov_b32 s0, 0 763; GFX9-NEXT: s_mov_b32 s1, s0 764; GFX9-NEXT: s_mov_b32 s2, s0 765; GFX9-NEXT: s_mov_b32 s3, s0 766; GFX9-NEXT: v_mov_b32_e32 v0, s0 767; GFX9-NEXT: 
v_mov_b32_e32 v1, s1 768; GFX9-NEXT: v_mov_b32_e32 v2, s2 769; GFX9-NEXT: v_mov_b32_e32 v3, s3 770; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 771; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 772; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 773; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 774; GFX9-NEXT: s_waitcnt vmcnt(0) 775; GFX9-NEXT: s_setpc_b64 s[30:31] 776; 777; GFX10-LABEL: zero_init_small_offset_foo: 778; GFX10: ; %bb.0: 779; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 780; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 781; GFX10-NEXT: scratch_load_dword v0, off, s32 glc dlc 782; GFX10-NEXT: s_waitcnt vmcnt(0) 783; GFX10-NEXT: s_mov_b32 s0, 0 784; GFX10-NEXT: s_mov_b32 s1, s0 785; GFX10-NEXT: s_mov_b32 s2, s0 786; GFX10-NEXT: s_mov_b32 s3, s0 787; GFX10-NEXT: v_mov_b32_e32 v0, s0 788; GFX10-NEXT: v_mov_b32_e32 v1, s1 789; GFX10-NEXT: v_mov_b32_e32 v2, s2 790; GFX10-NEXT: v_mov_b32_e32 v3, s3 791; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 792; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 793; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 794; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 795; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 796; GFX10-NEXT: s_setpc_b64 s[30:31] 797; 798; GFX9-PAL-LABEL: zero_init_small_offset_foo: 799; GFX9-PAL: ; %bb.0: 800; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 801; GFX9-PAL-NEXT: scratch_load_dword v0, off, s32 glc 802; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 803; GFX9-PAL-NEXT: s_mov_b32 s0, 0 804; GFX9-PAL-NEXT: s_mov_b32 s1, s0 805; GFX9-PAL-NEXT: s_mov_b32 s2, s0 806; GFX9-PAL-NEXT: s_mov_b32 s3, s0 807; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 808; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 809; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 810; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 811; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 812; GFX9-PAL-NEXT: scratch_store_dwordx4 off, 
; Test case @store_load_sindex_small_offset_kernel (its `define` appears part-way through the next line).
; An amdgpu_kernel taking i32 %idx. It allocates a [64 x i32] %padding and a [32 x float] %i in
; addrspace(5), performs a volatile load from %padding, a volatile store of 15 to element %idx
; of %i, and a volatile load from element (%idx and 15) of %i. %i1 is defined but not used.
; In the expected code both addresses are formed with scalar instructions: the index is shifted
; left by 2 and 0x104 is added with s_addk_i32 before the scratch_store_dword / scratch_load_dword.
v[0:3], s32 offset:272 813; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 814; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 815; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 816; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 817; 818; GFX10-PAL-LABEL: zero_init_small_offset_foo: 819; GFX10-PAL: ; %bb.0: 820; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 821; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 822; GFX10-PAL-NEXT: scratch_load_dword v0, off, s32 glc dlc 823; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 824; GFX10-PAL-NEXT: s_mov_b32 s0, 0 825; GFX10-PAL-NEXT: s_mov_b32 s1, s0 826; GFX10-PAL-NEXT: s_mov_b32 s2, s0 827; GFX10-PAL-NEXT: s_mov_b32 s3, s0 828; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 829; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 830; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 831; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3 832; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 833; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 834; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 835; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 836; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 837; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 838 %padding = alloca [64 x i32], align 4, addrspace(5) 839 %alloca = alloca [32 x i16], align 2, addrspace(5) 840 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 841 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 842 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* 843 call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) 844 ret void 845} 846 847define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { 848; GFX9-LABEL: store_load_sindex_small_offset_kernel: 849; GFX9: ; %bb.0: ; %bb 850; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 851; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 852; GFX9-NEXT: s_addc_u32 
flat_scratch_hi, s3, 0 853; GFX9-NEXT: s_mov_b32 vcc_hi, 0 854; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 855; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 856; GFX9-NEXT: s_lshl_b32 s1, s0, 2 857; GFX9-NEXT: s_and_b32 s0, s0, 15 858; GFX9-NEXT: s_lshl_b32 s0, s0, 2 859; GFX9-NEXT: v_mov_b32_e32 v0, 15 860; GFX9-NEXT: s_addk_i32 s1, 0x104 861; GFX9-NEXT: scratch_store_dword off, v0, s1 862; GFX9-NEXT: s_waitcnt vmcnt(0) 863; GFX9-NEXT: s_addk_i32 s0, 0x104 864; GFX9-NEXT: scratch_load_dword v0, off, s0 glc 865; GFX9-NEXT: s_waitcnt vmcnt(0) 866; GFX9-NEXT: s_endpgm 867; 868; GFX10-LABEL: store_load_sindex_small_offset_kernel: 869; GFX10: ; %bb.0: ; %bb 870; GFX10-NEXT: s_add_u32 s2, s2, s5 871; GFX10-NEXT: s_addc_u32 s3, s3, 0 872; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 873; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 874; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 875; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 876; GFX10-NEXT: s_waitcnt vmcnt(0) 877; GFX10-NEXT: v_mov_b32_e32 v0, 15 878; GFX10-NEXT: s_waitcnt lgkmcnt(0) 879; GFX10-NEXT: s_and_b32 s1, s0, 15 880; GFX10-NEXT: s_lshl_b32 s0, s0, 2 881; GFX10-NEXT: s_lshl_b32 s1, s1, 2 882; GFX10-NEXT: s_addk_i32 s0, 0x104 883; GFX10-NEXT: s_addk_i32 s1, 0x104 884; GFX10-NEXT: scratch_store_dword off, v0, s0 885; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 886; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc 887; GFX10-NEXT: s_waitcnt vmcnt(0) 888; GFX10-NEXT: s_endpgm 889; 890; GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel: 891; GFX9-PAL: ; %bb.0: ; %bb 892; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] 893; GFX9-PAL-NEXT: s_mov_b32 s4, s0 894; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 895; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 896; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 897; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 898; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff 899; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 900; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, 
s5, 0 901; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 902; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 903; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 904; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 905; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 906; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 907; GFX9-PAL-NEXT: s_addk_i32 s1, 0x104 908; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 909; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 910; GFX9-PAL-NEXT: s_addk_i32 s0, 0x104 911; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc 912; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 913; GFX9-PAL-NEXT: s_endpgm 914; 915; GFX1010-PAL-LABEL: store_load_sindex_small_offset_kernel: 916; GFX1010-PAL: ; %bb.0: ; %bb 917; GFX1010-PAL-NEXT: s_getpc_b64 s[4:5] 918; GFX1010-PAL-NEXT: s_mov_b32 s4, s0 919; GFX1010-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 920; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 921; GFX1010-PAL-NEXT: s_and_b32 s5, s5, 0xffff 922; GFX1010-PAL-NEXT: s_add_u32 s4, s4, s3 923; GFX1010-PAL-NEXT: s_addc_u32 s5, s5, 0 924; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 925; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 926; GFX1010-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 927; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 928; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc 929; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 930; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 931; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 932; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15 933; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2 934; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2 935; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x104 936; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x104 937; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0 938; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 939; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 940; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 941; GFX1010-PAL-NEXT: s_endpgm 942; 943; GFX1030-PAL-LABEL: store_load_sindex_small_offset_kernel: 944; GFX1030-PAL: ; %bb.0: ; %bb 945; 
GFX1030-PAL-NEXT: s_getpc_b64 s[4:5] 946; GFX1030-PAL-NEXT: s_mov_b32 s4, s0 947; GFX1030-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 948; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 949; GFX1030-PAL-NEXT: s_and_b32 s5, s5, 0xffff 950; GFX1030-PAL-NEXT: s_add_u32 s4, s4, s3 951; GFX1030-PAL-NEXT: s_addc_u32 s5, s5, 0 952; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 953; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 954; GFX1030-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 955; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 956; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 957; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 958; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 959; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 960; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 961; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2 962; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x104 963; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x104 964; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0 965; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 966; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 967; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 968; GFX1030-PAL-NEXT: s_endpgm 969bb: 970 %padding = alloca [64 x i32], align 4, addrspace(5) 971 %i = alloca [32 x float], align 4, addrspace(5) 972 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 973 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 974 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 975 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 976 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 977 store volatile i32 15, i32 addrspace(5)* %i8, align 4 978 %i9 = and i32 %idx, 15 979 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 980 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 981 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 982 ret void 983} 
; Test case @store_load_sindex_small_offset_foo.
; An amdgpu_ps function whose index is passed as `i32 inreg %idx`. It allocates a [64 x i32]
; %padding and a [32 x float] %i in addrspace(5), performs a volatile load from %padding, a
; volatile store of 15 to element %idx of %i, and a volatile load from element (%idx and 15)
; of %i. %i1 is defined but not used.
; The expected code keeps the index in an SGPR: it is shifted left by 2 and 0x104 is added
; with s_addk_i32 to form the scalar address operand of each scratch access.
984 985define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { 986; GFX9-LABEL: store_load_sindex_small_offset_foo: 987; GFX9: ; %bb.0: ; %bb 988; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 989; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 990; GFX9-NEXT: s_mov_b32 vcc_hi, 0 991; GFX9-NEXT: s_lshl_b32 s0, s2, 2 992; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 993; GFX9-NEXT: s_waitcnt vmcnt(0) 994; GFX9-NEXT: s_addk_i32 s0, 0x104 995; GFX9-NEXT: v_mov_b32_e32 v0, 15 996; GFX9-NEXT: scratch_store_dword off, v0, s0 997; GFX9-NEXT: s_waitcnt vmcnt(0) 998; GFX9-NEXT: s_and_b32 s0, s2, 15 999; GFX9-NEXT: s_lshl_b32 s0, s0, 2 1000; GFX9-NEXT: s_addk_i32 s0, 0x104 1001; GFX9-NEXT: scratch_load_dword v0, off, s0 glc 1002; GFX9-NEXT: s_waitcnt vmcnt(0) 1003; GFX9-NEXT: s_endpgm 1004; 1005; GFX10-LABEL: store_load_sindex_small_offset_foo: 1006; GFX10: ; %bb.0: ; %bb 1007; GFX10-NEXT: s_add_u32 s0, s0, s3 1008; GFX10-NEXT: s_addc_u32 s1, s1, 0 1009; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 1010; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 1011; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 1012; GFX10-NEXT: s_waitcnt vmcnt(0) 1013; GFX10-NEXT: s_and_b32 s0, s2, 15 1014; GFX10-NEXT: v_mov_b32_e32 v0, 15 1015; GFX10-NEXT: s_lshl_b32 s1, s2, 2 1016; GFX10-NEXT: s_lshl_b32 s0, s0, 2 1017; GFX10-NEXT: s_addk_i32 s1, 0x104 1018; GFX10-NEXT: s_addk_i32 s0, 0x104 1019; GFX10-NEXT: scratch_store_dword off, v0, s1 1020; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1021; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc 1022; GFX10-NEXT: s_waitcnt vmcnt(0) 1023; GFX10-NEXT: s_endpgm 1024; 1025; GFX9-PAL-LABEL: store_load_sindex_small_offset_foo: 1026; GFX9-PAL: ; %bb.0: ; %bb 1027; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 1028; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1029; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1030; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1031; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1032; GFX9-PAL-NEXT: 
s_and_b32 s3, s3, 0xffff 1033; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 1034; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1035; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 1036; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 1037; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 1038; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1039; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 1040; GFX9-PAL-NEXT: s_addk_i32 s1, 0x104 1041; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 1042; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 1043; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1044; GFX9-PAL-NEXT: s_addk_i32 s0, 0x104 1045; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc 1046; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1047; GFX9-PAL-NEXT: s_endpgm 1048; 1049; GFX1010-PAL-LABEL: store_load_sindex_small_offset_foo: 1050; GFX1010-PAL: ; %bb.0: ; %bb 1051; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 1052; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 1053; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1054; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 1055; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1056; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 1057; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 1058; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1059; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1060; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 1061; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15 1062; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc 1063; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1064; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 1065; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2 1066; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2 1067; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x104 1068; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x104 1069; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0 1070; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1071; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 1072; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1073; GFX1010-PAL-NEXT: s_endpgm 1074; 1075; GFX1030-PAL-LABEL: 
store_load_sindex_small_offset_foo: 1076; GFX1030-PAL: ; %bb.0: ; %bb 1077; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 1078; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 1079; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1080; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 1081; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1082; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 1083; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 1084; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1085; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1086; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 1087; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1088; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 1089; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 1090; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 1091; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2 1092; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x104 1093; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x104 1094; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0 1095; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1096; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 1097; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1098; GFX1030-PAL-NEXT: s_endpgm 1099bb: 1100 %padding = alloca [64 x i32], align 4, addrspace(5) 1101 %i = alloca [32 x float], align 4, addrspace(5) 1102 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 1103 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1104 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 1105 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 1106 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 1107 store volatile i32 15, i32 addrspace(5)* %i8, align 4 1108 %i9 = and i32 %idx, 15 1109 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 1110 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 1111 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 1112 
; Test case @store_load_vindex_small_offset_kernel (its `define` appears part-way through the next line).
; An amdgpu_kernel with no arguments whose index comes from @llvm.amdgcn.workitem.id.x().
; It allocates a [64 x i32] %padding and a [32 x float] %i in addrspace(5), performs a volatile
; load from %padding, a volatile store of 15 to element %i2 of %i, and a volatile load from
; element (31 - %i2) of %i. %i1 and %i3 are defined but not used.
; The expected code forms both addresses in VGPRs: v0 is shifted left by 2, the store address
; is 0x104 plus that value, and the load address is 0x104 minus that value with an
; instruction offset of 124 (31 x 4 bytes).
ret void 1113} 1114 1115define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { 1116; GFX9-LABEL: store_load_vindex_small_offset_kernel: 1117; GFX9: ; %bb.0: ; %bb 1118; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 1119; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 1120; GFX9-NEXT: s_mov_b32 vcc_hi, 0 1121; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc 1122; GFX9-NEXT: s_waitcnt vmcnt(0) 1123; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1124; GFX9-NEXT: v_mov_b32_e32 v1, 0x104 1125; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 1126; GFX9-NEXT: v_mov_b32_e32 v3, 15 1127; GFX9-NEXT: scratch_store_dword v2, v3, off 1128; GFX9-NEXT: s_waitcnt vmcnt(0) 1129; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 1130; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc 1131; GFX9-NEXT: s_waitcnt vmcnt(0) 1132; GFX9-NEXT: s_endpgm 1133; 1134; GFX10-LABEL: store_load_vindex_small_offset_kernel: 1135; GFX10: ; %bb.0: ; %bb 1136; GFX10-NEXT: s_add_u32 s0, s0, s3 1137; GFX10-NEXT: s_addc_u32 s1, s1, 0 1138; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 1139; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 1140; GFX10-NEXT: v_mov_b32_e32 v1, 0x104 1141; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1142; GFX10-NEXT: v_mov_b32_e32 v3, 15 1143; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0 1144; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0 1145; GFX10-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc 1146; GFX10-NEXT: s_waitcnt vmcnt(0) 1147; GFX10-NEXT: scratch_store_dword v2, v3, off 1148; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1149; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 1150; GFX10-NEXT: s_waitcnt vmcnt(0) 1151; GFX10-NEXT: s_endpgm 1152; 1153; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel: 1154; GFX9-PAL: ; %bb.0: ; %bb 1155; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 1156; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1157; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1158; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1159; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 
1160; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 1161; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1162; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1163; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 1164; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1165; GFX9-PAL-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc 1166; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1167; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0x104 1168; GFX9-PAL-NEXT: v_add_u32_e32 v2, v1, v0 1169; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off 1170; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1171; GFX9-PAL-NEXT: v_sub_u32_e32 v0, v1, v0 1172; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc 1173; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1174; GFX9-PAL-NEXT: s_endpgm 1175; 1176; GFX1010-PAL-LABEL: store_load_vindex_small_offset_kernel: 1177; GFX1010-PAL: ; %bb.0: ; %bb 1178; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 1179; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 1180; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1181; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 1182; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1183; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 1184; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 1185; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1186; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1187; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x104 1188; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1189; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, 15 1190; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 1191; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 1192; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 1193; GFX1010-PAL-NEXT: scratch_load_dword v1, off, vcc_lo offset:4 glc dlc 1194; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1195; GFX1010-PAL-NEXT: scratch_store_dword v2, v3, off 1196; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1197; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 1198; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1199; GFX1010-PAL-NEXT: s_endpgm 1200; 1201; GFX1030-PAL-LABEL: 
store_load_vindex_small_offset_kernel: 1202; GFX1030-PAL: ; %bb.0: ; %bb 1203; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 1204; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 1205; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1206; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 1207; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1208; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 1209; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 1210; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1211; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1212; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x104 1213; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1214; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, 15 1215; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 1216; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 1217; GFX1030-PAL-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc 1218; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1219; GFX1030-PAL-NEXT: scratch_store_dword v2, v3, off 1220; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1221; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 1222; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1223; GFX1030-PAL-NEXT: s_endpgm 1224bb: 1225 %padding = alloca [64 x i32], align 4, addrspace(5) 1226 %i = alloca [32 x float], align 4, addrspace(5) 1227 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 1228 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1229 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 1230 %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() 1231 %i3 = zext i32 %i2 to i64 1232 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2 1233 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 1234 store volatile i32 15, i32 addrspace(5)* %i8, align 4 1235 %i9 = sub nsw i32 31, %i2 1236 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 1237 %i11 = bitcast float addrspace(5)* %i10 to i32 
; Test case @store_load_vindex_small_offset_foo (its `define` appears part-way through the next line).
; A function with the default calling convention taking i32 %idx. It allocates a [64 x i32]
; %padding and a [32 x float] %i in addrspace(5), performs a volatile load from %padding, a
; volatile store of 15 to element %idx of %i, and a volatile load from element (%idx and 15)
; of %i. %i1 is defined but not used.
; In the expected code the array base is computed as s32 + 0x100 with s_add_i32 into
; vcc_hi or vcc_lo, copied to a VGPR, and each element address is formed with
; v_lshl_add_u32 (index shifted left by 2 plus base); the function returns via s_setpc_b64.
addrspace(5)* 1238 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 1239 ret void 1240} 1241 1242define void @store_load_vindex_small_offset_foo(i32 %idx) { 1243; GFX9-LABEL: store_load_vindex_small_offset_foo: 1244; GFX9: ; %bb.0: ; %bb 1245; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1246; GFX9-NEXT: scratch_load_dword v1, off, s32 glc 1247; GFX9-NEXT: s_waitcnt vmcnt(0) 1248; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x100 1249; GFX9-NEXT: v_mov_b32_e32 v1, vcc_hi 1250; GFX9-NEXT: v_mov_b32_e32 v3, 15 1251; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 1252; GFX9-NEXT: v_and_b32_e32 v0, v0, v3 1253; GFX9-NEXT: scratch_store_dword v2, v3, off 1254; GFX9-NEXT: s_waitcnt vmcnt(0) 1255; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 1256; GFX9-NEXT: scratch_load_dword v0, v0, off glc 1257; GFX9-NEXT: s_waitcnt vmcnt(0) 1258; GFX9-NEXT: s_setpc_b64 s[30:31] 1259; 1260; GFX10-LABEL: store_load_vindex_small_offset_foo: 1261; GFX10: ; %bb.0: ; %bb 1262; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1263; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1264; GFX10-NEXT: v_mov_b32_e32 v1, 15 1265; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100 1266; GFX10-NEXT: v_mov_b32_e32 v2, vcc_lo 1267; GFX10-NEXT: v_and_b32_e32 v3, v0, v1 1268; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v2 1269; GFX10-NEXT: v_lshl_add_u32 v2, v3, 2, v2 1270; GFX10-NEXT: scratch_load_dword v3, off, s32 glc dlc 1271; GFX10-NEXT: s_waitcnt vmcnt(0) 1272; GFX10-NEXT: scratch_store_dword v0, v1, off 1273; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1274; GFX10-NEXT: scratch_load_dword v0, v2, off glc dlc 1275; GFX10-NEXT: s_waitcnt vmcnt(0) 1276; GFX10-NEXT: s_setpc_b64 s[30:31] 1277; 1278; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo: 1279; GFX9-PAL: ; %bb.0: ; %bb 1280; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1281; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 glc 1282; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1283; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x100 1284; GFX9-PAL-NEXT: v_mov_b32_e32 v1, vcc_hi 1285; 
GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 1286; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 1287; GFX9-PAL-NEXT: v_and_b32_e32 v0, v0, v3 1288; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off 1289; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1290; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 1291; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc 1292; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1293; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 1294; 1295; GFX10-PAL-LABEL: store_load_vindex_small_offset_foo: 1296; GFX10-PAL: ; %bb.0: ; %bb 1297; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1298; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1299; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 1300; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100 1301; GFX10-PAL-NEXT: v_mov_b32_e32 v2, vcc_lo 1302; GFX10-PAL-NEXT: v_and_b32_e32 v3, v0, v1 1303; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v2 1304; GFX10-PAL-NEXT: v_lshl_add_u32 v2, v3, 2, v2 1305; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 glc dlc 1306; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 1307; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off 1308; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1309; GFX10-PAL-NEXT: scratch_load_dword v0, v2, off glc dlc 1310; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 1311; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 1312bb: 1313 %padding = alloca [64 x i32], align 4, addrspace(5) 1314 %i = alloca [32 x float], align 4, addrspace(5) 1315 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 1316 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1317 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 1318 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 1319 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 1320 store volatile i32 15, i32 addrspace(5)* %i8, align 4 1321 %i9 = and i32 %idx, 15 1322 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 1323 %i11 = bitcast float 
; Test case @zero_init_large_offset_kernel (its `define` appears part-way through the next line).
; An amdgpu_kernel with no arguments that allocates a [4096 x i32] %padding (16384 = 0x4000
; bytes) and a [32 x i16] %alloca in addrspace(5), performs one volatile load from %padding,
; and then memsets 64 zero bytes into %alloca.
; In the expected code the padding load uses offset 16, and the zero-fill is four
; scratch_store_dwordx4 instructions whose scalar base is 0x4010 (loaded with s_movk_i32 into
; vcc_hi or vcc_lo, and re-materialised before each later store) with instruction offsets
; 0, 16, 32 and 48.
addrspace(5)* %i10 to i32 addrspace(5)* 1324 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 1325 ret void 1326} 1327 1328define amdgpu_kernel void @zero_init_large_offset_kernel() { 1329; GFX9-LABEL: zero_init_large_offset_kernel: 1330; GFX9: ; %bb.0: 1331; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 1332; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 1333; GFX9-NEXT: s_mov_b32 vcc_hi, 0 1334; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:16 glc 1335; GFX9-NEXT: s_waitcnt vmcnt(0) 1336; GFX9-NEXT: s_mov_b32 s0, 0 1337; GFX9-NEXT: s_mov_b32 s1, s0 1338; GFX9-NEXT: s_mov_b32 s2, s0 1339; GFX9-NEXT: s_mov_b32 s3, s0 1340; GFX9-NEXT: v_mov_b32_e32 v0, s0 1341; GFX9-NEXT: v_mov_b32_e32 v1, s1 1342; GFX9-NEXT: v_mov_b32_e32 v2, s2 1343; GFX9-NEXT: v_mov_b32_e32 v3, s3 1344; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 1345; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 1346; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 1347; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 1348; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 1349; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 1350; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 1351; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 1352; GFX9-NEXT: s_endpgm 1353; 1354; GFX10-LABEL: zero_init_large_offset_kernel: 1355; GFX10: ; %bb.0: 1356; GFX10-NEXT: s_add_u32 s0, s0, s3 1357; GFX10-NEXT: s_addc_u32 s1, s1, 0 1358; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 1359; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 1360; GFX10-NEXT: scratch_load_dword v0, off, off offset:16 glc dlc 1361; GFX10-NEXT: s_waitcnt vmcnt(0) 1362; GFX10-NEXT: s_mov_b32 s0, 0 1363; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 1364; GFX10-NEXT: s_mov_b32 s1, s0 1365; GFX10-NEXT: s_mov_b32 s2, s0 1366; GFX10-NEXT: s_mov_b32 s3, s0 1367; GFX10-NEXT: v_mov_b32_e32 v0, s0 1368; GFX10-NEXT: v_mov_b32_e32 v1, s1 1369; GFX10-NEXT: v_mov_b32_e32 v2, s2 1370; GFX10-NEXT: v_mov_b32_e32 v3, s3 1371; GFX10-NEXT: 
scratch_store_dwordx4 off, v[0:3], vcc_lo 1372; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 1373; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 1374; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 1375; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 1376; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 1377; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 1378; GFX10-NEXT: s_endpgm 1379; 1380; GFX9-PAL-LABEL: zero_init_large_offset_kernel: 1381; GFX9-PAL: ; %bb.0: 1382; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 1383; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1384; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1385; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1386; GFX9-PAL-NEXT: s_mov_b32 s0, 0 1387; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1388; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1389; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 1390; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1391; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:16 glc 1392; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1393; GFX9-PAL-NEXT: s_mov_b32 s1, s0 1394; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1395; GFX9-PAL-NEXT: s_mov_b32 s3, s0 1396; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 1397; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 1398; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 1399; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 1400; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 1401; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 1402; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 1403; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 1404; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 1405; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 1406; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 1407; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 1408; GFX9-PAL-NEXT: s_endpgm 1409; 1410; GFX1010-PAL-LABEL: zero_init_large_offset_kernel: 1411; GFX1010-PAL: ; %bb.0: 1412; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 1413; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 1414; GFX1010-PAL-NEXT: 
s_load_dwordx2 s[2:3], s[2:3], 0x0 1415; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 1416; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1417; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 1418; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 1419; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1420; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1421; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 1422; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 1423; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:16 glc dlc 1424; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1425; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 1426; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 1427; GFX1010-PAL-NEXT: s_mov_b32 s3, s0 1428; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0 1429; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1 1430; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 1431; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 1432; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 1433; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 1434; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 1435; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 1436; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 1437; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 1438; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 1439; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 1440; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 1441; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 1442; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 1443; GFX1010-PAL-NEXT: s_endpgm 1444; 1445; GFX1030-PAL-LABEL: zero_init_large_offset_kernel: 1446; GFX1030-PAL: ; %bb.0: 1447; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 1448; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 1449; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1450; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 1451; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1452; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 1453; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 1454; GFX1030-PAL-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s2 1455; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1456; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:16 glc dlc 1457; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1458; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 1459; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 1460; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 1461; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 1462; GFX1030-PAL-NEXT: s_mov_b32 s3, s0 1463; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0 1464; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1 1465; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 1466; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3 1467; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 1468; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 1469; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 1470; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 1471; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 1472; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 1473; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 1474; GFX1030-PAL-NEXT: s_endpgm 1475 %padding = alloca [4096 x i32], align 4, addrspace(5) 1476 %alloca = alloca [32 x i16], align 2, addrspace(5) 1477 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 1478 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1479 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* 1480 call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) 1481 ret void 1482} 1483 1484define void @zero_init_large_offset_foo() { 1485; GFX9-LABEL: zero_init_large_offset_foo: 1486; GFX9: ; %bb.0: 1487; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1488; GFX9-NEXT: scratch_load_dword v0, off, s32 glc 1489; GFX9-NEXT: s_waitcnt vmcnt(0) 1490; GFX9-NEXT: s_mov_b32 s0, 0 1491; GFX9-NEXT: s_mov_b32 s1, s0 1492; GFX9-NEXT: s_mov_b32 s2, s0 1493; GFX9-NEXT: s_mov_b32 s3, s0 1494; 
GFX9-NEXT: v_mov_b32_e32 v0, s0 1495; GFX9-NEXT: v_mov_b32_e32 v1, s1 1496; GFX9-NEXT: v_mov_b32_e32 v2, s2 1497; GFX9-NEXT: v_mov_b32_e32 v3, s3 1498; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4000 1499; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 1500; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4000 1501; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 1502; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4000 1503; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 1504; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4000 1505; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 1506; GFX9-NEXT: s_waitcnt vmcnt(0) 1507; GFX9-NEXT: s_setpc_b64 s[30:31] 1508; 1509; GFX10-LABEL: zero_init_large_offset_foo: 1510; GFX10: ; %bb.0: 1511; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1512; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1513; GFX10-NEXT: scratch_load_dword v0, off, s32 glc dlc 1514; GFX10-NEXT: s_waitcnt vmcnt(0) 1515; GFX10-NEXT: s_mov_b32 s0, 0 1516; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4000 1517; GFX10-NEXT: s_mov_b32 s1, s0 1518; GFX10-NEXT: s_mov_b32 s2, s0 1519; GFX10-NEXT: s_mov_b32 s3, s0 1520; GFX10-NEXT: v_mov_b32_e32 v0, s0 1521; GFX10-NEXT: v_mov_b32_e32 v1, s1 1522; GFX10-NEXT: v_mov_b32_e32 v2, s2 1523; GFX10-NEXT: v_mov_b32_e32 v3, s3 1524; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 1525; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4000 1526; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 1527; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4000 1528; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 1529; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4000 1530; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 1531; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1532; GFX10-NEXT: s_setpc_b64 s[30:31] 1533; 1534; GFX9-PAL-LABEL: zero_init_large_offset_foo: 1535; GFX9-PAL: ; %bb.0: 1536; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1537; GFX9-PAL-NEXT: scratch_load_dword v0, off, s32 glc 1538; 
GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1539; GFX9-PAL-NEXT: s_mov_b32 s0, 0 1540; GFX9-PAL-NEXT: s_mov_b32 s1, s0 1541; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1542; GFX9-PAL-NEXT: s_mov_b32 s3, s0 1543; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 1544; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 1545; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 1546; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 1547; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4000 1548; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 1549; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4000 1550; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 1551; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4000 1552; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 1553; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4000 1554; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 1555; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1556; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 1557; 1558; GFX1010-PAL-LABEL: zero_init_large_offset_foo: 1559; GFX1010-PAL: ; %bb.0: 1560; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1561; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1562; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s32 glc dlc 1563; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1564; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 1565; GFX1010-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4000 1566; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 1567; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 1568; GFX1010-PAL-NEXT: s_mov_b32 s3, s0 1569; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0 1570; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1 1571; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 1572; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 1573; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 1574; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 1575; GFX1010-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4000 1576; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 1577; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 1578; GFX1010-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4000 1579; GFX1010-PAL-NEXT: 
scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 1580; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 1581; GFX1010-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4000 1582; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 1583; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1584; GFX1010-PAL-NEXT: s_setpc_b64 s[30:31] 1585; 1586; GFX1030-PAL-LABEL: zero_init_large_offset_foo: 1587; GFX1030-PAL: ; %bb.0: 1588; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1589; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1590; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s32 glc dlc 1591; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1592; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 1593; GFX1030-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4000 1594; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 1595; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 1596; GFX1030-PAL-NEXT: s_mov_b32 s3, s0 1597; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0 1598; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1 1599; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 1600; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3 1601; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 1602; GFX1030-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4000 1603; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 1604; GFX1030-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4000 1605; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 1606; GFX1030-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4000 1607; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 1608; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1609; GFX1030-PAL-NEXT: s_setpc_b64 s[30:31] 1610 %padding = alloca [4096 x i32], align 4, addrspace(5) 1611 %alloca = alloca [32 x i16], align 2, addrspace(5) 1612 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 1613 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1614 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* 1615 call void @llvm.memset.p5i8.i64(i8 
addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) 1616 ret void 1617} 1618 1619define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { 1620; GFX9-LABEL: store_load_sindex_large_offset_kernel: 1621; GFX9: ; %bb.0: ; %bb 1622; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 1623; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 1624; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1625; GFX9-NEXT: s_mov_b32 vcc_hi, 0 1626; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 1627; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1628; GFX9-NEXT: s_lshl_b32 s1, s0, 2 1629; GFX9-NEXT: s_and_b32 s0, s0, 15 1630; GFX9-NEXT: s_lshl_b32 s0, s0, 2 1631; GFX9-NEXT: v_mov_b32_e32 v0, 15 1632; GFX9-NEXT: s_addk_i32 s1, 0x4004 1633; GFX9-NEXT: scratch_store_dword off, v0, s1 1634; GFX9-NEXT: s_waitcnt vmcnt(0) 1635; GFX9-NEXT: s_addk_i32 s0, 0x4004 1636; GFX9-NEXT: scratch_load_dword v0, off, s0 glc 1637; GFX9-NEXT: s_waitcnt vmcnt(0) 1638; GFX9-NEXT: s_endpgm 1639; 1640; GFX10-LABEL: store_load_sindex_large_offset_kernel: 1641; GFX10: ; %bb.0: ; %bb 1642; GFX10-NEXT: s_add_u32 s2, s2, s5 1643; GFX10-NEXT: s_addc_u32 s3, s3, 0 1644; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1645; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1646; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 1647; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 1648; GFX10-NEXT: s_waitcnt vmcnt(0) 1649; GFX10-NEXT: v_mov_b32_e32 v0, 15 1650; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1651; GFX10-NEXT: s_and_b32 s1, s0, 15 1652; GFX10-NEXT: s_lshl_b32 s0, s0, 2 1653; GFX10-NEXT: s_lshl_b32 s1, s1, 2 1654; GFX10-NEXT: s_addk_i32 s0, 0x4004 1655; GFX10-NEXT: s_addk_i32 s1, 0x4004 1656; GFX10-NEXT: scratch_store_dword off, v0, s0 1657; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1658; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc 1659; GFX10-NEXT: s_waitcnt vmcnt(0) 1660; GFX10-NEXT: s_endpgm 1661; 1662; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel: 1663; 
GFX9-PAL: ; %bb.0: ; %bb 1664; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] 1665; GFX9-PAL-NEXT: s_mov_b32 s4, s0 1666; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 1667; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1668; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 1669; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1670; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff 1671; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 1672; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 1673; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 1674; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 1675; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 1676; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1677; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 1678; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 1679; GFX9-PAL-NEXT: s_addk_i32 s1, 0x4004 1680; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 1681; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1682; GFX9-PAL-NEXT: s_addk_i32 s0, 0x4004 1683; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc 1684; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1685; GFX9-PAL-NEXT: s_endpgm 1686; 1687; GFX1010-PAL-LABEL: store_load_sindex_large_offset_kernel: 1688; GFX1010-PAL: ; %bb.0: ; %bb 1689; GFX1010-PAL-NEXT: s_getpc_b64 s[4:5] 1690; GFX1010-PAL-NEXT: s_mov_b32 s4, s0 1691; GFX1010-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 1692; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 1693; GFX1010-PAL-NEXT: s_and_b32 s5, s5, 0xffff 1694; GFX1010-PAL-NEXT: s_add_u32 s4, s4, s3 1695; GFX1010-PAL-NEXT: s_addc_u32 s5, s5, 0 1696; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 1697; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 1698; GFX1010-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 1699; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 1700; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc 1701; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1702; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 1703; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 1704; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15 1705; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2 1706; 
GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2 1707; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x4004 1708; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x4004 1709; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0 1710; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1711; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 1712; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1713; GFX1010-PAL-NEXT: s_endpgm 1714; 1715; GFX1030-PAL-LABEL: store_load_sindex_large_offset_kernel: 1716; GFX1030-PAL: ; %bb.0: ; %bb 1717; GFX1030-PAL-NEXT: s_getpc_b64 s[4:5] 1718; GFX1030-PAL-NEXT: s_mov_b32 s4, s0 1719; GFX1030-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 1720; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 1721; GFX1030-PAL-NEXT: s_and_b32 s5, s5, 0xffff 1722; GFX1030-PAL-NEXT: s_add_u32 s4, s4, s3 1723; GFX1030-PAL-NEXT: s_addc_u32 s5, s5, 0 1724; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 1725; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 1726; GFX1030-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 1727; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 1728; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1729; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 1730; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 1731; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 1732; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 1733; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2 1734; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x4004 1735; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x4004 1736; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0 1737; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1738; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 1739; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1740; GFX1030-PAL-NEXT: s_endpgm 1741bb: 1742 %padding = alloca [4096 x i32], align 4, addrspace(5) 1743 %i = alloca [32 x float], align 4, addrspace(5) 1744 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 1745 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1746 %i1 = bitcast [32 x 
float] addrspace(5)* %i to i8 addrspace(5)* 1747 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 1748 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 1749 store volatile i32 15, i32 addrspace(5)* %i8, align 4 1750 %i9 = and i32 %idx, 15 1751 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 1752 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 1753 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 1754 ret void 1755} 1756 1757define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) { 1758; GFX9-LABEL: store_load_sindex_large_offset_foo: 1759; GFX9: ; %bb.0: ; %bb 1760; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 1761; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 1762; GFX9-NEXT: s_mov_b32 vcc_hi, 0 1763; GFX9-NEXT: s_lshl_b32 s0, s2, 2 1764; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 1765; GFX9-NEXT: s_waitcnt vmcnt(0) 1766; GFX9-NEXT: s_addk_i32 s0, 0x4004 1767; GFX9-NEXT: v_mov_b32_e32 v0, 15 1768; GFX9-NEXT: scratch_store_dword off, v0, s0 1769; GFX9-NEXT: s_waitcnt vmcnt(0) 1770; GFX9-NEXT: s_and_b32 s0, s2, 15 1771; GFX9-NEXT: s_lshl_b32 s0, s0, 2 1772; GFX9-NEXT: s_addk_i32 s0, 0x4004 1773; GFX9-NEXT: scratch_load_dword v0, off, s0 glc 1774; GFX9-NEXT: s_waitcnt vmcnt(0) 1775; GFX9-NEXT: s_endpgm 1776; 1777; GFX10-LABEL: store_load_sindex_large_offset_foo: 1778; GFX10: ; %bb.0: ; %bb 1779; GFX10-NEXT: s_add_u32 s0, s0, s3 1780; GFX10-NEXT: s_addc_u32 s1, s1, 0 1781; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 1782; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 1783; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 1784; GFX10-NEXT: s_waitcnt vmcnt(0) 1785; GFX10-NEXT: s_and_b32 s0, s2, 15 1786; GFX10-NEXT: v_mov_b32_e32 v0, 15 1787; GFX10-NEXT: s_lshl_b32 s1, s2, 2 1788; GFX10-NEXT: s_lshl_b32 s0, s0, 2 1789; GFX10-NEXT: s_addk_i32 s1, 0x4004 1790; GFX10-NEXT: s_addk_i32 s0, 0x4004 
1791; GFX10-NEXT: scratch_store_dword off, v0, s1 1792; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1793; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc 1794; GFX10-NEXT: s_waitcnt vmcnt(0) 1795; GFX10-NEXT: s_endpgm 1796; 1797; GFX9-PAL-LABEL: store_load_sindex_large_offset_foo: 1798; GFX9-PAL: ; %bb.0: ; %bb 1799; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 1800; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1801; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1802; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1803; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1804; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1805; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 1806; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1807; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 1808; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 1809; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 1810; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1811; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 1812; GFX9-PAL-NEXT: s_addk_i32 s1, 0x4004 1813; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 1814; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 1815; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1816; GFX9-PAL-NEXT: s_addk_i32 s0, 0x4004 1817; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc 1818; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1819; GFX9-PAL-NEXT: s_endpgm 1820; 1821; GFX1010-PAL-LABEL: store_load_sindex_large_offset_foo: 1822; GFX1010-PAL: ; %bb.0: ; %bb 1823; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 1824; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 1825; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1826; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 1827; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1828; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 1829; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 1830; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1831; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1832; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 1833; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15 1834; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc 1835; 
GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1836; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 1837; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2 1838; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2 1839; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x4004 1840; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x4004 1841; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0 1842; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1843; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 1844; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1845; GFX1010-PAL-NEXT: s_endpgm 1846; 1847; GFX1030-PAL-LABEL: store_load_sindex_large_offset_foo: 1848; GFX1030-PAL: ; %bb.0: ; %bb 1849; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 1850; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 1851; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1852; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 1853; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1854; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 1855; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 1856; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1857; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1858; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 1859; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1860; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 1861; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 1862; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 1863; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2 1864; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x4004 1865; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x4004 1866; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0 1867; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1868; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 1869; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1870; GFX1030-PAL-NEXT: s_endpgm 1871bb: 1872 %padding = alloca [4096 x i32], align 4, addrspace(5) 1873 %i = alloca [32 x float], align 4, addrspace(5) 1874 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 1875 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, 
align 4 1876 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 1877 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 1878 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 1879 store volatile i32 15, i32 addrspace(5)* %i8, align 4 1880 %i9 = and i32 %idx, 15 1881 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 1882 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 1883 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 1884 ret void 1885} 1886 1887define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { 1888; GFX9-LABEL: store_load_vindex_large_offset_kernel: 1889; GFX9: ; %bb.0: ; %bb 1890; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 1891; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 1892; GFX9-NEXT: s_mov_b32 vcc_hi, 0 1893; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc 1894; GFX9-NEXT: s_waitcnt vmcnt(0) 1895; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1896; GFX9-NEXT: v_mov_b32_e32 v1, 0x4004 1897; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 1898; GFX9-NEXT: v_mov_b32_e32 v3, 15 1899; GFX9-NEXT: scratch_store_dword v2, v3, off 1900; GFX9-NEXT: s_waitcnt vmcnt(0) 1901; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 1902; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc 1903; GFX9-NEXT: s_waitcnt vmcnt(0) 1904; GFX9-NEXT: s_endpgm 1905; 1906; GFX10-LABEL: store_load_vindex_large_offset_kernel: 1907; GFX10: ; %bb.0: ; %bb 1908; GFX10-NEXT: s_add_u32 s0, s0, s3 1909; GFX10-NEXT: s_addc_u32 s1, s1, 0 1910; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 1911; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 1912; GFX10-NEXT: v_mov_b32_e32 v1, 0x4004 1913; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1914; GFX10-NEXT: v_mov_b32_e32 v3, 15 1915; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0 1916; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0 1917; GFX10-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc 1918; GFX10-NEXT: s_waitcnt 
vmcnt(0) 1919; GFX10-NEXT: scratch_store_dword v2, v3, off 1920; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1921; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 1922; GFX10-NEXT: s_waitcnt vmcnt(0) 1923; GFX10-NEXT: s_endpgm 1924; 1925; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel: 1926; GFX9-PAL: ; %bb.0: ; %bb 1927; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 1928; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1929; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1930; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1931; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1932; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 1933; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1934; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1935; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 1936; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1937; GFX9-PAL-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc 1938; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1939; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 1940; GFX9-PAL-NEXT: v_add_u32_e32 v2, v1, v0 1941; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off 1942; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1943; GFX9-PAL-NEXT: v_sub_u32_e32 v0, v1, v0 1944; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc 1945; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1946; GFX9-PAL-NEXT: s_endpgm 1947; 1948; GFX1010-PAL-LABEL: store_load_vindex_large_offset_kernel: 1949; GFX1010-PAL: ; %bb.0: ; %bb 1950; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 1951; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 1952; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1953; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 1954; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1955; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 1956; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 1957; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1958; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1959; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 1960; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1961; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, 15 1962; 
GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 1963; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 1964; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 1965; GFX1010-PAL-NEXT: scratch_load_dword v1, off, vcc_lo offset:4 glc dlc 1966; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1967; GFX1010-PAL-NEXT: scratch_store_dword v2, v3, off 1968; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1969; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 1970; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1971; GFX1010-PAL-NEXT: s_endpgm 1972; 1973; GFX1030-PAL-LABEL: store_load_vindex_large_offset_kernel: 1974; GFX1030-PAL: ; %bb.0: ; %bb 1975; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 1976; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 1977; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1978; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 1979; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1980; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 1981; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 1982; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1983; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1984; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 1985; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1986; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, 15 1987; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 1988; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 1989; GFX1030-PAL-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc 1990; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1991; GFX1030-PAL-NEXT: scratch_store_dword v2, v3, off 1992; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1993; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 1994; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1995; GFX1030-PAL-NEXT: s_endpgm 1996bb: 1997 %padding = alloca [4096 x i32], align 4, addrspace(5) 1998 %i = alloca [32 x float], align 4, addrspace(5) 1999 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 2000 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, 
align 4 2001 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 2002 %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() 2003 %i3 = zext i32 %i2 to i64 2004 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2 2005 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 2006 store volatile i32 15, i32 addrspace(5)* %i8, align 4 2007 %i9 = sub nsw i32 31, %i2 2008 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 2009 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 2010 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 2011 ret void 2012} 2013 2014define void @store_load_vindex_large_offset_foo(i32 %idx) { 2015; GFX9-LABEL: store_load_vindex_large_offset_foo: 2016; GFX9: ; %bb.0: ; %bb 2017; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2018; GFX9-NEXT: scratch_load_dword v1, off, s32 glc 2019; GFX9-NEXT: s_waitcnt vmcnt(0) 2020; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4000 2021; GFX9-NEXT: v_mov_b32_e32 v1, vcc_hi 2022; GFX9-NEXT: v_mov_b32_e32 v3, 15 2023; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 2024; GFX9-NEXT: v_and_b32_e32 v0, v0, v3 2025; GFX9-NEXT: scratch_store_dword v2, v3, off 2026; GFX9-NEXT: s_waitcnt vmcnt(0) 2027; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 2028; GFX9-NEXT: scratch_load_dword v0, v0, off glc 2029; GFX9-NEXT: s_waitcnt vmcnt(0) 2030; GFX9-NEXT: s_setpc_b64 s[30:31] 2031; 2032; GFX10-LABEL: store_load_vindex_large_offset_foo: 2033; GFX10: ; %bb.0: ; %bb 2034; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2035; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2036; GFX10-NEXT: v_mov_b32_e32 v1, 15 2037; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4000 2038; GFX10-NEXT: v_mov_b32_e32 v2, vcc_lo 2039; GFX10-NEXT: v_and_b32_e32 v3, v0, v1 2040; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v2 2041; GFX10-NEXT: v_lshl_add_u32 v2, v3, 2, v2 2042; GFX10-NEXT: scratch_load_dword v3, off, s32 glc dlc 2043; GFX10-NEXT: s_waitcnt vmcnt(0) 2044; GFX10-NEXT: 
scratch_store_dword v0, v1, off 2045; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2046; GFX10-NEXT: scratch_load_dword v0, v2, off glc dlc 2047; GFX10-NEXT: s_waitcnt vmcnt(0) 2048; GFX10-NEXT: s_setpc_b64 s[30:31] 2049; 2050; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo: 2051; GFX9-PAL: ; %bb.0: ; %bb 2052; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2053; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 glc 2054; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2055; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4000 2056; GFX9-PAL-NEXT: v_mov_b32_e32 v1, vcc_hi 2057; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 2058; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 2059; GFX9-PAL-NEXT: v_and_b32_e32 v0, v0, v3 2060; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off 2061; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2062; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 2063; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc 2064; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2065; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 2066; 2067; GFX10-PAL-LABEL: store_load_vindex_large_offset_foo: 2068; GFX10-PAL: ; %bb.0: ; %bb 2069; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2070; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2071; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 2072; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4000 2073; GFX10-PAL-NEXT: v_mov_b32_e32 v2, vcc_lo 2074; GFX10-PAL-NEXT: v_and_b32_e32 v3, v0, v1 2075; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v2 2076; GFX10-PAL-NEXT: v_lshl_add_u32 v2, v3, 2, v2 2077; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 glc dlc 2078; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 2079; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off 2080; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2081; GFX10-PAL-NEXT: scratch_load_dword v0, v2, off glc dlc 2082; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 2083; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 2084bb: 2085 %padding = alloca [4096 x i32], align 4, addrspace(5) 2086 %i = alloca [32 x float], align 4, addrspace(5) 2087 %pad_gep = getelementptr inbounds [4096 x i32], 
[4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 2088 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 2089 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 2090 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 2091 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 2092 store volatile i32 15, i32 addrspace(5)* %i8, align 4 2093 %i9 = and i32 %idx, 15 2094 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 2095 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 2096 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 2097 ret void 2098} 2099 2100define amdgpu_kernel void @store_load_large_imm_offset_kernel() { 2101; GFX9-LABEL: store_load_large_imm_offset_kernel: 2102; GFX9: ; %bb.0: ; %bb 2103; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 2104; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 2105; GFX9-NEXT: s_movk_i32 s0, 0x3000 2106; GFX9-NEXT: v_mov_b32_e32 v0, 13 2107; GFX9-NEXT: s_mov_b32 vcc_hi, 0 2108; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:4 2109; GFX9-NEXT: s_waitcnt vmcnt(0) 2110; GFX9-NEXT: s_add_i32 s0, s0, 4 2111; GFX9-NEXT: v_mov_b32_e32 v0, 15 2112; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712 2113; GFX9-NEXT: s_waitcnt vmcnt(0) 2114; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc 2115; GFX9-NEXT: s_waitcnt vmcnt(0) 2116; GFX9-NEXT: s_endpgm 2117; 2118; GFX10-LABEL: store_load_large_imm_offset_kernel: 2119; GFX10: ; %bb.0: ; %bb 2120; GFX10-NEXT: s_add_u32 s0, s0, s3 2121; GFX10-NEXT: s_addc_u32 s1, s1, 0 2122; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 2123; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 2124; GFX10-NEXT: v_mov_b32_e32 v0, 13 2125; GFX10-NEXT: v_mov_b32_e32 v1, 15 2126; GFX10-NEXT: s_movk_i32 s0, 0x3800 2127; GFX10-NEXT: s_add_i32 s0, s0, 4 2128; GFX10-NEXT: scratch_store_dword off, v0, off offset:4 2129; GFX10-NEXT: s_waitcnt_vscnt null, 
0x0 2130; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664 2131; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2132; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc 2133; GFX10-NEXT: s_waitcnt vmcnt(0) 2134; GFX10-NEXT: s_endpgm 2135; 2136; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel: 2137; GFX9-PAL: ; %bb.0: ; %bb 2138; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 2139; GFX9-PAL-NEXT: s_mov_b32 s2, s0 2140; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2141; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13 2142; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 2143; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000 2144; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 2145; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2146; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 2147; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 2148; GFX9-PAL-NEXT: scratch_store_dword off, v0, vcc_hi offset:4 2149; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2150; GFX9-PAL-NEXT: s_add_i32 s0, s0, 4 2151; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 2152; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712 2153; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2154; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc 2155; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2156; GFX9-PAL-NEXT: s_endpgm 2157; 2158; GFX1010-PAL-LABEL: store_load_large_imm_offset_kernel: 2159; GFX1010-PAL: ; %bb.0: ; %bb 2160; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 2161; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 2162; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2163; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 2164; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2165; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 2166; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 2167; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 2168; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 2169; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 13 2170; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 15 2171; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x3800 2172; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 2173; GFX1010-PAL-NEXT: 
s_add_i32 s0, s0, 4 2174; GFX1010-PAL-NEXT: scratch_store_dword off, v0, vcc_lo offset:4 2175; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2176; GFX1010-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 2177; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2178; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc 2179; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2180; GFX1010-PAL-NEXT: s_endpgm 2181; 2182; GFX1030-PAL-LABEL: store_load_large_imm_offset_kernel: 2183; GFX1030-PAL: ; %bb.0: ; %bb 2184; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 2185; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 2186; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2187; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 2188; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2189; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 2190; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 2191; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 2192; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 2193; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 13 2194; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 15 2195; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x3800 2196; GFX1030-PAL-NEXT: s_add_i32 s0, s0, 4 2197; GFX1030-PAL-NEXT: scratch_store_dword off, v0, off offset:4 2198; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2199; GFX1030-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 2200; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2201; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc 2202; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2203; GFX1030-PAL-NEXT: s_endpgm 2204bb: 2205 %i = alloca [4096 x i32], align 4, addrspace(5) 2206 %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef 2207 store volatile i32 13, i32 addrspace(5)* %i1, align 4 2208 %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 2209 store volatile i32 15, i32 addrspace(5)* %i7, align 4 2210 %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 
4000 2211 %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4 2212 ret void 2213} 2214

; Callable (non-kernel) function with a 4096 x i32 private array.
; It volatile-stores 13 through an undef index, then volatile-stores 15 to
; element 4000 and volatile-loads that element back.
; Element 4000 is 16000 bytes into the array.  The expected code splits that
; into an SGPR constant plus an immediate offset: 0x3000 + 3712 for the gfx900
; runs and 0x3800 + 1664 for the gfx10 runs, both added to s32.
define void @store_load_large_imm_offset_foo() {
; GFX9-LABEL: store_load_large_imm_offset_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_movk_i32 s0, 0x3000
; GFX9-NEXT:    v_mov_b32_e32 v0, 13
; GFX9-NEXT:    scratch_store_dword off, v0, s32
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_add_i32 s0, s0, s32
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    scratch_store_dword off, v0, s0 offset:3712
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_large_imm_offset_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v0, 13
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    s_movk_i32 s0, 0x3800
; GFX10-NEXT:    s_add_i32 s0, s0, s32
; GFX10-NEXT:    scratch_store_dword off, v0, s32
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_store_dword off, v1, s0 offset:1664
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_large_imm_offset_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    s_movk_i32 s0, 0x3000
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 13
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s32
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_add_i32 s0, s0, s32
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s0 offset:3712
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_large_imm_offset_foo:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 13
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT:    s_movk_i32 s0, 0x3800
; GFX10-PAL-NEXT:    s_add_i32 s0, s0, s32
; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s32
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  %buf = alloca [4096 x i32], align 4, addrspace(5)
  %slot.undef = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %buf, i32 0, i32 undef
  store volatile i32 13, i32 addrspace(5)* %slot.undef, align 4
  %slot.st = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %buf, i32 0, i32 4000
  store volatile i32 15, i32 addrspace(5)* %slot.st, align 4
  %slot.ld = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %buf, i32 0, i32 4000
  %val = load volatile i32, i32 addrspace(5)* %slot.ld, align 4
  ret void
}

; Kernel that indexes a 32 x i32 private array with
; (%sidx + workitem.id.x + 256), volatile-stores 15 there and loads it back.
; In every expected sequence the constant +256 elements appears as the
; immediate offset:1024 on the scratch instructions, while the variable part
; is computed into v0 with an add and a shift-left-by-2-and-add of 4.
define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX9-LABEL: store_load_vidx_sidx_offset:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT:    v_mov_b32_e32 v1, 4
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:1024
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_vidx_sidx_offset:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s2, s2, s5
; GFX10-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_add_nc_u32_e32 v0, s0, v0
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:1024
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_vidx_sidx_offset:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 4
; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
; GFX9-PAL-NEXT:    v_add_u32_e32 v0, s0, v0
; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off offset:1024
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX10-PAL-LABEL: store_load_vidx_sidx_offset:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX10-PAL-NEXT:    s_mov_b32 s4, s0
; GFX10-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX10-PAL-NEXT:    s_add_u32 s4, s4, s3
; GFX10-PAL-NEXT:    s_addc_u32 s5, s5, 0
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:1024
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_endpgm
bb:
  %buf = alloca [32 x i32], align 4, addrspace(5)
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %idx.var = add nsw i32 %sidx, %tid
  %idx = add nsw i32 %idx.var, 256
  %slot = getelementptr inbounds [32 x i32], [32 x i32] addrspace(5)* %buf, i32 0, i32 %idx
  store volatile i32 15, i32 addrspace(5)* %slot, align 4
  %val = load volatile i32, i32 addrspace(5)* %slot, align 4
  ret void
}

; Volatile i64 store of 15 and load back through a private pointer argument,
; both with align 8.  Each run expects one dwordx2 store and one dwordx2 load.
define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i64_aligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i64_aligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_i64_aligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_i64_aligned:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  store volatile i64 15, i64 addrspace(5)* %arg, align 8
  %val = load volatile i64, i64 addrspace(5)* %arg, align 8
  ret void
}

; Same access as the aligned i64 case, but the IR only promises align 1.
; The expected code is unchanged: still a single dwordx2 store and load.
define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i64_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i64_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_i64_unaligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_i64_unaligned:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  store volatile i64 15, i64 addrspace(5)* %arg, align 1
  %val = load volatile i64, i64 addrspace(5)* %arg, align 1
  ret void
}

; Volatile <3 x i32> store of <1, 2, 3> and load back with align 1.
; Each run expects a single dwordx3 store and a single dwordx3 load.
define void @store_load_v3i32_unaligned(<3 x i32> addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_v3i32_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-NEXT:    v_mov_b32_e32 v2, 2
; GFX9-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_v3i32_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-NEXT:    v_mov_b32_e32 v3, 3
; GFX10-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_v3i32_unaligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-PAL-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_v3i32_unaligned:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX10-PAL-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  store volatile <3 x i32> <i32 1, i32 2, i32 3>, <3 x i32> addrspace(5)* %arg, align 1
  %val = load volatile <3 x i32>, <3 x i32> addrspace(5)* %arg, align 1
  ret void
}

; Volatile <4 x i32> store of <1, 2, 3, 4> and load back with align 1.
; Each run expects a single dwordx4 store and a single dwordx4 load.
define void @store_load_v4i32_unaligned(<4 x i32> addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_v4i32_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-NEXT:    v_mov_b32_e32 v2, 2
; GFX9-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-NEXT:    v_mov_b32_e32 v4, 4
; GFX9-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_v4i32_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-NEXT:    v_mov_b32_e32 v3, 3
; GFX10-NEXT:    v_mov_b32_e32 v4, 4
; GFX10-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_v4i32_unaligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-PAL-NEXT:    v_mov_b32_e32 v4, 4
; GFX9-PAL-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_v4i32_unaligned:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX10-PAL-NEXT:    v_mov_b32_e32 v4, 4
; GFX10-PAL-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %arg, align 1
  %val = load volatile <4 x i32>, <4 x i32> addrspace(5)* %arg, align 1
  ret void
}

; Volatile i8 store of 1 and load back at %arg - 1 (despite the "i32" in the
; function name, the access is a single byte).
; The gfx900 and gfx1010 runs expect the -1 to be added into the address VGPR
; with no immediate offset on the scratch instructions; the gfx1030 runs
; expect it as the immediate offset:-1 instead.
define void @store_load_i32_negative_unaligned(i8 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i32_negative_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_u32_e32 v0, -1, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-NEXT:    scratch_store_byte v0, v1, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_ubyte v0, v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i32_negative_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:-1
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:-1 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_i32_negative_unaligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_add_u32_e32 v0, -1, v0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-PAL-NEXT:    scratch_store_byte v0, v1, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1010-PAL-LABEL: store_load_i32_negative_unaligned:
; GFX1010-PAL:       ; %bb.0: ; %bb
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v0, -1, v0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX1010-PAL-NEXT:    scratch_store_byte v0, v1, off
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1030-PAL-LABEL: store_load_i32_negative_unaligned:
; GFX1030-PAL:       ; %bb.0: ; %bb
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX1030-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-1
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-1 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  %byte.addr = getelementptr inbounds i8, i8 addrspace(5)* %arg, i32 -1
  store volatile i8 1, i8 addrspace(5)* %byte.addr, align 1
  %val = load volatile i8, i8 addrspace(5)* %byte.addr, align 1
  ret void
}

; Volatile i8 store of 1 and load back at %arg - 4225 (again a byte access
; despite the "i32" in the name).  The expected split of -4225 differs by run:
;   gfx900:  whole value added to the VGPR (0xffffef7f), no immediate offset
;   gfx1030: VGPR add of 0xfffff000 (-4096) plus immediate offset -129
;   gfx1010: VGPR add of 0xffffefff (-4097) plus immediate offset -128
define void @store_load_i32_large_negative_unaligned(i8 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i32_large_negative_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_u32_e32 v0, 0xffffef7f, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-NEXT:    scratch_store_byte v0, v1, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_ubyte v0, v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i32_large_negative_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0xfffff000, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:-129
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:-129 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_i32_large_negative_unaligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_add_u32_e32 v0, 0xffffef7f, v0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-PAL-NEXT:    scratch_store_byte v0, v1, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1010-PAL-LABEL: store_load_i32_large_negative_unaligned:
; GFX1010-PAL:       ; %bb.0: ; %bb
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v0, 0xffffefff, v0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX1010-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-128
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-128 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1030-PAL-LABEL: store_load_i32_large_negative_unaligned:
; GFX1030-PAL:       ; %bb.0: ; %bb
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v0, 0xfffff000, v0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX1030-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-129
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-129 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  %byte.addr = getelementptr inbounds i8, i8 addrspace(5)* %arg, i32 -4225
  store volatile i8 1, i8 addrspace(5)* %byte.addr, align 1
  %val = load volatile i8, i8 addrspace(5)* %byte.addr, align 1
  ret void
}

; Intrinsic declarations referenced by the tests in this file.
declare void @llvm.memset.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8, i64, i1 immarg)
declare i32 @llvm.amdgcn.workitem.id.x()