1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s 3; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10 %s 4 5; Test using saddr addressing mode of global_*load_* flat instructions. 6 7; -------------------------------------------------------------------------------- 8; No vgpr offset, constants 9; -------------------------------------------------------------------------------- 10 11; SGPR base only 12define amdgpu_ps float @global_load_saddr_i8_offset_0(i8 addrspace(1)* inreg %sbase) { 13; GCN-LABEL: global_load_saddr_i8_offset_0: 14; GCN: ; %bb.0: 15; GCN-NEXT: v_mov_b32_e32 v0, 0 16; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] 17; GCN-NEXT: s_waitcnt vmcnt(0) 18; GCN-NEXT: ; return to shader part epilog 19 %load = load i8, i8 addrspace(1)* %sbase 20 %zext = zext i8 %load to i32 21 %to.vgpr = bitcast i32 %zext to float 22 ret float %to.vgpr 23} 24 25; SGPR base with maximum gfx9 immediate offset 26define amdgpu_ps float @global_load_saddr_i8_offset_4095(i8 addrspace(1)* inreg %sbase) { 27; GFX9-LABEL: global_load_saddr_i8_offset_4095: 28; GFX9: ; %bb.0: 29; GFX9-NEXT: v_mov_b32_e32 v0, 0 30; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095 31; GFX9-NEXT: s_waitcnt vmcnt(0) 32; GFX9-NEXT: ; return to shader part epilog 33; 34; GFX10-LABEL: global_load_saddr_i8_offset_4095: 35; GFX10: ; %bb.0: 36; GFX10-NEXT: v_mov_b32_e32 v0, 0x800 37; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 38; GFX10-NEXT: s_waitcnt vmcnt(0) 39; GFX10-NEXT: ; return to shader part epilog 40 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4095 41 %load = load i8, i8 addrspace(1)* %gep0 42 %zext = zext i8 %load to i32 43 %to.vgpr = bitcast i32 %zext to float 44 ret float %to.vgpr 45} 46 47; SGPR base with maximum gfx9 immediate offset + 1 48define amdgpu_ps float 
@global_load_saddr_i8_offset_4096(i8 addrspace(1)* inreg %sbase) { 49; GCN-LABEL: global_load_saddr_i8_offset_4096: 50; GCN: ; %bb.0: 51; GCN-NEXT: v_mov_b32_e32 v0, 0x1000 52; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] 53; GCN-NEXT: s_waitcnt vmcnt(0) 54; GCN-NEXT: ; return to shader part epilog 55 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4096 56 %load = load i8, i8 addrspace(1)* %gep0 57 %zext = zext i8 %load to i32 58 %to.vgpr = bitcast i32 %zext to float 59 ret float %to.vgpr 60} 61 62; SGPR base with maximum gfx9 immediate offset + 2 63define amdgpu_ps float @global_load_saddr_i8_offset_4097(i8 addrspace(1)* inreg %sbase) { 64; GCN-LABEL: global_load_saddr_i8_offset_4097: 65; GCN: ; %bb.0: 66; GCN-NEXT: v_mov_b32_e32 v0, 0x1000 67; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1 68; GCN-NEXT: s_waitcnt vmcnt(0) 69; GCN-NEXT: ; return to shader part epilog 70 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4097 71 %load = load i8, i8 addrspace(1)* %gep0 72 %zext = zext i8 %load to i32 73 %to.vgpr = bitcast i32 %zext to float 74 ret float %to.vgpr 75} 76 77; SGPR base with maximum negative gfx9 immediate offset 78define amdgpu_ps float @global_load_saddr_i8_offset_neg4096(i8 addrspace(1)* inreg %sbase) { 79; GFX9-LABEL: global_load_saddr_i8_offset_neg4096: 80; GFX9: ; %bb.0: 81; GFX9-NEXT: v_mov_b32_e32 v0, 0 82; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-4096 83; GFX9-NEXT: s_waitcnt vmcnt(0) 84; GFX9-NEXT: ; return to shader part epilog 85; 86; GFX10-LABEL: global_load_saddr_i8_offset_neg4096: 87; GFX10: ; %bb.0: 88; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff000, s2 89; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] 90; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 91; GFX10-NEXT: s_waitcnt vmcnt(0) 92; GFX10-NEXT: ; return to shader part epilog 93 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4096 94 %load = load i8, i8 addrspace(1)* %gep0 95 %zext = zext i8 %load to i32 96 
%to.vgpr = bitcast i32 %zext to float 97 ret float %to.vgpr 98} 99 100; SGPR base with maximum negative gfx9 immediate offset -1 101define amdgpu_ps float @global_load_saddr_i8_offset_neg4097(i8 addrspace(1)* inreg %sbase) { 102; GFX9-LABEL: global_load_saddr_i8_offset_neg4097: 103; GFX9: ; %bb.0: 104; GFX9-NEXT: s_add_u32 s0, s2, 0xffffefff 105; GFX9-NEXT: s_addc_u32 s1, s3, -1 106; GFX9-NEXT: v_mov_b32_e32 v0, 0 107; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] 108; GFX9-NEXT: s_waitcnt vmcnt(0) 109; GFX9-NEXT: ; return to shader part epilog 110; 111; GFX10-LABEL: global_load_saddr_i8_offset_neg4097: 112; GFX10: ; %bb.0: 113; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff000, s2 114; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] 115; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 116; GFX10-NEXT: s_waitcnt vmcnt(0) 117; GFX10-NEXT: ; return to shader part epilog 118 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4097 119 %load = load i8, i8 addrspace(1)* %gep0 120 %zext = zext i8 %load to i32 121 %to.vgpr = bitcast i32 %zext to float 122 ret float %to.vgpr 123} 124 125; SGPR base with maximum negative gfx9 immediate offset -2 126define amdgpu_ps float @global_load_saddr_i8_offset_neg4098(i8 addrspace(1)* inreg %sbase) { 127; GFX9-LABEL: global_load_saddr_i8_offset_neg4098: 128; GFX9: ; %bb.0: 129; GFX9-NEXT: s_add_u32 s0, s2, 0xffffeffe 130; GFX9-NEXT: s_addc_u32 s1, s3, -1 131; GFX9-NEXT: v_mov_b32_e32 v0, 0 132; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] 133; GFX9-NEXT: s_waitcnt vmcnt(0) 134; GFX9-NEXT: ; return to shader part epilog 135; 136; GFX10-LABEL: global_load_saddr_i8_offset_neg4098: 137; GFX10: ; %bb.0: 138; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff000, s2 139; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] 140; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2 141; GFX10-NEXT: s_waitcnt vmcnt(0) 142; GFX10-NEXT: ; return to shader part epilog 143 %gep0 = getelementptr inbounds i8, i8 
addrspace(1)* %sbase, i64 -4098 144 %load = load i8, i8 addrspace(1)* %gep0 145 %zext = zext i8 %load to i32 146 %to.vgpr = bitcast i32 %zext to float 147 ret float %to.vgpr 148} 149 150; SGPR base with maximum gfx10 immediate offset + 1 151define amdgpu_ps float @global_load_saddr_i8_offset_2048(i8 addrspace(1)* inreg %sbase) { 152; GFX9-LABEL: global_load_saddr_i8_offset_2048: 153; GFX9: ; %bb.0: 154; GFX9-NEXT: v_mov_b32_e32 v0, 0 155; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2048 156; GFX9-NEXT: s_waitcnt vmcnt(0) 157; GFX9-NEXT: ; return to shader part epilog 158; 159; GFX10-LABEL: global_load_saddr_i8_offset_2048: 160; GFX10: ; %bb.0: 161; GFX10-NEXT: v_mov_b32_e32 v0, 0x800 162; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] 163; GFX10-NEXT: s_waitcnt vmcnt(0) 164; GFX10-NEXT: ; return to shader part epilog 165 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 2048 166 %load = load i8, i8 addrspace(1)* %gep0 167 %zext = zext i8 %load to i32 168 %to.vgpr = bitcast i32 %zext to float 169 ret float %to.vgpr 170} 171 172; SGPR base with maximum gfx10 immediate offset + 2 173define amdgpu_ps float @global_load_saddr_i8_offset_2049(i8 addrspace(1)* inreg %sbase) { 174; GFX9-LABEL: global_load_saddr_i8_offset_2049: 175; GFX9: ; %bb.0: 176; GFX9-NEXT: v_mov_b32_e32 v0, 0 177; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2049 178; GFX9-NEXT: s_waitcnt vmcnt(0) 179; GFX9-NEXT: ; return to shader part epilog 180; 181; GFX10-LABEL: global_load_saddr_i8_offset_2049: 182; GFX10: ; %bb.0: 183; GFX10-NEXT: v_mov_b32_e32 v0, 0x800 184; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1 185; GFX10-NEXT: s_waitcnt vmcnt(0) 186; GFX10-NEXT: ; return to shader part epilog 187 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 2049 188 %load = load i8, i8 addrspace(1)* %gep0 189 %zext = zext i8 %load to i32 190 %to.vgpr = bitcast i32 %zext to float 191 ret float %to.vgpr 192} 193 194; SGPR base with maximum gfx10 immediate offset + 3 
195define amdgpu_ps float @global_load_saddr_i8_offset_2050(i8 addrspace(1)* inreg %sbase) { 196; GFX9-LABEL: global_load_saddr_i8_offset_2050: 197; GFX9: ; %bb.0: 198; GFX9-NEXT: v_mov_b32_e32 v0, 0 199; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2050 200; GFX9-NEXT: s_waitcnt vmcnt(0) 201; GFX9-NEXT: ; return to shader part epilog 202; 203; GFX10-LABEL: global_load_saddr_i8_offset_2050: 204; GFX10: ; %bb.0: 205; GFX10-NEXT: v_mov_b32_e32 v0, 0x800 206; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2 207; GFX10-NEXT: s_waitcnt vmcnt(0) 208; GFX10-NEXT: ; return to shader part epilog 209 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 2050 210 %load = load i8, i8 addrspace(1)* %gep0 211 %zext = zext i8 %load to i32 212 %to.vgpr = bitcast i32 %zext to float 213 ret float %to.vgpr 214} 215 216; SGPR base with maximum negative gfx10 immediate offset 217define amdgpu_ps float @global_load_saddr_i8_offset_neg2048(i8 addrspace(1)* inreg %sbase) { 218; GCN-LABEL: global_load_saddr_i8_offset_neg2048: 219; GCN: ; %bb.0: 220; GCN-NEXT: v_mov_b32_e32 v0, 0 221; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2048 222; GCN-NEXT: s_waitcnt vmcnt(0) 223; GCN-NEXT: ; return to shader part epilog 224 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -2048 225 %load = load i8, i8 addrspace(1)* %gep0 226 %zext = zext i8 %load to i32 227 %to.vgpr = bitcast i32 %zext to float 228 ret float %to.vgpr 229} 230 231; SGPR base with maximum negative gfx10 immediate offset - 1 232define amdgpu_ps float @global_load_saddr_i8_offset_neg2049(i8 addrspace(1)* inreg %sbase) { 233; GFX9-LABEL: global_load_saddr_i8_offset_neg2049: 234; GFX9: ; %bb.0: 235; GFX9-NEXT: v_mov_b32_e32 v0, 0 236; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2049 237; GFX9-NEXT: s_waitcnt vmcnt(0) 238; GFX9-NEXT: ; return to shader part epilog 239; 240; GFX10-LABEL: global_load_saddr_i8_offset_neg2049: 241; GFX10: ; %bb.0: 242; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 
0xfffff800, s2 243; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] 244; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 245; GFX10-NEXT: s_waitcnt vmcnt(0) 246; GFX10-NEXT: ; return to shader part epilog 247 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -2049 248 %load = load i8, i8 addrspace(1)* %gep0 249 %zext = zext i8 %load to i32 250 %to.vgpr = bitcast i32 %zext to float 251 ret float %to.vgpr 252} 253 254; SGPR base with maximum negative gfx10 immediate offset - 2 255define amdgpu_ps float @global_load_saddr_i8_offset_neg2050(i8 addrspace(1)* inreg %sbase) { 256; GFX9-LABEL: global_load_saddr_i8_offset_neg2050: 257; GFX9: ; %bb.0: 258; GFX9-NEXT: v_mov_b32_e32 v0, 0 259; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2050 260; GFX9-NEXT: s_waitcnt vmcnt(0) 261; GFX9-NEXT: ; return to shader part epilog 262; 263; GFX10-LABEL: global_load_saddr_i8_offset_neg2050: 264; GFX10: ; %bb.0: 265; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff800, s2 266; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] 267; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2 268; GFX10-NEXT: s_waitcnt vmcnt(0) 269; GFX10-NEXT: ; return to shader part epilog 270 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -2050 271 %load = load i8, i8 addrspace(1)* %gep0 272 %zext = zext i8 %load to i32 273 %to.vgpr = bitcast i32 %zext to float 274 ret float %to.vgpr 275} 276 277define amdgpu_ps float @global_load_saddr_i8_offset_4294967295(i8 addrspace(1)* inreg %sbase) { 278; GFX9-LABEL: global_load_saddr_i8_offset_4294967295: 279; GFX9: ; %bb.0: 280; GFX9-NEXT: v_mov_b32_e32 v0, 0xfffff000 281; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095 282; GFX9-NEXT: s_waitcnt vmcnt(0) 283; GFX9-NEXT: ; return to shader part epilog 284; 285; GFX10-LABEL: global_load_saddr_i8_offset_4294967295: 286; GFX10: ; %bb.0: 287; GFX10-NEXT: v_mov_b32_e32 v0, 0xfffff800 288; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 289; 
GFX10-NEXT: s_waitcnt vmcnt(0) 290; GFX10-NEXT: ; return to shader part epilog 291 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294967295 292 %load = load i8, i8 addrspace(1)* %gep0 293 %zext = zext i8 %load to i32 294 %to.vgpr = bitcast i32 %zext to float 295 ret float %to.vgpr 296} 297 298define amdgpu_ps float @global_load_saddr_i8_offset_4294967296(i8 addrspace(1)* inreg %sbase) { 299; GFX9-LABEL: global_load_saddr_i8_offset_4294967296: 300; GFX9: ; %bb.0: 301; GFX9-NEXT: v_mov_b32_e32 v1, s3 302; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2 303; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc 304; GFX9-NEXT: global_load_ubyte v0, v[0:1], off 305; GFX9-NEXT: s_waitcnt vmcnt(0) 306; GFX9-NEXT: ; return to shader part epilog 307; 308; GFX10-LABEL: global_load_saddr_i8_offset_4294967296: 309; GFX10: ; %bb.0: 310; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 311; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1] 312; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 313; GFX10-NEXT: s_waitcnt vmcnt(0) 314; GFX10-NEXT: ; return to shader part epilog 315 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294967296 316 %load = load i8, i8 addrspace(1)* %gep0 317 %zext = zext i8 %load to i32 318 %to.vgpr = bitcast i32 %zext to float 319 ret float %to.vgpr 320} 321 322define amdgpu_ps float @global_load_saddr_i8_offset_4294967297(i8 addrspace(1)* inreg %sbase) { 323; GFX9-LABEL: global_load_saddr_i8_offset_4294967297: 324; GFX9: ; %bb.0: 325; GFX9-NEXT: v_mov_b32_e32 v1, s3 326; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2 327; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc 328; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:1 329; GFX9-NEXT: s_waitcnt vmcnt(0) 330; GFX9-NEXT: ; return to shader part epilog 331; 332; GFX10-LABEL: global_load_saddr_i8_offset_4294967297: 333; GFX10: ; %bb.0: 334; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 335; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1] 336; GFX10-NEXT: 
global_load_ubyte v0, v[0:1], off offset:1 337; GFX10-NEXT: s_waitcnt vmcnt(0) 338; GFX10-NEXT: ; return to shader part epilog 339 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294967297 340 %load = load i8, i8 addrspace(1)* %gep0 341 %zext = zext i8 %load to i32 342 %to.vgpr = bitcast i32 %zext to float 343 ret float %to.vgpr 344} 345 346define amdgpu_ps float @global_load_saddr_i8_offset_4294971391(i8 addrspace(1)* inreg %sbase) { 347; GFX9-LABEL: global_load_saddr_i8_offset_4294971391: 348; GFX9: ; %bb.0: 349; GFX9-NEXT: s_add_u32 s0, s2, 0xfff 350; GFX9-NEXT: s_addc_u32 s1, s3, 1 351; GFX9-NEXT: v_mov_b32_e32 v0, 0 352; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] 353; GFX9-NEXT: s_waitcnt vmcnt(0) 354; GFX9-NEXT: ; return to shader part epilog 355; 356; GFX10-LABEL: global_load_saddr_i8_offset_4294971391: 357; GFX10: ; %bb.0: 358; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0x800, s2 359; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1] 360; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 361; GFX10-NEXT: s_waitcnt vmcnt(0) 362; GFX10-NEXT: ; return to shader part epilog 363 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294971391 364 %load = load i8, i8 addrspace(1)* %gep0 365 %zext = zext i8 %load to i32 366 %to.vgpr = bitcast i32 %zext to float 367 ret float %to.vgpr 368} 369 370define amdgpu_ps float @global_load_saddr_i8_offset_4294971392(i8 addrspace(1)* inreg %sbase) { 371; GFX9-LABEL: global_load_saddr_i8_offset_4294971392: 372; GFX9: ; %bb.0: 373; GFX9-NEXT: s_add_u32 s0, s2, 0x1000 374; GFX9-NEXT: s_addc_u32 s1, s3, 1 375; GFX9-NEXT: v_mov_b32_e32 v0, 0 376; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] 377; GFX9-NEXT: s_waitcnt vmcnt(0) 378; GFX9-NEXT: ; return to shader part epilog 379; 380; GFX10-LABEL: global_load_saddr_i8_offset_4294971392: 381; GFX10: ; %bb.0: 382; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0x1000, s2 383; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1] 384; GFX10-NEXT: 
global_load_ubyte v0, v[0:1], off 385; GFX10-NEXT: s_waitcnt vmcnt(0) 386; GFX10-NEXT: ; return to shader part epilog 387 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294971392 388 %load = load i8, i8 addrspace(1)* %gep0 389 %zext = zext i8 %load to i32 390 %to.vgpr = bitcast i32 %zext to float 391 ret float %to.vgpr 392} 393 394define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967295(i8 addrspace(1)* inreg %sbase) { 395; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967295: 396; GFX9: ; %bb.0: 397; GFX9-NEXT: v_mov_b32_e32 v0, s2 398; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 399; GFX9-NEXT: v_mov_b32_e32 v1, s3 400; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 401; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-4095 402; GFX9-NEXT: s_waitcnt vmcnt(0) 403; GFX9-NEXT: ; return to shader part epilog 404; 405; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967295: 406; GFX10: ; %bb.0: 407; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0x800, s2 408; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] 409; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2047 410; GFX10-NEXT: s_waitcnt vmcnt(0) 411; GFX10-NEXT: ; return to shader part epilog 412 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4294967295 413 %load = load i8, i8 addrspace(1)* %gep0 414 %zext = zext i8 %load to i32 415 %to.vgpr = bitcast i32 %zext to float 416 ret float %to.vgpr 417} 418 419define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967296(i8 addrspace(1)* inreg %sbase) { 420; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967296: 421; GFX9: ; %bb.0: 422; GFX9-NEXT: v_mov_b32_e32 v1, s3 423; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2 424; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 425; GFX9-NEXT: global_load_ubyte v0, v[0:1], off 426; GFX9-NEXT: s_waitcnt vmcnt(0) 427; GFX9-NEXT: ; return to shader part epilog 428; 429; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967296: 430; GFX10: ; %bb.0: 431; GFX10-NEXT: 
v_add_co_u32 v0, s[0:1], 0, s2 432; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] 433; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 434; GFX10-NEXT: s_waitcnt vmcnt(0) 435; GFX10-NEXT: ; return to shader part epilog 436 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4294967296 437 %load = load i8, i8 addrspace(1)* %gep0 438 %zext = zext i8 %load to i32 439 %to.vgpr = bitcast i32 %zext to float 440 ret float %to.vgpr 441} 442 443define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967297(i8 addrspace(1)* inreg %sbase) { 444; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967297: 445; GFX9: ; %bb.0: 446; GFX9-NEXT: v_mov_b32_e32 v1, s3 447; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2 448; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 449; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 450; GFX9-NEXT: s_waitcnt vmcnt(0) 451; GFX9-NEXT: ; return to shader part epilog 452; 453; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967297: 454; GFX10: ; %bb.0: 455; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 456; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] 457; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 458; GFX10-NEXT: s_waitcnt vmcnt(0) 459; GFX10-NEXT: ; return to shader part epilog 460 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4294967297 461 %load = load i8, i8 addrspace(1)* %gep0 462 %zext = zext i8 %load to i32 463 %to.vgpr = bitcast i32 %zext to float 464 ret float %to.vgpr 465} 466 467; -------------------------------------------------------------------------------- 468; Basic addressing patterns 469; -------------------------------------------------------------------------------- 470 471; Basic pattern, no immediate offset. 
472define amdgpu_ps float @global_load_saddr_i8_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 473; GCN-LABEL: global_load_saddr_i8_zext_vgpr: 474; GCN: ; %bb.0: 475; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] 476; GCN-NEXT: s_waitcnt vmcnt(0) 477; GCN-NEXT: ; return to shader part epilog 478 %zext.offset = zext i32 %voffset to i64 479 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 480 %load = load i8, i8 addrspace(1)* %gep0 481 %zext = zext i8 %load to i32 482 %to.vgpr = bitcast i32 %zext to float 483 ret float %to.vgpr 484} 485 486; Maximum positive offset on gfx9 (4095) 487define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 488; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095: 489; GFX9: ; %bb.0: 490; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095 491; GFX9-NEXT: s_waitcnt vmcnt(0) 492; GFX9-NEXT: ; return to shader part epilog 493; 494; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095: 495; GFX10: ; %bb.0: 496; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0 497; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1] 498; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x800, v0 499; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc 500; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 501; GFX10-NEXT: s_waitcnt vmcnt(0) 502; GFX10-NEXT: ; return to shader part epilog 503 %zext.offset = zext i32 %voffset to i64 504 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 505 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 4095 506 %load = load i8, i8 addrspace(1)* %gep1 507 %zext = zext i8 %load to i32 508 %to.vgpr = bitcast i32 %zext to float 509 ret float %to.vgpr 510} 511 512; Maximum positive offset on gfx9 + 1 (4096) 513define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4096(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 514; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096: 515; GFX9: ; %bb.0: 516; 
GFX9-NEXT: v_mov_b32_e32 v1, s3 517; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 518; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 519; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 520; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 521; GFX9-NEXT: global_load_ubyte v0, v[0:1], off 522; GFX9-NEXT: s_waitcnt vmcnt(0) 523; GFX9-NEXT: ; return to shader part epilog 524; 525; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096: 526; GFX10: ; %bb.0: 527; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0 528; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1] 529; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x1000, v0 530; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc 531; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 532; GFX10-NEXT: s_waitcnt vmcnt(0) 533; GFX10-NEXT: ; return to shader part epilog 534 %zext.offset = zext i32 %voffset to i64 535 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 536 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 4096 537 %load = load i8, i8 addrspace(1)* %gep1 538 %zext = zext i8 %load to i32 539 %to.vgpr = bitcast i32 %zext to float 540 ret float %to.vgpr 541} 542 543; Maximum negative offset on gfx9 (-4096) 544define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4096(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 545; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096: 546; GFX9: ; %bb.0: 547; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-4096 548; GFX9-NEXT: s_waitcnt vmcnt(0) 549; GFX9-NEXT: ; return to shader part epilog 550; 551; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096: 552; GFX10: ; %bb.0: 553; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0 554; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1] 555; GFX10-NEXT: v_add_co_u32 v0, vcc, 0xfffff000, v0 556; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc 557; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 558; GFX10-NEXT: s_waitcnt vmcnt(0) 559; GFX10-NEXT: ; return to shader part epilog 
560 %zext.offset = zext i32 %voffset to i64 561 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 562 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -4096 563 %load = load i8, i8 addrspace(1)* %gep1 564 %zext = zext i8 %load to i32 565 %to.vgpr = bitcast i32 %zext to float 566 ret float %to.vgpr 567} 568 569; Maximum negative offset on gfx9 - 1 (-4097) 570define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4097(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 571; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097: 572; GFX9: ; %bb.0: 573; GFX9-NEXT: v_mov_b32_e32 v1, s3 574; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 575; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 576; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0 577; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 578; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 579; GFX9-NEXT: s_waitcnt vmcnt(0) 580; GFX9-NEXT: ; return to shader part epilog 581; 582; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097: 583; GFX10: ; %bb.0: 584; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0 585; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1] 586; GFX10-NEXT: v_add_co_u32 v0, vcc, 0xfffff000, v0 587; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc 588; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 589; GFX10-NEXT: s_waitcnt vmcnt(0) 590; GFX10-NEXT: ; return to shader part epilog 591 %zext.offset = zext i32 %voffset to i64 592 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 593 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -4097 594 %load = load i8, i8 addrspace(1)* %gep1 595 %zext = zext i8 %load to i32 596 %to.vgpr = bitcast i32 %zext to float 597 ret float %to.vgpr 598} 599 600; Maximum positive offset on gfx10 (2047) 601define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2047(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 602; GCN-LABEL: 
global_load_saddr_i8_zext_vgpr_offset_2047: 603; GCN: ; %bb.0: 604; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 605; GCN-NEXT: s_waitcnt vmcnt(0) 606; GCN-NEXT: ; return to shader part epilog 607 %zext.offset = zext i32 %voffset to i64 608 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 609 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 2047 610 %load = load i8, i8 addrspace(1)* %gep1 611 %zext = zext i8 %load to i32 612 %to.vgpr = bitcast i32 %zext to float 613 ret float %to.vgpr 614} 615 616; Maximum positive offset on gfx10 + 1 (2048) 617define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2048(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 618; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048: 619; GFX9: ; %bb.0: 620; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2048 621; GFX9-NEXT: s_waitcnt vmcnt(0) 622; GFX9-NEXT: ; return to shader part epilog 623; 624; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048: 625; GFX10: ; %bb.0: 626; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0 627; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1] 628; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x800, v0 629; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc 630; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 631; GFX10-NEXT: s_waitcnt vmcnt(0) 632; GFX10-NEXT: ; return to shader part epilog 633 %zext.offset = zext i32 %voffset to i64 634 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 635 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 2048 636 %load = load i8, i8 addrspace(1)* %gep1 637 %zext = zext i8 %load to i32 638 %to.vgpr = bitcast i32 %zext to float 639 ret float %to.vgpr 640} 641 642; Maximum negative offset on gfx10 (-2048) 643define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2048(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 644; GCN-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2048: 645; GCN: ; %bb.0: 646; GCN-NEXT: global_load_ubyte v0, 
v0, s[2:3] offset:-2048 647; GCN-NEXT: s_waitcnt vmcnt(0) 648; GCN-NEXT: ; return to shader part epilog 649 %zext.offset = zext i32 %voffset to i64 650 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 651 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -2048 652 %load = load i8, i8 addrspace(1)* %gep1 653 %zext = zext i8 %load to i32 654 %to.vgpr = bitcast i32 %zext to float 655 ret float %to.vgpr 656} 657 658; Maximum negative offset on gfx10 - 1 (-2049) 659define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2049(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 660; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049: 661; GFX9: ; %bb.0: 662; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2049 663; GFX9-NEXT: s_waitcnt vmcnt(0) 664; GFX9-NEXT: ; return to shader part epilog 665; 666; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049: 667; GFX10: ; %bb.0: 668; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0 669; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1] 670; GFX10-NEXT: v_add_co_u32 v0, vcc, 0xfffff800, v0 671; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc 672; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 673; GFX10-NEXT: s_waitcnt vmcnt(0) 674; GFX10-NEXT: ; return to shader part epilog 675 %zext.offset = zext i32 %voffset to i64 676 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 677 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -2049 678 %load = load i8, i8 addrspace(1)* %gep1 679 %zext = zext i8 %load to i32 680 %to.vgpr = bitcast i32 %zext to float 681 ret float %to.vgpr 682} 683 684; Maximum positive offset on gfx9 (4095); the constant GEP precedes the VGPR-offset GEP, so the immediate needs to be moved lower. 
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095_gep_order(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
  ; The constant 4095 is added to the SGPR base first; the zero-extended
  ; VGPR offset is applied by the outer GEP.
  %off64 = zext i32 %voffset to i64
  %addr0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4095
  %addr1 = getelementptr inbounds i8, i8 addrspace(1)* %addr0, i64 %off64
  %val = load i8, i8 addrspace(1)* %addr1
  %val.i32 = zext i8 %val to i32
  %ret = bitcast i32 %val.i32 to float
  ret float %ret
}

; Address computed with integer arithmetic (ptrtoint / add / inttoptr)
; instead of getelementptr.
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_ubyte v0, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
  %off64 = zext i32 %voffset to i64
  %base.int = ptrtoint i8 addrspace(1)* %sbase to i64
  %sum = add i64 %base.int, %off64
  %addr = inttoptr i64 %sum to i8 addrspace(1)*
  %val = load i8, i8 addrspace(1)* %addr
  %val.i32 = zext i8 %val to i32
  %ret = bitcast i32 %val.i32 to float
  ret float %ret
}

; Same integer addressing, but the zext is the left-hand operand of the add.
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_ubyte v0, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
  %off64 = zext i32 %voffset to i64
  %base.int = ptrtoint i8 addrspace(1)* %sbase to i64
  %sum = add i64 %off64, %base.int
  %addr = inttoptr i64 %sum to i8 addrspace(1)*
  %val = load i8, i8 addrspace(1)* %addr
  %val.i32 = zext i8 %val to i32
  %ret = bitcast i32 %val.i32 to float
  ret float %ret
}

; zext on the left-hand side of the add, followed by a constant +128 applied
; to the combined sum.
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
  %off64 = zext i32 %voffset to i64
  %base.int = ptrtoint i8 addrspace(1)* %sbase to i64
  %sum = add i64 %off64, %base.int
  %sum.imm = add i64 %sum, 128
  %addr = inttoptr i64 %sum.imm to i8 addrspace(1)*
  %val = load i8, i8 addrspace(1)* %addr
  %val.i32 = zext i8 %val to i32
  %ret = bitcast i32 %val.i32 to float
  ret float %ret
}

; zext on the left-hand side of the add, with the constant +128 folded into
; the base operand first (non-canonical position for the immediate).
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
  %off64 = zext i32 %voffset to i64
  %base.int = ptrtoint i8 addrspace(1)* %sbase to i64
  %sum.imm = add i64 %base.int, 128
  %sum = add i64 %off64, %sum.imm
  %addr = inttoptr i64 %sum to i8 addrspace(1)*
  %val = load i8, i8 addrspace(1)* %addr
  %val.i32 = zext i8 %val to i32
  %ret = bitcast i32 %val.i32 to float
  ret float %ret
}

; --------------------------------------------------------------------------------
; Uniformity edge cases
; --------------------------------------------------------------------------------

@ptr.in.lds = internal addrspace(3) global i8 addrspace(1)* undef

; The base pointer is uniform but is produced by an LDS load, so it arrives
; in VGPRs; the checks expect it to be moved to SGPRs with v_readfirstlane.
define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) {
; GFX9-LABEL: global_load_saddr_uniform_ptr_in_vgprs:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: ds_read_b64 v[1:2], v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_readfirstlane_b32 s1, v2
; GFX9-NEXT: s_nop 4
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_uniform_ptr_in_vgprs:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: ds_read_b64 v[1:2], v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_readfirstlane_b32 s0, v1
; GFX10-NEXT: v_readfirstlane_b32 s1, v2
; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
  %lds.base = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
  %off64 = zext i32 %voffset to i64
  %addr0 = getelementptr inbounds i8, i8 addrspace(1)* %lds.base, i64 %off64
  %val = load i8, i8 addrspace(1)* %addr0
  %val.i32 = zext i8 %val to i32
  %ret = bitcast i32 %val.i32 to float
  ret float %ret
}

; Uniform base pointer loaded from LDS (so held in VGPRs), plus a constant
; +42 byte offset.
define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voffset) {
; GFX9-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: ds_read_b64 v[1:2], v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_readfirstlane_b32 s1, v2
; GFX9-NEXT: s_nop 4
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:42
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: ds_read_b64 v[1:2], v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_readfirstlane_b32 s0, v1
; GFX10-NEXT: v_readfirstlane_b32 s1, v2
; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:42
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
  %lds.base = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
  %off64 = zext i32 %voffset to i64
  %addr0 = getelementptr inbounds i8, i8 addrspace(1)* %lds.base, i64 %off64
  %addr1 = getelementptr inbounds i8, i8 addrspace(1)* %addr0, i64 42
  %val = load i8, i8 addrspace(1)* %addr1
  %val.i32 = zext i8 %val to i32
  %ret = bitcast i32 %val.i32 to float
  ret float %ret
}

; The 64-bit base and the 32-bit offset are both inreg (scalar) arguments;
; the checks expect the offset to be copied into a VGPR.
define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) {
; GCN-LABEL: global_load_saddr_i8_zext_uniform_offset:
; GCN: ; %bb.0:
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: global_load_ubyte v0, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
  %off64 = zext i32 %soffset to i64
  %addr0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %off64
  %val = load i8, i8 addrspace(1)* %addr0
  %val.i32 = zext i8 %val to i32
  %ret = bitcast i32 %val.i32 to float
  ret float %ret
}

; Scalar base and scalar offset, plus a constant -24 byte offset.
define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset_immoffset(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) {
; GCN-LABEL: global_load_saddr_i8_zext_uniform_offset_immoffset:
; GCN: ; %bb.0:
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-24
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
  %off64 = zext i32 %soffset to i64
  %addr0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %off64
  %addr1 = getelementptr inbounds i8, i8 addrspace(1)* %addr0, i64 -24
  %val = load i8, i8 addrspace(1)* %addr1
  %val.i32 = zext i8 %val to i32
  %ret = bitcast i32 %val.i32 to float
  ret float %ret
}

; Scalar base and scalar offset combined with integer arithmetic, zext as the
; left-hand operand of the add.
define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) {
; GCN-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add:
; GCN: ; %bb.0:
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: global_load_ubyte v0, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
  %off64 = zext i32 %soffset to i64
  %base.int = ptrtoint i8 addrspace(1)* %sbase to i64
  %sum = add i64 %off64, %base.int
  %addr = inttoptr i64 %sum to i8 addrspace(1)*
  %val = load i8, i8 addrspace(1)* %addr
  %val.i32 = zext i8 %val to i32
  %ret = bitcast i32 %val.i32 to float
  ret float %ret
}

; Scalar base and scalar offset combined with integer arithmetic, zext as the
; left-hand operand, followed by a constant +128.
define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) {
; GCN-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0:
; GCN: ; %bb.0:
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
  %off64 = zext i32 %soffset to i64
  %base.int = ptrtoint i8 addrspace(1)* %sbase to i64
  %sum = add i64 %off64, %base.int
  %sum.imm = add i64 %sum, 128
  %addr = inttoptr i64 %sum.imm to i8 addrspace(1)*
  %val = load i8, i8 addrspace(1)* %addr
  %val.i32 = zext i8 %val to i32
  %ret = bitcast i32 %val.i32 to float
  ret float %ret
}

; Divergent (VGPR) 64-bit base with a scalar 32-bit offset; the checks expect
; a full 64-bit VALU add and the "off" (no saddr) addressing form.
define amdgpu_ps float @global_load_i8_vgpr64_sgpr32(i8 addrspace(1)* %vbase, i32 inreg %soffset) {
; GFX9-LABEL: global_load_i8_vgpr64_sgpr32:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_i8_vgpr64_sgpr32:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32 v0, vcc, v0, s2
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
  %off64 = zext i32 %soffset to i64
  %addr0 = getelementptr inbounds i8, i8 addrspace(1)* %vbase, i64 %off64
  %val = load i8, i8 addrspace(1)* %addr0
  %val.i32 = zext i8 %val to i32
  %ret = bitcast i32 %val.i32 to float
  ret float %ret
}

; Divergent 64-bit base with a scalar 32-bit offset and a constant +4095.
define amdgpu_ps float @global_load_i8_vgpr64_sgpr32_offset_4095(i8 addrspace(1)* %vbase, i32 inreg %soffset) {
; GFX9-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32 v0, vcc, v0, s2
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
  %off64 = zext i32 %soffset to i64
  %addr0 = getelementptr inbounds i8, i8 addrspace(1)* %vbase, i64 %off64
  %addr1 = getelementptr inbounds i8, i8 addrspace(1)* %addr0, i64 4095
  %val = load i8, i8 addrspace(1)* %addr1
  %val.i32 = zext i8 %val to i32
  %ret = bitcast i32 %val.i32 to float
  ret float %ret
}

; --------------------------------------------------------------------------------
; Natural addressing shifts with restricted range
; --------------------------------------------------------------------------------

; Cannot push the shift into 32-bits, and cannot match.
define amdgpu_ps float @global_load_saddr_f32_natural_addressing(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) {
; GFX9-LABEL: global_load_saddr_f32_natural_addressing:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, s3
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_f32_natural_addressing:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX10-NEXT: v_add_co_u32 v0, vcc, s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
  ; The index is loaded from memory with no range metadata, then used to index
  ; float elements; the checks above expect a 64-bit shift and add followed by
  ; the "off" addressing form rather than the saddr form.
  %idx = load i32, i32 addrspace(1)* %voffset.ptr
  %off64 = zext i32 %idx to i64
  %addr = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %off64
  %val = load float, float addrspace(1)* %addr
  ret float %val
}

; Cannot push the shift into 32-bits, with an immediate offset.
; NOTE(review): the function that follows indexes with i8 GEPs (no scaling
; shift) and its checks show the saddr form being selected - confirm that this
; is the intended coverage for the comment above.
define amdgpu_ps float @global_load_saddr_f32_natural_addressing_immoffset(i8 addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) {
; GCN-LABEL: global_load_saddr_f32_natural_addressing_immoffset:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dword v0, v[0:1], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
  ; The loaded index is used as a byte offset (i8 GEP), then +128 is applied
  ; before the pointer is reinterpreted as float*.
  %idx = load i32, i32 addrspace(1)* %voffset.ptr
  %off64 = zext i32 %idx to i64
  %addr0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %off64
  %addr1 = getelementptr inbounds i8, i8 addrspace(1)* %addr0, i64 128
  %addr1.typed = bitcast i8 addrspace(1)* %addr1 to float addrspace(1)*
  %val = load float, float addrspace(1)* %addr1.typed
  ret float %val
}

; The !range metadata on the index load is tight enough for the scaling shift
; to be done in 32 bits.
define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) {
; GCN-LABEL: global_load_f32_saddr_zext_vgpr_range:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dword v0, v[0:1], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT: global_load_dword v0, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
  %idx = load i32, i32 addrspace(1)* %voffset.ptr, !range !0
  %off64 = zext i32 %idx to i64
  %addr = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %off64
  %val = load float, float addrspace(1)* %addr
  ret float %val
}

; Range-restricted index (32-bit shift possible) plus a constant offset of
; 100 floats, which the checks show as offset:400.
define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_imm_offset(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) {
; GCN-LABEL: global_load_f32_saddr_zext_vgpr_range_imm_offset:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dword v0, v[0:1], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:400
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
  %idx = load i32, i32 addrspace(1)* %voffset.ptr, !range !0
  %off64 = zext i32 %idx to i64
  %addr0 = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %off64
  %addr1 = getelementptr inbounds float, float addrspace(1)* %addr0, i64 100
  %val = load float, float addrspace(1)* %addr1
  ret float %val
}

; The index range (!1) is one past the limit at which the shift can be moved
; into 32 bits, so the 64-bit shift and add sequence is expected.
define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_too_large(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) {
; GFX9-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, s3
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX10-NEXT: v_add_co_u32 v0, vcc, s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
  %idx = load i32, i32 addrspace(1)* %voffset.ptr, !range !1
  %off64 = zext i32 %idx to i64
  %addr = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %off64
  %val = load float, float addrspace(1)* %addr
  ret float %val
}

; --------------------------------------------------------------------------------
; Stress various type loads
; --------------------------------------------------------------------------------

; i16 load at sbase + zext(voffset), returned as half.
define amdgpu_ps half @global_load_saddr_i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i16:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_ushort v0, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
  %off64 = zext i32 %voffset to i64
  %addr0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %off64
  %addr0.typed = bitcast i8 addrspace(1)* %addr0 to i16 addrspace(1)*
  %val = load i16, i16 addrspace(1)* %addr0.typed
  %ret = bitcast i16 %val to half
  ret half %ret
}

; i16 load with an additional constant -128 byte offset.
define amdgpu_ps half @global_load_saddr_i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i16_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_ushort v0, v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
  %off64 = zext i32 %voffset to i64
  %addr0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %off64
  %addr1 = getelementptr inbounds i8, i8 addrspace(1)* %addr0, i64 -128
  %addr1.typed = bitcast i8 addrspace(1)* %addr1 to i16 addrspace(1)*
  %val = load i16, i16 addrspace(1)* %addr1.typed
  %ret = bitcast i16 %val to half
  ret half %ret
}

; half load at sbase + zext(voffset).
define amdgpu_ps half @global_load_saddr_f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_f16:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_ushort v0, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
  %off64 = zext i32 %voffset to i64
  %addr0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %off64
  %addr0.typed = bitcast i8 addrspace(1)* %addr0 to half addrspace(1)*
  %val = load half, half addrspace(1)* %addr0.typed
  ret half %val
}

; half load with an additional constant -128 byte offset.
define amdgpu_ps half @global_load_saddr_f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_f16_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_ushort v0, v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
  %off64 = zext i32 %voffset to i64
  %addr0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %off64
  %addr1 = getelementptr inbounds i8, i8 addrspace(1)* %addr0, i64 -128
  %addr1.typed = bitcast i8 addrspace(1)* %addr1 to half addrspace(1)*
  %val = load half, half addrspace(1)* %addr1.typed
  ret half %val
}

; i32 load at sbase + zext(voffset), returned as float.
define amdgpu_ps float @global_load_saddr_i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i32:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dword v0, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
  %off64 = zext i32 %voffset to i64
  %addr0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %off64
  %addr0.typed = bitcast i8 addrspace(1)* %addr0 to i32 addrspace(1)*
  %val = load i32, i32 addrspace(1)* %addr0.typed
  %ret = bitcast i32 %val to float
  ret float %ret
}

; i32 load with an additional constant -128 byte offset.
define amdgpu_ps float @global_load_saddr_i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i32_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
  %off64 = zext i32 %voffset to i64
  %addr0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %off64
  %addr1 = getelementptr inbounds i8, i8 addrspace(1)* %addr0, i64 -128
  %addr1.typed = bitcast i8 addrspace(1)* %addr1 to i32 addrspace(1)*
  %val = load i32, i32 addrspace(1)* %addr1.typed
  %ret = bitcast i32 %val to float
  ret float %ret
}

; float load at sbase + zext(voffset).
define amdgpu_ps float @global_load_saddr_f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_f32:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dword v0, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
  %off64 = zext i32 %voffset to i64
  %addr0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %off64
  %addr0.typed = bitcast i8 addrspace(1)* %addr0 to float addrspace(1)*
  %val = load float, float addrspace(1)* %addr0.typed
  ret float %val
}

; float load with an additional constant -128 byte offset.
define amdgpu_ps float @global_load_saddr_f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_f32_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
  %off64 = zext i32 %voffset to i64
  %addr0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %off64
  %addr1 = getelementptr inbounds i8, i8 addrspace(1)* %addr0, i64 -128
  %addr1.typed = bitcast i8 addrspace(1)* %addr1 to float addrspace(1)*
  %val = load float, float addrspace(1)* %addr1.typed
  ret float %val
}

; <2 x i16> load at sbase + zext(voffset), returned as <2 x half>.
define amdgpu_ps <2 x half> @global_load_saddr_v2i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2i16:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dword v0, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
  %off64 = zext i32 %voffset to i64
  %addr0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %off64
  %addr0.typed = bitcast i8 addrspace(1)* %addr0 to <2 x i16> addrspace(1)*
  %val = load <2 x i16>, <2 x i16> addrspace(1)* %addr0.typed
  %ret = bitcast <2 x i16> %val to <2 x half>
  ret <2 x half> %ret
}

; <2 x i16> load with an additional constant -128 byte offset.
define amdgpu_ps <2 x half> @global_load_saddr_v2i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2i16_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
  %off64 = zext i32 %voffset to i64
  %addr0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %off64
  %addr1 = getelementptr inbounds i8, i8 addrspace(1)* %addr0, i64 -128
  %addr1.typed = bitcast i8 addrspace(1)* %addr1 to <2 x i16> addrspace(1)*
  %val = load <2 x i16>, <2 x i16> addrspace(1)* %addr1.typed
  %ret = bitcast <2 x i16> %val to <2 x half>
  ret <2 x half> %ret
}

; <2 x half> load at sbase + zext(voffset).
define amdgpu_ps <2 x half> @global_load_saddr_v2f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2f16:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dword v0, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
  %off64 = zext i32 %voffset to i64
  %addr0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %off64
  %addr0.typed = bitcast i8 addrspace(1)* %addr0 to <2 x half> addrspace(1)*
  %val = load <2 x half>, <2 x half> addrspace(1)* %addr0.typed
  ret <2 x half> %val
}

; <2 x half> load with an additional constant -128 byte offset.
define amdgpu_ps <2 x half> @global_load_saddr_v2f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2f16_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
  %off64 = zext i32 %voffset to i64
  %addr0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %off64
  %addr1 = getelementptr inbounds i8, i8 addrspace(1)* %addr0, i64 -128
  %addr1.typed = bitcast i8 addrspace(1)* %addr1 to <2 x half> addrspace(1)*
  %val = load <2 x half>, <2 x half> addrspace(1)* %addr1.typed
  ret <2 x half> %val
}

; Load of an addrspace(3) pointer value; it is converted to i32 and returned
; as <2 x half>.
define amdgpu_ps <2 x half> @global_load_saddr_p3(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_p3:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dword v0, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
  %off64 = zext i32 %voffset to i64
  %addr0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %off64
  %addr0.typed = bitcast i8 addrspace(1)* %addr0 to i8 addrspace(3)* addrspace(1)*
  %val = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %addr0.typed
  %val.int = ptrtoint i8 addrspace(3)* %val to i32
  %ret = bitcast i32 %val.int to <2 x half>
  ret <2 x half> %ret
}

; addrspace(3) pointer load with an additional constant -128 byte offset.
define amdgpu_ps <2 x half> @global_load_saddr_p3_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_p3_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
  %off64 = zext i32 %voffset to i64
  %addr0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %off64
  %addr1 = getelementptr inbounds i8, i8 addrspace(1)* %addr0, i64 -128
  %addr1.typed = bitcast i8 addrspace(1)* %addr1 to i8 addrspace(3)* addrspace(1)*
  %val = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %addr1.typed
  %val.int = ptrtoint i8 addrspace(3)* %val to i32
  %ret = bitcast i32 %val.int to <2 x half>
  ret <2 x half> %ret
}

1302define amdgpu_ps <2 x float> @global_load_saddr_f64(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1303; GCN-LABEL: global_load_saddr_f64: 1304; GCN: ; %bb.0: 1305; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] 1306; GCN-NEXT: s_waitcnt vmcnt(0) 1307; GCN-NEXT: ; return to shader part epilog 1308 %zext.offset = zext i32 %voffset to i64 1309 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1310 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to double addrspace(1)* 1311 %load = load double, double addrspace(1)* %gep0.cast 1312 %cast.load = bitcast double %load to <2 x float> 1313 ret <2 x float> %cast.load 1314} 1315 1316define amdgpu_ps <2 x float> @global_load_saddr_f64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1317; GCN-LABEL: global_load_saddr_f64_immneg128: 1318; GCN: ; %bb.0: 1319; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 1320; GCN-NEXT: s_waitcnt vmcnt(0) 1321; GCN-NEXT: ; return to shader part epilog 1322 %zext.offset = zext i32 %voffset to i64 1323 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1324 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 1325 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to double addrspace(1)* 1326 %load = load double, double addrspace(1)* %gep1.cast 1327 %cast.load = bitcast double %load to <2 x float> 1328 ret <2 x float> %cast.load 1329} 1330 1331define amdgpu_ps <2 x float> @global_load_saddr_i64(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1332; GCN-LABEL: global_load_saddr_i64: 1333; GCN: ; %bb.0: 1334; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] 1335; GCN-NEXT: s_waitcnt vmcnt(0) 1336; GCN-NEXT: ; return to shader part epilog 1337 %zext.offset = zext i32 %voffset to i64 1338 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1339 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* 1340 %load = load i64, i64 addrspace(1)* %gep0.cast 1341 %cast.load = bitcast i64 %load 
to <2 x float>
  ret <2 x float> %cast.load
}

define amdgpu_ps <2 x float> @global_load_saddr_i64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i64_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
  %load = load i64, i64 addrspace(1)* %gep1.cast
  %cast.load = bitcast i64 %load to <2 x float>
  ret <2 x float> %cast.load
}

define amdgpu_ps <2 x float> @global_load_saddr_v2f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2f32:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x float> addrspace(1)*
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep0.cast
  ret <2 x float> %load
}

define amdgpu_ps <2 x float> @global_load_saddr_v2f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2f32_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x float> addrspace(1)*
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep1.cast
  ret <2 x float> %load
}

define amdgpu_ps <2 x float> @global_load_saddr_v2i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i32> addrspace(1)*
  %load = load <2 x i32>, <2 x i32> addrspace(1)* %gep0.cast
  %cast.load = bitcast <2 x i32> %load to <2 x float>
  ret <2 x float> %cast.load
}

define amdgpu_ps <2 x float> @global_load_saddr_v2i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2i32_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i32> addrspace(1)*
  %load = load <2 x i32>, <2 x i32> addrspace(1)* %gep1.cast
  %cast.load = bitcast <2 x i32> %load to <2 x float>
  ret <2 x float> %cast.load
}

define amdgpu_ps <2 x float> @global_load_saddr_v4i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i16> addrspace(1)*
  %load = load <4 x i16>, <4 x i16> addrspace(1)* %gep0.cast
  %cast.load = bitcast <4 x i16> %load to <2 x float>
  ret <2 x float> %cast.load
}

define amdgpu_ps <2 x float> @global_load_saddr_v4i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4i16_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x i16> addrspace(1)*
  %load = load <4 x i16>, <4 x i16> addrspace(1)* %gep1.cast
  %cast.load = bitcast <4 x i16> %load to <2 x float>
  ret <2 x float> %cast.load
}

define amdgpu_ps <2 x float> @global_load_saddr_v4f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4f16:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x half> addrspace(1)*
  %load = load <4 x half>, <4 x half> addrspace(1)* %gep0.cast
  %cast.load = bitcast <4 x half> %load to <2 x float>
  ret <2 x float> %cast.load
}

define amdgpu_ps <2 x float> @global_load_saddr_v4f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4f16_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x half> addrspace(1)*
  %load = load <4 x half>, <4 x half> addrspace(1)* %gep1.cast
  %cast.load = bitcast <4 x half> %load to <2 x float>
  ret <2 x float> %cast.load
}

define amdgpu_ps <2 x float> @global_load_saddr_p1(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_p1:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* addrspace(1)*
  %load = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %gep0.cast
  %cast.load0 = ptrtoint i8 addrspace(1)* %load to i64
  %cast.load1 = bitcast i64 %cast.load0 to <2 x float>
  ret <2 x float> %cast.load1
}

define amdgpu_ps <2 x float> @global_load_saddr_p1_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_p1_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)* addrspace(1)*
  %load = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %gep1.cast
  %cast.load0 = ptrtoint i8 addrspace(1)* %load to i64
  %cast.load1 = bitcast i64 %cast.load0 to <2 x float>
  ret <2 x float> %cast.load1
}

; 96-bit result types: the checks below expect a single global_load_dwordx3.

define amdgpu_ps <3 x float> @global_load_saddr_v3f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v3f32:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <3 x float> addrspace(1)*
  %load = load <3 x float>, <3 x float> addrspace(1)* %gep0.cast
  ret <3 x float> %load
}

define amdgpu_ps <3 x float> @global_load_saddr_v3f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v3f32_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <3 x float> addrspace(1)*
  %load = load <3 x float>, <3 x float> addrspace(1)* %gep1.cast
  ret <3 x float> %load
}

define amdgpu_ps <3 x float> @global_load_saddr_v3i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v3i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <3 x i32> addrspace(1)*
  %load = load <3 x i32>, <3 x i32> addrspace(1)* %gep0.cast
  %cast.load = bitcast <3 x i32> %load to <3 x float>
  ret <3 x float> %cast.load
}

define amdgpu_ps <3 x float> @global_load_saddr_v3i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v3i32_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <3 x i32> addrspace(1)*
  %load = load <3 x i32>, <3 x i32> addrspace(1)* %gep1.cast
  %cast.load = bitcast <3 x i32> %load to <3 x float>
  ret <3 x float> %cast.load
}

define amdgpu_ps <6 x half> @global_load_saddr_v6f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v6f16:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <6 x half> addrspace(1)*
  %load = load <6 x half>, <6 x half> addrspace(1)* %gep0.cast
  ret <6 x half> %load
}

define amdgpu_ps <6 x half> @global_load_saddr_v6f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v6f16_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <6 x half> addrspace(1)*
  %load = load <6 x half>, <6 x half> addrspace(1)* %gep1.cast
  ret <6 x half> %load
}

; 128-bit result types: the checks below expect a single global_load_dwordx4.

define amdgpu_ps <4 x float> @global_load_saddr_v4f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4f32:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x float> addrspace(1)*
  %load = load <4 x float>, <4 x float> addrspace(1)* %gep0.cast
  ret <4 x float> %load
}

define amdgpu_ps <4 x float> @global_load_saddr_v4f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4f32_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x float> addrspace(1)*
  %load = load <4 x float>, <4 x float> addrspace(1)* %gep1.cast
  ret <4 x float> %load
}

define amdgpu_ps <4 x float> @global_load_saddr_v4i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i32> addrspace(1)*
  %load = load <4 x i32>, <4 x i32> addrspace(1)* %gep0.cast
  %cast.load = bitcast <4 x i32> %load to <4 x float>
  ret <4 x float> %cast.load
}

define amdgpu_ps <4 x float> @global_load_saddr_v4i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4i32_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x i32> addrspace(1)*
  %load = load <4 x i32>, <4 x i32> addrspace(1)* %gep1.cast
  %cast.load = bitcast <4 x i32> %load to <4 x float>
  ret <4 x float> %cast.load
}

define amdgpu_ps <4 x float> @global_load_saddr_v2i64(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2i64:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i64> addrspace(1)*
  %load = load <2 x i64>, <2 x i64> addrspace(1)* %gep0.cast
  %cast.load = bitcast <2 x i64> %load to <4 x float>
  ret <4 x float> %cast.load
}

define amdgpu_ps <4 x float> @global_load_saddr_v2i64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2i64_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i64> addrspace(1)*
  %load = load <2 x i64>, <2 x i64> addrspace(1)* %gep1.cast
  %cast.load = bitcast <2 x i64> %load to <4 x float>
  ret <4 x float> %cast.load
}

define amdgpu_ps <4 x float> @global_load_saddr_i128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i128 addrspace(1)*
  %load = load i128, i128 addrspace(1)* %gep0.cast
  %cast.load = bitcast i128 %load to <4 x float>
  ret <4 x float> %cast.load
}

define amdgpu_ps <4 x float> @global_load_saddr_i128_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i128_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i128 addrspace(1)*
  %load = load i128, i128 addrspace(1)* %gep1.cast
  %cast.load = bitcast i128 %load to <4 x float>
  ret <4 x float> %cast.load
}

define amdgpu_ps <4 x float> @global_load_saddr_v2p1(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2p1:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i8 addrspace(1)*> addrspace(1)*
  %load = load <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %gep0.cast
  %cast.load0 = ptrtoint <2 x i8 addrspace(1)*> %load to <2 x i64>
  %cast.load1 = bitcast <2 x i64> %cast.load0 to <4 x float>
  ret <4 x float> %cast.load1
}

define amdgpu_ps <4 x float> @global_load_saddr_v2p1_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2p1_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i8 addrspace(1)*> addrspace(1)*
  %load = load <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %gep1.cast
  %cast.load0 = ptrtoint <2 x i8 addrspace(1)*> %load to <2 x i64>
  %cast.load1 = bitcast <2 x i64> %cast.load0 to <4 x float>
  ret <4 x float> %cast.load1
}

define amdgpu_ps <4 x float> @global_load_saddr_v4p3(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4p3:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i8 addrspace(3)*> addrspace(1)*
  %load = load <4 x i8 addrspace(3)*>, <4 x i8 addrspace(3)*> addrspace(1)* %gep0.cast
  %cast.load0 = ptrtoint <4 x i8 addrspace(3)*> %load to <4 x i32>
  %cast.load1 = bitcast <4 x i32> %cast.load0 to <4 x float>
  ret <4 x float> %cast.load1
}

define amdgpu_ps <4 x float> @global_load_saddr_v4p3_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4p3_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x i8 addrspace(3)*> addrspace(1)*
  %load = load <4 x i8 addrspace(3)*>, <4 x i8 addrspace(3)*> addrspace(1)* %gep1.cast
  %cast.load0 = ptrtoint <4 x i8 addrspace(3)*> %load to <4 x i32>
  %cast.load1 = bitcast <4 x i32> %cast.load0 to <4 x float>
  ret <4 x float> %cast.load1
}

; --------------------------------------------------------------------------------
; Extending loads
; --------------------------------------------------------------------------------

; A sext/zext of an i8 or i16 load to i32 is expected to select the matching
; signed/unsigned byte or short load, still using the SGPR base as saddr.

define amdgpu_ps float @global_sextload_saddr_i8(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_sextload_saddr_i8:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_sbyte v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %load = load i8, i8 addrspace(1)* %gep0
  %sextload = sext i8 %load to i32
  %cast.load = bitcast i32 %sextload to float
  ret float %cast.load
}

define amdgpu_ps float @global_sextload_saddr_i8_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_sextload_saddr_i8_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_sbyte v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %load = load i8, i8 addrspace(1)* %gep1
  %sextload = sext i8 %load to i32
  %cast.load = bitcast i32 %sextload to float
  ret float %cast.load
}

define amdgpu_ps float @global_sextload_saddr_i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_sextload_saddr_i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_sshort v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep0.cast
  %sextload = sext i16 %load to i32
  %cast.load = bitcast i32 %sextload to float
  ret float %cast.load
}

define amdgpu_ps float @global_sextload_saddr_i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_sextload_saddr_i16_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_sshort v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep1.cast
  %sextload = sext i16 %load to i32
  %cast.load = bitcast i32 %sextload to float
  ret float %cast.load
}

define amdgpu_ps float @global_zextload_saddr_i8(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_zextload_saddr_i8:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %load = load i8, i8 addrspace(1)* %gep0
  %zextload = zext i8 %load to i32
  %cast.load = bitcast i32 %zextload to float
  ret float %cast.load
}

define amdgpu_ps float @global_zextload_saddr_i8_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_zextload_saddr_i8_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %load = load i8, i8 addrspace(1)* %gep1
  %zextload = zext i8 %load to i32
  %cast.load = bitcast i32 %zextload to float
  ret float %cast.load
}

define amdgpu_ps float @global_zextload_saddr_i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_zextload_saddr_i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ushort v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep0.cast
  %zextload = zext i16 %load to i32
  %cast.load = bitcast i32 %zextload to float
  ret float %cast.load
}

define amdgpu_ps float @global_zextload_saddr_i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_zextload_saddr_i16_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ushort v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep1.cast
  %zextload = zext i16 %load to i32
  %cast.load = bitcast i32 %zextload to float
  ret float %cast.load
}

; --------------------------------------------------------------------------------
; Atomic load
; --------------------------------------------------------------------------------

; seq_cst atomic loads. The expected output differs per target: gfx900 sets glc
; and invalidates with buffer_wbinvl1, while gfx1010 sets glc dlc, waits on
; vscnt first and invalidates with buffer_gl0_inv / buffer_gl1_inv.

define amdgpu_ps float @atomic_global_load_saddr_i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GFX9-LABEL: atomic_global_load_saddr_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: atomic_global_load_saddr_i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
  %load = load atomic i32, i32 addrspace(1)* %gep0.cast seq_cst, align 4
  %cast.load = bitcast i32 %load to float
  ret float %cast.load
}

define amdgpu_ps float @atomic_global_load_saddr_i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GFX9-LABEL: atomic_global_load_saddr_i32_immneg128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: atomic_global_load_saddr_i32_immneg128:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
  %load = load atomic i32, i32 addrspace(1)* %gep1.cast seq_cst, align 4
  %cast.load = bitcast i32 %load to float
  ret float %cast.load
}

define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GFX9-LABEL: atomic_global_load_saddr_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: atomic_global_load_saddr_i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
  %load = load atomic i64, i64 addrspace(1)* %gep0.cast seq_cst, align 8
  %cast.load = bitcast i64 %load to <2 x float>
  ret <2 x float> %cast.load
}

define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GFX9-LABEL: atomic_global_load_saddr_i64_immneg128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: atomic_global_load_saddr_i64_immneg128:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
  %load = load atomic i64, i64 addrspace(1)* %gep1.cast seq_cst, align 8
  %cast.load = bitcast i64 %load to <2 x float>
  ret <2 x float> %cast.load
}

; --------------------------------------------------------------------------------
; D16 load (low 16)
; --------------------------------------------------------------------------------

; The loaded i16 (or an i8 extended to i16) is inserted into element 0 of a
; <2 x i16>; the checks expect the *_d16 load forms. The suffix names what
; element 1 holds: undef, zero, or the %reg argument.

define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i16_d16lo_undef_hi:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_short_d16 v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep0.cast
  %build = insertelement <2 x i16> undef, i16 %load, i32 0
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_short_d16 v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep1.cast
  %build = insertelement <2 x i16> undef, i16 %load, i32 0
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i16_d16lo_zero_hi:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    global_load_short_d16 v1, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, v1
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep0.cast
  %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    global_load_short_d16 v1, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, v1
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep1.cast
  %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16lo_reg_hi:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_short_d16 v1, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, v1
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep0.cast
  %build = insertelement <2 x i16> %reg, i16 %load, i32 0
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_short_d16 v1, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, v1
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep1.cast
  %build = insertelement <2 x i16> %reg, i16 %load, i32 0
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ubyte_d16 v1, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, v1
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)*
  %load = load i8, i8 addrspace(1)* %gep0.cast
  %zext.load = zext i8 %load to i16
  %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 0
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ubyte_d16 v1, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, v1
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)*
  %load = load i8, i8 addrspace(1)* %gep1.cast
  %zext.load = zext i8 %load to i16
  %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 0
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_sbyte_d16 v1, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, v1
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)*
  %load = load i8, i8 addrspace(1)* %gep0.cast
  %sext.load = sext i8 %load to i16
  %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 0
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_sbyte_d16 v1, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, v1
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)*
  %load = load i8, i8 addrspace(1)* %gep1.cast
  %sext.load = sext i8 %load to i16
  %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 0
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

; --------------------------------------------------------------------------------
; D16 hi load (hi16)
; --------------------------------------------------------------------------------

; Here the loaded value is inserted into element 1 of the <2 x i16>, so the
; checks expect the *_d16_hi load forms.

define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i16_d16hi_undef_hi:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_short_d16_hi v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep0.cast
  %build = insertelement <2 x i16> undef, i16 %load, i32 1
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_short_d16_hi v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)*
%sbase, i64 %zext.offset 2198 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 2199 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)* 2200 %load = load i16, i16 addrspace(1)* %gep1.cast 2201 %build = insertelement <2 x i16> undef, i16 %load, i32 1 2202 %cast = bitcast <2 x i16> %build to <2 x half> 2203 ret <2 x half> %cast 2204} 2205 2206define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 2207; GCN-LABEL: global_load_saddr_i16_d16hi_zero_hi: 2208; GCN: ; %bb.0: 2209; GCN-NEXT: v_mov_b32_e32 v1, 0 2210; GCN-NEXT: global_load_short_d16_hi v1, v0, s[2:3] 2211; GCN-NEXT: s_waitcnt vmcnt(0) 2212; GCN-NEXT: v_mov_b32_e32 v0, v1 2213; GCN-NEXT: ; return to shader part epilog 2214 %zext.offset = zext i32 %voffset to i64 2215 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 2216 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* 2217 %load = load i16, i16 addrspace(1)* %gep0.cast 2218 %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1 2219 %cast = bitcast <2 x i16> %build to <2 x half> 2220 ret <2 x half> %cast 2221} 2222 2223define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 2224; GCN-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128: 2225; GCN: ; %bb.0: 2226; GCN-NEXT: v_mov_b32_e32 v1, 0 2227; GCN-NEXT: global_load_short_d16_hi v1, v0, s[2:3] offset:-128 2228; GCN-NEXT: s_waitcnt vmcnt(0) 2229; GCN-NEXT: v_mov_b32_e32 v0, v1 2230; GCN-NEXT: ; return to shader part epilog 2231 %zext.offset = zext i32 %voffset to i64 2232 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 2233 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 2234 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)* 2235 %load = load i16, i16 addrspace(1)* %gep1.cast 2236 %build = insertelement <2 x i16> zeroinitializer, i16 
%load, i32 1 2237 %cast = bitcast <2 x i16> %build to <2 x half> 2238 ret <2 x half> %cast 2239} 2240 2241define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { 2242; GCN-LABEL: global_load_saddr_i16_d16hi_reg_hi: 2243; GCN: ; %bb.0: 2244; GCN-NEXT: global_load_short_d16_hi v1, v0, s[2:3] 2245; GCN-NEXT: s_waitcnt vmcnt(0) 2246; GCN-NEXT: v_mov_b32_e32 v0, v1 2247; GCN-NEXT: ; return to shader part epilog 2248 %zext.offset = zext i32 %voffset to i64 2249 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 2250 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* 2251 %load = load i16, i16 addrspace(1)* %gep0.cast 2252 %build = insertelement <2 x i16> %reg, i16 %load, i32 1 2253 %cast = bitcast <2 x i16> %build to <2 x half> 2254 ret <2 x half> %cast 2255} 2256 2257define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { 2258; GCN-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128: 2259; GCN: ; %bb.0: 2260; GCN-NEXT: global_load_short_d16_hi v1, v0, s[2:3] offset:-128 2261; GCN-NEXT: s_waitcnt vmcnt(0) 2262; GCN-NEXT: v_mov_b32_e32 v0, v1 2263; GCN-NEXT: ; return to shader part epilog 2264 %zext.offset = zext i32 %voffset to i64 2265 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 2266 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 2267 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)* 2268 %load = load i16, i16 addrspace(1)* %gep1.cast 2269 %build = insertelement <2 x i16> %reg, i16 %load, i32 1 2270 %cast = bitcast <2 x i16> %build to <2 x half> 2271 ret <2 x half> %cast 2272} 2273 2274define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { 2275; GCN-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi: 2276; GCN: ; %bb.0: 2277; GCN-NEXT: 
global_load_ubyte_d16_hi v1, v0, s[2:3] 2278; GCN-NEXT: s_waitcnt vmcnt(0) 2279; GCN-NEXT: v_mov_b32_e32 v0, v1 2280; GCN-NEXT: ; return to shader part epilog 2281 %zext.offset = zext i32 %voffset to i64 2282 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 2283 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* 2284 %load = load i8, i8 addrspace(1)* %gep0.cast 2285 %zext.load = zext i8 %load to i16 2286 %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 1 2287 %cast = bitcast <2 x i16> %build to <2 x half> 2288 ret <2 x half> %cast 2289} 2290 2291define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { 2292; GCN-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128: 2293; GCN: ; %bb.0: 2294; GCN-NEXT: global_load_ubyte_d16_hi v1, v0, s[2:3] offset:-128 2295; GCN-NEXT: s_waitcnt vmcnt(0) 2296; GCN-NEXT: v_mov_b32_e32 v0, v1 2297; GCN-NEXT: ; return to shader part epilog 2298 %zext.offset = zext i32 %voffset to i64 2299 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 2300 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 2301 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)* 2302 %load = load i8, i8 addrspace(1)* %gep1.cast 2303 %zext.load = zext i8 %load to i16 2304 %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 1 2305 %cast = bitcast <2 x i16> %build to <2 x half> 2306 ret <2 x half> %cast 2307} 2308 2309define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { 2310; GCN-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi: 2311; GCN: ; %bb.0: 2312; GCN-NEXT: global_load_sbyte_d16_hi v1, v0, s[2:3] 2313; GCN-NEXT: s_waitcnt vmcnt(0) 2314; GCN-NEXT: v_mov_b32_e32 v0, v1 2315; GCN-NEXT: ; return to shader part epilog 2316 %zext.offset = zext i32 %voffset to i64 2317 %gep0 = 
getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 2318 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* 2319 %load = load i8, i8 addrspace(1)* %gep0.cast 2320 %sext.load = sext i8 %load to i16 2321 %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 1 2322 %cast = bitcast <2 x i16> %build to <2 x half> 2323 ret <2 x half> %cast 2324} 2325 2326define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { 2327; GCN-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: 2328; GCN: ; %bb.0: 2329; GCN-NEXT: global_load_sbyte_d16_hi v1, v0, s[2:3] offset:-128 2330; GCN-NEXT: s_waitcnt vmcnt(0) 2331; GCN-NEXT: v_mov_b32_e32 v0, v1 2332; GCN-NEXT: ; return to shader part epilog 2333 %zext.offset = zext i32 %voffset to i64 2334 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 2335 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 2336 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)* 2337 %load = load i8, i8 addrspace(1)* %gep1.cast 2338 %sext.load = sext i8 %load to i16 2339 %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 1 2340 %cast = bitcast <2 x i16> %build to <2 x half> 2341 ret <2 x half> %cast 2342} 2343 2344; -------------------------------------------------------------------------------- 2345; or-with-constant as add 2346; -------------------------------------------------------------------------------- 2347 2348; Check add-as-or with split 64-bit or. 
; The address is built purely from integers: %idx is widened to 64 bits, OR'd
; with 16 and converted to a global pointer. %sbase is accepted but never
; referenced. The expected code ORs the constant into v0, zeroes v1 and loads
; through the v[0:1] pair with "off" in the scalar-base position.
define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_16(i8 addrspace(6)* inreg %sbase, i32 %idx) {
; GCN-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16:
; GCN: ; %bb.0:
; GCN-NEXT: v_or_b32_e32 v0, 16, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: global_load_ubyte v0, v[0:1], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
  %idx.wide = zext i32 %idx to i64
  %addr.bits = or i64 %idx.wide, 16
  %ptr = inttoptr i64 %addr.bits to i8 addrspace(1)*
  %byte = load i8, i8 addrspace(1)* %ptr
  %byte.wide = zext i8 %byte to i32
  %result = bitcast i32 %byte.wide to float
  ret float %result
}

; Same integer-built address with the constant 4160 (0x1040 in the expected
; v_or_b32); the expected code again loads through v[0:1] with "off" in the
; scalar-base position and no immediate offset on the load.
define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_4160(i8 addrspace(6)* inreg %sbase, i32 %idx) {
; GCN-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160:
; GCN: ; %bb.0:
; GCN-NEXT: v_or_b32_e32 v0, 0x1040, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: global_load_ubyte v0, v[0:1], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
  %idx.wide = zext i32 %idx to i64
  %addr.bits = or i64 %idx.wide, 4160
  %ptr = inttoptr i64 %addr.bits to i8 addrspace(1)*
  %byte = load i8, i8 addrspace(1)* %ptr
  %byte.wide = zext i8 %byte to i32
  %result = bitcast i32 %byte.wide to float
  ret float %result
}

; --------------------------------------------------------------------------------
; Full 64-bit scalar add.
; --------------------------------------------------------------------------------

; Counted loop (0 to 255) that performs one volatile float load from
; %arg[counter] per iteration and discards the value. For both targets the
; expected loop keeps a 64-bit byte offset in s[0:1], adds it to the base in
; s[2:3] with s_add_u32/s_addc_u32, and uses the sum in s[4:5] as the scalar
; base of the load together with a VGPR offset of zero set up before the loop.
; The exit compare is against 0x400, i.e. 256 elements of 4 bytes.
define amdgpu_ps void @global_addr_64bit_lsr_iv(float addrspace(1)* inreg %arg) {
; GFX9-LABEL: global_addr_64bit_lsr_iv:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: BB128_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_add_u32 s4, s2, s0
; GFX9-NEXT: s_addc_u32 s5, s3, s1
; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_add_u32 s0, s0, 4
; GFX9-NEXT: s_addc_u32 s1, s1, 0
; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x400
; GFX9-NEXT: s_cbranch_scc0 BB128_1
; GFX9-NEXT: ; %bb.2: ; %bb2
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_addr_64bit_lsr_iv:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: BB128_1: ; %bb3
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_add_u32 s4, s2, s0
; GFX10-NEXT: s_addc_u32 s5, s3, s1
; GFX10-NEXT: s_add_u32 s0, s0, 4
; GFX10-NEXT: global_load_dword v1, v0, s[4:5] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_cmpk_eq_i32 s0, 0x400
; GFX10-NEXT: s_cbranch_scc0 BB128_1
; GFX10-NEXT: ; %bb.2: ; %bb2
; GFX10-NEXT: s_endpgm
bb:
  br label %bb3

bb2: ; preds = %bb3
  ret void

bb3: ; preds = %bb3, %bb
  ; Block labels are left as-is: they appear in the expected assembly comments.
  %iv = phi i32 [ 0, %bb ], [ %iv.next, %bb3 ]
  %iv.ext = zext i32 %iv to i64
  %elt.ptr = getelementptr inbounds float, float addrspace(1)* %arg, i64 %iv.ext
  %elt = load volatile float, float addrspace(1)* %elt.ptr, align 4
  %iv.next = add nuw nsw i32 %iv, 1
  %exit.cond = icmp eq i32 %iv.next, 256
  br i1 %exit.cond, label %bb2, label %bb3
}

; Make sure we only have a single zero vaddr initialization.

; Counted loop (0 to 255) with two volatile float loads per iteration and a
; second inreg pointer argument, %arg.1. For both targets the expected code
; contains one v_mov_b32 of 0 into v0 ahead of the loop, and both loads in the
; loop use that v0 as their VGPR offset with s[4:5] as the scalar base.
define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(float addrspace(1)* inreg %arg, float addrspace(1)* inreg %arg.1) {
; GFX9-LABEL: global_addr_64bit_lsr_iv_multiload:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: BB129_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_add_u32 s4, s2, s0
; GFX9-NEXT: s_addc_u32 s5, s3, s1
; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_add_u32 s0, s0, 4
; GFX9-NEXT: s_addc_u32 s1, s1, 0
; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x400
; GFX9-NEXT: ; kill: killed $sgpr4 killed $sgpr5
; GFX9-NEXT: s_cbranch_scc0 BB129_1
; GFX9-NEXT: ; %bb.2: ; %bb2
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_addr_64bit_lsr_iv_multiload:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: BB129_1: ; %bb3
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_add_u32 s4, s2, s0
; GFX10-NEXT: s_addc_u32 s5, s3, s1
; GFX10-NEXT: s_add_u32 s0, s0, 4
; GFX10-NEXT: global_load_dword v1, v0, s[4:5] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[4:5] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_cmpk_eq_i32 s0, 0x400
; GFX10-NEXT: ; kill: killed $sgpr4 killed $sgpr5
; GFX10-NEXT: s_cbranch_scc0 BB129_1
; GFX10-NEXT: ; %bb.2: ; %bb2
; GFX10-NEXT: s_endpgm
bb:
  br label %bb3

bb2: ; preds = %bb3
  ret void

bb3: ; preds = %bb3, %bb
  %i = phi i32 [ 0, %bb ], [ %i8, %bb3 ]
  %i4 = zext i32 %i to i64
  %i5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %i4
  %i6 = load volatile float, float addrspace(1)* %i5, align 4
  %i5.1 = getelementptr inbounds float, float addrspace(1)* %arg.1, i64 %i4
  ; NOTE(review): the load below goes through %i5 (the %arg-based pointer)
  ; again, so %i5.1 and %arg.1 have no users and both expected loads share the
  ; s[4:5] base. If %i5.1 was intended, changing the operand also requires
  ; regenerating the assertions with utils/update_llc_test_checks.py - confirm
  ; intent before touching it.
  %i6.1 = load volatile float, float addrspace(1)* %i5, align 4
  %i8 = add nuw nsw i32 %i, 1
  %i9 = icmp eq i32 %i8, 256
  br i1 %i9, label %bb2, label %bb3
}

; Module-level metadata tuples of two i32 values. Neither is referenced by the
; two loop functions directly above; presumably they are attached as !range
; metadata by tests earlier in the file - verify before removing.
!0 = !{i32 0, i32 1073741824} ; (1 << 30)
!1 = !{i32 0, i32 1073741825} ; (1 << 30) + 1