; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10 %s

; Test using saddr addressing mode of global_*load_* flat instructions.

; --------------------------------------------------------------------------------
; Basic addressing patterns
; --------------------------------------------------------------------------------

; Basic pattern, no immediate offset.
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i8_zext_vgpr:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %load = load i8, i8 addrspace(1)* %gep0
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; Maximum positive offset on gfx9
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v0, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc, 0x800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 4095
  %load = load i8, i8 addrspace(1)* %gep1
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; Maximum positive offset on gfx9 + 1
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4096(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v0, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc, 0x1000, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 4096
  %load = load i8, i8 addrspace(1)* %gep1
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; Maximum negative offset on gfx9
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4096(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-4096
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v0, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc, 0xfffff000, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -4096
  %load = load i8, i8 addrspace(1)* %gep1
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; Maximum negative offset on gfx9 - 1
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4097(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v0, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc, 0xfffff000, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -4097
  %load = load i8, i8 addrspace(1)* %gep1
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; Maximum positive offset on gfx10
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2047(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i8_zext_vgpr_offset_2047:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:2047
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 2047
  %load = load i8, i8 addrspace(1)* %gep1
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; Maximum positive offset on gfx10 + 1
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2048(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:2048
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v0, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc, 0x800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 2048
  %load = load i8, i8 addrspace(1)* %gep1
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; Maximum negative offset on gfx10
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2048(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2048:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-2048
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -2048
  %load = load i8, i8 addrspace(1)* %gep1
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; Maximum negative offset on gfx10 - 1
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2049(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-2049
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v0, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc, 0xfffff800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -2049
  %load = load i8, i8 addrspace(1)* %gep1
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; Maximum positive offset on gfx9, and immediate needs to be moved lower.
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095_gep_order(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v0, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc, 0x800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4095
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 %zext.offset
  %load = load i8, i8 addrspace(1)* %gep1
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; pointer addressing done in integers
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
  %add = add i64 %sbase.as.int, %zext.offset
  %dirty.gep = inttoptr i64 %add to i8 addrspace(1)*
  %load = load i8, i8 addrspace(1)* %dirty.gep
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; zext forced to LHS of addressing expression
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
  %add = add i64 %zext.offset, %sbase.as.int
  %dirty.gep = inttoptr i64 %add to i8 addrspace(1)*
  %load = load i8, i8 addrspace(1)* %dirty.gep
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; zext forced to LHS of addressing expression, with immediate offset
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
  %add = add i64 %zext.offset, %sbase.as.int
  %add.immoffset = add i64 %add, 128
  %dirty.gep = inttoptr i64 %add.immoffset to i8 addrspace(1)*
  %load = load i8, i8 addrspace(1)* %dirty.gep
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; zext forced to LHS of addressing expression, with immediate offset in non-canonical position
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
  %add.immoffset = add i64 %sbase.as.int, 128
  %add = add i64 %zext.offset, %add.immoffset
  %dirty.gep = inttoptr i64 %add to i8 addrspace(1)*
  %load = load i8, i8 addrspace(1)* %dirty.gep
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; --------------------------------------------------------------------------------
; Uniformity edge cases
; --------------------------------------------------------------------------------

@ptr.in.lds = internal addrspace(3) global i8 addrspace(1)* undef

; Base pointer is uniform, but also in VGPRs
define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) {
; GCN-LABEL: global_load_saddr_uniform_ptr_in_vgprs:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    ds_read_b64 v[1:2], v1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_readfirstlane_b32 s0, v1
; GCN-NEXT:    v_readfirstlane_b32 s1, v2
; GCN-NEXT:    s_nop 4
; GCN-NEXT:    global_load_ubyte v0, v0, s[0:1]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %load = load i8, i8 addrspace(1)* %gep0
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; Base pointer is uniform, but also in VGPRs, with imm offset
define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voffset) {
; GCN-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    ds_read_b64 v[1:2], v1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_readfirstlane_b32 s0, v1
; GCN-NEXT:    v_readfirstlane_b32 s1, v2
; GCN-NEXT:    s_nop 4
; GCN-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:42
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 42
  %load = load i8, i8 addrspace(1)* %gep1
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; Both 64-bit base and 32-bit offset are scalar
define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) {
; GCN-LABEL: global_load_saddr_i8_zext_uniform_offset:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_mov_b32_e32 v0, s4
; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %soffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %load = load i8, i8 addrspace(1)* %gep0
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; Both 64-bit base and 32-bit offset are scalar, with immediate offset.
define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset_immoffset(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) {
; GCN-LABEL: global_load_saddr_i8_zext_uniform_offset_immoffset:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_mov_b32_e32 v0, s4
; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-24
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %soffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -24
  %load = load i8, i8 addrspace(1)* %gep1
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; Both components uniform, zext forced to LHS of addressing expression
define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) {
; GCN-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_mov_b32_e32 v0, s4
; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %soffset to i64
  %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
  %add = add i64 %zext.offset, %sbase.as.int
  %dirty.gep = inttoptr i64 %add to i8 addrspace(1)*
  %load = load i8, i8 addrspace(1)* %dirty.gep
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; Both components uniform, zext forced to LHS of addressing expression, with immediate offset
define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) {
; GCN-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_mov_b32_e32 v0, s4
; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %soffset to i64
  %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
  %add = add i64 %zext.offset, %sbase.as.int
  %add.immoffset = add i64 %add, 128
  %dirty.gep = inttoptr i64 %add.immoffset to i8 addrspace(1)*
  %load = load i8, i8 addrspace(1)* %dirty.gep
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; divergent 64-bit base, 32-bit scalar offset.
define amdgpu_ps float @global_load_i8_vgpr64_sgpr32(i8 addrspace(1)* %vbase, i32 inreg %soffset) {
; GFX9-LABEL: global_load_i8_vgpr64_sgpr32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_load_i8_vgpr64_sgpr32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc, v0, s2
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %soffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %vbase, i64 %zext.offset
  %load = load i8, i8 addrspace(1)* %gep0
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; divergent 64-bit base, 32-bit scalar offset, with imm offset
define amdgpu_ps float @global_load_i8_vgpr64_sgpr32_offset_4095(i8 addrspace(1)* %vbase, i32 inreg %soffset) {
; GFX9-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc, v0, s2
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc, 0x800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %soffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %vbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 4095
  %load = load i8, i8 addrspace(1)* %gep1
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; --------------------------------------------------------------------------------
; Natural addressing shifts with restricted range
; --------------------------------------------------------------------------------

; Cannot push the shift into 32-bits, and cannot match.
define amdgpu_ps float @global_load_saddr_f32_natural_addressing(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) {
; GFX9-LABEL: global_load_saddr_f32_natural_addressing:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_mov_b32_e32 v2, s3
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_f32_natural_addressing:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc, s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
  %voffset = load i32, i32 addrspace(1)* %voffset.ptr
  %zext.offset = zext i32 %voffset to i64
  %gep = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset
  %load = load float, float addrspace(1)* %gep
  ret float %load
}

; Cannot push the shift into 32-bits, with an immediate offset.
define amdgpu_ps float @global_load_saddr_f32_natural_addressing_immoffset(i8 addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) {
; GCN-LABEL: global_load_saddr_f32_natural_addressing_immoffset:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dword v0, v[0:1], off
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %voffset = load i32, i32 addrspace(1)* %voffset.ptr
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to float addrspace(1)*
  %load = load float, float addrspace(1)* %gep1.cast
  ret float %load
}

; Range is sufficiently restricted to push the shift into 32-bits.
define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) {
; GCN-LABEL: global_load_f32_saddr_zext_vgpr_range:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dword v0, v[0:1], off
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT:    global_load_dword v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %voffset = load i32, i32 addrspace(1)* %voffset.ptr, !range !0
  %zext.offset = zext i32 %voffset to i64
  %gep = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset
  %load = load float, float addrspace(1)* %gep
  ret float %load
}

; Range is sufficiently restricted to push the shift into 32-bits, with an imm offset
define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_imm_offset(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) {
; GCN-LABEL: global_load_f32_saddr_zext_vgpr_range_imm_offset:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dword v0, v[0:1], off
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:400
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %voffset = load i32, i32 addrspace(1)* %voffset.ptr, !range !0
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds float, float addrspace(1)* %gep0, i64 100
  %load = load float, float addrspace(1)* %gep1
  ret float %load
}

; Range is 1 beyond the limit where we can move the shift into 32-bits.
define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_too_large(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) {
; GFX9-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_mov_b32_e32 v2, s3
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc, s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
  %voffset = load i32, i32 addrspace(1)* %voffset.ptr, !range !1
  %zext.offset = zext i32 %voffset to i64
  %gep = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset
  %load = load float, float addrspace(1)* %gep
  ret float %load
}

; --------------------------------------------------------------------------------
; Stress various type loads
; --------------------------------------------------------------------------------

define amdgpu_ps half @global_load_saddr_i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ushort v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep0.cast
  %cast.load = bitcast i16 %load to half
  ret half %cast.load
}

define amdgpu_ps half @global_load_saddr_i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i16_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ushort v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep1.cast
  %cast.load = bitcast i16 %load to half
  ret half %cast.load
}

define amdgpu_ps half @global_load_saddr_f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_f16:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ushort v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to half addrspace(1)*
  %load = load half, half addrspace(1)* %gep0.cast
  ret half %load
}

define amdgpu_ps half @global_load_saddr_f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_f16_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ushort v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to half addrspace(1)*
  %load = load half, half addrspace(1)* %gep1.cast
  ret half %load
}

define amdgpu_ps float @global_load_saddr_i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dword v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
  %load = load i32, i32 addrspace(1)* %gep0.cast
  %cast.load = bitcast i32 %load to float
  ret float %cast.load
}

define amdgpu_ps float @global_load_saddr_i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i32_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
  %load = load i32, i32 addrspace(1)* %gep1.cast
  %cast.load = bitcast i32 %load to float
  ret float %cast.load
}

define amdgpu_ps float @global_load_saddr_f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_f32:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dword v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to float addrspace(1)*
  %load = load float, float addrspace(1)* %gep0.cast
  ret float %load
}

define amdgpu_ps float @global_load_saddr_f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_f32_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to float addrspace(1)*
  %load = load float, float addrspace(1)* %gep1.cast
  ret float %load
}

define amdgpu_ps <2 x half> @global_load_saddr_v2i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dword v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i16> addrspace(1)*
  %load = load <2 x i16>, <2 x i16> addrspace(1)* %gep0.cast
  %cast.load = bitcast <2 x i16> %load to <2 x half>
  ret <2 x half> %cast.load
}

define amdgpu_ps <2 x half> @global_load_saddr_v2i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2i16_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i16> addrspace(1)*
  %load = load <2 x i16>, <2 x i16> addrspace(1)* %gep1.cast
  %cast.load = bitcast <2 x i16> %load to <2 x half>
  ret <2 x half> %cast.load
}

define amdgpu_ps <2 x half> @global_load_saddr_v2f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2f16:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dword v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x half> addrspace(1)*
  %load = load <2 x half>, <2 x half> addrspace(1)* %gep0.cast
  ret <2 x half> %load
}

define amdgpu_ps <2 x half> @global_load_saddr_v2f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2f16_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x
half> addrspace(1)* 785 %load = load <2 x half>, <2 x half> addrspace(1)* %gep1.cast 786 ret <2 x half> %load 787} 788 789define amdgpu_ps <2 x half> @global_load_saddr_p3(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 790; GCN-LABEL: global_load_saddr_p3: 791; GCN: ; %bb.0: 792; GCN-NEXT: global_load_dword v0, v0, s[2:3] 793; GCN-NEXT: s_waitcnt vmcnt(0) 794; GCN-NEXT: ; return to shader part epilog 795 %zext.offset = zext i32 %voffset to i64 796 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 797 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(3)* addrspace(1)* 798 %load = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %gep0.cast 799 %cast.load0 = ptrtoint i8 addrspace(3)* %load to i32 800 %cast.load1 = bitcast i32 %cast.load0 to <2 x half> 801 ret <2 x half> %cast.load1 802} 803 804define amdgpu_ps <2 x half> @global_load_saddr_p3_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 805; GCN-LABEL: global_load_saddr_p3_immneg128: 806; GCN: ; %bb.0: 807; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 808; GCN-NEXT: s_waitcnt vmcnt(0) 809; GCN-NEXT: ; return to shader part epilog 810 %zext.offset = zext i32 %voffset to i64 811 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 812 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 813 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(3)* addrspace(1)* 814 %load = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %gep1.cast 815 %cast.load0 = ptrtoint i8 addrspace(3)* %load to i32 816 %cast.load1 = bitcast i32 %cast.load0 to <2 x half> 817 ret <2 x half> %cast.load1 818} 819 820define amdgpu_ps <2 x float> @global_load_saddr_f64(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 821; GCN-LABEL: global_load_saddr_f64: 822; GCN: ; %bb.0: 823; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] 824; GCN-NEXT: s_waitcnt vmcnt(0) 825; GCN-NEXT: ; return to shader part epilog 826 %zext.offset = zext i32 %voffset 
to i64 827 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 828 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to double addrspace(1)* 829 %load = load double, double addrspace(1)* %gep0.cast 830 %cast.load = bitcast double %load to <2 x float> 831 ret <2 x float> %cast.load 832} 833 834define amdgpu_ps <2 x float> @global_load_saddr_f64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 835; GCN-LABEL: global_load_saddr_f64_immneg128: 836; GCN: ; %bb.0: 837; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 838; GCN-NEXT: s_waitcnt vmcnt(0) 839; GCN-NEXT: ; return to shader part epilog 840 %zext.offset = zext i32 %voffset to i64 841 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 842 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 843 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to double addrspace(1)* 844 %load = load double, double addrspace(1)* %gep1.cast 845 %cast.load = bitcast double %load to <2 x float> 846 ret <2 x float> %cast.load 847} 848 849define amdgpu_ps <2 x float> @global_load_saddr_i64(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 850; GCN-LABEL: global_load_saddr_i64: 851; GCN: ; %bb.0: 852; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] 853; GCN-NEXT: s_waitcnt vmcnt(0) 854; GCN-NEXT: ; return to shader part epilog 855 %zext.offset = zext i32 %voffset to i64 856 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 857 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* 858 %load = load i64, i64 addrspace(1)* %gep0.cast 859 %cast.load = bitcast i64 %load to <2 x float> 860 ret <2 x float> %cast.load 861} 862 863define amdgpu_ps <2 x float> @global_load_saddr_i64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 864; GCN-LABEL: global_load_saddr_i64_immneg128: 865; GCN: ; %bb.0: 866; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 867; GCN-NEXT: s_waitcnt vmcnt(0) 868; GCN-NEXT: ; return to shader 
part epilog 869 %zext.offset = zext i32 %voffset to i64 870 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 871 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 872 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)* 873 %load = load i64, i64 addrspace(1)* %gep1.cast 874 %cast.load = bitcast i64 %load to <2 x float> 875 ret <2 x float> %cast.load 876} 877 878define amdgpu_ps <2 x float> @global_load_saddr_v2f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 879; GCN-LABEL: global_load_saddr_v2f32: 880; GCN: ; %bb.0: 881; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] 882; GCN-NEXT: s_waitcnt vmcnt(0) 883; GCN-NEXT: ; return to shader part epilog 884 %zext.offset = zext i32 %voffset to i64 885 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 886 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x float> addrspace(1)* 887 %load = load <2 x float>, <2 x float> addrspace(1)* %gep0.cast 888 ret <2 x float> %load 889} 890 891define amdgpu_ps <2 x float> @global_load_saddr_v2f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 892; GCN-LABEL: global_load_saddr_v2f32_immneg128: 893; GCN: ; %bb.0: 894; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 895; GCN-NEXT: s_waitcnt vmcnt(0) 896; GCN-NEXT: ; return to shader part epilog 897 %zext.offset = zext i32 %voffset to i64 898 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 899 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 900 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x float> addrspace(1)* 901 %load = load <2 x float>, <2 x float> addrspace(1)* %gep1.cast 902 ret <2 x float> %load 903} 904 905define amdgpu_ps <2 x float> @global_load_saddr_v2i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 906; GCN-LABEL: global_load_saddr_v2i32: 907; GCN: ; %bb.0: 908; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] 909; GCN-NEXT: s_waitcnt vmcnt(0) 910; GCN-NEXT: ; 
return to shader part epilog 911 %zext.offset = zext i32 %voffset to i64 912 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 913 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i32> addrspace(1)* 914 %load = load <2 x i32>, <2 x i32> addrspace(1)* %gep0.cast 915 %cast.load = bitcast <2 x i32> %load to <2 x float> 916 ret <2 x float> %cast.load 917} 918 919define amdgpu_ps <2 x float> @global_load_saddr_v2i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 920; GCN-LABEL: global_load_saddr_v2i32_immneg128: 921; GCN: ; %bb.0: 922; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 923; GCN-NEXT: s_waitcnt vmcnt(0) 924; GCN-NEXT: ; return to shader part epilog 925 %zext.offset = zext i32 %voffset to i64 926 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 927 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 928 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i32> addrspace(1)* 929 %load = load <2 x i32>, <2 x i32> addrspace(1)* %gep1.cast 930 %cast.load = bitcast <2 x i32> %load to <2 x float> 931 ret <2 x float> %cast.load 932} 933 934define amdgpu_ps <2 x float> @global_load_saddr_v4i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 935; GCN-LABEL: global_load_saddr_v4i16: 936; GCN: ; %bb.0: 937; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] 938; GCN-NEXT: s_waitcnt vmcnt(0) 939; GCN-NEXT: ; return to shader part epilog 940 %zext.offset = zext i32 %voffset to i64 941 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 942 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i16> addrspace(1)* 943 %load = load <4 x i16>, <4 x i16> addrspace(1)* %gep0.cast 944 %cast.load = bitcast <4 x i16> %load to <2 x float> 945 ret <2 x float> %cast.load 946} 947 948define amdgpu_ps <2 x float> @global_load_saddr_v4i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 949; GCN-LABEL: global_load_saddr_v4i16_immneg128: 950; GCN: ; %bb.0: 951; 
GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 952; GCN-NEXT: s_waitcnt vmcnt(0) 953; GCN-NEXT: ; return to shader part epilog 954 %zext.offset = zext i32 %voffset to i64 955 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 956 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 957 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x i16> addrspace(1)* 958 %load = load <4 x i16>, <4 x i16> addrspace(1)* %gep1.cast 959 %cast.load = bitcast <4 x i16> %load to <2 x float> 960 ret <2 x float> %cast.load 961} 962 963define amdgpu_ps <2 x float> @global_load_saddr_v4f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 964; GCN-LABEL: global_load_saddr_v4f16: 965; GCN: ; %bb.0: 966; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] 967; GCN-NEXT: s_waitcnt vmcnt(0) 968; GCN-NEXT: ; return to shader part epilog 969 %zext.offset = zext i32 %voffset to i64 970 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 971 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x half> addrspace(1)* 972 %load = load <4 x half>, <4 x half> addrspace(1)* %gep0.cast 973 %cast.load = bitcast <4 x half> %load to <2 x float> 974 ret <2 x float> %cast.load 975} 976 977define amdgpu_ps <2 x float> @global_load_saddr_v4f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 978; GCN-LABEL: global_load_saddr_v4f16_immneg128: 979; GCN: ; %bb.0: 980; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 981; GCN-NEXT: s_waitcnt vmcnt(0) 982; GCN-NEXT: ; return to shader part epilog 983 %zext.offset = zext i32 %voffset to i64 984 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 985 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 986 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x half> addrspace(1)* 987 %load = load <4 x half>, <4 x half> addrspace(1)* %gep1.cast 988 %cast.load = bitcast <4 x half> %load to <2 x float> 989 ret <2 x float> %cast.load 990} 991 
992define amdgpu_ps <2 x float> @global_load_saddr_p1(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 993; GCN-LABEL: global_load_saddr_p1: 994; GCN: ; %bb.0: 995; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] 996; GCN-NEXT: s_waitcnt vmcnt(0) 997; GCN-NEXT: ; return to shader part epilog 998 %zext.offset = zext i32 %voffset to i64 999 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1000 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* addrspace(1)* 1001 %load = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %gep0.cast 1002 %cast.load0 = ptrtoint i8 addrspace(1)* %load to i64 1003 %cast.load1 = bitcast i64 %cast.load0 to <2 x float> 1004 ret <2 x float> %cast.load1 1005} 1006 1007define amdgpu_ps <2 x float> @global_load_saddr_p1_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1008; GCN-LABEL: global_load_saddr_p1_immneg128: 1009; GCN: ; %bb.0: 1010; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 1011; GCN-NEXT: s_waitcnt vmcnt(0) 1012; GCN-NEXT: ; return to shader part epilog 1013 %zext.offset = zext i32 %voffset to i64 1014 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1015 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 1016 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)* addrspace(1)* 1017 %load = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %gep1.cast 1018 %cast.load0 = ptrtoint i8 addrspace(1)* %load to i64 1019 %cast.load1 = bitcast i64 %cast.load0 to <2 x float> 1020 ret <2 x float> %cast.load1 1021} 1022 1023define amdgpu_ps <3 x float> @global_load_saddr_v3f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1024; GCN-LABEL: global_load_saddr_v3f32: 1025; GCN: ; %bb.0: 1026; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] 1027; GCN-NEXT: s_waitcnt vmcnt(0) 1028; GCN-NEXT: ; return to shader part epilog 1029 %zext.offset = zext i32 %voffset to i64 1030 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* 
%sbase, i64 %zext.offset 1031 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <3 x float> addrspace(1)* 1032 %load = load <3 x float>, <3 x float> addrspace(1)* %gep0.cast 1033 ret <3 x float> %load 1034} 1035 1036define amdgpu_ps <3 x float> @global_load_saddr_v3f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1037; GCN-LABEL: global_load_saddr_v3f32_immneg128: 1038; GCN: ; %bb.0: 1039; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128 1040; GCN-NEXT: s_waitcnt vmcnt(0) 1041; GCN-NEXT: ; return to shader part epilog 1042 %zext.offset = zext i32 %voffset to i64 1043 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1044 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 1045 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <3 x float> addrspace(1)* 1046 %load = load <3 x float>, <3 x float> addrspace(1)* %gep1.cast 1047 ret <3 x float> %load 1048} 1049 1050define amdgpu_ps <3 x float> @global_load_saddr_v3i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1051; GCN-LABEL: global_load_saddr_v3i32: 1052; GCN: ; %bb.0: 1053; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] 1054; GCN-NEXT: s_waitcnt vmcnt(0) 1055; GCN-NEXT: ; return to shader part epilog 1056 %zext.offset = zext i32 %voffset to i64 1057 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1058 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <3 x i32> addrspace(1)* 1059 %load = load <3 x i32>, <3 x i32> addrspace(1)* %gep0.cast 1060 %cast.load = bitcast <3 x i32> %load to <3 x float> 1061 ret <3 x float> %cast.load 1062} 1063 1064define amdgpu_ps <3 x float> @global_load_saddr_v3i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1065; GCN-LABEL: global_load_saddr_v3i32_immneg128: 1066; GCN: ; %bb.0: 1067; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128 1068; GCN-NEXT: s_waitcnt vmcnt(0) 1069; GCN-NEXT: ; return to shader part epilog 1070 %zext.offset = zext i32 %voffset to i64 1071 %gep0 = 
getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1072 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 1073 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <3 x i32> addrspace(1)* 1074 %load = load <3 x i32>, <3 x i32> addrspace(1)* %gep1.cast 1075 %cast.load = bitcast <3 x i32> %load to <3 x float> 1076 ret <3 x float> %cast.load 1077} 1078 1079define amdgpu_ps <6 x half> @global_load_saddr_v6f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1080; GCN-LABEL: global_load_saddr_v6f16: 1081; GCN: ; %bb.0: 1082; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] 1083; GCN-NEXT: s_waitcnt vmcnt(0) 1084; GCN-NEXT: ; return to shader part epilog 1085 %zext.offset = zext i32 %voffset to i64 1086 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1087 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <6 x half> addrspace(1)* 1088 %load = load <6 x half>, <6 x half> addrspace(1)* %gep0.cast 1089 ret <6 x half> %load 1090} 1091 1092define amdgpu_ps <6 x half> @global_load_saddr_v6f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1093; GCN-LABEL: global_load_saddr_v6f16_immneg128: 1094; GCN: ; %bb.0: 1095; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128 1096; GCN-NEXT: s_waitcnt vmcnt(0) 1097; GCN-NEXT: ; return to shader part epilog 1098 %zext.offset = zext i32 %voffset to i64 1099 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1100 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 1101 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <6 x half> addrspace(1)* 1102 %load = load <6 x half>, <6 x half> addrspace(1)* %gep1.cast 1103 ret <6 x half> %load 1104} 1105 1106define amdgpu_ps <4 x float> @global_load_saddr_v4f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1107; GCN-LABEL: global_load_saddr_v4f32: 1108; GCN: ; %bb.0: 1109; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] 1110; GCN-NEXT: s_waitcnt vmcnt(0) 1111; GCN-NEXT: ; return to shader 
part epilog 1112 %zext.offset = zext i32 %voffset to i64 1113 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1114 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x float> addrspace(1)* 1115 %load = load <4 x float>, <4 x float> addrspace(1)* %gep0.cast 1116 ret <4 x float> %load 1117} 1118 1119define amdgpu_ps <4 x float> @global_load_saddr_v4f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1120; GCN-LABEL: global_load_saddr_v4f32_immneg128: 1121; GCN: ; %bb.0: 1122; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128 1123; GCN-NEXT: s_waitcnt vmcnt(0) 1124; GCN-NEXT: ; return to shader part epilog 1125 %zext.offset = zext i32 %voffset to i64 1126 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1127 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 1128 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x float> addrspace(1)* 1129 %load = load <4 x float>, <4 x float> addrspace(1)* %gep1.cast 1130 ret <4 x float> %load 1131} 1132 1133define amdgpu_ps <4 x float> @global_load_saddr_v4i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1134; GCN-LABEL: global_load_saddr_v4i32: 1135; GCN: ; %bb.0: 1136; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] 1137; GCN-NEXT: s_waitcnt vmcnt(0) 1138; GCN-NEXT: ; return to shader part epilog 1139 %zext.offset = zext i32 %voffset to i64 1140 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1141 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i32> addrspace(1)* 1142 %load = load <4 x i32>, <4 x i32> addrspace(1)* %gep0.cast 1143 %cast.load = bitcast <4 x i32> %load to <4 x float> 1144 ret <4 x float> %cast.load 1145} 1146 1147define amdgpu_ps <4 x float> @global_load_saddr_v4i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1148; GCN-LABEL: global_load_saddr_v4i32_immneg128: 1149; GCN: ; %bb.0: 1150; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128 1151; GCN-NEXT: s_waitcnt 
vmcnt(0) 1152; GCN-NEXT: ; return to shader part epilog 1153 %zext.offset = zext i32 %voffset to i64 1154 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1155 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 1156 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x i32> addrspace(1)* 1157 %load = load <4 x i32>, <4 x i32> addrspace(1)* %gep1.cast 1158 %cast.load = bitcast <4 x i32> %load to <4 x float> 1159 ret <4 x float> %cast.load 1160} 1161 1162define amdgpu_ps <4 x float> @global_load_saddr_v2i64(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1163; GCN-LABEL: global_load_saddr_v2i64: 1164; GCN: ; %bb.0: 1165; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] 1166; GCN-NEXT: s_waitcnt vmcnt(0) 1167; GCN-NEXT: ; return to shader part epilog 1168 %zext.offset = zext i32 %voffset to i64 1169 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1170 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i64> addrspace(1)* 1171 %load = load <2 x i64>, <2 x i64> addrspace(1)* %gep0.cast 1172 %cast.load = bitcast <2 x i64> %load to <4 x float> 1173 ret <4 x float> %cast.load 1174} 1175 1176define amdgpu_ps <4 x float> @global_load_saddr_v2i64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1177; GCN-LABEL: global_load_saddr_v2i64_immneg128: 1178; GCN: ; %bb.0: 1179; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128 1180; GCN-NEXT: s_waitcnt vmcnt(0) 1181; GCN-NEXT: ; return to shader part epilog 1182 %zext.offset = zext i32 %voffset to i64 1183 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1184 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 1185 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i64> addrspace(1)* 1186 %load = load <2 x i64>, <2 x i64> addrspace(1)* %gep1.cast 1187 %cast.load = bitcast <2 x i64> %load to <4 x float> 1188 ret <4 x float> %cast.load 1189} 1190 1191define amdgpu_ps <4 x float> 
@global_load_saddr_i128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1192; GCN-LABEL: global_load_saddr_i128: 1193; GCN: ; %bb.0: 1194; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] 1195; GCN-NEXT: s_waitcnt vmcnt(0) 1196; GCN-NEXT: ; return to shader part epilog 1197 %zext.offset = zext i32 %voffset to i64 1198 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1199 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i128 addrspace(1)* 1200 %load = load i128, i128 addrspace(1)* %gep0.cast 1201 %cast.load = bitcast i128 %load to <4 x float> 1202 ret <4 x float> %cast.load 1203} 1204 1205define amdgpu_ps <4 x float> @global_load_saddr_i128_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1206; GCN-LABEL: global_load_saddr_i128_immneg128: 1207; GCN: ; %bb.0: 1208; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128 1209; GCN-NEXT: s_waitcnt vmcnt(0) 1210; GCN-NEXT: ; return to shader part epilog 1211 %zext.offset = zext i32 %voffset to i64 1212 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1213 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 1214 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i128 addrspace(1)* 1215 %load = load i128, i128 addrspace(1)* %gep1.cast 1216 %cast.load = bitcast i128 %load to <4 x float> 1217 ret <4 x float> %cast.load 1218} 1219 1220define amdgpu_ps <4 x float> @global_load_saddr_v2p1(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1221; GCN-LABEL: global_load_saddr_v2p1: 1222; GCN: ; %bb.0: 1223; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] 1224; GCN-NEXT: s_waitcnt vmcnt(0) 1225; GCN-NEXT: ; return to shader part epilog 1226 %zext.offset = zext i32 %voffset to i64 1227 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1228 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i8 addrspace(1)*> addrspace(1)* 1229 %load = load <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %gep0.cast 1230 %cast.load0 = 
ptrtoint <2 x i8 addrspace(1)*> %load to <2 x i64> 1231 %cast.load1 = bitcast <2 x i64> %cast.load0 to <4 x float> 1232 ret <4 x float> %cast.load1 1233} 1234 1235define amdgpu_ps <4 x float> @global_load_saddr_v2p1_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1236; GCN-LABEL: global_load_saddr_v2p1_immneg128: 1237; GCN: ; %bb.0: 1238; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128 1239; GCN-NEXT: s_waitcnt vmcnt(0) 1240; GCN-NEXT: ; return to shader part epilog 1241 %zext.offset = zext i32 %voffset to i64 1242 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1243 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 1244 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i8 addrspace(1)*> addrspace(1)* 1245 %load = load <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %gep1.cast 1246 %cast.load0 = ptrtoint <2 x i8 addrspace(1)*> %load to <2 x i64> 1247 %cast.load1 = bitcast <2 x i64> %cast.load0 to <4 x float> 1248 ret <4 x float> %cast.load1 1249} 1250 1251define amdgpu_ps <4 x float> @global_load_saddr_v4p3(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1252; GCN-LABEL: global_load_saddr_v4p3: 1253; GCN: ; %bb.0: 1254; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] 1255; GCN-NEXT: s_waitcnt vmcnt(0) 1256; GCN-NEXT: ; return to shader part epilog 1257 %zext.offset = zext i32 %voffset to i64 1258 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1259 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i8 addrspace(3)*> addrspace(1)* 1260 %load = load <4 x i8 addrspace(3)*>, <4 x i8 addrspace(3)*> addrspace(1)* %gep0.cast 1261 %cast.load0 = ptrtoint <4 x i8 addrspace(3)*> %load to <4 x i32> 1262 %cast.load1 = bitcast <4 x i32> %cast.load0 to <4 x float> 1263 ret <4 x float> %cast.load1 1264} 1265 1266define amdgpu_ps <4 x float> @global_load_saddr_v4p3_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1267; GCN-LABEL: 
global_load_saddr_v4p3_immneg128: 1268; GCN: ; %bb.0: 1269; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128 1270; GCN-NEXT: s_waitcnt vmcnt(0) 1271; GCN-NEXT: ; return to shader part epilog 1272 %zext.offset = zext i32 %voffset to i64 1273 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1274 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 1275 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x i8 addrspace(3)*> addrspace(1)* 1276 %load = load <4 x i8 addrspace(3)*>, <4 x i8 addrspace(3)*> addrspace(1)* %gep1.cast 1277 %cast.load0 = ptrtoint <4 x i8 addrspace(3)*> %load to <4 x i32> 1278 %cast.load1 = bitcast <4 x i32> %cast.load0 to <4 x float> 1279 ret <4 x float> %cast.load1 1280} 1281 1282; -------------------------------------------------------------------------------- 1283; Extending loads 1284; -------------------------------------------------------------------------------- 1285 1286define amdgpu_ps float @global_sextload_saddr_i8(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1287; GCN-LABEL: global_sextload_saddr_i8: 1288; GCN: ; %bb.0: 1289; GCN-NEXT: global_load_sbyte v0, v0, s[2:3] 1290; GCN-NEXT: s_waitcnt vmcnt(0) 1291; GCN-NEXT: ; return to shader part epilog 1292 %zext.offset = zext i32 %voffset to i64 1293 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1294 %load = load i8, i8 addrspace(1)* %gep0 1295 %sextload = sext i8 %load to i32 1296 %cast.load = bitcast i32 %sextload to float 1297 ret float %cast.load 1298} 1299 1300define amdgpu_ps float @global_sextload_saddr_i8_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1301; GCN-LABEL: global_sextload_saddr_i8_immneg128: 1302; GCN: ; %bb.0: 1303; GCN-NEXT: global_load_sbyte v0, v0, s[2:3] offset:-128 1304; GCN-NEXT: s_waitcnt vmcnt(0) 1305; GCN-NEXT: ; return to shader part epilog 1306 %zext.offset = zext i32 %voffset to i64 1307 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 
%zext.offset 1308 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 1309 %load = load i8, i8 addrspace(1)* %gep1 1310 %sextload = sext i8 %load to i32 1311 %cast.load = bitcast i32 %sextload to float 1312 ret float %cast.load 1313} 1314 1315define amdgpu_ps float @global_sextload_saddr_i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1316; GCN-LABEL: global_sextload_saddr_i16: 1317; GCN: ; %bb.0: 1318; GCN-NEXT: global_load_sshort v0, v0, s[2:3] 1319; GCN-NEXT: s_waitcnt vmcnt(0) 1320; GCN-NEXT: ; return to shader part epilog 1321 %zext.offset = zext i32 %voffset to i64 1322 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1323 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* 1324 %load = load i16, i16 addrspace(1)* %gep0.cast 1325 %sextload = sext i16 %load to i32 1326 %cast.load = bitcast i32 %sextload to float 1327 ret float %cast.load 1328} 1329 1330define amdgpu_ps float @global_sextload_saddr_i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1331; GCN-LABEL: global_sextload_saddr_i16_immneg128: 1332; GCN: ; %bb.0: 1333; GCN-NEXT: global_load_sshort v0, v0, s[2:3] offset:-128 1334; GCN-NEXT: s_waitcnt vmcnt(0) 1335; GCN-NEXT: ; return to shader part epilog 1336 %zext.offset = zext i32 %voffset to i64 1337 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1338 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 1339 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)* 1340 %load = load i16, i16 addrspace(1)* %gep1.cast 1341 %sextload = sext i16 %load to i32 1342 %cast.load = bitcast i32 %sextload to float 1343 ret float %cast.load 1344} 1345 1346define amdgpu_ps float @global_zextload_saddr_i8(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1347; GCN-LABEL: global_zextload_saddr_i8: 1348; GCN: ; %bb.0: 1349; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] 1350; GCN-NEXT: s_waitcnt vmcnt(0) 1351; GCN-NEXT: ; return to shader part epilog 
1352 %zext.offset = zext i32 %voffset to i64 1353 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1354 %load = load i8, i8 addrspace(1)* %gep0 1355 %zextload = zext i8 %load to i32 1356 %cast.load = bitcast i32 %zextload to float 1357 ret float %cast.load 1358} 1359 1360define amdgpu_ps float @global_zextload_saddr_i8_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1361; GCN-LABEL: global_zextload_saddr_i8_immneg128: 1362; GCN: ; %bb.0: 1363; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-128 1364; GCN-NEXT: s_waitcnt vmcnt(0) 1365; GCN-NEXT: ; return to shader part epilog 1366 %zext.offset = zext i32 %voffset to i64 1367 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1368 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 1369 %load = load i8, i8 addrspace(1)* %gep1 1370 %zextload = zext i8 %load to i32 1371 %cast.load = bitcast i32 %zextload to float 1372 ret float %cast.load 1373} 1374 1375define amdgpu_ps float @global_zextload_saddr_i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1376; GCN-LABEL: global_zextload_saddr_i16: 1377; GCN: ; %bb.0: 1378; GCN-NEXT: global_load_ushort v0, v0, s[2:3] 1379; GCN-NEXT: s_waitcnt vmcnt(0) 1380; GCN-NEXT: ; return to shader part epilog 1381 %zext.offset = zext i32 %voffset to i64 1382 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1383 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* 1384 %load = load i16, i16 addrspace(1)* %gep0.cast 1385 %zextload = zext i16 %load to i32 1386 %cast.load = bitcast i32 %zextload to float 1387 ret float %cast.load 1388} 1389 1390define amdgpu_ps float @global_zextload_saddr_i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1391; GCN-LABEL: global_zextload_saddr_i16_immneg128: 1392; GCN: ; %bb.0: 1393; GCN-NEXT: global_load_ushort v0, v0, s[2:3] offset:-128 1394; GCN-NEXT: s_waitcnt vmcnt(0) 1395; GCN-NEXT: ; return to shader part epilog 1396 
%zext.offset = zext i32 %voffset to i64 1397 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1398 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 1399 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)* 1400 %load = load i16, i16 addrspace(1)* %gep1.cast 1401 %zextload = zext i16 %load to i32 1402 %cast.load = bitcast i32 %zextload to float 1403 ret float %cast.load 1404} 1405 1406; -------------------------------------------------------------------------------- 1407; Atomic load 1408; -------------------------------------------------------------------------------- 1409 1410define amdgpu_ps float @atomic_global_load_saddr_i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1411; GFX9-LABEL: atomic_global_load_saddr_i32: 1412; GFX9: ; %bb.0: 1413; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1414; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc 1415; GFX9-NEXT: s_waitcnt vmcnt(0) 1416; GFX9-NEXT: buffer_wbinvl1 1417; GFX9-NEXT: ; return to shader part epilog 1418; 1419; GFX10-LABEL: atomic_global_load_saddr_i32: 1420; GFX10: ; %bb.0: 1421; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1422; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1423; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc 1424; GFX10-NEXT: s_waitcnt vmcnt(0) 1425; GFX10-NEXT: buffer_gl0_inv 1426; GFX10-NEXT: buffer_gl1_inv 1427; GFX10-NEXT: ; return to shader part epilog 1428 %zext.offset = zext i32 %voffset to i64 1429 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1430 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* 1431 %load = load atomic i32, i32 addrspace(1)* %gep0.cast seq_cst, align 4 1432 %cast.load = bitcast i32 %load to float 1433 ret float %cast.load 1434} 1435 1436define amdgpu_ps float @atomic_global_load_saddr_i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1437; GFX9-LABEL: atomic_global_load_saddr_i32_immneg128: 1438; GFX9: ; %bb.0: 1439; GFX9-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) 1440; GFX9-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 glc 1441; GFX9-NEXT: s_waitcnt vmcnt(0) 1442; GFX9-NEXT: buffer_wbinvl1 1443; GFX9-NEXT: ; return to shader part epilog 1444; 1445; GFX10-LABEL: atomic_global_load_saddr_i32_immneg128: 1446; GFX10: ; %bb.0: 1447; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1448; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1449; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 glc dlc 1450; GFX10-NEXT: s_waitcnt vmcnt(0) 1451; GFX10-NEXT: buffer_gl0_inv 1452; GFX10-NEXT: buffer_gl1_inv 1453; GFX10-NEXT: ; return to shader part epilog 1454 %zext.offset = zext i32 %voffset to i64 1455 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1456 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 1457 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)* 1458 %load = load atomic i32, i32 addrspace(1)* %gep1.cast seq_cst, align 4 1459 %cast.load = bitcast i32 %load to float 1460 ret float %cast.load 1461} 1462 1463define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1464; GFX9-LABEL: atomic_global_load_saddr_i64: 1465; GFX9: ; %bb.0: 1466; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1467; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] glc 1468; GFX9-NEXT: s_waitcnt vmcnt(0) 1469; GFX9-NEXT: buffer_wbinvl1 1470; GFX9-NEXT: ; return to shader part epilog 1471; 1472; GFX10-LABEL: atomic_global_load_saddr_i64: 1473; GFX10: ; %bb.0: 1474; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1475; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1476; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] glc dlc 1477; GFX10-NEXT: s_waitcnt vmcnt(0) 1478; GFX10-NEXT: buffer_gl0_inv 1479; GFX10-NEXT: buffer_gl1_inv 1480; GFX10-NEXT: ; return to shader part epilog 1481 %zext.offset = zext i32 %voffset to i64 1482 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1483 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i64 
addrspace(1)* 1484 %load = load atomic i64, i64 addrspace(1)* %gep0.cast seq_cst, align 8 1485 %cast.load = bitcast i64 %load to <2 x float> 1486 ret <2 x float> %cast.load 1487} 1488 1489define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1490; GFX9-LABEL: atomic_global_load_saddr_i64_immneg128: 1491; GFX9: ; %bb.0: 1492; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1493; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 glc 1494; GFX9-NEXT: s_waitcnt vmcnt(0) 1495; GFX9-NEXT: buffer_wbinvl1 1496; GFX9-NEXT: ; return to shader part epilog 1497; 1498; GFX10-LABEL: atomic_global_load_saddr_i64_immneg128: 1499; GFX10: ; %bb.0: 1500; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1501; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1502; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 glc dlc 1503; GFX10-NEXT: s_waitcnt vmcnt(0) 1504; GFX10-NEXT: buffer_gl0_inv 1505; GFX10-NEXT: buffer_gl1_inv 1506; GFX10-NEXT: ; return to shader part epilog 1507 %zext.offset = zext i32 %voffset to i64 1508 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1509 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 1510 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)* 1511 %load = load atomic i64, i64 addrspace(1)* %gep1.cast seq_cst, align 8 1512 %cast.load = bitcast i64 %load to <2 x float> 1513 ret <2 x float> %cast.load 1514} 1515 1516; -------------------------------------------------------------------------------- 1517; D16 load (low 16) 1518; -------------------------------------------------------------------------------- 1519 1520define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1521; GCN-LABEL: global_load_saddr_i16_d16lo_undef_hi: 1522; GCN: ; %bb.0: 1523; GCN-NEXT: global_load_short_d16 v0, v0, s[2:3] 1524; GCN-NEXT: s_waitcnt vmcnt(0) 1525; GCN-NEXT: ; return to shader part epilog 
1526 %zext.offset = zext i32 %voffset to i64 1527 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1528 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* 1529 %load = load i16, i16 addrspace(1)* %gep0.cast 1530 %build = insertelement <2 x i16> undef, i16 %load, i32 0 1531 %cast = bitcast <2 x i16> %build to <2 x half> 1532 ret <2 x half> %cast 1533} 1534 1535define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1536; GCN-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128: 1537; GCN: ; %bb.0: 1538; GCN-NEXT: global_load_short_d16 v0, v0, s[2:3] offset:-128 1539; GCN-NEXT: s_waitcnt vmcnt(0) 1540; GCN-NEXT: ; return to shader part epilog 1541 %zext.offset = zext i32 %voffset to i64 1542 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1543 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 1544 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)* 1545 %load = load i16, i16 addrspace(1)* %gep1.cast 1546 %build = insertelement <2 x i16> undef, i16 %load, i32 0 1547 %cast = bitcast <2 x i16> %build to <2 x half> 1548 ret <2 x half> %cast 1549} 1550 1551define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1552; GCN-LABEL: global_load_saddr_i16_d16lo_zero_hi: 1553; GCN: ; %bb.0: 1554; GCN-NEXT: v_mov_b32_e32 v1, 0 1555; GCN-NEXT: global_load_short_d16 v1, v0, s[2:3] 1556; GCN-NEXT: s_waitcnt vmcnt(0) 1557; GCN-NEXT: v_mov_b32_e32 v0, v1 1558; GCN-NEXT: ; return to shader part epilog 1559 %zext.offset = zext i32 %voffset to i64 1560 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1561 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* 1562 %load = load i16, i16 addrspace(1)* %gep0.cast 1563 %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0 1564 %cast = bitcast <2 x i16> %build to <2 x 
half> 1565 ret <2 x half> %cast 1566} 1567 1568define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1569; GCN-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128: 1570; GCN: ; %bb.0: 1571; GCN-NEXT: v_mov_b32_e32 v1, 0 1572; GCN-NEXT: global_load_short_d16 v1, v0, s[2:3] offset:-128 1573; GCN-NEXT: s_waitcnt vmcnt(0) 1574; GCN-NEXT: v_mov_b32_e32 v0, v1 1575; GCN-NEXT: ; return to shader part epilog 1576 %zext.offset = zext i32 %voffset to i64 1577 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1578 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 1579 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)* 1580 %load = load i16, i16 addrspace(1)* %gep1.cast 1581 %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0 1582 %cast = bitcast <2 x i16> %build to <2 x half> 1583 ret <2 x half> %cast 1584} 1585 1586define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { 1587; GCN-LABEL: global_load_saddr_i16_d16lo_reg_hi: 1588; GCN: ; %bb.0: 1589; GCN-NEXT: global_load_short_d16 v1, v0, s[2:3] 1590; GCN-NEXT: s_waitcnt vmcnt(0) 1591; GCN-NEXT: v_mov_b32_e32 v0, v1 1592; GCN-NEXT: ; return to shader part epilog 1593 %zext.offset = zext i32 %voffset to i64 1594 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1595 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* 1596 %load = load i16, i16 addrspace(1)* %gep0.cast 1597 %build = insertelement <2 x i16> %reg, i16 %load, i32 0 1598 %cast = bitcast <2 x i16> %build to <2 x half> 1599 ret <2 x half> %cast 1600} 1601 1602define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { 1603; GCN-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128: 1604; GCN: ; %bb.0: 1605; GCN-NEXT: global_load_short_d16 v1, 
v0, s[2:3] offset:-128 1606; GCN-NEXT: s_waitcnt vmcnt(0) 1607; GCN-NEXT: v_mov_b32_e32 v0, v1 1608; GCN-NEXT: ; return to shader part epilog 1609 %zext.offset = zext i32 %voffset to i64 1610 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1611 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 1612 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)* 1613 %load = load i16, i16 addrspace(1)* %gep1.cast 1614 %build = insertelement <2 x i16> %reg, i16 %load, i32 0 1615 %cast = bitcast <2 x i16> %build to <2 x half> 1616 ret <2 x half> %cast 1617} 1618 1619define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { 1620; GCN-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi: 1621; GCN: ; %bb.0: 1622; GCN-NEXT: global_load_ubyte_d16 v1, v0, s[2:3] 1623; GCN-NEXT: s_waitcnt vmcnt(0) 1624; GCN-NEXT: v_mov_b32_e32 v0, v1 1625; GCN-NEXT: ; return to shader part epilog 1626 %zext.offset = zext i32 %voffset to i64 1627 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1628 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* 1629 %load = load i8, i8 addrspace(1)* %gep0.cast 1630 %zext.load = zext i8 %load to i16 1631 %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 0 1632 %cast = bitcast <2 x i16> %build to <2 x half> 1633 ret <2 x half> %cast 1634} 1635 1636define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { 1637; GCN-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128: 1638; GCN: ; %bb.0: 1639; GCN-NEXT: global_load_ubyte_d16 v1, v0, s[2:3] offset:-128 1640; GCN-NEXT: s_waitcnt vmcnt(0) 1641; GCN-NEXT: v_mov_b32_e32 v0, v1 1642; GCN-NEXT: ; return to shader part epilog 1643 %zext.offset = zext i32 %voffset to i64 1644 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1645 
%gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 1646 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)* 1647 %load = load i8, i8 addrspace(1)* %gep1.cast 1648 %zext.load = zext i8 %load to i16 1649 %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 0 1650 %cast = bitcast <2 x i16> %build to <2 x half> 1651 ret <2 x half> %cast 1652} 1653 1654define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { 1655; GCN-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi: 1656; GCN: ; %bb.0: 1657; GCN-NEXT: global_load_sbyte_d16 v1, v0, s[2:3] 1658; GCN-NEXT: s_waitcnt vmcnt(0) 1659; GCN-NEXT: v_mov_b32_e32 v0, v1 1660; GCN-NEXT: ; return to shader part epilog 1661 %zext.offset = zext i32 %voffset to i64 1662 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1663 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* 1664 %load = load i8, i8 addrspace(1)* %gep0.cast 1665 %sext.load = sext i8 %load to i16 1666 %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 0 1667 %cast = bitcast <2 x i16> %build to <2 x half> 1668 ret <2 x half> %cast 1669} 1670 1671define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { 1672; GCN-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128: 1673; GCN: ; %bb.0: 1674; GCN-NEXT: global_load_sbyte_d16 v1, v0, s[2:3] offset:-128 1675; GCN-NEXT: s_waitcnt vmcnt(0) 1676; GCN-NEXT: v_mov_b32_e32 v0, v1 1677; GCN-NEXT: ; return to shader part epilog 1678 %zext.offset = zext i32 %voffset to i64 1679 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1680 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 1681 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)* 1682 %load = load i8, i8 addrspace(1)* %gep1.cast 1683 %sext.load = sext i8 %load to i16 1684 
%build = insertelement <2 x i16> %reg, i16 %sext.load, i32 0 1685 %cast = bitcast <2 x i16> %build to <2 x half> 1686 ret <2 x half> %cast 1687} 1688 1689; -------------------------------------------------------------------------------- 1690; D16 hi load (hi16) 1691; -------------------------------------------------------------------------------- 1692 1693define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1694; GCN-LABEL: global_load_saddr_i16_d16hi_undef_hi: 1695; GCN: ; %bb.0: 1696; GCN-NEXT: global_load_short_d16_hi v0, v0, s[2:3] 1697; GCN-NEXT: s_waitcnt vmcnt(0) 1698; GCN-NEXT: ; return to shader part epilog 1699 %zext.offset = zext i32 %voffset to i64 1700 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1701 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* 1702 %load = load i16, i16 addrspace(1)* %gep0.cast 1703 %build = insertelement <2 x i16> undef, i16 %load, i32 1 1704 %cast = bitcast <2 x i16> %build to <2 x half> 1705 ret <2 x half> %cast 1706} 1707 1708define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1709; GCN-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128: 1710; GCN: ; %bb.0: 1711; GCN-NEXT: global_load_short_d16_hi v0, v0, s[2:3] offset:-128 1712; GCN-NEXT: s_waitcnt vmcnt(0) 1713; GCN-NEXT: ; return to shader part epilog 1714 %zext.offset = zext i32 %voffset to i64 1715 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1716 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 1717 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)* 1718 %load = load i16, i16 addrspace(1)* %gep1.cast 1719 %build = insertelement <2 x i16> undef, i16 %load, i32 1 1720 %cast = bitcast <2 x i16> %build to <2 x half> 1721 ret <2 x half> %cast 1722} 1723 1724define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi(i8 
addrspace(1)* inreg %sbase, i32 %voffset) { 1725; GCN-LABEL: global_load_saddr_i16_d16hi_zero_hi: 1726; GCN: ; %bb.0: 1727; GCN-NEXT: v_mov_b32_e32 v1, 0 1728; GCN-NEXT: global_load_short_d16_hi v1, v0, s[2:3] 1729; GCN-NEXT: s_waitcnt vmcnt(0) 1730; GCN-NEXT: v_mov_b32_e32 v0, v1 1731; GCN-NEXT: ; return to shader part epilog 1732 %zext.offset = zext i32 %voffset to i64 1733 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1734 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* 1735 %load = load i16, i16 addrspace(1)* %gep0.cast 1736 %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1 1737 %cast = bitcast <2 x i16> %build to <2 x half> 1738 ret <2 x half> %cast 1739} 1740 1741define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1742; GCN-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128: 1743; GCN: ; %bb.0: 1744; GCN-NEXT: v_mov_b32_e32 v1, 0 1745; GCN-NEXT: global_load_short_d16_hi v1, v0, s[2:3] offset:-128 1746; GCN-NEXT: s_waitcnt vmcnt(0) 1747; GCN-NEXT: v_mov_b32_e32 v0, v1 1748; GCN-NEXT: ; return to shader part epilog 1749 %zext.offset = zext i32 %voffset to i64 1750 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1751 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 1752 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)* 1753 %load = load i16, i16 addrspace(1)* %gep1.cast 1754 %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1 1755 %cast = bitcast <2 x i16> %build to <2 x half> 1756 ret <2 x half> %cast 1757} 1758 1759define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { 1760; GCN-LABEL: global_load_saddr_i16_d16hi_reg_hi: 1761; GCN: ; %bb.0: 1762; GCN-NEXT: global_load_short_d16_hi v1, v0, s[2:3] 1763; GCN-NEXT: s_waitcnt vmcnt(0) 1764; GCN-NEXT: v_mov_b32_e32 v0, v1 1765; 
GCN-NEXT: ; return to shader part epilog 1766 %zext.offset = zext i32 %voffset to i64 1767 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1768 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* 1769 %load = load i16, i16 addrspace(1)* %gep0.cast 1770 %build = insertelement <2 x i16> %reg, i16 %load, i32 1 1771 %cast = bitcast <2 x i16> %build to <2 x half> 1772 ret <2 x half> %cast 1773} 1774 1775define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { 1776; GCN-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128: 1777; GCN: ; %bb.0: 1778; GCN-NEXT: global_load_short_d16_hi v1, v0, s[2:3] offset:-128 1779; GCN-NEXT: s_waitcnt vmcnt(0) 1780; GCN-NEXT: v_mov_b32_e32 v0, v1 1781; GCN-NEXT: ; return to shader part epilog 1782 %zext.offset = zext i32 %voffset to i64 1783 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1784 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 1785 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)* 1786 %load = load i16, i16 addrspace(1)* %gep1.cast 1787 %build = insertelement <2 x i16> %reg, i16 %load, i32 1 1788 %cast = bitcast <2 x i16> %build to <2 x half> 1789 ret <2 x half> %cast 1790} 1791 1792define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { 1793; GCN-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi: 1794; GCN: ; %bb.0: 1795; GCN-NEXT: global_load_ubyte_d16_hi v1, v0, s[2:3] 1796; GCN-NEXT: s_waitcnt vmcnt(0) 1797; GCN-NEXT: v_mov_b32_e32 v0, v1 1798; GCN-NEXT: ; return to shader part epilog 1799 %zext.offset = zext i32 %voffset to i64 1800 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1801 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* 1802 %load = load i8, i8 addrspace(1)* %gep0.cast 1803 %zext.load = zext i8 %load to 
i16 1804 %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 1 1805 %cast = bitcast <2 x i16> %build to <2 x half> 1806 ret <2 x half> %cast 1807} 1808 1809define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { 1810; GCN-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128: 1811; GCN: ; %bb.0: 1812; GCN-NEXT: global_load_ubyte_d16_hi v1, v0, s[2:3] offset:-128 1813; GCN-NEXT: s_waitcnt vmcnt(0) 1814; GCN-NEXT: v_mov_b32_e32 v0, v1 1815; GCN-NEXT: ; return to shader part epilog 1816 %zext.offset = zext i32 %voffset to i64 1817 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1818 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 1819 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)* 1820 %load = load i8, i8 addrspace(1)* %gep1.cast 1821 %zext.load = zext i8 %load to i16 1822 %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 1 1823 %cast = bitcast <2 x i16> %build to <2 x half> 1824 ret <2 x half> %cast 1825} 1826 1827define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { 1828; GCN-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi: 1829; GCN: ; %bb.0: 1830; GCN-NEXT: global_load_sbyte_d16_hi v1, v0, s[2:3] 1831; GCN-NEXT: s_waitcnt vmcnt(0) 1832; GCN-NEXT: v_mov_b32_e32 v0, v1 1833; GCN-NEXT: ; return to shader part epilog 1834 %zext.offset = zext i32 %voffset to i64 1835 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1836 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* 1837 %load = load i8, i8 addrspace(1)* %gep0.cast 1838 %sext.load = sext i8 %load to i16 1839 %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 1 1840 %cast = bitcast <2 x i16> %build to <2 x half> 1841 ret <2 x half> %cast 1842} 1843 1844define amdgpu_ps <2 x half> 
@global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { 1845; GCN-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: 1846; GCN: ; %bb.0: 1847; GCN-NEXT: global_load_sbyte_d16_hi v1, v0, s[2:3] offset:-128 1848; GCN-NEXT: s_waitcnt vmcnt(0) 1849; GCN-NEXT: v_mov_b32_e32 v0, v1 1850; GCN-NEXT: ; return to shader part epilog 1851 %zext.offset = zext i32 %voffset to i64 1852 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1853 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 1854 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)* 1855 %load = load i8, i8 addrspace(1)* %gep1.cast 1856 %sext.load = sext i8 %load to i16 1857 %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 1 1858 %cast = bitcast <2 x i16> %build to <2 x half> 1859 ret <2 x half> %cast 1860} 1861 1862!0 = !{i32 0, i32 1073741824} ; (1 << 30) 1863!1 = !{i32 0, i32 1073741825} ; (1 << 30) + 1 1864