; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9 %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9 %s

; Checks the code llc emits for calls to the llvm.amdgcn.ds.consume intrinsic
; on four subtargets (tahiti, bonaire, tonga, gfx900). Checks shared by every
; subtarget use the GCN prefix. The SI prefix is enabled for tahiti alone,
; CIPLUS for the other three, GFX9 for gfx900 alone, and NOTGFX9 for every
; subtarget except gfx900.

; Base case: the addrspace(3) pointer is a kernel argument. The checks expect
; it to be loaded into an SGPR, copied to m0, and consumed with no offset
; modifier. No buffer_wbinvl1 may appear before the store of the result.
; GCN-LABEL: {{^}}ds_consume_lds:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_consume [[RESULT:v[0-9]+]]{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_consume_lds(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
  %val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %lds, i1 false)
  store i32 %val, i32 addrspace(1)* %out
  ret void
}

; Element index 16383 of an i32 pointer is byte offset 65532. The checks
; expect that constant to be folded into the instruction's offset modifier on
; every subtarget.
; GCN-LABEL: {{^}}ds_consume_lds_max_offset:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_consume [[RESULT:v[0-9]+]] offset:65532{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_consume_lds_max_offset(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16383
  %val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %gep, i1 false)
  store i32 %val, i32 addrspace(1)* %out
  ret void
}

; Here the addrspace(3) pointer is itself loaded from memory instead of being
; passed directly as a kernel argument. Tahiti is expected to add the 16-byte
; offset to the pointer with s_add_i32 and emit no offset modifier. The other
; subtargets are expected to fold it as offset:16.
; NOTE(review): the reason tahiti does not fold the offset in this case is not
; visible in this file -- confirm against the backend before relying on it.
; GCN-LABEL: {{^}}ds_consume_no_fold_offset_si:
; GCN: s_load_dword [[PTR:s[0-9]+]]

; SI: s_add_i32 [[PTR]], [[PTR]], 16
; SI: s_mov_b32 m0, [[PTR]]
; SI: ds_consume [[RESULT:v[0-9]+]]{{$}}

; CIPLUS: s_mov_b32 m0, [[PTR]]
; CIPLUS: ds_consume [[RESULT:v[0-9]+]] offset:16{{$}}

; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_consume_no_fold_offset_si(i32 addrspace(3)* addrspace(4)* %lds.ptr, i32 addrspace(1)* %out) #0 {
  %lds = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* %lds.ptr, align 4
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 4
  %val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %gep, i1 false)
  store i32 %val, i32 addrspace(1)* %out
  ret void
}

; Element index 16384 is byte offset 65536 (0x10000), one element past the
; previous maximum-offset case. The checks expect no offset modifier here.
; Instead the constant is applied to the pointer before it is copied to m0:
; with s_bitset1_b32 and operand 16 on tahiti, and with s_add_i32 and 0x10000
; on the other subtargets.
; GCN-LABEL: {{^}}ds_consume_lds_over_max_offset:
; GCN: s_load_dword [[PTR:s[0-9]+]]

; SI: s_bitset1_b32 [[PTR]], 16
; CIPLUS: s_add_i32 [[PTR]], [[PTR]], 0x10000

; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_consume [[RESULT:v[0-9]+]]{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_consume_lds_over_max_offset(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16384
  %val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %gep, i1 false)
  store i32 %val, i32 addrspace(1)* %out
  ret void
}

; This function is not an amdgpu_kernel. The checks expect the pointer to
; arrive in v0 and to be moved into an SGPR with v_readfirstlane_b32 before it
; is written to m0.
; GCN-LABEL: {{^}}ds_consume_lds_vgpr_addr:
; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
; GCN: s_mov_b32 m0, [[READLANE]]
; GCN: ds_consume [[RESULT:v[0-9]+]]{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
define void @ds_consume_lds_vgpr_addr(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
  %val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %lds, i1 false)
  store i32 %val, i32 addrspace(1)* %out
  ret void
}

; Same shape as the base case but with an addrspace(2) pointer. The
; instruction is expected to carry the gds modifier.
; GCN-LABEL: {{^}}ds_consume_gds:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_consume [[RESULT:v[0-9]+]] gds{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_consume_gds(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
  %val = call i32 @llvm.amdgcn.ds.consume.p2i32(i32 addrspace(2)* %gds, i1 false)
  store i32 %val, i32 addrspace(1)* %out
  ret void
}

; addrspace(2) variant of the maximum-offset case: byte offset 65532 is
; expected to be folded, with the gds modifier following the offset.
; GCN-LABEL: {{^}}ds_consume_gds_max_offset:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_consume [[RESULT:v[0-9]+]] offset:65532 gds{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_consume_gds_max_offset(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
  %gep = getelementptr inbounds i32, i32 addrspace(2)* %gds, i32 16383
  %val = call i32 @llvm.amdgcn.ds.consume.p2i32(i32 addrspace(2)* %gep, i1 false)
  store i32 %val, i32 addrspace(1)* %out
  ret void
}

; addrspace(2) variant of the over-maximum-offset case. Only the absence of
; buffer_wbinvl1 is checked; the addressing sequence itself is not pinned.
; GCN-LABEL: {{^}}ds_consume_gds_over_max_offset:
; GCN-NOT: buffer_wbinvl1
define amdgpu_kernel void @ds_consume_gds_over_max_offset(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
  %gep = getelementptr inbounds i32, i32 addrspace(2)* %gds, i32 16384
  %val = call i32 @llvm.amdgcn.ds.consume.p2i32(i32 addrspace(2)* %gep, i1 false)
  store i32 %val, i32 addrspace(1)* %out
  ret void
}

; A volatile addrspace(3) load follows the consume. Subtargets other than
; gfx900 are expected to set m0 to -1 after the ds_consume. On gfx900 the
; checks require that m0 is not mentioned between the ds_consume and the
; store.
; GCN-LABEL: {{^}}ds_consume_lds_m0_restore:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_consume [[RESULT:v[0-9]+]]{{$}}
; GCN-NOT: buffer_wbinvl1
; NOTGFX9: s_mov_b32 m0, -1
; GFX9-NOT: m0
; GCN: _store_dword
; GCN: ds_read_b32
define amdgpu_kernel void @ds_consume_lds_m0_restore(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
  %val0 = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %lds, i1 false)
  store i32 %val0, i32 addrspace(1)* %out
  %val1 = load volatile i32, i32 addrspace(3)* %lds
  ret void
}

; Make sure this selects successfully with no use. The result register needs to be constrained.
; The value returned by the intrinsic call is never used in this function.
; The checks still expect ds_consume to be emitted with a VGPR destination and
; with byte offset 65532 (element index 16383 of an i32 pointer) folded into
; the offset modifier.
; GCN-LABEL: {{^}}ds_consume_lds_no_use:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_consume [[RESULT:v[0-9]+]] offset:65532{{$}}
define amdgpu_kernel void @ds_consume_lds_no_use(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16383
  %val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %gep, i1 false)
  ret void
}

; Intrinsic declarations: the p3i32 overload takes an addrspace(3) pointer and
; the p2i32 overload an addrspace(2) pointer. Both take an immediate i1 as the
; second operand.
declare i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* nocapture, i1 immarg) #1
declare i32 @llvm.amdgcn.ds.consume.p2i32(i32 addrspace(2)* nocapture, i1 immarg) #1

; Attribute group #0 is applied to every test function; #1 to the intrinsic
; declarations.
attributes #0 = { nounwind }
attributes #1 = { argmemonly convergent nounwind }