; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX900 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX90A %s

; Every kernel in this file issues several global loads whose addresses differ
; from a shared base pointer only by compile-time constants. The checks pin
; which load instruction is selected on each target and, for the gfx9 and gfx10
; runs, the immediate offset carried by each load.

declare i64 @_Z13get_global_idj(i32)

; Straight-line case: eight i64 loads from %addr1 at element offsets
; 0, 256, ..., 1792 (a 2048-byte stride), summed and stored to %saddr.
define amdgpu_kernel void @clmem_read_simplified(i8 addrspace(1)* %buffer) {
; GCN-LABEL: clmem_read_simplified:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
;
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}

entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 256
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8
  %add.1 = add i64 %load2, %load1

  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 512
  %load3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
  %add.2 = add i64 %load3, %add.1
  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 768
  %load4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add.3 = add i64 %load4, %add.2

  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1024
  %load5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add.4 = add i64 %load5, %add.3
  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1280
  %load6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
  %add.5 = add i64 %load6, %add.4

  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1536
  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add.6 = add i64 %load7, %add.5
  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1792
  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add.7 = add i64 %load8, %add.6

  store i64 %add.7, i64 addrspace(1)* %saddr, align 8
  ret void
}

; Loop case: each pass through for.body performs eleven i64 loads from
; %add.ptr6, indexed by %block.029 OR'ed with 0, 256, ..., 2560. %block.029
; advances by 8192 per iteration while below 4194304, and the outer loop
; re-enters for.cond.preheader until %dec31 (starting at 127) reaches zero.
; The gfx900 and gfx90a runs are checked with separate prefixes here.
define hidden amdgpu_kernel void @clmem_read(i8 addrspace(1)* %buffer) {
; GCN-LABEL: clmem_read:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX900: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX900: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX900: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX900: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX900: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX900: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX900: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX900: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX900: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX900: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX900: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
;
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}

; GFX90A: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX90A: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX90A: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX90A: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX90A: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX90A: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX90A: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX90A: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX90A: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX90A: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX90A: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}

entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 17
  %idx.ext11 = and i64 %a0, 4261412864
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %a1 = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*
  %add.ptr6 = getelementptr inbounds i64, i64 addrspace(1)* %a1, i64 %conv
  br label %for.cond.preheader

while.cond.loopexit:                              ; preds = %for.body
  %dec = add nsw i32 %dec31, -1
  %tobool = icmp eq i32 %dec31, 0
  br i1 %tobool, label %while.end, label %for.cond.preheader

for.cond.preheader:                               ; preds = %entry, %while.cond.loopexit
  %dec31 = phi i32 [ 127, %entry ], [ %dec, %while.cond.loopexit ]
  %sum.030 = phi i64 [ 0, %entry ], [ %add.10, %while.cond.loopexit ]
  br label %for.body

for.body:                                         ; preds = %for.body, %for.cond.preheader
  %block.029 = phi i32 [ 0, %for.cond.preheader ], [ %add9.31, %for.body ]
  %sum.128 = phi i64 [ %sum.030, %for.cond.preheader ], [ %add.10, %for.body ]
  %conv3 = zext i32 %block.029 to i64
  %add.ptr8 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3
  %load1 = load i64, i64 addrspace(1)* %add.ptr8, align 8
  %add = add i64 %load1, %sum.128

  %add9 = or i32 %block.029, 256
  %conv3.1 = zext i32 %add9 to i64
  %add.ptr8.1 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.1
  %load2 = load i64, i64 addrspace(1)* %add.ptr8.1, align 8
  %add.1 = add i64 %load2, %add

  %add9.1 = or i32 %block.029, 512
  %conv3.2 = zext i32 %add9.1 to i64
  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.2
  %l3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
  %add.2 = add i64 %l3, %add.1

  %add9.2 = or i32 %block.029, 768
  %conv3.3 = zext i32 %add9.2 to i64
  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.3
  %l4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add.3 = add i64 %l4, %add.2

  %add9.3 = or i32 %block.029, 1024
  %conv3.4 = zext i32 %add9.3 to i64
  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.4
  %l5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add.4 = add i64 %l5, %add.3

  %add9.4 = or i32 %block.029, 1280
  %conv3.5 = zext i32 %add9.4 to i64
  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.5
  %l6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
  %add.5 = add i64 %l6, %add.4

  %add9.5 = or i32 %block.029, 1536
  %conv3.6 = zext i32 %add9.5 to i64
  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.6
  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add.6 = add i64 %load7, %add.5

  %add9.6 = or i32 %block.029, 1792
  %conv3.7 = zext i32 %add9.6 to i64
  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.7
  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add.7 = add i64 %load8, %add.6

  %add9.7 = or i32 %block.029, 2048
  %conv3.8 = zext i32 %add9.7 to i64
  %add.ptr8.8 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.8
  %load9 = load i64, i64 addrspace(1)* %add.ptr8.8, align 8
  %add.8 = add i64 %load9, %add.7

  %add9.8 = or i32 %block.029, 2304
  %conv3.9 = zext i32 %add9.8 to i64
  %add.ptr8.9 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.9
  %load10 = load i64, i64 addrspace(1)* %add.ptr8.9, align 8
  %add.9 = add i64 %load10, %add.8

  %add9.9 = or i32 %block.029, 2560
  %conv3.10 = zext i32 %add9.9 to i64
  %add.ptr8.10 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.10
  %load11 = load i64, i64 addrspace(1)* %add.ptr8.10, align 8
  %add.10 = add i64 %load11, %add.9

  %add9.31 = add nuw nsw i32 %block.029, 8192
  %cmp.31 = icmp ult i32 %add9.31, 4194304
  br i1 %cmp.31, label %for.body, label %while.cond.loopexit

while.end:                                        ; preds = %while.cond.loopexit
  store i64 %add.10, i64 addrspace(1)* %a1, align 8
  ret void
}

; using 32bit address.
; Ten i32 loads from %add.ptr6 at element offsets 0, 256, ..., 2304
; (a 1024-byte stride), summed and stored to %addr.
define amdgpu_kernel void @Address32(i8 addrspace(1)* %buffer) {
; GCN-LABEL: Address32:
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
;
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %id = shl i64 %call, 7
  %idx.ext11 = and i64 %id, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %addr = bitcast i8 addrspace(1)* %add.ptr12 to i32 addrspace(1)*

  %add.ptr6 = getelementptr inbounds i32, i32 addrspace(1)* %addr, i64 %conv
  %load1 = load i32, i32 addrspace(1)* %add.ptr6, align 4

  %add.ptr8.1 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 256
  %load2 = load i32, i32 addrspace(1)* %add.ptr8.1, align 4
  %add.1 = add i32 %load2, %load1

  %add.ptr8.2 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 512
  %load3 = load i32, i32 addrspace(1)* %add.ptr8.2, align 4
  %add.2 = add i32 %load3, %add.1

  %add.ptr8.3 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 768
  %load4 = load i32, i32 addrspace(1)* %add.ptr8.3, align 4
  %add.3 = add i32 %load4, %add.2

  %add.ptr8.4 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1024
  %load5 = load i32, i32 addrspace(1)* %add.ptr8.4, align 4
  %add.4 = add i32 %load5, %add.3

  %add.ptr8.5 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1280
  %load6 = load i32, i32 addrspace(1)* %add.ptr8.5, align 4
  %add.5 = add i32 %load6, %add.4

  %add.ptr8.6 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1536
  %load7 = load i32, i32 addrspace(1)* %add.ptr8.6, align 4
  %add.6 = add i32 %load7, %add.5

  %add.ptr8.7 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1792
  %load8 = load i32, i32 addrspace(1)* %add.ptr8.7, align 4
  %add.7 = add i32 %load8, %add.6

  %add.ptr8.8 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 2048
  %load9 = load i32, i32 addrspace(1)* %add.ptr8.8, align 4
  %add.8 = add i32 %load9, %add.7

  %add.ptr8.9 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 2304
  %load10 = load i32, i32 addrspace(1)* %add.ptr8.9, align 4
  %add.9 = add i32 %load10, %add.8

  store i32 %add.9, i32 addrspace(1)* %addr, align 4
  ret void
}

; Four i64 loads from %addr1: one at offset 0 and three at element offsets
; 536870400, 536870656 and 536870912, i.e. byte offsets 4294963200,
; 4294965248 and 4294967296 (the last is exactly 2^32).
define amdgpu_kernel void @Offset64(i8 addrspace(1)* %buffer) {
; GCN-LABEL: Offset64:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
;
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8

  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870400
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8

  %add1 = add i64 %load2, %load1

  %addr3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870656
  %load3 = load i64, i64 addrspace(1)* %addr3, align 8

  %add2 = add i64 %load3, %add1

  %addr4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870912
  %load4 = load i64, i64 addrspace(1)* %addr4, align 8
  %add4 = add i64 %load4, %add2

  store i64 %add4, i64 addrspace(1)* %saddr, align 8
  ret void
}

; TODO: Support load4 as anchor instruction.
; Four i32 loads from %addr1: one at offset 0 and three at element offsets
; 536870400, 536870656 and 536870912, i.e. byte offsets 2147481600,
; 2147482624 and 2147483648 (the last is exactly 2^31).
define amdgpu_kernel void @p32Offset64(i8 addrspace(1)* %buffer) {
; GCN-LABEL: p32Offset64:
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
;
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i32 addrspace(1)*

  %addr1 = getelementptr inbounds i32, i32 addrspace(1)* %saddr, i64 %conv
  %load1 = load i32, i32 addrspace(1)* %addr1, align 8

  %addr2 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870400
  %load2 = load i32, i32 addrspace(1)* %addr2, align 8

  %add1 = add i32 %load2, %load1

  %addr3 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870656
  %load3 = load i32, i32 addrspace(1)* %addr3, align 8

  %add2 = add i32 %load3, %add1

  %addr4 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870912
  %load4 = load i32, i32 addrspace(1)* %addr4, align 8
  %add4 = add i32 %load4, %add2

  store i32 %add4, i32 addrspace(1)* %saddr, align 8
  ret void
}

; Two independent bases: three i64 loads from %saddr (derived from %buffer1)
; at element offsets 512, 768 and 1024, and three from %saddr2 (derived from
; %buffer2) at element offsets 1280, 1536 and 1792.
define amdgpu_kernel void @DiffBase(i8 addrspace(1)* %buffer1, i8 addrspace(1)* %buffer2) {
; GCN-LABEL: DiffBase:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
;
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer1, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %buffer2, i64 %idx.ext11
  %saddr2 = bitcast i8 addrspace(1)* %add.ptr2 to i64 addrspace(1)*

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 512
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 768
  %load2 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add1 = add i64 %load2, %load1
  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 1024
  %load3 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add2 = add i64 %load3, %add1

  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1280
  %load4 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8

  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1536
  %load5 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add3 = add i64 %load5, %load4

  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1792
  %load6 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add4 = add i64 %load6, %add3

  %add5 = add i64 %add2, %add4

  store i64 %add5, i64 addrspace(1)* %saddr, align 8
  ret void
}

; Eight i64 loads from %addr1: offset 0 first, then element offsets 1792,
; 1536, ..., 256 in descending order.
define amdgpu_kernel void @ReverseOrder(i8 addrspace(1)* %buffer) {
; GCN-LABEL: ReverseOrder:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
;
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8

  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1792
  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add7 = add i64 %load8, %load1

  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1536
  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add6 = add i64 %load7, %add7

  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1280
  %load6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
  %add5 = add i64 %load6, %add6

  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1024
  %load5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add4 = add i64 %load5, %add5

  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 768
  %load4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add3 = add i64 %load4, %add4

  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 512
  %load3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
  %add2 = add i64 %load3, %add3

  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 256
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8
  %add1 = add i64 %load2, %add2

  store i64 %add1, i64 addrspace(1)* %saddr, align 8
  ret void
}

; Two i64 loads from %buffer_wave at negative element offsets -536870656 and
; -536870912, i.e. byte offsets -4294965248 and -4294967296 (the latter is
; exactly -2^32).
define hidden amdgpu_kernel void @negativeoffset(i8 addrspace(1)* nocapture %buffer) {
; GCN-LABEL: negativeoffset:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
;
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0) #2
  %conv = and i64 %call, 255
  %0 = shl i64 %call, 7
  %idx.ext11 = and i64 %0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %buffer_head = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %buffer_wave = getelementptr inbounds i64, i64 addrspace(1)* %buffer_head, i64 %conv

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %buffer_wave, i64 -536870656
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8

  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %buffer_wave, i64 -536870912
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8


  %add = add i64 %load2, %load1

  store i64 %add, i64 addrspace(1)* %buffer_head, align 8
  ret void
}