; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s

; OpenCL-mangled work-item id query; only the declaration is needed here.
declare i64 @_Z13get_global_idj(i32)

; Eight i64 loads at consecutive 256-element (2048-byte) strides from a common
; base pointer.  The GFX9 checks expect the constant offsets to be folded into
; the immediate offset field of global_load_dwordx2; GFX8 has no immediate
; offset on flat loads, so only plain flat_load_dwordx2 is expected there.
define amdgpu_kernel void @clmem_read_simplified(i8 addrspace(1)* %buffer) {
; GCN-LABEL: clmem_read_simplified:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  ; Keep only the low 8 bits of the id as a per-lane element index.
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  ; %addr1 is the shared (variable) base; all later accesses differ from it
  ; only by a constant multiple of 256 i64 elements.
  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 256
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8
  %add.1 = add i64 %load2, %load1

  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 512
  %load3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
  %add.2 = add i64 %load3, %add.1
  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 768
  %load4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add.3 = add i64 %load4, %add.2

  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1024
  %load5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add.4 = add i64 %load5, %add.3
  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1280
  %load6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
  %add.5 = add i64 %load6, %add.4

  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1536
  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add.6 = add i64 %load7, %add.5
  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1792
  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add.7 = add i64 %load8, %add.6

  ; Sum of all eight loads keeps every load live.
  store i64 %add.7, i64 addrspace(1)* %saddr, align 8
  ret void
}

; Same constant-stride pattern inside a doubly nested loop: eleven strided
; i64 loads per inner iteration, all addressed relative to %add.ptr6.
define hidden amdgpu_kernel void @clmem_read(i8 addrspace(1)* %buffer) {
; GCN-LABEL: clmem_read:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 17
  %idx.ext11 = and i64 %a0, 4261412864
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %a1 = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*
  %add.ptr6 = getelementptr inbounds i64, i64 addrspace(1)* %a1, i64 %conv
  br label %for.cond.preheader

while.cond.loopexit:                              ; preds = %for.body
  %dec = add nsw i32 %dec31, -1
  %tobool = icmp eq i32 %dec31, 0
  br i1 %tobool, label %while.end, label %for.cond.preheader

for.cond.preheader:                               ; preds = %entry, %while.cond.loopexit
  ; Outer loop runs 128 times (127 down to 0).
  %dec31 = phi i32 [ 127, %entry ], [ %dec, %while.cond.loopexit ]
  %sum.030 = phi i64 [ 0, %entry ], [ %add.10, %while.cond.loopexit ]
  br label %for.body

for.body:                                         ; preds = %for.body, %for.cond.preheader
  ; %block.029 advances by 8192 per iteration; the per-load offsets below are
  ; formed with `or`, which is equivalent to add here since the low bits of
  ; %block.029 stay clear.
  %block.029 = phi i32 [ 0, %for.cond.preheader ], [ %add9.31, %for.body ]
  %sum.128 = phi i64 [ %sum.030, %for.cond.preheader ], [ %add.10, %for.body ]
  %conv3 = zext i32 %block.029 to i64
  %add.ptr8 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3
  %load1 = load i64, i64 addrspace(1)* %add.ptr8, align 8
  %add = add i64 %load1, %sum.128

  %add9 = or i32 %block.029, 256
  %conv3.1 = zext i32 %add9 to i64
  %add.ptr8.1 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.1
  %load2 = load i64, i64 addrspace(1)* %add.ptr8.1, align 8
  %add.1 = add i64 %load2, %add

  %add9.1 = or i32 %block.029, 512
  %conv3.2 = zext i32 %add9.1 to i64
  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.2
  %l3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
  %add.2 = add i64 %l3, %add.1

  %add9.2 = or i32 %block.029, 768
  %conv3.3 = zext i32 %add9.2 to i64
  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.3
  %l4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add.3 = add i64 %l4, %add.2

  %add9.3 = or i32 %block.029, 1024
  %conv3.4 = zext i32 %add9.3 to i64
  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.4
  %l5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add.4 = add i64 %l5, %add.3

  %add9.4 = or i32 %block.029, 1280
  %conv3.5 = zext i32 %add9.4 to i64
  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.5
  %l6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
  %add.5 = add i64 %l6, %add.4

  %add9.5 = or i32 %block.029, 1536
  %conv3.6 = zext i32 %add9.5 to i64
  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.6
  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add.6 = add i64 %load7, %add.5

  %add9.6 = or i32 %block.029, 1792
  %conv3.7 = zext i32 %add9.6 to i64
  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.7
  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add.7 = add i64 %load8, %add.6

  %add9.7 = or i32 %block.029, 2048
  %conv3.8 = zext i32 %add9.7 to i64
  %add.ptr8.8 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.8
  %load9 = load i64, i64 addrspace(1)* %add.ptr8.8, align 8
  %add.8 = add i64 %load9, %add.7

  %add9.8 = or i32 %block.029, 2304
  %conv3.9 = zext i32 %add9.8 to i64
  %add.ptr8.9 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.9
  %load10 = load i64, i64 addrspace(1)* %add.ptr8.9, align 8
  %add.9 = add i64 %load10, %add.8

  %add9.9 = or i32 %block.029, 2560
  %conv3.10 = zext i32 %add9.9 to i64
  %add.ptr8.10 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.10
  %load11 = load i64, i64 addrspace(1)* %add.ptr8.10, align 8
  %add.10 = add i64 %load11, %add.9

  %add9.31 = add nuw nsw i32 %block.029, 8192
  %cmp.31 = icmp ult i32 %add9.31, 4194304
  br i1 %cmp.31, label %for.body, label %while.cond.loopexit

while.end:                                        ; preds = %while.cond.loopexit
  store i64 %add.10, i64 addrspace(1)* %a1, align 8
  ret void
}

; using 32bit address.
; i32-element variant: ten dword loads at 256-element (1024-byte) strides.
; GFX9 is expected to fold the constant strides into global_load_dword
; immediate offsets; GFX8 uses plain flat_load_dword.
define amdgpu_kernel void @Address32(i8 addrspace(1)* %buffer) {
; GCN-LABEL: Address32:
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-3072
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-1024
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %id = shl i64 %call, 7
  %idx.ext11 = and i64 %id, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %addr = bitcast i8 addrspace(1)* %add.ptr12 to i32 addrspace(1)*

  ; %add.ptr6 is the shared variable base for all ten loads below.
  %add.ptr6 = getelementptr inbounds i32, i32 addrspace(1)* %addr, i64 %conv
  %load1 = load i32, i32 addrspace(1)* %add.ptr6, align 4

  %add.ptr8.1 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 256
  %load2 = load i32, i32 addrspace(1)* %add.ptr8.1, align 4
  %add.1 = add i32 %load2, %load1

  %add.ptr8.2 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 512
  %load3 = load i32, i32 addrspace(1)* %add.ptr8.2, align 4
  %add.2 = add i32 %load3, %add.1

  %add.ptr8.3 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 768
  %load4 = load i32, i32 addrspace(1)* %add.ptr8.3, align 4
  %add.3 = add i32 %load4, %add.2

  %add.ptr8.4 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1024
  %load5 = load i32, i32 addrspace(1)* %add.ptr8.4, align 4
  %add.4 = add i32 %load5, %add.3

  %add.ptr8.5 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1280
  %load6 = load i32, i32 addrspace(1)* %add.ptr8.5, align 4
  %add.5 = add i32 %load6, %add.4

  %add.ptr8.6 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1536
  %load7 = load i32, i32 addrspace(1)* %add.ptr8.6, align 4
  %add.6 = add i32 %load7, %add.5

  %add.ptr8.7 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1792
  %load8 = load i32, i32 addrspace(1)* %add.ptr8.7, align 4
  %add.7 = add i32 %load8, %add.6

  %add.ptr8.8 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 2048
  %load9 = load i32, i32 addrspace(1)* %add.ptr8.8, align 4
  %add.8 = add i32 %load9, %add.7

  %add.ptr8.9 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 2304
  %load10 = load i32, i32 addrspace(1)* %add.ptr8.9, align 4
  %add.9 = add i32 %load10, %add.8

  store i32 %add.9, i32 addrspace(1)* %addr, align 4
  ret void
}

; Very large constant element offsets (~512M i64 elements, i.e. multi-GB byte
; offsets).  The differences between the four addresses still fit the GFX9
; immediate field, so the last three loads share a re-anchored base.
define amdgpu_kernel void @Offset64(i8 addrspace(1)* %buffer) {
; GCN-LABEL: Offset64:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8

  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870400
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8

  %add1 = add i64 %load2, %load1

  %addr3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870656
  %load3 = load i64, i64 addrspace(1)* %addr3, align 8

  %add2 = add i64 %load3, %add1

  %addr4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870912
  %load4 = load i64, i64 addrspace(1)* %addr4, align 8
  %add4 = add i64 %load4, %add2

  store i64 %add4, i64 addrspace(1)* %saddr, align 8
  ret void
}

; TODO: Support load4 as anchor instruction.
; Same huge offsets as Offset64 but with i32 elements; only one pair of loads
; ends up sharing an anchor (the offset:-1024 check), the rest stay at off.
define amdgpu_kernel void @p32Offset64(i8 addrspace(1)* %buffer) {
; GCN-LABEL: p32Offset64:
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-1024
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i32 addrspace(1)*

  %addr1 = getelementptr inbounds i32, i32 addrspace(1)* %saddr, i64 %conv
  %load1 = load i32, i32 addrspace(1)* %addr1, align 8

  %addr2 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870400
  %load2 = load i32, i32 addrspace(1)* %addr2, align 8

  %add1 = add i32 %load2, %load1

  %addr3 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870656
  %load3 = load i32, i32 addrspace(1)* %addr3, align 8

  %add2 = add i32 %load3, %add1

  %addr4 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870912
  %load4 = load i32, i32 addrspace(1)* %addr4, align 8
  %add4 = add i32 %load4, %add2

  store i32 %add4, i32 addrspace(1)* %saddr, align 8
  ret void
}

; Two independent base pointers (%buffer1, %buffer2): offsets may only be
; combined among loads that share the same base, so each trio gets its own
; anchor in the GFX9 checks.
define amdgpu_kernel void @DiffBase(i8 addrspace(1)* %buffer1,
; GCN-LABEL: DiffBase:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
                                    i8 addrspace(1)* %buffer2) {
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer1, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %buffer2, i64 %idx.ext11
  %saddr2 = bitcast i8 addrspace(1)* %add.ptr2 to i64 addrspace(1)*

  ; Three loads off %saddr (buffer1)...
  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 512
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 768
  %load2 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add1 = add i64 %load2, %load1
  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 1024
  %load3 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add2 = add i64 %load3, %add1

  ; ...and three loads off %saddr2 (buffer2).
  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1280
  %load4 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8

  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1536
  %load5 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add3 = add i64 %load5, %load4

  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1792
  %load6 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add4 = add i64 %load6, %add3

  %add5 = add i64 %add2, %add4

  store i64 %add5, i64 addrspace(1)* %saddr, align 8
  ret void
}

; Same strided accesses as clmem_read_simplified, but issued from the highest
; constant offset (1792) down to the lowest (256): offset folding must not
; depend on the order the loads appear in.
define amdgpu_kernel void @ReverseOrder(i8 addrspace(1)* %buffer) {
; GCN-LABEL: ReverseOrder:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
entry:
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8

  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1792
  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add7 = add i64 %load8, %load1

  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1536
  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add6 = add i64 %load7, %add7

  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1280
  %load6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
  %add5 = add i64 %load6, %add6

  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1024
  %load5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add4 = add i64 %load5, %add5

  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 768
  %load4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add3 = add i64 %load4, %add4

  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 512
  %load3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
  %add2 = add i64 %load3, %add3

  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 256
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8
  %add1 = add i64 %load2, %add2

  store i64 %add1, i64 addrspace(1)* %saddr, align 8
  ret void
}

; Large negative element offsets; the two loads are 2048 bytes apart, so GFX9
; anchors one and folds the difference into offset:2048 on the other.
define hidden amdgpu_kernel void @negativeoffset(i8 addrspace(1)* nocapture %buffer) {
; GCN-LABEL: negativeoffset:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
entry:
  ; NOTE(review): attribute group #2 is referenced here but not defined in the
  ; visible portion of this file — verify it exists elsewhere in the test.
  %call = tail call i64 @_Z13get_global_idj(i32 0) #2
  %conv = and i64 %call, 255
  %0 = shl i64 %call, 7
  %idx.ext11 = and i64 %0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %buffer_head = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %buffer_wave = getelementptr inbounds i64, i64 addrspace(1)* %buffer_head, i64 %conv

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %buffer_wave, i64 -536870656
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8

  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %buffer_wave, i64 -536870912
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8

  %add = add i64 %load2, %load1

  store i64 %add, i64 addrspace(1)* %buffer_head, align 8
  ret void
}