; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s

; Checks the llc output for store instructions of various scalar and vector
; types. As set up by the RUN lines above, the SI prefix is checked for both
; amdgcn runs (verde and tonga), the EG prefix for the redwood run, the CM
; prefix for the cayman run, and the FUNC prefix for every run.

;===------------------------------------------------------------------------===;
; Global Address Space
;===------------------------------------------------------------------------===;

; i1 store of the constant true
; FUNC-LABEL: {{^}}store_i1:
; EG: MEM_RAT MSKOR
; SI: buffer_store_byte
define void @store_i1(i1 addrspace(1)* %out) {
entry:
  store i1 true, i1 addrspace(1)* %out
  ret void
}

; i8 store
; EG-LABEL: {{^}}store_i8:
; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X

; IG 0: Get the byte index and truncate the value
; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
; EG: LSHL T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
; EG: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.y
; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43)


; IG 1: Truncate the calculated shift amount for the mask

; IG 2: Shift the value and the mask
; EG: LSHL T[[RW_GPR]].X, PS, PV.[[SHIFT_CHAN]]
; EG: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]]
; EG-NEXT: 255
; IG 3: Initialize the Y and Z channels to zero
; XXX: An optimal scheduler should merge this into one of the previous IGs.
; EG: MOV T[[RW_GPR]].Y, 0.0
; EG: MOV * T[[RW_GPR]].Z, 0.0

; SI-LABEL: {{^}}store_i8:
; SI: buffer_store_byte

define void @store_i8(i8 addrspace(1)* %out, i8 %in) {
entry:
  store i8 %in, i8 addrspace(1)* %out
  ret void
}

; i16 store
; EG-LABEL: {{^}}store_i16:
; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X

; IG 0: Get the byte index and truncate the value


; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
; EG-NEXT: 3(4.203895e-45),

; EG: LSHL T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
; EG: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.y

; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
; IG 1: Truncate the calculated shift amount for the mask

; IG 2: Shift the value and the mask
; EG: LSHL T[[RW_GPR]].X, PS, PV.[[SHIFT_CHAN]]
; EG: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]]
; EG-NEXT: 65535
; IG 3: Initialize the Y and Z channels to zero
; XXX: An optimal scheduler should merge this into one of the previous IGs.
; EG: MOV T[[RW_GPR]].Y, 0.0
; EG: MOV * T[[RW_GPR]].Z, 0.0

; SI-LABEL: {{^}}store_i16:
; SI: buffer_store_short
define void @store_i16(i16 addrspace(1)* %out, i16 %in) {
entry:
  store i16 %in, i16 addrspace(1)* %out
  ret void
}

; <2 x i8> store of a value truncated from <2 x i32>
; EG-LABEL: {{^}}store_v2i8:
; EG: MEM_RAT MSKOR
; EG-NOT: MEM_RAT MSKOR
; SI-LABEL: {{^}}store_v2i8:
; SI: buffer_store_byte
; SI: buffer_store_byte
define void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
entry:
  %0 = trunc <2 x i32> %in to <2 x i8>
  store <2 x i8> %0, <2 x i8> addrspace(1)* %out
  ret void
}


; <2 x i16> store of a value truncated from <2 x i32>
; EG-LABEL: {{^}}store_v2i16:
; EG: MEM_RAT_CACHELESS STORE_RAW
; CM-LABEL: {{^}}store_v2i16:
; CM: MEM_RAT_CACHELESS STORE_DWORD
; SI-LABEL: {{^}}store_v2i16:
; SI: buffer_store_short
; SI: buffer_store_short
define void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
entry:
  %0 = trunc <2 x i32> %in to <2 x i16>
  store <2 x i16> %0, <2 x i16> addrspace(1)* %out
  ret void
}

; <4 x i8> store of a value truncated from <4 x i32>
; EG-LABEL: {{^}}store_v4i8:
; EG: MEM_RAT_CACHELESS STORE_RAW
; CM-LABEL: {{^}}store_v4i8:
; CM: MEM_RAT_CACHELESS STORE_DWORD
; SI-LABEL: {{^}}store_v4i8:
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
; SI: buffer_store_byte
define void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
entry:
  %0 = trunc <4 x i32> %in to <4 x i8>
  store <4 x i8> %0, <4 x i8> addrspace(1)* %out
  ret void
}

; floating-point store
; EG-LABEL: {{^}}store_f32:
; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.X, T[0-9]+\.X}}, 1
; CM-LABEL: {{^}}store_f32:
; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+\.X, T[0-9]+\.X}}
; SI-LABEL: {{^}}store_f32:
; SI: buffer_store_dword

define void @store_f32(float addrspace(1)* %out, float %in) {
  store float %in, float addrspace(1)* %out
  ret void
}

; <4 x i16> store of a value truncated from <4 x i32>
; EG-LABEL: {{^}}store_v4i16:
; EG: MEM_RAT MSKOR
; EG: MEM_RAT MSKOR
; EG: MEM_RAT MSKOR
; EG: MEM_RAT MSKOR
; EG-NOT: MEM_RAT MSKOR
; SI-LABEL: {{^}}store_v4i16:
; SI: buffer_store_short
; SI: buffer_store_short
; SI: buffer_store_short
; SI: buffer_store_short
; SI-NOT: buffer_store_byte
define void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) {
entry:
  %0 = trunc <4 x i32> %in to <4 x i16>
  store <4 x i16> %0, <4 x i16> addrspace(1)* %out
  ret void
}

; vec2 floating-point stores
; EG-LABEL: {{^}}store_v2f32:
; EG: MEM_RAT_CACHELESS STORE_RAW
; CM-LABEL: {{^}}store_v2f32:
; CM: MEM_RAT_CACHELESS STORE_DWORD
; SI-LABEL: {{^}}store_v2f32:
; SI: buffer_store_dwordx2

define void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) {
entry:
  %0 = insertelement <2 x float> <float 0.0, float 0.0>, float %a, i32 0
  %1 = insertelement <2 x float> %0, float %b, i32 1
  store <2 x float> %1, <2 x float> addrspace(1)* %out
  ret void
}

; <4 x i32> store
; EG-LABEL: {{^}}store_v4i32:
; EG: MEM_RAT_CACHELESS STORE_RAW
; EG-NOT: MEM_RAT_CACHELESS STORE_RAW
; CM-LABEL: {{^}}store_v4i32:
; CM: MEM_RAT_CACHELESS STORE_DWORD
; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD
; SI-LABEL: {{^}}store_v4i32:
; SI: buffer_store_dwordx4
define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %in) {
entry:
  store <4 x i32> %in, <4 x i32> addrspace(1)* %out
  ret void
}

; i8 store of a value truncated from i64
; FUNC-LABEL: {{^}}store_i64_i8:
; EG: MEM_RAT MSKOR
; SI: buffer_store_byte
define void @store_i64_i8(i8 addrspace(1)* %out, i64 %in) {
entry:
  %0 = trunc i64 %in to i8
  store i8 %0, i8 addrspace(1)* %out
  ret void
}

; i16 store of a value truncated from i64
; FUNC-LABEL: {{^}}store_i64_i16:
; EG: MEM_RAT MSKOR
; SI: buffer_store_short
define void @store_i64_i16(i16 addrspace(1)* %out, i64 %in) {
entry:
  %0 = trunc i64 %in to i16
  store i16 %0, i16 addrspace(1)* %out
  ret void
}
;===------------------------------------------------------------------------===;
; Local Address Space
;===------------------------------------------------------------------------===;

; i1 store of the constant true through an addrspace(3) pointer
; FUNC-LABEL: {{^}}store_local_i1:
; EG: LDS_BYTE_WRITE
; SI: ds_write_b8
define void @store_local_i1(i1 addrspace(3)* %out) {
entry:
  store i1 true, i1 addrspace(3)* %out
  ret void
}

; i8 store
; EG-LABEL: {{^}}store_local_i8:
; EG: LDS_BYTE_WRITE
; SI-LABEL: {{^}}store_local_i8:
; SI: ds_write_b8
define void @store_local_i8(i8 addrspace(3)* %out, i8 %in) {
  store i8 %in, i8 addrspace(3)* %out
  ret void
}

; i16 store
; EG-LABEL: {{^}}store_local_i16:
; EG: LDS_SHORT_WRITE
; SI-LABEL: {{^}}store_local_i16:
; SI: ds_write_b16
define void @store_local_i16(i16 addrspace(3)* %out, i16 %in) {
  store i16 %in, i16 addrspace(3)* %out
  ret void
}

; <2 x i16> store
; EG-LABEL: {{^}}store_local_v2i16:
; EG: LDS_WRITE
; CM-LABEL: {{^}}store_local_v2i16:
; CM: LDS_WRITE
; SI-LABEL: {{^}}store_local_v2i16:
; SI: ds_write_b16
; SI: ds_write_b16
define void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) {
entry:
  store <2 x i16> %in, <2 x i16> addrspace(3)* %out
  ret void
}

; <4 x i8> store
; EG-LABEL: {{^}}store_local_v4i8:
; EG: LDS_WRITE
; CM-LABEL: {{^}}store_local_v4i8:
; CM: LDS_WRITE
; SI-LABEL: {{^}}store_local_v4i8:
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
define void @store_local_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
entry:
  store <4 x i8> %in, <4 x i8> addrspace(3)* %out
  ret void
}

; <2 x i32> store
; EG-LABEL: {{^}}store_local_v2i32:
; EG: LDS_WRITE
; EG: LDS_WRITE
; CM-LABEL: {{^}}store_local_v2i32:
; CM: LDS_WRITE
; CM: LDS_WRITE
; SI-LABEL: {{^}}store_local_v2i32:
; SI: ds_write_b64
define void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) {
entry:
  store <2 x i32> %in, <2 x i32> addrspace(3)* %out
  ret void
}

; <4 x i32> store
; EG-LABEL: {{^}}store_local_v4i32:
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
; EG: LDS_WRITE
; CM-LABEL: {{^}}store_local_v4i32:
; CM: LDS_WRITE
; CM: LDS_WRITE
; CM: LDS_WRITE
; CM: LDS_WRITE
; SI-LABEL: {{^}}store_local_v4i32:
; SI: ds_write_b32
; SI: ds_write_b32
; SI: ds_write_b32
; SI: ds_write_b32
define void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
entry:
  store <4 x i32> %in, <4 x i32> addrspace(3)* %out
  ret void
}

; i8 store of a value truncated from i64
; FUNC-LABEL: {{^}}store_local_i64_i8:
; EG: LDS_BYTE_WRITE
; SI: ds_write_b8
define void @store_local_i64_i8(i8 addrspace(3)* %out, i64 %in) {
entry:
  %0 = trunc i64 %in to i8
  store i8 %0, i8 addrspace(3)* %out
  ret void
}

; i16 store of a value truncated from i64
; FUNC-LABEL: {{^}}store_local_i64_i16:
; EG: LDS_SHORT_WRITE
; SI: ds_write_b16
define void @store_local_i64_i16(i16 addrspace(3)* %out, i64 %in) {
entry:
  %0 = trunc i64 %in to i16
  store i16 %0, i16 addrspace(3)* %out
  ret void
}

; The stores in this function are combined by the optimizer to create a
; 64-bit store with 32-bit alignment. This is legal for SI and the legalizer
; should not try to split the 64-bit store back into 2 32-bit stores.
;
; Evergreen / Northern Islands don't support 64-bit stores yet, so there should
; be two 32-bit stores.

; Two i32 values are loaded from consecutive addrspace(2) addresses and stored,
; with 4-byte alignment, to consecutive addrspace(1) addresses.
; EG-LABEL: {{^}}vecload2:
; EG: MEM_RAT_CACHELESS STORE_RAW
; CM-LABEL: {{^}}vecload2:
; CM: MEM_RAT_CACHELESS STORE_DWORD
; SI-LABEL: {{^}}vecload2:
; SI: buffer_store_dwordx2
define void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 {
entry:
  %0 = load i32 addrspace(2)* %mem, align 4
  %arrayidx1.i = getelementptr inbounds i32 addrspace(2)* %mem, i64 1
  %1 = load i32 addrspace(2)* %arrayidx1.i, align 4
  store i32 %0, i32 addrspace(1)* %out, align 4
  %arrayidx1 = getelementptr inbounds i32 addrspace(1)* %out, i64 1
  store i32 %1, i32 addrspace(1)* %arrayidx1, align 4
  ret void
}

attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }

; When i128 was a legal type this program generated cannot select errors:
; Four constant i32 stores to consecutive, 4-byte aligned addresses.

; FUNC-LABEL: {{^}}"i128-const-store":
; FIXME: We should be able to do this with one store instruction
; EG: STORE_RAW
; EG: STORE_RAW
; EG: STORE_RAW
; EG: STORE_RAW
; CM: STORE_DWORD
; CM: STORE_DWORD
; CM: STORE_DWORD
; CM: STORE_DWORD
; SI: buffer_store_dwordx2
; SI: buffer_store_dwordx2
define void @i128-const-store(i32 addrspace(1)* %out) {
entry:
  store i32 1, i32 addrspace(1)* %out, align 4
  %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %out, i64 1
  store i32 1, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx4 = getelementptr inbounds i32 addrspace(1)* %out, i64 2
  store i32 2, i32 addrspace(1)* %arrayidx4, align 4
  %arrayidx6 = getelementptr inbounds i32 addrspace(1)* %out, i64 3
  store i32 2, i32 addrspace(1)* %arrayidx6, align 4
  ret void
}