; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=CM-CHECK %s
; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s

; Verifies the instructions selected for scalar and vector stores when
; compiling for redwood, cayman and verde. Covers stores through
; addrspace(1) (global) and addrspace(3) (local) pointers.

;===------------------------------------------------------------------------===;
; Global Address Space
;===------------------------------------------------------------------------===;

; i8 store
; EG-CHECK-LABEL: @store_i8
; EG-CHECK: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
; EG-CHECK: VTX_READ_8 [[VAL:T[0-9]\.X]], [[VAL]]
; IG 0: Get the byte index and truncate the value
; EG-CHECK: AND_INT T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
; EG-CHECK-NEXT: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], [[VAL]], literal.y
; EG-CHECK-NEXT: 3(4.203895e-45), 255(3.573311e-43)
; IG 1: Truncate the calculated shift amount for the mask
; EG-CHECK: LSHL * T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
; EG-CHECK-NEXT: 3
; IG 2: Shift the value and the mask
; EG-CHECK: LSHL T[[RW_GPR]].X, T{{[0-9]}}.[[TRUNC_CHAN]], PV.[[SHIFT_CHAN]]
; EG-CHECK: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]]
; EG-CHECK-NEXT: 255
; IG 3: Initialize the Y and Z channels to zero
; XXX: An optimal scheduler should merge this into one of the previous IGs.
; EG-CHECK: MOV T[[RW_GPR]].Y, 0.0
; EG-CHECK: MOV * T[[RW_GPR]].Z, 0.0

; SI-CHECK-LABEL: @store_i8
; SI-CHECK: BUFFER_STORE_BYTE

define void @store_i8(i8 addrspace(1)* %out, i8 %in) {
entry:
  store i8 %in, i8 addrspace(1)* %out
  ret void
}

; i16 store
; EG-CHECK-LABEL: @store_i16
; EG-CHECK: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
; EG-CHECK: VTX_READ_16 [[VAL:T[0-9]\.X]], [[VAL]]
; IG 0: Get the byte index and truncate the value
; EG-CHECK: AND_INT T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
; EG-CHECK: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], [[VAL]], literal.y
; EG-CHECK-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
; IG 1: Truncate the calculated shift amount for the mask
; EG-CHECK: LSHL * T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
; EG-CHECK: 3
; IG 2: Shift the value and the mask
; EG-CHECK: LSHL T[[RW_GPR]].X, T{{[0-9]}}.[[TRUNC_CHAN]], PV.[[SHIFT_CHAN]]
; EG-CHECK: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]]
; EG-CHECK-NEXT: 65535
; IG 3: Initialize the Y and Z channels to zero
; XXX: An optimal scheduler should merge this into one of the previous IGs.
; EG-CHECK: MOV T[[RW_GPR]].Y, 0.0
; EG-CHECK: MOV * T[[RW_GPR]].Z, 0.0

; SI-CHECK-LABEL: @store_i16
; SI-CHECK: BUFFER_STORE_SHORT
define void @store_i16(i16 addrspace(1)* %out, i16 %in) {
entry:
  store i16 %in, i16 addrspace(1)* %out
  ret void
}

; <2 x i8> store of a truncated <2 x i32> argument
; EG-CHECK-LABEL: @store_v2i8
; EG-CHECK: MEM_RAT MSKOR
; EG-CHECK-NOT: MEM_RAT MSKOR
; SI-CHECK-LABEL: @store_v2i8
; SI-CHECK: BUFFER_STORE_BYTE
; SI-CHECK: BUFFER_STORE_BYTE
define void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
entry:
  %0 = trunc <2 x i32> %in to <2 x i8>
  store <2 x i8> %0, <2 x i8> addrspace(1)* %out
  ret void
}


; <2 x i16> store of a truncated <2 x i32> argument
; EG-CHECK-LABEL: @store_v2i16
; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW
; CM-CHECK-LABEL: @store_v2i16
; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD
; SI-CHECK-LABEL: @store_v2i16
; SI-CHECK: BUFFER_STORE_SHORT
; SI-CHECK: BUFFER_STORE_SHORT
define void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
entry:
  %0 = trunc <2 x i32> %in to <2 x i16>
  store <2 x i16> %0, <2 x i16> addrspace(1)* %out
  ret void
}

; <4 x i8> store of a truncated <4 x i32> argument
; EG-CHECK-LABEL: @store_v4i8
; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW
; CM-CHECK-LABEL: @store_v4i8
; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD
; SI-CHECK-LABEL: @store_v4i8
; SI-CHECK: BUFFER_STORE_BYTE
; SI-CHECK: BUFFER_STORE_BYTE
; SI-CHECK: BUFFER_STORE_BYTE
; SI-CHECK: BUFFER_STORE_BYTE
define void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
entry:
  %0 = trunc <4 x i32> %in to <4 x i8>
  store <4 x i8> %0, <4 x i8> addrspace(1)* %out
  ret void
}

; floating-point store
; EG-CHECK-LABEL: @store_f32
; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.X, T[0-9]+\.X}}, 1
; CM-CHECK-LABEL: @store_f32
; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+\.X, T[0-9]+\.X}}
; SI-CHECK-LABEL: @store_f32
; SI-CHECK: BUFFER_STORE_DWORD

define void @store_f32(float addrspace(1)* %out, float %in) {
  store float %in, float addrspace(1)* %out
  ret void
}

; <4 x i16> store of a truncated <4 x i32> argument
; EG-CHECK-LABEL: @store_v4i16
; EG-CHECK: MEM_RAT MSKOR
; EG-CHECK: MEM_RAT MSKOR
; EG-CHECK: MEM_RAT MSKOR
; EG-CHECK: MEM_RAT MSKOR
; EG-CHECK-NOT: MEM_RAT MSKOR
; SI-CHECK-LABEL: @store_v4i16
; SI-CHECK: BUFFER_STORE_SHORT
; SI-CHECK: BUFFER_STORE_SHORT
; SI-CHECK: BUFFER_STORE_SHORT
; SI-CHECK: BUFFER_STORE_SHORT
; SI-CHECK-NOT: BUFFER_STORE_BYTE
define void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) {
entry:
  %0 = trunc <4 x i32> %in to <4 x i16>
  store <4 x i16> %0, <4 x i16> addrspace(1)* %out
  ret void
}

; vec2 floating-point stores
; EG-CHECK-LABEL: @store_v2f32
; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW
; CM-CHECK-LABEL: @store_v2f32
; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD
; SI-CHECK-LABEL: @store_v2f32
; SI-CHECK: BUFFER_STORE_DWORDX2

define void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) {
entry:
  %0 = insertelement <2 x float> <float 0.0, float 0.0>, float %a, i32 0
  %1 = insertelement <2 x float> %0, float %b, i32 1
  store <2 x float> %1, <2 x float> addrspace(1)* %out
  ret void
}

; <4 x i32> store
; EG-CHECK-LABEL: @store_v4i32
; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW
; EG-CHECK-NOT: MEM_RAT_CACHELESS STORE_RAW
; CM-CHECK-LABEL: @store_v4i32
; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD
; CM-CHECK-NOT: MEM_RAT_CACHELESS STORE_DWORD
; SI-CHECK-LABEL: @store_v4i32
; SI-CHECK: BUFFER_STORE_DWORDX4
define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %in) {
entry:
  store <4 x i32> %in, <4 x i32> addrspace(1)* %out
  ret void
}

;===------------------------------------------------------------------------===;
; Local Address Space
;===------------------------------------------------------------------------===;

; i8 store
; EG-CHECK-LABEL: @store_local_i8
; EG-CHECK: LDS_BYTE_WRITE
; SI-CHECK-LABEL: @store_local_i8
; SI-CHECK: DS_WRITE_B8
define void @store_local_i8(i8 addrspace(3)* %out, i8 %in) {
  store i8 %in, i8 addrspace(3)* %out
  ret void
}

; i16 store
; EG-CHECK-LABEL: @store_local_i16
; EG-CHECK: LDS_SHORT_WRITE
; SI-CHECK-LABEL: @store_local_i16
; SI-CHECK: DS_WRITE_B16
define void @store_local_i16(i16 addrspace(3)* %out, i16 %in) {
  store i16 %in, i16 addrspace(3)* %out
  ret void
}

; <2 x i16> store
; EG-CHECK-LABEL: @store_local_v2i16
; EG-CHECK: LDS_WRITE
; CM-CHECK-LABEL: @store_local_v2i16
; CM-CHECK: LDS_WRITE
; SI-CHECK-LABEL: @store_local_v2i16
; SI-CHECK: DS_WRITE_B16
; SI-CHECK: DS_WRITE_B16
define void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) {
entry:
  store <2 x i16> %in, <2 x i16> addrspace(3)* %out
  ret void
}

; <4 x i8> store
; EG-CHECK-LABEL: @store_local_v4i8
; EG-CHECK: LDS_WRITE
; CM-CHECK-LABEL: @store_local_v4i8
; CM-CHECK: LDS_WRITE
; SI-CHECK-LABEL: @store_local_v4i8
; SI-CHECK: DS_WRITE_B8
; SI-CHECK: DS_WRITE_B8
; SI-CHECK: DS_WRITE_B8
; SI-CHECK: DS_WRITE_B8
define void @store_local_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
entry:
  store <4 x i8> %in, <4 x i8> addrspace(3)* %out
  ret void
}

; <2 x i32> store
; EG-CHECK-LABEL: @store_local_v2i32
; EG-CHECK: LDS_WRITE
; EG-CHECK: LDS_WRITE
; CM-CHECK-LABEL: @store_local_v2i32
; CM-CHECK: LDS_WRITE
; CM-CHECK: LDS_WRITE
; SI-CHECK-LABEL: @store_local_v2i32
; SI-CHECK: DS_WRITE_B32
; SI-CHECK: DS_WRITE_B32
define void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) {
entry:
  store <2 x i32> %in, <2 x i32> addrspace(3)* %out
  ret void
}

; <4 x i32> store
; EG-CHECK-LABEL: @store_local_v4i32
; EG-CHECK: LDS_WRITE
; EG-CHECK: LDS_WRITE
; EG-CHECK: LDS_WRITE
; EG-CHECK: LDS_WRITE
; CM-CHECK-LABEL: @store_local_v4i32
; CM-CHECK: LDS_WRITE
; CM-CHECK: LDS_WRITE
; CM-CHECK: LDS_WRITE
; CM-CHECK: LDS_WRITE
; SI-CHECK-LABEL: @store_local_v4i32
; SI-CHECK: DS_WRITE_B32
; SI-CHECK: DS_WRITE_B32
; SI-CHECK: DS_WRITE_B32
; SI-CHECK: DS_WRITE_B32
define void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
entry:
  store <4 x i32> %in, <4 x i32> addrspace(3)* %out
  ret void
}

; The stores in this function are combined by the optimizer to create a
; 64-bit store with 32-bit alignment. This is legal for SI and the legalizer
; should not try to split the 64-bit store back into 2 32-bit stores.
;
; Evergreen / Northern Islands don't support 64-bit stores yet, so there should
; be two 32-bit stores.
;
; NOTE(review): only a single store is matched for redwood and cayman below,
; although the paragraph above expects two. Confirm whether a second match
; line is intended before tightening this test.

; EG-CHECK-LABEL: @vecload2
; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW
; CM-CHECK-LABEL: @vecload2
; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD
; SI-CHECK-LABEL: @vecload2
; SI-CHECK: BUFFER_STORE_DWORDX2
define void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 {
entry:
  %0 = load i32 addrspace(2)* %mem, align 4
  %arrayidx1.i = getelementptr inbounds i32 addrspace(2)* %mem, i64 1
  %1 = load i32 addrspace(2)* %arrayidx1.i, align 4
  store i32 %0, i32 addrspace(1)* %out, align 4
  %arrayidx1 = getelementptr inbounds i32 addrspace(1)* %out, i64 1
  store i32 %1, i32 addrspace(1)* %arrayidx1, align 4
  ret void
}

attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }