; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s 2>%t | FileCheck %s
; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t

; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
; WARN-NOT: warning

; 2-lane contiguous load/stores

define void @test_masked_ldst_sv2i8(i8 * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i8:
; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1]
; CHECK-NEXT: st1b { z[[DATA]].d }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 2 x i8>*
  %data = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(<vscale x 2 x i8>* %base_addr,
                                                          i32 1,
                                                          <vscale x 2 x i1> %mask,
                                                          <vscale x 2 x i8> undef)
  call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %data,
                                      <vscale x 2 x i8>* %base_addr,
                                      i32 1,
                                      <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2i16(i16 * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i16:
; CHECK-NEXT: ld1sh { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].d }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 2 x i16>*
  %data = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>* %base_addr,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i16> undef)
  call void @llvm.masked.store.nxv2i16(<vscale x 2 x i16> %data,
                                       <vscale x 2 x i16>* %base_addr,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2i32(i32 * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i32:
; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: st1w { z0.d }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, i32* %base, i64 %offset
  %base_addr = bitcast i32* %base_i32 to <vscale x 2 x i32>*
  %data = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>* %base_addr,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i32> undef)
  call void @llvm.masked.store.nxv2i32(<vscale x 2 x i32> %data,
                                       <vscale x 2 x i32>* %base_addr,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2i64(i64 * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i64:
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT: st1d { z0.d }, p0, [x0, x1, lsl #3]
; CHECK-NEXT: ret
  %base_i64 = getelementptr i64, i64* %base, i64 %offset
  %base_addr = bitcast i64* %base_i64 to <vscale x 2 x i64>*
  %data = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64(<vscale x 2 x i64>* %base_addr,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i64> undef)
  call void @llvm.masked.store.nxv2i64(<vscale x 2 x i64> %data,
                                       <vscale x 2 x i64>* %base_addr,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2f16(half * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].d }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_half = getelementptr half, half* %base, i64 %offset
  %base_addr = bitcast half* %base_half to <vscale x 2 x half>*
  %data = call <vscale x 2 x half> @llvm.masked.load.nxv2f16(<vscale x 2 x half>* %base_addr,
                                                             i32 1,
                                                             <vscale x 2 x i1> %mask,
                                                             <vscale x 2 x half> undef)
  call void @llvm.masked.store.nxv2f16(<vscale x 2 x half> %data,
                                       <vscale x 2 x half>* %base_addr,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2f32(float * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f32:
; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: st1w { z[[DATA]].d }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_float = getelementptr float, float* %base, i64 %offset
  %base_addr = bitcast float* %base_float to <vscale x 2 x float>*
  %data = call <vscale x 2 x float> @llvm.masked.load.nxv2f32(<vscale x 2 x float>* %base_addr,
                                                              i32 1,
                                                              <vscale x 2 x i1> %mask,
                                                              <vscale x 2 x float> undef)
  call void @llvm.masked.store.nxv2f32(<vscale x 2 x float> %data,
                                       <vscale x 2 x float>* %base_addr,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2f64(double * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f64:
; CHECK-NEXT: ld1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT: st1d { z[[DATA]].d }, p0, [x0, x1, lsl #3]
; CHECK-NEXT: ret
  %base_double = getelementptr double, double* %base, i64 %offset
  %base_addr = bitcast double* %base_double to <vscale x 2 x double>*
  %data = call <vscale x 2 x double> @llvm.masked.load.nxv2f64(<vscale x 2 x double>* %base_addr,
                                                               i32 1,
                                                               <vscale x 2 x i1> %mask,
                                                               <vscale x 2 x double> undef)
  call void @llvm.masked.store.nxv2f64(<vscale x 2 x double> %data,
                                       <vscale x 2 x double>* %base_addr,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

; 2-lane zero/sign extended contiguous loads.

define <vscale x 2 x i64> @masked_zload_sv2i8_to_sv2i64(i8* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv2i8_to_sv2i64:
; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 2 x i8>*
  %load = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(<vscale x 2 x i8>* %base_addr,
                                                          i32 1,
                                                          <vscale x 2 x i1> %mask,
                                                          <vscale x 2 x i8> undef)
  %ext = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i8_to_sv2i64(i8* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv2i8_to_sv2i64:
; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 2 x i8>*
  %load = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(<vscale x 2 x i8>* %base_addr,
                                                          i32 1,
                                                          <vscale x 2 x i1> %mask,
                                                          <vscale x 2 x i8> undef)
  %ext = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_zload_sv2i16_to_sv2i64(i16* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv2i16_to_sv2i64:
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 2 x i16>*
  %load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>* %base_addr,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i16> undef)
  %ext = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i16_to_sv2i64(i16* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv2i16_to_sv2i64:
; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 2 x i16>*
  %load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>* %base_addr,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i16> undef)
  %ext = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}


define <vscale x 2 x i64> @masked_zload_sv2i32_to_sv2i64(i32* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv2i32_to_sv2i64:
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, i32* %base, i64 %offset
  %base_addr = bitcast i32* %base_i32 to <vscale x 2 x i32>*
  %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>* %base_addr,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i32> undef)
  %ext = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i32_to_sv2i64(i32* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv2i32_to_sv2i64:
; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, i32* %base, i64 %offset
  %base_addr = bitcast i32* %base_i32 to <vscale x 2 x i32>*
  %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>* %base_addr,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i32> undef)
  %ext = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

; 2-lane truncating contiguous stores.

define void @masked_trunc_store_sv2i64_to_sv2i8(<vscale x 2 x i64> %val, i8 *%base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i8:
; CHECK-NEXT: st1b { z0.d }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 2 x i8>*
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i8>
  call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %trunc,
                                      <vscale x 2 x i8> *%base_addr,
                                      i32 1,
                                      <vscale x 2 x i1> %mask)
  ret void
}

define void @masked_trunc_store_sv2i64_to_sv2i16(<vscale x 2 x i64> %val, i16 *%base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i16:
; CHECK-NEXT: st1h { z0.d }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 2 x i16>*
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i16>
  call void @llvm.masked.store.nxv2i16(<vscale x 2 x i16> %trunc,
                                       <vscale x 2 x i16> *%base_addr,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

define void @masked_trunc_store_sv2i64_to_sv2i32(<vscale x 2 x i64> %val, i32 *%base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i32:
; CHECK-NEXT: st1w { z0.d }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, i32* %base, i64 %offset
  %base_addr = bitcast i32* %base_i32 to <vscale x 2 x i32>*
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i32>
  call void @llvm.masked.store.nxv2i32(<vscale x 2 x i32> %trunc,
                                       <vscale x 2 x i32> *%base_addr,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

; 4-lane contiguous load/stores.

define void @test_masked_ldst_sv4i8(i8 * %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i8:
; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1]
; CHECK-NEXT: st1b { z[[DATA]].s }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 4 x i8>*
  %data = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8>* %base_addr,
                                                          i32 1,
                                                          <vscale x 4 x i1> %mask,
                                                          <vscale x 4 x i8> undef)
  call void @llvm.masked.store.nxv4i8(<vscale x 4 x i8> %data,
                                      <vscale x 4 x i8>* %base_addr,
                                      i32 1,
                                      <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4i16(i16 * %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i16:
; CHECK-NEXT: ld1sh { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].s }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 4 x i16>*
  %data = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>* %base_addr,
                                                            i32 1,
                                                            <vscale x 4 x i1> %mask,
                                                            <vscale x 4 x i16> undef)
  call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %data,
                                       <vscale x 4 x i16>* %base_addr,
                                       i32 1,
                                       <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4i32(i32 * %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i32:
; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: st1w { z[[DATA]].s }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, i32* %base, i64 %offset
  %base_addr = bitcast i32* %base_i32 to <vscale x 4 x i32>*
  %data = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32(<vscale x 4 x i32>* %base_addr,
                                                            i32 1,
                                                            <vscale x 4 x i1> %mask,
                                                            <vscale x 4 x i32> undef)
  call void @llvm.masked.store.nxv4i32(<vscale x 4 x i32> %data,
                                       <vscale x 4 x i32>* %base_addr,
                                       i32 1,
                                       <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4f16(half * %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4f16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].s }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_f16 = getelementptr half, half* %base, i64 %offset
  %base_addr = bitcast half* %base_f16 to <vscale x 4 x half>*
  %data = call <vscale x 4 x half> @llvm.masked.load.nxv4f16(<vscale x 4 x half>* %base_addr,
                                                             i32 1,
                                                             <vscale x 4 x i1> %mask,
                                                             <vscale x 4 x half> undef)
  call void @llvm.masked.store.nxv4f16(<vscale x 4 x half> %data,
                                       <vscale x 4 x half>* %base_addr,
                                       i32 1,
                                       <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4f32(float * %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4f32:
; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: st1w { z[[DATA]].s }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_f32 = getelementptr float, float* %base, i64 %offset
  %base_addr = bitcast float* %base_f32 to <vscale x 4 x float>*
  %data = call <vscale x 4 x float> @llvm.masked.load.nxv4f32(<vscale x 4 x float>* %base_addr,
                                                              i32 1,
                                                              <vscale x 4 x i1> %mask,
                                                              <vscale x 4 x float> undef)
  call void @llvm.masked.store.nxv4f32(<vscale x 4 x float> %data,
                                       <vscale x 4 x float>* %base_addr,
                                       i32 1,
                                       <vscale x 4 x i1> %mask)
  ret void
}

; 4-lane zero/sign extended contiguous loads.

define <vscale x 4 x i32> @masked_zload_sv4i8_to_sv4i32(i8* %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv4i8_to_sv4i32:
; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 4 x i8>*
  %load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8>* %base_addr,
                                                          i32 1,
                                                          <vscale x 4 x i1> %mask,
                                                          <vscale x 4 x i8> undef)
  %ext = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_sload_sv4i8_to_sv4i32(i8* %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv4i8_to_sv4i32:
; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 4 x i8>*
  %load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8>* %base_addr,
                                                          i32 1,
                                                          <vscale x 4 x i1> %mask,
                                                          <vscale x 4 x i8> undef)
  %ext = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_zload_sv4i16_to_sv4i32(i16* %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv4i16_to_sv4i32:
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 4 x i16>*
  %load = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>* %base_addr,
                                                            i32 1,
                                                            <vscale x 4 x i1> %mask,
                                                            <vscale x 4 x i16> undef)
  %ext = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_sload_sv4i16_to_sv4i32(i16* %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv4i16_to_sv4i32:
; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 4 x i16>*
  %load = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>* %base_addr,
                                                            i32 1,
                                                            <vscale x 4 x i1> %mask,
                                                            <vscale x 4 x i16> undef)
  %ext = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

; 4-lane truncating contiguous stores.

define void @masked_trunc_store_sv4i32_to_sv4i8(<vscale x 4 x i32> %val, i8 *%base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i8:
; CHECK-NEXT: st1b { z0.s }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 4 x i8>*
  %trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i8>
  call void @llvm.masked.store.nxv4i8(<vscale x 4 x i8> %trunc,
                                      <vscale x 4 x i8> *%base_addr,
                                      i32 1,
                                      <vscale x 4 x i1> %mask)
  ret void
}

define void @masked_trunc_store_sv4i32_to_sv4i16(<vscale x 4 x i32> %val, i16 *%base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i16:
; CHECK-NEXT: st1h { z0.s }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 4 x i16>*
  %trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i16>
  call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %trunc,
                                       <vscale x 4 x i16> *%base_addr,
                                       i32 1,
                                       <vscale x 4 x i1> %mask)
  ret void
}

; 8-lane contiguous load/stores.

define void @test_masked_ldst_sv8i8(i8 * %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8i8:
; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1]
; CHECK-NEXT: st1b { z[[DATA]].h }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 8 x i8>*
  %data = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8>* %base_addr,
                                                          i32 1,
                                                          <vscale x 8 x i1> %mask,
                                                          <vscale x 8 x i8> undef)
  call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %data,
                                      <vscale x 8 x i8>* %base_addr,
                                      i32 1,
                                      <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8i16(i16 * %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8i16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 8 x i16>*
  %data = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16(<vscale x 8 x i16>* %base_addr,
                                                            i32 1,
                                                            <vscale x 8 x i1> %mask,
                                                            <vscale x 8 x i16> undef)
  call void @llvm.masked.store.nxv8i16(<vscale x 8 x i16> %data,
                                       <vscale x 8 x i16>* %base_addr,
                                       i32 1,
                                       <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8f16(half * %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8f16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_f16 = getelementptr half, half* %base, i64 %offset
  %base_addr = bitcast half* %base_f16 to <vscale x 8 x half>*
  %data = call <vscale x 8 x half> @llvm.masked.load.nxv8f16(<vscale x 8 x half>* %base_addr,
                                                             i32 1,
                                                             <vscale x 8 x i1> %mask,
                                                             <vscale x 8 x half> undef)
  call void @llvm.masked.store.nxv8f16(<vscale x 8 x half> %data,
                                       <vscale x 8 x half>* %base_addr,
                                       i32 1,
                                       <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8bf16(bfloat * %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind #0 {
; CHECK-LABEL: test_masked_ldst_sv8bf16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_f16 = getelementptr bfloat, bfloat* %base, i64 %offset
  %base_addr = bitcast bfloat* %base_f16 to <vscale x 8 x bfloat>*
  %data = call <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(<vscale x 8 x bfloat>* %base_addr,
                                                                i32 1,
                                                                <vscale x 8 x i1> %mask,
                                                                <vscale x 8 x bfloat> undef)
  call void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat> %data,
                                        <vscale x 8 x bfloat>* %base_addr,
                                        i32 1,
                                        <vscale x 8 x i1> %mask)
  ret void
}

; 8-lane zero/sign extended contiguous loads.

define <vscale x 8 x i16> @masked_zload_sv8i8_to_sv8i16(i8* %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv8i8_to_sv8i16:
; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 8 x i8>*
  %load = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8>* %base_addr,
                                                          i32 1,
                                                          <vscale x 8 x i1> %mask,
                                                          <vscale x 8 x i8> undef)
  %ext = zext <vscale x 8 x i8> %load to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %ext
}

define <vscale x 8 x i16> @masked_sload_sv8i8_to_sv8i16(i8* %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv8i8_to_sv8i16:
; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 8 x i8>*
  %load = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8>* %base_addr,
                                                          i32 1,
                                                          <vscale x 8 x i1> %mask,
                                                          <vscale x 8 x i8> undef)
  %ext = sext <vscale x 8 x i8> %load to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %ext
}

; 8-lane truncating contiguous stores.

define void @masked_trunc_store_sv8i16_to_sv8i8(<vscale x 8 x i16> %val, i8 *%base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv8i16_to_sv8i8:
; CHECK-NEXT: st1b { z0.h }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 8 x i8>*
  %trunc = trunc <vscale x 8 x i16> %val to <vscale x 8 x i8>
  call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %trunc,
                                      <vscale x 8 x i8> *%base_addr,
                                      i32 1,
                                      <vscale x 8 x i1> %mask)
  ret void
}

; 16-lane contiguous load/stores.

define void @test_masked_ldst_sv16i8(i8 * %base, <vscale x 16 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv16i8:
; CHECK-NEXT: ld1b { z[[DATA:[0-9]+]].b }, p0/z, [x0, x1]
; CHECK-NEXT: st1b { z[[DATA]].b }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 16 x i8>*
  %data = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8(<vscale x 16 x i8>* %base_addr,
                                                            i32 1,
                                                            <vscale x 16 x i1> %mask,
                                                            <vscale x 16 x i8> undef)
  call void @llvm.masked.store.nxv16i8(<vscale x 16 x i8> %data,
                                       <vscale x 16 x i8>* %base_addr,
                                       i32 1,
                                       <vscale x 16 x i1> %mask)
  ret void
}

; 2-element contiguous loads.
declare <vscale x 2 x i8> @llvm.masked.load.nxv2i8 (<vscale x 2 x i8>* , i32, <vscale x 2 x i1>, <vscale x 2 x i8> )
declare <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>*, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
declare <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>*, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64(<vscale x 2 x i64>*, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
declare <vscale x 2 x half> @llvm.masked.load.nxv2f16(<vscale x 2 x half>*, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
declare <vscale x 2 x float> @llvm.masked.load.nxv2f32(<vscale x 2 x float>*, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
declare <vscale x 2 x double> @llvm.masked.load.nxv2f64(<vscale x 2 x double>*, i32, <vscale x 2 x i1>, <vscale x 2 x double>)

; 4-element contiguous loads.
declare <vscale x 4 x i8> @llvm.masked.load.nxv4i8 (<vscale x 4 x i8>* , i32, <vscale x 4 x i1>, <vscale x 4 x i8> )
declare <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>*, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32(<vscale x 4 x i32>*, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
declare <vscale x 4 x half> @llvm.masked.load.nxv4f16(<vscale x 4 x half>*, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
declare <vscale x 4 x float> @llvm.masked.load.nxv4f32(<vscale x 4 x float>*, i32, <vscale x 4 x i1>, <vscale x 4 x float>)

; 8-element contiguous loads.
declare <vscale x 8 x i8> @llvm.masked.load.nxv8i8 (<vscale x 8 x i8>* , i32, <vscale x 8 x i1>, <vscale x 8 x i8> )
declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16(<vscale x 8 x i16>*, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
declare <vscale x 8 x half> @llvm.masked.load.nxv8f16(<vscale x 8 x half>*, i32, <vscale x 8 x i1>, <vscale x 8 x half>)
declare <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(<vscale x 8 x bfloat>*, i32, <vscale x 8 x i1>, <vscale x 8 x bfloat>)

; 16-element contiguous loads.
declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8(<vscale x 16 x i8>*, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)

; 2-element contiguous stores.
declare void @llvm.masked.store.nxv2i8 (<vscale x 2 x i8> , <vscale x 2 x i8>* , i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>*, i32, <vscale x 2 x i1>)

; 4-element contiguous stores.
declare void @llvm.masked.store.nxv4i8 (<vscale x 4 x i8> , <vscale x 4 x i8>* , i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>*, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>*, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>*, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>*, i32, <vscale x 4 x i1>)

; 8-element contiguous stores.
declare void @llvm.masked.store.nxv8i8 (<vscale x 8 x i8> , <vscale x 8 x i8>* , i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>*, i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>*, i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>*, i32, <vscale x 8 x i1>)

; 16-element contiguous stores.
declare void @llvm.masked.store.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>*, i32, <vscale x 16 x i1>)

; +bf16 is required for the bfloat version.
attributes #0 = { "target-features"="+sve,+bf16" }