; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s

; Simple load of v4f16
define <4 x half> @load_64(<4 x half>* nocapture readonly %a) #0 {
; CHECK-LABEL: load_64:
; CHECK: ldr d0, [x0]
entry:
  %0 = load <4 x half>, <4 x half>* %a, align 8
  ret <4 x half> %0
}

; Simple load of v8f16
define <8 x half> @load_128(<8 x half>* nocapture readonly %a) #0 {
; CHECK-LABEL: load_128:
; CHECK: ldr q0, [x0]
entry:
  %0 = load <8 x half>, <8 x half>* %a, align 16
  ret <8 x half> %0
}

; Duplicating load to v4f16
define <4 x half> @load_dup_64(half* nocapture readonly %a) #0 {
; CHECK-LABEL: load_dup_64:
; CHECK: ld1r { v0.4h }, [x0]
entry:
  %0 = load half, half* %a, align 2
  %1 = insertelement <4 x half> undef, half %0, i32 0
  %2 = shufflevector <4 x half> %1, <4 x half> undef, <4 x i32> zeroinitializer
  ret <4 x half> %2
}

; Duplicating load to v8f16
define <8 x half> @load_dup_128(half* nocapture readonly %a) #0 {
; CHECK-LABEL: load_dup_128:
; CHECK: ld1r { v0.8h }, [x0]
entry:
  %0 = load half, half* %a, align 2
  %1 = insertelement <8 x half> undef, half %0, i32 0
  %2 = shufflevector <8 x half> %1, <8 x half> undef, <8 x i32> zeroinitializer
  ret <8 x half> %2
}

; Load to one lane of v4f16
define <4 x half> @load_lane_64(half* nocapture readonly %a, <4 x half> %b) #0 {
; CHECK-LABEL: load_lane_64:
; CHECK: ld1 { v0.h }[2], [x0]
entry:
  %0 = load half, half* %a, align 2
  %1 = insertelement <4 x half> %b, half %0, i32 2
  ret <4 x half> %1
}

; Load to one lane of v8f16
define <8 x half> @load_lane_128(half* nocapture readonly %a, <8 x half> %b) #0 {
; CHECK-LABEL: load_lane_128:
; CHECK: ld1 { v0.h }[5], [x0]
entry:
  %0 = load half, half* %a, align 2
  %1 = insertelement <8 x half> %b, half %0, i32 5
  ret <8 x half> %1
}

; Simple store of v4f16
define void @store_64(<4 x half>* nocapture %a, <4 x half> %b) #1 {
; CHECK-LABEL: store_64:
; CHECK: str d0, [x0]
entry:
  store <4 x half> %b, <4 x half>* %a, align 8
  ret void
}

; Simple store of v8f16
define void @store_128(<8 x half>* nocapture %a, <8 x half> %b) #1 {
; CHECK-LABEL: store_128:
; CHECK: str q0, [x0]
entry:
  store <8 x half> %b, <8 x half>* %a, align 16
  ret void
}

; Store from one lane of v4f16
define void @store_lane_64(half* nocapture %a, <4 x half> %b) #1 {
; CHECK-LABEL: store_lane_64:
; CHECK: st1 { v0.h }[2], [x0]
entry:
  %0 = extractelement <4 x half> %b, i32 2
  store half %0, half* %a, align 2
  ret void
}

define void @store_lane0_64(half* nocapture %a, <4 x half> %b) #1 {
; CHECK-LABEL: store_lane0_64:
; CHECK: str h0, [x0]
entry:
  %0 = extractelement <4 x half> %b, i32 0
  store half %0, half* %a, align 2
  ret void
}

define void @storeu_lane0_64(half* nocapture %a, <4 x half> %b) #1 {
; CHECK-LABEL: storeu_lane0_64:
; CHECK: stur h0, [x{{[0-9]+}}, #-2]
entry:
  %0 = getelementptr half, half* %a, i64 -1
  %1 = extractelement <4 x half> %b, i32 0
  store half %1, half* %0, align 2
  ret void
}

define void @storero_lane_64(half* nocapture %a, <4 x half> %b, i64 %c) #1 {
; CHECK-LABEL: storero_lane_64:
; CHECK: st1 { v0.h }[2], [x{{[0-9]+}}]
entry:
  %0 = getelementptr half, half* %a, i64 %c
  %1 = extractelement <4 x half> %b, i32 2
  store half %1, half* %0, align 2
  ret void
}

define void @storero_lane0_64(half* nocapture %a, <4 x half> %b, i64 %c) #1 {
; CHECK-LABEL: storero_lane0_64:
; CHECK: str h0, [x0, x1, lsl #1]
entry:
  %0 = getelementptr half, half* %a, i64 %c
  %1 = extractelement <4 x half> %b, i32 0
  store half %1, half* %0, align 2
  ret void
}

; Store from one lane of v8f16
define void @store_lane_128(half* nocapture %a, <8 x half> %b) #1 {
; CHECK-LABEL: store_lane_128:
; CHECK: st1 { v0.h }[5], [x0]
entry:
  %0 = extractelement <8 x half> %b, i32 5
  store half %0, half* %a, align 2
  ret void
}

define void @store_lane0_128(half* nocapture %a, <8 x half> %b) #1 {
; CHECK-LABEL: store_lane0_128:
; CHECK: str h0, [x0]
entry:
  %0 = extractelement <8 x half> %b, i32 0
  store half %0, half* %a, align 2
  ret void
}

define void @storeu_lane0_128(half* nocapture %a, <8 x half> %b) #1 {
; CHECK-LABEL: storeu_lane0_128:
; CHECK: stur h0, [x{{[0-9]+}}, #-2]
entry:
  %0 = getelementptr half, half* %a, i64 -1
  %1 = extractelement <8 x half> %b, i32 0
  store half %1, half* %0, align 2
  ret void
}

define void @storero_lane_128(half* nocapture %a, <8 x half> %b, i64 %c) #1 {
; CHECK-LABEL: storero_lane_128:
; CHECK: st1 { v0.h }[4], [x{{[0-9]+}}]
entry:
  %0 = getelementptr half, half* %a, i64 %c
  %1 = extractelement <8 x half> %b, i32 4
  store half %1, half* %0, align 2
  ret void
}

define void @storero_lane0_128(half* nocapture %a, <8 x half> %b, i64 %c) #1 {
; CHECK-LABEL: storero_lane0_128:
; CHECK: str h0, [x0, x1, lsl #1]
entry:
  %0 = getelementptr half, half* %a, i64 %c
  %1 = extractelement <8 x half> %b, i32 0
  store half %1, half* %0, align 2
  ret void
}

; NEON intrinsics - (de-)interleaving loads and stores
declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0v4f16(<4 x half>*)
declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0v4f16(<4 x half>*)
declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0v4f16(<4 x half>*)
declare void @llvm.aarch64.neon.st2.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>*)
declare void @llvm.aarch64.neon.st3.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>*)
declare void @llvm.aarch64.neon.st4.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, <4 x half>*)
declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0v8f16(<8 x half>*)
declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0v8f16(<8 x half>*)
declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0v8f16(<8 x half>*)
declare void @llvm.aarch64.neon.st2.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>*)
declare void @llvm.aarch64.neon.st3.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>*)
declare void @llvm.aarch64.neon.st4.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, <8 x half>*)

; Load 2 x v4f16 with de-interleaving
define { <4 x half>, <4 x half> } @load_interleave_64_2(<4 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_64_2:
; CHECK: ld2 { v0.4h, v1.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half> } %0
}

; Load 3 x v4f16 with de-interleaving
define { <4 x half>, <4 x half>, <4 x half> } @load_interleave_64_3(<4 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_64_3:
; CHECK: ld3 { v0.4h, v1.4h, v2.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half>, <4 x half> } %0
}

; Load 4 x v4f16 with de-interleaving
define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_interleave_64_4(<4 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_64_4:
; CHECK: ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
}

; Store 2 x v4f16 with interleaving
define void @store_interleave_64_2(<4 x half>* %a, <4 x half> %b, <4 x half> %c) #0 {
; CHECK-LABEL: store_interleave_64_2:
; CHECK: st2 { v0.4h, v1.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st2.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half>* %a)
  ret void
}

; Store 3 x v4f16 with interleaving
define void @store_interleave_64_3(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
; CHECK-LABEL: store_interleave_64_3:
; CHECK: st3 { v0.4h, v1.4h, v2.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st3.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half>* %a)
  ret void
}

; Store 4 x v4f16 with interleaving
define void @store_interleave_64_4(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
; CHECK-LABEL: store_interleave_64_4:
; CHECK: st4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st4.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, <4 x half>* %a)
  ret void
}

; Load 2 x v8f16 with de-interleaving
define { <8 x half>, <8 x half> } @load_interleave_128_2(<8 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_128_2:
; CHECK: ld2 { v0.8h, v1.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half> } %0
}

; Load 3 x v8f16 with de-interleaving
define { <8 x half>, <8 x half>, <8 x half> } @load_interleave_128_3(<8 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_128_3:
; CHECK: ld3 { v0.8h, v1.8h, v2.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half>, <8 x half> } %0
}

; Load 4 x v8f16 with de-interleaving
define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_interleave_128_4(<8 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_128_4:
; CHECK: ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
}

; Store 2 x v8f16 with interleaving
define void @store_interleave_128_2(<8 x half>* %a, <8 x half> %b, <8 x half> %c) #0 {
; CHECK-LABEL: store_interleave_128_2:
; CHECK: st2 { v0.8h, v1.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st2.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half>* %a)
  ret void
}

; Store 3 x v8f16 with interleaving
define void @store_interleave_128_3(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
; CHECK-LABEL: store_interleave_128_3:
; CHECK: st3 { v0.8h, v1.8h, v2.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st3.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half>* %a)
  ret void
}

; Store 4 x v8f16 with interleaving
define void @store_interleave_128_4(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
; CHECK-LABEL: store_interleave_128_4:
; CHECK: st4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st4.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, <8 x half>* %a)
  ret void
}

; NEON intrinsics - duplicating loads
declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2r.v4f16.p0f16(half*)
declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3r.v4f16.p0f16(half*)
declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4r.v4f16.p0f16(half*)
declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2r.v8f16.p0f16(half*)
declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3r.v8f16.p0f16(half*)
declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4r.v8f16.p0f16(half*)

; Load 2 x v4f16 with duplication
define { <4 x half>, <4 x half> } @load_dup_64_2(half* %a) #0 {
; CHECK-LABEL: load_dup_64_2:
; CHECK: ld2r { v0.4h, v1.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2r.v4f16.p0f16(half* %a)
  ret { <4 x half>, <4 x half> } %0
}

; Load 3 x v4f16 with duplication
define { <4 x half>, <4 x half>, <4 x half> } @load_dup_64_3(half* %a) #0 {
; CHECK-LABEL: load_dup_64_3:
; CHECK: ld3r { v0.4h, v1.4h, v2.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3r.v4f16.p0f16(half* %a)
  ret { <4 x half>, <4 x half>, <4 x half> } %0
}

; Load 4 x v4f16 with duplication
define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_dup_64_4(half* %a) #0 {
; CHECK-LABEL: load_dup_64_4:
; CHECK: ld4r { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4r.v4f16.p0f16(half* %a)
  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
}

; Load 2 x v8f16 with duplication
define { <8 x half>, <8 x half> } @load_dup_128_2(half* %a) #0 {
; CHECK-LABEL: load_dup_128_2:
; CHECK: ld2r { v0.8h, v1.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2r.v8f16.p0f16(half* %a)
  ret { <8 x half>, <8 x half> } %0
}

; Load 3 x v8f16 with duplication
define { <8 x half>, <8 x half>, <8 x half> } @load_dup_128_3(half* %a) #0 {
; CHECK-LABEL: load_dup_128_3:
; CHECK: ld3r { v0.8h, v1.8h, v2.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3r.v8f16.p0f16(half* %a)
  ret { <8 x half>, <8 x half>, <8 x half> } %0
}

; Load 4 x v8f16 with duplication
define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_dup_128_4(half* %a) #0 {
; CHECK-LABEL: load_dup_128_4:
; CHECK: ld4r { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4r.v8f16.p0f16(half* %a)
  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
}

; NEON intrinsics - loads and stores to/from one lane
declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0f16(<4 x half>, <4 x half>, i64, half*)
declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, i64, half*)
declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, i64, half*)
declare void @llvm.aarch64.neon.st2lane.v4f16.p0f16(<4 x half>, <4 x half>, i64, half*)
declare void @llvm.aarch64.neon.st3lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, i64, half*)
declare void @llvm.aarch64.neon.st4lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, i64, half*)
declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0f16(<8 x half>, <8 x half>, i64, half*)
declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, i64, half*)
declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, i64, half*)
declare void @llvm.aarch64.neon.st2lane.v8f16.p0f16(<8 x half>, <8 x half>, i64, half*)
declare void @llvm.aarch64.neon.st3lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, i64, half*)
declare void @llvm.aarch64.neon.st4lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, i64, half*)

; Load one lane of 2 x v4f16
define { <4 x half>, <4 x half> } @load_lane_64_2(half* %a, <4 x half> %b, <4 x half> %c) #0 {
; CHECK-LABEL: load_lane_64_2:
; CHECK: ld2 { v0.h, v1.h }[2], [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, i64 2, half* %a)
  ret { <4 x half>, <4 x half> } %0
}

; Load one lane of 3 x v4f16
define { <4 x half>, <4 x half>, <4 x half> } @load_lane_64_3(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
; CHECK-LABEL: load_lane_64_3:
; CHECK: ld3 { v0.h, v1.h, v2.h }[2], [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, i64 2, half* %a)
  ret { <4 x half>, <4 x half>, <4 x half> } %0
}

; Load one lane of 4 x v4f16
define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_lane_64_4(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
; CHECK-LABEL: load_lane_64_4:
; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, i64 2, half* %a)
  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
}

; Store one lane of 2 x v4f16
define void @store_lane_64_2(half* %a, <4 x half> %b, <4 x half> %c) #0 {
; CHECK-LABEL: store_lane_64_2:
; CHECK: st2 { v0.h, v1.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st2lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, i64 2, half* %a)
  ret void
}

; Store one lane of 3 x v4f16
define void @store_lane_64_3(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
; CHECK-LABEL: store_lane_64_3:
; CHECK: st3 { v0.h, v1.h, v2.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st3lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, i64 2, half* %a)
  ret void
}

; Store one lane of 4 x v4f16
define void @store_lane_64_4(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
; CHECK-LABEL: store_lane_64_4:
; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st4lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, i64 2, half* %a)
  ret void
}

; Load one lane of 2 x v8f16
define { <8 x half>, <8 x half> } @load_lane_128_2(half* %a, <8 x half> %b, <8 x half> %c) #0 {
; CHECK-LABEL: load_lane_128_2:
; CHECK: ld2 { v0.h, v1.h }[2], [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, i64 2, half* %a)
  ret { <8 x half>, <8 x half> } %0
}

; Load one lane of 3 x v8f16
define { <8 x half>, <8 x half>, <8 x half> } @load_lane_128_3(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
; CHECK-LABEL: load_lane_128_3:
; CHECK: ld3 { v0.h, v1.h, v2.h }[2], [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, i64 2, half* %a)
  ret { <8 x half>, <8 x half>, <8 x half> } %0
}

; Load one lane of 4 x v8f16
define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_lane_128_4(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
; CHECK-LABEL: load_lane_128_4:
; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, i64 2, half* %a)
  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
}

; Store one lane of 2 x v8f16
define void @store_lane_128_2(half* %a, <8 x half> %b, <8 x half> %c) #0 {
; CHECK-LABEL: store_lane_128_2:
; CHECK: st2 { v0.h, v1.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st2lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, i64 2, half* %a)
  ret void
}

; Store one lane of 3 x v8f16
define void @store_lane_128_3(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
; CHECK-LABEL: store_lane_128_3:
; CHECK: st3 { v0.h, v1.h, v2.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st3lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, i64 2, half* %a)
  ret void
}

; Store one lane of 4 x v8f16
define void @store_lane_128_4(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
; CHECK-LABEL: store_lane_128_4:
; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st4lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, i64 2, half* %a)
  ret void
}

; NEON intrinsics - load/store without interleaving
declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x2.v4f16.p0v4f16(<4 x half>*)
declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x3.v4f16.p0v4f16(<4 x half>*)
declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x4.v4f16.p0v4f16(<4 x half>*)
declare void @llvm.aarch64.neon.st1x2.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>*)
declare void @llvm.aarch64.neon.st1x3.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>*)
declare void @llvm.aarch64.neon.st1x4.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, <4 x half>*)
declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x2.v8f16.p0v8f16(<8 x half>*)
declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x3.v8f16.p0v8f16(<8 x half>*)
declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x4.v8f16.p0v8f16(<8 x half>*)
declare void @llvm.aarch64.neon.st1x2.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>*)
declare void @llvm.aarch64.neon.st1x3.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>*)
declare void @llvm.aarch64.neon.st1x4.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, <8 x half>*)

; Load 2 x v4f16 without de-interleaving
define { <4 x half>, <4 x half> } @load_64_2(<4 x half>* %a) #0 {
; CHECK-LABEL: load_64_2:
; CHECK: ld1 { v0.4h, v1.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x2.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half> } %0
}

; Load 3 x v4f16 without de-interleaving
define { <4 x half>, <4 x half>, <4 x half> } @load_64_3(<4 x half>* %a) #0 {
; CHECK-LABEL: load_64_3:
; CHECK: ld1 { v0.4h, v1.4h, v2.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x3.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half>, <4 x half> } %0
}

; Load 4 x v4f16 without de-interleaving
define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_64_4(<4 x half>* %a) #0 {
; CHECK-LABEL: load_64_4:
; CHECK: ld1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x4.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
}

; Store 2 x v4f16 without interleaving
define void @store_64_2(<4 x half>* %a, <4 x half> %b, <4 x half> %c) #0 {
; CHECK-LABEL: store_64_2:
; CHECK: st1 { v0.4h, v1.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x2.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half>* %a)
  ret void
}

; Store 3 x v4f16 without interleaving
define void @store_64_3(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
; CHECK-LABEL: store_64_3:
; CHECK: st1 { v0.4h, v1.4h, v2.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x3.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half>* %a)
  ret void
}

; Store 4 x v4f16 without interleaving
define void @store_64_4(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
; CHECK-LABEL: store_64_4:
; CHECK: st1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x4.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, <4 x half>* %a)
  ret void
}

; Load 2 x v8f16 without de-interleaving
define { <8 x half>, <8 x half> } @load_128_2(<8 x half>* %a) #0 {
; CHECK-LABEL: load_128_2:
; CHECK: ld1 { v0.8h, v1.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x2.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half> } %0
}

; Load 3 x v8f16 without de-interleaving
define { <8 x half>, <8 x half>, <8 x half> } @load_128_3(<8 x half>* %a) #0 {
; CHECK-LABEL: load_128_3:
; CHECK: ld1 { v0.8h, v1.8h, v2.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x3.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half>, <8 x half> } %0
}

; Load 4 x v8f16 without de-interleaving
define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_128_4(<8 x half>* %a) #0 {
; CHECK-LABEL: load_128_4:
; CHECK: ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x4.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
}

; Store 2 x v8f16 without interleaving
define void @store_128_2(<8 x half>* %a, <8 x half> %b, <8 x half> %c) #0 {
; CHECK-LABEL: store_128_2:
; CHECK: st1 { v0.8h, v1.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x2.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half>* %a)
  ret void
}

; Store 3 x v8f16 without interleaving
define void @store_128_3(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
; CHECK-LABEL: store_128_3:
; CHECK: st1 { v0.8h, v1.8h, v2.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x3.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half>* %a)
  ret void
}

; Store 4 x v8f16 without interleaving
define void @store_128_4(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
; CHECK-LABEL: store_128_4:
; CHECK: st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x4.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, <8 x half>* %a)
  ret void
}
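
; The functions above reference attribute groups #0 and #1, and the LLVM IR
; parser rejects references to attribute groups that are never defined, so the
; file needs definitions for it to parse. The section as given ends without
; them; the minimal definitions below are a sketch, with `nounwind` assumed
; (the usual attribute on functions in codegen tests) rather than taken from
; the original file.
attributes #0 = { nounwind }
attributes #1 = { nounwind }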