1; RUN: llc -mtriple aarch64-arm-none-eabi -asm-verbose=1 -mattr=+bf16 %s -o - | FileCheck %s 2 3%struct.bfloat16x4x2_t = type { [2 x <4 x bfloat>] } 4%struct.bfloat16x8x2_t = type { [2 x <8 x bfloat>] } 5%struct.bfloat16x4x3_t = type { [3 x <4 x bfloat>] } 6%struct.bfloat16x8x3_t = type { [3 x <8 x bfloat>] } 7%struct.bfloat16x4x4_t = type { [4 x <4 x bfloat>] } 8%struct.bfloat16x8x4_t = type { [4 x <8 x bfloat>] } 9 10define <4 x bfloat> @test_vld1_bf16(bfloat* nocapture readonly %ptr) local_unnamed_addr nounwind { 11; CHECK-LABEL: test_vld1_bf16: 12; CHECK: // %bb.0: // %entry 13; CHECK-NEXT: ldr d0, [x0] 14; CHECK-NEXT: ret 15entry: 16 %0 = bitcast bfloat* %ptr to <4 x bfloat>* 17 %1 = load <4 x bfloat>, <4 x bfloat>* %0, align 2 18 ret <4 x bfloat> %1 19} 20 21define <8 x bfloat> @test_vld1q_bf16(bfloat* nocapture readonly %ptr) local_unnamed_addr nounwind { 22; CHECK-LABEL: test_vld1q_bf16: 23; CHECK: // %bb.0: // %entry 24; CHECK-NEXT: ldr q0, [x0] 25; CHECK-NEXT: ret 26entry: 27 %0 = bitcast bfloat* %ptr to <8 x bfloat>* 28 %1 = load <8 x bfloat>, <8 x bfloat>* %0, align 2 29 ret <8 x bfloat> %1 30} 31 32define <4 x bfloat> @test_vld1_lane_bf16(bfloat* nocapture readonly %ptr, <4 x bfloat> %src) local_unnamed_addr nounwind { 33; CHECK-LABEL: test_vld1_lane_bf16: 34; CHECK: // %bb.0: // %entry 35; CHECK: ld1 { v0.h }[0], [x0] 36; CHECK: ret 37entry: 38 %0 = load bfloat, bfloat* %ptr, align 2 39 %vld1_lane = insertelement <4 x bfloat> %src, bfloat %0, i32 0 40 ret <4 x bfloat> %vld1_lane 41} 42 43define <8 x bfloat> @test_vld1q_lane_bf16(bfloat* nocapture readonly %ptr, <8 x bfloat> %src) local_unnamed_addr nounwind { 44; CHECK-LABEL: test_vld1q_lane_bf16: 45; CHECK: // %bb.0: // %entry 46; CHECK-NEXT: ld1 { v0.h }[7], [x0] 47; CHECK-NEXT: ret 48entry: 49 %0 = load bfloat, bfloat* %ptr, align 2 50 %vld1_lane = insertelement <8 x bfloat> %src, bfloat %0, i32 7 51 ret <8 x bfloat> %vld1_lane 52} 53 54define <4 x bfloat> @test_vld1_dup_bf16(bfloat* nocapture 
readonly %ptr) local_unnamed_addr nounwind { 55; CHECK-LABEL: test_vld1_dup_bf16: 56; CHECK: // %bb.0: // %entry 57; CHECK-NEXT: ld1r { v0.4h }, [x0] 58; CHECK-NEXT: ret 59entry: 60 %0 = load bfloat, bfloat* %ptr, align 2 61 %1 = insertelement <4 x bfloat> undef, bfloat %0, i32 0 62 %lane = shufflevector <4 x bfloat> %1, <4 x bfloat> undef, <4 x i32> zeroinitializer 63 ret <4 x bfloat> %lane 64} 65 66define %struct.bfloat16x4x2_t @test_vld1_bf16_x2(bfloat* %ptr) local_unnamed_addr nounwind { 67; CHECK-LABEL: test_vld1_bf16_x2: 68; CHECK: // %bb.0: // %entry 69; CHECK-NEXT: ld1 { v0.4h, v1.4h }, [x0] 70; CHECK-NEXT: ret 71entry: 72 %vld1xN = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x2.v4bf16.p0bf16(bfloat* %ptr) 73 %vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld1xN, 0 74 %vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld1xN, 1 75 %.fca.0.0.insert = insertvalue %struct.bfloat16x4x2_t undef, <4 x bfloat> %vld1xN.fca.0.extract, 0, 0 76 %.fca.0.1.insert = insertvalue %struct.bfloat16x4x2_t %.fca.0.0.insert, <4 x bfloat> %vld1xN.fca.1.extract, 0, 1 77 ret %struct.bfloat16x4x2_t %.fca.0.1.insert 78} 79 80declare { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x2.v4bf16.p0bf16(bfloat*) nounwind 81 82define %struct.bfloat16x8x2_t @test_vld1q_bf16_x2(bfloat* %ptr) local_unnamed_addr nounwind { 83; CHECK-LABEL: test_vld1q_bf16_x2: 84; CHECK: // %bb.0: // %entry 85; CHECK-NEXT: ld1 { v0.8h, v1.8h }, [x0] 86; CHECK-NEXT: ret 87entry: 88 %vld1xN = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x2.v8bf16.p0bf16(bfloat* %ptr) 89 %vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld1xN, 0 90 %vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld1xN, 1 91 %.fca.0.0.insert = insertvalue %struct.bfloat16x8x2_t undef, <8 x bfloat> %vld1xN.fca.0.extract, 0, 0 92 %.fca.0.1.insert = insertvalue %struct.bfloat16x8x2_t %.fca.0.0.insert, <8 x bfloat> 
%vld1xN.fca.1.extract, 0, 1 93 ret %struct.bfloat16x8x2_t %.fca.0.1.insert 94} 95 96; Function Attrs: argmemonly nounwind readonly 97declare { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x2.v8bf16.p0bf16(bfloat*) nounwind 98 99define %struct.bfloat16x4x3_t @test_vld1_bf16_x3(bfloat* %ptr) local_unnamed_addr nounwind { 100; CHECK-LABEL: test_vld1_bf16_x3: 101; CHECK: // %bb.0: // %entry 102; CHECK-NEXT: ld1 { v0.4h, v1.4h, v2.4h }, [x0] 103; CHECK-NEXT: ret 104entry: 105 %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x3.v4bf16.p0bf16(bfloat* %ptr) 106 %vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 0 107 %vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 1 108 %vld1xN.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 2 109 %.fca.0.0.insert = insertvalue %struct.bfloat16x4x3_t undef, <4 x bfloat> %vld1xN.fca.0.extract, 0, 0 110 %.fca.0.1.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.0.insert, <4 x bfloat> %vld1xN.fca.1.extract, 0, 1 111 %.fca.0.2.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.1.insert, <4 x bfloat> %vld1xN.fca.2.extract, 0, 2 112 ret %struct.bfloat16x4x3_t %.fca.0.2.insert 113} 114 115; Function Attrs: argmemonly nounwind readonly 116declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x3.v4bf16.p0bf16(bfloat*) nounwind 117 118define %struct.bfloat16x8x3_t @test_vld1q_bf16_x3(bfloat* %ptr) local_unnamed_addr nounwind { 119; CHECK-LABEL: test_vld1q_bf16_x3: 120; CHECK: // %bb.0: // %entry 121; CHECK-NEXT: ld1 { v0.8h, v1.8h, v2.8h }, [x0] 122; CHECK-NEXT: ret 123entry: 124 %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x3.v8bf16.p0bf16(bfloat* %ptr) 125 %vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 0 126 %vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x 
bfloat> } %vld1xN, 1 127 %vld1xN.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 2 128 %.fca.0.0.insert = insertvalue %struct.bfloat16x8x3_t undef, <8 x bfloat> %vld1xN.fca.0.extract, 0, 0 129 %.fca.0.1.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.0.insert, <8 x bfloat> %vld1xN.fca.1.extract, 0, 1 130 %.fca.0.2.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.1.insert, <8 x bfloat> %vld1xN.fca.2.extract, 0, 2 131 ret %struct.bfloat16x8x3_t %.fca.0.2.insert 132} 133 134; Function Attrs: argmemonly nounwind readonly 135declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x3.v8bf16.p0bf16(bfloat*) nounwind 136 137define %struct.bfloat16x4x4_t @test_vld1_bf16_x4(bfloat* %ptr) local_unnamed_addr nounwind { 138; CHECK-LABEL: test_vld1_bf16_x4: 139; CHECK: // %bb.0: // %entry 140; CHECK-NEXT: ld1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0] 141; CHECK-NEXT: ret 142entry: 143 %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x4.v4bf16.p0bf16(bfloat* %ptr) 144 %vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 0 145 %vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 1 146 %vld1xN.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 2 147 %vld1xN.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 3 148 %.fca.0.0.insert = insertvalue %struct.bfloat16x4x4_t undef, <4 x bfloat> %vld1xN.fca.0.extract, 0, 0 149 %.fca.0.1.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.0.insert, <4 x bfloat> %vld1xN.fca.1.extract, 0, 1 150 %.fca.0.2.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.1.insert, <4 x bfloat> %vld1xN.fca.2.extract, 0, 2 151 %.fca.0.3.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.2.insert, <4 x bfloat> %vld1xN.fca.3.extract, 0, 3 152 ret 
%struct.bfloat16x4x4_t %.fca.0.3.insert 153} 154 155; Function Attrs: argmemonly nounwind readonly 156declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x4.v4bf16.p0bf16(bfloat*) nounwind 157 158define %struct.bfloat16x8x4_t @test_vld1q_bf16_x4(bfloat* %ptr) local_unnamed_addr nounwind { 159; CHECK-LABEL: test_vld1q_bf16_x4: 160; CHECK: // %bb.0: // %entry 161; CHECK-NEXT: ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0] 162; CHECK-NEXT: ret 163entry: 164 %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x4.v8bf16.p0bf16(bfloat* %ptr) 165 %vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 0 166 %vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 1 167 %vld1xN.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 2 168 %vld1xN.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 3 169 %.fca.0.0.insert = insertvalue %struct.bfloat16x8x4_t undef, <8 x bfloat> %vld1xN.fca.0.extract, 0, 0 170 %.fca.0.1.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.0.insert, <8 x bfloat> %vld1xN.fca.1.extract, 0, 1 171 %.fca.0.2.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.1.insert, <8 x bfloat> %vld1xN.fca.2.extract, 0, 2 172 %.fca.0.3.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.2.insert, <8 x bfloat> %vld1xN.fca.3.extract, 0, 3 173 ret %struct.bfloat16x8x4_t %.fca.0.3.insert 174} 175 176; Function Attrs: argmemonly nounwind readonly 177declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x4.v8bf16.p0bf16(bfloat*) nounwind 178 179define <8 x bfloat> @test_vld1q_dup_bf16(bfloat* nocapture readonly %ptr) local_unnamed_addr nounwind { 180; CHECK-LABEL: test_vld1q_dup_bf16: 181; CHECK: // %bb.0: // %entry 182; CHECK-NEXT: ld1r { v0.8h }, [x0] 183; CHECK-NEXT: ret 
184entry: 185 %0 = load bfloat, bfloat* %ptr, align 2 186 %1 = insertelement <8 x bfloat> undef, bfloat %0, i32 0 187 %lane = shufflevector <8 x bfloat> %1, <8 x bfloat> undef, <8 x i32> zeroinitializer 188 ret <8 x bfloat> %lane 189} 190 191define %struct.bfloat16x4x2_t @test_vld2_bf16(bfloat* %ptr) local_unnamed_addr nounwind { 192; CHECK-LABEL: test_vld2_bf16: 193; CHECK: // %bb.0: // %entry 194; CHECK-NEXT: ld2 { v0.4h, v1.4h }, [x0] 195; CHECK-NEXT: ret 196entry: 197 %0 = bitcast bfloat* %ptr to <4 x bfloat>* 198 %vld2 = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2.v4bf16.p0v4bf16(<4 x bfloat>* %0) 199 %vld2.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2, 0 200 %vld2.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2, 1 201 %.fca.0.0.insert = insertvalue %struct.bfloat16x4x2_t undef, <4 x bfloat> %vld2.fca.0.extract, 0, 0 202 %.fca.0.1.insert = insertvalue %struct.bfloat16x4x2_t %.fca.0.0.insert, <4 x bfloat> %vld2.fca.1.extract, 0, 1 203 ret %struct.bfloat16x4x2_t %.fca.0.1.insert 204} 205 206; Function Attrs: argmemonly nounwind readonly 207declare { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2.v4bf16.p0v4bf16(<4 x bfloat>*) nounwind 208 209define %struct.bfloat16x8x2_t @test_vld2q_bf16(bfloat* %ptr) local_unnamed_addr nounwind { 210; CHECK-LABEL: test_vld2q_bf16: 211; CHECK: // %bb.0: // %entry 212; CHECK-NEXT: ld2 { v0.8h, v1.8h }, [x0] 213; CHECK-NEXT: ret 214entry: 215 %0 = bitcast bfloat* %ptr to <8 x bfloat>* 216 %vld2 = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2.v8bf16.p0v8bf16(<8 x bfloat>* %0) 217 %vld2.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2, 0 218 %vld2.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2, 1 219 %.fca.0.0.insert = insertvalue %struct.bfloat16x8x2_t undef, <8 x bfloat> %vld2.fca.0.extract, 0, 0 220 %.fca.0.1.insert = insertvalue %struct.bfloat16x8x2_t %.fca.0.0.insert, <8 x bfloat> %vld2.fca.1.extract, 0, 1 221 
ret %struct.bfloat16x8x2_t %.fca.0.1.insert 222} 223 224; Function Attrs: argmemonly nounwind readonly 225declare { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2.v8bf16.p0v8bf16(<8 x bfloat>*) nounwind 226define %struct.bfloat16x4x2_t @test_vld2_lane_bf16(bfloat* %ptr, [2 x <4 x bfloat>] %src.coerce) local_unnamed_addr nounwind { 227; CHECK-LABEL: test_vld2_lane_bf16: 228; CHECK: // %bb.0: // %entry 229; CHECK: ld2 { v0.h, v1.h }[1], [x0] 230; CHECK: ret 231entry: 232 %src.coerce.fca.0.extract = extractvalue [2 x <4 x bfloat>] %src.coerce, 0 233 %src.coerce.fca.1.extract = extractvalue [2 x <4 x bfloat>] %src.coerce, 1 234 %0 = bitcast bfloat* %ptr to i8* 235 %vld2_lane = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2lane.v4bf16.p0i8(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, i64 1, i8* %0) 236 %vld2_lane.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_lane, 0 237 %vld2_lane.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_lane, 1 238 %.fca.0.0.insert = insertvalue %struct.bfloat16x4x2_t undef, <4 x bfloat> %vld2_lane.fca.0.extract, 0, 0 239 %.fca.0.1.insert = insertvalue %struct.bfloat16x4x2_t %.fca.0.0.insert, <4 x bfloat> %vld2_lane.fca.1.extract, 0, 1 240 ret %struct.bfloat16x4x2_t %.fca.0.1.insert 241} 242 243; Function Attrs: argmemonly nounwind readonly 244declare { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2lane.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, i64, i8*) nounwind 245 246define %struct.bfloat16x8x2_t @test_vld2q_lane_bf16(bfloat* %ptr, [2 x <8 x bfloat>] %src.coerce) local_unnamed_addr nounwind { 247; CHECK-LABEL: test_vld2q_lane_bf16: 248; CHECK: // %bb.0: // %entry 249; CHECK: ld2 { v0.h, v1.h }[7], [x0] 250; CHECK: ret 251entry: 252 %src.coerce.fca.0.extract = extractvalue [2 x <8 x bfloat>] %src.coerce, 0 253 %src.coerce.fca.1.extract = extractvalue [2 x <8 x bfloat>] %src.coerce, 1 254 %0 = bitcast bfloat* %ptr to i8* 255 %vld2_lane = tail call 
{ <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, i64 7, i8* %0) 256 %vld2_lane.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2_lane, 0 257 %vld2_lane.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2_lane, 1 258 %.fca.0.0.insert = insertvalue %struct.bfloat16x8x2_t undef, <8 x bfloat> %vld2_lane.fca.0.extract, 0, 0 259 %.fca.0.1.insert = insertvalue %struct.bfloat16x8x2_t %.fca.0.0.insert, <8 x bfloat> %vld2_lane.fca.1.extract, 0, 1 260 ret %struct.bfloat16x8x2_t %.fca.0.1.insert 261} 262 263; Function Attrs: argmemonly nounwind readonly 264declare { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2lane.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, i64, i8*) nounwind 265 266define %struct.bfloat16x4x3_t @test_vld3_bf16(bfloat* %ptr) local_unnamed_addr nounwind { 267; CHECK-LABEL: test_vld3_bf16: 268; CHECK: // %bb.0: // %entry 269; CHECK-NEXT: ld3 { v0.4h, v1.4h, v2.4h }, [x0] 270; CHECK-NEXT: ret 271entry: 272 %0 = bitcast bfloat* %ptr to <4 x bfloat>* 273 %vld3 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3.v4bf16.p0v4bf16(<4 x bfloat>* %0) 274 %vld3.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3, 0 275 %vld3.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3, 1 276 %vld3.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3, 2 277 %.fca.0.0.insert = insertvalue %struct.bfloat16x4x3_t undef, <4 x bfloat> %vld3.fca.0.extract, 0, 0 278 %.fca.0.1.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.0.insert, <4 x bfloat> %vld3.fca.1.extract, 0, 1 279 %.fca.0.2.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.1.insert, <4 x bfloat> %vld3.fca.2.extract, 0, 2 280 ret %struct.bfloat16x4x3_t %.fca.0.2.insert 281} 282 283; Function Attrs: argmemonly nounwind readonly 284declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } 
@llvm.aarch64.neon.ld3.v4bf16.p0v4bf16(<4 x bfloat>*) nounwind 285 286define %struct.bfloat16x8x3_t @test_vld3q_bf16(bfloat* %ptr) local_unnamed_addr nounwind { 287; CHECK-LABEL: test_vld3q_bf16: 288; CHECK: // %bb.0: // %entry 289; CHECK-NEXT: ld3 { v0.8h, v1.8h, v2.8h }, [x0] 290; CHECK-NEXT: ret 291entry: 292 %0 = bitcast bfloat* %ptr to <8 x bfloat>* 293 %vld3 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3.v8bf16.p0v8bf16(<8 x bfloat>* %0) 294 %vld3.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3, 0 295 %vld3.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3, 1 296 %vld3.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3, 2 297 %.fca.0.0.insert = insertvalue %struct.bfloat16x8x3_t undef, <8 x bfloat> %vld3.fca.0.extract, 0, 0 298 %.fca.0.1.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.0.insert, <8 x bfloat> %vld3.fca.1.extract, 0, 1 299 %.fca.0.2.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.1.insert, <8 x bfloat> %vld3.fca.2.extract, 0, 2 300 ret %struct.bfloat16x8x3_t %.fca.0.2.insert 301} 302 303; Function Attrs: argmemonly nounwind readonly 304declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3.v8bf16.p0v8bf16(<8 x bfloat>*) nounwind 305 306define %struct.bfloat16x4x3_t @test_vld3_lane_bf16(bfloat* %ptr, [3 x <4 x bfloat>] %src.coerce) local_unnamed_addr nounwind { 307; CHECK-LABEL: test_vld3_lane_bf16: 308; CHECK: // %bb.0: // %entry 309; CHECK: ld3 { v0.h, v1.h, v2.h }[1], [x0] 310; CHECK: ret 311entry: 312 %src.coerce.fca.0.extract = extractvalue [3 x <4 x bfloat>] %src.coerce, 0 313 %src.coerce.fca.1.extract = extractvalue [3 x <4 x bfloat>] %src.coerce, 1 314 %src.coerce.fca.2.extract = extractvalue [3 x <4 x bfloat>] %src.coerce, 2 315 %0 = bitcast bfloat* %ptr to i8* 316 %vld3_lane = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3lane.v4bf16.p0i8(<4 x bfloat> 
%src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, <4 x bfloat> %src.coerce.fca.2.extract, i64 1, i8* %0) 317 %vld3_lane.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane, 0 318 %vld3_lane.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane, 1 319 %vld3_lane.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane, 2 320 %.fca.0.0.insert = insertvalue %struct.bfloat16x4x3_t undef, <4 x bfloat> %vld3_lane.fca.0.extract, 0, 0 321 %.fca.0.1.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.0.insert, <4 x bfloat> %vld3_lane.fca.1.extract, 0, 1 322 %.fca.0.2.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.1.insert, <4 x bfloat> %vld3_lane.fca.2.extract, 0, 2 323 ret %struct.bfloat16x4x3_t %.fca.0.2.insert 324} 325 326; Function Attrs: argmemonly nounwind readonly 327declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3lane.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i64, i8*) nounwind 328 329define %struct.bfloat16x8x3_t @test_vld3q_lane_bf16(bfloat* %ptr, [3 x <8 x bfloat>] %src.coerce) local_unnamed_addr nounwind { 330; CHECK-LABEL: test_vld3q_lane_bf16: 331; CHECK: // %bb.0: // %entry 332; CHECK: ld3 { v0.h, v1.h, v2.h }[7], [x0] 333; CHECK: ret 334entry: 335 %src.coerce.fca.0.extract = extractvalue [3 x <8 x bfloat>] %src.coerce, 0 336 %src.coerce.fca.1.extract = extractvalue [3 x <8 x bfloat>] %src.coerce, 1 337 %src.coerce.fca.2.extract = extractvalue [3 x <8 x bfloat>] %src.coerce, 2 338 %0 = bitcast bfloat* %ptr to i8* 339 %vld3_lane = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, <8 x bfloat> %src.coerce.fca.2.extract, i64 7, i8* %0) 340 %vld3_lane.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3_lane, 0 341 %vld3_lane.fca.1.extract = extractvalue { <8 x
bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3_lane, 1 342 %vld3_lane.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3_lane, 2 343 %.fca.0.0.insert = insertvalue %struct.bfloat16x8x3_t undef, <8 x bfloat> %vld3_lane.fca.0.extract, 0, 0 344 %.fca.0.1.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.0.insert, <8 x bfloat> %vld3_lane.fca.1.extract, 0, 1 345 %.fca.0.2.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.1.insert, <8 x bfloat> %vld3_lane.fca.2.extract, 0, 2 346 ret %struct.bfloat16x8x3_t %.fca.0.2.insert 347} 348 349; Function Attrs: argmemonly nounwind readonly 350declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3lane.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i64, i8*) nounwind 351 352define %struct.bfloat16x4x4_t @test_vld4_bf16(bfloat* %ptr) local_unnamed_addr nounwind { 353; CHECK-LABEL: test_vld4_bf16: 354; CHECK: // %bb.0: // %entry 355; CHECK-NEXT: ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0] 356; CHECK-NEXT: ret 357entry: 358 %0 = bitcast bfloat* %ptr to <4 x bfloat>* 359 %vld4 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4.v4bf16.p0v4bf16(<4 x bfloat>* %0) 360 %vld4.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 0 361 %vld4.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 1 362 %vld4.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 2 363 %vld4.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 3 364 %.fca.0.0.insert = insertvalue %struct.bfloat16x4x4_t undef, <4 x bfloat> %vld4.fca.0.extract, 0, 0 365 %.fca.0.1.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.0.insert, <4 x bfloat> %vld4.fca.1.extract, 0, 1 366 %.fca.0.2.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.1.insert, <4 x bfloat> %vld4.fca.2.extract, 0, 2 367 %.fca.0.3.insert = 
insertvalue %struct.bfloat16x4x4_t %.fca.0.2.insert, <4 x bfloat> %vld4.fca.3.extract, 0, 3 368 ret %struct.bfloat16x4x4_t %.fca.0.3.insert 369} 370 371; Function Attrs: argmemonly nounwind readonly 372declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4.v4bf16.p0v4bf16(<4 x bfloat>*) nounwind 373 374define %struct.bfloat16x8x4_t @test_vld4q_bf16(bfloat* %ptr) local_unnamed_addr nounwind { 375; CHECK-LABEL: test_vld4q_bf16: 376; CHECK: // %bb.0: // %entry 377; CHECK-NEXT: ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0] 378; CHECK-NEXT: ret 379entry: 380 %0 = bitcast bfloat* %ptr to <8 x bfloat>* 381 %vld4 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4.v8bf16.p0v8bf16(<8 x bfloat>* %0) 382 %vld4.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4, 0 383 %vld4.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4, 1 384 %vld4.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4, 2 385 %vld4.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4, 3 386 %.fca.0.0.insert = insertvalue %struct.bfloat16x8x4_t undef, <8 x bfloat> %vld4.fca.0.extract, 0, 0 387 %.fca.0.1.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.0.insert, <8 x bfloat> %vld4.fca.1.extract, 0, 1 388 %.fca.0.2.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.1.insert, <8 x bfloat> %vld4.fca.2.extract, 0, 2 389 %.fca.0.3.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.2.insert, <8 x bfloat> %vld4.fca.3.extract, 0, 3 390 ret %struct.bfloat16x8x4_t %.fca.0.3.insert 391} 392 393; Function Attrs: argmemonly nounwind readonly 394declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4.v8bf16.p0v8bf16(<8 x bfloat>*) nounwind 395 396define %struct.bfloat16x4x4_t @test_vld4_lane_bf16(bfloat* %ptr, [4 x <4 x bfloat>] %src.coerce) 
local_unnamed_addr nounwind { 397; CHECK-LABEL: test_vld4_lane_bf16: 398; CHECK: // %bb.0: // %entry 399; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[1], [x0] 400; CHECK: ret 401entry: 402 %src.coerce.fca.0.extract = extractvalue [4 x <4 x bfloat>] %src.coerce, 0 403 %src.coerce.fca.1.extract = extractvalue [4 x <4 x bfloat>] %src.coerce, 1 404 %src.coerce.fca.2.extract = extractvalue [4 x <4 x bfloat>] %src.coerce, 2 405 %src.coerce.fca.3.extract = extractvalue [4 x <4 x bfloat>] %src.coerce, 3 406 %0 = bitcast bfloat* %ptr to i8* 407 %vld4_lane = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4lane.v4bf16.p0i8(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, <4 x bfloat> %src.coerce.fca.2.extract, <4 x bfloat> %src.coerce.fca.3.extract, i64 1, i8* %0) 408 %vld4_lane.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane, 0 409 %vld4_lane.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane, 1 410 %vld4_lane.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane, 2 411 %vld4_lane.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane, 3 412 %.fca.0.0.insert = insertvalue %struct.bfloat16x4x4_t undef, <4 x bfloat> %vld4_lane.fca.0.extract, 0, 0 413 %.fca.0.1.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.0.insert, <4 x bfloat> %vld4_lane.fca.1.extract, 0, 1 414 %.fca.0.2.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.1.insert, <4 x bfloat> %vld4_lane.fca.2.extract, 0, 2 415 %.fca.0.3.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.2.insert, <4 x bfloat> %vld4_lane.fca.3.extract, 0, 3 416 ret %struct.bfloat16x4x4_t %.fca.0.3.insert 417} 418 419; Function Attrs: argmemonly nounwind readonly 420declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4lane.v4bf16.p0i8(<4 x bfloat>, 
<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i64, i8*) nounwind 421 422define %struct.bfloat16x8x4_t @test_vld4q_lane_bf16(bfloat* %ptr, [4 x <8 x bfloat>] %src.coerce) local_unnamed_addr nounwind { 423; CHECK-LABEL: test_vld4q_lane_bf16: 424; CHECK: // %bb.0: // %entry 425; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[7], [x0] 426; CHECK: ret 427entry: 428 %src.coerce.fca.0.extract = extractvalue [4 x <8 x bfloat>] %src.coerce, 0 429 %src.coerce.fca.1.extract = extractvalue [4 x <8 x bfloat>] %src.coerce, 1 430 %src.coerce.fca.2.extract = extractvalue [4 x <8 x bfloat>] %src.coerce, 2 431 %src.coerce.fca.3.extract = extractvalue [4 x <8 x bfloat>] %src.coerce, 3 432 %0 = bitcast bfloat* %ptr to i8* 433 %vld4_lane = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, <8 x bfloat> %src.coerce.fca.2.extract, <8 x bfloat> %src.coerce.fca.3.extract, i64 7, i8* %0) 434 %vld4_lane.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4_lane, 0 435 %vld4_lane.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4_lane, 1 436 %vld4_lane.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4_lane, 2 437 %vld4_lane.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4_lane, 3 438 %.fca.0.0.insert = insertvalue %struct.bfloat16x8x4_t undef, <8 x bfloat> %vld4_lane.fca.0.extract, 0, 0 439 %.fca.0.1.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.0.insert, <8 x bfloat> %vld4_lane.fca.1.extract, 0, 1 440 %.fca.0.2.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.1.insert, <8 x bfloat> %vld4_lane.fca.2.extract, 0, 2 441 %.fca.0.3.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.2.insert, <8 x bfloat> %vld4_lane.fca.3.extract, 0, 3 442 ret %struct.bfloat16x8x4_t %.fca.0.3.insert 443} 444 445; 
Function Attrs: argmemonly nounwind readonly 446declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4lane.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i64, i8*) nounwind 447 448define %struct.bfloat16x4x2_t @test_vld2_dup_bf16(bfloat* %ptr) local_unnamed_addr nounwind { 449; CHECK-LABEL: test_vld2_dup_bf16: 450; CHECK: // %bb.0: // %entry 451; CHECK-NEXT: ld2r { v0.4h, v1.4h }, [x0] 452; CHECK-NEXT: ret 453entry: 454 %vld2 = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2r.v4bf16.p0bf16(bfloat* %ptr) 455 %vld2.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2, 0 456 %vld2.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2, 1 457 %.fca.0.0.insert = insertvalue %struct.bfloat16x4x2_t undef, <4 x bfloat> %vld2.fca.0.extract, 0, 0 458 %.fca.0.1.insert = insertvalue %struct.bfloat16x4x2_t %.fca.0.0.insert, <4 x bfloat> %vld2.fca.1.extract, 0, 1 459 ret %struct.bfloat16x4x2_t %.fca.0.1.insert 460} 461 462; Function Attrs: argmemonly nounwind readonly 463declare { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2r.v4bf16.p0bf16(bfloat*) nounwind 464 465define %struct.bfloat16x8x2_t @test_vld2q_dup_bf16(bfloat* %ptr) local_unnamed_addr nounwind { 466; CHECK-LABEL: test_vld2q_dup_bf16: 467; CHECK: // %bb.0: // %entry 468; CHECK-NEXT: ld2r { v0.8h, v1.8h }, [x0] 469; CHECK-NEXT: ret 470entry: 471 %vld2 = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2r.v8bf16.p0bf16(bfloat* %ptr) 472 %vld2.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2, 0 473 %vld2.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2, 1 474 %.fca.0.0.insert = insertvalue %struct.bfloat16x8x2_t undef, <8 x bfloat> %vld2.fca.0.extract, 0, 0 475 %.fca.0.1.insert = insertvalue %struct.bfloat16x8x2_t %.fca.0.0.insert, <8 x bfloat> %vld2.fca.1.extract, 0, 1 476 ret %struct.bfloat16x8x2_t %.fca.0.1.insert 477} 478 479; Function Attrs: argmemonly nounwind 
; readonly
declare { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2r.v8bf16.p0bf16(bfloat*) nounwind

; ld3r intrinsic on <4 x bfloat>: the three returned vectors are packed into
; %struct.bfloat16x4x3_t; a single ld3r instruction is expected.
define %struct.bfloat16x4x3_t @test_vld3_dup_bf16(bfloat* %ptr) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld3_dup_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ld3r { v0.4h, v1.4h, v2.4h }, [x0]
; CHECK-NEXT: ret
entry:
  %ld = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3r.v4bf16.p0bf16(bfloat* %ptr)
  %vec0 = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %ld, 0
  %vec1 = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %ld, 1
  %vec2 = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %ld, 2
  %agg0 = insertvalue %struct.bfloat16x4x3_t undef, <4 x bfloat> %vec0, 0, 0
  %agg1 = insertvalue %struct.bfloat16x4x3_t %agg0, <4 x bfloat> %vec1, 0, 1
  %agg2 = insertvalue %struct.bfloat16x4x3_t %agg1, <4 x bfloat> %vec2, 0, 2
  ret %struct.bfloat16x4x3_t %agg2
}

; Intrinsic called by test_vld3_dup_bf16.
declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3r.v4bf16.p0bf16(bfloat*) nounwind

; ld3r intrinsic on <8 x bfloat>: the three returned vectors are packed into
; %struct.bfloat16x8x3_t; a single ld3r instruction is expected.
define %struct.bfloat16x8x3_t @test_vld3q_dup_bf16(bfloat* %ptr) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld3q_dup_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ld3r { v0.8h, v1.8h, v2.8h }, [x0]
; CHECK-NEXT: ret
entry:
  %ld = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3r.v8bf16.p0bf16(bfloat* %ptr)
  %vec0 = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %ld, 0
  %vec1 = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %ld, 1
  %vec2 = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %ld, 2
  %agg0 = insertvalue %struct.bfloat16x8x3_t undef, <8 x bfloat> %vec0, 0, 0
  %agg1 = insertvalue %struct.bfloat16x8x3_t %agg0, <8 x bfloat> %vec1, 0, 1
  %agg2 = insertvalue %struct.bfloat16x8x3_t %agg1, <8 x bfloat> %vec2, 0, 2
  ret %struct.bfloat16x8x3_t %agg2
}

; Intrinsic called by test_vld3q_dup_bf16.
declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3r.v8bf16.p0bf16(bfloat*) nounwind

; ld4r intrinsic on <4 x bfloat>: the four returned vectors are packed into
; %struct.bfloat16x4x4_t; a single ld4r instruction is expected.
define %struct.bfloat16x4x4_t @test_vld4_dup_bf16(bfloat* %ptr) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld4_dup_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ld4r { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
; CHECK-NEXT: ret
entry:
  %ld = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4r.v4bf16.p0bf16(bfloat* %ptr)
  %vec0 = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %ld, 0
  %vec1 = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %ld, 1
  %vec2 = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %ld, 2
  %vec3 = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %ld, 3
  %agg0 = insertvalue %struct.bfloat16x4x4_t undef, <4 x bfloat> %vec0, 0, 0
  %agg1 = insertvalue %struct.bfloat16x4x4_t %agg0, <4 x bfloat> %vec1, 0, 1
  %agg2 = insertvalue %struct.bfloat16x4x4_t %agg1, <4 x bfloat> %vec2, 0, 2
  %agg3 = insertvalue %struct.bfloat16x4x4_t %agg2, <4 x bfloat> %vec3, 0, 3
  ret %struct.bfloat16x4x4_t %agg3
}

; Intrinsic called by test_vld4_dup_bf16.
declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4r.v4bf16.p0bf16(bfloat*) nounwind

; ld4r intrinsic on <8 x bfloat>: the four returned vectors are packed into
; %struct.bfloat16x8x4_t; a single ld4r instruction is expected.
define %struct.bfloat16x8x4_t @test_vld4q_dup_bf16(bfloat* %ptr) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vld4q_dup_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ld4r { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
; CHECK-NEXT: ret
entry:
  %ld = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4r.v8bf16.p0bf16(bfloat* %ptr)
  %vec0 = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %ld, 0
  %vec1 = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %ld, 1
  %vec2 = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %ld, 2
  %vec3 = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %ld, 3
  %agg0 = insertvalue %struct.bfloat16x8x4_t undef, <8 x bfloat> %vec0, 0, 0
  %agg1 = insertvalue %struct.bfloat16x8x4_t %agg0, <8 x bfloat> %vec1, 0, 1
  %agg2 = insertvalue %struct.bfloat16x8x4_t %agg1, <8 x bfloat> %vec2, 0, 2
  %agg3 = insertvalue %struct.bfloat16x8x4_t %agg2, <8 x bfloat> %vec3, 0, 3
  ret %struct.bfloat16x8x4_t %agg3
}

; Intrinsic called by test_vld4q_dup_bf16.
declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4r.v8bf16.p0bf16(bfloat*) nounwind

; Plain <4 x bfloat> vector store (align 8); expected to become `str d0`.
define void @test_vst1_bf16(bfloat* nocapture %ptr, <4 x bfloat> %val) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst1_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str d0, [x0]
; CHECK-NEXT: ret
entry:
  %vecptr = bitcast bfloat* %ptr to <4 x bfloat>*
  store <4 x bfloat> %val, <4 x bfloat>* %vecptr, align 8
  ret void
}

; Plain <8 x bfloat> vector store (align 16); expected to become `str q0`.
define void @test_vst1q_bf16(bfloat* nocapture %ptr, <8 x bfloat> %val) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst1q_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: ret
entry:
  %vecptr = bitcast bfloat* %ptr to <8 x bfloat>*
  store <8 x bfloat> %val, <8 x bfloat>* %vecptr, align 16
  ret void
}

; Stores element 1 of a <4 x bfloat>; expected to become a lane-1 st1.
define void @test_vst1_lane_bf16(bfloat* nocapture %ptr, <4 x bfloat> %val) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst1_lane_bf16:
; CHECK: // %bb.0: // %entry
; CHECK: st1 { v0.h }[1], [x0]
; CHECK: ret
entry:
  %elt = extractelement <4 x bfloat> %val, i32 1
  store bfloat %elt, bfloat* %ptr, align 2
  ret void
}

; Stores element 7 of an <8 x bfloat>; expected to become a lane-7 st1.
define void @test_vst1q_lane_bf16(bfloat* nocapture %ptr, <8 x bfloat> %val) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst1q_lane_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: st1 { v0.h }[7], [x0]
; CHECK-NEXT: ret
entry:
  %elt = extractelement <8 x bfloat> %val, i32 7
  store bfloat %elt, bfloat* %ptr, align 2
  ret void
}

; st1x2 intrinsic on two <4 x bfloat> values taken from the array argument.
define void @test_vst1_bf16_x2(bfloat* nocapture %ptr, [2 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst1_bf16_x2:
; CHECK: // %bb.0: // %entry
; CHECK: st1 { v0.4h, v1.4h }, [x0]
; CHECK: ret
entry:
  %part0 = extractvalue [2 x <4 x bfloat>] %val.coerce, 0
  %part1 = extractvalue [2 x <4 x bfloat>] %val.coerce, 1
  tail call void @llvm.aarch64.neon.st1x2.v4bf16.p0bf16(<4 x bfloat> %part0, <4 x bfloat> %part1, bfloat* %ptr)
  ret void
}

; Intrinsic called by test_vst1_bf16_x2.
declare void @llvm.aarch64.neon.st1x2.v4bf16.p0bf16(<4 x bfloat>, <4 x bfloat>, bfloat* nocapture) nounwind

; st1x2 intrinsic on two <8 x bfloat> values taken from the array argument.
define void @test_vst1q_bf16_x2(bfloat* nocapture %ptr, [2 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst1q_bf16_x2:
; CHECK: // %bb.0: // %entry
; CHECK: st1 { v0.8h, v1.8h }, [x0]
; CHECK: ret
entry:
  %part0 = extractvalue [2 x <8 x bfloat>] %val.coerce, 0
  %part1 = extractvalue [2 x <8 x bfloat>] %val.coerce, 1
  tail call void @llvm.aarch64.neon.st1x2.v8bf16.p0bf16(<8 x bfloat> %part0, <8 x bfloat> %part1, bfloat* %ptr)
  ret void
}

; Intrinsic called by test_vst1q_bf16_x2.
declare void @llvm.aarch64.neon.st1x2.v8bf16.p0bf16(<8 x bfloat>, <8 x bfloat>, bfloat* nocapture) nounwind

; st1x3 intrinsic on three <4 x bfloat> values taken from the array argument.
define void @test_vst1_bf16_x3(bfloat* nocapture %ptr, [3 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst1_bf16_x3:
; CHECK: // %bb.0: // %entry
; CHECK: st1 { v0.4h, v1.4h, v2.4h }, [x0]
; CHECK: ret
entry:
  %part0 = extractvalue [3 x <4 x bfloat>] %val.coerce, 0
  %part1 = extractvalue [3 x <4 x bfloat>] %val.coerce, 1
  %part2 = extractvalue [3 x <4 x bfloat>] %val.coerce, 2
  tail call void @llvm.aarch64.neon.st1x3.v4bf16.p0bf16(<4 x bfloat> %part0, <4 x bfloat> %part1, <4 x bfloat> %part2, bfloat* %ptr)
  ret void
}

; Intrinsic called by test_vst1_bf16_x3.
declare void @llvm.aarch64.neon.st1x3.v4bf16.p0bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, bfloat* nocapture) nounwind

; st1x3 intrinsic on three <8 x bfloat> values taken from the array argument.
define void @test_vst1q_bf16_x3(bfloat* nocapture %ptr, [3 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst1q_bf16_x3:
; CHECK: // %bb.0: // %entry
; CHECK: st1 { v0.8h, v1.8h, v2.8h }, [x0]
; CHECK: ret
entry:
  %part0 = extractvalue [3 x <8 x bfloat>] %val.coerce, 0
  %part1 = extractvalue [3 x <8 x bfloat>] %val.coerce, 1
  %part2 = extractvalue [3 x <8 x bfloat>] %val.coerce, 2
  tail call void @llvm.aarch64.neon.st1x3.v8bf16.p0bf16(<8 x bfloat> %part0, <8 x bfloat> %part1, <8 x bfloat> %part2, bfloat* %ptr)
  ret void
}

; Intrinsic called by test_vst1q_bf16_x3.
declare void @llvm.aarch64.neon.st1x3.v8bf16.p0bf16(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, bfloat* nocapture) nounwind

; st1x4 intrinsic on four <4 x bfloat> values taken from the array argument.
define void @test_vst1_bf16_x4(bfloat* nocapture %ptr, [4 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst1_bf16_x4:
; CHECK: // %bb.0: // %entry
; CHECK: st1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
; CHECK: ret
entry:
  %part0 = extractvalue [4 x <4 x bfloat>] %val.coerce, 0
  %part1 = extractvalue [4 x <4 x bfloat>] %val.coerce, 1
  %part2 = extractvalue [4 x <4 x bfloat>] %val.coerce, 2
  %part3 = extractvalue [4 x <4 x bfloat>] %val.coerce, 3
  tail call void @llvm.aarch64.neon.st1x4.v4bf16.p0bf16(<4 x bfloat> %part0, <4 x bfloat> %part1, <4 x bfloat> %part2, <4 x bfloat> %part3, bfloat* %ptr)
  ret void
}

; Intrinsic called by test_vst1_bf16_x4.
declare void @llvm.aarch64.neon.st1x4.v4bf16.p0bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, bfloat* nocapture) nounwind

; st1x4 intrinsic on four <8 x bfloat> values taken from the array argument.
define void @test_vst1q_bf16_x4(bfloat* nocapture %ptr, [4 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst1q_bf16_x4:
; CHECK: // %bb.0: // %entry
; CHECK: st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
; CHECK: ret
entry:
  %part0 = extractvalue [4 x <8 x bfloat>] %val.coerce, 0
  %part1 = extractvalue [4 x <8 x bfloat>] %val.coerce, 1
  %part2 = extractvalue [4 x <8 x bfloat>] %val.coerce, 2
  %part3 = extractvalue [4 x <8 x bfloat>] %val.coerce, 3
  tail call void @llvm.aarch64.neon.st1x4.v8bf16.p0bf16(<8 x bfloat> %part0, <8 x bfloat> %part1, <8 x bfloat> %part2, <8 x bfloat> %part3, bfloat* %ptr)
  ret void
}

; Intrinsic called by test_vst1q_bf16_x4.
declare void @llvm.aarch64.neon.st1x4.v8bf16.p0bf16(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, bfloat* nocapture) nounwind

; st2 intrinsic on two <4 x bfloat> values; %ptr is passed as i8*.
define void @test_vst2_bf16(bfloat* nocapture %ptr, [2 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst2_bf16:
; CHECK: // %bb.0: // %entry
; CHECK: st2 { v0.4h, v1.4h }, [x0]
; CHECK: ret
entry:
  %part0 = extractvalue [2 x <4 x bfloat>] %val.coerce, 0
  %part1 = extractvalue [2 x <4 x bfloat>] %val.coerce, 1
  %addr = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st2.v4bf16.p0i8(<4 x bfloat> %part0, <4 x bfloat> %part1, i8* %addr)
  ret void
}

; Intrinsic called by test_vst2_bf16.
declare void @llvm.aarch64.neon.st2.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, i8* nocapture) nounwind

; st2 intrinsic on two <8 x bfloat> values; %ptr is passed as i8*.
define void @test_vst2q_bf16(bfloat* nocapture %ptr, [2 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst2q_bf16:
; CHECK: // %bb.0: // %entry
; CHECK: st2 { v0.8h, v1.8h }, [x0]
; CHECK: ret
entry:
  %part0 = extractvalue [2 x <8 x bfloat>] %val.coerce, 0
  %part1 = extractvalue [2 x <8 x bfloat>] %val.coerce, 1
  %addr = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st2.v8bf16.p0i8(<8 x bfloat> %part0, <8 x bfloat> %part1, i8* %addr)
  ret void
}

; Intrinsic called by test_vst2q_bf16.
declare void @llvm.aarch64.neon.st2.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, i8* nocapture) nounwind

; st2lane intrinsic, lane index 1, on two <4 x bfloat> values.
define void @test_vst2_lane_bf16(bfloat* nocapture %ptr, [2 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst2_lane_bf16:
; CHECK: // %bb.0: // %entry
; CHECK: st2 { v0.h, v1.h }[1], [x0]
; CHECK: ret
entry:
  %part0 = extractvalue [2 x <4 x bfloat>] %val.coerce, 0
  %part1 = extractvalue [2 x <4 x bfloat>] %val.coerce, 1
  %addr = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st2lane.v4bf16.p0i8(<4 x bfloat> %part0, <4 x bfloat> %part1, i64 1, i8* %addr)
  ret void
}

; Intrinsic called by test_vst2_lane_bf16.
declare void @llvm.aarch64.neon.st2lane.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, i64, i8* nocapture) nounwind

; st2lane intrinsic, lane index 7, on two <8 x bfloat> values.
define void @test_vst2q_lane_bf16(bfloat* nocapture %ptr, [2 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst2q_lane_bf16:
; CHECK: // %bb.0: // %entry
; CHECK: st2 { v0.h, v1.h }[7], [x0]
; CHECK: ret
entry:
  %part0 = extractvalue [2 x <8 x bfloat>] %val.coerce, 0
  %part1 = extractvalue [2 x <8 x bfloat>] %val.coerce, 1
  %addr = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st2lane.v8bf16.p0i8(<8 x bfloat> %part0, <8 x bfloat> %part1, i64 7, i8* %addr)
  ret void
}

; Intrinsic called by test_vst2q_lane_bf16.
declare void @llvm.aarch64.neon.st2lane.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, i64, i8* nocapture) nounwind

; st3 intrinsic on three <4 x bfloat> values; %ptr is passed as i8*.
define void @test_vst3_bf16(bfloat* nocapture %ptr, [3 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst3_bf16:
; CHECK: // %bb.0: // %entry
; CHECK: st3 { v0.4h, v1.4h, v2.4h }, [x0]
; CHECK: ret
entry:
  %part0 = extractvalue [3 x <4 x bfloat>] %val.coerce, 0
  %part1 = extractvalue [3 x <4 x bfloat>] %val.coerce, 1
  %part2 = extractvalue [3 x <4 x bfloat>] %val.coerce, 2
  %addr = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st3.v4bf16.p0i8(<4 x bfloat> %part0, <4 x bfloat> %part1, <4 x bfloat> %part2, i8* %addr)
  ret void
}

; Intrinsic called by test_vst3_bf16.
declare void @llvm.aarch64.neon.st3.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i8* nocapture) nounwind

; st3 intrinsic on three <8 x bfloat> values; %ptr is passed as i8*.
define void @test_vst3q_bf16(bfloat* nocapture %ptr, [3 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst3q_bf16:
; CHECK: // %bb.0: // %entry
; CHECK: st3 { v0.8h, v1.8h, v2.8h }, [x0]
; CHECK: ret
entry:
  %part0 = extractvalue [3 x <8 x bfloat>] %val.coerce, 0
  %part1 = extractvalue [3 x <8 x bfloat>] %val.coerce, 1
  %part2 = extractvalue [3 x <8 x bfloat>] %val.coerce, 2
  %addr = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st3.v8bf16.p0i8(<8 x bfloat> %part0, <8 x bfloat> %part1, <8 x bfloat> %part2, i8* %addr)
  ret void
}

; Intrinsic called by test_vst3q_bf16.
declare void @llvm.aarch64.neon.st3.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i8* nocapture) nounwind

; st3lane intrinsic, lane index 1, on three <4 x bfloat> values.
define void @test_vst3_lane_bf16(bfloat* nocapture %ptr, [3 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst3_lane_bf16:
; CHECK: // %bb.0: // %entry
; CHECK: st3 { v0.h, v1.h, v2.h }[1], [x0]
; CHECK: ret
entry:
  %part0 = extractvalue [3 x <4 x bfloat>] %val.coerce, 0
  %part1 = extractvalue [3 x <4 x bfloat>] %val.coerce, 1
  %part2 = extractvalue [3 x <4 x bfloat>] %val.coerce, 2
  %addr = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st3lane.v4bf16.p0i8(<4 x bfloat> %part0, <4 x bfloat> %part1, <4 x bfloat> %part2, i64 1, i8* %addr)
  ret void
}

; Intrinsic called by test_vst3_lane_bf16.
declare void @llvm.aarch64.neon.st3lane.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i64, i8* nocapture) nounwind

; st3lane intrinsic, lane index 7, on three <8 x bfloat> values.
define void @test_vst3q_lane_bf16(bfloat* nocapture %ptr, [3 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst3q_lane_bf16:
; CHECK: // %bb.0: // %entry
; CHECK: st3 { v0.h, v1.h, v2.h }[7], [x0]
; CHECK: ret
entry:
  %part0 = extractvalue [3 x <8 x bfloat>] %val.coerce, 0
  %part1 = extractvalue [3 x <8 x bfloat>] %val.coerce, 1
  %part2 = extractvalue [3 x <8 x bfloat>] %val.coerce, 2
  %addr = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st3lane.v8bf16.p0i8(<8 x bfloat> %part0, <8 x bfloat> %part1, <8 x bfloat> %part2, i64 7, i8* %addr)
  ret void
}

; Intrinsic called by test_vst3q_lane_bf16.
declare void @llvm.aarch64.neon.st3lane.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i64, i8* nocapture) nounwind

; st4 intrinsic on four <4 x bfloat> values; %ptr is passed as i8*.
define void @test_vst4_bf16(bfloat* nocapture %ptr, [4 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst4_bf16:
; CHECK: // %bb.0: // %entry
; CHECK: st4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
; CHECK: ret
entry:
  %part0 = extractvalue [4 x <4 x bfloat>] %val.coerce, 0
  %part1 = extractvalue [4 x <4 x bfloat>] %val.coerce, 1
  %part2 = extractvalue [4 x <4 x bfloat>] %val.coerce, 2
  %part3 = extractvalue [4 x <4 x bfloat>] %val.coerce, 3
  %addr = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st4.v4bf16.p0i8(<4 x bfloat> %part0, <4 x bfloat> %part1, <4 x bfloat> %part2, <4 x bfloat> %part3, i8* %addr)
  ret void
}

; Intrinsic called by test_vst4_bf16.
declare void @llvm.aarch64.neon.st4.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i8* nocapture) nounwind

; st4 intrinsic on four <8 x bfloat> values; %ptr is passed as i8*.
define void @test_vst4q_bf16(bfloat* nocapture %ptr, [4 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst4q_bf16:
; CHECK: // %bb.0: // %entry
; CHECK: st4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
; CHECK: ret
entry:
  %part0 = extractvalue [4 x <8 x bfloat>] %val.coerce, 0
  %part1 = extractvalue [4 x <8 x bfloat>] %val.coerce, 1
  %part2 = extractvalue [4 x <8 x bfloat>] %val.coerce, 2
  %part3 = extractvalue [4 x <8 x bfloat>] %val.coerce, 3
  %addr = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st4.v8bf16.p0i8(<8 x bfloat> %part0, <8 x bfloat> %part1, <8 x bfloat> %part2, <8 x bfloat> %part3, i8* %addr)
  ret void
}

; Intrinsic called by test_vst4q_bf16.
declare void @llvm.aarch64.neon.st4.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i8* nocapture) nounwind

; st4lane intrinsic, lane index 1, on four <4 x bfloat> values.
define void @test_vst4_lane_bf16(bfloat* nocapture %ptr, [4 x <4 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst4_lane_bf16:
; CHECK: // %bb.0: // %entry
; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[1], [x0]
; CHECK: ret
entry:
  %part0 = extractvalue [4 x <4 x bfloat>] %val.coerce, 0
  %part1 = extractvalue [4 x <4 x bfloat>] %val.coerce, 1
  %part2 = extractvalue [4 x <4 x bfloat>] %val.coerce, 2
  %part3 = extractvalue [4 x <4 x bfloat>] %val.coerce, 3
  %addr = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st4lane.v4bf16.p0i8(<4 x bfloat> %part0, <4 x bfloat> %part1, <4 x bfloat> %part2, <4 x bfloat> %part3, i64 1, i8* %addr)
  ret void
}

; Intrinsic called by test_vst4_lane_bf16.
declare void @llvm.aarch64.neon.st4lane.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i64, i8* nocapture) nounwind

; st4lane intrinsic, lane index 7, on four <8 x bfloat> values.
define void @test_vst4q_lane_bf16(bfloat* nocapture %ptr, [4 x <8 x bfloat>] %val.coerce) local_unnamed_addr nounwind {
; CHECK-LABEL: test_vst4q_lane_bf16:
; CHECK: // %bb.0: // %entry
; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[7], [x0]
; CHECK: ret
entry:
  %part0 = extractvalue [4 x <8 x bfloat>] %val.coerce, 0
  %part1 = extractvalue [4 x <8 x bfloat>] %val.coerce, 1
  %part2 = extractvalue [4 x <8 x bfloat>] %val.coerce, 2
  %part3 = extractvalue [4 x <8 x bfloat>] %val.coerce, 3
  %addr = bitcast bfloat* %ptr to i8*
  tail call void @llvm.aarch64.neon.st4lane.v8bf16.p0i8(<8 x bfloat> %part0, <8 x bfloat> %part1, <8 x bfloat> %part2, <8 x bfloat> %part3, i64 7, i8* %addr)
  ret void
}

; Intrinsic called by test_vst4q_lane_bf16.
declare void @llvm.aarch64.neon.st4lane.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i64, i8* nocapture) nounwind