; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+bf16 -asm-verbose=0 < %s | FileCheck %s

;
; BFDOT
;

define <vscale x 4 x float> @bfdot_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfdot_f32:
; CHECK-NEXT: bfdot z0.s, z1.h, z2.h
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfdot_lane_0_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfdot_lane_0_f32:
; CHECK-NEXT: bfdot z0.s, z1.h, z2.h[0]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 0)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfdot_lane_1_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfdot_lane_1_f32:
; CHECK-NEXT: bfdot z0.s, z1.h, z2.h[1]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 1)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfdot_lane_2_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfdot_lane_2_f32:
; CHECK-NEXT: bfdot z0.s, z1.h, z2.h[2]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 2)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfdot_lane_3_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfdot_lane_3_f32:
; CHECK-NEXT: bfdot z0.s, z1.h, z2.h[3]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 3)
  ret <vscale x 4 x float> %out
}

;
; BFMLALB
;

define <vscale x 4 x float> @bfmlalb_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_f32:
; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfmlalb_lane_0_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_lane_0_f32:
; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h[0]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 0)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfmlalb_lane_1_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_lane_1_f32:
; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h[1]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 1)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfmlalb_lane_2_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_lane_2_f32:
; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h[2]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 2)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfmlalb_lane_3_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_lane_3_f32:
; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h[3]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 3)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfmlalb_lane_4_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_lane_4_f32:
; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h[4]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 4)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfmlalb_lane_5_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_lane_5_f32:
; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h[5]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 5)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfmlalb_lane_6_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_lane_6_f32:
; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h[6]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 6)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfmlalb_lane_7_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalb_lane_7_f32:
; CHECK-NEXT: bfmlalb z0.s, z1.h, z2.h[7]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 7)
  ret <vscale x 4 x float> %out
}

;
; BFMLALT
;

define <vscale x 4 x float> @bfmlalt_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_f32:
; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfmlalt_lane_0_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_lane_0_f32:
; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h[0]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 0)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfmlalt_lane_1_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_lane_1_f32:
; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h[1]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 1)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfmlalt_lane_2_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_lane_2_f32:
; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h[2]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 2)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfmlalt_lane_3_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_lane_3_f32:
; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h[3]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 3)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfmlalt_lane_4_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_lane_4_f32:
; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h[4]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 4)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfmlalt_lane_5_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_lane_5_f32:
; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h[5]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 5)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfmlalt_lane_6_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_lane_6_f32:
; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h[6]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 6)
  ret <vscale x 4 x float> %out
}

define <vscale x 4 x float> @bfmlalt_lane_7_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmlalt_lane_7_f32:
; CHECK-NEXT: bfmlalt z0.s, z1.h, z2.h[7]
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 7)
  ret <vscale x 4 x float> %out
}

;
; BFMMLA
;

define <vscale x 4 x float> @bfmmla_f32(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
; CHECK-LABEL: bfmmla_f32:
; CHECK-NEXT: bfmmla z0.s, z1.h, z2.h
; CHECK-NEXT: ret
  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmmla(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c)
  ret <vscale x 4 x float> %out
}

;
; BFCVT
;

define <vscale x 8 x bfloat> @fcvt_bf16_f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b) nounwind {
; CHECK-LABEL: fcvt_bf16_f32:
; CHECK-NEXT: bfcvt z0.h, p0/m, z1.s
; CHECK-NEXT: ret
  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b)
  ret <vscale x 8 x bfloat> %out
}

;
; BFCVTNT
;

define <vscale x 8 x bfloat> @fcvtnt_bf16_f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b) nounwind {
; CHECK-LABEL: fcvtnt_bf16_f32:
; CHECK-NEXT: bfcvtnt z0.h, p0/m, z1.s
; CHECK-NEXT: ret
  %out = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvtnt.bf16f32(<vscale x 8 x bfloat> %a, <vscale x 8 x i1> %pg, <vscale x 4 x float> %b)
  ret <vscale x 8 x bfloat> %out
}

declare <vscale x 4 x float> @llvm.aarch64.sve.bfdot(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
declare <vscale x 4 x float> @llvm.aarch64.sve.bfmmla(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32(<vscale x 8 x bfloat>, <vscale x 8 x i1>, <vscale x 4 x float>)
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvtnt.bf16f32(<vscale x 8 x bfloat>, <vscale x 8 x i1>, <vscale x 4 x float>)