// RUN: %clang_cc1 -triple armv8-linux-gnueabihf -target-cpu cortex-a75 -target-feature +dotprod \
// RUN: -disable-O0-optnone  -emit-llvm -o - %s | opt -S -instcombine | FileCheck %s

// REQUIRES: arm-registered-target

// Test ARM v8.2-A dot product intrinsics

8 #include <arm_neon.h>
9 
// Verifies the IR generated for vdot_u32 (unsigned, 64-bit vectors).
// The FileCheck directives in the body expect a single call to
// @llvm.arm.neon.udot.v2i32.v8i8 that receives a, b and c unchanged, with the
// call's result returned directly.
uint32x2_t test_vdot_u32(uint32x2_t a, uint8x8_t b, uint8x8_t c) {
// CHECK-LABEL: define <2 x i32> @test_vdot_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.arm.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
// CHECK: ret <2 x i32> [[RESULT]]
  return vdot_u32(a, b, c);
}
16 
// Verifies the IR generated for vdotq_u32 (unsigned, 128-bit vectors).
// The FileCheck directives in the body expect a single call to
// @llvm.arm.neon.udot.v4i32.v16i8 that receives a, b and c unchanged, with the
// call's result returned directly.
uint32x4_t test_vdotq_u32(uint32x4_t a, uint8x16_t b, uint8x16_t c) {
// CHECK-LABEL: define <4 x i32> @test_vdotq_u32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c)
// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.arm.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c)
// CHECK: ret <4 x i32> [[RESULT]]
  return vdotq_u32(a, b, c);
}
23 
// Verifies the IR generated for vdot_s32 (signed, 64-bit vectors).
// The FileCheck directives in the body expect a single call to
// @llvm.arm.neon.sdot.v2i32.v8i8 that receives a, b and c unchanged, with the
// call's result returned directly.
int32x2_t test_vdot_s32(int32x2_t a, int8x8_t b, int8x8_t c) {
// CHECK-LABEL: define <2 x i32> @test_vdot_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.arm.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
// CHECK: ret <2 x i32> [[RESULT]]
  return vdot_s32(a, b, c);
}
30 
// Verifies the IR generated for vdotq_s32 (signed, 128-bit vectors).
// The FileCheck directives in the body expect a single call to
// @llvm.arm.neon.sdot.v4i32.v16i8 that receives a, b and c unchanged, with the
// call's result returned directly.
int32x4_t test_vdotq_s32(int32x4_t a, int8x16_t b, int8x16_t c) {
// CHECK-LABEL: define <4 x i32> @test_vdotq_s32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c)
// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.arm.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c)
// CHECK: ret <4 x i32> [[RESULT]]
  return vdotq_s32(a, b, c);
}
37 
// Verifies the IR generated for vdot_lane_u32 with lane index 1.
// The FileCheck directives in the body expect c to be bitcast to <2 x i32>,
// element 1 to be replicated into both lanes by a shufflevector, the result to
// be bitcast back to <8 x i8>, and that value to be passed as the third
// operand of @llvm.arm.neon.udot.v2i32.v8i8 (a and b are passed unchanged).
uint32x2_t test_vdot_lane_u32(uint32x2_t a, uint8x8_t b, uint8x8_t c) {
// CHECK-LABEL: define <2 x i32> @test_vdot_lane_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
// CHECK: [[CAST2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.arm.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> [[CAST2]])
// CHECK: ret <2 x i32> [[RESULT]]
  return vdot_lane_u32(a, b, c, 1);
}
47 
// Verifies the IR generated for vdotq_lane_u32 with lane index 1.
// Here c is a 64-bit vector while a and b are 128-bit. The FileCheck
// directives in the body expect c to be bitcast to <2 x i32>, element 1 to be
// replicated into all four lanes of a <4 x i32> by a shufflevector, the result
// to be bitcast to <16 x i8>, and that value to be passed as the third operand
// of @llvm.arm.neon.udot.v4i32.v16i8 (a and b are passed unchanged).
uint32x4_t test_vdotq_lane_u32(uint32x4_t a, uint8x16_t b, uint8x8_t c) {
// CHECK-LABEL: define <4 x i32> @test_vdotq_lane_u32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c)
// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[CAST2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.arm.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> [[CAST2]])
// CHECK: ret <4 x i32> [[RESULT]]
  return vdotq_lane_u32(a, b, c, 1);
}
57 
// Verifies the IR generated for vdot_lane_s32 with lane index 1.
// The FileCheck directives in the body expect c to be bitcast to <2 x i32>,
// element 1 to be replicated into both lanes by a shufflevector, the result to
// be bitcast back to <8 x i8>, and that value to be passed as the third
// operand of @llvm.arm.neon.sdot.v2i32.v8i8 (a and b are passed unchanged).
int32x2_t test_vdot_lane_s32(int32x2_t a, int8x8_t b, int8x8_t c) {
// CHECK-LABEL: define <2 x i32> @test_vdot_lane_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c)
// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
// CHECK: [[CAST2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK: [[RESULT:%.*]] = call <2 x i32> @llvm.arm.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> [[CAST2]])
// CHECK: ret <2 x i32> [[RESULT]]
  return vdot_lane_s32(a, b, c, 1);
}
67 
// Verifies the IR generated for vdotq_lane_s32 with lane index 1.
// Here c is a 64-bit vector while a and b are 128-bit. The FileCheck
// directives in the body expect c to be bitcast to <2 x i32>, element 1 to be
// replicated into all four lanes of a <4 x i32> by a shufflevector, the result
// to be bitcast to <16 x i8>, and that value to be passed as the third operand
// of @llvm.arm.neon.sdot.v4i32.v16i8 (a and b are passed unchanged).
int32x4_t test_vdotq_lane_s32(int32x4_t a, int8x16_t b, int8x8_t c) {
// CHECK-LABEL: define <4 x i32> @test_vdotq_lane_s32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c)
// CHECK: [[CAST1:%.*]] = bitcast <8 x i8> %c to <2 x i32>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[CAST1]], <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[CAST2:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
// CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.arm.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> [[CAST2]])
// CHECK: ret <4 x i32> [[RESULT]]
  return vdotq_lane_s32(a, b, c, 1);
}
77