// RUN: %clang_cc1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s

// Also check we do not crash when running some middle-end passes. Most
// importantly this includes the IR verifier, to ensure we emit valid IR.
// RUN: %clang_cc1 -fenable-matrix -emit-llvm -triple x86_64-apple-darwin %s -o %t

// Tests for the matrix type builtins.

// Matrix typedefs used by the tests below; the suffix encodes (rows x columns).
typedef double dx5x5_t __attribute__((matrix_type(5, 5)));
typedef float fx2x3_t __attribute__((matrix_type(2, 3)));
typedef float fx3x2_t __attribute__((matrix_type(3, 2)));
typedef int ix20x4_t __attribute__((matrix_type(20, 4)));
typedef int ix4x20_t __attribute__((matrix_type(4, 20)));
typedef unsigned ux1x6_t __attribute__((matrix_type(1, 6)));
typedef unsigned ux6x1_t __attribute__((matrix_type(6, 1)));

void transpose_double_5x5(dx5x5_t *a) {
  // CHECK-LABEL: define void @transpose_double_5x5(
  // CHECK:        [[A:%.*]] = load <25 x double>, <25 x double>* {{.*}}, align 8
  // CHECK-NEXT:   [[TRANS:%.*]] = call <25 x double> @llvm.matrix.transpose.v25f64(<25 x double> [[A]], i32 5, i32 5)
  // CHECK-NEXT:   [[AT_ADDR:%.*]] = bitcast [25 x double]* %a_t to <25 x double>*
  // CHECK-NEXT:   store <25 x double> [[TRANS]], <25 x double>* [[AT_ADDR]], align 8
  dx5x5_t a_t = __builtin_matrix_transpose(*a);
}

void transpose_float_3x2(fx3x2_t *a) {
  // CHECK-LABEL: define void @transpose_float_3x2(
  // CHECK:        [[A:%.*]] = load <6 x float>, <6 x float>* {{.*}}, align 4
  // CHECK-NEXT:   [[TRANS:%.*]] = call <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> [[A]], i32 3, i32 2)
  // CHECK-NEXT:   [[AT_ADDR:%.*]] = bitcast [6 x float]* %a_t to <6 x float>*
  // CHECK-NEXT:   store <6 x float> [[TRANS]], <6 x float>* [[AT_ADDR]], align 4

  fx2x3_t a_t = __builtin_matrix_transpose(*a);
}

void transpose_int_20x4(ix20x4_t *a) {
  // CHECK-LABEL: define void @transpose_int_20x4(
  // CHECK:         [[A:%.*]] = load <80 x i32>, <80 x i32>* {{.*}}, align 4
  // CHECK-NEXT:    [[TRANS:%.*]] = call <80 x i32> @llvm.matrix.transpose.v80i32(<80 x i32> [[A]], i32 20, i32 4)
  // CHECK-NEXT:    [[AT_ADDR:%.*]] = bitcast [80 x i32]* %a_t to <80 x i32>*
  // CHECK-NEXT:    store <80 x i32> [[TRANS]], <80 x i32>* [[AT_ADDR]], align 4

  ix4x20_t a_t = __builtin_matrix_transpose(*a);
}

46 struct Foo {
47   ux1x6_t in;
48   ux6x1_t out;
49 };
50 
transpose_struct_member(struct Foo * F)51 void transpose_struct_member(struct Foo *F) {
52   // CHECK-LABEL: define void @transpose_struct_member(
53   // CHECK:         [[M:%.*]] = load <6 x i32>, <6 x i32>* {{.*}}, align 4
54   // CHECK-NEXT:    [[M_T:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[M]], i32 1, i32 6)
55   // CHECK-NEXT:    [[F_ADDR:%.*]] = load %struct.Foo*, %struct.Foo** %F.addr, align 8
56   // CHECK-NEXT:    [[OUT_PTR:%.*]] = getelementptr inbounds %struct.Foo, %struct.Foo* [[F_ADDR]], i32 0, i32 1
57   // CHECK-NEXT:    [[OUT_PTR_C:%.*]] = bitcast [6 x i32]* [[OUT_PTR]] to <6 x i32>*
58   // CHECK-NEXT:    store <6 x i32> [[M_T]], <6 x i32>* [[OUT_PTR_C]], align 4
59 
60   F->out = __builtin_matrix_transpose(F->in);
61 }
62 
transpose_transpose_struct_member(struct Foo * F)63 void transpose_transpose_struct_member(struct Foo *F) {
64   // CHECK-LABEL: define void @transpose_transpose_struct_member(
65   // CHECK:         [[M:%.*]] = load <6 x i32>, <6 x i32>* {{.*}}, align 4
66   // CHECK-NEXT:    [[M_T:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[M]], i32 1, i32 6)
67   // CHECK-NEXT:    [[M_T2:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[M_T]], i32 6, i32 1)
68   // CHECK-NEXT:    [[F_ADDR:%.*]] = load %struct.Foo*, %struct.Foo** %F.addr, align 8
69   // CHECK-NEXT:    [[IN_PTR:%.*]] = getelementptr inbounds %struct.Foo, %struct.Foo* [[F_ADDR]], i32 0, i32 0
70   // CHECK-NEXT:    [[IN_PTR_C:%.*]] = bitcast [6 x i32]* [[IN_PTR]] to <6 x i32>*
71   // CHECK-NEXT:    store <6 x i32> [[M_T2]], <6 x i32>* [[IN_PTR_C]], align 4
72 
73   F->in = __builtin_matrix_transpose(__builtin_matrix_transpose(F->in));
74 }
75 
76 dx5x5_t get_matrix();
77 
transpose_rvalue()78 void transpose_rvalue() {
79   // CHECK-LABEL: define void @transpose_rvalue()
80   // CHECK-NEXT:  entry:
81   // CHECK-NEXT:    [[M_T_ADDR:%.*]] = alloca [25 x double], align 8
82   // CHECK-NEXT:    [[CALL:%.*]] = call <25 x double> (...) @get_matrix()
83   // CHECK-NEXT:    [[M_T:%.*]] = call <25 x double> @llvm.matrix.transpose.v25f64(<25 x double> [[CALL]], i32 5, i32 5)
84   // CHECK-NEXT:    [[M_T_ADDR_C:%.*]] = bitcast [25 x double]* [[M_T_ADDR]] to <25 x double>*
85   // CHECK-NEXT:    store <25 x double> [[M_T]], <25 x double>* [[M_T_ADDR_C]], align 8
86 
87   dx5x5_t m_t = __builtin_matrix_transpose(get_matrix());
88 }
89 
90 const dx5x5_t global_matrix;
91 
transpose_global()92 void transpose_global() {
93   // CHECK-LABEL: define void @transpose_global()
94   // CHECK-NEXT:  entry:
95   // CHECK-NEXT:    [[M_T_ADDR:%.*]] = alloca [25 x double], align 8
96   // CHECK-NEXT:    [[GLOBAL_MATRIX:%.*]] = load <25 x double>, <25 x double>* bitcast ([25 x double]* @global_matrix to <25 x double>*), align 8
97   // CHECK-NEXT:    [[M_T:%.*]] = call <25 x double> @llvm.matrix.transpose.v25f64(<25 x double> [[GLOBAL_MATRIX]], i32 5, i32 5)
98   // CHECK-NEXT:    [[M_T_ADDR_C:%.*]] = bitcast [25 x double]* [[M_T_ADDR]] to <25 x double>*
99   // CHECK-NEXT:    store <25 x double> [[M_T]], <25 x double>* [[M_T_ADDR_C]], align 8
100 
101   dx5x5_t m_t = __builtin_matrix_transpose(global_matrix);
102 }
103 
column_major_load_with_const_stride_double(double * Ptr)104 void column_major_load_with_const_stride_double(double *Ptr) {
105   // CHECK-LABEL: define void @column_major_load_with_const_stride_double(double* %Ptr)
106   // CHECK:         [[PTR:%.*]] = load double*, double** %Ptr.addr, align 8
107   // CHECK-NEXT:    call <25 x double> @llvm.matrix.column.major.load.v25f64(double* align 8 [[PTR]], i64 5, i1 false, i32 5, i32 5)
108 
109   dx5x5_t m_a1 = __builtin_matrix_column_major_load(Ptr, 5, 5, 5);
110 }
111 
column_major_load_with_const_stride2_double(double * Ptr)112 void column_major_load_with_const_stride2_double(double *Ptr) {
113   // CHECK-LABEL: define void @column_major_load_with_const_stride2_double(double* %Ptr)
114   // CHECK:         [[PTR:%.*]] = load double*, double** %Ptr.addr, align 8
115   // CHECK-NEXT:    call <25 x double> @llvm.matrix.column.major.load.v25f64(double* align 8 [[PTR]], i64 15, i1 false, i32 5, i32 5)
116 
117   dx5x5_t m_a2 = __builtin_matrix_column_major_load(Ptr, 5, 5, 2 * 3 + 9);
118 }
119 
column_major_load_with_variable_stride_ull_float(float * Ptr,unsigned long long S)120 void column_major_load_with_variable_stride_ull_float(float *Ptr, unsigned long long S) {
121   // CHECK-LABEL: define void @column_major_load_with_variable_stride_ull_float(float* %Ptr, i64 %S)
122   // CHECK:         [[S:%.*]] = load i64, i64* %S.addr, align 8
123   // CHECK-NEXT:    [[PTR:%.*]] = load float*, float** %Ptr.addr, align 8
124   // CHECK-NEXT:    call <6 x float> @llvm.matrix.column.major.load.v6f32(float* align 4 [[PTR]], i64 [[S]], i1 false, i32 2, i32 3)
125 
126   fx2x3_t m_b = __builtin_matrix_column_major_load(Ptr, 2, 3, S);
127 }
128 
column_major_load_with_stride_math_int(int * Ptr,int S)129 void column_major_load_with_stride_math_int(int *Ptr, int S) {
130   // CHECK-LABEL: define void @column_major_load_with_stride_math_int(i32* %Ptr, i32 %S)
131   // CHECK:         [[S:%.*]] = load i32, i32* %S.addr, align 4
132   // CHECK-NEXT:    [[STRIDE:%.*]] = add nsw i32 [[S]], 32
133   // CHECK-NEXT:    [[STRIDE_EXT:%.*]] = sext i32 [[STRIDE]] to i64
134   // CHECK-NEXT:    [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8
135   // CHECK-NEXT:    call <80 x i32> @llvm.matrix.column.major.load.v80i32(i32* align 4 [[PTR]], i64 [[STRIDE_EXT]], i1 false, i32 4, i32 20)
136 
137   ix4x20_t m_c = __builtin_matrix_column_major_load(Ptr, 4, 20, S + 32);
138 }
139 
column_major_load_with_stride_math_s_int(int * Ptr,short S)140 void column_major_load_with_stride_math_s_int(int *Ptr, short S) {
141   // CHECK-LABEL:  define void @column_major_load_with_stride_math_s_int(i32* %Ptr, i16 signext %S)
142   // CHECK:         [[S:%.*]] = load i16, i16* %S.addr, align 2
143   // CHECK-NEXT:    [[S_EXT:%.*]] = sext i16 [[S]] to i32
144   // CHECK-NEXT:    [[STRIDE:%.*]] = add nsw i32 [[S_EXT]], 32
145   // CHECK-NEXT:    [[STRIDE_EXT:%.*]] = sext i32 [[STRIDE]] to i64
146   // CHECK-NEXT:    [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8
147   // CHECK-NEXT:    %matrix = call <80 x i32> @llvm.matrix.column.major.load.v80i32(i32* align 4 [[PTR]], i64 [[STRIDE_EXT]], i1 false, i32 4, i32 20)
148 
149   ix4x20_t m_c = __builtin_matrix_column_major_load(Ptr, 4, 20, S + 32);
150 }
151 
column_major_load_array1(double Ptr[25])152 void column_major_load_array1(double Ptr[25]) {
153   // CHECK-LABEL: define void @column_major_load_array1(double* %Ptr)
154   // CHECK:         [[ADDR:%.*]] = load double*, double** %Ptr.addr, align 8
155   // CHECK-NEXT:    call <25 x double> @llvm.matrix.column.major.load.v25f64(double* align 8 [[ADDR]], i64 5, i1 false, i32 5, i32 5)
156 
157   dx5x5_t m = __builtin_matrix_column_major_load(Ptr, 5, 5, 5);
158 }
159 
column_major_load_array2()160 void column_major_load_array2() {
161   // CHECK-LABEL: define void @column_major_load_array2() #0 {
162   // CHECK-NEXT:  entry:
163   // CHECK-NEXT:    [[PTR:%.*]] = alloca [25 x double], align 16
164   // CHECK:         [[ARRAY_DEC:%.*]] = getelementptr inbounds [25 x double], [25 x double]* [[PTR]], i64 0, i64 0
165   // CHECK-NEXT:    call <25 x double> @llvm.matrix.column.major.load.v25f64(double* align 16 [[ARRAY_DEC]], i64 5, i1 false, i32 5, i32 5)
166 
167   double Ptr[25];
168   dx5x5_t m = __builtin_matrix_column_major_load(Ptr, 5, 5, 5);
169 }
170 
column_major_load_const(const double * Ptr)171 void column_major_load_const(const double *Ptr) {
172   // CHECK-LABEL: define void @column_major_load_const(double* %Ptr)
173   // CHECK:         [[PTR:%.*]] = load double*, double** %Ptr.addr, align 8
174   // CHECK-NEXT:    call <25 x double> @llvm.matrix.column.major.load.v25f64(double* align 8 [[PTR]], i64 5, i1 false, i32 5, i32 5)
175 
176   dx5x5_t m_a1 = __builtin_matrix_column_major_load(Ptr, 5, 5, 5);
177 }
178 
column_major_load_volatile(volatile double * Ptr)179 void column_major_load_volatile(volatile double *Ptr) {
180   // CHECK-LABEL: define void @column_major_load_volatile(double* %Ptr)
181   // CHECK:         [[PTR:%.*]] = load double*, double** %Ptr.addr, align 8
182   // CHECK-NEXT:    call <25 x double> @llvm.matrix.column.major.load.v25f64(double* align 8 [[PTR]], i64 5, i1 true, i32 5, i32 5)
183 
184   dx5x5_t m_a1 = __builtin_matrix_column_major_load(Ptr, 5, 5, 5);
185 }
186 
column_major_store_with_const_stride_double(double * Ptr)187 void column_major_store_with_const_stride_double(double *Ptr) {
188   // CHECK-LABEL: define void @column_major_store_with_const_stride_double(double* %Ptr)
189   // CHECK:         [[M:%.*]] = load <25 x double>, <25 x double>* {{.*}}, align 8
190   // CHECK-NEXT:    [[PTR:%.*]] = load double*, double** %Ptr.addr, align 8
191   // CHECK-NEXT:    call void @llvm.matrix.column.major.store.v25f64(<25 x double> [[M]], double* align 8 [[PTR]], i64 5, i1 false, i32 5, i32 5)
192 
193   dx5x5_t m;
194   __builtin_matrix_column_major_store(m, Ptr, 5);
195 }
196 
column_major_store_with_const_stride2_double(double * Ptr)197 void column_major_store_with_const_stride2_double(double *Ptr) {
198   // CHECK-LABEL: define void @column_major_store_with_const_stride2_double(double* %Ptr)
199   // CHECK:         [[M:%.*]] = load <25 x double>, <25 x double>* {{.*}}, align 8
200   // CHECK-NEXT:    [[PTR:%.*]] = load double*, double** %Ptr.addr, align 8
201   // CHECK-NEXT:    call void @llvm.matrix.column.major.store.v25f64(<25 x double> [[M]], double* align 8 [[PTR]], i64 15, i1 false, i32 5, i32 5)
202   //
203   dx5x5_t m;
204   __builtin_matrix_column_major_store(m, Ptr, 2 * 3 + 9);
205 }
206 
column_major_store_with_stride_math_int(int * Ptr,int S)207 void column_major_store_with_stride_math_int(int *Ptr, int S) {
208   // CHECK-LABEL: define void @column_major_store_with_stride_math_int(i32* %Ptr, i32 %S)
209   // CHECK:         [[M:%.*]] = load <80 x i32>, <80 x i32>* {{.*}}, align 4
210   // CHECK-NEXT:    [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8
211   // CHECK-NEXT:    [[S:%.*]] = load i32, i32* %S.addr, align 4
212   // CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[S]], 32
213   // CHECK-NEXT:    [[IDX:%.*]] = sext i32 [[ADD]] to i64
214   // CHECK-NEXT:    call void @llvm.matrix.column.major.store.v80i32(<80 x i32> [[M]], i32* align 4 [[PTR]], i64 [[IDX]], i1 false, i32 4, i32 20)
215 
216   ix4x20_t m;
217   __builtin_matrix_column_major_store(m, Ptr, S + 32);
218 }
219 
column_major_store_with_stride_math_s_int(int * Ptr,short S)220 void column_major_store_with_stride_math_s_int(int *Ptr, short S) {
221   // CHECK-LABEL: define void @column_major_store_with_stride_math_s_int(i32* %Ptr, i16 signext %S)
222   // CHECK:         [[M:%.*]] = load <80 x i32>, <80 x i32>* {{.*}}, align 4
223   // CHECK-NEXT:    [[PTR:%.*]] = load i32*, i32** %Ptr.addr, align 8
224   // CHECK-NEXT:    [[S:%.*]] = load i16, i16* %S.addr, align 2
225   // CHECK-NEXT:    [[EXT:%.*]] = sext i16 [[S]] to i32
226   // CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[EXT]], 2
227   // CHECK-NEXT:    [[IDX:%.*]] = sext i32 [[ADD]] to i64
228   // CHECK-NEXT:    call void @llvm.matrix.column.major.store.v80i32(<80 x i32> [[M]], i32* align 4 [[PTR]], i64 [[IDX]], i1 false, i32 4, i32 20)
229 
230   ix4x20_t m;
231   __builtin_matrix_column_major_store(m, Ptr, S + 2);
232 }
233 
column_major_store_array1(double Ptr[25])234 void column_major_store_array1(double Ptr[25]) {
235   // CHECK-LABEL: define void @column_major_store_array1(double* %Ptr)
236   // CHECK:         [[M:%.*]] = load <25 x double>, <25 x double>* {{.*}}, align 8
237   // CHECK-NEXT:    [[PTR:%.*]] = load double*, double** %Ptr.addr, align 8
238   // CHECK-NEXT:    call void @llvm.matrix.column.major.store.v25f64(<25 x double> [[M]], double* align 8 [[PTR]], i64 5, i1 false, i32 5, i32 5)
239 
240   dx5x5_t m;
241   __builtin_matrix_column_major_store(m, Ptr, 5);
242 }
243 
column_major_store_array2()244 void column_major_store_array2() {
245   // CHECK-LABEL: define void @column_major_store_array2()
246   // CHECK:         [[M:%.*]] = load <25 x double>, <25 x double>* {{.*}}, align 8
247   // CHECK-NEXT:    [[PTR:%.*]] = getelementptr inbounds [25 x double], [25 x double]* %Ptr, i64 0, i64 0
248   // CHECK-NEXT:    call void @llvm.matrix.column.major.store.v25f64(<25 x double> [[M]], double* align 16 [[PTR]], i64 5, i1 false, i32 5, i32 5)
249 
250   double Ptr[25];
251   dx5x5_t m;
252   __builtin_matrix_column_major_store(m, Ptr, 5);
253 }
254 
column_major_store_volatile(volatile double * Ptr)255 void column_major_store_volatile(volatile double *Ptr) {
256   // CHECK-LABEL: define void @column_major_store_volatile(double* %Ptr) #0 {
257   // CHECK:         [[M:%.*]] = load <25 x double>, <25 x double>* {{.*}}, align 8
258   // CHECK-NEXT:    [[PTR:%.*]] = load double*, double** %Ptr.addr, align 8
259   // CHECK-NEXT:    call void @llvm.matrix.column.major.store.v25f64(<25 x double> [[M]], double* align 8 [[PTR]], i64 5, i1 true, i32 5, i32 5)
260 
261   dx5x5_t m;
262   __builtin_matrix_column_major_store(m, Ptr, 5);
263 }
264