1 #include <arm_neon.h>
2 #include "arm-neon-ref.h"
3 #include "compute-ref-data.h"
4 
5 /* Expected results.  */
6 
7 /* vld2_dup/chunk 0.  */
8 VECT_VAR_DECL(expected_vld2_0,int,8,8) [] = { 0xf0, 0xf1, 0xf0, 0xf1,
9 				       0xf0, 0xf1, 0xf0, 0xf1 };
10 VECT_VAR_DECL(expected_vld2_0,int,16,4) [] = { 0xfff0, 0xfff1, 0xfff0, 0xfff1 };
11 VECT_VAR_DECL(expected_vld2_0,int,32,2) [] = { 0xfffffff0, 0xfffffff1 };
12 VECT_VAR_DECL(expected_vld2_0,int,64,1) [] = { 0xfffffffffffffff0 };
13 VECT_VAR_DECL(expected_vld2_0,uint,8,8) [] = { 0xf0, 0xf1, 0xf0, 0xf1,
14 					0xf0, 0xf1, 0xf0, 0xf1 };
15 VECT_VAR_DECL(expected_vld2_0,uint,16,4) [] = { 0xfff0, 0xfff1, 0xfff0, 0xfff1 };
16 VECT_VAR_DECL(expected_vld2_0,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 };
17 VECT_VAR_DECL(expected_vld2_0,uint,64,1) [] = { 0xfffffffffffffff0 };
18 VECT_VAR_DECL(expected_vld2_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf0, 0xf1,
19 					0xf0, 0xf1, 0xf0, 0xf1 };
20 VECT_VAR_DECL(expected_vld2_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff0, 0xfff1 };
21 VECT_VAR_DECL(expected_vld2_0,hfloat,16,4) [] = {0xcc00, 0xcb80, 0xcc00, 0xcb80 };
22 VECT_VAR_DECL(expected_vld2_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
23 
24 /* vld2_dup/chunk 1.  */
25 VECT_VAR_DECL(expected_vld2_1,int,8,8) [] = { 0xf0, 0xf1, 0xf0, 0xf1,
26 					      0xf0, 0xf1, 0xf0, 0xf1 };
27 VECT_VAR_DECL(expected_vld2_1,int,16,4) [] = { 0xfff0, 0xfff1, 0xfff0, 0xfff1 };
28 VECT_VAR_DECL(expected_vld2_1,int,32,2) [] = { 0xfffffff0, 0xfffffff1 };
29 VECT_VAR_DECL(expected_vld2_1,int,64,1) [] = { 0xfffffffffffffff1 };
30 VECT_VAR_DECL(expected_vld2_1,uint,8,8) [] = { 0xf0, 0xf1, 0xf0, 0xf1,
31 					       0xf0, 0xf1, 0xf0, 0xf1 };
32 VECT_VAR_DECL(expected_vld2_1,uint,16,4) [] = { 0xfff0, 0xfff1, 0xfff0, 0xfff1 };
33 VECT_VAR_DECL(expected_vld2_1,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 };
34 VECT_VAR_DECL(expected_vld2_1,uint,64,1) [] = { 0xfffffffffffffff1 };
35 VECT_VAR_DECL(expected_vld2_1,poly,8,8) [] = { 0xf0, 0xf1, 0xf0, 0xf1,
36 					       0xf0, 0xf1, 0xf0, 0xf1 };
37 VECT_VAR_DECL(expected_vld2_1,poly,16,4) [] = { 0xfff0, 0xfff1,
38 						0xfff0, 0xfff1 };
39 VECT_VAR_DECL(expected_vld2_1,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcc00, 0xcb80 };
40 VECT_VAR_DECL(expected_vld2_1,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
41 
42 /* vld3_dup/chunk 0.  */
43 VECT_VAR_DECL(expected_vld3_0,int,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf0,
44 					      0xf1, 0xf2, 0xf0, 0xf1 };
45 VECT_VAR_DECL(expected_vld3_0,int,16,4) [] = { 0xfff0, 0xfff1,
46 					       0xfff2, 0xfff0 };
47 VECT_VAR_DECL(expected_vld3_0,int,32,2) [] = { 0xfffffff0, 0xfffffff1 };
48 VECT_VAR_DECL(expected_vld3_0,int,64,1) [] = { 0xfffffffffffffff0 };
49 VECT_VAR_DECL(expected_vld3_0,uint,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf0,
50 					       0xf1, 0xf2, 0xf0, 0xf1 };
51 VECT_VAR_DECL(expected_vld3_0,uint,16,4) [] = { 0xfff0, 0xfff1,
52 						0xfff2, 0xfff0 };
53 VECT_VAR_DECL(expected_vld3_0,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 };
54 VECT_VAR_DECL(expected_vld3_0,uint,64,1) [] = { 0xfffffffffffffff0 };
55 VECT_VAR_DECL(expected_vld3_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf0,
56 					       0xf1, 0xf2, 0xf0, 0xf1 };
57 VECT_VAR_DECL(expected_vld3_0,poly,16,4) [] = { 0xfff0, 0xfff1,
58 						0xfff2, 0xfff0 };
59 VECT_VAR_DECL(expected_vld3_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xcc00 };
60 VECT_VAR_DECL(expected_vld3_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
61 
62 /* vld3_dup/chunk 1.  */
63 VECT_VAR_DECL(expected_vld3_1,int,8,8) [] = { 0xf2, 0xf0, 0xf1, 0xf2,
64 					      0xf0, 0xf1, 0xf2, 0xf0 };
65 VECT_VAR_DECL(expected_vld3_1,int,16,4) [] = { 0xfff1, 0xfff2,
66 					       0xfff0, 0xfff1 };
67 VECT_VAR_DECL(expected_vld3_1,int,32,2) [] = { 0xfffffff2, 0xfffffff0 };
68 VECT_VAR_DECL(expected_vld3_1,int,64,1) [] = { 0xfffffffffffffff1 };
69 VECT_VAR_DECL(expected_vld3_1,uint,8,8) [] = { 0xf2, 0xf0, 0xf1, 0xf2,
70 					       0xf0, 0xf1, 0xf2, 0xf0 };
71 VECT_VAR_DECL(expected_vld3_1,uint,16,4) [] = { 0xfff1, 0xfff2,
72 						0xfff0, 0xfff1 };
73 VECT_VAR_DECL(expected_vld3_1,uint,32,2) [] = { 0xfffffff2, 0xfffffff0 };
74 VECT_VAR_DECL(expected_vld3_1,uint,64,1) [] = { 0xfffffffffffffff1 };
75 VECT_VAR_DECL(expected_vld3_1,poly,8,8) [] = { 0xf2, 0xf0, 0xf1, 0xf2,
76 					       0xf0, 0xf1, 0xf2, 0xf0 };
77 VECT_VAR_DECL(expected_vld3_1,poly,16,4) [] = { 0xfff1, 0xfff2,
78 						0xfff0, 0xfff1 };
79 VECT_VAR_DECL(expected_vld3_1,hfloat,16,4) [] = { 0xcb80, 0xcb00, 0xcc00, 0xcb80 };
80 VECT_VAR_DECL(expected_vld3_1,hfloat,32,2) [] = { 0xc1600000, 0xc1800000 };
81 
82 /* vld3_dup/chunk 2.  */
83 VECT_VAR_DECL(expected_vld3_2,int,8,8) [] = { 0xf1, 0xf2, 0xf0, 0xf1,
84 					      0xf2, 0xf0, 0xf1, 0xf2 };
85 VECT_VAR_DECL(expected_vld3_2,int,16,4) [] = { 0xfff2, 0xfff0,
86 					       0xfff1, 0xfff2 };
87 VECT_VAR_DECL(expected_vld3_2,int,32,2) [] = { 0xfffffff1, 0xfffffff2 };
88 VECT_VAR_DECL(expected_vld3_2,int,64,1) [] = { 0xfffffffffffffff2 };
89 VECT_VAR_DECL(expected_vld3_2,uint,8,8) [] = { 0xf1, 0xf2, 0xf0, 0xf1,
90 					       0xf2, 0xf0, 0xf1, 0xf2 };
91 VECT_VAR_DECL(expected_vld3_2,uint,16,4) [] = { 0xfff2, 0xfff0,
92 						0xfff1, 0xfff2 };
93 VECT_VAR_DECL(expected_vld3_2,uint,32,2) [] = { 0xfffffff1, 0xfffffff2 };
94 VECT_VAR_DECL(expected_vld3_2,uint,64,1) [] = { 0xfffffffffffffff2 };
95 VECT_VAR_DECL(expected_vld3_2,poly,8,8) [] = { 0xf1, 0xf2, 0xf0, 0xf1,
96 					       0xf2, 0xf0, 0xf1, 0xf2 };
97 VECT_VAR_DECL(expected_vld3_2,poly,16,4) [] = { 0xfff2, 0xfff0,
98 						0xfff1, 0xfff2 };
99 VECT_VAR_DECL(expected_vld3_2,hfloat,16,4) [] = { 0xcb00, 0xcc00, 0xcb80, 0xcb00 };
100 VECT_VAR_DECL(expected_vld3_2,hfloat,32,2) [] = { 0xc1700000, 0xc1600000 };
101 
102 /* vld4_dup/chunk 0.  */
103 VECT_VAR_DECL(expected_vld4_0,int,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
104 					      0xf0, 0xf1, 0xf2, 0xf3 };
105 VECT_VAR_DECL(expected_vld4_0,int,16,4) [] = { 0xfff0, 0xfff1,
106 					       0xfff2, 0xfff3 };
107 VECT_VAR_DECL(expected_vld4_0,int,32,2) [] = { 0xfffffff0, 0xfffffff1 };
108 VECT_VAR_DECL(expected_vld4_0,int,64,1) [] = { 0xfffffffffffffff0 };
109 VECT_VAR_DECL(expected_vld4_0,uint,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
110 					       0xf0, 0xf1, 0xf2, 0xf3 };
111 VECT_VAR_DECL(expected_vld4_0,uint,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
112 VECT_VAR_DECL(expected_vld4_0,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 };
113 VECT_VAR_DECL(expected_vld4_0,uint,64,1) [] = { 0xfffffffffffffff0 };
114 VECT_VAR_DECL(expected_vld4_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
115 					       0xf0, 0xf1, 0xf2, 0xf3 };
116 VECT_VAR_DECL(expected_vld4_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
117 VECT_VAR_DECL(expected_vld4_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
118 VECT_VAR_DECL(expected_vld4_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
119 
120 /* vld4_dup/chunk 1.  */
121 VECT_VAR_DECL(expected_vld4_1,int,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
122 					      0xf0, 0xf1, 0xf2, 0xf3 };
123 VECT_VAR_DECL(expected_vld4_1,int,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
124 VECT_VAR_DECL(expected_vld4_1,int,32,2) [] = { 0xfffffff2, 0xfffffff3 };
125 VECT_VAR_DECL(expected_vld4_1,int,64,1) [] = { 0xfffffffffffffff1 };
126 VECT_VAR_DECL(expected_vld4_1,uint,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
127 					       0xf0, 0xf1, 0xf2, 0xf3 };
128 VECT_VAR_DECL(expected_vld4_1,uint,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
129 VECT_VAR_DECL(expected_vld4_1,uint,32,2) [] = { 0xfffffff2, 0xfffffff3 };
130 VECT_VAR_DECL(expected_vld4_1,uint,64,1) [] = { 0xfffffffffffffff1 };
131 VECT_VAR_DECL(expected_vld4_1,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
132 					       0xf0, 0xf1, 0xf2, 0xf3 };
133 VECT_VAR_DECL(expected_vld4_1,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
134 VECT_VAR_DECL(expected_vld4_1,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
135 VECT_VAR_DECL(expected_vld4_1,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 };
136 
137 /* vld4_dup/chunk 2.  */
138 VECT_VAR_DECL(expected_vld4_2,int,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
139 					      0xf0, 0xf1, 0xf2, 0xf3 };
140 VECT_VAR_DECL(expected_vld4_2,int,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
141 VECT_VAR_DECL(expected_vld4_2,int,32,2) [] = { 0xfffffff0, 0xfffffff1 };
142 VECT_VAR_DECL(expected_vld4_2,int,64,1) [] = { 0xfffffffffffffff2 };
143 VECT_VAR_DECL(expected_vld4_2,uint,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
144 					       0xf0, 0xf1, 0xf2, 0xf3 };
145 VECT_VAR_DECL(expected_vld4_2,uint,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
146 VECT_VAR_DECL(expected_vld4_2,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 };
147 VECT_VAR_DECL(expected_vld4_2,uint,64,1) [] = { 0xfffffffffffffff2 };
148 VECT_VAR_DECL(expected_vld4_2,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
149 					       0xf0, 0xf1, 0xf2, 0xf3 };
150 VECT_VAR_DECL(expected_vld4_2,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
151 VECT_VAR_DECL(expected_vld4_2,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
152 VECT_VAR_DECL(expected_vld4_2,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
153 
154 /* vld4_dup/chunk3.  */
155 VECT_VAR_DECL(expected_vld4_3,int,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
156 					      0xf0, 0xf1, 0xf2, 0xf3 };
157 VECT_VAR_DECL(expected_vld4_3,int,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
158 VECT_VAR_DECL(expected_vld4_3,int,32,2) [] = { 0xfffffff2, 0xfffffff3 };
159 VECT_VAR_DECL(expected_vld4_3,int,64,1) [] = { 0xfffffffffffffff3 };
160 VECT_VAR_DECL(expected_vld4_3,uint,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
161 					       0xf0, 0xf1, 0xf2, 0xf3 };
162 VECT_VAR_DECL(expected_vld4_3,uint,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
163 VECT_VAR_DECL(expected_vld4_3,uint,32,2) [] = { 0xfffffff2, 0xfffffff3 };
164 VECT_VAR_DECL(expected_vld4_3,uint,64,1) [] = { 0xfffffffffffffff3 };
165 VECT_VAR_DECL(expected_vld4_3,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
166 					       0xf0, 0xf1, 0xf2, 0xf3 };
167 VECT_VAR_DECL(expected_vld4_3,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
168 VECT_VAR_DECL(expected_vld4_3,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
169 VECT_VAR_DECL(expected_vld4_3,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 };
170 
exec_vldX_dup(void)171 void exec_vldX_dup (void)
172 {
173   /* In this case, input variables are arrays of vectors.  */
174 #define DECL_VLDX_DUP(T1, W, N, X)					\
175   VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vector, T1, W, N, X);	\
176   VECT_VAR_DECL(result_bis_##X, T1, W, N)[X * N]
177 
178   /* We need to use a temporary result buffer (result_bis), because
179      the one used for other tests is not large enough. A subset of the
180      result data is moved from result_bis to result, and it is this
181      subset which is used to check the actual behavior. The next
182      macro enables to move another chunk of data from result_bis to
183      result.  */
184 #define TEST_VLDX_DUP(Q, T1, T2, W, N, X)				\
185   VECT_ARRAY_VAR(vector, T1, W, N, X) =					\
186     vld##X##Q##_dup_##T2##W(&VECT_VAR(buffer_dup, T1, W, N)[0]);	\
187 									\
188   vst##X##Q##_##T2##W(VECT_VAR(result_bis_##X, T1, W, N),		\
189 		      VECT_ARRAY_VAR(vector, T1, W, N, X));		\
190   memcpy(VECT_VAR(result, T1, W, N), VECT_VAR(result_bis_##X, T1, W, N), \
191 	 sizeof(VECT_VAR(result, T1, W, N)));
192 
193 
194   /* Overwrite "result" with the contents of "result_bis"[Y].  */
195 #define TEST_EXTRA_CHUNK(T1, W, N, X,Y)			\
196   memcpy(VECT_VAR(result, T1, W, N),			\
197 	 &(VECT_VAR(result_bis_##X, T1, W, N)[Y*N]),	\
198 	 sizeof(VECT_VAR(result, T1, W, N)));
199 
200 #define DECL_ALL_VLDX_DUP_NO_FP16(X)		\
201   DECL_VLDX_DUP(int, 8, 8, X);			\
202   DECL_VLDX_DUP(int, 16, 4, X);			\
203   DECL_VLDX_DUP(int, 32, 2, X);			\
204   DECL_VLDX_DUP(int, 64, 1, X);			\
205   DECL_VLDX_DUP(uint, 8, 8, X);			\
206   DECL_VLDX_DUP(uint, 16, 4, X);		\
207   DECL_VLDX_DUP(uint, 32, 2, X);		\
208   DECL_VLDX_DUP(uint, 64, 1, X);		\
209   DECL_VLDX_DUP(poly, 8, 8, X);			\
210   DECL_VLDX_DUP(poly, 16, 4, X);		\
211   DECL_VLDX_DUP(float, 32, 2, X)
212 
213 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
214 #define DECL_ALL_VLDX_DUP(X)		\
215   DECL_ALL_VLDX_DUP_NO_FP16(X);		\
216   DECL_VLDX_DUP(float, 16, 4, X)
217 #else
218 #define DECL_ALL_VLDX_DUP(X) DECL_ALL_VLDX_DUP_NO_FP16(X)
219 #endif
220 
221 #define TEST_ALL_VLDX_DUP_NO_FP16(X)		\
222   TEST_VLDX_DUP(, int, s, 8, 8, X);		\
223   TEST_VLDX_DUP(, int, s, 16, 4, X);		\
224   TEST_VLDX_DUP(, int, s, 32, 2, X);		\
225   TEST_VLDX_DUP(, int, s, 64, 1, X);		\
226   TEST_VLDX_DUP(, uint, u, 8, 8, X);		\
227   TEST_VLDX_DUP(, uint, u, 16, 4, X);		\
228   TEST_VLDX_DUP(, uint, u, 32, 2, X);		\
229   TEST_VLDX_DUP(, uint, u, 64, 1, X);		\
230   TEST_VLDX_DUP(, poly, p, 8, 8, X);		\
231   TEST_VLDX_DUP(, poly, p, 16, 4, X);		\
232   TEST_VLDX_DUP(, float, f, 32, 2, X)
233 
234 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
235 #define TEST_ALL_VLDX_DUP(X)		\
236   TEST_ALL_VLDX_DUP_NO_FP16(X);		\
237   TEST_VLDX_DUP(, float, f, 16, 4, X)
238 #else
239 #define TEST_ALL_VLDX_DUP(X) TEST_ALL_VLDX_DUP_NO_FP16(X)
240 #endif
241 
242 #define TEST_ALL_EXTRA_CHUNKS_NO_FP16(X, Y)	\
243   TEST_EXTRA_CHUNK(int, 8, 8, X, Y);		\
244   TEST_EXTRA_CHUNK(int, 16, 4, X, Y);		\
245   TEST_EXTRA_CHUNK(int, 32, 2, X, Y);		\
246   TEST_EXTRA_CHUNK(int, 64, 1, X, Y);		\
247   TEST_EXTRA_CHUNK(uint, 8, 8, X, Y);		\
248   TEST_EXTRA_CHUNK(uint, 16, 4, X, Y);		\
249   TEST_EXTRA_CHUNK(uint, 32, 2, X, Y);		\
250   TEST_EXTRA_CHUNK(uint, 64, 1, X, Y);		\
251   TEST_EXTRA_CHUNK(poly, 8, 8, X, Y);		\
252   TEST_EXTRA_CHUNK(poly, 16, 4, X, Y);		\
253   TEST_EXTRA_CHUNK(float, 32, 2, X, Y)
254 
255 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
256 #define TEST_ALL_EXTRA_CHUNKS(X, Y)	\
257   TEST_ALL_EXTRA_CHUNKS_NO_FP16(X, Y);	\
258   TEST_EXTRA_CHUNK(float, 16, 4, X, Y)
259 #else
260 #define TEST_ALL_EXTRA_CHUNKS(X, Y) TEST_ALL_EXTRA_CHUNKS_NO_FP16(X, Y)
261 #endif
262 
263   /* vldX_dup supports only 64-bit inputs.  */
264 #define CHECK_RESULTS_VLDX_DUP_NO_FP16(test_name,EXPECTED,comment)	\
265     CHECK(test_name, int, 8, 8, PRIx8, EXPECTED, comment);		\
266     CHECK(test_name, int, 16, 4, PRIx16, EXPECTED, comment);		\
267     CHECK(test_name, int, 32, 2, PRIx32, EXPECTED, comment);		\
268     CHECK(test_name, int, 64, 1, PRIx64, EXPECTED, comment);		\
269     CHECK(test_name, uint, 8, 8, PRIx8, EXPECTED, comment);		\
270     CHECK(test_name, uint, 16, 4, PRIx16, EXPECTED, comment);		\
271     CHECK(test_name, uint, 32, 2, PRIx32, EXPECTED, comment);		\
272     CHECK(test_name, uint, 64, 1, PRIx64, EXPECTED, comment);		\
273     CHECK_POLY(test_name, poly, 8, 8, PRIx8, EXPECTED, comment);	\
274     CHECK_POLY(test_name, poly, 16, 4, PRIx16, EXPECTED, comment);	\
275     CHECK_FP(test_name, float, 32, 2, PRIx32, EXPECTED, comment)
276 
277 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
278 #define CHECK_RESULTS_VLDX_DUP(test_name,EXPECTED,comment)		\
279   {									\
280     CHECK_RESULTS_VLDX_DUP_NO_FP16(test_name,EXPECTED,comment);		\
281     CHECK_FP(test_name, float, 16, 4, PRIx16, EXPECTED, comment);	\
282   }
283 #else
284 #define CHECK_RESULTS_VLDX_DUP(test_name,EXPECTED,comment)		\
285   {									\
286     CHECK_RESULTS_VLDX_DUP_NO_FP16(test_name,EXPECTED,comment);		\
287   }
288 #endif
289 
290   DECL_ALL_VLDX_DUP(2);
291   DECL_ALL_VLDX_DUP(3);
292   DECL_ALL_VLDX_DUP(4);
293 
294   /* Special input buffers of suitable size are needed for vld2/vld3/vld4.  */
295   /* Input buffers for vld2, 1 of each size */
296   VECT_ARRAY_INIT2(buffer_vld2, int, 8, 8);
297   PAD(buffer_vld2_pad, int, 8, 8);
298   VECT_ARRAY_INIT2(buffer_vld2, int, 16, 4);
299   PAD(buffer_vld2_pad, int, 16, 4);
300   VECT_ARRAY_INIT2(buffer_vld2, int, 32, 2);
301   PAD(buffer_vld2_pad, int, 32, 2);
302   VECT_ARRAY_INIT2(buffer_vld2, int, 64, 1);
303   PAD(buffer_vld2_pad, int, 64, 1);
304   VECT_ARRAY_INIT2(buffer_vld2, uint, 8, 8);
305   PAD(buffer_vld2_pad, uint, 8, 8);
306   VECT_ARRAY_INIT2(buffer_vld2, uint, 16, 4);
307   PAD(buffer_vld2_pad, uint, 16, 4);
308   VECT_ARRAY_INIT2(buffer_vld2, uint, 32, 2);
309   PAD(buffer_vld2_pad, uint, 32, 2);
310   VECT_ARRAY_INIT2(buffer_vld2, uint, 64, 1);
311   PAD(buffer_vld2_pad, uint, 64, 1);
312   VECT_ARRAY_INIT2(buffer_vld2, poly, 8, 8);
313   PAD(buffer_vld2_pad, poly, 8, 8);
314   VECT_ARRAY_INIT2(buffer_vld2, poly, 16, 4);
315   PAD(buffer_vld2_pad, poly, 16, 4);
316 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
317   VECT_ARRAY_INIT2(buffer_vld2, float, 16, 4);
318   PAD(buffer_vld2_pad, float, 16, 4);
319 #endif
320   VECT_ARRAY_INIT2(buffer_vld2, float, 32, 2);
321   PAD(buffer_vld2_pad, float, 32, 2);
322 
323   VECT_ARRAY_INIT2(buffer_vld2, int, 8, 16);
324   PAD(buffer_vld2_pad, int, 8, 16);
325   VECT_ARRAY_INIT2(buffer_vld2, int, 16, 8);
326   PAD(buffer_vld2_pad, int, 16, 8);
327   VECT_ARRAY_INIT2(buffer_vld2, int, 32, 4);
328   PAD(buffer_vld2_pad, int, 32, 4);
329   VECT_ARRAY_INIT2(buffer_vld2, int, 64, 2);
330   PAD(buffer_vld2_pad, int, 64, 2);
331   VECT_ARRAY_INIT2(buffer_vld2, uint, 8, 16);
332   PAD(buffer_vld2_pad, uint, 8, 16);
333   VECT_ARRAY_INIT2(buffer_vld2, uint, 16, 8);
334   PAD(buffer_vld2_pad, uint, 16, 8);
335   VECT_ARRAY_INIT2(buffer_vld2, uint, 32, 4);
336   PAD(buffer_vld2_pad, uint, 32, 4);
337   VECT_ARRAY_INIT2(buffer_vld2, uint, 64, 2);
338   PAD(buffer_vld2_pad, uint, 64, 2);
339   VECT_ARRAY_INIT2(buffer_vld2, poly, 8, 16);
340   PAD(buffer_vld2_pad, poly, 8, 16);
341   VECT_ARRAY_INIT2(buffer_vld2, poly, 16, 8);
342   PAD(buffer_vld2_pad, poly, 16, 8);
343 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
344   VECT_ARRAY_INIT2(buffer_vld2, float, 16, 8);
345   PAD(buffer_vld2_pad, float, 16, 8);
346 #endif
347   VECT_ARRAY_INIT2(buffer_vld2, float, 32, 4);
348   PAD(buffer_vld2_pad, float, 32, 4);
349 
350   /* Input buffers for vld3, 1 of each size */
351   VECT_ARRAY_INIT3(buffer_vld3, int, 8, 8);
352   PAD(buffer_vld3_pad, int, 8, 8);
353   VECT_ARRAY_INIT3(buffer_vld3, int, 16, 4);
354   PAD(buffer_vld3_pad, int, 16, 4);
355   VECT_ARRAY_INIT3(buffer_vld3, int, 32, 2);
356   PAD(buffer_vld3_pad, int, 32, 2);
357   VECT_ARRAY_INIT3(buffer_vld3, int, 64, 1);
358   PAD(buffer_vld3_pad, int, 64, 1);
359   VECT_ARRAY_INIT3(buffer_vld3, uint, 8, 8);
360   PAD(buffer_vld3_pad, uint, 8, 8);
361   VECT_ARRAY_INIT3(buffer_vld3, uint, 16, 4);
362   PAD(buffer_vld3_pad, uint, 16, 4);
363   VECT_ARRAY_INIT3(buffer_vld3, uint, 32, 2);
364   PAD(buffer_vld3_pad, uint, 32, 2);
365   VECT_ARRAY_INIT3(buffer_vld3, uint, 64, 1);
366   PAD(buffer_vld3_pad, uint, 64, 1);
367   VECT_ARRAY_INIT3(buffer_vld3, poly, 8, 8);
368   PAD(buffer_vld3_pad, poly, 8, 8);
369   VECT_ARRAY_INIT3(buffer_vld3, poly, 16, 4);
370   PAD(buffer_vld3_pad, poly, 16, 4);
371 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
372   VECT_ARRAY_INIT3(buffer_vld3, float, 16, 4);
373   PAD(buffer_vld3_pad, float, 16, 4);
374 #endif
375   VECT_ARRAY_INIT3(buffer_vld3, float, 32, 2);
376   PAD(buffer_vld3_pad, float, 32, 2);
377 
378   VECT_ARRAY_INIT3(buffer_vld3, int, 8, 16);
379   PAD(buffer_vld3_pad, int, 8, 16);
380   VECT_ARRAY_INIT3(buffer_vld3, int, 16, 8);
381   PAD(buffer_vld3_pad, int, 16, 8);
382   VECT_ARRAY_INIT3(buffer_vld3, int, 32, 4);
383   PAD(buffer_vld3_pad, int, 32, 4);
384   VECT_ARRAY_INIT3(buffer_vld3, int, 64, 2);
385   PAD(buffer_vld3_pad, int, 64, 2);
386   VECT_ARRAY_INIT3(buffer_vld3, uint, 8, 16);
387   PAD(buffer_vld3_pad, uint, 8, 16);
388   VECT_ARRAY_INIT3(buffer_vld3, uint, 16, 8);
389   PAD(buffer_vld3_pad, uint, 16, 8);
390   VECT_ARRAY_INIT3(buffer_vld3, uint, 32, 4);
391   PAD(buffer_vld3_pad, uint, 32, 4);
392   VECT_ARRAY_INIT3(buffer_vld3, uint, 64, 2);
393   PAD(buffer_vld3_pad, uint, 64, 2);
394   VECT_ARRAY_INIT3(buffer_vld3, poly, 8, 16);
395   PAD(buffer_vld3_pad, poly, 8, 16);
396   VECT_ARRAY_INIT3(buffer_vld3, poly, 16, 8);
397   PAD(buffer_vld3_pad, poly, 16, 8);
398 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
399   VECT_ARRAY_INIT3(buffer_vld3, float, 16, 8);
400   PAD(buffer_vld3_pad, float, 16, 8);
401 #endif
402   VECT_ARRAY_INIT3(buffer_vld3, float, 32, 4);
403   PAD(buffer_vld3_pad, float, 32, 4);
404 
405   /* Input buffers for vld4, 1 of each size */
406   VECT_ARRAY_INIT4(buffer_vld4, int, 8, 8);
407   PAD(buffer_vld4_pad, int, 8, 8);
408   VECT_ARRAY_INIT4(buffer_vld4, int, 16, 4);
409   PAD(buffer_vld4_pad, int, 16, 4);
410   VECT_ARRAY_INIT4(buffer_vld4, int, 32, 2);
411   PAD(buffer_vld4_pad, int, 32, 2);
412   VECT_ARRAY_INIT4(buffer_vld4, int, 64, 1);
413   PAD(buffer_vld4_pad, int, 64, 1);
414   VECT_ARRAY_INIT4(buffer_vld4, uint, 8, 8);
415   PAD(buffer_vld4_pad, uint, 8, 8);
416   VECT_ARRAY_INIT4(buffer_vld4, uint, 16, 4);
417   PAD(buffer_vld4_pad, uint, 16, 4);
418   VECT_ARRAY_INIT4(buffer_vld4, uint, 32, 2);
419   PAD(buffer_vld4_pad, uint, 32, 2);
420   VECT_ARRAY_INIT4(buffer_vld4, uint, 64, 1);
421   PAD(buffer_vld4_pad, uint, 64, 1);
422   VECT_ARRAY_INIT4(buffer_vld4, poly, 8, 8);
423   PAD(buffer_vld4_pad, poly, 8, 8);
424   VECT_ARRAY_INIT4(buffer_vld4, poly, 16, 4);
425   PAD(buffer_vld4_pad, poly, 16, 4);
426 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
427   VECT_ARRAY_INIT4(buffer_vld4, float, 16, 4);
428   PAD(buffer_vld4_pad, float, 16, 4);
429 #endif
430   VECT_ARRAY_INIT4(buffer_vld4, float, 32, 2);
431   PAD(buffer_vld4_pad, float, 32, 2);
432 
433   VECT_ARRAY_INIT4(buffer_vld4, int, 8, 16);
434   PAD(buffer_vld4_pad, int, 8, 16);
435   VECT_ARRAY_INIT4(buffer_vld4, int, 16, 8);
436   PAD(buffer_vld4_pad, int, 16, 8);
437   VECT_ARRAY_INIT4(buffer_vld4, int, 32, 4);
438   PAD(buffer_vld4_pad, int, 32, 4);
439   VECT_ARRAY_INIT4(buffer_vld4, int, 64, 2);
440   PAD(buffer_vld4_pad, int, 64, 2);
441   VECT_ARRAY_INIT4(buffer_vld4, uint, 8, 16);
442   PAD(buffer_vld4_pad, uint, 8, 16);
443   VECT_ARRAY_INIT4(buffer_vld4, uint, 16, 8);
444   PAD(buffer_vld4_pad, uint, 16, 8);
445   VECT_ARRAY_INIT4(buffer_vld4, uint, 32, 4);
446   PAD(buffer_vld4_pad, uint, 32, 4);
447   VECT_ARRAY_INIT4(buffer_vld4, uint, 64, 2);
448   PAD(buffer_vld4_pad, uint, 64, 2);
449   VECT_ARRAY_INIT4(buffer_vld4, poly, 8, 16);
450   PAD(buffer_vld4_pad, poly, 8, 16);
451   VECT_ARRAY_INIT4(buffer_vld4, poly, 16, 8);
452   PAD(buffer_vld4_pad, poly, 16, 8);
453 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
454   VECT_ARRAY_INIT4(buffer_vld4, float, 16, 8);
455   PAD(buffer_vld4_pad, float, 16, 8);
456 #endif
457   VECT_ARRAY_INIT4(buffer_vld4, float, 32, 4);
458   PAD(buffer_vld4_pad, float, 32, 4);
459 
460   /* Check vld2_dup/vld2q_dup.  */
461   clean_results ();
462 #define TEST_MSG "VLD2_DUP/VLD2Q_DUP"
463   TEST_ALL_VLDX_DUP(2);
464   CHECK_RESULTS_VLDX_DUP (TEST_MSG, expected_vld2_0, "chunk 0");
465 
466   TEST_ALL_EXTRA_CHUNKS(2, 1);
467   CHECK_RESULTS_VLDX_DUP (TEST_MSG, expected_vld2_1, "chunk 1");
468 
469   /* Check vld3_dup/vld3q_dup.  */
470   clean_results ();
471 #undef TEST_MSG
472 #define TEST_MSG "VLD3_DUP/VLD3Q_DUP"
473   TEST_ALL_VLDX_DUP(3);
474   CHECK_RESULTS_VLDX_DUP (TEST_MSG, expected_vld3_0, "chunk 0");
475 
476   TEST_ALL_EXTRA_CHUNKS(3, 1);
477   CHECK_RESULTS_VLDX_DUP (TEST_MSG, expected_vld3_1, "chunk 1");
478 
479   TEST_ALL_EXTRA_CHUNKS(3, 2);
480   CHECK_RESULTS_VLDX_DUP (TEST_MSG, expected_vld3_2, "chunk 2");
481 
482   /* Check vld4_dup/vld4q_dup */
483   clean_results ();
484 #undef TEST_MSG
485 #define TEST_MSG "VLD4_DUP/VLD4Q_DUP"
486   TEST_ALL_VLDX_DUP(4);
487   CHECK_RESULTS_VLDX_DUP (TEST_MSG, expected_vld4_0, "chunk 0");
488 
489   TEST_ALL_EXTRA_CHUNKS(4, 1);
490   CHECK_RESULTS_VLDX_DUP (TEST_MSG, expected_vld4_1, "chunk 1");
491 
492   TEST_ALL_EXTRA_CHUNKS(4, 2);
493   CHECK_RESULTS_VLDX_DUP (TEST_MSG, expected_vld4_2, "chunk 2");
494 
495   TEST_ALL_EXTRA_CHUNKS(4, 3);
496   CHECK_RESULTS_VLDX_DUP (TEST_MSG, expected_vld4_3, "chunk 3");
497 }
498 
/* Test entry point: run the vldX_dup checks, then return 0.  */
int main (void)
{
  exec_vldX_dup ();
  return 0;
}
504