1 #include <arm_neon.h>
2 #include "arm-neon-ref.h"
3 #include "compute-ref-data.h"
4
5 /* Expected results. */
6
7 /* vld2_dup/chunk 0. */
8 VECT_VAR_DECL(expected_vld2_0,int,8,8) [] = { 0xf0, 0xf1, 0xf0, 0xf1,
9 0xf0, 0xf1, 0xf0, 0xf1 };
10 VECT_VAR_DECL(expected_vld2_0,int,16,4) [] = { 0xfff0, 0xfff1, 0xfff0, 0xfff1 };
11 VECT_VAR_DECL(expected_vld2_0,int,32,2) [] = { 0xfffffff0, 0xfffffff1 };
12 VECT_VAR_DECL(expected_vld2_0,int,64,1) [] = { 0xfffffffffffffff0 };
13 VECT_VAR_DECL(expected_vld2_0,uint,8,8) [] = { 0xf0, 0xf1, 0xf0, 0xf1,
14 0xf0, 0xf1, 0xf0, 0xf1 };
15 VECT_VAR_DECL(expected_vld2_0,uint,16,4) [] = { 0xfff0, 0xfff1, 0xfff0, 0xfff1 };
16 VECT_VAR_DECL(expected_vld2_0,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 };
17 VECT_VAR_DECL(expected_vld2_0,uint,64,1) [] = { 0xfffffffffffffff0 };
18 VECT_VAR_DECL(expected_vld2_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf0, 0xf1,
19 0xf0, 0xf1, 0xf0, 0xf1 };
20 VECT_VAR_DECL(expected_vld2_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff0, 0xfff1 };
21 VECT_VAR_DECL(expected_vld2_0,hfloat,16,4) [] = {0xcc00, 0xcb80, 0xcc00, 0xcb80 };
22 VECT_VAR_DECL(expected_vld2_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
23
24 /* vld2_dup/chunk 1. */
25 VECT_VAR_DECL(expected_vld2_1,int,8,8) [] = { 0xf0, 0xf1, 0xf0, 0xf1,
26 0xf0, 0xf1, 0xf0, 0xf1 };
27 VECT_VAR_DECL(expected_vld2_1,int,16,4) [] = { 0xfff0, 0xfff1, 0xfff0, 0xfff1 };
28 VECT_VAR_DECL(expected_vld2_1,int,32,2) [] = { 0xfffffff0, 0xfffffff1 };
29 VECT_VAR_DECL(expected_vld2_1,int,64,1) [] = { 0xfffffffffffffff1 };
30 VECT_VAR_DECL(expected_vld2_1,uint,8,8) [] = { 0xf0, 0xf1, 0xf0, 0xf1,
31 0xf0, 0xf1, 0xf0, 0xf1 };
32 VECT_VAR_DECL(expected_vld2_1,uint,16,4) [] = { 0xfff0, 0xfff1, 0xfff0, 0xfff1 };
33 VECT_VAR_DECL(expected_vld2_1,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 };
34 VECT_VAR_DECL(expected_vld2_1,uint,64,1) [] = { 0xfffffffffffffff1 };
35 VECT_VAR_DECL(expected_vld2_1,poly,8,8) [] = { 0xf0, 0xf1, 0xf0, 0xf1,
36 0xf0, 0xf1, 0xf0, 0xf1 };
37 VECT_VAR_DECL(expected_vld2_1,poly,16,4) [] = { 0xfff0, 0xfff1,
38 0xfff0, 0xfff1 };
39 VECT_VAR_DECL(expected_vld2_1,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcc00, 0xcb80 };
40 VECT_VAR_DECL(expected_vld2_1,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
41
42 /* vld3_dup/chunk 0. */
43 VECT_VAR_DECL(expected_vld3_0,int,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf0,
44 0xf1, 0xf2, 0xf0, 0xf1 };
45 VECT_VAR_DECL(expected_vld3_0,int,16,4) [] = { 0xfff0, 0xfff1,
46 0xfff2, 0xfff0 };
47 VECT_VAR_DECL(expected_vld3_0,int,32,2) [] = { 0xfffffff0, 0xfffffff1 };
48 VECT_VAR_DECL(expected_vld3_0,int,64,1) [] = { 0xfffffffffffffff0 };
49 VECT_VAR_DECL(expected_vld3_0,uint,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf0,
50 0xf1, 0xf2, 0xf0, 0xf1 };
51 VECT_VAR_DECL(expected_vld3_0,uint,16,4) [] = { 0xfff0, 0xfff1,
52 0xfff2, 0xfff0 };
53 VECT_VAR_DECL(expected_vld3_0,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 };
54 VECT_VAR_DECL(expected_vld3_0,uint,64,1) [] = { 0xfffffffffffffff0 };
55 VECT_VAR_DECL(expected_vld3_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf0,
56 0xf1, 0xf2, 0xf0, 0xf1 };
57 VECT_VAR_DECL(expected_vld3_0,poly,16,4) [] = { 0xfff0, 0xfff1,
58 0xfff2, 0xfff0 };
59 VECT_VAR_DECL(expected_vld3_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xcc00 };
60 VECT_VAR_DECL(expected_vld3_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
61
62 /* vld3_dup/chunk 1. */
63 VECT_VAR_DECL(expected_vld3_1,int,8,8) [] = { 0xf2, 0xf0, 0xf1, 0xf2,
64 0xf0, 0xf1, 0xf2, 0xf0 };
65 VECT_VAR_DECL(expected_vld3_1,int,16,4) [] = { 0xfff1, 0xfff2,
66 0xfff0, 0xfff1 };
67 VECT_VAR_DECL(expected_vld3_1,int,32,2) [] = { 0xfffffff2, 0xfffffff0 };
68 VECT_VAR_DECL(expected_vld3_1,int,64,1) [] = { 0xfffffffffffffff1 };
69 VECT_VAR_DECL(expected_vld3_1,uint,8,8) [] = { 0xf2, 0xf0, 0xf1, 0xf2,
70 0xf0, 0xf1, 0xf2, 0xf0 };
71 VECT_VAR_DECL(expected_vld3_1,uint,16,4) [] = { 0xfff1, 0xfff2,
72 0xfff0, 0xfff1 };
73 VECT_VAR_DECL(expected_vld3_1,uint,32,2) [] = { 0xfffffff2, 0xfffffff0 };
74 VECT_VAR_DECL(expected_vld3_1,uint,64,1) [] = { 0xfffffffffffffff1 };
75 VECT_VAR_DECL(expected_vld3_1,poly,8,8) [] = { 0xf2, 0xf0, 0xf1, 0xf2,
76 0xf0, 0xf1, 0xf2, 0xf0 };
77 VECT_VAR_DECL(expected_vld3_1,poly,16,4) [] = { 0xfff1, 0xfff2,
78 0xfff0, 0xfff1 };
79 VECT_VAR_DECL(expected_vld3_1,hfloat,16,4) [] = { 0xcb80, 0xcb00, 0xcc00, 0xcb80 };
80 VECT_VAR_DECL(expected_vld3_1,hfloat,32,2) [] = { 0xc1600000, 0xc1800000 };
81
82 /* vld3_dup/chunk 2. */
83 VECT_VAR_DECL(expected_vld3_2,int,8,8) [] = { 0xf1, 0xf2, 0xf0, 0xf1,
84 0xf2, 0xf0, 0xf1, 0xf2 };
85 VECT_VAR_DECL(expected_vld3_2,int,16,4) [] = { 0xfff2, 0xfff0,
86 0xfff1, 0xfff2 };
87 VECT_VAR_DECL(expected_vld3_2,int,32,2) [] = { 0xfffffff1, 0xfffffff2 };
88 VECT_VAR_DECL(expected_vld3_2,int,64,1) [] = { 0xfffffffffffffff2 };
89 VECT_VAR_DECL(expected_vld3_2,uint,8,8) [] = { 0xf1, 0xf2, 0xf0, 0xf1,
90 0xf2, 0xf0, 0xf1, 0xf2 };
91 VECT_VAR_DECL(expected_vld3_2,uint,16,4) [] = { 0xfff2, 0xfff0,
92 0xfff1, 0xfff2 };
93 VECT_VAR_DECL(expected_vld3_2,uint,32,2) [] = { 0xfffffff1, 0xfffffff2 };
94 VECT_VAR_DECL(expected_vld3_2,uint,64,1) [] = { 0xfffffffffffffff2 };
95 VECT_VAR_DECL(expected_vld3_2,poly,8,8) [] = { 0xf1, 0xf2, 0xf0, 0xf1,
96 0xf2, 0xf0, 0xf1, 0xf2 };
97 VECT_VAR_DECL(expected_vld3_2,poly,16,4) [] = { 0xfff2, 0xfff0,
98 0xfff1, 0xfff2 };
99 VECT_VAR_DECL(expected_vld3_2,hfloat,16,4) [] = { 0xcb00, 0xcc00, 0xcb80, 0xcb00 };
100 VECT_VAR_DECL(expected_vld3_2,hfloat,32,2) [] = { 0xc1700000, 0xc1600000 };
101
102 /* vld4_dup/chunk 0. */
103 VECT_VAR_DECL(expected_vld4_0,int,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
104 0xf0, 0xf1, 0xf2, 0xf3 };
105 VECT_VAR_DECL(expected_vld4_0,int,16,4) [] = { 0xfff0, 0xfff1,
106 0xfff2, 0xfff3 };
107 VECT_VAR_DECL(expected_vld4_0,int,32,2) [] = { 0xfffffff0, 0xfffffff1 };
108 VECT_VAR_DECL(expected_vld4_0,int,64,1) [] = { 0xfffffffffffffff0 };
109 VECT_VAR_DECL(expected_vld4_0,uint,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
110 0xf0, 0xf1, 0xf2, 0xf3 };
111 VECT_VAR_DECL(expected_vld4_0,uint,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
112 VECT_VAR_DECL(expected_vld4_0,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 };
113 VECT_VAR_DECL(expected_vld4_0,uint,64,1) [] = { 0xfffffffffffffff0 };
114 VECT_VAR_DECL(expected_vld4_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
115 0xf0, 0xf1, 0xf2, 0xf3 };
116 VECT_VAR_DECL(expected_vld4_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
117 VECT_VAR_DECL(expected_vld4_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
118 VECT_VAR_DECL(expected_vld4_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
119
120 /* vld4_dup/chunk 1. */
121 VECT_VAR_DECL(expected_vld4_1,int,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
122 0xf0, 0xf1, 0xf2, 0xf3 };
123 VECT_VAR_DECL(expected_vld4_1,int,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
124 VECT_VAR_DECL(expected_vld4_1,int,32,2) [] = { 0xfffffff2, 0xfffffff3 };
125 VECT_VAR_DECL(expected_vld4_1,int,64,1) [] = { 0xfffffffffffffff1 };
126 VECT_VAR_DECL(expected_vld4_1,uint,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
127 0xf0, 0xf1, 0xf2, 0xf3 };
128 VECT_VAR_DECL(expected_vld4_1,uint,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
129 VECT_VAR_DECL(expected_vld4_1,uint,32,2) [] = { 0xfffffff2, 0xfffffff3 };
130 VECT_VAR_DECL(expected_vld4_1,uint,64,1) [] = { 0xfffffffffffffff1 };
131 VECT_VAR_DECL(expected_vld4_1,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
132 0xf0, 0xf1, 0xf2, 0xf3 };
133 VECT_VAR_DECL(expected_vld4_1,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
134 VECT_VAR_DECL(expected_vld4_1,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
135 VECT_VAR_DECL(expected_vld4_1,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 };
136
137 /* vld4_dup/chunk 2. */
138 VECT_VAR_DECL(expected_vld4_2,int,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
139 0xf0, 0xf1, 0xf2, 0xf3 };
140 VECT_VAR_DECL(expected_vld4_2,int,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
141 VECT_VAR_DECL(expected_vld4_2,int,32,2) [] = { 0xfffffff0, 0xfffffff1 };
142 VECT_VAR_DECL(expected_vld4_2,int,64,1) [] = { 0xfffffffffffffff2 };
143 VECT_VAR_DECL(expected_vld4_2,uint,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
144 0xf0, 0xf1, 0xf2, 0xf3 };
145 VECT_VAR_DECL(expected_vld4_2,uint,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
146 VECT_VAR_DECL(expected_vld4_2,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 };
147 VECT_VAR_DECL(expected_vld4_2,uint,64,1) [] = { 0xfffffffffffffff2 };
148 VECT_VAR_DECL(expected_vld4_2,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
149 0xf0, 0xf1, 0xf2, 0xf3 };
150 VECT_VAR_DECL(expected_vld4_2,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
151 VECT_VAR_DECL(expected_vld4_2,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
152 VECT_VAR_DECL(expected_vld4_2,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
153
154 /* vld4_dup/chunk3. */
155 VECT_VAR_DECL(expected_vld4_3,int,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
156 0xf0, 0xf1, 0xf2, 0xf3 };
157 VECT_VAR_DECL(expected_vld4_3,int,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
158 VECT_VAR_DECL(expected_vld4_3,int,32,2) [] = { 0xfffffff2, 0xfffffff3 };
159 VECT_VAR_DECL(expected_vld4_3,int,64,1) [] = { 0xfffffffffffffff3 };
160 VECT_VAR_DECL(expected_vld4_3,uint,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
161 0xf0, 0xf1, 0xf2, 0xf3 };
162 VECT_VAR_DECL(expected_vld4_3,uint,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
163 VECT_VAR_DECL(expected_vld4_3,uint,32,2) [] = { 0xfffffff2, 0xfffffff3 };
164 VECT_VAR_DECL(expected_vld4_3,uint,64,1) [] = { 0xfffffffffffffff3 };
165 VECT_VAR_DECL(expected_vld4_3,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
166 0xf0, 0xf1, 0xf2, 0xf3 };
167 VECT_VAR_DECL(expected_vld4_3,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
168 VECT_VAR_DECL(expected_vld4_3,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
169 VECT_VAR_DECL(expected_vld4_3,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 };
170
exec_vldX_dup(void)171 void exec_vldX_dup (void)
172 {
173 /* In this case, input variables are arrays of vectors. */
174 #define DECL_VLDX_DUP(T1, W, N, X) \
175 VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vector, T1, W, N, X); \
176 VECT_VAR_DECL(result_bis_##X, T1, W, N)[X * N]
177
178 /* We need to use a temporary result buffer (result_bis), because
179 the one used for other tests is not large enough. A subset of the
180 result data is moved from result_bis to result, and it is this
181 subset which is used to check the actual behavior. The next
182 macro enables to move another chunk of data from result_bis to
183 result. */
184 #define TEST_VLDX_DUP(Q, T1, T2, W, N, X) \
185 VECT_ARRAY_VAR(vector, T1, W, N, X) = \
186 vld##X##Q##_dup_##T2##W(&VECT_VAR(buffer_dup, T1, W, N)[0]); \
187 \
188 vst##X##Q##_##T2##W(VECT_VAR(result_bis_##X, T1, W, N), \
189 VECT_ARRAY_VAR(vector, T1, W, N, X)); \
190 memcpy(VECT_VAR(result, T1, W, N), VECT_VAR(result_bis_##X, T1, W, N), \
191 sizeof(VECT_VAR(result, T1, W, N)));
192
193
194 /* Overwrite "result" with the contents of "result_bis"[Y]. */
195 #define TEST_EXTRA_CHUNK(T1, W, N, X,Y) \
196 memcpy(VECT_VAR(result, T1, W, N), \
197 &(VECT_VAR(result_bis_##X, T1, W, N)[Y*N]), \
198 sizeof(VECT_VAR(result, T1, W, N)));
199
200 #define DECL_ALL_VLDX_DUP_NO_FP16(X) \
201 DECL_VLDX_DUP(int, 8, 8, X); \
202 DECL_VLDX_DUP(int, 16, 4, X); \
203 DECL_VLDX_DUP(int, 32, 2, X); \
204 DECL_VLDX_DUP(int, 64, 1, X); \
205 DECL_VLDX_DUP(uint, 8, 8, X); \
206 DECL_VLDX_DUP(uint, 16, 4, X); \
207 DECL_VLDX_DUP(uint, 32, 2, X); \
208 DECL_VLDX_DUP(uint, 64, 1, X); \
209 DECL_VLDX_DUP(poly, 8, 8, X); \
210 DECL_VLDX_DUP(poly, 16, 4, X); \
211 DECL_VLDX_DUP(float, 32, 2, X)
212
213 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
214 #define DECL_ALL_VLDX_DUP(X) \
215 DECL_ALL_VLDX_DUP_NO_FP16(X); \
216 DECL_VLDX_DUP(float, 16, 4, X)
217 #else
218 #define DECL_ALL_VLDX_DUP(X) DECL_ALL_VLDX_DUP_NO_FP16(X)
219 #endif
220
221 #define TEST_ALL_VLDX_DUP_NO_FP16(X) \
222 TEST_VLDX_DUP(, int, s, 8, 8, X); \
223 TEST_VLDX_DUP(, int, s, 16, 4, X); \
224 TEST_VLDX_DUP(, int, s, 32, 2, X); \
225 TEST_VLDX_DUP(, int, s, 64, 1, X); \
226 TEST_VLDX_DUP(, uint, u, 8, 8, X); \
227 TEST_VLDX_DUP(, uint, u, 16, 4, X); \
228 TEST_VLDX_DUP(, uint, u, 32, 2, X); \
229 TEST_VLDX_DUP(, uint, u, 64, 1, X); \
230 TEST_VLDX_DUP(, poly, p, 8, 8, X); \
231 TEST_VLDX_DUP(, poly, p, 16, 4, X); \
232 TEST_VLDX_DUP(, float, f, 32, 2, X)
233
234 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
235 #define TEST_ALL_VLDX_DUP(X) \
236 TEST_ALL_VLDX_DUP_NO_FP16(X); \
237 TEST_VLDX_DUP(, float, f, 16, 4, X)
238 #else
239 #define TEST_ALL_VLDX_DUP(X) TEST_ALL_VLDX_DUP_NO_FP16(X)
240 #endif
241
242 #define TEST_ALL_EXTRA_CHUNKS_NO_FP16(X, Y) \
243 TEST_EXTRA_CHUNK(int, 8, 8, X, Y); \
244 TEST_EXTRA_CHUNK(int, 16, 4, X, Y); \
245 TEST_EXTRA_CHUNK(int, 32, 2, X, Y); \
246 TEST_EXTRA_CHUNK(int, 64, 1, X, Y); \
247 TEST_EXTRA_CHUNK(uint, 8, 8, X, Y); \
248 TEST_EXTRA_CHUNK(uint, 16, 4, X, Y); \
249 TEST_EXTRA_CHUNK(uint, 32, 2, X, Y); \
250 TEST_EXTRA_CHUNK(uint, 64, 1, X, Y); \
251 TEST_EXTRA_CHUNK(poly, 8, 8, X, Y); \
252 TEST_EXTRA_CHUNK(poly, 16, 4, X, Y); \
253 TEST_EXTRA_CHUNK(float, 32, 2, X, Y)
254
255 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
256 #define TEST_ALL_EXTRA_CHUNKS(X, Y) \
257 TEST_ALL_EXTRA_CHUNKS_NO_FP16(X, Y); \
258 TEST_EXTRA_CHUNK(float, 16, 4, X, Y)
259 #else
260 #define TEST_ALL_EXTRA_CHUNKS(X, Y) TEST_ALL_EXTRA_CHUNKS_NO_FP16(X, Y)
261 #endif
262
263 /* vldX_dup supports only 64-bit inputs. */
264 #define CHECK_RESULTS_VLDX_DUP_NO_FP16(test_name,EXPECTED,comment) \
265 CHECK(test_name, int, 8, 8, PRIx8, EXPECTED, comment); \
266 CHECK(test_name, int, 16, 4, PRIx16, EXPECTED, comment); \
267 CHECK(test_name, int, 32, 2, PRIx32, EXPECTED, comment); \
268 CHECK(test_name, int, 64, 1, PRIx64, EXPECTED, comment); \
269 CHECK(test_name, uint, 8, 8, PRIx8, EXPECTED, comment); \
270 CHECK(test_name, uint, 16, 4, PRIx16, EXPECTED, comment); \
271 CHECK(test_name, uint, 32, 2, PRIx32, EXPECTED, comment); \
272 CHECK(test_name, uint, 64, 1, PRIx64, EXPECTED, comment); \
273 CHECK_POLY(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \
274 CHECK_POLY(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \
275 CHECK_FP(test_name, float, 32, 2, PRIx32, EXPECTED, comment)
276
277 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
278 #define CHECK_RESULTS_VLDX_DUP(test_name,EXPECTED,comment) \
279 { \
280 CHECK_RESULTS_VLDX_DUP_NO_FP16(test_name,EXPECTED,comment); \
281 CHECK_FP(test_name, float, 16, 4, PRIx16, EXPECTED, comment); \
282 }
283 #else
284 #define CHECK_RESULTS_VLDX_DUP(test_name,EXPECTED,comment) \
285 { \
286 CHECK_RESULTS_VLDX_DUP_NO_FP16(test_name,EXPECTED,comment); \
287 }
288 #endif
289
290 DECL_ALL_VLDX_DUP(2);
291 DECL_ALL_VLDX_DUP(3);
292 DECL_ALL_VLDX_DUP(4);
293
294 /* Special input buffers of suitable size are needed for vld2/vld3/vld4. */
295 /* Input buffers for vld2, 1 of each size */
296 VECT_ARRAY_INIT2(buffer_vld2, int, 8, 8);
297 PAD(buffer_vld2_pad, int, 8, 8);
298 VECT_ARRAY_INIT2(buffer_vld2, int, 16, 4);
299 PAD(buffer_vld2_pad, int, 16, 4);
300 VECT_ARRAY_INIT2(buffer_vld2, int, 32, 2);
301 PAD(buffer_vld2_pad, int, 32, 2);
302 VECT_ARRAY_INIT2(buffer_vld2, int, 64, 1);
303 PAD(buffer_vld2_pad, int, 64, 1);
304 VECT_ARRAY_INIT2(buffer_vld2, uint, 8, 8);
305 PAD(buffer_vld2_pad, uint, 8, 8);
306 VECT_ARRAY_INIT2(buffer_vld2, uint, 16, 4);
307 PAD(buffer_vld2_pad, uint, 16, 4);
308 VECT_ARRAY_INIT2(buffer_vld2, uint, 32, 2);
309 PAD(buffer_vld2_pad, uint, 32, 2);
310 VECT_ARRAY_INIT2(buffer_vld2, uint, 64, 1);
311 PAD(buffer_vld2_pad, uint, 64, 1);
312 VECT_ARRAY_INIT2(buffer_vld2, poly, 8, 8);
313 PAD(buffer_vld2_pad, poly, 8, 8);
314 VECT_ARRAY_INIT2(buffer_vld2, poly, 16, 4);
315 PAD(buffer_vld2_pad, poly, 16, 4);
316 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
317 VECT_ARRAY_INIT2(buffer_vld2, float, 16, 4);
318 PAD(buffer_vld2_pad, float, 16, 4);
319 #endif
320 VECT_ARRAY_INIT2(buffer_vld2, float, 32, 2);
321 PAD(buffer_vld2_pad, float, 32, 2);
322
323 VECT_ARRAY_INIT2(buffer_vld2, int, 8, 16);
324 PAD(buffer_vld2_pad, int, 8, 16);
325 VECT_ARRAY_INIT2(buffer_vld2, int, 16, 8);
326 PAD(buffer_vld2_pad, int, 16, 8);
327 VECT_ARRAY_INIT2(buffer_vld2, int, 32, 4);
328 PAD(buffer_vld2_pad, int, 32, 4);
329 VECT_ARRAY_INIT2(buffer_vld2, int, 64, 2);
330 PAD(buffer_vld2_pad, int, 64, 2);
331 VECT_ARRAY_INIT2(buffer_vld2, uint, 8, 16);
332 PAD(buffer_vld2_pad, uint, 8, 16);
333 VECT_ARRAY_INIT2(buffer_vld2, uint, 16, 8);
334 PAD(buffer_vld2_pad, uint, 16, 8);
335 VECT_ARRAY_INIT2(buffer_vld2, uint, 32, 4);
336 PAD(buffer_vld2_pad, uint, 32, 4);
337 VECT_ARRAY_INIT2(buffer_vld2, uint, 64, 2);
338 PAD(buffer_vld2_pad, uint, 64, 2);
339 VECT_ARRAY_INIT2(buffer_vld2, poly, 8, 16);
340 PAD(buffer_vld2_pad, poly, 8, 16);
341 VECT_ARRAY_INIT2(buffer_vld2, poly, 16, 8);
342 PAD(buffer_vld2_pad, poly, 16, 8);
343 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
344 VECT_ARRAY_INIT2(buffer_vld2, float, 16, 8);
345 PAD(buffer_vld2_pad, float, 16, 8);
346 #endif
347 VECT_ARRAY_INIT2(buffer_vld2, float, 32, 4);
348 PAD(buffer_vld2_pad, float, 32, 4);
349
350 /* Input buffers for vld3, 1 of each size */
351 VECT_ARRAY_INIT3(buffer_vld3, int, 8, 8);
352 PAD(buffer_vld3_pad, int, 8, 8);
353 VECT_ARRAY_INIT3(buffer_vld3, int, 16, 4);
354 PAD(buffer_vld3_pad, int, 16, 4);
355 VECT_ARRAY_INIT3(buffer_vld3, int, 32, 2);
356 PAD(buffer_vld3_pad, int, 32, 2);
357 VECT_ARRAY_INIT3(buffer_vld3, int, 64, 1);
358 PAD(buffer_vld3_pad, int, 64, 1);
359 VECT_ARRAY_INIT3(buffer_vld3, uint, 8, 8);
360 PAD(buffer_vld3_pad, uint, 8, 8);
361 VECT_ARRAY_INIT3(buffer_vld3, uint, 16, 4);
362 PAD(buffer_vld3_pad, uint, 16, 4);
363 VECT_ARRAY_INIT3(buffer_vld3, uint, 32, 2);
364 PAD(buffer_vld3_pad, uint, 32, 2);
365 VECT_ARRAY_INIT3(buffer_vld3, uint, 64, 1);
366 PAD(buffer_vld3_pad, uint, 64, 1);
367 VECT_ARRAY_INIT3(buffer_vld3, poly, 8, 8);
368 PAD(buffer_vld3_pad, poly, 8, 8);
369 VECT_ARRAY_INIT3(buffer_vld3, poly, 16, 4);
370 PAD(buffer_vld3_pad, poly, 16, 4);
371 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
372 VECT_ARRAY_INIT3(buffer_vld3, float, 16, 4);
373 PAD(buffer_vld3_pad, float, 16, 4);
374 #endif
375 VECT_ARRAY_INIT3(buffer_vld3, float, 32, 2);
376 PAD(buffer_vld3_pad, float, 32, 2);
377
378 VECT_ARRAY_INIT3(buffer_vld3, int, 8, 16);
379 PAD(buffer_vld3_pad, int, 8, 16);
380 VECT_ARRAY_INIT3(buffer_vld3, int, 16, 8);
381 PAD(buffer_vld3_pad, int, 16, 8);
382 VECT_ARRAY_INIT3(buffer_vld3, int, 32, 4);
383 PAD(buffer_vld3_pad, int, 32, 4);
384 VECT_ARRAY_INIT3(buffer_vld3, int, 64, 2);
385 PAD(buffer_vld3_pad, int, 64, 2);
386 VECT_ARRAY_INIT3(buffer_vld3, uint, 8, 16);
387 PAD(buffer_vld3_pad, uint, 8, 16);
388 VECT_ARRAY_INIT3(buffer_vld3, uint, 16, 8);
389 PAD(buffer_vld3_pad, uint, 16, 8);
390 VECT_ARRAY_INIT3(buffer_vld3, uint, 32, 4);
391 PAD(buffer_vld3_pad, uint, 32, 4);
392 VECT_ARRAY_INIT3(buffer_vld3, uint, 64, 2);
393 PAD(buffer_vld3_pad, uint, 64, 2);
394 VECT_ARRAY_INIT3(buffer_vld3, poly, 8, 16);
395 PAD(buffer_vld3_pad, poly, 8, 16);
396 VECT_ARRAY_INIT3(buffer_vld3, poly, 16, 8);
397 PAD(buffer_vld3_pad, poly, 16, 8);
398 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
399 VECT_ARRAY_INIT3(buffer_vld3, float, 16, 8);
400 PAD(buffer_vld3_pad, float, 16, 8);
401 #endif
402 VECT_ARRAY_INIT3(buffer_vld3, float, 32, 4);
403 PAD(buffer_vld3_pad, float, 32, 4);
404
405 /* Input buffers for vld4, 1 of each size */
406 VECT_ARRAY_INIT4(buffer_vld4, int, 8, 8);
407 PAD(buffer_vld4_pad, int, 8, 8);
408 VECT_ARRAY_INIT4(buffer_vld4, int, 16, 4);
409 PAD(buffer_vld4_pad, int, 16, 4);
410 VECT_ARRAY_INIT4(buffer_vld4, int, 32, 2);
411 PAD(buffer_vld4_pad, int, 32, 2);
412 VECT_ARRAY_INIT4(buffer_vld4, int, 64, 1);
413 PAD(buffer_vld4_pad, int, 64, 1);
414 VECT_ARRAY_INIT4(buffer_vld4, uint, 8, 8);
415 PAD(buffer_vld4_pad, uint, 8, 8);
416 VECT_ARRAY_INIT4(buffer_vld4, uint, 16, 4);
417 PAD(buffer_vld4_pad, uint, 16, 4);
418 VECT_ARRAY_INIT4(buffer_vld4, uint, 32, 2);
419 PAD(buffer_vld4_pad, uint, 32, 2);
420 VECT_ARRAY_INIT4(buffer_vld4, uint, 64, 1);
421 PAD(buffer_vld4_pad, uint, 64, 1);
422 VECT_ARRAY_INIT4(buffer_vld4, poly, 8, 8);
423 PAD(buffer_vld4_pad, poly, 8, 8);
424 VECT_ARRAY_INIT4(buffer_vld4, poly, 16, 4);
425 PAD(buffer_vld4_pad, poly, 16, 4);
426 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
427 VECT_ARRAY_INIT4(buffer_vld4, float, 16, 4);
428 PAD(buffer_vld4_pad, float, 16, 4);
429 #endif
430 VECT_ARRAY_INIT4(buffer_vld4, float, 32, 2);
431 PAD(buffer_vld4_pad, float, 32, 2);
432
433 VECT_ARRAY_INIT4(buffer_vld4, int, 8, 16);
434 PAD(buffer_vld4_pad, int, 8, 16);
435 VECT_ARRAY_INIT4(buffer_vld4, int, 16, 8);
436 PAD(buffer_vld4_pad, int, 16, 8);
437 VECT_ARRAY_INIT4(buffer_vld4, int, 32, 4);
438 PAD(buffer_vld4_pad, int, 32, 4);
439 VECT_ARRAY_INIT4(buffer_vld4, int, 64, 2);
440 PAD(buffer_vld4_pad, int, 64, 2);
441 VECT_ARRAY_INIT4(buffer_vld4, uint, 8, 16);
442 PAD(buffer_vld4_pad, uint, 8, 16);
443 VECT_ARRAY_INIT4(buffer_vld4, uint, 16, 8);
444 PAD(buffer_vld4_pad, uint, 16, 8);
445 VECT_ARRAY_INIT4(buffer_vld4, uint, 32, 4);
446 PAD(buffer_vld4_pad, uint, 32, 4);
447 VECT_ARRAY_INIT4(buffer_vld4, uint, 64, 2);
448 PAD(buffer_vld4_pad, uint, 64, 2);
449 VECT_ARRAY_INIT4(buffer_vld4, poly, 8, 16);
450 PAD(buffer_vld4_pad, poly, 8, 16);
451 VECT_ARRAY_INIT4(buffer_vld4, poly, 16, 8);
452 PAD(buffer_vld4_pad, poly, 16, 8);
453 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
454 VECT_ARRAY_INIT4(buffer_vld4, float, 16, 8);
455 PAD(buffer_vld4_pad, float, 16, 8);
456 #endif
457 VECT_ARRAY_INIT4(buffer_vld4, float, 32, 4);
458 PAD(buffer_vld4_pad, float, 32, 4);
459
460 /* Check vld2_dup/vld2q_dup. */
461 clean_results ();
462 #define TEST_MSG "VLD2_DUP/VLD2Q_DUP"
463 TEST_ALL_VLDX_DUP(2);
464 CHECK_RESULTS_VLDX_DUP (TEST_MSG, expected_vld2_0, "chunk 0");
465
466 TEST_ALL_EXTRA_CHUNKS(2, 1);
467 CHECK_RESULTS_VLDX_DUP (TEST_MSG, expected_vld2_1, "chunk 1");
468
469 /* Check vld3_dup/vld3q_dup. */
470 clean_results ();
471 #undef TEST_MSG
472 #define TEST_MSG "VLD3_DUP/VLD3Q_DUP"
473 TEST_ALL_VLDX_DUP(3);
474 CHECK_RESULTS_VLDX_DUP (TEST_MSG, expected_vld3_0, "chunk 0");
475
476 TEST_ALL_EXTRA_CHUNKS(3, 1);
477 CHECK_RESULTS_VLDX_DUP (TEST_MSG, expected_vld3_1, "chunk 1");
478
479 TEST_ALL_EXTRA_CHUNKS(3, 2);
480 CHECK_RESULTS_VLDX_DUP (TEST_MSG, expected_vld3_2, "chunk 2");
481
482 /* Check vld4_dup/vld4q_dup */
483 clean_results ();
484 #undef TEST_MSG
485 #define TEST_MSG "VLD4_DUP/VLD4Q_DUP"
486 TEST_ALL_VLDX_DUP(4);
487 CHECK_RESULTS_VLDX_DUP (TEST_MSG, expected_vld4_0, "chunk 0");
488
489 TEST_ALL_EXTRA_CHUNKS(4, 1);
490 CHECK_RESULTS_VLDX_DUP (TEST_MSG, expected_vld4_1, "chunk 1");
491
492 TEST_ALL_EXTRA_CHUNKS(4, 2);
493 CHECK_RESULTS_VLDX_DUP (TEST_MSG, expected_vld4_2, "chunk 2");
494
495 TEST_ALL_EXTRA_CHUNKS(4, 3);
496 CHECK_RESULTS_VLDX_DUP (TEST_MSG, expected_vld4_3, "chunk 3");
497 }
498
/* Test entry point: run the vldX_dup checks once, then report success
   to the caller.  */
int
main (void)
{
  exec_vldX_dup ();

  return 0;
}
504