1 #include <arm_neon.h>
2 #include "arm-neon-ref.h"
3 #include "compute-ref-data.h"
4
5 /* Expected results. */
6
7 /* vld2/chunk 0: reference values for the first check that follows
   TEST_ALL_VLDX_LANE(2), i.e. for the leading sizeof(result) bytes of
   each vst2 output buffer (result_bis_2).  Elements showing the 0xaa
   pattern are filler coming from the 0xAA memset of buffer_src; the
   other values are the ones vld2_lane placed in the selected lane
   (presumably the initial contents of buffer_vld2_lane -- its
   initializer is not defined in this file).  The hfloat rows appear to
   hold raw bit patterns (0xc1800000 is -16.0f, 0xc1700000 is -15.0f in
   IEEE binary32) -- TODO confirm against the hfloat definition.  */
8 VECT_VAR_DECL(expected_vld2_0,int,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
9 0xaa, 0xaa, 0xaa, 0xaa };
10 VECT_VAR_DECL(expected_vld2_0,int,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
11 VECT_VAR_DECL(expected_vld2_0,int,32,2) [] = { 0xfffffff0, 0xfffffff1 };
12 VECT_VAR_DECL(expected_vld2_0,uint,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
13 0xaa, 0xaa, 0xaa, 0xaa };
14 VECT_VAR_DECL(expected_vld2_0,uint,16,4) [] = { 0xaaaa, 0xaaaa,
15 0xaaaa, 0xaaaa };
16 VECT_VAR_DECL(expected_vld2_0,uint,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
17 VECT_VAR_DECL(expected_vld2_0,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
18 0xaa, 0xaa, 0xaa, 0xaa };
19 VECT_VAR_DECL(expected_vld2_0,poly,16,4) [] = { 0xaaaa, 0xaaaa,
20 0xaaaa, 0xaaaa };
21 VECT_VAR_DECL(expected_vld2_0,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
22 VECT_VAR_DECL(expected_vld2_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
23 VECT_VAR_DECL(expected_vld2_0,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
24 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
25 VECT_VAR_DECL(expected_vld2_0,int,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
26 0xaaaaaaaa, 0xaaaaaaaa };
27 VECT_VAR_DECL(expected_vld2_0,uint,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
28 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
29 VECT_VAR_DECL(expected_vld2_0,uint,32,4) [] = { 0xfffffff0, 0xfffffff1,
30 0xaaaaaaaa, 0xaaaaaaaa };
31 VECT_VAR_DECL(expected_vld2_0,poly,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
32 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
33 VECT_VAR_DECL(expected_vld2_0,hfloat,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
34 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa } ;
35 VECT_VAR_DECL(expected_vld2_0,hfloat,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
36 0xaaaaaaaa, 0xaaaaaaaa };
37
38 /* vld2/chunk 1: reference values checked after
   TEST_ALL_EXTRA_CHUNKS(2, 1), which copies into `result` the part of
   each result_bis_2 buffer that starts at element index 1*N.  0xaa
   patterns are the buffer_src filler; the other values are the ones
   loaded by vld2_lane into the tested lane.  */
39 VECT_VAR_DECL(expected_vld2_1,int,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
40 0xaa, 0xaa, 0xf0, 0xf1 };
41 VECT_VAR_DECL(expected_vld2_1,int,16,4) [] = { 0xfff0, 0xfff1, 0xaaaa, 0xaaaa };
42 VECT_VAR_DECL(expected_vld2_1,int,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
43 VECT_VAR_DECL(expected_vld2_1,uint,8,8) [] = { 0xf0, 0xf1, 0xaa, 0xaa,
44 0xaa, 0xaa, 0xaa, 0xaa };
45 VECT_VAR_DECL(expected_vld2_1,uint,16,4) [] = { 0xaaaa, 0xaaaa, 0xfff0, 0xfff1 };
46 VECT_VAR_DECL(expected_vld2_1,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 };
47 VECT_VAR_DECL(expected_vld2_1,poly,8,8) [] = { 0xf0, 0xf1, 0xaa, 0xaa,
48 0xaa, 0xaa, 0xaa, 0xaa };
49 VECT_VAR_DECL(expected_vld2_1,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xfff0, 0xfff1 };
50 VECT_VAR_DECL(expected_vld2_1,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xaaaa, 0xaaaa };
51 VECT_VAR_DECL(expected_vld2_1,hfloat,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
52 VECT_VAR_DECL(expected_vld2_1,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
53 0xfff0, 0xfff1, 0xaaaa, 0xaaaa };
54 VECT_VAR_DECL(expected_vld2_1,int,32,4) [] = { 0xfffffff0, 0xfffffff1,
55 0xaaaaaaaa, 0xaaaaaaaa };
56 VECT_VAR_DECL(expected_vld2_1,uint,16,8) [] = { 0xaaaa, 0xaaaa, 0xfff0, 0xfff1,
57 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
58 VECT_VAR_DECL(expected_vld2_1,uint,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
59 0xaaaaaaaa, 0xaaaaaaaa };
60 VECT_VAR_DECL(expected_vld2_1,poly,16,8) [] = { 0xaaaa, 0xaaaa, 0xfff0, 0xfff1,
61 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
62 VECT_VAR_DECL(expected_vld2_1,hfloat,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
63 0xcc00, 0xcb80, 0xaaaa, 0xaaaa };
64 VECT_VAR_DECL(expected_vld2_1,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
65 0xaaaaaaaa, 0xaaaaaaaa };
66
67 /* vld3/chunk 0: reference values for the first check that follows
   TEST_ALL_VLDX_LANE(3), i.e. for the leading sizeof(result) bytes of
   each vst3 output buffer (result_bis_3).  0xaa patterns are the
   buffer_src filler; the other values are the ones loaded by vld3_lane
   into the tested lane.  */
68 VECT_VAR_DECL(expected_vld3_0,int,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
69 0xaa, 0xaa, 0xaa, 0xaa };
70 VECT_VAR_DECL(expected_vld3_0,int,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
71 VECT_VAR_DECL(expected_vld3_0,int,32,2) [] = { 0xfffffff0, 0xfffffff1 };
72 VECT_VAR_DECL(expected_vld3_0,uint,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
73 0xaa, 0xaa, 0xaa, 0xaa };
74 VECT_VAR_DECL(expected_vld3_0,uint,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
75 VECT_VAR_DECL(expected_vld3_0,uint,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
76 VECT_VAR_DECL(expected_vld3_0,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
77 0xaa, 0xaa, 0xaa, 0xaa };
78 VECT_VAR_DECL(expected_vld3_0,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
79 VECT_VAR_DECL(expected_vld3_0,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
80 VECT_VAR_DECL(expected_vld3_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
81 VECT_VAR_DECL(expected_vld3_0,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
82 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
83 VECT_VAR_DECL(expected_vld3_0,int,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
84 0xaaaaaaaa, 0xaaaaaaaa };
85 VECT_VAR_DECL(expected_vld3_0,uint,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
86 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
87 VECT_VAR_DECL(expected_vld3_0,uint,32,4) [] = { 0xfffffff0, 0xfffffff1,
88 0xfffffff2, 0xaaaaaaaa };
89 VECT_VAR_DECL(expected_vld3_0,poly,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
90 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
91 VECT_VAR_DECL(expected_vld3_0,hfloat,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
92 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
93 VECT_VAR_DECL(expected_vld3_0,hfloat,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
94 0xaaaaaaaa, 0xaaaaaaaa };
95
96 /* vld3/chunk 1: reference values checked after
   TEST_ALL_EXTRA_CHUNKS(3, 1), which copies into `result` the part of
   each result_bis_3 buffer that starts at element index 1*N.  0xaa
   patterns are the buffer_src filler; the other values are the ones
   loaded by vld3_lane into the tested lane.  */
97 VECT_VAR_DECL(expected_vld3_1,int,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
98 0xaa, 0xaa, 0xaa, 0xaa };
99 VECT_VAR_DECL(expected_vld3_1,int,16,4) [] = { 0xaaaa, 0xaaaa, 0xfff0, 0xfff1 };
100 VECT_VAR_DECL(expected_vld3_1,int,32,2) [] = { 0xfffffff2, 0xaaaaaaaa };
101 VECT_VAR_DECL(expected_vld3_1,uint,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
102 0xf0, 0xf1, 0xf2, 0xaa };
103 VECT_VAR_DECL(expected_vld3_1,uint,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
104 VECT_VAR_DECL(expected_vld3_1,uint,32,2) [] = { 0xaaaaaaaa, 0xfffffff0 };
105 VECT_VAR_DECL(expected_vld3_1,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
106 0xf0, 0xf1, 0xf2, 0xaa };
107 VECT_VAR_DECL(expected_vld3_1,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
108 VECT_VAR_DECL(expected_vld3_1,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xcc00, 0xcb80 };
109 VECT_VAR_DECL(expected_vld3_1,hfloat,32,2) [] = { 0xc1600000, 0xaaaaaaaa };
110 VECT_VAR_DECL(expected_vld3_1,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
111 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
112 VECT_VAR_DECL(expected_vld3_1,int,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
113 0xfffffff0, 0xfffffff1 };
114 VECT_VAR_DECL(expected_vld3_1,uint,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
115 0xaaaa, 0xaaaa, 0xaaaa, 0xfff0 };
116 VECT_VAR_DECL(expected_vld3_1,uint,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
117 0xaaaaaaaa, 0xaaaaaaaa };
118 VECT_VAR_DECL(expected_vld3_1,poly,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
119 0xaaaa, 0xaaaa, 0xaaaa, 0xfff0 };
120 VECT_VAR_DECL(expected_vld3_1,hfloat,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
121 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
122 VECT_VAR_DECL(expected_vld3_1,hfloat,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
123 0xc1800000, 0xc1700000 };
124
125 /* vld3/chunk 2: reference values checked after
   TEST_ALL_EXTRA_CHUNKS(3, 2), which copies into `result` the part of
   each result_bis_3 buffer that starts at element index 2*N.  0xaa
   patterns are the buffer_src filler; the other values are the ones
   loaded by vld3_lane into the tested lane.  */
126 VECT_VAR_DECL(expected_vld3_2,int,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
127 0xaa, 0xf0, 0xf1, 0xf2 };
128 VECT_VAR_DECL(expected_vld3_2,int,16,4) [] = { 0xfff2, 0xaaaa, 0xaaaa, 0xaaaa };
129 VECT_VAR_DECL(expected_vld3_2,int,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
130 VECT_VAR_DECL(expected_vld3_2,uint,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
131 0xaa, 0xaa, 0xaa, 0xaa };
132 VECT_VAR_DECL(expected_vld3_2,uint,16,4) [] = { 0xaaaa, 0xfff0, 0xfff1, 0xfff2 };
133 VECT_VAR_DECL(expected_vld3_2,uint,32,2) [] = { 0xfffffff1, 0xfffffff2 };
134 VECT_VAR_DECL(expected_vld3_2,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
135 0xaa, 0xaa, 0xaa, 0xaa };
136 VECT_VAR_DECL(expected_vld3_2,poly,16,4) [] = { 0xaaaa, 0xfff0, 0xfff1, 0xfff2 };
137 VECT_VAR_DECL(expected_vld3_2,hfloat,16,4) [] = { 0xcb00, 0xaaaa, 0xaaaa, 0xaaaa };
138 VECT_VAR_DECL(expected_vld3_2,hfloat,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
139 VECT_VAR_DECL(expected_vld3_2,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xfff0, 0xfff1,
140 0xfff2, 0xaaaa, 0xaaaa, 0xaaaa };
141 VECT_VAR_DECL(expected_vld3_2,int,32,4) [] = { 0xfffffff2, 0xaaaaaaaa,
142 0xaaaaaaaa, 0xaaaaaaaa };
143 VECT_VAR_DECL(expected_vld3_2,uint,16,8) [] = { 0xfff1, 0xfff2, 0xaaaa, 0xaaaa,
144 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
145 VECT_VAR_DECL(expected_vld3_2,uint,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
146 0xaaaaaaaa, 0xaaaaaaaa };
147 VECT_VAR_DECL(expected_vld3_2,poly,16,8) [] = { 0xfff1, 0xfff2, 0xaaaa, 0xaaaa,
148 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
149 VECT_VAR_DECL(expected_vld3_2,hfloat,16,8) [] = { 0xaaaa, 0xaaaa, 0xcc00, 0xcb80,
150 0xcb00, 0xaaaa, 0xaaaa, 0xaaaa };
151 VECT_VAR_DECL(expected_vld3_2,hfloat,32,4) [] = { 0xc1600000, 0xaaaaaaaa,
152 0xaaaaaaaa, 0xaaaaaaaa };
153
154 /* vld4/chunk 0: reference values for the first check that follows
   TEST_ALL_VLDX_LANE(4), i.e. for the leading sizeof(result) bytes of
   each vst4 output buffer (result_bis_4).  0xaa patterns are the
   buffer_src filler; the other values are the ones loaded by vld4_lane
   into the tested lane.  */
155 VECT_VAR_DECL(expected_vld4_0,int,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
156 0xaa, 0xaa, 0xaa, 0xaa };
157 VECT_VAR_DECL(expected_vld4_0,int,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
158 VECT_VAR_DECL(expected_vld4_0,int,32,2) [] = { 0xfffffff0, 0xfffffff1 };
159 VECT_VAR_DECL(expected_vld4_0,uint,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
160 0xaa, 0xaa, 0xaa, 0xaa };
161 VECT_VAR_DECL(expected_vld4_0,uint,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
162 VECT_VAR_DECL(expected_vld4_0,uint,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
163 VECT_VAR_DECL(expected_vld4_0,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
164 0xaa, 0xaa, 0xaa, 0xaa };
165 VECT_VAR_DECL(expected_vld4_0,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
166 VECT_VAR_DECL(expected_vld4_0,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
167 VECT_VAR_DECL(expected_vld4_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
168 VECT_VAR_DECL(expected_vld4_0,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
169 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
170 VECT_VAR_DECL(expected_vld4_0,int,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
171 0xaaaaaaaa, 0xaaaaaaaa };
172 VECT_VAR_DECL(expected_vld4_0,uint,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
173 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
174 VECT_VAR_DECL(expected_vld4_0,uint,32,4) [] = { 0xfffffff0, 0xfffffff1,
175 0xfffffff2, 0xfffffff3 };
176 VECT_VAR_DECL(expected_vld4_0,poly,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
177 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
178 VECT_VAR_DECL(expected_vld4_0,hfloat,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
179 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
180 VECT_VAR_DECL(expected_vld4_0,hfloat,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
181 0xaaaaaaaa, 0xaaaaaaaa };
182
183 /* vld4/chunk 1: reference values checked after
   TEST_ALL_EXTRA_CHUNKS(4, 1), which copies into `result` the part of
   each result_bis_4 buffer that starts at element index 1*N.  0xaa
   patterns are the buffer_src filler; the other values are the ones
   loaded by vld4_lane into the tested lane.  */
184 VECT_VAR_DECL(expected_vld4_1,int,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
185 0xaa, 0xaa, 0xaa, 0xaa };
186 VECT_VAR_DECL(expected_vld4_1,int,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
187 VECT_VAR_DECL(expected_vld4_1,int,32,2) [] = { 0xfffffff2, 0xfffffff3 };
188 VECT_VAR_DECL(expected_vld4_1,uint,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
189 0xaa, 0xaa, 0xaa, 0xaa };
190 VECT_VAR_DECL(expected_vld4_1,uint,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
191 VECT_VAR_DECL(expected_vld4_1,uint,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
192 VECT_VAR_DECL(expected_vld4_1,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
193 0xaa, 0xaa, 0xaa, 0xaa };
194 VECT_VAR_DECL(expected_vld4_1,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
195 VECT_VAR_DECL(expected_vld4_1,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
196 VECT_VAR_DECL(expected_vld4_1,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 };
197 VECT_VAR_DECL(expected_vld4_1,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
198 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
199 VECT_VAR_DECL(expected_vld4_1,int,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
200 0xaaaaaaaa, 0xaaaaaaaa };
201 VECT_VAR_DECL(expected_vld4_1,uint,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
202 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
203 VECT_VAR_DECL(expected_vld4_1,uint,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
204 0xaaaaaaaa, 0xaaaaaaaa };
205 VECT_VAR_DECL(expected_vld4_1,poly,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
206 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
207 VECT_VAR_DECL(expected_vld4_1,hfloat,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
208 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
209 VECT_VAR_DECL(expected_vld4_1,hfloat,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
210 0xaaaaaaaa, 0xaaaaaaaa };
211
212 /* vld4/chunk 2: reference values checked after
   TEST_ALL_EXTRA_CHUNKS(4, 2), which copies into `result` the part of
   each result_bis_4 buffer that starts at element index 2*N.  0xaa
   patterns are the buffer_src filler; the other values are the ones
   loaded by vld4_lane into the tested lane.  */
213 VECT_VAR_DECL(expected_vld4_2,int,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
214 0xaa, 0xaa, 0xaa, 0xaa };
215 VECT_VAR_DECL(expected_vld4_2,int,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
216 VECT_VAR_DECL(expected_vld4_2,int,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
217 VECT_VAR_DECL(expected_vld4_2,uint,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
218 0xaa, 0xaa, 0xaa, 0xaa };
219 VECT_VAR_DECL(expected_vld4_2,uint,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
220 VECT_VAR_DECL(expected_vld4_2,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 };
221 VECT_VAR_DECL(expected_vld4_2,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
222 0xaa, 0xaa, 0xaa, 0xaa };
223 VECT_VAR_DECL(expected_vld4_2,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
224 VECT_VAR_DECL(expected_vld4_2,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
225 VECT_VAR_DECL(expected_vld4_2,hfloat,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
226 VECT_VAR_DECL(expected_vld4_2,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
227 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
228 VECT_VAR_DECL(expected_vld4_2,int,32,4) [] = { 0xfffffff0, 0xfffffff1,
229 0xfffffff2, 0xfffffff3 };
230 VECT_VAR_DECL(expected_vld4_2,uint,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
231 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
232 VECT_VAR_DECL(expected_vld4_2,uint,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
233 0xaaaaaaaa, 0xaaaaaaaa };
234 VECT_VAR_DECL(expected_vld4_2,poly,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
235 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
236 VECT_VAR_DECL(expected_vld4_2,hfloat,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
237 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
238 VECT_VAR_DECL(expected_vld4_2,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
239 0xc1600000, 0xc1500000 };
240
241 /* vld4/chunk 3: reference values checked after
   TEST_ALL_EXTRA_CHUNKS(4, 3), which copies into `result` the part of
   each result_bis_4 buffer that starts at element index 3*N.  0xaa
   patterns are the buffer_src filler; the other values are the ones
   loaded by vld4_lane into the tested lane.  */
242 VECT_VAR_DECL(expected_vld4_3,int,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
243 0xf0, 0xf1, 0xf2, 0xf3 };
244 VECT_VAR_DECL(expected_vld4_3,int,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
245 VECT_VAR_DECL(expected_vld4_3,int,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
246 VECT_VAR_DECL(expected_vld4_3,uint,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
247 0xaa, 0xaa, 0xaa, 0xaa };
248 VECT_VAR_DECL(expected_vld4_3,uint,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
249 VECT_VAR_DECL(expected_vld4_3,uint,32,2) [] = { 0xfffffff2, 0xfffffff3 };
250 VECT_VAR_DECL(expected_vld4_3,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
251 0xaa, 0xaa, 0xaa, 0xaa };
252 VECT_VAR_DECL(expected_vld4_3,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
253 VECT_VAR_DECL(expected_vld4_3,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
254 VECT_VAR_DECL(expected_vld4_3,hfloat,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
255 VECT_VAR_DECL(expected_vld4_3,int,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3,
256 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
257 VECT_VAR_DECL(expected_vld4_3,int,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
258 0xaaaaaaaa, 0xaaaaaaaa };
259 VECT_VAR_DECL(expected_vld4_3,uint,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
260 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
261 VECT_VAR_DECL(expected_vld4_3,uint,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
262 0xaaaaaaaa, 0xaaaaaaaa };
263 VECT_VAR_DECL(expected_vld4_3,poly,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
264 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
265 VECT_VAR_DECL(expected_vld4_3,hfloat,16,8) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80,
266 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
267 VECT_VAR_DECL(expected_vld4_3,hfloat,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
268 0xaaaaaaaa, 0xaaaaaaaa };
269
270 /* Declare additional input buffers as needed. */
271 /* Input buffers for vld2_lane: one buffer per element type/width,
   declared with a final macro argument of 2 (presumably the element
   count, matching the two elements vld2_lane reads -- VECT_VAR_DECL_INIT
   is defined outside this file).  TEST_VLDX_LANE refers to them as
   VECT_VAR(buffer_vld2_lane, T1, W, 2).  The 64-bit buffers are declared
   here but no 64-bit variant is exercised by the tests visible in this
   file.  */
272 VECT_VAR_DECL_INIT(buffer_vld2_lane, int, 8, 2);
273 VECT_VAR_DECL_INIT(buffer_vld2_lane, int, 16, 2);
274 VECT_VAR_DECL_INIT(buffer_vld2_lane, int, 32, 2);
275 VECT_VAR_DECL_INIT(buffer_vld2_lane, int, 64, 2);
276 VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 8, 2);
277 VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 16, 2);
278 VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 32, 2);
279 VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 64, 2);
280 VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 8, 2);
281 VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 16, 2);
282 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
283 VECT_VAR_DECL_INIT(buffer_vld2_lane, float, 16, 2);
284 #endif
285 VECT_VAR_DECL_INIT(buffer_vld2_lane, float, 32, 2);
286
287 /* Input buffers for vld3_lane: one buffer per element type/width,
   declared with a final macro argument of 3 (presumably the element
   count, matching the three elements vld3_lane reads --
   VECT_VAR_DECL_INIT is defined outside this file).  TEST_VLDX_LANE
   refers to them as VECT_VAR(buffer_vld3_lane, T1, W, 3).  The float16
   buffer only exists when one of the __ARM_FP16_FORMAT_* macros is
   defined.  */
288 VECT_VAR_DECL_INIT(buffer_vld3_lane, int, 8, 3);
289 VECT_VAR_DECL_INIT(buffer_vld3_lane, int, 16, 3);
290 VECT_VAR_DECL_INIT(buffer_vld3_lane, int, 32, 3);
291 VECT_VAR_DECL_INIT(buffer_vld3_lane, int, 64, 3);
292 VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 8, 3);
293 VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 16, 3);
294 VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 32, 3);
295 VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 64, 3);
296 VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 8, 3);
297 VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 16, 3);
298 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
299 VECT_VAR_DECL_INIT(buffer_vld3_lane, float, 16, 3);
300 #endif
301 VECT_VAR_DECL_INIT(buffer_vld3_lane, float, 32, 3);
302
303 /* Input buffers for vld4_lane: one buffer per element type/width,
   declared with a final macro argument of 4 (presumably the element
   count, matching the four elements vld4_lane reads --
   VECT_VAR_DECL_INIT is defined outside this file).  TEST_VLDX_LANE
   refers to them as VECT_VAR(buffer_vld4_lane, T1, W, 4).  The float16
   buffer only exists when one of the __ARM_FP16_FORMAT_* macros is
   defined.  */
304 VECT_VAR_DECL_INIT(buffer_vld4_lane, int, 8, 4);
305 VECT_VAR_DECL_INIT(buffer_vld4_lane, int, 16, 4);
306 VECT_VAR_DECL_INIT(buffer_vld4_lane, int, 32, 4);
307 VECT_VAR_DECL_INIT(buffer_vld4_lane, int, 64, 4);
308 VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 8, 4);
309 VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 16, 4);
310 VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 32, 4);
311 VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 64, 4);
312 VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 8, 4);
313 VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 16, 4);
314 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
315 VECT_VAR_DECL_INIT(buffer_vld4_lane, float, 16, 4);
316 #endif
317 VECT_VAR_DECL_INIT(buffer_vld4_lane, float, 32, 4);
318
exec_vldX_lane(void)319 void exec_vldX_lane (void)
320 {
321 /* In this case, input variables are arrays of vectors. */
322 #define DECL_VLDX_LANE(T1, W, N, X) \
323 VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vector, T1, W, N, X); \
324 VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vector_src, T1, W, N, X); \
325 VECT_VAR_DECL(result_bis_##X, T1, W, N)[X * N]
326
327 /* We need to use a temporary result buffer (result_bis), because
328 the one used for other tests is not large enough. A subset of the
329 result data is moved from result_bis to result, and it is this
330 subset which is used to check the actual behavior. The next
331 macro enables to move another chunk of data from result_bis to
332 result. */
333 /* We also use another extra input buffer (buffer_src), which we
334 fill with 0xAA, and which it used to load a vector from which we
335 read a given lane. */
336 #define TEST_VLDX_LANE(Q, T1, T2, W, N, X, L) \
337 memset (VECT_VAR(buffer_src, T1, W, N), 0xAA, \
338 sizeof(VECT_VAR(buffer_src, T1, W, N))); \
339 \
340 VECT_ARRAY_VAR(vector_src, T1, W, N, X) = \
341 vld##X##Q##_##T2##W(VECT_VAR(buffer_src, T1, W, N)); \
342 \
343 VECT_ARRAY_VAR(vector, T1, W, N, X) = \
344 /* Use dedicated init buffer, of size. X */ \
345 vld##X##Q##_lane_##T2##W(VECT_VAR(buffer_vld##X##_lane, T1, W, X), \
346 VECT_ARRAY_VAR(vector_src, T1, W, N, X), \
347 L); \
348 vst##X##Q##_##T2##W(VECT_VAR(result_bis_##X, T1, W, N), \
349 VECT_ARRAY_VAR(vector, T1, W, N, X)); \
350 memcpy(VECT_VAR(result, T1, W, N), VECT_VAR(result_bis_##X, T1, W, N), \
351 sizeof(VECT_VAR(result, T1, W, N)))
352
353 /* Overwrite "result" with the contents of "result_bis"[Y]. */
354 #define TEST_EXTRA_CHUNK(T1, W, N, X, Y) \
355 memcpy(VECT_VAR(result, T1, W, N), \
356 &(VECT_VAR(result_bis_##X, T1, W, N)[Y*N]), \
357 sizeof(VECT_VAR(result, T1, W, N)));
358
359 /* We need all variants in 64 bits, but there is no 64x2 variant. */
360 #define DECL_ALL_VLDX_LANE_NO_FP16(X) \
361 DECL_VLDX_LANE(int, 8, 8, X); \
362 DECL_VLDX_LANE(int, 16, 4, X); \
363 DECL_VLDX_LANE(int, 32, 2, X); \
364 DECL_VLDX_LANE(uint, 8, 8, X); \
365 DECL_VLDX_LANE(uint, 16, 4, X); \
366 DECL_VLDX_LANE(uint, 32, 2, X); \
367 DECL_VLDX_LANE(poly, 8, 8, X); \
368 DECL_VLDX_LANE(poly, 16, 4, X); \
369 DECL_VLDX_LANE(int, 16, 8, X); \
370 DECL_VLDX_LANE(int, 32, 4, X); \
371 DECL_VLDX_LANE(uint, 16, 8, X); \
372 DECL_VLDX_LANE(uint, 32, 4, X); \
373 DECL_VLDX_LANE(poly, 16, 8, X); \
374 DECL_VLDX_LANE(float, 32, 2, X); \
375 DECL_VLDX_LANE(float, 32, 4, X)
376
377 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
378 #define DECL_ALL_VLDX_LANE(X) \
379 DECL_ALL_VLDX_LANE_NO_FP16(X); \
380 DECL_VLDX_LANE(float, 16, 4, X); \
381 DECL_VLDX_LANE(float, 16, 8, X)
382 #else
383 #define DECL_ALL_VLDX_LANE(X) DECL_ALL_VLDX_LANE_NO_FP16(X)
384 #endif
385
386 /* Add some padding to try to catch out of bound accesses. */
387 #define ARRAY1(V, T, W, N) VECT_VAR_DECL(V,T,W,N)[1]={42}
388 #define DUMMY_ARRAY(V, T, W, N, L) \
389 VECT_VAR_DECL(V,T,W,N)[N*L]={0}; \
390 ARRAY1(V##_pad,T,W,N)
391
392 /* Use the same lanes regardless of the size of the array (X), for
393 simplicity. */
394 #define TEST_ALL_VLDX_LANE_NO_FP16(X) \
395 TEST_VLDX_LANE(, int, s, 8, 8, X, 7); \
396 TEST_VLDX_LANE(, int, s, 16, 4, X, 2); \
397 TEST_VLDX_LANE(, int, s, 32, 2, X, 0); \
398 TEST_VLDX_LANE(, uint, u, 8, 8, X, 4); \
399 TEST_VLDX_LANE(, uint, u, 16, 4, X, 3); \
400 TEST_VLDX_LANE(, uint, u, 32, 2, X, 1); \
401 TEST_VLDX_LANE(, poly, p, 8, 8, X, 4); \
402 TEST_VLDX_LANE(, poly, p, 16, 4, X, 3); \
403 TEST_VLDX_LANE(q, int, s, 16, 8, X, 6); \
404 TEST_VLDX_LANE(q, int, s, 32, 4, X, 2); \
405 TEST_VLDX_LANE(q, uint, u, 16, 8, X, 5); \
406 TEST_VLDX_LANE(q, uint, u, 32, 4, X, 0); \
407 TEST_VLDX_LANE(q, poly, p, 16, 8, X, 5); \
408 TEST_VLDX_LANE(, float, f, 32, 2, X, 0); \
409 TEST_VLDX_LANE(q, float, f, 32, 4, X, 2)
410
411 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
412 #define TEST_ALL_VLDX_LANE(X) \
413 TEST_ALL_VLDX_LANE_NO_FP16(X); \
414 TEST_VLDX_LANE(, float, f, 16, 4, X, 2); \
415 TEST_VLDX_LANE(q, float, f, 16, 8, X, 6)
416 #else
417 #define TEST_ALL_VLDX_LANE(X) TEST_ALL_VLDX_LANE_NO_FP16(X)
418 #endif
419
420 #define TEST_ALL_EXTRA_CHUNKS_NO_FP16(X,Y) \
421 TEST_EXTRA_CHUNK(int, 8, 8, X, Y); \
422 TEST_EXTRA_CHUNK(int, 16, 4, X, Y); \
423 TEST_EXTRA_CHUNK(int, 32, 2, X, Y); \
424 TEST_EXTRA_CHUNK(uint, 8, 8, X, Y); \
425 TEST_EXTRA_CHUNK(uint, 16, 4, X, Y); \
426 TEST_EXTRA_CHUNK(uint, 32, 2, X, Y); \
427 TEST_EXTRA_CHUNK(poly, 8, 8, X, Y); \
428 TEST_EXTRA_CHUNK(poly, 16, 4, X, Y); \
429 TEST_EXTRA_CHUNK(int, 16, 8, X, Y); \
430 TEST_EXTRA_CHUNK(int, 32, 4, X, Y); \
431 TEST_EXTRA_CHUNK(uint, 16, 8, X, Y); \
432 TEST_EXTRA_CHUNK(uint, 32, 4, X, Y); \
433 TEST_EXTRA_CHUNK(poly, 16, 8, X, Y); \
434 TEST_EXTRA_CHUNK(float, 32, 2, X, Y); \
435 TEST_EXTRA_CHUNK(float, 32, 4, X, Y)
436
437 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
438 #define TEST_ALL_EXTRA_CHUNKS(X,Y) \
439 TEST_ALL_EXTRA_CHUNKS_NO_FP16(X, Y); \
440 TEST_EXTRA_CHUNK(float, 16, 4, X, Y); \
441 TEST_EXTRA_CHUNK(float, 16, 8, X, Y)
442 #else
443 #define TEST_ALL_EXTRA_CHUNKS(X,Y) TEST_ALL_EXTRA_CHUNKS_NO_FP16(X, Y)
444 #endif
445
446 /* vldX_lane supports only a subset of all variants. */
447 #define CHECK_RESULTS_VLDX_LANE_NO_FP16(test_name,EXPECTED,comment) \
448 CHECK(test_name, int, 8, 8, PRIx8, EXPECTED, comment); \
449 CHECK(test_name, int, 16, 4, PRIx16, EXPECTED, comment); \
450 CHECK(test_name, int, 32, 2, PRIx32, EXPECTED, comment); \
451 CHECK(test_name, uint, 8, 8, PRIx8, EXPECTED, comment); \
452 CHECK(test_name, uint, 16, 4, PRIx16, EXPECTED, comment); \
453 CHECK(test_name, uint, 32, 2, PRIx32, EXPECTED, comment); \
454 CHECK_POLY(test_name, poly, 8, 8, PRIx8, EXPECTED, comment); \
455 CHECK_POLY(test_name, poly, 16, 4, PRIx16, EXPECTED, comment); \
456 CHECK_FP(test_name, float, 32, 2, PRIx32, EXPECTED, comment); \
457 CHECK(test_name, int, 16, 8, PRIx16, EXPECTED, comment); \
458 CHECK(test_name, int, 32, 4, PRIx32, EXPECTED, comment); \
459 CHECK(test_name, uint, 16, 8, PRIx16, EXPECTED, comment); \
460 CHECK(test_name, uint, 32, 4, PRIx32, EXPECTED, comment); \
461 CHECK_POLY(test_name, poly, 16, 8, PRIx16, EXPECTED, comment); \
462 CHECK_FP(test_name, float, 32, 4, PRIx32, EXPECTED, comment)
463
464 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
465 #define CHECK_RESULTS_VLDX_LANE(test_name,EXPECTED,comment) \
466 { \
467 CHECK_RESULTS_VLDX_LANE_NO_FP16(test_name,EXPECTED,comment); \
468 CHECK_FP(test_name, float, 16, 4, PRIx16, EXPECTED, comment); \
469 CHECK_FP(test_name, float, 16, 8, PRIx16, EXPECTED, comment); \
470 }
471 #else
472 #define CHECK_RESULTS_VLDX_LANE(test_name,EXPECTED,comment) \
473 { \
474 CHECK_RESULTS_VLDX_LANE_NO_FP16(test_name,EXPECTED,comment); \
475 }
476 #endif
477
478 /* Declare the temporary buffers / variables. */
479 DECL_ALL_VLDX_LANE(2);
480 DECL_ALL_VLDX_LANE(3);
481 DECL_ALL_VLDX_LANE(4);
482
483 /* Define dummy input arrays, large enough for x4 vectors. */
484 DUMMY_ARRAY(buffer_src, int, 8, 8, 4);
485 DUMMY_ARRAY(buffer_src, int, 16, 4, 4);
486 DUMMY_ARRAY(buffer_src, int, 32, 2, 4);
487 DUMMY_ARRAY(buffer_src, uint, 8, 8, 4);
488 DUMMY_ARRAY(buffer_src, uint, 16, 4, 4);
489 DUMMY_ARRAY(buffer_src, uint, 32, 2, 4);
490 DUMMY_ARRAY(buffer_src, poly, 8, 8, 4);
491 DUMMY_ARRAY(buffer_src, poly, 16, 4, 4);
492 DUMMY_ARRAY(buffer_src, int, 16, 8, 4);
493 DUMMY_ARRAY(buffer_src, int, 32, 4, 4);
494 DUMMY_ARRAY(buffer_src, uint, 16, 8, 4);
495 DUMMY_ARRAY(buffer_src, uint, 32, 4, 4);
496 DUMMY_ARRAY(buffer_src, poly, 16, 8, 4);
497 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
498 DUMMY_ARRAY(buffer_src, float, 16, 4, 4);
499 DUMMY_ARRAY(buffer_src, float, 16, 8, 4);
500 #endif
501 DUMMY_ARRAY(buffer_src, float, 32, 2, 4);
502 DUMMY_ARRAY(buffer_src, float, 32, 4, 4);
503
504 /* Check vld2_lane/vld2q_lane. */
505 clean_results ();
506 #define TEST_MSG "VLD2_LANE/VLD2Q_LANE"
507 TEST_ALL_VLDX_LANE(2);
508 CHECK_RESULTS_VLDX_LANE (TEST_MSG, expected_vld2_0, " chunk 0");
509
510 TEST_ALL_EXTRA_CHUNKS(2, 1);
511 CHECK_RESULTS_VLDX_LANE (TEST_MSG, expected_vld2_1, " chunk 1");
512
513 /* Check vld3_lane/vld3q_lane. */
514 clean_results ();
515 #undef TEST_MSG
516 #define TEST_MSG "VLD3_LANE/VLD3Q_LANE"
517 TEST_ALL_VLDX_LANE(3);
518 CHECK_RESULTS_VLDX_LANE (TEST_MSG, expected_vld3_0, " chunk 0");
519
520 TEST_ALL_EXTRA_CHUNKS(3, 1);
521 CHECK_RESULTS_VLDX_LANE (TEST_MSG, expected_vld3_1, " chunk 1");
522
523 TEST_ALL_EXTRA_CHUNKS(3, 2);
524 CHECK_RESULTS_VLDX_LANE (TEST_MSG, expected_vld3_2, " chunk 2");
525
526 /* Check vld4_lane/vld4q_lane. */
527 clean_results ();
528 #undef TEST_MSG
529 #define TEST_MSG "VLD4_LANE/VLD4Q_LANE"
530 TEST_ALL_VLDX_LANE(4);
531 CHECK_RESULTS_VLDX_LANE (TEST_MSG, expected_vld4_0, " chunk 0");
532
533 TEST_ALL_EXTRA_CHUNKS(4, 1);
534 CHECK_RESULTS_VLDX_LANE (TEST_MSG, expected_vld4_1, " chunk 1");
535 TEST_ALL_EXTRA_CHUNKS(4, 2);
536
537 CHECK_RESULTS_VLDX_LANE (TEST_MSG, expected_vld4_2, " chunk 2");
538
539 TEST_ALL_EXTRA_CHUNKS(4, 3);
540 CHECK_RESULTS_VLDX_LANE (TEST_MSG, expected_vld4_3, " chunk 3");
541 }
542
main(void)543 int main (void)
544 {
545 exec_vldX_lane ();
546 return 0;
547 }
548