#include <arm_neon.h>
#include "arm-neon-ref.h"
#include "compute-ref-data.h"

/* Expected results, one table per vldX variant and per result chunk.  */

7 /* vld2/chunk 0.  */
8 VECT_VAR_DECL(expected_vld2_0,int,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
9 					      0xaa, 0xaa, 0xaa, 0xaa };
10 VECT_VAR_DECL(expected_vld2_0,int,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
11 VECT_VAR_DECL(expected_vld2_0,int,32,2) [] = { 0xfffffff0, 0xfffffff1 };
12 VECT_VAR_DECL(expected_vld2_0,uint,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
13 					       0xaa, 0xaa, 0xaa, 0xaa };
14 VECT_VAR_DECL(expected_vld2_0,uint,16,4) [] = { 0xaaaa, 0xaaaa,
15 						0xaaaa, 0xaaaa };
16 VECT_VAR_DECL(expected_vld2_0,uint,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
17 VECT_VAR_DECL(expected_vld2_0,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
18 					       0xaa, 0xaa, 0xaa, 0xaa };
19 VECT_VAR_DECL(expected_vld2_0,poly,16,4) [] = { 0xaaaa, 0xaaaa,
20 						0xaaaa, 0xaaaa };
21 VECT_VAR_DECL(expected_vld2_0,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
22 VECT_VAR_DECL(expected_vld2_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
23 VECT_VAR_DECL(expected_vld2_0,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
24 					       0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
25 VECT_VAR_DECL(expected_vld2_0,int,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
26 					       0xaaaaaaaa, 0xaaaaaaaa };
27 VECT_VAR_DECL(expected_vld2_0,uint,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
28 						0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
29 VECT_VAR_DECL(expected_vld2_0,uint,32,4) [] = { 0xfffffff0, 0xfffffff1,
30 						0xaaaaaaaa, 0xaaaaaaaa };
31 VECT_VAR_DECL(expected_vld2_0,poly,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
32 						0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
33 VECT_VAR_DECL(expected_vld2_0,hfloat,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
34 						  0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa } ;
35 VECT_VAR_DECL(expected_vld2_0,hfloat,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
36 						  0xaaaaaaaa, 0xaaaaaaaa };
37 
38 /* vld2/chunk 1.  */
39 VECT_VAR_DECL(expected_vld2_1,int,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
40 					      0xaa, 0xaa, 0xf0, 0xf1 };
41 VECT_VAR_DECL(expected_vld2_1,int,16,4) [] = { 0xfff0, 0xfff1, 0xaaaa, 0xaaaa };
42 VECT_VAR_DECL(expected_vld2_1,int,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
43 VECT_VAR_DECL(expected_vld2_1,uint,8,8) [] = { 0xf0, 0xf1, 0xaa, 0xaa,
44 					       0xaa, 0xaa, 0xaa, 0xaa };
45 VECT_VAR_DECL(expected_vld2_1,uint,16,4) [] = { 0xaaaa, 0xaaaa, 0xfff0, 0xfff1 };
46 VECT_VAR_DECL(expected_vld2_1,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 };
47 VECT_VAR_DECL(expected_vld2_1,poly,8,8) [] = { 0xf0, 0xf1, 0xaa, 0xaa,
48 					       0xaa, 0xaa, 0xaa, 0xaa };
49 VECT_VAR_DECL(expected_vld2_1,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xfff0, 0xfff1 };
50 VECT_VAR_DECL(expected_vld2_1,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xaaaa, 0xaaaa };
51 VECT_VAR_DECL(expected_vld2_1,hfloat,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
52 VECT_VAR_DECL(expected_vld2_1,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
53 					       0xfff0, 0xfff1, 0xaaaa, 0xaaaa };
54 VECT_VAR_DECL(expected_vld2_1,int,32,4) [] = { 0xfffffff0, 0xfffffff1,
55 					       0xaaaaaaaa, 0xaaaaaaaa };
56 VECT_VAR_DECL(expected_vld2_1,uint,16,8) [] = { 0xaaaa, 0xaaaa, 0xfff0, 0xfff1,
57 						0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
58 VECT_VAR_DECL(expected_vld2_1,uint,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
59 						0xaaaaaaaa, 0xaaaaaaaa };
60 VECT_VAR_DECL(expected_vld2_1,poly,16,8) [] = { 0xaaaa, 0xaaaa, 0xfff0, 0xfff1,
61 						0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
62 VECT_VAR_DECL(expected_vld2_1,hfloat,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
63 						  0xcc00, 0xcb80, 0xaaaa, 0xaaaa };
64 VECT_VAR_DECL(expected_vld2_1,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
65 						  0xaaaaaaaa, 0xaaaaaaaa };
66 
67 /* vld3/chunk 0.  */
68 VECT_VAR_DECL(expected_vld3_0,int,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
69 					      0xaa, 0xaa, 0xaa, 0xaa };
70 VECT_VAR_DECL(expected_vld3_0,int,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
71 VECT_VAR_DECL(expected_vld3_0,int,32,2) [] = { 0xfffffff0, 0xfffffff1 };
72 VECT_VAR_DECL(expected_vld3_0,uint,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
73 					       0xaa, 0xaa, 0xaa, 0xaa };
74 VECT_VAR_DECL(expected_vld3_0,uint,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
75 VECT_VAR_DECL(expected_vld3_0,uint,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
76 VECT_VAR_DECL(expected_vld3_0,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
77 					       0xaa, 0xaa, 0xaa, 0xaa };
78 VECT_VAR_DECL(expected_vld3_0,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
79 VECT_VAR_DECL(expected_vld3_0,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
80 VECT_VAR_DECL(expected_vld3_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
81 VECT_VAR_DECL(expected_vld3_0,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
82 					       0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
83 VECT_VAR_DECL(expected_vld3_0,int,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
84 					       0xaaaaaaaa, 0xaaaaaaaa };
85 VECT_VAR_DECL(expected_vld3_0,uint,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
86 						0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
87 VECT_VAR_DECL(expected_vld3_0,uint,32,4) [] = { 0xfffffff0, 0xfffffff1,
88 						0xfffffff2, 0xaaaaaaaa };
89 VECT_VAR_DECL(expected_vld3_0,poly,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
90 						0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
91 VECT_VAR_DECL(expected_vld3_0,hfloat,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
92 						  0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
93 VECT_VAR_DECL(expected_vld3_0,hfloat,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
94 						  0xaaaaaaaa, 0xaaaaaaaa };
95 
96 /* vld3/chunk 1.  */
97 VECT_VAR_DECL(expected_vld3_1,int,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
98 					      0xaa, 0xaa, 0xaa, 0xaa };
99 VECT_VAR_DECL(expected_vld3_1,int,16,4) [] = { 0xaaaa, 0xaaaa, 0xfff0, 0xfff1 };
100 VECT_VAR_DECL(expected_vld3_1,int,32,2) [] = { 0xfffffff2, 0xaaaaaaaa };
101 VECT_VAR_DECL(expected_vld3_1,uint,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
102 					       0xf0, 0xf1, 0xf2, 0xaa };
103 VECT_VAR_DECL(expected_vld3_1,uint,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
104 VECT_VAR_DECL(expected_vld3_1,uint,32,2) [] = { 0xaaaaaaaa, 0xfffffff0 };
105 VECT_VAR_DECL(expected_vld3_1,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
106 					       0xf0, 0xf1, 0xf2, 0xaa };
107 VECT_VAR_DECL(expected_vld3_1,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
108 VECT_VAR_DECL(expected_vld3_1,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xcc00, 0xcb80 };
109 VECT_VAR_DECL(expected_vld3_1,hfloat,32,2) [] = { 0xc1600000, 0xaaaaaaaa };
110 VECT_VAR_DECL(expected_vld3_1,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
111 					       0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
112 VECT_VAR_DECL(expected_vld3_1,int,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
113 					       0xfffffff0, 0xfffffff1 };
114 VECT_VAR_DECL(expected_vld3_1,uint,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
115 						0xaaaa, 0xaaaa, 0xaaaa, 0xfff0 };
116 VECT_VAR_DECL(expected_vld3_1,uint,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
117 						0xaaaaaaaa, 0xaaaaaaaa };
118 VECT_VAR_DECL(expected_vld3_1,poly,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
119 						0xaaaa, 0xaaaa, 0xaaaa, 0xfff0 };
120 VECT_VAR_DECL(expected_vld3_1,hfloat,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
121 						  0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
122 VECT_VAR_DECL(expected_vld3_1,hfloat,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
123 						  0xc1800000, 0xc1700000 };
124 
125 /* vld3/chunk 2.  */
126 VECT_VAR_DECL(expected_vld3_2,int,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
127 					      0xaa, 0xf0, 0xf1, 0xf2 };
128 VECT_VAR_DECL(expected_vld3_2,int,16,4) [] = { 0xfff2, 0xaaaa, 0xaaaa, 0xaaaa };
129 VECT_VAR_DECL(expected_vld3_2,int,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
130 VECT_VAR_DECL(expected_vld3_2,uint,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
131 					       0xaa, 0xaa, 0xaa, 0xaa };
132 VECT_VAR_DECL(expected_vld3_2,uint,16,4) [] = { 0xaaaa, 0xfff0, 0xfff1, 0xfff2 };
133 VECT_VAR_DECL(expected_vld3_2,uint,32,2) [] = { 0xfffffff1, 0xfffffff2 };
134 VECT_VAR_DECL(expected_vld3_2,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
135 					       0xaa, 0xaa, 0xaa, 0xaa };
136 VECT_VAR_DECL(expected_vld3_2,poly,16,4) [] = { 0xaaaa, 0xfff0, 0xfff1, 0xfff2 };
137 VECT_VAR_DECL(expected_vld3_2,hfloat,16,4) [] = { 0xcb00, 0xaaaa, 0xaaaa, 0xaaaa };
138 VECT_VAR_DECL(expected_vld3_2,hfloat,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
139 VECT_VAR_DECL(expected_vld3_2,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xfff0, 0xfff1,
140 					       0xfff2, 0xaaaa, 0xaaaa, 0xaaaa };
141 VECT_VAR_DECL(expected_vld3_2,int,32,4) [] = { 0xfffffff2, 0xaaaaaaaa,
142 					       0xaaaaaaaa, 0xaaaaaaaa };
143 VECT_VAR_DECL(expected_vld3_2,uint,16,8) [] = { 0xfff1, 0xfff2, 0xaaaa, 0xaaaa,
144 						0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
145 VECT_VAR_DECL(expected_vld3_2,uint,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
146 						0xaaaaaaaa, 0xaaaaaaaa };
147 VECT_VAR_DECL(expected_vld3_2,poly,16,8) [] = { 0xfff1, 0xfff2, 0xaaaa, 0xaaaa,
148 						0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
149 VECT_VAR_DECL(expected_vld3_2,hfloat,16,8) [] = { 0xaaaa, 0xaaaa, 0xcc00, 0xcb80,
150 						  0xcb00, 0xaaaa, 0xaaaa, 0xaaaa };
151 VECT_VAR_DECL(expected_vld3_2,hfloat,32,4) [] = { 0xc1600000, 0xaaaaaaaa,
152 						  0xaaaaaaaa, 0xaaaaaaaa };
153 
154 /* vld4/chunk 0.  */
155 VECT_VAR_DECL(expected_vld4_0,int,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
156 					      0xaa, 0xaa, 0xaa, 0xaa };
157 VECT_VAR_DECL(expected_vld4_0,int,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
158 VECT_VAR_DECL(expected_vld4_0,int,32,2) [] = { 0xfffffff0, 0xfffffff1 };
159 VECT_VAR_DECL(expected_vld4_0,uint,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
160 					       0xaa, 0xaa, 0xaa, 0xaa };
161 VECT_VAR_DECL(expected_vld4_0,uint,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
162 VECT_VAR_DECL(expected_vld4_0,uint,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
163 VECT_VAR_DECL(expected_vld4_0,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
164 					       0xaa, 0xaa, 0xaa, 0xaa };
165 VECT_VAR_DECL(expected_vld4_0,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
166 VECT_VAR_DECL(expected_vld4_0,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
167 VECT_VAR_DECL(expected_vld4_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
168 VECT_VAR_DECL(expected_vld4_0,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
169 					       0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
170 VECT_VAR_DECL(expected_vld4_0,int,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
171 					       0xaaaaaaaa, 0xaaaaaaaa };
172 VECT_VAR_DECL(expected_vld4_0,uint,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
173 						0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
174 VECT_VAR_DECL(expected_vld4_0,uint,32,4) [] = { 0xfffffff0, 0xfffffff1,
175 						0xfffffff2, 0xfffffff3 };
176 VECT_VAR_DECL(expected_vld4_0,poly,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
177 						0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
178 VECT_VAR_DECL(expected_vld4_0,hfloat,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
179 						  0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
180 VECT_VAR_DECL(expected_vld4_0,hfloat,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
181 						  0xaaaaaaaa, 0xaaaaaaaa };
182 
183 /* vld4/chunk 1.  */
184 VECT_VAR_DECL(expected_vld4_1,int,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
185 					      0xaa, 0xaa, 0xaa, 0xaa };
186 VECT_VAR_DECL(expected_vld4_1,int,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
187 VECT_VAR_DECL(expected_vld4_1,int,32,2) [] = { 0xfffffff2, 0xfffffff3 };
188 VECT_VAR_DECL(expected_vld4_1,uint,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
189 					       0xaa, 0xaa, 0xaa, 0xaa };
190 VECT_VAR_DECL(expected_vld4_1,uint,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
191 VECT_VAR_DECL(expected_vld4_1,uint,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
192 VECT_VAR_DECL(expected_vld4_1,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
193 					       0xaa, 0xaa, 0xaa, 0xaa };
194 VECT_VAR_DECL(expected_vld4_1,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
195 VECT_VAR_DECL(expected_vld4_1,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
196 VECT_VAR_DECL(expected_vld4_1,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 };
197 VECT_VAR_DECL(expected_vld4_1,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
198 					       0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
199 VECT_VAR_DECL(expected_vld4_1,int,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
200 					       0xaaaaaaaa, 0xaaaaaaaa };
201 VECT_VAR_DECL(expected_vld4_1,uint,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
202 						0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
203 VECT_VAR_DECL(expected_vld4_1,uint,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
204 						0xaaaaaaaa, 0xaaaaaaaa };
205 VECT_VAR_DECL(expected_vld4_1,poly,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
206 						0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
207 VECT_VAR_DECL(expected_vld4_1,hfloat,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
208 						  0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
209 VECT_VAR_DECL(expected_vld4_1,hfloat,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
210 						  0xaaaaaaaa, 0xaaaaaaaa };
211 
212 /* vld4/chunk 2.  */
213 VECT_VAR_DECL(expected_vld4_2,int,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
214 					      0xaa, 0xaa, 0xaa, 0xaa };
215 VECT_VAR_DECL(expected_vld4_2,int,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
216 VECT_VAR_DECL(expected_vld4_2,int,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
217 VECT_VAR_DECL(expected_vld4_2,uint,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
218 					       0xaa, 0xaa, 0xaa, 0xaa };
219 VECT_VAR_DECL(expected_vld4_2,uint,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
220 VECT_VAR_DECL(expected_vld4_2,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 };
221 VECT_VAR_DECL(expected_vld4_2,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
222 					       0xaa, 0xaa, 0xaa, 0xaa };
223 VECT_VAR_DECL(expected_vld4_2,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
224 VECT_VAR_DECL(expected_vld4_2,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
225 VECT_VAR_DECL(expected_vld4_2,hfloat,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
226 VECT_VAR_DECL(expected_vld4_2,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
227 					       0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
228 VECT_VAR_DECL(expected_vld4_2,int,32,4) [] = { 0xfffffff0, 0xfffffff1,
229 					       0xfffffff2, 0xfffffff3 };
230 VECT_VAR_DECL(expected_vld4_2,uint,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
231 						0xfff0, 0xfff1, 0xfff2, 0xfff3 };
232 VECT_VAR_DECL(expected_vld4_2,uint,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
233 						0xaaaaaaaa, 0xaaaaaaaa };
234 VECT_VAR_DECL(expected_vld4_2,poly,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
235 						0xfff0, 0xfff1, 0xfff2, 0xfff3 };
236 VECT_VAR_DECL(expected_vld4_2,hfloat,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
237 						  0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
238 VECT_VAR_DECL(expected_vld4_2,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
239 						  0xc1600000, 0xc1500000 };
240 
241 /* vld4/chunk 3.  */
242 VECT_VAR_DECL(expected_vld4_3,int,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
243 					      0xf0, 0xf1, 0xf2, 0xf3 };
244 VECT_VAR_DECL(expected_vld4_3,int,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
245 VECT_VAR_DECL(expected_vld4_3,int,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
246 VECT_VAR_DECL(expected_vld4_3,uint,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
247 					       0xaa, 0xaa, 0xaa, 0xaa };
248 VECT_VAR_DECL(expected_vld4_3,uint,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
249 VECT_VAR_DECL(expected_vld4_3,uint,32,2) [] = { 0xfffffff2, 0xfffffff3 };
250 VECT_VAR_DECL(expected_vld4_3,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
251 					       0xaa, 0xaa, 0xaa, 0xaa };
252 VECT_VAR_DECL(expected_vld4_3,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
253 VECT_VAR_DECL(expected_vld4_3,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
254 VECT_VAR_DECL(expected_vld4_3,hfloat,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
255 VECT_VAR_DECL(expected_vld4_3,int,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3,
256 					       0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
257 VECT_VAR_DECL(expected_vld4_3,int,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
258 					       0xaaaaaaaa, 0xaaaaaaaa };
259 VECT_VAR_DECL(expected_vld4_3,uint,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
260 						0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
261 VECT_VAR_DECL(expected_vld4_3,uint,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
262 						0xaaaaaaaa, 0xaaaaaaaa };
263 VECT_VAR_DECL(expected_vld4_3,poly,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
264 						0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
265 VECT_VAR_DECL(expected_vld4_3,hfloat,16,8) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80,
266 						  0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
267 VECT_VAR_DECL(expected_vld4_3,hfloat,32,4) [] = { 0xaaaaaaaa, 0xaaaaaaaa,
268 						  0xaaaaaaaa, 0xaaaaaaaa };
269 
270 /* Declare additional input buffers as needed.  */
271 /* Input buffers for vld2_lane */
272 VECT_VAR_DECL_INIT(buffer_vld2_lane, int, 8, 2);
273 VECT_VAR_DECL_INIT(buffer_vld2_lane, int, 16, 2);
274 VECT_VAR_DECL_INIT(buffer_vld2_lane, int, 32, 2);
275 VECT_VAR_DECL_INIT(buffer_vld2_lane, int, 64, 2);
276 VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 8, 2);
277 VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 16, 2);
278 VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 32, 2);
279 VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 64, 2);
280 VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 8, 2);
281 VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 16, 2);
282 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
283 VECT_VAR_DECL_INIT(buffer_vld2_lane, float, 16, 2);
284 #endif
285 VECT_VAR_DECL_INIT(buffer_vld2_lane, float, 32, 2);
286 
287 /* Input buffers for vld3_lane */
288 VECT_VAR_DECL_INIT(buffer_vld3_lane, int, 8, 3);
289 VECT_VAR_DECL_INIT(buffer_vld3_lane, int, 16, 3);
290 VECT_VAR_DECL_INIT(buffer_vld3_lane, int, 32, 3);
291 VECT_VAR_DECL_INIT(buffer_vld3_lane, int, 64, 3);
292 VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 8, 3);
293 VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 16, 3);
294 VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 32, 3);
295 VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 64, 3);
296 VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 8, 3);
297 VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 16, 3);
298 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
299 VECT_VAR_DECL_INIT(buffer_vld3_lane, float, 16, 3);
300 #endif
301 VECT_VAR_DECL_INIT(buffer_vld3_lane, float, 32, 3);
302 
303 /* Input buffers for vld4_lane */
304 VECT_VAR_DECL_INIT(buffer_vld4_lane, int, 8, 4);
305 VECT_VAR_DECL_INIT(buffer_vld4_lane, int, 16, 4);
306 VECT_VAR_DECL_INIT(buffer_vld4_lane, int, 32, 4);
307 VECT_VAR_DECL_INIT(buffer_vld4_lane, int, 64, 4);
308 VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 8, 4);
309 VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 16, 4);
310 VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 32, 4);
311 VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 64, 4);
312 VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 8, 4);
313 VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 16, 4);
314 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
315 VECT_VAR_DECL_INIT(buffer_vld4_lane, float, 16, 4);
316 #endif
317 VECT_VAR_DECL_INIT(buffer_vld4_lane, float, 32, 4);
318 
exec_vldX_lane(void)319 void exec_vldX_lane (void)
320 {
321   /* In this case, input variables are arrays of vectors.  */
322 #define DECL_VLDX_LANE(T1, W, N, X)					\
323   VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vector, T1, W, N, X);	\
324   VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vector_src, T1, W, N, X);	\
325   VECT_VAR_DECL(result_bis_##X, T1, W, N)[X * N]
326 
327   /* We need to use a temporary result buffer (result_bis), because
328      the one used for other tests is not large enough. A subset of the
329      result data is moved from result_bis to result, and it is this
330      subset which is used to check the actual behavior. The next
331      macro enables to move another chunk of data from result_bis to
332      result.  */
333   /* We also use another extra input buffer (buffer_src), which we
334      fill with 0xAA, and which it used to load a vector from which we
335      read a given lane.  */
336 #define TEST_VLDX_LANE(Q, T1, T2, W, N, X, L)				\
337   memset (VECT_VAR(buffer_src, T1, W, N), 0xAA,				\
338 	  sizeof(VECT_VAR(buffer_src, T1, W, N)));			\
339 									\
340   VECT_ARRAY_VAR(vector_src, T1, W, N, X) =				\
341     vld##X##Q##_##T2##W(VECT_VAR(buffer_src, T1, W, N));		\
342 									\
343   VECT_ARRAY_VAR(vector, T1, W, N, X) =					\
344     /* Use dedicated init buffer, of size.  X */			\
345     vld##X##Q##_lane_##T2##W(VECT_VAR(buffer_vld##X##_lane, T1, W, X),	\
346 			     VECT_ARRAY_VAR(vector_src, T1, W, N, X),	\
347 			     L);					\
348   vst##X##Q##_##T2##W(VECT_VAR(result_bis_##X, T1, W, N),		\
349 		      VECT_ARRAY_VAR(vector, T1, W, N, X));		\
350   memcpy(VECT_VAR(result, T1, W, N), VECT_VAR(result_bis_##X, T1, W, N), \
351 	 sizeof(VECT_VAR(result, T1, W, N)))
352 
353   /* Overwrite "result" with the contents of "result_bis"[Y].  */
354 #define TEST_EXTRA_CHUNK(T1, W, N, X, Y)		\
355   memcpy(VECT_VAR(result, T1, W, N),			\
356 	 &(VECT_VAR(result_bis_##X, T1, W, N)[Y*N]),	\
357 	 sizeof(VECT_VAR(result, T1, W, N)));
358 
359   /* We need all variants in 64 bits, but there is no 64x2 variant.  */
360 #define DECL_ALL_VLDX_LANE_NO_FP16(X)		\
361   DECL_VLDX_LANE(int, 8, 8, X);			\
362   DECL_VLDX_LANE(int, 16, 4, X);		\
363   DECL_VLDX_LANE(int, 32, 2, X);		\
364   DECL_VLDX_LANE(uint, 8, 8, X);		\
365   DECL_VLDX_LANE(uint, 16, 4, X);		\
366   DECL_VLDX_LANE(uint, 32, 2, X);		\
367   DECL_VLDX_LANE(poly, 8, 8, X);		\
368   DECL_VLDX_LANE(poly, 16, 4, X);		\
369   DECL_VLDX_LANE(int, 16, 8, X);		\
370   DECL_VLDX_LANE(int, 32, 4, X);		\
371   DECL_VLDX_LANE(uint, 16, 8, X);		\
372   DECL_VLDX_LANE(uint, 32, 4, X);		\
373   DECL_VLDX_LANE(poly, 16, 8, X);		\
374   DECL_VLDX_LANE(float, 32, 2, X);		\
375   DECL_VLDX_LANE(float, 32, 4, X)
376 
377 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
378 #define DECL_ALL_VLDX_LANE(X)		\
379   DECL_ALL_VLDX_LANE_NO_FP16(X);	\
380   DECL_VLDX_LANE(float, 16, 4, X);	\
381   DECL_VLDX_LANE(float, 16, 8, X)
382 #else
383 #define DECL_ALL_VLDX_LANE(X) DECL_ALL_VLDX_LANE_NO_FP16(X)
384 #endif
385 
386   /* Add some padding to try to catch out of bound accesses.  */
387 #define ARRAY1(V, T, W, N) VECT_VAR_DECL(V,T,W,N)[1]={42}
388 #define DUMMY_ARRAY(V, T, W, N, L) \
389   VECT_VAR_DECL(V,T,W,N)[N*L]={0}; \
390   ARRAY1(V##_pad,T,W,N)
391 
392   /* Use the same lanes regardless of the size of the array (X), for
393      simplicity.  */
394 #define TEST_ALL_VLDX_LANE_NO_FP16(X)		\
395   TEST_VLDX_LANE(, int, s, 8, 8, X, 7);		\
396   TEST_VLDX_LANE(, int, s, 16, 4, X, 2);	\
397   TEST_VLDX_LANE(, int, s, 32, 2, X, 0);	\
398   TEST_VLDX_LANE(, uint, u, 8, 8, X, 4);	\
399   TEST_VLDX_LANE(, uint, u, 16, 4, X, 3);	\
400   TEST_VLDX_LANE(, uint, u, 32, 2, X, 1);	\
401   TEST_VLDX_LANE(, poly, p, 8, 8, X, 4);	\
402   TEST_VLDX_LANE(, poly, p, 16, 4, X, 3);	\
403   TEST_VLDX_LANE(q, int, s, 16, 8, X, 6);	\
404   TEST_VLDX_LANE(q, int, s, 32, 4, X, 2);	\
405   TEST_VLDX_LANE(q, uint, u, 16, 8, X, 5);	\
406   TEST_VLDX_LANE(q, uint, u, 32, 4, X, 0);	\
407   TEST_VLDX_LANE(q, poly, p, 16, 8, X, 5);	\
408   TEST_VLDX_LANE(, float, f, 32, 2, X, 0);	\
409   TEST_VLDX_LANE(q, float, f, 32, 4, X, 2)
410 
411 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
412 #define TEST_ALL_VLDX_LANE(X)			\
413   TEST_ALL_VLDX_LANE_NO_FP16(X);		\
414   TEST_VLDX_LANE(, float, f, 16, 4, X, 2);	\
415   TEST_VLDX_LANE(q, float, f, 16, 8, X, 6)
416 #else
417 #define TEST_ALL_VLDX_LANE(X) TEST_ALL_VLDX_LANE_NO_FP16(X)
418 #endif
419 
420 #define TEST_ALL_EXTRA_CHUNKS_NO_FP16(X,Y)	\
421   TEST_EXTRA_CHUNK(int, 8, 8, X, Y);		\
422   TEST_EXTRA_CHUNK(int, 16, 4, X, Y);		\
423   TEST_EXTRA_CHUNK(int, 32, 2, X, Y);		\
424   TEST_EXTRA_CHUNK(uint, 8, 8, X, Y);		\
425   TEST_EXTRA_CHUNK(uint, 16, 4, X, Y);		\
426   TEST_EXTRA_CHUNK(uint, 32, 2, X, Y);		\
427   TEST_EXTRA_CHUNK(poly, 8, 8, X, Y);		\
428   TEST_EXTRA_CHUNK(poly, 16, 4, X, Y);		\
429   TEST_EXTRA_CHUNK(int, 16, 8, X, Y);		\
430   TEST_EXTRA_CHUNK(int, 32, 4, X, Y);		\
431   TEST_EXTRA_CHUNK(uint, 16, 8, X, Y);		\
432   TEST_EXTRA_CHUNK(uint, 32, 4, X, Y);		\
433   TEST_EXTRA_CHUNK(poly, 16, 8, X, Y);		\
434   TEST_EXTRA_CHUNK(float, 32, 2, X, Y);		\
435   TEST_EXTRA_CHUNK(float, 32, 4, X, Y)
436 
437 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
438 #define TEST_ALL_EXTRA_CHUNKS(X,Y)		\
439   TEST_ALL_EXTRA_CHUNKS_NO_FP16(X, Y);		\
440   TEST_EXTRA_CHUNK(float, 16, 4, X, Y);		\
441   TEST_EXTRA_CHUNK(float, 16, 8, X, Y)
442 #else
443 #define TEST_ALL_EXTRA_CHUNKS(X,Y) TEST_ALL_EXTRA_CHUNKS_NO_FP16(X, Y)
444 #endif
445 
446   /* vldX_lane supports only a subset of all variants.  */
447 #define CHECK_RESULTS_VLDX_LANE_NO_FP16(test_name,EXPECTED,comment)	\
448     CHECK(test_name, int, 8, 8, PRIx8, EXPECTED, comment);		\
449     CHECK(test_name, int, 16, 4, PRIx16, EXPECTED, comment);		\
450     CHECK(test_name, int, 32, 2, PRIx32, EXPECTED, comment);		\
451     CHECK(test_name, uint, 8, 8, PRIx8, EXPECTED, comment);		\
452     CHECK(test_name, uint, 16, 4, PRIx16, EXPECTED, comment);		\
453     CHECK(test_name, uint, 32, 2, PRIx32, EXPECTED, comment);		\
454     CHECK_POLY(test_name, poly, 8, 8, PRIx8, EXPECTED, comment);	\
455     CHECK_POLY(test_name, poly, 16, 4, PRIx16, EXPECTED, comment);	\
456     CHECK_FP(test_name, float, 32, 2, PRIx32, EXPECTED, comment);	\
457     CHECK(test_name, int, 16, 8, PRIx16, EXPECTED, comment);		\
458     CHECK(test_name, int, 32, 4, PRIx32, EXPECTED, comment);		\
459     CHECK(test_name, uint, 16, 8, PRIx16, EXPECTED, comment);		\
460     CHECK(test_name, uint, 32, 4, PRIx32, EXPECTED, comment);		\
461     CHECK_POLY(test_name, poly, 16, 8, PRIx16, EXPECTED, comment);	\
462     CHECK_FP(test_name, float, 32, 4, PRIx32, EXPECTED, comment)
463 
464 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
465 #define CHECK_RESULTS_VLDX_LANE(test_name,EXPECTED,comment)		\
466   {									\
467     CHECK_RESULTS_VLDX_LANE_NO_FP16(test_name,EXPECTED,comment);	\
468     CHECK_FP(test_name, float, 16, 4, PRIx16, EXPECTED, comment);	\
469     CHECK_FP(test_name, float, 16, 8, PRIx16, EXPECTED, comment);	\
470   }
471 #else
472 #define CHECK_RESULTS_VLDX_LANE(test_name,EXPECTED,comment)		\
473   {									\
474     CHECK_RESULTS_VLDX_LANE_NO_FP16(test_name,EXPECTED,comment);	\
475   }
476 #endif
477 
478   /* Declare the temporary buffers / variables.  */
479   DECL_ALL_VLDX_LANE(2);
480   DECL_ALL_VLDX_LANE(3);
481   DECL_ALL_VLDX_LANE(4);
482 
483   /* Define dummy input arrays, large enough for x4 vectors.  */
484   DUMMY_ARRAY(buffer_src, int, 8, 8, 4);
485   DUMMY_ARRAY(buffer_src, int, 16, 4, 4);
486   DUMMY_ARRAY(buffer_src, int, 32, 2, 4);
487   DUMMY_ARRAY(buffer_src, uint, 8, 8, 4);
488   DUMMY_ARRAY(buffer_src, uint, 16, 4, 4);
489   DUMMY_ARRAY(buffer_src, uint, 32, 2, 4);
490   DUMMY_ARRAY(buffer_src, poly, 8, 8, 4);
491   DUMMY_ARRAY(buffer_src, poly, 16, 4, 4);
492   DUMMY_ARRAY(buffer_src, int, 16, 8, 4);
493   DUMMY_ARRAY(buffer_src, int, 32, 4, 4);
494   DUMMY_ARRAY(buffer_src, uint, 16, 8, 4);
495   DUMMY_ARRAY(buffer_src, uint, 32, 4, 4);
496   DUMMY_ARRAY(buffer_src, poly, 16, 8, 4);
497 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
498   DUMMY_ARRAY(buffer_src, float, 16, 4, 4);
499   DUMMY_ARRAY(buffer_src, float, 16, 8, 4);
500 #endif
501   DUMMY_ARRAY(buffer_src, float, 32, 2, 4);
502   DUMMY_ARRAY(buffer_src, float, 32, 4, 4);
503 
504   /* Check vld2_lane/vld2q_lane.  */
505   clean_results ();
506 #define TEST_MSG "VLD2_LANE/VLD2Q_LANE"
507   TEST_ALL_VLDX_LANE(2);
508   CHECK_RESULTS_VLDX_LANE (TEST_MSG, expected_vld2_0, " chunk 0");
509 
510   TEST_ALL_EXTRA_CHUNKS(2, 1);
511   CHECK_RESULTS_VLDX_LANE (TEST_MSG, expected_vld2_1, " chunk 1");
512 
513   /* Check vld3_lane/vld3q_lane.  */
514   clean_results ();
515 #undef TEST_MSG
516 #define TEST_MSG "VLD3_LANE/VLD3Q_LANE"
517   TEST_ALL_VLDX_LANE(3);
518   CHECK_RESULTS_VLDX_LANE (TEST_MSG, expected_vld3_0, " chunk 0");
519 
520   TEST_ALL_EXTRA_CHUNKS(3, 1);
521   CHECK_RESULTS_VLDX_LANE (TEST_MSG, expected_vld3_1, " chunk 1");
522 
523   TEST_ALL_EXTRA_CHUNKS(3, 2);
524   CHECK_RESULTS_VLDX_LANE (TEST_MSG, expected_vld3_2, " chunk 2");
525 
526   /* Check vld4_lane/vld4q_lane.  */
527   clean_results ();
528 #undef TEST_MSG
529 #define TEST_MSG "VLD4_LANE/VLD4Q_LANE"
530   TEST_ALL_VLDX_LANE(4);
531   CHECK_RESULTS_VLDX_LANE (TEST_MSG, expected_vld4_0, " chunk 0");
532 
533   TEST_ALL_EXTRA_CHUNKS(4, 1);
534   CHECK_RESULTS_VLDX_LANE (TEST_MSG, expected_vld4_1, " chunk 1");
535   TEST_ALL_EXTRA_CHUNKS(4, 2);
536 
537   CHECK_RESULTS_VLDX_LANE (TEST_MSG, expected_vld4_2, " chunk 2");
538 
539   TEST_ALL_EXTRA_CHUNKS(4, 3);
540   CHECK_RESULTS_VLDX_LANE (TEST_MSG, expected_vld4_3, " chunk 3");
541 }
542 
/* Test entry point: run the vldX_lane checks, then return 0.  */
int main (void)
{
  exec_vldX_lane ();
  return 0;
}