xref: /qemu/tests/tcg/hexagon/hvx_misc.c (revision 727385c4)
1 /*
2  *  Copyright(c) 2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
3  *
4  *  This program is free software; you can redistribute it and/or modify
5  *  it under the terms of the GNU General Public License as published by
6  *  the Free Software Foundation; either version 2 of the License, or
7  *  (at your option) any later version.
8  *
9  *  This program is distributed in the hope that it will be useful,
10  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
11  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  *  GNU General Public License for more details.
13  *
14  *  You should have received a copy of the GNU General Public License
15  *  along with this program; if not, see <http://www.gnu.org/licenses/>.
16  */
17 
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
#include <stdbool.h>
#include <string.h>
22 
/* Global error counter; main() reports PASS/FAIL based on it */
int err;

/*
 * Report a single element mismatch.
 *
 * line:   source line of the failing test (passed via __LINE__)
 * i, j:   vector index and element index of the mismatch
 * result: value produced by the code under test
 * expect: expected value
 *
 * Use PRIx64 so the format is correct for uint64_t on both ILP32 and
 * LP64 targets ("%llx" is wrong where uint64_t is unsigned long).
 */
static void __check(int line, int i, int j, uint64_t result, uint64_t expect)
{
    if (result != expect) {
        printf("ERROR at line %d: [%d][%d] 0x%016" PRIx64 " != 0x%016" PRIx64 "\n",
               line, i, j, result, expect);
        err++;
    }
}
33 
/*
 * Convenience wrapper around __check() for scalar (non-indexed) checks.
 * __check() takes five arguments; pass 0 for the i/j element indices
 * since there is no vector position to report.  (The previous form
 * expanded to a 3-argument call and would not compile if used.)
 */
#define check(RES, EXP) __check(__LINE__, 0, 0, RES, EXP)
35 
/* Maximum HVX vector register size in bytes (128B mode) */
#define MAX_VEC_SIZE_BYTES         128

/*
 * Overlay of one full-width vector, viewable as signed or unsigned
 * elements of each supported width (byte/half/word/double word).
 */
typedef union {
    uint64_t ud[MAX_VEC_SIZE_BYTES / 8];
    int64_t   d[MAX_VEC_SIZE_BYTES / 8];
    uint32_t uw[MAX_VEC_SIZE_BYTES / 4];
    int32_t   w[MAX_VEC_SIZE_BYTES / 4];
    uint16_t uh[MAX_VEC_SIZE_BYTES / 2];
    int16_t   h[MAX_VEC_SIZE_BYTES / 2];
    uint8_t  ub[MAX_VEC_SIZE_BYTES / 1];
    int8_t    b[MAX_VEC_SIZE_BYTES / 1];
} MMVector;
48 
#define BUFSIZE      16     /* number of vectors in each input buffer */
#define OUTSIZE      16     /* number of vectors in output/expect */
#define MASKMOD      3      /* modulus used when building the store mask */

/*
 * Test data: two input buffers, a mask buffer for predicated stores,
 * and actual/expected output buffers.  All are vector-aligned so that
 * aligned vmem accesses can be used on them.
 */
MMVector buffer0[BUFSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));
MMVector buffer1[BUFSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));
MMVector mask[BUFSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));
MMVector output[OUTSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));
MMVector expect[OUTSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));
58 
59 #define CHECK_OUTPUT_FUNC(FIELD, FIELDSZ) \
60 static void check_output_##FIELD(int line, size_t num_vectors) \
61 { \
62     for (int i = 0; i < num_vectors; i++) { \
63         for (int j = 0; j < MAX_VEC_SIZE_BYTES / FIELDSZ; j++) { \
64             __check(line, i, j, output[i].FIELD[j], expect[i].FIELD[j]); \
65         } \
66     } \
67 }
68 
69 CHECK_OUTPUT_FUNC(d,  8)
70 CHECK_OUTPUT_FUNC(w,  4)
71 CHECK_OUTPUT_FUNC(h,  2)
72 CHECK_OUTPUT_FUNC(b,  1)
73 
74 static void init_buffers(void)
75 {
76     int counter0 = 0;
77     int counter1 = 17;
78     for (int i = 0; i < BUFSIZE; i++) {
79         for (int j = 0; j < MAX_VEC_SIZE_BYTES; j++) {
80             buffer0[i].b[j] = counter0++;
81             buffer1[i].b[j] = counter1++;
82         }
83         for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
84             mask[i].w[j] = (i + j % MASKMOD == 0) ? 0 : 1;
85         }
86     }
87 }
88 
/*
 * Test the .tmp form of a vector load.
 *
 * The vadd inside the packet must see the value loaded by the .tmp
 * load (buffer1), while the vadd in the following packet must see the
 * value v12 held *before* the packet (the vsplat of 1).  Each output
 * word is therefore buffer0 + buffer1 + 1.
 */
static void test_load_tmp(void)
{
    void *p0 = buffer0;
    void *p1 = buffer1;
    void *pout = output;

    for (int i = 0; i < BUFSIZE; i++) {
        /*
         * Load into v12 as .tmp, then use it in the next packet
         * Should get the new value within the same packet and
         * the old value in the next packet
         */
        asm("v3 = vmem(%0 + #0)\n\t"
            "r1 = #1\n\t"
            "v12 = vsplat(r1)\n\t"
            "{\n\t"
            "    v12.tmp = vmem(%1 + #0)\n\t"
            "    v4.w = vadd(v12.w, v3.w)\n\t"
            "}\n\t"
            "v4.w = vadd(v4.w, v12.w)\n\t"
            "vmem(%2 + #0) = v4\n\t"
            : : "r"(p0), "r"(p1), "r"(pout)
            /* NOTE(review): "v6" in the clobber list looks unused -- confirm */
            : "r1", "v12", "v3", "v4", "v6", "memory");
        p0 += sizeof(MMVector);
        p1 += sizeof(MMVector);
        pout += sizeof(MMVector);

        /* new value (buffer1) inside the packet, old value (1) after it */
        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
            expect[i].w[j] = buffer0[i].w[j] + buffer1[i].w[j] + 1;
        }
    }

    check_output_w(__LINE__, BUFSIZE);
}
123 
/*
 * Test the .cur form of a vector load: the store in the same packet
 * must observe the value loaded by the .cur load, so the output is a
 * straight copy of buffer0.
 */
static void test_load_cur(void)
{
    void *p0 = buffer0;
    void *pout = output;

    for (int i = 0; i < BUFSIZE; i++) {
        asm("{\n\t"
            "    v2.cur = vmem(%0 + #0)\n\t"
            "    vmem(%1 + #0) = v2\n\t"
            "}\n\t"
            : : "r"(p0), "r"(pout) : "v2", "memory");
        p0 += sizeof(MMVector);
        pout += sizeof(MMVector);

        /* Expect an exact copy of the input vector */
        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
            expect[i].uw[j] = buffer0[i].uw[j];
        }
    }

    check_output_w(__LINE__, BUFSIZE);
}
145 
/*
 * Test that an aligned vector load (vmem) ignores the low bits of the
 * address: loading from buffer0 + 13 must still read buffer0[0].
 */
static void test_load_aligned(void)
{
    /* Aligned loads ignore the low bits of the address */
    void *p0 = buffer0;
    void *pout = output;
    const size_t offset = 13;

    p0 += offset;    /* Create an unaligned address */
    asm("v2 = vmem(%0 + #0)\n\t"
        "vmem(%1 + #0) = v2\n\t"
        : : "r"(p0), "r"(pout) : "v2", "memory");

    expect[0] = buffer0[0];

    check_output_w(__LINE__, 1);
}
162 
/*
 * Test an unaligned vector load (vmemu): the full vector starting at
 * byte offset 12 into buffer0 must be read, spanning two aligned
 * vectors of the underlying buffer.
 */
static void test_load_unaligned(void)
{
    void *p0 = buffer0;
    void *pout = output;
    const size_t offset = 12;

    p0 += offset;    /* Create an unaligned address */
    asm("v2 = vmemu(%0 + #0)\n\t"
        "vmem(%1 + #0) = v2\n\t"
        : : "r"(p0), "r"(pout) : "v2", "memory");

    /* Expected output is buffer0 bytes [offset, offset + vector size) */
    memcpy(expect, &buffer0[0].ub[offset], sizeof(MMVector));

    check_output_w(__LINE__, 1);
}
178 
/*
 * Test that an aligned vector store (vmem) ignores the low bits of the
 * address: storing to output + 13 must still write output[0].
 */
static void test_store_aligned(void)
{
    /* Aligned stores ignore the low bits of the address */
    void *p0 = buffer0;
    void *pout = output;
    const size_t offset = 13;

    pout += offset;    /* Create an unaligned address */
    asm("v2 = vmem(%0 + #0)\n\t"
        "vmem(%1 + #0) = v2\n\t"
        : : "r"(p0), "r"(pout) : "v2", "memory");

    expect[0] = buffer0[0];

    check_output_w(__LINE__, 1);
}
195 
/*
 * Test an unaligned vector store (vmemu): the stored vector spans
 * output[0] and output[1] starting at byte offset 12.
 */
static void test_store_unaligned(void)
{
    void *p0 = buffer0;
    void *pout = output;
    const size_t offset = 12;

    pout += offset;    /* Create an unaligned address */
    asm("v2 = vmem(%0 + #0)\n\t"
        "vmemu(%1 + #0) = v2\n\t"
        : : "r"(p0), "r"(pout) : "v2", "memory");

    /*
     * Seed expect[0..1] with buffer0[0..1], then overlay the stored
     * vector at the byte offset.  NOTE(review): the seed relies on the
     * bytes of output[0..1] outside the stored range still holding
     * buffer0[0..1] from the preceding tests -- confirm test ordering
     * in main() if this test is moved.
     */
    memcpy(expect, buffer0, 2 * sizeof(MMVector));
    memcpy(&expect[0].ub[offset], buffer0, sizeof(MMVector));

    check_output_w(__LINE__, 2);
}
212 
/*
 * Test conditional (predicated) vector stores.
 *
 * q0 is set true in the word lanes where the mask word is 0, i.e.
 * exactly where i + (j % MASKMOD) == 0.  The predicated store writes
 * buffer0 only in the selected lanes (inverted selection when invert
 * is true); lanes not written keep the 0xff pre-fill.
 */
static void test_masked_store(bool invert)
{
    void *p0 = buffer0;
    void *pmask = mask;
    void *pout = output;

    /* Pre-fill both buffers so untouched lanes are detectable */
    memset(expect, 0xff, sizeof(expect));
    memset(output, 0xff, sizeof(expect));

    for (int i = 0; i < BUFSIZE; i++) {
        if (invert) {
            asm("r4 = #0\n\t"
                "v4 = vsplat(r4)\n\t"
                "v5 = vmem(%0 + #0)\n\t"
                "q0 = vcmp.eq(v4.w, v5.w)\n\t"
                "v5 = vmem(%1)\n\t"
                "if (!q0) vmem(%2) = v5\n\t"             /* Inverted test */
                : : "r"(pmask), "r"(p0), "r"(pout)
                : "r4", "v4", "v5", "q0", "memory");
        } else {
            asm("r4 = #0\n\t"
                "v4 = vsplat(r4)\n\t"
                "v5 = vmem(%0 + #0)\n\t"
                "q0 = vcmp.eq(v4.w, v5.w)\n\t"
                "v5 = vmem(%1)\n\t"
                "if (q0) vmem(%2) = v5\n\t"             /* Non-inverted test */
                : : "r"(pmask), "r"(p0), "r"(pout)
                : "r4", "v4", "v5", "q0", "memory");
        }
        p0 += sizeof(MMVector);
        pmask += sizeof(MMVector);
        pout += sizeof(MMVector);

        /* mask[i].w[j] == 0 exactly when i + (j % MASKMOD) == 0 */
        for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
            if (invert) {
                if (i + j % MASKMOD != 0) {
                    expect[i].w[j] = buffer0[i].w[j];
                }
            } else {
                if (i + j % MASKMOD == 0) {
                    expect[i].w[j] = buffer0[i].w[j];
                }
            }
        }
    }

    check_output_w(__LINE__, BUFSIZE);
}
261 
/*
 * Test a new-value vector store: the store in the same packet uses
 * v2.new, i.e. the value produced by the load in that packet, so the
 * output is a copy of buffer0[0].
 */
static void test_new_value_store(void)
{
    void *p0 = buffer0;
    void *pout = output;

    asm("{\n\t"
        "    v2 = vmem(%0 + #0)\n\t"
        "    vmem(%1 + #0) = v2.new\n\t"
        "}\n\t"
        : : "r"(p0), "r"(pout) : "v2", "memory");

    expect[0] = buffer0[0];

    check_output_w(__LINE__, 1);
}
277 
278 static void test_max_temps()
279 {
280     void *p0 = buffer0;
281     void *pout = output;
282 
283     asm("v0 = vmem(%0 + #0)\n\t"
284         "v1 = vmem(%0 + #1)\n\t"
285         "v2 = vmem(%0 + #2)\n\t"
286         "v3 = vmem(%0 + #3)\n\t"
287         "v4 = vmem(%0 + #4)\n\t"
288         "{\n\t"
289         "    v1:0.w = vadd(v3:2.w, v1:0.w)\n\t"
290         "    v2.b = vshuffe(v3.b, v2.b)\n\t"
291         "    v3.w = vadd(v1.w, v4.w)\n\t"
292         "    v4.tmp = vmem(%0 + #5)\n\t"
293         "}\n\t"
294         "vmem(%1 + #0) = v0\n\t"
295         "vmem(%1 + #1) = v1\n\t"
296         "vmem(%1 + #2) = v2\n\t"
297         "vmem(%1 + #3) = v3\n\t"
298         "vmem(%1 + #4) = v4\n\t"
299         : : "r"(p0), "r"(pout) : "memory");
300 
301         /* The first two vectors come from the vadd-pair instruction */
302         for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
303             expect[0].w[i] = buffer0[0].w[i] + buffer0[2].w[i];
304             expect[1].w[i] = buffer0[1].w[i] + buffer0[3].w[i];
305         }
306         /* The third vector comes from the vshuffe instruction */
307         for (int i = 0; i < MAX_VEC_SIZE_BYTES / 2; i++) {
308             expect[2].uh[i] = (buffer0[2].uh[i] & 0xff) |
309                               (buffer0[3].uh[i] & 0xff) << 8;
310         }
311         /* The fourth vector comes from the vadd-single instruction */
312         for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
313             expect[3].w[i] = buffer0[1].w[i] + buffer0[5].w[i];
314         }
315         /*
316          * The fifth vector comes from the load to v4
317          * make sure the .tmp is dropped
318          */
319         expect[4] = buffer0[4];
320 
321         check_output_b(__LINE__, 5);
322 }
323 
/*
 * Emit a unary vector op: v2 = ASM(v2) on one vector loaded from IN,
 * with the result stored to OUT.  EL is the element-size suffix
 * (e.g. .w) or empty for whole-vector ops.
 */
#define VEC_OP1(ASM, EL, IN, OUT) \
    asm("v2 = vmem(%0 + #0)\n\t" \
        "v2" #EL " = " #ASM "(v2" #EL ")\n\t" \
        "vmem(%1 + #0) = v2\n\t" \
        : : "r"(IN), "r"(OUT) : "v2", "memory")

/*
 * Emit a binary vector op: v2 = ASM(v2, v3) on vectors loaded from
 * IN0 and IN1, with the result stored to OUT.
 */
#define VEC_OP2(ASM, EL, IN0, IN1, OUT) \
    asm("v2 = vmem(%0 + #0)\n\t" \
        "v3 = vmem(%1 + #0)\n\t" \
        "v2" #EL " = " #ASM "(v2" #EL ", v3" #EL ")\n\t" \
        "vmem(%2 + #0) = v2\n\t" \
        : : "r"(IN0), "r"(IN1), "r"(OUT) : "v2", "v3", "memory")
336 
/*
 * Generate test_<NAME>() for a unary vector instruction: apply ASM to
 * every vector in buffer0 and compare against the C operator OP
 * applied element-wise (FIELD/FIELDSZ select the element type/size).
 */
#define TEST_VEC_OP1(NAME, ASM, EL, FIELD, FIELDSZ, OP) \
static void test_##NAME(void) \
{ \
    void *pin = buffer0; \
    void *pout = output; \
    for (int i = 0; i < BUFSIZE; i++) { \
        VEC_OP1(ASM, EL, pin, pout); \
        pin += sizeof(MMVector); \
        pout += sizeof(MMVector); \
    } \
    for (int i = 0; i < BUFSIZE; i++) { \
        for (int j = 0; j < MAX_VEC_SIZE_BYTES / FIELDSZ; j++) { \
            expect[i].FIELD[j] = OP buffer0[i].FIELD[j]; \
        } \
    } \
    check_output_##FIELD(__LINE__, BUFSIZE); \
}
354 
/*
 * Generate test_<NAME>() for a binary vector instruction: apply ASM to
 * buffer0/buffer1 pairs and compare against the C operator OP applied
 * element-wise (FIELD/FIELDSZ select the element type/size).
 */
#define TEST_VEC_OP2(NAME, ASM, EL, FIELD, FIELDSZ, OP) \
static void test_##NAME(void) \
{ \
    void *p0 = buffer0; \
    void *p1 = buffer1; \
    void *pout = output; \
    for (int i = 0; i < BUFSIZE; i++) { \
        VEC_OP2(ASM, EL, p0, p1, pout); \
        p0 += sizeof(MMVector); \
        p1 += sizeof(MMVector); \
        pout += sizeof(MMVector); \
    } \
    for (int i = 0; i < BUFSIZE; i++) { \
        for (int j = 0; j < MAX_VEC_SIZE_BYTES / FIELDSZ; j++) { \
            expect[i].FIELD[j] = buffer0[i].FIELD[j] OP buffer1[i].FIELD[j]; \
        } \
    } \
    check_output_##FIELD(__LINE__, BUFSIZE); \
}
374 
/* Byte value the predicate comparisons are made against */
#define THRESHOLD        31

/*
 * Emit predicate-combining asm: q0/q1 hold byte-wise "input > THRESHOLD"
 * comparisons for IN0/IN1; ASM (and/or/xor) combines them, with INV
 * optionally inverting q1 (pass "!" or "").  0xff is stored to OUT in
 * the byte lanes where the combined predicate q2 is true.
 */
#define PRED_OP2(ASM, IN0, IN1, OUT, INV) \
    asm("r4 = #%3\n\t" \
        "v1.b = vsplat(r4)\n\t" \
        "v2 = vmem(%0 + #0)\n\t" \
        "q0 = vcmp.gt(v2.b, v1.b)\n\t" \
        "v3 = vmem(%1 + #0)\n\t" \
        "q1 = vcmp.gt(v3.b, v1.b)\n\t" \
        "q2 = " #ASM "(q0, " INV "q1)\n\t" \
        "r4 = #0xff\n\t" \
        "v1.b = vsplat(r4)\n\t" \
        "if (q2) vmem(%2 + #0) = v1\n\t" \
        : : "r"(IN0), "r"(IN1), "r"(OUT), "i"(THRESHOLD) \
        : "r4", "v1", "v2", "v3", "q0", "q1", "q2", "memory")
390 
/*
 * Generate test_<NAME>(invert) for a predicate-combining instruction.
 * The invert argument must match the INV string baked into the
 * instantiation ("" <-> false, "!" <-> true) so the C model mirrors
 * the asm.  Locals renamed (t0/t1) so they no longer shadow the p0/p1
 * pointers, and the output pre-fill uses sizeof(output).
 */
#define TEST_PRED_OP2(NAME, ASM, OP, INV) \
static void test_##NAME(bool invert) \
{ \
    void *p0 = buffer0; \
    void *p1 = buffer1; \
    void *pout = output; \
    memset(output, 0, sizeof(output)); \
    for (int i = 0; i < BUFSIZE; i++) { \
        PRED_OP2(ASM, p0, p1, pout, INV); \
        p0 += sizeof(MMVector); \
        p1 += sizeof(MMVector); \
        pout += sizeof(MMVector); \
    } \
    for (int i = 0; i < BUFSIZE; i++) { \
        for (int j = 0; j < MAX_VEC_SIZE_BYTES; j++) { \
            bool t0 = (buffer0[i].b[j] > THRESHOLD); \
            bool t1 = (buffer1[i].b[j] > THRESHOLD); \
            if (invert) { \
                expect[i].b[j] = (t0 OP !t1) ? 0xff : 0x00; \
            } else { \
                expect[i].b[j] = (t0 OP t1) ? 0xff : 0x00; \
            } \
        } \
    } \
    check_output_b(__LINE__, BUFSIZE); \
}
417 
/* Element-wise arithmetic ops at each element width */
TEST_VEC_OP2(vadd_w, vadd, .w, w, 4, +)
TEST_VEC_OP2(vadd_h, vadd, .h, h, 2, +)
TEST_VEC_OP2(vadd_b, vadd, .b, b, 1, +)
TEST_VEC_OP2(vsub_w, vsub, .w, w, 4, -)
TEST_VEC_OP2(vsub_h, vsub, .h, h, 2, -)
TEST_VEC_OP2(vsub_b, vsub, .b, b, 1, -)
/* Whole-vector bitwise ops (checked as 64-bit chunks) */
TEST_VEC_OP2(vxor, vxor, , d, 8, ^)
TEST_VEC_OP2(vand, vand, , d, 8, &)
TEST_VEC_OP2(vor, vor, , d, 8, |)
TEST_VEC_OP1(vnot, vnot, , d, 8, ~)

/* Predicate combinations; the *_n variants invert the second operand */
TEST_PRED_OP2(pred_or, or, |, "")
TEST_PRED_OP2(pred_or_n, or, |, "!")
TEST_PRED_OP2(pred_and, and, &, "")
TEST_PRED_OP2(pred_and_n, and, &, "!")
TEST_PRED_OP2(pred_xor, xor, ^, "")
434 
435 int main()
436 {
437     init_buffers();
438 
439     test_load_tmp();
440     test_load_cur();
441     test_load_aligned();
442     test_load_unaligned();
443     test_store_aligned();
444     test_store_unaligned();
445     test_masked_store(false);
446     test_masked_store(true);
447     test_new_value_store();
448     test_max_temps();
449 
450     test_vadd_w();
451     test_vadd_h();
452     test_vadd_b();
453     test_vsub_w();
454     test_vsub_h();
455     test_vsub_b();
456     test_vxor();
457     test_vand();
458     test_vor();
459     test_vnot();
460 
461     test_pred_or(false);
462     test_pred_or_n(true);
463     test_pred_and(false);
464     test_pred_and_n(true);
465     test_pred_xor(false);
466 
467     puts(err ? "FAIL" : "PASS");
468     return err ? 1 : 0;
469 }
470