xref: /qemu/tests/tcg/hexagon/hvx_misc.c (revision b887b6b7)
/*
 *  Copyright(c) 2021-2024 Qualcomm Innovation Center, Inc. All Rights Reserved.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
17afb9539eSTaylor Simpson 
18afb9539eSTaylor Simpson #include <stdio.h>
19afb9539eSTaylor Simpson #include <stdint.h>
20afb9539eSTaylor Simpson #include <stdbool.h>
21afb9539eSTaylor Simpson #include <string.h>
225b0043c6STaylor Simpson #include <limits.h>
23afb9539eSTaylor Simpson 
/*
 * Overall failure indicator: main() prints "FAIL" and returns 1 when this is
 * non-zero.  Nothing in this file writes it directly; it is defined ahead of
 * the hvx_misc.h include, presumably so the check helpers declared there can
 * set it -- TODO confirm against hvx_misc.h.
 */
int err;
25afb9539eSTaylor Simpson 
26761e1c67STaylor Simpson #include "hvx_misc.h"
27afb9539eSTaylor Simpson 
test_load_tmp(void)28afb9539eSTaylor Simpson static void test_load_tmp(void)
29afb9539eSTaylor Simpson {
30afb9539eSTaylor Simpson     void *p0 = buffer0;
31afb9539eSTaylor Simpson     void *p1 = buffer1;
32afb9539eSTaylor Simpson     void *pout = output;
33afb9539eSTaylor Simpson 
34afb9539eSTaylor Simpson     for (int i = 0; i < BUFSIZE; i++) {
35afb9539eSTaylor Simpson         /*
36afb9539eSTaylor Simpson          * Load into v12 as .tmp, then use it in the next packet
37afb9539eSTaylor Simpson          * Should get the new value within the same packet and
38afb9539eSTaylor Simpson          * the old value in the next packet
39afb9539eSTaylor Simpson          */
40afb9539eSTaylor Simpson         asm("v3 = vmem(%0 + #0)\n\t"
41afb9539eSTaylor Simpson             "r1 = #1\n\t"
42afb9539eSTaylor Simpson             "v12 = vsplat(r1)\n\t"
43afb9539eSTaylor Simpson             "{\n\t"
44afb9539eSTaylor Simpson             "    v12.tmp = vmem(%1 + #0)\n\t"
45afb9539eSTaylor Simpson             "    v4.w = vadd(v12.w, v3.w)\n\t"
46afb9539eSTaylor Simpson             "}\n\t"
47afb9539eSTaylor Simpson             "v4.w = vadd(v4.w, v12.w)\n\t"
48afb9539eSTaylor Simpson             "vmem(%2 + #0) = v4\n\t"
49afb9539eSTaylor Simpson             : : "r"(p0), "r"(p1), "r"(pout)
50afb9539eSTaylor Simpson             : "r1", "v12", "v3", "v4", "v6", "memory");
51afb9539eSTaylor Simpson         p0 += sizeof(MMVector);
52afb9539eSTaylor Simpson         p1 += sizeof(MMVector);
53afb9539eSTaylor Simpson         pout += sizeof(MMVector);
54afb9539eSTaylor Simpson 
55afb9539eSTaylor Simpson         for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
56afb9539eSTaylor Simpson             expect[i].w[j] = buffer0[i].w[j] + buffer1[i].w[j] + 1;
57afb9539eSTaylor Simpson         }
58afb9539eSTaylor Simpson     }
59afb9539eSTaylor Simpson 
60afb9539eSTaylor Simpson     check_output_w(__LINE__, BUFSIZE);
61afb9539eSTaylor Simpson }
62afb9539eSTaylor Simpson 
test_load_tmp2(void)633fd49e22SMarco Liebel static void test_load_tmp2(void)
643fd49e22SMarco Liebel {
653fd49e22SMarco Liebel     void *pout0 = &output[0];
663fd49e22SMarco Liebel     void *pout1 = &output[1];
673fd49e22SMarco Liebel 
683fd49e22SMarco Liebel     asm volatile(
693fd49e22SMarco Liebel         "r0 = #0x03030303\n\t"
703fd49e22SMarco Liebel         "v16 = vsplat(r0)\n\t"
713fd49e22SMarco Liebel         "r0 = #0x04040404\n\t"
723fd49e22SMarco Liebel         "v18 = vsplat(r0)\n\t"
733fd49e22SMarco Liebel         "r0 = #0x05050505\n\t"
743fd49e22SMarco Liebel         "v21 = vsplat(r0)\n\t"
753fd49e22SMarco Liebel         "{\n\t"
763fd49e22SMarco Liebel         "   v25:24 += vmpyo(v18.w, v14.h)\n\t"
773fd49e22SMarco Liebel         "   v15:14.tmp = vcombine(v21, v16)\n\t"
783fd49e22SMarco Liebel         "}\n\t"
793fd49e22SMarco Liebel         "vmem(%0 + #0) = v24\n\t"
803fd49e22SMarco Liebel         "vmem(%1 + #0) = v25\n\t"
813fd49e22SMarco Liebel         : : "r"(pout0), "r"(pout1)
823fd49e22SMarco Liebel         : "r0", "v16", "v18", "v21", "v24", "v25", "memory"
833fd49e22SMarco Liebel     );
843fd49e22SMarco Liebel 
853fd49e22SMarco Liebel     for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; ++i) {
863fd49e22SMarco Liebel         expect[0].w[i] = 0x180c0000;
873fd49e22SMarco Liebel         expect[1].w[i] = 0x000c1818;
883fd49e22SMarco Liebel     }
893fd49e22SMarco Liebel 
903fd49e22SMarco Liebel     check_output_w(__LINE__, 2);
913fd49e22SMarco Liebel }
923fd49e22SMarco Liebel 
test_load_cur(void)93afb9539eSTaylor Simpson static void test_load_cur(void)
94afb9539eSTaylor Simpson {
95afb9539eSTaylor Simpson     void *p0 = buffer0;
96afb9539eSTaylor Simpson     void *pout = output;
97afb9539eSTaylor Simpson 
98afb9539eSTaylor Simpson     for (int i = 0; i < BUFSIZE; i++) {
99afb9539eSTaylor Simpson         asm("{\n\t"
100afb9539eSTaylor Simpson             "    v2.cur = vmem(%0 + #0)\n\t"
101afb9539eSTaylor Simpson             "    vmem(%1 + #0) = v2\n\t"
102afb9539eSTaylor Simpson             "}\n\t"
103afb9539eSTaylor Simpson             : : "r"(p0), "r"(pout) : "v2", "memory");
104afb9539eSTaylor Simpson         p0 += sizeof(MMVector);
105afb9539eSTaylor Simpson         pout += sizeof(MMVector);
106afb9539eSTaylor Simpson 
107afb9539eSTaylor Simpson         for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
108afb9539eSTaylor Simpson             expect[i].uw[j] = buffer0[i].uw[j];
109afb9539eSTaylor Simpson         }
110afb9539eSTaylor Simpson     }
111afb9539eSTaylor Simpson 
112afb9539eSTaylor Simpson     check_output_w(__LINE__, BUFSIZE);
113afb9539eSTaylor Simpson }
114afb9539eSTaylor Simpson 
test_load_aligned(void)115afb9539eSTaylor Simpson static void test_load_aligned(void)
116afb9539eSTaylor Simpson {
117afb9539eSTaylor Simpson     /* Aligned loads ignore the low bits of the address */
118afb9539eSTaylor Simpson     void *p0 = buffer0;
119afb9539eSTaylor Simpson     void *pout = output;
120afb9539eSTaylor Simpson     const size_t offset = 13;
121afb9539eSTaylor Simpson 
122afb9539eSTaylor Simpson     p0 += offset;    /* Create an unaligned address */
123afb9539eSTaylor Simpson     asm("v2 = vmem(%0 + #0)\n\t"
124afb9539eSTaylor Simpson         "vmem(%1 + #0) = v2\n\t"
125afb9539eSTaylor Simpson         : : "r"(p0), "r"(pout) : "v2", "memory");
126afb9539eSTaylor Simpson 
127afb9539eSTaylor Simpson     expect[0] = buffer0[0];
128afb9539eSTaylor Simpson 
129afb9539eSTaylor Simpson     check_output_w(__LINE__, 1);
130afb9539eSTaylor Simpson }
131afb9539eSTaylor Simpson 
test_load_unaligned(void)132afb9539eSTaylor Simpson static void test_load_unaligned(void)
133afb9539eSTaylor Simpson {
134afb9539eSTaylor Simpson     void *p0 = buffer0;
135afb9539eSTaylor Simpson     void *pout = output;
136afb9539eSTaylor Simpson     const size_t offset = 12;
137afb9539eSTaylor Simpson 
138afb9539eSTaylor Simpson     p0 += offset;    /* Create an unaligned address */
139afb9539eSTaylor Simpson     asm("v2 = vmemu(%0 + #0)\n\t"
140afb9539eSTaylor Simpson         "vmem(%1 + #0) = v2\n\t"
141afb9539eSTaylor Simpson         : : "r"(p0), "r"(pout) : "v2", "memory");
142afb9539eSTaylor Simpson 
143afb9539eSTaylor Simpson     memcpy(expect, &buffer0[0].ub[offset], sizeof(MMVector));
144afb9539eSTaylor Simpson 
145afb9539eSTaylor Simpson     check_output_w(__LINE__, 1);
146afb9539eSTaylor Simpson }
147afb9539eSTaylor Simpson 
test_store_aligned(void)148afb9539eSTaylor Simpson static void test_store_aligned(void)
149afb9539eSTaylor Simpson {
150afb9539eSTaylor Simpson     /* Aligned stores ignore the low bits of the address */
151afb9539eSTaylor Simpson     void *p0 = buffer0;
152afb9539eSTaylor Simpson     void *pout = output;
153afb9539eSTaylor Simpson     const size_t offset = 13;
154afb9539eSTaylor Simpson 
155afb9539eSTaylor Simpson     pout += offset;    /* Create an unaligned address */
156afb9539eSTaylor Simpson     asm("v2 = vmem(%0 + #0)\n\t"
157afb9539eSTaylor Simpson         "vmem(%1 + #0) = v2\n\t"
158afb9539eSTaylor Simpson         : : "r"(p0), "r"(pout) : "v2", "memory");
159afb9539eSTaylor Simpson 
160afb9539eSTaylor Simpson     expect[0] = buffer0[0];
161afb9539eSTaylor Simpson 
162afb9539eSTaylor Simpson     check_output_w(__LINE__, 1);
163afb9539eSTaylor Simpson }
164afb9539eSTaylor Simpson 
test_store_unaligned(void)165afb9539eSTaylor Simpson static void test_store_unaligned(void)
166afb9539eSTaylor Simpson {
167afb9539eSTaylor Simpson     void *p0 = buffer0;
168afb9539eSTaylor Simpson     void *pout = output;
169afb9539eSTaylor Simpson     const size_t offset = 12;
170afb9539eSTaylor Simpson 
171afb9539eSTaylor Simpson     pout += offset;    /* Create an unaligned address */
172afb9539eSTaylor Simpson     asm("v2 = vmem(%0 + #0)\n\t"
173afb9539eSTaylor Simpson         "vmemu(%1 + #0) = v2\n\t"
174afb9539eSTaylor Simpson         : : "r"(p0), "r"(pout) : "v2", "memory");
175afb9539eSTaylor Simpson 
176afb9539eSTaylor Simpson     memcpy(expect, buffer0, 2 * sizeof(MMVector));
177afb9539eSTaylor Simpson     memcpy(&expect[0].ub[offset], buffer0, sizeof(MMVector));
178afb9539eSTaylor Simpson 
179afb9539eSTaylor Simpson     check_output_w(__LINE__, 2);
180afb9539eSTaylor Simpson }
181afb9539eSTaylor Simpson 
test_masked_store(bool invert)182afb9539eSTaylor Simpson static void test_masked_store(bool invert)
183afb9539eSTaylor Simpson {
184afb9539eSTaylor Simpson     void *p0 = buffer0;
185afb9539eSTaylor Simpson     void *pmask = mask;
186afb9539eSTaylor Simpson     void *pout = output;
187afb9539eSTaylor Simpson 
188afb9539eSTaylor Simpson     memset(expect, 0xff, sizeof(expect));
189afb9539eSTaylor Simpson     memset(output, 0xff, sizeof(expect));
190afb9539eSTaylor Simpson 
191afb9539eSTaylor Simpson     for (int i = 0; i < BUFSIZE; i++) {
192afb9539eSTaylor Simpson         if (invert) {
193afb9539eSTaylor Simpson             asm("r4 = #0\n\t"
194afb9539eSTaylor Simpson                 "v4 = vsplat(r4)\n\t"
195afb9539eSTaylor Simpson                 "v5 = vmem(%0 + #0)\n\t"
196afb9539eSTaylor Simpson                 "q0 = vcmp.eq(v4.w, v5.w)\n\t"
197afb9539eSTaylor Simpson                 "v5 = vmem(%1)\n\t"
198afb9539eSTaylor Simpson                 "if (!q0) vmem(%2) = v5\n\t"             /* Inverted test */
199afb9539eSTaylor Simpson                 : : "r"(pmask), "r"(p0), "r"(pout)
200afb9539eSTaylor Simpson                 : "r4", "v4", "v5", "q0", "memory");
201afb9539eSTaylor Simpson         } else {
202afb9539eSTaylor Simpson             asm("r4 = #0\n\t"
203afb9539eSTaylor Simpson                 "v4 = vsplat(r4)\n\t"
204afb9539eSTaylor Simpson                 "v5 = vmem(%0 + #0)\n\t"
205afb9539eSTaylor Simpson                 "q0 = vcmp.eq(v4.w, v5.w)\n\t"
206afb9539eSTaylor Simpson                 "v5 = vmem(%1)\n\t"
207afb9539eSTaylor Simpson                 "if (q0) vmem(%2) = v5\n\t"             /* Non-inverted test */
208afb9539eSTaylor Simpson                 : : "r"(pmask), "r"(p0), "r"(pout)
209afb9539eSTaylor Simpson                 : "r4", "v4", "v5", "q0", "memory");
210afb9539eSTaylor Simpson         }
211afb9539eSTaylor Simpson         p0 += sizeof(MMVector);
212afb9539eSTaylor Simpson         pmask += sizeof(MMVector);
213afb9539eSTaylor Simpson         pout += sizeof(MMVector);
214afb9539eSTaylor Simpson 
215afb9539eSTaylor Simpson         for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
216afb9539eSTaylor Simpson             if (invert) {
217afb9539eSTaylor Simpson                 if (i + j % MASKMOD != 0) {
218afb9539eSTaylor Simpson                     expect[i].w[j] = buffer0[i].w[j];
219afb9539eSTaylor Simpson                 }
220afb9539eSTaylor Simpson             } else {
221afb9539eSTaylor Simpson                 if (i + j % MASKMOD == 0) {
222afb9539eSTaylor Simpson                     expect[i].w[j] = buffer0[i].w[j];
223afb9539eSTaylor Simpson                 }
224afb9539eSTaylor Simpson             }
225afb9539eSTaylor Simpson         }
226afb9539eSTaylor Simpson     }
227afb9539eSTaylor Simpson 
228afb9539eSTaylor Simpson     check_output_w(__LINE__, BUFSIZE);
229afb9539eSTaylor Simpson }
230afb9539eSTaylor Simpson 
test_new_value_store(void)231afb9539eSTaylor Simpson static void test_new_value_store(void)
232afb9539eSTaylor Simpson {
233afb9539eSTaylor Simpson     void *p0 = buffer0;
234b887b6b7STaylor Simpson     void *p1 = buffer1;
235afb9539eSTaylor Simpson     void *pout = output;
236afb9539eSTaylor Simpson 
237afb9539eSTaylor Simpson     asm("{\n\t"
238afb9539eSTaylor Simpson         "    v2 = vmem(%0 + #0)\n\t"
239afb9539eSTaylor Simpson         "    vmem(%1 + #0) = v2.new\n\t"
240afb9539eSTaylor Simpson         "}\n\t"
241afb9539eSTaylor Simpson         : : "r"(p0), "r"(pout) : "v2", "memory");
242afb9539eSTaylor Simpson 
243afb9539eSTaylor Simpson     expect[0] = buffer0[0];
244afb9539eSTaylor Simpson 
245afb9539eSTaylor Simpson     check_output_w(__LINE__, 1);
246b887b6b7STaylor Simpson 
247b887b6b7STaylor Simpson     /* Test the .new read from the high half of a pair */
248b887b6b7STaylor Simpson     asm("v7 = vmem(%0 + #0)\n\t"
249b887b6b7STaylor Simpson         "v12 = vmem(%1 + #0)\n\t"
250b887b6b7STaylor Simpson         "{\n\t"
251b887b6b7STaylor Simpson         "    v5:4 = vcombine(v12, v7)\n\t"
252b887b6b7STaylor Simpson         "    vmem(%2 + #0) = v5.new\n\t"
253b887b6b7STaylor Simpson         "}\n\t"
254b887b6b7STaylor Simpson         : : "r"(p0), "r"(p1), "r"(pout) : "v4", "v5", "v7", "v12", "memory");
255b887b6b7STaylor Simpson 
256b887b6b7STaylor Simpson     expect[0] = buffer1[0];
257b887b6b7STaylor Simpson 
258b887b6b7STaylor Simpson     check_output_w(__LINE__, 1);
259afb9539eSTaylor Simpson }
260afb9539eSTaylor Simpson 
test_max_temps()261afb9539eSTaylor Simpson static void test_max_temps()
262afb9539eSTaylor Simpson {
263afb9539eSTaylor Simpson     void *p0 = buffer0;
264afb9539eSTaylor Simpson     void *pout = output;
265afb9539eSTaylor Simpson 
266afb9539eSTaylor Simpson     asm("v0 = vmem(%0 + #0)\n\t"
267afb9539eSTaylor Simpson         "v1 = vmem(%0 + #1)\n\t"
268afb9539eSTaylor Simpson         "v2 = vmem(%0 + #2)\n\t"
269afb9539eSTaylor Simpson         "v3 = vmem(%0 + #3)\n\t"
270afb9539eSTaylor Simpson         "v4 = vmem(%0 + #4)\n\t"
271afb9539eSTaylor Simpson         "{\n\t"
272afb9539eSTaylor Simpson         "    v1:0.w = vadd(v3:2.w, v1:0.w)\n\t"
273afb9539eSTaylor Simpson         "    v2.b = vshuffe(v3.b, v2.b)\n\t"
274afb9539eSTaylor Simpson         "    v3.w = vadd(v1.w, v4.w)\n\t"
275afb9539eSTaylor Simpson         "    v4.tmp = vmem(%0 + #5)\n\t"
276afb9539eSTaylor Simpson         "}\n\t"
277afb9539eSTaylor Simpson         "vmem(%1 + #0) = v0\n\t"
278afb9539eSTaylor Simpson         "vmem(%1 + #1) = v1\n\t"
279afb9539eSTaylor Simpson         "vmem(%1 + #2) = v2\n\t"
280afb9539eSTaylor Simpson         "vmem(%1 + #3) = v3\n\t"
281afb9539eSTaylor Simpson         "vmem(%1 + #4) = v4\n\t"
282afb9539eSTaylor Simpson         : : "r"(p0), "r"(pout) : "memory");
283afb9539eSTaylor Simpson 
284afb9539eSTaylor Simpson         /* The first two vectors come from the vadd-pair instruction */
285afb9539eSTaylor Simpson         for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
286afb9539eSTaylor Simpson             expect[0].w[i] = buffer0[0].w[i] + buffer0[2].w[i];
287afb9539eSTaylor Simpson             expect[1].w[i] = buffer0[1].w[i] + buffer0[3].w[i];
288afb9539eSTaylor Simpson         }
289afb9539eSTaylor Simpson         /* The third vector comes from the vshuffe instruction */
290afb9539eSTaylor Simpson         for (int i = 0; i < MAX_VEC_SIZE_BYTES / 2; i++) {
291afb9539eSTaylor Simpson             expect[2].uh[i] = (buffer0[2].uh[i] & 0xff) |
292afb9539eSTaylor Simpson                               (buffer0[3].uh[i] & 0xff) << 8;
293afb9539eSTaylor Simpson         }
294afb9539eSTaylor Simpson         /* The fourth vector comes from the vadd-single instruction */
295afb9539eSTaylor Simpson         for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
296afb9539eSTaylor Simpson             expect[3].w[i] = buffer0[1].w[i] + buffer0[5].w[i];
297afb9539eSTaylor Simpson         }
298afb9539eSTaylor Simpson         /*
299afb9539eSTaylor Simpson          * The fifth vector comes from the load to v4
300afb9539eSTaylor Simpson          * make sure the .tmp is dropped
301afb9539eSTaylor Simpson          */
302afb9539eSTaylor Simpson         expect[4] = buffer0[4];
303afb9539eSTaylor Simpson 
304afb9539eSTaylor Simpson         check_output_b(__LINE__, 5);
305afb9539eSTaylor Simpson }
306afb9539eSTaylor Simpson 
307afb9539eSTaylor Simpson TEST_VEC_OP2(vadd_w, vadd, .w, w, 4, +)
308afb9539eSTaylor Simpson TEST_VEC_OP2(vadd_h, vadd, .h, h, 2, +)
309afb9539eSTaylor Simpson TEST_VEC_OP2(vadd_b, vadd, .b, b, 1, +)
310afb9539eSTaylor Simpson TEST_VEC_OP2(vsub_w, vsub, .w, w, 4, -)
311afb9539eSTaylor Simpson TEST_VEC_OP2(vsub_h, vsub, .h, h, 2, -)
312afb9539eSTaylor Simpson TEST_VEC_OP2(vsub_b, vsub, .b, b, 1, -)
313afb9539eSTaylor Simpson TEST_VEC_OP2(vxor, vxor, , d, 8, ^)
314afb9539eSTaylor Simpson TEST_VEC_OP2(vand, vand, , d, 8, &)
315afb9539eSTaylor Simpson TEST_VEC_OP2(vor, vor, , d, 8, |)
316afb9539eSTaylor Simpson TEST_VEC_OP1(vnot, vnot, , d, 8, ~)
317afb9539eSTaylor Simpson 
318afb9539eSTaylor Simpson TEST_PRED_OP2(pred_or, or, |, "")
319afb9539eSTaylor Simpson TEST_PRED_OP2(pred_or_n, or, |, "!")
320afb9539eSTaylor Simpson TEST_PRED_OP2(pred_and, and, &, "")
321afb9539eSTaylor Simpson TEST_PRED_OP2(pred_and_n, and, &, "!")
322afb9539eSTaylor Simpson TEST_PRED_OP2(pred_xor, xor, ^, "")
323afb9539eSTaylor Simpson 
test_vadduwsat(void)3245b0043c6STaylor Simpson static void test_vadduwsat(void)
3255b0043c6STaylor Simpson {
3265b0043c6STaylor Simpson     /*
3275b0043c6STaylor Simpson      * Test for saturation by adding two numbers that add to more than UINT_MAX
3285b0043c6STaylor Simpson      * and make sure the result saturates to UINT_MAX
3295b0043c6STaylor Simpson      */
3305b0043c6STaylor Simpson     const uint32_t x = 0xffff0000;
3315b0043c6STaylor Simpson     const uint32_t y = 0x000fffff;
3325b0043c6STaylor Simpson 
3335b0043c6STaylor Simpson     memset(expect, 0x12, sizeof(MMVector));
3345b0043c6STaylor Simpson     memset(output, 0x34, sizeof(MMVector));
3355b0043c6STaylor Simpson 
3365b0043c6STaylor Simpson     asm volatile ("v10 = vsplat(%0)\n\t"
3375b0043c6STaylor Simpson                   "v11 = vsplat(%1)\n\t"
3385b0043c6STaylor Simpson                   "v21.uw = vadd(v11.uw, v10.uw):sat\n\t"
3395b0043c6STaylor Simpson                   "vmem(%2+#0) = v21\n\t"
3405b0043c6STaylor Simpson                   : /* no outputs */
3415b0043c6STaylor Simpson                   : "r"(x), "r"(y), "r"(output)
3425b0043c6STaylor Simpson                   : "v10", "v11", "v21", "memory");
3435b0043c6STaylor Simpson 
3445b0043c6STaylor Simpson     for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
3455b0043c6STaylor Simpson         expect[0].uw[j] = UINT_MAX;
3465b0043c6STaylor Simpson     }
3475b0043c6STaylor Simpson 
3485b0043c6STaylor Simpson     check_output_w(__LINE__, 1);
3495b0043c6STaylor Simpson }
3505b0043c6STaylor Simpson 
test_vsubuwsat_dv(void)3515b0043c6STaylor Simpson static void test_vsubuwsat_dv(void)
3525b0043c6STaylor Simpson {
3535b0043c6STaylor Simpson     /*
3545b0043c6STaylor Simpson      * Test for saturation by subtracting two numbers where the result is
3555b0043c6STaylor Simpson      * negative and make sure the result saturates to zero
3565b0043c6STaylor Simpson      *
3575b0043c6STaylor Simpson      * vsubuwsat_dv operates on an HVX register pair, so we'll have a
3585b0043c6STaylor Simpson      * pair of subtractions
3595b0043c6STaylor Simpson      *     w - x < 0
3605b0043c6STaylor Simpson      *     y - z < 0
3615b0043c6STaylor Simpson      */
3625b0043c6STaylor Simpson     const uint32_t w = 0x000000b7;
3635b0043c6STaylor Simpson     const uint32_t x = 0xffffff4e;
3645b0043c6STaylor Simpson     const uint32_t y = 0x31fe88e7;
3655b0043c6STaylor Simpson     const uint32_t z = 0x7fffff79;
3665b0043c6STaylor Simpson 
3675b0043c6STaylor Simpson     memset(expect, 0x12, sizeof(MMVector) * 2);
3685b0043c6STaylor Simpson     memset(output, 0x34, sizeof(MMVector) * 2);
3695b0043c6STaylor Simpson 
3705b0043c6STaylor Simpson     asm volatile ("v16 = vsplat(%0)\n\t"
3715b0043c6STaylor Simpson                   "v17 = vsplat(%1)\n\t"
3725b0043c6STaylor Simpson                   "v26 = vsplat(%2)\n\t"
3735b0043c6STaylor Simpson                   "v27 = vsplat(%3)\n\t"
3745b0043c6STaylor Simpson                   "v25:24.uw = vsub(v17:16.uw, v27:26.uw):sat\n\t"
3755b0043c6STaylor Simpson                   "vmem(%4+#0) = v24\n\t"
3765b0043c6STaylor Simpson                   "vmem(%4+#1) = v25\n\t"
3775b0043c6STaylor Simpson                   : /* no outputs */
3785b0043c6STaylor Simpson                   : "r"(w), "r"(y), "r"(x), "r"(z), "r"(output)
3795b0043c6STaylor Simpson                   : "v16", "v17", "v24", "v25", "v26", "v27", "memory");
3805b0043c6STaylor Simpson 
3815b0043c6STaylor Simpson     for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
3825b0043c6STaylor Simpson         expect[0].uw[j] = 0x00000000;
3835b0043c6STaylor Simpson         expect[1].uw[j] = 0x00000000;
3845b0043c6STaylor Simpson     }
3855b0043c6STaylor Simpson 
3865b0043c6STaylor Simpson     check_output_w(__LINE__, 2);
3875b0043c6STaylor Simpson }
3885b0043c6STaylor Simpson 
test_load_tmp_predicated(void)38983853ea0STaylor Simpson static void test_load_tmp_predicated(void)
39083853ea0STaylor Simpson {
39183853ea0STaylor Simpson     void *p0 = buffer0;
39283853ea0STaylor Simpson     void *p1 = buffer1;
39383853ea0STaylor Simpson     void *pout = output;
39483853ea0STaylor Simpson     bool pred = true;
39583853ea0STaylor Simpson 
39683853ea0STaylor Simpson     for (int i = 0; i < BUFSIZE; i++) {
39783853ea0STaylor Simpson         /*
39883853ea0STaylor Simpson          * Load into v12 as .tmp with a predicate
39983853ea0STaylor Simpson          * When the predicate is true, we get the vector from buffer1[i]
40083853ea0STaylor Simpson          * When the predicate is false, we get a vector of all 1's
40183853ea0STaylor Simpson          * Regardless of the predicate, the next packet should have
40283853ea0STaylor Simpson          * a vector of all 1's
40383853ea0STaylor Simpson          */
40483853ea0STaylor Simpson         asm("v3 = vmem(%0 + #0)\n\t"
40583853ea0STaylor Simpson             "r1 = #1\n\t"
40683853ea0STaylor Simpson             "v12 = vsplat(r1)\n\t"
40783853ea0STaylor Simpson             "p1 = !cmp.eq(%3, #0)\n\t"
40883853ea0STaylor Simpson             "{\n\t"
40983853ea0STaylor Simpson             "    if (p1) v12.tmp = vmem(%1 + #0)\n\t"
41083853ea0STaylor Simpson             "    v4.w = vadd(v12.w, v3.w)\n\t"
41183853ea0STaylor Simpson             "}\n\t"
41283853ea0STaylor Simpson             "v4.w = vadd(v4.w, v12.w)\n\t"
41383853ea0STaylor Simpson             "vmem(%2 + #0) = v4\n\t"
41483853ea0STaylor Simpson             : : "r"(p0), "r"(p1), "r"(pout), "r"(pred)
41583853ea0STaylor Simpson             : "r1", "p1", "v12", "v3", "v4", "v6", "memory");
41683853ea0STaylor Simpson         p0 += sizeof(MMVector);
41783853ea0STaylor Simpson         p1 += sizeof(MMVector);
41883853ea0STaylor Simpson         pout += sizeof(MMVector);
41983853ea0STaylor Simpson 
42083853ea0STaylor Simpson         for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
42183853ea0STaylor Simpson             expect[i].w[j] =
42283853ea0STaylor Simpson                 pred ? buffer0[i].w[j] + buffer1[i].w[j] + 1
42383853ea0STaylor Simpson                      : buffer0[i].w[j] + 2;
42483853ea0STaylor Simpson         }
42583853ea0STaylor Simpson         pred = !pred;
42683853ea0STaylor Simpson     }
42783853ea0STaylor Simpson 
42883853ea0STaylor Simpson     check_output_w(__LINE__, BUFSIZE);
42983853ea0STaylor Simpson }
43083853ea0STaylor Simpson 
test_load_cur_predicated(void)43183853ea0STaylor Simpson static void test_load_cur_predicated(void)
43283853ea0STaylor Simpson {
43383853ea0STaylor Simpson     bool pred = true;
43483853ea0STaylor Simpson     for (int i = 0; i < BUFSIZE; i++) {
43583853ea0STaylor Simpson         asm volatile("p0 = !cmp.eq(%3, #0)\n\t"
43683853ea0STaylor Simpson                      "v3 = vmem(%0+#0)\n\t"
43783853ea0STaylor Simpson                      /*
43883853ea0STaylor Simpson                       * Preload v4 to make sure that the assignment from the
43983853ea0STaylor Simpson                       * packet below is not being ignored when pred is false.
44083853ea0STaylor Simpson                       */
44183853ea0STaylor Simpson                      "r0 = #0x01237654\n\t"
44283853ea0STaylor Simpson                      "v4 = vsplat(r0)\n\t"
44383853ea0STaylor Simpson                      "{\n\t"
44483853ea0STaylor Simpson                      "    if (p0) v3.cur = vmem(%1+#0)\n\t"
44583853ea0STaylor Simpson                      "    v4 = v3\n\t"
44683853ea0STaylor Simpson                      "}\n\t"
44783853ea0STaylor Simpson                      "vmem(%2+#0) = v4\n\t"
44883853ea0STaylor Simpson                      :
44983853ea0STaylor Simpson                      : "r"(&buffer0[i]), "r"(&buffer1[i]),
45083853ea0STaylor Simpson                        "r"(&output[i]), "r"(pred)
45183853ea0STaylor Simpson                      : "r0", "p0", "v3", "v4", "memory");
45283853ea0STaylor Simpson         expect[i] = pred ? buffer1[i] : buffer0[i];
45383853ea0STaylor Simpson         pred = !pred;
45483853ea0STaylor Simpson     }
45583853ea0STaylor Simpson     check_output_w(__LINE__, BUFSIZE);
45683853ea0STaylor Simpson }
45783853ea0STaylor Simpson 
test_vcombine(void)458d05d5eebSTaylor Simpson static void test_vcombine(void)
459d05d5eebSTaylor Simpson {
460d05d5eebSTaylor Simpson     for (int i = 0; i < BUFSIZE / 2; i++) {
461d05d5eebSTaylor Simpson         asm volatile("v2 = vsplat(%0)\n\t"
462d05d5eebSTaylor Simpson                      "v3 = vsplat(%1)\n\t"
463d05d5eebSTaylor Simpson                      "v3:2 = vcombine(v2, v3)\n\t"
464d05d5eebSTaylor Simpson                      "vmem(%2+#0) = v2\n\t"
465d05d5eebSTaylor Simpson                      "vmem(%2+#1) = v3\n\t"
466d05d5eebSTaylor Simpson                      :
467d05d5eebSTaylor Simpson                      : "r"(2 * i), "r"(2 * i + 1), "r"(&output[2 * i])
468d05d5eebSTaylor Simpson                      : "v2", "v3", "memory");
469d05d5eebSTaylor Simpson         for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
470d05d5eebSTaylor Simpson             expect[2 * i].w[j] = 2 * i + 1;
471d05d5eebSTaylor Simpson             expect[2 * i + 1].w[j] = 2 * i;
472d05d5eebSTaylor Simpson         }
473d05d5eebSTaylor Simpson     }
474d05d5eebSTaylor Simpson     check_output_w(__LINE__, BUFSIZE);
475d05d5eebSTaylor Simpson }
476d05d5eebSTaylor Simpson 
main()477afb9539eSTaylor Simpson int main()
478afb9539eSTaylor Simpson {
479afb9539eSTaylor Simpson     init_buffers();
480afb9539eSTaylor Simpson 
481afb9539eSTaylor Simpson     test_load_tmp();
4823fd49e22SMarco Liebel     test_load_tmp2();
483afb9539eSTaylor Simpson     test_load_cur();
484afb9539eSTaylor Simpson     test_load_aligned();
485afb9539eSTaylor Simpson     test_load_unaligned();
486afb9539eSTaylor Simpson     test_store_aligned();
487afb9539eSTaylor Simpson     test_store_unaligned();
488afb9539eSTaylor Simpson     test_masked_store(false);
489afb9539eSTaylor Simpson     test_masked_store(true);
490afb9539eSTaylor Simpson     test_new_value_store();
491afb9539eSTaylor Simpson     test_max_temps();
492afb9539eSTaylor Simpson 
493afb9539eSTaylor Simpson     test_vadd_w();
494afb9539eSTaylor Simpson     test_vadd_h();
495afb9539eSTaylor Simpson     test_vadd_b();
496afb9539eSTaylor Simpson     test_vsub_w();
497afb9539eSTaylor Simpson     test_vsub_h();
498afb9539eSTaylor Simpson     test_vsub_b();
499afb9539eSTaylor Simpson     test_vxor();
500afb9539eSTaylor Simpson     test_vand();
501afb9539eSTaylor Simpson     test_vor();
502afb9539eSTaylor Simpson     test_vnot();
503afb9539eSTaylor Simpson 
504afb9539eSTaylor Simpson     test_pred_or(false);
505afb9539eSTaylor Simpson     test_pred_or_n(true);
506afb9539eSTaylor Simpson     test_pred_and(false);
507afb9539eSTaylor Simpson     test_pred_and_n(true);
508afb9539eSTaylor Simpson     test_pred_xor(false);
509afb9539eSTaylor Simpson 
5105b0043c6STaylor Simpson     test_vadduwsat();
5115b0043c6STaylor Simpson     test_vsubuwsat_dv();
5125b0043c6STaylor Simpson 
51383853ea0STaylor Simpson     test_load_tmp_predicated();
51483853ea0STaylor Simpson     test_load_cur_predicated();
51583853ea0STaylor Simpson 
516d05d5eebSTaylor Simpson     test_vcombine();
517d05d5eebSTaylor Simpson 
518afb9539eSTaylor Simpson     puts(err ? "FAIL" : "PASS");
519afb9539eSTaylor Simpson     return err ? 1 : 0;
520afb9539eSTaylor Simpson }
521