/*
 *  Copyright(c) 2021-2024 Qualcomm Innovation Center, Inc. All Rights Reserved.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
17afb9539eSTaylor Simpson
18afb9539eSTaylor Simpson #include <stdio.h>
19afb9539eSTaylor Simpson #include <stdint.h>
20afb9539eSTaylor Simpson #include <stdbool.h>
21afb9539eSTaylor Simpson #include <string.h>
225b0043c6STaylor Simpson #include <limits.h>
23afb9539eSTaylor Simpson
24afb9539eSTaylor Simpson int err;
25afb9539eSTaylor Simpson
26761e1c67STaylor Simpson #include "hvx_misc.h"
27afb9539eSTaylor Simpson
test_load_tmp(void)28afb9539eSTaylor Simpson static void test_load_tmp(void)
29afb9539eSTaylor Simpson {
30afb9539eSTaylor Simpson void *p0 = buffer0;
31afb9539eSTaylor Simpson void *p1 = buffer1;
32afb9539eSTaylor Simpson void *pout = output;
33afb9539eSTaylor Simpson
34afb9539eSTaylor Simpson for (int i = 0; i < BUFSIZE; i++) {
35afb9539eSTaylor Simpson /*
36afb9539eSTaylor Simpson * Load into v12 as .tmp, then use it in the next packet
37afb9539eSTaylor Simpson * Should get the new value within the same packet and
38afb9539eSTaylor Simpson * the old value in the next packet
39afb9539eSTaylor Simpson */
40afb9539eSTaylor Simpson asm("v3 = vmem(%0 + #0)\n\t"
41afb9539eSTaylor Simpson "r1 = #1\n\t"
42afb9539eSTaylor Simpson "v12 = vsplat(r1)\n\t"
43afb9539eSTaylor Simpson "{\n\t"
44afb9539eSTaylor Simpson " v12.tmp = vmem(%1 + #0)\n\t"
45afb9539eSTaylor Simpson " v4.w = vadd(v12.w, v3.w)\n\t"
46afb9539eSTaylor Simpson "}\n\t"
47afb9539eSTaylor Simpson "v4.w = vadd(v4.w, v12.w)\n\t"
48afb9539eSTaylor Simpson "vmem(%2 + #0) = v4\n\t"
49afb9539eSTaylor Simpson : : "r"(p0), "r"(p1), "r"(pout)
50afb9539eSTaylor Simpson : "r1", "v12", "v3", "v4", "v6", "memory");
51afb9539eSTaylor Simpson p0 += sizeof(MMVector);
52afb9539eSTaylor Simpson p1 += sizeof(MMVector);
53afb9539eSTaylor Simpson pout += sizeof(MMVector);
54afb9539eSTaylor Simpson
55afb9539eSTaylor Simpson for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
56afb9539eSTaylor Simpson expect[i].w[j] = buffer0[i].w[j] + buffer1[i].w[j] + 1;
57afb9539eSTaylor Simpson }
58afb9539eSTaylor Simpson }
59afb9539eSTaylor Simpson
60afb9539eSTaylor Simpson check_output_w(__LINE__, BUFSIZE);
61afb9539eSTaylor Simpson }
62afb9539eSTaylor Simpson
test_load_tmp2(void)633fd49e22SMarco Liebel static void test_load_tmp2(void)
643fd49e22SMarco Liebel {
653fd49e22SMarco Liebel void *pout0 = &output[0];
663fd49e22SMarco Liebel void *pout1 = &output[1];
673fd49e22SMarco Liebel
683fd49e22SMarco Liebel asm volatile(
693fd49e22SMarco Liebel "r0 = #0x03030303\n\t"
703fd49e22SMarco Liebel "v16 = vsplat(r0)\n\t"
713fd49e22SMarco Liebel "r0 = #0x04040404\n\t"
723fd49e22SMarco Liebel "v18 = vsplat(r0)\n\t"
733fd49e22SMarco Liebel "r0 = #0x05050505\n\t"
743fd49e22SMarco Liebel "v21 = vsplat(r0)\n\t"
753fd49e22SMarco Liebel "{\n\t"
763fd49e22SMarco Liebel " v25:24 += vmpyo(v18.w, v14.h)\n\t"
773fd49e22SMarco Liebel " v15:14.tmp = vcombine(v21, v16)\n\t"
783fd49e22SMarco Liebel "}\n\t"
793fd49e22SMarco Liebel "vmem(%0 + #0) = v24\n\t"
803fd49e22SMarco Liebel "vmem(%1 + #0) = v25\n\t"
813fd49e22SMarco Liebel : : "r"(pout0), "r"(pout1)
823fd49e22SMarco Liebel : "r0", "v16", "v18", "v21", "v24", "v25", "memory"
833fd49e22SMarco Liebel );
843fd49e22SMarco Liebel
853fd49e22SMarco Liebel for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; ++i) {
863fd49e22SMarco Liebel expect[0].w[i] = 0x180c0000;
873fd49e22SMarco Liebel expect[1].w[i] = 0x000c1818;
883fd49e22SMarco Liebel }
893fd49e22SMarco Liebel
903fd49e22SMarco Liebel check_output_w(__LINE__, 2);
913fd49e22SMarco Liebel }
923fd49e22SMarco Liebel
test_load_cur(void)93afb9539eSTaylor Simpson static void test_load_cur(void)
94afb9539eSTaylor Simpson {
95afb9539eSTaylor Simpson void *p0 = buffer0;
96afb9539eSTaylor Simpson void *pout = output;
97afb9539eSTaylor Simpson
98afb9539eSTaylor Simpson for (int i = 0; i < BUFSIZE; i++) {
99afb9539eSTaylor Simpson asm("{\n\t"
100afb9539eSTaylor Simpson " v2.cur = vmem(%0 + #0)\n\t"
101afb9539eSTaylor Simpson " vmem(%1 + #0) = v2\n\t"
102afb9539eSTaylor Simpson "}\n\t"
103afb9539eSTaylor Simpson : : "r"(p0), "r"(pout) : "v2", "memory");
104afb9539eSTaylor Simpson p0 += sizeof(MMVector);
105afb9539eSTaylor Simpson pout += sizeof(MMVector);
106afb9539eSTaylor Simpson
107afb9539eSTaylor Simpson for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
108afb9539eSTaylor Simpson expect[i].uw[j] = buffer0[i].uw[j];
109afb9539eSTaylor Simpson }
110afb9539eSTaylor Simpson }
111afb9539eSTaylor Simpson
112afb9539eSTaylor Simpson check_output_w(__LINE__, BUFSIZE);
113afb9539eSTaylor Simpson }
114afb9539eSTaylor Simpson
test_load_aligned(void)115afb9539eSTaylor Simpson static void test_load_aligned(void)
116afb9539eSTaylor Simpson {
117afb9539eSTaylor Simpson /* Aligned loads ignore the low bits of the address */
118afb9539eSTaylor Simpson void *p0 = buffer0;
119afb9539eSTaylor Simpson void *pout = output;
120afb9539eSTaylor Simpson const size_t offset = 13;
121afb9539eSTaylor Simpson
122afb9539eSTaylor Simpson p0 += offset; /* Create an unaligned address */
123afb9539eSTaylor Simpson asm("v2 = vmem(%0 + #0)\n\t"
124afb9539eSTaylor Simpson "vmem(%1 + #0) = v2\n\t"
125afb9539eSTaylor Simpson : : "r"(p0), "r"(pout) : "v2", "memory");
126afb9539eSTaylor Simpson
127afb9539eSTaylor Simpson expect[0] = buffer0[0];
128afb9539eSTaylor Simpson
129afb9539eSTaylor Simpson check_output_w(__LINE__, 1);
130afb9539eSTaylor Simpson }
131afb9539eSTaylor Simpson
test_load_unaligned(void)132afb9539eSTaylor Simpson static void test_load_unaligned(void)
133afb9539eSTaylor Simpson {
134afb9539eSTaylor Simpson void *p0 = buffer0;
135afb9539eSTaylor Simpson void *pout = output;
136afb9539eSTaylor Simpson const size_t offset = 12;
137afb9539eSTaylor Simpson
138afb9539eSTaylor Simpson p0 += offset; /* Create an unaligned address */
139afb9539eSTaylor Simpson asm("v2 = vmemu(%0 + #0)\n\t"
140afb9539eSTaylor Simpson "vmem(%1 + #0) = v2\n\t"
141afb9539eSTaylor Simpson : : "r"(p0), "r"(pout) : "v2", "memory");
142afb9539eSTaylor Simpson
143afb9539eSTaylor Simpson memcpy(expect, &buffer0[0].ub[offset], sizeof(MMVector));
144afb9539eSTaylor Simpson
145afb9539eSTaylor Simpson check_output_w(__LINE__, 1);
146afb9539eSTaylor Simpson }
147afb9539eSTaylor Simpson
test_store_aligned(void)148afb9539eSTaylor Simpson static void test_store_aligned(void)
149afb9539eSTaylor Simpson {
150afb9539eSTaylor Simpson /* Aligned stores ignore the low bits of the address */
151afb9539eSTaylor Simpson void *p0 = buffer0;
152afb9539eSTaylor Simpson void *pout = output;
153afb9539eSTaylor Simpson const size_t offset = 13;
154afb9539eSTaylor Simpson
155afb9539eSTaylor Simpson pout += offset; /* Create an unaligned address */
156afb9539eSTaylor Simpson asm("v2 = vmem(%0 + #0)\n\t"
157afb9539eSTaylor Simpson "vmem(%1 + #0) = v2\n\t"
158afb9539eSTaylor Simpson : : "r"(p0), "r"(pout) : "v2", "memory");
159afb9539eSTaylor Simpson
160afb9539eSTaylor Simpson expect[0] = buffer0[0];
161afb9539eSTaylor Simpson
162afb9539eSTaylor Simpson check_output_w(__LINE__, 1);
163afb9539eSTaylor Simpson }
164afb9539eSTaylor Simpson
test_store_unaligned(void)165afb9539eSTaylor Simpson static void test_store_unaligned(void)
166afb9539eSTaylor Simpson {
167afb9539eSTaylor Simpson void *p0 = buffer0;
168afb9539eSTaylor Simpson void *pout = output;
169afb9539eSTaylor Simpson const size_t offset = 12;
170afb9539eSTaylor Simpson
171afb9539eSTaylor Simpson pout += offset; /* Create an unaligned address */
172afb9539eSTaylor Simpson asm("v2 = vmem(%0 + #0)\n\t"
173afb9539eSTaylor Simpson "vmemu(%1 + #0) = v2\n\t"
174afb9539eSTaylor Simpson : : "r"(p0), "r"(pout) : "v2", "memory");
175afb9539eSTaylor Simpson
176afb9539eSTaylor Simpson memcpy(expect, buffer0, 2 * sizeof(MMVector));
177afb9539eSTaylor Simpson memcpy(&expect[0].ub[offset], buffer0, sizeof(MMVector));
178afb9539eSTaylor Simpson
179afb9539eSTaylor Simpson check_output_w(__LINE__, 2);
180afb9539eSTaylor Simpson }
181afb9539eSTaylor Simpson
test_masked_store(bool invert)182afb9539eSTaylor Simpson static void test_masked_store(bool invert)
183afb9539eSTaylor Simpson {
184afb9539eSTaylor Simpson void *p0 = buffer0;
185afb9539eSTaylor Simpson void *pmask = mask;
186afb9539eSTaylor Simpson void *pout = output;
187afb9539eSTaylor Simpson
188afb9539eSTaylor Simpson memset(expect, 0xff, sizeof(expect));
189afb9539eSTaylor Simpson memset(output, 0xff, sizeof(expect));
190afb9539eSTaylor Simpson
191afb9539eSTaylor Simpson for (int i = 0; i < BUFSIZE; i++) {
192afb9539eSTaylor Simpson if (invert) {
193afb9539eSTaylor Simpson asm("r4 = #0\n\t"
194afb9539eSTaylor Simpson "v4 = vsplat(r4)\n\t"
195afb9539eSTaylor Simpson "v5 = vmem(%0 + #0)\n\t"
196afb9539eSTaylor Simpson "q0 = vcmp.eq(v4.w, v5.w)\n\t"
197afb9539eSTaylor Simpson "v5 = vmem(%1)\n\t"
198afb9539eSTaylor Simpson "if (!q0) vmem(%2) = v5\n\t" /* Inverted test */
199afb9539eSTaylor Simpson : : "r"(pmask), "r"(p0), "r"(pout)
200afb9539eSTaylor Simpson : "r4", "v4", "v5", "q0", "memory");
201afb9539eSTaylor Simpson } else {
202afb9539eSTaylor Simpson asm("r4 = #0\n\t"
203afb9539eSTaylor Simpson "v4 = vsplat(r4)\n\t"
204afb9539eSTaylor Simpson "v5 = vmem(%0 + #0)\n\t"
205afb9539eSTaylor Simpson "q0 = vcmp.eq(v4.w, v5.w)\n\t"
206afb9539eSTaylor Simpson "v5 = vmem(%1)\n\t"
207afb9539eSTaylor Simpson "if (q0) vmem(%2) = v5\n\t" /* Non-inverted test */
208afb9539eSTaylor Simpson : : "r"(pmask), "r"(p0), "r"(pout)
209afb9539eSTaylor Simpson : "r4", "v4", "v5", "q0", "memory");
210afb9539eSTaylor Simpson }
211afb9539eSTaylor Simpson p0 += sizeof(MMVector);
212afb9539eSTaylor Simpson pmask += sizeof(MMVector);
213afb9539eSTaylor Simpson pout += sizeof(MMVector);
214afb9539eSTaylor Simpson
215afb9539eSTaylor Simpson for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
216afb9539eSTaylor Simpson if (invert) {
217afb9539eSTaylor Simpson if (i + j % MASKMOD != 0) {
218afb9539eSTaylor Simpson expect[i].w[j] = buffer0[i].w[j];
219afb9539eSTaylor Simpson }
220afb9539eSTaylor Simpson } else {
221afb9539eSTaylor Simpson if (i + j % MASKMOD == 0) {
222afb9539eSTaylor Simpson expect[i].w[j] = buffer0[i].w[j];
223afb9539eSTaylor Simpson }
224afb9539eSTaylor Simpson }
225afb9539eSTaylor Simpson }
226afb9539eSTaylor Simpson }
227afb9539eSTaylor Simpson
228afb9539eSTaylor Simpson check_output_w(__LINE__, BUFSIZE);
229afb9539eSTaylor Simpson }
230afb9539eSTaylor Simpson
test_new_value_store(void)231afb9539eSTaylor Simpson static void test_new_value_store(void)
232afb9539eSTaylor Simpson {
233afb9539eSTaylor Simpson void *p0 = buffer0;
234b887b6b7STaylor Simpson void *p1 = buffer1;
235afb9539eSTaylor Simpson void *pout = output;
236afb9539eSTaylor Simpson
237afb9539eSTaylor Simpson asm("{\n\t"
238afb9539eSTaylor Simpson " v2 = vmem(%0 + #0)\n\t"
239afb9539eSTaylor Simpson " vmem(%1 + #0) = v2.new\n\t"
240afb9539eSTaylor Simpson "}\n\t"
241afb9539eSTaylor Simpson : : "r"(p0), "r"(pout) : "v2", "memory");
242afb9539eSTaylor Simpson
243afb9539eSTaylor Simpson expect[0] = buffer0[0];
244afb9539eSTaylor Simpson
245afb9539eSTaylor Simpson check_output_w(__LINE__, 1);
246b887b6b7STaylor Simpson
247b887b6b7STaylor Simpson /* Test the .new read from the high half of a pair */
248b887b6b7STaylor Simpson asm("v7 = vmem(%0 + #0)\n\t"
249b887b6b7STaylor Simpson "v12 = vmem(%1 + #0)\n\t"
250b887b6b7STaylor Simpson "{\n\t"
251b887b6b7STaylor Simpson " v5:4 = vcombine(v12, v7)\n\t"
252b887b6b7STaylor Simpson " vmem(%2 + #0) = v5.new\n\t"
253b887b6b7STaylor Simpson "}\n\t"
254b887b6b7STaylor Simpson : : "r"(p0), "r"(p1), "r"(pout) : "v4", "v5", "v7", "v12", "memory");
255b887b6b7STaylor Simpson
256b887b6b7STaylor Simpson expect[0] = buffer1[0];
257b887b6b7STaylor Simpson
258b887b6b7STaylor Simpson check_output_w(__LINE__, 1);
259afb9539eSTaylor Simpson }
260afb9539eSTaylor Simpson
test_max_temps()261afb9539eSTaylor Simpson static void test_max_temps()
262afb9539eSTaylor Simpson {
263afb9539eSTaylor Simpson void *p0 = buffer0;
264afb9539eSTaylor Simpson void *pout = output;
265afb9539eSTaylor Simpson
266afb9539eSTaylor Simpson asm("v0 = vmem(%0 + #0)\n\t"
267afb9539eSTaylor Simpson "v1 = vmem(%0 + #1)\n\t"
268afb9539eSTaylor Simpson "v2 = vmem(%0 + #2)\n\t"
269afb9539eSTaylor Simpson "v3 = vmem(%0 + #3)\n\t"
270afb9539eSTaylor Simpson "v4 = vmem(%0 + #4)\n\t"
271afb9539eSTaylor Simpson "{\n\t"
272afb9539eSTaylor Simpson " v1:0.w = vadd(v3:2.w, v1:0.w)\n\t"
273afb9539eSTaylor Simpson " v2.b = vshuffe(v3.b, v2.b)\n\t"
274afb9539eSTaylor Simpson " v3.w = vadd(v1.w, v4.w)\n\t"
275afb9539eSTaylor Simpson " v4.tmp = vmem(%0 + #5)\n\t"
276afb9539eSTaylor Simpson "}\n\t"
277afb9539eSTaylor Simpson "vmem(%1 + #0) = v0\n\t"
278afb9539eSTaylor Simpson "vmem(%1 + #1) = v1\n\t"
279afb9539eSTaylor Simpson "vmem(%1 + #2) = v2\n\t"
280afb9539eSTaylor Simpson "vmem(%1 + #3) = v3\n\t"
281afb9539eSTaylor Simpson "vmem(%1 + #4) = v4\n\t"
282afb9539eSTaylor Simpson : : "r"(p0), "r"(pout) : "memory");
283afb9539eSTaylor Simpson
284afb9539eSTaylor Simpson /* The first two vectors come from the vadd-pair instruction */
285afb9539eSTaylor Simpson for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
286afb9539eSTaylor Simpson expect[0].w[i] = buffer0[0].w[i] + buffer0[2].w[i];
287afb9539eSTaylor Simpson expect[1].w[i] = buffer0[1].w[i] + buffer0[3].w[i];
288afb9539eSTaylor Simpson }
289afb9539eSTaylor Simpson /* The third vector comes from the vshuffe instruction */
290afb9539eSTaylor Simpson for (int i = 0; i < MAX_VEC_SIZE_BYTES / 2; i++) {
291afb9539eSTaylor Simpson expect[2].uh[i] = (buffer0[2].uh[i] & 0xff) |
292afb9539eSTaylor Simpson (buffer0[3].uh[i] & 0xff) << 8;
293afb9539eSTaylor Simpson }
294afb9539eSTaylor Simpson /* The fourth vector comes from the vadd-single instruction */
295afb9539eSTaylor Simpson for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
296afb9539eSTaylor Simpson expect[3].w[i] = buffer0[1].w[i] + buffer0[5].w[i];
297afb9539eSTaylor Simpson }
298afb9539eSTaylor Simpson /*
299afb9539eSTaylor Simpson * The fifth vector comes from the load to v4
300afb9539eSTaylor Simpson * make sure the .tmp is dropped
301afb9539eSTaylor Simpson */
302afb9539eSTaylor Simpson expect[4] = buffer0[4];
303afb9539eSTaylor Simpson
304afb9539eSTaylor Simpson check_output_b(__LINE__, 5);
305afb9539eSTaylor Simpson }
306afb9539eSTaylor Simpson
307afb9539eSTaylor Simpson TEST_VEC_OP2(vadd_w, vadd, .w, w, 4, +)
308afb9539eSTaylor Simpson TEST_VEC_OP2(vadd_h, vadd, .h, h, 2, +)
309afb9539eSTaylor Simpson TEST_VEC_OP2(vadd_b, vadd, .b, b, 1, +)
310afb9539eSTaylor Simpson TEST_VEC_OP2(vsub_w, vsub, .w, w, 4, -)
311afb9539eSTaylor Simpson TEST_VEC_OP2(vsub_h, vsub, .h, h, 2, -)
312afb9539eSTaylor Simpson TEST_VEC_OP2(vsub_b, vsub, .b, b, 1, -)
313afb9539eSTaylor Simpson TEST_VEC_OP2(vxor, vxor, , d, 8, ^)
314afb9539eSTaylor Simpson TEST_VEC_OP2(vand, vand, , d, 8, &)
315afb9539eSTaylor Simpson TEST_VEC_OP2(vor, vor, , d, 8, |)
316afb9539eSTaylor Simpson TEST_VEC_OP1(vnot, vnot, , d, 8, ~)
317afb9539eSTaylor Simpson
318afb9539eSTaylor Simpson TEST_PRED_OP2(pred_or, or, |, "")
319afb9539eSTaylor Simpson TEST_PRED_OP2(pred_or_n, or, |, "!")
320afb9539eSTaylor Simpson TEST_PRED_OP2(pred_and, and, &, "")
321afb9539eSTaylor Simpson TEST_PRED_OP2(pred_and_n, and, &, "!")
322afb9539eSTaylor Simpson TEST_PRED_OP2(pred_xor, xor, ^, "")
323afb9539eSTaylor Simpson
test_vadduwsat(void)3245b0043c6STaylor Simpson static void test_vadduwsat(void)
3255b0043c6STaylor Simpson {
3265b0043c6STaylor Simpson /*
3275b0043c6STaylor Simpson * Test for saturation by adding two numbers that add to more than UINT_MAX
3285b0043c6STaylor Simpson * and make sure the result saturates to UINT_MAX
3295b0043c6STaylor Simpson */
3305b0043c6STaylor Simpson const uint32_t x = 0xffff0000;
3315b0043c6STaylor Simpson const uint32_t y = 0x000fffff;
3325b0043c6STaylor Simpson
3335b0043c6STaylor Simpson memset(expect, 0x12, sizeof(MMVector));
3345b0043c6STaylor Simpson memset(output, 0x34, sizeof(MMVector));
3355b0043c6STaylor Simpson
3365b0043c6STaylor Simpson asm volatile ("v10 = vsplat(%0)\n\t"
3375b0043c6STaylor Simpson "v11 = vsplat(%1)\n\t"
3385b0043c6STaylor Simpson "v21.uw = vadd(v11.uw, v10.uw):sat\n\t"
3395b0043c6STaylor Simpson "vmem(%2+#0) = v21\n\t"
3405b0043c6STaylor Simpson : /* no outputs */
3415b0043c6STaylor Simpson : "r"(x), "r"(y), "r"(output)
3425b0043c6STaylor Simpson : "v10", "v11", "v21", "memory");
3435b0043c6STaylor Simpson
3445b0043c6STaylor Simpson for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
3455b0043c6STaylor Simpson expect[0].uw[j] = UINT_MAX;
3465b0043c6STaylor Simpson }
3475b0043c6STaylor Simpson
3485b0043c6STaylor Simpson check_output_w(__LINE__, 1);
3495b0043c6STaylor Simpson }
3505b0043c6STaylor Simpson
test_vsubuwsat_dv(void)3515b0043c6STaylor Simpson static void test_vsubuwsat_dv(void)
3525b0043c6STaylor Simpson {
3535b0043c6STaylor Simpson /*
3545b0043c6STaylor Simpson * Test for saturation by subtracting two numbers where the result is
3555b0043c6STaylor Simpson * negative and make sure the result saturates to zero
3565b0043c6STaylor Simpson *
3575b0043c6STaylor Simpson * vsubuwsat_dv operates on an HVX register pair, so we'll have a
3585b0043c6STaylor Simpson * pair of subtractions
3595b0043c6STaylor Simpson * w - x < 0
3605b0043c6STaylor Simpson * y - z < 0
3615b0043c6STaylor Simpson */
3625b0043c6STaylor Simpson const uint32_t w = 0x000000b7;
3635b0043c6STaylor Simpson const uint32_t x = 0xffffff4e;
3645b0043c6STaylor Simpson const uint32_t y = 0x31fe88e7;
3655b0043c6STaylor Simpson const uint32_t z = 0x7fffff79;
3665b0043c6STaylor Simpson
3675b0043c6STaylor Simpson memset(expect, 0x12, sizeof(MMVector) * 2);
3685b0043c6STaylor Simpson memset(output, 0x34, sizeof(MMVector) * 2);
3695b0043c6STaylor Simpson
3705b0043c6STaylor Simpson asm volatile ("v16 = vsplat(%0)\n\t"
3715b0043c6STaylor Simpson "v17 = vsplat(%1)\n\t"
3725b0043c6STaylor Simpson "v26 = vsplat(%2)\n\t"
3735b0043c6STaylor Simpson "v27 = vsplat(%3)\n\t"
3745b0043c6STaylor Simpson "v25:24.uw = vsub(v17:16.uw, v27:26.uw):sat\n\t"
3755b0043c6STaylor Simpson "vmem(%4+#0) = v24\n\t"
3765b0043c6STaylor Simpson "vmem(%4+#1) = v25\n\t"
3775b0043c6STaylor Simpson : /* no outputs */
3785b0043c6STaylor Simpson : "r"(w), "r"(y), "r"(x), "r"(z), "r"(output)
3795b0043c6STaylor Simpson : "v16", "v17", "v24", "v25", "v26", "v27", "memory");
3805b0043c6STaylor Simpson
3815b0043c6STaylor Simpson for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
3825b0043c6STaylor Simpson expect[0].uw[j] = 0x00000000;
3835b0043c6STaylor Simpson expect[1].uw[j] = 0x00000000;
3845b0043c6STaylor Simpson }
3855b0043c6STaylor Simpson
3865b0043c6STaylor Simpson check_output_w(__LINE__, 2);
3875b0043c6STaylor Simpson }
3885b0043c6STaylor Simpson
test_load_tmp_predicated(void)38983853ea0STaylor Simpson static void test_load_tmp_predicated(void)
39083853ea0STaylor Simpson {
39183853ea0STaylor Simpson void *p0 = buffer0;
39283853ea0STaylor Simpson void *p1 = buffer1;
39383853ea0STaylor Simpson void *pout = output;
39483853ea0STaylor Simpson bool pred = true;
39583853ea0STaylor Simpson
39683853ea0STaylor Simpson for (int i = 0; i < BUFSIZE; i++) {
39783853ea0STaylor Simpson /*
39883853ea0STaylor Simpson * Load into v12 as .tmp with a predicate
39983853ea0STaylor Simpson * When the predicate is true, we get the vector from buffer1[i]
40083853ea0STaylor Simpson * When the predicate is false, we get a vector of all 1's
40183853ea0STaylor Simpson * Regardless of the predicate, the next packet should have
40283853ea0STaylor Simpson * a vector of all 1's
40383853ea0STaylor Simpson */
40483853ea0STaylor Simpson asm("v3 = vmem(%0 + #0)\n\t"
40583853ea0STaylor Simpson "r1 = #1\n\t"
40683853ea0STaylor Simpson "v12 = vsplat(r1)\n\t"
40783853ea0STaylor Simpson "p1 = !cmp.eq(%3, #0)\n\t"
40883853ea0STaylor Simpson "{\n\t"
40983853ea0STaylor Simpson " if (p1) v12.tmp = vmem(%1 + #0)\n\t"
41083853ea0STaylor Simpson " v4.w = vadd(v12.w, v3.w)\n\t"
41183853ea0STaylor Simpson "}\n\t"
41283853ea0STaylor Simpson "v4.w = vadd(v4.w, v12.w)\n\t"
41383853ea0STaylor Simpson "vmem(%2 + #0) = v4\n\t"
41483853ea0STaylor Simpson : : "r"(p0), "r"(p1), "r"(pout), "r"(pred)
41583853ea0STaylor Simpson : "r1", "p1", "v12", "v3", "v4", "v6", "memory");
41683853ea0STaylor Simpson p0 += sizeof(MMVector);
41783853ea0STaylor Simpson p1 += sizeof(MMVector);
41883853ea0STaylor Simpson pout += sizeof(MMVector);
41983853ea0STaylor Simpson
42083853ea0STaylor Simpson for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
42183853ea0STaylor Simpson expect[i].w[j] =
42283853ea0STaylor Simpson pred ? buffer0[i].w[j] + buffer1[i].w[j] + 1
42383853ea0STaylor Simpson : buffer0[i].w[j] + 2;
42483853ea0STaylor Simpson }
42583853ea0STaylor Simpson pred = !pred;
42683853ea0STaylor Simpson }
42783853ea0STaylor Simpson
42883853ea0STaylor Simpson check_output_w(__LINE__, BUFSIZE);
42983853ea0STaylor Simpson }
43083853ea0STaylor Simpson
test_load_cur_predicated(void)43183853ea0STaylor Simpson static void test_load_cur_predicated(void)
43283853ea0STaylor Simpson {
43383853ea0STaylor Simpson bool pred = true;
43483853ea0STaylor Simpson for (int i = 0; i < BUFSIZE; i++) {
43583853ea0STaylor Simpson asm volatile("p0 = !cmp.eq(%3, #0)\n\t"
43683853ea0STaylor Simpson "v3 = vmem(%0+#0)\n\t"
43783853ea0STaylor Simpson /*
43883853ea0STaylor Simpson * Preload v4 to make sure that the assignment from the
43983853ea0STaylor Simpson * packet below is not being ignored when pred is false.
44083853ea0STaylor Simpson */
44183853ea0STaylor Simpson "r0 = #0x01237654\n\t"
44283853ea0STaylor Simpson "v4 = vsplat(r0)\n\t"
44383853ea0STaylor Simpson "{\n\t"
44483853ea0STaylor Simpson " if (p0) v3.cur = vmem(%1+#0)\n\t"
44583853ea0STaylor Simpson " v4 = v3\n\t"
44683853ea0STaylor Simpson "}\n\t"
44783853ea0STaylor Simpson "vmem(%2+#0) = v4\n\t"
44883853ea0STaylor Simpson :
44983853ea0STaylor Simpson : "r"(&buffer0[i]), "r"(&buffer1[i]),
45083853ea0STaylor Simpson "r"(&output[i]), "r"(pred)
45183853ea0STaylor Simpson : "r0", "p0", "v3", "v4", "memory");
45283853ea0STaylor Simpson expect[i] = pred ? buffer1[i] : buffer0[i];
45383853ea0STaylor Simpson pred = !pred;
45483853ea0STaylor Simpson }
45583853ea0STaylor Simpson check_output_w(__LINE__, BUFSIZE);
45683853ea0STaylor Simpson }
45783853ea0STaylor Simpson
test_vcombine(void)458d05d5eebSTaylor Simpson static void test_vcombine(void)
459d05d5eebSTaylor Simpson {
460d05d5eebSTaylor Simpson for (int i = 0; i < BUFSIZE / 2; i++) {
461d05d5eebSTaylor Simpson asm volatile("v2 = vsplat(%0)\n\t"
462d05d5eebSTaylor Simpson "v3 = vsplat(%1)\n\t"
463d05d5eebSTaylor Simpson "v3:2 = vcombine(v2, v3)\n\t"
464d05d5eebSTaylor Simpson "vmem(%2+#0) = v2\n\t"
465d05d5eebSTaylor Simpson "vmem(%2+#1) = v3\n\t"
466d05d5eebSTaylor Simpson :
467d05d5eebSTaylor Simpson : "r"(2 * i), "r"(2 * i + 1), "r"(&output[2 * i])
468d05d5eebSTaylor Simpson : "v2", "v3", "memory");
469d05d5eebSTaylor Simpson for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
470d05d5eebSTaylor Simpson expect[2 * i].w[j] = 2 * i + 1;
471d05d5eebSTaylor Simpson expect[2 * i + 1].w[j] = 2 * i;
472d05d5eebSTaylor Simpson }
473d05d5eebSTaylor Simpson }
474d05d5eebSTaylor Simpson check_output_w(__LINE__, BUFSIZE);
475d05d5eebSTaylor Simpson }
476d05d5eebSTaylor Simpson
main()477afb9539eSTaylor Simpson int main()
478afb9539eSTaylor Simpson {
479afb9539eSTaylor Simpson init_buffers();
480afb9539eSTaylor Simpson
481afb9539eSTaylor Simpson test_load_tmp();
4823fd49e22SMarco Liebel test_load_tmp2();
483afb9539eSTaylor Simpson test_load_cur();
484afb9539eSTaylor Simpson test_load_aligned();
485afb9539eSTaylor Simpson test_load_unaligned();
486afb9539eSTaylor Simpson test_store_aligned();
487afb9539eSTaylor Simpson test_store_unaligned();
488afb9539eSTaylor Simpson test_masked_store(false);
489afb9539eSTaylor Simpson test_masked_store(true);
490afb9539eSTaylor Simpson test_new_value_store();
491afb9539eSTaylor Simpson test_max_temps();
492afb9539eSTaylor Simpson
493afb9539eSTaylor Simpson test_vadd_w();
494afb9539eSTaylor Simpson test_vadd_h();
495afb9539eSTaylor Simpson test_vadd_b();
496afb9539eSTaylor Simpson test_vsub_w();
497afb9539eSTaylor Simpson test_vsub_h();
498afb9539eSTaylor Simpson test_vsub_b();
499afb9539eSTaylor Simpson test_vxor();
500afb9539eSTaylor Simpson test_vand();
501afb9539eSTaylor Simpson test_vor();
502afb9539eSTaylor Simpson test_vnot();
503afb9539eSTaylor Simpson
504afb9539eSTaylor Simpson test_pred_or(false);
505afb9539eSTaylor Simpson test_pred_or_n(true);
506afb9539eSTaylor Simpson test_pred_and(false);
507afb9539eSTaylor Simpson test_pred_and_n(true);
508afb9539eSTaylor Simpson test_pred_xor(false);
509afb9539eSTaylor Simpson
5105b0043c6STaylor Simpson test_vadduwsat();
5115b0043c6STaylor Simpson test_vsubuwsat_dv();
5125b0043c6STaylor Simpson
51383853ea0STaylor Simpson test_load_tmp_predicated();
51483853ea0STaylor Simpson test_load_cur_predicated();
51583853ea0STaylor Simpson
516d05d5eebSTaylor Simpson test_vcombine();
517d05d5eebSTaylor Simpson
518afb9539eSTaylor Simpson puts(err ? "FAIL" : "PASS");
519afb9539eSTaylor Simpson return err ? 1 : 0;
520afb9539eSTaylor Simpson }
521