xref: /qemu/tests/tcg/hexagon/hvx_histogram_row.S (revision 49278c1b)
1*49278c1bSTaylor Simpson/*
2*49278c1bSTaylor Simpson *  Copyright(c) 2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
3*49278c1bSTaylor Simpson *
4*49278c1bSTaylor Simpson *  This program is free software; you can redistribute it and/or modify
5*49278c1bSTaylor Simpson *  it under the terms of the GNU General Public License as published by
6*49278c1bSTaylor Simpson *  the Free Software Foundation; either version 2 of the License, or
7*49278c1bSTaylor Simpson *  (at your option) any later version.
8*49278c1bSTaylor Simpson *
9*49278c1bSTaylor Simpson *  This program is distributed in the hope that it will be useful,
10*49278c1bSTaylor Simpson *  but WITHOUT ANY WARRANTY; without even the implied warranty of
11*49278c1bSTaylor Simpson *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12*49278c1bSTaylor Simpson *  GNU General Public License for more details.
13*49278c1bSTaylor Simpson *
14*49278c1bSTaylor Simpson *  You should have received a copy of the GNU General Public License
15*49278c1bSTaylor Simpson *  along with this program; if not, see <http://www.gnu.org/licenses/>.
16*49278c1bSTaylor Simpson */
17*49278c1bSTaylor Simpson
18*49278c1bSTaylor Simpson
/*
 * void hvx_histogram_row(uint8_t *src,     => r0
 *                        int stride,       => r1
 *                        int width,        => r2
 *                        int height,       => r3
 *                        int *hist         => r4)
 *
 * Accumulates a 256-bin histogram of the bytes in a width x height image
 * into hist[].  hist[] is updated in place (read, add, write back), so the
 * caller decides whether it starts from zero.
 *
 * Outline:
 *   Step 1: zero v0-v31.
 *   Step 2: for every row, run vhist once per full VLEN-byte (128-byte)
 *           vector and once more, masked by q0, for the width % VLEN tail.
 *   Step 3: widen the 16-bit partial counts to 32 bits, fold the partial
 *           results together and add them to hist[0..255].
 *
 * Does nothing when height <= 0.
 *
 * Registers written: r0, r2, r4, r5, r7, r10, r28, p0, p1, q0, v0-v31 and
 * both hardware loops (loop0/loop1).
 *
 * NOTE(review): src rows and hist are accessed with vmem (not vmemu), so
 * they are presumably expected to be VLEN-aligned - confirm with the caller.
 * NOTE(review): the partial counts are summed as halfwords in step 3, so
 * very large images could presumably overflow them - confirm the limits.
 */
    .text
    .p2align 2
    .global hvx_histogram_row
    .type hvx_histogram_row, @function
hvx_histogram_row:
    /*
     * A hardware loop runs its body at least once (which is also why the
     * inner loop below is guarded by P0), so return before doing any work
     * when there are no rows to process.
     */
    p0 = cmp.gt(r3, #0)         /* P0 = (height > 0) */
    if (!p0) jumpr r31

    /*
     * Instructions in a packet read the register values from before the
     * packet, so R5 below is computed from the original width in R2.
     */
    { r2 = lsr(r2, #7)          /* width / VLEN */
      r5 = and(r2, #127)        /* width % VLEN */
      v1 = #0
      v0 = #0
    }
    /*
     * Step 1: Clean the whole vector register file
     */
    { v3:2 = v1:0
      v5:4 = v1:0
      p0 = cmp.gt(r2, #0)       /* P0 = (width / VLEN > 0) */
      p1 = cmp.eq(r5, #0)       /* P1 = (width % VLEN == 0) */
    }
    { q0 = vsetq(r5)            /* Q0 = byte mask for the row tail */
      v7:6 = v1:0
    }
    { v9:8   = v1:0
      v11:10 = v1:0
    }
    { v13:12 = v1:0
      v15:14 = v1:0
    }
    { v17:16 = v1:0
      v19:18 = v1:0
    }
    { v21:20 = v1:0
      v23:22 = v1:0
    }
    { v25:24 = v1:0
      v27:26 = v1:0
    }
    { v29:28 = v1:0
      v31:30 = v1:0
      r10 = add(r0, r1)           /* R10 = &src[1 * stride] (next row) */
      loop1(.outerloop, r3)       /* one outer iteration per row */
    }

    /*
     * Step 2: vhist
     */
    .falign
.outerloop:
    /* Skip the inner loop entirely when the row has no full vector. */
    { if (!p0) jump .loopend
      loop0(.innerloop, r2)
    }

    .falign
.innerloop:
    /* Histogram one full vector of the row and advance R0 by one vector. */
    { v12.tmp = vmem(r0++#1)
      vhist
    }:endloop0

    .falign
.loopend:
    if (p1) jump .skip       /* if (width % VLEN == 0) done with current row */
    /* Histogram the row tail; Q0 selects the width % VLEN bytes to count. */
    { v13.tmp = vmem(r0 + #0)
      vhist(q0)
    }

    .falign
.skip:
    { r0 = r10                    /* R0  = &src[(i + 1) * stride] */
      r10 = add(r10, r1)          /* R10 = &src[(i + 2) * stride] */
    }:endloop1


    /*
     * Step 3: Sum up the data
     *
     * The stages below are interleaved across packets, so the packet order
     * matters.  For every register: shuffle its halfwords, then vdmpy with
     * R10 = 0x00010001 (both halfword multipliers are 1) to add each pair
     * of adjacent halfwords into one 32-bit word.
     */
    { v0.h = vshuff(v0.h)
      r10 = ##0x00010001
    }
    v1.h = vshuff(v1.h)
    { v2.h = vshuff(v2.h)
      v0.w = vdmpy(v0.h, r10.h):sat
    }
    { v3.h = vshuff(v3.h)
      v1.w = vdmpy(v1.h, r10.h):sat
    }
    { v4.h = vshuff(v4.h)
      v2.w = vdmpy(v2.h, r10.h):sat
    }
    { v5.h = vshuff(v5.h)
      v3.w = vdmpy(v3.h, r10.h):sat
    }
    { v6.h = vshuff(v6.h)
      v4.w = vdmpy(v4.h, r10.h):sat
    }
    { v7.h = vshuff(v7.h)
      v5.w = vdmpy(v5.h, r10.h):sat
    }
    { v8.h = vshuff(v8.h)
      v6.w = vdmpy(v6.h, r10.h):sat
    }
    { v9.h = vshuff(v9.h)
      v7.w = vdmpy(v7.h, r10.h):sat
    }
    { v10.h = vshuff(v10.h)
      v8.w = vdmpy(v8.h, r10.h):sat
    }
    { v11.h = vshuff(v11.h)
      v9.w = vdmpy(v9.h, r10.h):sat
    }
    { v12.h = vshuff(v12.h)
      v10.w = vdmpy(v10.h, r10.h):sat
    }
    { v13.h = vshuff(v13.h)
      v11.w = vdmpy(v11.h, r10.h):sat
    }
    { v14.h = vshuff(v14.h)
      v12.w = vdmpy(v12.h, r10.h):sat
    }
    { v15.h = vshuff(v15.h)
      v13.w = vdmpy(v13.h, r10.h):sat
    }
    { v16.h = vshuff(v16.h)
      v14.w = vdmpy(v14.h, r10.h):sat
    }
    { v17.h = vshuff(v17.h)
      v15.w = vdmpy(v15.h, r10.h):sat
    }
    { v18.h = vshuff(v18.h)
      v16.w = vdmpy(v16.h, r10.h):sat
    }
    { v19.h = vshuff(v19.h)
      v17.w = vdmpy(v17.h, r10.h):sat
    }
    { v20.h = vshuff(v20.h)
      v18.w = vdmpy(v18.h, r10.h):sat
    }
    { v21.h = vshuff(v21.h)
      v19.w = vdmpy(v19.h, r10.h):sat
    }
    { v22.h = vshuff(v22.h)
      v20.w = vdmpy(v20.h, r10.h):sat
    }
    { v23.h = vshuff(v23.h)
      v21.w = vdmpy(v21.h, r10.h):sat
    }
    { v24.h = vshuff(v24.h)
      v22.w = vdmpy(v22.h, r10.h):sat
    }
    { v25.h = vshuff(v25.h)
      v23.w = vdmpy(v23.h, r10.h):sat
    }
    { v26.h = vshuff(v26.h)
      v24.w = vdmpy(v24.h, r10.h):sat
    }
    { v27.h = vshuff(v27.h)
      v25.w = vdmpy(v25.h, r10.h):sat
    }
    { v28.h = vshuff(v28.h)
      v26.w = vdmpy(v26.h, r10.h):sat
    }
    { v29.h = vshuff(v29.h)
      v27.w = vdmpy(v27.h, r10.h):sat
    }
    { v30.h = vshuff(v30.h)
      v28.w = vdmpy(v28.h, r10.h):sat
    }
    { v31.h = vshuff(v31.h)
      v29.w = vdmpy(v29.h, r10.h):sat
      r28 = #32                   /* control value for the pairwise shuffles */
    }
    /*
     * Shuffle each odd/even register pair with R28 and add the pair,
     * leaving the sums in the even registers v0, v2, ..., v30.
     */
    { vshuff(v1, v0, r28)
      v30.w = vdmpy(v30.h, r10.h):sat
    }
    { vshuff(v3, v2, r28)
      v31.w = vdmpy(v31.h, r10.h):sat
    }
    { vshuff(v5, v4, r28)
      v0.w = vadd(v1.w, v0.w)
      v2.w = vadd(v3.w, v2.w)
    }
    { vshuff(v7, v6, r28)
      r7 = #64                    /* control value for the second shuffle stage */
    }
    { vshuff(v9, v8, r28)
      v4.w = vadd(v5.w, v4.w)
      v6.w = vadd(v7.w, v6.w)
    }
    vshuff(v11, v10, r28)
    { vshuff(v13, v12, r28)
      v8.w = vadd(v9.w, v8.w)
      v10.w = vadd(v11.w, v10.w)
    }
    vshuff(v15, v14, r28)
    { vshuff(v17, v16, r28)
      v12.w = vadd(v13.w, v12.w)
      v14.w = vadd(v15.w, v14.w)
    }
    vshuff(v19, v18, r28)
    { vshuff(v21, v20, r28)
      v16.w = vadd(v17.w, v16.w)
      v18.w = vadd(v19.w, v18.w)
    }
    vshuff(v23, v22, r28)
    { vshuff(v25, v24, r28)
      v20.w = vadd(v21.w, v20.w)
      v22.w = vadd(v23.w, v22.w)
    }
    vshuff(v27, v26, r28)
    { vshuff(v29, v28, r28)
      v24.w = vadd(v25.w, v24.w)
      v26.w = vadd(v27.w, v26.w)
    }
    vshuff(v31, v30, r28)
    /*
     * Second stage: shuffle the even registers in pairs with R7 and add
     * them, leaving one vector of 32 word counts in each of v0, v4, ...,
     * v28.  Each result is added to the matching 32 entries of hist[] and
     * stored back, and R4 advances by one vector per store.
     */
    { v28.w = vadd(v29.w, v28.w)
      vshuff(v2, v0, r7)
    }
    { v30.w = vadd(v31.w, v30.w)
      vshuff(v6, v4, r7)
      v0.w  = vadd(v0.w, v2.w)
    }
    { vshuff(v10, v8, r7)
      v1.tmp = vmem(r4 + #0)      /* update hist[0-31] */
      v0.w  = vadd(v0.w, v1.w)
      vmem(r4++#1) = v0.new
    }
    { vshuff(v14, v12, r7)
      v4.w  = vadd(v4.w, v6.w)
      v8.w  = vadd(v8.w, v10.w)
    }
    { vshuff(v18, v16, r7)
      v1.tmp = vmem(r4 + #0)      /* update hist[32-63] */
      v4.w  = vadd(v4.w, v1.w)
      vmem(r4++#1) = v4.new
    }
    { vshuff(v22, v20, r7)
      v12.w = vadd(v12.w, v14.w)
      v16.w = vadd(v16.w, v18.w)
    }
    { vshuff(v26, v24, r7)
      v1.tmp = vmem(r4 + #0)      /* update hist[64-95] */
      v8.w  = vadd(v8.w, v1.w)
      vmem(r4++#1) = v8.new
    }
    { vshuff(v30, v28, r7)
      v1.tmp = vmem(r4 + #0)      /* update hist[96-127] */
      v12.w  = vadd(v12.w, v1.w)
      vmem(r4++#1) = v12.new
    }

    { v20.w = vadd(v20.w, v22.w)
      v1.tmp = vmem(r4 + #0)      /* update hist[128-159] */
      v16.w  = vadd(v16.w, v1.w)
      vmem(r4++#1) = v16.new
    }
    { v24.w = vadd(v24.w, v26.w)
      v1.tmp = vmem(r4 + #0)      /* update hist[160-191] */
      v20.w  = vadd(v20.w, v1.w)
      vmem(r4++#1) = v20.new
    }
    { v28.w = vadd(v28.w, v30.w)
      v1.tmp = vmem(r4 + #0)      /* update hist[192-223] */
      v24.w  = vadd(v24.w, v1.w)
      vmem(r4++#1) = v24.new
    }
    { v1.tmp = vmem(r4 + #0)      /* update hist[224-255] */
      v28.w  = vadd(v28.w, v1.w)
      vmem(r4++#1) = v28.new
    }
    jumpr r31
    .size hvx_histogram_row, .-hvx_histogram_row