/*
 *  Copyright(c) 2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 */


/*
 * void hvx_histogram_row(uint8_t *src,     => r0
 *                        int stride,       => r1
 *                        int width,        => r2
 *                        int height,       => r3
 *                        int *hist         => r4)
 *
 * Accumulates a 256-bin histogram of the bytes of a width x height image
 * into hist[].  Consecutive rows start 'stride' bytes apart.  The counts
 * are ADDED to whatever hist[] already holds (each 32-entry slice is
 * loaded, summed and stored back), so the caller decides whether to zero
 * it first.
 *
 * VLEN is 128 bytes here (see the '#7' shift and '#127' mask below).
 *
 * NOTE(review): every access to src and hist uses vmem, so both are
 * presumably expected to be vector-aligned -- confirm against the caller.
 * NOTE(review): the tail of a row is fetched with a full-vector load and
 * only masked afterwards by q0, so bytes past 'width' appear to be read
 * (but not counted) -- confirm the source buffer is padded accordingly.
 *
 * Scalar registers written: r0, r2, r4, r5, r7, r10, r28, p0, p1.
 * Vector registers written: v0-v31, q0.
 */
    .text
    .p2align 2
    .global hvx_histogram_row
    .type hvx_histogram_row, @function
hvx_histogram_row:
    /*
     * Both scalar instructions consume the incoming width in r2: sources
     * of a packet are read before any of its results are written.
     */
    { r2 = lsr(r2, #7)          /* width / VLEN */
      r5 = and(r2, #127)        /* width % VLEN */
      v1 = #0
      v0 = #0
    }
    /*
     * Step 1: Clean the whole vector register file
     *
     * v1:0 was zeroed above and is copied into every other register pair;
     * the vhist instructions in Step 2 accumulate on top of these zeros.
     */
    { v3:2 = v1:0
      v5:4 = v1:0
      p0 = cmp.gt(r2, #0)       /* P0 = (width / VLEN > 0) */
      p1 = cmp.eq(r5, #0)       /* P1 = (width % VLEN == 0) */
    }
    { q0 = vsetq(r5)            /* Q0 = mask for the width % VLEN tail */
      v7:6 = v1:0
    }
    { v9:8 = v1:0
      v11:10 = v1:0
    }
    { v13:12 = v1:0
      v15:14 = v1:0
    }
    { v17:16 = v1:0
      v19:18 = v1:0
    }
    { v21:20 = v1:0
      v23:22 = v1:0
    }
    { v25:24 = v1:0
      v27:26 = v1:0
    }
    { v29:28 = v1:0
      v31:30 = v1:0
      r10 = add(r0, r1)           /* R10 = &src[1 * stride] (next row) */
      loop1(.outerloop, r3)       /* one outer iteration per row */
    }

    /*
     * Step 2: vhist
     *
     * Outer hardware loop walks the rows, inner hardware loop walks the
     * full vectors of one row.  R0 is the read cursor inside the current
     * row; R10 always holds the start address of the following row.
     */
    .falign
.outerloop:
    { if (!p0) jump .loopend      /* row shorter than one vector */
      loop0(.innerloop, r2)
    }

    .falign
.innerloop:
    { v12.tmp = vmem(r0++#1)      /* .tmp load feeds vhist in this packet */
      vhist
    }:endloop0

    .falign
.loopend:
    if (p1) jump .skip        /* if (width % VLEN == 0) done with current row */
    { v13.tmp = vmem(r0 + #0)     /* partial vector at the end of the row */
      vhist(q0)                   /* only the bytes selected by Q0 count */
    }

    .falign
.skip:
    { r0 = r10                    /* R0  = &src[(i + 1) * stride] */
      r10 = add(r10, r1)          /* R10 = &src[(i + 2) * stride] */
    }:endloop1


    /*
     * Step 3: Sum up the data
     *
     * Stage A: every register is halfword-shuffled and then fed to vdmpy
     *          with R10 = 0x00010001.  With both halfword multipliers
     *          equal to 1, vdmpy reduces each halfword pair to one 32-bit
     *          word.  The vdmpy of register N is issued two packets after
     *          the vshuff of register N, sharing packets with the shuffles
     *          of later registers.
     * Stage B: neighbouring registers are cross-shuffled (R28 = 32) and
     *          added, leaving partial sums in the even registers.
     * Stage C: even registers are cross-shuffled (R7 = 64) and added,
     *          leaving the final counts in v0, v4, v8, ..., v28, which are
     *          added to hist[] 32 entries at a time.
     */
    { v0.h = vshuff(v0.h)
      r10 = ##0x00010001
    }
    v1.h = vshuff(v1.h)
    { v2.h = vshuff(v2.h)
      v0.w = vdmpy(v0.h, r10.h):sat
    }
    { v3.h = vshuff(v3.h)
      v1.w = vdmpy(v1.h, r10.h):sat
    }
    { v4.h = vshuff(v4.h)
      v2.w = vdmpy(v2.h, r10.h):sat
    }
    { v5.h = vshuff(v5.h)
      v3.w = vdmpy(v3.h, r10.h):sat
    }
    { v6.h = vshuff(v6.h)
      v4.w = vdmpy(v4.h, r10.h):sat
    }
    { v7.h = vshuff(v7.h)
      v5.w = vdmpy(v5.h, r10.h):sat
    }
    { v8.h = vshuff(v8.h)
      v6.w = vdmpy(v6.h, r10.h):sat
    }
    { v9.h = vshuff(v9.h)
      v7.w = vdmpy(v7.h, r10.h):sat
    }
    { v10.h = vshuff(v10.h)
      v8.w = vdmpy(v8.h, r10.h):sat
    }
    { v11.h = vshuff(v11.h)
      v9.w = vdmpy(v9.h, r10.h):sat
    }
    { v12.h = vshuff(v12.h)
      v10.w = vdmpy(v10.h, r10.h):sat
    }
    { v13.h = vshuff(v13.h)
      v11.w = vdmpy(v11.h, r10.h):sat
    }
    { v14.h = vshuff(v14.h)
      v12.w = vdmpy(v12.h, r10.h):sat
    }
    { v15.h = vshuff(v15.h)
      v13.w = vdmpy(v13.h, r10.h):sat
    }
    { v16.h = vshuff(v16.h)
      v14.w = vdmpy(v14.h, r10.h):sat
    }
    { v17.h = vshuff(v17.h)
      v15.w = vdmpy(v15.h, r10.h):sat
    }
    { v18.h = vshuff(v18.h)
      v16.w = vdmpy(v16.h, r10.h):sat
    }
    { v19.h = vshuff(v19.h)
      v17.w = vdmpy(v17.h, r10.h):sat
    }
    { v20.h = vshuff(v20.h)
      v18.w = vdmpy(v18.h, r10.h):sat
    }
    { v21.h = vshuff(v21.h)
      v19.w = vdmpy(v19.h, r10.h):sat
    }
    { v22.h = vshuff(v22.h)
      v20.w = vdmpy(v20.h, r10.h):sat
    }
    { v23.h = vshuff(v23.h)
      v21.w = vdmpy(v21.h, r10.h):sat
    }
    { v24.h = vshuff(v24.h)
      v22.w = vdmpy(v22.h, r10.h):sat
    }
    { v25.h = vshuff(v25.h)
      v23.w = vdmpy(v23.h, r10.h):sat
    }
    { v26.h = vshuff(v26.h)
      v24.w = vdmpy(v24.h, r10.h):sat
    }
    { v27.h = vshuff(v27.h)
      v25.w = vdmpy(v25.h, r10.h):sat
    }
    { v28.h = vshuff(v28.h)
      v26.w = vdmpy(v26.h, r10.h):sat
    }
    { v29.h = vshuff(v29.h)
      v27.w = vdmpy(v27.h, r10.h):sat
    }
    { v30.h = vshuff(v30.h)
      v28.w = vdmpy(v28.h, r10.h):sat
    }
    { v31.h = vshuff(v31.h)
      v29.w = vdmpy(v29.h, r10.h):sat
      r28 = #32                   /* shuffle control for Stage B */
    }
    { vshuff(v1, v0, r28)
      v30.w = vdmpy(v30.h, r10.h):sat
    }
    { vshuff(v3, v2, r28)
      v31.w = vdmpy(v31.h, r10.h):sat
    }
    { vshuff(v5, v4, r28)
      v0.w = vadd(v1.w, v0.w)
      v2.w = vadd(v3.w, v2.w)
    }
    { vshuff(v7, v6, r28)
      r7 = #64                    /* shuffle control for Stage C */
    }
    { vshuff(v9, v8, r28)
      v4.w = vadd(v5.w, v4.w)
      v6.w = vadd(v7.w, v6.w)
    }
    vshuff(v11, v10, r28)
    { vshuff(v13, v12, r28)
      v8.w = vadd(v9.w, v8.w)
      v10.w = vadd(v11.w, v10.w)
    }
    vshuff(v15, v14, r28)
    { vshuff(v17, v16, r28)
      v12.w = vadd(v13.w, v12.w)
      v14.w = vadd(v15.w, v14.w)
    }
    vshuff(v19, v18, r28)
    { vshuff(v21, v20, r28)
      v16.w = vadd(v17.w, v16.w)
      v18.w = vadd(v19.w, v18.w)
    }
    vshuff(v23, v22, r28)
    { vshuff(v25, v24, r28)
      v20.w = vadd(v21.w, v20.w)
      v22.w = vadd(v23.w, v22.w)
    }
    vshuff(v27, v26, r28)
    { vshuff(v29, v28, r28)
      v24.w = vadd(v25.w, v24.w)
      v26.w = vadd(v27.w, v26.w)
    }
    vshuff(v31, v30, r28)
    { v28.w = vadd(v29.w, v28.w)
      vshuff(v2, v0, r7)
    }
    { v30.w = vadd(v31.w, v30.w)
      vshuff(v6, v4, r7)
      v0.w = vadd(v0.w, v2.w)
    }
    /*
     * Each "update hist[...]" packet below is a read-modify-write of one
     * vector of hist[]: the current contents are loaded into v1 as a
     * packet-local temporary, added to the finished counts, and the fresh
     * sum (.new) is stored back while R4 steps to the next vector.
     */
    { vshuff(v10, v8, r7)
      v1.tmp = vmem(r4 + #0)      /* update hist[0-31] */
      v0.w = vadd(v0.w, v1.w)
      vmem(r4++#1) = v0.new
    }
    { vshuff(v14, v12, r7)
      v4.w = vadd(v4.w, v6.w)
      v8.w = vadd(v8.w, v10.w)
    }
    { vshuff(v18, v16, r7)
      v1.tmp = vmem(r4 + #0)      /* update hist[32-63] */
      v4.w = vadd(v4.w, v1.w)
      vmem(r4++#1) = v4.new
    }
    { vshuff(v22, v20, r7)
      v12.w = vadd(v12.w, v14.w)
      v16.w = vadd(v16.w, v18.w)
    }
    { vshuff(v26, v24, r7)
      v1.tmp = vmem(r4 + #0)      /* update hist[64-95] */
      v8.w = vadd(v8.w, v1.w)
      vmem(r4++#1) = v8.new
    }
    { vshuff(v30, v28, r7)
      v1.tmp = vmem(r4 + #0)      /* update hist[96-127] */
      v12.w = vadd(v12.w, v1.w)
      vmem(r4++#1) = v12.new
    }

    { v20.w = vadd(v20.w, v22.w)
      v1.tmp = vmem(r4 + #0)      /* update hist[128-159] */
      v16.w = vadd(v16.w, v1.w)
      vmem(r4++#1) = v16.new
    }
    { v24.w = vadd(v24.w, v26.w)
      v1.tmp = vmem(r4 + #0)      /* update hist[160-191] */
      v20.w = vadd(v20.w, v1.w)
      vmem(r4++#1) = v20.new
    }
    { v28.w = vadd(v28.w, v30.w)
      v1.tmp = vmem(r4 + #0)      /* update hist[192-223] */
      v24.w = vadd(v24.w, v1.w)
      vmem(r4++#1) = v24.new
    }
    { v1.tmp = vmem(r4 + #0)      /* update hist[224-255] */
      v28.w = vadd(v28.w, v1.w)
      vmem(r4++#1) = v28.new
    }
    jumpr r31
    .size hvx_histogram_row, .-hvx_histogram_row