;*****************************************************************************
;* x86-optimized functions for gblur filter
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

; void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps,
;                          float nu, float bscale)
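;
; For reference, a scalar C sketch of what one call computes (written from
; the recurrences documented below, not copied verbatim from the C
; implementation). Each row is filtered `steps` times by a forward and a
; backward first-order IIR pass, with the boundary samples scaled by bscale:
;
;     static void horiz_slice_c(float *ptr, int width, int height,
;                               int steps, float nu, float bscale)
;     {
;         for (int y = 0; y < height; y++, ptr += width) {
;             for (int step = 0; step < steps; step++) {
;                 ptr[0] *= bscale;
;                 for (int x = 1; x < width; x++)       /* rightwards */
;                     ptr[x] += nu * ptr[x - 1];
;                 ptr[width - 1] *= bscale;
;                 for (int x = width - 1; x > 0; x--)   /* leftwards */
;                     ptr[x - 1] += nu * ptr[x];
;             }
;         }
;     }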

%macro HORIZ_SLICE 0
%if UNIX64
cglobal horiz_slice, 4, 9, 9, ptr, width, height, steps, x, y, step, stride, remain
%else
cglobal horiz_slice, 4, 9, 9, ptr, width, height, steps, nu, bscale, x, y, step, stride, remain
%endif
%if WIN64
    ; on Win64 the float args arrive on the stack; SysV already has
    ; nu in xmm0 (m0) and bscale in xmm1 (m1)
    movss m0, num
    movss m1, bscalem
    DEFINE_ARGS ptr, width, height, steps, x, y, step, stride, remain
%endif
    movsxdifnidn widthq, widthd

    mulss m2, m0, m0 ; nu ^ 2
    mulss m3, m2, m0 ; nu ^ 3
    mulss m4, m3, m0 ; nu ^ 4
    xor   xq, xq
    xor   yd, yd
    mov   strideq, widthq
    ; stride = width * 4
    shl   strideq, 2
    ; w = w - ((w - 1) & 3)
    mov   remainq, widthq
    sub   remainq, 1
    and   remainq, 3
    sub   widthq, remainq
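    ; Worked example: width = 10 gives remain = (10 - 1) & 3 = 1, so the
    ; aligned width becomes 9; the vector loop then covers x = 1..8 and the
    ; scalar tail handles x = 9. The alignment is taken over width - 1
    ; elements rather than width because ptr[0] is handled separately
    ; before the vector loop starts.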

    ; broadcast nu, nu^2, nu^3, nu^4 to all lanes
    shufps m0, m0, 0
    shufps m2, m2, 0
    shufps m3, m3, 0
    shufps m4, m4, 0

.loop_y:
    xor   stepd, stepd

    .loop_step:
        ; p0 *= bscale
        mulss m5, m1, [ptrq + xq * 4]
        movss [ptrq + xq * 4], m5
        inc xq
        ; filter rightwards
        ; Here we vectorize the C version by 4:
        ;    for (x = 1; x < width; x++)
        ;        ptr[x] += nu * ptr[x - 1];
        ; Let p0 stand for ptr[x - 1], the value from the previous
        ; iteration, and [p1,p2,p3,p4] be the vector data for this loop.
        ; Unrolling the loop, we get:
        ;   p1' = p1 + p0*nu
        ;   p2' = p2 + p1*nu + p0*nu^2
        ;   p3' = p3 + p2*nu + p1*nu^2 + p0*nu^3
        ;   p4' = p4 + p3*nu + p2*nu^2 + p1*nu^3 + p0*nu^4
        ; so we can do it in SIMD:
        ; [p1',p2',p3',p4'] = [p1,p2,p3,p4] + [p0,p1,p2,p3]*nu +
        ;                     [0,p0,p1,p2]*nu^2 + [0,0,p0,p1]*nu^3 +
        ;                     [0,0,0,p0]*nu^4
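        ; (Each primed value above follows by substituting the previous
        ;  one, e.g. p2' = p2 + p1'*nu = p2 + (p1 + p0*nu)*nu
        ;                              = p2 + p1*nu + p0*nu^2,
        ;  which is why shifted copies of the vector are scaled by
        ;  increasing powers of nu.)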

        .loop_x:
            movu m6, [ptrq + xq * 4]         ; s  = [p1,p2,p3,p4]
            pslldq m7, m6, 4                 ;      [0, p1,p2,p3]
            movss  m7, m5                    ;      [p0,p1,p2,p3]
            FMULADD_PS  m6, m7, m0, m6, m8   ; s += [p0,p1,p2,p3] * nu
            pslldq m7, 4                     ;      [0,p0,p1,p2]
            FMULADD_PS  m6, m7, m2, m6, m8   ; s += [0,p0,p1,p2]  * nu^2
            pslldq m7, 4                     ;      [0,0,p0,p1]
            FMULADD_PS  m6, m7, m3, m6, m8   ; s += [0,0,p0,p1]   * nu^3
            pslldq m7, 4                     ;      [0,0,0,p0]
            FMULADD_PS  m6, m7, m4, m6, m8   ; s += [0,0,0,p0]    * nu^4
            movu [ptrq + xq * 4], m6
            shufps m5, m6, m6, q3333         ; m5 = [p4',p4',p4',p4']
            add xq, 4
            cmp xq, widthq
            jl .loop_x

        ; restore the true width for the scalar tail
        add widthq, remainq
        cmp xq, widthq
        jge .end_scalar

        .loop_scalar:
            ; ptr[x] += nu * ptr[x-1]
            movss m5, [ptrq + 4*xq - 4]
            mulss m5, m0
            addss m5, [ptrq + 4*xq]
            movss [ptrq + 4*xq], m5
            inc xq
            cmp xq, widthq
            jl .loop_scalar
        .end_scalar:
            ; ptr[width - 1] *= bscale
            dec xq
            mulss m5, m1, [ptrq + 4*xq]
            movss [ptrq + 4*xq], m5
            shufps m5, m5, 0 ; m5 = [p0,p0,p0,p0] for the leftward pass

        ; filter leftwards
        ;    for (; x > 0; x--)
        ;        ptr[x - 1] += nu * ptr[x];
        ; The idea is basically the same as filtering rightwards, but we
        ; must take care, as the data layout is different. Let p0 stand
        ; for ptr[x], the value from the previous iteration. In SIMD:
        ; [p-4', p-3', p-2', p-1'] = [p-4, p-3, p-2, p-1]
        ;                          + [p-3, p-2, p-1, p0] * nu
        ;                          + [p-2, p-1, p0,  0]  * nu^2
        ;                          + [p-1, p0,  0,   0]  * nu^3
        ;                          + [p0,  0,   0,   0]  * nu^4
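        ; The same substitution as in the rightwards case applies mirrored,
        ; e.g. p-2' = p-2 + p-1'*nu = p-2 + p-1*nu + p0*nu^2. After the
        ; store, the lowest lane p-4' becomes the p0 of the next (lower)
        ; chunk, hence the broadcast at the end of the loop body.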
        .loop_x_back:
            sub xq, 4
            movu m6, [ptrq + xq * 4]      ; s = [p-4, p-3, p-2, p-1]
            psrldq m7, m6, 4              ;     [p-3, p-2, p-1, 0  ]
            blendps m7, m5, 0x8           ;     [p-3, p-2, p-1, p0 ]
            FMULADD_PS m6, m7, m0, m6, m8 ; s+= [p-3, p-2, p-1, p0 ] * nu
            psrldq m7, 4                  ;     [p-2, p-1, p0,  0  ]
            FMULADD_PS m6, m7, m2, m6, m8 ; s+= [p-2, p-1, p0,  0  ] * nu^2
            psrldq m7, 4                  ;     [p-1, p0,  0,   0  ]
            FMULADD_PS m6, m7, m3, m6, m8 ; s+= [p-1, p0,  0,   0  ] * nu^3
            psrldq m7, 4                  ;     [p0,  0,   0,   0  ]
            FMULADD_PS m6, m7, m4, m6, m8 ; s+= [p0,  0,   0,   0  ] * nu^4
            movu [ptrq + xq * 4], m6
            shufps m5, m6, m6, 0          ; m5 = [p-4', p-4', p-4', p-4']
            cmp xq, remainq
            jg .loop_x_back

        cmp xq, 0
        jle .end_scalar_back

        .loop_scalar_back:
            ; ptr[x-1] += nu * ptr[x]
            movss m5, [ptrq + 4*xq]
            mulss m5, m0
            addss m5, [ptrq + 4*xq - 4]
            movss [ptrq + 4*xq - 4], m5
            dec xq
            cmp xq, 0
            jg .loop_scalar_back
        .end_scalar_back:

        ; reset aligned width for next line
        sub widthq, remainq

        inc stepd
        cmp stepd, stepsd
        jl .loop_step

    add ptrq, strideq
    inc yd
    cmp yd, heightd
    jl .loop_y

    RET
%endmacro

%if ARCH_X86_64
INIT_XMM sse4
HORIZ_SLICE

; still xmm-width, but avx2 implies fma3 in x86inc, so FMULADD_PS can fuse
INIT_XMM avx2
HORIZ_SLICE
%endif

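; On the C side, x86inc expands these to ff_horiz_slice_sse4() and
; ff_horiz_slice_avx2(), and the gblur init code picks one at runtime.
; A sketch of that dispatch, assuming FFmpeg's usual EXTERNAL_* cpu-flag
; macros and the GBlurContext function pointer (not verbatim):
;
;     if (EXTERNAL_SSE4(cpu_flags))
;         s->horiz_slice = ff_horiz_slice_sse4;
;     if (EXTERNAL_AVX2_FAST(cpu_flags))
;         s->horiz_slice = ff_horiz_slice_avx2;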