;******************************************************************************
;* SIMD optimized SAO functions for HEVC 8bit decoding
;*
;* Copyright (c) 2013 Pierre-Edouard LEPERE
;* Copyright (c) 2014 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

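; pb_edge_shuffle reorders the five packed sao_offset_val entries into
; edge-index order (edge indices 0..4 select offsets 1, 2, 0, 3, 4), so the
; computed edge index can be used directly as a pshufb lookup.
; pb_eo holds, for each edge-offset class (0: horizontal, 1: vertical,
; 2: 135 degree, 3: 45 degree), the byte quadruple {a_dx, a_dy, b_dx, b_dy}
; giving the positions of the two neighbour samples relative to the current one.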
pb_edge_shuffle: times 2 db 1, 2, 0, 3, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
pb_eo:                   db -1, 0, 1, 0, 0, -1, 0, 1, -1, -1, 1, 1, 1, -1, -1, 1
cextern pb_1
cextern pb_2

SECTION .text

;******************************************************************************
;SAO Band Filter
;******************************************************************************

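; HEVC_SAO_BAND_FILTER_INIT broadcasts the four band indices
; (sao_left_class + k) & 31, k = 0..3, as words into m0-m3 and the four
; signalled offsets sao_offset_val[1..4] into m4-m7.  m14 is zeroed for the
; byte<->word unpacking in the loop.  On x86_32 there are not enough XMM
; registers, so the constants are spilled to the stack and the high register
; numbers are aliased to low ones.  Finally the argument names are redefined
; for the row loop and the height argument is loaded.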
%macro HEVC_SAO_BAND_FILTER_INIT 0
    and            leftq, 31
    movd             xm0, leftd
    add            leftq, 1
    and            leftq, 31
    movd             xm1, leftd
    add            leftq, 1
    and            leftq, 31
    movd             xm2, leftd
    add            leftq, 1
    and            leftq, 31
    movd             xm3, leftd

    SPLATW            m0, xm0
    SPLATW            m1, xm1
    SPLATW            m2, xm2
    SPLATW            m3, xm3
%if mmsize > 16
    SPLATW            m4, [offsetq + 2]
    SPLATW            m5, [offsetq + 4]
    SPLATW            m6, [offsetq + 6]
    SPLATW            m7, [offsetq + 8]
%else
    movq              m7, [offsetq + 2]
    SPLATW            m4, m7, 0
    SPLATW            m5, m7, 1
    SPLATW            m6, m7, 2
    SPLATW            m7, m7, 3
%endif

%if ARCH_X86_64
    pxor             m14, m14

%else ; ARCH_X86_32
    mova  [rsp+mmsize*0], m0
    mova  [rsp+mmsize*1], m1
    mova  [rsp+mmsize*2], m2
    mova  [rsp+mmsize*3], m3
    mova  [rsp+mmsize*4], m4
    mova  [rsp+mmsize*5], m5
    mova  [rsp+mmsize*6], m6
    pxor              m0, m0
    %assign MMSIZE mmsize
    %define m14 m0
    %define m13 m1
    %define  m9 m2
    %define  m8 m3
%endif ; ARCH
DEFINE_ARGS dst, src, dststride, srcstride, offset, height
    mov          heightd, r7m
%endmacro

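; HEVC_SAO_BAND_FILTER_COMPUTE classifies each 16-bit sample in %2 into one
; of the 32 bands (sample >> 3), compares the band index against the four
; selected bands in m0-m3 (their stack copies on x86_32), masks the matching
; offset from m4-m7 and adds it to the samples in %2.  %1 is a scratch
; register; samples outside the four selected bands are left unchanged.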
%macro HEVC_SAO_BAND_FILTER_COMPUTE 2
    psraw             %1, %2, 3
%if ARCH_X86_64
    pcmpeqw          m10, %1, m0
    pcmpeqw          m11, %1, m1
    pcmpeqw          m12, %1, m2
    pcmpeqw           %1, m3
    pand             m10, m4
    pand             m11, m5
    pand             m12, m6
    pand              %1, m7
    por              m10, m11
    por              m12, %1
    por              m10, m12
    paddw             %2, m10
%else ; ARCH_X86_32
    pcmpeqw           m4, %1, [rsp+MMSIZE*0]
    pcmpeqw           m5, %1, [rsp+MMSIZE*1]
    pcmpeqw           m6, %1, [rsp+MMSIZE*2]
    pcmpeqw           %1, [rsp+MMSIZE*3]
    pand              m4, [rsp+MMSIZE*4]
    pand              m5, [rsp+MMSIZE*5]
    pand              m6, [rsp+MMSIZE*6]
    pand              %1, m7
    por               m4, m5
    por               m6, %1
    por               m4, m6
    paddw             %2, m4
%endif ; ARCH
%endmacro

;void ff_hevc_sao_band_filter_<width>_8_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
;                                             int16_t *sao_offset_val, int sao_left_class, int width, int height);
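; %1 is the block width in pixels, %2 the number of full-width vector
; iterations per row.  Each row is unpacked to 16 bits, run through
; HEVC_SAO_BAND_FILTER_COMPUTE and repacked with unsigned saturation.  For
; width 48 the trailing 16 pixels are processed with XMM registers even in
; the YMM build.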
%macro HEVC_SAO_BAND_FILTER 2
cglobal hevc_sao_band_filter_%1_8, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left
    HEVC_SAO_BAND_FILTER_INIT

align 16
.loop:
%if %1 == 8
    movq              m8, [srcq]
    punpcklbw         m8, m14
    HEVC_SAO_BAND_FILTER_COMPUTE m9, m8
    packuswb          m8, m14
    movq          [dstq], m8
%endif ; %1 == 8

%assign i 0
%rep %2
    mova             m13, [srcq + i]
    punpcklbw         m8, m13, m14
    HEVC_SAO_BAND_FILTER_COMPUTE m9,  m8
    punpckhbw        m13, m14
    HEVC_SAO_BAND_FILTER_COMPUTE m9, m13
    packuswb          m8, m13
    mova      [dstq + i], m8
%assign i i+mmsize
%endrep

%if %1 == 48
INIT_XMM cpuname

    mova             m13, [srcq + i]
    punpcklbw         m8, m13, m14
    HEVC_SAO_BAND_FILTER_COMPUTE m9,  m8
    punpckhbw        m13, m14
    HEVC_SAO_BAND_FILTER_COMPUTE m9, m13
    packuswb          m8, m13
    mova      [dstq + i], m8
%if cpuflag(avx2)
INIT_YMM cpuname
%endif
%endif ; %1 == 48

    add             dstq, dststrideq             ; dst += dststride
    add             srcq, srcstrideq             ; src += srcstride
    dec          heightd                         ; cmp height
    jnz               .loop                      ; height loop
    REP_RET
%endmacro


%macro HEVC_SAO_BAND_FILTER_FUNCS 0
HEVC_SAO_BAND_FILTER  8, 0
HEVC_SAO_BAND_FILTER 16, 1
HEVC_SAO_BAND_FILTER 32, 2
HEVC_SAO_BAND_FILTER 48, 2
HEVC_SAO_BAND_FILTER 64, 4
%endmacro

INIT_XMM sse2
HEVC_SAO_BAND_FILTER_FUNCS
INIT_XMM avx
HEVC_SAO_BAND_FILTER_FUNCS

%if HAVE_AVX2_EXTERNAL
INIT_XMM avx2
HEVC_SAO_BAND_FILTER  8, 0
HEVC_SAO_BAND_FILTER 16, 1
INIT_YMM avx2
HEVC_SAO_BAND_FILTER 32, 1
HEVC_SAO_BAND_FILTER 48, 1
HEVC_SAO_BAND_FILTER 64, 2
%endif

;******************************************************************************
;SAO Edge Filter
;******************************************************************************

%define MAX_PB_SIZE  64
%define PADDING_SIZE 64 ; AV_INPUT_BUFFER_PADDING_SIZE
%define EDGE_SRCSTRIDE 2 * MAX_PB_SIZE + PADDING_SIZE

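; HEVC_SAO_EDGE_FILTER_INIT loads the {a_dx, a_dy, b_dx, b_dy} entry for the
; requested edge-offset class from pb_eo and converts it into the two byte
; offsets of the neighbour samples relative to the current one:
; a_stride = a_dy * EDGE_SRCSTRIDE + a_dx and
; b_stride = b_dy * EDGE_SRCSTRIDE + b_dx.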
%macro HEVC_SAO_EDGE_FILTER_INIT 0
%if WIN64
    movsxd           eoq, dword eom
%elif ARCH_X86_64
    movsxd           eoq, eod
%else
    mov              eoq, r4m
%endif
    lea            tmp2q, [pb_eo]
    movsx      a_strideq, byte [tmp2q+eoq*4+1]
    movsx      b_strideq, byte [tmp2q+eoq*4+3]
    imul       a_strideq, EDGE_SRCSTRIDE
    imul       b_strideq, EDGE_SRCSTRIDE
    movsx           tmpq, byte [tmp2q+eoq*4]
    add        a_strideq, tmpq
    movsx           tmpq, byte [tmp2q+eoq*4+2]
    add        b_strideq, tmpq
%endmacro

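; HEVC_SAO_EDGE_FILTER_COMPUTE: m1 holds the current samples, m2 and m3 the
; two neighbours.  pminub/pcmpeqb/psubb compute sign(cur - a) and
; sign(cur - b) per byte; adding pb_2 (m6) gives the edge index 0..4, which
; selects the per-pixel offset from m0 via pshufb.  The signed byte offset is
; then added to the unsigned samples by interleaving with pb_1 (m7) and using
; pmaddubsw, and the result is clamped to 8 bits with packuswb into m3.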
%macro HEVC_SAO_EDGE_FILTER_COMPUTE 1
    pminub            m4, m1, m2
    pminub            m5, m1, m3
    pcmpeqb           m2, m4
    pcmpeqb           m3, m5
    pcmpeqb           m4, m1
    pcmpeqb           m5, m1
    psubb             m4, m2
    psubb             m5, m3
    paddb             m4, m6
    paddb             m4, m5

    pshufb            m2, m0, m4
%if %1 > 8
    punpckhbw         m5, m7, m1
    punpckhbw         m4, m2, m7
    punpcklbw         m3, m7, m1
    punpcklbw         m2, m7
    pmaddubsw         m5, m4
    pmaddubsw         m3, m2
    packuswb          m3, m5
%else
    punpcklbw         m3, m7, m1
    punpcklbw         m2, m7
    pmaddubsw         m3, m2
    packuswb          m3, m3
%endif
%endmacro

;void ff_hevc_sao_edge_filter_<width>_8_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
;                                             int eo, int width, int height);
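; %1 is the block width in pixels, %2 the number of full-width vector
; iterations per row and %3 the store type (a: aligned, u: unaligned) used
; for them.  The sao_offset_val words are packed to bytes and reordered with
; pb_edge_shuffle before the loop; the source is read with the fixed
; EDGE_SRCSTRIDE, the destination advances by stride_dst.  For width 48 the
; trailing 16 pixels are processed with XMM registers even in the YMM build.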
%macro HEVC_SAO_EDGE_FILTER 2-3
%if ARCH_X86_64
cglobal hevc_sao_edge_filter_%1_8, 4, 9, 8, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp
%define tmp2q heightq
    HEVC_SAO_EDGE_FILTER_INIT
    mov          heightd, r6m

%else ; ARCH_X86_32
cglobal hevc_sao_edge_filter_%1_8, 1, 6, 8, dst, src, dststride, a_stride, b_stride, height
%define eoq   srcq
%define tmpq  heightq
%define tmp2q dststrideq
%define offsetq heightq
    HEVC_SAO_EDGE_FILTER_INIT
    mov             srcq, srcm
    mov          offsetq, r3m
    mov       dststrideq, dststridem
%endif ; ARCH

%if mmsize > 16
    vbroadcasti128    m0, [offsetq]
%else
    movu              m0, [offsetq]
%endif
    mova              m1, [pb_edge_shuffle]
    packsswb          m0, m0
    mova              m7, [pb_1]
    pshufb            m0, m1
    mova              m6, [pb_2]
%if ARCH_X86_32
    mov          heightd, r6m
%endif

align 16
.loop:

%if %1 == 8
    movq              m1, [srcq]
    movq              m2, [srcq + a_strideq]
    movq              m3, [srcq + b_strideq]
    HEVC_SAO_EDGE_FILTER_COMPUTE %1
    movq          [dstq], m3
%endif

%assign i 0
%rep %2
    mova              m1, [srcq + i]
    movu              m2, [srcq + a_strideq + i]
    movu              m3, [srcq + b_strideq + i]
    HEVC_SAO_EDGE_FILTER_COMPUTE %1
    mov%3     [dstq + i], m3
%assign i i+mmsize
%endrep

%if %1 == 48
INIT_XMM cpuname

    mova              m1, [srcq + i]
    movu              m2, [srcq + a_strideq + i]
    movu              m3, [srcq + b_strideq + i]
    HEVC_SAO_EDGE_FILTER_COMPUTE %1
    mova      [dstq + i], m3
%if cpuflag(avx2)
INIT_YMM cpuname
%endif
%endif

    add             dstq, dststrideq
    add             srcq, EDGE_SRCSTRIDE
    dec          heightd
    jg .loop
    RET
%endmacro

INIT_XMM ssse3
HEVC_SAO_EDGE_FILTER  8, 0
HEVC_SAO_EDGE_FILTER 16, 1, a
HEVC_SAO_EDGE_FILTER 32, 2, a
HEVC_SAO_EDGE_FILTER 48, 2, a
HEVC_SAO_EDGE_FILTER 64, 4, a

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
HEVC_SAO_EDGE_FILTER 32, 1, a
HEVC_SAO_EDGE_FILTER 48, 1, u
HEVC_SAO_EDGE_FILTER 64, 2, a
%endif