1;******************************************************************************
2;* SIMD optimized SAO functions for HEVC 10/12bit decoding
3;*
4;* Copyright (c) 2013 Pierre-Edouard LEPERE
5;* Copyright (c) 2014 James Almer
6;*
7;* This file is part of FFmpeg.
8;*
9;* FFmpeg is free software; you can redistribute it and/or
10;* modify it under the terms of the GNU Lesser General Public
11;* License as published by the Free Software Foundation; either
12;* version 2.1 of the License, or (at your option) any later version.
13;*
14;* FFmpeg is distributed in the hope that it will be useful,
15;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17;* Lesser General Public License for more details.
18;*
19;* You should have received a copy of the GNU Lesser General Public
20;* License along with FFmpeg; if not, write to the Free Software
21;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22;******************************************************************************
23
24%include "libavutil/x86/x86util.asm"
25
26SECTION_RODATA 32
27
; Read-only constants for the SAO filters in this file. Each pw_* table holds
; 16 words (32 bytes).
;
; pw_m2: the word -2 in every lane; compared against the edge class sum in
; HEVC_SAO_EDGE_FILTER.
28pw_m2:     times 16 dw -2
; pw_mask10 / pw_mask12: 2^10-1 and 2^12-1 in every lane; used as the upper
; clipping bound for 10-bit and 12-bit output respectively.
29pw_mask10: times 16 dw 0x03FF
30pw_mask12: times 16 dw 0x0FFF
; pb_eo: four signed bytes per edge-offset class (index eo*4). As read by
; HEVC_SAO_EDGE_FILTER_INIT, bytes 0/1 are the horizontal/vertical step to
; neighbour "a" and bytes 2/3 the horizontal/vertical step to neighbour "b":
;   eo 0: (-1, 0) ( 1, 0)    eo 1: ( 0,-1) ( 0, 1)
;   eo 2: (-1,-1) ( 1, 1)    eo 3: ( 1,-1) (-1, 1)
31pb_eo:              db -1, 0, 1, 0, 0, -1, 0, 1, -1, -1, 1, 1, 1, -1, -1, 1
; Word constants that are declared here but defined elsewhere.
32cextern pw_m1
33cextern pw_1
34cextern pw_2
35
36SECTION .text
37
38;******************************************************************************
39;SAO Band Filter
40;******************************************************************************
41
; HEVC_SAO_BAND_FILTER_INIT depth
; Shared prologue of the band filter functions.
;   %1 = bit depth (10 or 12); selects the pw_mask<depth> clipping constant.
; Must be expanded while the argument names dst, src, dststride, srcstride,
; offset and left are in effect (see the cglobal line of HEVC_SAO_BAND_FILTER).
; Result:
;   m0-m3   four band indices left, left+1, left+2, left+3, each reduced
;           modulo 32, splatted with SPLATW (x86util.asm)
;   m4-m7   the 16-bit values at offsetq+2, +4, +6 and +8, splatted likewise
;   m13     pw_mask<depth>, m14 = zero (the operands later handed to CLIPW)
;   heightd loaded from r7m; it reuses the register that carried "left"
; On x86-32 only eight vector registers exist, so m0-m6 are written to the
; stack slots [rsp+mmsize*0..6] (m7 stays in its register) and m13, m14, m9,
; m8 are re-pointed at m1, m0, m2, m3.
42%macro HEVC_SAO_BAND_FILTER_INIT 1
    ; Build the four consecutive band indices; "and 31" wraps them into 0..31.
43    and            leftq, 31
44    movd             xm0, leftd
45    add            leftq, 1
46    and            leftq, 31
47    movd             xm1, leftd
48    add            leftq, 1
49    and            leftq, 31
50    movd             xm2, leftd
51    add            leftq, 1
52    and            leftq, 31
53    movd             xm3, leftd
54
    ; Replicate each band index across the whole vector.
55    SPLATW            m0, xm0
56    SPLATW            m1, xm1
57    SPLATW            m2, xm2
58    SPLATW            m3, xm3
    ; Replicate the four offset words that start 2 bytes into the offset table
    ; (the word at offsetq+0 is not used by the band filter).
59%if mmsize > 16
60    SPLATW            m4, [offsetq + 2]
61    SPLATW            m5, [offsetq + 4]
62    SPLATW            m6, [offsetq + 6]
63    SPLATW            m7, [offsetq + 8]
64%else
    ; One 8-byte load fetches all four words; SPLATW then picks word 0..3.
    ; m7 is splatted last because it is also the source register.
65    movq              m7, [offsetq + 2]
66    SPLATW            m4, m7, 0
67    SPLATW            m5, m7, 1
68    SPLATW            m6, m7, 2
69    SPLATW            m7, m7, 3
70%endif
71
72%if ARCH_X86_64
    ; Clipping bounds: m13 = maximum sample value, m14 = 0.
73    mova             m13, [pw_mask %+ %1]
74    pxor             m14, m14
75
76%else ; ARCH_X86_32
    ; Spill band indices (slots 0-3) and the first three offsets (slots 4-6)
    ; so that m0-m6 can be reused; this matches the 7*mmsize bytes reserved by
    ; the cglobal line of HEVC_SAO_BAND_FILTER.
77    mova  [rsp+mmsize*0], m0
78    mova  [rsp+mmsize*1], m1
79    mova  [rsp+mmsize*2], m2
80    mova  [rsp+mmsize*3], m3
81    mova  [rsp+mmsize*4], m4
82    mova  [rsp+mmsize*5], m5
83    mova  [rsp+mmsize*6], m6
    ; Clipping bounds now live in m1 (mask) and m0 (zero); the register names
    ; used by the shared loop body are redirected to the low registers.
84    mova              m1, [pw_mask %+ %1]
85    pxor              m0, m0
86    %define m14 m0
87    %define m13 m1
88    %define  m9 m2
89    %define  m8 m3
90%endif ; ARCH
    ; "left" is no longer needed: rename its register to height and fetch the
    ; row count from argument slot 7.
91DEFINE_ARGS dst, src, dststride, srcstride, offset, height
92    mov          heightd, r7m
93%endmacro
94
95;void ff_hevc_sao_band_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
96;                                                   int16_t *sao_offset_val, int sao_left_class, int width, int height);
;
; HEVC_SAO_BAND_FILTER depth, width, nvec
;   %1 = bit depth (10 or 12)
;   %2 = block width in samples; only used to form the function name
;   %3 = number of mmsize-byte vectors processed per row (unroll count)
; Emits one function that, for every row, adds the matching band offset to
; each 16-bit sample and clips the result with CLIPW (x86util.asm) using a
; zero register and pw_mask<depth>.
; The "width" argument of the C prototype is never read here; the row width
; is fixed by %3.
; Rows are read from srcq and written to dstq with mova (aligned access).
97%macro HEVC_SAO_BAND_FILTER 3
    ; 6 arguments in registers, 6 GPRs, 15 vector registers; on x86-32 an
    ; additional 7*mmsize bytes of stack hold the spilled constants.
98cglobal hevc_sao_band_filter_%2_%1, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left
99    HEVC_SAO_BAND_FILTER_INIT %1
100
101align 16
102.loop:
103
    ; i = byte offset of the current vector inside the row.
    ; j = unroll counter; k and l swap between 8 and 9 on every step so that
    ;     consecutive steps use different sample / band-index registers.
104%assign i 0
105%assign j 0
106%rep %3
107%assign k 8+(j&1)
108%assign l 9-(j&1)
    ; m<k> = samples, m<l> = samples >> (depth-5), i.e. the band number that
    ; is compared against the four indices prepared by the INIT macro.
109    mova          m %+ k, [srcq + i]
110    psraw         m %+ l, m %+ k, %1-5
111%if ARCH_X86_64
    ; One compare mask per band index. The four indices are distinct, so at
    ; most one mask is set per lane and OR-ing the masked offsets selects the
    ; offset to apply (or zero when no band matches).
112    pcmpeqw          m10, m %+ l, m0
113    pcmpeqw          m11, m %+ l, m1
114    pcmpeqw          m12, m %+ l, m2
115    pcmpeqw       m %+ l, m3
116    pand             m10, m4
117    pand             m11, m5
118    pand             m12, m6
119    pand          m %+ l, m7
120    por              m10, m11
121    por              m12, m %+ l
122    por              m10, m12
123    paddw         m %+ k, m10
124%else ; ARCH_X86_32
    ; Same selection, but band indices and the first three offsets are read
    ; back from the stack slots filled by the INIT macro; m4-m6 are scratch
    ; and m7 still holds the fourth offset.
125    pcmpeqw           m4, m %+ l, [rsp+mmsize*0]
126    pcmpeqw           m5, m %+ l, [rsp+mmsize*1]
127    pcmpeqw           m6, m %+ l, [rsp+mmsize*2]
128    pcmpeqw       m %+ l, [rsp+mmsize*3]
129    pand              m4, [rsp+mmsize*4]
130    pand              m5, [rsp+mmsize*5]
131    pand              m6, [rsp+mmsize*6]
132    pand          m %+ l, m7
133    por               m4, m5
134    por               m6, m %+ l
135    por               m4, m6
136    paddw         m %+ k, m4
137%endif ; ARCH
    ; Clip with m14 (zero) and m13 (pw_mask<depth>) and store.
138    CLIPW             m %+ k, m14, m13
139    mova      [dstq + i], m %+ k
140%assign i i+mmsize
141%assign j j+1
142%endrep
143
    ; Next row; loop while the decremented height is still positive.
144    add             dstq, dststrideq
145    add             srcq, srcstrideq
146    dec          heightd
147    jg .loop
148    REP_RET
149%endmacro
150
; HEVC_SAO_BAND_FILTER_FUNCS
; Instantiates the band filter for both bit depths (10, 12) and the widths
; 8, 16, 32, 48 and 64 under the currently selected instruction set.
; The third argument is the per-row vector count for 16-byte registers:
; width samples * 2 bytes / 16.
151%macro HEVC_SAO_BAND_FILTER_FUNCS 0
152HEVC_SAO_BAND_FILTER 10,  8, 1
153HEVC_SAO_BAND_FILTER 10, 16, 2
154HEVC_SAO_BAND_FILTER 10, 32, 4
155HEVC_SAO_BAND_FILTER 10, 48, 6
156HEVC_SAO_BAND_FILTER 10, 64, 8
157
158HEVC_SAO_BAND_FILTER 12,  8, 1
159HEVC_SAO_BAND_FILTER 12, 16, 2
160HEVC_SAO_BAND_FILTER 12, 32, 4
161HEVC_SAO_BAND_FILTER 12, 48, 6
162HEVC_SAO_BAND_FILTER 12, 64, 8
163%endmacro
164
; Band filter instantiations.
; SSE2 and AVX builds use 16-byte registers for every width.
165INIT_XMM sse2
166HEVC_SAO_BAND_FILTER_FUNCS
167INIT_XMM avx
168HEVC_SAO_BAND_FILTER_FUNCS
169
; AVX2 builds: the 8-sample variant covers only 16 bytes per row and therefore
; stays on xmm registers; wider variants use 32-byte ymm registers, which
; halves the vector count compared with the list above.
170%if HAVE_AVX2_EXTERNAL
171INIT_XMM avx2
172HEVC_SAO_BAND_FILTER 10,  8, 1
173INIT_YMM avx2
174HEVC_SAO_BAND_FILTER 10, 16, 1
175HEVC_SAO_BAND_FILTER 10, 32, 2
176HEVC_SAO_BAND_FILTER 10, 48, 3
177HEVC_SAO_BAND_FILTER 10, 64, 4
178
179INIT_XMM avx2
180HEVC_SAO_BAND_FILTER 12,  8, 1
181INIT_YMM avx2
182HEVC_SAO_BAND_FILTER 12, 16, 1
183HEVC_SAO_BAND_FILTER 12, 32, 2
184HEVC_SAO_BAND_FILTER 12, 48, 3
185HEVC_SAO_BAND_FILTER 12, 64, 4
186%endif
187
188;******************************************************************************
189;SAO Edge Filter
190;******************************************************************************
191
; Geometry of the edge filter source buffer.
; EDGE_SRCSTRIDE is the fixed distance in bytes between two source rows; the
; edge filter advances srcq by this amount instead of taking a stride argument.
; The expression is parenthesised so that uses such as "EDGE_SRCSTRIDE >> 1"
; do not depend on operator precedence at the point of expansion.
192%define MAX_PB_SIZE  64
193%define PADDING_SIZE 64 ; AV_INPUT_BUFFER_PADDING_SIZE
194%define EDGE_SRCSTRIDE (2 * MAX_PB_SIZE + PADDING_SIZE)
195
; PMINUW dst, a, b, tmp
; Unsigned word-wise minimum: dst = min(a, b).
;   %1 = destination, %2 / %3 = sources, %4 = scratch register
; With SSE4 the native pminuw is used and %4 is left untouched. Otherwise the
; minimum is derived from a saturating subtract: tmp = max(a - b, 0), then
; dst = a - tmp, which is b where a > b and a elsewhere.
196%macro PMINUW 4
197%if cpuflag(sse4)
198    pminuw            %1, %2, %3
199%else
200    psubusw           %4, %2, %3
201    psubw             %1, %2, %4
202%endif
203%endmacro
204
; HEVC_SAO_EDGE_FILTER_INIT
; Converts the edge-offset class "eo" into the two neighbour displacements.
; Relies on the caller having the names eoq, tmpq, tmp2q, a_strideq and
; b_strideq mapped to registers (HEVC_SAO_EDGE_FILTER sets up the aliases).
; On exit, for each neighbour n in {a, b}:
;   n_stride = pb_eo_vertical(n) * (EDGE_SRCSTRIDE >> 1) + pb_eo_horizontal(n)
; The halved row stride means the result is not yet a byte offset; the caller
; doubles a_strideq and b_strideq afterwards.
; Clobbers tmpq and tmp2q.
205%macro HEVC_SAO_EDGE_FILTER_INIT 0
    ; Fetch eo: from its argument slot on Win64, from its register on other
    ; x86-64 targets, and from stack argument 4 on x86-32.
206%if WIN64
207    movsxd           eoq, dword eom
208%elif ARCH_X86_64
209    movsxd           eoq, eod
210%else
211    mov              eoq, r4m
212%endif
    ; tmp2q = base of the pb_eo table (4 bytes per class).
213    lea            tmp2q, [pb_eo]
    ; Vertical steps (bytes 1 and 3), scaled by the halved row stride.
214    movsx      a_strideq, byte [tmp2q+eoq*4+1]
215    movsx      b_strideq, byte [tmp2q+eoq*4+3]
216    imul       a_strideq, EDGE_SRCSTRIDE >> 1
217    imul       b_strideq, EDGE_SRCSTRIDE >> 1
    ; Add the horizontal steps (bytes 0 and 2).
218    movsx           tmpq, byte [tmp2q+eoq*4]
219    add        a_strideq, tmpq
220    movsx           tmpq, byte [tmp2q+eoq*4+2]
221    add        b_strideq, tmpq
222%endmacro
223
224;void ff_hevc_sao_edge_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
225;                                                   int eo, int width, int height);
;
; HEVC_SAO_EDGE_FILTER depth, width, nvec
;   %1 = bit depth (10 or 12); selects pw_mask<depth> as upper clipping bound
;   %2 = block width in samples; only used to form the function name
;   %3 = number of mmsize-byte vectors processed per row (unroll count)
; For every sample c the two neighbours a and b chosen by "eo" are compared
; with c, giving sign(c - a) + sign(c - b) in the range -2..2. That sum picks
; one entry of the offset table, which is added to c before clipping:
;   sum -2 -> word at offset+2      sum -1 -> word at offset+4
;   sum  0 -> word at offset+0
;   sum +1 -> word at offset+6      sum +2 -> word at offset+8
; The source advances by the fixed EDGE_SRCSTRIDE per row; the "width"
; argument of the C prototype is never read.
226%macro HEVC_SAO_EDGE_FILTER 3
227%if ARCH_X86_64
    ; 4 arguments loaded into registers, 9 GPRs, 16 vector registers.
228cglobal hevc_sao_edge_filter_%2_%1, 4, 9, 16, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp
    ; The height register is free until the row count is loaded below, so the
    ; INIT macro may use it as its second temporary.
229%define tmp2q heightq
230    HEVC_SAO_EDGE_FILTER_INIT
231    mov          heightd, r6m
    ; Double the displacements to turn them into byte offsets.
232    add        a_strideq, a_strideq
233    add        b_strideq, b_strideq
234
235%else ; ARCH_X86_32
    ; Only dst is loaded up front; 6 GPRs, 8 vector registers and 5*mmsize
    ; bytes of stack for the splatted offsets.
236cglobal hevc_sao_edge_filter_%2_%1, 1, 6, 8, 5*mmsize, dst, src, dststride, a_stride, b_stride, height
    ; Registers are scarce: eo borrows src, the INIT temporaries borrow height
    ; and dststride, and offset shares the height register. The real values
    ; are loaded only after the borrowed uses are finished.
237%define eoq   srcq
238%define tmpq  heightq
239%define tmp2q dststrideq
240%define offsetq heightq
    ; m8-m12 name the same registers as m1-m5; their contents are saved to
    ; the stack before the loop reuses m1-m5 as scratch.
241%define m8 m1
242%define m9 m2
243%define m10 m3
244%define m11 m4
245%define m12 m5
246    HEVC_SAO_EDGE_FILTER_INIT
247    mov             srcq, srcm
248    mov          offsetq, r3m
249    mov       dststrideq, dststridem
    ; Double the displacements to turn them into byte offsets.
250    add        a_strideq, a_strideq
251    add        b_strideq, b_strideq
252
253%endif ; ARCH
254
    ; Splat the five offset words: m8 = [offset+2], m9 = [offset+4],
    ; m10 = [offset+0], m11 = [offset+6], m12 = [offset+8].
255%if mmsize > 16
256    SPLATW            m8, [offsetq+2]
257    SPLATW            m9, [offsetq+4]
258    SPLATW           m10, [offsetq+0]
259    SPLATW           m11, [offsetq+6]
260    SPLATW           m12, [offsetq+8]
261%else
    ; Two loads cover the table: the 8 bytes at offset+0 and the 4 bytes at
    ; offset+6. m10 and m12 are splatted after the registers derived from
    ; them because they are also the sources.
262    movq             m10, [offsetq+0]
263    movd             m12, [offsetq+6]
264    SPLATW            m8, xm10, 1
265    SPLATW            m9, xm10, 2
266    SPLATW           m10, xm10, 0
267    SPLATW           m11, xm12, 0
268    SPLATW           m12, xm12, 1
269%endif
    ; m0 = 0: lower clipping bound and comparison value for sum == 0.
270    pxor              m0, m0
271%if ARCH_X86_64
    ; Keep the comparison constants -1, 1 and 2 in registers.
272    mova             m13, [pw_m1]
273    mova             m14, [pw_1]
274    mova             m15, [pw_2]
275%else
    ; The offset table has been consumed, so the register shared by offsetq
    ; and heightq can now take the row count. The splatted offsets go to the
    ; stack because m1-m5 are overwritten inside the loop.
276    mov          heightd, r6m
277    mova  [rsp+mmsize*0], m8
278    mova  [rsp+mmsize*1], m9
279    mova  [rsp+mmsize*2], m10
280    mova  [rsp+mmsize*3], m11
281    mova  [rsp+mmsize*4], m12
282%endif
283
284align 16
285.loop:
286
    ; i = byte offset of the current vector inside the row.
287%assign i 0
288%rep %3
    ; m1 = c (aligned load), m2 = a, m3 = b (unaligned loads).
289    mova              m1, [srcq + i]
290    movu              m2, [srcq+a_strideq + i]
291    movu              m3, [srcq+b_strideq + i]
    ; m4 = min(c, a), m5 = min(c, b) (unsigned); m6 / m7 are scratch.
292    PMINUW            m4, m1, m2, m6
293    PMINUW            m5, m1, m3, m7
    ; m2 = -1 where a <= c, m4 = -1 where c <= a, so m4 - m2 = sign(c - a);
    ; likewise m5 - m3 = sign(c - b).
294    pcmpeqw           m2, m4
295    pcmpeqw           m3, m5
296    pcmpeqw           m4, m1
297    pcmpeqw           m5, m1
298    psubw             m4, m2
299    psubw             m5, m3
300
    ; m4 = sign(c - a) + sign(c - b), a value in -2..2.
301    paddw             m4, m5
    ; One compare mask per possible sum; each mask keeps its own offset.
302    pcmpeqw           m2, m4, [pw_m2]
303%if ARCH_X86_64
304    pcmpeqw           m3, m4, m13
305    pcmpeqw           m5, m4, m0
306    pcmpeqw           m6, m4, m14
307    pcmpeqw           m7, m4, m15
308    pand              m2, m8
309    pand              m3, m9
310    pand              m5, m10
311    pand              m6, m11
312    pand              m7, m12
313%else
    ; Same selection with constants and offsets taken from memory.
314    pcmpeqw           m3, m4, [pw_m1]
315    pcmpeqw           m5, m4, m0
316    pcmpeqw           m6, m4, [pw_1]
317    pcmpeqw           m7, m4, [pw_2]
318    pand              m2, [rsp+mmsize*0]
319    pand              m3, [rsp+mmsize*1]
320    pand              m5, [rsp+mmsize*2]
321    pand              m6, [rsp+mmsize*3]
322    pand              m7, [rsp+mmsize*4]
323%endif
    ; Exactly one mask is set per lane, so the sum of the five masked
    ; registers is the selected offset; add the sample, clip and store.
324    paddw             m2, m3
325    paddw             m5, m6
326    paddw             m2, m7
327    paddw             m2, m1
328    paddw             m2, m5
329    CLIPW             m2, m0, [pw_mask %+ %1]
330    mova      [dstq + i], m2
331%assign i i+mmsize
332%endrep
333
    ; Next row; loop while the decremented height is still positive.
334    add             dstq, dststrideq
335    add             srcq, EDGE_SRCSTRIDE
336    dec          heightd
337    jg .loop
338    RET
339%endmacro
340
; Edge filter instantiations (depth, width, vectors per row).
; SSE2: 16-byte registers, vector count = width * 2 bytes / 16.
341INIT_XMM sse2
342HEVC_SAO_EDGE_FILTER 10,  8, 1
343HEVC_SAO_EDGE_FILTER 10, 16, 2
344HEVC_SAO_EDGE_FILTER 10, 32, 4
345HEVC_SAO_EDGE_FILTER 10, 48, 6
346HEVC_SAO_EDGE_FILTER 10, 64, 8
347
348HEVC_SAO_EDGE_FILTER 12,  8, 1
349HEVC_SAO_EDGE_FILTER 12, 16, 2
350HEVC_SAO_EDGE_FILTER 12, 32, 4
351HEVC_SAO_EDGE_FILTER 12, 48, 6
352HEVC_SAO_EDGE_FILTER 12, 64, 8
353
; AVX2: the 8-sample variant stays on xmm registers (one 16-byte row); wider
; variants use 32-byte ymm registers with half the vector count.
354%if HAVE_AVX2_EXTERNAL
355INIT_XMM avx2
356HEVC_SAO_EDGE_FILTER 10,  8, 1
357INIT_YMM avx2
358HEVC_SAO_EDGE_FILTER 10, 16, 1
359HEVC_SAO_EDGE_FILTER 10, 32, 2
360HEVC_SAO_EDGE_FILTER 10, 48, 3
361HEVC_SAO_EDGE_FILTER 10, 64, 4
362
363INIT_XMM avx2
364HEVC_SAO_EDGE_FILTER 12,  8, 1
365INIT_YMM avx2
366HEVC_SAO_EDGE_FILTER 12, 16, 1
367HEVC_SAO_EDGE_FILTER 12, 32, 2
368HEVC_SAO_EDGE_FILTER 12, 48, 3
369HEVC_SAO_EDGE_FILTER 12, 64, 4
370%endif
371