;******************************************************************************
;* SIMD-optimized HuffYUV functions
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2014 Christophe Gisquet
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

%include "libavcodec/x86/huffyuvdsp_template.asm"

;------------------------------------------------------------------------------
; void (*add_int16)(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
;------------------------------------------------------------------------------
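; Roughly the scalar operation being vectorized below (a sketch, not the exact
; C fallback; mask is assumed to confine the 16-bit sums to the coded bit depth):
;     for (int i = 0; i < w; i++)
;         dst[i] = (dst[i] + src[i]) & mask;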

%macro ADD_INT16 0
cglobal add_int16, 4,4,5, dst, src, mask, w, tmp
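; Use the aligned INT16_LOOP variant only when both src and dst are
; mmsize-aligned; otherwise fall through to the unaligned variant below.
; (The MMX build, mmsize == 8, always takes the first path.)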
%if mmsize > 8
    test srcq, mmsize-1
    jnz .unaligned
    test dstq, mmsize-1
    jnz .unaligned
%endif
    INT16_LOOP a, add
%if mmsize > 8
.unaligned:
    INT16_LOOP u, add
%endif
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
ADD_INT16
%endif

INIT_XMM sse2
ADD_INT16

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
ADD_INT16
%endif

; void add_hfyu_left_pred_bgr32(uint8_t *dst, const uint8_t *src,
;                               intptr_t w, uint8_t *left)
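; A scalar sketch of the prediction (an illustration, not the exact C
; reference): every byte of each 4-byte BGRA pixel accumulates the
; corresponding byte of the pixel to its left, modulo 256.
;     for (intptr_t i = 0; i < w; i++)
;         for (int c = 0; c < 4; c++)
;             dst[4 * i + c] = left[c] += src[4 * i + c];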
%macro LEFT_BGR32 0
cglobal add_hfyu_left_pred_bgr32, 4,4,3, dst, src, w, left
    shl           wq, 2
    movd          m0, [leftq]
    lea         dstq, [dstq + wq]
    lea         srcq, [srcq + wq]
    LSHIFT        m0, mmsize-4
    neg           wq
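; The running "left" pixel lives in the top dword of m0.  Inside each register
; load, the loop builds a per-byte prefix sum over the pixels by shifting the
; block up one pixel (4 bytes) and adding, in log2 steps, then adds the
; running left to all lanes and stores the result.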
.loop:
    movu          m1, [srcq+wq]
    mova          m2, m1
%if mmsize == 8
    punpckhdq     m0, m0
%endif
    LSHIFT        m1, 4
    paddb         m1, m2
%if mmsize == 16
    pshufd        m0, m0, q3333
    mova          m2, m1
    LSHIFT        m1, 8
    paddb         m1, m2
%endif
    paddb         m0, m1
    movu   [dstq+wq], m0
    add           wq, mmsize
    jl         .loop
    movd          m0, [dstq-4]
    movd     [leftq], m0
    REP_RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
LEFT_BGR32
%endif
INIT_XMM sse2
LEFT_BGR32

; void add_hfyu_median_pred_int16(uint16_t *dst, const uint16_t *top,
;                                 const uint16_t *diff, unsigned mask,
;                                 int w, int *left, int *left_top)
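; A scalar sketch of one output sample (an illustration only; mid_pred()
; picks the median of its three arguments):
;     l      = (mid_pred(l, top[i], (l + top[i] - lt) & mask) + diff[i]) & mask;
;     lt     = top[i];
;     dst[i] = l;
; where l/lt start from *left / *left_top and are written back on return.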
INIT_MMX mmxext
cglobal add_hfyu_median_pred_int16, 7,7,0, dst, top, diff, mask, w, left, left_top
    add      wd, wd
    movd    mm6, maskd
    SPLATW  mm6, mm6
    movq    mm0, [topq]
    movq    mm2, mm0
    movd    mm4, [left_topq]
    psllq   mm2, 16
    movq    mm1, mm0
    por     mm4, mm2
    movd    mm3, [leftq]
    psubw   mm0, mm4 ; t-tl
    add    dstq, wq
    add    topq, wq
    add   diffq, wq
    neg      wq
    jmp .skip
.loop:
    movq    mm4, [topq+wq]
    movq    mm0, mm4
    psllq   mm4, 16
    por     mm4, mm1
    movq    mm1, mm0 ; t
    psubw   mm0, mm4 ; t-tl
.skip:
    movq    mm2, [diffq+wq]
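; The four words of the register are processed serially: each result becomes
; the left predictor (mm3) for the next word, and the results are packed one
; word at a time into mm7 before being stored.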
%assign i 0
%rep 4
    movq    mm4, mm0
    paddw   mm4, mm3 ; t-tl+l
    pand    mm4, mm6
    movq    mm5, mm3
    pmaxsw  mm3, mm1
    pminsw  mm5, mm1
    pminsw  mm3, mm4
    pmaxsw  mm3, mm5 ; median
    paddw   mm3, mm2 ; +residual
    pand    mm3, mm6
%if i==0
    movq    mm7, mm3
    psllq   mm7, 48
%else
    movq    mm4, mm3
    psrlq   mm7, 16
    psllq   mm4, 48
    por     mm7, mm4
%endif
%if i<3
    psrlq   mm0, 16
    psrlq   mm1, 16
    psrlq   mm2, 16
%endif
%assign i i+1
%endrep
    movq [dstq+wq], mm7
    add      wq, 8
    jl .loop
    movzx   r2d, word [dstq-2]
    mov [leftq], r2d
    movzx   r2d, word [topq-2]
    mov [left_topq], r2d
    RET