;******************************************************************************
;* SIMD lossless video DSP utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2014 Michael Niedermayer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; pshufb masks used by add_hfyu_left_pred_int16 (an index of -1 selects zero)
pb_ef: times 8 db 14,15
pb_67: times 8 db  6, 7
pb_zzzz2323zzzzabab: db -1,-1,-1,-1, 2, 3, 2, 3,-1,-1,-1,-1,10,11,10,11
pb_zzzzzzzz67676767: db -1,-1,-1,-1,-1,-1,-1,-1, 6, 7, 6, 7, 6, 7, 6, 7

SECTION_TEXT

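; INT16_LOOP %1, %2 -- shared loop for add_int16/diff_int16 over 16-bit
; samples; %1 selects aligned/unaligned loads, %2 selects paddw/psubw.
; Roughly equivalent to this C (illustrative sketch only):
;     for (i = 0; i < w; i++)
;         dst[i] = (dst[i]  + src[i])  & mask;   // add variant
;         dst[i] = (src1[i] - src2[i]) & mask;   // sub variant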
%macro INT16_LOOP 2 ; %1 = a/u (aligned/unaligned), %2 = add/sub
    movd    m4, maskd
    SPLATW  m4, m4           ; broadcast the sample mask to every word lane
    add     wd, wd           ; w *= 2: work in bytes from here on
    test    wq, 2*mmsize - 1
    jz %%.tomainloop
    push  tmpq
%%.wordloop:
    ; handle the tail at the end of the buffer one word at a time until the
    ; remaining byte count is a multiple of 2*mmsize
    sub     wq, 2
%ifidn %2, add
    mov   tmpw, [srcq+wq]
    add   tmpw, [dstq+wq]
%else
    mov   tmpw, [src1q+wq]
    sub   tmpw, [src2q+wq]
%endif
    and   tmpw, maskw
    mov     [dstq+wq], tmpw
    test    wq, 2*mmsize - 1
    jnz %%.wordloop
    pop   tmpq
%%.tomainloop:
    ; advance the pointers to the end of the vector region and index backwards
%ifidn %2, add
    add     srcq, wq
%else
    add     src1q, wq
    add     src2q, wq
%endif
    add     dstq, wq
    neg     wq
    jz      %%.end
%%.loop:
    ; main loop: two registers (2*mmsize bytes) per iteration
%ifidn %2, add
    mov%1   m0, [srcq+wq]
    mov%1   m1, [dstq+wq]
    mov%1   m2, [srcq+wq+mmsize]
    mov%1   m3, [dstq+wq+mmsize]
%else
    mov%1   m0, [src1q+wq]
    mov%1   m1, [src2q+wq]
    mov%1   m2, [src1q+wq+mmsize]
    mov%1   m3, [src2q+wq+mmsize]
%endif
    p%2w    m0, m1           ; paddw / psubw
    p%2w    m2, m3
    pand    m0, m4           ; keep only the active bits of each sample
    pand    m2, m4
    mov%1   [dstq+wq]       , m0
    mov%1   [dstq+wq+mmsize], m2
    add     wq, 2*mmsize
    jl %%.loop
%%.end:
    RET
%endmacro

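; Expected C prototype:
;     void add_int16(uint16_t *dst, const uint16_t *src, unsigned mask, int w)
; Computes dst[i] = (dst[i] + src[i]) & mask for 0 <= i < w.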
INIT_MMX mmx
cglobal add_int16, 4,4,5, dst, src, mask, w, tmp
    INT16_LOOP a, add

INIT_XMM sse2
cglobal add_int16, 4,4,5, dst, src, mask, w, tmp
    test srcq, mmsize-1
    jnz .unaligned
    test dstq, mmsize-1
    jnz .unaligned
    INT16_LOOP a, add
.unaligned:
    INT16_LOOP u, add

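; Expected C prototype:
;     void diff_int16(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w)
; Computes dst[i] = (src1[i] - src2[i]) & mask for 0 <= i < w.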
INIT_MMX mmx
cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp
    INT16_LOOP a, sub

INIT_XMM sse2
cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp
    test src1q, mmsize-1
    jnz .unaligned
    test src2q, mmsize-1
    jnz .unaligned
    test dstq, mmsize-1
    jnz .unaligned
    INT16_LOOP a, sub
.unaligned:
    INT16_LOOP u, sub


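; ADD_HFYU_LEFT_LOOP_INT16 %1, %2 -- left prediction over 16-bit samples,
; i.e. a running prefix sum carried across the whole row. Roughly (sketch):
;     for (i = 0; i < w; i++)
;         left = dst[i] = (src[i] + left) & mask;
;     return left;
; The prefix sum is computed log-step inside each register (pslld/pshufb/
; paddw), the running 'left' is broadcast from the top word of the previous
; result (pb_67 for MMX, pb_ef for XMM), and the epilogue extracts the last
; output sample into eax as the new 'left'.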
%macro ADD_HFYU_LEFT_LOOP_INT16 2 ; %1 = dst alignment (a/u), %2 = src alignment (a/u)
    add     wd, wd
    add     srcq, wq
    add     dstq, wq
    neg     wq
%%.loop:
    mov%2   m1, [srcq+wq]
    mova    m2, m1
    pslld   m1, 16
    paddw   m1, m2           ; add each word to its left neighbour within a dword
    mova    m2, m1

    pshufb  m1, m3
    paddw   m1, m2           ; complete the prefix sum within each 64-bit group
    pshufb  m0, m5           ; broadcast the running 'left' (top word of previous result)
%if mmsize == 16
    mova    m2, m1
    pshufb  m1, m4
    paddw   m1, m2           ; add the low-half total to the high half (XMM only)
%endif
    paddw   m0, m1
    pand    m0, m7
%ifidn %1, a
    mova    [dstq+wq], m0
%else
    movq    [dstq+wq], m0
    movhps  [dstq+wq+8], m0
%endif
    add     wq, mmsize
    jl %%.loop
    ; extract the last output sample (the new 'left') into eax
    mov     eax, mmsize-1
    sub     eax, wd
    mov     wd, eax
    shl     wd, 8
    lea     eax, [wd+eax-1]
    movd    m1, eax
    pshufb  m0, m1
    movd    eax, m0
    RET
%endmacro

; int add_hfyu_left_pred_int16(uint16_t *dst, const uint16_t *src, unsigned mask, int w, int left)
INIT_MMX ssse3
cglobal add_hfyu_left_pred_int16, 4,4,8, dst, src, mask, w, left
.skip_prologue:
    mova    m5, [pb_67]
    mova    m3, [pb_zzzz2323zzzzabab]
    movd    m0, leftm
    psllq   m0, 48           ; place the initial 'left' in the top word of m0
    movd    m7, maskm
    SPLATW  m7, m7
    ADD_HFYU_LEFT_LOOP_INT16 a, a

INIT_XMM sse4
cglobal add_hfyu_left_pred_int16, 4,4,8, dst, src, mask, w, left
    mova    m5, [pb_ef]
    mova    m4, [pb_zzzzzzzz67676767]
    mova    m3, [pb_zzzz2323zzzzabab]
    movd    m0, leftm
    pslldq  m0, 14           ; place the initial 'left' in the top word of m0
    movd    m7, maskm
    SPLATW  m7, m7
    test    srcq, 15
    jnz .src_unaligned
    test    dstq, 15
    jnz .dst_unaligned
    ADD_HFYU_LEFT_LOOP_INT16 a, a
.dst_unaligned:
    ADD_HFYU_LEFT_LOOP_INT16 u, a
.src_unaligned:
    ADD_HFYU_LEFT_LOOP_INT16 u, u

; void add_hfyu_median_pred_int16(uint16_t *dst, const uint16_t *top, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top)
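; Median prediction reconstruction: for each sample,
;     pred   = median(l, t, l + t - tl)    (l = previous output, t = top,
;                                            tl = top-left)
;     dst[i] = (pred + diff[i]) & mask
; Four samples are handled per MMX register; because every output depends on
; the previous one, the %rep block below steps through the four lanes serially.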
INIT_MMX mmxext
cglobal add_hfyu_median_pred_int16, 7,7,0, dst, top, diff, mask, w, left, left_top
    add      wd, wd
    movd    mm6, maskd
    SPLATW  mm6, mm6
    movq    mm0, [topq]
    movq    mm2, mm0
    movd    mm4, [left_topq]
    psllq   mm2, 16
    movq    mm1, mm0
    por     mm4, mm2
    movd    mm3, [leftq]
    psubw   mm0, mm4 ; t-tl
    add    dstq, wq
    add    topq, wq
    add   diffq, wq
    neg      wq
    jmp .skip
.loop:
    movq    mm4, [topq+wq]
    movq    mm0, mm4
    psllq   mm4, 16
    por     mm4, mm1
    movq    mm1, mm0 ; t
    psubw   mm0, mm4 ; t-tl
.skip:
    movq    mm2, [diffq+wq]
; reconstruct 4 samples serially in the low word lane; each output becomes
; the 'left' of the next lane
%assign i 0
%rep 4
    movq    mm4, mm0
    paddw   mm4, mm3 ; t-tl+l
    pand    mm4, mm6
    movq    mm5, mm3
    pmaxsw  mm3, mm1
    pminsw  mm5, mm1
    pminsw  mm3, mm4
    pmaxsw  mm3, mm5 ; median
    paddw   mm3, mm2 ; +residual
    pand    mm3, mm6
%if i==0
    movq    mm7, mm3
    psllq   mm7, 48
%else
    movq    mm4, mm3
    psrlq   mm7, 16
    psllq   mm4, 48
    por     mm7, mm4
%endif
%if i<3
    psrlq   mm0, 16
    psrlq   mm1, 16
    psrlq   mm2, 16
%endif
%assign i i+1
%endrep
    movq [dstq+wq], mm7
    add      wq, 8
    jl .loop
    movzx   r2d, word [dstq-2]
    mov [leftq], r2d          ; *left = last output sample
    movzx   r2d, word [topq-2]
    mov [left_topq], r2d      ; *left_top = last sample of the top row
    RET

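; Expected C prototype:
;     void sub_hfyu_median_pred_int16(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w, int *left, int *left_top)
; Computes the residual dst[i] = (src2[i] - median(l, t, l + t - tl)) & mask,
; with t = src1[i], and updates *left/*left_top from the last column.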
cglobal sub_hfyu_median_pred_int16, 7,7,0, dst, src1, src2, mask, w, left, left_top
    add      wd, wd
    movd    mm7, maskd
    SPLATW  mm7, mm7
    movq    mm0, [src1q]
    movq    mm2, [src2q]
    psllq   mm0, 16
    psllq   mm2, 16
    movd    mm6, [left_topq]
    por     mm0, mm6          ; mm0 = top row shifted by one sample (the tl lanes)
    movd    mm6, [leftq]
    por     mm2, mm6          ; mm2 = current row shifted by one sample (the l lanes)
    xor     maskq, maskq      ; reuse the mask register as the byte index
.loop:
    movq    mm1, [src1q + maskq]
    movq    mm3, [src2q + maskq]
    movq    mm4, mm2
    psubw   mm2, mm0
    paddw   mm2, mm1          ; l + t - tl
    pand    mm2, mm7
    movq    mm5, mm4
    pmaxsw  mm4, mm1
    pminsw  mm1, mm5
    pminsw  mm4, mm2
    pmaxsw  mm4, mm1          ; median of (l, t, l+t-tl)
    psubw   mm3, mm4          ; residual = current - prediction
    pand    mm3, mm7
    movq    [dstq + maskq], mm3
    add     maskq, 8
    movq    mm0, [src1q + maskq - 2]
    movq    mm2, [src2q + maskq - 2]
    cmp     maskq, wq
    jb .loop
    movzx maskd, word [src1q + wq - 2]
    mov [left_topq], maskd    ; *left_top = last sample of the top row
    movzx maskd, word [src2q + wq - 2]
    mov [leftq], maskd        ; *left = last sample of the current row
    RET