1;*****************************************************************************
2;* SSE2-optimized weighted prediction code
3;*****************************************************************************
4;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
5;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
6;*
7;* This file is part of FFmpeg.
8;*
9;* FFmpeg is free software; you can redistribute it and/or
10;* modify it under the terms of the GNU Lesser General Public
11;* License as published by the Free Software Foundation; either
12;* version 2.1 of the License, or (at your option) any later version.
13;*
14;* FFmpeg is distributed in the hope that it will be useful,
15;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17;* Lesser General Public License for more details.
18;*
19;* You should have received a copy of the GNU Lesser General Public
20;* License along with FFmpeg; if not, write to the Free Software
21;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22;******************************************************************************
23
24%include "libavutil/x86/x86util.asm"
25
26SECTION .text
27
28;-----------------------------------------------------------------------------
29; biweight pred:
30;
31; void ff_h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
32;                               int height, int log2_denom, int weightd,
33;                               int weights, int offset);
34; and
35; void ff_h264_weight_16_sse2(uint8_t *dst, int stride, int height,
36;                             int log2_denom, int weight, int offset);
37;-----------------------------------------------------------------------------
38
%macro WEIGHT_SETUP 0
    ; Prepare the loop-invariant registers consumed by WEIGHT_OP.
    ; Inputs (see the weight prototype in the comment block above):
    ;   r3d = log2_denom, r4d = weight, r5 = offset
    ; Outputs:
    ;   m3 = low 16 bits of r4d, copied into every word lane
    ;   m5 = low 16 bits of (((2*r5 + 1) << r3d) >> 1), copied into every word lane
    ;   m6 = r3d, used later as the right-shift count
    ;   m7 = 0, used to widen bytes to words
    ; Side effect: r5 is left holding 2*r5 + 1.
    lea         r5, [r5*2+1]
    pxor        m7, m7
    movd        m6, r3d
    movd        m5, r5d
    movd        m3, r4d
    pslld       m5, m6
    psrld       m5, 1
%if mmsize != 16
    ; 8-byte registers: one shuffle fills all four words
    pshufw      m5, m5, 0
    pshufw      m3, m3, 0
%else
    ; 16-byte registers: fill the low four words, then mirror the low qword
    pshuflw     m5, m5, 0
    punpcklqdq  m5, m5
    pshuflw     m3, m3, 0
    punpcklqdq  m3, m3
%endif
%endmacro
58
%macro WEIGHT_OP 2
    ; Arguments: two offsets from r0, each naming a half-register group of pixels.
    ; Every pixel p is widened to 16 bits and turned into (p * m3 + m5) >> m6:
    ; low 16 bits of the product, signed-saturating add, arithmetic shift.
    ; Both groups are then packed with unsigned saturation into m0
    ; (first group in the low half, second group in the high half).
    ; Clobbers m1; expects m3/m5/m6/m7 as prepared by WEIGHT_SETUP.

    ; first group -> m0
    movh        m0, [r0+%1]
    punpcklbw   m0, m7
    pmullw      m0, m3
    paddsw      m0, m5
    psraw       m0, m6

    ; second group -> m1
    movh        m1, [r0+%2]
    punpcklbw   m1, m7
    pmullw      m1, m3
    paddsw      m1, m5
    psraw       m1, m6

    packuswb    m0, m1
%endmacro
72
; 16-pixel-wide weighted prediction using 8-byte MMX registers.
; r0 = pixel rows (rewritten in place), r1 = byte distance between rows,
; r2d = number of rows; r3..r5 are consumed by WEIGHT_SETUP.
INIT_MMX mmxext
cglobal h264_weight_16, 6, 6, 0
    WEIGHT_SETUP
.row_loop:
    WEIGHT_OP 0,  4                     ; pixels 0..7 of the row
    mova     [r0  ], m0
    WEIGHT_OP 8, 12                     ; pixels 8..15 of the row
    mova     [r0+8], m0
    add        r0, r1                   ; advance to the next row
    sub       r2d, 1
    jnz .row_loop
    REP_RET
85
%macro WEIGHT_FUNC_MM 2
    ; Emits h264_weight_<width> for a block exactly one SIMD register wide.
    ; First argument: width used in the symbol name.
    ; Second argument: number of SIMD registers passed on to cglobal.
    ; r0 = pixel rows (rewritten in place), r1 = byte distance between rows,
    ; r2d = number of rows.
cglobal h264_weight_%1, 6, 6, %2
    WEIGHT_SETUP
.row_loop:
    WEIGHT_OP 0, mmsize/2               ; both halves of one full register
    mova     [r0], m0
    add        r0, r1                   ; advance to the next row
    sub       r2d, 1
    jnz .row_loop
    REP_RET
%endmacro

; 8 pixels per row with MMX registers, 16 pixels per row with XMM registers
INIT_MMX mmxext
WEIGHT_FUNC_MM  8, 0
INIT_XMM sse2
WEIGHT_FUNC_MM 16, 8
102
%macro WEIGHT_FUNC_HALF_MM 2
    ; Emits h264_weight_<width> for a block half a SIMD register wide.
    ; Two rows are processed per iteration: row n fills the low half of m0
    ; and row n+1 the high half.
    ; First argument: width used in the symbol name.
    ; Second argument: number of SIMD registers passed on to cglobal.
cglobal h264_weight_%1, 6, 6, %2
    WEIGHT_SETUP
    lea        r3, [r1+r1]              ; r3 = distance covered by two rows (r3 is free after setup)
    sar       r2d, 1                    ; iterations = rows / 2
.pair_loop:
    WEIGHT_OP 0, r1                     ; current row and the row below it
    movh     [r0], m0
%if mmsize != 16
    psrlq      m0, 32                   ; bring the second row down to the low dword
    movh     [r0+r1], m0
%else
    movhps   [r0+r1], m0                ; high qword holds the second row
%endif
    add        r0, r3
    sub       r2d, 1
    jnz .pair_loop
    REP_RET
%endmacro

; 4 pixels per row with MMX registers, 8 pixels per row with XMM registers
INIT_MMX mmxext
WEIGHT_FUNC_HALF_MM 4, 0
INIT_XMM sse2
WEIGHT_FUNC_HALF_MM 8, 8
127
%macro BIWEIGHT_SETUP 0
    ; Prepare the loop-invariant registers for the bi-weighted kernels.
    ; Inputs (see the biweight prototype in the comment block above):
    ;   r4d = log2_denom, r5d = weightd, r6d = weights, r7m = offset
    ; Outputs, all instruction sets:
    ;   m5 = rounding term ((off << shift) >> 1), low word copied to every word lane
    ;   m6 = shift count
    ; Outputs, SSSE3:
    ;   m4 = byte pair (low byte of r5d, low byte of r6d) copied to every word lane
    ;   m0 is clobbered
    ; Outputs, otherwise:
    ;   m3 = r5d and m4 = r6d, each copied to every word lane; m7 = 0
    ; The offset is staged in off_regd: r7d on x86-64, r3d on x86-32, which
    ; is why callers reload r3d from r3m after this macro.
%if ARCH_X86_64
%define off_regd r7d
%else
%define off_regd r3d
%endif
    inc       r4d                       ; shift = log2_denom + 1
    mov  off_regd, r7m
    inc  off_regd
    or   off_regd, 1                    ; off = (offset + 1) | 1
    ; If either weight equals 128, halve both weights and the offset and
    ; undo the shift increment.
    ; NOTE(review): presumably this keeps the weights inside the signed 8-bit
    ; range needed by the pmaddubsw path -- confirm.
    cmp       r5d, 128
    je .halve
    cmp       r6d, 128
    jne .scaled
.halve:
    sar       r5d, 1
    sar       r6d, 1
    sar  off_regd, 1
    dec       r4d
.scaled:
    movd       m6, r4d
    movd       m5, off_regd
    pslld      m5, m6
    psrld      m5, 1
%if cpuflag(ssse3)
    movd       m0, r6d
    movd       m4, r5d
    punpcklbw  m4, m0                   ; word 0 = (weightd byte, weights byte)
    pshuflw    m4, m4, 0
    punpcklqdq m4, m4
    pshuflw    m5, m5, 0
    punpcklqdq m5, m5
%else
    pxor       m7, m7
    movd       m3, r5d
    movd       m4, r6d
%if mmsize != 16
    pshufw     m3, m3, 0
    pshufw     m4, m4, 0
    pshufw     m5, m5, 0
%else
    pshuflw    m3, m3, 0
    punpcklqdq m3, m3
    pshuflw    m4, m4, 0
    punpcklqdq m4, m4
    pshuflw    m5, m5, 0
    punpcklqdq m5, m5
%endif
%endif
%endmacro
182
%macro BIWEIGHT_STEPA 3
    ; Arguments: destination register number, scratch register number,
    ; and an offset applied to both r0 and r1.
    ; Loads half a register of pixels from [r0+offset] and from [r1+offset],
    ; widens them to words (m7 must be zero), multiplies the r0 pixels by m3
    ; and the r1 pixels by m4, and leaves the saturated sum in the
    ; destination register. The scratch register is clobbered.

    ; pixels from r0, scaled by m3
    movh       m%1, [r0+%3]
    punpcklbw  m%1, m7
    pmullw     m%1, m3

    ; pixels from r1, scaled by m4
    movh       m%2, [r1+%3]
    punpcklbw  m%2, m7
    pmullw     m%2, m4

    paddsw     m%1, m%2
%endmacro
192
%macro BIWEIGHT_STEPB 0
    ; Finish two word vectors produced by BIWEIGHT_STEPA (in m0 and m1):
    ; add the rounding term m5 with signed saturation, shift right
    ; arithmetically by m6, then pack both into m0 as unsigned-saturated
    ; bytes (m0 words in the low half, m1 words in the high half).
    paddsw     m0, m5
    psraw      m0, m6

    paddsw     m1, m5
    psraw      m1, m6

    packuswb   m0, m1
%endmacro
200
; 16-pixel-wide bi-weighted prediction using 8-byte MMX registers.
; r0 = first pixel block (rewritten in place), r1 = second pixel block,
; r2 = byte distance between rows, r3d = number of rows.
INIT_MMX mmxext
cglobal h264_biweight_16, 7, 8, 0
    BIWEIGHT_SETUP
    movifnidn r3d, r3m                  ; BIWEIGHT_SETUP uses r3d as scratch on x86-32
.row_loop:
    ; pixels 0..7
    BIWEIGHT_STEPA 0, 1, 0
    BIWEIGHT_STEPA 1, 2, 4
    BIWEIGHT_STEPB
    mova       [r0], m0
    ; pixels 8..15
    BIWEIGHT_STEPA 0, 1, 8
    BIWEIGHT_STEPA 1, 2, 12
    BIWEIGHT_STEPB
    mova     [r0+8], m0
    ; step both blocks to the next row
    add        r1, r2
    add        r0, r2
    sub       r3d, 1
    jnz .row_loop
    REP_RET
219
%macro BIWEIGHT_FUNC_MM 2
    ; Emits h264_biweight_<width> for a block exactly one SIMD register wide.
    ; First argument: width used in the symbol name.
    ; Second argument: number of SIMD registers passed on to cglobal.
    ; r0 = first pixel block (rewritten in place), r1 = second pixel block,
    ; r2 = byte distance between rows, r3d = number of rows.
cglobal h264_biweight_%1, 7, 8, %2
    BIWEIGHT_SETUP
    movifnidn r3d, r3m                  ; BIWEIGHT_SETUP uses r3d as scratch on x86-32
.row_loop:
    BIWEIGHT_STEPA 0, 1, 0              ; low half of the row  -> m0
    BIWEIGHT_STEPA 1, 2, mmsize/2       ; high half of the row -> m1
    BIWEIGHT_STEPB
    mova       [r0], m0
    add        r1, r2
    add        r0, r2
    sub       r3d, 1
    jnz .row_loop
    REP_RET
%endmacro

; 8 pixels per row with MMX registers, 16 pixels per row with XMM registers
INIT_MMX mmxext
BIWEIGHT_FUNC_MM  8, 0
INIT_XMM sse2
BIWEIGHT_FUNC_MM 16, 8
240
%macro BIWEIGHT_FUNC_HALF_MM 2
    ; Emits h264_biweight_<width> for a block half a SIMD register wide.
    ; Two rows are processed per iteration: row n fills the low half of m0
    ; and row n+1 the high half.
    ; First argument: width used in the symbol name.
    ; Second argument: number of SIMD registers passed on to cglobal.
    ; r0 = first pixel block (rewritten in place), r1 = second pixel block,
    ; r2 = byte distance between rows, r3d = number of rows.
cglobal h264_biweight_%1, 7, 8, %2
    BIWEIGHT_SETUP
    movifnidn r3d, r3m                  ; BIWEIGHT_SETUP uses r3d as scratch on x86-32
    ; The row count is a 32-bit int, so halve only r3d. A full-width shift
    ; would move bit 32 of the register, which the x86-64 calling
    ; conventions leave undefined for int arguments, into bit 31 of the
    ; counter that "dec r3d" consumes below.
    sar       r3d, 1
    lea        r4, [r2*2]               ; r4 = distance covered by two rows (r4 is free after setup)
.nextrow:
    BIWEIGHT_STEPA 0, 1, 0              ; row n   -> m0
    BIWEIGHT_STEPA 1, 2, r2             ; row n+1 -> m1
    BIWEIGHT_STEPB
    movh       [r0], m0
%if mmsize == 16
    movhps     [r0+r2], m0              ; high qword holds row n+1
%else
    psrlq      m0, 32                   ; bring row n+1 down to the low dword
    movh       [r0+r2], m0
%endif
    add        r0, r4
    add        r1, r4
    dec        r3d
    jnz .nextrow
    REP_RET
%endmacro

; 4 pixels per row with MMX registers, 8 pixels per row with XMM registers
INIT_MMX mmxext
BIWEIGHT_FUNC_HALF_MM 4, 0
INIT_XMM sse2
BIWEIGHT_FUNC_HALF_MM 8, 8
269
%macro BIWEIGHT_SSSE3_OP 0
    ; m0 and m2 each hold pixel bytes from the two inputs interleaved pairwise.
    ; pmaddubsw multiplies every byte by the matching byte of m4 and sums
    ; each adjacent pair into a word. The rounding term m5 is then added
    ; with signed saturation, the result is shifted right arithmetically by
    ; m6, and both vectors are packed into m0 as unsigned-saturated bytes
    ; (m0 words in the low half, m2 words in the high half).
    pmaddubsw  m0, m4
    paddsw     m0, m5
    psraw      m0, m6

    pmaddubsw  m2, m4
    paddsw     m2, m5
    psraw      m2, m6

    packuswb   m0, m2
%endmacro
279
; 16-pixel-wide bi-weighted prediction using pmaddubsw on XMM registers.
; r0 = first pixel block (rewritten in place), r1 = second pixel block,
; r2 = byte distance between rows, r3d = number of rows.
INIT_XMM ssse3
cglobal h264_biweight_16, 7, 8, 8
    BIWEIGHT_SETUP
    movifnidn r3d, r3m                  ; BIWEIGHT_SETUP uses r3d as scratch on x86-32

.row_loop:
    ; pixels 8..15: interleave r0 bytes with r1 bytes
    movh       m2, [r0+8]
    movh       m3, [r1+8]
    punpcklbw  m2, m3
    ; pixels 0..7: the second input is read as a full-width memory operand
    ; NOTE(review): this form and the mova store below presumably require
    ; 16-byte aligned rows in both blocks -- confirm with callers.
    movh       m0, [r0]
    punpcklbw  m0, [r1]
    BIWEIGHT_SSSE3_OP
    mova       [r0], m0
    add        r1, r2
    add        r0, r2
    sub       r3d, 1
    jnz .row_loop
    REP_RET
298
; 8-pixel-wide bi-weighted prediction using pmaddubsw on XMM registers.
; Two rows are processed per iteration.
; r0 = first pixel block (rewritten in place), r1 = second pixel block,
; r2 = byte distance between rows, r3d = number of rows.
INIT_XMM ssse3
cglobal h264_biweight_8, 7, 8, 8
    BIWEIGHT_SETUP
    movifnidn r3d, r3m                  ; BIWEIGHT_SETUP uses r3d as scratch on x86-32
    ; The row count is a 32-bit int, so halve only r3d. A full-width shift
    ; would move bit 32 of the register, which the x86-64 calling
    ; conventions leave undefined for int arguments, into bit 31 of the
    ; counter that "dec r3d" consumes below.
    sar       r3d, 1
    lea        r4, [r2*2]               ; r4 = distance covered by two rows (r4 is free after setup)

.nextrow:
    movh       m0, [r0]                 ; row n,   first block
    movh       m1, [r1]                 ; row n,   second block
    movh       m2, [r0+r2]              ; row n+1, first block
    movh       m3, [r1+r2]              ; row n+1, second block
    punpcklbw  m0, m1                   ; interleave the two blocks bytewise
    punpcklbw  m2, m3
    BIWEIGHT_SSSE3_OP
    movh       [r0], m0                 ; low qword  = row n
    movhps     [r0+r2], m0              ; high qword = row n+1
    add        r0, r4
    add        r1, r4
    dec        r3d
    jnz .nextrow
    REP_RET
321