1;******************************************************************************
2;* MMX-optimized H.263 loop filter
3;* Copyright (c) 2003-2013 Michael Niedermayer
4;* Copyright (c) 2013 Daniel Kang
5;*
6;* This file is part of FFmpeg.
7;*
8;* FFmpeg is free software; you can redistribute it and/or
9;* modify it under the terms of the GNU Lesser General Public
10;* License as published by the Free Software Foundation; either
11;* version 2.1 of the License, or (at your option) any later version.
12;*
13;* FFmpeg is distributed in the hope that it will be useful,
14;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16;* Lesser General Public License for more details.
17;*
18;* You should have received a copy of the GNU Lesser General Public
19;* License along with FFmpeg; if not, write to the Free Software
20;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21;******************************************************************************
22
23%include "libavutil/x86/x86util.asm"
24
25SECTION_RODATA
26cextern pb_FC  ; external constant, used below as a pand mask (defined outside this file)
27cextern h263_loop_filter_strength  ; external byte table, indexed by qscale in both filter entry points
28
29SECTION .text
30
31%macro H263_LOOP_FILTER 5
; Core of the H.263 loop filter, operating on four vectors of unsigned bytes.
;
; Arguments:
;   %1..%4  addresses of four consecutive pixel vectors across the edge
;           (called A, B, C, D in the comments below; the edge is between B and C)
;   %5      32-bit operand whose low byte is the filter limit; both callers
;           in this file pass 2 * strength
;
; Results (nothing is stored by the macro, callers write the registers back):
;   m3 = filtered B (%2)    m4 = filtered C (%3)
;   m5 = filtered A (%1)    m6 = filtered D (%4)
;
; Clobbers m0-m7.
32    pxor         m7, m7  ; m7 = 0, used to zero-extend bytes to words
33    mova         m0, [%1]
34    mova         m1, [%1]
35    mova         m2, [%4]
36    mova         m3, [%4]
37    punpcklbw    m0, m7  ; m0 = low  half of A as words
38    punpckhbw    m1, m7  ; m1 = high half of A as words
39    punpcklbw    m2, m7  ; m2 = low  half of D as words
40    punpckhbw    m3, m7  ; m3 = high half of D as words
41    psubw        m0, m2  ; m0 = A - D (low half)
42    psubw        m1, m3  ; m1 = A - D (high half)
43    mova         m2, [%2]
44    mova         m3, [%2]
45    mova         m4, [%3]
46    mova         m5, [%3]
47    punpcklbw    m2, m7  ; m2/m3 = B as words (low/high)
48    punpckhbw    m3, m7
49    punpcklbw    m4, m7  ; m4/m5 = C as words (low/high)
50    punpckhbw    m5, m7
51    psubw        m4, m2  ; m4/m5 = C - B
52    psubw        m5, m3
53    psllw        m4, 2  ; m4/m5 = 4 * (C - B)
54    psllw        m5, 2
55    paddw        m4, m0  ; m4/m5 = A - 4*B + 4*C - D
56    paddw        m5, m1
57    pxor         m6, m6
58    pcmpgtw      m6, m4  ; m6 = all-ones in lanes where the sum is negative (low half)
59    pcmpgtw      m7, m5  ; m7 (zero before this) = same sign mask for the high half
60    pxor         m4, m6  ; xor with mask then subtract mask: absolute value
61    pxor         m5, m7
62    psubw        m4, m6
63    psubw        m5, m7
64    psrlw        m4, 3  ; magnitude / 8
65    psrlw        m5, 3
66    packuswb     m4, m5  ; m4 = |d| as unsigned bytes, d = (A - 4*B + 4*C - D) / 8
67    packsswb     m6, m7  ; m6 = per-byte sign mask of d (0x00 or 0xFF)
68    pxor         m7, m7  ; m7 = 0 again
69    movd         m2, %5
70    punpcklbw    m2, m2  ; three byte-interleaves with itself replicate the
71    punpcklbw    m2, m2  ; low byte of %5 into all eight bytes of m2
72    punpcklbw    m2, m2
73    psubusb      m2, m4  ; m2 = max(limit - |d|, 0)
74    mova         m3, m2
75    psubusb      m3, m4  ; m3 = max(m2 - |d|, 0)
76    psubb        m2, m3  ; m2 = min(|d|, max(limit - |d|, 0)): magnitude of the B/C correction
77    mova         m3, [%2]
78    mova         m4, [%3]
79    pxor         m3, m6  ; complement B and C where d is negative so that the
80    pxor         m4, m6  ; saturating add/sub below acts in the opposite direction
81    paddusb      m3, m2  ; B += correction (d >= 0) / B -= correction (d < 0), saturated
82    psubusb      m4, m2  ; C -= correction (d >= 0) / C += correction (d < 0), saturated
83    pxor         m3, m6  ; undo the complement: m3 = filtered B
84    pxor         m4, m6  ; m4 = filtered C
85    paddusb      m2, m2  ; m2 = 2 * correction magnitude (saturated), bound for the outer rows
86    packsswb     m0, m1  ; m0 = A - D saturated to signed bytes
87    pcmpgtb      m7, m0  ; m7 = per-byte sign mask of (A - D)
88    pxor         m0, m7  ; m0 = |A - D| via xor/subtract with the sign mask
89    psubb        m0, m7
90    mova         m1, m0
91    psubusb      m0, m2  ; m0 = max(|A - D| - bound, 0)
92    psubb        m1, m0  ; m1 = min(|A - D|, bound)
93    pand         m1, [pb_FC]  ; presumably 0xFC per byte (TODO confirm): clears the low 2 bits so the word shift cannot leak across bytes
94    psrlw        m1, 2  ; m1 = min(|A - D|, bound) / 4
95    pxor         m1, m7  ; re-apply the sign of (A - D)
96    psubb        m1, m7
97    mova         m5, [%1]
98    mova         m6, [%4]
99    psubb        m5, m1  ; m5 = filtered A = A - delta
100    paddb        m6, m1  ; m6 = filtered D = D + delta
101%endmacro
102
103INIT_MMX mmx
104; void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
; Filters the four rows src-2*stride, src-stride, src and src+stride; the
; result of H263_LOOP_FILTER is written back to those same rows.
; Register roles: r0 = src, r1 = stride, r2 = qscale (then 2 * strength),
; r3/r4 = scratch. cglobal "3,5" follows the x86inc convention of
; (argument count, general-purpose registers used).
105cglobal h263_v_loop_filter, 3,5
106    movsxdifnidn r1, r1d  ; x86inc helper: widen the int stride to pointer width where needed
107    movsxdifnidn r2, r2d  ; same for qscale, which is used as a table index below
108
109    lea          r4, [h263_loop_filter_strength]
110    movzx       r3d, BYTE [r4+r2]  ; r3d = h263_loop_filter_strength[qscale]
111    movsx        r2, r3b  ; r2 = that byte, sign-extended
112    shl          r2, 1  ; r2 = 2 * strength, the limit passed to the filter macro
113
114    mov          r3, r0
115    sub          r3, r1  ; r3 = src - stride
116    mov          r4, r3
117    sub          r4, r1  ; r4 = src - 2*stride
118    H263_LOOP_FILTER r4, r3, r0, r0+r1, r2d  ; rows in top-to-bottom order, then the limit
119
; Write back the filtered rows; the macro leaves them in m3..m6.
120    mova       [r3], m3  ; row src - stride
121    mova       [r0], m4  ; row src
122    mova       [r4], m5  ; row src - 2*stride
123    mova    [r0+r1], m6  ; row src + stride
124    RET
125
126%macro TRANSPOSE4X4 2
; Transposes a 4x4 block of bytes.
;   %1  address of the first source row; rows are read at %1, %1+r1,
;       %1+r1*2 and %1+r3, so r1 and r3 must be set up by the caller
;       (h263_h_loop_filter sets r3 = 3*r1)
;   %2  destination base; source column k is written as 4 bytes at %2 + 8*k
; Clobbers m0-m3.
127    movd      m0, [%1]  ; row 0
128    movd      m1, [%1+r1]  ; row 1
129    movd      m2, [%1+r1*2]  ; row 2
130    movd      m3, [%1+r3]  ; row 3
131    punpcklbw m0, m1  ; interleave bytes of rows 0 and 1
132    punpcklbw m2, m3  ; interleave bytes of rows 2 and 3
133    mova      m1, m0
134    punpcklwd m0, m2  ; m0 = column 0 (low dword), column 1 (high dword)
135    punpckhwd m1, m2  ; m1 = column 2 (low dword), column 3 (high dword)
136    movd [%2+ 0], m0  ; column 0
137    punpckhdq m0, m0  ; move the high dword down so movd can store it
138    movd [%2+ 8], m0  ; column 1
139    movd [%2+16], m1  ; column 2
140    punpckhdq m1, m1
141    movd [%2+24], m1  ; column 3
142%endmacro
143
144
145; void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
; Works on the 4 bytes starting at src-2 in each of 8 consecutive rows.
; The 8x4 block is transposed into 32 bytes of stack scratch, filtered with
; H263_LOOP_FILTER, then transposed back and stored to the same locations.
; Register roles: r0 = src (then src - 2), r1 = stride, r2 = qscale (then
; 2 * strength), r3 = scratch (then 3*stride), r4 = scratch (then row 4 base).
; cglobal "3,5,0,32" follows the x86inc convention of (argument count,
; general-purpose registers, xmm registers, stack bytes).
146INIT_MMX mmx
147cglobal h263_h_loop_filter, 3,5,0,32
148    movsxdifnidn r1, r1d  ; x86inc helper: widen the int stride to pointer width where needed
149    movsxdifnidn r2, r2d  ; same for qscale, which is used as a table index below
150
151    lea          r4, [h263_loop_filter_strength]
152    movzx       r3d, BYTE [r4+r2]  ; r3d = h263_loop_filter_strength[qscale]
153    movsx        r2, r3b  ; r2 = that byte, sign-extended
154    shl          r2, 1  ; r2 = 2 * strength, the limit passed to the filter macro
155
156    sub          r0, 2  ; r0 = src - 2: first of the four bytes used in each row
157    lea          r3, [r1*3]  ; r3 = 3*stride, required by TRANSPOSE4X4
158
; Scratch layout after both transposes: the 8 bytes at rsp + 8*k hold
; column k for rows 0-7 (rows 0-3 in the low 4 bytes, rows 4-7 in the high 4).
159    TRANSPOSE4X4 r0, rsp  ; rows 0-3
160    lea          r4, [r0+r1*4]  ; r4 = address of row 4
161    TRANSPOSE4X4 r4, rsp+4  ; rows 4-7
162
163    H263_LOOP_FILTER rsp, rsp+8, rsp+16, rsp+24, r2d  ; columns 0..3, then the limit
164
; Transpose back. The macro returns column 0 in m5, column 1 in m3,
; column 2 in m4 and column 3 in m6.
165    mova         m1, m5
166    mova         m0, m4
167    punpcklbw    m5, m3  ; columns 0/1 interleaved, rows 0-3
168    punpcklbw    m4, m6  ; columns 2/3 interleaved, rows 0-3
169    punpckhbw    m1, m3  ; columns 0/1 interleaved, rows 4-7
170    punpckhbw    m0, m6  ; columns 2/3 interleaved, rows 4-7
171    mova         m3, m5
172    mova         m6, m1
173    punpcklwd    m5, m4  ; m5 = row 0 (low dword), row 1 (high dword)
174    punpcklwd    m1, m0  ; m1 = row 4, row 5
175    punpckhwd    m3, m4  ; m3 = row 2, row 3
176    punpckhwd    m6, m0  ; m6 = row 6, row 7
177    movd       [r0], m5  ; row 0
178    punpckhdq    m5, m5  ; move the high dword down so movd can store it
179    movd  [r0+r1*1], m5  ; row 1
180    movd  [r0+r1*2], m3  ; row 2
181    punpckhdq    m3, m3
182    movd    [r0+r3], m3  ; row 3
183    movd       [r4], m1  ; row 4
184    punpckhdq    m1, m1
185    movd  [r4+r1*1], m1  ; row 5
186    movd  [r4+r1*2], m6  ; row 6
187    punpckhdq    m6, m6
188    movd    [r4+r3], m6  ; row 7
189    RET
190