1 /*
2  * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "dsputil_mmx.h"
22 
23 /***********************************/
24 /* IDCT */
25 
/*
 * Building blocks for the 4x4 IDCT.  Every macro below expands to a string
 * of MMX instructions (AT&T operand order: source, destination) meant to be
 * pasted into an asm() template; the arguments are register names.
 */

/* In terms of the values on entry: a = a+b, b = b-a, c = c+d, d = d-c. */
#define SUMSUB_BADC( a, b, c, d ) \
    "paddw "#b", "#a" \n\t"\
    "paddw "#d", "#c" \n\t"\
    "paddw "#b", "#b" \n\t"\
    "paddw "#d", "#d" \n\t"\
    "psubw "#a", "#b" \n\t"\
    "psubw "#c", "#d" \n\t"

/*
 * In terms of the values on entry: b = a + (b>>1), a = (a>>1) - b.
 * t is a scratch register (it ends up holding the original b).
 */
#define SUMSUBD2_AB( a, b, t ) \
    "movq  "#b", "#t" \n\t"\
    "psraw  $1 , "#b" \n\t"\
    "paddw "#a", "#b" \n\t"\
    "psraw  $1 , "#a" \n\t"\
    "psubw "#t", "#a" \n\t"

/*
 * One 1-D pass of the 4-point transform, composed from the butterflies
 * above.  SUMSUB_BA is not defined in this part of the file.
 */
#define IDCT4_1D( s02, s13, d02, d13, t ) \
    SUMSUB_BA  ( s02, d02 )\
    SUMSUBD2_AB( s13, d13, t )\
    SUMSUB_BADC( d13, s02, s13, d02 )

/*
 * Scale the four words in p down by 64 (arithmetic shift), add them with
 * signed saturation to the four bytes at (%0) and store the result, packed
 * back to unsigned bytes with saturation, to (%0).
 * t is scratch; z is used as the zero-extension source for the pixels, so
 * the caller must have cleared it.
 */
#define STORE_DIFF_4P( p, t, z ) \
    "psraw      $6,     "#p" \n\t"\
    "movd       (%0),   "#t" \n\t"\
    "punpcklbw "#z",    "#t" \n\t"\
    "paddsw    "#t",    "#p" \n\t"\
    "packuswb  "#z",    "#p" \n\t"\
    "movd      "#p",    (%0) \n\t"

53 
ff_h264_idct_add_mmx(uint8_t * dst,int16_t * block,int stride)54 static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
55 {
56     /* Load dct coeffs */
57     asm volatile(
58         "movq   (%0), %%mm0 \n\t"
59         "movq  8(%0), %%mm1 \n\t"
60         "movq 16(%0), %%mm2 \n\t"
61         "movq 24(%0), %%mm3 \n\t"
62     :: "r"(block) );
63 
64     asm volatile(
65         /* mm1=s02+s13  mm2=s02-s13  mm4=d02+d13  mm0=d02-d13 */
66         IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 )
67 
68         "movq      %0,    %%mm6 \n\t"
69         /* in: 1,4,0,2  out: 1,2,3,0 */
70         TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 )
71 
72         "paddw     %%mm6, %%mm3 \n\t"
73 
74         /* mm2=s02+s13  mm3=s02-s13  mm4=d02+d13  mm1=d02-d13 */
75         IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 )
76 
77         "pxor %%mm7, %%mm7    \n\t"
78     :: "m"(ff_pw_32));
79 
80     asm volatile(
81     STORE_DIFF_4P( %%mm0, %%mm1, %%mm7)
82         "add %1, %0             \n\t"
83     STORE_DIFF_4P( %%mm2, %%mm1, %%mm7)
84         "add %1, %0             \n\t"
85     STORE_DIFF_4P( %%mm3, %%mm1, %%mm7)
86         "add %1, %0             \n\t"
87     STORE_DIFF_4P( %%mm4, %%mm1, %%mm7)
88         : "+r"(dst)
89         : "r" ((long)stride)
90     );
91 }
92 
h264_idct8_1d(int16_t * block)93 static inline void h264_idct8_1d(int16_t *block)
94 {
95     asm volatile(
96         "movq 112(%0), %%mm7  \n\t"
97         "movq  80(%0), %%mm0  \n\t"
98         "movq  48(%0), %%mm3  \n\t"
99         "movq  16(%0), %%mm5  \n\t"
100 
101         "movq   %%mm0, %%mm4  \n\t"
102         "movq   %%mm5, %%mm1  \n\t"
103         "psraw  $1,    %%mm4  \n\t"
104         "psraw  $1,    %%mm1  \n\t"
105         "paddw  %%mm0, %%mm4  \n\t"
106         "paddw  %%mm5, %%mm1  \n\t"
107         "paddw  %%mm7, %%mm4  \n\t"
108         "paddw  %%mm0, %%mm1  \n\t"
109         "psubw  %%mm5, %%mm4  \n\t"
110         "paddw  %%mm3, %%mm1  \n\t"
111 
112         "psubw  %%mm3, %%mm5  \n\t"
113         "psubw  %%mm3, %%mm0  \n\t"
114         "paddw  %%mm7, %%mm5  \n\t"
115         "psubw  %%mm7, %%mm0  \n\t"
116         "psraw  $1,    %%mm3  \n\t"
117         "psraw  $1,    %%mm7  \n\t"
118         "psubw  %%mm3, %%mm5  \n\t"
119         "psubw  %%mm7, %%mm0  \n\t"
120 
121         "movq   %%mm4, %%mm3  \n\t"
122         "movq   %%mm1, %%mm7  \n\t"
123         "psraw  $2,    %%mm1  \n\t"
124         "psraw  $2,    %%mm3  \n\t"
125         "paddw  %%mm5, %%mm3  \n\t"
126         "psraw  $2,    %%mm5  \n\t"
127         "paddw  %%mm0, %%mm1  \n\t"
128         "psraw  $2,    %%mm0  \n\t"
129         "psubw  %%mm4, %%mm5  \n\t"
130         "psubw  %%mm0, %%mm7  \n\t"
131 
132         "movq  32(%0), %%mm2  \n\t"
133         "movq  96(%0), %%mm6  \n\t"
134         "movq   %%mm2, %%mm4  \n\t"
135         "movq   %%mm6, %%mm0  \n\t"
136         "psraw  $1,    %%mm4  \n\t"
137         "psraw  $1,    %%mm6  \n\t"
138         "psubw  %%mm0, %%mm4  \n\t"
139         "paddw  %%mm2, %%mm6  \n\t"
140 
141         "movq    (%0), %%mm2  \n\t"
142         "movq  64(%0), %%mm0  \n\t"
143         SUMSUB_BA( %%mm0, %%mm2 )
144         SUMSUB_BA( %%mm6, %%mm0 )
145         SUMSUB_BA( %%mm4, %%mm2 )
146         SUMSUB_BA( %%mm7, %%mm6 )
147         SUMSUB_BA( %%mm5, %%mm4 )
148         SUMSUB_BA( %%mm3, %%mm2 )
149         SUMSUB_BA( %%mm1, %%mm0 )
150         :: "r"(block)
151     );
152 }
153 
ff_h264_idct8_add_mmx(uint8_t * dst,int16_t * block,int stride)154 static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
155 {
156     int i;
157     int16_t __attribute__ ((aligned(8))) b2[64];
158 
159     block[0] += 32;
160 
161     for(i=0; i<2; i++){
162         DECLARE_ALIGNED_8(uint64_t, tmp);
163 
164         h264_idct8_1d(block+4*i);
165 
166         asm volatile(
167             "movq   %%mm7,    %0   \n\t"
168             TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
169             "movq   %%mm0,  8(%1)  \n\t"
170             "movq   %%mm6, 24(%1)  \n\t"
171             "movq   %%mm7, 40(%1)  \n\t"
172             "movq   %%mm4, 56(%1)  \n\t"
173             "movq    %0,    %%mm7  \n\t"
174             TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
175             "movq   %%mm7,   (%1)  \n\t"
176             "movq   %%mm1, 16(%1)  \n\t"
177             "movq   %%mm0, 32(%1)  \n\t"
178             "movq   %%mm3, 48(%1)  \n\t"
179             : "=m"(tmp)
180             : "r"(b2+32*i)
181             : "memory"
182         );
183     }
184 
185     for(i=0; i<2; i++){
186         h264_idct8_1d(b2+4*i);
187 
188         asm volatile(
189             "psraw     $6, %%mm7  \n\t"
190             "psraw     $6, %%mm6  \n\t"
191             "psraw     $6, %%mm5  \n\t"
192             "psraw     $6, %%mm4  \n\t"
193             "psraw     $6, %%mm3  \n\t"
194             "psraw     $6, %%mm2  \n\t"
195             "psraw     $6, %%mm1  \n\t"
196             "psraw     $6, %%mm0  \n\t"
197 
198             "movq   %%mm7,    (%0)  \n\t"
199             "movq   %%mm5,  16(%0)  \n\t"
200             "movq   %%mm3,  32(%0)  \n\t"
201             "movq   %%mm1,  48(%0)  \n\t"
202             "movq   %%mm0,  64(%0)  \n\t"
203             "movq   %%mm2,  80(%0)  \n\t"
204             "movq   %%mm4,  96(%0)  \n\t"
205             "movq   %%mm6, 112(%0)  \n\t"
206             :: "r"(b2+4*i)
207             : "memory"
208         );
209     }
210 
211     add_pixels_clamped_mmx(b2, dst, stride);
212 }
213 
/*
 * Scale the words in p down by 64 (arithmetic shift), add them with signed
 * saturation to the pixels loaded from d, pack to unsigned bytes with
 * saturation and store 8 bytes back to d.
 * d is a memory operand, t is scratch, z must hold zero (it is the
 * zero-extension source for the pixels).
 */
#define STORE_DIFF_8P( p, d, t, z )\
        "movq       "#d", "#t" \n"\
        "psraw       $6,  "#p" \n"\
        "punpcklbw  "#z", "#t" \n"\
        "paddsw     "#t", "#p" \n"\
        "packuswb   "#p", "#p" \n"\
        "movq       "#p", "#d" \n"

/*
 * One 1-D pass of the 8-point inverse transform on eight xmm registers.
 * On entry b, c, d, f, g, h hold input rows; a and e are overwritten
 * before being read and are reloaded/used as scratch.  Rows 0 and 4 are
 * fetched by the macro itself from 0x00(%1) and 0x40(%1), so the enclosing
 * asm must bind operand %1 to the coefficient block.
 * SUMSUB_BA is not defined in this part of the file.
 */
#define H264_IDCT8_1D_SSE2(a,b,c,d,e,f,g,h)\
        "movdqa     "#c", "#a" \n"\
        "movdqa     "#g", "#e" \n"\
        "psraw       $1,  "#c" \n"\
        "psraw       $1,  "#g" \n"\
        "psubw      "#e", "#c" \n"\
        "paddw      "#a", "#g" \n"\
        "movdqa     "#b", "#e" \n"\
        "psraw       $1,  "#e" \n"\
        "paddw      "#b", "#e" \n"\
        "paddw      "#d", "#e" \n"\
        "paddw      "#f", "#e" \n"\
        "movdqa     "#f", "#a" \n"\
        "psraw       $1,  "#a" \n"\
        "paddw      "#f", "#a" \n"\
        "paddw      "#h", "#a" \n"\
        "psubw      "#b", "#a" \n"\
        "psubw      "#d", "#b" \n"\
        "psubw      "#d", "#f" \n"\
        "paddw      "#h", "#b" \n"\
        "psubw      "#h", "#f" \n"\
        "psraw       $1,  "#d" \n"\
        "psraw       $1,  "#h" \n"\
        "psubw      "#d", "#b" \n"\
        "psubw      "#h", "#f" \n"\
        "movdqa     "#e", "#d" \n"\
        "movdqa     "#a", "#h" \n"\
        "psraw       $2,  "#d" \n"\
        "psraw       $2,  "#h" \n"\
        "paddw      "#f", "#d" \n"\
        "paddw      "#b", "#h" \n"\
        "psraw       $2,  "#f" \n"\
        "psraw       $2,  "#b" \n"\
        "psubw      "#f", "#e" \n"\
        "psubw      "#a", "#b" \n"\
        "movdqa 0x00(%1), "#a" \n"\
        "movdqa 0x40(%1), "#f" \n"\
        SUMSUB_BA(f, a)\
        SUMSUB_BA(g, f)\
        SUMSUB_BA(c, a)\
        SUMSUB_BA(e, g)\
        SUMSUB_BA(b, c)\
        SUMSUB_BA(h, a)\
        SUMSUB_BA(d, f)

266 
ff_h264_idct8_add_sse2(uint8_t * dst,int16_t * block,int stride)267 static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
268 {
269     asm volatile(
270         "movdqa   0x10(%1), %%xmm1 \n"
271         "movdqa   0x20(%1), %%xmm2 \n"
272         "movdqa   0x30(%1), %%xmm3 \n"
273         "movdqa   0x50(%1), %%xmm5 \n"
274         "movdqa   0x60(%1), %%xmm6 \n"
275         "movdqa   0x70(%1), %%xmm7 \n"
276         H264_IDCT8_1D_SSE2(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)
277         TRANSPOSE8(%%xmm4, %%xmm1, %%xmm7, %%xmm3, %%xmm5, %%xmm0, %%xmm2, %%xmm6, (%1))
278         "paddw          %4, %%xmm4 \n"
279         "movdqa     %%xmm4, 0x00(%1) \n"
280         "movdqa     %%xmm2, 0x40(%1) \n"
281         H264_IDCT8_1D_SSE2(%%xmm4, %%xmm0, %%xmm6, %%xmm3, %%xmm2, %%xmm5, %%xmm7, %%xmm1)
282         "movdqa     %%xmm6, 0x60(%1) \n"
283         "movdqa     %%xmm7, 0x70(%1) \n"
284         "pxor       %%xmm7, %%xmm7 \n"
285         STORE_DIFF_8P(%%xmm2, (%0),      %%xmm6, %%xmm7)
286         STORE_DIFF_8P(%%xmm0, (%0,%2),   %%xmm6, %%xmm7)
287         STORE_DIFF_8P(%%xmm1, (%0,%2,2), %%xmm6, %%xmm7)
288         STORE_DIFF_8P(%%xmm3, (%0,%3),   %%xmm6, %%xmm7)
289         "lea     (%0,%2,4), %0 \n"
290         STORE_DIFF_8P(%%xmm5, (%0),      %%xmm6, %%xmm7)
291         STORE_DIFF_8P(%%xmm4, (%0,%2),   %%xmm6, %%xmm7)
292         "movdqa   0x60(%1), %%xmm0 \n"
293         "movdqa   0x70(%1), %%xmm1 \n"
294         STORE_DIFF_8P(%%xmm0, (%0,%2,2), %%xmm6, %%xmm7)
295         STORE_DIFF_8P(%%xmm1, (%0,%3),   %%xmm6, %%xmm7)
296         :"+r"(dst)
297         :"r"(block), "r"((long)stride), "r"(3L*stride), "m"(ff_pw_32)
298     );
299 }
300 
ff_h264_idct_dc_add_mmx2(uint8_t * dst,int16_t * block,int stride)301 static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
302 {
303     int dc = (block[0] + 32) >> 6;
304     asm volatile(
305         "movd          %0, %%mm0 \n\t"
306         "pshufw $0, %%mm0, %%mm0 \n\t"
307         "pxor       %%mm1, %%mm1 \n\t"
308         "psubw      %%mm0, %%mm1 \n\t"
309         "packuswb   %%mm0, %%mm0 \n\t"
310         "packuswb   %%mm1, %%mm1 \n\t"
311         ::"r"(dc)
312     );
313     asm volatile(
314         "movd          %0, %%mm2 \n\t"
315         "movd          %1, %%mm3 \n\t"
316         "movd          %2, %%mm4 \n\t"
317         "movd          %3, %%mm5 \n\t"
318         "paddusb    %%mm0, %%mm2 \n\t"
319         "paddusb    %%mm0, %%mm3 \n\t"
320         "paddusb    %%mm0, %%mm4 \n\t"
321         "paddusb    %%mm0, %%mm5 \n\t"
322         "psubusb    %%mm1, %%mm2 \n\t"
323         "psubusb    %%mm1, %%mm3 \n\t"
324         "psubusb    %%mm1, %%mm4 \n\t"
325         "psubusb    %%mm1, %%mm5 \n\t"
326         "movd       %%mm2, %0    \n\t"
327         "movd       %%mm3, %1    \n\t"
328         "movd       %%mm4, %2    \n\t"
329         "movd       %%mm5, %3    \n\t"
330         :"+m"(*(uint32_t*)(dst+0*stride)),
331          "+m"(*(uint32_t*)(dst+1*stride)),
332          "+m"(*(uint32_t*)(dst+2*stride)),
333          "+m"(*(uint32_t*)(dst+3*stride))
334     );
335 }
336 
ff_h264_idct8_dc_add_mmx2(uint8_t * dst,int16_t * block,int stride)337 static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
338 {
339     int dc = (block[0] + 32) >> 6;
340     int y;
341     asm volatile(
342         "movd          %0, %%mm0 \n\t"
343         "pshufw $0, %%mm0, %%mm0 \n\t"
344         "pxor       %%mm1, %%mm1 \n\t"
345         "psubw      %%mm0, %%mm1 \n\t"
346         "packuswb   %%mm0, %%mm0 \n\t"
347         "packuswb   %%mm1, %%mm1 \n\t"
348         ::"r"(dc)
349     );
350     for(y=2; y--; dst += 4*stride){
351     asm volatile(
352         "movq          %0, %%mm2 \n\t"
353         "movq          %1, %%mm3 \n\t"
354         "movq          %2, %%mm4 \n\t"
355         "movq          %3, %%mm5 \n\t"
356         "paddusb    %%mm0, %%mm2 \n\t"
357         "paddusb    %%mm0, %%mm3 \n\t"
358         "paddusb    %%mm0, %%mm4 \n\t"
359         "paddusb    %%mm0, %%mm5 \n\t"
360         "psubusb    %%mm1, %%mm2 \n\t"
361         "psubusb    %%mm1, %%mm3 \n\t"
362         "psubusb    %%mm1, %%mm4 \n\t"
363         "psubusb    %%mm1, %%mm5 \n\t"
364         "movq       %%mm2, %0    \n\t"
365         "movq       %%mm3, %1    \n\t"
366         "movq       %%mm4, %2    \n\t"
367         "movq       %%mm5, %3    \n\t"
368         :"+m"(*(uint64_t*)(dst+0*stride)),
369          "+m"(*(uint64_t*)(dst+1*stride)),
370          "+m"(*(uint64_t*)(dst+2*stride)),
371          "+m"(*(uint64_t*)(dst+3*stride))
372     );
373     }
374 }
375 
376 
377 /***********************************/
378 /* deblocking */
379 
// out: o = max(|x-y|-a, 0) per byte, i.e. nonzero exactly where |x-y|>a
//      (not a full 0xFF mask; callers compare against zero afterwards)
// clobbers: t
#define DIFF_GT_MMX(x,y,a,o,t)\
    "movq     "#y", "#t"  \n\t"\
    "movq     "#x", "#o"  \n\t"\
    "psubusb  "#x", "#t"  \n\t"\
    "psubusb  "#y", "#o"  \n\t"\
    "por      "#t", "#o"  \n\t"\
    "psubusb  "#a", "#o"  \n\t"

// out: o = 0xFF per byte where |x-y|<=a, 0 elsewhere
//      (both saturated differences reach zero only when |x-y|<=a, and the
//      final pcmpeqb tests them for equality)
// clobbers: t
#define DIFF_GT2_MMX(x,y,a,o,t)\
    "movq     "#y", "#t"  \n\t"\
    "movq     "#x", "#o"  \n\t"\
    "psubusb  "#x", "#t"  \n\t"\
    "psubusb  "#y", "#o"  \n\t"\
    "psubusb  "#a", "#t"  \n\t"\
    "psubusb  "#a", "#o"  \n\t"\
    "pcmpeqb  "#t", "#o"  \n\t"

// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1
//     alpha1/beta1 are operands holding alpha-1 and beta-1; their low word
//     is broadcast and packed to bytes
// out: mm5=beta-1, mm7=mask (0xFF where none of the three tests exceeded
//      its threshold)
// clobbers: mm4,mm6
#define H264_DEBLOCK_MASK(alpha1, beta1) \
    "pshufw $0, "#alpha1", %%mm4 \n\t"\
    "pshufw $0, "#beta1 ", %%mm5 \n\t"\
    "packuswb  %%mm4, %%mm4      \n\t"\
    "packuswb  %%mm5, %%mm5      \n\t"\
    DIFF_GT_MMX(%%mm1, %%mm2, %%mm4, %%mm7, %%mm6) /* |p0-q0| > alpha-1 */\
    DIFF_GT_MMX(%%mm0, %%mm1, %%mm5, %%mm4, %%mm6) /* |p1-p0| > beta-1 */\
    "por       %%mm4, %%mm7      \n\t"\
    DIFF_GT_MMX(%%mm3, %%mm2, %%mm5, %%mm4, %%mm6) /* |q1-q0| > beta-1 */\
    "por       %%mm4, %%mm7      \n\t"\
    "pxor      %%mm6, %%mm6      \n\t"\
    "pcmpeqb   %%mm6, %%mm7      \n\t"

// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask)
//     pb_01 is an operand holding 0x01 in every byte; pb_3f is not used by
//     the expansion
// out: mm1=p0' mm2=q0'
// clobbers: mm0,3-6
#define H264_DEBLOCK_P0_Q0(pb_01, pb_3f)\
        "movq    %%mm1              , %%mm5 \n\t"\
        "pxor    %%mm2              , %%mm5 \n\t" /* p0^q0*/\
        "pand    "#pb_01"           , %%mm5 \n\t" /* (p0^q0)&1*/\
        "pcmpeqb %%mm4              , %%mm4 \n\t"\
        "pxor    %%mm4              , %%mm3 \n\t"\
        "pavgb   %%mm0              , %%mm3 \n\t" /* (p1 - q1 + 256)>>1*/\
        "pavgb   "MANGLE(ff_pb_3)"  , %%mm3 \n\t" /*(((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2*/\
        "pxor    %%mm1              , %%mm4 \n\t"\
        "pavgb   %%mm2              , %%mm4 \n\t" /* (q0 - p0 + 256)>>1*/\
        "pavgb   %%mm5              , %%mm3 \n\t"\
        "paddusb %%mm4              , %%mm3 \n\t" /* d+128+33*/\
        "movq    "MANGLE(ff_pb_A1)" , %%mm6 \n\t"\
        "psubusb %%mm3              , %%mm6 \n\t"\
        "psubusb "MANGLE(ff_pb_A1)" , %%mm3 \n\t"\
        "pminub  %%mm7              , %%mm6 \n\t"\
        "pminub  %%mm7              , %%mm3 \n\t"\
        "psubusb %%mm6              , %%mm1 \n\t"\
        "psubusb %%mm3              , %%mm2 \n\t"\
        "paddusb %%mm3              , %%mm1 \n\t"\
        "paddusb %%mm6              , %%mm2 \n\t"

// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) %8=ff_bone
//     q2addr/q1addr are address strings (already quoted by the caller)
// out: (q1addr) = av_clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
//      (the same macro serves the p side, with p2/p1 passed in place of
//      q2/q1)
// clobbers: q2, tmp, tc0
#define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\
        "movq     %%mm1,  "#tmp"   \n\t"\
        "pavgb    %%mm2,  "#tmp"   \n\t"\
        "pavgb    "#tmp", "#q2"    \n\t" /* avg(p2,avg(p0,q0)) */\
        "pxor   "q2addr", "#tmp"   \n\t"\
        "pand     %8,     "#tmp"   \n\t" /* (p2^avg(p0,q0))&1 */\
        "psubusb  "#tmp", "#q2"    \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\
        "movq     "#p1",  "#tmp"   \n\t"\
        "psubusb  "#tc0", "#tmp"   \n\t"\
        "paddusb  "#p1",  "#tc0"   \n\t"\
        "pmaxub   "#tmp", "#q2"    \n\t"\
        "pminub   "#tc0", "#q2"    \n\t"\
        "movq     "#q2",  "q1addr" \n\t"

458 
h264_loop_filter_luma_mmx2(uint8_t * pix,int stride,int alpha1,int beta1,int8_t * tc0)459 static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
460 {
461     DECLARE_ALIGNED_8(uint64_t, tmp0[2]);
462 
463     asm volatile(
464         "movq    (%1,%3), %%mm0    \n\t" //p1
465         "movq    (%1,%3,2), %%mm1  \n\t" //p0
466         "movq    (%2),    %%mm2    \n\t" //q0
467         "movq    (%2,%3), %%mm3    \n\t" //q1
468         H264_DEBLOCK_MASK(%6, %7)
469 
470         "movd      %5,    %%mm4    \n\t"
471         "punpcklbw %%mm4, %%mm4    \n\t"
472         "punpcklwd %%mm4, %%mm4    \n\t"
473         "pcmpeqb   %%mm3, %%mm3    \n\t"
474         "movq      %%mm4, %%mm6    \n\t"
475         "pcmpgtb   %%mm3, %%mm4    \n\t"
476         "movq      %%mm6, 8+%0     \n\t"
477         "pand      %%mm4, %%mm7    \n\t"
478         "movq      %%mm7, %0       \n\t"
479 
480         /* filter p1 */
481         "movq     (%1),   %%mm3    \n\t" //p2
482         DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1
483         "pand     %%mm7,  %%mm6    \n\t" // mask & |p2-p0|<beta
484         "pand     8+%0,   %%mm7    \n\t" // mask & tc0
485         "movq     %%mm7,  %%mm4    \n\t"
486         "psubb    %%mm6,  %%mm7    \n\t"
487         "pand     %%mm4,  %%mm6    \n\t" // mask & |p2-p0|<beta & tc0
488         H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%1)", "(%1,%3)", %%mm6, %%mm4)
489 
490         /* filter q1 */
491         "movq    (%2,%3,2), %%mm4  \n\t" //q2
492         DIFF_GT2_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // |q2-q0|>beta-1
493         "pand     %0,     %%mm6    \n\t"
494         "movq     8+%0,   %%mm5    \n\t" // can be merged with the and below but is slower then
495         "pand     %%mm6,  %%mm5    \n\t"
496         "psubb    %%mm6,  %%mm7    \n\t"
497         "movq    (%2,%3), %%mm3    \n\t"
498         H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%2,%3,2)", "(%2,%3)", %%mm5, %%mm6)
499 
500         /* filter p0, q0 */
501         H264_DEBLOCK_P0_Q0(%8, unused)
502         "movq      %%mm1, (%1,%3,2) \n\t"
503         "movq      %%mm2, (%2)      \n\t"
504 
505         : "=m"(*tmp0)
506         : "r"(pix-3*stride), "r"(pix), "r"((long)stride),
507           "m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1),
508           "m"(ff_bone)
509     );
510 }
511 
/**
 * Luma deblocking of a 16-pixel-wide edge running along a row, done as two
 * 8-pixel halves.  A half is skipped only when both of its tc0 entries are
 * negative (the sign bit survives the bitwise AND only in that case).
 *
 * @param pix    first pixel of the row just past the edge
 * @param stride byte distance between rows
 * @param alpha  alpha threshold; alpha-1 is passed to the filter
 * @param beta   beta threshold; beta-1 is passed to the filter
 * @param tc0    four clipping values, two per half
 */
static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    if((tc0[0] & tc0[1]) >= 0)
        h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0);
    if((tc0[2] & tc0[3]) >= 0)
        h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2);
}
h264_h_loop_filter_luma_mmx2(uint8_t * pix,int stride,int alpha,int beta,int8_t * tc0)519 static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
520 {
521     //FIXME: could cut some load/stores by merging transpose with filter
522     // also, it only needs to transpose 6x8
523     DECLARE_ALIGNED_8(uint8_t, trans[8*8]);
524     int i;
525     for(i=0; i<2; i++, pix+=8*stride, tc0+=2) {
526         if((tc0[0] & tc0[1]) < 0)
527             continue;
528         transpose4x4(trans,       pix-4,          8, stride);
529         transpose4x4(trans  +4*8, pix,            8, stride);
530         transpose4x4(trans+4,     pix-4+4*stride, 8, stride);
531         transpose4x4(trans+4+4*8, pix  +4*stride, 8, stride);
532         h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0);
533         transpose4x4(pix-2,          trans  +2*8, stride, 8);
534         transpose4x4(pix-2+4*stride, trans+4+2*8, stride, 8);
535     }
536 }
537 
h264_loop_filter_chroma_mmx2(uint8_t * pix,int stride,int alpha1,int beta1,int8_t * tc0)538 static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
539 {
540     asm volatile(
541         "movq    (%0),    %%mm0     \n\t" //p1
542         "movq    (%0,%2), %%mm1     \n\t" //p0
543         "movq    (%1),    %%mm2     \n\t" //q0
544         "movq    (%1,%2), %%mm3     \n\t" //q1
545         H264_DEBLOCK_MASK(%4, %5)
546         "movd      %3,    %%mm6     \n\t"
547         "punpcklbw %%mm6, %%mm6     \n\t"
548         "pand      %%mm6, %%mm7     \n\t" // mm7 = tc&mask
549         H264_DEBLOCK_P0_Q0(%6, %7)
550         "movq      %%mm1, (%0,%2)   \n\t"
551         "movq      %%mm2, (%1)      \n\t"
552 
553         :: "r"(pix-2*stride), "r"(pix), "r"((long)stride),
554            "r"(*(uint32_t*)tc0),
555            "m"(alpha1), "m"(beta1), "m"(ff_bone), "m"(ff_pb_3F)
556     );
557 }
558 
/**
 * Chroma deblocking of an 8-pixel-wide edge running along a row.
 * Forwards to h264_loop_filter_chroma_mmx2() with the thresholds reduced
 * by one, which is the form that function expects.
 *
 * @param pix    first pixel of the row just past the edge
 * @param stride byte distance between rows
 * @param alpha  alpha threshold
 * @param beta   beta threshold
 * @param tc0    four clipping values
 */
static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0);
}
563 
h264_h_loop_filter_chroma_mmx2(uint8_t * pix,int stride,int alpha,int beta,int8_t * tc0)564 static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
565 {
566     //FIXME: could cut some load/stores by merging transpose with filter
567     DECLARE_ALIGNED_8(uint8_t, trans[8*4]);
568     transpose4x4(trans, pix-2, 8, stride);
569     transpose4x4(trans+4, pix-2+4*stride, 8, stride);
570     h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0);
571     transpose4x4(pix-2, trans, stride, 8);
572     transpose4x4(pix-2+4*stride, trans+4, stride, 8);
573 }
574 
// p0 = (p0 + q1 + 2*p1 + 2) >> 2
// p0, p1, q1 are registers; one is an operand holding 0x01 in every byte.
// pavgb rounds up, so the low bit of p0^q1 is subtracted after the first
// average to keep the combined result from rounding twice.
// clobbers: mm4
#define H264_FILTER_CHROMA4(p0, p1, q1, one) \
    "movq    "#p0", %%mm4  \n\t"\
    "pxor    "#q1", %%mm4  \n\t"\
    "pand   "#one", %%mm4  \n\t" /* mm4 = (p0^q1)&1 */\
    "pavgb   "#q1", "#p0"  \n\t"\
    "psubusb %%mm4, "#p0"  \n\t"\
    "pavgb   "#p1", "#p0"  \n\t" /* dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) */

h264_loop_filter_chroma_intra_mmx2(uint8_t * pix,int stride,int alpha1,int beta1)584 static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1)
585 {
586     asm volatile(
587         "movq    (%0),    %%mm0     \n\t"
588         "movq    (%0,%2), %%mm1     \n\t"
589         "movq    (%1),    %%mm2     \n\t"
590         "movq    (%1,%2), %%mm3     \n\t"
591         H264_DEBLOCK_MASK(%3, %4)
592         "movq    %%mm1,   %%mm5     \n\t"
593         "movq    %%mm2,   %%mm6     \n\t"
594         H264_FILTER_CHROMA4(%%mm1, %%mm0, %%mm3, %5) //p0'
595         H264_FILTER_CHROMA4(%%mm2, %%mm3, %%mm0, %5) //q0'
596         "psubb   %%mm5,   %%mm1     \n\t"
597         "psubb   %%mm6,   %%mm2     \n\t"
598         "pand    %%mm7,   %%mm1     \n\t"
599         "pand    %%mm7,   %%mm2     \n\t"
600         "paddb   %%mm5,   %%mm1     \n\t"
601         "paddb   %%mm6,   %%mm2     \n\t"
602         "movq    %%mm1,   (%0,%2)   \n\t"
603         "movq    %%mm2,   (%1)      \n\t"
604         :: "r"(pix-2*stride), "r"(pix), "r"((long)stride),
605            "m"(alpha1), "m"(beta1), "m"(ff_bone)
606     );
607 }
608 
/**
 * Intra chroma deblocking of an 8-pixel-wide edge running along a row.
 * Forwards to h264_loop_filter_chroma_intra_mmx2() with the thresholds
 * reduced by one, which is the form that function expects.
 *
 * @param pix    first pixel of the row just past the edge
 * @param stride byte distance between rows
 * @param alpha  alpha threshold
 * @param beta   beta threshold
 */
static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1);
}
613 
h264_h_loop_filter_chroma_intra_mmx2(uint8_t * pix,int stride,int alpha,int beta)614 static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
615 {
616     //FIXME: could cut some load/stores by merging transpose with filter
617     DECLARE_ALIGNED_8(uint8_t, trans[8*4]);
618     transpose4x4(trans, pix-2, 8, stride);
619     transpose4x4(trans+4, pix-2+4*stride, 8, stride);
620     h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1);
621     transpose4x4(pix-2, trans, stride, 8);
622     transpose4x4(pix-2+4*stride, trans+4, stride, 8);
623 }
624 
h264_loop_filter_strength_mmx2(int16_t bS[2][4][4],uint8_t nnz[40],int8_t ref[2][40],int16_t mv[2][40][2],int bidir,int edges,int step,int mask_mv0,int mask_mv1)625 static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
626                                             int bidir, int edges, int step, int mask_mv0, int mask_mv1 ) {
627     int dir;
628     asm volatile(
629         "pxor %%mm7, %%mm7 \n\t"
630         "movq %0, %%mm6 \n\t"
631         "movq %1, %%mm5 \n\t"
632         "movq %2, %%mm4 \n\t"
633         ::"m"(ff_pb_1), "m"(ff_pb_3), "m"(ff_pb_7)
634     );
635     // could do a special case for dir==0 && edges==1, but it only reduces the
636     // average filter time by 1.2%
637     for( dir=1; dir>=0; dir-- ) {
638         const int d_idx = dir ? -8 : -1;
639         const int mask_mv = dir ? mask_mv1 : mask_mv0;
640         DECLARE_ALIGNED_8(const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL;
641         int b_idx, edge, l;
642         for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) {
643             asm volatile(
644                 "pand %0, %%mm0 \n\t"
645                 ::"m"(mask_dir)
646             );
647             if(!(mask_mv & edge)) {
648                 asm volatile("pxor %%mm0, %%mm0 \n\t":);
649                 for( l = bidir; l >= 0; l-- ) {
650                     asm volatile(
651                         "movd %0, %%mm1 \n\t"
652                         "punpckldq %1, %%mm1 \n\t"
653                         "movq %%mm1, %%mm2 \n\t"
654                         "psrlw $7, %%mm2 \n\t"
655                         "pand %%mm6, %%mm2 \n\t"
656                         "por %%mm2, %%mm1 \n\t" // ref_cache with -2 mapped to -1
657                         "punpckldq %%mm1, %%mm2 \n\t"
658                         "pcmpeqb %%mm2, %%mm1 \n\t"
659                         "paddb %%mm6, %%mm1 \n\t"
660                         "punpckhbw %%mm7, %%mm1 \n\t" // ref[b] != ref[bn]
661                         "por %%mm1, %%mm0 \n\t"
662 
663                         "movq %2, %%mm1 \n\t"
664                         "movq %3, %%mm2 \n\t"
665                         "psubw %4, %%mm1 \n\t"
666                         "psubw %5, %%mm2 \n\t"
667                         "packsswb %%mm2, %%mm1 \n\t"
668                         "paddb %%mm5, %%mm1 \n\t"
669                         "pminub %%mm4, %%mm1 \n\t"
670                         "pcmpeqb %%mm4, %%mm1 \n\t" // abs(mv[b] - mv[bn]) >= limit
671                         "por %%mm1, %%mm0 \n\t"
672                         ::"m"(ref[l][b_idx]),
673                           "m"(ref[l][b_idx+d_idx]),
674                           "m"(mv[l][b_idx][0]),
675                           "m"(mv[l][b_idx+2][0]),
676                           "m"(mv[l][b_idx+d_idx][0]),
677                           "m"(mv[l][b_idx+d_idx+2][0])
678                     );
679                 }
680             }
681             asm volatile(
682                 "movd %0, %%mm1 \n\t"
683                 "por  %1, %%mm1 \n\t"
684                 "punpcklbw %%mm7, %%mm1 \n\t"
685                 "pcmpgtw %%mm7, %%mm1 \n\t" // nnz[b] || nnz[bn]
686                 ::"m"(nnz[b_idx]),
687                   "m"(nnz[b_idx+d_idx])
688             );
689             asm volatile(
690                 "pcmpeqw %%mm7, %%mm0 \n\t"
691                 "pcmpeqw %%mm7, %%mm0 \n\t"
692                 "psrlw $15, %%mm0 \n\t" // nonzero -> 1
693                 "psrlw $14, %%mm1 \n\t"
694                 "movq %%mm0, %%mm2 \n\t"
695                 "por %%mm1, %%mm2 \n\t"
696                 "psrlw $1, %%mm1 \n\t"
697                 "pandn %%mm2, %%mm1 \n\t"
698                 "movq %%mm1, %0 \n\t"
699                 :"=m"(*bS[dir][edge])
700                 ::"memory"
701             );
702         }
703         edges = 4;
704         step = 1;
705     }
706     asm volatile(
707         "movq   (%0), %%mm0 \n\t"
708         "movq  8(%0), %%mm1 \n\t"
709         "movq 16(%0), %%mm2 \n\t"
710         "movq 24(%0), %%mm3 \n\t"
711         TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4)
712         "movq %%mm0,   (%0) \n\t"
713         "movq %%mm3,  8(%0) \n\t"
714         "movq %%mm4, 16(%0) \n\t"
715         "movq %%mm2, 24(%0) \n\t"
716         ::"r"(bS[0])
717         :"memory"
718     );
719 }
720 
721 /***********************************/
722 /* motion compensation */
723 
/*
 * One output row of a vertical 6-tap filter step.
 * A..E are registers holding five consecutive source rows as words; the
 * sixth row is loaded from (%0) into F and widened using the zero register
 * Z.  T receives
 *     ((C+D)*4 - B - E) * %4 + A + F + %5,   shifted right by 5,
 * is packed to unsigned bytes and handed to OP(T, (%1), A, d) for storing.
 * A is modified along the way.  Afterwards %0 is advanced by %2 and %1 by
 * %3.
 * d/q are the "mov" suffixes for the load and for register copies.
 * NOTE(review): %4 and %5 are presumably the constants 5 and 16, which
 * would make this the usual (1,-5,20,20,-5,1) kernel - confirm where the
 * macro is instantiated.
 */
#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\
        "mov"#q" "#C", "#T"         \n\t"\
        "mov"#d" (%0), "#F"         \n\t"\
        "paddw "#D", "#T"           \n\t"\
        "psllw $2, "#T"             \n\t"\
        "psubw "#B", "#T"           \n\t"\
        "psubw "#E", "#T"           \n\t"\
        "punpcklbw "#Z", "#F"       \n\t"\
        "pmullw %4, "#T"            \n\t"\
        "paddw %5, "#A"             \n\t"\
        "add %2, %0                 \n\t"\
        "paddw "#F", "#A"           \n\t"\
        "paddw "#A", "#T"           \n\t"\
        "psraw $5, "#T"             \n\t"\
        "packuswb "#T", "#T"        \n\t"\
        OP(T, (%1), A, d)\
        "add %3, %1                 \n\t"

/*
 * Same vertical step for the two-dimensional case: the sum
 *     ((C+D)*4 - B - E) * %3 + A + F + %4
 * is kept at word precision (no shift, no packing) and stored to OF(%1),
 * where OF is a byte offset.  %0 is advanced by %2; %1 is left unchanged.
 * A is modified along the way.
 */
#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\
        "mov"#q" "#C", "#T"         \n\t"\
        "mov"#d" (%0), "#F"         \n\t"\
        "paddw "#D", "#T"           \n\t"\
        "psllw $2, "#T"             \n\t"\
        "paddw %4, "#A"             \n\t"\
        "psubw "#B", "#T"           \n\t"\
        "psubw "#E", "#T"           \n\t"\
        "punpcklbw "#Z", "#F"       \n\t"\
        "pmullw %3, "#T"            \n\t"\
        "paddw "#F", "#A"           \n\t"\
        "add %2, %0                 \n\t"\
        "paddw "#A", "#T"           \n\t"\
        "mov"#q" "#T", "#OF"(%1)    \n\t"

/* MMX flavours: temporaries in mm6/mm7, movd loads, movq copies. */
#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q)
#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q)
/* SSE2 flavours: temporaries in xmm6/xmm7, movq loads, movdqa copies. */
#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa)
#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa)

761 
762 
/*
 * QPEL_H264(OPNAME, OP, MMX) expands to a set of static lowpass helpers.
 * OPNAME is pasted in front of, and MMX behind, every generated function
 * name. OP is a macro supplied by the instantiator; it is always invoked here
 * as OP(result register, destination memory operand, second register, mov
 * size suffix), so it presumably writes or combines the packed result into
 * the destination - confirm at its definition.
 * The single-pass filters below compute, per pixel,
 *   (a + ff_pw_5*(4*c - b) + ff_pw_16) >> 5
 * where a, b, c are the sums of the outer, middle and inner tap pairs.
 */
763 #define QPEL_H264(OPNAME, OP, MMX)\
764 static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){/* horizontal 6-tap lowpass over 4 rows of 4 pixels */\
765     int h=4;/* rows left */\
766 \
767     asm volatile(\
768         "pxor %%mm7, %%mm7          \n\t"/* mm7 = 0 for byte->word unpacking */\
769         "movq %5, %%mm4             \n\t"/* mm4 = ff_pw_5 */\
770         "movq %6, %%mm5             \n\t"/* mm5 = ff_pw_16 */\
771         "1:                         \n\t"\
772         "movd  -1(%0), %%mm1        \n\t"/* src[x-1] */\
773         "movd    (%0), %%mm2        \n\t"/* src[x]   */\
774         "movd   1(%0), %%mm3        \n\t"/* src[x+1] */\
775         "movd   2(%0), %%mm0        \n\t"/* src[x+2] */\
776         "punpcklbw %%mm7, %%mm1     \n\t"\
777         "punpcklbw %%mm7, %%mm2     \n\t"\
778         "punpcklbw %%mm7, %%mm3     \n\t"\
779         "punpcklbw %%mm7, %%mm0     \n\t"\
780         "paddw %%mm0, %%mm1         \n\t"/* b = src[x-1] + src[x+2] */\
781         "paddw %%mm3, %%mm2         \n\t"/* c = src[x] + src[x+1] */\
782         "movd  -2(%0), %%mm0        \n\t"/* src[x-2] */\
783         "movd   3(%0), %%mm3        \n\t"/* src[x+3] */\
784         "punpcklbw %%mm7, %%mm0     \n\t"\
785         "punpcklbw %%mm7, %%mm3     \n\t"\
786         "paddw %%mm3, %%mm0         \n\t"/* a = src[x-2] + src[x+3] */\
787         "psllw $2, %%mm2            \n\t"\
788         "psubw %%mm1, %%mm2         \n\t"/* 4*c - b */\
789         "pmullw %%mm4, %%mm2        \n\t"/* (4*c - b) * ff_pw_5 */\
790         "paddw %%mm5, %%mm0         \n\t"/* a + ff_pw_16 */\
791         "paddw %%mm2, %%mm0         \n\t"/* full filter sum */\
792         "psraw $5, %%mm0            \n\t"\
793         "packuswb %%mm0, %%mm0      \n\t"/* saturate to unsigned bytes */\
794         OP(%%mm0, (%1),%%mm6, d)\
795         "add %3, %0                 \n\t"/* src += srcStride */\
796         "add %4, %1                 \n\t"/* dst += dstStride */\
797         "decl %2                    \n\t"\
798         " jnz 1b                    \n\t"\
799         : "+a"(src), "+c"(dst), "+g"(h)\
800         : "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
801         : "memory"\
802     );\
803 }\
804 static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){/* 4x4 horizontal lowpass of src, combined with src2 via PAVGB, then passed to OP */\
805     int h=4;/* rows left */\
806     asm volatile(/* NOTE(review): mm4, mm5 and mm7 set here must survive into the loop asm below; no constraint tells the compiler so */\
807         "pxor %%mm7, %%mm7          \n\t"/* mm7 = 0 for byte->word unpacking */\
808         "movq %0, %%mm4             \n\t"/* mm4 = ff_pw_5 */\
809         "movq %1, %%mm5             \n\t"/* mm5 = ff_pw_16 */\
810         :: "m"(ff_pw_5), "m"(ff_pw_16)\
811     );\
812     do{\
813     asm volatile(\
814         "movd  -1(%0), %%mm1        \n\t"/* src[x-1] */\
815         "movd    (%0), %%mm2        \n\t"/* src[x]   */\
816         "movd   1(%0), %%mm3        \n\t"/* src[x+1] */\
817         "movd   2(%0), %%mm0        \n\t"/* src[x+2] */\
818         "punpcklbw %%mm7, %%mm1     \n\t"\
819         "punpcklbw %%mm7, %%mm2     \n\t"\
820         "punpcklbw %%mm7, %%mm3     \n\t"\
821         "punpcklbw %%mm7, %%mm0     \n\t"\
822         "paddw %%mm0, %%mm1         \n\t"/* b = src[x-1] + src[x+2] */\
823         "paddw %%mm3, %%mm2         \n\t"/* c = src[x] + src[x+1] */\
824         "movd  -2(%0), %%mm0        \n\t"/* src[x-2] */\
825         "movd   3(%0), %%mm3        \n\t"/* src[x+3] */\
826         "punpcklbw %%mm7, %%mm0     \n\t"\
827         "punpcklbw %%mm7, %%mm3     \n\t"\
828         "paddw %%mm3, %%mm0         \n\t"/* a = src[x-2] + src[x+3] */\
829         "psllw $2, %%mm2            \n\t"\
830         "psubw %%mm1, %%mm2         \n\t"/* 4*c - b */\
831         "pmullw %%mm4, %%mm2        \n\t"/* (4*c - b) * ff_pw_5 */\
832         "paddw %%mm5, %%mm0         \n\t"/* a + ff_pw_16 */\
833         "paddw %%mm2, %%mm0         \n\t"/* full filter sum */\
834         "movd   (%2), %%mm3         \n\t"/* 4 bytes of the second source */\
835         "psraw $5, %%mm0            \n\t"\
836         "packuswb %%mm0, %%mm0      \n\t"/* saturate to unsigned bytes */\
837         PAVGB" %%mm3, %%mm0         \n\t"/* combine the filtered row with src2 */\
838         OP(%%mm0, (%1),%%mm6, d)\
839         "add %4, %0                 \n\t"/* src += dstStride - NOTE(review): src is stepped with the dst stride; presumably both strides are equal for all callers, confirm */\
840         "add %4, %1                 \n\t"/* dst += dstStride */\
841         "add %3, %2                 \n\t"/* src2 += src2Stride */\
842         : "+a"(src), "+c"(dst), "+d"(src2)\
843         : "D"((long)src2Stride), "S"((long)dstStride)\
844         : "memory"\
845     );\
846     }while(--h);\
847 }\
848 static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){/* vertical 6-tap lowpass producing 4 rows of 4 pixels */\
849     src -= 2*srcStride;/* the filter window starts two rows above the first output row */\
850     asm volatile(\
851         "pxor %%mm7, %%mm7          \n\t"/* mm7 = 0 for byte->word unpacking */\
852         "movd (%0), %%mm0           \n\t"/* preload five source rows into mm0..mm4 */\
853         "add %2, %0                 \n\t"\
854         "movd (%0), %%mm1           \n\t"\
855         "add %2, %0                 \n\t"\
856         "movd (%0), %%mm2           \n\t"\
857         "add %2, %0                 \n\t"\
858         "movd (%0), %%mm3           \n\t"\
859         "add %2, %0                 \n\t"\
860         "movd (%0), %%mm4           \n\t"\
861         "add %2, %0                 \n\t"\
862         "punpcklbw %%mm7, %%mm0     \n\t"\
863         "punpcklbw %%mm7, %%mm1     \n\t"\
864         "punpcklbw %%mm7, %%mm2     \n\t"\
865         "punpcklbw %%mm7, %%mm3     \n\t"\
866         "punpcklbw %%mm7, %%mm4     \n\t"\
867         QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)/* each step loads one new row and emits one output row */\
868         QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)/* register roles rotate by one per step */\
869         QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
870         QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
871          \
872         : "+a"(src), "+c"(dst)\
873         : "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
874         : "memory"\
875     );\
876 }\
877 static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){/* 4x4 two-pass lowpass: vertical pass into tmp (16-bit), then horizontal pass into dst; tmpStride is not used, the tmp row pitch is fixed at 24 bytes */\
878     int h=4;/* rows for the horizontal pass */\
879     int w=3;/* number of 4-pixel-wide column strips filtered vertically */\
880     src -= 2*srcStride+2;/* start two rows above and two pixels left of the block */\
881     while(w--){\
882         asm volatile(\
883             "pxor %%mm7, %%mm7      \n\t"/* mm7 = 0 for byte->word unpacking */\
884             "movd (%0), %%mm0       \n\t"/* preload five source rows into mm0..mm4 */\
885             "add %2, %0             \n\t"\
886             "movd (%0), %%mm1       \n\t"\
887             "add %2, %0             \n\t"\
888             "movd (%0), %%mm2       \n\t"\
889             "add %2, %0             \n\t"\
890             "movd (%0), %%mm3       \n\t"\
891             "add %2, %0             \n\t"\
892             "movd (%0), %%mm4       \n\t"\
893             "add %2, %0             \n\t"\
894             "punpcklbw %%mm7, %%mm0 \n\t"\
895             "punpcklbw %%mm7, %%mm1 \n\t"\
896             "punpcklbw %%mm7, %%mm2 \n\t"\
897             "punpcklbw %%mm7, %%mm3 \n\t"\
898             "punpcklbw %%mm7, %%mm4 \n\t"\
899             QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)/* last argument: byte offset of the output row in tmp */\
900             QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\
901             QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\
902             QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
903              \
904             : "+a"(src)\
905             : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
906             : "memory"\
907         );\
908         tmp += 4;/* next strip: 4 int16 values further */\
909         src += 4 - 9*srcStride;/* the asm advanced src by 9 rows (5 preloads + 4 steps); rewind and move 4 pixels right */\
910     }\
911     tmp -= 3*4;/* back to the first intermediate column */\
912     asm volatile(\
913         "1:                         \n\t"\
914         "movq     (%0), %%mm0       \n\t"\
915         "paddw  10(%0), %%mm0       \n\t"/* a = t[x] + t[x+5] */\
916         "movq    2(%0), %%mm1       \n\t"\
917         "paddw   8(%0), %%mm1       \n\t"/* b = t[x+1] + t[x+4] */\
918         "movq    4(%0), %%mm2       \n\t"\
919         "paddw   6(%0), %%mm2       \n\t"/* c = t[x+2] + t[x+3] */\
920         "psubw %%mm1, %%mm0         \n\t"/*a-b   (abccba)*/\
921         "psraw $2, %%mm0            \n\t"/*(a-b)/4 */\
922         "psubw %%mm1, %%mm0         \n\t"/*(a-b)/4-b */\
923         "paddsw %%mm2, %%mm0        \n\t"\
924         "psraw $2, %%mm0            \n\t"/*((a-b)/4-b+c)/4 */\
925         "paddw %%mm2, %%mm0         \n\t"/*(a-5*b+20*c)/16 */\
926         "psraw $6, %%mm0            \n\t"\
927         "packuswb %%mm0, %%mm0      \n\t"/* saturate to unsigned bytes */\
928         OP(%%mm0, (%1),%%mm7, d)\
929         "add $24, %0                \n\t"/* next tmp row (24 bytes) */\
930         "add %3, %1                 \n\t"/* dst += dstStride */\
931         "decl %2                    \n\t"\
932         " jnz 1b                    \n\t"\
933         : "+a"(tmp), "+c"(dst), "+g"(h)\
934         : "S"((long)dstStride)\
935         : "memory"\
936     );\
937 }\
938 \
939 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){/* horizontal 6-tap lowpass over 8 rows of 8 pixels; each row is handled as a low (mm0) and a high (mm1) group of four words */\
940     int h=8;/* rows left */\
941     asm volatile(\
942         "pxor %%mm7, %%mm7          \n\t"/* mm7 = 0 for byte->word unpacking */\
943         "movq %5, %%mm6             \n\t"/* mm6 = ff_pw_5 */\
944         "1:                         \n\t"\
945         "movq    (%0), %%mm0        \n\t"/* src[x]   */\
946         "movq   1(%0), %%mm2        \n\t"/* src[x+1] */\
947         "movq %%mm0, %%mm1          \n\t"\
948         "movq %%mm2, %%mm3          \n\t"\
949         "punpcklbw %%mm7, %%mm0     \n\t"\
950         "punpckhbw %%mm7, %%mm1     \n\t"\
951         "punpcklbw %%mm7, %%mm2     \n\t"\
952         "punpckhbw %%mm7, %%mm3     \n\t"\
953         "paddw %%mm2, %%mm0         \n\t"/* c (low)  = src[x] + src[x+1] */\
954         "paddw %%mm3, %%mm1         \n\t"/* c (high) */\
955         "psllw $2, %%mm0            \n\t"\
956         "psllw $2, %%mm1            \n\t"/* 4*c */\
957         "movq   -1(%0), %%mm2       \n\t"/* src[x-1] */\
958         "movq    2(%0), %%mm4       \n\t"/* src[x+2] */\
959         "movq %%mm2, %%mm3          \n\t"\
960         "movq %%mm4, %%mm5          \n\t"\
961         "punpcklbw %%mm7, %%mm2     \n\t"\
962         "punpckhbw %%mm7, %%mm3     \n\t"/* mm3 = words src[3..6], reused below */\
963         "punpcklbw %%mm7, %%mm4     \n\t"/* mm4 = words src[2..5], reused below */\
964         "punpckhbw %%mm7, %%mm5     \n\t"\
965         "paddw %%mm4, %%mm2         \n\t"/* b (low)  = src[x-1] + src[x+2] */\
966         "paddw %%mm3, %%mm5         \n\t"/* b (high) */\
967         "psubw %%mm2, %%mm0         \n\t"\
968         "psubw %%mm5, %%mm1         \n\t"/* 4*c - b */\
969         "pmullw %%mm6, %%mm0        \n\t"\
970         "pmullw %%mm6, %%mm1        \n\t"/* (4*c - b) * ff_pw_5 */\
971         "movd   -2(%0), %%mm2       \n\t"/* src[-2..1] */\
972         "movd    7(%0), %%mm5       \n\t"/* src[7..10] */\
973         "punpcklbw %%mm7, %%mm2     \n\t"\
974         "punpcklbw %%mm7, %%mm5     \n\t"\
975         "paddw %%mm3, %%mm2         \n\t"/* a (low)  = src[x-2] + src[x+3] for x = 0..3 */\
976         "paddw %%mm5, %%mm4         \n\t"/* a (high) = src[x-2] + src[x+3] for x = 4..7 */\
977         "movq %6, %%mm5             \n\t"/* mm5 = ff_pw_16 */\
978         "paddw %%mm5, %%mm2         \n\t"\
979         "paddw %%mm5, %%mm4         \n\t"\
980         "paddw %%mm2, %%mm0         \n\t"\
981         "paddw %%mm4, %%mm1         \n\t"/* full filter sums */\
982         "psraw $5, %%mm0            \n\t"\
983         "psraw $5, %%mm1            \n\t"\
984         "packuswb %%mm1, %%mm0      \n\t"/* 8 saturated bytes: low group then high group */\
985         OP(%%mm0, (%1),%%mm5, q)\
986         "add %3, %0                 \n\t"/* src += srcStride */\
987         "add %4, %1                 \n\t"/* dst += dstStride */\
988         "decl %2                    \n\t"\
989         " jnz 1b                    \n\t"\
990         : "+a"(src), "+c"(dst), "+g"(h)\
991         : "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
992         : "memory"\
993     );\
994 }\
995 \
996 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){/* 8x8 horizontal lowpass of src, combined with src2 via PAVGB, then passed to OP */\
997     int h=8;/* rows left */\
998     asm volatile(/* NOTE(review): mm6 and mm7 set here must survive into the loop asm below; no constraint tells the compiler so */\
999         "pxor %%mm7, %%mm7          \n\t"/* mm7 = 0 for byte->word unpacking */\
1000         "movq %0, %%mm6             \n\t"/* mm6 = ff_pw_5 */\
1001         :: "m"(ff_pw_5)\
1002     );\
1003     do{\
1004     asm volatile(\
1005         "movq    (%0), %%mm0        \n\t"/* src[x]   */\
1006         "movq   1(%0), %%mm2        \n\t"/* src[x+1] */\
1007         "movq %%mm0, %%mm1          \n\t"\
1008         "movq %%mm2, %%mm3          \n\t"\
1009         "punpcklbw %%mm7, %%mm0     \n\t"\
1010         "punpckhbw %%mm7, %%mm1     \n\t"\
1011         "punpcklbw %%mm7, %%mm2     \n\t"\
1012         "punpckhbw %%mm7, %%mm3     \n\t"\
1013         "paddw %%mm2, %%mm0         \n\t"/* c (low)  = src[x] + src[x+1] */\
1014         "paddw %%mm3, %%mm1         \n\t"/* c (high) */\
1015         "psllw $2, %%mm0            \n\t"\
1016         "psllw $2, %%mm1            \n\t"/* 4*c */\
1017         "movq   -1(%0), %%mm2       \n\t"/* src[x-1] */\
1018         "movq    2(%0), %%mm4       \n\t"/* src[x+2] */\
1019         "movq %%mm2, %%mm3          \n\t"\
1020         "movq %%mm4, %%mm5          \n\t"\
1021         "punpcklbw %%mm7, %%mm2     \n\t"\
1022         "punpckhbw %%mm7, %%mm3     \n\t"/* mm3 = words src[3..6], reused below */\
1023         "punpcklbw %%mm7, %%mm4     \n\t"/* mm4 = words src[2..5], reused below */\
1024         "punpckhbw %%mm7, %%mm5     \n\t"\
1025         "paddw %%mm4, %%mm2         \n\t"/* b (low)  = src[x-1] + src[x+2] */\
1026         "paddw %%mm3, %%mm5         \n\t"/* b (high) */\
1027         "psubw %%mm2, %%mm0         \n\t"\
1028         "psubw %%mm5, %%mm1         \n\t"/* 4*c - b */\
1029         "pmullw %%mm6, %%mm0        \n\t"\
1030         "pmullw %%mm6, %%mm1        \n\t"/* (4*c - b) * ff_pw_5 */\
1031         "movd   -2(%0), %%mm2       \n\t"/* src[-2..1] */\
1032         "movd    7(%0), %%mm5       \n\t"/* src[7..10] */\
1033         "punpcklbw %%mm7, %%mm2     \n\t"\
1034         "punpcklbw %%mm7, %%mm5     \n\t"\
1035         "paddw %%mm3, %%mm2         \n\t"/* a (low)  = src[x-2] + src[x+3] for x = 0..3 */\
1036         "paddw %%mm5, %%mm4         \n\t"/* a (high) = src[x-2] + src[x+3] for x = 4..7 */\
1037         "movq %5, %%mm5             \n\t"/* mm5 = ff_pw_16 */\
1038         "paddw %%mm5, %%mm2         \n\t"\
1039         "paddw %%mm5, %%mm4         \n\t"\
1040         "paddw %%mm2, %%mm0         \n\t"\
1041         "paddw %%mm4, %%mm1         \n\t"/* full filter sums */\
1042         "psraw $5, %%mm0            \n\t"\
1043         "psraw $5, %%mm1            \n\t"\
1044         "movq (%2), %%mm4           \n\t"/* 8 bytes of the second source */\
1045         "packuswb %%mm1, %%mm0      \n\t"/* 8 saturated bytes: low group then high group */\
1046         PAVGB" %%mm4, %%mm0         \n\t"/* combine the filtered row with src2 */\
1047         OP(%%mm0, (%1),%%mm5, q)\
1048         "add %4, %0                 \n\t"/* src += dstStride - NOTE(review): src is stepped with the dst stride; presumably both strides are equal for all callers, confirm */\
1049         "add %4, %1                 \n\t"/* dst += dstStride */\
1050         "add %3, %2                 \n\t"/* src2 += src2Stride */\
1051         : "+a"(src), "+c"(dst), "+d"(src2)\
1052         : "D"((long)src2Stride), "S"((long)dstStride),\
1053           "m"(ff_pw_16)\
1054         : "memory"\
1055     );\
1056     }while(--h);\
1057 }\
1058 \
1059 static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){/* vertical 6-tap lowpass over an 8-pixel-wide column of h rows; the code handles h == 8 and h == 16 */\
1060     int w= 2;/* two 4-pixel-wide strips make up the 8-pixel width */\
1061     src -= 2*srcStride;/* the filter window starts two rows above the first output row */\
1062     \
1063     while(w--){\
1064       asm volatile(\
1065         "pxor %%mm7, %%mm7          \n\t"/* mm7 = 0 for byte->word unpacking */\
1066         "movd (%0), %%mm0           \n\t"/* preload five source rows into mm0..mm4 */\
1067         "add %2, %0                 \n\t"\
1068         "movd (%0), %%mm1           \n\t"\
1069         "add %2, %0                 \n\t"\
1070         "movd (%0), %%mm2           \n\t"\
1071         "add %2, %0                 \n\t"\
1072         "movd (%0), %%mm3           \n\t"\
1073         "add %2, %0                 \n\t"\
1074         "movd (%0), %%mm4           \n\t"\
1075         "add %2, %0                 \n\t"\
1076         "punpcklbw %%mm7, %%mm0     \n\t"\
1077         "punpcklbw %%mm7, %%mm1     \n\t"\
1078         "punpcklbw %%mm7, %%mm2     \n\t"\
1079         "punpcklbw %%mm7, %%mm3     \n\t"\
1080         "punpcklbw %%mm7, %%mm4     \n\t"\
1081         QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)/* eight steps: one new source row in, one output row out per step */\
1082         QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
1083         QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
1084         QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
1085         QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
1086         QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
1087         QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
1088         QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
1089          \
1090         : "+a"(src), "+c"(dst)\
1091         : "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
1092         : "memory"\
1093      );\
1094      if(h==16){/* eight more rows; continues the register rotation and relies on the MMX state left by the asm above */\
1095         asm volatile(\
1096             QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
1097             QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
1098             QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
1099             QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
1100             QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
1101             QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
1102             QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
1103             QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
1104             \
1105            : "+a"(src), "+c"(dst)\
1106            : "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
1107            : "memory"\
1108         );\
1109      }\
1110      src += 4-(h+5)*srcStride;/* the asm consumed h+5 source rows; rewind and move 4 pixels right */\
1111      dst += 4-h*dstStride;/* rewind the h written rows and move 4 pixels right */\
1112    }\
1113 }\
1114 static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){/* first (vertical) pass of the two-pass filter: writes unscaled 16-bit sums to tmp with a fixed row pitch of 48 bytes; tmpStride is not used */\
1115     int w = (size+8)>>2;/* number of 4-pixel-wide strips: 4 for size 8, 6 for size 16 */\
1116     src -= 2*srcStride+2;/* start two rows above and two pixels left of the block */\
1117     while(w--){\
1118         asm volatile(\
1119             "pxor %%mm7, %%mm7      \n\t"/* mm7 = 0 for byte->word unpacking */\
1120             "movd (%0), %%mm0       \n\t"/* preload five source rows into mm0..mm4 */\
1121             "add %2, %0             \n\t"\
1122             "movd (%0), %%mm1       \n\t"\
1123             "add %2, %0             \n\t"\
1124             "movd (%0), %%mm2       \n\t"\
1125             "add %2, %0             \n\t"\
1126             "movd (%0), %%mm3       \n\t"\
1127             "add %2, %0             \n\t"\
1128             "movd (%0), %%mm4       \n\t"\
1129             "add %2, %0             \n\t"\
1130             "punpcklbw %%mm7, %%mm0 \n\t"\
1131             "punpcklbw %%mm7, %%mm1 \n\t"\
1132             "punpcklbw %%mm7, %%mm2 \n\t"\
1133             "punpcklbw %%mm7, %%mm3 \n\t"\
1134             "punpcklbw %%mm7, %%mm4 \n\t"\
1135             QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)/* last argument: byte offset of the output row in tmp */\
1136             QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\
1137             QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\
1138             QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\
1139             QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\
1140             QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\
1141             QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\
1142             QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\
1143             : "+a"(src)\
1144             : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
1145             : "memory"\
1146         );\
1147         if(size==16){/* rows 8..15; continues the register rotation and relies on the MMX state left by the asm above */\
1148             asm volatile(\
1149                 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1,  8*48)\
1150                 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2,  9*48)\
1151                 QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\
1152                 QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\
1153                 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\
1154                 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\
1155                 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\
1156                 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\
1157                 : "+a"(src)\
1158                 : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
1159                 : "memory"\
1160             );\
1161         }\
1162         tmp += 4;/* next strip: 4 int16 values further */\
1163         src += 4 - (size+5)*srcStride;/* the asm consumed size+5 source rows; rewind and move 4 pixels right */\
1164     }\
1165 }\
1166 static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){/* second (horizontal) pass of the two-pass filter: reads the 16-bit rows in tmp (48-byte pitch) and passes 8 packed pixels per row to OP; tmpStride is not used */\
1167     int w = size>>4;/* 0 for size 8, 1 for size 16: the do/while below runs once or twice */\
1168     do{\
1169     int h = size;/* rows left in this 8-pixel-wide column */\
1170     asm volatile(\
1171         "1:                         \n\t"\
1172         "movq     (%0), %%mm0       \n\t"/* t[0..3]  */\
1173         "movq    8(%0), %%mm3       \n\t"/* t[4..7]  */\
1174         "movq    2(%0), %%mm1       \n\t"/* t[1..4]  */\
1175         "movq   10(%0), %%mm4       \n\t"/* t[5..8]  */\
1176         "paddw   %%mm4, %%mm0       \n\t"/* a (low)  = t[x] + t[x+5] */\
1177         "paddw   %%mm3, %%mm1       \n\t"/* b (low)  = t[x+1] + t[x+4] */\
1178         "paddw  18(%0), %%mm3       \n\t"/* a (high) = t[x] + t[x+5] for x = 4..7 */\
1179         "paddw  16(%0), %%mm4       \n\t"/* b (high) = t[x+1] + t[x+4] for x = 4..7 */\
1180         "movq    4(%0), %%mm2       \n\t"\
1181         "movq   12(%0), %%mm5       \n\t"\
1182         "paddw   6(%0), %%mm2       \n\t"/* c (low)  = t[x+2] + t[x+3] */\
1183         "paddw  14(%0), %%mm5       \n\t"/* c (high) */\
1184         "psubw %%mm1, %%mm0         \n\t"/* a - b */\
1185         "psubw %%mm4, %%mm3         \n\t"\
1186         "psraw $2, %%mm0            \n\t"/* (a-b)/4 */\
1187         "psraw $2, %%mm3            \n\t"\
1188         "psubw %%mm1, %%mm0         \n\t"/* (a-b)/4 - b */\
1189         "psubw %%mm4, %%mm3         \n\t"\
1190         "paddsw %%mm2, %%mm0        \n\t"/* (a-b)/4 - b + c, saturating add */\
1191         "paddsw %%mm5, %%mm3        \n\t"\
1192         "psraw $2, %%mm0            \n\t"/* ((a-b)/4 - b + c)/4 */\
1193         "psraw $2, %%mm3            \n\t"\
1194         "paddw %%mm2, %%mm0         \n\t"/* ((a-b)/4 - b + c)/4 + c */\
1195         "paddw %%mm5, %%mm3         \n\t"\
1196         "psraw $6, %%mm0            \n\t"\
1197         "psraw $6, %%mm3            \n\t"\
1198         "packuswb %%mm3, %%mm0      \n\t"/* 8 saturated bytes: low group then high group */\
1199         OP(%%mm0, (%1),%%mm7, q)\
1200         "add $48, %0                \n\t"/* next tmp row (48 bytes) */\
1201         "add %3, %1                 \n\t"/* dst += dstStride */\
1202         "decl %2                    \n\t"\
1203         " jnz 1b                    \n\t"\
1204         : "+a"(tmp), "+c"(dst), "+g"(h)\
1205         : "S"((long)dstStride)\
1206         : "memory"\
1207     );\
1208     tmp += 8 - size*24;/* rewind size rows (24 int16 = 48 bytes each) and move 8 columns right */\
1209     dst += 8 - size*dstStride;/* rewind size rows and move 8 pixels right */\
1210     }while(w--);\
1211 }\
1212 \
1213 static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){/* 8-row vertical lowpass: forwards to the shared 8-or-16 routine with h = 8 */\
1214     OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
1215 }\
1216 static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){/* 16-wide vertical lowpass: two 8-pixel-wide columns of 16 rows each */\
1217     OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 16);/* left column */\
1218     OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);/* right column, 8 pixels further */\
1219 }\
1220 \
1221 static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){/* 16-wide horizontal lowpass built from four calls to the 8-wide, 8-row routine */\
1222     OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);/* rows 0..7, left half */\
1223     OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);/* rows 0..7, right half */\
1224     src += 8*srcStride;/* step both pointers down by eight rows */\
1225     dst += 8*dstStride;\
1226     OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);/* rows 8..15, left half */\
1227     OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);/* rows 8..15, right half */\
1228 }\
1229 \
1230 static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){/* 16-wide horizontal lowpass combined with src2, built from four calls to the 8-wide, 8-row l2 routine */\
1231     OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);/* rows 0..7, left half */\
1232     OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);/* rows 0..7, right half */\
1233     src += 8*dstStride;/* src is stepped with dstStride, matching the 8-wide l2 routine which also advances src by dstStride */\
1234     dst += 8*dstStride;\
1235     src2 += 8*src2Stride;\
1236     OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);/* rows 8..15, left half */\
1237     OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);/* rows 8..15, right half */\
1238 }\
1239 \
1240 static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){/* two-pass lowpass for size 8 or 16: vertical pass into tmp, then horizontal pass into dst */\
1241           put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);/* the put_ instance is named explicitly, whatever OPNAME is; the first pass never invokes OP */\
1242     OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);/* the second pass applies OP */\
1243 }\
1244 static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){/* two-pass lowpass with size = 8: forwards to the shared 8-or-16 routine */\
1245     OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst  , tmp  , src  , dstStride, tmpStride, srcStride, 8);\
1246 }\
1247 \
1248 static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){/* two-pass lowpass with size = 16: forwards to the shared 8-or-16 routine */\
1249     OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst  , tmp  , src  , dstStride, tmpStride, srcStride, 16);\
1250 }\
1251 \
1252 static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)/* combines four rows of 16-bit values (src16 >> 5, saturated to bytes; rows 24 bytes apart) with four rows of bytes from src8 via PAVGB and passes them to OP; h is not used, four rows are always processed */\
1253 {\
1254     asm volatile(\
1255         "movq      (%1), %%mm0          \n\t"/* src16 row 0 */\
1256         "movq    24(%1), %%mm1          \n\t"/* src16 row 1 */\
1257         "psraw      $5,  %%mm0          \n\t"\
1258         "psraw      $5,  %%mm1          \n\t"\
1259         "packuswb %%mm0, %%mm0          \n\t"\
1260         "packuswb %%mm1, %%mm1          \n\t"\
1261         PAVGB"     (%0), %%mm0          \n\t"/* with src8 row 0 */\
1262         PAVGB"  (%0,%3), %%mm1          \n\t"/* with src8 row 1 */\
1263         OP(%%mm0, (%2),    %%mm4, d)\
1264         OP(%%mm1, (%2,%4), %%mm5, d)\
1265         "lea  (%0,%3,2), %0             \n\t"/* src8 += 2*src8Stride */\
1266         "lea  (%2,%4,2), %2             \n\t"/* dst  += 2*dstStride */\
1267         "movq    48(%1), %%mm0          \n\t"/* src16 row 2 */\
1268         "movq    72(%1), %%mm1          \n\t"/* src16 row 3 */\
1269         "psraw      $5,  %%mm0          \n\t"\
1270         "psraw      $5,  %%mm1          \n\t"\
1271         "packuswb %%mm0, %%mm0          \n\t"\
1272         "packuswb %%mm1, %%mm1          \n\t"\
1273         PAVGB"     (%0), %%mm0          \n\t"\
1274         PAVGB"  (%0,%3), %%mm1          \n\t"\
1275         OP(%%mm0, (%2),    %%mm4, d)\
1276         OP(%%mm1, (%2,%4), %%mm5, d)\
1277         :"+a"(src8), "+c"(src16), "+d"(dst)\
1278         :"S"((long)src8Stride), "D"((long)dstStride)\
1279         :"memory");\
1280 }\
1281 static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)/* 8-pixel-wide version: h rows of src16 >> 5 (saturated to bytes; rows 48 bytes apart) combined with src8 via PAVGB and passed to OP, two rows per loop pass */\
1282 {\
1283     do{\
1284     asm volatile(\
1285         "movq      (%1), %%mm0          \n\t"/* row 0, words 0..3 */\
1286         "movq     8(%1), %%mm1          \n\t"/* row 0, words 4..7 */\
1287         "movq    48(%1), %%mm2          \n\t"/* row 1, words 0..3 */\
1288         "movq  8+48(%1), %%mm3          \n\t"/* row 1, words 4..7 */\
1289         "psraw      $5,  %%mm0          \n\t"\
1290         "psraw      $5,  %%mm1          \n\t"\
1291         "psraw      $5,  %%mm2          \n\t"\
1292         "psraw      $5,  %%mm3          \n\t"\
1293         "packuswb %%mm1, %%mm0          \n\t"\
1294         "packuswb %%mm3, %%mm2          \n\t"\
1295         PAVGB"     (%0), %%mm0          \n\t"/* with src8 row 0 */\
1296         PAVGB"  (%0,%3), %%mm2          \n\t"/* with src8 row 1 */\
1297         OP(%%mm0, (%2), %%mm5, q)\
1298         OP(%%mm2, (%2,%4), %%mm5, q)\
1299         ::"a"(src8), "c"(src16), "d"(dst),\
1300           "r"((long)src8Stride), "r"((long)dstStride)\
1301         :"memory");\
1302         src8 += 2L*src8Stride;/* the pointers are inputs only, so they are advanced here in C */\
1303         src16 += 48;/* two rows of 24 int16 */\
1304         dst += 2L*dstStride;\
1305     }while(h-=2);/* ends only when h reaches exactly 0, so h is expected to be even and greater than 0 */\
1306 }\
1307 static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)/* 16-pixel-wide version built from two 8-pixel-wide halves */\
1308 {\
1309     OPNAME ## pixels8_l2_shift5_ ## MMX(dst  , src16  , src8  , dstStride, src8Stride, h);/* left half */\
1310     OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);/* right half: 8 pixels / 8 int16 further */\
1311 }\
1312 
1313 
1314 #ifdef ARCH_X86_64
/*
 * x86-64 variant of the 16-wide horizontal lowpass + src2 combine; it uses
 * xmm8..xmm15, which is why it sits under ARCH_X86_64.
 * Each of the 16 rows is filtered as two 8-pixel halves. After unpacking,
 * xmm7 / xmm0 / xmm1 hold the words src[-5..2] / src[3..10] / src[11..18];
 * palignr on the pairs (xmm0:xmm7) and (xmm1:xmm0) extracts the shifted
 * windows src[x-2] .. src[x+2] for the low and high half respectively, and
 * the unshifted xmm0 / xmm1 serve as src[x+3].
 * Operands: %0 = src, %1 = dst, %2 = src2, %3 = h, %4 = src2Stride,
 * %5 = dstStride, %6 = ff_pw_5, %7 = ff_pw_16.
 */
1315 #define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
1316 static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
1317     int h=16;/* rows left */\
1318     asm volatile(\
1319         "pxor %%xmm15, %%xmm15      \n\t"/* xmm15 = 0 for byte->word unpacking */\
1320         "movdqa %6, %%xmm14         \n\t"/* xmm14 = ff_pw_5 */\
1321         "movdqa %7, %%xmm13         \n\t"/* xmm13 = ff_pw_16 */\
1322         "1:                         \n\t"\
1323         "lddqu    3(%0), %%xmm1     \n\t"/* bytes src[3..18] */\
1324         "lddqu   -5(%0), %%xmm7     \n\t"/* bytes src[-5..10] */\
1325         "movdqa  %%xmm1, %%xmm0     \n\t"\
1326         "punpckhbw %%xmm15, %%xmm1  \n\t"/* xmm1 = words src[11..18] */\
1327         "punpcklbw %%xmm15, %%xmm0  \n\t"/* xmm0 = words src[3..10] */\
1328         "punpcklbw %%xmm15, %%xmm7  \n\t"/* xmm7 = words src[-5..2] */\
1329         "movdqa  %%xmm1, %%xmm2     \n\t"/* copies to be shifted: xmm2..xmm5 for the high half, */\
1330         "movdqa  %%xmm0, %%xmm6     \n\t"/* xmm6, xmm8..xmm10 for the low half */\
1331         "movdqa  %%xmm1, %%xmm3     \n\t"\
1332         "movdqa  %%xmm0, %%xmm8     \n\t"\
1333         "movdqa  %%xmm1, %%xmm4     \n\t"\
1334         "movdqa  %%xmm0, %%xmm9     \n\t"\
1335         "movdqa  %%xmm1, %%xmm5     \n\t"\
1336         "movdqa  %%xmm0, %%xmm10    \n\t"\
1337         "palignr $6, %%xmm0, %%xmm5 \n\t"/* src[x-2], high half */\
1338         "palignr $6, %%xmm7, %%xmm10\n\t"/* src[x-2], low half */\
1339         "palignr $8, %%xmm0, %%xmm4 \n\t"/* src[x-1] */\
1340         "palignr $8, %%xmm7, %%xmm9 \n\t"\
1341         "palignr $10,%%xmm0, %%xmm3 \n\t"/* src[x] */\
1342         "palignr $10,%%xmm7, %%xmm8 \n\t"\
1343         "paddw   %%xmm1, %%xmm5     \n\t"/* a = src[x-2] + src[x+3], high half */\
1344         "paddw   %%xmm0, %%xmm10    \n\t"/* a, low half */\
1345         "palignr $12,%%xmm0, %%xmm2 \n\t"/* src[x+1] */\
1346         "palignr $12,%%xmm7, %%xmm6 \n\t"\
1347         "palignr $14,%%xmm0, %%xmm1 \n\t"/* src[x+2] */\
1348         "palignr $14,%%xmm7, %%xmm0 \n\t"\
1349         "paddw   %%xmm3, %%xmm2     \n\t"/* c = src[x] + src[x+1], high half */\
1350         "paddw   %%xmm8, %%xmm6     \n\t"/* c, low half */\
1351         "paddw   %%xmm4, %%xmm1     \n\t"/* b = src[x-1] + src[x+2], high half */\
1352         "paddw   %%xmm9, %%xmm0     \n\t"/* b, low half */\
1353         "psllw   $2,     %%xmm2     \n\t"\
1354         "psllw   $2,     %%xmm6     \n\t"\
1355         "psubw   %%xmm1, %%xmm2     \n\t"/* 4*c - b */\
1356         "psubw   %%xmm0, %%xmm6     \n\t"\
1357         "paddw   %%xmm13,%%xmm5     \n\t"/* a + ff_pw_16 */\
1358         "paddw   %%xmm13,%%xmm10    \n\t"\
1359         "pmullw  %%xmm14,%%xmm2     \n\t"/* (4*c - b) * ff_pw_5 */\
1360         "pmullw  %%xmm14,%%xmm6     \n\t"\
1361         "lddqu   (%2),   %%xmm3     \n\t"/* 16 bytes of the second source */\
1362         "paddw   %%xmm5, %%xmm2     \n\t"/* full filter sums */\
1363         "paddw   %%xmm10,%%xmm6     \n\t"\
1364         "psraw   $5,     %%xmm2     \n\t"\
1365         "psraw   $5,     %%xmm6     \n\t"\
1366         "packuswb %%xmm2,%%xmm6     \n\t"/* 16 saturated bytes: low half then high half */\
1367         "pavgb   %%xmm3, %%xmm6     \n\t"/* rounded byte average with src2 */\
1368         OP(%%xmm6, (%1), %%xmm4, dqa)\
1369         "add %5, %0                 \n\t"/* src += dstStride - NOTE(review): src is stepped with the dst stride; presumably both strides are equal for all callers, confirm */\
1370         "add %5, %1                 \n\t"/* dst += dstStride */\
1371         "add %4, %2                 \n\t"/* src2 += src2Stride */\
1372         "decl %3                    \n\t"\
1373         "jg 1b                      \n\t"\
1374         : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
1375         : "D"((long)src2Stride), "S"((long)dstStride),\
1376           "m"(ff_pw_5), "m"(ff_pw_16)\
1377         : "memory"\
1378     );\
1379 }
1380 #else // ARCH_X86_64
/*
 * Variant used when ARCH_X86_64 is not defined: the 16-wide horizontal
 * lowpass + src2 combine is assembled from four calls to the 8-wide, 8-row
 * l2 routine of the same OPNAME / MMX instantiation.
 */
1381 #define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
1382 static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
1383     OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);/* rows 0..7, left half */\
1384     OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);/* rows 0..7, right half */\
1385     src += 8*dstStride;/* NOTE(review): src is stepped with dstStride; presumably src and dst share a stride for all callers, confirm */\
1386     dst += 8*dstStride;\
1387     src2 += 8*src2Stride;\
1388     OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);/* rows 8..15, left half */\
1389     OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);/* rows 8..15, right half */\
1390 }
1391 #endif // ARCH_X86_64
1392 
/**
 * Generate the palignr-based horizontal six-tap luma filters for one store
 * operation.
 *
 * OPNAME is the function-name prefix, MMX the function-name suffix and OP
 * the store macro, invoked as OP(result_reg, dst_mem, scratch_reg, size).
 * The expansion defines:
 *   - OPNAME h264_qpel8_h_lowpass_l2_MMX: filters 8 pixels x 8 rows and
 *     pavgb-averages the packed result with 8 bytes read from src2 before
 *     handing it to OP.
 *   - whatever QPEL_H264_H16_XMM expands to (defined outside this macro;
 *     presumably the 16-wide l2 variant -- confirm at its definition).
 *   - OPNAME h264_qpel8_h_lowpass_MMX: the same filter without src2.
 *   - OPNAME h264_qpel16_h_lowpass_MMX: 16x16, built from four 8x8 calls.
 *
 * Each row loads 16 unaligned bytes starting at src-5, widens them to
 * 16-bit lanes and evaluates, per output pixel x,
 *   (20*(s[x]+s[x+1]) - 5*(s[x-1]+s[x+2]) + (s[x-2]+s[x+3]) + 16) >> 5
 * followed by a pack with unsigned saturation.  This assumes ff_pw_5 and
 * ff_pw_16 hold the packed words 5 and 16 their names indicate.
 */
#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    int h=8;\
    /* xmm7 = 0 and xmm6 = ff_pw_5 are set once here; the per-row asm */\
    /* statement below relies on both registers keeping these values. */\
    asm volatile(\
        "pxor %%xmm7, %%xmm7        \n\t"\
        "movdqa %0, %%xmm6          \n\t"\
        :: "m"(ff_pw_5)\
    );\
    do{\
    asm volatile(\
        "lddqu   -5(%0), %%xmm1     \n\t"\
        "movdqa  %%xmm1, %%xmm0     \n\t"\
        "punpckhbw %%xmm7, %%xmm1   \n\t"\
        "punpcklbw %%xmm7, %%xmm0   \n\t"\
        "movdqa  %%xmm1, %%xmm2     \n\t"\
        "movdqa  %%xmm1, %%xmm3     \n\t"\
        "movdqa  %%xmm1, %%xmm4     \n\t"\
        "movdqa  %%xmm1, %%xmm5     \n\t"\
        "palignr $6, %%xmm0, %%xmm5 \n\t"\
        "palignr $8, %%xmm0, %%xmm4 \n\t"\
        "palignr $10,%%xmm0, %%xmm3 \n\t"\
        "paddw   %%xmm1, %%xmm5     \n\t"\
        "palignr $12,%%xmm0, %%xmm2 \n\t"\
        "palignr $14,%%xmm0, %%xmm1 \n\t"\
        "paddw   %%xmm3, %%xmm2     \n\t"\
        "paddw   %%xmm4, %%xmm1     \n\t"\
        "psllw   $2,     %%xmm2     \n\t"\
        "movq    (%2),   %%xmm3     \n\t"\
        "psubw   %%xmm1, %%xmm2     \n\t"\
        "paddw   %5,     %%xmm5     \n\t"\
        "pmullw  %%xmm6, %%xmm2     \n\t"\
        "paddw   %%xmm5, %%xmm2     \n\t"\
        "psraw   $5,     %%xmm2     \n\t"\
        "packuswb %%xmm2, %%xmm2    \n\t"\
        "pavgb   %%xmm3, %%xmm2     \n\t"\
        OP(%%xmm2, (%1), %%xmm4, q)\
        /* src and dst both advance by dstStride (%4); src2 by src2Stride (%3) */\
        "add %4, %0                 \n\t"\
        "add %4, %1                 \n\t"\
        "add %3, %2                 \n\t"\
        : "+a"(src), "+c"(dst), "+d"(src2)\
        : "D"((long)src2Stride), "S"((long)dstStride),\
          "m"(ff_pw_16)\
        : "memory"\
    );\
    }while(--h);\
}\
QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    int h=8;\
    /* Single asm statement: register setup, then an 8-iteration row loop */\
    /* counted down in operand %2 (h).                                    */\
    asm volatile(\
        "pxor %%xmm7, %%xmm7        \n\t"\
        "movdqa %5, %%xmm6          \n\t"\
        "1:                         \n\t"\
        "lddqu   -5(%0), %%xmm1     \n\t"\
        "movdqa  %%xmm1, %%xmm0     \n\t"\
        "punpckhbw %%xmm7, %%xmm1   \n\t"\
        "punpcklbw %%xmm7, %%xmm0   \n\t"\
        "movdqa  %%xmm1, %%xmm2     \n\t"\
        "movdqa  %%xmm1, %%xmm3     \n\t"\
        "movdqa  %%xmm1, %%xmm4     \n\t"\
        "movdqa  %%xmm1, %%xmm5     \n\t"\
        "palignr $6, %%xmm0, %%xmm5 \n\t"\
        "palignr $8, %%xmm0, %%xmm4 \n\t"\
        "palignr $10,%%xmm0, %%xmm3 \n\t"\
        "paddw   %%xmm1, %%xmm5     \n\t"\
        "palignr $12,%%xmm0, %%xmm2 \n\t"\
        "palignr $14,%%xmm0, %%xmm1 \n\t"\
        "paddw   %%xmm3, %%xmm2     \n\t"\
        "paddw   %%xmm4, %%xmm1     \n\t"\
        "psllw   $2,     %%xmm2     \n\t"\
        "psubw   %%xmm1, %%xmm2     \n\t"\
        "paddw   %6,     %%xmm5     \n\t"\
        "pmullw  %%xmm6, %%xmm2     \n\t"\
        "paddw   %%xmm5, %%xmm2     \n\t"\
        "psraw   $5,     %%xmm2     \n\t"\
        "packuswb %%xmm2, %%xmm2    \n\t"\
        OP(%%xmm2, (%1), %%xmm4, q)\
        "add %3, %0                 \n\t"\
        "add %4, %1                 \n\t"\
        "decl %2                    \n\t"\
        " jnz 1b                    \n\t"\
        : "+a"(src), "+c"(dst), "+g"(h)\
        : "D"((long)srcStride), "S"((long)dstStride),\
          "m"(ff_pw_5), "m"(ff_pw_16)\
        : "memory"\
    );\
}\
static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    /* top-left and top-right 8x8 quadrants */\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    /* bottom-left and bottom-right 8x8 quadrants */\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}
1489 
/**
 * Generate the XMM vertical luma lowpass functions for one store operation.
 *
 * OPNAME / MMX are the function-name prefix / suffix; OP is forwarded to
 * QPEL_H264V_XMM, which is defined outside this macro and performs one
 * filter step per invocation (its exact output is not visible here).
 *
 * The expansion defines:
 *   - OPNAME h264_qpel8or16_v_lowpass_MMX(dst, src, dstStride, srcStride, h):
 *     processes an 8-pixel-wide column.  It preloads five source rows
 *     starting two rows above src, widens them to 16-bit lanes, then issues
 *     eight QPEL_H264V_XMM steps with the six registers xmm0..xmm5 rotated
 *     by one each step.  When h == 16 a second asm statement issues eight
 *     more steps, continuing the rotation; any other h behaves like 8.
 *   - OPNAME h264_qpel8_v_lowpass_MMX: the above with h = 8.
 *   - OPNAME h264_qpel16_v_lowpass_MMX: two 8-wide columns with h = 16.
 *
 * Asm operands: %0 src, %1 dst, %2 srcStride, %3 dstStride, %4 ff_pw_5,
 * %5 ff_pw_16.
 */
#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    src -= 2*srcStride;\
    \
    asm volatile(\
        "pxor %%xmm7, %%xmm7        \n\t"\
        "movq (%0), %%xmm0          \n\t"\
        "add %2, %0                 \n\t"\
        "movq (%0), %%xmm1          \n\t"\
        "add %2, %0                 \n\t"\
        "movq (%0), %%xmm2          \n\t"\
        "add %2, %0                 \n\t"\
        "movq (%0), %%xmm3          \n\t"\
        "add %2, %0                 \n\t"\
        "movq (%0), %%xmm4          \n\t"\
        "add %2, %0                 \n\t"\
        "punpcklbw %%xmm7, %%xmm0   \n\t"\
        "punpcklbw %%xmm7, %%xmm1   \n\t"\
        "punpcklbw %%xmm7, %%xmm2   \n\t"\
        "punpcklbw %%xmm7, %%xmm3   \n\t"\
        "punpcklbw %%xmm7, %%xmm4   \n\t"\
        QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
        QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
        QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
        QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
        QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
        QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
        QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
        QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
         \
        : "+a"(src), "+c"(dst)\
        : "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
        : "memory"\
    );\
    if(h==16){\
        /* No reload: this statement depends on xmm0..xmm5 and xmm7 still */\
        /* holding what the previous asm statement left in them.          */\
        asm volatile(\
            QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
            QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
            QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
            QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
            QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
            QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
            QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
            QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
            \
            : "+a"(src), "+c"(dst)\
            : "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
            : "memory"\
        );\
    }\
}\
static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
}\
static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    /* left and right 8-pixel columns, 16 rows each */\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}
1548 
put_h264_qpel8or16_hv1_lowpass_sse2(int16_t * tmp,uint8_t * src,int tmpStride,int srcStride,int size)1549 static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){
1550     int w = (size+8)>>3;
1551     src -= 2*srcStride+2;
1552     while(w--){
1553         asm volatile(
1554             "pxor %%xmm7, %%xmm7        \n\t"
1555             "movq (%0), %%xmm0          \n\t"
1556             "add %2, %0                 \n\t"
1557             "movq (%0), %%xmm1          \n\t"
1558             "add %2, %0                 \n\t"
1559             "movq (%0), %%xmm2          \n\t"
1560             "add %2, %0                 \n\t"
1561             "movq (%0), %%xmm3          \n\t"
1562             "add %2, %0                 \n\t"
1563             "movq (%0), %%xmm4          \n\t"
1564             "add %2, %0                 \n\t"
1565             "punpcklbw %%xmm7, %%xmm0   \n\t"
1566             "punpcklbw %%xmm7, %%xmm1   \n\t"
1567             "punpcklbw %%xmm7, %%xmm2   \n\t"
1568             "punpcklbw %%xmm7, %%xmm3   \n\t"
1569             "punpcklbw %%xmm7, %%xmm4   \n\t"
1570             QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48)
1571             QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48)
1572             QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48)
1573             QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48)
1574             QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48)
1575             QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48)
1576             QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48)
1577             QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48)
1578             : "+a"(src)
1579             : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
1580             : "memory"
1581         );
1582         if(size==16){
1583             asm volatile(
1584                 QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1,  8*48)
1585                 QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2,  9*48)
1586                 QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48)
1587                 QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48)
1588                 QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48)
1589                 QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48)
1590                 QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48)
1591                 QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48)
1592                 : "+a"(src)
1593                 : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
1594                 : "memory"
1595             );
1596         }
1597         tmp += 8;
1598         src += 8 - (size+5)*srcStride;
1599     }
1600 }
1601 
/**
 * Generate OPNAME h264_qpel8or16_hv2_lowpass_MMX, the second (horizontal)
 * pass of the 2-D luma lowpass.  It reads the 16-bit intermediates in tmp
 * (rows 48 bytes apart, 16-byte aligned because movdqa is used) and stores
 * packed 8-bit pixels through OP.
 *
 * With a = t[x]+t[x+5], b = t[x+1]+t[x+4], c = t[x+2]+t[x+3], each output
 * is computed with 16-bit arithmetic as
 *   ((((a - b) >> 2) - b + c) >> 2) + c) >> 6
 * and packed with unsigned saturation.
 *
 *   size == 16: three 16-byte loads per row, 16 pixels stored (OP size dqa)
 *   otherwise : two 16-byte loads per row, 8 pixels stored (OP size q)
 *
 * `size` rows are processed; dst advances by dstStride per row.
 * tmpStride is accepted for interface symmetry but not used.
 */
#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\
static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
    int h = size;\
    if(size == 16){\
        asm volatile(\
            "1:                         \n\t"\
            "movdqa 32(%0), %%xmm4      \n\t"\
            "movdqa 16(%0), %%xmm5      \n\t"\
            "movdqa   (%0), %%xmm7      \n\t"\
            "movdqa %%xmm4, %%xmm3      \n\t"\
            "movdqa %%xmm4, %%xmm2      \n\t"\
            "movdqa %%xmm4, %%xmm1      \n\t"\
            "movdqa %%xmm4, %%xmm0      \n\t"\
            "palignr $10, %%xmm5, %%xmm0 \n\t"\
            "palignr  $8, %%xmm5, %%xmm1 \n\t"\
            "palignr  $6, %%xmm5, %%xmm2 \n\t"\
            "palignr  $4, %%xmm5, %%xmm3 \n\t"\
            "palignr  $2, %%xmm5, %%xmm4 \n\t"\
            "paddw  %%xmm5, %%xmm0      \n\t"\
            "paddw  %%xmm4, %%xmm1      \n\t"\
            "paddw  %%xmm3, %%xmm2      \n\t"\
            "movdqa %%xmm5, %%xmm6      \n\t"\
            "movdqa %%xmm5, %%xmm4      \n\t"\
            "movdqa %%xmm5, %%xmm3      \n\t"\
            "palignr  $8, %%xmm7, %%xmm4 \n\t"\
            "palignr  $2, %%xmm7, %%xmm6 \n\t"\
            "palignr $10, %%xmm7, %%xmm3 \n\t"\
            "paddw  %%xmm6, %%xmm4      \n\t"\
            "movdqa %%xmm5, %%xmm6      \n\t"\
            "palignr  $6, %%xmm7, %%xmm5 \n\t"\
            "palignr  $4, %%xmm7, %%xmm6 \n\t"\
            "paddw  %%xmm7, %%xmm3      \n\t"\
            "paddw  %%xmm6, %%xmm5      \n\t"\
            \
            /* xmm0/xmm1/xmm2 = a/b/c for pixels 8..15,                */\
            /* xmm3/xmm4/xmm5 = a/b/c for pixels 0..7                  */\
            "psubw  %%xmm1, %%xmm0      \n\t"\
            "psubw  %%xmm4, %%xmm3      \n\t"\
            "psraw      $2, %%xmm0      \n\t"\
            "psraw      $2, %%xmm3      \n\t"\
            "psubw  %%xmm1, %%xmm0      \n\t"\
            "psubw  %%xmm4, %%xmm3      \n\t"\
            "paddw  %%xmm2, %%xmm0      \n\t"\
            "paddw  %%xmm5, %%xmm3      \n\t"\
            "psraw      $2, %%xmm0      \n\t"\
            "psraw      $2, %%xmm3      \n\t"\
            "paddw  %%xmm2, %%xmm0      \n\t"\
            "paddw  %%xmm5, %%xmm3      \n\t"\
            "psraw      $6, %%xmm0      \n\t"\
            "psraw      $6, %%xmm3      \n\t"\
            "packuswb %%xmm0, %%xmm3    \n\t"\
            OP(%%xmm3, (%1), %%xmm7, dqa)\
            "add $48, %0                \n\t"\
            "add %3, %1                 \n\t"\
            "decl %2                    \n\t"\
            " jnz 1b                    \n\t"\
            : "+a"(tmp), "+c"(dst), "+g"(h)\
            : "S"((long)dstStride)\
            : "memory"\
        );\
    }else{\
        asm volatile(\
            "1:                         \n\t"\
            "movdqa 16(%0), %%xmm1      \n\t"\
            "movdqa   (%0), %%xmm0      \n\t"\
            "movdqa %%xmm1, %%xmm2      \n\t"\
            "movdqa %%xmm1, %%xmm3      \n\t"\
            "movdqa %%xmm1, %%xmm4      \n\t"\
            "movdqa %%xmm1, %%xmm5      \n\t"\
            "palignr $10, %%xmm0, %%xmm5 \n\t"\
            "palignr  $8, %%xmm0, %%xmm4 \n\t"\
            "palignr  $6, %%xmm0, %%xmm3 \n\t"\
            "palignr  $4, %%xmm0, %%xmm2 \n\t"\
            "palignr  $2, %%xmm0, %%xmm1 \n\t"\
            /* xmm0 = a, xmm1 = b, xmm2 = c */\
            "paddw  %%xmm5, %%xmm0      \n\t"\
            "paddw  %%xmm4, %%xmm1      \n\t"\
            "paddw  %%xmm3, %%xmm2      \n\t"\
            "psubw  %%xmm1, %%xmm0      \n\t"\
            "psraw      $2, %%xmm0      \n\t"\
            "psubw  %%xmm1, %%xmm0      \n\t"\
            "paddw  %%xmm2, %%xmm0      \n\t"\
            "psraw      $2, %%xmm0      \n\t"\
            "paddw  %%xmm2, %%xmm0      \n\t"\
            "psraw      $6, %%xmm0      \n\t"\
            "packuswb %%xmm0, %%xmm0    \n\t"\
            OP(%%xmm0, (%1), %%xmm7, q)\
            "add $48, %0                \n\t"\
            "add %3, %1                 \n\t"\
            "decl %2                    \n\t"\
            " jnz 1b                    \n\t"\
            : "+a"(tmp), "+c"(dst), "+g"(h)\
            : "S"((long)dstStride)\
            : "memory"\
        );\
    }\
}
1696 
/**
 * Generate the 2-D (horizontal + vertical) luma lowpass entry points.
 *
 *   - OPNAME h264_qpel8or16_hv_lowpass_MMX: runs the shared SSE2 first pass
 *     (put_h264_qpel8or16_hv1_lowpass_sse2) into tmp, then the
 *     OPNAME h264_qpel8or16_hv2_lowpass_MMX second pass from tmp into dst.
 *   - OPNAME h264_qpel8_hv_lowpass_MMX / h264_qpel16_hv_lowpass_MMX:
 *     fixed-size front ends passing size 8 / 16.
 *
 * The hv2 function named here must already exist (or be aliased) for the
 * given MMX suffix when this macro is instantiated.
 */
#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
          put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
    OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
}\
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
}
1708 
/*
 * Name aliases.  The H264_MC_* macros build callee names by token pasting
 * the MMX suffix, so every helper they reference must exist under the
 * _sse2 / _ssse3 suffix too.  Helpers without a dedicated XMM version are
 * mapped onto an existing implementation here.
 */

/* two-source pixel averaging: use the mmx2 versions */
#define put_pixels8_l2_sse2 put_pixels8_l2_mmx2
#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2
#define put_pixels16_l2_sse2 put_pixels16_l2_mmx2
#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2
#define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2
#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2
#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2
#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2

/* averaging against the 16-bit intermediate (shift5): use the mmx2 versions */
#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2
#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2
#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2
#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2
#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2
#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2
#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2
#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2

/* horizontal lowpass + average: sse2 falls back to mmx2 */
#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2
#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2
#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2
#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2

/* vertical lowpass: ssse3 reuses the sse2 functions */
#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2
#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2
#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2
#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2

/* second hv pass: sse2 falls back to mmx2 */
#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2
#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2
1739 
/**
 * Emit the complete set of quarter-pel motion-compensation functions for
 * one (OPNAME, SIZE, MMX) combination: the copy case (H264_MC_C), the
 * vertical-only (H264_MC_V), horizontal-only (H264_MC_H) and mixed
 * (H264_MC_HV) positions.  ALIGN is the alignment requested for on-stack
 * scratch buffers.
 */
#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)
1745 
/** mc00 position (no interpolation), 16x16, put: forwards to put_pixels16_sse2 with 16 rows. */
static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_sse2(dst, src, stride, 16);
}
/** mc00 position (no interpolation), 16x16, avg: forwards to avg_pixels16_sse2 with 16 rows. */
static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_sse2(dst, src, stride, 16);
}
/* The 8x8 mc00 case has no sse2 version; reuse the mmx2 functions. */
#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2
#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2
1754 
/**
 * Emit the mc00 function (integer position): a plain OPNAME pixels SIZE
 * call over SIZE rows.  ALIGN is unused by this variant.
 */
#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
}
1759 
/**
 * Emit the horizontal-only positions.
 *   mc20: horizontal lowpass of src.
 *   mc10: horizontal lowpass combined (by the _l2 function) with src.
 *   mc30: horizontal lowpass combined with src+1.
 * ALIGN is unused by this variant.
 */
#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
}
1772 
/**
 * Emit the vertical-only positions.
 *   mc02: vertical lowpass of src straight into dst.
 *   mc01: vertical lowpass into an aligned SIZE x SIZE scratch block, then
 *         OPNAME pixels SIZE _l2 combines src with that block.
 *   mc03: as mc01, but combining src+stride (the row below) instead.
 * The scratch block is always produced with the put_ variant, whatever
 * OPNAME is.
 */
#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
}
1789 
/**
 * Emit the positions that need both a horizontal and a vertical filter.
 *
 *   mc11/mc31/mc13/mc33: vertical lowpass of src (or src+1) into a
 *       SIZE x SIZE scratch block, then h_lowpass_l2 of src (or
 *       src+stride) combined with that block.
 *   mc22: the 2-D lowpass straight into dst, with a 16-bit scratch buffer
 *       of SIZE*(SIZE<8?12:24) entries.
 *   mc21/mc23: 2-D lowpass into halfHV, then h_lowpass_l2 of src (or
 *       src+stride) combined with halfHV.
 *   mc12/mc32: 2-D lowpass into halfHV, then pixels_l2_shift5 combines
 *       halfHV with the 16-bit intermediate at halfV+2 (or halfV+3).
 *
 * For mc21/mc23/mc12/mc32 one byte buffer holds both results: halfHV is
 * its first SIZE*SIZE bytes, halfV the int16_t area that follows.
 *
 * The alignment asserts test the low address bits through intptr_t so the
 * pointer is not truncated by a cast to int on 64-bit targets.
 */
#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    /* int16_t matches the tmp parameter type of the hv_lowpass functions */\
    DECLARE_ALIGNED(ALIGN, int16_t, temp[SIZE*(SIZE<8?12:24)]);\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((intptr_t)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((intptr_t)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((intptr_t)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((intptr_t)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
}
1855 
/**
 * Instantiate the full H264_MC function set for block sizes 4, 8 and 16,
 * for both put_ and avg_, with 8-byte aligned scratch buffers.
 */
#define H264_MC_4816(MMX)\
H264_MC(put_, 4, MMX, 8)\
H264_MC(put_, 8, MMX, 8)\
H264_MC(put_, 16,MMX, 8)\
H264_MC(avg_, 4, MMX, 8)\
H264_MC(avg_, 8, MMX, 8)\
H264_MC(avg_, 16,MMX, 8)
1863 
/**
 * Instantiate one H264_MC_* generator (passed as QPEL) for block sizes 8
 * and 16, for both put_ and avg_, with 16-byte aligned scratch buffers.
 */
#define H264_MC_816(QPEL, XMM)\
QPEL(put_, 8, XMM, 16)\
QPEL(put_, 16,XMM, 16)\
QPEL(avg_, 8, XMM, 16)\
QPEL(avg_, 16,XMM, 16)
1869 
1870 
/*
 * Averaging store for 3DNow!: load the current destination b into temp,
 * pavgusb it into a, and write a back to b.  `size` is the mov suffix
 * selecting the access width.
 */
#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp "   \n\t"\
"pavgusb " #temp ", " #a "        \n\t"\
"mov" #size " " #a ", " #b "      \n\t"
/*
 * Averaging store for MMX2 and later: load the current destination b into
 * temp, pavgb it into a, and write a back to b.  `size` is the mov suffix
 * selecting the access width.
 */
#define AVG_MMX2_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp "   \n\t"\
"pavgb " #temp ", " #a "          \n\t"\
"mov" #size " " #a ", " #b "      \n\t"
1879 
1880 #define PAVGB "pavgusb"
1881 QPEL_H264(put_,       PUT_OP, 3dnow)
1882 QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow)
1883 #undef PAVGB
1884 #define PAVGB "pavgb"
QPEL_H264(put_,PUT_OP,mmx2)1885 QPEL_H264(put_,       PUT_OP, mmx2)
1886 QPEL_H264(avg_,  AVG_MMX2_OP, mmx2)
1887 QPEL_H264_V_XMM(put_,       PUT_OP, sse2)
1888 QPEL_H264_V_XMM(avg_,  AVG_MMX2_OP, sse2)
1889 QPEL_H264_HV_XMM(put_,       PUT_OP, sse2)
1890 QPEL_H264_HV_XMM(avg_,  AVG_MMX2_OP, sse2)
1891 #ifdef HAVE_SSSE3
1892 QPEL_H264_H_XMM(put_,       PUT_OP, ssse3)
1893 QPEL_H264_H_XMM(avg_,  AVG_MMX2_OP, ssse3)
1894 QPEL_H264_HV2_XMM(put_,       PUT_OP, ssse3)
1895 QPEL_H264_HV2_XMM(avg_,  AVG_MMX2_OP, ssse3)
1896 QPEL_H264_HV_XMM(put_,       PUT_OP, ssse3)
1897 QPEL_H264_HV_XMM(avg_,  AVG_MMX2_OP, ssse3)
1898 #endif
1899 #undef PAVGB
1900 
1901 H264_MC_4816(3dnow)
1902 H264_MC_4816(mmx2)
1903 H264_MC_816(H264_MC_V, sse2)
1904 H264_MC_816(H264_MC_HV, sse2)
1905 #ifdef HAVE_SSSE3
1906 H264_MC_816(H264_MC_H, ssse3)
1907 H264_MC_816(H264_MC_HV, ssse3)
1908 #endif
1909 
1910 
1911 #define H264_CHROMA_OP(S,D)
1912 #define H264_CHROMA_OP4(S,D,T)
1913 #define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_mmx
1914 #define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_mmx
1915 #define H264_CHROMA_MC2_TMPL put_h264_chroma_mc2_mmx2
1916 #define H264_CHROMA_MC8_MV0 put_pixels8_mmx
1917 #include "dsputil_h264_template_mmx.c"
1918 
/**
 * Six-argument front end for put_h264_chroma_mc8_mmx with the template's
 * trailing argument fixed to 1 (the rounding variant, going by the _rnd
 * name; see the template for that argument's exact meaning).
 */
static void put_h264_chroma_mc8_mmx_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    put_h264_chroma_mc8_mmx(dst, src, stride, h, x, y, 1);
}
/**
 * Six-argument front end for put_h264_chroma_mc8_mmx with the template's
 * trailing argument fixed to 0 (the no-rounding variant, going by the
 * _nornd name; see the template for that argument's exact meaning).
 */
static void put_h264_chroma_mc8_mmx_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    put_h264_chroma_mc8_mmx(dst, src, stride, h, x, y, 0);
}
1927 
1928 #undef H264_CHROMA_OP
1929 #undef H264_CHROMA_OP4
1930 #undef H264_CHROMA_MC8_TMPL
1931 #undef H264_CHROMA_MC4_TMPL
1932 #undef H264_CHROMA_MC2_TMPL
1933 #undef H264_CHROMA_MC8_MV0
1934 
1935 #define H264_CHROMA_OP(S,D) "pavgb " #S ", " #D " \n\t"
1936 #define H264_CHROMA_OP4(S,D,T) "movd  " #S ", " #T " \n\t"\
1937                                "pavgb " #T ", " #D " \n\t"
1938 #define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_mmx2
1939 #define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_mmx2
1940 #define H264_CHROMA_MC2_TMPL avg_h264_chroma_mc2_mmx2
1941 #define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2
1942 #include "dsputil_h264_template_mmx.c"
/**
 * Six-argument front end for avg_h264_chroma_mc8_mmx2 with the template's
 * trailing argument fixed to 1.
 */
static void avg_h264_chroma_mc8_mmx2_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    avg_h264_chroma_mc8_mmx2(dst, src, stride, h, x, y, 1);
}
1947 #undef H264_CHROMA_OP
1948 #undef H264_CHROMA_OP4
1949 #undef H264_CHROMA_MC8_TMPL
1950 #undef H264_CHROMA_MC4_TMPL
1951 #undef H264_CHROMA_MC2_TMPL
1952 #undef H264_CHROMA_MC8_MV0
1953 
1954 #define H264_CHROMA_OP(S,D) "pavgusb " #S ", " #D " \n\t"
1955 #define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\
1956                                "pavgusb " #T ", " #D " \n\t"
1957 #define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_3dnow
1958 #define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_3dnow
1959 #define H264_CHROMA_MC8_MV0 avg_pixels8_3dnow
1960 #include "dsputil_h264_template_mmx.c"
/**
 * Six-argument front end for avg_h264_chroma_mc8_3dnow with the template's
 * trailing argument fixed to 1.
 */
static void avg_h264_chroma_mc8_3dnow_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    avg_h264_chroma_mc8_3dnow(dst, src, stride, h, x, y, 1);
}
1965 #undef H264_CHROMA_OP
1966 #undef H264_CHROMA_OP4
1967 #undef H264_CHROMA_MC8_TMPL
1968 #undef H264_CHROMA_MC4_TMPL
1969 #undef H264_CHROMA_MC8_MV0
1970 
1971 #ifdef HAVE_SSSE3
1972 #define AVG_OP(X)
1973 #undef H264_CHROMA_MC8_TMPL
1974 #undef H264_CHROMA_MC4_TMPL
1975 #define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_ssse3
1976 #define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_ssse3
1977 #define H264_CHROMA_MC8_MV0 put_pixels8_mmx
1978 #include "dsputil_h264_template_ssse3.c"
/**
 * Six-argument front end for put_h264_chroma_mc8_ssse3 with the template's
 * trailing argument fixed to 1.
 */
static void put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1);
}
/**
 * Six-argument front end for put_h264_chroma_mc8_ssse3 with the template's
 * trailing argument fixed to 0.
 */
static void put_h264_chroma_mc8_ssse3_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 0);
}
1987 
1988 #undef AVG_OP
1989 #undef H264_CHROMA_MC8_TMPL
1990 #undef H264_CHROMA_MC4_TMPL
1991 #undef H264_CHROMA_MC8_MV0
1992 #define AVG_OP(X) X
1993 #define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_ssse3
1994 #define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_ssse3
1995 #define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2
1996 #include "dsputil_h264_template_ssse3.c"
/**
 * Six-argument front end for avg_h264_chroma_mc8_ssse3 with the template's
 * trailing argument fixed to 1.
 */
static void avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    avg_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1);
}
2001 #undef AVG_OP
2002 #undef H264_CHROMA_MC8_TMPL
2003 #undef H264_CHROMA_MC4_TMPL
2004 #undef H264_CHROMA_MC8_MV0
2005 #endif
2006 
2007 /***********************************/
2008 /* weighted prediction */
2009 
/**
 * In-place explicit weighted prediction of a w x h block (MMX2).
 *
 * Every pixel p of dst is replaced by
 *   (p*weight + (offset << log2_denom) + ((1 << log2_denom) >> 1)) >> log2_denom
 * evaluated in signed 16-bit lanes (pmullw keeps the low 16 bits of the
 * product, paddsw saturates) and then packed to 0..255 with unsigned
 * saturation.
 *
 * Each inner iteration handles 4 pixels of two consecutive rows, so the
 * width is consumed in steps of 4 and the height in steps of 2.
 *
 * The constants are loaded once into mm4 (weight), mm5 (offset term),
 * mm6 (shift count) and mm7 (zero); the per-pixel asm statements rely on
 * those registers being left untouched in between.  No emms is executed
 * here.
 *
 * The setup operands use the "rm" constraint: movd needs a register or
 * memory source, so immediates must not be offered to the assembler.
 *
 * @param dst        block to modify in place
 * @param stride     byte distance between rows of dst
 * @param log2_denom right shift applied after weighting
 * @param weight     multiplier (its low 16 bits are used)
 * @param offset     additive offset, scaled by 1 << log2_denom internally
 * @param w          block width in pixels
 * @param h          block height in rows
 */
static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h)
{
    int x, y;
    offset <<= log2_denom;
    offset += (1 << log2_denom) >> 1;
    __asm__ volatile(
        "movd    %0, %%mm4        \n\t"
        "movd    %1, %%mm5        \n\t"
        "movd    %2, %%mm6        \n\t"
        "pshufw  $0, %%mm4, %%mm4 \n\t"
        "pshufw  $0, %%mm5, %%mm5 \n\t"
        "pxor    %%mm7, %%mm7     \n\t"
        :: "rm"(weight), "rm"(offset), "rm"(log2_denom)
    );
    for(y=0; y<h; y+=2){
        for(x=0; x<w; x+=4){
            /* %0 = four pixels of the current row, %1 = the four pixels
             * directly below them */
            __asm__ volatile(
                "movd      %0,    %%mm0 \n\t"
                "movd      %1,    %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm0 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "pmullw    %%mm4, %%mm0 \n\t"
                "pmullw    %%mm4, %%mm1 \n\t"
                "paddsw    %%mm5, %%mm0 \n\t"
                "paddsw    %%mm5, %%mm1 \n\t"
                "psraw     %%mm6, %%mm0 \n\t"
                "psraw     %%mm6, %%mm1 \n\t"
                "packuswb  %%mm7, %%mm0 \n\t"
                "packuswb  %%mm7, %%mm1 \n\t"
                "movd      %%mm0, %0    \n\t"
                "movd      %%mm1, %1    \n\t"
                : "+m"(*(uint32_t*)(dst+x)),
                  "+m"(*(uint32_t*)(dst+x+stride))
            );
        }
        dst += 2*stride;
    }
}
2048 
/**
 * Bidirectional weighted prediction of a w x h block (MMX2), result
 * written to dst.
 *
 * For every pixel pair (d from dst, s from src):
 *   d = (d*weightd + s*weights + (((offset + 1) | 1) << log2_denom))
 *           >> (log2_denom + 1)
 * evaluated in signed 16-bit lanes (pmullw keeps the low 16 bits of each
 * product, paddsw saturates) and packed to 0..255 with unsigned
 * saturation.
 *
 * One row is processed per outer iteration, 4 pixels per inner iteration;
 * dst and src share the same stride.
 *
 * The constants live in mm3 (weightd), mm4 (weights), mm5 (offset term),
 * mm6 (shift count) and mm7 (zero) across the per-pixel asm statements.
 * No emms is executed here.
 *
 * The setup operands use the "rm" constraint: movd needs a register or
 * memory source, so immediates must not be offered to the assembler.
 *
 * @param dst        first prediction, overwritten with the result
 * @param src        second prediction (read only)
 * @param stride     byte distance between rows of both buffers
 * @param log2_denom weighting denominator exponent
 * @param weightd    weight applied to dst pixels
 * @param weights    weight applied to src pixels
 * @param offset     combined offset before the internal rounding adjustment
 * @param w          block width in pixels
 * @param h          block height in rows
 */
static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset, int w, int h)
{
    int x, y;
    offset = ((offset + 1) | 1) << log2_denom;
    __asm__ volatile(
        "movd    %0, %%mm3        \n\t"
        "movd    %1, %%mm4        \n\t"
        "movd    %2, %%mm5        \n\t"
        "movd    %3, %%mm6        \n\t"
        "pshufw  $0, %%mm3, %%mm3 \n\t"
        "pshufw  $0, %%mm4, %%mm4 \n\t"
        "pshufw  $0, %%mm5, %%mm5 \n\t"
        "pxor    %%mm7, %%mm7     \n\t"
        :: "rm"(weightd), "rm"(weights), "rm"(offset), "rm"(log2_denom+1)
    );
    for(y=0; y<h; y++){
        for(x=0; x<w; x+=4){
            __asm__ volatile(
                "movd      %0,    %%mm0 \n\t"
                "movd      %1,    %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm0 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "pmullw    %%mm3, %%mm0 \n\t"
                "pmullw    %%mm4, %%mm1 \n\t"
                "paddsw    %%mm1, %%mm0 \n\t"
                "paddsw    %%mm5, %%mm0 \n\t"
                "psraw     %%mm6, %%mm0 \n\t"
                "packuswb  %%mm0, %%mm0 \n\t"
                "movd      %%mm0, %0    \n\t"
                : "+m"(*(uint32_t*)(dst+x))
                :  "m"(*(uint32_t*)(src+x))
            );
        }
        src += stride;
        dst += stride;
    }
}
2086 
/**
 * Emit the fixed-size weighted-prediction entry points
 * ff_h264_biweight_WxH_mmx2 and ff_h264_weight_WxH_mmx2 for one block
 * size; each forwards to the generic WxH routine with W and H appended.
 */
#define H264_WEIGHT(W,H) \
static void ff_h264_biweight_ ## W ## x ## H ## _mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    ff_h264_biweight_WxH_mmx2(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
} \
static void ff_h264_weight_ ## W ## x ## H ## _mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \
    ff_h264_weight_WxH_mmx2(dst, stride, log2_denom, weight, offset, W, H); \
}
2094 
2095 H264_WEIGHT(16,16)
2096 H264_WEIGHT(16, 8)
2097 H264_WEIGHT( 8,16)
2098 H264_WEIGHT( 8, 8)
2099 H264_WEIGHT( 8, 4)
2100 H264_WEIGHT( 4, 8)
2101 H264_WEIGHT( 4, 4)
2102 H264_WEIGHT( 4, 2)
2103 
2104