/*
 * SIMD-optimized motion estimation
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/me_cmp.h"
#include "libavcodec/mpegvideo.h"

int ff_sum_abs_dctelem_mmx(int16_t *block);
int ff_sum_abs_dctelem_mmxext(int16_t *block);
int ff_sum_abs_dctelem_sse2(int16_t *block);
int ff_sum_abs_dctelem_ssse3(int16_t *block);
int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                int line_size, int h);
int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                 int line_size, int h);
int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                  int line_size, int h);
int ff_hf_noise8_mmx(uint8_t *pix1, int lsize, int h);
int ff_hf_noise16_mmx(uint8_t *pix1, int lsize, int h);

#define hadamard_func(cpu)                                              \
    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,     \
                                  uint8_t *src2, int stride, int h);    \
    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1,   \
                                    uint8_t *src2, int stride, int h);

hadamard_func(mmx)
hadamard_func(mmxext)
hadamard_func(sse2)
hadamard_func(ssse3)

#if HAVE_YASM
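/*
 * Noise-preserving SSE: the plain SSE score is combined with the weighted
 * absolute difference in high-frequency "noise" between the two blocks, so
 * that a candidate which loses (or gains) detail is penalized even when its
 * SSE is low. The weight comes from avctx->nsse_weight when a context is
 * available, otherwise a default of 8 is used.
 */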
static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                      int line_size, int h)
{
    int score1, score2;

    if (c)
        score1 = c->mecc.sse[0](c, pix1, pix2, line_size, h);
    else
        score1 = ff_sse16_mmx(c, pix1, pix2, line_size, h);
    score2 = ff_hf_noise16_mmx(pix1, line_size, h) + ff_hf_noise8_mmx(pix1+8, line_size, h)
           - ff_hf_noise16_mmx(pix2, line_size, h) - ff_hf_noise8_mmx(pix2+8, line_size, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}

static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                     int line_size, int h)
{
    int score1 = ff_sse8_mmx(c, pix1, pix2, line_size, h);
    int score2 = ff_hf_noise8_mmx(pix1, line_size, h) -
                 ff_hf_noise8_mmx(pix2, line_size, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}

#endif /* HAVE_YASM */

#if HAVE_INLINE_ASM

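/*
 * Vertical SAD of a single 16-pixel-wide block: accumulates the absolute
 * difference between each row and the row above it, i.e. a measure of
 * vertical activity used as an intra cost. The byte-wise absolute difference
 * is built from two saturated subtractions (psubusb) OR'd together, then
 * widened to words and summed in %mm6.
 */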
static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                            int line_size, int h)
{
    int tmp;

    av_assert2((((int) pix) & 7) == 0);
    av_assert2((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1)               \
    "movq (%0), %%mm2\n"                        \
    "movq 8(%0), %%mm3\n"                       \
    "add %2,%0\n"                               \
    "movq %%mm2, " #out0 "\n"                   \
    "movq %%mm3, " #out1 "\n"                   \
    "psubusb " #in0 ", %%mm2\n"                 \
    "psubusb " #in1 ", %%mm3\n"                 \
    "psubusb " #out0 ", " #in0 "\n"             \
    "psubusb " #out1 ", " #in1 "\n"             \
    "por %%mm2, " #in0 "\n"                     \
    "por %%mm3, " #in1 "\n"                     \
    "movq " #in0 ", %%mm2\n"                    \
    "movq " #in1 ", %%mm3\n"                    \
    "punpcklbw %%mm7, " #in0 "\n"               \
    "punpcklbw %%mm7, " #in1 "\n"               \
    "punpckhbw %%mm7, %%mm2\n"                  \
    "punpckhbw %%mm7, %%mm3\n"                  \
    "paddw " #in1 ", " #in0 "\n"                \
    "paddw %%mm3, %%mm2\n"                      \
    "paddw %%mm2, " #in0 "\n"                   \
    "paddw " #in0 ", %%mm6\n"


    __asm__ volatile (
        "movl    %3, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pxor %%mm7, %%mm7\n"
        "movq  (%0), %%mm0\n"
        "movq 8(%0), %%mm1\n"
        "add %2, %0\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq  %%mm6, %%mm0\n"
        "psrlq $32,   %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq  %%mm0, %%mm6\n"
        "psrlq $16,   %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd  %%mm0, %1\n"
        : "+r" (pix), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp & 0xFFFF;
}
#undef SUM

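/* Same as vsad_intra16_mmx(), but the per-row SAD is done in a single
 * psadbw instruction. */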
static int vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                               int line_size, int h)
{
    int tmp;

    av_assert2((((int) pix) & 7) == 0);
    av_assert2((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1)               \
    "movq (%0), " #out0 "\n"                    \
    "movq 8(%0), " #out1 "\n"                   \
    "add %2, %0\n"                              \
    "psadbw " #out0 ", " #in0 "\n"              \
    "psadbw " #out1 ", " #in1 "\n"              \
    "paddw " #in1 ", " #in0 "\n"                \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pxor %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq 8(%0), %%mm1\n"
        "add %2, %0\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6, %1\n"
        : "+r" (pix), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp;
}
#undef SUM

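/*
 * Vertical SAD of the difference between two blocks: each row of pix1 - pix2
 * is compared against the previous difference row. The byte differences are
 * XOR'd with 0x80 (%mm7 is filled with 0x80 bytes), which biases the signed
 * result into unsigned range so the psubusb/por absolute-value trick can be
 * reused. The mmxext variant below does the same with psadbw.
 */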
static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      int line_size, int h)
{
    int tmp;

    av_assert2((((int) pix1) & 7) == 0);
    av_assert2((((int) pix2) & 7) == 0);
    av_assert2((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1)       \
    "movq (%0), %%mm2\n"                \
    "movq (%1), " #out0 "\n"            \
    "movq 8(%0), %%mm3\n"               \
    "movq 8(%1), " #out1 "\n"           \
    "add %3, %0\n"                      \
    "add %3, %1\n"                      \
    "psubb " #out0 ", %%mm2\n"          \
    "psubb " #out1 ", %%mm3\n"          \
    "pxor %%mm7, %%mm2\n"               \
    "pxor %%mm7, %%mm3\n"               \
    "movq %%mm2, " #out0 "\n"           \
    "movq %%mm3, " #out1 "\n"           \
    "psubusb " #in0 ", %%mm2\n"         \
    "psubusb " #in1 ", %%mm3\n"         \
    "psubusb " #out0 ", " #in0 "\n"     \
    "psubusb " #out1 ", " #in1 "\n"     \
    "por %%mm2, " #in0 "\n"             \
    "por %%mm3, " #in1 "\n"             \
    "movq " #in0 ", %%mm2\n"            \
    "movq " #in1 ", %%mm3\n"            \
    "punpcklbw %%mm7, " #in0 "\n"       \
    "punpcklbw %%mm7, " #in1 "\n"       \
    "punpckhbw %%mm7, %%mm2\n"          \
    "punpckhbw %%mm7, %%mm3\n"          \
    "paddw " #in1 ", " #in0 "\n"        \
    "paddw %%mm3, %%mm2\n"              \
    "paddw %%mm2, " #in0 "\n"           \
    "paddw " #in0 ", %%mm6\n"


    __asm__ volatile (
        "movl %4, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pcmpeqw %%mm7, %%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq (%1), %%mm2\n"
        "movq 8(%0), %%mm1\n"
        "movq 8(%1), %%mm3\n"
        "add %3, %0\n"
        "add %3, %1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd %%mm0, %2\n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp & 0x7FFF;
}
#undef SUM

static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         int line_size, int h)
{
    int tmp;

    av_assert2((((int) pix1) & 7) == 0);
    av_assert2((((int) pix2) & 7) == 0);
    av_assert2((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1)               \
    "movq (%0), " #out0 "\n"                    \
    "movq (%1), %%mm2\n"                        \
    "movq 8(%0), " #out1 "\n"                   \
    "movq 8(%1), %%mm3\n"                       \
    "add %3, %0\n"                              \
    "add %3, %1\n"                              \
    "psubb %%mm2, " #out0 "\n"                  \
    "psubb %%mm3, " #out1 "\n"                  \
    "pxor %%mm7, " #out0 "\n"                   \
    "pxor %%mm7, " #out1 "\n"                   \
    "psadbw " #out0 ", " #in0 "\n"              \
    "psadbw " #out1 ", " #in1 "\n"              \
    "paddw " #in1 ", " #in0 "\n"                \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %4, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pcmpeqw %%mm7, %%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq (%1), %%mm2\n"
        "movq 8(%0), %%mm1\n"
        "movq 8(%1), %%mm3\n"
        "add %3, %0\n"
        "add %3, %1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6, %2\n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp;
}
#undef SUM

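/* Rounding constants: round_tab[1] and round_tab[2] are the per-word biases
 * added before the >> 1 and >> 2 of the half-pel averaging code below; bone
 * is a byte-wise 1 used by sad8_4_mmxext() to roughly compensate the
 * round-up bias of chained pavgb instructions. */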
DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
    0x0000000000000000ULL,
    0x0001000100010001ULL,
    0x0002000200020002ULL,
};

DECLARE_ASM_CONST(8, uint64_t, bone) = 0x0101010101010101LL;

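/*
 * 8xh SAD, MMX core: per-byte |blk1 - blk2| built from two saturated
 * subtractions OR'd together, widened to words and accumulated in %mm6.
 * The caller must have cleared %mm6 and %mm7 beforehand (see the PIX_SAD()
 * wrappers below).
 */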
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    x86_reg len = -(x86_reg)stride * h;
    __asm__ volatile (
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm2    \n\t"
        "movq (%2, %%"REG_a"), %%mm4    \n\t"
        "add %3, %%"REG_a"              \n\t"
        "psubusb %%mm0, %%mm2           \n\t"
        "psubusb %%mm4, %%mm0           \n\t"
        "movq (%1, %%"REG_a"), %%mm1    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "movq (%2, %%"REG_a"), %%mm5    \n\t"
        "psubusb %%mm1, %%mm3           \n\t"
        "psubusb %%mm5, %%mm1           \n\t"
        "por %%mm2, %%mm0               \n\t"
        "por %%mm1, %%mm3               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm3, %%mm2              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm3         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm3, %%mm2             \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %3, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg) stride));
}

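/* 8xh SAD using psadbw, two rows per iteration, accumulated in %mm6
 * (which the caller has cleared). */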
static inline void sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2,
                                 int stride, int h)
{
    __asm__ volatile (
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2, %3), %%mm1         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg) stride));
}

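/*
 * 16xh SAD with SSE2: blk1 is loaded with movdqu since it may be unaligned,
 * while blk2 is used directly as a psadbw memory operand and is therefore
 * expected to be 16-byte aligned. The two 64-bit partial sums in %xmm2 are
 * folded with movhlps before the result is extracted.
 */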
static int sad16_sse2(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1,
                      int stride, int h)
{
    int ret;
    __asm__ volatile (
        "pxor %%xmm2, %%xmm2            \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movdqu (%1), %%xmm0            \n\t"
        "movdqu (%1, %4), %%xmm1        \n\t"
        "psadbw (%2), %%xmm0            \n\t"
        "psadbw (%2, %4), %%xmm1        \n\t"
        "paddw %%xmm0, %%xmm2           \n\t"
        "paddw %%xmm1, %%xmm2           \n\t"
        "lea (%1,%4,2), %1              \n\t"
        "lea (%2,%4,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        "movhlps %%xmm2, %%xmm0         \n\t"
        "paddw   %%xmm0, %%xmm2         \n\t"
        "movd    %%xmm2, %3             \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2), "=r" (ret)
        : "r" ((x86_reg) stride));
    return ret;
}

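/* 8xh SAD against the horizontal half-pel interpolation of blk1, computed
 * with pavgb against the bytes one position to the right. */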
static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2,
                                   int stride, int h)
{
    __asm__ volatile (
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "pavgb 1(%1), %%mm0             \n\t"
        "pavgb 1(%1, %3), %%mm1         \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2, %3), %%mm1         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg) stride));
}

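/* 8xh SAD against the vertical half-pel interpolation of blk1 (pavgb of two
 * consecutive rows). */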
static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2,
                                   int stride, int h)
{
    __asm__ volatile (
        "movq (%1), %%mm0               \n\t"
        "add %3, %1                     \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1), %%mm1               \n\t"
        "movq (%1, %3), %%mm2           \n\t"
        "pavgb %%mm1, %%mm0             \n\t"
        "pavgb %%mm2, %%mm1             \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2, %3), %%mm1         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "movq %%mm2, %%mm0              \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg) stride));
}

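/*
 * 8xh SAD against the 2D (x+y) half-pel interpolation of blk1, approximated
 * by chaining two pavgb operations; subtracting bone (0x01 bytes) first
 * compensates part of the double round-up. The result is not bit-exact,
 * which is why the init code below only selects the mmxext xy2 functions
 * when CODEC_FLAG_BITEXACT is not set.
 */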
static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2,
                                 int stride, int h)
{
    __asm__ volatile (
        "movq "MANGLE(bone)", %%mm5     \n\t"
        "movq (%1), %%mm0               \n\t"
        "pavgb 1(%1), %%mm0             \n\t"
        "add %3, %1                     \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1), %%mm1               \n\t"
        "movq (%1,%3), %%mm2            \n\t"
        "pavgb 1(%1), %%mm1             \n\t"
        "pavgb 1(%1,%3), %%mm2          \n\t"
        "psubusb %%mm5, %%mm1           \n\t"
        "pavgb %%mm1, %%mm0             \n\t"
        "pavgb %%mm2, %%mm1             \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2,%3), %%mm1          \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "movq %%mm2, %%mm0              \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg) stride)
          NAMED_CONSTRAINTS_ADD(bone));
}

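/*
 * 8xh SAD against the average of two source blocks (used for the x2 and y2
 * half-pel cases): the average is computed in 16-bit precision with the
 * rounding constant the caller preloaded into %mm5 (round_tab[1]), then
 * compared against blk2 with the usual psubusb/por absolute difference.
 */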
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
                              int stride, int h)
{
    x86_reg len = -(x86_reg)stride * h;
    __asm__ volatile (
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm1    \n\t"
        "movq (%1, %%"REG_a"), %%mm2    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpcklbw %%mm7, %%mm1         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm0, %%mm1             \n\t"
        "paddw %%mm2, %%mm3             \n\t"
        "movq (%3, %%"REG_a"), %%mm4    \n\t"
        "movq (%3, %%"REG_a"), %%mm2    \n\t"
        "paddw %%mm5, %%mm1             \n\t"
        "paddw %%mm5, %%mm3             \n\t"
        "psrlw $1, %%mm1                \n\t"
        "psrlw $1, %%mm3                \n\t"
        "packuswb %%mm3, %%mm1          \n\t"
        "psubusb %%mm1, %%mm4           \n\t"
        "psubusb %%mm2, %%mm1           \n\t"
        "por %%mm4, %%mm1               \n\t"
        "movq %%mm1, %%mm0              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
          "r" ((x86_reg) stride));
}

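/*
 * 8xh SAD against the exact 2D half-pel interpolation of blk1:
 * (a + b + c + d + 2) >> 2 is computed in 16-bit precision with round_tab[2],
 * so unlike the pavgb-based mmxext version above this one is bit-exact.
 */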
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    x86_reg len = -(x86_reg)stride * h;
    __asm__ volatile (
        "movq  (%1, %%"REG_a"), %%mm0   \n\t"
        "movq 1(%1, %%"REG_a"), %%mm2   \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm3, %%mm1             \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq  (%2, %%"REG_a"), %%mm2   \n\t"
        "movq 1(%2, %%"REG_a"), %%mm4   \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "movq %%mm4, %%mm5              \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "punpcklbw %%mm7, %%mm4         \n\t"
        "punpckhbw %%mm7, %%mm5         \n\t"
        "paddw %%mm4, %%mm2             \n\t"
        "paddw %%mm5, %%mm3             \n\t"
        "movq %5, %%mm5                 \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm3, %%mm1             \n\t"
        "paddw %%mm5, %%mm0             \n\t"
        "paddw %%mm5, %%mm1             \n\t"
        "movq (%3, %%"REG_a"), %%mm4    \n\t"
        "movq (%3, %%"REG_a"), %%mm5    \n\t"
        "psrlw $2, %%mm0                \n\t"
        "psrlw $2, %%mm1                \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "psubusb %%mm0, %%mm4           \n\t"
        "psubusb %%mm5, %%mm0           \n\t"
        "por %%mm4, %%mm0               \n\t"
        "movq %%mm0, %%mm4              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm4         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm4, %%mm6             \n\t"
        "movq  %%mm2, %%mm0             \n\t"
        "movq  %%mm3, %%mm1             \n\t"
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
          "r" ((x86_reg) stride), "m" (round_tab[2]));
}

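/* Fold the word sums accumulated in %mm6 into a single value. The plain MMX
 * variant adds the four lanes by hand; with psadbw the low word of %mm6
 * already holds the full sum, so sum_mmxext() only needs a movd. */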
static inline int sum_mmx(void)
{
    int ret;
    __asm__ volatile (
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $32, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $16, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "movd %%mm6, %0                 \n\t"
        : "=r" (ret));
    return ret & 0xFFFF;
}

static inline int sum_mmxext(void)
{
    int ret;
    __asm__ volatile (
        "movd %%mm6, %0                 \n\t"
        : "=r" (ret));
    return ret;
}

static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
}

static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
}

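/*
 * PIX_SAD() instantiates the public SAD functions for one instruction-set
 * suffix: full-pel, x2, y2 and xy2 half-pel variants for 8x8 and 16x16
 * blocks. Each wrapper clears the %mm6 accumulator and the %mm7 helper
 * register (and loads the rounding constant where needed), calls the
 * corresponding core routine(s) above and reduces the result with sum_*().
 */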
#define PIX_SAD(suf)                                                    \
static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2,               \
                        uint8_t *blk1, int stride, int h)               \
{                                                                       \
    av_assert2(h == 8);                                                 \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        :);                                                             \
                                                                        \
    sad8_1_ ## suf(blk1, blk2, stride, 8);                              \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
                           uint8_t *blk1, int stride, int h)            \
{                                                                       \
    av_assert2(h == 8);                                                 \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_x2a_ ## suf(blk1, blk2, stride, 8);                            \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
                           uint8_t *blk1, int stride, int h)            \
{                                                                       \
    av_assert2(h == 8);                                                 \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_y2a_ ## suf(blk1, blk2, stride, 8);                            \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                            uint8_t *blk1, int stride, int h)           \
{                                                                       \
    av_assert2(h == 8);                                                 \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        ::);                                                            \
                                                                        \
    sad8_4_ ## suf(blk1, blk2, stride, 8);                              \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2,              \
                         uint8_t *blk1, int stride, int h)              \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        :);                                                             \
                                                                        \
    sad8_1_ ## suf(blk1,     blk2,     stride, h);                      \
    sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h);                      \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                            uint8_t *blk1, int stride, int h)           \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_x2a_ ## suf(blk1,     blk2,     stride, h);                    \
    sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                    \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                            uint8_t *blk1, int stride, int h)           \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_y2a_ ## suf(blk1,     blk2,     stride, h);                    \
    sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                    \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,          \
                             uint8_t *blk1, int stride, int h)          \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        ::);                                                            \
                                                                        \
    sad8_4_ ## suf(blk1,     blk2,     stride, h);                      \
    sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h);                      \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \

PIX_SAD(mmx)
PIX_SAD(mmxext)

#endif /* HAVE_INLINE_ASM */

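/* Hook the x86 implementations into the MECmpContext according to the
 * detected CPU flags; inline-asm functions and external (yasm) symbols are
 * selected independently of each other. */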
av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
{
    int cpu_flags = av_get_cpu_flags();

#if HAVE_INLINE_ASM
    if (INLINE_MMX(cpu_flags)) {
        c->pix_abs[0][0] = sad16_mmx;
        c->pix_abs[0][1] = sad16_x2_mmx;
        c->pix_abs[0][2] = sad16_y2_mmx;
        c->pix_abs[0][3] = sad16_xy2_mmx;
        c->pix_abs[1][0] = sad8_mmx;
        c->pix_abs[1][1] = sad8_x2_mmx;
        c->pix_abs[1][2] = sad8_y2_mmx;
        c->pix_abs[1][3] = sad8_xy2_mmx;

        c->sad[0] = sad16_mmx;
        c->sad[1] = sad8_mmx;

        c->vsad[4] = vsad_intra16_mmx;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->vsad[0] = vsad16_mmx;
        }
    }

    if (INLINE_MMXEXT(cpu_flags)) {
        c->vsad[4] = vsad_intra16_mmxext;

        c->pix_abs[0][0] = sad16_mmxext;
        c->pix_abs[1][0] = sad8_mmxext;

        c->sad[0] = sad16_mmxext;
        c->sad[1] = sad8_mmxext;

        c->pix_abs[0][1] = sad16_x2_mmxext;
        c->pix_abs[0][2] = sad16_y2_mmxext;
        c->pix_abs[1][1] = sad8_x2_mmxext;
        c->pix_abs[1][2] = sad8_y2_mmxext;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->pix_abs[0][3] = sad16_xy2_mmxext;
            c->pix_abs[1][3] = sad8_xy2_mmxext;

            c->vsad[0] = vsad16_mmxext;
        }
    }

    if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SSE2SLOW) &&
        avctx->codec_id != AV_CODEC_ID_SNOW) {
        c->sad[0] = sad16_sse2;
    }

#endif /* HAVE_INLINE_ASM */

#if HAVE_MMX_EXTERNAL
    if (EXTERNAL_MMX(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmx;
        c->sse[0]            = ff_sse16_mmx;
        c->sse[1]            = ff_sse8_mmx;
#if HAVE_YASM
        c->nsse[0]           = nsse16_mmx;
        c->nsse[1]           = nsse8_mmx;
#endif
    }
#endif

#if HAVE_MMXEXT_EXTERNAL
    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmxext;
    }
#endif

#if HAVE_SSE2_EXTERNAL
    if (EXTERNAL_SSE2(cpu_flags)) {
        c->sse[0]            = ff_sse16_sse2;
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_sse2;

#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
        c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
#endif
    }
#endif

#if HAVE_SSSE3_EXTERNAL
    if (EXTERNAL_SSSE3(cpu_flags)) {
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_ssse3;
#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
        c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
#endif
    }
#endif
}