1 /*
2  * DSP utils : average functions are compiled twice for 3dnow/mmx2
3  * Copyright (c) 2000, 2001 Fabrice Bellard.
4  * Copyright (c) 2002-2004 Michael Niedermayer
5  *
6  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
7  * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
8  * and improved by Zdenek Kabelac <kabi@users.sf.net>
9  *
10  * This file is part of FFmpeg.
11  *
12  * FFmpeg is free software; you can redistribute it and/or
13  * modify it under the terms of the GNU Lesser General Public
14  * License as published by the Free Software Foundation; either
15  * version 2.1 of the License, or (at your option) any later version.
16  *
17  * FFmpeg is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20  * Lesser General Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser General Public
23  * License along with FFmpeg; if not, write to the Free Software
24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25  */
26 
27 /* This header intentionally has no multiple inclusion guards. It is meant to
28  * be included multiple times and generates different code depending on the
29  * value of certain #defines. */
30 
31 /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm
32    clobber bug - now it will work with 2.95.2 and also with -fPIC
33  */
/*
 * Writes h rows of 8 bytes to block. Each output row is PAVGB of the 8 source
 * bytes at pixels and the 8 source bytes at pixels + 1, i.e. every byte is
 * combined with its right-hand neighbour.
 * DEF, PAVGB and REG_a are macros supplied by the including file.
 *
 * block:     destination; advanced by line_size per row
 * pixels:    source; advanced by line_size per row
 * line_size: byte stride used for both block and pixels
 * h:         row count. Four rows are handled per loop pass and the loop only
 *            exits when h reaches exactly 0, so h has to be a positive
 *            multiple of 4.
 */
DEF(put_pixels8_x2)34 static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
35 {
36     asm volatile(
        /* operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size */
        /* REG_a = 2 * line_size, the step for one pair of rows */
37         "lea (%3, %3), %%"REG_a"        \n\t"
38         "1:                             \n\t"
        /* rows 0 and 1 of this pass */
39         "movq (%1), %%mm0               \n\t"
40         "movq (%1, %3), %%mm1           \n\t"
41         PAVGB" 1(%1), %%mm0             \n\t"
42         PAVGB" 1(%1, %3), %%mm1         \n\t"
43         "movq %%mm0, (%2)               \n\t"
44         "movq %%mm1, (%2, %3)           \n\t"
45         "add %%"REG_a", %1              \n\t"
46         "add %%"REG_a", %2              \n\t"
        /* rows 2 and 3 of this pass */
47         "movq (%1), %%mm0               \n\t"
48         "movq (%1, %3), %%mm1           \n\t"
49         PAVGB" 1(%1), %%mm0             \n\t"
50         PAVGB" 1(%1, %3), %%mm1         \n\t"
51         "add %%"REG_a", %1              \n\t"
52         "movq %%mm0, (%2)               \n\t"
53         "movq %%mm1, (%2, %3)           \n\t"
54         "add %%"REG_a", %2              \n\t"
55         "subl $4, %0                    \n\t"
56         "jnz 1b                         \n\t"
57         :"+g"(h), "+S"(pixels), "+D"(block)
58         :"r" ((long)line_size)
59         :"%"REG_a, "memory");
60 }
61 
/*
 * Writes h rows of 4 bytes to dst; each row is PAVGB of a 4-byte row from
 * src1 and a 4-byte row from src2.
 * DEF and PAVGB are macros supplied by the including file.
 *
 * dst:        destination; advanced by dstStride per row
 * src1:       first source; advanced by src1Stride per row
 * src2:       second source; read as consecutive 4-byte rows (offsets 0, 4,
 *             8, 12, then advanced by 16)
 * h:          row count. An odd h is reduced by one row up front. The main
 *             loop is then entered unconditionally and handles 4 rows per
 *             pass until h reaches exactly 0, so the remaining count has to
 *             be a nonzero multiple of 4.
 *
 * h lives in the b register, or in memory when PIC is defined (presumably
 * because the b register is reserved in PIC builds - confirm).
 */
DEF(put_pixels4_l2)62 static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
63 {
64     asm volatile(
        /* operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
           %4 = src1Stride, %5 = dstStride */
65         "testl $1, %0                   \n\t"
66             " jz 1f                     \n\t"
        /* odd h: handle a single row first */
67         "movd   (%1), %%mm0             \n\t"
68         "movd   (%2), %%mm1             \n\t"
69         "add    %4, %1                  \n\t"
70         "add    $4, %2                  \n\t"
71         PAVGB" %%mm1, %%mm0             \n\t"
72         "movd   %%mm0, (%3)             \n\t"
73         "add    %5, %3                  \n\t"
74         "decl   %0                      \n\t"
        /* main loop: 4 rows per pass, handled as two pairs */
75         "1:                             \n\t"
76         "movd   (%1), %%mm0             \n\t"
77         "add    %4, %1                  \n\t"
78         "movd   (%1), %%mm1             \n\t"
79         "movd   (%2), %%mm2             \n\t"
80         "movd   4(%2), %%mm3            \n\t"
81         "add    %4, %1                  \n\t"
82         PAVGB" %%mm2, %%mm0             \n\t"
83         PAVGB" %%mm3, %%mm1             \n\t"
84         "movd   %%mm0, (%3)             \n\t"
85         "add    %5, %3                  \n\t"
86         "movd   %%mm1, (%3)             \n\t"
87         "add    %5, %3                  \n\t"
88         "movd   (%1), %%mm0             \n\t"
89         "add    %4, %1                  \n\t"
90         "movd   (%1), %%mm1             \n\t"
91         "movd   8(%2), %%mm2            \n\t"
92         "movd   12(%2), %%mm3           \n\t"
93         "add    %4, %1                  \n\t"
94         PAVGB" %%mm2, %%mm0             \n\t"
95         PAVGB" %%mm3, %%mm1             \n\t"
96         "movd   %%mm0, (%3)             \n\t"
97         "add    %5, %3                  \n\t"
98         "movd   %%mm1, (%3)             \n\t"
99         "add    %5, %3                  \n\t"
100         "add    $16, %2                 \n\t"
101         "subl   $4, %0                  \n\t"
102         "jnz    1b                      \n\t"
103 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
104         :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
105 #else
106         :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
107 #endif
108         :"S"((long)src1Stride), "D"((long)dstStride)
109         :"memory");
110 }
111 
112 
/*
 * Writes h rows of 8 bytes to dst; each row is PAVGB of an 8-byte row from
 * src1 and an 8-byte row from src2.
 * DEF and PAVGB are macros supplied by the including file.
 *
 * dst:        destination; advanced by dstStride per row
 * src1:       first source; advanced by src1Stride per row
 * src2:       second source; read as consecutive 8-byte rows (offsets 0, 8,
 *             16, 24, then advanced by 32)
 * h:          row count. An odd h is reduced by one row up front. The main
 *             loop is then entered unconditionally and handles 4 rows per
 *             pass until h reaches exactly 0, so the remaining count has to
 *             be a nonzero multiple of 4.
 *
 * h lives in the b register, or in memory when PIC is defined (presumably
 * because the b register is reserved in PIC builds - confirm).
 */
DEF(put_pixels8_l2)113 static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
114 {
115     asm volatile(
        /* operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
           %4 = src1Stride, %5 = dstStride */
116         "testl $1, %0                   \n\t"
117             " jz 1f                     \n\t"
        /* odd h: handle a single row first */
118         "movq   (%1), %%mm0             \n\t"
119         "movq   (%2), %%mm1             \n\t"
120         "add    %4, %1                  \n\t"
121         "add    $8, %2                  \n\t"
122         PAVGB" %%mm1, %%mm0             \n\t"
123         "movq   %%mm0, (%3)             \n\t"
124         "add    %5, %3                  \n\t"
125         "decl   %0                      \n\t"
        /* main loop: 4 rows per pass, handled as two pairs */
126         "1:                             \n\t"
127         "movq   (%1), %%mm0             \n\t"
128         "add    %4, %1                  \n\t"
129         "movq   (%1), %%mm1             \n\t"
130         "add    %4, %1                  \n\t"
131         PAVGB" (%2), %%mm0              \n\t"
132         PAVGB" 8(%2), %%mm1             \n\t"
133         "movq   %%mm0, (%3)             \n\t"
134         "add    %5, %3                  \n\t"
135         "movq   %%mm1, (%3)             \n\t"
136         "add    %5, %3                  \n\t"
137         "movq   (%1), %%mm0             \n\t"
138         "add    %4, %1                  \n\t"
139         "movq   (%1), %%mm1             \n\t"
140         "add    %4, %1                  \n\t"
141         PAVGB" 16(%2), %%mm0            \n\t"
142         PAVGB" 24(%2), %%mm1            \n\t"
143         "movq   %%mm0, (%3)             \n\t"
144         "add    %5, %3                  \n\t"
145         "movq   %%mm1, (%3)             \n\t"
146         "add    %5, %3                  \n\t"
147         "add    $32, %2                 \n\t"
148         "subl   $4, %0                  \n\t"
149         "jnz    1b                      \n\t"
150 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
151         :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
152 #else
153         :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
154 #endif
155         :"S"((long)src1Stride), "D"((long)dstStride)
156         :"memory");
157 //the following should be used, though better not with gcc ...
158 /*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
159         :"r"(src1Stride), "r"(dstStride)
160         :"memory");*/
161 }
162 
/*
 * "No rounding" counterpart of the 8-byte two-source put: writes h rows of
 * 8 bytes to dst, each row computed from an 8-byte row of src1 and an 8-byte
 * row of src2 as NOT(PAVGB(NOT src1, NOT src2)).
 * Assuming PAVGB rounds halves up (TODO confirm against the including file's
 * definition), complementing inputs and result gives an average that rounds
 * down instead.
 * DEF and PAVGB are macros supplied by the including file.
 *
 * dst:        destination; advanced by dstStride per row
 * src1:       first source; advanced by src1Stride per row
 * src2:       second source; read as consecutive 8-byte rows (offsets 0, 8,
 *             16, 24, then advanced by 32)
 * h:          row count. An odd h is reduced by one row up front. The main
 *             loop is then entered unconditionally and handles 4 rows per
 *             pass until h reaches exactly 0, so the remaining count has to
 *             be a nonzero multiple of 4.
 *
 * mm6 is overwritten (all bits set) and is not listed as a clobber.
 */
DEF(put_no_rnd_pixels8_l2)163 static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
164 {
165     asm volatile(
        /* operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
           %4 = src1Stride, %5 = dstStride */
        /* mm6 = all ones; pxor with mm6 complements every byte */
166         "pcmpeqb %%mm6, %%mm6           \n\t"
167         "testl $1, %0                   \n\t"
168             " jz 1f                     \n\t"
        /* odd h: handle a single row first */
169         "movq   (%1), %%mm0             \n\t"
170         "movq   (%2), %%mm1             \n\t"
171         "add    %4, %1                  \n\t"
172         "add    $8, %2                  \n\t"
173         "pxor %%mm6, %%mm0              \n\t"
174         "pxor %%mm6, %%mm1              \n\t"
175         PAVGB" %%mm1, %%mm0             \n\t"
176         "pxor %%mm6, %%mm0              \n\t"
177         "movq   %%mm0, (%3)             \n\t"
178         "add    %5, %3                  \n\t"
179         "decl   %0                      \n\t"
        /* main loop: 4 rows per pass, handled as two pairs */
180         "1:                             \n\t"
181         "movq   (%1), %%mm0             \n\t"
182         "add    %4, %1                  \n\t"
183         "movq   (%1), %%mm1             \n\t"
184         "add    %4, %1                  \n\t"
185         "movq   (%2), %%mm2             \n\t"
186         "movq   8(%2), %%mm3            \n\t"
        /* complement inputs, average, complement the result */
187         "pxor %%mm6, %%mm0              \n\t"
188         "pxor %%mm6, %%mm1              \n\t"
189         "pxor %%mm6, %%mm2              \n\t"
190         "pxor %%mm6, %%mm3              \n\t"
191         PAVGB" %%mm2, %%mm0             \n\t"
192         PAVGB" %%mm3, %%mm1             \n\t"
193         "pxor %%mm6, %%mm0              \n\t"
194         "pxor %%mm6, %%mm1              \n\t"
195         "movq   %%mm0, (%3)             \n\t"
196         "add    %5, %3                  \n\t"
197         "movq   %%mm1, (%3)             \n\t"
198         "add    %5, %3                  \n\t"
199         "movq   (%1), %%mm0             \n\t"
200         "add    %4, %1                  \n\t"
201         "movq   (%1), %%mm1             \n\t"
202         "add    %4, %1                  \n\t"
203         "movq   16(%2), %%mm2           \n\t"
204         "movq   24(%2), %%mm3           \n\t"
205         "pxor %%mm6, %%mm0              \n\t"
206         "pxor %%mm6, %%mm1              \n\t"
207         "pxor %%mm6, %%mm2              \n\t"
208         "pxor %%mm6, %%mm3              \n\t"
209         PAVGB" %%mm2, %%mm0             \n\t"
210         PAVGB" %%mm3, %%mm1             \n\t"
211         "pxor %%mm6, %%mm0              \n\t"
212         "pxor %%mm6, %%mm1              \n\t"
213         "movq   %%mm0, (%3)             \n\t"
214         "add    %5, %3                  \n\t"
215         "movq   %%mm1, (%3)             \n\t"
216         "add    %5, %3                  \n\t"
217         "add    $32, %2                 \n\t"
218         "subl   $4, %0                  \n\t"
219         "jnz    1b                      \n\t"
220 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
221         :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
222 #else
223         :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
224 #endif
225         :"S"((long)src1Stride), "D"((long)dstStride)
226         :"memory");
227 //the following should be used, though better not with gcc ...
228 /*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
229         :"r"(src1Stride), "r"(dstStride)
230         :"memory");*/
231 }
232 
/*
 * Blends h rows of 4 bytes into dst: each row becomes
 * PAVGB(PAVGB(src1 row, src2 row), existing dst row).
 * DEF and PAVGB are macros supplied by the including file.
 *
 * dst:        destination, read and rewritten; advanced by dstStride per row
 * src1:       first source; advanced by src1Stride per row
 * src2:       second source; read as consecutive 4-byte rows (offsets 0, 4,
 *             8, 12, then advanced by 16)
 * h:          row count. An odd h is reduced by one row up front. The main
 *             loop is then entered unconditionally and handles 4 rows per
 *             pass until h reaches exactly 0, so the remaining count has to
 *             be a nonzero multiple of 4.
 *
 * NOTE(review): only 4 bytes are loaded (movd) and stored per row, but PAVGB
 * is used directly with (%2) and (%3) memory operands; such an operand
 * presumably reads 8 bytes, i.e. 4 bytes past the row in src2 and dst -
 * confirm that callers' buffers tolerate this.
 */
DEF(avg_pixels4_l2)233 static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
234 {
235     asm volatile(
        /* operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
           %4 = src1Stride, %5 = dstStride */
236         "testl $1, %0                   \n\t"
237             " jz 1f                     \n\t"
        /* odd h: handle a single row first */
238         "movd   (%1), %%mm0             \n\t"
239         "movd   (%2), %%mm1             \n\t"
240         "add    %4, %1                  \n\t"
241         "add    $4, %2                  \n\t"
242         PAVGB" %%mm1, %%mm0             \n\t"
243         PAVGB" (%3), %%mm0              \n\t"
244         "movd   %%mm0, (%3)             \n\t"
245         "add    %5, %3                  \n\t"
246         "decl   %0                      \n\t"
        /* main loop: 4 rows per pass, handled as two pairs */
247         "1:                             \n\t"
248         "movd   (%1), %%mm0             \n\t"
249         "add    %4, %1                  \n\t"
250         "movd   (%1), %%mm1             \n\t"
251         "add    %4, %1                  \n\t"
252         PAVGB" (%2), %%mm0              \n\t"
253         PAVGB" 4(%2), %%mm1             \n\t"
        /* second stage: blend with what is already in dst */
254         PAVGB" (%3), %%mm0              \n\t"
255         "movd   %%mm0, (%3)             \n\t"
256         "add    %5, %3                  \n\t"
257         PAVGB" (%3), %%mm1              \n\t"
258         "movd   %%mm1, (%3)             \n\t"
259         "add    %5, %3                  \n\t"
260         "movd   (%1), %%mm0             \n\t"
261         "add    %4, %1                  \n\t"
262         "movd   (%1), %%mm1             \n\t"
263         "add    %4, %1                  \n\t"
264         PAVGB" 8(%2), %%mm0             \n\t"
265         PAVGB" 12(%2), %%mm1            \n\t"
266         PAVGB" (%3), %%mm0              \n\t"
267         "movd   %%mm0, (%3)             \n\t"
268         "add    %5, %3                  \n\t"
269         PAVGB" (%3), %%mm1              \n\t"
270         "movd   %%mm1, (%3)             \n\t"
271         "add    %5, %3                  \n\t"
272         "add    $16, %2                 \n\t"
273         "subl   $4, %0                  \n\t"
274         "jnz    1b                      \n\t"
275 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
276         :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
277 #else
278         :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
279 #endif
280         :"S"((long)src1Stride), "D"((long)dstStride)
281         :"memory");
282 }
283 
284 
/*
 * Blends h rows of 8 bytes into dst: each row becomes
 * PAVGB(PAVGB(src1 row, src2 row), existing dst row).
 * DEF and PAVGB are macros supplied by the including file.
 *
 * dst:        destination, read and rewritten; advanced by dstStride per row
 * src1:       first source; advanced by src1Stride per row
 * src2:       second source; read as consecutive 8-byte rows (offsets 0, 8,
 *             16, 24, then advanced by 32)
 * h:          row count. An odd h is reduced by one row up front. The main
 *             loop is then entered unconditionally and handles 4 rows per
 *             pass until h reaches exactly 0, so the remaining count has to
 *             be a nonzero multiple of 4.
 *
 * h lives in the b register, or in memory when PIC is defined (presumably
 * because the b register is reserved in PIC builds - confirm).
 */
DEF(avg_pixels8_l2)285 static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
286 {
287     asm volatile(
        /* operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
           %4 = src1Stride, %5 = dstStride */
288         "testl $1, %0                   \n\t"
289             " jz 1f                     \n\t"
        /* odd h: handle a single row first */
290         "movq   (%1), %%mm0             \n\t"
291         "movq   (%2), %%mm1             \n\t"
292         "add    %4, %1                  \n\t"
293         "add    $8, %2                  \n\t"
294         PAVGB" %%mm1, %%mm0             \n\t"
295         PAVGB" (%3), %%mm0              \n\t"
296         "movq   %%mm0, (%3)             \n\t"
297         "add    %5, %3                  \n\t"
298         "decl   %0                      \n\t"
        /* main loop: 4 rows per pass, handled as two pairs */
299         "1:                             \n\t"
300         "movq   (%1), %%mm0             \n\t"
301         "add    %4, %1                  \n\t"
302         "movq   (%1), %%mm1             \n\t"
303         "add    %4, %1                  \n\t"
304         PAVGB" (%2), %%mm0              \n\t"
305         PAVGB" 8(%2), %%mm1             \n\t"
        /* second stage: blend with what is already in dst */
306         PAVGB" (%3), %%mm0              \n\t"
307         "movq   %%mm0, (%3)             \n\t"
308         "add    %5, %3                  \n\t"
309         PAVGB" (%3), %%mm1              \n\t"
310         "movq   %%mm1, (%3)             \n\t"
311         "add    %5, %3                  \n\t"
312         "movq   (%1), %%mm0             \n\t"
313         "add    %4, %1                  \n\t"
314         "movq   (%1), %%mm1             \n\t"
315         "add    %4, %1                  \n\t"
316         PAVGB" 16(%2), %%mm0            \n\t"
317         PAVGB" 24(%2), %%mm1            \n\t"
318         PAVGB" (%3), %%mm0              \n\t"
319         "movq   %%mm0, (%3)             \n\t"
320         "add    %5, %3                  \n\t"
321         PAVGB" (%3), %%mm1              \n\t"
322         "movq   %%mm1, (%3)             \n\t"
323         "add    %5, %3                  \n\t"
324         "add    $32, %2                 \n\t"
325         "subl   $4, %0                  \n\t"
326         "jnz    1b                      \n\t"
327 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
328         :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
329 #else
330         :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
331 #endif
332         :"S"((long)src1Stride), "D"((long)dstStride)
333         :"memory");
334 //the following should be used, though better not with gcc ...
335 /*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
336         :"r"(src1Stride), "r"(dstStride)
337         :"memory");*/
338 }
339 
/*
 * Writes h rows of 16 bytes to block. Each output row is PAVGB of the source
 * row and the same row shifted by one byte (pixels and pixels + 1), done as
 * two 8-byte halves at offsets 0 and 8.
 * DEF, PAVGB and REG_a are macros supplied by the including file.
 *
 * block:     destination; advanced by line_size per row
 * pixels:    source; advanced by line_size per row
 * line_size: byte stride used for both block and pixels
 * h:         row count. Four rows are handled per loop pass and the loop only
 *            exits when h reaches exactly 0, so h has to be a positive
 *            multiple of 4.
 */
DEF(put_pixels16_x2)340 static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
341 {
342     asm volatile(
        /* operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size */
        /* REG_a = 2 * line_size, the step for one pair of rows */
343         "lea (%3, %3), %%"REG_a"        \n\t"
344         "1:                             \n\t"
        /* rows 0 and 1 of this pass: mm0/mm1 left halves, mm2/mm3 right halves */
345         "movq (%1), %%mm0               \n\t"
346         "movq (%1, %3), %%mm1           \n\t"
347         "movq 8(%1), %%mm2              \n\t"
348         "movq 8(%1, %3), %%mm3          \n\t"
349         PAVGB" 1(%1), %%mm0             \n\t"
350         PAVGB" 1(%1, %3), %%mm1         \n\t"
351         PAVGB" 9(%1), %%mm2             \n\t"
352         PAVGB" 9(%1, %3), %%mm3         \n\t"
353         "movq %%mm0, (%2)               \n\t"
354         "movq %%mm1, (%2, %3)           \n\t"
355         "movq %%mm2, 8(%2)              \n\t"
356         "movq %%mm3, 8(%2, %3)          \n\t"
357         "add %%"REG_a", %1              \n\t"
358         "add %%"REG_a", %2              \n\t"
        /* rows 2 and 3 of this pass */
359         "movq (%1), %%mm0               \n\t"
360         "movq (%1, %3), %%mm1           \n\t"
361         "movq 8(%1), %%mm2              \n\t"
362         "movq 8(%1, %3), %%mm3          \n\t"
363         PAVGB" 1(%1), %%mm0             \n\t"
364         PAVGB" 1(%1, %3), %%mm1         \n\t"
365         PAVGB" 9(%1), %%mm2             \n\t"
366         PAVGB" 9(%1, %3), %%mm3         \n\t"
367         "add %%"REG_a", %1              \n\t"
368         "movq %%mm0, (%2)               \n\t"
369         "movq %%mm1, (%2, %3)           \n\t"
370         "movq %%mm2, 8(%2)              \n\t"
371         "movq %%mm3, 8(%2, %3)          \n\t"
372         "add %%"REG_a", %2              \n\t"
373         "subl $4, %0                    \n\t"
374         "jnz 1b                         \n\t"
375         :"+g"(h), "+S"(pixels), "+D"(block)
376         :"r" ((long)line_size)
377         :"%"REG_a, "memory");
378 }
379 
/*
 * Writes h rows of 16 bytes to dst; each row is PAVGB of a 16-byte row from
 * src1 and a 16-byte row from src2, done as two 8-byte halves.
 * DEF and PAVGB are macros supplied by the including file.
 *
 * dst:        destination; advanced by dstStride per row
 * src1:       first source; advanced by src1Stride per row
 * src2:       second source; read as consecutive 16-byte rows (offsets 0/8
 *             and 16/24, then advanced by 32)
 * h:          row count. An odd h is reduced by one row up front. The main
 *             loop is then entered unconditionally and handles 2 rows per
 *             pass until h reaches exactly 0, so the remaining count has to
 *             be a nonzero multiple of 2.
 *
 * h lives in the b register, or in memory when PIC is defined (presumably
 * because the b register is reserved in PIC builds - confirm).
 */
DEF(put_pixels16_l2)380 static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
381 {
382     asm volatile(
        /* operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
           %4 = src1Stride, %5 = dstStride */
383         "testl $1, %0                   \n\t"
384             " jz 1f                     \n\t"
        /* odd h: handle a single row first */
385         "movq   (%1), %%mm0             \n\t"
386         "movq   8(%1), %%mm1            \n\t"
387         PAVGB" (%2), %%mm0              \n\t"
388         PAVGB" 8(%2), %%mm1             \n\t"
389         "add    %4, %1                  \n\t"
390         "add    $16, %2                 \n\t"
391         "movq   %%mm0, (%3)             \n\t"
392         "movq   %%mm1, 8(%3)            \n\t"
393         "add    %5, %3                  \n\t"
394         "decl   %0                      \n\t"
        /* main loop: 2 rows per pass */
395         "1:                             \n\t"
396         "movq   (%1), %%mm0             \n\t"
397         "movq   8(%1), %%mm1            \n\t"
398         "add    %4, %1                  \n\t"
399         PAVGB" (%2), %%mm0              \n\t"
400         PAVGB" 8(%2), %%mm1             \n\t"
401         "movq   %%mm0, (%3)             \n\t"
402         "movq   %%mm1, 8(%3)            \n\t"
403         "add    %5, %3                  \n\t"
404         "movq   (%1), %%mm0             \n\t"
405         "movq   8(%1), %%mm1            \n\t"
406         "add    %4, %1                  \n\t"
407         PAVGB" 16(%2), %%mm0            \n\t"
408         PAVGB" 24(%2), %%mm1            \n\t"
409         "movq   %%mm0, (%3)             \n\t"
410         "movq   %%mm1, 8(%3)            \n\t"
411         "add    %5, %3                  \n\t"
412         "add    $32, %2                 \n\t"
413         "subl   $2, %0                  \n\t"
414         "jnz    1b                      \n\t"
415 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
416         :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
417 #else
418         :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
419 #endif
420         :"S"((long)src1Stride), "D"((long)dstStride)
421         :"memory");
422 //the following should be used, though better not with gcc ...
423 /*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
424         :"r"(src1Stride), "r"(dstStride)
425         :"memory");*/
426 }
427 
/*
 * Blends h rows of 16 bytes into dst: each row becomes
 * PAVGB(PAVGB(src1 row, src2 row), existing dst row), done as two 8-byte
 * halves.
 * DEF and PAVGB are macros supplied by the including file.
 *
 * dst:        destination, read and rewritten; advanced by dstStride per row
 * src1:       first source; advanced by src1Stride per row
 * src2:       second source; read as consecutive 16-byte rows (offsets 0/8
 *             and 16/24, then advanced by 32)
 * h:          row count. An odd h is reduced by one row up front. The main
 *             loop is then entered unconditionally and handles 2 rows per
 *             pass until h reaches exactly 0, so the remaining count has to
 *             be a nonzero multiple of 2.
 *
 * h lives in the b register, or in memory when PIC is defined (presumably
 * because the b register is reserved in PIC builds - confirm).
 */
DEF(avg_pixels16_l2)428 static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
429 {
430     asm volatile(
        /* operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
           %4 = src1Stride, %5 = dstStride */
431         "testl $1, %0                   \n\t"
432             " jz 1f                     \n\t"
        /* odd h: handle a single row first */
433         "movq   (%1), %%mm0             \n\t"
434         "movq   8(%1), %%mm1            \n\t"
435         PAVGB" (%2), %%mm0              \n\t"
436         PAVGB" 8(%2), %%mm1             \n\t"
437         "add    %4, %1                  \n\t"
438         "add    $16, %2                 \n\t"
439         PAVGB" (%3), %%mm0              \n\t"
440         PAVGB" 8(%3), %%mm1             \n\t"
441         "movq   %%mm0, (%3)             \n\t"
442         "movq   %%mm1, 8(%3)            \n\t"
443         "add    %5, %3                  \n\t"
444         "decl   %0                      \n\t"
        /* main loop: 2 rows per pass */
445         "1:                             \n\t"
446         "movq   (%1), %%mm0             \n\t"
447         "movq   8(%1), %%mm1            \n\t"
448         "add    %4, %1                  \n\t"
449         PAVGB" (%2), %%mm0              \n\t"
450         PAVGB" 8(%2), %%mm1             \n\t"
        /* second stage: blend with what is already in dst */
451         PAVGB" (%3), %%mm0              \n\t"
452         PAVGB" 8(%3), %%mm1             \n\t"
453         "movq   %%mm0, (%3)             \n\t"
454         "movq   %%mm1, 8(%3)            \n\t"
455         "add    %5, %3                  \n\t"
456         "movq   (%1), %%mm0             \n\t"
457         "movq   8(%1), %%mm1            \n\t"
458         "add    %4, %1                  \n\t"
459         PAVGB" 16(%2), %%mm0            \n\t"
460         PAVGB" 24(%2), %%mm1            \n\t"
461         PAVGB" (%3), %%mm0              \n\t"
462         PAVGB" 8(%3), %%mm1             \n\t"
463         "movq   %%mm0, (%3)             \n\t"
464         "movq   %%mm1, 8(%3)            \n\t"
465         "add    %5, %3                  \n\t"
466         "add    $32, %2                 \n\t"
467         "subl   $2, %0                  \n\t"
468         "jnz    1b                      \n\t"
469 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
470         :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
471 #else
472         :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
473 #endif
474         :"S"((long)src1Stride), "D"((long)dstStride)
475         :"memory");
476 //the following should be used, though better not with gcc ...
477 /*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
478         :"r"(src1Stride), "r"(dstStride)
479         :"memory");*/
480 }
481 
/*
 * "No rounding" counterpart of the 16-byte two-source put: writes h rows of
 * 16 bytes to dst, each row computed from a 16-byte row of src1 and a
 * 16-byte row of src2 as NOT(PAVGB(NOT src1, NOT src2)), done as two 8-byte
 * halves.
 * Assuming PAVGB rounds halves up (TODO confirm against the including file's
 * definition), complementing inputs and result gives an average that rounds
 * down instead.
 * DEF and PAVGB are macros supplied by the including file.
 *
 * dst:        destination; advanced by dstStride per row
 * src1:       first source; advanced by src1Stride per row
 * src2:       second source; read as consecutive 16-byte rows (offsets 0/8
 *             and 16/24, then advanced by 32)
 * h:          row count. An odd h is reduced by one row up front. The main
 *             loop is then entered unconditionally and handles 2 rows per
 *             pass until h reaches exactly 0, so the remaining count has to
 *             be a nonzero multiple of 2.
 *
 * mm6 is overwritten (all bits set) and is not listed as a clobber.
 */
DEF(put_no_rnd_pixels16_l2)482 static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
483 {
484     asm volatile(
        /* operands: %0 = h, %1 = src1, %2 = src2, %3 = dst,
           %4 = src1Stride, %5 = dstStride */
        /* mm6 = all ones; pxor with mm6 complements every byte */
485         "pcmpeqb %%mm6, %%mm6           \n\t"
486         "testl $1, %0                   \n\t"
487             " jz 1f                     \n\t"
        /* odd h: handle a single row first */
488         "movq   (%1), %%mm0             \n\t"
489         "movq   8(%1), %%mm1            \n\t"
490         "movq   (%2), %%mm2             \n\t"
491         "movq   8(%2), %%mm3            \n\t"
492         "pxor %%mm6, %%mm0              \n\t"
493         "pxor %%mm6, %%mm1              \n\t"
494         "pxor %%mm6, %%mm2              \n\t"
495         "pxor %%mm6, %%mm3              \n\t"
496         PAVGB" %%mm2, %%mm0             \n\t"
497         PAVGB" %%mm3, %%mm1             \n\t"
498         "pxor %%mm6, %%mm0              \n\t"
499         "pxor %%mm6, %%mm1              \n\t"
500         "add    %4, %1                  \n\t"
501         "add    $16, %2                 \n\t"
502         "movq   %%mm0, (%3)             \n\t"
503         "movq   %%mm1, 8(%3)            \n\t"
504         "add    %5, %3                  \n\t"
505         "decl   %0                      \n\t"
        /* main loop: 2 rows per pass */
506         "1:                             \n\t"
507         "movq   (%1), %%mm0             \n\t"
508         "movq   8(%1), %%mm1            \n\t"
509         "add    %4, %1                  \n\t"
510         "movq   (%2), %%mm2             \n\t"
511         "movq   8(%2), %%mm3            \n\t"
        /* complement inputs, average, complement the result */
512         "pxor %%mm6, %%mm0              \n\t"
513         "pxor %%mm6, %%mm1              \n\t"
514         "pxor %%mm6, %%mm2              \n\t"
515         "pxor %%mm6, %%mm3              \n\t"
516         PAVGB" %%mm2, %%mm0             \n\t"
517         PAVGB" %%mm3, %%mm1             \n\t"
518         "pxor %%mm6, %%mm0              \n\t"
519         "pxor %%mm6, %%mm1              \n\t"
520         "movq   %%mm0, (%3)             \n\t"
521         "movq   %%mm1, 8(%3)            \n\t"
522         "add    %5, %3                  \n\t"
523         "movq   (%1), %%mm0             \n\t"
524         "movq   8(%1), %%mm1            \n\t"
525         "add    %4, %1                  \n\t"
526         "movq   16(%2), %%mm2           \n\t"
527         "movq   24(%2), %%mm3           \n\t"
528         "pxor %%mm6, %%mm0              \n\t"
529         "pxor %%mm6, %%mm1              \n\t"
530         "pxor %%mm6, %%mm2              \n\t"
531         "pxor %%mm6, %%mm3              \n\t"
532         PAVGB" %%mm2, %%mm0             \n\t"
533         PAVGB" %%mm3, %%mm1             \n\t"
534         "pxor %%mm6, %%mm0              \n\t"
535         "pxor %%mm6, %%mm1              \n\t"
536         "movq   %%mm0, (%3)             \n\t"
537         "movq   %%mm1, 8(%3)            \n\t"
538         "add    %5, %3                  \n\t"
539         "add    $32, %2                 \n\t"
540         "subl   $2, %0                  \n\t"
541         "jnz    1b                      \n\t"
542 #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
543         :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
544 #else
545         :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
546 #endif
547         :"S"((long)src1Stride), "D"((long)dstStride)
548         :"memory");
549 //the following should be used, though better not with gcc ...
550 /*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
551         :"r"(src1Stride), "r"(dstStride)
552         :"memory");*/
553 }
554 
555 /* GL: this function does incorrect rounding if overflow */
/*
 * Writes h rows of 8 bytes to block. For each row the bytes at pixels have
 * mm6 subtracted with unsigned saturation (psubusb) and are then combined by
 * PAVGB with the bytes at pixels + 1.
 * mm6 is set up by MOVQ_BONE(mm6), a macro defined outside this file;
 * presumably it loads 1 into every byte so that the bias cancels PAVGB's
 * upward rounding - TODO confirm. Because psubusb saturates, a byte that is
 * already 0 cannot be lowered.
 * DEF, PAVGB, REG_a and MOVQ_BONE are supplied by the including file.
 *
 * block:     destination; advanced by line_size per row
 * pixels:    source; advanced by line_size per row
 * line_size: byte stride used for both block and pixels
 * h:         row count. Four rows are handled per loop pass and the loop only
 *            exits when h reaches exactly 0, so h has to be a positive
 *            multiple of 4.
 */
DEF(put_no_rnd_pixels8_x2)556 static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
557 {
558     MOVQ_BONE(mm6);
559     asm volatile(
        /* operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size */
        /* REG_a = 2 * line_size, the step for one pair of rows */
560         "lea (%3, %3), %%"REG_a"        \n\t"
561         "1:                             \n\t"
        /* rows 0 and 1: mm0/mm2 = bytes at x, mm1/mm3 = bytes at x + 1 */
562         "movq (%1), %%mm0               \n\t"
563         "movq (%1, %3), %%mm2           \n\t"
564         "movq 1(%1), %%mm1              \n\t"
565         "movq 1(%1, %3), %%mm3          \n\t"
566         "add %%"REG_a", %1              \n\t"
        /* bias only the left-hand operand of each average */
567         "psubusb %%mm6, %%mm0           \n\t"
568         "psubusb %%mm6, %%mm2           \n\t"
569         PAVGB" %%mm1, %%mm0             \n\t"
570         PAVGB" %%mm3, %%mm2             \n\t"
571         "movq %%mm0, (%2)               \n\t"
572         "movq %%mm2, (%2, %3)           \n\t"
        /* rows 2 and 3 of this pass */
573         "movq (%1), %%mm0               \n\t"
574         "movq 1(%1), %%mm1              \n\t"
575         "movq (%1, %3), %%mm2           \n\t"
576         "movq 1(%1, %3), %%mm3          \n\t"
577         "add %%"REG_a", %2              \n\t"
578         "add %%"REG_a", %1              \n\t"
579         "psubusb %%mm6, %%mm0           \n\t"
580         "psubusb %%mm6, %%mm2           \n\t"
581         PAVGB" %%mm1, %%mm0             \n\t"
582         PAVGB" %%mm3, %%mm2             \n\t"
583         "movq %%mm0, (%2)               \n\t"
584         "movq %%mm2, (%2, %3)           \n\t"
585         "add %%"REG_a", %2              \n\t"
586         "subl $4, %0                    \n\t"
587         "jnz 1b                         \n\t"
588         :"+g"(h), "+S"(pixels), "+D"(block)
589         :"r" ((long)line_size)
590         :"%"REG_a, "memory");
591 }
592 
/*
 * Writes h rows of 8 bytes to block. Output row y is PAVGB of source row y
 * and source row y + 1, so h + 1 source rows are read in total.
 * DEF, PAVGB and REG_a are macros supplied by the including file.
 *
 * block:     destination; advanced by line_size per row
 * pixels:    source; advanced by line_size per row
 * line_size: byte stride used for both block and pixels
 * h:         row count. Four rows are handled per loop pass and the loop only
 *            exits when h reaches exactly 0, so h has to be a positive
 *            multiple of 4.
 *
 * Each source row is loaded once: the last row of a pass stays in mm0 and
 * serves as the first row of the next pass.
 */
DEF(put_pixels8_y2)593 static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
594 {
595     asm volatile(
        /* operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size */
        /* REG_a = 2 * line_size */
596         "lea (%3, %3), %%"REG_a"        \n\t"
        /* mm0 = first source row */
597         "movq (%1), %%mm0               \n\t"
        /* bias block back one row so stores can use the same (base, %3) and
           (base, REG_a) addressing as the loads */
598         "sub %3, %2                     \n\t"
599         "1:                             \n\t"
        /* mm1 = row + 1, mm2 = row + 2 */
600         "movq (%1, %3), %%mm1           \n\t"
601         "movq (%1, %%"REG_a"), %%mm2    \n\t"
602         "add %%"REG_a", %1              \n\t"
603         PAVGB" %%mm1, %%mm0             \n\t"
604         PAVGB" %%mm2, %%mm1             \n\t"
605         "movq %%mm0, (%2, %3)           \n\t"
606         "movq %%mm1, (%2, %%"REG_a")    \n\t"
        /* mm1 = row + 3, mm0 = row + 4 (kept for the next pass) */
607         "movq (%1, %3), %%mm1           \n\t"
608         "movq (%1, %%"REG_a"), %%mm0    \n\t"
609         "add %%"REG_a", %2              \n\t"
610         "add %%"REG_a", %1              \n\t"
611         PAVGB" %%mm1, %%mm2             \n\t"
612         PAVGB" %%mm0, %%mm1             \n\t"
613         "movq %%mm2, (%2, %3)           \n\t"
614         "movq %%mm1, (%2, %%"REG_a")    \n\t"
615         "add %%"REG_a", %2              \n\t"
616         "subl $4, %0                    \n\t"
617         "jnz 1b                         \n\t"
618         :"+g"(h), "+S"(pixels), "+D" (block)
619         :"r" ((long)line_size)
620         :"%"REG_a, "memory");
621 }
622 
623 /* GL: this function does incorrect rounding if overflow */
/*
 * Writes h rows of 8 bytes to block, each derived from two vertically
 * adjacent source rows via PAVGB, with a bias applied to one of them: in
 * every pair of output rows the shared middle source row has mm6 subtracted
 * with unsigned saturation (psubusb) before it is used in both averages.
 * h + 1 source rows are read in total.
 * mm6 is set up by MOVQ_BONE(mm6), a macro defined outside this file;
 * presumably it loads 1 into every byte so that the bias cancels PAVGB's
 * upward rounding - TODO confirm. Because psubusb saturates, a byte that is
 * already 0 cannot be lowered.
 * DEF, PAVGB, REG_a and MOVQ_BONE are supplied by the including file.
 *
 * block:     destination; advanced by line_size per row
 * pixels:    source; advanced by line_size per row
 * line_size: byte stride used for both block and pixels
 * h:         row count. Four rows are handled per loop pass and the loop only
 *            exits when h reaches exactly 0, so h has to be a positive
 *            multiple of 4.
 */
DEF(put_no_rnd_pixels8_y2)624 static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
625 {
626     MOVQ_BONE(mm6);
627     asm volatile(
        /* operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size */
        /* REG_a = 2 * line_size */
628         "lea (%3, %3), %%"REG_a"        \n\t"
        /* mm0 = first source row */
629         "movq (%1), %%mm0               \n\t"
        /* bias block back one row so stores can use the same (base, %3) and
           (base, REG_a) addressing as the loads */
630         "sub %3, %2                     \n\t"
631         "1:                             \n\t"
        /* mm1 = row + 1 (biased below), mm2 = row + 2 */
632         "movq (%1, %3), %%mm1           \n\t"
633         "movq (%1, %%"REG_a"), %%mm2    \n\t"
634         "add %%"REG_a", %1              \n\t"
635         "psubusb %%mm6, %%mm1           \n\t"
636         PAVGB" %%mm1, %%mm0             \n\t"
637         PAVGB" %%mm2, %%mm1             \n\t"
638         "movq %%mm0, (%2, %3)           \n\t"
639         "movq %%mm1, (%2, %%"REG_a")    \n\t"
        /* mm1 = row + 3 (biased below), mm0 = row + 4 (kept for the next pass) */
640         "movq (%1, %3), %%mm1           \n\t"
641         "movq (%1, %%"REG_a"), %%mm0    \n\t"
642         "add %%"REG_a", %2              \n\t"
643         "add %%"REG_a", %1              \n\t"
644         "psubusb %%mm6, %%mm1           \n\t"
645         PAVGB" %%mm1, %%mm2             \n\t"
646         PAVGB" %%mm0, %%mm1             \n\t"
647         "movq %%mm2, (%2, %3)           \n\t"
648         "movq %%mm1, (%2, %%"REG_a")    \n\t"
649         "add %%"REG_a", %2              \n\t"
650         "subl $4, %0                    \n\t"
651         "jnz 1b                         \n\t"
652         :"+g"(h), "+S"(pixels), "+D" (block)
653         :"r" ((long)line_size)
654         :"%"REG_a, "memory");
655 }
656 
/*
 * Blends h rows of 8 bytes into block: each row of block is replaced by
 * PAVGB of its current contents and the corresponding row of pixels.
 * DEF, PAVGB and REG_a are macros supplied by the including file.
 *
 * block:     destination, read and rewritten; advanced by line_size per row
 * pixels:    source; advanced by line_size per row
 * line_size: byte stride used for both block and pixels
 * h:         row count. Four rows are handled per loop pass and the loop only
 *            exits when h reaches exactly 0, so h has to be a positive
 *            multiple of 4.
 */
DEF(avg_pixels8)657 static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
658 {
659     asm volatile(
        /* operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size */
        /* REG_a = 2 * line_size, the step for one pair of rows */
660         "lea (%3, %3), %%"REG_a"        \n\t"
661         "1:                             \n\t"
        /* rows 0 and 1: load existing block rows, average with pixels */
662         "movq (%2), %%mm0               \n\t"
663         "movq (%2, %3), %%mm1           \n\t"
664         PAVGB" (%1), %%mm0              \n\t"
665         PAVGB" (%1, %3), %%mm1          \n\t"
666         "movq %%mm0, (%2)               \n\t"
667         "movq %%mm1, (%2, %3)           \n\t"
668         "add %%"REG_a", %1              \n\t"
669         "add %%"REG_a", %2              \n\t"
        /* rows 2 and 3 of this pass */
670         "movq (%2), %%mm0               \n\t"
671         "movq (%2, %3), %%mm1           \n\t"
672         PAVGB" (%1), %%mm0              \n\t"
673         PAVGB" (%1, %3), %%mm1          \n\t"
674         "add %%"REG_a", %1              \n\t"
675         "movq %%mm0, (%2)               \n\t"
676         "movq %%mm1, (%2, %3)           \n\t"
677         "add %%"REG_a", %2              \n\t"
678         "subl $4, %0                    \n\t"
679         "jnz 1b                         \n\t"
680         :"+g"(h), "+S"(pixels), "+D"(block)
681         :"r" ((long)line_size)
682         :"%"REG_a, "memory");
683 }
684 
/*
 * avg_pixels8_x2: horizontal half-sample average, then average into block.
 *
 * For every row the code computes PAVGB(pixels[x], pixels[x + 1]) for the
 * 8 byte positions, combines that with the bytes already in block using a
 * second PAVGB, and stores the result to block. Because the second source
 * load starts at byte offset 1, bytes 0..8 of each source row are read.
 * PAVGB is a macro supplied by the including file (presumably the packed
 * unsigned-byte average instruction -- confirm at the include site).
 *
 * block     - destination; read and updated in place
 * pixels    - source
 * line_size - byte distance between consecutive rows, used for both buffers
 * h         - row count; each loop pass handles 4 rows and the loop only
 *             exits when the counter reaches exactly 0, so h has to be a
 *             non-zero multiple of 4
 *
 * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size (as long).
 * REG_a is loaded with 2*line_size and is listed as clobbered.
 */
DEF(avg_pixels8_x2)685 static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
686 {
687     asm volatile(
            /* REG_a = 2 * line_size */
688         "lea (%3, %3), %%"REG_a"        \n\t"
689         "1:                             \n\t"
            /* rows 0 and 1: source, source shifted by one byte, destination */
690         "movq (%1), %%mm0               \n\t"
691         "movq (%1, %3), %%mm2           \n\t"
692         PAVGB" 1(%1), %%mm0             \n\t"
693         PAVGB" 1(%1, %3), %%mm2         \n\t"
694         PAVGB" (%2), %%mm0              \n\t"
695         PAVGB" (%2, %3), %%mm2          \n\t"
696         "add %%"REG_a", %1              \n\t"
697         "movq %%mm0, (%2)               \n\t"
698         "movq %%mm2, (%2, %3)           \n\t"
            /* rows 2 and 3: the source pointer has already been advanced,
             * the destination pointer is advanced before it is read */
699         "movq (%1), %%mm0               \n\t"
700         "movq (%1, %3), %%mm2           \n\t"
701         PAVGB" 1(%1), %%mm0             \n\t"
702         PAVGB" 1(%1, %3), %%mm2         \n\t"
703         "add %%"REG_a", %2              \n\t"
704         "add %%"REG_a", %1              \n\t"
705         PAVGB" (%2), %%mm0              \n\t"
706         PAVGB" (%2, %3), %%mm2          \n\t"
707         "movq %%mm0, (%2)               \n\t"
708         "movq %%mm2, (%2, %3)           \n\t"
709         "add %%"REG_a", %2              \n\t"
            /* four rows done; repeat until the row counter hits zero */
710         "subl $4, %0                    \n\t"
711         "jnz 1b                         \n\t"
712         :"+g"(h), "+S"(pixels), "+D"(block)
713         :"r" ((long)line_size)
714         :"%"REG_a, "memory");
715 }
716 
/*
 * avg_pixels8_y2: vertical half-sample average, then average into block.
 *
 * Output row r is PAVGB(PAVGB(source row r, source row r + 1), block row r),
 * computed on 8 bytes per row and stored back to block. Source row 0 is
 * loaded once before the loop and every pass loads four further rows, so
 * h + 1 source rows are read in total. The last row loaded in a pass is
 * kept in mm0 and reused as the first row of the next pass.
 * PAVGB is a macro supplied by the including file (presumably the packed
 * unsigned-byte average instruction -- confirm at the include site).
 *
 * block     - destination; read and updated in place
 * pixels    - source
 * line_size - byte distance between consecutive rows, used for both buffers
 * h         - row count; each loop pass handles 4 rows and the loop only
 *             exits when the counter reaches exactly 0, so h has to be a
 *             non-zero multiple of 4
 *
 * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size (as long).
 * REG_a is loaded with 2*line_size and is listed as clobbered.
 */
DEF(avg_pixels8_y2)717 static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
718 {
719     asm volatile(
            /* REG_a = 2 * line_size; mm0 = source row 0 */
720         "lea (%3, %3), %%"REG_a"        \n\t"
721         "movq (%1), %%mm0               \n\t"
            /* bias the destination pointer back by one row so that (%2, %3)
             * and (%2, REG_a) address the same row numbers as the source
             * operands minus one */
722         "sub %3, %2                     \n\t"
723         "1:                             \n\t"
            /* mm1 = source row 1, mm2 = source row 2 */
724         "movq (%1, %3), %%mm1           \n\t"
725         "movq (%1, %%"REG_a"), %%mm2    \n\t"
726         "add %%"REG_a", %1              \n\t"
            /* mm0 = avg(row 0, row 1), mm1 = avg(row 1, row 2) */
727         PAVGB" %%mm1, %%mm0             \n\t"
728         PAVGB" %%mm2, %%mm1             \n\t"
            /* fold in destination rows 0 and 1 and store them */
729         "movq (%2, %3), %%mm3           \n\t"
730         "movq (%2, %%"REG_a"), %%mm4    \n\t"
731         PAVGB" %%mm3, %%mm0             \n\t"
732         PAVGB" %%mm4, %%mm1             \n\t"
733         "movq %%mm0, (%2, %3)           \n\t"
734         "movq %%mm1, (%2, %%"REG_a")    \n\t"
            /* mm1 = source row 3, mm0 = source row 4 (kept for next pass) */
735         "movq (%1, %3), %%mm1           \n\t"
736         "movq (%1, %%"REG_a"), %%mm0    \n\t"
            /* mm2 = avg(row 2, row 3), mm1 = avg(row 3, row 4) */
737         PAVGB" %%mm1, %%mm2             \n\t"
738         PAVGB" %%mm0, %%mm1             \n\t"
739         "add %%"REG_a", %2              \n\t"
740         "add %%"REG_a", %1              \n\t"
            /* fold in destination rows 2 and 3 and store them */
741         "movq (%2, %3), %%mm3           \n\t"
742         "movq (%2, %%"REG_a"), %%mm4    \n\t"
743         PAVGB" %%mm3, %%mm2             \n\t"
744         PAVGB" %%mm4, %%mm1             \n\t"
745         "movq %%mm2, (%2, %3)           \n\t"
746         "movq %%mm1, (%2, %%"REG_a")    \n\t"
747         "add %%"REG_a", %2              \n\t"
            /* four rows done; repeat until the row counter hits zero */
748         "subl $4, %0                    \n\t"
749         "jnz 1b                         \n\t"
750         :"+g"(h), "+S"(pixels), "+D"(block)
751         :"r" ((long)line_size)
752         :"%"REG_a, "memory");
753 }
754 
755 /* Note this is not correctly rounded, but this function is only
756  * used for B-frames so it does not matter. */
/*
 * avg_pixels8_xy2: combined horizontal and vertical half-sample average,
 * then average into block.
 *
 * Each source row is first averaged with itself shifted by one byte
 * (PAVGB of offset 0 and offset 1), two vertically adjacent results are
 * averaged with another PAVGB, and that value is averaged with the bytes
 * already in block before being stored. The value is built from cascaded
 * two-input averages rather than one four-input sum. Source row 0 is
 * prepared before the loop and each pass loads four further rows, so
 * h + 1 source rows and bytes 0..8 of each of them are read.
 *
 * MOVQ_BONE, PAVGB and ASMALIGN are macros defined outside this block.
 * NOTE(review): MOVQ_BONE(mm6) presumably fills every byte of mm6 with 1,
 * and the psubusb of mm6 from one row of each group of four looks like a
 * partial compensation for the upward bias of cascaded averages -- confirm
 * against the macro definitions.
 *
 * block     - destination; read and updated in place
 * pixels    - source
 * line_size - byte distance between consecutive rows, used for both buffers
 * h         - row count; each loop pass handles 4 rows and the loop only
 *             exits when the counter reaches exactly 0, so h has to be a
 *             non-zero multiple of 4
 *
 * Asm operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size (as long).
 * REG_a is loaded with 2*line_size and is listed as clobbered.
 */
DEF(avg_pixels8_xy2)757 static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
758 {
759     MOVQ_BONE(mm6);
760     asm volatile(
            /* REG_a = 2 * line_size; mm0 = horizontal average of row 0 */
761         "lea (%3, %3), %%"REG_a"        \n\t"
762         "movq (%1), %%mm0               \n\t"
763         PAVGB" 1(%1), %%mm0             \n\t"
764          ASMALIGN(3)
765         "1:                             \n\t"
            /* mm2 = row 2 (reduced by mm6, saturating at 0), mm1 = row 1;
             * then both are averaged with their one-byte-shifted copies */
766         "movq (%1, %%"REG_a"), %%mm2    \n\t"
767         "movq (%1, %3), %%mm1           \n\t"
768         "psubusb %%mm6, %%mm2           \n\t"
769         PAVGB" 1(%1, %3), %%mm1         \n\t"
770         PAVGB" 1(%1, %%"REG_a"), %%mm2  \n\t"
771         "add %%"REG_a", %1              \n\t"
            /* vertical step: mm0 = avg(row 0, row 1), mm1 = avg(row 1, row 2) */
772         PAVGB" %%mm1, %%mm0             \n\t"
773         PAVGB" %%mm2, %%mm1             \n\t"
            /* fold in destination rows 0 and 1 and store them */
774         PAVGB" (%2), %%mm0              \n\t"
775         PAVGB" (%2, %3), %%mm1          \n\t"
776         "movq %%mm0, (%2)               \n\t"
777         "movq %%mm1, (%2, %3)           \n\t"
            /* mm1 = row 3, mm0 = row 4 (kept for the next pass), each
             * averaged with its one-byte-shifted copy; no psubusb here */
778         "movq (%1, %3), %%mm1           \n\t"
779         "movq (%1, %%"REG_a"), %%mm0    \n\t"
780         PAVGB" 1(%1, %3), %%mm1         \n\t"
781         PAVGB" 1(%1, %%"REG_a"), %%mm0  \n\t"
782         "add %%"REG_a", %2              \n\t"
783         "add %%"REG_a", %1              \n\t"
            /* vertical step: mm2 = avg(row 2, row 3), mm1 = avg(row 3, row 4) */
784         PAVGB" %%mm1, %%mm2             \n\t"
785         PAVGB" %%mm0, %%mm1             \n\t"
            /* fold in destination rows 2 and 3 and store them */
786         PAVGB" (%2), %%mm2              \n\t"
787         PAVGB" (%2, %3), %%mm1          \n\t"
788         "movq %%mm2, (%2)               \n\t"
789         "movq %%mm1, (%2, %3)           \n\t"
790         "add %%"REG_a", %2              \n\t"
            /* four rows done; repeat until the row counter hits zero */
791         "subl $4, %0                    \n\t"
792         "jnz 1b                         \n\t"
793         :"+g"(h), "+S"(pixels), "+D"(block)
794         :"r" ((long)line_size)
795         :"%"REG_a,  "memory");
796 }
797 
DEF(avg_pixels4)798 static void DEF(avg_pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
799 {
800     do {
801         asm volatile(
802             "movd (%1), %%mm0               \n\t"
803             "movd (%1, %2), %%mm1           \n\t"
804             "movd (%1, %2, 2), %%mm2        \n\t"
805             "movd (%1, %3), %%mm3           \n\t"
806             PAVGB" (%0), %%mm0              \n\t"
807             PAVGB" (%0, %2), %%mm1          \n\t"
808             PAVGB" (%0, %2, 2), %%mm2       \n\t"
809             PAVGB" (%0, %3), %%mm3          \n\t"
810             "movd %%mm0, (%1)               \n\t"
811             "movd %%mm1, (%1, %2)           \n\t"
812             "movd %%mm2, (%1, %2, 2)        \n\t"
813             "movd %%mm3, (%1, %3)           \n\t"
814             ::"S"(pixels), "D"(block),
815              "r" ((long)line_size), "r"(3L*line_size)
816             :"memory");
817         block += 4*line_size;
818         pixels += 4*line_size;
819         h -= 4;
820     } while(h > 0);
821 }
822 
823 //FIXME the following could be optimized too ...
DEF(put_no_rnd_pixels16_x2)824 static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
825     DEF(put_no_rnd_pixels8_x2)(block  , pixels  , line_size, h);
826     DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
827 }
DEF(put_pixels16_y2)828 static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
829     DEF(put_pixels8_y2)(block  , pixels  , line_size, h);
830     DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h);
831 }
DEF(put_no_rnd_pixels16_y2)832 static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
833     DEF(put_no_rnd_pixels8_y2)(block  , pixels  , line_size, h);
834     DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
835 }
DEF(avg_pixels16)836 static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
837     DEF(avg_pixels8)(block  , pixels  , line_size, h);
838     DEF(avg_pixels8)(block+8, pixels+8, line_size, h);
839 }
DEF(avg_pixels16_x2)840 static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
841     DEF(avg_pixels8_x2)(block  , pixels  , line_size, h);
842     DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h);
843 }
DEF(avg_pixels16_y2)844 static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
845     DEF(avg_pixels8_y2)(block  , pixels  , line_size, h);
846     DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h);
847 }
DEF(avg_pixels16_xy2)848 static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
849     DEF(avg_pixels8_xy2)(block  , pixels  , line_size, h);
850     DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
851 }
852 
/*
 * QPEL_2TAP_L3(OPNAME) generates two static functions,
 * OPNAME##2tap_qpel16_l3 (16 bytes per row) and OPNAME##2tap_qpel8_l3
 * (8 bytes per row), with the parameters
 *
 *   dst    - destination
 *   src    - source; the only pointer that is advanced inside the loop
 *   stride - byte distance between consecutive rows (added to src per row;
 *            dst follows because it is addressed relative to src)
 *   h      - row count; decremented by one per row, the loop exits when it
 *            reaches exactly 0, so h has to be positive
 *   off1, off2 - byte offsets, relative to src, of the two extra taps
 *
 * Per row the generated code computes, with PAVGB,
 *   avg(avg(src[off1], src[off2]), src[0])
 * then applies STORE_OP(destination, register) and stores the register to
 * the destination. STORE_OP has to be defined when the macro is expanded;
 * it may expand to an extra instruction string or to nothing.
 *
 * Asm operands: %0 = h, %1 = src, %2 = off1, %3 = off2, %4 = dst - src,
 * %5 = stride. The destination is addressed as (%1,%4), that is
 * src + (dst - src).
 */
853 #define QPEL_2TAP_L3(OPNAME) \
854 static void DEF(OPNAME ## 2tap_qpel16_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\
855     asm volatile(\
856         "1:                    \n\t"\
857         "movq   (%1,%2), %%mm0 \n\t"\
858         "movq  8(%1,%2), %%mm1 \n\t"\
859         PAVGB"  (%1,%3), %%mm0 \n\t"\
860         PAVGB" 8(%1,%3), %%mm1 \n\t"\
861         PAVGB"  (%1),    %%mm0 \n\t"\
862         PAVGB" 8(%1),    %%mm1 \n\t"\
863         STORE_OP( (%1,%4),%%mm0)\
864         STORE_OP(8(%1,%4),%%mm1)\
865         "movq  %%mm0,  (%1,%4) \n\t"\
866         "movq  %%mm1, 8(%1,%4) \n\t"\
867         "add   %5, %1          \n\t"\
868         "decl  %0              \n\t"\
869         "jnz   1b              \n\t"\
870         :"+g"(h), "+r"(src)\
871         :"r"((long)off1), "r"((long)off2),\
872          "r"((long)(dst-src)), "r"((long)stride)\
873         :"memory"\
874     );\
875 }\
876 static void DEF(OPNAME ## 2tap_qpel8_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\
877     asm volatile(\
878         "1:                    \n\t"\
879         "movq   (%1,%2), %%mm0 \n\t"\
880         PAVGB"  (%1,%3), %%mm0 \n\t"\
881         PAVGB"  (%1),    %%mm0 \n\t"\
882         STORE_OP((%1,%4),%%mm0)\
883         "movq  %%mm0,  (%1,%4) \n\t"\
884         "add   %5, %1          \n\t"\
885         "decl  %0              \n\t"\
886         "jnz   1b              \n\t"\
887         :"+g"(h), "+r"(src)\
888         :"r"((long)off1), "r"((long)off2),\
889          "r"((long)(dst-src)), "r"((long)stride)\
890         :"memory"\
891     );\
892 }
893 
/* avg_ variants: STORE_OP(a,b) emits one more PAVGB that averages memory
 * operand a (the destination) into register b before the register is
 * stored, so the result is blended with what the destination already holds. */
894 #define STORE_OP(a,b) PAVGB" "#a","#b" \n\t"
895 QPEL_2TAP_L3(avg_)
896 #undef STORE_OP
/* put_ variants: STORE_OP expands to nothing, so the computed value simply
 * overwrites the destination. */
897 #define STORE_OP(a,b)
898 QPEL_2TAP_L3(put_)
899 #undef STORE_OP
/* The generator macro is local to this header and is removed again. */
900 #undef QPEL_2TAP_L3
901