/*
 * SIMD-optimized halfpel functions are compiled twice for rnd/no_rnd
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
 * and improved by Zdenek Kabelac <kabi@users.sf.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stddef.h>
#include <stdint.h>

// put_pixels
DEF(put,pixels8_x2)31 av_unused static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
32 {
33     MOVQ_BFE(mm6);
34     __asm__ volatile(
35         "lea    (%3, %3), %%"FF_REG_a"  \n\t"
36         ".p2align 3                     \n\t"
37         "1:                             \n\t"
38         "movq   (%1), %%mm0             \n\t"
39         "movq   1(%1), %%mm1            \n\t"
40         "movq   (%1, %3), %%mm2         \n\t"
41         "movq   1(%1, %3), %%mm3        \n\t"
42         PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
43         "movq   %%mm4, (%2)             \n\t"
44         "movq   %%mm5, (%2, %3)         \n\t"
45         "add    %%"FF_REG_a", %1        \n\t"
46         "add    %%"FF_REG_a", %2        \n\t"
47         "movq   (%1), %%mm0             \n\t"
48         "movq   1(%1), %%mm1            \n\t"
49         "movq   (%1, %3), %%mm2         \n\t"
50         "movq   1(%1, %3), %%mm3        \n\t"
51         PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
52         "movq   %%mm4, (%2)             \n\t"
53         "movq   %%mm5, (%2, %3)         \n\t"
54         "add    %%"FF_REG_a", %1        \n\t"
55         "add    %%"FF_REG_a", %2        \n\t"
56         "subl   $4, %0                  \n\t"
57         "jnz    1b                      \n\t"
58         :"+g"(h), "+S"(pixels), "+D"(block)
59         :"r"((x86_reg)line_size)
60         :FF_REG_a, "memory");
61 }
62 
DEF(put,pixels16_x2)63 av_unused static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
64 {
65     MOVQ_BFE(mm6);
66     __asm__ volatile(
67         "lea    (%3, %3), %%"FF_REG_a"  \n\t"
68         ".p2align 3                     \n\t"
69         "1:                             \n\t"
70         "movq   (%1), %%mm0             \n\t"
71         "movq   1(%1), %%mm1            \n\t"
72         "movq   (%1, %3), %%mm2         \n\t"
73         "movq   1(%1, %3), %%mm3        \n\t"
74         PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
75         "movq   %%mm4, (%2)             \n\t"
76         "movq   %%mm5, (%2, %3)         \n\t"
77         "movq   8(%1), %%mm0            \n\t"
78         "movq   9(%1), %%mm1            \n\t"
79         "movq   8(%1, %3), %%mm2        \n\t"
80         "movq   9(%1, %3), %%mm3        \n\t"
81         PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
82         "movq   %%mm4, 8(%2)            \n\t"
83         "movq   %%mm5, 8(%2, %3)        \n\t"
84         "add    %%"FF_REG_a", %1        \n\t"
85         "add    %%"FF_REG_a", %2        \n\t"
86         "movq   (%1), %%mm0             \n\t"
87         "movq   1(%1), %%mm1            \n\t"
88         "movq   (%1, %3), %%mm2         \n\t"
89         "movq   1(%1, %3), %%mm3        \n\t"
90         PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
91         "movq   %%mm4, (%2)             \n\t"
92         "movq   %%mm5, (%2, %3)         \n\t"
93         "movq   8(%1), %%mm0            \n\t"
94         "movq   9(%1), %%mm1            \n\t"
95         "movq   8(%1, %3), %%mm2        \n\t"
96         "movq   9(%1, %3), %%mm3        \n\t"
97         PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
98         "movq   %%mm4, 8(%2)            \n\t"
99         "movq   %%mm5, 8(%2, %3)        \n\t"
100         "add    %%"FF_REG_a", %1        \n\t"
101         "add    %%"FF_REG_a", %2        \n\t"
102         "subl   $4, %0                  \n\t"
103         "jnz    1b                      \n\t"
104         :"+g"(h), "+S"(pixels), "+D"(block)
105         :"r"((x86_reg)line_size)
106         :FF_REG_a, "memory");
107 }
108 
DEF(put,pixels8_y2)109 av_unused static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
110 {
111     MOVQ_BFE(mm6);
112     __asm__ volatile(
113         "lea (%3, %3), %%"FF_REG_a"     \n\t"
114         "movq (%1), %%mm0               \n\t"
115         ".p2align 3                     \n\t"
116         "1:                             \n\t"
117         "movq   (%1, %3), %%mm1         \n\t"
118         "movq   (%1, %%"FF_REG_a"),%%mm2\n\t"
119         PAVGBP(%%mm1, %%mm0, %%mm4,   %%mm2, %%mm1, %%mm5)
120         "movq   %%mm4, (%2)             \n\t"
121         "movq   %%mm5, (%2, %3)         \n\t"
122         "add    %%"FF_REG_a", %1        \n\t"
123         "add    %%"FF_REG_a", %2        \n\t"
124         "movq   (%1, %3), %%mm1         \n\t"
125         "movq   (%1, %%"FF_REG_a"),%%mm0\n\t"
126         PAVGBP(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
127         "movq   %%mm4, (%2)             \n\t"
128         "movq   %%mm5, (%2, %3)         \n\t"
129         "add    %%"FF_REG_a", %1        \n\t"
130         "add    %%"FF_REG_a", %2        \n\t"
131         "subl   $4, %0                  \n\t"
132         "jnz    1b                      \n\t"
133         :"+g"(h), "+S"(pixels), "+D"(block)
134         :"r"((x86_reg)line_size)
135         :FF_REG_a, "memory");
136 }
137 
DEF(avg,pixels16_x2)138 av_unused static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
139 {
140     MOVQ_BFE(mm6);
141         __asm__ volatile(
142             ".p2align 3                 \n\t"
143             "1:                         \n\t"
144             "movq  (%1), %%mm0          \n\t"
145             "movq  1(%1), %%mm1         \n\t"
146             "movq  (%2), %%mm3          \n\t"
147             PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
148             PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
149             "movq  %%mm0, (%2)          \n\t"
150             "movq  8(%1), %%mm0         \n\t"
151             "movq  9(%1), %%mm1         \n\t"
152             "movq  8(%2), %%mm3         \n\t"
153             PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
154             PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
155             "movq  %%mm0, 8(%2)         \n\t"
156             "add    %3, %1              \n\t"
157             "add    %3, %2              \n\t"
158             "subl   $1, %0              \n\t"
159             "jnz    1b                  \n\t"
160             :"+g"(h), "+S"(pixels), "+D"(block)
161             :"r"((x86_reg)line_size)
162             :"memory");
163 }
164 
DEF(avg,pixels8_y2)165 av_unused static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
166 {
167     MOVQ_BFE(mm6);
168     __asm__ volatile(
169         "lea    (%3, %3), %%"FF_REG_a"  \n\t"
170         "movq   (%1), %%mm0             \n\t"
171         ".p2align 3                     \n\t"
172         "1:                             \n\t"
173         "movq   (%1, %3), %%mm1         \n\t"
174         "movq   (%1, %%"FF_REG_a"), %%mm2 \n\t"
175         PAVGBP(%%mm1, %%mm0, %%mm4,   %%mm2, %%mm1, %%mm5)
176         "movq   (%2), %%mm3             \n\t"
177         PAVGB_MMX(%%mm3, %%mm4, %%mm0, %%mm6)
178         "movq   (%2, %3), %%mm3         \n\t"
179         PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6)
180         "movq   %%mm0, (%2)             \n\t"
181         "movq   %%mm1, (%2, %3)         \n\t"
182         "add    %%"FF_REG_a", %1        \n\t"
183         "add    %%"FF_REG_a", %2        \n\t"
184 
185         "movq   (%1, %3), %%mm1         \n\t"
186         "movq   (%1, %%"FF_REG_a"), %%mm0 \n\t"
187         PAVGBP(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
188         "movq   (%2), %%mm3             \n\t"
189         PAVGB_MMX(%%mm3, %%mm4, %%mm2, %%mm6)
190         "movq   (%2, %3), %%mm3         \n\t"
191         PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6)
192         "movq   %%mm2, (%2)             \n\t"
193         "movq   %%mm1, (%2, %3)         \n\t"
194         "add    %%"FF_REG_a", %1        \n\t"
195         "add    %%"FF_REG_a", %2        \n\t"
196 
197         "subl   $4, %0                  \n\t"
198         "jnz    1b                      \n\t"
199         :"+g"(h), "+S"(pixels), "+D"(block)
200         :"r"((x86_reg)line_size)
201         :FF_REG_a, "memory");
202 }
203