/*
 * SIMD-optimized halfpel functions are compiled twice for rnd/no_rnd
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
 * and improved by Zdenek Kabelac <kabi@users.sf.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stddef.h>
#include <stdint.h>

// put_pixels
DEF(put,pixels8_x2)31 static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
32 {
33     MOVQ_BFE(mm6);
34     __asm__ volatile(
35         "lea    (%3, %3), %%"REG_a"     \n\t"
36         ".p2align 3                     \n\t"
37         "1:                             \n\t"
38         "movq   (%1), %%mm0             \n\t"
39         "movq   1(%1), %%mm1            \n\t"
40         "movq   (%1, %3), %%mm2         \n\t"
41         "movq   1(%1, %3), %%mm3        \n\t"
42         PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
43         "movq   %%mm4, (%2)             \n\t"
44         "movq   %%mm5, (%2, %3)         \n\t"
45         "add    %%"REG_a", %1           \n\t"
46         "add    %%"REG_a", %2           \n\t"
47         "movq   (%1), %%mm0             \n\t"
48         "movq   1(%1), %%mm1            \n\t"
49         "movq   (%1, %3), %%mm2         \n\t"
50         "movq   1(%1, %3), %%mm3        \n\t"
51         PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
52         "movq   %%mm4, (%2)             \n\t"
53         "movq   %%mm5, (%2, %3)         \n\t"
54         "add    %%"REG_a", %1           \n\t"
55         "add    %%"REG_a", %2           \n\t"
56         "subl   $4, %0                  \n\t"
57         "jnz    1b                      \n\t"
58         :"+g"(h), "+S"(pixels), "+D"(block)
59         :"r"((x86_reg)line_size)
60         :REG_a, "memory");
61 }
62 
DEF(put,pixels16_x2)63 static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
64 {
65     MOVQ_BFE(mm6);
66     __asm__ volatile(
67         "lea        (%3, %3), %%"REG_a" \n\t"
68         ".p2align 3                     \n\t"
69         "1:                             \n\t"
70         "movq   (%1), %%mm0             \n\t"
71         "movq   1(%1), %%mm1            \n\t"
72         "movq   (%1, %3), %%mm2         \n\t"
73         "movq   1(%1, %3), %%mm3        \n\t"
74         PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
75         "movq   %%mm4, (%2)             \n\t"
76         "movq   %%mm5, (%2, %3)         \n\t"
77         "movq   8(%1), %%mm0            \n\t"
78         "movq   9(%1), %%mm1            \n\t"
79         "movq   8(%1, %3), %%mm2        \n\t"
80         "movq   9(%1, %3), %%mm3        \n\t"
81         PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
82         "movq   %%mm4, 8(%2)            \n\t"
83         "movq   %%mm5, 8(%2, %3)        \n\t"
84         "add    %%"REG_a", %1           \n\t"
85         "add    %%"REG_a", %2           \n\t"
86         "movq   (%1), %%mm0             \n\t"
87         "movq   1(%1), %%mm1            \n\t"
88         "movq   (%1, %3), %%mm2         \n\t"
89         "movq   1(%1, %3), %%mm3        \n\t"
90         PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
91         "movq   %%mm4, (%2)             \n\t"
92         "movq   %%mm5, (%2, %3)         \n\t"
93         "movq   8(%1), %%mm0            \n\t"
94         "movq   9(%1), %%mm1            \n\t"
95         "movq   8(%1, %3), %%mm2        \n\t"
96         "movq   9(%1, %3), %%mm3        \n\t"
97         PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
98         "movq   %%mm4, 8(%2)            \n\t"
99         "movq   %%mm5, 8(%2, %3)        \n\t"
100         "add    %%"REG_a", %1           \n\t"
101         "add    %%"REG_a", %2           \n\t"
102         "subl   $4, %0                  \n\t"
103         "jnz    1b                      \n\t"
104         :"+g"(h), "+S"(pixels), "+D"(block)
105         :"r"((x86_reg)line_size)
106         :REG_a, "memory");
107 }
108 
DEF(put,pixels8_y2)109 static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
110 {
111     MOVQ_BFE(mm6);
112     __asm__ volatile(
113         "lea (%3, %3), %%"REG_a"        \n\t"
114         "movq (%1), %%mm0               \n\t"
115         ".p2align 3                     \n\t"
116         "1:                             \n\t"
117         "movq   (%1, %3), %%mm1         \n\t"
118         "movq   (%1, %%"REG_a"),%%mm2   \n\t"
119         PAVGBP(%%mm1, %%mm0, %%mm4,   %%mm2, %%mm1, %%mm5)
120         "movq   %%mm4, (%2)             \n\t"
121         "movq   %%mm5, (%2, %3)         \n\t"
122         "add    %%"REG_a", %1           \n\t"
123         "add    %%"REG_a", %2           \n\t"
124         "movq   (%1, %3), %%mm1         \n\t"
125         "movq   (%1, %%"REG_a"),%%mm0   \n\t"
126         PAVGBP(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
127         "movq   %%mm4, (%2)             \n\t"
128         "movq   %%mm5, (%2, %3)         \n\t"
129         "add    %%"REG_a", %1           \n\t"
130         "add    %%"REG_a", %2           \n\t"
131         "subl   $4, %0                  \n\t"
132         "jnz    1b                      \n\t"
133         :"+g"(h), "+S"(pixels), "+D"(block)
134         :"r"((x86_reg)line_size)
135         :REG_a, "memory");
136 }
137 
DEF(avg,pixels16_x2)138 static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
139 {
140     MOVQ_BFE(mm6);
141     JUMPALIGN();
142     do {
143         __asm__ volatile(
144             "movq  %1, %%mm0            \n\t"
145             "movq  1%1, %%mm1           \n\t"
146             "movq  %0, %%mm3            \n\t"
147             PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
148             PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
149             "movq  %%mm0, %0            \n\t"
150             "movq  8%1, %%mm0           \n\t"
151             "movq  9%1, %%mm1           \n\t"
152             "movq  8%0, %%mm3           \n\t"
153             PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
154             PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
155             "movq  %%mm0, 8%0           \n\t"
156             :"+m"(*block)
157             :"m"(*pixels)
158             :"memory");
159         pixels += line_size;
160         block += line_size;
161     } while (--h);
162 }
163 
DEF(avg,pixels8_y2)164 static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
165 {
166     MOVQ_BFE(mm6);
167     __asm__ volatile(
168         "lea    (%3, %3), %%"REG_a"     \n\t"
169         "movq   (%1), %%mm0             \n\t"
170         ".p2align 3                     \n\t"
171         "1:                             \n\t"
172         "movq   (%1, %3), %%mm1         \n\t"
173         "movq   (%1, %%"REG_a"), %%mm2  \n\t"
174         PAVGBP(%%mm1, %%mm0, %%mm4,   %%mm2, %%mm1, %%mm5)
175         "movq   (%2), %%mm3             \n\t"
176         PAVGB_MMX(%%mm3, %%mm4, %%mm0, %%mm6)
177         "movq   (%2, %3), %%mm3         \n\t"
178         PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6)
179         "movq   %%mm0, (%2)             \n\t"
180         "movq   %%mm1, (%2, %3)         \n\t"
181         "add    %%"REG_a", %1           \n\t"
182         "add    %%"REG_a", %2           \n\t"
183 
184         "movq   (%1, %3), %%mm1         \n\t"
185         "movq   (%1, %%"REG_a"), %%mm0  \n\t"
186         PAVGBP(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
187         "movq   (%2), %%mm3             \n\t"
188         PAVGB_MMX(%%mm3, %%mm4, %%mm2, %%mm6)
189         "movq   (%2, %3), %%mm3         \n\t"
190         PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6)
191         "movq   %%mm2, (%2)             \n\t"
192         "movq   %%mm1, (%2, %3)         \n\t"
193         "add    %%"REG_a", %1           \n\t"
194         "add    %%"REG_a", %2           \n\t"
195 
196         "subl   $4, %0                  \n\t"
197         "jnz    1b                      \n\t"
198         :"+g"(h), "+S"(pixels), "+D"(block)
199         :"r"((x86_reg)line_size)
200         :REG_a, "memory");
201 }
202