1 /*
2 * img_yuv_planar.c - YUV planar image format conversion routines
3 * Written by Andrew Church <achurch@achurch.org>
4 *
5 * This file is part of transcode, a video stream processing tool.
6 * transcode is free software, distributable under the terms of the GNU
7 * General Public License (version 2 or later). See the file COPYING
8 * for details.
9 */
10
11 #include "ac.h"
12 #include "imgconvert.h"
13 #include "img_internal.h"
14
15 #include <string.h>
16
17 /*************************************************************************/
18 /*************************************************************************/
19
20 /* Standard C implementations */
21
22 /*************************************************************************/
23
24 /* Identity transformations */
25
/*
 * yuv420p_copy:  Identity "conversion" for three-plane 4:2:0 data.
 * Copies width*height bytes of plane 0 and (width/2)*(height/2) bytes of
 * each of planes 1 and 2 from src to dest.  Always returns 1.
 */
static int yuv420p_copy(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int luma_bytes = width * height;
    const int chroma_bytes = (width / 2) * (height / 2);
    int plane;

    ac_memcpy(dest[0], src[0], luma_bytes);
    for (plane = 1; plane <= 2; plane++) {
        ac_memcpy(dest[plane], src[plane], chroma_bytes);
    }
    return 1;
}
33
/*
 * yuv411p_copy:  Identity "conversion" for three-plane 4:1:1 data.
 * Copies width*height bytes of plane 0 and (width/4)*height bytes of each
 * of planes 1 and 2 from src to dest.  Always returns 1.
 */
static int yuv411p_copy(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int chroma_bytes = (width / 4) * height;
    int plane;

    ac_memcpy(dest[0], src[0], width * height);
    for (plane = 1; plane <= 2; plane++) {
        ac_memcpy(dest[plane], src[plane], chroma_bytes);
    }
    return 1;
}
41
/*
 * yuv422p_copy:  Identity "conversion" for three-plane 4:2:2 data.
 * Copies width*height bytes of plane 0 and (width/2)*height bytes of each
 * of planes 1 and 2 from src to dest.  Always returns 1.
 */
static int yuv422p_copy(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int chroma_bytes = (width / 2) * height;
    int plane;

    ac_memcpy(dest[0], src[0], width * height);
    for (plane = 1; plane <= 2; plane++) {
        ac_memcpy(dest[plane], src[plane], chroma_bytes);
    }
    return 1;
}
49
/*
 * yuv444p_copy:  Identity "conversion" for three-plane 4:4:4 data.
 * All three planes have the same size, width*height bytes, and are copied
 * in plane order from src to dest.  Always returns 1.
 */
static int yuv444p_copy(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int plane_bytes = width * height;
    int plane;

    for (plane = 0; plane < 3; plane++) {
        ac_memcpy(dest[plane], src[plane], plane_bytes);
    }
    return 1;
}
57
/*
 * y8_copy:  Identity "conversion" for single-plane Y8 data.  Copies
 * width*height bytes of plane 0; no other plane is touched.  Always
 * returns 1.
 */
static int y8_copy(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int plane_bytes = width * height;

    ac_memcpy(dest[0], src[0], plane_bytes);
    return 1;
}
63
64 /*************************************************************************/
65
/*
 * yuv420p_yuv411p:  Convert 4:2:0 planar to 4:1:1 planar.
 * Plane 0 is copied unchanged.  For each pair of output rows, every two
 * horizontally adjacent source chroma samples are averaged (rounding up)
 * into one output sample, and the resulting output row is duplicated into
 * the following row.  A trailing odd row, and a trailing odd sample in a
 * source chroma row, are not processed.  Always returns 1.
 */
static int yuv420p_yuv411p(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int in_stride = width / 2;    /* bytes per source chroma row */
    const int out_stride = width / 4;   /* bytes per output chroma row */
    int row, plane, col;

    ac_memcpy(dest[0], src[0], width * height);
    for (row = 0; row < (height & ~1); row += 2) {
        for (plane = 1; plane <= 2; plane++) {
            const uint8_t *in = src[plane] + (row / 2) * in_stride;
            uint8_t *out = dest[plane] + row * out_stride;

            for (col = 0; col < (in_stride & ~1); col += 2) {
                out[col / 2] = (in[col] + in[col + 1] + 1) >> 1;
            }
            /* The second row of the pair is a copy of the first. */
            ac_memcpy(out + out_stride, out, out_stride);
        }
    }
    return 1;
}
82
/*
 * yuv420p_yuv422p:  Convert 4:2:0 planar to 4:2:2 planar.
 * Plane 0 is copied unchanged; each source chroma row (width/2 bytes) is
 * written to two consecutive output rows.  A trailing odd row is not
 * processed.  Always returns 1.
 */
static int yuv420p_yuv422p(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int stride = width / 2;   /* chroma row length, source and output */
    int row, plane;

    ac_memcpy(dest[0], src[0], width * height);
    for (row = 0; row < (height & ~1); row += 2) {
        for (plane = 1; plane <= 2; plane++) {
            const uint8_t *in = src[plane] + (row / 2) * stride;
            uint8_t *out = dest[plane] + row * stride;

            ac_memcpy(out, in, stride);
            ac_memcpy(out + stride, in, stride);
        }
    }
    return 1;
}
95
/*
 * yuv420p_yuv444p:  Convert 4:2:0 planar to 4:4:4 planar.
 * Plane 0 is copied unchanged; each source chroma sample is replicated
 * into a 2x2 block of the output chroma plane.  Always returns 1.
 *
 * Only complete 2x2 blocks are processed, as in the other converters in
 * this file: the loops stop at (height & ~1) and (width & ~1).  Without
 * these bounds, an odd height would read a source chroma row that does
 * not exist and copy a row past the end of the output plane, and an odd
 * width would read one sample past the end of the source chroma row.
 * A trailing odd row or column of the output chroma planes is therefore
 * left unwritten.
 */
static int yuv420p_yuv444p(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int in_stride = width / 2;    /* bytes per source chroma row */
    int x, y, plane;

    ac_memcpy(dest[0], src[0], width * height);
    for (y = 0; y < (height & ~1); y += 2) {
        for (plane = 1; plane <= 2; plane++) {
            const uint8_t *in = src[plane] + (y / 2) * in_stride;
            uint8_t *out = dest[plane] + y * width;

            for (x = 0; x < (width & ~1); x += 2) {
                out[x] = out[x + 1] = in[x / 2];
            }
            /* The second row of the pair is a copy of the first. */
            ac_memcpy(out + width, out, width);
        }
    }
    return 1;
}
112
113 /*************************************************************************/
114
/*
 * yuv411p_yuv420p:  Convert 4:1:1 planar to 4:2:0 planar.
 * Plane 0 is copied unchanged.  For each pair of source rows, vertically
 * adjacent chroma samples are averaged (rounding up) and the result is
 * stored twice horizontally in the output row.  A trailing odd row is not
 * processed.  Always returns 1.
 */
static int yuv411p_yuv420p(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int in_stride = width / 4;    /* bytes per source chroma row */
    const int out_stride = width / 2;   /* bytes per output chroma row */
    int row, plane, col;

    ac_memcpy(dest[0], src[0], width * height);
    for (row = 0; row < (height & ~1); row += 2) {
        for (plane = 1; plane <= 2; plane++) {
            const uint8_t *upper = src[plane] + row * in_stride;
            const uint8_t *lower = upper + in_stride;
            uint8_t *out = dest[plane] + (row / 2) * out_stride;

            for (col = 0; col < (out_stride & ~1); col += 2) {
                out[col] = (upper[col / 2] + lower[col / 2] + 1) >> 1;
                out[col + 1] = out[col];
            }
        }
    }
    return 1;
}
131
/*
 * yuv411p_yuv422p:  Convert 4:1:1 planar to 4:2:2 planar.
 * Plane 0 is copied unchanged; in every row each source chroma sample is
 * written to two horizontally adjacent output samples.  Always returns 1.
 */
static int yuv411p_yuv422p(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int in_stride = width / 4;    /* bytes per source chroma row */
    const int out_stride = width / 2;   /* bytes per output chroma row */
    int row, plane, col;

    ac_memcpy(dest[0], src[0], width * height);
    for (row = 0; row < height; row++) {
        for (plane = 1; plane <= 2; plane++) {
            const uint8_t *in = src[plane] + row * in_stride;
            uint8_t *out = dest[plane] + row * out_stride;

            for (col = 0; col < (out_stride & ~1); col += 2) {
                const uint8_t sample = in[col / 2];
                out[col] = sample;
                out[col + 1] = sample;
            }
        }
    }
    return 1;
}
146
/*
 * yuv411p_yuv444p:  Convert 4:1:1 planar to 4:4:4 planar.
 * Plane 0 is copied unchanged; in every row each source chroma sample is
 * written to four horizontally adjacent output samples.  Output columns
 * beyond the last multiple of 4 are not written.  Always returns 1.
 */
static int yuv411p_yuv444p(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int in_stride = width / 4;    /* bytes per source chroma row */
    int row, plane, col, rep;

    ac_memcpy(dest[0], src[0], width * height);
    for (row = 0; row < height; row++) {
        for (plane = 1; plane <= 2; plane++) {
            const uint8_t *in = src[plane] + row * in_stride;
            uint8_t *out = dest[plane] + row * width;

            for (col = 0; col < (width & ~3); col += 4) {
                const uint8_t sample = in[col / 4];
                for (rep = 0; rep < 4; rep++) {
                    out[col + rep] = sample;
                }
            }
        }
    }
    return 1;
}
165
166 /*************************************************************************/
167
/*
 * yuv422p_yuv420p:  Convert 4:2:2 planar to 4:2:0 planar.
 * Plane 0 is copied unchanged; each output chroma sample is the average
 * (rounding up) of the two vertically adjacent source samples in a pair
 * of rows.  A trailing odd row is not processed.  Always returns 1.
 */
static int yuv422p_yuv420p(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int stride = width / 2;   /* chroma row length, source and output */
    int row, plane, col;

    ac_memcpy(dest[0], src[0], width * height);
    for (row = 0; row < (height & ~1); row += 2) {
        for (plane = 1; plane <= 2; plane++) {
            const uint8_t *upper = src[plane] + row * stride;
            const uint8_t *lower = upper + stride;
            uint8_t *out = dest[plane] + (row / 2) * stride;

            for (col = 0; col < stride; col++) {
                out[col] = (upper[col] + lower[col] + 1) >> 1;
            }
        }
    }
    return 1;
}
182
/*
 * yuv422p_yuv411p:  Convert 4:2:2 planar to 4:1:1 planar.
 * Plane 0 is copied unchanged; in every row each output chroma sample is
 * the average (rounding up) of two horizontally adjacent source samples.
 * A trailing odd sample in a source chroma row is not processed.  Always
 * returns 1.
 */
static int yuv422p_yuv411p(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int in_stride = width / 2;    /* bytes per source chroma row */
    const int out_stride = width / 4;   /* bytes per output chroma row */
    int row, plane, col;

    ac_memcpy(dest[0], src[0], width * height);
    for (row = 0; row < height; row++) {
        for (plane = 1; plane <= 2; plane++) {
            const uint8_t *in = src[plane] + row * in_stride;
            uint8_t *out = dest[plane] + row * out_stride;

            for (col = 0; col < (in_stride & ~1); col += 2) {
                out[col / 2] = (in[col] + in[col + 1] + 1) >> 1;
            }
        }
    }
    return 1;
}
197
/*
 * yuv422p_yuv444p:  Convert 4:2:2 planar to 4:4:4 planar.
 * Plane 0 is copied unchanged; in every row each source chroma sample is
 * written to two horizontally adjacent output samples.  A trailing odd
 * output column is not written.  Always returns 1.
 */
static int yuv422p_yuv444p(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int in_stride = width / 2;    /* bytes per source chroma row */
    int row, plane, col;

    ac_memcpy(dest[0], src[0], width * height);
    for (row = 0; row < height; row++) {
        for (plane = 1; plane <= 2; plane++) {
            const uint8_t *in = src[plane] + row * in_stride;
            uint8_t *out = dest[plane] + row * width;

            for (col = 0; col < (width & ~1); col += 2) {
                const uint8_t sample = in[col / 2];
                out[col] = sample;
                out[col + 1] = sample;
            }
        }
    }
    return 1;
}
212
213 /*************************************************************************/
214
/*
 * yuv444p_yuv420p:  Convert 4:4:4 planar to 4:2:0 planar.
 * Plane 0 is copied unchanged; each output chroma sample is the rounded
 * average, (sum + 2) / 4, of a 2x2 block of source samples.  A trailing
 * odd row or column is not processed.  Always returns 1.
 */
static int yuv444p_yuv420p(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int out_stride = width / 2;   /* bytes per output chroma row */
    int row, plane, col;

    ac_memcpy(dest[0], src[0], width * height);
    for (row = 0; row < (height & ~1); row += 2) {
        for (plane = 1; plane <= 2; plane++) {
            const uint8_t *upper = src[plane] + row * width;
            const uint8_t *lower = upper + width;
            uint8_t *out = dest[plane] + (row / 2) * out_stride;

            for (col = 0; col < (width & ~1); col += 2) {
                const int sum = upper[col] + upper[col + 1]
                              + lower[col] + lower[col + 1];
                out[col / 2] = (sum + 2) >> 2;
            }
        }
    }
    return 1;
}
233
/*
 * yuv444p_yuv411p:  Convert 4:4:4 planar to 4:1:1 planar.
 * Plane 0 is copied unchanged; in every row each output chroma sample is
 * the rounded average, (sum + 2) / 4, of four horizontally adjacent
 * source samples.  Source columns beyond the last multiple of 4 are not
 * processed.  Always returns 1.
 */
static int yuv444p_yuv411p(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int out_stride = width / 4;   /* bytes per output chroma row */
    int row, plane, col;

    ac_memcpy(dest[0], src[0], width * height);
    for (row = 0; row < height; row++) {
        for (plane = 1; plane <= 2; plane++) {
            const uint8_t *in = src[plane] + row * width;
            uint8_t *out = dest[plane] + row * out_stride;

            for (col = 0; col < (width & ~3); col += 4) {
                const int sum = in[col] + in[col + 1]
                              + in[col + 2] + in[col + 3];
                out[col / 4] = (sum + 2) >> 2;
            }
        }
    }
    return 1;
}
252
/*
 * yuv444p_yuv422p:  Convert 4:4:4 planar to 4:2:2 planar.
 * Plane 0 is copied unchanged; in every row each output chroma sample is
 * the average (rounding up) of two horizontally adjacent source samples.
 * A trailing odd source column is not processed.  Always returns 1.
 */
static int yuv444p_yuv422p(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int out_stride = width / 2;   /* bytes per output chroma row */
    int row, plane, col;

    ac_memcpy(dest[0], src[0], width * height);
    for (row = 0; row < height; row++) {
        for (plane = 1; plane <= 2; plane++) {
            const uint8_t *in = src[plane] + row * width;
            uint8_t *out = dest[plane] + row * out_stride;

            for (col = 0; col < (width & ~1); col += 2) {
                out[col / 2] = (in[col] + in[col + 1] + 1) >> 1;
            }
        }
    }
    return 1;
}
267
268 /*************************************************************************/
269
270 /* We treat Y8 as a planar format */
271
/*
 * yuvp_y8:  Convert any of the planar formats to Y8 by copying the
 * width*height bytes of plane 0; planes 1 and 2 of the source are not
 * read.  Always returns 1.
 */
static int yuvp_y8(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int plane_bytes = width * height;

    ac_memcpy(dest[0], src[0], plane_bytes);
    return 1;
}
277
/*
 * y8_yuv420p:  Convert Y8 to 4:2:0 planar.  Plane 0 is copied unchanged
 * and every byte of planes 1 and 2, (width/2)*(height/2) bytes each, is
 * set to 128.  Always returns 1.
 */
static int y8_yuv420p(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int chroma_bytes = (width / 2) * (height / 2);
    int plane;

    ac_memcpy(dest[0], src[0], width * height);
    for (plane = 1; plane <= 2; plane++) {
        memset(dest[plane], 128, chroma_bytes);
    }
    return 1;
}
285
/*
 * y8_yuv411p:  Convert Y8 to 4:1:1 planar.  Plane 0 is copied unchanged
 * and every byte of planes 1 and 2, (width/4)*height bytes each, is set
 * to 128.  Always returns 1.
 */
static int y8_yuv411p(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int chroma_bytes = (width / 4) * height;
    int plane;

    ac_memcpy(dest[0], src[0], width * height);
    for (plane = 1; plane <= 2; plane++) {
        memset(dest[plane], 128, chroma_bytes);
    }
    return 1;
}
293
/*
 * y8_yuv422p:  Convert Y8 to 4:2:2 planar.  Plane 0 is copied unchanged
 * and every byte of planes 1 and 2, (width/2)*height bytes each, is set
 * to 128.  Always returns 1.
 */
static int y8_yuv422p(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int chroma_bytes = (width / 2) * height;
    int plane;

    ac_memcpy(dest[0], src[0], width * height);
    for (plane = 1; plane <= 2; plane++) {
        memset(dest[plane], 128, chroma_bytes);
    }
    return 1;
}
301
/*
 * y8_yuv444p:  Convert Y8 to 4:4:4 planar.  Plane 0 is copied unchanged
 * and every byte of planes 1 and 2, width*height bytes each, is set to
 * 128.  Always returns 1.
 */
static int y8_yuv444p(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int plane_bytes = width * height;
    int plane;

    ac_memcpy(dest[0], src[0], plane_bytes);
    for (plane = 1; plane <= 2; plane++) {
        memset(dest[plane], 128, plane_bytes);
    }
    return 1;
}
309
310 /*************************************************************************/
311 /*************************************************************************/
312
313 #if defined(HAVE_ASM_SSE2)
314
315 /* SSE2 routines. See comments in img_x86_common.h for why we don't bother
316 * unrolling the loops. */
317
318 /* Common macros/data for x86 code */
319 #include "img_x86_common.h"
320
/*
 * AVG_2H(src, dest, count):  Average 2 bytes horizontally (e.g.
 * 422P->411P) (unit: 2 source bytes).
 *
 * `count' is bound to the count register and is the number of output
 * bytes; each output byte is derived from two consecutive source bytes.
 *   - small_loop (scalar): adds the two bytes at src[2*n-2] and src[2*n-1]
 *     and shifts right by one, storing the result at dest[n-1].
 *   - main_loop (blocksize 8): loads 16 source bytes, separates the even
 *     and odd bytes using the 0x00FF word mask prepared in XMM7, averages
 *     them with pavgw, and stores the 8 packed results.
 * SIMD_LOOP_WRAPPER and the ESI/EDI/ECX register-name strings come from
 * img_x86_common.h, which is not visible here; presumably the wrapper
 * runs small_loop for the count % blocksize leftover elements and
 * main_loop for the rest -- confirm against that header.
 *
 * NOTE(review): small_loop truncates ((a+b)>>1) while pavgw rounds up
 * ((a+b+1)>>1), so leftover elements can differ by one from vectorized
 * ones and from the C implementation.
 * NOTE(review): XMM0, XMM1 and XMM7 are modified, and memory is written
 * through `dest', but neither appears in the clobber list.
 */
#define AVG_2H(src,dest,count) do { \
    int dummy; \
    asm volatile( \
        "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" /* XMM7: 0x00FF*8 */ \
        SIMD_LOOP_WRAPPER( \
        /* blocksize */ 8, \
        /* push_regs */ "", \
        /* pop_regs */ "", \
        /* small_loop */ \
        "movzbl -2("ESI","ECX",2), %%eax \n\
movzbl -1("ESI","ECX",2), %%edx \n\
addl %%edx, %%eax \n\
shrl $1, %%eax \n\
movb %%al, -1("EDI","ECX")", \
        /* main_loop */ \
        "movdqu -16("ESI","ECX",2),%%xmm0 #XMM0:FEDCBA9876543210 \n\
movdqa %%xmm0, %%xmm1 # XMM1: FEDCBA9876543210 \n\
pand %%xmm7, %%xmm0 # XMM0: E C A 8 6 4 2 0 \n\
psrlw $8, %%xmm1 # XMM1: F D B 9 7 5 3 1 \n\
pavgw %%xmm1, %%xmm0 # XMM0: w v u t s r q p (avgs) \n\
packuswb %%xmm0, %%xmm0 # XMM0: wvutsrqpwvutsrqp \n\
movq %%xmm0, -8("EDI","ECX")", \
        /* emms */ "emms") \
        : "=c" (dummy) \
        : "S" (src), "D" (dest), "0" (count) \
        : "eax", "edx"); \
} while (0)
349
/*
 * AVG_4H(src, dest, count):  Average 4 bytes horizontally (e.g.
 * 444P->411P) (unit: 4 source bytes).
 *
 * `count' is bound to the count register and is the number of output
 * bytes; each output byte is derived from four consecutive source bytes.
 *   - small_loop (scalar): sums the four bytes at src[4*n-4..4*n-1],
 *     shifts right by two and stores the result at dest[n-1].
 *   - main_loop (blocksize 4): loads 16 source bytes, isolates each of
 *     the four byte lanes of every dword using the 0x000000FF mask in
 *     XMM7, averages them pairwise with pavgw and then averages the two
 *     intermediate results, and stores the 4 packed results.
 * SIMD_LOOP_WRAPPER and the ESI/EDI/ECX register-name strings come from
 * img_x86_common.h, which is not visible here; presumably the wrapper
 * runs small_loop for the count % blocksize leftover elements and
 * main_loop for the rest -- confirm against that header.
 *
 * NOTE(review): small_loop truncates (sum>>2) while main_loop rounds up
 * at each of its two pavgw stages, so neither path is guaranteed to match
 * the C implementation's (sum+2)/4 exactly.
 * NOTE(review): XMM0-XMM3 and XMM7 are modified, and memory is written
 * through `dest', but neither appears in the clobber list.
 */
#define AVG_4H(src,dest,count) do { \
    int dummy; \
    asm volatile( \
        "pcmpeqd %%xmm7, %%xmm7; psrld $24, %%xmm7;" /* XMM7: 0x000000FF*4 */ \
        SIMD_LOOP_WRAPPER( \
        /* blocksize */ 4, \
        /* push_regs */ "", \
        /* pop_regs */ "", \
        /* small_loop */ \
        "movzbl -4("ESI","ECX",4), %%eax \n\
movzbl -3("ESI","ECX",4), %%edx \n\
addl %%edx, %%eax \n\
movzbl -2("ESI","ECX",4), %%edx \n\
addl %%edx, %%eax \n\
movzbl -1("ESI","ECX",4), %%edx \n\
addl %%edx, %%eax \n\
shrl $2, %%eax \n\
movb %%al, -1("EDI","ECX")", \
        /* main_loop */ \
        "movdqu -16("ESI","ECX",4),%%xmm0 #XMM0:FEDCBA9876543210 \n\
movdqa %%xmm0, %%xmm1 # XMM1: FEDCBA9876543210 \n\
movdqa %%xmm0, %%xmm2 # XMM2: FEDCBA9876543210 \n\
movdqa %%xmm0, %%xmm3 # XMM3: FEDCBA9876543210 \n\
pand %%xmm7, %%xmm0 # XMM0: C 8 4 0 \n\
psrld $8, %%xmm1 # XMM1: FED BA9 765 321 \n\
pand %%xmm7, %%xmm1 # XMM1: D 9 5 1 \n\
psrld $16, %%xmm2 # XMM2: FE BA 76 32 \n\
pand %%xmm7, %%xmm2 # XMM2: E A 6 2 \n\
psrld $24, %%xmm3 # XMM3: F B 7 3 \n\
pavgw %%xmm1, %%xmm0 # XMM0: C+D 8+9 4+5 0+1 (avgs) \n\
pavgw %%xmm3, %%xmm2 # XMM2: E+F A+B 6+7 2+3 (avgs) \n\
pavgw %%xmm2, %%xmm0 # XMM0: s r q p (avgs) \n\
packuswb %%xmm0, %%xmm0 # XMM0: s r q p s r q p \n\
packuswb %%xmm0, %%xmm0 # XMM0: srqpsrqpsrqpsrqp \n\
movd %%xmm0, -4("EDI","ECX")", \
        /* emms */ "emms") \
        : "=c" (dummy) \
        : "S" (src), "D" (dest), "0" (count) \
        : "eax", "edx"); \
} while (0)
391
/*
 * REP_2H(src, dest, count):  Repeat 2 bytes horizontally (e.g.
 * 422P->444P) (unit: 1 source byte).
 *
 * `count' is bound to the count register and is the number of source
 * bytes; each source byte produces two identical output bytes.
 *   - small_loop (scalar): loads src[n-1] into AL, copies it to AH and
 *     stores the 16-bit pair at dest[2*n-2].
 *   - main_loop (blocksize 8): loads 8 source bytes, interleaves the
 *     register with itself (punpcklbw) to double every byte, and stores
 *     the resulting 16 bytes.
 * SIMD_LOOP_WRAPPER and the ESI/EDI/ECX register-name strings come from
 * img_x86_common.h, which is not visible here; presumably the wrapper
 * runs small_loop for the count % blocksize leftover elements and
 * main_loop for the rest -- confirm against that header.
 *
 * NOTE(review): XMM0 is modified, and memory is written through `dest',
 * but neither appears in the clobber list.
 */
#define REP_2H(src,dest,count) do { \
    int dummy; \
    asm volatile(SIMD_LOOP_WRAPPER( \
        /* blocksize */ 8, \
        /* push_regs */ "", \
        /* pop_regs */ "", \
        /* small_loop */ \
        "movb -1("ESI","ECX"), %%al \n\
movb %%al, %%ah \n\
movw %%ax, -2("EDI","ECX",2)", \
        /* main_loop */ \
        "movq -8("ESI","ECX"), %%xmm0 # XMM0: 76543210 \n\
punpcklbw %%xmm0, %%xmm0 # XMM0: 7766554433221100 \n\
movdqu %%xmm0, -16("EDI","ECX",2)", \
        /* emms */ "emms") \
        : "=c" (dummy) \
        : "S" (src), "D" (dest), "0" (count) \
        : "eax"); \
} while (0)
412
/*
 * REP_4H(src, dest, count):  Repeat 4 bytes horizontally (e.g.
 * 411P->444P) (unit: 1 source byte).
 *
 * `count' is bound to the count register and is the number of source
 * bytes; each source byte produces four identical output bytes.
 *   - small_loop (scalar): zero-extends src[n-1] into EAX, replicates it
 *     into all four bytes of EAX (AL->AH, then low word->high word) and
 *     stores the 32-bit value at dest[4*n-4].
 *   - main_loop (blocksize 4): loads 4 source bytes, doubles them with
 *     punpcklbw and doubles again with punpcklwd, and stores the
 *     resulting 16 bytes.
 * SIMD_LOOP_WRAPPER and the ESI/EDI/ECX register-name strings come from
 * img_x86_common.h, which is not visible here; presumably the wrapper
 * runs small_loop for the count % blocksize leftover elements and
 * main_loop for the rest -- confirm against that header.
 *
 * NOTE(review): XMM0 is modified, and memory is written through `dest',
 * but neither appears in the clobber list.
 */
#define REP_4H(src,dest,count) do { \
    int dummy; \
    asm volatile(SIMD_LOOP_WRAPPER( \
        /* blocksize */ 4, \
        /* push_regs */ "", \
        /* pop_regs */ "", \
        /* small_loop */ \
        "movzbl -1("ESI","ECX"), %%eax \n\
movb %%al, %%ah \n\
movl %%eax, %%edx \n\
shll $16, %%eax \n\
orl %%edx, %%eax \n\
movl %%eax, -4("EDI","ECX",4)", \
        /* main_loop */ \
        "movd -4("ESI","ECX"), %%xmm0 # XMM0: 3210 \n\
punpcklbw %%xmm0, %%xmm0 # XMM0: 33221100 \n\
punpcklwd %%xmm0, %%xmm0 # XMM0: 3333222211110000 \n\
movdqu %%xmm0, -16("EDI","ECX",4)", \
        /* emms */ "emms") \
        : "=c" (dummy) \
        : "S" (src), "D" (dest), "0" (count) \
        : "eax", "edx"); \
} while (0)
437
/*
 * AVG_411_420(src1, src2, dest, count):  Average 2 bytes vertically and
 * double horizontally (411P->420P) (unit: 1 source byte).
 *
 * src1 and src2 point to two source rows; `count' is bound to the count
 * register and is the number of bytes taken from each row.  Each pair of
 * vertically adjacent bytes yields two identical output bytes.
 *   - small_loop (scalar): adds src1[n-1] and src2[n-1], shifts right by
 *     one, duplicates the byte (AL->AH) and stores the 16-bit pair at
 *     dest[2*n-2].  EBX is used as scratch and is saved and restored by
 *     the push_regs/pop_regs strings rather than listed as a clobber.
 *   - main_loop (blocksize 8): loads 8 bytes from each row, averages them
 *     with pavgb, doubles every byte with punpcklbw and stores 16 bytes.
 * SIMD_LOOP_WRAPPER and the ESI/EDI/ECX/EDX/EBX register-name strings
 * come from img_x86_common.h, which is not visible here; presumably the
 * wrapper runs small_loop for the count % blocksize leftover elements and
 * main_loop for the rest -- confirm against that header.
 *
 * NOTE(review): small_loop truncates ((a+b)>>1) while pavgb rounds up
 * ((a+b+1)>>1), so leftover elements can differ by one from vectorized
 * ones and from the C implementation.
 * NOTE(review): XMM0 and XMM1 are modified, and memory is written through
 * `dest', but neither appears in the clobber list.
 */
#define AVG_411_420(src1,src2,dest,count) do { \
    int dummy; \
    asm volatile(SIMD_LOOP_WRAPPER( \
        /* blocksize */ 8, \
        /* push_regs */ "push "EBX, \
        /* pop_regs */ "pop "EBX, \
        /* small_loop */ \
        "movzbl -1("ESI","ECX"), %%eax \n\
movzbl -1("EDX","ECX"), %%ebx \n\
addl %%ebx, %%eax \n\
shrl $1, %%eax \n\
movb %%al, %%ah \n\
movw %%ax, -2("EDI","ECX",2)", \
        /* main_loop */ \
        "movq -8("ESI","ECX"), %%xmm0 \n\
movq -8("EDX","ECX"), %%xmm1 \n\
pavgb %%xmm1, %%xmm0 \n\
punpcklbw %%xmm0, %%xmm0 \n\
movdqu %%xmm0, -16("EDI","ECX",2)", \
        /* emms */ "emms") \
        : "=c" (dummy) \
        : "S" (src1), "d" (src2), "D" (dest), "0" (count) \
        : "eax"); \
} while (0)
464
/*
 * AVG_422_420(src1, src2, dest, count):  Average 2 bytes vertically
 * (422P->420P) (unit: 1 source byte).
 *
 * src1 and src2 point to two source rows; `count' is bound to the count
 * register and is the number of bytes taken from each row, which is also
 * the number of output bytes.
 *   - small_loop (scalar): adds src1[n-1] and src2[n-1], shifts right by
 *     one and stores the byte at dest[n-1].  EBX is used as scratch and
 *     is saved and restored by the push_regs/pop_regs strings rather than
 *     listed as a clobber.
 *   - main_loop (blocksize 16): loads 16 bytes from each row, averages
 *     them with pavgb and stores the 16 results.
 * SIMD_LOOP_WRAPPER and the ESI/EDI/ECX/EDX/EBX register-name strings
 * come from img_x86_common.h, which is not visible here; presumably the
 * wrapper runs small_loop for the count % blocksize leftover elements and
 * main_loop for the rest -- confirm against that header.
 *
 * NOTE(review): small_loop truncates ((a+b)>>1) while pavgb rounds up
 * ((a+b+1)>>1), so leftover elements can differ by one from vectorized
 * ones and from the C implementation.
 * NOTE(review): XMM0 and XMM1 are modified, and memory is written through
 * `dest', but neither appears in the clobber list.
 */
#define AVG_422_420(src1,src2,dest,count) do { \
    int dummy; \
    asm volatile(SIMD_LOOP_WRAPPER( \
        /* blocksize */ 16, \
        /* push_regs */ "push "EBX, \
        /* pop_regs */ "pop "EBX, \
        /* small_loop */ \
        "movzbl -1("ESI","ECX"), %%eax \n\
movzbl -1("EDX","ECX"), %%ebx \n\
addl %%ebx, %%eax \n\
shrl $1, %%eax \n\
movb %%al, -1("EDI","ECX")", \
        /* main_loop */ \
        "movdqu -16("ESI","ECX"), %%xmm0 \n\
movdqu -16("EDX","ECX"), %%xmm1 \n\
pavgb %%xmm1, %%xmm0 \n\
movdqu %%xmm0, -16("EDI","ECX")", \
        /* emms */ "emms") \
        : "=c" (dummy) \
        : "S" (src1), "d" (src2), "D" (dest), "0" (count) \
        : "eax"); \
} while (0)
488
/*
 * AVG_444_420(src1, src2, dest, count):  Average 4 bytes, 2 horizontally
 * and 2 vertically (444P->420P) (unit: 2 source bytes).
 *
 * src1 and src2 point to two source rows; `count' is bound to the count
 * register and is the number of output bytes.  Each output byte is
 * derived from a 2x2 block: two consecutive bytes from each row.
 *   - small_loop (scalar): sums src1[2*n-2], src1[2*n-1], src2[2*n-2] and
 *     src2[2*n-1], shifts right by two and stores the byte at dest[n-1].
 *     EBX is used as scratch and is saved and restored by the
 *     push_regs/pop_regs strings; EAX is declared as a clobber.
 *   - main_loop (blocksize 8): for each row, loads 16 bytes and averages
 *     the even and odd bytes with pavgw (using the 0x00FF word mask in
 *     XMM7); the two row results are then averaged with pavgw and the 8
 *     packed results are stored.
 * SIMD_LOOP_WRAPPER and the ESI/EDI/ECX/EDX/EBX register-name strings
 * come from img_x86_common.h, which is not visible here; presumably the
 * wrapper runs small_loop for the count % blocksize leftover elements and
 * main_loop for the rest -- confirm against that header.
 *
 * The operand lists follow the sibling macros: `count' is tied to the
 * output operand with the matching constraint "0" (the counter register
 * is modified by the asm), and "eax" is listed as clobbered because
 * small_loop overwrites it.
 *
 * NOTE(review): small_loop truncates (sum>>2) while main_loop rounds up
 * at each of its pavgw stages, so neither path is guaranteed to match the
 * C implementation's (sum+2)/4 exactly.
 * NOTE(review): XMM0-XMM3 and XMM7 are modified, and memory is written
 * through `dest', but neither appears in the clobber list.
 */
#define AVG_444_420(src1,src2,dest,count) do { \
    int dummy; \
    asm volatile( \
        "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" /* XMM7: 0x00FF*8 */ \
        SIMD_LOOP_WRAPPER( \
        /* blocksize */ 8, \
        /* push_regs */ "push "EBX, \
        /* pop_regs */ "pop "EBX, \
        /* small_loop */ \
        "movzbl -2("ESI","ECX",2), %%eax \n\
movzbl -1("ESI","ECX",2), %%ebx \n\
addl %%ebx, %%eax \n\
movzbl -2("EDX","ECX",2), %%ebx \n\
addl %%ebx, %%eax \n\
movzbl -1("EDX","ECX",2), %%ebx \n\
addl %%ebx, %%eax \n\
shrl $2, %%eax \n\
movb %%al, -1("EDI","ECX")", \
        /* main_loop */ \
        "movdqu -16("ESI","ECX",2), %%xmm0 \n\
movdqu -16("EDX","ECX",2), %%xmm2 \n\
movdqa %%xmm0, %%xmm1 \n\
pand %%xmm7, %%xmm0 \n\
psrlw $8, %%xmm1 \n\
pavgw %%xmm1, %%xmm0 \n\
movdqa %%xmm2, %%xmm3 \n\
pand %%xmm7, %%xmm2 \n\
psrlw $8, %%xmm3 \n\
pavgw %%xmm3, %%xmm2 \n\
pavgw %%xmm2, %%xmm0 \n\
packuswb %%xmm0, %%xmm0 \n\
movq %%xmm0, -8("EDI","ECX")", \
        /* emms */ "emms") \
        : "=c" (dummy) \
        : "S" (src1), "d" (src2), "D" (dest), "0" (count) \
        : "eax"); \
} while (0)
527
528 /*************************************************************************/
529
/*
 * yuv420p_yuv411p_sse2:  4:2:0 planar to 4:1:1 planar using AVG_2H.
 * Plane 0 is copied unchanged.  For each pair of output rows, one source
 * chroma row is reduced horizontally by AVG_2H into the first output row,
 * which is then duplicated into the second.  A trailing odd row is not
 * processed.  Always returns 1.
 */
static int yuv420p_yuv411p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int in_stride = width / 2;    /* bytes per source chroma row */
    const int out_stride = width / 4;   /* bytes per output chroma row */
    int row, plane;

    ac_memcpy(dest[0], src[0], width * height);
    for (row = 0; row < (height & ~1); row += 2) {
        for (plane = 1; plane <= 2; plane++) {
            uint8_t *out = dest[plane] + row * out_stride;

            AVG_2H(src[plane] + (row / 2) * in_stride, out, out_stride);
            ac_memcpy(out + out_stride, out, out_stride);
        }
    }
    return 1;
}
542
/*
 * yuv420p_yuv444p_sse2:  4:2:0 planar to 4:4:4 planar using REP_2H.
 * Plane 0 is copied unchanged.  For each pair of output rows, one source
 * chroma row is doubled horizontally by REP_2H into the first output row,
 * which is then duplicated into the second.  Always returns 1.
 *
 * Only complete row pairs are processed: the loop stops at (height & ~1),
 * as in the other converters in this file.  Without that bound, an odd
 * height would read a source chroma row that does not exist and copy a
 * row past the end of the output plane.  A trailing odd row of the
 * output chroma planes is therefore left unwritten.
 */
static int yuv420p_yuv444p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
    int y;

    ac_memcpy(dest[0], src[0], width*height);
    for (y = 0; y < (height & ~1); y += 2) {
        REP_2H(src[1]+(y/2)*(width/2), dest[1]+y*width, width/2);
        ac_memcpy(dest[1]+(y+1)*width, dest[1]+y*width, width);
        REP_2H(src[2]+(y/2)*(width/2), dest[2]+y*width, width/2);
        ac_memcpy(dest[2]+(y+1)*width, dest[2]+y*width, width);
    }
    return 1;
}
555
556 /*************************************************************************/
557
/*
 * yuv411p_yuv420p_sse2:  4:1:1 planar to 4:2:0 planar using AVG_411_420.
 * Plane 0 is copied unchanged.  Each pair of source chroma rows is passed
 * to AVG_411_420, which produces one output chroma row.  A trailing odd
 * row is not processed.  Always returns 1.
 */
static int yuv411p_yuv420p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int in_stride = width / 4;    /* bytes per source chroma row */
    const int out_stride = width / 2;   /* bytes per output chroma row */
    int row, plane;

    ac_memcpy(dest[0], src[0], width * height);
    for (row = 0; row < (height & ~1); row += 2) {
        for (plane = 1; plane <= 2; plane++) {
            const uint8_t *upper = src[plane] + row * in_stride;

            AVG_411_420(upper, upper + in_stride,
                        dest[plane] + (row / 2) * out_stride, in_stride);
        }
    }
    return 1;
}
570
/*
 * yuv411p_yuv422p_sse2:  4:1:1 planar to 4:2:2 planar using REP_2H.
 * Plane 0 is copied unchanged.  When width is a multiple of 4 each chroma
 * plane is converted with a single REP_2H call over the whole plane;
 * otherwise REP_2H is applied row by row.  Always returns 1.
 */
static int yuv411p_yuv422p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int in_stride = width / 4;    /* bytes per source chroma row */
    const int out_stride = width / 2;   /* bytes per output chroma row */
    int row, plane;

    ac_memcpy(dest[0], src[0], width * height);

    if ((width & 3) == 0) {
        /* Rows are contiguous with nothing to skip: one pass per plane. */
        for (plane = 1; plane <= 2; plane++) {
            REP_2H(src[plane], dest[plane], in_stride * height);
        }
        return 1;
    }

    /* Otherwise convert one row of each plane at a time. */
    for (row = 0; row < height; row++) {
        for (plane = 1; plane <= 2; plane++) {
            REP_2H(src[plane] + row * in_stride,
                   dest[plane] + row * out_stride, in_stride);
        }
    }
    return 1;
}
588
/*
 * yuv411p_yuv444p_sse2:  4:1:1 planar to 4:4:4 planar using REP_4H.
 * Plane 0 is copied unchanged.  When width is a multiple of 4 each chroma
 * plane is converted with a single REP_4H call over the whole plane;
 * otherwise REP_4H is applied row by row.  Always returns 1.
 */
static int yuv411p_yuv444p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int in_stride = width / 4;    /* bytes per source chroma row */
    int row, plane;

    ac_memcpy(dest[0], src[0], width * height);

    if ((width & 3) == 0) {
        /* Rows are contiguous with nothing to skip: one pass per plane. */
        for (plane = 1; plane <= 2; plane++) {
            REP_4H(src[plane], dest[plane], in_stride * height);
        }
        return 1;
    }

    /* Otherwise convert one row of each plane at a time. */
    for (row = 0; row < height; row++) {
        for (plane = 1; plane <= 2; plane++) {
            REP_4H(src[plane] + row * in_stride,
                   dest[plane] + row * width, in_stride);
        }
    }
    return 1;
}
606
607 /*************************************************************************/
608
/*
 * yuv422p_yuv420p_sse2:  4:2:2 planar to 4:2:0 planar using AVG_422_420.
 * Plane 0 is copied unchanged.  Each pair of source chroma rows is passed
 * to AVG_422_420, which produces one output chroma row.  A trailing odd
 * row is not processed.  Always returns 1.
 */
static int yuv422p_yuv420p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int stride = width / 2;   /* chroma row length, source and output */
    int row, plane;

    ac_memcpy(dest[0], src[0], width * height);
    for (row = 0; row < (height & ~1); row += 2) {
        for (plane = 1; plane <= 2; plane++) {
            const uint8_t *upper = src[plane] + row * stride;

            AVG_422_420(upper, upper + stride,
                        dest[plane] + (row / 2) * stride, stride);
        }
    }
    return 1;
}
621
/*
 * yuv422p_yuv411p_sse2:  4:2:2 planar to 4:1:1 planar using AVG_2H.
 * Plane 0 is copied unchanged.  When width is a multiple of 4 each chroma
 * plane is converted with a single AVG_2H call over the whole plane;
 * otherwise AVG_2H is applied row by row.  Always returns 1.
 */
static int yuv422p_yuv411p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int in_stride = width / 2;    /* bytes per source chroma row */
    const int out_stride = width / 4;   /* bytes per output chroma row */
    int row, plane;

    ac_memcpy(dest[0], src[0], width * height);

    if ((width & 3) == 0) {
        /* Rows are contiguous with nothing to skip: one pass per plane. */
        for (plane = 1; plane <= 2; plane++) {
            AVG_2H(src[plane], dest[plane], out_stride * height);
        }
        return 1;
    }

    /* Otherwise convert one row of each plane at a time. */
    for (row = 0; row < height; row++) {
        for (plane = 1; plane <= 2; plane++) {
            AVG_2H(src[plane] + row * in_stride,
                   dest[plane] + row * out_stride, out_stride);
        }
    }
    return 1;
}
639
/*
 * yuv422p_yuv444p_sse2:  4:2:2 planar to 4:4:4 planar using REP_2H.
 * Plane 0 is copied unchanged.  When width is even each chroma plane is
 * converted with a single REP_2H call over the whole plane; otherwise
 * REP_2H is applied row by row.  Always returns 1.
 */
static int yuv422p_yuv444p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int in_stride = width / 2;    /* bytes per source chroma row */
    int row, plane;

    ac_memcpy(dest[0], src[0], width * height);

    if ((width & 1) == 0) {
        /* Rows are contiguous with nothing to skip: one pass per plane. */
        for (plane = 1; plane <= 2; plane++) {
            REP_2H(src[plane], dest[plane], in_stride * height);
        }
        return 1;
    }

    /* Otherwise convert one row of each plane at a time. */
    for (row = 0; row < height; row++) {
        for (plane = 1; plane <= 2; plane++) {
            REP_2H(src[plane] + row * in_stride,
                   dest[plane] + row * width, in_stride);
        }
    }
    return 1;
}
657
658 /*************************************************************************/
659
/*
 * yuv444p_yuv420p_sse2:  4:4:4 planar to 4:2:0 planar using AVG_444_420.
 * Plane 0 is copied unchanged.  Each pair of source chroma rows is passed
 * to AVG_444_420, which produces one output chroma row of width/2 bytes.
 * A trailing odd row is not processed.  Always returns 1.
 */
static int yuv444p_yuv420p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int out_stride = width / 2;   /* bytes per output chroma row */
    int row, plane;

    ac_memcpy(dest[0], src[0], width * height);
    for (row = 0; row < (height & ~1); row += 2) {
        for (plane = 1; plane <= 2; plane++) {
            const uint8_t *upper = src[plane] + row * width;

            AVG_444_420(upper, upper + width,
                        dest[plane] + (row / 2) * out_stride, out_stride);
        }
    }
    return 1;
}
672
/*
 * yuv444p_yuv411p_sse2:  4:4:4 planar to 4:1:1 planar using AVG_4H.
 * Plane 0 is copied unchanged.  When width is a multiple of 4 each chroma
 * plane is converted with a single AVG_4H call over the whole plane;
 * otherwise AVG_4H is applied row by row.  Always returns 1.
 */
static int yuv444p_yuv411p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int out_stride = width / 4;   /* bytes per output chroma row */
    int row, plane;

    ac_memcpy(dest[0], src[0], width * height);

    if ((width & 3) == 0) {
        /* Rows are contiguous with nothing to skip: one pass per plane. */
        for (plane = 1; plane <= 2; plane++) {
            AVG_4H(src[plane], dest[plane], out_stride * height);
        }
        return 1;
    }

    /* Otherwise convert one row of each plane at a time. */
    for (row = 0; row < height; row++) {
        for (plane = 1; plane <= 2; plane++) {
            AVG_4H(src[plane] + row * width,
                   dest[plane] + row * out_stride, out_stride);
        }
    }
    return 1;
}
690
/*
 * yuv444p_yuv422p_sse2:  4:4:4 planar to 4:2:2 planar using AVG_2H.
 * Plane 0 is copied unchanged.  When width is even each chroma plane is
 * converted with a single AVG_2H call over the whole plane; otherwise
 * AVG_2H is applied row by row.  Always returns 1.
 */
static int yuv444p_yuv422p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int out_stride = width / 2;   /* bytes per output chroma row */
    int row, plane;

    ac_memcpy(dest[0], src[0], width * height);

    if ((width & 1) == 0) {
        /* Rows are contiguous with nothing to skip: one pass per plane. */
        for (plane = 1; plane <= 2; plane++) {
            AVG_2H(src[plane], dest[plane], out_stride * height);
        }
        return 1;
    }

    /* Otherwise convert one row of each plane at a time. */
    for (row = 0; row < height; row++) {
        for (plane = 1; plane <= 2; plane++) {
            AVG_2H(src[plane] + row * width,
                   dest[plane] + row * out_stride, out_stride);
        }
    }
    return 1;
}
708
709 /*************************************************************************/
710
711 #endif /* HAVE_ASM_SSE2 */
712
713 /*************************************************************************/
714 /*************************************************************************/
715
716 /* Initialization */
717
ac_imgconvert_init_yuv_planar(int accel)718 int ac_imgconvert_init_yuv_planar(int accel)
719 {
720 if (!register_conversion(IMG_YUV420P, IMG_YUV420P, yuv420p_copy)
721 || !register_conversion(IMG_YUV420P, IMG_YUV411P, yuv420p_yuv411p)
722 || !register_conversion(IMG_YUV420P, IMG_YUV422P, yuv420p_yuv422p)
723 || !register_conversion(IMG_YUV420P, IMG_YUV444P, yuv420p_yuv444p)
724 || !register_conversion(IMG_YUV420P, IMG_Y8, yuvp_y8)
725
726 || !register_conversion(IMG_YUV411P, IMG_YUV420P, yuv411p_yuv420p)
727 || !register_conversion(IMG_YUV411P, IMG_YUV411P, yuv411p_copy)
728 || !register_conversion(IMG_YUV411P, IMG_YUV422P, yuv411p_yuv422p)
729 || !register_conversion(IMG_YUV411P, IMG_YUV444P, yuv411p_yuv444p)
730 || !register_conversion(IMG_YUV411P, IMG_Y8, yuvp_y8)
731
732 || !register_conversion(IMG_YUV422P, IMG_YUV420P, yuv422p_yuv420p)
733 || !register_conversion(IMG_YUV422P, IMG_YUV411P, yuv422p_yuv411p)
734 || !register_conversion(IMG_YUV422P, IMG_YUV422P, yuv422p_copy)
735 || !register_conversion(IMG_YUV422P, IMG_YUV444P, yuv422p_yuv444p)
736 || !register_conversion(IMG_YUV422P, IMG_Y8, yuvp_y8)
737
738 || !register_conversion(IMG_YUV444P, IMG_YUV420P, yuv444p_yuv420p)
739 || !register_conversion(IMG_YUV444P, IMG_YUV411P, yuv444p_yuv411p)
740 || !register_conversion(IMG_YUV444P, IMG_YUV422P, yuv444p_yuv422p)
741 || !register_conversion(IMG_YUV444P, IMG_YUV444P, yuv444p_copy)
742 || !register_conversion(IMG_YUV444P, IMG_Y8, yuvp_y8)
743
744 || !register_conversion(IMG_Y8, IMG_YUV420P, y8_yuv420p)
745 || !register_conversion(IMG_Y8, IMG_YUV411P, y8_yuv411p)
746 || !register_conversion(IMG_Y8, IMG_YUV422P, y8_yuv422p)
747 || !register_conversion(IMG_Y8, IMG_YUV444P, y8_yuv444p)
748 || !register_conversion(IMG_Y8, IMG_Y8, y8_copy)
749 ) {
750 return 0;
751 }
752
753 #if defined(HAVE_ASM_SSE2)
754 if (accel & AC_SSE2) {
755 if (!register_conversion(IMG_YUV420P, IMG_YUV411P, yuv420p_yuv411p_sse2)
756 || !register_conversion(IMG_YUV420P, IMG_YUV444P, yuv420p_yuv444p_sse2)
757
758 || !register_conversion(IMG_YUV411P, IMG_YUV420P, yuv411p_yuv420p_sse2)
759 || !register_conversion(IMG_YUV411P, IMG_YUV422P, yuv411p_yuv422p_sse2)
760 || !register_conversion(IMG_YUV411P, IMG_YUV444P, yuv411p_yuv444p_sse2)
761
762 || !register_conversion(IMG_YUV422P, IMG_YUV420P, yuv422p_yuv420p_sse2)
763 || !register_conversion(IMG_YUV422P, IMG_YUV411P, yuv422p_yuv411p_sse2)
764 || !register_conversion(IMG_YUV422P, IMG_YUV444P, yuv422p_yuv444p_sse2)
765
766 || !register_conversion(IMG_YUV444P, IMG_YUV420P, yuv444p_yuv420p_sse2)
767 || !register_conversion(IMG_YUV444P, IMG_YUV411P, yuv444p_yuv411p_sse2)
768 || !register_conversion(IMG_YUV444P, IMG_YUV422P, yuv444p_yuv422p_sse2)
769 ) {
770 return 0;
771 }
772 }
773 #endif /* ARCH_X86 || ARCH_X86_64 */
774
775 return 1;
776 }
777
778 /*************************************************************************/
779
780 /*
781 * Local variables:
782 * c-file-style: "stroustrup"
783 * c-file-offsets: ((case-label . *) (statement-case-intro . *))
784 * indent-tabs-mode: nil
785 * End:
786 *
787 * vim: expandtab shiftwidth=4:
788 */
789