1 /*
2  * img_yuv_planar.c - YUV planar image format conversion routines
3  * Written by Andrew Church <achurch@achurch.org>
4  *
5  * This file is part of transcode, a video stream processing tool.
6  * transcode is free software, distributable under the terms of the GNU
7  * General Public License (version 2 or later).  See the file COPYING
8  * for details.
9  */
10 
11 #include "ac.h"
12 #include "imgconvert.h"
13 #include "img_internal.h"
14 
15 #include <string.h>
16 
17 /*************************************************************************/
18 /*************************************************************************/
19 
20 /* Standard C implementations */
21 
22 /*************************************************************************/
23 
24 /* Identity transformations */
25 
/* Identity transformation for YUV420P: copies the width*height Y plane and
 * the two (width/2)*(height/2) chroma planes from src[] to dest[].
 * Always returns 1. */
static int yuv420p_copy(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int chroma_bytes = (width/2) * (height/2);
    int plane;

    ac_memcpy(dest[0], src[0], width*height);
    for (plane = 1; plane <= 2; plane++) {
        ac_memcpy(dest[plane], src[plane], chroma_bytes);
    }
    return 1;
}
33 
/* Identity transformation for YUV411P: copies the width*height Y plane and
 * the two (width/4)*height chroma planes from src[] to dest[].
 * Always returns 1. */
static int yuv411p_copy(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int luma_bytes = width * height;
    const int chroma_bytes = (width/4) * height;

    ac_memcpy(dest[0], src[0], luma_bytes);
    ac_memcpy(dest[1], src[1], chroma_bytes);
    ac_memcpy(dest[2], src[2], chroma_bytes);
    return 1;
}
41 
/* Identity transformation for YUV422P: copies the width*height Y plane and
 * the two (width/2)*height chroma planes from src[] to dest[].
 * Always returns 1. */
static int yuv422p_copy(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int chroma_bytes = (width/2) * height;
    int plane;

    ac_memcpy(dest[0], src[0], width*height);
    for (plane = 1; plane <= 2; plane++) {
        ac_memcpy(dest[plane], src[plane], chroma_bytes);
    }
    return 1;
}
49 
/* Identity transformation for YUV444P: all three planes are width*height
 * bytes and are copied unchanged from src[] to dest[].  Always returns 1. */
static int yuv444p_copy(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int plane_bytes = width * height;
    int plane;

    for (plane = 0; plane < 3; plane++) {
        ac_memcpy(dest[plane], src[plane], plane_bytes);
    }
    return 1;
}
57 
/* Identity transformation for Y8: only plane 0 (width*height bytes) is
 * copied from src[] to dest[].  Always returns 1. */
static int y8_copy(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int luma_bytes = width * height;

    ac_memcpy(dest[0], src[0], luma_bytes);
    return 1;
}
63 
64 /*************************************************************************/
65 
/* Convert YUV420P to YUV411P.  The Y plane is copied as is.  For every pair
 * of output rows, each output chroma byte is the rounded average of two
 * horizontally adjacent source chroma bytes, and the resulting row is then
 * duplicated into the following output row.  Only (height & ~1) rows are
 * processed.  Always returns 1. */
static int yuv420p_yuv411p(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int in_stride = width / 2;    /* bytes per 420P chroma row */
    const int out_stride = width / 4;   /* bytes per 411P chroma row */
    const int rows = height & ~1;
    int row, col;

    ac_memcpy(dest[0], src[0], width*height);
    for (row = 0; row < rows; row += 2) {
        const uint8_t *u_in = src[1] + (row/2) * in_stride;
        const uint8_t *v_in = src[2] + (row/2) * in_stride;
        uint8_t *u_out = dest[1] + row * out_stride;
        uint8_t *v_out = dest[2] + row * out_stride;

        for (col = 0; col < out_stride; col++) {
            u_out[col] = (u_in[2*col] + u_in[2*col+1] + 1) / 2;
            v_out[col] = (v_in[2*col] + v_in[2*col+1] + 1) / 2;
        }
        /* One 420P chroma row covers two picture rows: repeat it. */
        ac_memcpy(u_out + out_stride, u_out, out_stride);
        ac_memcpy(v_out + out_stride, v_out, out_stride);
    }
    return 1;
}
82 
/* Convert YUV420P to YUV422P.  The Y plane is copied as is; every source
 * chroma row (width/2 bytes) is written to two consecutive output rows.
 * Only (height & ~1) output rows are produced.  Always returns 1. */
static int yuv420p_yuv422p(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int stride = width / 2;   /* chroma row size, same in both formats */
    const int rows = height & ~1;
    int row, plane;

    ac_memcpy(dest[0], src[0], width*height);
    for (row = 0; row < rows; row += 2) {
        for (plane = 1; plane <= 2; plane++) {
            uint8_t *in = src[plane] + (row/2) * stride;
            uint8_t *out = dest[plane] + row * stride;

            ac_memcpy(out, in, stride);
            ac_memcpy(out + stride, in, stride);
        }
    }
    return 1;
}
95 
/* Convert YUV420P to YUV444P.  The Y plane is copied as is; every source
 * chroma byte is replicated into a 2x2 block of the output chroma planes.
 *
 * Only (height & ~1) rows and (width & ~1) columns of chroma are generated,
 * matching the other converters in this file.  With the previous unmasked
 * bounds, an odd height read a chroma row that the source does not have and
 * copied one row past the end of dest[1]/dest[2]; an odd width read one
 * sample past the end of the source chroma row.  For odd dimensions the
 * trailing chroma row/column of the output is left untouched.
 *
 * Always returns 1. */
static int yuv420p_yuv444p(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int rows = height & ~1;
    const int cols = width & ~1;
    int x, y;

    ac_memcpy(dest[0], src[0], width*height);
    for (y = 0; y < rows; y += 2) {
        for (x = 0; x < cols; x += 2) {
            dest[1][y*width+x  ] =
            dest[1][y*width+x+1] = src[1][(y/2)*(width/2)+(x/2)];
            dest[2][y*width+x  ] =
            dest[2][y*width+x+1] = src[2][(y/2)*(width/2)+(x/2)];
        }
        /* Row y+1 shares the chroma of row y; y+1 < height because y < rows. */
        ac_memcpy(dest[1]+(y+1)*width, dest[1]+y*width, width);
        ac_memcpy(dest[2]+(y+1)*width, dest[2]+y*width, width);
    }
    return 1;
}
112 
113 /*************************************************************************/
114 
/* Convert YUV411P to YUV420P.  The Y plane is copied as is.  Each pair of
 * source chroma rows is averaged vertically (with rounding) and every
 * averaged byte is written twice horizontally.  Only (height & ~1) source
 * rows are consumed.  Always returns 1. */
static int yuv411p_yuv420p(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int in_stride = width / 4;    /* bytes per 411P chroma row */
    const int out_stride = width / 2;   /* bytes per 420P chroma row */
    const int rows = height & ~1;
    int row, col;

    ac_memcpy(dest[0], src[0], width*height);
    for (row = 0; row < rows; row += 2) {
        const uint8_t *u_top = src[1] + row * in_stride;
        const uint8_t *u_bot = u_top + in_stride;
        const uint8_t *v_top = src[2] + row * in_stride;
        const uint8_t *v_bot = v_top + in_stride;
        uint8_t *u_out = dest[1] + (row/2) * out_stride;
        uint8_t *v_out = dest[2] + (row/2) * out_stride;

        for (col = 0; col < in_stride; col++) {
            u_out[2*col] = (u_top[col] + u_bot[col] + 1) / 2;
            v_out[2*col] = (v_top[col] + v_bot[col] + 1) / 2;
            u_out[2*col+1] = u_out[2*col];
            v_out[2*col+1] = v_out[2*col];
        }
    }
    return 1;
}
131 
/* Convert YUV411P to YUV422P.  The Y plane is copied as is; on every row
 * each source chroma byte is written twice horizontally.
 * Always returns 1. */
static int yuv411p_yuv422p(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int in_stride = width / 4;    /* bytes per 411P chroma row */
    const int out_stride = width / 2;   /* bytes per 422P chroma row */
    int row, col;

    ac_memcpy(dest[0], src[0], width*height);
    for (row = 0; row < height; row++) {
        const uint8_t *u_in = src[1] + row * in_stride;
        const uint8_t *v_in = src[2] + row * in_stride;
        uint8_t *u_out = dest[1] + row * out_stride;
        uint8_t *v_out = dest[2] + row * out_stride;

        for (col = 0; col < in_stride; col++) {
            u_out[2*col  ] = u_in[col];
            u_out[2*col+1] = u_in[col];
            v_out[2*col  ] = v_in[col];
            v_out[2*col+1] = v_in[col];
        }
    }
    return 1;
}
146 
/* Convert YUV411P to YUV444P.  The Y plane is copied as is; on every row
 * each source chroma byte is written four times horizontally.  Output
 * columns beyond (width & ~3) are not written.  Always returns 1. */
static int yuv411p_yuv444p(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int in_stride = width / 4;    /* bytes per 411P chroma row */
    int row, col, rep;

    ac_memcpy(dest[0], src[0], width*height);
    for (row = 0; row < height; row++) {
        const uint8_t *u_in = src[1] + row * in_stride;
        const uint8_t *v_in = src[2] + row * in_stride;
        uint8_t *u_out = dest[1] + row * width;
        uint8_t *v_out = dest[2] + row * width;

        for (col = 0; col < in_stride; col++) {
            for (rep = 0; rep < 4; rep++) {
                u_out[4*col + rep] = u_in[col];
            }
            for (rep = 0; rep < 4; rep++) {
                v_out[4*col + rep] = v_in[col];
            }
        }
    }
    return 1;
}
165 
166 /*************************************************************************/
167 
/* Convert YUV422P to YUV420P.  The Y plane is copied as is; each pair of
 * source chroma rows is averaged vertically (with rounding) into one output
 * chroma row.  Only (height & ~1) source rows are consumed.
 * Always returns 1. */
static int yuv422p_yuv420p(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int stride = width / 2;   /* chroma row size, same in both formats */
    const int rows = height & ~1;
    int row, col;

    ac_memcpy(dest[0], src[0], width*height);
    for (row = 0; row < rows; row += 2) {
        const uint8_t *u_top = src[1] + row * stride;
        const uint8_t *u_bot = u_top + stride;
        const uint8_t *v_top = src[2] + row * stride;
        const uint8_t *v_bot = v_top + stride;
        uint8_t *u_out = dest[1] + (row/2) * stride;
        uint8_t *v_out = dest[2] + (row/2) * stride;

        for (col = 0; col < stride; col++) {
            u_out[col] = (u_top[col] + u_bot[col] + 1) / 2;
            v_out[col] = (v_top[col] + v_bot[col] + 1) / 2;
        }
    }
    return 1;
}
182 
/* Convert YUV422P to YUV411P.  The Y plane is copied as is; on every row
 * each output chroma byte is the rounded average of two horizontally
 * adjacent source chroma bytes.  Always returns 1. */
static int yuv422p_yuv411p(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int in_stride = width / 2;    /* bytes per 422P chroma row */
    const int out_stride = width / 4;   /* bytes per 411P chroma row */
    int row, col;

    ac_memcpy(dest[0], src[0], width*height);
    for (row = 0; row < height; row++) {
        const uint8_t *u_in = src[1] + row * in_stride;
        const uint8_t *v_in = src[2] + row * in_stride;
        uint8_t *u_out = dest[1] + row * out_stride;
        uint8_t *v_out = dest[2] + row * out_stride;

        for (col = 0; col < out_stride; col++) {
            u_out[col] = (u_in[2*col] + u_in[2*col+1] + 1) / 2;
            v_out[col] = (v_in[2*col] + v_in[2*col+1] + 1) / 2;
        }
    }
    return 1;
}
197 
/* Convert YUV422P to YUV444P.  The Y plane is copied as is; on every row
 * each source chroma byte is written twice horizontally.  Output columns
 * beyond (width & ~1) are not written.  Always returns 1. */
static int yuv422p_yuv444p(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int in_stride = width / 2;    /* bytes per 422P chroma row */
    int row, col;

    ac_memcpy(dest[0], src[0], width*height);
    for (row = 0; row < height; row++) {
        const uint8_t *u_in = src[1] + row * in_stride;
        const uint8_t *v_in = src[2] + row * in_stride;
        uint8_t *u_out = dest[1] + row * width;
        uint8_t *v_out = dest[2] + row * width;

        for (col = 0; col < in_stride; col++) {
            u_out[2*col  ] = u_in[col];
            u_out[2*col+1] = u_in[col];
            v_out[2*col  ] = v_in[col];
            v_out[2*col+1] = v_in[col];
        }
    }
    return 1;
}
212 
213 /*************************************************************************/
214 
/* Convert YUV444P to YUV420P.  The Y plane is copied as is; each output
 * chroma byte is the rounded average of a 2x2 block of source chroma bytes.
 * Only (height & ~1) rows and (width & ~1) columns of the source chroma are
 * consumed.  Always returns 1. */
static int yuv444p_yuv420p(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int out_stride = width / 2;   /* bytes per 420P chroma row */
    const int rows = height & ~1;
    const int cols = width & ~1;
    int row, col;

    ac_memcpy(dest[0], src[0], width*height);
    for (row = 0; row < rows; row += 2) {
        const uint8_t *u_top = src[1] + row * width;
        const uint8_t *u_bot = u_top + width;
        const uint8_t *v_top = src[2] + row * width;
        const uint8_t *v_bot = v_top + width;
        uint8_t *u_out = dest[1] + (row/2) * out_stride;
        uint8_t *v_out = dest[2] + (row/2) * out_stride;

        for (col = 0; col < cols; col += 2) {
            u_out[col/2] = (u_top[col] + u_top[col+1]
                          + u_bot[col] + u_bot[col+1] + 2) / 4;
            v_out[col/2] = (v_top[col] + v_top[col+1]
                          + v_bot[col] + v_bot[col+1] + 2) / 4;
        }
    }
    return 1;
}
233 
/* Convert YUV444P to YUV411P.  The Y plane is copied as is; on every row
 * each output chroma byte is the rounded average of four horizontally
 * adjacent source chroma bytes.  Source columns beyond (width & ~3) are
 * ignored.  Always returns 1. */
static int yuv444p_yuv411p(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int out_stride = width / 4;   /* bytes per 411P chroma row */
    const int cols = width & ~3;
    int row, col;

    ac_memcpy(dest[0], src[0], width*height);
    for (row = 0; row < height; row++) {
        const uint8_t *u_in = src[1] + row * width;
        const uint8_t *v_in = src[2] + row * width;
        uint8_t *u_out = dest[1] + row * out_stride;
        uint8_t *v_out = dest[2] + row * out_stride;

        for (col = 0; col < cols; col += 4) {
            u_out[col/4] = (u_in[col] + u_in[col+1]
                          + u_in[col+2] + u_in[col+3] + 2) / 4;
            v_out[col/4] = (v_in[col] + v_in[col+1]
                          + v_in[col+2] + v_in[col+3] + 2) / 4;
        }
    }
    return 1;
}
252 
/* Convert YUV444P to YUV422P.  The Y plane is copied as is; on every row
 * each output chroma byte is the rounded average of two horizontally
 * adjacent source chroma bytes.  Source columns beyond (width & ~1) are
 * ignored.  Always returns 1. */
static int yuv444p_yuv422p(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int out_stride = width / 2;   /* bytes per 422P chroma row */
    const int cols = width & ~1;
    int row, col;

    ac_memcpy(dest[0], src[0], width*height);
    for (row = 0; row < height; row++) {
        const uint8_t *u_in = src[1] + row * width;
        const uint8_t *v_in = src[2] + row * width;
        uint8_t *u_out = dest[1] + row * out_stride;
        uint8_t *v_out = dest[2] + row * out_stride;

        for (col = 0; col < cols; col += 2) {
            u_out[col/2] = (u_in[col] + u_in[col+1] + 1) / 2;
            v_out[col/2] = (v_in[col] + v_in[col+1] + 1) / 2;
        }
    }
    return 1;
}
267 
268 /*************************************************************************/
269 
270 /* We treat Y8 as a planar format */
271 
/* Convert any planar YUV format to Y8: only the width*height Y plane
 * (plane 0) is copied; src[1] and src[2] are never read.
 * Always returns 1. */
static int yuvp_y8(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int luma_bytes = width * height;

    ac_memcpy(dest[0], src[0], luma_bytes);
    return 1;
}
277 
/* Convert Y8 to YUV420P: the Y plane is copied and both
 * (width/2)*(height/2) chroma planes are filled with the value 128.
 * Always returns 1. */
static int y8_yuv420p(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int chroma_bytes = (width/2) * (height/2);
    int plane;

    ac_memcpy(dest[0], src[0], width*height);
    for (plane = 1; plane <= 2; plane++) {
        memset(dest[plane], 128, chroma_bytes);
    }
    return 1;
}
285 
/* Convert Y8 to YUV411P: the Y plane is copied and both (width/4)*height
 * chroma planes are filled with the value 128.  Always returns 1. */
static int y8_yuv411p(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int chroma_bytes = (width/4) * height;
    int plane;

    ac_memcpy(dest[0], src[0], width*height);
    for (plane = 1; plane <= 2; plane++) {
        memset(dest[plane], 128, chroma_bytes);
    }
    return 1;
}
293 
/* Convert Y8 to YUV422P: the Y plane is copied and both (width/2)*height
 * chroma planes are filled with the value 128.  Always returns 1. */
static int y8_yuv422p(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int chroma_bytes = (width/2) * height;
    int plane;

    ac_memcpy(dest[0], src[0], width*height);
    for (plane = 1; plane <= 2; plane++) {
        memset(dest[plane], 128, chroma_bytes);
    }
    return 1;
}
301 
/* Convert Y8 to YUV444P: the Y plane is copied and both width*height
 * chroma planes are filled with the value 128.  Always returns 1. */
static int y8_yuv444p(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int plane_bytes = width * height;
    int plane;

    ac_memcpy(dest[0], src[0], plane_bytes);
    for (plane = 1; plane <= 2; plane++) {
        memset(dest[plane], 128, plane_bytes);
    }
    return 1;
}
309 
310 /*************************************************************************/
311 /*************************************************************************/
312 
313 #if defined(HAVE_ASM_SSE2)
314 
315 /* SSE2 routines.  See comments in img_x86_common.h for why we don't bother
316  * unrolling the loops. */
317 
318 /* Common macros/data for x86 code */
319 #include "img_x86_common.h"
320 
/* Average 2 bytes horizontally (e.g. 422P->411P) (unit: 2 source bytes)
 *
 * Operands: src is passed in ESI, dest in EDI, and count in ECX (tied to
 * the dummy output so the compiler knows ECX is modified).  All addressing
 * is relative to ECX: source bytes at -2/-1(ESI,ECX,2), output byte at
 * -1(EDI,ECX), so count is the number of output bytes.
 *
 * main_loop: loads 16 source bytes, splits them into even and odd bytes
 * (mask in XMM7 / shift by 8), averages the pairs with pavgw and stores the
 * 8 packed results.
 * small_loop: handles one pair per pass as (a + b) >> 1.
 * SIMD_LOOP_WRAPPER comes from img_x86_common.h and is not visible here;
 * presumably it runs main_loop on blocks of `blocksize` units and small_loop
 * on the remainder -- confirm against that header.
 *
 * The "memory" clobber is required because the asm reads *src and writes
 * *dest through pointer inputs that are not declared as memory operands.
 *
 * NOTE(review): pavgw rounds up ((a+b+1)>>1, as the C routines do) while
 * the scalar loop truncates; results may differ by 1 for bytes handled by
 * small_loop.  NOTE(review): XMM0, XMM1 and XMM7 are modified but not listed
 * as clobbers -- confirm this is safe for all callers. */
#define AVG_2H(src,dest,count)  do { \
    int dummy;                                                          \
    asm volatile(                                                       \
        "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" /* XMM7: 0x00FF*8 */ \
        SIMD_LOOP_WRAPPER(                                              \
        /* blocksize */ 8,                                              \
        /* push_regs */ "",                                             \
        /* pop_regs  */ "",                                             \
        /* small_loop */                                                \
        "movzbl -2("ESI","ECX",2), %%eax                                \n\
        movzbl -1("ESI","ECX",2), %%edx                                 \n\
        addl %%edx, %%eax                                               \n\
        shrl $1, %%eax                                                  \n\
        movb %%al, -1("EDI","ECX")",                                    \
        /* main_loop */                                                 \
        "movdqu -16("ESI","ECX",2),%%xmm0 #XMM0:FEDCBA9876543210        \n\
        movdqa %%xmm0, %%xmm1           # XMM1: FEDCBA9876543210        \n\
        pand %%xmm7, %%xmm0             # XMM0:  E C A 8 6 4 2 0        \n\
        psrlw $8, %%xmm1                # XMM1:  F D B 9 7 5 3 1        \n\
        pavgw %%xmm1, %%xmm0            # XMM0:  w v u t s r q p (avgs) \n\
        packuswb %%xmm0, %%xmm0         # XMM0: wvutsrqpwvutsrqp        \n\
        movq %%xmm0, -8("EDI","ECX")",                                  \
        /* emms */ "emms")                                              \
        : "=c" (dummy)                                                  \
        : "S" (src), "D" (dest), "0" (count)                            \
        : "eax", "edx", "memory");                                      \
} while (0)
349 
/* Average 4 bytes horizontally (e.g. 444P->411P) (unit: 4 source bytes)
 *
 * Operands: src is passed in ESI, dest in EDI, and count in ECX (tied to
 * the dummy output so the compiler knows ECX is modified).  Source bytes
 * are addressed at -4..-1(ESI,ECX,4) and the output byte at -1(EDI,ECX),
 * so count is the number of output bytes.
 *
 * main_loop: loads 16 source bytes, isolates each of the four byte lanes
 * of every 32-bit group (mask in XMM7 plus shifts), averages them pairwise
 * with pavgw in two stages and stores the 4 packed results.
 * small_loop: handles one group per pass as (a + b + c + d) >> 2.
 * SIMD_LOOP_WRAPPER comes from img_x86_common.h and is not visible here;
 * presumably it runs main_loop on blocks of `blocksize` units and small_loop
 * on the remainder -- confirm against that header.
 *
 * The "memory" clobber is required because the asm reads *src and writes
 * *dest through pointer inputs that are not declared as memory operands.
 *
 * NOTE(review): the two-stage pavgw rounds up at each stage, the scalar
 * loop truncates, and the C routine uses (sum + 2) / 4, so the three paths
 * can differ by 1.  NOTE(review): XMM0-XMM3 and XMM7 are modified but not
 * listed as clobbers -- confirm this is safe for all callers. */
#define AVG_4H(src,dest,count)  do { \
    int dummy;                                                          \
    asm volatile(                                                       \
        "pcmpeqd %%xmm7, %%xmm7; psrld $24, %%xmm7;" /* XMM7: 0x000000FF*4 */ \
        SIMD_LOOP_WRAPPER(                                              \
        /* blocksize */ 4,                                              \
        /* push_regs */ "",                                             \
        /* pop_regs  */ "",                                             \
        /* small_loop */                                                \
        "movzbl -4("ESI","ECX",4), %%eax                                \n\
        movzbl -3("ESI","ECX",4), %%edx                                 \n\
        addl %%edx, %%eax                                               \n\
        movzbl -2("ESI","ECX",4), %%edx                                 \n\
        addl %%edx, %%eax                                               \n\
        movzbl -1("ESI","ECX",4), %%edx                                 \n\
        addl %%edx, %%eax                                               \n\
        shrl $2, %%eax                                                  \n\
        movb %%al, -1("EDI","ECX")",                                    \
        /* main_loop */                                                 \
        "movdqu -16("ESI","ECX",4),%%xmm0 #XMM0:FEDCBA9876543210        \n\
        movdqa %%xmm0, %%xmm1           # XMM1: FEDCBA9876543210        \n\
        movdqa %%xmm0, %%xmm2           # XMM2: FEDCBA9876543210        \n\
        movdqa %%xmm0, %%xmm3           # XMM3: FEDCBA9876543210        \n\
        pand %%xmm7, %%xmm0             # XMM0:    C   8   4   0        \n\
        psrld $8, %%xmm1                # XMM1:  FED BA9 765 321        \n\
        pand %%xmm7, %%xmm1             # XMM1:    D   9   5   1        \n\
        psrld $16, %%xmm2               # XMM2:   FE  BA  76  32        \n\
        pand %%xmm7, %%xmm2             # XMM2:    E   A   6   2        \n\
        psrld $24, %%xmm3               # XMM3:    F   B   7   3        \n\
        pavgw %%xmm1, %%xmm0            # XMM0:  C+D 8+9 4+5 0+1 (avgs) \n\
        pavgw %%xmm3, %%xmm2            # XMM2:  E+F A+B 6+7 2+3 (avgs) \n\
        pavgw %%xmm2, %%xmm0            # XMM0:    s   r   q   p (avgs) \n\
        packuswb %%xmm0, %%xmm0         # XMM0:  s r q p s r q p        \n\
        packuswb %%xmm0, %%xmm0         # XMM0: srqpsrqpsrqpsrqp        \n\
        movd %%xmm0, -4("EDI","ECX")",                                  \
        /* emms */ "emms")                                              \
        : "=c" (dummy)                                                  \
        : "S" (src), "D" (dest), "0" (count)                            \
        : "eax", "edx", "memory");                                      \
} while (0)
391 
/* Repeat 2 bytes horizontally (e.g. 422P->444P) (unit: 1 source byte)
 *
 * Operands: src is passed in ESI, dest in EDI, and count in ECX (tied to
 * the dummy output so the compiler knows ECX is modified).  The source byte
 * is addressed at -1(ESI,ECX) and the output pair at -2(EDI,ECX,2), so
 * count is the number of source bytes and 2*count bytes are written.
 *
 * main_loop: loads 8 source bytes, doubles each with punpcklbw and stores
 * the resulting 16 bytes.
 * small_loop: duplicates one byte into AL/AH and stores the 16-bit pair.
 * SIMD_LOOP_WRAPPER comes from img_x86_common.h and is not visible here;
 * presumably it runs main_loop on blocks of `blocksize` units and small_loop
 * on the remainder -- confirm against that header.
 *
 * The "memory" clobber is required because the asm reads *src and writes
 * *dest through pointer inputs that are not declared as memory operands.
 *
 * NOTE(review): XMM0 is modified but not listed as a clobber -- confirm
 * this is safe for all callers. */
#define REP_2H(src,dest,count)  do { \
    int dummy;                                                          \
    asm volatile(SIMD_LOOP_WRAPPER(                                     \
        /* blocksize */ 8,                                              \
        /* push_regs */ "",                                             \
        /* pop_regs  */ "",                                             \
        /* small_loop */                                                \
        "movb -1("ESI","ECX"), %%al                                     \n\
        movb %%al, %%ah                                                 \n\
        movw %%ax, -2("EDI","ECX",2)",                                  \
        /* main_loop */                                                 \
        "movq -8("ESI","ECX"), %%xmm0   # XMM0:         76543210        \n\
        punpcklbw %%xmm0, %%xmm0        # XMM0: 7766554433221100        \n\
        movdqu %%xmm0, -16("EDI","ECX",2)",                             \
        /* emms */ "emms")                                              \
        : "=c" (dummy)                                                  \
        : "S" (src), "D" (dest), "0" (count)                            \
        : "eax", "memory");                                             \
} while (0)
412 
/* Repeat 4 bytes horizontally (e.g. 411P->444P) (unit: 1 source byte)
 *
 * Operands: src is passed in ESI, dest in EDI, and count in ECX (tied to
 * the dummy output so the compiler knows ECX is modified).  The source byte
 * is addressed at -1(ESI,ECX) and the output group at -4(EDI,ECX,4), so
 * count is the number of source bytes and 4*count bytes are written.
 *
 * main_loop: loads 4 source bytes, expands each to 4 copies with
 * punpcklbw + punpcklwd and stores the resulting 16 bytes.
 * small_loop: replicates one byte into all four bytes of EAX and stores it.
 * SIMD_LOOP_WRAPPER comes from img_x86_common.h and is not visible here;
 * presumably it runs main_loop on blocks of `blocksize` units and small_loop
 * on the remainder -- confirm against that header.
 *
 * The "memory" clobber is required because the asm reads *src and writes
 * *dest through pointer inputs that are not declared as memory operands.
 *
 * NOTE(review): XMM0 is modified but not listed as a clobber -- confirm
 * this is safe for all callers. */
#define REP_4H(src,dest,count)  do { \
    int dummy;                                                          \
    asm volatile(SIMD_LOOP_WRAPPER(                                     \
        /* blocksize */ 4,                                              \
        /* push_regs */ "",                                             \
        /* pop_regs  */ "",                                             \
        /* small_loop */                                                \
        "movzbl -1("ESI","ECX"), %%eax                                  \n\
        movb %%al, %%ah                                                 \n\
        movl %%eax, %%edx                                               \n\
        shll $16, %%eax                                                 \n\
        orl %%edx, %%eax                                                \n\
        movl %%eax, -4("EDI","ECX",4)",                                 \
        /* main_loop */                                                 \
        "movd -4("ESI","ECX"), %%xmm0   # XMM0:             3210        \n\
        punpcklbw %%xmm0, %%xmm0        # XMM0:         33221100        \n\
        punpcklwd %%xmm0, %%xmm0        # XMM0: 3333222211110000        \n\
        movdqu %%xmm0, -16("EDI","ECX",4)",                             \
        /* emms */ "emms")                                              \
        : "=c" (dummy)                                                  \
        : "S" (src), "D" (dest), "0" (count)                            \
        : "eax", "edx", "memory");                                      \
} while (0)
437 
/* Average 2 bytes vertically and double horizontally (411P->420P)
 * (unit: 1 source byte)
 *
 * Operands: src1 is passed in ESI, src2 in EDX, dest in EDI, and count in
 * ECX (tied to the dummy output so the compiler knows ECX is modified).
 * One byte from each source row is addressed at -1(ESI,ECX) / -1(EDX,ECX)
 * and the output pair at -2(EDI,ECX,2), so count is the number of bytes per
 * source row and 2*count bytes are written.
 *
 * main_loop: loads 8 bytes from each row, averages them with pavgb, doubles
 * each result with punpcklbw and stores 16 bytes.
 * small_loop: computes (a + b) >> 1 for one column and stores it twice.
 * EBX is used as scratch there; it is saved and restored through the
 * push_regs/pop_regs arguments rather than being listed as a clobber.
 * SIMD_LOOP_WRAPPER comes from img_x86_common.h and is not visible here;
 * presumably it runs main_loop on blocks of `blocksize` units and small_loop
 * on the remainder -- confirm against that header.
 *
 * The "memory" clobber is required because the asm reads *src1/*src2 and
 * writes *dest through pointer inputs that are not declared as memory
 * operands.
 *
 * NOTE(review): pavgb rounds up ((a+b+1)>>1, as the C routine does) while
 * the scalar loop truncates; results may differ by 1 for bytes handled by
 * small_loop.  NOTE(review): XMM0 and XMM1 are modified but not listed as
 * clobbers -- confirm this is safe for all callers. */
#define AVG_411_420(src1,src2,dest,count)  do { \
    int dummy;                                                          \
    asm volatile(SIMD_LOOP_WRAPPER(                                     \
        /* blocksize */ 8,                                              \
        /* push_regs */ "push "EBX,                                     \
        /* pop_regs  */ "pop "EBX,                                      \
        /* small_loop */                                                \
        "movzbl -1("ESI","ECX"), %%eax                                  \n\
        movzbl -1("EDX","ECX"), %%ebx                                   \n\
        addl %%ebx, %%eax                                               \n\
        shrl $1, %%eax                                                  \n\
        movb %%al, %%ah                                                 \n\
        movw %%ax, -2("EDI","ECX",2)",                                  \
        /* main_loop */                                                 \
        "movq -8("ESI","ECX"), %%xmm0                                   \n\
        movq -8("EDX","ECX"), %%xmm1                                    \n\
        pavgb %%xmm1, %%xmm0                                            \n\
        punpcklbw %%xmm0, %%xmm0                                        \n\
        movdqu %%xmm0, -16("EDI","ECX",2)",                             \
        /* emms */ "emms")                                              \
        : "=c" (dummy)                                                  \
        : "S" (src1), "d" (src2), "D" (dest), "0" (count)               \
        : "eax", "memory");                                             \
} while (0)
464 
/* Average 2 bytes vertically (422P->420P) (unit: 1 source byte)
 *
 * Operands: src1 is passed in ESI, src2 in EDX, dest in EDI, and count in
 * ECX (tied to the dummy output so the compiler knows ECX is modified).
 * One byte from each source row is addressed at -1(ESI,ECX) / -1(EDX,ECX)
 * and the output byte at -1(EDI,ECX), so count is the number of bytes per
 * row, both read and written.
 *
 * main_loop: loads 16 bytes from each row, averages them with pavgb and
 * stores 16 bytes.
 * small_loop: computes (a + b) >> 1 for one column.  EBX is used as scratch
 * there; it is saved and restored through the push_regs/pop_regs arguments
 * rather than being listed as a clobber.
 * SIMD_LOOP_WRAPPER comes from img_x86_common.h and is not visible here;
 * presumably it runs main_loop on blocks of `blocksize` units and small_loop
 * on the remainder -- confirm against that header.
 *
 * The "memory" clobber is required because the asm reads *src1/*src2 and
 * writes *dest through pointer inputs that are not declared as memory
 * operands.
 *
 * NOTE(review): pavgb rounds up ((a+b+1)>>1, as the C routine does) while
 * the scalar loop truncates; results may differ by 1 for bytes handled by
 * small_loop.  NOTE(review): XMM0 and XMM1 are modified but not listed as
 * clobbers -- confirm this is safe for all callers. */
#define AVG_422_420(src1,src2,dest,count)  do { \
    int dummy;                                                          \
    asm volatile(SIMD_LOOP_WRAPPER(                                     \
        /* blocksize */ 16,                                             \
        /* push_regs */ "push "EBX,                                     \
        /* pop_regs  */ "pop "EBX,                                      \
        /* small_loop */                                                \
        "movzbl -1("ESI","ECX"), %%eax                                  \n\
        movzbl -1("EDX","ECX"), %%ebx                                   \n\
        addl %%ebx, %%eax                                               \n\
        shrl $1, %%eax                                                  \n\
        movb %%al, -1("EDI","ECX")",                                    \
        /* main_loop */                                                 \
        "movdqu -16("ESI","ECX"), %%xmm0                                \n\
        movdqu -16("EDX","ECX"), %%xmm1                                 \n\
        pavgb %%xmm1, %%xmm0                                            \n\
        movdqu %%xmm0, -16("EDI","ECX")",                               \
        /* emms */ "emms")                                              \
        : "=c" (dummy)                                                  \
        : "S" (src1), "d" (src2), "D" (dest), "0" (count)               \
        : "eax", "memory");                                             \
} while (0)
488 
/* Average 4 bytes, 2 horizontally and 2 vertically (444P->420P)
 * (unit: 2 source bytes)
 *
 * Operands: src1 is passed in ESI, src2 in EDX, dest in EDI, and count in
 * ECX.  Two bytes from each source row are addressed at -2/-1(ESI,ECX,2)
 * and -2/-1(EDX,ECX,2) and the output byte at -1(EDI,ECX), so count is the
 * number of output bytes.
 *
 * main_loop: loads 16 bytes from each row, averages even/odd bytes of each
 * row with pavgw (mask in XMM7 / shift by 8), averages the two rows with
 * another pavgw and stores the 8 packed results.
 * small_loop: computes (a + b + c + d) >> 2 for one 2x2 block.  EBX is used
 * as scratch there; it is saved and restored through the push_regs/pop_regs
 * arguments rather than being listed as a clobber.
 * SIMD_LOOP_WRAPPER comes from img_x86_common.h and is not visible here;
 * presumably it runs main_loop on blocks of `blocksize` units and small_loop
 * on the remainder -- confirm against that header.
 *
 * Constraints: count is tied to the dummy output with "0", exactly like the
 * other macros in this file, so the compiler knows ECX is modified.  EAX is
 * listed as clobbered because small_loop overwrites it, and "memory" is
 * listed because the asm reads *src1/*src2 and writes *dest through pointer
 * inputs that are not declared as memory operands.
 *
 * NOTE(review): the two-stage pavgw rounds up at each stage, the scalar
 * loop truncates, and the C routine uses (sum + 2) / 4, so the three paths
 * can differ by 1.  NOTE(review): XMM0-XMM3 and XMM7 are modified but not
 * listed as clobbers -- confirm this is safe for all callers. */
#define AVG_444_420(src1,src2,dest,count)  do { \
    int dummy;                                                          \
    asm volatile(                                                       \
        "pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" /* XMM7: 0x00FF*8 */ \
        SIMD_LOOP_WRAPPER(                                              \
        /* blocksize */ 8,                                              \
        /* push_regs */ "push "EBX,                                     \
        /* pop_regs  */ "pop "EBX,                                      \
        /* small_loop */                                                \
        "movzbl -2("ESI","ECX",2), %%eax                                \n\
        movzbl -1("ESI","ECX",2), %%ebx                                 \n\
        addl %%ebx, %%eax                                               \n\
        movzbl -2("EDX","ECX",2), %%ebx                                 \n\
        addl %%ebx, %%eax                                               \n\
        movzbl -1("EDX","ECX",2), %%ebx                                 \n\
        addl %%ebx, %%eax                                               \n\
        shrl $2, %%eax                                                  \n\
        movb %%al, -1("EDI","ECX")",                                    \
        /* main_loop */                                                 \
        "movdqu -16("ESI","ECX",2), %%xmm0                              \n\
        movdqu -16("EDX","ECX",2), %%xmm2                               \n\
        movdqa %%xmm0, %%xmm1                                           \n\
        pand %%xmm7, %%xmm0                                             \n\
        psrlw $8, %%xmm1                                                \n\
        pavgw %%xmm1, %%xmm0                                            \n\
        movdqa %%xmm2, %%xmm3                                           \n\
        pand %%xmm7, %%xmm2                                             \n\
        psrlw $8, %%xmm3                                                \n\
        pavgw %%xmm3, %%xmm2                                            \n\
        pavgw %%xmm2, %%xmm0                                            \n\
        packuswb %%xmm0, %%xmm0                                         \n\
        movq %%xmm0, -8("EDI","ECX")",                                  \
        /* emms */ "emms")                                              \
        : "=c" (dummy)                                                  \
        : "S" (src1), "d" (src2), "D" (dest), "0" (count)               \
        : "eax", "memory");                                             \
} while (0)
527 
528 /*************************************************************************/
529 
/* SSE2 version of the YUV420P -> YUV411P conversion.  The Y plane is copied
 * as is.  For each pair of output rows, AVG_2H produces width/4 horizontally
 * averaged chroma bytes from one source chroma row, and that output row is
 * then duplicated into the next one.  Only (height & ~1) rows are processed.
 * Always returns 1. */
static int yuv420p_yuv411p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int in_stride = width / 2;    /* bytes per 420P chroma row */
    const int out_stride = width / 4;   /* bytes per 411P chroma row */
    const int rows = height & ~1;
    int row;

    ac_memcpy(dest[0], src[0], width*height);
    for (row = 0; row < rows; row += 2) {
        uint8_t *u_out = dest[1] + row * out_stride;
        uint8_t *v_out = dest[2] + row * out_stride;

        AVG_2H(src[1] + (row/2) * in_stride, u_out, out_stride);
        ac_memcpy(u_out + out_stride, u_out, out_stride);
        AVG_2H(src[2] + (row/2) * in_stride, v_out, out_stride);
        ac_memcpy(v_out + out_stride, v_out, out_stride);
    }
    return 1;
}
542 
/* SSE2 version of the YUV420P -> YUV444P conversion.  The Y plane is copied
 * as is.  For each pair of output rows, REP_2H doubles the width/2 bytes of
 * one source chroma row horizontally, and that output row is then
 * duplicated into the next one.
 *
 * Only (height & ~1) rows are processed, matching the other converters in
 * this file.  With the previous `y < height` bound, an odd height read a
 * chroma row that the source does not have and copied one row past the end
 * of dest[1]/dest[2].  For odd heights the last chroma row of the output is
 * left untouched.
 *
 * Always returns 1. */
static int yuv420p_yuv444p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int rows = height & ~1;
    int y;

    ac_memcpy(dest[0], src[0], width*height);
    for (y = 0; y < rows; y += 2) {
        REP_2H(src[1]+(y/2)*(width/2), dest[1]+y*width, width/2);
        ac_memcpy(dest[1]+(y+1)*width, dest[1]+y*width, width);
        REP_2H(src[2]+(y/2)*(width/2), dest[2]+y*width, width/2);
        ac_memcpy(dest[2]+(y+1)*width, dest[2]+y*width, width);
    }
    return 1;
}
555 
556 /*************************************************************************/
557 
/* SSE2 version of the YUV411P -> YUV420P conversion.  The Y plane is copied
 * as is.  For each pair of source chroma rows, AVG_411_420 averages the two
 * rows vertically and doubles the result horizontally into one output
 * chroma row.  Only (height & ~1) source rows are consumed.
 * Always returns 1. */
static int yuv411p_yuv420p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int in_stride = width / 4;    /* bytes per 411P chroma row */
    const int out_stride = width / 2;   /* bytes per 420P chroma row */
    const int rows = height & ~1;
    int row;

    ac_memcpy(dest[0], src[0], width*height);
    for (row = 0; row < rows; row += 2) {
        const int in_off = row * in_stride;
        const int out_off = (row/2) * out_stride;

        AVG_411_420(src[1] + in_off, src[1] + in_off + in_stride,
                    dest[1] + out_off, in_stride);
        AVG_411_420(src[2] + in_off, src[2] + in_off + in_stride,
                    dest[2] + out_off, in_stride);
    }
    return 1;
}
570 
/* SSE2 version of the YUV411P -> YUV422P conversion.  The Y plane is copied
 * as is and every source chroma byte is doubled horizontally with REP_2H.
 * When width is a multiple of 4 each chroma plane is processed in a single
 * REP_2H call; otherwise rows are processed one at a time so that each row
 * starts at its own stride.  Always returns 1. */
static int yuv411p_yuv422p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int in_stride = width / 4;    /* bytes per 411P chroma row */
    const int out_stride = width / 2;   /* bytes per 422P chroma row */

    ac_memcpy(dest[0], src[0], width*height);
    if (width & 3) {
        /* Row sizes do not line up: convert one row per call */
        int row;
        for (row = 0; row < height; row++) {
            REP_2H(src[1] + row * in_stride, dest[1] + row * out_stride,
                   in_stride);
            REP_2H(src[2] + row * in_stride, dest[2] + row * out_stride,
                   in_stride);
        }
    } else {
        /* Rows are contiguous: convert each plane in one pass */
        REP_2H(src[1], dest[1], in_stride * height);
        REP_2H(src[2], dest[2], in_stride * height);
    }
    return 1;
}
588 
/* SSE2 version of the YUV411P -> YUV444P conversion.  The Y plane is copied
 * as is and every source chroma byte is repeated four times horizontally
 * with REP_4H.  When width is a multiple of 4 each chroma plane is
 * processed in a single REP_4H call; otherwise rows are processed one at a
 * time so that each row starts at its own stride.  Always returns 1. */
static int yuv411p_yuv444p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int in_stride = width / 4;    /* bytes per 411P chroma row */

    ac_memcpy(dest[0], src[0], width*height);
    if (width & 3) {
        /* Row sizes do not line up: convert one row per call */
        int row;
        for (row = 0; row < height; row++) {
            REP_4H(src[1] + row * in_stride, dest[1] + row * width,
                   in_stride);
            REP_4H(src[2] + row * in_stride, dest[2] + row * width,
                   in_stride);
        }
    } else {
        /* Rows are contiguous: convert each plane in one pass */
        REP_4H(src[1], dest[1], in_stride * height);
        REP_4H(src[2], dest[2], in_stride * height);
    }
    return 1;
}
606 
607 /*************************************************************************/
608 
/*
 * YUV422P -> YUV420P (SSE2 build).
 *
 * Plane 0 is copied verbatim.  For planes 1 and 2, each pair of adjacent
 * source rows is passed to AVG_422_420 along with one destination row;
 * source and destination rows are both width/2 bytes long.  When height is
 * odd, the final source row is left unprocessed.
 *
 * NOTE(review): AVG_422_420 is defined elsewhere in this file; its name
 * suggests it averages the two rows -- confirm against the macro
 * definition.
 *
 * Always returns 1.
 */
static int yuv422p_yuv420p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int stride = width / 2;   /* row length for planes 1/2, in and out */
    const int pairs  = height / 2;  /* number of complete row pairs */
    int pair;

    ac_memcpy(dest[0], src[0], width*height);

    for (pair = 0; pair < pairs; pair++) {
        const int top = 2*pair;     /* index of the first row of the pair */

        AVG_422_420(src[1] + top*stride, src[1] + (top+1)*stride,
                    dest[1] + pair*stride, stride);
        AVG_422_420(src[2] + top*stride, src[2] + (top+1)*stride,
                    dest[2] + pair*stride, stride);
    }
    return 1;
}
621 
/*
 * YUV422P -> YUV411P (SSE2 build).
 *
 * Plane 0 is copied verbatim.  Planes 1 and 2 go through AVG_2H, with
 * source rows of width/2 bytes and destination rows of width/4 bytes; the
 * count passed to AVG_2H is expressed in destination bytes.  A width that
 * is a multiple of 4 allows one call per plane; any other width is
 * processed row by row.
 *
 * NOTE(review): AVG_2H is defined elsewhere in this file; presumably it
 * averages horizontally adjacent byte pairs -- confirm against the macro
 * definition.
 *
 * Always returns 1.
 */
static int yuv422p_yuv411p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int out_row = width / 4;
    int plane;

    ac_memcpy(dest[0], src[0], width*height);

    if (width & 3) {
        /* Leftover bytes at the end of each row: go one row at a time */
        const int in_row = width / 2;
        int line;

        for (line = 0; line < height; line++) {
            for (plane = 1; plane <= 2; plane++) {
                AVG_2H(src[plane] + line*in_row, dest[plane] + line*out_row,
                       out_row);
            }
        }
    } else {
        /* Rows are contiguous: convert each plane in one go */
        for (plane = 1; plane <= 2; plane++) {
            AVG_2H(src[plane], dest[plane], out_row*height);
        }
    }
    return 1;
}
639 
/*
 * YUV422P -> YUV444P (SSE2 build).
 *
 * Plane 0 is copied verbatim.  Planes 1 and 2 go through REP_2H, with
 * source rows of width/2 bytes and destination rows of width bytes.  An
 * even width allows one call per plane; an odd width is processed row by
 * row.
 *
 * NOTE(review): REP_2H is defined elsewhere in this file; presumably it
 * writes each source byte twice -- confirm against the macro definition.
 *
 * Always returns 1.
 */
static int yuv422p_yuv444p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int half = width / 2;
    uint8_t *in1, *in2, *out1, *out2;
    int line;

    ac_memcpy(dest[0], src[0], width*height);

    if ((width & 1) == 0) {
        /* Even width: nothing to skip between rows */
        REP_2H(src[1], dest[1], half*height);
        REP_2H(src[2], dest[2], half*height);
        return 1;
    }

    /* Odd width: step through the planes one row at a time */
    in1  = src[1];
    in2  = src[2];
    out1 = dest[1];
    out2 = dest[2];
    line = 0;
    while (line < height) {
        REP_2H(in1, out1, half);
        REP_2H(in2, out2, half);
        in1  += half;
        in2  += half;
        out1 += width;
        out2 += width;
        line++;
    }
    return 1;
}
657 
658 /*************************************************************************/
659 
/*
 * YUV444P -> YUV420P (SSE2 build).
 *
 * Plane 0 is copied verbatim.  For planes 1 and 2, each pair of adjacent
 * source rows (width bytes each) is passed to AVG_444_420 along with one
 * destination row (width/2 bytes); the count passed is width/2.  When
 * height is odd, the final source row is left unprocessed.
 *
 * NOTE(review): AVG_444_420 is defined elsewhere in this file; its name
 * suggests it averages both vertically and horizontally -- confirm against
 * the macro definition.
 *
 * Always returns 1.
 */
static int yuv444p_yuv420p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int out_stride = width / 2;
    const int even_rows  = height & ~1;  /* drop a trailing odd row */
    int row, plane;

    ac_memcpy(dest[0], src[0], width*height);

    for (row = 0; row < even_rows; row += 2) {
        for (plane = 1; plane <= 2; plane++) {
            uint8_t *first  = src[plane] + row*width;
            uint8_t *second = src[plane] + (row+1)*width;

            AVG_444_420(first, second,
                        dest[plane] + (row/2)*out_stride, out_stride);
        }
    }
    return 1;
}
672 
/*
 * YUV444P -> YUV411P (SSE2 build).
 *
 * Plane 0 is copied verbatim.  Planes 1 and 2 go through AVG_4H, with
 * source rows of width bytes and destination rows of width/4 bytes; the
 * count passed to AVG_4H is expressed in destination bytes.  A width that
 * is a multiple of 4 allows one call per plane; any other width is
 * processed row by row.
 *
 * NOTE(review): AVG_4H is defined elsewhere in this file; presumably it
 * averages groups of four horizontally adjacent bytes -- confirm against
 * the macro definition.
 *
 * Always returns 1.
 */
static int yuv444p_yuv411p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int quarter = width / 4;
    int line;

    ac_memcpy(dest[0], src[0], width*height);

    if ((width & 3) == 0) {
        /* No leftover bytes per row, so each plane is one contiguous run */
        const int total = quarter*height;

        AVG_4H(src[1], dest[1], total);
        AVG_4H(src[2], dest[2], total);
        return 1;
    }

    /* Otherwise convert row by row, skipping the leftover source bytes */
    for (line = 0; line < height; line++) {
        const int in_off  = line*width;
        const int out_off = line*quarter;

        AVG_4H(src[1] + in_off, dest[1] + out_off, quarter);
        AVG_4H(src[2] + in_off, dest[2] + out_off, quarter);
    }
    return 1;
}
690 
/*
 * YUV444P -> YUV422P (SSE2 build).
 *
 * Plane 0 is copied verbatim.  Planes 1 and 2 go through AVG_2H, with
 * source rows of width bytes and destination rows of width/2 bytes; the
 * count passed to AVG_2H is expressed in destination bytes.  An even width
 * allows one call per plane; an odd width is processed row by row.
 *
 * NOTE(review): AVG_2H is defined elsewhere in this file; presumably it
 * averages horizontally adjacent byte pairs -- confirm against the macro
 * definition.
 *
 * Always returns 1.
 */
static int yuv444p_yuv422p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
    const int half = width / 2;

    ac_memcpy(dest[0], src[0], width*height);

    if (width & 1) {
        /* Odd width: one leftover source byte per row, so go row by row */
        uint8_t *in1 = src[1], *in2 = src[2];
        uint8_t *out1 = dest[1], *out2 = dest[2];
        int rows_left;

        for (rows_left = height; rows_left > 0; rows_left--) {
            AVG_2H(in1, out1, half);
            AVG_2H(in2, out2, half);
            in1 += width;
            in2 += width;
            out1 += half;
            out2 += half;
        }
    } else {
        /* Even width: each plane can be handled in a single call */
        AVG_2H(src[1], dest[1], half*height);
        AVG_2H(src[2], dest[2], half*height);
    }
    return 1;
}
708 
709 /*************************************************************************/
710 
711 #endif  /* HAVE_ASM_SSE2 */
712 
713 /*************************************************************************/
714 /*************************************************************************/
715 
716 /* Initialization */
717 
ac_imgconvert_init_yuv_planar(int accel)718 int ac_imgconvert_init_yuv_planar(int accel)
719 {
720     if (!register_conversion(IMG_YUV420P, IMG_YUV420P, yuv420p_copy)
721      || !register_conversion(IMG_YUV420P, IMG_YUV411P, yuv420p_yuv411p)
722      || !register_conversion(IMG_YUV420P, IMG_YUV422P, yuv420p_yuv422p)
723      || !register_conversion(IMG_YUV420P, IMG_YUV444P, yuv420p_yuv444p)
724      || !register_conversion(IMG_YUV420P, IMG_Y8,      yuvp_y8)
725 
726      || !register_conversion(IMG_YUV411P, IMG_YUV420P, yuv411p_yuv420p)
727      || !register_conversion(IMG_YUV411P, IMG_YUV411P, yuv411p_copy)
728      || !register_conversion(IMG_YUV411P, IMG_YUV422P, yuv411p_yuv422p)
729      || !register_conversion(IMG_YUV411P, IMG_YUV444P, yuv411p_yuv444p)
730      || !register_conversion(IMG_YUV411P, IMG_Y8,      yuvp_y8)
731 
732      || !register_conversion(IMG_YUV422P, IMG_YUV420P, yuv422p_yuv420p)
733      || !register_conversion(IMG_YUV422P, IMG_YUV411P, yuv422p_yuv411p)
734      || !register_conversion(IMG_YUV422P, IMG_YUV422P, yuv422p_copy)
735      || !register_conversion(IMG_YUV422P, IMG_YUV444P, yuv422p_yuv444p)
736      || !register_conversion(IMG_YUV422P, IMG_Y8,      yuvp_y8)
737 
738      || !register_conversion(IMG_YUV444P, IMG_YUV420P, yuv444p_yuv420p)
739      || !register_conversion(IMG_YUV444P, IMG_YUV411P, yuv444p_yuv411p)
740      || !register_conversion(IMG_YUV444P, IMG_YUV422P, yuv444p_yuv422p)
741      || !register_conversion(IMG_YUV444P, IMG_YUV444P, yuv444p_copy)
742      || !register_conversion(IMG_YUV444P, IMG_Y8,      yuvp_y8)
743 
744      || !register_conversion(IMG_Y8,      IMG_YUV420P, y8_yuv420p)
745      || !register_conversion(IMG_Y8,      IMG_YUV411P, y8_yuv411p)
746      || !register_conversion(IMG_Y8,      IMG_YUV422P, y8_yuv422p)
747      || !register_conversion(IMG_Y8,      IMG_YUV444P, y8_yuv444p)
748      || !register_conversion(IMG_Y8,      IMG_Y8,      y8_copy)
749     ) {
750         return 0;
751     }
752 
753 #if defined(HAVE_ASM_SSE2)
754     if (accel & AC_SSE2) {
755         if (!register_conversion(IMG_YUV420P, IMG_YUV411P, yuv420p_yuv411p_sse2)
756          || !register_conversion(IMG_YUV420P, IMG_YUV444P, yuv420p_yuv444p_sse2)
757 
758          || !register_conversion(IMG_YUV411P, IMG_YUV420P, yuv411p_yuv420p_sse2)
759          || !register_conversion(IMG_YUV411P, IMG_YUV422P, yuv411p_yuv422p_sse2)
760          || !register_conversion(IMG_YUV411P, IMG_YUV444P, yuv411p_yuv444p_sse2)
761 
762          || !register_conversion(IMG_YUV422P, IMG_YUV420P, yuv422p_yuv420p_sse2)
763          || !register_conversion(IMG_YUV422P, IMG_YUV411P, yuv422p_yuv411p_sse2)
764          || !register_conversion(IMG_YUV422P, IMG_YUV444P, yuv422p_yuv444p_sse2)
765 
766          || !register_conversion(IMG_YUV444P, IMG_YUV420P, yuv444p_yuv420p_sse2)
767          || !register_conversion(IMG_YUV444P, IMG_YUV411P, yuv444p_yuv411p_sse2)
768          || !register_conversion(IMG_YUV444P, IMG_YUV422P, yuv444p_yuv422p_sse2)
769         ) {
770             return 0;
771         }
772     }
773 #endif  /* ARCH_X86 || ARCH_X86_64 */
774 
775     return 1;
776 }
777 
778 /*************************************************************************/
779 
780 /*
781  * Local variables:
782  *   c-file-style: "stroustrup"
783  *   c-file-offsets: ((case-label . *) (statement-case-intro . *))
784  *   indent-tabs-mode: nil
785  * End:
786  *
787  * vim: expandtab shiftwidth=4:
788  */
789