1 // Copyright 2016 Adrien Descamps
2 // Distributed under BSD 3-Clause License
3 #include "../../SDL_internal.h"
4 
5 #if SDL_HAVE_YUV
6 
7 #include "yuv_rgb.h"
8 
9 #include "SDL_cpuinfo.h"
10 /*#include <x86intrin.h>*/
11 
/* Number of fractional bits used by the fixed-point conversion coefficients */
#define PRECISION 6
/* Scale applied to a real coefficient to obtain its fixed-point form (2^PRECISION) */
#define PRECISION_FACTOR (1<<PRECISION)
14 
// Fixed-point coefficients of one RGB -> YUV conversion (see the formula below).
typedef struct
{
	uint8_t y_shift;	// offset added to the luma component
	int16_t matrix[3][3];	// coefficients scaled by PRECISION_FACTOR; rows produce Y, U, V and columns apply to R, G, B
} RGB2YUVParam;
// |Y|   |y_shift|                        |matrix[0][0] matrix[0][1] matrix[0][2]|   |R|
// |U| = |  128  | + 1/PRECISION_FACTOR * |matrix[1][0] matrix[1][1] matrix[1][2]| * |G|
// |V|   |  128  |                        |matrix[2][0] matrix[2][1] matrix[2][2]|   |B|
23 
// Fixed-point coefficients of one YUV -> RGB conversion (see the formula below).
// All *_factor members are scaled by PRECISION_FACTOR.
typedef struct
{
	uint8_t y_shift;	// offset subtracted from Y before scaling
	int16_t y_factor;	// gain applied to (Y-y_shift) for all three channels
	int16_t v_r_factor;	// contribution of (V-128) to R
	int16_t u_g_factor;	// contribution of (U-128) to G
	int16_t v_g_factor;	// contribution of (V-128) to G
	int16_t u_b_factor;	// contribution of (U-128) to B
} YUV2RGBParam;
// |R|                        |y_factor      0       v_r_factor|   |Y-y_shift|
// |G| = 1/PRECISION_FACTOR * |y_factor  u_g_factor  v_g_factor| * |  U-128  |
// |B|                        |y_factor  u_b_factor      0     |   |  V-128  |
36 
// Convert a real coefficient to its fixed-point representation, rounded to the
// nearest integer. The "+0.5 then truncate" rounding is only correct for
// non-negative values: for a negative coefficient negate the result instead,
// e.g. -V(0.3441). The argument and the whole expansion are parenthesized so
// that the macro can safely be given an expression or be used inside one.
#define V(value) ((int16_t)(((value)*PRECISION_FACTOR)+0.5))
38 
39 // for ITU-T T.871, values can be found in section 7
40 // for ITU-R BT.601-7 values are derived from equations in sections 2.5.1-2.5.3, assuming RGB is encoded using full range ([0-1]<->[0-255])
41 // for ITU-R BT.709-6 values are derived from equations in sections 3.2-3.4, assuming RGB is encoded using full range ([0-1]<->[0-255])
42 // all values are rounded to the fourth decimal
43 
44 static const YUV2RGBParam YUV2RGB[3] = {
45 	// ITU-T T.871 (JPEG)
46 	{/*.y_shift=*/ 0, /*.y_factor=*/ V(1.0), /*.v_r_factor=*/ V(1.402), /*.u_g_factor=*/ -V(0.3441), /*.v_g_factor=*/ -V(0.7141), /*.u_b_factor=*/ V(1.772)},
47 	// ITU-R BT.601-7
48 	{/*.y_shift=*/ 16, /*.y_factor=*/ V(1.1644), /*.v_r_factor=*/ V(1.596), /*.u_g_factor=*/ -V(0.3918), /*.v_g_factor=*/ -V(0.813), /*.u_b_factor=*/ V(2.0172)},
49 	// ITU-R BT.709-6
50 	{/*.y_shift=*/ 16, /*.y_factor=*/ V(1.1644), /*.v_r_factor=*/ V(1.7927), /*.u_g_factor=*/ -V(0.2132), /*.v_g_factor=*/ -V(0.5329), /*.u_b_factor=*/ V(2.1124)}
51 };
52 
53 static const RGB2YUVParam RGB2YUV[3] = {
54 	// ITU-T T.871 (JPEG)
55 	{/*.y_shift=*/ 0, /*.matrix=*/ {{V(0.299), V(0.587), V(0.114)}, {-V(0.1687), -V(0.3313), V(0.5)}, {V(0.5), -V(0.4187), -V(0.0813)}}},
56 	// ITU-R BT.601-7
57 	{/*.y_shift=*/ 16, /*.matrix=*/ {{V(0.2568), V(0.5041), V(0.0979)}, {-V(0.1482), -V(0.291), V(0.4392)}, {V(0.4392), -V(0.3678), -V(0.0714)}}},
58 	// ITU-R BT.709-6
59 	{/*.y_shift=*/ 16, /*.matrix=*/ {{V(0.1826), V(0.6142), V(0.062)}, {-V(0.1006), -V(0.3386), V(0.4392)}, {V(0.4392), -V(0.3989), -V(0.0403)}}}
60 };
61 
/* The various layouts of YUV data we support */
/* Selector values assigned to YUV_FORMAT before each inclusion of a template header below */
#define YUV_FORMAT_420	1
#define YUV_FORMAT_422	2
#define YUV_FORMAT_NV12	3

/* The various formats of RGB pixel that we support */
/* Selector values assigned to RGB_FORMAT before each inclusion of a template header below */
#define RGB_FORMAT_RGB565	1
#define RGB_FORMAT_RGB24	2
#define RGB_FORMAT_RGBA		3
#define RGB_FORMAT_BGRA		4
#define RGB_FORMAT_ARGB		5
#define RGB_FORMAT_ABGR		6
74 
75 // divide by PRECISION_FACTOR and clamp to [0:255] interval
76 // input must be in the [-128*PRECISION_FACTOR:384*PRECISION_FACTOR] range
clampU8(int32_t v)77 static uint8_t clampU8(int32_t v)
78 {
79 	static const uint8_t lut[512] =
80 	{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
81 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
82 	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,
83 	47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,
84 	91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,
85 	126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,
86 	159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
87 	192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,
88 	225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,
89 	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
90 	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
91 	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
92 	255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255
93 	};
94 	return lut[(v+128*PRECISION_FACTOR)>>PRECISION];
95 }
96 
97 
/* Scalar converters from YUV_FORMAT_420 input to each supported RGB pixel format.
 * yuv_rgb_std_func.h (not visible here) is used as a template: each inclusion is
 * configured by the three macros defined right before it.
 * NOTE(review): the header is expected to emit a function named STD_FUNCTION_NAME
 * and to #undef the three macros, since they are redefined for the next
 * inclusion - confirm in the header. */
#define STD_FUNCTION_NAME	yuv420_rgb565_std
#define YUV_FORMAT			YUV_FORMAT_420
#define RGB_FORMAT			RGB_FORMAT_RGB565
#include "yuv_rgb_std_func.h"

#define STD_FUNCTION_NAME	yuv420_rgb24_std
#define YUV_FORMAT			YUV_FORMAT_420
#define RGB_FORMAT			RGB_FORMAT_RGB24
#include "yuv_rgb_std_func.h"

#define STD_FUNCTION_NAME	yuv420_rgba_std
#define YUV_FORMAT			YUV_FORMAT_420
#define RGB_FORMAT			RGB_FORMAT_RGBA
#include "yuv_rgb_std_func.h"

#define STD_FUNCTION_NAME	yuv420_bgra_std
#define YUV_FORMAT			YUV_FORMAT_420
#define RGB_FORMAT			RGB_FORMAT_BGRA
#include "yuv_rgb_std_func.h"

#define STD_FUNCTION_NAME	yuv420_argb_std
#define YUV_FORMAT			YUV_FORMAT_420
#define RGB_FORMAT			RGB_FORMAT_ARGB
#include "yuv_rgb_std_func.h"

#define STD_FUNCTION_NAME	yuv420_abgr_std
#define YUV_FORMAT			YUV_FORMAT_420
#define RGB_FORMAT			RGB_FORMAT_ABGR
#include "yuv_rgb_std_func.h"
127 
/* Scalar converters from YUV_FORMAT_422 input to each supported RGB pixel format.
 * Each inclusion of the yuv_rgb_std_func.h template header (not visible here) is
 * configured by the three macros defined right before it.
 * NOTE(review): the header is expected to #undef these macros after use - confirm. */
#define STD_FUNCTION_NAME	yuv422_rgb565_std
#define YUV_FORMAT			YUV_FORMAT_422
#define RGB_FORMAT			RGB_FORMAT_RGB565
#include "yuv_rgb_std_func.h"

#define STD_FUNCTION_NAME	yuv422_rgb24_std
#define YUV_FORMAT			YUV_FORMAT_422
#define RGB_FORMAT			RGB_FORMAT_RGB24
#include "yuv_rgb_std_func.h"

#define STD_FUNCTION_NAME	yuv422_rgba_std
#define YUV_FORMAT			YUV_FORMAT_422
#define RGB_FORMAT			RGB_FORMAT_RGBA
#include "yuv_rgb_std_func.h"

#define STD_FUNCTION_NAME	yuv422_bgra_std
#define YUV_FORMAT			YUV_FORMAT_422
#define RGB_FORMAT			RGB_FORMAT_BGRA
#include "yuv_rgb_std_func.h"

#define STD_FUNCTION_NAME	yuv422_argb_std
#define YUV_FORMAT			YUV_FORMAT_422
#define RGB_FORMAT			RGB_FORMAT_ARGB
#include "yuv_rgb_std_func.h"

#define STD_FUNCTION_NAME	yuv422_abgr_std
#define YUV_FORMAT			YUV_FORMAT_422
#define RGB_FORMAT			RGB_FORMAT_ABGR
#include "yuv_rgb_std_func.h"
157 
/* Scalar converters from YUV_FORMAT_NV12 input to each supported RGB pixel format.
 * Each inclusion of the yuv_rgb_std_func.h template header (not visible here) is
 * configured by the three macros defined right before it.
 * NOTE(review): the header is expected to #undef these macros after use - confirm. */
#define STD_FUNCTION_NAME	yuvnv12_rgb565_std
#define YUV_FORMAT			YUV_FORMAT_NV12
#define RGB_FORMAT			RGB_FORMAT_RGB565
#include "yuv_rgb_std_func.h"

#define STD_FUNCTION_NAME	yuvnv12_rgb24_std
#define YUV_FORMAT			YUV_FORMAT_NV12
#define RGB_FORMAT			RGB_FORMAT_RGB24
#include "yuv_rgb_std_func.h"

#define STD_FUNCTION_NAME	yuvnv12_rgba_std
#define YUV_FORMAT			YUV_FORMAT_NV12
#define RGB_FORMAT			RGB_FORMAT_RGBA
#include "yuv_rgb_std_func.h"

#define STD_FUNCTION_NAME	yuvnv12_bgra_std
#define YUV_FORMAT			YUV_FORMAT_NV12
#define RGB_FORMAT			RGB_FORMAT_BGRA
#include "yuv_rgb_std_func.h"

#define STD_FUNCTION_NAME	yuvnv12_argb_std
#define YUV_FORMAT			YUV_FORMAT_NV12
#define RGB_FORMAT			RGB_FORMAT_ARGB
#include "yuv_rgb_std_func.h"

#define STD_FUNCTION_NAME	yuvnv12_abgr_std
#define YUV_FORMAT			YUV_FORMAT_NV12
#define RGB_FORMAT			RGB_FORMAT_ABGR
#include "yuv_rgb_std_func.h"
187 
// Convert a packed RGB24 image to YUV 4:2:0 stored in separate Y, U and V planes
// (scalar implementation).
//
// width, height: size of the image in pixels. Pixels are processed by blocks of
//     2x2, so a trailing odd column or row is left untouched and nothing is
//     written at all when width or height is less than 2.
// RGB, RGB_stride: source pixels (3 bytes per pixel, read in R, G, B order) and
//     distance in bytes between the start of two consecutive source rows.
// Y, Y_stride: destination luma plane (one byte per pixel) and its row stride in bytes.
// U, V, UV_stride: destination chroma planes (one byte per 2x2 block of pixels,
//     computed from the sum of the four pixels divided by 4) and their common
//     row stride in bytes.
// yuv_type: selects the set of coefficients, used as an index in the RGB2YUV table.
void rgb24_yuv420_std(
	uint32_t width, uint32_t height,
	const uint8_t *RGB, uint32_t RGB_stride,
	uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride,
	YCbCrType yuv_type)
{
	const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]);
	const int32_t y_offset = (param->y_shift)<<PRECISION;
	const int32_t uv_offset = 128<<PRECISION;

	// extent covered by complete 2x2 blocks; computed by masking because the
	// unsigned expressions (width-1) and (height-1) wrap around to a huge value
	// when the dimension is 0, which made the loops run out of bounds
	const uint32_t even_width = width & ~(uint32_t)1;
	const uint32_t even_height = height & ~(uint32_t)1;

	uint32_t x, y;
	for(y=0; y<even_height; y+=2)
	{
		const uint8_t *rgb_ptr1=RGB+y*RGB_stride,
			*rgb_ptr2=RGB+(y+1)*RGB_stride;

		uint8_t *y_ptr1=Y+y*Y_stride,
			*y_ptr2=Y+(y+1)*Y_stride,
			*u_ptr=U+(y/2)*UV_stride,
			*v_ptr=V+(y/2)*UV_stride;

		for(x=0; x<even_width; x+=2)
		{
			// compute yuv for the four pixels, u and v values are summed
			int32_t y_tmp, u_tmp, v_tmp;

			// top left pixel
			y_tmp = param->matrix[0][0]*rgb_ptr1[0] + param->matrix[0][1]*rgb_ptr1[1] + param->matrix[0][2]*rgb_ptr1[2];
			u_tmp = param->matrix[1][0]*rgb_ptr1[0] + param->matrix[1][1]*rgb_ptr1[1] + param->matrix[1][2]*rgb_ptr1[2];
			v_tmp = param->matrix[2][0]*rgb_ptr1[0] + param->matrix[2][1]*rgb_ptr1[1] + param->matrix[2][2]*rgb_ptr1[2];
			y_ptr1[0]=clampU8(y_tmp+y_offset);

			// top right pixel
			y_tmp = param->matrix[0][0]*rgb_ptr1[3] + param->matrix[0][1]*rgb_ptr1[4] + param->matrix[0][2]*rgb_ptr1[5];
			u_tmp += param->matrix[1][0]*rgb_ptr1[3] + param->matrix[1][1]*rgb_ptr1[4] + param->matrix[1][2]*rgb_ptr1[5];
			v_tmp += param->matrix[2][0]*rgb_ptr1[3] + param->matrix[2][1]*rgb_ptr1[4] + param->matrix[2][2]*rgb_ptr1[5];
			y_ptr1[1]=clampU8(y_tmp+y_offset);

			// bottom left pixel
			y_tmp = param->matrix[0][0]*rgb_ptr2[0] + param->matrix[0][1]*rgb_ptr2[1] + param->matrix[0][2]*rgb_ptr2[2];
			u_tmp += param->matrix[1][0]*rgb_ptr2[0] + param->matrix[1][1]*rgb_ptr2[1] + param->matrix[1][2]*rgb_ptr2[2];
			v_tmp += param->matrix[2][0]*rgb_ptr2[0] + param->matrix[2][1]*rgb_ptr2[1] + param->matrix[2][2]*rgb_ptr2[2];
			y_ptr2[0]=clampU8(y_tmp+y_offset);

			// bottom right pixel
			y_tmp = param->matrix[0][0]*rgb_ptr2[3] + param->matrix[0][1]*rgb_ptr2[4] + param->matrix[0][2]*rgb_ptr2[5];
			u_tmp += param->matrix[1][0]*rgb_ptr2[3] + param->matrix[1][1]*rgb_ptr2[4] + param->matrix[1][2]*rgb_ptr2[5];
			v_tmp += param->matrix[2][0]*rgb_ptr2[3] + param->matrix[2][1]*rgb_ptr2[4] + param->matrix[2][2]*rgb_ptr2[5];
			y_ptr2[1]=clampU8(y_tmp+y_offset);

			// chroma is the average of the four pixels of the block
			u_ptr[0] = clampU8(u_tmp/4+uv_offset);
			v_ptr[0] = clampU8(v_tmp/4+uv_offset);

			rgb_ptr1 += 6;
			rgb_ptr2 += 6;
			y_ptr1 += 2;
			y_ptr2 += 2;
			u_ptr += 1;
			v_ptr += 1;
		}
	}
}
244 
245 #ifdef __SSE2__
246 
/* SSE2 converters from YUV_FORMAT_420 input to each supported RGB pixel format.
 * yuv_rgb_sse_func.h (not visible here) is used as a template: each inclusion is
 * configured by the macros defined right before it. Every converter comes in two
 * flavours: "_sse", built with SSE_ALIGNED defined, and "_sseu", built without it.
 * NOTE(review): SSE_ALIGNED presumably selects aligned loads/stores, and
 * STD_FUNCTION_NAME presumably names the scalar routine the SIMD one falls back
 * on; the header is also expected to #undef all of these macros after use -
 * confirm in the header. */
#define SSE_FUNCTION_NAME	yuv420_rgb565_sse
#define STD_FUNCTION_NAME	yuv420_rgb565_std
#define YUV_FORMAT			YUV_FORMAT_420
#define RGB_FORMAT			RGB_FORMAT_RGB565
#define SSE_ALIGNED
#include "yuv_rgb_sse_func.h"

#define SSE_FUNCTION_NAME	yuv420_rgb565_sseu
#define STD_FUNCTION_NAME	yuv420_rgb565_std
#define YUV_FORMAT			YUV_FORMAT_420
#define RGB_FORMAT			RGB_FORMAT_RGB565
#include "yuv_rgb_sse_func.h"

#define SSE_FUNCTION_NAME	yuv420_rgb24_sse
#define STD_FUNCTION_NAME	yuv420_rgb24_std
#define YUV_FORMAT			YUV_FORMAT_420
#define RGB_FORMAT			RGB_FORMAT_RGB24
#define SSE_ALIGNED
#include "yuv_rgb_sse_func.h"

#define SSE_FUNCTION_NAME	yuv420_rgb24_sseu
#define STD_FUNCTION_NAME	yuv420_rgb24_std
#define YUV_FORMAT			YUV_FORMAT_420
#define RGB_FORMAT			RGB_FORMAT_RGB24
#include "yuv_rgb_sse_func.h"

#define SSE_FUNCTION_NAME	yuv420_rgba_sse
#define STD_FUNCTION_NAME	yuv420_rgba_std
#define YUV_FORMAT			YUV_FORMAT_420
#define RGB_FORMAT			RGB_FORMAT_RGBA
#define SSE_ALIGNED
#include "yuv_rgb_sse_func.h"

#define SSE_FUNCTION_NAME	yuv420_rgba_sseu
#define STD_FUNCTION_NAME	yuv420_rgba_std
#define YUV_FORMAT			YUV_FORMAT_420
#define RGB_FORMAT			RGB_FORMAT_RGBA
#include "yuv_rgb_sse_func.h"

#define SSE_FUNCTION_NAME	yuv420_bgra_sse
#define STD_FUNCTION_NAME	yuv420_bgra_std
#define YUV_FORMAT			YUV_FORMAT_420
#define RGB_FORMAT			RGB_FORMAT_BGRA
#define SSE_ALIGNED
#include "yuv_rgb_sse_func.h"

#define SSE_FUNCTION_NAME	yuv420_bgra_sseu
#define STD_FUNCTION_NAME	yuv420_bgra_std
#define YUV_FORMAT			YUV_FORMAT_420
#define RGB_FORMAT			RGB_FORMAT_BGRA
#include "yuv_rgb_sse_func.h"

#define SSE_FUNCTION_NAME	yuv420_argb_sse
#define STD_FUNCTION_NAME	yuv420_argb_std
#define YUV_FORMAT			YUV_FORMAT_420
#define RGB_FORMAT			RGB_FORMAT_ARGB
#define SSE_ALIGNED
#include "yuv_rgb_sse_func.h"

#define SSE_FUNCTION_NAME	yuv420_argb_sseu
#define STD_FUNCTION_NAME	yuv420_argb_std
#define YUV_FORMAT			YUV_FORMAT_420
#define RGB_FORMAT			RGB_FORMAT_ARGB
#include "yuv_rgb_sse_func.h"

#define SSE_FUNCTION_NAME	yuv420_abgr_sse
#define STD_FUNCTION_NAME	yuv420_abgr_std
#define YUV_FORMAT			YUV_FORMAT_420
#define RGB_FORMAT			RGB_FORMAT_ABGR
#define SSE_ALIGNED
#include "yuv_rgb_sse_func.h"

#define SSE_FUNCTION_NAME	yuv420_abgr_sseu
#define STD_FUNCTION_NAME	yuv420_abgr_std
#define YUV_FORMAT			YUV_FORMAT_420
#define RGB_FORMAT			RGB_FORMAT_ABGR
#include "yuv_rgb_sse_func.h"
324 
/* SSE2 converters from YUV_FORMAT_422 input to each supported RGB pixel format.
 * Each inclusion of the yuv_rgb_sse_func.h template header (not visible here) is
 * configured by the macros defined right before it; the "_sse" flavour is built
 * with SSE_ALIGNED defined and the "_sseu" flavour without it.
 * NOTE(review): the header is expected to #undef these macros after use - confirm. */
#define SSE_FUNCTION_NAME	yuv422_rgb565_sse
#define STD_FUNCTION_NAME	yuv422_rgb565_std
#define YUV_FORMAT			YUV_FORMAT_422
#define RGB_FORMAT			RGB_FORMAT_RGB565
#define SSE_ALIGNED
#include "yuv_rgb_sse_func.h"

#define SSE_FUNCTION_NAME	yuv422_rgb565_sseu
#define STD_FUNCTION_NAME	yuv422_rgb565_std
#define YUV_FORMAT			YUV_FORMAT_422
#define RGB_FORMAT			RGB_FORMAT_RGB565
#include "yuv_rgb_sse_func.h"

#define SSE_FUNCTION_NAME	yuv422_rgb24_sse
#define STD_FUNCTION_NAME	yuv422_rgb24_std
#define YUV_FORMAT			YUV_FORMAT_422
#define RGB_FORMAT			RGB_FORMAT_RGB24
#define SSE_ALIGNED
#include "yuv_rgb_sse_func.h"

#define SSE_FUNCTION_NAME	yuv422_rgb24_sseu
#define STD_FUNCTION_NAME	yuv422_rgb24_std
#define YUV_FORMAT			YUV_FORMAT_422
#define RGB_FORMAT			RGB_FORMAT_RGB24
#include "yuv_rgb_sse_func.h"

#define SSE_FUNCTION_NAME	yuv422_rgba_sse
#define STD_FUNCTION_NAME	yuv422_rgba_std
#define YUV_FORMAT			YUV_FORMAT_422
#define RGB_FORMAT			RGB_FORMAT_RGBA
#define SSE_ALIGNED
#include "yuv_rgb_sse_func.h"

#define SSE_FUNCTION_NAME	yuv422_rgba_sseu
#define STD_FUNCTION_NAME	yuv422_rgba_std
#define YUV_FORMAT			YUV_FORMAT_422
#define RGB_FORMAT			RGB_FORMAT_RGBA
#include "yuv_rgb_sse_func.h"

#define SSE_FUNCTION_NAME	yuv422_bgra_sse
#define STD_FUNCTION_NAME	yuv422_bgra_std
#define YUV_FORMAT			YUV_FORMAT_422
#define RGB_FORMAT			RGB_FORMAT_BGRA
#define SSE_ALIGNED
#include "yuv_rgb_sse_func.h"

#define SSE_FUNCTION_NAME	yuv422_bgra_sseu
#define STD_FUNCTION_NAME	yuv422_bgra_std
#define YUV_FORMAT			YUV_FORMAT_422
#define RGB_FORMAT			RGB_FORMAT_BGRA
#include "yuv_rgb_sse_func.h"

#define SSE_FUNCTION_NAME	yuv422_argb_sse
#define STD_FUNCTION_NAME	yuv422_argb_std
#define YUV_FORMAT			YUV_FORMAT_422
#define RGB_FORMAT			RGB_FORMAT_ARGB
#define SSE_ALIGNED
#include "yuv_rgb_sse_func.h"

#define SSE_FUNCTION_NAME	yuv422_argb_sseu
#define STD_FUNCTION_NAME	yuv422_argb_std
#define YUV_FORMAT			YUV_FORMAT_422
#define RGB_FORMAT			RGB_FORMAT_ARGB
#include "yuv_rgb_sse_func.h"

#define SSE_FUNCTION_NAME	yuv422_abgr_sse
#define STD_FUNCTION_NAME	yuv422_abgr_std
#define YUV_FORMAT			YUV_FORMAT_422
#define RGB_FORMAT			RGB_FORMAT_ABGR
#define SSE_ALIGNED
#include "yuv_rgb_sse_func.h"

#define SSE_FUNCTION_NAME	yuv422_abgr_sseu
#define STD_FUNCTION_NAME	yuv422_abgr_std
#define YUV_FORMAT			YUV_FORMAT_422
#define RGB_FORMAT			RGB_FORMAT_ABGR
#include "yuv_rgb_sse_func.h"
402 
/* SSE2 converters from YUV_FORMAT_NV12 input to each supported RGB pixel format.
 * Each inclusion of the yuv_rgb_sse_func.h template header (not visible here) is
 * configured by the macros defined right before it; the "_sse" flavour is built
 * with SSE_ALIGNED defined and the "_sseu" flavour without it.
 * NOTE(review): the header is expected to #undef these macros after use - confirm. */
#define SSE_FUNCTION_NAME	yuvnv12_rgb565_sse
#define STD_FUNCTION_NAME	yuvnv12_rgb565_std
#define YUV_FORMAT			YUV_FORMAT_NV12
#define RGB_FORMAT			RGB_FORMAT_RGB565
#define SSE_ALIGNED
#include "yuv_rgb_sse_func.h"

#define SSE_FUNCTION_NAME	yuvnv12_rgb565_sseu
#define STD_FUNCTION_NAME	yuvnv12_rgb565_std
#define YUV_FORMAT			YUV_FORMAT_NV12
#define RGB_FORMAT			RGB_FORMAT_RGB565
#include "yuv_rgb_sse_func.h"

#define SSE_FUNCTION_NAME	yuvnv12_rgb24_sse
#define STD_FUNCTION_NAME	yuvnv12_rgb24_std
#define YUV_FORMAT			YUV_FORMAT_NV12
#define RGB_FORMAT			RGB_FORMAT_RGB24
#define SSE_ALIGNED
#include "yuv_rgb_sse_func.h"

#define SSE_FUNCTION_NAME	yuvnv12_rgb24_sseu
#define STD_FUNCTION_NAME	yuvnv12_rgb24_std
#define YUV_FORMAT			YUV_FORMAT_NV12
#define RGB_FORMAT			RGB_FORMAT_RGB24
#include "yuv_rgb_sse_func.h"

#define SSE_FUNCTION_NAME	yuvnv12_rgba_sse
#define STD_FUNCTION_NAME	yuvnv12_rgba_std
#define YUV_FORMAT			YUV_FORMAT_NV12
#define RGB_FORMAT			RGB_FORMAT_RGBA
#define SSE_ALIGNED
#include "yuv_rgb_sse_func.h"

#define SSE_FUNCTION_NAME	yuvnv12_rgba_sseu
#define STD_FUNCTION_NAME	yuvnv12_rgba_std
#define YUV_FORMAT			YUV_FORMAT_NV12
#define RGB_FORMAT			RGB_FORMAT_RGBA
#include "yuv_rgb_sse_func.h"

#define SSE_FUNCTION_NAME	yuvnv12_bgra_sse
#define STD_FUNCTION_NAME	yuvnv12_bgra_std
#define YUV_FORMAT			YUV_FORMAT_NV12
#define RGB_FORMAT			RGB_FORMAT_BGRA
#define SSE_ALIGNED
#include "yuv_rgb_sse_func.h"

#define SSE_FUNCTION_NAME	yuvnv12_bgra_sseu
#define STD_FUNCTION_NAME	yuvnv12_bgra_std
#define YUV_FORMAT			YUV_FORMAT_NV12
#define RGB_FORMAT			RGB_FORMAT_BGRA
#include "yuv_rgb_sse_func.h"

#define SSE_FUNCTION_NAME	yuvnv12_argb_sse
#define STD_FUNCTION_NAME	yuvnv12_argb_std
#define YUV_FORMAT			YUV_FORMAT_NV12
#define RGB_FORMAT			RGB_FORMAT_ARGB
#define SSE_ALIGNED
#include "yuv_rgb_sse_func.h"

#define SSE_FUNCTION_NAME	yuvnv12_argb_sseu
#define STD_FUNCTION_NAME	yuvnv12_argb_std
#define YUV_FORMAT			YUV_FORMAT_NV12
#define RGB_FORMAT			RGB_FORMAT_ARGB
#include "yuv_rgb_sse_func.h"

#define SSE_FUNCTION_NAME	yuvnv12_abgr_sse
#define STD_FUNCTION_NAME	yuvnv12_abgr_std
#define YUV_FORMAT			YUV_FORMAT_NV12
#define RGB_FORMAT			RGB_FORMAT_ABGR
#define SSE_ALIGNED
#include "yuv_rgb_sse_func.h"

#define SSE_FUNCTION_NAME	yuvnv12_abgr_sseu
#define STD_FUNCTION_NAME	yuvnv12_abgr_std
#define YUV_FORMAT			YUV_FORMAT_NV12
#define RGB_FORMAT			RGB_FORMAT_ABGR
#include "yuv_rgb_sse_func.h"
480 
481 
/* First half of a de-interleaving pass: byte-interleave each of RGB1..RGB3 with
 * its counterpart RGB4..RGB6 and store the low/high halves in R1/R2, G1/G2 and
 * B1/B2. All arguments are __m128i lvalues. */
#define UNPACK_RGB24_32_STEP1(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
R1 = _mm_unpacklo_epi8(RGB1, RGB4); \
R2 = _mm_unpackhi_epi8(RGB1, RGB4); \
G1 = _mm_unpacklo_epi8(RGB2, RGB5); \
G2 = _mm_unpackhi_epi8(RGB2, RGB5); \
B1 = _mm_unpacklo_epi8(RGB3, RGB6); \
B2 = _mm_unpackhi_epi8(RGB3, RGB6);

/* Second half of a de-interleaving pass: byte-interleave the STEP1 results
 * pairwise (R1 with G2, R2 with B1, G1 with B2) back into RGB1..RGB6, which are
 * overwritten. Note that the last line ends with a line continuation, so the
 * blank line that follows the macro is part of it and must be kept. */
#define UNPACK_RGB24_32_STEP2(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
RGB1 = _mm_unpacklo_epi8(R1, G2); \
RGB2 = _mm_unpackhi_epi8(R1, G2); \
RGB3 = _mm_unpacklo_epi8(R2, B1); \
RGB4 = _mm_unpackhi_epi8(R2, B1); \
RGB5 = _mm_unpacklo_epi8(G1, B2); \
RGB6 = _mm_unpackhi_epi8(G1, B2); \

/* Complete de-interleaving: runs STEP1, STEP2, STEP1, STEP2, STEP1. RGB1..RGB6
 * are used as scratch and clobbered; the final results are left in R1, R2, G1,
 * G2, B1 and B2. RGB2YUV_32 consumes R1/G1/B1 as the separated channels of the
 * pixels loaded in RGB1..RGB3 and R2/G2/B2 as those of RGB4..RGB6. Like STEP2,
 * the last line ends with a line continuation: keep the blank line after it. */
#define UNPACK_RGB24_32(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
UNPACK_RGB24_32_STEP1(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
UNPACK_RGB24_32_STEP2(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
UNPACK_RGB24_32_STEP1(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
UNPACK_RGB24_32_STEP2(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
UNPACK_RGB24_32_STEP1(RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, R1, R2, G1, G2, B1, B2) \
504 
/* Convert eight pixels from RGB to YUV. R, G and B are __m128i holding the
 * channel values in eight 16-bit lanes; Y, U and V receive the results, also as
 * eight 16-bit lanes, already divided by PRECISION_FACTOR but not yet clamped.
 * This applies the same formula as the scalar code: weighted sum of the three
 * channels, plus the offset (y_shift for Y, 128 for U and V) scaled by
 * PRECISION_FACTOR, then arithmetic shift right by PRECISION.
 * A variable named param pointing to the RGB2YUVParam to use must be in scope.
 * The products and sums are computed on 16 bits (_mm_mullo_epi16 and
 * _mm_add_epi16), so the coefficients must keep every intermediate value within
 * the int16_t range. */
#define RGB2YUV_16(R, G, B, Y, U, V) \
Y = _mm_add_epi16(_mm_mullo_epi16(R, _mm_set1_epi16(param->matrix[0][0])), \
		_mm_mullo_epi16(G, _mm_set1_epi16(param->matrix[0][1]))); \
Y = _mm_add_epi16(Y, _mm_mullo_epi16(B, _mm_set1_epi16(param->matrix[0][2]))); \
Y = _mm_add_epi16(Y, _mm_set1_epi16((param->y_shift)<<PRECISION)); \
Y = _mm_srai_epi16(Y, PRECISION); \
U = _mm_add_epi16(_mm_mullo_epi16(R, _mm_set1_epi16(param->matrix[1][0])), \
		_mm_mullo_epi16(G, _mm_set1_epi16(param->matrix[1][1]))); \
U = _mm_add_epi16(U, _mm_mullo_epi16(B, _mm_set1_epi16(param->matrix[1][2]))); \
U = _mm_add_epi16(U, _mm_set1_epi16(128<<PRECISION)); \
U = _mm_srai_epi16(U, PRECISION); \
V = _mm_add_epi16(_mm_mullo_epi16(R, _mm_set1_epi16(param->matrix[2][0])), \
		_mm_mullo_epi16(G, _mm_set1_epi16(param->matrix[2][1]))); \
V = _mm_add_epi16(V, _mm_mullo_epi16(B, _mm_set1_epi16(param->matrix[2][2]))); \
V = _mm_add_epi16(V, _mm_set1_epi16(128<<PRECISION)); \
V = _mm_srai_epi16(V, PRECISION);
521 
/* Convert a tile of 32 pixels by 2 rows from packed RGB24 to YUV 4:2:0.
 *
 * The macro expands to declarations followed by statements, so it must be used
 * at most once per scope. It expects the following names to be in scope:
 *   rgb_ptr1, rgb_ptr2: start of the tile in the first and second source row
 *                       (96 bytes are read from each)
 *   y_ptr1, y_ptr2:     destination of the luma of each row (32 bytes written to each)
 *   u_ptr, v_ptr:       destination of the chroma (16 bytes written to each)
 *   param:              pointer to the RGB2YUVParam to use (read by RGB2YUV_16)
 *   LOAD_SI128, SAVE_SI128: 128-bit load and store operations to use
 * The pointers themselves are not advanced.
 *
 * The tile is handled as two halves of 16 pixels by 2 rows. For each half, the
 * channels are separated with UNPACK_RGB24_32, converted eight pixels at a time
 * with RGB2YUV_16, then packed to bytes with unsigned saturation. Chroma is
 * first averaged vertically between the two rows, then horizontally between the
 * odd and even samples of each row, using _mm_avg_epu8 for both steps. */
#define RGB2YUV_32 \
	__m128i r1, r2, b1, b2, g1, g2; \
	__m128i r_16, g_16, b_16; \
	__m128i y1_16, y2_16, u1_16, u2_16, v1_16, v2_16, y, u1, u2, v1, v2, u1_tmp, u2_tmp, v1_tmp, v2_tmp; \
	__m128i rgb1 = LOAD_SI128((const __m128i*)(rgb_ptr1)), \
		rgb2 = LOAD_SI128((const __m128i*)(rgb_ptr1+16)), \
		rgb3 = LOAD_SI128((const __m128i*)(rgb_ptr1+32)), \
		rgb4 = LOAD_SI128((const __m128i*)(rgb_ptr2)), \
		rgb5 = LOAD_SI128((const __m128i*)(rgb_ptr2+16)), \
		rgb6 = LOAD_SI128((const __m128i*)(rgb_ptr2+32)); \
	/* unpack rgb24 data to r, g and b data in separate channels*/ \
	UNPACK_RGB24_32(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, r1, r2, g1, g2, b1, b2) \
	/* process pixels of first line */ \
	r_16 = _mm_unpacklo_epi8(r1, _mm_setzero_si128()); \
	g_16 = _mm_unpacklo_epi8(g1, _mm_setzero_si128()); \
	b_16 = _mm_unpacklo_epi8(b1, _mm_setzero_si128()); \
	RGB2YUV_16(r_16, g_16, b_16, y1_16, u1_16, v1_16) \
	r_16 = _mm_unpackhi_epi8(r1, _mm_setzero_si128()); \
	g_16 = _mm_unpackhi_epi8(g1, _mm_setzero_si128()); \
	b_16 = _mm_unpackhi_epi8(b1, _mm_setzero_si128()); \
	RGB2YUV_16(r_16, g_16, b_16, y2_16, u2_16, v2_16) \
	y = _mm_packus_epi16(y1_16, y2_16); \
	u1 = _mm_packus_epi16(u1_16, u2_16); \
	v1 = _mm_packus_epi16(v1_16, v2_16); \
	/* save Y values */ \
	SAVE_SI128((__m128i*)(y_ptr1), y); \
	/* process pixels of second line */ \
	r_16 = _mm_unpacklo_epi8(r2, _mm_setzero_si128()); \
	g_16 = _mm_unpacklo_epi8(g2, _mm_setzero_si128()); \
	b_16 = _mm_unpacklo_epi8(b2, _mm_setzero_si128()); \
	RGB2YUV_16(r_16, g_16, b_16, y1_16, u1_16, v1_16) \
	r_16 = _mm_unpackhi_epi8(r2, _mm_setzero_si128()); \
	g_16 = _mm_unpackhi_epi8(g2, _mm_setzero_si128()); \
	b_16 = _mm_unpackhi_epi8(b2, _mm_setzero_si128()); \
	RGB2YUV_16(r_16, g_16, b_16, y2_16, u2_16, v2_16) \
	y = _mm_packus_epi16(y1_16, y2_16); \
	u2 = _mm_packus_epi16(u1_16, u2_16); \
	v2 = _mm_packus_epi16(v1_16, v2_16); \
	/* save Y values */ \
	SAVE_SI128((__m128i*)(y_ptr2), y); \
	/* vertical subsampling of u/v values */ \
	u1_tmp = _mm_avg_epu8(u1, u2); \
	v1_tmp = _mm_avg_epu8(v1, v2); \
	/* do the same again with next data */ \
	rgb1 = LOAD_SI128((const __m128i*)(rgb_ptr1+48)); \
	rgb2 = LOAD_SI128((const __m128i*)(rgb_ptr1+64)); \
	rgb3 = LOAD_SI128((const __m128i*)(rgb_ptr1+80)); \
	rgb4 = LOAD_SI128((const __m128i*)(rgb_ptr2+48)); \
	rgb5 = LOAD_SI128((const __m128i*)(rgb_ptr2+64)); \
	rgb6 = LOAD_SI128((const __m128i*)(rgb_ptr2+80)); \
	/* unpack rgb24 data to r, g and b data in separate channels*/ \
	UNPACK_RGB24_32(rgb1, rgb2, rgb3, rgb4, rgb5, rgb6, r1, r2, g1, g2, b1, b2) \
	/* process pixels of first line */ \
	r_16 = _mm_unpacklo_epi8(r1, _mm_setzero_si128()); \
	g_16 = _mm_unpacklo_epi8(g1, _mm_setzero_si128()); \
	b_16 = _mm_unpacklo_epi8(b1, _mm_setzero_si128()); \
	RGB2YUV_16(r_16, g_16, b_16, y1_16, u1_16, v1_16) \
	r_16 = _mm_unpackhi_epi8(r1, _mm_setzero_si128()); \
	g_16 = _mm_unpackhi_epi8(g1, _mm_setzero_si128()); \
	b_16 = _mm_unpackhi_epi8(b1, _mm_setzero_si128()); \
	RGB2YUV_16(r_16, g_16, b_16, y2_16, u2_16, v2_16) \
	y = _mm_packus_epi16(y1_16, y2_16); \
	u1 = _mm_packus_epi16(u1_16, u2_16); \
	v1 = _mm_packus_epi16(v1_16, v2_16); \
	/* save Y values */ \
	SAVE_SI128((__m128i*)(y_ptr1+16), y); \
	/* process pixels of second line */ \
	r_16 = _mm_unpacklo_epi8(r2, _mm_setzero_si128()); \
	g_16 = _mm_unpacklo_epi8(g2, _mm_setzero_si128()); \
	b_16 = _mm_unpacklo_epi8(b2, _mm_setzero_si128()); \
	RGB2YUV_16(r_16, g_16, b_16, y1_16, u1_16, v1_16) \
	r_16 = _mm_unpackhi_epi8(r2, _mm_setzero_si128()); \
	g_16 = _mm_unpackhi_epi8(g2, _mm_setzero_si128()); \
	b_16 = _mm_unpackhi_epi8(b2, _mm_setzero_si128()); \
	RGB2YUV_16(r_16, g_16, b_16, y2_16, u2_16, v2_16) \
	y = _mm_packus_epi16(y1_16, y2_16); \
	u2 = _mm_packus_epi16(u1_16, u2_16); \
	v2 = _mm_packus_epi16(v1_16, v2_16); \
	/* save Y values */ \
	SAVE_SI128((__m128i*)(y_ptr2+16), y); \
	/* vertical subsampling of u/v values */ \
	u2_tmp = _mm_avg_epu8(u1, u2); \
	v2_tmp = _mm_avg_epu8(v1, v2); \
	/* horizontal subsampling of u/v values */ \
	u1 = _mm_packus_epi16(_mm_srl_epi16(u1_tmp, _mm_cvtsi32_si128(8)), _mm_srl_epi16(u2_tmp, _mm_cvtsi32_si128(8))); \
	v1 = _mm_packus_epi16(_mm_srl_epi16(v1_tmp, _mm_cvtsi32_si128(8)), _mm_srl_epi16(v2_tmp, _mm_cvtsi32_si128(8))); \
	u2 = _mm_packus_epi16(_mm_and_si128(u1_tmp, _mm_set1_epi16(0xFF)), _mm_and_si128(u2_tmp, _mm_set1_epi16(0xFF))); \
	v2 = _mm_packus_epi16(_mm_and_si128(v1_tmp, _mm_set1_epi16(0xFF)), _mm_and_si128(v2_tmp, _mm_set1_epi16(0xFF))); \
	u1 = _mm_avg_epu8(u1, u2); \
	v1 = _mm_avg_epu8(v1, v2); \
	SAVE_SI128((__m128i*)(u_ptr), u1); \
	SAVE_SI128((__m128i*)(v_ptr), v1);
614 
// Convert a packed RGB24 image to YUV 4:2:0 stored in separate Y, U and V planes,
// using SSE2 with aligned loads (_mm_load_si128) and streaming stores
// (_mm_stream_si128).
//
// Every address accessed must be 16-byte aligned: the RGB, Y, U and V pointers
// as well as RGB_stride, Y_stride and UV_stride have to be multiples of 16.
// Use rgb24_yuv420_sseu when this cannot be guaranteed.
//
// width, height: size of the image in pixels. Pixels are processed by tiles of
//     32 columns by 2 rows: the last (width % 32) columns and a trailing odd
//     row are NOT converted, and nothing is written when width is less than 32
//     or height is less than 2.
// RGB, RGB_stride: source pixels (3 bytes per pixel) and row stride in bytes.
// Y, Y_stride: destination luma plane and its row stride in bytes.
// U, V, UV_stride: destination chroma planes (one sample per 2x2 block of
//     pixels) and their common row stride in bytes.
// yuv_type: selects the set of coefficients, used as an index in the RGB2YUV table.
void rgb24_yuv420_sse(uint32_t width, uint32_t height,
	const uint8_t *RGB, uint32_t RGB_stride,
	uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride,
	YCbCrType yuv_type)
{
	#define LOAD_SI128 _mm_load_si128
	#define SAVE_SI128 _mm_stream_si128
	const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]);

	// extent covered by complete tiles; computed by masking because the
	// unsigned expressions (width-31) and (height-1) wrap around to a huge
	// value for width<31 or height==0, which made the loops run out of bounds
	const uint32_t simd_width = width & ~(uint32_t)31;
	const uint32_t even_height = height & ~(uint32_t)1;

	uint32_t xpos, ypos;
	for(ypos=0; ypos<even_height; ypos+=2)
	{
		const uint8_t *rgb_ptr1=RGB+ypos*RGB_stride,
			*rgb_ptr2=RGB+(ypos+1)*RGB_stride;

		uint8_t *y_ptr1=Y+ypos*Y_stride,
			*y_ptr2=Y+(ypos+1)*Y_stride,
			*u_ptr=U+(ypos/2)*UV_stride,
			*v_ptr=V+(ypos/2)*UV_stride;

		for(xpos=0; xpos<simd_width; xpos+=32)
		{
			RGB2YUV_32

			// advance to the next tile: 32 pixels of 3 bytes per row,
			// 32 luma samples per row, 16 samples per chroma plane
			rgb_ptr1+=96;
			rgb_ptr2+=96;
			y_ptr1+=32;
			y_ptr2+=32;
			u_ptr+=16;
			v_ptr+=16;
		}
	}
	#undef LOAD_SI128
	#undef SAVE_SI128
}
650 
// Convert a packed RGB24 image to YUV 4:2:0 stored in separate Y, U and V planes,
// using SSE2 with unaligned loads and stores (_mm_loadu_si128 and
// _mm_storeu_si128): there is no alignment requirement on pointers or strides.
//
// width, height: size of the image in pixels. Pixels are processed by tiles of
//     32 columns by 2 rows: the last (width % 32) columns and a trailing odd
//     row are NOT converted, and nothing is written when width is less than 32
//     or height is less than 2.
// RGB, RGB_stride: source pixels (3 bytes per pixel) and row stride in bytes.
// Y, Y_stride: destination luma plane and its row stride in bytes.
// U, V, UV_stride: destination chroma planes (one sample per 2x2 block of
//     pixels) and their common row stride in bytes.
// yuv_type: selects the set of coefficients, used as an index in the RGB2YUV table.
void rgb24_yuv420_sseu(uint32_t width, uint32_t height,
	const uint8_t *RGB, uint32_t RGB_stride,
	uint8_t *Y, uint8_t *U, uint8_t *V, uint32_t Y_stride, uint32_t UV_stride,
	YCbCrType yuv_type)
{
	#define LOAD_SI128 _mm_loadu_si128
	#define SAVE_SI128 _mm_storeu_si128
	const RGB2YUVParam *const param = &(RGB2YUV[yuv_type]);

	// extent covered by complete tiles; computed by masking because the
	// unsigned expressions (width-31) and (height-1) wrap around to a huge
	// value for width<31 or height==0, which made the loops run out of bounds
	const uint32_t simd_width = width & ~(uint32_t)31;
	const uint32_t even_height = height & ~(uint32_t)1;

	uint32_t xpos, ypos;
	for(ypos=0; ypos<even_height; ypos+=2)
	{
		const uint8_t *rgb_ptr1=RGB+ypos*RGB_stride,
			*rgb_ptr2=RGB+(ypos+1)*RGB_stride;

		uint8_t *y_ptr1=Y+ypos*Y_stride,
			*y_ptr2=Y+(ypos+1)*Y_stride,
			*u_ptr=U+(ypos/2)*UV_stride,
			*v_ptr=V+(ypos/2)*UV_stride;

		for(xpos=0; xpos<simd_width; xpos+=32)
		{
			RGB2YUV_32

			// advance to the next tile: 32 pixels of 3 bytes per row,
			// 32 luma samples per row, 16 samples per chroma plane
			rgb_ptr1+=96;
			rgb_ptr2+=96;
			y_ptr1+=32;
			y_ptr2+=32;
			u_ptr+=16;
			v_ptr+=16;
		}
	}
	#undef LOAD_SI128
	#undef SAVE_SI128
}
686 
687 
688 #endif //__SSE2__
689 
690 #endif /* SDL_HAVE_YUV */
691