/* FreeRDP: A Remote Desktop Protocol Client
 * Optimized Color conversion operations.
 * vi:ts=4 sw=4:
 *
 * Copyright 2011 Stephen Erisman
 * Copyright 2011 Norbert Federa <norbert.federa@thincast.com>
 * Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License. You may obtain
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>

#ifdef WITH_SSE2
#include <emmintrin.h>
#elif defined(WITH_NEON)
#include <arm_neon.h>
#endif /* WITH_SSE2 else WITH_NEON */

#include "prim_internal.h"
#include "prim_templates.h"

static primitives_t* generic = NULL;

#ifdef WITH_SSE2

#ifdef __GNUC__
#define GNU_INLINE __attribute__((__gnu_inline__, __always_inline__, __artificial__))
#else
#define GNU_INLINE
#endif

#define CACHE_LINE_BYTES 64

#define _mm_between_epi16(_val, _min, _max)                    \
	do                                                         \
	{                                                          \
		_val = _mm_min_epi16(_max, _mm_max_epi16(_val, _min)); \
	} while (0)
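
/* Usage sketch (illustrative only, not part of the build): clamp eight
 * signed 16-bit lanes to [0, 255], exactly as the conversion loops below
 * use the macro.
 *
 *   __m128i v = _mm_set1_epi16(300);
 *   _mm_between_epi16(v, _mm_setzero_si128(), _mm_set1_epi16(255));
 *   // every lane of v is now 255
 */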

#ifdef DO_PREFETCH
/*---------------------------------------------------------------------------*/
static inline void GNU_INLINE _mm_prefetch_buffer(char* buffer, int num_bytes)
{
	__m128i* buf = (__m128i*)buffer;
	unsigned int i;

	for (i = 0; i < (num_bytes / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i)))
	{
		_mm_prefetch((char*)(&buf[i]), _MM_HINT_NTA);
	}
}
#endif /* DO_PREFETCH */

/*---------------------------------------------------------------------------*/
static pstatus_t sse2_yCbCrToRGB_16s16s_P3P3(const INT16* const pSrc[3], int srcStep,
                                             INT16* pDst[3], int dstStep,
                                             const prim_size_t* roi) /* region of interest */
{
	__m128i zero, max, r_cr, g_cb, g_cr, b_cb, c4096;
	__m128i *y_buf, *cb_buf, *cr_buf, *r_buf, *g_buf, *b_buf;
	UINT32 yp;
	int srcbump, dstbump, imax;

	if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) ||
	    ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst[0]) & 0x0f) ||
	    ((ULONG_PTR)(pDst[1]) & 0x0f) || ((ULONG_PTR)(pDst[2]) & 0x0f) || (roi->width & 0x07) ||
	    (srcStep & 127) || (dstStep & 127))
	{
		/* We can't maintain 16-byte alignment. */
		return generic->yCbCrToRGB_16s16s_P3P3(pSrc, srcStep, pDst, dstStep, roi);
	}

	zero = _mm_setzero_si128();
	max = _mm_set1_epi16(255);
	y_buf = (__m128i*)(pSrc[0]);
	cb_buf = (__m128i*)(pSrc[1]);
	cr_buf = (__m128i*)(pSrc[2]);
	r_buf = (__m128i*)(pDst[0]);
	g_buf = (__m128i*)(pDst[1]);
	b_buf = (__m128i*)(pDst[2]);
	r_cr = _mm_set1_epi16(22986);  /*  1.403 << 14 */
	g_cb = _mm_set1_epi16(-5636);  /* -0.344 << 14 */
	g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */
	b_cb = _mm_set1_epi16(28999);  /*  1.770 << 14 */
	c4096 = _mm_set1_epi16(4096);
	srcbump = srcStep / sizeof(__m128i);
	dstbump = dstStep / sizeof(__m128i);
#ifdef DO_PREFETCH

	/* Prefetch Y's, Cb's, and Cr's. */
	for (yp = 0; yp < roi->height; yp++)
	{
		int i;

		for (i = 0; i < roi->width * sizeof(INT16) / sizeof(__m128i);
		     i += (CACHE_LINE_BYTES / sizeof(__m128i)))
		{
			_mm_prefetch((char*)(&y_buf[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&cb_buf[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&cr_buf[i]), _MM_HINT_NTA);
		}

		y_buf += srcbump;
		cb_buf += srcbump;
		cr_buf += srcbump;
	}

	y_buf = (__m128i*)(pSrc[0]);
	cb_buf = (__m128i*)(pSrc[1]);
	cr_buf = (__m128i*)(pSrc[2]);
#endif /* DO_PREFETCH */
	imax = roi->width * sizeof(INT16) / sizeof(__m128i);

	for (yp = 0; yp < roi->height; ++yp)
	{
		int i;

		for (i = 0; i < imax; i++)
		{
			/* In order to use SSE2 signed 16-bit integer multiplication
			 * we need to convert the floating point factors to signed int
			 * without losing information.
			 * The result of this multiplication is 32 bit and we have two
			 * SSE instructions that return either the hi or lo word.
			 * Thus we will multiply the factors by the highest possible 2^n,
			 * take the upper 16 bits of the signed 32-bit result
			 * (_mm_mulhi_epi16) and correct this result by multiplying
			 * it by 2^(16-n).
			 *
			 * For the given factors in the conversion matrix the best
			 * possible n is 14.
			 *
			 * Example for calculating r:
			 * r = (y>>5) + 128 + (cr*1.403)>>5             // our base formula
			 * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5   // see above
			 * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5     // simplification
			 * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
			 */
			/* y = (y_r_buf[i] + 4096) >> 2 */
			__m128i y, cb, cr, r, g, b;
			y = _mm_load_si128(y_buf + i);
			y = _mm_add_epi16(y, c4096);
			y = _mm_srai_epi16(y, 2);
			/* cb = cb_g_buf[i]; */
			cb = _mm_load_si128(cb_buf + i);
			/* cr = cr_b_buf[i]; */
			cr = _mm_load_si128(cr_buf + i);
			/* (y + HIWORD(cr*22986)) >> 3 */
			r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr));
			r = _mm_srai_epi16(r, 3);
			/* r_buf[i] = CLIP(r); */
			_mm_between_epi16(r, zero, max);
			_mm_store_si128(r_buf + i, r);
			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
			g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb));
			g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr));
			g = _mm_srai_epi16(g, 3);
			/* g_buf[i] = CLIP(g); */
			_mm_between_epi16(g, zero, max);
			_mm_store_si128(g_buf + i, g);
			/* (y + HIWORD(cb*28999)) >> 3 */
			b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb));
			b = _mm_srai_epi16(b, 3);
			/* b_buf[i] = CLIP(b); */
			_mm_between_epi16(b, zero, max);
			_mm_store_si128(b_buf + i, b);
		}

		y_buf += srcbump;
		cb_buf += srcbump;
		cr_buf += srcbump;
		r_buf += dstbump;
		g_buf += dstbump;
		b_buf += dstbump;
	}

	return PRIMITIVES_SUCCESS;
}
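
/* A minimal scalar sketch (illustrative only, excluded from the build) of the
 * fixed-point derivation in the comment above, for one red sample. The helper
 * name is hypothetical; the constants are the ones the SSE2 path uses.
 */
#if 0
static INT16 ycbcr_to_r_scalar(INT16 y, INT16 cr)
{
	const INT32 y2 = ((INT32)y + 4096) >> 2;    /* (y + 4096) >> 2 */
	const INT32 hi = ((INT32)cr * 22986) >> 16; /* HIWORD(cr * (1.403 << 14)) */
	INT32 r = (y2 + hi) >> 3;                   /* ((y+4096)>>2 + HIWORD(...)) >> 3 */

	if (r < 0)
		r = 0;
	if (r > 255)
		r = 255;

	return (INT16)r;
}
#endif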

/*---------------------------------------------------------------------------*/
static pstatus_t sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(const INT16* const pSrc[3], UINT32 srcStep,
                                                   BYTE* pDst, UINT32 dstStep,
                                                   const prim_size_t* roi) /* region of interest */
{
	const __m128i zero = _mm_setzero_si128();
	const __m128i max = _mm_set1_epi16(255);
	const __m128i r_cr = _mm_set1_epi16(22986);  /*  1.403 << 14 */
	const __m128i g_cb = _mm_set1_epi16(-5636);  /* -0.344 << 14 */
	const __m128i g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */
	const __m128i b_cb = _mm_set1_epi16(28999);  /*  1.770 << 14 */
	const __m128i c4096 = _mm_set1_epi16(4096);
	const INT16* y_buf = (INT16*)pSrc[0];
	const INT16* cb_buf = (INT16*)pSrc[1];
	const INT16* cr_buf = (INT16*)pSrc[2];
	const UINT32 pad = roi->width % 16;
	const UINT32 step = sizeof(__m128i) / sizeof(INT16);
	const UINT32 imax = (roi->width - pad) * sizeof(INT16) / sizeof(__m128i);
	BYTE* d_buf = pDst;
	UINT32 yp;
	const size_t dstPad = (dstStep - roi->width * 4);
#ifdef DO_PREFETCH

	/* Prefetch Y's, Cb's, and Cr's. */
	for (yp = 0; yp < roi->height; yp++)
	{
		int i;

		for (i = 0; i < imax; i += (CACHE_LINE_BYTES / sizeof(__m128i)))
		{
			_mm_prefetch((char*)(&((__m128i*)y_buf)[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&((__m128i*)cb_buf)[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&((__m128i*)cr_buf)[i]), _MM_HINT_NTA);
		}

		y_buf += srcStep / sizeof(INT16);
		cb_buf += srcStep / sizeof(INT16);
		cr_buf += srcStep / sizeof(INT16);
	}

	y_buf = (INT16*)pSrc[0];
	cb_buf = (INT16*)pSrc[1];
	cr_buf = (INT16*)pSrc[2];
#endif /* DO_PREFETCH */

	for (yp = 0; yp < roi->height; ++yp)
	{
		UINT32 i;

		for (i = 0; i < imax; i += 2)
		{
			/* In order to use SSE2 signed 16-bit integer multiplication
			 * we need to convert the floating point factors to signed int
			 * without losing information.
			 * The result of this multiplication is 32 bit and we have two
			 * SSE instructions that return either the hi or lo word.
			 * Thus we will multiply the factors by the highest possible 2^n,
			 * take the upper 16 bits of the signed 32-bit result
			 * (_mm_mulhi_epi16) and correct this result by multiplying
			 * it by 2^(16-n).
			 *
			 * For the given factors in the conversion matrix the best
			 * possible n is 14.
			 *
			 * Example for calculating r:
			 * r = (y>>5) + 128 + (cr*1.403)>>5             // our base formula
			 * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5   // see above
			 * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5     // simplification
			 * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
			 */
			/* y = (y_r_buf[i] + 4096) >> 2 */
			__m128i y1, y2, cb1, cb2, cr1, cr2, r1, r2, g1, g2, b1, b2;
			y1 = _mm_load_si128((__m128i*)y_buf);
			y_buf += step;
			y1 = _mm_add_epi16(y1, c4096);
			y1 = _mm_srai_epi16(y1, 2);
			/* cb = cb_g_buf[i]; */
			cb1 = _mm_load_si128((__m128i*)cb_buf);
			cb_buf += step;
			/* cr = cr_b_buf[i]; */
			cr1 = _mm_load_si128((__m128i*)cr_buf);
			cr_buf += step;
			/* (y + HIWORD(cr*22986)) >> 3 */
			r1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cr1, r_cr));
			r1 = _mm_srai_epi16(r1, 3);
			/* r_buf[i] = CLIP(r); */
			_mm_between_epi16(r1, zero, max);
			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
			g1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, g_cb));
			g1 = _mm_add_epi16(g1, _mm_mulhi_epi16(cr1, g_cr));
			g1 = _mm_srai_epi16(g1, 3);
			/* g_buf[i] = CLIP(g); */
			_mm_between_epi16(g1, zero, max);
			/* (y + HIWORD(cb*28999)) >> 3 */
			b1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, b_cb));
			b1 = _mm_srai_epi16(b1, 3);
			/* b_buf[i] = CLIP(b); */
			_mm_between_epi16(b1, zero, max);
			y2 = _mm_load_si128((__m128i*)y_buf);
			y_buf += step;
			y2 = _mm_add_epi16(y2, c4096);
			y2 = _mm_srai_epi16(y2, 2);
			/* cb = cb_g_buf[i]; */
			cb2 = _mm_load_si128((__m128i*)cb_buf);
			cb_buf += step;
			/* cr = cr_b_buf[i]; */
			cr2 = _mm_load_si128((__m128i*)cr_buf);
			cr_buf += step;
			/* (y + HIWORD(cr*22986)) >> 3 */
			r2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cr2, r_cr));
			r2 = _mm_srai_epi16(r2, 3);
			/* r_buf[i] = CLIP(r); */
			_mm_between_epi16(r2, zero, max);
			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
			g2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, g_cb));
			g2 = _mm_add_epi16(g2, _mm_mulhi_epi16(cr2, g_cr));
			g2 = _mm_srai_epi16(g2, 3);
			/* g_buf[i] = CLIP(g); */
			_mm_between_epi16(g2, zero, max);
			/* (y + HIWORD(cb*28999)) >> 3 */
			b2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, b_cb));
			b2 = _mm_srai_epi16(b2, 3);
			/* b_buf[i] = CLIP(b); */
			_mm_between_epi16(b2, zero, max);
			{
				__m128i R0, R1, R2, R3, R4;
				/* The comments below pretend these are 8-byte registers
				 * rather than 16-byte, for readability.
				 */
				R0 = b1;                              /* R0 = 00B300B200B100B0 */
				R1 = b2;                              /* R1 = 00B700B600B500B4 */
				R0 = _mm_packus_epi16(R0, R1);        /* R0 = B7B6B5B4B3B2B1B0 */
				R1 = g1;                              /* R1 = 00G300G200G100G0 */
				R2 = g2;                              /* R2 = 00G700G600G500G4 */
				R1 = _mm_packus_epi16(R1, R2);        /* R1 = G7G6G5G4G3G2G1G0 */
				R2 = R1;                              /* R2 = G7G6G5G4G3G2G1G0 */
				R2 = _mm_unpacklo_epi8(R0, R2);       /* R2 = B3G3B2G2B1G1B0G0 */
				R1 = _mm_unpackhi_epi8(R0, R1);       /* R1 = B7G7B6G6B5G5B4G4 */
				R0 = r1;                              /* R0 = 00R300R200R100R0 */
				R3 = r2;                              /* R3 = 00R700R600R500R4 */
				R0 = _mm_packus_epi16(R0, R3);        /* R0 = R7R6R5R4R3R2R1R0 */
				R3 = _mm_set1_epi32(0xFFFFFFFFU);     /* R3 = FFFFFFFFFFFFFFFF */
				R4 = R3;                              /* R4 = FFFFFFFFFFFFFFFF */
				R4 = _mm_unpacklo_epi8(R0, R4);       /* R4 = R3FFR2FFR1FFR0FF */
				R3 = _mm_unpackhi_epi8(R0, R3);       /* R3 = R7FFR6FFR5FFR4FF */
				R0 = R4;                              /* R0 = R4               */
				R0 = _mm_unpacklo_epi16(R2, R0);      /* R0 = B1G1R1FFB0G0R0FF */
				R4 = _mm_unpackhi_epi16(R2, R4);      /* R4 = B3G3R3FFB2G2R2FF */
				R2 = R3;                              /* R2 = R3               */
				R2 = _mm_unpacklo_epi16(R1, R2);      /* R2 = B5G5R5FFB4G4R4FF */
				R3 = _mm_unpackhi_epi16(R1, R3);      /* R3 = B7G7R7FFB6G6R6FF */
				_mm_store_si128((__m128i*)d_buf, R0); /* B1G1R1FFB0G0R0FF      */
				d_buf += sizeof(__m128i);
				_mm_store_si128((__m128i*)d_buf, R4); /* B3G3R3FFB2G2R2FF      */
				d_buf += sizeof(__m128i);
				_mm_store_si128((__m128i*)d_buf, R2); /* B5G5R5FFB4G4R4FF      */
				d_buf += sizeof(__m128i);
				_mm_store_si128((__m128i*)d_buf, R3); /* B7G7R7FFB6G6R6FF      */
				d_buf += sizeof(__m128i);
			}
		}

		for (i = 0; i < pad; i++)
		{
			const INT32 divisor = 16;
			const INT32 Y = ((*y_buf++) + 4096) << divisor;
			const INT32 Cb = (*cb_buf++);
			const INT32 Cr = (*cr_buf++);
			const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
			const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
			const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
			const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
			const INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
			const INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
			const INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
			*d_buf++ = CLIP(B);
			*d_buf++ = CLIP(G);
			*d_buf++ = CLIP(R);
			*d_buf++ = 0xFF;
		}

		d_buf += dstPad;
	}

	return PRIMITIVES_SUCCESS;
}

/*---------------------------------------------------------------------------*/
static pstatus_t sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(const INT16* const pSrc[3], UINT32 srcStep,
                                                   BYTE* pDst, UINT32 dstStep,
                                                   const prim_size_t* roi) /* region of interest */
{
	const __m128i zero = _mm_setzero_si128();
	const __m128i max = _mm_set1_epi16(255);
	const __m128i r_cr = _mm_set1_epi16(22986);  /*  1.403 << 14 */
	const __m128i g_cb = _mm_set1_epi16(-5636);  /* -0.344 << 14 */
	const __m128i g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */
	const __m128i b_cb = _mm_set1_epi16(28999);  /*  1.770 << 14 */
	const __m128i c4096 = _mm_set1_epi16(4096);
	const INT16* y_buf = (INT16*)pSrc[0];
	const INT16* cb_buf = (INT16*)pSrc[1];
	const INT16* cr_buf = (INT16*)pSrc[2];
	const UINT32 pad = roi->width % 16;
	const UINT32 step = sizeof(__m128i) / sizeof(INT16);
	const UINT32 imax = (roi->width - pad) * sizeof(INT16) / sizeof(__m128i);
	BYTE* d_buf = pDst;
	UINT32 yp;
	const size_t dstPad = (dstStep - roi->width * 4);
#ifdef DO_PREFETCH

	/* Prefetch Y's, Cb's, and Cr's. */
	for (yp = 0; yp < roi->height; yp++)
	{
		int i;

		for (i = 0; i < imax; i += (CACHE_LINE_BYTES / sizeof(__m128i)))
		{
			_mm_prefetch((char*)(&((__m128i*)y_buf)[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&((__m128i*)cb_buf)[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&((__m128i*)cr_buf)[i]), _MM_HINT_NTA);
		}

		y_buf += srcStep / sizeof(INT16);
		cb_buf += srcStep / sizeof(INT16);
		cr_buf += srcStep / sizeof(INT16);
	}

	y_buf = (INT16*)(pSrc[0]);
	cb_buf = (INT16*)(pSrc[1]);
	cr_buf = (INT16*)(pSrc[2]);
#endif /* DO_PREFETCH */

	for (yp = 0; yp < roi->height; ++yp)
	{
		UINT32 i;

		for (i = 0; i < imax; i += 2)
		{
			/* In order to use SSE2 signed 16-bit integer multiplication
			 * we need to convert the floating point factors to signed int
			 * without losing information.
			 * The result of this multiplication is 32 bit and we have two
			 * SSE instructions that return either the hi or lo word.
			 * Thus we will multiply the factors by the highest possible 2^n,
			 * take the upper 16 bits of the signed 32-bit result
			 * (_mm_mulhi_epi16) and correct this result by multiplying
			 * it by 2^(16-n).
			 *
			 * For the given factors in the conversion matrix the best
			 * possible n is 14.
			 *
			 * Example for calculating r:
			 * r = (y>>5) + 128 + (cr*1.403)>>5             // our base formula
			 * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5   // see above
			 * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5     // simplification
			 * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
			 */
			/* y = (y_r_buf[i] + 4096) >> 2 */
			__m128i y1, y2, cb1, cb2, cr1, cr2, r1, r2, g1, g2, b1, b2;
			y1 = _mm_load_si128((__m128i*)y_buf);
			y_buf += step;
			y1 = _mm_add_epi16(y1, c4096);
			y1 = _mm_srai_epi16(y1, 2);
			/* cb = cb_g_buf[i]; */
			cb1 = _mm_load_si128((__m128i*)cb_buf);
			cb_buf += step;
			/* cr = cr_b_buf[i]; */
			cr1 = _mm_load_si128((__m128i*)cr_buf);
			cr_buf += step;
			/* (y + HIWORD(cr*22986)) >> 3 */
			r1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cr1, r_cr));
			r1 = _mm_srai_epi16(r1, 3);
			/* r_buf[i] = CLIP(r); */
			_mm_between_epi16(r1, zero, max);
			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
			g1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, g_cb));
			g1 = _mm_add_epi16(g1, _mm_mulhi_epi16(cr1, g_cr));
			g1 = _mm_srai_epi16(g1, 3);
			/* g_buf[i] = CLIP(g); */
			_mm_between_epi16(g1, zero, max);
			/* (y + HIWORD(cb*28999)) >> 3 */
			b1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, b_cb));
			b1 = _mm_srai_epi16(b1, 3);
			/* b_buf[i] = CLIP(b); */
			_mm_between_epi16(b1, zero, max);
			y2 = _mm_load_si128((__m128i*)y_buf);
			y_buf += step;
			y2 = _mm_add_epi16(y2, c4096);
			y2 = _mm_srai_epi16(y2, 2);
			/* cb = cb_g_buf[i]; */
			cb2 = _mm_load_si128((__m128i*)cb_buf);
			cb_buf += step;
			/* cr = cr_b_buf[i]; */
			cr2 = _mm_load_si128((__m128i*)cr_buf);
			cr_buf += step;
			/* (y + HIWORD(cr*22986)) >> 3 */
			r2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cr2, r_cr));
			r2 = _mm_srai_epi16(r2, 3);
			/* r_buf[i] = CLIP(r); */
			_mm_between_epi16(r2, zero, max);
			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
			g2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, g_cb));
			g2 = _mm_add_epi16(g2, _mm_mulhi_epi16(cr2, g_cr));
			g2 = _mm_srai_epi16(g2, 3);
			/* g_buf[i] = CLIP(g); */
			_mm_between_epi16(g2, zero, max);
			/* (y + HIWORD(cb*28999)) >> 3 */
			b2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, b_cb));
			b2 = _mm_srai_epi16(b2, 3);
			/* b_buf[i] = CLIP(b); */
			_mm_between_epi16(b2, zero, max);
			{
				__m128i R0, R1, R2, R3, R4;
				/* The comments below pretend these are 8-byte registers
				 * rather than 16-byte, for readability.
				 */
				R0 = r1;                              /* R0 = 00R300R200R100R0 */
				R1 = r2;                              /* R1 = 00R700R600R500R4 */
				R0 = _mm_packus_epi16(R0, R1);        /* R0 = R7R6R5R4R3R2R1R0 */
				R1 = g1;                              /* R1 = 00G300G200G100G0 */
				R2 = g2;                              /* R2 = 00G700G600G500G4 */
				R1 = _mm_packus_epi16(R1, R2);        /* R1 = G7G6G5G4G3G2G1G0 */
				R2 = R1;                              /* R2 = G7G6G5G4G3G2G1G0 */
				R2 = _mm_unpacklo_epi8(R0, R2);       /* R2 = R3G3R2G2R1G1R0G0 */
				R1 = _mm_unpackhi_epi8(R0, R1);       /* R1 = R7G7R6G6R5G5R4G4 */
				R0 = b1;                              /* R0 = 00B300B200B100B0 */
				R3 = b2;                              /* R3 = 00B700B600B500B4 */
				R0 = _mm_packus_epi16(R0, R3);        /* R0 = B7B6B5B4B3B2B1B0 */
				R3 = _mm_set1_epi32(0xFFFFFFFFU);     /* R3 = FFFFFFFFFFFFFFFF */
				R4 = R3;                              /* R4 = FFFFFFFFFFFFFFFF */
				R4 = _mm_unpacklo_epi8(R0, R4);       /* R4 = B3FFB2FFB1FFB0FF */
				R3 = _mm_unpackhi_epi8(R0, R3);       /* R3 = B7FFB6FFB5FFB4FF */
				R0 = R4;                              /* R0 = R4               */
				R0 = _mm_unpacklo_epi16(R2, R0);      /* R0 = FFB1G1R1FFB0G0R0 */
				R4 = _mm_unpackhi_epi16(R2, R4);      /* R4 = FFB3G3R3FFB2G2R2 */
				R2 = R3;                              /* R2 = R3               */
				R2 = _mm_unpacklo_epi16(R1, R2);      /* R2 = FFB5G5R5FFB4G4R4 */
				R3 = _mm_unpackhi_epi16(R1, R3);      /* R3 = FFB7G7R7FFB6G6R6 */
				_mm_store_si128((__m128i*)d_buf, R0); /* FFB1G1R1FFB0G0R0      */
				d_buf += sizeof(__m128i);
				_mm_store_si128((__m128i*)d_buf, R4); /* FFB3G3R3FFB2G2R2      */
				d_buf += sizeof(__m128i);
				_mm_store_si128((__m128i*)d_buf, R2); /* FFB5G5R5FFB4G4R4      */
				d_buf += sizeof(__m128i);
				_mm_store_si128((__m128i*)d_buf, R3); /* FFB7G7R7FFB6G6R6      */
				d_buf += sizeof(__m128i);
			}
		}

		for (i = 0; i < pad; i++)
		{
			const INT32 divisor = 16;
			const INT32 Y = ((*y_buf++) + 4096) << divisor;
			const INT32 Cb = (*cb_buf++);
			const INT32 Cr = (*cr_buf++);
			const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
			const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
			const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
			const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
			const INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
			const INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
			const INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
			*d_buf++ = CLIP(R);
			*d_buf++ = CLIP(G);
			*d_buf++ = CLIP(B);
			*d_buf++ = 0xFF;
		}

		d_buf += dstPad;
	}

	return PRIMITIVES_SUCCESS;
}

static pstatus_t sse2_yCbCrToRGB_16s8u_P3AC4R(const INT16* const pSrc[3], UINT32 srcStep,
                                              BYTE* pDst, UINT32 dstStep, UINT32 DstFormat,
                                              const prim_size_t* roi) /* region of interest */
{
	if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) ||
	    ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst) & 0x0f) || (srcStep & 0x0f) ||
	    (dstStep & 0x0f))
	{
		/* We can't maintain 16-byte alignment. */
		return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
	}

	switch (DstFormat)
	{
		case PIXEL_FORMAT_BGRA32:
		case PIXEL_FORMAT_BGRX32:
			return sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);

		case PIXEL_FORMAT_RGBA32:
		case PIXEL_FORMAT_RGBX32:
			return sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(pSrc, srcStep, pDst, dstStep, roi);

		default:
			return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
	}
}
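
/* Usage sketch (illustrative only, excluded from the build): this entry point
 * is normally reached through the primitives table rather than called
 * directly; the buffer strides and size below are hypothetical.
 */
#if 0
const prim_size_t roi = { 64, 64 }; /* width, height */
primitives_t* prims = primitives_get();
prims->yCbCrToRGB_16s8u_P3AC4R(pSrc, 128 /* srcStep */, pDst, 256 /* dstStep */,
                               PIXEL_FORMAT_BGRX32, &roi);
#endif
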
/* The encoded YCbCr coefficients are represented as 11.5 fixed-point
 * numbers. See the general code above.
 */
static pstatus_t sse2_RGBToYCbCr_16s16s_P3P3(const INT16* const pSrc[3], int srcStep,
                                             INT16* pDst[3], int dstStep,
                                             const prim_size_t* roi) /* region of interest */
{
	__m128i min, max, y_r, y_g, y_b, cb_r, cb_g, cb_b, cr_r, cr_g, cr_b;
	__m128i *r_buf, *g_buf, *b_buf, *y_buf, *cb_buf, *cr_buf;
	UINT32 yp;
	int srcbump, dstbump, imax;

	if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) ||
	    ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst[0]) & 0x0f) ||
	    ((ULONG_PTR)(pDst[1]) & 0x0f) || ((ULONG_PTR)(pDst[2]) & 0x0f) || (roi->width & 0x07) ||
	    (srcStep & 127) || (dstStep & 127))
	{
		/* We can't maintain 16-byte alignment. */
		return generic->RGBToYCbCr_16s16s_P3P3(pSrc, srcStep, pDst, dstStep, roi);
	}

	min = _mm_set1_epi16(-128 * 32);
	max = _mm_set1_epi16(127 * 32);
	r_buf = (__m128i*)(pSrc[0]);
	g_buf = (__m128i*)(pSrc[1]);
	b_buf = (__m128i*)(pSrc[2]);
	y_buf = (__m128i*)(pDst[0]);
	cb_buf = (__m128i*)(pDst[1]);
	cr_buf = (__m128i*)(pDst[2]);
	y_r = _mm_set1_epi16(9798);    /*  0.299000 << 15 */
	y_g = _mm_set1_epi16(19235);   /*  0.587000 << 15 */
	y_b = _mm_set1_epi16(3735);    /*  0.114000 << 15 */
	cb_r = _mm_set1_epi16(-5535);  /* -0.168935 << 15 */
	cb_g = _mm_set1_epi16(-10868); /* -0.331665 << 15 */
	cb_b = _mm_set1_epi16(16403);  /*  0.500590 << 15 */
	cr_r = _mm_set1_epi16(16377);  /*  0.499813 << 15 */
	cr_g = _mm_set1_epi16(-13714); /* -0.418531 << 15 */
	cr_b = _mm_set1_epi16(-2663);  /* -0.081282 << 15 */
	srcbump = srcStep / sizeof(__m128i);
	dstbump = dstStep / sizeof(__m128i);
#ifdef DO_PREFETCH

	/* Prefetch RGB's. */
	for (yp = 0; yp < roi->height; yp++)
	{
		int i;

		for (i = 0; i < roi->width * sizeof(INT16) / sizeof(__m128i);
		     i += (CACHE_LINE_BYTES / sizeof(__m128i)))
		{
			_mm_prefetch((char*)(&r_buf[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&g_buf[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&b_buf[i]), _MM_HINT_NTA);
		}

		r_buf += srcbump;
		g_buf += srcbump;
		b_buf += srcbump;
	}

	r_buf = (__m128i*)(pSrc[0]);
	g_buf = (__m128i*)(pSrc[1]);
	b_buf = (__m128i*)(pSrc[2]);
#endif /* DO_PREFETCH */
	imax = roi->width * sizeof(INT16) / sizeof(__m128i);

	for (yp = 0; yp < roi->height; ++yp)
	{
		int i;

		for (i = 0; i < imax; i++)
		{
			/* In order to use SSE2 signed 16-bit integer multiplication we
			 * need to convert the floating point factors to signed int
			 * without losing information.  The result of this multiplication
			 * is 32 bit and using SSE2 we get either the product's hi or lo
			 * word.  Thus we will multiply the factors by the highest
			 * possible 2^n and take the upper 16 bits of the signed 32-bit
			 * result (_mm_mulhi_epi16).  Since the final result needs to
			 * be scaled by << 5 and also in order to keep the precision
			 * within the upper 16 bits we will also have to scale the RGB
			 * values used in the multiplication by << 5+(16-n).
			 */
			__m128i r, g, b, y, cb, cr;
			r = _mm_load_si128(r_buf + i);
			g = _mm_load_si128(g_buf + i);
			b = _mm_load_si128(b_buf + i);
			/* r<<6; g<<6; b<<6 */
			r = _mm_slli_epi16(r, 6);
			g = _mm_slli_epi16(g, 6);
			b = _mm_slli_epi16(b, 6);
			/* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */
			y = _mm_mulhi_epi16(r, y_r);
			y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
			y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
			y = _mm_add_epi16(y, min);
			/* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
			_mm_between_epi16(y, min, max);
			_mm_store_si128(y_buf + i, y);
			/* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
			cb = _mm_mulhi_epi16(r, cb_r);
			cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
			cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
			/* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
			_mm_between_epi16(cb, min, max);
			_mm_store_si128(cb_buf + i, cb);
			/* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
			cr = _mm_mulhi_epi16(r, cr_r);
			cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
			cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
			/* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
			_mm_between_epi16(cr, min, max);
			_mm_store_si128(cr_buf + i, cr);
		}

		r_buf += srcbump;
		g_buf += srcbump;
		b_buf += srcbump;
		y_buf += dstbump;
		cb_buf += dstbump;
		cr_buf += dstbump;
	}

	return PRIMITIVES_SUCCESS;
}
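
/* A minimal scalar sketch (illustrative only, excluded from the build) of the
 * forward fixed-point math above, for one luma sample. The helper name is
 * hypothetical; the constants match the SSE2 path (n = 15, inputs pre-scaled
 * by << 6).
 */
#if 0
static INT16 rgb_to_y_scalar(INT16 r, INT16 g, INT16 b)
{
	/* _mm_mulhi_epi16 keeps bits 16..31 of the 32-bit product, so each
	 * (value << 6) * (coeff << 15) term comes out scaled by << 5. */
	const INT32 yr = ((INT32)(r << 6) * 9798) >> 16;  /* 0.299000 */
	const INT32 yg = ((INT32)(g << 6) * 19235) >> 16; /* 0.587000 */
	const INT32 yb = ((INT32)(b << 6) * 3735) >> 16;  /* 0.114000 */
	INT32 y = yr + yg + yb - (128 << 5);

	if (y < -(128 << 5))
		y = -(128 << 5);
	if (y > (127 << 5))
		y = (127 << 5);

	return (INT16)y;
}
#endif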

/*---------------------------------------------------------------------------*/
static pstatus_t
sse2_RGBToRGB_16s8u_P3AC4R_BGRX(const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
                                UINT32 srcStep,             /* bytes between rows in source data */
                                BYTE* pDst,             /* 32-bit interleaved ARGB (ABGR?) data */
                                UINT32 dstStep,         /* bytes between rows in dest data */
                                const prim_size_t* roi) /* region of interest */
{
	const UINT16* pr = (const UINT16*)(pSrc[0]);
	const UINT16* pg = (const UINT16*)(pSrc[1]);
	const UINT16* pb = (const UINT16*)(pSrc[2]);
	const UINT32 pad = roi->width % 16;
	const __m128i a = _mm_set1_epi32(0xFFFFFFFFU);
	BYTE* out;
	UINT32 srcbump, dstbump, y;
	out = (BYTE*)pDst;
	srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
	dstbump = (dstStep - (roi->width * sizeof(UINT32)));

	for (y = 0; y < roi->height; ++y)
	{
		UINT32 x;

		for (x = 0; x < roi->width - pad; x += 16)
		{
			__m128i r, g, b;
			/* The comments below pretend these are 8-byte registers
			 * rather than 16-byte, for readability.
			 */
			{
				__m128i R0, R1;
				R0 = _mm_load_si128((__m128i*)pb);
				pb += 8; /* R0 = 00B300B200B100B0 */
				R1 = _mm_load_si128((__m128i*)pb);
				pb += 8;                      /* R1 = 00B700B600B500B4 */
				b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
			}
			{
				__m128i R0, R1;
				R0 = _mm_load_si128((__m128i*)pg);
				pg += 8; /* R0 = 00G300G200G100G0 */
				R1 = _mm_load_si128((__m128i*)pg);
				pg += 8;                      /* R1 = 00G700G600G500G4 */
				g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
			}
			{
				__m128i R0, R1;
				R0 = _mm_load_si128((__m128i*)pr);
				pr += 8; /* R0 = 00R300R200R100R0 */
				R1 = _mm_load_si128((__m128i*)pr);
				pr += 8;                      /* R1 = 00R700R600R500R4 */
				r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
			}
			{
				__m128i gbHi, gbLo, arHi, arLo;
				{
					gbLo = _mm_unpacklo_epi8(b, g); /* gbLo = G3B3G2B2G1B1G0B0 */
					gbHi = _mm_unpackhi_epi8(b, g); /* gbHi = G7B7G6B6G5B5G4B4 */
					arLo = _mm_unpacklo_epi8(r, a); /* arLo = FFR3FFR2FFR1FFR0 */
					arHi = _mm_unpackhi_epi8(r, a); /* arHi = FFR7FFR6FFR5FFR4 */
				}
				{
					const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* FFR1G1B1FFR0G0B0      */
				}
				{
					const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* FFR3G3B3FFR2G2B2      */
				}
				{
					const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* FFR5G5B5FFR4G4B4      */
				}
				{
					const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* FFR7G7B7FFR6G6B6      */
				}
			}
		}

		for (x = 0; x < pad; x++)
		{
			const BYTE R = CLIP(*pr++);
			const BYTE G = CLIP(*pg++);
			const BYTE B = CLIP(*pb++);
			*out++ = B;
			*out++ = G;
			*out++ = R;
			*out++ = 0xFF;
		}

		/* Jump to next row. */
		pr += srcbump;
		pg += srcbump;
		pb += srcbump;
		out += dstbump;
	}

	return PRIMITIVES_SUCCESS;
}

static pstatus_t
sse2_RGBToRGB_16s8u_P3AC4R_RGBX(const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
                                UINT32 srcStep,             /* bytes between rows in source data */
                                BYTE* pDst,             /* 32-bit interleaved ARGB (ABGR?) data */
                                UINT32 dstStep,         /* bytes between rows in dest data */
                                const prim_size_t* roi) /* region of interest */
{
	const UINT16* pr = (const UINT16*)(pSrc[0]);
	const UINT16* pg = (const UINT16*)(pSrc[1]);
	const UINT16* pb = (const UINT16*)(pSrc[2]);
	const UINT32 pad = roi->width % 16;
	const __m128i a = _mm_set1_epi32(0xFFFFFFFFU);
	BYTE* out;
	UINT32 srcbump, dstbump, y;
	out = (BYTE*)pDst;
	srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
	dstbump = (dstStep - (roi->width * sizeof(UINT32)));

	for (y = 0; y < roi->height; ++y)
	{
		UINT32 x;

		for (x = 0; x < roi->width - pad; x += 16)
		{
			__m128i r, g, b;
			/* The comments below pretend these are 8-byte registers
			 * rather than 16-byte, for readability.
			 */
			{
				__m128i R0, R1;
				R0 = _mm_load_si128((__m128i*)pb);
				pb += 8; /* R0 = 00B300B200B100B0 */
				R1 = _mm_load_si128((__m128i*)pb);
				pb += 8;                      /* R1 = 00B700B600B500B4 */
				b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
			}
			{
				__m128i R0, R1;
				R0 = _mm_load_si128((__m128i*)pg);
				pg += 8; /* R0 = 00G300G200G100G0 */
				R1 = _mm_load_si128((__m128i*)pg);
				pg += 8;                      /* R1 = 00G700G600G500G4 */
				g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
			}
			{
				__m128i R0, R1;
				R0 = _mm_load_si128((__m128i*)pr);
				pr += 8; /* R0 = 00R300R200R100R0 */
				R1 = _mm_load_si128((__m128i*)pr);
				pr += 8;                      /* R1 = 00R700R600R500R4 */
				r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
			}
			{
				__m128i gbHi, gbLo, arHi, arLo;
				{
					gbLo = _mm_unpacklo_epi8(r, g); /* gbLo = G3R3G2R2G1R1G0R0 */
					gbHi = _mm_unpackhi_epi8(r, g); /* gbHi = G7R7G6R6G5R5G4R4 */
					arLo = _mm_unpacklo_epi8(b, a); /* arLo = FFB3FFB2FFB1FFB0 */
					arHi = _mm_unpackhi_epi8(b, a); /* arHi = FFB7FFB6FFB5FFB4 */
				}
				{
					const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* FFB1G1R1FFB0G0R0      */
				}
				{
					const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* FFB3G3R3FFB2G2R2      */
				}
				{
					const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* FFB5G5R5FFB4G4R4      */
				}
				{
					const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* FFB7G7R7FFB6G6R6      */
				}
			}
		}

		for (x = 0; x < pad; x++)
		{
			const BYTE R = CLIP(*pr++);
			const BYTE G = CLIP(*pg++);
			const BYTE B = CLIP(*pb++);
			*out++ = R;
			*out++ = G;
			*out++ = B;
			*out++ = 0xFF;
		}

		/* Jump to next row. */
		pr += srcbump;
		pg += srcbump;
		pb += srcbump;
		out += dstbump;
	}

	return PRIMITIVES_SUCCESS;
}

static pstatus_t
sse2_RGBToRGB_16s8u_P3AC4R_XBGR(const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
                                UINT32 srcStep,             /* bytes between rows in source data */
                                BYTE* pDst,             /* 32-bit interleaved ARGB (ABGR?) data */
                                UINT32 dstStep,         /* bytes between rows in dest data */
                                const prim_size_t* roi) /* region of interest */
{
	const UINT16* pr = (const UINT16*)(pSrc[0]);
	const UINT16* pg = (const UINT16*)(pSrc[1]);
	const UINT16* pb = (const UINT16*)(pSrc[2]);
	const UINT32 pad = roi->width % 16;
	const __m128i a = _mm_set1_epi32(0xFFFFFFFFU);
	BYTE* out;
	UINT32 srcbump, dstbump, y;
	out = (BYTE*)pDst;
	srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
	dstbump = (dstStep - (roi->width * sizeof(UINT32)));

	for (y = 0; y < roi->height; ++y)
	{
		UINT32 x;

		for (x = 0; x < roi->width - pad; x += 16)
		{
			__m128i r, g, b;
			/* The comments below pretend these are 8-byte registers
			 * rather than 16-byte, for readability.
			 */
			{
				__m128i R0, R1;
				R0 = _mm_load_si128((__m128i*)pb);
				pb += 8; /* R0 = 00B300B200B100B0 */
				R1 = _mm_load_si128((__m128i*)pb);
				pb += 8;                      /* R1 = 00B700B600B500B4 */
				b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
			}
			{
				__m128i R0, R1;
				R0 = _mm_load_si128((__m128i*)pg);
				pg += 8; /* R0 = 00G300G200G100G0 */
				R1 = _mm_load_si128((__m128i*)pg);
				pg += 8;                      /* R1 = 00G700G600G500G4 */
				g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
			}
			{
				__m128i R0, R1;
				R0 = _mm_load_si128((__m128i*)pr);
				pr += 8; /* R0 = 00R300R200R100R0 */
				R1 = _mm_load_si128((__m128i*)pr);
				pr += 8;                      /* R1 = 00R700R600R500R4 */
				r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
			}
			{
				__m128i gbHi, gbLo, arHi, arLo;
				{
					gbLo = _mm_unpacklo_epi8(a, b); /* gbLo = B3FFB2FFB1FFB0FF */
					gbHi = _mm_unpackhi_epi8(a, b); /* gbHi = B7FFB6FFB5FFB4FF */
					arLo = _mm_unpacklo_epi8(g, r); /* arLo = R3G3R2G2R1G1R0G0 */
					arHi = _mm_unpackhi_epi8(g, r); /* arHi = R7G7R6G6R5G5R4G4 */
				}
				{
					const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* R1G1B1FFR0G0B0FF      */
				}
				{
					const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* R3G3B3FFR2G2B2FF      */
				}
				{
					const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* R5G5B5FFR4G4B4FF      */
				}
				{
					const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* R7G7B7FFR6G6B6FF      */
				}
			}
		}

		for (x = 0; x < pad; x++)
		{
			const BYTE R = CLIP(*pr++);
			const BYTE G = CLIP(*pg++);
			const BYTE B = CLIP(*pb++);
			*out++ = 0xFF;
			*out++ = B;
			*out++ = G;
			*out++ = R;
		}

		/* Jump to next row. */
		pr += srcbump;
		pg += srcbump;
		pb += srcbump;
		out += dstbump;
	}

	return PRIMITIVES_SUCCESS;
}

static pstatus_t
sse2_RGBToRGB_16s8u_P3AC4R_XRGB(const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
                                UINT32 srcStep,             /* bytes between rows in source data */
                                BYTE* pDst,             /* 32-bit interleaved ARGB (ABGR?) data */
                                UINT32 dstStep,         /* bytes between rows in dest data */
                                const prim_size_t* roi) /* region of interest */
{
	const UINT16* pr = (const UINT16*)(pSrc[0]);
	const UINT16* pg = (const UINT16*)(pSrc[1]);
	const UINT16* pb = (const UINT16*)(pSrc[2]);
	const __m128i a = _mm_set1_epi32(0xFFFFFFFFU);
	const UINT32 pad = roi->width % 16;
	BYTE* out;
	UINT32 srcbump, dstbump, y;
	out = (BYTE*)pDst;
	srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
	dstbump = (dstStep - (roi->width * sizeof(UINT32)));

	for (y = 0; y < roi->height; ++y)
	{
		UINT32 x;

		for (x = 0; x < roi->width - pad; x += 16)
		{
			__m128i r, g, b;
			/* The comments below pretend these are 8-byte registers
			 * rather than 16-byte, for readability.
			 */
			{
				__m128i R0, R1;
				R0 = _mm_load_si128((__m128i*)pb);
				pb += 8; /* R0 = 00B300B200B100B0 */
				R1 = _mm_load_si128((__m128i*)pb);
				pb += 8;                      /* R1 = 00B700B600B500B4 */
				b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
			}
			{
				__m128i R0, R1;
				R0 = _mm_load_si128((__m128i*)pg);
				pg += 8; /* R0 = 00G300G200G100G0 */
				R1 = _mm_load_si128((__m128i*)pg);
				pg += 8;                      /* R1 = 00G700G600G500G4 */
				g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
			}
			{
				__m128i R0, R1;
				R0 = _mm_load_si128((__m128i*)pr);
				pr += 8; /* R0 = 00R300R200R100R0 */
				R1 = _mm_load_si128((__m128i*)pr);
				pr += 8;                      /* R1 = 00R700R600R500R4 */
				r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
			}
			{
				__m128i gbHi, gbLo, arHi, arLo;
				{
					gbLo = _mm_unpacklo_epi8(a, r); /* gbLo = R3FFR2FFR1FFR0FF */
					gbHi = _mm_unpackhi_epi8(a, r); /* gbHi = R7FFR6FFR5FFR4FF */
					arLo = _mm_unpacklo_epi8(g, b); /* arLo = B3G3B2G2B1G1B0G0 */
					arHi = _mm_unpackhi_epi8(g, b); /* arHi = B7G7B6G6B5G5B4G4 */
				}
				{
					const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* B1G1R1FFB0G0R0FF      */
				}
				{
					const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* B3G3R3FFB2G2R2FF      */
				}
				{
					const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* B5G5R5FFB4G4R4FF      */
				}
				{
					const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* B7G7R7FFB6G6R6FF      */
				}
			}
		}

		for (x = 0; x < pad; x++)
		{
			const BYTE R = CLIP(*pr++);
			const BYTE G = CLIP(*pg++);
			const BYTE B = CLIP(*pb++);
			*out++ = 0xFF;
			*out++ = R;
			*out++ = G;
			*out++ = B;
		}

		/* Jump to next row. */
		pr += srcbump;
		pg += srcbump;
		pb += srcbump;
		out += dstbump;
	}

	return PRIMITIVES_SUCCESS;
}

static pstatus_t
sse2_RGBToRGB_16s8u_P3AC4R(const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
                           UINT32 srcStep,             /* bytes between rows in source data */
                           BYTE* pDst,                 /* 32-bit interleaved ARGB (ABGR?) data */
                           UINT32 dstStep,             /* bytes between rows in dest data */
                           UINT32 DstFormat, const prim_size_t* roi)
{
	if (((ULONG_PTR)pSrc[0] & 0x0f) || ((ULONG_PTR)pSrc[1] & 0x0f) || ((ULONG_PTR)pSrc[2] & 0x0f) ||
	    (srcStep & 0x0f) || ((ULONG_PTR)pDst & 0x0f) || (dstStep & 0x0f))
		return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);

	switch (DstFormat)
	{
		case PIXEL_FORMAT_BGRA32:
		case PIXEL_FORMAT_BGRX32:
			return sse2_RGBToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);

		case PIXEL_FORMAT_RGBA32:
		case PIXEL_FORMAT_RGBX32:
			return sse2_RGBToRGB_16s8u_P3AC4R_RGBX(pSrc, srcStep, pDst, dstStep, roi);

		case PIXEL_FORMAT_ABGR32:
		case PIXEL_FORMAT_XBGR32:
			return sse2_RGBToRGB_16s8u_P3AC4R_XBGR(pSrc, srcStep, pDst, dstStep, roi);

		case PIXEL_FORMAT_ARGB32:
		case PIXEL_FORMAT_XRGB32:
			return sse2_RGBToRGB_16s8u_P3AC4R_XRGB(pSrc, srcStep, pDst, dstStep, roi);

		default:
			return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
	}
}
#endif /* WITH_SSE2 */

/*---------------------------------------------------------------------------*/
#ifdef WITH_NEON
static pstatus_t neon_yCbCrToRGB_16s16s_P3P3(const INT16* const pSrc[3], INT32 srcStep,
                                             INT16* pDst[3], INT32 dstStep,
                                             const prim_size_t* roi) /* region of interest */
{
	/* TODO: If necessary, check alignments and call the general version. */
	int16x8_t zero = vdupq_n_s16(0);
	int16x8_t max = vdupq_n_s16(255);
	int16x8_t r_cr = vdupq_n_s16(22986);  //  1.403 << 14
	int16x8_t g_cb = vdupq_n_s16(-5636);  // -0.344 << 14
	int16x8_t g_cr = vdupq_n_s16(-11698); // -0.714 << 14
	int16x8_t b_cb = vdupq_n_s16(28999);  //  1.770 << 14
	int16x8_t c4096 = vdupq_n_s16(4096);
	int16x8_t* y_buf = (int16x8_t*)pSrc[0];
	int16x8_t* cb_buf = (int16x8_t*)pSrc[1];
	int16x8_t* cr_buf = (int16x8_t*)pSrc[2];
	int16x8_t* r_buf = (int16x8_t*)pDst[0];
	int16x8_t* g_buf = (int16x8_t*)pDst[1];
	int16x8_t* b_buf = (int16x8_t*)pDst[2];
	int srcbump = srcStep / sizeof(int16x8_t);
	int dstbump = dstStep / sizeof(int16x8_t);
	int yp;
	int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t);

	for (yp = 0; yp < roi->height; ++yp)
	{
		int i;

		for (i = 0; i < imax; i++)
		{
			/*
			    In order to use NEON signed 16-bit integer multiplication we need to convert
			    the floating point factors to signed int without losing information.
			    The result of this multiplication is 32 bit and we have a NEON instruction
			    that returns the hi word of the saturated doubling multiply.
			    Thus we will multiply the factors by the highest possible 2^n, take the
			    upper 16 bits of the signed 32-bit result (vqdmulhq_s16 followed by a right
			    shift by 1 to reverse the doubling) and correct this result by multiplying it
			    by 2^(16-n).
			    For the given factors in the conversion matrix the best possible n is 14.

			    Example for calculating r:
			    r = (y>>5) + 128 + (cr*1.403)>>5                       // our base formula
			    r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5      // see above
			    r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5             // simplification
			    r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
			*/
			/* y = (y_buf[i] + 4096) >> 2 */
			int16x8_t y = vld1q_s16((INT16*)&y_buf[i]);
			y = vaddq_s16(y, c4096);
			y = vshrq_n_s16(y, 2);
			/* cb = cb_buf[i]; */
			int16x8_t cb = vld1q_s16((INT16*)&cb_buf[i]);
			/* cr = cr_buf[i]; */
			int16x8_t cr = vld1q_s16((INT16*)&cr_buf[i]);
			/* (y + HIWORD(cr*22986)) >> 3 */
			int16x8_t r = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cr, r_cr), 1));
			r = vshrq_n_s16(r, 3);
			/* r_buf[i] = CLIP(r); */
			r = vminq_s16(vmaxq_s16(r, zero), max);
			vst1q_s16((INT16*)&r_buf[i], r);
			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
			int16x8_t g = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, g_cb), 1));
			g = vaddq_s16(g, vshrq_n_s16(vqdmulhq_s16(cr, g_cr), 1));
			g = vshrq_n_s16(g, 3);
			/* g_buf[i] = CLIP(g); */
			g = vminq_s16(vmaxq_s16(g, zero), max);
			vst1q_s16((INT16*)&g_buf[i], g);
			/* (y + HIWORD(cb*28999)) >> 3 */
			int16x8_t b = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, b_cb), 1));
			b = vshrq_n_s16(b, 3);
			/* b_buf[i] = CLIP(b); */
			b = vminq_s16(vmaxq_s16(b, zero), max);
			vst1q_s16((INT16*)&b_buf[i], b);
		}

		y_buf += srcbump;
		cb_buf += srcbump;
		cr_buf += srcbump;
		r_buf += dstbump;
		g_buf += dstbump;
		b_buf += dstbump;
	}

	return PRIMITIVES_SUCCESS;
}
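
/* Illustrative helper (not compiled): vqdmulhq_s16 computes
 * saturate((a * b * 2) >> 16); the arithmetic shift right by 1 undoes the
 * doubling, giving HIWORD(a * b) as used throughout the loop above. */
#if 0
static int16x8_t hiword_mul_s16(int16x8_t a, int16x8_t coeff)
{
	return vshrq_n_s16(vqdmulhq_s16(a, coeff), 1);
}
#endif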

static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R_X(const INT16* const pSrc[3], UINT32 srcStep,
                                                BYTE* pDst, UINT32 dstStep, const prim_size_t* roi,
                                                uint8_t rPos, uint8_t gPos, uint8_t bPos,
                                                uint8_t aPos)
{
	UINT32 x, y;
	BYTE* pRGB = pDst;
	const INT16* pY = pSrc[0];
	const INT16* pCb = pSrc[1];
	const INT16* pCr = pSrc[2];
	const size_t srcPad = (srcStep - (roi->width * sizeof(INT16))) / sizeof(INT16);
	const size_t dstPad = (dstStep - (roi->width * 4)) / 4;
	const size_t pad = roi->width % 8;
	const int16x4_t c4096 = vdup_n_s16(4096);

	for (y = 0; y < roi->height; y++)
	{
		for (x = 0; x < roi->width - pad; x += 8)
		{
			const int16x8_t Y = vld1q_s16(pY);
			const int16x4_t Yh = vget_high_s16(Y);
			const int16x4_t Yl = vget_low_s16(Y);
			const int32x4_t YhAdd = vaddl_s16(Yh, c4096); /* Y + 4096 */
			const int32x4_t YlAdd = vaddl_s16(Yl, c4096); /* Y + 4096 */
			const int32x4_t YhW = vshlq_n_s32(YhAdd, 16);
			const int32x4_t YlW = vshlq_n_s32(YlAdd, 16);
			const int16x8_t Cr = vld1q_s16(pCr);
			const int16x4_t Crh = vget_high_s16(Cr);
			const int16x4_t Crl = vget_low_s16(Cr);
			const int16x8_t Cb = vld1q_s16(pCb);
			const int16x4_t Cbh = vget_high_s16(Cb);
			const int16x4_t Cbl = vget_low_s16(Cb);
			uint8x8x4_t bgrx;
			{
				/* R */
				const int32x4_t CrhR = vmulq_n_s32(vmovl_s16(Crh), 91916); /* 1.402525 * 2^16 */
				const int32x4_t CrlR = vmulq_n_s32(vmovl_s16(Crl), 91916); /* 1.402525 * 2^16 */
				const int32x4_t CrhRa = vaddq_s32(CrhR, YhW);
				const int32x4_t CrlRa = vaddq_s32(CrlR, YlW);
				const int16x4_t Rsh = vmovn_s32(vshrq_n_s32(CrhRa, 21));
				const int16x4_t Rsl = vmovn_s32(vshrq_n_s32(CrlRa, 21));
				const int16x8_t Rs = vcombine_s16(Rsl, Rsh);
				bgrx.val[rPos] = vqmovun_s16(Rs);
			}
			{
				/* G */
				const int32x4_t CbGh = vmull_n_s16(Cbh, 22527);            /* 0.343730 * 2^16 */
				const int32x4_t CbGl = vmull_n_s16(Cbl, 22527);            /* 0.343730 * 2^16 */
				const int32x4_t CrGh = vmulq_n_s32(vmovl_s16(Crh), 46819); /* 0.714401 * 2^16 */
				const int32x4_t CrGl = vmulq_n_s32(vmovl_s16(Crl), 46819); /* 0.714401 * 2^16 */
				const int32x4_t CbCrGh = vaddq_s32(CbGh, CrGh);
				const int32x4_t CbCrGl = vaddq_s32(CbGl, CrGl);
				const int32x4_t YCbCrGh = vsubq_s32(YhW, CbCrGh);
				const int32x4_t YCbCrGl = vsubq_s32(YlW, CbCrGl);
				const int16x4_t Gsh = vmovn_s32(vshrq_n_s32(YCbCrGh, 21));
				const int16x4_t Gsl = vmovn_s32(vshrq_n_s32(YCbCrGl, 21));
				const int16x8_t Gs = vcombine_s16(Gsl, Gsh);
				const uint8x8_t G = vqmovun_s16(Gs);
				bgrx.val[gPos] = G;
			}
			{
				/* B */
				const int32x4_t CbBh = vmulq_n_s32(vmovl_s16(Cbh), 115992); /* 1.769905 * 2^16 */
				const int32x4_t CbBl = vmulq_n_s32(vmovl_s16(Cbl), 115992); /* 1.769905 * 2^16 */
				const int32x4_t YCbBh = vaddq_s32(CbBh, YhW);
				const int32x4_t YCbBl = vaddq_s32(CbBl, YlW);
				const int16x4_t Bsh = vmovn_s32(vshrq_n_s32(YCbBh, 21));
				const int16x4_t Bsl = vmovn_s32(vshrq_n_s32(YCbBl, 21));
				const int16x8_t Bs = vcombine_s16(Bsl, Bsh);
				const uint8x8_t B = vqmovun_s16(Bs);
				bgrx.val[bPos] = B;
			}
			/* A */
			{
				bgrx.val[aPos] = vdup_n_u8(0xFF);
			}
			vst4_u8(pRGB, bgrx);
			pY += 8;
			pCb += 8;
			pCr += 8;
			pRGB += 32;
		}

		for (x = 0; x < pad; x++)
		{
			const INT32 divisor = 16;
			const INT32 Y = ((*pY++) + 4096) << divisor;
			const INT32 Cb = (*pCb++);
			const INT32 Cr = (*pCr++);
			const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
			const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
			const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
			const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
			INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
			INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
			INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
			BYTE bgrx[4];
			bgrx[bPos] = CLIP(B);
			bgrx[gPos] = CLIP(G);
			bgrx[rPos] = CLIP(R);
			bgrx[aPos] = 0xFF;
			*pRGB++ = bgrx[0];
			*pRGB++ = bgrx[1];
			*pRGB++ = bgrx[2];
			*pRGB++ = bgrx[3];
		}

		pY += srcPad;
		pCb += srcPad;
		pCr += srcPad;
		pRGB += dstPad;
	}

	return PRIMITIVES_SUCCESS;
}

static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R(const INT16* const pSrc[3], UINT32 srcStep,
                                              BYTE* pDst, UINT32 dstStep, UINT32 DstFormat,
                                              const prim_size_t* roi)
{
	switch (DstFormat)
	{
		case PIXEL_FORMAT_BGRA32:
		case PIXEL_FORMAT_BGRX32:
			return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);

		case PIXEL_FORMAT_RGBA32:
		case PIXEL_FORMAT_RGBX32:
			return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);

		case PIXEL_FORMAT_ARGB32:
		case PIXEL_FORMAT_XRGB32:
			return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);

		case PIXEL_FORMAT_ABGR32:
		case PIXEL_FORMAT_XBGR32:
			return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);

		default:
			return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
	}
}

static pstatus_t
neon_RGBToRGB_16s8u_P3AC4R_X(const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
                             UINT32 srcStep,             /* bytes between rows in source data */
                             BYTE* pDst,                 /* 32-bit interleaved ARGB (ABGR?) data */
                             UINT32 dstStep,             /* bytes between rows in dest data */
                             const prim_size_t* roi,     /* region of interest */
                             uint8_t rPos, uint8_t gPos, uint8_t bPos, uint8_t aPos)
{
	UINT32 x, y;
	UINT32 pad = roi->width % 8;

	for (y = 0; y < roi->height; y++)
	{
		const INT16* pr = (INT16*)(((BYTE*)pSrc[0]) + y * srcStep);
		const INT16* pg = (INT16*)(((BYTE*)pSrc[1]) + y * srcStep);
		const INT16* pb = (INT16*)(((BYTE*)pSrc[2]) + y * srcStep);
		BYTE* dst = pDst + y * dstStep;

		for (x = 0; x < roi->width - pad; x += 8)
		{
			int16x8_t r = vld1q_s16(pr);
			int16x8_t g = vld1q_s16(pg);
			int16x8_t b = vld1q_s16(pb);
			uint8x8x4_t bgrx;
			bgrx.val[aPos] = vdup_n_u8(0xFF);
			bgrx.val[rPos] = vqmovun_s16(r);
			bgrx.val[gPos] = vqmovun_s16(g);
			bgrx.val[bPos] = vqmovun_s16(b);
			vst4_u8(dst, bgrx);
			pr += 8;
			pg += 8;
			pb += 8;
			dst += 32;
		}

		for (x = 0; x < pad; x++)
		{
			/* Clamp like the saturating narrow (vqmovun_s16) above. */
			BYTE bgrx[4];
			bgrx[bPos] = CLIP(*pb++);
			bgrx[gPos] = CLIP(*pg++);
			bgrx[rPos] = CLIP(*pr++);
			bgrx[aPos] = 0xFF;
			*dst++ = bgrx[0];
			*dst++ = bgrx[1];
			*dst++ = bgrx[2];
			*dst++ = bgrx[3];
		}
	}

	return PRIMITIVES_SUCCESS;
}

static pstatus_t
neon_RGBToRGB_16s8u_P3AC4R(const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
                           UINT32 srcStep,             /* bytes between rows in source data */
                           BYTE* pDst,                 /* 32-bit interleaved ARGB (ABGR?) data */
                           UINT32 dstStep,             /* bytes between rows in dest data */
                           UINT32 DstFormat, const prim_size_t* roi) /* region of interest */
{
	switch (DstFormat)
	{
		case PIXEL_FORMAT_BGRA32:
		case PIXEL_FORMAT_BGRX32:
			return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);

		case PIXEL_FORMAT_RGBA32:
		case PIXEL_FORMAT_RGBX32:
			return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);

		case PIXEL_FORMAT_ARGB32:
		case PIXEL_FORMAT_XRGB32:
			return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);

		case PIXEL_FORMAT_ABGR32:
		case PIXEL_FORMAT_XBGR32:
			return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);

		default:
			return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
	}
}
#endif /* WITH_NEON */
/* I don't see a direct IPP version of this, since the input is INT16
 * YCbCr.  It may be possible via Deinterleave and then YCbCrToRGB_<mod>.
 * But that would likely be slower.
 */

/* ------------------------------------------------------------------------- */
void primitives_init_colors_opt(primitives_t* prims)
{
	generic = primitives_get_generic();
	primitives_init_colors(prims);
#if defined(WITH_SSE2)

	if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
	{
		prims->RGBToRGB_16s8u_P3AC4R = sse2_RGBToRGB_16s8u_P3AC4R;
		prims->yCbCrToRGB_16s16s_P3P3 = sse2_yCbCrToRGB_16s16s_P3P3;
		prims->yCbCrToRGB_16s8u_P3AC4R = sse2_yCbCrToRGB_16s8u_P3AC4R;
		prims->RGBToYCbCr_16s16s_P3P3 = sse2_RGBToYCbCr_16s16s_P3P3;
	}

#elif defined(WITH_NEON)

	if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
	{
		prims->RGBToRGB_16s8u_P3AC4R = neon_RGBToRGB_16s8u_P3AC4R;
		prims->yCbCrToRGB_16s8u_P3AC4R = neon_yCbCrToRGB_16s8u_P3AC4R;
		prims->yCbCrToRGB_16s16s_P3P3 = neon_yCbCrToRGB_16s16s_P3P3;
	}

#endif /* WITH_SSE2 */
}
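
/* Usage sketch (illustrative only): the primitives framework normally calls
 * this during initialization; the stack-allocated table is hypothetical.
 *
 *   primitives_t prims = { 0 };
 *   primitives_init_colors_opt(&prims);
 *   // prims.yCbCrToRGB_16s8u_P3AC4R now points at the SSE2 or NEON variant
 *   // when the CPU supports it, and at the generic implementation otherwise.
 */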