1 /* FreeRDP: A Remote Desktop Protocol Client
2 * Optimized Color conversion operations.
3 * vi:ts=4 sw=4:
4 *
5 * Copyright 2011 Stephen Erisman
6 * Copyright 2011 Norbert Federa <norbert.federa@thincast.com>
7 * Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>
8 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
9 *
10 * Licensed under the Apache License, Version 2.0 (the "License"); you may
11 * not use this file except in compliance with the License. You may obtain
12 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
16 * or implied. See the License for the specific language governing
17 * permissions and limitations under the License.
18 */
19
20 #ifdef HAVE_CONFIG_H
21 #include "config.h"
22 #endif
23
24 #include <freerdp/types.h>
25 #include <freerdp/primitives.h>
26 #include <winpr/sysinfo.h>
27
28 #ifdef WITH_SSE2
29 #include <emmintrin.h>
30 #elif defined(WITH_NEON)
31 #include <arm_neon.h>
32 #endif /* WITH_SSE2 else WITH_NEON */
33
34 #include "prim_internal.h"
35 #include "prim_templates.h"
36
37 static primitives_t* generic = NULL;
38
39 #ifdef WITH_SSE2
40
#ifdef __GNUC__
/* Force inlining of the small SSE helper functions on GCC-compatible
 * compilers; expands to nothing elsewhere. */
#define GNU_INLINE __attribute__((__gnu_inline__, __always_inline__, __artificial__))
#else
#define GNU_INLINE
#endif

/* Prefetch granularity used by the DO_PREFETCH loops below
 * (assumed cache-line size in bytes). */
#define CACHE_LINE_BYTES 64

/* Clamp every signed 16-bit lane of _val into the range [_min, _max]
 * (both bounds are __m128i vectors). Multi-statement macro wrapped in
 * do/while(0) so it behaves like a single statement. */
#define _mm_between_epi16(_val, _min, _max) \
	do \
	{ \
		_val = _mm_min_epi16(_max, _mm_max_epi16(_val, _min)); \
	} while (0)
54
#ifdef DO_PREFETCH
/*---------------------------------------------------------------------------*/
/* Warm the cache for a whole buffer by issuing one non-temporal prefetch
 * per cache line. Only the first (num_bytes / 16) * 16 bytes are touched,
 * matching the vector-granular iteration of the callers. */
static inline void GNU_INLINE _mm_prefetch_buffer(char* buffer, int num_bytes)
{
	const size_t total = (num_bytes / sizeof(__m128i)) * sizeof(__m128i);
	size_t offset;

	for (offset = 0; offset < total; offset += CACHE_LINE_BYTES)
	{
		_mm_prefetch(buffer + offset, _MM_HINT_NTA);
	}
}
#endif /* DO_PREFETCH */
68
69 /*---------------------------------------------------------------------------*/
/* Convert three planes of 16-bit yCbCr into three planes of 16-bit R, G
 * and B using SSE2, eight pixels per vector iteration.  Falls back to the
 * generic scalar implementation whenever the aligned SSE loads/stores used
 * below cannot be guaranteed.
 *
 * pSrc    - Y, Cb and Cr source planes
 * srcStep - bytes between rows in the source planes
 * pDst    - R, G and B destination planes
 * dstStep - bytes between rows in the destination planes
 * roi     - width and height of the region to convert
 *
 * Returns PRIMITIVES_SUCCESS (or whatever the generic fallback returns).
 */
static pstatus_t sse2_yCbCrToRGB_16s16s_P3P3(const INT16* const pSrc[3], int srcStep,
                                             INT16* pDst[3], int dstStep,
                                             const prim_size_t* roi) /* region of interest */
{
	__m128i zero, max, r_cr, g_cb, g_cr, b_cb, c4096;
	__m128i *y_buf, *cb_buf, *cr_buf, *r_buf, *g_buf, *b_buf;
	UINT32 yp;
	int srcbump, dstbump, imax;

	/* All plane pointers must be 16-byte aligned, the width a multiple of
	 * 8 pixels and both steps a multiple of 128 bytes, otherwise the
	 * aligned SIMD accesses below would fault. */
	if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) ||
	    ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst[0]) & 0x0f) ||
	    ((ULONG_PTR)(pDst[1]) & 0x0f) || ((ULONG_PTR)(pDst[2]) & 0x0f) || (roi->width & 0x07) ||
	    (srcStep & 127) || (dstStep & 127))
	{
		/* We can't maintain 16-byte alignment. */
		return generic->yCbCrToRGB_16s16s_P3P3(pSrc, srcStep, pDst, dstStep, roi);
	}

	zero = _mm_setzero_si128();
	max = _mm_set1_epi16(255);
	y_buf = (__m128i*)(pSrc[0]);
	cb_buf = (__m128i*)(pSrc[1]);
	cr_buf = (__m128i*)(pSrc[2]);
	r_buf = (__m128i*)(pDst[0]);
	g_buf = (__m128i*)(pDst[1]);
	b_buf = (__m128i*)(pDst[2]);
	/* yCbCr->RGB coefficients as 2.14 fixed point; see the big comment in
	 * the pixel loop for the derivation. */
	r_cr = _mm_set1_epi16(22986);  /* 1.403 << 14 */
	g_cb = _mm_set1_epi16(-5636);  /* -0.344 << 14 */
	g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */
	b_cb = _mm_set1_epi16(28999);  /* 1.770 << 14 */
	c4096 = _mm_set1_epi16(4096);
	/* Row strides expressed in __m128i units. */
	srcbump = srcStep / sizeof(__m128i);
	dstbump = dstStep / sizeof(__m128i);
#ifdef DO_PREFETCH

	/* Prefetch Y's, Cb's, and Cr's. */
	for (yp = 0; yp < roi->height; yp++)
	{
		int i;

		for (i = 0; i < roi->width * sizeof(INT16) / sizeof(__m128i);
		     i += (CACHE_LINE_BYTES / sizeof(__m128i)))
		{
			_mm_prefetch((char*)(&y_buf[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&cb_buf[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&cr_buf[i]), _MM_HINT_NTA);
		}

		y_buf += srcbump;
		cb_buf += srcbump;
		cr_buf += srcbump;
	}

	/* Rewind the source pointers after the prefetch pass. */
	y_buf = (__m128i*)(pSrc[0]);
	cb_buf = (__m128i*)(pSrc[1]);
	cr_buf = (__m128i*)(pSrc[2]);
#endif /* DO_PREFETCH */
	/* Number of 8-pixel vectors per row. */
	imax = roi->width * sizeof(INT16) / sizeof(__m128i);

	for (yp = 0; yp < roi->height; ++yp)
	{
		int i;

		for (i = 0; i < imax; i++)
		{
			/* In order to use SSE2 signed 16-bit integer multiplication
			 * we need to convert the floating point factors to signed int
			 * without losing information.
			 * The result of this multiplication is 32 bit and we have two
			 * SSE instructions that return either the hi or lo word.
			 * Thus we will multiply the factors by the highest possible 2^n,
			 * take the upper 16 bits of the signed 32-bit result
			 * (_mm_mulhi_epi16) and correct this result by multiplying
			 * it by 2^(16-n).
			 *
			 * For the given factors in the conversion matrix the best
			 * possible n is 14.
			 *
			 * Example for calculating r:
			 * r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
			 * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
			 * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
			 * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
			 */
			/* y = (y_r_buf[i] + 4096) >> 2 */
			__m128i y, cb, cr, r, g, b;
			y = _mm_load_si128(y_buf + i);
			y = _mm_add_epi16(y, c4096);
			y = _mm_srai_epi16(y, 2);
			/* cb = cb_g_buf[i]; */
			cb = _mm_load_si128(cb_buf + i);
			/* cr = cr_b_buf[i]; */
			cr = _mm_load_si128(cr_buf + i);
			/* (y + HIWORD(cr*22986)) >> 3 */
			r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr));
			r = _mm_srai_epi16(r, 3);
			/* r_buf[i] = CLIP(r); */
			_mm_between_epi16(r, zero, max);
			_mm_store_si128(r_buf + i, r);
			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
			g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb));
			g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr));
			g = _mm_srai_epi16(g, 3);
			/* g_buf[i] = CLIP(g); */
			_mm_between_epi16(g, zero, max);
			_mm_store_si128(g_buf + i, g);
			/* (y + HIWORD(cb*28999)) >> 3 */
			b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb));
			b = _mm_srai_epi16(b, 3);
			/* b_buf[i] = CLIP(b); */
			_mm_between_epi16(b, zero, max);
			_mm_store_si128(b_buf + i, b);
		}

		y_buf += srcbump;
		cb_buf += srcbump;
		cr_buf += srcbump;
		r_buf += dstbump;
		g_buf += dstbump;
		b_buf += dstbump;
	}

	return PRIMITIVES_SUCCESS;
}
194
195 /*---------------------------------------------------------------------------*/
/* Convert planar 16-bit yCbCr into interleaved 8-bit BGRX pixels (alpha
 * forced to 0xFF) using SSE2, 16 pixels per iteration of the vector loop;
 * the remaining (width % 16) pixels of each row are handled by a scalar
 * tail loop.  The dispatcher (sse2_yCbCrToRGB_16s8u_P3AC4R) has already
 * verified the 16-byte alignment of the planes, destination and steps.
 */
static pstatus_t sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(const INT16* const pSrc[3], UINT32 srcStep,
                                                   BYTE* pDst, UINT32 dstStep,
                                                   const prim_size_t* roi) /* region of interest */
{
	const __m128i zero = _mm_setzero_si128();
	const __m128i max = _mm_set1_epi16(255);
	/* yCbCr->RGB coefficients as 2.14 fixed point; see the loop comment. */
	const __m128i r_cr = _mm_set1_epi16(22986);  /* 1.403 << 14 */
	const __m128i g_cb = _mm_set1_epi16(-5636);  /* -0.344 << 14 */
	const __m128i g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */
	const __m128i b_cb = _mm_set1_epi16(28999);  /* 1.770 << 14 */
	const __m128i c4096 = _mm_set1_epi16(4096);
	const INT16* y_buf = (INT16*)pSrc[0];
	const INT16* cb_buf = (INT16*)pSrc[1];
	const INT16* cr_buf = (INT16*)pSrc[2];
	const UINT32 pad = roi->width % 16; /* pixels left for the scalar tail */
	const UINT32 step = sizeof(__m128i) / sizeof(INT16); /* 8 INT16 per vector */
	const UINT32 imax = (roi->width - pad) * sizeof(INT16) / sizeof(__m128i);
	BYTE* d_buf = pDst;
	UINT32 yp;
	const size_t dstPad = (dstStep - roi->width * 4); /* bytes skipped per row */
#ifdef DO_PREFETCH

	/* Prefetch Y's, Cb's, and Cr's. */
	for (yp = 0; yp < roi->height; yp++)
	{
		int i;

		for (i = 0; i < imax; i += (CACHE_LINE_BYTES / sizeof(__m128i)))
		{
			_mm_prefetch((char*)(&((__m128i*)y_buf)[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&((__m128i*)cb_buf)[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&((__m128i*)cr_buf)[i]), _MM_HINT_NTA);
		}

		y_buf += srcStep / sizeof(INT16);
		cb_buf += srcStep / sizeof(INT16);
		cr_buf += srcStep / sizeof(INT16);
	}

	/* Rewind the source pointers after the prefetch pass. */
	y_buf = (INT16*)pSrc[0];
	cb_buf = (INT16*)pSrc[1];
	cr_buf = (INT16*)pSrc[2];
#endif /* DO_PREFETCH */

	for (yp = 0; yp < roi->height; ++yp)
	{
		UINT32 i;

		/* Two vectors (16 pixels) per iteration so a full 16-byte output
		 * store can be produced for every 4 pixels below. */
		for (i = 0; i < imax; i += 2)
		{
			/* In order to use SSE2 signed 16-bit integer multiplication
			 * we need to convert the floating point factors to signed int
			 * without losing information.
			 * The result of this multiplication is 32 bit and we have two
			 * SSE instructions that return either the hi or lo word.
			 * Thus we will multiply the factors by the highest possible 2^n,
			 * take the upper 16 bits of the signed 32-bit result
			 * (_mm_mulhi_epi16) and correct this result by multiplying
			 * it by 2^(16-n).
			 *
			 * For the given factors in the conversion matrix the best
			 * possible n is 14.
			 *
			 * Example for calculating r:
			 * r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
			 * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
			 * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
			 * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
			 */
			/* y = (y_r_buf[i] + 4096) >> 2 */
			__m128i y1, y2, cb1, cb2, cr1, cr2, r1, r2, g1, g2, b1, b2;
			y1 = _mm_load_si128((__m128i*)y_buf);
			y_buf += step;
			y1 = _mm_add_epi16(y1, c4096);
			y1 = _mm_srai_epi16(y1, 2);
			/* cb = cb_g_buf[i]; */
			cb1 = _mm_load_si128((__m128i*)cb_buf);
			cb_buf += step;
			/* cr = cr_b_buf[i]; */
			cr1 = _mm_load_si128((__m128i*)cr_buf);
			cr_buf += step;
			/* (y + HIWORD(cr*22986)) >> 3 */
			r1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cr1, r_cr));
			r1 = _mm_srai_epi16(r1, 3);
			/* r_buf[i] = CLIP(r); */
			_mm_between_epi16(r1, zero, max);
			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
			g1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, g_cb));
			g1 = _mm_add_epi16(g1, _mm_mulhi_epi16(cr1, g_cr));
			g1 = _mm_srai_epi16(g1, 3);
			/* g_buf[i] = CLIP(g); */
			_mm_between_epi16(g1, zero, max);
			/* (y + HIWORD(cb*28999)) >> 3 */
			b1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, b_cb));
			b1 = _mm_srai_epi16(b1, 3);
			/* b_buf[i] = CLIP(b); */
			_mm_between_epi16(b1, zero, max);
			/* Second vector: pixels 8..15 of this iteration. */
			y2 = _mm_load_si128((__m128i*)y_buf);
			y_buf += step;
			y2 = _mm_add_epi16(y2, c4096);
			y2 = _mm_srai_epi16(y2, 2);
			/* cb = cb_g_buf[i]; */
			cb2 = _mm_load_si128((__m128i*)cb_buf);
			cb_buf += step;
			/* cr = cr_b_buf[i]; */
			cr2 = _mm_load_si128((__m128i*)cr_buf);
			cr_buf += step;
			/* (y + HIWORD(cr*22986)) >> 3 */
			r2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cr2, r_cr));
			r2 = _mm_srai_epi16(r2, 3);
			/* r_buf[i] = CLIP(r); */
			_mm_between_epi16(r2, zero, max);
			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
			g2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, g_cb));
			g2 = _mm_add_epi16(g2, _mm_mulhi_epi16(cr2, g_cr));
			g2 = _mm_srai_epi16(g2, 3);
			/* g_buf[i] = CLIP(g); */
			_mm_between_epi16(g2, zero, max);
			/* (y + HIWORD(cb*28999)) >> 3 */
			b2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, b_cb));
			b2 = _mm_srai_epi16(b2, 3);
			/* b_buf[i] = CLIP(b); */
			_mm_between_epi16(b2, zero, max);
			{
				/* Interleave the 16-bit B/G/R lanes plus a 0xFF alpha into
				 * four 16-byte BGRX output vectors. */
				__m128i R0, R1, R2, R3, R4;
				/* The comments below pretend these are 8-byte registers
				 * rather than 16-byte, for readability.
				 */
				R0 = b1;                              /* R0 = 00B300B200B100B0 */
				R1 = b2;                              /* R1 = 00B700B600B500B4 */
				R0 = _mm_packus_epi16(R0, R1);        /* R0 = B7B6B5B4B3B2B1B0 */
				R1 = g1;                              /* R1 = 00G300G200G100G0 */
				R2 = g2;                              /* R2 = 00G700G600G500G4 */
				R1 = _mm_packus_epi16(R1, R2);        /* R1 = G7G6G5G4G3G2G1G0 */
				R2 = R1;                              /* R2 = G7G6G5G4G3G2G1G0 */
				R2 = _mm_unpacklo_epi8(R0, R2);       /* R2 = B3G3B2G2B1G1B0G0 */
				R1 = _mm_unpackhi_epi8(R0, R1);       /* R1 = B7G7B6G6B5G5B4G4 */
				R0 = r1;                              /* R0 = 00R300R200R100R0 */
				R3 = r2;                              /* R3 = 00R700R600R500R4 */
				R0 = _mm_packus_epi16(R0, R3);        /* R0 = R7R6R5R4R3R2R1R0 */
				R3 = _mm_set1_epi32(0xFFFFFFFFU);     /* R3 = FFFFFFFFFFFFFFFF */
				R4 = R3;                              /* R4 = FFFFFFFFFFFFFFFF */
				R4 = _mm_unpacklo_epi8(R0, R4);       /* R4 = R3FFR2FFR1FFR0FF */
				R3 = _mm_unpackhi_epi8(R0, R3);       /* R3 = R7FFR6FFR5FFR4FF */
				R0 = R4;                              /* R0 = R4 */
				R0 = _mm_unpacklo_epi16(R2, R0);      /* R0 = B1G1R1FFB0G0R0FF */
				R4 = _mm_unpackhi_epi16(R2, R4);      /* R4 = B3G3R3FFB2G2R2FF */
				R2 = R3;                              /* R2 = R3 */
				R2 = _mm_unpacklo_epi16(R1, R2);      /* R2 = B5G5R5FFB4G4R4FF */
				R3 = _mm_unpackhi_epi16(R1, R3);      /* R3 = B7G7R7FFB6G6R6FF */
				_mm_store_si128((__m128i*)d_buf, R0); /* B1G1R1FFB0G0R0FF */
				d_buf += sizeof(__m128i);
				_mm_store_si128((__m128i*)d_buf, R4); /* B3G3R3FFB2G2R2FF */
				d_buf += sizeof(__m128i);
				_mm_store_si128((__m128i*)d_buf, R2); /* B5G5R5FFB4G4R4FF */
				d_buf += sizeof(__m128i);
				_mm_store_si128((__m128i*)d_buf, R3); /* B7G7R7FFB6G6R6FF */
				d_buf += sizeof(__m128i);
			}
		}

		/* Scalar conversion of the trailing (width % 16) pixels, using the
		 * same fixed-point math in 32-bit integers.
		 * NOTE(review): (y + 4096) << 16 can exceed INT32 range for large
		 * Y values - appears to rely on inputs staying in range; confirm. */
		for (i = 0; i < pad; i++)
		{
			const INT32 divisor = 16;
			const INT32 Y = ((*y_buf++) + 4096) << divisor;
			const INT32 Cb = (*cb_buf++);
			const INT32 Cr = (*cr_buf++);
			const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
			const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
			const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
			const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
			const INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
			const INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
			const INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
			*d_buf++ = CLIP(B);
			*d_buf++ = CLIP(G);
			*d_buf++ = CLIP(R);
			*d_buf++ = 0xFF;
		}

		d_buf += dstPad;
	}

	return PRIMITIVES_SUCCESS;
}
381
382 /*---------------------------------------------------------------------------*/
/* Convert planar 16-bit yCbCr into interleaved 8-bit RGBX pixels (alpha
 * forced to 0xFF) using SSE2; identical to the BGRX variant above except
 * that the R and B channels swap places in the output interleave.
 * 16 pixels per vector-loop iteration, scalar tail for (width % 16).
 * Alignment is guaranteed by the dispatcher.
 */
static pstatus_t sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(const INT16* const pSrc[3], UINT32 srcStep,
                                                   BYTE* pDst, UINT32 dstStep,
                                                   const prim_size_t* roi) /* region of interest */
{
	const __m128i zero = _mm_setzero_si128();
	const __m128i max = _mm_set1_epi16(255);
	/* yCbCr->RGB coefficients as 2.14 fixed point; see the loop comment. */
	const __m128i r_cr = _mm_set1_epi16(22986);  /* 1.403 << 14 */
	const __m128i g_cb = _mm_set1_epi16(-5636);  /* -0.344 << 14 */
	const __m128i g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */
	const __m128i b_cb = _mm_set1_epi16(28999);  /* 1.770 << 14 */
	const __m128i c4096 = _mm_set1_epi16(4096);
	const INT16* y_buf = (INT16*)pSrc[0];
	const INT16* cb_buf = (INT16*)pSrc[1];
	const INT16* cr_buf = (INT16*)pSrc[2];
	const UINT32 pad = roi->width % 16; /* pixels left for the scalar tail */
	const UINT32 step = sizeof(__m128i) / sizeof(INT16); /* 8 INT16 per vector */
	const UINT32 imax = (roi->width - pad) * sizeof(INT16) / sizeof(__m128i);
	BYTE* d_buf = pDst;
	UINT32 yp;
	const size_t dstPad = (dstStep - roi->width * 4); /* bytes skipped per row */
#ifdef DO_PREFETCH

	/* Prefetch Y's, Cb's, and Cr's. */
	for (yp = 0; yp < roi->height; yp++)
	{
		int i;

		for (i = 0; i < imax; i += (CACHE_LINE_BYTES / sizeof(__m128i)))
		{
			_mm_prefetch((char*)(&((__m128i*)y_buf)[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&((__m128i*)cb_buf)[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&((__m128i*)cr_buf)[i]), _MM_HINT_NTA);
		}

		y_buf += srcStep / sizeof(INT16);
		cb_buf += srcStep / sizeof(INT16);
		cr_buf += srcStep / sizeof(INT16);
	}

	/* Rewind the source pointers after the prefetch pass. */
	y_buf = (INT16*)(pSrc[0]);
	cb_buf = (INT16*)(pSrc[1]);
	cr_buf = (INT16*)(pSrc[2]);
#endif /* DO_PREFETCH */

	for (yp = 0; yp < roi->height; ++yp)
	{
		UINT32 i;

		/* Two vectors (16 pixels) per iteration so a full 16-byte output
		 * store can be produced for every 4 pixels below. */
		for (i = 0; i < imax; i += 2)
		{
			/* In order to use SSE2 signed 16-bit integer multiplication
			 * we need to convert the floating point factors to signed int
			 * without losing information.
			 * The result of this multiplication is 32 bit and we have two
			 * SSE instructions that return either the hi or lo word.
			 * Thus we will multiply the factors by the highest possible 2^n,
			 * take the upper 16 bits of the signed 32-bit result
			 * (_mm_mulhi_epi16) and correct this result by multiplying
			 * it by 2^(16-n).
			 *
			 * For the given factors in the conversion matrix the best
			 * possible n is 14.
			 *
			 * Example for calculating r:
			 * r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
			 * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
			 * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
			 * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
			 */
			/* y = (y_r_buf[i] + 4096) >> 2 */
			__m128i y1, y2, cb1, cb2, cr1, cr2, r1, r2, g1, g2, b1, b2;
			y1 = _mm_load_si128((__m128i*)y_buf);
			y_buf += step;
			y1 = _mm_add_epi16(y1, c4096);
			y1 = _mm_srai_epi16(y1, 2);
			/* cb = cb_g_buf[i]; */
			cb1 = _mm_load_si128((__m128i*)cb_buf);
			cb_buf += step;
			/* cr = cr_b_buf[i]; */
			cr1 = _mm_load_si128((__m128i*)cr_buf);
			cr_buf += step;
			/* (y + HIWORD(cr*22986)) >> 3 */
			r1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cr1, r_cr));
			r1 = _mm_srai_epi16(r1, 3);
			/* r_buf[i] = CLIP(r); */
			_mm_between_epi16(r1, zero, max);
			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
			g1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, g_cb));
			g1 = _mm_add_epi16(g1, _mm_mulhi_epi16(cr1, g_cr));
			g1 = _mm_srai_epi16(g1, 3);
			/* g_buf[i] = CLIP(g); */
			_mm_between_epi16(g1, zero, max);
			/* (y + HIWORD(cb*28999)) >> 3 */
			b1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cb1, b_cb));
			b1 = _mm_srai_epi16(b1, 3);
			/* b_buf[i] = CLIP(b); */
			_mm_between_epi16(b1, zero, max);
			/* Second vector: pixels 8..15 of this iteration. */
			y2 = _mm_load_si128((__m128i*)y_buf);
			y_buf += step;
			y2 = _mm_add_epi16(y2, c4096);
			y2 = _mm_srai_epi16(y2, 2);
			/* cb = cb_g_buf[i]; */
			cb2 = _mm_load_si128((__m128i*)cb_buf);
			cb_buf += step;
			/* cr = cr_b_buf[i]; */
			cr2 = _mm_load_si128((__m128i*)cr_buf);
			cr_buf += step;
			/* (y + HIWORD(cr*22986)) >> 3 */
			r2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cr2, r_cr));
			r2 = _mm_srai_epi16(r2, 3);
			/* r_buf[i] = CLIP(r); */
			_mm_between_epi16(r2, zero, max);
			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
			g2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, g_cb));
			g2 = _mm_add_epi16(g2, _mm_mulhi_epi16(cr2, g_cr));
			g2 = _mm_srai_epi16(g2, 3);
			/* g_buf[i] = CLIP(g); */
			_mm_between_epi16(g2, zero, max);
			/* (y + HIWORD(cb*28999)) >> 3 */
			b2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cb2, b_cb));
			b2 = _mm_srai_epi16(b2, 3);
			/* b_buf[i] = CLIP(b); */
			_mm_between_epi16(b2, zero, max);
			{
				/* Interleave the 16-bit R/G/B lanes plus a 0xFF alpha into
				 * four 16-byte RGBX output vectors (R and B swapped
				 * relative to the BGRX variant). */
				__m128i R0, R1, R2, R3, R4;
				/* The comments below pretend these are 8-byte registers
				 * rather than 16-byte, for readability.
				 */
				R0 = r1;                              /* R0 = 00R300R200R100R0 */
				R1 = r2;                              /* R1 = 00R700R600R500R4 */
				R0 = _mm_packus_epi16(R0, R1);        /* R0 = R7R6R5R4R3R2R1R0 */
				R1 = g1;                              /* R1 = 00G300G200G100G0 */
				R2 = g2;                              /* R2 = 00G700G600G500G4 */
				R1 = _mm_packus_epi16(R1, R2);        /* R1 = G7G6G5G4G3G2G1G0 */
				R2 = R1;                              /* R2 = G7G6G5G4G3G2G1G0 */
				R2 = _mm_unpacklo_epi8(R0, R2);       /* R2 = R3G3R2G2R1G1R0G0 */
				R1 = _mm_unpackhi_epi8(R0, R1);       /* R1 = R7G7R6G6R5G5R4G4 */
				R0 = b1;                              /* R0 = 00B300B200B100B0 */
				R3 = b2;                              /* R3 = 00B700B600B500B4 */
				R0 = _mm_packus_epi16(R0, R3);        /* R0 = B7B6B5B4B3B2B1B0 */
				R3 = _mm_set1_epi32(0xFFFFFFFFU);     /* R3 = FFFFFFFFFFFFFFFF */
				R4 = R3;                              /* R4 = FFFFFFFFFFFFFFFF */
				R4 = _mm_unpacklo_epi8(R0, R4);       /* R4 = B3FFB2FFB1FFB0FF */
				R3 = _mm_unpackhi_epi8(R0, R3);       /* R3 = B7FFB6FFB5FFB4FF */
				R0 = R4;                              /* R0 = R4 */
				R0 = _mm_unpacklo_epi16(R2, R0);      /* R0 = R1G1B1FFR0G0B0FF */
				R4 = _mm_unpackhi_epi16(R2, R4);      /* R4 = R3G3B3FFR2G2B2FF */
				R2 = R3;                              /* R2 = R3 */
				R2 = _mm_unpacklo_epi16(R1, R2);      /* R2 = R5G5B5FFR4G4B4FF */
				R3 = _mm_unpackhi_epi16(R1, R3);      /* R3 = R7G7B7FFR6G6B6FF */
				_mm_store_si128((__m128i*)d_buf, R0); /* R1G1B1FFR0G0B0FF */
				d_buf += sizeof(__m128i);
				_mm_store_si128((__m128i*)d_buf, R4); /* R3G3B3FFR2G2B2FF */
				d_buf += sizeof(__m128i);
				_mm_store_si128((__m128i*)d_buf, R2); /* R5G5B5FFR4G4B4FF */
				d_buf += sizeof(__m128i);
				_mm_store_si128((__m128i*)d_buf, R3); /* R7G7B7FFR6G6B6FF */
				d_buf += sizeof(__m128i);
			}
		}

		/* Scalar conversion of the trailing (width % 16) pixels, using the
		 * same fixed-point math in 32-bit integers.
		 * NOTE(review): (y + 4096) << 16 can exceed INT32 range for large
		 * Y values - appears to rely on inputs staying in range; confirm. */
		for (i = 0; i < pad; i++)
		{
			const INT32 divisor = 16;
			const INT32 Y = ((*y_buf++) + 4096) << divisor;
			const INT32 Cb = (*cb_buf++);
			const INT32 Cr = (*cr_buf++);
			const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
			const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
			const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
			const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
			const INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
			const INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
			const INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
			*d_buf++ = CLIP(R);
			*d_buf++ = CLIP(G);
			*d_buf++ = CLIP(B);
			*d_buf++ = 0xFF;
		}

		d_buf += dstPad;
	}

	return PRIMITIVES_SUCCESS;
}
568
sse2_yCbCrToRGB_16s8u_P3AC4R(const INT16 * const pSrc[3],UINT32 srcStep,BYTE * pDst,UINT32 dstStep,UINT32 DstFormat,const prim_size_t * roi)569 static pstatus_t sse2_yCbCrToRGB_16s8u_P3AC4R(const INT16* const pSrc[3], UINT32 srcStep,
570 BYTE* pDst, UINT32 dstStep, UINT32 DstFormat,
571 const prim_size_t* roi) /* region of interest */
572 {
573 if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) ||
574 ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst)&0x0f) || (srcStep & 0x0f) ||
575 (dstStep & 0x0f))
576 {
577 /* We can't maintain 16-byte alignment. */
578 return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
579 }
580
581 switch (DstFormat)
582 {
583 case PIXEL_FORMAT_BGRA32:
584 case PIXEL_FORMAT_BGRX32:
585 return sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);
586
587 case PIXEL_FORMAT_RGBA32:
588 case PIXEL_FORMAT_RGBX32:
589 return sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(pSrc, srcStep, pDst, dstStep, roi);
590
591 default:
592 return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
593 }
594 }
/* The encoded YCbCr coefficients are represented as 11.5 fixed-point
 * numbers. See the general code above.
 */
sse2_RGBToYCbCr_16s16s_P3P3(const INT16 * const pSrc[3],int srcStep,INT16 * pDst[3],int dstStep,const prim_size_t * roi)598 static pstatus_t sse2_RGBToYCbCr_16s16s_P3P3(const INT16* const pSrc[3], int srcStep,
599 INT16* pDst[3], int dstStep,
600 const prim_size_t* roi) /* region of interest */
601 {
602 __m128i min, max, y_r, y_g, y_b, cb_r, cb_g, cb_b, cr_r, cr_g, cr_b;
603 __m128i *r_buf, *g_buf, *b_buf, *y_buf, *cb_buf, *cr_buf;
604 UINT32 yp;
605 int srcbump, dstbump, imax;
606
607 if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) ||
608 ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst[0]) & 0x0f) ||
609 ((ULONG_PTR)(pDst[1]) & 0x0f) || ((ULONG_PTR)(pDst[2]) & 0x0f) || (roi->width & 0x07) ||
610 (srcStep & 127) || (dstStep & 127))
611 {
612 /* We can't maintain 16-byte alignment. */
613 return generic->RGBToYCbCr_16s16s_P3P3(pSrc, srcStep, pDst, dstStep, roi);
614 }
615
616 min = _mm_set1_epi16(-128 * 32);
617 max = _mm_set1_epi16(127 * 32);
618 r_buf = (__m128i*)(pSrc[0]);
619 g_buf = (__m128i*)(pSrc[1]);
620 b_buf = (__m128i*)(pSrc[2]);
621 y_buf = (__m128i*)(pDst[0]);
622 cb_buf = (__m128i*)(pDst[1]);
623 cr_buf = (__m128i*)(pDst[2]);
624 y_r = _mm_set1_epi16(9798); /* 0.299000 << 15 */
625 y_g = _mm_set1_epi16(19235); /* 0.587000 << 15 */
626 y_b = _mm_set1_epi16(3735); /* 0.114000 << 15 */
627 cb_r = _mm_set1_epi16(-5535); /* -0.168935 << 15 */
628 cb_g = _mm_set1_epi16(-10868); /* -0.331665 << 15 */
629 cb_b = _mm_set1_epi16(16403); /* 0.500590 << 15 */
630 cr_r = _mm_set1_epi16(16377); /* 0.499813 << 15 */
631 cr_g = _mm_set1_epi16(-13714); /* -0.418531 << 15 */
632 cr_b = _mm_set1_epi16(-2663); /* -0.081282 << 15 */
633 srcbump = srcStep / sizeof(__m128i);
634 dstbump = dstStep / sizeof(__m128i);
635 #ifdef DO_PREFETCH
636
637 /* Prefetch RGB's. */
638 for (yp = 0; yp < roi->height; yp++)
639 {
640 int i;
641
642 for (i = 0; i < roi->width * sizeof(INT16) / sizeof(__m128i);
643 i += (CACHE_LINE_BYTES / sizeof(__m128i)))
644 {
645 _mm_prefetch((char*)(&r_buf[i]), _MM_HINT_NTA);
646 _mm_prefetch((char*)(&g_buf[i]), _MM_HINT_NTA);
647 _mm_prefetch((char*)(&b_buf[i]), _MM_HINT_NTA);
648 }
649
650 r_buf += srcbump;
651 g_buf += srcbump;
652 b_buf += srcbump;
653 }
654
655 r_buf = (__m128i*)(pSrc[0]);
656 g_buf = (__m128i*)(pSrc[1]);
657 b_buf = (__m128i*)(pSrc[2]);
658 #endif /* DO_PREFETCH */
659 imax = roi->width * sizeof(INT16) / sizeof(__m128i);
660
661 for (yp = 0; yp < roi->height; ++yp)
662 {
663 int i;
664
665 for (i = 0; i < imax; i++)
666 {
667 /* In order to use SSE2 signed 16-bit integer multiplication we
668 * need to convert the floating point factors to signed int
669 * without loosing information. The result of this multiplication
670 * is 32 bit and using SSE2 we get either the product's hi or lo
671 * word. Thus we will multiply the factors by the highest
672 * possible 2^n and take the upper 16 bits of the signed 32-bit
673 * result (_mm_mulhi_epi16). Since the final result needs to
674 * be scaled by << 5 and also in in order to keep the precision
675 * within the upper 16 bits we will also have to scale the RGB
676 * values used in the multiplication by << 5+(16-n).
677 */
678 __m128i r, g, b, y, cb, cr;
679 r = _mm_load_si128(y_buf + i);
680 g = _mm_load_si128(g_buf + i);
681 b = _mm_load_si128(b_buf + i);
682 /* r<<6; g<<6; b<<6 */
683 r = _mm_slli_epi16(r, 6);
684 g = _mm_slli_epi16(g, 6);
685 b = _mm_slli_epi16(b, 6);
686 /* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */
687 y = _mm_mulhi_epi16(r, y_r);
688 y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
689 y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
690 y = _mm_add_epi16(y, min);
691 /* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
692 _mm_between_epi16(y, min, max);
693 _mm_store_si128(y_buf + i, y);
694 /* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
695 cb = _mm_mulhi_epi16(r, cb_r);
696 cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
697 cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
698 /* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
699 _mm_between_epi16(cb, min, max);
700 _mm_store_si128(cb_buf + i, cb);
701 /* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
702 cr = _mm_mulhi_epi16(r, cr_r);
703 cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
704 cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
705 /* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
706 _mm_between_epi16(cr, min, max);
707 _mm_store_si128(cr_buf + i, cr);
708 }
709
710 y_buf += srcbump;
711 cb_buf += srcbump;
712 cr_buf += srcbump;
713 r_buf += dstbump;
714 g_buf += dstbump;
715 b_buf += dstbump;
716 }
717
718 return PRIMITIVES_SUCCESS;
719 }
720
721 /*---------------------------------------------------------------------------*/
/* Pack three planes of 16-bit R, G and B samples into interleaved 8-bit
 * BGRX pixels (alpha forced to 0xFF): 16 pixels per SSE2 iteration, with
 * a scalar tail for the remaining (width % 16) pixels of each row.
 * _mm_packus_epi16 saturates each 16-bit sample into the 0..255 range.
 * NOTE(review): the aligned loads/stores assume 16-byte aligned planes,
 * destination and steps - confirm the caller guarantees this. */
static pstatus_t
sse2_RGBToRGB_16s8u_P3AC4R_BGRX(const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
                                UINT32 srcStep,             /* bytes between rows in source data */
                                BYTE* pDst,                 /* 32-bit interleaved ARGB (ABGR?) data */
                                UINT32 dstStep,             /* bytes between rows in dest data */
                                const prim_size_t* roi)     /* region of interest */
{
	const UINT16* pr = (const UINT16*)(pSrc[0]);
	const UINT16* pg = (const UINT16*)(pSrc[1]);
	const UINT16* pb = (const UINT16*)(pSrc[2]);
	const UINT32 pad = roi->width % 16; /* pixels left for the scalar tail */
	const __m128i a = _mm_set1_epi32(0xFFFFFFFFU); /* constant alpha bytes */
	BYTE* out;
	UINT32 srcbump, dstbump, y;
	out = (BYTE*)pDst;
	/* srcbump is in UINT16 units, dstbump in bytes. */
	srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
	dstbump = (dstStep - (roi->width * sizeof(UINT32)));

	for (y = 0; y < roi->height; ++y)
	{
		UINT32 x;

		for (x = 0; x < roi->width - pad; x += 16)
		{
			__m128i r, g, b;
			/* The comments below pretend these are 8-byte registers
			 * rather than 16-byte, for readability.
			 */
			{
				__m128i R0, R1;
				R0 = _mm_load_si128((__m128i*)pb);
				pb += 8;                      /* R0 = 00B300B200B100B0 */
				R1 = _mm_load_si128((__m128i*)pb);
				pb += 8;                      /* R1 = 00B700B600B500B4 */
				b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
			}
			{
				__m128i R0, R1;
				R0 = _mm_load_si128((__m128i*)pg);
				pg += 8;                      /* R0 = 00G300G200G100G0 */
				R1 = _mm_load_si128((__m128i*)pg);
				pg += 8;                      /* R1 = 00G700G600G500G4 */
				g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
			}
			{
				__m128i R0, R1;
				R0 = _mm_load_si128((__m128i*)pr);
				pr += 8;                      /* R0 = 00R300R200R100R0 */
				R1 = _mm_load_si128((__m128i*)pr);
				pr += 8;                      /* R1 = 00R700R600R500R4 */
				r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
			}
			{
				/* Interleave B,G pairs and R,FF pairs, then combine them
				 * into four 16-byte BGRX output vectors. */
				__m128i gbHi, gbLo, arHi, arLo;
				{
					gbLo = _mm_unpacklo_epi8(b, g); /* gbLo = G3B3G2B2G1B1G0B0 */
					gbHi = _mm_unpackhi_epi8(b, g); /* gbHi = G7B7G6B6G5B5G4B4 */
					arLo = _mm_unpacklo_epi8(r, a); /* arLo = FFR3FFR2FFR1FFR0 */
					arHi = _mm_unpackhi_epi8(r, a); /* arHi = FFR7FFR6FFR5FFR4 */
				}
				{
					const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* FFR1G1B1FFR0G0B0 */
				}
				{
					const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* FFR3G3B3FFR2G2B2 */
				}
				{
					const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* FFR5G5B5FFR4G4B4 */
				}
				{
					const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* FFR7G7B7FFR6G6B6 */
				}
			}
		}

		/* Scalar tail: clamp and interleave the remaining pixels. */
		for (x = 0; x < pad; x++)
		{
			const BYTE R = CLIP(*pr++);
			const BYTE G = CLIP(*pg++);
			const BYTE B = CLIP(*pb++);
			*out++ = B;
			*out++ = G;
			*out++ = R;
			*out++ = 0xFF;
		}

		/* Jump to next row. */
		pr += srcbump;
		pg += srcbump;
		pb += srcbump;
		out += dstbump;
	}

	return PRIMITIVES_SUCCESS;
}
825
/* Pack three planes of 16-bit R, G and B samples into interleaved 8-bit
 * RGBX pixels (alpha forced to 0xFF); same structure as the BGRX variant
 * above with R and B swapped in the interleave.  Note the gb*/
/* and ar*
 * variable names are inherited from the BGRX variant: here gb* actually
 * holds G/R pairs and ar* holds FF/B pairs.
 * NOTE(review): the aligned loads/stores assume 16-byte aligned planes,
 * destination and steps - confirm the caller guarantees this. */
static pstatus_t
sse2_RGBToRGB_16s8u_P3AC4R_RGBX(const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
                                UINT32 srcStep,             /* bytes between rows in source data */
                                BYTE* pDst,                 /* 32-bit interleaved ARGB (ABGR?) data */
                                UINT32 dstStep,             /* bytes between rows in dest data */
                                const prim_size_t* roi)     /* region of interest */
{
	const UINT16* pr = (const UINT16*)(pSrc[0]);
	const UINT16* pg = (const UINT16*)(pSrc[1]);
	const UINT16* pb = (const UINT16*)(pSrc[2]);
	const UINT32 pad = roi->width % 16; /* pixels left for the scalar tail */
	const __m128i a = _mm_set1_epi32(0xFFFFFFFFU); /* constant alpha bytes */
	BYTE* out;
	UINT32 srcbump, dstbump, y;
	out = (BYTE*)pDst;
	/* srcbump is in UINT16 units, dstbump in bytes. */
	srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
	dstbump = (dstStep - (roi->width * sizeof(UINT32)));

	for (y = 0; y < roi->height; ++y)
	{
		UINT32 x;

		for (x = 0; x < roi->width - pad; x += 16)
		{
			__m128i r, g, b;
			/* The comments below pretend these are 8-byte registers
			 * rather than 16-byte, for readability.
			 */
			{
				__m128i R0, R1;
				R0 = _mm_load_si128((__m128i*)pb);
				pb += 8;                      /* R0 = 00B300B200B100B0 */
				R1 = _mm_load_si128((__m128i*)pb);
				pb += 8;                      /* R1 = 00B700B600B500B4 */
				b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
			}
			{
				__m128i R0, R1;
				R0 = _mm_load_si128((__m128i*)pg);
				pg += 8;                      /* R0 = 00G300G200G100G0 */
				R1 = _mm_load_si128((__m128i*)pg);
				pg += 8;                      /* R1 = 00G700G600G500G4 */
				g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
			}
			{
				__m128i R0, R1;
				R0 = _mm_load_si128((__m128i*)pr);
				pr += 8;                      /* R0 = 00R300R200R100R0 */
				R1 = _mm_load_si128((__m128i*)pr);
				pr += 8;                      /* R1 = 00R700R600R500R4 */
				r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
			}
			{
				/* Interleave R,G pairs and B,FF pairs, then combine them
				 * into four 16-byte RGBX output vectors. */
				__m128i gbHi, gbLo, arHi, arLo;
				{
					gbLo = _mm_unpacklo_epi8(r, g); /* gbLo = G3R3G2R2G1R1G0R0 */
					gbHi = _mm_unpackhi_epi8(r, g); /* gbHi = G7R7G6R6G5R5G4R4 */
					arLo = _mm_unpacklo_epi8(b, a); /* arLo = FFB3FFB2FFB1FFB0 */
					arHi = _mm_unpackhi_epi8(b, a); /* arHi = FFB7FFB6FFB5FFB4 */
				}
				{
					const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* FFB1G1R1FFB0G0R0 */
				}
				{
					const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* FFB3G3R3FFB2G2R2 */
				}
				{
					const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* FFB5G5R5FFB4G4R4 */
				}
				{
					const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* FFB7G7R7FFB6G6R6 */
				}
			}
		}

		/* Scalar tail: clamp and interleave the remaining pixels. */
		for (x = 0; x < pad; x++)
		{
			const BYTE R = CLIP(*pr++);
			const BYTE G = CLIP(*pg++);
			const BYTE B = CLIP(*pb++);
			*out++ = R;
			*out++ = G;
			*out++ = B;
			*out++ = 0xFF;
		}

		/* Jump to next row. */
		pr += srcbump;
		pg += srcbump;
		pb += srcbump;
		out += dstbump;
	}

	return PRIMITIVES_SUCCESS;
}
929
/* Pack three planar 16-bit R/G/B arrays into interleaved 32-bit XBGR pixels
 * (memory byte order 0xFF, B, G, R), 16 pixels per SSE2 iteration plus a
 * scalar tail for the remaining width % 16 pixels.
 * All pointers and strides must be 16-byte aligned (the dispatcher checks).
 */
static pstatus_t
sse2_RGBToRGB_16s8u_P3AC4R_XBGR(const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
                                UINT32 srcStep, /* bytes between rows in source data */
                                BYTE* pDst, /* 32-bit interleaved XBGR output */
                                UINT32 dstStep, /* bytes between rows in dest data */
                                const prim_size_t* roi) /* region of interest */
{
	const UINT16* pr = (const UINT16*)(pSrc[0]);
	const UINT16* pg = (const UINT16*)(pSrc[1]);
	const UINT16* pb = (const UINT16*)(pSrc[2]);
	const UINT32 pad = roi->width % 16; /* pixels handled by the scalar tail */
	const __m128i a = _mm_set1_epi32(0xFFFFFFFFU); /* alpha bytes, all 0xFF */
	BYTE* out;
	UINT32 srcbump, dstbump, y;
	out = (BYTE*)pDst;
	/* Per-row advance past the consumed width: elements for src, bytes for dst. */
	srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
	dstbump = (dstStep - (roi->width * sizeof(UINT32)));

	for (y = 0; y < roi->height; ++y)
	{
		UINT32 x;

		for (x = 0; x < roi->width - pad; x += 16)
		{
			__m128i r, g, b;
			/* The comments below pretend these are 8-byte registers
			 * rather than 16-byte, for readability.
			 */
			{
				__m128i R0, R1;
				R0 = _mm_load_si128((__m128i*)pb);
				pb += 8; /* R0 = 00B300B200B100B0 */
				R1 = _mm_load_si128((__m128i*)pb);
				pb += 8; /* R1 = 00B700B600B500B4 */
				b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 (saturated) */
			}
			{
				__m128i R0, R1;
				R0 = _mm_load_si128((__m128i*)pg);
				pg += 8; /* R0 = 00G300G200G100G0 */
				R1 = _mm_load_si128((__m128i*)pg);
				pg += 8; /* R1 = 00G700G600G500G4 */
				g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
			}
			{
				__m128i R0, R1;
				R0 = _mm_load_si128((__m128i*)pr);
				pr += 8; /* R0 = 00R300R200R100R0 */
				R1 = _mm_load_si128((__m128i*)pr);
				pr += 8; /* R1 = 00R700R600R500R4 */
				r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
			}
			{
				/* NOTE: the gb/ar names are inherited from the BGRX variant;
				 * for XBGR they actually hold alpha/b resp. g/r byte pairs. */
				__m128i gbHi, gbLo, arHi, arLo;
				{
					gbLo = _mm_unpacklo_epi8(a, b); /* B3FFB2FFB1FFB0FF */
					gbHi = _mm_unpackhi_epi8(a, b); /* B7FFB6FFB5FFB4FF */
					arLo = _mm_unpacklo_epi8(g, r); /* R3G3R2G2R1G1R0G0 */
					arHi = _mm_unpackhi_epi8(g, r); /* R7G7R6G6R5G5R4G4 */
				}
				{
					const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* R1G1B1FFR0G0B0FF */
				}
				{
					const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* R3G3B3FFR2G2B2FF */
				}
				{
					const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* R5G5B5FFR4G4B4FF */
				}
				{
					const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* R7G7B7FFR6G6B6FF */
				}
			}
		}

		/* Scalar tail: last width % 16 pixels, CLIP matches packus saturation. */
		for (x = 0; x < pad; x++)
		{
			const BYTE R = CLIP(*pr++);
			const BYTE G = CLIP(*pg++);
			const BYTE B = CLIP(*pb++);
			*out++ = 0xFF;
			*out++ = B;
			*out++ = G;
			*out++ = R;
		}

		/* Jump to next row. */
		pr += srcbump;
		pg += srcbump;
		pb += srcbump;
		out += dstbump;
	}

	return PRIMITIVES_SUCCESS;
}
1033
/* Pack three planar 16-bit R/G/B arrays into interleaved 32-bit XRGB pixels
 * (memory byte order 0xFF, R, G, B), 16 pixels per SSE2 iteration plus a
 * scalar tail for the remaining width % 16 pixels.
 * All pointers and strides must be 16-byte aligned (the dispatcher checks).
 */
static pstatus_t
sse2_RGBToRGB_16s8u_P3AC4R_XRGB(const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
                                UINT32 srcStep, /* bytes between rows in source data */
                                BYTE* pDst, /* 32-bit interleaved XRGB output */
                                UINT32 dstStep, /* bytes between rows in dest data */
                                const prim_size_t* roi) /* region of interest */
{
	const UINT16* pr = (const UINT16*)(pSrc[0]);
	const UINT16* pg = (const UINT16*)(pSrc[1]);
	const UINT16* pb = (const UINT16*)(pSrc[2]);
	const __m128i a = _mm_set1_epi32(0xFFFFFFFFU); /* alpha bytes, all 0xFF */
	const UINT32 pad = roi->width % 16; /* pixels handled by the scalar tail */
	BYTE* out;
	UINT32 srcbump, dstbump, y;
	out = (BYTE*)pDst;
	/* Per-row advance past the consumed width: elements for src, bytes for dst. */
	srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
	dstbump = (dstStep - (roi->width * sizeof(UINT32)));

	for (y = 0; y < roi->height; ++y)
	{
		UINT32 x;

		for (x = 0; x < roi->width - pad; x += 16)
		{
			__m128i r, g, b;
			/* The comments below pretend these are 8-byte registers
			 * rather than 16-byte, for readability.
			 */
			{
				__m128i R0, R1;
				R0 = _mm_load_si128((__m128i*)pb);
				pb += 8; /* R0 = 00B300B200B100B0 */
				R1 = _mm_load_si128((__m128i*)pb);
				pb += 8; /* R1 = 00B700B600B500B4 */
				b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 (saturated) */
			}
			{
				__m128i R0, R1;
				R0 = _mm_load_si128((__m128i*)pg);
				pg += 8; /* R0 = 00G300G200G100G0 */
				R1 = _mm_load_si128((__m128i*)pg);
				pg += 8; /* R1 = 00G700G600G500G4 */
				g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
			}
			{
				__m128i R0, R1;
				R0 = _mm_load_si128((__m128i*)pr);
				pr += 8; /* R0 = 00R300R200R100R0 */
				R1 = _mm_load_si128((__m128i*)pr);
				pr += 8; /* R1 = 00R700R600R500R4 */
				r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
			}
			{
				/* NOTE: the gb/ar names are inherited from the BGRX variant;
				 * for XRGB they actually hold alpha/r resp. g/b byte pairs. */
				__m128i gbHi, gbLo, arHi, arLo;
				{
					gbLo = _mm_unpacklo_epi8(a, r); /* R3FFR2FFR1FFR0FF */
					gbHi = _mm_unpackhi_epi8(a, r); /* R7FFR6FFR5FFR4FF */
					arLo = _mm_unpacklo_epi8(g, b); /* B3G3B2G2B1G1B0G0 */
					arHi = _mm_unpackhi_epi8(g, b); /* B7G7B6G6B5G5B4G4 */
				}
				{
					const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* B1G1R1FFB0G0R0FF */
				}
				{
					const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* B3G3R3FFB2G2R2FF */
				}
				{
					const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* B5G5R5FFB4G4R4FF */
				}
				{
					const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
					_mm_store_si128((__m128i*)out, bgrx);
					out += 16; /* B7G7R7FFB6G6R6FF */
				}
			}
		}

		/* Scalar tail: last width % 16 pixels, CLIP matches packus saturation. */
		for (x = 0; x < pad; x++)
		{
			const BYTE R = CLIP(*pr++);
			const BYTE G = CLIP(*pg++);
			const BYTE B = CLIP(*pb++);
			*out++ = 0xFF;
			*out++ = R;
			*out++ = G;
			*out++ = B;
		}

		/* Jump to next row. */
		pr += srcbump;
		pg += srcbump;
		pb += srcbump;
		out += dstbump;
	}

	return PRIMITIVES_SUCCESS;
}
1137
1138 static pstatus_t
sse2_RGBToRGB_16s8u_P3AC4R(const INT16 * const pSrc[3],UINT32 srcStep,BYTE * pDst,UINT32 dstStep,UINT32 DstFormat,const prim_size_t * roi)1139 sse2_RGBToRGB_16s8u_P3AC4R(const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
1140 UINT32 srcStep, /* bytes between rows in source data */
1141 BYTE* pDst, /* 32-bit interleaved ARGB (ABGR?) data */
1142 UINT32 dstStep, /* bytes between rows in dest data */
1143 UINT32 DstFormat, const prim_size_t* roi)
1144 {
1145 if (((ULONG_PTR)pSrc[0] & 0x0f) || ((ULONG_PTR)pSrc[1] & 0x0f) || ((ULONG_PTR)pSrc[2] & 0x0f) ||
1146 (srcStep & 0x0f) || ((ULONG_PTR)pDst & 0x0f) || (dstStep & 0x0f))
1147 return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
1148
1149 switch (DstFormat)
1150 {
1151 case PIXEL_FORMAT_BGRA32:
1152 case PIXEL_FORMAT_BGRX32:
1153 return sse2_RGBToRGB_16s8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi);
1154
1155 case PIXEL_FORMAT_RGBA32:
1156 case PIXEL_FORMAT_RGBX32:
1157 return sse2_RGBToRGB_16s8u_P3AC4R_RGBX(pSrc, srcStep, pDst, dstStep, roi);
1158
1159 case PIXEL_FORMAT_ABGR32:
1160 case PIXEL_FORMAT_XBGR32:
1161 return sse2_RGBToRGB_16s8u_P3AC4R_XBGR(pSrc, srcStep, pDst, dstStep, roi);
1162
1163 case PIXEL_FORMAT_ARGB32:
1164 case PIXEL_FORMAT_XRGB32:
1165 return sse2_RGBToRGB_16s8u_P3AC4R_XRGB(pSrc, srcStep, pDst, dstStep, roi);
1166
1167 default:
1168 return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
1169 }
1170 }
1171 #endif /* WITH_SSE2 */
1172
1173 /*---------------------------------------------------------------------------*/
1174 #ifdef WITH_NEON
neon_yCbCrToRGB_16s16s_P3P3(const INT16 * const pSrc[3],INT32 srcStep,INT16 * pDst[3],INT32 dstStep,const prim_size_t * roi)1175 static pstatus_t neon_yCbCrToRGB_16s16s_P3P3(const INT16* const pSrc[3], INT32 srcStep,
1176 INT16* pDst[3], INT32 dstStep,
1177 const prim_size_t* roi) /* region of interest */
1178 {
1179 /* TODO: If necessary, check alignments and call the general version. */
1180 int16x8_t zero = vdupq_n_s16(0);
1181 int16x8_t max = vdupq_n_s16(255);
1182 int16x8_t r_cr = vdupq_n_s16(22986); // 1.403 << 14
1183 int16x8_t g_cb = vdupq_n_s16(-5636); // -0.344 << 14
1184 int16x8_t g_cr = vdupq_n_s16(-11698); // -0.714 << 14
1185 int16x8_t b_cb = vdupq_n_s16(28999); // 1.770 << 14
1186 int16x8_t c4096 = vdupq_n_s16(4096);
1187 int16x8_t* y_buf = (int16x8_t*)pSrc[0];
1188 int16x8_t* cb_buf = (int16x8_t*)pSrc[1];
1189 int16x8_t* cr_buf = (int16x8_t*)pSrc[2];
1190 int16x8_t* r_buf = (int16x8_t*)pDst[0];
1191 int16x8_t* g_buf = (int16x8_t*)pDst[1];
1192 int16x8_t* b_buf = (int16x8_t*)pDst[2];
1193 int srcbump = srcStep / sizeof(int16x8_t);
1194 int dstbump = dstStep / sizeof(int16x8_t);
1195 int yp;
1196 int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t);
1197
1198 for (yp = 0; yp < roi->height; ++yp)
1199 {
1200 int i;
1201
1202 for (i = 0; i < imax; i++)
1203 {
1204 /*
1205 In order to use NEON signed 16-bit integer multiplication we need to convert
1206 the floating point factors to signed int without loosing information.
1207 The result of this multiplication is 32 bit and we have a NEON instruction
1208 that returns the hi word of the saturated double.
1209 Thus we will multiply the factors by the highest possible 2^n, take the
1210 upper 16 bits of the signed 32-bit result (vqdmulhq_s16 followed by a right
1211 shift by 1 to reverse the doubling) and correct this result by multiplying it
1212 by 2^(16-n).
1213 For the given factors in the conversion matrix the best possible n is 14.
1214
1215 Example for calculating r:
1216 r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
1217 r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
1218 r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
1219 r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
1220 */
1221 /* y = (y_buf[i] + 4096) >> 2 */
1222 int16x8_t y = vld1q_s16((INT16*)&y_buf[i]);
1223 y = vaddq_s16(y, c4096);
1224 y = vshrq_n_s16(y, 2);
1225 /* cb = cb_buf[i]; */
1226 int16x8_t cb = vld1q_s16((INT16*)&cb_buf[i]);
1227 /* cr = cr_buf[i]; */
1228 int16x8_t cr = vld1q_s16((INT16*)&cr_buf[i]);
1229 /* (y + HIWORD(cr*22986)) >> 3 */
1230 int16x8_t r = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cr, r_cr), 1));
1231 r = vshrq_n_s16(r, 3);
1232 /* r_buf[i] = CLIP(r); */
1233 r = vminq_s16(vmaxq_s16(r, zero), max);
1234 vst1q_s16((INT16*)&r_buf[i], r);
1235 /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
1236 int16x8_t g = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, g_cb), 1));
1237 g = vaddq_s16(g, vshrq_n_s16(vqdmulhq_s16(cr, g_cr), 1));
1238 g = vshrq_n_s16(g, 3);
1239 /* g_buf[i] = CLIP(g); */
1240 g = vminq_s16(vmaxq_s16(g, zero), max);
1241 vst1q_s16((INT16*)&g_buf[i], g);
1242 /* (y + HIWORD(cb*28999)) >> 3 */
1243 int16x8_t b = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, b_cb), 1));
1244 b = vshrq_n_s16(b, 3);
1245 /* b_buf[i] = CLIP(b); */
1246 b = vminq_s16(vmaxq_s16(b, zero), max);
1247 vst1q_s16((INT16*)&b_buf[i], b);
1248 }
1249
1250 y_buf += srcbump;
1251 cb_buf += srcbump;
1252 cr_buf += srcbump;
1253 r_buf += dstbump;
1254 g_buf += dstbump;
1255 b_buf += dstbump;
1256 }
1257
1258 return PRIMITIVES_SUCCESS;
1259 }
1260
neon_yCbCrToRGB_16s8u_P3AC4R_X(const INT16 * const pSrc[3],UINT32 srcStep,BYTE * pDst,UINT32 dstStep,const prim_size_t * roi,uint8_t rPos,uint8_t gPos,uint8_t bPos,uint8_t aPos)1261 static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R_X(const INT16* const pSrc[3], UINT32 srcStep,
1262 BYTE* pDst, UINT32 dstStep, const prim_size_t* roi,
1263 uint8_t rPos, uint8_t gPos, uint8_t bPos,
1264 uint8_t aPos)
1265 {
1266 UINT32 x, y;
1267 BYTE* pRGB = pDst;
1268 const INT16* pY = pSrc[0];
1269 const INT16* pCb = pSrc[1];
1270 const INT16* pCr = pSrc[2];
1271 const size_t srcPad = (srcStep - (roi->width * sizeof(INT16))) / sizeof(INT16);
1272 const size_t dstPad = (dstStep - (roi->width * 4)) / 4;
1273 const size_t pad = roi->width % 8;
1274 const int16x4_t c4096 = vdup_n_s16(4096);
1275
1276 for (y = 0; y < roi->height; y++)
1277 {
1278 for (x = 0; x < roi->width - pad; x += 8)
1279 {
1280 const int16x8_t Y = vld1q_s16(pY);
1281 const int16x4_t Yh = vget_high_s16(Y);
1282 const int16x4_t Yl = vget_low_s16(Y);
1283 const int32x4_t YhAdd = vaddl_s16(Yh, c4096); /* Y + 4096 */
1284 const int32x4_t YlAdd = vaddl_s16(Yl, c4096); /* Y + 4096 */
1285 const int32x4_t YhW = vshlq_n_s32(YhAdd, 16);
1286 const int32x4_t YlW = vshlq_n_s32(YlAdd, 16);
1287 const int16x8_t Cr = vld1q_s16(pCr);
1288 const int16x4_t Crh = vget_high_s16(Cr);
1289 const int16x4_t Crl = vget_low_s16(Cr);
1290 const int16x8_t Cb = vld1q_s16(pCb);
1291 const int16x4_t Cbh = vget_high_s16(Cb);
1292 const int16x4_t Cbl = vget_low_s16(Cb);
1293 uint8x8x4_t bgrx;
1294 {
1295 /* R */
1296 const int32x4_t CrhR = vmulq_n_s32(vmovl_s16(Crh), 91916); /* 1.402525 * 2^16 */
1297 const int32x4_t CrlR = vmulq_n_s32(vmovl_s16(Crl), 91916); /* 1.402525 * 2^16 */
1298 const int32x4_t CrhRa = vaddq_s32(CrhR, YhW);
1299 const int32x4_t CrlRa = vaddq_s32(CrlR, YlW);
1300 const int16x4_t Rsh = vmovn_s32(vshrq_n_s32(CrhRa, 21));
1301 const int16x4_t Rsl = vmovn_s32(vshrq_n_s32(CrlRa, 21));
1302 const int16x8_t Rs = vcombine_s16(Rsl, Rsh);
1303 bgrx.val[rPos] = vqmovun_s16(Rs);
1304 }
1305 {
1306 /* G */
1307 const int32x4_t CbGh = vmull_n_s16(Cbh, 22527); /* 0.343730 * 2^16 */
1308 const int32x4_t CbGl = vmull_n_s16(Cbl, 22527); /* 0.343730 * 2^16 */
1309 const int32x4_t CrGh = vmulq_n_s32(vmovl_s16(Crh), 46819); /* 0.714401 * 2^16 */
1310 const int32x4_t CrGl = vmulq_n_s32(vmovl_s16(Crl), 46819); /* 0.714401 * 2^16 */
1311 const int32x4_t CbCrGh = vaddq_s32(CbGh, CrGh);
1312 const int32x4_t CbCrGl = vaddq_s32(CbGl, CrGl);
1313 const int32x4_t YCbCrGh = vsubq_s32(YhW, CbCrGh);
1314 const int32x4_t YCbCrGl = vsubq_s32(YlW, CbCrGl);
1315 const int16x4_t Gsh = vmovn_s32(vshrq_n_s32(YCbCrGh, 21));
1316 const int16x4_t Gsl = vmovn_s32(vshrq_n_s32(YCbCrGl, 21));
1317 const int16x8_t Gs = vcombine_s16(Gsl, Gsh);
1318 const uint8x8_t G = vqmovun_s16(Gs);
1319 bgrx.val[gPos] = G;
1320 }
1321 {
1322 /* B */
1323 const int32x4_t CbBh = vmulq_n_s32(vmovl_s16(Cbh), 115992); /* 1.769905 * 2^16 */
1324 const int32x4_t CbBl = vmulq_n_s32(vmovl_s16(Cbl), 115992); /* 1.769905 * 2^16 */
1325 const int32x4_t YCbBh = vaddq_s32(CbBh, YhW);
1326 const int32x4_t YCbBl = vaddq_s32(CbBl, YlW);
1327 const int16x4_t Bsh = vmovn_s32(vshrq_n_s32(YCbBh, 21));
1328 const int16x4_t Bsl = vmovn_s32(vshrq_n_s32(YCbBl, 21));
1329 const int16x8_t Bs = vcombine_s16(Bsl, Bsh);
1330 const uint8x8_t B = vqmovun_s16(Bs);
1331 bgrx.val[bPos] = B;
1332 }
1333 /* A */
1334 {
1335 bgrx.val[aPos] = vdup_n_u8(0xFF);
1336 }
1337 vst4_u8(pRGB, bgrx);
1338 pY += 8;
1339 pCb += 8;
1340 pCr += 8;
1341 pRGB += 32;
1342 }
1343
1344 for (x = 0; x < pad; x++)
1345 {
1346 const INT32 divisor = 16;
1347 const INT32 Y = ((*pY++) + 4096) << divisor;
1348 const INT32 Cb = (*pCb++);
1349 const INT32 Cr = (*pCr++);
1350 const INT32 CrR = Cr * (INT32)(1.402525f * (1 << divisor));
1351 const INT32 CrG = Cr * (INT32)(0.714401f * (1 << divisor));
1352 const INT32 CbG = Cb * (INT32)(0.343730f * (1 << divisor));
1353 const INT32 CbB = Cb * (INT32)(1.769905f * (1 << divisor));
1354 INT16 R = ((INT16)((CrR + Y) >> divisor) >> 5);
1355 INT16 G = ((INT16)((Y - CbG - CrG) >> divisor) >> 5);
1356 INT16 B = ((INT16)((CbB + Y) >> divisor) >> 5);
1357 BYTE bgrx[4];
1358 bgrx[bPos] = CLIP(B);
1359 bgrx[gPos] = CLIP(G);
1360 bgrx[rPos] = CLIP(R);
1361 bgrx[aPos] = 0xFF;
1362 *pRGB++ = bgrx[0];
1363 *pRGB++ = bgrx[1];
1364 *pRGB++ = bgrx[2];
1365 *pRGB++ = bgrx[3];
1366 }
1367
1368 pY += srcPad;
1369 pCb += srcPad;
1370 pCr += srcPad;
1371 pRGB += dstPad;
1372 }
1373
1374 return PRIMITIVES_SUCCESS;
1375 }
1376
neon_yCbCrToRGB_16s8u_P3AC4R(const INT16 * const pSrc[3],UINT32 srcStep,BYTE * pDst,UINT32 dstStep,UINT32 DstFormat,const prim_size_t * roi)1377 static pstatus_t neon_yCbCrToRGB_16s8u_P3AC4R(const INT16* const pSrc[3], UINT32 srcStep,
1378 BYTE* pDst, UINT32 dstStep, UINT32 DstFormat,
1379 const prim_size_t* roi)
1380 {
1381 switch (DstFormat)
1382 {
1383 case PIXEL_FORMAT_BGRA32:
1384 case PIXEL_FORMAT_BGRX32:
1385 return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
1386
1387 case PIXEL_FORMAT_RGBA32:
1388 case PIXEL_FORMAT_RGBX32:
1389 return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
1390
1391 case PIXEL_FORMAT_ARGB32:
1392 case PIXEL_FORMAT_XRGB32:
1393 return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
1394
1395 case PIXEL_FORMAT_ABGR32:
1396 case PIXEL_FORMAT_XBGR32:
1397 return neon_yCbCrToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
1398
1399 default:
1400 return generic->yCbCrToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
1401 }
1402 }
1403
1404 static pstatus_t
neon_RGBToRGB_16s8u_P3AC4R_X(const INT16 * const pSrc[3],UINT32 srcStep,BYTE * pDst,UINT32 dstStep,const prim_size_t * roi,uint8_t rPos,uint8_t gPos,uint8_t bPos,uint8_t aPos)1405 neon_RGBToRGB_16s8u_P3AC4R_X(const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
1406 UINT32 srcStep, /* bytes between rows in source data */
1407 BYTE* pDst, /* 32-bit interleaved ARGB (ABGR?) data */
1408 UINT32 dstStep, /* bytes between rows in dest data */
1409 const prim_size_t* roi, /* region of interest */
1410 uint8_t rPos, uint8_t gPos, uint8_t bPos, uint8_t aPos)
1411 {
1412 UINT32 x, y;
1413 UINT32 pad = roi->width % 8;
1414
1415 for (y = 0; y < roi->height; y++)
1416 {
1417 const INT16* pr = (INT16*)(((BYTE*)pSrc[0]) + y * srcStep);
1418 const INT16* pg = (INT16*)(((BYTE*)pSrc[1]) + y * srcStep);
1419 const INT16* pb = (INT16*)(((BYTE*)pSrc[2]) + y * srcStep);
1420 BYTE* dst = pDst + y * dstStep;
1421
1422 for (x = 0; x < roi->width - pad; x += 8)
1423 {
1424 int16x8_t r = vld1q_s16(pr);
1425 int16x8_t g = vld1q_s16(pg);
1426 int16x8_t b = vld1q_s16(pb);
1427 uint8x8x4_t bgrx;
1428 bgrx.val[aPos] = vdup_n_u8(0xFF);
1429 bgrx.val[rPos] = vqmovun_s16(r);
1430 bgrx.val[gPos] = vqmovun_s16(g);
1431 bgrx.val[bPos] = vqmovun_s16(b);
1432 vst4_u8(dst, bgrx);
1433 pr += 8;
1434 pg += 8;
1435 pb += 8;
1436 dst += 32;
1437 }
1438
1439 for (x = 0; x < pad; x++)
1440 {
1441 BYTE bgrx[4];
1442 bgrx[bPos] = *pb++;
1443 bgrx[gPos] = *pg++;
1444 bgrx[rPos] = *pr++;
1445 bgrx[aPos] = 0xFF;
1446 *dst++ = bgrx[0];
1447 *dst++ = bgrx[1];
1448 *dst++ = bgrx[2];
1449 *dst++ = bgrx[3];
1450 }
1451 }
1452
1453 return PRIMITIVES_SUCCESS;
1454 }
1455
1456 static pstatus_t
neon_RGBToRGB_16s8u_P3AC4R(const INT16 * const pSrc[3],UINT32 srcStep,BYTE * pDst,UINT32 dstStep,UINT32 DstFormat,const prim_size_t * roi)1457 neon_RGBToRGB_16s8u_P3AC4R(const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */
1458 UINT32 srcStep, /* bytes between rows in source data */
1459 BYTE* pDst, /* 32-bit interleaved ARGB (ABGR?) data */
1460 UINT32 dstStep, /* bytes between rows in dest data */
1461 UINT32 DstFormat, const prim_size_t* roi) /* region of interest */
1462 {
1463 switch (DstFormat)
1464 {
1465 case PIXEL_FORMAT_BGRA32:
1466 case PIXEL_FORMAT_BGRX32:
1467 return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);
1468
1469 case PIXEL_FORMAT_RGBA32:
1470 case PIXEL_FORMAT_RGBX32:
1471 return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);
1472
1473 case PIXEL_FORMAT_ARGB32:
1474 case PIXEL_FORMAT_XRGB32:
1475 return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);
1476
1477 case PIXEL_FORMAT_ABGR32:
1478 case PIXEL_FORMAT_XBGR32:
1479 return neon_RGBToRGB_16s8u_P3AC4R_X(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);
1480
1481 default:
1482 return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);
1483 }
1484 }
1485 #endif /* WITH_NEON */
1486 /* I don't see a direct IPP version of this, since the input is INT16
1487 * YCbCr. It may be possible via Deinterleave and then YCbCrToRGB_<mod>.
1488 * But that would likely be slower.
1489 */
1490
1491 /* ------------------------------------------------------------------------- */
/* Register the optimized color-conversion entry points. The generic table is
 * captured first (it remains the fallback for unsupported inputs), the plain
 * C implementations are installed, and then any available SIMD variants
 * overwrite them when the CPU supports the required instruction set. */
void primitives_init_colors_opt(primitives_t* prims)
{
	generic = primitives_get_generic();
	primitives_init_colors(prims);
#if defined(WITH_SSE2)

	if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
	{
		prims->yCbCrToRGB_16s16s_P3P3 = sse2_yCbCrToRGB_16s16s_P3P3;
		prims->yCbCrToRGB_16s8u_P3AC4R = sse2_yCbCrToRGB_16s8u_P3AC4R;
		prims->RGBToYCbCr_16s16s_P3P3 = sse2_RGBToYCbCr_16s16s_P3P3;
		prims->RGBToRGB_16s8u_P3AC4R = sse2_RGBToRGB_16s8u_P3AC4R;
	}

#elif defined(WITH_NEON)

	if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
	{
		prims->yCbCrToRGB_16s16s_P3P3 = neon_yCbCrToRGB_16s16s_P3P3;
		prims->yCbCrToRGB_16s8u_P3AC4R = neon_yCbCrToRGB_16s8u_P3AC4R;
		prims->RGBToRGB_16s8u_P3AC4R = neon_RGBToRGB_16s8u_P3AC4R;
	}

#endif /* WITH_SSE2 */
}
1517