1 /*
2 * Copyright (c) 2005
3 * Eric Anholt. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27 #ifdef HAVE_CONFIG_H
28 #include "config.h"
29 #endif
30 #include <liboil/liboilclasses.h>
31 #include <liboil/liboilfunction.h>
32 #include <emmintrin.h>
33 #include <liboil/liboilcolorspace.h>
34
35 #ifdef HAVE_I386
36 #define SSE_FUNCTION __attribute__((force_align_arg_pointer))
37 #else
38 #define SSE_FUNCTION
39 #endif
40
41 #ifdef ENABLE_BROKEN_IMPLS
42
/* A 128-bit value that can be viewed either as an SSE2 integer vector or as
 * two 64-bit words.  The word view lets the constants below be written as
 * ordinary integer initializers.
 */
union m128_int {
  __m128i  m128;
  uint64_t ull[2];
};

/* Vector constants shared by the helpers in this file:
 *   sse_16xff  - all 128 bits set (sixteen 0xff bytes)
 *   sse_8x0080 - eight 16-bit lanes each holding 0x0080
 */
static const struct _SSEData {
  union m128_int sse_16xff;
  union m128_int sse_8x0080;
} c = {
  .sse_16xff  = { .ull = { 0xffffffffffffffffULL, 0xffffffffffffffffULL } },
  .sse_8x0080 = { .ull = { 0x0080008000800080ULL, 0x0080008000800080ULL } },
};

/* MC(name) yields the __m128i view of constant sse_<name> in the table. */
#define MC(name) (c.sse_##name.m128)
57
/* Scalar (non-SSE2) per-channel compositing operators, used for the pixels
 * that the vector loops do not cover.
 *
 *   COMPOSITE_OVER(dst, src, alpha): dst + src - oil_muldiv_255(dst, alpha)
 *                                    (dst is evaluated twice)
 *   COMPOSITE_ADD(dst, src):         oil_clamp_255(dst + src)
 *   COMPOSITE_IN(src, alpha):        oil_muldiv_255(src, alpha)
 */
#define COMPOSITE_OVER(dst, src, alpha) \
  ((dst) + (src) - oil_muldiv_255((dst), (alpha)))
#define COMPOSITE_ADD(dst, src) oil_clamp_255((dst) + (src))
#define COMPOSITE_IN(src, alpha) oil_muldiv_255((src), (alpha))
62
63 /* This SSE2 code is based around operations on four pixels at a time. The
64 * exception is muldiv_255_sse2, which needs to expand the four pixels into
65 * 2 sets of 2 pixels at 16 bits per channel each, for the purpose of doing
66 * the appropriate rounding on division.
67 */
68
69 /* Shuffles the given value such that the alpha for each pixel appears in each
70 * channel of the pixel.
71 */
72 SSE_FUNCTION static inline __m128i
argb_A_sse2(__m128i a)73 argb_A_sse2(__m128i a)
74 {
75 #if 0
76 /* Shift the alpha channel of each pixel into the low byte */
77 a = _mm_srli_epi32(a, 24);
78 /* Now, shift and or so we can get it into all the channels */
79 a = _mm_or_si128(a, _mm_slli_epi32(a, 8));
80 a = _mm_or_si128(a, _mm_slli_epi32(a, 16));
81 return a;
82 #else
83 /* Move the alpha channel into the low byte */
84 a = _mm_srli_epi32(a, 24);
85 /* Pack our four alpha channels down into the lower 32 bits */
86 a = _mm_packus_epi16(a, _mm_setzero_si128());
87 a = _mm_packus_epi16(a, _mm_setzero_si128());
88 /* And expand it back out into four pixels of all channels the same */
89 a = _mm_unpacklo_epi8(a, a);
90 return _mm_unpacklo_epi16(a, a);
91 #endif
92 }
93
94 /* Multiplies the unpacked 16-bits-per-channel pixel data in a
95 * channel-by-channel by b, and divides the result by 255, with rounding.
96 */
97 SSE_FUNCTION static inline __m128i
inner_muldiv_255_sse2(__m128i a,__m128i b)98 inner_muldiv_255_sse2(__m128i a, __m128i b)
99 {
100 __m128i ret;
101 __m128i roundconst = MC(8x0080);
102
103 ret = _mm_mullo_epi16(a, b);
104 ret = _mm_adds_epu16(ret, roundconst);
105 ret = _mm_adds_epu16(ret, _mm_srli_epi16(ret, 8));
106 ret = _mm_srli_epi16(ret, 8);
107
108 return ret;
109 }
110
111 SSE_FUNCTION static inline __m128i
muldiv_255_sse2(__m128i a,__m128i b)112 muldiv_255_sse2(__m128i a, __m128i b)
113 {
114 __m128i alow, blow, ahigh, bhigh, low, high;
115
116 alow = _mm_unpacklo_epi8(a, _mm_setzero_si128());
117 blow = _mm_unpacklo_epi8(b, _mm_setzero_si128());
118 ahigh = _mm_unpackhi_epi8(a, _mm_setzero_si128());
119 bhigh = _mm_unpackhi_epi8(b, _mm_setzero_si128());
120 low = inner_muldiv_255_sse2(alow, blow);
121 high = inner_muldiv_255_sse2(ahigh, bhigh);
122 return _mm_packus_epi16(low, high);
123 }
124
125 SSE_FUNCTION static inline __m128i
negate_argb_sse2(__m128i a)126 negate_argb_sse2(__m128i a)
127 {
128 return _mm_xor_si128(a, MC(16xff));
129 }
130
131 SSE_FUNCTION static inline __m128i
load_argb_sse2(const uint32_t * src)132 load_argb_sse2(const uint32_t *src)
133 {
134 return _mm_loadu_si128((__m128i *)src);
135 }
136
137 SSE_FUNCTION static inline __m128i
set1_argb_sse2(uint32_t src)138 set1_argb_sse2(uint32_t src)
139 {
140 return _mm_set1_epi32(src);
141 }
142
143 SSE_FUNCTION static inline __m128i
load_u8_mask(const uint8_t * m)144 load_u8_mask(const uint8_t *m)
145 {
146 __m128i a;
147 a = _mm_cvtsi32_si128(*(uint32_t *)m);
148 a = _mm_unpacklo_epi8(a, a);
149 a = _mm_unpacklo_epi16(a, a);
150 return a;
151 }
152
153 SSE_FUNCTION static inline __m128i
set1_u8_mask(uint8_t m)154 set1_u8_mask(uint8_t m)
155 {
156 return _mm_set1_epi8(m);
157 }
158
159 SSE_FUNCTION static void
store_argb_sse2(uint32_t * dest,__m128i pix)160 store_argb_sse2(uint32_t *dest, __m128i pix)
161 {
162 _mm_store_si128((__m128i *)dest, pix);
163 }
164
165 SSE_FUNCTION static __m128i
over_argb_sse2(__m128i dest,__m128i src,__m128i srca)166 over_argb_sse2(__m128i dest, __m128i src, __m128i srca)
167 {
168 return _mm_adds_epu8(src, muldiv_255_sse2(dest, negate_argb_sse2(srca)));
169 }
170
171 SSE_FUNCTION static void
composite_in_argb_sse(uint32_t * dest,const uint32_t * src,const uint8_t * mask,int n)172 composite_in_argb_sse (uint32_t *dest, const uint32_t *src, const uint8_t *mask,
173 int n)
174 {
175 for (; ((long)dest & 15) && (n > 0); n--) {
176 uint32_t s = *src++;
177 uint8_t m = *mask++;
178
179 *dest++ = oil_argb(
180 COMPOSITE_IN(oil_argb_A(s), m),
181 COMPOSITE_IN(oil_argb_R(s), m),
182 COMPOSITE_IN(oil_argb_G(s), m),
183 COMPOSITE_IN(oil_argb_B(s), m));
184 }
185 for (; n >= 4; n -= 4) {
186 __m128i s, m;
187 s = load_argb_sse2(src);
188 m = load_u8_mask(mask);
189 store_argb_sse2(dest, muldiv_255_sse2(s, m));
190 src += 4;
191 mask += 4;
192 dest += 4;
193 }
194 for (; n > 0; n--) {
195 uint32_t s = *src++;
196 uint8_t m = *mask++;
197
198 *dest++ = oil_argb(
199 COMPOSITE_IN(oil_argb_A(s), m),
200 COMPOSITE_IN(oil_argb_R(s), m),
201 COMPOSITE_IN(oil_argb_G(s), m),
202 COMPOSITE_IN(oil_argb_B(s), m));
203 }
204 }
205 OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_sse, composite_in_argb,
206 OIL_IMPL_FLAG_SSE2);
207
208 SSE_FUNCTION static void
composite_in_argb_const_src_sse(uint32_t * dest,const uint32_t * src,const uint8_t * mask,int n)209 composite_in_argb_const_src_sse (uint32_t *dest, const uint32_t *src,
210 const uint8_t *mask, int n)
211 {
212 __m128i s;
213
214 s = set1_argb_sse2(*src);
215
216 for (; ((long)dest & 15) && (n > 0); n--) {
217 uint8_t m = *mask++;
218
219 *dest++ = oil_argb(
220 COMPOSITE_IN(oil_argb_A(*src), m),
221 COMPOSITE_IN(oil_argb_R(*src), m),
222 COMPOSITE_IN(oil_argb_G(*src), m),
223 COMPOSITE_IN(oil_argb_B(*src), m));
224 }
225 for (; n >= 4; n -= 4) {
226 __m128i m;
227 m = load_u8_mask(mask);
228 store_argb_sse2(dest, muldiv_255_sse2(s, m));
229 mask += 4;
230 dest += 4;
231 }
232 for (; n > 0; n--) {
233 uint8_t m = *mask++;
234
235 *dest++ = oil_argb(
236 COMPOSITE_IN(oil_argb_A(*src), m),
237 COMPOSITE_IN(oil_argb_R(*src), m),
238 COMPOSITE_IN(oil_argb_G(*src), m),
239 COMPOSITE_IN(oil_argb_B(*src), m));
240 }
241 }
242 OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_const_src_sse,
243 composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2);
244
245 SSE_FUNCTION static void
composite_in_argb_const_mask_sse(uint32_t * dest,const uint32_t * src,const uint8_t * mask,int n)246 composite_in_argb_const_mask_sse (uint32_t *dest, const uint32_t *src,
247 const uint8_t *mask, int n)
248 {
249 __m128i m;
250
251 m = set1_u8_mask(*mask);
252
253 for (; ((long)dest & 15) && (n > 0); n--) {
254 uint32_t s = *src++;
255
256 *dest++ = oil_argb(
257 COMPOSITE_IN(oil_argb_A(s), mask[0]),
258 COMPOSITE_IN(oil_argb_R(s), mask[0]),
259 COMPOSITE_IN(oil_argb_G(s), mask[0]),
260 COMPOSITE_IN(oil_argb_B(s), mask[0]));
261 }
262 for (; n >= 4; n -= 4) {
263 __m128i s;
264 s = load_argb_sse2(src);
265 store_argb_sse2(dest, muldiv_255_sse2(s, m));
266 src += 4;
267 dest += 4;
268 }
269 for (; n > 0; n--) {
270 uint32_t s = *src++;
271
272 *dest++ = oil_argb(
273 COMPOSITE_IN(oil_argb_A(s), mask[0]),
274 COMPOSITE_IN(oil_argb_R(s), mask[0]),
275 COMPOSITE_IN(oil_argb_G(s), mask[0]),
276 COMPOSITE_IN(oil_argb_B(s), mask[0]));
277 }
278 }
279 OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_const_mask_sse,
280 composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2);
281
282 SSE_FUNCTION static void
composite_over_argb_sse(uint32_t * dest,const uint32_t * src,int n)283 composite_over_argb_sse (uint32_t *dest, const uint32_t *src, int n)
284 {
285 for (; ((long)dest & 15) && (n > 0); n--) {
286 uint32_t d = *dest, s = *src++;
287 uint8_t srca = oil_argb_A(s);
288 d = oil_argb(
289 COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(s), srca),
290 COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(s), srca),
291 COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(s), srca),
292 COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(s), srca));
293 *dest++ = d;
294 }
295 for (; n >= 4; n -= 4) {
296 __m128i d, s;
297 s = load_argb_sse2(src);
298 d = over_argb_sse2(*(__m128i *)dest, s, argb_A_sse2(s));
299 store_argb_sse2(dest, d);
300 src += 4;
301 dest += 4;
302 }
303 for (; n > 0; n--) {
304 uint32_t d = *dest, s = *src++;
305 uint8_t srca = oil_argb_A(s);
306 d = oil_argb(
307 COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(s), srca),
308 COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(s), srca),
309 COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(s), srca),
310 COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(s), srca));
311 *dest++ = d;
312 }
313 }
314 OIL_DEFINE_IMPL_FULL_WRAPPER (composite_over_argb_sse, composite_over_argb,
315 OIL_IMPL_FLAG_SSE2);
316
317 SSE_FUNCTION static void
composite_over_argb_const_src_sse(uint32_t * dest,const uint32_t * src,int n)318 composite_over_argb_const_src_sse (uint32_t *dest, const uint32_t *src, int n)
319 {
320 __m128i s, sa;
321 uint32_t srca;
322
323 srca = oil_argb_A(*src);
324 s = set1_argb_sse2(*src);
325 sa = negate_argb_sse2(argb_A_sse2(s));
326 for (; ((long)dest & 15) && (n > 0); n--) {
327 uint32_t d = *dest;
328 d = oil_argb(
329 COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(*src), srca),
330 COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(*src), srca),
331 COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(*src), srca),
332 COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(*src), srca));
333 *dest++ = d;
334 }
335 for (; n >= 4; n -= 4) {
336 __m128i d;
337 d = _mm_adds_epu8(s, muldiv_255_sse2(*(__m128i *)dest, sa));
338 store_argb_sse2(dest, d);
339 dest += 4;
340 }
341 for (; n > 0; n--) {
342 uint32_t d = *dest;
343 d = oil_argb(
344 COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(*src), srca),
345 COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(*src), srca),
346 COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(*src), srca),
347 COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(*src), srca));
348 *dest++ = d;
349 }
350 }
351 OIL_DEFINE_IMPL_FULL_WRAPPER (composite_over_argb_const_src_sse,
352 composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
353
354 SSE_FUNCTION static void
composite_in_over_argb_sse(uint32_t * dest,const uint32_t * src,const uint8_t * mask,int n)355 composite_in_over_argb_sse (uint32_t *dest, const uint32_t *src,
356 const uint8_t *mask, int n)
357 {
358 for (; ((long)dest & 15) && (n > 0); n--) {
359 uint32_t d = *dest, s = *src++, m = *mask++, color;
360 uint8_t srca;
361
362 color = oil_argb(
363 COMPOSITE_IN(oil_argb_A(s), m),
364 COMPOSITE_IN(oil_argb_R(s), m),
365 COMPOSITE_IN(oil_argb_G(s), m),
366 COMPOSITE_IN(oil_argb_B(s), m));
367 srca = oil_argb_A(color);
368 d = oil_argb(
369 COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
370 COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
371 COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
372 COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
373 *dest++ = d;
374 }
375 for (; n >= 4; n -= 4) {
376 __m128i d, s, m;
377 s = load_argb_sse2(src);
378 m = load_u8_mask(mask);
379 s = muldiv_255_sse2(s, m);
380 d = over_argb_sse2(*(__m128i *)dest, s, argb_A_sse2(s));
381 store_argb_sse2(dest, d);
382 src += 4;
383 mask += 4;
384 dest += 4;
385 }
386 for (; n > 0; n--) {
387 uint32_t d = *dest, s = *src++, m = *mask++, color;
388 uint8_t srca;
389
390 color = oil_argb(
391 COMPOSITE_IN(oil_argb_A(s), m),
392 COMPOSITE_IN(oil_argb_R(s), m),
393 COMPOSITE_IN(oil_argb_G(s), m),
394 COMPOSITE_IN(oil_argb_B(s), m));
395 srca = oil_argb_A(color);
396 d = oil_argb(
397 COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
398 COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
399 COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
400 COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
401 *dest++ = d;
402 }
403 }
404 OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_sse, composite_in_over_argb,
405 OIL_IMPL_FLAG_SSE2);
406
407 SSE_FUNCTION static void
composite_in_over_argb_const_src_sse(uint32_t * dest,const uint32_t * src,const uint8_t * mask,int n)408 composite_in_over_argb_const_src_sse (uint32_t *dest, const uint32_t *src,
409 const uint8_t *mask, int n)
410 {
411 __m128i s;
412
413 s = set1_argb_sse2(*src);
414
415 for (; ((long)dest & 15) && (n > 0); n--) {
416 uint32_t d = *dest, m = *mask++, color;
417 uint8_t srca;
418
419 color = oil_argb(
420 COMPOSITE_IN(oil_argb_A(*src), m),
421 COMPOSITE_IN(oil_argb_R(*src), m),
422 COMPOSITE_IN(oil_argb_G(*src), m),
423 COMPOSITE_IN(oil_argb_B(*src), m));
424 srca = oil_argb_A(color);
425 d = oil_argb(
426 COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
427 COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
428 COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
429 COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
430 *dest++ = d;
431 }
432 for (; n >= 4; n -= 4) {
433 __m128i d, color, m;
434 m = load_u8_mask(mask);
435 color = muldiv_255_sse2(s, m);
436 d = over_argb_sse2(*(__m128i *)dest, color, argb_A_sse2(color));
437 store_argb_sse2(dest, d);
438 mask += 4;
439 dest += 4;
440 }
441 for (; n > 0; n--) {
442 uint32_t d = *dest, m = *mask++, color;
443 uint8_t srca;
444
445 color = oil_argb(
446 COMPOSITE_IN(oil_argb_A(*src), m),
447 COMPOSITE_IN(oil_argb_R(*src), m),
448 COMPOSITE_IN(oil_argb_G(*src), m),
449 COMPOSITE_IN(oil_argb_B(*src), m));
450 srca = oil_argb_A(color);
451 d = oil_argb(
452 COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
453 COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
454 COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
455 COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
456 *dest++ = d;
457 }
458 }
459 OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_const_src_sse,
460 composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
461
462 SSE_FUNCTION static void
composite_in_over_argb_const_mask_sse(uint32_t * dest,const uint32_t * src,const uint8_t * mask,int n)463 composite_in_over_argb_const_mask_sse (uint32_t *dest, const uint32_t *src,
464 const uint8_t *mask, int n)
465 {
466 __m128i m;
467
468 m = set1_u8_mask(*mask);
469
470 for (; ((long)dest & 15) && (n > 0); n--) {
471 uint32_t d = *dest, s = *src++, color;
472 uint8_t srca;
473
474 color = oil_argb(
475 COMPOSITE_IN(oil_argb_A(s), *mask),
476 COMPOSITE_IN(oil_argb_R(s), *mask),
477 COMPOSITE_IN(oil_argb_G(s), *mask),
478 COMPOSITE_IN(oil_argb_B(s), *mask));
479 srca = oil_argb_A(color);
480 d = oil_argb(
481 COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
482 COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
483 COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
484 COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
485 *dest++ = d;
486 }
487 for (; n >= 4; n -= 4) {
488 __m128i d, s;
489 s = load_argb_sse2(src);
490 s = muldiv_255_sse2(s, m);
491 d = over_argb_sse2(*(__m128i *)dest, s, argb_A_sse2(s));
492 store_argb_sse2(dest, d);
493 src += 4;
494 dest += 4;
495 }
496 for (; n > 0; n--) {
497 uint32_t d = *dest, s = *src++, color;
498 uint8_t srca;
499
500 color = oil_argb(
501 COMPOSITE_IN(oil_argb_A(s), *mask),
502 COMPOSITE_IN(oil_argb_R(s), *mask),
503 COMPOSITE_IN(oil_argb_G(s), *mask),
504 COMPOSITE_IN(oil_argb_B(s), *mask));
505 srca = oil_argb_A(color);
506 d = oil_argb(
507 COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
508 COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
509 COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
510 COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
511 *dest++ = d;
512 }
513 }
514 OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_const_mask_sse,
515 composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2);
516
517 SSE_FUNCTION static void
composite_over_u8_sse(uint8_t * dest,const uint8_t * src,int n)518 composite_over_u8_sse (uint8_t *dest, const uint8_t *src, int n)
519 {
520 /* Initial operations to align the destination pointer */
521 for (; ((long)dest & 15) && (n > 0); n--) {
522 *dest = COMPOSITE_OVER(*dest, *src, *src);
523 src++;
524 dest++;
525 }
526 /* over_u8 can be dealt with using our argb code, with srca = s */
527 for (; n >= 16; n -= 16) {
528 __m128i d, s;
529 d = *(__m128i *)dest;
530 s = load_argb_sse2((uint32_t *)src);
531 store_argb_sse2((uint32_t *)dest, over_argb_sse2(d, s, s));
532 src += 16;
533 dest += 16;
534 }
535 for (; n > 0; n--) {
536 *dest = COMPOSITE_OVER(*dest, *src, *src);
537 src++;
538 dest++;
539 }
540 }
541 OIL_DEFINE_IMPL_FULL_WRAPPER (composite_over_u8_sse, composite_over_u8,
542 OIL_IMPL_FLAG_SSE2);
543
544 #endif
545
546