1 /*
2 * Copyright (c) 2005
3 * Eric Anholt. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <stdint.h>
#include <liboil/liboilclasses.h>
#include <liboil/liboilfunction.h>
#include <emmintrin.h>
#include <liboil/liboilcolorspace.h>
34
35 #ifdef HAVE_I386
36 #define SSE_FUNCTION __attribute__((force_align_arg_pointer))
37 #else
38 #define SSE_FUNCTION
39 #endif
40
41 /* non-SSE2 compositing support */
42 #define COMPOSITE_OVER(d,s,m) ((d) + (s) - oil_muldiv_255((d),(m)))
43 #define COMPOSITE_ADD(d,s) oil_clamp_255((d) + (s))
44 #define COMPOSITE_IN(s,m) oil_muldiv_255((s),(m))
45
46 /* rgba values in SSE2 code will be unpacked as 16-bit integers per channel with
47 * the channel value in the low byte. This means 2 pixels per pass.
48 */
49
50 #ifdef ENABLE_BROKEN_IMPLS
51
/* A 128-bit constant that can be written as two 64-bit integer words
 * and read back as an SSE2 register. */
union m128_int {
  __m128i m128;
  uint64_t ull[2];
};

/* Constants shared by the SSE2 code below:
 *  - sse_8x00ff: 0x00ff in each 16-bit lane, used to compute 255 - x
 *    via XOR in negate_argb_sse2().
 *  - sse_8x0080: 0x0080 in each 16-bit lane, the rounding bias added
 *    in muldiv_255_sse2(). */
static const struct _SSEData {
  union m128_int sse_8x00ff;
  union m128_int sse_8x0080;
} c = {
  .sse_8x00ff.ull = {0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL},
  .sse_8x0080.ull = {0x0080008000800080ULL, 0x0080008000800080ULL},
};

/* Fetch one of the constants above as an __m128i. */
#define MC(x) (c.sse_##x.m128)
66
67 /* Shuffles the given value such that the alpha for each pixel appears in each
68 * channel of the pixel.
69 */
70 SSE_FUNCTION static inline __m128i
argb_A_sse2(__m128i a)71 argb_A_sse2(__m128i a)
72 {
73 a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(3,3,3,3));
74 a = _mm_shufflelo_epi16(a, _MM_SHUFFLE(3,3,3,3));
75 return a;
76 }
77
78 /* Multiplies the pixel data in a channel-by-channel by b, and divides the
79 * result by 255, with rounding.
80 */
81 SSE_FUNCTION static inline __m128i
muldiv_255_sse2(__m128i a,__m128i b)82 muldiv_255_sse2(__m128i a, __m128i b)
83 {
84 __m128i ret;
85 __m128i roundconst = MC(8x0080);
86
87 ret = _mm_mullo_epi16(a, b);
88 ret = _mm_adds_epu16(ret, roundconst);
89 ret = _mm_adds_epu16(ret, _mm_srli_epi16(ret, 8));
90 ret = _mm_srli_epi16(ret, 8);
91
92 return ret;
93 }
94
95 SSE_FUNCTION static inline __m128i
negate_argb_sse2(__m128i a)96 negate_argb_sse2(__m128i a)
97 {
98 return _mm_xor_si128(a, MC(8x00ff));
99 }
100
101 /* Loads the 2 (unaligned) pixels at *src into unpacked SSE2 registers */
102 SSE_FUNCTION static inline __m128i
load_argb_sse2(const uint32_t * src)103 load_argb_sse2(const uint32_t *src)
104 {
105 __m128i pix;
106
107 pix = _mm_loadl_epi64((__m128i *)src);
108 pix = _mm_unpacklo_epi8(pix, _mm_setzero_si128());
109 return pix;
110 }
111
112 SSE_FUNCTION static inline __m128i
set1_argb_sse2(uint32_t src)113 set1_argb_sse2(uint32_t src)
114 {
115 __m128i pix;
116
117 pix = _mm_set1_epi32(src);
118 pix = _mm_unpacklo_epi8(pix, _mm_setzero_si128());
119 return pix;
120 }
121
122 SSE_FUNCTION static inline __m128i
load_u8_mask(const uint8_t * m)123 load_u8_mask(const uint8_t *m)
124 {
125 return _mm_unpacklo_epi64(_mm_set1_epi16(m[0]), _mm_set1_epi16(m[1]));
126 }
127
128 SSE_FUNCTION static inline __m128i
set1_u8_mask(uint8_t m)129 set1_u8_mask(uint8_t m)
130 {
131 return _mm_unpacklo_epi8(_mm_set1_epi8(m), _mm_setzero_si128());
132 }
133
134 /* Stores the 2 unpacked pixels in pix into the (unaligned) *dest */
135 SSE_FUNCTION static void
store_argb_sse2(uint32_t * dest,__m128i pix)136 store_argb_sse2(uint32_t *dest, __m128i pix)
137 {
138 pix = _mm_packus_epi16(pix, pix);
139 _mm_storel_epi64((__m128i *)dest, pix);
140 }
141
142 SSE_FUNCTION static __m128i
over_argb_sse2(__m128i dest,__m128i src,__m128i srca)143 over_argb_sse2(__m128i dest, __m128i src, __m128i srca)
144 {
145 return _mm_adds_epu8(src, muldiv_255_sse2(dest, negate_argb_sse2(srca)));
146 }
147
148 SSE_FUNCTION static void
composite_in_argb_sse_2pix(uint32_t * dest,const uint32_t * src,const uint8_t * mask,int n)149 composite_in_argb_sse_2pix (uint32_t *dest, const uint32_t *src,
150 const uint8_t *mask, int n)
151 {
152 for (; n >= 2; n -= 2) {
153 __m128i s, m;
154 s = load_argb_sse2(src);
155 m = load_u8_mask(mask);
156 store_argb_sse2(dest, muldiv_255_sse2(s, m));
157 src += 2;
158 mask += 2;
159 dest += 2;
160 }
161 for (; n > 0; n--) {
162 uint32_t s = *src++;
163 uint8_t m = *mask++;
164
165 *dest++ = oil_argb(
166 COMPOSITE_IN(oil_argb_A(s), m),
167 COMPOSITE_IN(oil_argb_R(s), m),
168 COMPOSITE_IN(oil_argb_G(s), m),
169 COMPOSITE_IN(oil_argb_B(s), m));
170 }
171 }
172 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_argb_sse_2pix, composite_in_argb,
173 OIL_IMPL_FLAG_SSE2);
174
175 SSE_FUNCTION static void
composite_in_argb_const_src_sse_2pix(uint32_t * dest,const uint32_t * src,const uint8_t * mask,int n)176 composite_in_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src,
177 const uint8_t *mask, int n)
178 {
179 __m128i s;
180
181 s = set1_argb_sse2(*src);
182
183 for (; n >= 2; n -= 2) {
184 __m128i m;
185 m = load_u8_mask(mask);
186 store_argb_sse2(dest, muldiv_255_sse2(s, m));
187 mask += 2;
188 dest += 2;
189 }
190 for (; n > 0; n--) {
191 uint8_t m = *mask++;
192
193 *dest++ = oil_argb(
194 COMPOSITE_IN(oil_argb_A(*src), m),
195 COMPOSITE_IN(oil_argb_R(*src), m),
196 COMPOSITE_IN(oil_argb_G(*src), m),
197 COMPOSITE_IN(oil_argb_B(*src), m));
198 }
199 }
200 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_argb_const_src_sse_2pix,
201 composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2);
202
203 #ifdef SSE_ALIGN
204 SSE_FUNCTION static void
composite_in_argb_const_mask_sse_2pix(uint32_t * dest,const uint32_t * src,const uint8_t * mask,int n)205 composite_in_argb_const_mask_sse_2pix (uint32_t *dest, const uint32_t *src,
206 const uint8_t *mask, int n)
207 {
208 __m128i m;
209
210 m = set1_u8_mask(*mask);
211
212 for (; n >= 2; n -= 2) {
213 __m128i s;
214 s = load_argb_sse2(src);
215 store_argb_sse2(dest, muldiv_255_sse2(s, m));
216 src += 2;
217 dest += 2;
218 }
219 for (; n > 0; n--) {
220 uint32_t s = *src++;
221
222 *dest++ = oil_argb(
223 COMPOSITE_IN(oil_argb_A(s), mask[0]),
224 COMPOSITE_IN(oil_argb_R(s), mask[0]),
225 COMPOSITE_IN(oil_argb_G(s), mask[0]),
226 COMPOSITE_IN(oil_argb_B(s), mask[0]));
227 }
228 }
229 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_argb_const_mask_sse_2pix,
230 composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2);
231 #endif
232
233 SSE_FUNCTION static void
composite_over_argb_sse_2pix(uint32_t * dest,const uint32_t * src,int n)234 composite_over_argb_sse_2pix (uint32_t *dest, const uint32_t *src, int n)
235 {
236 for (; n >= 2; n -= 2) {
237 __m128i d, s;
238 s = load_argb_sse2(src);
239 d = load_argb_sse2(dest);
240 d = over_argb_sse2(d, s, argb_A_sse2(s));
241 store_argb_sse2(dest, d);
242 src += 2;
243 dest += 2;
244 }
245 for (; n > 0; n--) {
246 uint32_t d = *dest, s = *src++;
247 uint8_t srca = oil_argb_A(s);
248 d = oil_argb(
249 COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(s), srca),
250 COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(s), srca),
251 COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(s), srca),
252 COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(s), srca));
253 *dest++ = d;
254 }
255 }
256 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_over_argb_sse_2pix, composite_over_argb,
257 OIL_IMPL_FLAG_SSE2);
258
259 SSE_FUNCTION static void
composite_over_argb_const_src_sse_2pix(uint32_t * dest,const uint32_t * src,int n)260 composite_over_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src,
261 int n)
262 {
263 __m128i s, sa;
264 uint32_t srca;
265
266 srca = oil_argb_A(*src);
267 s = set1_argb_sse2(*src);
268 sa = negate_argb_sse2(argb_A_sse2(s));
269 for (; n >= 2; n -= 2) {
270 __m128i d;
271 d = load_argb_sse2(dest);
272 d = _mm_adds_epu8(s, muldiv_255_sse2(d, sa));
273 store_argb_sse2(dest, d);
274 dest += 2;
275 }
276 for (; n > 0; n--) {
277 uint32_t d = *dest;
278 d = oil_argb(
279 COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(*src), srca),
280 COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(*src), srca),
281 COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(*src), srca),
282 COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(*src), srca));
283 *dest++ = d;
284 }
285 }
286 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_over_argb_const_src_sse_2pix,
287 composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
288
289 SSE_FUNCTION static void
composite_in_over_argb_sse_2pix(uint32_t * dest,const uint32_t * src,const uint8_t * mask,int n)290 composite_in_over_argb_sse_2pix (uint32_t *dest, const uint32_t *src,
291 const uint8_t *mask, int n)
292 {
293 for (; n >= 2; n -= 2) {
294 __m128i d, s, m;
295 s = load_argb_sse2(src);
296 m = load_u8_mask(mask);
297 d = load_argb_sse2(dest);
298 s = muldiv_255_sse2(s, m);
299 d = over_argb_sse2(d, s, argb_A_sse2(s));
300 store_argb_sse2(dest, d);
301 src += 2;
302 mask += 2;
303 dest += 2;
304 }
305 for (; n > 0; n--) {
306 uint32_t d = *dest, s = *src++, m = *mask++, color;
307 uint8_t srca;
308
309 color = oil_argb(
310 COMPOSITE_IN(oil_argb_A(s), m),
311 COMPOSITE_IN(oil_argb_R(s), m),
312 COMPOSITE_IN(oil_argb_G(s), m),
313 COMPOSITE_IN(oil_argb_B(s), m));
314 srca = oil_argb_A(color);
315 d = oil_argb(
316 COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
317 COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
318 COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
319 COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
320 *dest++ = d;
321 }
322 }
323 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_over_argb_sse_2pix, composite_in_over_argb,
324 OIL_IMPL_FLAG_SSE2);
325
326 SSE_FUNCTION static void
composite_in_over_argb_const_src_sse_2pix(uint32_t * dest,const uint32_t * src,const uint8_t * mask,int n)327 composite_in_over_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src,
328 const uint8_t *mask, int n)
329 {
330 __m128i s;
331
332 s = set1_argb_sse2(*src);
333
334 for (; n >= 2; n -= 2) {
335 __m128i d, color, m;
336 m = load_u8_mask(mask);
337 d = load_argb_sse2(dest);
338 color = muldiv_255_sse2(s, m);
339 d = over_argb_sse2(d, color, argb_A_sse2(color));
340 store_argb_sse2(dest, d);
341 mask += 2;
342 dest += 2;
343 }
344 for (; n > 0; n--) {
345 uint32_t d = *dest, m = *mask++, color;
346 uint8_t srca;
347
348 color = oil_argb(
349 COMPOSITE_IN(oil_argb_A(*src), m),
350 COMPOSITE_IN(oil_argb_R(*src), m),
351 COMPOSITE_IN(oil_argb_G(*src), m),
352 COMPOSITE_IN(oil_argb_B(*src), m));
353 srca = oil_argb_A(color);
354 d = oil_argb(
355 COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
356 COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
357 COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
358 COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
359 *dest++ = d;
360 }
361 }
362 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_over_argb_const_src_sse_2pix,
363 composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
364
365 SSE_FUNCTION static void
composite_in_over_argb_const_mask_sse_2pix(uint32_t * dest,const uint32_t * src,const uint8_t * mask,int n)366 composite_in_over_argb_const_mask_sse_2pix (uint32_t *dest, const uint32_t *src,
367 const uint8_t *mask, int n)
368 {
369 __m128i m;
370
371 m = set1_u8_mask(*mask);
372
373 for (; n >= 2; n -= 2) {
374 __m128i d, s;
375 s = load_argb_sse2(src);
376 d = load_argb_sse2(dest);
377 s = muldiv_255_sse2(s, m);
378 d = over_argb_sse2(d, s, argb_A_sse2(s));
379 store_argb_sse2(dest, d);
380 src += 2;
381 dest += 2;
382 }
383 for (; n > 0; n--) {
384 uint32_t d = *dest, s = *src++, color;
385 uint8_t srca;
386
387 color = oil_argb(
388 COMPOSITE_IN(oil_argb_A(s), *mask),
389 COMPOSITE_IN(oil_argb_R(s), *mask),
390 COMPOSITE_IN(oil_argb_G(s), *mask),
391 COMPOSITE_IN(oil_argb_B(s), *mask));
392 srca = oil_argb_A(color);
393 d = oil_argb(
394 COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
395 COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
396 COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
397 COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
398 *dest++ = d;
399 }
400 }
401 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_over_argb_const_mask_sse_2pix,
402 composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2);
403
404 SSE_FUNCTION static void
composite_over_u8_sse_2pix(uint8_t * dest,const uint8_t * src,int n)405 composite_over_u8_sse_2pix (uint8_t *dest, const uint8_t *src, int n)
406 {
407 /* Initial operations to align the destination pointer */
408 for (; ((long)dest & 15) && (n > 0); n--) {
409 *dest = COMPOSITE_OVER(*dest, *src, *src);
410 src++;
411 dest++;
412 }
413 /* over_u8 can be dealt with using our argb code, with srca = s */
414 for (; n >= 8; n -= 8) {
415 __m128i d, s;
416 d = load_argb_sse2((uint32_t *)dest);
417 s = load_argb_sse2((uint32_t *)src);
418 store_argb_sse2((uint32_t *)dest, over_argb_sse2(d, s, s));
419 src += 8;
420 dest += 8;
421 }
422 for (; n > 0; n--) {
423 *dest = COMPOSITE_OVER(*dest, *src, *src);
424 src++;
425 dest++;
426 }
427 }
428 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_over_u8_sse_2pix, composite_over_u8,
429 OIL_IMPL_FLAG_SSE2);
430 #endif
431
432