1 /*
2  * Copyright (c) 2005
3  *	Eric Anholt.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <stdint.h>
#include <emmintrin.h>
#include <liboil/liboilclasses.h>
#include <liboil/liboilfunction.h>
#include <liboil/liboilcolorspace.h>
34 
35 #ifdef HAVE_I386
36 #define SSE_FUNCTION __attribute__((force_align_arg_pointer))
37 #else
38 #define SSE_FUNCTION
39 #endif
40 
/* non-SSE2 compositing support: scalar per-channel helpers used for the
 * tail pixels that do not fill a full 2-pixel SSE2 pass.
 */
/* Porter-Duff "over": d + s - d*m/255, where m is the source alpha. */
#define COMPOSITE_OVER(d,s,m) ((d) + (s) - oil_muldiv_255((d),(m)))
/* Saturating (clamped to 255) add of two channel values. */
#define COMPOSITE_ADD(d,s) oil_clamp_255((d) + (s))
/* "in": scale a channel by a mask value, with rounding division by 255. */
#define COMPOSITE_IN(s,m) oil_muldiv_255((s),(m))
45 
46 /* rgba values in SSE2 code will be unpacked as 16-bit integers per channel with
47  * the channel value in the low byte.  This means 2 pixels per pass.
48  */
49 
50 #ifdef ENABLE_BROKEN_IMPLS
51 
/* View of a 128-bit SSE register as two 64-bit halves, so the constants
 * below can be written as plain integer literals.
 */
union m128_int {
  __m128i m128;
  uint64_t ull[2];
};

/* Constant tables used by the SSE2 kernels:
 *   sse_8x00ff: 0x00ff in all eight 16-bit lanes (for 255-x negation)
 *   sse_8x0080: 0x0080 in all eight 16-bit lanes (rounding bias for the
 *               divide-by-255 approximation)
 */
static const struct _SSEData {
  union m128_int sse_8x00ff;
  union m128_int sse_8x0080;
} c = {
    .sse_8x00ff.ull =	{0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL},
    .sse_8x0080.ull =	{0x0080008000800080ULL, 0x0080008000800080ULL},
};

/* Fetch one of the constants above as an __m128i. */
#define MC(x) (c.sse_##x.m128)
66 
67 /* Shuffles the given value such that the alpha for each pixel appears in each
68  * channel of the pixel.
69  */
70 SSE_FUNCTION static inline __m128i
argb_A_sse2(__m128i a)71 argb_A_sse2(__m128i a)
72 {
73   a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(3,3,3,3));
74   a = _mm_shufflelo_epi16(a, _MM_SHUFFLE(3,3,3,3));
75   return a;
76 }
77 
78 /* Multiplies the pixel data in a channel-by-channel by b, and divides the
79  * result by 255, with rounding.
80  */
81 SSE_FUNCTION static inline __m128i
muldiv_255_sse2(__m128i a,__m128i b)82 muldiv_255_sse2(__m128i a, __m128i b)
83 {
84   __m128i ret;
85   __m128i roundconst = MC(8x0080);
86 
87   ret = _mm_mullo_epi16(a, b);
88   ret = _mm_adds_epu16(ret, roundconst);
89   ret = _mm_adds_epu16(ret, _mm_srli_epi16(ret, 8));
90   ret = _mm_srli_epi16(ret, 8);
91 
92   return ret;
93 }
94 
95 SSE_FUNCTION static inline __m128i
negate_argb_sse2(__m128i a)96 negate_argb_sse2(__m128i a)
97 {
98   return _mm_xor_si128(a, MC(8x00ff));
99 }
100 
101 /* Loads the 2 (unaligned) pixels at *src into unpacked SSE2 registers */
102 SSE_FUNCTION static inline __m128i
load_argb_sse2(const uint32_t * src)103 load_argb_sse2(const uint32_t *src)
104 {
105   __m128i pix;
106 
107   pix = _mm_loadl_epi64((__m128i *)src);
108   pix = _mm_unpacklo_epi8(pix, _mm_setzero_si128());
109   return pix;
110 }
111 
112 SSE_FUNCTION static inline __m128i
set1_argb_sse2(uint32_t src)113 set1_argb_sse2(uint32_t src)
114 {
115   __m128i pix;
116 
117   pix = _mm_set1_epi32(src);
118   pix = _mm_unpacklo_epi8(pix, _mm_setzero_si128());
119   return pix;
120 }
121 
122 SSE_FUNCTION static inline __m128i
load_u8_mask(const uint8_t * m)123 load_u8_mask(const uint8_t *m)
124 {
125   return _mm_unpacklo_epi64(_mm_set1_epi16(m[0]), _mm_set1_epi16(m[1]));
126 }
127 
128 SSE_FUNCTION static inline __m128i
set1_u8_mask(uint8_t m)129 set1_u8_mask(uint8_t m)
130 {
131   return _mm_unpacklo_epi8(_mm_set1_epi8(m), _mm_setzero_si128());
132 }
133 
134 /* Stores the 2 unpacked pixels in pix into the (unaligned) *dest */
135 SSE_FUNCTION static void
store_argb_sse2(uint32_t * dest,__m128i pix)136 store_argb_sse2(uint32_t *dest, __m128i pix)
137 {
138   pix = _mm_packus_epi16(pix, pix);
139   _mm_storel_epi64((__m128i *)dest, pix);
140 }
141 
142 SSE_FUNCTION static __m128i
over_argb_sse2(__m128i dest,__m128i src,__m128i srca)143 over_argb_sse2(__m128i dest, __m128i src, __m128i srca)
144 {
145   return _mm_adds_epu8(src, muldiv_255_sse2(dest, negate_argb_sse2(srca)));
146 }
147 
148 SSE_FUNCTION static void
composite_in_argb_sse_2pix(uint32_t * dest,const uint32_t * src,const uint8_t * mask,int n)149 composite_in_argb_sse_2pix (uint32_t *dest, const uint32_t *src,
150     const uint8_t *mask, int n)
151 {
152   for (; n >= 2; n -= 2) {
153     __m128i s, m;
154     s = load_argb_sse2(src);
155     m = load_u8_mask(mask);
156     store_argb_sse2(dest, muldiv_255_sse2(s, m));
157     src += 2;
158     mask += 2;
159     dest += 2;
160   }
161   for (; n > 0; n--) {
162     uint32_t s = *src++;
163     uint8_t m = *mask++;
164 
165     *dest++ = oil_argb(
166 	COMPOSITE_IN(oil_argb_A(s), m),
167 	COMPOSITE_IN(oil_argb_R(s), m),
168 	COMPOSITE_IN(oil_argb_G(s), m),
169 	COMPOSITE_IN(oil_argb_B(s), m));
170   }
171 }
172 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_argb_sse_2pix, composite_in_argb,
173     OIL_IMPL_FLAG_SSE2);
174 
175 SSE_FUNCTION static void
composite_in_argb_const_src_sse_2pix(uint32_t * dest,const uint32_t * src,const uint8_t * mask,int n)176 composite_in_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src,
177     const uint8_t *mask, int n)
178 {
179   __m128i s;
180 
181   s = set1_argb_sse2(*src);
182 
183   for (; n >= 2; n -= 2) {
184     __m128i m;
185     m = load_u8_mask(mask);
186     store_argb_sse2(dest, muldiv_255_sse2(s, m));
187     mask += 2;
188     dest += 2;
189   }
190   for (; n > 0; n--) {
191     uint8_t m = *mask++;
192 
193     *dest++ = oil_argb(
194 	COMPOSITE_IN(oil_argb_A(*src), m),
195 	COMPOSITE_IN(oil_argb_R(*src), m),
196 	COMPOSITE_IN(oil_argb_G(*src), m),
197 	COMPOSITE_IN(oil_argb_B(*src), m));
198   }
199 }
200 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_argb_const_src_sse_2pix,
201     composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2);
202 
#ifdef SSE_ALIGN
/* dest = src * (*mask) / 255 for a single constant mask value, which is
 * broadcast once before the loop.
 */
SSE_FUNCTION static void
composite_in_argb_const_mask_sse_2pix (uint32_t *dest, const uint32_t *src,
    const uint8_t *mask, int n)
{
  const __m128i m = set1_u8_mask(*mask);

  while (n >= 2) {
    store_argb_sse2(dest, muldiv_255_sse2(load_argb_sse2(src), m));
    src += 2;
    dest += 2;
    n -= 2;
  }
  while (n-- > 0) {
    uint32_t s = *src++;

    *dest++ = oil_argb(
        COMPOSITE_IN(oil_argb_A(s), mask[0]),
        COMPOSITE_IN(oil_argb_R(s), mask[0]),
        COMPOSITE_IN(oil_argb_G(s), mask[0]),
        COMPOSITE_IN(oil_argb_B(s), mask[0]));
  }
}
OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_argb_const_mask_sse_2pix,
    composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2);
#endif
232 
233 SSE_FUNCTION static void
composite_over_argb_sse_2pix(uint32_t * dest,const uint32_t * src,int n)234 composite_over_argb_sse_2pix (uint32_t *dest, const uint32_t *src, int n)
235 {
236   for (; n >= 2; n -= 2) {
237     __m128i d, s;
238     s = load_argb_sse2(src);
239     d = load_argb_sse2(dest);
240     d = over_argb_sse2(d, s, argb_A_sse2(s));
241     store_argb_sse2(dest, d);
242     src += 2;
243     dest += 2;
244   }
245   for (; n > 0; n--) {
246     uint32_t d = *dest, s = *src++;
247     uint8_t srca = oil_argb_A(s);
248     d = oil_argb(
249 	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(s), srca),
250 	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(s), srca),
251 	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(s), srca),
252 	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(s), srca));
253     *dest++ = d;
254   }
255 }
256 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_over_argb_sse_2pix, composite_over_argb,
257     OIL_IMPL_FLAG_SSE2);
258 
259 SSE_FUNCTION static void
composite_over_argb_const_src_sse_2pix(uint32_t * dest,const uint32_t * src,int n)260 composite_over_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src,
261     int n)
262 {
263   __m128i s, sa;
264   uint32_t srca;
265 
266   srca = oil_argb_A(*src);
267   s = set1_argb_sse2(*src);
268   sa = negate_argb_sse2(argb_A_sse2(s));
269   for (; n >= 2; n -= 2) {
270     __m128i d;
271     d = load_argb_sse2(dest);
272     d = _mm_adds_epu8(s, muldiv_255_sse2(d, sa));
273     store_argb_sse2(dest, d);
274     dest += 2;
275   }
276   for (; n > 0; n--) {
277     uint32_t d = *dest;
278     d = oil_argb(
279 	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(*src), srca),
280 	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(*src), srca),
281 	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(*src), srca),
282 	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(*src), srca));
283     *dest++ = d;
284   }
285 }
286 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_over_argb_const_src_sse_2pix,
287     composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
288 
289 SSE_FUNCTION static void
composite_in_over_argb_sse_2pix(uint32_t * dest,const uint32_t * src,const uint8_t * mask,int n)290 composite_in_over_argb_sse_2pix (uint32_t *dest, const uint32_t *src,
291     const uint8_t *mask, int n)
292 {
293   for (; n >= 2; n -= 2) {
294     __m128i d, s, m;
295     s = load_argb_sse2(src);
296     m = load_u8_mask(mask);
297     d = load_argb_sse2(dest);
298     s = muldiv_255_sse2(s, m);
299     d = over_argb_sse2(d, s, argb_A_sse2(s));
300     store_argb_sse2(dest, d);
301     src += 2;
302     mask += 2;
303     dest += 2;
304   }
305   for (; n > 0; n--) {
306     uint32_t d = *dest, s = *src++, m = *mask++, color;
307     uint8_t srca;
308 
309     color = oil_argb(
310         COMPOSITE_IN(oil_argb_A(s), m),
311         COMPOSITE_IN(oil_argb_R(s), m),
312         COMPOSITE_IN(oil_argb_G(s), m),
313         COMPOSITE_IN(oil_argb_B(s), m));
314     srca = oil_argb_A(color);
315     d = oil_argb(
316 	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
317 	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
318 	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
319 	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
320     *dest++ = d;
321   }
322 }
323 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_over_argb_sse_2pix, composite_in_over_argb,
324     OIL_IMPL_FLAG_SSE2);
325 
326 SSE_FUNCTION static void
composite_in_over_argb_const_src_sse_2pix(uint32_t * dest,const uint32_t * src,const uint8_t * mask,int n)327 composite_in_over_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src,
328     const uint8_t *mask, int n)
329 {
330   __m128i s;
331 
332   s = set1_argb_sse2(*src);
333 
334   for (; n >= 2; n -= 2) {
335     __m128i d, color, m;
336     m = load_u8_mask(mask);
337     d = load_argb_sse2(dest);
338     color = muldiv_255_sse2(s, m);
339     d = over_argb_sse2(d, color, argb_A_sse2(color));
340     store_argb_sse2(dest, d);
341     mask += 2;
342     dest += 2;
343   }
344   for (; n > 0; n--) {
345     uint32_t d = *dest, m = *mask++, color;
346     uint8_t srca;
347 
348     color = oil_argb(
349         COMPOSITE_IN(oil_argb_A(*src), m),
350         COMPOSITE_IN(oil_argb_R(*src), m),
351         COMPOSITE_IN(oil_argb_G(*src), m),
352         COMPOSITE_IN(oil_argb_B(*src), m));
353     srca = oil_argb_A(color);
354     d = oil_argb(
355 	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
356 	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
357 	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
358 	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
359     *dest++ = d;
360   }
361 }
362 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_over_argb_const_src_sse_2pix,
363     composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
364 
365 SSE_FUNCTION static void
composite_in_over_argb_const_mask_sse_2pix(uint32_t * dest,const uint32_t * src,const uint8_t * mask,int n)366 composite_in_over_argb_const_mask_sse_2pix (uint32_t *dest, const uint32_t *src,
367     const uint8_t *mask, int n)
368 {
369   __m128i m;
370 
371   m = set1_u8_mask(*mask);
372 
373   for (; n >= 2; n -= 2) {
374     __m128i d, s;
375     s = load_argb_sse2(src);
376     d = load_argb_sse2(dest);
377     s = muldiv_255_sse2(s, m);
378     d = over_argb_sse2(d, s, argb_A_sse2(s));
379     store_argb_sse2(dest, d);
380     src += 2;
381     dest += 2;
382   }
383   for (; n > 0; n--) {
384     uint32_t d = *dest, s = *src++, color;
385     uint8_t srca;
386 
387     color = oil_argb(
388         COMPOSITE_IN(oil_argb_A(s), *mask),
389         COMPOSITE_IN(oil_argb_R(s), *mask),
390         COMPOSITE_IN(oil_argb_G(s), *mask),
391         COMPOSITE_IN(oil_argb_B(s), *mask));
392     srca = oil_argb_A(color);
393     d = oil_argb(
394 	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
395 	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
396 	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
397 	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
398     *dest++ = d;
399   }
400 }
401 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_over_argb_const_mask_sse_2pix,
402     composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2);
403 
404 SSE_FUNCTION static void
composite_over_u8_sse_2pix(uint8_t * dest,const uint8_t * src,int n)405 composite_over_u8_sse_2pix (uint8_t *dest, const uint8_t *src, int n)
406 {
407   /* Initial operations to align the destination pointer */
408   for (; ((long)dest & 15) && (n > 0); n--) {
409     *dest = COMPOSITE_OVER(*dest, *src, *src);
410     src++;
411     dest++;
412   }
413   /* over_u8 can be dealt with using our argb code, with srca = s */
414   for (; n >= 8; n -= 8) {
415     __m128i d, s;
416     d = load_argb_sse2((uint32_t *)dest);
417     s = load_argb_sse2((uint32_t *)src);
418     store_argb_sse2((uint32_t *)dest, over_argb_sse2(d, s, s));
419     src += 8;
420     dest += 8;
421   }
422   for (; n > 0; n--) {
423     *dest = COMPOSITE_OVER(*dest, *src, *src);
424     src++;
425     dest++;
426   }
427 }
428 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_over_u8_sse_2pix, composite_over_u8,
429     OIL_IMPL_FLAG_SSE2);
430 #endif
431 
432