1 /*
2  * Copyright 2011 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "include/private/SkColorData.h"
9 #include "src/core/SkBlitRow.h"
10 #include "src/core/SkOpts.h"
11 #include "src/core/SkUtils.h"
12 
13 // Everyone agrees memcpy() is the best way to do this.
blit_row_s32_opaque(SkPMColor * dst,const SkPMColor * src,int count,U8CPU alpha)14 static void blit_row_s32_opaque(SkPMColor* dst,
15                                 const SkPMColor* src,
16                                 int count,
17                                 U8CPU alpha) {
18     SkASSERT(255 == alpha);
19     memcpy(dst, src, count * sizeof(SkPMColor));
20 }
21 
22 // We have SSE2, NEON, and portable implementations of
23 // blit_row_s32_blend() and blit_row_s32a_blend().
24 
25 // TODO(mtklein): can we do better in NEON than 2 pixels at a time?
26 
27 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
28     #include <emmintrin.h>
29 
// Interpolates the packed pixels held in `src` and `dst`.  For every 8-bit
// channel this evaluates
//
//     dst + (((src - dst) * src_scale) >> 8)
//
// The arithmetic is done in 16-bit lanes: the low byte of each lane ("rb") and
// the high byte of each lane ("ag") are processed separately and recombined.
static inline __m128i SkPMLerp_SSE2(const __m128i& src,
                                    const __m128i& dst,
                                    const unsigned src_scale) {
    // Selects the low byte of every 16-bit lane.
    const __m128i kLowBytes = _mm_set1_epi32(0x00FF00FF);
    const __m128i scale     = _mm_set1_epi16(src_scale);

    // Per-channel (src - dst), each channel widened into its own 16-bit lane.
    const __m128i rb_delta = _mm_sub_epi16(_mm_and_si128(kLowBytes, src),
                                           _mm_and_si128(kLowBytes, dst));
    const __m128i ag_delta = _mm_sub_epi16(_mm_srli_epi16(src, 8),
                                           _mm_srli_epi16(dst, 8));

    // Scale the differences.  Only bits 8..15 of each product are wanted:
    // rb shifts them down into the low byte, ag keeps them in the high byte.
    const __m128i rb_scaled = _mm_srli_epi16(_mm_mullo_epi16(rb_delta, scale), 8);
    const __m128i ag_scaled = _mm_andnot_si128(kLowBytes,
                                               _mm_mullo_epi16(ag_delta, scale));

    // Recombine the two halves and add the (byte-wise wrapping) difference to dst.
    return _mm_add_epi8(dst, _mm_or_si128(rb_scaled, ag_scaled));
}
57 
58 
blit_row_s32_blend(SkPMColor * dst,const SkPMColor * src,int count,U8CPU alpha)59     static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
60         SkASSERT(alpha <= 255);
61 
62         auto src4 = (const __m128i*)src;
63         auto dst4 = (      __m128i*)dst;
64 
65         while (count >= 4) {
66             _mm_storeu_si128(dst4, SkPMLerp_SSE2(_mm_loadu_si128(src4),
67                                                  _mm_loadu_si128(dst4),
68                                                  SkAlpha255To256(alpha)));
69             src4++;
70             dst4++;
71             count -= 4;
72         }
73 
74         src = (const SkPMColor*)src4;
75         dst = (      SkPMColor*)dst4;
76 
77         while (count --> 0) {
78             *dst = SkPMLerp(*src, *dst, SkAlpha255To256(alpha));
79             src++;
80             dst++;
81         }
82     }
83 
SkBlendARGB32_SSE2(const __m128i & src,const __m128i & dst,const unsigned aa)84     static inline __m128i SkBlendARGB32_SSE2(const __m128i& src,
85                                              const __m128i& dst,
86                                              const unsigned aa) {
87         unsigned alpha = SkAlpha255To256(aa);
88         __m128i src_scale = _mm_set1_epi16(alpha);
89         // SkAlphaMulInv256(SkGetPackedA32(src), src_scale)
90         __m128i dst_scale = _mm_srli_epi32(src, 24);
91         // High words in dst_scale are 0, so it's safe to multiply with 16-bit src_scale.
92         dst_scale = _mm_mullo_epi16(dst_scale, src_scale);
93         dst_scale = _mm_sub_epi32(_mm_set1_epi32(0xFFFF), dst_scale);
94         dst_scale = _mm_add_epi32(dst_scale, _mm_srli_epi32(dst_scale, 8));
95         dst_scale = _mm_srli_epi32(dst_scale, 8);
96         // Duplicate scales into 2x16-bit pattern per pixel.
97         dst_scale = _mm_shufflelo_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0));
98         dst_scale = _mm_shufflehi_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0));
99 
100         const __m128i mask = _mm_set1_epi32(0x00FF00FF);
101 
102         // Unpack the 16x8-bit source/destination into 2 8x16-bit splayed halves.
103         __m128i src_rb = _mm_and_si128(mask, src);
104         __m128i src_ag = _mm_srli_epi16(src, 8);
105         __m128i dst_rb = _mm_and_si128(mask, dst);
106         __m128i dst_ag = _mm_srli_epi16(dst, 8);
107 
108         // Scale them.
109         src_rb = _mm_mullo_epi16(src_rb, src_scale);
110         src_ag = _mm_mullo_epi16(src_ag, src_scale);
111         dst_rb = _mm_mullo_epi16(dst_rb, dst_scale);
112         dst_ag = _mm_mullo_epi16(dst_ag, dst_scale);
113 
114         // Add the scaled source and destination.
115         dst_rb = _mm_add_epi16(src_rb, dst_rb);
116         dst_ag = _mm_add_epi16(src_ag, dst_ag);
117 
118         // Unsplay the halves back together.
119         dst_rb = _mm_srli_epi16(dst_rb, 8);
120         dst_ag = _mm_andnot_si128(mask, dst_ag);
121         return _mm_or_si128(dst_rb, dst_ag);
122     }
123 
blit_row_s32a_blend(SkPMColor * dst,const SkPMColor * src,int count,U8CPU alpha)124     static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
125         SkASSERT(alpha <= 255);
126 
127         auto src4 = (const __m128i*)src;
128         auto dst4 = (      __m128i*)dst;
129 
130         while (count >= 4) {
131             _mm_storeu_si128(dst4, SkBlendARGB32_SSE2(_mm_loadu_si128(src4),
132                                                       _mm_loadu_si128(dst4),
133                                                       alpha));
134             src4++;
135             dst4++;
136             count -= 4;
137         }
138 
139         src = (const SkPMColor*)src4;
140         dst = (      SkPMColor*)dst4;
141 
142         while (count --> 0) {
143             *dst = SkBlendARGB32(*src, *dst, alpha);
144             src++;
145             dst++;
146         }
147     }
148 
149 #elif defined(SK_ARM_HAS_NEON)
150     #include <arm_neon.h>
151 
// NEON row blit for the "global alpha only" case.  Per channel:
//
//     dst = (src * src_scale + dst * dst_scale) >> 8
//
// with src_scale = SkAlpha255To256(alpha) and dst_scale = 256 - src_scale.
// Pixels are handled two at a time, with a single-lane pass for an odd tail.
//
//   dst    destination pixels, updated in place
//   src    source pixels
//   count  number of pixels; nothing is written when count <= 0
//   alpha  global alpha in [0, 255] (checked with SkASSERT)
static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);

    const uint16_t src_scale = SkAlpha255To256(alpha);
    const uint16_t dst_scale = 256 - src_scale;

    // Arithmetic shared by the paired and single-pixel paths: widen to 16 bits,
    // scale each side, add, then narrow back down by taking bits 8..15.
    auto blend = [src_scale, dst_scale](uint8x8_t s, uint8x8_t d) -> uint8x8_t {
        const uint16x8_t scaled_src = vmulq_u16(vmovl_u8(s), vdupq_n_u16(src_scale));
        const uint16x8_t scaled_dst = vmull_u8(d, vdup_n_u8(dst_scale));
        return vshrn_n_u16(vaddq_u16(scaled_dst, scaled_src), 8);
    };

    for (; count >= 2; count -= 2, src += 2, dst += 2) {
        const uint8x8_t s = vreinterpret_u8_u32(vld1_u32(src));
        const uint8x8_t d = vreinterpret_u8_u32(vld1_u32(dst));
        vst1_u32(dst, vreinterpret_u32_u8(blend(s, d)));
    }

    if (count == 1) {
        // Load just lane 0 into zeroed registers so only one pixel is read and
        // only one pixel is written back.
        const uint32x2_t zero = vdup_n_u32(0);
        const uint8x8_t s = vreinterpret_u8_u32(vld1_lane_u32(src, zero, 0));
        const uint8x8_t d = vreinterpret_u8_u32(vld1_lane_u32(dst, zero, 0));
        vst1_lane_u32(dst, vreinterpret_u32_u8(blend(s, d)), 0);
    }
}
196 
// NEON row blit for a source with per-pixel alpha plus a global alpha.
// Per channel:
//
//     dst = (src * alpha256 + dst * dst_scale) >> 8
//
// where alpha256 = SkAlpha255To256(alpha) and dst_scale is
// SkAlphaMulInv256(source pixel alpha, alpha256).
//
//   dst    destination pixels, updated in place
//   src    source pixels
//   count  number of pixels; nothing is written when count <= 0
//   alpha  global alpha; must be < 255 (checked with SkASSERT), because the
//          vector loop's dst_scale approximation is only valid for
//          alpha256 <= 255
static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
    SkASSERT(alpha < 255);

    // Match the other implementations, which do nothing for a non-positive
    // count.  Without this guard a negative count would touch memory in the
    // odd-pixel path and never terminate the main loop.
    if (count <= 0) {
        return;
    }

    unsigned alpha256 = SkAlpha255To256(alpha);

    // Peel off one pixel first when count is odd so the main loop can always
    // work on pairs.
    if (count & 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vdst_wide, vsrc_wide;
        unsigned dst_scale;

        // Only lane 0 is loaded; the other lane stays zero.
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Byte 3 of the pixel is used as its alpha.
        dst_scale = vget_lane_u8(vsrc, 3);
        dst_scale = SkAlphaMulInv256(dst_scale, alpha256);

        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);

        vdst_wide = vmovl_u8(vdst);
        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);

        vdst_wide += vsrc_wide;
        vres = vshrn_n_u16(vdst_wide, 8);

        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
        dst++;
        src++;
        count--;
    }

    // Table lookup indices that broadcast byte 3 (first pixel's alpha) across
    // lanes 0-3 and byte 7 (second pixel's alpha) across lanes 4-7.
    uint8x8_t alpha_mask;
    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);

    // count is even here, so each iteration consumes exactly two pixels.
    while (count > 0) {

        uint8x8_t vsrc, vdst, vres, vsrc_alphas;
        uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;

        __builtin_prefetch(src+32);
        __builtin_prefetch(dst+32);

        vsrc = vreinterpret_u8_u32(vld1_u32(src));
        vdst = vreinterpret_u8_u32(vld1_u32(dst));

        vsrc_scale = vdupq_n_u16(alpha256);

        vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
        vdst_scale = vmovl_u8(vsrc_alphas);
        // Calculate SkAlphaMulInv256(vdst_scale, vsrc_scale).
        // A 16-bit lane would overflow if we used 0xFFFF here,
        // so use an approximation with 0xFF00 that is off by 1,
        // and add back 1 after to get the correct value.
        // This is valid if alpha256 <= 255.
        vdst_scale = vmlsq_u16(vdupq_n_u16(0xFF00), vdst_scale, vsrc_scale);
        vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, 8);
        vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8);

        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide *= vsrc_scale;

        vdst_wide = vmovl_u8(vdst);
        vdst_wide *= vdst_scale;

        vdst_wide += vsrc_wide;
        vres = vshrn_n_u16(vdst_wide, 8);

        vst1_u32(dst, vreinterpret_u32_u8(vres));

        src += 2;
        dst += 2;
        count -= 2;
    }
}
272 
273 #else
blit_row_s32_blend(SkPMColor * dst,const SkPMColor * src,int count,U8CPU alpha)274     static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
275         SkASSERT(alpha <= 255);
276         while (count --> 0) {
277             *dst = SkPMLerp(*src, *dst, SkAlpha255To256(alpha));
278             src++;
279             dst++;
280         }
281     }
282 
blit_row_s32a_blend(SkPMColor * dst,const SkPMColor * src,int count,U8CPU alpha)283     static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
284         SkASSERT(alpha <= 255);
285         while (count --> 0) {
286             *dst = SkBlendARGB32(*src, *dst, alpha);
287             src++;
288             dst++;
289         }
290     }
291 #endif
292 
Factory32(unsigned flags)293 SkBlitRow::Proc32 SkBlitRow::Factory32(unsigned flags) {
294     static const SkBlitRow::Proc32 kProcs[] = {
295         blit_row_s32_opaque,
296         blit_row_s32_blend,
297         nullptr,  // blit_row_s32a_opaque is in SkOpts
298         blit_row_s32a_blend
299     };
300 
301     SkASSERT(flags < SK_ARRAY_COUNT(kProcs));
302     flags &= SK_ARRAY_COUNT(kProcs) - 1;  // just to be safe
303 
304     return flags == 2 ? SkOpts::blit_row_s32a_opaque
305                       : kProcs[flags];
306 }
307 
Color32(SkPMColor dst[],const SkPMColor src[],int count,SkPMColor color)308 void SkBlitRow::Color32(SkPMColor dst[], const SkPMColor src[], int count, SkPMColor color) {
309     switch (SkGetPackedA32(color)) {
310         case   0: memmove(dst, src, count * sizeof(SkPMColor)); return;
311         case 255: sk_memset32(dst, color, count);               return;
312     }
313     return SkOpts::blit_row_color32(dst, src, count, color);
314 }
315