1 /*
2  * Copyright 2012 The Android Open Source Project
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "SkBlitRow_opts_arm_neon.h"
9 
10 #include "SkBlitMask.h"
11 #include "SkBlitRow.h"
12 #include "SkColorPriv.h"
13 #include "SkDither.h"
14 #include "SkMathPriv.h"
15 #include "SkUtils.h"
16 
17 #include "SkColor_opts_neon.h"
18 #include <arm_neon.h>
19 
20 #ifdef SK_CPU_ARM64
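// Helper loaders for the AArch64 paths below: a single post-indexed ld4
// de-interleaves 8 pixels into per-channel 8-byte lanes and advances 'src'
// by 32 bytes. sk_vld4_u8_arm64_3 only forwards the first three planes
// (its callers ignore alpha); sk_vld4_u8_arm64_4 forwards all four.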
21 static inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) {
22     uint8x8x4_t vsrc;
23     uint8x8_t vsrc_0, vsrc_1, vsrc_2;
24 
25     asm (
26         "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
27         "mov    %[vsrc0].8b, v0.8b             \t\n"
28         "mov    %[vsrc1].8b, v1.8b             \t\n"
29         "mov    %[vsrc2].8b, v2.8b             \t\n"
30         : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
31           [vsrc2] "=w" (vsrc_2), [src] "+&r" (src)
32         : : "v0", "v1", "v2", "v3"
33     );
34 
35     vsrc.val[0] = vsrc_0;
36     vsrc.val[1] = vsrc_1;
37     vsrc.val[2] = vsrc_2;
38 
39     return vsrc;
40 }
41 
42 static inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) {
43     uint8x8x4_t vsrc;
44     uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3;
45 
46     asm (
47         "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
48         "mov    %[vsrc0].8b, v0.8b             \t\n"
49         "mov    %[vsrc1].8b, v1.8b             \t\n"
50         "mov    %[vsrc2].8b, v2.8b             \t\n"
51         "mov    %[vsrc3].8b, v3.8b             \t\n"
52         : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
53           [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3),
54           [src] "+&r" (src)
55         : : "v0", "v1", "v2", "v3"
56     );
57 
58     vsrc.val[0] = vsrc_0;
59     vsrc.val[1] = vsrc_1;
60     vsrc.val[2] = vsrc_2;
61     vsrc.val[3] = vsrc_3;
62 
63     return vsrc;
64 }
65 #endif
66 
67 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
68                            const SkPMColor* SK_RESTRICT src, int count,
69                            U8CPU alpha, int /*x*/, int /*y*/) {
70     SkASSERT(255 == alpha);
71 
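    // Each pixel is repacked from 8888 to 565 by keeping the top 5/6/5 bits
    // of each color channel; a scalar sketch of the same conversion is
    //   d = (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
    // and the loop below performs it for 8 pixels at a time.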
72     while (count >= 8) {
73         uint8x8x4_t vsrc;
74         uint16x8_t vdst;
75 
76         // Load
77 #ifdef SK_CPU_ARM64
78         vsrc = sk_vld4_u8_arm64_3(src);
79 #else
80         vsrc = vld4_u8((uint8_t*)src);
81         src += 8;
82 #endif
83 
84         // Convert src to 565
85         vdst = SkPixel32ToPixel16_neon8(vsrc);
86 
87         // Store
88         vst1q_u16(dst, vdst);
89 
90         // Prepare next iteration
91         dst += 8;
92         count -= 8;
93     }
94 
95     // Leftovers
96     while (count > 0) {
97         SkPMColor c = *src++;
98         SkPMColorAssert(c);
99         *dst = SkPixel32ToPixel16_ToU16(c);
100         dst++;
101         count--;
102     }
103 }
104 
105 void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
106                           const SkPMColor* SK_RESTRICT src, int count,
107                           U8CPU alpha, int /*x*/, int /*y*/) {
108     SkASSERT(255 > alpha);
109 
110     uint16x8_t vmask_blue, vscale;
111 
112     // prepare constants
113     vscale = vdupq_n_u16(SkAlpha255To256(alpha));
114     vmask_blue = vmovq_n_u16(0x1F);
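    // Each 565 channel is blended as
    //   result = dst + (((src - dst) * scale) >> 8), with scale = alpha + 1,
    // the same SkAlphaBlend() formula used by the scalar leftover loop at the
    // bottom of this function.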
115 
116     while (count >= 8) {
117         uint8x8x4_t vsrc;
118         uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
119         uint16x8_t vres_r, vres_g, vres_b;
120 
121         // Load src
122 #ifdef SK_CPU_ARM64
123         vsrc = sk_vld4_u8_arm64_3(src);
124 #else
125         {
126         register uint8x8_t d0 asm("d0");
127         register uint8x8_t d1 asm("d1");
128         register uint8x8_t d2 asm("d2");
129         register uint8x8_t d3 asm("d3");
130 
131         asm (
132             "vld4.8    {d0-d3},[%[src]]!"
133             : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
134             :
135         );
136         vsrc.val[0] = d0;
137         vsrc.val[1] = d1;
138         vsrc.val[2] = d2;
139         }
140 #endif
141 
142         // Load and unpack dst
143         vdst = vld1q_u16(dst);
144         vdst_g = vshlq_n_u16(vdst, 5);        // shift green to top of lanes
145         vdst_b = vandq_u16(vdst, vmask_blue); // extract blue
146         vdst_r = vshrq_n_u16(vdst, 6+5);      // extract red
147         vdst_g = vshrq_n_u16(vdst_g, 5+5);    // extract green
148 
149         // Shift src to 565 range
150         vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3);
151         vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2);
152         vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3);
153 
154         // Scale src - dst
155         vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r;
156         vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g;
157         vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b;
158 
159         vres_r = vshrq_n_u16(vres_r * vscale, 8);
160         vres_g = vshrq_n_u16(vres_g * vscale, 8);
161         vres_b = vshrq_n_u16(vres_b * vscale, 8);
162 
163         vres_r += vdst_r;
164         vres_g += vdst_g;
165         vres_b += vdst_b;
166 
167         // Combine
168         vres_b = vsliq_n_u16(vres_b, vres_g, 5);    // insert green into blue
169         vres_b = vsliq_n_u16(vres_b, vres_r, 6+5);  // insert red into green/blue
170 
171         // Store
172         vst1q_u16(dst, vres_b);
173         dst += 8;
174         count -= 8;
175     }
176     if (count > 0) {
177         int scale = SkAlpha255To256(alpha);
178         do {
179             SkPMColor c = *src++;
180             SkPMColorAssert(c);
181             uint16_t d = *dst;
182             *dst++ = SkPackRGB16(
183                     SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale),
184                     SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale),
185                     SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale));
186         } while (--count != 0);
187     }
188 }
189 
190 #ifdef SK_CPU_ARM32
191 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
192                            const SkPMColor* SK_RESTRICT src, int count,
193                            U8CPU alpha, int /*x*/, int /*y*/) {
194     SkASSERT(255 == alpha);
195 
196     if (count >= 8) {
197         uint16_t* SK_RESTRICT keep_dst = 0;
198 
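        // The asm block first deals with count % 8 pixels, then loops 8 at a
        // time: dst 565 is expanded to 8-bit channels, scaled by the inverted
        // source alpha (the shift/add-high-narrow sequence approximates the
        // divide by 255), the premultiplied source is saturating-added, and
        // the result is repacked to 565.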
199         asm volatile (
200                       "ands       ip, %[count], #7            \n\t"
201                       "vmov.u8    d31, #1<<7                  \n\t"
202                       "vld1.16    {q12}, [%[dst]]             \n\t"
203                       "vld4.8     {d0-d3}, [%[src]]           \n\t"
204                       // Thumb does not support the standard ARM conditional
205                       // instructions but instead requires the 'it' instruction
206                       // to signal conditional execution
207                       "it eq                                  \n\t"
208                       "moveq      ip, #8                      \n\t"
209                       "mov        %[keep_dst], %[dst]         \n\t"
210 
211                       "add        %[src], %[src], ip, LSL#2   \n\t"
212                       "add        %[dst], %[dst], ip, LSL#1   \n\t"
213                       "subs       %[count], %[count], ip      \n\t"
214                       "b          9f                          \n\t"
215                       // LOOP
216                       "2:                                         \n\t"
217 
218                       "vld1.16    {q12}, [%[dst]]!            \n\t"
219                       "vld4.8     {d0-d3}, [%[src]]!          \n\t"
220                       "vst1.16    {q10}, [%[keep_dst]]        \n\t"
221                       "sub        %[keep_dst], %[dst], #8*2   \n\t"
222                       "subs       %[count], %[count], #8      \n\t"
223                       "9:                                         \n\t"
224                       "pld        [%[dst],#32]                \n\t"
225                       // expand 0565 q12 to 8888 {d4-d7}
226                       "vmovn.u16  d4, q12                     \n\t"
227                       "vshr.u16   q11, q12, #5                \n\t"
228                       "vshr.u16   q10, q12, #6+5              \n\t"
229                       "vmovn.u16  d5, q11                     \n\t"
230                       "vmovn.u16  d6, q10                     \n\t"
231                       "vshl.u8    d4, d4, #3                  \n\t"
232                       "vshl.u8    d5, d5, #2                  \n\t"
233                       "vshl.u8    d6, d6, #3                  \n\t"
234 
235                       "vmovl.u8   q14, d31                    \n\t"
236                       "vmovl.u8   q13, d31                    \n\t"
237                       "vmovl.u8   q12, d31                    \n\t"
238 
239                       // duplicate in 4/2/1 & 8pix vsns
240                       "vmvn.8     d30, d3                     \n\t"
241                       "vmlal.u8   q14, d30, d6                \n\t"
242                       "vmlal.u8   q13, d30, d5                \n\t"
243                       "vmlal.u8   q12, d30, d4                \n\t"
244                       "vshr.u16   q8, q14, #5                 \n\t"
245                       "vshr.u16   q9, q13, #6                 \n\t"
246                       "vaddhn.u16 d6, q14, q8                 \n\t"
247                       "vshr.u16   q8, q12, #5                 \n\t"
248                       "vaddhn.u16 d5, q13, q9                 \n\t"
249                       "vaddhn.u16 d4, q12, q8                 \n\t"
250                       // intentionally don't calculate alpha
251                       // result in d4-d6
252 
253             #ifdef SK_PMCOLOR_IS_RGBA
254                       "vqadd.u8   d6, d6, d0                  \n\t"
255                       "vqadd.u8   d5, d5, d1                  \n\t"
256                       "vqadd.u8   d4, d4, d2                  \n\t"
257             #else
258                       "vqadd.u8   d6, d6, d2                  \n\t"
259                       "vqadd.u8   d5, d5, d1                  \n\t"
260                       "vqadd.u8   d4, d4, d0                  \n\t"
261             #endif
262 
263                       // pack 8888 {d4-d6} to 0565 q10
264                       "vshll.u8   q10, d6, #8                 \n\t"
265                       "vshll.u8   q3, d5, #8                  \n\t"
266                       "vshll.u8   q2, d4, #8                  \n\t"
267                       "vsri.u16   q10, q3, #5                 \n\t"
268                       "vsri.u16   q10, q2, #11                \n\t"
269 
270                       "bne        2b                          \n\t"
271 
272                       "1:                                         \n\t"
273                       "vst1.16      {q10}, [%[keep_dst]]      \n\t"
274                       : [count] "+r" (count)
275                       : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
276                       : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
277                       "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
278                       "d30","d31"
279                       );
280     }
281     else
282     {   // handle count < 8
283         uint16_t* SK_RESTRICT keep_dst = 0;
284 
285         asm volatile (
286                       "vmov.u8    d31, #1<<7                  \n\t"
287                       "mov        %[keep_dst], %[dst]         \n\t"
288 
289                       "tst        %[count], #4                \n\t"
290                       "beq        14f                         \n\t"
291                       "vld1.16    {d25}, [%[dst]]!            \n\t"
292                       "vld1.32    {q1}, [%[src]]!             \n\t"
293 
294                       "14:                                        \n\t"
295                       "tst        %[count], #2                \n\t"
296                       "beq        12f                         \n\t"
297                       "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
298                       "vld1.32    {d1}, [%[src]]!             \n\t"
299 
300                       "12:                                        \n\t"
301                       "tst        %[count], #1                \n\t"
302                       "beq        11f                         \n\t"
303                       "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
304                       "vld1.32    {d0[1]}, [%[src]]!          \n\t"
305 
306                       "11:                                        \n\t"
307                       // unzips achieve the same as a vld4 operation
308                       "vuzp.u16   q0, q1                      \n\t"
309                       "vuzp.u8    d0, d1                      \n\t"
310                       "vuzp.u8    d2, d3                      \n\t"
311                       // expand 0565 q12 to 8888 {d4-d7}
312                       "vmovn.u16  d4, q12                     \n\t"
313                       "vshr.u16   q11, q12, #5                \n\t"
314                       "vshr.u16   q10, q12, #6+5              \n\t"
315                       "vmovn.u16  d5, q11                     \n\t"
316                       "vmovn.u16  d6, q10                     \n\t"
317                       "vshl.u8    d4, d4, #3                  \n\t"
318                       "vshl.u8    d5, d5, #2                  \n\t"
319                       "vshl.u8    d6, d6, #3                  \n\t"
320 
321                       "vmovl.u8   q14, d31                    \n\t"
322                       "vmovl.u8   q13, d31                    \n\t"
323                       "vmovl.u8   q12, d31                    \n\t"
324 
325                       // duplicate in 4/2/1 & 8pix vsns
326                       "vmvn.8     d30, d3                     \n\t"
327                       "vmlal.u8   q14, d30, d6                \n\t"
328                       "vmlal.u8   q13, d30, d5                \n\t"
329                       "vmlal.u8   q12, d30, d4                \n\t"
330                       "vshr.u16   q8, q14, #5                 \n\t"
331                       "vshr.u16   q9, q13, #6                 \n\t"
332                       "vaddhn.u16 d6, q14, q8                 \n\t"
333                       "vshr.u16   q8, q12, #5                 \n\t"
334                       "vaddhn.u16 d5, q13, q9                 \n\t"
335                       "vaddhn.u16 d4, q12, q8                 \n\t"
336                       // intentionally don't calculate alpha
337                       // result in d4-d6
338 
339             #ifdef SK_PMCOLOR_IS_RGBA
340                       "vqadd.u8   d6, d6, d0                  \n\t"
341                       "vqadd.u8   d5, d5, d1                  \n\t"
342                       "vqadd.u8   d4, d4, d2                  \n\t"
343             #else
344                       "vqadd.u8   d6, d6, d2                  \n\t"
345                       "vqadd.u8   d5, d5, d1                  \n\t"
346                       "vqadd.u8   d4, d4, d0                  \n\t"
347             #endif
348 
349                       // pack 8888 {d4-d6} to 0565 q10
350                       "vshll.u8   q10, d6, #8                 \n\t"
351                       "vshll.u8   q3, d5, #8                  \n\t"
352                       "vshll.u8   q2, d4, #8                  \n\t"
353                       "vsri.u16   q10, q3, #5                 \n\t"
354                       "vsri.u16   q10, q2, #11                \n\t"
355 
356                       // store
357                       "tst        %[count], #4                \n\t"
358                       "beq        24f                         \n\t"
359                       "vst1.16    {d21}, [%[keep_dst]]!       \n\t"
360 
361                       "24:                                        \n\t"
362                       "tst        %[count], #2                \n\t"
363                       "beq        22f                         \n\t"
364                       "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"
365 
366                       "22:                                        \n\t"
367                       "tst        %[count], #1                \n\t"
368                       "beq        21f                         \n\t"
369                       "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"
370 
371                       "21:                                        \n\t"
372                       : [count] "+r" (count)
373                       : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
374                       : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
375                       "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
376                       "d30","d31"
377                       );
378     }
379 }
380 
381 #else // #ifdef SK_CPU_ARM32
382 
383 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
384                            const SkPMColor* SK_RESTRICT src, int count,
385                            U8CPU alpha, int /*x*/, int /*y*/) {
386     SkASSERT(255 == alpha);
387 
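    // The AArch64 asm below handles 16 pixels per iteration: it expands two
    // q-registers of 565 dst into 8-bit channels, accumulates
    // dst * (255 - srcA) via umlal with an addhn-based approximation of the
    // divide by 255, saturating-adds the premultiplied source, and repacks
    // with shll/sri. Remaining pixels fall through to the scalar loop below.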
388     if (count >= 16) {
389         asm (
390             "movi    v4.8h, #0x80                   \t\n"
391 
392             "1:                                     \t\n"
393             "sub     %w[count], %w[count], #16      \t\n"
394             "ld1     {v16.8h-v17.8h}, [%[dst]]      \t\n"
395             "ld4     {v0.16b-v3.16b}, [%[src]], #64 \t\n"
396             "prfm    pldl1keep, [%[src],#512]       \t\n"
397             "prfm    pldl1keep, [%[dst],#256]       \t\n"
398             "ushr    v20.8h, v17.8h, #5             \t\n"
399             "ushr    v31.8h, v16.8h, #5             \t\n"
400             "xtn     v6.8b, v31.8h                  \t\n"
401             "xtn2    v6.16b, v20.8h                 \t\n"
402             "ushr    v20.8h, v17.8h, #11            \t\n"
403             "shl     v19.16b, v6.16b, #2            \t\n"
404             "ushr    v31.8h, v16.8h, #11            \t\n"
405             "xtn     v22.8b, v31.8h                 \t\n"
406             "xtn2    v22.16b, v20.8h                \t\n"
407             "shl     v18.16b, v22.16b, #3           \t\n"
408             "mvn     v3.16b, v3.16b                 \t\n"
409             "xtn     v16.8b, v16.8h                 \t\n"
410             "mov     v7.16b, v4.16b                 \t\n"
411             "xtn2    v16.16b, v17.8h                \t\n"
412             "umlal   v7.8h, v3.8b, v19.8b           \t\n"
413             "shl     v16.16b, v16.16b, #3           \t\n"
414             "mov     v22.16b, v4.16b                \t\n"
415             "ushr    v24.8h, v7.8h, #6              \t\n"
416             "umlal   v22.8h, v3.8b, v18.8b          \t\n"
417             "ushr    v20.8h, v22.8h, #5             \t\n"
418             "addhn   v20.8b, v22.8h, v20.8h         \t\n"
419             "cmp     %w[count], #16                 \t\n"
420             "mov     v6.16b, v4.16b                 \t\n"
421             "mov     v5.16b, v4.16b                 \t\n"
422             "umlal   v6.8h, v3.8b, v16.8b           \t\n"
423             "umlal2  v5.8h, v3.16b, v19.16b         \t\n"
424             "mov     v17.16b, v4.16b                \t\n"
425             "ushr    v19.8h, v6.8h, #5              \t\n"
426             "umlal2  v17.8h, v3.16b, v18.16b        \t\n"
427             "addhn   v7.8b, v7.8h, v24.8h           \t\n"
428             "ushr    v18.8h, v5.8h, #6              \t\n"
429             "ushr    v21.8h, v17.8h, #5             \t\n"
430             "addhn2  v7.16b, v5.8h, v18.8h          \t\n"
431             "addhn2  v20.16b, v17.8h, v21.8h        \t\n"
432             "mov     v22.16b, v4.16b                \t\n"
433             "addhn   v6.8b, v6.8h, v19.8h           \t\n"
434             "umlal2  v22.8h, v3.16b, v16.16b        \t\n"
435             "ushr    v5.8h, v22.8h, #5              \t\n"
436             "addhn2  v6.16b, v22.8h, v5.8h          \t\n"
437             "uqadd   v7.16b, v1.16b, v7.16b         \t\n"
438 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
439             "uqadd   v20.16b, v2.16b, v20.16b       \t\n"
440             "uqadd   v6.16b, v0.16b, v6.16b         \t\n"
441 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
442             "uqadd   v20.16b, v0.16b, v20.16b       \t\n"
443             "uqadd   v6.16b, v2.16b, v6.16b         \t\n"
444 #else
445 #error "This function only supports BGRA and RGBA."
446 #endif
447             "shll    v22.8h, v20.8b, #8             \t\n"
448             "shll    v5.8h, v7.8b, #8               \t\n"
449             "sri     v22.8h, v5.8h, #5              \t\n"
450             "shll    v17.8h, v6.8b, #8              \t\n"
451             "shll2   v23.8h, v20.16b, #8            \t\n"
452             "shll2   v7.8h, v7.16b, #8              \t\n"
453             "sri     v22.8h, v17.8h, #11            \t\n"
454             "sri     v23.8h, v7.8h, #5              \t\n"
455             "shll2   v6.8h, v6.16b, #8              \t\n"
456             "st1     {v22.8h}, [%[dst]], #16        \t\n"
457             "sri     v23.8h, v6.8h, #11             \t\n"
458             "st1     {v23.8h}, [%[dst]], #16        \t\n"
459             "b.ge    1b                             \t\n"
460             : [dst] "+&r" (dst), [src] "+&r" (src), [count] "+&r" (count)
461             :: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
462                "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
463                "v31"
464         );
465     }
466         // Leftovers
467     if (count > 0) {
468         do {
469             SkPMColor c = *src++;
470             SkPMColorAssert(c);
471             if (c) {
472                 *dst = SkSrcOver32To16(c, *dst);
473             }
474             dst += 1;
475         } while (--count != 0);
476     }
477 }
478 #endif // #ifdef SK_CPU_ARM32
479 
480 static uint32_t pmcolor_to_expand16(SkPMColor c) {
481     unsigned r = SkGetPackedR32(c);
482     unsigned g = SkGetPackedG32(c);
483     unsigned b = SkGetPackedB32(c);
484     return (g << 24) | (r << 13) | (b << 2);
485 }
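// pmcolor_to_expand16() spreads the 8-bit source channels so that, after the
// final ">> 5" in Color32A_D565_neon(), they line up with
// SkExpand_rgb_16(dst) * scale (the expanded g:11 r:10 x:1 b:10 layout), and
// the sum can be folded back into a 565 pixel with SkCompact_rgb_16().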
486 
487 void Color32A_D565_neon(uint16_t dst[], SkPMColor src, int count, int x, int y) {
488     uint32_t src_expand;
489     unsigned scale;
490     uint16x8_t vmask_blue;
491 
492     if (count <= 0) return;
493     SkASSERT(((size_t)dst & 0x01) == 0);
494 
495     /*
496      * This preamble aligns dst to 8 bytes so that the NEON loads and
497      * stores in the main loop below are aligned.
498      */
499     src_expand = pmcolor_to_expand16(src);
500     scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;
501 
502 #define DST_ALIGN 8
503 
504     /*
505      * preamble_size is in bytes; the preamble loop below advances dst by 2 bytes (one 565 pixel) per iteration.
506      */
507     int preamble_size = (DST_ALIGN - (size_t)dst) & (DST_ALIGN - 1);
508 
509     for (int i = 0; i < preamble_size; i+=2, dst++) {
510         uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
511         *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
512         if (--count == 0)
513             break;
514     }
515 
516     int count16 = 0;
517     count16 = count >> 4;
518     vmask_blue = vmovq_n_u16(SK_B16_MASK);
519 
520     if (count16) {
521         uint16x8_t wide_sr;
522         uint16x8_t wide_sg;
523         uint16x8_t wide_sb;
524         uint16x8_t wide_256_sa;
525 
526         unsigned sr = SkGetPackedR32(src);
527         unsigned sg = SkGetPackedG32(src);
528         unsigned sb = SkGetPackedB32(src);
529         unsigned sa = SkGetPackedA32(src);
530 
531         // Operation: dst_rgb = src_rgb + ((256 - src_a) >> 3) x dst_rgb
532         // sr is 8 bits wide; dr is 5 bits wide and gets multiplied by the 5-bit scale ((256-sa)>>3),
533         // so shift sr left by 2 to line both up at 10 bits: (8 + 2 = 5 + 5)
534         wide_sr = vshlq_n_u16(vmovl_u8(vdup_n_u8(sr)), 2); // widen and src_red shift
535 
536         // sg is 8 bits wide; dg is 6 bits wide and gets multiplied by the 5-bit scale,
537         // so shift sg left by 3 to line both up at 11 bits: (8 + 3 = 6 + 5)
538         wide_sg = vshlq_n_u16(vmovl_u8(vdup_n_u8(sg)), 3); // widen and src_grn shift
539 
540         // sb is 8 bits wide; db is 5 bits wide and gets multiplied by the 5-bit scale,
541         // so shift sb left by 2 to line both up at 10 bits: (8 + 2 = 5 + 5)
542         wide_sb = vshlq_n_u16(vmovl_u8(vdup_n_u8(sb)), 2); // widen and src blu shift
543 
544         wide_256_sa =
545             vshrq_n_u16(vsubw_u8(vdupq_n_u16(256), vdup_n_u8(sa)), 3); // (256 - sa) >> 3
546 
547         while (count16-- > 0) {
548             uint16x8_t vdst1, vdst1_r, vdst1_g, vdst1_b;
549             uint16x8_t vdst2, vdst2_r, vdst2_g, vdst2_b;
550             vdst1 = vld1q_u16(dst);
551             dst += 8;
552             vdst2 = vld1q_u16(dst);
553             dst -= 8;    // rewind so the blended results are stored back below
554 
555             vdst1_g = vshlq_n_u16(vdst1, SK_R16_BITS);                 // shift green to top of lanes
556             vdst1_b = vdst1 & vmask_blue;                              // extract blue
557             vdst1_r = vshrq_n_u16(vdst1, SK_R16_SHIFT);                // extract red
558             vdst1_g = vshrq_n_u16(vdst1_g, SK_R16_BITS + SK_B16_BITS); // extract green
559 
560             vdst2_g = vshlq_n_u16(vdst2, SK_R16_BITS);                 // shift green to top of lanes
561             vdst2_b = vdst2 & vmask_blue;                              // extract blue
562             vdst2_r = vshrq_n_u16(vdst2, SK_R16_SHIFT);                // extract red
563             vdst2_g = vshrq_n_u16(vdst2_g, SK_R16_BITS + SK_B16_BITS); // extract green
564 
565             vdst1_r = vmlaq_u16(wide_sr, wide_256_sa, vdst1_r);        // sr + (256-sa) x dr1
566             vdst1_g = vmlaq_u16(wide_sg, wide_256_sa, vdst1_g);        // sg + (256-sa) x dg1
567             vdst1_b = vmlaq_u16(wide_sb, wide_256_sa, vdst1_b);        // sb + (256-sa) x db1
568 
569             vdst2_r = vmlaq_u16(wide_sr, wide_256_sa, vdst2_r);        // sr + (256-sa) x dr2
570             vdst2_g = vmlaq_u16(wide_sg, wide_256_sa, vdst2_g);        // sg + (256-sa) x dg2
571             vdst2_b = vmlaq_u16(wide_sb, wide_256_sa, vdst2_b);        // sb + (256-sa) x db2
572 
573             vdst1_r = vshrq_n_u16(vdst1_r, 5);                         // 5-bit right shift for 5-bit red
574             vdst1_g = vshrq_n_u16(vdst1_g, 5);                         // 5-bit right shift for 6-bit green
575             vdst1_b = vshrq_n_u16(vdst1_b, 5);                         // 5-bit right shift for 5-bit blue
576 
577             vdst1 = vsliq_n_u16(vdst1_b, vdst1_g, SK_G16_SHIFT);       // insert green into blue
578             vdst1 = vsliq_n_u16(vdst1, vdst1_r, SK_R16_SHIFT);         // insert red into green/blue
579 
580             vdst2_r = vshrq_n_u16(vdst2_r, 5);                         // 5-bit right shift for 5-bit red
581             vdst2_g = vshrq_n_u16(vdst2_g, 5);                         // 5-bit right shift for 6-bit green
582             vdst2_b = vshrq_n_u16(vdst2_b, 5);                         // 5-bit right shift for 5-bit blue
583 
584             vdst2 = vsliq_n_u16(vdst2_b, vdst2_g, SK_G16_SHIFT);       // insert green into blue
585             vdst2 = vsliq_n_u16(vdst2, vdst2_r, SK_R16_SHIFT);         // insert red into green/blue
586 
587             vst1q_u16(dst, vdst1);
588             dst += 8;
589             vst1q_u16(dst, vdst2);
590             dst += 8;
591         }
592     }
593 
594     count &= 0xF;
595     if (count > 0) {
596         do {
597             uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
598             *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
599             dst += 1;
600         } while (--count != 0);
601     }
602 }
603 
604 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
605     prod += vdupq_n_u16(128);
606     prod += vshrq_n_u16(prod, 8);
607     return vshrq_n_u16(prod, 8);
608 }
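// This is the usual exact rounding divide by 255 for 16-bit lanes:
//   ((x + 128) + ((x + 128) >> 8)) >> 8 == SkDiv255Round(x)  for x <= 255*255.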
609 
610 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
611                           const SkPMColor* SK_RESTRICT src, int count,
612                           U8CPU alpha, int /*x*/, int /*y*/) {
613    SkASSERT(255 > alpha);
614 
615     /* This code implements a Neon version of S32A_D565_Blend. The results have
616      * a few mismatches compared to the original code. These mismatches never
617      * exceed 1.
618      */
619 
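    // Per channel (in 565 precision) the loop below computes roughly
    //   dst = (src * alpha + dst * (255 - srcA * alpha / 255)) / 255,
    // where the final /255 becomes a rounding >> 8 in the default path; that
    // shortcut is the source of the off-by-one mismatches mentioned above.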
620     if (count >= 8) {
621         uint16x8_t valpha_max, vmask_blue;
622         uint8x8_t valpha;
623 
624         // prepare constants
625         valpha_max = vmovq_n_u16(255);
626         valpha = vdup_n_u8(alpha);
627         vmask_blue = vmovq_n_u16(SK_B16_MASK);
628 
629         do {
630             uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
631             uint16x8_t vres_a, vres_r, vres_g, vres_b;
632             uint8x8x4_t vsrc;
633 
634             // load pixels
635             vdst = vld1q_u16(dst);
636 #ifdef SK_CPU_ARM64
637             vsrc = sk_vld4_u8_arm64_4(src);
638 #elif (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
639             asm (
640                 "vld4.u8 %h[vsrc], [%[src]]!"
641                 : [vsrc] "=w" (vsrc), [src] "+&r" (src)
642                 : :
643             );
644 #else
645             register uint8x8_t d0 asm("d0");
646             register uint8x8_t d1 asm("d1");
647             register uint8x8_t d2 asm("d2");
648             register uint8x8_t d3 asm("d3");
649 
650             asm volatile (
651                 "vld4.u8    {d0-d3},[%[src]]!;"
652                 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
653                   [src] "+&r" (src)
654                 : :
655             );
656             vsrc.val[0] = d0;
657             vsrc.val[1] = d1;
658             vsrc.val[2] = d2;
659             vsrc.val[3] = d3;
660 #endif
661 
662 
663             // deinterleave dst
664             vdst_g = vshlq_n_u16(vdst, SK_R16_BITS);        // shift green to top of lanes
665             vdst_b = vdst & vmask_blue;                     // extract blue
666             vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT);       // extract red
667             vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green
668 
669             // shift src to 565
670             vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
671             vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
672             vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);
673 
674             // calc src * src_scale
675             vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
676             vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
677             vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
678             vres_b = vmull_u8(vsrc.val[NEON_B], valpha);
679 
680             // prepare dst_scale
681             vres_a = SkDiv255Round_neon8(vres_a);
682             vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255
683 
684             // add dst * dst_scale to previous result
685             vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
686             vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
687             vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);
688 
689 #ifdef S32A_D565_BLEND_EXACT
690             // It is possible to get exact results with this but it is slow,
691             // even slower than C code in some cases
692             vres_r = SkDiv255Round_neon8(vres_r);
693             vres_g = SkDiv255Round_neon8(vres_g);
694             vres_b = SkDiv255Round_neon8(vres_b);
695 #else
696             vres_r = vrshrq_n_u16(vres_r, 8);
697             vres_g = vrshrq_n_u16(vres_g, 8);
698             vres_b = vrshrq_n_u16(vres_b, 8);
699 #endif
700             // pack result
701             vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
702             vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue
703 
704             // store
705             vst1q_u16(dst, vres_b);
706             dst += 8;
707             count -= 8;
708         } while (count >= 8);
709     }
710 
711     // leftovers
712     while (count-- > 0) {
713         SkPMColor sc = *src++;
714         if (sc) {
715             uint16_t dc = *dst;
716             unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
717             unsigned dr = (SkPacked32ToR16(sc) * alpha) + (SkGetPackedR16(dc) * dst_scale);
718             unsigned dg = (SkPacked32ToG16(sc) * alpha) + (SkGetPackedG16(dc) * dst_scale);
719             unsigned db = (SkPacked32ToB16(sc) * alpha) + (SkGetPackedB16(dc) * dst_scale);
720             *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
721         }
722         dst += 1;
723     }
724 }
725 
726 /* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
727  * each dither value is spaced out into byte lanes, and repeated
728  * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
729  * start of each row.
730  */
731 static const uint8_t gDitherMatrix_Neon[48] = {
732     0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
733     6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
734     1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
735     7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
736 
737 };
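// Example: a span starting at x == 2 on row y == 1 loads 8 dither values from
// &gDitherMatrix_Neon[1*12 + 2], i.e. {7, 3, 6, 2, 7, 3, 6, 2}, with no
// wrap-around needed.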
738 
739 void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
740                                 int count, U8CPU alpha, int x, int y)
741 {
742 
743     SkASSERT(255 > alpha);
744 
745     // rescale alpha to range 1 - 256
746     int scale = SkAlpha255To256(alpha);
747 
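    // Per pixel: the source is dithered down to 565 (for red, roughly
    // (r + dither - (r >> 5)) >> 3), then blended with dst as
    // dst + (((src565 - dst565) * scale) >> 8), matching the
    // SkDITHER_*32To565 / SkAlphaBlend sequence in the leftover loop below.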
748     if (count >= 8) {
749         /* select row and offset for dither array */
750         const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
751 
752         uint8x8_t vdither = vld1_u8(dstart);         // load dither values
753         uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values
754 
755         int16x8_t vscale = vdupq_n_s16(scale);        // duplicate scale into neon reg
756         uint16x8_t vmask_b = vdupq_n_u16(0x1F);         // set up blue mask
757 
758         do {
759 
760             uint8x8x4_t vsrc;
761             uint8x8_t vsrc_r, vsrc_g, vsrc_b;
762             uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
763             uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
764             uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
765             uint16x8_t vdst;
766             uint16x8_t vdst_r, vdst_g, vdst_b;
767             int16x8_t vres_r, vres_g, vres_b;
768             int8x8_t vres8_r, vres8_g, vres8_b;
769 
770             // Load source and add dither
771 #ifdef SK_CPU_ARM64
772             vsrc = sk_vld4_u8_arm64_3(src);
773 #else
774             {
775             register uint8x8_t d0 asm("d0");
776             register uint8x8_t d1 asm("d1");
777             register uint8x8_t d2 asm("d2");
778             register uint8x8_t d3 asm("d3");
779 
780             asm (
781                 "vld4.8    {d0-d3},[%[src]]! "
782                 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
783                 :
784             );
785             vsrc.val[0] = d0;
786             vsrc.val[1] = d1;
787             vsrc.val[2] = d2;
788             }
789 #endif
790             vsrc_r = vsrc.val[NEON_R];
791             vsrc_g = vsrc.val[NEON_G];
792             vsrc_b = vsrc.val[NEON_B];
793 
794             vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
795             vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
796             vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5
797 
798             vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
799             vsrc_dit_r = vaddl_u8(vsrc_r, vdither);   // add in dither to red and widen
800             vsrc_dit_b = vaddl_u8(vsrc_b, vdither);   // add in dither to blue and widen
801 
802             vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r);  // sub shifted red from result
803             vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g);  // sub shifted green from result
804             vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b);  // sub shifted blue from result
805 
806             vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
807             vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
808             vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);
809 
810             // Load dst and unpack
811             vdst = vld1q_u16(dst);
812             vdst_g = vshrq_n_u16(vdst, 5);                   // shift down to get green
813             vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
814             vdst_b = vandq_u16(vdst, vmask_b);               // mask to get blue
815 
816             // subtract dst from src and widen
817             vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
818             vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
819             vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));
820 
821             // multiply diffs by scale and shift
822             vres_r = vmulq_s16(vres_r, vscale);
823             vres_g = vmulq_s16(vres_g, vscale);
824             vres_b = vmulq_s16(vres_b, vscale);
825 
826             vres8_r = vshrn_n_s16(vres_r, 8);
827             vres8_g = vshrn_n_s16(vres_g, 8);
828             vres8_b = vshrn_n_s16(vres_b, 8);
829 
830             // add dst to result
831             vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
832             vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
833             vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);
834 
835             // put result into 565 format
836             vres_b = vsliq_n_s16(vres_b, vres_g, 5);   // shift up green and insert into blue
837             vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue
838 
839             // Store result
840             vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));
841 
842             // Next iteration
843             dst += 8;
844             count -= 8;
845 
846         } while (count >= 8);
847     }
848 
849     // Leftovers
850     if (count > 0) {
851         int scale = SkAlpha255To256(alpha);
852         DITHER_565_SCAN(y);
853         do {
854             SkPMColor c = *src++;
855             SkPMColorAssert(c);
856 
857             int dither = DITHER_VALUE(x);
858             int sr = SkGetPackedR32(c);
859             int sg = SkGetPackedG32(c);
860             int sb = SkGetPackedB32(c);
861             sr = SkDITHER_R32To565(sr, dither);
862             sg = SkDITHER_G32To565(sg, dither);
863             sb = SkDITHER_B32To565(sb, dither);
864 
865             uint16_t d = *dst;
866             *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
867                                  SkAlphaBlend(sg, SkGetPackedG16(d), scale),
868                                  SkAlphaBlend(sb, SkGetPackedB16(d), scale));
869             DITHER_INC_X(x);
870         } while (--count != 0);
871     }
872 }
873 
874 /* Neon version of S32_Blend_BlitRow32()
875  * portable version is in src/core/SkBlitRow_D32.cpp
876  */
877 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
878                               const SkPMColor* SK_RESTRICT src,
879                               int count, U8CPU alpha) {
880     SkASSERT(alpha <= 255);
881 
882     if (count <= 0) {
883         return;
884     }
885 
886     uint16_t src_scale = SkAlpha255To256(alpha);
887     uint16_t dst_scale = 256 - src_scale;
888 
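    // Straight lerp over all four bytes of each pixel:
    //   result = (src * src_scale + dst * dst_scale) >> 8,
    // with src_scale in [1, 256] so that alpha == 255 returns src unchanged.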
889     while (count >= 2) {
890         uint8x8_t vsrc, vdst, vres;
891         uint16x8_t vsrc_wide, vdst_wide;
892 
893         /* These commented prefetches are a big win for count
894          * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
895          * They also hurt a little (<5%) on an A15
896          */
897         //__builtin_prefetch(src+32);
898         //__builtin_prefetch(dst+32);
899 
900         // Load
901         vsrc = vreinterpret_u8_u32(vld1_u32(src));
902         vdst = vreinterpret_u8_u32(vld1_u32(dst));
903 
904         // Process src
905         vsrc_wide = vmovl_u8(vsrc);
906         vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
907 
908         // Process dst
909         vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
910 
911         // Combine
912 #ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
913         vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
914 #else
915         vdst_wide += vsrc_wide;
916         vres = vshrn_n_u16(vdst_wide, 8);
917 #endif
918 
919         // Store
920         vst1_u32(dst, vreinterpret_u32_u8(vres));
921 
922         src += 2;
923         dst += 2;
924         count -= 2;
925     }
926 
927     if (count == 1) {
928         uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
929         uint16x8_t vsrc_wide, vdst_wide;
930 
931         // Load
932         vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
933         vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
934 
935         // Process
936         vsrc_wide = vmovl_u8(vsrc);
937         vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
938         vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
939 #ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
940         vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
941 #else
942         vdst_wide += vsrc_wide;
943         vres = vshrn_n_u16(vdst_wide, 8);
944 #endif
945 
946         // Store
947         vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
948     }
949 }
950 
951 #ifdef SK_CPU_ARM32
952 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
953                          const SkPMColor* SK_RESTRICT src,
954                          int count, U8CPU alpha) {
955 
956     SkASSERT(255 > alpha);
957 
958     if (count <= 0) {
959         return;
960     }
961 
962     unsigned alpha256 = SkAlpha255To256(alpha);
963 
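    // Per pixel the blend is
    //   result = (src * alpha256 + dst * dst_scale) >> 8,
    // where dst_scale = SkAlphaMulInv256(srcA, alpha256), roughly
    // 256 - srcA * alpha256 / 255.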
964     // First deal with odd counts
965     if (count & 1) {
966         uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
967         uint16x8_t vdst_wide, vsrc_wide;
968         unsigned dst_scale;
969 
970         // Load
971         vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
972         vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
973 
974         // Calc dst_scale
975         dst_scale = vget_lane_u8(vsrc, 3);
976         dst_scale = SkAlphaMulInv256(dst_scale, alpha256);
977 
978         // Process src
979         vsrc_wide = vmovl_u8(vsrc);
980         vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
981 
982         // Process dst
983         vdst_wide = vmovl_u8(vdst);
984         vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
985 
986         // Combine
987 #ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
988         vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
989 #else
990         vdst_wide += vsrc_wide;
991         vres = vshrn_n_u16(vdst_wide, 8);
992 #endif
993 
994         vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
995         dst++;
996         src++;
997         count--;
998     }
999 
1000     if (count) {
1001         uint8x8_t alpha_mask;
1002         static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
1003         alpha_mask = vld1_u8(alpha_mask_setup);
1004 
1005         do {
1006 
1007             uint8x8_t vsrc, vdst, vres, vsrc_alphas;
1008             uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;
1009 
1010             __builtin_prefetch(src+32);
1011             __builtin_prefetch(dst+32);
1012 
1013             // Load
1014             vsrc = vreinterpret_u8_u32(vld1_u32(src));
1015             vdst = vreinterpret_u8_u32(vld1_u32(dst));
1016 
1017             // Prepare src_scale
1018             vsrc_scale = vdupq_n_u16(alpha256);
1019 
1020             // Calc dst_scale
1021             vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
1022             vdst_scale = vmovl_u8(vsrc_alphas);
1023 #ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
1024             vdst_scale *= vsrc_scale;
1025             vdst_scale = vshrq_n_u16(vdst_scale, 8);
1026             vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);
1027 #else
1028             // Calculate SkAlphaMulInv256(vdst_scale, vsrc_scale).
1029             // A 16-bit lane would overflow if we used 0xFFFF here,
1030             // so use an approximation with 0xFF00 that is off by 1,
1031             // and add back 1 after to get the correct value.
1032             // This is valid if alpha256 <= 255.
1033             vdst_scale = vmlsq_u16(vdupq_n_u16(0xFF00), vdst_scale, vsrc_scale);
1034             vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, 8);
1035             vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8);
1036 #endif
1037 
1038             // Process src
1039             vsrc_wide = vmovl_u8(vsrc);
1040             vsrc_wide *= vsrc_scale;
1041 
1042             // Process dst
1043             vdst_wide = vmovl_u8(vdst);
1044             vdst_wide *= vdst_scale;
1045 
1046             // Combine
1047 #ifdef SK_SUPPORT_LEGACY_BROKEN_LERP
1048             vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
1049 #else
1050             vdst_wide += vsrc_wide;
1051             vres = vshrn_n_u16(vdst_wide, 8);
1052 #endif
1053 
1054             vst1_u32(dst, vreinterpret_u32_u8(vres));
1055 
1056             src += 2;
1057             dst += 2;
1058             count -= 2;
1059         } while(count);
1060     }
1061 }
1062 
1063 ///////////////////////////////////////////////////////////////////////////////
1064 
1065 #endif // #ifdef SK_CPU_ARM32
1066 
1067 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
1068                                    const SkPMColor* SK_RESTRICT src,
1069                                    int count, U8CPU alpha, int x, int y) {
1070     SkASSERT(255 == alpha);
1071 
1072 #define    UNROLL    8
1073 
1074     if (count >= UNROLL) {
1075 
1076     uint8x8_t dbase;
1077     const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1078     dbase = vld1_u8(dstart);
1079 
1080         do {
1081         uint8x8x4_t vsrc;
1082         uint8x8_t sr, sg, sb, sa, d;
1083         uint16x8_t dst8, scale8, alpha8;
1084         uint16x8_t dst_r, dst_g, dst_b;
1085 
1086 #ifdef SK_CPU_ARM64
1087         vsrc = sk_vld4_u8_arm64_4(src);
1088 #else
1089         {
1090         register uint8x8_t d0 asm("d0");
1091         register uint8x8_t d1 asm("d1");
1092         register uint8x8_t d2 asm("d2");
1093         register uint8x8_t d3 asm("d3");
1094 
1095         asm ("vld4.8    {d0-d3},[%[src]]! "
1096             : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
1097             :
1098         );
1099         vsrc.val[0] = d0;
1100         vsrc.val[1] = d1;
1101         vsrc.val[2] = d2;
1102         vsrc.val[3] = d3;
1103         }
1104 #endif
1105         sa = vsrc.val[NEON_A];
1106         sr = vsrc.val[NEON_R];
1107         sg = vsrc.val[NEON_G];
1108         sb = vsrc.val[NEON_B];
1109 
1110         /* calculate 'd', which will be 0..7
1111          * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
1112          */
1113         alpha8 = vmovl_u8(dbase);
1114         alpha8 = vmlal_u8(alpha8, sa, dbase);
1115         d = vshrn_n_u16(alpha8, 8);    // narrowing too
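        // i.e. d = (dither * (alpha + 1)) >> 8, matching the scalar
        // SkAlphaMul(dither, SkAlpha255To256(a)) in the leftover loop below.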
1116 
1117         // sr = sr - (sr>>5) + d
1118         /* watching for 8-bit overflow.  d is 0..7; risky range of
1119          * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
1120          * safe  as long as we do ((sr-sr>>5) + d)
1121          */
1122         sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1123         sr = vadd_u8(sr, d);
1124 
1125         // sb = sb - (sb>>5) + d
1126         sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1127         sb = vadd_u8(sb, d);
1128 
1129         // sg = sg - (sg>>6) + d>>1; similar logic for overflows
1130         sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1131         sg = vadd_u8(sg, vshr_n_u8(d,1));
1132 
1133         // need to pick up 8 dst's -- at 16 bits each, 128 bits
1134         dst8 = vld1q_u16(dst);
1135         dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
1136         dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
1137         dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT);    // clearing hi bits
1138 
1139         // blend
1140         scale8 = vsubw_u8(vdupq_n_u16(256), sa);
1141 
1142         // combine the addq and mul, save 3 insns
1143         scale8 = vshrq_n_u16(scale8, 3);
1144         dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
1145         dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
1146         dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
1147 
1148         // repack to store
1149         dst8 = vshrq_n_u16(dst_b, 5);
1150         dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
1151         dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
1152 
1153         vst1q_u16(dst, dst8);
1154 
1155         dst += UNROLL;
1156         count -= UNROLL;
1157         // skip x += UNROLL, since it's unchanged mod-4
1158         } while (count >= UNROLL);
1159     }
1160 #undef    UNROLL
1161 
1162     // residuals
1163     if (count > 0) {
1164         DITHER_565_SCAN(y);
1165         do {
1166             SkPMColor c = *src++;
1167             SkPMColorAssert(c);
1168             if (c) {
1169                 unsigned a = SkGetPackedA32(c);
1170 
1171                 // dither and alpha are just temporary variables to work around
1172                 // an ICE in debug.
1173                 unsigned dither = DITHER_VALUE(x);
1174                 unsigned alpha = SkAlpha255To256(a);
1175                 int d = SkAlphaMul(dither, alpha);
1176 
1177                 unsigned sr = SkGetPackedR32(c);
1178                 unsigned sg = SkGetPackedG32(c);
1179                 unsigned sb = SkGetPackedB32(c);
1180                 sr = SkDITHER_R32_FOR_565(sr, d);
1181                 sg = SkDITHER_G32_FOR_565(sg, d);
1182                 sb = SkDITHER_B32_FOR_565(sb, d);
1183 
1184                 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1185                 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1186                 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1187                 // now src and dst expanded are in g:11 r:10 x:1 b:10
1188                 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1189             }
1190             dst += 1;
1191             DITHER_INC_X(x);
1192         } while (--count != 0);
1193     }
1194 }
1195 
1196 ///////////////////////////////////////////////////////////////////////////////
1197 
1198 void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
1199                                  const SkPMColor* SK_RESTRICT src,
1200                                  int count, U8CPU alpha, int x, int y) {
1201     SkASSERT(255 == alpha);
1202 
1203 #define    UNROLL    8
1204     if (count >= UNROLL) {
1205     uint8x8_t d;
1206     const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1207     d = vld1_u8(dstart);
1208 
1209     while (count >= UNROLL) {
1210         uint8x8_t sr, sg, sb;
1211         uint16x8_t dr, dg, db;
1212         uint16x8_t dst8;
1213         uint8x8x4_t vsrc;
1214 
1215 #ifdef SK_CPU_ARM64
1216         vsrc = sk_vld4_u8_arm64_3(src);
1217 #else
1218         {
1219         register uint8x8_t d0 asm("d0");
1220         register uint8x8_t d1 asm("d1");
1221         register uint8x8_t d2 asm("d2");
1222         register uint8x8_t d3 asm("d3");
1223 
1224         asm (
1225             "vld4.8    {d0-d3},[%[src]]! "
1226             : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
1227             :
1228         );
1229         vsrc.val[0] = d0;
1230         vsrc.val[1] = d1;
1231         vsrc.val[2] = d2;
1232         }
1233 #endif
1234         sr = vsrc.val[NEON_R];
1235         sg = vsrc.val[NEON_G];
1236         sb = vsrc.val[NEON_B];
1237 
1238         /* XXX: if we want to prefetch, hide it in the above asm()
1239          * using the gcc __builtin_prefetch(), the prefetch will
1240          * fall to the bottom of the loop -- it won't stick up
1241          * at the top of the loop, just after the vld4.
1242          */
1243 
1244         // sr = sr - (sr>>5) + d
1245         sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1246         dr = vaddl_u8(sr, d);
1247 
1248         // sb = sb - (sb>>5) + d
1249         sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1250         db = vaddl_u8(sb, d);
1251 
1252         // sg = sg - (sg>>6) + d>>1; similar logic for overflows
1253         sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1254         dg = vaddl_u8(sg, vshr_n_u8(d, 1));
1255 
1256         // pack high bits of each into 565 format  (rgb, b is lsb)
1257         dst8 = vshrq_n_u16(db, 3);
1258         dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
1259         dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);
1260 
1261         // store it
1262         vst1q_u16(dst, dst8);
1263 
1264         dst += UNROLL;
1265         // we don't need to increment src as the asm above has already done it
1266         count -= UNROLL;
1267         x += UNROLL;        // probably superfluous
1268     }
1269     }
1270 #undef    UNROLL
1271 
1272     // residuals
1273     if (count > 0) {
1274         DITHER_565_SCAN(y);
1275         do {
1276             SkPMColor c = *src++;
1277             SkPMColorAssert(c);
1278             SkASSERT(SkGetPackedA32(c) == 255);
1279 
1280             unsigned dither = DITHER_VALUE(x);
1281             *dst++ = SkDitherRGB32To565(c, dither);
1282             DITHER_INC_X(x);
1283         } while (--count != 0);
1284     }
1285 }
1286 
1287 ///////////////////////////////////////////////////////////////////////////////
1288 
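// The tables below are handed back to SkBlitRow for NEON-capable devices;
// a nullptr slot means there is no NEON version and the caller falls back to
// the portable (or SkOpts) implementation for that case.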
1289 const SkBlitRow::Proc16 sk_blitrow_platform_565_procs_arm_neon[] = {
1290     // no dither
1291     S32_D565_Opaque_neon,
1292     S32_D565_Blend_neon,
1293     S32A_D565_Opaque_neon,
1294 #if 0
1295     S32A_D565_Blend_neon,
1296 #else
1297     nullptr,   // https://code.google.com/p/skia/issues/detail?id=2797
1298 #endif
1299 
1300     // dither
1301     S32_D565_Opaque_Dither_neon,
1302     S32_D565_Blend_Dither_neon,
1303     S32A_D565_Opaque_Dither_neon,
1304     nullptr,   // S32A_D565_Blend_Dither
1305 };
1306 
1307 const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = {
1308     Color32A_D565_neon,    // Color32_D565,
1309     Color32A_D565_neon,    // Color32A_D565,
1310     Color32A_D565_neon,    // Color32_D565_Dither,
1311     Color32A_D565_neon,    // Color32A_D565_Dither
1312 };
1313 
1314 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
1315     nullptr,   // S32_Opaque,
1316     S32_Blend_BlitRow32_neon,        // S32_Blend,
1317     nullptr,  // Ported to SkOpts
1318 #ifdef SK_CPU_ARM32
1319     S32A_Blend_BlitRow32_neon        // S32A_Blend
1320 #else
1321     nullptr
1322 #endif
1323 };
1324