1 /****************************************************************************
2 **
3 ** Copyright (C) 2016 The Qt Company Ltd.
4 ** Contact: https://www.qt.io/licensing/
5 **
6 ** This file is part of the QtGui module of the Qt Toolkit.
7 **
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** Commercial License Usage
10 ** Licensees holding valid commercial Qt licenses may use this file in
11 ** accordance with the commercial license agreement provided with the
12 ** Software or, alternatively, in accordance with the terms contained in
13 ** a written agreement between you and The Qt Company. For licensing terms
14 ** and conditions see https://www.qt.io/terms-conditions. For further
15 ** information use the contact form at https://www.qt.io/contact-us.
16 **
17 ** GNU Lesser General Public License Usage
18 ** Alternatively, this file may be used under the terms of the GNU Lesser
19 ** General Public License version 3 as published by the Free Software
20 ** Foundation and appearing in the file LICENSE.LGPL3 included in the
21 ** packaging of this file. Please review the following information to
22 ** ensure the GNU Lesser General Public License version 3 requirements
23 ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
24 **
25 ** GNU General Public License Usage
26 ** Alternatively, this file may be used under the terms of the GNU
27 ** General Public License version 2.0 or (at your option) the GNU General
28 ** Public license version 3 or any later version approved by the KDE Free
29 ** Qt Foundation. The licenses are as published by the Free Software
30 ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
31 ** included in the packaging of this file. Please review the following
32 ** information to ensure the GNU General Public License requirements will
33 ** be met: https://www.gnu.org/licenses/gpl-2.0.html and
34 ** https://www.gnu.org/licenses/gpl-3.0.html.
35 **
36 ** $QT_END_LICENSE$
37 **
38 ****************************************************************************/
39 
40 #include <private/qdrawhelper_neon_p.h>
41 #include <private/qblendfunctions_p.h>
42 #include <private/qmath_p.h>
43 
44 #ifdef __ARM_NEON__
45 
46 #include <private/qpaintengine_raster_p.h>
47 
48 QT_BEGIN_NAMESPACE
49 
// Fills 'count' 32-bit pixels at 'dest' with 'value'.
// The bulk is stored 16 pixels per iteration with NEON; the remaining
// 0-15 pixels are written by the fall-through switch at the end.
void qt_memfill32(quint32 *dest, quint32 value, qsizetype count)
{
    // Number of trailing pixels that do not fill a 16-pixel NEON store.
    const int epilogueSize = count % 16;
#if defined(Q_CC_GHS) || defined(Q_CC_MSVC)
    // inline assembler free version:
    if (count >= 16) {
        quint32 *const neonEnd = dest + count - epilogueSize;
        const uint32x4_t valueVector1 = vdupq_n_u32(value);
        const uint32x4x4_t valueVector4 = { valueVector1, valueVector1, valueVector1, valueVector1 };
        do {
            vst4q_u32(dest, valueVector4);
            dest += 16;
        } while (dest != neonEnd);
    }
#elif !defined(Q_PROCESSOR_ARM_64)
    // 32-bit ARM: pin the splatted value to q0/q1 so the hand-written
    // vst2.32 instructions below can name d0-d3 directly. Interleaving is
    // harmless here because both vectors hold the same value.
    if (count >= 16) {
        quint32 *const neonEnd = dest + count - epilogueSize;
        register uint32x4_t valueVector1 asm ("q0") = vdupq_n_u32(value);
        register uint32x4_t valueVector2 asm ("q1") = valueVector1;
        while (dest != neonEnd) {
            // Two post-incrementing 32-byte stores = 16 pixels per loop.
            asm volatile (
                "vst2.32     { d0, d1, d2, d3 }, [%[DST]] !\n\t"
                "vst2.32     { d0, d1, d2, d3 }, [%[DST]] !\n\t"
                : [DST]"+r" (dest)
                : [VALUE1]"w"(valueVector1), [VALUE2]"w"(valueVector2)
                : "memory"
            );
        }
    }
#else
    // AArch64 variant of the same 16-pixels-per-iteration store loop,
    // with the value pinned to v0/v1 for the st2 instructions.
    if (count >= 16) {
        quint32 *const neonEnd = dest + count - epilogueSize;
        register uint32x4_t valueVector1 asm ("v0") = vdupq_n_u32(value);
        register uint32x4_t valueVector2 asm ("v1") = valueVector1;
        while (dest != neonEnd) {
            asm volatile (
                "st2     { v0.4s, v1.4s }, [%[DST]], #32 \n\t"
                "st2     { v0.4s, v1.4s }, [%[DST]], #32 \n\t"
                : [DST]"+r" (dest)
                : [VALUE1]"w"(valueVector1), [VALUE2]"w"(valueVector2)
                : "memory"
            );
        }
    }
#endif

    // Duff's-device style epilogue: write the remaining 0-15 pixels.
    switch (epilogueSize)
    {
    case 15:     *dest++ = value; Q_FALLTHROUGH();
    case 14:     *dest++ = value; Q_FALLTHROUGH();
    case 13:     *dest++ = value; Q_FALLTHROUGH();
    case 12:     *dest++ = value; Q_FALLTHROUGH();
    case 11:     *dest++ = value; Q_FALLTHROUGH();
    case 10:     *dest++ = value; Q_FALLTHROUGH();
    case 9:      *dest++ = value; Q_FALLTHROUGH();
    case 8:      *dest++ = value; Q_FALLTHROUGH();
    case 7:      *dest++ = value; Q_FALLTHROUGH();
    case 6:      *dest++ = value; Q_FALLTHROUGH();
    case 5:      *dest++ = value; Q_FALLTHROUGH();
    case 4:      *dest++ = value; Q_FALLTHROUGH();
    case 3:      *dest++ = value; Q_FALLTHROUGH();
    case 2:      *dest++ = value; Q_FALLTHROUGH();
    case 1:      *dest++ = value;
    }
}
115 
qvdiv_255_u16(uint16x8_t x,uint16x8_t half)116 static inline uint16x8_t qvdiv_255_u16(uint16x8_t x, uint16x8_t half)
117 {
118     // result = (x + (x >> 8) + 0x80) >> 8
119 
120     const uint16x8_t temp = vshrq_n_u16(x, 8); // x >> 8
121     const uint16x8_t sum_part = vaddq_u16(x, half); // x + 0x80
122     const uint16x8_t sum = vaddq_u16(temp, sum_part);
123 
124     return vshrq_n_u16(sum, 8);
125 }
126 
qvbyte_mul_u16(uint16x8_t x,uint16x8_t alpha,uint16x8_t half)127 static inline uint16x8_t qvbyte_mul_u16(uint16x8_t x, uint16x8_t alpha, uint16x8_t half)
128 {
129     // t = qRound(x * alpha / 255.0)
130 
131     const uint16x8_t t = vmulq_u16(x, alpha); // t
132     return qvdiv_255_u16(t, half);
133 }
134 
qvinterpolate_pixel_255(uint16x8_t x,uint16x8_t a,uint16x8_t y,uint16x8_t b,uint16x8_t half)135 static inline uint16x8_t qvinterpolate_pixel_255(uint16x8_t x, uint16x8_t a, uint16x8_t y, uint16x8_t b, uint16x8_t half)
136 {
137     // t = x * a + y * b
138 
139     const uint16x8_t ta = vmulq_u16(x, a);
140     const uint16x8_t tb = vmulq_u16(y, b);
141 
142     return qvdiv_255_u16(vaddq_u16(ta, tb), half);
143 }
144 
qvsource_over_u16(uint16x8_t src16,uint16x8_t dst16,uint16x8_t half,uint16x8_t full)145 static inline uint16x8_t qvsource_over_u16(uint16x8_t src16, uint16x8_t dst16, uint16x8_t half, uint16x8_t full)
146 {
147     const uint16x4_t alpha16_high = vdup_lane_u16(vget_high_u16(src16), 3);
148     const uint16x4_t alpha16_low = vdup_lane_u16(vget_low_u16(src16), 3);
149 
150     const uint16x8_t alpha16 = vsubq_u16(full, vcombine_u16(alpha16_low, alpha16_high));
151 
152     return vaddq_u16(src16, qvbyte_mul_u16(dst16, alpha16, half));
153 }
154 
155 #if defined(ENABLE_PIXMAN_DRAWHELPERS)
156 extern "C" void
157 pixman_composite_over_8888_0565_asm_neon (int32_t   w,
158                                           int32_t   h,
159                                           uint16_t *dst,
160                                           int32_t   dst_stride,
161                                           uint32_t *src,
162                                           int32_t   src_stride);
163 
164 extern "C" void
165 pixman_composite_over_8888_8888_asm_neon (int32_t   w,
166                                           int32_t   h,
167                                           uint32_t *dst,
168                                           int32_t   dst_stride,
169                                           uint32_t *src,
170                                           int32_t   src_stride);
171 
172 extern "C" void
173 pixman_composite_src_0565_8888_asm_neon (int32_t   w,
174                                          int32_t   h,
175                                          uint32_t *dst,
176                                          int32_t   dst_stride,
177                                          uint16_t *src,
178                                          int32_t   src_stride);
179 
180 extern "C" void
181 pixman_composite_over_n_8_0565_asm_neon (int32_t    w,
182                                          int32_t    h,
183                                          uint16_t  *dst,
184                                          int32_t    dst_stride,
185                                          uint32_t   src,
186                                          int32_t    unused,
187                                          uint8_t   *mask,
188                                          int32_t    mask_stride);
189 
190 extern "C" void
191 pixman_composite_scanline_over_asm_neon (int32_t         w,
192                                          const uint32_t *dst,
193                                          const uint32_t *src);
194 
195 extern "C" void
196 pixman_composite_src_0565_0565_asm_neon (int32_t   w,
197                                          int32_t   h,
198                                          uint16_t *dst,
199                                          int32_t   dst_stride,
200                                          uint16_t *src,
201                                          int32_t   src_stride);
202 // qblendfunctions.cpp
203 void qt_blend_argb32_on_rgb16_const_alpha(uchar *destPixels, int dbpl,
204                                           const uchar *srcPixels, int sbpl,
205                                           int w, int h,
206                                           int const_alpha);
207 
qt_blend_rgb16_on_argb32_neon(uchar * destPixels,int dbpl,const uchar * srcPixels,int sbpl,int w,int h,int const_alpha)208 void qt_blend_rgb16_on_argb32_neon(uchar *destPixels, int dbpl,
209                                    const uchar *srcPixels, int sbpl,
210                                    int w, int h,
211                                    int const_alpha)
212 {
213     dbpl /= 4;
214     sbpl /= 2;
215 
216     quint32 *dst = (quint32 *) destPixels;
217     quint16 *src = (quint16 *) srcPixels;
218 
219     if (const_alpha != 256) {
220         quint8 a = (255 * const_alpha) >> 8;
221         quint8 ia = 255 - a;
222 
223         while (h--) {
224             for (int x=0; x<w; ++x)
225                 dst[x] = INTERPOLATE_PIXEL_255(qConvertRgb16To32(src[x]), a, dst[x], ia);
226             dst += dbpl;
227             src += sbpl;
228         }
229         return;
230     }
231 
232     pixman_composite_src_0565_8888_asm_neon(w, h, dst, dbpl, src, sbpl);
233 }
234 
235 // qblendfunctions.cpp
236 void qt_blend_rgb16_on_rgb16(uchar *dst, int dbpl,
237                              const uchar *src, int sbpl,
238                              int w, int h,
239                              int const_alpha);
240 
241 
// Copies N 16-bit pixels from src to dst using 32-bit stores.
// dst must be 4-byte aligned (blockBlit16 peels a leading pixel to
// guarantee this). dstride is in pixels and is only used for prefetching.
template <int N>
static inline void scanLineBlit16(quint16 *dst, quint16 *src, int dstride)
{
    if (N >= 2) {
        ((quint32 *)dst)[0] = ((quint32 *)src)[0];
        // Prefetch the next scanline's destination for writing.
        __builtin_prefetch(dst + dstride, 1, 0);
    }
    // Remaining full pixel pairs.
    for (int i = 1; i < N/2; ++i)
        ((quint32 *)dst)[i] = ((quint32 *)src)[i];
    // Odd trailing pixel, if any.
    if (N & 1)
        dst[N-1] = src[N-1];
}
254 
255 template <int Width>
blockBlit16(quint16 * dst,quint16 * src,int dstride,int sstride,int h)256 static inline void blockBlit16(quint16 *dst, quint16 *src, int dstride, int sstride, int h)
257 {
258     union {
259         quintptr address;
260         quint16 *pointer;
261     } u;
262 
263     u.pointer = dst;
264 
265     if (u.address & 2) {
266         while (h--) {
267             // align dst
268             dst[0] = src[0];
269             if (Width > 1)
270                 scanLineBlit16<Width-1>(dst + 1, src + 1, dstride);
271             dst += dstride;
272             src += sstride;
273         }
274     } else {
275         while (h--) {
276             scanLineBlit16<Width>(dst, src, dstride);
277 
278             dst += dstride;
279             src += sstride;
280         }
281     }
282 }
283 
qt_blend_rgb16_on_rgb16_neon(uchar * destPixels,int dbpl,const uchar * srcPixels,int sbpl,int w,int h,int const_alpha)284 void qt_blend_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl,
285                                   const uchar *srcPixels, int sbpl,
286                                   int w, int h,
287                                   int const_alpha)
288 {
289     // testing show that the default memcpy is faster for widths 150 and up
290     if (const_alpha != 256 || w >= 150) {
291         qt_blend_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha);
292         return;
293     }
294 
295     int dstride = dbpl / 2;
296     int sstride = sbpl / 2;
297 
298     quint16 *dst = (quint16 *) destPixels;
299     quint16 *src = (quint16 *) srcPixels;
300 
301     switch (w) {
302 #define BLOCKBLIT(n) case n: blockBlit16<n>(dst, src, dstride, sstride, h); return;
303     BLOCKBLIT(1);
304     BLOCKBLIT(2);
305     BLOCKBLIT(3);
306     BLOCKBLIT(4);
307     BLOCKBLIT(5);
308     BLOCKBLIT(6);
309     BLOCKBLIT(7);
310     BLOCKBLIT(8);
311     BLOCKBLIT(9);
312     BLOCKBLIT(10);
313     BLOCKBLIT(11);
314     BLOCKBLIT(12);
315     BLOCKBLIT(13);
316     BLOCKBLIT(14);
317     BLOCKBLIT(15);
318 #undef BLOCKBLIT
319     default:
320         break;
321     }
322 
323     pixman_composite_src_0565_0565_asm_neon (w, h, dst, dstride, src, sstride);
324 }
325 
326 extern "C" void blend_8_pixels_argb32_on_rgb16_neon(quint16 *dst, const quint32 *src, int const_alpha);
327 
qt_blend_argb32_on_rgb16_neon(uchar * destPixels,int dbpl,const uchar * srcPixels,int sbpl,int w,int h,int const_alpha)328 void qt_blend_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
329                                    const uchar *srcPixels, int sbpl,
330                                    int w, int h,
331                                    int const_alpha)
332 {
333     quint16 *dst = (quint16 *) destPixels;
334     quint32 *src = (quint32 *) srcPixels;
335 
336     if (const_alpha != 256) {
337         for (int y=0; y<h; ++y) {
338             int i = 0;
339             for (; i < w-7; i += 8)
340                 blend_8_pixels_argb32_on_rgb16_neon(&dst[i], &src[i], const_alpha);
341 
342             if (i < w) {
343                 int tail = w - i;
344 
345                 quint16 dstBuffer[8];
346                 quint32 srcBuffer[8];
347 
348                 for (int j = 0; j < tail; ++j) {
349                     dstBuffer[j] = dst[i + j];
350                     srcBuffer[j] = src[i + j];
351                 }
352 
353                 blend_8_pixels_argb32_on_rgb16_neon(dstBuffer, srcBuffer, const_alpha);
354 
355                 for (int j = 0; j < tail; ++j)
356                     dst[i + j] = dstBuffer[j];
357             }
358 
359             dst = (quint16 *)(((uchar *) dst) + dbpl);
360             src = (quint32 *)(((uchar *) src) + sbpl);
361         }
362         return;
363     }
364 
365     pixman_composite_over_8888_0565_asm_neon(w, h, dst, dbpl / 2, src, sbpl / 4);
366 }
367 #endif
368 
qt_blend_argb32_on_argb32_scanline_neon(uint * dest,const uint * src,int length,uint const_alpha)369 void qt_blend_argb32_on_argb32_scanline_neon(uint *dest, const uint *src, int length, uint const_alpha)
370 {
371     if (const_alpha == 255) {
372 #if defined(ENABLE_PIXMAN_DRAWHELPERS)
373         pixman_composite_scanline_over_asm_neon(length, dest, src);
374 #else
375         qt_blend_argb32_on_argb32_neon((uchar *)dest, 4 * length, (uchar *)src, 4 * length, length, 1, 256);
376 #endif
377     } else {
378         qt_blend_argb32_on_argb32_neon((uchar *)dest, 4 * length, (uchar *)src, 4 * length, length, 1, (const_alpha * 256) / 255);
379     }
380 }
381 
// SourceOver of an ARGB32 image onto an ARGB32 destination, 4 pixels at
// a time with NEON, with a scalar epilogue for the last 0-3 pixels of
// each scanline. const_alpha uses the [0, 256] convention.
void qt_blend_argb32_on_argb32_neon(uchar *destPixels, int dbpl,
                                    const uchar *srcPixels, int sbpl,
                                    int w, int h,
                                    int const_alpha)
{
    const uint *src = (const uint *) srcPixels;
    uint *dst = (uint *) destPixels;
    uint16x8_t half = vdupq_n_u16(0x80);   // rounding bias for /255
    uint16x8_t full = vdupq_n_u16(0xff);   // for (255 - alpha)
    if (const_alpha == 256) {
#if defined(ENABLE_PIXMAN_DRAWHELPERS)
        pixman_composite_over_8888_8888_asm_neon(w, h, (uint32_t *)destPixels, dbpl / 4, (uint32_t *)srcPixels, sbpl / 4);
#else
        for (int y=0; y<h; ++y) {
            int x = 0;
            for (; x < w-3; x += 4) {
                // Skip groups of four fully transparent source pixels.
                if (src[x] | src[x+1] | src[x+2] | src[x+3]) {
                    uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
                    uint32x4_t dst32 = vld1q_u32((uint32_t *)&dst[x]);

                    // Widen the 16 bytes to two 8x16-bit vectors.
                    const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
                    const uint8x16_t dst8 = vreinterpretq_u8_u32(dst32);

                    const uint8x8_t src8_low = vget_low_u8(src8);
                    const uint8x8_t dst8_low = vget_low_u8(dst8);

                    const uint8x8_t src8_high = vget_high_u8(src8);
                    const uint8x8_t dst8_high = vget_high_u8(dst8);

                    const uint16x8_t src16_low = vmovl_u8(src8_low);
                    const uint16x8_t dst16_low = vmovl_u8(dst8_low);

                    const uint16x8_t src16_high = vmovl_u8(src8_high);
                    const uint16x8_t dst16_high = vmovl_u8(dst8_high);

                    // SourceOver each half, then narrow back to bytes.
                    const uint16x8_t result16_low = qvsource_over_u16(src16_low, dst16_low, half, full);
                    const uint16x8_t result16_high = qvsource_over_u16(src16_high, dst16_high, half, full);

                    const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
                    const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));

                    vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
                }
            }
            // Scalar epilogue for the last w % 4 pixels.
            for (; x<w; ++x) {
                uint s = src[x];
                if (s >= 0xff000000)           // opaque: plain copy
                    dst[x] = s;
                else if (s != 0)               // translucent: SourceOver
                    dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
            }
            dst = (quint32 *)(((uchar *) dst) + dbpl);
            src = (const quint32 *)(((const uchar *) src) + sbpl);
        }
#endif
    } else if (const_alpha != 0) {
        // Constant alpha: first scale the source by const_alpha, then
        // SourceOver as above.
        const_alpha = (const_alpha * 255) >> 8;
        uint16x8_t const_alpha16 = vdupq_n_u16(const_alpha);
        for (int y = 0; y < h; ++y) {
            int x = 0;
            for (; x < w-3; x += 4) {
                if (src[x] | src[x+1] | src[x+2] | src[x+3]) {
                    uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
                    uint32x4_t dst32 = vld1q_u32((uint32_t *)&dst[x]);

                    const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
                    const uint8x16_t dst8 = vreinterpretq_u8_u32(dst32);

                    const uint8x8_t src8_low = vget_low_u8(src8);
                    const uint8x8_t dst8_low = vget_low_u8(dst8);

                    const uint8x8_t src8_high = vget_high_u8(src8);
                    const uint8x8_t dst8_high = vget_high_u8(dst8);

                    const uint16x8_t src16_low = vmovl_u8(src8_low);
                    const uint16x8_t dst16_low = vmovl_u8(dst8_low);

                    const uint16x8_t src16_high = vmovl_u8(src8_high);
                    const uint16x8_t dst16_high = vmovl_u8(dst8_high);

                    // Pre-multiply source by the constant alpha.
                    const uint16x8_t srcalpha16_low = qvbyte_mul_u16(src16_low, const_alpha16, half);
                    const uint16x8_t srcalpha16_high = qvbyte_mul_u16(src16_high, const_alpha16, half);

                    const uint16x8_t result16_low = qvsource_over_u16(srcalpha16_low, dst16_low, half, full);
                    const uint16x8_t result16_high = qvsource_over_u16(srcalpha16_high, dst16_high, half, full);

                    const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
                    const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));

                    vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
                }
            }
            // Scalar epilogue with the same pre-multiply.
            for (; x<w; ++x) {
                uint s = src[x];
                if (s != 0) {
                    s = BYTE_MUL(s, const_alpha);
                    dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
                }
            }
            dst = (quint32 *)(((uchar *) dst) + dbpl);
            src = (const quint32 *)(((const uchar *) src) + sbpl);
        }
    }
    // const_alpha == 0: nothing to draw.
}
486 
487 // qblendfunctions.cpp
488 void qt_blend_rgb32_on_rgb32(uchar *destPixels, int dbpl,
489                              const uchar *srcPixels, int sbpl,
490                              int w, int h,
491                              int const_alpha);
492 
// Blends an opaque RGB32 source onto an RGB32 destination with constant
// alpha: dst = src * alpha + dst * (255 - alpha), 4 pixels at a time
// with NEON. Fully opaque blits go through the generic implementation.
void qt_blend_rgb32_on_rgb32_neon(uchar *destPixels, int dbpl,
                                  const uchar *srcPixels, int sbpl,
                                  int w, int h,
                                  int const_alpha)
{
    if (const_alpha != 256) {
        if (const_alpha != 0) {
            const uint *src = (const uint *) srcPixels;
            uint *dst = (uint *) destPixels;
            uint16x8_t half = vdupq_n_u16(0x80);  // rounding bias for /255
            // Map const_alpha from [0, 256] to [0, 255].
            const_alpha = (const_alpha * 255) >> 8;
            int one_minus_const_alpha = 255 - const_alpha;
            uint16x8_t const_alpha16 = vdupq_n_u16(const_alpha);
            uint16x8_t one_minus_const_alpha16 = vdupq_n_u16(255 - const_alpha);
            for (int y = 0; y < h; ++y) {
                int x = 0;
                for (; x < w-3; x += 4) {
                    uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
                    uint32x4_t dst32 = vld1q_u32((uint32_t *)&dst[x]);

                    // Widen the 16 bytes to two 8x16-bit vectors.
                    const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
                    const uint8x16_t dst8 = vreinterpretq_u8_u32(dst32);

                    const uint8x8_t src8_low = vget_low_u8(src8);
                    const uint8x8_t dst8_low = vget_low_u8(dst8);

                    const uint8x8_t src8_high = vget_high_u8(src8);
                    const uint8x8_t dst8_high = vget_high_u8(dst8);

                    const uint16x8_t src16_low = vmovl_u8(src8_low);
                    const uint16x8_t dst16_low = vmovl_u8(dst8_low);

                    const uint16x8_t src16_high = vmovl_u8(src8_high);
                    const uint16x8_t dst16_high = vmovl_u8(dst8_high);

                    // Weighted average of src and dst, then narrow back.
                    const uint16x8_t result16_low = qvinterpolate_pixel_255(src16_low, const_alpha16, dst16_low, one_minus_const_alpha16, half);
                    const uint16x8_t result16_high = qvinterpolate_pixel_255(src16_high, const_alpha16, dst16_high, one_minus_const_alpha16, half);

                    const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
                    const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));

                    vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
                }
                // Scalar epilogue for the last w % 4 pixels.
                for (; x<w; ++x) {
                    dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);
                }
                dst = (quint32 *)(((uchar *) dst) + dbpl);
                src = (const quint32 *)(((const uchar *) src) + sbpl);
            }
        }
        // const_alpha == 0: nothing to draw.
    } else {
        qt_blend_rgb32_on_rgb32(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha);
    }
}
547 
548 #if defined(ENABLE_PIXMAN_DRAWHELPERS)
549 extern void qt_alphamapblit_quint16(QRasterBuffer *rasterBuffer,
550                                     int x, int y, const QRgba64 &color,
551                                     const uchar *map,
552                                     int mapWidth, int mapHeight, int mapStride,
553                                     const QClipData *clip, bool useGammaCorrection);
554 
qt_alphamapblit_quint16_neon(QRasterBuffer * rasterBuffer,int x,int y,const QRgba64 & color,const uchar * bitmap,int mapWidth,int mapHeight,int mapStride,const QClipData * clip,bool useGammaCorrection)555 void qt_alphamapblit_quint16_neon(QRasterBuffer *rasterBuffer,
556                                   int x, int y, const QRgba64 &color,
557                                   const uchar *bitmap,
558                                   int mapWidth, int mapHeight, int mapStride,
559                                   const QClipData *clip, bool useGammaCorrection)
560 {
561     if (clip || useGammaCorrection) {
562         qt_alphamapblit_quint16(rasterBuffer, x, y, color, bitmap, mapWidth, mapHeight, mapStride, clip, useGammaCorrection);
563         return;
564     }
565 
566     quint16 *dest = reinterpret_cast<quint16*>(rasterBuffer->scanLine(y)) + x;
567     const int destStride = rasterBuffer->bytesPerLine() / sizeof(quint16);
568 
569     uchar *mask = const_cast<uchar *>(bitmap);
570     const uint c = color.toArgb32();
571 
572     pixman_composite_over_n_8_0565_asm_neon(mapWidth, mapHeight, dest, destStride, c, 0, mask, mapStride);
573 }
574 
575 extern "C" void blend_8_pixels_rgb16_on_rgb16_neon(quint16 *dst, const quint16 *src, int const_alpha);
576 
// Pixel writer used by the scale/transform helpers: buffers source
// pixels eight at a time and flushes them through an 8-pixel NEON blend
// function (BlendFunc). SRC is the source pixel type (quint16 or
// quint32, matching the blend function).
template <typename SRC, typename BlendFunc>
struct Blend_on_RGB16_SourceAndConstAlpha_Neon {
    Blend_on_RGB16_SourceAndConstAlpha_Neon(BlendFunc blender, int const_alpha)
        : m_index(0)
        , m_blender(blender)
        , m_const_alpha(const_alpha)
    {
    }

    // Queue one source pixel; 'dst' is this pixel's destination. When the
    // eighth pixel arrives, blend all eight in place (dst - 7 is the
    // destination of the first queued pixel).
    inline void write(quint16 *dst, quint32 src)
    {
        srcBuffer[m_index++] = src;

        if (m_index == 8) {
            m_blender(dst - 7, srcBuffer, m_const_alpha);
            m_index = 0;
        }
    }

    // Blend any pending (< 8) pixels. 'dst' points one past the last
    // queued pixel's destination, so the pending span is dst[-m_index..-1].
    // The span is bounced through a stack buffer so the 8-pixel blender
    // never writes outside it; entries past m_index are not copied back.
    inline void flush(quint16 *dst)
    {
        if (m_index > 0) {
            quint16 dstBuffer[8];
            for (int i = 0; i < m_index; ++i)
                dstBuffer[i] = dst[i - m_index];

            m_blender(dstBuffer, srcBuffer, m_const_alpha);

            for (int i = 0; i < m_index; ++i)
                dst[i - m_index] = dstBuffer[i];

            m_index = 0;
        }
    }

    SRC srcBuffer[8];        // pending source pixels

    int m_index;             // number of pending pixels (0-7)
    BlendFunc m_blender;     // 8-pixel NEON blend function
    int m_const_alpha;       // constant alpha passed to the blender
};
618 
619 template <typename SRC, typename BlendFunc>
620 Blend_on_RGB16_SourceAndConstAlpha_Neon<SRC, BlendFunc>
Blend_on_RGB16_SourceAndConstAlpha_Neon_create(BlendFunc blender,int const_alpha)621 Blend_on_RGB16_SourceAndConstAlpha_Neon_create(BlendFunc blender, int const_alpha)
622 {
623     return Blend_on_RGB16_SourceAndConstAlpha_Neon<SRC, BlendFunc>(blender, const_alpha);
624 }
625 
qt_scale_image_argb32_on_rgb16_neon(uchar * destPixels,int dbpl,const uchar * srcPixels,int sbpl,int srch,const QRectF & targetRect,const QRectF & sourceRect,const QRect & clip,int const_alpha)626 void qt_scale_image_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
627                                          const uchar *srcPixels, int sbpl, int srch,
628                                          const QRectF &targetRect,
629                                          const QRectF &sourceRect,
630                                          const QRect &clip,
631                                          int const_alpha)
632 {
633     if (const_alpha == 0)
634         return;
635 
636     qt_scale_image_16bit<quint32>(destPixels, dbpl, srcPixels, sbpl, srch, targetRect, sourceRect, clip,
637         Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint32>(blend_8_pixels_argb32_on_rgb16_neon, const_alpha));
638 }
639 
640 void qt_scale_image_rgb16_on_rgb16(uchar *destPixels, int dbpl,
641                                    const uchar *srcPixels, int sbpl, int srch,
642                                    const QRectF &targetRect,
643                                    const QRectF &sourceRect,
644                                    const QRect &clip,
645                                    int const_alpha);
646 
qt_scale_image_rgb16_on_rgb16_neon(uchar * destPixels,int dbpl,const uchar * srcPixels,int sbpl,int srch,const QRectF & targetRect,const QRectF & sourceRect,const QRect & clip,int const_alpha)647 void qt_scale_image_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl,
648                                         const uchar *srcPixels, int sbpl, int srch,
649                                         const QRectF &targetRect,
650                                         const QRectF &sourceRect,
651                                         const QRect &clip,
652                                         int const_alpha)
653 {
654     if (const_alpha == 0)
655         return;
656 
657     if (const_alpha == 256) {
658         qt_scale_image_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, srch, targetRect, sourceRect, clip, const_alpha);
659         return;
660     }
661 
662     qt_scale_image_16bit<quint16>(destPixels, dbpl, srcPixels, sbpl, srch, targetRect, sourceRect, clip,
663         Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint16>(blend_8_pixels_rgb16_on_rgb16_neon, const_alpha));
664 }
665 
666 extern void qt_transform_image_rgb16_on_rgb16(uchar *destPixels, int dbpl,
667                                               const uchar *srcPixels, int sbpl,
668                                               const QRectF &targetRect,
669                                               const QRectF &sourceRect,
670                                               const QRect &clip,
671                                               const QTransform &targetRectTransform,
672                                               int const_alpha);
673 
qt_transform_image_rgb16_on_rgb16_neon(uchar * destPixels,int dbpl,const uchar * srcPixels,int sbpl,const QRectF & targetRect,const QRectF & sourceRect,const QRect & clip,const QTransform & targetRectTransform,int const_alpha)674 void qt_transform_image_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl,
675                                             const uchar *srcPixels, int sbpl,
676                                             const QRectF &targetRect,
677                                             const QRectF &sourceRect,
678                                             const QRect &clip,
679                                             const QTransform &targetRectTransform,
680                                             int const_alpha)
681 {
682     if (const_alpha == 0)
683         return;
684 
685     if (const_alpha == 256) {
686         qt_transform_image_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, targetRect, sourceRect, clip, targetRectTransform, const_alpha);
687         return;
688     }
689 
690     qt_transform_image(reinterpret_cast<quint16 *>(destPixels), dbpl,
691                        reinterpret_cast<const quint16 *>(srcPixels), sbpl, targetRect, sourceRect, clip, targetRectTransform,
692         Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint16>(blend_8_pixels_rgb16_on_rgb16_neon, const_alpha));
693 }
694 
qt_transform_image_argb32_on_rgb16_neon(uchar * destPixels,int dbpl,const uchar * srcPixels,int sbpl,const QRectF & targetRect,const QRectF & sourceRect,const QRect & clip,const QTransform & targetRectTransform,int const_alpha)695 void qt_transform_image_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
696                                              const uchar *srcPixels, int sbpl,
697                                              const QRectF &targetRect,
698                                              const QRectF &sourceRect,
699                                              const QRect &clip,
700                                              const QTransform &targetRectTransform,
701                                              int const_alpha)
702 {
703     if (const_alpha == 0)
704         return;
705 
706     qt_transform_image(reinterpret_cast<quint16 *>(destPixels), dbpl,
707                        reinterpret_cast<const quint32 *>(srcPixels), sbpl, targetRect, sourceRect, clip, targetRectTransform,
708         Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint32>(blend_8_pixels_argb32_on_rgb16_neon, const_alpha));
709 }
710 
convert_8_pixels_rgb16_to_argb32(quint32 * dst,const quint16 * src)711 static inline void convert_8_pixels_rgb16_to_argb32(quint32 *dst, const quint16 *src)
712 {
713     asm volatile (
714         "vld1.16     { d0, d1 }, [%[SRC]]\n\t"
715 
716         /* convert 8 r5g6b5 pixel data from {d0, d1} to planar 8-bit format
717            and put data into d4 - red, d3 - green, d2 - blue */
718         "vshrn.u16   d4,  q0,  #8\n\t"
719         "vshrn.u16   d3,  q0,  #3\n\t"
720         "vsli.u16    q0,  q0,  #5\n\t"
721         "vsri.u8     d4,  d4,  #5\n\t"
722         "vsri.u8     d3,  d3,  #6\n\t"
723         "vshrn.u16   d2,  q0,  #2\n\t"
724 
725         /* fill d5 - alpha with 0xff */
726         "mov         r2, #255\n\t"
727         "vdup.8      d5, r2\n\t"
728 
729         "vst4.8      { d2, d3, d4, d5 }, [%[DST]]"
730         : : [DST]"r" (dst), [SRC]"r" (src)
731         : "memory", "r2", "d0", "d1", "d2", "d3", "d4", "d5"
732     );
733 }
734 
qt_destFetchRGB16_neon(uint * buffer,QRasterBuffer * rasterBuffer,int x,int y,int length)735 uint * QT_FASTCALL qt_destFetchRGB16_neon(uint *buffer, QRasterBuffer *rasterBuffer, int x, int y, int length)
736 {
737     const ushort *data = (const ushort *)rasterBuffer->scanLine(y) + x;
738 
739     int i = 0;
740     for (; i < length - 7; i += 8)
741         convert_8_pixels_rgb16_to_argb32(&buffer[i], &data[i]);
742 
743     if (i < length) {
744         quint16 srcBuffer[8];
745         quint32 dstBuffer[8];
746 
747         int tail = length - i;
748         for (int j = 0; j < tail; ++j)
749             srcBuffer[j] = data[i + j];
750 
751         convert_8_pixels_rgb16_to_argb32(dstBuffer, srcBuffer);
752 
753         for (int j = 0; j < tail; ++j)
754             buffer[i + j] = dstBuffer[j];
755     }
756 
757     return buffer;
758 }
759 
convert_8_pixels_argb32_to_rgb16(quint16 * dst,const quint32 * src)760 static inline void convert_8_pixels_argb32_to_rgb16(quint16 *dst, const quint32 *src)
761 {
762     asm volatile (
763         "vld4.8      { d0, d1, d2, d3 }, [%[SRC]]\n\t"
764 
765         /* convert to r5g6b5 and store it into {d28, d29} */
766         "vshll.u8    q14, d2, #8\n\t"
767         "vshll.u8    q8,  d1, #8\n\t"
768         "vshll.u8    q9,  d0, #8\n\t"
769         "vsri.u16    q14, q8, #5\n\t"
770         "vsri.u16    q14, q9, #11\n\t"
771 
772         "vst1.16     { d28, d29 }, [%[DST]]"
773         : : [DST]"r" (dst), [SRC]"r" (src)
774         : "memory", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d28", "d29"
775     );
776 }
777 
qt_destStoreRGB16_neon(QRasterBuffer * rasterBuffer,int x,int y,const uint * buffer,int length)778 void QT_FASTCALL qt_destStoreRGB16_neon(QRasterBuffer *rasterBuffer, int x, int y, const uint *buffer, int length)
779 {
780     quint16 *data = (quint16*)rasterBuffer->scanLine(y) + x;
781 
782     int i = 0;
783     for (; i < length - 7; i += 8)
784         convert_8_pixels_argb32_to_rgb16(&data[i], &buffer[i]);
785 
786     if (i < length) {
787         quint32 srcBuffer[8];
788         quint16 dstBuffer[8];
789 
790         int tail = length - i;
791         for (int j = 0; j < tail; ++j)
792             srcBuffer[j] = buffer[i + j];
793 
794         convert_8_pixels_argb32_to_rgb16(dstBuffer, srcBuffer);
795 
796         for (int j = 0; j < tail; ++j)
797             data[i + j] = dstBuffer[j];
798     }
799 }
800 #endif
801 
comp_func_solid_SourceOver_neon(uint * destPixels,int length,uint color,uint const_alpha)802 void QT_FASTCALL comp_func_solid_SourceOver_neon(uint *destPixels, int length, uint color, uint const_alpha)
803 {
804     if ((const_alpha & qAlpha(color)) == 255) {
805         qt_memfill32(destPixels, color, length);
806     } else {
807         if (const_alpha != 255)
808             color = BYTE_MUL(color, const_alpha);
809 
810         const quint32 minusAlphaOfColor = qAlpha(~color);
811         int x = 0;
812 
813         uint32_t *dst = (uint32_t *) destPixels;
814         const uint32x4_t colorVector = vdupq_n_u32(color);
815         uint16x8_t half = vdupq_n_u16(0x80);
816         const uint16x8_t minusAlphaOfColorVector = vdupq_n_u16(minusAlphaOfColor);
817 
818         for (; x < length-3; x += 4) {
819             uint32x4_t dstVector = vld1q_u32(&dst[x]);
820 
821             const uint8x16_t dst8 = vreinterpretq_u8_u32(dstVector);
822 
823             const uint8x8_t dst8_low = vget_low_u8(dst8);
824             const uint8x8_t dst8_high = vget_high_u8(dst8);
825 
826             const uint16x8_t dst16_low = vmovl_u8(dst8_low);
827             const uint16x8_t dst16_high = vmovl_u8(dst8_high);
828 
829             const uint16x8_t result16_low = qvbyte_mul_u16(dst16_low, minusAlphaOfColorVector, half);
830             const uint16x8_t result16_high = qvbyte_mul_u16(dst16_high, minusAlphaOfColorVector, half);
831 
832             const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
833             const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));
834 
835             uint32x4_t blendedPixels = vcombine_u32(result32_low, result32_high);
836             uint32x4_t colorPlusBlendedPixels = vaddq_u32(colorVector, blendedPixels);
837             vst1q_u32(&dst[x], colorPlusBlendedPixels);
838         }
839 
840         SIMD_EPILOGUE(x, length, 3)
841             destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
842     }
843 }
844 
comp_func_Plus_neon(uint * dst,const uint * src,int length,uint const_alpha)845 void QT_FASTCALL comp_func_Plus_neon(uint *dst, const uint *src, int length, uint const_alpha)
846 {
847     if (const_alpha == 255) {
848         uint *const end = dst + length;
849         uint *const neonEnd = end - 3;
850 
851         while (dst < neonEnd) {
852             uint8x16_t vs = vld1q_u8((const uint8_t*)src);
853             const uint8x16_t vd = vld1q_u8((uint8_t*)dst);
854             vs = vqaddq_u8(vs, vd);
855             vst1q_u8((uint8_t*)dst, vs);
856             src += 4;
857             dst += 4;
858         };
859 
860         while (dst != end) {
861             *dst = comp_func_Plus_one_pixel(*dst, *src);
862             ++dst;
863             ++src;
864         }
865     } else {
866         int x = 0;
867         const int one_minus_const_alpha = 255 - const_alpha;
868         const uint16x8_t constAlphaVector = vdupq_n_u16(const_alpha);
869         const uint16x8_t oneMinusconstAlphaVector = vdupq_n_u16(one_minus_const_alpha);
870 
871         const uint16x8_t half = vdupq_n_u16(0x80);
872         for (; x < length - 3; x += 4) {
873             const uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
874             const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
875             uint8x16_t dst8 = vld1q_u8((uint8_t *)&dst[x]);
876             uint8x16_t result = vqaddq_u8(dst8, src8);
877 
878             uint16x8_t result_low = vmovl_u8(vget_low_u8(result));
879             uint16x8_t result_high = vmovl_u8(vget_high_u8(result));
880 
881             uint16x8_t dst_low = vmovl_u8(vget_low_u8(dst8));
882             uint16x8_t dst_high = vmovl_u8(vget_high_u8(dst8));
883 
884             result_low = qvinterpolate_pixel_255(result_low, constAlphaVector, dst_low, oneMinusconstAlphaVector, half);
885             result_high = qvinterpolate_pixel_255(result_high, constAlphaVector, dst_high, oneMinusconstAlphaVector, half);
886 
887             const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result_low));
888             const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result_high));
889             vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
890         }
891 
892         SIMD_EPILOGUE(x, length, 3)
893             dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x], const_alpha, one_minus_const_alpha);
894     }
895 }
896 
897 #if defined(ENABLE_PIXMAN_DRAWHELPERS)
898 static const int tileSize = 32;
899 
900 extern "C" void qt_rotate90_16_neon(quint16 *dst, const quint16 *src, int sstride, int dstride, int count);
901 
qt_memrotate90_16_neon(const uchar * srcPixels,int w,int h,int sstride,uchar * destPixels,int dstride)902 void qt_memrotate90_16_neon(const uchar *srcPixels, int w, int h, int sstride, uchar *destPixels, int dstride)
903 {
904     const ushort *src = (const ushort *)srcPixels;
905     ushort *dest = (ushort *)destPixels;
906 
907     sstride /= sizeof(ushort);
908     dstride /= sizeof(ushort);
909 
910     const int pack = sizeof(quint32) / sizeof(ushort);
911     const int unaligned =
912         qMin(uint((quintptr(dest) & (sizeof(quint32)-1)) / sizeof(ushort)), uint(h));
913     const int restX = w % tileSize;
914     const int restY = (h - unaligned) % tileSize;
915     const int unoptimizedY = restY % pack;
916     const int numTilesX = w / tileSize + (restX > 0);
917     const int numTilesY = (h - unaligned) / tileSize + (restY >= pack);
918 
919     for (int tx = 0; tx < numTilesX; ++tx) {
920         const int startx = w - tx * tileSize - 1;
921         const int stopx = qMax(startx - tileSize, 0);
922 
923         if (unaligned) {
924             for (int x = startx; x >= stopx; --x) {
925                 ushort *d = dest + (w - x - 1) * dstride;
926                 for (int y = 0; y < unaligned; ++y) {
927                     *d++ = src[y * sstride + x];
928                 }
929             }
930         }
931 
932         for (int ty = 0; ty < numTilesY; ++ty) {
933             const int starty = ty * tileSize + unaligned;
934             const int stopy = qMin(starty + tileSize, h - unoptimizedY);
935 
936             int x = startx;
937             // qt_rotate90_16_neon writes to eight rows, four pixels at a time
938             for (; x >= stopx + 7; x -= 8) {
939                 ushort *d = dest + (w - x - 1) * dstride + starty;
940                 const ushort *s = &src[starty * sstride + x - 7];
941                 qt_rotate90_16_neon(d, s, sstride * 2, dstride * 2, stopy - starty);
942             }
943 
944             for (; x >= stopx; --x) {
945                 quint32 *d = reinterpret_cast<quint32*>(dest + (w - x - 1) * dstride + starty);
946                 for (int y = starty; y < stopy; y += pack) {
947                     quint32 c = src[y * sstride + x];
948                     for (int i = 1; i < pack; ++i) {
949                         const int shift = (sizeof(int) * 8 / pack * i);
950                         const ushort color = src[(y + i) * sstride + x];
951                         c |= color << shift;
952                     }
953                     *d++ = c;
954                 }
955             }
956         }
957 
958         if (unoptimizedY) {
959             const int starty = h - unoptimizedY;
960             for (int x = startx; x >= stopx; --x) {
961                 ushort *d = dest + (w - x - 1) * dstride + starty;
962                 for (int y = starty; y < h; ++y) {
963                     *d++ = src[y * sstride + x];
964                 }
965             }
966         }
967     }
968 }
969 
970 extern "C" void qt_rotate270_16_neon(quint16 *dst, const quint16 *src, int sstride, int dstride, int count);
971 
qt_memrotate270_16_neon(const uchar * srcPixels,int w,int h,int sstride,uchar * destPixels,int dstride)972 void qt_memrotate270_16_neon(const uchar *srcPixels, int w, int h,
973                              int sstride,
974                              uchar *destPixels, int dstride)
975 {
976     const ushort *src = (const ushort *)srcPixels;
977     ushort *dest = (ushort *)destPixels;
978 
979     sstride /= sizeof(ushort);
980     dstride /= sizeof(ushort);
981 
982     const int pack = sizeof(quint32) / sizeof(ushort);
983     const int unaligned =
984         qMin(uint((long(dest) & (sizeof(quint32)-1)) / sizeof(ushort)), uint(h));
985     const int restX = w % tileSize;
986     const int restY = (h - unaligned) % tileSize;
987     const int unoptimizedY = restY % pack;
988     const int numTilesX = w / tileSize + (restX > 0);
989     const int numTilesY = (h - unaligned) / tileSize + (restY >= pack);
990 
991     for (int tx = 0; tx < numTilesX; ++tx) {
992         const int startx = tx * tileSize;
993         const int stopx = qMin(startx + tileSize, w);
994 
995         if (unaligned) {
996             for (int x = startx; x < stopx; ++x) {
997                 ushort *d = dest + x * dstride;
998                 for (int y = h - 1; y >= h - unaligned; --y) {
999                     *d++ = src[y * sstride + x];
1000                 }
1001             }
1002         }
1003 
1004         for (int ty = 0; ty < numTilesY; ++ty) {
1005             const int starty = h - 1 - unaligned - ty * tileSize;
1006             const int stopy = qMax(starty - tileSize, unoptimizedY);
1007 
1008             int x = startx;
1009             // qt_rotate90_16_neon writes to eight rows, four pixels at a time
1010             for (; x < stopx - 7; x += 8) {
1011                 ushort *d = dest + x * dstride + h - 1 - starty;
1012                 const ushort *s = &src[starty * sstride + x];
1013                 qt_rotate90_16_neon(d + 7 * dstride, s, -sstride * 2, -dstride * 2, starty - stopy);
1014             }
1015 
1016             for (; x < stopx; ++x) {
1017                 quint32 *d = reinterpret_cast<quint32*>(dest + x * dstride
1018                                                         + h - 1 - starty);
1019                 for (int y = starty; y > stopy; y -= pack) {
1020                     quint32 c = src[y * sstride + x];
1021                     for (int i = 1; i < pack; ++i) {
1022                         const int shift = (sizeof(int) * 8 / pack * i);
1023                         const ushort color = src[(y - i) * sstride + x];
1024                         c |= color << shift;
1025                     }
1026                     *d++ = c;
1027                 }
1028             }
1029         }
1030         if (unoptimizedY) {
1031             const int starty = unoptimizedY - 1;
1032             for (int x = startx; x < stopx; ++x) {
1033                 ushort *d = dest + x * dstride + h - 1 - starty;
1034                 for (int y = starty; y >= 0; --y) {
1035                     *d++ = src[y * sstride + x];
1036                 }
1037             }
1038         }
1039     }
1040 }
1041 #endif
1042 
1043 class QSimdNeon
1044 {
1045 public:
1046     typedef int32x4_t Int32x4;
1047     typedef float32x4_t Float32x4;
1048 
1049     union Vect_buffer_i { Int32x4 v; int i[4]; };
1050     union Vect_buffer_f { Float32x4 v; float f[4]; };
1051 
v_dup(double x)1052     static inline Float32x4 v_dup(double x) { return vdupq_n_f32(float(x)); }
v_dup(float x)1053     static inline Float32x4 v_dup(float x) { return vdupq_n_f32(x); }
v_dup(int x)1054     static inline Int32x4 v_dup(int x) { return vdupq_n_s32(x); }
v_dup(uint x)1055     static inline Int32x4 v_dup(uint x) { return vdupq_n_s32(x); }
1056 
v_add(Float32x4 a,Float32x4 b)1057     static inline Float32x4 v_add(Float32x4 a, Float32x4 b) { return vaddq_f32(a, b); }
v_add(Int32x4 a,Int32x4 b)1058     static inline Int32x4 v_add(Int32x4 a, Int32x4 b) { return vaddq_s32(a, b); }
1059 
v_max(Float32x4 a,Float32x4 b)1060     static inline Float32x4 v_max(Float32x4 a, Float32x4 b) { return vmaxq_f32(a, b); }
v_min(Float32x4 a,Float32x4 b)1061     static inline Float32x4 v_min(Float32x4 a, Float32x4 b) { return vminq_f32(a, b); }
v_min_16(Int32x4 a,Int32x4 b)1062     static inline Int32x4 v_min_16(Int32x4 a, Int32x4 b) { return vminq_s32(a, b); }
1063 
v_and(Int32x4 a,Int32x4 b)1064     static inline Int32x4 v_and(Int32x4 a, Int32x4 b) { return vandq_s32(a, b); }
1065 
v_sub(Float32x4 a,Float32x4 b)1066     static inline Float32x4 v_sub(Float32x4 a, Float32x4 b) { return vsubq_f32(a, b); }
v_sub(Int32x4 a,Int32x4 b)1067     static inline Int32x4 v_sub(Int32x4 a, Int32x4 b) { return vsubq_s32(a, b); }
1068 
v_mul(Float32x4 a,Float32x4 b)1069     static inline Float32x4 v_mul(Float32x4 a, Float32x4 b) { return vmulq_f32(a, b); }
1070 
v_sqrt(Float32x4 x)1071     static inline Float32x4 v_sqrt(Float32x4 x) { Float32x4 y = vrsqrteq_f32(x); y = vmulq_f32(y, vrsqrtsq_f32(x, vmulq_f32(y, y))); return vmulq_f32(x, y); }
1072 
v_toInt(Float32x4 x)1073     static inline Int32x4 v_toInt(Float32x4 x) { return vcvtq_s32_f32(x); }
1074 
v_greaterOrEqual(Float32x4 a,Float32x4 b)1075     static inline Int32x4 v_greaterOrEqual(Float32x4 a, Float32x4 b) { return vreinterpretq_s32_u32(vcgeq_f32(a, b)); }
1076 };
1077 
qt_fetch_radial_gradient_neon(uint * buffer,const Operator * op,const QSpanData * data,int y,int x,int length)1078 const uint * QT_FASTCALL qt_fetch_radial_gradient_neon(uint *buffer, const Operator *op, const QSpanData *data,
1079                                                        int y, int x, int length)
1080 {
1081     return qt_fetch_radial_gradient_template<QRadialFetchSimd<QSimdNeon>,uint>(buffer, op, data, y, x, length);
1082 }
1083 
1084 extern void QT_FASTCALL qt_convert_rgb888_to_rgb32_neon(quint32 *dst, const uchar *src, int len);
1085 
qt_fetchUntransformed_888_neon(uint * buffer,const Operator *,const QSpanData * data,int y,int x,int length)1086 const uint * QT_FASTCALL qt_fetchUntransformed_888_neon(uint *buffer, const Operator *, const QSpanData *data,
1087                                                        int y, int x, int length)
1088 {
1089     const uchar *line = data->texture.scanLine(y) + x * 3;
1090     qt_convert_rgb888_to_rgb32_neon(buffer, line, length);
1091     return buffer;
1092 }
1093 
1094 #if Q_BYTE_ORDER == Q_LITTLE_ENDIAN
vrgba2argb(uint32x4_t srcVector)1095 static inline uint32x4_t vrgba2argb(uint32x4_t srcVector)
1096 {
1097 #if defined(Q_PROCESSOR_ARM_64)
1098     const uint8x16_t rgbaMask  = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
1099 #else
1100     const uint8x8_t rgbaMask  = { 2, 1, 0, 3, 6, 5, 4, 7 };
1101 #endif
1102 #if defined(Q_PROCESSOR_ARM_64)
1103     srcVector = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(srcVector), rgbaMask));
1104 #else
1105     // no vqtbl1q_u8, so use two vtbl1_u8
1106     const uint8x8_t low = vtbl1_u8(vreinterpret_u8_u32(vget_low_u32(srcVector)), rgbaMask);
1107     const uint8x8_t high = vtbl1_u8(vreinterpret_u8_u32(vget_high_u32(srcVector)), rgbaMask);
1108     srcVector = vcombine_u32(vreinterpret_u32_u8(low), vreinterpret_u32_u8(high));
1109 #endif
1110     return srcVector;
1111 }
1112 
1113 template<bool RGBA>
convertARGBToARGB32PM_neon(uint * buffer,const uint * src,int count)1114 static inline void convertARGBToARGB32PM_neon(uint *buffer, const uint *src, int count)
1115 {
1116     int i = 0;
1117     const uint8x8_t shuffleMask = { 3, 3, 3, 3, 7, 7, 7, 7};
1118     const uint32x4_t blendMask = vdupq_n_u32(0xff000000);
1119 
1120     for (; i < count - 3; i += 4) {
1121         uint32x4_t srcVector = vld1q_u32(src + i);
1122         uint32x4_t alphaVector = vshrq_n_u32(srcVector, 24);
1123 #if defined(Q_PROCESSOR_ARM_64)
1124         uint32_t alphaSum = vaddvq_u32(alphaVector);
1125 #else
1126         // no vaddvq_u32
1127         uint32x2_t tmp = vpadd_u32(vget_low_u32(alphaVector), vget_high_u32(alphaVector));
1128         uint32_t alphaSum = vget_lane_u32(vpadd_u32(tmp, tmp), 0);
1129 #endif
1130         if (alphaSum) {
1131             if (alphaSum != 255 * 4) {
1132                 if (RGBA)
1133                     srcVector = vrgba2argb(srcVector);
1134                 const uint8x8_t s1 = vreinterpret_u8_u32(vget_low_u32(srcVector));
1135                 const uint8x8_t s2 = vreinterpret_u8_u32(vget_high_u32(srcVector));
1136                 const uint8x8_t alpha1 = vtbl1_u8(s1, shuffleMask);
1137                 const uint8x8_t alpha2 = vtbl1_u8(s2, shuffleMask);
1138                 uint16x8_t src1 = vmull_u8(s1, alpha1);
1139                 uint16x8_t src2 = vmull_u8(s2, alpha2);
1140                 src1 = vsraq_n_u16(src1, src1, 8);
1141                 src2 = vsraq_n_u16(src2, src2, 8);
1142                 const uint8x8_t d1 = vrshrn_n_u16(src1, 8);
1143                 const uint8x8_t d2 = vrshrn_n_u16(src2, 8);
1144                 const uint32x4_t d = vbslq_u32(blendMask, srcVector, vreinterpretq_u32_u8(vcombine_u8(d1, d2)));
1145                 vst1q_u32(buffer + i, d);
1146             } else {
1147                 if (RGBA)
1148                     vst1q_u32(buffer + i, vrgba2argb(srcVector));
1149                 else if (buffer != src)
1150                     vst1q_u32(buffer + i, srcVector);
1151             }
1152         } else {
1153             vst1q_u32(buffer + i, vdupq_n_u32(0));
1154         }
1155     }
1156 
1157     SIMD_EPILOGUE(i, count, 3) {
1158         uint v = qPremultiply(src[i]);
1159         buffer[i] = RGBA ? RGBA2ARGB(v) : v;
1160     }
1161 }
1162 
1163 template<bool RGBA>
convertARGB32ToRGBA64PM_neon(QRgba64 * buffer,const uint * src,int count)1164 static inline void convertARGB32ToRGBA64PM_neon(QRgba64 *buffer, const uint *src, int count)
1165 {
1166     if (count <= 0)
1167         return;
1168 
1169     const uint8x8_t shuffleMask = { 3, 3, 3, 3, 7, 7, 7, 7};
1170     const uint64x2_t blendMask = vdupq_n_u64(Q_UINT64_C(0xffff000000000000));
1171 
1172     int i = 0;
1173     for (; i < count-3; i += 4) {
1174         uint32x4_t vs32 = vld1q_u32(src + i);
1175         uint32x4_t alphaVector = vshrq_n_u32(vs32, 24);
1176 #if defined(Q_PROCESSOR_ARM_64)
1177         uint32_t alphaSum = vaddvq_u32(alphaVector);
1178 #else
1179         // no vaddvq_u32
1180         uint32x2_t tmp = vpadd_u32(vget_low_u32(alphaVector), vget_high_u32(alphaVector));
1181         uint32_t alphaSum = vget_lane_u32(vpadd_u32(tmp, tmp), 0);
1182 #endif
1183         if (alphaSum) {
1184             if (!RGBA)
1185                 vs32 = vrgba2argb(vs32);
1186             const uint8x16_t vs8 = vreinterpretq_u8_u32(vs32);
1187             const uint8x16x2_t v = vzipq_u8(vs8, vs8);
1188             if (alphaSum != 255 * 4) {
1189                 const uint8x8_t s1 = vreinterpret_u8_u32(vget_low_u32(vs32));
1190                 const uint8x8_t s2 = vreinterpret_u8_u32(vget_high_u32(vs32));
1191                 const uint8x8_t alpha1 = vtbl1_u8(s1, shuffleMask);
1192                 const uint8x8_t alpha2 = vtbl1_u8(s2, shuffleMask);
1193                 uint16x8_t src1 = vmull_u8(s1, alpha1);
1194                 uint16x8_t src2 = vmull_u8(s2, alpha2);
1195                 // convert from 0->(255x255) to 0->(255x257)
1196                 src1 = vsraq_n_u16(src1, src1, 7);
1197                 src2 = vsraq_n_u16(src2, src2, 7);
1198 
1199                 // now restore alpha from the trivial conversion
1200                 const uint64x2_t d1 = vbslq_u64(blendMask, vreinterpretq_u64_u8(v.val[0]), vreinterpretq_u64_u16(src1));
1201                 const uint64x2_t d2 = vbslq_u64(blendMask, vreinterpretq_u64_u8(v.val[1]), vreinterpretq_u64_u16(src2));
1202 
1203                 vst1q_u16((uint16_t *)buffer, vreinterpretq_u16_u64(d1));
1204                 buffer += 2;
1205                 vst1q_u16((uint16_t *)buffer, vreinterpretq_u16_u64(d2));
1206                 buffer += 2;
1207             } else {
1208                 vst1q_u16((uint16_t *)buffer, vreinterpretq_u16_u8(v.val[0]));
1209                 buffer += 2;
1210                 vst1q_u16((uint16_t *)buffer, vreinterpretq_u16_u8(v.val[1]));
1211                 buffer += 2;
1212             }
1213         } else {
1214             vst1q_u16((uint16_t *)buffer, vdupq_n_u16(0));
1215             buffer += 2;
1216             vst1q_u16((uint16_t *)buffer, vdupq_n_u16(0));
1217             buffer += 2;
1218         }
1219     }
1220 
1221     SIMD_EPILOGUE(i, count, 3) {
1222         uint s = src[i];
1223         if (RGBA)
1224             s = RGBA2ARGB(s);
1225         *buffer++ = QRgba64::fromArgb32(s).premultiplied();
1226     }
1227 }
1228 
reciprocal_mul_ps(float32x4_t a,float mul)1229 static inline float32x4_t reciprocal_mul_ps(float32x4_t a, float mul)
1230 {
1231     float32x4_t ia = vrecpeq_f32(a); // estimate 1/a
1232     ia = vmulq_f32(vrecpsq_f32(a, ia), vmulq_n_f32(ia, mul)); // estimate improvement step * mul
1233     return ia;
1234 }
1235 
1236 template<bool RGBA, bool RGBx>
convertARGBFromARGB32PM_neon(uint * buffer,const uint * src,int count)1237 static inline void convertARGBFromARGB32PM_neon(uint *buffer, const uint *src, int count)
1238 {
1239     int i = 0;
1240     const uint32x4_t alphaMask = vdupq_n_u32(0xff000000);
1241 
1242     for (; i < count - 3; i += 4) {
1243         uint32x4_t srcVector = vld1q_u32(src + i);
1244         uint32x4_t alphaVector = vshrq_n_u32(srcVector, 24);
1245 #if defined(Q_PROCESSOR_ARM_64)
1246         uint32_t alphaSum = vaddvq_u32(alphaVector);
1247 #else
1248         // no vaddvq_u32
1249         uint32x2_t tmp = vpadd_u32(vget_low_u32(alphaVector), vget_high_u32(alphaVector));
1250         uint32_t alphaSum = vget_lane_u32(vpadd_u32(tmp, tmp), 0);
1251 #endif
1252         if (alphaSum) {
1253             if (alphaSum != 255 * 4) {
1254                 if (RGBA)
1255                     srcVector = vrgba2argb(srcVector);
1256                 const float32x4_t a = vcvtq_f32_u32(alphaVector);
1257                 const float32x4_t ia = reciprocal_mul_ps(a, 255.0f);
1258                 // Convert 4x(4xU8) to 4x(4xF32)
1259                 uint16x8_t tmp1 = vmovl_u8(vget_low_u8(vreinterpretq_u8_u32(srcVector)));
1260                 uint16x8_t tmp3 = vmovl_u8(vget_high_u8(vreinterpretq_u8_u32(srcVector)));
1261                 float32x4_t src1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(tmp1)));
1262                 float32x4_t src2 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(tmp1)));
1263                 float32x4_t src3 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(tmp3)));
1264                 float32x4_t src4 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(tmp3)));
1265                 src1 = vmulq_lane_f32(src1, vget_low_f32(ia), 0);
1266                 src2 = vmulq_lane_f32(src2, vget_low_f32(ia), 1);
1267                 src3 = vmulq_lane_f32(src3, vget_high_f32(ia), 0);
1268                 src4 = vmulq_lane_f32(src4, vget_high_f32(ia), 1);
1269                 // Convert 4x(4xF32) back to 4x(4xU8) (over a 8.1 fixed point format to get rounding)
1270                 tmp1 = vcombine_u16(vrshrn_n_u32(vcvtq_n_u32_f32(src1, 1), 1),
1271                                     vrshrn_n_u32(vcvtq_n_u32_f32(src2, 1), 1));
1272                 tmp3 = vcombine_u16(vrshrn_n_u32(vcvtq_n_u32_f32(src3, 1), 1),
1273                                     vrshrn_n_u32(vcvtq_n_u32_f32(src4, 1), 1));
1274                 uint32x4_t dstVector = vreinterpretq_u32_u8(vcombine_u8(vmovn_u16(tmp1), vmovn_u16(tmp3)));
1275                 // Overwrite any undefined results from alpha==0 with zeros:
1276 #if defined(Q_PROCESSOR_ARM_64)
1277                 uint32x4_t srcVectorAlphaMask = vceqzq_u32(alphaVector);
1278 #else
1279                 uint32x4_t srcVectorAlphaMask = vceqq_u32(alphaVector, vdupq_n_u32(0));
1280 #endif
1281                 dstVector = vbicq_u32(dstVector, srcVectorAlphaMask);
1282                 // Restore or mask alpha values:
1283                 if (RGBx)
1284                     dstVector = vorrq_u32(alphaMask, dstVector);
1285                 else
1286                     dstVector = vbslq_u32(alphaMask, srcVector, dstVector);
1287                 vst1q_u32(&buffer[i], dstVector);
1288             } else {
1289                 // 4xAlpha==255, no change except if we are doing RGBA->ARGB:
1290                 if (RGBA)
1291                     vst1q_u32(&buffer[i], vrgba2argb(srcVector));
1292                 else if (buffer != src)
1293                     vst1q_u32(&buffer[i], srcVector);
1294             }
1295         } else {
1296             // 4xAlpha==0, always zero, except if output is RGBx:
1297             if (RGBx)
1298                 vst1q_u32(&buffer[i], alphaMask);
1299             else
1300                 vst1q_u32(&buffer[i], vdupq_n_u32(0));
1301         }
1302     }
1303 
1304     SIMD_EPILOGUE(i, count, 3) {
1305         uint v = qUnpremultiply(src[i]);
1306         if (RGBx)
1307             v = 0xff000000 | v;
1308         if (RGBA)
1309             v = ARGB2RGBA(v);
1310         buffer[i] = v;
1311     }
1312 }
1313 
convertARGB32ToARGB32PM_neon(uint * buffer,int count,const QVector<QRgb> *)1314 void QT_FASTCALL convertARGB32ToARGB32PM_neon(uint *buffer, int count, const QVector<QRgb> *)
1315 {
1316     convertARGBToARGB32PM_neon<false>(buffer, buffer, count);
1317 }
1318 
convertRGBA8888ToARGB32PM_neon(uint * buffer,int count,const QVector<QRgb> *)1319 void QT_FASTCALL convertRGBA8888ToARGB32PM_neon(uint *buffer, int count, const QVector<QRgb> *)
1320 {
1321     convertARGBToARGB32PM_neon<true>(buffer, buffer, count);
1322 }
1323 
fetchARGB32ToARGB32PM_neon(uint * buffer,const uchar * src,int index,int count,const QVector<QRgb> *,QDitherInfo *)1324 const uint *QT_FASTCALL fetchARGB32ToARGB32PM_neon(uint *buffer, const uchar *src, int index, int count,
1325                                                    const QVector<QRgb> *, QDitherInfo *)
1326 {
1327     convertARGBToARGB32PM_neon<false>(buffer, reinterpret_cast<const uint *>(src) + index, count);
1328     return buffer;
1329 }
1330 
fetchRGBA8888ToARGB32PM_neon(uint * buffer,const uchar * src,int index,int count,const QVector<QRgb> *,QDitherInfo *)1331 const uint *QT_FASTCALL fetchRGBA8888ToARGB32PM_neon(uint *buffer, const uchar *src, int index, int count,
1332                                                      const QVector<QRgb> *, QDitherInfo *)
1333 {
1334     convertARGBToARGB32PM_neon<true>(buffer, reinterpret_cast<const uint *>(src) + index, count);
1335     return buffer;
1336 }
1337 
convertARGB32ToRGBA64PM_neon(QRgba64 * buffer,const uint * src,int count,const QVector<QRgb> *,QDitherInfo *)1338 const QRgba64 * QT_FASTCALL convertARGB32ToRGBA64PM_neon(QRgba64 *buffer, const uint *src, int count,
1339                                                          const QVector<QRgb> *, QDitherInfo *)
1340 {
1341     convertARGB32ToRGBA64PM_neon<false>(buffer, src, count);
1342     return buffer;
1343 }
1344 
convertRGBA8888ToRGBA64PM_neon(QRgba64 * buffer,const uint * src,int count,const QVector<QRgb> *,QDitherInfo *)1345 const QRgba64 * QT_FASTCALL convertRGBA8888ToRGBA64PM_neon(QRgba64 *buffer, const uint *src, int count,
1346                                                            const QVector<QRgb> *, QDitherInfo *)
1347 {
1348     convertARGB32ToRGBA64PM_neon<true>(buffer, src, count);
1349     return buffer;
1350 }
1351 
fetchARGB32ToRGBA64PM_neon(QRgba64 * buffer,const uchar * src,int index,int count,const QVector<QRgb> *,QDitherInfo *)1352 const QRgba64 *QT_FASTCALL fetchARGB32ToRGBA64PM_neon(QRgba64 *buffer, const uchar *src, int index, int count,
1353                                                       const QVector<QRgb> *, QDitherInfo *)
1354 {
1355     convertARGB32ToRGBA64PM_neon<false>(buffer, reinterpret_cast<const uint *>(src) + index, count);
1356     return buffer;
1357 }
1358 
fetchRGBA8888ToRGBA64PM_neon(QRgba64 * buffer,const uchar * src,int index,int count,const QVector<QRgb> *,QDitherInfo *)1359 const QRgba64 *QT_FASTCALL fetchRGBA8888ToRGBA64PM_neon(QRgba64 *buffer, const uchar *src, int index, int count,
1360                                                         const QVector<QRgb> *, QDitherInfo *)
1361 {
1362     convertARGB32ToRGBA64PM_neon<true>(buffer, reinterpret_cast<const uint *>(src) + index, count);
1363     return buffer;
1364 }
1365 
storeRGB32FromARGB32PM_neon(uchar * dest,const uint * src,int index,int count,const QVector<QRgb> *,QDitherInfo *)1366 void QT_FASTCALL storeRGB32FromARGB32PM_neon(uchar *dest, const uint *src, int index, int count,
1367                                              const QVector<QRgb> *, QDitherInfo *)
1368 {
1369     uint *d = reinterpret_cast<uint *>(dest) + index;
1370     convertARGBFromARGB32PM_neon<false,true>(d, src, count);
1371 }
1372 
storeARGB32FromARGB32PM_neon(uchar * dest,const uint * src,int index,int count,const QVector<QRgb> *,QDitherInfo *)1373 void QT_FASTCALL storeARGB32FromARGB32PM_neon(uchar *dest, const uint *src, int index, int count,
1374                                               const QVector<QRgb> *, QDitherInfo *)
1375 {
1376     uint *d = reinterpret_cast<uint *>(dest) + index;
1377     convertARGBFromARGB32PM_neon<false,false>(d, src, count);
1378 }
1379 
storeRGBA8888FromARGB32PM_neon(uchar * dest,const uint * src,int index,int count,const QVector<QRgb> *,QDitherInfo *)1380 void QT_FASTCALL storeRGBA8888FromARGB32PM_neon(uchar *dest, const uint *src, int index, int count,
1381                                                 const QVector<QRgb> *, QDitherInfo *)
1382 {
1383     uint *d = reinterpret_cast<uint *>(dest) + index;
1384     convertARGBFromARGB32PM_neon<true,false>(d, src, count);
1385 }
1386 
storeRGBXFromARGB32PM_neon(uchar * dest,const uint * src,int index,int count,const QVector<QRgb> *,QDitherInfo *)1387 void QT_FASTCALL storeRGBXFromARGB32PM_neon(uchar *dest, const uint *src, int index, int count,
1388                                             const QVector<QRgb> *, QDitherInfo *)
1389 {
1390     uint *d = reinterpret_cast<uint *>(dest) + index;
1391     convertARGBFromARGB32PM_neon<true,true>(d, src, count);
1392 }
1393 
1394 #endif // Q_BYTE_ORDER == Q_LITTLE_ENDIAN
1395 
1396 QT_END_NAMESPACE
1397 
1398 #endif // __ARM_NEON__
1399 
1400