1 /****************************************************************************
2 **
3 ** Copyright (C) 2015 The Qt Company Ltd.
4 ** Contact: http://www.qt.io/licensing/
5 **
6 ** This file is part of the QtGui module of the Qt Toolkit.
7 **
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** Commercial License Usage
10 ** Licensees holding valid commercial Qt licenses may use this file in
11 ** accordance with the commercial license agreement provided with the
12 ** Software or, alternatively, in accordance with the terms contained in
13 ** a written agreement between you and The Qt Company. For licensing terms
14 ** and conditions see http://www.qt.io/terms-conditions. For further
15 ** information use the contact form at http://www.qt.io/contact-us.
16 **
17 ** GNU Lesser General Public License Usage
18 ** Alternatively, this file may be used under the terms of the GNU Lesser
19 ** General Public License version 2.1 or version 3 as published by the Free
20 ** Software Foundation and appearing in the file LICENSE.LGPLv21 and
21 ** LICENSE.LGPLv3 included in the packaging of this file. Please review the
22 ** following information to ensure the GNU Lesser General Public License
23 ** requirements will be met: https://www.gnu.org/licenses/lgpl.html and
24 ** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
25 **
26 ** As a special exception, The Qt Company gives you certain additional
27 ** rights. These rights are described in The Qt Company LGPL Exception
28 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
29 **
30 ** GNU General Public License Usage
31 ** Alternatively, this file may be used under the terms of the GNU
32 ** General Public License version 3.0 as published by the Free Software
33 ** Foundation and appearing in the file LICENSE.GPL included in the
34 ** packaging of this file.  Please review the following information to
35 ** ensure the GNU General Public License version 3.0 requirements will be
36 ** met: http://www.gnu.org/copyleft/gpl.html.
37 **
38 ** $QT_END_LICENSE$
39 **
40 ****************************************************************************/
41 
42 #include <private/qdrawhelper_p.h>
43 #include <private/qblendfunctions_p.h>
44 #include <private/qmath_p.h>
45 
46 #ifdef QT_HAVE_NEON
47 
48 #include <private/qdrawhelper_neon_p.h>
49 #include <private/qpaintengine_raster_p.h>
50 #include <arm_neon.h>
51 
52 QT_BEGIN_NAMESPACE
53 
qt_memfill32_neon(quint32 * dest,quint32 value,int count)54 void qt_memfill32_neon(quint32 *dest, quint32 value, int count)
55 {
56     const int epilogueSize = count % 16;
57     if (count >= 16) {
58         quint32 *const neonEnd = dest + count - epilogueSize;
59         register uint32x4_t valueVector1 asm ("q0") = vdupq_n_u32(value);
60         register uint32x4_t valueVector2 asm ("q1") = valueVector1;
61         while (dest != neonEnd) {
62             asm volatile (
63                 "vst2.32     { d0, d1, d2, d3 }, [%[DST]] !\n\t"
64                 "vst2.32     { d0, d1, d2, d3 }, [%[DST]] !\n\t"
65                 : [DST]"+r" (dest)
66                 : [VALUE1]"w"(valueVector1), [VALUE2]"w"(valueVector2)
67                 : "memory"
68             );
69         }
70     }
71 
72     switch (epilogueSize)
73     {
74     case 15:     *dest++ = value;
75     case 14:     *dest++ = value;
76     case 13:     *dest++ = value;
77     case 12:     *dest++ = value;
78     case 11:     *dest++ = value;
79     case 10:     *dest++ = value;
80     case 9:      *dest++ = value;
81     case 8:      *dest++ = value;
82     case 7:      *dest++ = value;
83     case 6:      *dest++ = value;
84     case 5:      *dest++ = value;
85     case 4:      *dest++ = value;
86     case 3:      *dest++ = value;
87     case 2:      *dest++ = value;
88     case 1:      *dest++ = value;
89     }
90 }
91 
qvdiv_255_u16(uint16x8_t x,uint16x8_t half)92 static inline uint16x8_t qvdiv_255_u16(uint16x8_t x, uint16x8_t half)
93 {
94     // result = (x + (x >> 8) + 0x80) >> 8
95 
96     const uint16x8_t temp = vshrq_n_u16(x, 8); // x >> 8
97     const uint16x8_t sum_part = vaddq_u16(x, half); // x + 0x80
98     const uint16x8_t sum = vaddq_u16(temp, sum_part);
99 
100     return vshrq_n_u16(sum, 8);
101 }
102 
qvbyte_mul_u16(uint16x8_t x,uint16x8_t alpha,uint16x8_t half)103 static inline uint16x8_t qvbyte_mul_u16(uint16x8_t x, uint16x8_t alpha, uint16x8_t half)
104 {
105     // t = qRound(x * alpha / 255.0)
106 
107     const uint16x8_t t = vmulq_u16(x, alpha); // t
108     return qvdiv_255_u16(t, half);
109 }
110 
qvinterpolate_pixel_255(uint16x8_t x,uint16x8_t a,uint16x8_t y,uint16x8_t b,uint16x8_t half)111 static inline uint16x8_t qvinterpolate_pixel_255(uint16x8_t x, uint16x8_t a, uint16x8_t y, uint16x8_t b, uint16x8_t half)
112 {
113     // t = x * a + y * b
114 
115     const uint16x8_t ta = vmulq_u16(x, a);
116     const uint16x8_t tb = vmulq_u16(y, b);
117 
118     return qvdiv_255_u16(vaddq_u16(ta, tb), half);
119 }
120 
qvsource_over_u16(uint16x8_t src16,uint16x8_t dst16,uint16x8_t half,uint16x8_t full)121 static inline uint16x8_t qvsource_over_u16(uint16x8_t src16, uint16x8_t dst16, uint16x8_t half, uint16x8_t full)
122 {
123     const uint16x4_t alpha16_high = vdup_lane_u16(vget_high_u16(src16), 3);
124     const uint16x4_t alpha16_low = vdup_lane_u16(vget_low_u16(src16), 3);
125 
126     const uint16x8_t alpha16 = vsubq_u16(full, vcombine_u16(alpha16_low, alpha16_high));
127 
128     return vaddq_u16(src16, qvbyte_mul_u16(dst16, alpha16, half));
129 }
130 
131 extern "C" void
132 pixman_composite_over_8888_0565_asm_neon (int32_t   w,
133                                           int32_t   h,
134                                           uint16_t *dst,
135                                           int32_t   dst_stride,
136                                           uint32_t *src,
137                                           int32_t   src_stride);
138 
139 extern "C" void
140 pixman_composite_over_8888_8888_asm_neon (int32_t   w,
141                                           int32_t   h,
142                                           uint32_t *dst,
143                                           int32_t   dst_stride,
144                                           uint32_t *src,
145                                           int32_t   src_stride);
146 
147 extern "C" void
148 pixman_composite_src_0565_8888_asm_neon (int32_t   w,
149                                          int32_t   h,
150                                          uint32_t *dst,
151                                          int32_t   dst_stride,
152                                          uint16_t *src,
153                                          int32_t   src_stride);
154 
155 extern "C" void
156 pixman_composite_over_n_8_0565_asm_neon (int32_t    w,
157                                          int32_t    h,
158                                          uint16_t  *dst,
159                                          int32_t    dst_stride,
160                                          uint32_t   src,
161                                          int32_t    unused,
162                                          uint8_t   *mask,
163                                          int32_t    mask_stride);
164 
165 extern "C" void
166 pixman_composite_scanline_over_asm_neon (int32_t         w,
167                                          const uint32_t *dst,
168                                          const uint32_t *src);
169 
170 extern "C" void
171 pixman_composite_src_0565_0565_asm_neon (int32_t   w,
172                                          int32_t   h,
173                                          uint16_t *dst,
174                                          int32_t   dst_stride,
175                                          uint16_t *src,
176                                          int32_t   src_stride);
177 
178 // qblendfunctions.cpp
179 void qt_blend_argb32_on_rgb16_const_alpha(uchar *destPixels, int dbpl,
180                                           const uchar *srcPixels, int sbpl,
181                                           int w, int h,
182                                           int const_alpha);
183 
qt_blend_rgb16_on_argb32_neon(uchar * destPixels,int dbpl,const uchar * srcPixels,int sbpl,int w,int h,int const_alpha)184 void qt_blend_rgb16_on_argb32_neon(uchar *destPixels, int dbpl,
185                                    const uchar *srcPixels, int sbpl,
186                                    int w, int h,
187                                    int const_alpha)
188 {
189     dbpl /= 4;
190     sbpl /= 2;
191 
192     quint32 *dst = (quint32 *) destPixels;
193     quint16 *src = (quint16 *) srcPixels;
194 
195     if (const_alpha != 256) {
196         quint8 a = (255 * const_alpha) >> 8;
197         quint8 ia = 255 - a;
198 
199         while (h--) {
200             for (int x=0; x<w; ++x)
201                 dst[x] = INTERPOLATE_PIXEL_255(qt_colorConvert(src[x], dst[x]), a, dst[x], ia);
202             dst += dbpl;
203             src += sbpl;
204         }
205         return;
206     }
207 
208     pixman_composite_src_0565_8888_asm_neon(w, h, dst, dbpl, src, sbpl);
209 }
210 
211 // qblendfunctions.cpp
212 void qt_blend_rgb16_on_rgb16(uchar *dst, int dbpl,
213                              const uchar *src, int sbpl,
214                              int w, int h,
215                              int const_alpha);
216 
217 template <int N>
scanLineBlit16(quint16 * dst,quint16 * src,int dstride)218 static inline void scanLineBlit16(quint16 *dst, quint16 *src, int dstride)
219 {
220     if (N >= 2) {
221         ((quint32 *)dst)[0] = ((quint32 *)src)[0];
222         __builtin_prefetch(dst + dstride, 1, 0);
223     }
224     for (int i = 1; i < N/2; ++i)
225         ((quint32 *)dst)[i] = ((quint32 *)src)[i];
226     if (N & 1)
227         dst[N-1] = src[N-1];
228 }
229 
230 template <int Width>
blockBlit16(quint16 * dst,quint16 * src,int dstride,int sstride,int h)231 static inline void blockBlit16(quint16 *dst, quint16 *src, int dstride, int sstride, int h)
232 {
233     union {
234         quintptr address;
235         quint16 *pointer;
236     } u;
237 
238     u.pointer = dst;
239 
240     if (u.address & 2) {
241         while (h--) {
242             // align dst
243             dst[0] = src[0];
244             if (Width > 1)
245                 scanLineBlit16<Width-1>(dst + 1, src + 1, dstride);
246             dst += dstride;
247             src += sstride;
248         }
249     } else {
250         while (h--) {
251             scanLineBlit16<Width>(dst, src, dstride);
252 
253             dst += dstride;
254             src += sstride;
255         }
256     }
257 }
258 
qt_blend_rgb16_on_rgb16_neon(uchar * destPixels,int dbpl,const uchar * srcPixels,int sbpl,int w,int h,int const_alpha)259 void qt_blend_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl,
260                                   const uchar *srcPixels, int sbpl,
261                                   int w, int h,
262                                   int const_alpha)
263 {
264     // testing show that the default memcpy is faster for widths 150 and up
265     if (const_alpha != 256 || w >= 150) {
266         qt_blend_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha);
267         return;
268     }
269 
270     int dstride = dbpl / 2;
271     int sstride = sbpl / 2;
272 
273     quint16 *dst = (quint16 *) destPixels;
274     quint16 *src = (quint16 *) srcPixels;
275 
276     switch (w) {
277 #define BLOCKBLIT(n) case n: blockBlit16<n>(dst, src, dstride, sstride, h); return;
278     BLOCKBLIT(1);
279     BLOCKBLIT(2);
280     BLOCKBLIT(3);
281     BLOCKBLIT(4);
282     BLOCKBLIT(5);
283     BLOCKBLIT(6);
284     BLOCKBLIT(7);
285     BLOCKBLIT(8);
286     BLOCKBLIT(9);
287     BLOCKBLIT(10);
288     BLOCKBLIT(11);
289     BLOCKBLIT(12);
290     BLOCKBLIT(13);
291     BLOCKBLIT(14);
292     BLOCKBLIT(15);
293 #undef BLOCKBLIT
294     default:
295         break;
296     }
297 
298     pixman_composite_src_0565_0565_asm_neon (w, h, dst, dstride, src, sstride);
299 }
300 
301 extern "C" void blend_8_pixels_argb32_on_rgb16_neon(quint16 *dst, const quint32 *src, int const_alpha);
302 
qt_blend_argb32_on_rgb16_neon(uchar * destPixels,int dbpl,const uchar * srcPixels,int sbpl,int w,int h,int const_alpha)303 void qt_blend_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
304                                    const uchar *srcPixels, int sbpl,
305                                    int w, int h,
306                                    int const_alpha)
307 {
308     quint16 *dst = (quint16 *) destPixels;
309     quint32 *src = (quint32 *) srcPixels;
310 
311     if (const_alpha != 256) {
312         for (int y=0; y<h; ++y) {
313             int i = 0;
314             for (; i < w-7; i += 8)
315                 blend_8_pixels_argb32_on_rgb16_neon(&dst[i], &src[i], const_alpha);
316 
317             if (i < w) {
318                 int tail = w - i;
319 
320                 quint16 dstBuffer[8];
321                 quint32 srcBuffer[8];
322 
323                 for (int j = 0; j < tail; ++j) {
324                     dstBuffer[j] = dst[i + j];
325                     srcBuffer[j] = src[i + j];
326                 }
327 
328                 blend_8_pixels_argb32_on_rgb16_neon(dstBuffer, srcBuffer, const_alpha);
329 
330                 for (int j = 0; j < tail; ++j)
331                     dst[i + j] = dstBuffer[j];
332             }
333 
334             dst = (quint16 *)(((uchar *) dst) + dbpl);
335             src = (quint32 *)(((uchar *) src) + sbpl);
336         }
337         return;
338     }
339 
340     pixman_composite_over_8888_0565_asm_neon(w, h, dst, dbpl / 2, src, sbpl / 4);
341 }
342 
qt_blend_argb32_on_argb32_scanline_neon(uint * dest,const uint * src,int length,uint const_alpha)343 void qt_blend_argb32_on_argb32_scanline_neon(uint *dest, const uint *src, int length, uint const_alpha)
344 {
345     if (const_alpha == 255) {
346         pixman_composite_scanline_over_asm_neon(length, dest, src);
347     } else {
348         qt_blend_argb32_on_argb32_neon((uchar *)dest, 4 * length, (uchar *)src, 4 * length, length, 1, (const_alpha * 256) / 255);
349     }
350 }
351 
qt_blend_argb32_on_argb32_neon(uchar * destPixels,int dbpl,const uchar * srcPixels,int sbpl,int w,int h,int const_alpha)352 void qt_blend_argb32_on_argb32_neon(uchar *destPixels, int dbpl,
353                                     const uchar *srcPixels, int sbpl,
354                                     int w, int h,
355                                     int const_alpha)
356 {
357     const uint *src = (const uint *) srcPixels;
358     uint *dst = (uint *) destPixels;
359     uint16x8_t half = vdupq_n_u16(0x80);
360     uint16x8_t full = vdupq_n_u16(0xff);
361     if (const_alpha == 256) {
362         pixman_composite_over_8888_8888_asm_neon(w, h, (uint32_t *)destPixels, dbpl / 4, (uint32_t *)srcPixels, sbpl / 4);
363     } else if (const_alpha != 0) {
364         const_alpha = (const_alpha * 255) >> 8;
365         uint16x8_t const_alpha16 = vdupq_n_u16(const_alpha);
366         for (int y = 0; y < h; ++y) {
367             int x = 0;
368             for (; x < w-3; x += 4) {
369                 if (src[x] | src[x+1] | src[x+2] | src[x+3]) {
370                     uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
371                     uint32x4_t dst32 = vld1q_u32((uint32_t *)&dst[x]);
372 
373                     const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
374                     const uint8x16_t dst8 = vreinterpretq_u8_u32(dst32);
375 
376                     const uint8x8_t src8_low = vget_low_u8(src8);
377                     const uint8x8_t dst8_low = vget_low_u8(dst8);
378 
379                     const uint8x8_t src8_high = vget_high_u8(src8);
380                     const uint8x8_t dst8_high = vget_high_u8(dst8);
381 
382                     const uint16x8_t src16_low = vmovl_u8(src8_low);
383                     const uint16x8_t dst16_low = vmovl_u8(dst8_low);
384 
385                     const uint16x8_t src16_high = vmovl_u8(src8_high);
386                     const uint16x8_t dst16_high = vmovl_u8(dst8_high);
387 
388                     const uint16x8_t srcalpha16_low = qvbyte_mul_u16(src16_low, const_alpha16, half);
389                     const uint16x8_t srcalpha16_high = qvbyte_mul_u16(src16_high, const_alpha16, half);
390 
391                     const uint16x8_t result16_low = qvsource_over_u16(srcalpha16_low, dst16_low, half, full);
392                     const uint16x8_t result16_high = qvsource_over_u16(srcalpha16_high, dst16_high, half, full);
393 
394                     const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
395                     const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));
396 
397                     vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
398                 }
399             }
400             for (; x<w; ++x) {
401                 uint s = src[x];
402                 if (s != 0) {
403                     s = BYTE_MUL(s, const_alpha);
404                     dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
405                 }
406             }
407             dst = (quint32 *)(((uchar *) dst) + dbpl);
408             src = (const quint32 *)(((const uchar *) src) + sbpl);
409         }
410     }
411 }
412 
413 // qblendfunctions.cpp
414 void qt_blend_rgb32_on_rgb32(uchar *destPixels, int dbpl,
415                              const uchar *srcPixels, int sbpl,
416                              int w, int h,
417                              int const_alpha);
418 
qt_blend_rgb32_on_rgb32_neon(uchar * destPixels,int dbpl,const uchar * srcPixels,int sbpl,int w,int h,int const_alpha)419 void qt_blend_rgb32_on_rgb32_neon(uchar *destPixels, int dbpl,
420                                   const uchar *srcPixels, int sbpl,
421                                   int w, int h,
422                                   int const_alpha)
423 {
424     if (const_alpha != 256) {
425         if (const_alpha != 0) {
426             const uint *src = (const uint *) srcPixels;
427             uint *dst = (uint *) destPixels;
428             uint16x8_t half = vdupq_n_u16(0x80);
429             const_alpha = (const_alpha * 255) >> 8;
430             int one_minus_const_alpha = 255 - const_alpha;
431             uint16x8_t const_alpha16 = vdupq_n_u16(const_alpha);
432             uint16x8_t one_minus_const_alpha16 = vdupq_n_u16(255 - const_alpha);
433             for (int y = 0; y < h; ++y) {
434                 int x = 0;
435                 for (; x < w-3; x += 4) {
436                     uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
437                     uint32x4_t dst32 = vld1q_u32((uint32_t *)&dst[x]);
438 
439                     const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
440                     const uint8x16_t dst8 = vreinterpretq_u8_u32(dst32);
441 
442                     const uint8x8_t src8_low = vget_low_u8(src8);
443                     const uint8x8_t dst8_low = vget_low_u8(dst8);
444 
445                     const uint8x8_t src8_high = vget_high_u8(src8);
446                     const uint8x8_t dst8_high = vget_high_u8(dst8);
447 
448                     const uint16x8_t src16_low = vmovl_u8(src8_low);
449                     const uint16x8_t dst16_low = vmovl_u8(dst8_low);
450 
451                     const uint16x8_t src16_high = vmovl_u8(src8_high);
452                     const uint16x8_t dst16_high = vmovl_u8(dst8_high);
453 
454                     const uint16x8_t result16_low = qvinterpolate_pixel_255(src16_low, const_alpha16, dst16_low, one_minus_const_alpha16, half);
455                     const uint16x8_t result16_high = qvinterpolate_pixel_255(src16_high, const_alpha16, dst16_high, one_minus_const_alpha16, half);
456 
457                     const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
458                     const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));
459 
460                     vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
461                 }
462                 for (; x<w; ++x) {
463                     uint s = src[x];
464                     s = BYTE_MUL(s, const_alpha);
465                     dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);
466                 }
467                 dst = (quint32 *)(((uchar *) dst) + dbpl);
468                 src = (const quint32 *)(((const uchar *) src) + sbpl);
469             }
470         }
471     } else {
472         qt_blend_rgb32_on_rgb32(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha);
473     }
474 }
475 
qt_alphamapblit_quint16_neon(QRasterBuffer * rasterBuffer,int x,int y,quint32 color,const uchar * bitmap,int mapWidth,int mapHeight,int mapStride,const QClipData *)476 void qt_alphamapblit_quint16_neon(QRasterBuffer *rasterBuffer,
477                                   int x, int y, quint32 color,
478                                   const uchar *bitmap,
479                                   int mapWidth, int mapHeight, int mapStride,
480                                   const QClipData *)
481 {
482     quint16 *dest = reinterpret_cast<quint16*>(rasterBuffer->scanLine(y)) + x;
483     const int destStride = rasterBuffer->bytesPerLine() / sizeof(quint16);
484 
485     uchar *mask = const_cast<uchar *>(bitmap);
486 
487     pixman_composite_over_n_8_0565_asm_neon(mapWidth, mapHeight, dest, destStride, color, 0, mask, mapStride);
488 }
489 
490 extern "C" void blend_8_pixels_rgb16_on_rgb16_neon(quint16 *dst, const quint16 *src, int const_alpha);
491 
492 template <typename SRC, typename BlendFunc>
493 struct Blend_on_RGB16_SourceAndConstAlpha_Neon {
Blend_on_RGB16_SourceAndConstAlpha_NeonBlend_on_RGB16_SourceAndConstAlpha_Neon494     Blend_on_RGB16_SourceAndConstAlpha_Neon(BlendFunc blender, int const_alpha)
495         : m_index(0)
496         , m_blender(blender)
497         , m_const_alpha(const_alpha)
498     {
499     }
500 
writeBlend_on_RGB16_SourceAndConstAlpha_Neon501     inline void write(quint16 *dst, quint32 src)
502     {
503         srcBuffer[m_index++] = src;
504 
505         if (m_index == 8) {
506             m_blender(dst - 7, srcBuffer, m_const_alpha);
507             m_index = 0;
508         }
509     }
510 
flushBlend_on_RGB16_SourceAndConstAlpha_Neon511     inline void flush(quint16 *dst)
512     {
513         if (m_index > 0) {
514             quint16 dstBuffer[8];
515             for (int i = 0; i < m_index; ++i)
516                 dstBuffer[i] = dst[i - m_index];
517 
518             m_blender(dstBuffer, srcBuffer, m_const_alpha);
519 
520             for (int i = 0; i < m_index; ++i)
521                 dst[i - m_index] = dstBuffer[i];
522 
523             m_index = 0;
524         }
525     }
526 
527     SRC srcBuffer[8];
528 
529     int m_index;
530     BlendFunc m_blender;
531     int m_const_alpha;
532 };
533 
534 template <typename SRC, typename BlendFunc>
535 Blend_on_RGB16_SourceAndConstAlpha_Neon<SRC, BlendFunc>
Blend_on_RGB16_SourceAndConstAlpha_Neon_create(BlendFunc blender,int const_alpha)536 Blend_on_RGB16_SourceAndConstAlpha_Neon_create(BlendFunc blender, int const_alpha)
537 {
538     return Blend_on_RGB16_SourceAndConstAlpha_Neon<SRC, BlendFunc>(blender, const_alpha);
539 }
540 
qt_scale_image_argb32_on_rgb16_neon(uchar * destPixels,int dbpl,const uchar * srcPixels,int sbpl,int sh,const QRectF & targetRect,const QRectF & sourceRect,const QRect & clip,int const_alpha)541 void qt_scale_image_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
542                                          const uchar *srcPixels, int sbpl, int sh,
543                                          const QRectF &targetRect,
544                                          const QRectF &sourceRect,
545                                          const QRect &clip,
546                                          int const_alpha)
547 {
548     if (const_alpha == 0)
549         return;
550 
551     qt_scale_image_16bit<quint32>(destPixels, dbpl, srcPixels, sbpl, sh, targetRect, sourceRect, clip,
552         Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint32>(blend_8_pixels_argb32_on_rgb16_neon, const_alpha));
553 }
554 
555 void qt_scale_image_rgb16_on_rgb16(uchar *destPixels, int dbpl,
556                                    const uchar *srcPixels, int sbpl, int sh,
557                                    const QRectF &targetRect,
558                                    const QRectF &sourceRect,
559                                    const QRect &clip,
560                                    int const_alpha);
561 
qt_scale_image_rgb16_on_rgb16_neon(uchar * destPixels,int dbpl,const uchar * srcPixels,int sbpl,int sh,const QRectF & targetRect,const QRectF & sourceRect,const QRect & clip,int const_alpha)562 void qt_scale_image_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl,
563                                         const uchar *srcPixels, int sbpl, int sh,
564                                         const QRectF &targetRect,
565                                         const QRectF &sourceRect,
566                                         const QRect &clip,
567                                         int const_alpha)
568 {
569     if (const_alpha == 0)
570         return;
571 
572     if (const_alpha == 256) {
573         qt_scale_image_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, sh, targetRect, sourceRect, clip, const_alpha);
574         return;
575     }
576 
577     qt_scale_image_16bit<quint16>(destPixels, dbpl, srcPixels, sbpl, sh, targetRect, sourceRect, clip,
578         Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint16>(blend_8_pixels_rgb16_on_rgb16_neon, const_alpha));
579 }
580 
581 extern void qt_transform_image_rgb16_on_rgb16(uchar *destPixels, int dbpl,
582                                               const uchar *srcPixels, int sbpl,
583                                               const QRectF &targetRect,
584                                               const QRectF &sourceRect,
585                                               const QRect &clip,
586                                               const QTransform &targetRectTransform,
587                                               int const_alpha);
588 
qt_transform_image_rgb16_on_rgb16_neon(uchar * destPixels,int dbpl,const uchar * srcPixels,int sbpl,const QRectF & targetRect,const QRectF & sourceRect,const QRect & clip,const QTransform & targetRectTransform,int const_alpha)589 void qt_transform_image_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl,
590                                             const uchar *srcPixels, int sbpl,
591                                             const QRectF &targetRect,
592                                             const QRectF &sourceRect,
593                                             const QRect &clip,
594                                             const QTransform &targetRectTransform,
595                                             int const_alpha)
596 {
597     if (const_alpha == 0)
598         return;
599 
600     if (const_alpha == 256) {
601         qt_transform_image_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, targetRect, sourceRect, clip, targetRectTransform, const_alpha);
602         return;
603     }
604 
605     qt_transform_image(reinterpret_cast<quint16 *>(destPixels), dbpl,
606                        reinterpret_cast<const quint16 *>(srcPixels), sbpl, targetRect, sourceRect, clip, targetRectTransform,
607         Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint16>(blend_8_pixels_rgb16_on_rgb16_neon, const_alpha));
608 }
609 
qt_transform_image_argb32_on_rgb16_neon(uchar * destPixels,int dbpl,const uchar * srcPixels,int sbpl,const QRectF & targetRect,const QRectF & sourceRect,const QRect & clip,const QTransform & targetRectTransform,int const_alpha)610 void qt_transform_image_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
611                                              const uchar *srcPixels, int sbpl,
612                                              const QRectF &targetRect,
613                                              const QRectF &sourceRect,
614                                              const QRect &clip,
615                                              const QTransform &targetRectTransform,
616                                              int const_alpha)
617 {
618     if (const_alpha == 0)
619         return;
620 
621     qt_transform_image(reinterpret_cast<quint16 *>(destPixels), dbpl,
622                        reinterpret_cast<const quint32 *>(srcPixels), sbpl, targetRect, sourceRect, clip, targetRectTransform,
623         Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint32>(blend_8_pixels_argb32_on_rgb16_neon, const_alpha));
624 }
625 
convert_8_pixels_rgb16_to_argb32(quint32 * dst,const quint16 * src)626 static inline void convert_8_pixels_rgb16_to_argb32(quint32 *dst, const quint16 *src)
627 {
628     asm volatile (
629         "vld1.16     { d0, d1 }, [%[SRC]]\n\t"
630 
631         /* convert 8 r5g6b5 pixel data from {d0, d1} to planar 8-bit format
632            and put data into d4 - red, d3 - green, d2 - blue */
633         "vshrn.u16   d4,  q0,  #8\n\t"
634         "vshrn.u16   d3,  q0,  #3\n\t"
635         "vsli.u16    q0,  q0,  #5\n\t"
636         "vsri.u8     d4,  d4,  #5\n\t"
637         "vsri.u8     d3,  d3,  #6\n\t"
638         "vshrn.u16   d2,  q0,  #2\n\t"
639 
640         /* fill d5 - alpha with 0xff */
641         "mov         r2, #255\n\t"
642         "vdup.8      d5, r2\n\t"
643 
644         "vst4.8      { d2, d3, d4, d5 }, [%[DST]]"
645         : : [DST]"r" (dst), [SRC]"r" (src)
646         : "memory", "r2", "d0", "d1", "d2", "d3", "d4", "d5"
647     );
648 }
649 
qt_destFetchRGB16_neon(uint * buffer,QRasterBuffer * rasterBuffer,int x,int y,int length)650 uint * QT_FASTCALL qt_destFetchRGB16_neon(uint *buffer, QRasterBuffer *rasterBuffer, int x, int y, int length)
651 {
652     const ushort *data = (const ushort *)rasterBuffer->scanLine(y) + x;
653 
654     int i = 0;
655     for (; i < length - 7; i += 8)
656         convert_8_pixels_rgb16_to_argb32(&buffer[i], &data[i]);
657 
658     if (i < length) {
659         quint16 srcBuffer[8];
660         quint32 dstBuffer[8];
661 
662         int tail = length - i;
663         for (int j = 0; j < tail; ++j)
664             srcBuffer[j] = data[i + j];
665 
666         convert_8_pixels_rgb16_to_argb32(dstBuffer, srcBuffer);
667 
668         for (int j = 0; j < tail; ++j)
669             buffer[i + j] = dstBuffer[j];
670     }
671 
672     return buffer;
673 }
674 
convert_8_pixels_argb32_to_rgb16(quint16 * dst,const quint32 * src)675 static inline void convert_8_pixels_argb32_to_rgb16(quint16 *dst, const quint32 *src)
676 {
677     asm volatile (
678         "vld4.8      { d0, d1, d2, d3 }, [%[SRC]]\n\t"
679 
680         /* convert to r5g6b5 and store it into {d28, d29} */
681         "vshll.u8    q14, d2, #8\n\t"
682         "vshll.u8    q8,  d1, #8\n\t"
683         "vshll.u8    q9,  d0, #8\n\t"
684         "vsri.u16    q14, q8, #5\n\t"
685         "vsri.u16    q14, q9, #11\n\t"
686 
687         "vst1.16     { d28, d29 }, [%[DST]]"
688         : : [DST]"r" (dst), [SRC]"r" (src)
689         : "memory", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d28", "d29"
690     );
691 }
692 
qt_destStoreRGB16_neon(QRasterBuffer * rasterBuffer,int x,int y,const uint * buffer,int length)693 void QT_FASTCALL qt_destStoreRGB16_neon(QRasterBuffer *rasterBuffer, int x, int y, const uint *buffer, int length)
694 {
695     quint16 *data = (quint16*)rasterBuffer->scanLine(y) + x;
696 
697     int i = 0;
698     for (; i < length - 7; i += 8)
699         convert_8_pixels_argb32_to_rgb16(&data[i], &buffer[i]);
700 
701     if (i < length) {
702         quint32 srcBuffer[8];
703         quint16 dstBuffer[8];
704 
705         int tail = length - i;
706         for (int j = 0; j < tail; ++j)
707             srcBuffer[j] = buffer[i + j];
708 
709         convert_8_pixels_argb32_to_rgb16(dstBuffer, srcBuffer);
710 
711         for (int j = 0; j < tail; ++j)
712             data[i + j] = dstBuffer[j];
713     }
714 }
715 
comp_func_solid_SourceOver_neon(uint * destPixels,int length,uint color,uint const_alpha)716 void QT_FASTCALL comp_func_solid_SourceOver_neon(uint *destPixels, int length, uint color, uint const_alpha)
717 {
718     if ((const_alpha & qAlpha(color)) == 255) {
719         QT_MEMFILL_UINT(destPixels, length, color);
720     } else {
721         if (const_alpha != 255)
722             color = BYTE_MUL(color, const_alpha);
723 
724         const quint32 minusAlphaOfColor = qAlpha(~color);
725         int x = 0;
726 
727         uint32_t *dst = (uint32_t *) destPixels;
728         const uint32x4_t colorVector = vdupq_n_u32(color);
729         uint16x8_t half = vdupq_n_u16(0x80);
730         const uint16x8_t minusAlphaOfColorVector = vdupq_n_u16(minusAlphaOfColor);
731 
732         for (; x < length-3; x += 4) {
733             uint32x4_t dstVector = vld1q_u32(&dst[x]);
734 
735             const uint8x16_t dst8 = vreinterpretq_u8_u32(dstVector);
736 
737             const uint8x8_t dst8_low = vget_low_u8(dst8);
738             const uint8x8_t dst8_high = vget_high_u8(dst8);
739 
740             const uint16x8_t dst16_low = vmovl_u8(dst8_low);
741             const uint16x8_t dst16_high = vmovl_u8(dst8_high);
742 
743             const uint16x8_t result16_low = qvbyte_mul_u16(dst16_low, minusAlphaOfColorVector, half);
744             const uint16x8_t result16_high = qvbyte_mul_u16(dst16_high, minusAlphaOfColorVector, half);
745 
746             const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
747             const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));
748 
749             uint32x4_t blendedPixels = vcombine_u32(result32_low, result32_high);
750             uint32x4_t colorPlusBlendedPixels = vaddq_u32(colorVector, blendedPixels);
751             vst1q_u32(&dst[x], colorPlusBlendedPixels);
752         }
753 
754         for (;x < length; ++x)
755             destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
756     }
757 }
758 
comp_func_Plus_neon(uint * dst,const uint * src,int length,uint const_alpha)759 void QT_FASTCALL comp_func_Plus_neon(uint *dst, const uint *src, int length, uint const_alpha)
760 {
761     if (const_alpha == 255) {
762         uint *const end = dst + length;
763         uint *const neonEnd = end - 3;
764 
765         while (dst < neonEnd) {
766             asm volatile (
767                 "vld2.8     { d0, d1 }, [%[SRC]] !\n\t"
768                 "vld2.8     { d2, d3 }, [%[DST]]\n\t"
769                 "vqadd.u8 q0, q0, q1\n\t"
770                 "vst2.8     { d0, d1 }, [%[DST]] !\n\t"
771                 : [DST]"+r" (dst), [SRC]"+r" (src)
772                 :
773                 : "memory", "d0", "d1", "d2", "d3", "q0", "q1"
774             );
775         }
776 
777         while (dst != end) {
778             *dst = comp_func_Plus_one_pixel(*dst, *src);
779             ++dst;
780             ++src;
781         }
782     } else {
783         int x = 0;
784         const int one_minus_const_alpha = 255 - const_alpha;
785         const uint16x8_t constAlphaVector = vdupq_n_u16(const_alpha);
786         const uint16x8_t oneMinusconstAlphaVector = vdupq_n_u16(one_minus_const_alpha);
787 
788         const uint16x8_t half = vdupq_n_u16(0x80);
789         for (; x < length - 3; x += 4) {
790             const uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
791             const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
792             uint8x16_t dst8 = vld1q_u8((uint8_t *)&dst[x]);
793             uint8x16_t result = vqaddq_u8(dst8, src8);
794 
795             uint16x8_t result_low = vmovl_u8(vget_low_u8(result));
796             uint16x8_t result_high = vmovl_u8(vget_high_u8(result));
797 
798             uint16x8_t dst_low = vmovl_u8(vget_low_u8(dst8));
799             uint16x8_t dst_high = vmovl_u8(vget_high_u8(dst8));
800 
801             result_low = qvinterpolate_pixel_255(result_low, constAlphaVector, dst_low, oneMinusconstAlphaVector, half);
802             result_high = qvinterpolate_pixel_255(result_high, constAlphaVector, dst_high, oneMinusconstAlphaVector, half);
803 
804             const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result_low));
805             const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result_high));
806             vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
807         }
808 
809         for (; x < length; ++x)
810             dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x], const_alpha, one_minus_const_alpha);
811     }
812 }
813 
814 static const int tileSize = 32;
815 
816 extern "C" void qt_rotate90_16_neon(quint16 *dst, const quint16 *src, int sstride, int dstride, int count);
817 
qt_memrotate90_16_neon(const uchar * srcPixels,int w,int h,int sstride,uchar * destPixels,int dstride)818 void qt_memrotate90_16_neon(const uchar *srcPixels, int w, int h, int sstride, uchar *destPixels, int dstride)
819 {
820     const ushort *src = (const ushort *)srcPixels;
821     ushort *dest = (ushort *)destPixels;
822 
823     sstride /= sizeof(ushort);
824     dstride /= sizeof(ushort);
825 
826     const int pack = sizeof(quint32) / sizeof(ushort);
827     const int unaligned =
828         qMin(uint((quintptr(dest) & (sizeof(quint32)-1)) / sizeof(ushort)), uint(h));
829     const int restX = w % tileSize;
830     const int restY = (h - unaligned) % tileSize;
831     const int unoptimizedY = restY % pack;
832     const int numTilesX = w / tileSize + (restX > 0);
833     const int numTilesY = (h - unaligned) / tileSize + (restY >= pack);
834 
835     for (int tx = 0; tx < numTilesX; ++tx) {
836         const int startx = w - tx * tileSize - 1;
837         const int stopx = qMax(startx - tileSize, 0);
838 
839         if (unaligned) {
840             for (int x = startx; x >= stopx; --x) {
841                 ushort *d = dest + (w - x - 1) * dstride;
842                 for (int y = 0; y < unaligned; ++y) {
843                     *d++ = src[y * sstride + x];
844                 }
845             }
846         }
847 
848         for (int ty = 0; ty < numTilesY; ++ty) {
849             const int starty = ty * tileSize + unaligned;
850             const int stopy = qMin(starty + tileSize, h - unoptimizedY);
851 
852             int x = startx;
853             // qt_rotate90_16_neon writes to eight rows, four pixels at a time
854             for (; x >= stopx + 7; x -= 8) {
855                 ushort *d = dest + (w - x - 1) * dstride + starty;
856                 const ushort *s = &src[starty * sstride + x - 7];
857                 qt_rotate90_16_neon(d, s, sstride * 2, dstride * 2, stopy - starty);
858             }
859 
860             for (; x >= stopx; --x) {
861                 quint32 *d = reinterpret_cast<quint32*>(dest + (w - x - 1) * dstride + starty);
862                 for (int y = starty; y < stopy; y += pack) {
863                     quint32 c = src[y * sstride + x];
864                     for (int i = 1; i < pack; ++i) {
865                         const int shift = (sizeof(int) * 8 / pack * i);
866                         const ushort color = src[(y + i) * sstride + x];
867                         c |= color << shift;
868                     }
869                     *d++ = c;
870                 }
871             }
872         }
873 
874         if (unoptimizedY) {
875             const int starty = h - unoptimizedY;
876             for (int x = startx; x >= stopx; --x) {
877                 ushort *d = dest + (w - x - 1) * dstride + starty;
878                 for (int y = starty; y < h; ++y) {
879                     *d++ = src[y * sstride + x];
880                 }
881             }
882         }
883     }
884 }
885 
886 extern "C" void qt_rotate270_16_neon(quint16 *dst, const quint16 *src, int sstride, int dstride, int count);
887 
qt_memrotate270_16_neon(const uchar * srcPixels,int w,int h,int sstride,uchar * destPixels,int dstride)888 void qt_memrotate270_16_neon(const uchar *srcPixels, int w, int h,
889                              int sstride,
890                              uchar *destPixels, int dstride)
891 {
892     const ushort *src = (const ushort *)srcPixels;
893     ushort *dest = (ushort *)destPixels;
894 
895     sstride /= sizeof(ushort);
896     dstride /= sizeof(ushort);
897 
898     const int pack = sizeof(quint32) / sizeof(ushort);
899     const int unaligned =
900         qMin(uint((long(dest) & (sizeof(quint32)-1)) / sizeof(ushort)), uint(h));
901     const int restX = w % tileSize;
902     const int restY = (h - unaligned) % tileSize;
903     const int unoptimizedY = restY % pack;
904     const int numTilesX = w / tileSize + (restX > 0);
905     const int numTilesY = (h - unaligned) / tileSize + (restY >= pack);
906 
907     for (int tx = 0; tx < numTilesX; ++tx) {
908         const int startx = tx * tileSize;
909         const int stopx = qMin(startx + tileSize, w);
910 
911         if (unaligned) {
912             for (int x = startx; x < stopx; ++x) {
913                 ushort *d = dest + x * dstride;
914                 for (int y = h - 1; y >= h - unaligned; --y) {
915                     *d++ = src[y * sstride + x];
916                 }
917             }
918         }
919 
920         for (int ty = 0; ty < numTilesY; ++ty) {
921             const int starty = h - 1 - unaligned - ty * tileSize;
922             const int stopy = qMax(starty - tileSize, unoptimizedY);
923 
924             int x = startx;
925             // qt_rotate90_16_neon writes to eight rows, four pixels at a time
926             for (; x < stopx - 7; x += 8) {
927                 ushort *d = dest + x * dstride + h - 1 - starty;
928                 const ushort *s = &src[starty * sstride + x];
929                 qt_rotate90_16_neon(d + 7 * dstride, s, -sstride * 2, -dstride * 2, starty - stopy);
930             }
931 
932             for (; x < stopx; ++x) {
933                 quint32 *d = reinterpret_cast<quint32*>(dest + x * dstride
934                                                         + h - 1 - starty);
935                 for (int y = starty; y > stopy; y -= pack) {
936                     quint32 c = src[y * sstride + x];
937                     for (int i = 1; i < pack; ++i) {
938                         const int shift = (sizeof(int) * 8 / pack * i);
939                         const ushort color = src[(y - i) * sstride + x];
940                         c |= color << shift;
941                     }
942                     *d++ = c;
943                 }
944             }
945         }
946         if (unoptimizedY) {
947             const int starty = unoptimizedY - 1;
948             for (int x = startx; x < stopx; ++x) {
949                 ushort *d = dest + x * dstride + h - 1 - starty;
950                 for (int y = starty; y >= 0; --y) {
951                     *d++ = src[y * sstride + x];
952                 }
953             }
954         }
955     }
956 }
957 
958 class QSimdNeon
959 {
960 public:
961     typedef int32x4_t Int32x4;
962     typedef float32x4_t Float32x4;
963 
964     union Vect_buffer_i { Int32x4 v; int i[4]; };
965     union Vect_buffer_f { Float32x4 v; float f[4]; };
966 
v_dup(float x)967     static inline Float32x4 v_dup(float x) { return vdupq_n_f32(x); }
v_dup(int x)968     static inline Int32x4 v_dup(int x) { return vdupq_n_s32(x); }
v_dup(uint x)969     static inline Int32x4 v_dup(uint x) { return vdupq_n_s32(x); }
970 
v_add(Float32x4 a,Float32x4 b)971     static inline Float32x4 v_add(Float32x4 a, Float32x4 b) { return vaddq_f32(a, b); }
v_add(Int32x4 a,Int32x4 b)972     static inline Int32x4 v_add(Int32x4 a, Int32x4 b) { return vaddq_s32(a, b); }
973 
v_max(Float32x4 a,Float32x4 b)974     static inline Float32x4 v_max(Float32x4 a, Float32x4 b) { return vmaxq_f32(a, b); }
v_min(Float32x4 a,Float32x4 b)975     static inline Float32x4 v_min(Float32x4 a, Float32x4 b) { return vminq_f32(a, b); }
v_min_16(Int32x4 a,Int32x4 b)976     static inline Int32x4 v_min_16(Int32x4 a, Int32x4 b) { return vminq_s32(a, b); }
977 
v_and(Int32x4 a,Int32x4 b)978     static inline Int32x4 v_and(Int32x4 a, Int32x4 b) { return vandq_s32(a, b); }
979 
v_sub(Float32x4 a,Float32x4 b)980     static inline Float32x4 v_sub(Float32x4 a, Float32x4 b) { return vsubq_f32(a, b); }
v_sub(Int32x4 a,Int32x4 b)981     static inline Int32x4 v_sub(Int32x4 a, Int32x4 b) { return vsubq_s32(a, b); }
982 
v_mul(Float32x4 a,Float32x4 b)983     static inline Float32x4 v_mul(Float32x4 a, Float32x4 b) { return vmulq_f32(a, b); }
984 
v_sqrt(Float32x4 x)985     static inline Float32x4 v_sqrt(Float32x4 x) { Float32x4 y = vrsqrteq_f32(x); y = vmulq_f32(y, vrsqrtsq_f32(x, vmulq_f32(y, y))); return vmulq_f32(x, y); }
986 
v_toInt(Float32x4 x)987     static inline Int32x4 v_toInt(Float32x4 x) { return vcvtq_s32_f32(x); }
988 
v_greaterOrEqual(Float32x4 a,Float32x4 b)989     static inline Int32x4 v_greaterOrEqual(Float32x4 a, Float32x4 b) { return vreinterpretq_s32_u32(vcgeq_f32(a, b)); }
990 };
991 
qt_fetch_radial_gradient_neon(uint * buffer,const Operator * op,const QSpanData * data,int y,int x,int length)992 const uint * QT_FASTCALL qt_fetch_radial_gradient_neon(uint *buffer, const Operator *op, const QSpanData *data,
993                                                        int y, int x, int length)
994 {
995     return qt_fetch_radial_gradient_template<QRadialFetchSimd<QSimdNeon> >(buffer, op, data, y, x, length);
996 }
997 
998 QT_END_NAMESPACE
999 
1000 #endif // QT_HAVE_NEON
1001 
1002