1 /****************************************************************************
2 **
3 ** Copyright (C) 2016 The Qt Company Ltd.
4 ** Contact: https://www.qt.io/licensing/
5 **
6 ** This file is part of the QtGui module of the Qt Toolkit.
7 **
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** Commercial License Usage
10 ** Licensees holding valid commercial Qt licenses may use this file in
11 ** accordance with the commercial license agreement provided with the
12 ** Software or, alternatively, in accordance with the terms contained in
13 ** a written agreement between you and The Qt Company. For licensing terms
14 ** and conditions see https://www.qt.io/terms-conditions. For further
15 ** information use the contact form at https://www.qt.io/contact-us.
16 **
17 ** GNU Lesser General Public License Usage
18 ** Alternatively, this file may be used under the terms of the GNU Lesser
19 ** General Public License version 3 as published by the Free Software
20 ** Foundation and appearing in the file LICENSE.LGPL3 included in the
21 ** packaging of this file. Please review the following information to
22 ** ensure the GNU Lesser General Public License version 3 requirements
23 ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
24 **
25 ** GNU General Public License Usage
26 ** Alternatively, this file may be used under the terms of the GNU
27 ** General Public License version 2.0 or (at your option) the GNU General
28 ** Public license version 3 or any later version approved by the KDE Free
29 ** Qt Foundation. The licenses are as published by the Free Software
30 ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
31 ** included in the packaging of this file. Please review the following
32 ** information to ensure the GNU General Public License requirements will
33 ** be met: https://www.gnu.org/licenses/gpl-2.0.html and
34 ** https://www.gnu.org/licenses/gpl-3.0.html.
35 **
36 ** $QT_END_LICENSE$
37 **
38 ****************************************************************************/
39
40 #include <private/qdrawhelper_neon_p.h>
41 #include <private/qblendfunctions_p.h>
42 #include <private/qmath_p.h>
43
44 #ifdef __ARM_NEON__
45
46 #include <private/qpaintengine_raster_p.h>
47
48 QT_BEGIN_NAMESPACE
49
// Fills count 32-bit words starting at dest with value.
//
// The bulk of the buffer is written 16 words per iteration using NEON stores;
// the remaining count % 16 words are written one at a time by the
// fall-through switch at the end of the function.
void qt_memfill32(quint32 *dest, quint32 value, qsizetype count)
{
    // Number of trailing words that do not fill a complete 16-word group.
    const int epilogueSize = count % 16;
#if defined(Q_CC_GHS) || defined(Q_CC_MSVC)
    // inline assembler free version:
    if (count >= 16) {
        quint32 *const neonEnd = dest + count - epilogueSize;
        const uint32x4_t valueVector1 = vdupq_n_u32(value);
        // Four identical vectors, so the interleaving done by vst4q_u32 is
        // irrelevant: every one of the 16 stored words equals value.
        const uint32x4x4_t valueVector4 = { valueVector1, valueVector1, valueVector1, valueVector1 };
        do {
            vst4q_u32(dest, valueVector4);
            dest += 16;
        } while (dest != neonEnd);
    }
#elif !defined(Q_PROCESSOR_ARM_64)
    if (count >= 16) {
        quint32 *const neonEnd = dest + count - epilogueSize;
        // The asm template below names d0-d3 directly, so the two vectors are
        // pinned to q0 and q1 with explicit register variables.
        register uint32x4_t valueVector1 asm ("q0") = vdupq_n_u32(value);
        register uint32x4_t valueVector2 asm ("q1") = valueVector1;
        while (dest != neonEnd) {
            // Two 32-byte stores with post-increment of dest: 16 words per iteration.
            asm volatile (
                "vst2.32 { d0, d1, d2, d3 }, [%[DST]] !\n\t"
                "vst2.32 { d0, d1, d2, d3 }, [%[DST]] !\n\t"
                : [DST]"+r" (dest)
                : [VALUE1]"w"(valueVector1), [VALUE2]"w"(valueVector2)
                : "memory"
            );
        }
    }
#else
    if (count >= 16) {
        quint32 *const neonEnd = dest + count - epilogueSize;
        // The asm template below names v0 and v1 directly, so the two vectors
        // are pinned to those registers with explicit register variables.
        register uint32x4_t valueVector1 asm ("v0") = vdupq_n_u32(value);
        register uint32x4_t valueVector2 asm ("v1") = valueVector1;
        while (dest != neonEnd) {
            // Two 32-byte stores, each advancing dest by 32: 16 words per iteration.
            asm volatile (
                "st2 { v0.4s, v1.4s }, [%[DST]], #32 \n\t"
                "st2 { v0.4s, v1.4s }, [%[DST]], #32 \n\t"
                : [DST]"+r" (dest)
                : [VALUE1]"w"(valueVector1), [VALUE2]"w"(valueVector2)
                : "memory"
            );
        }
    }
#endif

    // Write the remaining 0..15 words; each case falls through to the next so
    // that entering at case N performs exactly N stores.
    switch (epilogueSize)
    {
    case 15: *dest++ = value; Q_FALLTHROUGH();
    case 14: *dest++ = value; Q_FALLTHROUGH();
    case 13: *dest++ = value; Q_FALLTHROUGH();
    case 12: *dest++ = value; Q_FALLTHROUGH();
    case 11: *dest++ = value; Q_FALLTHROUGH();
    case 10: *dest++ = value; Q_FALLTHROUGH();
    case 9: *dest++ = value; Q_FALLTHROUGH();
    case 8: *dest++ = value; Q_FALLTHROUGH();
    case 7: *dest++ = value; Q_FALLTHROUGH();
    case 6: *dest++ = value; Q_FALLTHROUGH();
    case 5: *dest++ = value; Q_FALLTHROUGH();
    case 4: *dest++ = value; Q_FALLTHROUGH();
    case 3: *dest++ = value; Q_FALLTHROUGH();
    case 2: *dest++ = value; Q_FALLTHROUGH();
    case 1: *dest++ = value;
    }
}
115
qvdiv_255_u16(uint16x8_t x,uint16x8_t half)116 static inline uint16x8_t qvdiv_255_u16(uint16x8_t x, uint16x8_t half)
117 {
118 // result = (x + (x >> 8) + 0x80) >> 8
119
120 const uint16x8_t temp = vshrq_n_u16(x, 8); // x >> 8
121 const uint16x8_t sum_part = vaddq_u16(x, half); // x + 0x80
122 const uint16x8_t sum = vaddq_u16(temp, sum_part);
123
124 return vshrq_n_u16(sum, 8);
125 }
126
qvbyte_mul_u16(uint16x8_t x,uint16x8_t alpha,uint16x8_t half)127 static inline uint16x8_t qvbyte_mul_u16(uint16x8_t x, uint16x8_t alpha, uint16x8_t half)
128 {
129 // t = qRound(x * alpha / 255.0)
130
131 const uint16x8_t t = vmulq_u16(x, alpha); // t
132 return qvdiv_255_u16(t, half);
133 }
134
qvinterpolate_pixel_255(uint16x8_t x,uint16x8_t a,uint16x8_t y,uint16x8_t b,uint16x8_t half)135 static inline uint16x8_t qvinterpolate_pixel_255(uint16x8_t x, uint16x8_t a, uint16x8_t y, uint16x8_t b, uint16x8_t half)
136 {
137 // t = x * a + y * b
138
139 const uint16x8_t ta = vmulq_u16(x, a);
140 const uint16x8_t tb = vmulq_u16(y, b);
141
142 return qvdiv_255_u16(vaddq_u16(ta, tb), half);
143 }
144
qvsource_over_u16(uint16x8_t src16,uint16x8_t dst16,uint16x8_t half,uint16x8_t full)145 static inline uint16x8_t qvsource_over_u16(uint16x8_t src16, uint16x8_t dst16, uint16x8_t half, uint16x8_t full)
146 {
147 const uint16x4_t alpha16_high = vdup_lane_u16(vget_high_u16(src16), 3);
148 const uint16x4_t alpha16_low = vdup_lane_u16(vget_low_u16(src16), 3);
149
150 const uint16x8_t alpha16 = vsubq_u16(full, vcombine_u16(alpha16_low, alpha16_high));
151
152 return vaddq_u16(src16, qvbyte_mul_u16(dst16, alpha16, half));
153 }
154
155 #if defined(ENABLE_PIXMAN_DRAWHELPERS)
156 extern "C" void
157 pixman_composite_over_8888_0565_asm_neon (int32_t w,
158 int32_t h,
159 uint16_t *dst,
160 int32_t dst_stride,
161 uint32_t *src,
162 int32_t src_stride);
163
164 extern "C" void
165 pixman_composite_over_8888_8888_asm_neon (int32_t w,
166 int32_t h,
167 uint32_t *dst,
168 int32_t dst_stride,
169 uint32_t *src,
170 int32_t src_stride);
171
172 extern "C" void
173 pixman_composite_src_0565_8888_asm_neon (int32_t w,
174 int32_t h,
175 uint32_t *dst,
176 int32_t dst_stride,
177 uint16_t *src,
178 int32_t src_stride);
179
180 extern "C" void
181 pixman_composite_over_n_8_0565_asm_neon (int32_t w,
182 int32_t h,
183 uint16_t *dst,
184 int32_t dst_stride,
185 uint32_t src,
186 int32_t unused,
187 uint8_t *mask,
188 int32_t mask_stride);
189
190 extern "C" void
191 pixman_composite_scanline_over_asm_neon (int32_t w,
192 const uint32_t *dst,
193 const uint32_t *src);
194
195 extern "C" void
196 pixman_composite_src_0565_0565_asm_neon (int32_t w,
197 int32_t h,
198 uint16_t *dst,
199 int32_t dst_stride,
200 uint16_t *src,
201 int32_t src_stride);
202 // qblendfunctions.cpp
203 void qt_blend_argb32_on_rgb16_const_alpha(uchar *destPixels, int dbpl,
204 const uchar *srcPixels, int sbpl,
205 int w, int h,
206 int const_alpha);
207
qt_blend_rgb16_on_argb32_neon(uchar * destPixels,int dbpl,const uchar * srcPixels,int sbpl,int w,int h,int const_alpha)208 void qt_blend_rgb16_on_argb32_neon(uchar *destPixels, int dbpl,
209 const uchar *srcPixels, int sbpl,
210 int w, int h,
211 int const_alpha)
212 {
213 dbpl /= 4;
214 sbpl /= 2;
215
216 quint32 *dst = (quint32 *) destPixels;
217 quint16 *src = (quint16 *) srcPixels;
218
219 if (const_alpha != 256) {
220 quint8 a = (255 * const_alpha) >> 8;
221 quint8 ia = 255 - a;
222
223 while (h--) {
224 for (int x=0; x<w; ++x)
225 dst[x] = INTERPOLATE_PIXEL_255(qConvertRgb16To32(src[x]), a, dst[x], ia);
226 dst += dbpl;
227 src += sbpl;
228 }
229 return;
230 }
231
232 pixman_composite_src_0565_8888_asm_neon(w, h, dst, dbpl, src, sbpl);
233 }
234
235 // qblendfunctions.cpp
236 void qt_blend_rgb16_on_rgb16(uchar *dst, int dbpl,
237 const uchar *src, int sbpl,
238 int w, int h,
239 int const_alpha);
240
241
242 template <int N>
scanLineBlit16(quint16 * dst,quint16 * src,int dstride)243 static inline void scanLineBlit16(quint16 *dst, quint16 *src, int dstride)
244 {
245 if (N >= 2) {
246 ((quint32 *)dst)[0] = ((quint32 *)src)[0];
247 __builtin_prefetch(dst + dstride, 1, 0);
248 }
249 for (int i = 1; i < N/2; ++i)
250 ((quint32 *)dst)[i] = ((quint32 *)src)[i];
251 if (N & 1)
252 dst[N-1] = src[N-1];
253 }
254
255 template <int Width>
blockBlit16(quint16 * dst,quint16 * src,int dstride,int sstride,int h)256 static inline void blockBlit16(quint16 *dst, quint16 *src, int dstride, int sstride, int h)
257 {
258 union {
259 quintptr address;
260 quint16 *pointer;
261 } u;
262
263 u.pointer = dst;
264
265 if (u.address & 2) {
266 while (h--) {
267 // align dst
268 dst[0] = src[0];
269 if (Width > 1)
270 scanLineBlit16<Width-1>(dst + 1, src + 1, dstride);
271 dst += dstride;
272 src += sstride;
273 }
274 } else {
275 while (h--) {
276 scanLineBlit16<Width>(dst, src, dstride);
277
278 dst += dstride;
279 src += sstride;
280 }
281 }
282 }
283
qt_blend_rgb16_on_rgb16_neon(uchar * destPixels,int dbpl,const uchar * srcPixels,int sbpl,int w,int h,int const_alpha)284 void qt_blend_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl,
285 const uchar *srcPixels, int sbpl,
286 int w, int h,
287 int const_alpha)
288 {
289 // testing show that the default memcpy is faster for widths 150 and up
290 if (const_alpha != 256 || w >= 150) {
291 qt_blend_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha);
292 return;
293 }
294
295 int dstride = dbpl / 2;
296 int sstride = sbpl / 2;
297
298 quint16 *dst = (quint16 *) destPixels;
299 quint16 *src = (quint16 *) srcPixels;
300
301 switch (w) {
302 #define BLOCKBLIT(n) case n: blockBlit16<n>(dst, src, dstride, sstride, h); return;
303 BLOCKBLIT(1);
304 BLOCKBLIT(2);
305 BLOCKBLIT(3);
306 BLOCKBLIT(4);
307 BLOCKBLIT(5);
308 BLOCKBLIT(6);
309 BLOCKBLIT(7);
310 BLOCKBLIT(8);
311 BLOCKBLIT(9);
312 BLOCKBLIT(10);
313 BLOCKBLIT(11);
314 BLOCKBLIT(12);
315 BLOCKBLIT(13);
316 BLOCKBLIT(14);
317 BLOCKBLIT(15);
318 #undef BLOCKBLIT
319 default:
320 break;
321 }
322
323 pixman_composite_src_0565_0565_asm_neon (w, h, dst, dstride, src, sstride);
324 }
325
326 extern "C" void blend_8_pixels_argb32_on_rgb16_neon(quint16 *dst, const quint32 *src, int const_alpha);
327
qt_blend_argb32_on_rgb16_neon(uchar * destPixels,int dbpl,const uchar * srcPixels,int sbpl,int w,int h,int const_alpha)328 void qt_blend_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
329 const uchar *srcPixels, int sbpl,
330 int w, int h,
331 int const_alpha)
332 {
333 quint16 *dst = (quint16 *) destPixels;
334 quint32 *src = (quint32 *) srcPixels;
335
336 if (const_alpha != 256) {
337 for (int y=0; y<h; ++y) {
338 int i = 0;
339 for (; i < w-7; i += 8)
340 blend_8_pixels_argb32_on_rgb16_neon(&dst[i], &src[i], const_alpha);
341
342 if (i < w) {
343 int tail = w - i;
344
345 quint16 dstBuffer[8];
346 quint32 srcBuffer[8];
347
348 for (int j = 0; j < tail; ++j) {
349 dstBuffer[j] = dst[i + j];
350 srcBuffer[j] = src[i + j];
351 }
352
353 blend_8_pixels_argb32_on_rgb16_neon(dstBuffer, srcBuffer, const_alpha);
354
355 for (int j = 0; j < tail; ++j)
356 dst[i + j] = dstBuffer[j];
357 }
358
359 dst = (quint16 *)(((uchar *) dst) + dbpl);
360 src = (quint32 *)(((uchar *) src) + sbpl);
361 }
362 return;
363 }
364
365 pixman_composite_over_8888_0565_asm_neon(w, h, dst, dbpl / 2, src, sbpl / 4);
366 }
367 #endif
368
qt_blend_argb32_on_argb32_scanline_neon(uint * dest,const uint * src,int length,uint const_alpha)369 void qt_blend_argb32_on_argb32_scanline_neon(uint *dest, const uint *src, int length, uint const_alpha)
370 {
371 if (const_alpha == 255) {
372 #if defined(ENABLE_PIXMAN_DRAWHELPERS)
373 pixman_composite_scanline_over_asm_neon(length, dest, src);
374 #else
375 qt_blend_argb32_on_argb32_neon((uchar *)dest, 4 * length, (uchar *)src, 4 * length, length, 1, 256);
376 #endif
377 } else {
378 qt_blend_argb32_on_argb32_neon((uchar *)dest, 4 * length, (uchar *)src, 4 * length, length, 1, (const_alpha * 256) / 255);
379 }
380 }
381
qt_blend_argb32_on_argb32_neon(uchar * destPixels,int dbpl,const uchar * srcPixels,int sbpl,int w,int h,int const_alpha)382 void qt_blend_argb32_on_argb32_neon(uchar *destPixels, int dbpl,
383 const uchar *srcPixels, int sbpl,
384 int w, int h,
385 int const_alpha)
386 {
387 const uint *src = (const uint *) srcPixels;
388 uint *dst = (uint *) destPixels;
389 uint16x8_t half = vdupq_n_u16(0x80);
390 uint16x8_t full = vdupq_n_u16(0xff);
391 if (const_alpha == 256) {
392 #if defined(ENABLE_PIXMAN_DRAWHELPERS)
393 pixman_composite_over_8888_8888_asm_neon(w, h, (uint32_t *)destPixels, dbpl / 4, (uint32_t *)srcPixels, sbpl / 4);
394 #else
395 for (int y=0; y<h; ++y) {
396 int x = 0;
397 for (; x < w-3; x += 4) {
398 if (src[x] | src[x+1] | src[x+2] | src[x+3]) {
399 uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
400 uint32x4_t dst32 = vld1q_u32((uint32_t *)&dst[x]);
401
402 const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
403 const uint8x16_t dst8 = vreinterpretq_u8_u32(dst32);
404
405 const uint8x8_t src8_low = vget_low_u8(src8);
406 const uint8x8_t dst8_low = vget_low_u8(dst8);
407
408 const uint8x8_t src8_high = vget_high_u8(src8);
409 const uint8x8_t dst8_high = vget_high_u8(dst8);
410
411 const uint16x8_t src16_low = vmovl_u8(src8_low);
412 const uint16x8_t dst16_low = vmovl_u8(dst8_low);
413
414 const uint16x8_t src16_high = vmovl_u8(src8_high);
415 const uint16x8_t dst16_high = vmovl_u8(dst8_high);
416
417 const uint16x8_t result16_low = qvsource_over_u16(src16_low, dst16_low, half, full);
418 const uint16x8_t result16_high = qvsource_over_u16(src16_high, dst16_high, half, full);
419
420 const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
421 const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));
422
423 vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
424 }
425 }
426 for (; x<w; ++x) {
427 uint s = src[x];
428 if (s >= 0xff000000)
429 dst[x] = s;
430 else if (s != 0)
431 dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
432 }
433 dst = (quint32 *)(((uchar *) dst) + dbpl);
434 src = (const quint32 *)(((const uchar *) src) + sbpl);
435 }
436 #endif
437 } else if (const_alpha != 0) {
438 const_alpha = (const_alpha * 255) >> 8;
439 uint16x8_t const_alpha16 = vdupq_n_u16(const_alpha);
440 for (int y = 0; y < h; ++y) {
441 int x = 0;
442 for (; x < w-3; x += 4) {
443 if (src[x] | src[x+1] | src[x+2] | src[x+3]) {
444 uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
445 uint32x4_t dst32 = vld1q_u32((uint32_t *)&dst[x]);
446
447 const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
448 const uint8x16_t dst8 = vreinterpretq_u8_u32(dst32);
449
450 const uint8x8_t src8_low = vget_low_u8(src8);
451 const uint8x8_t dst8_low = vget_low_u8(dst8);
452
453 const uint8x8_t src8_high = vget_high_u8(src8);
454 const uint8x8_t dst8_high = vget_high_u8(dst8);
455
456 const uint16x8_t src16_low = vmovl_u8(src8_low);
457 const uint16x8_t dst16_low = vmovl_u8(dst8_low);
458
459 const uint16x8_t src16_high = vmovl_u8(src8_high);
460 const uint16x8_t dst16_high = vmovl_u8(dst8_high);
461
462 const uint16x8_t srcalpha16_low = qvbyte_mul_u16(src16_low, const_alpha16, half);
463 const uint16x8_t srcalpha16_high = qvbyte_mul_u16(src16_high, const_alpha16, half);
464
465 const uint16x8_t result16_low = qvsource_over_u16(srcalpha16_low, dst16_low, half, full);
466 const uint16x8_t result16_high = qvsource_over_u16(srcalpha16_high, dst16_high, half, full);
467
468 const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
469 const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));
470
471 vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
472 }
473 }
474 for (; x<w; ++x) {
475 uint s = src[x];
476 if (s != 0) {
477 s = BYTE_MUL(s, const_alpha);
478 dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
479 }
480 }
481 dst = (quint32 *)(((uchar *) dst) + dbpl);
482 src = (const quint32 *)(((const uchar *) src) + sbpl);
483 }
484 }
485 }
486
487 // qblendfunctions.cpp
488 void qt_blend_rgb32_on_rgb32(uchar *destPixels, int dbpl,
489 const uchar *srcPixels, int sbpl,
490 int w, int h,
491 int const_alpha);
492
qt_blend_rgb32_on_rgb32_neon(uchar * destPixels,int dbpl,const uchar * srcPixels,int sbpl,int w,int h,int const_alpha)493 void qt_blend_rgb32_on_rgb32_neon(uchar *destPixels, int dbpl,
494 const uchar *srcPixels, int sbpl,
495 int w, int h,
496 int const_alpha)
497 {
498 if (const_alpha != 256) {
499 if (const_alpha != 0) {
500 const uint *src = (const uint *) srcPixels;
501 uint *dst = (uint *) destPixels;
502 uint16x8_t half = vdupq_n_u16(0x80);
503 const_alpha = (const_alpha * 255) >> 8;
504 int one_minus_const_alpha = 255 - const_alpha;
505 uint16x8_t const_alpha16 = vdupq_n_u16(const_alpha);
506 uint16x8_t one_minus_const_alpha16 = vdupq_n_u16(255 - const_alpha);
507 for (int y = 0; y < h; ++y) {
508 int x = 0;
509 for (; x < w-3; x += 4) {
510 uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
511 uint32x4_t dst32 = vld1q_u32((uint32_t *)&dst[x]);
512
513 const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
514 const uint8x16_t dst8 = vreinterpretq_u8_u32(dst32);
515
516 const uint8x8_t src8_low = vget_low_u8(src8);
517 const uint8x8_t dst8_low = vget_low_u8(dst8);
518
519 const uint8x8_t src8_high = vget_high_u8(src8);
520 const uint8x8_t dst8_high = vget_high_u8(dst8);
521
522 const uint16x8_t src16_low = vmovl_u8(src8_low);
523 const uint16x8_t dst16_low = vmovl_u8(dst8_low);
524
525 const uint16x8_t src16_high = vmovl_u8(src8_high);
526 const uint16x8_t dst16_high = vmovl_u8(dst8_high);
527
528 const uint16x8_t result16_low = qvinterpolate_pixel_255(src16_low, const_alpha16, dst16_low, one_minus_const_alpha16, half);
529 const uint16x8_t result16_high = qvinterpolate_pixel_255(src16_high, const_alpha16, dst16_high, one_minus_const_alpha16, half);
530
531 const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
532 const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));
533
534 vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
535 }
536 for (; x<w; ++x) {
537 dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);
538 }
539 dst = (quint32 *)(((uchar *) dst) + dbpl);
540 src = (const quint32 *)(((const uchar *) src) + sbpl);
541 }
542 }
543 } else {
544 qt_blend_rgb32_on_rgb32(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha);
545 }
546 }
547
548 #if defined(ENABLE_PIXMAN_DRAWHELPERS)
549 extern void qt_alphamapblit_quint16(QRasterBuffer *rasterBuffer,
550 int x, int y, const QRgba64 &color,
551 const uchar *map,
552 int mapWidth, int mapHeight, int mapStride,
553 const QClipData *clip, bool useGammaCorrection);
554
qt_alphamapblit_quint16_neon(QRasterBuffer * rasterBuffer,int x,int y,const QRgba64 & color,const uchar * bitmap,int mapWidth,int mapHeight,int mapStride,const QClipData * clip,bool useGammaCorrection)555 void qt_alphamapblit_quint16_neon(QRasterBuffer *rasterBuffer,
556 int x, int y, const QRgba64 &color,
557 const uchar *bitmap,
558 int mapWidth, int mapHeight, int mapStride,
559 const QClipData *clip, bool useGammaCorrection)
560 {
561 if (clip || useGammaCorrection) {
562 qt_alphamapblit_quint16(rasterBuffer, x, y, color, bitmap, mapWidth, mapHeight, mapStride, clip, useGammaCorrection);
563 return;
564 }
565
566 quint16 *dest = reinterpret_cast<quint16*>(rasterBuffer->scanLine(y)) + x;
567 const int destStride = rasterBuffer->bytesPerLine() / sizeof(quint16);
568
569 uchar *mask = const_cast<uchar *>(bitmap);
570 const uint c = color.toArgb32();
571
572 pixman_composite_over_n_8_0565_asm_neon(mapWidth, mapHeight, dest, destStride, c, 0, mask, mapStride);
573 }
574
575 extern "C" void blend_8_pixels_rgb16_on_rgb16_neon(quint16 *dst, const quint16 *src, int const_alpha);
576
577 template <typename SRC, typename BlendFunc>
578 struct Blend_on_RGB16_SourceAndConstAlpha_Neon {
Blend_on_RGB16_SourceAndConstAlpha_NeonBlend_on_RGB16_SourceAndConstAlpha_Neon579 Blend_on_RGB16_SourceAndConstAlpha_Neon(BlendFunc blender, int const_alpha)
580 : m_index(0)
581 , m_blender(blender)
582 , m_const_alpha(const_alpha)
583 {
584 }
585
writeBlend_on_RGB16_SourceAndConstAlpha_Neon586 inline void write(quint16 *dst, quint32 src)
587 {
588 srcBuffer[m_index++] = src;
589
590 if (m_index == 8) {
591 m_blender(dst - 7, srcBuffer, m_const_alpha);
592 m_index = 0;
593 }
594 }
595
flushBlend_on_RGB16_SourceAndConstAlpha_Neon596 inline void flush(quint16 *dst)
597 {
598 if (m_index > 0) {
599 quint16 dstBuffer[8];
600 for (int i = 0; i < m_index; ++i)
601 dstBuffer[i] = dst[i - m_index];
602
603 m_blender(dstBuffer, srcBuffer, m_const_alpha);
604
605 for (int i = 0; i < m_index; ++i)
606 dst[i - m_index] = dstBuffer[i];
607
608 m_index = 0;
609 }
610 }
611
612 SRC srcBuffer[8];
613
614 int m_index;
615 BlendFunc m_blender;
616 int m_const_alpha;
617 };
618
619 template <typename SRC, typename BlendFunc>
620 Blend_on_RGB16_SourceAndConstAlpha_Neon<SRC, BlendFunc>
Blend_on_RGB16_SourceAndConstAlpha_Neon_create(BlendFunc blender,int const_alpha)621 Blend_on_RGB16_SourceAndConstAlpha_Neon_create(BlendFunc blender, int const_alpha)
622 {
623 return Blend_on_RGB16_SourceAndConstAlpha_Neon<SRC, BlendFunc>(blender, const_alpha);
624 }
625
qt_scale_image_argb32_on_rgb16_neon(uchar * destPixels,int dbpl,const uchar * srcPixels,int sbpl,int srch,const QRectF & targetRect,const QRectF & sourceRect,const QRect & clip,int const_alpha)626 void qt_scale_image_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
627 const uchar *srcPixels, int sbpl, int srch,
628 const QRectF &targetRect,
629 const QRectF &sourceRect,
630 const QRect &clip,
631 int const_alpha)
632 {
633 if (const_alpha == 0)
634 return;
635
636 qt_scale_image_16bit<quint32>(destPixels, dbpl, srcPixels, sbpl, srch, targetRect, sourceRect, clip,
637 Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint32>(blend_8_pixels_argb32_on_rgb16_neon, const_alpha));
638 }
639
640 void qt_scale_image_rgb16_on_rgb16(uchar *destPixels, int dbpl,
641 const uchar *srcPixels, int sbpl, int srch,
642 const QRectF &targetRect,
643 const QRectF &sourceRect,
644 const QRect &clip,
645 int const_alpha);
646
qt_scale_image_rgb16_on_rgb16_neon(uchar * destPixels,int dbpl,const uchar * srcPixels,int sbpl,int srch,const QRectF & targetRect,const QRectF & sourceRect,const QRect & clip,int const_alpha)647 void qt_scale_image_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl,
648 const uchar *srcPixels, int sbpl, int srch,
649 const QRectF &targetRect,
650 const QRectF &sourceRect,
651 const QRect &clip,
652 int const_alpha)
653 {
654 if (const_alpha == 0)
655 return;
656
657 if (const_alpha == 256) {
658 qt_scale_image_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, srch, targetRect, sourceRect, clip, const_alpha);
659 return;
660 }
661
662 qt_scale_image_16bit<quint16>(destPixels, dbpl, srcPixels, sbpl, srch, targetRect, sourceRect, clip,
663 Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint16>(blend_8_pixels_rgb16_on_rgb16_neon, const_alpha));
664 }
665
666 extern void qt_transform_image_rgb16_on_rgb16(uchar *destPixels, int dbpl,
667 const uchar *srcPixels, int sbpl,
668 const QRectF &targetRect,
669 const QRectF &sourceRect,
670 const QRect &clip,
671 const QTransform &targetRectTransform,
672 int const_alpha);
673
qt_transform_image_rgb16_on_rgb16_neon(uchar * destPixels,int dbpl,const uchar * srcPixels,int sbpl,const QRectF & targetRect,const QRectF & sourceRect,const QRect & clip,const QTransform & targetRectTransform,int const_alpha)674 void qt_transform_image_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl,
675 const uchar *srcPixels, int sbpl,
676 const QRectF &targetRect,
677 const QRectF &sourceRect,
678 const QRect &clip,
679 const QTransform &targetRectTransform,
680 int const_alpha)
681 {
682 if (const_alpha == 0)
683 return;
684
685 if (const_alpha == 256) {
686 qt_transform_image_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, targetRect, sourceRect, clip, targetRectTransform, const_alpha);
687 return;
688 }
689
690 qt_transform_image(reinterpret_cast<quint16 *>(destPixels), dbpl,
691 reinterpret_cast<const quint16 *>(srcPixels), sbpl, targetRect, sourceRect, clip, targetRectTransform,
692 Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint16>(blend_8_pixels_rgb16_on_rgb16_neon, const_alpha));
693 }
694
qt_transform_image_argb32_on_rgb16_neon(uchar * destPixels,int dbpl,const uchar * srcPixels,int sbpl,const QRectF & targetRect,const QRectF & sourceRect,const QRect & clip,const QTransform & targetRectTransform,int const_alpha)695 void qt_transform_image_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
696 const uchar *srcPixels, int sbpl,
697 const QRectF &targetRect,
698 const QRectF &sourceRect,
699 const QRect &clip,
700 const QTransform &targetRectTransform,
701 int const_alpha)
702 {
703 if (const_alpha == 0)
704 return;
705
706 qt_transform_image(reinterpret_cast<quint16 *>(destPixels), dbpl,
707 reinterpret_cast<const quint32 *>(srcPixels), sbpl, targetRect, sourceRect, clip, targetRectTransform,
708 Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint32>(blend_8_pixels_argb32_on_rgb16_neon, const_alpha));
709 }
710
// Converts exactly eight r5g6b5 pixels read from src into eight 32-bit pixels
// written to dst, with the fourth byte of every output pixel set to 0xff.
//
// The routine always loads 16 bytes from src and stores 32 bytes to dst, so
// both pointers must reference at least eight pixels. It is written as
// 32-bit ARM NEON inline assembly (d/q registers, r2 as scratch).
static inline void convert_8_pixels_rgb16_to_argb32(quint32 *dst, const quint16 *src)
{
    asm volatile (
        "vld1.16 { d0, d1 }, [%[SRC]]\n\t"

        /* convert 8 r5g6b5 pixel data from {d0, d1} to planar 8-bit format
        and put data into d4 - red, d3 - green, d2 - blue */
        "vshrn.u16 d4, q0, #8\n\t"
        "vshrn.u16 d3, q0, #3\n\t"
        "vsli.u16 q0, q0, #5\n\t"
        "vsri.u8 d4, d4, #5\n\t"
        "vsri.u8 d3, d3, #6\n\t"
        "vshrn.u16 d2, q0, #2\n\t"

        /* fill d5 - alpha with 0xff */
        "mov r2, #255\n\t"
        "vdup.8 d5, r2\n\t"

        /* interleaved store: each pixel is written as the bytes d2, d3, d4, d5 */
        "vst4.8 { d2, d3, d4, d5 }, [%[DST]]"
        : : [DST]"r" (dst), [SRC]"r" (src)
        // every register named in the template is listed as clobbered
        : "memory", "r2", "d0", "d1", "d2", "d3", "d4", "d5"
    );
}
734
qt_destFetchRGB16_neon(uint * buffer,QRasterBuffer * rasterBuffer,int x,int y,int length)735 uint * QT_FASTCALL qt_destFetchRGB16_neon(uint *buffer, QRasterBuffer *rasterBuffer, int x, int y, int length)
736 {
737 const ushort *data = (const ushort *)rasterBuffer->scanLine(y) + x;
738
739 int i = 0;
740 for (; i < length - 7; i += 8)
741 convert_8_pixels_rgb16_to_argb32(&buffer[i], &data[i]);
742
743 if (i < length) {
744 quint16 srcBuffer[8];
745 quint32 dstBuffer[8];
746
747 int tail = length - i;
748 for (int j = 0; j < tail; ++j)
749 srcBuffer[j] = data[i + j];
750
751 convert_8_pixels_rgb16_to_argb32(dstBuffer, srcBuffer);
752
753 for (int j = 0; j < tail; ++j)
754 buffer[i + j] = dstBuffer[j];
755 }
756
757 return buffer;
758 }
759
// Converts exactly eight 32-bit pixels read from src into eight r5g6b5 pixels
// written to dst. The fourth byte of each source pixel (loaded into d3) is
// not used.
//
// The routine always loads 32 bytes from src and stores 16 bytes to dst, so
// both pointers must reference at least eight pixels. It is written as
// 32-bit ARM NEON inline assembly.
static inline void convert_8_pixels_argb32_to_rgb16(quint16 *dst, const quint32 *src)
{
    asm volatile (
        /* de-interleaving load: byte 0 of every pixel goes to d0, byte 1 to d1,
        byte 2 to d2 and byte 3 to d3 */
        "vld4.8 { d0, d1, d2, d3 }, [%[SRC]]\n\t"

        /* convert to r5g6b5 and store it into {d28, d29} */
        "vshll.u8 q14, d2, #8\n\t"
        "vshll.u8 q8, d1, #8\n\t"
        "vshll.u8 q9, d0, #8\n\t"
        "vsri.u16 q14, q8, #5\n\t"
        "vsri.u16 q14, q9, #11\n\t"

        "vst1.16 { d28, d29 }, [%[DST]]"
        : : [DST]"r" (dst), [SRC]"r" (src)
        // q8, q9 and q14 are listed through their d-register halves
        : "memory", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d28", "d29"
    );
}
777
qt_destStoreRGB16_neon(QRasterBuffer * rasterBuffer,int x,int y,const uint * buffer,int length)778 void QT_FASTCALL qt_destStoreRGB16_neon(QRasterBuffer *rasterBuffer, int x, int y, const uint *buffer, int length)
779 {
780 quint16 *data = (quint16*)rasterBuffer->scanLine(y) + x;
781
782 int i = 0;
783 for (; i < length - 7; i += 8)
784 convert_8_pixels_argb32_to_rgb16(&data[i], &buffer[i]);
785
786 if (i < length) {
787 quint32 srcBuffer[8];
788 quint16 dstBuffer[8];
789
790 int tail = length - i;
791 for (int j = 0; j < tail; ++j)
792 srcBuffer[j] = buffer[i + j];
793
794 convert_8_pixels_argb32_to_rgb16(dstBuffer, srcBuffer);
795
796 for (int j = 0; j < tail; ++j)
797 data[i + j] = dstBuffer[j];
798 }
799 }
800 #endif
801
comp_func_solid_SourceOver_neon(uint * destPixels,int length,uint color,uint const_alpha)802 void QT_FASTCALL comp_func_solid_SourceOver_neon(uint *destPixels, int length, uint color, uint const_alpha)
803 {
804 if ((const_alpha & qAlpha(color)) == 255) {
805 qt_memfill32(destPixels, color, length);
806 } else {
807 if (const_alpha != 255)
808 color = BYTE_MUL(color, const_alpha);
809
810 const quint32 minusAlphaOfColor = qAlpha(~color);
811 int x = 0;
812
813 uint32_t *dst = (uint32_t *) destPixels;
814 const uint32x4_t colorVector = vdupq_n_u32(color);
815 uint16x8_t half = vdupq_n_u16(0x80);
816 const uint16x8_t minusAlphaOfColorVector = vdupq_n_u16(minusAlphaOfColor);
817
818 for (; x < length-3; x += 4) {
819 uint32x4_t dstVector = vld1q_u32(&dst[x]);
820
821 const uint8x16_t dst8 = vreinterpretq_u8_u32(dstVector);
822
823 const uint8x8_t dst8_low = vget_low_u8(dst8);
824 const uint8x8_t dst8_high = vget_high_u8(dst8);
825
826 const uint16x8_t dst16_low = vmovl_u8(dst8_low);
827 const uint16x8_t dst16_high = vmovl_u8(dst8_high);
828
829 const uint16x8_t result16_low = qvbyte_mul_u16(dst16_low, minusAlphaOfColorVector, half);
830 const uint16x8_t result16_high = qvbyte_mul_u16(dst16_high, minusAlphaOfColorVector, half);
831
832 const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
833 const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));
834
835 uint32x4_t blendedPixels = vcombine_u32(result32_low, result32_high);
836 uint32x4_t colorPlusBlendedPixels = vaddq_u32(colorVector, blendedPixels);
837 vst1q_u32(&dst[x], colorPlusBlendedPixels);
838 }
839
840 SIMD_EPILOGUE(x, length, 3)
841 destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
842 }
843 }
844
comp_func_Plus_neon(uint * dst,const uint * src,int length,uint const_alpha)845 void QT_FASTCALL comp_func_Plus_neon(uint *dst, const uint *src, int length, uint const_alpha)
846 {
847 if (const_alpha == 255) {
848 uint *const end = dst + length;
849 uint *const neonEnd = end - 3;
850
851 while (dst < neonEnd) {
852 uint8x16_t vs = vld1q_u8((const uint8_t*)src);
853 const uint8x16_t vd = vld1q_u8((uint8_t*)dst);
854 vs = vqaddq_u8(vs, vd);
855 vst1q_u8((uint8_t*)dst, vs);
856 src += 4;
857 dst += 4;
858 };
859
860 while (dst != end) {
861 *dst = comp_func_Plus_one_pixel(*dst, *src);
862 ++dst;
863 ++src;
864 }
865 } else {
866 int x = 0;
867 const int one_minus_const_alpha = 255 - const_alpha;
868 const uint16x8_t constAlphaVector = vdupq_n_u16(const_alpha);
869 const uint16x8_t oneMinusconstAlphaVector = vdupq_n_u16(one_minus_const_alpha);
870
871 const uint16x8_t half = vdupq_n_u16(0x80);
872 for (; x < length - 3; x += 4) {
873 const uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
874 const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
875 uint8x16_t dst8 = vld1q_u8((uint8_t *)&dst[x]);
876 uint8x16_t result = vqaddq_u8(dst8, src8);
877
878 uint16x8_t result_low = vmovl_u8(vget_low_u8(result));
879 uint16x8_t result_high = vmovl_u8(vget_high_u8(result));
880
881 uint16x8_t dst_low = vmovl_u8(vget_low_u8(dst8));
882 uint16x8_t dst_high = vmovl_u8(vget_high_u8(dst8));
883
884 result_low = qvinterpolate_pixel_255(result_low, constAlphaVector, dst_low, oneMinusconstAlphaVector, half);
885 result_high = qvinterpolate_pixel_255(result_high, constAlphaVector, dst_high, oneMinusconstAlphaVector, half);
886
887 const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result_low));
888 const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result_high));
889 vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
890 }
891
892 SIMD_EPILOGUE(x, length, 3)
893 dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x], const_alpha, one_minus_const_alpha);
894 }
895 }
896
897 #if defined(ENABLE_PIXMAN_DRAWHELPERS)
898 static const int tileSize = 32;
899
900 extern "C" void qt_rotate90_16_neon(quint16 *dst, const quint16 *src, int sstride, int dstride, int count);
901
qt_memrotate90_16_neon(const uchar * srcPixels,int w,int h,int sstride,uchar * destPixels,int dstride)902 void qt_memrotate90_16_neon(const uchar *srcPixels, int w, int h, int sstride, uchar *destPixels, int dstride)
903 {
904 const ushort *src = (const ushort *)srcPixels;
905 ushort *dest = (ushort *)destPixels;
906
907 sstride /= sizeof(ushort);
908 dstride /= sizeof(ushort);
909
910 const int pack = sizeof(quint32) / sizeof(ushort);
911 const int unaligned =
912 qMin(uint((quintptr(dest) & (sizeof(quint32)-1)) / sizeof(ushort)), uint(h));
913 const int restX = w % tileSize;
914 const int restY = (h - unaligned) % tileSize;
915 const int unoptimizedY = restY % pack;
916 const int numTilesX = w / tileSize + (restX > 0);
917 const int numTilesY = (h - unaligned) / tileSize + (restY >= pack);
918
919 for (int tx = 0; tx < numTilesX; ++tx) {
920 const int startx = w - tx * tileSize - 1;
921 const int stopx = qMax(startx - tileSize, 0);
922
923 if (unaligned) {
924 for (int x = startx; x >= stopx; --x) {
925 ushort *d = dest + (w - x - 1) * dstride;
926 for (int y = 0; y < unaligned; ++y) {
927 *d++ = src[y * sstride + x];
928 }
929 }
930 }
931
932 for (int ty = 0; ty < numTilesY; ++ty) {
933 const int starty = ty * tileSize + unaligned;
934 const int stopy = qMin(starty + tileSize, h - unoptimizedY);
935
936 int x = startx;
937 // qt_rotate90_16_neon writes to eight rows, four pixels at a time
938 for (; x >= stopx + 7; x -= 8) {
939 ushort *d = dest + (w - x - 1) * dstride + starty;
940 const ushort *s = &src[starty * sstride + x - 7];
941 qt_rotate90_16_neon(d, s, sstride * 2, dstride * 2, stopy - starty);
942 }
943
944 for (; x >= stopx; --x) {
945 quint32 *d = reinterpret_cast<quint32*>(dest + (w - x - 1) * dstride + starty);
946 for (int y = starty; y < stopy; y += pack) {
947 quint32 c = src[y * sstride + x];
948 for (int i = 1; i < pack; ++i) {
949 const int shift = (sizeof(int) * 8 / pack * i);
950 const ushort color = src[(y + i) * sstride + x];
951 c |= color << shift;
952 }
953 *d++ = c;
954 }
955 }
956 }
957
958 if (unoptimizedY) {
959 const int starty = h - unoptimizedY;
960 for (int x = startx; x >= stopx; --x) {
961 ushort *d = dest + (w - x - 1) * dstride + starty;
962 for (int y = starty; y < h; ++y) {
963 *d++ = src[y * sstride + x];
964 }
965 }
966 }
967 }
968 }
969
970 extern "C" void qt_rotate270_16_neon(quint16 *dst, const quint16 *src, int sstride, int dstride, int count);
971
qt_memrotate270_16_neon(const uchar * srcPixels,int w,int h,int sstride,uchar * destPixels,int dstride)972 void qt_memrotate270_16_neon(const uchar *srcPixels, int w, int h,
973 int sstride,
974 uchar *destPixels, int dstride)
975 {
976 const ushort *src = (const ushort *)srcPixels;
977 ushort *dest = (ushort *)destPixels;
978
979 sstride /= sizeof(ushort);
980 dstride /= sizeof(ushort);
981
982 const int pack = sizeof(quint32) / sizeof(ushort);
983 const int unaligned =
984 qMin(uint((long(dest) & (sizeof(quint32)-1)) / sizeof(ushort)), uint(h));
985 const int restX = w % tileSize;
986 const int restY = (h - unaligned) % tileSize;
987 const int unoptimizedY = restY % pack;
988 const int numTilesX = w / tileSize + (restX > 0);
989 const int numTilesY = (h - unaligned) / tileSize + (restY >= pack);
990
991 for (int tx = 0; tx < numTilesX; ++tx) {
992 const int startx = tx * tileSize;
993 const int stopx = qMin(startx + tileSize, w);
994
995 if (unaligned) {
996 for (int x = startx; x < stopx; ++x) {
997 ushort *d = dest + x * dstride;
998 for (int y = h - 1; y >= h - unaligned; --y) {
999 *d++ = src[y * sstride + x];
1000 }
1001 }
1002 }
1003
1004 for (int ty = 0; ty < numTilesY; ++ty) {
1005 const int starty = h - 1 - unaligned - ty * tileSize;
1006 const int stopy = qMax(starty - tileSize, unoptimizedY);
1007
1008 int x = startx;
1009 // qt_rotate90_16_neon writes to eight rows, four pixels at a time
1010 for (; x < stopx - 7; x += 8) {
1011 ushort *d = dest + x * dstride + h - 1 - starty;
1012 const ushort *s = &src[starty * sstride + x];
1013 qt_rotate90_16_neon(d + 7 * dstride, s, -sstride * 2, -dstride * 2, starty - stopy);
1014 }
1015
1016 for (; x < stopx; ++x) {
1017 quint32 *d = reinterpret_cast<quint32*>(dest + x * dstride
1018 + h - 1 - starty);
1019 for (int y = starty; y > stopy; y -= pack) {
1020 quint32 c = src[y * sstride + x];
1021 for (int i = 1; i < pack; ++i) {
1022 const int shift = (sizeof(int) * 8 / pack * i);
1023 const ushort color = src[(y - i) * sstride + x];
1024 c |= color << shift;
1025 }
1026 *d++ = c;
1027 }
1028 }
1029 }
1030 if (unoptimizedY) {
1031 const int starty = unoptimizedY - 1;
1032 for (int x = startx; x < stopx; ++x) {
1033 ushort *d = dest + x * dstride + h - 1 - starty;
1034 for (int y = starty; y >= 0; --y) {
1035 *d++ = src[y * sstride + x];
1036 }
1037 }
1038 }
1039 }
1040 }
1041 #endif
1042
1043 class QSimdNeon
1044 {
1045 public:
1046 typedef int32x4_t Int32x4;
1047 typedef float32x4_t Float32x4;
1048
1049 union Vect_buffer_i { Int32x4 v; int i[4]; };
1050 union Vect_buffer_f { Float32x4 v; float f[4]; };
1051
v_dup(double x)1052 static inline Float32x4 v_dup(double x) { return vdupq_n_f32(float(x)); }
v_dup(float x)1053 static inline Float32x4 v_dup(float x) { return vdupq_n_f32(x); }
v_dup(int x)1054 static inline Int32x4 v_dup(int x) { return vdupq_n_s32(x); }
v_dup(uint x)1055 static inline Int32x4 v_dup(uint x) { return vdupq_n_s32(x); }
1056
v_add(Float32x4 a,Float32x4 b)1057 static inline Float32x4 v_add(Float32x4 a, Float32x4 b) { return vaddq_f32(a, b); }
v_add(Int32x4 a,Int32x4 b)1058 static inline Int32x4 v_add(Int32x4 a, Int32x4 b) { return vaddq_s32(a, b); }
1059
v_max(Float32x4 a,Float32x4 b)1060 static inline Float32x4 v_max(Float32x4 a, Float32x4 b) { return vmaxq_f32(a, b); }
v_min(Float32x4 a,Float32x4 b)1061 static inline Float32x4 v_min(Float32x4 a, Float32x4 b) { return vminq_f32(a, b); }
v_min_16(Int32x4 a,Int32x4 b)1062 static inline Int32x4 v_min_16(Int32x4 a, Int32x4 b) { return vminq_s32(a, b); }
1063
v_and(Int32x4 a,Int32x4 b)1064 static inline Int32x4 v_and(Int32x4 a, Int32x4 b) { return vandq_s32(a, b); }
1065
v_sub(Float32x4 a,Float32x4 b)1066 static inline Float32x4 v_sub(Float32x4 a, Float32x4 b) { return vsubq_f32(a, b); }
v_sub(Int32x4 a,Int32x4 b)1067 static inline Int32x4 v_sub(Int32x4 a, Int32x4 b) { return vsubq_s32(a, b); }
1068
v_mul(Float32x4 a,Float32x4 b)1069 static inline Float32x4 v_mul(Float32x4 a, Float32x4 b) { return vmulq_f32(a, b); }
1070
v_sqrt(Float32x4 x)1071 static inline Float32x4 v_sqrt(Float32x4 x) { Float32x4 y = vrsqrteq_f32(x); y = vmulq_f32(y, vrsqrtsq_f32(x, vmulq_f32(y, y))); return vmulq_f32(x, y); }
1072
v_toInt(Float32x4 x)1073 static inline Int32x4 v_toInt(Float32x4 x) { return vcvtq_s32_f32(x); }
1074
v_greaterOrEqual(Float32x4 a,Float32x4 b)1075 static inline Int32x4 v_greaterOrEqual(Float32x4 a, Float32x4 b) { return vreinterpretq_s32_u32(vcgeq_f32(a, b)); }
1076 };
1077
qt_fetch_radial_gradient_neon(uint * buffer,const Operator * op,const QSpanData * data,int y,int x,int length)1078 const uint * QT_FASTCALL qt_fetch_radial_gradient_neon(uint *buffer, const Operator *op, const QSpanData *data,
1079 int y, int x, int length)
1080 {
1081 return qt_fetch_radial_gradient_template<QRadialFetchSimd<QSimdNeon>,uint>(buffer, op, data, y, x, length);
1082 }
1083
1084 extern void QT_FASTCALL qt_convert_rgb888_to_rgb32_neon(quint32 *dst, const uchar *src, int len);
1085
qt_fetchUntransformed_888_neon(uint * buffer,const Operator *,const QSpanData * data,int y,int x,int length)1086 const uint * QT_FASTCALL qt_fetchUntransformed_888_neon(uint *buffer, const Operator *, const QSpanData *data,
1087 int y, int x, int length)
1088 {
1089 const uchar *line = data->texture.scanLine(y) + x * 3;
1090 qt_convert_rgb888_to_rgb32_neon(buffer, line, length);
1091 return buffer;
1092 }
1093
1094 #if Q_BYTE_ORDER == Q_LITTLE_ENDIAN
vrgba2argb(uint32x4_t srcVector)1095 static inline uint32x4_t vrgba2argb(uint32x4_t srcVector)
1096 {
1097 #if defined(Q_PROCESSOR_ARM_64)
1098 const uint8x16_t rgbaMask = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
1099 #else
1100 const uint8x8_t rgbaMask = { 2, 1, 0, 3, 6, 5, 4, 7 };
1101 #endif
1102 #if defined(Q_PROCESSOR_ARM_64)
1103 srcVector = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(srcVector), rgbaMask));
1104 #else
1105 // no vqtbl1q_u8, so use two vtbl1_u8
1106 const uint8x8_t low = vtbl1_u8(vreinterpret_u8_u32(vget_low_u32(srcVector)), rgbaMask);
1107 const uint8x8_t high = vtbl1_u8(vreinterpret_u8_u32(vget_high_u32(srcVector)), rgbaMask);
1108 srcVector = vcombine_u32(vreinterpret_u32_u8(low), vreinterpret_u32_u8(high));
1109 #endif
1110 return srcVector;
1111 }
1112
1113 template<bool RGBA>
convertARGBToARGB32PM_neon(uint * buffer,const uint * src,int count)1114 static inline void convertARGBToARGB32PM_neon(uint *buffer, const uint *src, int count)
1115 {
1116 int i = 0;
1117 const uint8x8_t shuffleMask = { 3, 3, 3, 3, 7, 7, 7, 7};
1118 const uint32x4_t blendMask = vdupq_n_u32(0xff000000);
1119
1120 for (; i < count - 3; i += 4) {
1121 uint32x4_t srcVector = vld1q_u32(src + i);
1122 uint32x4_t alphaVector = vshrq_n_u32(srcVector, 24);
1123 #if defined(Q_PROCESSOR_ARM_64)
1124 uint32_t alphaSum = vaddvq_u32(alphaVector);
1125 #else
1126 // no vaddvq_u32
1127 uint32x2_t tmp = vpadd_u32(vget_low_u32(alphaVector), vget_high_u32(alphaVector));
1128 uint32_t alphaSum = vget_lane_u32(vpadd_u32(tmp, tmp), 0);
1129 #endif
1130 if (alphaSum) {
1131 if (alphaSum != 255 * 4) {
1132 if (RGBA)
1133 srcVector = vrgba2argb(srcVector);
1134 const uint8x8_t s1 = vreinterpret_u8_u32(vget_low_u32(srcVector));
1135 const uint8x8_t s2 = vreinterpret_u8_u32(vget_high_u32(srcVector));
1136 const uint8x8_t alpha1 = vtbl1_u8(s1, shuffleMask);
1137 const uint8x8_t alpha2 = vtbl1_u8(s2, shuffleMask);
1138 uint16x8_t src1 = vmull_u8(s1, alpha1);
1139 uint16x8_t src2 = vmull_u8(s2, alpha2);
1140 src1 = vsraq_n_u16(src1, src1, 8);
1141 src2 = vsraq_n_u16(src2, src2, 8);
1142 const uint8x8_t d1 = vrshrn_n_u16(src1, 8);
1143 const uint8x8_t d2 = vrshrn_n_u16(src2, 8);
1144 const uint32x4_t d = vbslq_u32(blendMask, srcVector, vreinterpretq_u32_u8(vcombine_u8(d1, d2)));
1145 vst1q_u32(buffer + i, d);
1146 } else {
1147 if (RGBA)
1148 vst1q_u32(buffer + i, vrgba2argb(srcVector));
1149 else if (buffer != src)
1150 vst1q_u32(buffer + i, srcVector);
1151 }
1152 } else {
1153 vst1q_u32(buffer + i, vdupq_n_u32(0));
1154 }
1155 }
1156
1157 SIMD_EPILOGUE(i, count, 3) {
1158 uint v = qPremultiply(src[i]);
1159 buffer[i] = RGBA ? RGBA2ARGB(v) : v;
1160 }
1161 }
1162
1163 template<bool RGBA>
convertARGB32ToRGBA64PM_neon(QRgba64 * buffer,const uint * src,int count)1164 static inline void convertARGB32ToRGBA64PM_neon(QRgba64 *buffer, const uint *src, int count)
1165 {
1166 if (count <= 0)
1167 return;
1168
1169 const uint8x8_t shuffleMask = { 3, 3, 3, 3, 7, 7, 7, 7};
1170 const uint64x2_t blendMask = vdupq_n_u64(Q_UINT64_C(0xffff000000000000));
1171
1172 int i = 0;
1173 for (; i < count-3; i += 4) {
1174 uint32x4_t vs32 = vld1q_u32(src + i);
1175 uint32x4_t alphaVector = vshrq_n_u32(vs32, 24);
1176 #if defined(Q_PROCESSOR_ARM_64)
1177 uint32_t alphaSum = vaddvq_u32(alphaVector);
1178 #else
1179 // no vaddvq_u32
1180 uint32x2_t tmp = vpadd_u32(vget_low_u32(alphaVector), vget_high_u32(alphaVector));
1181 uint32_t alphaSum = vget_lane_u32(vpadd_u32(tmp, tmp), 0);
1182 #endif
1183 if (alphaSum) {
1184 if (!RGBA)
1185 vs32 = vrgba2argb(vs32);
1186 const uint8x16_t vs8 = vreinterpretq_u8_u32(vs32);
1187 const uint8x16x2_t v = vzipq_u8(vs8, vs8);
1188 if (alphaSum != 255 * 4) {
1189 const uint8x8_t s1 = vreinterpret_u8_u32(vget_low_u32(vs32));
1190 const uint8x8_t s2 = vreinterpret_u8_u32(vget_high_u32(vs32));
1191 const uint8x8_t alpha1 = vtbl1_u8(s1, shuffleMask);
1192 const uint8x8_t alpha2 = vtbl1_u8(s2, shuffleMask);
1193 uint16x8_t src1 = vmull_u8(s1, alpha1);
1194 uint16x8_t src2 = vmull_u8(s2, alpha2);
1195 // convert from 0->(255x255) to 0->(255x257)
1196 src1 = vsraq_n_u16(src1, src1, 7);
1197 src2 = vsraq_n_u16(src2, src2, 7);
1198
1199 // now restore alpha from the trivial conversion
1200 const uint64x2_t d1 = vbslq_u64(blendMask, vreinterpretq_u64_u8(v.val[0]), vreinterpretq_u64_u16(src1));
1201 const uint64x2_t d2 = vbslq_u64(blendMask, vreinterpretq_u64_u8(v.val[1]), vreinterpretq_u64_u16(src2));
1202
1203 vst1q_u16((uint16_t *)buffer, vreinterpretq_u16_u64(d1));
1204 buffer += 2;
1205 vst1q_u16((uint16_t *)buffer, vreinterpretq_u16_u64(d2));
1206 buffer += 2;
1207 } else {
1208 vst1q_u16((uint16_t *)buffer, vreinterpretq_u16_u8(v.val[0]));
1209 buffer += 2;
1210 vst1q_u16((uint16_t *)buffer, vreinterpretq_u16_u8(v.val[1]));
1211 buffer += 2;
1212 }
1213 } else {
1214 vst1q_u16((uint16_t *)buffer, vdupq_n_u16(0));
1215 buffer += 2;
1216 vst1q_u16((uint16_t *)buffer, vdupq_n_u16(0));
1217 buffer += 2;
1218 }
1219 }
1220
1221 SIMD_EPILOGUE(i, count, 3) {
1222 uint s = src[i];
1223 if (RGBA)
1224 s = RGBA2ARGB(s);
1225 *buffer++ = QRgba64::fromArgb32(s).premultiplied();
1226 }
1227 }
1228
reciprocal_mul_ps(float32x4_t a,float mul)1229 static inline float32x4_t reciprocal_mul_ps(float32x4_t a, float mul)
1230 {
1231 float32x4_t ia = vrecpeq_f32(a); // estimate 1/a
1232 ia = vmulq_f32(vrecpsq_f32(a, ia), vmulq_n_f32(ia, mul)); // estimate improvement step * mul
1233 return ia;
1234 }
1235
1236 template<bool RGBA, bool RGBx>
convertARGBFromARGB32PM_neon(uint * buffer,const uint * src,int count)1237 static inline void convertARGBFromARGB32PM_neon(uint *buffer, const uint *src, int count)
1238 {
1239 int i = 0;
1240 const uint32x4_t alphaMask = vdupq_n_u32(0xff000000);
1241
1242 for (; i < count - 3; i += 4) {
1243 uint32x4_t srcVector = vld1q_u32(src + i);
1244 uint32x4_t alphaVector = vshrq_n_u32(srcVector, 24);
1245 #if defined(Q_PROCESSOR_ARM_64)
1246 uint32_t alphaSum = vaddvq_u32(alphaVector);
1247 #else
1248 // no vaddvq_u32
1249 uint32x2_t tmp = vpadd_u32(vget_low_u32(alphaVector), vget_high_u32(alphaVector));
1250 uint32_t alphaSum = vget_lane_u32(vpadd_u32(tmp, tmp), 0);
1251 #endif
1252 if (alphaSum) {
1253 if (alphaSum != 255 * 4) {
1254 if (RGBA)
1255 srcVector = vrgba2argb(srcVector);
1256 const float32x4_t a = vcvtq_f32_u32(alphaVector);
1257 const float32x4_t ia = reciprocal_mul_ps(a, 255.0f);
1258 // Convert 4x(4xU8) to 4x(4xF32)
1259 uint16x8_t tmp1 = vmovl_u8(vget_low_u8(vreinterpretq_u8_u32(srcVector)));
1260 uint16x8_t tmp3 = vmovl_u8(vget_high_u8(vreinterpretq_u8_u32(srcVector)));
1261 float32x4_t src1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(tmp1)));
1262 float32x4_t src2 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(tmp1)));
1263 float32x4_t src3 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(tmp3)));
1264 float32x4_t src4 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(tmp3)));
1265 src1 = vmulq_lane_f32(src1, vget_low_f32(ia), 0);
1266 src2 = vmulq_lane_f32(src2, vget_low_f32(ia), 1);
1267 src3 = vmulq_lane_f32(src3, vget_high_f32(ia), 0);
1268 src4 = vmulq_lane_f32(src4, vget_high_f32(ia), 1);
1269 // Convert 4x(4xF32) back to 4x(4xU8) (over a 8.1 fixed point format to get rounding)
1270 tmp1 = vcombine_u16(vrshrn_n_u32(vcvtq_n_u32_f32(src1, 1), 1),
1271 vrshrn_n_u32(vcvtq_n_u32_f32(src2, 1), 1));
1272 tmp3 = vcombine_u16(vrshrn_n_u32(vcvtq_n_u32_f32(src3, 1), 1),
1273 vrshrn_n_u32(vcvtq_n_u32_f32(src4, 1), 1));
1274 uint32x4_t dstVector = vreinterpretq_u32_u8(vcombine_u8(vmovn_u16(tmp1), vmovn_u16(tmp3)));
1275 // Overwrite any undefined results from alpha==0 with zeros:
1276 #if defined(Q_PROCESSOR_ARM_64)
1277 uint32x4_t srcVectorAlphaMask = vceqzq_u32(alphaVector);
1278 #else
1279 uint32x4_t srcVectorAlphaMask = vceqq_u32(alphaVector, vdupq_n_u32(0));
1280 #endif
1281 dstVector = vbicq_u32(dstVector, srcVectorAlphaMask);
1282 // Restore or mask alpha values:
1283 if (RGBx)
1284 dstVector = vorrq_u32(alphaMask, dstVector);
1285 else
1286 dstVector = vbslq_u32(alphaMask, srcVector, dstVector);
1287 vst1q_u32(&buffer[i], dstVector);
1288 } else {
1289 // 4xAlpha==255, no change except if we are doing RGBA->ARGB:
1290 if (RGBA)
1291 vst1q_u32(&buffer[i], vrgba2argb(srcVector));
1292 else if (buffer != src)
1293 vst1q_u32(&buffer[i], srcVector);
1294 }
1295 } else {
1296 // 4xAlpha==0, always zero, except if output is RGBx:
1297 if (RGBx)
1298 vst1q_u32(&buffer[i], alphaMask);
1299 else
1300 vst1q_u32(&buffer[i], vdupq_n_u32(0));
1301 }
1302 }
1303
1304 SIMD_EPILOGUE(i, count, 3) {
1305 uint v = qUnpremultiply(src[i]);
1306 if (RGBx)
1307 v = 0xff000000 | v;
1308 if (RGBA)
1309 v = ARGB2RGBA(v);
1310 buffer[i] = v;
1311 }
1312 }
1313
convertARGB32ToARGB32PM_neon(uint * buffer,int count,const QVector<QRgb> *)1314 void QT_FASTCALL convertARGB32ToARGB32PM_neon(uint *buffer, int count, const QVector<QRgb> *)
1315 {
1316 convertARGBToARGB32PM_neon<false>(buffer, buffer, count);
1317 }
1318
convertRGBA8888ToARGB32PM_neon(uint * buffer,int count,const QVector<QRgb> *)1319 void QT_FASTCALL convertRGBA8888ToARGB32PM_neon(uint *buffer, int count, const QVector<QRgb> *)
1320 {
1321 convertARGBToARGB32PM_neon<true>(buffer, buffer, count);
1322 }
1323
fetchARGB32ToARGB32PM_neon(uint * buffer,const uchar * src,int index,int count,const QVector<QRgb> *,QDitherInfo *)1324 const uint *QT_FASTCALL fetchARGB32ToARGB32PM_neon(uint *buffer, const uchar *src, int index, int count,
1325 const QVector<QRgb> *, QDitherInfo *)
1326 {
1327 convertARGBToARGB32PM_neon<false>(buffer, reinterpret_cast<const uint *>(src) + index, count);
1328 return buffer;
1329 }
1330
fetchRGBA8888ToARGB32PM_neon(uint * buffer,const uchar * src,int index,int count,const QVector<QRgb> *,QDitherInfo *)1331 const uint *QT_FASTCALL fetchRGBA8888ToARGB32PM_neon(uint *buffer, const uchar *src, int index, int count,
1332 const QVector<QRgb> *, QDitherInfo *)
1333 {
1334 convertARGBToARGB32PM_neon<true>(buffer, reinterpret_cast<const uint *>(src) + index, count);
1335 return buffer;
1336 }
1337
convertARGB32ToRGBA64PM_neon(QRgba64 * buffer,const uint * src,int count,const QVector<QRgb> *,QDitherInfo *)1338 const QRgba64 * QT_FASTCALL convertARGB32ToRGBA64PM_neon(QRgba64 *buffer, const uint *src, int count,
1339 const QVector<QRgb> *, QDitherInfo *)
1340 {
1341 convertARGB32ToRGBA64PM_neon<false>(buffer, src, count);
1342 return buffer;
1343 }
1344
convertRGBA8888ToRGBA64PM_neon(QRgba64 * buffer,const uint * src,int count,const QVector<QRgb> *,QDitherInfo *)1345 const QRgba64 * QT_FASTCALL convertRGBA8888ToRGBA64PM_neon(QRgba64 *buffer, const uint *src, int count,
1346 const QVector<QRgb> *, QDitherInfo *)
1347 {
1348 convertARGB32ToRGBA64PM_neon<true>(buffer, src, count);
1349 return buffer;
1350 }
1351
fetchARGB32ToRGBA64PM_neon(QRgba64 * buffer,const uchar * src,int index,int count,const QVector<QRgb> *,QDitherInfo *)1352 const QRgba64 *QT_FASTCALL fetchARGB32ToRGBA64PM_neon(QRgba64 *buffer, const uchar *src, int index, int count,
1353 const QVector<QRgb> *, QDitherInfo *)
1354 {
1355 convertARGB32ToRGBA64PM_neon<false>(buffer, reinterpret_cast<const uint *>(src) + index, count);
1356 return buffer;
1357 }
1358
fetchRGBA8888ToRGBA64PM_neon(QRgba64 * buffer,const uchar * src,int index,int count,const QVector<QRgb> *,QDitherInfo *)1359 const QRgba64 *QT_FASTCALL fetchRGBA8888ToRGBA64PM_neon(QRgba64 *buffer, const uchar *src, int index, int count,
1360 const QVector<QRgb> *, QDitherInfo *)
1361 {
1362 convertARGB32ToRGBA64PM_neon<true>(buffer, reinterpret_cast<const uint *>(src) + index, count);
1363 return buffer;
1364 }
1365
storeRGB32FromARGB32PM_neon(uchar * dest,const uint * src,int index,int count,const QVector<QRgb> *,QDitherInfo *)1366 void QT_FASTCALL storeRGB32FromARGB32PM_neon(uchar *dest, const uint *src, int index, int count,
1367 const QVector<QRgb> *, QDitherInfo *)
1368 {
1369 uint *d = reinterpret_cast<uint *>(dest) + index;
1370 convertARGBFromARGB32PM_neon<false,true>(d, src, count);
1371 }
1372
storeARGB32FromARGB32PM_neon(uchar * dest,const uint * src,int index,int count,const QVector<QRgb> *,QDitherInfo *)1373 void QT_FASTCALL storeARGB32FromARGB32PM_neon(uchar *dest, const uint *src, int index, int count,
1374 const QVector<QRgb> *, QDitherInfo *)
1375 {
1376 uint *d = reinterpret_cast<uint *>(dest) + index;
1377 convertARGBFromARGB32PM_neon<false,false>(d, src, count);
1378 }
1379
storeRGBA8888FromARGB32PM_neon(uchar * dest,const uint * src,int index,int count,const QVector<QRgb> *,QDitherInfo *)1380 void QT_FASTCALL storeRGBA8888FromARGB32PM_neon(uchar *dest, const uint *src, int index, int count,
1381 const QVector<QRgb> *, QDitherInfo *)
1382 {
1383 uint *d = reinterpret_cast<uint *>(dest) + index;
1384 convertARGBFromARGB32PM_neon<true,false>(d, src, count);
1385 }
1386
storeRGBXFromARGB32PM_neon(uchar * dest,const uint * src,int index,int count,const QVector<QRgb> *,QDitherInfo *)1387 void QT_FASTCALL storeRGBXFromARGB32PM_neon(uchar *dest, const uint *src, int index, int count,
1388 const QVector<QRgb> *, QDitherInfo *)
1389 {
1390 uint *d = reinterpret_cast<uint *>(dest) + index;
1391 convertARGBFromARGB32PM_neon<true,true>(d, src, count);
1392 }
1393
1394 #endif // Q_BYTE_ORDER == Q_LITTLE_ENDIAN
1395
1396 QT_END_NAMESPACE
1397
1398 #endif // __ARM_NEON__
1399
1400