1 /****************************************************************************
2 **
3 ** Copyright (C) 2015 The Qt Company Ltd.
4 ** Contact: http://www.qt.io/licensing/
5 **
6 ** This file is part of the QtGui module of the Qt Toolkit.
7 **
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** Commercial License Usage
10 ** Licensees holding valid commercial Qt licenses may use this file in
11 ** accordance with the commercial license agreement provided with the
12 ** Software or, alternatively, in accordance with the terms contained in
13 ** a written agreement between you and The Qt Company. For licensing terms
14 ** and conditions see http://www.qt.io/terms-conditions. For further
15 ** information use the contact form at http://www.qt.io/contact-us.
16 **
17 ** GNU Lesser General Public License Usage
18 ** Alternatively, this file may be used under the terms of the GNU Lesser
19 ** General Public License version 2.1 or version 3 as published by the Free
20 ** Software Foundation and appearing in the file LICENSE.LGPLv21 and
21 ** LICENSE.LGPLv3 included in the packaging of this file. Please review the
22 ** following information to ensure the GNU Lesser General Public License
23 ** requirements will be met: https://www.gnu.org/licenses/lgpl.html and
24 ** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
25 **
26 ** As a special exception, The Qt Company gives you certain additional
27 ** rights. These rights are described in The Qt Company LGPL Exception
28 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
29 **
30 ** GNU General Public License Usage
31 ** Alternatively, this file may be used under the terms of the GNU
32 ** General Public License version 3.0 as published by the Free Software
33 ** Foundation and appearing in the file LICENSE.GPL included in the
34 ** packaging of this file. Please review the following information to
35 ** ensure the GNU General Public License version 3.0 requirements will be
36 ** met: http://www.gnu.org/copyleft/gpl.html.
37 **
38 ** $QT_END_LICENSE$
39 **
40 ****************************************************************************/
41
42 #include <private/qdrawhelper_p.h>
43 #include <private/qblendfunctions_p.h>
44 #include <private/qmath_p.h>
45
46 #ifdef QT_HAVE_NEON
47
48 #include <private/qdrawhelper_neon_p.h>
49 #include <private/qpaintengine_raster_p.h>
50 #include <arm_neon.h>
51
52 QT_BEGIN_NAMESPACE
53
// Fill 'count' 32-bit values at 'dest' with 'value'.
// The main loop stores 16 pixels per iteration (two 32-byte vst2.32 stores
// with post-increment); the remaining count % 16 pixels are written by the
// switch below (Duff's-device style -- the case fallthrough is intentional).
void qt_memfill32_neon(quint32 *dest, quint32 value, int count)
{
    const int epilogueSize = count % 16;
    if (count >= 16) {
        quint32 *const neonEnd = dest + count - epilogueSize;
        // Pin the fill value into q0/q1 so the asm below can name d0-d3.
        register uint32x4_t valueVector1 asm ("q0") = vdupq_n_u32(value);
        register uint32x4_t valueVector2 asm ("q1") = valueVector1;
        while (dest != neonEnd) {
            asm volatile (
                "vst2.32 { d0, d1, d2, d3 }, [%[DST]] !\n\t"
                "vst2.32 { d0, d1, d2, d3 }, [%[DST]] !\n\t"
                : [DST]"+r" (dest)
                : [VALUE1]"w"(valueVector1), [VALUE2]"w"(valueVector2)
                : "memory"
            );
        }
    }

    // Scalar epilogue: fallthrough writes exactly epilogueSize pixels.
    switch (epilogueSize)
    {
    case 15: *dest++ = value;
    case 14: *dest++ = value;
    case 13: *dest++ = value;
    case 12: *dest++ = value;
    case 11: *dest++ = value;
    case 10: *dest++ = value;
    case 9: *dest++ = value;
    case 8: *dest++ = value;
    case 7: *dest++ = value;
    case 6: *dest++ = value;
    case 5: *dest++ = value;
    case 4: *dest++ = value;
    case 3: *dest++ = value;
    case 2: *dest++ = value;
    case 1: *dest++ = value;
    }
}
91
qvdiv_255_u16(uint16x8_t x,uint16x8_t half)92 static inline uint16x8_t qvdiv_255_u16(uint16x8_t x, uint16x8_t half)
93 {
94 // result = (x + (x >> 8) + 0x80) >> 8
95
96 const uint16x8_t temp = vshrq_n_u16(x, 8); // x >> 8
97 const uint16x8_t sum_part = vaddq_u16(x, half); // x + 0x80
98 const uint16x8_t sum = vaddq_u16(temp, sum_part);
99
100 return vshrq_n_u16(sum, 8);
101 }
102
qvbyte_mul_u16(uint16x8_t x,uint16x8_t alpha,uint16x8_t half)103 static inline uint16x8_t qvbyte_mul_u16(uint16x8_t x, uint16x8_t alpha, uint16x8_t half)
104 {
105 // t = qRound(x * alpha / 255.0)
106
107 const uint16x8_t t = vmulq_u16(x, alpha); // t
108 return qvdiv_255_u16(t, half);
109 }
110
qvinterpolate_pixel_255(uint16x8_t x,uint16x8_t a,uint16x8_t y,uint16x8_t b,uint16x8_t half)111 static inline uint16x8_t qvinterpolate_pixel_255(uint16x8_t x, uint16x8_t a, uint16x8_t y, uint16x8_t b, uint16x8_t half)
112 {
113 // t = x * a + y * b
114
115 const uint16x8_t ta = vmulq_u16(x, a);
116 const uint16x8_t tb = vmulq_u16(y, b);
117
118 return qvdiv_255_u16(vaddq_u16(ta, tb), half);
119 }
120
// Porter-Duff source-over on two pixels widened to 16 bits per channel:
// returns src + dst * (255 - src.alpha) / 255.
// Assumes each pixel's alpha channel sits in lane 3 of its 4-lane half --
// NOTE(review): depends on the byte order produced by the callers' vmovl
// widening; confirm against the ARGB32 layout they load.
// 'full' must hold 0x00ff and 'half' 0x0080 in every lane.
static inline uint16x8_t qvsource_over_u16(uint16x8_t src16, uint16x8_t dst16, uint16x8_t half, uint16x8_t full)
{
    // Broadcast each pixel's alpha (lane 3) across its four channel lanes.
    const uint16x4_t alpha16_high = vdup_lane_u16(vget_high_u16(src16), 3);
    const uint16x4_t alpha16_low = vdup_lane_u16(vget_low_u16(src16), 3);

    // 255 - alpha, per lane.
    const uint16x8_t alpha16 = vsubq_u16(full, vcombine_u16(alpha16_low, alpha16_high));

    return vaddq_u16(src16, qvbyte_mul_u16(dst16, alpha16, half));
}
130
// NEON compositing entry points implemented in hand-written (pixman-derived)
// assembly. Note: all stride arguments are in *elements* of the respective
// pixel type (uint16_t/uint32_t), not bytes -- the callers below divide
// their byte strides accordingly.
extern "C" void
pixman_composite_over_8888_0565_asm_neon (int32_t w,
                                          int32_t h,
                                          uint16_t *dst,
                                          int32_t dst_stride,
                                          uint32_t *src,
                                          int32_t src_stride);

extern "C" void
pixman_composite_over_8888_8888_asm_neon (int32_t w,
                                          int32_t h,
                                          uint32_t *dst,
                                          int32_t dst_stride,
                                          uint32_t *src,
                                          int32_t src_stride);

extern "C" void
pixman_composite_src_0565_8888_asm_neon (int32_t w,
                                         int32_t h,
                                         uint32_t *dst,
                                         int32_t dst_stride,
                                         uint16_t *src,
                                         int32_t src_stride);

extern "C" void
pixman_composite_over_n_8_0565_asm_neon (int32_t w,
                                         int32_t h,
                                         uint16_t *dst,
                                         int32_t dst_stride,
                                         uint32_t src,
                                         int32_t unused,
                                         uint8_t *mask,
                                         int32_t mask_stride);

extern "C" void
pixman_composite_scanline_over_asm_neon (int32_t w,
                                         const uint32_t *dst,
                                         const uint32_t *src);

extern "C" void
pixman_composite_src_0565_0565_asm_neon (int32_t w,
                                         int32_t h,
                                         uint16_t *dst,
                                         int32_t dst_stride,
                                         uint16_t *src,
                                         int32_t src_stride);

// Implemented in qblendfunctions.cpp.
void qt_blend_argb32_on_rgb16_const_alpha(uchar *destPixels, int dbpl,
                                          const uchar *srcPixels, int sbpl,
                                          int w, int h,
                                          int const_alpha);
183
// Blit an r5g6b5 source onto an ARGB32 destination.
// dbpl/sbpl arrive in bytes and are converted to element strides below.
// const_alpha uses the 0..256 convention (256 = fully opaque).
void qt_blend_rgb16_on_argb32_neon(uchar *destPixels, int dbpl,
                                   const uchar *srcPixels, int sbpl,
                                   int w, int h,
                                   int const_alpha)
{
    dbpl /= 4;  // bytes -> quint32 elements
    sbpl /= 2;  // bytes -> quint16 elements

    quint32 *dst = (quint32 *) destPixels;
    quint16 *src = (quint16 *) srcPixels;

    if (const_alpha != 256) {
        // Translucent scalar path: expand each 16-bit pixel to 32 bits and
        // interpolate with the destination using a (0..255) and 255 - a.
        quint8 a = (255 * const_alpha) >> 8;
        quint8 ia = 255 - a;

        while (h--) {
            for (int x=0; x<w; ++x)
                dst[x] = INTERPOLATE_PIXEL_255(qt_colorConvert(src[x], dst[x]), a, dst[x], ia);
            dst += dbpl;
            src += sbpl;
        }
        return;
    }

    // Fully opaque: straight format conversion via pixman's NEON routine.
    pixman_composite_src_0565_8888_asm_neon(w, h, dst, dbpl, src, sbpl);
}
210
// Generic C implementation, defined in qblendfunctions.cpp; used as the
// fallback for wide or translucent 16-bit blits below.
void qt_blend_rgb16_on_rgb16(uchar *dst, int dbpl,
                             const uchar *src, int sbpl,
                             int w, int h,
                             int const_alpha);
216
// Copy one scanline of N quint16 pixels using 32-bit stores, plus a single
// 16-bit store for an odd trailing pixel. blockBlit16 guarantees dst is
// 4-byte aligned here -- NOTE(review): src is accessed through quint32* as
// well; confirm unaligned 32-bit loads are acceptable on the target cores.
template <int N>
static inline void scanLineBlit16(quint16 *dst, quint16 *src, int dstride)
{
    if (N >= 2) {
        ((quint32 *)dst)[0] = ((quint32 *)src)[0];
        // Hint the next destination scanline into cache for writing.
        __builtin_prefetch(dst + dstride, 1, 0);
    }
    for (int i = 1; i < N/2; ++i)
        ((quint32 *)dst)[i] = ((quint32 *)src)[i];
    if (N & 1)
        dst[N-1] = src[N-1];  // odd trailing pixel
}
229
// Copy a Width x h block of quint16 pixels. If the destination starts on an
// odd 16-bit boundary, the first pixel of each row is copied separately so
// the quint32 stores inside scanLineBlit16 are 4-byte aligned.
template <int Width>
static inline void blockBlit16(quint16 *dst, quint16 *src, int dstride, int sstride, int h)
{
    // Inspect the destination address via a union (avoids a direct
    // pointer-to-integer cast at this call site).
    union {
        quintptr address;
        quint16 *pointer;
    } u;

    u.pointer = dst;

    if (u.address & 2) {
        // Misaligned destination: peel off the first pixel per row.
        while (h--) {
            // align dst
            dst[0] = src[0];
            if (Width > 1)
                scanLineBlit16<Width-1>(dst + 1, src + 1, dstride);
            dst += dstride;
            src += sstride;
        }
    } else {
        while (h--) {
            scanLineBlit16<Width>(dst, src, dstride);

            dst += dstride;
            src += sstride;
        }
    }
}
258
// Opaque blit of r5g6b5 onto r5g6b5. Small widths (1..15) use fully
// unrolled template blits; translucent blends or very wide spans fall back
// to the generic C version; everything else goes through pixman's NEON copy.
void qt_blend_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl,
                                  const uchar *srcPixels, int sbpl,
                                  int w, int h,
                                  int const_alpha)
{
    // testing show that the default memcpy is faster for widths 150 and up
    if (const_alpha != 256 || w >= 150) {
        qt_blend_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha);
        return;
    }

    int dstride = dbpl / 2;  // byte strides -> quint16 element strides
    int sstride = sbpl / 2;

    quint16 *dst = (quint16 *) destPixels;
    quint16 *src = (quint16 *) srcPixels;

    // Dispatch to a compile-time-unrolled blit for each narrow width.
    switch (w) {
#define BLOCKBLIT(n) case n: blockBlit16<n>(dst, src, dstride, sstride, h); return;
    BLOCKBLIT(1);
    BLOCKBLIT(2);
    BLOCKBLIT(3);
    BLOCKBLIT(4);
    BLOCKBLIT(5);
    BLOCKBLIT(6);
    BLOCKBLIT(7);
    BLOCKBLIT(8);
    BLOCKBLIT(9);
    BLOCKBLIT(10);
    BLOCKBLIT(11);
    BLOCKBLIT(12);
    BLOCKBLIT(13);
    BLOCKBLIT(14);
    BLOCKBLIT(15);
#undef BLOCKBLIT
    default:
        break;
    }

    // Widths 16..149: pixman's NEON copy (strides in elements).
    pixman_composite_src_0565_0565_asm_neon (w, h, dst, dstride, src, sstride);
}
300
// NEON assembly kernel: blends exactly 8 premultiplied ARGB32 pixels onto 8 r5g6b5 pixels.
extern "C" void blend_8_pixels_argb32_on_rgb16_neon(quint16 *dst, const quint32 *src, int const_alpha);
302
// Blend premultiplied ARGB32 source pixels onto an r5g6b5 destination.
// const_alpha is 0..256; 256 takes pixman's over_8888_0565 fast path.
void qt_blend_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
                                   const uchar *srcPixels, int sbpl,
                                   int w, int h,
                                   int const_alpha)
{
    quint16 *dst = (quint16 *) destPixels;
    quint32 *src = (quint32 *) srcPixels;

    if (const_alpha != 256) {
        for (int y=0; y<h; ++y) {
            int i = 0;
            // Main loop: 8 pixels per call into the NEON blender.
            for (; i < w-7; i += 8)
                blend_8_pixels_argb32_on_rgb16_neon(&dst[i], &src[i], const_alpha);

            if (i < w) {
                // Tail (< 8 pixels): stage through fixed 8-entry buffers so
                // the 8-pixel blender never touches memory past the row.
                // Entries beyond 'tail' stay uninitialized; the blender may
                // read them, but their results are never copied back.
                int tail = w - i;

                quint16 dstBuffer[8];
                quint32 srcBuffer[8];

                for (int j = 0; j < tail; ++j) {
                    dstBuffer[j] = dst[i + j];
                    srcBuffer[j] = src[i + j];
                }

                blend_8_pixels_argb32_on_rgb16_neon(dstBuffer, srcBuffer, const_alpha);

                for (int j = 0; j < tail; ++j)
                    dst[i + j] = dstBuffer[j];
            }

            // Advance by the byte strides (rows may be padded).
            dst = (quint16 *)(((uchar *) dst) + dbpl);
            src = (quint32 *)(((uchar *) src) + sbpl);
        }
        return;
    }

    // Fully opaque constant alpha: pixman fast path (strides in elements).
    pixman_composite_over_8888_0565_asm_neon(w, h, dst, dbpl / 2, src, sbpl / 4);
}
342
// Source-over one scanline of premultiplied ARGB32.
// Here const_alpha uses the 0..255 convention (255 = opaque); it is rescaled
// to the 0..256 convention expected by qt_blend_argb32_on_argb32_neon.
void qt_blend_argb32_on_argb32_scanline_neon(uint *dest, const uint *src, int length, uint const_alpha)
{
    if (const_alpha == 255) {
        pixman_composite_scanline_over_asm_neon(length, dest, src);
    } else {
        // Treat the scanline as a 1-row image; stride = row length in bytes.
        qt_blend_argb32_on_argb32_neon((uchar *)dest, 4 * length, (uchar *)src, 4 * length, length, 1, (const_alpha * 256) / 255);
    }
}
351
// Source-over blend of premultiplied ARGB32 onto ARGB32 with an optional
// constant alpha. const_alpha is 0..256: 256 dispatches to pixman's asm
// fast path, 0 is a no-op, anything else runs the NEON/scalar mix below.
void qt_blend_argb32_on_argb32_neon(uchar *destPixels, int dbpl,
                                    const uchar *srcPixels, int sbpl,
                                    int w, int h,
                                    int const_alpha)
{
    const uint *src = (const uint *) srcPixels;
    uint *dst = (uint *) destPixels;
    uint16x8_t half = vdupq_n_u16(0x80);  // rounding bias for the /255 steps
    uint16x8_t full = vdupq_n_u16(0xff);
    if (const_alpha == 256) {
        pixman_composite_over_8888_8888_asm_neon(w, h, (uint32_t *)destPixels, dbpl / 4, (uint32_t *)srcPixels, sbpl / 4);
    } else if (const_alpha != 0) {
        // Rescale 0..256 to the 0..255 multiplier domain.
        const_alpha = (const_alpha * 255) >> 8;
        uint16x8_t const_alpha16 = vdupq_n_u16(const_alpha);
        for (int y = 0; y < h; ++y) {
            int x = 0;
            // Vector path: 4 pixels at a time, widened to 16 bits/channel so
            // the alpha multiply cannot overflow.
            for (; x < w-3; x += 4) {
                // Skip groups that are entirely transparent.
                if (src[x] | src[x+1] | src[x+2] | src[x+3]) {
                    uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
                    uint32x4_t dst32 = vld1q_u32((uint32_t *)&dst[x]);

                    const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
                    const uint8x16_t dst8 = vreinterpretq_u8_u32(dst32);

                    const uint8x8_t src8_low = vget_low_u8(src8);
                    const uint8x8_t dst8_low = vget_low_u8(dst8);

                    const uint8x8_t src8_high = vget_high_u8(src8);
                    const uint8x8_t dst8_high = vget_high_u8(dst8);

                    // Widen each byte channel to 16 bits.
                    const uint16x8_t src16_low = vmovl_u8(src8_low);
                    const uint16x8_t dst16_low = vmovl_u8(dst8_low);

                    const uint16x8_t src16_high = vmovl_u8(src8_high);
                    const uint16x8_t dst16_high = vmovl_u8(dst8_high);

                    // Apply the constant alpha to the source, then source-over.
                    const uint16x8_t srcalpha16_low = qvbyte_mul_u16(src16_low, const_alpha16, half);
                    const uint16x8_t srcalpha16_high = qvbyte_mul_u16(src16_high, const_alpha16, half);

                    const uint16x8_t result16_low = qvsource_over_u16(srcalpha16_low, dst16_low, half, full);
                    const uint16x8_t result16_high = qvsource_over_u16(srcalpha16_high, dst16_high, half, full);

                    // Narrow back to 8 bits/channel and store.
                    const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
                    const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));

                    vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
                }
            }
            // Scalar tail (< 4 pixels), same math.
            for (; x<w; ++x) {
                uint s = src[x];
                if (s != 0) {
                    s = BYTE_MUL(s, const_alpha);
                    dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
                }
            }
            // Advance by the byte strides (rows may be padded).
            dst = (quint32 *)(((uchar *) dst) + dbpl);
            src = (const quint32 *)(((const uchar *) src) + sbpl);
        }
    }
}
412
// Generic C implementation, defined in qblendfunctions.cpp; used as the
// opaque fast path by qt_blend_rgb32_on_rgb32_neon below.
void qt_blend_rgb32_on_rgb32(uchar *destPixels, int dbpl,
                             const uchar *srcPixels, int sbpl,
                             int w, int h,
                             int const_alpha);
418
qt_blend_rgb32_on_rgb32_neon(uchar * destPixels,int dbpl,const uchar * srcPixels,int sbpl,int w,int h,int const_alpha)419 void qt_blend_rgb32_on_rgb32_neon(uchar *destPixels, int dbpl,
420 const uchar *srcPixels, int sbpl,
421 int w, int h,
422 int const_alpha)
423 {
424 if (const_alpha != 256) {
425 if (const_alpha != 0) {
426 const uint *src = (const uint *) srcPixels;
427 uint *dst = (uint *) destPixels;
428 uint16x8_t half = vdupq_n_u16(0x80);
429 const_alpha = (const_alpha * 255) >> 8;
430 int one_minus_const_alpha = 255 - const_alpha;
431 uint16x8_t const_alpha16 = vdupq_n_u16(const_alpha);
432 uint16x8_t one_minus_const_alpha16 = vdupq_n_u16(255 - const_alpha);
433 for (int y = 0; y < h; ++y) {
434 int x = 0;
435 for (; x < w-3; x += 4) {
436 uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
437 uint32x4_t dst32 = vld1q_u32((uint32_t *)&dst[x]);
438
439 const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
440 const uint8x16_t dst8 = vreinterpretq_u8_u32(dst32);
441
442 const uint8x8_t src8_low = vget_low_u8(src8);
443 const uint8x8_t dst8_low = vget_low_u8(dst8);
444
445 const uint8x8_t src8_high = vget_high_u8(src8);
446 const uint8x8_t dst8_high = vget_high_u8(dst8);
447
448 const uint16x8_t src16_low = vmovl_u8(src8_low);
449 const uint16x8_t dst16_low = vmovl_u8(dst8_low);
450
451 const uint16x8_t src16_high = vmovl_u8(src8_high);
452 const uint16x8_t dst16_high = vmovl_u8(dst8_high);
453
454 const uint16x8_t result16_low = qvinterpolate_pixel_255(src16_low, const_alpha16, dst16_low, one_minus_const_alpha16, half);
455 const uint16x8_t result16_high = qvinterpolate_pixel_255(src16_high, const_alpha16, dst16_high, one_minus_const_alpha16, half);
456
457 const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
458 const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));
459
460 vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
461 }
462 for (; x<w; ++x) {
463 uint s = src[x];
464 s = BYTE_MUL(s, const_alpha);
465 dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);
466 }
467 dst = (quint32 *)(((uchar *) dst) + dbpl);
468 src = (const quint32 *)(((const uchar *) src) + sbpl);
469 }
470 }
471 } else {
472 qt_blend_rgb32_on_rgb32(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha);
473 }
474 }
475
// Composite a solid 'color' through an 8-bit coverage map ('bitmap') onto an
// RGB16 raster buffer at (x, y), using pixman's n_8_0565 NEON compositor.
// The QClipData parameter is ignored -- NOTE(review): presumably callers
// pre-clip the map to the buffer; confirm before reusing elsewhere.
void qt_alphamapblit_quint16_neon(QRasterBuffer *rasterBuffer,
                                  int x, int y, quint32 color,
                                  const uchar *bitmap,
                                  int mapWidth, int mapHeight, int mapStride,
                                  const QClipData *)
{
    quint16 *dest = reinterpret_cast<quint16*>(rasterBuffer->scanLine(y)) + x;
    const int destStride = rasterBuffer->bytesPerLine() / sizeof(quint16);

    uchar *mask = const_cast<uchar *>(bitmap);

    // '0' is the unused src_stride slot of the pixman entry point.
    pixman_composite_over_n_8_0565_asm_neon(mapWidth, mapHeight, dest, destStride, color, 0, mask, mapStride);
}
489
// NEON assembly kernel: blends exactly 8 r5g6b5 pixels onto 8 r5g6b5 pixels with constant alpha.
extern "C" void blend_8_pixels_rgb16_on_rgb16_neon(quint16 *dst, const quint16 *src, int const_alpha);
491
// Functor that accumulates source pixels into an 8-entry buffer and flushes
// them through an 8-pixel NEON blender. Used by the generic scale/transform
// drivers below; write() is called once per destination pixel.
template <typename SRC, typename BlendFunc>
struct Blend_on_RGB16_SourceAndConstAlpha_Neon {
    Blend_on_RGB16_SourceAndConstAlpha_Neon(BlendFunc blender, int const_alpha)
        : m_index(0)
        , m_blender(blender)
        , m_const_alpha(const_alpha)
    {
    }

    // Queue one source pixel for the destination at 'dst'. When 8 pixels
    // are pending, blend them into the 8 destinations ending at 'dst'
    // (i.e. dst - 7 .. dst).
    inline void write(quint16 *dst, quint32 src)
    {
        srcBuffer[m_index++] = src;

        if (m_index == 8) {
            m_blender(dst - 7, srcBuffer, m_const_alpha);
            m_index = 0;
        }
    }

    // Blend any remaining (< 8) queued pixels. 'dst' points one past the
    // last written destination; a stack copy keeps the 8-pixel blender
    // from writing outside the valid range.
    inline void flush(quint16 *dst)
    {
        if (m_index > 0) {
            quint16 dstBuffer[8];
            for (int i = 0; i < m_index; ++i)
                dstBuffer[i] = dst[i - m_index];

            m_blender(dstBuffer, srcBuffer, m_const_alpha);

            for (int i = 0; i < m_index; ++i)
                dst[i - m_index] = dstBuffer[i];

            m_index = 0;
        }
    }

    SRC srcBuffer[8];     // pending source pixels

    int m_index;          // number of pixels currently queued (0..8)
    BlendFunc m_blender;  // 8-pixel NEON blend kernel
    int m_const_alpha;
};
533
// Convenience factory: lets callers spell only SRC and have BlendFunc deduced.
template <typename SRC, typename BlendFunc>
Blend_on_RGB16_SourceAndConstAlpha_Neon<SRC, BlendFunc>
Blend_on_RGB16_SourceAndConstAlpha_Neon_create(BlendFunc blender, int const_alpha)
{
    return Blend_on_RGB16_SourceAndConstAlpha_Neon<SRC, BlendFunc>(blender, const_alpha);
}
540
// Scaled blit of premultiplied ARGB32 onto r5g6b5 with constant alpha
// (0..256). Delegates sampling to the generic 16-bit scaler, blending each
// run of pixels through the buffered 8-pixel NEON functor.
void qt_scale_image_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
                                         const uchar *srcPixels, int sbpl, int sh,
                                         const QRectF &targetRect,
                                         const QRectF &sourceRect,
                                         const QRect &clip,
                                         int const_alpha)
{
    // Fully transparent: nothing to paint.
    if (const_alpha == 0)
        return;

    qt_scale_image_16bit<quint32>(destPixels, dbpl, srcPixels, sbpl, sh, targetRect, sourceRect, clip,
                                  Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint32>(blend_8_pixels_argb32_on_rgb16_neon, const_alpha));
}
554
// Defined elsewhere in the raster engine; opaque fast path for the scaled
// 16-bit blit below.
void qt_scale_image_rgb16_on_rgb16(uchar *destPixels, int dbpl,
                                   const uchar *srcPixels, int sbpl, int sh,
                                   const QRectF &targetRect,
                                   const QRectF &sourceRect,
                                   const QRect &clip,
                                   int const_alpha);
561
qt_scale_image_rgb16_on_rgb16_neon(uchar * destPixels,int dbpl,const uchar * srcPixels,int sbpl,int sh,const QRectF & targetRect,const QRectF & sourceRect,const QRect & clip,int const_alpha)562 void qt_scale_image_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl,
563 const uchar *srcPixels, int sbpl, int sh,
564 const QRectF &targetRect,
565 const QRectF &sourceRect,
566 const QRect &clip,
567 int const_alpha)
568 {
569 if (const_alpha == 0)
570 return;
571
572 if (const_alpha == 256) {
573 qt_scale_image_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, sh, targetRect, sourceRect, clip, const_alpha);
574 return;
575 }
576
577 qt_scale_image_16bit<quint16>(destPixels, dbpl, srcPixels, sbpl, sh, targetRect, sourceRect, clip,
578 Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint16>(blend_8_pixels_rgb16_on_rgb16_neon, const_alpha));
579 }
580
// Defined elsewhere in the raster engine; opaque fast path for the
// transformed 16-bit blit below.
extern void qt_transform_image_rgb16_on_rgb16(uchar *destPixels, int dbpl,
                                              const uchar *srcPixels, int sbpl,
                                              const QRectF &targetRect,
                                              const QRectF &sourceRect,
                                              const QRect &clip,
                                              const QTransform &targetRectTransform,
                                              int const_alpha);
588
qt_transform_image_rgb16_on_rgb16_neon(uchar * destPixels,int dbpl,const uchar * srcPixels,int sbpl,const QRectF & targetRect,const QRectF & sourceRect,const QRect & clip,const QTransform & targetRectTransform,int const_alpha)589 void qt_transform_image_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl,
590 const uchar *srcPixels, int sbpl,
591 const QRectF &targetRect,
592 const QRectF &sourceRect,
593 const QRect &clip,
594 const QTransform &targetRectTransform,
595 int const_alpha)
596 {
597 if (const_alpha == 0)
598 return;
599
600 if (const_alpha == 256) {
601 qt_transform_image_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, targetRect, sourceRect, clip, targetRectTransform, const_alpha);
602 return;
603 }
604
605 qt_transform_image(reinterpret_cast<quint16 *>(destPixels), dbpl,
606 reinterpret_cast<const quint16 *>(srcPixels), sbpl, targetRect, sourceRect, clip, targetRectTransform,
607 Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint16>(blend_8_pixels_rgb16_on_rgb16_neon, const_alpha));
608 }
609
qt_transform_image_argb32_on_rgb16_neon(uchar * destPixels,int dbpl,const uchar * srcPixels,int sbpl,const QRectF & targetRect,const QRectF & sourceRect,const QRect & clip,const QTransform & targetRectTransform,int const_alpha)610 void qt_transform_image_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
611 const uchar *srcPixels, int sbpl,
612 const QRectF &targetRect,
613 const QRectF &sourceRect,
614 const QRect &clip,
615 const QTransform &targetRectTransform,
616 int const_alpha)
617 {
618 if (const_alpha == 0)
619 return;
620
621 qt_transform_image(reinterpret_cast<quint16 *>(destPixels), dbpl,
622 reinterpret_cast<const quint32 *>(srcPixels), sbpl, targetRect, sourceRect, clip, targetRectTransform,
623 Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint32>(blend_8_pixels_argb32_on_rgb16_neon, const_alpha));
624 }
625
// Expand exactly 8 r5g6b5 pixels at 'src' into 8 ARGB32 pixels at 'dst',
// forcing the alpha byte to 0xff. The shift/insert sequence replicates each
// channel's high bits into its low bits so 0x1f/0x3f expand to a full 0xff.
static inline void convert_8_pixels_rgb16_to_argb32(quint32 *dst, const quint16 *src)
{
    asm volatile (
        "vld1.16 { d0, d1 }, [%[SRC]]\n\t"

        /* convert 8 r5g6b5 pixel data from {d0, d1} to planar 8-bit format
           and put data into d4 - red, d3 - green, d2 - blue */
        "vshrn.u16 d4, q0, #8\n\t"
        "vshrn.u16 d3, q0, #3\n\t"
        "vsli.u16 q0, q0, #5\n\t"
        "vsri.u8 d4, d4, #5\n\t"
        "vsri.u8 d3, d3, #6\n\t"
        "vshrn.u16 d2, q0, #2\n\t"

        /* fill d5 - alpha with 0xff */
        "mov r2, #255\n\t"
        "vdup.8 d5, r2\n\t"

        "vst4.8 { d2, d3, d4, d5 }, [%[DST]]"
        : : [DST]"r" (dst), [SRC]"r" (src)
        : "memory", "r2", "d0", "d1", "d2", "d3", "d4", "d5"
    );
}
649
// Raster-engine destination fetch: read 'length' r5g6b5 pixels from the
// buffer's scanline (y, starting at x) into 'buffer' as ARGB32. Returns
// 'buffer'. The < 8 pixel tail is staged through fixed 8-entry arrays
// because the converter always processes exactly 8 pixels.
uint * QT_FASTCALL qt_destFetchRGB16_neon(uint *buffer, QRasterBuffer *rasterBuffer, int x, int y, int length)
{
    const ushort *data = (const ushort *)rasterBuffer->scanLine(y) + x;

    int i = 0;
    for (; i < length - 7; i += 8)
        convert_8_pixels_rgb16_to_argb32(&buffer[i], &data[i]);

    if (i < length) {
        quint16 srcBuffer[8];
        quint32 dstBuffer[8];

        int tail = length - i;
        for (int j = 0; j < tail; ++j)
            srcBuffer[j] = data[i + j];

        convert_8_pixels_rgb16_to_argb32(dstBuffer, srcBuffer);

        // Only the valid 'tail' results are copied back.
        for (int j = 0; j < tail; ++j)
            buffer[i + j] = dstBuffer[j];
    }

    return buffer;
}
674
// Pack exactly 8 ARGB32 pixels at 'src' into 8 r5g6b5 pixels at 'dst' by
// truncating each channel to its 5/6/5 high bits; the alpha byte is dropped.
static inline void convert_8_pixels_argb32_to_rgb16(quint16 *dst, const quint32 *src)
{
    asm volatile (
        "vld4.8 { d0, d1, d2, d3 }, [%[SRC]]\n\t"

        /* convert to r5g6b5 and store it into {d28, d29} */
        "vshll.u8 q14, d2, #8\n\t"
        "vshll.u8 q8, d1, #8\n\t"
        "vshll.u8 q9, d0, #8\n\t"
        "vsri.u16 q14, q8, #5\n\t"
        "vsri.u16 q14, q9, #11\n\t"

        "vst1.16 { d28, d29 }, [%[DST]]"
        : : [DST]"r" (dst), [SRC]"r" (src)
        : "memory", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d28", "d29"
    );
}
692
// Raster-engine destination store: write 'length' ARGB32 pixels from
// 'buffer' into the r5g6b5 scanline (y, starting at x). The < 8 pixel tail
// is staged through fixed 8-entry arrays because the converter always
// processes exactly 8 pixels; only the valid 'tail' results are copied back.
void QT_FASTCALL qt_destStoreRGB16_neon(QRasterBuffer *rasterBuffer, int x, int y, const uint *buffer, int length)
{
    quint16 *data = (quint16*)rasterBuffer->scanLine(y) + x;

    int i = 0;
    for (; i < length - 7; i += 8)
        convert_8_pixels_argb32_to_rgb16(&data[i], &buffer[i]);

    if (i < length) {
        quint32 srcBuffer[8];
        quint16 dstBuffer[8];

        int tail = length - i;
        for (int j = 0; j < tail; ++j)
            srcBuffer[j] = buffer[i + j];

        convert_8_pixels_argb32_to_rgb16(dstBuffer, srcBuffer);

        for (int j = 0; j < tail; ++j)
            data[i + j] = dstBuffer[j];
    }
}
715
// Solid-color source-over: dst[i] = color + dst[i] * (255 - alpha) / 255.
// 'color' is premultiplied ARGB32; here const_alpha uses the 0..255
// convention (note: not the 0..256 one used by the blend functions above).
// If both the color and the constant alpha are fully opaque this is a fill.
void QT_FASTCALL comp_func_solid_SourceOver_neon(uint *destPixels, int length, uint color, uint const_alpha)
{
    if ((const_alpha & qAlpha(color)) == 255) {
        QT_MEMFILL_UINT(destPixels, length, color);
    } else {
        // Fold the constant alpha into the (premultiplied) color up front.
        if (const_alpha != 255)
            color = BYTE_MUL(color, const_alpha);

        const quint32 minusAlphaOfColor = qAlpha(~color);  // 255 - alpha(color)
        int x = 0;

        uint32_t *dst = (uint32_t *) destPixels;
        const uint32x4_t colorVector = vdupq_n_u32(color);
        uint16x8_t half = vdupq_n_u16(0x80);  // rounding bias for /255
        const uint16x8_t minusAlphaOfColorVector = vdupq_n_u16(minusAlphaOfColor);

        // Vector path: 4 pixels at a time, widened to 16 bits/channel.
        for (; x < length-3; x += 4) {
            uint32x4_t dstVector = vld1q_u32(&dst[x]);

            const uint8x16_t dst8 = vreinterpretq_u8_u32(dstVector);

            const uint8x8_t dst8_low = vget_low_u8(dst8);
            const uint8x8_t dst8_high = vget_high_u8(dst8);

            const uint16x8_t dst16_low = vmovl_u8(dst8_low);
            const uint16x8_t dst16_high = vmovl_u8(dst8_high);

            const uint16x8_t result16_low = qvbyte_mul_u16(dst16_low, minusAlphaOfColorVector, half);
            const uint16x8_t result16_high = qvbyte_mul_u16(dst16_high, minusAlphaOfColorVector, half);

            const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
            const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));

            uint32x4_t blendedPixels = vcombine_u32(result32_low, result32_high);
            uint32x4_t colorPlusBlendedPixels = vaddq_u32(colorVector, blendedPixels);
            vst1q_u32(&dst[x], colorPlusBlendedPixels);
        }

        // Scalar tail (< 4 pixels), same math.
        for (;x < length; ++x)
            destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
    }
}
758
// Porter-Duff Plus compositing: dst = sat8(dst + src), optionally
// interpolated with the original dst by const_alpha (0..255 convention).
void QT_FASTCALL comp_func_Plus_neon(uint *dst, const uint *src, int length, uint const_alpha)
{
    if (const_alpha == 255) {
        uint *const end = dst + length;
        uint *const neonEnd = end - 3;

        // Saturating byte add, 4 pixels (16 bytes) per iteration. The
        // vld2/vst2 de-interleave pattern is applied symmetrically to src
        // and dst, so the byte order is preserved end to end.
        while (dst < neonEnd) {
            asm volatile (
                "vld2.8 { d0, d1 }, [%[SRC]] !\n\t"
                "vld2.8 { d2, d3 }, [%[DST]]\n\t"
                "vqadd.u8 q0, q0, q1\n\t"
                "vst2.8 { d0, d1 }, [%[DST]] !\n\t"
                : [DST]"+r" (dst), [SRC]"+r" (src)
                :
                : "memory", "d0", "d1", "d2", "d3", "q0", "q1"
            );
        }

        // Scalar tail (< 4 pixels).
        while (dst != end) {
            *dst = comp_func_Plus_one_pixel(*dst, *src);
            ++dst;
            ++src;
        }
    } else {
        int x = 0;
        const int one_minus_const_alpha = 255 - const_alpha;
        const uint16x8_t constAlphaVector = vdupq_n_u16(const_alpha);
        const uint16x8_t oneMinusconstAlphaVector = vdupq_n_u16(one_minus_const_alpha);

        const uint16x8_t half = vdupq_n_u16(0x80);  // rounding bias for /255
        // Vector path: saturating add, then blend result with the original
        // destination using const_alpha, 4 pixels at a time.
        for (; x < length - 3; x += 4) {
            const uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
            const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
            uint8x16_t dst8 = vld1q_u8((uint8_t *)&dst[x]);
            uint8x16_t result = vqaddq_u8(dst8, src8);

            uint16x8_t result_low = vmovl_u8(vget_low_u8(result));
            uint16x8_t result_high = vmovl_u8(vget_high_u8(result));

            uint16x8_t dst_low = vmovl_u8(vget_low_u8(dst8));
            uint16x8_t dst_high = vmovl_u8(vget_high_u8(dst8));

            result_low = qvinterpolate_pixel_255(result_low, constAlphaVector, dst_low, oneMinusconstAlphaVector, half);
            result_high = qvinterpolate_pixel_255(result_high, constAlphaVector, dst_high, oneMinusconstAlphaVector, half);

            const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result_low));
            const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result_high));
            vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
        }

        // Scalar tail (< 4 pixels), same math.
        for (; x < length; ++x)
            dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x], const_alpha, one_minus_const_alpha);
    }
}
813
// Tile edge (in pixels) used by the memrotate routines for cache-friendly traversal.
static const int tileSize = 32;

// NEON assembly kernel: rotates an 8-column strip of 16-bit pixels ('count' rows).
extern "C" void qt_rotate90_16_neon(quint16 *dst, const quint16 *src, int sstride, int dstride, int count);
817
// Rotate a w x h block of 16-bit pixels by 90 degrees from srcPixels into
// destPixels. Strides arrive in bytes. Work is split into tileSize-wide
// tiles for cache friendliness:
//  - 'unaligned' leading rows are copied pixel-by-pixel so each destination
//    row reaches a 4-byte boundary for the packed stores below,
//  - full tiles go through qt_rotate90_16_neon, 8 source columns at a time,
//  - remaining columns pack 'pack' (= 2) 16-bit pixels per quint32 store,
//  - 'unoptimizedY' trailing rows (remainder modulo 'pack') are copied
//    pixel-by-pixel.
void qt_memrotate90_16_neon(const uchar *srcPixels, int w, int h, int sstride, uchar *destPixels, int dstride)
{
    const ushort *src = (const ushort *)srcPixels;
    ushort *dest = (ushort *)destPixels;

    sstride /= sizeof(ushort);  // byte strides -> element strides
    dstride /= sizeof(ushort);

    const int pack = sizeof(quint32) / sizeof(ushort);  // pixels per 32-bit store
    const int unaligned =
        qMin(uint((quintptr(dest) & (sizeof(quint32)-1)) / sizeof(ushort)), uint(h));
    const int restX = w % tileSize;
    const int restY = (h - unaligned) % tileSize;
    const int unoptimizedY = restY % pack;
    const int numTilesX = w / tileSize + (restX > 0);
    const int numTilesY = (h - unaligned) / tileSize + (restY >= pack);

    for (int tx = 0; tx < numTilesX; ++tx) {
        const int startx = w - tx * tileSize - 1;
        const int stopx = qMax(startx - tileSize, 0);

        if (unaligned) {
            // Bring each destination row up to a 4-byte boundary.
            for (int x = startx; x >= stopx; --x) {
                ushort *d = dest + (w - x - 1) * dstride;
                for (int y = 0; y < unaligned; ++y) {
                    *d++ = src[y * sstride + x];
                }
            }
        }

        for (int ty = 0; ty < numTilesY; ++ty) {
            const int starty = ty * tileSize + unaligned;
            const int stopy = qMin(starty + tileSize, h - unoptimizedY);

            int x = startx;
            // qt_rotate90_16_neon writes to eight rows, four pixels at a time
            for (; x >= stopx + 7; x -= 8) {
                ushort *d = dest + (w - x - 1) * dstride + starty;
                const ushort *s = &src[starty * sstride + x - 7];
                qt_rotate90_16_neon(d, s, sstride * 2, dstride * 2, stopy - starty);
            }

            // Remaining (< 8) columns: pack two pixels per 32-bit store.
            for (; x >= stopx; --x) {
                quint32 *d = reinterpret_cast<quint32*>(dest + (w - x - 1) * dstride + starty);
                for (int y = starty; y < stopy; y += pack) {
                    quint32 c = src[y * sstride + x];
                    for (int i = 1; i < pack; ++i) {
                        const int shift = (sizeof(int) * 8 / pack * i);
                        const ushort color = src[(y + i) * sstride + x];
                        c |= color << shift;
                    }
                    *d++ = c;
                }
            }
        }

        // Trailing rows that don't fill a whole 32-bit pack.
        if (unoptimizedY) {
            const int starty = h - unoptimizedY;
            for (int x = startx; x >= stopx; --x) {
                ushort *d = dest + (w - x - 1) * dstride + starty;
                for (int y = starty; y < h; ++y) {
                    *d++ = src[y * sstride + x];
                }
            }
        }
    }
}
885
// NEON assembly kernel for 270-degree strips. (Note: qt_memrotate270_16_neon below actually reuses qt_rotate90_16_neon with negated strides.)
extern "C" void qt_rotate270_16_neon(quint16 *dst, const quint16 *src, int sstride, int dstride, int count);
887
// Rotate a w x h image of 16-bit pixels by 270 degrees, reading from
// srcPixels and writing to destPixels. Strides are given in BYTES.
// The work is tiled (tileSize x tileSize) — presumably for cache
// locality; TODO confirm against the matching 90-degree routine above.
// The wide inner loop reuses the 90-degree NEON kernel with negated
// strides so it walks both images backwards, which yields the
// 270-degree orientation.
void qt_memrotate270_16_neon(const uchar *srcPixels, int w, int h,
                             int sstride,
                             uchar *destPixels, int dstride)
{
    const ushort *src = (const ushort *)srcPixels;
    ushort *dest = (ushort *)destPixels;

    // Convert strides from bytes to units of 16-bit pixels.
    sstride /= sizeof(ushort);
    dstride /= sizeof(ushort);

    // Number of 16-bit pixels packed into one 32-bit store (== 2).
    const int pack = sizeof(quint32) / sizeof(ushort);
    // Leading pixels of each destination row copied one-by-one until the
    // write pointer reaches 32-bit alignment (capped at h source rows).
    const int unaligned =
        qMin(uint((long(dest) & (sizeof(quint32)-1)) / sizeof(ushort)), uint(h));
    // Remainders that do not fill a whole tile, and the final rows that
    // do not even fill a whole 32-bit pack (handled scalar, at the end).
    const int restX = w % tileSize;
    const int restY = (h - unaligned) % tileSize;
    const int unoptimizedY = restY % pack;
    const int numTilesX = w / tileSize + (restX > 0);
    const int numTilesY = (h - unaligned) / tileSize + (restY >= pack);

    for (int tx = 0; tx < numTilesX; ++tx) {
        const int startx = tx * tileSize;
        const int stopx = qMin(startx + tileSize, w);

        // Alignment prologue: for each column in this tile strip, copy the
        // first `unaligned` destination pixels individually. They come from
        // the bottom-most source rows (y counts down from h - 1).
        if (unaligned) {
            for (int x = startx; x < stopx; ++x) {
                ushort *d = dest + x * dstride;
                for (int y = h - 1; y >= h - unaligned; --y) {
                    *d++ = src[y * sstride + x];
                }
            }
        }

        for (int ty = 0; ty < numTilesY; ++ty) {
            // Source rows are consumed top-down in destination order, i.e.
            // starting near the bottom of the source and moving up.
            const int starty = h - 1 - unaligned - ty * tileSize;
            const int stopy = qMax(starty - tileSize, unoptimizedY);

            int x = startx;
            // qt_rotate90_16_neon writes to eight rows, four pixels at a time
            // (strides passed in bytes, hence the * 2). Pointing d at the
            // eighth row and negating both strides makes the 90-degree
            // kernel traverse everything in reverse, producing the
            // 270-degree result.
            for (; x < stopx - 7; x += 8) {
                ushort *d = dest + x * dstride + h - 1 - starty;
                const ushort *s = &src[starty * sstride + x];
                qt_rotate90_16_neon(d + 7 * dstride, s, -sstride * 2, -dstride * 2, starty - stopy);
            }

            // Scalar tail for the remaining (< 8) columns: pack `pack`
            // vertically adjacent source pixels into one 32-bit store.
            for (; x < stopx; ++x) {
                quint32 *d = reinterpret_cast<quint32*>(dest + x * dstride
                                                        + h - 1 - starty);
                for (int y = starty; y > stopy; y -= pack) {
                    quint32 c = src[y * sstride + x];
                    for (int i = 1; i < pack; ++i) {
                        // NOTE(review): lane order of this shift-combine
                        // assumes a little-endian target — TODO confirm.
                        const int shift = (sizeof(int) * 8 / pack * i);
                        const ushort color = src[(y - i) * sstride + x];
                        c |= color << shift;
                    }
                    *d++ = c;
                }
            }
        }
        // Epilogue: rows left over after packing (restY % pack), copied
        // one pixel at a time per column.
        if (unoptimizedY) {
            const int starty = unoptimizedY - 1;
            for (int x = startx; x < stopx; ++x) {
                ushort *d = dest + x * dstride + h - 1 - starty;
                for (int y = starty; y >= 0; --y) {
                    *d++ = src[y * sstride + x];
                }
            }
        }
    }
}
957
958 class QSimdNeon
959 {
960 public:
961 typedef int32x4_t Int32x4;
962 typedef float32x4_t Float32x4;
963
964 union Vect_buffer_i { Int32x4 v; int i[4]; };
965 union Vect_buffer_f { Float32x4 v; float f[4]; };
966
v_dup(float x)967 static inline Float32x4 v_dup(float x) { return vdupq_n_f32(x); }
v_dup(int x)968 static inline Int32x4 v_dup(int x) { return vdupq_n_s32(x); }
v_dup(uint x)969 static inline Int32x4 v_dup(uint x) { return vdupq_n_s32(x); }
970
v_add(Float32x4 a,Float32x4 b)971 static inline Float32x4 v_add(Float32x4 a, Float32x4 b) { return vaddq_f32(a, b); }
v_add(Int32x4 a,Int32x4 b)972 static inline Int32x4 v_add(Int32x4 a, Int32x4 b) { return vaddq_s32(a, b); }
973
v_max(Float32x4 a,Float32x4 b)974 static inline Float32x4 v_max(Float32x4 a, Float32x4 b) { return vmaxq_f32(a, b); }
v_min(Float32x4 a,Float32x4 b)975 static inline Float32x4 v_min(Float32x4 a, Float32x4 b) { return vminq_f32(a, b); }
v_min_16(Int32x4 a,Int32x4 b)976 static inline Int32x4 v_min_16(Int32x4 a, Int32x4 b) { return vminq_s32(a, b); }
977
v_and(Int32x4 a,Int32x4 b)978 static inline Int32x4 v_and(Int32x4 a, Int32x4 b) { return vandq_s32(a, b); }
979
v_sub(Float32x4 a,Float32x4 b)980 static inline Float32x4 v_sub(Float32x4 a, Float32x4 b) { return vsubq_f32(a, b); }
v_sub(Int32x4 a,Int32x4 b)981 static inline Int32x4 v_sub(Int32x4 a, Int32x4 b) { return vsubq_s32(a, b); }
982
v_mul(Float32x4 a,Float32x4 b)983 static inline Float32x4 v_mul(Float32x4 a, Float32x4 b) { return vmulq_f32(a, b); }
984
v_sqrt(Float32x4 x)985 static inline Float32x4 v_sqrt(Float32x4 x) { Float32x4 y = vrsqrteq_f32(x); y = vmulq_f32(y, vrsqrtsq_f32(x, vmulq_f32(y, y))); return vmulq_f32(x, y); }
986
v_toInt(Float32x4 x)987 static inline Int32x4 v_toInt(Float32x4 x) { return vcvtq_s32_f32(x); }
988
v_greaterOrEqual(Float32x4 a,Float32x4 b)989 static inline Int32x4 v_greaterOrEqual(Float32x4 a, Float32x4 b) { return vreinterpretq_s32_u32(vcgeq_f32(a, b)); }
990 };
991
qt_fetch_radial_gradient_neon(uint * buffer,const Operator * op,const QSpanData * data,int y,int x,int length)992 const uint * QT_FASTCALL qt_fetch_radial_gradient_neon(uint *buffer, const Operator *op, const QSpanData *data,
993 int y, int x, int length)
994 {
995 return qt_fetch_radial_gradient_template<QRadialFetchSimd<QSimdNeon> >(buffer, op, data, y, x, length);
996 }
997
998 QT_END_NAMESPACE
999
1000 #endif // QT_HAVE_NEON
1001
1002