/*
 * Copyright © 2009 Nokia Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
 */

/*
 * Copyright (c) 2018 RISC OS Open Ltd
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* Prevent the stack from becoming executable for no reason... */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

    .text
    .fpu neon
    .arch armv7a
    .object_arch armv4
    .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
    .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
    .arm
    .altmacro
    .p2align 2

#include "pixman-arm-asm.h"
#include "pixman-arm-neon-asm.h"

/* Global configuration options and preferences */

/*
 * The code can optionally make use of unaligned memory accesses to improve
 * the performance of handling leading/trailing pixels on each scanline.
 * The configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0, for
 * example on Linux, if unaligned memory accesses are not configured to
 * generate exceptions.
 */
.set RESPECT_STRICT_ALIGNMENT, 1

/*
 * Set the default prefetch type. There is a choice between the following
 * options:
 *
 * PREFETCH_TYPE_NONE (may be useful for ARM cores where PLD is set to work
 * as a NOP, to work around some hardware bugs or for whatever other reason)
 *
 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores
 * where advanced prefetch introduces heavy overhead)
 *
 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8,
 * which can run ARM and NEON instructions simultaneously, so the extra ARM
 * instructions do not add (many) extra cycles but improve prefetch efficiency)
 *
 * Note: some types of function (those which handle 24bpp pixels) can't
 *       support advanced prefetch and fall back to the simple one.
 */
.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED

/* Prefetch distance in pixels for simple prefetch */
.set PREFETCH_DISTANCE_SIMPLE, 64
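
/*
 * As an illustration of what "prefetch distance" means here, a scalar C
 * sketch of simple prefetch (a hypothetical copy loop; __builtin_prefetch
 * is the GCC/Clang intrinsic, and 64 mirrors PREFETCH_DISTANCE_SIMPLE):
 *
 *     #include <stdint.h>
 *
 *     void copy_scanline(uint32_t *dst, const uint32_t *src, int w)
 *     {
 *         for (int i = 0; i < w; i++) {
 *             __builtin_prefetch(&src[i + 64]);  // stay 64 pixels ahead
 *             dst[i] = src[i];
 *         }
 *     }
 */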

/******************************************************************************/

/* We can actually do significantly better than the Pixman macros, at least for
 * the case of fills, by using a carefully scheduled inner loop. Cortex-A53
 * shows an improvement of up to 78% in ideal cases (large fills to L1 cache).
 */

.macro generate_fillrect_function name, bpp, log2Bpp
/*
 * void name(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
 * (src widens to uint16_t/uint32_t for the 16bpp and 32bpp variants; see the
 * C-side sketch after the instantiations below)
 * On entry:
 * a1 = width, pixels
 * a2 = height, rows
 * a3 = pointer to top-left destination pixel
 * a4 = stride, pixels
 * [sp] = pixel value to fill with
 * Within the function:
 * v1 = width remaining
 * v2 = vst offset
 * v3 = alternate pointer
 * ip = data ARM register
 */
pixman_asm_function name
    vld1.\bpp   {d0[],d1[]}, [sp]
    sub         a4, a1
    vld1.\bpp   {d2[],d3[]}, [sp]
    cmp         a1, #(15+64) >> \log2Bpp
    push        {v1-v3,lr}
    vmov        ip, s0
    blo         51f

    /* Long-row case */
    mov         v2, #64
1:  mov         v1, a1
    ands        v3, a3, #15
    beq         2f
    /* Leading pixels */
    rsb         v3, v3, #16  /* number of leading bytes until 16-byte aligned */
    sub         v1, v1, v3, lsr #\log2Bpp
    rbit        v3, v3
.if bpp <= 16
.if bpp == 8
    tst         a3, #1       /* bit 0 unaffected by rsb so can avoid register interlock */
    strneb      ip, [a3], #1
    tst         v3, #1<<30
.else
    tst         a3, #2       /* bit 1 unaffected by rsb (assuming halfword alignment) so can avoid register interlock */
.endif
    strneh      ip, [a3], #2
.endif
    movs        v3, v3, lsl #3
    vstmcs      a3!, {s0}
    vstmmi      a3!, {d0}
2:  sub         v1, v1, #64 >> \log2Bpp /* simplifies inner loop termination */
    add         v3, a3, #32
    /* Inner loop */
3:  vst1.\bpp   {q0-q1}, [a3 :128], v2
    subs        v1, v1, #64 >> \log2Bpp
    vst1.\bpp   {q0-q1}, [v3 :128], v2
    bhs         3b
    /* Trailing pixels */
4:  movs        v1, v1, lsl #27 + \log2Bpp
    bcc         5f
    vst1.\bpp   {q0-q1}, [a3 :128]!
5:  bpl         6f
    vst1.\bpp   {q0}, [a3 :128]!
6:  movs        v1, v1, lsl #2
    vstmcs      a3!, {d0}
    vstmmi      a3!, {s0}
.if bpp <= 16
    movs        v1, v1, lsl #2
    strcsh      ip, [a3], #2
.if bpp == 8
    strmib      ip, [a3], #1
.endif
.endif
    subs        a2, a2, #1
    add         a3, a3, a4, lsl #\log2Bpp
    bhi         1b
    pop         {v1-v3,pc}

    /* Short-row case */
51: movs        v1, a1
.if bpp == 8
    tst         a3, #3
    beq         53f
52: subs        v1, v1, #1
    blo         57f
    strb        ip, [a3], #1
    tst         a3, #3
    bne         52b
.elseif bpp == 16
    tstne       a3, #2
    subne       v1, v1, #1
    strneh      ip, [a3], #2
.endif
53: cmp         v1, #32 >> \log2Bpp
    bcc         54f
    vst1.\bpp   {q0-q1}, [a3]!
    sub         v1, v1, #32 >> \log2Bpp
    /* Trailing pixels */
54: movs        v1, v1, lsl #27 + \log2Bpp
    bcc         55f
    vst1.\bpp   {q0-q1}, [a3]!
55: bpl         56f
    vst1.\bpp   {q0}, [a3]!
56: movs        v1, v1, lsl #2
    vstmcs      a3!, {d0}
    vstmmi      a3!, {s0}
.if bpp <= 16
    movs        v1, v1, lsl #2
    strcsh      ip, [a3], #2
.if bpp == 8
    strmib      ip, [a3], #1
.endif
.endif
    subs        a2, a2, #1
    add         a3, a3, a4, lsl #\log2Bpp
    bhi         51b
57: pop         {v1-v3,pc}

.endfunc
.endm

generate_fillrect_function FillRect32ARMNEONAsm, 32, 2
generate_fillrect_function FillRect16ARMNEONAsm, 16, 1
generate_fillrect_function FillRect8ARMNEONAsm,  8,  0
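
/*
 * C-side view (a hedged sketch of how a caller might declare and use these;
 * the real declarations live in the calling C code and may differ):
 *
 *     #include <stdint.h>
 *
 *     void FillRect32ARMNEONAsm(int32_t w, int32_t h, uint8_t *dst,
 *                               int32_t dst_stride, uint32_t src);
 *
 *     // Fill a w x h block of a 32bpp surface with a solid pixel value.
 *     // 'pixels' points at the top-left pixel; the stride is in pixels,
 *     // and the fill value arrives on the stack as the fifth argument.
 *     void fill_example(uint32_t *pixels, int32_t stride,
 *                       int32_t w, int32_t h)
 *     {
 *         FillRect32ARMNEONAsm(w, h, (uint8_t *)pixels, stride, 0xFF00FF00u);
 *     }
 */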

/******************************************************************************/

.macro RGBtoRGBPixelAlpha_process_pixblock_head
    vmvn        d30, d3  /* get inverted source alpha */
    vmov        d31, d7  /* dest alpha is always unchanged */
    vmull.u8    q14, d0, d3
    vmlal.u8    q14, d4, d30
    vmull.u8    q0, d1, d3
    vmlal.u8    q0, d5, d30
    vmull.u8    q1, d2, d3
    vmlal.u8    q1, d6, d30
    vrshr.u16   q2, q14, #8
    vrshr.u16   q3, q0, #8
    vraddhn.u16 d28, q14, q2
    vrshr.u16   q2, q1, #8
    vraddhn.u16 d29, q0, q3
    vraddhn.u16 d30, q1, q2
.endm
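
/*
 * Scalar C model of the math above (an illustrative sketch; the helper name
 * is hypothetical). The vrshr/vraddhn pair implements a rounding division
 * by 255 via the identity x/255 ~= (x + ((x + 128) >> 8) + 128) >> 8:
 *
 *     #include <stdint.h>
 *
 *     static inline uint8_t div255_round(uint16_t x)  // x <= 255*255
 *     {
 *         return (uint8_t)((x + ((x + 128) >> 8) + 128) >> 8);
 *     }
 *
 *     // per colour channel; destination alpha is left unchanged:
 *     // dst_c = div255_round(src_c * src_a + dst_c * (255 - src_a));
 */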

.macro RGBtoRGBPixelAlpha_process_pixblock_tail
    /* nothing */
.endm

.macro RGBtoRGBPixelAlpha_process_pixblock_tail_head
    vld4.8      {d0-d3}, [SRC]!
                                    PF add PF_X, PF_X, #8
        vst4.8      {d28-d31}, [DST_W :128]!
                                    PF tst PF_CTL, #0xF
    vld4.8      {d4-d7}, [DST_R :128]!
                                    PF addne PF_X, PF_X, #8
    vmvn        d30, d3  /* get inverted source alpha */
    vmov        d31, d7  /* dest alpha is always unchanged */
    vmull.u8    q14, d0, d3
                                    PF subne PF_CTL, PF_CTL, #1
    vmlal.u8    q14, d4, d30
                                    PF cmp PF_X, ORIG_W
    vmull.u8    q0, d1, d3
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vmlal.u8    q0, d5, d30
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vmull.u8    q1, d2, d3
                                    PF subge PF_X, PF_X, ORIG_W
    vmlal.u8    q1, d6, d30
                                    PF subges PF_CTL, PF_CTL, #0x10
    vrshr.u16   q2, q14, #8
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vrshr.u16   q3, q0, #8
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vraddhn.u16 d28, q14, q2
    vrshr.u16   q2, q1, #8
    vraddhn.u16 d29, q0, q3
    vraddhn.u16 d30, q1, q2
.endm
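
/*
 * The tail_head macro is the software-pipelined form of head/tail: while the
 * results of block N are being stored, the loads and multiplies for block
 * N+1 are already issued, interleaved with the ARM-side prefetch (PF) code.
 * Conceptually (a hedged C sketch of the pattern, not the real driver loop
 * in pixman-arm-neon-asm.h):
 *
 *     void pipeline(int nblocks, void (*load)(int), void (*head)(int),
 *                   void (*tail)(int), void (*store)(int))
 *     {
 *         load(0); head(0);                 // prologue: start block 0
 *         for (int n = 1; n < nblocks; n++) {
 *             tail(n - 1); store(n - 1);    // drain the previous block...
 *             load(n); head(n);             // ...while starting the next
 *         }
 *         tail(nblocks - 1); store(nblocks - 1);
 *     }
 */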

generate_composite_function \
    BlitRGBtoRGBPixelAlphaARMNEONAsm, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    RGBtoRGBPixelAlpha_process_pixblock_head, \
    RGBtoRGBPixelAlpha_process_pixblock_tail, \
    RGBtoRGBPixelAlpha_process_pixblock_tail_head
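
/*
 * C-side view of the generated symbol (a hedged sketch; the argument order
 * follows the usual width/height/dst/dst_stride/src/src_stride convention
 * of these composite functions and may differ from the real caller):
 *
 *     #include <stdint.h>
 *
 *     void BlitRGBtoRGBPixelAlphaARMNEONAsm(int32_t w, int32_t h,
 *                                           uint32_t *dst, int32_t dst_stride,
 *                                           uint32_t *src, int32_t src_stride);
 *
 * Strides are measured in pixels.
 */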

/******************************************************************************/

.macro ARGBto565PixelAlpha_process_pixblock_head
    vmvn        d6, d3       /* inverted source alpha */
    vshr.u8     d1, #2       /* source green to 6 bits */
    vshr.u8     d3, #3       /* source alpha to 5 bits */
    vshr.u8     d0, #3       /* source blue to 5 bits */
    vshrn.u16   d7, q2, #3   /* dest green (plus blue residue) */
    vshrn.u16   d25, q2, #8  /* dest red (plus green residue) */
    vbic.i16    q2, #0xe0    /* clear bits 5-7, isolating dest blue */
    vshr.u8     d6, #3       /* inverted alpha to 5 bits */
    vshr.u8     d7, #2       /* dest green to 6 bits */
    vshr.u8     d2, #3       /* source red to 5 bits */
    vmovn.u16   d24, q2      /* dest blue, 5 bits */
    vshr.u8     d25, #3      /* dest red to 5 bits */
    vmull.u8    q13, d1, d3  /* green: src * alpha */
    vmlal.u8    q13, d7, d6  /*      + dst * inv_alpha */
    vmull.u8    q14, d0, d3  /* blue */
    vmlal.u8    q14, d24, d6
    vmull.u8    q15, d2, d3  /* red */
    vmlal.u8    q15, d25, d6
.endm
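
/*
 * In scalar C terms, the vshrn/vbic/vmovn sequence above unpacks each RGB565
 * destination pixel held in q2 like this (illustrative sketch, hypothetical
 * helper):
 *
 *     #include <stdint.h>
 *
 *     static inline void unpack565(uint16_t pix,
 *                                  uint8_t *r5, uint8_t *g6, uint8_t *b5)
 *     {
 *         *b5 = pix & 0x1F;           // vbic #0xe0 + vmovn      -> d24
 *         *g6 = (pix >> 5) & 0x3F;    // vshrn #3, then vshr #2  -> d7
 *         *r5 = (pix >> 11) & 0x1F;   // vshrn #8, then vshr #3  -> d25
 *     }
 */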

.macro ARGBto565PixelAlpha_process_pixblock_tail
    vsra.u16    q13, #5
    vsra.u16    q14, #5
    vsra.u16    q15, #5
    vrshr.u16   q13, #5
    vrshr.u16   q14, #5
    vrshr.u16   q15, #5
    vsli.u16    q14, q13, #5
    vsli.u16    q14, q15, #11
.endm
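
/*
 * head leaves products of a 5/6-bit channel and a 5-bit alpha in q13-q15
 * (the 5-bit source and inverted alphas always sum to 31), so tail has to
 * divide by 31. vsra #5 followed by vrshr #5 is a rounding division by 31;
 * in scalar C (illustrative sketch, hypothetical helper):
 *
 *     #include <stdint.h>
 *
 *     static inline uint8_t div31_round(uint16_t x)  // x <= 63*31
 *     {
 *         x += x >> 5;                // vsra.u16  #5
 *         return (x + 16) >> 5;       // vrshr.u16 #5
 *     }
 *
 * The two vsli instructions then re-pack the blue/green/red results from
 * q14/q13/q15 back into RGB565.
 */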

.macro ARGBto565PixelAlpha_process_pixblock_tail_head
    vld4.8      {d0-d3}, [SRC]!
                                    PF add PF_X, PF_X, #8
        vsra.u16    q13, #5
                                    PF tst PF_CTL, #0xF
        vsra.u16    q14, #5
                                    PF addne PF_X, PF_X, #8
        vsra.u16    q15, #5
                                    PF subne PF_CTL, PF_CTL, #1
        vrshr.u16   q13, #5
                                    PF cmp PF_X, ORIG_W
        vrshr.u16   q14, #5
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
        vrshr.u16   q15, #5
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vld1.8      {d4-d5}, [DST_R]!
                                    PF subge PF_X, PF_X, ORIG_W
        vsli.u16    q14, q13, #5
                                    PF subges PF_CTL, PF_CTL, #0x10
        vsli.u16    q14, q15, #11
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
        vst1.8      {q14}, [DST_W :128]!
    vmvn        d6, d3
    vshr.u8     d1, #2
    vshr.u8     d3, #3
    vshr.u8     d0, #3
    vshrn.u16   d7, q2, #3
    vshrn.u16   d25, q2, #8
    vbic.i16    q2, #0xe0
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vshr.u8     d6, #3
    vshr.u8     d7, #2
    vshr.u8     d2, #3
    vmovn.u16   d24, q2
    vshr.u8     d25, #3
    vmull.u8    q13, d1, d3
    vmlal.u8    q13, d7, d6
    vmull.u8    q14, d0, d3
    vmlal.u8    q14, d24, d6
    vmull.u8    q15, d2, d3
    vmlal.u8    q15, d25, d6
.endm

generate_composite_function \
    BlitARGBto565PixelAlphaARMNEONAsm, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels processed in a single block */ \
    6, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    ARGBto565PixelAlpha_process_pixblock_head, \
    ARGBto565PixelAlpha_process_pixblock_tail, \
    ARGBto565PixelAlpha_process_pixblock_tail_head

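/*
 * C-side view of the generated symbol (a hedged sketch, same convention as
 * the RGB-to-RGB blit above; strides in pixels):
 *
 *     #include <stdint.h>
 *
 *     void BlitARGBto565PixelAlphaARMNEONAsm(int32_t w, int32_t h,
 *                                            uint16_t *dst, int32_t dst_stride,
 *                                            uint32_t *src, int32_t src_stride);
 */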