1/*
2 * Copyright © 2009 Nokia Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 *
23 * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
24 */
25
26/*
27 * This file contains implementations of NEON optimized pixel processing
28 * functions. There is no full and detailed tutorial, but some functions
29 * (those which are exposing some new or interesting features) are
30 * extensively commented and can be used as examples.
31 *
32 * You may want to have a look at the comments for following functions:
33 *  - pixman_composite_over_8888_0565_asm_neon
34 *  - pixman_composite_over_n_8_0565_asm_neon
35 */
36
37#if defined(ENABLE_PIXMAN_DRAWHELPERS)
38
39/* Prevent the stack from becoming executable for no reason... */
40#if defined(__linux__) && defined(__ELF__)
41.section .note.GNU-stack,"",%progbits
42#endif
43
44    .text
45    .fpu neon
46    .arch armv7a
47    .altmacro
48
49#include "pixman-arm-neon-asm.h"
50
51/* Global configuration options and preferences */
52
53/*
54 * The code can optionally make use of unaligned memory accesses to improve
55 * performance of handling leading/trailing pixels for each scanline.
56 * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
57 * example in linux if unaligned memory accesses are not configured to
58 * generate.exceptions.
59 */
60.set RESPECT_STRICT_ALIGNMENT, 1
61
62/*
63 * Set default prefetch type. There is a choice between the following options:
64 *
65 * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
66 * as NOP to workaround some HW bugs or for whatever other reason)
67 *
68 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
69 * advanced prefetch intruduces heavy overhead)
70 *
71 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
72 * which can run ARM and NEON instructions simultaneously so that extra ARM
73 * instructions do not add (many) extra cycles, but improve prefetch efficiency)
74 *
75 * Note: some types of function can't support advanced prefetch and fallback
76 *       to simple one (those which handle 24bpp pixels)
77 */
78.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
79
80/* Prefetch distance in pixels for simple prefetch */
81.set PREFETCH_DISTANCE_SIMPLE, 64
82
83/*
84 * Implementation of pixman_composite_over_8888_0565_asm_neon
85 *
86 * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
87 * performs OVER compositing operation. Function fast_composite_over_8888_0565
88 * from pixman-fast-path.c does the same in C and can be used as a reference.
89 *
90 * First we need to have some NEON assembly code which can do the actual
91 * operation on the pixels and provide it to the template macro.
92 *
93 * Template macro quite conveniently takes care of emitting all the necessary
94 * code for memory reading and writing (including quite tricky cases of
95 * handling unaligned leading/trailing pixels), so we only need to deal with
96 * the data in NEON registers.
97 *
98 * NEON registers allocation in general is recommented to be the following:
99 * d0,  d1,  d2,  d3  - contain loaded source pixel data
100 * d4,  d5,  d6,  d7  - contain loaded destination pixels (if they are needed)
101 * d24, d25, d26, d27 - contain loading mask pixel data (if mask is used)
102 * d28, d29, d30, d31 - place for storing the result (destination pixels)
103 *
104 * As can be seen above, four 64-bit NEON registers are used for keeping
105 * intermediate pixel data and up to 8 pixels can be processed in one step
106 * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
107 *
108 * This particular function uses the following registers allocation:
109 * d0,  d1,  d2,  d3  - contain loaded source pixel data
110 * d4,  d5            - contain loaded destination pixels (they are needed)
111 * d28, d29           - place for storing the result (destination pixels)
112 */
113
114/*
115 * Step one. We need to have some code to do some arithmetics on pixel data.
116 * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
117 * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
118 * perform all the needed calculations and write the result to {d28, d29}.
119 * The rationale for having two macros and not just one will be explained
120 * later. In practice, any single monolitic function which does the work can
121 * be split into two parts in any arbitrary way without affecting correctness.
122 *
123 * There is one special trick here too. Common template macro can optionally
124 * make our life a bit easier by doing R, G, B, A color components
125 * deinterleaving for 32bpp pixel formats (and this feature is used in
126 * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
127 * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
128 * actually use d0 register for blue channel (a vector of eight 8-bit
129 * values), d1 register for green, d2 for red and d3 for alpha. This
130 * simple conversion can be also done with a few NEON instructions:
131 *
132 * Packed to planar conversion:
133 *  vuzp.8 d0, d1
134 *  vuzp.8 d2, d3
135 *  vuzp.8 d1, d3
136 *  vuzp.8 d0, d2
137 *
138 * Planar to packed conversion:
139 *  vzip.8 d0, d2
140 *  vzip.8 d1, d3
141 *  vzip.8 d2, d3
142 *  vzip.8 d0, d1
143 *
144 * But pixel can be loaded directly in planar format using VLD4.8 NEON
145 * instruction. It is 1 cycle slower than VLD1.32, so this is not always
146 * desirable, that's why deinterleaving is optional.
147 *
148 * But anyway, here is the code:
149 */
150.macro pixman_composite_over_8888_0565_process_pixblock_head
151    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
152       and put data into d6 - red, d7 - green, d30 - blue */
153    vshrn.u16   d6, q2, #8
154    vshrn.u16   d7, q2, #3
155    vsli.u16    q2, q2, #5
156    vsri.u8     d6, d6, #5
157    vmvn.8      d3, d3      /* invert source alpha */
158    vsri.u8     d7, d7, #6
159    vshrn.u16   d30, q2, #2
160    /* now do alpha blending, storing results in 8-bit planar format
161       into d16 - red, d19 - green, d18 - blue */
162    vmull.u8    q10, d3, d6
163    vmull.u8    q11, d3, d7
164    vmull.u8    q12, d3, d30
165    vrshr.u16   q13, q10, #8
166    vrshr.u16   q3, q11, #8
167    vrshr.u16   q15, q12, #8
168    vraddhn.u16 d20, q10, q13
169    vraddhn.u16 d23, q11, q3
170    vraddhn.u16 d22, q12, q15
171.endm
172
173.macro pixman_composite_over_8888_0565_process_pixblock_tail
174    /* ... continue alpha blending */
175    vqadd.u8    d16, d2, d20
176    vqadd.u8    q9, q0, q11
177    /* convert the result to r5g6b5 and store it into {d28, d29} */
178    vshll.u8    q14, d16, #8
179    vshll.u8    q8, d19, #8
180    vshll.u8    q9, d18, #8
181    vsri.u16    q14, q8, #5
182    vsri.u16    q14, q9, #11
183.endm
184
185/*
186 * OK, now we got almost everything that we need. Using the above two
187 * macros, the work can be done right. But now we want to optimize
188 * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
189 * a lot from good code scheduling and software pipelining.
190 *
191 * Let's construct some code, which will run in the core main loop.
192 * Some pseudo-code of the main loop will look like this:
193 *   head
194 *   while (...) {
195 *     tail
196 *     head
197 *   }
198 *   tail
199 *
200 * It may look a bit weird, but this setup allows to hide instruction
201 * latencies better and also utilize dual-issue capability more
202 * efficiently (make pairs of load-store and ALU instructions).
203 *
204 * So what we need now is a '*_tail_head' macro, which will be used
205 * in the core main loop. A trivial straightforward implementation
206 * of this macro would look like this:
207 *
208 *   pixman_composite_over_8888_0565_process_pixblock_tail
209 *   vst1.16     {d28, d29}, [DST_W, :128]!
210 *   vld1.16     {d4, d5}, [DST_R, :128]!
211 *   vld4.32     {d0, d1, d2, d3}, [SRC]!
212 *   pixman_composite_over_8888_0565_process_pixblock_head
213 *   cache_preload 8, 8
214 *
215 * Now it also got some VLD/VST instructions. We simply can't move from
216 * processing one block of pixels to the other one with just arithmetics.
217 * The previously processed data needs to be written to memory and new
218 * data needs to be fetched. Fortunately, this main loop does not deal
219 * with partial leading/trailing pixels and can load/store a full block
220 * of pixels in a bulk. Additionally, destination buffer is already
221 * 16 bytes aligned here (which is good for performance).
222 *
223 * New things here are DST_R, DST_W, SRC and MASK identifiers. These
224 * are the aliases for ARM registers which are used as pointers for
225 * accessing data. We maintain separate pointers for reading and writing
226 * destination buffer (DST_R and DST_W).
227 *
228 * Another new thing is 'cache_preload' macro. It is used for prefetching
229 * data into CPU L2 cache and improve performance when dealing with large
230 * images which are far larger than cache size. It uses one argument
231 * (actually two, but they need to be the same here) - number of pixels
232 * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
233 * details about this macro. Moreover, if good performance is needed
234 * the code from this macro needs to be copied into '*_tail_head' macro
235 * and mixed with the rest of code for optimal instructions scheduling.
236 * We are actually doing it below.
237 *
238 * Now after all the explanations, here is the optimized code.
239 * Different instruction streams (originaling from '*_head', '*_tail'
240 * and 'cache_preload' macro) use different indentation levels for
241 * better readability. Actually taking the code from one of these
242 * indentation levels and ignoring a few VLD/VST instructions would
243 * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
244 * macro!
245 */
246
247#if 1
248
249.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
250        vqadd.u8    d16, d2, d20
251    vld1.16     {d4, d5}, [DST_R, :128]!
252        vqadd.u8    q9, q0, q11
253    vshrn.u16   d6, q2, #8
254    vld4.8      {d0, d1, d2, d3}, [SRC]!
255    vshrn.u16   d7, q2, #3
256    vsli.u16    q2, q2, #5
257        vshll.u8    q14, d16, #8
258                                    PF add PF_X, PF_X, #8
259        vshll.u8    q8, d19, #8
260                                    PF tst PF_CTL, #0xF
261    vsri.u8     d6, d6, #5
262                                    PF addne PF_X, PF_X, #8
263    vmvn.8      d3, d3
264                                    PF subne PF_CTL, PF_CTL, #1
265    vsri.u8     d7, d7, #6
266    vshrn.u16   d30, q2, #2
267    vmull.u8    q10, d3, d6
268                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
269    vmull.u8    q11, d3, d7
270    vmull.u8    q12, d3, d30
271                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
272        vsri.u16    q14, q8, #5
273                                    PF cmp PF_X, ORIG_W
274        vshll.u8    q9, d18, #8
275    vrshr.u16   q13, q10, #8
276                                    PF subge PF_X, PF_X, ORIG_W
277    vrshr.u16   q3, q11, #8
278    vrshr.u16   q15, q12, #8
279                                    PF subges PF_CTL, PF_CTL, #0x10
280        vsri.u16    q14, q9, #11
281                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
282    vraddhn.u16 d20, q10, q13
283    vraddhn.u16 d23, q11, q3
284                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
285    vraddhn.u16 d22, q12, q15
286        vst1.16     {d28, d29}, [DST_W, :128]!
287.endm
288
289#else
290
291/* If we did not care much about the performance, we would just use this... */
292.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
293    pixman_composite_over_8888_0565_process_pixblock_tail
294    vst1.16     {d28, d29}, [DST_W, :128]!
295    vld1.16     {d4, d5}, [DST_R, :128]!
296    vld4.32     {d0, d1, d2, d3}, [SRC]!
297    pixman_composite_over_8888_0565_process_pixblock_head
298    cache_preload 8, 8
299.endm
300
301#endif
302
303/*
304 * And now the final part. We are using 'generate_composite_function' macro
305 * to put all the stuff together. We are specifying the name of the function
306 * which we want to get, number of bits per pixel for the source, mask and
307 * destination (0 if unused, like mask in this case). Next come some bit
308 * flags:
309 *   FLAG_DST_READWRITE      - tells that the destination buffer is both read
310 *                             and written, for write-only buffer we would use
311 *                             FLAG_DST_WRITEONLY flag instead
312 *   FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
313 *                             and separate color channels for 32bpp format.
314 * The next things are:
315 *  - the number of pixels processed per iteration (8 in this case, because
316 *    that's the maximum what can fit into four 64-bit NEON registers).
317 *  - prefetch distance, measured in pixel blocks. In this case it is 5 times
318 *    by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
319 *    prefetch distance can be selected by running some benchmarks.
320 *
321 * After that we specify some macros, these are 'default_init',
322 * 'default_cleanup' here which are empty (but it is possible to have custom
323 * init/cleanup macros to be able to save/restore some extra NEON registers
324 * like d8-d15 or do anything else) followed by
325 * 'pixman_composite_over_8888_0565_process_pixblock_head',
326 * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
327 * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
328 * which we got implemented above.
329 *
330 * The last part is the NEON registers allocation scheme.
331 */
332generate_composite_function \
333    pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
334    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
335    8, /* number of pixels, processed in a single block */ \
336    5, /* prefetch distance */ \
337    default_init, \
338    default_cleanup, \
339    pixman_composite_over_8888_0565_process_pixblock_head, \
340    pixman_composite_over_8888_0565_process_pixblock_tail, \
341    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
342    28, /* dst_w_basereg */ \
343    4,  /* dst_r_basereg */ \
344    0,  /* src_basereg   */ \
345    24  /* mask_basereg  */
346
347/******************************************************************************/
348
349.macro pixman_composite_over_n_0565_process_pixblock_head
350    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
351       and put data into d6 - red, d7 - green, d30 - blue */
352    vshrn.u16   d6, q2, #8
353    vshrn.u16   d7, q2, #3
354    vsli.u16    q2, q2, #5
355    vsri.u8     d6, d6, #5
356    vsri.u8     d7, d7, #6
357    vshrn.u16   d30, q2, #2
358    /* now do alpha blending, storing results in 8-bit planar format
359       into d16 - red, d19 - green, d18 - blue */
360    vmull.u8    q10, d3, d6
361    vmull.u8    q11, d3, d7
362    vmull.u8    q12, d3, d30
363    vrshr.u16   q13, q10, #8
364    vrshr.u16   q3, q11, #8
365    vrshr.u16   q15, q12, #8
366    vraddhn.u16 d20, q10, q13
367    vraddhn.u16 d23, q11, q3
368    vraddhn.u16 d22, q12, q15
369.endm
370
371.macro pixman_composite_over_n_0565_process_pixblock_tail
372    /* ... continue alpha blending */
373    vqadd.u8    d16, d2, d20
374    vqadd.u8    q9, q0, q11
375    /* convert the result to r5g6b5 and store it into {d28, d29} */
376    vshll.u8    q14, d16, #8
377    vshll.u8    q8, d19, #8
378    vshll.u8    q9, d18, #8
379    vsri.u16    q14, q8, #5
380    vsri.u16    q14, q9, #11
381.endm
382
383/* TODO: expand macros and do better instructions scheduling */
384.macro pixman_composite_over_n_0565_process_pixblock_tail_head
385    pixman_composite_over_n_0565_process_pixblock_tail
386    vld1.16     {d4, d5}, [DST_R, :128]!
387    vst1.16     {d28, d29}, [DST_W, :128]!
388    pixman_composite_over_n_0565_process_pixblock_head
389.endm
390
391.macro pixman_composite_over_n_0565_init
392    add         DUMMY, sp, #ARGS_STACK_OFFSET
393    vld1.32     {d3[0]}, [DUMMY]
394    vdup.8      d0, d3[0]
395    vdup.8      d1, d3[1]
396    vdup.8      d2, d3[2]
397    vdup.8      d3, d3[3]
398    vmvn.8      d3, d3      /* invert source alpha */
399.endm
400
401generate_composite_function \
402    pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
403    FLAG_DST_READWRITE, \
404    8, /* number of pixels, processed in a single block */ \
405    5, /* prefetch distance */ \
406    pixman_composite_over_n_0565_init, \
407    default_cleanup, \
408    pixman_composite_over_n_0565_process_pixblock_head, \
409    pixman_composite_over_n_0565_process_pixblock_tail, \
410    pixman_composite_over_n_0565_process_pixblock_tail_head, \
411    28, /* dst_w_basereg */ \
412    4,  /* dst_r_basereg */ \
413    0,  /* src_basereg   */ \
414    24  /* mask_basereg  */
415
416/******************************************************************************/
417
418.macro pixman_composite_src_8888_0565_process_pixblock_head
419    vshll.u8    q8, d1, #8
420    vshll.u8    q14, d2, #8
421    vshll.u8    q9, d0, #8
422.endm
423
424.macro pixman_composite_src_8888_0565_process_pixblock_tail
425    vsri.u16    q14, q8, #5
426    vsri.u16    q14, q9, #11
427.endm
428
429.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
430        vsri.u16    q14, q8, #5
431                                    PF add PF_X, PF_X, #8
432                                    PF tst PF_CTL, #0xF
433    vld4.8      {d0, d1, d2, d3}, [SRC]!
434                                    PF addne PF_X, PF_X, #8
435                                    PF subne PF_CTL, PF_CTL, #1
436        vsri.u16    q14, q9, #11
437                                    PF cmp PF_X, ORIG_W
438                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
439    vshll.u8    q8, d1, #8
440        vst1.16     {d28, d29}, [DST_W, :128]!
441                                    PF subge PF_X, PF_X, ORIG_W
442                                    PF subges PF_CTL, PF_CTL, #0x10
443    vshll.u8    q14, d2, #8
444                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
445    vshll.u8    q9, d0, #8
446.endm
447
448generate_composite_function \
449    pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
450    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
451    8, /* number of pixels, processed in a single block */ \
452    10, /* prefetch distance */ \
453    default_init, \
454    default_cleanup, \
455    pixman_composite_src_8888_0565_process_pixblock_head, \
456    pixman_composite_src_8888_0565_process_pixblock_tail, \
457    pixman_composite_src_8888_0565_process_pixblock_tail_head
458
459/******************************************************************************/
460
461.macro pixman_composite_src_0565_8888_process_pixblock_head
462    vshrn.u16   d30, q0, #8
463    vshrn.u16   d29, q0, #3
464    vsli.u16    q0, q0, #5
465    vmov.u8     d31, #255
466    vsri.u8     d30, d30, #5
467    vsri.u8     d29, d29, #6
468    vshrn.u16   d28, q0, #2
469.endm
470
471.macro pixman_composite_src_0565_8888_process_pixblock_tail
472.endm
473
474/* TODO: expand macros and do better instructions scheduling */
475.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
476    pixman_composite_src_0565_8888_process_pixblock_tail
477    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
478    vld1.16    {d0, d1}, [SRC]!
479    pixman_composite_src_0565_8888_process_pixblock_head
480    cache_preload 8, 8
481.endm
482
483generate_composite_function \
484    pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
485    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
486    8, /* number of pixels, processed in a single block */ \
487    10, /* prefetch distance */ \
488    default_init, \
489    default_cleanup, \
490    pixman_composite_src_0565_8888_process_pixblock_head, \
491    pixman_composite_src_0565_8888_process_pixblock_tail, \
492    pixman_composite_src_0565_8888_process_pixblock_tail_head
493
494/******************************************************************************/
495
496.macro pixman_composite_add_8000_8000_process_pixblock_head
497    vqadd.u8    q14, q0, q2
498    vqadd.u8    q15, q1, q3
499.endm
500
501.macro pixman_composite_add_8000_8000_process_pixblock_tail
502.endm
503
504.macro pixman_composite_add_8000_8000_process_pixblock_tail_head
505    vld1.8      {d0, d1, d2, d3}, [SRC]!
506                                    PF add PF_X, PF_X, #32
507                                    PF tst PF_CTL, #0xF
508    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
509                                    PF addne PF_X, PF_X, #32
510                                    PF subne PF_CTL, PF_CTL, #1
511        vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
512                                    PF cmp PF_X, ORIG_W
513                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
514                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
515                                    PF subge PF_X, PF_X, ORIG_W
516                                    PF subges PF_CTL, PF_CTL, #0x10
517    vqadd.u8    q14, q0, q2
518                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
519                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
520    vqadd.u8    q15, q1, q3
521.endm
522
523generate_composite_function \
524    pixman_composite_add_8000_8000_asm_neon, 8, 0, 8, \
525    FLAG_DST_READWRITE, \
526    32, /* number of pixels, processed in a single block */ \
527    10, /* prefetch distance */ \
528    default_init, \
529    default_cleanup, \
530    pixman_composite_add_8000_8000_process_pixblock_head, \
531    pixman_composite_add_8000_8000_process_pixblock_tail, \
532    pixman_composite_add_8000_8000_process_pixblock_tail_head
533
534/******************************************************************************/
535
536.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
537    vld1.8      {d0, d1, d2, d3}, [SRC]!
538                                    PF add PF_X, PF_X, #8
539                                    PF tst PF_CTL, #0xF
540    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
541                                    PF addne PF_X, PF_X, #8
542                                    PF subne PF_CTL, PF_CTL, #1
543        vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
544                                    PF cmp PF_X, ORIG_W
545                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
546                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
547                                    PF subge PF_X, PF_X, ORIG_W
548                                    PF subges PF_CTL, PF_CTL, #0x10
549    vqadd.u8    q14, q0, q2
550                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
551                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
552    vqadd.u8    q15, q1, q3
553.endm
554
555generate_composite_function \
556    pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
557    FLAG_DST_READWRITE, \
558    8, /* number of pixels, processed in a single block */ \
559    10, /* prefetch distance */ \
560    default_init, \
561    default_cleanup, \
562    pixman_composite_add_8000_8000_process_pixblock_head, \
563    pixman_composite_add_8000_8000_process_pixblock_tail, \
564    pixman_composite_add_8888_8888_process_pixblock_tail_head
565
566generate_composite_function_single_scanline \
567    pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
568    FLAG_DST_READWRITE, \
569    8, /* number of pixels, processed in a single block */ \
570    default_init, \
571    default_cleanup, \
572    pixman_composite_add_8000_8000_process_pixblock_head, \
573    pixman_composite_add_8000_8000_process_pixblock_tail, \
574    pixman_composite_add_8888_8888_process_pixblock_tail_head
575
576/******************************************************************************/
577
578.macro pixman_composite_over_8888_8888_process_pixblock_head
579    vmvn.8      d24, d3  /* get inverted alpha */
580    /* do alpha blending */
581    vmull.u8    q8, d24, d4
582    vmull.u8    q9, d24, d5
583    vmull.u8    q10, d24, d6
584    vmull.u8    q11, d24, d7
585.endm
586
587.macro pixman_composite_over_8888_8888_process_pixblock_tail
588    vrshr.u16   q14, q8, #8
589    vrshr.u16   q15, q9, #8
590    vrshr.u16   q12, q10, #8
591    vrshr.u16   q13, q11, #8
592    vraddhn.u16 d28, q14, q8
593    vraddhn.u16 d29, q15, q9
594    vraddhn.u16 d30, q12, q10
595    vraddhn.u16 d31, q13, q11
596    vqadd.u8    q14, q0, q14
597    vqadd.u8    q15, q1, q15
598.endm
599
600.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
601    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
602        vrshr.u16   q14, q8, #8
603                                    PF add PF_X, PF_X, #8
604                                    PF tst PF_CTL, #0xF
605        vrshr.u16   q15, q9, #8
606        vrshr.u16   q12, q10, #8
607        vrshr.u16   q13, q11, #8
608                                    PF addne PF_X, PF_X, #8
609                                    PF subne PF_CTL, PF_CTL, #1
610        vraddhn.u16 d28, q14, q8
611        vraddhn.u16 d29, q15, q9
612                                    PF cmp PF_X, ORIG_W
613        vraddhn.u16 d30, q12, q10
614        vraddhn.u16 d31, q13, q11
615        vqadd.u8    q14, q0, q14
616        vqadd.u8    q15, q1, q15
617    vld4.8      {d0, d1, d2, d3}, [SRC]!
618                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
619    vmvn.8      d22, d3
620                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
621        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
622                                    PF subge PF_X, PF_X, ORIG_W
623    vmull.u8    q8, d22, d4
624                                    PF subges PF_CTL, PF_CTL, #0x10
625    vmull.u8    q9, d22, d5
626                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
627    vmull.u8    q10, d22, d6
628                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
629    vmull.u8    q11, d22, d7
630.endm
631
632generate_composite_function \
633    pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
634    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
635    8, /* number of pixels, processed in a single block */ \
636    5, /* prefetch distance */ \
637    default_init, \
638    default_cleanup, \
639    pixman_composite_over_8888_8888_process_pixblock_head, \
640    pixman_composite_over_8888_8888_process_pixblock_tail, \
641    pixman_composite_over_8888_8888_process_pixblock_tail_head
642
643generate_composite_function_single_scanline \
644    pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
645    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
646    8, /* number of pixels, processed in a single block */ \
647    default_init, \
648    default_cleanup, \
649    pixman_composite_over_8888_8888_process_pixblock_head, \
650    pixman_composite_over_8888_8888_process_pixblock_tail, \
651    pixman_composite_over_8888_8888_process_pixblock_tail_head
652
653/******************************************************************************/
654
655/* TODO: expand macros and do better instructions scheduling */
656.macro pixman_composite_over_n_8888_process_pixblock_tail_head
657    pixman_composite_over_8888_8888_process_pixblock_tail
658    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
659    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
660    pixman_composite_over_8888_8888_process_pixblock_head
661.endm
662
663.macro pixman_composite_over_n_8888_init
664    add         DUMMY, sp, #ARGS_STACK_OFFSET
665    vld1.32     {d3[0]}, [DUMMY]
666    vdup.8      d0, d3[0]
667    vdup.8      d1, d3[1]
668    vdup.8      d2, d3[2]
669    vdup.8      d3, d3[3]
670.endm
671
672generate_composite_function \
673    pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
674    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
675    8, /* number of pixels, processed in a single block */ \
676    5, /* prefetch distance */ \
677    pixman_composite_over_n_8888_init, \
678    default_cleanup, \
679    pixman_composite_over_8888_8888_process_pixblock_head, \
680    pixman_composite_over_8888_8888_process_pixblock_tail, \
681    pixman_composite_over_n_8888_process_pixblock_tail_head
682
683/******************************************************************************/
684
685.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
686        vrshr.u16   q14, q8, #8
687                                    PF add PF_X, PF_X, #8
688                                    PF tst PF_CTL, #0xF
689        vrshr.u16   q15, q9, #8
690        vrshr.u16   q12, q10, #8
691        vrshr.u16   q13, q11, #8
692                                    PF addne PF_X, PF_X, #8
693                                    PF subne PF_CTL, PF_CTL, #1
694        vraddhn.u16 d28, q14, q8
695        vraddhn.u16 d29, q15, q9
696                                    PF cmp PF_X, ORIG_W
697        vraddhn.u16 d30, q12, q10
698        vraddhn.u16 d31, q13, q11
699        vqadd.u8    q14, q0, q14
700        vqadd.u8    q15, q1, q15
701    vld4.8      {d0, d1, d2, d3}, [DST_R, :128]!
702    vmvn.8      d22, d3
703                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
704        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
705                                    PF subge PF_X, PF_X, ORIG_W
706    vmull.u8    q8, d22, d4
707                                    PF subges PF_CTL, PF_CTL, #0x10
708    vmull.u8    q9, d22, d5
709    vmull.u8    q10, d22, d6
710                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
711    vmull.u8    q11, d22, d7
712.endm
713
714.macro pixman_composite_over_reverse_n_8888_init
715    add         DUMMY, sp, #ARGS_STACK_OFFSET
716    vld1.32     {d7[0]}, [DUMMY]
717    vdup.8      d4, d7[0]
718    vdup.8      d5, d7[1]
719    vdup.8      d6, d7[2]
720    vdup.8      d7, d7[3]
721.endm
722
723generate_composite_function \
724    pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
725    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
726    8, /* number of pixels, processed in a single block */ \
727    5, /* prefetch distance */ \
728    pixman_composite_over_reverse_n_8888_init, \
729    default_cleanup, \
730    pixman_composite_over_8888_8888_process_pixblock_head, \
731    pixman_composite_over_8888_8888_process_pixblock_tail, \
732    pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
733    28, /* dst_w_basereg */ \
734    0,  /* dst_r_basereg */ \
735    4,  /* src_basereg   */ \
736    24  /* mask_basereg  */
737
738/******************************************************************************/
739
740.macro pixman_composite_over_n_8_0565_process_pixblock_head
741    /* in */
742    vmull.u8    q0, d24, d8
743    vmull.u8    q1, d24, d9
744    vmull.u8    q6, d24, d10
745    vmull.u8    q7, d24, d11
746    vrshr.u16   q10, q0, #8
747    vrshr.u16   q11, q1, #8
748    vrshr.u16   q12, q6, #8
749    vrshr.u16   q13, q7, #8
750    vraddhn.u16 d0, q0, q10
751    vraddhn.u16 d1, q1, q11
752    vraddhn.u16 d2, q6, q12
753    vraddhn.u16 d3, q7, q13
754
755    vshrn.u16   d6, q2, #8
756    vshrn.u16   d7, q2, #3
757    vsli.u16    q2, q2, #5
758    vsri.u8     d6, d6, #5
759    vmvn.8      d3, d3
760    vsri.u8     d7, d7, #6
761    vshrn.u16   d30, q2, #2
762    /* now do alpha blending */
763    vmull.u8    q10, d3, d6
764    vmull.u8    q11, d3, d7
765    vmull.u8    q12, d3, d30
766    vrshr.u16   q13, q10, #8
767    vrshr.u16   q3, q11, #8
768    vrshr.u16   q15, q12, #8
769    vraddhn.u16 d20, q10, q13
770    vraddhn.u16 d23, q11, q3
771    vraddhn.u16 d22, q12, q15
772.endm
773
774.macro pixman_composite_over_n_8_0565_process_pixblock_tail
775    vqadd.u8    d16, d2, d20
776    vqadd.u8    q9, q0, q11
777    /* convert to r5g6b5 */
778    vshll.u8    q14, d16, #8
779    vshll.u8    q8, d19, #8
780    vshll.u8    q9, d18, #8
781    vsri.u16    q14, q8, #5
782    vsri.u16    q14, q9, #11
783.endm
784
785/* TODO: expand macros and do better instructions scheduling */
786.macro pixman_composite_over_n_8_0565_process_pixblock_tail_head
787    pixman_composite_over_n_8_0565_process_pixblock_tail
788    vst1.16     {d28, d29}, [DST_W, :128]!
789    vld1.16     {d4, d5}, [DST_R, :128]!
790    vld1.8      {d24}, [MASK]!
791    cache_preload 8, 8
792    pixman_composite_over_n_8_0565_process_pixblock_head
793.endm
794
795/*
796 * This function needs a special initialization of solid mask.
797 * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
798 * offset, split into color components and replicated in d8-d11
799 * registers. Additionally, this function needs all the NEON registers,
800 * so it has to save d8-d15 registers which are callee saved according
801 * to ABI. These registers are restored from 'cleanup' macro. All the
802 * other NEON registers are caller saved, so can be clobbered freely
803 * without introducing any problems.
804 */
805.macro pixman_composite_over_n_8_0565_init
806    add         DUMMY, sp, #ARGS_STACK_OFFSET
807    vpush       {d8-d15}
808    vld1.32     {d11[0]}, [DUMMY]
809    vdup.8      d8, d11[0]
810    vdup.8      d9, d11[1]
811    vdup.8      d10, d11[2]
812    vdup.8      d11, d11[3]
813.endm
814
815.macro pixman_composite_over_n_8_0565_cleanup
816    vpop        {d8-d15}
817.endm
818
819generate_composite_function \
820    pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
821    FLAG_DST_READWRITE, \
822    8, /* number of pixels, processed in a single block */ \
823    5, /* prefetch distance */ \
824    pixman_composite_over_n_8_0565_init, \
825    pixman_composite_over_n_8_0565_cleanup, \
826    pixman_composite_over_n_8_0565_process_pixblock_head, \
827    pixman_composite_over_n_8_0565_process_pixblock_tail, \
828    pixman_composite_over_n_8_0565_process_pixblock_tail_head
829
830/******************************************************************************/
831
832.macro pixman_composite_src_0565_0565_process_pixblock_head
833.endm
834
835.macro pixman_composite_src_0565_0565_process_pixblock_tail
836.endm
837
838.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
839    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
840    vld1.16 {d0, d1, d2, d3}, [SRC]!
841    cache_preload 16, 16
842.endm
843
844generate_composite_function \
845    pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
846    FLAG_DST_WRITEONLY, \
847    16, /* number of pixels, processed in a single block */ \
848    10, /* prefetch distance */ \
849    default_init, \
850    default_cleanup, \
851    pixman_composite_src_0565_0565_process_pixblock_head, \
852    pixman_composite_src_0565_0565_process_pixblock_tail, \
853    pixman_composite_src_0565_0565_process_pixblock_tail_head, \
854    0, /* dst_w_basereg */ \
855    0, /* dst_r_basereg */ \
856    0, /* src_basereg   */ \
857    0  /* mask_basereg  */
858
859/******************************************************************************/
860
861.macro pixman_composite_src_n_8_process_pixblock_head
862.endm
863
864.macro pixman_composite_src_n_8_process_pixblock_tail
865.endm
866
867.macro pixman_composite_src_n_8_process_pixblock_tail_head
868    vst1.8  {d0, d1, d2, d3}, [DST_W, :128]!
869.endm
870
871.macro pixman_composite_src_n_8_init
872    add         DUMMY, sp, #ARGS_STACK_OFFSET
873    vld1.32     {d0[0]}, [DUMMY]
874    vsli.u64    d0, d0, #8
875    vsli.u64    d0, d0, #16
876    vsli.u64    d0, d0, #32
877    vmov        d1, d0
878    vmov        q1, q0
879.endm
880
881.macro pixman_composite_src_n_8_cleanup
882.endm
883
884generate_composite_function \
885    pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
886    FLAG_DST_WRITEONLY, \
887    32, /* number of pixels, processed in a single block */ \
888    0,  /* prefetch distance */ \
889    pixman_composite_src_n_8_init, \
890    pixman_composite_src_n_8_cleanup, \
891    pixman_composite_src_n_8_process_pixblock_head, \
892    pixman_composite_src_n_8_process_pixblock_tail, \
893    pixman_composite_src_n_8_process_pixblock_tail_head, \
894    0, /* dst_w_basereg */ \
895    0, /* dst_r_basereg */ \
896    0, /* src_basereg   */ \
897    0  /* mask_basereg  */
898
899/******************************************************************************/
900
901.macro pixman_composite_src_n_0565_process_pixblock_head
902.endm
903
904.macro pixman_composite_src_n_0565_process_pixblock_tail
905.endm
906
907.macro pixman_composite_src_n_0565_process_pixblock_tail_head
908    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
909.endm
910
911.macro pixman_composite_src_n_0565_init
912    add         DUMMY, sp, #ARGS_STACK_OFFSET
913    vld1.32     {d0[0]}, [DUMMY]
914    vsli.u64    d0, d0, #16
915    vsli.u64    d0, d0, #32
916    vmov        d1, d0
917    vmov        q1, q0
918.endm
919
920.macro pixman_composite_src_n_0565_cleanup
921.endm
922
923generate_composite_function \
924    pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
925    FLAG_DST_WRITEONLY, \
926    16, /* number of pixels, processed in a single block */ \
927    0,  /* prefetch distance */ \
928    pixman_composite_src_n_0565_init, \
929    pixman_composite_src_n_0565_cleanup, \
930    pixman_composite_src_n_0565_process_pixblock_head, \
931    pixman_composite_src_n_0565_process_pixblock_tail, \
932    pixman_composite_src_n_0565_process_pixblock_tail_head, \
933    0, /* dst_w_basereg */ \
934    0, /* dst_r_basereg */ \
935    0, /* src_basereg   */ \
936    0  /* mask_basereg  */
937
938/******************************************************************************/
939
940.macro pixman_composite_src_n_8888_process_pixblock_head
941.endm
942
943.macro pixman_composite_src_n_8888_process_pixblock_tail
944.endm
945
946.macro pixman_composite_src_n_8888_process_pixblock_tail_head
947    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
948.endm
949
950.macro pixman_composite_src_n_8888_init
951    add         DUMMY, sp, #ARGS_STACK_OFFSET
952    vld1.32     {d0[0]}, [DUMMY]
953    vsli.u64    d0, d0, #32
954    vmov        d1, d0
955    vmov        q1, q0
956.endm
957
958.macro pixman_composite_src_n_8888_cleanup
959.endm
960
961generate_composite_function \
962    pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
963    FLAG_DST_WRITEONLY, \
964    8, /* number of pixels, processed in a single block */ \
965    0, /* prefetch distance */ \
966    pixman_composite_src_n_8888_init, \
967    pixman_composite_src_n_8888_cleanup, \
968    pixman_composite_src_n_8888_process_pixblock_head, \
969    pixman_composite_src_n_8888_process_pixblock_tail, \
970    pixman_composite_src_n_8888_process_pixblock_tail_head, \
971    0, /* dst_w_basereg */ \
972    0, /* dst_r_basereg */ \
973    0, /* src_basereg   */ \
974    0  /* mask_basereg  */
975
976/******************************************************************************/
977
978.macro pixman_composite_src_8888_8888_process_pixblock_head
979.endm
980
981.macro pixman_composite_src_8888_8888_process_pixblock_tail
982.endm
983
984.macro pixman_composite_src_8888_8888_process_pixblock_tail_head
985    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
986    vld1.32 {d0, d1, d2, d3}, [SRC]!
987    cache_preload 8, 8
988.endm
989
990generate_composite_function \
991    pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
992    FLAG_DST_WRITEONLY, \
993    8, /* number of pixels, processed in a single block */ \
994    10, /* prefetch distance */ \
995    default_init, \
996    default_cleanup, \
997    pixman_composite_src_8888_8888_process_pixblock_head, \
998    pixman_composite_src_8888_8888_process_pixblock_tail, \
999    pixman_composite_src_8888_8888_process_pixblock_tail_head, \
1000    0, /* dst_w_basereg */ \
1001    0, /* dst_r_basereg */ \
1002    0, /* src_basereg   */ \
1003    0  /* mask_basereg  */
1004
1005/******************************************************************************/
1006
1007.macro pixman_composite_src_x888_8888_process_pixblock_head
1008    vorr     q0, q0, q2
1009    vorr     q1, q1, q2
1010.endm
1011
1012.macro pixman_composite_src_x888_8888_process_pixblock_tail
1013.endm
1014
1015.macro pixman_composite_src_x888_8888_process_pixblock_tail_head
1016    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
1017    vld1.32 {d0, d1, d2, d3}, [SRC]!
1018    vorr     q0, q0, q2
1019    vorr     q1, q1, q2
1020    cache_preload 8, 8
1021.endm
1022
1023.macro pixman_composite_src_x888_8888_init
1024    vmov.u8  q2, #0xFF
1025    vshl.u32 q2, q2, #24
1026.endm
1027
1028generate_composite_function \
1029    pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
1030    FLAG_DST_WRITEONLY, \
1031    8, /* number of pixels, processed in a single block */ \
1032    10, /* prefetch distance */ \
1033    pixman_composite_src_x888_8888_init, \
1034    default_cleanup, \
1035    pixman_composite_src_x888_8888_process_pixblock_head, \
1036    pixman_composite_src_x888_8888_process_pixblock_tail, \
1037    pixman_composite_src_x888_8888_process_pixblock_tail_head, \
1038    0, /* dst_w_basereg */ \
1039    0, /* dst_r_basereg */ \
1040    0, /* src_basereg   */ \
1041    0  /* mask_basereg  */
1042
1043/******************************************************************************/
1044
1045.macro pixman_composite_over_n_8_8888_process_pixblock_head
1046    /* expecting deinterleaved source data in {d8, d9, d10, d11} */
1047    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
1048    /* and destination data in {d4, d5, d6, d7} */
1049    /* mask is in d24 (d25, d26, d27 are unused) */
1050
1051    /* in */
1052    vmull.u8    q0, d24, d8
1053    vmull.u8    q1, d24, d9
1054    vmull.u8    q6, d24, d10
1055    vmull.u8    q7, d24, d11
1056    vrshr.u16   q10, q0, #8
1057    vrshr.u16   q11, q1, #8
1058    vrshr.u16   q12, q6, #8
1059    vrshr.u16   q13, q7, #8
1060    vraddhn.u16 d0, q0, q10
1061    vraddhn.u16 d1, q1, q11
1062    vraddhn.u16 d2, q6, q12
1063    vraddhn.u16 d3, q7, q13
1064    vmvn.8      d24, d3  /* get inverted alpha */
1065    /* source:      d0 - blue, d1 - green, d2 - red, d3 - alpha */
1066    /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
1067    /* now do alpha blending */
1068    vmull.u8    q8, d24, d4
1069    vmull.u8    q9, d24, d5
1070    vmull.u8    q10, d24, d6
1071    vmull.u8    q11, d24, d7
1072.endm
1073
1074.macro pixman_composite_over_n_8_8888_process_pixblock_tail
1075    vrshr.u16   q14, q8, #8
1076    vrshr.u16   q15, q9, #8
1077    vrshr.u16   q12, q10, #8
1078    vrshr.u16   q13, q11, #8
1079    vraddhn.u16 d28, q14, q8
1080    vraddhn.u16 d29, q15, q9
1081    vraddhn.u16 d30, q12, q10
1082    vraddhn.u16 d31, q13, q11
1083    vqadd.u8    q14, q0, q14
1084    vqadd.u8    q15, q1, q15
1085.endm
1086
1087/* TODO: expand macros and do better instructions scheduling */
1088.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
1089    pixman_composite_over_n_8_8888_process_pixblock_tail
1090    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
1091    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
1092    vld1.8      {d24}, [MASK]!
1093    cache_preload 8, 8
1094    pixman_composite_over_n_8_8888_process_pixblock_head
1095.endm
1096
1097.macro pixman_composite_over_n_8_8888_init
1098    add         DUMMY, sp, #ARGS_STACK_OFFSET
1099    vpush       {d8-d15}
1100    vld1.32     {d11[0]}, [DUMMY]
1101    vdup.8      d8, d11[0]
1102    vdup.8      d9, d11[1]
1103    vdup.8      d10, d11[2]
1104    vdup.8      d11, d11[3]
1105.endm
1106
1107.macro pixman_composite_over_n_8_8888_cleanup
1108    vpop        {d8-d15}
1109.endm
1110
1111generate_composite_function \
1112    pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
1113    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1114    8, /* number of pixels, processed in a single block */ \
1115    5, /* prefetch distance */ \
1116    pixman_composite_over_n_8_8888_init, \
1117    pixman_composite_over_n_8_8888_cleanup, \
1118    pixman_composite_over_n_8_8888_process_pixblock_head, \
1119    pixman_composite_over_n_8_8888_process_pixblock_tail, \
1120    pixman_composite_over_n_8_8888_process_pixblock_tail_head
1121
1122/******************************************************************************/
1123
1124.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
1125    /*
1126     * 'combine_mask_ca' replacement
1127     *
1128     * input:  solid src (n) in {d8,  d9,  d10, d11}
1129     *         dest in          {d4,  d5,  d6,  d7 }
1130     *         mask in          {d24, d25, d26, d27}
1131     * output: updated src in   {d0,  d1,  d2,  d3 }
1132     *         updated mask in  {d24, d25, d26, d3 }
1133     */
1134    vmull.u8    q0,  d24, d8
1135    vmull.u8    q1,  d25, d9
1136    vmull.u8    q6,  d26, d10
1137    vmull.u8    q7,  d27, d11
1138    vmull.u8    q9,  d11, d25
1139    vmull.u8    q12, d11, d24
1140    vmull.u8    q13, d11, d26
1141    vrshr.u16   q8,  q0,  #8
1142    vrshr.u16   q10, q1,  #8
1143    vrshr.u16   q11, q6,  #8
1144    vraddhn.u16 d0,  q0,  q8
1145    vraddhn.u16 d1,  q1,  q10
1146    vraddhn.u16 d2,  q6,  q11
1147    vrshr.u16   q11, q12, #8
1148    vrshr.u16   q8,  q9,  #8
1149    vrshr.u16   q6,  q13, #8
1150    vrshr.u16   q10, q7,  #8
1151    vraddhn.u16 d24, q12, q11
1152    vraddhn.u16 d25, q9,  q8
1153    vraddhn.u16 d26, q13, q6
1154    vraddhn.u16 d3,  q7,  q10
1155    /*
1156     * 'combine_over_ca' replacement
1157     *
1158     * output: updated dest in {d28, d29, d30, d31}
1159     */
1160    vmvn.8      d24, d24
1161    vmvn.8      d25, d25
1162    vmull.u8    q8,  d24, d4
1163    vmull.u8    q9,  d25, d5
1164    vmvn.8      d26, d26
1165    vmvn.8      d27, d3
1166    vmull.u8    q10, d26, d6
1167    vmull.u8    q11, d27, d7
1168.endm
1169
1170.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
1171    /* ... continue 'combine_over_ca' replacement */
1172    vrshr.u16   q14, q8,  #8
1173    vrshr.u16   q15, q9,  #8
1174    vrshr.u16   q6,  q10, #8
1175    vrshr.u16   q7,  q11, #8
1176    vraddhn.u16 d28, q14, q8
1177    vraddhn.u16 d29, q15, q9
1178    vraddhn.u16 d30, q6,  q10
1179    vraddhn.u16 d31, q7,  q11
1180    vqadd.u8    q14, q0,  q14
1181    vqadd.u8    q15, q1,  q15
1182.endm
1183
1184.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
1185        vrshr.u16   q14, q8, #8
1186        vrshr.u16   q15, q9, #8
1187    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
1188        vrshr.u16   q6, q10, #8
1189        vrshr.u16   q7, q11, #8
1190        vraddhn.u16 d28, q14, q8
1191        vraddhn.u16 d29, q15, q9
1192        vraddhn.u16 d30, q6, q10
1193        vraddhn.u16 d31, q7, q11
1194    vld4.8      {d24, d25, d26, d27}, [MASK]!
1195        vqadd.u8    q14, q0, q14
1196        vqadd.u8    q15, q1, q15
1197    cache_preload 8, 8
1198    pixman_composite_over_n_8888_8888_ca_process_pixblock_head
1199    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
1200.endm
1201
1202.macro pixman_composite_over_n_8888_8888_ca_init
1203    add         DUMMY, sp, #ARGS_STACK_OFFSET
1204    vpush       {d8-d15}
1205    vld1.32     {d11[0]}, [DUMMY]
1206    vdup.8      d8, d11[0]
1207    vdup.8      d9, d11[1]
1208    vdup.8      d10, d11[2]
1209    vdup.8      d11, d11[3]
1210.endm
1211
1212.macro pixman_composite_over_n_8888_8888_ca_cleanup
1213    vpop        {d8-d15}
1214.endm
1215
1216generate_composite_function \
1217    pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
1218    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1219    8, /* number of pixels, processed in a single block */ \
1220    5, /* prefetch distance */ \
1221    pixman_composite_over_n_8888_8888_ca_init, \
1222    pixman_composite_over_n_8888_8888_ca_cleanup, \
1223    pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
1224    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
1225    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
1226
1227/******************************************************************************/
1228
1229.macro pixman_composite_add_n_8_8_process_pixblock_head
1230    /* expecting source data in {d8, d9, d10, d11} */
1231    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
1232    /* and destination data in {d4, d5, d6, d7} */
1233    /* mask is in d24, d25, d26, d27 */
1234    vmull.u8    q0, d24, d11
1235    vmull.u8    q1, d25, d11
1236    vmull.u8    q6, d26, d11
1237    vmull.u8    q7, d27, d11
1238    vrshr.u16   q10, q0, #8
1239    vrshr.u16   q11, q1, #8
1240    vrshr.u16   q12, q6, #8
1241    vrshr.u16   q13, q7, #8
1242    vraddhn.u16 d0, q0, q10
1243    vraddhn.u16 d1, q1, q11
1244    vraddhn.u16 d2, q6, q12
1245    vraddhn.u16 d3, q7, q13
1246    vqadd.u8    q14, q0, q2
1247    vqadd.u8    q15, q1, q3
1248.endm
1249
1250.macro pixman_composite_add_n_8_8_process_pixblock_tail
1251.endm
1252
1253/* TODO: expand macros and do better instructions scheduling */
1254.macro pixman_composite_add_n_8_8_process_pixblock_tail_head
1255    pixman_composite_add_n_8_8_process_pixblock_tail
1256    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
1257    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
1258    vld1.8      {d24, d25, d26, d27}, [MASK]!
1259    cache_preload 32, 32
1260    pixman_composite_add_n_8_8_process_pixblock_head
1261.endm
1262
1263.macro pixman_composite_add_n_8_8_init
1264    add         DUMMY, sp, #ARGS_STACK_OFFSET
1265    vpush       {d8-d15}
1266    vld1.32     {d11[0]}, [DUMMY]
1267    vdup.8      d11, d11[3]
1268.endm
1269
1270.macro pixman_composite_add_n_8_8_cleanup
1271    vpop        {d8-d15}
1272.endm
1273
1274generate_composite_function \
1275    pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
1276    FLAG_DST_READWRITE, \
1277    32, /* number of pixels, processed in a single block */ \
1278    5, /* prefetch distance */ \
1279    pixman_composite_add_n_8_8_init, \
1280    pixman_composite_add_n_8_8_cleanup, \
1281    pixman_composite_add_n_8_8_process_pixblock_head, \
1282    pixman_composite_add_n_8_8_process_pixblock_tail, \
1283    pixman_composite_add_n_8_8_process_pixblock_tail_head
1284
1285/******************************************************************************/
1286
1287.macro pixman_composite_add_8_8_8_process_pixblock_head
1288    /* expecting source data in {d0, d1, d2, d3} */
1289    /* destination data in {d4, d5, d6, d7} */
1290    /* mask in {d24, d25, d26, d27} */
1291    vmull.u8    q8, d24, d0
1292    vmull.u8    q9, d25, d1
1293    vmull.u8    q10, d26, d2
1294    vmull.u8    q11, d27, d3
1295    vrshr.u16   q0, q8, #8
1296    vrshr.u16   q1, q9, #8
1297    vrshr.u16   q12, q10, #8
1298    vrshr.u16   q13, q11, #8
1299    vraddhn.u16 d0, q0, q8
1300    vraddhn.u16 d1, q1, q9
1301    vraddhn.u16 d2, q12, q10
1302    vraddhn.u16 d3, q13, q11
1303    vqadd.u8    q14, q0, q2
1304    vqadd.u8    q15, q1, q3
1305.endm
1306
1307.macro pixman_composite_add_8_8_8_process_pixblock_tail
1308.endm
1309
1310/* TODO: expand macros and do better instructions scheduling */
1311.macro pixman_composite_add_8_8_8_process_pixblock_tail_head
1312    pixman_composite_add_8_8_8_process_pixblock_tail
1313    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
1314    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
1315    vld1.8      {d24, d25, d26, d27}, [MASK]!
1316    vld1.8      {d0, d1, d2, d3}, [SRC]!
1317    cache_preload 32, 32
1318    pixman_composite_add_8_8_8_process_pixblock_head
1319.endm
1320
1321.macro pixman_composite_add_8_8_8_init
1322.endm
1323
1324.macro pixman_composite_add_8_8_8_cleanup
1325.endm
1326
1327generate_composite_function \
1328    pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
1329    FLAG_DST_READWRITE, \
1330    32, /* number of pixels, processed in a single block */ \
1331    5, /* prefetch distance */ \
1332    pixman_composite_add_8_8_8_init, \
1333    pixman_composite_add_8_8_8_cleanup, \
1334    pixman_composite_add_8_8_8_process_pixblock_head, \
1335    pixman_composite_add_8_8_8_process_pixblock_tail, \
1336    pixman_composite_add_8_8_8_process_pixblock_tail_head
1337
1338/******************************************************************************/
1339
1340.macro pixman_composite_add_8888_8888_8888_process_pixblock_head
1341    /* expecting source data in {d0, d1, d2, d3} */
1342    /* destination data in {d4, d5, d6, d7} */
1343    /* mask in {d24, d25, d26, d27} */
1344    vmull.u8    q8, d27, d0
1345    vmull.u8    q9, d27, d1
1346    vmull.u8    q10, d27, d2
1347    vmull.u8    q11, d27, d3
1348    vrshr.u16   q0, q8, #8
1349    vrshr.u16   q1, q9, #8
1350    vrshr.u16   q12, q10, #8
1351    vrshr.u16   q13, q11, #8
1352    vraddhn.u16 d0, q0, q8
1353    vraddhn.u16 d1, q1, q9
1354    vraddhn.u16 d2, q12, q10
1355    vraddhn.u16 d3, q13, q11
1356    vqadd.u8    q14, q0, q2
1357    vqadd.u8    q15, q1, q3
1358.endm
1359
1360.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
1361.endm
1362
1363/* TODO: expand macros and do better instructions scheduling */
1364.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
1365    pixman_composite_add_8888_8888_8888_process_pixblock_tail
1366    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
1367    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
1368    vld4.8      {d24, d25, d26, d27}, [MASK]!
1369    vld4.8      {d0, d1, d2, d3}, [SRC]!
1370    cache_preload 8, 8
1371    pixman_composite_add_8888_8888_8888_process_pixblock_head
1372.endm
1373
1374generate_composite_function \
1375    pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
1376    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1377    8, /* number of pixels, processed in a single block */ \
1378    10, /* prefetch distance */ \
1379    default_init, \
1380    default_cleanup, \
1381    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
1382    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
1383    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
1384
1385generate_composite_function_single_scanline \
1386    pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
1387    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1388    8, /* number of pixels, processed in a single block */ \
1389    default_init, \
1390    default_cleanup, \
1391    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
1392    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
1393    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
1394
1395/******************************************************************************/
1396
1397.macro pixman_composite_over_8888_n_8888_process_pixblock_head
1398    /* expecting source data in {d0, d1, d2, d3} */
1399    /* destination data in {d4, d5, d6, d7} */
1400    /* solid mask is in d15 */
1401
1402    /* 'in' */
1403    vmull.u8    q8, d15, d3
1404    vmull.u8    q6, d15, d2
1405    vmull.u8    q5, d15, d1
1406    vmull.u8    q4, d15, d0
1407    vrshr.u16   q13, q8, #8
1408    vrshr.u16   q12, q6, #8
1409    vrshr.u16   q11, q5, #8
1410    vrshr.u16   q10, q4, #8
1411    vraddhn.u16 d3, q8, q13
1412    vraddhn.u16 d2, q6, q12
1413    vraddhn.u16 d1, q5, q11
1414    vraddhn.u16 d0, q4, q10
1415    vmvn.8      d24, d3  /* get inverted alpha */
1416    /* now do alpha blending */
1417    vmull.u8    q8, d24, d4
1418    vmull.u8    q9, d24, d5
1419    vmull.u8    q10, d24, d6
1420    vmull.u8    q11, d24, d7
1421.endm
1422
1423.macro pixman_composite_over_8888_n_8888_process_pixblock_tail
1424    vrshr.u16   q14, q8, #8
1425    vrshr.u16   q15, q9, #8
1426    vrshr.u16   q12, q10, #8
1427    vrshr.u16   q13, q11, #8
1428    vraddhn.u16 d28, q14, q8
1429    vraddhn.u16 d29, q15, q9
1430    vraddhn.u16 d30, q12, q10
1431    vraddhn.u16 d31, q13, q11
1432    vqadd.u8    q14, q0, q14
1433    vqadd.u8    q15, q1, q15
1434.endm
1435
1436/* TODO: expand macros and do better instructions scheduling */
1437.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
1438    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
1439    pixman_composite_over_8888_n_8888_process_pixblock_tail
1440    vld4.8     {d0, d1, d2, d3}, [SRC]!
1441    cache_preload 8, 8
1442    pixman_composite_over_8888_n_8888_process_pixblock_head
1443    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
1444.endm
1445
1446.macro pixman_composite_over_8888_n_8888_init
1447    add         DUMMY, sp, #48
1448    vpush       {d8-d15}
1449    vld1.32     {d15[0]}, [DUMMY]
1450    vdup.8      d15, d15[3]
1451.endm
1452
1453.macro pixman_composite_over_8888_n_8888_cleanup
1454    vpop        {d8-d15}
1455.endm
1456
1457generate_composite_function \
1458    pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
1459    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1460    8, /* number of pixels, processed in a single block */ \
1461    5, /* prefetch distance */ \
1462    pixman_composite_over_8888_n_8888_init, \
1463    pixman_composite_over_8888_n_8888_cleanup, \
1464    pixman_composite_over_8888_n_8888_process_pixblock_head, \
1465    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
1466    pixman_composite_over_8888_n_8888_process_pixblock_tail_head
1467
1468/******************************************************************************/
1469
1470/* TODO: expand macros and do better instructions scheduling */
1471.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
1472    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
1473    pixman_composite_over_8888_n_8888_process_pixblock_tail
1474    vld4.8     {d0, d1, d2, d3}, [SRC]!
1475    cache_preload 8, 8
1476    vld4.8     {d12, d13, d14, d15}, [MASK]!
1477    pixman_composite_over_8888_n_8888_process_pixblock_head
1478    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
1479.endm
1480
1481.macro pixman_composite_over_8888_8888_8888_init
1482    vpush       {d8-d15}
1483.endm
1484
1485.macro pixman_composite_over_8888_8888_8888_cleanup
1486    vpop        {d8-d15}
1487.endm
1488
1489generate_composite_function \
1490    pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
1491    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1492    8, /* number of pixels, processed in a single block */ \
1493    5, /* prefetch distance */ \
1494    pixman_composite_over_8888_8888_8888_init, \
1495    pixman_composite_over_8888_8888_8888_cleanup, \
1496    pixman_composite_over_8888_n_8888_process_pixblock_head, \
1497    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
1498    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
1499    28, /* dst_w_basereg */ \
1500    4,  /* dst_r_basereg */ \
1501    0,  /* src_basereg   */ \
1502    12  /* mask_basereg  */
1503
1504generate_composite_function_single_scanline \
1505    pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
1506    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1507    8, /* number of pixels, processed in a single block */ \
1508    pixman_composite_over_8888_8888_8888_init, \
1509    pixman_composite_over_8888_8888_8888_cleanup, \
1510    pixman_composite_over_8888_n_8888_process_pixblock_head, \
1511    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
1512    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
1513    28, /* dst_w_basereg */ \
1514    4,  /* dst_r_basereg */ \
1515    0,  /* src_basereg   */ \
1516    12  /* mask_basereg  */
1517
1518/******************************************************************************/
1519
1520/* TODO: expand macros and do better instructions scheduling */
1521.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
1522    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
1523    pixman_composite_over_8888_n_8888_process_pixblock_tail
1524    vld4.8     {d0, d1, d2, d3}, [SRC]!
1525    cache_preload 8, 8
1526    vld1.8     {d15}, [MASK]!
1527    pixman_composite_over_8888_n_8888_process_pixblock_head
1528    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
1529.endm
1530
1531.macro pixman_composite_over_8888_8_8888_init
1532    vpush       {d8-d15}
1533.endm
1534
1535.macro pixman_composite_over_8888_8_8888_cleanup
1536    vpop        {d8-d15}
1537.endm
1538
1539generate_composite_function \
1540    pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
1541    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1542    8, /* number of pixels, processed in a single block */ \
1543    5, /* prefetch distance */ \
1544    pixman_composite_over_8888_8_8888_init, \
1545    pixman_composite_over_8888_8_8888_cleanup, \
1546    pixman_composite_over_8888_n_8888_process_pixblock_head, \
1547    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
1548    pixman_composite_over_8888_8_8888_process_pixblock_tail_head \
1549    28, /* dst_w_basereg */ \
1550    4,  /* dst_r_basereg */ \
1551    0,  /* src_basereg   */ \
1552    15  /* mask_basereg  */
1553
1554/******************************************************************************/
1555
1556.macro pixman_composite_src_0888_0888_process_pixblock_head
1557.endm
1558
1559.macro pixman_composite_src_0888_0888_process_pixblock_tail
1560.endm
1561
1562.macro pixman_composite_src_0888_0888_process_pixblock_tail_head
1563    vst3.8 {d0, d1, d2}, [DST_W]!
1564    vld3.8 {d0, d1, d2}, [SRC]!
1565    cache_preload 8, 8
1566.endm
1567
1568generate_composite_function \
1569    pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
1570    FLAG_DST_WRITEONLY, \
1571    8, /* number of pixels, processed in a single block */ \
1572    10, /* prefetch distance */ \
1573    default_init, \
1574    default_cleanup, \
1575    pixman_composite_src_0888_0888_process_pixblock_head, \
1576    pixman_composite_src_0888_0888_process_pixblock_tail, \
1577    pixman_composite_src_0888_0888_process_pixblock_tail_head, \
1578    0, /* dst_w_basereg */ \
1579    0, /* dst_r_basereg */ \
1580    0, /* src_basereg   */ \
1581    0  /* mask_basereg  */
1582
1583/******************************************************************************/
1584
1585.macro pixman_composite_src_0888_8888_rev_process_pixblock_head
1586    vswp   d0, d2
1587.endm
1588
1589.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
1590.endm
1591
1592.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
1593    vst4.8 {d0, d1, d2, d3}, [DST_W]!
1594    vld3.8 {d0, d1, d2}, [SRC]!
1595    vswp   d0, d2
1596    cache_preload 8, 8
1597.endm
1598
1599.macro pixman_composite_src_0888_8888_rev_init
1600    veor   d3, d3, d3
1601.endm
1602
1603generate_composite_function \
1604    pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
1605    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
1606    8, /* number of pixels, processed in a single block */ \
1607    10, /* prefetch distance */ \
1608    pixman_composite_src_0888_8888_rev_init, \
1609    default_cleanup, \
1610    pixman_composite_src_0888_8888_rev_process_pixblock_head, \
1611    pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
1612    pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
1613    0, /* dst_w_basereg */ \
1614    0, /* dst_r_basereg */ \
1615    0, /* src_basereg   */ \
1616    0  /* mask_basereg  */
1617
1618/******************************************************************************/
1619
1620.macro pixman_composite_src_0888_0565_rev_process_pixblock_head
1621    vshll.u8    q8, d1, #8
1622    vshll.u8    q9, d2, #8
1623.endm
1624
1625.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
1626    vshll.u8    q14, d0, #8
1627    vsri.u16    q14, q8, #5
1628    vsri.u16    q14, q9, #11
1629.endm
1630
1631.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
1632        vshll.u8    q14, d0, #8
1633    vld3.8 {d0, d1, d2}, [SRC]!
1634        vsri.u16    q14, q8, #5
1635        vsri.u16    q14, q9, #11
1636    vshll.u8    q8, d1, #8
1637        vst1.16 {d28, d29}, [DST_W, :128]!
1638    vshll.u8    q9, d2, #8
1639.endm
1640
1641generate_composite_function \
1642    pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
1643    FLAG_DST_WRITEONLY, \
1644    8, /* number of pixels, processed in a single block */ \
1645    10, /* prefetch distance */ \
1646    default_init, \
1647    default_cleanup, \
1648    pixman_composite_src_0888_0565_rev_process_pixblock_head, \
1649    pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
1650    pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
1651    28, /* dst_w_basereg */ \
1652    0, /* dst_r_basereg */ \
1653    0, /* src_basereg   */ \
1654    0  /* mask_basereg  */
1655
1656/******************************************************************************/
1657
1658.macro pixman_composite_src_pixbuf_8888_process_pixblock_head
1659    vmull.u8    q8, d3, d0
1660    vmull.u8    q9, d3, d1
1661    vmull.u8    q10, d3, d2
1662.endm
1663
1664.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
1665    vrshr.u16   q11, q8, #8
1666    vswp        d3, d31
1667    vrshr.u16   q12, q9, #8
1668    vrshr.u16   q13, q10, #8
1669    vraddhn.u16 d30, q11, q8
1670    vraddhn.u16 d29, q12, q9
1671    vraddhn.u16 d28, q13, q10
1672.endm
1673
1674.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
1675        vrshr.u16   q11, q8, #8
1676        vswp        d3, d31
1677        vrshr.u16   q12, q9, #8
1678        vrshr.u16   q13, q10, #8
1679    vld4.8 {d0, d1, d2, d3}, [SRC]!
1680        vraddhn.u16 d30, q11, q8
1681                                    PF add PF_X, PF_X, #8
1682                                    PF tst PF_CTL, #0xF
1683                                    PF addne PF_X, PF_X, #8
1684                                    PF subne PF_CTL, PF_CTL, #1
1685        vraddhn.u16 d29, q12, q9
1686        vraddhn.u16 d28, q13, q10
1687    vmull.u8    q8, d3, d0
1688    vmull.u8    q9, d3, d1
1689    vmull.u8    q10, d3, d2
1690        vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1691                                    PF cmp PF_X, ORIG_W
1692                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
1693                                    PF subge PF_X, PF_X, ORIG_W
1694                                    PF subges PF_CTL, PF_CTL, #0x10
1695                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
1696.endm
1697
1698generate_composite_function \
1699    pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
1700    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
1701    8, /* number of pixels, processed in a single block */ \
1702    10, /* prefetch distance */ \
1703    default_init, \
1704    default_cleanup, \
1705    pixman_composite_src_pixbuf_8888_process_pixblock_head, \
1706    pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
1707    pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
1708    28, /* dst_w_basereg */ \
1709    0, /* dst_r_basereg */ \
1710    0, /* src_basereg   */ \
1711    0  /* mask_basereg  */
1712
1713#endif