1/*
2 * Copyright © 2009 Nokia Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 *
23 * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
24 */
25
26/*
27 * This file contains implementations of NEON optimized pixel processing
28 * functions. There is no full and detailed tutorial, but some functions
29 * (those which are exposing some new or interesting features) are
30 * extensively commented and can be used as examples.
31 *
32 * You may want to have a look at the comments for following functions:
33 *  - pixman_composite_over_8888_0565_asm_neon
34 *  - pixman_composite_over_n_8_0565_asm_neon
35 */
36
37/* Prevent the stack from becoming executable for no reason... */
38#if defined(__linux__) && defined(__ELF__)
39.section .note.GNU-stack,"",%progbits
40#endif
41
42    .text
43    .fpu neon
44    .arch armv7a
45    .altmacro
46
47#include "pixman-arm-neon-asm.h"
48
49/* Global configuration options and preferences */
50
51/*
52 * The code can optionally make use of unaligned memory accesses to improve
53 * performance of handling leading/trailing pixels for each scanline.
54 * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
55 * example in linux if unaligned memory accesses are not configured to
 * generate exceptions.
57 */
58.set RESPECT_STRICT_ALIGNMENT, 1
59
60/*
61 * Set default prefetch type. There is a choice between the following options:
62 *
63 * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
64 * as NOP to workaround some HW bugs or for whatever other reason)
65 *
66 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
 * advanced prefetch introduces heavy overhead)
68 *
69 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
70 * which can run ARM and NEON instructions simultaneously so that extra ARM
71 * instructions do not add (many) extra cycles, but improve prefetch efficiency)
72 *
73 * Note: some types of function can't support advanced prefetch and fallback
74 *       to simple one (those which handle 24bpp pixels)
75 */
76.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
77
78/* Prefetch distance in pixels for simple prefetch */
79.set PREFETCH_DISTANCE_SIMPLE, 64
80
81/*
82 * Implementation of pixman_composite_over_8888_0565_asm_neon
83 *
84 * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
85 * performs OVER compositing operation. Function fast_composite_over_8888_0565
86 * from pixman-fast-path.c does the same in C and can be used as a reference.
87 *
88 * First we need to have some NEON assembly code which can do the actual
89 * operation on the pixels and provide it to the template macro.
90 *
91 * Template macro quite conveniently takes care of emitting all the necessary
92 * code for memory reading and writing (including quite tricky cases of
93 * handling unaligned leading/trailing pixels), so we only need to deal with
94 * the data in NEON registers.
95 *
 * NEON registers allocation in general is recommended to be the following:
97 * d0,  d1,  d2,  d3  - contain loaded source pixel data
98 * d4,  d5,  d6,  d7  - contain loaded destination pixels (if they are needed)
 * d24, d25, d26, d27 - contain loaded mask pixel data (if mask is used)
100 * d28, d29, d30, d31 - place for storing the result (destination pixels)
101 *
102 * As can be seen above, four 64-bit NEON registers are used for keeping
103 * intermediate pixel data and up to 8 pixels can be processed in one step
104 * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
105 *
106 * This particular function uses the following registers allocation:
107 * d0,  d1,  d2,  d3  - contain loaded source pixel data
108 * d4,  d5            - contain loaded destination pixels (they are needed)
109 * d28, d29           - place for storing the result (destination pixels)
110 */
111
112/*
113 * Step one. We need to have some code to do some arithmetics on pixel data.
114 * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
115 * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
116 * perform all the needed calculations and write the result to {d28, d29}.
117 * The rationale for having two macros and not just one will be explained
 * later. In practice, any single monolithic function which does the work can
119 * be split into two parts in any arbitrary way without affecting correctness.
120 *
121 * There is one special trick here too. Common template macro can optionally
122 * make our life a bit easier by doing R, G, B, A color components
123 * deinterleaving for 32bpp pixel formats (and this feature is used in
124 * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
125 * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
126 * actually use d0 register for blue channel (a vector of eight 8-bit
127 * values), d1 register for green, d2 for red and d3 for alpha. This
128 * simple conversion can be also done with a few NEON instructions:
129 *
130 * Packed to planar conversion:
131 *  vuzp.8 d0, d1
132 *  vuzp.8 d2, d3
133 *  vuzp.8 d1, d3
134 *  vuzp.8 d0, d2
135 *
136 * Planar to packed conversion:
137 *  vzip.8 d0, d2
138 *  vzip.8 d1, d3
139 *  vzip.8 d2, d3
140 *  vzip.8 d0, d1
141 *
142 * But pixel can be loaded directly in planar format using VLD4.8 NEON
143 * instruction. It is 1 cycle slower than VLD1.32, so this is not always
144 * desirable, that's why deinterleaving is optional.
145 *
146 * But anyway, here is the code:
147 */
/*
 * Head part of the a8r8g8b8-OVER-r5g6b5 pipeline.
 * In:  d0 = blue, d1 = green, d2 = red, d3 = alpha (8 deinterleaved
 *      32bpp source pixels), q2 = {d4, d5} = 8 r5g6b5 dest pixels.
 * Out: d20/d23/d22 = dest red/green/blue scaled by (255 - src alpha).
 * Clobbers: d3 (becomes inverted alpha), d6, d7, d30, q10-q13, q15.
 */
.macro pixman_composite_over_8888_0565_process_pixblock_head
    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
       and put data into d6 - red, d7 - green, d30 - blue */
    vshrn.u16   d6, q2, #8  /* d6 = bits 15:8 of each pixel (red in top 5) */
    vshrn.u16   d7, q2, #3  /* d7 = bits 10:3 (green in top 6 bits) */
    vsli.u16    q2, q2, #5  /* shift-left-insert: move blue up to bits 9:5 */
    vsri.u8     d6, d6, #5  /* expand red 5 -> 8 bits by replicating top bits */
    vmvn.8      d3, d3      /* invert source alpha */
    vsri.u8     d7, d7, #6  /* expand green 6 -> 8 bits by replicating top bits */
    vshrn.u16   d30, q2, #2 /* d30 = blue expanded to 8 bits */
    /* now do alpha blending, storing results in 8-bit planar format
       into d16 - red, d19 - green, d18 - blue */
    vmull.u8    q10, d3, d6     /* red   * (255 - alpha), 16-bit products */
    vmull.u8    q11, d3, d7     /* green * (255 - alpha) */
    vmull.u8    q12, d3, d30    /* blue  * (255 - alpha) */
    /* divide the products by 255 with rounding: x/255 is approximated as
       (x + round(x >> 8)) >> 8, done by vrshr + vraddhn pairs */
    vrshr.u16   q13, q10, #8
    vrshr.u16   q3, q11, #8
    vrshr.u16   q15, q12, #8
    vraddhn.u16 d20, q10, q13   /* d20 = dest red   * inv_alpha / 255 */
    vraddhn.u16 d23, q11, q3    /* d23 = dest green * inv_alpha / 255 */
    vraddhn.u16 d22, q12, q15   /* d22 = dest blue  * inv_alpha / 255 */
.endm
170
/*
 * Tail part of the a8r8g8b8-OVER-r5g6b5 pipeline: adds the source color
 * to the scaled destination produced by the head macro and repacks the
 * result to r5g6b5 in q14 = {d28, d29}, ready for storing.
 */
.macro pixman_composite_over_8888_0565_process_pixblock_tail
    /* ... continue alpha blending */
    vqadd.u8    d16, d2, d20    /* red:  src + dest * inv_alpha (saturating) */
    vqadd.u8    q9, q0, q11     /* d18 = blue, d19 = green: src + dest * inv_alpha */
    /* convert the result to r5g6b5 and store it into {d28, d29} */
    vshll.u8    q14, d16, #8    /* widen red into bits 15:8 */
    vshll.u8    q8, d19, #8     /* widen green */
    vshll.u8    q9, d18, #8     /* widen blue */
    vsri.u16    q14, q8, #5     /* insert green below the 5 red bits */
    vsri.u16    q14, q9, #11    /* insert blue below the 6 green bits */
.endm
182
183/*
184 * OK, now we got almost everything that we need. Using the above two
185 * macros, the work can be done right. But now we want to optimize
186 * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
187 * a lot from good code scheduling and software pipelining.
188 *
189 * Let's construct some code, which will run in the core main loop.
190 * Some pseudo-code of the main loop will look like this:
191 *   head
192 *   while (...) {
193 *     tail
194 *     head
195 *   }
196 *   tail
197 *
198 * It may look a bit weird, but this setup allows to hide instruction
199 * latencies better and also utilize dual-issue capability more
200 * efficiently (make pairs of load-store and ALU instructions).
201 *
202 * So what we need now is a '*_tail_head' macro, which will be used
203 * in the core main loop. A trivial straightforward implementation
204 * of this macro would look like this:
205 *
206 *   pixman_composite_over_8888_0565_process_pixblock_tail
207 *   vst1.16     {d28, d29}, [DST_W, :128]!
208 *   vld1.16     {d4, d5}, [DST_R, :128]!
 *   vld4.8      {d0, d1, d2, d3}, [SRC]!
210 *   pixman_composite_over_8888_0565_process_pixblock_head
211 *   cache_preload 8, 8
212 *
213 * Now it also got some VLD/VST instructions. We simply can't move from
214 * processing one block of pixels to the other one with just arithmetics.
215 * The previously processed data needs to be written to memory and new
216 * data needs to be fetched. Fortunately, this main loop does not deal
217 * with partial leading/trailing pixels and can load/store a full block
218 * of pixels in a bulk. Additionally, destination buffer is already
219 * 16 bytes aligned here (which is good for performance).
220 *
221 * New things here are DST_R, DST_W, SRC and MASK identifiers. These
222 * are the aliases for ARM registers which are used as pointers for
223 * accessing data. We maintain separate pointers for reading and writing
224 * destination buffer (DST_R and DST_W).
225 *
226 * Another new thing is 'cache_preload' macro. It is used for prefetching
227 * data into CPU L2 cache and improve performance when dealing with large
228 * images which are far larger than cache size. It uses one argument
229 * (actually two, but they need to be the same here) - number of pixels
230 * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
231 * details about this macro. Moreover, if good performance is needed
232 * the code from this macro needs to be copied into '*_tail_head' macro
233 * and mixed with the rest of code for optimal instructions scheduling.
234 * We are actually doing it below.
235 *
236 * Now after all the explanations, here is the optimized code.
 * Different instruction streams (originating from '*_head', '*_tail'
238 * and 'cache_preload' macro) use different indentation levels for
239 * better readability. Actually taking the code from one of these
240 * indentation levels and ignoring a few VLD/VST instructions would
241 * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
242 * macro!
243 */
244
245#if 1
246
/*
 * Software-pipelined main-loop body: finishes the previous pixel block
 * (tail), loads/stores, and starts the next block (head), with the
 * advanced prefetch (PF) instruction stream mixed in.
 * Indentation marks the stream: 8 spaces = '*_tail' instructions,
 * 4 spaces = '*_head' instructions, far right = prefetch ('cache_preload'
 * equivalent).  Instruction order is tuned for Cortex-A8 dual-issue;
 * do not reorder without benchmarking.
 */
.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
        vqadd.u8    d16, d2, d20
    vld1.16     {d4, d5}, [DST_R, :128]!    /* load next 8 dest pixels */
        vqadd.u8    q9, q0, q11
    vshrn.u16   d6, q2, #8
    vld4.8      {d0, d1, d2, d3}, [SRC]!    /* load next 8 src pixels, planar */
    vshrn.u16   d7, q2, #3
    vsli.u16    q2, q2, #5
        vshll.u8    q14, d16, #8
                                    PF add PF_X, PF_X, #8
        vshll.u8    q8, d19, #8
                                    PF tst PF_CTL, #0xF
    vsri.u8     d6, d6, #5
                                    PF addne PF_X, PF_X, #8
    vmvn.8      d3, d3
                                    PF subne PF_CTL, PF_CTL, #1
    vsri.u8     d7, d7, #6
    vshrn.u16   d30, q2, #2
    vmull.u8    q10, d3, d6
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vmull.u8    q11, d3, d7
    vmull.u8    q12, d3, d30
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
        vsri.u16    q14, q8, #5
                                    PF cmp PF_X, ORIG_W
        vshll.u8    q9, d18, #8
    vrshr.u16   q13, q10, #8
                                    PF subge PF_X, PF_X, ORIG_W
    vrshr.u16   q3, q11, #8
    vrshr.u16   q15, q12, #8
                                    PF subges PF_CTL, PF_CTL, #0x10
        vsri.u16    q14, q9, #11
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vraddhn.u16 d20, q10, q13
    vraddhn.u16 d23, q11, q3
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vraddhn.u16 d22, q12, q15
        vst1.16     {d28, d29}, [DST_W, :128]!  /* store previous block's result */
.endm
286
287#else
288
289/* If we did not care much about the performance, we would just use this... */
/*
 * Straightforward (unscheduled) variant of the main-loop body: finish
 * the previous block, store/load, start the next block, prefetch.
 * Bug fix: the source must be loaded with VLD4.8 (byte-granular
 * deinterleave, giving planar B/G/R/A in d0-d3 as expected by the
 * '*_head' macro and required by FLAG_DEINTERLEAVE_32BPP) — VLD4.32
 * would distribute whole 32-bit pixels across d0-d3 instead.  The
 * scheduled variant above already uses vld4.8.
 */
.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
    pixman_composite_over_8888_0565_process_pixblock_tail
    vst1.16     {d28, d29}, [DST_W, :128]!
    vld1.16     {d4, d5}, [DST_R, :128]!
    vld4.8      {d0, d1, d2, d3}, [SRC]!
    pixman_composite_over_8888_0565_process_pixblock_head
    cache_preload 8, 8
.endm
298
299#endif
300
301/*
302 * And now the final part. We are using 'generate_composite_function' macro
303 * to put all the stuff together. We are specifying the name of the function
304 * which we want to get, number of bits per pixel for the source, mask and
305 * destination (0 if unused, like mask in this case). Next come some bit
306 * flags:
307 *   FLAG_DST_READWRITE      - tells that the destination buffer is both read
308 *                             and written, for write-only buffer we would use
309 *                             FLAG_DST_WRITEONLY flag instead
310 *   FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
311 *                             and separate color channels for 32bpp format.
312 * The next things are:
313 *  - the number of pixels processed per iteration (8 in this case, because
314 *    that's the maximum what can fit into four 64-bit NEON registers).
315 *  - prefetch distance, measured in pixel blocks. In this case it is 5 times
316 *    by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
317 *    prefetch distance can be selected by running some benchmarks.
318 *
319 * After that we specify some macros, these are 'default_init',
320 * 'default_cleanup' here which are empty (but it is possible to have custom
321 * init/cleanup macros to be able to save/restore some extra NEON registers
322 * like d8-d15 or do anything else) followed by
323 * 'pixman_composite_over_8888_0565_process_pixblock_head',
324 * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
325 * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
326 * which we got implemented above.
327 *
328 * The last part is the NEON registers allocation scheme.
329 */
/* OVER operator: a8r8g8b8 source (32bpp), no mask, r5g6b5 dest (16bpp) */
generate_composite_function \
    pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_0565_process_pixblock_head, \
    pixman_composite_over_8888_0565_process_pixblock_tail, \
    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    24  /* mask_basereg  */
344
345/******************************************************************************/
346
/*
 * Head part for solid-color OVER r5g6b5.  Same math as the
 * over_8888_0565 head, except the source color in d0-d2 and the
 * already-inverted alpha in d3 are constants set up by the init macro,
 * so no per-block vmvn is needed.
 */
.macro pixman_composite_over_n_0565_process_pixblock_head
    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
       and put data into d6 - red, d7 - green, d30 - blue */
    vshrn.u16   d6, q2, #8  /* red in the top bits */
    vshrn.u16   d7, q2, #3  /* green in the top bits */
    vsli.u16    q2, q2, #5  /* move blue up to bits 9:5 */
    vsri.u8     d6, d6, #5  /* expand red 5 -> 8 bits */
    vsri.u8     d7, d7, #6  /* expand green 6 -> 8 bits */
    vshrn.u16   d30, q2, #2 /* blue expanded to 8 bits */
    /* now do alpha blending, storing results in 8-bit planar format
       into d16 - red, d19 - green, d18 - blue */
    vmull.u8    q10, d3, d6     /* dest * (255 - alpha), per channel */
    vmull.u8    q11, d3, d7
    vmull.u8    q12, d3, d30
    /* rounded division by 255 via vrshr + vraddhn */
    vrshr.u16   q13, q10, #8
    vrshr.u16   q3, q11, #8
    vrshr.u16   q15, q12, #8
    vraddhn.u16 d20, q10, q13
    vraddhn.u16 d23, q11, q3
    vraddhn.u16 d22, q12, q15
.endm
368
/*
 * Tail part for solid-color OVER r5g6b5: add the constant source color
 * (d0-d2) to the scaled destination and repack to r5g6b5 in {d28, d29}.
 */
.macro pixman_composite_over_n_0565_process_pixblock_tail
    /* ... continue alpha blending */
    vqadd.u8    d16, d2, d20    /* red:  src + dest * inv_alpha (saturating) */
    vqadd.u8    q9, q0, q11     /* d18 = blue, d19 = green */
    /* convert the result to r5g6b5 and store it into {d28, d29} */
    vshll.u8    q14, d16, #8
    vshll.u8    q8, d19, #8
    vshll.u8    q9, d18, #8
    vsri.u16    q14, q8, #5     /* insert green */
    vsri.u16    q14, q9, #11    /* insert blue */
.endm
380
381/* TODO: expand macros and do better instructions scheduling */
/* TODO: expand macros and do better instructions scheduling */
/* Unscheduled main-loop body: tail, reload dest, store result, head.
   No source load is needed: the source color is constant in d0-d3. */
.macro pixman_composite_over_n_0565_process_pixblock_tail_head
    pixman_composite_over_n_0565_process_pixblock_tail
    vld1.16     {d4, d5}, [DST_R, :128]!    /* next 8 dest pixels */
    vst1.16     {d28, d29}, [DST_W, :128]!  /* previous block's result */
    pixman_composite_over_n_0565_process_pixblock_head
.endm
388
/*
 * Init for over_n_0565: load the solid a8r8g8b8 source color from the
 * function's stack argument area and splat its bytes into per-channel
 * vectors (d0/d1/d2/d3 — matching the blue/green/red/alpha register
 * convention used by the pipeline; byte-lane order assumes the usual
 * little-endian a8r8g8b8 layout).  Alpha is pre-inverted here so the
 * per-block head does not need a vmvn.
 */
.macro pixman_composite_over_n_0565_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]    /* load the 32-bit solid color */
    vdup.8      d0, d3[0]
    vdup.8      d1, d3[1]
    vdup.8      d2, d3[2]
    vdup.8      d3, d3[3]
    vmvn.8      d3, d3      /* invert source alpha */
.endm
398
/* OVER operator: solid color source (src bpp = 0), no mask, r5g6b5 dest */
generate_composite_function \
    pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_0565_init, \
    default_cleanup, \
    pixman_composite_over_n_0565_process_pixblock_head, \
    pixman_composite_over_n_0565_process_pixblock_tail, \
    pixman_composite_over_n_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    24  /* mask_basereg  */
413
414/******************************************************************************/
415
/*
 * Head for SRC a8r8g8b8 -> r5g6b5 conversion: widen the planar green
 * (d1), red (d2) and blue (d0) channels into the top byte of 16-bit
 * lanes; the tail then merges them into packed r5g6b5.
 */
.macro pixman_composite_src_8888_0565_process_pixblock_head
    vshll.u8    q8, d1, #8      /* green << 8 */
    vshll.u8    q14, d2, #8     /* red << 8 (q14 will hold the result) */
    vshll.u8    q9, d0, #8      /* blue << 8 */
.endm
421
/*
 * Tail for SRC a8r8g8b8 -> r5g6b5: insert green and blue below the red
 * bits, leaving packed r5g6b5 pixels in q14 = {d28, d29}.
 */
.macro pixman_composite_src_8888_0565_process_pixblock_tail
    vsri.u16    q14, q8, #5     /* insert green below the 5 red bits */
    vsri.u16    q14, q9, #11    /* insert blue below the 6 green bits */
.endm
426
/*
 * Scheduled main-loop body for src_8888_0565: tail instructions
 * (8-space indent), head instructions (4-space), and the advanced
 * source prefetch stream (far right) interleaved.  No destination
 * prefetch: the dest is write-only here.
 */
.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
        vsri.u16    q14, q8, #5
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
    vld4.8      {d0, d1, d2, d3}, [SRC]!    /* next 8 src pixels, planar */
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
        vsri.u16    q14, q9, #11
                                    PF cmp PF_X, ORIG_W
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vshll.u8    q8, d1, #8
        vst1.16     {d28, d29}, [DST_W, :128]!  /* store previous block */
                                    PF subge PF_X, PF_X, ORIG_W
                                    PF subges PF_CTL, PF_CTL, #0x10
    vshll.u8    q14, d2, #8
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vshll.u8    q9, d0, #8
.endm
445
/* SRC operator: a8r8g8b8 source, no mask, r5g6b5 dest (write-only) */
generate_composite_function \
    pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_8888_0565_process_pixblock_head, \
    pixman_composite_src_8888_0565_process_pixblock_tail, \
    pixman_composite_src_8888_0565_process_pixblock_tail_head
456
457/******************************************************************************/
458
/*
 * Head for SRC r5g6b5 -> a8r8g8b8: unpack 8 r5g6b5 pixels from q0 into
 * planar 8-bit channels in the result registers (d28 = blue,
 * d29 = green, d30 = red, d31 = alpha set to opaque 255), with the
 * low bits of each channel filled by bit replication.
 */
.macro pixman_composite_src_0565_8888_process_pixblock_head
    vshrn.u16   d30, q0, #8    /* red from bits 15:8 */
    vshrn.u16   d29, q0, #3    /* green from bits 10:3 */
    vsli.u16    q0, q0, #5     /* move blue up to bits 9:5 */
    vmov.u8     d31, #255      /* opaque alpha */
    vsri.u8     d30, d30, #5   /* expand red 5 -> 8 bits */
    vsri.u8     d29, d29, #6   /* expand green 6 -> 8 bits */
    vshrn.u16   d28, q0, #2    /* blue expanded to 8 bits */
.endm
468
/* Empty tail: the head already leaves the final result in {d28-d31}. */
.macro pixman_composite_src_0565_8888_process_pixblock_tail
.endm
471
472/* TODO: expand macros and do better instructions scheduling */
/* TODO: expand macros and do better instructions scheduling */
/* Unscheduled main-loop body: store the previous block interleaved
   (vst4.8 repacks planar d28-d31 to a8r8g8b8), load the next 8 r5g6b5
   source pixels, convert, and prefetch. */
.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
    pixman_composite_src_0565_8888_process_pixblock_tail
    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
    vld1.16    {d0, d1}, [SRC]!
    pixman_composite_src_0565_8888_process_pixblock_head
    cache_preload 8, 8
.endm
480
/* SRC operator: r5g6b5 source, no mask, a8r8g8b8 dest (write-only) */
generate_composite_function \
    pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0565_8888_process_pixblock_head, \
    pixman_composite_src_0565_8888_process_pixblock_tail, \
    pixman_composite_src_0565_8888_process_pixblock_tail_head
491
492/******************************************************************************/
493
/*
 * Head for ADD on 8bpp buffers: saturating byte-wise add of 32 source
 * values (q0, q1) and 32 destination values (q2, q3) into q14, q15.
 */
.macro pixman_composite_add_8000_8000_process_pixblock_head
    vqadd.u8    q14, q0, q2
    vqadd.u8    q15, q1, q3
.endm
498
/* Empty tail: the head's two vqadd instructions produce the final result. */
.macro pixman_composite_add_8000_8000_process_pixblock_tail
.endm
501
/*
 * Scheduled main-loop body for 8bpp ADD: load next src/dest, store the
 * previous result, do the two saturating adds, with the advanced
 * prefetch (PF) stream on the right.  PF_X advances by 32 because a
 * block is 32 pixels at 8bpp.
 */
.macro pixman_composite_add_8000_8000_process_pixblock_tail_head
    vld1.8      {d0, d1, d2, d3}, [SRC]!
                                    PF add PF_X, PF_X, #32
                                    PF tst PF_CTL, #0xF
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
                                    PF addne PF_X, PF_X, #32
                                    PF subne PF_CTL, PF_CTL, #1
        vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF cmp PF_X, ORIG_W
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
                                    PF subge PF_X, PF_X, ORIG_W
                                    PF subges PF_CTL, PF_CTL, #0x10
    vqadd.u8    q14, q0, q2
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vqadd.u8    q15, q1, q3
.endm
520
/* ADD operator: a8 source, no mask, a8 dest; 32 pixels per block */
generate_composite_function \
    pixman_composite_add_8000_8000_asm_neon, 8, 0, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8000_8000_process_pixblock_head, \
    pixman_composite_add_8000_8000_process_pixblock_tail, \
    pixman_composite_add_8000_8000_process_pixblock_tail_head
531
532/******************************************************************************/
533
/*
 * Scheduled main-loop body for 32bpp ADD.  Identical computation to the
 * 8bpp variant (packed saturating byte adds — channels do not interact,
 * so the same vqadd works), but PF_X advances by 8 because a block is
 * 8 pixels at 32bpp.  Reuses the add_8000_8000 head/tail macros.
 */
.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
    vld1.8      {d0, d1, d2, d3}, [SRC]!
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
        vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF cmp PF_X, ORIG_W
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
                                    PF subge PF_X, PF_X, ORIG_W
                                    PF subges PF_CTL, PF_CTL, #0x10
    vqadd.u8    q14, q0, q2
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vqadd.u8    q15, q1, q3
.endm
552
/* ADD operator: a8r8g8b8 source, no mask, a8r8g8b8 dest */
generate_composite_function \
    pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8000_8000_process_pixblock_head, \
    pixman_composite_add_8000_8000_process_pixblock_tail, \
    pixman_composite_add_8888_8888_process_pixblock_tail_head
563
/* Single-scanline ADD variant (no prefetch distance parameter) */
generate_composite_function_single_scanline \
    pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8000_8000_process_pixblock_head, \
    pixman_composite_add_8000_8000_process_pixblock_tail, \
    pixman_composite_add_8888_8888_process_pixblock_tail_head
573
574/******************************************************************************/
575
/*
 * Head for a8r8g8b8 OVER a8r8g8b8: compute dest * (255 - src alpha)
 * as 16-bit products.  Source channels are planar in d0-d3
 * (d3 = alpha), destination channels planar in d4-d7.
 */
.macro pixman_composite_over_8888_8888_process_pixblock_head
    vmvn.8      d24, d3  /* get inverted alpha */
    /* do alpha blending */
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d24, d5
    vmull.u8    q10, d24, d6
    vmull.u8    q11, d24, d7
.endm
584
/*
 * Tail for a8r8g8b8 OVER a8r8g8b8: divide the q8-q11 products by 255
 * with rounding (vrshr + vraddhn pairs), then add the source with
 * saturation.  Result is planar in q14/q15 (d28-d31).
 */
.macro pixman_composite_over_8888_8888_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
    vqadd.u8    q14, q0, q14    /* src + dest * inv_alpha, saturating */
    vqadd.u8    q15, q1, q15
.endm
597
/*
 * Scheduled main-loop body for a8r8g8b8 OVER a8r8g8b8.  Streams are
 * marked by indentation: 8 spaces = tail of the previous block,
 * 4 spaces = head of the next block (using d22 instead of d24 for the
 * inverted alpha — equivalent, chosen for scheduling), far right = the
 * advanced prefetch stream.
 */
.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!    /* next 8 dest pixels, planar */
        vrshr.u16   q14, q8, #8
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
        vrshr.u16   q15, q9, #8
        vrshr.u16   q12, q10, #8
        vrshr.u16   q13, q11, #8
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
        vraddhn.u16 d28, q14, q8
        vraddhn.u16 d29, q15, q9
                                    PF cmp PF_X, ORIG_W
        vraddhn.u16 d30, q12, q10
        vraddhn.u16 d31, q13, q11
        vqadd.u8    q14, q0, q14
        vqadd.u8    q15, q1, q15
    vld4.8      {d0, d1, d2, d3}, [SRC]!    /* next 8 src pixels, planar */
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vmvn.8      d22, d3             /* inverted alpha for the new block */
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!    /* store previous block */
                                    PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q8, d22, d4
                                    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q9, d22, d5
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vmull.u8    q10, d22, d6
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vmull.u8    q11, d22, d7
.endm
629
/* OVER operator: a8r8g8b8 source, no mask, a8r8g8b8 dest */
generate_composite_function \
    pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_process_pixblock_tail_head
640
/* Single-scanline OVER variant (no prefetch distance parameter) */
generate_composite_function_single_scanline \
    pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_process_pixblock_tail_head
650
651/******************************************************************************/
652
653/* TODO: expand macros and do better instructions scheduling */
/* TODO: expand macros and do better instructions scheduling */
/* Unscheduled main-loop body for solid-color OVER a8r8g8b8.  The
   constant source color stays in d0-d3 (set by the init macro) while
   the over_8888_8888 head/tail reread the destination each block. */
.macro pixman_composite_over_n_8888_process_pixblock_tail_head
    pixman_composite_over_8888_8888_process_pixblock_tail
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!    /* next 8 dest pixels */
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!    /* previous result */
    pixman_composite_over_8888_8888_process_pixblock_head
.endm
660
/*
 * Init for over_n_8888: load the solid a8r8g8b8 color from the stack
 * argument area and splat its bytes into the source channel vectors
 * d0-d3 (blue/green/red/alpha per the planar register convention).
 * Alpha is NOT inverted here — the over_8888_8888 head does the vmvn.
 */
.macro pixman_composite_over_n_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]    /* load the 32-bit solid color */
    vdup.8      d0, d3[0]
    vdup.8      d1, d3[1]
    vdup.8      d2, d3[2]
    vdup.8      d3, d3[3]
.endm
669
/* OVER operator: solid color source (src bpp = 0), no mask, a8r8g8b8 dest */
generate_composite_function \
    pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8888_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_n_8888_process_pixblock_tail_head
680
681/******************************************************************************/
682
/*
 * Scheduled main-loop body for OVER_REVERSE with a solid color:
 * result = dest + color * (255 - dest_alpha).  Register roles are
 * swapped relative to the plain OVER paths: the constant color lives
 * in d4-d7 (src_basereg = 4, set by the init macro) and the
 * destination is loaded into d0-d3 (dst_r_basereg = 0), so d3 is the
 * DESTINATION alpha that gets inverted.  Only destination prefetch is
 * needed — there is no source buffer.
 */
.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
        vrshr.u16   q14, q8, #8
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
        vrshr.u16   q15, q9, #8
        vrshr.u16   q12, q10, #8
        vrshr.u16   q13, q11, #8
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
        vraddhn.u16 d28, q14, q8
        vraddhn.u16 d29, q15, q9
                                    PF cmp PF_X, ORIG_W
        vraddhn.u16 d30, q12, q10
        vraddhn.u16 d31, q13, q11
        vqadd.u8    q14, q0, q14
        vqadd.u8    q15, q1, q15
    vld4.8      {d0, d1, d2, d3}, [DST_R, :128]!   /* next 8 dest pixels */
    vmvn.8      d22, d3             /* invert destination alpha */
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!    /* previous result */
                                    PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q8, d22, d4
                                    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q9, d22, d5
    vmull.u8    q10, d22, d6
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vmull.u8    q11, d22, d7
.endm
711
/*
 * Init for over_reverse_n_8888: load the solid a8r8g8b8 color from the
 * stack argument area and splat it into d4-d7 (the src_basereg = 4
 * slot used by this function's register layout).
 */
.macro pixman_composite_over_reverse_n_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d7[0]}, [DUMMY]    /* load the 32-bit solid color */
    vdup.8      d4, d7[0]
    vdup.8      d5, d7[1]
    vdup.8      d6, d7[2]
    vdup.8      d7, d7[3]
.endm
720
/* OVER_REVERSE operator: solid color, no mask, a8r8g8b8 dest.
   Note the non-default base registers: dest is read into d0-d3 and the
   constant color occupies the src slot d4-d7. */
generate_composite_function \
    pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_reverse_n_8888_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    0,  /* dst_r_basereg */ \
    4,  /* src_basereg   */ \
    24  /* mask_basereg  */
735
736/******************************************************************************/
737
.macro pixman_composite_over_n_8_0565_process_pixblock_head
    /*
     * 'in' step: multiply the a8 mask (d24) into the solid source
     * components (d8 = blue, d9 = green, d10 = red, d11 = alpha).
     * The vrshr/vraddhn pairs implement the usual rounding x/255
     * division used throughout this file.
     */
    vmull.u8    q0, d24, d8
    vmull.u8    q1, d24, d9
    vmull.u8    q6, d24, d10
    vmull.u8    q7, d24, d11
    vrshr.u16   q10, q0, #8
    vrshr.u16   q11, q1, #8
    vrshr.u16   q12, q6, #8
    vrshr.u16   q13, q7, #8
    vraddhn.u16 d0, q0, q10
    vraddhn.u16 d1, q1, q11
    vraddhn.u16 d2, q6, q12
    vraddhn.u16 d3, q7, q13

    /*
     * Expand the r5g6b5 destination (q2) into 8-bit components:
     * d6 = red, d7 = green, d30 = blue.  The vsri/vshrn pairs
     * replicate the top bits into the low bits of each component.
     */
    vshrn.u16   d6, q2, #8
    vshrn.u16   d7, q2, #3
    vsli.u16    q2, q2, #5
    vsri.u8     d6, d6, #5
    vmvn.8      d3, d3 /* d3 = 255 - (mask * src) alpha */
    vsri.u8     d7, d7, #6
    vshrn.u16   d30, q2, #2
    /* now do alpha blending: dest_ch * (255 - alpha) */
    vmull.u8    q10, d3, d6
    vmull.u8    q11, d3, d7
    vmull.u8    q12, d3, d30
    vrshr.u16   q13, q10, #8
    vrshr.u16   q3, q11, #8
    vrshr.u16   q15, q12, #8
    vraddhn.u16 d20, q10, q13 /* red   */
    vraddhn.u16 d23, q11, q3  /* green */
    vraddhn.u16 d22, q12, q15 /* blue  */
.endm

.macro pixman_composite_over_n_8_0565_process_pixblock_tail
    /* add the blended destination to the masked source (saturating) */
    vqadd.u8    d16, d2, d20 /* red */
    vqadd.u8    q9, q0, q11  /* d18 = blue, d19 = green */
    /* convert to r5g6b5: pack r/g/b back into q14 = {d28, d29} */
    vshll.u8    q14, d16, #8
    vshll.u8    q8, d19, #8
    vshll.u8    q9, d18, #8
    vsri.u16    q14, q8, #5
    vsri.u16    q14, q9, #11
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_n_8_0565_process_pixblock_tail_head
    /* naive tail_head: finish and store the previous block, then load
       the next destination/mask chunk and start processing it */
    pixman_composite_over_n_8_0565_process_pixblock_tail
    vst1.16     {d28, d29}, [DST_W, :128]!
    vld1.16     {d4, d5}, [DST_R, :128]!
    vld1.8      {d24}, [MASK]!
    cache_preload 8, 8
    pixman_composite_over_n_8_0565_process_pixblock_head
.endm

/*
 * This function needs a special initialization of solid mask.
 * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
 * offset, split into color components and replicated in d8-d11
 * registers. Additionally, this function needs all the NEON registers,
 * so it has to save d8-d15 registers which are callee saved according
 * to ABI. These registers are restored from 'cleanup' macro. All the
 * other NEON registers are caller saved, so can be clobbered freely
 * without introducing any problems.
 */
.macro pixman_composite_over_n_8_0565_init
    /* compute the argument address before vpush moves sp */
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    vdup.8      d8, d11[0]
    vdup.8      d9, d11[1]
    vdup.8      d10, d11[2]
    vdup.8      d11, d11[3] /* replicate d11 last - it holds the loaded pixel */
.endm

.macro pixman_composite_over_n_8_0565_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8_0565_init, \
    pixman_composite_over_n_8_0565_cleanup, \
    pixman_composite_over_n_8_0565_process_pixblock_head, \
    pixman_composite_over_n_8_0565_process_pixblock_tail, \
    pixman_composite_over_n_8_0565_process_pixblock_tail_head
827
828/******************************************************************************/
829
/* src_0565_0565: plain 16bpp -> 16bpp copy, no per-pixel processing */
.macro pixman_composite_src_0565_0565_process_pixblock_head
.endm

.macro pixman_composite_src_0565_0565_process_pixblock_tail
.endm

.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
    /* store the previous 16 pixels, then fetch the next 16 */
    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
    vld1.16 {d0, d1, d2, d3}, [SRC]!
    cache_preload 16, 16
.endm

generate_composite_function \
    pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
    FLAG_DST_WRITEONLY, \
    16, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0565_0565_process_pixblock_head, \
    pixman_composite_src_0565_0565_process_pixblock_tail, \
    pixman_composite_src_0565_0565_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */
856
857/******************************************************************************/
858
/* src_n_8: fill an a8 destination with a solid 8-bit value */
.macro pixman_composite_src_n_8_process_pixblock_head
.endm

.macro pixman_composite_src_n_8_process_pixblock_tail
.endm

.macro pixman_composite_src_n_8_process_pixblock_tail_head
    /* d0-d3 hold the replicated fill value set up by init */
    vst1.8  {d0, d1, d2, d3}, [DST_W, :128]!
.endm

.macro pixman_composite_src_n_8_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    /* replicate the low byte across all of d0, then copy into d1-d3 */
    vsli.u64    d0, d0, #8
    vsli.u64    d0, d0, #16
    vsli.u64    d0, d0, #32
    vmov        d1, d0
    vmov        q1, q0
.endm

.macro pixman_composite_src_n_8_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
    FLAG_DST_WRITEONLY, \
    32, /* number of pixels, processed in a single block */ \
    0,  /* prefetch distance */ \
    pixman_composite_src_n_8_init, \
    pixman_composite_src_n_8_cleanup, \
    pixman_composite_src_n_8_process_pixblock_head, \
    pixman_composite_src_n_8_process_pixblock_tail, \
    pixman_composite_src_n_8_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */
896
897/******************************************************************************/
898
/* src_n_0565: fill an r5g6b5 destination with a solid 16-bit value */
.macro pixman_composite_src_n_0565_process_pixblock_head
.endm

.macro pixman_composite_src_n_0565_process_pixblock_tail
.endm

.macro pixman_composite_src_n_0565_process_pixblock_tail_head
    /* d0-d3 hold the replicated fill value set up by init */
    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
.endm

.macro pixman_composite_src_n_0565_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    /* replicate the low 16 bits across all of d0, then copy into d1-d3 */
    vsli.u64    d0, d0, #16
    vsli.u64    d0, d0, #32
    vmov        d1, d0
    vmov        q1, q0
.endm

.macro pixman_composite_src_n_0565_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
    FLAG_DST_WRITEONLY, \
    16, /* number of pixels, processed in a single block */ \
    0,  /* prefetch distance */ \
    pixman_composite_src_n_0565_init, \
    pixman_composite_src_n_0565_cleanup, \
    pixman_composite_src_n_0565_process_pixblock_head, \
    pixman_composite_src_n_0565_process_pixblock_tail, \
    pixman_composite_src_n_0565_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */
935
936/******************************************************************************/
937
/* src_n_8888: fill a 32bpp destination with a solid pixel value */
.macro pixman_composite_src_n_8888_process_pixblock_head
.endm

.macro pixman_composite_src_n_8888_process_pixblock_tail
.endm

.macro pixman_composite_src_n_8888_process_pixblock_tail_head
    /* d0-d3 hold the replicated fill value set up by init */
    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
.endm

.macro pixman_composite_src_n_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    /* replicate the 32-bit pixel across all of d0, then copy into d1-d3 */
    vsli.u64    d0, d0, #32
    vmov        d1, d0
    vmov        q1, q0
.endm

.macro pixman_composite_src_n_8888_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    0, /* prefetch distance */ \
    pixman_composite_src_n_8888_init, \
    pixman_composite_src_n_8888_cleanup, \
    pixman_composite_src_n_8888_process_pixblock_head, \
    pixman_composite_src_n_8888_process_pixblock_tail, \
    pixman_composite_src_n_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */
973
974/******************************************************************************/
975
/* src_8888_8888: plain 32bpp -> 32bpp copy, no per-pixel processing */
.macro pixman_composite_src_8888_8888_process_pixblock_head
.endm

.macro pixman_composite_src_8888_8888_process_pixblock_tail
.endm

.macro pixman_composite_src_8888_8888_process_pixblock_tail_head
    /* store the previous 8 pixels, then fetch the next 8 */
    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
    vld1.32 {d0, d1, d2, d3}, [SRC]!
    cache_preload 8, 8
.endm

generate_composite_function \
    pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_8888_8888_process_pixblock_head, \
    pixman_composite_src_8888_8888_process_pixblock_tail, \
    pixman_composite_src_8888_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */
1002
1003/******************************************************************************/
1004
/*
 * src_x888_8888: copy x8r8g8b8 -> a8r8g8b8, forcing the (undefined)
 * alpha byte to 0xFF by ORing with the constant mask built in init
 * (q2 = 0xFF000000 replicated - see the init macro below).
 */
.macro pixman_composite_src_x888_8888_process_pixblock_head
    vorr     q0, q0, q2
    vorr     q1, q1, q2
.endm

.macro pixman_composite_src_x888_8888_process_pixblock_tail
.endm

.macro pixman_composite_src_x888_8888_process_pixblock_tail_head
    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
    vld1.32 {d0, d1, d2, d3}, [SRC]!
    vorr     q0, q0, q2
    vorr     q1, q1, q2
    cache_preload 8, 8
.endm

.macro pixman_composite_src_x888_8888_init
    /* build the 0xFF000000 per-pixel constant in q2 */
    vmov.u8  q2, #0xFF
    vshl.u32 q2, q2, #24
.endm

generate_composite_function \
    pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    pixman_composite_src_x888_8888_init, \
    default_cleanup, \
    pixman_composite_src_x888_8888_process_pixblock_head, \
    pixman_composite_src_x888_8888_process_pixblock_tail, \
    pixman_composite_src_x888_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */
1040
1041/******************************************************************************/
1042
.macro pixman_composite_over_n_8_8888_process_pixblock_head
    /* expecting deinterleaved source data in {d8, d9, d10, d11} */
    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
    /* and destination data in {d4, d5, d6, d7} */
    /* mask is in d24 (d25, d26, d27 are unused) */

    /* in: src_ch = (mask * src_ch) / 255 with rounding (vrshr+vraddhn) */
    vmull.u8    q0, d24, d8
    vmull.u8    q1, d24, d9
    vmull.u8    q6, d24, d10
    vmull.u8    q7, d24, d11
    vrshr.u16   q10, q0, #8
    vrshr.u16   q11, q1, #8
    vrshr.u16   q12, q6, #8
    vrshr.u16   q13, q7, #8
    vraddhn.u16 d0, q0, q10
    vraddhn.u16 d1, q1, q11
    vraddhn.u16 d2, q6, q12
    vraddhn.u16 d3, q7, q13
    vmvn.8      d24, d3  /* get inverted alpha */
    /* source:      d0 - blue, d1 - green, d2 - red, d3 - alpha */
    /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
    /* now do alpha blending: dest_ch * (255 - src_alpha) */
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d24, d5
    vmull.u8    q10, d24, d6
    vmull.u8    q11, d24, d7
.endm

.macro pixman_composite_over_n_8_8888_process_pixblock_tail
    /* finish the /255 division of the blend and add the source
       (saturating); the result ends up in q14/q15 = {d28-d31} */
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
    /* naive tail_head: finish/store the previous block, then load the
       next destination and mask chunk and start processing it */
    pixman_composite_over_n_8_8888_process_pixblock_tail
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vld1.8      {d24}, [MASK]!
    cache_preload 8, 8
    pixman_composite_over_n_8_8888_process_pixblock_head
.endm

/*
 * Load the solid source pixel from the stack, save the callee-saved
 * d8-d15 registers (restored in cleanup), and split the pixel into
 * per-channel registers d8-d11.
 */
.macro pixman_composite_over_n_8_8888_init
    /* compute the argument address before vpush moves sp */
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    vdup.8      d8, d11[0]
    vdup.8      d9, d11[1]
    vdup.8      d10, d11[2]
    vdup.8      d11, d11[3] /* replicate d11 last - it holds the loaded pixel */
.endm

.macro pixman_composite_over_n_8_8888_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8_8888_init, \
    pixman_composite_over_n_8_8888_cleanup, \
    pixman_composite_over_n_8_8888_process_pixblock_head, \
    pixman_composite_over_n_8_8888_process_pixblock_tail, \
    pixman_composite_over_n_8_8888_process_pixblock_tail_head
1119
1120/******************************************************************************/
1121
.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
    /*
     * 'combine_mask_ca' replacement
     *
     * input:  solid src (n) in {d8,  d9,  d10, d11}
     *         dest in          {d4,  d5,  d6,  d7 }
     *         mask in          {d24, d25, d26, d27}
     * output: updated src in   {d0,  d1,  d2,  d3 }
     *         updated mask in  {d24, d25, d26, d3 }
     */
    /* per-channel: src_ch *= mask_ch; mask_ch *= src_alpha (d11) */
    vmull.u8    q0,  d24, d8
    vmull.u8    q1,  d25, d9
    vmull.u8    q6,  d26, d10
    vmull.u8    q7,  d27, d11
    vmull.u8    q9,  d11, d25
    vmull.u8    q12, d11, d24
    vmull.u8    q13, d11, d26
    /* rounding /255 for all eight products (vrshr + vraddhn) */
    vrshr.u16   q8,  q0,  #8
    vrshr.u16   q10, q1,  #8
    vrshr.u16   q11, q6,  #8
    vraddhn.u16 d0,  q0,  q8
    vraddhn.u16 d1,  q1,  q10
    vraddhn.u16 d2,  q6,  q11
    vrshr.u16   q11, q12, #8
    vrshr.u16   q8,  q9,  #8
    vrshr.u16   q6,  q13, #8
    vrshr.u16   q10, q7,  #8
    vraddhn.u16 d24, q12, q11
    vraddhn.u16 d25, q9,  q8
    vraddhn.u16 d26, q13, q6
    vraddhn.u16 d3,  q7,  q10
    /*
     * 'combine_over_ca' replacement
     *
     * output: updated dest in {d28, d29, d30, d31}
     */
    /* dest_ch * (255 - mask_ch) with per-channel inverted mask */
    vmvn.8      d24, d24
    vmvn.8      d25, d25
    vmull.u8    q8,  d24, d4
    vmull.u8    q9,  d25, d5
    vmvn.8      d26, d26
    vmvn.8      d27, d3
    vmull.u8    q10, d26, d6
    vmull.u8    q11, d27, d7
.endm

.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
    /* ... continue 'combine_over_ca' replacement */
    /* finish the /255 division and add the masked source (saturating) */
    vrshr.u16   q14, q8,  #8
    vrshr.u16   q15, q9,  #8
    vrshr.u16   q6,  q10, #8
    vrshr.u16   q7,  q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q6,  q10
    vraddhn.u16 d31, q7,  q11
    vqadd.u8    q14, q0,  q14
    vqadd.u8    q15, q1,  q15
.endm

/*
 * Scheduled tail_head: the tail of the previous block (indented right)
 * is interleaved with the loads of the next destination and mask chunk,
 * then the head macro is expanded and the finished pixels are stored.
 */
.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
        vrshr.u16   q14, q8, #8
        vrshr.u16   q15, q9, #8
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
        vrshr.u16   q6, q10, #8
        vrshr.u16   q7, q11, #8
        vraddhn.u16 d28, q14, q8
        vraddhn.u16 d29, q15, q9
        vraddhn.u16 d30, q6, q10
        vraddhn.u16 d31, q7, q11
    vld4.8      {d24, d25, d26, d27}, [MASK]!
        vqadd.u8    q14, q0, q14
        vqadd.u8    q15, q1, q15
    cache_preload 8, 8
    pixman_composite_over_n_8888_8888_ca_process_pixblock_head
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
.endm

/*
 * Load the solid source pixel from the stack, save callee-saved d8-d15
 * (restored in cleanup), and split the pixel into d8-d11.
 */
.macro pixman_composite_over_n_8888_8888_ca_init
    /* compute the argument address before vpush moves sp */
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    vdup.8      d8, d11[0]
    vdup.8      d9, d11[1]
    vdup.8      d10, d11[2]
    vdup.8      d11, d11[3] /* replicate d11 last - it holds the loaded pixel */
.endm

.macro pixman_composite_over_n_8888_8888_ca_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8888_8888_ca_init, \
    pixman_composite_over_n_8888_8888_ca_cleanup, \
    pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
1224
1225/******************************************************************************/
1226
.macro pixman_composite_add_n_8_8_process_pixblock_head
    /* expecting source data in {d8, d9, d10, d11} */
    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
    /* NOTE: only the alpha component (d11) is actually used here, and
       the init macro below only initializes d11 */
    /* and destination data in {d4, d5, d6, d7} */
    /* mask is in d24, d25, d26, d27 */
    /* dest += mask * src_alpha / 255 (rounding division, saturating add) */
    vmull.u8    q0, d24, d11
    vmull.u8    q1, d25, d11
    vmull.u8    q6, d26, d11
    vmull.u8    q7, d27, d11
    vrshr.u16   q10, q0, #8
    vrshr.u16   q11, q1, #8
    vrshr.u16   q12, q6, #8
    vrshr.u16   q13, q7, #8
    vraddhn.u16 d0, q0, q10
    vraddhn.u16 d1, q1, q11
    vraddhn.u16 d2, q6, q12
    vraddhn.u16 d3, q7, q13
    vqadd.u8    q14, q0, q2
    vqadd.u8    q15, q1, q3
.endm

.macro pixman_composite_add_n_8_8_process_pixblock_tail
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_add_n_8_8_process_pixblock_tail_head
    pixman_composite_add_n_8_8_process_pixblock_tail
    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vld1.8      {d24, d25, d26, d27}, [MASK]!
    cache_preload 32, 32
    pixman_composite_add_n_8_8_process_pixblock_head
.endm

/*
 * Load the solid source pixel and replicate only its alpha byte into
 * d11 (the other components are not needed by the head macro above).
 * d8-d15 are callee saved, hence the vpush/vpop pair.
 */
.macro pixman_composite_add_n_8_8_init
    /* compute the argument address before vpush moves sp */
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    vdup.8      d11, d11[3]
.endm

.macro pixman_composite_add_n_8_8_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_add_n_8_8_init, \
    pixman_composite_add_n_8_8_cleanup, \
    pixman_composite_add_n_8_8_process_pixblock_head, \
    pixman_composite_add_n_8_8_process_pixblock_tail, \
    pixman_composite_add_n_8_8_process_pixblock_tail_head
1282
1283/******************************************************************************/
1284
.macro pixman_composite_add_8_8_8_process_pixblock_head
    /* expecting source data in {d0, d1, d2, d3} */
    /* destination data in {d4, d5, d6, d7} */
    /* mask in {d24, d25, d26, d27} */
    /* dest += mask * src / 255 (rounding division, saturating add) */
    vmull.u8    q8, d24, d0
    vmull.u8    q9, d25, d1
    vmull.u8    q10, d26, d2
    vmull.u8    q11, d27, d3
    vrshr.u16   q0, q8, #8
    vrshr.u16   q1, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d0, q0, q8
    vraddhn.u16 d1, q1, q9
    vraddhn.u16 d2, q12, q10
    vraddhn.u16 d3, q13, q11
    vqadd.u8    q14, q0, q2
    vqadd.u8    q15, q1, q3
.endm

.macro pixman_composite_add_8_8_8_process_pixblock_tail
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_add_8_8_8_process_pixblock_tail_head
    pixman_composite_add_8_8_8_process_pixblock_tail
    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vld1.8      {d24, d25, d26, d27}, [MASK]!
    vld1.8      {d0, d1, d2, d3}, [SRC]!
    cache_preload 32, 32
    pixman_composite_add_8_8_8_process_pixblock_head
.endm

.macro pixman_composite_add_8_8_8_init
.endm

.macro pixman_composite_add_8_8_8_cleanup
.endm

generate_composite_function \
    pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_add_8_8_8_init, \
    pixman_composite_add_8_8_8_cleanup, \
    pixman_composite_add_8_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_8_process_pixblock_tail, \
    pixman_composite_add_8_8_8_process_pixblock_tail_head
1335
1336/******************************************************************************/
1337
.macro pixman_composite_add_8888_8888_8888_process_pixblock_head
    /* expecting source data in {d0, d1, d2, d3} */
    /* destination data in {d4, d5, d6, d7} */
    /* mask in {d24, d25, d26, d27} */
    /* NOTE: only the mask alpha channel (d27) is used - this is the
       non-component-alpha variant */
    /* dest_ch += mask_alpha * src_ch / 255 (rounding, saturating add) */
    vmull.u8    q8, d27, d0
    vmull.u8    q9, d27, d1
    vmull.u8    q10, d27, d2
    vmull.u8    q11, d27, d3
    vrshr.u16   q0, q8, #8
    vrshr.u16   q1, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d0, q0, q8
    vraddhn.u16 d1, q1, q9
    vraddhn.u16 d2, q12, q10
    vraddhn.u16 d3, q13, q11
    vqadd.u8    q14, q0, q2
    vqadd.u8    q15, q1, q3
.endm

.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
    pixman_composite_add_8888_8888_8888_process_pixblock_tail
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vld4.8      {d24, d25, d26, d27}, [MASK]!
    vld4.8      {d0, d1, d2, d3}, [SRC]!
    cache_preload 8, 8
    pixman_composite_add_8888_8888_8888_process_pixblock_head
.endm

generate_composite_function \
    pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head

/* single-scanline variant reusing the same pixblock macros */
generate_composite_function_single_scanline \
    pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
1392
1393/******************************************************************************/
1394
.macro pixman_composite_over_8888_n_8888_process_pixblock_head
    /* expecting source data in {d0, d1, d2, d3} */
    /* destination data in {d4, d5, d6, d7} */
    /* solid mask is in d15 */

    /* 'in': src_ch = (mask * src_ch) / 255 with rounding */
    vmull.u8    q8, d15, d3
    vmull.u8    q6, d15, d2
    vmull.u8    q5, d15, d1
    vmull.u8    q4, d15, d0
    vrshr.u16   q13, q8, #8
    vrshr.u16   q12, q6, #8
    vrshr.u16   q11, q5, #8
    vrshr.u16   q10, q4, #8
    vraddhn.u16 d3, q8, q13
    vraddhn.u16 d2, q6, q12
    vraddhn.u16 d1, q5, q11
    vraddhn.u16 d0, q4, q10
    vmvn.8      d24, d3  /* get inverted alpha */
    /* now do alpha blending: dest_ch * (255 - src_alpha) */
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d24, d5
    vmull.u8    q10, d24, d6
    vmull.u8    q11, d24, d7
.endm

.macro pixman_composite_over_8888_n_8888_process_pixblock_tail
    /* finish the /255 division of the blend and add the masked source
       (saturating); the result ends up in q14/q15 = {d28-d31} */
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
    pixman_composite_over_8888_n_8888_process_pixblock_tail
    vld4.8     {d0, d1, d2, d3}, [SRC]!
    cache_preload 8, 8
    pixman_composite_over_8888_n_8888_process_pixblock_head
    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
.endm

/*
 * Load the solid mask and replicate its alpha byte into d15.
 * NOTE(review): #48 is a hard-coded stack offset, presumably
 * ARGS_STACK_OFFSET + 8 to skip the source argument and reach the
 * mask - confirm against pixman-arm-neon-asm.h.  The address is
 * computed before vpush moves sp.
 */
.macro pixman_composite_over_8888_n_8888_init
    add         DUMMY, sp, #48
    vpush       {d8-d15}
    vld1.32     {d15[0]}, [DUMMY]
    vdup.8      d15, d15[3]
.endm

.macro pixman_composite_over_8888_n_8888_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_8888_n_8888_init, \
    pixman_composite_over_8888_n_8888_cleanup, \
    pixman_composite_over_8888_n_8888_process_pixblock_head, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail_head
1465
1466/******************************************************************************/
1467
/* TODO: expand macros and do better instructions scheduling */
/*
 * OVER with an a8r8g8b8 mask: the mask is loaded deinterleaved into
 * d12-d15, and the shared over_8888_n_8888 head macro then uses only
 * its alpha channel (d15).
 */
.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
    pixman_composite_over_8888_n_8888_process_pixblock_tail
    vld4.8     {d0, d1, d2, d3}, [SRC]!
    cache_preload 8, 8
    vld4.8     {d12, d13, d14, d15}, [MASK]!
    pixman_composite_over_8888_n_8888_process_pixblock_head
    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
.endm

/* d8-d15 are callee saved per the ABI and are clobbered by the head macro */
.macro pixman_composite_over_8888_8888_8888_init
    vpush       {d8-d15}
.endm

.macro pixman_composite_over_8888_8888_8888_cleanup
    vpop        {d8-d15}
.endm
1486
/*
 * Reuses the over_8888_n_8888 head/tail macros; only the tail_head
 * (which additionally loads the 8888 mask) is specific.
 * NOTE: a comma was added after the tail_head argument for consistency
 * with every other invocation in this file (GAS also accepts
 * blank-separated macro arguments, so this is behavior-neutral).
 */
generate_composite_function \
    pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_8888_8888_8888_init, \
    pixman_composite_over_8888_8888_8888_cleanup, \
    pixman_composite_over_8888_n_8888_process_pixblock_head, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    12  /* mask_basereg  */
1501
/*
 * Single-scanline variant of OVER with an a8r8g8b8 mask, reusing the
 * same init/cleanup and pixblock macros.
 * NOTE: a comma was added after the tail_head argument for consistency
 * with every other invocation in this file (GAS also accepts
 * blank-separated macro arguments, so this is behavior-neutral).
 */
generate_composite_function_single_scanline \
    pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    pixman_composite_over_8888_8888_8888_init, \
    pixman_composite_over_8888_8888_8888_cleanup, \
    pixman_composite_over_8888_n_8888_process_pixblock_head, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    12  /* mask_basereg  */
1515
1516/******************************************************************************/
1517
/* TODO: expand macros and do better instructions scheduling */
/*
 * OVER with an a8 mask: the mask byte is loaded into d15, which is
 * exactly where the shared over_8888_n_8888 head macro expects the
 * (per-pixel) mask alpha.
 */
.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
    pixman_composite_over_8888_n_8888_process_pixblock_tail
    vld4.8     {d0, d1, d2, d3}, [SRC]!
    cache_preload 8, 8
    vld1.8     {d15}, [MASK]!
    pixman_composite_over_8888_n_8888_process_pixblock_head
    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
.endm

/* d8-d15 are callee saved per the ABI and are clobbered by the head macro */
.macro pixman_composite_over_8888_8_8888_init
    vpush       {d8-d15}
.endm

.macro pixman_composite_over_8888_8_8888_cleanup
    vpop        {d8-d15}
.endm
1536
/*
 * Reuses the over_8888_n_8888 head/tail macros; only the tail_head
 * (which additionally loads the a8 mask) is specific.
 * NOTE: a comma was added after the tail_head argument for consistency
 * with every other invocation in this file (GAS also accepts
 * blank-separated macro arguments, so this is behavior-neutral).
 */
generate_composite_function \
    pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_8888_8_8888_init, \
    pixman_composite_over_8888_8_8888_cleanup, \
    pixman_composite_over_8888_n_8888_process_pixblock_head, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    15  /* mask_basereg  */
1551
1552/******************************************************************************/
1553
/* src_0888_0888: plain 24bpp -> 24bpp copy, no per-pixel processing */
.macro pixman_composite_src_0888_0888_process_pixblock_head
.endm

.macro pixman_composite_src_0888_0888_process_pixblock_tail
.endm

.macro pixman_composite_src_0888_0888_process_pixblock_tail_head
    /* store the previous 8 pixels (3 bytes each), then fetch the next 8 */
    vst3.8 {d0, d1, d2}, [DST_W]!
    vld3.8 {d0, d1, d2}, [SRC]!
    cache_preload 8, 8
.endm

generate_composite_function \
    pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0888_0888_process_pixblock_head, \
    pixman_composite_src_0888_0888_process_pixblock_tail, \
    pixman_composite_src_0888_0888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */
1580
1581/******************************************************************************/
1582
1583.macro pixman_composite_src_0888_8888_rev_process_pixblock_head
1584    vswp   d0, d2
1585.endm
1586
1587.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
1588.endm
1589
1590.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
1591    vst4.8 {d0, d1, d2, d3}, [DST_W]!
1592    vld3.8 {d0, d1, d2}, [SRC]!
1593    vswp   d0, d2
1594    cache_preload 8, 8
1595.endm
1596
/*
 * One-time init: clear d3 so it can serve as the constant fourth byte
 * (padding/alpha slot) of every 32bpp pixel stored by vst4 above.
 */
.macro pixman_composite_src_0888_8888_rev_init
    veor   d3, d3, d3               /* d3 = 0 */
.endm
1600
/*
 * Instantiate the SRC operation for 24bpp -> 32bpp with channel-order
 * reversal.  FLAG_DEINTERLEAVE_32BPP makes the framework use
 * vld4/vst4-style de-interleaved 32bpp access for the destination.
 */
generate_composite_function \
    pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, /* src_bpp, mask_bpp, dst_bpp */ \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    pixman_composite_src_0888_8888_rev_init, \
    default_cleanup, \
    pixman_composite_src_0888_8888_rev_process_pixblock_head, \
    pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
    pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */
1615
1616/******************************************************************************/
1617
/*
 * src_0888_0565_rev: convert 24bpp to r5g6b5-style 16bpp with reversed
 * channel order.  The head widens the 2nd and 3rd channels into the
 * top 8 bits of 16-bit lanes so the tail can vsri-insert them.
 */
.macro pixman_composite_src_0888_0565_rev_process_pixblock_head
    vshll.u8    q8, d1, #8          /* q8 lanes  = d1 << 8 */
    vshll.u8    q9, d2, #8          /* q9 lanes  = d2 << 8 */
.endm
1622
/*
 * Pack the three widened channels into 16-bit 5:6:5 pixels in q14:
 * bits 15:11 = d0[7:3], bits 10:5 = d1[7:2], bits 4:0 = d2[7:3].
 */
.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
    vshll.u8    q14, d0, #8         /* d0's top 5 bits land in bits 15:11 */
    vsri.u16    q14, q8, #5         /* insert d1's top 6 bits at bits 10:5 */
    vsri.u16    q14, q9, #11        /* insert d2's top 5 bits at bits 4:0 */
.endm
1628
/*
 * Interleaved tail+head for software pipelining.  Per this file's
 * convention, the deeper-indented instructions are the tail of the
 * previous block (packing q8/q9/d0 into q14 and storing it) and the
 * normally-indented ones are the head of the next block (load + widen).
 * The interleaving hides NEON latencies; do not reorder.
 */
.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
        vshll.u8    q14, d0, #8             /* tail: start packing prev block */
    vld3.8 {d0, d1, d2}, [SRC]!             /* head: load next 8 pixels */
        vsri.u16    q14, q8, #5             /* tail: insert 6-bit channel */
        vsri.u16    q14, q9, #11            /* tail: insert low 5-bit channel */
    vshll.u8    q8, d1, #8                  /* head: widen next d1 */
        vst1.16 {d28, d29}, [DST_W, :128]!  /* tail: store 8 x 16bpp pixels */
    vshll.u8    q9, d2, #8                  /* head: widen next d2 */
.endm
1638
/*
 * Instantiate the SRC operation for 24bpp -> 16bpp (5:6:5) with
 * reversed channel order.  dst_w_basereg = 28 because the packed
 * result is written from d28/d29 (q14).
 */
generate_composite_function \
    pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, /* src_bpp, mask_bpp, dst_bpp */ \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0888_0565_rev_process_pixblock_head, \
    pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
    pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */
1653
1654/******************************************************************************/
1655
/*
 * src_pixbuf_8888: premultiply.  Multiply each of the three colour
 * channels (d0-d2) by the fourth channel d3 (alpha), producing 16-bit
 * products in q8-q10 for the ÷255 reduction done in the tail.
 */
.macro pixman_composite_src_pixbuf_8888_process_pixblock_head
    vmull.u8    q8, d3, d0          /* q8  = a * c0, 16-bit per lane */
    vmull.u8    q9, d3, d1          /* q9  = a * c1 */
    vmull.u8    q10, d3, d2         /* q10 = a * c2 */
.endm
1661
/*
 * Reduce the 16-bit products to 8 bits with the standard rounded ÷255
 * idiom: result = (t + ((t + 128) >> 8) + 128) >> 8, built from the
 * vrshr.u16 #8 / vraddhn.u16 pair.  Alpha is moved to d31 so it is
 * stored unmodified as the fourth channel, and the colour products are
 * narrowed into d30/d29/d28 — note the d0->d30, d2->d28 reversal,
 * which swaps the 1st/3rd channels in the stored pixel.
 */
.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
    vrshr.u16   q11, q8, #8         /* q11 = (q8  + 128) >> 8 */
    vswp        d3, d31             /* stash alpha in d31 for the store */
    vrshr.u16   q12, q9, #8         /* q12 = (q9  + 128) >> 8 */
    vrshr.u16   q13, q10, #8        /* q13 = (q10 + 128) >> 8 */
    vraddhn.u16 d30, q11, q8        /* d30 ≈ (a * c0) / 255 */
    vraddhn.u16 d29, q12, q9        /* d29 ≈ (a * c1) / 255 */
    vraddhn.u16 d28, q13, q10       /* d28 ≈ (a * c2) / 255 */
.endm
1671
/*
 * Interleaved tail+head with prefetch bookkeeping.  Deeper-indented
 * instructions are the tail of the previous block; normally-indented
 * ones are the head of the next; far-right `PF` lines implement the
 * framework's advanced prefetcher (PF expands to nothing when that
 * prefetch mode is disabled — see the commented functions referenced
 * at the top of this file).  Instruction order is latency-tuned; do
 * not reorder.
 */
.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
        vrshr.u16   q11, q8, #8            /* tail: rounding term for c0 */
        vswp        d3, d31                /* tail: alpha -> d31 for store */
        vrshr.u16   q12, q9, #8            /* tail: rounding term for c1 */
        vrshr.u16   q13, q10, #8           /* tail: rounding term for c2 */
    vld4.8 {d0, d1, d2, d3}, [SRC]!        /* head: load next 8 pixels */
        vraddhn.u16 d30, q11, q8           /* tail: d30 ≈ a*c0/255 */
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
        vraddhn.u16 d29, q12, q9           /* tail: d29 ≈ a*c1/255 */
        vraddhn.u16 d28, q13, q10          /* tail: d28 ≈ a*c2/255 */
    vmull.u8    q8, d3, d0                 /* head: a * c0 */
    vmull.u8    q9, d3, d1                 /* head: a * c1 */
    vmull.u8    q10, d3, d2                /* head: a * c2 */
        vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!  /* tail: store 8 x 32bpp */
                                    PF cmp PF_X, ORIG_W
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
                                    PF subge PF_X, PF_X, ORIG_W
                                    PF subges PF_CTL, PF_CTL, #0x10
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
.endm
1695
/*
 * Instantiate the SRC operation for the 32bpp "pixbuf" premultiply
 * conversion.  dst_w_basereg = 28 because results are stored from
 * d28-d31; the source is read via d0-d3 (src_basereg = 0).
 */
generate_composite_function \
    pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, /* src_bpp, mask_bpp, dst_bpp */ \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_pixbuf_8888_process_pixblock_head, \
    pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
    pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */
1710