/*
 * Copyright © 2009 Nokia Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
 */

/*
 * This file contains implementations of NEON optimized pixel processing
 * functions. There is no full and detailed tutorial, but some functions
 * (those which expose some new or interesting features) are
 * extensively commented and can be used as examples.
 *
 * You may want to have a look at the comments for the following functions:
 *  - pixman_composite_over_8888_0565_asm_neon
 *  - pixman_composite_over_n_8_0565_asm_neon
 */

/* Prevent the stack from becoming executable for no reason... */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

    .text
    .fpu neon
    .arch armv7a
    .object_arch armv4
    .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
    .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
    .arm
    .altmacro
    .p2align 2

#include "pixman-private.h"
#include "pixman-arm-asm.h"
#include "pixman-arm-neon-asm.h"

/* Global configuration options and preferences */

/*
 * The code can optionally make use of unaligned memory accesses to improve
 * performance of handling leading/trailing pixels for each scanline.
 * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0, for
 * example on Linux, if unaligned memory accesses are not configured to
 * generate exceptions.
 */
.set RESPECT_STRICT_ALIGNMENT, 1
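/*
 * For instance, a build targeting a system where unaligned accesses are
 * known to be configured as safe could use instead (an assumption to be
 * verified per platform, and worth benchmarking):
 *
 *  .set RESPECT_STRICT_ALIGNMENT, 0
 */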

/*
 * Set default prefetch type. There is a choice between the following options:
 *
 * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
 * as NOP to workaround some HW bugs or for whatever other reason)
 *
 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
 * advanced prefetch introduces heavy overhead)
 *
 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
 * which can run ARM and NEON instructions simultaneously so that extra ARM
 * instructions do not add (many) extra cycles, but improve prefetch efficiency)
 *
 * Note: some types of function can't support advanced prefetch and fall back
 *       to the simple one (those which handle 24bpp pixels)
 */
.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
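/*
 * As a hypothetical example, a build tuned for a simple single-issue core
 * could instead select:
 *
 *  .set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_SIMPLE
 */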

/* Prefetch distance in pixels for simple prefetch */
.set PREFETCH_DISTANCE_SIMPLE, 64

/*
 * Implementation of pixman_composite_over_8888_0565_asm_neon
 *
 * This function takes an a8r8g8b8 source buffer and an r5g6b5 destination
 * buffer and performs the OVER compositing operation. The function
 * fast_composite_over_8888_0565 from pixman-fast-path.c does the same in C
 * and can be used as a reference.
 *
 * First we need to have some NEON assembly code which can do the actual
 * operation on the pixels and provide it to the template macro.
 *
 * The template macro quite conveniently takes care of emitting all the
 * necessary code for memory reading and writing (including quite tricky
 * cases of handling unaligned leading/trailing pixels), so we only need to
 * deal with the data in NEON registers.
 *
 * The recommended NEON register allocation in general is the following:
 * d0,  d1,  d2,  d3  - contain loaded source pixel data
 * d4,  d5,  d6,  d7  - contain loaded destination pixels (if they are needed)
 * d24, d25, d26, d27 - contain loaded mask pixel data (if a mask is used)
 * d28, d29, d30, d31 - place for storing the result (destination pixels)
 *
 * As can be seen above, four 64-bit NEON registers are used for keeping
 * intermediate pixel data and up to 8 pixels can be processed in one step
 * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
 *
 * This particular function uses the following register allocation:
 * d0,  d1,  d2,  d3  - contain loaded source pixel data
 * d4,  d5            - contain loaded destination pixels (they are needed)
 * d28, d29           - place for storing the result (destination pixels)
 */
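/*
 * For reference, here is a rough scalar C sketch of what this operation
 * computes for one pixel (an illustrative approximation with a hypothetical
 * helper name, not the exact code from pixman-fast-path.c; the NEON code
 * below uses rounding when dividing by 255, this sketch truncates):
 *
 *  #include <stdint.h>
 *
 *  static inline uint16_t
 *  over_8888_0565_pixel (uint32_t src, uint16_t dst)
 *  {
 *      uint32_t ia = 255 - (src >> 24);   // inverted source alpha
 *      uint32_t dr = (dst >> 11) & 0x1f;
 *      uint32_t dg = (dst >> 5) & 0x3f;
 *      uint32_t db = dst & 0x1f;
 *      // expand r5g6b5 components to 8 bits by replicating top bits
 *      dr = (dr << 3) | (dr >> 2);
 *      dg = (dg << 2) | (dg >> 4);
 *      db = (db << 3) | (db >> 2);
 *      // OVER with premultiplied source: result = src + dst * (1 - alpha)
 *      uint32_t r = ((src >> 16) & 0xff) + dr * ia / 255;
 *      uint32_t g = ((src >> 8)  & 0xff) + dg * ia / 255;
 *      uint32_t b = ( src        & 0xff) + db * ia / 255;
 *      return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
 *  }
 */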

/*
 * Step one. We need to have some code to do some arithmetic on pixel data.
 * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
 * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
 * perform all the needed calculations and write the result to {d28, d29}.
 * The rationale for having two macros and not just one will be explained
 * later. In practice, any single monolithic function which does the work can
 * be split into two parts in any arbitrary way without affecting correctness.
 *
 * There is one special trick here too. The common template macro can
 * optionally make our life a bit easier by doing R, G, B, A color component
 * deinterleaving for 32bpp pixel formats (and this feature is used in the
 * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
 * instead of having 8 packed pixels in the {d0, d1, d2, d3} registers, we
 * actually use the d0 register for the blue channel (a vector of eight 8-bit
 * values), the d1 register for green, d2 for red and d3 for alpha. This
 * simple conversion can also be done with a few NEON instructions:
 *
 * Packed to planar conversion:
 *  vuzp.8 d0, d1
 *  vuzp.8 d2, d3
 *  vuzp.8 d1, d3
 *  vuzp.8 d0, d2
 *
 * Planar to packed conversion:
 *  vzip.8 d0, d2
 *  vzip.8 d1, d3
 *  vzip.8 d2, d3
 *  vzip.8 d0, d1
 *
 * But pixels can be loaded directly in planar format using the VLD4.8 NEON
 * instruction. It is 1 cycle slower than VLD1.32, so this is not always
 * desirable; that's why deinterleaving is optional.
 *
 * But anyway, here is the code:
 */
.macro pixman_composite_over_8888_0565_process_pixblock_head
    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
       and put data into d6 - red, d7 - green, d30 - blue */
    vshrn.u16   d6, q2, #8
    vshrn.u16   d7, q2, #3
    vsli.u16    q2, q2, #5
    vsri.u8     d6, d6, #5
    vmvn.8      d3, d3      /* invert source alpha */
    vsri.u8     d7, d7, #6
    vshrn.u16   d30, q2, #2
    /* now do alpha blending, storing results in 8-bit planar format
       into d16 - red, d19 - green, d18 - blue */
    vmull.u8    q10, d3, d6
    vmull.u8    q11, d3, d7
    vmull.u8    q12, d3, d30
    vrshr.u16   q13, q10, #8
    vrshr.u16   q3, q11, #8
    vrshr.u16   q15, q12, #8
    vraddhn.u16 d20, q10, q13
    vraddhn.u16 d23, q11, q3
    vraddhn.u16 d22, q12, q15
.endm

.macro pixman_composite_over_8888_0565_process_pixblock_tail
    /* ... continue alpha blending */
    vqadd.u8    d16, d2, d20
    vqadd.u8    q9, q0, q11
    /* convert the result to r5g6b5 and store it into {d28, d29} */
    vshll.u8    q14, d16, #8
    vshll.u8    q8, d19, #8
    vshll.u8    q9, d18, #8
    vsri.u16    q14, q8, #5
    vsri.u16    q14, q9, #11
.endm

/*
 * OK, now we have almost everything that we need. Using the above two
 * macros, the work can be done. But now we want to optimize
 * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
 * a lot from good code scheduling and software pipelining.
 *
 * Let's construct some code which will run in the core main loop.
 * Some pseudo-code of the main loop will look like this:
 *   head
 *   while (...) {
 *     tail
 *     head
 *   }
 *   tail
 *
 * It may look a bit weird, but this setup allows us to hide instruction
 * latencies better and also utilize the dual-issue capability more
 * efficiently (making pairs of load-store and ALU instructions).
 *
 * So what we need now is a '*_tail_head' macro, which will be used
 * in the core main loop. A trivial straightforward implementation
 * of this macro would look like this:
 *
 *   pixman_composite_over_8888_0565_process_pixblock_tail
 *   vst1.16     {d28, d29}, [DST_W, :128]!
 *   vld1.16     {d4, d5}, [DST_R, :128]!
 *   vld4.32     {d0, d1, d2, d3}, [SRC]!
 *   pixman_composite_over_8888_0565_process_pixblock_head
 *   cache_preload 8, 8
 *
 * Now it also has some VLD/VST instructions. We simply can't move from
 * processing one block of pixels to the next one with just arithmetic.
 * The previously processed data needs to be written to memory and new
 * data needs to be fetched. Fortunately, this main loop does not deal
 * with partial leading/trailing pixels and can load/store a full block
 * of pixels in a bulk. Additionally, the destination buffer is already
 * 16-byte aligned here (which is good for performance).
 *
 * New things here are the DST_R, DST_W, SRC and MASK identifiers. These
 * are aliases for the ARM registers which are used as pointers for
 * accessing data. We maintain separate pointers for reading and writing
 * the destination buffer (DST_R and DST_W).
 *
 * Another new thing is the 'cache_preload' macro. It is used for prefetching
 * data into the CPU L2 cache and improves performance when dealing with
 * images which are far larger than the cache size. It takes one argument
 * (actually two, but they need to be the same here) - the number of pixels
 * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
 * details about this macro. Moreover, if good performance is needed,
 * the code from this macro needs to be copied into the '*_tail_head' macro
 * and mixed with the rest of the code for optimal instruction scheduling.
 * We are actually doing that below.
 *
 * Now after all the explanations, here is the optimized code.
 * Different instruction streams (originating from the '*_head', '*_tail'
 * and 'cache_preload' macros) use different indentation levels for
 * better readability. Actually, taking the code from one of these
 * indentation levels and ignoring a few VLD/VST instructions would
 * result in exactly the code from the '*_head', '*_tail' or 'cache_preload'
 * macro!
 */

#if 1

.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
        vqadd.u8    d16, d2, d20
    vld1.16     {d4, d5}, [DST_R, :128]!
        vqadd.u8    q9, q0, q11
    vshrn.u16   d6, q2, #8
    fetch_src_pixblock
    vshrn.u16   d7, q2, #3
    vsli.u16    q2, q2, #5
        vshll.u8    q14, d16, #8
                                    PF add PF_X, PF_X, #8
        vshll.u8    q8, d19, #8
                                    PF tst PF_CTL, #0xF
    vsri.u8     d6, d6, #5
                                    PF addne PF_X, PF_X, #8
    vmvn.8      d3, d3
                                    PF subne PF_CTL, PF_CTL, #1
    vsri.u8     d7, d7, #6
    vshrn.u16   d30, q2, #2
    vmull.u8    q10, d3, d6
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vmull.u8    q11, d3, d7
    vmull.u8    q12, d3, d30
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
        vsri.u16    q14, q8, #5
                                    PF cmp PF_X, ORIG_W
        vshll.u8    q9, d18, #8
    vrshr.u16   q13, q10, #8
                                    PF subge PF_X, PF_X, ORIG_W
    vrshr.u16   q3, q11, #8
    vrshr.u16   q15, q12, #8
                                    PF subges PF_CTL, PF_CTL, #0x10
        vsri.u16    q14, q9, #11
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vraddhn.u16 d20, q10, q13
    vraddhn.u16 d23, q11, q3
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vraddhn.u16 d22, q12, q15
        vst1.16     {d28, d29}, [DST_W, :128]!
.endm

#else

/* If we did not care much about the performance, we would just use this... */
.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
    pixman_composite_over_8888_0565_process_pixblock_tail
    vst1.16     {d28, d29}, [DST_W, :128]!
    vld1.16     {d4, d5}, [DST_R, :128]!
    fetch_src_pixblock
    pixman_composite_over_8888_0565_process_pixblock_head
    cache_preload 8, 8
.endm

#endif

/*
 * And now the final part. We are using the 'generate_composite_function'
 * macro to put all the stuff together. We are specifying the name of the
 * function which we want to get, the number of bits per pixel for the
 * source, mask and destination (0 if unused, like the mask in this case).
 * Next come some bit flags:
 *   FLAG_DST_READWRITE      - tells that the destination buffer is both read
 *                             and written; for a write-only buffer we would
 *                             use the FLAG_DST_WRITEONLY flag instead
 *   FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
 *                             and separate color channels for the 32bpp format
 * The next things are:
 *  - the number of pixels processed per iteration (8 in this case, because
 *    that's the maximum that can fit into four 64-bit NEON registers).
 *  - prefetch distance, measured in pixel blocks. In this case it is 5 times
 *    by 8 pixels. That would be 40 pixels, or up to 160 bytes. The optimal
 *    prefetch distance can be selected by running some benchmarks.
 *
 * After that we specify some macros. These are 'default_init' and
 * 'default_cleanup' here, which are empty (but it is possible to have custom
 * init/cleanup macros to be able to save/restore some extra NEON registers
 * like d8-d15 or do anything else), followed by
 * 'pixman_composite_over_8888_0565_process_pixblock_head',
 * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
 * 'pixman_composite_over_8888_0565_process_pixblock_tail_head',
 * which we got implemented above.
 *
 * The last part is the NEON register allocation scheme.
 */
generate_composite_function \
    pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_0565_process_pixblock_head, \
    pixman_composite_over_8888_0565_process_pixblock_tail, \
    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_over_n_0565_process_pixblock_head
    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
       and put data into d6 - red, d7 - green, d30 - blue */
    vshrn.u16   d6, q2, #8
    vshrn.u16   d7, q2, #3
    vsli.u16    q2, q2, #5
    vsri.u8     d6, d6, #5
    vsri.u8     d7, d7, #6
    vshrn.u16   d30, q2, #2
    /* now do alpha blending, storing results in 8-bit planar format
       into d16 - red, d19 - green, d18 - blue */
    vmull.u8    q10, d3, d6
    vmull.u8    q11, d3, d7
    vmull.u8    q12, d3, d30
    vrshr.u16   q13, q10, #8
    vrshr.u16   q3, q11, #8
    vrshr.u16   q15, q12, #8
    vraddhn.u16 d20, q10, q13
    vraddhn.u16 d23, q11, q3
    vraddhn.u16 d22, q12, q15
.endm

.macro pixman_composite_over_n_0565_process_pixblock_tail
    /* ... continue alpha blending */
    vqadd.u8    d16, d2, d20
    vqadd.u8    q9, q0, q11
    /* convert the result to r5g6b5 and store it into {d28, d29} */
    vshll.u8    q14, d16, #8
    vshll.u8    q8, d19, #8
    vshll.u8    q9, d18, #8
    vsri.u16    q14, q8, #5
    vsri.u16    q14, q9, #11
.endm

/* TODO: expand macros and do better instruction scheduling */
.macro pixman_composite_over_n_0565_process_pixblock_tail_head
    pixman_composite_over_n_0565_process_pixblock_tail
    vld1.16     {d4, d5}, [DST_R, :128]!
    vst1.16     {d28, d29}, [DST_W, :128]!
    pixman_composite_over_n_0565_process_pixblock_head
    cache_preload 8, 8
.endm

.macro pixman_composite_over_n_0565_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]
    vdup.8      d0, d3[0]
    vdup.8      d1, d3[1]
    vdup.8      d2, d3[2]
    vdup.8      d3, d3[3]
    vmvn.8      d3, d3      /* invert source alpha */
.endm

generate_composite_function \
    pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_0565_init, \
    default_cleanup, \
    pixman_composite_over_n_0565_process_pixblock_head, \
    pixman_composite_over_n_0565_process_pixblock_tail, \
    pixman_composite_over_n_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_8888_0565_process_pixblock_head
    vshll.u8    q8, d1, #8
    vshll.u8    q14, d2, #8
    vshll.u8    q9, d0, #8
.endm

.macro pixman_composite_src_8888_0565_process_pixblock_tail
    vsri.u16    q14, q8, #5
    vsri.u16    q14, q9, #11
.endm
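/*
 * The head/tail pair above packs a8r8g8b8 to r5g6b5 using only shifts and
 * shift-insert operations: VSHLL widens each byte to 'component << 8' and
 * the two VSRI instructions then insert the shifted green and blue fields
 * below the red one. As an illustrative scalar sketch (hypothetical helper,
 * not part of pixman), the same bit manipulation in C is:
 *
 *  uint16_t pack_0565 (uint8_t r, uint8_t g, uint8_t b)
 *  {
 *      return (uint16_t)(((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3));
 *  }
 */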

.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
        vsri.u16    q14, q8, #5
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
    fetch_src_pixblock
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
        vsri.u16    q14, q9, #11
                                    PF cmp PF_X, ORIG_W
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vshll.u8    q8, d1, #8
        vst1.16     {d28, d29}, [DST_W, :128]!
                                    PF subge PF_X, PF_X, ORIG_W
                                    PF subges PF_CTL, PF_CTL, #0x10
    vshll.u8    q14, d2, #8
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vshll.u8    q9, d0, #8
.endm

generate_composite_function \
    pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_8888_0565_process_pixblock_head, \
    pixman_composite_src_8888_0565_process_pixblock_tail, \
    pixman_composite_src_8888_0565_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_src_0565_8888_process_pixblock_head
    vshrn.u16   d30, q0, #8
    vshrn.u16   d29, q0, #3
    vsli.u16    q0, q0, #5
    vmov.u8     d31, #255
    vsri.u8     d30, d30, #5
    vsri.u8     d29, d29, #6
    vshrn.u16   d28, q0, #2
.endm
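/*
 * This head macro widens r5g6b5 to planar 8-bit channels, replicating the
 * top bits of each field so that e.g. 0x1f expands to 0xff exactly. A scalar
 * sketch of the per-channel math (for illustration only):
 *
 *  r8 = (r5 << 3) | (r5 >> 2);   // vshrn #8 + vsri #5
 *  g8 = (g6 << 2) | (g6 >> 4);   // vshrn #3 + vsri #6
 *  b8 = (b5 << 3) | (b5 >> 2);   // vsli #5 + vshrn #2
 *
 * The alpha channel is simply set to 255 (vmov d31).
 */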

.macro pixman_composite_src_0565_8888_process_pixblock_tail
.endm

/* TODO: expand macros and do better instruction scheduling */
.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
    pixman_composite_src_0565_8888_process_pixblock_tail
    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
    fetch_src_pixblock
    pixman_composite_src_0565_8888_process_pixblock_head
    cache_preload 8, 8
.endm

generate_composite_function \
    pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0565_8888_process_pixblock_head, \
    pixman_composite_src_0565_8888_process_pixblock_tail, \
    pixman_composite_src_0565_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_add_8_8_process_pixblock_head
    vqadd.u8    q14, q0, q2
    vqadd.u8    q15, q1, q3
.endm
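/*
 * VQADD performs a saturating add: each byte computes
 * result = min(src + dst, 255), which is exactly the ADD compositing
 * operator clamped to the representable range.
 */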

.macro pixman_composite_add_8_8_process_pixblock_tail
.endm

.macro pixman_composite_add_8_8_process_pixblock_tail_head
    fetch_src_pixblock
                                    PF add PF_X, PF_X, #32
                                    PF tst PF_CTL, #0xF
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
                                    PF addne PF_X, PF_X, #32
                                    PF subne PF_CTL, PF_CTL, #1
        vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF cmp PF_X, ORIG_W
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
                                    PF subge PF_X, PF_X, ORIG_W
                                    PF subges PF_CTL, PF_CTL, #0x10
    vqadd.u8    q14, q0, q2
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vqadd.u8    q15, q1, q3
.endm

generate_composite_function \
    pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_process_pixblock_tail, \
    pixman_composite_add_8_8_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
    fetch_src_pixblock
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
    vld1.32     {d4, d5, d6, d7}, [DST_R, :128]!
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
        vst1.32     {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF cmp PF_X, ORIG_W
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
                                    PF subge PF_X, PF_X, ORIG_W
                                    PF subges PF_CTL, PF_CTL, #0x10
    vqadd.u8    q14, q0, q2
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vqadd.u8    q15, q1, q3
.endm

generate_composite_function \
    pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_process_pixblock_tail, \
    pixman_composite_add_8888_8888_process_pixblock_tail_head

generate_composite_function_single_scanline \
    pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_process_pixblock_tail, \
    pixman_composite_add_8888_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
    vmvn.8      d24, d3  /* get inverted alpha */
    /* do alpha blending */
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d24, d5
    vmull.u8    q10, d24, d6
    vmull.u8    q11, d24, d7
.endm

.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
.endm
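/*
 * The VRSHR/VRADDHN pair above is the usual NEON idiom for dividing a 16-bit
 * product t = a * b (a, b in 0..255) by 255 with rounding:
 *
 *  vrshr.u16   qT, qP, #8   ->  t1 = (t + 128) >> 8
 *  vraddhn.u16 dR, qT, qP   ->  r  = (t + t1 + 128) >> 8
 *
 * i.e. r = (t + ((t + 128) >> 8) + 128) >> 8, which equals round(t / 255)
 * for all t in [0, 255 * 255]. For example, t = 255 * 255 = 65025 gives
 * t1 = 254 and r = (65025 + 254 + 128) >> 8 = 255.
 */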

.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
        vrshr.u16   q14, q8, #8
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
        vrshr.u16   q15, q9, #8
        vrshr.u16   q12, q10, #8
        vrshr.u16   q13, q11, #8
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
        vraddhn.u16 d28, q14, q8
        vraddhn.u16 d29, q15, q9
                                    PF cmp PF_X, ORIG_W
        vraddhn.u16 d30, q12, q10
        vraddhn.u16 d31, q13, q11
    fetch_src_pixblock
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vmvn.8      d22, d3
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q8, d22, d4
                                    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q9, d22, d5
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vmull.u8    q10, d22, d6
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vmull.u8    q11, d22, d7
.endm

generate_composite_function_single_scanline \
    pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
    pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
    pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_8888_8888_process_pixblock_head
    pixman_composite_out_reverse_8888_8888_process_pixblock_head
.endm

.macro pixman_composite_over_8888_8888_process_pixblock_tail
    pixman_composite_out_reverse_8888_8888_process_pixblock_tail
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm
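/*
 * In other words, OVER is just OUT_REVERSE plus a saturating add of the
 * (premultiplied) source: per channel,
 *
 *  dst = src + dst * (255 - src_alpha) / 255
 *
 * with VQADD clamping the sum to 255.
 */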

.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
        vrshr.u16   q14, q8, #8
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
        vrshr.u16   q15, q9, #8
        vrshr.u16   q12, q10, #8
        vrshr.u16   q13, q11, #8
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
        vraddhn.u16 d28, q14, q8
        vraddhn.u16 d29, q15, q9
                                    PF cmp PF_X, ORIG_W
        vraddhn.u16 d30, q12, q10
        vraddhn.u16 d31, q13, q11
        vqadd.u8    q14, q0, q14
        vqadd.u8    q15, q1, q15
    fetch_src_pixblock
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vmvn.8      d22, d3
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q8, d22, d4
                                    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q9, d22, d5
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vmull.u8    q10, d22, d6
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vmull.u8    q11, d22, d7
.endm

generate_composite_function \
    pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_process_pixblock_tail_head

generate_composite_function_single_scanline \
    pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_n_8888_process_pixblock_head
    /* deinterleaved source pixels in {d0, d1, d2, d3} */
    /* inverted alpha in {d24} */
    /* destination pixels in {d4, d5, d6, d7} */
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d24, d5
    vmull.u8    q10, d24, d6
    vmull.u8    q11, d24, d7
.endm

.macro pixman_composite_over_n_8888_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q2, q10, #8
    vrshr.u16   q3, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q2, q10
    vraddhn.u16 d31, q3, q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

.macro pixman_composite_over_n_8888_process_pixblock_tail_head
        vrshr.u16   q14, q8, #8
        vrshr.u16   q15, q9, #8
        vrshr.u16   q2, q10, #8
        vrshr.u16   q3, q11, #8
        vraddhn.u16 d28, q14, q8
        vraddhn.u16 d29, q15, q9
        vraddhn.u16 d30, q2, q10
        vraddhn.u16 d31, q3, q11
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
        vqadd.u8    q14, q0, q14
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0x0F
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
        vqadd.u8    q15, q1, q15
                                    PF cmp PF_X, ORIG_W
    vmull.u8    q8, d24, d4
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vmull.u8    q9, d24, d5
                                    PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q10, d24, d6
                                    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q11, d24, d7
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
.endm

.macro pixman_composite_over_n_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]
    vdup.8      d0, d3[0]
    vdup.8      d1, d3[1]
    vdup.8      d2, d3[2]
    vdup.8      d3, d3[3]
    vmvn.8      d24, d3  /* get inverted alpha */
.endm

generate_composite_function \
    pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8888_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_n_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
        vrshr.u16   q14, q8, #8
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
        vrshr.u16   q15, q9, #8
        vrshr.u16   q12, q10, #8
        vrshr.u16   q13, q11, #8
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
        vraddhn.u16 d28, q14, q8
        vraddhn.u16 d29, q15, q9
                                    PF cmp PF_X, ORIG_W
        vraddhn.u16 d30, q12, q10
        vraddhn.u16 d31, q13, q11
        vqadd.u8    q14, q0, q14
        vqadd.u8    q15, q1, q15
    vld4.8      {d0, d1, d2, d3}, [DST_R, :128]!
    vmvn.8      d22, d3
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q8, d22, d4
                                    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q9, d22, d5
    vmull.u8    q10, d22, d6
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vmull.u8    q11, d22, d7
.endm

.macro pixman_composite_over_reverse_n_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d7[0]}, [DUMMY]
    vdup.8      d4, d7[0]
    vdup.8      d5, d7[1]
    vdup.8      d6, d7[2]
    vdup.8      d7, d7[3]
.endm

generate_composite_function \
    pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_reverse_n_8888_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    0,  /* dst_r_basereg */ \
    4,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

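/*
 * The following head/tail macros implement (src IN mask) OVER dst with an
 * a8r8g8b8 source, an a8 mask and an r5g6b5 destination. Roughly, per
 * channel (a sketch of the intended math, with rounding divisions):
 *
 *  s'  = s * m / 255                    (IN: the first vmull/vrshr/vraddhn)
 *  dst = s' + dst * (255 - a') / 255    (OVER, where a' is the alpha after
 *                                        IN, applied once the destination
 *                                        has been converted from 0565 to
 *                                        planar 8-bit)
 *
 * The two stages are interleaved so that the destination format conversion
 * hides some of the multiply latencies.
 */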
.macro pixman_composite_over_8888_8_0565_process_pixblock_head
    vmull.u8    q0,  d24, d8    /* IN for SRC pixels (part1) */
    vmull.u8    q1,  d24, d9
    vmull.u8    q6,  d24, d10
    vmull.u8    q7,  d24, d11
        vshrn.u16   d6,  q2, #8 /* convert DST_R data to 32-bpp (part1) */
        vshrn.u16   d7,  q2, #3
        vsli.u16    q2,  q2, #5
    vrshr.u16   q8,  q0,  #8    /* IN for SRC pixels (part2) */
    vrshr.u16   q9,  q1,  #8
    vrshr.u16   q10, q6,  #8
    vrshr.u16   q11, q7,  #8
    vraddhn.u16 d0,  q0,  q8
    vraddhn.u16 d1,  q1,  q9
    vraddhn.u16 d2,  q6,  q10
    vraddhn.u16 d3,  q7,  q11
        vsri.u8     d6,  d6, #5 /* convert DST_R data to 32-bpp (part2) */
        vsri.u8     d7,  d7, #6
    vmvn.8      d3,  d3
        vshrn.u16   d30, q2, #2
    vmull.u8    q8,  d3, d6     /* now do alpha blending */
    vmull.u8    q9,  d3, d7
    vmull.u8    q10, d3, d30
.endm

.macro pixman_composite_over_8888_8_0565_process_pixblock_tail
    /* 3 cycle bubble (after vmull.u8) */
    vrshr.u16   q13, q8,  #8
    vrshr.u16   q11, q9,  #8
    vrshr.u16   q15, q10, #8
    vraddhn.u16 d16, q8,  q13
    vraddhn.u16 d27, q9,  q11
    vraddhn.u16 d26, q10, q15
    vqadd.u8    d16, d2,  d16
    /* 1 cycle bubble */
    vqadd.u8    q9,  q0,  q13
    vshll.u8    q14, d16, #8    /* convert to 16bpp */
    vshll.u8    q8,  d19, #8
    vshll.u8    q9,  d18, #8
    vsri.u16    q14, q8,  #5
    /* 1 cycle bubble */
    vsri.u16    q14, q9,  #11
.endm

.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
    vld1.16     {d4, d5}, [DST_R, :128]!
    vshrn.u16   d6,  q2,  #8
    fetch_mask_pixblock
    vshrn.u16   d7,  q2,  #3
    fetch_src_pixblock
    vmull.u8    q6,  d24, d10
        vrshr.u16   q13, q8,  #8
        vrshr.u16   q11, q9,  #8
        vrshr.u16   q15, q10, #8
        vraddhn.u16 d16, q8,  q13
        vraddhn.u16 d27, q9,  q11
        vraddhn.u16 d26, q10, q15
        vqadd.u8    d16, d2,  d16
    vmull.u8    q1,  d24, d9
        vqadd.u8    q9,  q0,  q13
        vshll.u8    q14, d16, #8
    vmull.u8    q0,  d24, d8
        vshll.u8    q8,  d19, #8
        vshll.u8    q9,  d18, #8
        vsri.u16    q14, q8,  #5
    vmull.u8    q7,  d24, d11
        vsri.u16    q14, q9,  #11

    cache_preload 8, 8

    vsli.u16    q2,  q2,  #5
    vrshr.u16   q8,  q0,  #8
    vrshr.u16   q9,  q1,  #8
    vrshr.u16   q10, q6,  #8
    vrshr.u16   q11, q7,  #8
    vraddhn.u16 d0,  q0,  q8
    vraddhn.u16 d1,  q1,  q9
    vraddhn.u16 d2,  q6,  q10
    vraddhn.u16 d3,  q7,  q11
    vsri.u8     d6,  d6,  #5
    vsri.u8     d7,  d7,  #6
    vmvn.8      d3,  d3
    vshrn.u16   d30, q2,  #2
    vst1.16     {d28, d29}, [DST_W, :128]!
    vmull.u8    q8,  d3,  d6
    vmull.u8    q9,  d3,  d7
    vmull.u8    q10, d3,  d30
.endm

generate_composite_function \
    pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_8888_8_0565_process_pixblock_head, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    8,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

/*
 * This function needs a special initialization of its solid source.
 * Solid source pixel data is fetched from the stack at ARGS_STACK_OFFSET
 * offset, split into color components and replicated in the d8-d11
 * registers. Additionally, this function needs all the NEON registers,
 * so it has to save the d8-d15 registers, which are callee saved according
 * to the ABI. These registers are restored in the 'cleanup' macro. All the
 * other NEON registers are caller saved, so they can be clobbered freely
 * without introducing any problems.
 */
.macro pixman_composite_over_n_8_0565_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    vdup.8      d8, d11[0]
    vdup.8      d9, d11[1]
    vdup.8      d10, d11[2]
    vdup.8      d11, d11[3]
.endm

.macro pixman_composite_over_n_8_0565_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8_0565_init, \
    pixman_composite_over_n_8_0565_cleanup, \
    pixman_composite_over_8888_8_0565_process_pixblock_head, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_8888_n_0565_init
    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
    vpush       {d8-d15}
    vld1.32     {d24[0]}, [DUMMY]
    vdup.8      d24, d24[3]
.endm

.macro pixman_composite_over_8888_n_0565_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_8888_n_0565_init, \
    pixman_composite_over_8888_n_0565_cleanup, \
    pixman_composite_over_8888_8_0565_process_pixblock_head, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    8,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_0565_0565_process_pixblock_head
.endm

.macro pixman_composite_src_0565_0565_process_pixblock_tail
.endm

.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
    fetch_src_pixblock
    cache_preload 16, 16
.endm

generate_composite_function \
    pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
    FLAG_DST_WRITEONLY, \
    16, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0565_0565_process_pixblock_head, \
    pixman_composite_src_0565_0565_process_pixblock_tail, \
    pixman_composite_src_0565_0565_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_n_8_process_pixblock_head
.endm

.macro pixman_composite_src_n_8_process_pixblock_tail
.endm

.macro pixman_composite_src_n_8_process_pixblock_tail_head
    vst1.8  {d0, d1, d2, d3}, [DST_W, :128]!
.endm

.macro pixman_composite_src_n_8_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    vsli.u64    d0, d0, #8
    vsli.u64    d0, d0, #16
    vsli.u64    d0, d0, #32
    vorr        d1, d0, d0
    vorr        q1, q0, q0
.endm
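/*
 * The VSLI (shift left and insert) sequence above replicates the least
 * significant byte of the loaded value across the whole 64-bit register:
 * each step computes d0 = (d0 << n) | (d0 & ((1 << n) - 1)), doubling the
 * number of valid copies. For a byte 0xAB the low bytes evolve as
 * AB -> AB AB -> AB AB AB AB -> AB AB AB AB AB AB AB AB. The two VORRs then
 * copy the result to d1 and q1, so a full 32-byte block can be stored.
 */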

.macro pixman_composite_src_n_8_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
    FLAG_DST_WRITEONLY, \
    32, /* number of pixels, processed in a single block */ \
    0,  /* prefetch distance */ \
    pixman_composite_src_n_8_init, \
    pixman_composite_src_n_8_cleanup, \
    pixman_composite_src_n_8_process_pixblock_head, \
    pixman_composite_src_n_8_process_pixblock_tail, \
    pixman_composite_src_n_8_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_n_0565_process_pixblock_head
.endm

.macro pixman_composite_src_n_0565_process_pixblock_tail
.endm

.macro pixman_composite_src_n_0565_process_pixblock_tail_head
    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
.endm

.macro pixman_composite_src_n_0565_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    vsli.u64    d0, d0, #16
    vsli.u64    d0, d0, #32
    vorr        d1, d0, d0
    vorr        q1, q0, q0
.endm

.macro pixman_composite_src_n_0565_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
    FLAG_DST_WRITEONLY, \
    16, /* number of pixels, processed in a single block */ \
    0,  /* prefetch distance */ \
    pixman_composite_src_n_0565_init, \
    pixman_composite_src_n_0565_cleanup, \
    pixman_composite_src_n_0565_process_pixblock_head, \
    pixman_composite_src_n_0565_process_pixblock_tail, \
    pixman_composite_src_n_0565_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_n_8888_process_pixblock_head
.endm

.macro pixman_composite_src_n_8888_process_pixblock_tail
.endm

.macro pixman_composite_src_n_8888_process_pixblock_tail_head
    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
.endm

.macro pixman_composite_src_n_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    vsli.u64    d0, d0, #32
    vorr        d1, d0, d0
    vorr        q1, q0, q0
.endm

.macro pixman_composite_src_n_8888_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    0, /* prefetch distance */ \
    pixman_composite_src_n_8888_init, \
    pixman_composite_src_n_8888_cleanup, \
    pixman_composite_src_n_8888_process_pixblock_head, \
    pixman_composite_src_n_8888_process_pixblock_tail, \
    pixman_composite_src_n_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_8888_8888_process_pixblock_head
.endm

.macro pixman_composite_src_8888_8888_process_pixblock_tail
.endm

.macro pixman_composite_src_8888_8888_process_pixblock_tail_head
    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
    fetch_src_pixblock
    cache_preload 8, 8
.endm

generate_composite_function \
    pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_8888_8888_process_pixblock_head, \
    pixman_composite_src_8888_8888_process_pixblock_tail, \
    pixman_composite_src_8888_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_x888_8888_process_pixblock_head
    vorr     q0, q0, q2
    vorr     q1, q1, q2
.endm

.macro pixman_composite_src_x888_8888_process_pixblock_tail
.endm

.macro pixman_composite_src_x888_8888_process_pixblock_tail_head
    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
    fetch_src_pixblock
    vorr     q0, q0, q2
    vorr     q1, q1, q2
    cache_preload 8, 8
.endm

.macro pixman_composite_src_x888_8888_init
    vmov.u8  q2, #0xFF
    vshl.u32 q2, q2, #24
.endm

generate_composite_function \
    pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    pixman_composite_src_x888_8888_init, \
    default_cleanup, \
    pixman_composite_src_x888_8888_process_pixblock_head, \
    pixman_composite_src_x888_8888_process_pixblock_tail, \
    pixman_composite_src_x888_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_n_8_8888_process_pixblock_head
    /* expecting solid source in {d0, d1, d2, d3} */
    /* mask is in d24 (d25, d26, d27 are unused) */

    /* in */
    vmull.u8    q8, d24, d0
    vmull.u8    q9, d24, d1
    vmull.u8    q10, d24, d2
    vmull.u8    q11, d24, d3
    vrsra.u16   q8, q8, #8
    vrsra.u16   q9, q9, #8
    vrsra.u16   q10, q10, #8
    vrsra.u16   q11, q11, #8
.endm
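/*
 * Here the division by 255 is split differently from the earlier
 * VRSHR/VRADDHN idiom: VRSRA accumulates t += (t + 128) >> 8 in place, and
 * the VRSHRN in the tail macro finishes with (t + 128) >> 8 while narrowing.
 * The combined result is again (t + ((t + 128) >> 8) + 128) >> 8,
 * i.e. round(t / 255).
 */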

.macro pixman_composite_src_n_8_8888_process_pixblock_tail
    vrshrn.u16  d28, q8, #8
    vrshrn.u16  d29, q9, #8
    vrshrn.u16  d30, q10, #8
    vrshrn.u16  d31, q11, #8
.endm

.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
    fetch_mask_pixblock
                                    PF add PF_X, PF_X, #8
        vrshrn.u16  d28, q8, #8
                                    PF tst PF_CTL, #0x0F
        vrshrn.u16  d29, q9, #8
                                    PF addne PF_X, PF_X, #8
        vrshrn.u16  d30, q10, #8
                                    PF subne PF_CTL, PF_CTL, #1
        vrshrn.u16  d31, q11, #8
                                    PF cmp PF_X, ORIG_W
    vmull.u8    q8, d24, d0
                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
    vmull.u8    q9, d24, d1
                                    PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q10, d24, d2
                                    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q11, d24, d3
                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vrsra.u16   q8, q8, #8
    vrsra.u16   q9, q9, #8
    vrsra.u16   q10, q10, #8
    vrsra.u16   q11, q11, #8
.endm

.macro pixman_composite_src_n_8_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]
    vdup.8      d0, d3[0]
    vdup.8      d1, d3[1]
    vdup.8      d2, d3[2]
    vdup.8      d3, d3[3]
.endm

.macro pixman_composite_src_n_8_8888_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_src_n_8_8888_init, \
    pixman_composite_src_n_8_8888_cleanup, \
    pixman_composite_src_n_8_8888_process_pixblock_head, \
    pixman_composite_src_n_8_8888_process_pixblock_tail, \
    pixman_composite_src_n_8_8888_process_pixblock_tail_head
1294
1295/******************************************************************************/
1296
1297.macro pixman_composite_src_n_8_8_process_pixblock_head
1298    vmull.u8    q0, d24, d16
1299    vmull.u8    q1, d25, d16
1300    vmull.u8    q2, d26, d16
1301    vmull.u8    q3, d27, d16
1302    vrsra.u16   q0, q0,  #8
1303    vrsra.u16   q1, q1,  #8
1304    vrsra.u16   q2, q2,  #8
1305    vrsra.u16   q3, q3,  #8
1306.endm
1307
1308.macro pixman_composite_src_n_8_8_process_pixblock_tail
1309    vrshrn.u16  d28, q0, #8
1310    vrshrn.u16  d29, q1, #8
1311    vrshrn.u16  d30, q2, #8
1312    vrshrn.u16  d31, q3, #8
1313.endm
1314
1315.macro pixman_composite_src_n_8_8_process_pixblock_tail_head
1316    fetch_mask_pixblock
1317                                    PF add PF_X, PF_X, #8
1318        vrshrn.u16  d28, q0, #8
1319                                    PF tst PF_CTL, #0x0F
1320        vrshrn.u16  d29, q1, #8
1321                                    PF addne PF_X, PF_X, #8
1322        vrshrn.u16  d30, q2, #8
1323                                    PF subne PF_CTL, PF_CTL, #1
1324        vrshrn.u16  d31, q3, #8
1325                                    PF cmp PF_X, ORIG_W
1326    vmull.u8    q0,  d24, d16
1327                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
1328    vmull.u8    q1,  d25, d16
1329                                    PF subge PF_X, PF_X, ORIG_W
1330    vmull.u8    q2,  d26, d16
1331                                    PF subges PF_CTL, PF_CTL, #0x10
1332    vmull.u8    q3,  d27, d16
1333                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
1334        vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
1335    vrsra.u16   q0, q0,  #8
1336    vrsra.u16   q1, q1,  #8
1337    vrsra.u16   q2, q2,  #8
1338    vrsra.u16   q3, q3,  #8
1339.endm
1340
1341.macro pixman_composite_src_n_8_8_init
1342    add         DUMMY, sp, #ARGS_STACK_OFFSET
1343    vld1.32     {d16[0]}, [DUMMY]
1344    vdup.8      d16, d16[3]
1345.endm
1346
1347.macro pixman_composite_src_n_8_8_cleanup
1348.endm
1349
1350generate_composite_function \
1351    pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \
1352    FLAG_DST_WRITEONLY, \
1353    32, /* number of pixels, processed in a single block */ \
1354    5, /* prefetch distance */ \
1355    pixman_composite_src_n_8_8_init, \
1356    pixman_composite_src_n_8_8_cleanup, \
1357    pixman_composite_src_n_8_8_process_pixblock_head, \
1358    pixman_composite_src_n_8_8_process_pixblock_tail, \
1359    pixman_composite_src_n_8_8_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_n_8_8888_process_pixblock_head
    /* expecting deinterleaved source data in {d8, d9, d10, d11} */
    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
    /* and destination data in {d4, d5, d6, d7} */
    /* mask is in d24 (d25, d26, d27 are unused) */

    /* in */
    vmull.u8    q6, d24, d8
    vmull.u8    q7, d24, d9
    vmull.u8    q8, d24, d10
    vmull.u8    q9, d24, d11
    vrshr.u16   q10, q6, #8
    vrshr.u16   q11, q7, #8
    vrshr.u16   q12, q8, #8
    vrshr.u16   q13, q9, #8
    vraddhn.u16 d0, q6, q10
    vraddhn.u16 d1, q7, q11
    vraddhn.u16 d2, q8, q12
    vraddhn.u16 d3, q9, q13
    vmvn.8      d25, d3  /* get inverted alpha */
    /* source:      d0 - blue, d1 - green, d2 - red, d3 - alpha */
    /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
    /* now do alpha blending */
    vmull.u8    q8, d25, d4
    vmull.u8    q9, d25, d5
    vmull.u8    q10, d25, d6
    vmull.u8    q11, d25, d7
.endm

.macro pixman_composite_over_n_8_8888_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q6, q10, #8
    vrshr.u16   q7, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q6, q10
    vraddhn.u16 d31, q7, q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm
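
/*
 * Taken together, the head and tail macros above compute the usual OVER
 * with an a8 mask; the vrshr.u16 + vraddhn.u16 pair performs the same
 * rounding divide-by-255 as the vrsra/vrshrn pair shown earlier.
 * Illustrative per-channel C sketch (mul_div_255() and sat_add_u8() are
 * our names for the rounding multiply and vqadd.u8):
 *
 *     s = mul_div_255(src_c, m);               // 'in': src IN mask
 *     a = mul_div_255(src_a, m);
 *     dst_c = sat_add_u8(s, mul_div_255(dst_c, 255 - a));
 */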

.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
        vrshr.u16   q14, q8, #8
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
        vrshr.u16   q15, q9, #8
    fetch_mask_pixblock
        vrshr.u16   q6, q10, #8
                                    PF add PF_X, PF_X, #8
        vrshr.u16   q7, q11, #8
                                    PF tst PF_CTL, #0x0F
        vraddhn.u16 d28, q14, q8
                                    PF addne PF_X, PF_X, #8
        vraddhn.u16 d29, q15, q9
                                    PF subne PF_CTL, PF_CTL, #1
        vraddhn.u16 d30, q6, q10
                                    PF cmp PF_X, ORIG_W
        vraddhn.u16 d31, q7, q11
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vmull.u8    q6, d24, d8
                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
    vmull.u8    q7, d24, d9
                                    PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q8, d24, d10
                                    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q9, d24, d11
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
        vqadd.u8    q14, q0, q14
                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
        vqadd.u8    q15, q1, q15
    vrshr.u16   q10, q6, #8
    vrshr.u16   q11, q7, #8
    vrshr.u16   q12, q8, #8
    vrshr.u16   q13, q9, #8
    vraddhn.u16 d0, q6, q10
    vraddhn.u16 d1, q7, q11
    vraddhn.u16 d2, q8, q12
    vraddhn.u16 d3, q9, q13
        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vmvn.8      d25, d3
    vmull.u8    q8, d25, d4
    vmull.u8    q9, d25, d5
    vmull.u8    q10, d25, d6
    vmull.u8    q11, d25, d7
.endm

.macro pixman_composite_over_n_8_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    vdup.8      d8, d11[0]
    vdup.8      d9, d11[1]
    vdup.8      d10, d11[2]
    vdup.8      d11, d11[3]
.endm

.macro pixman_composite_over_n_8_8888_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8_8888_init, \
    pixman_composite_over_n_8_8888_cleanup, \
    pixman_composite_over_n_8_8888_process_pixblock_head, \
    pixman_composite_over_n_8_8888_process_pixblock_tail, \
    pixman_composite_over_n_8_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_n_8_8_process_pixblock_head
    vmull.u8    q0,  d24, d8
    vmull.u8    q1,  d25, d8
    vmull.u8    q6,  d26, d8
    vmull.u8    q7,  d27, d8
    vrshr.u16   q10, q0,  #8
    vrshr.u16   q11, q1,  #8
    vrshr.u16   q12, q6,  #8
    vrshr.u16   q13, q7,  #8
    vraddhn.u16 d0,  q0,  q10
    vraddhn.u16 d1,  q1,  q11
    vraddhn.u16 d2,  q6,  q12
    vraddhn.u16 d3,  q7,  q13
    vmvn.8      q12, q0
    vmvn.8      q13, q1
    vmull.u8    q8,  d24, d4
    vmull.u8    q9,  d25, d5
    vmull.u8    q10, d26, d6
    vmull.u8    q11, d27, d7
.endm

.macro pixman_composite_over_n_8_8_process_pixblock_tail
    vrshr.u16   q14, q8,  #8
    vrshr.u16   q15, q9,  #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
    vqadd.u8    q14, q0,  q14
    vqadd.u8    q15, q1,  q15
.endm

/* TODO: expand macros and do better instruction scheduling */
.macro pixman_composite_over_n_8_8_process_pixblock_tail_head
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
    pixman_composite_over_n_8_8_process_pixblock_tail
    fetch_mask_pixblock
    cache_preload 32, 32
    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
    pixman_composite_over_n_8_8_process_pixblock_head
.endm

.macro pixman_composite_over_n_8_8_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vpush       {d8-d15}
    vld1.32     {d8[0]}, [DUMMY]
    vdup.8      d8, d8[3]
.endm

.macro pixman_composite_over_n_8_8_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8_8_init, \
    pixman_composite_over_n_8_8_cleanup, \
    pixman_composite_over_n_8_8_process_pixblock_head, \
    pixman_composite_over_n_8_8_process_pixblock_tail, \
    pixman_composite_over_n_8_8_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
    /*
     * 'combine_mask_ca' replacement
     *
     * input:  solid src (n) in {d8,  d9,  d10, d11}
     *         dest in          {d4,  d5,  d6,  d7 }
     *         mask in          {d24, d25, d26, d27}
     * output: updated src in   {d0,  d1,  d2,  d3 }
     *         updated mask in  {d24, d25, d26, d3 }
     */
    vmull.u8    q0,  d24, d8
    vmull.u8    q1,  d25, d9
    vmull.u8    q6,  d26, d10
    vmull.u8    q7,  d27, d11
    vmull.u8    q9,  d11, d25
    vmull.u8    q12, d11, d24
    vmull.u8    q13, d11, d26
    vrshr.u16   q8,  q0,  #8
    vrshr.u16   q10, q1,  #8
    vrshr.u16   q11, q6,  #8
    vraddhn.u16 d0,  q0,  q8
    vraddhn.u16 d1,  q1,  q10
    vraddhn.u16 d2,  q6,  q11
    vrshr.u16   q11, q12, #8
    vrshr.u16   q8,  q9,  #8
    vrshr.u16   q6,  q13, #8
    vrshr.u16   q10, q7,  #8
    vraddhn.u16 d24, q12, q11
    vraddhn.u16 d25, q9,  q8
    vraddhn.u16 d26, q13, q6
    vraddhn.u16 d3,  q7,  q10
    /*
     * 'combine_over_ca' replacement
     *
     * output: updated dest in {d28, d29, d30, d31}
     */
    vmvn.8      q12, q12
    vmvn.8      d26, d26
    vmull.u8    q8,  d24, d4
    vmull.u8    q9,  d25, d5
    vmvn.8      d27, d3
    vmull.u8    q10, d26, d6
    vmull.u8    q11, d27, d7
.endm

.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
    /* ... continue 'combine_over_ca' replacement */
    vrshr.u16   q14, q8,  #8
    vrshr.u16   q15, q9,  #8
    vrshr.u16   q6,  q10, #8
    vrshr.u16   q7,  q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q6,  q10
    vraddhn.u16 d31, q7,  q11
    vqadd.u8    q14, q0,  q14
    vqadd.u8    q15, q1,  q15
.endm
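
/*
 * Illustrative per-channel C sketch of the component-alpha OVER above
 * (mul_div_255() and sat_add_u8() are our names for the rounding multiply
 * and vqadd.u8):
 *
 *     s_c = mul_div_255(m_c, src_c);    // src IN mask, per channel
 *     m_c = mul_div_255(src_a, m_c);    // mask IN src alpha, per channel
 *     dst_c = sat_add_u8(s_c, mul_div_255(dst_c, 255 - m_c));
 */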

.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
        vrshr.u16   q14, q8, #8
        vrshr.u16   q15, q9, #8
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
        vrshr.u16   q6, q10, #8
        vrshr.u16   q7, q11, #8
        vraddhn.u16 d28, q14, q8
        vraddhn.u16 d29, q15, q9
        vraddhn.u16 d30, q6, q10
        vraddhn.u16 d31, q7, q11
    fetch_mask_pixblock
        vqadd.u8    q14, q0, q14
        vqadd.u8    q15, q1, q15
    cache_preload 8, 8
    pixman_composite_over_n_8888_8888_ca_process_pixblock_head
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
.endm

.macro pixman_composite_over_n_8888_8888_ca_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    vdup.8      d8, d11[0]
    vdup.8      d9, d11[1]
    vdup.8      d10, d11[2]
    vdup.8      d11, d11[3]
.endm

.macro pixman_composite_over_n_8888_8888_ca_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8888_8888_ca_init, \
    pixman_composite_over_n_8888_8888_ca_cleanup, \
    pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
    /*
     * 'combine_mask_ca' replacement
     *
     * input:  solid src (n) in {d8,  d9,  d10, d11}  [B, G, R, A]
     *         mask in          {d24, d25, d26}       [B, G, R]
     * output: updated src in   {d0,  d1,  d2 }       [B, G, R]
     *         updated mask in  {d24, d25, d26}       [B, G, R]
     */
    vmull.u8    q0,  d24, d8
    vmull.u8    q1,  d25, d9
    vmull.u8    q6,  d26, d10
    vmull.u8    q9,  d11, d25
    vmull.u8    q12, d11, d24
    vmull.u8    q13, d11, d26
    vrshr.u16   q8,  q0,  #8
    vrshr.u16   q10, q1,  #8
    vrshr.u16   q11, q6,  #8
    vraddhn.u16 d0,  q0,  q8
    vraddhn.u16 d1,  q1,  q10
    vraddhn.u16 d2,  q6,  q11
    vrshr.u16   q11, q12, #8
    vrshr.u16   q8,  q9,  #8
    vrshr.u16   q6,  q13, #8
    vraddhn.u16 d24, q12, q11
    vraddhn.u16 d25, q9,  q8
    /*
     * convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
     * and put data into d16 - blue, d17 - green, d18 - red
     */
       vshrn.u16   d17, q2,  #3
       vshrn.u16   d18, q2,  #8
    vraddhn.u16 d26, q13, q6
       vsli.u16    q2,  q2,  #5
       vsri.u8     d18, d18, #5
       vsri.u8     d17, d17, #6
    /*
     * 'combine_over_ca' replacement
     *
     * output: updated dest in d16 - blue, d17 - green, d18 - red
     */
    vmvn.8      q12, q12
       vshrn.u16   d16, q2,  #2
    vmvn.8      d26, d26
    vmull.u8    q6,  d16, d24
    vmull.u8    q7,  d17, d25
    vmull.u8    q11, d18, d26
.endm

.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
    /* ... continue 'combine_over_ca' replacement */
    vrshr.u16   q10, q6,  #8
    vrshr.u16   q14, q7,  #8
    vrshr.u16   q15, q11, #8
    vraddhn.u16 d16, q10, q6
    vraddhn.u16 d17, q14, q7
    vraddhn.u16 d18, q15, q11
    vqadd.u8    q8,  q0,  q8
    vqadd.u8    d18, d2,  d18
    /*
     * convert the results in d16, d17, d18 to r5g6b5 and store
     * them into {d28, d29}
     */
    vshll.u8    q14, d18, #8
    vshll.u8    q10, d17, #8
    vshll.u8    q15, d16, #8
    vsri.u16    q14, q10, #5
    vsri.u16    q14, q15, #11
.endm
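
/*
 * The r5g6b5 <-> 8-bit conversions above use vshrn/vsli/vsri so that the
 * expanded channels get their high bits replicated into the low bits, and
 * packing keeps only the channel high bits.  Illustrative C equivalent for
 * one r5g6b5 pixel p (not part of pixman):
 *
 *     // unpack (matches the vshrn/vsli/vsri sequence)
 *     r8 = ((p >> 11) << 3) | ((p >> 11) >> 2);
 *     g8 = (((p >> 5) & 0x3f) << 2) | (((p >> 5) & 0x3f) >> 4);
 *     b8 = ((p & 0x1f) << 3) | ((p & 0x1f) >> 2);
 *     // pack (matches the vshll/vsri sequence)
 *     p  = ((r8 & 0xf8) << 8) | ((g8 & 0xfc) << 3) | (b8 >> 3);
 */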

.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
    fetch_mask_pixblock
        vrshr.u16   q10, q6, #8
        vrshr.u16   q14, q7, #8
    vld1.16     {d4, d5}, [DST_R, :128]!
        vrshr.u16   q15, q11, #8
        vraddhn.u16 d16, q10, q6
        vraddhn.u16 d17, q14, q7
        vraddhn.u16 d22, q15, q11
            /* process_pixblock_head */
            /*
             * 'combine_mask_ca' replacement
             *
             * input:  solid src (n) in {d8,  d9,  d10, d11}  [B, G, R, A]
             *         mask in          {d24, d25, d26}       [B, G, R]
             * output: updated src in   {d0,  d1,  d2 }       [B, G, R]
             *         updated mask in  {d24, d25, d26}       [B, G, R]
             */
            vmull.u8    q6,  d26, d10
        vqadd.u8    q8,  q0, q8
            vmull.u8    q0,  d24, d8
        vqadd.u8    d22, d2, d22
            vmull.u8    q1,  d25, d9
        /*
         * convert the result in d16, d17, d22 to r5g6b5 and store
         * it into {d28, d29}
         */
        vshll.u8    q14, d22, #8
        vshll.u8    q10, d17, #8
        vshll.u8    q15, d16, #8
            vmull.u8    q9,  d11, d25
        vsri.u16    q14, q10, #5
            vmull.u8    q12, d11, d24
            vmull.u8    q13, d11, d26
        vsri.u16    q14, q15, #11
    cache_preload 8, 8
            vrshr.u16   q8,  q0,  #8
            vrshr.u16   q10, q1,  #8
            vrshr.u16   q11, q6,  #8
            vraddhn.u16 d0,  q0,  q8
            vraddhn.u16 d1,  q1,  q10
            vraddhn.u16 d2,  q6,  q11
            vrshr.u16   q11, q12, #8
            vrshr.u16   q8,  q9,  #8
            vrshr.u16   q6,  q13, #8
            vraddhn.u16 d24, q12, q11
            vraddhn.u16 d25, q9,  q8
                /*
                 * convert 8 r5g6b5 pixel data from {d4, d5} to planar
                 * 8-bit format and put data into d16 - blue, d17 - green,
                 * d18 - red
                 */
                vshrn.u16   d17, q2,  #3
                vshrn.u16   d18, q2,  #8
            vraddhn.u16 d26, q13, q6
                vsli.u16    q2,  q2,  #5
                vsri.u8     d17, d17, #6
                vsri.u8     d18, d18, #5
            /*
             * 'combine_over_ca' replacement
             *
             * output: updated dest in d16 - blue, d17 - green, d18 - red
             */
            vmvn.8      q12, q12
                vshrn.u16   d16, q2,  #2
            vmvn.8      d26, d26
            vmull.u8    q7,  d17, d25
            vmull.u8    q6,  d16, d24
            vmull.u8    q11, d18, d26
    vst1.16     {d28, d29}, [DST_W, :128]!
.endm

.macro pixman_composite_over_n_8888_0565_ca_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    vdup.8      d8, d11[0]
    vdup.8      d9, d11[1]
    vdup.8      d10, d11[2]
    vdup.8      d11, d11[3]
.endm

.macro pixman_composite_over_n_8888_0565_ca_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8888_0565_ca_init, \
    pixman_composite_over_n_8888_0565_ca_cleanup, \
    pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_in_n_8_process_pixblock_head
    /* expecting source data in {d0, d1, d2, d3} */
    /* and destination data in {d4, d5, d6, d7} */
    vmull.u8    q8,  d4,  d3
    vmull.u8    q9,  d5,  d3
    vmull.u8    q10, d6,  d3
    vmull.u8    q11, d7,  d3
.endm

.macro pixman_composite_in_n_8_process_pixblock_tail
    vrshr.u16   q14, q8,  #8
    vrshr.u16   q15, q9,  #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d28, q8,  q14
    vraddhn.u16 d29, q9,  q15
    vraddhn.u16 d30, q10, q12
    vraddhn.u16 d31, q11, q13
.endm

.macro pixman_composite_in_n_8_process_pixblock_tail_head
    pixman_composite_in_n_8_process_pixblock_tail
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
    cache_preload 32, 32
    pixman_composite_in_n_8_process_pixblock_head
    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
.endm
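
/*
 * Illustrative per-byte C equivalent of the in_n_8 fast path
 * (mul_div_255() is our name for the rounding multiply; d3 holds the
 * solid source alpha broadcast by the init macro below):
 *
 *     dst = mul_div_255(dst, src_a);   // dst IN solid source alpha
 */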

.macro pixman_composite_in_n_8_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]
    vdup.8      d3, d3[3]
.endm

.macro pixman_composite_in_n_8_cleanup
.endm

generate_composite_function \
    pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_in_n_8_init, \
    pixman_composite_in_n_8_cleanup, \
    pixman_composite_in_n_8_process_pixblock_head, \
    pixman_composite_in_n_8_process_pixblock_tail, \
    pixman_composite_in_n_8_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_add_n_8_8_process_pixblock_head
    /* expecting source data in {d8, d9, d10, d11} */
    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
    /* and destination data in {d4, d5, d6, d7} */
    /* mask is in d24, d25, d26, d27 */
    vmull.u8    q0, d24, d11
    vmull.u8    q1, d25, d11
    vmull.u8    q6, d26, d11
    vmull.u8    q7, d27, d11
    vrshr.u16   q10, q0, #8
    vrshr.u16   q11, q1, #8
    vrshr.u16   q12, q6, #8
    vrshr.u16   q13, q7, #8
    vraddhn.u16 d0, q0, q10
    vraddhn.u16 d1, q1, q11
    vraddhn.u16 d2, q6, q12
    vraddhn.u16 d3, q7, q13
    vqadd.u8    q14, q0, q2
    vqadd.u8    q15, q1, q3
.endm

.macro pixman_composite_add_n_8_8_process_pixblock_tail
.endm

/* TODO: expand macros and do better instruction scheduling */
.macro pixman_composite_add_n_8_8_process_pixblock_tail_head
    pixman_composite_add_n_8_8_process_pixblock_tail
    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
    fetch_mask_pixblock
    cache_preload 32, 32
    pixman_composite_add_n_8_8_process_pixblock_head
.endm

.macro pixman_composite_add_n_8_8_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    vdup.8      d11, d11[3]
.endm

.macro pixman_composite_add_n_8_8_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_add_n_8_8_init, \
    pixman_composite_add_n_8_8_cleanup, \
    pixman_composite_add_n_8_8_process_pixblock_head, \
    pixman_composite_add_n_8_8_process_pixblock_tail, \
    pixman_composite_add_n_8_8_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_add_8_8_8_process_pixblock_head
    /* expecting source data in {d0, d1, d2, d3} */
    /* destination data in {d4, d5, d6, d7} */
    /* mask in {d24, d25, d26, d27} */
    vmull.u8    q8, d24, d0
    vmull.u8    q9, d25, d1
    vmull.u8    q10, d26, d2
    vmull.u8    q11, d27, d3
    vrshr.u16   q0, q8, #8
    vrshr.u16   q1, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d0, q0, q8
    vraddhn.u16 d1, q1, q9
    vraddhn.u16 d2, q12, q10
    vraddhn.u16 d3, q13, q11
    vqadd.u8    q14, q0, q2
    vqadd.u8    q15, q1, q3
.endm

.macro pixman_composite_add_8_8_8_process_pixblock_tail
.endm

/* TODO: expand macros and do better instruction scheduling */
.macro pixman_composite_add_8_8_8_process_pixblock_tail_head
    pixman_composite_add_8_8_8_process_pixblock_tail
    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
    fetch_mask_pixblock
    fetch_src_pixblock
    cache_preload 32, 32
    pixman_composite_add_8_8_8_process_pixblock_head
.endm

.macro pixman_composite_add_8_8_8_init
.endm

.macro pixman_composite_add_8_8_8_cleanup
.endm

generate_composite_function \
    pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_add_8_8_8_init, \
    pixman_composite_add_8_8_8_cleanup, \
    pixman_composite_add_8_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_8_process_pixblock_tail, \
    pixman_composite_add_8_8_8_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_add_8888_8888_8888_process_pixblock_head
    /* expecting source data in {d0, d1, d2, d3} */
    /* destination data in {d4, d5, d6, d7} */
    /* mask in {d24, d25, d26, d27} */
    vmull.u8    q8,  d27, d0
    vmull.u8    q9,  d27, d1
    vmull.u8    q10, d27, d2
    vmull.u8    q11, d27, d3
    /* 1 cycle bubble */
    vrsra.u16   q8,  q8,  #8
    vrsra.u16   q9,  q9,  #8
    vrsra.u16   q10, q10, #8
    vrsra.u16   q11, q11, #8
.endm

.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
    /* 2 cycle bubble */
    vrshrn.u16  d28, q8,  #8
    vrshrn.u16  d29, q9,  #8
    vrshrn.u16  d30, q10, #8
    vrshrn.u16  d31, q11, #8
    vqadd.u8    q14, q2,  q14
    /* 1 cycle bubble */
    vqadd.u8    q15, q3,  q15
.endm
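
/*
 * Illustrative per-channel C sketch of the masked ADD above (helper names
 * are ours): the source is scaled by the mask alpha in d27 and then
 * saturatingly added to the destination:
 *
 *     dst_c = sat_add_u8(dst_c, mul_div_255(src_c, m_a));
 */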

.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
    fetch_src_pixblock
        vrshrn.u16  d28, q8,  #8
    fetch_mask_pixblock
        vrshrn.u16  d29, q9,  #8
    vmull.u8    q8,  d27, d0
        vrshrn.u16  d30, q10, #8
    vmull.u8    q9,  d27, d1
        vrshrn.u16  d31, q11, #8
    vmull.u8    q10, d27, d2
        vqadd.u8    q14, q2,  q14
    vmull.u8    q11, d27, d3
        vqadd.u8    q15, q3,  q15
    vrsra.u16   q8,  q8,  #8
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vrsra.u16   q9,  q9,  #8
        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vrsra.u16   q10, q10, #8

    cache_preload 8, 8

    vrsra.u16   q11, q11, #8
.endm

generate_composite_function \
    pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head

generate_composite_function_single_scanline \
    pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head

/******************************************************************************/

generate_composite_function \
    pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    27  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_add_n_8_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]
    vdup.8      d0, d3[0]
    vdup.8      d1, d3[1]
    vdup.8      d2, d3[2]
    vdup.8      d3, d3[3]
.endm

.macro pixman_composite_add_n_8_8888_cleanup
.endm

generate_composite_function \
    pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_add_n_8_8888_init, \
    pixman_composite_add_n_8_8888_cleanup, \
    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    27  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_add_8888_n_8888_init
    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
    vld1.32     {d27[0]}, [DUMMY]
    vdup.8      d27, d27[3]
.endm

.macro pixman_composite_add_8888_n_8888_cleanup
.endm

generate_composite_function \
    pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_add_8888_n_8888_init, \
    pixman_composite_add_8888_n_8888_cleanup, \
    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    27  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
    /* expecting source data in {d0, d1, d2, d3} */
    /* destination data in {d4, d5, d6, d7} */
    /* solid mask is in d15 */

    /* 'in' */
    vmull.u8    q8, d15, d3
    vmull.u8    q6, d15, d2
    vmull.u8    q5, d15, d1
    vmull.u8    q4, d15, d0
    vrshr.u16   q13, q8, #8
    vrshr.u16   q12, q6, #8
    vrshr.u16   q11, q5, #8
    vrshr.u16   q10, q4, #8
    vraddhn.u16 d3, q8, q13
    vraddhn.u16 d2, q6, q12
    vraddhn.u16 d1, q5, q11
    vraddhn.u16 d0, q4, q10
    vmvn.8      d24, d3  /* get inverted alpha */
    /* now do alpha blending */
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d24, d5
    vmull.u8    q10, d24, d6
    vmull.u8    q11, d24, d7
.endm

.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
.endm
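
/*
 * Illustrative per-channel C sketch of OUT_REVERSE with a mask (helper
 * names are ours; m_a is the mask alpha held in d15):
 *
 *     uint8_t a = mul_div_255(src_a, m_a);     // src alpha IN mask
 *     dst_c = mul_div_255(dst_c, 255 - a);     // dst OUT src
 */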

/* TODO: expand macros and do better instruction scheduling */
.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
    fetch_src_pixblock
    cache_preload 8, 8
    fetch_mask_pixblock
    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
.endm

generate_composite_function_single_scanline \
    pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
    pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    12  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_over_8888_n_8888_process_pixblock_head
    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
.endm

.macro pixman_composite_over_8888_n_8888_process_pixblock_tail
    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

/* TODO: expand macros and do better instruction scheduling */
.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
    pixman_composite_over_8888_n_8888_process_pixblock_tail
    fetch_src_pixblock
    cache_preload 8, 8
    pixman_composite_over_8888_n_8888_process_pixblock_head
    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
.endm

.macro pixman_composite_over_8888_n_8888_init
    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
    vpush       {d8-d15}
    vld1.32     {d15[0]}, [DUMMY]
    vdup.8      d15, d15[3]
.endm

.macro pixman_composite_over_8888_n_8888_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_8888_n_8888_init, \
    pixman_composite_over_8888_n_8888_cleanup, \
    pixman_composite_over_8888_n_8888_process_pixblock_head, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail_head

/******************************************************************************/

/* TODO: expand macros and do better instruction scheduling */
.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
    pixman_composite_over_8888_n_8888_process_pixblock_tail
    fetch_src_pixblock
    cache_preload 8, 8
    fetch_mask_pixblock
    pixman_composite_over_8888_n_8888_process_pixblock_head
    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
.endm

generate_composite_function \
    pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_8888_n_8888_process_pixblock_head, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    12  /* mask_basereg  */

generate_composite_function_single_scanline \
    pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_8888_n_8888_process_pixblock_head, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    12  /* mask_basereg  */

/******************************************************************************/

/* TODO: expand macros and do better instruction scheduling */
.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
    pixman_composite_over_8888_n_8888_process_pixblock_tail
    fetch_src_pixblock
    cache_preload 8, 8
    fetch_mask_pixblock
    pixman_composite_over_8888_n_8888_process_pixblock_head
    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
.endm

generate_composite_function \
    pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_8888_n_8888_process_pixblock_head, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    15  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_0888_0888_process_pixblock_head
.endm

.macro pixman_composite_src_0888_0888_process_pixblock_tail
.endm

.macro pixman_composite_src_0888_0888_process_pixblock_tail_head
    vst3.8 {d0, d1, d2}, [DST_W]!
    fetch_src_pixblock
    cache_preload 8, 8
.endm

generate_composite_function \
    pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0888_0888_process_pixblock_head, \
    pixman_composite_src_0888_0888_process_pixblock_tail, \
    pixman_composite_src_0888_0888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_0888_8888_rev_process_pixblock_head
    vswp   d0, d2
.endm

.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
.endm

.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
    vst4.8 {d0, d1, d2, d3}, [DST_W]!
    fetch_src_pixblock
    vswp   d0, d2
    cache_preload 8, 8
.endm

.macro pixman_composite_src_0888_8888_rev_init
    veor   d3, d3, d3
.endm

generate_composite_function \
    pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    pixman_composite_src_0888_8888_rev_init, \
    default_cleanup, \
    pixman_composite_src_0888_8888_rev_process_pixblock_head, \
    pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
    pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_0888_0565_rev_process_pixblock_head
    vshll.u8    q8, d1, #8
    vshll.u8    q9, d2, #8
.endm

.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
    vshll.u8    q14, d0, #8
    vsri.u16    q14, q8, #5
    vsri.u16    q14, q9, #11
.endm

.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
        vshll.u8    q14, d0, #8
    fetch_src_pixblock
        vsri.u16    q14, q8, #5
        vsri.u16    q14, q9, #11
    vshll.u8    q8, d1, #8
        vst1.16 {d28, d29}, [DST_W, :128]!
    vshll.u8    q9, d2, #8
.endm

generate_composite_function \
    pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0888_0565_rev_process_pixblock_head, \
    pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
    pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_pixbuf_8888_process_pixblock_head
    vmull.u8    q8, d3, d0
    vmull.u8    q9, d3, d1
    vmull.u8    q10, d3, d2
.endm

.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
    vrshr.u16   q11, q8, #8
    vswp        d3, d31
    vrshr.u16   q12, q9, #8
    vrshr.u16   q13, q10, #8
    vraddhn.u16 d30, q11, q8
    vraddhn.u16 d29, q12, q9
    vraddhn.u16 d28, q13, q10
.endm
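
/*
 * The pixbuf head/tail above premultiply the three color channels by the
 * source alpha while vswp moves the unmodified alpha into the output lane;
 * the pixbuf and rpixbuf variants differ only in which d-register receives
 * each result, giving the red/blue swap.  Illustrative per-pixel C sketch
 * (helper name is ours):
 *
 *     dst_r = mul_div_255(src_r, src_a);
 *     dst_g = mul_div_255(src_g, src_a);
 *     dst_b = mul_div_255(src_b, src_a);
 *     dst_a = src_a;
 */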
2420
2421.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
2422        vrshr.u16   q11, q8, #8
2423        vswp        d3, d31
2424        vrshr.u16   q12, q9, #8
2425        vrshr.u16   q13, q10, #8
2426    fetch_src_pixblock
2427        vraddhn.u16 d30, q11, q8
2428                                    PF add PF_X, PF_X, #8
2429                                    PF tst PF_CTL, #0xF
2430                                    PF addne PF_X, PF_X, #8
2431                                    PF subne PF_CTL, PF_CTL, #1
2432        vraddhn.u16 d29, q12, q9
2433        vraddhn.u16 d28, q13, q10
2434    vmull.u8    q8, d3, d0
2435    vmull.u8    q9, d3, d1
2436    vmull.u8    q10, d3, d2
2437        vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
2438                                    PF cmp PF_X, ORIG_W
2439                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
2440                                    PF subge PF_X, PF_X, ORIG_W
2441                                    PF subges PF_CTL, PF_CTL, #0x10
2442                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
2443.endm
2444
2445generate_composite_function \
2446    pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
2447    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2448    8, /* number of pixels, processed in a single block */ \
2449    10, /* prefetch distance */ \
2450    default_init, \
2451    default_cleanup, \
2452    pixman_composite_src_pixbuf_8888_process_pixblock_head, \
2453    pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
2454    pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
2455    28, /* dst_w_basereg */ \
2456    0, /* dst_r_basereg */ \
2457    0, /* src_basereg   */ \
2458    0  /* mask_basereg  */
2459
2460/******************************************************************************/
2461
2462.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
2463    vmull.u8    q8, d3, d0
2464    vmull.u8    q9, d3, d1
2465    vmull.u8    q10, d3, d2
2466.endm
2467
2468.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
2469    vrshr.u16   q11, q8, #8
2470    vswp        d3, d31
2471    vrshr.u16   q12, q9, #8
2472    vrshr.u16   q13, q10, #8
2473    vraddhn.u16 d28, q11, q8
2474    vraddhn.u16 d29, q12, q9
2475    vraddhn.u16 d30, q13, q10
2476.endm
2477
2478.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
2479        vrshr.u16   q11, q8, #8
2480        vswp        d3, d31
2481        vrshr.u16   q12, q9, #8
2482        vrshr.u16   q13, q10, #8
2483    fetch_src_pixblock
2484        vraddhn.u16 d28, q11, q8
2485                                    PF add PF_X, PF_X, #8
2486                                    PF tst PF_CTL, #0xF
2487                                    PF addne PF_X, PF_X, #8
2488                                    PF subne PF_CTL, PF_CTL, #1
2489        vraddhn.u16 d29, q12, q9
2490        vraddhn.u16 d30, q13, q10
2491    vmull.u8    q8, d3, d0
2492    vmull.u8    q9, d3, d1
2493    vmull.u8    q10, d3, d2
2494        vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
2495                                    PF cmp PF_X, ORIG_W
2496                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
2497                                    PF subge PF_X, PF_X, ORIG_W
2498                                    PF subges PF_CTL, PF_CTL, #0x10
2499                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
2500.endm
2501
2502generate_composite_function \
2503    pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
2504    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2505    8, /* number of pixels, processed in a single block */ \
2506    10, /* prefetch distance */ \
2507    default_init, \
2508    default_cleanup, \
2509    pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
2510    pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
2511    pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
2512    28, /* dst_w_basereg */ \
2513    0, /* dst_r_basereg */ \
2514    0, /* src_basereg   */ \
2515    0  /* mask_basereg  */
2516
2517/******************************************************************************/
2518
2519.macro pixman_composite_over_0565_8_0565_process_pixblock_head
2520    /* mask is in d15 */
2521    convert_0565_to_x888 q4, d2, d1, d0
2522    convert_0565_to_x888 q5, d6, d5, d4
2523    /* source pixel data is in      {d0, d1, d2, XX} */
2524    /* destination pixel data is in {d4, d5, d6, XX} */
2525    vmvn.8      d7,  d15
2526    vmull.u8    q6,  d15, d2
2527    vmull.u8    q5,  d15, d1
2528    vmull.u8    q4,  d15, d0
2529    vmull.u8    q8,  d7,  d4
2530    vmull.u8    q9,  d7,  d5
2531    vmull.u8    q13, d7,  d6
2532    vrshr.u16   q12, q6,  #8
2533    vrshr.u16   q11, q5,  #8
2534    vrshr.u16   q10, q4,  #8
2535    vraddhn.u16 d2,  q6,  q12
2536    vraddhn.u16 d1,  q5,  q11
2537    vraddhn.u16 d0,  q4,  q10
2538.endm
2539
2540.macro pixman_composite_over_0565_8_0565_process_pixblock_tail
2541    vrshr.u16   q14, q8,  #8
2542    vrshr.u16   q15, q9,  #8
2543    vrshr.u16   q12, q13, #8
2544    vraddhn.u16 d28, q14, q8
2545    vraddhn.u16 d29, q15, q9
2546    vraddhn.u16 d30, q12, q13
2547    vqadd.u8    q0,  q0,  q14
2548    vqadd.u8    q1,  q1,  q15
2549    /* 32bpp result is in {d0, d1, d2, XX} */
2550    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
2551.endm
2552
2553/* TODO: expand macros and do better instructions scheduling */
2554.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
2555    fetch_mask_pixblock
2556    pixman_composite_over_0565_8_0565_process_pixblock_tail
2557    fetch_src_pixblock
2558    vld1.16    {d10, d11}, [DST_R, :128]!
2559    cache_preload 8, 8
2560    pixman_composite_over_0565_8_0565_process_pixblock_head
2561    vst1.16    {d28, d29}, [DST_W, :128]!
2562.endm
2563
2564generate_composite_function \
2565    pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
2566    FLAG_DST_READWRITE, \
2567    8, /* number of pixels, processed in a single block */ \
2568    5, /* prefetch distance */ \
2569    default_init_need_all_regs, \
2570    default_cleanup_need_all_regs, \
2571    pixman_composite_over_0565_8_0565_process_pixblock_head, \
2572    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
2573    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
2574    28, /* dst_w_basereg */ \
2575    10,  /* dst_r_basereg */ \
2576    8,  /* src_basereg   */ \
2577    15  /* mask_basereg  */
2578
2579/******************************************************************************/
2580
2581.macro pixman_composite_over_0565_n_0565_init
2582    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
2583    vpush       {d8-d15}
2584    vld1.32     {d15[0]}, [DUMMY]
2585    vdup.8      d15, d15[3]
2586.endm
2587
2588.macro pixman_composite_over_0565_n_0565_cleanup
2589    vpop        {d8-d15}
2590.endm
2591
2592generate_composite_function \
2593    pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
2594    FLAG_DST_READWRITE, \
2595    8, /* number of pixels, processed in a single block */ \
2596    5, /* prefetch distance */ \
2597    pixman_composite_over_0565_n_0565_init, \
2598    pixman_composite_over_0565_n_0565_cleanup, \
2599    pixman_composite_over_0565_8_0565_process_pixblock_head, \
2600    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
2601    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
2602    28, /* dst_w_basereg */ \
2603    10, /* dst_r_basereg */ \
2604    8,  /* src_basereg   */ \
2605    15  /* mask_basereg  */
2606
2607/******************************************************************************/
2608
2609.macro pixman_composite_add_0565_8_0565_process_pixblock_head
2610    /* mask is in d15 */
2611    convert_0565_to_x888 q4, d2, d1, d0
2612    convert_0565_to_x888 q5, d6, d5, d4
2613    /* source pixel data is in      {d0, d1, d2, XX} */
2614    /* destination pixel data is in {d4, d5, d6, XX} */
2615    vmull.u8    q6,  d15, d2
2616    vmull.u8    q5,  d15, d1
2617    vmull.u8    q4,  d15, d0
2618    vrshr.u16   q12, q6,  #8
2619    vrshr.u16   q11, q5,  #8
2620    vrshr.u16   q10, q4,  #8
2621    vraddhn.u16 d2,  q6,  q12
2622    vraddhn.u16 d1,  q5,  q11
2623    vraddhn.u16 d0,  q4,  q10
2624.endm
2625
2626.macro pixman_composite_add_0565_8_0565_process_pixblock_tail
2627    vqadd.u8    q0,  q0,  q2
2628    vqadd.u8    q1,  q1,  q3
2629    /* 32bpp result is in {d0, d1, d2, XX} */
2630    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
2631.endm
2632
2633/* TODO: expand macros and do better instructions scheduling */
2634.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
2635    fetch_mask_pixblock
2636    pixman_composite_add_0565_8_0565_process_pixblock_tail
2637    fetch_src_pixblock
2638    vld1.16    {d10, d11}, [DST_R, :128]!
2639    cache_preload 8, 8
2640    pixman_composite_add_0565_8_0565_process_pixblock_head
2641    vst1.16    {d28, d29}, [DST_W, :128]!
2642.endm
2643
2644generate_composite_function \
2645    pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
2646    FLAG_DST_READWRITE, \
2647    8, /* number of pixels, processed in a single block */ \
2648    5, /* prefetch distance */ \
2649    default_init_need_all_regs, \
2650    default_cleanup_need_all_regs, \
2651    pixman_composite_add_0565_8_0565_process_pixblock_head, \
2652    pixman_composite_add_0565_8_0565_process_pixblock_tail, \
2653    pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
2654    28, /* dst_w_basereg */ \
2655    10, /* dst_r_basereg */ \
2656    8,  /* src_basereg   */ \
2657    15  /* mask_basereg  */
2658
2659/******************************************************************************/
2660
2661.macro pixman_composite_out_reverse_8_0565_process_pixblock_head
2662    /* mask is in d15 */
2663    convert_0565_to_x888 q5, d6, d5, d4
2664    /* destination pixel data is in {d4, d5, d6, xx} */
2665    vmvn.8      d24, d15 /* get inverted alpha */
2666    /* now do alpha blending */
2667    vmull.u8    q8, d24, d4
2668    vmull.u8    q9, d24, d5
2669    vmull.u8    q10, d24, d6
2670.endm
2671
2672.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
2673    vrshr.u16   q14, q8, #8
2674    vrshr.u16   q15, q9, #8
2675    vrshr.u16   q12, q10, #8
2676    vraddhn.u16 d0, q14, q8
2677    vraddhn.u16 d1, q15, q9
2678    vraddhn.u16 d2, q12, q10
2679    /* 32bpp result is in {d0, d1, d2, XX} */
2680    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
2681.endm
2682
2683/* TODO: expand macros and do better instructions scheduling */
2684.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
2685    fetch_src_pixblock
2686    pixman_composite_out_reverse_8_0565_process_pixblock_tail
2687    vld1.16    {d10, d11}, [DST_R, :128]!
2688    cache_preload 8, 8
2689    pixman_composite_out_reverse_8_0565_process_pixblock_head
2690    vst1.16    {d28, d29}, [DST_W, :128]!
2691.endm
2692
2693generate_composite_function \
2694    pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
2695    FLAG_DST_READWRITE, \
2696    8, /* number of pixels, processed in a single block */ \
2697    5, /* prefetch distance */ \
2698    default_init_need_all_regs, \
2699    default_cleanup_need_all_regs, \
2700    pixman_composite_out_reverse_8_0565_process_pixblock_head, \
2701    pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
2702    pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
2703    28, /* dst_w_basereg */ \
2704    10, /* dst_r_basereg */ \
2705    15, /* src_basereg   */ \
2706    0   /* mask_basereg  */
2707
2708/******************************************************************************/
2709
2710.macro pixman_composite_out_reverse_8_8888_process_pixblock_head
2711    /* src is in d0 */
2712    /* destination pixel data is in {d4, d5, d6, d7} */
2713    vmvn.8      d1, d0 /* get inverted alpha */
2714    /* now do alpha blending */
2715    vmull.u8    q8, d1, d4
2716    vmull.u8    q9, d1, d5
2717    vmull.u8    q10, d1, d6
2718    vmull.u8    q11, d1, d7
2719.endm
2720
2721.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail
2722    vrshr.u16   q14, q8, #8
2723    vrshr.u16   q15, q9, #8
2724    vrshr.u16   q12, q10, #8
2725    vrshr.u16   q13, q11, #8
2726    vraddhn.u16 d28, q14, q8
2727    vraddhn.u16 d29, q15, q9
2728    vraddhn.u16 d30, q12, q10
2729    vraddhn.u16 d31, q13, q11
2730    /* 32bpp result is in {d28, d29, d30, d31} */
2731.endm
2732
2733/* TODO: expand macros and do better instructions scheduling */
2734.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head
2735    fetch_src_pixblock
2736    pixman_composite_out_reverse_8_8888_process_pixblock_tail
2737    vld4.8    {d4, d5, d6, d7}, [DST_R, :128]!
2738    cache_preload 8, 8
2739    pixman_composite_out_reverse_8_8888_process_pixblock_head
2740    vst4.8    {d28, d29, d30, d31}, [DST_W, :128]!
2741.endm
2742
2743generate_composite_function \
2744    pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \
2745    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2746    8, /* number of pixels, processed in a single block */ \
2747    5, /* prefetch distance */ \
2748    default_init, \
2749    default_cleanup, \
2750    pixman_composite_out_reverse_8_8888_process_pixblock_head, \
2751    pixman_composite_out_reverse_8_8888_process_pixblock_tail, \
2752    pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \
2753    28, /* dst_w_basereg */ \
2754    4, /* dst_r_basereg */ \
2755    0, /* src_basereg   */ \
2756    0   /* mask_basereg  */
2757
2758/******************************************************************************/
2759
2760generate_composite_function_nearest_scanline \
2761    pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
2762    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2763    8, /* number of pixels, processed in a single block */ \
2764    default_init, \
2765    default_cleanup, \
2766    pixman_composite_over_8888_8888_process_pixblock_head, \
2767    pixman_composite_over_8888_8888_process_pixblock_tail, \
2768    pixman_composite_over_8888_8888_process_pixblock_tail_head
2769
2770generate_composite_function_nearest_scanline \
2771    pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
2772    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2773    8, /* number of pixels, processed in a single block */ \
2774    default_init, \
2775    default_cleanup, \
2776    pixman_composite_over_8888_0565_process_pixblock_head, \
2777    pixman_composite_over_8888_0565_process_pixblock_tail, \
2778    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
2779    28, /* dst_w_basereg */ \
2780    4,  /* dst_r_basereg */ \
2781    0,  /* src_basereg   */ \
2782    24  /* mask_basereg  */
2783
2784generate_composite_function_nearest_scanline \
2785    pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
2786    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2787    8, /* number of pixels, processed in a single block */ \
2788    default_init, \
2789    default_cleanup, \
2790    pixman_composite_src_8888_0565_process_pixblock_head, \
2791    pixman_composite_src_8888_0565_process_pixblock_tail, \
2792    pixman_composite_src_8888_0565_process_pixblock_tail_head
2793
2794generate_composite_function_nearest_scanline \
2795    pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
2796    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2797    8, /* number of pixels, processed in a single block */ \
2798    default_init, \
2799    default_cleanup, \
2800    pixman_composite_src_0565_8888_process_pixblock_head, \
2801    pixman_composite_src_0565_8888_process_pixblock_tail, \
2802    pixman_composite_src_0565_8888_process_pixblock_tail_head
2803
2804generate_composite_function_nearest_scanline \
2805    pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
2806    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2807    8, /* number of pixels, processed in a single block */ \
2808    default_init_need_all_regs, \
2809    default_cleanup_need_all_regs, \
2810    pixman_composite_over_8888_8_0565_process_pixblock_head, \
2811    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
2812    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
2813    28, /* dst_w_basereg */ \
2814    4,  /* dst_r_basereg */ \
2815    8,  /* src_basereg   */ \
2816    24  /* mask_basereg  */
2817
2818generate_composite_function_nearest_scanline \
2819    pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
2820    FLAG_DST_READWRITE, \
2821    8, /* number of pixels, processed in a single block */ \
2822    default_init_need_all_regs, \
2823    default_cleanup_need_all_regs, \
2824    pixman_composite_over_0565_8_0565_process_pixblock_head, \
2825    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
2826    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
2827    28, /* dst_w_basereg */ \
2828    10,  /* dst_r_basereg */ \
2829    8,  /* src_basereg   */ \
2830    15  /* mask_basereg  */
2831
2832/******************************************************************************/
2833
2834/*
2835 * Bilinear scaling support code which tries to provide pixel fetching, color
2836 * format conversion, and interpolation as separate macros which can be used
2837 * as the basic building blocks for constructing bilinear scanline functions.
2838 */

.macro bilinear_load_8888 reg1, reg2, tmp
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    vld1.32   {reg1}, [TMP1], STRIDE
    vld1.32   {reg2}, [TMP1]
.endm
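
/*
 * X is a 16.16 fixed-point source coordinate, stepped by UX for each
 * output pixel, and STRIDE is the byte distance from the top scanline
 * to the bottom one (set up by the scanline function template below).
 * Roughly, in C (illustrative sketch only):
 *
 *   i  = x >> 16;                         // integer part of coordinate
 *   tl = top[i];    tr = top[i + 1];      // first vld1.32 (2 pixels)
 *   bl = bottom[i]; br = bottom[i + 1];   // second vld1.32, via STRIDE
 *   x += ux;
 *
 * i.e. the two loads fetch the horizontally adjacent pixel pair from
 * both scanlines needed for one bilinear output pixel.
 */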

.macro bilinear_load_0565 reg1, reg2, tmp
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #1
    vld1.32   {reg2[0]}, [TMP1], STRIDE
    vld1.32   {reg2[1]}, [TMP1]
    convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
.endm
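
/*
 * Here convert_four_0565_to_x888_packed (from pixman-arm-neon-asm.h)
 * widens the fetched r5g6b5 pixels to 32bpp before interpolation.
 * Conceptually this is the usual bit-replicating channel expansion
 * (illustrative C; the exact instruction sequence and rounding live in
 * the helper macro):
 *
 *   r8 = (r5 << 3) | (r5 >> 2);
 *   g8 = (g6 << 2) | (g6 >> 4);
 *   b8 = (b5 << 3) | (b5 >> 2);
 */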

.macro bilinear_load_and_vertical_interpolate_two_8888 \
                    acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2

    bilinear_load_8888 reg1, reg2, tmp1
    vmull.u8  acc1, reg1, d28
    vmlal.u8  acc1, reg2, d29
    bilinear_load_8888 reg3, reg4, tmp2
    vmull.u8  acc2, reg3, d28
    vmlal.u8  acc2, reg4, d29
.endm

.macro bilinear_load_and_vertical_interpolate_four_8888 \
                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi

    bilinear_load_and_vertical_interpolate_two_8888 \
                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
    bilinear_load_and_vertical_interpolate_two_8888 \
                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
.endm

.macro bilinear_load_and_vertical_interpolate_two_0565 \
                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi

    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #1
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #1
    vld1.32   {acc2lo[0]}, [TMP1], STRIDE
    vld1.32   {acc2hi[0]}, [TMP2], STRIDE
    vld1.32   {acc2lo[1]}, [TMP1]
    vld1.32   {acc2hi[1]}, [TMP2]
    convert_0565_to_x888 acc2, reg3, reg2, reg1
    vzip.u8   reg1, reg3
    vzip.u8   reg2, reg4
    vzip.u8   reg3, reg4
    vzip.u8   reg1, reg2
    vmull.u8  acc1, reg1, d28
    vmlal.u8  acc1, reg2, d29
    vmull.u8  acc2, reg3, d28
    vmlal.u8  acc2, reg4, d29
.endm

.macro bilinear_load_and_vertical_interpolate_four_0565 \
                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi

    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #1
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #1
    vld1.32   {xacc2lo[0]}, [TMP1], STRIDE
    vld1.32   {xacc2hi[0]}, [TMP2], STRIDE
    vld1.32   {xacc2lo[1]}, [TMP1]
    vld1.32   {xacc2hi[1]}, [TMP2]
    convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #1
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #1
    vld1.32   {yacc2lo[0]}, [TMP1], STRIDE
    vzip.u8   xreg1, xreg3
    vld1.32   {yacc2hi[0]}, [TMP2], STRIDE
    vzip.u8   xreg2, xreg4
    vld1.32   {yacc2lo[1]}, [TMP1]
    vzip.u8   xreg3, xreg4
    vld1.32   {yacc2hi[1]}, [TMP2]
    vzip.u8   xreg1, xreg2
    convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
    vmull.u8  xacc1, xreg1, d28
    vzip.u8   yreg1, yreg3
    vmlal.u8  xacc1, xreg2, d29
    vzip.u8   yreg2, yreg4
    vmull.u8  xacc2, xreg3, d28
    vzip.u8   yreg3, yreg4
    vmlal.u8  xacc2, xreg4, d29
    vzip.u8   yreg1, yreg2
    vmull.u8  yacc1, yreg1, d28
    vmlal.u8  yacc1, yreg2, d29
    vmull.u8  yacc2, yreg3, d28
    vmlal.u8  yacc2, yreg4, d29
.endm

.macro bilinear_store_8888 numpix, tmp1, tmp2
.if numpix == 4
    vst1.32   {d0, d1}, [OUT, :128]!
.elseif numpix == 2
    vst1.32   {d0}, [OUT, :64]!
.elseif numpix == 1
    vst1.32   {d0[0]}, [OUT, :32]!
.else
    .error bilinear_store_8888 numpix is unsupported
.endif
.endm

.macro bilinear_store_0565 numpix, tmp1, tmp2
    vuzp.u8 d0, d1
    vuzp.u8 d2, d3
    vuzp.u8 d1, d3
    vuzp.u8 d0, d2
    convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
.if numpix == 4
    vst1.16   {d2}, [OUT, :64]!
.elseif numpix == 2
    vst1.32   {d2[0]}, [OUT, :32]!
.elseif numpix == 1
    vst1.16   {d2[0]}, [OUT, :16]!
.else
    .error bilinear_store_0565 numpix is unsupported
.endif
.endm
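
/*
 * Storing back to r5g6b5 keeps only the top 5/6/5 bits of each channel.
 * The scalar equivalent of the packing done by convert_8888_to_0565
 * (and by the vshll.u8/vsri.u16 sequences in the 8888_0565 code below)
 * is roughly the following illustrative C sketch:
 *
 *   static inline uint16_t
 *   pack_0565 (uint8_t r, uint8_t g, uint8_t b)
 *   {
 *       return ((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3);
 *   }
 *
 * The NEON variant positions each channel in the top bits of a 16-bit
 * lane with vshll.u8 and then merges the three channels together with
 * shift-and-insert (vsri.u16) operations.
 */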

.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
    bilinear_load_&src_fmt d0, d1, d2
    vmull.u8  q1, d0, d28
    vmlal.u8  q1, d1, d29
    /* 5 cycles bubble */
    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
    /* 5 cycles bubble */
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    /* 3 cycles bubble */
    vmovn.u16 d0, q0
    /* 1 cycle bubble */
    bilinear_store_&dst_fmt 1, q2, q3
.endm

.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
    bilinear_load_and_vertical_interpolate_two_&src_fmt \
                q1, q11, d0, d1, d20, d21, d22, d23
    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q10, d22, d31
    vmlal.u16 q10, d23, d31
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vmovn.u16 d0, q0
    bilinear_store_&dst_fmt 2, q2, q3
.endm

.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
    bilinear_load_and_vertical_interpolate_four_&src_fmt \
                q1, q11, d0, d1, d20, d21, d22, d23 \
                q3, q9,  d4, d5, d16, d17, d18, d19
    pld       [TMP1, PF_OFFS]
    sub       TMP1, TMP1, STRIDE
    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q10, d22, d31
    vmlal.u16 q10, d23, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d6, d30
    vmlal.u16 q2, d7, d30
    vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
    pld       [TMP2, PF_OFFS]
    vmlsl.u16 q8, d18, d31
    vmlal.u16 q8, d19, d31
    vadd.u16  q12, q12, q13
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vmovn.u16 d0, q0
    vmovn.u16 d1, q2
    vadd.u16  q12, q12, q13
    bilinear_store_&dst_fmt 4, q2, q3
.endm

.macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
.else
    bilinear_interpolate_four_pixels src_fmt, dst_fmt
.endif
.endm

.macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
.endif
.endm

.macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
.else
    bilinear_interpolate_four_pixels src_fmt, dst_fmt
.endif
.endm

.macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
.else
    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
.endif
.endm

.macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
.else
    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
.endif
.endm

.macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
.else
    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
.endif
.endm
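
/*
 * The head/tail/tail_head split above implements the same software
 * pipelining scheme as the compositing function templates earlier in
 * this file: 'head' starts processing of the first pixel block, each
 * 'tail_head' completes the previous block while already starting the
 * next one (so loads, arithmetic and stores from adjacent blocks
 * overlap), and 'tail' drains the final block. The generated main loop
 * is schematically:
 *
 *   head (block 0)
 *   while (more blocks)
 *       tail_head (finish block n, start block n + 1)
 *   tail (last block)
 */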

.set BILINEAR_FLAG_UNROLL_4,          0
.set BILINEAR_FLAG_UNROLL_8,          1
.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
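
/*
 * BILINEAR_FLAG_UNROLL_8 selects the "8 pixels per iteration" main loop
 * in the template below instead of the default 4 pixels per iteration.
 * BILINEAR_FLAG_USE_ALL_NEON_REGS additionally makes the template save
 * and restore d8-d15 (callee saved registers per the AAPCS), so that
 * the pixel block code is free to clobber the whole NEON register file.
 */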

/*
 * Main template macro for generating NEON optimized bilinear scanline
 * functions.
 *
 * Bilinear scanline scaler macro template uses the following arguments:
 *  fname             - name of the function to generate
 *  src_fmt           - source color format (8888 or 0565)
 *  dst_fmt           - destination color format (8888 or 0565)
 *  src_bpp_shift     - (1 << src_bpp_shift) is the size of a source
 *                      pixel in bytes
 *  dst_bpp_shift     - (1 << dst_bpp_shift) is the size of a destination
 *                      pixel in bytes
 *  prefetch_distance - prefetch in the source image by that many
 *                      pixels ahead
 *  flags             - any combination of the BILINEAR_FLAG_* values
 *                      defined above
 */
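
/*
 * Judging by the register assignments below, the generated functions
 * take the first four arguments in r0-r3 and the rest on the stack, so
 * they should be callable from C with a prototype along these lines
 * (illustrative sketch only; the authoritative declarations live in the
 * C side bindings):
 *
 *   void fname (dst_type       *out,     // r0
 *               const src_type *top,     // r1
 *               const src_type *bottom,  // r2
 *               int             wt,      // r3, top vertical weight
 *               int             wb,      // stack, bottom vertical weight
 *               pixman_fixed_t  x,       // stack, 16.16 start coordinate
 *               pixman_fixed_t  ux,      // stack, 16.16 coordinate step
 *               int             width);  // stack, pixels to produce
 */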

.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
                                       src_bpp_shift, dst_bpp_shift, \
                                       prefetch_distance, flags

pixman_asm_function fname
    OUT       .req      r0
    TOP       .req      r1
    BOTTOM    .req      r2
    WT        .req      r3
    WB        .req      r4
    X         .req      r5
    UX        .req      r6
    WIDTH     .req      ip
    TMP1      .req      r3
    TMP2      .req      r4
    PF_OFFS   .req      r7
    TMP3      .req      r8
    TMP4      .req      r9
    STRIDE    .req      r2

    mov       ip, sp
    push      {r4, r5, r6, r7, r8, r9}
    mov       PF_OFFS, #prefetch_distance
    ldmia     ip, {WB, X, UX, WIDTH}
    mul       PF_OFFS, PF_OFFS, UX

.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
    vpush     {d8-d15}
.endif

    sub       STRIDE, BOTTOM, TOP
    .unreq    BOTTOM

    cmp       WIDTH, #0
    ble       3f

    vdup.u16  q12, X
    vdup.u16  q13, UX
    vdup.u8   d28, WT
    vdup.u8   d29, WB
    vadd.u16  d25, d25, d26

    /* ensure good destination alignment */
    cmp       WIDTH, #1
    blt       0f
    tst       OUT, #(1 << dst_bpp_shift)
    beq       0f
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    bilinear_interpolate_last_pixel src_fmt, dst_fmt
    sub       WIDTH, WIDTH, #1
0:
    vadd.u16  q13, q13, q13
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13

    cmp       WIDTH, #2
    blt       0f
    tst       OUT, #(1 << (dst_bpp_shift + 1))
    beq       0f
    bilinear_interpolate_two_pixels src_fmt, dst_fmt
    sub       WIDTH, WIDTH, #2
0:
.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
/*********** 8 pixels per iteration *****************/
    cmp       WIDTH, #4
    blt       0f
    tst       OUT, #(1 << (dst_bpp_shift + 2))
    beq       0f
    bilinear_interpolate_four_pixels src_fmt, dst_fmt
    sub       WIDTH, WIDTH, #4
0:
    subs      WIDTH, WIDTH, #8
    blt       1f
    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
    bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
    subs      WIDTH, WIDTH, #8
    blt       5f
0:
    bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
    subs      WIDTH, WIDTH, #8
    bge       0b
5:
    bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
1:
    tst       WIDTH, #4
    beq       2f
    bilinear_interpolate_four_pixels src_fmt, dst_fmt
2:
.else
/*********** 4 pixels per iteration *****************/
    subs      WIDTH, WIDTH, #4
    blt       1f
    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
    subs      WIDTH, WIDTH, #4
    blt       5f
0:
    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
    subs      WIDTH, WIDTH, #4
    bge       0b
5:
    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
1:
/****************************************************/
.endif
    /* handle the remaining trailing pixels */
    tst       WIDTH, #2
    beq       2f
    bilinear_interpolate_two_pixels src_fmt, dst_fmt
2:
    tst       WIDTH, #1
    beq       3f
    bilinear_interpolate_last_pixel src_fmt, dst_fmt
3:
.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
    vpop      {d8-d15}
.endif
    pop       {r4, r5, r6, r7, r8, r9}
    bx        lr

    .unreq    OUT
    .unreq    TOP
    .unreq    WT
    .unreq    WB
    .unreq    X
    .unreq    UX
    .unreq    WIDTH
    .unreq    TMP1
    .unreq    TMP2
    .unreq    PF_OFFS
    .unreq    TMP3
    .unreq    TMP4
    .unreq    STRIDE
.endfunc

.endm

/*****************************************************************************/

.set have_bilinear_interpolate_four_pixels_8888_8888, 1

.macro bilinear_interpolate_four_pixels_8888_8888_head
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2

    vld1.32   {d22}, [TMP1], STRIDE
    vld1.32   {d23}, [TMP1]
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    vmull.u8  q8, d22, d28
    vmlal.u8  q8, d23, d29

    vld1.32   {d22}, [TMP2], STRIDE
    vld1.32   {d23}, [TMP2]
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmull.u8  q9, d22, d28
    vmlal.u8  q9, d23, d29

    vld1.32   {d22}, [TMP3], STRIDE
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29

    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30

    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29

    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31
.endm

.macro bilinear_interpolate_four_pixels_8888_8888_tail
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vadd.u16  q12, q12, q13
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vmovn.u16 d6, q0
    vmovn.u16 d7, q2
    vadd.u16  q12, q12, q13
    vst1.32   {d6, d7}, [OUT, :128]!
.endm

.macro bilinear_interpolate_four_pixels_8888_8888_tail_head
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
        vmlal.u16 q1, d19, d31
        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
        vmlsl.u16 q2, d20, d30
        vmlal.u16 q2, d21, d30
        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vld1.32   {d20}, [TMP1], STRIDE
        vmlsl.u16 q3, d22, d31
        vmlal.u16 q3, d23, d31
    vld1.32   {d21}, [TMP1]
    vmull.u8  q8, d20, d28
    vmlal.u8  q8, d21, d29
        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22}, [TMP2], STRIDE
        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
        vadd.u16  q12, q12, q13
    vld1.32   {d23}, [TMP2]
    vmull.u8  q9, d22, d28
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmlal.u8  q9, d23, d29
    vld1.32   {d22}, [TMP3], STRIDE
        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
        vmovn.u16 d6, q0
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
        vmovn.u16 d7, q2
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30
    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
        vadd.u16  q12, q12, q13
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
        vst1.32   {d6, d7}, [OUT, :128]!
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31
.endm

/*****************************************************************************/

.set have_bilinear_interpolate_eight_pixels_8888_0565, 1

.macro bilinear_interpolate_eight_pixels_8888_0565_head
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
    vld1.32   {d20}, [TMP1], STRIDE
    vld1.32   {d21}, [TMP1]
    vmull.u8  q8, d20, d28
    vmlal.u8  q8, d21, d29
    vld1.32   {d22}, [TMP2], STRIDE
    vld1.32   {d23}, [TMP2]
    vmull.u8  q9, d22, d28
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmlal.u8  q9, d23, d29
    vld1.32   {d22}, [TMP3], STRIDE
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30
    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31

    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
        vmlal.u16 q1, d19, d31
        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
        vmlsl.u16 q2, d20, d30
        vmlal.u16 q2, d21, d30
        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vld1.32   {d20}, [TMP1], STRIDE
        vmlsl.u16 q3, d22, d31
        vmlal.u16 q3, d23, d31
    vld1.32   {d21}, [TMP1]
    vmull.u8  q8, d20, d28
    vmlal.u8  q8, d21, d29
        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22}, [TMP2], STRIDE
        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
        vadd.u16  q12, q12, q13
    vld1.32   {d23}, [TMP2]
    vmull.u8  q9, d22, d28
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmlal.u8  q9, d23, d29
    vld1.32   {d22}, [TMP3], STRIDE
        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
        vmovn.u16 d8, q0
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
        vmovn.u16 d9, q2
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30
    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
        vadd.u16  q12, q12, q13
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31
.endm

.macro bilinear_interpolate_eight_pixels_8888_0565_tail
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vadd.u16  q12, q12, q13
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vmovn.u16 d10, q0
    vmovn.u16 d11, q2
    vadd.u16  q12, q12, q13

    vuzp.u8   d8, d9
    vuzp.u8   d10, d11
    vuzp.u8   d9, d11
    vuzp.u8   d8, d10
    vshll.u8  q6, d9, #8
    vshll.u8  q5, d10, #8
    vshll.u8  q7, d8, #8
    vsri.u16  q5, q6, #5
    vsri.u16  q5, q7, #11
    vst1.32   {d10, d11}, [OUT, :128]!
.endm

.macro bilinear_interpolate_eight_pixels_8888_0565_tail_head
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
        vmlal.u16 q1, d19, d31
        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
            vuzp.u8 d8, d9
        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
        vmlsl.u16 q2, d20, d30
        vmlal.u16 q2, d21, d30
        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vld1.32   {d20}, [TMP1], STRIDE
        vmlsl.u16 q3, d22, d31
        vmlal.u16 q3, d23, d31
    vld1.32   {d21}, [TMP1]
    vmull.u8  q8, d20, d28
    vmlal.u8  q8, d21, d29
        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22}, [TMP2], STRIDE
        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
        vadd.u16  q12, q12, q13
    vld1.32   {d23}, [TMP2]
    vmull.u8  q9, d22, d28
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmlal.u8  q9, d23, d29
    vld1.32   {d22}, [TMP3], STRIDE
        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
        vmovn.u16 d10, q0
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
        vmovn.u16 d11, q2
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30
    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
        vadd.u16  q12, q12, q13
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
            vuzp.u8 d10, d11
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31

    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
        vmlal.u16 q1, d19, d31
            vuzp.u8 d9, d11
        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
            vuzp.u8 d8, d10
        vmlsl.u16 q2, d20, d30
        vmlal.u16 q2, d21, d30
        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vld1.32   {d20}, [TMP1], STRIDE
        vmlsl.u16 q3, d22, d31
        vmlal.u16 q3, d23, d31
    vld1.32   {d21}, [TMP1]
    vmull.u8  q8, d20, d28
    vmlal.u8  q8, d21, d29
            vshll.u8  q6, d9, #8
            vshll.u8  q5, d10, #8
            vshll.u8  q7, d8, #8
        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
            vsri.u16  q5, q6, #5
        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
            vsri.u16  q5, q7, #11
        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22}, [TMP2], STRIDE
        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
        vadd.u16  q12, q12, q13
    vld1.32   {d23}, [TMP2]
    vmull.u8  q9, d22, d28
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmlal.u8  q9, d23, d29
    vld1.32   {d22}, [TMP3], STRIDE
        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
        vmovn.u16 d8, q0
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
        vmovn.u16 d9, q2
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30
    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
        vadd.u16  q12, q12, q13
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
            vst1.32   {d10, d11}, [OUT, :128]!
    vmlsl.u16 q1, d18, d31
.endm
/*****************************************************************************/

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
    2, 2, 28, BILINEAR_FLAG_UNROLL_4

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
    2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
    1, 2, 28, BILINEAR_FLAG_UNROLL_4

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
    1, 1, 28, BILINEAR_FLAG_UNROLL_4