1/*
2 * Copyright © 2009 Nokia Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 *
23 * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
24 */
25
26/*
27 * This file contains implementations of NEON optimized pixel processing
28 * functions. There is no full and detailed tutorial, but some functions
29 * (those which are exposing some new or interesting features) are
30 * extensively commented and can be used as examples.
31 *
32 * You may want to have a look at the comments for following functions:
33 *  - pixman_composite_over_8888_0565_asm_neon
34 *  - pixman_composite_over_n_8_0565_asm_neon
35 */
36
37/* Prevent the stack from becoming executable for no reason... */
38#if defined(__linux__) && defined(__ELF__)
39.section .note.GNU-stack,"",%progbits
40#endif
41
42    .text
43    .fpu neon
44    .arch armv7a
45    .object_arch armv4
46    .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
47    .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
48    .arm
49    .altmacro
50    .p2align 2
51
52/* Supplementary macro for setting function attributes */
53.macro pixman_asm_function fname
54    .func fname
55    .global fname
56#ifdef __ELF__
57    .hidden fname
58    .type fname, %function
59#endif
60fname:
61.endm
62
63/*
64 * The defines which are shared between C and assembly code
65 */
66
67/* bilinear interpolation precision (must be < 8) */
68#define BILINEAR_INTERPOLATION_BITS 7
69#define BILINEAR_INTERPOLATION_RANGE (1 << BILINEAR_INTERPOLATION_BITS)
70
71/*
72 * Copyright © 2009 Nokia Corporation
73 *
74 * Permission is hereby granted, free of charge, to any person obtaining a
75 * copy of this software and associated documentation files (the "Software"),
76 * to deal in the Software without restriction, including without limitation
77 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
78 * and/or sell copies of the Software, and to permit persons to whom the
79 * Software is furnished to do so, subject to the following conditions:
80 *
81 * The above copyright notice and this permission notice (including the next
82 * paragraph) shall be included in all copies or substantial portions of the
83 * Software.
84 *
85 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
86 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
87 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
88 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
89 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
90 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
91 * DEALINGS IN THE SOFTWARE.
92 *
93 * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
94 */
95
96/*
97 * This file contains a macro ('generate_composite_function') which can
98 * construct 2D image processing functions, based on a common template.
99 * Any combinations of source, destination and mask images with 8bpp,
100 * 16bpp, 24bpp, 32bpp color formats are supported.
101 *
102 * This macro takes care of:
103 *  - handling of leading and trailing unaligned pixels
104 *  - doing most of the work related to L2 cache preload
105 *  - encourages the use of software pipelining for better instructions
106 *    scheduling
107 *
108 * The user of this macro has to provide some configuration parameters
109 * (bit depths for the images, prefetch distance, etc.) and a set of
110 * macros, which should implement basic code chunks responsible for
111 * pixels processing. See 'pixman-arm-neon-asm.S' file for the usage
112 * examples.
113 *
114 * TODO:
115 *  - try overlapped pixel method (from Ian Rickards) when processing
116 *    exactly two blocks of pixels
117 *  - maybe add an option to do reverse scanline processing
118 */
119
120/*
121 * Bit flags for 'generate_composite_function' macro which are used
122 * to tune generated functions behavior.
123 */
124.set FLAG_DST_WRITEONLY,       0
125.set FLAG_DST_READWRITE,       1
126.set FLAG_DEINTERLEAVE_32BPP,  2
127
128/*
129 * Offset in stack where mask and source pointer/stride can be accessed
130 * from 'init' macro. This is useful for doing special handling for solid mask.
131 */
132.set ARGS_STACK_OFFSET,        40
133
134/*
135 * Constants for selecting preferable prefetch type.
136 */
137.set PREFETCH_TYPE_NONE,       0 /* No prefetch at all */
138.set PREFETCH_TYPE_SIMPLE,     1 /* A simple, fixed-distance-ahead prefetch */
139.set PREFETCH_TYPE_ADVANCED,   2 /* Advanced fine-grained prefetch */
140
141/*
142 * Definitions of supplementary pixld/pixst macros (for partial load/store of
143 * pixel data).
144 */
145
146.macro pixldst1 op, elem_size, reg1, mem_operand, abits
147.if abits > 0
148    op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!
149.else
150    op&.&elem_size {d&reg1}, [&mem_operand&]!
151.endif
152.endm
153
154.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
155.if abits > 0
156    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
157.else
158    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
159.endif
160.endm
161
162.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
163.if abits > 0
164    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
165.else
166    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
167.endif
168.endm
169
170.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
171    op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
172.endm
173
174.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
175    op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
176.endm
177
178.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
179    op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
180.endm
181
182.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
183.if numbytes == 32
184    pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
185                              %(basereg+6), %(basereg+7), mem_operand, abits
186.elseif numbytes == 16
187    pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
188.elseif numbytes == 8
189    pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
190.elseif numbytes == 4
191    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
192        pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
193    .elseif elem_size == 16
194        pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
195        pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
196    .else
197        pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
198        pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
199        pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
200        pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
201    .endif
202.elseif numbytes == 2
203    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
204        pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
205    .else
206        pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
207        pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
208    .endif
209.elseif numbytes == 1
210    pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
211.else
212    .error "unsupported size: numbytes"
213.endif
214.endm
215
216.macro pixld numpix, bpp, basereg, mem_operand, abits=0
217.if bpp > 0
218.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
219    pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
220                      %(basereg+6), %(basereg+7), mem_operand, abits
221.elseif (bpp == 24) && (numpix == 8)
222    pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
223.elseif (bpp == 24) && (numpix == 4)
224    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
225    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
226    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
227    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
228.elseif (bpp == 24) && (numpix == 2)
229    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
230    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
231.elseif (bpp == 24) && (numpix == 1)
232    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
233.else
234    pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
235.endif
236.endif
237.endm
238
239.macro pixst numpix, bpp, basereg, mem_operand, abits=0
240.if bpp > 0
241.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
242    pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
243                      %(basereg+6), %(basereg+7), mem_operand, abits
244.elseif (bpp == 24) && (numpix == 8)
245    pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
246.elseif (bpp == 24) && (numpix == 4)
247    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
248    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
249    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
250    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
251.elseif (bpp == 24) && (numpix == 2)
252    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
253    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
254.elseif (bpp == 24) && (numpix == 1)
255    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
256.else
257    pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
258.endif
259.endif
260.endm
261
262.macro pixld_a numpix, bpp, basereg, mem_operand
263.if (bpp * numpix) <= 128
264    pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
265.else
266    pixld numpix, bpp, basereg, mem_operand, 128
267.endif
268.endm
269
270.macro pixst_a numpix, bpp, basereg, mem_operand
271.if (bpp * numpix) <= 128
272    pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
273.else
274    pixst numpix, bpp, basereg, mem_operand, 128
275.endif
276.endm
277
278/*
279 * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
280 * aliases to be defined)
281 */
282.macro pixld1_s elem_size, reg1, mem_operand
283.if elem_size == 16
284    mov     TMP1, VX, asr #16
285    adds    VX, VX, UNIT_X
2865:  subpls  VX, VX, SRC_WIDTH_FIXED
287    bpl     5b
288    add     TMP1, mem_operand, TMP1, asl #1
289    mov     TMP2, VX, asr #16
290    adds    VX, VX, UNIT_X
2915:  subpls  VX, VX, SRC_WIDTH_FIXED
292    bpl     5b
293    add     TMP2, mem_operand, TMP2, asl #1
294    vld1.16 {d&reg1&[0]}, [TMP1, :16]
295    mov     TMP1, VX, asr #16
296    adds    VX, VX, UNIT_X
2975:  subpls  VX, VX, SRC_WIDTH_FIXED
298    bpl     5b
299    add     TMP1, mem_operand, TMP1, asl #1
300    vld1.16 {d&reg1&[1]}, [TMP2, :16]
301    mov     TMP2, VX, asr #16
302    adds    VX, VX, UNIT_X
3035:  subpls  VX, VX, SRC_WIDTH_FIXED
304    bpl     5b
305    add     TMP2, mem_operand, TMP2, asl #1
306    vld1.16 {d&reg1&[2]}, [TMP1, :16]
307    vld1.16 {d&reg1&[3]}, [TMP2, :16]
308.elseif elem_size == 32
309    mov     TMP1, VX, asr #16
310    adds    VX, VX, UNIT_X
3115:  subpls  VX, VX, SRC_WIDTH_FIXED
312    bpl     5b
313    add     TMP1, mem_operand, TMP1, asl #2
314    mov     TMP2, VX, asr #16
315    adds    VX, VX, UNIT_X
3165:  subpls  VX, VX, SRC_WIDTH_FIXED
317    bpl     5b
318    add     TMP2, mem_operand, TMP2, asl #2
319    vld1.32 {d&reg1&[0]}, [TMP1, :32]
320    vld1.32 {d&reg1&[1]}, [TMP2, :32]
321.else
322    .error "unsupported"
323.endif
324.endm
325
326.macro pixld2_s elem_size, reg1, reg2, mem_operand
327.if 0 /* elem_size == 32 */
328    mov     TMP1, VX, asr #16
329    add     VX, VX, UNIT_X, asl #1
330    add     TMP1, mem_operand, TMP1, asl #2
331    mov     TMP2, VX, asr #16
332    sub     VX, VX, UNIT_X
333    add     TMP2, mem_operand, TMP2, asl #2
334    vld1.32 {d&reg1&[0]}, [TMP1, :32]
335    mov     TMP1, VX, asr #16
336    add     VX, VX, UNIT_X, asl #1
337    add     TMP1, mem_operand, TMP1, asl #2
338    vld1.32 {d&reg2&[0]}, [TMP2, :32]
339    mov     TMP2, VX, asr #16
340    add     VX, VX, UNIT_X
341    add     TMP2, mem_operand, TMP2, asl #2
342    vld1.32 {d&reg1&[1]}, [TMP1, :32]
343    vld1.32 {d&reg2&[1]}, [TMP2, :32]
344.else
345    pixld1_s elem_size, reg1, mem_operand
346    pixld1_s elem_size, reg2, mem_operand
347.endif
348.endm
349
350.macro pixld0_s elem_size, reg1, idx, mem_operand
351.if elem_size == 16
352    mov     TMP1, VX, asr #16
353    adds    VX, VX, UNIT_X
3545:  subpls  VX, VX, SRC_WIDTH_FIXED
355    bpl     5b
356    add     TMP1, mem_operand, TMP1, asl #1
357    vld1.16 {d&reg1&[idx]}, [TMP1, :16]
358.elseif elem_size == 32
359    mov     TMP1, VX, asr #16
360    adds    VX, VX, UNIT_X
3615:  subpls  VX, VX, SRC_WIDTH_FIXED
362    bpl     5b
363    add     TMP1, mem_operand, TMP1, asl #2
364    vld1.32 {d&reg1&[idx]}, [TMP1, :32]
365.endif
366.endm
367
368.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
369.if numbytes == 32
370    pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
371    pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
372    pixdeinterleave elem_size, %(basereg+4)
373.elseif numbytes == 16
374    pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
375.elseif numbytes == 8
376    pixld1_s elem_size, %(basereg+1), mem_operand
377.elseif numbytes == 4
378    .if elem_size == 32
379        pixld0_s elem_size, %(basereg+0), 1, mem_operand
380    .elseif elem_size == 16
381        pixld0_s elem_size, %(basereg+0), 2, mem_operand
382        pixld0_s elem_size, %(basereg+0), 3, mem_operand
383    .else
384        pixld0_s elem_size, %(basereg+0), 4, mem_operand
385        pixld0_s elem_size, %(basereg+0), 5, mem_operand
386        pixld0_s elem_size, %(basereg+0), 6, mem_operand
387        pixld0_s elem_size, %(basereg+0), 7, mem_operand
388    .endif
389.elseif numbytes == 2
390    .if elem_size == 16
391        pixld0_s elem_size, %(basereg+0), 1, mem_operand
392    .else
393        pixld0_s elem_size, %(basereg+0), 2, mem_operand
394        pixld0_s elem_size, %(basereg+0), 3, mem_operand
395    .endif
396.elseif numbytes == 1
397    pixld0_s elem_size, %(basereg+0), 1, mem_operand
398.else
399    .error "unsupported size: numbytes"
400.endif
401.endm
402
403.macro pixld_s numpix, bpp, basereg, mem_operand
404.if bpp > 0
405    pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
406.endif
407.endm
408
409.macro vuzp8 reg1, reg2
410    vuzp.8 d&reg1, d&reg2
411.endm
412
413.macro vzip8 reg1, reg2
414    vzip.8 d&reg1, d&reg2
415.endm
416
417/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
418.macro pixdeinterleave bpp, basereg
419.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
420    vuzp8 %(basereg+0), %(basereg+1)
421    vuzp8 %(basereg+2), %(basereg+3)
422    vuzp8 %(basereg+1), %(basereg+3)
423    vuzp8 %(basereg+0), %(basereg+2)
424.endif
425.endm
426
427/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
428.macro pixinterleave bpp, basereg
429.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
430    vzip8 %(basereg+0), %(basereg+2)
431    vzip8 %(basereg+1), %(basereg+3)
432    vzip8 %(basereg+2), %(basereg+3)
433    vzip8 %(basereg+0), %(basereg+1)
434.endif
435.endm
436
437/*
438 * This is a macro for implementing cache preload. The main idea is that
439 * cache preload logic is mostly independent from the rest of pixels
440 * processing code. It starts at the top left pixel and moves forward
441 * across pixels and can jump across scanlines. Prefetch distance is
442 * handled in an 'incremental' way: it starts from 0 and advances to the
443 * optimal distance over time. After reaching optimal prefetch distance,
444 * it is kept constant. There are some checks which prevent prefetching
445 * unneeded pixel lines below the image (but it still can prefetch a bit
446 * more data on the right side of the image - not a big issue and may
447 * be actually helpful when rendering text glyphs). Additional trick is
448 * the use of LDR instruction for prefetch instead of PLD when moving to
449 * the next line, the point is that we have a high chance of getting TLB
450 * miss in this case, and PLD would be useless.
451 *
452 * This sounds like it may introduce a noticeable overhead (when working with
453 * fully cached data). But in reality, due to having a separate pipeline and
454 * instruction queue for NEON unit in ARM Cortex-A8, normal ARM code can
455 * execute simultaneously with NEON and be completely shadowed by it. Thus
456 * we get no performance overhead at all (*). This looks like a very nice
457 * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher,
458 * but still can implement some rather advanced prefetch logic in software
459 * for almost zero cost!
460 *
461 * (*) The overhead of the prefetcher is visible when running some trivial
462 * pixels processing like simple copy. Anyway, having prefetch is a must
463 * when working with the graphics data.
464 */
465.macro PF a, x:vararg
466.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
467    a x
468.endif
469.endm
470
471.macro cache_preload std_increment, boost_increment
472.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
473.if regs_shortage
474    PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
475.endif
476.if std_increment != 0
477    PF add PF_X, PF_X, #std_increment
478.endif
479    PF tst PF_CTL, #0xF
480    PF addne PF_X, PF_X, #boost_increment
481    PF subne PF_CTL, PF_CTL, #1
482    PF cmp PF_X, ORIG_W
483.if src_bpp_shift >= 0
484    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
485.endif
486.if dst_r_bpp != 0
487    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
488.endif
489.if mask_bpp_shift >= 0
490    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
491.endif
492    PF subge PF_X, PF_X, ORIG_W
493    PF subges PF_CTL, PF_CTL, #0x10
494.if src_bpp_shift >= 0
495    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
496.endif
497.if dst_r_bpp != 0
498    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
499.endif
500.if mask_bpp_shift >= 0
501    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
502.endif
503.endif
504.endm
505
506.macro cache_preload_simple
507.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
508.if src_bpp > 0
509    pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
510.endif
511.if dst_r_bpp > 0
512    pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
513.endif
514.if mask_bpp > 0
515    pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
516.endif
517.endif
518.endm
519
520.macro fetch_mask_pixblock
521    pixld       pixblock_size, mask_bpp, \
522                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
523.endm
524
525/*
526 * Macro which is used to process leading pixels until destination
527 * pointer is properly aligned (at 16 bytes boundary). When destination
528 * buffer uses 16bpp format, this is unnecessary, or even pointless.
529 */
530.macro ensure_destination_ptr_alignment process_pixblock_head, \
531                                        process_pixblock_tail, \
532                                        process_pixblock_tail_head
533.if dst_w_bpp != 24
534    tst         DST_R, #0xF
535    beq         2f
536
537.irp lowbit, 1, 2, 4, 8, 16
538local skip1
539.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
540.if lowbit < 16 /* we don't need more than 16-byte alignment */
541    tst         DST_R, #lowbit
542    beq         1f
543.endif
544    pixld_src   (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
545    pixld       (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
546.if dst_r_bpp > 0
547    pixld_a     (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
548.else
549    add         DST_R, DST_R, #lowbit
550.endif
551    PF add      PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
552    sub         W, W, #(lowbit * 8 / dst_w_bpp)
5531:
554.endif
555.endr
556    pixdeinterleave src_bpp, src_basereg
557    pixdeinterleave mask_bpp, mask_basereg
558    pixdeinterleave dst_r_bpp, dst_r_basereg
559
560    process_pixblock_head
561    cache_preload 0, pixblock_size
562    cache_preload_simple
563    process_pixblock_tail
564
565    pixinterleave dst_w_bpp, dst_w_basereg
566.irp lowbit, 1, 2, 4, 8, 16
567.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
568.if lowbit < 16 /* we don't need more than 16-byte alignment */
569    tst         DST_W, #lowbit
570    beq         1f
571.endif
572    pixst_a     (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
5731:
574.endif
575.endr
576.endif
5772:
578.endm
579
580/*
581 * Special code for processing up to (pixblock_size - 1) remaining
582 * trailing pixels. As SIMD processing performs operation on
583 * pixblock_size pixels, anything smaller than this has to be loaded
584 * and stored in a special way. Loading and storing of pixel data is
585 * performed in such a way that we fill some 'slots' in the NEON
586 * registers (some slots naturally are unused), then perform compositing
587 * operation as usual. In the end, the data is taken from these 'slots'
588 * and saved to memory.
589 *
590 * cache_preload_flag - allows to suppress prefetch if
591 *                      set to 0
592 * dst_aligned_flag   - selects whether destination buffer
593 *                      is aligned
594 */
595.macro process_trailing_pixels cache_preload_flag, \
596                               dst_aligned_flag, \
597                               process_pixblock_head, \
598                               process_pixblock_tail, \
599                               process_pixblock_tail_head
600    tst         W, #(pixblock_size - 1)
601    beq         2f
602.irp chunk_size, 16, 8, 4, 2, 1
603.if pixblock_size > chunk_size
604    tst         W, #chunk_size
605    beq         1f
606    pixld_src   chunk_size, src_bpp, src_basereg, SRC
607    pixld       chunk_size, mask_bpp, mask_basereg, MASK
608.if dst_aligned_flag != 0
609    pixld_a     chunk_size, dst_r_bpp, dst_r_basereg, DST_R
610.else
611    pixld       chunk_size, dst_r_bpp, dst_r_basereg, DST_R
612.endif
613.if cache_preload_flag != 0
614    PF add      PF_X, PF_X, #chunk_size
615.endif
6161:
617.endif
618.endr
619    pixdeinterleave src_bpp, src_basereg
620    pixdeinterleave mask_bpp, mask_basereg
621    pixdeinterleave dst_r_bpp, dst_r_basereg
622
623    process_pixblock_head
624.if cache_preload_flag != 0
625    cache_preload 0, pixblock_size
626    cache_preload_simple
627.endif
628    process_pixblock_tail
629    pixinterleave dst_w_bpp, dst_w_basereg
630.irp chunk_size, 16, 8, 4, 2, 1
631.if pixblock_size > chunk_size
632    tst         W, #chunk_size
633    beq         1f
634.if dst_aligned_flag != 0
635    pixst_a     chunk_size, dst_w_bpp, dst_w_basereg, DST_W
636.else
637    pixst       chunk_size, dst_w_bpp, dst_w_basereg, DST_W
638.endif
6391:
640.endif
641.endr
6422:
643.endm
644
645/*
646 * Macro, which performs all the needed operations to switch to the next
647 * scanline and start the next loop iteration unless all the scanlines
648 * are already processed.
649 */
650.macro advance_to_next_scanline start_of_loop_label
651.if regs_shortage
652    ldrd        W, [sp] /* load W and H (width and height) from stack */
653.else
654    mov         W, ORIG_W
655.endif
656    add         DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
657.if src_bpp != 0
658    add         SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
659.endif
660.if mask_bpp != 0
661    add         MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
662.endif
663.if (dst_w_bpp != 24)
664    sub         DST_W, DST_W, W, lsl #dst_bpp_shift
665.endif
666.if (src_bpp != 24) && (src_bpp != 0)
667    sub         SRC, SRC, W, lsl #src_bpp_shift
668.endif
669.if (mask_bpp != 24) && (mask_bpp != 0)
670    sub         MASK, MASK, W, lsl #mask_bpp_shift
671.endif
672    subs        H, H, #1
673    mov         DST_R, DST_W
674.if regs_shortage
675    str         H, [sp, #4] /* save updated height to stack */
676.endif
677    bge         start_of_loop_label
678.endm
679
680/*
681 * Registers are allocated in the following way by default:
682 * d0, d1, d2, d3     - reserved for loading source pixel data
683 * d4, d5, d6, d7     - reserved for loading destination pixel data
684 * d24, d25, d26, d27 - reserved for loading mask pixel data
685 * d28, d29, d30, d31 - final destination pixel data for writeback to memory
686 */
687.macro generate_composite_function fname, \
688                                   src_bpp_, \
689                                   mask_bpp_, \
690                                   dst_w_bpp_, \
691                                   flags, \
692                                   pixblock_size_, \
693                                   prefetch_distance, \
694                                   init, \
695                                   cleanup, \
696                                   process_pixblock_head, \
697                                   process_pixblock_tail, \
698                                   process_pixblock_tail_head, \
699                                   dst_w_basereg_ = 28, \
700                                   dst_r_basereg_ = 4, \
701                                   src_basereg_   = 0, \
702                                   mask_basereg_  = 24
703
704    pixman_asm_function fname
705
706    push        {r4-r12, lr}        /* save all registers */
707
708/*
709 * Select prefetch type for this function. If prefetch distance is
710 * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch
711 * has to be used instead of ADVANCED.
712 */
713    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
714.if prefetch_distance == 0
715    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
716.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
717        ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
718    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
719.endif
720
721/*
722 * Make some macro arguments globally visible and accessible
723 * from other macros
724 */
725    .set src_bpp, src_bpp_
726    .set mask_bpp, mask_bpp_
727    .set dst_w_bpp, dst_w_bpp_
728    .set pixblock_size, pixblock_size_
729    .set dst_w_basereg, dst_w_basereg_
730    .set dst_r_basereg, dst_r_basereg_
731    .set src_basereg, src_basereg_
732    .set mask_basereg, mask_basereg_
733
734    .macro pixld_src x:vararg
735        pixld x
736    .endm
737    .macro fetch_src_pixblock
738        pixld_src   pixblock_size, src_bpp, \
739                    (src_basereg - pixblock_size * src_bpp / 64), SRC
740    .endm
741/*
742 * Assign symbolic names to registers
743 */
744    W           .req        r0      /* width (is updated during processing) */
745    H           .req        r1      /* height (is updated during processing) */
746    DST_W       .req        r2      /* destination buffer pointer for writes */
747    DST_STRIDE  .req        r3      /* destination image stride */
748    SRC         .req        r4      /* source buffer pointer */
749    SRC_STRIDE  .req        r5      /* source image stride */
750    DST_R       .req        r6      /* destination buffer pointer for reads */
751
752    MASK        .req        r7      /* mask pointer */
753    MASK_STRIDE .req        r8      /* mask stride */
754
755    PF_CTL      .req        r9      /* combined lines counter and prefetch */
756                                    /* distance increment counter */
757    PF_X        .req        r10     /* pixel index in a scanline for current */
758                                    /* pretetch position */
759    PF_SRC      .req        r11     /* pointer to source scanline start */
760                                    /* for prefetch purposes */
761    PF_DST      .req        r12     /* pointer to destination scanline start */
762                                    /* for prefetch purposes */
763    PF_MASK     .req        r14     /* pointer to mask scanline start */
764                                    /* for prefetch purposes */
765/*
766 * Check whether we have enough registers for all the local variables.
767 * If we don't have enough registers, original width and height are
768 * kept on top of stack (and 'regs_shortage' variable is set to indicate
769 * this for the rest of code). Even if there are enough registers, the
770 * allocation scheme may be a bit different depending on whether source
771 * or mask is not used.
772 */
773.if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED)
774    ORIG_W      .req        r10     /* saved original width */
775    DUMMY       .req        r12     /* temporary register */
776    .set        regs_shortage, 0
777.elseif mask_bpp == 0
778    ORIG_W      .req        r7      /* saved original width */
779    DUMMY       .req        r8      /* temporary register */
780    .set        regs_shortage, 0
781.elseif src_bpp == 0
782    ORIG_W      .req        r4      /* saved original width */
783    DUMMY       .req        r5      /* temporary register */
784    .set        regs_shortage, 0
785.else
786    ORIG_W      .req        r1      /* saved original width */
787    DUMMY       .req        r1      /* temporary register */
788    .set        regs_shortage, 1
789.endif
790
791    .set mask_bpp_shift, -1
792.if src_bpp == 32
793    .set src_bpp_shift, 2
794.elseif src_bpp == 24
795    .set src_bpp_shift, 0
796.elseif src_bpp == 16
797    .set src_bpp_shift, 1
798.elseif src_bpp == 8
799    .set src_bpp_shift, 0
800.elseif src_bpp == 0
801    .set src_bpp_shift, -1
802.else
803    .error "requested src bpp (src_bpp) is not supported"
804.endif
805.if mask_bpp == 32
806    .set mask_bpp_shift, 2
807.elseif mask_bpp == 24
808    .set mask_bpp_shift, 0
809.elseif mask_bpp == 8
810    .set mask_bpp_shift, 0
811.elseif mask_bpp == 0
812    .set mask_bpp_shift, -1
813.else
814    .error "requested mask bpp (mask_bpp) is not supported"
815.endif
816.if dst_w_bpp == 32
817    .set dst_bpp_shift, 2
818.elseif dst_w_bpp == 24
819    .set dst_bpp_shift, 0
820.elseif dst_w_bpp == 16
821    .set dst_bpp_shift, 1
822.elseif dst_w_bpp == 8
823    .set dst_bpp_shift, 0
824.else
825    .error "requested dst bpp (dst_w_bpp) is not supported"
826.endif
827
828.if (((flags) & FLAG_DST_READWRITE) != 0)
829    .set dst_r_bpp, dst_w_bpp
830.else
831    .set dst_r_bpp, 0
832.endif
833.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
834    .set DEINTERLEAVE_32BPP_ENABLED, 1
835.else
836    .set DEINTERLEAVE_32BPP_ENABLED, 0
837.endif
838
839.if prefetch_distance < 0 || prefetch_distance > 15
840    .error "invalid prefetch distance (prefetch_distance)"
841.endif
842
843.if src_bpp > 0
844    ldr         SRC, [sp, #40]
845.endif
846.if mask_bpp > 0
847    ldr         MASK, [sp, #48]
848.endif
849    PF mov      PF_X, #0
850.if src_bpp > 0
851    ldr         SRC_STRIDE, [sp, #44]
852.endif
853.if mask_bpp > 0
854    ldr         MASK_STRIDE, [sp, #52]
855.endif
856    mov         DST_R, DST_W
857
858.if src_bpp == 24
859    sub         SRC_STRIDE, SRC_STRIDE, W
860    sub         SRC_STRIDE, SRC_STRIDE, W, lsl #1
861.endif
862.if mask_bpp == 24
863    sub         MASK_STRIDE, MASK_STRIDE, W
864    sub         MASK_STRIDE, MASK_STRIDE, W, lsl #1
865.endif
866.if dst_w_bpp == 24
867    sub         DST_STRIDE, DST_STRIDE, W
868    sub         DST_STRIDE, DST_STRIDE, W, lsl #1
869.endif
870
871/*
872 * Setup advanced prefetcher initial state
873 */
874    PF mov      PF_SRC, SRC
875    PF mov      PF_DST, DST_R
876    PF mov      PF_MASK, MASK
877    /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
878    PF mov      PF_CTL, H, lsl #4
879    PF add      PF_CTL, #(prefetch_distance - 0x10)
880
881    init
882.if regs_shortage
883    push        {r0, r1}
884.endif
885    subs        H, H, #1
886.if regs_shortage
887    str         H, [sp, #4] /* save updated height to stack */
888.else
889    mov         ORIG_W, W
890.endif
891    blt         9f
892    cmp         W, #(pixblock_size * 2)
893    blt         8f
894/*
895 * This is the start of the pipelined loop, which if optimized for
896 * long scanlines
897 */
8980:
899    ensure_destination_ptr_alignment process_pixblock_head, \
900                                     process_pixblock_tail, \
901                                     process_pixblock_tail_head
902
903    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
904    pixld_a     pixblock_size, dst_r_bpp, \
905                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
906    fetch_src_pixblock
907    pixld       pixblock_size, mask_bpp, \
908                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
909    PF add      PF_X, PF_X, #pixblock_size
910    process_pixblock_head
911    cache_preload 0, pixblock_size
912    cache_preload_simple
913    subs        W, W, #(pixblock_size * 2)
914    blt         2f
9151:
916    process_pixblock_tail_head
917    cache_preload_simple
918    subs        W, W, #pixblock_size
919    bge         1b
9202:
921    process_pixblock_tail
922    pixst_a     pixblock_size, dst_w_bpp, \
923                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
924
925    /* Process the remaining trailing pixels in the scanline */
926    process_trailing_pixels 1, 1, \
927                            process_pixblock_head, \
928                            process_pixblock_tail, \
929                            process_pixblock_tail_head
930    advance_to_next_scanline 0b
931
932.if regs_shortage
933    pop         {r0, r1}
934.endif
935    cleanup
936    pop         {r4-r12, pc}  /* exit */
937/*
938 * This is the start of the loop, designed to process images with small width
939 * (less than pixblock_size * 2 pixels). In this case neither pipelining
940 * nor prefetch are used.
941 */
9428:
943    /* Process exactly pixblock_size pixels if needed */
944    tst         W, #pixblock_size
945    beq         1f
946    pixld       pixblock_size, dst_r_bpp, \
947                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
948    fetch_src_pixblock
949    pixld       pixblock_size, mask_bpp, \
950                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
951    process_pixblock_head
952    process_pixblock_tail
953    pixst       pixblock_size, dst_w_bpp, \
954                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
9551:
956    /* Process the remaining trailing pixels in the scanline */
957    process_trailing_pixels 0, 0, \
958                            process_pixblock_head, \
959                            process_pixblock_tail, \
960                            process_pixblock_tail_head
961    advance_to_next_scanline 8b
9629:
963.if regs_shortage
964    pop         {r0, r1}
965.endif
966    cleanup
967    pop         {r4-r12, pc}  /* exit */
968
969    .purgem     fetch_src_pixblock
970    .purgem     pixld_src
971
972    .unreq      SRC
973    .unreq      MASK
974    .unreq      DST_R
975    .unreq      DST_W
976    .unreq      ORIG_W
977    .unreq      W
978    .unreq      H
979    .unreq      SRC_STRIDE
980    .unreq      DST_STRIDE
981    .unreq      MASK_STRIDE
982    .unreq      PF_CTL
983    .unreq      PF_X
984    .unreq      PF_SRC
985    .unreq      PF_DST
986    .unreq      PF_MASK
987    .unreq      DUMMY
988    .endfunc
989.endm
990
991/*
992 * A simplified variant of function generation template for a single
993 * scanline processing (for implementing pixman combine functions)
994 */
995.macro generate_composite_function_scanline        use_nearest_scaling, \
996                                                   fname, \
997                                                   src_bpp_, \
998                                                   mask_bpp_, \
999                                                   dst_w_bpp_, \
1000                                                   flags, \
1001                                                   pixblock_size_, \
1002                                                   init, \
1003                                                   cleanup, \
1004                                                   process_pixblock_head, \
1005                                                   process_pixblock_tail, \
1006                                                   process_pixblock_tail_head, \
1007                                                   dst_w_basereg_ = 28, \
1008                                                   dst_r_basereg_ = 4, \
1009                                                   src_basereg_   = 0, \
1010                                                   mask_basereg_  = 24
1011
1012    pixman_asm_function fname
1013
1014    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
1015/*
1016 * Make some macro arguments globally visible and accessible
1017 * from other macros
1018 */
1019    .set src_bpp, src_bpp_
1020    .set mask_bpp, mask_bpp_
1021    .set dst_w_bpp, dst_w_bpp_
1022    .set pixblock_size, pixblock_size_
1023    .set dst_w_basereg, dst_w_basereg_
1024    .set dst_r_basereg, dst_r_basereg_
1025    .set src_basereg, src_basereg_
1026    .set mask_basereg, mask_basereg_
1027
1028.if use_nearest_scaling != 0
1029    /*
1030     * Assign symbolic names to registers for nearest scaling
1031     */
1032    W           .req        r0
1033    DST_W       .req        r1
1034    SRC         .req        r2
1035    VX          .req        r3
1036    UNIT_X      .req        ip
1037    MASK        .req        lr
1038    TMP1        .req        r4
1039    TMP2        .req        r5
1040    DST_R       .req        r6
1041    SRC_WIDTH_FIXED .req        r7
1042
1043    .macro pixld_src x:vararg
1044        pixld_s x
1045    .endm
1046
1047    ldr         UNIT_X, [sp]
1048    push        {r4-r8, lr}
1049    ldr         SRC_WIDTH_FIXED, [sp, #(24 + 4)]
1050    .if mask_bpp != 0
1051    ldr         MASK, [sp, #(24 + 8)]
1052    .endif
1053.else
1054    /*
1055     * Assign symbolic names to registers
1056     */
1057    W           .req        r0      /* width (is updated during processing) */
1058    DST_W       .req        r1      /* destination buffer pointer for writes */
1059    SRC         .req        r2      /* source buffer pointer */
1060    DST_R       .req        ip      /* destination buffer pointer for reads */
1061    MASK        .req        r3      /* mask pointer */
1062
1063    .macro pixld_src x:vararg
1064        pixld x
1065    .endm
1066.endif
1067
1068.if (((flags) & FLAG_DST_READWRITE) != 0)
1069    .set dst_r_bpp, dst_w_bpp
1070.else
1071    .set dst_r_bpp, 0
1072.endif
1073.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
1074    .set DEINTERLEAVE_32BPP_ENABLED, 1
1075.else
1076    .set DEINTERLEAVE_32BPP_ENABLED, 0
1077.endif
1078
1079    .macro fetch_src_pixblock
1080        pixld_src   pixblock_size, src_bpp, \
1081                    (src_basereg - pixblock_size * src_bpp / 64), SRC
1082    .endm
1083
1084    init
1085    mov         DST_R, DST_W
1086
1087    cmp         W, #pixblock_size
1088    blt         8f
1089
1090    ensure_destination_ptr_alignment process_pixblock_head, \
1091                                     process_pixblock_tail, \
1092                                     process_pixblock_tail_head
1093
1094    subs        W, W, #pixblock_size
1095    blt         7f
1096
1097    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
1098    pixld_a     pixblock_size, dst_r_bpp, \
1099                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
1100    fetch_src_pixblock
1101    pixld       pixblock_size, mask_bpp, \
1102                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
1103    process_pixblock_head
1104    subs        W, W, #pixblock_size
1105    blt         2f
11061:
1107    process_pixblock_tail_head
1108    subs        W, W, #pixblock_size
1109    bge         1b
11102:
1111    process_pixblock_tail
1112    pixst_a     pixblock_size, dst_w_bpp, \
1113                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
11147:
1115    /* Process the remaining trailing pixels in the scanline (dst aligned) */
1116    process_trailing_pixels 0, 1, \
1117                            process_pixblock_head, \
1118                            process_pixblock_tail, \
1119                            process_pixblock_tail_head
1120
1121    cleanup
1122.if use_nearest_scaling != 0
1123    pop         {r4-r8, pc}  /* exit */
1124.else
1125    bx          lr  /* exit */
1126.endif
11278:
1128    /* Process the remaining trailing pixels in the scanline (dst unaligned) */
1129    process_trailing_pixels 0, 0, \
1130                            process_pixblock_head, \
1131                            process_pixblock_tail, \
1132                            process_pixblock_tail_head
1133
1134    cleanup
1135
1136.if use_nearest_scaling != 0
1137    pop         {r4-r8, pc}  /* exit */
1138
1139    .unreq      DST_R
1140    .unreq      SRC
1141    .unreq      W
1142    .unreq      VX
1143    .unreq      UNIT_X
1144    .unreq      TMP1
1145    .unreq      TMP2
1146    .unreq      DST_W
1147    .unreq      MASK
1148    .unreq      SRC_WIDTH_FIXED
1149
1150.else
1151    bx          lr  /* exit */
1152
1153    .unreq      SRC
1154    .unreq      MASK
1155    .unreq      DST_R
1156    .unreq      DST_W
1157    .unreq      W
1158.endif
1159
1160    .purgem     fetch_src_pixblock
1161    .purgem     pixld_src
1162
1163    .endfunc
1164.endm
1165
1166.macro generate_composite_function_single_scanline x:vararg
1167    generate_composite_function_scanline 0, x
1168.endm
1169
1170.macro generate_composite_function_nearest_scanline x:vararg
1171    generate_composite_function_scanline 1, x
1172.endm
1173
1174/* Default prologue/epilogue, nothing special needs to be done */
1175
1176.macro default_init
1177.endm
1178
1179.macro default_cleanup
1180.endm
1181
1182/*
1183 * Prologue/epilogue variant which additionally saves/restores d8-d15
1184 * registers (they need to be saved/restored by callee according to ABI).
1185 * This is required if the code needs to use all the NEON registers.
1186 */
1187
1188.macro default_init_need_all_regs
1189    vpush       {d8-d15}
1190.endm
1191
1192.macro default_cleanup_need_all_regs
1193    vpop        {d8-d15}
1194.endm
1195
1196/******************************************************************************/
1197
1198/*
1199 * Conversion of 8 r5g6b6 pixels packed in 128-bit register (in)
1200 * into a planar a8r8g8b8 format (with a, r, g, b color components
1201 * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
1202 *
1203 * Warning: the conversion is destructive and the original
1204 *          value (in) is lost.
1205 */
1206.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
1207    vshrn.u16   out_r, in,    #8
1208    vshrn.u16   out_g, in,    #3
1209    vsli.u16    in,    in,    #5
1210    vmov.u8     out_a, #255
1211    vsri.u8     out_r, out_r, #5
1212    vsri.u8     out_g, out_g, #6
1213    vshrn.u16   out_b, in,    #2
1214.endm
1215
1216.macro convert_0565_to_x888 in, out_r, out_g, out_b
1217    vshrn.u16   out_r, in,    #8
1218    vshrn.u16   out_g, in,    #3
1219    vsli.u16    in,    in,    #5
1220    vsri.u8     out_r, out_r, #5
1221    vsri.u8     out_g, out_g, #6
1222    vshrn.u16   out_b, in,    #2
1223.endm
1224
1225/*
1226 * Conversion from planar a8r8g8b8 format (with a, r, g, b color components
1227 * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b6
1228 * pixels packed in 128-bit register (out). Requires two temporary 128-bit
1229 * registers (tmp1, tmp2)
1230 */
1231.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
1232    vshll.u8    tmp1, in_g, #8
1233    vshll.u8    out, in_r, #8
1234    vshll.u8    tmp2, in_b, #8
1235    vsri.u16    out, tmp1, #5
1236    vsri.u16    out, tmp2, #11
1237.endm
1238
1239/*
1240 * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
1241 * returned in (out0, out1) registers pair. Requires one temporary
1242 * 64-bit register (tmp). 'out1' and 'in' may overlap, the original
1243 * value from 'in' is lost
1244 */
1245.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
1246    vshl.u16    out0, in,   #5  /* G top 6 bits */
1247    vshl.u16    tmp,  in,   #11 /* B top 5 bits */
1248    vsri.u16    in,   in,   #5  /* R is ready in top bits */
1249    vsri.u16    out0, out0, #6  /* G is ready in top bits */
1250    vsri.u16    tmp,  tmp,  #5  /* B is ready in top bits */
1251    vshr.u16    out1, in,   #8  /* R is in place */
1252    vsri.u16    out0, tmp,  #8  /* G & B is in place */
1253    vzip.u16    out0, out1      /* everything is in place */
1254.endm
1255
1256/* Global configuration options and preferences */
1257
1258/*
1259 * The code can optionally make use of unaligned memory accesses to improve
1260 * performance of handling leading/trailing pixels for each scanline.
1261 * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
1262 * example in linux if unaligned memory accesses are not configured to
1263 * generate.exceptions.
1264 */
1265.set RESPECT_STRICT_ALIGNMENT, 1
1266
1267/*
1268 * Set default prefetch type. There is a choice between the following options:
1269 *
1270 * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
1271 * as NOP to workaround some HW bugs or for whatever other reason)
1272 *
1273 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
1274 * advanced prefetch intruduces heavy overhead)
1275 *
1276 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
1277 * which can run ARM and NEON instructions simultaneously so that extra ARM
1278 * instructions do not add (many) extra cycles, but improve prefetch efficiency)
1279 *
1280 * Note: some types of function can't support advanced prefetch and fallback
1281 *       to simple one (those which handle 24bpp pixels)
1282 */
1283.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
1284
1285/* Prefetch distance in pixels for simple prefetch */
1286.set PREFETCH_DISTANCE_SIMPLE, 64
1287
1288/*
1289 * Implementation of pixman_composite_over_8888_0565_asm_neon
1290 *
1291 * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
1292 * performs OVER compositing operation. Function fast_composite_over_8888_0565
1293 * from pixman-fast-path.c does the same in C and can be used as a reference.
1294 *
1295 * First we need to have some NEON assembly code which can do the actual
1296 * operation on the pixels and provide it to the template macro.
1297 *
1298 * Template macro quite conveniently takes care of emitting all the necessary
1299 * code for memory reading and writing (including quite tricky cases of
1300 * handling unaligned leading/trailing pixels), so we only need to deal with
1301 * the data in NEON registers.
1302 *
1303 * NEON registers allocation in general is recommented to be the following:
1304 * d0,  d1,  d2,  d3  - contain loaded source pixel data
1305 * d4,  d5,  d6,  d7  - contain loaded destination pixels (if they are needed)
1306 * d24, d25, d26, d27 - contain loading mask pixel data (if mask is used)
1307 * d28, d29, d30, d31 - place for storing the result (destination pixels)
1308 *
1309 * As can be seen above, four 64-bit NEON registers are used for keeping
1310 * intermediate pixel data and up to 8 pixels can be processed in one step
1311 * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
1312 *
1313 * This particular function uses the following registers allocation:
1314 * d0,  d1,  d2,  d3  - contain loaded source pixel data
1315 * d4,  d5            - contain loaded destination pixels (they are needed)
1316 * d28, d29           - place for storing the result (destination pixels)
1317 */
1318
1319/*
1320 * Step one. We need to have some code to do some arithmetics on pixel data.
1321 * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
1322 * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
1323 * perform all the needed calculations and write the result to {d28, d29}.
1324 * The rationale for having two macros and not just one will be explained
1325 * later. In practice, any single monolitic function which does the work can
1326 * be split into two parts in any arbitrary way without affecting correctness.
1327 *
1328 * There is one special trick here too. Common template macro can optionally
1329 * make our life a bit easier by doing R, G, B, A color components
1330 * deinterleaving for 32bpp pixel formats (and this feature is used in
1331 * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
1332 * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
1333 * actually use d0 register for blue channel (a vector of eight 8-bit
1334 * values), d1 register for green, d2 for red and d3 for alpha. This
1335 * simple conversion can be also done with a few NEON instructions:
1336 *
1337 * Packed to planar conversion:
1338 *  vuzp.8 d0, d1
1339 *  vuzp.8 d2, d3
1340 *  vuzp.8 d1, d3
1341 *  vuzp.8 d0, d2
1342 *
1343 * Planar to packed conversion:
1344 *  vzip.8 d0, d2
1345 *  vzip.8 d1, d3
1346 *  vzip.8 d2, d3
1347 *  vzip.8 d0, d1
1348 *
1349 * But pixel can be loaded directly in planar format using VLD4.8 NEON
1350 * instruction. It is 1 cycle slower than VLD1.32, so this is not always
1351 * desirable, that's why deinterleaving is optional.
1352 *
1353 * But anyway, here is the code:
1354 */
1355.macro pixman_composite_over_8888_0565_process_pixblock_head
1356    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
1357       and put data into d6 - red, d7 - green, d30 - blue */
1358    vshrn.u16   d6, q2, #8
1359    vshrn.u16   d7, q2, #3
1360    vsli.u16    q2, q2, #5
1361    vsri.u8     d6, d6, #5
1362    vmvn.8      d3, d3      /* invert source alpha */
1363    vsri.u8     d7, d7, #6
1364    vshrn.u16   d30, q2, #2
1365    /* now do alpha blending, storing results in 8-bit planar format
1366       into d16 - red, d19 - green, d18 - blue */
1367    vmull.u8    q10, d3, d6
1368    vmull.u8    q11, d3, d7
1369    vmull.u8    q12, d3, d30
1370    vrshr.u16   q13, q10, #8
1371    vrshr.u16   q3, q11, #8
1372    vrshr.u16   q15, q12, #8
1373    vraddhn.u16 d20, q10, q13
1374    vraddhn.u16 d23, q11, q3
1375    vraddhn.u16 d22, q12, q15
1376.endm
1377
1378.macro pixman_composite_over_8888_0565_process_pixblock_tail
1379    /* ... continue alpha blending */
1380    vqadd.u8    d16, d2, d20
1381    vqadd.u8    q9, q0, q11
1382    /* convert the result to r5g6b5 and store it into {d28, d29} */
1383    vshll.u8    q14, d16, #8
1384    vshll.u8    q8, d19, #8
1385    vshll.u8    q9, d18, #8
1386    vsri.u16    q14, q8, #5
1387    vsri.u16    q14, q9, #11
1388.endm
1389
1390/*
1391 * OK, now we got almost everything that we need. Using the above two
1392 * macros, the work can be done right. But now we want to optimize
1393 * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
1394 * a lot from good code scheduling and software pipelining.
1395 *
1396 * Let's construct some code, which will run in the core main loop.
1397 * Some pseudo-code of the main loop will look like this:
1398 *   head
1399 *   while (...) {
1400 *     tail
1401 *     head
1402 *   }
1403 *   tail
1404 *
1405 * It may look a bit weird, but this setup allows to hide instruction
1406 * latencies better and also utilize dual-issue capability more
1407 * efficiently (make pairs of load-store and ALU instructions).
1408 *
1409 * So what we need now is a '*_tail_head' macro, which will be used
1410 * in the core main loop. A trivial straightforward implementation
1411 * of this macro would look like this:
1412 *
1413 *   pixman_composite_over_8888_0565_process_pixblock_tail
1414 *   vst1.16     {d28, d29}, [DST_W, :128]!
1415 *   vld1.16     {d4, d5}, [DST_R, :128]!
1416 *   vld4.32     {d0, d1, d2, d3}, [SRC]!
1417 *   pixman_composite_over_8888_0565_process_pixblock_head
1418 *   cache_preload 8, 8
1419 *
1420 * Now it also got some VLD/VST instructions. We simply can't move from
1421 * processing one block of pixels to the other one with just arithmetics.
1422 * The previously processed data needs to be written to memory and new
1423 * data needs to be fetched. Fortunately, this main loop does not deal
1424 * with partial leading/trailing pixels and can load/store a full block
1425 * of pixels in a bulk. Additionally, destination buffer is already
1426 * 16 bytes aligned here (which is good for performance).
1427 *
1428 * New things here are DST_R, DST_W, SRC and MASK identifiers. These
1429 * are the aliases for ARM registers which are used as pointers for
1430 * accessing data. We maintain separate pointers for reading and writing
1431 * destination buffer (DST_R and DST_W).
1432 *
1433 * Another new thing is 'cache_preload' macro. It is used for prefetching
1434 * data into CPU L2 cache and improve performance when dealing with large
1435 * images which are far larger than cache size. It uses one argument
1436 * (actually two, but they need to be the same here) - number of pixels
1437 * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
1438 * details about this macro. Moreover, if good performance is needed
1439 * the code from this macro needs to be copied into '*_tail_head' macro
1440 * and mixed with the rest of code for optimal instructions scheduling.
1441 * We are actually doing it below.
1442 *
1443 * Now after all the explanations, here is the optimized code.
1444 * Different instruction streams (originaling from '*_head', '*_tail'
1445 * and 'cache_preload' macro) use different indentation levels for
1446 * better readability. Actually taking the code from one of these
1447 * indentation levels and ignoring a few VLD/VST instructions would
1448 * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
1449 * macro!
1450 */
1451
1452#if 1
1453
1454.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
1455        vqadd.u8    d16, d2, d20
1456    vld1.16     {d4, d5}, [DST_R, :128]!
1457        vqadd.u8    q9, q0, q11
1458    vshrn.u16   d6, q2, #8
1459    fetch_src_pixblock
1460    vshrn.u16   d7, q2, #3
1461    vsli.u16    q2, q2, #5
1462        vshll.u8    q14, d16, #8
1463                                    PF add PF_X, PF_X, #8
1464        vshll.u8    q8, d19, #8
1465                                    PF tst PF_CTL, #0xF
1466    vsri.u8     d6, d6, #5
1467                                    PF addne PF_X, PF_X, #8
1468    vmvn.8      d3, d3
1469                                    PF subne PF_CTL, PF_CTL, #1
1470    vsri.u8     d7, d7, #6
1471    vshrn.u16   d30, q2, #2
1472    vmull.u8    q10, d3, d6
1473                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
1474    vmull.u8    q11, d3, d7
1475    vmull.u8    q12, d3, d30
1476                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
1477        vsri.u16    q14, q8, #5
1478                                    PF cmp PF_X, ORIG_W
1479        vshll.u8    q9, d18, #8
1480    vrshr.u16   q13, q10, #8
1481                                    PF subge PF_X, PF_X, ORIG_W
1482    vrshr.u16   q3, q11, #8
1483    vrshr.u16   q15, q12, #8
1484                                    PF subges PF_CTL, PF_CTL, #0x10
1485        vsri.u16    q14, q9, #11
1486                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
1487    vraddhn.u16 d20, q10, q13
1488    vraddhn.u16 d23, q11, q3
1489                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1490    vraddhn.u16 d22, q12, q15
1491        vst1.16     {d28, d29}, [DST_W, :128]!
1492.endm
1493
1494#else
1495
1496/* If we did not care much about the performance, we would just use this... */
1497.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
1498    pixman_composite_over_8888_0565_process_pixblock_tail
1499    vst1.16     {d28, d29}, [DST_W, :128]!
1500    vld1.16     {d4, d5}, [DST_R, :128]!
1501    fetch_src_pixblock
1502    pixman_composite_over_8888_0565_process_pixblock_head
1503    cache_preload 8, 8
1504.endm
1505
1506#endif
1507
1508/*
1509 * And now the final part. We are using 'generate_composite_function' macro
1510 * to put all the stuff together. We are specifying the name of the function
1511 * which we want to get, number of bits per pixel for the source, mask and
1512 * destination (0 if unused, like mask in this case). Next come some bit
1513 * flags:
1514 *   FLAG_DST_READWRITE      - tells that the destination buffer is both read
1515 *                             and written, for write-only buffer we would use
1516 *                             FLAG_DST_WRITEONLY flag instead
1517 *   FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
1518 *                             and separate color channels for 32bpp format.
1519 * The next things are:
1520 *  - the number of pixels processed per iteration (8 in this case, because
1521 *    that's the maximum what can fit into four 64-bit NEON registers).
1522 *  - prefetch distance, measured in pixel blocks. In this case it is 5 times
1523 *    by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
1524 *    prefetch distance can be selected by running some benchmarks.
1525 *
1526 * After that we specify some macros, these are 'default_init',
1527 * 'default_cleanup' here which are empty (but it is possible to have custom
1528 * init/cleanup macros to be able to save/restore some extra NEON registers
1529 * like d8-d15 or do anything else) followed by
1530 * 'pixman_composite_over_8888_0565_process_pixblock_head',
1531 * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
1532 * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
1533 * which we got implemented above.
1534 *
1535 * The last part is the NEON registers allocation scheme.
1536 */
1537generate_composite_function \
1538    pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
1539    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1540    8, /* number of pixels, processed in a single block */ \
1541    5, /* prefetch distance */ \
1542    default_init, \
1543    default_cleanup, \
1544    pixman_composite_over_8888_0565_process_pixblock_head, \
1545    pixman_composite_over_8888_0565_process_pixblock_tail, \
1546    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
1547    28, /* dst_w_basereg */ \
1548    4,  /* dst_r_basereg */ \
1549    0,  /* src_basereg   */ \
1550    24  /* mask_basereg  */
1551
1552/******************************************************************************/
1553
1554.macro pixman_composite_over_n_0565_process_pixblock_head
1555    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
1556       and put data into d6 - red, d7 - green, d30 - blue */
1557    vshrn.u16   d6, q2, #8
1558    vshrn.u16   d7, q2, #3
1559    vsli.u16    q2, q2, #5
1560    vsri.u8     d6, d6, #5
1561    vsri.u8     d7, d7, #6
1562    vshrn.u16   d30, q2, #2
1563    /* now do alpha blending, storing results in 8-bit planar format
1564       into d16 - red, d19 - green, d18 - blue */
1565    vmull.u8    q10, d3, d6
1566    vmull.u8    q11, d3, d7
1567    vmull.u8    q12, d3, d30
1568    vrshr.u16   q13, q10, #8
1569    vrshr.u16   q3, q11, #8
1570    vrshr.u16   q15, q12, #8
1571    vraddhn.u16 d20, q10, q13
1572    vraddhn.u16 d23, q11, q3
1573    vraddhn.u16 d22, q12, q15
1574.endm
1575
1576.macro pixman_composite_over_n_0565_process_pixblock_tail
1577    /* ... continue alpha blending */
1578    vqadd.u8    d16, d2, d20
1579    vqadd.u8    q9, q0, q11
1580    /* convert the result to r5g6b5 and store it into {d28, d29} */
1581    vshll.u8    q14, d16, #8
1582    vshll.u8    q8, d19, #8
1583    vshll.u8    q9, d18, #8
1584    vsri.u16    q14, q8, #5
1585    vsri.u16    q14, q9, #11
1586.endm
1587
1588/* TODO: expand macros and do better instructions scheduling */
1589.macro pixman_composite_over_n_0565_process_pixblock_tail_head
1590    pixman_composite_over_n_0565_process_pixblock_tail
1591    vld1.16     {d4, d5}, [DST_R, :128]!
1592    vst1.16     {d28, d29}, [DST_W, :128]!
1593    pixman_composite_over_n_0565_process_pixblock_head
1594    cache_preload 8, 8
1595.endm
1596
1597.macro pixman_composite_over_n_0565_init
1598    add         DUMMY, sp, #ARGS_STACK_OFFSET
1599    vld1.32     {d3[0]}, [DUMMY]
1600    vdup.8      d0, d3[0]
1601    vdup.8      d1, d3[1]
1602    vdup.8      d2, d3[2]
1603    vdup.8      d3, d3[3]
1604    vmvn.8      d3, d3      /* invert source alpha */
1605.endm
1606
1607generate_composite_function \
1608    pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
1609    FLAG_DST_READWRITE, \
1610    8, /* number of pixels, processed in a single block */ \
1611    5, /* prefetch distance */ \
1612    pixman_composite_over_n_0565_init, \
1613    default_cleanup, \
1614    pixman_composite_over_n_0565_process_pixblock_head, \
1615    pixman_composite_over_n_0565_process_pixblock_tail, \
1616    pixman_composite_over_n_0565_process_pixblock_tail_head, \
1617    28, /* dst_w_basereg */ \
1618    4,  /* dst_r_basereg */ \
1619    0,  /* src_basereg   */ \
1620    24  /* mask_basereg  */
1621
1622/******************************************************************************/
1623
1624.macro pixman_composite_src_8888_0565_process_pixblock_head
1625    vshll.u8    q8, d1, #8
1626    vshll.u8    q14, d2, #8
1627    vshll.u8    q9, d0, #8
1628.endm
1629
1630.macro pixman_composite_src_8888_0565_process_pixblock_tail
1631    vsri.u16    q14, q8, #5
1632    vsri.u16    q14, q9, #11
1633.endm
1634
1635.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
1636        vsri.u16    q14, q8, #5
1637                                    PF add PF_X, PF_X, #8
1638                                    PF tst PF_CTL, #0xF
1639    fetch_src_pixblock
1640                                    PF addne PF_X, PF_X, #8
1641                                    PF subne PF_CTL, PF_CTL, #1
1642        vsri.u16    q14, q9, #11
1643                                    PF cmp PF_X, ORIG_W
1644                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
1645    vshll.u8    q8, d1, #8
1646        vst1.16     {d28, d29}, [DST_W, :128]!
1647                                    PF subge PF_X, PF_X, ORIG_W
1648                                    PF subges PF_CTL, PF_CTL, #0x10
1649    vshll.u8    q14, d2, #8
1650                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
1651    vshll.u8    q9, d0, #8
1652.endm
1653
1654generate_composite_function \
1655    pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
1656    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
1657    8, /* number of pixels, processed in a single block */ \
1658    10, /* prefetch distance */ \
1659    default_init, \
1660    default_cleanup, \
1661    pixman_composite_src_8888_0565_process_pixblock_head, \
1662    pixman_composite_src_8888_0565_process_pixblock_tail, \
1663    pixman_composite_src_8888_0565_process_pixblock_tail_head
1664
1665/******************************************************************************/
1666
1667.macro pixman_composite_src_0565_8888_process_pixblock_head
1668    vshrn.u16   d30, q0, #8
1669    vshrn.u16   d29, q0, #3
1670    vsli.u16    q0, q0, #5
1671    vmov.u8     d31, #255
1672    vsri.u8     d30, d30, #5
1673    vsri.u8     d29, d29, #6
1674    vshrn.u16   d28, q0, #2
1675.endm
1676
1677.macro pixman_composite_src_0565_8888_process_pixblock_tail
1678.endm
1679
1680/* TODO: expand macros and do better instructions scheduling */
1681.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
1682    pixman_composite_src_0565_8888_process_pixblock_tail
1683    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
1684    fetch_src_pixblock
1685    pixman_composite_src_0565_8888_process_pixblock_head
1686    cache_preload 8, 8
1687.endm
1688
1689generate_composite_function \
1690    pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
1691    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
1692    8, /* number of pixels, processed in a single block */ \
1693    10, /* prefetch distance */ \
1694    default_init, \
1695    default_cleanup, \
1696    pixman_composite_src_0565_8888_process_pixblock_head, \
1697    pixman_composite_src_0565_8888_process_pixblock_tail, \
1698    pixman_composite_src_0565_8888_process_pixblock_tail_head
1699
1700/******************************************************************************/
1701
1702.macro pixman_composite_add_8_8_process_pixblock_head
1703    vqadd.u8    q14, q0, q2
1704    vqadd.u8    q15, q1, q3
1705.endm
1706
1707.macro pixman_composite_add_8_8_process_pixblock_tail
1708.endm
1709
1710.macro pixman_composite_add_8_8_process_pixblock_tail_head
1711    fetch_src_pixblock
1712                                    PF add PF_X, PF_X, #32
1713                                    PF tst PF_CTL, #0xF
1714    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
1715                                    PF addne PF_X, PF_X, #32
1716                                    PF subne PF_CTL, PF_CTL, #1
1717        vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
1718                                    PF cmp PF_X, ORIG_W
1719                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
1720                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
1721                                    PF subge PF_X, PF_X, ORIG_W
1722                                    PF subges PF_CTL, PF_CTL, #0x10
1723    vqadd.u8    q14, q0, q2
1724                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
1725                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1726    vqadd.u8    q15, q1, q3
1727.endm
1728
1729generate_composite_function \
1730    pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
1731    FLAG_DST_READWRITE, \
1732    32, /* number of pixels, processed in a single block */ \
1733    10, /* prefetch distance */ \
1734    default_init, \
1735    default_cleanup, \
1736    pixman_composite_add_8_8_process_pixblock_head, \
1737    pixman_composite_add_8_8_process_pixblock_tail, \
1738    pixman_composite_add_8_8_process_pixblock_tail_head
1739
1740/******************************************************************************/
1741
1742.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
1743    fetch_src_pixblock
1744                                    PF add PF_X, PF_X, #8
1745                                    PF tst PF_CTL, #0xF
1746    vld1.32     {d4, d5, d6, d7}, [DST_R, :128]!
1747                                    PF addne PF_X, PF_X, #8
1748                                    PF subne PF_CTL, PF_CTL, #1
1749        vst1.32     {d28, d29, d30, d31}, [DST_W, :128]!
1750                                    PF cmp PF_X, ORIG_W
1751                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
1752                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
1753                                    PF subge PF_X, PF_X, ORIG_W
1754                                    PF subges PF_CTL, PF_CTL, #0x10
1755    vqadd.u8    q14, q0, q2
1756                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
1757                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1758    vqadd.u8    q15, q1, q3
1759.endm
1760
1761generate_composite_function \
1762    pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
1763    FLAG_DST_READWRITE, \
1764    8, /* number of pixels, processed in a single block */ \
1765    10, /* prefetch distance */ \
1766    default_init, \
1767    default_cleanup, \
1768    pixman_composite_add_8_8_process_pixblock_head, \
1769    pixman_composite_add_8_8_process_pixblock_tail, \
1770    pixman_composite_add_8888_8888_process_pixblock_tail_head
1771
1772generate_composite_function_single_scanline \
1773    pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
1774    FLAG_DST_READWRITE, \
1775    8, /* number of pixels, processed in a single block */ \
1776    default_init, \
1777    default_cleanup, \
1778    pixman_composite_add_8_8_process_pixblock_head, \
1779    pixman_composite_add_8_8_process_pixblock_tail, \
1780    pixman_composite_add_8888_8888_process_pixblock_tail_head
1781
1782/******************************************************************************/
1783
1784.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
1785    vmvn.8      d24, d3  /* get inverted alpha */
1786    /* do alpha blending */
1787    vmull.u8    q8, d24, d4
1788    vmull.u8    q9, d24, d5
1789    vmull.u8    q10, d24, d6
1790    vmull.u8    q11, d24, d7
1791.endm
1792
1793.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
1794    vrshr.u16   q14, q8, #8
1795    vrshr.u16   q15, q9, #8
1796    vrshr.u16   q12, q10, #8
1797    vrshr.u16   q13, q11, #8
1798    vraddhn.u16 d28, q14, q8
1799    vraddhn.u16 d29, q15, q9
1800    vraddhn.u16 d30, q12, q10
1801    vraddhn.u16 d31, q13, q11
1802.endm
1803
1804.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
1805    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
1806        vrshr.u16   q14, q8, #8
1807                                    PF add PF_X, PF_X, #8
1808                                    PF tst PF_CTL, #0xF
1809        vrshr.u16   q15, q9, #8
1810        vrshr.u16   q12, q10, #8
1811        vrshr.u16   q13, q11, #8
1812                                    PF addne PF_X, PF_X, #8
1813                                    PF subne PF_CTL, PF_CTL, #1
1814        vraddhn.u16 d28, q14, q8
1815        vraddhn.u16 d29, q15, q9
1816                                    PF cmp PF_X, ORIG_W
1817        vraddhn.u16 d30, q12, q10
1818        vraddhn.u16 d31, q13, q11
1819    fetch_src_pixblock
1820                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
1821    vmvn.8      d22, d3
1822                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
1823        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
1824                                    PF subge PF_X, PF_X, ORIG_W
1825    vmull.u8    q8, d22, d4
1826                                    PF subges PF_CTL, PF_CTL, #0x10
1827    vmull.u8    q9, d22, d5
1828                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
1829    vmull.u8    q10, d22, d6
1830                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1831    vmull.u8    q11, d22, d7
1832.endm
1833
1834generate_composite_function_single_scanline \
1835    pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
1836    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1837    8, /* number of pixels, processed in a single block */ \
1838    default_init, \
1839    default_cleanup, \
1840    pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
1841    pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
1842    pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
1843
1844/******************************************************************************/
1845
1846.macro pixman_composite_over_8888_8888_process_pixblock_head
1847    pixman_composite_out_reverse_8888_8888_process_pixblock_head
1848.endm
1849
1850.macro pixman_composite_over_8888_8888_process_pixblock_tail
1851    pixman_composite_out_reverse_8888_8888_process_pixblock_tail
1852    vqadd.u8    q14, q0, q14
1853    vqadd.u8    q15, q1, q15
1854.endm
1855
1856.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
1857    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
1858        vrshr.u16   q14, q8, #8
1859                                    PF add PF_X, PF_X, #8
1860                                    PF tst PF_CTL, #0xF
1861        vrshr.u16   q15, q9, #8
1862        vrshr.u16   q12, q10, #8
1863        vrshr.u16   q13, q11, #8
1864                                    PF addne PF_X, PF_X, #8
1865                                    PF subne PF_CTL, PF_CTL, #1
1866        vraddhn.u16 d28, q14, q8
1867        vraddhn.u16 d29, q15, q9
1868                                    PF cmp PF_X, ORIG_W
1869        vraddhn.u16 d30, q12, q10
1870        vraddhn.u16 d31, q13, q11
1871        vqadd.u8    q14, q0, q14
1872        vqadd.u8    q15, q1, q15
1873    fetch_src_pixblock
1874                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
1875    vmvn.8      d22, d3
1876                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
1877        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
1878                                    PF subge PF_X, PF_X, ORIG_W
1879    vmull.u8    q8, d22, d4
1880                                    PF subges PF_CTL, PF_CTL, #0x10
1881    vmull.u8    q9, d22, d5
1882                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
1883    vmull.u8    q10, d22, d6
1884                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1885    vmull.u8    q11, d22, d7
1886.endm
1887
1888generate_composite_function \
1889    pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
1890    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1891    8, /* number of pixels, processed in a single block */ \
1892    5, /* prefetch distance */ \
1893    default_init, \
1894    default_cleanup, \
1895    pixman_composite_over_8888_8888_process_pixblock_head, \
1896    pixman_composite_over_8888_8888_process_pixblock_tail, \
1897    pixman_composite_over_8888_8888_process_pixblock_tail_head
1898
1899generate_composite_function_single_scanline \
1900    pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
1901    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1902    8, /* number of pixels, processed in a single block */ \
1903    default_init, \
1904    default_cleanup, \
1905    pixman_composite_over_8888_8888_process_pixblock_head, \
1906    pixman_composite_over_8888_8888_process_pixblock_tail, \
1907    pixman_composite_over_8888_8888_process_pixblock_tail_head
1908
1909/******************************************************************************/
1910
1911.macro pixman_composite_over_n_8888_process_pixblock_head
1912    /* deinterleaved source pixels in {d0, d1, d2, d3} */
1913    /* inverted alpha in {d24} */
1914    /* destination pixels in {d4, d5, d6, d7} */
1915    vmull.u8    q8, d24, d4
1916    vmull.u8    q9, d24, d5
1917    vmull.u8    q10, d24, d6
1918    vmull.u8    q11, d24, d7
1919.endm
1920
1921.macro pixman_composite_over_n_8888_process_pixblock_tail
1922    vrshr.u16   q14, q8, #8
1923    vrshr.u16   q15, q9, #8
1924    vrshr.u16   q2, q10, #8
1925    vrshr.u16   q3, q11, #8
1926    vraddhn.u16 d28, q14, q8
1927    vraddhn.u16 d29, q15, q9
1928    vraddhn.u16 d30, q2, q10
1929    vraddhn.u16 d31, q3, q11
1930    vqadd.u8    q14, q0, q14
1931    vqadd.u8    q15, q1, q15
1932.endm
1933
1934.macro pixman_composite_over_n_8888_process_pixblock_tail_head
1935        vrshr.u16   q14, q8, #8
1936        vrshr.u16   q15, q9, #8
1937        vrshr.u16   q2, q10, #8
1938        vrshr.u16   q3, q11, #8
1939        vraddhn.u16 d28, q14, q8
1940        vraddhn.u16 d29, q15, q9
1941        vraddhn.u16 d30, q2, q10
1942        vraddhn.u16 d31, q3, q11
1943    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
1944        vqadd.u8    q14, q0, q14
1945                                    PF add PF_X, PF_X, #8
1946                                    PF tst PF_CTL, #0x0F
1947                                    PF addne PF_X, PF_X, #8
1948                                    PF subne PF_CTL, PF_CTL, #1
1949        vqadd.u8    q15, q1, q15
1950                                    PF cmp PF_X, ORIG_W
1951    vmull.u8    q8, d24, d4
1952                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
1953    vmull.u8    q9, d24, d5
1954                                    PF subge PF_X, PF_X, ORIG_W
1955    vmull.u8    q10, d24, d6
1956                                    PF subges PF_CTL, PF_CTL, #0x10
1957    vmull.u8    q11, d24, d7
1958                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1959        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
1960.endm
1961
1962.macro pixman_composite_over_n_8888_init
1963    add         DUMMY, sp, #ARGS_STACK_OFFSET
1964    vld1.32     {d3[0]}, [DUMMY]
1965    vdup.8      d0, d3[0]
1966    vdup.8      d1, d3[1]
1967    vdup.8      d2, d3[2]
1968    vdup.8      d3, d3[3]
1969    vmvn.8      d24, d3  /* get inverted alpha */
1970.endm
1971
1972generate_composite_function \
1973    pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
1974    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1975    8, /* number of pixels, processed in a single block */ \
1976    5, /* prefetch distance */ \
1977    pixman_composite_over_n_8888_init, \
1978    default_cleanup, \
1979    pixman_composite_over_8888_8888_process_pixblock_head, \
1980    pixman_composite_over_8888_8888_process_pixblock_tail, \
1981    pixman_composite_over_n_8888_process_pixblock_tail_head
1982
1983/******************************************************************************/
1984
1985.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
1986        vrshr.u16   q14, q8, #8
1987                                    PF add PF_X, PF_X, #8
1988                                    PF tst PF_CTL, #0xF
1989        vrshr.u16   q15, q9, #8
1990        vrshr.u16   q12, q10, #8
1991        vrshr.u16   q13, q11, #8
1992                                    PF addne PF_X, PF_X, #8
1993                                    PF subne PF_CTL, PF_CTL, #1
1994        vraddhn.u16 d28, q14, q8
1995        vraddhn.u16 d29, q15, q9
1996                                    PF cmp PF_X, ORIG_W
1997        vraddhn.u16 d30, q12, q10
1998        vraddhn.u16 d31, q13, q11
1999        vqadd.u8    q14, q0, q14
2000        vqadd.u8    q15, q1, q15
2001    vld4.8      {d0, d1, d2, d3}, [DST_R, :128]!
2002    vmvn.8      d22, d3
2003                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
2004        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
2005                                    PF subge PF_X, PF_X, ORIG_W
2006    vmull.u8    q8, d22, d4
2007                                    PF subges PF_CTL, PF_CTL, #0x10
2008    vmull.u8    q9, d22, d5
2009    vmull.u8    q10, d22, d6
2010                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
2011    vmull.u8    q11, d22, d7
2012.endm
2013
2014.macro pixman_composite_over_reverse_n_8888_init
2015    add         DUMMY, sp, #ARGS_STACK_OFFSET
2016    vld1.32     {d7[0]}, [DUMMY]
2017    vdup.8      d4, d7[0]
2018    vdup.8      d5, d7[1]
2019    vdup.8      d6, d7[2]
2020    vdup.8      d7, d7[3]
2021.endm
2022
2023generate_composite_function \
2024    pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
2025    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2026    8, /* number of pixels, processed in a single block */ \
2027    5, /* prefetch distance */ \
2028    pixman_composite_over_reverse_n_8888_init, \
2029    default_cleanup, \
2030    pixman_composite_over_8888_8888_process_pixblock_head, \
2031    pixman_composite_over_8888_8888_process_pixblock_tail, \
2032    pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
2033    28, /* dst_w_basereg */ \
2034    0,  /* dst_r_basereg */ \
2035    4,  /* src_basereg   */ \
2036    24  /* mask_basereg  */
2037
2038/******************************************************************************/
2039
2040.macro pixman_composite_over_8888_8_0565_process_pixblock_head
2041    vmull.u8    q0,  d24, d8    /* IN for SRC pixels (part1) */
2042    vmull.u8    q1,  d24, d9
2043    vmull.u8    q6,  d24, d10
2044    vmull.u8    q7,  d24, d11
2045        vshrn.u16   d6,  q2, #8 /* convert DST_R data to 32-bpp (part1) */
2046        vshrn.u16   d7,  q2, #3
2047        vsli.u16    q2,  q2, #5
2048    vrshr.u16   q8,  q0,  #8    /* IN for SRC pixels (part2) */
2049    vrshr.u16   q9,  q1,  #8
2050    vrshr.u16   q10, q6,  #8
2051    vrshr.u16   q11, q7,  #8
2052    vraddhn.u16 d0,  q0,  q8
2053    vraddhn.u16 d1,  q1,  q9
2054    vraddhn.u16 d2,  q6,  q10
2055    vraddhn.u16 d3,  q7,  q11
2056        vsri.u8     d6,  d6, #5 /* convert DST_R data to 32-bpp (part2) */
2057        vsri.u8     d7,  d7, #6
2058    vmvn.8      d3,  d3
2059        vshrn.u16   d30, q2, #2
2060    vmull.u8    q8,  d3, d6     /* now do alpha blending */
2061    vmull.u8    q9,  d3, d7
2062    vmull.u8    q10, d3, d30
2063.endm
2064
2065.macro pixman_composite_over_8888_8_0565_process_pixblock_tail
2066    /* 3 cycle bubble (after vmull.u8) */
2067    vrshr.u16   q13, q8,  #8
2068    vrshr.u16   q11, q9,  #8
2069    vrshr.u16   q15, q10, #8
2070    vraddhn.u16 d16, q8,  q13
2071    vraddhn.u16 d27, q9,  q11
2072    vraddhn.u16 d26, q10, q15
2073    vqadd.u8    d16, d2,  d16
2074    /* 1 cycle bubble */
2075    vqadd.u8    q9,  q0,  q13
2076    vshll.u8    q14, d16, #8    /* convert to 16bpp */
2077    vshll.u8    q8,  d19, #8
2078    vshll.u8    q9,  d18, #8
2079    vsri.u16    q14, q8,  #5
2080    /* 1 cycle bubble */
2081    vsri.u16    q14, q9,  #11
2082.endm
2083
2084.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
2085    vld1.16     {d4, d5}, [DST_R, :128]!
2086    vshrn.u16   d6,  q2,  #8
2087    fetch_mask_pixblock
2088    vshrn.u16   d7,  q2,  #3
2089    fetch_src_pixblock
2090    vmull.u8    q6,  d24, d10
2091        vrshr.u16   q13, q8,  #8
2092        vrshr.u16   q11, q9,  #8
2093        vrshr.u16   q15, q10, #8
2094        vraddhn.u16 d16, q8,  q13
2095        vraddhn.u16 d27, q9,  q11
2096        vraddhn.u16 d26, q10, q15
2097        vqadd.u8    d16, d2,  d16
2098    vmull.u8    q1,  d24, d9
2099        vqadd.u8    q9,  q0,  q13
2100        vshll.u8    q14, d16, #8
2101    vmull.u8    q0,  d24, d8
2102        vshll.u8    q8,  d19, #8
2103        vshll.u8    q9,  d18, #8
2104        vsri.u16    q14, q8,  #5
2105    vmull.u8    q7,  d24, d11
2106        vsri.u16    q14, q9,  #11
2107
2108    cache_preload 8, 8
2109
2110    vsli.u16    q2,  q2,  #5
2111    vrshr.u16   q8,  q0,  #8
2112    vrshr.u16   q9,  q1,  #8
2113    vrshr.u16   q10, q6,  #8
2114    vrshr.u16   q11, q7,  #8
2115    vraddhn.u16 d0,  q0,  q8
2116    vraddhn.u16 d1,  q1,  q9
2117    vraddhn.u16 d2,  q6,  q10
2118    vraddhn.u16 d3,  q7,  q11
2119    vsri.u8     d6,  d6,  #5
2120    vsri.u8     d7,  d7,  #6
2121    vmvn.8      d3,  d3
2122    vshrn.u16   d30, q2,  #2
2123    vst1.16     {d28, d29}, [DST_W, :128]!
2124    vmull.u8    q8,  d3,  d6
2125    vmull.u8    q9,  d3,  d7
2126    vmull.u8    q10, d3,  d30
2127.endm
2128
2129generate_composite_function \
2130    pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
2131    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2132    8, /* number of pixels, processed in a single block */ \
2133    5, /* prefetch distance */ \
2134    default_init_need_all_regs, \
2135    default_cleanup_need_all_regs, \
2136    pixman_composite_over_8888_8_0565_process_pixblock_head, \
2137    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
2138    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
2139    28, /* dst_w_basereg */ \
2140    4,  /* dst_r_basereg */ \
2141    8,  /* src_basereg   */ \
2142    24  /* mask_basereg  */
2143
2144/******************************************************************************/
2145
2146/*
2147 * This function needs a special initialization of solid mask.
2148 * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
2149 * offset, split into color components and replicated in d8-d11
2150 * registers. Additionally, this function needs all the NEON registers,
2151 * so it has to save d8-d15 registers which are callee saved according
2152 * to ABI. These registers are restored from 'cleanup' macro. All the
2153 * other NEON registers are caller saved, so can be clobbered freely
2154 * without introducing any problems.
2155 */
2156.macro pixman_composite_over_n_8_0565_init
2157    add         DUMMY, sp, #ARGS_STACK_OFFSET
2158    vpush       {d8-d15}
2159    vld1.32     {d11[0]}, [DUMMY]
2160    vdup.8      d8, d11[0]
2161    vdup.8      d9, d11[1]
2162    vdup.8      d10, d11[2]
2163    vdup.8      d11, d11[3]
2164.endm
2165
2166.macro pixman_composite_over_n_8_0565_cleanup
2167    vpop        {d8-d15}
2168.endm
2169
2170generate_composite_function \
2171    pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
2172    FLAG_DST_READWRITE, \
2173    8, /* number of pixels, processed in a single block */ \
2174    5, /* prefetch distance */ \
2175    pixman_composite_over_n_8_0565_init, \
2176    pixman_composite_over_n_8_0565_cleanup, \
2177    pixman_composite_over_8888_8_0565_process_pixblock_head, \
2178    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
2179    pixman_composite_over_8888_8_0565_process_pixblock_tail_head
2180
2181/******************************************************************************/
2182
2183.macro pixman_composite_over_8888_n_0565_init
2184    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
2185    vpush       {d8-d15}
2186    vld1.32     {d24[0]}, [DUMMY]
2187    vdup.8      d24, d24[3]
2188.endm
2189
2190.macro pixman_composite_over_8888_n_0565_cleanup
2191    vpop        {d8-d15}
2192.endm
2193
2194generate_composite_function \
2195    pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
2196    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2197    8, /* number of pixels, processed in a single block */ \
2198    5, /* prefetch distance */ \
2199    pixman_composite_over_8888_n_0565_init, \
2200    pixman_composite_over_8888_n_0565_cleanup, \
2201    pixman_composite_over_8888_8_0565_process_pixblock_head, \
2202    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
2203    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
2204    28, /* dst_w_basereg */ \
2205    4,  /* dst_r_basereg */ \
2206    8,  /* src_basereg   */ \
2207    24  /* mask_basereg  */
2208
2209/******************************************************************************/
2210
2211.macro pixman_composite_src_0565_0565_process_pixblock_head
2212.endm
2213
2214.macro pixman_composite_src_0565_0565_process_pixblock_tail
2215.endm
2216
2217.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
2218    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
2219    fetch_src_pixblock
2220    cache_preload 16, 16
2221.endm
2222
2223generate_composite_function \
2224    pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
2225    FLAG_DST_WRITEONLY, \
2226    16, /* number of pixels, processed in a single block */ \
2227    10, /* prefetch distance */ \
2228    default_init, \
2229    default_cleanup, \
2230    pixman_composite_src_0565_0565_process_pixblock_head, \
2231    pixman_composite_src_0565_0565_process_pixblock_tail, \
2232    pixman_composite_src_0565_0565_process_pixblock_tail_head, \
2233    0, /* dst_w_basereg */ \
2234    0, /* dst_r_basereg */ \
2235    0, /* src_basereg   */ \
2236    0  /* mask_basereg  */
2237
2238/******************************************************************************/
2239
2240.macro pixman_composite_src_n_8_process_pixblock_head
2241.endm
2242
2243.macro pixman_composite_src_n_8_process_pixblock_tail
2244.endm
2245
2246.macro pixman_composite_src_n_8_process_pixblock_tail_head
2247    vst1.8  {d0, d1, d2, d3}, [DST_W, :128]!
2248.endm
2249
2250.macro pixman_composite_src_n_8_init
2251    add         DUMMY, sp, #ARGS_STACK_OFFSET
2252    vld1.32     {d0[0]}, [DUMMY]
2253    vsli.u64    d0, d0, #8
2254    vsli.u64    d0, d0, #16
2255    vsli.u64    d0, d0, #32
2256    vorr        d1, d0, d0
2257    vorr        q1, q0, q0
2258.endm
2259
2260.macro pixman_composite_src_n_8_cleanup
2261.endm
2262
2263generate_composite_function \
2264    pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
2265    FLAG_DST_WRITEONLY, \
2266    32, /* number of pixels, processed in a single block */ \
2267    0,  /* prefetch distance */ \
2268    pixman_composite_src_n_8_init, \
2269    pixman_composite_src_n_8_cleanup, \
2270    pixman_composite_src_n_8_process_pixblock_head, \
2271    pixman_composite_src_n_8_process_pixblock_tail, \
2272    pixman_composite_src_n_8_process_pixblock_tail_head, \
2273    0, /* dst_w_basereg */ \
2274    0, /* dst_r_basereg */ \
2275    0, /* src_basereg   */ \
2276    0  /* mask_basereg  */
2277
2278/******************************************************************************/
2279
2280.macro pixman_composite_src_n_0565_process_pixblock_head
2281.endm
2282
2283.macro pixman_composite_src_n_0565_process_pixblock_tail
2284.endm
2285
2286.macro pixman_composite_src_n_0565_process_pixblock_tail_head
2287    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
2288.endm
2289
2290.macro pixman_composite_src_n_0565_init
2291    add         DUMMY, sp, #ARGS_STACK_OFFSET
2292    vld1.32     {d0[0]}, [DUMMY]
2293    vsli.u64    d0, d0, #16
2294    vsli.u64    d0, d0, #32
2295    vorr        d1, d0, d0
2296    vorr        q1, q0, q0
2297.endm
2298
2299.macro pixman_composite_src_n_0565_cleanup
2300.endm
2301
2302generate_composite_function \
2303    pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
2304    FLAG_DST_WRITEONLY, \
2305    16, /* number of pixels, processed in a single block */ \
2306    0,  /* prefetch distance */ \
2307    pixman_composite_src_n_0565_init, \
2308    pixman_composite_src_n_0565_cleanup, \
2309    pixman_composite_src_n_0565_process_pixblock_head, \
2310    pixman_composite_src_n_0565_process_pixblock_tail, \
2311    pixman_composite_src_n_0565_process_pixblock_tail_head, \
2312    0, /* dst_w_basereg */ \
2313    0, /* dst_r_basereg */ \
2314    0, /* src_basereg   */ \
2315    0  /* mask_basereg  */
2316
2317/******************************************************************************/
2318
2319.macro pixman_composite_src_n_8888_process_pixblock_head
2320.endm
2321
2322.macro pixman_composite_src_n_8888_process_pixblock_tail
2323.endm
2324
2325.macro pixman_composite_src_n_8888_process_pixblock_tail_head
2326    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
2327.endm
2328
2329.macro pixman_composite_src_n_8888_init
2330    add         DUMMY, sp, #ARGS_STACK_OFFSET
2331    vld1.32     {d0[0]}, [DUMMY]
2332    vsli.u64    d0, d0, #32
2333    vorr        d1, d0, d0
2334    vorr        q1, q0, q0
2335.endm
2336
2337.macro pixman_composite_src_n_8888_cleanup
2338.endm
2339
2340generate_composite_function \
2341    pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
2342    FLAG_DST_WRITEONLY, \
2343    8, /* number of pixels, processed in a single block */ \
2344    0, /* prefetch distance */ \
2345    pixman_composite_src_n_8888_init, \
2346    pixman_composite_src_n_8888_cleanup, \
2347    pixman_composite_src_n_8888_process_pixblock_head, \
2348    pixman_composite_src_n_8888_process_pixblock_tail, \
2349    pixman_composite_src_n_8888_process_pixblock_tail_head, \
2350    0, /* dst_w_basereg */ \
2351    0, /* dst_r_basereg */ \
2352    0, /* src_basereg   */ \
2353    0  /* mask_basereg  */
2354
2355/******************************************************************************/
2356
2357.macro pixman_composite_src_8888_8888_process_pixblock_head
2358.endm
2359
2360.macro pixman_composite_src_8888_8888_process_pixblock_tail
2361.endm
2362
2363.macro pixman_composite_src_8888_8888_process_pixblock_tail_head
2364    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
2365    fetch_src_pixblock
2366    cache_preload 8, 8
2367.endm
2368
2369generate_composite_function \
2370    pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
2371    FLAG_DST_WRITEONLY, \
2372    8, /* number of pixels, processed in a single block */ \
2373    10, /* prefetch distance */ \
2374    default_init, \
2375    default_cleanup, \
2376    pixman_composite_src_8888_8888_process_pixblock_head, \
2377    pixman_composite_src_8888_8888_process_pixblock_tail, \
2378    pixman_composite_src_8888_8888_process_pixblock_tail_head, \
2379    0, /* dst_w_basereg */ \
2380    0, /* dst_r_basereg */ \
2381    0, /* src_basereg   */ \
2382    0  /* mask_basereg  */
2383
2384/******************************************************************************/
2385
2386.macro pixman_composite_src_x888_8888_process_pixblock_head
2387    vorr     q0, q0, q2
2388    vorr     q1, q1, q2
2389.endm
2390
2391.macro pixman_composite_src_x888_8888_process_pixblock_tail
2392.endm
2393
2394.macro pixman_composite_src_x888_8888_process_pixblock_tail_head
2395    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
2396    fetch_src_pixblock
2397    vorr     q0, q0, q2
2398    vorr     q1, q1, q2
2399    cache_preload 8, 8
2400.endm
2401
2402.macro pixman_composite_src_x888_8888_init
2403    vmov.u8  q2, #0xFF
2404    vshl.u32 q2, q2, #24
2405.endm
2406
2407generate_composite_function \
2408    pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
2409    FLAG_DST_WRITEONLY, \
2410    8, /* number of pixels, processed in a single block */ \
2411    10, /* prefetch distance */ \
2412    pixman_composite_src_x888_8888_init, \
2413    default_cleanup, \
2414    pixman_composite_src_x888_8888_process_pixblock_head, \
2415    pixman_composite_src_x888_8888_process_pixblock_tail, \
2416    pixman_composite_src_x888_8888_process_pixblock_tail_head, \
2417    0, /* dst_w_basereg */ \
2418    0, /* dst_r_basereg */ \
2419    0, /* src_basereg   */ \
2420    0  /* mask_basereg  */
2421
2422/******************************************************************************/
2423
2424.macro pixman_composite_src_n_8_8888_process_pixblock_head
2425    /* expecting solid source in {d0, d1, d2, d3} */
2426    /* mask is in d24 (d25, d26, d27 are unused) */
2427
2428    /* in */
2429    vmull.u8    q8, d24, d0
2430    vmull.u8    q9, d24, d1
2431    vmull.u8    q10, d24, d2
2432    vmull.u8    q11, d24, d3
2433    vrsra.u16   q8, q8, #8
2434    vrsra.u16   q9, q9, #8
2435    vrsra.u16   q10, q10, #8
2436    vrsra.u16   q11, q11, #8
2437.endm
2438
2439.macro pixman_composite_src_n_8_8888_process_pixblock_tail
2440    vrshrn.u16  d28, q8, #8
2441    vrshrn.u16  d29, q9, #8
2442    vrshrn.u16  d30, q10, #8
2443    vrshrn.u16  d31, q11, #8
2444.endm
2445
2446.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
2447    fetch_mask_pixblock
2448                                    PF add PF_X, PF_X, #8
2449        vrshrn.u16  d28, q8, #8
2450                                    PF tst PF_CTL, #0x0F
2451        vrshrn.u16  d29, q9, #8
2452                                    PF addne PF_X, PF_X, #8
2453        vrshrn.u16  d30, q10, #8
2454                                    PF subne PF_CTL, PF_CTL, #1
2455        vrshrn.u16  d31, q11, #8
2456                                    PF cmp PF_X, ORIG_W
2457    vmull.u8    q8, d24, d0
2458                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
2459    vmull.u8    q9, d24, d1
2460                                    PF subge PF_X, PF_X, ORIG_W
2461    vmull.u8    q10, d24, d2
2462                                    PF subges PF_CTL, PF_CTL, #0x10
2463    vmull.u8    q11, d24, d3
2464                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
2465        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
2466    vrsra.u16   q8, q8, #8
2467    vrsra.u16   q9, q9, #8
2468    vrsra.u16   q10, q10, #8
2469    vrsra.u16   q11, q11, #8
2470.endm
2471
2472.macro pixman_composite_src_n_8_8888_init
2473    add         DUMMY, sp, #ARGS_STACK_OFFSET
2474    vld1.32     {d3[0]}, [DUMMY]
2475    vdup.8      d0, d3[0]
2476    vdup.8      d1, d3[1]
2477    vdup.8      d2, d3[2]
2478    vdup.8      d3, d3[3]
2479.endm
2480
2481.macro pixman_composite_src_n_8_8888_cleanup
2482.endm
2483
2484generate_composite_function \
2485    pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \
2486    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2487    8, /* number of pixels, processed in a single block */ \
2488    5, /* prefetch distance */ \
2489    pixman_composite_src_n_8_8888_init, \
2490    pixman_composite_src_n_8_8888_cleanup, \
2491    pixman_composite_src_n_8_8888_process_pixblock_head, \
2492    pixman_composite_src_n_8_8888_process_pixblock_tail, \
2493    pixman_composite_src_n_8_8888_process_pixblock_tail_head, \
2494
2495/******************************************************************************/
2496
2497.macro pixman_composite_src_n_8_8_process_pixblock_head
2498    vmull.u8    q0, d24, d16
2499    vmull.u8    q1, d25, d16
2500    vmull.u8    q2, d26, d16
2501    vmull.u8    q3, d27, d16
2502    vrsra.u16   q0, q0,  #8
2503    vrsra.u16   q1, q1,  #8
2504    vrsra.u16   q2, q2,  #8
2505    vrsra.u16   q3, q3,  #8
2506.endm
2507
2508.macro pixman_composite_src_n_8_8_process_pixblock_tail
2509    vrshrn.u16  d28, q0, #8
2510    vrshrn.u16  d29, q1, #8
2511    vrshrn.u16  d30, q2, #8
2512    vrshrn.u16  d31, q3, #8
2513.endm
2514
2515.macro pixman_composite_src_n_8_8_process_pixblock_tail_head
2516    fetch_mask_pixblock
2517                                    PF add PF_X, PF_X, #8
2518        vrshrn.u16  d28, q0, #8
2519                                    PF tst PF_CTL, #0x0F
2520        vrshrn.u16  d29, q1, #8
2521                                    PF addne PF_X, PF_X, #8
2522        vrshrn.u16  d30, q2, #8
2523                                    PF subne PF_CTL, PF_CTL, #1
2524        vrshrn.u16  d31, q3, #8
2525                                    PF cmp PF_X, ORIG_W
2526    vmull.u8    q0,  d24, d16
2527                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
2528    vmull.u8    q1,  d25, d16
2529                                    PF subge PF_X, PF_X, ORIG_W
2530    vmull.u8    q2,  d26, d16
2531                                    PF subges PF_CTL, PF_CTL, #0x10
2532    vmull.u8    q3,  d27, d16
2533                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
2534        vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
2535    vrsra.u16   q0, q0,  #8
2536    vrsra.u16   q1, q1,  #8
2537    vrsra.u16   q2, q2,  #8
2538    vrsra.u16   q3, q3,  #8
2539.endm
2540
2541.macro pixman_composite_src_n_8_8_init
2542    add         DUMMY, sp, #ARGS_STACK_OFFSET
2543    vld1.32     {d16[0]}, [DUMMY]
2544    vdup.8      d16, d16[3]
2545.endm
2546
2547.macro pixman_composite_src_n_8_8_cleanup
2548.endm
2549
2550generate_composite_function \
2551    pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \
2552    FLAG_DST_WRITEONLY, \
2553    32, /* number of pixels, processed in a single block */ \
2554    5, /* prefetch distance */ \
2555    pixman_composite_src_n_8_8_init, \
2556    pixman_composite_src_n_8_8_cleanup, \
2557    pixman_composite_src_n_8_8_process_pixblock_head, \
2558    pixman_composite_src_n_8_8_process_pixblock_tail, \
2559    pixman_composite_src_n_8_8_process_pixblock_tail_head
2560
2561/******************************************************************************/
2562
2563.macro pixman_composite_over_n_8_8888_process_pixblock_head
2564    /* expecting deinterleaved source data in {d8, d9, d10, d11} */
2565    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
2566    /* and destination data in {d4, d5, d6, d7} */
2567    /* mask is in d24 (d25, d26, d27 are unused) */
2568
2569    /* in */
2570    vmull.u8    q6, d24, d8
2571    vmull.u8    q7, d24, d9
2572    vmull.u8    q8, d24, d10
2573    vmull.u8    q9, d24, d11
2574    vrshr.u16   q10, q6, #8
2575    vrshr.u16   q11, q7, #8
2576    vrshr.u16   q12, q8, #8
2577    vrshr.u16   q13, q9, #8
2578    vraddhn.u16 d0, q6, q10
2579    vraddhn.u16 d1, q7, q11
2580    vraddhn.u16 d2, q8, q12
2581    vraddhn.u16 d3, q9, q13
2582    vmvn.8      d25, d3  /* get inverted alpha */
2583    /* source:      d0 - blue, d1 - green, d2 - red, d3 - alpha */
2584    /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
2585    /* now do alpha blending */
2586    vmull.u8    q8, d25, d4
2587    vmull.u8    q9, d25, d5
2588    vmull.u8    q10, d25, d6
2589    vmull.u8    q11, d25, d7
2590.endm
2591
2592.macro pixman_composite_over_n_8_8888_process_pixblock_tail
2593    vrshr.u16   q14, q8, #8
2594    vrshr.u16   q15, q9, #8
2595    vrshr.u16   q6, q10, #8
2596    vrshr.u16   q7, q11, #8
2597    vraddhn.u16 d28, q14, q8
2598    vraddhn.u16 d29, q15, q9
2599    vraddhn.u16 d30, q6, q10
2600    vraddhn.u16 d31, q7, q11
2601    vqadd.u8    q14, q0, q14
2602    vqadd.u8    q15, q1, q15
2603.endm
2604
2605.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
2606        vrshr.u16   q14, q8, #8
2607    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
2608        vrshr.u16   q15, q9, #8
2609    fetch_mask_pixblock
2610        vrshr.u16   q6, q10, #8
2611                                    PF add PF_X, PF_X, #8
2612        vrshr.u16   q7, q11, #8
2613                                    PF tst PF_CTL, #0x0F
2614        vraddhn.u16 d28, q14, q8
2615                                    PF addne PF_X, PF_X, #8
2616        vraddhn.u16 d29, q15, q9
2617                                    PF subne PF_CTL, PF_CTL, #1
2618        vraddhn.u16 d30, q6, q10
2619                                    PF cmp PF_X, ORIG_W
2620        vraddhn.u16 d31, q7, q11
2621                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
2622    vmull.u8    q6, d24, d8
2623                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
2624    vmull.u8    q7, d24, d9
2625                                    PF subge PF_X, PF_X, ORIG_W
2626    vmull.u8    q8, d24, d10
2627                                    PF subges PF_CTL, PF_CTL, #0x10
2628    vmull.u8    q9, d24, d11
2629                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
2630        vqadd.u8    q14, q0, q14
2631                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
2632        vqadd.u8    q15, q1, q15
2633    vrshr.u16   q10, q6, #8
2634    vrshr.u16   q11, q7, #8
2635    vrshr.u16   q12, q8, #8
2636    vrshr.u16   q13, q9, #8
2637    vraddhn.u16 d0, q6, q10
2638    vraddhn.u16 d1, q7, q11
2639    vraddhn.u16 d2, q8, q12
2640    vraddhn.u16 d3, q9, q13
2641        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
2642    vmvn.8      d25, d3
2643    vmull.u8    q8, d25, d4
2644    vmull.u8    q9, d25, d5
2645    vmull.u8    q10, d25, d6
2646    vmull.u8    q11, d25, d7
2647.endm
2648
2649.macro pixman_composite_over_n_8_8888_init
2650    add         DUMMY, sp, #ARGS_STACK_OFFSET
2651    vpush       {d8-d15}
2652    vld1.32     {d11[0]}, [DUMMY]
2653    vdup.8      d8, d11[0]
2654    vdup.8      d9, d11[1]
2655    vdup.8      d10, d11[2]
2656    vdup.8      d11, d11[3]
2657.endm
2658
2659.macro pixman_composite_over_n_8_8888_cleanup
2660    vpop        {d8-d15}
2661.endm
2662
2663generate_composite_function \
2664    pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
2665    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2666    8, /* number of pixels, processed in a single block */ \
2667    5, /* prefetch distance */ \
2668    pixman_composite_over_n_8_8888_init, \
2669    pixman_composite_over_n_8_8888_cleanup, \
2670    pixman_composite_over_n_8_8888_process_pixblock_head, \
2671    pixman_composite_over_n_8_8888_process_pixblock_tail, \
2672    pixman_composite_over_n_8_8888_process_pixblock_tail_head
2673
2674/******************************************************************************/
2675
2676.macro pixman_composite_over_n_8_8_process_pixblock_head
2677    vmull.u8    q0,  d24, d8
2678    vmull.u8    q1,  d25, d8
2679    vmull.u8    q6,  d26, d8
2680    vmull.u8    q7,  d27, d8
2681    vrshr.u16   q10, q0,  #8
2682    vrshr.u16   q11, q1,  #8
2683    vrshr.u16   q12, q6,  #8
2684    vrshr.u16   q13, q7,  #8
2685    vraddhn.u16 d0,  q0,  q10
2686    vraddhn.u16 d1,  q1,  q11
2687    vraddhn.u16 d2,  q6,  q12
2688    vraddhn.u16 d3,  q7,  q13
2689    vmvn.8      q12, q0
2690    vmvn.8      q13, q1
2691    vmull.u8    q8,  d24, d4
2692    vmull.u8    q9,  d25, d5
2693    vmull.u8    q10, d26, d6
2694    vmull.u8    q11, d27, d7
2695.endm
2696
2697.macro pixman_composite_over_n_8_8_process_pixblock_tail
2698    vrshr.u16   q14, q8,  #8
2699    vrshr.u16   q15, q9,  #8
2700    vrshr.u16   q12, q10, #8
2701    vrshr.u16   q13, q11, #8
2702    vraddhn.u16 d28, q14, q8
2703    vraddhn.u16 d29, q15, q9
2704    vraddhn.u16 d30, q12, q10
2705    vraddhn.u16 d31, q13, q11
2706    vqadd.u8    q14, q0,  q14
2707    vqadd.u8    q15, q1,  q15
2708.endm
2709
2710/* TODO: expand macros and do better instructions scheduling */
2711.macro pixman_composite_over_n_8_8_process_pixblock_tail_head
2712    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
2713    pixman_composite_over_n_8_8_process_pixblock_tail
2714    fetch_mask_pixblock
2715    cache_preload 32, 32
2716    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
2717    pixman_composite_over_n_8_8_process_pixblock_head
2718.endm
2719
2720.macro pixman_composite_over_n_8_8_init
2721    add         DUMMY, sp, #ARGS_STACK_OFFSET
2722    vpush       {d8-d15}
2723    vld1.32     {d8[0]}, [DUMMY]
2724    vdup.8      d8, d8[3]
2725.endm
2726
2727.macro pixman_composite_over_n_8_8_cleanup
2728    vpop        {d8-d15}
2729.endm
2730
2731generate_composite_function \
2732    pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
2733    FLAG_DST_READWRITE, \
2734    32, /* number of pixels, processed in a single block */ \
2735    5, /* prefetch distance */ \
2736    pixman_composite_over_n_8_8_init, \
2737    pixman_composite_over_n_8_8_cleanup, \
2738    pixman_composite_over_n_8_8_process_pixblock_head, \
2739    pixman_composite_over_n_8_8_process_pixblock_tail, \
2740    pixman_composite_over_n_8_8_process_pixblock_tail_head
2741
2742/******************************************************************************/
2743
2744.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
2745    /*
2746     * 'combine_mask_ca' replacement
2747     *
2748     * input:  solid src (n) in {d8,  d9,  d10, d11}
2749     *         dest in          {d4,  d5,  d6,  d7 }
2750     *         mask in          {d24, d25, d26, d27}
2751     * output: updated src in   {d0,  d1,  d2,  d3 }
2752     *         updated mask in  {d24, d25, d26, d3 }
2753     */
2754    vmull.u8    q0,  d24, d8
2755    vmull.u8    q1,  d25, d9
2756    vmull.u8    q6,  d26, d10
2757    vmull.u8    q7,  d27, d11
2758    vmull.u8    q9,  d11, d25
2759    vmull.u8    q12, d11, d24
2760    vmull.u8    q13, d11, d26
2761    vrshr.u16   q8,  q0,  #8
2762    vrshr.u16   q10, q1,  #8
2763    vrshr.u16   q11, q6,  #8
2764    vraddhn.u16 d0,  q0,  q8
2765    vraddhn.u16 d1,  q1,  q10
2766    vraddhn.u16 d2,  q6,  q11
2767    vrshr.u16   q11, q12, #8
2768    vrshr.u16   q8,  q9,  #8
2769    vrshr.u16   q6,  q13, #8
2770    vrshr.u16   q10, q7,  #8
2771    vraddhn.u16 d24, q12, q11
2772    vraddhn.u16 d25, q9,  q8
2773    vraddhn.u16 d26, q13, q6
2774    vraddhn.u16 d3,  q7,  q10
2775    /*
2776     * 'combine_over_ca' replacement
2777     *
2778     * output: updated dest in {d28, d29, d30, d31}
2779     */
2780    vmvn.8      q12, q12
2781    vmvn.8      d26, d26
2782    vmull.u8    q8,  d24, d4
2783    vmull.u8    q9,  d25, d5
2784    vmvn.8      d27, d3
2785    vmull.u8    q10, d26, d6
2786    vmull.u8    q11, d27, d7
2787.endm
2788
2789.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
2790    /* ... continue 'combine_over_ca' replacement */
2791    vrshr.u16   q14, q8,  #8
2792    vrshr.u16   q15, q9,  #8
2793    vrshr.u16   q6,  q10, #8
2794    vrshr.u16   q7,  q11, #8
2795    vraddhn.u16 d28, q14, q8
2796    vraddhn.u16 d29, q15, q9
2797    vraddhn.u16 d30, q6,  q10
2798    vraddhn.u16 d31, q7,  q11
2799    vqadd.u8    q14, q0,  q14
2800    vqadd.u8    q15, q1,  q15
2801.endm
2802
2803.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
2804        vrshr.u16   q14, q8, #8
2805        vrshr.u16   q15, q9, #8
2806    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
2807        vrshr.u16   q6, q10, #8
2808        vrshr.u16   q7, q11, #8
2809        vraddhn.u16 d28, q14, q8
2810        vraddhn.u16 d29, q15, q9
2811        vraddhn.u16 d30, q6, q10
2812        vraddhn.u16 d31, q7, q11
2813    fetch_mask_pixblock
2814        vqadd.u8    q14, q0, q14
2815        vqadd.u8    q15, q1, q15
2816    cache_preload 8, 8
2817    pixman_composite_over_n_8888_8888_ca_process_pixblock_head
2818    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
2819.endm
2820
2821.macro pixman_composite_over_n_8888_8888_ca_init
2822    add         DUMMY, sp, #ARGS_STACK_OFFSET
2823    vpush       {d8-d15}
2824    vld1.32     {d11[0]}, [DUMMY]
2825    vdup.8      d8, d11[0]
2826    vdup.8      d9, d11[1]
2827    vdup.8      d10, d11[2]
2828    vdup.8      d11, d11[3]
2829.endm
2830
2831.macro pixman_composite_over_n_8888_8888_ca_cleanup
2832    vpop        {d8-d15}
2833.endm
2834
2835generate_composite_function \
2836    pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
2837    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2838    8, /* number of pixels, processed in a single block */ \
2839    5, /* prefetch distance */ \
2840    pixman_composite_over_n_8888_8888_ca_init, \
2841    pixman_composite_over_n_8888_8888_ca_cleanup, \
2842    pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
2843    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
2844    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
2845
2846/******************************************************************************/
2847
2848.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
2849    /*
2850     * 'combine_mask_ca' replacement
2851     *
2852     * input:  solid src (n) in {d8,  d9,  d10, d11}  [B, G, R, A]
2853     *         mask in          {d24, d25, d26}       [B, G, R]
2854     * output: updated src in   {d0,  d1,  d2 }       [B, G, R]
2855     *         updated mask in  {d24, d25, d26}       [B, G, R]
2856     */
2857    vmull.u8    q0,  d24, d8
2858    vmull.u8    q1,  d25, d9
2859    vmull.u8    q6,  d26, d10
2860    vmull.u8    q9,  d11, d25
2861    vmull.u8    q12, d11, d24
2862    vmull.u8    q13, d11, d26
2863    vrshr.u16   q8,  q0,  #8
2864    vrshr.u16   q10, q1,  #8
2865    vrshr.u16   q11, q6,  #8
2866    vraddhn.u16 d0,  q0,  q8
2867    vraddhn.u16 d1,  q1,  q10
2868    vraddhn.u16 d2,  q6,  q11
2869    vrshr.u16   q11, q12, #8
2870    vrshr.u16   q8,  q9,  #8
2871    vrshr.u16   q6,  q13, #8
2872    vraddhn.u16 d24, q12, q11
2873    vraddhn.u16 d25, q9,  q8
2874    /*
2875     * convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
2876     * and put data into d16 - blue, d17 - green, d18 - red
2877     */
2878       vshrn.u16   d17, q2,  #3
2879       vshrn.u16   d18, q2,  #8
2880    vraddhn.u16 d26, q13, q6
2881       vsli.u16    q2,  q2,  #5
2882       vsri.u8     d18, d18, #5
2883       vsri.u8     d17, d17, #6
2884    /*
2885     * 'combine_over_ca' replacement
2886     *
2887     * output: updated dest in d16 - blue, d17 - green, d18 - red
2888     */
2889    vmvn.8      q12, q12
2890       vshrn.u16   d16, q2,  #2
2891    vmvn.8      d26, d26
2892    vmull.u8    q6,  d16, d24
2893    vmull.u8    q7,  d17, d25
2894    vmull.u8    q11, d18, d26
2895.endm
2896
2897.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
2898    /* ... continue 'combine_over_ca' replacement */
2899    vrshr.u16   q10, q6,  #8
2900    vrshr.u16   q14, q7,  #8
2901    vrshr.u16   q15, q11, #8
2902    vraddhn.u16 d16, q10, q6
2903    vraddhn.u16 d17, q14, q7
2904    vraddhn.u16 d18, q15, q11
2905    vqadd.u8    q8,  q0,  q8
2906    vqadd.u8    d18, d2,  d18
2907    /*
2908     * convert the results in d16, d17, d18 to r5g6b5 and store
2909     * them into {d28, d29}
2910     */
2911    vshll.u8    q14, d18, #8
2912    vshll.u8    q10, d17, #8
2913    vshll.u8    q15, d16, #8
2914    vsri.u16    q14, q10, #5
2915    vsri.u16    q14, q15, #11
2916.endm
2917
2918.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
2919    fetch_mask_pixblock
2920        vrshr.u16   q10, q6, #8
2921        vrshr.u16   q14, q7, #8
2922    vld1.16     {d4, d5}, [DST_R, :128]!
2923        vrshr.u16   q15, q11, #8
2924        vraddhn.u16 d16, q10, q6
2925        vraddhn.u16 d17, q14, q7
2926        vraddhn.u16 d22, q15, q11
2927            /* process_pixblock_head */
2928            /*
2929             * 'combine_mask_ca' replacement
2930             *
2931             * input:  solid src (n) in {d8,  d9,  d10, d11}  [B, G, R, A]
2932             *         mask in          {d24, d25, d26}       [B, G, R]
2933             * output: updated src in   {d0,  d1,  d2 }       [B, G, R]
2934             *         updated mask in  {d24, d25, d26}       [B, G, R]
2935             */
2936            vmull.u8    q6,  d26, d10
2937        vqadd.u8    q8,  q0, q8
2938            vmull.u8    q0,  d24, d8
2939        vqadd.u8    d22, d2, d22
2940            vmull.u8    q1,  d25, d9
2941        /*
2942         * convert the result in d16, d17, d22 to r5g6b5 and store
2943         * it into {d28, d29}
2944         */
2945        vshll.u8    q14, d22, #8
2946        vshll.u8    q10, d17, #8
2947        vshll.u8    q15, d16, #8
2948            vmull.u8    q9,  d11, d25
2949        vsri.u16    q14, q10, #5
2950            vmull.u8    q12, d11, d24
2951            vmull.u8    q13, d11, d26
2952        vsri.u16    q14, q15, #11
2953    cache_preload 8, 8
2954            vrshr.u16   q8,  q0,  #8
2955            vrshr.u16   q10, q1,  #8
2956            vrshr.u16   q11, q6,  #8
2957            vraddhn.u16 d0,  q0,  q8
2958            vraddhn.u16 d1,  q1,  q10
2959            vraddhn.u16 d2,  q6,  q11
2960            vrshr.u16   q11, q12, #8
2961            vrshr.u16   q8,  q9,  #8
2962            vrshr.u16   q6,  q13, #8
2963            vraddhn.u16 d24, q12, q11
2964            vraddhn.u16 d25, q9,  q8
2965                /*
2966                 * convert 8 r5g6b5 pixel data from {d4, d5} to planar
2967	         * 8-bit format and put data into d16 - blue, d17 - green,
2968	         * d18 - red
2969                 */
2970                vshrn.u16   d17, q2,  #3
2971                vshrn.u16   d18, q2,  #8
2972            vraddhn.u16 d26, q13, q6
2973                vsli.u16    q2,  q2,  #5
2974                vsri.u8     d17, d17, #6
2975                vsri.u8     d18, d18, #5
2976            /*
2977             * 'combine_over_ca' replacement
2978             *
2979             * output: updated dest in d16 - blue, d17 - green, d18 - red
2980             */
2981            vmvn.8      q12, q12
2982                vshrn.u16   d16, q2,  #2
2983            vmvn.8      d26, d26
2984            vmull.u8    q7,  d17, d25
2985            vmull.u8    q6,  d16, d24
2986            vmull.u8    q11, d18, d26
2987    vst1.16     {d28, d29}, [DST_W, :128]!
2988.endm
2989
2990.macro pixman_composite_over_n_8888_0565_ca_init
2991    add         DUMMY, sp, #ARGS_STACK_OFFSET
2992    vpush       {d8-d15}
2993    vld1.32     {d11[0]}, [DUMMY]
2994    vdup.8      d8, d11[0]
2995    vdup.8      d9, d11[1]
2996    vdup.8      d10, d11[2]
2997    vdup.8      d11, d11[3]
2998.endm
2999
3000.macro pixman_composite_over_n_8888_0565_ca_cleanup
3001    vpop        {d8-d15}
3002.endm
3003
3004generate_composite_function \
3005    pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \
3006    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
3007    8, /* number of pixels, processed in a single block */ \
3008    5, /* prefetch distance */ \
3009    pixman_composite_over_n_8888_0565_ca_init, \
3010    pixman_composite_over_n_8888_0565_ca_cleanup, \
3011    pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
3012    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
3013    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
3014
3015/******************************************************************************/
3016
3017.macro pixman_composite_in_n_8_process_pixblock_head
3018    /* expecting source data in {d0, d1, d2, d3} */
3019    /* and destination data in {d4, d5, d6, d7} */
3020    vmull.u8    q8,  d4,  d3
3021    vmull.u8    q9,  d5,  d3
3022    vmull.u8    q10, d6,  d3
3023    vmull.u8    q11, d7,  d3
3024.endm
3025
3026.macro pixman_composite_in_n_8_process_pixblock_tail
3027    vrshr.u16   q14, q8,  #8
3028    vrshr.u16   q15, q9,  #8
3029    vrshr.u16   q12, q10, #8
3030    vrshr.u16   q13, q11, #8
3031    vraddhn.u16 d28, q8,  q14
3032    vraddhn.u16 d29, q9,  q15
3033    vraddhn.u16 d30, q10, q12
3034    vraddhn.u16 d31, q11, q13
3035.endm
3036
3037.macro pixman_composite_in_n_8_process_pixblock_tail_head
3038    pixman_composite_in_n_8_process_pixblock_tail
3039    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
3040    cache_preload 32, 32
3041    pixman_composite_in_n_8_process_pixblock_head
3042    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
3043.endm
3044
3045.macro pixman_composite_in_n_8_init
3046    add         DUMMY, sp, #ARGS_STACK_OFFSET
3047    vld1.32     {d3[0]}, [DUMMY]
3048    vdup.8      d3, d3[3]
3049.endm
3050
3051.macro pixman_composite_in_n_8_cleanup
3052.endm
3053
3054generate_composite_function \
3055    pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
3056    FLAG_DST_READWRITE, \
3057    32, /* number of pixels, processed in a single block */ \
3058    5, /* prefetch distance */ \
3059    pixman_composite_in_n_8_init, \
3060    pixman_composite_in_n_8_cleanup, \
3061    pixman_composite_in_n_8_process_pixblock_head, \
3062    pixman_composite_in_n_8_process_pixblock_tail, \
3063    pixman_composite_in_n_8_process_pixblock_tail_head, \
3064    28, /* dst_w_basereg */ \
3065    4,  /* dst_r_basereg */ \
3066    0,  /* src_basereg   */ \
3067    24  /* mask_basereg  */
3068
3069.macro pixman_composite_add_n_8_8_process_pixblock_head
3070    /* expecting source data in {d8, d9, d10, d11} */
3071    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
3072    /* and destination data in {d4, d5, d6, d7} */
3073    /* mask is in d24, d25, d26, d27 */
3074    vmull.u8    q0, d24, d11
3075    vmull.u8    q1, d25, d11
3076    vmull.u8    q6, d26, d11
3077    vmull.u8    q7, d27, d11
3078    vrshr.u16   q10, q0, #8
3079    vrshr.u16   q11, q1, #8
3080    vrshr.u16   q12, q6, #8
3081    vrshr.u16   q13, q7, #8
3082    vraddhn.u16 d0, q0, q10
3083    vraddhn.u16 d1, q1, q11
3084    vraddhn.u16 d2, q6, q12
3085    vraddhn.u16 d3, q7, q13
3086    vqadd.u8    q14, q0, q2
3087    vqadd.u8    q15, q1, q3
3088.endm
3089
3090.macro pixman_composite_add_n_8_8_process_pixblock_tail
3091.endm
3092
3093/* TODO: expand macros and do better instructions scheduling */
3094.macro pixman_composite_add_n_8_8_process_pixblock_tail_head
3095    pixman_composite_add_n_8_8_process_pixblock_tail
3096    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
3097    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
3098    fetch_mask_pixblock
3099    cache_preload 32, 32
3100    pixman_composite_add_n_8_8_process_pixblock_head
3101.endm
3102
3103.macro pixman_composite_add_n_8_8_init
3104    add         DUMMY, sp, #ARGS_STACK_OFFSET
3105    vpush       {d8-d15}
3106    vld1.32     {d11[0]}, [DUMMY]
3107    vdup.8      d11, d11[3]
3108.endm
3109
3110.macro pixman_composite_add_n_8_8_cleanup
3111    vpop        {d8-d15}
3112.endm
3113
3114generate_composite_function \
3115    pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
3116    FLAG_DST_READWRITE, \
3117    32, /* number of pixels, processed in a single block */ \
3118    5, /* prefetch distance */ \
3119    pixman_composite_add_n_8_8_init, \
3120    pixman_composite_add_n_8_8_cleanup, \
3121    pixman_composite_add_n_8_8_process_pixblock_head, \
3122    pixman_composite_add_n_8_8_process_pixblock_tail, \
3123    pixman_composite_add_n_8_8_process_pixblock_tail_head
3124
3125/******************************************************************************/
3126
3127.macro pixman_composite_add_8_8_8_process_pixblock_head
3128    /* expecting source data in {d0, d1, d2, d3} */
3129    /* destination data in {d4, d5, d6, d7} */
3130    /* mask in {d24, d25, d26, d27} */
3131    vmull.u8    q8, d24, d0
3132    vmull.u8    q9, d25, d1
3133    vmull.u8    q10, d26, d2
3134    vmull.u8    q11, d27, d3
3135    vrshr.u16   q0, q8, #8
3136    vrshr.u16   q1, q9, #8
3137    vrshr.u16   q12, q10, #8
3138    vrshr.u16   q13, q11, #8
3139    vraddhn.u16 d0, q0, q8
3140    vraddhn.u16 d1, q1, q9
3141    vraddhn.u16 d2, q12, q10
3142    vraddhn.u16 d3, q13, q11
3143    vqadd.u8    q14, q0, q2
3144    vqadd.u8    q15, q1, q3
3145.endm
3146
3147.macro pixman_composite_add_8_8_8_process_pixblock_tail
3148.endm
3149
3150/* TODO: expand macros and do better instructions scheduling */
3151.macro pixman_composite_add_8_8_8_process_pixblock_tail_head
3152    pixman_composite_add_8_8_8_process_pixblock_tail
3153    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
3154    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
3155    fetch_mask_pixblock
3156    fetch_src_pixblock
3157    cache_preload 32, 32
3158    pixman_composite_add_8_8_8_process_pixblock_head
3159.endm
3160
3161.macro pixman_composite_add_8_8_8_init
3162.endm
3163
3164.macro pixman_composite_add_8_8_8_cleanup
3165.endm
3166
3167generate_composite_function \
3168    pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
3169    FLAG_DST_READWRITE, \
3170    32, /* number of pixels, processed in a single block */ \
3171    5, /* prefetch distance */ \
3172    pixman_composite_add_8_8_8_init, \
3173    pixman_composite_add_8_8_8_cleanup, \
3174    pixman_composite_add_8_8_8_process_pixblock_head, \
3175    pixman_composite_add_8_8_8_process_pixblock_tail, \
3176    pixman_composite_add_8_8_8_process_pixblock_tail_head
3177
3178/******************************************************************************/
3179
3180.macro pixman_composite_add_8888_8888_8888_process_pixblock_head
3181    /* expecting source data in {d0, d1, d2, d3} */
3182    /* destination data in {d4, d5, d6, d7} */
3183    /* mask in {d24, d25, d26, d27} */
3184    vmull.u8    q8,  d27, d0
3185    vmull.u8    q9,  d27, d1
3186    vmull.u8    q10, d27, d2
3187    vmull.u8    q11, d27, d3
3188    /* 1 cycle bubble */
3189    vrsra.u16   q8,  q8,  #8
3190    vrsra.u16   q9,  q9,  #8
3191    vrsra.u16   q10, q10, #8
3192    vrsra.u16   q11, q11, #8
3193.endm
3194
3195.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
3196    /* 2 cycle bubble */
3197    vrshrn.u16  d28, q8,  #8
3198    vrshrn.u16  d29, q9,  #8
3199    vrshrn.u16  d30, q10, #8
3200    vrshrn.u16  d31, q11, #8
3201    vqadd.u8    q14, q2,  q14
3202    /* 1 cycle bubble */
3203    vqadd.u8    q15, q3,  q15
3204.endm
3205
3206.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
3207    fetch_src_pixblock
3208        vrshrn.u16  d28, q8,  #8
3209    fetch_mask_pixblock
3210        vrshrn.u16  d29, q9,  #8
3211    vmull.u8    q8,  d27, d0
3212        vrshrn.u16  d30, q10, #8
3213    vmull.u8    q9,  d27, d1
3214        vrshrn.u16  d31, q11, #8
3215    vmull.u8    q10, d27, d2
3216        vqadd.u8    q14, q2,  q14
3217    vmull.u8    q11, d27, d3
3218        vqadd.u8    q15, q3,  q15
3219    vrsra.u16   q8,  q8,  #8
3220    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
3221    vrsra.u16   q9,  q9,  #8
3222        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
3223    vrsra.u16   q10, q10, #8
3224
3225    cache_preload 8, 8
3226
3227    vrsra.u16   q11, q11, #8
3228.endm
3229
3230generate_composite_function \
3231    pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
3232    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
3233    8, /* number of pixels, processed in a single block */ \
3234    10, /* prefetch distance */ \
3235    default_init, \
3236    default_cleanup, \
3237    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
3238    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
3239    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
3240
3241generate_composite_function_single_scanline \
3242    pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
3243    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
3244    8, /* number of pixels, processed in a single block */ \
3245    default_init, \
3246    default_cleanup, \
3247    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
3248    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
3249    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
3250
3251/******************************************************************************/
3252
3253generate_composite_function \
3254    pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
3255    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
3256    8, /* number of pixels, processed in a single block */ \
3257    5, /* prefetch distance */ \
3258    default_init, \
3259    default_cleanup, \
3260    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
3261    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
3262    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
3263    28, /* dst_w_basereg */ \
3264    4,  /* dst_r_basereg */ \
3265    0,  /* src_basereg   */ \
3266    27  /* mask_basereg  */
3267
3268/******************************************************************************/
3269
3270.macro pixman_composite_add_n_8_8888_init
3271    add         DUMMY, sp, #ARGS_STACK_OFFSET
3272    vld1.32     {d3[0]}, [DUMMY]
3273    vdup.8      d0, d3[0]
3274    vdup.8      d1, d3[1]
3275    vdup.8      d2, d3[2]
3276    vdup.8      d3, d3[3]
3277.endm
3278
3279.macro pixman_composite_add_n_8_8888_cleanup
3280.endm
3281
3282generate_composite_function \
3283    pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
3284    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
3285    8, /* number of pixels, processed in a single block */ \
3286    5, /* prefetch distance */ \
3287    pixman_composite_add_n_8_8888_init, \
3288    pixman_composite_add_n_8_8888_cleanup, \
3289    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
3290    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
3291    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
3292    28, /* dst_w_basereg */ \
3293    4,  /* dst_r_basereg */ \
3294    0,  /* src_basereg   */ \
3295    27  /* mask_basereg  */
3296
3297/******************************************************************************/
3298
3299.macro pixman_composite_add_8888_n_8888_init
3300    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
3301    vld1.32     {d27[0]}, [DUMMY]
3302    vdup.8      d27, d27[3]
3303.endm
3304
3305.macro pixman_composite_add_8888_n_8888_cleanup
3306.endm
3307
3308generate_composite_function \
3309    pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
3310    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
3311    8, /* number of pixels, processed in a single block */ \
3312    5, /* prefetch distance */ \
3313    pixman_composite_add_8888_n_8888_init, \
3314    pixman_composite_add_8888_n_8888_cleanup, \
3315    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
3316    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
3317    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
3318    28, /* dst_w_basereg */ \
3319    4,  /* dst_r_basereg */ \
3320    0,  /* src_basereg   */ \
3321    27  /* mask_basereg  */
3322
3323/******************************************************************************/
3324
3325.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
3326    /* expecting source data in {d0, d1, d2, d3} */
3327    /* destination data in {d4, d5, d6, d7} */
3328    /* solid mask is in d15 */
3329
3330    /* 'in' */
3331    vmull.u8    q8, d15, d3
3332    vmull.u8    q6, d15, d2
3333    vmull.u8    q5, d15, d1
3334    vmull.u8    q4, d15, d0
3335    vrshr.u16   q13, q8, #8
3336    vrshr.u16   q12, q6, #8
3337    vrshr.u16   q11, q5, #8
3338    vrshr.u16   q10, q4, #8
3339    vraddhn.u16 d3, q8, q13
3340    vraddhn.u16 d2, q6, q12
3341    vraddhn.u16 d1, q5, q11
3342    vraddhn.u16 d0, q4, q10
3343    vmvn.8      d24, d3  /* get inverted alpha */
3344    /* now do alpha blending */
3345    vmull.u8    q8, d24, d4
3346    vmull.u8    q9, d24, d5
3347    vmull.u8    q10, d24, d6
3348    vmull.u8    q11, d24, d7
3349.endm
3350
3351.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
3352    vrshr.u16   q14, q8, #8
3353    vrshr.u16   q15, q9, #8
3354    vrshr.u16   q12, q10, #8
3355    vrshr.u16   q13, q11, #8
3356    vraddhn.u16 d28, q14, q8
3357    vraddhn.u16 d29, q15, q9
3358    vraddhn.u16 d30, q12, q10
3359    vraddhn.u16 d31, q13, q11
3360.endm
3361
3362/* TODO: expand macros and do better instructions scheduling */
3363.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
3364    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
3365    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
3366    fetch_src_pixblock
3367    cache_preload 8, 8
3368    fetch_mask_pixblock
3369    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
3370    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
3371.endm
3372
3373generate_composite_function_single_scanline \
3374    pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
3375    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
3376    8, /* number of pixels, processed in a single block */ \
3377    default_init_need_all_regs, \
3378    default_cleanup_need_all_regs, \
3379    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
3380    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
3381    pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \
3382    28, /* dst_w_basereg */ \
3383    4,  /* dst_r_basereg */ \
3384    0,  /* src_basereg   */ \
3385    12  /* mask_basereg  */
3386
3387/******************************************************************************/
3388
3389.macro pixman_composite_over_8888_n_8888_process_pixblock_head
3390    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
3391.endm
3392
3393.macro pixman_composite_over_8888_n_8888_process_pixblock_tail
3394    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
3395    vqadd.u8    q14, q0, q14
3396    vqadd.u8    q15, q1, q15
3397.endm
3398
3399/* TODO: expand macros and do better instructions scheduling */
3400.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
3401    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
3402    pixman_composite_over_8888_n_8888_process_pixblock_tail
3403    fetch_src_pixblock
3404    cache_preload 8, 8
3405    pixman_composite_over_8888_n_8888_process_pixblock_head
3406    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
3407.endm
3408
3409.macro pixman_composite_over_8888_n_8888_init
3410    add         DUMMY, sp, #48
3411    vpush       {d8-d15}
3412    vld1.32     {d15[0]}, [DUMMY]
3413    vdup.8      d15, d15[3]
3414.endm
3415
3416.macro pixman_composite_over_8888_n_8888_cleanup
3417    vpop        {d8-d15}
3418.endm
3419
3420generate_composite_function \
3421    pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
3422    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
3423    8, /* number of pixels, processed in a single block */ \
3424    5, /* prefetch distance */ \
3425    pixman_composite_over_8888_n_8888_init, \
3426    pixman_composite_over_8888_n_8888_cleanup, \
3427    pixman_composite_over_8888_n_8888_process_pixblock_head, \
3428    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
3429    pixman_composite_over_8888_n_8888_process_pixblock_tail_head
3430
3431/******************************************************************************/
3432
3433/* TODO: expand macros and do better instructions scheduling */
3434.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
3435    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
3436    pixman_composite_over_8888_n_8888_process_pixblock_tail
3437    fetch_src_pixblock
3438    cache_preload 8, 8
3439    fetch_mask_pixblock
3440    pixman_composite_over_8888_n_8888_process_pixblock_head
3441    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
3442.endm
3443
3444generate_composite_function \
3445    pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
3446    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
3447    8, /* number of pixels, processed in a single block */ \
3448    5, /* prefetch distance */ \
3449    default_init_need_all_regs, \
3450    default_cleanup_need_all_regs, \
3451    pixman_composite_over_8888_n_8888_process_pixblock_head, \
3452    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
3453    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
3454    28, /* dst_w_basereg */ \
3455    4,  /* dst_r_basereg */ \
3456    0,  /* src_basereg   */ \
3457    12  /* mask_basereg  */
3458
3459generate_composite_function_single_scanline \
3460    pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
3461    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
3462    8, /* number of pixels, processed in a single block */ \
3463    default_init_need_all_regs, \
3464    default_cleanup_need_all_regs, \
3465    pixman_composite_over_8888_n_8888_process_pixblock_head, \
3466    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
3467    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
3468    28, /* dst_w_basereg */ \
3469    4,  /* dst_r_basereg */ \
3470    0,  /* src_basereg   */ \
3471    12  /* mask_basereg  */
3472
3473/******************************************************************************/
3474
3475/* TODO: expand macros and do better instructions scheduling */
3476.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
3477    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
3478    pixman_composite_over_8888_n_8888_process_pixblock_tail
3479    fetch_src_pixblock
3480    cache_preload 8, 8
3481    fetch_mask_pixblock
3482    pixman_composite_over_8888_n_8888_process_pixblock_head
3483    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
3484.endm
3485
3486generate_composite_function \
3487    pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
3488    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
3489    8, /* number of pixels, processed in a single block */ \
3490    5, /* prefetch distance */ \
3491    default_init_need_all_regs, \
3492    default_cleanup_need_all_regs, \
3493    pixman_composite_over_8888_n_8888_process_pixblock_head, \
3494    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
3495    pixman_composite_over_8888_8_8888_process_pixblock_tail_head \
3496    28, /* dst_w_basereg */ \
3497    4,  /* dst_r_basereg */ \
3498    0,  /* src_basereg   */ \
3499    15  /* mask_basereg  */
3500
3501/******************************************************************************/
3502
3503.macro pixman_composite_src_0888_0888_process_pixblock_head
3504.endm
3505
3506.macro pixman_composite_src_0888_0888_process_pixblock_tail
3507.endm
3508
3509.macro pixman_composite_src_0888_0888_process_pixblock_tail_head
3510    vst3.8 {d0, d1, d2}, [DST_W]!
3511    fetch_src_pixblock
3512    cache_preload 8, 8
3513.endm
3514
3515generate_composite_function \
3516    pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
3517    FLAG_DST_WRITEONLY, \
3518    8, /* number of pixels, processed in a single block */ \
3519    10, /* prefetch distance */ \
3520    default_init, \
3521    default_cleanup, \
3522    pixman_composite_src_0888_0888_process_pixblock_head, \
3523    pixman_composite_src_0888_0888_process_pixblock_tail, \
3524    pixman_composite_src_0888_0888_process_pixblock_tail_head, \
3525    0, /* dst_w_basereg */ \
3526    0, /* dst_r_basereg */ \
3527    0, /* src_basereg   */ \
3528    0  /* mask_basereg  */
3529
3530/******************************************************************************/
3531
3532.macro pixman_composite_src_0888_8888_rev_process_pixblock_head
3533    vswp   d0, d2
3534.endm
3535
3536.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
3537.endm
3538
3539.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
3540    vst4.8 {d0, d1, d2, d3}, [DST_W]!
3541    fetch_src_pixblock
3542    vswp   d0, d2
3543    cache_preload 8, 8
3544.endm
3545
3546.macro pixman_composite_src_0888_8888_rev_init
3547    veor   d3, d3, d3
3548.endm
3549
3550generate_composite_function \
3551    pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
3552    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
3553    8, /* number of pixels, processed in a single block */ \
3554    10, /* prefetch distance */ \
3555    pixman_composite_src_0888_8888_rev_init, \
3556    default_cleanup, \
3557    pixman_composite_src_0888_8888_rev_process_pixblock_head, \
3558    pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
3559    pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
3560    0, /* dst_w_basereg */ \
3561    0, /* dst_r_basereg */ \
3562    0, /* src_basereg   */ \
3563    0  /* mask_basereg  */
3564
3565/******************************************************************************/
3566
3567.macro pixman_composite_src_0888_0565_rev_process_pixblock_head
3568    vshll.u8    q8, d1, #8
3569    vshll.u8    q9, d2, #8
3570.endm
3571
3572.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
3573    vshll.u8    q14, d0, #8
3574    vsri.u16    q14, q8, #5
3575    vsri.u16    q14, q9, #11
3576.endm
3577
3578.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
3579        vshll.u8    q14, d0, #8
3580    fetch_src_pixblock
3581        vsri.u16    q14, q8, #5
3582        vsri.u16    q14, q9, #11
3583    vshll.u8    q8, d1, #8
3584        vst1.16 {d28, d29}, [DST_W, :128]!
3585    vshll.u8    q9, d2, #8
3586.endm
3587
3588generate_composite_function \
3589    pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
3590    FLAG_DST_WRITEONLY, \
3591    8, /* number of pixels, processed in a single block */ \
3592    10, /* prefetch distance */ \
3593    default_init, \
3594    default_cleanup, \
3595    pixman_composite_src_0888_0565_rev_process_pixblock_head, \
3596    pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
3597    pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
3598    28, /* dst_w_basereg */ \
3599    0, /* dst_r_basereg */ \
3600    0, /* src_basereg   */ \
3601    0  /* mask_basereg  */
3602
3603/******************************************************************************/
3604
3605.macro pixman_composite_src_pixbuf_8888_process_pixblock_head
3606    vmull.u8    q8, d3, d0
3607    vmull.u8    q9, d3, d1
3608    vmull.u8    q10, d3, d2
3609.endm
3610
3611.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
3612    vrshr.u16   q11, q8, #8
3613    vswp        d3, d31
3614    vrshr.u16   q12, q9, #8
3615    vrshr.u16   q13, q10, #8
3616    vraddhn.u16 d30, q11, q8
3617    vraddhn.u16 d29, q12, q9
3618    vraddhn.u16 d28, q13, q10
3619.endm
3620
3621.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
3622        vrshr.u16   q11, q8, #8
3623        vswp        d3, d31
3624        vrshr.u16   q12, q9, #8
3625        vrshr.u16   q13, q10, #8
3626    fetch_src_pixblock
3627        vraddhn.u16 d30, q11, q8
3628                                    PF add PF_X, PF_X, #8
3629                                    PF tst PF_CTL, #0xF
3630                                    PF addne PF_X, PF_X, #8
3631                                    PF subne PF_CTL, PF_CTL, #1
3632        vraddhn.u16 d29, q12, q9
3633        vraddhn.u16 d28, q13, q10
3634    vmull.u8    q8, d3, d0
3635    vmull.u8    q9, d3, d1
3636    vmull.u8    q10, d3, d2
3637        vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
3638                                    PF cmp PF_X, ORIG_W
3639                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
3640                                    PF subge PF_X, PF_X, ORIG_W
3641                                    PF subges PF_CTL, PF_CTL, #0x10
3642                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
3643.endm
3644
3645generate_composite_function \
3646    pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
3647    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
3648    8, /* number of pixels, processed in a single block */ \
3649    10, /* prefetch distance */ \
3650    default_init, \
3651    default_cleanup, \
3652    pixman_composite_src_pixbuf_8888_process_pixblock_head, \
3653    pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
3654    pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
3655    28, /* dst_w_basereg */ \
3656    0, /* dst_r_basereg */ \
3657    0, /* src_basereg   */ \
3658    0  /* mask_basereg  */
3659
3660/******************************************************************************/
3661
3662.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
3663    vmull.u8    q8, d3, d0
3664    vmull.u8    q9, d3, d1
3665    vmull.u8    q10, d3, d2
3666.endm
3667
3668.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
3669    vrshr.u16   q11, q8, #8
3670    vswp        d3, d31
3671    vrshr.u16   q12, q9, #8
3672    vrshr.u16   q13, q10, #8
3673    vraddhn.u16 d28, q11, q8
3674    vraddhn.u16 d29, q12, q9
3675    vraddhn.u16 d30, q13, q10
3676.endm
3677
3678.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
3679        vrshr.u16   q11, q8, #8
3680        vswp        d3, d31
3681        vrshr.u16   q12, q9, #8
3682        vrshr.u16   q13, q10, #8
3683    fetch_src_pixblock
3684        vraddhn.u16 d28, q11, q8
3685                                    PF add PF_X, PF_X, #8
3686                                    PF tst PF_CTL, #0xF
3687                                    PF addne PF_X, PF_X, #8
3688                                    PF subne PF_CTL, PF_CTL, #1
3689        vraddhn.u16 d29, q12, q9
3690        vraddhn.u16 d30, q13, q10
3691    vmull.u8    q8, d3, d0
3692    vmull.u8    q9, d3, d1
3693    vmull.u8    q10, d3, d2
3694        vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
3695                                    PF cmp PF_X, ORIG_W
3696                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
3697                                    PF subge PF_X, PF_X, ORIG_W
3698                                    PF subges PF_CTL, PF_CTL, #0x10
3699                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
3700.endm
3701
3702generate_composite_function \
3703    pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
3704    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
3705    8, /* number of pixels, processed in a single block */ \
3706    10, /* prefetch distance */ \
3707    default_init, \
3708    default_cleanup, \
3709    pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
3710    pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
3711    pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
3712    28, /* dst_w_basereg */ \
3713    0, /* dst_r_basereg */ \
3714    0, /* src_basereg   */ \
3715    0  /* mask_basereg  */
3716
3717/******************************************************************************/
3718
3719.macro pixman_composite_over_0565_8_0565_process_pixblock_head
3720    /* mask is in d15 */
3721    convert_0565_to_x888 q4, d2, d1, d0
3722    convert_0565_to_x888 q5, d6, d5, d4
3723    /* source pixel data is in      {d0, d1, d2, XX} */
3724    /* destination pixel data is in {d4, d5, d6, XX} */
3725    vmvn.8      d7,  d15
3726    vmull.u8    q6,  d15, d2
3727    vmull.u8    q5,  d15, d1
3728    vmull.u8    q4,  d15, d0
3729    vmull.u8    q8,  d7,  d4
3730    vmull.u8    q9,  d7,  d5
3731    vmull.u8    q13, d7,  d6
3732    vrshr.u16   q12, q6,  #8
3733    vrshr.u16   q11, q5,  #8
3734    vrshr.u16   q10, q4,  #8
3735    vraddhn.u16 d2,  q6,  q12
3736    vraddhn.u16 d1,  q5,  q11
3737    vraddhn.u16 d0,  q4,  q10
3738.endm
3739
3740.macro pixman_composite_over_0565_8_0565_process_pixblock_tail
3741    vrshr.u16   q14, q8,  #8
3742    vrshr.u16   q15, q9,  #8
3743    vrshr.u16   q12, q13, #8
3744    vraddhn.u16 d28, q14, q8
3745    vraddhn.u16 d29, q15, q9
3746    vraddhn.u16 d30, q12, q13
3747    vqadd.u8    q0,  q0,  q14
3748    vqadd.u8    q1,  q1,  q15
3749    /* 32bpp result is in {d0, d1, d2, XX} */
3750    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
3751.endm
3752
3753/* TODO: expand macros and do better instructions scheduling */
3754.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
3755    fetch_mask_pixblock
3756    pixman_composite_over_0565_8_0565_process_pixblock_tail
3757    fetch_src_pixblock
3758    vld1.16    {d10, d11}, [DST_R, :128]!
3759    cache_preload 8, 8
3760    pixman_composite_over_0565_8_0565_process_pixblock_head
3761    vst1.16    {d28, d29}, [DST_W, :128]!
3762.endm
3763
3764generate_composite_function \
3765    pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
3766    FLAG_DST_READWRITE, \
3767    8, /* number of pixels, processed in a single block */ \
3768    5, /* prefetch distance */ \
3769    default_init_need_all_regs, \
3770    default_cleanup_need_all_regs, \
3771    pixman_composite_over_0565_8_0565_process_pixblock_head, \
3772    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
3773    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
3774    28, /* dst_w_basereg */ \
3775    10,  /* dst_r_basereg */ \
3776    8,  /* src_basereg   */ \
3777    15  /* mask_basereg  */
3778
3779/******************************************************************************/
3780
3781.macro pixman_composite_over_0565_n_0565_init
3782    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
3783    vpush       {d8-d15}
3784    vld1.32     {d15[0]}, [DUMMY]
3785    vdup.8      d15, d15[3]
3786.endm
3787
3788.macro pixman_composite_over_0565_n_0565_cleanup
3789    vpop        {d8-d15}
3790.endm
3791
3792generate_composite_function \
3793    pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
3794    FLAG_DST_READWRITE, \
3795    8, /* number of pixels, processed in a single block */ \
3796    5, /* prefetch distance */ \
3797    pixman_composite_over_0565_n_0565_init, \
3798    pixman_composite_over_0565_n_0565_cleanup, \
3799    pixman_composite_over_0565_8_0565_process_pixblock_head, \
3800    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
3801    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
3802    28, /* dst_w_basereg */ \
3803    10, /* dst_r_basereg */ \
3804    8,  /* src_basereg   */ \
3805    15  /* mask_basereg  */
3806
3807/******************************************************************************/
3808
3809.macro pixman_composite_add_0565_8_0565_process_pixblock_head
3810    /* mask is in d15 */
3811    convert_0565_to_x888 q4, d2, d1, d0
3812    convert_0565_to_x888 q5, d6, d5, d4
3813    /* source pixel data is in      {d0, d1, d2, XX} */
3814    /* destination pixel data is in {d4, d5, d6, XX} */
3815    vmull.u8    q6,  d15, d2
3816    vmull.u8    q5,  d15, d1
3817    vmull.u8    q4,  d15, d0
3818    vrshr.u16   q12, q6,  #8
3819    vrshr.u16   q11, q5,  #8
3820    vrshr.u16   q10, q4,  #8
3821    vraddhn.u16 d2,  q6,  q12
3822    vraddhn.u16 d1,  q5,  q11
3823    vraddhn.u16 d0,  q4,  q10
3824.endm
3825
3826.macro pixman_composite_add_0565_8_0565_process_pixblock_tail
3827    vqadd.u8    q0,  q0,  q2
3828    vqadd.u8    q1,  q1,  q3
3829    /* 32bpp result is in {d0, d1, d2, XX} */
3830    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
3831.endm
3832
3833/* TODO: expand macros and do better instructions scheduling */
3834.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
3835    fetch_mask_pixblock
3836    pixman_composite_add_0565_8_0565_process_pixblock_tail
3837    fetch_src_pixblock
3838    vld1.16    {d10, d11}, [DST_R, :128]!
3839    cache_preload 8, 8
3840    pixman_composite_add_0565_8_0565_process_pixblock_head
3841    vst1.16    {d28, d29}, [DST_W, :128]!
3842.endm
3843
3844generate_composite_function \
3845    pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
3846    FLAG_DST_READWRITE, \
3847    8, /* number of pixels, processed in a single block */ \
3848    5, /* prefetch distance */ \
3849    default_init_need_all_regs, \
3850    default_cleanup_need_all_regs, \
3851    pixman_composite_add_0565_8_0565_process_pixblock_head, \
3852    pixman_composite_add_0565_8_0565_process_pixblock_tail, \
3853    pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
3854    28, /* dst_w_basereg */ \
3855    10, /* dst_r_basereg */ \
3856    8,  /* src_basereg   */ \
3857    15  /* mask_basereg  */
3858
3859/******************************************************************************/
3860
3861.macro pixman_composite_out_reverse_8_0565_process_pixblock_head
3862    /* mask is in d15 */
3863    convert_0565_to_x888 q5, d6, d5, d4
3864    /* destination pixel data is in {d4, d5, d6, xx} */
3865    vmvn.8      d24, d15 /* get inverted alpha */
3866    /* now do alpha blending */
3867    vmull.u8    q8, d24, d4
3868    vmull.u8    q9, d24, d5
3869    vmull.u8    q10, d24, d6
3870.endm
3871
3872.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
3873    vrshr.u16   q14, q8, #8
3874    vrshr.u16   q15, q9, #8
3875    vrshr.u16   q12, q10, #8
3876    vraddhn.u16 d0, q14, q8
3877    vraddhn.u16 d1, q15, q9
3878    vraddhn.u16 d2, q12, q10
3879    /* 32bpp result is in {d0, d1, d2, XX} */
3880    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
3881.endm
3882
3883/* TODO: expand macros and do better instructions scheduling */
3884.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
3885    fetch_src_pixblock
3886    pixman_composite_out_reverse_8_0565_process_pixblock_tail
3887    vld1.16    {d10, d11}, [DST_R, :128]!
3888    cache_preload 8, 8
3889    pixman_composite_out_reverse_8_0565_process_pixblock_head
3890    vst1.16    {d28, d29}, [DST_W, :128]!
3891.endm
3892
3893generate_composite_function \
3894    pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
3895    FLAG_DST_READWRITE, \
3896    8, /* number of pixels, processed in a single block */ \
3897    5, /* prefetch distance */ \
3898    default_init_need_all_regs, \
3899    default_cleanup_need_all_regs, \
3900    pixman_composite_out_reverse_8_0565_process_pixblock_head, \
3901    pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
3902    pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
3903    28, /* dst_w_basereg */ \
3904    10, /* dst_r_basereg */ \
3905    15, /* src_basereg   */ \
3906    0   /* mask_basereg  */
3907
3908/******************************************************************************/
3909
3910.macro pixman_composite_out_reverse_8_8888_process_pixblock_head
3911    /* src is in d0 */
3912    /* destination pixel data is in {d4, d5, d6, d7} */
3913    vmvn.8      d1, d0 /* get inverted alpha */
3914    /* now do alpha blending */
3915    vmull.u8    q8, d1, d4
3916    vmull.u8    q9, d1, d5
3917    vmull.u8    q10, d1, d6
3918    vmull.u8    q11, d1, d7
3919.endm
3920
3921.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail
3922    vrshr.u16   q14, q8, #8
3923    vrshr.u16   q15, q9, #8
3924    vrshr.u16   q12, q10, #8
3925    vrshr.u16   q13, q11, #8
3926    vraddhn.u16 d28, q14, q8
3927    vraddhn.u16 d29, q15, q9
3928    vraddhn.u16 d30, q12, q10
3929    vraddhn.u16 d31, q13, q11
3930    /* 32bpp result is in {d28, d29, d30, d31} */
3931.endm
3932
3933/* TODO: expand macros and do better instructions scheduling */
3934.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head
3935    fetch_src_pixblock
3936    pixman_composite_out_reverse_8_8888_process_pixblock_tail
3937    vld4.8    {d4, d5, d6, d7}, [DST_R, :128]!
3938    cache_preload 8, 8
3939    pixman_composite_out_reverse_8_8888_process_pixblock_head
3940    vst4.8    {d28, d29, d30, d31}, [DST_W, :128]!
3941.endm
3942
3943generate_composite_function \
3944    pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \
3945    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
3946    8, /* number of pixels, processed in a single block */ \
3947    5, /* prefetch distance */ \
3948    default_init, \
3949    default_cleanup, \
3950    pixman_composite_out_reverse_8_8888_process_pixblock_head, \
3951    pixman_composite_out_reverse_8_8888_process_pixblock_tail, \
3952    pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \
3953    28, /* dst_w_basereg */ \
3954    4, /* dst_r_basereg */ \
3955    0, /* src_basereg   */ \
3956    0   /* mask_basereg  */
3957
3958/******************************************************************************/
3959
3960generate_composite_function_nearest_scanline \
3961    pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
3962    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
3963    8, /* number of pixels, processed in a single block */ \
3964    default_init, \
3965    default_cleanup, \
3966    pixman_composite_over_8888_8888_process_pixblock_head, \
3967    pixman_composite_over_8888_8888_process_pixblock_tail, \
3968    pixman_composite_over_8888_8888_process_pixblock_tail_head
3969
3970generate_composite_function_nearest_scanline \
3971    pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
3972    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
3973    8, /* number of pixels, processed in a single block */ \
3974    default_init, \
3975    default_cleanup, \
3976    pixman_composite_over_8888_0565_process_pixblock_head, \
3977    pixman_composite_over_8888_0565_process_pixblock_tail, \
3978    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
3979    28, /* dst_w_basereg */ \
3980    4,  /* dst_r_basereg */ \
3981    0,  /* src_basereg   */ \
3982    24  /* mask_basereg  */
3983
3984generate_composite_function_nearest_scanline \
3985    pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
3986    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
3987    8, /* number of pixels, processed in a single block */ \
3988    default_init, \
3989    default_cleanup, \
3990    pixman_composite_src_8888_0565_process_pixblock_head, \
3991    pixman_composite_src_8888_0565_process_pixblock_tail, \
3992    pixman_composite_src_8888_0565_process_pixblock_tail_head
3993
3994generate_composite_function_nearest_scanline \
3995    pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
3996    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
3997    8, /* number of pixels, processed in a single block */ \
3998    default_init, \
3999    default_cleanup, \
4000    pixman_composite_src_0565_8888_process_pixblock_head, \
4001    pixman_composite_src_0565_8888_process_pixblock_tail, \
4002    pixman_composite_src_0565_8888_process_pixblock_tail_head
4003
4004generate_composite_function_nearest_scanline \
4005    pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
4006    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
4007    8, /* number of pixels, processed in a single block */ \
4008    default_init_need_all_regs, \
4009    default_cleanup_need_all_regs, \
4010    pixman_composite_over_8888_8_0565_process_pixblock_head, \
4011    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
4012    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
4013    28, /* dst_w_basereg */ \
4014    4,  /* dst_r_basereg */ \
4015    8,  /* src_basereg   */ \
4016    24  /* mask_basereg  */
4017
4018generate_composite_function_nearest_scanline \
4019    pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
4020    FLAG_DST_READWRITE, \
4021    8, /* number of pixels, processed in a single block */ \
4022    default_init_need_all_regs, \
4023    default_cleanup_need_all_regs, \
4024    pixman_composite_over_0565_8_0565_process_pixblock_head, \
4025    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
4026    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
4027    28, /* dst_w_basereg */ \
4028    10,  /* dst_r_basereg */ \
4029    8,  /* src_basereg   */ \
4030    15  /* mask_basereg  */
4031
4032/******************************************************************************/
4033
4034/*
4035 * Bilinear scaling support code which tries to provide pixel fetching, color
4036 * format conversion, and interpolation as separate macros which can be used
4037 * as the basic building blocks for constructing bilinear scanline functions.
4038 */
4039
4040.macro bilinear_load_8888 reg1, reg2, tmp
4041    mov       TMP1, X, asr #16
4042    add       X, X, UX
4043    add       TMP1, TOP, TMP1, asl #2
4044    vld1.32   {reg1}, [TMP1], STRIDE
4045    vld1.32   {reg2}, [TMP1]
4046.endm
4047
4048.macro bilinear_load_0565 reg1, reg2, tmp
4049    mov       TMP1, X, asr #16
4050    add       X, X, UX
4051    add       TMP1, TOP, TMP1, asl #1
4052    vld1.32   {reg2[0]}, [TMP1], STRIDE
4053    vld1.32   {reg2[1]}, [TMP1]
4054    convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
4055.endm
4056
4057.macro bilinear_load_and_vertical_interpolate_two_8888 \
4058                    acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
4059
4060    bilinear_load_8888 reg1, reg2, tmp1
4061    vmull.u8  acc1, reg1, d28
4062    vmlal.u8  acc1, reg2, d29
4063    bilinear_load_8888 reg3, reg4, tmp2
4064    vmull.u8  acc2, reg3, d28
4065    vmlal.u8  acc2, reg4, d29
4066.endm
4067
4068.macro bilinear_load_and_vertical_interpolate_four_8888 \
4069                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
4070                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
4071
4072    bilinear_load_and_vertical_interpolate_two_8888 \
4073                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
4074    bilinear_load_and_vertical_interpolate_two_8888 \
4075                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
4076.endm
4077
4078.macro bilinear_load_and_vertical_interpolate_two_0565 \
4079                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
4080
4081    mov       TMP1, X, asr #16
4082    add       X, X, UX
4083    add       TMP1, TOP, TMP1, asl #1
4084    mov       TMP2, X, asr #16
4085    add       X, X, UX
4086    add       TMP2, TOP, TMP2, asl #1
4087    vld1.32   {acc2lo[0]}, [TMP1], STRIDE
4088    vld1.32   {acc2hi[0]}, [TMP2], STRIDE
4089    vld1.32   {acc2lo[1]}, [TMP1]
4090    vld1.32   {acc2hi[1]}, [TMP2]
4091    convert_0565_to_x888 acc2, reg3, reg2, reg1
4092    vzip.u8   reg1, reg3
4093    vzip.u8   reg2, reg4
4094    vzip.u8   reg3, reg4
4095    vzip.u8   reg1, reg2
4096    vmull.u8  acc1, reg1, d28
4097    vmlal.u8  acc1, reg2, d29
4098    vmull.u8  acc2, reg3, d28
4099    vmlal.u8  acc2, reg4, d29
4100.endm
4101
4102.macro bilinear_load_and_vertical_interpolate_four_0565 \
4103                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
4104                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
4105
4106    mov       TMP1, X, asr #16
4107    add       X, X, UX
4108    add       TMP1, TOP, TMP1, asl #1
4109    mov       TMP2, X, asr #16
4110    add       X, X, UX
4111    add       TMP2, TOP, TMP2, asl #1
4112    vld1.32   {xacc2lo[0]}, [TMP1], STRIDE
4113    vld1.32   {xacc2hi[0]}, [TMP2], STRIDE
4114    vld1.32   {xacc2lo[1]}, [TMP1]
4115    vld1.32   {xacc2hi[1]}, [TMP2]
4116    convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
4117    mov       TMP1, X, asr #16
4118    add       X, X, UX
4119    add       TMP1, TOP, TMP1, asl #1
4120    mov       TMP2, X, asr #16
4121    add       X, X, UX
4122    add       TMP2, TOP, TMP2, asl #1
4123    vld1.32   {yacc2lo[0]}, [TMP1], STRIDE
4124    vzip.u8   xreg1, xreg3
4125    vld1.32   {yacc2hi[0]}, [TMP2], STRIDE
4126    vzip.u8   xreg2, xreg4
4127    vld1.32   {yacc2lo[1]}, [TMP1]
4128    vzip.u8   xreg3, xreg4
4129    vld1.32   {yacc2hi[1]}, [TMP2]
4130    vzip.u8   xreg1, xreg2
4131    convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
4132    vmull.u8  xacc1, xreg1, d28
4133    vzip.u8   yreg1, yreg3
4134    vmlal.u8  xacc1, xreg2, d29
4135    vzip.u8   yreg2, yreg4
4136    vmull.u8  xacc2, xreg3, d28
4137    vzip.u8   yreg3, yreg4
4138    vmlal.u8  xacc2, xreg4, d29
4139    vzip.u8   yreg1, yreg2
4140    vmull.u8  yacc1, yreg1, d28
4141    vmlal.u8  yacc1, yreg2, d29
4142    vmull.u8  yacc2, yreg3, d28
4143    vmlal.u8  yacc2, yreg4, d29
4144.endm
4145
4146.macro bilinear_store_8888 numpix, tmp1, tmp2
4147.if numpix == 4
4148    vst1.32   {d0, d1}, [OUT, :128]!
4149.elseif numpix == 2
4150    vst1.32   {d0}, [OUT, :64]!
4151.elseif numpix == 1
4152    vst1.32   {d0[0]}, [OUT, :32]!
4153.else
4154    .error bilinear_store_8888 numpix is unsupported
4155.endif
4156.endm
4157
4158.macro bilinear_store_0565 numpix, tmp1, tmp2
4159    vuzp.u8 d0, d1
4160    vuzp.u8 d2, d3
4161    vuzp.u8 d1, d3
4162    vuzp.u8 d0, d2
4163    convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
4164.if numpix == 4
4165    vst1.16   {d2}, [OUT, :64]!
4166.elseif numpix == 2
4167    vst1.32   {d2[0]}, [OUT, :32]!
4168.elseif numpix == 1
4169    vst1.16   {d2[0]}, [OUT, :16]!
4170.else
4171    .error bilinear_store_0565 numpix is unsupported
4172.endif
4173.endm
4174
4175.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
4176    bilinear_load_&src_fmt d0, d1, d2
4177    vmull.u8  q1, d0, d28
4178    vmlal.u8  q1, d1, d29
4179    /* 5 cycles bubble */
4180    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
4181    vmlsl.u16 q0, d2, d30
4182    vmlal.u16 q0, d3, d30
4183    /* 5 cycles bubble */
4184    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
4185    /* 3 cycles bubble */
4186    vmovn.u16 d0, q0
4187    /* 1 cycle bubble */
4188    bilinear_store_&dst_fmt 1, q2, q3
4189.endm
4190
4191.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
4192    bilinear_load_and_vertical_interpolate_two_&src_fmt \
4193                q1, q11, d0, d1, d20, d21, d22, d23
4194    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
4195    vmlsl.u16 q0, d2, d30
4196    vmlal.u16 q0, d3, d30
4197    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
4198    vmlsl.u16 q10, d22, d31
4199    vmlal.u16 q10, d23, d31
4200    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
4201    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
4202    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
4203    vadd.u16  q12, q12, q13
4204    vmovn.u16 d0, q0
4205    bilinear_store_&dst_fmt 2, q2, q3
4206.endm
4207
4208.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
4209    bilinear_load_and_vertical_interpolate_four_&src_fmt \
4210                q1, q11, d0, d1, d20, d21, d22, d23 \
4211                q3, q9,  d4, d5, d16, d17, d18, d19
4212    pld       [TMP1, PF_OFFS]
4213    sub       TMP1, TMP1, STRIDE
4214    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
4215    vmlsl.u16 q0, d2, d30
4216    vmlal.u16 q0, d3, d30
4217    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
4218    vmlsl.u16 q10, d22, d31
4219    vmlal.u16 q10, d23, d31
4220    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
4221    vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
4222    vmlsl.u16 q2, d6, d30
4223    vmlal.u16 q2, d7, d30
4224    vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
4225    pld       [TMP2, PF_OFFS]
4226    vmlsl.u16 q8, d18, d31
4227    vmlal.u16 q8, d19, d31
4228    vadd.u16  q12, q12, q13
4229    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
4230    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
4231    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
4232    vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
4233    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
4234    vmovn.u16 d0, q0
4235    vmovn.u16 d1, q2
4236    vadd.u16  q12, q12, q13
4237    bilinear_store_&dst_fmt 4, q2, q3
4238.endm
4239
4240.macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
4241.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
4242    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
4243.else
4244    bilinear_interpolate_four_pixels src_fmt, dst_fmt
4245.endif
4246.endm
4247
4248.macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
4249.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
4250    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
4251.endif
4252.endm
4253
4254.macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
4255.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
4256    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
4257.else
4258    bilinear_interpolate_four_pixels src_fmt, dst_fmt
4259.endif
4260.endm
4261
4262.macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
4263.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
4264    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
4265.else
4266    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
4267    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
4268.endif
4269.endm
4270
4271.macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
4272.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
4273    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
4274.else
4275    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
4276.endif
4277.endm
4278
4279.macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
4280.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
4281    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
4282.else
4283    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
4284    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
4285.endif
4286.endm
4287
4288.set BILINEAR_FLAG_UNROLL_4,          0
4289.set BILINEAR_FLAG_UNROLL_8,          1
4290.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
4291
4292/*
4293 * Main template macro for generating NEON optimized bilinear scanline
4294 * functions.
4295 *
4296 * Bilinear scanline scaler macro template uses the following arguments:
4297 *  fname             - name of the function to generate
4298 *  src_fmt           - source color format (8888 or 0565)
4299 *  dst_fmt           - destination color format (8888 or 0565)
4300 *  bpp_shift         - (1 << bpp_shift) is the size of source pixel in bytes
4301 *  prefetch_distance - prefetch in the source image by that many
4302 *                      pixels ahead
4303 */
4304
4305.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
4306                                       src_bpp_shift, dst_bpp_shift, \
4307                                       prefetch_distance, flags
4308
4309pixman_asm_function fname
4310    OUT       .req      r0
4311    TOP       .req      r1
4312    BOTTOM    .req      r2
4313    WT        .req      r3
4314    WB        .req      r4
4315    X         .req      r5
4316    UX        .req      r6
4317    WIDTH     .req      ip
4318    TMP1      .req      r3
4319    TMP2      .req      r4
4320    PF_OFFS   .req      r7
4321    TMP3      .req      r8
4322    TMP4      .req      r9
4323    STRIDE    .req      r2
4324
4325    mov       ip, sp
4326    push      {r4, r5, r6, r7, r8, r9}
4327    mov       PF_OFFS, #prefetch_distance
4328    ldmia     ip, {WB, X, UX, WIDTH}
4329    mul       PF_OFFS, PF_OFFS, UX
4330
4331.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
4332    vpush     {d8-d15}
4333.endif
4334
4335    sub       STRIDE, BOTTOM, TOP
4336    .unreq    BOTTOM
4337
4338    cmp       WIDTH, #0
4339    ble       3f
4340
4341    vdup.u16  q12, X
4342    vdup.u16  q13, UX
4343    vdup.u8   d28, WT
4344    vdup.u8   d29, WB
4345    vadd.u16  d25, d25, d26
4346
4347    /* ensure good destination alignment  */
4348    cmp       WIDTH, #1
4349    blt       0f
4350    tst       OUT, #(1 << dst_bpp_shift)
4351    beq       0f
4352    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
4353    vadd.u16  q12, q12, q13
4354    bilinear_interpolate_last_pixel src_fmt, dst_fmt
4355    sub       WIDTH, WIDTH, #1
43560:
4357    vadd.u16  q13, q13, q13
4358    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
4359    vadd.u16  q12, q12, q13
4360
4361    cmp       WIDTH, #2
4362    blt       0f
4363    tst       OUT, #(1 << (dst_bpp_shift + 1))
4364    beq       0f
4365    bilinear_interpolate_two_pixels src_fmt, dst_fmt
4366    sub       WIDTH, WIDTH, #2
43670:
4368.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
4369/*********** 8 pixels per iteration *****************/
4370    cmp       WIDTH, #4
4371    blt       0f
4372    tst       OUT, #(1 << (dst_bpp_shift + 2))
4373    beq       0f
4374    bilinear_interpolate_four_pixels src_fmt, dst_fmt
4375    sub       WIDTH, WIDTH, #4
43760:
4377    subs      WIDTH, WIDTH, #8
4378    blt       1f
4379    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
4380    bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
4381    subs      WIDTH, WIDTH, #8
4382    blt       5f
43830:
4384    bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
4385    subs      WIDTH, WIDTH, #8
4386    bge       0b
43875:
4388    bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
43891:
4390    tst       WIDTH, #4
4391    beq       2f
4392    bilinear_interpolate_four_pixels src_fmt, dst_fmt
43932:
4394.else
4395/*********** 4 pixels per iteration *****************/
4396    subs      WIDTH, WIDTH, #4
4397    blt       1f
4398    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
4399    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
4400    subs      WIDTH, WIDTH, #4
4401    blt       5f
44020:
4403    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
4404    subs      WIDTH, WIDTH, #4
4405    bge       0b
44065:
4407    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
44081:
4409/****************************************************/
4410.endif
4411    /* handle the remaining trailing pixels */
4412    tst       WIDTH, #2
4413    beq       2f
4414    bilinear_interpolate_two_pixels src_fmt, dst_fmt
44152:
4416    tst       WIDTH, #1
4417    beq       3f
4418    bilinear_interpolate_last_pixel src_fmt, dst_fmt
44193:
4420.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
4421    vpop      {d8-d15}
4422.endif
4423    pop       {r4, r5, r6, r7, r8, r9}
4424    bx        lr
4425
4426    .unreq    OUT
4427    .unreq    TOP
4428    .unreq    WT
4429    .unreq    WB
4430    .unreq    X
4431    .unreq    UX
4432    .unreq    WIDTH
4433    .unreq    TMP1
4434    .unreq    TMP2
4435    .unreq    PF_OFFS
4436    .unreq    TMP3
4437    .unreq    TMP4
4438    .unreq    STRIDE
4439.endfunc
4440
4441.endm
4442
4443/*****************************************************************************/
4444
4445.set have_bilinear_interpolate_four_pixels_8888_8888, 1
4446
4447.macro bilinear_interpolate_four_pixels_8888_8888_head
4448    mov       TMP1, X, asr #16
4449    add       X, X, UX
4450    add       TMP1, TOP, TMP1, asl #2
4451    mov       TMP2, X, asr #16
4452    add       X, X, UX
4453    add       TMP2, TOP, TMP2, asl #2
4454
4455    vld1.32   {d22}, [TMP1], STRIDE
4456    vld1.32   {d23}, [TMP1]
4457    mov       TMP3, X, asr #16
4458    add       X, X, UX
4459    add       TMP3, TOP, TMP3, asl #2
4460    vmull.u8  q8, d22, d28
4461    vmlal.u8  q8, d23, d29
4462
4463    vld1.32   {d22}, [TMP2], STRIDE
4464    vld1.32   {d23}, [TMP2]
4465    mov       TMP4, X, asr #16
4466    add       X, X, UX
4467    add       TMP4, TOP, TMP4, asl #2
4468    vmull.u8  q9, d22, d28
4469    vmlal.u8  q9, d23, d29
4470
4471    vld1.32   {d22}, [TMP3], STRIDE
4472    vld1.32   {d23}, [TMP3]
4473    vmull.u8  q10, d22, d28
4474    vmlal.u8  q10, d23, d29
4475
4476    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
4477    vmlsl.u16 q0, d16, d30
4478    vmlal.u16 q0, d17, d30
4479
4480    pld       [TMP4, PF_OFFS]
4481    vld1.32   {d16}, [TMP4], STRIDE
4482    vld1.32   {d17}, [TMP4]
4483    pld       [TMP4, PF_OFFS]
4484    vmull.u8  q11, d16, d28
4485    vmlal.u8  q11, d17, d29
4486
4487    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
4488    vmlsl.u16 q1, d18, d31
4489.endm
4490
4491.macro bilinear_interpolate_four_pixels_8888_8888_tail
4492    vmlal.u16 q1, d19, d31
4493    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
4494    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
4495    vmlsl.u16 q2, d20, d30
4496    vmlal.u16 q2, d21, d30
4497    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
4498    vmlsl.u16 q3, d22, d31
4499    vmlal.u16 q3, d23, d31
4500    vadd.u16  q12, q12, q13
4501    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
4502    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
4503    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
4504    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
4505    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
4506    vmovn.u16 d6, q0
4507    vmovn.u16 d7, q2
4508    vadd.u16  q12, q12, q13
4509    vst1.32   {d6, d7}, [OUT, :128]!
4510.endm
4511
4512.macro bilinear_interpolate_four_pixels_8888_8888_tail_head
4513    mov       TMP1, X, asr #16
4514    add       X, X, UX
4515    add       TMP1, TOP, TMP1, asl #2
4516    mov       TMP2, X, asr #16
4517    add       X, X, UX
4518    add       TMP2, TOP, TMP2, asl #2
4519        vmlal.u16 q1, d19, d31
4520        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
4521        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
4522        vmlsl.u16 q2, d20, d30
4523        vmlal.u16 q2, d21, d30
4524        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
4525    vld1.32   {d20}, [TMP1], STRIDE
4526        vmlsl.u16 q3, d22, d31
4527        vmlal.u16 q3, d23, d31
4528    vld1.32   {d21}, [TMP1]
4529    vmull.u8  q8, d20, d28
4530    vmlal.u8  q8, d21, d29
4531        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
4532        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
4533        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
4534    vld1.32   {d22}, [TMP2], STRIDE
4535        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
4536        vadd.u16  q12, q12, q13
4537    vld1.32   {d23}, [TMP2]
4538    vmull.u8  q9, d22, d28
4539    mov       TMP3, X, asr #16
4540    add       X, X, UX
4541    add       TMP3, TOP, TMP3, asl #2
4542    mov       TMP4, X, asr #16
4543    add       X, X, UX
4544    add       TMP4, TOP, TMP4, asl #2
4545    vmlal.u8  q9, d23, d29
4546    vld1.32   {d22}, [TMP3], STRIDE
4547        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
4548    vld1.32   {d23}, [TMP3]
4549    vmull.u8  q10, d22, d28
4550    vmlal.u8  q10, d23, d29
4551        vmovn.u16 d6, q0
4552    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
4553        vmovn.u16 d7, q2
4554    vmlsl.u16 q0, d16, d30
4555    vmlal.u16 q0, d17, d30
4556    pld       [TMP4, PF_OFFS]
4557    vld1.32   {d16}, [TMP4], STRIDE
4558        vadd.u16  q12, q12, q13
4559    vld1.32   {d17}, [TMP4]
4560    pld       [TMP4, PF_OFFS]
4561    vmull.u8  q11, d16, d28
4562    vmlal.u8  q11, d17, d29
4563        vst1.32   {d6, d7}, [OUT, :128]!
4564    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
4565    vmlsl.u16 q1, d18, d31
4566.endm
4567
4568/*****************************************************************************/
4569
4570.set have_bilinear_interpolate_eight_pixels_8888_0565, 1
4571
4572.macro bilinear_interpolate_eight_pixels_8888_0565_head
4573    mov       TMP1, X, asr #16
4574    add       X, X, UX
4575    add       TMP1, TOP, TMP1, asl #2
4576    mov       TMP2, X, asr #16
4577    add       X, X, UX
4578    add       TMP2, TOP, TMP2, asl #2
4579    vld1.32   {d20}, [TMP1], STRIDE
4580    vld1.32   {d21}, [TMP1]
4581    vmull.u8  q8, d20, d28
4582    vmlal.u8  q8, d21, d29
4583    vld1.32   {d22}, [TMP2], STRIDE
4584    vld1.32   {d23}, [TMP2]
4585    vmull.u8  q9, d22, d28
4586    mov       TMP3, X, asr #16
4587    add       X, X, UX
4588    add       TMP3, TOP, TMP3, asl #2
4589    mov       TMP4, X, asr #16
4590    add       X, X, UX
4591    add       TMP4, TOP, TMP4, asl #2
4592    vmlal.u8  q9, d23, d29
4593    vld1.32   {d22}, [TMP3], STRIDE
4594    vld1.32   {d23}, [TMP3]
4595    vmull.u8  q10, d22, d28
4596    vmlal.u8  q10, d23, d29
4597    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
4598    vmlsl.u16 q0, d16, d30
4599    vmlal.u16 q0, d17, d30
4600    pld       [TMP4, PF_OFFS]
4601    vld1.32   {d16}, [TMP4], STRIDE
4602    vld1.32   {d17}, [TMP4]
4603    pld       [TMP4, PF_OFFS]
4604    vmull.u8  q11, d16, d28
4605    vmlal.u8  q11, d17, d29
4606    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
4607    vmlsl.u16 q1, d18, d31
4608
4609    mov       TMP1, X, asr #16
4610    add       X, X, UX
4611    add       TMP1, TOP, TMP1, asl #2
4612    mov       TMP2, X, asr #16
4613    add       X, X, UX
4614    add       TMP2, TOP, TMP2, asl #2
4615        vmlal.u16 q1, d19, d31
4616        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
4617        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
4618        vmlsl.u16 q2, d20, d30
4619        vmlal.u16 q2, d21, d30
4620        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
4621    vld1.32   {d20}, [TMP1], STRIDE
4622        vmlsl.u16 q3, d22, d31
4623        vmlal.u16 q3, d23, d31
4624    vld1.32   {d21}, [TMP1]
4625    vmull.u8  q8, d20, d28
4626    vmlal.u8  q8, d21, d29
4627        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
4628        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
4629        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
4630    vld1.32   {d22}, [TMP2], STRIDE
4631        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
4632        vadd.u16  q12, q12, q13
4633    vld1.32   {d23}, [TMP2]
4634    vmull.u8  q9, d22, d28
4635    mov       TMP3, X, asr #16
4636    add       X, X, UX
4637    add       TMP3, TOP, TMP3, asl #2
4638    mov       TMP4, X, asr #16
4639    add       X, X, UX
4640    add       TMP4, TOP, TMP4, asl #2
4641    vmlal.u8  q9, d23, d29
4642    vld1.32   {d22}, [TMP3], STRIDE
4643        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
4644    vld1.32   {d23}, [TMP3]
4645    vmull.u8  q10, d22, d28
4646    vmlal.u8  q10, d23, d29
4647        vmovn.u16 d8, q0
4648    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
4649        vmovn.u16 d9, q2
4650    vmlsl.u16 q0, d16, d30
4651    vmlal.u16 q0, d17, d30
4652    pld       [TMP4, PF_OFFS]
4653    vld1.32   {d16}, [TMP4], STRIDE
4654        vadd.u16  q12, q12, q13
4655    vld1.32   {d17}, [TMP4]
4656    pld       [TMP4, PF_OFFS]
4657    vmull.u8  q11, d16, d28
4658    vmlal.u8  q11, d17, d29
4659    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
4660    vmlsl.u16 q1, d18, d31
4661.endm
4662
4663.macro bilinear_interpolate_eight_pixels_8888_0565_tail
4664    vmlal.u16 q1, d19, d31
4665    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
4666    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
4667    vmlsl.u16 q2, d20, d30
4668    vmlal.u16 q2, d21, d30
4669    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
4670    vmlsl.u16 q3, d22, d31
4671    vmlal.u16 q3, d23, d31
4672    vadd.u16  q12, q12, q13
4673    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
4674    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
4675    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
4676    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
4677    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
4678    vmovn.u16 d10, q0
4679    vmovn.u16 d11, q2
4680    vadd.u16  q12, q12, q13
4681
4682    vuzp.u8   d8, d9
4683    vuzp.u8   d10, d11
4684    vuzp.u8   d9, d11
4685    vuzp.u8   d8, d10
4686    vshll.u8  q6, d9, #8
4687    vshll.u8  q5, d10, #8
4688    vshll.u8  q7, d8, #8
4689    vsri.u16  q5, q6, #5
4690    vsri.u16  q5, q7, #11
4691    vst1.32   {d10, d11}, [OUT, :128]!
4692.endm
4693
4694.macro bilinear_interpolate_eight_pixels_8888_0565_tail_head
4695    mov       TMP1, X, asr #16
4696    add       X, X, UX
4697    add       TMP1, TOP, TMP1, asl #2
4698    mov       TMP2, X, asr #16
4699    add       X, X, UX
4700    add       TMP2, TOP, TMP2, asl #2
4701        vmlal.u16 q1, d19, d31
4702        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
4703            vuzp.u8 d8, d9
4704        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
4705        vmlsl.u16 q2, d20, d30
4706        vmlal.u16 q2, d21, d30
4707        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
4708    vld1.32   {d20}, [TMP1], STRIDE
4709        vmlsl.u16 q3, d22, d31
4710        vmlal.u16 q3, d23, d31
4711    vld1.32   {d21}, [TMP1]
4712    vmull.u8  q8, d20, d28
4713    vmlal.u8  q8, d21, d29
4714        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
4715        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
4716        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
4717    vld1.32   {d22}, [TMP2], STRIDE
4718        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
4719        vadd.u16  q12, q12, q13
4720    vld1.32   {d23}, [TMP2]
4721    vmull.u8  q9, d22, d28
4722    mov       TMP3, X, asr #16
4723    add       X, X, UX
4724    add       TMP3, TOP, TMP3, asl #2
4725    mov       TMP4, X, asr #16
4726    add       X, X, UX
4727    add       TMP4, TOP, TMP4, asl #2
4728    vmlal.u8  q9, d23, d29
4729    vld1.32   {d22}, [TMP3], STRIDE
4730        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
4731    vld1.32   {d23}, [TMP3]
4732    vmull.u8  q10, d22, d28
4733    vmlal.u8  q10, d23, d29
4734        vmovn.u16 d10, q0
4735    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
4736        vmovn.u16 d11, q2
4737    vmlsl.u16 q0, d16, d30
4738    vmlal.u16 q0, d17, d30
4739    pld       [TMP4, PF_OFFS]
4740    vld1.32   {d16}, [TMP4], STRIDE
4741        vadd.u16  q12, q12, q13
4742    vld1.32   {d17}, [TMP4]
4743    pld       [TMP4, PF_OFFS]
4744    vmull.u8  q11, d16, d28
4745    vmlal.u8  q11, d17, d29
4746            vuzp.u8 d10, d11
4747    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
4748    vmlsl.u16 q1, d18, d31
4749
4750    mov       TMP1, X, asr #16
4751    add       X, X, UX
4752    add       TMP1, TOP, TMP1, asl #2
4753    mov       TMP2, X, asr #16
4754    add       X, X, UX
4755    add       TMP2, TOP, TMP2, asl #2
4756        vmlal.u16 q1, d19, d31
4757            vuzp.u8 d9, d11
4758        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
4759        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
4760            vuzp.u8 d8, d10
4761        vmlsl.u16 q2, d20, d30
4762        vmlal.u16 q2, d21, d30
4763        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
4764    vld1.32   {d20}, [TMP1], STRIDE
4765        vmlsl.u16 q3, d22, d31
4766        vmlal.u16 q3, d23, d31
4767    vld1.32   {d21}, [TMP1]
4768    vmull.u8  q8, d20, d28
4769    vmlal.u8  q8, d21, d29
4770            vshll.u8  q6, d9, #8
4771            vshll.u8  q5, d10, #8
4772            vshll.u8  q7, d8, #8
4773        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
4774            vsri.u16  q5, q6, #5
4775        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
4776            vsri.u16  q5, q7, #11
4777        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
4778    vld1.32   {d22}, [TMP2], STRIDE
4779        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
4780        vadd.u16  q12, q12, q13
4781    vld1.32   {d23}, [TMP2]
4782    vmull.u8  q9, d22, d28
4783    mov       TMP3, X, asr #16
4784    add       X, X, UX
4785    add       TMP3, TOP, TMP3, asl #2
4786    mov       TMP4, X, asr #16
4787    add       X, X, UX
4788    add       TMP4, TOP, TMP4, asl #2
4789    vmlal.u8  q9, d23, d29
4790    vld1.32   {d22}, [TMP3], STRIDE
4791        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
4792    vld1.32   {d23}, [TMP3]
4793    vmull.u8  q10, d22, d28
4794    vmlal.u8  q10, d23, d29
4795        vmovn.u16 d8, q0
4796    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
4797        vmovn.u16 d9, q2
4798    vmlsl.u16 q0, d16, d30
4799    vmlal.u16 q0, d17, d30
4800    pld       [TMP4, PF_OFFS]
4801    vld1.32   {d16}, [TMP4], STRIDE
4802        vadd.u16  q12, q12, q13
4803    vld1.32   {d17}, [TMP4]
4804    pld       [TMP4, PF_OFFS]
4805    vmull.u8  q11, d16, d28
4806    vmlal.u8  q11, d17, d29
4807    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
4808            vst1.32   {d10, d11}, [OUT, :128]!
4809    vmlsl.u16 q1, d18, d31
4810.endm
4811/*****************************************************************************/
4812
4813generate_bilinear_scanline_func \
4814    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
4815    2, 2, 28, BILINEAR_FLAG_UNROLL_4
4816
4817generate_bilinear_scanline_func \
4818    pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
4819    2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS
4820
4821generate_bilinear_scanline_func \
4822    pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
4823    1, 2, 28, BILINEAR_FLAG_UNROLL_4
4824
4825generate_bilinear_scanline_func \
4826    pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
4827    1, 1, 28, BILINEAR_FLAG_UNROLL_4
4828