/*
 * Copyright © 2011 SCore Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
 * Author:  Taekyun Kim (tkq.kim@samsung.com)
 */
26
/*
 * This file contains scaled bilinear scanline functions implemented
 * using the older bilinear macro templates by Siarhei Siamashka.
 *
 * << General scanline function procedure >>
 *  1. bilinear interpolate source pixels
 *  2. load mask pixels
 *  3. load destination pixels
 *  4. duplicate mask to fill whole register
 *  5. interleave source & destination pixels
 *  6. apply mask to source pixels
 *  7. combine source & destination pixels
 *  8. deinterleave final result
 *  9. store destination pixels
 *
 * A rough C-level sketch of one such iteration follows this comment.
 *
 * All registers with a single number (i.e. src0, tmp0) are 64-bit registers.
 * Registers with two numbers (src01, dst01) are 128-bit registers.
 * All temporary registers can be freely reused outside of these code blocks.
 * The register aliases (.req symbols) OUT and MASK are assumed to be defined
 * by the caller of these macros.
 *
 * Remarks
 *  There can be lots of pipeline stalls inside and between the code blocks.
 *  Further optimizations will be done by new macro templates using the
 *  head/tail_head/tail scheme.
 */
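/*
 * As a rough reference, one iteration of such a scanline function performs
 * approximately the following (a hedged C sketch; the helper names are
 * illustrative only and are not actual pixman functions):
 *
 *   uint32_t src = bilinear_interpolate (top, bottom, wt, wb, x);  //  1.
 *   uint8_t  m   = *mask++;                                        //  2.
 *   uint32_t dst = *out;                                           //  3.
 *   src = apply_mask (src, m);          //  4.-6. duplicate + apply mask
 *   dst = combine (op, src, dst);       //  7.    e.g. SRC, OVER or ADD
 *   *out++ = dst;                       //  8.-9. deinterleave + store
 */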
51
/* Prevent the stack from becoming executable for no reason... */
#if defined(__linux__) && defined (__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

.text
.fpu neon
.arch armv7a
.object_arch armv4
.eabi_attribute 10, 0
.eabi_attribute 12, 0
.arm
.altmacro
.p2align 2

#include "pixman-private.h"
#include "pixman-arm-asm.h"
#include "pixman-arm-neon-asm.h"

/*
 * Bilinear macros from pixman-arm-neon-asm.S
 */

/*
 * Bilinear scaling support code which tries to provide pixel fetching, color
 * format conversion, and interpolation as separate macros which can be used
 * as the basic building blocks for constructing bilinear scanline functions.
 */

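/*
 * For reference, the interpolation performed by these macros is, per 8-bit
 * channel, equivalent to the following C (a hedged sketch;
 * 'interpolate_channel' is illustrative and not a pixman function; wt/wb are
 * the vertical weights held in d28/d29, assumed to satisfy
 * wt + wb == (1 << BILINEAR_INTERPOLATION_BITS), and wx is the horizontal
 * weight held in d30/d31, i.e. the top BILINEAR_INTERPOLATION_BITS bits of
 * the fractional part of X):
 *
 *   static inline uint8_t
 *   interpolate_channel (uint8_t tl, uint8_t tr,    // top left/right
 *                        uint8_t bl, uint8_t br,    // bottom left/right
 *                        unsigned wt, unsigned wb, unsigned wx)
 *   {
 *       // vertical pass: vmull.u8 + vmlal.u8 with d28/d29
 *       unsigned left  = tl * wt + bl * wb;
 *       unsigned right = tr * wt + br * wb;
 *       // horizontal pass: vshll.u16 + vmlsl.u16 + vmlal.u16 with d30/d31
 *       unsigned acc = (left << BILINEAR_INTERPOLATION_BITS)
 *                      - left * wx + right * wx;
 *       // narrowing: vshrn.u32 + vmovn.u16
 *       return acc >> (2 * BILINEAR_INTERPOLATION_BITS);
 *   }
 */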
81.macro bilinear_load_8888 reg1, reg2, tmp
82    mov       TMP1, X, asr #16
83    add       X, X, UX
84    add       TMP1, TOP, TMP1, asl #2
85    vld1.32   {reg1}, [TMP1], STRIDE
86    vld1.32   {reg2}, [TMP1]
87.endm
88
89.macro bilinear_load_0565 reg1, reg2, tmp
90    mov       TMP1, X, asr #16
91    add       X, X, UX
92    add       TMP1, TOP, TMP1, asl #1
93    vld1.32   {reg2[0]}, [TMP1], STRIDE
94    vld1.32   {reg2[1]}, [TMP1]
95    convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
96.endm
97
98.macro bilinear_load_and_vertical_interpolate_two_8888 \
99                    acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
100
101    bilinear_load_8888 reg1, reg2, tmp1
102    vmull.u8  acc1, reg1, d28
103    vmlal.u8  acc1, reg2, d29
104    bilinear_load_8888 reg3, reg4, tmp2
105    vmull.u8  acc2, reg3, d28
106    vmlal.u8  acc2, reg4, d29
107.endm
108
109.macro bilinear_load_and_vertical_interpolate_four_8888 \
110                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
111                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
112
113    bilinear_load_and_vertical_interpolate_two_8888 \
114                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
115    bilinear_load_and_vertical_interpolate_two_8888 \
116                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
117.endm
118
119.macro bilinear_load_and_vertical_interpolate_two_0565 \
120                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
121
122    mov       TMP1, X, asr #16
123    add       X, X, UX
124    add       TMP1, TOP, TMP1, asl #1
125    mov       TMP2, X, asr #16
126    add       X, X, UX
127    add       TMP2, TOP, TMP2, asl #1
128    vld1.32   {acc2lo[0]}, [TMP1], STRIDE
129    vld1.32   {acc2hi[0]}, [TMP2], STRIDE
130    vld1.32   {acc2lo[1]}, [TMP1]
131    vld1.32   {acc2hi[1]}, [TMP2]
132    convert_0565_to_x888 acc2, reg3, reg2, reg1
133    vzip.u8   reg1, reg3
134    vzip.u8   reg2, reg4
135    vzip.u8   reg3, reg4
136    vzip.u8   reg1, reg2
137    vmull.u8  acc1, reg1, d28
138    vmlal.u8  acc1, reg2, d29
139    vmull.u8  acc2, reg3, d28
140    vmlal.u8  acc2, reg4, d29
141.endm
142
143.macro bilinear_load_and_vertical_interpolate_four_0565 \
144                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
145                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
146
147    mov       TMP1, X, asr #16
148    add       X, X, UX
149    add       TMP1, TOP, TMP1, asl #1
150    mov       TMP2, X, asr #16
151    add       X, X, UX
152    add       TMP2, TOP, TMP2, asl #1
153    vld1.32   {xacc2lo[0]}, [TMP1], STRIDE
154    vld1.32   {xacc2hi[0]}, [TMP2], STRIDE
155    vld1.32   {xacc2lo[1]}, [TMP1]
156    vld1.32   {xacc2hi[1]}, [TMP2]
157    convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
158    mov       TMP1, X, asr #16
159    add       X, X, UX
160    add       TMP1, TOP, TMP1, asl #1
161    mov       TMP2, X, asr #16
162    add       X, X, UX
163    add       TMP2, TOP, TMP2, asl #1
164    vld1.32   {yacc2lo[0]}, [TMP1], STRIDE
165    vzip.u8   xreg1, xreg3
166    vld1.32   {yacc2hi[0]}, [TMP2], STRIDE
167    vzip.u8   xreg2, xreg4
168    vld1.32   {yacc2lo[1]}, [TMP1]
169    vzip.u8   xreg3, xreg4
170    vld1.32   {yacc2hi[1]}, [TMP2]
171    vzip.u8   xreg1, xreg2
172    convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
173    vmull.u8  xacc1, xreg1, d28
174    vzip.u8   yreg1, yreg3
175    vmlal.u8  xacc1, xreg2, d29
176    vzip.u8   yreg2, yreg4
177    vmull.u8  xacc2, xreg3, d28
178    vzip.u8   yreg3, yreg4
179    vmlal.u8  xacc2, xreg4, d29
180    vzip.u8   yreg1, yreg2
181    vmull.u8  yacc1, yreg1, d28
182    vmlal.u8  yacc1, yreg2, d29
183    vmull.u8  yacc2, yreg3, d28
184    vmlal.u8  yacc2, yreg4, d29
185.endm
186
187.macro bilinear_store_8888 numpix, tmp1, tmp2
188.if numpix == 4
189    vst1.32   {d0, d1}, [OUT]!
190.elseif numpix == 2
191    vst1.32   {d0}, [OUT]!
192.elseif numpix == 1
193    vst1.32   {d0[0]}, [OUT, :32]!
194.else
195    .error bilinear_store_8888 numpix is unsupported
196.endif
197.endm
198
199.macro bilinear_store_0565 numpix, tmp1, tmp2
200    vuzp.u8 d0, d1
201    vuzp.u8 d2, d3
202    vuzp.u8 d1, d3
203    vuzp.u8 d0, d2
204    convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
205.if numpix == 4
206    vst1.16   {d2}, [OUT]!
207.elseif numpix == 2
208    vst1.32   {d2[0]}, [OUT]!
209.elseif numpix == 1
210    vst1.16   {d2[0]}, [OUT]!
211.else
212    .error bilinear_store_0565 numpix is unsupported
213.endif
214.endm
215
216
/*
 * Macros for loading mask pixels into register 'mask'.
 * The vdup must be done elsewhere.
 */
221.macro bilinear_load_mask_x numpix, mask
222.endm
223
224.macro bilinear_load_mask_8 numpix, mask
225.if numpix == 4
226    vld1.32     {mask[0]}, [MASK]!
227.elseif numpix == 2
228    vld1.16     {mask[0]}, [MASK]!
229.elseif numpix == 1
230    vld1.8      {mask[0]}, [MASK]!
231.else
232    .error bilinear_load_mask_8 numpix is unsupported
233.endif
234    pld         [MASK, #prefetch_offset]
235.endm
236
237.macro bilinear_load_mask mask_fmt, numpix, mask
238    bilinear_load_mask_&mask_fmt numpix, mask
239.endm
240
241
/*
 * Macros for loading destination pixels into registers 'dst0' and 'dst1'.
 * Interleaving should be done elsewhere.
 */
246.macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01
247.endm
248
249.macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01
250.endm
251
252.macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01
253.if numpix == 4
254    vld1.32     {dst0, dst1}, [OUT]
255.elseif numpix == 2
256    vld1.32     {dst0}, [OUT]
257.elseif numpix == 1
258    vld1.32     {dst0[0]}, [OUT]
259.else
260    .error bilinear_load_dst_8888 numpix is unsupported
261.endif
262    pld         [OUT, #(prefetch_offset * 4)]
263.endm
264
265.macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
266    bilinear_load_dst_8888 numpix, dst0, dst1, dst01
267.endm
268
269.macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01
270    bilinear_load_dst_8888 numpix, dst0, dst1, dst01
271.endm
272
273.macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01
274    bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01
275.endm
276
/*
 * Macros for duplicating a partially loaded mask to fill the entire register.
 * We will apply the mask to interleaved source pixels, that is
 *  (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3)
 *  (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3)
 * So we need to duplicate the loaded mask across the whole register.
 *
 * For the two-pixel case
 *  (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
 *  (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
 * Some optimizations are possible here, including the last-pixel cases.
 */
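/*
 * For the four-pixel case this amounts to a 32-bit broadcast of the four
 * mask bytes (a hedged C sketch, names illustrative only):
 *
 *   uint32_t m32 = *(const uint32_t *) mask_ptr;   // vld1.32 {mask[0]}, [MASK]!
 *   uint64_t dup = ((uint64_t) m32 << 32) | m32;   // vdup.32 mask, mask[0]
 *   // 'dup' now holds (m0, m1, m2, m3, m0, m1, m2, m3), matching the
 *   // (r0, r1, r2, r3, g0, g1, g2, g3) lane layout of the interleaved source.
 */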
289.macro bilinear_duplicate_mask_x numpix, mask
290.endm
291
292.macro bilinear_duplicate_mask_8 numpix, mask
293.if numpix == 4
294    vdup.32     mask, mask[0]
295.elseif numpix == 2
296    vdup.16     mask, mask[0]
297.elseif numpix == 1
298    vdup.8      mask, mask[0]
299.else
300    .error bilinear_duplicate_mask_8 is unsupported
301.endif
302.endm
303
304.macro bilinear_duplicate_mask mask_fmt, numpix, mask
305    bilinear_duplicate_mask_&mask_fmt numpix, mask
306.endm
307
/*
 * Macros for interleaving src and dst pixels into rrrr gggg bbbb aaaa form.
 * Interleaving should be done when a mask is used or the operator is 'over'.
 */
312.macro bilinear_interleave src0, src1, dst0, dst1
313    vuzp.8      src0, src1
314    vuzp.8      dst0, dst1
315    vuzp.8      src0, src1
316    vuzp.8      dst0, dst1
317.endm
318
319.macro bilinear_interleave_src_dst_x_src \
320                numpix, src0, src1, src01, dst0, dst1, dst01
321.endm
322
323.macro bilinear_interleave_src_dst_x_over \
324                numpix, src0, src1, src01, dst0, dst1, dst01
325
326    bilinear_interleave src0, src1, dst0, dst1
327.endm
328
329.macro bilinear_interleave_src_dst_x_add \
330                numpix, src0, src1, src01, dst0, dst1, dst01
331.endm
332
333.macro bilinear_interleave_src_dst_8_src \
334                numpix, src0, src1, src01, dst0, dst1, dst01
335
336    bilinear_interleave src0, src1, dst0, dst1
337.endm
338
339.macro bilinear_interleave_src_dst_8_over \
340                numpix, src0, src1, src01, dst0, dst1, dst01
341
342    bilinear_interleave src0, src1, dst0, dst1
343.endm
344
345.macro bilinear_interleave_src_dst_8_add \
346                numpix, src0, src1, src01, dst0, dst1, dst01
347
348    bilinear_interleave src0, src1, dst0, dst1
349.endm
350
351.macro bilinear_interleave_src_dst \
352                mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01
353
354    bilinear_interleave_src_dst_&mask_fmt&_&op \
355                numpix, src0, src1, src01, dst0, dst1, dst01
356.endm
357
358
/*
 * Macros for applying a mask to src pixels (see the combine_mask_u() function).
 * src and dst should be in interleaved form.
 * The mask register should be in the form (m0, m1, m2, m3).
 */
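/*
 * Per 8-bit channel this is the usual rounded multiplication by the mask,
 * equivalent to the following C (a hedged sketch; 'mul_un8' is illustrative
 * and mirrors pixman's MUL_UN8 macro rather than being an actual function):
 *
 *   static inline uint8_t
 *   mul_un8 (uint8_t x, uint8_t m)
 *   {
 *       unsigned t = x * m;                        // vmull.u8
 *       // vrshr.u16 #8 followed by vraddhn.u16 yields a correctly
 *       // rounded division by 255:
 *       return (t + ((t + 128) >> 8) + 128) >> 8;
 *   }
 */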
364.macro bilinear_apply_mask_to_src_x \
365                numpix, src0, src1, src01, mask, \
366                tmp01, tmp23, tmp45, tmp67
367.endm
368
369.macro bilinear_apply_mask_to_src_8 \
370                numpix, src0, src1, src01, mask, \
371                tmp01, tmp23, tmp45, tmp67
372
373    vmull.u8        tmp01, src0, mask
374    vmull.u8        tmp23, src1, mask
375    /* bubbles */
376    vrshr.u16       tmp45, tmp01, #8
377    vrshr.u16       tmp67, tmp23, #8
378    /* bubbles */
379    vraddhn.u16     src0, tmp45, tmp01
380    vraddhn.u16     src1, tmp67, tmp23
381.endm
382
383.macro bilinear_apply_mask_to_src \
384                mask_fmt, numpix, src0, src1, src01, mask, \
385                tmp01, tmp23, tmp45, tmp67
386
387    bilinear_apply_mask_to_src_&mask_fmt \
388                numpix, src0, src1, src01, mask, \
389                tmp01, tmp23, tmp45, tmp67
390.endm
391
392
/*
 * Macros for combining src and destination pixels.
 * Whether interleaving is needed depends on the operator 'op'.
 */
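/*
 * For reference, per channel the 'over' combiner below computes the usual
 * Porter-Duff OVER with a saturating add, roughly as follows (hedged C
 * sketch; 'mul_un8' as in the sketch above, 'sa' being the source alpha):
 *
 *   static inline uint8_t
 *   over_channel (uint8_t s, uint8_t sa, uint8_t d)
 *   {
 *       unsigned r = s + mul_un8 (d, 255 - sa);  // vmvn.8 + vmull/vraddhn
 *       return r > 255 ? 255 : r;                // vqadd.u8
 *   }
 *
 * The 'add' combiner is just the saturating add, min (s + d, 255), and the
 * 'src' combiner leaves the source untouched.
 */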
397.macro bilinear_combine_src \
398                numpix, src0, src1, src01, dst0, dst1, dst01, \
399                tmp01, tmp23, tmp45, tmp67, tmp8
400.endm
401
402.macro bilinear_combine_over \
403                numpix, src0, src1, src01, dst0, dst1, dst01, \
404                tmp01, tmp23, tmp45, tmp67, tmp8
405
406    vdup.32     tmp8, src1[1]
407    /* bubbles */
408    vmvn.8      tmp8, tmp8
409    /* bubbles */
410    vmull.u8    tmp01, dst0, tmp8
411    /* bubbles */
412    vmull.u8    tmp23, dst1, tmp8
413    /* bubbles */
414    vrshr.u16   tmp45, tmp01, #8
415    vrshr.u16   tmp67, tmp23, #8
416    /* bubbles */
417    vraddhn.u16 dst0, tmp45, tmp01
418    vraddhn.u16 dst1, tmp67, tmp23
419    /* bubbles */
420    vqadd.u8    src01, dst01, src01
421.endm
422
423.macro bilinear_combine_add \
424                numpix, src0, src1, src01, dst0, dst1, dst01, \
425                tmp01, tmp23, tmp45, tmp67, tmp8
426
427    vqadd.u8    src01, dst01, src01
428.endm
429
430.macro bilinear_combine \
431                op, numpix, src0, src1, src01, dst0, dst1, dst01, \
432                tmp01, tmp23, tmp45, tmp67, tmp8
433
434    bilinear_combine_&op \
435                numpix, src0, src1, src01, dst0, dst1, dst01, \
436                tmp01, tmp23, tmp45, tmp67, tmp8
437.endm
438
/*
 * Macros for final deinterleaving of destination pixels if needed.
 */
442.macro bilinear_deinterleave numpix, dst0, dst1, dst01
443    vuzp.8      dst0, dst1
444    /* bubbles */
445    vuzp.8      dst0, dst1
446.endm
447
448.macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01
449.endm
450
451.macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01
452    bilinear_deinterleave numpix, dst0, dst1, dst01
453.endm
454
455.macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01
456.endm
457
458.macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01
459    bilinear_deinterleave numpix, dst0, dst1, dst01
460.endm
461
462.macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01
463    bilinear_deinterleave numpix, dst0, dst1, dst01
464.endm
465
466.macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01
467    bilinear_deinterleave numpix, dst0, dst1, dst01
468.endm
469
470.macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01
471    bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01
472.endm
473
474
475.macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op
476    bilinear_load_&src_fmt d0, d1, d2
477    bilinear_load_mask mask_fmt, 1, d4
478    bilinear_load_dst dst_fmt, op, 1, d18, d19, q9
479    vmull.u8  q1, d0, d28
480    vmlal.u8  q1, d1, d29
481    /* 5 cycles bubble */
482    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
483    vmlsl.u16 q0, d2, d30
484    vmlal.u16 q0, d3, d30
485    /* 5 cycles bubble */
486    bilinear_duplicate_mask mask_fmt, 1, d4
487    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
488    /* 3 cycles bubble */
489    vmovn.u16 d0, q0
490    /* 1 cycle bubble */
491    bilinear_interleave_src_dst \
492                mask_fmt, op, 1, d0, d1, q0, d18, d19, q9
493    bilinear_apply_mask_to_src \
494                mask_fmt, 1, d0, d1, q0, d4, \
495                q3, q8, q10, q11
496    bilinear_combine \
497                op, 1, d0, d1, q0, d18, d19, q9, \
498                q3, q8, q10, q11, d5
499    bilinear_deinterleave_dst mask_fmt, op, 1, d0, d1, q0
500    bilinear_store_&dst_fmt 1, q2, q3
501.endm
502
503.macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op
504    bilinear_load_and_vertical_interpolate_two_&src_fmt \
505                q1, q11, d0, d1, d20, d21, d22, d23
506    bilinear_load_mask mask_fmt, 2, d4
507    bilinear_load_dst dst_fmt, op, 2, d18, d19, q9
508    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
509    vmlsl.u16 q0, d2, d30
510    vmlal.u16 q0, d3, d30
511    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
512    vmlsl.u16 q10, d22, d31
513    vmlal.u16 q10, d23, d31
514    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
515    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
516    bilinear_duplicate_mask mask_fmt, 2, d4
517    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
518    vadd.u16  q12, q12, q13
519    vmovn.u16 d0, q0
520    bilinear_interleave_src_dst \
521                mask_fmt, op, 2, d0, d1, q0, d18, d19, q9
522    bilinear_apply_mask_to_src \
523                mask_fmt, 2, d0, d1, q0, d4, \
524                q3, q8, q10, q11
525    bilinear_combine \
526                op, 2, d0, d1, q0, d18, d19, q9, \
527                q3, q8, q10, q11, d5
528    bilinear_deinterleave_dst mask_fmt, op, 2, d0, d1, q0
529    bilinear_store_&dst_fmt 2, q2, q3
530.endm
531
532.macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op
533    bilinear_load_and_vertical_interpolate_four_&src_fmt \
534                q1, q11, d0, d1, d20, d21, d22, d23 \
535                q3, q9,  d4, d5, d16, d17, d18, d19
536    pld       [TMP1, PF_OFFS]
537    sub       TMP1, TMP1, STRIDE
538    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
539    vmlsl.u16 q0, d2, d30
540    vmlal.u16 q0, d3, d30
541    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
542    vmlsl.u16 q10, d22, d31
543    vmlal.u16 q10, d23, d31
544    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
545    vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
546    vmlsl.u16 q2, d6, d30
547    vmlal.u16 q2, d7, d30
548    vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
549    bilinear_load_mask mask_fmt, 4, d22
550    bilinear_load_dst dst_fmt, op, 4, d2, d3, q1
551    pld       [TMP1, PF_OFFS]
552    vmlsl.u16 q8, d18, d31
553    vmlal.u16 q8, d19, d31
554    vadd.u16  q12, q12, q13
555    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
556    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
557    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
558    vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
559    bilinear_duplicate_mask mask_fmt, 4, d22
560    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
561    vmovn.u16 d0, q0
562    vmovn.u16 d1, q2
563    vadd.u16  q12, q12, q13
564    bilinear_interleave_src_dst \
565                mask_fmt, op, 4, d0, d1, q0, d2, d3, q1
566    bilinear_apply_mask_to_src \
567                mask_fmt, 4, d0, d1, q0, d22, \
568                q3, q8, q9, q10
569    bilinear_combine \
570                op, 4, d0, d1, q0, d2, d3, q1, \
571                q3, q8, q9, q10, d23
572    bilinear_deinterleave_dst mask_fmt, op, 4, d0, d1, q0
573    bilinear_store_&dst_fmt 4, q2, q3
574.endm
575
576.set BILINEAR_FLAG_USE_MASK,		1
577.set BILINEAR_FLAG_USE_ALL_NEON_REGS,	2
578
/*
 * Main template macro for generating NEON optimized bilinear scanline functions.
 *
 * The bilinear scanline generator macro takes the following arguments:
 *  fname			- name of the function to generate
 *  src_fmt			- source color format (8888 or 0565)
 *  dst_fmt			- destination color format (8888 or 0565)
 *  src/dst_bpp_shift		- (1 << bpp_shift) is the size of a src/dst pixel in bytes
 *  process_last_pixel		- code block that interpolates one pixel and does not
 *				  update the horizontal weight
 *  process_two_pixels		- code block that interpolates two pixels and updates
 *				  the horizontal weight
 *  process_four_pixels		- code block that interpolates four pixels and updates
 *				  the horizontal weight
 *  process_pixblock_head	- head part of the middle loop
 *  process_pixblock_tail	- tail part of the middle loop
 *  process_pixblock_tail_head	- tail_head part of the middle loop
 *  pixblock_size		- number of pixels processed in a single middle loop
 *  prefetch_distance		- prefetch in the source image by that many pixels ahead
 */
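/*
 * The generated functions end up with an effective C-level signature along
 * the lines of the following (derived from the register and stack layout
 * below; the authoritative prototypes live in pixman-arm-common.h, and the
 * pointer types vary with the src/dst formats):
 *
 *   void fname (uint32_t       *out,     // r0, OUT
 *               const uint8_t  *mask,    // r1, only with BILINEAR_FLAG_USE_MASK
 *               const uint32_t *top,     // top source scanline
 *               const uint32_t *bottom,  // bottom source scanline
 *               int             wt,      // top vertical weight
 *               int             wb,      // bottom vertical weight
 *               pixman_fixed_t  x,       // 16.16 fixed-point horizontal position
 *               pixman_fixed_t  ux,      // 16.16 fixed-point horizontal step
 *               int             width);  // number of pixels to produce
 *
 * Without BILINEAR_FLAG_USE_MASK the 'mask' argument is absent and the
 * remaining arguments shift down accordingly.
 */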
599
600.macro generate_bilinear_scanline_func \
601	fname, \
602	src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, \
603	bilinear_process_last_pixel, \
604	bilinear_process_two_pixels, \
605	bilinear_process_four_pixels, \
606	bilinear_process_pixblock_head, \
607	bilinear_process_pixblock_tail, \
608	bilinear_process_pixblock_tail_head, \
609	pixblock_size, \
610	prefetch_distance, \
611	flags
612
613pixman_asm_function fname
614.if pixblock_size == 8
615.elseif pixblock_size == 4
616.else
617    .error unsupported pixblock size
618.endif
619
620.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
621    OUT       .req    r0
622    TOP       .req    r1
623    BOTTOM    .req    r2
624    WT        .req    r3
625    WB        .req    r4
626    X         .req    r5
627    UX        .req    r6
628    WIDTH     .req    ip
629    TMP1      .req    r3
630    TMP2      .req    r4
631    PF_OFFS   .req    r7
632    TMP3      .req    r8
633    TMP4      .req    r9
634    STRIDE    .req    r2
635
636    mov		ip, sp
637    push	{r4, r5, r6, r7, r8, r9}
638    mov		PF_OFFS, #prefetch_distance
639    ldmia	ip, {WB, X, UX, WIDTH}
640.else
641    OUT       .req      r0
642    MASK      .req      r1
643    TOP       .req      r2
644    BOTTOM    .req      r3
645    WT        .req      r4
646    WB        .req      r5
647    X         .req      r6
648    UX        .req      r7
649    WIDTH     .req      ip
650    TMP1      .req      r4
651    TMP2      .req      r5
652    PF_OFFS   .req      r8
653    TMP3      .req      r9
654    TMP4      .req      r10
655    STRIDE    .req      r3
656
657    .set prefetch_offset, prefetch_distance
658
659    mov       ip, sp
660    push      {r4, r5, r6, r7, r8, r9, r10, ip}
661    mov       PF_OFFS, #prefetch_distance
662    ldmia     ip, {WT, WB, X, UX, WIDTH}
663.endif
664
665    mul       PF_OFFS, PF_OFFS, UX
666
667.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
668    vpush     {d8-d15}
669.endif
670
671    sub	      STRIDE, BOTTOM, TOP
672    .unreq    BOTTOM
673
674    cmp       WIDTH, #0
675    ble       3f
676
677    vdup.u16  q12, X
678    vdup.u16  q13, UX
679    vdup.u8   d28, WT
680    vdup.u8   d29, WB
681    vadd.u16  d25, d25, d26
682
683    /* ensure good destination alignment  */
684    cmp       WIDTH, #1
685    blt       0f
686    tst       OUT, #(1 << dst_bpp_shift)
687    beq       0f
688    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
689    vadd.u16  q12, q12, q13
690    bilinear_process_last_pixel
691    sub       WIDTH, WIDTH, #1
6920:
693    vadd.u16  q13, q13, q13
694    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
695    vadd.u16  q12, q12, q13
696
697    cmp       WIDTH, #2
698    blt       0f
699    tst       OUT, #(1 << (dst_bpp_shift + 1))
700    beq       0f
701    bilinear_process_two_pixels
702    sub       WIDTH, WIDTH, #2
7030:
704.if pixblock_size == 8
705    cmp       WIDTH, #4
706    blt       0f
707    tst       OUT, #(1 << (dst_bpp_shift + 2))
708    beq       0f
709    bilinear_process_four_pixels
710    sub       WIDTH, WIDTH, #4
7110:
712.endif
713    subs      WIDTH, WIDTH, #pixblock_size
714    blt       1f
715    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
716    bilinear_process_pixblock_head
717    subs      WIDTH, WIDTH, #pixblock_size
718    blt       5f
7190:
720    bilinear_process_pixblock_tail_head
721    subs      WIDTH, WIDTH, #pixblock_size
722    bge       0b
7235:
724    bilinear_process_pixblock_tail
7251:
726.if pixblock_size == 8
727    tst       WIDTH, #4
728    beq       2f
729    bilinear_process_four_pixels
7302:
731.endif
732    /* handle the remaining trailing pixels */
733    tst       WIDTH, #2
734    beq       2f
735    bilinear_process_two_pixels
7362:
737    tst       WIDTH, #1
738    beq       3f
739    bilinear_process_last_pixel
7403:
741.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
742    vpop      {d8-d15}
743.endif
744
745.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
746    pop       {r4, r5, r6, r7, r8, r9}
747.else
748    pop       {r4, r5, r6, r7, r8, r9, r10, ip}
749.endif
750    bx        lr
751
752    .unreq    OUT
753    .unreq    TOP
754    .unreq    WT
755    .unreq    WB
756    .unreq    X
757    .unreq    UX
758    .unreq    WIDTH
759    .unreq    TMP1
760    .unreq    TMP2
761    .unreq    PF_OFFS
762    .unreq    TMP3
763    .unreq    TMP4
764    .unreq    STRIDE
765.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0
766    .unreq    MASK
767.endif
768
769.endfunc
770
771.endm
772
773/* src_8888_8_8888 */
774.macro bilinear_src_8888_8_8888_process_last_pixel
775    bilinear_interpolate_last_pixel 8888, 8, 8888, src
776.endm
777
778.macro bilinear_src_8888_8_8888_process_two_pixels
779    bilinear_interpolate_two_pixels 8888, 8, 8888, src
780.endm
781
782.macro bilinear_src_8888_8_8888_process_four_pixels
783    bilinear_interpolate_four_pixels 8888, 8, 8888, src
784.endm
785
786.macro bilinear_src_8888_8_8888_process_pixblock_head
787    bilinear_src_8888_8_8888_process_four_pixels
788.endm
789
790.macro bilinear_src_8888_8_8888_process_pixblock_tail
791.endm
792
793.macro bilinear_src_8888_8_8888_process_pixblock_tail_head
794    bilinear_src_8888_8_8888_process_pixblock_tail
795    bilinear_src_8888_8_8888_process_pixblock_head
796.endm
797
798/* src_8888_8_0565 */
799.macro bilinear_src_8888_8_0565_process_last_pixel
800    bilinear_interpolate_last_pixel 8888, 8, 0565, src
801.endm
802
803.macro bilinear_src_8888_8_0565_process_two_pixels
804    bilinear_interpolate_two_pixels 8888, 8, 0565, src
805.endm
806
807.macro bilinear_src_8888_8_0565_process_four_pixels
808    bilinear_interpolate_four_pixels 8888, 8, 0565, src
809.endm
810
811.macro bilinear_src_8888_8_0565_process_pixblock_head
812    bilinear_src_8888_8_0565_process_four_pixels
813.endm
814
815.macro bilinear_src_8888_8_0565_process_pixblock_tail
816.endm
817
818.macro bilinear_src_8888_8_0565_process_pixblock_tail_head
819    bilinear_src_8888_8_0565_process_pixblock_tail
820    bilinear_src_8888_8_0565_process_pixblock_head
821.endm
822
823/* src_0565_8_x888 */
824.macro bilinear_src_0565_8_x888_process_last_pixel
825    bilinear_interpolate_last_pixel 0565, 8, 8888, src
826.endm
827
828.macro bilinear_src_0565_8_x888_process_two_pixels
829    bilinear_interpolate_two_pixels 0565, 8, 8888, src
830.endm
831
832.macro bilinear_src_0565_8_x888_process_four_pixels
833    bilinear_interpolate_four_pixels 0565, 8, 8888, src
834.endm
835
836.macro bilinear_src_0565_8_x888_process_pixblock_head
837    bilinear_src_0565_8_x888_process_four_pixels
838.endm
839
840.macro bilinear_src_0565_8_x888_process_pixblock_tail
841.endm
842
843.macro bilinear_src_0565_8_x888_process_pixblock_tail_head
844    bilinear_src_0565_8_x888_process_pixblock_tail
845    bilinear_src_0565_8_x888_process_pixblock_head
846.endm
847
848/* src_0565_8_0565 */
849.macro bilinear_src_0565_8_0565_process_last_pixel
850    bilinear_interpolate_last_pixel 0565, 8, 0565, src
851.endm
852
853.macro bilinear_src_0565_8_0565_process_two_pixels
854    bilinear_interpolate_two_pixels 0565, 8, 0565, src
855.endm
856
857.macro bilinear_src_0565_8_0565_process_four_pixels
858    bilinear_interpolate_four_pixels 0565, 8, 0565, src
859.endm
860
861.macro bilinear_src_0565_8_0565_process_pixblock_head
862    bilinear_src_0565_8_0565_process_four_pixels
863.endm
864
865.macro bilinear_src_0565_8_0565_process_pixblock_tail
866.endm
867
868.macro bilinear_src_0565_8_0565_process_pixblock_tail_head
869    bilinear_src_0565_8_0565_process_pixblock_tail
870    bilinear_src_0565_8_0565_process_pixblock_head
871.endm
872
873/* over_8888_8888 */
874.macro bilinear_over_8888_8888_process_last_pixel
875    bilinear_interpolate_last_pixel 8888, x, 8888, over
876.endm
877
878.macro bilinear_over_8888_8888_process_two_pixels
879    bilinear_interpolate_two_pixels 8888, x, 8888, over
880.endm
881
882.macro bilinear_over_8888_8888_process_four_pixels
883    bilinear_interpolate_four_pixels 8888, x, 8888, over
884.endm
885
886.macro bilinear_over_8888_8888_process_pixblock_head
887    mov         TMP1, X, asr #16
888    add         X, X, UX
889    add         TMP1, TOP, TMP1, asl #2
890    mov         TMP2, X, asr #16
891    add         X, X, UX
892    add         TMP2, TOP, TMP2, asl #2
893
894    vld1.32     {d22}, [TMP1], STRIDE
895    vld1.32     {d23}, [TMP1]
896    mov         TMP3, X, asr #16
897    add         X, X, UX
898    add         TMP3, TOP, TMP3, asl #2
899    vmull.u8    q8, d22, d28
900    vmlal.u8    q8, d23, d29
901
902    vld1.32     {d22}, [TMP2], STRIDE
903    vld1.32     {d23}, [TMP2]
904    mov         TMP4, X, asr #16
905    add         X, X, UX
906    add         TMP4, TOP, TMP4, asl #2
907    vmull.u8    q9, d22, d28
908    vmlal.u8    q9, d23, d29
909
910    vld1.32     {d22}, [TMP3], STRIDE
911    vld1.32     {d23}, [TMP3]
912    vmull.u8    q10, d22, d28
913    vmlal.u8    q10, d23, d29
914
915    vshll.u16   q0, d16, #BILINEAR_INTERPOLATION_BITS
916    vmlsl.u16   q0, d16, d30
917    vmlal.u16   q0, d17, d30
918
919    pld         [TMP4, PF_OFFS]
920    vld1.32     {d16}, [TMP4], STRIDE
921    vld1.32     {d17}, [TMP4]
922    pld         [TMP4, PF_OFFS]
923    vmull.u8    q11, d16, d28
924    vmlal.u8    q11, d17, d29
925
926    vshll.u16   q1, d18, #BILINEAR_INTERPOLATION_BITS
927    vmlsl.u16   q1, d18, d31
928    vmlal.u16   q1, d19, d31
929    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
930    vadd.u16    q12, q12, q13
931.endm
932
933.macro bilinear_over_8888_8888_process_pixblock_tail
934    vshll.u16   q2, d20, #BILINEAR_INTERPOLATION_BITS
935    vmlsl.u16   q2, d20, d30
936    vmlal.u16   q2, d21, d30
937    vshll.u16   q3, d22, #BILINEAR_INTERPOLATION_BITS
938    vmlsl.u16   q3, d22, d31
939    vmlal.u16   q3, d23, d31
940    vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
941    vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
942    vld1.32     {d2, d3}, [OUT, :128]
943    pld         [OUT, #(prefetch_offset * 4)]
944    vshrn.u32   d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
945    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
946    vshrn.u32   d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
947    vmovn.u16   d6, q0
948    vmovn.u16   d7, q2
949    vuzp.8      d6, d7
950    vuzp.8      d2, d3
951    vuzp.8      d6, d7
952    vuzp.8      d2, d3
953    vdup.32     d4, d7[1]
954    vmvn.8      d4, d4
955    vmull.u8    q11, d2, d4
956    vmull.u8    q2, d3, d4
957    vrshr.u16   q1, q11, #8
958    vrshr.u16   q10, q2, #8
959    vraddhn.u16 d2, q1, q11
960    vraddhn.u16 d3, q10, q2
961    vqadd.u8    q3, q1, q3
962    vuzp.8      d6, d7
963    vuzp.8      d6, d7
964    vadd.u16    q12, q12, q13
965    vst1.32     {d6, d7}, [OUT, :128]!
966.endm
967
968.macro bilinear_over_8888_8888_process_pixblock_tail_head
969                                            vshll.u16   q2, d20, #BILINEAR_INTERPOLATION_BITS
970    mov         TMP1, X, asr #16
971    add         X, X, UX
972    add         TMP1, TOP, TMP1, asl #2
973                                            vmlsl.u16   q2, d20, d30
974    mov         TMP2, X, asr #16
975    add         X, X, UX
976    add         TMP2, TOP, TMP2, asl #2
977                                            vmlal.u16   q2, d21, d30
978                                            vshll.u16   q3, d22, #BILINEAR_INTERPOLATION_BITS
979    vld1.32     {d20}, [TMP1], STRIDE
980                                            vmlsl.u16   q3, d22, d31
981                                            vmlal.u16   q3, d23, d31
982    vld1.32     {d21}, [TMP1]
983    vmull.u8    q8, d20, d28
984    vmlal.u8    q8, d21, d29
985                                            vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
986                                            vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
987                                            vld1.32     {d2, d3}, [OUT, :128]
988                                            pld         [OUT, PF_OFFS]
989                                            vshrn.u32   d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
990                                            vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
991    vld1.32     {d22}, [TMP2], STRIDE
992                                            vshrn.u32   d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
993                                            vmovn.u16   d6, q0
994    vld1.32     {d23}, [TMP2]
995    vmull.u8    q9, d22, d28
996    mov         TMP3, X, asr #16
997    add         X, X, UX
998    add         TMP3, TOP, TMP3, asl #2
999    mov         TMP4, X, asr #16
1000    add         X, X, UX
1001    add         TMP4, TOP, TMP4, asl #2
1002    vmlal.u8    q9, d23, d29
1003                                            vmovn.u16   d7, q2
1004    vld1.32     {d22}, [TMP3], STRIDE
1005                                            vuzp.8      d6, d7
1006                                            vuzp.8      d2, d3
1007                                            vuzp.8      d6, d7
1008                                            vuzp.8      d2, d3
1009                                            vdup.32     d4, d7[1]
1010    vld1.32     {d23}, [TMP3]
1011                                            vmvn.8      d4, d4
1012    vmull.u8    q10, d22, d28
1013    vmlal.u8    q10, d23, d29
1014                                            vmull.u8    q11, d2, d4
1015                                            vmull.u8    q2, d3, d4
1016    vshll.u16   q0, d16, #BILINEAR_INTERPOLATION_BITS
1017    vmlsl.u16   q0, d16, d30
1018                                            vrshr.u16   q1, q11, #8
1019    vmlal.u16   q0, d17, d30
1020                                            vrshr.u16   q8, q2, #8
1021                                            vraddhn.u16 d2, q1, q11
1022                                            vraddhn.u16 d3, q8, q2
1023    pld         [TMP4, PF_OFFS]
1024    vld1.32     {d16}, [TMP4], STRIDE
1025                                            vqadd.u8    q3, q1, q3
1026    vld1.32     {d17}, [TMP4]
1027    pld         [TMP4, PF_OFFS]
1028    vmull.u8    q11, d16, d28
1029    vmlal.u8    q11, d17, d29
1030                                            vuzp.8      d6, d7
1031    vshll.u16   q1, d18, #BILINEAR_INTERPOLATION_BITS
1032                                            vuzp.8      d6, d7
1033    vmlsl.u16   q1, d18, d31
1034                                            vadd.u16    q12, q12, q13
1035    vmlal.u16   q1, d19, d31
1036    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
1037    vadd.u16    q12, q12, q13
1038                                            vst1.32     {d6, d7}, [OUT, :128]!
1039.endm
1040
1041/* over_8888_8_8888 */
1042.macro bilinear_over_8888_8_8888_process_last_pixel
1043    bilinear_interpolate_last_pixel 8888, 8, 8888, over
1044.endm
1045
1046.macro bilinear_over_8888_8_8888_process_two_pixels
1047    bilinear_interpolate_two_pixels 8888, 8, 8888, over
1048.endm
1049
1050.macro bilinear_over_8888_8_8888_process_four_pixels
1051    bilinear_interpolate_four_pixels 8888, 8, 8888, over
1052.endm
1053
1054.macro bilinear_over_8888_8_8888_process_pixblock_head
1055    mov         TMP1, X, asr #16
1056    add         X, X, UX
1057    add         TMP1, TOP, TMP1, asl #2
1058    vld1.32     {d0}, [TMP1], STRIDE
1059    mov         TMP2, X, asr #16
1060    add         X, X, UX
1061    add         TMP2, TOP, TMP2, asl #2
1062    vld1.32     {d1}, [TMP1]
1063    mov         TMP3, X, asr #16
1064    add         X, X, UX
1065    add         TMP3, TOP, TMP3, asl #2
1066    vld1.32     {d2}, [TMP2], STRIDE
1067    mov         TMP4, X, asr #16
1068    add         X, X, UX
1069    add         TMP4, TOP, TMP4, asl #2
1070    vld1.32     {d3}, [TMP2]
1071    vmull.u8    q2, d0, d28
1072    vmull.u8    q3, d2, d28
1073    vmlal.u8    q2, d1, d29
1074    vmlal.u8    q3, d3, d29
1075    vshll.u16   q0, d4, #BILINEAR_INTERPOLATION_BITS
1076    vshll.u16   q1, d6, #BILINEAR_INTERPOLATION_BITS
1077    vmlsl.u16   q0, d4, d30
1078    vmlsl.u16   q1, d6, d31
1079    vmlal.u16   q0, d5, d30
1080    vmlal.u16   q1, d7, d31
1081    vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
1082    vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
1083    vld1.32     {d2}, [TMP3], STRIDE
1084    vld1.32     {d3}, [TMP3]
1085    pld         [TMP4, PF_OFFS]
1086    vld1.32     {d4}, [TMP4], STRIDE
1087    vld1.32     {d5}, [TMP4]
1088    pld         [TMP4, PF_OFFS]
1089    vmull.u8    q3, d2, d28
1090    vmlal.u8    q3, d3, d29
1091    vmull.u8    q1, d4, d28
1092    vmlal.u8    q1, d5, d29
1093    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
1094    vld1.32     {d22[0]}, [MASK]!
1095    pld         [MASK, #prefetch_offset]
1096    vadd.u16    q12, q12, q13
1097    vmovn.u16   d16, q0
1098.endm
1099
1100.macro bilinear_over_8888_8_8888_process_pixblock_tail
1101    vshll.u16   q9, d6, #BILINEAR_INTERPOLATION_BITS
1102    vshll.u16   q10, d2, #BILINEAR_INTERPOLATION_BITS
1103    vmlsl.u16   q9, d6, d30
1104    vmlsl.u16   q10, d2, d31
1105    vmlal.u16   q9, d7, d30
1106    vmlal.u16   q10, d3, d31
1107    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
1108    vadd.u16    q12, q12, q13
1109    vdup.32     d22, d22[0]
1110    vshrn.u32   d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS)
1111    vshrn.u32   d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
1112    vmovn.u16   d17, q9
1113    vld1.32     {d18, d19}, [OUT, :128]
1114    pld         [OUT, PF_OFFS]
1115    vuzp.8      d16, d17
1116    vuzp.8      d18, d19
1117    vuzp.8      d16, d17
1118    vuzp.8      d18, d19
1119    vmull.u8    q10, d16, d22
1120    vmull.u8    q11, d17, d22
1121    vrsra.u16   q10, q10, #8
1122    vrsra.u16   q11, q11, #8
1123    vrshrn.u16  d16, q10, #8
1124    vrshrn.u16  d17, q11, #8
1125    vdup.32     d22, d17[1]
1126    vmvn.8      d22, d22
1127    vmull.u8    q10, d18, d22
1128    vmull.u8    q11, d19, d22
1129    vrshr.u16   q9, q10, #8
1130    vrshr.u16   q0, q11, #8
1131    vraddhn.u16 d18, q9, q10
1132    vraddhn.u16 d19, q0, q11
1133    vqadd.u8    q9, q8, q9
1134    vuzp.8      d18, d19
1135    vuzp.8      d18, d19
1136    vst1.32     {d18, d19}, [OUT, :128]!
1137.endm
1138
1139.macro bilinear_over_8888_8_8888_process_pixblock_tail_head
1140                                            vshll.u16   q9, d6, #BILINEAR_INTERPOLATION_BITS
1141    mov         TMP1, X, asr #16
1142    add         X, X, UX
1143    add         TMP1, TOP, TMP1, asl #2
1144                                            vshll.u16   q10, d2, #BILINEAR_INTERPOLATION_BITS
1145    vld1.32     {d0}, [TMP1], STRIDE
1146    mov         TMP2, X, asr #16
1147    add         X, X, UX
1148    add         TMP2, TOP, TMP2, asl #2
1149                                            vmlsl.u16   q9, d6, d30
1150                                            vmlsl.u16   q10, d2, d31
1151    vld1.32     {d1}, [TMP1]
1152    mov         TMP3, X, asr #16
1153    add         X, X, UX
1154    add         TMP3, TOP, TMP3, asl #2
1155                                            vmlal.u16   q9, d7, d30
1156                                            vmlal.u16   q10, d3, d31
1157    vld1.32     {d2}, [TMP2], STRIDE
1158    mov         TMP4, X, asr #16
1159    add         X, X, UX
1160    add         TMP4, TOP, TMP4, asl #2
1161                                            vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
1162                                            vadd.u16    q12, q12, q13
1163    vld1.32     {d3}, [TMP2]
1164                                            vdup.32     d22, d22[0]
1165                                            vshrn.u32   d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS)
1166                                            vshrn.u32   d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
1167    vmull.u8    q2, d0, d28
1168    vmull.u8    q3, d2, d28
1169                                            vmovn.u16   d17, q9
1170                                            vld1.32     {d18, d19}, [OUT, :128]
1171                                            pld         [OUT, #(prefetch_offset * 4)]
1172    vmlal.u8    q2, d1, d29
1173    vmlal.u8    q3, d3, d29
1174                                            vuzp.8      d16, d17
1175                                            vuzp.8      d18, d19
1176    vshll.u16   q0, d4, #BILINEAR_INTERPOLATION_BITS
1177    vshll.u16   q1, d6, #BILINEAR_INTERPOLATION_BITS
1178                                            vuzp.8      d16, d17
1179                                            vuzp.8      d18, d19
1180    vmlsl.u16   q0, d4, d30
1181    vmlsl.u16   q1, d6, d31
1182                                            vmull.u8    q10, d16, d22
1183                                            vmull.u8    q11, d17, d22
1184    vmlal.u16   q0, d5, d30
1185    vmlal.u16   q1, d7, d31
1186                                            vrsra.u16   q10, q10, #8
1187                                            vrsra.u16   q11, q11, #8
1188    vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
1189    vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
1190                                            vrshrn.u16  d16, q10, #8
1191                                            vrshrn.u16  d17, q11, #8
1192    vld1.32     {d2}, [TMP3], STRIDE
1193                                            vdup.32     d22, d17[1]
1194    vld1.32     {d3}, [TMP3]
1195                                            vmvn.8      d22, d22
1196    pld         [TMP4, PF_OFFS]
1197    vld1.32     {d4}, [TMP4], STRIDE
1198                                            vmull.u8    q10, d18, d22
1199                                            vmull.u8    q11, d19, d22
1200    vld1.32     {d5}, [TMP4]
1201    pld         [TMP4, PF_OFFS]
1202    vmull.u8    q3, d2, d28
1203                                            vrshr.u16   q9, q10, #8
1204                                            vrshr.u16   q15, q11, #8
1205    vmlal.u8    q3, d3, d29
1206    vmull.u8    q1, d4, d28
1207                                            vraddhn.u16 d18, q9, q10
1208                                            vraddhn.u16 d19, q15, q11
1209    vmlal.u8    q1, d5, d29
1210    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
1211                                            vqadd.u8    q9, q8, q9
1212    vld1.32     {d22[0]}, [MASK]!
1213                                            vuzp.8      d18, d19
1214    vadd.u16    q12, q12, q13
1215                                            vuzp.8      d18, d19
1216    vmovn.u16   d16, q0
1217                                            vst1.32     {d18, d19}, [OUT, :128]!
1218.endm
1219
1220/* add_8888_8888 */
1221.macro bilinear_add_8888_8888_process_last_pixel
1222    bilinear_interpolate_last_pixel 8888, x, 8888, add
1223.endm
1224
1225.macro bilinear_add_8888_8888_process_two_pixels
1226    bilinear_interpolate_two_pixels 8888, x, 8888, add
1227.endm
1228
1229.macro bilinear_add_8888_8888_process_four_pixels
1230    bilinear_interpolate_four_pixels 8888, x, 8888, add
1231.endm
1232
1233.macro bilinear_add_8888_8888_process_pixblock_head
1234    bilinear_add_8888_8888_process_four_pixels
1235.endm
1236
1237.macro bilinear_add_8888_8888_process_pixblock_tail
1238.endm
1239
1240.macro bilinear_add_8888_8888_process_pixblock_tail_head
1241    bilinear_add_8888_8888_process_pixblock_tail
1242    bilinear_add_8888_8888_process_pixblock_head
1243.endm
1244
1245/* add_8888_8_8888 */
1246.macro bilinear_add_8888_8_8888_process_last_pixel
1247    bilinear_interpolate_last_pixel 8888, 8, 8888, add
1248.endm
1249
1250.macro bilinear_add_8888_8_8888_process_two_pixels
1251    bilinear_interpolate_two_pixels 8888, 8, 8888, add
1252.endm
1253
1254.macro bilinear_add_8888_8_8888_process_four_pixels
1255    bilinear_interpolate_four_pixels 8888, 8, 8888, add
1256.endm
1257
1258.macro bilinear_add_8888_8_8888_process_pixblock_head
1259    bilinear_add_8888_8_8888_process_four_pixels
1260.endm
1261
1262.macro bilinear_add_8888_8_8888_process_pixblock_tail
1263.endm
1264
1265.macro bilinear_add_8888_8_8888_process_pixblock_tail_head
1266    bilinear_add_8888_8_8888_process_pixblock_tail
1267    bilinear_add_8888_8_8888_process_pixblock_head
1268.endm
1269
1270
1271/* Bilinear scanline functions */
1272generate_bilinear_scanline_func \
1273    pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_neon, \
1274    8888, 8888, 2, 2, \
1275    bilinear_src_8888_8_8888_process_last_pixel, \
1276    bilinear_src_8888_8_8888_process_two_pixels, \
1277    bilinear_src_8888_8_8888_process_four_pixels, \
1278    bilinear_src_8888_8_8888_process_pixblock_head, \
1279    bilinear_src_8888_8_8888_process_pixblock_tail, \
1280    bilinear_src_8888_8_8888_process_pixblock_tail_head, \
1281    4, 28, BILINEAR_FLAG_USE_MASK
1282
1283generate_bilinear_scanline_func \
1284    pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_neon, \
1285    8888, 0565, 2, 1, \
1286    bilinear_src_8888_8_0565_process_last_pixel, \
1287    bilinear_src_8888_8_0565_process_two_pixels, \
1288    bilinear_src_8888_8_0565_process_four_pixels, \
1289    bilinear_src_8888_8_0565_process_pixblock_head, \
1290    bilinear_src_8888_8_0565_process_pixblock_tail, \
1291    bilinear_src_8888_8_0565_process_pixblock_tail_head, \
1292    4, 28, BILINEAR_FLAG_USE_MASK
1293
1294generate_bilinear_scanline_func \
1295    pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_neon, \
1296    0565, 8888, 1, 2, \
1297    bilinear_src_0565_8_x888_process_last_pixel, \
1298    bilinear_src_0565_8_x888_process_two_pixels, \
1299    bilinear_src_0565_8_x888_process_four_pixels, \
1300    bilinear_src_0565_8_x888_process_pixblock_head, \
1301    bilinear_src_0565_8_x888_process_pixblock_tail, \
1302    bilinear_src_0565_8_x888_process_pixblock_tail_head, \
1303    4, 28, BILINEAR_FLAG_USE_MASK
1304
1305generate_bilinear_scanline_func \
1306    pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_neon, \
1307    0565, 0565, 1, 1, \
1308    bilinear_src_0565_8_0565_process_last_pixel, \
1309    bilinear_src_0565_8_0565_process_two_pixels, \
1310    bilinear_src_0565_8_0565_process_four_pixels, \
1311    bilinear_src_0565_8_0565_process_pixblock_head, \
1312    bilinear_src_0565_8_0565_process_pixblock_tail, \
1313    bilinear_src_0565_8_0565_process_pixblock_tail_head, \
1314    4, 28, BILINEAR_FLAG_USE_MASK
1315
1316generate_bilinear_scanline_func \
1317    pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, \
1318    8888, 8888, 2, 2, \
1319    bilinear_over_8888_8888_process_last_pixel, \
1320    bilinear_over_8888_8888_process_two_pixels, \
1321    bilinear_over_8888_8888_process_four_pixels, \
1322    bilinear_over_8888_8888_process_pixblock_head, \
1323    bilinear_over_8888_8888_process_pixblock_tail, \
1324    bilinear_over_8888_8888_process_pixblock_tail_head, \
1325    4, 28, 0
1326
1327generate_bilinear_scanline_func \
1328    pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_neon, \
1329    8888, 8888, 2, 2, \
1330    bilinear_over_8888_8_8888_process_last_pixel, \
1331    bilinear_over_8888_8_8888_process_two_pixels, \
1332    bilinear_over_8888_8_8888_process_four_pixels, \
1333    bilinear_over_8888_8_8888_process_pixblock_head, \
1334    bilinear_over_8888_8_8888_process_pixblock_tail, \
1335    bilinear_over_8888_8_8888_process_pixblock_tail_head, \
1336    4, 28, BILINEAR_FLAG_USE_MASK
1337
1338generate_bilinear_scanline_func \
1339    pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_neon, \
1340    8888, 8888, 2, 2, \
1341    bilinear_add_8888_8888_process_last_pixel, \
1342    bilinear_add_8888_8888_process_two_pixels, \
1343    bilinear_add_8888_8888_process_four_pixels, \
1344    bilinear_add_8888_8888_process_pixblock_head, \
1345    bilinear_add_8888_8888_process_pixblock_tail, \
1346    bilinear_add_8888_8888_process_pixblock_tail_head, \
1347    4, 28, 0
1348
1349generate_bilinear_scanline_func \
1350    pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \
1351    8888, 8888, 2, 2, \
1352    bilinear_add_8888_8_8888_process_last_pixel, \
1353    bilinear_add_8888_8_8888_process_two_pixels, \
1354    bilinear_add_8888_8_8888_process_four_pixels, \
1355    bilinear_add_8888_8_8888_process_pixblock_head, \
1356    bilinear_add_8888_8_8888_process_pixblock_tail, \
1357    bilinear_add_8888_8_8888_process_pixblock_tail_head, \
1358    4, 28, BILINEAR_FLAG_USE_MASK
1359