1/*
2 * Copyright © 2012 Raspberry Pi Foundation
3 * Copyright © 2012 RISC OS Open Ltd
4 *
5 * Permission to use, copy, modify, distribute, and sell this software and its
6 * documentation for any purpose is hereby granted without fee, provided that
7 * the above copyright notice appear in all copies and that both that
8 * copyright notice and this permission notice appear in supporting
9 * documentation, and that the name of the copyright holders not be used in
10 * advertising or publicity pertaining to distribution of the software without
11 * specific, written prior permission.  The copyright holders make no
12 * representations about the suitability of this software for any purpose.  It
13 * is provided "as is" without express or implied warranty.
14 *
15 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
22 * SOFTWARE.
23 *
24 * Author:  Ben Avison (bavison@riscosopen.org)
25 *
26 */
27
28/* Prevent the stack from becoming executable */
29#if defined(__linux__) && defined(__ELF__)
30.section .note.GNU-stack,"",%progbits
31#endif
32
33	.text
34	.arch armv6
35	.object_arch armv4
36	.arm
37	.altmacro
38	.p2align 2
39
40#include "pixman-arm-asm.h"
41#include "pixman-arm-simd-asm.h"
42
43/* A head macro should do all processing which results in an output of up to
44 * 16 bytes, as far as the final load instruction. The corresponding tail macro
45 * should complete the processing of the up-to-16 bytes. The calling macro will
46 * sometimes choose to insert a preload or a decrement of X between them.
47 *   cond           ARM condition code for code block
48 *   numbytes       Number of output bytes that should be generated this time
49 *   firstreg       First WK register in which to place output
50 *   unaligned_src  Whether to use non-wordaligned loads of source image
51 *   unaligned_mask Whether to use non-wordaligned loads of mask image
52 *   preload        If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
53 */
54
/* Per-line init for plain blits: flag the stride registers as saved so
 * the inner loop may reuse them as extra WK registers. */
.macro blit_init
        line_saved_regs STRIDE_D, STRIDE_S
.endm
58
/* Leading/trailing-pixel head for blits: just load up to 16 bytes from the
 * source; no per-pixel processing is required before the store. */
.macro blit_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   cond, numbytes, firstreg, SRC, unaligned_src
.endm
62
/* Main inner loop for blits: copies 32 bytes per iteration using all eight
 * WK registers. STRIDE_D/STRIDE_S/MASK/STRIDE_M are temporarily aliased as
 * WK4-7 (safe because blit_init marked them as saved per line).
 * process_head/process_tail/dst_alignment are unused in this variant. */
.macro blit_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    MASK
    WK7     .req    STRIDE_M
110:    pixld   , 16, 0, SRC, unaligned_src
        pixld   , 16, 4, SRC, unaligned_src
        pld     [SRC, SCRATCH]                  @ prefetch ahead of the loads
        pixst   , 16, 0, DST
        pixst   , 16, 4, DST
        subs    X, X, #32*8/src_bpp             @ 32 bytes = 32*8/src_bpp pixels done
        bhs     110b
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm
80
/* Plain 32bpp -> 32bpp copy (SRC operator, identical formats). */
generate_composite_function \
    pixman_composite_src_8888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    4, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop
91
/* Plain 16bpp -> 16bpp copy (SRC operator, identical formats). */
generate_composite_function \
    pixman_composite_src_0565_0565_asm_armv6, 16, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    4, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop
102
/* Plain 8bpp -> 8bpp copy (SRC operator, identical formats). */
generate_composite_function \
    pixman_composite_src_8_8_asm_armv6, 8, 0, 8, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    3, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop
113
114/******************************************************************************/
115
/* Solid fill, 32bpp: load the constant source colour from the stack and
 * replicate it into STRIDE_S/MASK/STRIDE_M, which fill_process_tail
 * aliases as WK4-7 so up to 16 bytes can be stored per iteration. */
.macro src_n_8888_init
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
.endm
122
/* Solid fill, 16bpp: duplicate the 16-bit colour into both halfwords of
 * SRC, then replicate across the other fill registers. */
.macro src_n_0565_init
        ldrh    SRC, [sp, #ARGS_STACK_OFFSET]
        orr     SRC, SRC, lsl #16               @ SRC |= SRC << 16
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
.endm
130
/* Solid fill, 8bpp: replicate the 8-bit value into all four bytes of SRC,
 * then across the other fill registers. */
.macro src_n_8_init
        ldrb    SRC, [sp, #ARGS_STACK_OFFSET]
        orr     SRC, SRC, lsl #8                @ SRC |= SRC << 8
        orr     SRC, SRC, lsl #16               @ SRC |= SRC << 16
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
.endm
139
/* Store up to 16 bytes of the pre-replicated fill colour. The colour was
 * loaded into SRC/STRIDE_S/MASK/STRIDE_M by the init macro, so they are
 * aliased as WK4-7 and stored directly (always starting at register 4,
 * independent of firstreg). */
.macro fill_process_tail  cond, numbytes, firstreg
    WK4     .req    SRC
    WK5     .req    STRIDE_S
    WK6     .req    MASK
    WK7     .req    STRIDE_M
        pixst   cond, numbytes, 4, DST
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm
151
/* Solid fill, 32bpp destination. Commas separate every macro argument, in
 * keeping with the other generate_composite_function invocations in this
 * file; relying on whitespace separation risks positional mis-binding. */
generate_composite_function \
    pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    0, /* prefetch distance doesn't apply */ \
    src_n_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    nop_macro, /* process head */ \
    fill_process_tail
161
/* Solid fill, 16bpp destination. Commas restored after the flags, init,
 * cleanup and process-head arguments for consistent argument binding. */
generate_composite_function \
    pixman_composite_src_n_0565_asm_armv6, 0, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    0, /* prefetch distance doesn't apply */ \
    src_n_0565_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    nop_macro, /* process head */ \
    fill_process_tail
171
/* Solid fill, 8bpp destination. Commas restored after the flags, init,
 * cleanup and process-head arguments for consistent argument binding. */
generate_composite_function \
    pixman_composite_src_n_8_asm_armv6, 0, 0, 8, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    0, /* prefetch distance doesn't apply */ \
    src_n_8_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    nop_macro, /* process head */ \
    fill_process_tail
181
182/******************************************************************************/
183
/* Force the alpha byte of one x888 pixel to 0xFF, under condition 'cond'. */
.macro src_x888_8888_pixel, cond, reg
        orr&cond WK&reg, WK&reg, #0xFF000000
.endm
187
/* x888 -> 8888 head: load up to 16 source bytes; alpha is fixed up in the
 * matching tail macro. */
.macro pixman_composite_src_x888_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   cond, numbytes, firstreg, SRC, unaligned_src
.endm
191
/* x888 -> 8888 tail: set alpha = 0xFF on 1, 2 or 4 pixels depending on
 * numbytes (4, 8 or 16 bytes = that many bytes of 32bpp output). */
.macro pixman_composite_src_x888_8888_process_tail   cond, numbytes, firstreg
        src_x888_8888_pixel cond, %(firstreg+0)
 .if numbytes >= 8
        src_x888_8888_pixel cond, %(firstreg+1)
  .if numbytes == 16
        src_x888_8888_pixel cond, %(firstreg+2)
        src_x888_8888_pixel cond, %(firstreg+3)
  .endif
 .endif
.endm
202
/* x888 -> 8888: copy while forcing the alpha channel opaque. */
generate_composite_function \
    pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
    3, /* prefetch distance */ \
    nop_macro, /* init */ \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    pixman_composite_src_x888_8888_process_head, \
    pixman_composite_src_x888_8888_process_tail
212
213/******************************************************************************/
214
/* 0565 -> 8888 conversion setup: loop-invariant green mask and alpha
 * constant, plus GE flags configured for the SEL-based byte merges. */
.macro src_0565_8888_init
        /* Hold loop invariants in MASK and STRIDE_M */
        ldr     MASK, =0x07E007E0               @ green bits of two 0565 pixels
        mov     STRIDE_M, #0xFF000000           @ opaque alpha for the results
        /* Set GE[3:0] to 1010 so SEL instructions do what we want */
        uadd8   SCRATCH, SCRATCH, SCRATCH       @ 0x8000+0x8000 carries in bytes 1,3
        .endm
223
/* Convert two 0565 pixels packed in WK&reg1 into two 8888 pixels in
 * WK&reg1 (low source pixel) and WK&reg2 (high source pixel). Top bits of
 * each 5/6-bit field are replicated into the low bits; alpha comes from
 * STRIDE_M (0xFF000000). Bit diagrams trace each register's contents. */
.macro src_0565_8888_2pixels, reg1, reg2
        and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000
        bic     WK&reg2, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb
        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg
        mov     WK&reg1, WK&reg2, lsl #16          @ rrrrr000000bbbbb0000000000000000
        mov     SCRATCH, SCRATCH, ror #19          @ GGGG0000ggggggggggg00000GGGGGGGG
        bic     WK&reg2, WK&reg2, WK&reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ rrrrrrrrrr0bbbbbbbbbb00000000000
        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ RRRRRRRRRR0BBBBBBBBBB00000000000
        pkhtb   WK&reg1, WK&reg1, WK&reg1, asr #5  @ rrrrrrrr--------bbbbbbbb--------
        sel     WK&reg1, WK&reg1, SCRATCH          @ rrrrrrrrggggggggbbbbbbbb--------
        mov     SCRATCH, SCRATCH, ror #16          @ ggg00000GGGGGGGGGGGG0000gggggggg
        pkhtb   WK&reg2, WK&reg2, WK&reg2, asr #5  @ RRRRRRRR--------BBBBBBBB--------
        sel     WK&reg2, WK&reg2, SCRATCH          @ RRRRRRRRGGGGGGGGBBBBBBBB--------
        orr     WK&reg1, STRIDE_M, WK&reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
        orr     WK&reg2, STRIDE_M, WK&reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
.endm
241
242/* This version doesn't need STRIDE_M, but is one instruction longer.
243   It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
244        and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000
245        bic     WK&reg1, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb
246        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg
247        mov     WK&reg2, WK&reg1, lsr #16          @ 0000000000000000RRRRR000000BBBBB
248        mov     SCRATCH, SCRATCH, ror #27          @ GGGGGGGGGGGG0000ggggggggggg00000
249        bic     WK&reg1, WK&reg1, WK&reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
250        mov     WK&reg2, WK&reg2, lsl #3           @ 0000000000000RRRRR000000BBBBB000
251        mov     WK&reg1, WK&reg1, lsl #3           @ 0000000000000rrrrr000000bbbbb000
252        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ 0000000000000RRRRRRRRRR0BBBBBBBB
253        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
254        pkhbt   WK&reg2, WK&reg2, WK&reg2, lsl #5  @ --------RRRRRRRR--------BBBBBBBB
255        pkhbt   WK&reg1, WK&reg1, WK&reg1, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
256        sel     WK&reg2, SCRATCH, WK&reg2          @ --------RRRRRRRRGGGGGGGGBBBBBBBB
257        sel     WK&reg1, SCRATCH, WK&reg1          @ --------rrrrrrrrggggggggbbbbbbbb
258        orr     WK&reg2, WK&reg2, #0xFF000000      @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
259        orr     WK&reg1, WK&reg1, #0xFF000000      @ 11111111rrrrrrrrggggggggbbbbbbbb
260*/
261
/* Convert a single 0565 pixel in the low half of WK&reg to 8888 in WK&reg,
 * replicating top bits into low bits and forcing opaque alpha. */
.macro src_0565_8888_1pixel, reg
        bic     SCRATCH, WK&reg, MASK              @ 0000000000000000rrrrr000000bbbbb
        and     WK&reg, WK&reg, MASK               @ 000000000000000000000gggggg00000
        mov     SCRATCH, SCRATCH, lsl #3           @ 0000000000000rrrrr000000bbbbb000
        mov     WK&reg, WK&reg, lsl #5             @ 0000000000000000gggggg0000000000
        orr     SCRATCH, SCRATCH, SCRATCH, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
        orr     WK&reg, WK&reg, WK&reg, lsr #6     @ 000000000000000gggggggggggg00000
        pkhbt   SCRATCH, SCRATCH, SCRATCH, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
        sel     WK&reg, WK&reg, SCRATCH            @ --------rrrrrrrrggggggggbbbbbbbb
        orr     WK&reg, WK&reg, #0xFF000000        @ 11111111rrrrrrrrggggggggbbbbbbbb
.endm
273
/* 0565 -> 8888 head: load half as many source bytes as output bytes
 * (16bpp in, 32bpp out). The 16-byte case loads into two non-adjacent
 * register pairs via pixldst so the tail can expand in place. */
.macro src_0565_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
 .if numbytes == 16
        pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src
 .elseif numbytes == 8
        pixld   , 4, firstreg, SRC, unaligned_src
 .elseif numbytes == 4
        pixld   , 2, firstreg, SRC, unaligned_src
 .endif
.endm
283
/* 0565 -> 8888 tail: expand the loaded 0565 pixels to 8888, two at a time
 * where possible, one for the 4-byte case. */
.macro src_0565_8888_process_tail   cond, numbytes, firstreg
 .if numbytes == 16
        src_0565_8888_2pixels firstreg, %(firstreg+1)
        src_0565_8888_2pixels %(firstreg+2), %(firstreg+3)
 .elseif numbytes == 8
        src_0565_8888_2pixels firstreg, %(firstreg+1)
 .else
        src_0565_8888_1pixel firstreg
 .endif
.endm
294
/* 0565 -> 8888: format-converting copy with opaque alpha. */
generate_composite_function \
    pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
    3, /* prefetch distance */ \
    src_0565_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    src_0565_8888_process_head, \
    src_0565_8888_process_tail
304
305/******************************************************************************/
306
/* x888 -> 0565 setup: loop-invariant red/blue extraction mask, and mark
 * STRIDE_S/ORIG_W as saved so they can be reused as temporaries. */
.macro src_x888_0565_init
        /* Hold loop invariant in MASK */
        ldr     MASK, =0x001F001F               @ two 5-bit fields, 16 bits apart
        line_saved_regs  STRIDE_S, ORIG_W
.endm
312
/* Convert one x888 pixel in WK&s to 0565 in the low half of WK&d
 * (truncating each channel to its top bits). Clobbers STRIDE_S. */
.macro src_x888_0565_1pixel  s, d
        and     WK&d, MASK, WK&s, lsr #3           @ 00000000000rrrrr00000000000bbbbb
        and     STRIDE_S, WK&s, #0xFC00            @ 0000000000000000gggggg0000000000
        orr     WK&d, WK&d, WK&d, lsr #5           @ 00000000000-----rrrrr000000bbbbb
        orr     WK&d, WK&d, STRIDE_S, lsr #5       @ 00000000000-----rrrrrggggggbbbbb
        /* Top 16 bits are discarded during the following STRH */
.endm
320
/* Convert two x888 pixels (WK&slo low result pixel, WK&shi high) into one
 * word of two packed 0565 pixels in WK&d. WK&tmp holds the high pixel's
 * intermediate; SCRATCH is clobbered. */
.macro src_x888_0565_2pixels  slo, shi, d, tmp
        and     SCRATCH, WK&shi, #0xFC00           @ 0000000000000000GGGGGG0000000000
        and     WK&tmp, MASK, WK&shi, lsr #3       @ 00000000000RRRRR00000000000BBBBB
        and     WK&shi, MASK, WK&slo, lsr #3       @ 00000000000rrrrr00000000000bbbbb
        orr     WK&tmp, WK&tmp, WK&tmp, lsr #5     @ 00000000000-----RRRRR000000BBBBB
        orr     WK&tmp, WK&tmp, SCRATCH, lsr #5    @ 00000000000-----RRRRRGGGGGGBBBBB
        and     SCRATCH, WK&slo, #0xFC00           @ 0000000000000000gggggg0000000000
        orr     WK&shi, WK&shi, WK&shi, lsr #5     @ 00000000000-----rrrrr000000bbbbb
        orr     WK&shi, WK&shi, SCRATCH, lsr #5    @ 00000000000-----rrrrrggggggbbbbb
        pkhbt   WK&d, WK&shi, WK&tmp, lsl #16      @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
.endm
332
/* x888 -> 0565 head: output needs twice as many source bytes as it emits.
 * For 16 output bytes (8 pixels = 32 source bytes) the loads are
 * interleaved with early conversions so that only four spare registers
 * (WK4-7, aliased onto saved line registers) are ever needed at once. */
.macro src_x888_0565_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        WK4     .req    STRIDE_S
        WK5     .req    STRIDE_M
        WK6     .req    WK3
        WK7     .req    ORIG_W
 .if numbytes == 16
        pixld   , 16, 4, SRC, 0
        src_x888_0565_2pixels  4, 5, 0, 0
        pixld   , 8, 4, SRC, 0
        src_x888_0565_2pixels  6, 7, 1, 1
        pixld   , 8, 6, SRC, 0
 .else
        pixld   , numbytes*2, 4, SRC, 0
 .endif
.endm
348
/* x888 -> 0565 tail: finish converting the pixels loaded by the head and
 * store the packed 0565 results (the store is done here because the macro
 * set advertises FLAG_PROCESS_DOES_STORE). */
.macro src_x888_0565_process_tail   cond, numbytes, firstreg
 .if numbytes == 16
        src_x888_0565_2pixels  4, 5, 2, 2
        src_x888_0565_2pixels  6, 7, 3, 4
 .elseif numbytes == 8
        src_x888_0565_2pixels  4, 5, 1, 1
        src_x888_0565_2pixels  6, 7, 2, 2
 .elseif numbytes == 4
        src_x888_0565_2pixels  4, 5, 1, 1
 .else
        src_x888_0565_1pixel  4, 1
 .endif
 .if numbytes == 16
        pixst   , numbytes, 0, DST
 .else
        pixst   , numbytes, 1, DST
 .endif
        .unreq  WK4
        .unreq  WK5
        .unreq  WK6
        .unreq  WK7
.endm
371
/* x888 -> 0565: format-converting copy, dropping alpha. */
generate_composite_function \
    pixman_composite_src_x888_0565_asm_armv6, 32, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
    3, /* prefetch distance */ \
    src_x888_0565_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    src_x888_0565_process_head, \
    src_x888_0565_process_tail
381
382/******************************************************************************/
383
/* Saturating byte-wise add of 8 a8 pixels: source bytes were preloaded
 * into MASK/STRIDE_M by the head macro. */
.macro add_8_8_8pixels  cond, dst1, dst2
        uqadd8&cond  WK&dst1, WK&dst1, MASK
        uqadd8&cond  WK&dst2, WK&dst2, STRIDE_M
.endm
388
/* Saturating byte-wise add of 4 a8 pixels (source preloaded into MASK). */
.macro add_8_8_4pixels  cond, dst
        uqadd8&cond  WK&dst, WK&dst, MASK
.endm
392
/* ADD operator head: load source into WK4/5 (aliased onto MASK/STRIDE_M)
 * and destination into firstreg onwards. In the 16-byte case the first 8
 * source bytes are combined early so WK4/5 can be reloaded for the rest. */
.macro add_8_8_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    MASK
    WK5     .req    STRIDE_M
 .if numbytes == 16
        pixld   cond, 8, 4, SRC, unaligned_src
        pixld   cond, 16, firstreg, DST, 0
        add_8_8_8pixels cond, firstreg, %(firstreg+1)
        pixld   cond, 8, 4, SRC, unaligned_src
 .else
        pixld   cond, numbytes, 4, SRC, unaligned_src
        pixld   cond, numbytes, firstreg, DST, 0
 .endif
    .unreq  WK4
    .unreq  WK5
.endm
408
/* ADD operator tail: combine whatever the head left unprocessed (the last
 * 8 bytes in the 16-byte case, otherwise everything). */
.macro add_8_8_process_tail  cond, numbytes, firstreg
 .if numbytes == 16
        add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3)
 .elseif numbytes == 8
        add_8_8_8pixels cond, firstreg, %(firstreg+1)
 .else
        add_8_8_4pixels cond, firstreg
 .endif
.endm
418
/* ADD operator, 8bpp + 8bpp with saturation. */
generate_composite_function \
    pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \
    2, /* prefetch distance */ \
    nop_macro, /* init */ \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    add_8_8_process_head, \
    add_8_8_process_tail
428
429/******************************************************************************/
430
/* OVER 8888/8888 setup: rounding constant in MASK, GE flags for SEL, and
 * stride/width registers marked saved so they can serve as WK4-7. */
.macro over_8888_8888_init
        /* Hold loop invariant in MASK */
        ldr     MASK, =0x00800080               @ +0.5 rounding term per channel
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, MASK, MASK             @ 0x80+0x80 carries in bytes 0,2
        line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
.endm
438
/* OVER 8888/8888 head: load source into WK4-7 (aliased saved registers)
 * and destination into firstreg onwards. */
.macro over_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    STRIDE_M
    WK7     .req    ORIG_W
        pixld   , numbytes, %(4+firstreg), SRC, unaligned_src
        pixld   , numbytes, firstreg, DST, 0
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm
451
/* Set the Z flag if every source pixel in the batch is fully transparent,
 * allowing the caller to skip compositing entirely. */
.macro over_8888_8888_check_transparent  numbytes, reg0, reg1, reg2, reg3
        /* Since these colours a premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
        teq     WK&reg0, #0
 .if numbytes > 4
        teqeq   WK&reg1, #0
  .if numbytes > 8
        teqeq   WK&reg2, #0
        teqeq   WK&reg3, #0
  .endif
 .endif
.endm
463
/* Shift the first source pixel's alpha down to bits 0-7, ready for the
 * multiplies in over_8888_8888_1pixel. */
.macro over_8888_8888_prepare  next
        mov     WK&next, WK&next, lsr #24
.endm
467
/* Composite one premultiplied source pixel OVER one destination pixel:
 * dst = src + dst * (255 - src.alpha), approximating /255 by *257/256.
 *   src     register holding source alpha in bits 0-7 (pre-shifted)
 *   dst     register holding destination pixel; receives the result
 *   offset  byte offset of this source pixel relative to SRC, for reload
 *   next    register holding the next source pixel (its alpha extraction
 *           is started here to fill a pipeline stall)
 */
.macro over_8888_8888_1pixel src, dst, offset, next
        /* src = destination component multiplier */
        rsb     WK&src, WK&src, #255
        /* Split even/odd bytes of dst into SCRATCH/dst */
        uxtb16  SCRATCH, WK&dst
        uxtb16  WK&dst, WK&dst, ror #8
        /* Multiply through, adding 0.5 to the upper byte of result for rounding */
        mla     SCRATCH, SCRATCH, WK&src, MASK
        mla     WK&dst, WK&dst, WK&src, MASK
        /* Where we would have had a stall between the result of the first MLA and the shifter input,
         * reload the complete source pixel */
        ldr     WK&src, [SRC, #offset]
        /* Multiply by 257/256 to approximate 256/255 */
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
        /* In this stall, start processing the next pixel */
 .if offset < -4
        mov     WK&next, WK&next, lsr #24
 .endif
        uxtab16 WK&dst, WK&dst, WK&dst, ror #8
        /* Recombine even/odd bytes of multiplied destination */
        mov     SCRATCH, SCRATCH, ror #8
        sel     WK&dst, SCRATCH, WK&dst
        /* Saturated add of source to multiplied destination */
        uqadd8  WK&dst, WK&dst, WK&src
.endm
493
/* OVER 8888/8888 tail: skip the whole batch (and its store) if every
 * source pixel is transparent; otherwise composite each pixel in turn and
 * store the results. PROCESS_OFF tracks each pixel's offset back from SRC
 * so over_8888_8888_1pixel can reload it mid-sequence. */
.macro over_8888_8888_process_tail  cond, numbytes, firstreg
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    STRIDE_M
    WK7     .req    ORIG_W
        over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
        beq     10f                             @ all transparent: dst unchanged
        over_8888_8888_prepare  %(4+firstreg)
 .set PROCESS_REG, firstreg
 .set PROCESS_OFF, -numbytes
 .rept numbytes / 4
        over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
  .set PROCESS_OFF, PROCESS_OFF+4
 .endr
        pixst   , numbytes, firstreg, DST
10:
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm
516
/* OVER operator, 8888 source over 8888 destination. Commas restored after
 * the dst-bpp argument and the flags word, matching the other invocations
 * in this file so positional arguments bind unambiguously. */
generate_composite_function \
    pixman_composite_over_8888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
    2, /* prefetch distance */ \
    over_8888_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    over_8888_8888_process_head, \
    over_8888_8888_process_tail
526
527/******************************************************************************/
528
529/* Multiply each byte of a word by a byte.
530 * Useful when there aren't any obvious ways to fill the stalls with other instructions.
531 * word  Register containing 4 bytes
532 * byte  Register containing byte multiplier (bits 8-31 must be 0)
533 * tmp   Scratch register
534 * half  Register containing the constant 0x00800080
535 * GE[3:0] bits must contain 0101
536 */
.macro mul_8888_8  word, byte, tmp, half
        /* Split even/odd bytes of word apart */
        uxtb16  tmp, word
        uxtb16  word, word, ror #8
        /* Multiply bytes together with rounding, then by 257/256 */
        mla     tmp, tmp, byte, half
        mla     word, word, byte, half /* 1 stall follows */
        uxtab16 tmp, tmp, tmp, ror #8  /* 1 stall follows */
        uxtab16 word, word, word, ror #8
        /* Recombine bytes: SEL picks odd bytes from word, even from tmp
         * (GE[3:0] = 0101 as required by the header comment above) */
        mov     tmp, tmp, ror #8
        sel     word, tmp, word
.endm
550
551/******************************************************************************/
552
/* OVER with constant mask setup: keep only the mask's alpha byte in MASK,
 * the rounding constant in STRIDE_M, and configure GE flags for SEL. */
.macro over_8888_n_8888_init
        /* Mask is constant */
        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
        /* Hold loop invariant in STRIDE_M */
        ldr     STRIDE_M, =0x00800080
        /* We only want the alpha bits of the constant mask */
        mov     MASK, MASK, lsr #24
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, STRIDE_M, STRIDE_M
        line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W
.endm
564
/* OVER 8888/n/8888 head: source pixels go into WK4/WK5 only (selected by
 * firstreg parity), because WK6/WK7 are needed as temporaries in the tail. */
.macro over_8888_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    Y
    WK5     .req    STRIDE_D
    WK6     .req    STRIDE_S
    WK7     .req    ORIG_W
        pixld   , numbytes, %(4+(firstreg%2)), SRC, unaligned_src
        pixld   , numbytes, firstreg, DST, 0
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm
577
/* One pixel of OVER with constant mask: src *= mask.alpha, then
 * dst = src + dst * (255 - src.alpha). WK6 holds 255; WK7 is a temp. */
.macro over_8888_n_8888_1pixel src, dst
        mul_8888_8  WK&src, MASK, SCRATCH, STRIDE_M
        sub     WK7, WK6, WK&src, lsr #24       @ WK7 = 255 - masked src alpha
        mul_8888_8  WK&dst, WK7, SCRATCH, STRIDE_M
        uqadd8  WK&dst, WK&dst, WK&src
.endm
584
/* OVER 8888/n/8888 tail: skip the batch if all source pixels are
 * transparent, otherwise composite pixel by pixel. Source pixels live in
 * WK4/WK5 only; for 16-byte batches the second pair is reloaded from
 * memory half way through (see inline comment). */
.macro over_8888_n_8888_process_tail  cond, numbytes, firstreg
    WK4     .req    Y
    WK5     .req    STRIDE_D
    WK6     .req    STRIDE_S
    WK7     .req    ORIG_W
        over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg)
        beq     10f                             @ all transparent: dst unchanged
        mov     WK6, #255
 .set PROCESS_REG, firstreg
 .rept numbytes / 4
  .if numbytes == 16 && PROCESS_REG == 2
        /* We're using WK6 and WK7 as temporaries, so half way through
         * 4 pixels, reload the second two source pixels but this time
         * into WK4 and WK5 */
        ldmdb   SRC, {WK4, WK5}
  .endif
        over_8888_n_8888_1pixel  %(4+(PROCESS_REG%2)), %(PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
 .endr
        pixst   , numbytes, firstreg, DST
10:
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm
611
/* OVER operator, 8888 source with constant mask over 8888 destination.
 * Commas restored after the dst-bpp argument and the flags word for
 * consistent, unambiguous argument binding. */
generate_composite_function \
    pixman_composite_over_8888_n_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
    2, /* prefetch distance */ \
    over_8888_n_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    over_8888_n_8888_process_head, \
    over_8888_n_8888_process_tail
621
622/******************************************************************************/
623
/* OVER constant source with a8 mask setup: split the constant colour into
 * even bytes (STRIDE_S) and odd bytes (SRC) once per composite, and set
 * the GE flags for SEL. The 0x00800080 rounding constant cannot stay in a
 * register here; over_n_8_8888_newline reloads it into STRIDE_D per line. */
.macro over_n_8_8888_init
        /* Source is constant, but splitting it into even/odd bytes is a loop invariant */
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        /* Not enough registers to hold this constant, but we still use it here to set GE[3:0] */
        ldr     SCRATCH, =0x00800080
        uxtb16  STRIDE_S, SRC
        uxtb16  SRC, SRC, ror #8
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, SCRATCH, SCRATCH
        line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
.endm
635
/* Per-line setup: reload the rounding constant into STRIDE_D (clobbered
 * during the line). The branch jumps over the literal pool that .ltorg
 * dumps here, keeping the =constant within LDR's reach. */
.macro over_n_8_8888_newline
        ldr     STRIDE_D, =0x00800080
        b       1f
 .ltorg
1:
.endm
642
/* OVER n/8/8888 head: the a8 mask supplies one byte per 32bpp output
 * pixel, so numbytes/4 mask bytes are loaded into WK4 (aliased STRIDE_M);
 * destination pixels load into firstreg onwards. */
.macro over_n_8_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    STRIDE_M
        pixld   , numbytes/4, 4, MASK, unaligned_mask
        pixld   , numbytes, firstreg, DST, 0
    .unreq  WK4
.endm
649
/* One pixel of OVER with constant colour and a8 mask:
 *   src  byte index (0-3) of this pixel's mask value within WK4
 *   dst  register holding the destination pixel; receives the result
 * Computes masked = colour * m (with *257/256 to approximate /255), then
 * dst = masked + dst * (255 - masked.alpha). Clobbers Y, ORIG_W, SCRATCH. */
.macro over_n_8_8888_1pixel src, dst
        uxtb    Y, WK4, ror #src*8              @ Y = this pixel's mask byte
        /* Trailing part of multiplication of source */
        mla     SCRATCH, STRIDE_S, Y, STRIDE_D  @ even bytes * mask + 0x80
        mla     Y, SRC, Y, STRIDE_D             @ odd bytes * mask + 0x80
        mov     ORIG_W, #255
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
        uxtab16 Y, Y, Y, ror #8
        mov     SCRATCH, SCRATCH, ror #8
        sub     ORIG_W, ORIG_W, Y, lsr #24      @ ORIG_W = 255 - masked src alpha
        sel     Y, SCRATCH, Y                   @ Y = masked source pixel
        /* Then multiply the destination */
        mul_8888_8  WK&dst, ORIG_W, SCRATCH, STRIDE_D
        uqadd8  WK&dst, WK&dst, Y
.endm
665
/* OVER n/8/8888 tail: if all loaded mask bytes are zero the batch leaves
 * the destination untouched; otherwise composite each pixel using its
 * mask byte and store. */
.macro over_n_8_8888_process_tail  cond, numbytes, firstreg
    WK4     .req    STRIDE_M
        teq     WK4, #0                         @ all mask bytes zero?
        beq     10f
 .set PROCESS_REG, firstreg
 .rept numbytes / 4
        over_n_8_8888_1pixel  %(PROCESS_REG-firstreg), %(PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
 .endr
        pixst   , numbytes, firstreg, DST
10:
    .unreq  WK4
.endm
679
/* OVER operator, constant source with a8 mask over 8888 destination.
 * Commas restored after the dst-bpp argument and the flags word for
 * consistent, unambiguous argument binding. */
generate_composite_function \
    pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
    2, /* prefetch distance */ \
    over_n_8_8888_init, \
    over_n_8_8888_newline, \
    nop_macro, /* cleanup */ \
    over_n_8_8888_process_head, \
    over_n_8_8888_process_tail
689
690/******************************************************************************/
691
/* OVER_REVERSE with constant source setup: pre-split the colour into
 * red/blue (STRIDE_S) and alpha/green (STRIDE_M) halves, keep the
 * rounding constant in MASK and configure the GE flags for SEL. */
.macro over_reverse_n_8888_init
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        ldr     MASK, =0x00800080
        /* Split source pixel into RB/AG parts */
        uxtb16  STRIDE_S, SRC
        uxtb16  STRIDE_M, SRC, ror #8
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, MASK, MASK
        line_saved_regs  STRIDE_D, ORIG_W
.endm
702
/* Per-line setup: reload the 255 constant (STRIDE_D is clobbered per line). */
.macro over_reverse_n_8888_newline
        mov     STRIDE_D, #0xFF
.endm
706
/* OVER_REVERSE head: only destination pixels need loading (source is the
 * constant held in registers). */
.macro over_reverse_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   , numbytes, firstreg, DST, 0
.endm
710
/* One pixel of dest OVER constant source:
 *   d        register holding the destination pixel; receives the result
 *   is_only  1 when this is the batch's only pixel, allowing the store to
 *            be skipped entirely when the destination is already opaque
 * Fast paths: dest == 0 -> result is the source; dest alpha == 255 ->
 * dest unchanged. Otherwise d = d + src * (255 - d.alpha). */
.macro over_reverse_n_8888_1pixel  d, is_only
        teq     WK&d, #0
        beq     8f       /* replace with source */
        bics    ORIG_W, STRIDE_D, WK&d, lsr #24  @ ORIG_W = 255 - dest alpha
 .if is_only == 1
        beq     49f      /* skip store */
 .else
        beq     9f       /* write same value back */
 .endif
        mla     SCRATCH, STRIDE_S, ORIG_W, MASK /* red/blue */
        mla     ORIG_W, STRIDE_M, ORIG_W, MASK  /* alpha/green */
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
        uxtab16 ORIG_W, ORIG_W, ORIG_W, ror #8
        mov     SCRATCH, SCRATCH, ror #8
        sel     ORIG_W, SCRATCH, ORIG_W
        uqadd8  WK&d, WK&d, ORIG_W
        b       9f
8:      mov     WK&d, SRC
9:
.endm
731
/* OVER_REVERSE batch tail: for multi-pixel batches, AND all dest pixels
 * together first — if every alpha byte is 255 the whole batch is opaque
 * and the store can be skipped. Otherwise process each pixel and store. */
.macro over_reverse_n_8888_tail  numbytes, reg1, reg2, reg3, reg4
 .if numbytes == 4
        over_reverse_n_8888_1pixel  reg1, 1
 .else
        and     SCRATCH, WK&reg1, WK&reg2
  .if numbytes == 16
        and     SCRATCH, SCRATCH, WK&reg3
        and     SCRATCH, SCRATCH, WK&reg4
  .endif
        mvns    SCRATCH, SCRATCH, asr #24       @ Z set iff all alphas are 0xFF
        beq     49f /* skip store if all opaque */
        over_reverse_n_8888_1pixel  reg1, 0
        over_reverse_n_8888_1pixel  reg2, 0
  .if numbytes == 16
        over_reverse_n_8888_1pixel  reg3, 0
        over_reverse_n_8888_1pixel  reg4, 0
  .endif
 .endif
        pixst   , numbytes, reg1, DST
49:
.endm
753
/* Adapter to the generic process-tail interface: expand the register list
 * and delegate to over_reverse_n_8888_tail. */
.macro over_reverse_n_8888_process_tail  cond, numbytes, firstreg
        over_reverse_n_8888_tail  numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
.endm
757
/* OVER_REVERSE operator, destination over constant source. Comma restored
 * after the dst-bpp argument, matching the flags line below and the other
 * invocations in this file. */
generate_composite_function \
    pixman_composite_over_reverse_n_8888_asm_armv6, 0, 0, 32, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
    3, /* prefetch distance */ \
    over_reverse_n_8888_init, \
    over_reverse_n_8888_newline, \
    nop_macro, /* cleanup */ \
    over_reverse_n_8888_process_head, \
    over_reverse_n_8888_process_tail
767
768/******************************************************************************/
769
/* OVER white source with component-alpha mask, setup: name temporaries
 * over otherwise-unused registers, load the rounding constants and set
 * GE[3:0] = 0101. DST_PRELOAD_BIAS is raised because the destination is
 * only loaded in the tail macros. */
.macro over_white_8888_8888_ca_init
        HALF    .req    SRC
        TMP0    .req    STRIDE_D
        TMP1    .req    STRIDE_S
        TMP2    .req    STRIDE_M
        TMP3    .req    ORIG_W
        WK4     .req    SCRATCH
        line_saved_regs STRIDE_D, STRIDE_M, ORIG_W
        ldr     SCRATCH, =0x800080
        mov     HALF, #0x80
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, SCRATCH, SCRATCH
        .set DST_PRELOAD_BIAS, 8
.endm
784
/* Undo the aliases and preload bias established by the init macro. */
.macro over_white_8888_8888_ca_cleanup
        .set DST_PRELOAD_BIAS, 0
        .unreq  HALF
        .unreq  TMP0
        .unreq  TMP1
        .unreq  TMP2
        .unreq  TMP3
        .unreq  WK4
.endm
794
/* General case of white-source component-alpha OVER for one pixel:
 *   m  register holding the mask pixel (TMP0 must already hold ~m)
 *   d  register holding the destination pixel; receives the result
 * Each destination channel is scaled by the inverse mask channel using
 * 16x16 halfword multiplies, then the mask is added with saturation. */
.macro over_white_8888_8888_ca_combine  m, d
        uxtb16  TMP1, TMP0                /* rb_notmask */
        uxtb16  TMP2, d                   /* rb_dest; 1 stall follows */
        smlatt  TMP3, TMP2, TMP1, HALF    /* red */
        smlabb  TMP2, TMP2, TMP1, HALF    /* blue */
        uxtb16  TMP0, TMP0, ror #8        /* ag_notmask */
        uxtb16  TMP1, d, ror #8           /* ag_dest; 1 stall follows */
        smlatt  d, TMP1, TMP0, HALF       /* alpha */
        smlabb  TMP1, TMP1, TMP0, HALF    /* green */
        pkhbt   TMP0, TMP2, TMP3, lsl #16 /* rb; 1 stall follows */
        pkhbt   TMP1, TMP1, d, lsl #16    /* ag */
        uxtab16 TMP0, TMP0, TMP0, ror #8
        uxtab16 TMP1, TMP1, TMP1, ror #8
        mov     TMP0, TMP0, ror #8
        sel     d, TMP0, TMP1
        uqadd8  d, d, m                   /* d is a late result */
.endm
812
/* Load one mask pixel into WK1 and one destination pixel into WK3. */
.macro over_white_8888_8888_ca_1pixel_head
        pixld   , 4, 1, MASK, 0
        pixld   , 4, 3, DST, 0
.endm
817
/* Finish one pixel: WK1 = mask, WK3 = dest (loaded by the head macro). */
.macro over_white_8888_8888_ca_1pixel_tail
        mvn     TMP0, WK1                 /* TMP0 = ~mask, for the combine macro */
        teq     WK1, WK1, asr #32         /* Zc if transparent, ZC if opaque */
        bne     01f                       /* mixed mask: do the full blend */
        bcc     03f                       /* transparent: dest unchanged, skip the store */
        mov     WK3, WK1                  /* opaque: result is the white source = mask */
        b       02f
01:     over_white_8888_8888_ca_combine WK1, WK3
02:     pixst   , 4, 3, DST
03:
.endm
829
/* Load two mask pixels into WK1 and WK2. */
.macro over_white_8888_8888_ca_2pixels_head
        pixld   , 8, 1, MASK, 0
.endm
833
/* Finish two pixels: WK1/WK2 = masks (from the head macro); dest pixels are
 * loaded here into WK3/WK4 and both are stored together.  If the first mask
 * is transparent and the second is zero, the store is skipped entirely. */
.macro over_white_8888_8888_ca_2pixels_tail
        pixld   , 8, 3, DST               /* WK3, WK4 = two dest pixels */
        mvn     TMP0, WK1                 /* ~mask1 for the combine macro */
        teq     WK1, WK1, asr #32         /* Zc if transparent, ZC if opaque */
        bne     01f                       /* mixed mask1: full blend */
        movcs   WK3, WK1                  /* opaque: dest1 = white = mask1 */
        bcs     02f
        teq     WK2, #0                   /* mask1 transparent: if mask2 is zero too, */
        beq     05f                       /* nothing needs writing at all */
        b       02f
01:     over_white_8888_8888_ca_combine WK1, WK3
02:     mvn     TMP0, WK2                 /* ~mask2 */
        teq     WK2, WK2, asr #32
        bne     03f
        movcs   WK4, WK2                  /* opaque: dest2 = white = mask2 */
        b       04f
03:     over_white_8888_8888_ca_combine WK2, WK4
04:     pixst   , 8, 3, DST
05:
.endm
854
/* Head for 4/8/16 bytes: one pixel is handled by the 1pixel macros; 8 bytes
 * by one 2pixels pair; 16 bytes completes the first pair inline and leaves
 * the second pair's head for process_tail to finish. */
.macro over_white_8888_8888_ca_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
 .if numbytes == 4
        over_white_8888_8888_ca_1pixel_head
 .else
  .if numbytes == 16
        over_white_8888_8888_ca_2pixels_head
        over_white_8888_8888_ca_2pixels_tail
  .endif
        over_white_8888_8888_ca_2pixels_head
 .endif
.endm
866
/* Complete whatever the matching process_head left pending. */
.macro over_white_8888_8888_ca_process_tail  cond, numbytes, firstreg
 .if numbytes == 4
        over_white_8888_8888_ca_1pixel_tail
 .else
        over_white_8888_8888_ca_2pixels_tail
 .endif
.endm
874
/* Instantiate the scanline loop for component-alpha over with an opaque
 * white source.  Commas restored after the dst bpp and flags arguments:
 * gas splits unquoted macro arguments on whitespace, so the space-laden
 * flags expression must be delimited by commas on both sides. */
generate_composite_function \
    pixman_composite_over_white_8888_8888_ca_asm_armv6, 0, 32, 32, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
    2, /* prefetch distance */ \
    over_white_8888_8888_ca_init, \
    nop_macro, /* newline */ \
    over_white_8888_8888_ca_cleanup, \
    over_white_8888_8888_ca_process_head, \
    over_white_8888_8888_ca_process_tail
884
885
.macro over_n_8888_8888_ca_init
        /* Set up constants. RB_SRC and AG_SRC are in registers;
         * RB_FLDS, A_SRC, and the two HALF values need to go on the
         * stack (and the full SRC value is already there) */
        ldr     SCRATCH, [sp, #ARGS_STACK_OFFSET] /* fetch the solid source colour */
        mov     WK0, #0x00FF0000
        orr     WK0, WK0, #0xFF        /* RB_FLDS (0x00FF00FF) */
        mov     WK1, #0x80             /* HALF default value */
        mov     WK2, SCRATCH, lsr #24  /* A_SRC */
        orr     WK3, WK1, WK1, lsl #16 /* HALF alternate value (0x00800080) */
        push    {WK0-WK3}
 .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+16
        uxtb16  SRC, SCRATCH           /* RB_SRC: red/blue halves of the source */
        uxtb16  STRIDE_S, SCRATCH, ror #8 /* AG_SRC: alpha/green halves */

        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, WK3, WK3

        /* Re-point the WK working names at registers that survive the
         * reuse of SRC/STRIDE_S above */
        .unreq  WK0
        .unreq  WK1
        .unreq  WK2
        .unreq  WK3
        WK0     .req    Y
        WK1     .req    STRIDE_D
        RB_SRC  .req    SRC
        AG_SRC  .req    STRIDE_S
        WK2     .req    STRIDE_M
        RB_FLDS .req    r8       /* the reloaded constants have to be at consecutive registers starting at an even one */
        A_SRC   .req    r8
        HALF    .req    r9
        WK3     .req    r10
        WK4     .req    r11
        WK5     .req    SCRATCH
        WK6     .req    ORIG_W

        line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
.endm
923
/* Pop the 16 bytes of constants pushed by init and restore the default
 * WK0-WK3 register aliases. */
.macro over_n_8888_8888_ca_cleanup
        add     sp, sp, #16
 .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-16

        .unreq  WK0
        .unreq  WK1
        .unreq  RB_SRC
        .unreq  AG_SRC
        .unreq  WK2
        .unreq  RB_FLDS
        .unreq  A_SRC
        .unreq  HALF
        .unreq  WK3
        .unreq  WK4
        .unreq  WK5
        .unreq  WK6
        WK0     .req    r8
        WK1     .req    r9
        WK2     .req    r10
        WK3     .req    r11
.endm
945
/* Load one mask pixel into WK6 and one destination pixel into WK0. */
.macro over_n_8888_8888_ca_1pixel_head
        pixld   , 4, 6, MASK, 0
        pixld   , 4, 0, DST, 0
.endm
950
/* Composite one component-alpha pixel: WK6 = mask, WK0 = dest (from the
 * head macro).  Shortcuts: transparent mask stores nothing; fully opaque
 * mask degenerates to src_8888_8888 (opaque source) or over_8888_8888;
 * otherwise the full per-channel blend is performed.  Result left in WK0. */
.macro over_n_8888_8888_ca_1pixel_tail
        ldrd    A_SRC, HALF, [sp, #LOCALS_STACK_OFFSET+8]
        uxtb16  WK1, WK6                 /* rb_mask (first step of hard case placed in what would otherwise be a stall) */
        teq     WK6, WK6, asr #32        /* Zc if transparent, ZC if opaque */
        bne     20f
        bcc     40f
        /* Mask is fully opaque (all channels) */
        ldr     WK6, [sp, #ARGS_STACK_OFFSET] /* get SRC back */
        eors    A_SRC, A_SRC, #0xFF      /* A_SRC := 255 - source alpha; Z if source opaque */
        bne     10f
        /* Source is also opaque - same as src_8888_8888 */
        mov     WK0, WK6
        b       30f
10:     /* Same as over_8888_8888 */
        mul_8888_8 WK0, A_SRC, WK5, HALF /* dest *= (255 - source alpha) / 255 */
        uqadd8  WK0, WK0, WK6            /* ... then add the source, saturating */
        b       30f
20:     /* No simplifications possible - do it the hard way */
        uxtb16  WK2, WK6, ror #8         /* ag_mask */
        mla     WK3, WK1, A_SRC, HALF    /* rb_mul; 2 cycles */
        mla     WK4, WK2, A_SRC, HALF    /* ag_mul; 2 cycles */
        ldrd    RB_FLDS, HALF, [sp, #LOCALS_STACK_OFFSET]
        uxtb16  WK5, WK0                 /* rb_dest */
        uxtab16 WK3, WK3, WK3, ror #8
        uxtb16  WK6, WK0, ror #8         /* ag_dest */
        uxtab16 WK4, WK4, WK4, ror #8
        smlatt  WK0, RB_SRC, WK1, HALF   /* red1 */
        smlabb  WK1, RB_SRC, WK1, HALF   /* blue1 */
        bic     WK3, RB_FLDS, WK3, lsr #8
        bic     WK4, RB_FLDS, WK4, lsr #8
        pkhbt   WK1, WK1, WK0, lsl #16   /* rb1 */
        smlatt  WK0, WK5, WK3, HALF      /* red2 */
        smlabb  WK3, WK5, WK3, HALF      /* blue2 */
        uxtab16 WK1, WK1, WK1, ror #8
        smlatt  WK5, AG_SRC, WK2, HALF   /* alpha1 */
        pkhbt   WK3, WK3, WK0, lsl #16   /* rb2 */
        smlabb  WK0, AG_SRC, WK2, HALF   /* green1 */
        smlatt  WK2, WK6, WK4, HALF      /* alpha2 */
        smlabb  WK4, WK6, WK4, HALF      /* green2 */
        pkhbt   WK0, WK0, WK5, lsl #16   /* ag1 */
        uxtab16 WK3, WK3, WK3, ror #8
        pkhbt   WK4, WK4, WK2, lsl #16   /* ag2 */
        uxtab16 WK0, WK0, WK0, ror #8
        uxtab16 WK4, WK4, WK4, ror #8
        mov     WK1, WK1, ror #8
        mov     WK3, WK3, ror #8
        sel     WK2, WK1, WK0            /* recombine source*mask */
        sel     WK1, WK3, WK4            /* recombine dest*(1-source_alpha*mask) */
        uqadd8  WK0, WK1, WK2            /* followed by 1 stall */
30:     /* The destination buffer is already in the L1 cache, so
         * there's little point in amalgamating writes */
        pixst   , 4, 0, DST
40:
.endm
1005
/* Pixels are processed strictly one at a time: all but the last get both
 * head and tail here; the final tail is emitted by process_tail. */
.macro over_n_8888_8888_ca_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
 .rept (numbytes / 4) - 1
        over_n_8888_8888_ca_1pixel_head
        over_n_8888_8888_ca_1pixel_tail
 .endr
        over_n_8888_8888_ca_1pixel_head
.endm
1013
/* Finish the last pixel started by process_head. */
.macro over_n_8888_8888_ca_process_tail  cond, numbytes, firstreg
        over_n_8888_8888_ca_1pixel_tail
.endm
1017
/* Entry point: dispatch on the solid source colour.  0xFFFFFFFF (opaque
 * white) is routed to the specialised over_white variant; anything else
 * drops through into the general helper generated below. */
pixman_asm_function pixman_composite_over_n_8888_8888_ca_asm_armv6
        ldr     ip, [sp]    /* first stack argument — presumably the source value; matches init's [sp, #ARGS_STACK_OFFSET] load */
        cmp     ip, #-1
        beq     pixman_composite_over_white_8888_8888_ca_asm_armv6
        /* else drop through... */
 .endfunc
/* General (non-white) component-alpha over; reached by fall-through from
 * the dispatcher above.  Commas restored after the dst bpp and flags
 * arguments: gas splits unquoted macro arguments on whitespace, so the
 * space-laden flags expression must be comma-delimited on both sides. */
generate_composite_function \
    pixman_composite_over_n_8888_8888_ca_asm_armv6_helper, 0, 32, 32, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_PROCESS_CORRUPTS_WK0, \
    2, /* prefetch distance */ \
    over_n_8888_8888_ca_init, \
    nop_macro, /* newline */ \
    over_n_8888_8888_ca_cleanup, \
    over_n_8888_8888_ca_process_head, \
    over_n_8888_8888_ca_process_tail
1033
1034/******************************************************************************/
1035
/* Setup for in_reverse: dest is multiplied by the source alpha only, so the
 * source pointer is biased to address the alpha byte of each pixel directly. */
.macro in_reverse_8888_8888_init
        /* Hold loop invariant in MASK */
        ldr     MASK, =0x00800080       /* +0.5 rounding constant for the mla steps */
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, MASK, MASK
        /* Offset the source pointer: we only need the alpha bytes */
        add     SRC, SRC, #3
        line_saved_regs  ORIG_W
.endm
1045
/* Load one alpha byte per pixel (SRC advances a whole pixel each time) and
 * advance DST past the group so the tail can address it with negative
 * offsets / stmdb. */
.macro in_reverse_8888_8888_head  numbytes, reg1, reg2, reg3
        ldrb    ORIG_W, [SRC], #4
 .if numbytes >= 8
        ldrb    WK&reg1, [SRC], #4
  .if numbytes == 16
        ldrb    WK&reg2, [SRC], #4
        ldrb    WK&reg3, [SRC], #4
  .endif
 .endif
        add     DST, DST, #numbytes
.endm
1057
/* Glue to the process_head signature; expands consecutive register numbers
 * via altmacro %() arithmetic. */
.macro in_reverse_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        in_reverse_8888_8888_head  numbytes, firstreg, %(firstreg+1), %(firstreg+2)
.endm
1061
/* Multiply one dest pixel d by source alpha s (all four channels), with
 * per-pixel shortcuts in the multi-pixel case: s == 0 clears d, s == 0xFF
 * leaves d unchanged.  While processing, the next pixel's alpha is fetched
 * from [SRC, #offset] into ORIG_W.  The 48: label is the entry point used
 * by the tail's "source all 0" shortcut. */
.macro in_reverse_8888_8888_1pixel  s, d, offset, is_only
 .if is_only != 1
        movs    s, ORIG_W                  /* s = this pixel's alpha; Z if zero */
  .if offset != 0
        ldrb    ORIG_W, [SRC, #offset]     /* prefetch next pixel's alpha */
  .endif
        beq     01f                        /* alpha 0: result is 0 */
        teq     STRIDE_M, #0xFF
        beq     02f                        /* alpha 255: dest unchanged */
 .endif
        uxtb16  SCRATCH, d                 /* rb_dest */
        uxtb16  d, d, ror #8               /* ag_dest */
        mla     SCRATCH, SCRATCH, s, MASK  /* rb * alpha + 0.5 */
        mla     d, d, s, MASK              /* ag * alpha + 0.5 */
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8 /* /255 rounding fold */
        uxtab16 d, d, d, ror #8
        mov     SCRATCH, SCRATCH, ror #8
        sel     d, SCRATCH, d              /* interleave rb/ag result bytes */
        b       02f
 .if offset == 0
48:     /* Last mov d,#0 of the set - used as part of shortcut for
         * source values all 0 */
 .endif
01:     mov     d, #0
02:
.endm
1088
/* Complete a group of 1/2/4 pixels whose alpha bytes were loaded by the
 * head macro.  First test whether every alpha in the group is identical and
 * equal to 0 or -1 (the teq chain); only when some blending is needed are
 * the dest pixels loaded (ldrne/ldmnedb at negative offsets, since DST was
 * pre-advanced).  The bcs/beq pair then dispatches the two group-wide
 * shortcuts. */
.macro in_reverse_8888_8888_tail  numbytes, reg1, reg2, reg3, reg4
 .if numbytes == 4
        teq     ORIG_W, ORIG_W, asr #32
        ldrne   WK&reg1, [DST, #-4]
 .elseif numbytes == 8
        teq     ORIG_W, WK&reg1
        teqeq   ORIG_W, ORIG_W, asr #32  /* all 0 or all -1? */
        ldmnedb DST, {WK&reg1-WK&reg2}
 .else
        teq     ORIG_W, WK&reg1
        teqeq   ORIG_W, WK&reg2
        teqeq   ORIG_W, WK&reg3
        teqeq   ORIG_W, ORIG_W, asr #32  /* all 0 or all -1? */
        ldmnedb DST, {WK&reg1-WK&reg4}
 .endif
        cmnne   DST, #0   /* clear C if NE */
        bcs     49f       /* no writes to dest if source all -1 */
        beq     48f       /* set dest to all 0 if source all 0 */
 .if numbytes == 4
        in_reverse_8888_8888_1pixel  ORIG_W, WK&reg1, 0, 1
        str     WK&reg1, [DST, #-4]
 .elseif numbytes == 8
        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg1, -4, 0
        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg2, 0, 0
        stmdb   DST, {WK&reg1-WK&reg2}
 .else
        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg1, -12, 0
        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg2, -8, 0
        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg3, -4, 0
        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg4, 0, 0
        stmdb   DST, {WK&reg1-WK&reg4}
 .endif
49:
.endm
1123
/* Glue to the process_tail signature; altmacro %() supplies the
 * consecutive WK register numbers. */
.macro in_reverse_8888_8888_process_tail  cond, numbytes, firstreg
        in_reverse_8888_8888_tail  numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
.endm
1127
/* Instantiate the scanline loop for in_reverse (32bpp source and dest).
 * Commas restored after the dst bpp and flags arguments: gas splits
 * unquoted macro arguments on whitespace, so the space-laden flags
 * expression must be comma-delimited on both sides. */
generate_composite_function \
    pixman_composite_in_reverse_8888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_NO_PRELOAD_DST, \
    2, /* prefetch distance */ \
    in_reverse_8888_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    in_reverse_8888_8888_process_head, \
    in_reverse_8888_8888_process_tail
1137
1138/******************************************************************************/
1139
/* Setup for over with a solid source: load the source colour, precompute
 * the dest multiplier (255 - source alpha), and seed the GE flags for SEL. */
.macro over_n_8888_init
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        /* Hold loop invariant in MASK */
        ldr     MASK, =0x00800080       /* +0.5 rounding constant for mul_8888_8 */
        /* Hold multiplier for destination in STRIDE_M */
        mov     STRIDE_M, #255
        sub     STRIDE_M, STRIDE_M, SRC, lsr #24 /* 255 - source alpha */
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, MASK, MASK
.endm
1150
/* Only the destination needs loading: the source is a register-held solid. */
.macro over_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   , numbytes, firstreg, DST, 0
.endm
1154
/* One pixel of over: scale dest by STRIDE_M (= 255 - source alpha) with
 * mul_8888_8, then add the solid source with per-byte saturation. */
.macro over_n_8888_1pixel dst
        mul_8888_8  WK&dst, STRIDE_M, SCRATCH, MASK
        uqadd8  WK&dst, WK&dst, SRC
.endm
1159
/* Blend each loaded dest pixel in turn (PROCESS_REG walks the WK register
 * numbers), then store the whole group back. */
.macro over_n_8888_process_tail  cond, numbytes, firstreg
 .set PROCESS_REG, firstreg
 .rept numbytes / 4
        over_n_8888_1pixel %(PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
 .endr
        pixst   , numbytes, firstreg, DST
.endm
1168
/* Instantiate the scanline loop for over with a solid source and 32bpp
 * dest.  Commas restored after the dst bpp and flags arguments: gas splits
 * unquoted macro arguments on whitespace, so the space-laden flags
 * expression must be comma-delimited on both sides. */
generate_composite_function \
    pixman_composite_over_n_8888_asm_armv6, 0, 0, 32, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE, \
    2, /* prefetch distance */ \
    over_n_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    over_n_8888_process_head, \
    over_n_8888_process_tail
1178
1179/******************************************************************************/
1180