1/*
2 * Copyright © 2012 Raspberry Pi Foundation
3 * Copyright © 2012 RISC OS Open Ltd
4 *
5 * Permission to use, copy, modify, distribute, and sell this software and its
6 * documentation for any purpose is hereby granted without fee, provided that
7 * the above copyright notice appear in all copies and that both that
8 * copyright notice and this permission notice appear in supporting
9 * documentation, and that the name of the copyright holders not be used in
10 * advertising or publicity pertaining to distribution of the software without
11 * specific, written prior permission.  The copyright holders make no
12 * representations about the suitability of this software for any purpose.  It
13 * is provided "as is" without express or implied warranty.
14 *
15 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
22 * SOFTWARE.
23 *
24 * Author:  Ben Avison (bavison@riscosopen.org)
25 *
26 */
27
/* Prevent the stack from becoming executable */
#if defined(__linux__) && defined(__ELF__)
/* An empty .note.GNU-stack section is emitted only for Linux ELF builds. */
.section .note.GNU-stack,"",%progbits
#endif

	.text
	/* Assemble for ARMv6; instructions used below such as UADD8, SEL,
	 * UXTB16 and PKHBT are ARMv6 additions. */
	.arch armv6
	/* Record ARMv4 as the architecture in the object file attributes
	 * instead of the one selected by .arch (GNU as .object_arch).
	 * NOTE(review): presumably because this code is only selected after a
	 * run-time CPU check - confirm with the build setup. */
	.object_arch armv4
	.arm
	/* Alternate macro mode is needed for the %(expression) argument
	 * evaluation used by the macros in this file. */
	.altmacro
	.p2align 2
39
40#include "pixman-arm-simd-asm.h"
41
42/* A head macro should do all processing which results in an output of up to
43 * 16 bytes, as far as the final load instruction. The corresponding tail macro
44 * should complete the processing of the up-to-16 bytes. The calling macro will
45 * sometimes choose to insert a preload or a decrement of X between them.
46 *   cond           ARM condition code for code block
47 *   numbytes       Number of output bytes that should be generated this time
48 *   firstreg       First WK register in which to place output
49 *   unaligned_src  Whether to use non-wordaligned loads of source image
50 *   unaligned_mask Whether to use non-wordaligned loads of mask image
51 *   preload        If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
52 */
53
/* Init hook shared by the plain copy ("blit") functions below.
 * It only invokes line_saved_regs (defined outside this file view) on
 * STRIDE_D and STRIDE_S. blit_inner_loop aliases those two registers to
 * WK4/WK5, so presumably this marks them as saved per line and therefore
 * free for use as work registers - confirm in pixman-arm-simd-asm.h. */
.macro blit_init
        line_saved_regs STRIDE_D, STRIDE_S
.endm
57
/* Process-head hook for the copy functions: loads numbytes of source.
 *   cond, numbytes, firstreg, unaligned_src  Forwarded unchanged to pixld,
 *                                            with SRC as the base register
 *   unaligned_mask, preload                  Accepted but not referenced
 * No matching tail is needed; the instantiations pass nop_macro for it. */
.macro blit_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   cond, numbytes, firstreg, SRC, unaligned_src
.endm
61
/* Unrolled inner loop for the copy functions: moves 32 bytes per iteration
 * as two 16-byte groups, one starting at WK0 and one starting at WK4.
 *   unaligned_src  Forwarded to pixld for both source loads
 * process_head, process_tail, unaligned_mask and dst_alignment are accepted
 * but not referenced in this body.
 * WK4-WK7 are aliased to STRIDE_D, STRIDE_S, MASK and STRIDE_M for the
 * duration of the macro and released again at the end. */
.macro blit_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    MASK
    WK7     .req    STRIDE_M
110:    pixld   , 16, 0, SRC, unaligned_src
        pixld   , 16, 4, SRC, unaligned_src
        /* Preload from SRC + SCRATCH.
         * NOTE(review): SCRATCH is presumably the prefetch offset set up by
         * the calling framework - confirm in pixman-arm-simd-asm.h. */
        pld     [SRC, SCRATCH]
        pixst   , 16, 0, DST
        pixst   , 16, 4, DST
        /* 32 bytes hold 32*8/src_bpp pixels; X is decremented by that many */
        subs    X, X, #32*8/src_bpp
        /* Go round again while the subtraction did not borrow */
        bhs     110b
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm
79
/* Instantiate pixman_composite_src_8888_8888_asm_armv6 from the blit macros.
 * NOTE(review): the three numbers after the name (32, 0, 32) are presumably
 * source, mask and destination bits per pixel - confirm against
 * generate_composite_function in pixman-arm-simd-asm.h. */
generate_composite_function \
    pixman_composite_src_8888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    4, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop
90
/* Instantiate pixman_composite_src_0565_0565_asm_armv6 from the same blit
 * macros as the 32bpp copy, with 16 in place of 32 for the first and third
 * numeric arguments (presumably source and destination bits per pixel). */
generate_composite_function \
    pixman_composite_src_0565_0565_asm_armv6, 16, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    4, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop
101
/* Instantiate pixman_composite_src_8_8_asm_armv6 from the blit macros.
 * Unlike the 32bpp and 16bpp copies above, the prefetch distance argument
 * is 3 rather than 4. */
generate_composite_function \
    pixman_composite_src_8_8_asm_armv6, 8, 0, 8, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    3, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop
112
113/******************************************************************************/
114
/* Init hook for the 32bpp solid fill.
 * Reads one word from [sp, #ARGS_STACK_OFFSET] (presumably the solid source
 * colour argument - confirm against the caller) and leaves a copy of it in
 * each of SRC, STRIDE_S, MASK and STRIDE_M, the four registers that
 * fill_process_tail stores from. */
.macro src_n_8888_init
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        /* Fan the value out; the three copies are independent of each other */
        mov     STRIDE_M, SRC
        mov     MASK, SRC
        mov     STRIDE_S, SRC
.endm
121
/* Init hook for the 16bpp solid fill.
 * Reads one halfword from [sp, #ARGS_STACK_OFFSET] (presumably the solid
 * source colour argument - confirm against the caller), replicates it into
 * both halves of SRC, then copies the word to STRIDE_S, MASK and STRIDE_M,
 * the other registers that fill_process_tail stores from. */
.macro src_n_0565_init
        ldrh    SRC, [sp, #ARGS_STACK_OFFSET]
        /* All three register operands are spelled out: the two-operand
         * shorthand with a shifted register is not accepted by every
         * assembler. The encoding is unchanged. */
        orr     SRC, SRC, SRC, lsl #16
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
.endm
129
/* Init hook for the 8bpp solid fill.
 * Reads one byte from [sp, #ARGS_STACK_OFFSET] (presumably the solid source
 * value argument - confirm against the caller), replicates it into all four
 * bytes of SRC, then copies the word to STRIDE_S, MASK and STRIDE_M, the
 * other registers that fill_process_tail stores from. */
.macro src_n_8_init
        ldrb    SRC, [sp, #ARGS_STACK_OFFSET]
        /* All three register operands are spelled out: the two-operand
         * shorthand with a shifted register is not accepted by every
         * assembler. The encoding is unchanged. */
        orr     SRC, SRC, SRC, lsl #8       /* byte -> halfword */
        orr     SRC, SRC, SRC, lsl #16      /* halfword -> word */
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
.endm
138
/* Process-tail hook for the solid fills: performs the store itself.
 *   cond      ARM condition code, forwarded to pixst
 *   numbytes  Number of bytes to store to DST
 *   firstreg  Accepted but not referenced; the store always starts at WK4
 * WK4-WK7 are aliased to SRC, STRIDE_S, MASK and STRIDE_M, which the
 * src_n_*_init macros fill with the replicated colour. */
.macro fill_process_tail  cond, numbytes, firstreg
    WK7     .req    STRIDE_M
    WK6     .req    MASK
    WK5     .req    STRIDE_S
    WK4     .req    SRC
        pixst   cond, numbytes, 4, DST
    .unreq  WK7
    .unreq  WK6
    .unreq  WK5
    .unreq  WK4
.endm
150
/* Instantiate pixman_composite_src_n_8888_asm_armv6 (32bpp solid fill).
 * The process-head hook is a no-op; fill_process_tail does the store.
 * Every argument is comma-separated, matching the other instantiations in
 * this file, rather than relying on whitespace to split macro arguments. */
generate_composite_function \
    pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    0, /* prefetch distance doesn't apply */ \
    src_n_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    nop_macro, /* process head */ \
    fill_process_tail
160
/* Instantiate pixman_composite_src_n_0565_asm_armv6 (16bpp solid fill).
 * The process-head hook is a no-op; fill_process_tail does the store.
 * Every argument is comma-separated, matching the other instantiations in
 * this file, rather than relying on whitespace to split macro arguments. */
generate_composite_function \
    pixman_composite_src_n_0565_asm_armv6, 0, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    0, /* prefetch distance doesn't apply */ \
    src_n_0565_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    nop_macro, /* process head */ \
    fill_process_tail
170
/* Instantiate pixman_composite_src_n_8_asm_armv6 (8bpp solid fill).
 * The process-head hook is a no-op; fill_process_tail does the store.
 * Every argument is comma-separated, matching the other instantiations in
 * this file, rather than relying on whitespace to split macro arguments. */
generate_composite_function \
    pixman_composite_src_n_8_asm_armv6, 0, 0, 8, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    0, /* prefetch distance doesn't apply */ \
    src_n_8_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    nop_macro, /* process head */ \
    fill_process_tail
180
181/******************************************************************************/
182
/* Force the top byte of one work register to 0xFF, leaving the low 24 bits
 * unchanged (going by the function name, the top byte is the alpha channel).
 *   cond  ARM condition code appended to the instruction
 *   reg   Number of the WK register to modify */
.macro src_x888_8888_pixel, cond, reg
        orr&cond WK&reg, WK&reg, #0xFF000000
.endm
186
/* Process-head hook for src_x888_8888: loads numbytes of source pixels.
 *   cond, numbytes, firstreg, unaligned_src  Forwarded unchanged to pixld,
 *                                            with SRC as the base register
 *   unaligned_mask, preload                  Accepted but not referenced */
.macro pixman_composite_src_x888_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   cond, numbytes, firstreg, SRC, unaligned_src
.endm
190
/* Process-tail hook for src_x888_8888: sets the top byte of every loaded
 * pixel to 0xFF.
 *   cond      ARM condition code, forwarded to src_x888_8888_pixel
 *   numbytes  Bytes loaded by the head: one register is always patched,
 *             a second when numbytes >= 8, a third and fourth when it is 16
 *   firstreg  Number of the first WK register holding the pixels */
.macro pixman_composite_src_x888_8888_process_tail   cond, numbytes, firstreg
        /* First pixel is always present */
        src_x888_8888_pixel cond, %(firstreg)
 .if numbytes >= 8
        /* Second pixel */
        src_x888_8888_pixel cond, %(firstreg+1)
 .endif
 .if numbytes == 16
        /* Third and fourth pixels */
        src_x888_8888_pixel cond, %(firstreg+2)
        src_x888_8888_pixel cond, %(firstreg+3)
 .endif
.endm
201
/* Instantiate pixman_composite_src_x888_8888_asm_armv6.
 * No init, newline or cleanup work is needed; no inner-loop macro is passed,
 * unlike the blit instantiations above. */
generate_composite_function \
    pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
    3, /* prefetch distance */ \
    nop_macro, /* init */ \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    pixman_composite_src_x888_8888_process_head, \
    pixman_composite_src_x888_8888_process_tail
211
212/******************************************************************************/
213
/* Init hook for src_0565_8888.
 * On exit: MASK = 0x07E007E0 (the green field of two packed 565 pixels),
 * STRIDE_M = 0xFF000000, and the GE flags are 1010. SCRATCH is clobbered. */
.macro src_0565_8888_init
        /* Hold loop invariants in MASK and STRIDE_M */
        ldr     MASK, =0x07E007E0
        mov     STRIDE_M, #0xFF000000
        /* Set GE[3:0] to 1010 so SEL instructions do what we want */
        ldr     SCRATCH, =0x80008000
        /* 0x80 + 0x80 carries out of bytes 3 and 1 only, which is what sets
         * those two GE bits; the sum itself is not used */
        uadd8   SCRATCH, SCRATCH, SCRATCH
.endm
222
/* Expand two packed r5g6b5 pixels into two 32bpp pixels with the top byte
 * set to 0xFF.
 *   reg1  On entry: WK register number holding both 565 pixels. On exit:
 *         the expanded pixel from the low halfword (lower-case letters in
 *         the bit diagrams)
 *   reg2  On exit: the expanded pixel from the high halfword (upper-case
 *         letters in the bit diagrams)
 * Requires MASK = 0x07E007E0, STRIDE_M = 0xFF000000 and GE[3:0] = 1010, as
 * set up by src_0565_8888_init. Clobbers SCRATCH.
 * The trailing comments show the register contents after each instruction,
 * most significant bit first. */
.macro src_0565_8888_2pixels, reg1, reg2
        and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000
        bic     WK&reg2, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb
        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg
        mov     WK&reg1, WK&reg2, lsl #16          @ rrrrr000000bbbbb0000000000000000
        mov     SCRATCH, SCRATCH, ror #19          @ GGGG0000ggggggggggg00000GGGGGGGG
        bic     WK&reg2, WK&reg2, WK&reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ rrrrrrrrrr0bbbbbbbbbb00000000000
        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ RRRRRRRRRR0BBBBBBBBBB00000000000
        pkhtb   WK&reg1, WK&reg1, WK&reg1, asr #5  @ rrrrrrrr--------bbbbbbbb--------
        sel     WK&reg1, WK&reg1, SCRATCH          @ rrrrrrrrggggggggbbbbbbbb--------
        mov     SCRATCH, SCRATCH, ror #16          @ ggg00000GGGGGGGGGGGG0000gggggggg
        pkhtb   WK&reg2, WK&reg2, WK&reg2, asr #5  @ RRRRRRRR--------BBBBBBBB--------
        sel     WK&reg2, WK&reg2, SCRATCH          @ RRRRRRRRGGGGGGGGBBBBBBBB--------
        orr     WK&reg1, STRIDE_M, WK&reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
        orr     WK&reg2, STRIDE_M, WK&reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
.endm
240
241/* This version doesn't need STRIDE_M, but is one instruction longer.
242   It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
243        and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000
244        bic     WK&reg1, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb
245        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg
246        mov     WK&reg2, WK&reg1, lsr #16          @ 0000000000000000RRRRR000000BBBBB
247        mov     SCRATCH, SCRATCH, ror #27          @ GGGGGGGGGGGG0000ggggggggggg00000
248        bic     WK&reg1, WK&reg1, WK&reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
249        mov     WK&reg2, WK&reg2, lsl #3           @ 0000000000000RRRRR000000BBBBB000
250        mov     WK&reg1, WK&reg1, lsl #3           @ 0000000000000rrrrr000000bbbbb000
251        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ 0000000000000RRRRRRRRRR0BBBBBBBB
252        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
253        pkhbt   WK&reg2, WK&reg2, WK&reg2, lsl #5  @ --------RRRRRRRR--------BBBBBBBB
254        pkhbt   WK&reg1, WK&reg1, WK&reg1, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
255        sel     WK&reg2, SCRATCH, WK&reg2          @ --------RRRRRRRRGGGGGGGGBBBBBBBB
256        sel     WK&reg1, SCRATCH, WK&reg1          @ --------rrrrrrrrggggggggbbbbbbbb
257        orr     WK&reg2, WK&reg2, #0xFF000000      @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
258        orr     WK&reg1, WK&reg1, #0xFF000000      @ 11111111rrrrrrrrggggggggbbbbbbbb
259*/
260
/* Expand a single r5g6b5 pixel, held in the low halfword of a work register,
 * into a 32bpp pixel with the top byte set to 0xFF, in place.
 *   reg  WK register number; the bit diagrams show its upper halfword as
 *        zero on entry
 * Requires MASK = 0x07E007E0 and GE[3:0] = 1010, as set up by
 * src_0565_8888_init. Clobbers SCRATCH. STRIDE_M is not needed here because
 * the final ORR uses an immediate. */
.macro src_0565_8888_1pixel, reg
        bic     SCRATCH, WK&reg, MASK              @ 0000000000000000rrrrr000000bbbbb
        and     WK&reg, WK&reg, MASK               @ 000000000000000000000gggggg00000
        mov     SCRATCH, SCRATCH, lsl #3           @ 0000000000000rrrrr000000bbbbb000
        mov     WK&reg, WK&reg, lsl #5             @ 0000000000000000gggggg0000000000
        orr     SCRATCH, SCRATCH, SCRATCH, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
        orr     WK&reg, WK&reg, WK&reg, lsr #6     @ 000000000000000gggggggggggg00000
        pkhbt   SCRATCH, SCRATCH, SCRATCH, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
        sel     WK&reg, WK&reg, SCRATCH            @ --------rrrrrrrrggggggggbbbbbbbb
        orr     WK&reg, WK&reg, #0xFF000000        @ 11111111rrrrrrrrggggggggbbbbbbbb
.endm
272
/* Process-head hook for src_0565_8888: loads the source pixels needed to
 * produce numbytes of output. Source pixels are half the size of output
 * pixels, so half as many bytes are read.
 *   numbytes       4, 8 or 16 output bytes; any other value loads nothing
 *   firstreg       First WK register to load into
 *   unaligned_src  Forwarded to the load macro
 *   cond, unaligned_mask, preload  Accepted but not referenced; the loads
 *                                  are emitted with an empty condition */
.macro src_0565_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
 .if numbytes == 4
        /* One pixel: a single halfword */
        pixld   , 2, firstreg, SRC, unaligned_src
 .elseif numbytes == 8
        /* Two pixels: one word */
        pixld   , 4, firstreg, SRC, unaligned_src
 .elseif numbytes == 16
        /* Four pixels: two words, placed in firstreg and firstreg+2 so each
         * pair can be expanded into the register that follows it */
        pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src
 .endif
.endm
282
/* Process-tail hook for src_0565_8888: expands the loaded 565 pixels.
 *   numbytes  8 or 16 output bytes use the two-pixel expander (once or
 *             twice); any other value uses the single-pixel expander
 *   firstreg  First WK register holding packed source pixels
 *   cond      Accepted but not referenced */
.macro src_0565_8888_process_tail   cond, numbytes, firstreg
 .if (numbytes == 8) || (numbytes == 16)
        /* First pair: firstreg -> firstreg, firstreg+1 */
        src_0565_8888_2pixels firstreg, %(firstreg+1)
  .if numbytes == 16
        /* Second pair: firstreg+2 -> firstreg+2, firstreg+3 */
        src_0565_8888_2pixels %(firstreg+2), %(firstreg+3)
  .endif
 .else
        src_0565_8888_1pixel firstreg
 .endif
.endm
293
/* Instantiate pixman_composite_src_0565_8888_asm_armv6.
 * FLAG_BRANCH_OVER is used instead of FLAG_COND_EXEC; the head and tail
 * macros for this function emit unconditional instructions. */
generate_composite_function \
    pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
    3, /* prefetch distance */ \
    src_0565_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    src_0565_8888_process_head, \
    src_0565_8888_process_tail
303
304/******************************************************************************/
305
/* Saturating per-byte add of eight 8bpp pixels.
 * MASK is added into WK&dst1 and STRIDE_M into WK&dst2; add_8_8_process_head
 * loads the source pixels into those two registers (as WK4 and WK5).
 *   cond        ARM condition code appended to both instructions
 *   dst1, dst2  WK register numbers holding destination pixels, updated in
 *               place */
.macro add_8_8_8pixels  cond, dst1, dst2
        uqadd8&cond  WK&dst1, WK&dst1, MASK
        uqadd8&cond  WK&dst2, WK&dst2, STRIDE_M
.endm
310
/* Saturating per-byte add of up to four 8bpp pixels: MASK (the source
 * pixels, loaded as WK4 by add_8_8_process_head) is added into WK&dst.
 *   cond  ARM condition code appended to the instruction
 *   dst   WK register number holding destination pixels, updated in place */
.macro add_8_8_4pixels  cond, dst
        uqadd8&cond  WK&dst, WK&dst, MASK
.endm
314
/* Process-head hook for add_8_8: loads source and destination pixels.
 * WK4 and WK5 are aliased to MASK and STRIDE_M so source pixels land in the
 * registers that the add_8_8_*pixels macros read.
 *   cond           ARM condition code, forwarded to the loads and adds
 *   numbytes       Number of pixels (one byte each) to process
 *   firstreg       First WK register for the destination pixels
 *   unaligned_src  Forwarded to the source loads
 *   unaligned_mask, preload  Accepted but not referenced
 * Only two registers are available for source data, so the 16-byte case
 * loads the source in two halves: the first half is added here and the
 * second half is left in WK4/WK5 for the tail to add. */
.macro add_8_8_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    MASK
    WK5     .req    STRIDE_M
 .if numbytes == 16
        pixld   cond, 8, 4, SRC, unaligned_src
        pixld   cond, 16, firstreg, DST, 0
        /* Consume the first 8 source bytes before WK4/WK5 are reloaded */
        add_8_8_8pixels cond, firstreg, %(firstreg+1)
        pixld   cond, 8, 4, SRC, unaligned_src
 .else
        pixld   cond, numbytes, 4, SRC, unaligned_src
        pixld   cond, numbytes, firstreg, DST, 0
 .endif
    .unreq  WK4
    .unreq  WK5
.endm
330
/* Process-tail hook for add_8_8: performs the additions that the head left
 * outstanding.
 *   cond      ARM condition code, forwarded to the add macros
 *   numbytes  8: add both source registers into firstreg, firstreg+1
 *             16: the head already added the first half, so add the second
 *                 half into firstreg+2, firstreg+3
 *             otherwise: add one source register into firstreg
 *   firstreg  First WK register holding destination pixels */
.macro add_8_8_process_tail  cond, numbytes, firstreg
 .if numbytes == 8
        add_8_8_8pixels cond, firstreg, %(firstreg+1)
 .elseif numbytes == 16
        add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3)
 .else
        add_8_8_4pixels cond, firstreg
 .endif
.endm
340
/* Instantiate pixman_composite_add_8_8_asm_armv6.
 * The destination is read as well as written (FLAG_DST_READWRITE), since the
 * head macro loads destination pixels before adding the source to them. */
generate_composite_function \
    pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \
    2, /* prefetch distance */ \
    nop_macro, /* init */ \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    add_8_8_process_head, \
    add_8_8_process_tail
350
351/******************************************************************************/
352
/* Init hook for over_8888_8888.
 * On exit: MASK = 0x00800080 (the rounding constant added by the MLAs in
 * over_8888_8888_1pixel) and GE[3:0] = 0101. SCRATCH is clobbered.
 * STRIDE_D, STRIDE_S and ORIG_W are passed to line_saved_regs; the head and
 * tail macros alias them to WK4, WK5 and WK7. */
.macro over_8888_8888_init
        /* Hold loop invariant in MASK */
        ldr     MASK, =0x00800080
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        /* 0x80 + 0x80 carries out of bytes 2 and 0 only; the sum is unused */
        uadd8   SCRATCH, MASK, MASK
        line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
.endm
360
/* Process-head hook for over_8888_8888: loads source and destination pixels.
 * Source pixels go to the WK registers starting at 4+firstreg, destination
 * pixels to those starting at firstreg.
 *   numbytes       Number of bytes of each image to load
 *   firstreg       First WK register for destination pixels
 *   unaligned_src  Forwarded to the source load
 *   cond, unaligned_mask, preload  Accepted but not referenced; the loads
 *                                  are emitted with an empty condition
 * WK4-WK7 are aliased to STRIDE_D, STRIDE_S, STRIDE_M and ORIG_W for the
 * duration of the macro. */
.macro over_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    STRIDE_M
    WK7     .req    ORIG_W
        pixld   , numbytes, %(4+firstreg), SRC, unaligned_src
        pixld   , numbytes, firstreg, DST, 0
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm
373
/* Test whether every source pixel in a group is zero; the Z flag is set on
 * exit only if all tested registers are zero.
 *   numbytes  Size of the group: one register is tested for up to 4 bytes,
 *             two for up to 8 bytes, four otherwise
 *   reg0-reg3 WK register numbers holding the source pixels
 * Each test after the first is conditional on the previous one having found
 * zero, so the chain stops changing flags at the first non-zero pixel. */
.macro over_8888_8888_check_transparent  numbytes, reg0, reg1, reg2, reg3
        /* Since these colours are premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
        teq     WK&reg0, #0
 .if numbytes > 4
        teqeq   WK&reg1, #0
 .endif
 .if numbytes > 8
        teqeq   WK&reg2, #0
        teqeq   WK&reg3, #0
 .endif
.endm
385
/* Reduce a source pixel to its top byte (the alpha value) in place, ready
 * for over_8888_8888_1pixel. That macro performs the same shift itself for
 * each following pixel, so this is only needed for the first pixel of a
 * group.
 *   next  WK register number holding the source pixel */
.macro over_8888_8888_prepare  next
        mov     WK&next, WK&next, lsr #24
.endm
389
/* Composite one source pixel over one destination pixel.
 *   src     WK register number holding the source alpha (already shifted
 *           down to the low byte); reloaded with the whole source pixel
 *           part way through
 *   dst     WK register number holding the destination pixel; receives the
 *           result
 *   offset  Offset from SRC at which the source pixel can be re-read
 *   next    WK register number holding the next source pixel, which is
 *           reduced to its alpha byte here unless offset is -4 (the value
 *           the tail macro passes for the last pixel of a group)
 * Requires MASK = 0x00800080 and GE[3:0] = 0101, as set up by
 * over_8888_8888_init. Clobbers SCRATCH.
 * The instruction order is chosen to fill pipeline stalls, as the comments
 * below describe; do not reorder without re-checking them. */
.macro over_8888_8888_1pixel src, dst, offset, next
        /* src = destination component multiplier */
        rsb     WK&src, WK&src, #255
        /* Split even/odd bytes of dst into SCRATCH/dst */
        uxtb16  SCRATCH, WK&dst
        uxtb16  WK&dst, WK&dst, ror #8
        /* Multiply through, adding 0.5 to the upper byte of result for rounding */
        mla     SCRATCH, SCRATCH, WK&src, MASK
        mla     WK&dst, WK&dst, WK&src, MASK
        /* Where we would have had a stall between the result of the first MLA and the shifter input,
         * reload the complete source pixel */
        ldr     WK&src, [SRC, #offset]
        /* Multiply by 257/256 to approximate 256/255 */
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
        /* In this stall, start processing the next pixel */
 .if offset < -4
        mov     WK&next, WK&next, lsr #24
 .endif
        uxtab16 WK&dst, WK&dst, WK&dst, ror #8
        /* Recombine even/odd bytes of multiplied destination */
        mov     SCRATCH, SCRATCH, ror #8
        sel     WK&dst, SCRATCH, WK&dst
        /* Saturated add of source to multiplied destination */
        uqadd8  WK&dst, WK&dst, WK&src
.endm
415
/* Process-tail hook for over_8888_8888: composites the loaded pixels and
 * stores the result.
 *   numbytes  Number of bytes in the group (numbytes/4 pixels)
 *   firstreg  First WK register holding destination pixels; the source
 *             pixels are in the registers starting at 4+firstreg
 *   cond      Accepted but not referenced
 * If every source pixel in the group is zero, both the arithmetic and the
 * store are skipped by branching to local label 10. The TEQ chain changes
 * the flags, matching FLAG_PROCESS_CORRUPTS_PSR in the instantiation.
 * WK4-WK7 are aliased to STRIDE_D, STRIDE_S, STRIDE_M and ORIG_W for the
 * duration of the macro. */
.macro over_8888_8888_process_tail  cond, numbytes, firstreg
    WK4     .req    STRIDE_D
    WK5     .req    STRIDE_S
    WK6     .req    STRIDE_M
    WK7     .req    ORIG_W
        over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
        beq     10f
        over_8888_8888_prepare  %(4+firstreg)
 /* PROCESS_REG walks the destination registers. PROCESS_OFF is the offset
  * from SRC of the matching source pixel, starting at -numbytes and ending
  * at -4 (which assumes the head's load left SRC pointing just past the
  * group - TODO confirm against pixld). */
 .set PROCESS_REG, firstreg
 .set PROCESS_OFF, -numbytes
 .rept numbytes / 4
        over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
  .set PROCESS_OFF, PROCESS_OFF+4
 .endr
        pixst   , numbytes, firstreg, DST
10:
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm
438
/* Instantiate pixman_composite_over_8888_8888_asm_armv6.
 * The tail macro performs the store itself (FLAG_PROCESS_DOES_STORE) and
 * changes the flags (FLAG_PROCESS_CORRUPTS_PSR).
 * Every argument is comma-separated, matching the other instantiations in
 * this file, rather than relying on whitespace to split macro arguments. */
generate_composite_function \
    pixman_composite_over_8888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
    2, /* prefetch distance */ \
    over_8888_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    over_8888_8888_process_head, \
    over_8888_8888_process_tail
448
449/******************************************************************************/
450
/* Multiply each byte of a word by a byte.
 * Useful when there aren't any obvious ways to fill the stalls with other instructions.
 * word  Register containing 4 bytes
 * byte  Register containing byte multiplier (bits 8-31 must be 0)
 * tmp   Scratch register
 * half  Register containing the constant 0x00800080
 * GE[3:0] bits must contain 0101
 *
 * On exit, word holds the four scaled bytes and tmp is clobbered; byte and
 * half are only read. Each product is divided by 255 with rounding using
 * the approximation noted in the body (add 0x80, then multiply by 257/256
 * and keep the upper byte).
 */
.macro mul_8888_8  word, byte, tmp, half
        /* Split even/odd bytes of word apart */
        uxtb16  tmp, word
        uxtb16  word, word, ror #8
        /* Multiply bytes together with rounding, then by 257/256 */
        mla     tmp, tmp, byte, half
        mla     word, word, byte, half /* 1 stall follows */
        uxtab16 tmp, tmp, tmp, ror #8  /* 1 stall follows */
        uxtab16 word, word, word, ror #8
        /* Recombine bytes */
        /* After the rotate, SEL takes bytes 0 and 2 from tmp (even results)
         * and bytes 1 and 3 from word (odd results) */
        mov     tmp, tmp, ror #8
        sel     word, tmp, word
.endm
472
473/******************************************************************************/
474
/* Init hook for over_8888_n_8888.
 * On exit: MASK = top byte of the word at [sp, #ARGS_STACK_OFFSET+8] (the
 * alpha of the constant mask, in the low byte with the rest zero),
 * STRIDE_M = 0x00800080 and GE[3:0] = 0101. SCRATCH is clobbered.
 * Y, STRIDE_D, STRIDE_S and ORIG_W are passed to line_saved_regs; the head
 * and tail macros alias them to WK4-WK7. */
.macro over_8888_n_8888_init
        /* Mask is constant */
        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
        /* Hold loop invariant in STRIDE_M */
        ldr     STRIDE_M, =0x00800080
        /* We only want the alpha bits of the constant mask */
        mov     MASK, MASK, lsr #24
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, STRIDE_M, STRIDE_M
        line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W
.endm
486
/* Process-head hook for over_8888_n_8888: loads source and destination
 * pixels. Source pixels go to the WK registers starting at 4+(firstreg%2),
 * destination pixels to those starting at firstreg.
 *   numbytes       Number of bytes of each image to load
 *   firstreg       First WK register for destination pixels
 *   unaligned_src  Forwarded to the source load
 *   cond, unaligned_mask, preload  Accepted but not referenced; the loads
 *                                  are emitted with an empty condition
 * WK4-WK7 are aliased to Y, STRIDE_D, STRIDE_S and ORIG_W for the duration
 * of the macro. */
.macro over_8888_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    Y
    WK5     .req    STRIDE_D
    WK6     .req    STRIDE_S
    WK7     .req    ORIG_W
        pixld   , numbytes, %(4+(firstreg%2)), SRC, unaligned_src
        pixld   , numbytes, firstreg, DST, 0
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm
499
/* Composite one masked source pixel over one destination pixel.
 *   src  WK register number holding the source pixel; overwritten with the
 *        source multiplied by the mask alpha
 *   dst  WK register number holding the destination pixel; receives the
 *        result
 * Must be expanded where WK6 and WK7 are defined and WK6 holds 255, as in
 * over_8888_n_8888_process_tail. Requires MASK = mask alpha,
 * STRIDE_M = 0x00800080 and GE[3:0] = 0101. Clobbers SCRATCH and WK7. */
.macro over_8888_n_8888_1pixel src, dst
        /* source *= mask alpha */
        mul_8888_8  WK&src, MASK, SCRATCH, STRIDE_M
        /* WK7 = 255 - alpha of the masked source */
        sub     WK7, WK6, WK&src, lsr #24
        /* destination *= (255 - alpha) */
        mul_8888_8  WK&dst, WK7, SCRATCH, STRIDE_M
        /* Saturated add of masked source to multiplied destination */
        uqadd8  WK&dst, WK&dst, WK&src
.endm
506
/* Process-tail hook for over_8888_n_8888: composites the loaded pixels and
 * stores the result.
 *   numbytes  Number of bytes in the group (numbytes/4 pixels)
 *   firstreg  First WK register holding destination pixels
 *   cond      Accepted but not referenced
 * If every source pixel in the group is zero, both the arithmetic and the
 * store are skipped by branching to local label 10.
 * WK4-WK7 are aliased to Y, STRIDE_D, STRIDE_S and ORIG_W for the duration
 * of the macro. */
.macro over_8888_n_8888_process_tail  cond, numbytes, firstreg
    WK4     .req    Y
    WK5     .req    STRIDE_D
    WK6     .req    STRIDE_S
    WK7     .req    ORIG_W
        over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg)
        beq     10f
        /* From here on WK6 holds the constant 255 and WK7 is a temporary,
         * so any source pixels that were in them are lost */
        mov     WK6, #255
 .set PROCESS_REG, firstreg
 .rept numbytes / 4
  .if numbytes == 16 && PROCESS_REG == 2
        /* We're using WK6 and WK7 as temporaries, so half way through
         * 4 pixels, reload the second two source pixels but this time
         * into WK4 and WK5 */
        ldmdb   SRC, {WK4, WK5}
  .endif
        /* Source register alternates between WK4 and WK5 */
        over_8888_n_8888_1pixel  %(4+(PROCESS_REG%2)), %(PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
 .endr
        pixst   , numbytes, firstreg, DST
10:
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
.endm
533
/* Instantiate pixman_composite_over_8888_n_8888_asm_armv6.
 * The tail macro performs the store itself (FLAG_PROCESS_DOES_STORE) and
 * changes the flags (FLAG_PROCESS_CORRUPTS_PSR).
 * Every argument is comma-separated, matching the other instantiations in
 * this file, rather than relying on whitespace to split macro arguments. */
generate_composite_function \
    pixman_composite_over_8888_n_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
    2, /* prefetch distance */ \
    over_8888_n_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    over_8888_n_8888_process_head, \
    over_8888_n_8888_process_tail
543
544/******************************************************************************/
545
/* Init hook for over_n_8_8888.
 * On exit: STRIDE_S = bytes 0 and 2 of the constant source colour, one per
 * halfword; SRC = bytes 1 and 3 likewise; GE[3:0] = 0101. SCRATCH is
 * clobbered. The 0x00800080 rounding constant is not kept here; it is
 * loaded into STRIDE_D by over_n_8_8888_newline instead.
 * Y, STRIDE_D, STRIDE_M and ORIG_W are passed to line_saved_regs. */
.macro over_n_8_8888_init
        /* Source is constant, but splitting it into even/odd bytes is a loop invariant */
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        /* Not enough registers to hold this constant, but we still use it here to set GE[3:0] */
        ldr     SCRATCH, =0x00800080
        uxtb16  STRIDE_S, SRC
        uxtb16  SRC, SRC, ror #8
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, SCRATCH, SCRATCH
        line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
.endm
557
/* Newline hook for over_n_8_8888: loads the rounding constant 0x00800080
 * into STRIDE_D, which over_n_8_8888_1pixel uses as the MLA addend.
 * STRIDE_D is one of the registers named in line_saved_regs by the init
 * macro, hence presumably the need to set it again for every line - confirm
 * in pixman-arm-simd-asm.h. */
.macro over_n_8_8888_newline
        ldr     STRIDE_D, =0x00800080
        /* Emit the literal pool here, and branch over it so the pool data
         * is not executed */
        b       1f
 .ltorg
1:
.endm
564
/* Process-head hook for over_n_8_8888: loads mask and destination pixels.
 * The mask has one byte per pixel against four for the destination, so
 * numbytes/4 mask bytes are loaded, all into WK4 (aliased to STRIDE_M).
 *   numbytes        Number of destination bytes to load
 *   firstreg        First WK register for destination pixels
 *   unaligned_mask  Forwarded to the mask load
 *   cond, unaligned_src, preload  Accepted but not referenced; the loads
 *                                 are emitted with an empty condition */
.macro over_n_8_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    WK4     .req    STRIDE_M
        pixld   , numbytes/4, 4, MASK, unaligned_mask
        pixld   , numbytes, firstreg, DST, 0
    .unreq  WK4
.endm
571
/* Composite the constant source, scaled by one mask byte, over one
 * destination pixel.
 *   src  Index (0-3) of the mask byte within WK4 to use for this pixel
 *   dst  WK register number holding the destination pixel; receives the
 *        result
 * Must be expanded where WK4 is defined, as in over_n_8_8888_process_tail.
 * Requires STRIDE_S / SRC = even / odd bytes of the source colour,
 * STRIDE_D = 0x00800080 and GE[3:0] = 0101. Clobbers Y, ORIG_W and SCRATCH.
 * The first part is mul_8888_8 written out by hand on the pre-split source,
 * with the computation of 255 - alpha interleaved into it. */
.macro over_n_8_8888_1pixel src, dst
        /* Y = mask byte for this pixel */
        uxtb    Y, WK4, ror #src*8
        /* Trailing part of multiplication of source */
        mla     SCRATCH, STRIDE_S, Y, STRIDE_D
        mla     Y, SRC, Y, STRIDE_D
        mov     ORIG_W, #255
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
        uxtab16 Y, Y, Y, ror #8
        mov     SCRATCH, SCRATCH, ror #8
        /* ORIG_W = 255 - alpha of the masked source (bits 31:24 of Y hold
         * the scaled byte 3 at this point) */
        sub     ORIG_W, ORIG_W, Y, lsr #24
        /* Y = masked source pixel */
        sel     Y, SCRATCH, Y
        /* Then multiply the destination */
        mul_8888_8  WK&dst, ORIG_W, SCRATCH, STRIDE_D
        /* Saturated add of masked source to multiplied destination */
        uqadd8  WK&dst, WK&dst, Y
.endm
587
/* Process-tail hook for over_n_8_8888: composites the loaded pixels and
 * stores the result.
 *   numbytes  Number of destination bytes in the group (numbytes/4 pixels)
 *   firstreg  First WK register holding destination pixels
 *   cond      Accepted but not referenced
 * If WK4 (the loaded mask bytes, aliased to STRIDE_M) is zero, both the
 * arithmetic and the store are skipped by branching to local label 10. */
.macro over_n_8_8888_process_tail  cond, numbytes, firstreg
    WK4     .req    STRIDE_M
        teq     WK4, #0
        beq     10f
 /* PROCESS_REG walks the destination registers; PROCESS_REG-firstreg is the
  * index of the matching mask byte within WK4 */
 .set PROCESS_REG, firstreg
 .rept numbytes / 4
        over_n_8_8888_1pixel  %(PROCESS_REG-firstreg), %(PROCESS_REG)
  .set PROCESS_REG, PROCESS_REG+1
 .endr
        pixst   , numbytes, firstreg, DST
10:
    .unreq  WK4
.endm
601
/* Instantiate pixman_composite_over_n_8_8888_asm_armv6.
 * This is the only function in the file with a real newline hook
 * (over_n_8_8888_newline).
 * Every argument is comma-separated, matching the other instantiations in
 * this file, rather than relying on whitespace to split macro arguments. */
generate_composite_function \
    pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
    2, /* prefetch distance */ \
    over_n_8_8888_init, \
    over_n_8_8888_newline, \
    nop_macro, /* cleanup */ \
    over_n_8_8888_process_head, \
    over_n_8_8888_process_tail
611
612/******************************************************************************/
613
614