/*
 * Copyright © 2012 Raspberry Pi Foundation
 * Copyright © 2012 RISC OS Open Ltd
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of the copyright holders not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission.  The copyright holders make no
 * representations about the suitability of this software for any purpose.  It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author: Ben Avison (bavison@riscosopen.org)
 *
 */

/* Prevent the stack from becoming executable */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

        .text
        .arch armv6
        .object_arch armv4
        .arm
        .altmacro
        .p2align 2

#include "pixman-arm-asm.h"
#include "pixman-arm-simd-asm.h"

/* A head macro should do all processing which results in an output of up to
 * 16 bytes, as far as the final load instruction. The corresponding tail macro
 * should complete the processing of the up-to-16 bytes. The calling macro will
 * sometimes choose to insert a preload or a decrement of X between them.
 *   cond           ARM condition code for code block
 *   numbytes       Number of output bytes that should be generated this time
 *   firstreg       First WK register in which to place output
 *   unaligned_src  Whether to use non-wordaligned loads of source image
 *   unaligned_mask Whether to use non-wordaligned loads of mask image
 *   preload        If outputting 16 bytes causes 64 bytes to be read, whether
 *                  an extra preload should be output
 */

.macro blit_init
        line_saved_regs STRIDE_D, STRIDE_S
.endm

.macro blit_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   cond, numbytes, firstreg, SRC, unaligned_src
.endm

.macro blit_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
 WK4     .req    STRIDE_D
 WK5     .req    STRIDE_S
 WK6     .req    MASK
 WK7     .req    STRIDE_M
110:    pixld   , 16, 0, SRC, unaligned_src
        pixld   , 16, 4, SRC, unaligned_src
        pld     [SRC, SCRATCH]
        pixst   , 16, 0, DST
        pixst   , 16, 4, DST
        subs    X, X, #32*8/src_bpp
        bhs     110b
 .unreq  WK4
 .unreq  WK5
 .unreq  WK6
 .unreq  WK7
.endm

generate_composite_function \
    pixman_composite_src_8888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    4, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop

generate_composite_function \
    pixman_composite_src_0565_0565_asm_armv6, 16, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    4, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop

generate_composite_function \
    pixman_composite_src_8_8_asm_armv6, 8, 0, 8, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    3, /* prefetch distance */ \
    blit_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    blit_process_head, \
    nop_macro, /* process tail */ \
    blit_inner_loop

/******************************************************************************/

.macro src_n_8888_init
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
.endm

.macro src_n_0565_init
        ldrh    SRC, [sp, #ARGS_STACK_OFFSET]
        orr     SRC, SRC, lsl #16
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
.endm

.macro src_n_8_init
        ldrb    SRC, [sp, #ARGS_STACK_OFFSET]
        orr     SRC, SRC, lsl #8
        orr     SRC, SRC, lsl #16
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
.endm

.macro fill_process_tail  cond, numbytes, firstreg
 WK4     .req    SRC
 WK5     .req    STRIDE_S
 WK6     .req    MASK
 WK7     .req    STRIDE_M
        pixst   cond, numbytes, 4, DST
 .unreq  WK4
 .unreq  WK5
 .unreq  WK6
 .unreq  WK7
.endm

generate_composite_function \
    pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
    0, /* prefetch distance doesn't apply */ \
    src_n_8888_init \
    nop_macro, /* newline */ \
    nop_macro /* cleanup */ \
    nop_macro /* process head */ \
    fill_process_tail

generate_composite_function \
    pixman_composite_src_n_0565_asm_armv6, 0, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
    0, /* prefetch distance doesn't apply */ \
    src_n_0565_init \
    nop_macro, /* newline */ \
    nop_macro /* cleanup */ \
    nop_macro /* process head */ \
    fill_process_tail

generate_composite_function \
    pixman_composite_src_n_8_asm_armv6, 0, 0, 8, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
    0, /* prefetch distance doesn't apply */ \
    src_n_8_init \
    nop_macro, /* newline */ \
    nop_macro /* cleanup */ \
    nop_macro /* process head */ \
    fill_process_tail

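/* For reference, the three solid-fill paths above all work the same way: the
 * init macro replicates the source colour across four registers, and
 * fill_process_tail simply stores those registers. A minimal C sketch of the
 * 8bpp byte-splat and fill (illustrative only, not part of the build; the
 * name fill_n_8_ref is made up):
 *
 *     #include <stddef.h>
 *     #include <stdint.h>
 *
 *     static void fill_n_8_ref(uint8_t *dst, size_t len, uint8_t s)
 *     {
 *         // Equivalent of src_n_8_init: splat the byte across a word
 *         uint32_t splat = s;
 *         splat |= splat << 8;    // orr SRC, SRC, lsl #8
 *         splat |= splat << 16;   // orr SRC, SRC, lsl #16
 *         // Equivalent of fill_process_tail: word stores where aligned
 *         size_t i = 0;
 *         for (; i + 4 <= len && ((uintptr_t)(dst + i) & 3) == 0; i += 4)
 *             *(uint32_t *)(dst + i) = splat;
 *         for (; i < len; i++)
 *             dst[i] = s;
 *     }
 */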
/******************************************************************************/

.macro src_x888_8888_pixel, cond, reg
        orr&cond WK&reg, WK&reg, #0xFF000000
.endm

.macro pixman_composite_src_x888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   cond, numbytes, firstreg, SRC, unaligned_src
.endm

.macro pixman_composite_src_x888_8888_process_tail  cond, numbytes, firstreg
        src_x888_8888_pixel cond, %(firstreg+0)
 .if numbytes >= 8
        src_x888_8888_pixel cond, %(firstreg+1)
 .if numbytes == 16
        src_x888_8888_pixel cond, %(firstreg+2)
        src_x888_8888_pixel cond, %(firstreg+3)
 .endif
 .endif
.endm

generate_composite_function \
    pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
    3, /* prefetch distance */ \
    nop_macro, /* init */ \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    pixman_composite_src_x888_8888_process_head, \
    pixman_composite_src_x888_8888_process_tail

/******************************************************************************/

.macro src_0565_8888_init
        /* Hold loop invariants in MASK and STRIDE_M */
        ldr     MASK, =0x07E007E0
        mov     STRIDE_M, #0xFF000000
        /* Set GE[3:0] to 1010 so SEL instructions do what we want */
        ldr     SCRATCH, =0x80008000
        uadd8   SCRATCH, SCRATCH, SCRATCH
.endm

.macro src_0565_8888_2pixels, reg1, reg2
        and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000
        bic     WK&reg2, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb
        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg
        mov     WK&reg1, WK&reg2, lsl #16          @ rrrrr000000bbbbb0000000000000000
        mov     SCRATCH, SCRATCH, ror #19          @ GGGG0000ggggggggggg00000GGGGGGGG
        bic     WK&reg2, WK&reg2, WK&reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ rrrrrrrrrr0bbbbbbbbbb00000000000
        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ RRRRRRRRRR0BBBBBBBBBB00000000000
        pkhtb   WK&reg1, WK&reg1, WK&reg1, asr #5  @ rrrrrrrr--------bbbbbbbb--------
        sel     WK&reg1, WK&reg1, SCRATCH          @ rrrrrrrrggggggggbbbbbbbb--------
        mov     SCRATCH, SCRATCH, ror #16          @ ggg00000GGGGGGGGGGGG0000gggggggg
        pkhtb   WK&reg2, WK&reg2, WK&reg2, asr #5  @ RRRRRRRR--------BBBBBBBB--------
        sel     WK&reg2, WK&reg2, SCRATCH          @ RRRRRRRRGGGGGGGGBBBBBBBB--------
        orr     WK&reg1, STRIDE_M, WK&reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
        orr     WK&reg2, STRIDE_M, WK&reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
.endm

/* This version doesn't need STRIDE_M, but is one instruction longer.
   It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
        and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000
        bic     WK&reg1, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb
        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg
        mov     WK&reg2, WK&reg1, lsr #16          @ 0000000000000000RRRRR000000BBBBB
        mov     SCRATCH, SCRATCH, ror #27          @ GGGGGGGGGGGG0000ggggggggggg00000
        bic     WK&reg1, WK&reg1, WK&reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
        mov     WK&reg2, WK&reg2, lsl #3           @ 0000000000000RRRRR000000BBBBB000
        mov     WK&reg1, WK&reg1, lsl #3           @ 0000000000000rrrrr000000bbbbb000
        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ 0000000000000RRRRRRRRRR0BBBBBBBB
        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
        pkhbt   WK&reg2, WK&reg2, WK&reg2, lsl #5  @ --------RRRRRRRR--------BBBBBBBB
        pkhbt   WK&reg1, WK&reg1, WK&reg1, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
        sel     WK&reg2, SCRATCH, WK&reg2          @ --------RRRRRRRRGGGGGGGGBBBBBBBB
        sel     WK&reg1, SCRATCH, WK&reg1          @ --------rrrrrrrrggggggggbbbbbbbb
        orr     WK&reg2, WK&reg2, #0xFF000000      @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
        orr     WK&reg1, WK&reg1, #0xFF000000      @ 11111111rrrrrrrrggggggggbbbbbbbb
*/

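/* An illustrative C model of the 565-to-8888 expansion performed in this
 * section (a sketch, not part of the build). Each field is widened by
 * replicating its top bits into the low bits, so 0x1F maps to 0xFF and
 * 0x3F maps to 0xFF exactly:
 *
 *     #include <stdint.h>
 *
 *     static uint32_t convert_0565_to_8888_ref(uint16_t s)
 *     {
 *         uint32_t r = (s >> 11) & 0x1F;
 *         uint32_t g = (s >> 5)  & 0x3F;
 *         uint32_t b =  s        & 0x1F;
 *         r = (r << 3) | (r >> 2);   // replicate top 3 bits into the bottom
 *         g = (g << 2) | (g >> 4);   // replicate top 2 bits into the bottom
 *         b = (b << 3) | (b >> 2);
 *         return 0xFF000000u | (r << 16) | (g << 8) | b;
 *     }
 */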
.macro src_0565_8888_1pixel, reg
        bic     SCRATCH, WK&reg, MASK              @ 0000000000000000rrrrr000000bbbbb
        and     WK&reg, WK&reg, MASK               @ 000000000000000000000gggggg00000
        mov     SCRATCH, SCRATCH, lsl #3           @ 0000000000000rrrrr000000bbbbb000
        mov     WK&reg, WK&reg, lsl #5             @ 0000000000000000gggggg0000000000
        orr     SCRATCH, SCRATCH, SCRATCH, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
        orr     WK&reg, WK&reg, WK&reg, lsr #6     @ 000000000000000gggggggggggg00000
        pkhbt   SCRATCH, SCRATCH, SCRATCH, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
        sel     WK&reg, WK&reg, SCRATCH            @ --------rrrrrrrrggggggggbbbbbbbb
        orr     WK&reg, WK&reg, #0xFF000000        @ 11111111rrrrrrrrggggggggbbbbbbbb
.endm

.macro src_0565_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
 .if numbytes == 16
        pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src
 .elseif numbytes == 8
        pixld   , 4, firstreg, SRC, unaligned_src
 .elseif numbytes == 4
        pixld   , 2, firstreg, SRC, unaligned_src
 .endif
.endm

.macro src_0565_8888_process_tail  cond, numbytes, firstreg
 .if numbytes == 16
        src_0565_8888_2pixels firstreg, %(firstreg+1)
        src_0565_8888_2pixels %(firstreg+2), %(firstreg+3)
 .elseif numbytes == 8
        src_0565_8888_2pixels firstreg, %(firstreg+1)
 .else
        src_0565_8888_1pixel firstreg
 .endif
.endm

generate_composite_function \
    pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
    3, /* prefetch distance */ \
    src_0565_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    src_0565_8888_process_head, \
    src_0565_8888_process_tail

/******************************************************************************/

.macro src_x888_0565_init
        /* Hold loop invariant in MASK */
        ldr     MASK, =0x001F001F
        line_saved_regs  STRIDE_S, ORIG_W
.endm

.macro src_x888_0565_1pixel  s, d
        and     WK&d, MASK, WK&s, lsr #3           @ 00000000000rrrrr00000000000bbbbb
        and     STRIDE_S, WK&s, #0xFC00            @ 0000000000000000gggggg0000000000
        orr     WK&d, WK&d, WK&d, lsr #5           @ 00000000000-----rrrrr000000bbbbb
        orr     WK&d, WK&d, STRIDE_S, lsr #5       @ 00000000000-----rrrrrggggggbbbbb
        /* Top 16 bits are discarded during the following STRH */
.endm

.macro src_x888_0565_2pixels  slo, shi, d, tmp
        and     SCRATCH, WK&shi, #0xFC00           @ 0000000000000000GGGGGG0000000000
        and     WK&tmp, MASK, WK&shi, lsr #3       @ 00000000000RRRRR00000000000BBBBB
        and     WK&shi, MASK, WK&slo, lsr #3       @ 00000000000rrrrr00000000000bbbbb
        orr     WK&tmp, WK&tmp, WK&tmp, lsr #5     @ 00000000000-----RRRRR000000BBBBB
        orr     WK&tmp, WK&tmp, SCRATCH, lsr #5    @ 00000000000-----RRRRRGGGGGGBBBBB
        and     SCRATCH, WK&slo, #0xFC00           @ 0000000000000000gggggg0000000000
        orr     WK&shi, WK&shi, WK&shi, lsr #5     @ 00000000000-----rrrrr000000bbbbb
        orr     WK&shi, WK&shi, SCRATCH, lsr #5    @ 00000000000-----rrrrrggggggbbbbb
        pkhbt   WK&d, WK&shi, WK&tmp, lsl #16      @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
.endm

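/* An illustrative C model of the 8888-to-0565 packing done by the two macros
 * above (a sketch, not part of the build): the top 5/6/5 bits of each channel
 * are kept and the rest discarded:
 *
 *     #include <stdint.h>
 *
 *     static uint16_t convert_8888_to_0565_ref(uint32_t s)
 *     {
 *         return (uint16_t)(((s >> 8) & 0xF800) |   // RRRRR00000000000
 *                           ((s >> 5) & 0x07E0) |   // 00000GGGGGG00000
 *                           ((s >> 3) & 0x001F));   // 00000000000BBBBB
 *     }
 */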
.macro src_x888_0565_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
 WK4     .req    STRIDE_S
 WK5     .req    STRIDE_M
 WK6     .req    WK3
 WK7     .req    ORIG_W
 .if numbytes == 16
        pixld   , 16, 4, SRC, 0
        src_x888_0565_2pixels  4, 5, 0, 0
        pixld   , 8, 4, SRC, 0
        src_x888_0565_2pixels  6, 7, 1, 1
        pixld   , 8, 6, SRC, 0
 .else
        pixld   , numbytes*2, 4, SRC, 0
 .endif
.endm

.macro src_x888_0565_process_tail  cond, numbytes, firstreg
 .if numbytes == 16
        src_x888_0565_2pixels  4, 5, 2, 2
        src_x888_0565_2pixels  6, 7, 3, 4
 .elseif numbytes == 8
        src_x888_0565_2pixels  4, 5, 1, 1
        src_x888_0565_2pixels  6, 7, 2, 2
 .elseif numbytes == 4
        src_x888_0565_2pixels  4, 5, 1, 1
 .else
        src_x888_0565_1pixel  4, 1
 .endif
 .if numbytes == 16
        pixst   , numbytes, 0, DST
 .else
        pixst   , numbytes, 1, DST
 .endif
 .unreq  WK4
 .unreq  WK5
 .unreq  WK6
 .unreq  WK7
.endm

generate_composite_function \
    pixman_composite_src_x888_0565_asm_armv6, 32, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
    3, /* prefetch distance */ \
    src_x888_0565_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    src_x888_0565_process_head, \
    src_x888_0565_process_tail

/******************************************************************************/

.macro add_8_8_8pixels  cond, dst1, dst2
        uqadd8&cond  WK&dst1, WK&dst1, MASK
        uqadd8&cond  WK&dst2, WK&dst2, STRIDE_M
.endm

.macro add_8_8_4pixels  cond, dst
        uqadd8&cond  WK&dst, WK&dst, MASK
.endm

.macro add_8_8_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
 WK4     .req    MASK
 WK5     .req    STRIDE_M
 .if numbytes == 16
        pixld   cond, 8, 4, SRC, unaligned_src
        pixld   cond, 16, firstreg, DST, 0
        add_8_8_8pixels cond, firstreg, %(firstreg+1)
        pixld   cond, 8, 4, SRC, unaligned_src
 .else
        pixld   cond, numbytes, 4, SRC, unaligned_src
        pixld   cond, numbytes, firstreg, DST, 0
 .endif
 .unreq  WK4
 .unreq  WK5
.endm

.macro add_8_8_process_tail  cond, numbytes, firstreg
 .if numbytes == 16
        add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3)
 .elseif numbytes == 8
        add_8_8_8pixels cond, firstreg, %(firstreg+1)
 .else
        add_8_8_4pixels cond, firstreg
 .endif
.endm

generate_composite_function \
    pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \
    2, /* prefetch distance */ \
    nop_macro, /* init */ \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    add_8_8_process_head, \
    add_8_8_process_tail

/******************************************************************************/

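/* An illustrative C model of the OVER operator implemented below (a sketch,
 * not part of the build). Colours are premultiplied, so per byte lane
 * dest = src + dest * (255 - src_alpha) / 255. The asm approximates the
 * division by 255 as a multiply by 257/256, i.e. x/255 ~= (x + (x >> 8)) >> 8
 * after adding 0x80 for rounding:
 *
 *     #include <stdint.h>
 *
 *     static uint32_t over_8888_8888_ref(uint32_t src, uint32_t dst)
 *     {
 *         uint32_t nalpha = 255 - (src >> 24), out = 0;
 *         for (int shift = 0; shift < 32; shift += 8) {
 *             uint32_t d = (dst >> shift) & 0xFF;
 *             uint32_t s = (src >> shift) & 0xFF;
 *             uint32_t t = d * nalpha + 0x80;
 *             t = (t + (t >> 8)) >> 8;              // ~ d * nalpha / 255
 *             t += s;
 *             out |= (t > 255 ? 255 : t) << shift;  // uqadd8 saturates
 *         }
 *         return out;
 *     }
 */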
.macro over_8888_8888_init
        /* Hold loop invariant in MASK */
        ldr     MASK, =0x00800080
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, MASK, MASK
        line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
.endm

.macro over_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
 WK4     .req    STRIDE_D
 WK5     .req    STRIDE_S
 WK6     .req    STRIDE_M
 WK7     .req    ORIG_W
        pixld   , numbytes, %(4+firstreg), SRC, unaligned_src
        pixld   , numbytes, firstreg, DST, 0
 .unreq  WK4
 .unreq  WK5
 .unreq  WK6
 .unreq  WK7
.endm

.macro over_8888_8888_check_transparent  numbytes, reg0, reg1, reg2, reg3
        /* Since these colours are premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
        teq     WK&reg0, #0
 .if numbytes > 4
        teqeq   WK&reg1, #0
 .if numbytes > 8
        teqeq   WK&reg2, #0
        teqeq   WK&reg3, #0
 .endif
 .endif
.endm

.macro over_8888_8888_prepare  next
        mov     WK&next, WK&next, lsr #24
.endm

.macro over_8888_8888_1pixel src, dst, offset, next
        /* src = destination component multiplier */
        rsb     WK&src, WK&src, #255
        /* Split even/odd bytes of dst into SCRATCH/dst */
        uxtb16  SCRATCH, WK&dst
        uxtb16  WK&dst, WK&dst, ror #8
        /* Multiply through, adding 0.5 to the upper byte of result for rounding */
        mla     SCRATCH, SCRATCH, WK&src, MASK
        mla     WK&dst, WK&dst, WK&src, MASK
        /* Where we would have had a stall between the result of the first MLA and the shifter input,
         * reload the complete source pixel */
        ldr     WK&src, [SRC, #offset]
        /* Multiply by 257/256 to approximate 256/255 */
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
        /* In this stall, start processing the next pixel */
 .if offset < -4
        mov     WK&next, WK&next, lsr #24
 .endif
        uxtab16 WK&dst, WK&dst, WK&dst, ror #8
        /* Recombine even/odd bytes of multiplied destination */
        mov     SCRATCH, SCRATCH, ror #8
        sel     WK&dst, SCRATCH, WK&dst
        /* Saturated add of source to multiplied destination */
        uqadd8  WK&dst, WK&dst, WK&src
.endm

.macro over_8888_8888_process_tail  cond, numbytes, firstreg
 WK4     .req    STRIDE_D
 WK5     .req    STRIDE_S
 WK6     .req    STRIDE_M
 WK7     .req    ORIG_W
        over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
        beq     10f
        over_8888_8888_prepare %(4+firstreg)
 .set PROCESS_REG, firstreg
 .set PROCESS_OFF, -numbytes
 .rept numbytes / 4
        over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
 .set PROCESS_REG, PROCESS_REG+1
 .set PROCESS_OFF, PROCESS_OFF+4
 .endr
        pixst   , numbytes, firstreg, DST
10:
 .unreq  WK4
 .unreq  WK5
 .unreq  WK6
 .unreq  WK7
.endm

generate_composite_function \
    pixman_composite_over_8888_8888_asm_armv6, 32, 0, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
    2, /* prefetch distance */ \
    over_8888_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    over_8888_8888_process_head, \
    over_8888_8888_process_tail

/******************************************************************************/

/* Multiply each byte of a word by a byte.
 * Useful when there aren't any obvious ways to fill the stalls with other instructions.
 *   word   Register containing 4 bytes
 *   byte   Register containing byte multiplier (bits 8-31 must be 0)
 *   tmp    Scratch register
 *   half   Register containing the constant 0x00800080
 *   GE[3:0] bits must contain 0101
 */
.macro mul_8888_8  word, byte, tmp, half
        /* Split even/odd bytes of word apart */
        uxtb16  tmp, word
        uxtb16  word, word, ror #8
        /* Multiply bytes together with rounding, then by 257/256 */
        mla     tmp, tmp, byte, half
        mla     word, word, byte, half /* 1 stall follows */
        uxtab16 tmp, tmp, tmp, ror #8  /* 1 stall follows */
        uxtab16 word, word, word, ror #8
        /* Recombine bytes */
        mov     tmp, tmp, ror #8
        sel     word, tmp, word
.endm

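/* An illustrative C model of mul_8888_8 (a sketch, not part of the build):
 *
 *     #include <stdint.h>
 *
 *     static uint32_t mul_8888_8_ref(uint32_t word, uint8_t byte)
 *     {
 *         uint32_t out = 0;
 *         for (int shift = 0; shift < 32; shift += 8) {
 *             uint32_t t = ((word >> shift) & 0xFF) * byte + 0x80;
 *             t = (t + (t >> 8)) >> 8;   // multiply by 257/256: a close
 *                                        // approximation of dividing by 255
 *             out |= t << shift;
 *         }
 *         return out;
 *     }
 *
 * The asm does all four lanes in parallel: uxtb16 splits the word into two
 * registers of 16-bit lanes, each MLA multiplies two lanes at once, and SEL
 * recombines the even and odd bytes.
 */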
/******************************************************************************/

.macro over_8888_n_8888_init
        /* Mask is constant */
        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
        /* Hold loop invariant in STRIDE_M */
        ldr     STRIDE_M, =0x00800080
        /* We only want the alpha bits of the constant mask */
        mov     MASK, MASK, lsr #24
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, STRIDE_M, STRIDE_M
        line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W
.endm

.macro over_8888_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
 WK4     .req    Y
 WK5     .req    STRIDE_D
 WK6     .req    STRIDE_S
 WK7     .req    ORIG_W
        pixld   , numbytes, %(4+(firstreg%2)), SRC, unaligned_src
        pixld   , numbytes, firstreg, DST, 0
 .unreq  WK4
 .unreq  WK5
 .unreq  WK6
 .unreq  WK7
.endm

.macro over_8888_n_8888_1pixel src, dst
        mul_8888_8  WK&src, MASK, SCRATCH, STRIDE_M
        sub     WK7, WK6, WK&src, lsr #24
        mul_8888_8  WK&dst, WK7, SCRATCH, STRIDE_M
        uqadd8  WK&dst, WK&dst, WK&src
.endm

.macro over_8888_n_8888_process_tail  cond, numbytes, firstreg
 WK4     .req    Y
 WK5     .req    STRIDE_D
 WK6     .req    STRIDE_S
 WK7     .req    ORIG_W
        over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg)
        beq     10f
        mov     WK6, #255
 .set PROCESS_REG, firstreg
 .rept numbytes / 4
 .if numbytes == 16 && PROCESS_REG == 2
        /* We're using WK6 and WK7 as temporaries, so half way through
         * 4 pixels, reload the second two source pixels but this time
         * into WK4 and WK5 */
        ldmdb   SRC, {WK4, WK5}
 .endif
        over_8888_n_8888_1pixel  %(4+(PROCESS_REG%2)), %(PROCESS_REG)
 .set PROCESS_REG, PROCESS_REG+1
 .endr
        pixst   , numbytes, firstreg, DST
10:
 .unreq  WK4
 .unreq  WK5
 .unreq  WK6
 .unreq  WK7
.endm

generate_composite_function \
    pixman_composite_over_8888_n_8888_asm_armv6, 32, 0, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
    2, /* prefetch distance */ \
    over_8888_n_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    over_8888_n_8888_process_head, \
    over_8888_n_8888_process_tail

/******************************************************************************/

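/* The next fast path composites a solid source through an 8-bit mask. An
 * illustrative C model of one pixel (a sketch, not part of the build), using
 * the same /255 approximation as the earlier sketches:
 *
 *     #include <stdint.h>
 *
 *     static uint32_t scale_ref(uint32_t x, uint32_t a) // per-byte x*a/255
 *     {
 *         uint32_t out = 0;
 *         for (int shift = 0; shift < 32; shift += 8) {
 *             uint32_t t = ((x >> shift) & 0xFF) * a + 0x80;
 *             out |= ((t + (t >> 8)) >> 8) << shift;
 *         }
 *         return out;
 *     }
 *
 *     static uint32_t over_n_8_8888_ref(uint32_t src, uint8_t m, uint32_t dst)
 *     {
 *         uint32_t masked = scale_ref(src, m);               // src IN mask
 *         uint32_t t = scale_ref(dst, 255 - (masked >> 24)); // dst*(1-alpha)
 *         uint32_t out = 0;                                  // uqadd8
 *         for (int shift = 0; shift < 32; shift += 8) {
 *             uint32_t sum = ((t >> shift) & 0xFF) + ((masked >> shift) & 0xFF);
 *             out |= (sum > 255 ? 255 : sum) << shift;
 *         }
 *         return out;
 *     }
 */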
.macro over_n_8_8888_init
        /* Source is constant, but splitting it into even/odd bytes is a loop invariant */
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        /* Not enough registers to hold this constant, but we still use it here to set GE[3:0] */
        ldr     SCRATCH, =0x00800080
        uxtb16  STRIDE_S, SRC
        uxtb16  SRC, SRC, ror #8
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, SCRATCH, SCRATCH
        line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
.endm

.macro over_n_8_8888_newline
        ldr     STRIDE_D, =0x00800080
        b       1f
 .ltorg
1:
.endm

.macro over_n_8_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
 WK4     .req    STRIDE_M
        pixld   , numbytes/4, 4, MASK, unaligned_mask
        pixld   , numbytes, firstreg, DST, 0
 .unreq  WK4
.endm

.macro over_n_8_8888_1pixel src, dst
        uxtb    Y, WK4, ror #src*8
        /* Trailing part of multiplication of source */
        mla     SCRATCH, STRIDE_S, Y, STRIDE_D
        mla     Y, SRC, Y, STRIDE_D
        mov     ORIG_W, #255
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
        uxtab16 Y, Y, Y, ror #8
        mov     SCRATCH, SCRATCH, ror #8
        sub     ORIG_W, ORIG_W, Y, lsr #24
        sel     Y, SCRATCH, Y
        /* Then multiply the destination */
        mul_8888_8  WK&dst, ORIG_W, SCRATCH, STRIDE_D
        uqadd8  WK&dst, WK&dst, Y
.endm

.macro over_n_8_8888_process_tail  cond, numbytes, firstreg
 WK4     .req    STRIDE_M
        teq     WK4, #0
        beq     10f
 .set PROCESS_REG, firstreg
 .rept numbytes / 4
        over_n_8_8888_1pixel  %(PROCESS_REG-firstreg), %(PROCESS_REG)
 .set PROCESS_REG, PROCESS_REG+1
 .endr
        pixst   , numbytes, firstreg, DST
10:
 .unreq  WK4
.endm

generate_composite_function \
    pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
    2, /* prefetch distance */ \
    over_n_8_8888_init, \
    over_n_8_8888_newline, \
    nop_macro, /* cleanup */ \
    over_n_8_8888_process_head, \
    over_n_8_8888_process_tail

/******************************************************************************/

.macro over_reverse_n_8888_init
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        ldr     MASK, =0x00800080
        /* Split source pixel into RB/AG parts */
        uxtb16  STRIDE_S, SRC
        uxtb16  STRIDE_M, SRC, ror #8
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, MASK, MASK
        line_saved_regs  STRIDE_D, ORIG_W
.endm

.macro over_reverse_n_8888_newline
        mov     STRIDE_D, #0xFF
.endm

.macro over_reverse_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   , numbytes, firstreg, DST, 0
.endm

.macro over_reverse_n_8888_1pixel  d, is_only
        teq     WK&d, #0
        beq     8f        /* replace with source */
        bics    ORIG_W, STRIDE_D, WK&d, lsr #24
 .if is_only == 1
        beq     49f       /* skip store */
 .else
        beq     9f        /* write same value back */
 .endif
        mla     SCRATCH, STRIDE_S, ORIG_W, MASK /* red/blue */
        mla     ORIG_W, STRIDE_M, ORIG_W, MASK  /* alpha/green */
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
        uxtab16 ORIG_W, ORIG_W, ORIG_W, ror #8
        mov     SCRATCH, SCRATCH, ror #8
        sel     ORIG_W, SCRATCH, ORIG_W
        uqadd8  WK&d, WK&d, ORIG_W
        b       9f
8:      mov     WK&d, SRC
9:
.endm

.macro over_reverse_n_8888_tail  numbytes, reg1, reg2, reg3, reg4
 .if numbytes == 4
        over_reverse_n_8888_1pixel  reg1, 1
 .else
        and     SCRATCH, WK&reg1, WK&reg2
 .if numbytes == 16
        and     SCRATCH, SCRATCH, WK&reg3
        and     SCRATCH, SCRATCH, WK&reg4
 .endif
        mvns    SCRATCH, SCRATCH, asr #24
        beq     49f /* skip store if all opaque */
        over_reverse_n_8888_1pixel  reg1, 0
        over_reverse_n_8888_1pixel  reg2, 0
 .if numbytes == 16
        over_reverse_n_8888_1pixel  reg3, 0
        over_reverse_n_8888_1pixel  reg4, 0
 .endif
 .endif
        pixst   , numbytes, reg1, DST
49:
.endm

.macro over_reverse_n_8888_process_tail  cond, numbytes, firstreg
        over_reverse_n_8888_tail  numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
.endm

generate_composite_function \
    pixman_composite_over_reverse_n_8888_asm_armv6, 0, 0, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
    3, /* prefetch distance */ \
    over_reverse_n_8888_init, \
    over_reverse_n_8888_newline, \
    nop_macro, /* cleanup */ \
    over_reverse_n_8888_process_head, \
    over_reverse_n_8888_process_tail

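/* An illustrative C model of OVER_REVERSE (dest OVER source) as implemented
 * above (a sketch, not part of the build): the existing destination stays on
 * top, and the solid source only shows through its remaining transparency.
 * The two early branches mirror the asm's transparent/opaque shortcuts:
 *
 *     #include <stdint.h>
 *
 *     static uint32_t over_reverse_n_8888_ref(uint32_t src, uint32_t dst)
 *     {
 *         if (dst == 0)
 *             return src;                  // fully transparent: take source
 *         uint32_t nalpha = 255 - (dst >> 24);
 *         if (nalpha == 0)
 *             return dst;                  // fully opaque: destination wins
 *         uint32_t out = 0;
 *         for (int shift = 0; shift < 32; shift += 8) {
 *             uint32_t t = ((src >> shift) & 0xFF) * nalpha + 0x80;
 *             t = (t + (t >> 8)) >> 8;     // ~ /255, as in the macros above
 *             t += (dst >> shift) & 0xFF;
 *             out |= (t > 255 ? 255 : t) << shift;
 *         }
 *         return out;
 *     }
 */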
/******************************************************************************/

.macro over_white_8888_8888_ca_init
 HALF    .req    SRC
 TMP0    .req    STRIDE_D
 TMP1    .req    STRIDE_S
 TMP2    .req    STRIDE_M
 TMP3    .req    ORIG_W
 WK4     .req    SCRATCH
        line_saved_regs STRIDE_D, STRIDE_M, ORIG_W
        ldr     SCRATCH, =0x800080
        mov     HALF, #0x80
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, SCRATCH, SCRATCH
 .set DST_PRELOAD_BIAS, 8
.endm

.macro over_white_8888_8888_ca_cleanup
 .set DST_PRELOAD_BIAS, 0
 .unreq  HALF
 .unreq  TMP0
 .unreq  TMP1
 .unreq  TMP2
 .unreq  TMP3
 .unreq  WK4
.endm

.macro over_white_8888_8888_ca_combine  m, d
        uxtb16  TMP1, TMP0                /* rb_notmask */
        uxtb16  TMP2, d                   /* rb_dest; 1 stall follows */
        smlatt  TMP3, TMP2, TMP1, HALF    /* red */
        smlabb  TMP2, TMP2, TMP1, HALF    /* blue */
        uxtb16  TMP0, TMP0, ror #8        /* ag_notmask */
        uxtb16  TMP1, d, ror #8           /* ag_dest; 1 stall follows */
        smlatt  d, TMP1, TMP0, HALF       /* alpha */
        smlabb  TMP1, TMP1, TMP0, HALF    /* green */
        pkhbt   TMP0, TMP2, TMP3, lsl #16 /* rb; 1 stall follows */
        pkhbt   TMP1, TMP1, d, lsl #16    /* ag */
        uxtab16 TMP0, TMP0, TMP0, ror #8
        uxtab16 TMP1, TMP1, TMP1, ror #8
        mov     TMP0, TMP0, ror #8
        sel     d, TMP0, TMP1
        uqadd8  d, d, m                   /* d is a late result */
.endm

.macro over_white_8888_8888_ca_1pixel_head
        pixld   , 4, 1, MASK, 0
        pixld   , 4, 3, DST, 0
.endm

.macro over_white_8888_8888_ca_1pixel_tail
        mvn     TMP0, WK1
        teq     WK1, WK1, asr #32
        bne     01f
        bcc     03f
        mov     WK3, WK1
        b       02f
01:     over_white_8888_8888_ca_combine WK1, WK3
02:     pixst   , 4, 3, DST
03:
.endm

.macro over_white_8888_8888_ca_2pixels_head
        pixld   , 8, 1, MASK, 0
.endm

.macro over_white_8888_8888_ca_2pixels_tail
        pixld   , 8, 3, DST
        mvn     TMP0, WK1
        teq     WK1, WK1, asr #32
        bne     01f
        movcs   WK3, WK1
        bcs     02f
        teq     WK2, #0
        beq     05f
        b       02f
01:     over_white_8888_8888_ca_combine WK1, WK3
02:     mvn     TMP0, WK2
        teq     WK2, WK2, asr #32
        bne     03f
        movcs   WK4, WK2
        b       04f
03:     over_white_8888_8888_ca_combine WK2, WK4
04:     pixst   , 8, 3, DST
05:
.endm

.macro over_white_8888_8888_ca_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
 .if numbytes == 4
        over_white_8888_8888_ca_1pixel_head
 .else
 .if numbytes == 16
        over_white_8888_8888_ca_2pixels_head
        over_white_8888_8888_ca_2pixels_tail
 .endif
        over_white_8888_8888_ca_2pixels_head
 .endif
.endm

.macro over_white_8888_8888_ca_process_tail  cond, numbytes, firstreg
 .if numbytes == 4
        over_white_8888_8888_ca_1pixel_tail
 .else
        over_white_8888_8888_ca_2pixels_tail
 .endif
.endm

generate_composite_function \
    pixman_composite_over_white_8888_8888_ca_asm_armv6, 0, 32, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH \
    2, /* prefetch distance */ \
    over_white_8888_8888_ca_init, \
    nop_macro, /* newline */ \
    over_white_8888_8888_ca_cleanup, \
    over_white_8888_8888_ca_process_head, \
    over_white_8888_8888_ca_process_tail

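/* The fast path above and the general one below implement component-alpha
 * OVER: the mask carries a separate alpha per channel. An illustrative C
 * model of one white-source pixel, matching the special case above (a
 * sketch, not part of the build):
 *
 *     #include <stdint.h>
 *
 *     static uint32_t over_white_ca_ref(uint32_t mask, uint32_t dst)
 *     {
 *         uint32_t out = 0;
 *         for (int shift = 0; shift < 32; shift += 8) {
 *             uint32_t m = (mask >> shift) & 0xFF;
 *             uint32_t d = (dst >> shift) & 0xFF;
 *             // white source: result = m + d * (255 - m) / 255 per channel
 *             uint32_t t = d * (255 - m) + 0x80;
 *             t = (t + (t >> 8)) >> 8;
 *             t += m;
 *             out |= (t > 255 ? 255 : t) << shift;
 *         }
 *         return out;
 *     }
 *
 * The general over_n_8888_8888_ca case below additionally multiplies the
 * mask by the source colour, and uses src_alpha * mask as the destination
 * multiplier.
 */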
.macro over_n_8888_8888_ca_init
        /* Set up constants. RB_SRC and AG_SRC are in registers;
         * RB_FLDS, A_SRC, and the two HALF values need to go on the
         * stack (and the full SRC value is already there) */
        ldr     SCRATCH, [sp, #ARGS_STACK_OFFSET]
        mov     WK0, #0x00FF0000
        orr     WK0, WK0, #0xFF          /* RB_FLDS (0x00FF00FF) */
        mov     WK1, #0x80               /* HALF default value */
        mov     WK2, SCRATCH, lsr #24    /* A_SRC */
        orr     WK3, WK1, WK1, lsl #16   /* HALF alternate value (0x00800080) */
        push    {WK0-WK3}
 .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+16
        uxtb16  SRC, SCRATCH
        uxtb16  STRIDE_S, SCRATCH, ror #8

        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, WK3, WK3

 .unreq  WK0
 .unreq  WK1
 .unreq  WK2
 .unreq  WK3
 WK0     .req    Y
 WK1     .req    STRIDE_D
 RB_SRC  .req    SRC
 AG_SRC  .req    STRIDE_S
 WK2     .req    STRIDE_M
 RB_FLDS .req    r8       /* the reloaded constants have to be at consecutive registers starting at an even one */
 A_SRC   .req    r8
 HALF    .req    r9
 WK3     .req    r10
 WK4     .req    r11
 WK5     .req    SCRATCH
 WK6     .req    ORIG_W

        line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
.endm

.macro over_n_8888_8888_ca_cleanup
        add     sp, sp, #16
 .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-16

 .unreq  WK0
 .unreq  WK1
 .unreq  RB_SRC
 .unreq  AG_SRC
 .unreq  WK2
 .unreq  RB_FLDS
 .unreq  A_SRC
 .unreq  HALF
 .unreq  WK3
 .unreq  WK4
 .unreq  WK5
 .unreq  WK6
 WK0     .req    r8
 WK1     .req    r9
 WK2     .req    r10
 WK3     .req    r11
.endm

.macro over_n_8888_8888_ca_1pixel_head
        pixld   , 4, 6, MASK, 0
        pixld   , 4, 0, DST, 0
.endm

.macro over_n_8888_8888_ca_1pixel_tail
        ldrd    A_SRC, HALF, [sp, #LOCALS_STACK_OFFSET+8]
        uxtb16  WK1, WK6                 /* rb_mask (first step of hard case placed in what would otherwise be a stall) */
        teq     WK6, WK6, asr #32        /* Zc if transparent, ZC if opaque */
        bne     20f
        bcc     40f
        /* Mask is fully opaque (all channels) */
        ldr     WK6, [sp, #ARGS_STACK_OFFSET] /* get SRC back */
        eors    A_SRC, A_SRC, #0xFF
        bne     10f
        /* Source is also opaque - same as src_8888_8888 */
        mov     WK0, WK6
        b       30f
10:     /* Same as over_8888_8888 */
        mul_8888_8  WK0, A_SRC, WK5, HALF
        uqadd8  WK0, WK0, WK6
        b       30f
20:     /* No simplifications possible - do it the hard way */
        uxtb16  WK2, WK6, ror #8         /* ag_mask */
        mla     WK3, WK1, A_SRC, HALF    /* rb_mul; 2 cycles */
        mla     WK4, WK2, A_SRC, HALF    /* ag_mul; 2 cycles */
        ldrd    RB_FLDS, HALF, [sp, #LOCALS_STACK_OFFSET]
        uxtb16  WK5, WK0                 /* rb_dest */
        uxtab16 WK3, WK3, WK3, ror #8
        uxtb16  WK6, WK0, ror #8         /* ag_dest */
        uxtab16 WK4, WK4, WK4, ror #8
        smlatt  WK0, RB_SRC, WK1, HALF   /* red1 */
        smlabb  WK1, RB_SRC, WK1, HALF   /* blue1 */
        bic     WK3, RB_FLDS, WK3, lsr #8
        bic     WK4, RB_FLDS, WK4, lsr #8
        pkhbt   WK1, WK1, WK0, lsl #16   /* rb1 */
        smlatt  WK0, WK5, WK3, HALF      /* red2 */
        smlabb  WK3, WK5, WK3, HALF      /* blue2 */
        uxtab16 WK1, WK1, WK1, ror #8
        smlatt  WK5, AG_SRC, WK2, HALF   /* alpha1 */
        pkhbt   WK3, WK3, WK0, lsl #16   /* rb2 */
        smlabb  WK0, AG_SRC, WK2, HALF   /* green1 */
        smlatt  WK2, WK6, WK4, HALF      /* alpha2 */
        smlabb  WK4, WK6, WK4, HALF      /* green2 */
        pkhbt   WK0, WK0, WK5, lsl #16   /* ag1 */
        uxtab16 WK3, WK3, WK3, ror #8
        pkhbt   WK4, WK4, WK2, lsl #16   /* ag2 */
        uxtab16 WK0, WK0, WK0, ror #8
        uxtab16 WK4, WK4, WK4, ror #8
        mov     WK1, WK1, ror #8
        mov     WK3, WK3, ror #8
        sel     WK2, WK1, WK0            /* recombine source*mask */
        sel     WK1, WK3, WK4            /* recombine dest*(1-source_alpha*mask) */
        uqadd8  WK0, WK1, WK2            /* followed by 1 stall */
30:     /* The destination buffer is already in the L1 cache, so
         * there's little point in amalgamating writes */
        pixst   , 4, 0, DST
40:
.endm

.macro over_n_8888_8888_ca_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
 .rept (numbytes / 4) - 1
        over_n_8888_8888_ca_1pixel_head
        over_n_8888_8888_ca_1pixel_tail
 .endr
        over_n_8888_8888_ca_1pixel_head
.endm

.macro over_n_8888_8888_ca_process_tail  cond, numbytes, firstreg
        over_n_8888_8888_ca_1pixel_tail
.endm

pixman_asm_function pixman_composite_over_n_8888_8888_ca_asm_armv6
        ldr     ip, [sp]
        cmp     ip, #-1
        beq     pixman_composite_over_white_8888_8888_ca_asm_armv6
        /* else drop through... */
 .endfunc
generate_composite_function \
    pixman_composite_over_n_8888_8888_ca_asm_armv6_helper, 0, 32, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_PROCESS_CORRUPTS_WK0 \
    2, /* prefetch distance */ \
    over_n_8888_8888_ca_init, \
    nop_macro, /* newline */ \
    over_n_8888_8888_ca_cleanup, \
    over_n_8888_8888_ca_process_head, \
    over_n_8888_8888_ca_process_tail

/******************************************************************************/

.macro in_reverse_8888_8888_init
        /* Hold loop invariant in MASK */
        ldr     MASK, =0x00800080
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, MASK, MASK
        /* Offset the source pointer: we only need the alpha bytes */
        add     SRC, SRC, #3
        line_saved_regs  ORIG_W
.endm

.macro in_reverse_8888_8888_head  numbytes, reg1, reg2, reg3
        ldrb    ORIG_W, [SRC], #4
 .if numbytes >= 8
        ldrb    WK&reg1, [SRC], #4
 .if numbytes == 16
        ldrb    WK&reg2, [SRC], #4
        ldrb    WK&reg3, [SRC], #4
 .endif
 .endif
        add     DST, DST, #numbytes
.endm

.macro in_reverse_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        in_reverse_8888_8888_head  numbytes, firstreg, %(firstreg+1), %(firstreg+2)
.endm

.macro in_reverse_8888_8888_1pixel  s, d, offset, is_only
 .if is_only != 1
        movs    s, ORIG_W
 .if offset != 0
        ldrb    ORIG_W, [SRC, #offset]
 .endif
        beq     01f
        teq     STRIDE_M, #0xFF
        beq     02f
 .endif
        uxtb16  SCRATCH, d                 /* rb_dest */
        uxtb16  d, d, ror #8               /* ag_dest */
        mla     SCRATCH, SCRATCH, s, MASK
        mla     d, d, s, MASK
        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
        uxtab16 d, d, d, ror #8
        mov     SCRATCH, SCRATCH, ror #8
        sel     d, SCRATCH, d
        b       02f
 .if offset == 0
48:     /* Last mov d,#0 of the set - used as part of shortcut for
         * source values all 0 */
 .endif
01:     mov     d, #0
02:
.endm

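/* An illustrative C model of IN_REVERSE as implemented in this section (a
 * sketch, not part of the build): each destination pixel is scaled by the
 * source pixel's alpha only, which is why the init macro offsets SRC so that
 * just the alpha bytes are loaded. The tail macros below add batch shortcuts
 * for runs of all-0 or all-0xFF source alphas:
 *
 *     #include <stdint.h>
 *
 *     static uint32_t in_reverse_8888_8888_ref(uint32_t src, uint32_t dst)
 *     {
 *         uint32_t a = src >> 24;
 *         if (a == 0)
 *             return 0;                  // shortcut: transparent source
 *         if (a == 0xFF)
 *             return dst;                // shortcut: opaque source
 *         uint32_t out = 0;
 *         for (int shift = 0; shift < 32; shift += 8) {
 *             uint32_t t = ((dst >> shift) & 0xFF) * a + 0x80;
 *             out |= ((t + (t >> 8)) >> 8) << shift;   // ~ /255
 *         }
 *         return out;
 *     }
 */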
.macro in_reverse_8888_8888_tail  numbytes, reg1, reg2, reg3, reg4
 .if numbytes == 4
        teq     ORIG_W, ORIG_W, asr #32
        ldrne   WK&reg1, [DST, #-4]
 .elseif numbytes == 8
        teq     ORIG_W, WK&reg1
        teqeq   ORIG_W, ORIG_W, asr #32  /* all 0 or all -1? */
        ldmnedb DST, {WK&reg1-WK&reg2}
 .else
        teq     ORIG_W, WK&reg1
        teqeq   ORIG_W, WK&reg2
        teqeq   ORIG_W, WK&reg3
        teqeq   ORIG_W, ORIG_W, asr #32  /* all 0 or all -1? */
        ldmnedb DST, {WK&reg1-WK&reg4}
 .endif
        cmnne   DST, #0   /* clear C if NE */
        bcs     49f       /* no writes to dest if source all -1 */
        beq     48f       /* set dest to all 0 if source all 0 */
 .if numbytes == 4
        in_reverse_8888_8888_1pixel  ORIG_W, WK&reg1, 0, 1
        str     WK&reg1, [DST, #-4]
 .elseif numbytes == 8
        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg1, -4, 0
        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg2, 0, 0
        stmdb   DST, {WK&reg1-WK&reg2}
 .else
        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg1, -12, 0
        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg2, -8, 0
        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg3, -4, 0
        in_reverse_8888_8888_1pixel  STRIDE_M, WK&reg4, 0, 0
        stmdb   DST, {WK&reg1-WK&reg4}
 .endif
49:
.endm

.macro in_reverse_8888_8888_process_tail  cond, numbytes, firstreg
        in_reverse_8888_8888_tail  numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
.endm

generate_composite_function \
    pixman_composite_in_reverse_8888_8888_asm_armv6, 32, 0, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_NO_PRELOAD_DST \
    2, /* prefetch distance */ \
    in_reverse_8888_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    in_reverse_8888_8888_process_head, \
    in_reverse_8888_8888_process_tail

/******************************************************************************/

.macro over_n_8888_init
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        /* Hold loop invariant in MASK */
        ldr     MASK, =0x00800080
        /* Hold multiplier for destination in STRIDE_M */
        mov     STRIDE_M, #255
        sub     STRIDE_M, STRIDE_M, SRC, lsr #24
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        uadd8   SCRATCH, MASK, MASK
.endm

.macro over_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   , numbytes, firstreg, DST, 0
.endm

.macro over_n_8888_1pixel  dst
        mul_8888_8  WK&dst, STRIDE_M, SCRATCH, MASK
        uqadd8  WK&dst, WK&dst, SRC
.endm

.macro over_n_8888_process_tail  cond, numbytes, firstreg
 .set PROCESS_REG, firstreg
 .rept numbytes / 4
        over_n_8888_1pixel %(PROCESS_REG)
 .set PROCESS_REG, PROCESS_REG+1
 .endr
        pixst   , numbytes, firstreg, DST
.endm

generate_composite_function \
    pixman_composite_over_n_8888_asm_armv6, 0, 0, 32 \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE \
    2, /* prefetch distance */ \
    over_n_8888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    over_n_8888_process_head, \
    over_n_8888_process_tail

/******************************************************************************/