1/* 2 * Copyright © 2009 Nokia Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 * DEALINGS IN THE SOFTWARE. 22 * 23 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com) 24 */ 25 26/* 27 * This file contains implementations of NEON optimized pixel processing 28 * functions. There is no full and detailed tutorial, but some functions 29 * (those which are exposing some new or interesting features) are 30 * extensively commented and can be used as examples. 31 * 32 * You may want to have a look at the comments for following functions: 33 * - pixman_composite_over_8888_0565_asm_neon 34 * - pixman_composite_over_n_8_0565_asm_neon 35 */ 36 37/* Prevent the stack from becoming executable for no reason... */ 38#if defined(__linux__) && defined(__ELF__) 39.section .note.GNU-stack,"",%progbits 40#endif 41 42 .text 43 .fpu neon 44 .arch armv7a 45 .object_arch armv4 46 .eabi_attribute 10, 0 /* suppress Tag_FP_arch */ 47 .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */ 48 .arm 49 .altmacro 50 .p2align 2 51 52/* Supplementary macro for setting function attributes */ 53.macro pixman_asm_function fname 54 .func fname 55 .global fname 56#ifdef __ELF__ 57 .hidden fname 58 .type fname, %function 59#endif 60fname: 61.endm 62 63/* 64 * The defines which are shared between C and assembly code 65 */ 66 67/* bilinear interpolation precision (must be < 8) */ 68#define BILINEAR_INTERPOLATION_BITS 7 69#define BILINEAR_INTERPOLATION_RANGE (1 << BILINEAR_INTERPOLATION_BITS) 70 71/* 72 * Copyright © 2009 Nokia Corporation 73 * 74 * Permission is hereby granted, free of charge, to any person obtaining a 75 * copy of this software and associated documentation files (the "Software"), 76 * to deal in the Software without restriction, including without limitation 77 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 78 * and/or sell copies of the Software, and to permit persons to whom the 79 * Software is furnished to do so, subject to the following conditions: 80 * 81 * The above copyright notice and this permission notice (including the next 82 * paragraph) shall be included in all copies or substantial portions of the 83 * Software. 84 * 85 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 86 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 87 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 88 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 89 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 90 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 91 * DEALINGS IN THE SOFTWARE. 92 * 93 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com) 94 */ 95 96/* 97 * This file contains a macro ('generate_composite_function') which can 98 * construct 2D image processing functions, based on a common template. 99 * Any combinations of source, destination and mask images with 8bpp, 100 * 16bpp, 24bpp, 32bpp color formats are supported. 101 * 102 * This macro takes care of: 103 * - handling of leading and trailing unaligned pixels 104 * - doing most of the work related to L2 cache preload 105 * - encourages the use of software pipelining for better instructions 106 * scheduling 107 * 108 * The user of this macro has to provide some configuration parameters 109 * (bit depths for the images, prefetch distance, etc.) and a set of 110 * macros, which should implement basic code chunks responsible for 111 * pixels processing. See 'pixman-arm-neon-asm.S' file for the usage 112 * examples. 113 * 114 * TODO: 115 * - try overlapped pixel method (from Ian Rickards) when processing 116 * exactly two blocks of pixels 117 * - maybe add an option to do reverse scanline processing 118 */ 119 120/* 121 * Bit flags for 'generate_composite_function' macro which are used 122 * to tune generated functions behavior. 123 */ 124.set FLAG_DST_WRITEONLY, 0 125.set FLAG_DST_READWRITE, 1 126.set FLAG_DEINTERLEAVE_32BPP, 2 127 128/* 129 * Offset in stack where mask and source pointer/stride can be accessed 130 * from 'init' macro. This is useful for doing special handling for solid mask. 131 */ 132.set ARGS_STACK_OFFSET, 40 133 134/* 135 * Constants for selecting preferable prefetch type. 136 */ 137.set PREFETCH_TYPE_NONE, 0 /* No prefetch at all */ 138.set PREFETCH_TYPE_SIMPLE, 1 /* A simple, fixed-distance-ahead prefetch */ 139.set PREFETCH_TYPE_ADVANCED, 2 /* Advanced fine-grained prefetch */ 140 141/* 142 * Definitions of supplementary pixld/pixst macros (for partial load/store of 143 * pixel data). 144 */ 145 146.macro pixldst1 op, elem_size, reg1, mem_operand, abits 147.if abits > 0 148 op&.&elem_size {d®1}, [&mem_operand&, :&abits&]! 149.else 150 op&.&elem_size {d®1}, [&mem_operand&]! 151.endif 152.endm 153 154.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits 155.if abits > 0 156 op&.&elem_size {d®1, d®2}, [&mem_operand&, :&abits&]! 157.else 158 op&.&elem_size {d®1, d®2}, [&mem_operand&]! 159.endif 160.endm 161 162.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits 163.if abits > 0 164 op&.&elem_size {d®1, d®2, d®3, d®4}, [&mem_operand&, :&abits&]! 165.else 166 op&.&elem_size {d®1, d®2, d®3, d®4}, [&mem_operand&]! 167.endif 168.endm 169 170.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits 171 op&.&elem_size {d®1[idx]}, [&mem_operand&]! 172.endm 173 174.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand 175 op&.&elem_size {d®1, d®2, d®3}, [&mem_operand&]! 176.endm 177 178.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand 179 op&.&elem_size {d®1[idx], d®2[idx], d®3[idx]}, [&mem_operand&]! 180.endm 181 182.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits 183.if numbytes == 32 184 pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \ 185 %(basereg+6), %(basereg+7), mem_operand, abits 186.elseif numbytes == 16 187 pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits 188.elseif numbytes == 8 189 pixldst1 op, elem_size, %(basereg+1), mem_operand, abits 190.elseif numbytes == 4 191 .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32) 192 pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits 193 .elseif elem_size == 16 194 pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits 195 pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits 196 .else 197 pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits 198 pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits 199 pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits 200 pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits 201 .endif 202.elseif numbytes == 2 203 .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16) 204 pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits 205 .else 206 pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits 207 pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits 208 .endif 209.elseif numbytes == 1 210 pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits 211.else 212 .error "unsupported size: numbytes" 213.endif 214.endm 215 216.macro pixld numpix, bpp, basereg, mem_operand, abits=0 217.if bpp > 0 218.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) 219 pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \ 220 %(basereg+6), %(basereg+7), mem_operand, abits 221.elseif (bpp == 24) && (numpix == 8) 222 pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand 223.elseif (bpp == 24) && (numpix == 4) 224 pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand 225 pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand 226 pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand 227 pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand 228.elseif (bpp == 24) && (numpix == 2) 229 pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand 230 pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand 231.elseif (bpp == 24) && (numpix == 1) 232 pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand 233.else 234 pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits 235.endif 236.endif 237.endm 238 239.macro pixst numpix, bpp, basereg, mem_operand, abits=0 240.if bpp > 0 241.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) 242 pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \ 243 %(basereg+6), %(basereg+7), mem_operand, abits 244.elseif (bpp == 24) && (numpix == 8) 245 pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand 246.elseif (bpp == 24) && (numpix == 4) 247 pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand 248 pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand 249 pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand 250 pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand 251.elseif (bpp == 24) && (numpix == 2) 252 pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand 253 pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand 254.elseif (bpp == 24) && (numpix == 1) 255 pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand 256.else 257 pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits 258.endif 259.endif 260.endm 261 262.macro pixld_a numpix, bpp, basereg, mem_operand 263.if (bpp * numpix) <= 128 264 pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix) 265.else 266 pixld numpix, bpp, basereg, mem_operand, 128 267.endif 268.endm 269 270.macro pixst_a numpix, bpp, basereg, mem_operand 271.if (bpp * numpix) <= 128 272 pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix) 273.else 274 pixst numpix, bpp, basereg, mem_operand, 128 275.endif 276.endm 277 278/* 279 * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register 280 * aliases to be defined) 281 */ 282.macro pixld1_s elem_size, reg1, mem_operand 283.if elem_size == 16 284 mov TMP1, VX, asr #16 285 adds VX, VX, UNIT_X 2865: subpls VX, VX, SRC_WIDTH_FIXED 287 bpl 5b 288 add TMP1, mem_operand, TMP1, asl #1 289 mov TMP2, VX, asr #16 290 adds VX, VX, UNIT_X 2915: subpls VX, VX, SRC_WIDTH_FIXED 292 bpl 5b 293 add TMP2, mem_operand, TMP2, asl #1 294 vld1.16 {d®1&[0]}, [TMP1, :16] 295 mov TMP1, VX, asr #16 296 adds VX, VX, UNIT_X 2975: subpls VX, VX, SRC_WIDTH_FIXED 298 bpl 5b 299 add TMP1, mem_operand, TMP1, asl #1 300 vld1.16 {d®1&[1]}, [TMP2, :16] 301 mov TMP2, VX, asr #16 302 adds VX, VX, UNIT_X 3035: subpls VX, VX, SRC_WIDTH_FIXED 304 bpl 5b 305 add TMP2, mem_operand, TMP2, asl #1 306 vld1.16 {d®1&[2]}, [TMP1, :16] 307 vld1.16 {d®1&[3]}, [TMP2, :16] 308.elseif elem_size == 32 309 mov TMP1, VX, asr #16 310 adds VX, VX, UNIT_X 3115: subpls VX, VX, SRC_WIDTH_FIXED 312 bpl 5b 313 add TMP1, mem_operand, TMP1, asl #2 314 mov TMP2, VX, asr #16 315 adds VX, VX, UNIT_X 3165: subpls VX, VX, SRC_WIDTH_FIXED 317 bpl 5b 318 add TMP2, mem_operand, TMP2, asl #2 319 vld1.32 {d®1&[0]}, [TMP1, :32] 320 vld1.32 {d®1&[1]}, [TMP2, :32] 321.else 322 .error "unsupported" 323.endif 324.endm 325 326.macro pixld2_s elem_size, reg1, reg2, mem_operand 327.if 0 /* elem_size == 32 */ 328 mov TMP1, VX, asr #16 329 add VX, VX, UNIT_X, asl #1 330 add TMP1, mem_operand, TMP1, asl #2 331 mov TMP2, VX, asr #16 332 sub VX, VX, UNIT_X 333 add TMP2, mem_operand, TMP2, asl #2 334 vld1.32 {d®1&[0]}, [TMP1, :32] 335 mov TMP1, VX, asr #16 336 add VX, VX, UNIT_X, asl #1 337 add TMP1, mem_operand, TMP1, asl #2 338 vld1.32 {d®2&[0]}, [TMP2, :32] 339 mov TMP2, VX, asr #16 340 add VX, VX, UNIT_X 341 add TMP2, mem_operand, TMP2, asl #2 342 vld1.32 {d®1&[1]}, [TMP1, :32] 343 vld1.32 {d®2&[1]}, [TMP2, :32] 344.else 345 pixld1_s elem_size, reg1, mem_operand 346 pixld1_s elem_size, reg2, mem_operand 347.endif 348.endm 349 350.macro pixld0_s elem_size, reg1, idx, mem_operand 351.if elem_size == 16 352 mov TMP1, VX, asr #16 353 adds VX, VX, UNIT_X 3545: subpls VX, VX, SRC_WIDTH_FIXED 355 bpl 5b 356 add TMP1, mem_operand, TMP1, asl #1 357 vld1.16 {d®1&[idx]}, [TMP1, :16] 358.elseif elem_size == 32 359 mov TMP1, VX, asr #16 360 adds VX, VX, UNIT_X 3615: subpls VX, VX, SRC_WIDTH_FIXED 362 bpl 5b 363 add TMP1, mem_operand, TMP1, asl #2 364 vld1.32 {d®1&[idx]}, [TMP1, :32] 365.endif 366.endm 367 368.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand 369.if numbytes == 32 370 pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand 371 pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand 372 pixdeinterleave elem_size, %(basereg+4) 373.elseif numbytes == 16 374 pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand 375.elseif numbytes == 8 376 pixld1_s elem_size, %(basereg+1), mem_operand 377.elseif numbytes == 4 378 .if elem_size == 32 379 pixld0_s elem_size, %(basereg+0), 1, mem_operand 380 .elseif elem_size == 16 381 pixld0_s elem_size, %(basereg+0), 2, mem_operand 382 pixld0_s elem_size, %(basereg+0), 3, mem_operand 383 .else 384 pixld0_s elem_size, %(basereg+0), 4, mem_operand 385 pixld0_s elem_size, %(basereg+0), 5, mem_operand 386 pixld0_s elem_size, %(basereg+0), 6, mem_operand 387 pixld0_s elem_size, %(basereg+0), 7, mem_operand 388 .endif 389.elseif numbytes == 2 390 .if elem_size == 16 391 pixld0_s elem_size, %(basereg+0), 1, mem_operand 392 .else 393 pixld0_s elem_size, %(basereg+0), 2, mem_operand 394 pixld0_s elem_size, %(basereg+0), 3, mem_operand 395 .endif 396.elseif numbytes == 1 397 pixld0_s elem_size, %(basereg+0), 1, mem_operand 398.else 399 .error "unsupported size: numbytes" 400.endif 401.endm 402 403.macro pixld_s numpix, bpp, basereg, mem_operand 404.if bpp > 0 405 pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand 406.endif 407.endm 408 409.macro vuzp8 reg1, reg2 410 vuzp.8 d®1, d®2 411.endm 412 413.macro vzip8 reg1, reg2 414 vzip.8 d®1, d®2 415.endm 416 417/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */ 418.macro pixdeinterleave bpp, basereg 419.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) 420 vuzp8 %(basereg+0), %(basereg+1) 421 vuzp8 %(basereg+2), %(basereg+3) 422 vuzp8 %(basereg+1), %(basereg+3) 423 vuzp8 %(basereg+0), %(basereg+2) 424.endif 425.endm 426 427/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */ 428.macro pixinterleave bpp, basereg 429.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) 430 vzip8 %(basereg+0), %(basereg+2) 431 vzip8 %(basereg+1), %(basereg+3) 432 vzip8 %(basereg+2), %(basereg+3) 433 vzip8 %(basereg+0), %(basereg+1) 434.endif 435.endm 436 437/* 438 * This is a macro for implementing cache preload. The main idea is that 439 * cache preload logic is mostly independent from the rest of pixels 440 * processing code. It starts at the top left pixel and moves forward 441 * across pixels and can jump across scanlines. Prefetch distance is 442 * handled in an 'incremental' way: it starts from 0 and advances to the 443 * optimal distance over time. After reaching optimal prefetch distance, 444 * it is kept constant. There are some checks which prevent prefetching 445 * unneeded pixel lines below the image (but it still can prefetch a bit 446 * more data on the right side of the image - not a big issue and may 447 * be actually helpful when rendering text glyphs). Additional trick is 448 * the use of LDR instruction for prefetch instead of PLD when moving to 449 * the next line, the point is that we have a high chance of getting TLB 450 * miss in this case, and PLD would be useless. 451 * 452 * This sounds like it may introduce a noticeable overhead (when working with 453 * fully cached data). But in reality, due to having a separate pipeline and 454 * instruction queue for NEON unit in ARM Cortex-A8, normal ARM code can 455 * execute simultaneously with NEON and be completely shadowed by it. Thus 456 * we get no performance overhead at all (*). This looks like a very nice 457 * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher, 458 * but still can implement some rather advanced prefetch logic in software 459 * for almost zero cost! 460 * 461 * (*) The overhead of the prefetcher is visible when running some trivial 462 * pixels processing like simple copy. Anyway, having prefetch is a must 463 * when working with the graphics data. 464 */ 465.macro PF a, x:vararg 466.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED) 467 a x 468.endif 469.endm 470 471.macro cache_preload std_increment, boost_increment 472.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0) 473.if regs_shortage 474 PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */ 475.endif 476.if std_increment != 0 477 PF add PF_X, PF_X, #std_increment 478.endif 479 PF tst PF_CTL, #0xF 480 PF addne PF_X, PF_X, #boost_increment 481 PF subne PF_CTL, PF_CTL, #1 482 PF cmp PF_X, ORIG_W 483.if src_bpp_shift >= 0 484 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] 485.endif 486.if dst_r_bpp != 0 487 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] 488.endif 489.if mask_bpp_shift >= 0 490 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] 491.endif 492 PF subge PF_X, PF_X, ORIG_W 493 PF subges PF_CTL, PF_CTL, #0x10 494.if src_bpp_shift >= 0 495 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 496.endif 497.if dst_r_bpp != 0 498 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 499.endif 500.if mask_bpp_shift >= 0 501 PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! 502.endif 503.endif 504.endm 505 506.macro cache_preload_simple 507.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE) 508.if src_bpp > 0 509 pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)] 510.endif 511.if dst_r_bpp > 0 512 pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)] 513.endif 514.if mask_bpp > 0 515 pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)] 516.endif 517.endif 518.endm 519 520.macro fetch_mask_pixblock 521 pixld pixblock_size, mask_bpp, \ 522 (mask_basereg - pixblock_size * mask_bpp / 64), MASK 523.endm 524 525/* 526 * Macro which is used to process leading pixels until destination 527 * pointer is properly aligned (at 16 bytes boundary). When destination 528 * buffer uses 16bpp format, this is unnecessary, or even pointless. 529 */ 530.macro ensure_destination_ptr_alignment process_pixblock_head, \ 531 process_pixblock_tail, \ 532 process_pixblock_tail_head 533.if dst_w_bpp != 24 534 tst DST_R, #0xF 535 beq 2f 536 537.irp lowbit, 1, 2, 4, 8, 16 538local skip1 539.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) 540.if lowbit < 16 /* we don't need more than 16-byte alignment */ 541 tst DST_R, #lowbit 542 beq 1f 543.endif 544 pixld_src (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC 545 pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK 546.if dst_r_bpp > 0 547 pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R 548.else 549 add DST_R, DST_R, #lowbit 550.endif 551 PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp) 552 sub W, W, #(lowbit * 8 / dst_w_bpp) 5531: 554.endif 555.endr 556 pixdeinterleave src_bpp, src_basereg 557 pixdeinterleave mask_bpp, mask_basereg 558 pixdeinterleave dst_r_bpp, dst_r_basereg 559 560 process_pixblock_head 561 cache_preload 0, pixblock_size 562 cache_preload_simple 563 process_pixblock_tail 564 565 pixinterleave dst_w_bpp, dst_w_basereg 566.irp lowbit, 1, 2, 4, 8, 16 567.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) 568.if lowbit < 16 /* we don't need more than 16-byte alignment */ 569 tst DST_W, #lowbit 570 beq 1f 571.endif 572 pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W 5731: 574.endif 575.endr 576.endif 5772: 578.endm 579 580/* 581 * Special code for processing up to (pixblock_size - 1) remaining 582 * trailing pixels. As SIMD processing performs operation on 583 * pixblock_size pixels, anything smaller than this has to be loaded 584 * and stored in a special way. Loading and storing of pixel data is 585 * performed in such a way that we fill some 'slots' in the NEON 586 * registers (some slots naturally are unused), then perform compositing 587 * operation as usual. In the end, the data is taken from these 'slots' 588 * and saved to memory. 589 * 590 * cache_preload_flag - allows to suppress prefetch if 591 * set to 0 592 * dst_aligned_flag - selects whether destination buffer 593 * is aligned 594 */ 595.macro process_trailing_pixels cache_preload_flag, \ 596 dst_aligned_flag, \ 597 process_pixblock_head, \ 598 process_pixblock_tail, \ 599 process_pixblock_tail_head 600 tst W, #(pixblock_size - 1) 601 beq 2f 602.irp chunk_size, 16, 8, 4, 2, 1 603.if pixblock_size > chunk_size 604 tst W, #chunk_size 605 beq 1f 606 pixld_src chunk_size, src_bpp, src_basereg, SRC 607 pixld chunk_size, mask_bpp, mask_basereg, MASK 608.if dst_aligned_flag != 0 609 pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R 610.else 611 pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R 612.endif 613.if cache_preload_flag != 0 614 PF add PF_X, PF_X, #chunk_size 615.endif 6161: 617.endif 618.endr 619 pixdeinterleave src_bpp, src_basereg 620 pixdeinterleave mask_bpp, mask_basereg 621 pixdeinterleave dst_r_bpp, dst_r_basereg 622 623 process_pixblock_head 624.if cache_preload_flag != 0 625 cache_preload 0, pixblock_size 626 cache_preload_simple 627.endif 628 process_pixblock_tail 629 pixinterleave dst_w_bpp, dst_w_basereg 630.irp chunk_size, 16, 8, 4, 2, 1 631.if pixblock_size > chunk_size 632 tst W, #chunk_size 633 beq 1f 634.if dst_aligned_flag != 0 635 pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W 636.else 637 pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W 638.endif 6391: 640.endif 641.endr 6422: 643.endm 644 645/* 646 * Macro, which performs all the needed operations to switch to the next 647 * scanline and start the next loop iteration unless all the scanlines 648 * are already processed. 649 */ 650.macro advance_to_next_scanline start_of_loop_label 651.if regs_shortage 652 ldrd W, [sp] /* load W and H (width and height) from stack */ 653.else 654 mov W, ORIG_W 655.endif 656 add DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift 657.if src_bpp != 0 658 add SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift 659.endif 660.if mask_bpp != 0 661 add MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift 662.endif 663.if (dst_w_bpp != 24) 664 sub DST_W, DST_W, W, lsl #dst_bpp_shift 665.endif 666.if (src_bpp != 24) && (src_bpp != 0) 667 sub SRC, SRC, W, lsl #src_bpp_shift 668.endif 669.if (mask_bpp != 24) && (mask_bpp != 0) 670 sub MASK, MASK, W, lsl #mask_bpp_shift 671.endif 672 subs H, H, #1 673 mov DST_R, DST_W 674.if regs_shortage 675 str H, [sp, #4] /* save updated height to stack */ 676.endif 677 bge start_of_loop_label 678.endm 679 680/* 681 * Registers are allocated in the following way by default: 682 * d0, d1, d2, d3 - reserved for loading source pixel data 683 * d4, d5, d6, d7 - reserved for loading destination pixel data 684 * d24, d25, d26, d27 - reserved for loading mask pixel data 685 * d28, d29, d30, d31 - final destination pixel data for writeback to memory 686 */ 687.macro generate_composite_function fname, \ 688 src_bpp_, \ 689 mask_bpp_, \ 690 dst_w_bpp_, \ 691 flags, \ 692 pixblock_size_, \ 693 prefetch_distance, \ 694 init, \ 695 cleanup, \ 696 process_pixblock_head, \ 697 process_pixblock_tail, \ 698 process_pixblock_tail_head, \ 699 dst_w_basereg_ = 28, \ 700 dst_r_basereg_ = 4, \ 701 src_basereg_ = 0, \ 702 mask_basereg_ = 24 703 704 pixman_asm_function fname 705 706 push {r4-r12, lr} /* save all registers */ 707 708/* 709 * Select prefetch type for this function. If prefetch distance is 710 * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch 711 * has to be used instead of ADVANCED. 712 */ 713 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT 714.if prefetch_distance == 0 715 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE 716.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \ 717 ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24)) 718 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE 719.endif 720 721/* 722 * Make some macro arguments globally visible and accessible 723 * from other macros 724 */ 725 .set src_bpp, src_bpp_ 726 .set mask_bpp, mask_bpp_ 727 .set dst_w_bpp, dst_w_bpp_ 728 .set pixblock_size, pixblock_size_ 729 .set dst_w_basereg, dst_w_basereg_ 730 .set dst_r_basereg, dst_r_basereg_ 731 .set src_basereg, src_basereg_ 732 .set mask_basereg, mask_basereg_ 733 734 .macro pixld_src x:vararg 735 pixld x 736 .endm 737 .macro fetch_src_pixblock 738 pixld_src pixblock_size, src_bpp, \ 739 (src_basereg - pixblock_size * src_bpp / 64), SRC 740 .endm 741/* 742 * Assign symbolic names to registers 743 */ 744 W .req r0 /* width (is updated during processing) */ 745 H .req r1 /* height (is updated during processing) */ 746 DST_W .req r2 /* destination buffer pointer for writes */ 747 DST_STRIDE .req r3 /* destination image stride */ 748 SRC .req r4 /* source buffer pointer */ 749 SRC_STRIDE .req r5 /* source image stride */ 750 DST_R .req r6 /* destination buffer pointer for reads */ 751 752 MASK .req r7 /* mask pointer */ 753 MASK_STRIDE .req r8 /* mask stride */ 754 755 PF_CTL .req r9 /* combined lines counter and prefetch */ 756 /* distance increment counter */ 757 PF_X .req r10 /* pixel index in a scanline for current */ 758 /* pretetch position */ 759 PF_SRC .req r11 /* pointer to source scanline start */ 760 /* for prefetch purposes */ 761 PF_DST .req r12 /* pointer to destination scanline start */ 762 /* for prefetch purposes */ 763 PF_MASK .req r14 /* pointer to mask scanline start */ 764 /* for prefetch purposes */ 765/* 766 * Check whether we have enough registers for all the local variables. 767 * If we don't have enough registers, original width and height are 768 * kept on top of stack (and 'regs_shortage' variable is set to indicate 769 * this for the rest of code). Even if there are enough registers, the 770 * allocation scheme may be a bit different depending on whether source 771 * or mask is not used. 772 */ 773.if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED) 774 ORIG_W .req r10 /* saved original width */ 775 DUMMY .req r12 /* temporary register */ 776 .set regs_shortage, 0 777.elseif mask_bpp == 0 778 ORIG_W .req r7 /* saved original width */ 779 DUMMY .req r8 /* temporary register */ 780 .set regs_shortage, 0 781.elseif src_bpp == 0 782 ORIG_W .req r4 /* saved original width */ 783 DUMMY .req r5 /* temporary register */ 784 .set regs_shortage, 0 785.else 786 ORIG_W .req r1 /* saved original width */ 787 DUMMY .req r1 /* temporary register */ 788 .set regs_shortage, 1 789.endif 790 791 .set mask_bpp_shift, -1 792.if src_bpp == 32 793 .set src_bpp_shift, 2 794.elseif src_bpp == 24 795 .set src_bpp_shift, 0 796.elseif src_bpp == 16 797 .set src_bpp_shift, 1 798.elseif src_bpp == 8 799 .set src_bpp_shift, 0 800.elseif src_bpp == 0 801 .set src_bpp_shift, -1 802.else 803 .error "requested src bpp (src_bpp) is not supported" 804.endif 805.if mask_bpp == 32 806 .set mask_bpp_shift, 2 807.elseif mask_bpp == 24 808 .set mask_bpp_shift, 0 809.elseif mask_bpp == 8 810 .set mask_bpp_shift, 0 811.elseif mask_bpp == 0 812 .set mask_bpp_shift, -1 813.else 814 .error "requested mask bpp (mask_bpp) is not supported" 815.endif 816.if dst_w_bpp == 32 817 .set dst_bpp_shift, 2 818.elseif dst_w_bpp == 24 819 .set dst_bpp_shift, 0 820.elseif dst_w_bpp == 16 821 .set dst_bpp_shift, 1 822.elseif dst_w_bpp == 8 823 .set dst_bpp_shift, 0 824.else 825 .error "requested dst bpp (dst_w_bpp) is not supported" 826.endif 827 828.if (((flags) & FLAG_DST_READWRITE) != 0) 829 .set dst_r_bpp, dst_w_bpp 830.else 831 .set dst_r_bpp, 0 832.endif 833.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0) 834 .set DEINTERLEAVE_32BPP_ENABLED, 1 835.else 836 .set DEINTERLEAVE_32BPP_ENABLED, 0 837.endif 838 839.if prefetch_distance < 0 || prefetch_distance > 15 840 .error "invalid prefetch distance (prefetch_distance)" 841.endif 842 843.if src_bpp > 0 844 ldr SRC, [sp, #40] 845.endif 846.if mask_bpp > 0 847 ldr MASK, [sp, #48] 848.endif 849 PF mov PF_X, #0 850.if src_bpp > 0 851 ldr SRC_STRIDE, [sp, #44] 852.endif 853.if mask_bpp > 0 854 ldr MASK_STRIDE, [sp, #52] 855.endif 856 mov DST_R, DST_W 857 858.if src_bpp == 24 859 sub SRC_STRIDE, SRC_STRIDE, W 860 sub SRC_STRIDE, SRC_STRIDE, W, lsl #1 861.endif 862.if mask_bpp == 24 863 sub MASK_STRIDE, MASK_STRIDE, W 864 sub MASK_STRIDE, MASK_STRIDE, W, lsl #1 865.endif 866.if dst_w_bpp == 24 867 sub DST_STRIDE, DST_STRIDE, W 868 sub DST_STRIDE, DST_STRIDE, W, lsl #1 869.endif 870 871/* 872 * Setup advanced prefetcher initial state 873 */ 874 PF mov PF_SRC, SRC 875 PF mov PF_DST, DST_R 876 PF mov PF_MASK, MASK 877 /* PF_CTL = prefetch_distance | ((h - 1) << 4) */ 878 PF mov PF_CTL, H, lsl #4 879 PF add PF_CTL, #(prefetch_distance - 0x10) 880 881 init 882.if regs_shortage 883 push {r0, r1} 884.endif 885 subs H, H, #1 886.if regs_shortage 887 str H, [sp, #4] /* save updated height to stack */ 888.else 889 mov ORIG_W, W 890.endif 891 blt 9f 892 cmp W, #(pixblock_size * 2) 893 blt 8f 894/* 895 * This is the start of the pipelined loop, which if optimized for 896 * long scanlines 897 */ 8980: 899 ensure_destination_ptr_alignment process_pixblock_head, \ 900 process_pixblock_tail, \ 901 process_pixblock_tail_head 902 903 /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */ 904 pixld_a pixblock_size, dst_r_bpp, \ 905 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R 906 fetch_src_pixblock 907 pixld pixblock_size, mask_bpp, \ 908 (mask_basereg - pixblock_size * mask_bpp / 64), MASK 909 PF add PF_X, PF_X, #pixblock_size 910 process_pixblock_head 911 cache_preload 0, pixblock_size 912 cache_preload_simple 913 subs W, W, #(pixblock_size * 2) 914 blt 2f 9151: 916 process_pixblock_tail_head 917 cache_preload_simple 918 subs W, W, #pixblock_size 919 bge 1b 9202: 921 process_pixblock_tail 922 pixst_a pixblock_size, dst_w_bpp, \ 923 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W 924 925 /* Process the remaining trailing pixels in the scanline */ 926 process_trailing_pixels 1, 1, \ 927 process_pixblock_head, \ 928 process_pixblock_tail, \ 929 process_pixblock_tail_head 930 advance_to_next_scanline 0b 931 932.if regs_shortage 933 pop {r0, r1} 934.endif 935 cleanup 936 pop {r4-r12, pc} /* exit */ 937/* 938 * This is the start of the loop, designed to process images with small width 939 * (less than pixblock_size * 2 pixels). In this case neither pipelining 940 * nor prefetch are used. 941 */ 9428: 943 /* Process exactly pixblock_size pixels if needed */ 944 tst W, #pixblock_size 945 beq 1f 946 pixld pixblock_size, dst_r_bpp, \ 947 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R 948 fetch_src_pixblock 949 pixld pixblock_size, mask_bpp, \ 950 (mask_basereg - pixblock_size * mask_bpp / 64), MASK 951 process_pixblock_head 952 process_pixblock_tail 953 pixst pixblock_size, dst_w_bpp, \ 954 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W 9551: 956 /* Process the remaining trailing pixels in the scanline */ 957 process_trailing_pixels 0, 0, \ 958 process_pixblock_head, \ 959 process_pixblock_tail, \ 960 process_pixblock_tail_head 961 advance_to_next_scanline 8b 9629: 963.if regs_shortage 964 pop {r0, r1} 965.endif 966 cleanup 967 pop {r4-r12, pc} /* exit */ 968 969 .purgem fetch_src_pixblock 970 .purgem pixld_src 971 972 .unreq SRC 973 .unreq MASK 974 .unreq DST_R 975 .unreq DST_W 976 .unreq ORIG_W 977 .unreq W 978 .unreq H 979 .unreq SRC_STRIDE 980 .unreq DST_STRIDE 981 .unreq MASK_STRIDE 982 .unreq PF_CTL 983 .unreq PF_X 984 .unreq PF_SRC 985 .unreq PF_DST 986 .unreq PF_MASK 987 .unreq DUMMY 988 .endfunc 989.endm 990 991/* 992 * A simplified variant of function generation template for a single 993 * scanline processing (for implementing pixman combine functions) 994 */ 995.macro generate_composite_function_scanline use_nearest_scaling, \ 996 fname, \ 997 src_bpp_, \ 998 mask_bpp_, \ 999 dst_w_bpp_, \ 1000 flags, \ 1001 pixblock_size_, \ 1002 init, \ 1003 cleanup, \ 1004 process_pixblock_head, \ 1005 process_pixblock_tail, \ 1006 process_pixblock_tail_head, \ 1007 dst_w_basereg_ = 28, \ 1008 dst_r_basereg_ = 4, \ 1009 src_basereg_ = 0, \ 1010 mask_basereg_ = 24 1011 1012 pixman_asm_function fname 1013 1014 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE 1015/* 1016 * Make some macro arguments globally visible and accessible 1017 * from other macros 1018 */ 1019 .set src_bpp, src_bpp_ 1020 .set mask_bpp, mask_bpp_ 1021 .set dst_w_bpp, dst_w_bpp_ 1022 .set pixblock_size, pixblock_size_ 1023 .set dst_w_basereg, dst_w_basereg_ 1024 .set dst_r_basereg, dst_r_basereg_ 1025 .set src_basereg, src_basereg_ 1026 .set mask_basereg, mask_basereg_ 1027 1028.if use_nearest_scaling != 0 1029 /* 1030 * Assign symbolic names to registers for nearest scaling 1031 */ 1032 W .req r0 1033 DST_W .req r1 1034 SRC .req r2 1035 VX .req r3 1036 UNIT_X .req ip 1037 MASK .req lr 1038 TMP1 .req r4 1039 TMP2 .req r5 1040 DST_R .req r6 1041 SRC_WIDTH_FIXED .req r7 1042 1043 .macro pixld_src x:vararg 1044 pixld_s x 1045 .endm 1046 1047 ldr UNIT_X, [sp] 1048 push {r4-r8, lr} 1049 ldr SRC_WIDTH_FIXED, [sp, #(24 + 4)] 1050 .if mask_bpp != 0 1051 ldr MASK, [sp, #(24 + 8)] 1052 .endif 1053.else 1054 /* 1055 * Assign symbolic names to registers 1056 */ 1057 W .req r0 /* width (is updated during processing) */ 1058 DST_W .req r1 /* destination buffer pointer for writes */ 1059 SRC .req r2 /* source buffer pointer */ 1060 DST_R .req ip /* destination buffer pointer for reads */ 1061 MASK .req r3 /* mask pointer */ 1062 1063 .macro pixld_src x:vararg 1064 pixld x 1065 .endm 1066.endif 1067 1068.if (((flags) & FLAG_DST_READWRITE) != 0) 1069 .set dst_r_bpp, dst_w_bpp 1070.else 1071 .set dst_r_bpp, 0 1072.endif 1073.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0) 1074 .set DEINTERLEAVE_32BPP_ENABLED, 1 1075.else 1076 .set DEINTERLEAVE_32BPP_ENABLED, 0 1077.endif 1078 1079 .macro fetch_src_pixblock 1080 pixld_src pixblock_size, src_bpp, \ 1081 (src_basereg - pixblock_size * src_bpp / 64), SRC 1082 .endm 1083 1084 init 1085 mov DST_R, DST_W 1086 1087 cmp W, #pixblock_size 1088 blt 8f 1089 1090 ensure_destination_ptr_alignment process_pixblock_head, \ 1091 process_pixblock_tail, \ 1092 process_pixblock_tail_head 1093 1094 subs W, W, #pixblock_size 1095 blt 7f 1096 1097 /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */ 1098 pixld_a pixblock_size, dst_r_bpp, \ 1099 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R 1100 fetch_src_pixblock 1101 pixld pixblock_size, mask_bpp, \ 1102 (mask_basereg - pixblock_size * mask_bpp / 64), MASK 1103 process_pixblock_head 1104 subs W, W, #pixblock_size 1105 blt 2f 11061: 1107 process_pixblock_tail_head 1108 subs W, W, #pixblock_size 1109 bge 1b 11102: 1111 process_pixblock_tail 1112 pixst_a pixblock_size, dst_w_bpp, \ 1113 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W 11147: 1115 /* Process the remaining trailing pixels in the scanline (dst aligned) */ 1116 process_trailing_pixels 0, 1, \ 1117 process_pixblock_head, \ 1118 process_pixblock_tail, \ 1119 process_pixblock_tail_head 1120 1121 cleanup 1122.if use_nearest_scaling != 0 1123 pop {r4-r8, pc} /* exit */ 1124.else 1125 bx lr /* exit */ 1126.endif 11278: 1128 /* Process the remaining trailing pixels in the scanline (dst unaligned) */ 1129 process_trailing_pixels 0, 0, \ 1130 process_pixblock_head, \ 1131 process_pixblock_tail, \ 1132 process_pixblock_tail_head 1133 1134 cleanup 1135 1136.if use_nearest_scaling != 0 1137 pop {r4-r8, pc} /* exit */ 1138 1139 .unreq DST_R 1140 .unreq SRC 1141 .unreq W 1142 .unreq VX 1143 .unreq UNIT_X 1144 .unreq TMP1 1145 .unreq TMP2 1146 .unreq DST_W 1147 .unreq MASK 1148 .unreq SRC_WIDTH_FIXED 1149 1150.else 1151 bx lr /* exit */ 1152 1153 .unreq SRC 1154 .unreq MASK 1155 .unreq DST_R 1156 .unreq DST_W 1157 .unreq W 1158.endif 1159 1160 .purgem fetch_src_pixblock 1161 .purgem pixld_src 1162 1163 .endfunc 1164.endm 1165 1166.macro generate_composite_function_single_scanline x:vararg 1167 generate_composite_function_scanline 0, x 1168.endm 1169 1170.macro generate_composite_function_nearest_scanline x:vararg 1171 generate_composite_function_scanline 1, x 1172.endm 1173 1174/* Default prologue/epilogue, nothing special needs to be done */ 1175 1176.macro default_init 1177.endm 1178 1179.macro default_cleanup 1180.endm 1181 1182/* 1183 * Prologue/epilogue variant which additionally saves/restores d8-d15 1184 * registers (they need to be saved/restored by callee according to ABI). 1185 * This is required if the code needs to use all the NEON registers. 1186 */ 1187 1188.macro default_init_need_all_regs 1189 vpush {d8-d15} 1190.endm 1191 1192.macro default_cleanup_need_all_regs 1193 vpop {d8-d15} 1194.endm 1195 1196/******************************************************************************/ 1197 1198/* 1199 * Conversion of 8 r5g6b6 pixels packed in 128-bit register (in) 1200 * into a planar a8r8g8b8 format (with a, r, g, b color components 1201 * stored into 64-bit registers out_a, out_r, out_g, out_b respectively). 1202 * 1203 * Warning: the conversion is destructive and the original 1204 * value (in) is lost. 1205 */ 1206.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b 1207 vshrn.u16 out_r, in, #8 1208 vshrn.u16 out_g, in, #3 1209 vsli.u16 in, in, #5 1210 vmov.u8 out_a, #255 1211 vsri.u8 out_r, out_r, #5 1212 vsri.u8 out_g, out_g, #6 1213 vshrn.u16 out_b, in, #2 1214.endm 1215 1216.macro convert_0565_to_x888 in, out_r, out_g, out_b 1217 vshrn.u16 out_r, in, #8 1218 vshrn.u16 out_g, in, #3 1219 vsli.u16 in, in, #5 1220 vsri.u8 out_r, out_r, #5 1221 vsri.u8 out_g, out_g, #6 1222 vshrn.u16 out_b, in, #2 1223.endm 1224 1225/* 1226 * Conversion from planar a8r8g8b8 format (with a, r, g, b color components 1227 * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b6 1228 * pixels packed in 128-bit register (out). Requires two temporary 128-bit 1229 * registers (tmp1, tmp2) 1230 */ 1231.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2 1232 vshll.u8 tmp1, in_g, #8 1233 vshll.u8 out, in_r, #8 1234 vshll.u8 tmp2, in_b, #8 1235 vsri.u16 out, tmp1, #5 1236 vsri.u16 out, tmp2, #11 1237.endm 1238 1239/* 1240 * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels 1241 * returned in (out0, out1) registers pair. Requires one temporary 1242 * 64-bit register (tmp). 'out1' and 'in' may overlap, the original 1243 * value from 'in' is lost 1244 */ 1245.macro convert_four_0565_to_x888_packed in, out0, out1, tmp 1246 vshl.u16 out0, in, #5 /* G top 6 bits */ 1247 vshl.u16 tmp, in, #11 /* B top 5 bits */ 1248 vsri.u16 in, in, #5 /* R is ready in top bits */ 1249 vsri.u16 out0, out0, #6 /* G is ready in top bits */ 1250 vsri.u16 tmp, tmp, #5 /* B is ready in top bits */ 1251 vshr.u16 out1, in, #8 /* R is in place */ 1252 vsri.u16 out0, tmp, #8 /* G & B is in place */ 1253 vzip.u16 out0, out1 /* everything is in place */ 1254.endm 1255 1256/* Global configuration options and preferences */ 1257 1258/* 1259 * The code can optionally make use of unaligned memory accesses to improve 1260 * performance of handling leading/trailing pixels for each scanline. 1261 * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for 1262 * example in linux if unaligned memory accesses are not configured to 1263 * generate.exceptions. 1264 */ 1265.set RESPECT_STRICT_ALIGNMENT, 1 1266 1267/* 1268 * Set default prefetch type. There is a choice between the following options: 1269 * 1270 * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work 1271 * as NOP to workaround some HW bugs or for whatever other reason) 1272 * 1273 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where 1274 * advanced prefetch intruduces heavy overhead) 1275 * 1276 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8 1277 * which can run ARM and NEON instructions simultaneously so that extra ARM 1278 * instructions do not add (many) extra cycles, but improve prefetch efficiency) 1279 * 1280 * Note: some types of function can't support advanced prefetch and fallback 1281 * to simple one (those which handle 24bpp pixels) 1282 */ 1283.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED 1284 1285/* Prefetch distance in pixels for simple prefetch */ 1286.set PREFETCH_DISTANCE_SIMPLE, 64 1287 1288/* 1289 * Implementation of pixman_composite_over_8888_0565_asm_neon 1290 * 1291 * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and 1292 * performs OVER compositing operation. Function fast_composite_over_8888_0565 1293 * from pixman-fast-path.c does the same in C and can be used as a reference. 1294 * 1295 * First we need to have some NEON assembly code which can do the actual 1296 * operation on the pixels and provide it to the template macro. 1297 * 1298 * Template macro quite conveniently takes care of emitting all the necessary 1299 * code for memory reading and writing (including quite tricky cases of 1300 * handling unaligned leading/trailing pixels), so we only need to deal with 1301 * the data in NEON registers. 1302 * 1303 * NEON registers allocation in general is recommented to be the following: 1304 * d0, d1, d2, d3 - contain loaded source pixel data 1305 * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed) 1306 * d24, d25, d26, d27 - contain loading mask pixel data (if mask is used) 1307 * d28, d29, d30, d31 - place for storing the result (destination pixels) 1308 * 1309 * As can be seen above, four 64-bit NEON registers are used for keeping 1310 * intermediate pixel data and up to 8 pixels can be processed in one step 1311 * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp). 1312 * 1313 * This particular function uses the following registers allocation: 1314 * d0, d1, d2, d3 - contain loaded source pixel data 1315 * d4, d5 - contain loaded destination pixels (they are needed) 1316 * d28, d29 - place for storing the result (destination pixels) 1317 */ 1318 1319/* 1320 * Step one. We need to have some code to do some arithmetics on pixel data. 1321 * This is implemented as a pair of macros: '*_head' and '*_tail'. When used 1322 * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5}, 1323 * perform all the needed calculations and write the result to {d28, d29}. 1324 * The rationale for having two macros and not just one will be explained 1325 * later. In practice, any single monolitic function which does the work can 1326 * be split into two parts in any arbitrary way without affecting correctness. 1327 * 1328 * There is one special trick here too. Common template macro can optionally 1329 * make our life a bit easier by doing R, G, B, A color components 1330 * deinterleaving for 32bpp pixel formats (and this feature is used in 1331 * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that 1332 * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we 1333 * actually use d0 register for blue channel (a vector of eight 8-bit 1334 * values), d1 register for green, d2 for red and d3 for alpha. This 1335 * simple conversion can be also done with a few NEON instructions: 1336 * 1337 * Packed to planar conversion: 1338 * vuzp.8 d0, d1 1339 * vuzp.8 d2, d3 1340 * vuzp.8 d1, d3 1341 * vuzp.8 d0, d2 1342 * 1343 * Planar to packed conversion: 1344 * vzip.8 d0, d2 1345 * vzip.8 d1, d3 1346 * vzip.8 d2, d3 1347 * vzip.8 d0, d1 1348 * 1349 * But pixel can be loaded directly in planar format using VLD4.8 NEON 1350 * instruction. It is 1 cycle slower than VLD1.32, so this is not always 1351 * desirable, that's why deinterleaving is optional. 1352 * 1353 * But anyway, here is the code: 1354 */ 1355.macro pixman_composite_over_8888_0565_process_pixblock_head 1356 /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format 1357 and put data into d6 - red, d7 - green, d30 - blue */ 1358 vshrn.u16 d6, q2, #8 1359 vshrn.u16 d7, q2, #3 1360 vsli.u16 q2, q2, #5 1361 vsri.u8 d6, d6, #5 1362 vmvn.8 d3, d3 /* invert source alpha */ 1363 vsri.u8 d7, d7, #6 1364 vshrn.u16 d30, q2, #2 1365 /* now do alpha blending, storing results in 8-bit planar format 1366 into d16 - red, d19 - green, d18 - blue */ 1367 vmull.u8 q10, d3, d6 1368 vmull.u8 q11, d3, d7 1369 vmull.u8 q12, d3, d30 1370 vrshr.u16 q13, q10, #8 1371 vrshr.u16 q3, q11, #8 1372 vrshr.u16 q15, q12, #8 1373 vraddhn.u16 d20, q10, q13 1374 vraddhn.u16 d23, q11, q3 1375 vraddhn.u16 d22, q12, q15 1376.endm 1377 1378.macro pixman_composite_over_8888_0565_process_pixblock_tail 1379 /* ... continue alpha blending */ 1380 vqadd.u8 d16, d2, d20 1381 vqadd.u8 q9, q0, q11 1382 /* convert the result to r5g6b5 and store it into {d28, d29} */ 1383 vshll.u8 q14, d16, #8 1384 vshll.u8 q8, d19, #8 1385 vshll.u8 q9, d18, #8 1386 vsri.u16 q14, q8, #5 1387 vsri.u16 q14, q9, #11 1388.endm 1389 1390/* 1391 * OK, now we got almost everything that we need. Using the above two 1392 * macros, the work can be done right. But now we want to optimize 1393 * it a bit. ARM Cortex-A8 is an in-order core, and benefits really 1394 * a lot from good code scheduling and software pipelining. 1395 * 1396 * Let's construct some code, which will run in the core main loop. 1397 * Some pseudo-code of the main loop will look like this: 1398 * head 1399 * while (...) { 1400 * tail 1401 * head 1402 * } 1403 * tail 1404 * 1405 * It may look a bit weird, but this setup allows to hide instruction 1406 * latencies better and also utilize dual-issue capability more 1407 * efficiently (make pairs of load-store and ALU instructions). 1408 * 1409 * So what we need now is a '*_tail_head' macro, which will be used 1410 * in the core main loop. A trivial straightforward implementation 1411 * of this macro would look like this: 1412 * 1413 * pixman_composite_over_8888_0565_process_pixblock_tail 1414 * vst1.16 {d28, d29}, [DST_W, :128]! 1415 * vld1.16 {d4, d5}, [DST_R, :128]! 1416 * vld4.32 {d0, d1, d2, d3}, [SRC]! 1417 * pixman_composite_over_8888_0565_process_pixblock_head 1418 * cache_preload 8, 8 1419 * 1420 * Now it also got some VLD/VST instructions. We simply can't move from 1421 * processing one block of pixels to the other one with just arithmetics. 1422 * The previously processed data needs to be written to memory and new 1423 * data needs to be fetched. Fortunately, this main loop does not deal 1424 * with partial leading/trailing pixels and can load/store a full block 1425 * of pixels in a bulk. Additionally, destination buffer is already 1426 * 16 bytes aligned here (which is good for performance). 1427 * 1428 * New things here are DST_R, DST_W, SRC and MASK identifiers. These 1429 * are the aliases for ARM registers which are used as pointers for 1430 * accessing data. We maintain separate pointers for reading and writing 1431 * destination buffer (DST_R and DST_W). 1432 * 1433 * Another new thing is 'cache_preload' macro. It is used for prefetching 1434 * data into CPU L2 cache and improve performance when dealing with large 1435 * images which are far larger than cache size. It uses one argument 1436 * (actually two, but they need to be the same here) - number of pixels 1437 * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some 1438 * details about this macro. Moreover, if good performance is needed 1439 * the code from this macro needs to be copied into '*_tail_head' macro 1440 * and mixed with the rest of code for optimal instructions scheduling. 1441 * We are actually doing it below. 1442 * 1443 * Now after all the explanations, here is the optimized code. 1444 * Different instruction streams (originaling from '*_head', '*_tail' 1445 * and 'cache_preload' macro) use different indentation levels for 1446 * better readability. Actually taking the code from one of these 1447 * indentation levels and ignoring a few VLD/VST instructions would 1448 * result in exactly the code from '*_head', '*_tail' or 'cache_preload' 1449 * macro! 1450 */ 1451 1452#if 1 1453 1454.macro pixman_composite_over_8888_0565_process_pixblock_tail_head 1455 vqadd.u8 d16, d2, d20 1456 vld1.16 {d4, d5}, [DST_R, :128]! 1457 vqadd.u8 q9, q0, q11 1458 vshrn.u16 d6, q2, #8 1459 fetch_src_pixblock 1460 vshrn.u16 d7, q2, #3 1461 vsli.u16 q2, q2, #5 1462 vshll.u8 q14, d16, #8 1463 PF add PF_X, PF_X, #8 1464 vshll.u8 q8, d19, #8 1465 PF tst PF_CTL, #0xF 1466 vsri.u8 d6, d6, #5 1467 PF addne PF_X, PF_X, #8 1468 vmvn.8 d3, d3 1469 PF subne PF_CTL, PF_CTL, #1 1470 vsri.u8 d7, d7, #6 1471 vshrn.u16 d30, q2, #2 1472 vmull.u8 q10, d3, d6 1473 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] 1474 vmull.u8 q11, d3, d7 1475 vmull.u8 q12, d3, d30 1476 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] 1477 vsri.u16 q14, q8, #5 1478 PF cmp PF_X, ORIG_W 1479 vshll.u8 q9, d18, #8 1480 vrshr.u16 q13, q10, #8 1481 PF subge PF_X, PF_X, ORIG_W 1482 vrshr.u16 q3, q11, #8 1483 vrshr.u16 q15, q12, #8 1484 PF subges PF_CTL, PF_CTL, #0x10 1485 vsri.u16 q14, q9, #11 1486 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 1487 vraddhn.u16 d20, q10, q13 1488 vraddhn.u16 d23, q11, q3 1489 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 1490 vraddhn.u16 d22, q12, q15 1491 vst1.16 {d28, d29}, [DST_W, :128]! 1492.endm 1493 1494#else 1495 1496/* If we did not care much about the performance, we would just use this... */ 1497.macro pixman_composite_over_8888_0565_process_pixblock_tail_head 1498 pixman_composite_over_8888_0565_process_pixblock_tail 1499 vst1.16 {d28, d29}, [DST_W, :128]! 1500 vld1.16 {d4, d5}, [DST_R, :128]! 1501 fetch_src_pixblock 1502 pixman_composite_over_8888_0565_process_pixblock_head 1503 cache_preload 8, 8 1504.endm 1505 1506#endif 1507 1508/* 1509 * And now the final part. We are using 'generate_composite_function' macro 1510 * to put all the stuff together. We are specifying the name of the function 1511 * which we want to get, number of bits per pixel for the source, mask and 1512 * destination (0 if unused, like mask in this case). Next come some bit 1513 * flags: 1514 * FLAG_DST_READWRITE - tells that the destination buffer is both read 1515 * and written, for write-only buffer we would use 1516 * FLAG_DST_WRITEONLY flag instead 1517 * FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data 1518 * and separate color channels for 32bpp format. 1519 * The next things are: 1520 * - the number of pixels processed per iteration (8 in this case, because 1521 * that's the maximum what can fit into four 64-bit NEON registers). 1522 * - prefetch distance, measured in pixel blocks. In this case it is 5 times 1523 * by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal 1524 * prefetch distance can be selected by running some benchmarks. 1525 * 1526 * After that we specify some macros, these are 'default_init', 1527 * 'default_cleanup' here which are empty (but it is possible to have custom 1528 * init/cleanup macros to be able to save/restore some extra NEON registers 1529 * like d8-d15 or do anything else) followed by 1530 * 'pixman_composite_over_8888_0565_process_pixblock_head', 1531 * 'pixman_composite_over_8888_0565_process_pixblock_tail' and 1532 * 'pixman_composite_over_8888_0565_process_pixblock_tail_head' 1533 * which we got implemented above. 1534 * 1535 * The last part is the NEON registers allocation scheme. 1536 */ 1537generate_composite_function \ 1538 pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \ 1539 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 1540 8, /* number of pixels, processed in a single block */ \ 1541 5, /* prefetch distance */ \ 1542 default_init, \ 1543 default_cleanup, \ 1544 pixman_composite_over_8888_0565_process_pixblock_head, \ 1545 pixman_composite_over_8888_0565_process_pixblock_tail, \ 1546 pixman_composite_over_8888_0565_process_pixblock_tail_head, \ 1547 28, /* dst_w_basereg */ \ 1548 4, /* dst_r_basereg */ \ 1549 0, /* src_basereg */ \ 1550 24 /* mask_basereg */ 1551 1552/******************************************************************************/ 1553 1554.macro pixman_composite_over_n_0565_process_pixblock_head 1555 /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format 1556 and put data into d6 - red, d7 - green, d30 - blue */ 1557 vshrn.u16 d6, q2, #8 1558 vshrn.u16 d7, q2, #3 1559 vsli.u16 q2, q2, #5 1560 vsri.u8 d6, d6, #5 1561 vsri.u8 d7, d7, #6 1562 vshrn.u16 d30, q2, #2 1563 /* now do alpha blending, storing results in 8-bit planar format 1564 into d16 - red, d19 - green, d18 - blue */ 1565 vmull.u8 q10, d3, d6 1566 vmull.u8 q11, d3, d7 1567 vmull.u8 q12, d3, d30 1568 vrshr.u16 q13, q10, #8 1569 vrshr.u16 q3, q11, #8 1570 vrshr.u16 q15, q12, #8 1571 vraddhn.u16 d20, q10, q13 1572 vraddhn.u16 d23, q11, q3 1573 vraddhn.u16 d22, q12, q15 1574.endm 1575 1576.macro pixman_composite_over_n_0565_process_pixblock_tail 1577 /* ... continue alpha blending */ 1578 vqadd.u8 d16, d2, d20 1579 vqadd.u8 q9, q0, q11 1580 /* convert the result to r5g6b5 and store it into {d28, d29} */ 1581 vshll.u8 q14, d16, #8 1582 vshll.u8 q8, d19, #8 1583 vshll.u8 q9, d18, #8 1584 vsri.u16 q14, q8, #5 1585 vsri.u16 q14, q9, #11 1586.endm 1587 1588/* TODO: expand macros and do better instructions scheduling */ 1589.macro pixman_composite_over_n_0565_process_pixblock_tail_head 1590 pixman_composite_over_n_0565_process_pixblock_tail 1591 vld1.16 {d4, d5}, [DST_R, :128]! 1592 vst1.16 {d28, d29}, [DST_W, :128]! 1593 pixman_composite_over_n_0565_process_pixblock_head 1594 cache_preload 8, 8 1595.endm 1596 1597.macro pixman_composite_over_n_0565_init 1598 add DUMMY, sp, #ARGS_STACK_OFFSET 1599 vld1.32 {d3[0]}, [DUMMY] 1600 vdup.8 d0, d3[0] 1601 vdup.8 d1, d3[1] 1602 vdup.8 d2, d3[2] 1603 vdup.8 d3, d3[3] 1604 vmvn.8 d3, d3 /* invert source alpha */ 1605.endm 1606 1607generate_composite_function \ 1608 pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \ 1609 FLAG_DST_READWRITE, \ 1610 8, /* number of pixels, processed in a single block */ \ 1611 5, /* prefetch distance */ \ 1612 pixman_composite_over_n_0565_init, \ 1613 default_cleanup, \ 1614 pixman_composite_over_n_0565_process_pixblock_head, \ 1615 pixman_composite_over_n_0565_process_pixblock_tail, \ 1616 pixman_composite_over_n_0565_process_pixblock_tail_head, \ 1617 28, /* dst_w_basereg */ \ 1618 4, /* dst_r_basereg */ \ 1619 0, /* src_basereg */ \ 1620 24 /* mask_basereg */ 1621 1622/******************************************************************************/ 1623 1624.macro pixman_composite_src_8888_0565_process_pixblock_head 1625 vshll.u8 q8, d1, #8 1626 vshll.u8 q14, d2, #8 1627 vshll.u8 q9, d0, #8 1628.endm 1629 1630.macro pixman_composite_src_8888_0565_process_pixblock_tail 1631 vsri.u16 q14, q8, #5 1632 vsri.u16 q14, q9, #11 1633.endm 1634 1635.macro pixman_composite_src_8888_0565_process_pixblock_tail_head 1636 vsri.u16 q14, q8, #5 1637 PF add PF_X, PF_X, #8 1638 PF tst PF_CTL, #0xF 1639 fetch_src_pixblock 1640 PF addne PF_X, PF_X, #8 1641 PF subne PF_CTL, PF_CTL, #1 1642 vsri.u16 q14, q9, #11 1643 PF cmp PF_X, ORIG_W 1644 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] 1645 vshll.u8 q8, d1, #8 1646 vst1.16 {d28, d29}, [DST_W, :128]! 1647 PF subge PF_X, PF_X, ORIG_W 1648 PF subges PF_CTL, PF_CTL, #0x10 1649 vshll.u8 q14, d2, #8 1650 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 1651 vshll.u8 q9, d0, #8 1652.endm 1653 1654generate_composite_function \ 1655 pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \ 1656 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ 1657 8, /* number of pixels, processed in a single block */ \ 1658 10, /* prefetch distance */ \ 1659 default_init, \ 1660 default_cleanup, \ 1661 pixman_composite_src_8888_0565_process_pixblock_head, \ 1662 pixman_composite_src_8888_0565_process_pixblock_tail, \ 1663 pixman_composite_src_8888_0565_process_pixblock_tail_head 1664 1665/******************************************************************************/ 1666 1667.macro pixman_composite_src_0565_8888_process_pixblock_head 1668 vshrn.u16 d30, q0, #8 1669 vshrn.u16 d29, q0, #3 1670 vsli.u16 q0, q0, #5 1671 vmov.u8 d31, #255 1672 vsri.u8 d30, d30, #5 1673 vsri.u8 d29, d29, #6 1674 vshrn.u16 d28, q0, #2 1675.endm 1676 1677.macro pixman_composite_src_0565_8888_process_pixblock_tail 1678.endm 1679 1680/* TODO: expand macros and do better instructions scheduling */ 1681.macro pixman_composite_src_0565_8888_process_pixblock_tail_head 1682 pixman_composite_src_0565_8888_process_pixblock_tail 1683 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 1684 fetch_src_pixblock 1685 pixman_composite_src_0565_8888_process_pixblock_head 1686 cache_preload 8, 8 1687.endm 1688 1689generate_composite_function \ 1690 pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \ 1691 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ 1692 8, /* number of pixels, processed in a single block */ \ 1693 10, /* prefetch distance */ \ 1694 default_init, \ 1695 default_cleanup, \ 1696 pixman_composite_src_0565_8888_process_pixblock_head, \ 1697 pixman_composite_src_0565_8888_process_pixblock_tail, \ 1698 pixman_composite_src_0565_8888_process_pixblock_tail_head 1699 1700/******************************************************************************/ 1701 1702.macro pixman_composite_add_8_8_process_pixblock_head 1703 vqadd.u8 q14, q0, q2 1704 vqadd.u8 q15, q1, q3 1705.endm 1706 1707.macro pixman_composite_add_8_8_process_pixblock_tail 1708.endm 1709 1710.macro pixman_composite_add_8_8_process_pixblock_tail_head 1711 fetch_src_pixblock 1712 PF add PF_X, PF_X, #32 1713 PF tst PF_CTL, #0xF 1714 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! 1715 PF addne PF_X, PF_X, #32 1716 PF subne PF_CTL, PF_CTL, #1 1717 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! 1718 PF cmp PF_X, ORIG_W 1719 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] 1720 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] 1721 PF subge PF_X, PF_X, ORIG_W 1722 PF subges PF_CTL, PF_CTL, #0x10 1723 vqadd.u8 q14, q0, q2 1724 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 1725 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 1726 vqadd.u8 q15, q1, q3 1727.endm 1728 1729generate_composite_function \ 1730 pixman_composite_add_8_8_asm_neon, 8, 0, 8, \ 1731 FLAG_DST_READWRITE, \ 1732 32, /* number of pixels, processed in a single block */ \ 1733 10, /* prefetch distance */ \ 1734 default_init, \ 1735 default_cleanup, \ 1736 pixman_composite_add_8_8_process_pixblock_head, \ 1737 pixman_composite_add_8_8_process_pixblock_tail, \ 1738 pixman_composite_add_8_8_process_pixblock_tail_head 1739 1740/******************************************************************************/ 1741 1742.macro pixman_composite_add_8888_8888_process_pixblock_tail_head 1743 fetch_src_pixblock 1744 PF add PF_X, PF_X, #8 1745 PF tst PF_CTL, #0xF 1746 vld1.32 {d4, d5, d6, d7}, [DST_R, :128]! 1747 PF addne PF_X, PF_X, #8 1748 PF subne PF_CTL, PF_CTL, #1 1749 vst1.32 {d28, d29, d30, d31}, [DST_W, :128]! 1750 PF cmp PF_X, ORIG_W 1751 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] 1752 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] 1753 PF subge PF_X, PF_X, ORIG_W 1754 PF subges PF_CTL, PF_CTL, #0x10 1755 vqadd.u8 q14, q0, q2 1756 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 1757 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 1758 vqadd.u8 q15, q1, q3 1759.endm 1760 1761generate_composite_function \ 1762 pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \ 1763 FLAG_DST_READWRITE, \ 1764 8, /* number of pixels, processed in a single block */ \ 1765 10, /* prefetch distance */ \ 1766 default_init, \ 1767 default_cleanup, \ 1768 pixman_composite_add_8_8_process_pixblock_head, \ 1769 pixman_composite_add_8_8_process_pixblock_tail, \ 1770 pixman_composite_add_8888_8888_process_pixblock_tail_head 1771 1772generate_composite_function_single_scanline \ 1773 pixman_composite_scanline_add_asm_neon, 32, 0, 32, \ 1774 FLAG_DST_READWRITE, \ 1775 8, /* number of pixels, processed in a single block */ \ 1776 default_init, \ 1777 default_cleanup, \ 1778 pixman_composite_add_8_8_process_pixblock_head, \ 1779 pixman_composite_add_8_8_process_pixblock_tail, \ 1780 pixman_composite_add_8888_8888_process_pixblock_tail_head 1781 1782/******************************************************************************/ 1783 1784.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head 1785 vmvn.8 d24, d3 /* get inverted alpha */ 1786 /* do alpha blending */ 1787 vmull.u8 q8, d24, d4 1788 vmull.u8 q9, d24, d5 1789 vmull.u8 q10, d24, d6 1790 vmull.u8 q11, d24, d7 1791.endm 1792 1793.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail 1794 vrshr.u16 q14, q8, #8 1795 vrshr.u16 q15, q9, #8 1796 vrshr.u16 q12, q10, #8 1797 vrshr.u16 q13, q11, #8 1798 vraddhn.u16 d28, q14, q8 1799 vraddhn.u16 d29, q15, q9 1800 vraddhn.u16 d30, q12, q10 1801 vraddhn.u16 d31, q13, q11 1802.endm 1803 1804.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head 1805 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 1806 vrshr.u16 q14, q8, #8 1807 PF add PF_X, PF_X, #8 1808 PF tst PF_CTL, #0xF 1809 vrshr.u16 q15, q9, #8 1810 vrshr.u16 q12, q10, #8 1811 vrshr.u16 q13, q11, #8 1812 PF addne PF_X, PF_X, #8 1813 PF subne PF_CTL, PF_CTL, #1 1814 vraddhn.u16 d28, q14, q8 1815 vraddhn.u16 d29, q15, q9 1816 PF cmp PF_X, ORIG_W 1817 vraddhn.u16 d30, q12, q10 1818 vraddhn.u16 d31, q13, q11 1819 fetch_src_pixblock 1820 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] 1821 vmvn.8 d22, d3 1822 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] 1823 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 1824 PF subge PF_X, PF_X, ORIG_W 1825 vmull.u8 q8, d22, d4 1826 PF subges PF_CTL, PF_CTL, #0x10 1827 vmull.u8 q9, d22, d5 1828 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 1829 vmull.u8 q10, d22, d6 1830 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 1831 vmull.u8 q11, d22, d7 1832.endm 1833 1834generate_composite_function_single_scanline \ 1835 pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \ 1836 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 1837 8, /* number of pixels, processed in a single block */ \ 1838 default_init, \ 1839 default_cleanup, \ 1840 pixman_composite_out_reverse_8888_8888_process_pixblock_head, \ 1841 pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \ 1842 pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head 1843 1844/******************************************************************************/ 1845 1846.macro pixman_composite_over_8888_8888_process_pixblock_head 1847 pixman_composite_out_reverse_8888_8888_process_pixblock_head 1848.endm 1849 1850.macro pixman_composite_over_8888_8888_process_pixblock_tail 1851 pixman_composite_out_reverse_8888_8888_process_pixblock_tail 1852 vqadd.u8 q14, q0, q14 1853 vqadd.u8 q15, q1, q15 1854.endm 1855 1856.macro pixman_composite_over_8888_8888_process_pixblock_tail_head 1857 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 1858 vrshr.u16 q14, q8, #8 1859 PF add PF_X, PF_X, #8 1860 PF tst PF_CTL, #0xF 1861 vrshr.u16 q15, q9, #8 1862 vrshr.u16 q12, q10, #8 1863 vrshr.u16 q13, q11, #8 1864 PF addne PF_X, PF_X, #8 1865 PF subne PF_CTL, PF_CTL, #1 1866 vraddhn.u16 d28, q14, q8 1867 vraddhn.u16 d29, q15, q9 1868 PF cmp PF_X, ORIG_W 1869 vraddhn.u16 d30, q12, q10 1870 vraddhn.u16 d31, q13, q11 1871 vqadd.u8 q14, q0, q14 1872 vqadd.u8 q15, q1, q15 1873 fetch_src_pixblock 1874 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] 1875 vmvn.8 d22, d3 1876 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] 1877 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 1878 PF subge PF_X, PF_X, ORIG_W 1879 vmull.u8 q8, d22, d4 1880 PF subges PF_CTL, PF_CTL, #0x10 1881 vmull.u8 q9, d22, d5 1882 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 1883 vmull.u8 q10, d22, d6 1884 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 1885 vmull.u8 q11, d22, d7 1886.endm 1887 1888generate_composite_function \ 1889 pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \ 1890 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 1891 8, /* number of pixels, processed in a single block */ \ 1892 5, /* prefetch distance */ \ 1893 default_init, \ 1894 default_cleanup, \ 1895 pixman_composite_over_8888_8888_process_pixblock_head, \ 1896 pixman_composite_over_8888_8888_process_pixblock_tail, \ 1897 pixman_composite_over_8888_8888_process_pixblock_tail_head 1898 1899generate_composite_function_single_scanline \ 1900 pixman_composite_scanline_over_asm_neon, 32, 0, 32, \ 1901 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 1902 8, /* number of pixels, processed in a single block */ \ 1903 default_init, \ 1904 default_cleanup, \ 1905 pixman_composite_over_8888_8888_process_pixblock_head, \ 1906 pixman_composite_over_8888_8888_process_pixblock_tail, \ 1907 pixman_composite_over_8888_8888_process_pixblock_tail_head 1908 1909/******************************************************************************/ 1910 1911.macro pixman_composite_over_n_8888_process_pixblock_head 1912 /* deinterleaved source pixels in {d0, d1, d2, d3} */ 1913 /* inverted alpha in {d24} */ 1914 /* destination pixels in {d4, d5, d6, d7} */ 1915 vmull.u8 q8, d24, d4 1916 vmull.u8 q9, d24, d5 1917 vmull.u8 q10, d24, d6 1918 vmull.u8 q11, d24, d7 1919.endm 1920 1921.macro pixman_composite_over_n_8888_process_pixblock_tail 1922 vrshr.u16 q14, q8, #8 1923 vrshr.u16 q15, q9, #8 1924 vrshr.u16 q2, q10, #8 1925 vrshr.u16 q3, q11, #8 1926 vraddhn.u16 d28, q14, q8 1927 vraddhn.u16 d29, q15, q9 1928 vraddhn.u16 d30, q2, q10 1929 vraddhn.u16 d31, q3, q11 1930 vqadd.u8 q14, q0, q14 1931 vqadd.u8 q15, q1, q15 1932.endm 1933 1934.macro pixman_composite_over_n_8888_process_pixblock_tail_head 1935 vrshr.u16 q14, q8, #8 1936 vrshr.u16 q15, q9, #8 1937 vrshr.u16 q2, q10, #8 1938 vrshr.u16 q3, q11, #8 1939 vraddhn.u16 d28, q14, q8 1940 vraddhn.u16 d29, q15, q9 1941 vraddhn.u16 d30, q2, q10 1942 vraddhn.u16 d31, q3, q11 1943 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 1944 vqadd.u8 q14, q0, q14 1945 PF add PF_X, PF_X, #8 1946 PF tst PF_CTL, #0x0F 1947 PF addne PF_X, PF_X, #8 1948 PF subne PF_CTL, PF_CTL, #1 1949 vqadd.u8 q15, q1, q15 1950 PF cmp PF_X, ORIG_W 1951 vmull.u8 q8, d24, d4 1952 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] 1953 vmull.u8 q9, d24, d5 1954 PF subge PF_X, PF_X, ORIG_W 1955 vmull.u8 q10, d24, d6 1956 PF subges PF_CTL, PF_CTL, #0x10 1957 vmull.u8 q11, d24, d7 1958 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 1959 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 1960.endm 1961 1962.macro pixman_composite_over_n_8888_init 1963 add DUMMY, sp, #ARGS_STACK_OFFSET 1964 vld1.32 {d3[0]}, [DUMMY] 1965 vdup.8 d0, d3[0] 1966 vdup.8 d1, d3[1] 1967 vdup.8 d2, d3[2] 1968 vdup.8 d3, d3[3] 1969 vmvn.8 d24, d3 /* get inverted alpha */ 1970.endm 1971 1972generate_composite_function \ 1973 pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \ 1974 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 1975 8, /* number of pixels, processed in a single block */ \ 1976 5, /* prefetch distance */ \ 1977 pixman_composite_over_n_8888_init, \ 1978 default_cleanup, \ 1979 pixman_composite_over_8888_8888_process_pixblock_head, \ 1980 pixman_composite_over_8888_8888_process_pixblock_tail, \ 1981 pixman_composite_over_n_8888_process_pixblock_tail_head 1982 1983/******************************************************************************/ 1984 1985.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head 1986 vrshr.u16 q14, q8, #8 1987 PF add PF_X, PF_X, #8 1988 PF tst PF_CTL, #0xF 1989 vrshr.u16 q15, q9, #8 1990 vrshr.u16 q12, q10, #8 1991 vrshr.u16 q13, q11, #8 1992 PF addne PF_X, PF_X, #8 1993 PF subne PF_CTL, PF_CTL, #1 1994 vraddhn.u16 d28, q14, q8 1995 vraddhn.u16 d29, q15, q9 1996 PF cmp PF_X, ORIG_W 1997 vraddhn.u16 d30, q12, q10 1998 vraddhn.u16 d31, q13, q11 1999 vqadd.u8 q14, q0, q14 2000 vqadd.u8 q15, q1, q15 2001 vld4.8 {d0, d1, d2, d3}, [DST_R, :128]! 2002 vmvn.8 d22, d3 2003 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] 2004 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 2005 PF subge PF_X, PF_X, ORIG_W 2006 vmull.u8 q8, d22, d4 2007 PF subges PF_CTL, PF_CTL, #0x10 2008 vmull.u8 q9, d22, d5 2009 vmull.u8 q10, d22, d6 2010 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 2011 vmull.u8 q11, d22, d7 2012.endm 2013 2014.macro pixman_composite_over_reverse_n_8888_init 2015 add DUMMY, sp, #ARGS_STACK_OFFSET 2016 vld1.32 {d7[0]}, [DUMMY] 2017 vdup.8 d4, d7[0] 2018 vdup.8 d5, d7[1] 2019 vdup.8 d6, d7[2] 2020 vdup.8 d7, d7[3] 2021.endm 2022 2023generate_composite_function \ 2024 pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \ 2025 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2026 8, /* number of pixels, processed in a single block */ \ 2027 5, /* prefetch distance */ \ 2028 pixman_composite_over_reverse_n_8888_init, \ 2029 default_cleanup, \ 2030 pixman_composite_over_8888_8888_process_pixblock_head, \ 2031 pixman_composite_over_8888_8888_process_pixblock_tail, \ 2032 pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \ 2033 28, /* dst_w_basereg */ \ 2034 0, /* dst_r_basereg */ \ 2035 4, /* src_basereg */ \ 2036 24 /* mask_basereg */ 2037 2038/******************************************************************************/ 2039 2040.macro pixman_composite_over_8888_8_0565_process_pixblock_head 2041 vmull.u8 q0, d24, d8 /* IN for SRC pixels (part1) */ 2042 vmull.u8 q1, d24, d9 2043 vmull.u8 q6, d24, d10 2044 vmull.u8 q7, d24, d11 2045 vshrn.u16 d6, q2, #8 /* convert DST_R data to 32-bpp (part1) */ 2046 vshrn.u16 d7, q2, #3 2047 vsli.u16 q2, q2, #5 2048 vrshr.u16 q8, q0, #8 /* IN for SRC pixels (part2) */ 2049 vrshr.u16 q9, q1, #8 2050 vrshr.u16 q10, q6, #8 2051 vrshr.u16 q11, q7, #8 2052 vraddhn.u16 d0, q0, q8 2053 vraddhn.u16 d1, q1, q9 2054 vraddhn.u16 d2, q6, q10 2055 vraddhn.u16 d3, q7, q11 2056 vsri.u8 d6, d6, #5 /* convert DST_R data to 32-bpp (part2) */ 2057 vsri.u8 d7, d7, #6 2058 vmvn.8 d3, d3 2059 vshrn.u16 d30, q2, #2 2060 vmull.u8 q8, d3, d6 /* now do alpha blending */ 2061 vmull.u8 q9, d3, d7 2062 vmull.u8 q10, d3, d30 2063.endm 2064 2065.macro pixman_composite_over_8888_8_0565_process_pixblock_tail 2066 /* 3 cycle bubble (after vmull.u8) */ 2067 vrshr.u16 q13, q8, #8 2068 vrshr.u16 q11, q9, #8 2069 vrshr.u16 q15, q10, #8 2070 vraddhn.u16 d16, q8, q13 2071 vraddhn.u16 d27, q9, q11 2072 vraddhn.u16 d26, q10, q15 2073 vqadd.u8 d16, d2, d16 2074 /* 1 cycle bubble */ 2075 vqadd.u8 q9, q0, q13 2076 vshll.u8 q14, d16, #8 /* convert to 16bpp */ 2077 vshll.u8 q8, d19, #8 2078 vshll.u8 q9, d18, #8 2079 vsri.u16 q14, q8, #5 2080 /* 1 cycle bubble */ 2081 vsri.u16 q14, q9, #11 2082.endm 2083 2084.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head 2085 vld1.16 {d4, d5}, [DST_R, :128]! 2086 vshrn.u16 d6, q2, #8 2087 fetch_mask_pixblock 2088 vshrn.u16 d7, q2, #3 2089 fetch_src_pixblock 2090 vmull.u8 q6, d24, d10 2091 vrshr.u16 q13, q8, #8 2092 vrshr.u16 q11, q9, #8 2093 vrshr.u16 q15, q10, #8 2094 vraddhn.u16 d16, q8, q13 2095 vraddhn.u16 d27, q9, q11 2096 vraddhn.u16 d26, q10, q15 2097 vqadd.u8 d16, d2, d16 2098 vmull.u8 q1, d24, d9 2099 vqadd.u8 q9, q0, q13 2100 vshll.u8 q14, d16, #8 2101 vmull.u8 q0, d24, d8 2102 vshll.u8 q8, d19, #8 2103 vshll.u8 q9, d18, #8 2104 vsri.u16 q14, q8, #5 2105 vmull.u8 q7, d24, d11 2106 vsri.u16 q14, q9, #11 2107 2108 cache_preload 8, 8 2109 2110 vsli.u16 q2, q2, #5 2111 vrshr.u16 q8, q0, #8 2112 vrshr.u16 q9, q1, #8 2113 vrshr.u16 q10, q6, #8 2114 vrshr.u16 q11, q7, #8 2115 vraddhn.u16 d0, q0, q8 2116 vraddhn.u16 d1, q1, q9 2117 vraddhn.u16 d2, q6, q10 2118 vraddhn.u16 d3, q7, q11 2119 vsri.u8 d6, d6, #5 2120 vsri.u8 d7, d7, #6 2121 vmvn.8 d3, d3 2122 vshrn.u16 d30, q2, #2 2123 vst1.16 {d28, d29}, [DST_W, :128]! 2124 vmull.u8 q8, d3, d6 2125 vmull.u8 q9, d3, d7 2126 vmull.u8 q10, d3, d30 2127.endm 2128 2129generate_composite_function \ 2130 pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \ 2131 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2132 8, /* number of pixels, processed in a single block */ \ 2133 5, /* prefetch distance */ \ 2134 default_init_need_all_regs, \ 2135 default_cleanup_need_all_regs, \ 2136 pixman_composite_over_8888_8_0565_process_pixblock_head, \ 2137 pixman_composite_over_8888_8_0565_process_pixblock_tail, \ 2138 pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \ 2139 28, /* dst_w_basereg */ \ 2140 4, /* dst_r_basereg */ \ 2141 8, /* src_basereg */ \ 2142 24 /* mask_basereg */ 2143 2144/******************************************************************************/ 2145 2146/* 2147 * This function needs a special initialization of solid mask. 2148 * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET 2149 * offset, split into color components and replicated in d8-d11 2150 * registers. Additionally, this function needs all the NEON registers, 2151 * so it has to save d8-d15 registers which are callee saved according 2152 * to ABI. These registers are restored from 'cleanup' macro. All the 2153 * other NEON registers are caller saved, so can be clobbered freely 2154 * without introducing any problems. 2155 */ 2156.macro pixman_composite_over_n_8_0565_init 2157 add DUMMY, sp, #ARGS_STACK_OFFSET 2158 vpush {d8-d15} 2159 vld1.32 {d11[0]}, [DUMMY] 2160 vdup.8 d8, d11[0] 2161 vdup.8 d9, d11[1] 2162 vdup.8 d10, d11[2] 2163 vdup.8 d11, d11[3] 2164.endm 2165 2166.macro pixman_composite_over_n_8_0565_cleanup 2167 vpop {d8-d15} 2168.endm 2169 2170generate_composite_function \ 2171 pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \ 2172 FLAG_DST_READWRITE, \ 2173 8, /* number of pixels, processed in a single block */ \ 2174 5, /* prefetch distance */ \ 2175 pixman_composite_over_n_8_0565_init, \ 2176 pixman_composite_over_n_8_0565_cleanup, \ 2177 pixman_composite_over_8888_8_0565_process_pixblock_head, \ 2178 pixman_composite_over_8888_8_0565_process_pixblock_tail, \ 2179 pixman_composite_over_8888_8_0565_process_pixblock_tail_head 2180 2181/******************************************************************************/ 2182 2183.macro pixman_composite_over_8888_n_0565_init 2184 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) 2185 vpush {d8-d15} 2186 vld1.32 {d24[0]}, [DUMMY] 2187 vdup.8 d24, d24[3] 2188.endm 2189 2190.macro pixman_composite_over_8888_n_0565_cleanup 2191 vpop {d8-d15} 2192.endm 2193 2194generate_composite_function \ 2195 pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \ 2196 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2197 8, /* number of pixels, processed in a single block */ \ 2198 5, /* prefetch distance */ \ 2199 pixman_composite_over_8888_n_0565_init, \ 2200 pixman_composite_over_8888_n_0565_cleanup, \ 2201 pixman_composite_over_8888_8_0565_process_pixblock_head, \ 2202 pixman_composite_over_8888_8_0565_process_pixblock_tail, \ 2203 pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \ 2204 28, /* dst_w_basereg */ \ 2205 4, /* dst_r_basereg */ \ 2206 8, /* src_basereg */ \ 2207 24 /* mask_basereg */ 2208 2209/******************************************************************************/ 2210 2211.macro pixman_composite_src_0565_0565_process_pixblock_head 2212.endm 2213 2214.macro pixman_composite_src_0565_0565_process_pixblock_tail 2215.endm 2216 2217.macro pixman_composite_src_0565_0565_process_pixblock_tail_head 2218 vst1.16 {d0, d1, d2, d3}, [DST_W, :128]! 2219 fetch_src_pixblock 2220 cache_preload 16, 16 2221.endm 2222 2223generate_composite_function \ 2224 pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \ 2225 FLAG_DST_WRITEONLY, \ 2226 16, /* number of pixels, processed in a single block */ \ 2227 10, /* prefetch distance */ \ 2228 default_init, \ 2229 default_cleanup, \ 2230 pixman_composite_src_0565_0565_process_pixblock_head, \ 2231 pixman_composite_src_0565_0565_process_pixblock_tail, \ 2232 pixman_composite_src_0565_0565_process_pixblock_tail_head, \ 2233 0, /* dst_w_basereg */ \ 2234 0, /* dst_r_basereg */ \ 2235 0, /* src_basereg */ \ 2236 0 /* mask_basereg */ 2237 2238/******************************************************************************/ 2239 2240.macro pixman_composite_src_n_8_process_pixblock_head 2241.endm 2242 2243.macro pixman_composite_src_n_8_process_pixblock_tail 2244.endm 2245 2246.macro pixman_composite_src_n_8_process_pixblock_tail_head 2247 vst1.8 {d0, d1, d2, d3}, [DST_W, :128]! 2248.endm 2249 2250.macro pixman_composite_src_n_8_init 2251 add DUMMY, sp, #ARGS_STACK_OFFSET 2252 vld1.32 {d0[0]}, [DUMMY] 2253 vsli.u64 d0, d0, #8 2254 vsli.u64 d0, d0, #16 2255 vsli.u64 d0, d0, #32 2256 vorr d1, d0, d0 2257 vorr q1, q0, q0 2258.endm 2259 2260.macro pixman_composite_src_n_8_cleanup 2261.endm 2262 2263generate_composite_function \ 2264 pixman_composite_src_n_8_asm_neon, 0, 0, 8, \ 2265 FLAG_DST_WRITEONLY, \ 2266 32, /* number of pixels, processed in a single block */ \ 2267 0, /* prefetch distance */ \ 2268 pixman_composite_src_n_8_init, \ 2269 pixman_composite_src_n_8_cleanup, \ 2270 pixman_composite_src_n_8_process_pixblock_head, \ 2271 pixman_composite_src_n_8_process_pixblock_tail, \ 2272 pixman_composite_src_n_8_process_pixblock_tail_head, \ 2273 0, /* dst_w_basereg */ \ 2274 0, /* dst_r_basereg */ \ 2275 0, /* src_basereg */ \ 2276 0 /* mask_basereg */ 2277 2278/******************************************************************************/ 2279 2280.macro pixman_composite_src_n_0565_process_pixblock_head 2281.endm 2282 2283.macro pixman_composite_src_n_0565_process_pixblock_tail 2284.endm 2285 2286.macro pixman_composite_src_n_0565_process_pixblock_tail_head 2287 vst1.16 {d0, d1, d2, d3}, [DST_W, :128]! 2288.endm 2289 2290.macro pixman_composite_src_n_0565_init 2291 add DUMMY, sp, #ARGS_STACK_OFFSET 2292 vld1.32 {d0[0]}, [DUMMY] 2293 vsli.u64 d0, d0, #16 2294 vsli.u64 d0, d0, #32 2295 vorr d1, d0, d0 2296 vorr q1, q0, q0 2297.endm 2298 2299.macro pixman_composite_src_n_0565_cleanup 2300.endm 2301 2302generate_composite_function \ 2303 pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \ 2304 FLAG_DST_WRITEONLY, \ 2305 16, /* number of pixels, processed in a single block */ \ 2306 0, /* prefetch distance */ \ 2307 pixman_composite_src_n_0565_init, \ 2308 pixman_composite_src_n_0565_cleanup, \ 2309 pixman_composite_src_n_0565_process_pixblock_head, \ 2310 pixman_composite_src_n_0565_process_pixblock_tail, \ 2311 pixman_composite_src_n_0565_process_pixblock_tail_head, \ 2312 0, /* dst_w_basereg */ \ 2313 0, /* dst_r_basereg */ \ 2314 0, /* src_basereg */ \ 2315 0 /* mask_basereg */ 2316 2317/******************************************************************************/ 2318 2319.macro pixman_composite_src_n_8888_process_pixblock_head 2320.endm 2321 2322.macro pixman_composite_src_n_8888_process_pixblock_tail 2323.endm 2324 2325.macro pixman_composite_src_n_8888_process_pixblock_tail_head 2326 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! 2327.endm 2328 2329.macro pixman_composite_src_n_8888_init 2330 add DUMMY, sp, #ARGS_STACK_OFFSET 2331 vld1.32 {d0[0]}, [DUMMY] 2332 vsli.u64 d0, d0, #32 2333 vorr d1, d0, d0 2334 vorr q1, q0, q0 2335.endm 2336 2337.macro pixman_composite_src_n_8888_cleanup 2338.endm 2339 2340generate_composite_function \ 2341 pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \ 2342 FLAG_DST_WRITEONLY, \ 2343 8, /* number of pixels, processed in a single block */ \ 2344 0, /* prefetch distance */ \ 2345 pixman_composite_src_n_8888_init, \ 2346 pixman_composite_src_n_8888_cleanup, \ 2347 pixman_composite_src_n_8888_process_pixblock_head, \ 2348 pixman_composite_src_n_8888_process_pixblock_tail, \ 2349 pixman_composite_src_n_8888_process_pixblock_tail_head, \ 2350 0, /* dst_w_basereg */ \ 2351 0, /* dst_r_basereg */ \ 2352 0, /* src_basereg */ \ 2353 0 /* mask_basereg */ 2354 2355/******************************************************************************/ 2356 2357.macro pixman_composite_src_8888_8888_process_pixblock_head 2358.endm 2359 2360.macro pixman_composite_src_8888_8888_process_pixblock_tail 2361.endm 2362 2363.macro pixman_composite_src_8888_8888_process_pixblock_tail_head 2364 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! 2365 fetch_src_pixblock 2366 cache_preload 8, 8 2367.endm 2368 2369generate_composite_function \ 2370 pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \ 2371 FLAG_DST_WRITEONLY, \ 2372 8, /* number of pixels, processed in a single block */ \ 2373 10, /* prefetch distance */ \ 2374 default_init, \ 2375 default_cleanup, \ 2376 pixman_composite_src_8888_8888_process_pixblock_head, \ 2377 pixman_composite_src_8888_8888_process_pixblock_tail, \ 2378 pixman_composite_src_8888_8888_process_pixblock_tail_head, \ 2379 0, /* dst_w_basereg */ \ 2380 0, /* dst_r_basereg */ \ 2381 0, /* src_basereg */ \ 2382 0 /* mask_basereg */ 2383 2384/******************************************************************************/ 2385 2386.macro pixman_composite_src_x888_8888_process_pixblock_head 2387 vorr q0, q0, q2 2388 vorr q1, q1, q2 2389.endm 2390 2391.macro pixman_composite_src_x888_8888_process_pixblock_tail 2392.endm 2393 2394.macro pixman_composite_src_x888_8888_process_pixblock_tail_head 2395 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! 2396 fetch_src_pixblock 2397 vorr q0, q0, q2 2398 vorr q1, q1, q2 2399 cache_preload 8, 8 2400.endm 2401 2402.macro pixman_composite_src_x888_8888_init 2403 vmov.u8 q2, #0xFF 2404 vshl.u32 q2, q2, #24 2405.endm 2406 2407generate_composite_function \ 2408 pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \ 2409 FLAG_DST_WRITEONLY, \ 2410 8, /* number of pixels, processed in a single block */ \ 2411 10, /* prefetch distance */ \ 2412 pixman_composite_src_x888_8888_init, \ 2413 default_cleanup, \ 2414 pixman_composite_src_x888_8888_process_pixblock_head, \ 2415 pixman_composite_src_x888_8888_process_pixblock_tail, \ 2416 pixman_composite_src_x888_8888_process_pixblock_tail_head, \ 2417 0, /* dst_w_basereg */ \ 2418 0, /* dst_r_basereg */ \ 2419 0, /* src_basereg */ \ 2420 0 /* mask_basereg */ 2421 2422/******************************************************************************/ 2423 2424.macro pixman_composite_src_n_8_8888_process_pixblock_head 2425 /* expecting solid source in {d0, d1, d2, d3} */ 2426 /* mask is in d24 (d25, d26, d27 are unused) */ 2427 2428 /* in */ 2429 vmull.u8 q8, d24, d0 2430 vmull.u8 q9, d24, d1 2431 vmull.u8 q10, d24, d2 2432 vmull.u8 q11, d24, d3 2433 vrsra.u16 q8, q8, #8 2434 vrsra.u16 q9, q9, #8 2435 vrsra.u16 q10, q10, #8 2436 vrsra.u16 q11, q11, #8 2437.endm 2438 2439.macro pixman_composite_src_n_8_8888_process_pixblock_tail 2440 vrshrn.u16 d28, q8, #8 2441 vrshrn.u16 d29, q9, #8 2442 vrshrn.u16 d30, q10, #8 2443 vrshrn.u16 d31, q11, #8 2444.endm 2445 2446.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head 2447 fetch_mask_pixblock 2448 PF add PF_X, PF_X, #8 2449 vrshrn.u16 d28, q8, #8 2450 PF tst PF_CTL, #0x0F 2451 vrshrn.u16 d29, q9, #8 2452 PF addne PF_X, PF_X, #8 2453 vrshrn.u16 d30, q10, #8 2454 PF subne PF_CTL, PF_CTL, #1 2455 vrshrn.u16 d31, q11, #8 2456 PF cmp PF_X, ORIG_W 2457 vmull.u8 q8, d24, d0 2458 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] 2459 vmull.u8 q9, d24, d1 2460 PF subge PF_X, PF_X, ORIG_W 2461 vmull.u8 q10, d24, d2 2462 PF subges PF_CTL, PF_CTL, #0x10 2463 vmull.u8 q11, d24, d3 2464 PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! 2465 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 2466 vrsra.u16 q8, q8, #8 2467 vrsra.u16 q9, q9, #8 2468 vrsra.u16 q10, q10, #8 2469 vrsra.u16 q11, q11, #8 2470.endm 2471 2472.macro pixman_composite_src_n_8_8888_init 2473 add DUMMY, sp, #ARGS_STACK_OFFSET 2474 vld1.32 {d3[0]}, [DUMMY] 2475 vdup.8 d0, d3[0] 2476 vdup.8 d1, d3[1] 2477 vdup.8 d2, d3[2] 2478 vdup.8 d3, d3[3] 2479.endm 2480 2481.macro pixman_composite_src_n_8_8888_cleanup 2482.endm 2483 2484generate_composite_function \ 2485 pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \ 2486 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ 2487 8, /* number of pixels, processed in a single block */ \ 2488 5, /* prefetch distance */ \ 2489 pixman_composite_src_n_8_8888_init, \ 2490 pixman_composite_src_n_8_8888_cleanup, \ 2491 pixman_composite_src_n_8_8888_process_pixblock_head, \ 2492 pixman_composite_src_n_8_8888_process_pixblock_tail, \ 2493 pixman_composite_src_n_8_8888_process_pixblock_tail_head, \ 2494 2495/******************************************************************************/ 2496 2497.macro pixman_composite_src_n_8_8_process_pixblock_head 2498 vmull.u8 q0, d24, d16 2499 vmull.u8 q1, d25, d16 2500 vmull.u8 q2, d26, d16 2501 vmull.u8 q3, d27, d16 2502 vrsra.u16 q0, q0, #8 2503 vrsra.u16 q1, q1, #8 2504 vrsra.u16 q2, q2, #8 2505 vrsra.u16 q3, q3, #8 2506.endm 2507 2508.macro pixman_composite_src_n_8_8_process_pixblock_tail 2509 vrshrn.u16 d28, q0, #8 2510 vrshrn.u16 d29, q1, #8 2511 vrshrn.u16 d30, q2, #8 2512 vrshrn.u16 d31, q3, #8 2513.endm 2514 2515.macro pixman_composite_src_n_8_8_process_pixblock_tail_head 2516 fetch_mask_pixblock 2517 PF add PF_X, PF_X, #8 2518 vrshrn.u16 d28, q0, #8 2519 PF tst PF_CTL, #0x0F 2520 vrshrn.u16 d29, q1, #8 2521 PF addne PF_X, PF_X, #8 2522 vrshrn.u16 d30, q2, #8 2523 PF subne PF_CTL, PF_CTL, #1 2524 vrshrn.u16 d31, q3, #8 2525 PF cmp PF_X, ORIG_W 2526 vmull.u8 q0, d24, d16 2527 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] 2528 vmull.u8 q1, d25, d16 2529 PF subge PF_X, PF_X, ORIG_W 2530 vmull.u8 q2, d26, d16 2531 PF subges PF_CTL, PF_CTL, #0x10 2532 vmull.u8 q3, d27, d16 2533 PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! 2534 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! 2535 vrsra.u16 q0, q0, #8 2536 vrsra.u16 q1, q1, #8 2537 vrsra.u16 q2, q2, #8 2538 vrsra.u16 q3, q3, #8 2539.endm 2540 2541.macro pixman_composite_src_n_8_8_init 2542 add DUMMY, sp, #ARGS_STACK_OFFSET 2543 vld1.32 {d16[0]}, [DUMMY] 2544 vdup.8 d16, d16[3] 2545.endm 2546 2547.macro pixman_composite_src_n_8_8_cleanup 2548.endm 2549 2550generate_composite_function \ 2551 pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \ 2552 FLAG_DST_WRITEONLY, \ 2553 32, /* number of pixels, processed in a single block */ \ 2554 5, /* prefetch distance */ \ 2555 pixman_composite_src_n_8_8_init, \ 2556 pixman_composite_src_n_8_8_cleanup, \ 2557 pixman_composite_src_n_8_8_process_pixblock_head, \ 2558 pixman_composite_src_n_8_8_process_pixblock_tail, \ 2559 pixman_composite_src_n_8_8_process_pixblock_tail_head 2560 2561/******************************************************************************/ 2562 2563.macro pixman_composite_over_n_8_8888_process_pixblock_head 2564 /* expecting deinterleaved source data in {d8, d9, d10, d11} */ 2565 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */ 2566 /* and destination data in {d4, d5, d6, d7} */ 2567 /* mask is in d24 (d25, d26, d27 are unused) */ 2568 2569 /* in */ 2570 vmull.u8 q6, d24, d8 2571 vmull.u8 q7, d24, d9 2572 vmull.u8 q8, d24, d10 2573 vmull.u8 q9, d24, d11 2574 vrshr.u16 q10, q6, #8 2575 vrshr.u16 q11, q7, #8 2576 vrshr.u16 q12, q8, #8 2577 vrshr.u16 q13, q9, #8 2578 vraddhn.u16 d0, q6, q10 2579 vraddhn.u16 d1, q7, q11 2580 vraddhn.u16 d2, q8, q12 2581 vraddhn.u16 d3, q9, q13 2582 vmvn.8 d25, d3 /* get inverted alpha */ 2583 /* source: d0 - blue, d1 - green, d2 - red, d3 - alpha */ 2584 /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */ 2585 /* now do alpha blending */ 2586 vmull.u8 q8, d25, d4 2587 vmull.u8 q9, d25, d5 2588 vmull.u8 q10, d25, d6 2589 vmull.u8 q11, d25, d7 2590.endm 2591 2592.macro pixman_composite_over_n_8_8888_process_pixblock_tail 2593 vrshr.u16 q14, q8, #8 2594 vrshr.u16 q15, q9, #8 2595 vrshr.u16 q6, q10, #8 2596 vrshr.u16 q7, q11, #8 2597 vraddhn.u16 d28, q14, q8 2598 vraddhn.u16 d29, q15, q9 2599 vraddhn.u16 d30, q6, q10 2600 vraddhn.u16 d31, q7, q11 2601 vqadd.u8 q14, q0, q14 2602 vqadd.u8 q15, q1, q15 2603.endm 2604 2605.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head 2606 vrshr.u16 q14, q8, #8 2607 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 2608 vrshr.u16 q15, q9, #8 2609 fetch_mask_pixblock 2610 vrshr.u16 q6, q10, #8 2611 PF add PF_X, PF_X, #8 2612 vrshr.u16 q7, q11, #8 2613 PF tst PF_CTL, #0x0F 2614 vraddhn.u16 d28, q14, q8 2615 PF addne PF_X, PF_X, #8 2616 vraddhn.u16 d29, q15, q9 2617 PF subne PF_CTL, PF_CTL, #1 2618 vraddhn.u16 d30, q6, q10 2619 PF cmp PF_X, ORIG_W 2620 vraddhn.u16 d31, q7, q11 2621 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] 2622 vmull.u8 q6, d24, d8 2623 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] 2624 vmull.u8 q7, d24, d9 2625 PF subge PF_X, PF_X, ORIG_W 2626 vmull.u8 q8, d24, d10 2627 PF subges PF_CTL, PF_CTL, #0x10 2628 vmull.u8 q9, d24, d11 2629 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 2630 vqadd.u8 q14, q0, q14 2631 PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! 2632 vqadd.u8 q15, q1, q15 2633 vrshr.u16 q10, q6, #8 2634 vrshr.u16 q11, q7, #8 2635 vrshr.u16 q12, q8, #8 2636 vrshr.u16 q13, q9, #8 2637 vraddhn.u16 d0, q6, q10 2638 vraddhn.u16 d1, q7, q11 2639 vraddhn.u16 d2, q8, q12 2640 vraddhn.u16 d3, q9, q13 2641 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 2642 vmvn.8 d25, d3 2643 vmull.u8 q8, d25, d4 2644 vmull.u8 q9, d25, d5 2645 vmull.u8 q10, d25, d6 2646 vmull.u8 q11, d25, d7 2647.endm 2648 2649.macro pixman_composite_over_n_8_8888_init 2650 add DUMMY, sp, #ARGS_STACK_OFFSET 2651 vpush {d8-d15} 2652 vld1.32 {d11[0]}, [DUMMY] 2653 vdup.8 d8, d11[0] 2654 vdup.8 d9, d11[1] 2655 vdup.8 d10, d11[2] 2656 vdup.8 d11, d11[3] 2657.endm 2658 2659.macro pixman_composite_over_n_8_8888_cleanup 2660 vpop {d8-d15} 2661.endm 2662 2663generate_composite_function \ 2664 pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \ 2665 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2666 8, /* number of pixels, processed in a single block */ \ 2667 5, /* prefetch distance */ \ 2668 pixman_composite_over_n_8_8888_init, \ 2669 pixman_composite_over_n_8_8888_cleanup, \ 2670 pixman_composite_over_n_8_8888_process_pixblock_head, \ 2671 pixman_composite_over_n_8_8888_process_pixblock_tail, \ 2672 pixman_composite_over_n_8_8888_process_pixblock_tail_head 2673 2674/******************************************************************************/ 2675 2676.macro pixman_composite_over_n_8_8_process_pixblock_head 2677 vmull.u8 q0, d24, d8 2678 vmull.u8 q1, d25, d8 2679 vmull.u8 q6, d26, d8 2680 vmull.u8 q7, d27, d8 2681 vrshr.u16 q10, q0, #8 2682 vrshr.u16 q11, q1, #8 2683 vrshr.u16 q12, q6, #8 2684 vrshr.u16 q13, q7, #8 2685 vraddhn.u16 d0, q0, q10 2686 vraddhn.u16 d1, q1, q11 2687 vraddhn.u16 d2, q6, q12 2688 vraddhn.u16 d3, q7, q13 2689 vmvn.8 q12, q0 2690 vmvn.8 q13, q1 2691 vmull.u8 q8, d24, d4 2692 vmull.u8 q9, d25, d5 2693 vmull.u8 q10, d26, d6 2694 vmull.u8 q11, d27, d7 2695.endm 2696 2697.macro pixman_composite_over_n_8_8_process_pixblock_tail 2698 vrshr.u16 q14, q8, #8 2699 vrshr.u16 q15, q9, #8 2700 vrshr.u16 q12, q10, #8 2701 vrshr.u16 q13, q11, #8 2702 vraddhn.u16 d28, q14, q8 2703 vraddhn.u16 d29, q15, q9 2704 vraddhn.u16 d30, q12, q10 2705 vraddhn.u16 d31, q13, q11 2706 vqadd.u8 q14, q0, q14 2707 vqadd.u8 q15, q1, q15 2708.endm 2709 2710/* TODO: expand macros and do better instructions scheduling */ 2711.macro pixman_composite_over_n_8_8_process_pixblock_tail_head 2712 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! 2713 pixman_composite_over_n_8_8_process_pixblock_tail 2714 fetch_mask_pixblock 2715 cache_preload 32, 32 2716 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! 2717 pixman_composite_over_n_8_8_process_pixblock_head 2718.endm 2719 2720.macro pixman_composite_over_n_8_8_init 2721 add DUMMY, sp, #ARGS_STACK_OFFSET 2722 vpush {d8-d15} 2723 vld1.32 {d8[0]}, [DUMMY] 2724 vdup.8 d8, d8[3] 2725.endm 2726 2727.macro pixman_composite_over_n_8_8_cleanup 2728 vpop {d8-d15} 2729.endm 2730 2731generate_composite_function \ 2732 pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \ 2733 FLAG_DST_READWRITE, \ 2734 32, /* number of pixels, processed in a single block */ \ 2735 5, /* prefetch distance */ \ 2736 pixman_composite_over_n_8_8_init, \ 2737 pixman_composite_over_n_8_8_cleanup, \ 2738 pixman_composite_over_n_8_8_process_pixblock_head, \ 2739 pixman_composite_over_n_8_8_process_pixblock_tail, \ 2740 pixman_composite_over_n_8_8_process_pixblock_tail_head 2741 2742/******************************************************************************/ 2743 2744.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head 2745 /* 2746 * 'combine_mask_ca' replacement 2747 * 2748 * input: solid src (n) in {d8, d9, d10, d11} 2749 * dest in {d4, d5, d6, d7 } 2750 * mask in {d24, d25, d26, d27} 2751 * output: updated src in {d0, d1, d2, d3 } 2752 * updated mask in {d24, d25, d26, d3 } 2753 */ 2754 vmull.u8 q0, d24, d8 2755 vmull.u8 q1, d25, d9 2756 vmull.u8 q6, d26, d10 2757 vmull.u8 q7, d27, d11 2758 vmull.u8 q9, d11, d25 2759 vmull.u8 q12, d11, d24 2760 vmull.u8 q13, d11, d26 2761 vrshr.u16 q8, q0, #8 2762 vrshr.u16 q10, q1, #8 2763 vrshr.u16 q11, q6, #8 2764 vraddhn.u16 d0, q0, q8 2765 vraddhn.u16 d1, q1, q10 2766 vraddhn.u16 d2, q6, q11 2767 vrshr.u16 q11, q12, #8 2768 vrshr.u16 q8, q9, #8 2769 vrshr.u16 q6, q13, #8 2770 vrshr.u16 q10, q7, #8 2771 vraddhn.u16 d24, q12, q11 2772 vraddhn.u16 d25, q9, q8 2773 vraddhn.u16 d26, q13, q6 2774 vraddhn.u16 d3, q7, q10 2775 /* 2776 * 'combine_over_ca' replacement 2777 * 2778 * output: updated dest in {d28, d29, d30, d31} 2779 */ 2780 vmvn.8 q12, q12 2781 vmvn.8 d26, d26 2782 vmull.u8 q8, d24, d4 2783 vmull.u8 q9, d25, d5 2784 vmvn.8 d27, d3 2785 vmull.u8 q10, d26, d6 2786 vmull.u8 q11, d27, d7 2787.endm 2788 2789.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail 2790 /* ... continue 'combine_over_ca' replacement */ 2791 vrshr.u16 q14, q8, #8 2792 vrshr.u16 q15, q9, #8 2793 vrshr.u16 q6, q10, #8 2794 vrshr.u16 q7, q11, #8 2795 vraddhn.u16 d28, q14, q8 2796 vraddhn.u16 d29, q15, q9 2797 vraddhn.u16 d30, q6, q10 2798 vraddhn.u16 d31, q7, q11 2799 vqadd.u8 q14, q0, q14 2800 vqadd.u8 q15, q1, q15 2801.endm 2802 2803.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head 2804 vrshr.u16 q14, q8, #8 2805 vrshr.u16 q15, q9, #8 2806 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 2807 vrshr.u16 q6, q10, #8 2808 vrshr.u16 q7, q11, #8 2809 vraddhn.u16 d28, q14, q8 2810 vraddhn.u16 d29, q15, q9 2811 vraddhn.u16 d30, q6, q10 2812 vraddhn.u16 d31, q7, q11 2813 fetch_mask_pixblock 2814 vqadd.u8 q14, q0, q14 2815 vqadd.u8 q15, q1, q15 2816 cache_preload 8, 8 2817 pixman_composite_over_n_8888_8888_ca_process_pixblock_head 2818 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 2819.endm 2820 2821.macro pixman_composite_over_n_8888_8888_ca_init 2822 add DUMMY, sp, #ARGS_STACK_OFFSET 2823 vpush {d8-d15} 2824 vld1.32 {d11[0]}, [DUMMY] 2825 vdup.8 d8, d11[0] 2826 vdup.8 d9, d11[1] 2827 vdup.8 d10, d11[2] 2828 vdup.8 d11, d11[3] 2829.endm 2830 2831.macro pixman_composite_over_n_8888_8888_ca_cleanup 2832 vpop {d8-d15} 2833.endm 2834 2835generate_composite_function \ 2836 pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \ 2837 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2838 8, /* number of pixels, processed in a single block */ \ 2839 5, /* prefetch distance */ \ 2840 pixman_composite_over_n_8888_8888_ca_init, \ 2841 pixman_composite_over_n_8888_8888_ca_cleanup, \ 2842 pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \ 2843 pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \ 2844 pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head 2845 2846/******************************************************************************/ 2847 2848.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head 2849 /* 2850 * 'combine_mask_ca' replacement 2851 * 2852 * input: solid src (n) in {d8, d9, d10, d11} [B, G, R, A] 2853 * mask in {d24, d25, d26} [B, G, R] 2854 * output: updated src in {d0, d1, d2 } [B, G, R] 2855 * updated mask in {d24, d25, d26} [B, G, R] 2856 */ 2857 vmull.u8 q0, d24, d8 2858 vmull.u8 q1, d25, d9 2859 vmull.u8 q6, d26, d10 2860 vmull.u8 q9, d11, d25 2861 vmull.u8 q12, d11, d24 2862 vmull.u8 q13, d11, d26 2863 vrshr.u16 q8, q0, #8 2864 vrshr.u16 q10, q1, #8 2865 vrshr.u16 q11, q6, #8 2866 vraddhn.u16 d0, q0, q8 2867 vraddhn.u16 d1, q1, q10 2868 vraddhn.u16 d2, q6, q11 2869 vrshr.u16 q11, q12, #8 2870 vrshr.u16 q8, q9, #8 2871 vrshr.u16 q6, q13, #8 2872 vraddhn.u16 d24, q12, q11 2873 vraddhn.u16 d25, q9, q8 2874 /* 2875 * convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format 2876 * and put data into d16 - blue, d17 - green, d18 - red 2877 */ 2878 vshrn.u16 d17, q2, #3 2879 vshrn.u16 d18, q2, #8 2880 vraddhn.u16 d26, q13, q6 2881 vsli.u16 q2, q2, #5 2882 vsri.u8 d18, d18, #5 2883 vsri.u8 d17, d17, #6 2884 /* 2885 * 'combine_over_ca' replacement 2886 * 2887 * output: updated dest in d16 - blue, d17 - green, d18 - red 2888 */ 2889 vmvn.8 q12, q12 2890 vshrn.u16 d16, q2, #2 2891 vmvn.8 d26, d26 2892 vmull.u8 q6, d16, d24 2893 vmull.u8 q7, d17, d25 2894 vmull.u8 q11, d18, d26 2895.endm 2896 2897.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail 2898 /* ... continue 'combine_over_ca' replacement */ 2899 vrshr.u16 q10, q6, #8 2900 vrshr.u16 q14, q7, #8 2901 vrshr.u16 q15, q11, #8 2902 vraddhn.u16 d16, q10, q6 2903 vraddhn.u16 d17, q14, q7 2904 vraddhn.u16 d18, q15, q11 2905 vqadd.u8 q8, q0, q8 2906 vqadd.u8 d18, d2, d18 2907 /* 2908 * convert the results in d16, d17, d18 to r5g6b5 and store 2909 * them into {d28, d29} 2910 */ 2911 vshll.u8 q14, d18, #8 2912 vshll.u8 q10, d17, #8 2913 vshll.u8 q15, d16, #8 2914 vsri.u16 q14, q10, #5 2915 vsri.u16 q14, q15, #11 2916.endm 2917 2918.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head 2919 fetch_mask_pixblock 2920 vrshr.u16 q10, q6, #8 2921 vrshr.u16 q14, q7, #8 2922 vld1.16 {d4, d5}, [DST_R, :128]! 2923 vrshr.u16 q15, q11, #8 2924 vraddhn.u16 d16, q10, q6 2925 vraddhn.u16 d17, q14, q7 2926 vraddhn.u16 d22, q15, q11 2927 /* process_pixblock_head */ 2928 /* 2929 * 'combine_mask_ca' replacement 2930 * 2931 * input: solid src (n) in {d8, d9, d10, d11} [B, G, R, A] 2932 * mask in {d24, d25, d26} [B, G, R] 2933 * output: updated src in {d0, d1, d2 } [B, G, R] 2934 * updated mask in {d24, d25, d26} [B, G, R] 2935 */ 2936 vmull.u8 q6, d26, d10 2937 vqadd.u8 q8, q0, q8 2938 vmull.u8 q0, d24, d8 2939 vqadd.u8 d22, d2, d22 2940 vmull.u8 q1, d25, d9 2941 /* 2942 * convert the result in d16, d17, d22 to r5g6b5 and store 2943 * it into {d28, d29} 2944 */ 2945 vshll.u8 q14, d22, #8 2946 vshll.u8 q10, d17, #8 2947 vshll.u8 q15, d16, #8 2948 vmull.u8 q9, d11, d25 2949 vsri.u16 q14, q10, #5 2950 vmull.u8 q12, d11, d24 2951 vmull.u8 q13, d11, d26 2952 vsri.u16 q14, q15, #11 2953 cache_preload 8, 8 2954 vrshr.u16 q8, q0, #8 2955 vrshr.u16 q10, q1, #8 2956 vrshr.u16 q11, q6, #8 2957 vraddhn.u16 d0, q0, q8 2958 vraddhn.u16 d1, q1, q10 2959 vraddhn.u16 d2, q6, q11 2960 vrshr.u16 q11, q12, #8 2961 vrshr.u16 q8, q9, #8 2962 vrshr.u16 q6, q13, #8 2963 vraddhn.u16 d24, q12, q11 2964 vraddhn.u16 d25, q9, q8 2965 /* 2966 * convert 8 r5g6b5 pixel data from {d4, d5} to planar 2967 * 8-bit format and put data into d16 - blue, d17 - green, 2968 * d18 - red 2969 */ 2970 vshrn.u16 d17, q2, #3 2971 vshrn.u16 d18, q2, #8 2972 vraddhn.u16 d26, q13, q6 2973 vsli.u16 q2, q2, #5 2974 vsri.u8 d17, d17, #6 2975 vsri.u8 d18, d18, #5 2976 /* 2977 * 'combine_over_ca' replacement 2978 * 2979 * output: updated dest in d16 - blue, d17 - green, d18 - red 2980 */ 2981 vmvn.8 q12, q12 2982 vshrn.u16 d16, q2, #2 2983 vmvn.8 d26, d26 2984 vmull.u8 q7, d17, d25 2985 vmull.u8 q6, d16, d24 2986 vmull.u8 q11, d18, d26 2987 vst1.16 {d28, d29}, [DST_W, :128]! 2988.endm 2989 2990.macro pixman_composite_over_n_8888_0565_ca_init 2991 add DUMMY, sp, #ARGS_STACK_OFFSET 2992 vpush {d8-d15} 2993 vld1.32 {d11[0]}, [DUMMY] 2994 vdup.8 d8, d11[0] 2995 vdup.8 d9, d11[1] 2996 vdup.8 d10, d11[2] 2997 vdup.8 d11, d11[3] 2998.endm 2999 3000.macro pixman_composite_over_n_8888_0565_ca_cleanup 3001 vpop {d8-d15} 3002.endm 3003 3004generate_composite_function \ 3005 pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \ 3006 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 3007 8, /* number of pixels, processed in a single block */ \ 3008 5, /* prefetch distance */ \ 3009 pixman_composite_over_n_8888_0565_ca_init, \ 3010 pixman_composite_over_n_8888_0565_ca_cleanup, \ 3011 pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \ 3012 pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \ 3013 pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head 3014 3015/******************************************************************************/ 3016 3017.macro pixman_composite_in_n_8_process_pixblock_head 3018 /* expecting source data in {d0, d1, d2, d3} */ 3019 /* and destination data in {d4, d5, d6, d7} */ 3020 vmull.u8 q8, d4, d3 3021 vmull.u8 q9, d5, d3 3022 vmull.u8 q10, d6, d3 3023 vmull.u8 q11, d7, d3 3024.endm 3025 3026.macro pixman_composite_in_n_8_process_pixblock_tail 3027 vrshr.u16 q14, q8, #8 3028 vrshr.u16 q15, q9, #8 3029 vrshr.u16 q12, q10, #8 3030 vrshr.u16 q13, q11, #8 3031 vraddhn.u16 d28, q8, q14 3032 vraddhn.u16 d29, q9, q15 3033 vraddhn.u16 d30, q10, q12 3034 vraddhn.u16 d31, q11, q13 3035.endm 3036 3037.macro pixman_composite_in_n_8_process_pixblock_tail_head 3038 pixman_composite_in_n_8_process_pixblock_tail 3039 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! 3040 cache_preload 32, 32 3041 pixman_composite_in_n_8_process_pixblock_head 3042 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! 3043.endm 3044 3045.macro pixman_composite_in_n_8_init 3046 add DUMMY, sp, #ARGS_STACK_OFFSET 3047 vld1.32 {d3[0]}, [DUMMY] 3048 vdup.8 d3, d3[3] 3049.endm 3050 3051.macro pixman_composite_in_n_8_cleanup 3052.endm 3053 3054generate_composite_function \ 3055 pixman_composite_in_n_8_asm_neon, 0, 0, 8, \ 3056 FLAG_DST_READWRITE, \ 3057 32, /* number of pixels, processed in a single block */ \ 3058 5, /* prefetch distance */ \ 3059 pixman_composite_in_n_8_init, \ 3060 pixman_composite_in_n_8_cleanup, \ 3061 pixman_composite_in_n_8_process_pixblock_head, \ 3062 pixman_composite_in_n_8_process_pixblock_tail, \ 3063 pixman_composite_in_n_8_process_pixblock_tail_head, \ 3064 28, /* dst_w_basereg */ \ 3065 4, /* dst_r_basereg */ \ 3066 0, /* src_basereg */ \ 3067 24 /* mask_basereg */ 3068 3069.macro pixman_composite_add_n_8_8_process_pixblock_head 3070 /* expecting source data in {d8, d9, d10, d11} */ 3071 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */ 3072 /* and destination data in {d4, d5, d6, d7} */ 3073 /* mask is in d24, d25, d26, d27 */ 3074 vmull.u8 q0, d24, d11 3075 vmull.u8 q1, d25, d11 3076 vmull.u8 q6, d26, d11 3077 vmull.u8 q7, d27, d11 3078 vrshr.u16 q10, q0, #8 3079 vrshr.u16 q11, q1, #8 3080 vrshr.u16 q12, q6, #8 3081 vrshr.u16 q13, q7, #8 3082 vraddhn.u16 d0, q0, q10 3083 vraddhn.u16 d1, q1, q11 3084 vraddhn.u16 d2, q6, q12 3085 vraddhn.u16 d3, q7, q13 3086 vqadd.u8 q14, q0, q2 3087 vqadd.u8 q15, q1, q3 3088.endm 3089 3090.macro pixman_composite_add_n_8_8_process_pixblock_tail 3091.endm 3092 3093/* TODO: expand macros and do better instructions scheduling */ 3094.macro pixman_composite_add_n_8_8_process_pixblock_tail_head 3095 pixman_composite_add_n_8_8_process_pixblock_tail 3096 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! 3097 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! 3098 fetch_mask_pixblock 3099 cache_preload 32, 32 3100 pixman_composite_add_n_8_8_process_pixblock_head 3101.endm 3102 3103.macro pixman_composite_add_n_8_8_init 3104 add DUMMY, sp, #ARGS_STACK_OFFSET 3105 vpush {d8-d15} 3106 vld1.32 {d11[0]}, [DUMMY] 3107 vdup.8 d11, d11[3] 3108.endm 3109 3110.macro pixman_composite_add_n_8_8_cleanup 3111 vpop {d8-d15} 3112.endm 3113 3114generate_composite_function \ 3115 pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \ 3116 FLAG_DST_READWRITE, \ 3117 32, /* number of pixels, processed in a single block */ \ 3118 5, /* prefetch distance */ \ 3119 pixman_composite_add_n_8_8_init, \ 3120 pixman_composite_add_n_8_8_cleanup, \ 3121 pixman_composite_add_n_8_8_process_pixblock_head, \ 3122 pixman_composite_add_n_8_8_process_pixblock_tail, \ 3123 pixman_composite_add_n_8_8_process_pixblock_tail_head 3124 3125/******************************************************************************/ 3126 3127.macro pixman_composite_add_8_8_8_process_pixblock_head 3128 /* expecting source data in {d0, d1, d2, d3} */ 3129 /* destination data in {d4, d5, d6, d7} */ 3130 /* mask in {d24, d25, d26, d27} */ 3131 vmull.u8 q8, d24, d0 3132 vmull.u8 q9, d25, d1 3133 vmull.u8 q10, d26, d2 3134 vmull.u8 q11, d27, d3 3135 vrshr.u16 q0, q8, #8 3136 vrshr.u16 q1, q9, #8 3137 vrshr.u16 q12, q10, #8 3138 vrshr.u16 q13, q11, #8 3139 vraddhn.u16 d0, q0, q8 3140 vraddhn.u16 d1, q1, q9 3141 vraddhn.u16 d2, q12, q10 3142 vraddhn.u16 d3, q13, q11 3143 vqadd.u8 q14, q0, q2 3144 vqadd.u8 q15, q1, q3 3145.endm 3146 3147.macro pixman_composite_add_8_8_8_process_pixblock_tail 3148.endm 3149 3150/* TODO: expand macros and do better instructions scheduling */ 3151.macro pixman_composite_add_8_8_8_process_pixblock_tail_head 3152 pixman_composite_add_8_8_8_process_pixblock_tail 3153 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! 3154 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! 3155 fetch_mask_pixblock 3156 fetch_src_pixblock 3157 cache_preload 32, 32 3158 pixman_composite_add_8_8_8_process_pixblock_head 3159.endm 3160 3161.macro pixman_composite_add_8_8_8_init 3162.endm 3163 3164.macro pixman_composite_add_8_8_8_cleanup 3165.endm 3166 3167generate_composite_function \ 3168 pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \ 3169 FLAG_DST_READWRITE, \ 3170 32, /* number of pixels, processed in a single block */ \ 3171 5, /* prefetch distance */ \ 3172 pixman_composite_add_8_8_8_init, \ 3173 pixman_composite_add_8_8_8_cleanup, \ 3174 pixman_composite_add_8_8_8_process_pixblock_head, \ 3175 pixman_composite_add_8_8_8_process_pixblock_tail, \ 3176 pixman_composite_add_8_8_8_process_pixblock_tail_head 3177 3178/******************************************************************************/ 3179 3180.macro pixman_composite_add_8888_8888_8888_process_pixblock_head 3181 /* expecting source data in {d0, d1, d2, d3} */ 3182 /* destination data in {d4, d5, d6, d7} */ 3183 /* mask in {d24, d25, d26, d27} */ 3184 vmull.u8 q8, d27, d0 3185 vmull.u8 q9, d27, d1 3186 vmull.u8 q10, d27, d2 3187 vmull.u8 q11, d27, d3 3188 /* 1 cycle bubble */ 3189 vrsra.u16 q8, q8, #8 3190 vrsra.u16 q9, q9, #8 3191 vrsra.u16 q10, q10, #8 3192 vrsra.u16 q11, q11, #8 3193.endm 3194 3195.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail 3196 /* 2 cycle bubble */ 3197 vrshrn.u16 d28, q8, #8 3198 vrshrn.u16 d29, q9, #8 3199 vrshrn.u16 d30, q10, #8 3200 vrshrn.u16 d31, q11, #8 3201 vqadd.u8 q14, q2, q14 3202 /* 1 cycle bubble */ 3203 vqadd.u8 q15, q3, q15 3204.endm 3205 3206.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head 3207 fetch_src_pixblock 3208 vrshrn.u16 d28, q8, #8 3209 fetch_mask_pixblock 3210 vrshrn.u16 d29, q9, #8 3211 vmull.u8 q8, d27, d0 3212 vrshrn.u16 d30, q10, #8 3213 vmull.u8 q9, d27, d1 3214 vrshrn.u16 d31, q11, #8 3215 vmull.u8 q10, d27, d2 3216 vqadd.u8 q14, q2, q14 3217 vmull.u8 q11, d27, d3 3218 vqadd.u8 q15, q3, q15 3219 vrsra.u16 q8, q8, #8 3220 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 3221 vrsra.u16 q9, q9, #8 3222 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 3223 vrsra.u16 q10, q10, #8 3224 3225 cache_preload 8, 8 3226 3227 vrsra.u16 q11, q11, #8 3228.endm 3229 3230generate_composite_function \ 3231 pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \ 3232 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 3233 8, /* number of pixels, processed in a single block */ \ 3234 10, /* prefetch distance */ \ 3235 default_init, \ 3236 default_cleanup, \ 3237 pixman_composite_add_8888_8888_8888_process_pixblock_head, \ 3238 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ 3239 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head 3240 3241generate_composite_function_single_scanline \ 3242 pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \ 3243 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 3244 8, /* number of pixels, processed in a single block */ \ 3245 default_init, \ 3246 default_cleanup, \ 3247 pixman_composite_add_8888_8888_8888_process_pixblock_head, \ 3248 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ 3249 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head 3250 3251/******************************************************************************/ 3252 3253generate_composite_function \ 3254 pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \ 3255 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 3256 8, /* number of pixels, processed in a single block */ \ 3257 5, /* prefetch distance */ \ 3258 default_init, \ 3259 default_cleanup, \ 3260 pixman_composite_add_8888_8888_8888_process_pixblock_head, \ 3261 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ 3262 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ 3263 28, /* dst_w_basereg */ \ 3264 4, /* dst_r_basereg */ \ 3265 0, /* src_basereg */ \ 3266 27 /* mask_basereg */ 3267 3268/******************************************************************************/ 3269 3270.macro pixman_composite_add_n_8_8888_init 3271 add DUMMY, sp, #ARGS_STACK_OFFSET 3272 vld1.32 {d3[0]}, [DUMMY] 3273 vdup.8 d0, d3[0] 3274 vdup.8 d1, d3[1] 3275 vdup.8 d2, d3[2] 3276 vdup.8 d3, d3[3] 3277.endm 3278 3279.macro pixman_composite_add_n_8_8888_cleanup 3280.endm 3281 3282generate_composite_function \ 3283 pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \ 3284 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 3285 8, /* number of pixels, processed in a single block */ \ 3286 5, /* prefetch distance */ \ 3287 pixman_composite_add_n_8_8888_init, \ 3288 pixman_composite_add_n_8_8888_cleanup, \ 3289 pixman_composite_add_8888_8888_8888_process_pixblock_head, \ 3290 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ 3291 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ 3292 28, /* dst_w_basereg */ \ 3293 4, /* dst_r_basereg */ \ 3294 0, /* src_basereg */ \ 3295 27 /* mask_basereg */ 3296 3297/******************************************************************************/ 3298 3299.macro pixman_composite_add_8888_n_8888_init 3300 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) 3301 vld1.32 {d27[0]}, [DUMMY] 3302 vdup.8 d27, d27[3] 3303.endm 3304 3305.macro pixman_composite_add_8888_n_8888_cleanup 3306.endm 3307 3308generate_composite_function \ 3309 pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \ 3310 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 3311 8, /* number of pixels, processed in a single block */ \ 3312 5, /* prefetch distance */ \ 3313 pixman_composite_add_8888_n_8888_init, \ 3314 pixman_composite_add_8888_n_8888_cleanup, \ 3315 pixman_composite_add_8888_8888_8888_process_pixblock_head, \ 3316 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ 3317 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ 3318 28, /* dst_w_basereg */ \ 3319 4, /* dst_r_basereg */ \ 3320 0, /* src_basereg */ \ 3321 27 /* mask_basereg */ 3322 3323/******************************************************************************/ 3324 3325.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head 3326 /* expecting source data in {d0, d1, d2, d3} */ 3327 /* destination data in {d4, d5, d6, d7} */ 3328 /* solid mask is in d15 */ 3329 3330 /* 'in' */ 3331 vmull.u8 q8, d15, d3 3332 vmull.u8 q6, d15, d2 3333 vmull.u8 q5, d15, d1 3334 vmull.u8 q4, d15, d0 3335 vrshr.u16 q13, q8, #8 3336 vrshr.u16 q12, q6, #8 3337 vrshr.u16 q11, q5, #8 3338 vrshr.u16 q10, q4, #8 3339 vraddhn.u16 d3, q8, q13 3340 vraddhn.u16 d2, q6, q12 3341 vraddhn.u16 d1, q5, q11 3342 vraddhn.u16 d0, q4, q10 3343 vmvn.8 d24, d3 /* get inverted alpha */ 3344 /* now do alpha blending */ 3345 vmull.u8 q8, d24, d4 3346 vmull.u8 q9, d24, d5 3347 vmull.u8 q10, d24, d6 3348 vmull.u8 q11, d24, d7 3349.endm 3350 3351.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail 3352 vrshr.u16 q14, q8, #8 3353 vrshr.u16 q15, q9, #8 3354 vrshr.u16 q12, q10, #8 3355 vrshr.u16 q13, q11, #8 3356 vraddhn.u16 d28, q14, q8 3357 vraddhn.u16 d29, q15, q9 3358 vraddhn.u16 d30, q12, q10 3359 vraddhn.u16 d31, q13, q11 3360.endm 3361 3362/* TODO: expand macros and do better instructions scheduling */ 3363.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head 3364 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 3365 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail 3366 fetch_src_pixblock 3367 cache_preload 8, 8 3368 fetch_mask_pixblock 3369 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head 3370 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 3371.endm 3372 3373generate_composite_function_single_scanline \ 3374 pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \ 3375 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 3376 8, /* number of pixels, processed in a single block */ \ 3377 default_init_need_all_regs, \ 3378 default_cleanup_need_all_regs, \ 3379 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \ 3380 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \ 3381 pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \ 3382 28, /* dst_w_basereg */ \ 3383 4, /* dst_r_basereg */ \ 3384 0, /* src_basereg */ \ 3385 12 /* mask_basereg */ 3386 3387/******************************************************************************/ 3388 3389.macro pixman_composite_over_8888_n_8888_process_pixblock_head 3390 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head 3391.endm 3392 3393.macro pixman_composite_over_8888_n_8888_process_pixblock_tail 3394 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail 3395 vqadd.u8 q14, q0, q14 3396 vqadd.u8 q15, q1, q15 3397.endm 3398 3399/* TODO: expand macros and do better instructions scheduling */ 3400.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head 3401 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 3402 pixman_composite_over_8888_n_8888_process_pixblock_tail 3403 fetch_src_pixblock 3404 cache_preload 8, 8 3405 pixman_composite_over_8888_n_8888_process_pixblock_head 3406 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 3407.endm 3408 3409.macro pixman_composite_over_8888_n_8888_init 3410 add DUMMY, sp, #48 3411 vpush {d8-d15} 3412 vld1.32 {d15[0]}, [DUMMY] 3413 vdup.8 d15, d15[3] 3414.endm 3415 3416.macro pixman_composite_over_8888_n_8888_cleanup 3417 vpop {d8-d15} 3418.endm 3419 3420generate_composite_function \ 3421 pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \ 3422 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 3423 8, /* number of pixels, processed in a single block */ \ 3424 5, /* prefetch distance */ \ 3425 pixman_composite_over_8888_n_8888_init, \ 3426 pixman_composite_over_8888_n_8888_cleanup, \ 3427 pixman_composite_over_8888_n_8888_process_pixblock_head, \ 3428 pixman_composite_over_8888_n_8888_process_pixblock_tail, \ 3429 pixman_composite_over_8888_n_8888_process_pixblock_tail_head 3430 3431/******************************************************************************/ 3432 3433/* TODO: expand macros and do better instructions scheduling */ 3434.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head 3435 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 3436 pixman_composite_over_8888_n_8888_process_pixblock_tail 3437 fetch_src_pixblock 3438 cache_preload 8, 8 3439 fetch_mask_pixblock 3440 pixman_composite_over_8888_n_8888_process_pixblock_head 3441 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 3442.endm 3443 3444generate_composite_function \ 3445 pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \ 3446 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 3447 8, /* number of pixels, processed in a single block */ \ 3448 5, /* prefetch distance */ \ 3449 default_init_need_all_regs, \ 3450 default_cleanup_need_all_regs, \ 3451 pixman_composite_over_8888_n_8888_process_pixblock_head, \ 3452 pixman_composite_over_8888_n_8888_process_pixblock_tail, \ 3453 pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \ 3454 28, /* dst_w_basereg */ \ 3455 4, /* dst_r_basereg */ \ 3456 0, /* src_basereg */ \ 3457 12 /* mask_basereg */ 3458 3459generate_composite_function_single_scanline \ 3460 pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \ 3461 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 3462 8, /* number of pixels, processed in a single block */ \ 3463 default_init_need_all_regs, \ 3464 default_cleanup_need_all_regs, \ 3465 pixman_composite_over_8888_n_8888_process_pixblock_head, \ 3466 pixman_composite_over_8888_n_8888_process_pixblock_tail, \ 3467 pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \ 3468 28, /* dst_w_basereg */ \ 3469 4, /* dst_r_basereg */ \ 3470 0, /* src_basereg */ \ 3471 12 /* mask_basereg */ 3472 3473/******************************************************************************/ 3474 3475/* TODO: expand macros and do better instructions scheduling */ 3476.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head 3477 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 3478 pixman_composite_over_8888_n_8888_process_pixblock_tail 3479 fetch_src_pixblock 3480 cache_preload 8, 8 3481 fetch_mask_pixblock 3482 pixman_composite_over_8888_n_8888_process_pixblock_head 3483 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 3484.endm 3485 3486generate_composite_function \ 3487 pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \ 3488 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 3489 8, /* number of pixels, processed in a single block */ \ 3490 5, /* prefetch distance */ \ 3491 default_init_need_all_regs, \ 3492 default_cleanup_need_all_regs, \ 3493 pixman_composite_over_8888_n_8888_process_pixblock_head, \ 3494 pixman_composite_over_8888_n_8888_process_pixblock_tail, \ 3495 pixman_composite_over_8888_8_8888_process_pixblock_tail_head \ 3496 28, /* dst_w_basereg */ \ 3497 4, /* dst_r_basereg */ \ 3498 0, /* src_basereg */ \ 3499 15 /* mask_basereg */ 3500 3501/******************************************************************************/ 3502 3503.macro pixman_composite_src_0888_0888_process_pixblock_head 3504.endm 3505 3506.macro pixman_composite_src_0888_0888_process_pixblock_tail 3507.endm 3508 3509.macro pixman_composite_src_0888_0888_process_pixblock_tail_head 3510 vst3.8 {d0, d1, d2}, [DST_W]! 3511 fetch_src_pixblock 3512 cache_preload 8, 8 3513.endm 3514 3515generate_composite_function \ 3516 pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \ 3517 FLAG_DST_WRITEONLY, \ 3518 8, /* number of pixels, processed in a single block */ \ 3519 10, /* prefetch distance */ \ 3520 default_init, \ 3521 default_cleanup, \ 3522 pixman_composite_src_0888_0888_process_pixblock_head, \ 3523 pixman_composite_src_0888_0888_process_pixblock_tail, \ 3524 pixman_composite_src_0888_0888_process_pixblock_tail_head, \ 3525 0, /* dst_w_basereg */ \ 3526 0, /* dst_r_basereg */ \ 3527 0, /* src_basereg */ \ 3528 0 /* mask_basereg */ 3529 3530/******************************************************************************/ 3531 3532.macro pixman_composite_src_0888_8888_rev_process_pixblock_head 3533 vswp d0, d2 3534.endm 3535 3536.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail 3537.endm 3538 3539.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head 3540 vst4.8 {d0, d1, d2, d3}, [DST_W]! 3541 fetch_src_pixblock 3542 vswp d0, d2 3543 cache_preload 8, 8 3544.endm 3545 3546.macro pixman_composite_src_0888_8888_rev_init 3547 veor d3, d3, d3 3548.endm 3549 3550generate_composite_function \ 3551 pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \ 3552 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ 3553 8, /* number of pixels, processed in a single block */ \ 3554 10, /* prefetch distance */ \ 3555 pixman_composite_src_0888_8888_rev_init, \ 3556 default_cleanup, \ 3557 pixman_composite_src_0888_8888_rev_process_pixblock_head, \ 3558 pixman_composite_src_0888_8888_rev_process_pixblock_tail, \ 3559 pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \ 3560 0, /* dst_w_basereg */ \ 3561 0, /* dst_r_basereg */ \ 3562 0, /* src_basereg */ \ 3563 0 /* mask_basereg */ 3564 3565/******************************************************************************/ 3566 3567.macro pixman_composite_src_0888_0565_rev_process_pixblock_head 3568 vshll.u8 q8, d1, #8 3569 vshll.u8 q9, d2, #8 3570.endm 3571 3572.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail 3573 vshll.u8 q14, d0, #8 3574 vsri.u16 q14, q8, #5 3575 vsri.u16 q14, q9, #11 3576.endm 3577 3578.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head 3579 vshll.u8 q14, d0, #8 3580 fetch_src_pixblock 3581 vsri.u16 q14, q8, #5 3582 vsri.u16 q14, q9, #11 3583 vshll.u8 q8, d1, #8 3584 vst1.16 {d28, d29}, [DST_W, :128]! 3585 vshll.u8 q9, d2, #8 3586.endm 3587 3588generate_composite_function \ 3589 pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \ 3590 FLAG_DST_WRITEONLY, \ 3591 8, /* number of pixels, processed in a single block */ \ 3592 10, /* prefetch distance */ \ 3593 default_init, \ 3594 default_cleanup, \ 3595 pixman_composite_src_0888_0565_rev_process_pixblock_head, \ 3596 pixman_composite_src_0888_0565_rev_process_pixblock_tail, \ 3597 pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \ 3598 28, /* dst_w_basereg */ \ 3599 0, /* dst_r_basereg */ \ 3600 0, /* src_basereg */ \ 3601 0 /* mask_basereg */ 3602 3603/******************************************************************************/ 3604 3605.macro pixman_composite_src_pixbuf_8888_process_pixblock_head 3606 vmull.u8 q8, d3, d0 3607 vmull.u8 q9, d3, d1 3608 vmull.u8 q10, d3, d2 3609.endm 3610 3611.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail 3612 vrshr.u16 q11, q8, #8 3613 vswp d3, d31 3614 vrshr.u16 q12, q9, #8 3615 vrshr.u16 q13, q10, #8 3616 vraddhn.u16 d30, q11, q8 3617 vraddhn.u16 d29, q12, q9 3618 vraddhn.u16 d28, q13, q10 3619.endm 3620 3621.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head 3622 vrshr.u16 q11, q8, #8 3623 vswp d3, d31 3624 vrshr.u16 q12, q9, #8 3625 vrshr.u16 q13, q10, #8 3626 fetch_src_pixblock 3627 vraddhn.u16 d30, q11, q8 3628 PF add PF_X, PF_X, #8 3629 PF tst PF_CTL, #0xF 3630 PF addne PF_X, PF_X, #8 3631 PF subne PF_CTL, PF_CTL, #1 3632 vraddhn.u16 d29, q12, q9 3633 vraddhn.u16 d28, q13, q10 3634 vmull.u8 q8, d3, d0 3635 vmull.u8 q9, d3, d1 3636 vmull.u8 q10, d3, d2 3637 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 3638 PF cmp PF_X, ORIG_W 3639 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] 3640 PF subge PF_X, PF_X, ORIG_W 3641 PF subges PF_CTL, PF_CTL, #0x10 3642 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 3643.endm 3644 3645generate_composite_function \ 3646 pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \ 3647 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ 3648 8, /* number of pixels, processed in a single block */ \ 3649 10, /* prefetch distance */ \ 3650 default_init, \ 3651 default_cleanup, \ 3652 pixman_composite_src_pixbuf_8888_process_pixblock_head, \ 3653 pixman_composite_src_pixbuf_8888_process_pixblock_tail, \ 3654 pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \ 3655 28, /* dst_w_basereg */ \ 3656 0, /* dst_r_basereg */ \ 3657 0, /* src_basereg */ \ 3658 0 /* mask_basereg */ 3659 3660/******************************************************************************/ 3661 3662.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head 3663 vmull.u8 q8, d3, d0 3664 vmull.u8 q9, d3, d1 3665 vmull.u8 q10, d3, d2 3666.endm 3667 3668.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail 3669 vrshr.u16 q11, q8, #8 3670 vswp d3, d31 3671 vrshr.u16 q12, q9, #8 3672 vrshr.u16 q13, q10, #8 3673 vraddhn.u16 d28, q11, q8 3674 vraddhn.u16 d29, q12, q9 3675 vraddhn.u16 d30, q13, q10 3676.endm 3677 3678.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head 3679 vrshr.u16 q11, q8, #8 3680 vswp d3, d31 3681 vrshr.u16 q12, q9, #8 3682 vrshr.u16 q13, q10, #8 3683 fetch_src_pixblock 3684 vraddhn.u16 d28, q11, q8 3685 PF add PF_X, PF_X, #8 3686 PF tst PF_CTL, #0xF 3687 PF addne PF_X, PF_X, #8 3688 PF subne PF_CTL, PF_CTL, #1 3689 vraddhn.u16 d29, q12, q9 3690 vraddhn.u16 d30, q13, q10 3691 vmull.u8 q8, d3, d0 3692 vmull.u8 q9, d3, d1 3693 vmull.u8 q10, d3, d2 3694 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 3695 PF cmp PF_X, ORIG_W 3696 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] 3697 PF subge PF_X, PF_X, ORIG_W 3698 PF subges PF_CTL, PF_CTL, #0x10 3699 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 3700.endm 3701 3702generate_composite_function \ 3703 pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \ 3704 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ 3705 8, /* number of pixels, processed in a single block */ \ 3706 10, /* prefetch distance */ \ 3707 default_init, \ 3708 default_cleanup, \ 3709 pixman_composite_src_rpixbuf_8888_process_pixblock_head, \ 3710 pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \ 3711 pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \ 3712 28, /* dst_w_basereg */ \ 3713 0, /* dst_r_basereg */ \ 3714 0, /* src_basereg */ \ 3715 0 /* mask_basereg */ 3716 3717/******************************************************************************/ 3718 3719.macro pixman_composite_over_0565_8_0565_process_pixblock_head 3720 /* mask is in d15 */ 3721 convert_0565_to_x888 q4, d2, d1, d0 3722 convert_0565_to_x888 q5, d6, d5, d4 3723 /* source pixel data is in {d0, d1, d2, XX} */ 3724 /* destination pixel data is in {d4, d5, d6, XX} */ 3725 vmvn.8 d7, d15 3726 vmull.u8 q6, d15, d2 3727 vmull.u8 q5, d15, d1 3728 vmull.u8 q4, d15, d0 3729 vmull.u8 q8, d7, d4 3730 vmull.u8 q9, d7, d5 3731 vmull.u8 q13, d7, d6 3732 vrshr.u16 q12, q6, #8 3733 vrshr.u16 q11, q5, #8 3734 vrshr.u16 q10, q4, #8 3735 vraddhn.u16 d2, q6, q12 3736 vraddhn.u16 d1, q5, q11 3737 vraddhn.u16 d0, q4, q10 3738.endm 3739 3740.macro pixman_composite_over_0565_8_0565_process_pixblock_tail 3741 vrshr.u16 q14, q8, #8 3742 vrshr.u16 q15, q9, #8 3743 vrshr.u16 q12, q13, #8 3744 vraddhn.u16 d28, q14, q8 3745 vraddhn.u16 d29, q15, q9 3746 vraddhn.u16 d30, q12, q13 3747 vqadd.u8 q0, q0, q14 3748 vqadd.u8 q1, q1, q15 3749 /* 32bpp result is in {d0, d1, d2, XX} */ 3750 convert_8888_to_0565 d2, d1, d0, q14, q15, q3 3751.endm 3752 3753/* TODO: expand macros and do better instructions scheduling */ 3754.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head 3755 fetch_mask_pixblock 3756 pixman_composite_over_0565_8_0565_process_pixblock_tail 3757 fetch_src_pixblock 3758 vld1.16 {d10, d11}, [DST_R, :128]! 3759 cache_preload 8, 8 3760 pixman_composite_over_0565_8_0565_process_pixblock_head 3761 vst1.16 {d28, d29}, [DST_W, :128]! 3762.endm 3763 3764generate_composite_function \ 3765 pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \ 3766 FLAG_DST_READWRITE, \ 3767 8, /* number of pixels, processed in a single block */ \ 3768 5, /* prefetch distance */ \ 3769 default_init_need_all_regs, \ 3770 default_cleanup_need_all_regs, \ 3771 pixman_composite_over_0565_8_0565_process_pixblock_head, \ 3772 pixman_composite_over_0565_8_0565_process_pixblock_tail, \ 3773 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \ 3774 28, /* dst_w_basereg */ \ 3775 10, /* dst_r_basereg */ \ 3776 8, /* src_basereg */ \ 3777 15 /* mask_basereg */ 3778 3779/******************************************************************************/ 3780 3781.macro pixman_composite_over_0565_n_0565_init 3782 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) 3783 vpush {d8-d15} 3784 vld1.32 {d15[0]}, [DUMMY] 3785 vdup.8 d15, d15[3] 3786.endm 3787 3788.macro pixman_composite_over_0565_n_0565_cleanup 3789 vpop {d8-d15} 3790.endm 3791 3792generate_composite_function \ 3793 pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \ 3794 FLAG_DST_READWRITE, \ 3795 8, /* number of pixels, processed in a single block */ \ 3796 5, /* prefetch distance */ \ 3797 pixman_composite_over_0565_n_0565_init, \ 3798 pixman_composite_over_0565_n_0565_cleanup, \ 3799 pixman_composite_over_0565_8_0565_process_pixblock_head, \ 3800 pixman_composite_over_0565_8_0565_process_pixblock_tail, \ 3801 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \ 3802 28, /* dst_w_basereg */ \ 3803 10, /* dst_r_basereg */ \ 3804 8, /* src_basereg */ \ 3805 15 /* mask_basereg */ 3806 3807/******************************************************************************/ 3808 3809.macro pixman_composite_add_0565_8_0565_process_pixblock_head 3810 /* mask is in d15 */ 3811 convert_0565_to_x888 q4, d2, d1, d0 3812 convert_0565_to_x888 q5, d6, d5, d4 3813 /* source pixel data is in {d0, d1, d2, XX} */ 3814 /* destination pixel data is in {d4, d5, d6, XX} */ 3815 vmull.u8 q6, d15, d2 3816 vmull.u8 q5, d15, d1 3817 vmull.u8 q4, d15, d0 3818 vrshr.u16 q12, q6, #8 3819 vrshr.u16 q11, q5, #8 3820 vrshr.u16 q10, q4, #8 3821 vraddhn.u16 d2, q6, q12 3822 vraddhn.u16 d1, q5, q11 3823 vraddhn.u16 d0, q4, q10 3824.endm 3825 3826.macro pixman_composite_add_0565_8_0565_process_pixblock_tail 3827 vqadd.u8 q0, q0, q2 3828 vqadd.u8 q1, q1, q3 3829 /* 32bpp result is in {d0, d1, d2, XX} */ 3830 convert_8888_to_0565 d2, d1, d0, q14, q15, q3 3831.endm 3832 3833/* TODO: expand macros and do better instructions scheduling */ 3834.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head 3835 fetch_mask_pixblock 3836 pixman_composite_add_0565_8_0565_process_pixblock_tail 3837 fetch_src_pixblock 3838 vld1.16 {d10, d11}, [DST_R, :128]! 3839 cache_preload 8, 8 3840 pixman_composite_add_0565_8_0565_process_pixblock_head 3841 vst1.16 {d28, d29}, [DST_W, :128]! 3842.endm 3843 3844generate_composite_function \ 3845 pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \ 3846 FLAG_DST_READWRITE, \ 3847 8, /* number of pixels, processed in a single block */ \ 3848 5, /* prefetch distance */ \ 3849 default_init_need_all_regs, \ 3850 default_cleanup_need_all_regs, \ 3851 pixman_composite_add_0565_8_0565_process_pixblock_head, \ 3852 pixman_composite_add_0565_8_0565_process_pixblock_tail, \ 3853 pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \ 3854 28, /* dst_w_basereg */ \ 3855 10, /* dst_r_basereg */ \ 3856 8, /* src_basereg */ \ 3857 15 /* mask_basereg */ 3858 3859/******************************************************************************/ 3860 3861.macro pixman_composite_out_reverse_8_0565_process_pixblock_head 3862 /* mask is in d15 */ 3863 convert_0565_to_x888 q5, d6, d5, d4 3864 /* destination pixel data is in {d4, d5, d6, xx} */ 3865 vmvn.8 d24, d15 /* get inverted alpha */ 3866 /* now do alpha blending */ 3867 vmull.u8 q8, d24, d4 3868 vmull.u8 q9, d24, d5 3869 vmull.u8 q10, d24, d6 3870.endm 3871 3872.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail 3873 vrshr.u16 q14, q8, #8 3874 vrshr.u16 q15, q9, #8 3875 vrshr.u16 q12, q10, #8 3876 vraddhn.u16 d0, q14, q8 3877 vraddhn.u16 d1, q15, q9 3878 vraddhn.u16 d2, q12, q10 3879 /* 32bpp result is in {d0, d1, d2, XX} */ 3880 convert_8888_to_0565 d2, d1, d0, q14, q15, q3 3881.endm 3882 3883/* TODO: expand macros and do better instructions scheduling */ 3884.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head 3885 fetch_src_pixblock 3886 pixman_composite_out_reverse_8_0565_process_pixblock_tail 3887 vld1.16 {d10, d11}, [DST_R, :128]! 3888 cache_preload 8, 8 3889 pixman_composite_out_reverse_8_0565_process_pixblock_head 3890 vst1.16 {d28, d29}, [DST_W, :128]! 3891.endm 3892 3893generate_composite_function \ 3894 pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \ 3895 FLAG_DST_READWRITE, \ 3896 8, /* number of pixels, processed in a single block */ \ 3897 5, /* prefetch distance */ \ 3898 default_init_need_all_regs, \ 3899 default_cleanup_need_all_regs, \ 3900 pixman_composite_out_reverse_8_0565_process_pixblock_head, \ 3901 pixman_composite_out_reverse_8_0565_process_pixblock_tail, \ 3902 pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \ 3903 28, /* dst_w_basereg */ \ 3904 10, /* dst_r_basereg */ \ 3905 15, /* src_basereg */ \ 3906 0 /* mask_basereg */ 3907 3908/******************************************************************************/ 3909 3910.macro pixman_composite_out_reverse_8_8888_process_pixblock_head 3911 /* src is in d0 */ 3912 /* destination pixel data is in {d4, d5, d6, d7} */ 3913 vmvn.8 d1, d0 /* get inverted alpha */ 3914 /* now do alpha blending */ 3915 vmull.u8 q8, d1, d4 3916 vmull.u8 q9, d1, d5 3917 vmull.u8 q10, d1, d6 3918 vmull.u8 q11, d1, d7 3919.endm 3920 3921.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail 3922 vrshr.u16 q14, q8, #8 3923 vrshr.u16 q15, q9, #8 3924 vrshr.u16 q12, q10, #8 3925 vrshr.u16 q13, q11, #8 3926 vraddhn.u16 d28, q14, q8 3927 vraddhn.u16 d29, q15, q9 3928 vraddhn.u16 d30, q12, q10 3929 vraddhn.u16 d31, q13, q11 3930 /* 32bpp result is in {d28, d29, d30, d31} */ 3931.endm 3932 3933/* TODO: expand macros and do better instructions scheduling */ 3934.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head 3935 fetch_src_pixblock 3936 pixman_composite_out_reverse_8_8888_process_pixblock_tail 3937 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 3938 cache_preload 8, 8 3939 pixman_composite_out_reverse_8_8888_process_pixblock_head 3940 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 3941.endm 3942 3943generate_composite_function \ 3944 pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \ 3945 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 3946 8, /* number of pixels, processed in a single block */ \ 3947 5, /* prefetch distance */ \ 3948 default_init, \ 3949 default_cleanup, \ 3950 pixman_composite_out_reverse_8_8888_process_pixblock_head, \ 3951 pixman_composite_out_reverse_8_8888_process_pixblock_tail, \ 3952 pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \ 3953 28, /* dst_w_basereg */ \ 3954 4, /* dst_r_basereg */ \ 3955 0, /* src_basereg */ \ 3956 0 /* mask_basereg */ 3957 3958/******************************************************************************/ 3959 3960generate_composite_function_nearest_scanline \ 3961 pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \ 3962 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 3963 8, /* number of pixels, processed in a single block */ \ 3964 default_init, \ 3965 default_cleanup, \ 3966 pixman_composite_over_8888_8888_process_pixblock_head, \ 3967 pixman_composite_over_8888_8888_process_pixblock_tail, \ 3968 pixman_composite_over_8888_8888_process_pixblock_tail_head 3969 3970generate_composite_function_nearest_scanline \ 3971 pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \ 3972 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 3973 8, /* number of pixels, processed in a single block */ \ 3974 default_init, \ 3975 default_cleanup, \ 3976 pixman_composite_over_8888_0565_process_pixblock_head, \ 3977 pixman_composite_over_8888_0565_process_pixblock_tail, \ 3978 pixman_composite_over_8888_0565_process_pixblock_tail_head, \ 3979 28, /* dst_w_basereg */ \ 3980 4, /* dst_r_basereg */ \ 3981 0, /* src_basereg */ \ 3982 24 /* mask_basereg */ 3983 3984generate_composite_function_nearest_scanline \ 3985 pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \ 3986 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ 3987 8, /* number of pixels, processed in a single block */ \ 3988 default_init, \ 3989 default_cleanup, \ 3990 pixman_composite_src_8888_0565_process_pixblock_head, \ 3991 pixman_composite_src_8888_0565_process_pixblock_tail, \ 3992 pixman_composite_src_8888_0565_process_pixblock_tail_head 3993 3994generate_composite_function_nearest_scanline \ 3995 pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \ 3996 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ 3997 8, /* number of pixels, processed in a single block */ \ 3998 default_init, \ 3999 default_cleanup, \ 4000 pixman_composite_src_0565_8888_process_pixblock_head, \ 4001 pixman_composite_src_0565_8888_process_pixblock_tail, \ 4002 pixman_composite_src_0565_8888_process_pixblock_tail_head 4003 4004generate_composite_function_nearest_scanline \ 4005 pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \ 4006 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 4007 8, /* number of pixels, processed in a single block */ \ 4008 default_init_need_all_regs, \ 4009 default_cleanup_need_all_regs, \ 4010 pixman_composite_over_8888_8_0565_process_pixblock_head, \ 4011 pixman_composite_over_8888_8_0565_process_pixblock_tail, \ 4012 pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \ 4013 28, /* dst_w_basereg */ \ 4014 4, /* dst_r_basereg */ \ 4015 8, /* src_basereg */ \ 4016 24 /* mask_basereg */ 4017 4018generate_composite_function_nearest_scanline \ 4019 pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \ 4020 FLAG_DST_READWRITE, \ 4021 8, /* number of pixels, processed in a single block */ \ 4022 default_init_need_all_regs, \ 4023 default_cleanup_need_all_regs, \ 4024 pixman_composite_over_0565_8_0565_process_pixblock_head, \ 4025 pixman_composite_over_0565_8_0565_process_pixblock_tail, \ 4026 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \ 4027 28, /* dst_w_basereg */ \ 4028 10, /* dst_r_basereg */ \ 4029 8, /* src_basereg */ \ 4030 15 /* mask_basereg */ 4031 4032/******************************************************************************/ 4033 4034/* 4035 * Bilinear scaling support code which tries to provide pixel fetching, color 4036 * format conversion, and interpolation as separate macros which can be used 4037 * as the basic building blocks for constructing bilinear scanline functions. 4038 */ 4039 4040.macro bilinear_load_8888 reg1, reg2, tmp 4041 mov TMP1, X, asr #16 4042 add X, X, UX 4043 add TMP1, TOP, TMP1, asl #2 4044 vld1.32 {reg1}, [TMP1], STRIDE 4045 vld1.32 {reg2}, [TMP1] 4046.endm 4047 4048.macro bilinear_load_0565 reg1, reg2, tmp 4049 mov TMP1, X, asr #16 4050 add X, X, UX 4051 add TMP1, TOP, TMP1, asl #1 4052 vld1.32 {reg2[0]}, [TMP1], STRIDE 4053 vld1.32 {reg2[1]}, [TMP1] 4054 convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp 4055.endm 4056 4057.macro bilinear_load_and_vertical_interpolate_two_8888 \ 4058 acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2 4059 4060 bilinear_load_8888 reg1, reg2, tmp1 4061 vmull.u8 acc1, reg1, d28 4062 vmlal.u8 acc1, reg2, d29 4063 bilinear_load_8888 reg3, reg4, tmp2 4064 vmull.u8 acc2, reg3, d28 4065 vmlal.u8 acc2, reg4, d29 4066.endm 4067 4068.macro bilinear_load_and_vertical_interpolate_four_8888 \ 4069 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ 4070 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi 4071 4072 bilinear_load_and_vertical_interpolate_two_8888 \ 4073 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi 4074 bilinear_load_and_vertical_interpolate_two_8888 \ 4075 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi 4076.endm 4077 4078.macro bilinear_load_and_vertical_interpolate_two_0565 \ 4079 acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi 4080 4081 mov TMP1, X, asr #16 4082 add X, X, UX 4083 add TMP1, TOP, TMP1, asl #1 4084 mov TMP2, X, asr #16 4085 add X, X, UX 4086 add TMP2, TOP, TMP2, asl #1 4087 vld1.32 {acc2lo[0]}, [TMP1], STRIDE 4088 vld1.32 {acc2hi[0]}, [TMP2], STRIDE 4089 vld1.32 {acc2lo[1]}, [TMP1] 4090 vld1.32 {acc2hi[1]}, [TMP2] 4091 convert_0565_to_x888 acc2, reg3, reg2, reg1 4092 vzip.u8 reg1, reg3 4093 vzip.u8 reg2, reg4 4094 vzip.u8 reg3, reg4 4095 vzip.u8 reg1, reg2 4096 vmull.u8 acc1, reg1, d28 4097 vmlal.u8 acc1, reg2, d29 4098 vmull.u8 acc2, reg3, d28 4099 vmlal.u8 acc2, reg4, d29 4100.endm 4101 4102.macro bilinear_load_and_vertical_interpolate_four_0565 \ 4103 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ 4104 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi 4105 4106 mov TMP1, X, asr #16 4107 add X, X, UX 4108 add TMP1, TOP, TMP1, asl #1 4109 mov TMP2, X, asr #16 4110 add X, X, UX 4111 add TMP2, TOP, TMP2, asl #1 4112 vld1.32 {xacc2lo[0]}, [TMP1], STRIDE 4113 vld1.32 {xacc2hi[0]}, [TMP2], STRIDE 4114 vld1.32 {xacc2lo[1]}, [TMP1] 4115 vld1.32 {xacc2hi[1]}, [TMP2] 4116 convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1 4117 mov TMP1, X, asr #16 4118 add X, X, UX 4119 add TMP1, TOP, TMP1, asl #1 4120 mov TMP2, X, asr #16 4121 add X, X, UX 4122 add TMP2, TOP, TMP2, asl #1 4123 vld1.32 {yacc2lo[0]}, [TMP1], STRIDE 4124 vzip.u8 xreg1, xreg3 4125 vld1.32 {yacc2hi[0]}, [TMP2], STRIDE 4126 vzip.u8 xreg2, xreg4 4127 vld1.32 {yacc2lo[1]}, [TMP1] 4128 vzip.u8 xreg3, xreg4 4129 vld1.32 {yacc2hi[1]}, [TMP2] 4130 vzip.u8 xreg1, xreg2 4131 convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1 4132 vmull.u8 xacc1, xreg1, d28 4133 vzip.u8 yreg1, yreg3 4134 vmlal.u8 xacc1, xreg2, d29 4135 vzip.u8 yreg2, yreg4 4136 vmull.u8 xacc2, xreg3, d28 4137 vzip.u8 yreg3, yreg4 4138 vmlal.u8 xacc2, xreg4, d29 4139 vzip.u8 yreg1, yreg2 4140 vmull.u8 yacc1, yreg1, d28 4141 vmlal.u8 yacc1, yreg2, d29 4142 vmull.u8 yacc2, yreg3, d28 4143 vmlal.u8 yacc2, yreg4, d29 4144.endm 4145 4146.macro bilinear_store_8888 numpix, tmp1, tmp2 4147.if numpix == 4 4148 vst1.32 {d0, d1}, [OUT, :128]! 4149.elseif numpix == 2 4150 vst1.32 {d0}, [OUT, :64]! 4151.elseif numpix == 1 4152 vst1.32 {d0[0]}, [OUT, :32]! 4153.else 4154 .error bilinear_store_8888 numpix is unsupported 4155.endif 4156.endm 4157 4158.macro bilinear_store_0565 numpix, tmp1, tmp2 4159 vuzp.u8 d0, d1 4160 vuzp.u8 d2, d3 4161 vuzp.u8 d1, d3 4162 vuzp.u8 d0, d2 4163 convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2 4164.if numpix == 4 4165 vst1.16 {d2}, [OUT, :64]! 4166.elseif numpix == 2 4167 vst1.32 {d2[0]}, [OUT, :32]! 4168.elseif numpix == 1 4169 vst1.16 {d2[0]}, [OUT, :16]! 4170.else 4171 .error bilinear_store_0565 numpix is unsupported 4172.endif 4173.endm 4174 4175.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt 4176 bilinear_load_&src_fmt d0, d1, d2 4177 vmull.u8 q1, d0, d28 4178 vmlal.u8 q1, d1, d29 4179 /* 5 cycles bubble */ 4180 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS 4181 vmlsl.u16 q0, d2, d30 4182 vmlal.u16 q0, d3, d30 4183 /* 5 cycles bubble */ 4184 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) 4185 /* 3 cycles bubble */ 4186 vmovn.u16 d0, q0 4187 /* 1 cycle bubble */ 4188 bilinear_store_&dst_fmt 1, q2, q3 4189.endm 4190 4191.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt 4192 bilinear_load_and_vertical_interpolate_two_&src_fmt \ 4193 q1, q11, d0, d1, d20, d21, d22, d23 4194 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS 4195 vmlsl.u16 q0, d2, d30 4196 vmlal.u16 q0, d3, d30 4197 vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS 4198 vmlsl.u16 q10, d22, d31 4199 vmlal.u16 q10, d23, d31 4200 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) 4201 vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) 4202 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 4203 vadd.u16 q12, q12, q13 4204 vmovn.u16 d0, q0 4205 bilinear_store_&dst_fmt 2, q2, q3 4206.endm 4207 4208.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt 4209 bilinear_load_and_vertical_interpolate_four_&src_fmt \ 4210 q1, q11, d0, d1, d20, d21, d22, d23 \ 4211 q3, q9, d4, d5, d16, d17, d18, d19 4212 pld [TMP1, PF_OFFS] 4213 sub TMP1, TMP1, STRIDE 4214 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS 4215 vmlsl.u16 q0, d2, d30 4216 vmlal.u16 q0, d3, d30 4217 vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS 4218 vmlsl.u16 q10, d22, d31 4219 vmlal.u16 q10, d23, d31 4220 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 4221 vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS 4222 vmlsl.u16 q2, d6, d30 4223 vmlal.u16 q2, d7, d30 4224 vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS 4225 pld [TMP2, PF_OFFS] 4226 vmlsl.u16 q8, d18, d31 4227 vmlal.u16 q8, d19, d31 4228 vadd.u16 q12, q12, q13 4229 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) 4230 vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) 4231 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) 4232 vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS) 4233 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 4234 vmovn.u16 d0, q0 4235 vmovn.u16 d1, q2 4236 vadd.u16 q12, q12, q13 4237 bilinear_store_&dst_fmt 4, q2, q3 4238.endm 4239 4240.macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt 4241.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt 4242 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head 4243.else 4244 bilinear_interpolate_four_pixels src_fmt, dst_fmt 4245.endif 4246.endm 4247 4248.macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt 4249.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt 4250 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail 4251.endif 4252.endm 4253 4254.macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt 4255.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt 4256 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head 4257.else 4258 bilinear_interpolate_four_pixels src_fmt, dst_fmt 4259.endif 4260.endm 4261 4262.macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt 4263.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt 4264 bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head 4265.else 4266 bilinear_interpolate_four_pixels_head src_fmt, dst_fmt 4267 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt 4268.endif 4269.endm 4270 4271.macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt 4272.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt 4273 bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail 4274.else 4275 bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt 4276.endif 4277.endm 4278 4279.macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt 4280.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt 4281 bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head 4282.else 4283 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt 4284 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt 4285.endif 4286.endm 4287 4288.set BILINEAR_FLAG_UNROLL_4, 0 4289.set BILINEAR_FLAG_UNROLL_8, 1 4290.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2 4291 4292/* 4293 * Main template macro for generating NEON optimized bilinear scanline 4294 * functions. 4295 * 4296 * Bilinear scanline scaler macro template uses the following arguments: 4297 * fname - name of the function to generate 4298 * src_fmt - source color format (8888 or 0565) 4299 * dst_fmt - destination color format (8888 or 0565) 4300 * bpp_shift - (1 << bpp_shift) is the size of source pixel in bytes 4301 * prefetch_distance - prefetch in the source image by that many 4302 * pixels ahead 4303 */ 4304 4305.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \ 4306 src_bpp_shift, dst_bpp_shift, \ 4307 prefetch_distance, flags 4308 4309pixman_asm_function fname 4310 OUT .req r0 4311 TOP .req r1 4312 BOTTOM .req r2 4313 WT .req r3 4314 WB .req r4 4315 X .req r5 4316 UX .req r6 4317 WIDTH .req ip 4318 TMP1 .req r3 4319 TMP2 .req r4 4320 PF_OFFS .req r7 4321 TMP3 .req r8 4322 TMP4 .req r9 4323 STRIDE .req r2 4324 4325 mov ip, sp 4326 push {r4, r5, r6, r7, r8, r9} 4327 mov PF_OFFS, #prefetch_distance 4328 ldmia ip, {WB, X, UX, WIDTH} 4329 mul PF_OFFS, PF_OFFS, UX 4330 4331.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 4332 vpush {d8-d15} 4333.endif 4334 4335 sub STRIDE, BOTTOM, TOP 4336 .unreq BOTTOM 4337 4338 cmp WIDTH, #0 4339 ble 3f 4340 4341 vdup.u16 q12, X 4342 vdup.u16 q13, UX 4343 vdup.u8 d28, WT 4344 vdup.u8 d29, WB 4345 vadd.u16 d25, d25, d26 4346 4347 /* ensure good destination alignment */ 4348 cmp WIDTH, #1 4349 blt 0f 4350 tst OUT, #(1 << dst_bpp_shift) 4351 beq 0f 4352 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 4353 vadd.u16 q12, q12, q13 4354 bilinear_interpolate_last_pixel src_fmt, dst_fmt 4355 sub WIDTH, WIDTH, #1 43560: 4357 vadd.u16 q13, q13, q13 4358 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 4359 vadd.u16 q12, q12, q13 4360 4361 cmp WIDTH, #2 4362 blt 0f 4363 tst OUT, #(1 << (dst_bpp_shift + 1)) 4364 beq 0f 4365 bilinear_interpolate_two_pixels src_fmt, dst_fmt 4366 sub WIDTH, WIDTH, #2 43670: 4368.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0 4369/*********** 8 pixels per iteration *****************/ 4370 cmp WIDTH, #4 4371 blt 0f 4372 tst OUT, #(1 << (dst_bpp_shift + 2)) 4373 beq 0f 4374 bilinear_interpolate_four_pixels src_fmt, dst_fmt 4375 sub WIDTH, WIDTH, #4 43760: 4377 subs WIDTH, WIDTH, #8 4378 blt 1f 4379 mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift) 4380 bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt 4381 subs WIDTH, WIDTH, #8 4382 blt 5f 43830: 4384 bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt 4385 subs WIDTH, WIDTH, #8 4386 bge 0b 43875: 4388 bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt 43891: 4390 tst WIDTH, #4 4391 beq 2f 4392 bilinear_interpolate_four_pixels src_fmt, dst_fmt 43932: 4394.else 4395/*********** 4 pixels per iteration *****************/ 4396 subs WIDTH, WIDTH, #4 4397 blt 1f 4398 mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift) 4399 bilinear_interpolate_four_pixels_head src_fmt, dst_fmt 4400 subs WIDTH, WIDTH, #4 4401 blt 5f 44020: 4403 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt 4404 subs WIDTH, WIDTH, #4 4405 bge 0b 44065: 4407 bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt 44081: 4409/****************************************************/ 4410.endif 4411 /* handle the remaining trailing pixels */ 4412 tst WIDTH, #2 4413 beq 2f 4414 bilinear_interpolate_two_pixels src_fmt, dst_fmt 44152: 4416 tst WIDTH, #1 4417 beq 3f 4418 bilinear_interpolate_last_pixel src_fmt, dst_fmt 44193: 4420.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 4421 vpop {d8-d15} 4422.endif 4423 pop {r4, r5, r6, r7, r8, r9} 4424 bx lr 4425 4426 .unreq OUT 4427 .unreq TOP 4428 .unreq WT 4429 .unreq WB 4430 .unreq X 4431 .unreq UX 4432 .unreq WIDTH 4433 .unreq TMP1 4434 .unreq TMP2 4435 .unreq PF_OFFS 4436 .unreq TMP3 4437 .unreq TMP4 4438 .unreq STRIDE 4439.endfunc 4440 4441.endm 4442 4443/*****************************************************************************/ 4444 4445.set have_bilinear_interpolate_four_pixels_8888_8888, 1 4446 4447.macro bilinear_interpolate_four_pixels_8888_8888_head 4448 mov TMP1, X, asr #16 4449 add X, X, UX 4450 add TMP1, TOP, TMP1, asl #2 4451 mov TMP2, X, asr #16 4452 add X, X, UX 4453 add TMP2, TOP, TMP2, asl #2 4454 4455 vld1.32 {d22}, [TMP1], STRIDE 4456 vld1.32 {d23}, [TMP1] 4457 mov TMP3, X, asr #16 4458 add X, X, UX 4459 add TMP3, TOP, TMP3, asl #2 4460 vmull.u8 q8, d22, d28 4461 vmlal.u8 q8, d23, d29 4462 4463 vld1.32 {d22}, [TMP2], STRIDE 4464 vld1.32 {d23}, [TMP2] 4465 mov TMP4, X, asr #16 4466 add X, X, UX 4467 add TMP4, TOP, TMP4, asl #2 4468 vmull.u8 q9, d22, d28 4469 vmlal.u8 q9, d23, d29 4470 4471 vld1.32 {d22}, [TMP3], STRIDE 4472 vld1.32 {d23}, [TMP3] 4473 vmull.u8 q10, d22, d28 4474 vmlal.u8 q10, d23, d29 4475 4476 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS 4477 vmlsl.u16 q0, d16, d30 4478 vmlal.u16 q0, d17, d30 4479 4480 pld [TMP4, PF_OFFS] 4481 vld1.32 {d16}, [TMP4], STRIDE 4482 vld1.32 {d17}, [TMP4] 4483 pld [TMP4, PF_OFFS] 4484 vmull.u8 q11, d16, d28 4485 vmlal.u8 q11, d17, d29 4486 4487 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS 4488 vmlsl.u16 q1, d18, d31 4489.endm 4490 4491.macro bilinear_interpolate_four_pixels_8888_8888_tail 4492 vmlal.u16 q1, d19, d31 4493 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 4494 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS 4495 vmlsl.u16 q2, d20, d30 4496 vmlal.u16 q2, d21, d30 4497 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS 4498 vmlsl.u16 q3, d22, d31 4499 vmlal.u16 q3, d23, d31 4500 vadd.u16 q12, q12, q13 4501 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) 4502 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) 4503 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) 4504 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 4505 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) 4506 vmovn.u16 d6, q0 4507 vmovn.u16 d7, q2 4508 vadd.u16 q12, q12, q13 4509 vst1.32 {d6, d7}, [OUT, :128]! 4510.endm 4511 4512.macro bilinear_interpolate_four_pixels_8888_8888_tail_head 4513 mov TMP1, X, asr #16 4514 add X, X, UX 4515 add TMP1, TOP, TMP1, asl #2 4516 mov TMP2, X, asr #16 4517 add X, X, UX 4518 add TMP2, TOP, TMP2, asl #2 4519 vmlal.u16 q1, d19, d31 4520 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 4521 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS 4522 vmlsl.u16 q2, d20, d30 4523 vmlal.u16 q2, d21, d30 4524 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS 4525 vld1.32 {d20}, [TMP1], STRIDE 4526 vmlsl.u16 q3, d22, d31 4527 vmlal.u16 q3, d23, d31 4528 vld1.32 {d21}, [TMP1] 4529 vmull.u8 q8, d20, d28 4530 vmlal.u8 q8, d21, d29 4531 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) 4532 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) 4533 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) 4534 vld1.32 {d22}, [TMP2], STRIDE 4535 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) 4536 vadd.u16 q12, q12, q13 4537 vld1.32 {d23}, [TMP2] 4538 vmull.u8 q9, d22, d28 4539 mov TMP3, X, asr #16 4540 add X, X, UX 4541 add TMP3, TOP, TMP3, asl #2 4542 mov TMP4, X, asr #16 4543 add X, X, UX 4544 add TMP4, TOP, TMP4, asl #2 4545 vmlal.u8 q9, d23, d29 4546 vld1.32 {d22}, [TMP3], STRIDE 4547 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 4548 vld1.32 {d23}, [TMP3] 4549 vmull.u8 q10, d22, d28 4550 vmlal.u8 q10, d23, d29 4551 vmovn.u16 d6, q0 4552 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS 4553 vmovn.u16 d7, q2 4554 vmlsl.u16 q0, d16, d30 4555 vmlal.u16 q0, d17, d30 4556 pld [TMP4, PF_OFFS] 4557 vld1.32 {d16}, [TMP4], STRIDE 4558 vadd.u16 q12, q12, q13 4559 vld1.32 {d17}, [TMP4] 4560 pld [TMP4, PF_OFFS] 4561 vmull.u8 q11, d16, d28 4562 vmlal.u8 q11, d17, d29 4563 vst1.32 {d6, d7}, [OUT, :128]! 4564 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS 4565 vmlsl.u16 q1, d18, d31 4566.endm 4567 4568/*****************************************************************************/ 4569 4570.set have_bilinear_interpolate_eight_pixels_8888_0565, 1 4571 4572.macro bilinear_interpolate_eight_pixels_8888_0565_head 4573 mov TMP1, X, asr #16 4574 add X, X, UX 4575 add TMP1, TOP, TMP1, asl #2 4576 mov TMP2, X, asr #16 4577 add X, X, UX 4578 add TMP2, TOP, TMP2, asl #2 4579 vld1.32 {d20}, [TMP1], STRIDE 4580 vld1.32 {d21}, [TMP1] 4581 vmull.u8 q8, d20, d28 4582 vmlal.u8 q8, d21, d29 4583 vld1.32 {d22}, [TMP2], STRIDE 4584 vld1.32 {d23}, [TMP2] 4585 vmull.u8 q9, d22, d28 4586 mov TMP3, X, asr #16 4587 add X, X, UX 4588 add TMP3, TOP, TMP3, asl #2 4589 mov TMP4, X, asr #16 4590 add X, X, UX 4591 add TMP4, TOP, TMP4, asl #2 4592 vmlal.u8 q9, d23, d29 4593 vld1.32 {d22}, [TMP3], STRIDE 4594 vld1.32 {d23}, [TMP3] 4595 vmull.u8 q10, d22, d28 4596 vmlal.u8 q10, d23, d29 4597 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS 4598 vmlsl.u16 q0, d16, d30 4599 vmlal.u16 q0, d17, d30 4600 pld [TMP4, PF_OFFS] 4601 vld1.32 {d16}, [TMP4], STRIDE 4602 vld1.32 {d17}, [TMP4] 4603 pld [TMP4, PF_OFFS] 4604 vmull.u8 q11, d16, d28 4605 vmlal.u8 q11, d17, d29 4606 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS 4607 vmlsl.u16 q1, d18, d31 4608 4609 mov TMP1, X, asr #16 4610 add X, X, UX 4611 add TMP1, TOP, TMP1, asl #2 4612 mov TMP2, X, asr #16 4613 add X, X, UX 4614 add TMP2, TOP, TMP2, asl #2 4615 vmlal.u16 q1, d19, d31 4616 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 4617 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS 4618 vmlsl.u16 q2, d20, d30 4619 vmlal.u16 q2, d21, d30 4620 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS 4621 vld1.32 {d20}, [TMP1], STRIDE 4622 vmlsl.u16 q3, d22, d31 4623 vmlal.u16 q3, d23, d31 4624 vld1.32 {d21}, [TMP1] 4625 vmull.u8 q8, d20, d28 4626 vmlal.u8 q8, d21, d29 4627 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) 4628 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) 4629 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) 4630 vld1.32 {d22}, [TMP2], STRIDE 4631 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) 4632 vadd.u16 q12, q12, q13 4633 vld1.32 {d23}, [TMP2] 4634 vmull.u8 q9, d22, d28 4635 mov TMP3, X, asr #16 4636 add X, X, UX 4637 add TMP3, TOP, TMP3, asl #2 4638 mov TMP4, X, asr #16 4639 add X, X, UX 4640 add TMP4, TOP, TMP4, asl #2 4641 vmlal.u8 q9, d23, d29 4642 vld1.32 {d22}, [TMP3], STRIDE 4643 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 4644 vld1.32 {d23}, [TMP3] 4645 vmull.u8 q10, d22, d28 4646 vmlal.u8 q10, d23, d29 4647 vmovn.u16 d8, q0 4648 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS 4649 vmovn.u16 d9, q2 4650 vmlsl.u16 q0, d16, d30 4651 vmlal.u16 q0, d17, d30 4652 pld [TMP4, PF_OFFS] 4653 vld1.32 {d16}, [TMP4], STRIDE 4654 vadd.u16 q12, q12, q13 4655 vld1.32 {d17}, [TMP4] 4656 pld [TMP4, PF_OFFS] 4657 vmull.u8 q11, d16, d28 4658 vmlal.u8 q11, d17, d29 4659 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS 4660 vmlsl.u16 q1, d18, d31 4661.endm 4662 4663.macro bilinear_interpolate_eight_pixels_8888_0565_tail 4664 vmlal.u16 q1, d19, d31 4665 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 4666 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS 4667 vmlsl.u16 q2, d20, d30 4668 vmlal.u16 q2, d21, d30 4669 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS 4670 vmlsl.u16 q3, d22, d31 4671 vmlal.u16 q3, d23, d31 4672 vadd.u16 q12, q12, q13 4673 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) 4674 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) 4675 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) 4676 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 4677 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) 4678 vmovn.u16 d10, q0 4679 vmovn.u16 d11, q2 4680 vadd.u16 q12, q12, q13 4681 4682 vuzp.u8 d8, d9 4683 vuzp.u8 d10, d11 4684 vuzp.u8 d9, d11 4685 vuzp.u8 d8, d10 4686 vshll.u8 q6, d9, #8 4687 vshll.u8 q5, d10, #8 4688 vshll.u8 q7, d8, #8 4689 vsri.u16 q5, q6, #5 4690 vsri.u16 q5, q7, #11 4691 vst1.32 {d10, d11}, [OUT, :128]! 4692.endm 4693 4694.macro bilinear_interpolate_eight_pixels_8888_0565_tail_head 4695 mov TMP1, X, asr #16 4696 add X, X, UX 4697 add TMP1, TOP, TMP1, asl #2 4698 mov TMP2, X, asr #16 4699 add X, X, UX 4700 add TMP2, TOP, TMP2, asl #2 4701 vmlal.u16 q1, d19, d31 4702 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 4703 vuzp.u8 d8, d9 4704 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS 4705 vmlsl.u16 q2, d20, d30 4706 vmlal.u16 q2, d21, d30 4707 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS 4708 vld1.32 {d20}, [TMP1], STRIDE 4709 vmlsl.u16 q3, d22, d31 4710 vmlal.u16 q3, d23, d31 4711 vld1.32 {d21}, [TMP1] 4712 vmull.u8 q8, d20, d28 4713 vmlal.u8 q8, d21, d29 4714 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) 4715 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) 4716 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) 4717 vld1.32 {d22}, [TMP2], STRIDE 4718 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) 4719 vadd.u16 q12, q12, q13 4720 vld1.32 {d23}, [TMP2] 4721 vmull.u8 q9, d22, d28 4722 mov TMP3, X, asr #16 4723 add X, X, UX 4724 add TMP3, TOP, TMP3, asl #2 4725 mov TMP4, X, asr #16 4726 add X, X, UX 4727 add TMP4, TOP, TMP4, asl #2 4728 vmlal.u8 q9, d23, d29 4729 vld1.32 {d22}, [TMP3], STRIDE 4730 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 4731 vld1.32 {d23}, [TMP3] 4732 vmull.u8 q10, d22, d28 4733 vmlal.u8 q10, d23, d29 4734 vmovn.u16 d10, q0 4735 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS 4736 vmovn.u16 d11, q2 4737 vmlsl.u16 q0, d16, d30 4738 vmlal.u16 q0, d17, d30 4739 pld [TMP4, PF_OFFS] 4740 vld1.32 {d16}, [TMP4], STRIDE 4741 vadd.u16 q12, q12, q13 4742 vld1.32 {d17}, [TMP4] 4743 pld [TMP4, PF_OFFS] 4744 vmull.u8 q11, d16, d28 4745 vmlal.u8 q11, d17, d29 4746 vuzp.u8 d10, d11 4747 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS 4748 vmlsl.u16 q1, d18, d31 4749 4750 mov TMP1, X, asr #16 4751 add X, X, UX 4752 add TMP1, TOP, TMP1, asl #2 4753 mov TMP2, X, asr #16 4754 add X, X, UX 4755 add TMP2, TOP, TMP2, asl #2 4756 vmlal.u16 q1, d19, d31 4757 vuzp.u8 d9, d11 4758 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 4759 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS 4760 vuzp.u8 d8, d10 4761 vmlsl.u16 q2, d20, d30 4762 vmlal.u16 q2, d21, d30 4763 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS 4764 vld1.32 {d20}, [TMP1], STRIDE 4765 vmlsl.u16 q3, d22, d31 4766 vmlal.u16 q3, d23, d31 4767 vld1.32 {d21}, [TMP1] 4768 vmull.u8 q8, d20, d28 4769 vmlal.u8 q8, d21, d29 4770 vshll.u8 q6, d9, #8 4771 vshll.u8 q5, d10, #8 4772 vshll.u8 q7, d8, #8 4773 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) 4774 vsri.u16 q5, q6, #5 4775 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) 4776 vsri.u16 q5, q7, #11 4777 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) 4778 vld1.32 {d22}, [TMP2], STRIDE 4779 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) 4780 vadd.u16 q12, q12, q13 4781 vld1.32 {d23}, [TMP2] 4782 vmull.u8 q9, d22, d28 4783 mov TMP3, X, asr #16 4784 add X, X, UX 4785 add TMP3, TOP, TMP3, asl #2 4786 mov TMP4, X, asr #16 4787 add X, X, UX 4788 add TMP4, TOP, TMP4, asl #2 4789 vmlal.u8 q9, d23, d29 4790 vld1.32 {d22}, [TMP3], STRIDE 4791 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 4792 vld1.32 {d23}, [TMP3] 4793 vmull.u8 q10, d22, d28 4794 vmlal.u8 q10, d23, d29 4795 vmovn.u16 d8, q0 4796 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS 4797 vmovn.u16 d9, q2 4798 vmlsl.u16 q0, d16, d30 4799 vmlal.u16 q0, d17, d30 4800 pld [TMP4, PF_OFFS] 4801 vld1.32 {d16}, [TMP4], STRIDE 4802 vadd.u16 q12, q12, q13 4803 vld1.32 {d17}, [TMP4] 4804 pld [TMP4, PF_OFFS] 4805 vmull.u8 q11, d16, d28 4806 vmlal.u8 q11, d17, d29 4807 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS 4808 vst1.32 {d10, d11}, [OUT, :128]! 4809 vmlsl.u16 q1, d18, d31 4810.endm 4811/*****************************************************************************/ 4812 4813generate_bilinear_scanline_func \ 4814 pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \ 4815 2, 2, 28, BILINEAR_FLAG_UNROLL_4 4816 4817generate_bilinear_scanline_func \ 4818 pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \ 4819 2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS 4820 4821generate_bilinear_scanline_func \ 4822 pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \ 4823 1, 2, 28, BILINEAR_FLAG_UNROLL_4 4824 4825generate_bilinear_scanline_func \ 4826 pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \ 4827 1, 1, 28, BILINEAR_FLAG_UNROLL_4 4828