1/* 2 * Copyright © 2009 Nokia Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 * DEALINGS IN THE SOFTWARE. 22 * 23 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com) 24 */ 25 26/* 27 * This file contains implementations of NEON optimized pixel processing 28 * functions. There is no full and detailed tutorial, but some functions 29 * (those which are exposing some new or interesting features) are 30 * extensively commented and can be used as examples. 31 * 32 * You may want to have a look at the comments for following functions: 33 * - pixman_composite_over_8888_0565_asm_neon 34 * - pixman_composite_over_n_8_0565_asm_neon 35 */ 36 37/* Prevent the stack from becoming executable for no reason... 
*/
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

    .text
    .fpu neon
    .arch armv7a
    .altmacro

#include "pixman-arm-neon-asm.h"

/* Global configuration options and preferences */

/*
 * Unaligned memory accesses can optionally be used to speed up the
 * handling of leading/trailing pixels of each scanline.
 * RESPECT_STRICT_ALIGNMENT can be set to 0, for example on linux, when
 * unaligned memory accesses are not configured to generate exceptions.
 */
.set RESPECT_STRICT_ALIGNMENT, 1

/*
 * Default prefetch type. The available choices are:
 *
 * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
 * as NOP to workaround some HW bugs or for whatever other reason)
 *
 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
 * advanced prefetch introduces heavy overhead)
 *
 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
 * which can run ARM and NEON instructions simultaneously so that extra ARM
 * instructions do not add (many) extra cycles, but improve prefetch efficiency)
 *
 * Note: some types of function can't support advanced prefetch and fall back
 * to the simple one (those which handle 24bpp pixels)
 */
.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED

/* Prefetch distance in pixels for simple prefetch */
.set PREFETCH_DISTANCE_SIMPLE, 64

/*
 * Implementation of pixman_composite_over_8888_0565_asm_neon
 *
 * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
 * performs OVER compositing operation. Function fast_composite_over_8888_0565
 * from pixman-fast-path.c does the same in C and can be used as a reference.
 *
 * First we need to have some NEON assembly code which can do the actual
 * operation on the pixels and provide it to the template macro.
 *
 * Template macro quite conveniently takes care of emitting all the necessary
 * code for memory reading and writing (including quite tricky cases of
 * handling unaligned leading/trailing pixels), so we only need to deal with
 * the data in NEON registers.
 *
 * NEON registers allocation in general is recommended to be the following:
 * d0,  d1,  d2,  d3  - contain loaded source pixel data
 * d4,  d5,  d6,  d7  - contain loaded destination pixels (if they are needed)
 * d24, d25, d26, d27 - contain loaded mask pixel data (if mask is used)
 * d28, d29, d30, d31 - place for storing the result (destination pixels)
 *
 * As can be seen above, four 64-bit NEON registers are used for keeping
 * intermediate pixel data and up to 8 pixels can be processed in one step
 * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
 *
 * This particular function uses the following registers allocation:
 * d0,  d1,  d2,  d3  - contain loaded source pixel data
 * d4,  d5            - contain loaded destination pixels (they are needed)
 * d28, d29           - place for storing the result (destination pixels)
 */

/*
 * Step one. We need to have some code to do some arithmetic on pixel data.
 * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
 * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
 * perform all the needed calculations and write the result to {d28, d29}.
 * The rationale for having two macros and not just one will be explained
 * later. In practice, any single monolithic function which does the work can
 * be split into two parts in any arbitrary way without affecting correctness.
 *
 * There is one special trick here too. Common template macro can optionally
 * make our life a bit easier by doing R, G, B, A color components
 * deinterleaving for 32bpp pixel formats (and this feature is used in
 * 'pixman_composite_over_8888_0565_asm_neon' function).
So it means that 125 * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we 126 * actually use d0 register for blue channel (a vector of eight 8-bit 127 * values), d1 register for green, d2 for red and d3 for alpha. This 128 * simple conversion can be also done with a few NEON instructions: 129 * 130 * Packed to planar conversion: 131 * vuzp.8 d0, d1 132 * vuzp.8 d2, d3 133 * vuzp.8 d1, d3 134 * vuzp.8 d0, d2 135 * 136 * Planar to packed conversion: 137 * vzip.8 d0, d2 138 * vzip.8 d1, d3 139 * vzip.8 d2, d3 140 * vzip.8 d0, d1 141 * 142 * But pixel can be loaded directly in planar format using VLD4.8 NEON 143 * instruction. It is 1 cycle slower than VLD1.32, so this is not always 144 * desirable, that's why deinterleaving is optional. 145 * 146 * But anyway, here is the code: 147 */ 148.macro pixman_composite_over_8888_0565_process_pixblock_head 149 /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format 150 and put data into d6 - red, d7 - green, d30 - blue */ 151 vshrn.u16 d6, q2, #8 152 vshrn.u16 d7, q2, #3 153 vsli.u16 q2, q2, #5 154 vsri.u8 d6, d6, #5 155 vmvn.8 d3, d3 /* invert source alpha */ 156 vsri.u8 d7, d7, #6 157 vshrn.u16 d30, q2, #2 158 /* now do alpha blending, storing results in 8-bit planar format 159 into d16 - red, d19 - green, d18 - blue */ 160 vmull.u8 q10, d3, d6 161 vmull.u8 q11, d3, d7 162 vmull.u8 q12, d3, d30 163 vrshr.u16 q13, q10, #8 164 vrshr.u16 q3, q11, #8 165 vrshr.u16 q15, q12, #8 166 vraddhn.u16 d20, q10, q13 167 vraddhn.u16 d23, q11, q3 168 vraddhn.u16 d22, q12, q15 169.endm 170 171.macro pixman_composite_over_8888_0565_process_pixblock_tail 172 /* ... continue alpha blending */ 173 vqadd.u8 d16, d2, d20 174 vqadd.u8 q9, q0, q11 175 /* convert the result to r5g6b5 and store it into {d28, d29} */ 176 vshll.u8 q14, d16, #8 177 vshll.u8 q8, d19, #8 178 vshll.u8 q9, d18, #8 179 vsri.u16 q14, q8, #5 180 vsri.u16 q14, q9, #11 181.endm 182 183/* 184 * OK, now we got almost everything that we need. 
 * Using the above two
 * macros, the work can be done right. But now we want to optimize
 * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
 * a lot from good code scheduling and software pipelining.
 *
 * Let's construct some code, which will run in the core main loop.
 * Some pseudo-code of the main loop will look like this:
 *   head
 *   while (...) {
 *     tail
 *     head
 *   }
 *   tail
 *
 * It may look a bit weird, but this setup makes it possible to hide
 * instruction latencies better and also utilize dual-issue capability more
 * efficiently (make pairs of load-store and ALU instructions).
 *
 * So what we need now is a '*_tail_head' macro, which will be used
 * in the core main loop. A trivial straightforward implementation
 * of this macro would look like this:
 *
 *   pixman_composite_over_8888_0565_process_pixblock_tail
 *   vst1.16     {d28, d29}, [DST_W, :128]!
 *   vld1.16     {d4, d5}, [DST_R, :128]!
 *   vld4.8      {d0, d1, d2, d3}, [SRC]!
 *   pixman_composite_over_8888_0565_process_pixblock_head
 *   cache_preload 8, 8
 *
 * Now it also got some VLD/VST instructions. We simply can't move from
 * processing one block of pixels to the other one with just arithmetic.
 * The previously processed data needs to be written to memory and new
 * data needs to be fetched. Fortunately, this main loop does not deal
 * with partial leading/trailing pixels and can load/store a full block
 * of pixels in a bulk. Additionally, destination buffer is already
 * 16 bytes aligned here (which is good for performance).
 *
 * New things here are DST_R, DST_W, SRC and MASK identifiers. These
 * are the aliases for ARM registers which are used as pointers for
 * accessing data. We maintain separate pointers for reading and writing
 * destination buffer (DST_R and DST_W).
 *
 * Another new thing is 'cache_preload' macro.
It is used for prefetching 227 * data into CPU L2 cache and improve performance when dealing with large 228 * images which are far larger than cache size. It uses one argument 229 * (actually two, but they need to be the same here) - number of pixels 230 * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some 231 * details about this macro. Moreover, if good performance is needed 232 * the code from this macro needs to be copied into '*_tail_head' macro 233 * and mixed with the rest of code for optimal instructions scheduling. 234 * We are actually doing it below. 235 * 236 * Now after all the explanations, here is the optimized code. 237 * Different instruction streams (originaling from '*_head', '*_tail' 238 * and 'cache_preload' macro) use different indentation levels for 239 * better readability. Actually taking the code from one of these 240 * indentation levels and ignoring a few VLD/VST instructions would 241 * result in exactly the code from '*_head', '*_tail' or 'cache_preload' 242 * macro! 243 */ 244 245#if 1 246 247.macro pixman_composite_over_8888_0565_process_pixblock_tail_head 248 vqadd.u8 d16, d2, d20 249 vld1.16 {d4, d5}, [DST_R, :128]! 250 vqadd.u8 q9, q0, q11 251 vshrn.u16 d6, q2, #8 252 vld4.8 {d0, d1, d2, d3}, [SRC]! 
253 vshrn.u16 d7, q2, #3 254 vsli.u16 q2, q2, #5 255 vshll.u8 q14, d16, #8 256 PF add PF_X, PF_X, #8 257 vshll.u8 q8, d19, #8 258 PF tst PF_CTL, #0xF 259 vsri.u8 d6, d6, #5 260 PF addne PF_X, PF_X, #8 261 vmvn.8 d3, d3 262 PF subne PF_CTL, PF_CTL, #1 263 vsri.u8 d7, d7, #6 264 vshrn.u16 d30, q2, #2 265 vmull.u8 q10, d3, d6 266 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] 267 vmull.u8 q11, d3, d7 268 vmull.u8 q12, d3, d30 269 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] 270 vsri.u16 q14, q8, #5 271 PF cmp PF_X, ORIG_W 272 vshll.u8 q9, d18, #8 273 vrshr.u16 q13, q10, #8 274 PF subge PF_X, PF_X, ORIG_W 275 vrshr.u16 q3, q11, #8 276 vrshr.u16 q15, q12, #8 277 PF subges PF_CTL, PF_CTL, #0x10 278 vsri.u16 q14, q9, #11 279 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 280 vraddhn.u16 d20, q10, q13 281 vraddhn.u16 d23, q11, q3 282 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 283 vraddhn.u16 d22, q12, q15 284 vst1.16 {d28, d29}, [DST_W, :128]! 285.endm 286 287#else 288 289/* If we did not care much about the performance, we would just use this... */ 290.macro pixman_composite_over_8888_0565_process_pixblock_tail_head 291 pixman_composite_over_8888_0565_process_pixblock_tail 292 vst1.16 {d28, d29}, [DST_W, :128]! 293 vld1.16 {d4, d5}, [DST_R, :128]! 294 vld4.32 {d0, d1, d2, d3}, [SRC]! 295 pixman_composite_over_8888_0565_process_pixblock_head 296 cache_preload 8, 8 297.endm 298 299#endif 300 301/* 302 * And now the final part. We are using 'generate_composite_function' macro 303 * to put all the stuff together. We are specifying the name of the function 304 * which we want to get, number of bits per pixel for the source, mask and 305 * destination (0 if unused, like mask in this case). 
 * Next come some bit
 * flags:
 *   FLAG_DST_READWRITE      - tells that the destination buffer is both read
 *                             and written, for write-only buffer we would use
 *                             FLAG_DST_WRITEONLY flag instead
 *   FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
 *                             and separate color channels for 32bpp format.
 * The next things are:
 *  - the number of pixels processed per iteration (8 in this case, because
 *    that's the maximum that can fit into four 64-bit NEON registers).
 *  - prefetch distance, measured in pixel blocks. In this case it is 5 times
 *    by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
 *    prefetch distance can be selected by running some benchmarks.
 *
 * After that we specify some macros, these are 'default_init',
 * 'default_cleanup' here which are empty (but it is possible to have custom
 * init/cleanup macros to be able to save/restore some extra NEON registers
 * like d8-d15 or do anything else) followed by
 * 'pixman_composite_over_8888_0565_process_pixblock_head',
 * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
 * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
 * which we got implemented above.
 *
 * The last part is the NEON registers allocation scheme.
329 */ 330generate_composite_function \ 331 pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \ 332 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 333 8, /* number of pixels, processed in a single block */ \ 334 5, /* prefetch distance */ \ 335 default_init, \ 336 default_cleanup, \ 337 pixman_composite_over_8888_0565_process_pixblock_head, \ 338 pixman_composite_over_8888_0565_process_pixblock_tail, \ 339 pixman_composite_over_8888_0565_process_pixblock_tail_head, \ 340 28, /* dst_w_basereg */ \ 341 4, /* dst_r_basereg */ \ 342 0, /* src_basereg */ \ 343 24 /* mask_basereg */ 344 345/******************************************************************************/ 346 347.macro pixman_composite_over_n_0565_process_pixblock_head 348 /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format 349 and put data into d6 - red, d7 - green, d30 - blue */ 350 vshrn.u16 d6, q2, #8 351 vshrn.u16 d7, q2, #3 352 vsli.u16 q2, q2, #5 353 vsri.u8 d6, d6, #5 354 vsri.u8 d7, d7, #6 355 vshrn.u16 d30, q2, #2 356 /* now do alpha blending, storing results in 8-bit planar format 357 into d16 - red, d19 - green, d18 - blue */ 358 vmull.u8 q10, d3, d6 359 vmull.u8 q11, d3, d7 360 vmull.u8 q12, d3, d30 361 vrshr.u16 q13, q10, #8 362 vrshr.u16 q3, q11, #8 363 vrshr.u16 q15, q12, #8 364 vraddhn.u16 d20, q10, q13 365 vraddhn.u16 d23, q11, q3 366 vraddhn.u16 d22, q12, q15 367.endm 368 369.macro pixman_composite_over_n_0565_process_pixblock_tail 370 /* ... continue alpha blending */ 371 vqadd.u8 d16, d2, d20 372 vqadd.u8 q9, q0, q11 373 /* convert the result to r5g6b5 and store it into {d28, d29} */ 374 vshll.u8 q14, d16, #8 375 vshll.u8 q8, d19, #8 376 vshll.u8 q9, d18, #8 377 vsri.u16 q14, q8, #5 378 vsri.u16 q14, q9, #11 379.endm 380 381/* TODO: expand macros and do better instructions scheduling */ 382.macro pixman_composite_over_n_0565_process_pixblock_tail_head 383 pixman_composite_over_n_0565_process_pixblock_tail 384 vld1.16 {d4, d5}, [DST_R, :128]! 
385 vst1.16 {d28, d29}, [DST_W, :128]! 386 pixman_composite_over_n_0565_process_pixblock_head 387.endm 388 389.macro pixman_composite_over_n_0565_init 390 add DUMMY, sp, #ARGS_STACK_OFFSET 391 vld1.32 {d3[0]}, [DUMMY] 392 vdup.8 d0, d3[0] 393 vdup.8 d1, d3[1] 394 vdup.8 d2, d3[2] 395 vdup.8 d3, d3[3] 396 vmvn.8 d3, d3 /* invert source alpha */ 397.endm 398 399generate_composite_function \ 400 pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \ 401 FLAG_DST_READWRITE, \ 402 8, /* number of pixels, processed in a single block */ \ 403 5, /* prefetch distance */ \ 404 pixman_composite_over_n_0565_init, \ 405 default_cleanup, \ 406 pixman_composite_over_n_0565_process_pixblock_head, \ 407 pixman_composite_over_n_0565_process_pixblock_tail, \ 408 pixman_composite_over_n_0565_process_pixblock_tail_head, \ 409 28, /* dst_w_basereg */ \ 410 4, /* dst_r_basereg */ \ 411 0, /* src_basereg */ \ 412 24 /* mask_basereg */ 413 414/******************************************************************************/ 415 416.macro pixman_composite_src_8888_0565_process_pixblock_head 417 vshll.u8 q8, d1, #8 418 vshll.u8 q14, d2, #8 419 vshll.u8 q9, d0, #8 420.endm 421 422.macro pixman_composite_src_8888_0565_process_pixblock_tail 423 vsri.u16 q14, q8, #5 424 vsri.u16 q14, q9, #11 425.endm 426 427.macro pixman_composite_src_8888_0565_process_pixblock_tail_head 428 vsri.u16 q14, q8, #5 429 PF add PF_X, PF_X, #8 430 PF tst PF_CTL, #0xF 431 vld4.8 {d0, d1, d2, d3}, [SRC]! 432 PF addne PF_X, PF_X, #8 433 PF subne PF_CTL, PF_CTL, #1 434 vsri.u16 q14, q9, #11 435 PF cmp PF_X, ORIG_W 436 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] 437 vshll.u8 q8, d1, #8 438 vst1.16 {d28, d29}, [DST_W, :128]! 439 PF subge PF_X, PF_X, ORIG_W 440 PF subges PF_CTL, PF_CTL, #0x10 441 vshll.u8 q14, d2, #8 442 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 
443 vshll.u8 q9, d0, #8 444.endm 445 446generate_composite_function \ 447 pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \ 448 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ 449 8, /* number of pixels, processed in a single block */ \ 450 10, /* prefetch distance */ \ 451 default_init, \ 452 default_cleanup, \ 453 pixman_composite_src_8888_0565_process_pixblock_head, \ 454 pixman_composite_src_8888_0565_process_pixblock_tail, \ 455 pixman_composite_src_8888_0565_process_pixblock_tail_head 456 457/******************************************************************************/ 458 459.macro pixman_composite_src_0565_8888_process_pixblock_head 460 vshrn.u16 d30, q0, #8 461 vshrn.u16 d29, q0, #3 462 vsli.u16 q0, q0, #5 463 vmov.u8 d31, #255 464 vsri.u8 d30, d30, #5 465 vsri.u8 d29, d29, #6 466 vshrn.u16 d28, q0, #2 467.endm 468 469.macro pixman_composite_src_0565_8888_process_pixblock_tail 470.endm 471 472/* TODO: expand macros and do better instructions scheduling */ 473.macro pixman_composite_src_0565_8888_process_pixblock_tail_head 474 pixman_composite_src_0565_8888_process_pixblock_tail 475 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 476 vld1.16 {d0, d1}, [SRC]! 
477 pixman_composite_src_0565_8888_process_pixblock_head 478 cache_preload 8, 8 479.endm 480 481generate_composite_function \ 482 pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \ 483 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ 484 8, /* number of pixels, processed in a single block */ \ 485 10, /* prefetch distance */ \ 486 default_init, \ 487 default_cleanup, \ 488 pixman_composite_src_0565_8888_process_pixblock_head, \ 489 pixman_composite_src_0565_8888_process_pixblock_tail, \ 490 pixman_composite_src_0565_8888_process_pixblock_tail_head 491 492/******************************************************************************/ 493 494.macro pixman_composite_add_8000_8000_process_pixblock_head 495 vqadd.u8 q14, q0, q2 496 vqadd.u8 q15, q1, q3 497.endm 498 499.macro pixman_composite_add_8000_8000_process_pixblock_tail 500.endm 501 502.macro pixman_composite_add_8000_8000_process_pixblock_tail_head 503 vld1.8 {d0, d1, d2, d3}, [SRC]! 504 PF add PF_X, PF_X, #32 505 PF tst PF_CTL, #0xF 506 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! 507 PF addne PF_X, PF_X, #32 508 PF subne PF_CTL, PF_CTL, #1 509 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! 510 PF cmp PF_X, ORIG_W 511 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] 512 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] 513 PF subge PF_X, PF_X, ORIG_W 514 PF subges PF_CTL, PF_CTL, #0x10 515 vqadd.u8 q14, q0, q2 516 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 517 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 
518 vqadd.u8 q15, q1, q3 519.endm 520 521generate_composite_function \ 522 pixman_composite_add_8000_8000_asm_neon, 8, 0, 8, \ 523 FLAG_DST_READWRITE, \ 524 32, /* number of pixels, processed in a single block */ \ 525 10, /* prefetch distance */ \ 526 default_init, \ 527 default_cleanup, \ 528 pixman_composite_add_8000_8000_process_pixblock_head, \ 529 pixman_composite_add_8000_8000_process_pixblock_tail, \ 530 pixman_composite_add_8000_8000_process_pixblock_tail_head 531 532/******************************************************************************/ 533 534.macro pixman_composite_add_8888_8888_process_pixblock_tail_head 535 vld1.8 {d0, d1, d2, d3}, [SRC]! 536 PF add PF_X, PF_X, #8 537 PF tst PF_CTL, #0xF 538 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! 539 PF addne PF_X, PF_X, #8 540 PF subne PF_CTL, PF_CTL, #1 541 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! 542 PF cmp PF_X, ORIG_W 543 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] 544 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] 545 PF subge PF_X, PF_X, ORIG_W 546 PF subges PF_CTL, PF_CTL, #0x10 547 vqadd.u8 q14, q0, q2 548 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 549 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 
550 vqadd.u8 q15, q1, q3 551.endm 552 553generate_composite_function \ 554 pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \ 555 FLAG_DST_READWRITE, \ 556 8, /* number of pixels, processed in a single block */ \ 557 10, /* prefetch distance */ \ 558 default_init, \ 559 default_cleanup, \ 560 pixman_composite_add_8000_8000_process_pixblock_head, \ 561 pixman_composite_add_8000_8000_process_pixblock_tail, \ 562 pixman_composite_add_8888_8888_process_pixblock_tail_head 563 564generate_composite_function_single_scanline \ 565 pixman_composite_scanline_add_asm_neon, 32, 0, 32, \ 566 FLAG_DST_READWRITE, \ 567 8, /* number of pixels, processed in a single block */ \ 568 default_init, \ 569 default_cleanup, \ 570 pixman_composite_add_8000_8000_process_pixblock_head, \ 571 pixman_composite_add_8000_8000_process_pixblock_tail, \ 572 pixman_composite_add_8888_8888_process_pixblock_tail_head 573 574/******************************************************************************/ 575 576.macro pixman_composite_over_8888_8888_process_pixblock_head 577 vmvn.8 d24, d3 /* get inverted alpha */ 578 /* do alpha blending */ 579 vmull.u8 q8, d24, d4 580 vmull.u8 q9, d24, d5 581 vmull.u8 q10, d24, d6 582 vmull.u8 q11, d24, d7 583.endm 584 585.macro pixman_composite_over_8888_8888_process_pixblock_tail 586 vrshr.u16 q14, q8, #8 587 vrshr.u16 q15, q9, #8 588 vrshr.u16 q12, q10, #8 589 vrshr.u16 q13, q11, #8 590 vraddhn.u16 d28, q14, q8 591 vraddhn.u16 d29, q15, q9 592 vraddhn.u16 d30, q12, q10 593 vraddhn.u16 d31, q13, q11 594 vqadd.u8 q14, q0, q14 595 vqadd.u8 q15, q1, q15 596.endm 597 598.macro pixman_composite_over_8888_8888_process_pixblock_tail_head 599 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 
600 vrshr.u16 q14, q8, #8 601 PF add PF_X, PF_X, #8 602 PF tst PF_CTL, #0xF 603 vrshr.u16 q15, q9, #8 604 vrshr.u16 q12, q10, #8 605 vrshr.u16 q13, q11, #8 606 PF addne PF_X, PF_X, #8 607 PF subne PF_CTL, PF_CTL, #1 608 vraddhn.u16 d28, q14, q8 609 vraddhn.u16 d29, q15, q9 610 PF cmp PF_X, ORIG_W 611 vraddhn.u16 d30, q12, q10 612 vraddhn.u16 d31, q13, q11 613 vqadd.u8 q14, q0, q14 614 vqadd.u8 q15, q1, q15 615 vld4.8 {d0, d1, d2, d3}, [SRC]! 616 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] 617 vmvn.8 d22, d3 618 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] 619 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 620 PF subge PF_X, PF_X, ORIG_W 621 vmull.u8 q8, d22, d4 622 PF subges PF_CTL, PF_CTL, #0x10 623 vmull.u8 q9, d22, d5 624 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 625 vmull.u8 q10, d22, d6 626 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 627 vmull.u8 q11, d22, d7 628.endm 629 630generate_composite_function \ 631 pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \ 632 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 633 8, /* number of pixels, processed in a single block */ \ 634 5, /* prefetch distance */ \ 635 default_init, \ 636 default_cleanup, \ 637 pixman_composite_over_8888_8888_process_pixblock_head, \ 638 pixman_composite_over_8888_8888_process_pixblock_tail, \ 639 pixman_composite_over_8888_8888_process_pixblock_tail_head 640 641generate_composite_function_single_scanline \ 642 pixman_composite_scanline_over_asm_neon, 32, 0, 32, \ 643 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 644 8, /* number of pixels, processed in a single block */ \ 645 default_init, \ 646 default_cleanup, \ 647 pixman_composite_over_8888_8888_process_pixblock_head, \ 648 pixman_composite_over_8888_8888_process_pixblock_tail, \ 649 pixman_composite_over_8888_8888_process_pixblock_tail_head 650 651/******************************************************************************/ 652 653/* TODO: expand macros and do better instructions scheduling */ 
.macro pixman_composite_over_n_8888_process_pixblock_tail_head
    /* unscheduled loop body built from the over_8888_8888 head/tail: the
       source registers d0-d3 are not reloaded, only the destination is */
    pixman_composite_over_8888_8888_process_pixblock_tail
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    pixman_composite_over_8888_8888_process_pixblock_head
.endm

/*
 * Loads the 32-bit value found at sp + ARGS_STACK_OFFSET and replicates
 * each of its four bytes across one of d0, d1, d2, d3 (planar layout).
 */
.macro pixman_composite_over_n_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]
    vdup.8      d0, d3[0]
    vdup.8      d1, d3[1]
    vdup.8      d2, d3[2]
    vdup.8      d3, d3[3]
.endm

generate_composite_function \
    pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8888_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_n_8888_process_pixblock_tail_head

/******************************************************************************/

/*
 * over_reverse_n_8888: the roles of the register groups are swapped with
 * respect to over_8888_8888 - the destination pixels are loaded into d0-d3
 * (dst_r_basereg is 0) while the constant prepared by the '*_init' macro
 * stays in d4-d7 (src_basereg is 4). This allows reusing the
 * over_8888_8888 head/tail macros unchanged. Only the destination is
 * prefetched.
 */
.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
        vrshr.u16   q14, q8, #8
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
        vrshr.u16   q15, q9, #8
        vrshr.u16   q12, q10, #8
        vrshr.u16   q13, q11, #8
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
        vraddhn.u16 d28, q14, q8
        vraddhn.u16 d29, q15, q9
                                    PF cmp PF_X, ORIG_W
        vraddhn.u16 d30, q12, q10
        vraddhn.u16 d31, q13, q11
        vqadd.u8    q14, q0, q14
        vqadd.u8    q15, q1, q15
    vld4.8      {d0, d1, d2, d3}, [DST_R, :128]!
    vmvn.8      d22, d3
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q8, d22, d4
                                    PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q9, d22, d5
    vmull.u8    q10, d22, d6
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vmull.u8    q11, d22, d7
.endm

/*
 * Loads the 32-bit value found at sp + ARGS_STACK_OFFSET and replicates
 * each of its four bytes across one of d4, d5, d6, d7.
 */
.macro pixman_composite_over_reverse_n_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d7[0]}, [DUMMY]
    vdup.8      d4, d7[0]
    vdup.8      d5, d7[1]
    vdup.8      d6, d7[2]
    vdup.8      d7, d7[3]
.endm

generate_composite_function \
    pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_reverse_n_8888_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    0,  /* dst_r_basereg */ \
    4,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

/*
 * over_n_8_0565, first half.
 *
 * In:  d8-d11 - constant colour channels prepared by the '*_init' macro
 *      d24 - 8 mask values
 *      q2 (d4, d5) - 8 destination pixels in r5g6b5 format
 * The constant colour is first multiplied by the mask (results in d0-d3),
 * after that the code is the same as in the over_8888_0565 head macro.
 */
.macro pixman_composite_over_n_8_0565_process_pixblock_head
    /* in */
    vmull.u8    q0, d24, d8
    vmull.u8    q1, d24, d9
    vmull.u8    q6, d24, d10
    vmull.u8    q7, d24, d11
    vrshr.u16   q10, q0, #8
    vrshr.u16   q11, q1, #8
    vrshr.u16   q12, q6, #8
    vrshr.u16   q13, q7, #8
    vraddhn.u16 d0, q0, q10
    vraddhn.u16 d1, q1, q11
    vraddhn.u16 d2, q6, q12
    vraddhn.u16 d3, q7, q13

    /* unpack r5g6b5 destination: d6 - red, d7 - green, d30 - blue */
    vshrn.u16   d6, q2, #8
    vshrn.u16   d7, q2, #3
    vsli.u16    q2, q2, #5
    vsri.u8     d6, d6, #5
    vmvn.8      d3, d3      /* invert the alpha computed above */
    vsri.u8     d7, d7, #6
    vshrn.u16   d30, q2, #2
    /* now do alpha blending */
    vmull.u8    q10, d3, d6
    vmull.u8    q11, d3, d7
    vmull.u8    q12, d3, d30
    vrshr.u16   q13, q10, #8
    vrshr.u16   q3, q11, #8
    vrshr.u16   q15, q12, #8
    vraddhn.u16 d20, q10, q13
    vraddhn.u16 d23, q11, q3
    vraddhn.u16 d22, q12, q15
.endm

/* over_n_8_0565, second half: the result is left in q14 (d28, d29) */
.macro pixman_composite_over_n_8_0565_process_pixblock_tail
    vqadd.u8    d16, d2, d20
    vqadd.u8    q9, q0, q11
    /* convert to r5g6b5 */
    vshll.u8    q14, d16, #8
    vshll.u8    q8, d19, #8
    vshll.u8    q9, d18, #8
    vsri.u16    q14, q8, #5
    vsri.u16    q14, q9, #11
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_n_8_0565_process_pixblock_tail_head
    pixman_composite_over_n_8_0565_process_pixblock_tail
    vst1.16     {d28, d29}, [DST_W, :128]!
    vld1.16     {d4, d5}, [DST_R, :128]!
    vld1.8      {d24}, [MASK]!
    cache_preload 8, 8
    pixman_composite_over_n_8_0565_process_pixblock_head
.endm

/*
 * This function needs a special initialization of the solid source.
 * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
 * offset, split into color components and replicated in d8-d11
 * registers. Additionally, this function needs all the NEON registers,
 * so it has to save d8-d15 registers which are callee saved according
 * to ABI. These registers are restored from 'cleanup' macro. All the
 * other NEON registers are caller saved, so can be clobbered freely
 * without introducing any problems.
*/
/*
 * Notes that apply to every fast path in this part of the file:
 *
 *  - The sequence
 *        vmull.u8    qT, dA, dB       (T = A * B, 16-bit lanes)
 *        vrshr.u16   qR, qT, #8       (R = (T + 128) >> 8)
 *        vraddhn.u16 dD, qT, qR       (D = (T + R + 128) >> 8, narrowed)
 *    is used throughout to multiply two 8-bit values and scale the product
 *    back to 8 bits with rounding.
 *
 *  - Init macros which need d8-d15 save them with vpush and the matching
 *    cleanup macro restores them with vpop (these registers are callee-saved
 *    in the ARM procedure call standard).
 *
 *  - Init macros compute the address of a stack argument into DUMMY *before*
 *    vpush changes sp.
 */

/*
 * over_n_8_0565: load the 32-bit solid source colour from the stack and
 * broadcast each of its four bytes into its own register (d8, d9, d10, d11).
 */
.macro pixman_composite_over_n_8_0565_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    vdup.8      d8, d11[0]
    vdup.8      d9, d11[1]
    vdup.8      d10, d11[2]
    vdup.8      d11, d11[3]
.endm

.macro pixman_composite_over_n_8_0565_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8_0565_init, \
    pixman_composite_over_n_8_0565_cleanup, \
    pixman_composite_over_n_8_0565_process_pixblock_head, \
    pixman_composite_over_n_8_0565_process_pixblock_tail, \
    pixman_composite_over_n_8_0565_process_pixblock_tail_head

/******************************************************************************/

/*
 * src_0565_0565: plain copy. There is nothing to compute, so head and tail
 * are empty and tail_head only stores the current block and loads the next.
 */
.macro pixman_composite_src_0565_0565_process_pixblock_head
.endm

.macro pixman_composite_src_0565_0565_process_pixblock_tail
.endm

.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
    vst1.16     {d0, d1, d2, d3}, [DST_W, :128]!
    vld1.16     {d0, d1, d2, d3}, [SRC]!
    cache_preload 16, 16
.endm

generate_composite_function \
    pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
    FLAG_DST_WRITEONLY, \
    16, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0565_0565_process_pixblock_head, \
    pixman_composite_src_0565_0565_process_pixblock_tail, \
    pixman_composite_src_0565_0565_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

/*
 * src_n_8: solid fill of an 8-bit destination. The fill pattern is prepared
 * once in the init macro; the per-block work is a single store of d0-d3.
 */
.macro pixman_composite_src_n_8_process_pixblock_head
.endm

.macro pixman_composite_src_n_8_process_pixblock_tail
.endm

.macro pixman_composite_src_n_8_process_pixblock_tail_head
    vst1.8      {d0, d1, d2, d3}, [DST_W, :128]!
.endm

.macro pixman_composite_src_n_8_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    /* replicate the lowest byte of d0 into all 8 bytes by doubling steps */
    vsli.u64    d0, d0, #8
    vsli.u64    d0, d0, #16
    vsli.u64    d0, d0, #32
    /* and spread the pattern over d0-d3 */
    vmov        d1, d0
    vmov        q1, q0
.endm

.macro pixman_composite_src_n_8_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
    FLAG_DST_WRITEONLY, \
    32, /* number of pixels, processed in a single block */ \
    0,  /* prefetch distance */ \
    pixman_composite_src_n_8_init, \
    pixman_composite_src_n_8_cleanup, \
    pixman_composite_src_n_8_process_pixblock_head, \
    pixman_composite_src_n_8_process_pixblock_tail, \
    pixman_composite_src_n_8_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

/*
 * src_n_0565: solid fill of a 16-bit destination. Same scheme as src_n_8,
 * except that the lowest 16 bits of the argument are replicated.
 */
.macro pixman_composite_src_n_0565_process_pixblock_head
.endm

.macro pixman_composite_src_n_0565_process_pixblock_tail
.endm

.macro pixman_composite_src_n_0565_process_pixblock_tail_head
    vst1.16     {d0, d1, d2, d3}, [DST_W, :128]!
.endm

.macro pixman_composite_src_n_0565_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    /* replicate the lowest halfword of d0 into all 4 halfwords */
    vsli.u64    d0, d0, #16
    vsli.u64    d0, d0, #32
    vmov        d1, d0
    vmov        q1, q0
.endm

.macro pixman_composite_src_n_0565_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
    FLAG_DST_WRITEONLY, \
    16, /* number of pixels, processed in a single block */ \
    0,  /* prefetch distance */ \
    pixman_composite_src_n_0565_init, \
    pixman_composite_src_n_0565_cleanup, \
    pixman_composite_src_n_0565_process_pixblock_head, \
    pixman_composite_src_n_0565_process_pixblock_tail, \
    pixman_composite_src_n_0565_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

/*
 * src_n_8888: solid fill of a 32-bit destination. Same scheme as src_n_8,
 * except that the whole 32-bit argument is replicated.
 */
.macro pixman_composite_src_n_8888_process_pixblock_head
.endm

.macro pixman_composite_src_n_8888_process_pixblock_tail
.endm

.macro pixman_composite_src_n_8888_process_pixblock_tail_head
    vst1.32     {d0, d1, d2, d3}, [DST_W, :128]!
.endm

.macro pixman_composite_src_n_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    /* copy the low word of d0 into its high word */
    vsli.u64    d0, d0, #32
    vmov        d1, d0
    vmov        q1, q0
.endm

.macro pixman_composite_src_n_8888_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    0, /* prefetch distance */ \
    pixman_composite_src_n_8888_init, \
    pixman_composite_src_n_8888_cleanup, \
    pixman_composite_src_n_8888_process_pixblock_head, \
    pixman_composite_src_n_8888_process_pixblock_tail, \
    pixman_composite_src_n_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

/*
 * src_8888_8888: plain copy of 32-bit pixels (store current block, load the
 * next one, no arithmetic).
 */
.macro pixman_composite_src_8888_8888_process_pixblock_head
.endm

.macro pixman_composite_src_8888_8888_process_pixblock_tail
.endm

.macro pixman_composite_src_8888_8888_process_pixblock_tail_head
    vst1.32     {d0, d1, d2, d3}, [DST_W, :128]!
    vld1.32     {d0, d1, d2, d3}, [SRC]!
    cache_preload 8, 8
.endm

generate_composite_function \
    pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8,  /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_8888_8888_process_pixblock_head, \
    pixman_composite_src_8888_8888_process_pixblock_tail, \
    pixman_composite_src_8888_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

/*
 * src_x888_8888: copy of 32-bit pixels with the top byte of every pixel
 * forced to 0xFF. q2 holds the constant 0xFF000000 in each 32-bit lane
 * (prepared in the init macro) and is OR-ed into the data.
 */
.macro pixman_composite_src_x888_8888_process_pixblock_head
    vorr        q0, q0, q2
    vorr        q1, q1, q2
.endm

.macro pixman_composite_src_x888_8888_process_pixblock_tail
.endm

.macro pixman_composite_src_x888_8888_process_pixblock_tail_head
    vst1.32     {d0, d1, d2, d3}, [DST_W, :128]!
    vld1.32     {d0, d1, d2, d3}, [SRC]!
    vorr        q0, q0, q2
    vorr        q1, q1, q2
    cache_preload 8, 8
.endm

.macro pixman_composite_src_x888_8888_init
    vmov.u8     q2, #0xFF           /* all bytes 0xFF ...              */
    vshl.u32    q2, q2, #24         /* ... then keep only the top byte */
.endm

generate_composite_function \
    pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8,  /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    pixman_composite_src_x888_8888_init, \
    default_cleanup, \
    pixman_composite_src_x888_8888_process_pixblock_head, \
    pixman_composite_src_x888_8888_process_pixblock_tail, \
    pixman_composite_src_x888_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_over_n_8_8888_process_pixblock_head
    /* expecting deinterleaved source data in {d8, d9, d10, d11} */
    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
    /* and destination data in {d4, d5, d6, d7} */
    /* mask is in d24 (d25, d26, d27 are unused) */

    /* in: multiply every source channel by the mask (rounded, see top) */
    vmull.u8    q0, d24, d8
    vmull.u8    q1, d24, d9
    vmull.u8    q6, d24, d10
    vmull.u8    q7, d24, d11
    vrshr.u16   q10, q0, #8
    vrshr.u16   q11, q1, #8
    vrshr.u16   q12, q6, #8
    vrshr.u16   q13, q7, #8
    vraddhn.u16 d0, q0, q10
    vraddhn.u16 d1, q1, q11
    vraddhn.u16 d2, q6, q12
    vraddhn.u16 d3, q7, q13
    vmvn.8      d24, d3  /* get inverted alpha */
    /* source:      d0 - blue, d1 - green, d2 - red, d3 - alpha */
    /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
    /* now do alpha blending */
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d24, d5
    vmull.u8    q10, d24, d6
    vmull.u8    q11, d24, d7
.endm

.macro pixman_composite_over_n_8_8888_process_pixblock_tail
    /* finish scaling of dest * inverted alpha ... */
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
    /* ... and add the masked source with saturation */
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
    pixman_composite_over_n_8_8888_process_pixblock_tail
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vld1.8      {d24}, [MASK]!
    cache_preload 8, 8
    pixman_composite_over_n_8_8888_process_pixblock_head
.endm

/* Load the solid source and broadcast its four bytes into d8-d11 */
.macro pixman_composite_over_n_8_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    vdup.8      d8, d11[0]
    vdup.8      d9, d11[1]
    vdup.8      d10, d11[2]
    vdup.8      d11, d11[3]
.endm

.macro pixman_composite_over_n_8_8888_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8_8888_init, \
    pixman_composite_over_n_8_8888_cleanup, \
    pixman_composite_over_n_8_8888_process_pixblock_head, \
    pixman_composite_over_n_8_8888_process_pixblock_tail, \
    pixman_composite_over_n_8_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
    /*
     * 'combine_mask_ca' replacement
     *
     * input:  solid src (n) in {d8,  d9,  d10, d11}
     *         dest in          {d4,  d5,  d6,  d7 }
     *         mask in          {d24, d25, d26, d27}
     * output: updated src in   {d0,  d1,  d2,  d3 }
     *         updated mask in  {d24, d25, d26, d3 }
     */
    vmull.u8    q0, d24, d8
    vmull.u8    q1, d25, d9
    vmull.u8    q6, d26, d10
    vmull.u8    q7, d27, d11
    vmull.u8    q9, d11, d25
    vmull.u8    q12, d11, d24
    vmull.u8    q13, d11, d26
    vrshr.u16   q8, q0, #8
    vrshr.u16   q10, q1, #8
    vrshr.u16   q11, q6, #8
    vraddhn.u16 d0, q0, q8
    vraddhn.u16 d1, q1, q10
    vraddhn.u16 d2, q6, q11
    vrshr.u16   q11, q12, #8
    vrshr.u16   q8, q9, #8
    vrshr.u16   q6, q13, #8
    vrshr.u16   q10, q7, #8
    vraddhn.u16 d24, q12, q11
    vraddhn.u16 d25, q9, q8
    vraddhn.u16 d26, q13, q6
    vraddhn.u16 d3, q7, q10
    /*
     * 'combine_over_ca' replacement
     *
     * output: updated dest in {d28, d29, d30, d31}
     */
    vmvn.8      d24, d24
    vmvn.8      d25, d25
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d25, d5
    vmvn.8      d26, d26
    vmvn.8      d27, d3
    vmull.u8    q10, d26, d6
    vmull.u8    q11, d27, d7
.endm

.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
    /* ... continue 'combine_over_ca' replacement */
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q6, q10, #8
    vrshr.u16   q7, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q6, q10
    vraddhn.u16 d31, q7, q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

/*
 * Same instructions as the tail macro above, interleaved with the loads for
 * the next block, followed by the head for the next block. The store of the
 * finished block is issued last; d28-d31 are not written by the head.
 */
.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vrshr.u16   q6, q10, #8
    vrshr.u16   q7, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q6, q10
    vraddhn.u16 d31, q7, q11
    vld4.8      {d24, d25, d26, d27}, [MASK]!
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
    cache_preload 8, 8
    pixman_composite_over_n_8888_8888_ca_process_pixblock_head
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
.endm

/* Load the solid source and broadcast its four bytes into d8-d11 */
.macro pixman_composite_over_n_8888_8888_ca_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    vdup.8      d8, d11[0]
    vdup.8      d9, d11[1]
    vdup.8      d10, d11[2]
    vdup.8      d11, d11[3]
.endm

.macro pixman_composite_over_n_8888_8888_ca_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8888_8888_ca_init, \
    pixman_composite_over_n_8888_8888_ca_cleanup, \
    pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_add_n_8_8_process_pixblock_head
    /* expecting source data in {d8, d9, d10, d11} */
    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
    /* and destination data in {d4, d5, d6, d7} */
    /* mask is in d24, d25, d26, d27 */
    /* only d11 is read here: mask * d11, then saturating add to dest */
    vmull.u8    q0, d24, d11
    vmull.u8    q1, d25, d11
    vmull.u8    q6, d26, d11
    vmull.u8    q7, d27, d11
    vrshr.u16   q10, q0, #8
    vrshr.u16   q11, q1, #8
    vrshr.u16   q12, q6, #8
    vrshr.u16   q13, q7, #8
    vraddhn.u16 d0, q0, q10
    vraddhn.u16 d1, q1, q11
    vraddhn.u16 d2, q6, q12
    vraddhn.u16 d3, q7, q13
    vqadd.u8    q14, q0, q2
    vqadd.u8    q15, q1, q3
.endm

.macro pixman_composite_add_n_8_8_process_pixblock_tail
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_add_n_8_8_process_pixblock_tail_head
    pixman_composite_add_n_8_8_process_pixblock_tail
    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vld1.8      {d24, d25, d26, d27}, [MASK]!
    cache_preload 32, 32
    pixman_composite_add_n_8_8_process_pixblock_head
.endm

/* Load the solid source and broadcast its top byte into d11 */
.macro pixman_composite_add_n_8_8_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    vdup.8      d11, d11[3]
.endm

.macro pixman_composite_add_n_8_8_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    5,  /* prefetch distance */ \
    pixman_composite_add_n_8_8_init, \
    pixman_composite_add_n_8_8_cleanup, \
    pixman_composite_add_n_8_8_process_pixblock_head, \
    pixman_composite_add_n_8_8_process_pixblock_tail, \
    pixman_composite_add_n_8_8_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_add_8_8_8_process_pixblock_head
    /* expecting source data in {d0, d1, d2, d3} */
    /* destination data in {d4, d5, d6, d7} */
    /* mask in {d24, d25, d26, d27} */
    /* source * mask (element by element), then saturating add to dest */
    vmull.u8    q8, d24, d0
    vmull.u8    q9, d25, d1
    vmull.u8    q10, d26, d2
    vmull.u8    q11, d27, d3
    vrshr.u16   q0, q8, #8
    vrshr.u16   q1, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d0, q0, q8
    vraddhn.u16 d1, q1, q9
    vraddhn.u16 d2, q12, q10
    vraddhn.u16 d3, q13, q11
    vqadd.u8    q14, q0, q2
    vqadd.u8    q15, q1, q3
.endm

.macro pixman_composite_add_8_8_8_process_pixblock_tail
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_add_8_8_8_process_pixblock_tail_head
    pixman_composite_add_8_8_8_process_pixblock_tail
    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vld1.8      {d24, d25, d26, d27}, [MASK]!
    vld1.8      {d0, d1, d2, d3}, [SRC]!
    cache_preload 32, 32
    pixman_composite_add_8_8_8_process_pixblock_head
.endm

.macro pixman_composite_add_8_8_8_init
.endm

.macro pixman_composite_add_8_8_8_cleanup
.endm

generate_composite_function \
    pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    5,  /* prefetch distance */ \
    pixman_composite_add_8_8_8_init, \
    pixman_composite_add_8_8_8_cleanup, \
    pixman_composite_add_8_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_8_process_pixblock_tail, \
    pixman_composite_add_8_8_8_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_add_8888_8888_8888_process_pixblock_head
    /* expecting source data in {d0, d1, d2, d3} */
    /* destination data in {d4, d5, d6, d7} */
    /* mask in {d24, d25, d26, d27} */
    /* only d27 of the mask is used as the multiplier for all channels */
    vmull.u8    q8, d27, d0
    vmull.u8    q9, d27, d1
    vmull.u8    q10, d27, d2
    vmull.u8    q11, d27, d3
    vrshr.u16   q0, q8, #8
    vrshr.u16   q1, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d0, q0, q8
    vraddhn.u16 d1, q1, q9
    vraddhn.u16 d2, q12, q10
    vraddhn.u16 d3, q13, q11
    vqadd.u8    q14, q0, q2
    vqadd.u8    q15, q1, q3
.endm

.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
    pixman_composite_add_8888_8888_8888_process_pixblock_tail
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vld4.8      {d24, d25, d26, d27}, [MASK]!
    vld4.8      {d0, d1, d2, d3}, [SRC]!
    cache_preload 8, 8
    pixman_composite_add_8888_8888_8888_process_pixblock_head
.endm

generate_composite_function \
    pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8,  /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head

/* Single scanline variant built from the same pixel processing macros */
generate_composite_function_single_scanline \
    pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head

/******************************************************************************/

/*
 * The head and tail macros below are shared by three fast paths
 * (over_8888_n_8888, over_8888_8888_8888 and over_8888_8_8888). They only
 * differ in how d15 gets its value: broadcast once in the init macro, or
 * loaded from MASK for every block in the tail_head macro.
 * The head uses q4, q5 and q6 (d8-d13) as temporaries.
 */
.macro pixman_composite_over_8888_n_8888_process_pixblock_head
    /* expecting source data in {d0, d1, d2, d3} */
    /* destination data in {d4, d5, d6, d7} */
    /* solid mask is in d15 */

    /* 'in' */
    vmull.u8    q8, d15, d3
    vmull.u8    q6, d15, d2
    vmull.u8    q5, d15, d1
    vmull.u8    q4, d15, d0
    vrshr.u16   q13, q8, #8
    vrshr.u16   q12, q6, #8
    vrshr.u16   q11, q5, #8
    vrshr.u16   q10, q4, #8
    vraddhn.u16 d3, q8, q13
    vraddhn.u16 d2, q6, q12
    vraddhn.u16 d1, q5, q11
    vraddhn.u16 d0, q4, q10
    vmvn.8      d24, d3  /* get inverted alpha */
    /* now do alpha blending */
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d24, d5
    vmull.u8    q10, d24, d6
    vmull.u8    q11, d24, d7
.endm

.macro pixman_composite_over_8888_n_8888_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    pixman_composite_over_8888_n_8888_process_pixblock_tail
    vld4.8      {d0, d1, d2, d3}, [SRC]!
    cache_preload 8, 8
    pixman_composite_over_8888_n_8888_process_pixblock_head
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
.endm

/*
 * Load a 32-bit stack argument and broadcast its top byte into d15.
 *
 * NOTE(review): the offset is a hard-coded 48 here, whereas the other init
 * macros in this file use ARGS_STACK_OFFSET. Presumably this addresses the
 * solid mask argument rather than the source - confirm against the
 * definition of ARGS_STACK_OFFSET before changing it.
 */
.macro pixman_composite_over_8888_n_8888_init
    add         DUMMY, sp, #48
    vpush       {d8-d15}
    vld1.32     {d15[0]}, [DUMMY]
    vdup.8      d15, d15[3]
.endm

.macro pixman_composite_over_8888_n_8888_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_8888_n_8888_init, \
    pixman_composite_over_8888_n_8888_cleanup, \
    pixman_composite_over_8888_n_8888_process_pixblock_head, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail_head

/******************************************************************************/

/*
 * over_8888_8888_8888: the mask is loaded deinterleaved into d12-d15 for
 * every block; the shared head macro then reads d15 only.
 */
/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    pixman_composite_over_8888_n_8888_process_pixblock_tail
    vld4.8      {d0, d1, d2, d3}, [SRC]!
    cache_preload 8, 8
    vld4.8      {d12, d13, d14, d15}, [MASK]!
    pixman_composite_over_8888_n_8888_process_pixblock_head
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
.endm

.macro pixman_composite_over_8888_8888_8888_init
    vpush       {d8-d15}
.endm

.macro pixman_composite_over_8888_8888_8888_cleanup
    vpop        {d8-d15}
.endm

/*
 * Every macro argument is separated by an explicit comma, the same way as in
 * all the other invocations, instead of relying on whitespace between the
 * tail_head macro name and dst_w_basereg.
 */
generate_composite_function \
    pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_8888_8888_8888_init, \
    pixman_composite_over_8888_8888_8888_cleanup, \
    pixman_composite_over_8888_n_8888_process_pixblock_head, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    12  /* mask_basereg  */

/* Single scanline variant built from the same pixel processing macros */
generate_composite_function_single_scanline \
    pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    pixman_composite_over_8888_8888_8888_init, \
    pixman_composite_over_8888_8888_8888_cleanup, \
    pixman_composite_over_8888_n_8888_process_pixblock_head, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    12  /* mask_basereg  */

/******************************************************************************/

/*
 * over_8888_8_8888: the 8-bit mask is loaded straight into d15 for every
 * block, where the shared head macro expects it.
 */
/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    pixman_composite_over_8888_n_8888_process_pixblock_tail
    vld4.8      {d0, d1, d2, d3}, [SRC]!
    cache_preload 8, 8
    vld1.8      {d15}, [MASK]!
    pixman_composite_over_8888_n_8888_process_pixblock_head
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
.endm

.macro pixman_composite_over_8888_8_8888_init
    vpush       {d8-d15}
.endm

.macro pixman_composite_over_8888_8_8888_cleanup
    vpop        {d8-d15}
.endm

/* Explicit comma after the tail_head macro name, see the note further above */
generate_composite_function \
    pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_8888_8_8888_init, \
    pixman_composite_over_8888_8_8888_cleanup, \
    pixman_composite_over_8888_n_8888_process_pixblock_head, \
    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    15  /* mask_basereg  */

/******************************************************************************/

/*
 * src_0888_0888: plain copy of 3-byte pixels. The stores carry no alignment
 * specifier, unlike the 16bpp and 32bpp copy functions.
 */
.macro pixman_composite_src_0888_0888_process_pixblock_head
.endm

.macro pixman_composite_src_0888_0888_process_pixblock_tail
.endm

.macro pixman_composite_src_0888_0888_process_pixblock_tail_head
    vst3.8      {d0, d1, d2}, [DST_W]!
    vld3.8      {d0, d1, d2}, [SRC]!
    cache_preload 8, 8
.endm

generate_composite_function \
    pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
    FLAG_DST_WRITEONLY, \
    8,  /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0888_0888_process_pixblock_head, \
    pixman_composite_src_0888_0888_process_pixblock_tail, \
    pixman_composite_src_0888_0888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

/*
 * src_0888_8888_rev: 3-byte pixels are expanded to 4-byte pixels with the
 * first and the third byte swapped. The fourth byte comes from d3, which is
 * cleared once in the init macro.
 */
.macro pixman_composite_src_0888_8888_rev_process_pixblock_head
    vswp        d0, d2
.endm

.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
.endm

.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
    vst4.8      {d0, d1, d2, d3}, [DST_W]!
    vld3.8      {d0, d1, d2}, [SRC]!
    vswp        d0, d2
    cache_preload 8, 8
.endm

.macro pixman_composite_src_0888_8888_rev_init
    veor        d3, d3, d3
.endm

generate_composite_function \
    pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8,  /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    pixman_composite_src_0888_8888_rev_init, \
    default_cleanup, \
    pixman_composite_src_0888_8888_rev_process_pixblock_head, \
    pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
    pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

/*
 * src_0888_0565_rev: pack three 8-bit channels (d0, d1, d2) into 16-bit
 * pixels. Each channel is first moved to the top of a 16-bit lane, then
 * vsri shifts d1 and d2 into place, so that every resulting pixel consists
 * of the top 5 bits of d0, the top 6 bits of d1 and the top 5 bits of d2.
 */
.macro pixman_composite_src_0888_0565_rev_process_pixblock_head
    vshll.u8    q8, d1, #8
    vshll.u8    q9, d2, #8
.endm

.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
    vshll.u8    q14, d0, #8
    vsri.u16    q14, q8, #5
    vsri.u16    q14, q9, #11
.endm

/* tail for the current block interleaved with load + head for the next one */
.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
    vshll.u8    q14, d0, #8
    vld3.8      {d0, d1, d2}, [SRC]!
    vsri.u16    q14, q8, #5
    vsri.u16    q14, q9, #11
    vshll.u8    q8, d1, #8
    vst1.16     {d28, d29}, [DST_W, :128]!
    vshll.u8    q9, d2, #8
.endm

generate_composite_function \
    pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
    FLAG_DST_WRITEONLY, \
    8,  /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0888_0565_rev_process_pixblock_head, \
    pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
    pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    0,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    0   /* mask_basereg  */

/******************************************************************************/

/*
 * src_pixbuf_8888: d0, d1 and d2 are multiplied by d3 (with rounding, see
 * the note at the top) and written out in reversed order (d0 -> d30,
 * d1 -> d29, d2 -> d28), while d3 itself is moved to d31 unmodified.
 */
.macro pixman_composite_src_pixbuf_8888_process_pixblock_head
    vmull.u8    q8, d3, d0
    vmull.u8    q9, d3, d1
    vmull.u8    q10, d3, d2
.endm

.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
    vrshr.u16   q11, q8, #8
    vswp        d3, d31
    vrshr.u16   q12, q9, #8
    vrshr.u16   q13, q10, #8
    vraddhn.u16 d30, q11, q8
    vraddhn.u16 d29, q12, q9
    vraddhn.u16 d28, q13, q10
.endm

/*
 * Hand scheduled variant: tail for the current block, load and head for the
 * next block, with prefetch instructions (the lines starting with PF) mixed
 * in between the NEON instructions.
 */
.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
        vrshr.u16   q11, q8, #8
        vswp        d3, d31
        vrshr.u16   q12, q9, #8
        vrshr.u16   q13, q10, #8
    vld4.8      {d0, d1, d2, d3}, [SRC]!
        vraddhn.u16 d30, q11, q8
                                    PF add PF_X, PF_X, #8
                                    PF tst PF_CTL, #0xF
                                    PF addne PF_X, PF_X, #8
                                    PF subne PF_CTL, PF_CTL, #1
        vraddhn.u16 d29, q12, q9
        vraddhn.u16 d28, q13, q10
    vmull.u8    q8, d3, d0
    vmull.u8    q9, d3, d1
    vmull.u8    q10, d3, d2
        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
                                    PF cmp PF_X, ORIG_W
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
                                    PF subge PF_X, PF_X, ORIG_W
                                    PF subges PF_CTL, PF_CTL, #0x10
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
.endm

generate_composite_function \
    pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8,  /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_pixbuf_8888_process_pixblock_head, \
    pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
    pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    0,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    0   /* mask_basereg  */