/*
 * Copyright © 2009 Nokia Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
 */

/*
 * This file contains implementations of NEON optimized pixel processing
 * functions. There is no full and detailed tutorial, but some functions
 * (those which are exposing some new or interesting features) are
 * extensively commented and can be used as examples.
 *
 * You may want to have a look at the comments for the following functions:
 * - pixman_composite_over_8888_0565_asm_neon
 * - pixman_composite_over_n_8_0565_asm_neon
 */

#if defined(ENABLE_PIXMAN_DRAWHELPERS)

/* Prevent the stack from becoming executable for no reason... */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

    .text
    .fpu neon
    .arch armv7a
    .altmacro

#include "pixman-arm-neon-asm.h"

/* Global configuration options and preferences */

/*
 * The code can optionally make use of unaligned memory accesses to improve
 * performance of handling leading/trailing pixels for each scanline.
 * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
 * example in linux if unaligned memory accesses are not configured to
 * generate exceptions.
 */
.set RESPECT_STRICT_ALIGNMENT, 1

/*
 * Set default prefetch type. There is a choice between the following options:
 *
 * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
 * as NOP to workaround some HW bugs or for whatever other reason)
 *
 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
 * advanced prefetch introduces heavy overhead)
 *
 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
 * which can run ARM and NEON instructions simultaneously so that extra ARM
 * instructions do not add (many) extra cycles, but improve prefetch efficiency)
 *
 * Note: some types of function can't support advanced prefetch and fall back
 * to the simple one (those which handle 24bpp pixels)
 */
.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED

/* Prefetch distance in pixels for simple prefetch */
.set PREFETCH_DISTANCE_SIMPLE, 64

/*
 * Implementation of pixman_composite_over_8888_0565_asm_neon
 *
 * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
 * performs OVER compositing operation. Function fast_composite_over_8888_0565
 * from pixman-fast-path.c does the same in C and can be used as a reference.
 *
 * First we need to have some NEON assembly code which can do the actual
 * operation on the pixels and provide it to the template macro.
 *
 * Template macro quite conveniently takes care of emitting all the necessary
 * code for memory reading and writing (including quite tricky cases of
 * handling unaligned leading/trailing pixels), so we only need to deal with
 * the data in NEON registers.
 *
 * NEON registers allocation in general is recommended to be the following:
 * d0, d1, d2, d3 - contain loaded source pixel data
 * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed)
 * d24, d25, d26, d27 - contain loaded mask pixel data (if mask is used)
 * d28, d29, d30, d31 - place for storing the result (destination pixels)
 *
 * As can be seen above, four 64-bit NEON registers are used for keeping
 * intermediate pixel data and up to 8 pixels can be processed in one step
 * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
 *
 * This particular function uses the following registers allocation:
 * d0, d1, d2, d3 - contain loaded source pixel data
 * d4, d5 - contain loaded destination pixels (they are needed)
 * d28, d29 - place for storing the result (destination pixels)
 */

/*
 * Step one. We need to have some code to do some arithmetics on pixel data.
 * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
 * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
 * perform all the needed calculations and write the result to {d28, d29}.
 * The rationale for having two macros and not just one will be explained
 * later. In practice, any single monolithic function which does the work can
 * be split into two parts in any arbitrary way without affecting correctness.
 *
 * There is one special trick here too. Common template macro can optionally
 * make our life a bit easier by doing R, G, B, A color components
 * deinterleaving for 32bpp pixel formats (and this feature is used in
 * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
 * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
 * actually use d0 register for blue channel (a vector of eight 8-bit
 * values), d1 register for green, d2 for red and d3 for alpha. This
 * simple conversion can be also done with a few NEON instructions:
 *
 * Packed to planar conversion:
 * vuzp.8 d0, d1
 * vuzp.8 d2, d3
 * vuzp.8 d1, d3
 * vuzp.8 d0, d2
 *
 * Planar to packed conversion:
 * vzip.8 d0, d2
 * vzip.8 d1, d3
 * vzip.8 d2, d3
 * vzip.8 d0, d1
 *
 * But pixel can be loaded directly in planar format using VLD4.8 NEON
 * instruction. It is 1 cycle slower than VLD1.32, so this is not always
 * desirable, that's why deinterleaving is optional.
 *
 * But anyway, here is the code:
 */
.macro pixman_composite_over_8888_0565_process_pixblock_head
    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
       and put data into d6 - red, d7 - green, d30 - blue */
    vshrn.u16   d6, q2, #8
    vshrn.u16   d7, q2, #3
    vsli.u16    q2, q2, #5
    vsri.u8     d6, d6, #5
    vmvn.8      d3, d3      /* invert source alpha */
    vsri.u8     d7, d7, #6
    vshrn.u16   d30, q2, #2
    /* now do alpha blending, storing results in 8-bit planar format
       into d16 - red, d19 - green, d18 - blue */
    vmull.u8    q10, d3, d6
    vmull.u8    q11, d3, d7
    vmull.u8    q12, d3, d30
    /* (x * a + 0x80 style rounding: add the 8-bit-shifted product back in,
       then take the rounded high half in the '*_tail' part) */
    vrshr.u16   q13, q10, #8
    vrshr.u16   q3, q11, #8
    vrshr.u16   q15, q12, #8
    vraddhn.u16 d20, q10, q13
    vraddhn.u16 d23, q11, q3
    vraddhn.u16 d22, q12, q15
.endm

.macro pixman_composite_over_8888_0565_process_pixblock_tail
    /* ... continue alpha blending: saturating add of the source channels */
    vqadd.u8    d16, d2, d20
    vqadd.u8    q9, q0, q11
    /* convert the result to r5g6b5 and store it into {d28, d29} */
    vshll.u8    q14, d16, #8
    vshll.u8    q8, d19, #8
    vshll.u8    q9, d18, #8
    vsri.u16    q14, q8, #5
    vsri.u16    q14, q9, #11
.endm

/*
 * OK, now we got almost everything that we need. Using the above two
 * macros, the work can be done right. But now we want to optimize
 * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
 * a lot from good code scheduling and software pipelining.
 *
 * Let's construct some code, which will run in the core main loop.
 * Some pseudo-code of the main loop will look like this:
 * head
 * while (...) {
 * tail
 * head
 * }
 * tail
 *
 * It may look a bit weird, but this setup allows hiding instruction
 * latencies better and also utilizes dual-issue capability more
 * efficiently (makes pairs of load-store and ALU instructions).
 *
 * So what we need now is a '*_tail_head' macro, which will be used
 * in the core main loop. A trivial straightforward implementation
 * of this macro would look like this:
 *
 * pixman_composite_over_8888_0565_process_pixblock_tail
 * vst1.16 {d28, d29}, [DST_W, :128]!
 * vld1.16 {d4, d5}, [DST_R, :128]!
 * vld4.32 {d0, d1, d2, d3}, [SRC]!
 * pixman_composite_over_8888_0565_process_pixblock_head
 * cache_preload 8, 8
 *
 * NOTE(review): the optimized code below fetches the source with VLD4.8
 * (byte-granularity deinterleave, matching FLAG_DEINTERLEAVE_32BPP);
 * verify against upstream pixman whether VLD4.32 in this example (and in
 * the disabled fallback further down) is intended.
 *
 * Now it also got some VLD/VST instructions. We simply can't move from
 * processing one block of pixels to the other one with just arithmetics.
 * The previously processed data needs to be written to memory and new
 * data needs to be fetched. Fortunately, this main loop does not deal
 * with partial leading/trailing pixels and can load/store a full block
 * of pixels in a bulk. Additionally, destination buffer is already
 * 16 bytes aligned here (which is good for performance).
 *
 * New things here are DST_R, DST_W, SRC and MASK identifiers. These
 * are the aliases for ARM registers which are used as pointers for
 * accessing data. We maintain separate pointers for reading and writing
 * destination buffer (DST_R and DST_W).
 *
 * Another new thing is 'cache_preload' macro. It is used for prefetching
 * data into CPU L2 cache to improve performance when dealing with large
 * images which are far larger than cache size. It uses one argument
 * (actually two, but they need to be the same here) - number of pixels
 * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
 * details about this macro. Moreover, if good performance is needed
 * the code from this macro needs to be copied into '*_tail_head' macro
 * and mixed with the rest of code for optimal instructions scheduling.
 * We are actually doing it below.
 *
 * Now after all the explanations, here is the optimized code.
 * Different instruction streams (originating from '*_head', '*_tail'
 * and 'cache_preload' macro) use different indentation levels for
 * better readability. Actually taking the code from one of these
 * indentation levels and ignoring a few VLD/VST instructions would
 * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
 * macro!
 */

#if 1

.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
        vqadd.u8    d16, d2, d20
    vld1.16     {d4, d5}, [DST_R, :128]!
        vqadd.u8    q9, q0, q11
            vshrn.u16   d6, q2, #8
    vld4.8      {d0, d1, d2, d3}, [SRC]!
            vshrn.u16   d7, q2, #3
            vsli.u16    q2, q2, #5
        vshll.u8    q14, d16, #8
                        PF add PF_X, PF_X, #8
        vshll.u8    q8, d19, #8
                        PF tst PF_CTL, #0xF
            vsri.u8     d6, d6, #5
                        PF addne PF_X, PF_X, #8
            vmvn.8      d3, d3
                        PF subne PF_CTL, PF_CTL, #1
            vsri.u8     d7, d7, #6
            vshrn.u16   d30, q2, #2
            vmull.u8    q10, d3, d6
                        PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
            vmull.u8    q11, d3, d7
            vmull.u8    q12, d3, d30
                        PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
        vsri.u16    q14, q8, #5
                        PF cmp PF_X, ORIG_W
        vshll.u8    q9, d18, #8
            vrshr.u16   q13, q10, #8
                        PF subge PF_X, PF_X, ORIG_W
            vrshr.u16   q3, q11, #8
            vrshr.u16   q15, q12, #8
                        PF subges PF_CTL, PF_CTL, #0x10
        vsri.u16    q14, q9, #11
                        PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
            vraddhn.u16 d20, q10, q13
            vraddhn.u16 d23, q11, q3
                        PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
            vraddhn.u16 d22, q12, q15
    vst1.16     {d28, d29}, [DST_W, :128]!
.endm

#else

/* If we did not care much about the performance, we would just use this... */
.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
    pixman_composite_over_8888_0565_process_pixblock_tail
    vst1.16     {d28, d29}, [DST_W, :128]!
    vld1.16     {d4, d5}, [DST_R, :128]!
    /* NOTE(review): the optimized variant above uses vld4.8 here;
       confirm vld4.32 is correct before ever enabling this path */
    vld4.32     {d0, d1, d2, d3}, [SRC]!
    pixman_composite_over_8888_0565_process_pixblock_head
    cache_preload 8, 8
.endm

#endif

/*
 * And now the final part. We are using 'generate_composite_function' macro
 * to put all the stuff together. We are specifying the name of the function
 * which we want to get, number of bits per pixel for the source, mask and
 * destination (0 if unused, like mask in this case). Next come some bit
 * flags:
 * FLAG_DST_READWRITE - tells that the destination buffer is both read
 * and written, for write-only buffer we would use
 * FLAG_DST_WRITEONLY flag instead
 * FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
 * and separate color channels for 32bpp format.
 * The next things are:
 * - the number of pixels processed per iteration (8 in this case, because
 * that's the maximum that can fit into four 64-bit NEON registers).
 * - prefetch distance, measured in pixel blocks. In this case it is 5 times
 * by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
 * prefetch distance can be selected by running some benchmarks.
 *
 * After that we specify some macros, these are 'default_init',
 * 'default_cleanup' here which are empty (but it is possible to have custom
 * init/cleanup macros to be able to save/restore some extra NEON registers
 * like d8-d15 or do anything else) followed by
 * 'pixman_composite_over_8888_0565_process_pixblock_head',
 * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
 * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
 * which we got implemented above.
 *
 * The last part is the NEON registers allocation scheme.
 */
generate_composite_function \
    pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_0565_process_pixblock_head, \
    pixman_composite_over_8888_0565_process_pixblock_tail, \
    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

/*
 * Same math as over_8888_0565 above, but the "source" is a constant color
 * preloaded into d0-d3 by the init macro (which also pre-inverts the alpha,
 * so no vmvn is needed here).
 */
.macro pixman_composite_over_n_0565_process_pixblock_head
    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
       and put data into d6 - red, d7 - green, d30 - blue */
    vshrn.u16   d6, q2, #8
    vshrn.u16   d7, q2, #3
    vsli.u16    q2, q2, #5
    vsri.u8     d6, d6, #5
    vsri.u8     d7, d7, #6
    vshrn.u16   d30, q2, #2
    /* now do alpha blending, storing results in 8-bit planar format
       into d16 - red, d19 - green, d18 - blue */
    vmull.u8    q10, d3, d6
    vmull.u8    q11, d3, d7
    vmull.u8    q12, d3, d30
    vrshr.u16   q13, q10, #8
    vrshr.u16   q3, q11, #8
    vrshr.u16   q15, q12, #8
    vraddhn.u16 d20, q10, q13
    vraddhn.u16 d23, q11, q3
    vraddhn.u16 d22, q12, q15
.endm

.macro pixman_composite_over_n_0565_process_pixblock_tail
    /* ... continue alpha blending */
    vqadd.u8    d16, d2, d20
    vqadd.u8    q9, q0, q11
    /* convert the result to r5g6b5 and store it into {d28, d29} */
    vshll.u8    q14, d16, #8
    vshll.u8    q8, d19, #8
    vshll.u8    q9, d18, #8
    vsri.u16    q14, q8, #5
    vsri.u16    q14, q9, #11
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_n_0565_process_pixblock_tail_head
    pixman_composite_over_n_0565_process_pixblock_tail
    vld1.16     {d4, d5}, [DST_R, :128]!
    vst1.16     {d28, d29}, [DST_W, :128]!
    pixman_composite_over_n_0565_process_pixblock_head
.endm

/* Fetch the solid source color from the stack and replicate each of its
   four bytes into a full 8-byte plane (d0 - b, d1 - g, d2 - r, d3 - a) */
.macro pixman_composite_over_n_0565_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]
    vdup.8      d0, d3[0]
    vdup.8      d1, d3[1]
    vdup.8      d2, d3[2]
    vdup.8      d3, d3[3]
    vmvn.8      d3, d3      /* invert source alpha */
.endm

generate_composite_function \
    pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_0565_init, \
    default_cleanup, \
    pixman_composite_over_n_0565_process_pixblock_head, \
    pixman_composite_over_n_0565_process_pixblock_tail, \
    pixman_composite_over_n_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

/* x8r8g8b8 -> r5g6b5 conversion: widen each plane, then merge with VSRI */
.macro pixman_composite_src_8888_0565_process_pixblock_head
    vshll.u8    q8, d1, #8
    vshll.u8    q14, d2, #8
    vshll.u8    q9, d0, #8
.endm

.macro pixman_composite_src_8888_0565_process_pixblock_tail
    vsri.u16    q14, q8, #5
    vsri.u16    q14, q9, #11
.endm

.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
        vsri.u16    q14, q8, #5
                        PF add PF_X, PF_X, #8
                        PF tst PF_CTL, #0xF
    vld4.8      {d0, d1, d2, d3}, [SRC]!
                        PF addne PF_X, PF_X, #8
                        PF subne PF_CTL, PF_CTL, #1
        vsri.u16    q14, q9, #11
                        PF cmp PF_X, ORIG_W
                        PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
            vshll.u8    q8, d1, #8
    vst1.16     {d28, d29}, [DST_W, :128]!
                        PF subge PF_X, PF_X, ORIG_W
                        PF subges PF_CTL, PF_CTL, #0x10
            vshll.u8    q14, d2, #8
                        PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
            vshll.u8    q9, d0, #8
.endm

generate_composite_function \
    pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_8888_0565_process_pixblock_head, \
    pixman_composite_src_8888_0565_process_pixblock_tail, \
    pixman_composite_src_8888_0565_process_pixblock_tail_head

/******************************************************************************/

/* r5g6b5 -> a8r8g8b8 conversion; alpha channel is forced to 255 */
.macro pixman_composite_src_0565_8888_process_pixblock_head
    vshrn.u16   d30, q0, #8
    vshrn.u16   d29, q0, #3
    vsli.u16    q0, q0, #5
    vmov.u8     d31, #255   /* opaque alpha for all 8 pixels */
    vsri.u8     d30, d30, #5
    vsri.u8     d29, d29, #6
    vshrn.u16   d28, q0, #2
.endm

.macro pixman_composite_src_0565_8888_process_pixblock_tail
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
    pixman_composite_src_0565_8888_process_pixblock_tail
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vld1.16     {d0, d1}, [SRC]!
    pixman_composite_src_0565_8888_process_pixblock_head
    cache_preload 8, 8
.endm

generate_composite_function \
    pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0565_8888_process_pixblock_head, \
    pixman_composite_src_0565_8888_process_pixblock_tail, \
    pixman_composite_src_0565_8888_process_pixblock_tail_head

/******************************************************************************/

/* Saturating add of 32 8-bit pixels per block (ADD operator, a8 format) */
.macro pixman_composite_add_8000_8000_process_pixblock_head
    vqadd.u8    q14, q0, q2
    vqadd.u8    q15, q1, q3
.endm

.macro pixman_composite_add_8000_8000_process_pixblock_tail
.endm

.macro pixman_composite_add_8000_8000_process_pixblock_tail_head
    vld1.8      {d0, d1, d2, d3}, [SRC]!
                        PF add PF_X, PF_X, #32
                        PF tst PF_CTL, #0xF
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
                        PF addne PF_X, PF_X, #32
                        PF subne PF_CTL, PF_CTL, #1
    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
                        PF cmp PF_X, ORIG_W
                        PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
                        PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
                        PF subge PF_X, PF_X, ORIG_W
                        PF subges PF_CTL, PF_CTL, #0x10
        vqadd.u8    q14, q0, q2
                        PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
                        PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
        vqadd.u8    q15, q1, q3
.endm

generate_composite_function \
    pixman_composite_add_8000_8000_asm_neon, 8, 0, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8000_8000_process_pixblock_head, \
    pixman_composite_add_8000_8000_process_pixblock_tail, \
    pixman_composite_add_8000_8000_process_pixblock_tail_head

/******************************************************************************/

/* Same math as add_8000_8000 (head/tail are reused below), but 8 32bpp
   pixels per block, so the prefetch step is 8 instead of 32 */
.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
    vld1.8      {d0, d1, d2, d3}, [SRC]!
                        PF add PF_X, PF_X, #8
                        PF tst PF_CTL, #0xF
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
                        PF addne PF_X, PF_X, #8
                        PF subne PF_CTL, PF_CTL, #1
    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
                        PF cmp PF_X, ORIG_W
                        PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
                        PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
                        PF subge PF_X, PF_X, ORIG_W
                        PF subges PF_CTL, PF_CTL, #0x10
        vqadd.u8    q14, q0, q2
                        PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
                        PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
        vqadd.u8    q15, q1, q3
.endm

generate_composite_function \
    pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8000_8000_process_pixblock_head, \
    pixman_composite_add_8000_8000_process_pixblock_tail, \
    pixman_composite_add_8888_8888_process_pixblock_tail_head

generate_composite_function_single_scanline \
    pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8000_8000_process_pixblock_head, \
    pixman_composite_add_8000_8000_process_pixblock_tail, \
    pixman_composite_add_8888_8888_process_pixblock_tail_head

/******************************************************************************/

/* Standard OVER: dst = src + (1 - src.alpha) * dst, planar channels */
.macro pixman_composite_over_8888_8888_process_pixblock_head
    vmvn.8      d24, d3  /* get inverted alpha */
    /* do alpha blending */
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d24, d5
    vmull.u8    q10, d24, d6
    vmull.u8    q11, d24, d7
.endm

.macro pixman_composite_over_8888_8888_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
        vrshr.u16   q14, q8, #8
                        PF add PF_X, PF_X, #8
                        PF tst PF_CTL, #0xF
        vrshr.u16   q15, q9, #8
        vrshr.u16   q12, q10, #8
        vrshr.u16   q13, q11, #8
                        PF addne PF_X, PF_X, #8
                        PF subne PF_CTL, PF_CTL, #1
        vraddhn.u16 d28, q14, q8
        vraddhn.u16 d29, q15, q9
                        PF cmp PF_X, ORIG_W
        vraddhn.u16 d30, q12, q10
        vraddhn.u16 d31, q13, q11
        vqadd.u8    q14, q0, q14
        vqadd.u8    q15, q1, q15
    vld4.8      {d0, d1, d2, d3}, [SRC]!
                        PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
            vmvn.8      d22, d3
                        PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
                        PF subge PF_X, PF_X, ORIG_W
            vmull.u8    q8, d22, d4
                        PF subges PF_CTL, PF_CTL, #0x10
            vmull.u8    q9, d22, d5
                        PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
            vmull.u8    q10, d22, d6
                        PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
            vmull.u8    q11, d22, d7
.endm

generate_composite_function \
    pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_process_pixblock_tail_head

generate_composite_function_single_scanline \
    pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_process_pixblock_tail_head

/******************************************************************************/

/* TODO: expand macros and do better instructions scheduling */
/* OVER with a solid source color; reuses the over_8888_8888 head/tail,
   the constant color is preloaded into d0-d3 by the init macro below */
.macro pixman_composite_over_n_8888_process_pixblock_tail_head
    pixman_composite_over_8888_8888_process_pixblock_tail
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    pixman_composite_over_8888_8888_process_pixblock_head
.endm

/* Fetch the solid source color from the stack and replicate each of its
   four bytes into a full 8-byte plane (d0 - b, d1 - g, d2 - r, d3 - a) */
.macro pixman_composite_over_n_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]
    vdup.8      d0, d3[0]
    vdup.8      d1, d3[1]
    vdup.8      d2, d3[2]
    vdup.8      d3, d3[3]
.endm

generate_composite_function \
    pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8888_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_n_8888_process_pixblock_tail_head

/******************************************************************************/

/* OVER_REVERSE with a solid color: destination is loaded into d0-d3 and
   blended over the constant color kept in d4-d7 (note the swapped
   src/dst base registers passed to generate_composite_function below) */
.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
        vrshr.u16   q14, q8, #8
                        PF add PF_X, PF_X, #8
                        PF tst PF_CTL, #0xF
        vrshr.u16   q15, q9, #8
        vrshr.u16   q12, q10, #8
        vrshr.u16   q13, q11, #8
                        PF addne PF_X, PF_X, #8
                        PF subne PF_CTL, PF_CTL, #1
        vraddhn.u16 d28, q14, q8
        vraddhn.u16 d29, q15, q9
                        PF cmp PF_X, ORIG_W
        vraddhn.u16 d30, q12, q10
        vraddhn.u16 d31, q13, q11
        vqadd.u8    q14, q0, q14
        vqadd.u8    q15, q1, q15
    vld4.8      {d0, d1, d2, d3}, [DST_R, :128]!
            vmvn.8      d22, d3
                        PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
                        PF subge PF_X, PF_X, ORIG_W
            vmull.u8    q8, d22, d4
                        PF subges PF_CTL, PF_CTL, #0x10
            vmull.u8    q9, d22, d5
            vmull.u8    q10, d22, d6
                        PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
            vmull.u8    q11, d22, d7
.endm

/* Replicate the solid color's four bytes into d4-d7 (the "source"
   operand of the blend above) */
.macro pixman_composite_over_reverse_n_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d7[0]}, [DUMMY]
    vdup.8      d4, d7[0]
    vdup.8      d5, d7[1]
    vdup.8      d6, d7[2]
    vdup.8      d7, d7[3]
.endm

generate_composite_function \
    pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_reverse_n_8888_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    0,  /* dst_r_basereg */ \
    4,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_over_n_8_0565_process_pixblock_head
    /* 'in' operation: multiply the solid color (d8-d11, preloaded by the
       init macro) by the a8 mask in d24; note q6/q7 overlap the
       callee-saved d12-d15, which is why init does vpush {d8-d15} */
    vmull.u8    q0, d24, d8
    vmull.u8    q1, d24, d9
    vmull.u8    q6, d24, d10
    vmull.u8    q7, d24, d11
    vrshr.u16   q10, q0, #8
    vrshr.u16   q11, q1, #8
    vrshr.u16   q12, q6, #8
    vrshr.u16   q13, q7, #8
    vraddhn.u16 d0, q0, q10
    vraddhn.u16 d1, q1, q11
    vraddhn.u16 d2, q6, q12
    vraddhn.u16 d3, q7, q13

    /* convert 8 r5g6b5 destination pixels to planar d6/d7/d30 (r/g/b) */
    vshrn.u16   d6, q2, #8
    vshrn.u16   d7, q2, #3
    vsli.u16    q2, q2, #5
    vsri.u8     d6, d6, #5
    vmvn.8      d3, d3      /* invert the masked source alpha */
    vsri.u8     d7, d7, #6
    vshrn.u16   d30, q2, #2
    /* now do alpha blending */
    vmull.u8    q10, d3, d6
    vmull.u8    q11, d3, d7
    vmull.u8    q12, d3, d30
    vrshr.u16   q13, q10, #8
    vrshr.u16   q3, q11, #8
    vrshr.u16   q15, q12, #8
    vraddhn.u16 d20, q10, q13
    vraddhn.u16 d23, q11, q3
    vraddhn.u16 d22, q12, q15
.endm

.macro pixman_composite_over_n_8_0565_process_pixblock_tail
    vqadd.u8    d16, d2, d20
    vqadd.u8    q9, q0, q11
    /* convert to r5g6b5 */
    vshll.u8    q14, d16, #8
    vshll.u8    q8, d19, #8
    vshll.u8    q9, d18, #8
    vsri.u16    q14, q8, #5
    vsri.u16    q14, q9, #11
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_n_8_0565_process_pixblock_tail_head
    pixman_composite_over_n_8_0565_process_pixblock_tail
    vst1.16     {d28, d29}, [DST_W, :128]!
    vld1.16     {d4, d5}, [DST_R, :128]!
    vld1.8      {d24}, [MASK]!
    cache_preload 8, 8
    pixman_composite_over_n_8_0565_process_pixblock_head
.endm

/*
 * This function needs a special initialization of solid mask.
 * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
 * offset, split into color components and replicated in d8-d11
 * registers. Additionally, this function needs all the NEON registers,
 * so it has to save d8-d15 registers which are callee saved according
 * to ABI. These registers are restored in the 'cleanup' macro. All the
 * other NEON registers are caller saved, so can be clobbered freely
 * without introducing any problems.
 */
.macro pixman_composite_over_n_8_0565_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    vdup.8      d8, d11[0]
    vdup.8      d9, d11[1]
    vdup.8      d10, d11[2]
    vdup.8      d11, d11[3]
.endm

.macro pixman_composite_over_n_8_0565_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8_0565_init, \
    pixman_composite_over_n_8_0565_cleanup, \
    pixman_composite_over_n_8_0565_process_pixblock_head, \
    pixman_composite_over_n_8_0565_process_pixblock_tail, \
    pixman_composite_over_n_8_0565_process_pixblock_tail_head

/******************************************************************************/

/* Plain 16bpp copy: no per-pixel math, just bulk load/store */
.macro pixman_composite_src_0565_0565_process_pixblock_head
.endm

.macro pixman_composite_src_0565_0565_process_pixblock_tail
.endm

.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
    vst1.16     {d0, d1, d2, d3}, [DST_W, :128]!
    vld1.16     {d0, d1, d2, d3}, [SRC]!
    cache_preload 16, 16
.endm

generate_composite_function \
    pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
    FLAG_DST_WRITEONLY, \
    16, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0565_0565_process_pixblock_head, \
    pixman_composite_src_0565_0565_process_pixblock_tail, \
    pixman_composite_src_0565_0565_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

/* Solid 8bpp fill: init replicates the byte, tail_head just stores */
.macro pixman_composite_src_n_8_process_pixblock_head
.endm

.macro pixman_composite_src_n_8_process_pixblock_tail
.endm

.macro pixman_composite_src_n_8_process_pixblock_tail_head
    vst1.8      {d0, d1, d2, d3}, [DST_W, :128]!
.endm

.macro pixman_composite_src_n_8_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    /* replicate the low byte across all 8 bytes of d0 by shift-inserting
       at doubling offsets, then widen to d1 and q1 (32 bytes per store) */
    vsli.u64    d0, d0, #8
    vsli.u64    d0, d0, #16
    vsli.u64    d0, d0, #32
    vmov        d1, d0
    vmov        q1, q0
.endm

.macro pixman_composite_src_n_8_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
    FLAG_DST_WRITEONLY, \
    32, /* number of pixels, processed in a single block */ \
    0, /* prefetch distance */ \
    pixman_composite_src_n_8_init, \
    pixman_composite_src_n_8_cleanup, \
    pixman_composite_src_n_8_process_pixblock_head, \
    pixman_composite_src_n_8_process_pixblock_tail, \
    pixman_composite_src_n_8_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

/* Solid r5g6b5 fill: same pattern, replicating a 16-bit pixel */
.macro pixman_composite_src_n_0565_process_pixblock_head
.endm

.macro pixman_composite_src_n_0565_process_pixblock_tail
.endm

.macro pixman_composite_src_n_0565_process_pixblock_tail_head
    vst1.16     {d0, d1, d2, d3}, [DST_W, :128]!
.endm

.macro pixman_composite_src_n_0565_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    /* replicate the 16-bit pixel across d0, then widen to d1 and q1 */
    vsli.u64    d0, d0, #16
    vsli.u64    d0, d0, #32
    vmov        d1, d0
    vmov        q1, q0
.endm

.macro pixman_composite_src_n_0565_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
    FLAG_DST_WRITEONLY, \
    16, /* number of pixels, processed in a single block */ \
    0, /* prefetch distance */ \
    pixman_composite_src_n_0565_init, \
    pixman_composite_src_n_0565_cleanup, \
    pixman_composite_src_n_0565_process_pixblock_head, \
    pixman_composite_src_n_0565_process_pixblock_tail, \
    pixman_composite_src_n_0565_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

/* Solid 32bpp fill: replicate one 32-bit pixel into q0/q1 */
.macro pixman_composite_src_n_8888_process_pixblock_head
.endm

.macro pixman_composite_src_n_8888_process_pixblock_tail
.endm

.macro pixman_composite_src_n_8888_process_pixblock_tail_head
    vst1.32     {d0, d1, d2, d3}, [DST_W, :128]!
.endm

.macro pixman_composite_src_n_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    vsli.u64    d0, d0, #32
    vmov        d1, d0
    vmov        q1, q0
.endm

.macro pixman_composite_src_n_8888_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    0, /* prefetch distance */ \
    pixman_composite_src_n_8888_init, \
    pixman_composite_src_n_8888_cleanup, \
    pixman_composite_src_n_8888_process_pixblock_head, \
    pixman_composite_src_n_8888_process_pixblock_tail, \
    pixman_composite_src_n_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

/* Plain 32bpp copy */
.macro pixman_composite_src_8888_8888_process_pixblock_head
.endm

.macro pixman_composite_src_8888_8888_process_pixblock_tail
.endm

.macro pixman_composite_src_8888_8888_process_pixblock_tail_head
    vst1.32     {d0, d1, d2, d3}, [DST_W, :128]!
    vld1.32     {d0, d1, d2, d3}, [SRC]!
987 cache_preload 8, 8 988.endm 989 990generate_composite_function \ 991 pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \ 992 FLAG_DST_WRITEONLY, \ 993 8, /* number of pixels, processed in a single block */ \ 994 10, /* prefetch distance */ \ 995 default_init, \ 996 default_cleanup, \ 997 pixman_composite_src_8888_8888_process_pixblock_head, \ 998 pixman_composite_src_8888_8888_process_pixblock_tail, \ 999 pixman_composite_src_8888_8888_process_pixblock_tail_head, \ 1000 0, /* dst_w_basereg */ \ 1001 0, /* dst_r_basereg */ \ 1002 0, /* src_basereg */ \ 1003 0 /* mask_basereg */ 1004 1005/******************************************************************************/ 1006 1007.macro pixman_composite_src_x888_8888_process_pixblock_head 1008 vorr q0, q0, q2 1009 vorr q1, q1, q2 1010.endm 1011 1012.macro pixman_composite_src_x888_8888_process_pixblock_tail 1013.endm 1014 1015.macro pixman_composite_src_x888_8888_process_pixblock_tail_head 1016 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]! 1017 vld1.32 {d0, d1, d2, d3}, [SRC]! 
1018 vorr q0, q0, q2 1019 vorr q1, q1, q2 1020 cache_preload 8, 8 1021.endm 1022 1023.macro pixman_composite_src_x888_8888_init 1024 vmov.u8 q2, #0xFF 1025 vshl.u32 q2, q2, #24 1026.endm 1027 1028generate_composite_function \ 1029 pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \ 1030 FLAG_DST_WRITEONLY, \ 1031 8, /* number of pixels, processed in a single block */ \ 1032 10, /* prefetch distance */ \ 1033 pixman_composite_src_x888_8888_init, \ 1034 default_cleanup, \ 1035 pixman_composite_src_x888_8888_process_pixblock_head, \ 1036 pixman_composite_src_x888_8888_process_pixblock_tail, \ 1037 pixman_composite_src_x888_8888_process_pixblock_tail_head, \ 1038 0, /* dst_w_basereg */ \ 1039 0, /* dst_r_basereg */ \ 1040 0, /* src_basereg */ \ 1041 0 /* mask_basereg */ 1042 1043/******************************************************************************/ 1044 1045.macro pixman_composite_over_n_8_8888_process_pixblock_head 1046 /* expecting deinterleaved source data in {d8, d9, d10, d11} */ 1047 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */ 1048 /* and destination data in {d4, d5, d6, d7} */ 1049 /* mask is in d24 (d25, d26, d27 are unused) */ 1050 1051 /* in */ 1052 vmull.u8 q0, d24, d8 1053 vmull.u8 q1, d24, d9 1054 vmull.u8 q6, d24, d10 1055 vmull.u8 q7, d24, d11 1056 vrshr.u16 q10, q0, #8 1057 vrshr.u16 q11, q1, #8 1058 vrshr.u16 q12, q6, #8 1059 vrshr.u16 q13, q7, #8 1060 vraddhn.u16 d0, q0, q10 1061 vraddhn.u16 d1, q1, q11 1062 vraddhn.u16 d2, q6, q12 1063 vraddhn.u16 d3, q7, q13 1064 vmvn.8 d24, d3 /* get inverted alpha */ 1065 /* source: d0 - blue, d1 - green, d2 - red, d3 - alpha */ 1066 /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */ 1067 /* now do alpha blending */ 1068 vmull.u8 q8, d24, d4 1069 vmull.u8 q9, d24, d5 1070 vmull.u8 q10, d24, d6 1071 vmull.u8 q11, d24, d7 1072.endm 1073 1074.macro pixman_composite_over_n_8_8888_process_pixblock_tail 1075 vrshr.u16 q14, q8, #8 1076 vrshr.u16 q15, q9, #8 1077 vrshr.u16 q12, q10, #8 1078 
vrshr.u16 q13, q11, #8 1079 vraddhn.u16 d28, q14, q8 1080 vraddhn.u16 d29, q15, q9 1081 vraddhn.u16 d30, q12, q10 1082 vraddhn.u16 d31, q13, q11 1083 vqadd.u8 q14, q0, q14 1084 vqadd.u8 q15, q1, q15 1085.endm 1086 1087/* TODO: expand macros and do better instructions scheduling */ 1088.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head 1089 pixman_composite_over_n_8_8888_process_pixblock_tail 1090 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 1091 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 1092 vld1.8 {d24}, [MASK]! 1093 cache_preload 8, 8 1094 pixman_composite_over_n_8_8888_process_pixblock_head 1095.endm 1096 1097.macro pixman_composite_over_n_8_8888_init 1098 add DUMMY, sp, #ARGS_STACK_OFFSET 1099 vpush {d8-d15} 1100 vld1.32 {d11[0]}, [DUMMY] 1101 vdup.8 d8, d11[0] 1102 vdup.8 d9, d11[1] 1103 vdup.8 d10, d11[2] 1104 vdup.8 d11, d11[3] 1105.endm 1106 1107.macro pixman_composite_over_n_8_8888_cleanup 1108 vpop {d8-d15} 1109.endm 1110 1111generate_composite_function \ 1112 pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \ 1113 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 1114 8, /* number of pixels, processed in a single block */ \ 1115 5, /* prefetch distance */ \ 1116 pixman_composite_over_n_8_8888_init, \ 1117 pixman_composite_over_n_8_8888_cleanup, \ 1118 pixman_composite_over_n_8_8888_process_pixblock_head, \ 1119 pixman_composite_over_n_8_8888_process_pixblock_tail, \ 1120 pixman_composite_over_n_8_8888_process_pixblock_tail_head 1121 1122/******************************************************************************/ 1123 1124.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head 1125 /* 1126 * 'combine_mask_ca' replacement 1127 * 1128 * input: solid src (n) in {d8, d9, d10, d11} 1129 * dest in {d4, d5, d6, d7 } 1130 * mask in {d24, d25, d26, d27} 1131 * output: updated src in {d0, d1, d2, d3 } 1132 * updated mask in {d24, d25, d26, d3 } 1133 */ 1134 vmull.u8 q0, d24, d8 1135 vmull.u8 q1, d25, d9 1136 vmull.u8 q6, d26, d10 1137 
vmull.u8 q7, d27, d11 1138 vmull.u8 q9, d11, d25 1139 vmull.u8 q12, d11, d24 1140 vmull.u8 q13, d11, d26 1141 vrshr.u16 q8, q0, #8 1142 vrshr.u16 q10, q1, #8 1143 vrshr.u16 q11, q6, #8 1144 vraddhn.u16 d0, q0, q8 1145 vraddhn.u16 d1, q1, q10 1146 vraddhn.u16 d2, q6, q11 1147 vrshr.u16 q11, q12, #8 1148 vrshr.u16 q8, q9, #8 1149 vrshr.u16 q6, q13, #8 1150 vrshr.u16 q10, q7, #8 1151 vraddhn.u16 d24, q12, q11 1152 vraddhn.u16 d25, q9, q8 1153 vraddhn.u16 d26, q13, q6 1154 vraddhn.u16 d3, q7, q10 1155 /* 1156 * 'combine_over_ca' replacement 1157 * 1158 * output: updated dest in {d28, d29, d30, d31} 1159 */ 1160 vmvn.8 d24, d24 1161 vmvn.8 d25, d25 1162 vmull.u8 q8, d24, d4 1163 vmull.u8 q9, d25, d5 1164 vmvn.8 d26, d26 1165 vmvn.8 d27, d3 1166 vmull.u8 q10, d26, d6 1167 vmull.u8 q11, d27, d7 1168.endm 1169 1170.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail 1171 /* ... continue 'combine_over_ca' replacement */ 1172 vrshr.u16 q14, q8, #8 1173 vrshr.u16 q15, q9, #8 1174 vrshr.u16 q6, q10, #8 1175 vrshr.u16 q7, q11, #8 1176 vraddhn.u16 d28, q14, q8 1177 vraddhn.u16 d29, q15, q9 1178 vraddhn.u16 d30, q6, q10 1179 vraddhn.u16 d31, q7, q11 1180 vqadd.u8 q14, q0, q14 1181 vqadd.u8 q15, q1, q15 1182.endm 1183 1184.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head 1185 vrshr.u16 q14, q8, #8 1186 vrshr.u16 q15, q9, #8 1187 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 1188 vrshr.u16 q6, q10, #8 1189 vrshr.u16 q7, q11, #8 1190 vraddhn.u16 d28, q14, q8 1191 vraddhn.u16 d29, q15, q9 1192 vraddhn.u16 d30, q6, q10 1193 vraddhn.u16 d31, q7, q11 1194 vld4.8 {d24, d25, d26, d27}, [MASK]! 1195 vqadd.u8 q14, q0, q14 1196 vqadd.u8 q15, q1, q15 1197 cache_preload 8, 8 1198 pixman_composite_over_n_8888_8888_ca_process_pixblock_head 1199 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 
1200.endm 1201 1202.macro pixman_composite_over_n_8888_8888_ca_init 1203 add DUMMY, sp, #ARGS_STACK_OFFSET 1204 vpush {d8-d15} 1205 vld1.32 {d11[0]}, [DUMMY] 1206 vdup.8 d8, d11[0] 1207 vdup.8 d9, d11[1] 1208 vdup.8 d10, d11[2] 1209 vdup.8 d11, d11[3] 1210.endm 1211 1212.macro pixman_composite_over_n_8888_8888_ca_cleanup 1213 vpop {d8-d15} 1214.endm 1215 1216generate_composite_function \ 1217 pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \ 1218 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 1219 8, /* number of pixels, processed in a single block */ \ 1220 5, /* prefetch distance */ \ 1221 pixman_composite_over_n_8888_8888_ca_init, \ 1222 pixman_composite_over_n_8888_8888_ca_cleanup, \ 1223 pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \ 1224 pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \ 1225 pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head 1226 1227/******************************************************************************/ 1228 1229.macro pixman_composite_add_n_8_8_process_pixblock_head 1230 /* expecting source data in {d8, d9, d10, d11} */ 1231 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */ 1232 /* and destination data in {d4, d5, d6, d7} */ 1233 /* mask is in d24, d25, d26, d27 */ 1234 vmull.u8 q0, d24, d11 1235 vmull.u8 q1, d25, d11 1236 vmull.u8 q6, d26, d11 1237 vmull.u8 q7, d27, d11 1238 vrshr.u16 q10, q0, #8 1239 vrshr.u16 q11, q1, #8 1240 vrshr.u16 q12, q6, #8 1241 vrshr.u16 q13, q7, #8 1242 vraddhn.u16 d0, q0, q10 1243 vraddhn.u16 d1, q1, q11 1244 vraddhn.u16 d2, q6, q12 1245 vraddhn.u16 d3, q7, q13 1246 vqadd.u8 q14, q0, q2 1247 vqadd.u8 q15, q1, q3 1248.endm 1249 1250.macro pixman_composite_add_n_8_8_process_pixblock_tail 1251.endm 1252 1253/* TODO: expand macros and do better instructions scheduling */ 1254.macro pixman_composite_add_n_8_8_process_pixblock_tail_head 1255 pixman_composite_add_n_8_8_process_pixblock_tail 1256 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! 
1257 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! 1258 vld1.8 {d24, d25, d26, d27}, [MASK]! 1259 cache_preload 32, 32 1260 pixman_composite_add_n_8_8_process_pixblock_head 1261.endm 1262 1263.macro pixman_composite_add_n_8_8_init 1264 add DUMMY, sp, #ARGS_STACK_OFFSET 1265 vpush {d8-d15} 1266 vld1.32 {d11[0]}, [DUMMY] 1267 vdup.8 d11, d11[3] 1268.endm 1269 1270.macro pixman_composite_add_n_8_8_cleanup 1271 vpop {d8-d15} 1272.endm 1273 1274generate_composite_function \ 1275 pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \ 1276 FLAG_DST_READWRITE, \ 1277 32, /* number of pixels, processed in a single block */ \ 1278 5, /* prefetch distance */ \ 1279 pixman_composite_add_n_8_8_init, \ 1280 pixman_composite_add_n_8_8_cleanup, \ 1281 pixman_composite_add_n_8_8_process_pixblock_head, \ 1282 pixman_composite_add_n_8_8_process_pixblock_tail, \ 1283 pixman_composite_add_n_8_8_process_pixblock_tail_head 1284 1285/******************************************************************************/ 1286 1287.macro pixman_composite_add_8_8_8_process_pixblock_head 1288 /* expecting source data in {d0, d1, d2, d3} */ 1289 /* destination data in {d4, d5, d6, d7} */ 1290 /* mask in {d24, d25, d26, d27} */ 1291 vmull.u8 q8, d24, d0 1292 vmull.u8 q9, d25, d1 1293 vmull.u8 q10, d26, d2 1294 vmull.u8 q11, d27, d3 1295 vrshr.u16 q0, q8, #8 1296 vrshr.u16 q1, q9, #8 1297 vrshr.u16 q12, q10, #8 1298 vrshr.u16 q13, q11, #8 1299 vraddhn.u16 d0, q0, q8 1300 vraddhn.u16 d1, q1, q9 1301 vraddhn.u16 d2, q12, q10 1302 vraddhn.u16 d3, q13, q11 1303 vqadd.u8 q14, q0, q2 1304 vqadd.u8 q15, q1, q3 1305.endm 1306 1307.macro pixman_composite_add_8_8_8_process_pixblock_tail 1308.endm 1309 1310/* TODO: expand macros and do better instructions scheduling */ 1311.macro pixman_composite_add_8_8_8_process_pixblock_tail_head 1312 pixman_composite_add_8_8_8_process_pixblock_tail 1313 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! 1314 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! 1315 vld1.8 {d24, d25, d26, d27}, [MASK]! 
1316 vld1.8 {d0, d1, d2, d3}, [SRC]! 1317 cache_preload 32, 32 1318 pixman_composite_add_8_8_8_process_pixblock_head 1319.endm 1320 1321.macro pixman_composite_add_8_8_8_init 1322.endm 1323 1324.macro pixman_composite_add_8_8_8_cleanup 1325.endm 1326 1327generate_composite_function \ 1328 pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \ 1329 FLAG_DST_READWRITE, \ 1330 32, /* number of pixels, processed in a single block */ \ 1331 5, /* prefetch distance */ \ 1332 pixman_composite_add_8_8_8_init, \ 1333 pixman_composite_add_8_8_8_cleanup, \ 1334 pixman_composite_add_8_8_8_process_pixblock_head, \ 1335 pixman_composite_add_8_8_8_process_pixblock_tail, \ 1336 pixman_composite_add_8_8_8_process_pixblock_tail_head 1337 1338/******************************************************************************/ 1339 1340.macro pixman_composite_add_8888_8888_8888_process_pixblock_head 1341 /* expecting source data in {d0, d1, d2, d3} */ 1342 /* destination data in {d4, d5, d6, d7} */ 1343 /* mask in {d24, d25, d26, d27} */ 1344 vmull.u8 q8, d27, d0 1345 vmull.u8 q9, d27, d1 1346 vmull.u8 q10, d27, d2 1347 vmull.u8 q11, d27, d3 1348 vrshr.u16 q0, q8, #8 1349 vrshr.u16 q1, q9, #8 1350 vrshr.u16 q12, q10, #8 1351 vrshr.u16 q13, q11, #8 1352 vraddhn.u16 d0, q0, q8 1353 vraddhn.u16 d1, q1, q9 1354 vraddhn.u16 d2, q12, q10 1355 vraddhn.u16 d3, q13, q11 1356 vqadd.u8 q14, q0, q2 1357 vqadd.u8 q15, q1, q3 1358.endm 1359 1360.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail 1361.endm 1362 1363/* TODO: expand macros and do better instructions scheduling */ 1364.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head 1365 pixman_composite_add_8888_8888_8888_process_pixblock_tail 1366 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 1367 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 1368 vld4.8 {d24, d25, d26, d27}, [MASK]! 1369 vld4.8 {d0, d1, d2, d3}, [SRC]! 
1370 cache_preload 8, 8 1371 pixman_composite_add_8888_8888_8888_process_pixblock_head 1372.endm 1373 1374generate_composite_function \ 1375 pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \ 1376 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 1377 8, /* number of pixels, processed in a single block */ \ 1378 10, /* prefetch distance */ \ 1379 default_init, \ 1380 default_cleanup, \ 1381 pixman_composite_add_8888_8888_8888_process_pixblock_head, \ 1382 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ 1383 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head 1384 1385generate_composite_function_single_scanline \ 1386 pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \ 1387 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 1388 8, /* number of pixels, processed in a single block */ \ 1389 default_init, \ 1390 default_cleanup, \ 1391 pixman_composite_add_8888_8888_8888_process_pixblock_head, \ 1392 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ 1393 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head 1394 1395/******************************************************************************/ 1396 1397.macro pixman_composite_over_8888_n_8888_process_pixblock_head 1398 /* expecting source data in {d0, d1, d2, d3} */ 1399 /* destination data in {d4, d5, d6, d7} */ 1400 /* solid mask is in d15 */ 1401 1402 /* 'in' */ 1403 vmull.u8 q8, d15, d3 1404 vmull.u8 q6, d15, d2 1405 vmull.u8 q5, d15, d1 1406 vmull.u8 q4, d15, d0 1407 vrshr.u16 q13, q8, #8 1408 vrshr.u16 q12, q6, #8 1409 vrshr.u16 q11, q5, #8 1410 vrshr.u16 q10, q4, #8 1411 vraddhn.u16 d3, q8, q13 1412 vraddhn.u16 d2, q6, q12 1413 vraddhn.u16 d1, q5, q11 1414 vraddhn.u16 d0, q4, q10 1415 vmvn.8 d24, d3 /* get inverted alpha */ 1416 /* now do alpha blending */ 1417 vmull.u8 q8, d24, d4 1418 vmull.u8 q9, d24, d5 1419 vmull.u8 q10, d24, d6 1420 vmull.u8 q11, d24, d7 1421.endm 1422 1423.macro pixman_composite_over_8888_n_8888_process_pixblock_tail 1424 vrshr.u16 
q14, q8, #8 1425 vrshr.u16 q15, q9, #8 1426 vrshr.u16 q12, q10, #8 1427 vrshr.u16 q13, q11, #8 1428 vraddhn.u16 d28, q14, q8 1429 vraddhn.u16 d29, q15, q9 1430 vraddhn.u16 d30, q12, q10 1431 vraddhn.u16 d31, q13, q11 1432 vqadd.u8 q14, q0, q14 1433 vqadd.u8 q15, q1, q15 1434.endm 1435 1436/* TODO: expand macros and do better instructions scheduling */ 1437.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head 1438 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 1439 pixman_composite_over_8888_n_8888_process_pixblock_tail 1440 vld4.8 {d0, d1, d2, d3}, [SRC]! 1441 cache_preload 8, 8 1442 pixman_composite_over_8888_n_8888_process_pixblock_head 1443 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 1444.endm 1445 1446.macro pixman_composite_over_8888_n_8888_init 1447 add DUMMY, sp, #48 1448 vpush {d8-d15} 1449 vld1.32 {d15[0]}, [DUMMY] 1450 vdup.8 d15, d15[3] 1451.endm 1452 1453.macro pixman_composite_over_8888_n_8888_cleanup 1454 vpop {d8-d15} 1455.endm 1456 1457generate_composite_function \ 1458 pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \ 1459 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 1460 8, /* number of pixels, processed in a single block */ \ 1461 5, /* prefetch distance */ \ 1462 pixman_composite_over_8888_n_8888_init, \ 1463 pixman_composite_over_8888_n_8888_cleanup, \ 1464 pixman_composite_over_8888_n_8888_process_pixblock_head, \ 1465 pixman_composite_over_8888_n_8888_process_pixblock_tail, \ 1466 pixman_composite_over_8888_n_8888_process_pixblock_tail_head 1467 1468/******************************************************************************/ 1469 1470/* TODO: expand macros and do better instructions scheduling */ 1471.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head 1472 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 1473 pixman_composite_over_8888_n_8888_process_pixblock_tail 1474 vld4.8 {d0, d1, d2, d3}, [SRC]! 1475 cache_preload 8, 8 1476 vld4.8 {d12, d13, d14, d15}, [MASK]! 
1477 pixman_composite_over_8888_n_8888_process_pixblock_head 1478 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 1479.endm 1480 1481.macro pixman_composite_over_8888_8888_8888_init 1482 vpush {d8-d15} 1483.endm 1484 1485.macro pixman_composite_over_8888_8888_8888_cleanup 1486 vpop {d8-d15} 1487.endm 1488 1489generate_composite_function \ 1490 pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \ 1491 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 1492 8, /* number of pixels, processed in a single block */ \ 1493 5, /* prefetch distance */ \ 1494 pixman_composite_over_8888_8888_8888_init, \ 1495 pixman_composite_over_8888_8888_8888_cleanup, \ 1496 pixman_composite_over_8888_n_8888_process_pixblock_head, \ 1497 pixman_composite_over_8888_n_8888_process_pixblock_tail, \ 1498 pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \ 1499 28, /* dst_w_basereg */ \ 1500 4, /* dst_r_basereg */ \ 1501 0, /* src_basereg */ \ 1502 12 /* mask_basereg */ 1503 1504generate_composite_function_single_scanline \ 1505 pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \ 1506 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 1507 8, /* number of pixels, processed in a single block */ \ 1508 pixman_composite_over_8888_8888_8888_init, \ 1509 pixman_composite_over_8888_8888_8888_cleanup, \ 1510 pixman_composite_over_8888_n_8888_process_pixblock_head, \ 1511 pixman_composite_over_8888_n_8888_process_pixblock_tail, \ 1512 pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \ 1513 28, /* dst_w_basereg */ \ 1514 4, /* dst_r_basereg */ \ 1515 0, /* src_basereg */ \ 1516 12 /* mask_basereg */ 1517 1518/******************************************************************************/ 1519 1520/* TODO: expand macros and do better instructions scheduling */ 1521.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head 1522 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 
1523 pixman_composite_over_8888_n_8888_process_pixblock_tail 1524 vld4.8 {d0, d1, d2, d3}, [SRC]! 1525 cache_preload 8, 8 1526 vld1.8 {d15}, [MASK]! 1527 pixman_composite_over_8888_n_8888_process_pixblock_head 1528 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 1529.endm 1530 1531.macro pixman_composite_over_8888_8_8888_init 1532 vpush {d8-d15} 1533.endm 1534 1535.macro pixman_composite_over_8888_8_8888_cleanup 1536 vpop {d8-d15} 1537.endm 1538 1539generate_composite_function \ 1540 pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \ 1541 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 1542 8, /* number of pixels, processed in a single block */ \ 1543 5, /* prefetch distance */ \ 1544 pixman_composite_over_8888_8_8888_init, \ 1545 pixman_composite_over_8888_8_8888_cleanup, \ 1546 pixman_composite_over_8888_n_8888_process_pixblock_head, \ 1547 pixman_composite_over_8888_n_8888_process_pixblock_tail, \ 1548 pixman_composite_over_8888_8_8888_process_pixblock_tail_head \ 1549 28, /* dst_w_basereg */ \ 1550 4, /* dst_r_basereg */ \ 1551 0, /* src_basereg */ \ 1552 15 /* mask_basereg */ 1553 1554/******************************************************************************/ 1555 1556.macro pixman_composite_src_0888_0888_process_pixblock_head 1557.endm 1558 1559.macro pixman_composite_src_0888_0888_process_pixblock_tail 1560.endm 1561 1562.macro pixman_composite_src_0888_0888_process_pixblock_tail_head 1563 vst3.8 {d0, d1, d2}, [DST_W]! 1564 vld3.8 {d0, d1, d2}, [SRC]! 
1565 cache_preload 8, 8 1566.endm 1567 1568generate_composite_function \ 1569 pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \ 1570 FLAG_DST_WRITEONLY, \ 1571 8, /* number of pixels, processed in a single block */ \ 1572 10, /* prefetch distance */ \ 1573 default_init, \ 1574 default_cleanup, \ 1575 pixman_composite_src_0888_0888_process_pixblock_head, \ 1576 pixman_composite_src_0888_0888_process_pixblock_tail, \ 1577 pixman_composite_src_0888_0888_process_pixblock_tail_head, \ 1578 0, /* dst_w_basereg */ \ 1579 0, /* dst_r_basereg */ \ 1580 0, /* src_basereg */ \ 1581 0 /* mask_basereg */ 1582 1583/******************************************************************************/ 1584 1585.macro pixman_composite_src_0888_8888_rev_process_pixblock_head 1586 vswp d0, d2 1587.endm 1588 1589.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail 1590.endm 1591 1592.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head 1593 vst4.8 {d0, d1, d2, d3}, [DST_W]! 1594 vld3.8 {d0, d1, d2}, [SRC]! 
1595 vswp d0, d2 1596 cache_preload 8, 8 1597.endm 1598 1599.macro pixman_composite_src_0888_8888_rev_init 1600 veor d3, d3, d3 1601.endm 1602 1603generate_composite_function \ 1604 pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \ 1605 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ 1606 8, /* number of pixels, processed in a single block */ \ 1607 10, /* prefetch distance */ \ 1608 pixman_composite_src_0888_8888_rev_init, \ 1609 default_cleanup, \ 1610 pixman_composite_src_0888_8888_rev_process_pixblock_head, \ 1611 pixman_composite_src_0888_8888_rev_process_pixblock_tail, \ 1612 pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \ 1613 0, /* dst_w_basereg */ \ 1614 0, /* dst_r_basereg */ \ 1615 0, /* src_basereg */ \ 1616 0 /* mask_basereg */ 1617 1618/******************************************************************************/ 1619 1620.macro pixman_composite_src_0888_0565_rev_process_pixblock_head 1621 vshll.u8 q8, d1, #8 1622 vshll.u8 q9, d2, #8 1623.endm 1624 1625.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail 1626 vshll.u8 q14, d0, #8 1627 vsri.u16 q14, q8, #5 1628 vsri.u16 q14, q9, #11 1629.endm 1630 1631.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head 1632 vshll.u8 q14, d0, #8 1633 vld3.8 {d0, d1, d2}, [SRC]! 1634 vsri.u16 q14, q8, #5 1635 vsri.u16 q14, q9, #11 1636 vshll.u8 q8, d1, #8 1637 vst1.16 {d28, d29}, [DST_W, :128]! 
1638 vshll.u8 q9, d2, #8 1639.endm 1640 1641generate_composite_function \ 1642 pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \ 1643 FLAG_DST_WRITEONLY, \ 1644 8, /* number of pixels, processed in a single block */ \ 1645 10, /* prefetch distance */ \ 1646 default_init, \ 1647 default_cleanup, \ 1648 pixman_composite_src_0888_0565_rev_process_pixblock_head, \ 1649 pixman_composite_src_0888_0565_rev_process_pixblock_tail, \ 1650 pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \ 1651 28, /* dst_w_basereg */ \ 1652 0, /* dst_r_basereg */ \ 1653 0, /* src_basereg */ \ 1654 0 /* mask_basereg */ 1655 1656/******************************************************************************/ 1657 1658.macro pixman_composite_src_pixbuf_8888_process_pixblock_head 1659 vmull.u8 q8, d3, d0 1660 vmull.u8 q9, d3, d1 1661 vmull.u8 q10, d3, d2 1662.endm 1663 1664.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail 1665 vrshr.u16 q11, q8, #8 1666 vswp d3, d31 1667 vrshr.u16 q12, q9, #8 1668 vrshr.u16 q13, q10, #8 1669 vraddhn.u16 d30, q11, q8 1670 vraddhn.u16 d29, q12, q9 1671 vraddhn.u16 d28, q13, q10 1672.endm 1673 1674.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head 1675 vrshr.u16 q11, q8, #8 1676 vswp d3, d31 1677 vrshr.u16 q12, q9, #8 1678 vrshr.u16 q13, q10, #8 1679 vld4.8 {d0, d1, d2, d3}, [SRC]! 1680 vraddhn.u16 d30, q11, q8 1681 PF add PF_X, PF_X, #8 1682 PF tst PF_CTL, #0xF 1683 PF addne PF_X, PF_X, #8 1684 PF subne PF_CTL, PF_CTL, #1 1685 vraddhn.u16 d29, q12, q9 1686 vraddhn.u16 d28, q13, q10 1687 vmull.u8 q8, d3, d0 1688 vmull.u8 q9, d3, d1 1689 vmull.u8 q10, d3, d2 1690 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 1691 PF cmp PF_X, ORIG_W 1692 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] 1693 PF subge PF_X, PF_X, ORIG_W 1694 PF subges PF_CTL, PF_CTL, #0x10 1695 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 
1696.endm 1697 1698generate_composite_function \ 1699 pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \ 1700 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ 1701 8, /* number of pixels, processed in a single block */ \ 1702 10, /* prefetch distance */ \ 1703 default_init, \ 1704 default_cleanup, \ 1705 pixman_composite_src_pixbuf_8888_process_pixblock_head, \ 1706 pixman_composite_src_pixbuf_8888_process_pixblock_tail, \ 1707 pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \ 1708 28, /* dst_w_basereg */ \ 1709 0, /* dst_r_basereg */ \ 1710 0, /* src_basereg */ \ 1711 0 /* mask_basereg */ 1712 1713#endif