/*
 * Copyright © 2009 Nokia Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
 */

/*
 * This file contains implementations of NEON optimized pixel processing
 * functions. There is no full and detailed tutorial, but some functions
 * (those which expose some new or interesting features) are extensively
 * commented and can be used as examples.
 *
 * You may want to have a look at the comments for the following functions:
 * - pixman_composite_over_8888_0565_asm_neon
 * - pixman_composite_over_n_8_0565_asm_neon
 */

/* Prevent the stack from becoming executable for no reason... */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

    .text
    .fpu neon
    .arch armv7a
    .object_arch armv4
    .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
    .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
    .arm
    .altmacro
    .p2align 2

#include "pixman-private.h"
#include "pixman-arm-asm.h"
#include "pixman-arm-neon-asm.h"

/* Global configuration options and preferences */

/*
 * The code can optionally make use of unaligned memory accesses to improve
 * the performance of handling leading/trailing pixels for each scanline.
 * The configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0, for
 * example on Linux, if unaligned memory accesses are not configured to
 * generate exceptions.
 */
.set RESPECT_STRICT_ALIGNMENT, 1

/*
 * Set the default prefetch type.
 * There is a choice between the following options:
 *
 * PREFETCH_TYPE_NONE (may be useful for ARM cores where PLD is set to work
 * as a NOP, to work around some HW bugs, or for whatever other reason)
 *
 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
 * advanced prefetch introduces heavy overhead)
 *
 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8,
 * which can run ARM and NEON instructions simultaneously so that extra ARM
 * instructions do not add (many) extra cycles, but improve prefetch efficiency)
 *
 * Note: some types of function can't support advanced prefetch and fall back
 * to the simple one (those which handle 24bpp pixels).
 */
.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED

/* Prefetch distance in pixels for simple prefetch */
.set PREFETCH_DISTANCE_SIMPLE, 64

/*
 * Implementation of pixman_composite_over_8888_0565_asm_neon
 *
 * This function takes an a8r8g8b8 source buffer and an r5g6b5 destination
 * buffer and performs the OVER compositing operation. The function
 * fast_composite_over_8888_0565 from pixman-fast-path.c does the same in C
 * and can be used as a reference.
 *
 * First we need to have some NEON assembly code which can do the actual
 * operation on the pixels and provide it to the template macro.
 *
 * The template macro quite conveniently takes care of emitting all the
 * necessary code for memory reading and writing (including the quite tricky
 * cases of handling unaligned leading/trailing pixels), so we only need to
 * deal with the data in NEON registers.
 *
 * In general, the recommended NEON register allocation is the following:
 * d0, d1, d2, d3     - contain loaded source pixel data
 * d4, d5, d6, d7     - contain loaded destination pixels (if they are needed)
 * d24, d25, d26, d27 - contain loaded mask pixel data (if a mask is used)
 * d28, d29, d30, d31 - place for storing the result (destination pixels)
 *
 * As can be seen above, four 64-bit NEON registers are used for keeping
 * intermediate pixel data, so up to 8 pixels can be processed in one step
 * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
 *
 * This particular function uses the following register allocation:
 * d0, d1, d2, d3 - contain loaded source pixel data
 * d4, d5         - contain loaded destination pixels (they are needed)
 * d28, d29       - place for storing the result (destination pixels)
 */

/*
 * Step one. We need some code to do the arithmetic on the pixel data.
 * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
 * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
 * perform all the needed calculations and write the result to {d28, d29}.
 * The rationale for having two macros and not just one will be explained
 * later. In practice, any single monolithic function which does the work can
 * be split into two parts in any arbitrary way without affecting correctness.
 *
 * There is one special trick here too. The common template macro can
 * optionally make our life a bit easier by deinterleaving the R, G, B, A
 * color components for 32bpp pixel formats (and this feature is used in
 * the 'pixman_composite_over_8888_0565_asm_neon' function).
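 *
 * In C terms, such deinterleaving turns an array-of-structures view of
 * the pixels into a structure-of-arrays view. A hedged sketch (the
 * variable names here are made up just for illustration; 's' holds the
 * packed a8r8g8b8 source words, assuming little-endian byte order):
 *
 *     uint8_t b[8], g[8], r[8], a[8];    // planar: d0, d1, d2, d3
 *     for (i = 0; i < 8; i++)
 *     {
 *         b[i] = s[i] & 0xff;            // instead of eight packed
 *         g[i] = (s[i] >> 8) & 0xff;     // 32-bit words in d0-d3
 *         r[i] = (s[i] >> 16) & 0xff;
 *         a[i] = s[i] >> 24;
 *     }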
 * In register terms this means that, instead of having 8 packed pixels in
 * the {d0, d1, d2, d3} registers, we actually use d0 for the blue channel
 * (a vector of eight 8-bit values), d1 for green, d2 for red and d3 for
 * alpha. This simple conversion can also be done with a few NEON
 * instructions:
 *
 * Packed to planar conversion:
 *   vuzp.8 d0, d1
 *   vuzp.8 d2, d3
 *   vuzp.8 d1, d3
 *   vuzp.8 d0, d2
 *
 * Planar to packed conversion:
 *   vzip.8 d0, d2
 *   vzip.8 d1, d3
 *   vzip.8 d2, d3
 *   vzip.8 d0, d1
 *
 * But pixels can also be loaded directly in planar format using the VLD4.8
 * NEON instruction. It is 1 cycle slower than VLD1.32, so this is not
 * always desirable; that's why deinterleaving is optional.
 *
 * But anyway, here is the code:
 */
.macro pixman_composite_over_8888_0565_process_pixblock_head
    /* convert 8 r5g6b5 pixels from {d4, d5} to planar 8-bit format
       and put the data into d6 - red, d7 - green, d30 - blue */
    vshrn.u16   d6, q2, #8
    vshrn.u16   d7, q2, #3
    vsli.u16    q2, q2, #5
    vsri.u8     d6, d6, #5
    vmvn.8      d3, d3      /* invert source alpha */
    vsri.u8     d7, d7, #6
    vshrn.u16   d30, q2, #2
    /* now do alpha blending, storing results in 8-bit planar format
       into d16 - red, d19 - green, d18 - blue */
    vmull.u8    q10, d3, d6
    vmull.u8    q11, d3, d7
    vmull.u8    q12, d3, d30
    vrshr.u16   q13, q10, #8
    vrshr.u16   q3, q11, #8
    vrshr.u16   q15, q12, #8
    vraddhn.u16 d20, q10, q13
    vraddhn.u16 d23, q11, q3
    vraddhn.u16 d22, q12, q15
.endm

.macro pixman_composite_over_8888_0565_process_pixblock_tail
    /* ... continue alpha blending */
    vqadd.u8    d16, d2, d20
    vqadd.u8    q9, q0, q11
    /* convert the result to r5g6b5 and store it into {d28, d29} */
    vshll.u8    q14, d16, #8
    vshll.u8    q8, d19, #8
    vshll.u8    q9, d18, #8
    vsri.u16    q14, q8, #5
    vsri.u16    q14, q9, #11
.endm

/*
 * OK, now we have almost everything we need. Using the above two macros,
 * the job could already be done correctly. But we want to optimize it a
 * bit. The ARM Cortex-A8 is an in-order core, and it benefits greatly
 * from good code scheduling and software pipelining.
 *
 * Let's construct some code which will run in the core main loop.
 * Pseudo-code for the main loop looks like this:
 *    head
 *    while (...) {
 *        tail
 *        head
 *    }
 *    tail
 *
 * It may look a bit weird, but this setup allows us to hide instruction
 * latencies better and also to utilize the dual-issue capability more
 * efficiently (making pairs of load-store and ALU instructions).
 *
 * So what we need now is a '*_tail_head' macro, which will be used in the
 * core main loop. A trivial straightforward implementation of this macro
 * would look like this:
 *
 *   pixman_composite_over_8888_0565_process_pixblock_tail
 *   vst1.16     {d28, d29}, [DST_W, :128]!
 *   vld1.16     {d4, d5}, [DST_R, :128]!
 *   vld4.32     {d0, d1, d2, d3}, [SRC]!
 *   pixman_composite_over_8888_0565_process_pixblock_head
 *   cache_preload 8, 8
 *
 * Note that it now also contains VLD/VST instructions. We simply can't
 * move from processing one block of pixels to the next with arithmetic
 * alone. The previously processed data needs to be written to memory and
 * new data needs to be fetched. Fortunately, this main loop does not have
 * to deal with partial leading/trailing pixels and can load/store full
 * blocks of pixels in bulk.
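 *
 * In C-like pseudocode, the main loop that the template builds from these
 * macros looks roughly like this (a hedged sketch, not the literal
 * generated code; 'pixblock_size' is 8 pixels here):
 *
 *     head ();                  // load the first block, start its arithmetic
 *     while (pixels_left > pixblock_size)
 *     {
 *         tail_head ();         // finish + store the previous block,
 *                               // load + start the next one
 *         pixels_left -= pixblock_size;
 *     }
 *     tail ();                  // finish and store the last block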
 * The destination buffer is also already 16 bytes aligned here (which is
 * good for performance).
 *
 * New things here are the DST_R, DST_W, SRC and MASK identifiers. These
 * are aliases for the ARM registers which are used as pointers for
 * accessing data. We maintain separate pointers for reading and writing
 * the destination buffer (DST_R and DST_W).
 *
 * Another new thing is the 'cache_preload' macro. It is used for
 * prefetching data into the CPU L2 cache and improves performance when
 * dealing with large images which are far bigger than the cache. It takes
 * one argument (actually two, but they need to be the same here) - the
 * number of pixels in a block. Looking into 'pixman-arm-neon-asm.h' can
 * provide some details about this macro. Moreover, if good performance is
 * needed, the code from this macro needs to be copied into the
 * '*_tail_head' macro and mixed with the rest of the code for optimal
 * instruction scheduling. We are actually doing this below.
 *
 * Now, after all the explanations, here is the optimized code.
 * Different instruction streams (originating from the '*_head', '*_tail'
 * and 'cache_preload' macros) use different indentation levels for better
 * readability. Actually taking the code from one of these indentation
 * levels and ignoring a few VLD/VST instructions would result in exactly
 * the code from the '*_head', '*_tail' or 'cache_preload' macro!
 */

#if 1

.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
        vqadd.u8    d16, d2, d20
    vld1.16     {d4, d5}, [DST_R, :128]!
        vqadd.u8    q9, q0, q11
            vshrn.u16   d6, q2, #8
    fetch_src_pixblock
            vshrn.u16   d7, q2, #3
            vsli.u16    q2, q2, #5
        vshll.u8    q14, d16, #8
                        PF add PF_X, PF_X, #8
        vshll.u8    q8, d19, #8
                        PF tst PF_CTL, #0xF
            vsri.u8     d6, d6, #5
                        PF addne PF_X, PF_X, #8
            vmvn.8      d3, d3
                        PF subne PF_CTL, PF_CTL, #1
            vsri.u8     d7, d7, #6
            vshrn.u16   d30, q2, #2
            vmull.u8    q10, d3, d6
                        PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
            vmull.u8    q11, d3, d7
            vmull.u8    q12, d3, d30
                        PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
        vsri.u16    q14, q8, #5
                        PF cmp PF_X, ORIG_W
        vshll.u8    q9, d18, #8
            vrshr.u16   q13, q10, #8
                        PF subge PF_X, PF_X, ORIG_W
            vrshr.u16   q3, q11, #8
            vrshr.u16   q15, q12, #8
                        PF subges PF_CTL, PF_CTL, #0x10
        vsri.u16    q14, q9, #11
                        PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
            vraddhn.u16 d20, q10, q13
            vraddhn.u16 d23, q11, q3
                        PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
            vraddhn.u16 d22, q12, q15
    vst1.16     {d28, d29}, [DST_W, :128]!
.endm

#else

/* If we did not care much about performance, we would just use this... */
.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
    pixman_composite_over_8888_0565_process_pixblock_tail
    vst1.16     {d28, d29}, [DST_W, :128]!
    vld1.16     {d4, d5}, [DST_R, :128]!
    fetch_src_pixblock
    pixman_composite_over_8888_0565_process_pixblock_head
    cache_preload 8, 8
.endm

#endif

/*
 * And now the final part. We are using the 'generate_composite_function'
 * macro to put all the stuff together. We specify the name of the function
 * which we want to get, and the number of bits per pixel for the source,
 * mask and destination (0 if unused, like the mask in this case).
 * Next come some bit flags:
 *   FLAG_DST_READWRITE      - tells that the destination buffer is both
 *                             read and written; for a write-only buffer we
 *                             would use the FLAG_DST_WRITEONLY flag instead
 *   FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
 *                             and separate color channels for the 32bpp
 *                             format.
 * The next things are:
 *  - the number of pixels processed per iteration (8 in this case, because
 *    that's the maximum that can fit into four 64-bit NEON registers).
 *  - the prefetch distance, measured in pixel blocks. In this case it is
 *    5 times 8 pixels. That would be 40 pixels, or up to 160 bytes. The
 *    optimal prefetch distance can be selected by running some benchmarks.
 *
 * After that we specify some macros: these are 'default_init' and
 * 'default_cleanup' here, which are empty (but it is possible to have
 * custom init/cleanup macros to be able to save/restore some extra NEON
 * registers like d8-d15, or do anything else), followed by
 * 'pixman_composite_over_8888_0565_process_pixblock_head',
 * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
 * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
 * which we implemented above.
 *
 * The last part is the NEON register allocation scheme.
 */
generate_composite_function \
    pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_0565_process_pixblock_head, \
    pixman_composite_over_8888_0565_process_pixblock_tail, \
    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_over_n_0565_process_pixblock_head
    /* convert 8 r5g6b5 pixels from {d4, d5} to planar 8-bit format
       and put the data into d6 - red, d7 - green, d30 - blue */
    vshrn.u16   d6, q2, #8
    vshrn.u16   d7, q2, #3
    vsli.u16    q2, q2, #5
    vsri.u8     d6, d6, #5
    vsri.u8     d7, d7, #6
    vshrn.u16   d30, q2, #2
    /* now do alpha blending, storing results in 8-bit planar format
       into d16 - red, d19 - green, d18 - blue */
    vmull.u8    q10, d3, d6
    vmull.u8    q11, d3, d7
    vmull.u8    q12, d3, d30
    vrshr.u16   q13, q10, #8
    vrshr.u16   q3, q11, #8
    vrshr.u16   q15, q12, #8
    vraddhn.u16 d20, q10, q13
    vraddhn.u16 d23, q11, q3
    vraddhn.u16 d22, q12, q15
.endm

.macro pixman_composite_over_n_0565_process_pixblock_tail
    /* ... continue alpha blending */
    vqadd.u8    d16, d2, d20
    vqadd.u8    q9, q0, q11
    /* convert the result to r5g6b5 and store it into {d28, d29} */
    vshll.u8    q14, d16, #8
    vshll.u8    q8, d19, #8
    vshll.u8    q9, d18, #8
    vsri.u16    q14, q8, #5
    vsri.u16    q14, q9, #11
.endm

/* TODO: expand macros and do better instruction scheduling */
.macro pixman_composite_over_n_0565_process_pixblock_tail_head
    pixman_composite_over_n_0565_process_pixblock_tail
    vld1.16     {d4, d5}, [DST_R, :128]!
    vst1.16     {d28, d29}, [DST_W, :128]!
    pixman_composite_over_n_0565_process_pixblock_head
    cache_preload 8, 8
.endm

.macro pixman_composite_over_n_0565_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]
    vdup.8      d0, d3[0]
    vdup.8      d1, d3[1]
    vdup.8      d2, d3[2]
    vdup.8      d3, d3[3]
    vmvn.8      d3, d3      /* invert source alpha */
.endm

generate_composite_function \
    pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_0565_init, \
    default_cleanup, \
    pixman_composite_over_n_0565_process_pixblock_head, \
    pixman_composite_over_n_0565_process_pixblock_tail, \
    pixman_composite_over_n_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_8888_0565_process_pixblock_head
    vshll.u8    q8, d1, #8
    vshll.u8    q14, d2, #8
    vshll.u8    q9, d0, #8
.endm

.macro pixman_composite_src_8888_0565_process_pixblock_tail
    vsri.u16    q14, q8, #5
    vsri.u16    q14, q9, #11
.endm

.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
    vsri.u16    q14, q8, #5
                        PF add PF_X, PF_X, #8
                        PF tst PF_CTL, #0xF
    fetch_src_pixblock
                        PF addne PF_X, PF_X, #8
                        PF subne PF_CTL, PF_CTL, #1
    vsri.u16    q14, q9, #11
                        PF cmp PF_X, ORIG_W
                        PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vshll.u8    q8, d1, #8
    vst1.16     {d28, d29}, [DST_W, :128]!
                        PF subge PF_X, PF_X, ORIG_W
                        PF subges PF_CTL, PF_CTL, #0x10
    vshll.u8    q14, d2, #8
                        PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vshll.u8    q9, d0, #8
.endm

generate_composite_function \
    pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_8888_0565_process_pixblock_head, \
    pixman_composite_src_8888_0565_process_pixblock_tail, \
    pixman_composite_src_8888_0565_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_src_0565_8888_process_pixblock_head
    vshrn.u16   d30, q0, #8
    vshrn.u16   d29, q0, #3
    vsli.u16    q0, q0, #5
    vmov.u8     d31, #255
    vsri.u8     d30, d30, #5
    vsri.u8     d29, d29, #6
    vshrn.u16   d28, q0, #2
.endm

.macro pixman_composite_src_0565_8888_process_pixblock_tail
.endm

/* TODO: expand macros and do better instruction scheduling */
.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
    pixman_composite_src_0565_8888_process_pixblock_tail
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    fetch_src_pixblock
    pixman_composite_src_0565_8888_process_pixblock_head
    cache_preload 8, 8
.endm

generate_composite_function \
    pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0565_8888_process_pixblock_head, \
    pixman_composite_src_0565_8888_process_pixblock_tail, \
    pixman_composite_src_0565_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_add_8_8_process_pixblock_head
    vqadd.u8    q14, q0, q2
    vqadd.u8    q15, q1, q3
.endm

.macro pixman_composite_add_8_8_process_pixblock_tail
.endm

.macro pixman_composite_add_8_8_process_pixblock_tail_head
    fetch_src_pixblock
                        PF add PF_X, PF_X, #32
                        PF tst PF_CTL, #0xF
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
                        PF addne PF_X, PF_X, #32
                        PF subne PF_CTL, PF_CTL, #1
    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
                        PF cmp PF_X, ORIG_W
                        PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
                        PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
                        PF subge PF_X, PF_X, ORIG_W
                        PF subges PF_CTL, PF_CTL, #0x10
    vqadd.u8    q14, q0, q2
                        PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
                        PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vqadd.u8    q15, q1, q3
.endm

generate_composite_function \
    pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_process_pixblock_tail, \
    pixman_composite_add_8_8_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
    fetch_src_pixblock
                        PF add PF_X, PF_X, #8
                        PF tst PF_CTL, #0xF
    vld1.32     {d4, d5, d6, d7}, [DST_R, :128]!
                        PF addne PF_X, PF_X, #8
                        PF subne PF_CTL, PF_CTL, #1
    vst1.32     {d28, d29, d30, d31}, [DST_W, :128]!
                        PF cmp PF_X, ORIG_W
                        PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
                        PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
                        PF subge PF_X, PF_X, ORIG_W
                        PF subges PF_CTL, PF_CTL, #0x10
    vqadd.u8    q14, q0, q2
                        PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
                        PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vqadd.u8    q15, q1, q3
.endm

generate_composite_function \
    pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_process_pixblock_tail, \
    pixman_composite_add_8888_8888_process_pixblock_tail_head

generate_composite_function_single_scanline \
    pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_add_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_process_pixblock_tail, \
    pixman_composite_add_8888_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
    vmvn.8      d24, d3  /* get inverted alpha */
    /* do alpha blending */
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d24, d5
    vmull.u8    q10, d24, d6
    vmull.u8    q11, d24, d7
.endm

.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
.endm

.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vrshr.u16   q14, q8, #8
                        PF add PF_X, PF_X, #8
                        PF tst PF_CTL, #0xF
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
                        PF addne PF_X, PF_X, #8
                        PF subne PF_CTL, PF_CTL, #1
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
                        PF cmp PF_X, ORIG_W
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
    fetch_src_pixblock
                        PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vmvn.8      d22, d3
                        PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
                        PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q8, d22, d4
                        PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q9, d22, d5
                        PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vmull.u8    q10, d22, d6
                        PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vmull.u8    q11, d22, d7
.endm

generate_composite_function_single_scanline \
    pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
    pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
    pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_8888_8888_process_pixblock_head
    pixman_composite_out_reverse_8888_8888_process_pixblock_head
.endm

.macro pixman_composite_over_8888_8888_process_pixblock_tail
    pixman_composite_out_reverse_8888_8888_process_pixblock_tail
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
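    /* Note: the vld4.8 above already fetched the destination pixels for
       the *next* block, while the arithmetic below still finishes the
       *previous* block (its '*_tail' part). This is the software
       pipelining trick explained at the beginning of this file. */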
    vrshr.u16   q14, q8, #8
                        PF add PF_X, PF_X, #8
                        PF tst PF_CTL, #0xF
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
                        PF addne PF_X, PF_X, #8
                        PF subne PF_CTL, PF_CTL, #1
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
                        PF cmp PF_X, ORIG_W
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
    fetch_src_pixblock
                        PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vmvn.8      d22, d3
                        PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
                        PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q8, d22, d4
                        PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q9, d22, d5
                        PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vmull.u8    q10, d22, d6
                        PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vmull.u8    q11, d22, d7
.endm

generate_composite_function \
    pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_process_pixblock_tail_head

generate_composite_function_single_scanline \
    pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    default_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_n_8888_process_pixblock_head
    /* deinterleaved source pixels in {d0, d1, d2, d3} */
    /* inverted alpha in {d24} */
    /* destination pixels in {d4, d5, d6, d7} */
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d24, d5
    vmull.u8    q10, d24, d6
    vmull.u8    q11, d24, d7
.endm

.macro pixman_composite_over_n_8888_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q2, q10, #8
    vrshr.u16   q3, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q2, q10
    vraddhn.u16 d31, q3, q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

.macro pixman_composite_over_n_8888_process_pixblock_tail_head
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q2, q10, #8
    vrshr.u16   q3, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q2, q10
    vraddhn.u16 d31, q3, q11
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vqadd.u8    q14, q0, q14
                        PF add PF_X, PF_X, #8
                        PF tst PF_CTL, #0x0F
                        PF addne PF_X, PF_X, #8
                        PF subne PF_CTL, PF_CTL, #1
    vqadd.u8    q15, q1, q15
                        PF cmp PF_X, ORIG_W
    vmull.u8    q8, d24, d4
                        PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vmull.u8    q9, d24, d5
                        PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q10, d24, d6
                        PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q11, d24, d7
                        PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
.endm

.macro pixman_composite_over_n_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]
    vdup.8      d0, d3[0]
    vdup.8      d1, d3[1]
    vdup.8      d2, d3[2]
    vdup.8      d3, d3[3]
    vmvn.8      d24, d3  /* get inverted alpha */
.endm

generate_composite_function \
    pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8888_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_n_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
    vrshr.u16   q14, q8, #8
                        PF add PF_X, PF_X, #8
                        PF tst PF_CTL, #0xF
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
                        PF addne PF_X, PF_X, #8
                        PF subne PF_CTL, PF_CTL, #1
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
                        PF cmp PF_X, ORIG_W
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
    vld4.8      {d0, d1, d2, d3}, [DST_R, :128]!
    vmvn.8      d22, d3
                        PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
                        PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q8, d22, d4
                        PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q9, d22, d5
    vmull.u8    q10, d22, d6
                        PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vmull.u8    q11, d22, d7
.endm

.macro pixman_composite_over_reverse_n_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d7[0]}, [DUMMY]
    vdup.8      d4, d7[0]
    vdup.8      d5, d7[1]
    vdup.8      d6, d7[2]
    vdup.8      d7, d7[3]
.endm

generate_composite_function \
    pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_reverse_n_8888_init, \
    default_cleanup, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    0,  /* dst_r_basereg */ \
    4,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_over_8888_8_0565_process_pixblock_head
    vmull.u8    q0, d24, d8    /* IN for SRC pixels (part1) */
    vmull.u8    q1, d24, d9
    vmull.u8    q6, d24, d10
    vmull.u8    q7, d24, d11
    vshrn.u16   d6, q2, #8     /* convert DST_R data to 32-bpp (part1) */
    vshrn.u16   d7, q2, #3
    vsli.u16    q2, q2, #5
    vrshr.u16   q8, q0, #8     /* IN for SRC pixels (part2) */
    vrshr.u16   q9, q1, #8
    vrshr.u16   q10, q6, #8
    vrshr.u16   q11, q7, #8
    vraddhn.u16 d0, q0, q8
    vraddhn.u16 d1, q1, q9
    vraddhn.u16 d2, q6, q10
    vraddhn.u16 d3, q7, q11
    vsri.u8     d6, d6, #5     /* convert DST_R data to 32-bpp (part2) */
    vsri.u8     d7, d7, #6
    vmvn.8      d3, d3
    vshrn.u16   d30, q2, #2
    vmull.u8    q8, d3, d6     /* now do alpha blending */
    vmull.u8    q9, d3, d7
    vmull.u8    q10, d3, d30
.endm

.macro pixman_composite_over_8888_8_0565_process_pixblock_tail
    /* 3 cycle bubble (after vmull.u8) */
    vrshr.u16   q13, q8, #8
    vrshr.u16   q11, q9, #8
    vrshr.u16   q15, q10, #8
    vraddhn.u16 d16, q8, q13
    vraddhn.u16 d27, q9, q11
    vraddhn.u16 d26, q10, q15
    vqadd.u8    d16, d2, d16
    /* 1 cycle bubble */
    vqadd.u8    q9, q0, q13
    vshll.u8    q14, d16, #8   /* convert to 16bpp */
    vshll.u8    q8, d19, #8
    vshll.u8    q9, d18, #8
    vsri.u16    q14, q8, #5
    /* 1 cycle bubble */
    vsri.u16    q14, q9, #11
.endm

.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
    vld1.16     {d4, d5}, [DST_R, :128]!
    vshrn.u16   d6, q2, #8
    fetch_mask_pixblock
    vshrn.u16   d7, q2, #3
    fetch_src_pixblock
    vmull.u8    q6, d24, d10
    vrshr.u16   q13, q8, #8
    vrshr.u16   q11, q9, #8
    vrshr.u16   q15, q10, #8
    vraddhn.u16 d16, q8, q13
    vraddhn.u16 d27, q9, q11
    vraddhn.u16 d26, q10, q15
    vqadd.u8    d16, d2, d16
    vmull.u8    q1, d24, d9
    vqadd.u8    q9, q0, q13
    vshll.u8    q14, d16, #8
    vmull.u8    q0, d24, d8
    vshll.u8    q8, d19, #8
    vshll.u8    q9, d18, #8
    vsri.u16    q14, q8, #5
    vmull.u8    q7, d24, d11
    vsri.u16    q14, q9, #11

    cache_preload 8, 8

    vsli.u16    q2, q2, #5
    vrshr.u16   q8, q0, #8
    vrshr.u16   q9, q1, #8
    vrshr.u16   q10, q6, #8
    vrshr.u16   q11, q7, #8
    vraddhn.u16 d0, q0, q8
    vraddhn.u16 d1, q1, q9
    vraddhn.u16 d2, q6, q10
    vraddhn.u16 d3, q7, q11
    vsri.u8     d6, d6, #5
    vsri.u8     d7, d7, #6
    vmvn.8      d3, d3
    vshrn.u16   d30, q2, #2
    vst1.16     {d28, d29}, [DST_W, :128]!
    vmull.u8    q8, d3, d6
    vmull.u8    q9, d3, d7
    vmull.u8    q10, d3, d30
.endm

generate_composite_function \
    pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_8888_8_0565_process_pixblock_head, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    8,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

/*
 * This function needs a special initialization for the solid mask.
 * The solid source pixel data is fetched from the stack at the
 * ARGS_STACK_OFFSET offset, split into color components and replicated
 * into the d8-d11 registers. Additionally, this function needs all the
 * NEON registers, so it has to save the d8-d15 registers, which are
 * callee-saved according to the ABI. These registers are restored in the
 * 'cleanup' macro. All the other NEON registers are caller-saved, so they
 * can be clobbered freely without introducing any problems.
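 *
 * In C terms, the 'init' macro below does roughly the following (a hedged
 * sketch assuming little-endian byte order; 'splat8' is a made-up name for
 * the byte replication that vdup.8 performs):
 *
 *     uint32_t src = *(uint32_t *)(sp + ARGS_STACK_OFFSET);
 *     d8  = splat8 (src & 0xff);           // blue
 *     d9  = splat8 ((src >> 8) & 0xff);    // green
 *     d10 = splat8 ((src >> 16) & 0xff);   // red
 *     d11 = splat8 (src >> 24);            // alpha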
 */
.macro pixman_composite_over_n_8_0565_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    vdup.8      d8, d11[0]
    vdup.8      d9, d11[1]
    vdup.8      d10, d11[2]
    vdup.8      d11, d11[3]
.endm

.macro pixman_composite_over_n_8_0565_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8_0565_init, \
    pixman_composite_over_n_8_0565_cleanup, \
    pixman_composite_over_8888_8_0565_process_pixblock_head, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_8888_n_0565_init
    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
    vpush       {d8-d15}
    vld1.32     {d24[0]}, [DUMMY]
    vdup.8      d24, d24[3]
.endm

.macro pixman_composite_over_8888_n_0565_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_8888_n_0565_init, \
    pixman_composite_over_8888_n_0565_cleanup, \
    pixman_composite_over_8888_8_0565_process_pixblock_head, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    8,  /* src_basereg   */ \
    24  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_0565_0565_process_pixblock_head
.endm

.macro pixman_composite_src_0565_0565_process_pixblock_tail
.endm

.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
    vst1.16     {d0, d1, d2, d3}, [DST_W, :128]!
    fetch_src_pixblock
    cache_preload 16, 16
.endm

generate_composite_function \
    pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
    FLAG_DST_WRITEONLY, \
    16, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_0565_0565_process_pixblock_head, \
    pixman_composite_src_0565_0565_process_pixblock_tail, \
    pixman_composite_src_0565_0565_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_n_8_process_pixblock_head
.endm

.macro pixman_composite_src_n_8_process_pixblock_tail
.endm

.macro pixman_composite_src_n_8_process_pixblock_tail_head
    vst1.8      {d0, d1, d2, d3}, [DST_W, :128]!
.endm

.macro pixman_composite_src_n_8_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    /* replicate the low byte across d0, then copy it to d1-d3 */
    vsli.u64    d0, d0, #8
    vsli.u64    d0, d0, #16
    vsli.u64    d0, d0, #32
    vorr        d1, d0, d0
    vorr        q1, q0, q0
.endm

.macro pixman_composite_src_n_8_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
    FLAG_DST_WRITEONLY, \
    32, /* number of pixels, processed in a single block */ \
    0,  /* prefetch distance */ \
    pixman_composite_src_n_8_init, \
    pixman_composite_src_n_8_cleanup, \
    pixman_composite_src_n_8_process_pixblock_head, \
    pixman_composite_src_n_8_process_pixblock_tail, \
    pixman_composite_src_n_8_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_n_0565_process_pixblock_head
.endm

.macro pixman_composite_src_n_0565_process_pixblock_tail
.endm

.macro pixman_composite_src_n_0565_process_pixblock_tail_head
    vst1.16     {d0, d1, d2, d3}, [DST_W, :128]!
.endm

.macro pixman_composite_src_n_0565_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    /* replicate the r5g6b5 halfword across d0, then copy it to d1-d3 */
    vsli.u64    d0, d0, #16
    vsli.u64    d0, d0, #32
    vorr        d1, d0, d0
    vorr        q1, q0, q0
.endm

.macro pixman_composite_src_n_0565_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
    FLAG_DST_WRITEONLY, \
    16, /* number of pixels, processed in a single block */ \
    0,  /* prefetch distance */ \
    pixman_composite_src_n_0565_init, \
    pixman_composite_src_n_0565_cleanup, \
    pixman_composite_src_n_0565_process_pixblock_head, \
    pixman_composite_src_n_0565_process_pixblock_tail, \
    pixman_composite_src_n_0565_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_n_8888_process_pixblock_head
.endm

.macro pixman_composite_src_n_8888_process_pixblock_tail
.endm

.macro pixman_composite_src_n_8888_process_pixblock_tail_head
    vst1.32     {d0, d1, d2, d3}, [DST_W, :128]!
.endm

.macro pixman_composite_src_n_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d0[0]}, [DUMMY]
    /* replicate the 32-bit color across d0, then copy it to d1-d3 */
    vsli.u64    d0, d0, #32
    vorr        d1, d0, d0
    vorr        q1, q0, q0
.endm

.macro pixman_composite_src_n_8888_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    0, /* prefetch distance */ \
    pixman_composite_src_n_8888_init, \
    pixman_composite_src_n_8888_cleanup, \
    pixman_composite_src_n_8888_process_pixblock_head, \
    pixman_composite_src_n_8888_process_pixblock_tail, \
    pixman_composite_src_n_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_8888_8888_process_pixblock_head
.endm

.macro pixman_composite_src_8888_8888_process_pixblock_tail
.endm

.macro pixman_composite_src_8888_8888_process_pixblock_tail_head
    vst1.32     {d0, d1, d2, d3}, [DST_W, :128]!
    fetch_src_pixblock
    cache_preload 8, 8
.endm

generate_composite_function \
    pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    pixman_composite_src_8888_8888_process_pixblock_head, \
    pixman_composite_src_8888_8888_process_pixblock_tail, \
    pixman_composite_src_8888_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_x888_8888_process_pixblock_head
    vorr        q0, q0, q2
    vorr        q1, q1, q2
.endm

.macro pixman_composite_src_x888_8888_process_pixblock_tail
.endm

.macro pixman_composite_src_x888_8888_process_pixblock_tail_head
    vst1.32     {d0, d1, d2, d3}, [DST_W, :128]!
    fetch_src_pixblock
    vorr        q0, q0, q2
    vorr        q1, q1, q2
    cache_preload 8, 8
.endm

.macro pixman_composite_src_x888_8888_init
    /* build the 0xff000000 alpha-fill constant in q2 */
    vmov.u8     q2, #0xFF
    vshl.u32    q2, q2, #24
.endm

generate_composite_function \
    pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_WRITEONLY, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    pixman_composite_src_x888_8888_init, \
    default_cleanup, \
    pixman_composite_src_x888_8888_process_pixblock_head, \
    pixman_composite_src_x888_8888_process_pixblock_tail, \
    pixman_composite_src_x888_8888_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg   */ \
    0  /* mask_basereg  */

/******************************************************************************/

.macro pixman_composite_src_n_8_8888_process_pixblock_head
    /* expecting solid source in {d0, d1, d2, d3} */
    /* mask is in d24 (d25, d26, d27 are unused) */

    /* in */
    vmull.u8    q8, d24, d0
    vmull.u8    q9, d24, d1
    vmull.u8    q10, d24, d2
    vmull.u8    q11, d24, d3
    vrsra.u16   q8, q8, #8
    vrsra.u16   q9, q9, #8
    vrsra.u16   q10, q10, #8
    vrsra.u16   q11, q11, #8
.endm

.macro pixman_composite_src_n_8_8888_process_pixblock_tail
    vrshrn.u16  d28, q8, #8
    vrshrn.u16  d29, q9, #8
    vrshrn.u16  d30, q10, #8
    vrshrn.u16  d31, q11, #8
.endm

.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
    fetch_mask_pixblock
                        PF add PF_X, PF_X, #8
    vrshrn.u16  d28, q8, #8
                        PF tst PF_CTL, #0x0F
    vrshrn.u16  d29, q9, #8
                        PF addne PF_X, PF_X, #8
    vrshrn.u16  d30, q10, #8
                        PF subne PF_CTL, PF_CTL, #1
    vrshrn.u16  d31, q11, #8
                        PF cmp PF_X, ORIG_W
    vmull.u8    q8, d24, d0
                        PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
    vmull.u8    q9, d24, d1
                        PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q10, d24, d2
                        PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q11, d24, d3
                        PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
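    /* q8-q11 hold x*m from the vmull instructions above; vrsra adds the
       rounded high byte back in (t += (t + 128) >> 8) and the vrshrn in
       the tail narrows with rounding - the combination approximates a
       division by 255 */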
    vrsra.u16   q8, q8, #8
    vrsra.u16   q9, q9, #8
    vrsra.u16   q10, q10, #8
    vrsra.u16   q11, q11, #8
.endm

.macro pixman_composite_src_n_8_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]
    vdup.8      d0, d3[0]
    vdup.8      d1, d3[1]
    vdup.8      d2, d3[2]
    vdup.8      d3, d3[3]
.endm

.macro pixman_composite_src_n_8_8888_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_src_n_8_8888_init, \
    pixman_composite_src_n_8_8888_cleanup, \
    pixman_composite_src_n_8_8888_process_pixblock_head, \
    pixman_composite_src_n_8_8888_process_pixblock_tail, \
    pixman_composite_src_n_8_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_src_n_8_8_process_pixblock_head
    vmull.u8    q0, d24, d16
    vmull.u8    q1, d25, d16
    vmull.u8    q2, d26, d16
    vmull.u8    q3, d27, d16
    vrsra.u16   q0, q0, #8
    vrsra.u16   q1, q1, #8
    vrsra.u16   q2, q2, #8
    vrsra.u16   q3, q3, #8
.endm

.macro pixman_composite_src_n_8_8_process_pixblock_tail
    vrshrn.u16  d28, q0, #8
    vrshrn.u16  d29, q1, #8
    vrshrn.u16  d30, q2, #8
    vrshrn.u16  d31, q3, #8
.endm

.macro pixman_composite_src_n_8_8_process_pixblock_tail_head
    fetch_mask_pixblock
                        PF add PF_X, PF_X, #8
    vrshrn.u16  d28, q0, #8
                        PF tst PF_CTL, #0x0F
    vrshrn.u16  d29, q1, #8
                        PF addne PF_X, PF_X, #8
    vrshrn.u16  d30, q2, #8
                        PF subne PF_CTL, PF_CTL, #1
    vrshrn.u16  d31, q3, #8
                        PF cmp PF_X, ORIG_W
    vmull.u8    q0, d24, d16
                        PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
    vmull.u8    q1, d25, d16
                        PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q2, d26, d16
                        PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q3, d27, d16
                        PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vrsra.u16   q0, q0, #8
    vrsra.u16   q1, q1, #8
    vrsra.u16   q2, q2, #8
    vrsra.u16   q3, q3, #8
.endm

.macro pixman_composite_src_n_8_8_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d16[0]}, [DUMMY]
    vdup.8      d16, d16[3]
.endm

.macro pixman_composite_src_n_8_8_cleanup
.endm

generate_composite_function \
    pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \
    FLAG_DST_WRITEONLY, \
    32, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_src_n_8_8_init, \
    pixman_composite_src_n_8_8_cleanup, \
    pixman_composite_src_n_8_8_process_pixblock_head, \
    pixman_composite_src_n_8_8_process_pixblock_tail, \
    pixman_composite_src_n_8_8_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_n_8_8888_process_pixblock_head
    /* expecting deinterleaved source data in {d8, d9, d10, d11} */
    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
    /* and destination data in {d4, d5, d6, d7} */
    /* mask is in d24 (d25, d26, d27 are unused) */

    /* in */
    vmull.u8    q6, d24, d8
    vmull.u8    q7, d24, d9
    vmull.u8    q8, d24, d10
    vmull.u8    q9, d24, d11
    vrshr.u16   q10, q6, #8
    vrshr.u16   q11, q7, #8
    vrshr.u16   q12, q8, #8
    vrshr.u16   q13, q9, #8
    vraddhn.u16 d0, q6, q10
    vraddhn.u16 d1, q7, q11
    vraddhn.u16 d2, q8, q12
    vraddhn.u16 d3, q9, q13
    vmvn.8      d25, d3  /* get inverted alpha */
    /* source:      d0 - blue, d1 - green, d2 - red, d3 - alpha */
    /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
    /* now do alpha blending */
    vmull.u8    q8, d25, d4
    vmull.u8    q9, d25, d5
    vmull.u8    q10, d25, d6
    vmull.u8    q11, d25, d7
.endm

.macro pixman_composite_over_n_8_8888_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q6, q10, #8
    vrshr.u16   q7, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q6, q10
    vraddhn.u16 d31, q7, q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
    vrshr.u16   q14, q8, #8
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vrshr.u16   q15, q9, #8
    fetch_mask_pixblock
    vrshr.u16   q6, q10, #8
                        PF add PF_X, PF_X, #8
    vrshr.u16   q7, q11, #8
                        PF tst PF_CTL, #0x0F
    vraddhn.u16 d28, q14, q8
                        PF addne PF_X, PF_X, #8
    vraddhn.u16 d29, q15, q9
                        PF subne PF_CTL, PF_CTL, #1
    vraddhn.u16 d30, q6, q10
                        PF cmp PF_X, ORIG_W
    vraddhn.u16 d31, q7, q11
                        PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vmull.u8    q6, d24, d8
                        PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
    vmull.u8    q7, d24, d9
                        PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q8, d24, d10
                        PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q9, d24, d11
                        PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vqadd.u8    q14, q0, q14
                        PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
    vqadd.u8    q15, q1, q15
    vrshr.u16   q10, q6, #8
    vrshr.u16   q11, q7, #8
    vrshr.u16   q12, q8, #8
    vrshr.u16   q13, q9, #8
    vraddhn.u16 d0, q6, q10
    vraddhn.u16 d1, q7, q11
    vraddhn.u16 d2, q8, q12
    vraddhn.u16 d3, q9, q13
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
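    /* the store above completed the previous block; now invert the alpha
       of the just-computed (src IN mask) and start blending it with the
       freshly loaded destination pixels */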
    vmvn.8      d25, d3
    vmull.u8    q8, d25, d4
    vmull.u8    q9, d25, d5
    vmull.u8    q10, d25, d6
    vmull.u8    q11, d25, d7
.endm

.macro pixman_composite_over_n_8_8888_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    vdup.8      d8, d11[0]
    vdup.8      d9, d11[1]
    vdup.8      d10, d11[2]
    vdup.8      d11, d11[3]
.endm

.macro pixman_composite_over_n_8_8888_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8_8888_init, \
    pixman_composite_over_n_8_8888_cleanup, \
    pixman_composite_over_n_8_8888_process_pixblock_head, \
    pixman_composite_over_n_8_8888_process_pixblock_tail, \
    pixman_composite_over_n_8_8888_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_n_8_8_process_pixblock_head
    vmull.u8    q0, d24, d8
    vmull.u8    q1, d25, d8
    vmull.u8    q6, d26, d8
    vmull.u8    q7, d27, d8
    vrshr.u16   q10, q0, #8
    vrshr.u16   q11, q1, #8
    vrshr.u16   q12, q6, #8
    vrshr.u16   q13, q7, #8
    vraddhn.u16 d0, q0, q10
    vraddhn.u16 d1, q1, q11
    vraddhn.u16 d2, q6, q12
    vraddhn.u16 d3, q7, q13
    vmvn.8      q12, q0
    vmvn.8      q13, q1
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d25, d5
    vmull.u8    q10, d26, d6
    vmull.u8    q11, d27, d7
.endm

.macro pixman_composite_over_n_8_8_process_pixblock_tail
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

/* TODO: expand macros and do better instruction scheduling */
.macro pixman_composite_over_n_8_8_process_pixblock_tail_head
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
    pixman_composite_over_n_8_8_process_pixblock_tail
    fetch_mask_pixblock
    cache_preload 32, 32
    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
    pixman_composite_over_n_8_8_process_pixblock_head
.endm

.macro pixman_composite_over_n_8_8_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vpush       {d8-d15}
    vld1.32     {d8[0]}, [DUMMY]
    vdup.8      d8, d8[3]
.endm

.macro pixman_composite_over_n_8_8_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8_8_init, \
    pixman_composite_over_n_8_8_cleanup, \
    pixman_composite_over_n_8_8_process_pixblock_head, \
    pixman_composite_over_n_8_8_process_pixblock_tail, \
    pixman_composite_over_n_8_8_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
    /*
     * 'combine_mask_ca' replacement
     *
     * input:  solid src (n) in {d8,  d9,  d10, d11}
     *         dest in          {d4,  d5,  d6,  d7 }
     *         mask in          {d24, d25, d26, d27}
     * output: updated src in   {d0,  d1,  d2,  d3 }
     *         updated mask in  {d24, d25, d26, d3 }
     */
    vmull.u8    q0, d24, d8
    vmull.u8    q1, d25, d9
    vmull.u8    q6, d26, d10
    vmull.u8    q7, d27, d11
    vmull.u8    q9, d11, d25
    vmull.u8    q12, d11, d24
    vmull.u8    q13, d11, d26
    vrshr.u16   q8, q0, #8
    vrshr.u16   q10, q1, #8
    vrshr.u16   q11, q6, #8
    vraddhn.u16 d0, q0, q8
    vraddhn.u16 d1, q1, q10
    vraddhn.u16 d2, q6, q11
    vrshr.u16   q11, q12, #8
    vrshr.u16   q8, q9, #8
    vrshr.u16   q6, q13, #8
    vrshr.u16   q10, q7, #8
    vraddhn.u16 d24, q12, q11
    vraddhn.u16 d25, q9, q8
    vraddhn.u16 d26, q13, q6
    vraddhn.u16 d3, q7, q10
    /*
     * 'combine_over_ca' replacement
     *
     * output: updated dest in {d28, d29, d30, d31}
     */
    vmvn.8      q12, q12
    vmvn.8      d26, d26
    vmull.u8    q8, d24, d4
    vmull.u8    q9, d25, d5
    vmvn.8      d27, d3
    vmull.u8    q10, d26, d6
    vmull.u8    q11, d27, d7
.endm

.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
    /* ... continue 'combine_over_ca' replacement */
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q6, q10, #8
    vrshr.u16   q7, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q6, q10
    vraddhn.u16 d31, q7, q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
.endm

.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vrshr.u16   q6, q10, #8
    vrshr.u16   q7, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q6, q10
    vraddhn.u16 d31, q7, q11
    fetch_mask_pixblock
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
    cache_preload 8, 8
    pixman_composite_over_n_8888_8888_ca_process_pixblock_head
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
.endm

.macro pixman_composite_over_n_8888_8888_ca_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vpush       {d8-d15}
    vld1.32     {d11[0]}, [DUMMY]
    vdup.8      d8, d11[0]
    vdup.8      d9, d11[1]
    vdup.8      d10, d11[2]
    vdup.8      d11, d11[3]
.endm

.macro pixman_composite_over_n_8888_8888_ca_cleanup
    vpop        {d8-d15}
.endm

generate_composite_function \
    pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8888_8888_ca_init, \
    pixman_composite_over_n_8888_8888_ca_cleanup, \
    pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head

/******************************************************************************/

.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
    /*
     * 'combine_mask_ca' replacement
     *
     * input:  solid src (n) in {d8,  d9,  d10, d11}  [B, G, R, A]
     *         mask in          {d24, d25, d26}       [B, G, R]
     * output: updated src in   {d0,  d1,  d2 }       [B, G, R]
     *         updated mask in  {d24, d25, d26}       [B, G, R]
     */
    vmull.u8    q0, d24, d8
    vmull.u8    q1, d25, d9
    vmull.u8    q6, d26, d10
    vmull.u8    q9, d11, d25
    vmull.u8    q12, d11, d24
    vmull.u8    q13, d11, d26
    vrshr.u16   q8, q0, #8
    vrshr.u16   q10, q1, #8
    vrshr.u16   q11, q6, #8
    vraddhn.u16 d0, q0, q8
    vraddhn.u16 d1, q1, q10
    vraddhn.u16 d2, q6, q11
    vrshr.u16   q11, q12, #8
    vrshr.u16   q8, q9, #8
    vrshr.u16   q6, q13, #8
    vraddhn.u16 d24, q12, q11
    vraddhn.u16 d25, q9, q8
    /*
     * convert 8 r5g6b5 pixels from {d4, d5} to planar 8-bit format
     * and put the data into d16 - blue, d17 - green, d18 - red
     */
    vshrn.u16   d17, q2, #3
    vshrn.u16   d18, q2, #8
    vraddhn.u16 d26, q13, q6
    vsli.u16    q2, q2, #5
    vsri.u8     d18, d18, #5
    vsri.u8     d17, d17, #6
    /*
     * 'combine_over_ca' replacement
     *
     * output: updated dest in d16 - blue, d17 - green, d18 - red
     */
    vmvn.8      q12, q12
    vshrn.u16   d16, q2, #2
    vmvn.8      d26, d26
    vmull.u8    q6, d16, d24
    vmull.u8    q7, d17, d25
    vmull.u8    q11, d18, d26
.endm

.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
    /* ... continue 'combine_over_ca' replacement */
    vrshr.u16   q10, q6, #8
    vrshr.u16   q14, q7, #8
    vrshr.u16   q15, q11, #8
    vraddhn.u16 d16, q10, q6
    vraddhn.u16 d17, q14, q7
    vraddhn.u16 d18, q15, q11
    vqadd.u8    q8, q0, q8
    vqadd.u8    d18, d2, d18
    /*
     * convert the results in d16, d17, d18 to r5g6b5 and store
     * them into {d28, d29}
     */
    vshll.u8    q14, d18, #8
    vshll.u8    q10, d17, #8
    vshll.u8    q15, d16, #8
    vsri.u16    q14, q10, #5
    vsri.u16    q14, q15, #11
.endm

.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
    fetch_mask_pixblock
    vrshr.u16   q10, q6, #8
    vrshr.u16   q14, q7, #8
    vld1.16     {d4, d5}, [DST_R, :128]!
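    /* ... continue the previous block's 'combine_over_ca' tail (the
       vld1.16 above has already fetched the dst pixels of the new block) */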
1723 vrshr.u16 q15, q11, #8 1724 vraddhn.u16 d16, q10, q6 1725 vraddhn.u16 d17, q14, q7 1726 vraddhn.u16 d22, q15, q11 1727 /* process_pixblock_head */ 1728 /* 1729 * 'combine_mask_ca' replacement 1730 * 1731 * input: solid src (n) in {d8, d9, d10, d11} [B, G, R, A] 1732 * mask in {d24, d25, d26} [B, G, R] 1733 * output: updated src in {d0, d1, d2 } [B, G, R] 1734 * updated mask in {d24, d25, d26} [B, G, R] 1735 */ 1736 vmull.u8 q6, d26, d10 1737 vqadd.u8 q8, q0, q8 1738 vmull.u8 q0, d24, d8 1739 vqadd.u8 d22, d2, d22 1740 vmull.u8 q1, d25, d9 1741 /* 1742 * convert the result in d16, d17, d22 to r5g6b5 and store 1743 * it into {d28, d29} 1744 */ 1745 vshll.u8 q14, d22, #8 1746 vshll.u8 q10, d17, #8 1747 vshll.u8 q15, d16, #8 1748 vmull.u8 q9, d11, d25 1749 vsri.u16 q14, q10, #5 1750 vmull.u8 q12, d11, d24 1751 vmull.u8 q13, d11, d26 1752 vsri.u16 q14, q15, #11 1753 cache_preload 8, 8 1754 vrshr.u16 q8, q0, #8 1755 vrshr.u16 q10, q1, #8 1756 vrshr.u16 q11, q6, #8 1757 vraddhn.u16 d0, q0, q8 1758 vraddhn.u16 d1, q1, q10 1759 vraddhn.u16 d2, q6, q11 1760 vrshr.u16 q11, q12, #8 1761 vrshr.u16 q8, q9, #8 1762 vrshr.u16 q6, q13, #8 1763 vraddhn.u16 d24, q12, q11 1764 vraddhn.u16 d25, q9, q8 1765 /* 1766 * convert 8 r5g6b5 pixel data from {d4, d5} to planar 1767 * 8-bit format and put data into d16 - blue, d17 - green, 1768 * d18 - red 1769 */ 1770 vshrn.u16 d17, q2, #3 1771 vshrn.u16 d18, q2, #8 1772 vraddhn.u16 d26, q13, q6 1773 vsli.u16 q2, q2, #5 1774 vsri.u8 d17, d17, #6 1775 vsri.u8 d18, d18, #5 1776 /* 1777 * 'combine_over_ca' replacement 1778 * 1779 * output: updated dest in d16 - blue, d17 - green, d18 - red 1780 */ 1781 vmvn.8 q12, q12 1782 vshrn.u16 d16, q2, #2 1783 vmvn.8 d26, d26 1784 vmull.u8 q7, d17, d25 1785 vmull.u8 q6, d16, d24 1786 vmull.u8 q11, d18, d26 1787 vst1.16 {d28, d29}, [DST_W, :128]! 
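    /*
     * For reference, the r5g6b5 unpacking and packing performed by this
     * function is equivalent to the following C (illustrative sketch; the
     * vsri.u8 steps replicate the top bits so that the unpacked values
     * cover the full 0-255 range):
     *
     *   // unpack r5g6b5 'p' to 8 bits per channel
     *   r8 = (p >> 11) << 3;          r8 |= r8 >> 5;
     *   g8 = ((p >> 5) & 0x3f) << 2;  g8 |= g8 >> 6;
     *   b8 = (p & 0x1f) << 3;         b8 |= b8 >> 5;
     *
     *   // pack 8 bit channels back to r5g6b5
     *   p = ((r8 >> 3) << 11) | ((g8 >> 2) << 5) | (b8 >> 3);
     */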
1788.endm 1789 1790.macro pixman_composite_over_n_8888_0565_ca_init 1791 add DUMMY, sp, #ARGS_STACK_OFFSET 1792 vpush {d8-d15} 1793 vld1.32 {d11[0]}, [DUMMY] 1794 vdup.8 d8, d11[0] 1795 vdup.8 d9, d11[1] 1796 vdup.8 d10, d11[2] 1797 vdup.8 d11, d11[3] 1798.endm 1799 1800.macro pixman_composite_over_n_8888_0565_ca_cleanup 1801 vpop {d8-d15} 1802.endm 1803 1804generate_composite_function \ 1805 pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \ 1806 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 1807 8, /* number of pixels, processed in a single block */ \ 1808 5, /* prefetch distance */ \ 1809 pixman_composite_over_n_8888_0565_ca_init, \ 1810 pixman_composite_over_n_8888_0565_ca_cleanup, \ 1811 pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \ 1812 pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \ 1813 pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head 1814 1815/******************************************************************************/ 1816 1817.macro pixman_composite_in_n_8_process_pixblock_head 1818 /* expecting source data in {d0, d1, d2, d3} */ 1819 /* and destination data in {d4, d5, d6, d7} */ 1820 vmull.u8 q8, d4, d3 1821 vmull.u8 q9, d5, d3 1822 vmull.u8 q10, d6, d3 1823 vmull.u8 q11, d7, d3 1824.endm 1825 1826.macro pixman_composite_in_n_8_process_pixblock_tail 1827 vrshr.u16 q14, q8, #8 1828 vrshr.u16 q15, q9, #8 1829 vrshr.u16 q12, q10, #8 1830 vrshr.u16 q13, q11, #8 1831 vraddhn.u16 d28, q8, q14 1832 vraddhn.u16 d29, q9, q15 1833 vraddhn.u16 d30, q10, q12 1834 vraddhn.u16 d31, q11, q13 1835.endm 1836 1837.macro pixman_composite_in_n_8_process_pixblock_tail_head 1838 pixman_composite_in_n_8_process_pixblock_tail 1839 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! 1840 cache_preload 32, 32 1841 pixman_composite_in_n_8_process_pixblock_head 1842 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! 
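    /*
     * Per-pixel model of the IN operation implemented above (illustrative
     * sketch, using the same exact rounded division by 255 described
     * earlier): dest.a = mul_div_255 (dest.a, solid.a)
     */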
.endm

.macro pixman_composite_in_n_8_init
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]
    vdup.8      d3, d3[3]
.endm

.macro pixman_composite_in_n_8_cleanup
.endm

generate_composite_function \
    pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_in_n_8_init, \
    pixman_composite_in_n_8_cleanup, \
    pixman_composite_in_n_8_process_pixblock_head, \
    pixman_composite_in_n_8_process_pixblock_tail, \
    pixman_composite_in_n_8_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4,  /* dst_r_basereg */ \
    0,  /* src_basereg */ \
    24  /* mask_basereg */

/******************************************************************************/

.macro pixman_composite_add_n_8_8_process_pixblock_head
    /* expecting source data in {d8, d9, d10, d11} */
    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
    /* and destination data in {d4, d5, d6, d7} */
    /* mask is in d24, d25, d26, d27 */
    vmull.u8    q0, d24, d11
    vmull.u8    q1, d25, d11
    vmull.u8    q6, d26, d11
    vmull.u8    q7, d27, d11
    vrshr.u16   q10, q0, #8
    vrshr.u16   q11, q1, #8
    vrshr.u16   q12, q6, #8
    vrshr.u16   q13, q7, #8
    vraddhn.u16 d0, q0, q10
    vraddhn.u16 d1, q1, q11
    vraddhn.u16 d2, q6, q12
    vraddhn.u16 d3, q7, q13
    vqadd.u8    q14, q0, q2
    vqadd.u8    q15, q1, q3
.endm

.macro pixman_composite_add_n_8_8_process_pixblock_tail
.endm

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_add_n_8_8_process_pixblock_tail_head
    pixman_composite_add_n_8_8_process_pixblock_tail
    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
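    /* {d4-d7} now hold 32 destination a8 pixels for the next block; the
       mask block is fetched next so that the head macro below has both
       multiplier inputs ready */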
1898 fetch_mask_pixblock 1899 cache_preload 32, 32 1900 pixman_composite_add_n_8_8_process_pixblock_head 1901.endm 1902 1903.macro pixman_composite_add_n_8_8_init 1904 add DUMMY, sp, #ARGS_STACK_OFFSET 1905 vpush {d8-d15} 1906 vld1.32 {d11[0]}, [DUMMY] 1907 vdup.8 d11, d11[3] 1908.endm 1909 1910.macro pixman_composite_add_n_8_8_cleanup 1911 vpop {d8-d15} 1912.endm 1913 1914generate_composite_function \ 1915 pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \ 1916 FLAG_DST_READWRITE, \ 1917 32, /* number of pixels, processed in a single block */ \ 1918 5, /* prefetch distance */ \ 1919 pixman_composite_add_n_8_8_init, \ 1920 pixman_composite_add_n_8_8_cleanup, \ 1921 pixman_composite_add_n_8_8_process_pixblock_head, \ 1922 pixman_composite_add_n_8_8_process_pixblock_tail, \ 1923 pixman_composite_add_n_8_8_process_pixblock_tail_head 1924 1925/******************************************************************************/ 1926 1927.macro pixman_composite_add_8_8_8_process_pixblock_head 1928 /* expecting source data in {d0, d1, d2, d3} */ 1929 /* destination data in {d4, d5, d6, d7} */ 1930 /* mask in {d24, d25, d26, d27} */ 1931 vmull.u8 q8, d24, d0 1932 vmull.u8 q9, d25, d1 1933 vmull.u8 q10, d26, d2 1934 vmull.u8 q11, d27, d3 1935 vrshr.u16 q0, q8, #8 1936 vrshr.u16 q1, q9, #8 1937 vrshr.u16 q12, q10, #8 1938 vrshr.u16 q13, q11, #8 1939 vraddhn.u16 d0, q0, q8 1940 vraddhn.u16 d1, q1, q9 1941 vraddhn.u16 d2, q12, q10 1942 vraddhn.u16 d3, q13, q11 1943 vqadd.u8 q14, q0, q2 1944 vqadd.u8 q15, q1, q3 1945.endm 1946 1947.macro pixman_composite_add_8_8_8_process_pixblock_tail 1948.endm 1949 1950/* TODO: expand macros and do better instructions scheduling */ 1951.macro pixman_composite_add_8_8_8_process_pixblock_tail_head 1952 pixman_composite_add_8_8_8_process_pixblock_tail 1953 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! 1954 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]! 
1955 fetch_mask_pixblock 1956 fetch_src_pixblock 1957 cache_preload 32, 32 1958 pixman_composite_add_8_8_8_process_pixblock_head 1959.endm 1960 1961.macro pixman_composite_add_8_8_8_init 1962.endm 1963 1964.macro pixman_composite_add_8_8_8_cleanup 1965.endm 1966 1967generate_composite_function \ 1968 pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \ 1969 FLAG_DST_READWRITE, \ 1970 32, /* number of pixels, processed in a single block */ \ 1971 5, /* prefetch distance */ \ 1972 pixman_composite_add_8_8_8_init, \ 1973 pixman_composite_add_8_8_8_cleanup, \ 1974 pixman_composite_add_8_8_8_process_pixblock_head, \ 1975 pixman_composite_add_8_8_8_process_pixblock_tail, \ 1976 pixman_composite_add_8_8_8_process_pixblock_tail_head 1977 1978/******************************************************************************/ 1979 1980.macro pixman_composite_add_8888_8888_8888_process_pixblock_head 1981 /* expecting source data in {d0, d1, d2, d3} */ 1982 /* destination data in {d4, d5, d6, d7} */ 1983 /* mask in {d24, d25, d26, d27} */ 1984 vmull.u8 q8, d27, d0 1985 vmull.u8 q9, d27, d1 1986 vmull.u8 q10, d27, d2 1987 vmull.u8 q11, d27, d3 1988 /* 1 cycle bubble */ 1989 vrsra.u16 q8, q8, #8 1990 vrsra.u16 q9, q9, #8 1991 vrsra.u16 q10, q10, #8 1992 vrsra.u16 q11, q11, #8 1993.endm 1994 1995.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail 1996 /* 2 cycle bubble */ 1997 vrshrn.u16 d28, q8, #8 1998 vrshrn.u16 d29, q9, #8 1999 vrshrn.u16 d30, q10, #8 2000 vrshrn.u16 d31, q11, #8 2001 vqadd.u8 q14, q2, q14 2002 /* 1 cycle bubble */ 2003 vqadd.u8 q15, q3, q15 2004.endm 2005 2006.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head 2007 fetch_src_pixblock 2008 vrshrn.u16 d28, q8, #8 2009 fetch_mask_pixblock 2010 vrshrn.u16 d29, q9, #8 2011 vmull.u8 q8, d27, d0 2012 vrshrn.u16 d30, q10, #8 2013 vmull.u8 q9, d27, d1 2014 vrshrn.u16 d31, q11, #8 2015 vmull.u8 q10, d27, d2 2016 vqadd.u8 q14, q2, q14 2017 vmull.u8 q11, d27, d3 2018 vqadd.u8 q15, q3, q15 2019 vrsra.u16 q8, q8, #8 2020 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 2021 vrsra.u16 q9, q9, #8 2022 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 
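    /*
     * Note: the vrsra.u16 + vrshrn.u16 pairs used by this function are an
     * alternative spelling of the same exact rounded division by 255,
     * (t + ((t + 128) >> 8) + 128) >> 8, that the other functions build
     * from vrshr.u16 + vraddhn.u16; accumulating back into the product
     * register saves a temporary here (assumed scheduling intent).
     */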
2023 vrsra.u16 q10, q10, #8 2024 2025 cache_preload 8, 8 2026 2027 vrsra.u16 q11, q11, #8 2028.endm 2029 2030generate_composite_function \ 2031 pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \ 2032 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2033 8, /* number of pixels, processed in a single block */ \ 2034 10, /* prefetch distance */ \ 2035 default_init, \ 2036 default_cleanup, \ 2037 pixman_composite_add_8888_8888_8888_process_pixblock_head, \ 2038 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ 2039 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head 2040 2041generate_composite_function_single_scanline \ 2042 pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \ 2043 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2044 8, /* number of pixels, processed in a single block */ \ 2045 default_init, \ 2046 default_cleanup, \ 2047 pixman_composite_add_8888_8888_8888_process_pixblock_head, \ 2048 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ 2049 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head 2050 2051/******************************************************************************/ 2052 2053generate_composite_function \ 2054 pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \ 2055 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2056 8, /* number of pixels, processed in a single block */ \ 2057 5, /* prefetch distance */ \ 2058 default_init, \ 2059 default_cleanup, \ 2060 pixman_composite_add_8888_8888_8888_process_pixblock_head, \ 2061 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ 2062 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ 2063 28, /* dst_w_basereg */ \ 2064 4, /* dst_r_basereg */ \ 2065 0, /* src_basereg */ \ 2066 27 /* mask_basereg */ 2067 2068/******************************************************************************/ 2069 2070.macro pixman_composite_add_n_8_8888_init 2071 add DUMMY, sp, #ARGS_STACK_OFFSET 2072 vld1.32 {d3[0]}, [DUMMY] 2073 vdup.8 d0, d3[0] 2074 vdup.8 d1, d3[1] 2075 vdup.8 d2, d3[2] 2076 vdup.8 d3, d3[3] 2077.endm 2078 2079.macro pixman_composite_add_n_8_8888_cleanup 2080.endm 2081 2082generate_composite_function \ 2083 pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \ 2084 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2085 8, /* number of pixels, processed in a single block */ \ 2086 5, /* prefetch distance */ \ 2087 pixman_composite_add_n_8_8888_init, \ 2088 pixman_composite_add_n_8_8888_cleanup, \ 2089 pixman_composite_add_8888_8888_8888_process_pixblock_head, \ 2090 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ 2091 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ 2092 28, /* dst_w_basereg */ \ 2093 4, /* dst_r_basereg */ \ 2094 0, /* src_basereg */ \ 2095 27 /* mask_basereg */ 2096 2097/******************************************************************************/ 2098 2099.macro pixman_composite_add_8888_n_8888_init 2100 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) 2101 vld1.32 {d27[0]}, [DUMMY] 2102 vdup.8 d27, d27[3] 2103.endm 2104 2105.macro pixman_composite_add_8888_n_8888_cleanup 2106.endm 2107 2108generate_composite_function \ 2109 pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \ 2110 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2111 8, /* number of pixels, processed in a single block */ \ 2112 5, /* prefetch distance */ \ 2113 pixman_composite_add_8888_n_8888_init, \ 2114 pixman_composite_add_8888_n_8888_cleanup, \ 2115 pixman_composite_add_8888_8888_8888_process_pixblock_head, \ 2116 
pixman_composite_add_8888_8888_8888_process_pixblock_tail, \ 2117 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \ 2118 28, /* dst_w_basereg */ \ 2119 4, /* dst_r_basereg */ \ 2120 0, /* src_basereg */ \ 2121 27 /* mask_basereg */ 2122 2123/******************************************************************************/ 2124 2125.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head 2126 /* expecting source data in {d0, d1, d2, d3} */ 2127 /* destination data in {d4, d5, d6, d7} */ 2128 /* solid mask is in d15 */ 2129 2130 /* 'in' */ 2131 vmull.u8 q8, d15, d3 2132 vmull.u8 q6, d15, d2 2133 vmull.u8 q5, d15, d1 2134 vmull.u8 q4, d15, d0 2135 vrshr.u16 q13, q8, #8 2136 vrshr.u16 q12, q6, #8 2137 vrshr.u16 q11, q5, #8 2138 vrshr.u16 q10, q4, #8 2139 vraddhn.u16 d3, q8, q13 2140 vraddhn.u16 d2, q6, q12 2141 vraddhn.u16 d1, q5, q11 2142 vraddhn.u16 d0, q4, q10 2143 vmvn.8 d24, d3 /* get inverted alpha */ 2144 /* now do alpha blending */ 2145 vmull.u8 q8, d24, d4 2146 vmull.u8 q9, d24, d5 2147 vmull.u8 q10, d24, d6 2148 vmull.u8 q11, d24, d7 2149.endm 2150 2151.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail 2152 vrshr.u16 q14, q8, #8 2153 vrshr.u16 q15, q9, #8 2154 vrshr.u16 q12, q10, #8 2155 vrshr.u16 q13, q11, #8 2156 vraddhn.u16 d28, q14, q8 2157 vraddhn.u16 d29, q15, q9 2158 vraddhn.u16 d30, q12, q10 2159 vraddhn.u16 d31, q13, q11 2160.endm 2161 2162/* TODO: expand macros and do better instructions scheduling */ 2163.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head 2164 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 2165 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail 2166 fetch_src_pixblock 2167 cache_preload 8, 8 2168 fetch_mask_pixblock 2169 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head 2170 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 2171.endm 2172 2173generate_composite_function_single_scanline \ 2174 pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \ 2175 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2176 8, /* number of pixels, processed in a single block */ \ 2177 default_init_need_all_regs, \ 2178 default_cleanup_need_all_regs, \ 2179 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \ 2180 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \ 2181 pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \ 2182 28, /* dst_w_basereg */ \ 2183 4, /* dst_r_basereg */ \ 2184 0, /* src_basereg */ \ 2185 12 /* mask_basereg */ 2186 2187/******************************************************************************/ 2188 2189.macro pixman_composite_over_8888_n_8888_process_pixblock_head 2190 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head 2191.endm 2192 2193.macro pixman_composite_over_8888_n_8888_process_pixblock_tail 2194 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail 2195 vqadd.u8 q14, q0, q14 2196 vqadd.u8 q15, q1, q15 2197.endm 2198 2199/* TODO: expand macros and do better instructions scheduling */ 2200.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head 2201 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 2202 pixman_composite_over_8888_n_8888_process_pixblock_tail 2203 fetch_src_pixblock 2204 cache_preload 8, 8 2205 pixman_composite_over_8888_n_8888_process_pixblock_head 2206 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 
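    /*
     * Rough per-pixel C model of this function (illustrative sketch;
     * mul_div_255() and sat_add() as described earlier in this file):
     *
     *   s.c = mul_div_255 (s.c, mask.a);                    // 'in' part
     *   d.c = sat_add (s.c, mul_div_255 (d.c, 255 - s.a));  // 'over' part
     *
     * where s.a is the alpha of the already masked source pixel.
     */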
2207.endm 2208 2209.macro pixman_composite_over_8888_n_8888_init 2210 add DUMMY, sp, #48 2211 vpush {d8-d15} 2212 vld1.32 {d15[0]}, [DUMMY] 2213 vdup.8 d15, d15[3] 2214.endm 2215 2216.macro pixman_composite_over_8888_n_8888_cleanup 2217 vpop {d8-d15} 2218.endm 2219 2220generate_composite_function \ 2221 pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \ 2222 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2223 8, /* number of pixels, processed in a single block */ \ 2224 5, /* prefetch distance */ \ 2225 pixman_composite_over_8888_n_8888_init, \ 2226 pixman_composite_over_8888_n_8888_cleanup, \ 2227 pixman_composite_over_8888_n_8888_process_pixblock_head, \ 2228 pixman_composite_over_8888_n_8888_process_pixblock_tail, \ 2229 pixman_composite_over_8888_n_8888_process_pixblock_tail_head 2230 2231/******************************************************************************/ 2232 2233/* TODO: expand macros and do better instructions scheduling */ 2234.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head 2235 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 2236 pixman_composite_over_8888_n_8888_process_pixblock_tail 2237 fetch_src_pixblock 2238 cache_preload 8, 8 2239 fetch_mask_pixblock 2240 pixman_composite_over_8888_n_8888_process_pixblock_head 2241 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 2242.endm 2243 2244generate_composite_function \ 2245 pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \ 2246 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2247 8, /* number of pixels, processed in a single block */ \ 2248 5, /* prefetch distance */ \ 2249 default_init_need_all_regs, \ 2250 default_cleanup_need_all_regs, \ 2251 pixman_composite_over_8888_n_8888_process_pixblock_head, \ 2252 pixman_composite_over_8888_n_8888_process_pixblock_tail, \ 2253 pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \ 2254 28, /* dst_w_basereg */ \ 2255 4, /* dst_r_basereg */ \ 2256 0, /* src_basereg */ \ 2257 12 /* mask_basereg */ 2258 2259generate_composite_function_single_scanline \ 2260 pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \ 2261 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2262 8, /* number of pixels, processed in a single block */ \ 2263 default_init_need_all_regs, \ 2264 default_cleanup_need_all_regs, \ 2265 pixman_composite_over_8888_n_8888_process_pixblock_head, \ 2266 pixman_composite_over_8888_n_8888_process_pixblock_tail, \ 2267 pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \ 2268 28, /* dst_w_basereg */ \ 2269 4, /* dst_r_basereg */ \ 2270 0, /* src_basereg */ \ 2271 12 /* mask_basereg */ 2272 2273/******************************************************************************/ 2274 2275/* TODO: expand macros and do better instructions scheduling */ 2276.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head 2277 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 2278 pixman_composite_over_8888_n_8888_process_pixblock_tail 2279 fetch_src_pixblock 2280 cache_preload 8, 8 2281 fetch_mask_pixblock 2282 pixman_composite_over_8888_n_8888_process_pixblock_head 2283 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 
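    /* identical to the 8888-mask tail_head above, except that
       fetch_mask_pixblock loads a block of a8 mask pixels into d15
       (mask_basereg 15), which the shared head macro then uses exactly
       like the solid mask alpha */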
2284.endm 2285 2286generate_composite_function \ 2287 pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \ 2288 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2289 8, /* number of pixels, processed in a single block */ \ 2290 5, /* prefetch distance */ \ 2291 default_init_need_all_regs, \ 2292 default_cleanup_need_all_regs, \ 2293 pixman_composite_over_8888_n_8888_process_pixblock_head, \ 2294 pixman_composite_over_8888_n_8888_process_pixblock_tail, \ 2295 pixman_composite_over_8888_8_8888_process_pixblock_tail_head \ 2296 28, /* dst_w_basereg */ \ 2297 4, /* dst_r_basereg */ \ 2298 0, /* src_basereg */ \ 2299 15 /* mask_basereg */ 2300 2301/******************************************************************************/ 2302 2303.macro pixman_composite_src_0888_0888_process_pixblock_head 2304.endm 2305 2306.macro pixman_composite_src_0888_0888_process_pixblock_tail 2307.endm 2308 2309.macro pixman_composite_src_0888_0888_process_pixblock_tail_head 2310 vst3.8 {d0, d1, d2}, [DST_W]! 2311 fetch_src_pixblock 2312 cache_preload 8, 8 2313.endm 2314 2315generate_composite_function \ 2316 pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \ 2317 FLAG_DST_WRITEONLY, \ 2318 8, /* number of pixels, processed in a single block */ \ 2319 10, /* prefetch distance */ \ 2320 default_init, \ 2321 default_cleanup, \ 2322 pixman_composite_src_0888_0888_process_pixblock_head, \ 2323 pixman_composite_src_0888_0888_process_pixblock_tail, \ 2324 pixman_composite_src_0888_0888_process_pixblock_tail_head, \ 2325 0, /* dst_w_basereg */ \ 2326 0, /* dst_r_basereg */ \ 2327 0, /* src_basereg */ \ 2328 0 /* mask_basereg */ 2329 2330/******************************************************************************/ 2331 2332.macro pixman_composite_src_0888_8888_rev_process_pixblock_head 2333 vswp d0, d2 2334.endm 2335 2336.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail 2337.endm 2338 2339.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head 2340 vst4.8 {d0, d1, d2, d3}, [DST_W]! 2341 fetch_src_pixblock 2342 vswp d0, d2 2343 cache_preload 8, 8 2344.endm 2345 2346.macro pixman_composite_src_0888_8888_rev_init 2347 veor d3, d3, d3 2348.endm 2349 2350generate_composite_function \ 2351 pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \ 2352 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ 2353 8, /* number of pixels, processed in a single block */ \ 2354 10, /* prefetch distance */ \ 2355 pixman_composite_src_0888_8888_rev_init, \ 2356 default_cleanup, \ 2357 pixman_composite_src_0888_8888_rev_process_pixblock_head, \ 2358 pixman_composite_src_0888_8888_rev_process_pixblock_tail, \ 2359 pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \ 2360 0, /* dst_w_basereg */ \ 2361 0, /* dst_r_basereg */ \ 2362 0, /* src_basereg */ \ 2363 0 /* mask_basereg */ 2364 2365/******************************************************************************/ 2366 2367.macro pixman_composite_src_0888_0565_rev_process_pixblock_head 2368 vshll.u8 q8, d1, #8 2369 vshll.u8 q9, d2, #8 2370.endm 2371 2372.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail 2373 vshll.u8 q14, d0, #8 2374 vsri.u16 q14, q8, #5 2375 vsri.u16 q14, q9, #11 2376.endm 2377 2378.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head 2379 vshll.u8 q14, d0, #8 2380 fetch_src_pixblock 2381 vsri.u16 q14, q8, #5 2382 vsri.u16 q14, q9, #11 2383 vshll.u8 q8, d1, #8 2384 vst1.16 {d28, d29}, [DST_W, :128]! 
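    /*
     * The vshll.u8 #8 / vsri.u16 #5 / vsri.u16 #11 sequence used by this
     * function packs three 8 bit channels into r5g6b5 without any explicit
     * masking; roughly, in C (illustrative sketch, c0/c1/c2 being the
     * three source channels):
     *
     *   p = c0 << 8;                    // c0 top bits -> bits 11-15
     *   p = (p & 0xf800) | (c1 << 3);   // vsri #5 keeps the top 5 bits
     *   p = (p & 0xffe0) | (c2 >> 3);   // vsri #11 keeps the top 11 bits
     */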
2385 vshll.u8 q9, d2, #8 2386.endm 2387 2388generate_composite_function \ 2389 pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \ 2390 FLAG_DST_WRITEONLY, \ 2391 8, /* number of pixels, processed in a single block */ \ 2392 10, /* prefetch distance */ \ 2393 default_init, \ 2394 default_cleanup, \ 2395 pixman_composite_src_0888_0565_rev_process_pixblock_head, \ 2396 pixman_composite_src_0888_0565_rev_process_pixblock_tail, \ 2397 pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \ 2398 28, /* dst_w_basereg */ \ 2399 0, /* dst_r_basereg */ \ 2400 0, /* src_basereg */ \ 2401 0 /* mask_basereg */ 2402 2403/******************************************************************************/ 2404 2405.macro pixman_composite_src_pixbuf_8888_process_pixblock_head 2406 vmull.u8 q8, d3, d0 2407 vmull.u8 q9, d3, d1 2408 vmull.u8 q10, d3, d2 2409.endm 2410 2411.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail 2412 vrshr.u16 q11, q8, #8 2413 vswp d3, d31 2414 vrshr.u16 q12, q9, #8 2415 vrshr.u16 q13, q10, #8 2416 vraddhn.u16 d30, q11, q8 2417 vraddhn.u16 d29, q12, q9 2418 vraddhn.u16 d28, q13, q10 2419.endm 2420 2421.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head 2422 vrshr.u16 q11, q8, #8 2423 vswp d3, d31 2424 vrshr.u16 q12, q9, #8 2425 vrshr.u16 q13, q10, #8 2426 fetch_src_pixblock 2427 vraddhn.u16 d30, q11, q8 2428 PF add PF_X, PF_X, #8 2429 PF tst PF_CTL, #0xF 2430 PF addne PF_X, PF_X, #8 2431 PF subne PF_CTL, PF_CTL, #1 2432 vraddhn.u16 d29, q12, q9 2433 vraddhn.u16 d28, q13, q10 2434 vmull.u8 q8, d3, d0 2435 vmull.u8 q9, d3, d1 2436 vmull.u8 q10, d3, d2 2437 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 2438 PF cmp PF_X, ORIG_W 2439 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] 2440 PF subge PF_X, PF_X, ORIG_W 2441 PF subges PF_CTL, PF_CTL, #0x10 2442 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 2443.endm 2444 2445generate_composite_function \ 2446 pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \ 2447 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ 2448 8, /* number of pixels, processed in a single block */ \ 2449 10, /* prefetch distance */ \ 2450 default_init, \ 2451 default_cleanup, \ 2452 pixman_composite_src_pixbuf_8888_process_pixblock_head, \ 2453 pixman_composite_src_pixbuf_8888_process_pixblock_tail, \ 2454 pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \ 2455 28, /* dst_w_basereg */ \ 2456 0, /* dst_r_basereg */ \ 2457 0, /* src_basereg */ \ 2458 0 /* mask_basereg */ 2459 2460/******************************************************************************/ 2461 2462.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head 2463 vmull.u8 q8, d3, d0 2464 vmull.u8 q9, d3, d1 2465 vmull.u8 q10, d3, d2 2466.endm 2467 2468.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail 2469 vrshr.u16 q11, q8, #8 2470 vswp d3, d31 2471 vrshr.u16 q12, q9, #8 2472 vrshr.u16 q13, q10, #8 2473 vraddhn.u16 d28, q11, q8 2474 vraddhn.u16 d29, q12, q9 2475 vraddhn.u16 d30, q13, q10 2476.endm 2477 2478.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head 2479 vrshr.u16 q11, q8, #8 2480 vswp d3, d31 2481 vrshr.u16 q12, q9, #8 2482 vrshr.u16 q13, q10, #8 2483 fetch_src_pixblock 2484 vraddhn.u16 d28, q11, q8 2485 PF add PF_X, PF_X, #8 2486 PF tst PF_CTL, #0xF 2487 PF addne PF_X, PF_X, #8 2488 PF subne PF_CTL, PF_CTL, #1 2489 vraddhn.u16 d29, q12, q9 2490 vraddhn.u16 d30, q13, q10 2491 vmull.u8 q8, d3, d0 2492 vmull.u8 q9, d3, d1 2493 vmull.u8 q10, d3, d2 2494 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 
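    /*
     * pixbuf/rpixbuf sources carry non-premultiplied color, so these two
     * functions multiply each color channel by alpha on the fly (the
     * vswp d3, d31 in the tail merely routes alpha into the output
     * register set). Roughly, in C (illustrative sketch):
     *
     *   out.a = in.a;
     *   out.c = mul_div_255 (in.c, in.a);   // three color channels
     *
     * The two variants differ only in the output ordering of the red and
     * blue results (r/b swap).
     */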
2495 PF cmp PF_X, ORIG_W 2496 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] 2497 PF subge PF_X, PF_X, ORIG_W 2498 PF subges PF_CTL, PF_CTL, #0x10 2499 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! 2500.endm 2501 2502generate_composite_function \ 2503 pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \ 2504 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ 2505 8, /* number of pixels, processed in a single block */ \ 2506 10, /* prefetch distance */ \ 2507 default_init, \ 2508 default_cleanup, \ 2509 pixman_composite_src_rpixbuf_8888_process_pixblock_head, \ 2510 pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \ 2511 pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \ 2512 28, /* dst_w_basereg */ \ 2513 0, /* dst_r_basereg */ \ 2514 0, /* src_basereg */ \ 2515 0 /* mask_basereg */ 2516 2517/******************************************************************************/ 2518 2519.macro pixman_composite_over_0565_8_0565_process_pixblock_head 2520 /* mask is in d15 */ 2521 convert_0565_to_x888 q4, d2, d1, d0 2522 convert_0565_to_x888 q5, d6, d5, d4 2523 /* source pixel data is in {d0, d1, d2, XX} */ 2524 /* destination pixel data is in {d4, d5, d6, XX} */ 2525 vmvn.8 d7, d15 2526 vmull.u8 q6, d15, d2 2527 vmull.u8 q5, d15, d1 2528 vmull.u8 q4, d15, d0 2529 vmull.u8 q8, d7, d4 2530 vmull.u8 q9, d7, d5 2531 vmull.u8 q13, d7, d6 2532 vrshr.u16 q12, q6, #8 2533 vrshr.u16 q11, q5, #8 2534 vrshr.u16 q10, q4, #8 2535 vraddhn.u16 d2, q6, q12 2536 vraddhn.u16 d1, q5, q11 2537 vraddhn.u16 d0, q4, q10 2538.endm 2539 2540.macro pixman_composite_over_0565_8_0565_process_pixblock_tail 2541 vrshr.u16 q14, q8, #8 2542 vrshr.u16 q15, q9, #8 2543 vrshr.u16 q12, q13, #8 2544 vraddhn.u16 d28, q14, q8 2545 vraddhn.u16 d29, q15, q9 2546 vraddhn.u16 d30, q12, q13 2547 vqadd.u8 q0, q0, q14 2548 vqadd.u8 q1, q1, q15 2549 /* 32bpp result is in {d0, d1, d2, XX} */ 2550 convert_8888_to_0565 d2, d1, d0, q14, q15, q3 2551.endm 2552 2553/* TODO: expand macros and do better instructions scheduling */ 2554.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head 2555 fetch_mask_pixblock 2556 pixman_composite_over_0565_8_0565_process_pixblock_tail 2557 fetch_src_pixblock 2558 vld1.16 {d10, d11}, [DST_R, :128]! 2559 cache_preload 8, 8 2560 pixman_composite_over_0565_8_0565_process_pixblock_head 2561 vst1.16 {d28, d29}, [DST_W, :128]! 
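    /*
     * Per-pixel model of this function (illustrative sketch): r5g6b5 has
     * no alpha, so the source alpha is implicitly 255 and OVER collapses
     * into a plain mask-weighted blend of the unpacked channels:
     *
     *   d.c  = sat_add (mul_div_255 (s.c, m),
     *                   mul_div_255 (d.c, 255 - m));
     *   dest = pack_0565 (d.r, d.g, d.b);
     */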
2562.endm 2563 2564generate_composite_function \ 2565 pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \ 2566 FLAG_DST_READWRITE, \ 2567 8, /* number of pixels, processed in a single block */ \ 2568 5, /* prefetch distance */ \ 2569 default_init_need_all_regs, \ 2570 default_cleanup_need_all_regs, \ 2571 pixman_composite_over_0565_8_0565_process_pixblock_head, \ 2572 pixman_composite_over_0565_8_0565_process_pixblock_tail, \ 2573 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \ 2574 28, /* dst_w_basereg */ \ 2575 10, /* dst_r_basereg */ \ 2576 8, /* src_basereg */ \ 2577 15 /* mask_basereg */ 2578 2579/******************************************************************************/ 2580 2581.macro pixman_composite_over_0565_n_0565_init 2582 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8) 2583 vpush {d8-d15} 2584 vld1.32 {d15[0]}, [DUMMY] 2585 vdup.8 d15, d15[3] 2586.endm 2587 2588.macro pixman_composite_over_0565_n_0565_cleanup 2589 vpop {d8-d15} 2590.endm 2591 2592generate_composite_function \ 2593 pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \ 2594 FLAG_DST_READWRITE, \ 2595 8, /* number of pixels, processed in a single block */ \ 2596 5, /* prefetch distance */ \ 2597 pixman_composite_over_0565_n_0565_init, \ 2598 pixman_composite_over_0565_n_0565_cleanup, \ 2599 pixman_composite_over_0565_8_0565_process_pixblock_head, \ 2600 pixman_composite_over_0565_8_0565_process_pixblock_tail, \ 2601 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \ 2602 28, /* dst_w_basereg */ \ 2603 10, /* dst_r_basereg */ \ 2604 8, /* src_basereg */ \ 2605 15 /* mask_basereg */ 2606 2607/******************************************************************************/ 2608 2609.macro pixman_composite_add_0565_8_0565_process_pixblock_head 2610 /* mask is in d15 */ 2611 convert_0565_to_x888 q4, d2, d1, d0 2612 convert_0565_to_x888 q5, d6, d5, d4 2613 /* source pixel data is in {d0, d1, d2, XX} */ 2614 /* destination pixel data is in {d4, d5, d6, XX} */ 2615 vmull.u8 q6, d15, d2 2616 vmull.u8 q5, d15, d1 2617 vmull.u8 q4, d15, d0 2618 vrshr.u16 q12, q6, #8 2619 vrshr.u16 q11, q5, #8 2620 vrshr.u16 q10, q4, #8 2621 vraddhn.u16 d2, q6, q12 2622 vraddhn.u16 d1, q5, q11 2623 vraddhn.u16 d0, q4, q10 2624.endm 2625 2626.macro pixman_composite_add_0565_8_0565_process_pixblock_tail 2627 vqadd.u8 q0, q0, q2 2628 vqadd.u8 q1, q1, q3 2629 /* 32bpp result is in {d0, d1, d2, XX} */ 2630 convert_8888_to_0565 d2, d1, d0, q14, q15, q3 2631.endm 2632 2633/* TODO: expand macros and do better instructions scheduling */ 2634.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head 2635 fetch_mask_pixblock 2636 pixman_composite_add_0565_8_0565_process_pixblock_tail 2637 fetch_src_pixblock 2638 vld1.16 {d10, d11}, [DST_R, :128]! 2639 cache_preload 8, 8 2640 pixman_composite_add_0565_8_0565_process_pixblock_head 2641 vst1.16 {d28, d29}, [DST_W, :128]! 
2642.endm 2643 2644generate_composite_function \ 2645 pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \ 2646 FLAG_DST_READWRITE, \ 2647 8, /* number of pixels, processed in a single block */ \ 2648 5, /* prefetch distance */ \ 2649 default_init_need_all_regs, \ 2650 default_cleanup_need_all_regs, \ 2651 pixman_composite_add_0565_8_0565_process_pixblock_head, \ 2652 pixman_composite_add_0565_8_0565_process_pixblock_tail, \ 2653 pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \ 2654 28, /* dst_w_basereg */ \ 2655 10, /* dst_r_basereg */ \ 2656 8, /* src_basereg */ \ 2657 15 /* mask_basereg */ 2658 2659/******************************************************************************/ 2660 2661.macro pixman_composite_out_reverse_8_0565_process_pixblock_head 2662 /* mask is in d15 */ 2663 convert_0565_to_x888 q5, d6, d5, d4 2664 /* destination pixel data is in {d4, d5, d6, xx} */ 2665 vmvn.8 d24, d15 /* get inverted alpha */ 2666 /* now do alpha blending */ 2667 vmull.u8 q8, d24, d4 2668 vmull.u8 q9, d24, d5 2669 vmull.u8 q10, d24, d6 2670.endm 2671 2672.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail 2673 vrshr.u16 q14, q8, #8 2674 vrshr.u16 q15, q9, #8 2675 vrshr.u16 q12, q10, #8 2676 vraddhn.u16 d0, q14, q8 2677 vraddhn.u16 d1, q15, q9 2678 vraddhn.u16 d2, q12, q10 2679 /* 32bpp result is in {d0, d1, d2, XX} */ 2680 convert_8888_to_0565 d2, d1, d0, q14, q15, q3 2681.endm 2682 2683/* TODO: expand macros and do better instructions scheduling */ 2684.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head 2685 fetch_src_pixblock 2686 pixman_composite_out_reverse_8_0565_process_pixblock_tail 2687 vld1.16 {d10, d11}, [DST_R, :128]! 2688 cache_preload 8, 8 2689 pixman_composite_out_reverse_8_0565_process_pixblock_head 2690 vst1.16 {d28, d29}, [DST_W, :128]! 2691.endm 2692 2693generate_composite_function \ 2694 pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \ 2695 FLAG_DST_READWRITE, \ 2696 8, /* number of pixels, processed in a single block */ \ 2697 5, /* prefetch distance */ \ 2698 default_init_need_all_regs, \ 2699 default_cleanup_need_all_regs, \ 2700 pixman_composite_out_reverse_8_0565_process_pixblock_head, \ 2701 pixman_composite_out_reverse_8_0565_process_pixblock_tail, \ 2702 pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \ 2703 28, /* dst_w_basereg */ \ 2704 10, /* dst_r_basereg */ \ 2705 15, /* src_basereg */ \ 2706 0 /* mask_basereg */ 2707 2708/******************************************************************************/ 2709 2710.macro pixman_composite_out_reverse_8_8888_process_pixblock_head 2711 /* src is in d0 */ 2712 /* destination pixel data is in {d4, d5, d6, d7} */ 2713 vmvn.8 d1, d0 /* get inverted alpha */ 2714 /* now do alpha blending */ 2715 vmull.u8 q8, d1, d4 2716 vmull.u8 q9, d1, d5 2717 vmull.u8 q10, d1, d6 2718 vmull.u8 q11, d1, d7 2719.endm 2720 2721.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail 2722 vrshr.u16 q14, q8, #8 2723 vrshr.u16 q15, q9, #8 2724 vrshr.u16 q12, q10, #8 2725 vrshr.u16 q13, q11, #8 2726 vraddhn.u16 d28, q14, q8 2727 vraddhn.u16 d29, q15, q9 2728 vraddhn.u16 d30, q12, q10 2729 vraddhn.u16 d31, q13, q11 2730 /* 32bpp result is in {d28, d29, d30, d31} */ 2731.endm 2732 2733/* TODO: expand macros and do better instructions scheduling */ 2734.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head 2735 fetch_src_pixblock 2736 pixman_composite_out_reverse_8_8888_process_pixblock_tail 2737 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]! 
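    /* with an a8 source, 'out_reverse' is simply
     *     d.c = mul_div_255 (d.c, 255 - s)
     * for every destination channel; the vmvn.8 in the head macro
     * computes the 255 - s term */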
2738 cache_preload 8, 8 2739 pixman_composite_out_reverse_8_8888_process_pixblock_head 2740 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! 2741.endm 2742 2743generate_composite_function \ 2744 pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \ 2745 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2746 8, /* number of pixels, processed in a single block */ \ 2747 5, /* prefetch distance */ \ 2748 default_init, \ 2749 default_cleanup, \ 2750 pixman_composite_out_reverse_8_8888_process_pixblock_head, \ 2751 pixman_composite_out_reverse_8_8888_process_pixblock_tail, \ 2752 pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \ 2753 28, /* dst_w_basereg */ \ 2754 4, /* dst_r_basereg */ \ 2755 0, /* src_basereg */ \ 2756 0 /* mask_basereg */ 2757 2758/******************************************************************************/ 2759 2760generate_composite_function_nearest_scanline \ 2761 pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \ 2762 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2763 8, /* number of pixels, processed in a single block */ \ 2764 default_init, \ 2765 default_cleanup, \ 2766 pixman_composite_over_8888_8888_process_pixblock_head, \ 2767 pixman_composite_over_8888_8888_process_pixblock_tail, \ 2768 pixman_composite_over_8888_8888_process_pixblock_tail_head 2769 2770generate_composite_function_nearest_scanline \ 2771 pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \ 2772 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2773 8, /* number of pixels, processed in a single block */ \ 2774 default_init, \ 2775 default_cleanup, \ 2776 pixman_composite_over_8888_0565_process_pixblock_head, \ 2777 pixman_composite_over_8888_0565_process_pixblock_tail, \ 2778 pixman_composite_over_8888_0565_process_pixblock_tail_head, \ 2779 28, /* dst_w_basereg */ \ 2780 4, /* dst_r_basereg */ \ 2781 0, /* src_basereg */ \ 2782 24 /* mask_basereg */ 2783 2784generate_composite_function_nearest_scanline \ 2785 pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \ 2786 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ 2787 8, /* number of pixels, processed in a single block */ \ 2788 default_init, \ 2789 default_cleanup, \ 2790 pixman_composite_src_8888_0565_process_pixblock_head, \ 2791 pixman_composite_src_8888_0565_process_pixblock_tail, \ 2792 pixman_composite_src_8888_0565_process_pixblock_tail_head 2793 2794generate_composite_function_nearest_scanline \ 2795 pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \ 2796 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ 2797 8, /* number of pixels, processed in a single block */ \ 2798 default_init, \ 2799 default_cleanup, \ 2800 pixman_composite_src_0565_8888_process_pixblock_head, \ 2801 pixman_composite_src_0565_8888_process_pixblock_tail, \ 2802 pixman_composite_src_0565_8888_process_pixblock_tail_head 2803 2804generate_composite_function_nearest_scanline \ 2805 pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \ 2806 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ 2807 8, /* number of pixels, processed in a single block */ \ 2808 default_init_need_all_regs, \ 2809 default_cleanup_need_all_regs, \ 2810 pixman_composite_over_8888_8_0565_process_pixblock_head, \ 2811 pixman_composite_over_8888_8_0565_process_pixblock_tail, \ 2812 pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \ 2813 28, /* dst_w_basereg */ \ 2814 4, /* dst_r_basereg */ \ 2815 8, /* src_basereg */ \ 2816 24 /* mask_basereg */ 2817 
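/*
 * All the nearest scanline functions in this group (above and below) share
 * the same 16.16 fixed point source stepping; a rough C model of the loop
 * provided by the 'generate_composite_function_nearest_scanline' template
 * (an illustrative sketch only, the real loop lives in
 * pixman-arm-neon-asm.h):
 *
 *   while (width--)
 *   {
 *       s = src[vx >> 16];
 *       vx += unit_x;
 *       *dst = composite (s, *dst);
 *       dst++;
 *   }
 */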
2818generate_composite_function_nearest_scanline \ 2819 pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \ 2820 FLAG_DST_READWRITE, \ 2821 8, /* number of pixels, processed in a single block */ \ 2822 default_init_need_all_regs, \ 2823 default_cleanup_need_all_regs, \ 2824 pixman_composite_over_0565_8_0565_process_pixblock_head, \ 2825 pixman_composite_over_0565_8_0565_process_pixblock_tail, \ 2826 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \ 2827 28, /* dst_w_basereg */ \ 2828 10, /* dst_r_basereg */ \ 2829 8, /* src_basereg */ \ 2830 15 /* mask_basereg */ 2831 2832/******************************************************************************/ 2833 2834/* 2835 * Bilinear scaling support code which tries to provide pixel fetching, color 2836 * format conversion, and interpolation as separate macros which can be used 2837 * as the basic building blocks for constructing bilinear scanline functions. 2838 */ 2839 2840.macro bilinear_load_8888 reg1, reg2, tmp 2841 mov TMP1, X, asr #16 2842 add X, X, UX 2843 add TMP1, TOP, TMP1, asl #2 2844 vld1.32 {reg1}, [TMP1], STRIDE 2845 vld1.32 {reg2}, [TMP1] 2846.endm 2847 2848.macro bilinear_load_0565 reg1, reg2, tmp 2849 mov TMP1, X, asr #16 2850 add X, X, UX 2851 add TMP1, TOP, TMP1, asl #1 2852 vld1.32 {reg2[0]}, [TMP1], STRIDE 2853 vld1.32 {reg2[1]}, [TMP1] 2854 convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp 2855.endm 2856 2857.macro bilinear_load_and_vertical_interpolate_two_8888 \ 2858 acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2 2859 2860 bilinear_load_8888 reg1, reg2, tmp1 2861 vmull.u8 acc1, reg1, d28 2862 vmlal.u8 acc1, reg2, d29 2863 bilinear_load_8888 reg3, reg4, tmp2 2864 vmull.u8 acc2, reg3, d28 2865 vmlal.u8 acc2, reg4, d29 2866.endm 2867 2868.macro bilinear_load_and_vertical_interpolate_four_8888 \ 2869 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ 2870 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi 2871 2872 bilinear_load_and_vertical_interpolate_two_8888 \ 2873 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi 2874 bilinear_load_and_vertical_interpolate_two_8888 \ 2875 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi 2876.endm 2877 2878.macro bilinear_load_and_vertical_interpolate_two_0565 \ 2879 acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi 2880 2881 mov TMP1, X, asr #16 2882 add X, X, UX 2883 add TMP1, TOP, TMP1, asl #1 2884 mov TMP2, X, asr #16 2885 add X, X, UX 2886 add TMP2, TOP, TMP2, asl #1 2887 vld1.32 {acc2lo[0]}, [TMP1], STRIDE 2888 vld1.32 {acc2hi[0]}, [TMP2], STRIDE 2889 vld1.32 {acc2lo[1]}, [TMP1] 2890 vld1.32 {acc2hi[1]}, [TMP2] 2891 convert_0565_to_x888 acc2, reg3, reg2, reg1 2892 vzip.u8 reg1, reg3 2893 vzip.u8 reg2, reg4 2894 vzip.u8 reg3, reg4 2895 vzip.u8 reg1, reg2 2896 vmull.u8 acc1, reg1, d28 2897 vmlal.u8 acc1, reg2, d29 2898 vmull.u8 acc2, reg3, d28 2899 vmlal.u8 acc2, reg4, d29 2900.endm 2901 2902.macro bilinear_load_and_vertical_interpolate_four_0565 \ 2903 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ 2904 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi 2905 2906 mov TMP1, X, asr #16 2907 add X, X, UX 2908 add TMP1, TOP, TMP1, asl #1 2909 mov TMP2, X, asr #16 2910 add X, X, UX 2911 add TMP2, TOP, TMP2, asl #1 2912 vld1.32 {xacc2lo[0]}, [TMP1], STRIDE 2913 vld1.32 {xacc2hi[0]}, [TMP2], STRIDE 2914 vld1.32 {xacc2lo[1]}, [TMP1] 2915 vld1.32 {xacc2hi[1]}, [TMP2] 2916 convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1 2917 mov TMP1, X, asr #16 2918 add X, X, UX 2919 add TMP1, TOP, TMP1, asl #1 
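    /* TMP1 = TOP + (X >> 16) * 2: the 16.16 fixed point coordinate is
       converted to a byte offset into the top scanline (2 bytes per
       r5g6b5 pixel); the pixel right below it is then reached by the
       post-incrementing STRIDE load, STRIDE being BOTTOM - TOP */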
2920 mov TMP2, X, asr #16 2921 add X, X, UX 2922 add TMP2, TOP, TMP2, asl #1 2923 vld1.32 {yacc2lo[0]}, [TMP1], STRIDE 2924 vzip.u8 xreg1, xreg3 2925 vld1.32 {yacc2hi[0]}, [TMP2], STRIDE 2926 vzip.u8 xreg2, xreg4 2927 vld1.32 {yacc2lo[1]}, [TMP1] 2928 vzip.u8 xreg3, xreg4 2929 vld1.32 {yacc2hi[1]}, [TMP2] 2930 vzip.u8 xreg1, xreg2 2931 convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1 2932 vmull.u8 xacc1, xreg1, d28 2933 vzip.u8 yreg1, yreg3 2934 vmlal.u8 xacc1, xreg2, d29 2935 vzip.u8 yreg2, yreg4 2936 vmull.u8 xacc2, xreg3, d28 2937 vzip.u8 yreg3, yreg4 2938 vmlal.u8 xacc2, xreg4, d29 2939 vzip.u8 yreg1, yreg2 2940 vmull.u8 yacc1, yreg1, d28 2941 vmlal.u8 yacc1, yreg2, d29 2942 vmull.u8 yacc2, yreg3, d28 2943 vmlal.u8 yacc2, yreg4, d29 2944.endm 2945 2946.macro bilinear_store_8888 numpix, tmp1, tmp2 2947.if numpix == 4 2948 vst1.32 {d0, d1}, [OUT, :128]! 2949.elseif numpix == 2 2950 vst1.32 {d0}, [OUT, :64]! 2951.elseif numpix == 1 2952 vst1.32 {d0[0]}, [OUT, :32]! 2953.else 2954 .error bilinear_store_8888 numpix is unsupported 2955.endif 2956.endm 2957 2958.macro bilinear_store_0565 numpix, tmp1, tmp2 2959 vuzp.u8 d0, d1 2960 vuzp.u8 d2, d3 2961 vuzp.u8 d1, d3 2962 vuzp.u8 d0, d2 2963 convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2 2964.if numpix == 4 2965 vst1.16 {d2}, [OUT, :64]! 2966.elseif numpix == 2 2967 vst1.32 {d2[0]}, [OUT, :32]! 2968.elseif numpix == 1 2969 vst1.16 {d2[0]}, [OUT, :16]! 2970.else 2971 .error bilinear_store_0565 numpix is unsupported 2972.endif 2973.endm 2974 2975.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt 2976 bilinear_load_&src_fmt d0, d1, d2 2977 vmull.u8 q1, d0, d28 2978 vmlal.u8 q1, d1, d29 2979 /* 5 cycles bubble */ 2980 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS 2981 vmlsl.u16 q0, d2, d30 2982 vmlal.u16 q0, d3, d30 2983 /* 5 cycles bubble */ 2984 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) 2985 /* 3 cycles bubble */ 2986 vmovn.u16 d0, q0 2987 /* 1 cycle bubble */ 2988 bilinear_store_&dst_fmt 1, q2, q3 2989.endm 2990 2991.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt 2992 bilinear_load_and_vertical_interpolate_two_&src_fmt \ 2993 q1, q11, d0, d1, d20, d21, d22, d23 2994 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS 2995 vmlsl.u16 q0, d2, d30 2996 vmlal.u16 q0, d3, d30 2997 vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS 2998 vmlsl.u16 q10, d22, d31 2999 vmlal.u16 q10, d23, d31 3000 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) 3001 vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS) 3002 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3003 vadd.u16 q12, q12, q13 3004 vmovn.u16 d0, q0 3005 bilinear_store_&dst_fmt 2, q2, q3 3006.endm 3007 3008.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt 3009 bilinear_load_and_vertical_interpolate_four_&src_fmt \ 3010 q1, q11, d0, d1, d20, d21, d22, d23 \ 3011 q3, q9, d4, d5, d16, d17, d18, d19 3012 pld [TMP1, PF_OFFS] 3013 sub TMP1, TMP1, STRIDE 3014 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS 3015 vmlsl.u16 q0, d2, d30 3016 vmlal.u16 q0, d3, d30 3017 vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS 3018 vmlsl.u16 q10, d22, d31 3019 vmlal.u16 q10, d23, d31 3020 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3021 vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS 3022 vmlsl.u16 q2, d6, d30 3023 vmlal.u16 q2, d7, d30 3024 vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS 3025 pld [TMP2, PF_OFFS] 3026 vmlsl.u16 q8, d18, d31 3027 vmlal.u16 q8, d19, d31 3028 vadd.u16 q12, q12, q13 3029 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) 3030 vshrn.u32 d1, q10, 
#(2 * BILINEAR_INTERPOLATION_BITS) 3031 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) 3032 vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS) 3033 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3034 vmovn.u16 d0, q0 3035 vmovn.u16 d1, q2 3036 vadd.u16 q12, q12, q13 3037 bilinear_store_&dst_fmt 4, q2, q3 3038.endm 3039 3040.macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt 3041.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt 3042 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head 3043.else 3044 bilinear_interpolate_four_pixels src_fmt, dst_fmt 3045.endif 3046.endm 3047 3048.macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt 3049.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt 3050 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail 3051.endif 3052.endm 3053 3054.macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt 3055.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt 3056 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head 3057.else 3058 bilinear_interpolate_four_pixels src_fmt, dst_fmt 3059.endif 3060.endm 3061 3062.macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt 3063.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt 3064 bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head 3065.else 3066 bilinear_interpolate_four_pixels_head src_fmt, dst_fmt 3067 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt 3068.endif 3069.endm 3070 3071.macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt 3072.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt 3073 bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail 3074.else 3075 bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt 3076.endif 3077.endm 3078 3079.macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt 3080.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt 3081 bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head 3082.else 3083 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt 3084 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt 3085.endif 3086.endm 3087 3088.set BILINEAR_FLAG_UNROLL_4, 0 3089.set BILINEAR_FLAG_UNROLL_8, 1 3090.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2 3091 3092/* 3093 * Main template macro for generating NEON optimized bilinear scanline 3094 * functions. 
3095 * 3096 * Bilinear scanline scaler macro template uses the following arguments: 3097 * fname - name of the function to generate 3098 * src_fmt - source color format (8888 or 0565) 3099 * dst_fmt - destination color format (8888 or 0565) 3100 * bpp_shift - (1 << bpp_shift) is the size of source pixel in bytes 3101 * prefetch_distance - prefetch in the source image by that many 3102 * pixels ahead 3103 */ 3104 3105.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \ 3106 src_bpp_shift, dst_bpp_shift, \ 3107 prefetch_distance, flags 3108 3109pixman_asm_function fname 3110 OUT .req r0 3111 TOP .req r1 3112 BOTTOM .req r2 3113 WT .req r3 3114 WB .req r4 3115 X .req r5 3116 UX .req r6 3117 WIDTH .req ip 3118 TMP1 .req r3 3119 TMP2 .req r4 3120 PF_OFFS .req r7 3121 TMP3 .req r8 3122 TMP4 .req r9 3123 STRIDE .req r2 3124 3125 mov ip, sp 3126 push {r4, r5, r6, r7, r8, r9} 3127 mov PF_OFFS, #prefetch_distance 3128 ldmia ip, {WB, X, UX, WIDTH} 3129 mul PF_OFFS, PF_OFFS, UX 3130 3131.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 3132 vpush {d8-d15} 3133.endif 3134 3135 sub STRIDE, BOTTOM, TOP 3136 .unreq BOTTOM 3137 3138 cmp WIDTH, #0 3139 ble 3f 3140 3141 vdup.u16 q12, X 3142 vdup.u16 q13, UX 3143 vdup.u8 d28, WT 3144 vdup.u8 d29, WB 3145 vadd.u16 d25, d25, d26 3146 3147 /* ensure good destination alignment */ 3148 cmp WIDTH, #1 3149 blt 0f 3150 tst OUT, #(1 << dst_bpp_shift) 3151 beq 0f 3152 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3153 vadd.u16 q12, q12, q13 3154 bilinear_interpolate_last_pixel src_fmt, dst_fmt 3155 sub WIDTH, WIDTH, #1 31560: 3157 vadd.u16 q13, q13, q13 3158 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3159 vadd.u16 q12, q12, q13 3160 3161 cmp WIDTH, #2 3162 blt 0f 3163 tst OUT, #(1 << (dst_bpp_shift + 1)) 3164 beq 0f 3165 bilinear_interpolate_two_pixels src_fmt, dst_fmt 3166 sub WIDTH, WIDTH, #2 31670: 3168.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0 3169/*********** 8 pixels per iteration *****************/ 3170 cmp WIDTH, #4 3171 blt 0f 3172 tst OUT, #(1 << (dst_bpp_shift + 2)) 3173 beq 0f 3174 bilinear_interpolate_four_pixels src_fmt, dst_fmt 3175 sub WIDTH, WIDTH, #4 31760: 3177 subs WIDTH, WIDTH, #8 3178 blt 1f 3179 mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift) 3180 bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt 3181 subs WIDTH, WIDTH, #8 3182 blt 5f 31830: 3184 bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt 3185 subs WIDTH, WIDTH, #8 3186 bge 0b 31875: 3188 bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt 31891: 3190 tst WIDTH, #4 3191 beq 2f 3192 bilinear_interpolate_four_pixels src_fmt, dst_fmt 31932: 3194.else 3195/*********** 4 pixels per iteration *****************/ 3196 subs WIDTH, WIDTH, #4 3197 blt 1f 3198 mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift) 3199 bilinear_interpolate_four_pixels_head src_fmt, dst_fmt 3200 subs WIDTH, WIDTH, #4 3201 blt 5f 32020: 3203 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt 3204 subs WIDTH, WIDTH, #4 3205 bge 0b 32065: 3207 bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt 32081: 3209/****************************************************/ 3210.endif 3211 /* handle the remaining trailing pixels */ 3212 tst WIDTH, #2 3213 beq 2f 3214 bilinear_interpolate_two_pixels src_fmt, dst_fmt 32152: 3216 tst WIDTH, #1 3217 beq 3f 3218 bilinear_interpolate_last_pixel src_fmt, dst_fmt 32193: 3220.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0 3221 vpop {d8-d15} 3222.endif 3223 pop {r4, r5, r6, r7, r8, r9} 3224 bx lr 3225 3226 .unreq OUT 3227 .unreq 
TOP 3228 .unreq WT 3229 .unreq WB 3230 .unreq X 3231 .unreq UX 3232 .unreq WIDTH 3233 .unreq TMP1 3234 .unreq TMP2 3235 .unreq PF_OFFS 3236 .unreq TMP3 3237 .unreq TMP4 3238 .unreq STRIDE 3239.endfunc 3240 3241.endm 3242 3243/*****************************************************************************/ 3244 3245.set have_bilinear_interpolate_four_pixels_8888_8888, 1 3246 3247.macro bilinear_interpolate_four_pixels_8888_8888_head 3248 mov TMP1, X, asr #16 3249 add X, X, UX 3250 add TMP1, TOP, TMP1, asl #2 3251 mov TMP2, X, asr #16 3252 add X, X, UX 3253 add TMP2, TOP, TMP2, asl #2 3254 3255 vld1.32 {d22}, [TMP1], STRIDE 3256 vld1.32 {d23}, [TMP1] 3257 mov TMP3, X, asr #16 3258 add X, X, UX 3259 add TMP3, TOP, TMP3, asl #2 3260 vmull.u8 q8, d22, d28 3261 vmlal.u8 q8, d23, d29 3262 3263 vld1.32 {d22}, [TMP2], STRIDE 3264 vld1.32 {d23}, [TMP2] 3265 mov TMP4, X, asr #16 3266 add X, X, UX 3267 add TMP4, TOP, TMP4, asl #2 3268 vmull.u8 q9, d22, d28 3269 vmlal.u8 q9, d23, d29 3270 3271 vld1.32 {d22}, [TMP3], STRIDE 3272 vld1.32 {d23}, [TMP3] 3273 vmull.u8 q10, d22, d28 3274 vmlal.u8 q10, d23, d29 3275 3276 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS 3277 vmlsl.u16 q0, d16, d30 3278 vmlal.u16 q0, d17, d30 3279 3280 pld [TMP4, PF_OFFS] 3281 vld1.32 {d16}, [TMP4], STRIDE 3282 vld1.32 {d17}, [TMP4] 3283 pld [TMP4, PF_OFFS] 3284 vmull.u8 q11, d16, d28 3285 vmlal.u8 q11, d17, d29 3286 3287 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS 3288 vmlsl.u16 q1, d18, d31 3289.endm 3290 3291.macro bilinear_interpolate_four_pixels_8888_8888_tail 3292 vmlal.u16 q1, d19, d31 3293 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3294 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS 3295 vmlsl.u16 q2, d20, d30 3296 vmlal.u16 q2, d21, d30 3297 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS 3298 vmlsl.u16 q3, d22, d31 3299 vmlal.u16 q3, d23, d31 3300 vadd.u16 q12, q12, q13 3301 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) 3302 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) 3303 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) 3304 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3305 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) 3306 vmovn.u16 d6, q0 3307 vmovn.u16 d7, q2 3308 vadd.u16 q12, q12, q13 3309 vst1.32 {d6, d7}, [OUT, :128]! 
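    /*
     * Reference model for one output pixel of the block above
     * (illustrative C sketch; wt/wb are the vertical weights broadcast
     * in d28/d29, with wt + wb == 1 << BILINEAR_INTERPOLATION_BITS, and
     * wx is the per-pixel horizontal weight taken from q12):
     *
     *   t = tl * wt + bl * wb;   // vertical pass, vmull.u8 + vmlal.u8
     *   r = tr * wt + br * wb;
     *   p = (t * ((1 << BILINEAR_INTERPOLATION_BITS) - wx) + r * wx)
     *           >> (2 * BILINEAR_INTERPOLATION_BITS);  // horizontal pass
     */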
3310.endm 3311 3312.macro bilinear_interpolate_four_pixels_8888_8888_tail_head 3313 mov TMP1, X, asr #16 3314 add X, X, UX 3315 add TMP1, TOP, TMP1, asl #2 3316 mov TMP2, X, asr #16 3317 add X, X, UX 3318 add TMP2, TOP, TMP2, asl #2 3319 vmlal.u16 q1, d19, d31 3320 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3321 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS 3322 vmlsl.u16 q2, d20, d30 3323 vmlal.u16 q2, d21, d30 3324 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS 3325 vld1.32 {d20}, [TMP1], STRIDE 3326 vmlsl.u16 q3, d22, d31 3327 vmlal.u16 q3, d23, d31 3328 vld1.32 {d21}, [TMP1] 3329 vmull.u8 q8, d20, d28 3330 vmlal.u8 q8, d21, d29 3331 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS) 3332 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS) 3333 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS) 3334 vld1.32 {d22}, [TMP2], STRIDE 3335 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS) 3336 vadd.u16 q12, q12, q13 3337 vld1.32 {d23}, [TMP2] 3338 vmull.u8 q9, d22, d28 3339 mov TMP3, X, asr #16 3340 add X, X, UX 3341 add TMP3, TOP, TMP3, asl #2 3342 mov TMP4, X, asr #16 3343 add X, X, UX 3344 add TMP4, TOP, TMP4, asl #2 3345 vmlal.u8 q9, d23, d29 3346 vld1.32 {d22}, [TMP3], STRIDE 3347 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3348 vld1.32 {d23}, [TMP3] 3349 vmull.u8 q10, d22, d28 3350 vmlal.u8 q10, d23, d29 3351 vmovn.u16 d6, q0 3352 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS 3353 vmovn.u16 d7, q2 3354 vmlsl.u16 q0, d16, d30 3355 vmlal.u16 q0, d17, d30 3356 pld [TMP4, PF_OFFS] 3357 vld1.32 {d16}, [TMP4], STRIDE 3358 vadd.u16 q12, q12, q13 3359 vld1.32 {d17}, [TMP4] 3360 pld [TMP4, PF_OFFS] 3361 vmull.u8 q11, d16, d28 3362 vmlal.u8 q11, d17, d29 3363 vst1.32 {d6, d7}, [OUT, :128]! 3364 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS 3365 vmlsl.u16 q1, d18, d31 3366.endm 3367 3368/*****************************************************************************/ 3369 3370.set have_bilinear_interpolate_eight_pixels_8888_0565, 1 3371 3372.macro bilinear_interpolate_eight_pixels_8888_0565_head 3373 mov TMP1, X, asr #16 3374 add X, X, UX 3375 add TMP1, TOP, TMP1, asl #2 3376 mov TMP2, X, asr #16 3377 add X, X, UX 3378 add TMP2, TOP, TMP2, asl #2 3379 vld1.32 {d20}, [TMP1], STRIDE 3380 vld1.32 {d21}, [TMP1] 3381 vmull.u8 q8, d20, d28 3382 vmlal.u8 q8, d21, d29 3383 vld1.32 {d22}, [TMP2], STRIDE 3384 vld1.32 {d23}, [TMP2] 3385 vmull.u8 q9, d22, d28 3386 mov TMP3, X, asr #16 3387 add X, X, UX 3388 add TMP3, TOP, TMP3, asl #2 3389 mov TMP4, X, asr #16 3390 add X, X, UX 3391 add TMP4, TOP, TMP4, asl #2 3392 vmlal.u8 q9, d23, d29 3393 vld1.32 {d22}, [TMP3], STRIDE 3394 vld1.32 {d23}, [TMP3] 3395 vmull.u8 q10, d22, d28 3396 vmlal.u8 q10, d23, d29 3397 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS 3398 vmlsl.u16 q0, d16, d30 3399 vmlal.u16 q0, d17, d30 3400 pld [TMP4, PF_OFFS] 3401 vld1.32 {d16}, [TMP4], STRIDE 3402 vld1.32 {d17}, [TMP4] 3403 pld [TMP4, PF_OFFS] 3404 vmull.u8 q11, d16, d28 3405 vmlal.u8 q11, d17, d29 3406 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS 3407 vmlsl.u16 q1, d18, d31 3408 3409 mov TMP1, X, asr #16 3410 add X, X, UX 3411 add TMP1, TOP, TMP1, asl #2 3412 mov TMP2, X, asr #16 3413 add X, X, UX 3414 add TMP2, TOP, TMP2, asl #2 3415 vmlal.u16 q1, d19, d31 3416 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS) 3417 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS 3418 vmlsl.u16 q2, d20, d30 3419 vmlal.u16 q2, d21, d30 3420 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS 3421 vld1.32 {d20}, [TMP1], STRIDE 3422 vmlsl.u16 q3, d22, d31 3423 
.macro bilinear_interpolate_eight_pixels_8888_0565_head
    mov TMP1, X, asr #16
    add X, X, UX
    add TMP1, TOP, TMP1, asl #2
    mov TMP2, X, asr #16
    add X, X, UX
    add TMP2, TOP, TMP2, asl #2
    vld1.32 {d20}, [TMP1], STRIDE
    vld1.32 {d21}, [TMP1]
    vmull.u8 q8, d20, d28
    vmlal.u8 q8, d21, d29
    vld1.32 {d22}, [TMP2], STRIDE
    vld1.32 {d23}, [TMP2]
    vmull.u8 q9, d22, d28
    mov TMP3, X, asr #16
    add X, X, UX
    add TMP3, TOP, TMP3, asl #2
    mov TMP4, X, asr #16
    add X, X, UX
    add TMP4, TOP, TMP4, asl #2
    vmlal.u8 q9, d23, d29
    vld1.32 {d22}, [TMP3], STRIDE
    vld1.32 {d23}, [TMP3]
    vmull.u8 q10, d22, d28
    vmlal.u8 q10, d23, d29
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30
    pld [TMP4, PF_OFFS]
    vld1.32 {d16}, [TMP4], STRIDE
    vld1.32 {d17}, [TMP4]
    pld [TMP4, PF_OFFS]
    vmull.u8 q11, d16, d28
    vmlal.u8 q11, d17, d29
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31

    mov TMP1, X, asr #16
    add X, X, UX
    add TMP1, TOP, TMP1, asl #2
    mov TMP2, X, asr #16
    add X, X, UX
    add TMP2, TOP, TMP2, asl #2
    vmlal.u16 q1, d19, d31
    vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vld1.32 {d20}, [TMP1], STRIDE
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vld1.32 {d21}, [TMP1]
    vmull.u8 q8, d20, d28
    vmlal.u8 q8, d21, d29
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32 {d22}, [TMP2], STRIDE
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vadd.u16 q12, q12, q13
    vld1.32 {d23}, [TMP2]
    vmull.u8 q9, d22, d28
    mov TMP3, X, asr #16
    add X, X, UX
    add TMP3, TOP, TMP3, asl #2
    mov TMP4, X, asr #16
    add X, X, UX
    add TMP4, TOP, TMP4, asl #2
    vmlal.u8 q9, d23, d29
    vld1.32 {d22}, [TMP3], STRIDE
    vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32 {d23}, [TMP3]
    vmull.u8 q10, d22, d28
    vmlal.u8 q10, d23, d29
    vmovn.u16 d8, q0
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmovn.u16 d9, q2
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30
    pld [TMP4, PF_OFFS]
    vld1.32 {d16}, [TMP4], STRIDE
    vadd.u16 q12, q12, q13
    vld1.32 {d17}, [TMP4]
    pld [TMP4, PF_OFFS]
    vmull.u8 q11, d16, d28
    vmlal.u8 q11, d17, d29
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31
.endm

.macro bilinear_interpolate_eight_pixels_8888_0565_tail
    vmlal.u16 q1, d19, d31
    vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vadd.u16 q12, q12, q13
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vmovn.u16 d10, q0
    vmovn.u16 d11, q2
    vadd.u16 q12, q12, q13

    vuzp.u8 d8, d9
    vuzp.u8 d10, d11
    vuzp.u8 d9, d11
    vuzp.u8 d8, d10
    vshll.u8 q6, d9, #8
    vshll.u8 q5, d10, #8
    vshll.u8 q7, d8, #8
    vsri.u16 q5, q6, #5
    vsri.u16 q5, q7, #11
    vst1.32 {d10, d11}, [OUT, :128]!
.endm

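/*
 * Pipelined 'tail_head' variant for the r5g6b5 output path: the channel
 * deinterleaving, 16bpp packing and store of the previous eight pixels are
 * spread out between the loads and interpolation arithmetic of the next
 * eight pixels.
 */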
.macro bilinear_interpolate_eight_pixels_8888_0565_tail_head
    mov TMP1, X, asr #16
    add X, X, UX
    add TMP1, TOP, TMP1, asl #2
    mov TMP2, X, asr #16
    add X, X, UX
    add TMP2, TOP, TMP2, asl #2
    vmlal.u16 q1, d19, d31
    vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vuzp.u8 d8, d9
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vld1.32 {d20}, [TMP1], STRIDE
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vld1.32 {d21}, [TMP1]
    vmull.u8 q8, d20, d28
    vmlal.u8 q8, d21, d29
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32 {d22}, [TMP2], STRIDE
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vadd.u16 q12, q12, q13
    vld1.32 {d23}, [TMP2]
    vmull.u8 q9, d22, d28
    mov TMP3, X, asr #16
    add X, X, UX
    add TMP3, TOP, TMP3, asl #2
    mov TMP4, X, asr #16
    add X, X, UX
    add TMP4, TOP, TMP4, asl #2
    vmlal.u8 q9, d23, d29
    vld1.32 {d22}, [TMP3], STRIDE
    vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32 {d23}, [TMP3]
    vmull.u8 q10, d22, d28
    vmlal.u8 q10, d23, d29
    vmovn.u16 d10, q0
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmovn.u16 d11, q2
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30
    pld [TMP4, PF_OFFS]
    vld1.32 {d16}, [TMP4], STRIDE
    vadd.u16 q12, q12, q13
    vld1.32 {d17}, [TMP4]
    pld [TMP4, PF_OFFS]
    vmull.u8 q11, d16, d28
    vmlal.u8 q11, d17, d29
    vuzp.u8 d10, d11
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31

    mov TMP1, X, asr #16
    add X, X, UX
    add TMP1, TOP, TMP1, asl #2
    mov TMP2, X, asr #16
    add X, X, UX
    add TMP2, TOP, TMP2, asl #2
    vmlal.u16 q1, d19, d31
    vuzp.u8 d9, d11
    vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vuzp.u8 d8, d10
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vld1.32 {d20}, [TMP1], STRIDE
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vld1.32 {d21}, [TMP1]
    vmull.u8 q8, d20, d28
    vmlal.u8 q8, d21, d29
    vshll.u8 q6, d9, #8
    vshll.u8 q5, d10, #8
    vshll.u8 q7, d8, #8
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vsri.u16 q5, q6, #5
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vsri.u16 q5, q7, #11
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32 {d22}, [TMP2], STRIDE
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vadd.u16 q12, q12, q13
    vld1.32 {d23}, [TMP2]
    vmull.u8 q9, d22, d28
    mov TMP3, X, asr #16
    add X, X, UX
    add TMP3, TOP, TMP3, asl #2
    mov TMP4, X, asr #16
    add X, X, UX
    add TMP4, TOP, TMP4, asl #2
    vmlal.u8 q9, d23, d29
    vld1.32 {d22}, [TMP3], STRIDE
    vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32 {d23}, [TMP3]
    vmull.u8 q10, d22, d28
    vmlal.u8 q10, d23, d29
    vmovn.u16 d8, q0
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmovn.u16 d9, q2
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30
    pld [TMP4, PF_OFFS]
    vld1.32 {d16}, [TMP4], STRIDE
    vadd.u16 q12, q12, q13
    vld1.32 {d17}, [TMP4]
    pld [TMP4, PF_OFFS]
    vmull.u8 q11, d16, d28
    vmlal.u8 q11, d17, d29
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vst1.32 {d10, d11}, [OUT, :128]!
    vmlsl.u16 q1, d18, d31
.endm

/*****************************************************************************/

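/*
 * Instantiate the actual scanline scaling functions from the
 * 'generate_bilinear_scanline_func' template defined earlier in this file.
 * Reading the arguments against that template: after the function name and
 * the source/destination pixel formats, the two small numbers give the
 * source and destination bpp shifts (log2 of bytes per pixel: 2 for 32bpp,
 * 1 for 16bpp), 28 is the prefetch distance, and the flags select 4x or 8x
 * loop unrolling. BILINEAR_FLAG_USE_ALL_NEON_REGS additionally makes the
 * template save and restore the callee-saved d8-d15 registers, which the
 * 8888_0565 code above uses as scratch.
 */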
generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
    2, 2, 28, BILINEAR_FLAG_UNROLL_4

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
    2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
    1, 2, 28, BILINEAR_FLAG_UNROLL_4

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
    1, 1, 28, BILINEAR_FLAG_UNROLL_4