/*
 * Copyright © 2011 SCore Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
 * Author: Taekyun Kim (tkq.kim@samsung.com)
 */

/*
 * This file contains scaled bilinear scanline functions implemented
 * using the older bilinear macro template by Siarhei Siamashka.
 *
 * << General scanline function procedure >>
 *  1. bilinear interpolate source pixels
 *  2. load mask pixels
 *  3. load destination pixels
 *  4. duplicate mask to fill whole register
 *  5. interleave source & destination pixels
 *  6. apply mask to source pixels
 *  7. combine source & destination pixels
 *  8. deinterleave final result
 *  9. store destination pixels
 *
 * All registers with a single number (i.e. src0, tmp0) are 64-bit registers.
 * Registers with double numbers (src01, dst01) are 128-bit registers.
 * All temp registers can be used freely outside the code blocks.
 * Assume that the symbols (register .req) OUT and MASK are defined by the
 * callers of these macro blocks.
 *
 * Remarks
 *  There can be lots of pipeline stalls inside a code block and between
 *  code blocks. Further optimizations will be done by new macro templates
 *  using the head/tail_head/tail scheme.
 */

/* Prevent the stack from becoming executable for no reason... */
#if defined(__linux__) && defined (__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

.text
.fpu neon
.arch armv7a
.object_arch armv4
.eabi_attribute 10, 0
.eabi_attribute 12, 0
.arm
.altmacro
.p2align 2

#include "pixman-private.h"
#include "pixman-arm-asm.h"
#include "pixman-arm-neon-asm.h"

/*
 * Bilinear macros from pixman-arm-neon-asm.S
 */

/*
 * Bilinear scaling support code which tries to provide pixel fetching, color
 * format conversion, and interpolation as separate macros which can be used
 * as the basic building blocks for constructing bilinear scanline functions.
 */
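/*
 * Register conventions assumed by the macros below (set up by the
 * generate_bilinear_scanline_func template at the end of this file):
 *  X       - x coordinate in the source image, 16.16 fixed point
 *  UX      - per-pixel increment of X, 16.16 fixed point
 *  TOP     - pointer to the top one of the two source scanlines
 *  STRIDE  - byte offset from the top scanline to the bottom one
 *  d28/d29 - vertical weights WT/WB duplicated across all lanes
 *  d30/d31 - horizontal weights for the current pair of pixels
 *  q12/q13 - fractional x positions and their per-iteration increment,
 *            from which d30/d31 are derived with vshr.u16
 */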

.macro bilinear_load_8888 reg1, reg2, tmp
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    vld1.32   {reg1}, [TMP1], STRIDE
    vld1.32   {reg2}, [TMP1]
.endm

.macro bilinear_load_0565 reg1, reg2, tmp
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #1
    vld1.32   {reg2[0]}, [TMP1], STRIDE
    vld1.32   {reg2[1]}, [TMP1]
    convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
.endm

.macro bilinear_load_and_vertical_interpolate_two_8888 \
                acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2

    bilinear_load_8888 reg1, reg2, tmp1
    vmull.u8  acc1, reg1, d28
    vmlal.u8  acc1, reg2, d29
    bilinear_load_8888 reg3, reg4, tmp2
    vmull.u8  acc2, reg3, d28
    vmlal.u8  acc2, reg4, d29
.endm

.macro bilinear_load_and_vertical_interpolate_four_8888 \
                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi

    bilinear_load_and_vertical_interpolate_two_8888 \
                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
    bilinear_load_and_vertical_interpolate_two_8888 \
                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
.endm

.macro bilinear_load_and_vertical_interpolate_two_0565 \
                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi

    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #1
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #1
    vld1.32   {acc2lo[0]}, [TMP1], STRIDE
    vld1.32   {acc2hi[0]}, [TMP2], STRIDE
    vld1.32   {acc2lo[1]}, [TMP1]
    vld1.32   {acc2hi[1]}, [TMP2]
    convert_0565_to_x888 acc2, reg3, reg2, reg1
    vzip.u8   reg1, reg3
    vzip.u8   reg2, reg4
    vzip.u8   reg3, reg4
    vzip.u8   reg1, reg2
    vmull.u8  acc1, reg1, d28
    vmlal.u8  acc1, reg2, d29
    vmull.u8  acc2, reg3, d28
    vmlal.u8  acc2, reg4, d29
.endm

.macro bilinear_load_and_vertical_interpolate_four_0565 \
                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi

    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #1
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #1
    vld1.32   {xacc2lo[0]}, [TMP1], STRIDE
    vld1.32   {xacc2hi[0]}, [TMP2], STRIDE
    vld1.32   {xacc2lo[1]}, [TMP1]
    vld1.32   {xacc2hi[1]}, [TMP2]
    convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #1
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #1
    vld1.32   {yacc2lo[0]}, [TMP1], STRIDE
    vzip.u8   xreg1, xreg3
    vld1.32   {yacc2hi[0]}, [TMP2], STRIDE
    vzip.u8   xreg2, xreg4
    vld1.32   {yacc2lo[1]}, [TMP1]
    vzip.u8   xreg3, xreg4
    vld1.32   {yacc2hi[1]}, [TMP2]
    vzip.u8   xreg1, xreg2
    convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
    vmull.u8  xacc1, xreg1, d28
    vzip.u8   yreg1, yreg3
    vmlal.u8  xacc1, xreg2, d29
    vzip.u8   yreg2, yreg4
    vmull.u8  xacc2, xreg3, d28
    vzip.u8   yreg3, yreg4
    vmlal.u8  xacc2, xreg4, d29
    vzip.u8   yreg1, yreg2
    vmull.u8  yacc1, yreg1, d28
    vmlal.u8  yacc1, yreg2, d29
    vmull.u8  yacc2, yreg3, d28
    vmlal.u8  yacc2, yreg4, d29
.endm
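/*
 * Note: the load_and_vertical_interpolate macros above compute, for each
 * 8-bit channel, a widened 16-bit vertical accumulator
 *   acc = top * WT + bottom * WB
 * via vmull.u8/vmlal.u8 with the vertical weights in d28/d29. The
 * horizontal pass is applied to these accumulators afterwards.
 */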

.macro bilinear_store_8888 numpix, tmp1, tmp2
.if numpix == 4
    vst1.32   {d0, d1}, [OUT]!
.elseif numpix == 2
    vst1.32   {d0}, [OUT]!
.elseif numpix == 1
    vst1.32   {d0[0]}, [OUT, :32]!
.else
    .error "bilinear_store_8888 numpix is unsupported"
.endif
.endm

.macro bilinear_store_0565 numpix, tmp1, tmp2
    vuzp.u8   d0, d1
    vuzp.u8   d2, d3
    vuzp.u8   d1, d3
    vuzp.u8   d0, d2
    convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
.if numpix == 4
    vst1.16   {d2}, [OUT]!
.elseif numpix == 2
    vst1.32   {d2[0]}, [OUT]!
.elseif numpix == 1
    vst1.16   {d2[0]}, [OUT]!
.else
    .error "bilinear_store_0565 numpix is unsupported"
.endif
.endm
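/*
 * Note: bilinear_store_0565 first shuffles the pixels back into planar
 * channel order with vuzp, since convert_8888_to_0565 consumes the r, g
 * and b components in separate registers.
 */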

/*
 * Macros for loading mask pixels into register 'mask'.
 * The vdup must be done somewhere else.
 */
.macro bilinear_load_mask_x numpix, mask
.endm

.macro bilinear_load_mask_8 numpix, mask
.if numpix == 4
    vld1.32   {mask[0]}, [MASK]!
.elseif numpix == 2
    vld1.16   {mask[0]}, [MASK]!
.elseif numpix == 1
    vld1.8    {mask[0]}, [MASK]!
.else
    .error "bilinear_load_mask_8 numpix is unsupported"
.endif
    pld       [MASK, #prefetch_offset]
.endm

.macro bilinear_load_mask mask_fmt, numpix, mask
    bilinear_load_mask_&mask_fmt numpix, mask
.endm


/*
 * Macros for loading destination pixels into registers 'dst0' and 'dst1'.
 * Interleave should be done somewhere else.
 */
.macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01
.endm

.macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01
.endm

.macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01
.if numpix == 4
    vld1.32   {dst0, dst1}, [OUT]
.elseif numpix == 2
    vld1.32   {dst0}, [OUT]
.elseif numpix == 1
    vld1.32   {dst0[0]}, [OUT]
.else
    .error "bilinear_load_dst_8888 numpix is unsupported"
.endif
    pld       [OUT, #(prefetch_offset * 4)]
.endm

.macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
    bilinear_load_dst_8888 numpix, dst0, dst1, dst01
.endm

.macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01
    bilinear_load_dst_8888 numpix, dst0, dst1, dst01
.endm

.macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01
    bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01
.endm

/*
 * Macros for duplicating a partially loaded mask to fill the entire register.
 * We will apply the mask to interleaved source pixels, that is
 *  (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3)
 *  (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3)
 * So we need to duplicate the loaded mask across the whole register.
 *
 * For the two-pixel case
 *  (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
 *  (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
 * We can do some optimizations for this, including the last-pixel case.
 */
.macro bilinear_duplicate_mask_x numpix, mask
.endm

.macro bilinear_duplicate_mask_8 numpix, mask
.if numpix == 4
    vdup.32   mask, mask[0]
.elseif numpix == 2
    vdup.16   mask, mask[0]
.elseif numpix == 1
    vdup.8    mask, mask[0]
.else
    .error "bilinear_duplicate_mask_8 numpix is unsupported"
.endif
.endm

.macro bilinear_duplicate_mask mask_fmt, numpix, mask
    bilinear_duplicate_mask_&mask_fmt numpix, mask
.endm

/*
 * Macros for interleaving src and dst pixels to rrrr gggg bbbb aaaa form.
 * Interleave should be done when a mask is enabled or the operator is 'over'.
 */
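/*
 * Two passes of vuzp.8 over a register pair holding four packed pixels
 *  (r0 g0 b0 a0 r1 g1 b1 a1 | r2 g2 b2 a2 r3 g3 b3 a3)
 * produce the planar form
 *  (r0 r1 r2 r3 g0 g1 g2 g3 | b0 b1 b2 b3 a0 a1 a2 a3)
 * which is the layout the mask and combine steps below operate on.
 */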
.macro bilinear_interleave src0, src1, dst0, dst1
    vuzp.8    src0, src1
    vuzp.8    dst0, dst1
    vuzp.8    src0, src1
    vuzp.8    dst0, dst1
.endm

.macro bilinear_interleave_src_dst_x_src \
                numpix, src0, src1, src01, dst0, dst1, dst01
.endm

.macro bilinear_interleave_src_dst_x_over \
                numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave src0, src1, dst0, dst1
.endm

.macro bilinear_interleave_src_dst_x_add \
                numpix, src0, src1, src01, dst0, dst1, dst01
.endm

.macro bilinear_interleave_src_dst_8_src \
                numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave src0, src1, dst0, dst1
.endm

.macro bilinear_interleave_src_dst_8_over \
                numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave src0, src1, dst0, dst1
.endm

.macro bilinear_interleave_src_dst_8_add \
                numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave src0, src1, dst0, dst1
.endm

.macro bilinear_interleave_src_dst \
                mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave_src_dst_&mask_fmt&_&op \
                numpix, src0, src1, src01, dst0, dst1, dst01
.endm


/*
 * Macros for applying masks to src pixels (see the combine_mask_u() function).
 * src and dst should be in interleaved form.
 * The mask register should be in the form (m0, m1, m2, m3).
 */
.macro bilinear_apply_mask_to_src_x \
                numpix, src0, src1, src01, mask, \
                tmp01, tmp23, tmp45, tmp67
.endm

.macro bilinear_apply_mask_to_src_8 \
                numpix, src0, src1, src01, mask, \
                tmp01, tmp23, tmp45, tmp67

    vmull.u8  tmp01, src0, mask
    vmull.u8  tmp23, src1, mask
    /* bubbles */
    vrshr.u16 tmp45, tmp01, #8
    vrshr.u16 tmp67, tmp23, #8
    /* bubbles */
    vraddhn.u16 src0, tmp45, tmp01
    vraddhn.u16 src1, tmp67, tmp23
.endm

.macro bilinear_apply_mask_to_src \
                mask_fmt, numpix, src0, src1, src01, mask, \
                tmp01, tmp23, tmp45, tmp67

    bilinear_apply_mask_to_src_&mask_fmt \
                numpix, src0, src1, src01, mask, \
                tmp01, tmp23, tmp45, tmp67
.endm
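/*
 * The vmull/vrshr/vraddhn sequence above is the usual NEON rounding
 * division by 255 for a 16-bit product t = src * mask:
 *  res = (t + ((t + 128) >> 8) + 128) >> 8
 * vrshr.u16 #8 is the inner rounding shift, vraddhn.u16 the final
 * add-and-narrow with rounding.
 */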

/*
 * Macros for combining src and destination pixels.
 * Whether to interleave depends on the operator 'op'.
 */
.macro bilinear_combine_src \
                numpix, src0, src1, src01, dst0, dst1, dst01, \
                tmp01, tmp23, tmp45, tmp67, tmp8
.endm

.macro bilinear_combine_over \
                numpix, src0, src1, src01, dst0, dst1, dst01, \
                tmp01, tmp23, tmp45, tmp67, tmp8

    vdup.32   tmp8, src1[1]
    /* bubbles */
    vmvn.8    tmp8, tmp8
    /* bubbles */
    vmull.u8  tmp01, dst0, tmp8
    /* bubbles */
    vmull.u8  tmp23, dst1, tmp8
    /* bubbles */
    vrshr.u16 tmp45, tmp01, #8
    vrshr.u16 tmp67, tmp23, #8
    /* bubbles */
    vraddhn.u16 dst0, tmp45, tmp01
    vraddhn.u16 dst1, tmp67, tmp23
    /* bubbles */
    vqadd.u8  src01, dst01, src01
.endm

.macro bilinear_combine_add \
                numpix, src0, src1, src01, dst0, dst1, dst01, \
                tmp01, tmp23, tmp45, tmp67, tmp8

    vqadd.u8  src01, dst01, src01
.endm

.macro bilinear_combine \
                op, numpix, src0, src1, src01, dst0, dst1, dst01, \
                tmp01, tmp23, tmp45, tmp67, tmp8

    bilinear_combine_&op \
                numpix, src0, src1, src01, dst0, dst1, dst01, \
                tmp01, tmp23, tmp45, tmp67, tmp8
.endm
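/*
 * bilinear_combine_over implements the OVER operator
 *  dst = src + dst * (255 - src.alpha) / 255
 * The source alpha lane is broadcast with vdup.32 and inverted with
 * vmvn.8, the division by 255 uses the same vrshr/vraddhn rounding idiom
 * as the mask step, and vqadd.u8 does the final saturating add.
 */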

/*
 * Macros for final deinterleaving of destination pixels if needed.
 */
.macro bilinear_deinterleave numpix, dst0, dst1, dst01
    vuzp.8    dst0, dst1
    /* bubbles */
    vuzp.8    dst0, dst1
.endm

.macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01
.endm

.macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01
    bilinear_deinterleave numpix, dst0, dst1, dst01
.endm

.macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01
.endm

.macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01
    bilinear_deinterleave numpix, dst0, dst1, dst01
.endm

.macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01
    bilinear_deinterleave numpix, dst0, dst1, dst01
.endm

.macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01
    bilinear_deinterleave numpix, dst0, dst1, dst01
.endm

.macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01
    bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01
.endm

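/*
 * In the interpolation macros below, the horizontal pass over a 16-bit
 * vertical accumulator pair (left half in d2, right half in d3, etc.)
 * follows the pattern
 *  vshll.u16 q0, d2, #BITS   @ q0  = left << BITS
 *  vmlsl.u16 q0, d2, d30     @ q0 -= left * wx
 *  vmlal.u16 q0, d3, d30     @ q0 += right * wx
 * i.e. q0 = left * ((1 << BITS) - wx) + right * wx, which is then
 * narrowed with vshrn.u32 #(2 * BITS) and vmovn.u16 into 8-bit channels
 * (BITS = BILINEAR_INTERPOLATION_BITS, wx = horizontal weight in d30/d31).
 */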
.macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op
    bilinear_load_&src_fmt d0, d1, d2
    bilinear_load_mask mask_fmt, 1, d4
    bilinear_load_dst dst_fmt, op, 1, d18, d19, q9
    vmull.u8  q1, d0, d28
    vmlal.u8  q1, d1, d29
    /* 5 cycles bubble */
    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
    /* 5 cycles bubble */
    bilinear_duplicate_mask mask_fmt, 1, d4
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    /* 3 cycles bubble */
    vmovn.u16 d0, q0
    /* 1 cycle bubble */
    bilinear_interleave_src_dst \
                mask_fmt, op, 1, d0, d1, q0, d18, d19, q9
    bilinear_apply_mask_to_src \
                mask_fmt, 1, d0, d1, q0, d4, \
                q3, q8, q10, q11
    bilinear_combine \
                op, 1, d0, d1, q0, d18, d19, q9, \
                q3, q8, q10, q11, d5
    bilinear_deinterleave_dst mask_fmt, op, 1, d0, d1, q0
    bilinear_store_&dst_fmt 1, q2, q3
.endm

.macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op
    bilinear_load_and_vertical_interpolate_two_&src_fmt \
                q1, q11, d0, d1, d20, d21, d22, d23
    bilinear_load_mask mask_fmt, 2, d4
    bilinear_load_dst dst_fmt, op, 2, d18, d19, q9
    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q10, d22, d31
    vmlal.u16 q10, d23, d31
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
    bilinear_duplicate_mask mask_fmt, 2, d4
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vmovn.u16 d0, q0
    bilinear_interleave_src_dst \
                mask_fmt, op, 2, d0, d1, q0, d18, d19, q9
    bilinear_apply_mask_to_src \
                mask_fmt, 2, d0, d1, q0, d4, \
                q3, q8, q10, q11
    bilinear_combine \
                op, 2, d0, d1, q0, d18, d19, q9, \
                q3, q8, q10, q11, d5
    bilinear_deinterleave_dst mask_fmt, op, 2, d0, d1, q0
    bilinear_store_&dst_fmt 2, q2, q3
.endm

.macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op
    bilinear_load_and_vertical_interpolate_four_&src_fmt \
                q1, q11, d0, d1, d20, d21, d22, d23 \
                q3, q9, d4, d5, d16, d17, d18, d19
    pld       [TMP1, PF_OFFS]
    sub       TMP1, TMP1, STRIDE
    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q10, d22, d31
    vmlal.u16 q10, d23, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d6, d30
    vmlal.u16 q2, d7, d30
    vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
    bilinear_load_mask mask_fmt, 4, d22
    bilinear_load_dst dst_fmt, op, 4, d2, d3, q1
    pld       [TMP1, PF_OFFS]
    vmlsl.u16 q8, d18, d31
    vmlal.u16 q8, d19, d31
    vadd.u16  q12, q12, q13
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
    bilinear_duplicate_mask mask_fmt, 4, d22
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vmovn.u16 d0, q0
    vmovn.u16 d1, q2
    vadd.u16  q12, q12, q13
    bilinear_interleave_src_dst \
                mask_fmt, op, 4, d0, d1, q0, d2, d3, q1
    bilinear_apply_mask_to_src \
                mask_fmt, 4, d0, d1, q0, d22, \
                q3, q8, q9, q10
    bilinear_combine \
                op, 4, d0, d1, q0, d2, d3, q1, \
                q3, q8, q9, q10, d23
    bilinear_deinterleave_dst mask_fmt, op, 4, d0, d1, q0
    bilinear_store_&dst_fmt 4, q2, q3
.endm

.set BILINEAR_FLAG_USE_MASK,          1
.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2

/*
 * Main template macro for generating NEON optimized bilinear scanline functions.
 *
 * The bilinear scanline generator macro takes the following arguments:
 *  fname             - name of the function to generate
 *  src_fmt           - source color format (8888 or 0565)
 *  dst_fmt           - destination color format (8888 or 0565)
 *  src/dst_bpp_shift - (1 << bpp_shift) is the size of a src/dst pixel in bytes
 *  process_last_pixel - code block that interpolates one pixel and does not
 *                       update the horizontal weight
 *  process_two_pixels - code block that interpolates two pixels and updates
 *                       the horizontal weight
 *  process_four_pixels - code block that interpolates four pixels and updates
 *                        the horizontal weight
 *  process_pixblock_head - head part of the middle loop
 *  process_pixblock_tail - tail part of the middle loop
 *  process_pixblock_tail_head - tail_head of the middle loop
 *  pixblock_size     - number of pixels processed in a single middle loop
 *  prefetch_distance - prefetch in the source image by that many pixels ahead
 */
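/*
 * The head/tail/tail_head split implements simple software pipelining:
 * 'head' starts processing the first pixel block, 'tail_head' finishes
 * the previous block while starting the next one inside the main loop,
 * and 'tail' finishes the last block. The non-pipelined variants below
 * simply define tail_head as tail followed by head.
 */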
.macro generate_bilinear_scanline_func \
        fname, \
        src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, \
        bilinear_process_last_pixel, \
        bilinear_process_two_pixels, \
        bilinear_process_four_pixels, \
        bilinear_process_pixblock_head, \
        bilinear_process_pixblock_tail, \
        bilinear_process_pixblock_tail_head, \
        pixblock_size, \
        prefetch_distance, \
        flags

pixman_asm_function fname
.if pixblock_size == 8
.elseif pixblock_size == 4
.else
    .error "unsupported pixblock size"
.endif

.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
    OUT     .req    r0
    TOP     .req    r1
    BOTTOM  .req    r2
    WT      .req    r3
    WB      .req    r4
    X       .req    r5
    UX      .req    r6
    WIDTH   .req    ip
    TMP1    .req    r3
    TMP2    .req    r4
    PF_OFFS .req    r7
    TMP3    .req    r8
    TMP4    .req    r9
    STRIDE  .req    r2

    mov       ip, sp
    push      {r4, r5, r6, r7, r8, r9}
    mov       PF_OFFS, #prefetch_distance
    ldmia     ip, {WB, X, UX, WIDTH}
.else
    OUT     .req    r0
    MASK    .req    r1
    TOP     .req    r2
    BOTTOM  .req    r3
    WT      .req    r4
    WB      .req    r5
    X       .req    r6
    UX      .req    r7
    WIDTH   .req    ip
    TMP1    .req    r4
    TMP2    .req    r5
    PF_OFFS .req    r8
    TMP3    .req    r9
    TMP4    .req    r10
    STRIDE  .req    r3

    .set prefetch_offset, prefetch_distance

    mov       ip, sp
    push      {r4, r5, r6, r7, r8, r9, r10, ip}
    mov       PF_OFFS, #prefetch_distance
    ldmia     ip, {WT, WB, X, UX, WIDTH}
.endif

    mul       PF_OFFS, PF_OFFS, UX

.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
    vpush     {d8-d15}
.endif

    sub       STRIDE, BOTTOM, TOP
    .unreq    BOTTOM

    cmp       WIDTH, #0
    ble       3f

    vdup.u16  q12, X
    vdup.u16  q13, UX
    vdup.u8   d28, WT
    vdup.u8   d29, WB
    vadd.u16  d25, d25, d26

    /* ensure good destination alignment */
    cmp       WIDTH, #1
    blt       0f
    tst       OUT, #(1 << dst_bpp_shift)
    beq       0f
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    bilinear_process_last_pixel
    sub       WIDTH, WIDTH, #1
0:
    vadd.u16  q13, q13, q13
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13

    cmp       WIDTH, #2
    blt       0f
    tst       OUT, #(1 << (dst_bpp_shift + 1))
    beq       0f
    bilinear_process_two_pixels
    sub       WIDTH, WIDTH, #2
0:
.if pixblock_size == 8
    cmp       WIDTH, #4
    blt       0f
    tst       OUT, #(1 << (dst_bpp_shift + 2))
    beq       0f
    bilinear_process_four_pixels
    sub       WIDTH, WIDTH, #4
0:
.endif
    subs      WIDTH, WIDTH, #pixblock_size
    blt       1f
    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
    bilinear_process_pixblock_head
    subs      WIDTH, WIDTH, #pixblock_size
    blt       5f
0:
    bilinear_process_pixblock_tail_head
    subs      WIDTH, WIDTH, #pixblock_size
    bge       0b
5:
    bilinear_process_pixblock_tail
1:
.if pixblock_size == 8
    tst       WIDTH, #4
    beq       2f
    bilinear_process_four_pixels
2:
.endif
    /* handle the remaining trailing pixels */
    tst       WIDTH, #2
    beq       2f
    bilinear_process_two_pixels
2:
    tst       WIDTH, #1
    beq       3f
    bilinear_process_last_pixel
3:
.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
    vpop      {d8-d15}
.endif

.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
    pop       {r4, r5, r6, r7, r8, r9}
.else
    pop       {r4, r5, r6, r7, r8, r9, r10, ip}
.endif
    bx        lr

    .unreq    OUT
    .unreq    TOP
    .unreq    WT
    .unreq    WB
    .unreq    X
    .unreq    UX
    .unreq    WIDTH
    .unreq    TMP1
    .unreq    TMP2
    .unreq    PF_OFFS
    .unreq    TMP3
    .unreq    TMP4
    .unreq    STRIDE
.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0
    .unreq    MASK
.endif

.endfunc

.endm
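/*
 * Flow of the generated functions: first process single/dual pixel
 * chunks until OUT is aligned to a whole pixel block, then run the
 * (optionally software pipelined) middle loop over full pixel blocks,
 * and finally mop up the remaining trailing pixels.
 */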
/* src_8888_8_8888 */
.macro bilinear_src_8888_8_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, 8, 8888, src
.endm

.macro bilinear_src_8888_8_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, 8, 8888, src
.endm

.macro bilinear_src_8888_8_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, 8, 8888, src
.endm

.macro bilinear_src_8888_8_8888_process_pixblock_head
    bilinear_src_8888_8_8888_process_four_pixels
.endm

.macro bilinear_src_8888_8_8888_process_pixblock_tail
.endm

.macro bilinear_src_8888_8_8888_process_pixblock_tail_head
    bilinear_src_8888_8_8888_process_pixblock_tail
    bilinear_src_8888_8_8888_process_pixblock_head
.endm

/* src_8888_8_0565 */
.macro bilinear_src_8888_8_0565_process_last_pixel
    bilinear_interpolate_last_pixel 8888, 8, 0565, src
.endm

.macro bilinear_src_8888_8_0565_process_two_pixels
    bilinear_interpolate_two_pixels 8888, 8, 0565, src
.endm

.macro bilinear_src_8888_8_0565_process_four_pixels
    bilinear_interpolate_four_pixels 8888, 8, 0565, src
.endm

.macro bilinear_src_8888_8_0565_process_pixblock_head
    bilinear_src_8888_8_0565_process_four_pixels
.endm

.macro bilinear_src_8888_8_0565_process_pixblock_tail
.endm

.macro bilinear_src_8888_8_0565_process_pixblock_tail_head
    bilinear_src_8888_8_0565_process_pixblock_tail
    bilinear_src_8888_8_0565_process_pixblock_head
.endm

/* src_0565_8_x888 */
.macro bilinear_src_0565_8_x888_process_last_pixel
    bilinear_interpolate_last_pixel 0565, 8, 8888, src
.endm

.macro bilinear_src_0565_8_x888_process_two_pixels
    bilinear_interpolate_two_pixels 0565, 8, 8888, src
.endm

.macro bilinear_src_0565_8_x888_process_four_pixels
    bilinear_interpolate_four_pixels 0565, 8, 8888, src
.endm

.macro bilinear_src_0565_8_x888_process_pixblock_head
    bilinear_src_0565_8_x888_process_four_pixels
.endm

.macro bilinear_src_0565_8_x888_process_pixblock_tail
.endm

.macro bilinear_src_0565_8_x888_process_pixblock_tail_head
    bilinear_src_0565_8_x888_process_pixblock_tail
    bilinear_src_0565_8_x888_process_pixblock_head
.endm

/* src_0565_8_0565 */
.macro bilinear_src_0565_8_0565_process_last_pixel
    bilinear_interpolate_last_pixel 0565, 8, 0565, src
.endm

.macro bilinear_src_0565_8_0565_process_two_pixels
    bilinear_interpolate_two_pixels 0565, 8, 0565, src
.endm

.macro bilinear_src_0565_8_0565_process_four_pixels
    bilinear_interpolate_four_pixels 0565, 8, 0565, src
.endm

.macro bilinear_src_0565_8_0565_process_pixblock_head
    bilinear_src_0565_8_0565_process_four_pixels
.endm

.macro bilinear_src_0565_8_0565_process_pixblock_tail
.endm

.macro bilinear_src_0565_8_0565_process_pixblock_tail_head
    bilinear_src_0565_8_0565_process_pixblock_tail
    bilinear_src_0565_8_0565_process_pixblock_head
.endm

/* over_8888_8888 */
.macro bilinear_over_8888_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, x, 8888, over
.endm

.macro bilinear_over_8888_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, x, 8888, over
.endm

.macro bilinear_over_8888_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, x, 8888, over
.endm

.macro bilinear_over_8888_8888_process_pixblock_head
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2

    vld1.32   {d22}, [TMP1], STRIDE
    vld1.32   {d23}, [TMP1]
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    vmull.u8  q8, d22, d28
    vmlal.u8  q8, d23, d29

    vld1.32   {d22}, [TMP2], STRIDE
    vld1.32   {d23}, [TMP2]
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmull.u8  q9, d22, d28
    vmlal.u8  q9, d23, d29

    vld1.32   {d22}, [TMP3], STRIDE
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29

    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30

    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29

    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
.endm

.macro bilinear_over_8888_8888_process_pixblock_tail
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d2, d3}, [OUT, :128]
    pld       [OUT, #(prefetch_offset * 4)]
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vmovn.u16 d6, q0
    vmovn.u16 d7, q2
    vuzp.8    d6, d7
    vuzp.8    d2, d3
    vuzp.8    d6, d7
    vuzp.8    d2, d3
    vdup.32   d4, d7[1]
    vmvn.8    d4, d4
    vmull.u8  q11, d2, d4
    vmull.u8  q2, d3, d4
    vrshr.u16 q1, q11, #8
    vrshr.u16 q10, q2, #8
    vraddhn.u16 d2, q1, q11
    vraddhn.u16 d3, q10, q2
    vqadd.u8  q3, q1, q3
    vuzp.8    d6, d7
    vuzp.8    d6, d7
    vadd.u16  q12, q12, q13
    vst1.32   {d6, d7}, [OUT, :128]!
.endm

.macro bilinear_over_8888_8888_process_pixblock_tail_head
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    vmlsl.u16 q2, d20, d30
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vld1.32   {d20}, [TMP1], STRIDE
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vld1.32   {d21}, [TMP1]
    vmull.u8  q8, d20, d28
    vmlal.u8  q8, d21, d29
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d2, d3}, [OUT, :128]
    pld       [OUT, PF_OFFS]
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22}, [TMP2], STRIDE
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vmovn.u16 d6, q0
    vld1.32   {d23}, [TMP2]
    vmull.u8  q9, d22, d28
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vmlal.u8  q9, d23, d29
    vmovn.u16 d7, q2
    vld1.32   {d22}, [TMP3], STRIDE
    vuzp.8    d6, d7
    vuzp.8    d2, d3
    vuzp.8    d6, d7
    vuzp.8    d2, d3
    vdup.32   d4, d7[1]
    vld1.32   {d23}, [TMP3]
    vmvn.8    d4, d4
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
    vmull.u8  q11, d2, d4
    vmull.u8  q2, d3, d4
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d16, d30
    vrshr.u16 q1, q11, #8
    vmlal.u16 q0, d17, d30
    vrshr.u16 q8, q2, #8
    vraddhn.u16 d2, q1, q11
    vraddhn.u16 d3, q8, q2
    pld       [TMP4, PF_OFFS]
    vld1.32   {d16}, [TMP4], STRIDE
    vqadd.u8  q3, q1, q3
    vld1.32   {d17}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
    vuzp.8    d6, d7
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vuzp.8    d6, d7
    vmlsl.u16 q1, d18, d31
    vadd.u16  q12, q12, q13
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vst1.32   {d6, d7}, [OUT, :128]!
.endm
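/*
 * The hand-scheduled pixblock macros above interleave the loads and
 * address arithmetic for the next pixel block with the NEON arithmetic
 * finishing the previous one, which hides most of the load and multiply
 * latencies ("bubbles") that the generic per-pixel macros suffer from.
 */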
/* over_8888_8_8888 */
.macro bilinear_over_8888_8_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, 8, 8888, over
.endm

.macro bilinear_over_8888_8_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, 8, 8888, over
.endm

.macro bilinear_over_8888_8_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, 8, 8888, over
.endm

.macro bilinear_over_8888_8_8888_process_pixblock_head
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    vld1.32   {d0}, [TMP1], STRIDE
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
    vld1.32   {d1}, [TMP1]
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    vld1.32   {d2}, [TMP2], STRIDE
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vld1.32   {d3}, [TMP2]
    vmull.u8  q2, d0, d28
    vmull.u8  q3, d2, d28
    vmlal.u8  q2, d1, d29
    vmlal.u8  q3, d3, d29
    vshll.u16 q0, d4, #BILINEAR_INTERPOLATION_BITS
    vshll.u16 q1, d6, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d4, d30
    vmlsl.u16 q1, d6, d31
    vmlal.u16 q0, d5, d30
    vmlal.u16 q1, d7, d31
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d2}, [TMP3], STRIDE
    vld1.32   {d3}, [TMP3]
    pld       [TMP4, PF_OFFS]
    vld1.32   {d4}, [TMP4], STRIDE
    vld1.32   {d5}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q3, d2, d28
    vmlal.u8  q3, d3, d29
    vmull.u8  q1, d4, d28
    vmlal.u8  q1, d5, d29
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22[0]}, [MASK]!
    pld       [MASK, #prefetch_offset]
    vadd.u16  q12, q12, q13
    vmovn.u16 d16, q0
.endm

.macro bilinear_over_8888_8_8888_process_pixblock_tail
    vshll.u16 q9, d6, #BILINEAR_INTERPOLATION_BITS
    vshll.u16 q10, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q9, d6, d30
    vmlsl.u16 q10, d2, d31
    vmlal.u16 q9, d7, d30
    vmlal.u16 q10, d3, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vdup.32   d22, d22[0]
    vshrn.u32 d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
    vmovn.u16 d17, q9
    vld1.32   {d18, d19}, [OUT, :128]
    pld       [OUT, PF_OFFS]
    vuzp.8    d16, d17
    vuzp.8    d18, d19
    vuzp.8    d16, d17
    vuzp.8    d18, d19
    vmull.u8  q10, d16, d22
    vmull.u8  q11, d17, d22
    vrsra.u16 q10, q10, #8
    vrsra.u16 q11, q11, #8
    vrshrn.u16 d16, q10, #8
    vrshrn.u16 d17, q11, #8
    vdup.32   d22, d17[1]
    vmvn.8    d22, d22
    vmull.u8  q10, d18, d22
    vmull.u8  q11, d19, d22
    vrshr.u16 q9, q10, #8
    vrshr.u16 q0, q11, #8
    vraddhn.u16 d18, q9, q10
    vraddhn.u16 d19, q0, q11
    vqadd.u8  q9, q8, q9
    vuzp.8    d18, d19
    vuzp.8    d18, d19
    vst1.32   {d18, d19}, [OUT, :128]!
.endm

.macro bilinear_over_8888_8_8888_process_pixblock_tail_head
    vshll.u16 q9, d6, #BILINEAR_INTERPOLATION_BITS
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #2
    vshll.u16 q10, d2, #BILINEAR_INTERPOLATION_BITS
    vld1.32   {d0}, [TMP1], STRIDE
    mov       TMP2, X, asr #16
    add       X, X, UX
    add       TMP2, TOP, TMP2, asl #2
    vmlsl.u16 q9, d6, d30
    vmlsl.u16 q10, d2, d31
    vld1.32   {d1}, [TMP1]
    mov       TMP3, X, asr #16
    add       X, X, UX
    add       TMP3, TOP, TMP3, asl #2
    vmlal.u16 q9, d7, d30
    vmlal.u16 q10, d3, d31
    vld1.32   {d2}, [TMP2], STRIDE
    mov       TMP4, X, asr #16
    add       X, X, UX
    add       TMP4, TOP, TMP4, asl #2
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vld1.32   {d3}, [TMP2]
    vdup.32   d22, d22[0]
    vshrn.u32 d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
    vmull.u8  q2, d0, d28
    vmull.u8  q3, d2, d28
    vmovn.u16 d17, q9
    vld1.32   {d18, d19}, [OUT, :128]
    pld       [OUT, #(prefetch_offset * 4)]
    vmlal.u8  q2, d1, d29
    vmlal.u8  q3, d3, d29
    vuzp.8    d16, d17
    vuzp.8    d18, d19
    vshll.u16 q0, d4, #BILINEAR_INTERPOLATION_BITS
    vshll.u16 q1, d6, #BILINEAR_INTERPOLATION_BITS
    vuzp.8    d16, d17
    vuzp.8    d18, d19
    vmlsl.u16 q0, d4, d30
    vmlsl.u16 q1, d6, d31
    vmull.u8  q10, d16, d22
    vmull.u8  q11, d17, d22
    vmlal.u16 q0, d5, d30
    vmlal.u16 q1, d7, d31
    vrsra.u16 q10, q10, #8
    vrsra.u16 q11, q11, #8
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vrshrn.u16 d16, q10, #8
    vrshrn.u16 d17, q11, #8
    vld1.32   {d2}, [TMP3], STRIDE
    vdup.32   d22, d17[1]
    vld1.32   {d3}, [TMP3]
    vmvn.8    d22, d22
    pld       [TMP4, PF_OFFS]
    vld1.32   {d4}, [TMP4], STRIDE
    vmull.u8  q10, d18, d22
    vmull.u8  q11, d19, d22
    vld1.32   {d5}, [TMP4]
    pld       [TMP4, PF_OFFS]
    vmull.u8  q3, d2, d28
    vrshr.u16 q9, q10, #8
    vrshr.u16 q15, q11, #8
    vmlal.u8  q3, d3, d29
    vmull.u8  q1, d4, d28
    vraddhn.u16 d18, q9, q10
    vraddhn.u16 d19, q15, q11
    vmlal.u8  q1, d5, d29
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vqadd.u8  q9, q8, q9
    vld1.32   {d22[0]}, [MASK]!
    vuzp.8    d18, d19
    vadd.u16  q12, q12, q13
    vuzp.8    d18, d19
    vmovn.u16 d16, q0
    vst1.32   {d18, d19}, [OUT, :128]!
.endm
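/*
 * Note: the mask multiply in the macros above uses vrsra.u16/vrshrn.u16
 * for the division by 255 (t += (t + 128) >> 8, then res = (t + 128) >> 8),
 * which yields the same rounded result as the vrshr/vraddhn pair used
 * elsewhere but accumulates in place.
 */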
/* add_8888_8888 */
.macro bilinear_add_8888_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, x, 8888, add
.endm

.macro bilinear_add_8888_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, x, 8888, add
.endm

.macro bilinear_add_8888_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, x, 8888, add
.endm

.macro bilinear_add_8888_8888_process_pixblock_head
    bilinear_add_8888_8888_process_four_pixels
.endm

.macro bilinear_add_8888_8888_process_pixblock_tail
.endm

.macro bilinear_add_8888_8888_process_pixblock_tail_head
    bilinear_add_8888_8888_process_pixblock_tail
    bilinear_add_8888_8888_process_pixblock_head
.endm

/* add_8888_8_8888 */
.macro bilinear_add_8888_8_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, 8, 8888, add
.endm

.macro bilinear_add_8888_8_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, 8, 8888, add
.endm

.macro bilinear_add_8888_8_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, 8, 8888, add
.endm

.macro bilinear_add_8888_8_8888_process_pixblock_head
    bilinear_add_8888_8_8888_process_four_pixels
.endm

.macro bilinear_add_8888_8_8888_process_pixblock_tail
.endm

.macro bilinear_add_8888_8_8888_process_pixblock_tail_head
    bilinear_add_8888_8_8888_process_pixblock_tail
    bilinear_add_8888_8_8888_process_pixblock_head
.endm
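/*
 * All instantiations below use pixblock_size = 4 and a prefetch distance
 * of 28 pixels (see the template documentation above).
 */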
/* Bilinear scanline functions */
generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_neon, \
    8888, 8888, 2, 2, \
    bilinear_src_8888_8_8888_process_last_pixel, \
    bilinear_src_8888_8_8888_process_two_pixels, \
    bilinear_src_8888_8_8888_process_four_pixels, \
    bilinear_src_8888_8_8888_process_pixblock_head, \
    bilinear_src_8888_8_8888_process_pixblock_tail, \
    bilinear_src_8888_8_8888_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_neon, \
    8888, 0565, 2, 1, \
    bilinear_src_8888_8_0565_process_last_pixel, \
    bilinear_src_8888_8_0565_process_two_pixels, \
    bilinear_src_8888_8_0565_process_four_pixels, \
    bilinear_src_8888_8_0565_process_pixblock_head, \
    bilinear_src_8888_8_0565_process_pixblock_tail, \
    bilinear_src_8888_8_0565_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_neon, \
    0565, 8888, 1, 2, \
    bilinear_src_0565_8_x888_process_last_pixel, \
    bilinear_src_0565_8_x888_process_two_pixels, \
    bilinear_src_0565_8_x888_process_four_pixels, \
    bilinear_src_0565_8_x888_process_pixblock_head, \
    bilinear_src_0565_8_x888_process_pixblock_tail, \
    bilinear_src_0565_8_x888_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_neon, \
    0565, 0565, 1, 1, \
    bilinear_src_0565_8_0565_process_last_pixel, \
    bilinear_src_0565_8_0565_process_two_pixels, \
    bilinear_src_0565_8_0565_process_four_pixels, \
    bilinear_src_0565_8_0565_process_pixblock_head, \
    bilinear_src_0565_8_0565_process_pixblock_tail, \
    bilinear_src_0565_8_0565_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, \
    8888, 8888, 2, 2, \
    bilinear_over_8888_8888_process_last_pixel, \
    bilinear_over_8888_8888_process_two_pixels, \
    bilinear_over_8888_8888_process_four_pixels, \
    bilinear_over_8888_8888_process_pixblock_head, \
    bilinear_over_8888_8888_process_pixblock_tail, \
    bilinear_over_8888_8888_process_pixblock_tail_head, \
    4, 28, 0

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_neon, \
    8888, 8888, 2, 2, \
    bilinear_over_8888_8_8888_process_last_pixel, \
    bilinear_over_8888_8_8888_process_two_pixels, \
    bilinear_over_8888_8_8888_process_four_pixels, \
    bilinear_over_8888_8_8888_process_pixblock_head, \
    bilinear_over_8888_8_8888_process_pixblock_tail, \
    bilinear_over_8888_8_8888_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_neon, \
    8888, 8888, 2, 2, \
    bilinear_add_8888_8888_process_last_pixel, \
    bilinear_add_8888_8888_process_two_pixels, \
    bilinear_add_8888_8888_process_four_pixels, \
    bilinear_add_8888_8888_process_pixblock_head, \
    bilinear_add_8888_8888_process_pixblock_tail, \
    bilinear_add_8888_8888_process_pixblock_tail_head, \
    4, 28, 0

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \
    8888, 8888, 2, 2, \
    bilinear_add_8888_8_8888_process_last_pixel, \
    bilinear_add_8888_8_8888_process_two_pixels, \
    bilinear_add_8888_8_8888_process_four_pixels, \
    bilinear_add_8888_8_8888_process_pixblock_head, \
    bilinear_add_8888_8_8888_process_pixblock_tail, \
    bilinear_add_8888_8_8888_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK