/*
 * Copyright (c) 2012 Raspberry Pi Foundation
 * Copyright (c) 2012 RISC OS Open Ltd
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/*
 * GNU assembler macros for generating 32-bit ARM pixel-compositing
 * functions.  NOTE(review): the bare macro-argument references and the
 * `&' concatenation operator used throughout require GAS altmacro mode
 * (`.altmacro' / --alternate) -- confirm the including .S file enables it.
 *
 * Because the alignment of pixel data to cachelines, and even the number of
 * cachelines per row can vary from row to row, and because of the need to
 * preload each scanline once and only once, this prefetch strategy treats
 * each row of pixels independently. When a pixel row is long enough, there
 * are three distinct phases of prefetch:
 * * an inner loop section, where each time a cacheline of data is
 *   processed, another cacheline is preloaded (the exact distance ahead is
 *   determined empirically using profiling results from lowlevel-blt-bench)
 * * a leading section, where enough cachelines are preloaded to ensure no
 *   cachelines escape being preloaded when the inner loop starts
 * * a trailing section, where a limited number (0 or more) of cachelines
 *   are preloaded to deal with data (if any) that hangs off the end of the
 *   last iteration of the inner loop, plus any trailing bytes that were not
 *   enough to make up one whole iteration of the inner loop
 *
 * There are (in general) three distinct code paths, selected between
 * depending upon how long the pixel row is. If it is long enough that there
 * is at least one iteration of the inner loop (as described above) then
 * this is described as the "wide" case. If it is shorter than that, but
 * there are still enough bytes output that there is at least one 16-byte-
 * long, 16-byte-aligned write to the destination (the optimum type of
 * write), then this is the "medium" case. If it is not even this long, then
 * this is the "narrow" case, and there is no attempt to align writes to
 * 16-byte boundaries. In the "medium" and "narrow" cases, all the
 * cachelines containing data from the pixel row are prefetched up-front.
 */

/*
 * Determine whether we put the arguments on the stack for debugging.
 */
#undef DEBUG_PARAMS

/*
 * Bit flags for 'generate_composite_function' macro which are used
 * to tune generated functions behavior.
 */
.set FLAG_DST_WRITEONLY,         0
.set FLAG_DST_READWRITE,         1
.set FLAG_COND_EXEC,             0
.set FLAG_BRANCH_OVER,           2
.set FLAG_PROCESS_PRESERVES_PSR, 0
.set FLAG_PROCESS_CORRUPTS_PSR,  4
.set FLAG_PROCESS_DOESNT_STORE,  0
.set FLAG_PROCESS_DOES_STORE,    8 /* usually because it needs to conditionally skip it */
.set FLAG_NO_SPILL_LINE_VARS,        0
.set FLAG_SPILL_LINE_VARS_WIDE,      16
.set FLAG_SPILL_LINE_VARS_NON_WIDE,  32
.set FLAG_SPILL_LINE_VARS,           48
.set FLAG_PROCESS_CORRUPTS_SCRATCH,  0
.set FLAG_PROCESS_PRESERVES_SCRATCH, 64
.set FLAG_PROCESS_PRESERVES_WK0,     0
.set FLAG_PROCESS_CORRUPTS_WK0,      128 /* if possible, use the specified register(s) instead so WK0 can hold number of leading pixels */
.set FLAG_PRELOAD_DST,               0
.set FLAG_NO_PRELOAD_DST,            256

/*
 * Number of bytes by which to adjust preload offset of destination
 * buffer (allows preload instruction to be moved before the load(s))
 */
.set DST_PRELOAD_BIAS, 0

/*
 * Offset into stack where mask and source pointer/stride can be accessed.
 */
#ifdef DEBUG_PARAMS
.set ARGS_STACK_OFFSET, (9*4+9*4)
#else
.set ARGS_STACK_OFFSET, (9*4)
#endif

/*
 * Offset into stack where space allocated during init macro can be accessed.
 */
.set LOCALS_STACK_OFFSET, 0

/*
 * Constants for selecting preferable prefetch type.
 */
.set PREFETCH_TYPE_NONE,     0
.set PREFETCH_TYPE_STANDARD, 1

/*
 * Definitions of macros for load/store of pixel data.
 */

/*
 * Transfer "numbytes" bytes of pixel data between [base] and registers
 * WK<reg0>..WK<reg3>, post-incrementing base.  "op" is ld or st.  When
 * unaligned=1, the 16- and 8-byte cases use individual word transfers
 * rather than a multi-register LDM/STM.
 */
.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
 .if numbytes == 16
  .if unaligned == 1
        op&r&cond    WK&reg0, [base], #4
        op&r&cond    WK&reg1, [base], #4
        op&r&cond    WK&reg2, [base], #4
        op&r&cond    WK&reg3, [base], #4
  .else
        op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
  .endif
 .elseif numbytes == 8
  .if unaligned == 1
        op&r&cond    WK&reg0, [base], #4
        op&r&cond    WK&reg1, [base], #4
  .else
        op&m&cond&ia base!, {WK&reg0,WK&reg1}
  .endif
 .elseif numbytes == 4
        op&r&cond    WK&reg0, [base], #4
 .elseif numbytes == 2
        op&r&cond&h  WK&reg0, [base], #2
 .elseif numbytes == 1
        op&r&cond&b  WK&reg0, [base], #1
 .else
  .error "unsupported size: numbytes"
 .endif
.endm

/*
 * Store "numbytes" bytes of pixel data to the addresses immediately below
 * [base], without updating base.  Used when base has already been advanced
 * past the data (e.g. by an earlier read of a read-write destination).
 */
.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
 .if numbytes == 16
        stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
 .elseif numbytes == 8
        stm&cond&db base, {WK&reg0,WK&reg1}
 .elseif numbytes == 4
        str&cond    WK&reg0, [base, #-4]
 .elseif numbytes == 2
        str&cond&h  WK&reg0, [base, #-2]
 .elseif numbytes == 1
        str&cond&b  WK&reg0, [base, #-1]
 .else
  .error "unsupported size: numbytes"
 .endif
.endm

/* Load pixels into WK<firstreg>.. from [base], post-incrementing base. */
.macro pixld cond, numbytes, firstreg, base, unaligned
        pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
.endm

/*
 * Store pixels from WK<firstreg>.. to the destination.  For a read-write
 * destination the pointer was already advanced by the load, so store below
 * it; otherwise store with post-increment.
 */
.macro pixst cond, numbytes, firstreg, base
 .if (flags) & FLAG_DST_READWRITE
        pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
 .else
        pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
 .endif
.endm

/*
 * Emit instruction "a" with operands "x" only when this function is being
 * generated with standard prefetching; compiles to nothing otherwise.
 */
.macro PF a, x:vararg
 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
        a x
 .endif
.endm


.macro preload_leading_step1  bpp, ptr, base
/* If the destination is already 16-byte aligned, then we need to preload
 * between 0 and prefetch_distance (inclusive) cache lines ahead so there
 * are no gaps when the inner loop starts.
 */
 .if bpp > 0
        PF  bic,    ptr, base, #31
  .set OFFSET, 0
  .rept prefetch_distance+1
        PF  pld,    [ptr, #OFFSET]
   .set OFFSET, OFFSET+32
  .endr
 .endif
.endm

.macro preload_leading_step2  bpp, bpp_shift, ptr, base
/* However, if the destination is not 16-byte aligned, we may need to
 * preload more cache lines than that. The question we need to ask is:
 * are the bytes corresponding to the leading pixels more than the amount
 * by which the source pointer will be rounded down for preloading, and if
 * so, by how many cache lines? Effectively, we want to calculate
 *     leading_bytes = ((-dst)&15)*src_bpp/dst_bpp
 *     inner_loop_offset = (src+leading_bytes)&31
 *     extra_needed = leading_bytes - inner_loop_offset
 * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
 * possible when there are 4 src bytes for every 1 dst byte).
 */
 .if bpp > 0
  .ifc base,DST
        /* The test can be simplified further when preloading the destination */
        PF  tst,    base, #16
        PF  beq,    61f
  .else
   .if bpp/dst_w_bpp == 4
        PF  add,    SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
        PF  and,    SCRATCH, SCRATCH, #31
        PF  rsb,    SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
        PF  sub,    SCRATCH, SCRATCH, #1    /* so now ranges are -16..-1 / 0..31 / 32..63 */
        PF  movs,   SCRATCH, SCRATCH, lsl #32-6 /* so this sets NC / nc / Nc */
        PF  bcs,    61f
        PF  bpl,    60f
        PF  pld,    [ptr, #32*(prefetch_distance+2)]
   .else
        PF  mov,    SCRATCH, base, lsl #32-5
        PF  add,    SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
        PF  rsbs,   SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
        PF  bls,    61f
   .endif
  .endif
60:     PF  pld,    [ptr, #32*(prefetch_distance+1)]
61:
 .endif
.endm

#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
/*
 * Mid-row preload for one channel: issue (at most) one cacheline preload
 * per group of STMs, either via a precomputed offset held in SCRATCH or by
 * rounding the base pointer down to a cacheline boundary.
 */
.macro preload_middle   bpp, base, scratch_holds_offset
 .if bpp > 0
        /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
  .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
   .if scratch_holds_offset
        PF  pld,    [base, SCRATCH]
   .else
        PF  bic,    SCRATCH, base, #31
        PF  pld,    [SCRATCH, #32*prefetch_distance]
   .endif
  .endif
 .endif
.endm

/* Final preloads for one channel at the end of a wide-case row. */
.macro preload_trailing  bpp, bpp_shift, base
 .if bpp > 0
  .if bpp*pix_per_block > 256
        /* Calculations are more complex if more than one fetch per block */
        PF  and,    WK1, base, #31
        PF  add,    WK1, WK1, WK0, lsl #bpp_shift
        PF  add,    WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
        PF  bic,    SCRATCH, base, #31
80:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
        PF  add,    SCRATCH, SCRATCH, #32
        PF  subs,   WK1, WK1, #32
        PF  bhi,    80b
  .else
        /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
        PF  mov,    SCRATCH, base, lsl #32-5
        PF  adds,   SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
        PF  adceqs, SCRATCH, SCRATCH, #0
        /* The instruction above has two effects: ensures Z is only
         * set if C was clear (so Z indicates that both shifted quantities
         * were 0), and clears C if Z was set (so C indicates that the sum
         * of the shifted quantities was greater than or equal to 32) */
        PF  beq,    82f
        PF  bic,    SCRATCH, base, #31
        PF  bcc,    81f
        PF  pld,    [SCRATCH, #32*(prefetch_distance+2)]
81:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
82:
  .endif
 .endif
.endm


.macro preload_line    narrow_case, bpp, bpp_shift, base
/* "narrow_case" - just means that the macro was invoked from the "narrow"
 *    code path rather than the "medium" one - because in the narrow case,
 *    the row of pixels is known to output no more than 30 bytes, then
 *    (assuming the source pixels are no wider than the destination
 *    pixels) they cannot possibly straddle more than 2 32-byte cachelines,
 *    meaning there's no need for a loop.
 * "bpp" - number of bits per pixel in the channel (source, mask or
 *    destination) that's being preloaded, or 0 if this channel is not used
 *    for reading
 * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
 * "base" - base address register of channel to preload (SRC, MASK or DST)
 */
 .if bpp > 0
  .if narrow_case && (bpp <= dst_w_bpp)
        /* In these cases, each line for each channel is in either 1 or 2 cache lines */
        PF  bic,    WK0, base, #31
        PF  pld,    [WK0]
        PF  add,    WK1, base, X, LSL #bpp_shift
        PF  sub,    WK1, WK1, #1
        PF  bic,    WK1, WK1, #31
        PF  cmp,    WK1, WK0
        PF  beq,    90f
        PF  pld,    [WK1]
90:
  .else
        PF  bic,    WK0, base, #31
        PF  pld,    [WK0]
        PF  add,    WK1, base, X, lsl #bpp_shift
        PF  sub,    WK1, WK1, #1
        PF  bic,    WK1, WK1, #31
        PF  cmp,    WK1, WK0
        PF  beq,    92f
91:     PF  add,    WK0, WK0, #32
        PF  cmp,    WK0, WK1
        PF  pld,    [WK0]
        PF  bne,    91b
92:
  .endif
 .endif
.endm


/*
 * Process "numbytes" of pixels under condition "cond": head, optional
 * decrement of remaining-pixel count X, tail, and (unless the process
 * macros store for themselves) the store to DST.
 */
.macro conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
        process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
 .if decrementx
        sub&cond X, X, #8*numbytes/dst_w_bpp
 .endif
        process_tail  cond, numbytes, firstreg
 .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   cond, numbytes, firstreg, DST
 .endif
.endm

/*
 * As conditional_process1_helper, but when FLAG_BRANCH_OVER is set the
 * condition is implemented as a branch around unconditional code instead
 * of ARM conditional execution.
 */
.macro conditional_process1  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
 .if (flags) & FLAG_BRANCH_OVER
  .ifc cond,mi
        bpl     100f
  .endif
  .ifc cond,cs
        bcc     100f
  .endif
  .ifc cond,ne
        beq     100f
  .endif
        conditional_process1_helper  , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
100:
 .else
        conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
 .endif
.endm

/*
 * Perform two conditional processing steps selected by one flag-setting
 * "test" macro, interleaving their loads/stores when the flags permit.
 */
.macro conditional_process2  test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
 .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
        /* Can't interleave reads and writes */
        test
        conditional_process1  cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
  .if (flags) & FLAG_PROCESS_CORRUPTS_PSR
        test
  .endif
        conditional_process1  cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
 .else
        /* Can interleave reads and writes for better scheduling */
        test
        process_head  cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
        process_head  cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
  .if decrementx
        sub&cond1 X, X, #8*numbytes1/dst_w_bpp
        sub&cond2 X, X, #8*numbytes2/dst_w_bpp
  .endif
        process_tail  cond1, numbytes1, firstreg1
        process_tail  cond2, numbytes2, firstreg2
        pixst   cond1, numbytes1, firstreg1, DST
        pixst   cond2, numbytes2, firstreg2, DST
 .endif
.endm


/* Set C,N from bits 1,0 of the destination pointer (in WK0, or X when the
 * process macros corrupt WK0). */
.macro test_bits_1_0_ptr
 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
        movs    SCRATCH, X, lsl #32-1  /* C,N = bits 1,0 of DST */
 .else
        movs    SCRATCH, WK0, lsl #32-1  /* C,N = bits 1,0 of DST */
 .endif
.endm

/* Set C,N from bits 3,2 of the destination pointer (in WK0, or X when the
 * process macros corrupt WK0). */
.macro test_bits_3_2_ptr
 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
        movs    SCRATCH, X, lsl #32-3  /* C,N = bits 3, 2 of DST */
 .else
        movs    SCRATCH, WK0, lsl #32-3  /* C,N = bits 3, 2 of DST */
 .endif
.endm

/* Process the 1-15 leading bytes needed to bring DST to 16-byte alignment. */
.macro leading_15bytes  process_head, process_tail
        /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */
 .set DECREMENT_X, 1
 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
  .set DECREMENT_X, 0
        sub     X, X, WK0, lsr #dst_bpp_shift
        str     X, [sp, #LINE_SAVED_REG_COUNT*4]
        mov     X, WK0
 .endif
        /* Use unaligned loads in all cases for simplicity */
 .if dst_w_bpp == 8
        conditional_process2  test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X
 .elseif dst_w_bpp == 16
        test_bits_1_0_ptr
        conditional_process1  cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X
 .endif
        conditional_process2  test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X
 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
        ldr     X, [sp, #LINE_SAVED_REG_COUNT*4]
 .endif
.endm

/* Set C,N from bits 3,2 of the remaining byte count (X pixels). */
.macro test_bits_3_2_pix
        movs    SCRATCH, X, lsl #dst_bpp_shift+32-3
.endm

/* Set C,N (or just C for 16bpp) from bits 1,0 of the remaining byte count. */
.macro test_bits_1_0_pix
 .if dst_w_bpp == 8
        movs    SCRATCH, X, lsl #dst_bpp_shift+32-1
 .else
        movs    SCRATCH, X, lsr #1
 .endif
.endm

/* Process the up-to-15 trailing bytes left after the last 16-byte block. */
.macro trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
        conditional_process2  test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
 .if dst_w_bpp == 16
        test_bits_1_0_pix
        conditional_process1  cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
 .elseif dst_w_bpp == 8
        conditional_process2  test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
 .endif
.endm


/*
 * Wide-case inner loop: each iteration processes pix_per_block pixels in
 * 16-byte chunks, interleaving per-channel preloads between the STMs.
 */
.macro wide_case_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
110:
 .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
 .rept pix_per_block*dst_w_bpp/128
        process_head  , 16, 0, unaligned_src, unaligned_mask, 1
  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        preload_middle  src_bpp, SRC, 1
  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        preload_middle  mask_bpp, MASK, 1
  .else
        preload_middle  src_bpp, SRC, 0
        preload_middle  mask_bpp, MASK, 0
  .endif
  .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) && (((flags) & FLAG_NO_PRELOAD_DST) == 0)
        /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
         * destination prefetches are 32-byte aligned. It's also the easiest channel to offset
         * preloads for, to achieve staggered prefetches for multiple channels, because there are
         * always two STMs per prefetch, so there is always an opposite STM on which to put the
         * preload. Note, no need to BIC the base register here */
        PF  pld,    [DST, #32*prefetch_distance - dst_alignment]
  .endif
        process_tail  , 16, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 16, 0, DST
  .endif
  .set SUBBLOCK, SUBBLOCK+1
 .endr
        subs    X, X, #pix_per_block
        bhs     110b
.endm

.macro wide_case_inner_loop_and_trailing_pixels  process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask
        /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */
 .if dst_r_bpp > 0
        tst     DST, #16
        bne     111f
        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 16 + DST_PRELOAD_BIAS
        b       112f
111:
 .endif
        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 0 + DST_PRELOAD_BIAS
112:
        /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
 .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
        PF  and,    WK0, X, #pix_per_block-1
 .endif
        preload_trailing  src_bpp, src_bpp_shift, SRC
        preload_trailing  mask_bpp, mask_bpp_shift, MASK
 .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
        preload_trailing  dst_r_bpp, dst_bpp_shift, DST
 .endif
        add     X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
        /* The remainder of the line is handled identically to the medium case */
        medium_case_inner_loop_and_trailing_pixels  process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
.endm

.macro medium_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
120:
        process_head  , 16, 0, unaligned_src, unaligned_mask, 0
        process_tail  , 16, 0
 .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 16, 0, DST
 .endif
        subs    X, X, #128/dst_w_bpp
        bhs     120b
        /* Trailing pixels */
        tst     X, #128/dst_w_bpp - 1
        beq     exit_label
        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
.endm

.macro narrow_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
        tst     X, #16*8/dst_w_bpp
        conditional_process1  ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
        /* Trailing pixels */
        /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
.endm

/*
 * Dispatch "action" with the appropriate unaligned_src/unaligned_mask
 * arguments depending on the word-alignment of SRC and MASK (only 8bpp and
 * 16bpp channels can be misaligned with respect to a word boundary).
 */
.macro switch_on_alignment  action, process_head, process_tail, process_inner_loop, exit_label
 /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */
 .if mask_bpp == 8 || mask_bpp == 16
        tst     MASK, #3
        bne     141f
 .endif
  .if src_bpp == 8 || src_bpp == 16
        tst     SRC, #3
        bne     140f
  .endif
        action  process_head, process_tail, process_inner_loop, exit_label, 0, 0
  .if src_bpp == 8 || src_bpp == 16
        b       exit_label
140:
        action  process_head, process_tail, process_inner_loop, exit_label, 1, 0
  .endif
 .if mask_bpp == 8 || mask_bpp == 16
        b       exit_label
141:
  .if src_bpp == 8 || src_bpp == 16
        tst     SRC, #3
        bne     142f
  .endif
        action  process_head, process_tail, process_inner_loop, exit_label, 0, 1
  .if src_bpp == 8 || src_bpp == 16
        b       exit_label
142:
        action  process_head, process_tail, process_inner_loop, exit_label, 1, 1
  .endif
 .endif
.endm


/*
 * End-of-row handling: restore spilled line variables, decrement the line
 * counter, advance the channel pointers by their strides, and either loop
 * back to loop_label or fall through towards the epilogue.
 */
.macro end_of_line      restore_x, vars_spilled, loop_label, last_one
 .if SINGLE_SCANLINE
  .ifc "last_one",""
        b       198f
  .endif
 .else
 .if vars_spilled
        /* Sadly, GAS doesn't seem to have an equivalent of the DCI directive? */
        /* This is ldmia sp,{} */
        .word   0xE89D0000 | LINE_SAVED_REGS
 .endif
        subs    Y, Y, #1
 .if vars_spilled
  .if (LINE_SAVED_REGS) & (1<<1)
        str     Y, [sp]
  .endif
 .endif
        add     DST, DST, STRIDE_D
 .if src_bpp > 0
        add     SRC, SRC, STRIDE_S
 .endif
 .if mask_bpp > 0
        add     MASK, MASK, STRIDE_M
 .endif
 .if restore_x
        mov     X, ORIG_W
 .endif
        bhs     loop_label
 .ifc "last_one",""
  .if vars_spilled
        b       197f
  .else
        b       198f
  .endif
 .else
  .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
        b       198f
  .endif
 .endif
 .endif
.endm


/*
 * Emit a complete composite function "fname".  The init/newline/cleanup/
 * process_head/process_tail/process_inner_loop arguments are the names of
 * caller-supplied macros that are expanded at the corresponding points;
 * src_bpp_/mask_bpp_/dst_w_bpp_ give the channel depths, and flags_ is a
 * combination of the FLAG_* values defined above.
 */
.macro generate_composite_function_common fname, \
                                          src_bpp_, \
                                          mask_bpp_, \
                                          dst_w_bpp_, \
                                          flags_, \
                                          prefetch_distance_, \
                                          init, \
                                          newline, \
                                          cleanup, \
                                          process_head, \
                                          process_tail, \
                                          process_inner_loop

        pixman_asm_function fname

/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
 .set src_bpp, src_bpp_
 .set mask_bpp, mask_bpp_
 .set dst_w_bpp, dst_w_bpp_
 .set flags, flags_
 .set prefetch_distance, prefetch_distance_

/*
 * Select prefetch type for this function.
 */
 .if prefetch_distance == 0
  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
 .else
  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD
 .endif

 .if src_bpp == 32
  .set src_bpp_shift, 2
 .elseif src_bpp == 24
  .set src_bpp_shift, 0
 .elseif src_bpp == 16
  .set src_bpp_shift, 1
 .elseif src_bpp == 8
  .set src_bpp_shift, 0
 .elseif src_bpp == 0
  .set src_bpp_shift, -1
 .else
  .error "requested src bpp (src_bpp) is not supported"
 .endif

 .if mask_bpp == 32
  .set mask_bpp_shift, 2
 .elseif mask_bpp == 24
  .set mask_bpp_shift, 0
 .elseif mask_bpp == 8
  .set mask_bpp_shift, 0
 .elseif mask_bpp == 0
  .set mask_bpp_shift, -1
 .else
  .error "requested mask bpp (mask_bpp) is not supported"
 .endif

 .if dst_w_bpp == 32
  .set dst_bpp_shift, 2
 .elseif dst_w_bpp == 24
  .set dst_bpp_shift, 0
 .elseif dst_w_bpp == 16
  .set dst_bpp_shift, 1
 .elseif dst_w_bpp == 8
  .set dst_bpp_shift, 0
 .else
  .error "requested dst bpp (dst_w_bpp) is not supported"
 .endif

 .if (((flags) & FLAG_DST_READWRITE) != 0)
  .set dst_r_bpp, dst_w_bpp
 .else
  .set dst_r_bpp, 0
 .endif

 /* pix_per_block = pixels per 32-byte fetch on the widest-fetching channel */
 .set pix_per_block, 16*8/dst_w_bpp
 .if src_bpp != 0
  .if 32*8/src_bpp > pix_per_block
   .set pix_per_block, 32*8/src_bpp
  .endif
 .endif
 .if mask_bpp != 0
  .if 32*8/mask_bpp > pix_per_block
   .set pix_per_block, 32*8/mask_bpp
  .endif
 .endif
 .if dst_r_bpp != 0
  .if 32*8/dst_r_bpp > pix_per_block
   .set pix_per_block, 32*8/dst_r_bpp
  .endif
 .endif

/* The standard entry conditions set up by pixman-arm-common.h are:
 * r0 = width (pixels)
 * r1 = height (rows)
 * r2 = pointer to top-left pixel of destination
 * r3 = destination stride (pixels)
 * [sp] = source pixel value, or pointer to top-left pixel of source
 * [sp,#4] = 0 or source stride (pixels)
 * The following arguments are unused for non-mask operations
 * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask
 * [sp,#12] = 0 or mask stride (pixels)
 *
 * or in the single-scanline case:
 * r0 = width (pixels)
 * r1 = pointer to top-left pixel of destination
 * r2 = pointer to top-left pixel of source
 * The following argument is unused for non-mask operations
 * r3 = pointer to top-left pixel of mask
 */

/*
 * Assign symbolic names to registers
 */
 X           .req    r0  /* pixels to go on this line */
 .if SINGLE_SCANLINE
 DST         .req    r1  /* destination pixel pointer */
 SRC         .req    r2  /* source pixel pointer */
 MASK        .req    r3  /* mask pixel pointer (if applicable) */
 Y           .req    r4  /* temporary */
 STRIDE_D    .req    r5  /* temporary */
 STRIDE_S    .req    r6  /* temporary */
 STRIDE_M    .req    r7  /* temporary */
 .else
 Y           .req    r1  /* lines to go */
 DST         .req    r2  /* destination pixel pointer */
 STRIDE_D    .req    r3  /* destination stride (bytes, minus width) */
 SRC         .req    r4  /* source pixel pointer */
 STRIDE_S    .req    r5  /* source stride (bytes, minus width) */
 MASK        .req    r6  /* mask pixel pointer (if applicable) */
 STRIDE_M    .req    r7  /* mask stride (bytes, minus width) */
 .endif
 WK0         .req    r8  /* pixel data registers */
 WK1         .req    r9
 WK2         .req    r10
 WK3         .req    r11
 SCRATCH     .req    r12
 ORIG_W      .req    r14 /* width (pixels) */

        push    {r4-r11, lr}        /* save all registers */

 .if !SINGLE_SCANLINE
        subs    Y, Y, #1
        blo     199f
 .endif

#ifdef DEBUG_PARAMS
        sub     sp, sp, #9*4
#endif

 .if !SINGLE_SCANLINE
 .if src_bpp > 0
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        ldr     STRIDE_S, [sp, #ARGS_STACK_OFFSET+4]
 .endif
 .if mask_bpp > 0
        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
        ldr     STRIDE_M, [sp, #ARGS_STACK_OFFSET+12]
 .endif
 .endif

#ifdef DEBUG_PARAMS
        add     Y, Y, #1
        stmia   sp, {r0-r7,pc}
        sub     Y, Y, #1
#endif

        init

 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
        /* Reserve a word in which to store X during leading pixels */
        sub     sp, sp, #4
  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+4
  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET+4
 .endif

 .if !SINGLE_SCANLINE
        lsl     STRIDE_D, #dst_bpp_shift /* stride in bytes */
        sub     STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift
 .if src_bpp > 0
        lsl     STRIDE_S, #src_bpp_shift
        sub     STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift
 .endif
 .if mask_bpp > 0
        lsl     STRIDE_M, #mask_bpp_shift
        sub     STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift
 .endif
 .endif

        /* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */
        cmp     X, #2*16*8/dst_w_bpp - 1
        blo     170f
 .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */
        /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */
        cmp     X, #(prefetch_distance+3)*pix_per_block - 1
        blo     160f

        /* Wide case */
        /* Adjust X so that the decrement instruction can also test for
         * inner loop termination. We want it to stop when there are
         * (prefetch_distance+1) complete blocks to go. */
        sub     X, X, #(prefetch_distance+2)*pix_per_block
 .if !SINGLE_SCANLINE
        mov     ORIG_W, X
 .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
        /* This is stmdb sp!,{} */
        .word   0xE92D0000 | LINE_SAVED_REGS
  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
 .endif
 .endif
151:    /* New line */
        newline
        preload_leading_step1  src_bpp, WK1, SRC
        preload_leading_step1  mask_bpp, WK2, MASK
 .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
        preload_leading_step1  dst_r_bpp, WK3, DST
 .endif

        ands    WK0, DST, #15
        beq     154f
        rsb     WK0, WK0, #16 /* number of leading bytes until destination aligned */

        preload_leading_step2  src_bpp, src_bpp_shift, WK1, SRC
        preload_leading_step2  mask_bpp, mask_bpp_shift, WK2, MASK
 .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
        preload_leading_step2  dst_r_bpp, dst_bpp_shift, WK3, DST
 .endif

        leading_15bytes  process_head, process_tail

154:    /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
 .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        and     SCRATCH, SRC, #31
        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
 .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        and     SCRATCH, MASK, #31
        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
 .endif
 .ifc "process_inner_loop",""
        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
 .else
        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
 .endif

157:    /* Check for another line */
        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
 .if (!SINGLE_SCANLINE) && ((flags) & FLAG_SPILL_LINE_VARS_WIDE)
  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
 .endif
 .endif

 .ltorg

160:    /* Medium case */
 .if !SINGLE_SCANLINE
        mov     ORIG_W, X
 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
        /* This is stmdb sp!,{} */
        .word   0xE92D0000 | LINE_SAVED_REGS
  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
 .endif
 .endif
161:    /* New line */
        newline
        preload_line 0, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
        preload_line 0, mask_bpp, mask_bpp_shift, MASK
 .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
        preload_line 0, dst_r_bpp, dst_bpp_shift, DST
 .endif

        sub     X, X, #128/dst_w_bpp     /* simplifies inner loop termination */
        ands    WK0, DST, #15
        beq     164f
        rsb     WK0, WK0, #16 /* number of leading bytes until destination aligned */

        leading_15bytes  process_head, process_tail

164:    /* Destination now 16-byte aligned; we have at least one 16-byte output block */
        switch_on_alignment  medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f

167:    /* Check for another line */
        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b

 .ltorg

170:    /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */
 .if !SINGLE_SCANLINE
 .if dst_w_bpp < 32
        mov     ORIG_W, X
 .endif
 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
        /* This is stmdb sp!,{} */
        .word   0xE92D0000 | LINE_SAVED_REGS
 .endif
 .endif
171:    /* New line */
        newline
        preload_line 1, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
        preload_line 1, mask_bpp, mask_bpp_shift, MASK
 .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
        preload_line 1, dst_r_bpp, dst_bpp_shift, DST
 .endif

 .if dst_w_bpp == 8
        tst     DST, #3
        beq     174f
172:    subs    X, X, #1
        blo     177f
        process_head  , 1, 0, 1, 1, 0
        process_tail  , 1, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 1, 0, DST
  .endif
        tst     DST, #3
        bne     172b
 .elseif dst_w_bpp == 16
        tst     DST, #2
        beq     174f
        subs    X, X, #1
        blo     177f
        process_head  , 2, 0, 1, 1, 0
        process_tail  , 2, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 2, 0, DST
  .endif
 .endif

174:    /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
        switch_on_alignment  narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f

177:    /* Check for another line */
        end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one
 .if (!SINGLE_SCANLINE) && ((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE)
  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
 .endif

197:
 .if (!SINGLE_SCANLINE) && ((flags) & FLAG_SPILL_LINE_VARS)
        add     sp, sp, #LINE_SAVED_REG_COUNT*4
 .endif
198:
 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-4
  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET-4
        add     sp, sp, #4
 .endif

        cleanup

#ifdef DEBUG_PARAMS
        add     sp, sp, #9*4 /* junk the debug copy of arguments */
#endif
199:
        pop     {r4-r11, pc}  /* exit */

 .ltorg

 .unreq X
 .unreq Y
 .unreq DST
 .unreq STRIDE_D
 .unreq SRC
 .unreq STRIDE_S
 .unreq MASK
 .unreq STRIDE_M
 .unreq WK0
 .unreq WK1
 .unreq WK2
 .unreq WK3
 .unreq SCRATCH
 .unreq ORIG_W
 .endfunc
.endm

/* Multi-line (width x height) variant: strides are taken from the stack. */
.macro generate_composite_function fname, \
                                   src_bpp_, \
                                   mask_bpp_, \
                                   dst_w_bpp_, \
                                   flags_, \
                                   prefetch_distance_, \
                                   init, \
                                   newline, \
                                   cleanup, \
                                   process_head, \
                                   process_tail, \
                                   process_inner_loop
 .set SINGLE_SCANLINE, 0
generate_composite_function_common \
    fname, src_bpp_, mask_bpp_, dst_w_bpp_, flags_, prefetch_distance_, \
    init, newline, cleanup, process_head, process_tail, process_inner_loop
.endm

/* Single-scanline variant: pointers arrive in r1-r3, no strides or height. */
.macro generate_composite_function_single_scanline fname, \
                                                   src_bpp_, \
                                                   mask_bpp_, \
                                                   dst_w_bpp_, \
                                                   flags_, \
                                                   prefetch_distance_, \
                                                   init, \
                                                   newline, \
                                                   cleanup, \
                                                   process_head, \
                                                   process_tail, \
                                                   process_inner_loop
 .set SINGLE_SCANLINE, 1
generate_composite_function_common \
    fname, src_bpp_, mask_bpp_, dst_w_bpp_, flags_, prefetch_distance_, \
    init, newline, cleanup, process_head, process_tail, process_inner_loop
.endm

/*
 * Build the LINE_SAVED_REGS register bitmask (and LINE_SAVED_REG_COUNT)
 * from a list of the symbolic register names that must be spilled around
 * each line; used by the hand-encoded STMDB/LDMIA words above.
 */
.macro line_saved_regs x:vararg
 .set LINE_SAVED_REGS, 0
 .set LINE_SAVED_REG_COUNT, 0
 .irp SAVED_REG,x
  .ifc "SAVED_REG","Y"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_D"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_S"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_M"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","ORIG_W"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
 .endr
 .if SINGLE_SCANLINE
  .set LINE_SAVED_REG_COUNT, 0
 .endif
.endm

/* Do-nothing macro: pass as any unused init/newline/cleanup/process hook. */
.macro nop_macro x:vararg
.endm