/*
 * Copyright (c) 2012 Raspberry Pi Foundation
 * Copyright (c) 2012 RISC OS Open Ltd
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/*
 * Because the alignment of pixel data to cachelines, and even the number of
 * cachelines per row, can vary from row to row, and because of the need to
 * preload each scanline once and only once, this prefetch strategy treats
 * each row of pixels independently. When a pixel row is long enough, there
 * are three distinct phases of prefetch:
 * * an inner loop section, where each time a cacheline of data is
 *    processed, another cacheline is preloaded (the exact distance ahead is
 *    determined empirically using profiling results from lowlevel-blt-bench)
 * * a leading section, where enough cachelines are preloaded to ensure no
 *    cachelines escape being preloaded when the inner loop starts
 * * a trailing section, where a limited number (0 or more) of cachelines
 *    are preloaded to deal with data (if any) that hangs off the end of the
 *    last iteration of the inner loop, plus any trailing bytes that were not
 *    enough to make up one whole iteration of the inner loop
 *
 * There are (in general) three distinct code paths, selected between
 * depending upon how long the pixel row is. If it is long enough that there
 * is at least one iteration of the inner loop (as described above) then
 * this is described as the "wide" case. If it is shorter than that, but
 * there are still enough bytes output that there is at least one 16-byte-
 * long, 16-byte-aligned write to the destination (the optimum type of
 * write), then this is the "medium" case. If it is not even this long, then
 * this is the "narrow" case, and there is no attempt to align writes to
 * 16-byte boundaries. In the "medium" and "narrow" cases, all the
 * cachelines containing data from the pixel row are prefetched up-front.
 */
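
/*
 * As a worked example (with hypothetical figures): for an operation with
 * both source and destination read at 32bpp, pix_per_block (defined below)
 * is 8 and a 16-byte block holds 4 pixels. Rows of fewer than 7 pixels
 * then take the "narrow" path; with a prefetch distance of 4, rows of at
 * least (4+3)*8-1 = 55 pixels take the "wide" path; everything in between
 * is "medium". The exact thresholds are computed from dst_w_bpp,
 * pix_per_block and prefetch_distance where the three paths are selected
 * below.
 */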

/*
 * Determine whether we put the arguments on the stack for debugging.
 */
#undef DEBUG_PARAMS

/*
 * Bit flags for the 'generate_composite_function' macro, used to tune
 * the behaviour of the generated functions.
 */
.set FLAG_DST_WRITEONLY,         0
.set FLAG_DST_READWRITE,         1
.set FLAG_COND_EXEC,             0
.set FLAG_BRANCH_OVER,           2
.set FLAG_PROCESS_PRESERVES_PSR, 0
.set FLAG_PROCESS_CORRUPTS_PSR,  4
.set FLAG_PROCESS_DOESNT_STORE,  0
.set FLAG_PROCESS_DOES_STORE,    8 /* usually because it needs to conditionally skip the store */
.set FLAG_NO_SPILL_LINE_VARS,        0
.set FLAG_SPILL_LINE_VARS_WIDE,      16
.set FLAG_SPILL_LINE_VARS_NON_WIDE,  32
.set FLAG_SPILL_LINE_VARS,           48
.set FLAG_PROCESS_CORRUPTS_SCRATCH,  0
.set FLAG_PROCESS_PRESERVES_SCRATCH, 64
.set FLAG_PROCESS_PRESERVES_WK0,     0
.set FLAG_PROCESS_CORRUPTS_WK0,      128 /* if possible, use the specified register(s) instead so WK0 can hold number of leading pixels */
.set FLAG_PRELOAD_DST,               0
.set FLAG_NO_PRELOAD_DST,            256
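
/*
 * The flags are ORed together when passed to 'generate_composite_function';
 * for example, a hypothetical operation that reads and writes the
 * destination and whose processing corrupts the flags might pass
 * FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR.
 */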

/*
 * Number of bytes by which to adjust preload offset of destination
 * buffer (allows preload instruction to be moved before the load(s))
 */
.set DST_PRELOAD_BIAS, 0

/*
 * Offset into stack where mask and source pointer/stride can be accessed.
 * The base figure of 9*4 bytes accounts for the r4-r11,lr registers pushed
 * on entry; when DEBUG_PARAMS is defined, a further 9 words of debug
 * copies of the arguments sit in between.
 */
#ifdef DEBUG_PARAMS
.set ARGS_STACK_OFFSET,        (9*4+9*4)
#else
.set ARGS_STACK_OFFSET,        (9*4)
#endif

/*
 * Offset into stack where space allocated during init macro can be accessed.
 */
.set LOCALS_STACK_OFFSET,     0

/*
 * Constants for selecting preferable prefetch type.
 */
.set PREFETCH_TYPE_NONE,       0
.set PREFETCH_TYPE_STANDARD,   1

/*
 * Definitions of macros for load/store of pixel data.
 */

.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
 .if numbytes == 16
  .if unaligned == 1
        op&r&cond    WK&reg0, [base], #4
        op&r&cond    WK&reg1, [base], #4
        op&r&cond    WK&reg2, [base], #4
        op&r&cond    WK&reg3, [base], #4
  .else
        op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
  .endif
 .elseif numbytes == 8
  .if unaligned == 1
        op&r&cond    WK&reg0, [base], #4
        op&r&cond    WK&reg1, [base], #4
  .else
        op&m&cond&ia base!, {WK&reg0,WK&reg1}
  .endif
 .elseif numbytes == 4
        op&r&cond    WK&reg0, [base], #4
 .elseif numbytes == 2
        op&r&cond&h  WK&reg0, [base], #2
 .elseif numbytes == 1
        op&r&cond&b  WK&reg0, [base], #1
 .else
  .error "unsupported size: numbytes"
 .endif
.endm

.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
 .if numbytes == 16
        stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
 .elseif numbytes == 8
        stm&cond&db base, {WK&reg0,WK&reg1}
 .elseif numbytes == 4
        str&cond    WK&reg0, [base, #-4]
 .elseif numbytes == 2
        str&cond&h  WK&reg0, [base, #-2]
 .elseif numbytes == 1
        str&cond&b  WK&reg0, [base, #-1]
 .else
  .error "unsupported size: numbytes"
 .endif
.endm

.macro pixld cond, numbytes, firstreg, base, unaligned
        pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
.endm

.macro pixst cond, numbytes, firstreg, base
 .if (flags) & FLAG_DST_READWRITE
        pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
 .else
        pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
 .endif
.endm
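
/*
 * For example (with the register names assigned below, WK0-WK3 = r8-r11),
 * "pixld , 16, 0, SRC, 0" expands to "ldmia SRC!, {r8-r11}", whereas with
 * unaligned=1 it expands to four post-indexed "ldr" instructions so that a
 * non-word-aligned source can still be read. pixst picks the base-updated
 * store variant when the destination pointer has already been advanced by
 * the corresponding read.
 */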

.macro PF a, x:vararg
 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
        a x
 .endif
.endm
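
/*
 * For example, "PF pld, [ptr, #OFFSET]" emits "pld [ptr, #OFFSET]" in
 * functions generated with a non-zero prefetch distance, and emits nothing
 * at all when PREFETCH_TYPE_CURRENT is PREFETCH_TYPE_NONE.
 */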


.macro preload_leading_step1  bpp, ptr, base
/* If the destination is already 16-byte aligned, then we need to preload
 * between 0 and prefetch_distance (inclusive) cache lines ahead so there
 * are no gaps when the inner loop starts.
 */
 .if bpp > 0
        PF  bic,    ptr, base, #31
  .set OFFSET, 0
  .rept prefetch_distance+1
        PF  pld,    [ptr, #OFFSET]
   .set OFFSET, OFFSET+32
  .endr
 .endif
.endm

.macro preload_leading_step2  bpp, bpp_shift, ptr, base
/* However, if the destination is not 16-byte aligned, we may need to
 * preload more cache lines than that. The question we need to ask is:
 * are the bytes corresponding to the leading pixels more than the amount
 * by which the source pointer will be rounded down for preloading, and if
 * so, by how many cache lines? Effectively, we want to calculate
 *     leading_bytes = ((-dst)&15)*src_bpp/dst_bpp
 *     inner_loop_offset = (src+leading_bytes)&31
 *     extra_needed = leading_bytes - inner_loop_offset
 * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
 * possible when there are 4 src bytes for every 1 dst byte).
 */
 .if bpp > 0
  .ifc base,DST
        /* The test can be simplified further when preloading the destination */
        PF  tst,    base, #16
        PF  beq,    61f
  .else
   .if bpp/dst_w_bpp == 4
        PF  add,    SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
        PF  and,    SCRATCH, SCRATCH, #31
        PF  rsb,    SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
        PF  sub,    SCRATCH, SCRATCH, #1        /* so now ranges are -16..-1 / 0..31 / 32..63 */
        PF  movs,   SCRATCH, SCRATCH, lsl #32-6 /* so this sets         NC   /  nc   /   Nc   */
        PF  bcs,    61f
        PF  bpl,    60f
        PF  pld,    [ptr, #32*(prefetch_distance+2)]
   .else
        PF  mov,    SCRATCH, base, lsl #32-5
        PF  add,    SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
        PF  rsbs,   SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
        PF  bls,    61f
   .endif
  .endif
60:     PF  pld,    [ptr, #32*(prefetch_distance+1)]
61:
 .endif
.endm
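
/* Worked example of the calculation above (hypothetical figures): with a
 * 32bpp destination 8 bytes short of 16-byte alignment and a 16bpp source
 * sitting 2 bytes into its cacheline, leading_bytes = 8*16/32 = 4 and
 * extra_needed = 4 - ((2+4)&31) = -2, so the rounding-down already covers
 * the leading pixels and no extra preload is issued. When extra_needed
 * falls in 1..32 one extra pld is emitted (label 60), and above 32 (only
 * possible with 4 source bytes per destination byte) two are emitted.
 */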

#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
.macro preload_middle   bpp, base, scratch_holds_offset
 .if bpp > 0
        /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
  .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
   .if scratch_holds_offset
        PF  pld,    [base, SCRATCH]
   .else
        PF  bic,    SCRATCH, base, #31
        PF  pld,    [SCRATCH, #32*prefetch_distance]
   .endif
  .endif
 .endif
.endm
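
/* For example, when bpp == dst_w_bpp the group size 256/128*dst_w_bpp/bpp
 * is 2, so IS_END_OF_GROUP is true on every second SUBBLOCK (1, 3, 5, ...):
 * one 32-byte cacheline is preloaded for every two 16-byte blocks
 * processed, matching the preload rate to the consumption rate.
 */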

.macro preload_trailing  bpp, bpp_shift, base
 .if bpp > 0
  .if bpp*pix_per_block > 256
        /* Calculations are more complex if more than one fetch per block */
        PF  and,    WK1, base, #31
        PF  add,    WK1, WK1, WK0, lsl #bpp_shift
        PF  add,    WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
        PF  bic,    SCRATCH, base, #31
80:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
        PF  add,    SCRATCH, SCRATCH, #32
        PF  subs,   WK1, WK1, #32
        PF  bhi,    80b
  .else
        /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
        PF  mov,    SCRATCH, base, lsl #32-5
        PF  adds,   SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
        PF  adceqs, SCRATCH, SCRATCH, #0
        /* The instruction above has two effects: it ensures Z is only
         * set if C was clear (so Z indicates that both shifted quantities
         * were 0), and it clears C if Z was set (so C indicates that the
         * sum of the shifted quantities was strictly greater than 32) */
        PF  beq,    82f
        PF  bic,    SCRATCH, base, #31
        PF  bcc,    81f
        PF  pld,    [SCRATCH, #32*(prefetch_distance+2)]
81:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
82:
  .endif
 .endif
.endm
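
/* Summarizing the three outcomes of the flag trick above: if base and
 * length are both multiples of 32 bytes (Z set), no extra preload is
 * needed; if (base%32 + length%32) is 32 or less (C clear), one extra
 * preload suffices; if it exceeds 32 (C set), the trailing data straddles
 * a further cacheline and two extra preloads are issued.
 */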


.macro preload_line    narrow_case, bpp, bpp_shift, base
/* "narrow_case" - just means that the macro was invoked from the "narrow"
 *    code path rather than the "medium" one - because in the narrow case,
 *    the row of pixels is known to output no more than 30 bytes, so
 *    (assuming the source pixels are no wider than the destination
 *    pixels) they cannot possibly straddle more than 2 32-byte cachelines,
 *    meaning there's no need for a loop.
 * "bpp" - number of bits per pixel in the channel (source, mask or
 *    destination) that's being preloaded, or 0 if this channel is not used
 *    for reading
 * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
 * "base" - base address register of channel to preload (SRC, MASK or DST)
 */
 .if bpp > 0
  .if narrow_case && (bpp <= dst_w_bpp)
        /* In these cases, each line for each channel is in either 1 or 2 cache lines */
        PF  bic,    WK0, base, #31
        PF  pld,    [WK0]
        PF  add,    WK1, base, X, lsl #bpp_shift
        PF  sub,    WK1, WK1, #1
        PF  bic,    WK1, WK1, #31
        PF  cmp,    WK1, WK0
        PF  beq,    90f
        PF  pld,    [WK1]
90:
  .else
        PF  bic,    WK0, base, #31
        PF  pld,    [WK0]
        PF  add,    WK1, base, X, lsl #bpp_shift
        PF  sub,    WK1, WK1, #1
        PF  bic,    WK1, WK1, #31
        PF  cmp,    WK1, WK0
        PF  beq,    92f
91:     PF  add,    WK0, WK0, #32
        PF  cmp,    WK0, WK1
        PF  pld,    [WK0]
        PF  bne,    91b
92:
  .endif
 .endif
.endm


.macro conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
        process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
 .if decrementx
        sub&cond X, X, #8*numbytes/dst_w_bpp
 .endif
        process_tail  cond, numbytes, firstreg
 .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   cond, numbytes, firstreg, DST
 .endif
.endm

.macro conditional_process1  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
 .if (flags) & FLAG_BRANCH_OVER
  .ifc cond,mi
        bpl     100f
  .endif
  .ifc cond,cs
        bcc     100f
  .endif
  .ifc cond,ne
        beq     100f
  .endif
        conditional_process1_helper  , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
100:
 .else
        conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
 .endif
.endm
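
/* For example, "conditional_process1 cs, ..." normally emits the head/tail
 * processing with each instruction conditional on C being set; with
 * FLAG_BRANCH_OVER (used e.g. when the processing macros corrupt the PSR)
 * it instead emits "bcc 100f" followed by the unconditional equivalent.
 */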

.macro conditional_process2  test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
 .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
        /* Can't interleave reads and writes */
        test
        conditional_process1  cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
  .if (flags) & FLAG_PROCESS_CORRUPTS_PSR
        test
  .endif
        conditional_process1  cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
 .else
        /* Can interleave reads and writes for better scheduling */
        test
        process_head  cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
        process_head  cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
  .if decrementx
        sub&cond1 X, X, #8*numbytes1/dst_w_bpp
        sub&cond2 X, X, #8*numbytes2/dst_w_bpp
  .endif
        process_tail  cond1, numbytes1, firstreg1
        process_tail  cond2, numbytes2, firstreg2
        pixst   cond1, numbytes1, firstreg1, DST
        pixst   cond2, numbytes2, firstreg2, DST
 .endif
.endm


.macro test_bits_1_0_ptr
 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
        movs    SCRATCH, X, lsl #32-1  /* C,N = bits 1,0 of DST */
 .else
        movs    SCRATCH, WK0, lsl #32-1  /* C,N = bits 1,0 of DST */
 .endif
.endm

.macro test_bits_3_2_ptr
 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
        movs    SCRATCH, X, lsl #32-3  /* C,N = bits 3, 2 of DST */
 .else
        movs    SCRATCH, WK0, lsl #32-3  /* C,N = bits 3, 2 of DST */
 .endif
.endm

.macro leading_15bytes  process_head, process_tail
        /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */
 .set DECREMENT_X, 1
 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
  .set DECREMENT_X, 0
        sub     X, X, WK0, lsr #dst_bpp_shift
        str     X, [sp, #LINE_SAVED_REG_COUNT*4]
        mov     X, WK0
 .endif
        /* Use unaligned loads in all cases for simplicity */
 .if dst_w_bpp == 8
        conditional_process2  test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X
 .elseif dst_w_bpp == 16
        test_bits_1_0_ptr
        conditional_process1  cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X
 .endif
        conditional_process2  test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X
 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
        ldr     X, [sp, #LINE_SAVED_REG_COUNT*4]
 .endif
.endm
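
/* Worked example: for an 8bpp destination that is 13 bytes short of
 * 16-byte alignment, WK0 = 13 = 0b1101 on entry, so the tests above emit
 * a 1-byte operation (bit 0), skip the 2-byte one (bit 1), then emit
 * 4-byte and 8-byte operations (bits 2 and 3): 1+4+8 = 13 leading bytes.
 */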

.macro test_bits_3_2_pix
        movs    SCRATCH, X, lsl #dst_bpp_shift+32-3
.endm

.macro test_bits_1_0_pix
 .if dst_w_bpp == 8
        movs    SCRATCH, X, lsl #dst_bpp_shift+32-1
 .else
        movs    SCRATCH, X, lsr #1
 .endif
.endm

.macro trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
        conditional_process2  test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
 .if dst_w_bpp == 16
        test_bits_1_0_pix
        conditional_process1  cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
 .elseif dst_w_bpp == 8
        conditional_process2  test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
 .endif
.endm


.macro wide_case_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
110:
 .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
 .rept pix_per_block*dst_w_bpp/128
        process_head  , 16, 0, unaligned_src, unaligned_mask, 1
  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        preload_middle  src_bpp, SRC, 1
  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        preload_middle  mask_bpp, MASK, 1
  .else
        preload_middle  src_bpp, SRC, 0
        preload_middle  mask_bpp, MASK, 0
  .endif
  .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) && (((flags) & FLAG_NO_PRELOAD_DST) == 0)
        /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
         * destination prefetches are 32-byte aligned. It's also the easiest channel to offset
         * preloads for, to achieve staggered prefetches for multiple channels, because there are
         * always two STMs per prefetch, so there is always an opposite STM on which to put the
         * preload. Note, no need to BIC the base register here */
        PF  pld,    [DST, #32*prefetch_distance - dst_alignment]
  .endif
        process_tail  , 16, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 16, 0, DST
  .endif
  .set SUBBLOCK, SUBBLOCK+1
 .endr
        subs    X, X, #pix_per_block
        bhs     110b
.endm
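
/* For example, in a 32bpp-to-32bpp operation pix_per_block is 8, so the
 * .rept above unrolls 8*32/128 = 2 subblocks per loop iteration, each
 * processing 16 bytes (4 pixels), and preload_middle's grouping then
 * issues one pld per 32 bytes consumed on the source channel.
 */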

.macro wide_case_inner_loop_and_trailing_pixels  process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask
        /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */
 .if dst_r_bpp > 0
        tst     DST, #16
        bne     111f
        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 16 + DST_PRELOAD_BIAS
        b       112f
111:
 .endif
        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 0 + DST_PRELOAD_BIAS
112:
        /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
 .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
        PF  and,    WK0, X, #pix_per_block-1
 .endif
        preload_trailing  src_bpp, src_bpp_shift, SRC
        preload_trailing  mask_bpp, mask_bpp_shift, MASK
 .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
        preload_trailing  dst_r_bpp, dst_bpp_shift, DST
 .endif
        add     X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
        /* The remainder of the line is handled identically to the medium case */
        medium_case_inner_loop_and_trailing_pixels  process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
.endm

.macro medium_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
120:
        process_head  , 16, 0, unaligned_src, unaligned_mask, 0
        process_tail  , 16, 0
 .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 16, 0, DST
 .endif
        subs    X, X, #128/dst_w_bpp
        bhs     120b
        /* Trailing pixels */
        tst     X, #128/dst_w_bpp - 1
        beq     exit_label
        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
.endm

.macro narrow_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
        tst     X, #16*8/dst_w_bpp
        conditional_process1  ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
        /* Trailing pixels */
        /* In the narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
.endm

.macro switch_on_alignment  action, process_head, process_tail, process_inner_loop, exit_label
 /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */
 .if mask_bpp == 8 || mask_bpp == 16
        tst     MASK, #3
        bne     141f
 .endif
  .if src_bpp == 8 || src_bpp == 16
        tst     SRC, #3
        bne     140f
  .endif
        action  process_head, process_tail, process_inner_loop, exit_label, 0, 0
  .if src_bpp == 8 || src_bpp == 16
        b       exit_label
140:
        action  process_head, process_tail, process_inner_loop, exit_label, 1, 0
  .endif
 .if mask_bpp == 8 || mask_bpp == 16
        b       exit_label
141:
  .if src_bpp == 8 || src_bpp == 16
        tst     SRC, #3
        bne     142f
  .endif
        action  process_head, process_tail, process_inner_loop, exit_label, 0, 1
  .if src_bpp == 8 || src_bpp == 16
        b       exit_label
142:
        action  process_head, process_tail, process_inner_loop, exit_label, 1, 1
  .endif
 .endif
.endm


.macro end_of_line      restore_x, vars_spilled, loop_label, last_one
 .if SINGLE_SCANLINE
  .ifc "last_one",""
        b       198f
  .endif
 .else
 .if vars_spilled
        /* Sadly, GAS doesn't seem to have an equivalent of the DCI directive */
        /* This is ldmia sp,{} */
        .word   0xE89D0000 | LINE_SAVED_REGS
 .endif
        subs    Y, Y, #1
 .if vars_spilled
  .if (LINE_SAVED_REGS) & (1<<1)
        str     Y, [sp]
  .endif
 .endif
        add     DST, DST, STRIDE_D
 .if src_bpp > 0
        add     SRC, SRC, STRIDE_S
 .endif
 .if mask_bpp > 0
        add     MASK, MASK, STRIDE_M
 .endif
 .if restore_x
        mov     X, ORIG_W
 .endif
        bhs     loop_label
 .ifc "last_one",""
  .if vars_spilled
        b       197f
  .else
        b       198f
  .endif
 .else
  .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
        b       198f
  .endif
 .endif
 .endif
.endm


.macro generate_composite_function_common fname, \
                                          src_bpp_, \
                                          mask_bpp_, \
                                          dst_w_bpp_, \
                                          flags_, \
                                          prefetch_distance_, \
                                          init, \
                                          newline, \
                                          cleanup, \
                                          process_head, \
                                          process_tail, \
                                          process_inner_loop

    pixman_asm_function fname

/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
 .set src_bpp, src_bpp_
 .set mask_bpp, mask_bpp_
 .set dst_w_bpp, dst_w_bpp_
 .set flags, flags_
 .set prefetch_distance, prefetch_distance_

/*
 * Select prefetch type for this function.
 */
 .if prefetch_distance == 0
  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
 .else
  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD
 .endif

 .if src_bpp == 32
  .set src_bpp_shift, 2
 .elseif src_bpp == 24
  .set src_bpp_shift, 0
 .elseif src_bpp == 16
  .set src_bpp_shift, 1
 .elseif src_bpp == 8
  .set src_bpp_shift, 0
 .elseif src_bpp == 0
  .set src_bpp_shift, -1
 .else
  .error "requested src bpp (src_bpp) is not supported"
 .endif

 .if mask_bpp == 32
  .set mask_bpp_shift, 2
 .elseif mask_bpp == 24
  .set mask_bpp_shift, 0
 .elseif mask_bpp == 8
  .set mask_bpp_shift, 0
 .elseif mask_bpp == 0
  .set mask_bpp_shift, -1
 .else
  .error "requested mask bpp (mask_bpp) is not supported"
 .endif

 .if dst_w_bpp == 32
  .set dst_bpp_shift, 2
 .elseif dst_w_bpp == 24
  .set dst_bpp_shift, 0
 .elseif dst_w_bpp == 16
  .set dst_bpp_shift, 1
 .elseif dst_w_bpp == 8
  .set dst_bpp_shift, 0
 .else
  .error "requested dst bpp (dst_w_bpp) is not supported"
 .endif

 .if (((flags) & FLAG_DST_READWRITE) != 0)
  .set dst_r_bpp, dst_w_bpp
 .else
  .set dst_r_bpp, 0
 .endif

 .set pix_per_block, 16*8/dst_w_bpp
 .if src_bpp != 0
  .if 32*8/src_bpp > pix_per_block
   .set pix_per_block, 32*8/src_bpp
  .endif
 .endif
 .if mask_bpp != 0
  .if 32*8/mask_bpp > pix_per_block
   .set pix_per_block, 32*8/mask_bpp
  .endif
 .endif
 .if dst_r_bpp != 0
  .if 32*8/dst_r_bpp > pix_per_block
   .set pix_per_block, 32*8/dst_r_bpp
  .endif
 .endif

/* The standard entry conditions set up by pixman-arm-common.h are:
 * r0 = width (pixels)
 * r1 = height (rows)
 * r2 = pointer to top-left pixel of destination
 * r3 = destination stride (pixels)
 * [sp] = source pixel value, or pointer to top-left pixel of source
 * [sp,#4] = 0 or source stride (pixels)
 * The following arguments are unused for non-mask operations
 * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask
 * [sp,#12] = 0 or mask stride (pixels)
 *
 * or in the single-scanline case:
 * r0 = width (pixels)
 * r1 = pointer to top-left pixel of destination
 * r2 = pointer to top-left pixel of source
 * The following argument is unused for non-mask operations
 * r3 = pointer to top-left pixel of mask
 */
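
/* For example (hypothetical figures), a full composite over a 100x50
 * region would be entered with r0 = 100, r1 = 50, r2 addressing the
 * destination's top-left pixel and r3 holding the destination stride in
 * pixels, with any source/mask values or pointers and strides on the
 * stack as laid out above.
 */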

/*
 * Assign symbolic names to registers
 */
    X           .req    r0  /* pixels to go on this line */
 .if SINGLE_SCANLINE
    DST         .req    r1  /* destination pixel pointer */
    SRC         .req    r2  /* source pixel pointer */
    MASK        .req    r3  /* mask pixel pointer (if applicable) */
    Y           .req    r4  /* temporary */
    STRIDE_D    .req    r5  /* temporary */
    STRIDE_S    .req    r6  /* temporary */
    STRIDE_M    .req    r7  /* temporary */
 .else
    Y           .req    r1  /* lines to go */
    DST         .req    r2  /* destination pixel pointer */
    STRIDE_D    .req    r3  /* destination stride (bytes, minus width) */
    SRC         .req    r4  /* source pixel pointer */
    STRIDE_S    .req    r5  /* source stride (bytes, minus width) */
    MASK        .req    r6  /* mask pixel pointer (if applicable) */
    STRIDE_M    .req    r7  /* mask stride (bytes, minus width) */
 .endif
    WK0         .req    r8  /* pixel data registers */
    WK1         .req    r9
    WK2         .req    r10
    WK3         .req    r11
    SCRATCH     .req    r12
    ORIG_W      .req    r14 /* width (pixels) */

        push    {r4-r11, lr}        /* save all registers */

 .if !SINGLE_SCANLINE
        subs    Y, Y, #1
        blo     199f
 .endif

#ifdef DEBUG_PARAMS
        sub     sp, sp, #9*4
#endif

 .if !SINGLE_SCANLINE
 .if src_bpp > 0
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        ldr     STRIDE_S, [sp, #ARGS_STACK_OFFSET+4]
 .endif
 .if mask_bpp > 0
        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
        ldr     STRIDE_M, [sp, #ARGS_STACK_OFFSET+12]
 .endif
 .endif

#ifdef DEBUG_PARAMS
        add     Y, Y, #1
        stmia   sp, {r0-r7,pc}
        sub     Y, Y, #1
#endif

        init

 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
        /* Reserve a word in which to store X during leading pixels */
        sub     sp, sp, #4
  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+4
  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET+4
 .endif

 .if !SINGLE_SCANLINE
        lsl     STRIDE_D, #dst_bpp_shift /* stride in bytes */
        sub     STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift
 .if src_bpp > 0
        lsl     STRIDE_S, #src_bpp_shift
        sub     STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift
 .endif
 .if mask_bpp > 0
        lsl     STRIDE_M, #mask_bpp_shift
        sub     STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift
 .endif
 .endif

        /* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */
        cmp     X, #2*16*8/dst_w_bpp - 1
        blo     170f
 .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */
        /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */
        cmp     X, #(prefetch_distance+3)*pix_per_block - 1
        blo     160f

        /* Wide case */
        /* Adjust X so that the decrement instruction can also test for
         * inner loop termination. We want it to stop when there are
         * (prefetch_distance+1) complete blocks to go. */
        sub     X, X, #(prefetch_distance+2)*pix_per_block
  .if !SINGLE_SCANLINE
        mov     ORIG_W, X
  .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
        /* This is stmdb sp!,{} */
        .word   0xE92D0000 | LINE_SAVED_REGS
   .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
   .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
  .endif
  .endif
151:    /* New line */
        newline
        preload_leading_step1  src_bpp, WK1, SRC
        preload_leading_step1  mask_bpp, WK2, MASK
  .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
        preload_leading_step1  dst_r_bpp, WK3, DST
  .endif

        ands    WK0, DST, #15
        beq     154f
        rsb     WK0, WK0, #16 /* number of leading bytes until destination aligned */

        preload_leading_step2  src_bpp, src_bpp_shift, WK1, SRC
        preload_leading_step2  mask_bpp, mask_bpp_shift, WK2, MASK
  .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
        preload_leading_step2  dst_r_bpp, dst_bpp_shift, WK3, DST
  .endif

        leading_15bytes  process_head, process_tail

154:    /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        and     SCRATCH, SRC, #31
        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        and     SCRATCH, MASK, #31
        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
  .endif
  .ifc "process_inner_loop",""
        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
  .else
        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
  .endif

157:    /* Check for another line */
        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
  .if (!SINGLE_SCANLINE) && ((flags) & FLAG_SPILL_LINE_VARS_WIDE)
   .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
   .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
  .endif
 .endif

 .ltorg

160:    /* Medium case */
 .if !SINGLE_SCANLINE
        mov     ORIG_W, X
 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
        /* This is stmdb sp!,{} */
        .word   0xE92D0000 | LINE_SAVED_REGS
  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
 .endif
 .endif
161:    /* New line */
        newline
        preload_line 0, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
        preload_line 0, mask_bpp, mask_bpp_shift, MASK
 .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
        preload_line 0, dst_r_bpp, dst_bpp_shift, DST
 .endif

        sub     X, X, #128/dst_w_bpp     /* simplifies inner loop termination */
        ands    WK0, DST, #15
        beq     164f
        rsb     WK0, WK0, #16 /* number of leading bytes until destination aligned */

        leading_15bytes  process_head, process_tail

164:    /* Destination now 16-byte aligned; we have at least one 16-byte output block */
        switch_on_alignment  medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f

167:    /* Check for another line */
        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b

 .ltorg

170:    /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */
 .if !SINGLE_SCANLINE
 .if dst_w_bpp < 32
        mov     ORIG_W, X
 .endif
 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
        /* This is stmdb sp!,{} */
        .word   0xE92D0000 | LINE_SAVED_REGS
 .endif
 .endif
171:    /* New line */
        newline
        preload_line 1, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
        preload_line 1, mask_bpp, mask_bpp_shift, MASK
 .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
        preload_line 1, dst_r_bpp, dst_bpp_shift, DST
 .endif

 .if dst_w_bpp == 8
        tst     DST, #3
        beq     174f
172:    subs    X, X, #1
        blo     177f
        process_head  , 1, 0, 1, 1, 0
        process_tail  , 1, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 1, 0, DST
  .endif
        tst     DST, #3
        bne     172b
 .elseif dst_w_bpp == 16
        tst     DST, #2
        beq     174f
        subs    X, X, #1
        blo     177f
        process_head  , 2, 0, 1, 1, 0
        process_tail  , 2, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 2, 0, DST
  .endif
 .endif

174:    /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
        switch_on_alignment  narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f

177:    /* Check for another line */
        end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one
 .if (!SINGLE_SCANLINE) && ((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE)
  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
 .endif

197:
 .if (!SINGLE_SCANLINE) && ((flags) & FLAG_SPILL_LINE_VARS)
        add     sp, sp, #LINE_SAVED_REG_COUNT*4
 .endif
198:
 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-4
  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET-4
        add     sp, sp, #4
 .endif

        cleanup

#ifdef DEBUG_PARAMS
        add     sp, sp, #9*4 /* junk the debug copy of arguments */
#endif
199:
        pop     {r4-r11, pc}  /* exit */

 .ltorg

    .unreq  X
    .unreq  Y
    .unreq  DST
    .unreq  STRIDE_D
    .unreq  SRC
    .unreq  STRIDE_S
    .unreq  MASK
    .unreq  STRIDE_M
    .unreq  WK0
    .unreq  WK1
    .unreq  WK2
    .unreq  WK3
    .unreq  SCRATCH
    .unreq  ORIG_W
    .endfunc
.endm

.macro generate_composite_function fname, \
                                   src_bpp_, \
                                   mask_bpp_, \
                                   dst_w_bpp_, \
                                   flags_, \
                                   prefetch_distance_, \
                                   init, \
                                   newline, \
                                   cleanup, \
                                   process_head, \
                                   process_tail, \
                                   process_inner_loop
 .set SINGLE_SCANLINE, 0
generate_composite_function_common \
    fname, src_bpp_, mask_bpp_, dst_w_bpp_, flags_, prefetch_distance_, \
    init, newline, cleanup, process_head, process_tail, process_inner_loop
.endm
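
/*
 * A minimal sketch of a caller (the example_* names are hypothetical; a
 * real caller supplies macros implementing the per-pixel arithmetic, and
 * nop_macro stands in for the init/newline/cleanup hooks when no work is
 * needed there):
 *
 *  generate_composite_function \
 *      pixman_composite_example_asm_armv6, 32, 0, 32, \
 *      FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR, \
 *      2, \
 *      nop_macro, nop_macro, nop_macro, \
 *      example_process_head, \
 *      example_process_tail
 */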

.macro generate_composite_function_single_scanline fname, \
                                                   src_bpp_, \
                                                   mask_bpp_, \
                                                   dst_w_bpp_, \
                                                   flags_, \
                                                   prefetch_distance_, \
                                                   init, \
                                                   newline, \
                                                   cleanup, \
                                                   process_head, \
                                                   process_tail, \
                                                   process_inner_loop
 .set SINGLE_SCANLINE, 1
generate_composite_function_common \
    fname, src_bpp_, mask_bpp_, dst_w_bpp_, flags_, prefetch_distance_, \
    init, newline, cleanup, process_head, process_tail, process_inner_loop
.endm

.macro line_saved_regs  x:vararg
 .set LINE_SAVED_REGS, 0
 .set LINE_SAVED_REG_COUNT, 0
 .irp SAVED_REG,x
  .ifc "SAVED_REG","Y"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_D"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_S"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_M"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","ORIG_W"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
 .endr
 .if SINGLE_SCANLINE
  .set LINE_SAVED_REG_COUNT, 0
 .endif
.endm
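
/* For example, "line_saved_regs STRIDE_D, ORIG_W" sets LINE_SAVED_REGS to
 * (1<<3)|(1<<14) (r3 and r14 in the multi-line register assignment above)
 * and LINE_SAVED_REG_COUNT to 2; in single-scanline functions the count is
 * forced to 0, since no per-line spill is performed.
 */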

.macro nop_macro x:vararg
.endm
