1 /* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
2 /*
3  * Copyright © 2000 SuSE, Inc.
4  * Copyright © 2007 Red Hat, Inc.
5  *
6  * Permission to use, copy, modify, distribute, and sell this software and its
7  * documentation for any purpose is hereby granted without fee, provided that
8  * the above copyright notice appear in all copies and that both that
9  * copyright notice and this permission notice appear in supporting
10  * documentation, and that the name of SuSE not be used in advertising or
11  * publicity pertaining to distribution of the software without specific,
12  * written prior permission.  SuSE makes no representations about the
13  * suitability of this software for any purpose.  It is provided "as is"
14  * without express or implied warranty.
15  *
16  * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
18  * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
20  * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
21  * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
22  *
23  * Author:  Keith Packard, SuSE, Inc.
24  */
25 
26 #ifndef PIXMAN_FAST_PATH_H__
27 #define PIXMAN_FAST_PATH_H__
28 
29 #include "pixman-private.h"
30 
/* Extra repeat-mode value consumed by the fast path templates in this file
 * through the 'PIXMAN_REPEAT_ ## repeat_mode' token pasting, next to the
 * regular PIXMAN_REPEAT_* values. The nearest templates only test for
 * NORMAL, PAD and NONE, so this value selects the path without any repeat
 * handling.
 *
 * The negative literal is parenthesized so that it stays a single operand
 * in whatever expression the macro gets expanded into.
 */
#define PIXMAN_REPEAT_COVER (-1)
32 
/* Bit flags describing the input parameters of a fast path macro template.
 * Depending on the flag, a set bit means either
 * "some property X is available, so the template can use it" or
 * "some property X has to be handled by the template".
 *
 * FLAG_NONE
 *  No flag is set.
 *
 * FLAG_HAVE_SOLID_MASK
 *  The input mask is solid, so the template should handle this.
 *
 * FLAG_HAVE_NON_SOLID_MASK
 *  The input mask is a bits mask, so the template should handle this.
 *
 * FLAG_HAVE_SOLID_MASK and FLAG_HAVE_NON_SOLID_MASK are mutually
 * exclusive: turning both of them on at once is not allowed.
 */
#define FLAG_NONE				(0)
#define FLAG_HAVE_SOLID_MASK			(1 << 1)
#define FLAG_HAVE_NON_SOLID_MASK		(1 << 2)
50 
/* Source scanlines narrower than this many pixels get extended, so that
 * the scanline function is not called over and over again for very short
 * runs.
 */
#define REPEAT_NORMAL_MIN_WIDTH			64
55 
56 static force_inline pixman_bool_t
repeat(pixman_repeat_t repeat,int * c,int size)57 repeat (pixman_repeat_t repeat, int *c, int size)
58 {
59     if (repeat == PIXMAN_REPEAT_NONE)
60     {
61 	if (*c < 0 || *c >= size)
62 	    return FALSE;
63     }
64     else if (repeat == PIXMAN_REPEAT_NORMAL)
65     {
66 	while (*c >= size)
67 	    *c -= size;
68 	while (*c < 0)
69 	    *c += size;
70     }
71     else if (repeat == PIXMAN_REPEAT_PAD)
72     {
73 	*c = CLIP (*c, 0, size - 1);
74     }
75     else /* REFLECT */
76     {
77 	*c = MOD (*c, size * 2);
78 	if (*c >= size)
79 	    *c = size * 2 - *c - 1;
80     }
81     return TRUE;
82 }
83 
84 static force_inline int
pixman_fixed_to_bilinear_weight(pixman_fixed_t x)85 pixman_fixed_to_bilinear_weight (pixman_fixed_t x)
86 {
87     return (x >> (16 - BILINEAR_INTERPOLATION_BITS)) &
88 	   ((1 << BILINEAR_INTERPOLATION_BITS) - 1);
89 }
90 
91 #if BILINEAR_INTERPOLATION_BITS <= 4
92 /* Inspired by Filter_32_opaque from Skia */
93 static force_inline uint32_t
bilinear_interpolation(uint32_t tl,uint32_t tr,uint32_t bl,uint32_t br,int distx,int disty)94 bilinear_interpolation (uint32_t tl, uint32_t tr,
95 			uint32_t bl, uint32_t br,
96 			int distx, int disty)
97 {
98     int distxy, distxiy, distixy, distixiy;
99     uint32_t lo, hi;
100 
101     distx <<= (4 - BILINEAR_INTERPOLATION_BITS);
102     disty <<= (4 - BILINEAR_INTERPOLATION_BITS);
103 
104     distxy = distx * disty;
105     distxiy = (distx << 4) - distxy;	/* distx * (16 - disty) */
106     distixy = (disty << 4) - distxy;	/* disty * (16 - distx) */
107     distixiy =
108 	16 * 16 - (disty << 4) -
109 	(distx << 4) + distxy; /* (16 - distx) * (16 - disty) */
110 
111     lo = (tl & 0xff00ff) * distixiy;
112     hi = ((tl >> 8) & 0xff00ff) * distixiy;
113 
114     lo += (tr & 0xff00ff) * distxiy;
115     hi += ((tr >> 8) & 0xff00ff) * distxiy;
116 
117     lo += (bl & 0xff00ff) * distixy;
118     hi += ((bl >> 8) & 0xff00ff) * distixy;
119 
120     lo += (br & 0xff00ff) * distxy;
121     hi += ((br >> 8) & 0xff00ff) * distxy;
122 
123     return ((lo >> 8) & 0xff00ff) | (hi & ~0xff00ff);
124 }
125 
126 #else
127 #if SIZEOF_LONG > 4
128 
129 static force_inline uint32_t
bilinear_interpolation(uint32_t tl,uint32_t tr,uint32_t bl,uint32_t br,int distx,int disty)130 bilinear_interpolation (uint32_t tl, uint32_t tr,
131 			uint32_t bl, uint32_t br,
132 			int distx, int disty)
133 {
134     uint64_t distxy, distxiy, distixy, distixiy;
135     uint64_t tl64, tr64, bl64, br64;
136     uint64_t f, r;
137 
138     distx <<= (8 - BILINEAR_INTERPOLATION_BITS);
139     disty <<= (8 - BILINEAR_INTERPOLATION_BITS);
140 
141     distxy = distx * disty;
142     distxiy = distx * (256 - disty);
143     distixy = (256 - distx) * disty;
144     distixiy = (256 - distx) * (256 - disty);
145 
146     /* Alpha and Blue */
147     tl64 = tl & 0xff0000ff;
148     tr64 = tr & 0xff0000ff;
149     bl64 = bl & 0xff0000ff;
150     br64 = br & 0xff0000ff;
151 
152     f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
153     r = f & 0x0000ff0000ff0000ull;
154 
155     /* Red and Green */
156     tl64 = tl;
157     tl64 = ((tl64 << 16) & 0x000000ff00000000ull) | (tl64 & 0x0000ff00ull);
158 
159     tr64 = tr;
160     tr64 = ((tr64 << 16) & 0x000000ff00000000ull) | (tr64 & 0x0000ff00ull);
161 
162     bl64 = bl;
163     bl64 = ((bl64 << 16) & 0x000000ff00000000ull) | (bl64 & 0x0000ff00ull);
164 
165     br64 = br;
166     br64 = ((br64 << 16) & 0x000000ff00000000ull) | (br64 & 0x0000ff00ull);
167 
168     f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
169     r |= ((f >> 16) & 0x000000ff00000000ull) | (f & 0xff000000ull);
170 
171     return (uint32_t)(r >> 16);
172 }
173 
174 #else
175 
176 static force_inline uint32_t
bilinear_interpolation(uint32_t tl,uint32_t tr,uint32_t bl,uint32_t br,int distx,int disty)177 bilinear_interpolation (uint32_t tl, uint32_t tr,
178 			uint32_t bl, uint32_t br,
179 			int distx, int disty)
180 {
181     int distxy, distxiy, distixy, distixiy;
182     uint32_t f, r;
183 
184     distx <<= (8 - BILINEAR_INTERPOLATION_BITS);
185     disty <<= (8 - BILINEAR_INTERPOLATION_BITS);
186 
187     distxy = distx * disty;
188     distxiy = (distx << 8) - distxy;	/* distx * (256 - disty) */
189     distixy = (disty << 8) - distxy;	/* disty * (256 - distx) */
190     distixiy =
191 	256 * 256 - (disty << 8) -
192 	(distx << 8) + distxy;		/* (256 - distx) * (256 - disty) */
193 
194     /* Blue */
195     r = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
196       + (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy;
197 
198     /* Green */
199     f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
200       + (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy;
201     r |= f & 0xff000000;
202 
203     tl >>= 16;
204     tr >>= 16;
205     bl >>= 16;
206     br >>= 16;
207     r >>= 16;
208 
209     /* Red */
210     f = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
211       + (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy;
212     r |= f & 0x00ff0000;
213 
214     /* Alpha */
215     f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
216       + (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy;
217     r |= f & 0xff000000;
218 
219     return r;
220 }
221 
222 #endif
223 #endif // BILINEAR_INTERPOLATION_BITS <= 4
224 
225 static force_inline argb_t
bilinear_interpolation_float(argb_t tl,argb_t tr,argb_t bl,argb_t br,float distx,float disty)226 bilinear_interpolation_float (argb_t tl, argb_t tr,
227 			      argb_t bl, argb_t br,
228 			      float distx, float disty)
229 {
230     float distxy, distxiy, distixy, distixiy;
231     argb_t r;
232 
233     distxy = distx * disty;
234     distxiy = distx * (1.f - disty);
235     distixy = (1.f - distx) * disty;
236     distixiy = (1.f - distx) * (1.f - disty);
237 
238     r.a = tl.a * distixiy + tr.a * distxiy +
239           bl.a * distixy  + br.a * distxy;
240     r.r = tl.r * distixiy + tr.r * distxiy +
241           bl.r * distixy  + br.r * distxy;
242     r.g = tl.g * distixiy + tr.g * distxiy +
243           bl.g * distixy  + br.g * distxy;
244     r.b = tl.b * distixiy + tr.b * distxiy +
245           bl.b * distixy  + br.b * distxy;
246 
247     return r;
248 }
249 
250 /*
251  * For each scanline fetched from source image with PAD repeat:
252  * - calculate how many pixels need to be padded on the left side
253  * - calculate how many pixels need to be padded on the right side
254  * - update width to only count pixels which are fetched from the image
255  * All this information is returned via 'width', 'left_pad', 'right_pad'
256  * arguments. The code is assuming that 'unit_x' is positive.
257  *
258  * Note: 64-bit math is used in order to avoid potential overflows, which
259  *       is probably excessive in many cases. This particular function
260  *       may need its own correctness test and performance tuning.
261  */
262 static force_inline void
pad_repeat_get_scanline_bounds(int32_t source_image_width,pixman_fixed_t vx,pixman_fixed_t unit_x,int32_t * width,int32_t * left_pad,int32_t * right_pad)263 pad_repeat_get_scanline_bounds (int32_t         source_image_width,
264 				pixman_fixed_t  vx,
265 				pixman_fixed_t  unit_x,
266 				int32_t *       width,
267 				int32_t *       left_pad,
268 				int32_t *       right_pad)
269 {
270     int64_t max_vx = (int64_t) source_image_width << 16;
271     int64_t tmp;
272     if (vx < 0)
273     {
274 	tmp = ((int64_t) unit_x - 1 - vx) / unit_x;
275 	if (tmp > *width)
276 	{
277 	    *left_pad = *width;
278 	    *width = 0;
279 	}
280 	else
281 	{
282 	    *left_pad = (int32_t) tmp;
283 	    *width -= (int32_t) tmp;
284 	}
285     }
286     else
287     {
288 	*left_pad = 0;
289     }
290     tmp = ((int64_t) unit_x - 1 - vx + max_vx) / unit_x - *left_pad;
291     if (tmp < 0)
292     {
293 	*right_pad = *width;
294 	*width = 0;
295     }
296     else if (tmp >= *width)
297     {
298 	*right_pad = 0;
299     }
300     else
301     {
302 	*right_pad = *width - (int32_t) tmp;
303 	*width = (int32_t) tmp;
304     }
305 }
306 
/* A macroified version of specialized nearest scalers for some
 * common 8888 and 565 formats. It supports SRC and OVER ops.
 *
 * There are two repeat versions, one that handles NORMAL repeat,
 * and one without repeat handling that only works if the src region
 * used is completely covered by the pre-repeated source samples.
 *
 * The loops are unrolled to process two pixels per iteration for better
 * performance on most CPU architectures (superscalar processors
 * can issue several operations simultaneously, other processors can hide
 * instruction latencies by pipelining operations). Unrolling more
 * does not make much sense because the compiler will start running out
 * of spare registers soon.
 */
321 
/* Alpha extraction helpers, picked by FAST_NEAREST_SCANLINE through
 * 'GET_ ## SRC_FORMAT ## _ALPHA'.
 */

/* 8888: alpha lives in the topmost byte of the pixel */
#define GET_8888_ALPHA(s) ((s) >> 24)

/* 0565 and x888 pixels are always reported as fully opaque. The 0565
 * variant is not actually used since there is no OVER with a 565 source,
 * but it is needed to build.
 */
#define GET_0565_ALPHA(s) 0xff
#define GET_x888_ALPHA(s) 0xff
327 
/*
 * Defines 'scanline_func_name', a nearest neighbour scaling function for
 * a single scanline.
 *
 * Template arguments:
 *  scanline_func_name - name of the generated function
 *  SRC_FORMAT         - source format tag, pasted into the names of the
 *                       'convert_*' and 'GET_*_ALPHA' helpers
 *  DST_FORMAT         - destination format tag, pasted likewise
 *  src_type_t         - C type of a source pixel
 *  dst_type_t         - C type of a destination pixel
 *  OP                 - SRC or OVER, any other operator calls abort()
 *  repeat_mode        - repeat mode suffix, only NORMAL changes the code
 *
 * Arguments of the generated function:
 *  dst                   - destination pixels
 *  src                   - source scanline, indexed by
 *                          pixman_fixed_to_int (vx)
 *  w                     - number of pixels to write
 *  vx                    - source position of the first pixel
 *  unit_x                - source position increment per pixel
 *  src_width_fixed       - source width as a fixed point value; used for
 *                          wrapping 'vx' with NORMAL repeat
 *  fully_transparent_src - TRUE if all source pixels are known to be
 *                          transparent; OVER returns right away then
 *
 * With NORMAL repeat 'vx' is wrapped by subtracting 'src_width_fixed' for
 * as long as it is not negative, so 'src' has to point past the end of the
 * scanline and 'vx' has to be negative on entry.
 */
#define FAST_NEAREST_SCANLINE(scanline_func_name, SRC_FORMAT, DST_FORMAT,			\
			      src_type_t, dst_type_t, OP, repeat_mode)				\
static force_inline void									\
scanline_func_name (dst_type_t       *dst,							\
		    const src_type_t *src,							\
		    int32_t           w,							\
		    pixman_fixed_t    vx,							\
		    pixman_fixed_t    unit_x,							\
		    pixman_fixed_t    src_width_fixed,						\
		    pixman_bool_t     fully_transparent_src)					\
{												\
    src_type_t px_a, px_b;									\
    uint32_t   blended;										\
    uint8_t    alpha_a, alpha_b;								\
    int        idx_a, idx_b;									\
												\
    /* OVER with a fully transparent source leaves the destination as is */			\
    if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER && fully_transparent_src)				\
	return;											\
												\
    /* Only SRC and OVER are implemented by this template */					\
    if (PIXMAN_OP_ ## OP != PIXMAN_OP_SRC && PIXMAN_OP_ ## OP != PIXMAN_OP_OVER)		\
	abort();										\
												\
    /* Main loop, two pixels per iteration */							\
    for (; w >= 2; w -= 2)									\
    {												\
	idx_a = pixman_fixed_to_int (vx);							\
	vx += unit_x;										\
	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
	{											\
	    /* This works because we know that unit_x is positive */				\
	    while (vx >= 0)									\
		vx -= src_width_fixed;								\
	}											\
	px_a = src[idx_a];									\
												\
	idx_b = pixman_fixed_to_int (vx);							\
	vx += unit_x;										\
	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
	{											\
	    /* This works because we know that unit_x is positive */				\
	    while (vx >= 0)									\
		vx -= src_width_fixed;								\
	}											\
	px_b = src[idx_b];									\
												\
	if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)							\
	{											\
	    alpha_a = GET_ ## SRC_FORMAT ## _ALPHA(px_a);					\
	    alpha_b = GET_ ## SRC_FORMAT ## _ALPHA(px_b);					\
												\
	    /* Opaque pixels are copied, all zero pixels are skipped and */			\
	    /* everything else is blended into the destination */				\
	    if (alpha_a == 0xff)								\
	    {											\
		*dst = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (px_a);			\
	    }											\
	    else if (px_a)									\
	    {											\
		blended = convert_ ## DST_FORMAT ## _to_8888 (*dst);				\
		px_a = convert_ ## SRC_FORMAT ## _to_8888 (px_a);				\
		alpha_a ^= 0xff;								\
		UN8x4_MUL_UN8_ADD_UN8x4 (blended, alpha_a, px_a);				\
		*dst = convert_8888_to_ ## DST_FORMAT (blended);				\
	    }											\
	    dst++;										\
												\
	    if (alpha_b == 0xff)								\
	    {											\
		*dst = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (px_b);			\
	    }											\
	    else if (px_b)									\
	    {											\
		blended = convert_ ## DST_FORMAT ## _to_8888 (*dst);				\
		px_b = convert_ ## SRC_FORMAT ## _to_8888 (px_b);				\
		alpha_b ^= 0xff;								\
		UN8x4_MUL_UN8_ADD_UN8x4 (blended, alpha_b, px_b);				\
		*dst = convert_8888_to_ ## DST_FORMAT (blended);				\
	    }											\
	    dst++;										\
	}											\
	else /* PIXMAN_OP_SRC */								\
	{											\
	    *dst++ = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (px_a);			\
	    *dst++ = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (px_b);			\
	}											\
    }												\
												\
    /* Leftover pixel of a scanline with an odd length */					\
    if (w & 1)											\
    {												\
	idx_a = pixman_fixed_to_int (vx);							\
	px_a = src[idx_a];									\
												\
	if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)							\
	{											\
	    alpha_a = GET_ ## SRC_FORMAT ## _ALPHA(px_a);					\
												\
	    if (alpha_a == 0xff)								\
	    {											\
		*dst = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (px_a);			\
	    }											\
	    else if (px_a)									\
	    {											\
		blended = convert_ ## DST_FORMAT ## _to_8888 (*dst);				\
		px_a = convert_ ## SRC_FORMAT ## _to_8888 (px_a);				\
		alpha_a ^= 0xff;								\
		UN8x4_MUL_UN8_ADD_UN8x4 (blended, alpha_a, px_a);				\
		*dst = convert_8888_to_ ## DST_FORMAT (blended);				\
	    }											\
	    dst++;										\
	}											\
	else /* PIXMAN_OP_SRC */								\
	{											\
	    *dst++ = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (px_a);			\
	}											\
    }												\
}
441 
/*
 * Defines 'fast_composite_scaled_nearest<scale_func_name>', the composite
 * function that walks over the destination rows and hands every row over
 * to 'scanline_func'.
 *
 * Template arguments:
 *  scale_func_name - suffix of the generated function name
 *  scanline_func   - called as (mask, dst, src, w, vx, unit_x,
 *                    src_width_fixed, fully_transparent_src)
 *  src_type_t, mask_type_t, dst_type_t - pixel types of the three images
 *  repeat_mode     - repeat mode suffix (NORMAL, PAD, NONE or COVER)
 *  have_mask       - non-zero if a mask image takes part
 *  mask_is_solid   - non-zero if the mask is solid; 'mask' then points to
 *                    a single value and is not advanced
 *
 * Conventions used for the 'scanline_func' calls:
 *  - For pixels taken from the image, 'src' points past the end of the
 *    source row and 'vx' is lowered by 'src_width_fixed' to make up for
 *    it. This is what lets the scanline function wrap 'vx' for NORMAL
 *    repeat with a plain 'vx >= 0' check.
 *  - Padding is drawn with vx = -pixman_fixed_e and unit_x = 0, which
 *    samples one and the same pixel over and over again. That pixel is
 *    presumably the one right before the pointer passed in, i.e.
 *    pixman_fixed_to_int (-pixman_fixed_e) is expected to be -1.
 */
#define FAST_NEAREST_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
				  dst_type_t, repeat_mode, have_mask, mask_is_solid)		\
static void											\
fast_composite_scaled_nearest  ## scale_func_name (pixman_implementation_t *imp,		\
						   pixman_composite_info_t *info)		\
{												\
    PIXMAN_COMPOSITE_ARGS (info);								\
    pixman_fixed_t src_width_fixed = pixman_int_to_fixed (src_image->bits.width);		\
    pixman_fixed_t max_vy;									\
    pixman_fixed_t vx, vy;									\
    pixman_fixed_t unit_x, unit_y;								\
    pixman_vector_t v;										\
    int32_t left_pad, right_pad;								\
    int y;											\
												\
    dst_type_t *dst_line, *dst;									\
    src_type_t *src_first_line, *src;								\
    mask_type_t *mask_line;									\
    mask_type_t solid_mask;									\
    const mask_type_t *mask = &solid_mask;							\
    int src_stride, mask_stride, dst_stride;							\
												\
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type_t, dst_stride, dst_line, 1);	\
    if (have_mask)										\
    {												\
	if (mask_is_solid)									\
	{											\
	    solid_mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);	\
	}											\
	else											\
	{											\
	    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type_t,			\
				   mask_stride, mask_line, 1);					\
	}											\
    }												\
    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be			\
     * transformed from destination space to source space */					\
    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1);		\
												\
    /* reference point is the center of the pixel */						\
    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;				\
    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;				\
    v.vector[2] = pixman_fixed_1;								\
												\
    if (!pixman_transform_point_3d (src_image->common.transform, &v))				\
	return;											\
												\
    /* Source position increments per destination pixel and per row */				\
    unit_x = src_image->common.transform->matrix[0][0];						\
    unit_y = src_image->common.transform->matrix[1][1];						\
												\
    /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */			\
    vx = v.vector[0] - pixman_fixed_e;								\
    vy = v.vector[1] - pixman_fixed_e;								\
												\
    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)					\
    {												\
	max_vy = pixman_int_to_fixed (src_image->bits.height);					\
												\
	/* Clamp repeating positions inside the actual samples */				\
	repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed);					\
	repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
    }												\
												\
    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD ||					\
	PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)					\
    {												\
	/* Split the row into left pad, image pixels and right pad, then */			\
	/* let vx start at the first pixel taken from the image */				\
	pad_repeat_get_scanline_bounds (src_image->bits.width, vx, unit_x,			\
					&width, &left_pad, &right_pad);				\
	vx += left_pad * unit_x;								\
    }												\
												\
    while (--height >= 0)									\
    {												\
	dst = dst_line;										\
	dst_line += dst_stride;									\
	if (have_mask && !mask_is_solid)							\
	{											\
	    mask = mask_line;									\
	    mask_line += mask_stride;								\
	}											\
												\
	y = pixman_fixed_to_int (vy);								\
	vy += unit_y;										\
	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
	    repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
												\
	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
	{											\
	    repeat (PIXMAN_REPEAT_PAD, &y, src_image->bits.height);				\
	    src = src_first_line + src_stride * y;						\
	    if (left_pad > 0)									\
	    {											\
		/* Left pad, 'src + 1' selects the leftmost pixel of the row */			\
		scanline_func (mask, dst, src + 1,						\
			       left_pad, -pixman_fixed_e, 0, src_width_fixed, FALSE);		\
	    }											\
	    if (width > 0)									\
	    {											\
		scanline_func (mask + (mask_is_solid ? 0 : left_pad),				\
			       dst + left_pad, src + src_image->bits.width, width,		\
			       vx - src_width_fixed, unit_x, src_width_fixed, FALSE);		\
	    }											\
	    if (right_pad > 0)									\
	    {											\
		/* Right pad, this selects the rightmost pixel of the row */			\
		scanline_func (mask + (mask_is_solid ? 0 : left_pad + width),			\
			       dst + left_pad + width, src + src_image->bits.width,		\
			       right_pad, -pixman_fixed_e, 0, src_width_fixed, FALSE);		\
	    }											\
	}											\
	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)				\
	{											\
	    /* Stands in for everything that is sampled outside of the image */			\
	    static const src_type_t zero[1] = { 0 };						\
												\
	    if (y < 0 || y >= src_image->bits.height)						\
	    {											\
		/* The whole row is outside of the image */					\
		scanline_func (mask, dst, zero + 1, left_pad + width + right_pad,		\
			       -pixman_fixed_e, 0, src_width_fixed, TRUE);			\
	    }											\
	    else										\
	    {											\
		src = src_first_line + src_stride * y;						\
		if (left_pad > 0)								\
		{										\
		    scanline_func (mask, dst, zero + 1, left_pad,				\
				   -pixman_fixed_e, 0, src_width_fixed, TRUE);			\
		}										\
		if (width > 0)									\
		{										\
		    scanline_func (mask + (mask_is_solid ? 0 : left_pad),			\
				   dst + left_pad, src + src_image->bits.width, width,		\
				   vx - src_width_fixed, unit_x, src_width_fixed, FALSE);	\
		}										\
		if (right_pad > 0)								\
		{										\
		    scanline_func (mask + (mask_is_solid ? 0 : left_pad + width),		\
				   dst + left_pad + width, zero + 1, right_pad,			\
				   -pixman_fixed_e, 0, src_width_fixed, TRUE);			\
		}										\
	    }											\
	}											\
	else											\
	{											\
	    /* No padding to take care of, the row is done in a single call */			\
	    src = src_first_line + src_stride * y;						\
	    scanline_func (mask, dst, src + src_image->bits.width, width,			\
			   vx - src_width_fixed, unit_x, src_width_fixed, FALSE);		\
	}											\
    }												\
}
587 
/* Same as FAST_NEAREST_MAINLOOP_INT, except that an underscore is put in
 * front of 'scale_func_name', so that the generated function is called
 * 'fast_composite_scaled_nearest_<scale_func_name>'.
 *
 * A workaround for old sun studio, see:
 * https://bugs.freedesktop.org/show_bug.cgi?id=32764
 */
#define FAST_NEAREST_MAINLOOP_COMMON(scale_func_name, scanline_func,				\
				     src_type_t, mask_type_t, dst_type_t,			\
				     repeat_mode, have_mask, mask_is_solid)			\
    FAST_NEAREST_MAINLOOP_INT (_ ## scale_func_name, scanline_func,				\
			       src_type_t, mask_type_t, dst_type_t,				\
			       repeat_mode, have_mask, mask_is_solid)
593 
/* Main loop for scanline functions that do not take a mask.
 *
 * Defines '<scanline_func><scale_func_name>_wrapper', an adapter with the
 * signature FAST_NEAREST_MAINLOOP_INT expects which drops the mask
 * argument and forwards everything else to 'scanline_func'. The main loop
 * itself is then instantiated on top of that adapter with a uint8_t mask
 * type and both 'have_mask' and 'mask_is_solid' set to FALSE.
 */
#define FAST_NEAREST_MAINLOOP_NOMASK(scale_func_name, scanline_func, src_type_t, dst_type_t,	\
				     repeat_mode)						\
    static force_inline void									\
    scanline_func##scale_func_name##_wrapper (const uint8_t    *mask,				\
					      dst_type_t       *dst,				\
					      const src_type_t *src,				\
					      int32_t           w,				\
					      pixman_fixed_t    vx,				\
					      pixman_fixed_t    unit_x,				\
					      pixman_fixed_t    src_width_fixed,		\
					      pixman_bool_t     fully_transparent_src)		\
    {												\
	/* 'mask' is not used on purpose */							\
	scanline_func (dst, src, w, vx, unit_x, src_width_fixed, fully_transparent_src);	\
    }												\
    FAST_NEAREST_MAINLOOP_INT (scale_func_name, scanline_func##scale_func_name##_wrapper,	\
			       src_type_t, uint8_t, dst_type_t, repeat_mode, FALSE, FALSE)
611 
/* Same as FAST_NEAREST_MAINLOOP_NOMASK, except that an underscore is put
 * in front of 'scale_func_name', so that the generated function is called
 * 'fast_composite_scaled_nearest_<scale_func_name>'.
 */
#define FAST_NEAREST_MAINLOOP(scale_func_name, scanline_func, src_type_t, dst_type_t,		\
			      repeat_mode)							\
    FAST_NEAREST_MAINLOOP_NOMASK (_ ## scale_func_name, scanline_func,				\
				  src_type_t, dst_type_t, repeat_mode)
616 
/* Generates a complete maskless nearest scaling fast path in one go:
 * - the scanline function 'scaled_nearest_scanline_<scale_func_name>_<OP>'
 *   (see FAST_NEAREST_SCANLINE)
 * - the composite function
 *   'fast_composite_scaled_nearest_<scale_func_name>_<OP>' built on top of
 *   it (see FAST_NEAREST_MAINLOOP_NOMASK)
 */
#define FAST_NEAREST(scale_func_name, SRC_FORMAT, DST_FORMAT,				\
		     src_type_t, dst_type_t, OP, repeat_mode)				\
    FAST_NEAREST_SCANLINE (scaled_nearest_scanline_ ## scale_func_name ## _ ## OP,	\
			   SRC_FORMAT, DST_FORMAT, src_type_t, dst_type_t,		\
			   OP, repeat_mode)						\
    FAST_NEAREST_MAINLOOP_NOMASK (_ ## scale_func_name ## _ ## OP,			\
				  scaled_nearest_scanline_ ## scale_func_name ## _ ## OP, \
				  src_type_t, dst_type_t, repeat_mode)
625 
626 
/* Source image flags shared by all SIMPLE_NEAREST_*_FAST_PATH_* entries */
#define SCALED_NEAREST_FLAGS			\
    (FAST_PATH_SCALE_TRANSFORM |		\
     FAST_PATH_NO_ALPHA_MAP |			\
     FAST_PATH_NEAREST_FILTER |			\
     FAST_PATH_NO_ACCESSORS |			\
     FAST_PATH_NARROW_FORMAT)
633 
/* Fast path table entries for nearest scaling without a mask.
 *
 * Every macro expands to a braced initializer listing, in this order: the
 * operator, the source format and flags, the mask format and flags
 * (PIXMAN_null and 0 here), the destination format and flags, and the
 * composite function 'fast_composite_scaled_nearest_<func>_<repeat>_<op>'.
 *
 *  op   - operator suffix, pasted to PIXMAN_OP_
 *  s, d - source and destination format suffixes, pasted to PIXMAN_
 *  func - name part of the composite function
 *
 * The NORMAL, PAD and NONE entries ask for the respective repeat mode and
 * for a positive x unit. The COVER entry asks for
 * FAST_PATH_SAMPLES_COVER_CLIP_NEAREST instead.
 */
#define SIMPLE_NEAREST_FAST_PATH_NORMAL(op,s,d,func)					\
    {											\
	PIXMAN_OP_ ## op,								\
	PIXMAN_ ## s,									\
	(SCALED_NEAREST_FLAGS | FAST_PATH_NORMAL_REPEAT | FAST_PATH_X_UNIT_POSITIVE),	\
	PIXMAN_null, 0,									\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,						\
	fast_composite_scaled_nearest_ ## func ## _normal_ ## op,			\
    }

#define SIMPLE_NEAREST_FAST_PATH_PAD(op,s,d,func)					\
    {											\
	PIXMAN_OP_ ## op,								\
	PIXMAN_ ## s,									\
	(SCALED_NEAREST_FLAGS | FAST_PATH_PAD_REPEAT | FAST_PATH_X_UNIT_POSITIVE),	\
	PIXMAN_null, 0,									\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,						\
	fast_composite_scaled_nearest_ ## func ## _pad_ ## op,				\
    }

#define SIMPLE_NEAREST_FAST_PATH_NONE(op,s,d,func)					\
    {											\
	PIXMAN_OP_ ## op,								\
	PIXMAN_ ## s,									\
	(SCALED_NEAREST_FLAGS | FAST_PATH_NONE_REPEAT | FAST_PATH_X_UNIT_POSITIVE),	\
	PIXMAN_null, 0,									\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,						\
	fast_composite_scaled_nearest_ ## func ## _none_ ## op,				\
    }

#define SIMPLE_NEAREST_FAST_PATH_COVER(op,s,d,func)					\
    {											\
	PIXMAN_OP_ ## op,								\
	PIXMAN_ ## s,									\
	(SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST),			\
	PIXMAN_null, 0,									\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,						\
	fast_composite_scaled_nearest_ ## func ## _cover_ ## op,			\
    }
675 
/* Fast path table entries for nearest scaling with an a8 mask.
 *
 * Every macro expands to a braced initializer listing, in this order: the
 * operator, the source format and flags, the mask format and flags
 * (PIXMAN_a8 with unified alpha here), the destination format and flags,
 * and the composite function
 * 'fast_composite_scaled_nearest_<func>_<repeat>_<op>'.
 *
 *  op   - operator suffix, pasted to PIXMAN_OP_
 *  s, d - source and destination format suffixes, pasted to PIXMAN_
 *  func - name part of the composite function
 *
 * The NORMAL, PAD and NONE entries ask for the respective repeat mode and
 * for a positive x unit. The COVER entry asks for
 * FAST_PATH_SAMPLES_COVER_CLIP_NEAREST instead.
 */
#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_NORMAL(op,s,d,func)				\
    {											\
	PIXMAN_OP_ ## op,								\
	PIXMAN_ ## s,									\
	(SCALED_NEAREST_FLAGS | FAST_PATH_NORMAL_REPEAT | FAST_PATH_X_UNIT_POSITIVE),	\
	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),				\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,						\
	fast_composite_scaled_nearest_ ## func ## _normal_ ## op,			\
    }

#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD(op,s,d,func)				\
    {											\
	PIXMAN_OP_ ## op,								\
	PIXMAN_ ## s,									\
	(SCALED_NEAREST_FLAGS | FAST_PATH_PAD_REPEAT | FAST_PATH_X_UNIT_POSITIVE),	\
	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),				\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,						\
	fast_composite_scaled_nearest_ ## func ## _pad_ ## op,				\
    }

#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE(op,s,d,func)				\
    {											\
	PIXMAN_OP_ ## op,								\
	PIXMAN_ ## s,									\
	(SCALED_NEAREST_FLAGS | FAST_PATH_NONE_REPEAT | FAST_PATH_X_UNIT_POSITIVE),	\
	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),				\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,						\
	fast_composite_scaled_nearest_ ## func ## _none_ ## op,				\
    }

#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER(op,s,d,func)				\
    {											\
	PIXMAN_OP_ ## op,								\
	PIXMAN_ ## s,									\
	(SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST),			\
	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),				\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,						\
	fast_composite_scaled_nearest_ ## func ## _cover_ ## op,			\
    }
717 
/* Fast path table entries for nearest scaling with a solid mask.
 *
 * Every macro expands to a braced initializer listing, in this order: the
 * operator, the source format and flags, the mask format and flags
 * (PIXMAN_solid with unified alpha here), the destination format and
 * flags, and the composite function
 * 'fast_composite_scaled_nearest_<func>_<repeat>_<op>'.
 *
 *  op   - operator suffix, pasted to PIXMAN_OP_
 *  s, d - source and destination format suffixes, pasted to PIXMAN_
 *  func - name part of the composite function
 *
 * The NORMAL, PAD and NONE entries ask for the respective repeat mode and
 * for a positive x unit. The COVER entry asks for
 * FAST_PATH_SAMPLES_COVER_CLIP_NEAREST instead.
 */
#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL(op,s,d,func)				\
    {											\
	PIXMAN_OP_ ## op,								\
	PIXMAN_ ## s,									\
	(SCALED_NEAREST_FLAGS | FAST_PATH_NORMAL_REPEAT | FAST_PATH_X_UNIT_POSITIVE),	\
	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),			\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,						\
	fast_composite_scaled_nearest_ ## func ## _normal_ ## op,			\
    }

#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD(op,s,d,func)				\
    {											\
	PIXMAN_OP_ ## op,								\
	PIXMAN_ ## s,									\
	(SCALED_NEAREST_FLAGS | FAST_PATH_PAD_REPEAT | FAST_PATH_X_UNIT_POSITIVE),	\
	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),			\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,						\
	fast_composite_scaled_nearest_ ## func ## _pad_ ## op,				\
    }

#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE(op,s,d,func)				\
    {											\
	PIXMAN_OP_ ## op,								\
	PIXMAN_ ## s,									\
	(SCALED_NEAREST_FLAGS | FAST_PATH_NONE_REPEAT | FAST_PATH_X_UNIT_POSITIVE),	\
	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),			\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,						\
	fast_composite_scaled_nearest_ ## func ## _none_ ## op,				\
    }

#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_COVER(op,s,d,func)				\
    {											\
	PIXMAN_OP_ ## op,								\
	PIXMAN_ ## s,									\
	(SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST),			\
	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),			\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,						\
	fast_composite_scaled_nearest_ ## func ## _cover_ ## op,			\
    }
759 
/* Comma separated lists of fast path table entries, one entry per repeat
 * variant of the same operation. The 'cover' variant is listed first so
 * that it gets preferred, because it is faster.
 */

/* No mask: COVER, NONE, PAD and NORMAL */
#define SIMPLE_NEAREST_FAST_PATH(op,s,d,func)				\
    SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func),			\
    SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func),			\
    SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func),				\
    SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func)

/* a8 mask: COVER, NONE and PAD only, the NORMAL entry is not part of
 * this list
 */
#define SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func)			\
    SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func),		\
    SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func),		\
    SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func)

/* Solid mask: COVER, NONE, PAD and NORMAL */
#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH(op,s,d,func)		\
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_COVER (op,s,d,func),		\
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE (op,s,d,func),		\
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD (op,s,d,func),		\
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (op,s,d,func)
777 
778 /*****************************************************************************/
779 
780 /*
781  * Identify 5 zones in each scanline for bilinear scaling. Depending on
782  * whether 2 pixels to be interpolated are fetched from the image itself,
783  * from the padding area around it or from both image and padding area.
784  */
785 static force_inline void
bilinear_pad_repeat_get_scanline_bounds(int32_t source_image_width,pixman_fixed_t vx,pixman_fixed_t unit_x,int32_t * left_pad,int32_t * left_tz,int32_t * width,int32_t * right_tz,int32_t * right_pad)786 bilinear_pad_repeat_get_scanline_bounds (int32_t         source_image_width,
787 					 pixman_fixed_t  vx,
788 					 pixman_fixed_t  unit_x,
789 					 int32_t *       left_pad,
790 					 int32_t *       left_tz,
791 					 int32_t *       width,
792 					 int32_t *       right_tz,
793 					 int32_t *       right_pad)
794 {
795 	int width1 = *width, left_pad1, right_pad1;
796 	int width2 = *width, left_pad2, right_pad2;
797 
798 	pad_repeat_get_scanline_bounds (source_image_width, vx, unit_x,
799 					&width1, &left_pad1, &right_pad1);
800 	pad_repeat_get_scanline_bounds (source_image_width, vx + pixman_fixed_1,
801 					unit_x, &width2, &left_pad2, &right_pad2);
802 
803 	*left_pad = left_pad2;
804 	*left_tz = left_pad1 - left_pad2;
805 	*right_tz = right_pad2 - right_pad1;
806 	*right_pad = right_pad1;
807 	*width -= *left_pad + *left_tz + *right_tz + *right_pad;
808 }
809 
810 /*
811  * Main loop template for single pass bilinear scaling. It needs to be
812  * provided with 'scanline_func' which should do the compositing operation.
813  * The needed function has the following prototype:
814  *
815  *	scanline_func (dst_type_t *       dst,
 *		       const mask_type_t * mask,
817  *		       const src_type_t * src_top,
818  *		       const src_type_t * src_bottom,
819  *		       int32_t            width,
820  *		       int                weight_top,
821  *		       int                weight_bottom,
822  *		       pixman_fixed_t     vx,
823  *		       pixman_fixed_t     unit_x,
824  *		       pixman_fixed_t     max_vx,
825  *		       pixman_bool_t      zero_src)
826  *
827  * Where:
828  *  dst                 - destination scanline buffer for storing results
829  *  mask                - mask buffer (or single value for solid mask)
830  *  src_top, src_bottom - two source scanlines
831  *  width               - number of pixels to process
832  *  weight_top          - weight of the top row for interpolation
833  *  weight_bottom       - weight of the bottom row for interpolation
834  *  vx                  - initial position for fetching the first pair of
835  *                        pixels from the source buffer
836  *  unit_x              - position increment needed to move to the next pair
837  *                        of pixels
838  *  max_vx              - image size as a fixed point value, can be used for
839  *                        implementing NORMAL repeat (when it is supported)
840  *  zero_src            - boolean hint variable, which is set to TRUE when
841  *                        all source pixels are fetched from zero padding
842  *                        zone for NONE repeat
843  *
 * Note: normally the sum of 'weight_top' and 'weight_bottom' is equal to
 *       BILINEAR_INTERPOLATION_RANGE, but for NONE repeat it may be less
 *       than that when handling fuzzy antialiased top or bottom image
 *       edges. Both the top and the bottom weight are also guaranteed to be
 *       less than BILINEAR_INTERPOLATION_RANGE, so that, for example, they
 *       fit into an unsigned byte or can be used with 8-bit SIMD
 *       multiplication instructions for 8-bit interpolation precision.
852  */
/*
 * Expands to the definition of
 * fast_composite_scaled_bilinear<scale_func_name> (imp, info). The generated
 * function walks the destination rows and hands each of them to
 * 'scanline_func' in one or more pieces, depending on 'repeat_mode':
 * PAD, NONE and NORMAL get dedicated handling, any other mode (e.g. COVER)
 * results in a single call per row.
 */
#define FAST_BILINEAR_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
				  dst_type_t, repeat_mode, flags)				\
static void											\
fast_composite_scaled_bilinear ## scale_func_name (pixman_implementation_t *imp,		\
						   pixman_composite_info_t *info)		\
{												\
    PIXMAN_COMPOSITE_ARGS (info);								\
    dst_type_t *dst_line;									\
    mask_type_t *mask_line;									\
    src_type_t *src_first_line;									\
    int       y1, y2;										\
    pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */		\
    pixman_vector_t v;										\
    pixman_fixed_t vx, vy;									\
    pixman_fixed_t unit_x, unit_y;								\
    int32_t left_pad, left_tz, right_tz, right_pad;						\
												\
    dst_type_t *dst;										\
    mask_type_t solid_mask;									\
    /* 'mask' points at solid_mask unless a bits mask re-points it for every row */		\
    const mask_type_t *mask = &solid_mask;							\
    int src_stride, mask_stride, dst_stride;							\
												\
    int src_width;										\
    pixman_fixed_t src_width_fixed;								\
    int max_x;											\
    pixman_bool_t need_src_extension;								\
												\
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type_t, dst_stride, dst_line, 1);	\
    /* mask_line is only set up, and only used, when FLAG_HAVE_NON_SOLID_MASK is on */		\
    if (flags & FLAG_HAVE_SOLID_MASK)								\
    {												\
	solid_mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);	\
	mask_stride = 0;									\
    }												\
    else if (flags & FLAG_HAVE_NON_SOLID_MASK)							\
    {												\
	PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type_t,				\
			       mask_stride, mask_line, 1);					\
    }												\
												\
    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be			\
     * transformed from destination space to source space */					\
    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1);		\
												\
    /* reference point is the center of the pixel */						\
    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;				\
    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;				\
    v.vector[2] = pixman_fixed_1;								\
												\
    if (!pixman_transform_point_3d (src_image->common.transform, &v))				\
	return;											\
												\
    /* per pixel steps are taken from the diagonal of the transform only */			\
    unit_x = src_image->common.transform->matrix[0][0];						\
    unit_y = src_image->common.transform->matrix[1][1];						\
												\
    /* step back by half a pixel in x and y after the transform */				\
    v.vector[0] -= pixman_fixed_1 / 2;								\
    v.vector[1] -= pixman_fixed_1 / 2;								\
												\
    vy = v.vector[1];										\
												\
    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD ||					\
	PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)					\
    {												\
	/* split the scanline into padding, transition and inner zones; */			\
	/* 'width' is reduced to the size of the inner zone */					\
	bilinear_pad_repeat_get_scanline_bounds (src_image->bits.width, v.vector[0], unit_x,	\
					&left_pad, &left_tz, &width, &right_tz, &right_pad);	\
	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
	{											\
	    /* PAD repeat does not need special handling for 'transition zones' and */		\
	    /* they can be combined with 'padding zones' safely */				\
	    left_pad += left_tz;								\
	    right_pad += right_tz;								\
	    left_tz = right_tz = 0;								\
	}											\
	/* the left padding is drawn below with vx = 0 and unit_x = 0, so move the */		\
	/* start position past it once here */							\
	v.vector[0] += left_pad * unit_x;							\
    }												\
												\
    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)					\
    {												\
	vx = v.vector[0];									\
	repeat (PIXMAN_REPEAT_NORMAL, &vx, pixman_int_to_fixed(src_image->bits.width));		\
	/* integer x position of the last pixel of the scanline, plus one */			\
	max_x = pixman_fixed_to_int (vx + (width - 1) * (int64_t)unit_x) + 1;			\
												\
	if (src_image->bits.width < REPEAT_NORMAL_MIN_WIDTH)					\
	{											\
	    /* narrow image: work on a whole multiple of its width, grown until it */		\
	    /* reaches REPEAT_NORMAL_MIN_WIDTH or exceeds max_x; the rows are tiled */		\
	    /* into local buffers inside the row loop */					\
	    src_width = 0;									\
												\
	    while (src_width < REPEAT_NORMAL_MIN_WIDTH && src_width <= max_x)			\
		src_width += src_image->bits.width;						\
												\
	    need_src_extension = TRUE;								\
	}											\
	else											\
	{											\
	    src_width = src_image->bits.width;							\
	    need_src_extension = FALSE;								\
	}											\
												\
	src_width_fixed = pixman_int_to_fixed (src_width);					\
    }												\
												\
    /* one iteration per destination row */							\
    while (--height >= 0)									\
    {												\
	int weight1, weight2;									\
	dst = dst_line;										\
	dst_line += dst_stride;									\
	vx = v.vector[0];									\
	if (flags & FLAG_HAVE_NON_SOLID_MASK)							\
	{											\
	    mask = mask_line;									\
	    mask_line += mask_stride;								\
	}											\
												\
	/* y1/y2 are the top/bottom source rows, weight1/weight2 their weights */		\
	y1 = pixman_fixed_to_int (vy);								\
	weight2 = pixman_fixed_to_bilinear_weight (vy);						\
	if (weight2)										\
	{											\
	    /* both weight1 and weight2 are smaller than BILINEAR_INTERPOLATION_RANGE */	\
	    y2 = y1 + 1;									\
	    weight1 = BILINEAR_INTERPOLATION_RANGE - weight2;					\
	}											\
	else											\
	{											\
	    /* set both top and bottom row to the same scanline and tweak weights */		\
	    y2 = y1;										\
	    weight1 = weight2 = BILINEAR_INTERPOLATION_RANGE / 2;				\
	}											\
	vy += unit_y;										\
	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
	{											\
	    src_type_t *src1, *src2;								\
	    src_type_t buf1[2];									\
	    src_type_t buf2[2];									\
	    repeat (PIXMAN_REPEAT_PAD, &y1, src_image->bits.height);				\
	    repeat (PIXMAN_REPEAT_PAD, &y2, src_image->bits.height);				\
	    src1 = src_first_line + src_stride * y1;						\
	    src2 = src_first_line + src_stride * y2;						\
												\
	    /* left padding: every pair is the first pixel of the row, twice */			\
	    if (left_pad > 0)									\
	    {											\
		buf1[0] = buf1[1] = src1[0];							\
		buf2[0] = buf2[1] = src2[0];							\
		scanline_func (dst, mask,							\
			       buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, FALSE);		\
		dst += left_pad;								\
		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
		    mask += left_pad;								\
	    }											\
	    /* inner zone: pairs are read directly from the source rows */			\
	    if (width > 0)									\
	    {											\
		scanline_func (dst, mask,							\
			       src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE);	\
		dst += width;									\
		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
		    mask += width;								\
	    }											\
	    /* right padding: every pair is the last pixel of the row, twice */			\
	    if (right_pad > 0)									\
	    {											\
		buf1[0] = buf1[1] = src1[src_image->bits.width - 1];				\
		buf2[0] = buf2[1] = src2[src_image->bits.width - 1];				\
		scanline_func (dst, mask,							\
			       buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, FALSE);	\
	    }											\
	}											\
	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)				\
	{											\
	    src_type_t *src1, *src2;								\
	    src_type_t buf1[2];									\
	    src_type_t buf2[2];									\
	    /* handle top/bottom zero padding by just setting weights to 0 if needed */		\
	    if (y1 < 0)										\
	    {											\
		weight1 = 0;									\
		y1 = 0;										\
	    }											\
	    if (y1 >= src_image->bits.height)							\
	    {											\
		weight1 = 0;									\
		y1 = src_image->bits.height - 1;						\
	    }											\
	    if (y2 < 0)										\
	    {											\
		weight2 = 0;									\
		y2 = 0;										\
	    }											\
	    if (y2 >= src_image->bits.height)							\
	    {											\
		weight2 = 0;									\
		y2 = src_image->bits.height - 1;						\
	    }											\
	    src1 = src_first_line + src_stride * y1;						\
	    src2 = src_first_line + src_stride * y2;						\
												\
	    /* left padding: all-zero pairs, flagged to scanline_func as zero_src */		\
	    if (left_pad > 0)									\
	    {											\
		buf1[0] = buf1[1] = 0;								\
		buf2[0] = buf2[1] = 0;								\
		scanline_func (dst, mask,							\
			       buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, TRUE);		\
		dst += left_pad;								\
		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
		    mask += left_pad;								\
	    }											\
	    /* left transition: the pair is (zero, first pixel of the row); only the */		\
	    /* fractional part of vx is passed on */						\
	    if (left_tz > 0)									\
	    {											\
		buf1[0] = 0;									\
		buf1[1] = src1[0];								\
		buf2[0] = 0;									\
		buf2[1] = src2[0];								\
		scanline_func (dst, mask,							\
			       buf1, buf2, left_tz, weight1, weight2,				\
			       pixman_fixed_frac (vx), unit_x, 0, FALSE);			\
		dst += left_tz;									\
		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
		    mask += left_tz;								\
		vx += left_tz * unit_x;								\
	    }											\
	    /* inner zone: pairs are read directly from the source rows */			\
	    if (width > 0)									\
	    {											\
		scanline_func (dst, mask,							\
			       src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE);	\
		dst += width;									\
		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
		    mask += width;								\
		vx += width * unit_x;								\
	    }											\
	    /* right transition: the pair is (last pixel of the row, zero) */			\
	    if (right_tz > 0)									\
	    {											\
		buf1[0] = src1[src_image->bits.width - 1];					\
		buf1[1] = 0;									\
		buf2[0] = src2[src_image->bits.width - 1];					\
		buf2[1] = 0;									\
		scanline_func (dst, mask,							\
			       buf1, buf2, right_tz, weight1, weight2,				\
			       pixman_fixed_frac (vx), unit_x, 0, FALSE);			\
		dst += right_tz;								\
		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
		    mask += right_tz;								\
	    }											\
	    /* right padding: all-zero pairs again */						\
	    if (right_pad > 0)									\
	    {											\
		buf1[0] = buf1[1] = 0;								\
		buf2[0] = buf2[1] = 0;								\
		scanline_func (dst, mask,							\
			       buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, TRUE);		\
	    }											\
	}											\
	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
	{											\
	    int32_t	    num_pixels;								\
	    int32_t	    width_remain;							\
	    src_type_t *    src_line_top;							\
	    src_type_t *    src_line_bottom;							\
	    src_type_t	    buf1[2];								\
	    src_type_t	    buf2[2];								\
	    /* the tiled width computed before the row loop stays below */			\
	    /* 2 * REPEAT_NORMAL_MIN_WIDTH, which is the size of these buffers */		\
	    src_type_t	    extended_src_line0[REPEAT_NORMAL_MIN_WIDTH*2];			\
	    src_type_t	    extended_src_line1[REPEAT_NORMAL_MIN_WIDTH*2];			\
	    int		    i, j;								\
												\
	    repeat (PIXMAN_REPEAT_NORMAL, &y1, src_image->bits.height);				\
	    repeat (PIXMAN_REPEAT_NORMAL, &y2, src_image->bits.height);				\
	    src_line_top = src_first_line + src_stride * y1;					\
	    src_line_bottom = src_first_line + src_stride * y2;					\
												\
	    if (need_src_extension)								\
	    {											\
		/* tile both rows into the local buffers, src_width pixels in total */		\
		for (i=0; i<src_width;)								\
		{										\
		    for (j=0; j<src_image->bits.width; j++, i++)				\
		    {										\
			extended_src_line0[i] = src_line_top[j];				\
			extended_src_line1[i] = src_line_bottom[j];				\
		    }										\
		}										\
												\
		src_line_top = &extended_src_line0[0];						\
		src_line_bottom = &extended_src_line1[0];					\
	    }											\
												\
	    /* Top & Bottom wrap around buffer */						\
	    buf1[0] = src_line_top[src_width - 1];						\
	    buf1[1] = src_line_top[0];								\
	    buf2[0] = src_line_bottom[src_width - 1];						\
	    buf2[1] = src_line_bottom[0];							\
												\
	    width_remain = width;								\
												\
	    /* alternate between the wrap around pair and the plain row until the */		\
	    /* whole destination row has been produced */					\
	    while (width_remain > 0)								\
	    {											\
		/* We use src_width_fixed because it can make vx in original source range */	\
		repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed);				\
												\
		/* Wrap around part */								\
		if (pixman_fixed_to_int (vx) == src_width - 1)					\
		{										\
		    /* for positive unit_x							\
		     * num_pixels = max(n) + 1, where vx + n*unit_x < src_width_fixed		\
		     *										\
		     * vx is in range [0, src_width_fixed - pixman_fixed_e]			\
		     * So we are safe from overflow.						\
		     */										\
		    num_pixels = ((src_width_fixed - vx - pixman_fixed_e) / unit_x) + 1;	\
												\
		    if (num_pixels > width_remain)						\
			num_pixels = width_remain;						\
												\
		    scanline_func (dst, mask, buf1, buf2, num_pixels,				\
				   weight1, weight2, pixman_fixed_frac(vx),			\
				   unit_x, src_width_fixed, FALSE);				\
												\
		    width_remain -= num_pixels;							\
		    vx += num_pixels * unit_x;							\
		    dst += num_pixels;								\
												\
		    if (flags & FLAG_HAVE_NON_SOLID_MASK)					\
			mask += num_pixels;							\
												\
		    repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed);			\
		}										\
												\
		/* Normal scanline composite */							\
		if (pixman_fixed_to_int (vx) != src_width - 1 && width_remain > 0)		\
		{										\
		    /* for positive unit_x							\
		     * num_pixels = max(n) + 1, where vx + n*unit_x < (src_width_fixed - 1)	\
		     *										\
		     * vx is in range [0, src_width_fixed - pixman_fixed_e]			\
		     * So we are safe from overflow here.					\
		     */										\
		    num_pixels = ((src_width_fixed - pixman_fixed_1 - vx - pixman_fixed_e)	\
				  / unit_x) + 1;						\
												\
		    if (num_pixels > width_remain)						\
			num_pixels = width_remain;						\
												\
		    scanline_func (dst, mask, src_line_top, src_line_bottom, num_pixels,	\
				   weight1, weight2, vx, unit_x, src_width_fixed, FALSE);	\
												\
		    width_remain -= num_pixels;							\
		    vx += num_pixels * unit_x;							\
		    dst += num_pixels;								\
												\
		    if (flags & FLAG_HAVE_NON_SOLID_MASK)					\
		        mask += num_pixels;							\
		}										\
	    }											\
	}											\
	else											\
	{											\
	    /* any other repeat mode (e.g. COVER): rows y1/y2 are used as computed */		\
	    /* and the whole scanline is done with a single call */				\
	    scanline_func (dst, mask, src_first_line + src_stride * y1,				\
			   src_first_line + src_stride * y2, width,				\
			   weight1, weight2, vx, unit_x, max_vx, FALSE);			\
	}											\
    }												\
}
1206 
/*
 * Public entry point of the bilinear main loop template: forwards every
 * argument to FAST_BILINEAR_MAINLOOP_INT, with an underscore pasted in front
 * of 'scale_func_name'.
 * The indirection is a workaround for old sun studio, see:
 * https://bugs.freedesktop.org/show_bug.cgi?id=32764
 */
#define FAST_BILINEAR_MAINLOOP_COMMON(scale_func_name, scanline_func,		\
				      src_type_t, mask_type_t, dst_type_t,	\
				      repeat_mode, flags)			\
	FAST_BILINEAR_MAINLOOP_INT (_ ## scale_func_name, scanline_func,	\
				    src_type_t, mask_type_t, dst_type_t,	\
				    repeat_mode, flags)
1212 
/* Source image flags shared by every scaled bilinear fast path entry below. */
#define SCALED_BILINEAR_FLAGS			\
    (FAST_PATH_SCALE_TRANSFORM			\
     | FAST_PATH_NO_ALPHA_MAP			\
     | FAST_PATH_BILINEAR_FILTER		\
     | FAST_PATH_NO_ACCESSORS			\
     | FAST_PATH_NARROW_FORMAT)
1219 
/*
 * Fast path table entry for unmasked bilinear scaling with PAD repeat:
 * operator PIXMAN_OP_<op>, source PIXMAN_<s>, PIXMAN_null mask with no
 * flags, destination PIXMAN_<d>, handled by
 * fast_composite_scaled_bilinear_<func>_pad_<op>. The source flags also
 * include FAST_PATH_X_UNIT_POSITIVE.
 */
#define SIMPLE_BILINEAR_FAST_PATH_PAD(op, s, d, func)			\
    {									\
	PIXMAN_OP_ ## op,						\
	PIXMAN_ ## s,							\
	(SCALED_BILINEAR_FLAGS | FAST_PATH_PAD_REPEAT |			\
	 FAST_PATH_X_UNIT_POSITIVE),					\
	PIXMAN_null,							\
	0,								\
	PIXMAN_ ## d,							\
	FAST_PATH_STD_DEST_FLAGS,					\
	fast_composite_scaled_bilinear_ ## func ## _pad_ ## op,		\
    }
1230 
/*
 * Fast path table entry for unmasked bilinear scaling with NONE repeat:
 * operator PIXMAN_OP_<op>, source PIXMAN_<s>, PIXMAN_null mask with no
 * flags, destination PIXMAN_<d>, handled by
 * fast_composite_scaled_bilinear_<func>_none_<op>. The source flags also
 * include FAST_PATH_X_UNIT_POSITIVE.
 */
#define SIMPLE_BILINEAR_FAST_PATH_NONE(op, s, d, func)			\
    {									\
	PIXMAN_OP_ ## op,						\
	PIXMAN_ ## s,							\
	(SCALED_BILINEAR_FLAGS | FAST_PATH_NONE_REPEAT |		\
	 FAST_PATH_X_UNIT_POSITIVE),					\
	PIXMAN_null,							\
	0,								\
	PIXMAN_ ## d,							\
	FAST_PATH_STD_DEST_FLAGS,					\
	fast_composite_scaled_bilinear_ ## func ## _none_ ## op,	\
    }
1241 
/*
 * Fast path table entry for unmasked bilinear scaling where the source
 * carries FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR: operator PIXMAN_OP_<op>,
 * source PIXMAN_<s>, PIXMAN_null mask with no flags, destination PIXMAN_<d>,
 * handled by fast_composite_scaled_bilinear_<func>_cover_<op>.
 */
#define SIMPLE_BILINEAR_FAST_PATH_COVER(op, s, d, func)			\
    {									\
	PIXMAN_OP_ ## op,						\
	PIXMAN_ ## s,							\
	(SCALED_BILINEAR_FLAGS |					\
	 FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR),			\
	PIXMAN_null,							\
	0,								\
	PIXMAN_ ## d,							\
	FAST_PATH_STD_DEST_FLAGS,					\
	fast_composite_scaled_bilinear_ ## func ## _cover_ ## op,	\
    }
1250 
/*
 * Fast path table entry for unmasked bilinear scaling with NORMAL repeat:
 * operator PIXMAN_OP_<op>, source PIXMAN_<s>, PIXMAN_null mask with no
 * flags, destination PIXMAN_<d>, handled by
 * fast_composite_scaled_bilinear_<func>_normal_<op>. The source flags also
 * include FAST_PATH_X_UNIT_POSITIVE.
 */
#define SIMPLE_BILINEAR_FAST_PATH_NORMAL(op, s, d, func)		\
    {									\
	PIXMAN_OP_ ## op,						\
	PIXMAN_ ## s,							\
	(SCALED_BILINEAR_FLAGS | FAST_PATH_NORMAL_REPEAT |		\
	 FAST_PATH_X_UNIT_POSITIVE),					\
	PIXMAN_null,							\
	0,								\
	PIXMAN_ ## d,							\
	FAST_PATH_STD_DEST_FLAGS,					\
	fast_composite_scaled_bilinear_ ## func ## _normal_ ## op,	\
    }
1261 
/*
 * Fast path table entry for bilinear scaling with a PIXMAN_a8 mask and PAD
 * repeat: operator PIXMAN_OP_<op>, source PIXMAN_<s>, mask flags
 * MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), destination PIXMAN_<d>, handled
 * by fast_composite_scaled_bilinear_<func>_pad_<op>. The source flags also
 * include FAST_PATH_X_UNIT_POSITIVE.
 */
#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_PAD(op, s, d, func)		\
    {									\
	PIXMAN_OP_ ## op,						\
	PIXMAN_ ## s,							\
	(SCALED_BILINEAR_FLAGS | FAST_PATH_PAD_REPEAT |			\
	 FAST_PATH_X_UNIT_POSITIVE),					\
	PIXMAN_a8,							\
	MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),			\
	PIXMAN_ ## d,							\
	FAST_PATH_STD_DEST_FLAGS,					\
	fast_composite_scaled_bilinear_ ## func ## _pad_ ## op,		\
    }
1272 
/*
 * Fast path table entry for bilinear scaling with a PIXMAN_a8 mask and NONE
 * repeat: operator PIXMAN_OP_<op>, source PIXMAN_<s>, mask flags
 * MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), destination PIXMAN_<d>, handled
 * by fast_composite_scaled_bilinear_<func>_none_<op>. The source flags also
 * include FAST_PATH_X_UNIT_POSITIVE.
 */
#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NONE(op, s, d, func)		\
    {									\
	PIXMAN_OP_ ## op,						\
	PIXMAN_ ## s,							\
	(SCALED_BILINEAR_FLAGS | FAST_PATH_NONE_REPEAT |		\
	 FAST_PATH_X_UNIT_POSITIVE),					\
	PIXMAN_a8,							\
	MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),			\
	PIXMAN_ ## d,							\
	FAST_PATH_STD_DEST_FLAGS,					\
	fast_composite_scaled_bilinear_ ## func ## _none_ ## op,	\
    }
1283 
/*
 * Fast path table entry for bilinear scaling with a PIXMAN_a8 mask where the
 * source carries FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR: operator
 * PIXMAN_OP_<op>, source PIXMAN_<s>, mask flags
 * MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), destination PIXMAN_<d>, handled
 * by fast_composite_scaled_bilinear_<func>_cover_<op>.
 */
#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_COVER(op, s, d, func)		\
    {									\
	PIXMAN_OP_ ## op,						\
	PIXMAN_ ## s,							\
	(SCALED_BILINEAR_FLAGS |					\
	 FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR),			\
	PIXMAN_a8,							\
	MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),			\
	PIXMAN_ ## d,							\
	FAST_PATH_STD_DEST_FLAGS,					\
	fast_composite_scaled_bilinear_ ## func ## _cover_ ## op,	\
    }
1292 
/*
 * Fast path table entry for bilinear scaling with a PIXMAN_a8 mask and
 * NORMAL repeat: operator PIXMAN_OP_<op>, source PIXMAN_<s>, mask flags
 * MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), destination PIXMAN_<d>, handled
 * by fast_composite_scaled_bilinear_<func>_normal_<op>. The source flags
 * also include FAST_PATH_X_UNIT_POSITIVE.
 */
#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NORMAL(op, s, d, func)	\
    {									\
	PIXMAN_OP_ ## op,						\
	PIXMAN_ ## s,							\
	(SCALED_BILINEAR_FLAGS | FAST_PATH_NORMAL_REPEAT |		\
	 FAST_PATH_X_UNIT_POSITIVE),					\
	PIXMAN_a8,							\
	MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),			\
	PIXMAN_ ## d,							\
	FAST_PATH_STD_DEST_FLAGS,					\
	fast_composite_scaled_bilinear_ ## func ## _normal_ ## op,	\
    }
1303 
/*
 * Fast path table entry for bilinear scaling with a PIXMAN_solid mask and
 * PAD repeat: operator PIXMAN_OP_<op>, source PIXMAN_<s>, mask flags
 * MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), destination PIXMAN_<d>,
 * handled by fast_composite_scaled_bilinear_<func>_pad_<op>. The source
 * flags also include FAST_PATH_X_UNIT_POSITIVE.
 */
#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_PAD(op, s, d, func)	\
    {									\
	PIXMAN_OP_ ## op,						\
	PIXMAN_ ## s,							\
	(SCALED_BILINEAR_FLAGS | FAST_PATH_PAD_REPEAT |			\
	 FAST_PATH_X_UNIT_POSITIVE),					\
	PIXMAN_solid,							\
	MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),			\
	PIXMAN_ ## d,							\
	FAST_PATH_STD_DEST_FLAGS,					\
	fast_composite_scaled_bilinear_ ## func ## _pad_ ## op,		\
    }
1314 
/*
 * Fast path table entry for bilinear scaling with a PIXMAN_solid mask and
 * NONE repeat: operator PIXMAN_OP_<op>, source PIXMAN_<s>, mask flags
 * MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), destination PIXMAN_<d>,
 * handled by fast_composite_scaled_bilinear_<func>_none_<op>. The source
 * flags also include FAST_PATH_X_UNIT_POSITIVE.
 */
#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NONE(op, s, d, func)	\
    {									\
	PIXMAN_OP_ ## op,						\
	PIXMAN_ ## s,							\
	(SCALED_BILINEAR_FLAGS | FAST_PATH_NONE_REPEAT |		\
	 FAST_PATH_X_UNIT_POSITIVE),					\
	PIXMAN_solid,							\
	MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),			\
	PIXMAN_ ## d,							\
	FAST_PATH_STD_DEST_FLAGS,					\
	fast_composite_scaled_bilinear_ ## func ## _none_ ## op,	\
    }
1325 
/*
 * Fast path table entry for bilinear scaling with a PIXMAN_solid mask where
 * the source carries FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR: operator
 * PIXMAN_OP_<op>, source PIXMAN_<s>, mask flags
 * MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), destination PIXMAN_<d>,
 * handled by fast_composite_scaled_bilinear_<func>_cover_<op>.
 */
#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_COVER(op, s, d, func)	\
    {									\
	PIXMAN_OP_ ## op,						\
	PIXMAN_ ## s,							\
	(SCALED_BILINEAR_FLAGS |					\
	 FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR),			\
	PIXMAN_solid,							\
	MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),			\
	PIXMAN_ ## d,							\
	FAST_PATH_STD_DEST_FLAGS,					\
	fast_composite_scaled_bilinear_ ## func ## _cover_ ## op,	\
    }
1334 
/*
 * Fast path table entry for bilinear scaling with a PIXMAN_solid mask and
 * NORMAL repeat: operator PIXMAN_OP_<op>, source PIXMAN_<s>, mask flags
 * MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), destination PIXMAN_<d>,
 * handled by fast_composite_scaled_bilinear_<func>_normal_<op>. The source
 * flags also include FAST_PATH_X_UNIT_POSITIVE.
 */
#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NORMAL(op, s, d, func)	\
    {									\
	PIXMAN_OP_ ## op,						\
	PIXMAN_ ## s,							\
	(SCALED_BILINEAR_FLAGS | FAST_PATH_NORMAL_REPEAT |		\
	 FAST_PATH_X_UNIT_POSITIVE),					\
	PIXMAN_solid,							\
	MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),			\
	PIXMAN_ ## d,							\
	FAST_PATH_STD_DEST_FLAGS,					\
	fast_composite_scaled_bilinear_ ## func ## _normal_ ## op,	\
    }
1345 
/*
 * Expands to all four unmasked bilinear table entries for one
 * (op, s, d, func) combination. The 'cover' variant comes first because its
 * use is preferred: it is faster.
 */
#define SIMPLE_BILINEAR_FAST_PATH(op, s, d, func)			\
    SIMPLE_BILINEAR_FAST_PATH_COVER (op, s, d, func),			\
    SIMPLE_BILINEAR_FAST_PATH_NONE (op, s, d, func),			\
    SIMPLE_BILINEAR_FAST_PATH_PAD (op, s, d, func),			\
    SIMPLE_BILINEAR_FAST_PATH_NORMAL (op, s, d, func)
1352 
/*
 * Expands to all four a8 mask bilinear table entries for one
 * (op, s, d, func) combination, in the order COVER, NONE, PAD, NORMAL.
 */
#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH(op, s, d, func)		\
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_COVER (op, s, d, func),		\
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NONE (op, s, d, func),		\
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_PAD (op, s, d, func),		\
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NORMAL (op, s, d, func)
1358 
/*
 * Expands to all four solid mask bilinear table entries for one
 * (op, s, d, func) combination, in the order COVER, NONE, PAD, NORMAL.
 */
#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH(op, s, d, func)		\
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_COVER (op, s, d, func),	\
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NONE (op, s, d, func),		\
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_PAD (op, s, d, func),		\
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NORMAL (op, s, d, func)
1364 
1365 #endif
1366