1changeset:   96613:3e003f0b8026
2tag:         2pass
3tag:         qbase
4tag:         qtip
5tag:         tip
6user:        Jeff Muizelaar <jmuizelaar@mozilla.com>
7date:        Thu May 17 19:23:53 2012 -0400
8summary:     Bug 757878. Add a fast path for 8888_over_565 with NEON. r=bgirard,joe
9
10diff --git a/gfx/cairo/libpixman/src/pixman-arm-common.h b/gfx/cairo/libpixman/src/pixman-arm-common.h
11--- a/gfx/cairo/libpixman/src/pixman-arm-common.h
12+++ b/gfx/cairo/libpixman/src/pixman-arm-common.h
13@@ -355,26 +355,26 @@ scaled_bilinear_scanline_##cputype##_##n
14     if ((flags & SKIP_ZERO_SRC) && zero_src)                                  \
15 	return;                                                               \
16     pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (           \
17                             dst, src_top, src_bottom, wt, wb, vx, unit_x, w); \
18 }                                                                             \
19                                                                               \
20 FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_cover_##op,                 \
21                        scaled_bilinear_scanline_##cputype##_##name##_##op,    \
22-                       src_type, uint32_t, dst_type, COVER, FLAG_NONE)        \
23+                       NULL, src_type, uint32_t, dst_type, COVER, FLAG_NONE)  \
24 FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_none_##op,                  \
25                        scaled_bilinear_scanline_##cputype##_##name##_##op,    \
26-                       src_type, uint32_t, dst_type, NONE, FLAG_NONE)         \
27+                       NULL, src_type, uint32_t, dst_type, NONE, FLAG_NONE)   \
28 FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_pad_##op,                   \
29                        scaled_bilinear_scanline_##cputype##_##name##_##op,    \
30-                       src_type, uint32_t, dst_type, PAD, FLAG_NONE)          \
31+                       NULL, src_type, uint32_t, dst_type, PAD, FLAG_NONE)    \
32 FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_normal_##op,                \
33                        scaled_bilinear_scanline_##cputype##_##name##_##op,    \
34-                       src_type, uint32_t, dst_type, NORMAL,                  \
35+                       NULL, src_type, uint32_t, dst_type, NORMAL,            \
36                        FLAG_NONE)
37
38
39 #define PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST(flags, cputype, name, op,  \
40                                                 src_type, dst_type)           \
41 void                                                                          \
42 pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (               \
43                                                 dst_type *       dst,         \
44@@ -404,25 +404,25 @@ scaled_bilinear_scanline_##cputype##_##n
45     if ((flags & SKIP_ZERO_SRC) && zero_src)                                  \
46 	return;                                                                   \
47     pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (           \
48                       dst, mask, src_top, src_bottom, wt, wb, vx, unit_x, w); \
49 }                                                                             \
50                                                                               \
51 FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_cover_##op,                 \
52                        scaled_bilinear_scanline_##cputype##_##name##_##op,    \
53-                       src_type, uint8_t, dst_type, COVER,                    \
54+                       NULL, src_type, uint8_t, dst_type, COVER,              \
55                        FLAG_HAVE_NON_SOLID_MASK)                              \
56 FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_none_##op,                  \
57                        scaled_bilinear_scanline_##cputype##_##name##_##op,    \
58-                       src_type, uint8_t, dst_type, NONE,                     \
59+                       NULL, src_type, uint8_t, dst_type, NONE,               \
60                        FLAG_HAVE_NON_SOLID_MASK)                              \
61 FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_pad_##op,                   \
62                        scaled_bilinear_scanline_##cputype##_##name##_##op,    \
63-                       src_type, uint8_t, dst_type, PAD,                      \
64+                       NULL, src_type, uint8_t, dst_type, PAD,                \
65                        FLAG_HAVE_NON_SOLID_MASK)                              \
66 FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_normal_##op,                \
67                        scaled_bilinear_scanline_##cputype##_##name##_##op,    \
68-                       src_type, uint8_t, dst_type, NORMAL,                   \
69+                       NULL, src_type, uint8_t, dst_type, NORMAL,             \
70                        FLAG_HAVE_NON_SOLID_MASK)
71
72
73 #endif
74diff --git a/gfx/cairo/libpixman/src/pixman-arm-neon.c b/gfx/cairo/libpixman/src/pixman-arm-neon.c
75--- a/gfx/cairo/libpixman/src/pixman-arm-neon.c
76+++ b/gfx/cairo/libpixman/src/pixman-arm-neon.c
77@@ -140,16 +140,33 @@ PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST
78 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_0565, SRC,
79                                          uint32_t, uint16_t)
80 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_x888, SRC,
81                                          uint16_t, uint32_t)
82 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_0565, SRC,
83                                          uint16_t, uint16_t)
84 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, neon, 8888_8888, OVER,
85                                          uint32_t, uint32_t)
86+static force_inline void
87+pixman_scaled_bilinear_scanline_8888_8888_SRC (
88+                                                uint32_t *       dst,
89+                                                const uint32_t * mask,
90+                                                const uint32_t * src_top,
91+                                                const uint32_t * src_bottom,
92+                                                int32_t          w,
93+                                                int              wt,
94+                                                int              wb,
95+                                                pixman_fixed_t   vx,
96+                                                pixman_fixed_t   unit_x,
97+                                                pixman_fixed_t   max_vx,
98+                                                pixman_bool_t    zero_src)
99+{
100+    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (dst, src_top, src_bottom, wt, wb, vx, unit_x, w);
101+}
102+
103 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, neon, 8888_8888, ADD,
104                                          uint32_t, uint32_t)
105
106 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 8888_8_8888, SRC,
107                                             uint32_t, uint32_t)
108 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 8888_8_0565, SRC,
109                                             uint32_t, uint16_t)
110 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 0565_8_x888, SRC,
111@@ -261,16 +278,38 @@ pixman_blt_neon (uint32_t *src_bits,
112 		(uint32_t *)(((char *) src_bits) +
113 		src_y * src_stride * 4 + src_x * 4), src_stride);
114 	return TRUE;
115     default:
116 	return FALSE;
117     }
118 }
119
120+static inline void op_bilinear_over_8888_0565(uint16_t *dst, const uint32_t *mask, const uint32_t *src, int width)
121+{
122+    pixman_composite_over_8888_0565_asm_neon (width, 1, dst, 0, src, 0);
123+}
124+
125+FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_0565_cover_OVER,
126+			       pixman_scaled_bilinear_scanline_8888_8888_SRC, op_bilinear_over_8888_0565,
127+			       uint32_t, uint32_t, uint16_t,
128+			       COVER, FLAG_NONE)
129+FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_0565_pad_OVER,
130+			       pixman_scaled_bilinear_scanline_8888_8888_SRC, op_bilinear_over_8888_0565,
131+			       uint32_t, uint32_t, uint16_t,
132+			       PAD, FLAG_NONE)
133+FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_0565_none_OVER,
134+			       pixman_scaled_bilinear_scanline_8888_8888_SRC, op_bilinear_over_8888_0565,
135+			       uint32_t, uint32_t, uint16_t,
136+			       NONE, FLAG_NONE)
137+FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_0565_normal_OVER,
138+			       pixman_scaled_bilinear_scanline_8888_8888_SRC, op_bilinear_over_8888_0565,
139+			       uint32_t, uint32_t, uint16_t,
140+			       NORMAL, FLAG_NONE)
141+
142 static const pixman_fast_path_t arm_neon_fast_paths[] =
143 {
144     PIXMAN_STD_FAST_PATH (SRC,  r5g6b5,   null,     r5g6b5,   neon_composite_src_0565_0565),
145     PIXMAN_STD_FAST_PATH (SRC,  b5g6r5,   null,     b5g6r5,   neon_composite_src_0565_0565),
146     PIXMAN_STD_FAST_PATH (SRC,  a8r8g8b8, null,     r5g6b5,   neon_composite_src_8888_0565),
147     PIXMAN_STD_FAST_PATH (SRC,  x8r8g8b8, null,     r5g6b5,   neon_composite_src_8888_0565),
148     PIXMAN_STD_FAST_PATH (SRC,  a8b8g8r8, null,     b5g6r5,   neon_composite_src_8888_0565),
149     PIXMAN_STD_FAST_PATH (SRC,  x8b8g8r8, null,     b5g6r5,   neon_composite_src_8888_0565),
150@@ -414,16 +453,18 @@ static const pixman_fast_path_t arm_neon
151     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, r5g6b5, r5g6b5, neon_0565_8_0565),
152
153     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8_8888),
154     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8_8888),
155
156     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, neon_8888_8_8888),
157     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, neon_8888_8_8888),
158
159+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_0565),
160+
161     { PIXMAN_OP_NONE },
162 };
163
164 static pixman_bool_t
165 arm_neon_blt (pixman_implementation_t *imp,
166               uint32_t *               src_bits,
167               uint32_t *               dst_bits,
168               int                      src_stride,
169diff --git a/gfx/cairo/libpixman/src/pixman-fast-path.c b/gfx/cairo/libpixman/src/pixman-fast-path.c
170--- a/gfx/cairo/libpixman/src/pixman-fast-path.c
171+++ b/gfx/cairo/libpixman/src/pixman-fast-path.c
172@@ -1356,63 +1356,63 @@ scaled_bilinear_scanline_565_565_SRC (ui
173         vx += unit_x;
174         *dst++ = d;
175     }
176 }
177
178 #endif
179
180 FAST_BILINEAR_MAINLOOP_COMMON (565_565_cover_SRC,
181-			       scaled_bilinear_scanline_565_565_SRC,
182+			       scaled_bilinear_scanline_565_565_SRC, NULL,
183 			       uint16_t, uint32_t, uint16_t,
184 			       COVER, FLAG_NONE)
185 FAST_BILINEAR_MAINLOOP_COMMON (565_565_pad_SRC,
186-			       scaled_bilinear_scanline_565_565_SRC,
187+			       scaled_bilinear_scanline_565_565_SRC, NULL,
188 			       uint16_t, uint32_t, uint16_t,
189 			       PAD, FLAG_NONE)
190 FAST_BILINEAR_MAINLOOP_COMMON (565_565_none_SRC,
191-			       scaled_bilinear_scanline_565_565_SRC,
192+			       scaled_bilinear_scanline_565_565_SRC, NULL,
193 			       uint16_t, uint32_t, uint16_t,
194 			       NONE, FLAG_NONE)
195 FAST_BILINEAR_MAINLOOP_COMMON (565_565_normal_SRC,
196-			       scaled_bilinear_scanline_565_565_SRC,
197+			       scaled_bilinear_scanline_565_565_SRC, NULL,
198 			       uint16_t, uint32_t, uint16_t,
199 			       NORMAL, FLAG_NONE)
200
201 FAST_BILINEAR_MAINLOOP_COMMON (8888_565_cover_OVER,
202-			       scaled_bilinear_scanline_8888_565_OVER,
203+			       scaled_bilinear_scanline_8888_565_OVER, NULL,
204 			       uint32_t, uint32_t, uint16_t,
205 			       COVER, FLAG_NONE)
206 FAST_BILINEAR_MAINLOOP_COMMON (8888_565_pad_OVER,
207-			       scaled_bilinear_scanline_8888_565_OVER,
208+			       scaled_bilinear_scanline_8888_565_OVER, NULL,
209 			       uint32_t, uint32_t, uint16_t,
210 			       PAD, FLAG_NONE)
211 FAST_BILINEAR_MAINLOOP_COMMON (8888_565_none_OVER,
212-			       scaled_bilinear_scanline_8888_565_OVER,
213+			       scaled_bilinear_scanline_8888_565_OVER, NULL,
214 			       uint32_t, uint32_t, uint16_t,
215 			       NONE, FLAG_NONE)
216 FAST_BILINEAR_MAINLOOP_COMMON (8888_565_normal_OVER,
217-			       scaled_bilinear_scanline_8888_565_OVER,
218+			       scaled_bilinear_scanline_8888_565_OVER, NULL,
219 			       uint32_t, uint32_t, uint16_t,
220 			       NORMAL, FLAG_NONE)
221
222 FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_cover_OVER,
223-			       scaled_bilinear_scanline_8888_8888_OVER,
224+			       scaled_bilinear_scanline_8888_8888_OVER, NULL,
225 			       uint32_t, uint32_t, uint32_t,
226 			       COVER, FLAG_NONE)
227 FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_pad_OVER,
228-			       scaled_bilinear_scanline_8888_8888_OVER,
229+			       scaled_bilinear_scanline_8888_8888_OVER, NULL,
230 			       uint32_t, uint32_t, uint32_t,
231 			       PAD, FLAG_NONE)
232 FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_none_OVER,
233-			       scaled_bilinear_scanline_8888_8888_OVER,
234+			       scaled_bilinear_scanline_8888_8888_OVER, NULL,
235 			       uint32_t, uint32_t, uint32_t,
236 			       NONE, FLAG_NONE)
237 FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_normal_OVER,
238-			       scaled_bilinear_scanline_8888_8888_OVER,
239+			       scaled_bilinear_scanline_8888_8888_OVER, NULL,
240 			       uint32_t, uint32_t, uint32_t,
241 			       NORMAL, FLAG_NONE)
242
243 #define REPEAT_MIN_WIDTH    32
244
245 static void
246 fast_composite_tiled_repeat (pixman_implementation_t *imp,
247 			     pixman_composite_info_t *info)
248diff --git a/gfx/cairo/libpixman/src/pixman-inlines.h b/gfx/cairo/libpixman/src/pixman-inlines.h
249--- a/gfx/cairo/libpixman/src/pixman-inlines.h
250+++ b/gfx/cairo/libpixman/src/pixman-inlines.h
251@@ -21,16 +21,17 @@
252  * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
253  *
254  * Author:  Keith Packard, SuSE, Inc.
255  */
256
257 #ifndef PIXMAN_FAST_PATH_H__
258 #define PIXMAN_FAST_PATH_H__
259
260+#include <stdlib.h>
261 #include "pixman-private.h"
262
263 #define PIXMAN_REPEAT_COVER -1
264
265 /* Flags describing input parameters to fast path macro template.
266  * Turning on some flag values may indicate that
267  * "some property X is available so template can use this" or
268  * "some property X should be handled by template".
269@@ -816,18 +816,48 @@ bilinear_pad_repeat_get_scanline_bounds
270  *
271  * Note: normally the sum of 'weight_top' and 'weight_bottom' is equal to 256,
272  *       but sometimes it may be less than that for NONE repeat when handling
273  *       fuzzy antialiased top or bottom image edges. Also both top and
274  *       bottom weight variables are guaranteed to have value in 0-255
275  *       range and can fit into unsigned byte or be used with 8-bit SIMD
276  *       multiplication instructions.
277  */
278-#define FAST_BILINEAR_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
279-				  dst_type_t, repeat_mode, flags)				\
280+
281+/* Replace the single "scanline_func" with "fetch_func" and "op_func" to allow optional
282+ * two-stage processing (bilinear fetch to a temp buffer, followed by an unscaled
283+ * combine). "op_func" may be NULL, in which case the old behavior is kept.
284+ * This is ugly and gcc issues some warnings, but it works.
285+ *
286+ * Tip: clang has much better error reporting than gcc for deeply nested macros.
287+ */
288+
289+#define	scanline_func(dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst,            \
290+                      scanline_buf, mask, src_top, src_bottom, width,                           \
291+                      weight_top, weight_bottom, vx, unit_x, max_vx, zero_src)                  \
292+ do {                                                                                           \
293+		if (op_func != NULL)								\
294+		{										\
295+		    fetch_func ((void *)scanline_buf, (mask), (src_top), (src_bottom), (width), \
296+                        (weight_top), (weight_bottom), (vx), (unit_x), (max_vx), (zero_src));   \
297+		    ((void (*)(dst_type_t *, const mask_type_t *, const src_type_t *, int)) op_func)\
298+			((dst), (mask), (src_type_t *)scanline_buf, (width));			\
299+		}										\
300+		else										\
301+		{										\
302+		    fetch_func ((void*)(dst), (mask), (src_top), (src_bottom), (width), (weight_top),  \
303+                                (weight_bottom), (vx), (unit_x), (max_vx), (zero_src));         \
304+		}                                                                               \
305+  } while (0)
306+
307+
308+#define SCANLINE_BUFFER_LENGTH 3072
309+
310+#define FAST_BILINEAR_MAINLOOP_INT(scale_func_name, fetch_func, op_func, src_type_t,		\
311+				  mask_type_t, dst_type_t, repeat_mode, flags)			\
312 static void											\
313 fast_composite_scaled_bilinear ## scale_func_name (pixman_implementation_t *imp,		\
314 						   pixman_composite_info_t *info)		\
315 {												\
316     PIXMAN_COMPOSITE_ARGS (info);								\
317     dst_type_t *dst_line;									\
318     mask_type_t *mask_line;									\
319     src_type_t *src_first_line;									\
320@@ -842,16 +872,19 @@ fast_composite_scaled_bilinear ## scale_
321     mask_type_t solid_mask;									\
322     const mask_type_t *mask = &solid_mask;							\
323     int src_stride, mask_stride, dst_stride;							\
324 												\
325     int src_width;										\
326     pixman_fixed_t src_width_fixed;								\
327     int max_x;											\
328     pixman_bool_t need_src_extension;								\
329+                                                                                                \
330+    uint64_t stack_scanline_buffer[SCANLINE_BUFFER_LENGTH];                                     \
331+    uint8_t *scanline_buffer = (uint8_t *) stack_scanline_buffer;                               \
332 												\
333     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type_t, dst_stride, dst_line, 1);	\
334     if (flags & FLAG_HAVE_SOLID_MASK)								\
335     {												\
336 	solid_mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);	\
337 	mask_stride = 0;									\
338     }												\
339     else if (flags & FLAG_HAVE_NON_SOLID_MASK)							\
340@@ -914,16 +947,24 @@ fast_composite_scaled_bilinear ## scale_
341 	else											\
342 	{											\
343 	    src_width = src_image->bits.width;							\
344 	    need_src_extension = FALSE;								\
345 	}											\
346 												\
347 	src_width_fixed = pixman_int_to_fixed (src_width);					\
348     }												\
349+                                                                                                \
350+    if (op_func != NULL && width * sizeof(src_type_t) > sizeof(stack_scanline_buffer))          \
351+    {                                                                                           \
352+	scanline_buffer = pixman_malloc_ab (width, sizeof(src_type_t));                         \
353+                                                                                                \
354+	if (!scanline_buffer)                                                                   \
355+	    return;                                                                             \
356+    }                                                                                           \
357 												\
358     while (--height >= 0)									\
359     {												\
360 	int weight1, weight2;									\
361 	dst = dst_line;										\
362 	dst_line += dst_stride;									\
363 	vx = v.vector[0];									\
364 	if (flags & FLAG_HAVE_NON_SOLID_MASK)							\
365@@ -956,36 +997,39 @@ fast_composite_scaled_bilinear ## scale_
366 	    repeat (PIXMAN_REPEAT_PAD, &y2, src_image->bits.height);				\
367 	    src1 = src_first_line + src_stride * y1;						\
368 	    src2 = src_first_line + src_stride * y2;						\
369 												\
370 	    if (left_pad > 0)									\
371 	    {											\
372 		buf1[0] = buf1[1] = src1[0];							\
373 		buf2[0] = buf2[1] = src2[0];							\
374-		scanline_func (dst, mask,							\
375-			       buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, FALSE);		\
376+		scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst,	\
377+			       scanline_buffer, mask, buf1, buf2, left_pad, weight1, weight2,   \
378+                               0, 0, 0, FALSE);	                                                \
379 		dst += left_pad;								\
380 		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
381 		    mask += left_pad;								\
382 	    }											\
383 	    if (width > 0)									\
384 	    {											\
385-		scanline_func (dst, mask,							\
386-			       src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE);	\
387+		scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst,	\
388+			       scanline_buffer, mask, src1, src2, width, weight1, weight2,      \
389+                               vx, unit_x, 0, FALSE);                                           \
390 		dst += width;									\
391 		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
392 		    mask += width;								\
393 	    }											\
394 	    if (right_pad > 0)									\
395 	    {											\
396 		buf1[0] = buf1[1] = src1[src_image->bits.width - 1];				\
397 		buf2[0] = buf2[1] = src2[src_image->bits.width - 1];				\
398-		scanline_func (dst, mask,							\
399-			       buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, FALSE);	\
400+		scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst,	\
401+			       scanline_buffer, mask, buf1, buf2, right_pad, weight1, weight2,  \
402+                               0, 0, 0, FALSE);                                                 \
403 	    }											\
404 	}											\
405 	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)				\
406 	{											\
407 	    src_type_t *src1, *src2;								\
408 	    src_type_t buf1[2];									\
409 	    src_type_t buf2[2];									\
410 	    /* handle top/bottom zero padding by just setting weights to 0 if needed */		\
411@@ -1011,64 +1055,67 @@ fast_composite_scaled_bilinear ## scale_
412 	    }											\
413 	    src1 = src_first_line + src_stride * y1;						\
414 	    src2 = src_first_line + src_stride * y2;						\
415 												\
416 	    if (left_pad > 0)									\
417 	    {											\
418 		buf1[0] = buf1[1] = 0;								\
419 		buf2[0] = buf2[1] = 0;								\
420-		scanline_func (dst, mask,							\
421-			       buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, TRUE);		\
422+		scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst,	\
423+			       scanline_buffer, mask, buf1, buf2, left_pad, weight1, weight2,   \
424+                               0, 0, 0, TRUE);	                                                \
425 		dst += left_pad;								\
426 		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
427 		    mask += left_pad;								\
428 	    }											\
429 	    if (left_tz > 0)									\
430 	    {											\
431 		buf1[0] = 0;									\
432 		buf1[1] = src1[0];								\
433 		buf2[0] = 0;									\
434 		buf2[1] = src2[0];								\
435-		scanline_func (dst, mask,							\
436-			       buf1, buf2, left_tz, weight1, weight2,				\
437+		scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst,	\
438+			       scanline_buffer, mask, buf1, buf2, left_tz, weight1, weight2,	\
439 			       pixman_fixed_frac (vx), unit_x, 0, FALSE);			\
440 		dst += left_tz;									\
441 		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
442 		    mask += left_tz;								\
443 		vx += left_tz * unit_x;								\
444 	    }											\
445 	    if (width > 0)									\
446 	    {											\
447-		scanline_func (dst, mask,							\
448-			       src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE);	\
449+		scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst,	\
450+			       scanline_buffer, mask, src1, src2, width, weight1, weight2,      \
451+                               vx, unit_x, 0, FALSE);                                           \
452 		dst += width;									\
453 		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
454 		    mask += width;								\
455 		vx += width * unit_x;								\
456 	    }											\
457 	    if (right_tz > 0)									\
458 	    {											\
459 		buf1[0] = src1[src_image->bits.width - 1];					\
460 		buf1[1] = 0;									\
461 		buf2[0] = src2[src_image->bits.width - 1];					\
462 		buf2[1] = 0;									\
463-		scanline_func (dst, mask,							\
464-			       buf1, buf2, right_tz, weight1, weight2,				\
465+		scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst,	\
466+			       scanline_buffer, mask, buf1, buf2, right_tz, weight1, weight2,   \
467 			       pixman_fixed_frac (vx), unit_x, 0, FALSE);			\
468 		dst += right_tz;								\
469 		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
470 		    mask += right_tz;								\
471 	    }											\
472 	    if (right_pad > 0)									\
473 	    {											\
474 		buf1[0] = buf1[1] = 0;								\
475 		buf2[0] = buf2[1] = 0;								\
476-		scanline_func (dst, mask,							\
477-			       buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, TRUE);		\
478+		scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst,	\
479+			       scanline_buffer, mask, buf1, buf2, right_pad, weight1, weight2,  \
480+                               0, 0, 0, TRUE);	                                                \
481 	    }											\
482 	}											\
483 	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
484 	{											\
485 	    int32_t	    num_pixels;								\
486 	    int32_t	    width_remain;							\
487 	    src_type_t *    src_line_top;							\
488 	    src_type_t *    src_line_bottom;							\
489@@ -1120,17 +1167,18 @@ fast_composite_scaled_bilinear ## scale_
490 		     * vx is in range [0, src_width_fixed - pixman_fixed_e]			\
491 		     * So we are safe from overflow.						\
492 		     */										\
493 		    num_pixels = ((src_width_fixed - vx - pixman_fixed_e) / unit_x) + 1;	\
494 												\
495 		    if (num_pixels > width_remain)						\
496 			num_pixels = width_remain;						\
497 												\
498-		    scanline_func (dst, mask, buf1, buf2, num_pixels,				\
499+		    scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func,    \
500+                                   dst, scanline_buffer, mask, buf1, buf2, num_pixels,          \
501 				   weight1, weight2, pixman_fixed_frac(vx),			\
502 				   unit_x, src_width_fixed, FALSE);				\
503 												\
504 		    width_remain -= num_pixels;							\
505 		    vx += num_pixels * unit_x;							\
506 		    dst += num_pixels;								\
507 												\
508 		    if (flags & FLAG_HAVE_NON_SOLID_MASK)					\
509@@ -1149,41 +1197,47 @@ fast_composite_scaled_bilinear ## scale_
510 		     * So we are safe from overflow here.					\
511 		     */										\
512 		    num_pixels = ((src_width_fixed - pixman_fixed_1 - vx - pixman_fixed_e)	\
513 				  / unit_x) + 1;						\
514 												\
515 		    if (num_pixels > width_remain)						\
516 			num_pixels = width_remain;						\
517 												\
518-		    scanline_func (dst, mask, src_line_top, src_line_bottom, num_pixels,	\
519-				   weight1, weight2, vx, unit_x, src_width_fixed, FALSE);	\
520+		    scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func,    \
521+                                   dst, scanline_buffer, mask, src_line_top, src_line_bottom,   \
522+                                   num_pixels, weight1, weight2, vx, unit_x, src_width_fixed,   \
523+                                   FALSE);	                                                \
524 												\
525 		    width_remain -= num_pixels;							\
526 		    vx += num_pixels * unit_x;							\
527 		    dst += num_pixels;								\
528 												\
529 		    if (flags & FLAG_HAVE_NON_SOLID_MASK)					\
530 		        mask += num_pixels;							\
531 		}										\
532 	    }											\
533 	}											\
534 	else											\
535 	{											\
536-	    scanline_func (dst, mask, src_first_line + src_stride * y1,				\
537+	    scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst,       \
538+                           scanline_buffer, mask,                                               \
539+                           src_first_line + src_stride * y1,					\
540 			   src_first_line + src_stride * y2, width,				\
541 			   weight1, weight2, vx, unit_x, max_vx, FALSE);			\
542 	}											\
543     }												\
544+    if (scanline_buffer != (uint8_t *) stack_scanline_buffer)                                   \
545+	free (scanline_buffer);                                                                 \
546 }
547
548 /* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */
549-#define FAST_BILINEAR_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
550+#define FAST_BILINEAR_MAINLOOP_COMMON(scale_func_name, fetch_func, op_func, src_type_t, mask_type_t,\
551 				  dst_type_t, repeat_mode, flags)				\
552-	FAST_BILINEAR_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t,\
553+	FAST_BILINEAR_MAINLOOP_INT(_ ## scale_func_name, fetch_func, op_func, src_type_t, mask_type_t,\
554 				  dst_type_t, repeat_mode, flags)
555
556 #define SCALED_BILINEAR_FLAGS						\
557     (FAST_PATH_SCALE_TRANSFORM	|					\
558      FAST_PATH_NO_ALPHA_MAP	|					\
559      FAST_PATH_BILINEAR_FILTER	|					\
560      FAST_PATH_NO_ACCESSORS	|					\
561      FAST_PATH_NARROW_FORMAT)
562diff --git a/gfx/cairo/libpixman/src/pixman-sse2.c b/gfx/cairo/libpixman/src/pixman-sse2.c
563--- a/gfx/cairo/libpixman/src/pixman-sse2.c
564+++ b/gfx/cairo/libpixman/src/pixman-sse2.c
565@@ -5404,30 +5404,33 @@ scaled_bilinear_scanline_sse2_8888_8888_
566     if (w & 1)
567     {
568 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
569 	*dst = pix1;
570     }
571
572 }
573
574+/* Pass an extra NULL argument (op_func) to the existing bilinear fast paths to
575+ * indicate that they do not need two-pass processing. */
576+
577 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
578-			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
579+			       scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
580 			       uint32_t, uint32_t, uint32_t,
581 			       COVER, FLAG_NONE)
582 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
583-			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
584+			       scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
585 			       uint32_t, uint32_t, uint32_t,
586 			       PAD, FLAG_NONE)
587 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
588-			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
589+			       scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
590 			       uint32_t, uint32_t, uint32_t,
591 			       NONE, FLAG_NONE)
592 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
593-			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
594+			       scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
595 			       uint32_t, uint32_t, uint32_t,
596 			       NORMAL, FLAG_NONE)
597
598 static force_inline void
599 scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst,
600 					      const uint32_t * mask,
601 					      const uint32_t * src_top,
602 					      const uint32_t * src_bottom,
603@@ -5505,32 +5508,66 @@ scaled_bilinear_scanline_sse2_8888_8888_
604 	}
605
606 	w--;
607 	dst++;
608     }
609 }
610
611 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER,
612-			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
613+			       scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
614 			       uint32_t, uint32_t, uint32_t,
615 			       COVER, FLAG_NONE)
616 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER,
617-			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
618+			       scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
619 			       uint32_t, uint32_t, uint32_t,
620 			       PAD, FLAG_NONE)
621 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER,
622-			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
623+			       scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
624 			       uint32_t, uint32_t, uint32_t,
625 			       NONE, FLAG_NONE)
626 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER,
627-			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
628+			       scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
629 			       uint32_t, uint32_t, uint32_t,
630 			       NORMAL, FLAG_NONE)
631
632+
633+/* An example of an SSE2 two-stage bilinear_over_8888_0565 fast path, which is implemented
634+   as scaled_bilinear_scanline_sse2_8888_8888_SRC followed by op_bilinear_over_8888_0565 */
635+
636+static void op_bilinear_over_8888_0565(uint16_t *dst, const uint32_t *mask, const uint32_t *src, int width)
637+{
638+    /* Second stage: combine `width` 8888 src pixels OVER the matching 0565 dst pixels, one at a time; mask is unused. Note: this is not really fast and should be based on the 8 pixel loop from sse2_composite_over_8888_0565 */
639+    while (--width >= 0)
640+    {
641+	*dst = composite_over_8888_0565pixel (*src, *dst);
642+	src++;
643+	dst++;
644+    }
645+}
646+
647+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_cover_OVER,
648+			       scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
649+			       uint32_t, uint32_t, uint16_t,
650+			       COVER, FLAG_NONE)
651+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_pad_OVER,
652+			       scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
653+			       uint32_t, uint32_t, uint16_t,
654+			       PAD, FLAG_NONE)
655+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_none_OVER,
656+			       scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
657+			       uint32_t, uint32_t, uint16_t,
658+			       NONE, FLAG_NONE)
659+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_normal_OVER,
660+			       scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
661+			       uint32_t, uint32_t, uint16_t,
662+			       NORMAL, FLAG_NONE)
663+
664+/*****************************/
665+
666 static force_inline void
667 scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst,
668 						const uint8_t  * mask,
669 						const uint32_t * src_top,
670 						const uint32_t * src_bottom,
671 						int32_t          w,
672 						int              wt,
673 						int              wb,
674@@ -5669,29 +5706,29 @@ scaled_bilinear_scanline_sse2_8888_8_888
675 	}
676
677 	w--;
678 	dst++;
679     }
680 }
681
682 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER,
683-			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
684+			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
685 			       uint32_t, uint8_t, uint32_t,
686 			       COVER, FLAG_HAVE_NON_SOLID_MASK)
687 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER,
688-			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
689+			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
690 			       uint32_t, uint8_t, uint32_t,
691 			       PAD, FLAG_HAVE_NON_SOLID_MASK)
692 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER,
693-			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
694+			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
695 			       uint32_t, uint8_t, uint32_t,
696 			       NONE, FLAG_HAVE_NON_SOLID_MASK)
697 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
698-			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
699+			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
700 			       uint32_t, uint8_t, uint32_t,
701 			       NORMAL, FLAG_HAVE_NON_SOLID_MASK)
702
703 static const pixman_fast_path_t sse2_fast_paths[] =
704 {
705     /* PIXMAN_OP_OVER */
706     PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
707     PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
708@@ -5808,16 +5845,21 @@ static const pixman_fast_path_t sse2_fas
709     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
710     SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
711
712     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
713     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
714     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
715     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888),
716
717+    /* Fast path table entries for the two-stage bilinear OVER 8888 -> 0565 paths */
718+
719+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, r5g6b5, sse2_8888_0565),
720+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, b5g6r5, sse2_8888_0565),
721+
722     { PIXMAN_OP_NONE },
723 };
724
725 static pixman_bool_t
726 sse2_blt (pixman_implementation_t *imp,
727           uint32_t *               src_bits,
728           uint32_t *               dst_bits,
729           int                      src_stride,
730
731