1changeset: 96613:3e003f0b8026 2tag: 2pass 3tag: qbase 4tag: qtip 5tag: tip 6user: Jeff Muizelaar <jmuizelaar@mozilla.com> 7date: Thu May 17 19:23:53 2012 -0400 8summary: Bug 757878. Add a fast path for 8888_over_565 with NEON. r=bgirard,joe 9 10diff --git a/gfx/cairo/libpixman/src/pixman-arm-common.h b/gfx/cairo/libpixman/src/pixman-arm-common.h 11--- a/gfx/cairo/libpixman/src/pixman-arm-common.h 12+++ b/gfx/cairo/libpixman/src/pixman-arm-common.h 13@@ -355,26 +355,26 @@ scaled_bilinear_scanline_##cputype##_##n 14 if ((flags & SKIP_ZERO_SRC) && zero_src) \ 15 return; \ 16 pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype ( \ 17 dst, src_top, src_bottom, wt, wb, vx, unit_x, w); \ 18 } \ 19 \ 20 FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_cover_##op, \ 21 scaled_bilinear_scanline_##cputype##_##name##_##op, \ 22- src_type, uint32_t, dst_type, COVER, FLAG_NONE) \ 23+ NULL, src_type, uint32_t, dst_type, COVER, FLAG_NONE) \ 24 FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_none_##op, \ 25 scaled_bilinear_scanline_##cputype##_##name##_##op, \ 26- src_type, uint32_t, dst_type, NONE, FLAG_NONE) \ 27+ NULL, src_type, uint32_t, dst_type, NONE, FLAG_NONE) \ 28 FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_pad_##op, \ 29 scaled_bilinear_scanline_##cputype##_##name##_##op, \ 30- src_type, uint32_t, dst_type, PAD, FLAG_NONE) \ 31+ NULL, src_type, uint32_t, dst_type, PAD, FLAG_NONE) \ 32 FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_normal_##op, \ 33 scaled_bilinear_scanline_##cputype##_##name##_##op, \ 34- src_type, uint32_t, dst_type, NORMAL, \ 35+ NULL, src_type, uint32_t, dst_type, NORMAL, \ 36 FLAG_NONE) 37 38 39 #define PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST(flags, cputype, name, op, \ 40 src_type, dst_type) \ 41 void \ 42 pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype ( \ 43 dst_type * dst, \ 44@@ -404,25 +404,25 @@ scaled_bilinear_scanline_##cputype##_##n 45 if ((flags & SKIP_ZERO_SRC) && zero_src) \ 46 return; \ 47 
pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype ( \ 48 dst, mask, src_top, src_bottom, wt, wb, vx, unit_x, w); \ 49 } \ 50 \ 51 FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_cover_##op, \ 52 scaled_bilinear_scanline_##cputype##_##name##_##op, \ 53- src_type, uint8_t, dst_type, COVER, \ 54+ NULL, src_type, uint8_t, dst_type, COVER, \ 55 FLAG_HAVE_NON_SOLID_MASK) \ 56 FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_none_##op, \ 57 scaled_bilinear_scanline_##cputype##_##name##_##op, \ 58- src_type, uint8_t, dst_type, NONE, \ 59+ NULL, src_type, uint8_t, dst_type, NONE, \ 60 FLAG_HAVE_NON_SOLID_MASK) \ 61 FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_pad_##op, \ 62 scaled_bilinear_scanline_##cputype##_##name##_##op, \ 63- src_type, uint8_t, dst_type, PAD, \ 64+ NULL, src_type, uint8_t, dst_type, PAD, \ 65 FLAG_HAVE_NON_SOLID_MASK) \ 66 FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_normal_##op, \ 67 scaled_bilinear_scanline_##cputype##_##name##_##op, \ 68- src_type, uint8_t, dst_type, NORMAL, \ 69+ NULL, src_type, uint8_t, dst_type, NORMAL, \ 70 FLAG_HAVE_NON_SOLID_MASK) 71 72 73 #endif 74diff --git a/gfx/cairo/libpixman/src/pixman-arm-neon.c b/gfx/cairo/libpixman/src/pixman-arm-neon.c 75--- a/gfx/cairo/libpixman/src/pixman-arm-neon.c 76+++ b/gfx/cairo/libpixman/src/pixman-arm-neon.c 77@@ -140,16 +140,33 @@ PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST 78 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_0565, SRC, 79 uint32_t, uint16_t) 80 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_x888, SRC, 81 uint16_t, uint32_t) 82 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_0565, SRC, 83 uint16_t, uint16_t) 84 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, neon, 8888_8888, OVER, 85 uint32_t, uint32_t) 86+static force_inline void 87+pixman_scaled_bilinear_scanline_8888_8888_SRC ( 88+ uint32_t * dst, 89+ const uint32_t * mask, 90+ const uint32_t * src_top, 91+ const uint32_t * src_bottom, 92+ int32_t w, 93+ int wt, 94+ int 
wb, 95+ pixman_fixed_t vx, 96+ pixman_fixed_t unit_x, 97+ pixman_fixed_t max_vx, 98+ pixman_bool_t zero_src) 99+{ 100+ pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (dst, src_top, src_bottom, wt, wb, vx, unit_x, w); 101+} 102+ 103 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, neon, 8888_8888, ADD, 104 uint32_t, uint32_t) 105 106 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 8888_8_8888, SRC, 107 uint32_t, uint32_t) 108 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 8888_8_0565, SRC, 109 uint32_t, uint16_t) 110 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 0565_8_x888, SRC, 111@@ -261,16 +278,38 @@ pixman_blt_neon (uint32_t *src_bits, 112 (uint32_t *)(((char *) src_bits) + 113 src_y * src_stride * 4 + src_x * 4), src_stride); 114 return TRUE; 115 default: 116 return FALSE; 117 } 118 } 119 120+static inline void op_bilinear_over_8888_0565(uint16_t *dst, const uint32_t *mask, const uint32_t *src, int width) 121+{ 122+ pixman_composite_over_8888_0565_asm_neon (width, 1, dst, 0, src, 0); 123+} 124+ 125+FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_0565_cover_OVER, 126+ pixman_scaled_bilinear_scanline_8888_8888_SRC, op_bilinear_over_8888_0565, 127+ uint32_t, uint32_t, uint16_t, 128+ COVER, FLAG_NONE) 129+FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_0565_pad_OVER, 130+ pixman_scaled_bilinear_scanline_8888_8888_SRC, op_bilinear_over_8888_0565, 131+ uint32_t, uint32_t, uint16_t, 132+ PAD, FLAG_NONE) 133+FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_0565_none_OVER, 134+ pixman_scaled_bilinear_scanline_8888_8888_SRC, op_bilinear_over_8888_0565, 135+ uint32_t, uint32_t, uint16_t, 136+ NONE, FLAG_NONE) 137+FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_0565_normal_OVER, 138+ pixman_scaled_bilinear_scanline_8888_8888_SRC, op_bilinear_over_8888_0565, 139+ uint32_t, uint32_t, uint16_t, 140+ NORMAL, FLAG_NONE) 141+ 142 static const pixman_fast_path_t arm_neon_fast_paths[] = 143 { 144 PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, neon_composite_src_0565_0565), 
145 PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, neon_composite_src_0565_0565), 146 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, neon_composite_src_8888_0565), 147 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, neon_composite_src_8888_0565), 148 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, neon_composite_src_8888_0565), 149 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, neon_composite_src_8888_0565), 150@@ -414,16 +453,18 @@ static const pixman_fast_path_t arm_neon 151 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, r5g6b5, r5g6b5, neon_0565_8_0565), 152 153 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8_8888), 154 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8_8888), 155 156 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, neon_8888_8_8888), 157 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, neon_8888_8_8888), 158 159+ SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_0565), 160+ 161 { PIXMAN_OP_NONE }, 162 }; 163 164 static pixman_bool_t 165 arm_neon_blt (pixman_implementation_t *imp, 166 uint32_t * src_bits, 167 uint32_t * dst_bits, 168 int src_stride, 169diff --git a/gfx/cairo/libpixman/src/pixman-fast-path.c b/gfx/cairo/libpixman/src/pixman-fast-path.c 170--- a/gfx/cairo/libpixman/src/pixman-fast-path.c 171+++ b/gfx/cairo/libpixman/src/pixman-fast-path.c 172@@ -1356,63 +1356,63 @@ scaled_bilinear_scanline_565_565_SRC (ui 173 vx += unit_x; 174 *dst++ = d; 175 } 176 } 177 178 #endif 179 180 FAST_BILINEAR_MAINLOOP_COMMON (565_565_cover_SRC, 181- scaled_bilinear_scanline_565_565_SRC, 182+ scaled_bilinear_scanline_565_565_SRC, NULL, 183 uint16_t, uint32_t, uint16_t, 184 COVER, FLAG_NONE) 185 FAST_BILINEAR_MAINLOOP_COMMON (565_565_pad_SRC, 186- scaled_bilinear_scanline_565_565_SRC, 187+ scaled_bilinear_scanline_565_565_SRC, NULL, 188 uint16_t, uint32_t, uint16_t, 189 PAD, FLAG_NONE) 190 FAST_BILINEAR_MAINLOOP_COMMON (565_565_none_SRC, 191- 
scaled_bilinear_scanline_565_565_SRC, 192+ scaled_bilinear_scanline_565_565_SRC, NULL, 193 uint16_t, uint32_t, uint16_t, 194 NONE, FLAG_NONE) 195 FAST_BILINEAR_MAINLOOP_COMMON (565_565_normal_SRC, 196- scaled_bilinear_scanline_565_565_SRC, 197+ scaled_bilinear_scanline_565_565_SRC, NULL, 198 uint16_t, uint32_t, uint16_t, 199 NORMAL, FLAG_NONE) 200 201 FAST_BILINEAR_MAINLOOP_COMMON (8888_565_cover_OVER, 202- scaled_bilinear_scanline_8888_565_OVER, 203+ scaled_bilinear_scanline_8888_565_OVER, NULL, 204 uint32_t, uint32_t, uint16_t, 205 COVER, FLAG_NONE) 206 FAST_BILINEAR_MAINLOOP_COMMON (8888_565_pad_OVER, 207- scaled_bilinear_scanline_8888_565_OVER, 208+ scaled_bilinear_scanline_8888_565_OVER, NULL, 209 uint32_t, uint32_t, uint16_t, 210 PAD, FLAG_NONE) 211 FAST_BILINEAR_MAINLOOP_COMMON (8888_565_none_OVER, 212- scaled_bilinear_scanline_8888_565_OVER, 213+ scaled_bilinear_scanline_8888_565_OVER, NULL, 214 uint32_t, uint32_t, uint16_t, 215 NONE, FLAG_NONE) 216 FAST_BILINEAR_MAINLOOP_COMMON (8888_565_normal_OVER, 217- scaled_bilinear_scanline_8888_565_OVER, 218+ scaled_bilinear_scanline_8888_565_OVER, NULL, 219 uint32_t, uint32_t, uint16_t, 220 NORMAL, FLAG_NONE) 221 222 FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_cover_OVER, 223- scaled_bilinear_scanline_8888_8888_OVER, 224+ scaled_bilinear_scanline_8888_8888_OVER, NULL, 225 uint32_t, uint32_t, uint32_t, 226 COVER, FLAG_NONE) 227 FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_pad_OVER, 228- scaled_bilinear_scanline_8888_8888_OVER, 229+ scaled_bilinear_scanline_8888_8888_OVER, NULL, 230 uint32_t, uint32_t, uint32_t, 231 PAD, FLAG_NONE) 232 FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_none_OVER, 233- scaled_bilinear_scanline_8888_8888_OVER, 234+ scaled_bilinear_scanline_8888_8888_OVER, NULL, 235 uint32_t, uint32_t, uint32_t, 236 NONE, FLAG_NONE) 237 FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_normal_OVER, 238- scaled_bilinear_scanline_8888_8888_OVER, 239+ scaled_bilinear_scanline_8888_8888_OVER, NULL, 240 uint32_t, uint32_t, uint32_t, 
241 NORMAL, FLAG_NONE) 242 243 #define REPEAT_MIN_WIDTH 32 244 245 static void 246 fast_composite_tiled_repeat (pixman_implementation_t *imp, 247 pixman_composite_info_t *info) 248diff --git a/gfx/cairo/libpixman/src/pixman-inlines.h b/gfx/cairo/libpixman/src/pixman-inlines.h 249--- a/gfx/cairo/libpixman/src/pixman-inlines.h 250+++ b/gfx/cairo/libpixman/src/pixman-inlines.h 251@@ -21,16 +21,17 @@ 252 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 253 * 254 * Author: Keith Packard, SuSE, Inc. 255 */ 256 257 #ifndef PIXMAN_FAST_PATH_H__ 258 #define PIXMAN_FAST_PATH_H__ 259 260+#include <stdlib.h> 261 #include "pixman-private.h" 262 263 #define PIXMAN_REPEAT_COVER -1 264 265 /* Flags describing input parameters to fast path macro template. 266 * Turning on some flag values may indicate that 267 * "some property X is available so template can use this" or 268 * "some property X should be handled by template". 269@@ -816,18 +816,48 @@ bilinear_pad_repeat_get_scanline_bounds 270 * 271 * Note: normally the sum of 'weight_top' and 'weight_bottom' is equal to 256, 272 * but sometimes it may be less than that for NONE repeat when handling 273 * fuzzy antialiased top or bottom image edges. Also both top and 274 * bottom weight variables are guaranteed to have value in 0-255 275 * range and can fit into unsigned byte or be used with 8-bit SIMD 276 * multiplication instructions. 277 */ 278-#define FAST_BILINEAR_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t, \ 279- dst_type_t, repeat_mode, flags) \ 280+ 281+/* Replace a single "scanline_func" with "fetch_func" & "op_func" to allow optional 282+ * two-stage processing (bilinear fetch to a temp buffer, followed by an unscaled 283+ * combine). "op_func" may be NULL, in which case the old single-pass behavior is kept. 284+ * This is ugly and gcc issues some warnings, but it works. 285+ * 286+ * Tip: clang reports errors in deeply nested macros much more clearly than gcc. 
287+ */ 288+/* scanline_func: process one scanline span. With a non-NULL "op_func", "fetch_func" writes "width" pixels into "scanline_buf" and "op_func" then combines them into "dst"; with a NULL "op_func", "fetch_func" writes straight to "dst". Every macro argument is parenthesized so that expression arguments expand safely inside the casts. */ 289+#define scanline_func(dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ 290+ scanline_buf, mask, src_top, src_bottom, width, \ 291+ weight_top, weight_bottom, vx, unit_x, max_vx, zero_src) \ 292+ do { \ 293+ if ((op_func) != NULL) \ 294+ { \ 295+ fetch_func ((void *) (scanline_buf), (mask), (src_top), (src_bottom), (width), \ 296+ (weight_top), (weight_bottom), (vx), (unit_x), (max_vx), (zero_src)); \ 297+ ((void (*)(dst_type_t *, const mask_type_t *, const src_type_t *, int)) (op_func)) \ 298+ ((dst), (mask), (src_type_t *) (scanline_buf), (width)); \ 299+ } \ 300+ else \ 301+ { \ 302+ fetch_func ((void *) (dst), (mask), (src_top), (src_bottom), (width), (weight_top), \ 303+ (weight_bottom), (vx), (unit_x), (max_vx), (zero_src)); \ 304+ } \ 305+ } while (0) 306+ 307+/* Length, in uint64_t elements, of the on-stack scratch buffer that a two-stage fast path fetches into before calling "op_func". */ 308+#define SCANLINE_BUFFER_LENGTH 3072 309+ 310+#define FAST_BILINEAR_MAINLOOP_INT(scale_func_name, fetch_func, op_func, src_type_t, \ 311+ mask_type_t, dst_type_t, repeat_mode, flags) \ 312 static void \ 313 fast_composite_scaled_bilinear ## scale_func_name (pixman_implementation_t *imp, \ 314 pixman_composite_info_t *info) \ 315 { \ 316 PIXMAN_COMPOSITE_ARGS (info); \ 317 dst_type_t *dst_line; \ 318 mask_type_t *mask_line; \ 319 src_type_t *src_first_line; \ 320@@ -842,16 +872,19 @@ fast_composite_scaled_bilinear ## scale_ 321 mask_type_t solid_mask; \ 322 const mask_type_t *mask = &solid_mask; \ 323 int src_stride, mask_stride, dst_stride; \ 324 \ 325 int src_width; \ 326 pixman_fixed_t src_width_fixed; \ 327 int max_x; \ 328 pixman_bool_t need_src_extension; \ 329+ \ 330+ uint64_t stack_scanline_buffer[SCANLINE_BUFFER_LENGTH]; \ 331+ uint8_t *scanline_buffer = (uint8_t *) stack_scanline_buffer; \ 332 \ 333 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type_t, dst_stride, dst_line, 1); \ 334 if (flags & FLAG_HAVE_SOLID_MASK) \ 335 { \ 336 solid_mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format); \ 337 mask_stride = 0; \ 338 } \ 339 else if (flags 
& FLAG_HAVE_NON_SOLID_MASK) \ 340@@ -914,16 +947,24 @@ fast_composite_scaled_bilinear ## scale_ 341 else \ 342 { \ 343 src_width = src_image->bits.width; \ 344 need_src_extension = FALSE; \ 345 } \ 346 \ 347 src_width_fixed = pixman_int_to_fixed (src_width); \ 348 } \ 349+ \ 350+ if (op_func != NULL && width * sizeof(src_type_t) > sizeof(stack_scanline_buffer)) \ 351+ { \ 352+ scanline_buffer = pixman_malloc_ab (width, sizeof(src_type_t)); \ 353+ \ 354+ if (!scanline_buffer) \ 355+ return; \ 356+ } \ 357 \ 358 while (--height >= 0) \ 359 { \ 360 int weight1, weight2; \ 361 dst = dst_line; \ 362 dst_line += dst_stride; \ 363 vx = v.vector[0]; \ 364 if (flags & FLAG_HAVE_NON_SOLID_MASK) \ 365@@ -956,36 +997,39 @@ fast_composite_scaled_bilinear ## scale_ 366 repeat (PIXMAN_REPEAT_PAD, &y2, src_image->bits.height); \ 367 src1 = src_first_line + src_stride * y1; \ 368 src2 = src_first_line + src_stride * y2; \ 369 \ 370 if (left_pad > 0) \ 371 { \ 372 buf1[0] = buf1[1] = src1[0]; \ 373 buf2[0] = buf2[1] = src2[0]; \ 374- scanline_func (dst, mask, \ 375- buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, FALSE); \ 376+ scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ 377+ scanline_buffer, mask, buf1, buf2, left_pad, weight1, weight2, \ 378+ 0, 0, 0, FALSE); \ 379 dst += left_pad; \ 380 if (flags & FLAG_HAVE_NON_SOLID_MASK) \ 381 mask += left_pad; \ 382 } \ 383 if (width > 0) \ 384 { \ 385- scanline_func (dst, mask, \ 386- src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE); \ 387+ scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ 388+ scanline_buffer, mask, src1, src2, width, weight1, weight2, \ 389+ vx, unit_x, 0, FALSE); \ 390 dst += width; \ 391 if (flags & FLAG_HAVE_NON_SOLID_MASK) \ 392 mask += width; \ 393 } \ 394 if (right_pad > 0) \ 395 { \ 396 buf1[0] = buf1[1] = src1[src_image->bits.width - 1]; \ 397 buf2[0] = buf2[1] = src2[src_image->bits.width - 1]; \ 398- scanline_func (dst, mask, \ 399- 
buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, FALSE); \ 400+ scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ 401+ scanline_buffer, mask, buf1, buf2, right_pad, weight1, weight2, \ 402+ 0, 0, 0, FALSE); \ 403 } \ 404 } \ 405 else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \ 406 { \ 407 src_type_t *src1, *src2; \ 408 src_type_t buf1[2]; \ 409 src_type_t buf2[2]; \ 410 /* handle top/bottom zero padding by just setting weights to 0 if needed */ \ 411@@ -1011,64 +1055,67 @@ fast_composite_scaled_bilinear ## scale_ 412 } \ 413 src1 = src_first_line + src_stride * y1; \ 414 src2 = src_first_line + src_stride * y2; \ 415 \ 416 if (left_pad > 0) \ 417 { \ 418 buf1[0] = buf1[1] = 0; \ 419 buf2[0] = buf2[1] = 0; \ 420- scanline_func (dst, mask, \ 421- buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, TRUE); \ 422+ scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ 423+ scanline_buffer, mask, buf1, buf2, left_pad, weight1, weight2, \ 424+ 0, 0, 0, TRUE); \ 425 dst += left_pad; \ 426 if (flags & FLAG_HAVE_NON_SOLID_MASK) \ 427 mask += left_pad; \ 428 } \ 429 if (left_tz > 0) \ 430 { \ 431 buf1[0] = 0; \ 432 buf1[1] = src1[0]; \ 433 buf2[0] = 0; \ 434 buf2[1] = src2[0]; \ 435- scanline_func (dst, mask, \ 436- buf1, buf2, left_tz, weight1, weight2, \ 437+ scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ 438+ scanline_buffer, mask, buf1, buf2, left_tz, weight1, weight2, \ 439 pixman_fixed_frac (vx), unit_x, 0, FALSE); \ 440 dst += left_tz; \ 441 if (flags & FLAG_HAVE_NON_SOLID_MASK) \ 442 mask += left_tz; \ 443 vx += left_tz * unit_x; \ 444 } \ 445 if (width > 0) \ 446 { \ 447- scanline_func (dst, mask, \ 448- src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE); \ 449+ scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ 450+ scanline_buffer, mask, src1, src2, width, weight1, weight2, \ 451+ vx, unit_x, 0, FALSE); \ 452 dst += width; \ 453 
if (flags & FLAG_HAVE_NON_SOLID_MASK) \ 454 mask += width; \ 455 vx += width * unit_x; \ 456 } \ 457 if (right_tz > 0) \ 458 { \ 459 buf1[0] = src1[src_image->bits.width - 1]; \ 460 buf1[1] = 0; \ 461 buf2[0] = src2[src_image->bits.width - 1]; \ 462 buf2[1] = 0; \ 463- scanline_func (dst, mask, \ 464- buf1, buf2, right_tz, weight1, weight2, \ 465+ scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ 466+ scanline_buffer, mask, buf1, buf2, right_tz, weight1, weight2, \ 467 pixman_fixed_frac (vx), unit_x, 0, FALSE); \ 468 dst += right_tz; \ 469 if (flags & FLAG_HAVE_NON_SOLID_MASK) \ 470 mask += right_tz; \ 471 } \ 472 if (right_pad > 0) \ 473 { \ 474 buf1[0] = buf1[1] = 0; \ 475 buf2[0] = buf2[1] = 0; \ 476- scanline_func (dst, mask, \ 477- buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, TRUE); \ 478+ scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ 479+ scanline_buffer, mask, buf1, buf2, right_pad, weight1, weight2, \ 480+ 0, 0, 0, TRUE); \ 481 } \ 482 } \ 483 else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ 484 { \ 485 int32_t num_pixels; \ 486 int32_t width_remain; \ 487 src_type_t * src_line_top; \ 488 src_type_t * src_line_bottom; \ 489@@ -1120,17 +1167,18 @@ fast_composite_scaled_bilinear ## scale_ 490 * vx is in range [0, src_width_fixed - pixman_fixed_e] \ 491 * So we are safe from overflow. 
\ 492 */ \ 493 num_pixels = ((src_width_fixed - vx - pixman_fixed_e) / unit_x) + 1; \ 494 \ 495 if (num_pixels > width_remain) \ 496 num_pixels = width_remain; \ 497 \ 498- scanline_func (dst, mask, buf1, buf2, num_pixels, \ 499+ scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, \ 500+ dst, scanline_buffer, mask, buf1, buf2, num_pixels, \ 501 weight1, weight2, pixman_fixed_frac(vx), \ 502 unit_x, src_width_fixed, FALSE); \ 503 \ 504 width_remain -= num_pixels; \ 505 vx += num_pixels * unit_x; \ 506 dst += num_pixels; \ 507 \ 508 if (flags & FLAG_HAVE_NON_SOLID_MASK) \ 509@@ -1149,41 +1197,47 @@ fast_composite_scaled_bilinear ## scale_ 510 * So we are safe from overflow here. \ 511 */ \ 512 num_pixels = ((src_width_fixed - pixman_fixed_1 - vx - pixman_fixed_e) \ 513 / unit_x) + 1; \ 514 \ 515 if (num_pixels > width_remain) \ 516 num_pixels = width_remain; \ 517 \ 518- scanline_func (dst, mask, src_line_top, src_line_bottom, num_pixels, \ 519- weight1, weight2, vx, unit_x, src_width_fixed, FALSE); \ 520+ scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, \ 521+ dst, scanline_buffer, mask, src_line_top, src_line_bottom, \ 522+ num_pixels, weight1, weight2, vx, unit_x, src_width_fixed, \ 523+ FALSE); \ 524 \ 525 width_remain -= num_pixels; \ 526 vx += num_pixels * unit_x; \ 527 dst += num_pixels; \ 528 \ 529 if (flags & FLAG_HAVE_NON_SOLID_MASK) \ 530 mask += num_pixels; \ 531 } \ 532 } \ 533 } \ 534 else \ 535 { \ 536- scanline_func (dst, mask, src_first_line + src_stride * y1, \ 537+ scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ 538+ scanline_buffer, mask, \ 539+ src_first_line + src_stride * y1, \ 540 src_first_line + src_stride * y2, width, \ 541 weight1, weight2, vx, unit_x, max_vx, FALSE); \ 542 } \ 543 } \ 544+ if (scanline_buffer != (uint8_t *) stack_scanline_buffer) \ 545+ free (scanline_buffer); \ 546 } 547 548 /* A workaround for old sun studio, see: 
https://bugs.freedesktop.org/show_bug.cgi?id=32764 */ 549-#define FAST_BILINEAR_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t, \ 550+#define FAST_BILINEAR_MAINLOOP_COMMON(scale_func_name, fetch_func, op_func, src_type_t, mask_type_t,\ 551 dst_type_t, repeat_mode, flags) \ 552- FAST_BILINEAR_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t,\ 553+ FAST_BILINEAR_MAINLOOP_INT(_ ## scale_func_name, fetch_func, op_func, src_type_t, mask_type_t,\ 554 dst_type_t, repeat_mode, flags) 555 556 #define SCALED_BILINEAR_FLAGS \ 557 (FAST_PATH_SCALE_TRANSFORM | \ 558 FAST_PATH_NO_ALPHA_MAP | \ 559 FAST_PATH_BILINEAR_FILTER | \ 560 FAST_PATH_NO_ACCESSORS | \ 561 FAST_PATH_NARROW_FORMAT) 562diff --git a/gfx/cairo/libpixman/src/pixman-sse2.c b/gfx/cairo/libpixman/src/pixman-sse2.c 563--- a/gfx/cairo/libpixman/src/pixman-sse2.c 564+++ b/gfx/cairo/libpixman/src/pixman-sse2.c 565@@ -5404,30 +5404,33 @@ scaled_bilinear_scanline_sse2_8888_8888_ 566 if (w & 1) 567 { 568 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); 569 *dst = pix1; 570 } 571 572 } 573 574+/* Add extra NULL argument to the existing bilinear fast paths to indicate 575+ * that we don't need two-pass processing */ 576+ 577 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC, 578- scaled_bilinear_scanline_sse2_8888_8888_SRC, 579+ scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL, 580 uint32_t, uint32_t, uint32_t, 581 COVER, FLAG_NONE) 582 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC, 583- scaled_bilinear_scanline_sse2_8888_8888_SRC, 584+ scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL, 585 uint32_t, uint32_t, uint32_t, 586 PAD, FLAG_NONE) 587 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC, 588- scaled_bilinear_scanline_sse2_8888_8888_SRC, 589+ scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL, 590 uint32_t, uint32_t, uint32_t, 591 NONE, FLAG_NONE) 592 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC, 593- scaled_bilinear_scanline_sse2_8888_8888_SRC, 
594+ scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL, 595 uint32_t, uint32_t, uint32_t, 596 NORMAL, FLAG_NONE) 597 598 static force_inline void 599 scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t * dst, 600 const uint32_t * mask, 601 const uint32_t * src_top, 602 const uint32_t * src_bottom, 603@@ -5505,32 +5508,66 @@ scaled_bilinear_scanline_sse2_8888_8888_ 604 } 605 606 w--; 607 dst++; 608 } 609 } 610 611 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER, 612- scaled_bilinear_scanline_sse2_8888_8888_OVER, 613+ scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL, 614 uint32_t, uint32_t, uint32_t, 615 COVER, FLAG_NONE) 616 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER, 617- scaled_bilinear_scanline_sse2_8888_8888_OVER, 618+ scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL, 619 uint32_t, uint32_t, uint32_t, 620 PAD, FLAG_NONE) 621 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER, 622- scaled_bilinear_scanline_sse2_8888_8888_OVER, 623+ scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL, 624 uint32_t, uint32_t, uint32_t, 625 NONE, FLAG_NONE) 626 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER, 627- scaled_bilinear_scanline_sse2_8888_8888_OVER, 628+ scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL, 629 uint32_t, uint32_t, uint32_t, 630 NORMAL, FLAG_NONE) 631 632+ 633+/* An example of an SSE2 two-stage bilinear_over_8888_0565 fast path, which is implemented 634+ as scaled_bilinear_scanline_sse2_8888_8888_SRC + op_bilinear_over_8888_0565 */ 635+/* Stage two: OVER-composite "width" 32-bit pixels from "src" onto the 16-bit pixels at "dst", one pixel at a time via composite_over_8888_0565pixel; "mask" is unused. Only referenced by the fast paths below, so it is static and exports no symbol. */ 636+static force_inline void op_bilinear_over_8888_0565(uint16_t *dst, const uint32_t *mask, const uint32_t *src, int width) 637+{ 638+ /* Note: this is not really fast and should be based on 8 pixel loop from sse2_composite_over_8888_0565 */ 639+ while (--width >= 0) 640+ { 641+ *dst = composite_over_8888_0565pixel (*src, *dst); 642+ src++; 643+ dst++; 644+ } 645+} 646+ 647+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_cover_OVER, 648+ scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565, 
649+ uint32_t, uint32_t, uint16_t, 650+ COVER, FLAG_NONE) 651+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_pad_OVER, 652+ scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565, 653+ uint32_t, uint32_t, uint16_t, 654+ PAD, FLAG_NONE) 655+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_none_OVER, 656+ scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565, 657+ uint32_t, uint32_t, uint16_t, 658+ NONE, FLAG_NONE) 659+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_normal_OVER, 660+ scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565, 661+ uint32_t, uint32_t, uint16_t, 662+ NORMAL, FLAG_NONE) 663+ 664+/*****************************/ 665+ 666 static force_inline void 667 scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst, 668 const uint8_t * mask, 669 const uint32_t * src_top, 670 const uint32_t * src_bottom, 671 int32_t w, 672 int wt, 673 int wb, 674@@ -5669,29 +5706,29 @@ scaled_bilinear_scanline_sse2_8888_8_888 675 } 676 677 w--; 678 dst++; 679 } 680 } 681 682 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER, 683- scaled_bilinear_scanline_sse2_8888_8_8888_OVER, 684+ scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL, 685 uint32_t, uint8_t, uint32_t, 686 COVER, FLAG_HAVE_NON_SOLID_MASK) 687 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER, 688- scaled_bilinear_scanline_sse2_8888_8_8888_OVER, 689+ scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL, 690 uint32_t, uint8_t, uint32_t, 691 PAD, FLAG_HAVE_NON_SOLID_MASK) 692 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER, 693- scaled_bilinear_scanline_sse2_8888_8_8888_OVER, 694+ scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL, 695 uint32_t, uint8_t, uint32_t, 696 NONE, FLAG_HAVE_NON_SOLID_MASK) 697 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER, 698- scaled_bilinear_scanline_sse2_8888_8_8888_OVER, 699+ scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL, 700 uint32_t, uint8_t, uint32_t, 701 NORMAL, 
FLAG_HAVE_NON_SOLID_MASK) 702 703 static const pixman_fast_path_t sse2_fast_paths[] = 704 { 705 /* PIXMAN_OP_OVER */ 706 PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565), 707 PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565), 708@@ -5808,16 +5845,21 @@ static const pixman_fast_path_t sse2_fas 709 SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), 710 SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), 711 712 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888), 713 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888), 714 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888), 715 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888), 716 717+ /* and here the needed entries are added to the fast path table */ 718+ 719+ SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, r5g6b5, sse2_8888_0565), 720+ SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, b5g6r5, sse2_8888_0565), 721+ 722 { PIXMAN_OP_NONE }, 723 }; 724 725 static pixman_bool_t 726 sse2_blt (pixman_implementation_t *imp, 727 uint32_t * src_bits, 728 uint32_t * dst_bits, 729 int src_stride, 730 731