/*
 * Copyright (c) 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Chris Wilson <chris@chris-wilson.co.uk>
 *
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include "sna.h"
#include <pixman.h>

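/*
 * The SSE2 copy paths below are built with per-function target options
 * so the rest of the file can keep the plain compiler baseline; whether
 * they are used is decided at runtime by choose_memcpy_tiled_x() (and,
 * on 32-bit builds, by have_sse2(), since SSE2 cannot be assumed there).
 */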
#if defined(sse2)
#pragma GCC push_options
#pragma GCC target("sse2,inline-all-stringops,fpmath=sse")
#pragma GCC optimize("Ofast")
#include <xmmintrin.h>

#if __x86_64__
#define have_sse2() 1
#else
static bool have_sse2(void)
{
	static int sse2_present = -1;

	if (sse2_present == -1)
		sse2_present = sna_cpu_detect() & SSE2;

	return sse2_present;
}
#endif

static force_inline __m128i
xmm_create_mask_32(uint32_t mask)
{
	return _mm_set_epi32(mask, mask, mask, mask);
}

static force_inline __m128i
xmm_load_128(const __m128i *src)
{
	return _mm_load_si128(src);
}

static force_inline __m128i
xmm_load_128u(const __m128i *src)
{
	return _mm_loadu_si128(src);
}

static force_inline void
xmm_save_128(__m128i *dst, __m128i data)
{
	_mm_store_si128(dst, data);
}

static force_inline void
xmm_save_128u(__m128i *dst, __m128i data)
{
	_mm_storeu_si128(dst, data);
}
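
/*
 * The wrappers above pair up the aligned and unaligned SSE2 load/store
 * intrinsics: the plain variants compile to the aligned movdqa forms and
 * require 16-byte pointers, while the 'u' variants compile to movdqu.
 * The copy loops below choose the aligned form wherever the tiled layout
 * guarantees alignment.
 */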

static force_inline void
to_sse128xN(uint8_t *dst, const uint8_t *src, int bytes)
{
	int i;

	for (i = 0; i < bytes / 128; i++) {
		__m128i xmm0, xmm1, xmm2, xmm3;
		__m128i xmm4, xmm5, xmm6, xmm7;

		xmm0 = xmm_load_128u((const __m128i*)src + 0);
		xmm1 = xmm_load_128u((const __m128i*)src + 1);
		xmm2 = xmm_load_128u((const __m128i*)src + 2);
		xmm3 = xmm_load_128u((const __m128i*)src + 3);
		xmm4 = xmm_load_128u((const __m128i*)src + 4);
		xmm5 = xmm_load_128u((const __m128i*)src + 5);
		xmm6 = xmm_load_128u((const __m128i*)src + 6);
		xmm7 = xmm_load_128u((const __m128i*)src + 7);

		xmm_save_128((__m128i*)dst + 0, xmm0);
		xmm_save_128((__m128i*)dst + 1, xmm1);
		xmm_save_128((__m128i*)dst + 2, xmm2);
		xmm_save_128((__m128i*)dst + 3, xmm3);
		xmm_save_128((__m128i*)dst + 4, xmm4);
		xmm_save_128((__m128i*)dst + 5, xmm5);
		xmm_save_128((__m128i*)dst + 6, xmm6);
		xmm_save_128((__m128i*)dst + 7, xmm7);

		dst += 128;
		src += 128;
	}
}
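
/*
 * to_sse128xN() streams 128 bytes per iteration through eight xmm
 * registers: unaligned loads from the linear source, aligned stores into
 * the tile, whose rows the callers guarantee to be 512-byte aligned.
 * Issuing all eight loads before the stores presumably helps hide load
 * latency behind the store burst.
 */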

static force_inline void
to_sse64(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm1, xmm2, xmm3, xmm4;

	xmm1 = xmm_load_128u((const __m128i*)src + 0);
	xmm2 = xmm_load_128u((const __m128i*)src + 1);
	xmm3 = xmm_load_128u((const __m128i*)src + 2);
	xmm4 = xmm_load_128u((const __m128i*)src + 3);

	xmm_save_128((__m128i*)dst + 0, xmm1);
	xmm_save_128((__m128i*)dst + 1, xmm2);
	xmm_save_128((__m128i*)dst + 2, xmm3);
	xmm_save_128((__m128i*)dst + 3, xmm4);
}

static force_inline void
to_sse32(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm1, xmm2;

	xmm1 = xmm_load_128u((const __m128i*)src + 0);
	xmm2 = xmm_load_128u((const __m128i*)src + 1);

	xmm_save_128((__m128i*)dst + 0, xmm1);
	xmm_save_128((__m128i*)dst + 1, xmm2);
}

static force_inline void
to_sse16(uint8_t *dst, const uint8_t *src)
{
	xmm_save_128((__m128i*)dst, xmm_load_128u((const __m128i*)src));
}

static void to_memcpy(uint8_t *dst, const uint8_t *src, unsigned len)
{
	assert(len);
	if ((uintptr_t)dst & 15) {
		if (len <= 16 - ((uintptr_t)dst & 15)) {
			memcpy(dst, src, len);
			return;
		}

		if ((uintptr_t)dst & 1) {
			assert(len >= 1);
			*dst++ = *src++;
			len--;
		}
		if ((uintptr_t)dst & 2) {
			assert(((uintptr_t)dst & 1) == 0);
			assert(len >= 2);
			*(uint16_t *)dst = *(const uint16_t *)src;
			dst += 2;
			src += 2;
			len -= 2;
		}
		if ((uintptr_t)dst & 4) {
			assert(((uintptr_t)dst & 3) == 0);
			assert(len >= 4);
			*(uint32_t *)dst = *(const uint32_t *)src;
			dst += 4;
			src += 4;
			len -= 4;
		}
		if ((uintptr_t)dst & 8) {
			assert(((uintptr_t)dst & 7) == 0);
			assert(len >= 8);
			*(uint64_t *)dst = *(const uint64_t *)src;
			dst += 8;
			src += 8;
			len -= 8;
		}
	}

	assert(((uintptr_t)dst & 15) == 0);
	while (len >= 64) {
		to_sse64(dst, src);
		dst += 64;
		src += 64;
		len -= 64;
	}
	if (len == 0)
		return;

	if (len & 32) {
		to_sse32(dst, src);
		dst += 32;
		src += 32;
	}
	if (len & 16) {
		to_sse16(dst, src);
		dst += 16;
		src += 16;
	}
	if (len & 8) {
		*(uint64_t *)dst = *(const uint64_t *)src;
		dst += 8;
		src += 8;
	}
	if (len & 4) {
		*(uint32_t *)dst = *(const uint32_t *)src;
		dst += 4;
		src += 4;
	}
	memcpy(dst, src, len & 3);
}
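
/*
 * to_memcpy() handles the odd-sized leading span of a partial tile row:
 * it peels 1/2/4/8-byte writes until dst reaches 16-byte alignment,
 * streams 64-byte SSE blocks, then drains the 32/16/8/4-byte and
 * sub-4-byte tails. Only dst alignment matters; the loads are unaligned
 * regardless.
 */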

static void
memcpy_to_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
				   int32_t src_stride, int32_t dst_stride,
				   int16_t src_x, int16_t src_y,
				   int16_t dst_x, int16_t dst_y,
				   uint16_t width, uint16_t height)
{
	const unsigned tile_width = 512;
	const unsigned tile_height = 8;
	const unsigned tile_size = 4096;

	const unsigned cpp = bpp / 8;
	const unsigned tile_pixels = tile_width / cpp;
	const unsigned tile_shift = ffs(tile_pixels) - 1;
	const unsigned tile_mask = tile_pixels - 1;

	unsigned offset_x, length_x;

	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
	assert(src != dst);

	if (src_x | src_y)
		src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
	width *= cpp;
	assert(src_stride >= width);

	if (dst_x & tile_mask) {
		offset_x = (dst_x & tile_mask) * cpp;
		length_x = min(tile_width - offset_x, width);
	} else
		length_x = 0;
	dst = (uint8_t *)dst + (dst_x >> tile_shift) * tile_size;

	while (height--) {
		unsigned w = width;
		const uint8_t *src_row = src;
		uint8_t *tile_row = dst;

		src = (const uint8_t *)src + src_stride;

		tile_row += dst_y / tile_height * dst_stride * tile_height;
		tile_row += (dst_y & (tile_height-1)) * tile_width;
		dst_y++;

		if (length_x) {
			to_memcpy(tile_row + offset_x, src_row, length_x);

			tile_row += tile_size;
			src_row = (const uint8_t *)src_row + length_x;
			w -= length_x;
		}
		while (w >= tile_width) {
			assert(((uintptr_t)tile_row & (tile_width - 1)) == 0);
			to_sse128xN(assume_aligned(tile_row, tile_width),
				    src_row, tile_width);
			tile_row += tile_size;
			src_row = (const uint8_t *)src_row + tile_width;
			w -= tile_width;
		}
		if (w) {
			assert(((uintptr_t)tile_row & (tile_width - 1)) == 0);
			to_memcpy(assume_aligned(tile_row, tile_width),
				  src_row, w);
		}
	}
}
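
/*
 * Worked example of the addressing above, using the gen3+ X-tile
 * geometry encoded in the constants (512-byte rows, 8 rows, 4096 bytes
 * per tile) at bpp=32: tile_pixels = 128, so a pixel at dst_x = 300,
 * dst_y = 10 lands in tile column 300 >> 7 = 2, at byte
 * (300 & 127) * 4 = 176 of the row, in tile row 10 / 8 = 1,
 * scanline 10 & 7 = 2 of that tile. Stepping one tile to the right adds
 * tile_size bytes, hence the "tile_row += tile_size" advance in the
 * inner loops.
 */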

static force_inline void
from_sse128xNu(uint8_t *dst, const uint8_t *src, int bytes)
{
	int i;

	assert(((uintptr_t)src & 15) == 0);

	for (i = 0; i < bytes / 128; i++) {
		__m128i xmm0, xmm1, xmm2, xmm3;
		__m128i xmm4, xmm5, xmm6, xmm7;

		xmm0 = xmm_load_128((const __m128i*)src + 0);
		xmm1 = xmm_load_128((const __m128i*)src + 1);
		xmm2 = xmm_load_128((const __m128i*)src + 2);
		xmm3 = xmm_load_128((const __m128i*)src + 3);
		xmm4 = xmm_load_128((const __m128i*)src + 4);
		xmm5 = xmm_load_128((const __m128i*)src + 5);
		xmm6 = xmm_load_128((const __m128i*)src + 6);
		xmm7 = xmm_load_128((const __m128i*)src + 7);

		xmm_save_128u((__m128i*)dst + 0, xmm0);
		xmm_save_128u((__m128i*)dst + 1, xmm1);
		xmm_save_128u((__m128i*)dst + 2, xmm2);
		xmm_save_128u((__m128i*)dst + 3, xmm3);
		xmm_save_128u((__m128i*)dst + 4, xmm4);
		xmm_save_128u((__m128i*)dst + 5, xmm5);
		xmm_save_128u((__m128i*)dst + 6, xmm6);
		xmm_save_128u((__m128i*)dst + 7, xmm7);

		dst += 128;
		src += 128;
	}
}

static force_inline void
from_sse128xNa(uint8_t *dst, const uint8_t *src, int bytes)
{
	int i;

	assert(((uintptr_t)dst & 15) == 0);
	assert(((uintptr_t)src & 15) == 0);

	for (i = 0; i < bytes / 128; i++) {
		__m128i xmm0, xmm1, xmm2, xmm3;
		__m128i xmm4, xmm5, xmm6, xmm7;

		xmm0 = xmm_load_128((const __m128i*)src + 0);
		xmm1 = xmm_load_128((const __m128i*)src + 1);
		xmm2 = xmm_load_128((const __m128i*)src + 2);
		xmm3 = xmm_load_128((const __m128i*)src + 3);
		xmm4 = xmm_load_128((const __m128i*)src + 4);
		xmm5 = xmm_load_128((const __m128i*)src + 5);
		xmm6 = xmm_load_128((const __m128i*)src + 6);
		xmm7 = xmm_load_128((const __m128i*)src + 7);

		xmm_save_128((__m128i*)dst + 0, xmm0);
		xmm_save_128((__m128i*)dst + 1, xmm1);
		xmm_save_128((__m128i*)dst + 2, xmm2);
		xmm_save_128((__m128i*)dst + 3, xmm3);
		xmm_save_128((__m128i*)dst + 4, xmm4);
		xmm_save_128((__m128i*)dst + 5, xmm5);
		xmm_save_128((__m128i*)dst + 6, xmm6);
		xmm_save_128((__m128i*)dst + 7, xmm7);

		dst += 128;
		src += 128;
	}
}

static force_inline void
from_sse64u(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm1, xmm2, xmm3, xmm4;

	assert(((uintptr_t)src & 15) == 0);

	xmm1 = xmm_load_128((const __m128i*)src + 0);
	xmm2 = xmm_load_128((const __m128i*)src + 1);
	xmm3 = xmm_load_128((const __m128i*)src + 2);
	xmm4 = xmm_load_128((const __m128i*)src + 3);

	xmm_save_128u((__m128i*)dst + 0, xmm1);
	xmm_save_128u((__m128i*)dst + 1, xmm2);
	xmm_save_128u((__m128i*)dst + 2, xmm3);
	xmm_save_128u((__m128i*)dst + 3, xmm4);
}

static force_inline void
from_sse64a(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm1, xmm2, xmm3, xmm4;

	assert(((uintptr_t)dst & 15) == 0);
	assert(((uintptr_t)src & 15) == 0);

	xmm1 = xmm_load_128((const __m128i*)src + 0);
	xmm2 = xmm_load_128((const __m128i*)src + 1);
	xmm3 = xmm_load_128((const __m128i*)src + 2);
	xmm4 = xmm_load_128((const __m128i*)src + 3);

	xmm_save_128((__m128i*)dst + 0, xmm1);
	xmm_save_128((__m128i*)dst + 1, xmm2);
	xmm_save_128((__m128i*)dst + 2, xmm3);
	xmm_save_128((__m128i*)dst + 3, xmm4);
}

static force_inline void
from_sse32u(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm1, xmm2;

	xmm1 = xmm_load_128((const __m128i*)src + 0);
	xmm2 = xmm_load_128((const __m128i*)src + 1);

	xmm_save_128u((__m128i*)dst + 0, xmm1);
	xmm_save_128u((__m128i*)dst + 1, xmm2);
}

static force_inline void
from_sse32a(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm1, xmm2;

	assert(((uintptr_t)dst & 15) == 0);
	assert(((uintptr_t)src & 15) == 0);

	xmm1 = xmm_load_128((const __m128i*)src + 0);
	xmm2 = xmm_load_128((const __m128i*)src + 1);

	xmm_save_128((__m128i*)dst + 0, xmm1);
	xmm_save_128((__m128i*)dst + 1, xmm2);
}

static force_inline void
from_sse16u(uint8_t *dst, const uint8_t *src)
{
	assert(((uintptr_t)src & 15) == 0);

	xmm_save_128u((__m128i*)dst, xmm_load_128((const __m128i*)src));
}

static force_inline void
from_sse16a(uint8_t *dst, const uint8_t *src)
{
	assert(((uintptr_t)dst & 15) == 0);
	assert(((uintptr_t)src & 15) == 0);

	xmm_save_128((__m128i*)dst, xmm_load_128((const __m128i*)src));
}

static void
memcpy_from_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
				     int32_t src_stride, int32_t dst_stride,
				     int16_t src_x, int16_t src_y,
				     int16_t dst_x, int16_t dst_y,
				     uint16_t width, uint16_t height)
{
	const unsigned tile_width = 512;
	const unsigned tile_height = 8;
	const unsigned tile_size = 4096;

	const unsigned cpp = bpp / 8;
	const unsigned tile_pixels = tile_width / cpp;
	const unsigned tile_shift = ffs(tile_pixels) - 1;
	const unsigned tile_mask = tile_pixels - 1;

	unsigned length_x, offset_x;

	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
	assert(src != dst);

	if (dst_x | dst_y)
		dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp;
	width *= cpp;
	assert(dst_stride >= width);
	if (src_x & tile_mask) {
		offset_x = (src_x & tile_mask) * cpp;
		length_x = min(tile_width - offset_x, width);
		dst_stride -= width;
		dst_stride += (width - length_x) & 15;
	} else {
		offset_x = 0;
		dst_stride -= width & ~15;
	}
	assert(dst_stride >= 0);
	src = (const uint8_t *)src + (src_x >> tile_shift) * tile_size;

	while (height--) {
		unsigned w = width;
		const uint8_t *tile_row = src;

		tile_row += src_y / tile_height * src_stride * tile_height;
		tile_row += (src_y & (tile_height-1)) * tile_width;
		src_y++;

		if (offset_x) {
			memcpy(dst, tile_row + offset_x, length_x);
			tile_row += tile_size;
			dst = (uint8_t *)dst + length_x;
			w -= length_x;
		}

		if ((uintptr_t)dst & 15) {
			while (w >= tile_width) {
				from_sse128xNu(dst,
					       assume_aligned(tile_row, tile_width),
					       tile_width);
				tile_row += tile_size;
				dst = (uint8_t *)dst + tile_width;
				w -= tile_width;
			}
			while (w >= 64) {
				from_sse64u(dst, tile_row);
				tile_row += 64;
				dst = (uint8_t *)dst + 64;
				w -= 64;
			}
			if (w & 32) {
				from_sse32u(dst, tile_row);
				tile_row += 32;
				dst = (uint8_t *)dst + 32;
			}
			if (w & 16) {
				from_sse16u(dst, tile_row);
				tile_row += 16;
				dst = (uint8_t *)dst + 16;
			}
			memcpy(dst, assume_aligned(tile_row, 16), w & 15);
		} else {
			while (w >= tile_width) {
				from_sse128xNa(assume_aligned(dst, 16),
					       assume_aligned(tile_row, tile_width),
					       tile_width);
				tile_row += tile_size;
				dst = (uint8_t *)dst + tile_width;
				w -= tile_width;
			}
			while (w >= 64) {
				from_sse64a(dst, tile_row);
				tile_row += 64;
				dst = (uint8_t *)dst + 64;
				w -= 64;
			}
			if (w & 32) {
				from_sse32a(dst, tile_row);
				tile_row += 32;
				dst = (uint8_t *)dst + 32;
			}
			if (w & 16) {
				from_sse16a(dst, tile_row);
				tile_row += 16;
				dst = (uint8_t *)dst + 16;
			}
			memcpy(assume_aligned(dst, 16),
			       assume_aligned(tile_row, 16),
			       w & 15);
		}
		dst = (uint8_t *)dst + dst_stride;
	}
}
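
/*
 * Note the dst_stride adjustment at the top of the function above: it is
 * pre-biased so that the single "dst += dst_stride" at the end of each
 * row also covers whatever the trailing sub-16-byte memcpy did not
 * advance; after the adjustment, dst_stride is the remaining per-row
 * increment rather than the caller's pitch.
 */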

static void
memcpy_between_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
					int32_t src_stride, int32_t dst_stride,
					int16_t src_x, int16_t src_y,
					int16_t dst_x, int16_t dst_y,
					uint16_t width, uint16_t height)
{
	const unsigned tile_width = 512;
	const unsigned tile_height = 8;
	const unsigned tile_size = 4096;

	const unsigned cpp = bpp / 8;
	const unsigned tile_pixels = tile_width / cpp;
	const unsigned tile_shift = ffs(tile_pixels) - 1;
	const unsigned tile_mask = tile_pixels - 1;

	unsigned ox, lx;

	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
	assert(src != dst);

	width *= cpp;
	dst_stride *= tile_height;
	src_stride *= tile_height;

	assert((dst_x & tile_mask) == (src_x & tile_mask));
	if (dst_x & tile_mask) {
		ox = (dst_x & tile_mask) * cpp;
		lx = min(tile_width - ox, width);
		assert(lx != 0);
	} else
		lx = 0;

	if (dst_x)
		dst = (uint8_t *)dst + (dst_x >> tile_shift) * tile_size;
	if (src_x)
		src = (const uint8_t *)src + (src_x >> tile_shift) * tile_size;

	while (height--) {
		const uint8_t *src_row;
		uint8_t *dst_row;
		unsigned w = width;

		dst_row = dst;
		dst_row += dst_y / tile_height * dst_stride;
		dst_row += (dst_y & (tile_height-1)) * tile_width;
		dst_y++;

		src_row = src;
		src_row += src_y / tile_height * src_stride;
		src_row += (src_y & (tile_height-1)) * tile_width;
		src_y++;

		if (lx) {
			to_memcpy(dst_row + ox, src_row + ox, lx);
			dst_row += tile_size;
			src_row += tile_size;
			w -= lx;
		}
		while (w >= tile_width) {
			assert(((uintptr_t)dst_row & (tile_width - 1)) == 0);
			assert(((uintptr_t)src_row & (tile_width - 1)) == 0);
			to_sse128xN(assume_aligned(dst_row, tile_width),
				    assume_aligned(src_row, tile_width),
				    tile_width);
			dst_row += tile_size;
			src_row += tile_size;
			w -= tile_width;
		}
		if (w) {
			assert(((uintptr_t)dst_row & (tile_width - 1)) == 0);
			assert(((uintptr_t)src_row & (tile_width - 1)) == 0);
			to_memcpy(assume_aligned(dst_row, tile_width),
				  assume_aligned(src_row, tile_width),
				  w);
		}
	}
}

#pragma GCC pop_options
#endif

fast void
memcpy_blt(const void *src, void *dst, int bpp,
	   int32_t src_stride, int32_t dst_stride,
	   int16_t src_x, int16_t src_y,
	   int16_t dst_x, int16_t dst_y,
	   uint16_t width, uint16_t height)
{
	const uint8_t *src_bytes;
	uint8_t *dst_bytes;
	int byte_width;

	assert(src);
	assert(dst);
	assert(width && height);
	assert(bpp >= 8);
	assert(width*bpp <= 8*src_stride);
	assert(width*bpp <= 8*dst_stride);

	DBG(("%s: src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
	     __FUNCTION__, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));

	bpp /= 8;

	src_bytes = (const uint8_t *)src + src_stride * src_y + src_x * bpp;
	dst_bytes = (uint8_t *)dst + dst_stride * dst_y + dst_x * bpp;

	byte_width = width * bpp;
	if (byte_width == src_stride && byte_width == dst_stride) {
		byte_width *= height;
		height = 1;
	}

	switch (byte_width) {
	case 1:
		do {
			*dst_bytes = *src_bytes;
			src_bytes += src_stride;
			dst_bytes += dst_stride;
		} while (--height);
		break;

	case 2:
		do {
			*(uint16_t *)dst_bytes = *(const uint16_t *)src_bytes;
			src_bytes += src_stride;
			dst_bytes += dst_stride;
		} while (--height);
		break;

	case 4:
		do {
			*(uint32_t *)dst_bytes = *(const uint32_t *)src_bytes;
			src_bytes += src_stride;
			dst_bytes += dst_stride;
		} while (--height);
		break;

	case 8:
		do {
			*(uint64_t *)dst_bytes = *(const uint64_t *)src_bytes;
			src_bytes += src_stride;
			dst_bytes += dst_stride;
		} while (--height);
		break;
	case 16:
		do {
			((uint64_t *)dst_bytes)[0] = ((const uint64_t *)src_bytes)[0];
			((uint64_t *)dst_bytes)[1] = ((const uint64_t *)src_bytes)[1];
			src_bytes += src_stride;
			dst_bytes += dst_stride;
		} while (--height);
		break;

	default:
		do {
			memcpy(dst_bytes, src_bytes, byte_width);
			src_bytes += src_stride;
			dst_bytes += dst_stride;
		} while (--height);
		break;
	}
}
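
/*
 * memcpy_blt() is the plain linear-to-linear blitter; both surfaces use
 * ordinary scanline pitches. An illustrative call (buffer and pitch
 * names here are hypothetical), copying a 100x50 region of a 32bpp
 * surface from (8, 16) to (0, 0):
 *
 *	memcpy_blt(src_pixels, dst_pixels, 32,
 *		   src_pitch, dst_pitch,
 *		   8, 16, 0, 0, 100, 50);
 *
 * When the rows are exactly stride-sized on both sides, the switch above
 * collapses the copy into a single memcpy by folding height into
 * byte_width.
 */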

static fast_memcpy void
memcpy_to_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
			     int32_t src_stride, int32_t dst_stride,
			     int16_t src_x, int16_t src_y,
			     int16_t dst_x, int16_t dst_y,
			     uint16_t width, uint16_t height)
{
	const unsigned tile_width = 512;
	const unsigned tile_height = 8;
	const unsigned tile_size = 4096;

	const unsigned cpp = bpp / 8;
	const unsigned tile_pixels = tile_width / cpp;
	const unsigned tile_shift = ffs(tile_pixels) - 1;
	const unsigned tile_mask = tile_pixels - 1;

	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
	assert(src != dst);

	if (src_x | src_y)
		src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
	assert(src_stride >= width * cpp);
	src_stride -= width * cpp;

	while (height--) {
		unsigned w = width * cpp;
		uint8_t *tile_row = dst;

		tile_row += dst_y / tile_height * dst_stride * tile_height;
		tile_row += (dst_y & (tile_height-1)) * tile_width;
		if (dst_x) {
			tile_row += (dst_x >> tile_shift) * tile_size;
			if (dst_x & tile_mask) {
				const unsigned x = (dst_x & tile_mask) * cpp;
				const unsigned len = min(tile_width - x, w);
				memcpy(assume_misaligned(tile_row + x, tile_width, x),
				       src, len);

				tile_row += tile_size;
				src = (const uint8_t *)src + len;
				w -= len;
			}
		}
		while (w >= tile_width) {
			memcpy(assume_aligned(tile_row, tile_width),
			       src, tile_width);
			tile_row += tile_size;
			src = (const uint8_t *)src + tile_width;
			w -= tile_width;
		}
		memcpy(assume_aligned(tile_row, tile_width), src, w);
		src = (const uint8_t *)src + src_stride + w;
		dst_y++;
	}
}

static fast_memcpy void
memcpy_from_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
			       int32_t src_stride, int32_t dst_stride,
			       int16_t src_x, int16_t src_y,
			       int16_t dst_x, int16_t dst_y,
			       uint16_t width, uint16_t height)
{
	const unsigned tile_width = 512;
	const unsigned tile_height = 8;
	const unsigned tile_size = 4096;

	const unsigned cpp = bpp / 8;
	const unsigned tile_pixels = tile_width / cpp;
	const unsigned tile_shift = ffs(tile_pixels) - 1;
	const unsigned tile_mask = tile_pixels - 1;

	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
	assert(src != dst);

	if (dst_x | dst_y)
		dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp;
	assert(dst_stride >= width * cpp);
	dst_stride -= width * cpp;

	while (height--) {
		unsigned w = width * cpp;
		const uint8_t *tile_row = src;

		tile_row += src_y / tile_height * src_stride * tile_height;
		tile_row += (src_y & (tile_height-1)) * tile_width;
		if (src_x) {
			tile_row += (src_x >> tile_shift) * tile_size;
			if (src_x & tile_mask) {
				const unsigned x = (src_x & tile_mask) * cpp;
				const unsigned len = min(tile_width - x, w);
				memcpy(dst, assume_misaligned(tile_row + x, tile_width, x), len);

				tile_row += tile_size;
				dst = (uint8_t *)dst + len;
				w -= len;
			}
		}
		while (w >= tile_width) {
			memcpy(dst,
			       assume_aligned(tile_row, tile_width),
			       tile_width);

			tile_row += tile_size;
			dst = (uint8_t *)dst + tile_width;
			w -= tile_width;
		}
		memcpy(dst, assume_aligned(tile_row, tile_width), w);
		dst = (uint8_t *)dst + dst_stride + w;
		src_y++;
	}
}

static fast_memcpy void
memcpy_between_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
				  int32_t src_stride, int32_t dst_stride,
				  int16_t src_x, int16_t src_y,
				  int16_t dst_x, int16_t dst_y,
				  uint16_t width, uint16_t height)
{
	const unsigned tile_width = 512;
	const unsigned tile_height = 8;
	const unsigned tile_size = 4096;

	const unsigned cpp = bpp / 8;
	const unsigned tile_pixels = tile_width / cpp;
	const unsigned tile_shift = ffs(tile_pixels) - 1;
	const unsigned tile_mask = tile_pixels - 1;

	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
	assert(src != dst);
	assert((dst_x & tile_mask) == (src_x & tile_mask));

	while (height--) {
		unsigned w = width * cpp;
		uint8_t *dst_row = dst;
		const uint8_t *src_row = src;

		dst_row += dst_y / tile_height * dst_stride * tile_height;
		dst_row += (dst_y & (tile_height-1)) * tile_width;
		if (dst_x)
			dst_row += (dst_x >> tile_shift) * tile_size;
		dst_y++;

		src_row += src_y / tile_height * src_stride * tile_height;
		src_row += (src_y & (tile_height-1)) * tile_width;
		if (src_x)
			src_row += (src_x >> tile_shift) * tile_size;
		src_y++;

		if (dst_x & tile_mask) {
			const unsigned x = (dst_x & tile_mask) * cpp;
			const unsigned len = min(tile_width - x, w);

			memcpy(assume_misaligned(dst_row + x, tile_width, x),
			       assume_misaligned(src_row + x, tile_width, x),
			       len);

			dst_row += tile_size;
			src_row += tile_size;
			w -= len;
		}

		while (w >= tile_width) {
			memcpy(assume_aligned(dst_row, tile_width),
			       assume_aligned(src_row, tile_width),
			       tile_width);
			dst_row += tile_size;
			src_row += tile_size;
			w -= tile_width;
		}
		memcpy(assume_aligned(dst_row, tile_width),
		       assume_aligned(src_row, tile_width),
		       w);
	}
}

#define memcpy_to_tiled_x(swizzle) \
fast_memcpy static void \
memcpy_to_tiled_x__##swizzle (const void *src, void *dst, int bpp, \
			      int32_t src_stride, int32_t dst_stride, \
			      int16_t src_x, int16_t src_y, \
			      int16_t dst_x, int16_t dst_y, \
			      uint16_t width, uint16_t height) \
{ \
	const unsigned tile_width = 512; \
	const unsigned tile_height = 8; \
	const unsigned tile_size = 4096; \
	const unsigned cpp = bpp / 8; \
	const unsigned stride_tiles = dst_stride / tile_width; \
	const unsigned swizzle_pixels = 64 / cpp; \
	const unsigned tile_pixels = ffs(tile_width / cpp) - 1; \
	const unsigned tile_mask = (1 << tile_pixels) - 1; \
	unsigned x, y; \
	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", \
	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); \
	src = (const uint8_t *)src + src_y * src_stride + src_x * cpp; \
	for (y = 0; y < height; ++y) { \
		const uint32_t dy = y + dst_y; \
		const uint32_t tile_row = \
			(dy / tile_height * stride_tiles * tile_size + \
			 (dy & (tile_height-1)) * tile_width); \
		const uint8_t *src_row = (const uint8_t *)src + src_stride * y; \
		uint32_t dx = dst_x; \
		x = width * cpp; \
		if (dx & (swizzle_pixels - 1)) { \
			const uint32_t swizzle_bound_pixels = ALIGN(dx + 1, swizzle_pixels); \
			const uint32_t length = min(dst_x + width, swizzle_bound_pixels) - dx; \
			uint32_t offset = \
				tile_row + \
				(dx >> tile_pixels) * tile_size + \
				(dx & tile_mask) * cpp; \
			memcpy((char *)dst + swizzle(offset), src_row, length * cpp); \
			src_row += length * cpp; \
			x -= length * cpp; \
			dx += length; \
		} \
		while (x >= 64) { \
			uint32_t offset = \
				tile_row + \
				(dx >> tile_pixels) * tile_size + \
				(dx & tile_mask) * cpp; \
			memcpy(assume_aligned((char *)dst+swizzle(offset),64), \
			       src_row, 64); \
			src_row += 64; \
			x -= 64; \
			dx += swizzle_pixels; \
		} \
		if (x) { \
			uint32_t offset = \
				tile_row + \
				(dx >> tile_pixels) * tile_size + \
				(dx & tile_mask) * cpp; \
			memcpy(assume_aligned((char *)dst + swizzle(offset), 64), src_row, x); \
		} \
	} \
}
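
/*
 * The memcpy_to_tiled_x() macro above and memcpy_from_tiled_x() below
 * stamp out one copy routine per swizzle function. The swizzle argument
 * maps a byte offset within the tiled buffer to its swizzled address and
 * is applied once per 64-byte block, which is why the main loop advances
 * dx by swizzle_pixels (64 / cpp) at a time and the first iteration
 * peels just enough pixels to reach a 64-byte swizzle boundary.
 */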

#define memcpy_from_tiled_x(swizzle) \
fast_memcpy static void \
memcpy_from_tiled_x__##swizzle (const void *src, void *dst, int bpp, \
				int32_t src_stride, int32_t dst_stride, \
				int16_t src_x, int16_t src_y, \
				int16_t dst_x, int16_t dst_y, \
				uint16_t width, uint16_t height) \
{ \
	const unsigned tile_width = 512; \
	const unsigned tile_height = 8; \
	const unsigned tile_size = 4096; \
	const unsigned cpp = bpp / 8; \
	const unsigned stride_tiles = src_stride / tile_width; \
	const unsigned swizzle_pixels = 64 / cpp; \
	const unsigned tile_pixels = ffs(tile_width / cpp) - 1; \
	const unsigned tile_mask = (1 << tile_pixels) - 1; \
	unsigned x, y; \
	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", \
	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); \
	dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp; \
	for (y = 0; y < height; ++y) { \
		const uint32_t sy = y + src_y; \
		const uint32_t tile_row = \
			(sy / tile_height * stride_tiles * tile_size + \
			 (sy & (tile_height-1)) * tile_width); \
		uint8_t *dst_row = (uint8_t *)dst + dst_stride * y; \
		uint32_t sx = src_x; \
		x = width * cpp; \
		if (sx & (swizzle_pixels - 1)) { \
			const uint32_t swizzle_bound_pixels = ALIGN(sx + 1, swizzle_pixels); \
			const uint32_t length = min(src_x + width, swizzle_bound_pixels) - sx; \
			uint32_t offset = \
				tile_row + \
				(sx >> tile_pixels) * tile_size + \
				(sx & tile_mask) * cpp; \
			memcpy(dst_row, (const char *)src + swizzle(offset), length * cpp); \
			dst_row += length * cpp; \
			x -= length * cpp; \
			sx += length; \
		} \
		while (x >= 64) { \
			uint32_t offset = \
				tile_row + \
				(sx >> tile_pixels) * tile_size + \
				(sx & tile_mask) * cpp; \
			memcpy(dst_row, assume_aligned((const char *)src + swizzle(offset), 64), 64); \
			dst_row += 64; \
			x -= 64; \
			sx += swizzle_pixels; \
		} \
		if (x) { \
			uint32_t offset = \
				tile_row + \
				(sx >> tile_pixels) * tile_size + \
				(sx & tile_mask) * cpp; \
			memcpy(dst_row, assume_aligned((const char *)src + swizzle(offset), 64), x); \
		} \
	} \
}

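/*
 * Bit-6 swizzling: on memory configurations where the GPU XORs address
 * bit 6 with bit 9 (and possibly bits 10/11) when accessing tiled
 * surfaces, presumably to spread accesses across DRAM channels, these
 * macros apply the same XOR in software so the CPU reads the data where
 * the GPU placed it. In swizzle_9, for instance, ((X) >> 3) & 64 moves
 * bit 9 down into the bit-6 position before the XOR.
 */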
#define swizzle_9(X) ((X) ^ (((X) >> 3) & 64))
memcpy_to_tiled_x(swizzle_9)
memcpy_from_tiled_x(swizzle_9)
#undef swizzle_9

#define swizzle_9_10(X) ((X) ^ ((((X) ^ ((X) >> 1)) >> 3) & 64))
memcpy_to_tiled_x(swizzle_9_10)
memcpy_from_tiled_x(swizzle_9_10)
#undef swizzle_9_10

#define swizzle_9_11(X) ((X) ^ ((((X) ^ ((X) >> 2)) >> 3) & 64))
memcpy_to_tiled_x(swizzle_9_11)
memcpy_from_tiled_x(swizzle_9_11)
#undef swizzle_9_11

#define swizzle_9_10_11(X) ((X) ^ ((((X) ^ ((X) >> 1) ^ ((X) >> 2)) >> 3) & 64))
memcpy_to_tiled_x(swizzle_9_10_11)
memcpy_from_tiled_x(swizzle_9_10_11)
#undef swizzle_9_10_11

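/*
 * gen2 uses a smaller X-tile than later generations: 128-byte rows by
 * 16 rows, i.e. 2048 bytes per tile, as the constants below encode;
 * otherwise the walk over partial and whole tile rows mirrors the
 * swizzle_0 paths above.
 */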
static fast_memcpy void
memcpy_to_tiled_x__gen2(const void *src, void *dst, int bpp,
			int32_t src_stride, int32_t dst_stride,
			int16_t src_x, int16_t src_y,
			int16_t dst_x, int16_t dst_y,
			uint16_t width, uint16_t height)
{
	const unsigned tile_width = 128;
	const unsigned tile_height = 16;
	const unsigned tile_size = 2048;

	const unsigned cpp = bpp / 8;
	const unsigned tile_pixels = tile_width / cpp;
	const unsigned tile_shift = ffs(tile_pixels) - 1;
	const unsigned tile_mask = tile_pixels - 1;

	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
	assert(src != dst);

	if (src_x | src_y)
		src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
	assert(src_stride >= width * cpp);
	src_stride -= width * cpp;

	while (height--) {
		unsigned w = width * cpp;
		uint8_t *tile_row = dst;

		tile_row += dst_y / tile_height * dst_stride * tile_height;
		tile_row += (dst_y & (tile_height-1)) * tile_width;
		if (dst_x) {
			tile_row += (dst_x >> tile_shift) * tile_size;
			if (dst_x & tile_mask) {
				const unsigned x = (dst_x & tile_mask) * cpp;
				const unsigned len = min(tile_width - x, w);
				memcpy(assume_misaligned(tile_row + x, tile_width, x), src, len);

				tile_row += tile_size;
				src = (const uint8_t *)src + len;
				w -= len;
			}
		}
		while (w >= tile_width) {
			memcpy(assume_aligned(tile_row, tile_width),
			       src, tile_width);

			tile_row += tile_size;
			src = (const uint8_t *)src + tile_width;
			w -= tile_width;
		}
		memcpy(assume_aligned(tile_row, tile_width), src, w);
		src = (const uint8_t *)src + src_stride + w;
		dst_y++;
	}
}

static fast_memcpy void
memcpy_from_tiled_x__gen2(const void *src, void *dst, int bpp,
			  int32_t src_stride, int32_t dst_stride,
			  int16_t src_x, int16_t src_y,
			  int16_t dst_x, int16_t dst_y,
			  uint16_t width, uint16_t height)
{
	const unsigned tile_width = 128;
	const unsigned tile_height = 16;
	const unsigned tile_size = 2048;

	const unsigned cpp = bpp / 8;
	const unsigned tile_pixels = tile_width / cpp;
	const unsigned tile_shift = ffs(tile_pixels) - 1;
	const unsigned tile_mask = tile_pixels - 1;

	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
	assert(src != dst);

	if (dst_x | dst_y)
		dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp;
	assert(dst_stride >= width * cpp);
	dst_stride -= width * cpp;

	while (height--) {
		unsigned w = width * cpp;
		const uint8_t *tile_row = src;

		tile_row += src_y / tile_height * src_stride * tile_height;
		tile_row += (src_y & (tile_height-1)) * tile_width;
		if (src_x) {
			tile_row += (src_x >> tile_shift) * tile_size;
			if (src_x & tile_mask) {
				const unsigned x = (src_x & tile_mask) * cpp;
				const unsigned len = min(tile_width - x, w);
				memcpy(dst, assume_misaligned(tile_row + x, tile_width, x), len);

				tile_row += tile_size;
				dst = (uint8_t *)dst + len;
				w -= len;
			}
		}
		while (w >= tile_width) {
			memcpy(dst,
			       assume_aligned(tile_row, tile_width),
			       tile_width);

			tile_row += tile_size;
			dst = (uint8_t *)dst + tile_width;
			w -= tile_width;
		}
		memcpy(dst, assume_aligned(tile_row, tile_width), w);
		dst = (uint8_t *)dst + dst_stride + w;
		src_y++;
	}
}

void choose_memcpy_tiled_x(struct kgem *kgem, int swizzling, unsigned cpu)
{
	if (kgem->gen < 030) {
		if (swizzling == I915_BIT_6_SWIZZLE_NONE) {
			DBG(("%s: gen2, no swizzling\n", __FUNCTION__));
			kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__gen2;
			kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__gen2;
		} else
			DBG(("%s: no detiling with swizzle functions for gen2\n", __FUNCTION__));
		return;
	}

	switch (swizzling) {
	default:
		DBG(("%s: unknown swizzling, %d\n", __FUNCTION__, swizzling));
		break;
	case I915_BIT_6_SWIZZLE_NONE:
		DBG(("%s: no swizzling\n", __FUNCTION__));
#if defined(sse2)
		if (cpu & SSE2) {
			kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_0__sse2;
			kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_0__sse2;
			kgem->memcpy_between_tiled_x = memcpy_between_tiled_x__swizzle_0__sse2;
		} else
#endif
		{
			kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_0;
			kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_0;
			kgem->memcpy_between_tiled_x = memcpy_between_tiled_x__swizzle_0;
		}
		break;
	case I915_BIT_6_SWIZZLE_9:
		DBG(("%s: 6^9 swizzling\n", __FUNCTION__));
		kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9;
		kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_9;
		break;
	case I915_BIT_6_SWIZZLE_9_10:
		DBG(("%s: 6^9^10 swizzling\n", __FUNCTION__));
		kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9_10;
		kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_9_10;
		break;
	case I915_BIT_6_SWIZZLE_9_11:
		DBG(("%s: 6^9^11 swizzling\n", __FUNCTION__));
		kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9_11;
		kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_9_11;
		break;
	case I915_BIT_6_SWIZZLE_9_10_11:
		DBG(("%s: 6^9^10^11 swizzling\n", __FUNCTION__));
		kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9_10_11;
		kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_9_10_11;
		break;
	}
}

void
memmove_box(const void *src, void *dst,
	    int bpp, int32_t stride,
	    const BoxRec *box,
	    int dx, int dy)
{
#define FORCE_MEMMOVE 0
	union {
		uint8_t u8;
		uint16_t u16;
		uint32_t u32;
		uint64_t u64;
	} tmp;
	const uint8_t *src_bytes;
	uint8_t *dst_bytes;
	int width, height;

	assert(src);
	assert(dst);
	assert(src != dst);
	assert(bpp >= 8);
	assert(box->x2 > box->x1);
	assert(box->y2 > box->y1);

	DBG(("%s: box=(%d, %d), (%d, %d), pitch=%d, bpp=%d, dx=%d, dy=%d\n",
	     __FUNCTION__,
	     box->x1, box->y1, box->x2, box->y2,
	     stride, bpp, dx, dy));

	bpp /= 8;
	width = box->y1 * stride + box->x1 * bpp;
	src_bytes = (const uint8_t *)src + width;
	dst_bytes = (uint8_t *)dst + width;
	assert(dst_bytes != src_bytes);

	width = (box->x2 - box->x1) * bpp;
	height = (box->y2 - box->y1);
	assert(width <= stride);
	if (width == stride) {
		width *= height;
		height = 1;
	}

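	/*
	 * The two surfaces may overlap; dy encodes the vertical direction
	 * of the move, so walk the rows forwards for dy >= 0 and
	 * backwards otherwise to avoid overwriting rows before they are
	 * read. Within a row, memmove is chosen whenever the spans
	 * actually overlap.
	 */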
	if (dy >= 0) {
		switch (width) {
		case 1:
			do {
				*dst_bytes = tmp.u8 = *src_bytes;
				src_bytes += stride;
				dst_bytes += stride;
			} while (--height);
			break;

		case 2:
			do {
				*(uint16_t *)dst_bytes = tmp.u16 = *(const uint16_t *)src_bytes;
				src_bytes += stride;
				dst_bytes += stride;
			} while (--height);
			break;

		case 4:
			do {
				*(uint32_t *)dst_bytes = tmp.u32 = *(const uint32_t *)src_bytes;
				src_bytes += stride;
				dst_bytes += stride;
			} while (--height);
			break;

		case 8:
			do {
				*(uint64_t *)dst_bytes = tmp.u64 = *(const uint64_t *)src_bytes;
				src_bytes += stride;
				dst_bytes += stride;
			} while (--height);
			break;

		default:
			if (FORCE_MEMMOVE ||
			    (dst_bytes < src_bytes + width &&
			     src_bytes < dst_bytes + width)) {
				do {
					memmove(dst_bytes, src_bytes, width);
					src_bytes += stride;
					dst_bytes += stride;
				} while (--height);
			} else {
				do {
					memcpy(dst_bytes, src_bytes, width);
					src_bytes += stride;
					dst_bytes += stride;
				} while (--height);
			}
			break;
		}
	} else {
		src_bytes += (height-1) * stride;
		dst_bytes += (height-1) * stride;

		switch (width) {
		case 1:
			do {
				*dst_bytes = tmp.u8 = *src_bytes;
				src_bytes -= stride;
				dst_bytes -= stride;
			} while (--height);
			break;

		case 2:
			do {
				*(uint16_t *)dst_bytes = tmp.u16 = *(const uint16_t *)src_bytes;
				src_bytes -= stride;
				dst_bytes -= stride;
			} while (--height);
			break;

		case 4:
			do {
				*(uint32_t *)dst_bytes = tmp.u32 = *(const uint32_t *)src_bytes;
				src_bytes -= stride;
				dst_bytes -= stride;
			} while (--height);
			break;

		case 8:
			do {
				*(uint64_t *)dst_bytes = tmp.u64 = *(const uint64_t *)src_bytes;
				src_bytes -= stride;
				dst_bytes -= stride;
			} while (--height);
			break;

		default:
			if (FORCE_MEMMOVE ||
			    (dst_bytes < src_bytes + width &&
			     src_bytes < dst_bytes + width)) {
				do {
					memmove(dst_bytes, src_bytes, width);
					src_bytes -= stride;
					dst_bytes -= stride;
				} while (--height);
			} else {
				do {
					memcpy(dst_bytes, src_bytes, width);
					src_bytes -= stride;
					dst_bytes -= stride;
				} while (--height);
			}
			break;
		}
	}
}

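/*
 * memcpy_xor() copies while applying "dst = (src & and) | or" per pixel
 * (despite the name, the constant is combined with OR, not XOR). The
 * and == 0xffffffff case degenerates to an OR-only copy and takes the
 * fast paths, including the SSE2 loop on x86-64 that ORs a replicated
 * mask into four registers per iteration.
 */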
void
memcpy_xor(const void *src, void *dst, int bpp,
	   int32_t src_stride, int32_t dst_stride,
	   int16_t src_x, int16_t src_y,
	   int16_t dst_x, int16_t dst_y,
	   uint16_t width, uint16_t height,
	   uint32_t and, uint32_t or)
{
	const uint8_t *src_bytes;
	uint8_t *dst_bytes;
	int i, w;

	assert(width && height);
	assert(bpp >= 8);
	assert(width*bpp <= 8*src_stride);
	assert(width*bpp <= 8*dst_stride);

	DBG(("%s: src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d, bpp=%d, and=%x, xor=%x\n",
	     __FUNCTION__,
	     src_x, src_y, dst_x, dst_y,
	     width, height,
	     src_stride, dst_stride,
	     bpp, and, or));

	bpp /= 8;
	src_bytes = (const uint8_t *)src + src_stride * src_y + src_x * bpp;
	dst_bytes = (uint8_t *)dst + dst_stride * dst_y + dst_x * bpp;

	if (and == 0xffffffff) {
		switch (bpp) {
		case 1:
			if (width & 1) {
				do {
					for (i = 0; i < width; i++)
						dst_bytes[i] = src_bytes[i] | or;

					src_bytes += src_stride;
					dst_bytes += dst_stride;
				} while (--height);
				break;
			} else {
				width /= 2;
				or |= or << 8;
			}
			/* fall through */
		case 2:
			if (width & 1) {
				do {
					uint16_t *d = (uint16_t *)dst_bytes;
					const uint16_t *s = (const uint16_t *)src_bytes;

					for (i = 0; i < width; i++)
						d[i] = s[i] | or;

					src_bytes += src_stride;
					dst_bytes += dst_stride;
				} while (--height);
				break;
			} else {
				width /= 2;
				or |= or << 16;
			}
			/* fall through */
		case 4:
			w = width;
			if (w * 4 == dst_stride && dst_stride == src_stride) {
				w *= height;
				height = 1;
			}

#if defined(sse2) && __x86_64__
			if (have_sse2()) {
				do {
					uint32_t *d = (uint32_t *)dst_bytes;
					const uint32_t *s = (const uint32_t *)src_bytes;
					__m128i mask = xmm_create_mask_32(or);

					i = w;
					while (i && (uintptr_t)d & 15) {
						*d++ = *s++ | or;
						i--;
					}

					while (i >= 16) {
						__m128i xmm1, xmm2, xmm3, xmm4;

						xmm1 = xmm_load_128u((const __m128i*)s + 0);
						xmm2 = xmm_load_128u((const __m128i*)s + 1);
						xmm3 = xmm_load_128u((const __m128i*)s + 2);
						xmm4 = xmm_load_128u((const __m128i*)s + 3);

						xmm_save_128((__m128i*)d + 0,
							     _mm_or_si128(xmm1, mask));
						xmm_save_128((__m128i*)d + 1,
							     _mm_or_si128(xmm2, mask));
						xmm_save_128((__m128i*)d + 2,
							     _mm_or_si128(xmm3, mask));
						xmm_save_128((__m128i*)d + 3,
							     _mm_or_si128(xmm4, mask));

						d += 16;
						s += 16;
						i -= 16;
					}

					if (i & 8) {
						__m128i xmm1, xmm2;

						xmm1 = xmm_load_128u((const __m128i*)s + 0);
						xmm2 = xmm_load_128u((const __m128i*)s + 1);

						xmm_save_128((__m128i*)d + 0,
							     _mm_or_si128(xmm1, mask));
						xmm_save_128((__m128i*)d + 1,
							     _mm_or_si128(xmm2, mask));
						d += 8;
						s += 8;
						i -= 8;
					}

					if (i & 4) {
						xmm_save_128((__m128i*)d,
							     _mm_or_si128(xmm_load_128u((const __m128i*)s),
									  mask));

						d += 4;
						s += 4;
						i -= 4;
					}

					while (i) {
						*d++ = *s++ | or;
						i--;
					}

					src_bytes += src_stride;
					dst_bytes += dst_stride;
				} while (--height);
			} else
#endif
			{
				do {
					uint32_t *d = (uint32_t *)dst_bytes;
					const uint32_t *s = (const uint32_t *)src_bytes;

					for (i = 0; i < w; i++)
						d[i] = s[i] | or;

					src_bytes += src_stride;
					dst_bytes += dst_stride;
				} while (--height);
			}
			break;
		}
	} else {
		switch (bpp) {
		case 1:
			do {
				for (i = 0; i < width; i++)
					dst_bytes[i] = (src_bytes[i] & and) | or;

				src_bytes += src_stride;
				dst_bytes += dst_stride;
			} while (--height);
			break;

		case 2:
			do {
				uint16_t *d = (uint16_t *)dst_bytes;
				const uint16_t *s = (const uint16_t *)src_bytes;

				for (i = 0; i < width; i++)
					d[i] = (s[i] & and) | or;

				src_bytes += src_stride;
				dst_bytes += dst_stride;
			} while (--height);
			break;

		case 4:
			do {
				uint32_t *d = (uint32_t *)dst_bytes;
				const uint32_t *s = (const uint32_t *)src_bytes;

				for (i = 0; i < width; i++)
					d[i] = (s[i] & and) | or;

				src_bytes += src_stride;
				dst_bytes += dst_stride;
			} while (--height);
			break;
		}
	}
}

#define BILINEAR_INTERPOLATION_BITS 4
static inline int
bilinear_weight(pixman_fixed_t x)
{
	return (x >> (16 - BILINEAR_INTERPOLATION_BITS)) &
		((1 << BILINEAR_INTERPOLATION_BITS) - 1);
}
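
/*
 * pixman_fixed_t is 16.16 fixed point; bilinear_weight() keeps only the
 * top BILINEAR_INTERPOLATION_BITS of the fractional part, so with 4 bits
 * the filter weights run 0..15 and the per-channel products in the
 * interpolators below stay comfortably within 32 bits.
 */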

#if BILINEAR_INTERPOLATION_BITS <= 4
/* Inspired by Filter_32_opaque from Skia */
static inline uint32_t
bilinear_interpolation(uint32_t tl, uint32_t tr,
		       uint32_t bl, uint32_t br,
		       int distx, int disty)
{
	int distxy, distxiy, distixy, distixiy;
	uint32_t lo, hi;

	distx <<= (4 - BILINEAR_INTERPOLATION_BITS);
	disty <<= (4 - BILINEAR_INTERPOLATION_BITS);

	distxy = distx * disty;
	distxiy = (distx << 4) - distxy;	/* distx * (16 - disty) */
	distixy = (disty << 4) - distxy;	/* disty * (16 - distx) */
	distixiy =
		16 * 16 - (disty << 4) -
		(distx << 4) + distxy; /* (16 - distx) * (16 - disty) */

	lo = (tl & 0xff00ff) * distixiy;
	hi = ((tl >> 8) & 0xff00ff) * distixiy;

	lo += (tr & 0xff00ff) * distxiy;
	hi += ((tr >> 8) & 0xff00ff) * distxiy;

	lo += (bl & 0xff00ff) * distixy;
	hi += ((bl >> 8) & 0xff00ff) * distixy;

	lo += (br & 0xff00ff) * distxy;
	hi += ((br >> 8) & 0xff00ff) * distxy;

	return ((lo >> 8) & 0xff00ff) | (hi & ~0xff00ff);
}
#elif SIZEOF_LONG > 4
static inline uint32_t
bilinear_interpolation(uint32_t tl, uint32_t tr,
		       uint32_t bl, uint32_t br,
		       int distx, int disty)
{
	uint64_t distxy, distxiy, distixy, distixiy;
	uint64_t tl64, tr64, bl64, br64;
	uint64_t f, r;

	distx <<= (8 - BILINEAR_INTERPOLATION_BITS);
	disty <<= (8 - BILINEAR_INTERPOLATION_BITS);

	distxy = distx * disty;
	distxiy = distx * (256 - disty);
	distixy = (256 - distx) * disty;
	distixiy = (256 - distx) * (256 - disty);

	/* Alpha and Blue */
	tl64 = tl & 0xff0000ff;
	tr64 = tr & 0xff0000ff;
	bl64 = bl & 0xff0000ff;
	br64 = br & 0xff0000ff;

	f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
	r = f & 0x0000ff0000ff0000ull;

	/* Red and Green */
	tl64 = tl;
	tl64 = ((tl64 << 16) & 0x000000ff00000000ull) | (tl64 & 0x0000ff00ull);

	tr64 = tr;
	tr64 = ((tr64 << 16) & 0x000000ff00000000ull) | (tr64 & 0x0000ff00ull);

	bl64 = bl;
	bl64 = ((bl64 << 16) & 0x000000ff00000000ull) | (bl64 & 0x0000ff00ull);

	br64 = br;
	br64 = ((br64 << 16) & 0x000000ff00000000ull) | (br64 & 0x0000ff00ull);

	f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
	r |= ((f >> 16) & 0x000000ff00000000ull) | (f & 0xff000000ull);

	return (uint32_t)(r >> 16);
}
#else
static inline uint32_t
bilinear_interpolation(uint32_t tl, uint32_t tr,
		       uint32_t bl, uint32_t br,
		       int distx, int disty)
{
	int distxy, distxiy, distixy, distixiy;
	uint32_t f, r;

	distx <<= (8 - BILINEAR_INTERPOLATION_BITS);
	disty <<= (8 - BILINEAR_INTERPOLATION_BITS);

	distxy = distx * disty;
	distxiy = (distx << 8) - distxy;	/* distx * (256 - disty) */
	distixy = (disty << 8) - distxy;	/* disty * (256 - distx) */
	distixiy =
		256 * 256 - (disty << 8) -
		(distx << 8) + distxy;		/* (256 - distx) * (256 - disty) */

	/* Blue */
	r = ((tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy +
	     (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy);

	/* Green */
	f = ((tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy +
	     (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy);
	r |= f & 0xff000000;

	tl >>= 16;
	tr >>= 16;
	bl >>= 16;
	br >>= 16;
	r >>= 16;

	/* Red */
	f = ((tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy +
	     (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy);
	r |= f & 0x00ff0000;

	/* Alpha */
	f = ((tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy +
	     (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy);
	r |= f & 0xff000000;

	return r;
}
#endif

static inline uint32_t convert_pixel(const uint8_t *p, int x)
{
	return ((const uint32_t *)p)[x];
}

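/*
 * affine_blt() is a bilinear-filtered copy under an affine transform,
 * for 32bpp surfaces only (see the assert below). Each destination pixel
 * is mapped through the transform at its centre, the result split into
 * integer and fractional parts, and the four neighbouring texels
 * blended; samples falling outside the source read from the zero[] row
 * or as transparent black, giving a hard edge at the borders.
 */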
fast void
affine_blt(const void *src, void *dst, int bpp,
	   int16_t src_x, int16_t src_y,
	   int16_t src_width, int16_t src_height,
	   int32_t src_stride,
	   int16_t dst_x, int16_t dst_y,
	   uint16_t dst_width, uint16_t dst_height,
	   int32_t dst_stride,
	   const struct pixman_f_transform *t)
{
	static const uint8_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
	const pixman_fixed_t ux = pixman_double_to_fixed(t->m[0][0]);
	const pixman_fixed_t uy = pixman_double_to_fixed(t->m[1][0]);
	int i, j;

	assert(bpp == 32);

	for (j = 0; j < dst_height; j++) {
		pixman_fixed_t x, y;
		struct pixman_f_vector v;
		uint32_t *b;

		/* reference point is the center of the pixel */
		v.v[0] = dst_x + 0.5;
		v.v[1] = dst_y + j + 0.5;
		v.v[2] = 1.0;

		pixman_f_transform_point_3d(t, &v);

		x = pixman_double_to_fixed(v.v[0]);
		x += pixman_int_to_fixed(src_x - dst_x);
		y = pixman_double_to_fixed(v.v[1]);
		y += pixman_int_to_fixed(src_y - dst_y);

		b = (uint32_t*)((uint8_t *)dst + (dst_y + j) * dst_stride + dst_x * bpp / 8);
		for (i = 0; i < dst_width; i++) {
			const uint8_t *row1;
			const uint8_t *row2;
			int x1, y1, x2, y2;
			uint32_t tl, tr, bl, br;
			int32_t fx, fy;

			x1 = x - pixman_fixed_1/2;
			y1 = y - pixman_fixed_1/2;

			fx = bilinear_weight(x1);
			fy = bilinear_weight(y1);

			x1 = pixman_fixed_to_int(x1);
			x2 = x1 + 1;
			y1 = pixman_fixed_to_int(y1);
			y2 = y1 + 1;

			if (x1 >= src_width  || x2 < 0 ||
			    y1 >= src_height || y2 < 0) {
				b[i] = 0;
				goto next;
			}

			if (y2 == 0) {
				row1 = zero;
			} else {
				row1 = (const uint8_t *)src + src_stride * y1;
				row1 += bpp / 8 * x1;
			}

			if (y1 == src_height - 1) {
				row2 = zero;
			} else {
				row2 = (const uint8_t *)src + src_stride * y2;
				row2 += bpp / 8 * x1;
			}

			if (x2 == 0) {
				tl = 0;
				bl = 0;
			} else {
				tl = convert_pixel(row1, 0);
				bl = convert_pixel(row2, 0);
			}

			if (x1 == src_width - 1) {
				tr = 0;
				br = 0;
			} else {
				tr = convert_pixel(row1, 1);
				br = convert_pixel(row2, 1);
			}

			b[i] = bilinear_interpolation(tl, tr, bl, br, fx, fy);

next:
			x += ux;
			y += uy;
		}
	}
}