/*
 * Copyright (c) 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Chris Wilson <chris@chris-wilson.co.uk>
 *
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include "sna.h"
#include <pixman.h>

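/*
 * Everything inside the block below is compiled with SSE2 code generation
 * enabled by the GCC pragmas (the file as a whole may be built for a
 * baseline ISA). On 64-bit x86, SSE2 is architecturally guaranteed, so
 * have_sse2() collapses to a compile-time constant; on 32-bit it falls
 * back to a cached runtime CPUID check via sna_cpu_detect().
 */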
#if defined(sse2)
#pragma GCC push_options
#pragma GCC target("sse2,inline-all-stringops,fpmath=sse")
#pragma GCC optimize("Ofast")
#include <xmmintrin.h>

#if __x86_64__
#define have_sse2() 1
#else
static bool have_sse2(void)
{
	static int sse2_present = -1;

	if (sse2_present == -1)
		sse2_present = sna_cpu_detect() & SSE2;

	return sse2_present;
}
#endif

static force_inline __m128i
xmm_create_mask_32(uint32_t mask)
{
	return _mm_set_epi32(mask, mask, mask, mask);
}

static force_inline __m128i
xmm_load_128(const __m128i *src)
{
	return _mm_load_si128(src);
}

static force_inline __m128i
xmm_load_128u(const __m128i *src)
{
	return _mm_loadu_si128(src);
}

static force_inline void
xmm_save_128(__m128i *dst, __m128i data)
{
	_mm_store_si128(dst, data);
}

static force_inline void
xmm_save_128u(__m128i *dst, __m128i data)
{
	_mm_storeu_si128(dst, data);
}

static force_inline void
to_sse128xN(uint8_t *dst, const uint8_t *src, int bytes)
{
	int i;

	for (i = 0; i < bytes / 128; i++) {
		__m128i xmm0, xmm1, xmm2, xmm3;
		__m128i xmm4, xmm5, xmm6, xmm7;

		xmm0 = xmm_load_128u((const __m128i*)src + 0);
		xmm1 = xmm_load_128u((const __m128i*)src + 1);
		xmm2 = xmm_load_128u((const __m128i*)src + 2);
		xmm3 = xmm_load_128u((const __m128i*)src + 3);
		xmm4 = xmm_load_128u((const __m128i*)src + 4);
		xmm5 = xmm_load_128u((const __m128i*)src + 5);
		xmm6 = xmm_load_128u((const __m128i*)src + 6);
		xmm7 = xmm_load_128u((const __m128i*)src + 7);

		xmm_save_128((__m128i*)dst + 0, xmm0);
		xmm_save_128((__m128i*)dst + 1, xmm1);
		xmm_save_128((__m128i*)dst + 2, xmm2);
		xmm_save_128((__m128i*)dst + 3, xmm3);
		xmm_save_128((__m128i*)dst + 4, xmm4);
		xmm_save_128((__m128i*)dst + 5, xmm5);
		xmm_save_128((__m128i*)dst + 6, xmm6);
		xmm_save_128((__m128i*)dst + 7, xmm7);

		dst += 128;
		src += 128;
	}
}

static force_inline void
to_sse64(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm1, xmm2, xmm3, xmm4;

	xmm1 = xmm_load_128u((const __m128i*)src + 0);
	xmm2 = xmm_load_128u((const __m128i*)src + 1);
	xmm3 = xmm_load_128u((const __m128i*)src + 2);
	xmm4 = xmm_load_128u((const __m128i*)src + 3);

	xmm_save_128((__m128i*)dst + 0, xmm1);
	xmm_save_128((__m128i*)dst + 1, xmm2);
	xmm_save_128((__m128i*)dst + 2, xmm3);
	xmm_save_128((__m128i*)dst + 3, xmm4);
}

static force_inline void
to_sse32(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm1, xmm2;

	xmm1 = xmm_load_128u((const __m128i*)src + 0);
	xmm2 = xmm_load_128u((const __m128i*)src + 1);

	xmm_save_128((__m128i*)dst + 0, xmm1);
	xmm_save_128((__m128i*)dst + 1, xmm2);
}

static force_inline void
to_sse16(uint8_t *dst, const uint8_t *src)
{
	xmm_save_128((__m128i*)dst, xmm_load_128u((const __m128i*)src));
}

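/*
 * Copy an arbitrary number of bytes to a destination that can be brought
 * to 16-byte alignment: peel off leading bytes until dst is aligned,
 * stream 64/32/16-byte SSE chunks, then mop up the sub-16-byte tail.
 * Only the destination is aligned this way; the source is always loaded
 * unaligned.
 */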
static void to_memcpy(uint8_t *dst, const uint8_t *src, unsigned len)
{
	assert(len);
	if ((uintptr_t)dst & 15) {
		if (len <= 16 - ((uintptr_t)dst & 15)) {
			memcpy(dst, src, len);
			return;
		}

		if ((uintptr_t)dst & 1) {
			assert(len >= 1);
			*dst++ = *src++;
			len--;
		}
		if ((uintptr_t)dst & 2) {
			assert(((uintptr_t)dst & 1) == 0);
			assert(len >= 2);
			*(uint16_t *)dst = *(const uint16_t *)src;
			dst += 2;
			src += 2;
			len -= 2;
		}
		if ((uintptr_t)dst & 4) {
			assert(((uintptr_t)dst & 3) == 0);
			assert(len >= 4);
			*(uint32_t *)dst = *(const uint32_t *)src;
			dst += 4;
			src += 4;
			len -= 4;
		}
		if ((uintptr_t)dst & 8) {
			assert(((uintptr_t)dst & 7) == 0);
			assert(len >= 8);
			*(uint64_t *)dst = *(const uint64_t *)src;
			dst += 8;
			src += 8;
			len -= 8;
		}
	}

	assert(((uintptr_t)dst & 15) == 0);
	while (len >= 64) {
		to_sse64(dst, src);
		dst += 64;
		src += 64;
		len -= 64;
	}
	if (len == 0)
		return;

	if (len & 32) {
		to_sse32(dst, src);
		dst += 32;
		src += 32;
	}
	if (len & 16) {
		to_sse16(dst, src);
		dst += 16;
		src += 16;
	}
	if (len & 8) {
		*(uint64_t *)dst = *(const uint64_t *)src;
		dst += 8;
		src += 8;
	}
	if (len & 4) {
		*(uint32_t *)dst = *(const uint32_t *)src;
		dst += 4;
		src += 4;
	}
	memcpy(dst, src, len & 3);
}

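/*
 * X-tiling (as used from gen3 onwards): a 4096-byte tile is 512 bytes
 * wide by 8 rows high, and tiles are laid out row-major across the
 * surface, so stepping 512 bytes along a pixel row means jumping
 * tile_size bytes in the tiled buffer. Roughly, the byte at linear
 * (x, y) lives at (y/8)*stride*8 + (x/512)*4096 + (y%8)*512 + x%512.
 * The copy below splits each row into an unaligned head inside the
 * first tile, whole 512-byte tile spans, and a tail.
 */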
static void
memcpy_to_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
				   int32_t src_stride, int32_t dst_stride,
				   int16_t src_x, int16_t src_y,
				   int16_t dst_x, int16_t dst_y,
				   uint16_t width, uint16_t height)
{
	const unsigned tile_width = 512;
	const unsigned tile_height = 8;
	const unsigned tile_size = 4096;

	const unsigned cpp = bpp / 8;
	const unsigned tile_pixels = tile_width / cpp;
	const unsigned tile_shift = ffs(tile_pixels) - 1;
	const unsigned tile_mask = tile_pixels - 1;

	unsigned offset_x, length_x;

	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
	assert(src != dst);

	if (src_x | src_y)
		src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
	width *= cpp;
	assert(src_stride >= width);

	if (dst_x & tile_mask) {
		offset_x = (dst_x & tile_mask) * cpp;
		length_x = min(tile_width - offset_x, width);
	} else
		length_x = 0;
	dst = (uint8_t *)dst + (dst_x >> tile_shift) * tile_size;

	while (height--) {
		unsigned w = width;
		const uint8_t *src_row = src;
		uint8_t *tile_row = dst;

		src = (const uint8_t *)src + src_stride;

		tile_row += dst_y / tile_height * dst_stride * tile_height;
		tile_row += (dst_y & (tile_height-1)) * tile_width;
		dst_y++;

		if (length_x) {
			to_memcpy(tile_row + offset_x, src_row, length_x);

			tile_row += tile_size;
			src_row = (const uint8_t *)src_row + length_x;
			w -= length_x;
		}
		while (w >= tile_width) {
			assert(((uintptr_t)tile_row & (tile_width - 1)) == 0);
			to_sse128xN(assume_aligned(tile_row, tile_width),
				    src_row, tile_width);
			tile_row += tile_size;
			src_row = (const uint8_t *)src_row + tile_width;
			w -= tile_width;
		}
		if (w) {
			assert(((uintptr_t)tile_row & (tile_width - 1)) == 0);
			to_memcpy(assume_aligned(tile_row, tile_width),
				  src_row, w);
		}
	}
}

static force_inline void
from_sse128xNu(uint8_t *dst, const uint8_t *src, int bytes)
{
	int i;

	assert(((uintptr_t)src & 15) == 0);

	for (i = 0; i < bytes / 128; i++) {
		__m128i xmm0, xmm1, xmm2, xmm3;
		__m128i xmm4, xmm5, xmm6, xmm7;

		xmm0 = xmm_load_128((const __m128i*)src + 0);
		xmm1 = xmm_load_128((const __m128i*)src + 1);
		xmm2 = xmm_load_128((const __m128i*)src + 2);
		xmm3 = xmm_load_128((const __m128i*)src + 3);
		xmm4 = xmm_load_128((const __m128i*)src + 4);
		xmm5 = xmm_load_128((const __m128i*)src + 5);
		xmm6 = xmm_load_128((const __m128i*)src + 6);
		xmm7 = xmm_load_128((const __m128i*)src + 7);

		xmm_save_128u((__m128i*)dst + 0, xmm0);
		xmm_save_128u((__m128i*)dst + 1, xmm1);
		xmm_save_128u((__m128i*)dst + 2, xmm2);
		xmm_save_128u((__m128i*)dst + 3, xmm3);
		xmm_save_128u((__m128i*)dst + 4, xmm4);
		xmm_save_128u((__m128i*)dst + 5, xmm5);
		xmm_save_128u((__m128i*)dst + 6, xmm6);
		xmm_save_128u((__m128i*)dst + 7, xmm7);

		dst += 128;
		src += 128;
	}
}

static force_inline void
from_sse128xNa(uint8_t *dst, const uint8_t *src, int bytes)
{
	int i;

	assert(((uintptr_t)dst & 15) == 0);
	assert(((uintptr_t)src & 15) == 0);

	for (i = 0; i < bytes / 128; i++) {
		__m128i xmm0, xmm1, xmm2, xmm3;
		__m128i xmm4, xmm5, xmm6, xmm7;

		xmm0 = xmm_load_128((const __m128i*)src + 0);
		xmm1 = xmm_load_128((const __m128i*)src + 1);
		xmm2 = xmm_load_128((const __m128i*)src + 2);
		xmm3 = xmm_load_128((const __m128i*)src + 3);
		xmm4 = xmm_load_128((const __m128i*)src + 4);
		xmm5 = xmm_load_128((const __m128i*)src + 5);
		xmm6 = xmm_load_128((const __m128i*)src + 6);
		xmm7 = xmm_load_128((const __m128i*)src + 7);

		xmm_save_128((__m128i*)dst + 0, xmm0);
		xmm_save_128((__m128i*)dst + 1, xmm1);
		xmm_save_128((__m128i*)dst + 2, xmm2);
		xmm_save_128((__m128i*)dst + 3, xmm3);
		xmm_save_128((__m128i*)dst + 4, xmm4);
		xmm_save_128((__m128i*)dst + 5, xmm5);
		xmm_save_128((__m128i*)dst + 6, xmm6);
		xmm_save_128((__m128i*)dst + 7, xmm7);

		dst += 128;
		src += 128;
	}
}

static force_inline void
from_sse64u(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm1, xmm2, xmm3, xmm4;

	assert(((uintptr_t)src & 15) == 0);

	xmm1 = xmm_load_128((const __m128i*)src + 0);
	xmm2 = xmm_load_128((const __m128i*)src + 1);
	xmm3 = xmm_load_128((const __m128i*)src + 2);
	xmm4 = xmm_load_128((const __m128i*)src + 3);

	xmm_save_128u((__m128i*)dst + 0, xmm1);
	xmm_save_128u((__m128i*)dst + 1, xmm2);
	xmm_save_128u((__m128i*)dst + 2, xmm3);
	xmm_save_128u((__m128i*)dst + 3, xmm4);
}

static force_inline void
from_sse64a(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm1, xmm2, xmm3, xmm4;

	assert(((uintptr_t)dst & 15) == 0);
	assert(((uintptr_t)src & 15) == 0);

	xmm1 = xmm_load_128((const __m128i*)src + 0);
	xmm2 = xmm_load_128((const __m128i*)src + 1);
	xmm3 = xmm_load_128((const __m128i*)src + 2);
	xmm4 = xmm_load_128((const __m128i*)src + 3);

	xmm_save_128((__m128i*)dst + 0, xmm1);
	xmm_save_128((__m128i*)dst + 1, xmm2);
	xmm_save_128((__m128i*)dst + 2, xmm3);
	xmm_save_128((__m128i*)dst + 3, xmm4);
}

static force_inline void
from_sse32u(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm1, xmm2;

	xmm1 = xmm_load_128((const __m128i*)src + 0);
	xmm2 = xmm_load_128((const __m128i*)src + 1);

	xmm_save_128u((__m128i*)dst + 0, xmm1);
	xmm_save_128u((__m128i*)dst + 1, xmm2);
}

static force_inline void
from_sse32a(uint8_t *dst, const uint8_t *src)
{
	__m128i xmm1, xmm2;

	assert(((uintptr_t)dst & 15) == 0);
	assert(((uintptr_t)src & 15) == 0);

	xmm1 = xmm_load_128((const __m128i*)src + 0);
	xmm2 = xmm_load_128((const __m128i*)src + 1);

	xmm_save_128((__m128i*)dst + 0, xmm1);
	xmm_save_128((__m128i*)dst + 1, xmm2);
}

static force_inline void
from_sse16u(uint8_t *dst, const uint8_t *src)
{
	assert(((uintptr_t)src & 15) == 0);

	xmm_save_128u((__m128i*)dst, xmm_load_128((const __m128i*)src));
}

static force_inline void
from_sse16a(uint8_t *dst, const uint8_t *src)
{
	assert(((uintptr_t)dst & 15) == 0);
	assert(((uintptr_t)src & 15) == 0);

	xmm_save_128((__m128i*)dst, xmm_load_128((const __m128i*)src));
}

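/*
 * Reading back from a tiled surface the roles flip: loads are aligned
 * and stores may not be. dst_stride is pre-biased below so that adding
 * it at the end of each row advances dst by exactly the bytes the row
 * loop did not already consume; the aligned loops stop at a 16-byte
 * boundary and the final memcpy of (w & 15) bytes does not move dst,
 * hence the "& 15" correction.
 */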
static void
memcpy_from_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
				     int32_t src_stride, int32_t dst_stride,
				     int16_t src_x, int16_t src_y,
				     int16_t dst_x, int16_t dst_y,
				     uint16_t width, uint16_t height)
{
	const unsigned tile_width = 512;
	const unsigned tile_height = 8;
	const unsigned tile_size = 4096;

	const unsigned cpp = bpp / 8;
	const unsigned tile_pixels = tile_width / cpp;
	const unsigned tile_shift = ffs(tile_pixels) - 1;
	const unsigned tile_mask = tile_pixels - 1;

	unsigned length_x, offset_x;

	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
	assert(src != dst);

	if (dst_x | dst_y)
		dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp;
	width *= cpp;
	assert(dst_stride >= width);
	if (src_x & tile_mask) {
		offset_x = (src_x & tile_mask) * cpp;
		length_x = min(tile_width - offset_x, width);
		dst_stride -= width;
		dst_stride += (width - length_x) & 15;
	} else {
		offset_x = 0;
		dst_stride -= width & ~15;
	}
	assert(dst_stride >= 0);
	src = (const uint8_t *)src + (src_x >> tile_shift) * tile_size;

	while (height--) {
		unsigned w = width;
		const uint8_t *tile_row = src;

		tile_row += src_y / tile_height * src_stride * tile_height;
		tile_row += (src_y & (tile_height-1)) * tile_width;
		src_y++;

		if (offset_x) {
			memcpy(dst, tile_row + offset_x, length_x);
			tile_row += tile_size;
			dst = (uint8_t *)dst + length_x;
			w -= length_x;
		}

		if ((uintptr_t)dst & 15) {
			while (w >= tile_width) {
				from_sse128xNu(dst,
					       assume_aligned(tile_row, tile_width),
					       tile_width);
				tile_row += tile_size;
				dst = (uint8_t *)dst + tile_width;
				w -= tile_width;
			}
			while (w >= 64) {
				from_sse64u(dst, tile_row);
				tile_row += 64;
				dst = (uint8_t *)dst + 64;
				w -= 64;
			}
			if (w & 32) {
				from_sse32u(dst, tile_row);
				tile_row += 32;
				dst = (uint8_t *)dst + 32;
			}
			if (w & 16) {
				from_sse16u(dst, tile_row);
				tile_row += 16;
				dst = (uint8_t *)dst + 16;
			}
			memcpy(dst, assume_aligned(tile_row, 16), w & 15);
		} else {
			while (w >= tile_width) {
				from_sse128xNa(assume_aligned(dst, 16),
					       assume_aligned(tile_row, tile_width),
					       tile_width);
				tile_row += tile_size;
				dst = (uint8_t *)dst + tile_width;
				w -= tile_width;
			}
			while (w >= 64) {
				from_sse64a(dst, tile_row);
				tile_row += 64;
				dst = (uint8_t *)dst + 64;
				w -= 64;
			}
			if (w & 32) {
				from_sse32a(dst, tile_row);
				tile_row += 32;
				dst = (uint8_t *)dst + 32;
			}
			if (w & 16) {
				from_sse16a(dst, tile_row);
				tile_row += 16;
				dst = (uint8_t *)dst + 16;
			}
			memcpy(assume_aligned(dst, 16),
			       assume_aligned(tile_row, 16),
			       w & 15);
		}
		dst = (uint8_t *)dst + dst_stride;
	}
}

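/*
 * Tile-to-tile copy for the no-swizzle layout. Both surfaces must start
 * at the same offset within a tile (asserted below), so every span is
 * identically aligned on both sides and rows can be walked a whole
 * 512-byte tile width at a time.
 */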
static void
memcpy_between_tiled_x__swizzle_0__sse2(const void *src, void *dst, int bpp,
					int32_t src_stride, int32_t dst_stride,
					int16_t src_x, int16_t src_y,
					int16_t dst_x, int16_t dst_y,
					uint16_t width, uint16_t height)
{
	const unsigned tile_width = 512;
	const unsigned tile_height = 8;
	const unsigned tile_size = 4096;

	const unsigned cpp = bpp / 8;
	const unsigned tile_pixels = tile_width / cpp;
	const unsigned tile_shift = ffs(tile_pixels) - 1;
	const unsigned tile_mask = tile_pixels - 1;

	unsigned ox, lx;

	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
	assert(src != dst);

	width *= cpp;
	dst_stride *= tile_height;
	src_stride *= tile_height;

	assert((dst_x & tile_mask) == (src_x & tile_mask));
	if (dst_x & tile_mask) {
		ox = (dst_x & tile_mask) * cpp;
		lx = min(tile_width - ox, width);
		assert(lx != 0);
	} else
		lx = 0;

	if (dst_x)
		dst = (uint8_t *)dst + (dst_x >> tile_shift) * tile_size;
	if (src_x)
		src = (const uint8_t *)src + (src_x >> tile_shift) * tile_size;

	while (height--) {
		const uint8_t *src_row;
		uint8_t *dst_row;
		unsigned w = width;

		dst_row = dst;
		dst_row += dst_y / tile_height * dst_stride;
		dst_row += (dst_y & (tile_height-1)) * tile_width;
		dst_y++;

		src_row = src;
		src_row += src_y / tile_height * src_stride;
		src_row += (src_y & (tile_height-1)) * tile_width;
		src_y++;

		if (lx) {
			to_memcpy(dst_row + ox, src_row + ox, lx);
			dst_row += tile_size;
			src_row += tile_size;
			w -= lx;
		}
		while (w >= tile_width) {
			assert(((uintptr_t)dst_row & (tile_width - 1)) == 0);
			assert(((uintptr_t)src_row & (tile_width - 1)) == 0);
			to_sse128xN(assume_aligned(dst_row, tile_width),
				    assume_aligned(src_row, tile_width),
				    tile_width);
			dst_row += tile_size;
			src_row += tile_size;
			w -= tile_width;
		}
		if (w) {
			assert(((uintptr_t)dst_row & (tile_width - 1)) == 0);
			assert(((uintptr_t)src_row & (tile_width - 1)) == 0);
			to_memcpy(assume_aligned(dst_row, tile_width),
				  assume_aligned(src_row, tile_width),
				  w);
		}
	}
}

/* Restore the options pushed at the top of the sse2 block, so the
 * remainder of the file is compiled for the baseline ISA again.
 */
#pragma GCC pop_options
#endif

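/*
 * Generic linear blitter: special-cases the common narrow byte widths so
 * a whole row becomes a single integer store, and collapses the copy
 * into one long row when both pitches exactly match the byte width.
 */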
fast void
memcpy_blt(const void *src, void *dst, int bpp,
	   int32_t src_stride, int32_t dst_stride,
	   int16_t src_x, int16_t src_y,
	   int16_t dst_x, int16_t dst_y,
	   uint16_t width, uint16_t height)
{
	const uint8_t *src_bytes;
	uint8_t *dst_bytes;
	int byte_width;

	assert(src);
	assert(dst);
	assert(width && height);
	assert(bpp >= 8);
	assert(width*bpp <= 8*src_stride);
	assert(width*bpp <= 8*dst_stride);

	DBG(("%s: src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
	     __FUNCTION__, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));

	bpp /= 8;

	src_bytes = (const uint8_t *)src + src_stride * src_y + src_x * bpp;
	dst_bytes = (uint8_t *)dst + dst_stride * dst_y + dst_x * bpp;

	byte_width = width * bpp;
	if (byte_width == src_stride && byte_width == dst_stride) {
		byte_width *= height;
		height = 1;
	}

	switch (byte_width) {
	case 1:
		do {
			*dst_bytes = *src_bytes;
			src_bytes += src_stride;
			dst_bytes += dst_stride;
		} while (--height);
		break;

	case 2:
		do {
			*(uint16_t *)dst_bytes = *(const uint16_t *)src_bytes;
			src_bytes += src_stride;
			dst_bytes += dst_stride;
		} while (--height);
		break;

	case 4:
		do {
			*(uint32_t *)dst_bytes = *(const uint32_t *)src_bytes;
			src_bytes += src_stride;
			dst_bytes += dst_stride;
		} while (--height);
		break;

	case 8:
		do {
			*(uint64_t *)dst_bytes = *(const uint64_t *)src_bytes;
			src_bytes += src_stride;
			dst_bytes += dst_stride;
		} while (--height);
		break;
	case 16:
		do {
			((uint64_t *)dst_bytes)[0] = ((const uint64_t *)src_bytes)[0];
			((uint64_t *)dst_bytes)[1] = ((const uint64_t *)src_bytes)[1];
			src_bytes += src_stride;
			dst_bytes += dst_stride;
		} while (--height);
		break;

	default:
		do {
			memcpy(dst_bytes, src_bytes, byte_width);
			src_bytes += src_stride;
			dst_bytes += dst_stride;
		} while (--height);
		break;
	}
}

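/*
 * Plain-C versions of the swizzle_0 tiled copiers, used when the SSE2
 * paths above are compiled out or the CPU lacks SSE2 at runtime; memcpy
 * over the cache-aligned 512-byte tile spans does the heavy lifting.
 */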
static fast_memcpy void
memcpy_to_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
			     int32_t src_stride, int32_t dst_stride,
			     int16_t src_x, int16_t src_y,
			     int16_t dst_x, int16_t dst_y,
			     uint16_t width, uint16_t height)
{
	const unsigned tile_width = 512;
	const unsigned tile_height = 8;
	const unsigned tile_size = 4096;

	const unsigned cpp = bpp / 8;
	const unsigned tile_pixels = tile_width / cpp;
	const unsigned tile_shift = ffs(tile_pixels) - 1;
	const unsigned tile_mask = tile_pixels - 1;

	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
	assert(src != dst);

	if (src_x | src_y)
		src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
	assert(src_stride >= width * cpp);
	src_stride -= width * cpp;

	while (height--) {
		unsigned w = width * cpp;
		uint8_t *tile_row = dst;

		tile_row += dst_y / tile_height * dst_stride * tile_height;
		tile_row += (dst_y & (tile_height-1)) * tile_width;
		if (dst_x) {
			tile_row += (dst_x >> tile_shift) * tile_size;
			if (dst_x & tile_mask) {
				const unsigned x = (dst_x & tile_mask) * cpp;
				const unsigned len = min(tile_width - x, w);
				memcpy(assume_misaligned(tile_row + x, tile_width, x),
				       src, len);

				tile_row += tile_size;
				src = (const uint8_t *)src + len;
				w -= len;
			}
		}
		while (w >= tile_width) {
			memcpy(assume_aligned(tile_row, tile_width),
			       src, tile_width);
			tile_row += tile_size;
			src = (const uint8_t *)src + tile_width;
			w -= tile_width;
		}
		memcpy(assume_aligned(tile_row, tile_width), src, w);
		src = (const uint8_t *)src + src_stride + w;
		dst_y++;
	}
}

static fast_memcpy void
memcpy_from_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
			       int32_t src_stride, int32_t dst_stride,
			       int16_t src_x, int16_t src_y,
			       int16_t dst_x, int16_t dst_y,
			       uint16_t width, uint16_t height)
{
	const unsigned tile_width = 512;
	const unsigned tile_height = 8;
	const unsigned tile_size = 4096;

	const unsigned cpp = bpp / 8;
	const unsigned tile_pixels = tile_width / cpp;
	const unsigned tile_shift = ffs(tile_pixels) - 1;
	const unsigned tile_mask = tile_pixels - 1;

	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
	assert(src != dst);

	if (dst_x | dst_y)
		dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp;
	assert(dst_stride >= width * cpp);
	dst_stride -= width * cpp;

	while (height--) {
		unsigned w = width * cpp;
		const uint8_t *tile_row = src;

		tile_row += src_y / tile_height * src_stride * tile_height;
		tile_row += (src_y & (tile_height-1)) * tile_width;
		if (src_x) {
			tile_row += (src_x >> tile_shift) * tile_size;
			if (src_x & tile_mask) {
				const unsigned x = (src_x & tile_mask) * cpp;
				const unsigned len = min(tile_width - x, w);
				memcpy(dst, assume_misaligned(tile_row + x, tile_width, x), len);

				tile_row += tile_size;
				dst = (uint8_t *)dst + len;
				w -= len;
			}
		}
		while (w >= tile_width) {
			memcpy(dst,
			       assume_aligned(tile_row, tile_width),
			       tile_width);

			tile_row += tile_size;
			dst = (uint8_t *)dst + tile_width;
			w -= tile_width;
		}
		memcpy(dst, assume_aligned(tile_row, tile_width), w);
		dst = (uint8_t *)dst + dst_stride + w;
		src_y++;
	}
}

static fast_memcpy void
memcpy_between_tiled_x__swizzle_0(const void *src, void *dst, int bpp,
				  int32_t src_stride, int32_t dst_stride,
				  int16_t src_x, int16_t src_y,
				  int16_t dst_x, int16_t dst_y,
				  uint16_t width, uint16_t height)
{
	const unsigned tile_width = 512;
	const unsigned tile_height = 8;
	const unsigned tile_size = 4096;

	const unsigned cpp = bpp / 8;
	const unsigned tile_pixels = tile_width / cpp;
	const unsigned tile_shift = ffs(tile_pixels) - 1;
	const unsigned tile_mask = tile_pixels - 1;

	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
	assert(src != dst);
	assert((dst_x & tile_mask) == (src_x & tile_mask));

	while (height--) {
		unsigned w = width * cpp;
		uint8_t *dst_row = dst;
		const uint8_t *src_row = src;

		dst_row += dst_y / tile_height * dst_stride * tile_height;
		dst_row += (dst_y & (tile_height-1)) * tile_width;
		if (dst_x)
			dst_row += (dst_x >> tile_shift) * tile_size;
		dst_y++;

		src_row += src_y / tile_height * src_stride * tile_height;
		src_row += (src_y & (tile_height-1)) * tile_width;
		if (src_x)
			src_row += (src_x >> tile_shift) * tile_size;
		src_y++;

		if (dst_x & tile_mask) {
			const unsigned x = (dst_x & tile_mask) * cpp;
			const unsigned len = min(tile_width - x, w);

			memcpy(assume_misaligned(dst_row + x, tile_width, x),
			       assume_misaligned(src_row + x, tile_width, x),
			       len);

			dst_row += tile_size;
			src_row += tile_size;
			w -= len;
		}

		while (w >= tile_width) {
			memcpy(assume_aligned(dst_row, tile_width),
			       assume_aligned(src_row, tile_width),
			       tile_width);
			dst_row += tile_size;
			src_row += tile_size;
			w -= tile_width;
		}
		memcpy(assume_aligned(dst_row, tile_width),
		       assume_aligned(src_row, tile_width),
		       w);
	}
}

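/*
 * Swizzled copiers are generated from the macros below. Bit-6 swizzling
 * operates at 64-byte granularity: within any 64-byte chunk the tiled
 * address is linear, so each row is processed in 64-byte runs and the
 * swizzle() hash is applied to the chunk's byte offset to find where
 * the hardware actually placed it.
 */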
#define memcpy_to_tiled_x(swizzle)					\
fast_memcpy static void							\
memcpy_to_tiled_x__##swizzle (const void *src, void *dst, int bpp,	\
			      int32_t src_stride, int32_t dst_stride,	\
			      int16_t src_x, int16_t src_y,		\
			      int16_t dst_x, int16_t dst_y,		\
			      uint16_t width, uint16_t height)		\
{									\
	const unsigned tile_width = 512;				\
	const unsigned tile_height = 8;					\
	const unsigned tile_size = 4096;				\
	const unsigned cpp = bpp / 8;					\
	const unsigned stride_tiles = dst_stride / tile_width;		\
	const unsigned swizzle_pixels = 64 / cpp;			\
	const unsigned tile_pixels = ffs(tile_width / cpp) - 1;	\
	const unsigned tile_mask = (1 << tile_pixels) - 1;		\
	unsigned x, y;							\
	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", \
	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); \
	src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;	\
	for (y = 0; y < height; ++y) {					\
		const uint32_t dy = y + dst_y;				\
		const uint32_t tile_row =				\
			(dy / tile_height * stride_tiles * tile_size +	\
			 (dy & (tile_height-1)) * tile_width);		\
		const uint8_t *src_row = (const uint8_t *)src + src_stride * y; \
		uint32_t dx = dst_x;					\
		x = width * cpp;					\
		if (dx & (swizzle_pixels - 1)) {			\
			const uint32_t swizzle_bound_pixels = ALIGN(dx + 1, swizzle_pixels); \
			const uint32_t length = min(dst_x + width, swizzle_bound_pixels) - dx; \
			uint32_t offset =				\
				tile_row +				\
				(dx >> tile_pixels) * tile_size +	\
				(dx & tile_mask) * cpp;			\
			memcpy((char *)dst + swizzle(offset), src_row, length * cpp); \
			src_row += length * cpp;			\
			x -= length * cpp;				\
			dx += length;					\
		}							\
		while (x >= 64) {					\
			uint32_t offset =				\
				tile_row +				\
				(dx >> tile_pixels) * tile_size +	\
				(dx & tile_mask) * cpp;			\
			memcpy(assume_aligned((char *)dst+swizzle(offset),64), \
			       src_row, 64);				\
			src_row += 64;					\
			x -= 64;					\
			dx += swizzle_pixels;				\
		}							\
		if (x) {						\
			uint32_t offset =				\
				tile_row +				\
				(dx >> tile_pixels) * tile_size +	\
				(dx & tile_mask) * cpp;			\
			memcpy(assume_aligned((char *)dst + swizzle(offset), 64), src_row, x); \
		}							\
	}								\
}

#define memcpy_from_tiled_x(swizzle)					\
fast_memcpy static void							\
memcpy_from_tiled_x__##swizzle (const void *src, void *dst, int bpp,	\
				int32_t src_stride, int32_t dst_stride,	\
				int16_t src_x, int16_t src_y,		\
				int16_t dst_x, int16_t dst_y,		\
				uint16_t width, uint16_t height)	\
{									\
	const unsigned tile_width = 512;				\
	const unsigned tile_height = 8;					\
	const unsigned tile_size = 4096;				\
	const unsigned cpp = bpp / 8;					\
	const unsigned stride_tiles = src_stride / tile_width;		\
	const unsigned swizzle_pixels = 64 / cpp;			\
	const unsigned tile_pixels = ffs(tile_width / cpp) - 1;	\
	const unsigned tile_mask = (1 << tile_pixels) - 1;		\
	unsigned x, y;							\
	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n", \
	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride)); \
	dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp;	\
	for (y = 0; y < height; ++y) {					\
		const uint32_t sy = y + src_y;				\
		const uint32_t tile_row =				\
			(sy / tile_height * stride_tiles * tile_size +	\
			 (sy & (tile_height-1)) * tile_width);		\
		uint8_t *dst_row = (uint8_t *)dst + dst_stride * y;	\
		uint32_t sx = src_x;					\
		x = width * cpp;					\
		if (sx & (swizzle_pixels - 1)) {			\
			const uint32_t swizzle_bound_pixels = ALIGN(sx + 1, swizzle_pixels); \
			const uint32_t length = min(src_x + width, swizzle_bound_pixels) - sx; \
			uint32_t offset =				\
				tile_row +				\
				(sx >> tile_pixels) * tile_size +	\
				(sx & tile_mask) * cpp;			\
			memcpy(dst_row, (const char *)src + swizzle(offset), length * cpp); \
			dst_row += length * cpp;			\
			x -= length * cpp;				\
			sx += length;					\
		}							\
		while (x >= 64) {					\
			uint32_t offset =				\
				tile_row +				\
				(sx >> tile_pixels) * tile_size +	\
				(sx & tile_mask) * cpp;			\
			memcpy(dst_row, assume_aligned((const char *)src + swizzle(offset), 64), 64); \
			dst_row += 64;					\
			x -= 64;					\
			sx += swizzle_pixels;				\
		}							\
		if (x) {						\
			uint32_t offset =				\
				tile_row +				\
				(sx >> tile_pixels) * tile_size +	\
				(sx & tile_mask) * cpp;			\
			memcpy(dst_row, assume_aligned((const char *)src + swizzle(offset), 64), x); \
		}							\
	}								\
}

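/*
 * The swizzle hashes mirror the GTT's bit-6 address swizzling (the
 * I915_BIT_6_SWIZZLE_* modes): bit 6 of each byte address is XORed with
 * bit 9, and optionally bits 10 and 11, depending on how the memory
 * controller interleaves channels. Worked example for swizzle_9:
 * 0x200 ^ ((0x200 >> 3) & 64) = 0x200 ^ 0x40 = 0x240, i.e. an address
 * with bit 9 set has bit 6 flipped.
 */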
#define swizzle_9(X) ((X) ^ (((X) >> 3) & 64))
memcpy_to_tiled_x(swizzle_9)
memcpy_from_tiled_x(swizzle_9)
#undef swizzle_9

#define swizzle_9_10(X) ((X) ^ ((((X) ^ ((X) >> 1)) >> 3) & 64))
memcpy_to_tiled_x(swizzle_9_10)
memcpy_from_tiled_x(swizzle_9_10)
#undef swizzle_9_10

#define swizzle_9_11(X) ((X) ^ ((((X) ^ ((X) >> 2)) >> 3) & 64))
memcpy_to_tiled_x(swizzle_9_11)
memcpy_from_tiled_x(swizzle_9_11)
#undef swizzle_9_11

#define swizzle_9_10_11(X) ((X) ^ ((((X) ^ ((X) >> 1) ^ ((X) >> 2)) >> 3) & 64))
memcpy_to_tiled_x(swizzle_9_10_11)
memcpy_from_tiled_x(swizzle_9_10_11)
#undef swizzle_9_10_11

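/*
 * Gen2 X-tiles are smaller: 128 bytes wide by 16 rows, 2048 bytes per
 * tile, and never swizzled, so only the tile geometry differs from the
 * swizzle_0 paths above.
 */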
static fast_memcpy void
memcpy_to_tiled_x__gen2(const void *src, void *dst, int bpp,
			int32_t src_stride, int32_t dst_stride,
			int16_t src_x, int16_t src_y,
			int16_t dst_x, int16_t dst_y,
			uint16_t width, uint16_t height)
{
	const unsigned tile_width = 128;
	const unsigned tile_height = 16;
	const unsigned tile_size = 2048;

	const unsigned cpp = bpp / 8;
	const unsigned tile_pixels = tile_width / cpp;
	const unsigned tile_shift = ffs(tile_pixels) - 1;
	const unsigned tile_mask = tile_pixels - 1;

	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
	assert(src != dst);

	if (src_x | src_y)
		src = (const uint8_t *)src + src_y * src_stride + src_x * cpp;
	assert(src_stride >= width * cpp);
	src_stride -= width * cpp;

	while (height--) {
		unsigned w = width * cpp;
		uint8_t *tile_row = dst;

		tile_row += dst_y / tile_height * dst_stride * tile_height;
		tile_row += (dst_y & (tile_height-1)) * tile_width;
		if (dst_x) {
			tile_row += (dst_x >> tile_shift) * tile_size;
			if (dst_x & tile_mask) {
				const unsigned x = (dst_x & tile_mask) * cpp;
				const unsigned len = min(tile_width - x, w);
				memcpy(assume_misaligned(tile_row + x, tile_width, x), src, len);

				tile_row += tile_size;
				src = (const uint8_t *)src + len;
				w -= len;
			}
		}
		while (w >= tile_width) {
			memcpy(assume_aligned(tile_row, tile_width),
			       src, tile_width);

			tile_row += tile_size;
			src = (const uint8_t *)src + tile_width;
			w -= tile_width;
		}
		memcpy(assume_aligned(tile_row, tile_width), src, w);
		src = (const uint8_t *)src + src_stride + w;
		dst_y++;
	}
}

static fast_memcpy void
memcpy_from_tiled_x__gen2(const void *src, void *dst, int bpp,
			  int32_t src_stride, int32_t dst_stride,
			  int16_t src_x, int16_t src_y,
			  int16_t dst_x, int16_t dst_y,
			  uint16_t width, uint16_t height)
{
	const unsigned tile_width = 128;
	const unsigned tile_height = 16;
	const unsigned tile_size = 2048;

	const unsigned cpp = bpp / 8;
	const unsigned tile_pixels = tile_width / cpp;
	const unsigned tile_shift = ffs(tile_pixels) - 1;
	const unsigned tile_mask = tile_pixels - 1;

	DBG(("%s(bpp=%d): src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d\n",
	     __FUNCTION__, bpp, src_x, src_y, dst_x, dst_y, width, height, src_stride, dst_stride));
	assert(src != dst);

	if (dst_x | dst_y)
		dst = (uint8_t *)dst + dst_y * dst_stride + dst_x * cpp;
	assert(dst_stride >= width * cpp);
	dst_stride -= width * cpp;

	while (height--) {
		unsigned w = width * cpp;
		const uint8_t *tile_row = src;

		tile_row += src_y / tile_height * src_stride * tile_height;
		tile_row += (src_y & (tile_height-1)) * tile_width;
		if (src_x) {
			tile_row += (src_x >> tile_shift) * tile_size;
			if (src_x & tile_mask) {
				const unsigned x = (src_x & tile_mask) * cpp;
				const unsigned len = min(tile_width - x, w);
				memcpy(dst, assume_misaligned(tile_row + x, tile_width, x), len);

				tile_row += tile_size;
				dst = (uint8_t *)dst + len;
				w -= len;
			}
		}
		while (w >= tile_width) {
			memcpy(dst,
			       assume_aligned(tile_row, tile_width),
			       tile_width);

			tile_row += tile_size;
			dst = (uint8_t *)dst + tile_width;
			w -= tile_width;
		}
		memcpy(dst, assume_aligned(tile_row, tile_width), w);
		dst = (uint8_t *)dst + dst_stride + w;
		src_y++;
	}
}

void choose_memcpy_tiled_x(struct kgem *kgem, int swizzling, unsigned cpu)
{
	if (kgem->gen < 030) {
		if (swizzling == I915_BIT_6_SWIZZLE_NONE) {
			DBG(("%s: gen2, no swizzling\n", __FUNCTION__));
			kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__gen2;
			kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__gen2;
		} else
			DBG(("%s: no detiling with swizzle functions for gen2\n", __FUNCTION__));
		return;
	}

	switch (swizzling) {
	default:
		DBG(("%s: unknown swizzling, %d\n", __FUNCTION__, swizzling));
		break;
	case I915_BIT_6_SWIZZLE_NONE:
		DBG(("%s: no swizzling\n", __FUNCTION__));
#if defined(sse2)
		if (cpu & SSE2) {
			kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_0__sse2;
			kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_0__sse2;
			kgem->memcpy_between_tiled_x = memcpy_between_tiled_x__swizzle_0__sse2;
		} else
#endif
		{
			kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_0;
			kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_0;
			kgem->memcpy_between_tiled_x = memcpy_between_tiled_x__swizzle_0;
		}
		break;
	case I915_BIT_6_SWIZZLE_9:
		DBG(("%s: 6^9 swizzling\n", __FUNCTION__));
		kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9;
		kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_9;
		break;
	case I915_BIT_6_SWIZZLE_9_10:
		DBG(("%s: 6^9^10 swizzling\n", __FUNCTION__));
		kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9_10;
		kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_9_10;
		break;
	case I915_BIT_6_SWIZZLE_9_11:
		DBG(("%s: 6^9^11 swizzling\n", __FUNCTION__));
		kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9_11;
		kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_9_11;
		break;
	case I915_BIT_6_SWIZZLE_9_10_11:
		DBG(("%s: 6^9^10^11 swizzling\n", __FUNCTION__));
		kgem->memcpy_to_tiled_x = memcpy_to_tiled_x__swizzle_9_10_11;
		kgem->memcpy_from_tiled_x = memcpy_from_tiled_x__swizzle_9_10_11;
		break;
	}
}

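/*
 * Copy a box between two (possibly overlapping) mappings of the same
 * pixels: rows are walked top-to-bottom when dy >= 0 and bottom-to-top
 * otherwise, so rows that have yet to be read are not overwritten, and
 * memmove handles any remaining horizontal overlap within a row.
 */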
void
memmove_box(const void *src, void *dst,
	    int bpp, int32_t stride,
	    const BoxRec *box,
	    int dx, int dy)
{
#define FORCE_MEMMOVE 0
	union {
		uint8_t u8;
		uint16_t u16;
		uint32_t u32;
		uint64_t u64;
	} tmp;
	const uint8_t *src_bytes;
	uint8_t *dst_bytes;
	int width, height;

	assert(src);
	assert(dst);
	assert(src != dst);
	assert(bpp >= 8);
	assert(box->x2 > box->x1);
	assert(box->y2 > box->y1);

	DBG(("%s: box=(%d, %d), (%d, %d), pitch=%d, bpp=%d, dx=%d, dy=%d\n",
	     __FUNCTION__,
	     box->x1, box->y1, box->x2, box->y2,
	     stride, bpp, dx, dy));

	bpp /= 8;
	width = box->y1 * stride + box->x1 * bpp;
	src_bytes = (const uint8_t *)src + width;
	dst_bytes = (uint8_t *)dst + width;
	assert(dst_bytes != src_bytes);

	width = (box->x2 - box->x1) * bpp;
	height = (box->y2 - box->y1);
	assert(width <= stride);
	if (width == stride) {
		width *= height;
		height = 1;
	}

	if (dy >= 0) {
		switch (width) {
		case 1:
			do {
				*dst_bytes = tmp.u8 = *src_bytes;
				src_bytes += stride;
				dst_bytes += stride;
			} while (--height);
			break;

		case 2:
			do {
				*(uint16_t *)dst_bytes = tmp.u16 = *(const uint16_t *)src_bytes;
				src_bytes += stride;
				dst_bytes += stride;
			} while (--height);
			break;

		case 4:
			do {
				*(uint32_t *)dst_bytes = tmp.u32 = *(const uint32_t *)src_bytes;
				src_bytes += stride;
				dst_bytes += stride;
			} while (--height);
			break;

		case 8:
			do {
				*(uint64_t *)dst_bytes = tmp.u64 = *(const uint64_t *)src_bytes;
				src_bytes += stride;
				dst_bytes += stride;
			} while (--height);
			break;

		default:
			if (FORCE_MEMMOVE ||
			    (dst_bytes < src_bytes + width &&
			     src_bytes < dst_bytes + width)) {
				do {
					memmove(dst_bytes, src_bytes, width);
					src_bytes += stride;
					dst_bytes += stride;
				} while (--height);
			} else {
				do {
					memcpy(dst_bytes, src_bytes, width);
					src_bytes += stride;
					dst_bytes += stride;
				} while (--height);
			}
			break;
		}
	} else {
		src_bytes += (height-1) * stride;
		dst_bytes += (height-1) * stride;

		switch (width) {
		case 1:
			do {
				*dst_bytes = tmp.u8 = *src_bytes;
				src_bytes -= stride;
				dst_bytes -= stride;
			} while (--height);
			break;

		case 2:
			do {
				*(uint16_t *)dst_bytes = tmp.u16 = *(const uint16_t *)src_bytes;
				src_bytes -= stride;
				dst_bytes -= stride;
			} while (--height);
			break;

		case 4:
			do {
				*(uint32_t *)dst_bytes = tmp.u32 = *(const uint32_t *)src_bytes;
				src_bytes -= stride;
				dst_bytes -= stride;
			} while (--height);
			break;

		case 8:
			do {
				*(uint64_t *)dst_bytes = tmp.u64 = *(const uint64_t *)src_bytes;
				src_bytes -= stride;
				dst_bytes -= stride;
			} while (--height);
			break;

		default:
			if (FORCE_MEMMOVE ||
			    (dst_bytes < src_bytes + width &&
			     src_bytes < dst_bytes + width)) {
				do {
					memmove(dst_bytes, src_bytes, width);
					src_bytes -= stride;
					dst_bytes -= stride;
				} while (--height);
			} else {
				do {
					memcpy(dst_bytes, src_bytes, width);
					src_bytes -= stride;
					dst_bytes -= stride;
				} while (--height);
			}
			break;
		}
	}
}

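/*
 * Copy while applying (pixel & and) | or per pixel. The and == 0xffffffff
 * case is a pure OR and gets widened fast paths: 8bpp/16bpp rows of even
 * width are retried at double the element size with a replicated OR mask
 * (the case fall-throughs below), and the 32bpp loop is vectorised with
 * SSE2 when available.
 */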
void
memcpy_xor(const void *src, void *dst, int bpp,
	   int32_t src_stride, int32_t dst_stride,
	   int16_t src_x, int16_t src_y,
	   int16_t dst_x, int16_t dst_y,
	   uint16_t width, uint16_t height,
	   uint32_t and, uint32_t or)
{
	const uint8_t *src_bytes;
	uint8_t *dst_bytes;
	int i, w;

	assert(width && height);
	assert(bpp >= 8);
	assert(width*bpp <= 8*src_stride);
	assert(width*bpp <= 8*dst_stride);

	DBG(("%s: src=(%d, %d), dst=(%d, %d), size=%dx%d, pitch=%d/%d, bpp=%d, and=%x, xor=%x\n",
	     __FUNCTION__,
	     src_x, src_y, dst_x, dst_y,
	     width, height,
	     src_stride, dst_stride,
	     bpp, and, or));

	bpp /= 8;
	src_bytes = (const uint8_t *)src + src_stride * src_y + src_x * bpp;
	dst_bytes = (uint8_t *)dst + dst_stride * dst_y + dst_x * bpp;

	if (and == 0xffffffff) {
		switch (bpp) {
		case 1:
			if (width & 1) {
				do {
					for (i = 0; i < width; i++)
						dst_bytes[i] = src_bytes[i] | or;

					src_bytes += src_stride;
					dst_bytes += dst_stride;
				} while (--height);
				break;
			} else {
				width /= 2;
				or |= or << 8;
			}
			/* fall through */
		case 2:
			if (width & 1) {
				do {
					uint16_t *d = (uint16_t *)dst_bytes;
					const uint16_t *s = (const uint16_t *)src_bytes;

					for (i = 0; i < width; i++)
						d[i] = s[i] | or;

					src_bytes += src_stride;
					dst_bytes += dst_stride;
				} while (--height);
				break;
			} else {
				width /= 2;
				or |= or << 16;
			}
			/* fall through */
		case 4:
			w = width;
			if (w * 4 == dst_stride && dst_stride == src_stride) {
				w *= height;
				height = 1;
			}

#if defined(sse2) && __x86_64__
			if (have_sse2()) {
				do {
					uint32_t *d = (uint32_t *)dst_bytes;
					const uint32_t *s = (const uint32_t *)src_bytes;
					__m128i mask = xmm_create_mask_32(or);

					i = w;
					while (i && (uintptr_t)d & 15) {
						*d++ = *s++ | or;
						i--;
					}

					while (i >= 16) {
						__m128i xmm1, xmm2, xmm3, xmm4;

						xmm1 = xmm_load_128u((const __m128i*)s + 0);
						xmm2 = xmm_load_128u((const __m128i*)s + 1);
						xmm3 = xmm_load_128u((const __m128i*)s + 2);
						xmm4 = xmm_load_128u((const __m128i*)s + 3);

						xmm_save_128((__m128i*)d + 0,
							     _mm_or_si128(xmm1, mask));
						xmm_save_128((__m128i*)d + 1,
							     _mm_or_si128(xmm2, mask));
						xmm_save_128((__m128i*)d + 2,
							     _mm_or_si128(xmm3, mask));
						xmm_save_128((__m128i*)d + 3,
							     _mm_or_si128(xmm4, mask));

						d += 16;
						s += 16;
						i -= 16;
					}

					if (i & 8) {
						__m128i xmm1, xmm2;

						xmm1 = xmm_load_128u((const __m128i*)s + 0);
						xmm2 = xmm_load_128u((const __m128i*)s + 1);

						xmm_save_128((__m128i*)d + 0,
							     _mm_or_si128(xmm1, mask));
						xmm_save_128((__m128i*)d + 1,
							     _mm_or_si128(xmm2, mask));
						d += 8;
						s += 8;
						i -= 8;
					}

					if (i & 4) {
						xmm_save_128((__m128i*)d,
							     _mm_or_si128(xmm_load_128u((const __m128i*)s),
									  mask));

						d += 4;
						s += 4;
						i -= 4;
					}

					while (i) {
						*d++ = *s++ | or;
						i--;
					}

					src_bytes += src_stride;
					dst_bytes += dst_stride;
				} while (--height);
			} else
#endif
			{
				do {
					uint32_t *d = (uint32_t *)dst_bytes;
					const uint32_t *s = (const uint32_t *)src_bytes;

					for (i = 0; i < w; i++)
						d[i] = s[i] | or;

					src_bytes += src_stride;
					dst_bytes += dst_stride;
				} while (--height);
			}
			break;
		}
	} else {
		switch (bpp) {
		case 1:
			do {
				for (i = 0; i < width; i++)
					dst_bytes[i] = (src_bytes[i] & and) | or;

				src_bytes += src_stride;
				dst_bytes += dst_stride;
			} while (--height);
			break;

		case 2:
			do {
				uint16_t *d = (uint16_t *)dst_bytes;
				const uint16_t *s = (const uint16_t *)src_bytes;

				for (i = 0; i < width; i++)
					d[i] = (s[i] & and) | or;

				src_bytes += src_stride;
				dst_bytes += dst_stride;
			} while (--height);
			break;

		case 4:
			do {
				uint32_t *d = (uint32_t *)dst_bytes;
				const uint32_t *s = (const uint32_t *)src_bytes;

				for (i = 0; i < width; i++)
					d[i] = (s[i] & and) | or;

				src_bytes += src_stride;
				dst_bytes += dst_stride;
			} while (--height);
			break;
		}
	}
}

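/*
 * Bilinear filtering with reduced-precision weights: the fractional
 * sample position is quantised to BILINEAR_INTERPOLATION_BITS (4 bits,
 * i.e. 16 steps), which keeps the per-channel products small enough to
 * blend two colour channels per 32-bit multiply without overflow.
 */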
#define BILINEAR_INTERPOLATION_BITS 4
static inline int
bilinear_weight(pixman_fixed_t x)
{
	return (x >> (16 - BILINEAR_INTERPOLATION_BITS)) &
		((1 << BILINEAR_INTERPOLATION_BITS) - 1);
}

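/*
 * Each variant below blends the four texels tl/tr/bl/br with weights
 * that sum to 16*16 (or 256*256). Masking with 0xff00ff packs two
 * alternating channels into one register so both are multiplied at
 * once; the quantised weights are small enough that neighbouring
 * channels cannot carry into each other.
 */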
#if BILINEAR_INTERPOLATION_BITS <= 4
/* Inspired by Filter_32_opaque from Skia */
static inline uint32_t
bilinear_interpolation(uint32_t tl, uint32_t tr,
		       uint32_t bl, uint32_t br,
		       int distx, int disty)
{
	int distxy, distxiy, distixy, distixiy;
	uint32_t lo, hi;

	distx <<= (4 - BILINEAR_INTERPOLATION_BITS);
	disty <<= (4 - BILINEAR_INTERPOLATION_BITS);

	distxy = distx * disty;
	distxiy = (distx << 4) - distxy;	/* distx * (16 - disty) */
	distixy = (disty << 4) - distxy;	/* disty * (16 - distx) */
	distixiy =
		16 * 16 - (disty << 4) -
		(distx << 4) + distxy;		/* (16 - distx) * (16 - disty) */

	lo = (tl & 0xff00ff) * distixiy;
	hi = ((tl >> 8) & 0xff00ff) * distixiy;

	lo += (tr & 0xff00ff) * distxiy;
	hi += ((tr >> 8) & 0xff00ff) * distxiy;

	lo += (bl & 0xff00ff) * distixy;
	hi += ((bl >> 8) & 0xff00ff) * distixy;

	lo += (br & 0xff00ff) * distxy;
	hi += ((br >> 8) & 0xff00ff) * distxy;

	return ((lo >> 8) & 0xff00ff) | (hi & ~0xff00ff);
}
#elif SIZEOF_LONG > 4
static inline uint32_t
bilinear_interpolation(uint32_t tl, uint32_t tr,
		       uint32_t bl, uint32_t br,
		       int distx, int disty)
{
	uint64_t distxy, distxiy, distixy, distixiy;
	uint64_t tl64, tr64, bl64, br64;
	uint64_t f, r;

	distx <<= (8 - BILINEAR_INTERPOLATION_BITS);
	disty <<= (8 - BILINEAR_INTERPOLATION_BITS);

	distxy = distx * disty;
	distxiy = distx * (256 - disty);
	distixy = (256 - distx) * disty;
	distixiy = (256 - distx) * (256 - disty);

	/* Alpha and Blue */
	tl64 = tl & 0xff0000ff;
	tr64 = tr & 0xff0000ff;
	bl64 = bl & 0xff0000ff;
	br64 = br & 0xff0000ff;

	f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
	r = f & 0x0000ff0000ff0000ull;

	/* Red and Green */
	tl64 = tl;
	tl64 = ((tl64 << 16) & 0x000000ff00000000ull) | (tl64 & 0x0000ff00ull);

	tr64 = tr;
	tr64 = ((tr64 << 16) & 0x000000ff00000000ull) | (tr64 & 0x0000ff00ull);

	bl64 = bl;
	bl64 = ((bl64 << 16) & 0x000000ff00000000ull) | (bl64 & 0x0000ff00ull);

	br64 = br;
	br64 = ((br64 << 16) & 0x000000ff00000000ull) | (br64 & 0x0000ff00ull);

	f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
	r |= ((f >> 16) & 0x000000ff00000000ull) | (f & 0xff000000ull);

	return (uint32_t)(r >> 16);
}
#else
static inline uint32_t
bilinear_interpolation(uint32_t tl, uint32_t tr,
		       uint32_t bl, uint32_t br,
		       int distx, int disty)
{
	int distxy, distxiy, distixy, distixiy;
	uint32_t f, r;

	distx <<= (8 - BILINEAR_INTERPOLATION_BITS);
	disty <<= (8 - BILINEAR_INTERPOLATION_BITS);

	distxy = distx * disty;
	distxiy = (distx << 8) - distxy;	/* distx * (256 - disty) */
	distixy = (disty << 8) - distxy;	/* disty * (256 - distx) */
	distixiy =
		256 * 256 - (disty << 8) -
		(distx << 8) + distxy;		/* (256 - distx) * (256 - disty) */

	/* Blue */
	r = ((tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy +
	     (bl & 0x000000ff) * distixy + (br & 0x000000ff) * distxy);

	/* Green */
	f = ((tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy +
	     (bl & 0x0000ff00) * distixy + (br & 0x0000ff00) * distxy);
	r |= f & 0xff000000;

	tl >>= 16;
	tr >>= 16;
	bl >>= 16;
	br >>= 16;
	r >>= 16;

	/* Red */
	f = ((tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy +
	     (bl & 0x000000ff) * distixy + (br & 0x000000ff) * distxy);
	r |= f & 0x00ff0000;

	/* Alpha */
	f = ((tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy +
	     (bl & 0x0000ff00) * distixy + (br & 0x0000ff00) * distxy);
	r |= f & 0xff000000;

	return r;
}
#endif

static inline uint32_t convert_pixel(const uint8_t *p, int x)
{
	return ((uint32_t *)p)[x];
}

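/*
 * Affine blit with bilinear filtering, 32bpp only: for each destination
 * row the transform is evaluated once at the row's centre, then stepped
 * across the row in 16.16 fixed point (ux/uy per destination pixel).
 * Samples falling outside the source are substituted with transparent
 * black via the zero[] row and the per-edge guards below.
 */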
fast void
affine_blt(const void *src, void *dst, int bpp,
	   int16_t src_x, int16_t src_y,
	   int16_t src_width, int16_t src_height,
	   int32_t src_stride,
	   int16_t dst_x, int16_t dst_y,
	   uint16_t dst_width, uint16_t dst_height,
	   int32_t dst_stride,
	   const struct pixman_f_transform *t)
{
	static const uint8_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
	const pixman_fixed_t ux = pixman_double_to_fixed(t->m[0][0]);
	const pixman_fixed_t uy = pixman_double_to_fixed(t->m[1][0]);
	int i, j;

	assert(bpp == 32);

	for (j = 0; j < dst_height; j++) {
		pixman_fixed_t x, y;
		struct pixman_f_vector v;
		uint32_t *b;

		/* reference point is the center of the pixel */
		v.v[0] = dst_x + 0.5;
		v.v[1] = dst_y + j + 0.5;
		v.v[2] = 1.0;

		pixman_f_transform_point_3d(t, &v);

		x = pixman_double_to_fixed(v.v[0]);
		x += pixman_int_to_fixed(src_x - dst_x);
		y = pixman_double_to_fixed(v.v[1]);
		y += pixman_int_to_fixed(src_y - dst_y);

		b = (uint32_t*)((uint8_t *)dst + (dst_y + j) * dst_stride + dst_x * bpp / 8);
		for (i = 0; i < dst_width; i++) {
			const uint8_t *row1;
			const uint8_t *row2;
			int x1, y1, x2, y2;
			uint32_t tl, tr, bl, br;
			int32_t fx, fy;

			x1 = x - pixman_fixed_1/2;
			y1 = y - pixman_fixed_1/2;

			fx = bilinear_weight(x1);
			fy = bilinear_weight(y1);

			x1 = pixman_fixed_to_int(x1);
			x2 = x1 + 1;
			y1 = pixman_fixed_to_int(y1);
			y2 = y1 + 1;

			if (x1 >= src_width || x2 < 0 ||
			    y1 >= src_height || y2 < 0) {
				b[i] = 0;
				goto next;
			}

			if (y2 == 0) {
				row1 = zero;
			} else {
				row1 = (const uint8_t *)src + src_stride * y1;
				row1 += bpp / 8 * x1;
			}

			if (y1 == src_height - 1) {
				row2 = zero;
			} else {
				row2 = (const uint8_t *)src + src_stride * y2;
				row2 += bpp / 8 * x1;
			}

			if (x2 == 0) {
				tl = 0;
				bl = 0;
			} else {
				tl = convert_pixel(row1, 0);
				bl = convert_pixel(row2, 0);
			}

			if (x1 == src_width - 1) {
				tr = 0;
				br = 0;
			} else {
				tr = convert_pixel(row1, 1);
				br = convert_pixel(row2, 1);
			}

			b[i] = bilinear_interpolation(tl, tr, bl, br, fx, fy);

next:
			x += ux;
			y += uy;
		}
	}
}