/*
 * (C) Copyright IBM Corporation 2004
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file read_rgba_span_x86.S
 * Optimized routines to transfer pixel data from the framebuffer to a
 * buffer in main memory.
 *
 * \author Ian Romanick <idr@us.ibm.com>
 */
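
/* For reference, each BGRA8888_REV routine below behaves roughly like the
 * following C loop (a sketch inferred from the per-pixel conversion in
 * DO_ONE_PIXEL below; the identifier read_span is illustrative, not the
 * real prototype):
 *
 *    void read_span(const unsigned char *src, unsigned char *dest,
 *                   unsigned num_pixels)
 *    {
 *       unsigned i;
 *       for (i = 0; i < num_pixels; i++) {
 *          dest[i * 4 + 0] = src[i * 4 + 2];    (red)
 *          dest[i * 4 + 1] = src[i * 4 + 1];    (green)
 *          dest[i * 4 + 2] = src[i * 4 + 0];    (blue)
 *          dest[i * 4 + 3] = src[i * 4 + 3];    (alpha)
 *       }
 *    }
 *
 * The assembly versions perform the same R/B byte swap 1, 2, or 4 pixels
 * at a time.
 */
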
/* Control flow enforcement support */
#ifdef HAVE_CET_H
#include <cet.h>
#else
#define _CET_ENDBR
#endif

	.file	"read_rgba_span_x86.S"
#if !defined(__MINGW32__) && !defined(__APPLE__) /* this one cries for assyntax.h */
/* Kevin F. Quinn 2nd July 2006
 * Replaced data segment constants with text-segment instructions.
 */
#define	LOAD_MASK(mvins,m1,m2) \
	pushl	$0xff00ff00 ;\
	pushl	$0xff00ff00 ;\
	pushl	$0xff00ff00 ;\
	pushl	$0xff00ff00 ;\
	mvins	(%esp), m1	;\
	pushl	$0x00ff0000 ;\
	pushl	$0x00ff0000 ;\
	pushl	$0x00ff0000 ;\
	pushl	$0x00ff0000 ;\
	mvins	(%esp), m2	;\
	addl	$32, %esp
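
/* After LOAD_MASK, m1 holds 0xff00ff00 replicated across the register and
 * m2 holds 0x00ff0000 replicated.  In each 32-bit BGRA8888_REV pixel, m1
 * selects the alpha and green bytes (which keep their positions), while m2
 * isolates the red byte; the same m2 mask is reused after a 16-bit shift to
 * drop the blue byte into the red slot, completing the R/B swap.
 */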

/* I implemented these as macros because they appear in several places,
 * and I've tweaked them a number of times.  I got tired of changing every
 * place they appear. :)
 */

#define DO_ONE_PIXEL() \
	movl	(%ebx), %eax ; \
	addl	$4, %ebx ; \
	bswap	%eax          /* ARGB -> BGRA */ ; \
	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \
	addl	$4, %ecx

#define DO_ONE_LAST_PIXEL() \
	movl	(%ebx), %eax ; \
	bswap	%eax          /* ARGB -> BGRA */ ; \
	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */ ;
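
/* Worked example of the scalar conversion above: a pixel whose bytes in
 * memory are B,G,R,A loads (little endian) as 0xAARRGGBB.  BSWAP gives
 * 0xBBGGRRAA, and RORL $8 gives 0xAABBGGRR, which stores back as the bytes
 * R,G,B,A, exactly the RGBA ordering the caller wants.
 */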


/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
	.type	_generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
	_CET_ENDBR
	pushl	%ebx

#ifdef USE_INNER_EMMS
	emms
#endif
	LOAD_MASK(movq,%mm1,%mm2)

	movl	8(%esp), %ebx	/* source pointer */
	movl	16(%esp), %edx	/* number of pixels to copy */
	movl	12(%esp), %ecx	/* destination pointer */

	testl	%edx, %edx
	jle	.L20		/* Bail if there's nothing to do. */

	movl	%ebx, %eax

	negl	%eax
	sarl	$2, %eax
	andl	$1, %eax
	je	.L17

	subl	%eax, %edx
	DO_ONE_PIXEL()
.L17:

	/* Would it be faster to unroll this loop once and process 4 pixels
	 * per pass, instead of just two?
	 */

	movl	%edx, %eax
	shrl	%eax
	jmp	.L18
.L19:
	movq	(%ebx), %mm0
	addl	$8, %ebx

	/* These 9 instructions do what PSHUFB (if there were such an
	 * instruction) could do in 1. :(
	 */
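
	/* Per 32-bit pixel, the shuffle below computes, in effect:
	 *
	 *    out = (pix & 0xff00ff00)            alpha and green stay put
	 *        | ((pix & 0x00ff0000) >> 16)    red moves to the blue slot
	 *        | ((pix << 16) & 0x00ff0000)    blue moves to the red slot
	 *
	 * %mm1 and %mm2 hold the two masks for both pixels of the quadword.
	 */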

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
	subl	$1, %eax
.L18:
	jne	.L19

#ifdef USE_INNER_EMMS
	emms
#endif

	/* At this point there are either 1 or 0 pixels remaining to be
	 * converted.  Convert the last pixel, if needed.
	 */

	testl	$1, %edx
	je	.L20

	DO_ONE_LAST_PIXEL()

.L20:
	popl	%ebx
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX


/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
	_CET_ENDBR
	pushl	%esi
	pushl	%ebx
	pushl	%ebp

#ifdef USE_INNER_EMMS
	emms
#endif

	LOAD_MASK(movq,%mm1,%mm2)

	movl	16(%esp), %ebx	/* source pointer */
	movl	24(%esp), %edx	/* number of pixels to copy */
	movl	20(%esp), %ecx	/* destination pointer */

	testl	%edx, %edx
	jle	.L35		/* Bail if there's nothing to do. */

	movl	%esp, %ebp
	subl	$16, %esp
	andl	$0xfffffff0, %esp
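
	/* %esp is saved in %ebp and then rounded down to a 16-byte boundary
	 * so that the MOVAPS spill in the main loop (.L34 below) has an
	 * aligned scratch area to store through.
	 */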

	movl	%ebx, %eax
	movl	%edx, %esi

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax
	cmpl	%edx, %eax
	cmovle	%eax, %esi

	subl	%esi, %edx
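
	/* %esi now holds min(num_pixels, pixels needed to bring the source
	 * pointer up to a 16-byte boundary); those leading pixels are handled
	 * below with the scalar and MMX paths before the aligned SSE loop.
	 */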

	testl	$1, %esi
	je	.L32

	DO_ONE_PIXEL()
.L32:

	testl	$2, %esi
	je	.L31

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L31:

	movl	%edx, %eax
	shrl	$2, %eax
	jmp	.L33
.L34:
	movaps	(%ebx), %xmm0
	addl	$16, %ebx

	/* This would be so much better if we could just move directly from
	 * an SSE register to an MMX register.  Unfortunately, that
	 * functionality wasn't introduced until SSE2 with the MOVDQ2Q
	 * instruction.
	 */

	movaps	%xmm0, (%esp)
	movq	(%esp), %mm0
	movq	8(%esp), %mm5

	movq	%mm0, %mm3
	movq	%mm0, %mm4
	movq	%mm5, %mm6
	movq	%mm5, %mm7

	pand	%mm2, %mm3
	pand	%mm2, %mm6

	psllq	$16, %mm4
	psllq	$16, %mm7

	psrlq	$16, %mm3
	psrlq	$16, %mm6

	pand	%mm2, %mm4
	pand	%mm2, %mm7

	pand	%mm1, %mm0
	pand	%mm1, %mm5

	por	%mm4, %mm3
	por	%mm7, %mm6

	por	%mm3, %mm0
	por	%mm6, %mm5

	movq	%mm0, (%ecx)
	movq	%mm5, 8(%ecx)
	addl	$16, %ecx

	subl	$1, %eax
.L33:
	jne	.L34

#ifdef USE_INNER_EMMS
	emms
#endif
	movl	%ebp, %esp

	/* At this point there are between 0 and 3 pixels remaining to be
	 * converted.
	 */

	testl	$2, %edx
	je	.L36

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L36:

	testl	$1, %edx
	je	.L35

	DO_ONE_LAST_PIXEL()
.L35:
	popl	%ebp
	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE


/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 */

	.text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
	_CET_ENDBR
	pushl	%esi
	pushl	%ebx

	LOAD_MASK(movdqu,%xmm1,%xmm2)

	movl	12(%esp), %ebx	/* source pointer */
	movl	20(%esp), %edx	/* number of pixels to copy */
	movl	16(%esp), %ecx	/* destination pointer */

	movl	%ebx, %eax
	movl	%edx, %esi

	testl	%edx, %edx
	jle	.L46		/* Bail if there's nothing to do. */

	/* If the source pointer isn't a multiple of 16 we have to process
	 * a few pixels the "slow" way to get the address aligned for
	 * the SSE fetch instructions.
	 */

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax

	cmpl	%edx, %eax
	cmovbe	%eax, %esi
	subl	%esi, %edx
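
	/* As in the SSE version, %esi is the number of leading pixels needed
	 * to align the source pointer to 16 bytes, clamped to the pixel
	 * count; %edx keeps the pixels left for the aligned MOVDQA loop.
	 */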

	testl	$1, %esi
	je	.L41

	DO_ONE_PIXEL()
.L41:
	testl	$2, %esi
	je	.L40

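	/* This two-pixel case mirrors the MMX shuffle above, but PSLLDQ and
	 * PSRLDQ shift by bytes rather than bits, so the $2 here corresponds
	 * to the $16-bit shifts used in the MMX routines.
	 */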
	movq	(%ebx), %xmm0
	addl	$8, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.L40:

	/* Would it be worth having a specialized version of this loop for
	 * the case where the destination is 16-byte aligned?  That version
	 * would be identical except that it could use movdqa instead of
	 * movdqu.
	 */

	movl	%edx, %eax
	shrl	$2, %eax
	jmp	.L42
.L43:
	movdqa	(%ebx), %xmm0
	addl	$16, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movdqu	%xmm0, (%ecx)
	addl	$16, %ecx
	subl	$1, %eax
.L42:
	jne	.L43


	/* There may be up to 3 pixels remaining to be copied.  Take care
	 * of them now.  We do the 2-pixel case first because the data
	 * will be aligned.
	 */

	testl	$2, %edx
	je	.L47

	movq	(%ebx), %xmm0
	addl	$8, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.L47:

	testl	$1, %edx
	je	.L46

	DO_ONE_LAST_PIXEL()
.L46:

	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2



#define MASK_565_L	0x07e0f800
#define MASK_565_H	0x0000001f
/* Setting SCALE_ADJUST to 5 gives a perfect match with the
 * classic C implementation in Mesa.  Setting SCALE_ADJUST
 * to 0 is slightly faster but at a small cost to accuracy.
 */
#define SCALE_ADJUST	5
#if SCALE_ADJUST == 5
#define PRESCALE_L 0x00100001
#define PRESCALE_H 0x00000200
#define SCALE_L 0x40C620E8
#define SCALE_H 0x0000839d
#elif SCALE_ADJUST == 0
#define PRESCALE_L 0x00200001
#define PRESCALE_H 0x00000800
#define SCALE_L 0x01040108
#define SCALE_H 0x00000108
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif
#define ALPHA_L 0x00000000
#define ALPHA_H 0x00ff0000
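
/* With SCALE_ADJUST == 5, each color component is expanded from 5 or 6 bits
 * to 8 roughly as
 *
 *    out = (((f * PRESCALE) >> SCALE_ADJUST) * SCALE) >> 16
 *
 * where f is the component still sitting in its RGB565 bit position and
 * PRESCALE/SCALE are the per-component 16-bit words of the constants above.
 * For example, a maximal red field gives
 * (((0xf800 * 0x0001) >> 5) * 0x20e8) >> 16 = 0xff, so every component lands
 * in [0, 255] before the alpha OR and the final pack.
 */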

/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 */

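/* Judging from the argument loads below, this routine takes the same
 * (source pointer, destination pointer, pixel count) arguments as the
 * BGRA8888_REV routines; the count is consumed four pixels per iteration
 * of the main loop, with the leftovers handled after it.
 */
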
	.text
	.globl	_generic_read_RGBA_span_RGB565_MMX
	.hidden	_generic_read_RGBA_span_RGB565_MMX
	.type	_generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:
	_CET_ENDBR
#ifdef USE_INNER_EMMS
	emms
#endif

	movl	4(%esp), %eax	/* source pointer */
	movl	8(%esp), %edx	/* destination pointer */
	movl	12(%esp), %ecx	/* number of pixels to copy */

	pushl	$MASK_565_H
	pushl	$MASK_565_L
	movq	(%esp), %mm5
	pushl	$PRESCALE_H
	pushl	$PRESCALE_L
	movq	(%esp), %mm6
	pushl	$SCALE_H
	pushl	$SCALE_L
	movq	(%esp), %mm7
	pushl	$ALPHA_H
	pushl	$ALPHA_L
	movq	(%esp), %mm3
	addl	$32, %esp

	sarl	$2, %ecx
	jl	.L01		/* Bail early if the count is negative. */
	jmp	.L02

.L03:
	/* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
	 * second pixels into the four words of %mm0 and %mm2.
	 */

	movq	(%eax), %mm4
	addl	$8, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2


	/* Mask the pixels so that each word of each register contains only
	 * one color component.
	 */

	pand	%mm5, %mm0
	pand	%mm5, %mm2


	/* Adjust the component values so that they are as small as possible,
	 * but large enough so that we can multiply them by an unsigned 16-bit
	 * number and get a value as large as 0x00ff0000.
	 */

	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif

	/* Scale the input component values to be in the range
	 * [0, 0x00ff0000].  This is the real magic of the whole routine.
	 */

	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2


	/* Always set the alpha value to 0xff.
	 */

	por	%mm3, %mm0
	por	%mm3, %mm2


	/* Pack the 16-bit values to 8-bit values and store the converted
	 * pixel data.
	 */
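
	/* PACKUSWB merges the four words of %mm0 (pixel 0) into the low four
	 * bytes and the four words of %mm2 (pixel 1) into the high four
	 * bytes, saturating each word to [0, 255], so the store below writes
	 * both pixels in R,G,B,A byte order.
	 */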

	packuswb	%mm2, %mm0
	movq	%mm0, (%edx)
	addl	$8, %edx

	pshufw	$0xaa, %mm4, %mm0
	pshufw	$0xff, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	%mm3, %mm0
	por	%mm3, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

	subl	$1, %ecx
.L02:
	jne	.L03


	/* At this point there can be at most 3 pixels left to process.  If
	 * there are either 2 or 3 left, process 2.
	 */

	movl	12(%esp), %ecx
	testl	$0x02, %ecx
	je	.L04

	movd	(%eax), %mm4
	addl	$4, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	%mm3, %mm0
	por	%mm3, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

.L04:
	/* At this point there can be at most 1 pixel left to process.
	 * Process it if needed.
	 */

	testl	$0x01, %ecx
	je	.L01

	movzwl	(%eax), %ecx
	movd	%ecx, %mm4

	pshufw	$0x00, %mm4, %mm0

	pand	%mm5, %mm0
	pmullw	%mm6, %mm0
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
#endif
	pmulhuw	%mm7, %mm0

	por	%mm3, %mm0

	packuswb	%mm0, %mm0

	movd	%mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
	emms
#endif
	ret
#endif /* !defined(__MINGW32__) && !defined(__APPLE__) */

#if defined (__ELF__) && defined (__linux__)
	.section .note.GNU-stack,"",%progbits
#endif
