;// file : mmx_zoom.s
;// author : JC Hoelt <jeko@free.fr>
;//
;// history
;//	07/01/2001 : changed FEMMS to EMMS : slower, but runs on Intel machines
;//	03/01/2001 : WIDTH and HEIGHT are now variable
;//	28/12/2000 : added comments to the code, removed some useless lines
;//	27/12/2000 : reduced memory accesses, improving performance by 20%
;//		coefficients are now stored on 1 byte
;//	22/12/2000 : changed data structure
;//	16/12/2000 : AT&T version
;//	14/12/2000 : unrolled loop
;//	12/12/2000 : 64-bit memory accesses
14
15
.data

/* printf format string for the debugging trace in mmx_zoom (that trace is
   currently commented out).  Bytes are kept exactly as before: .string
   appends its own terminating NUL after the explicit \0. */
chaine:
	.string	"pos = %d\n\0"
	.long 0x0

/* 64-bit all-zero constant, loaded with a single movq into %mm7 by mmx_zoom.
   Explicitly aligned on 8 bytes: it follows 15 bytes of data, so without the
   directive the 64-bit load would have no alignment guarantee. */
	.p2align 3
thezero:
	.long 0x00000000
	.long 0x00000000
25
.text

/* Entry point exported to the C program. */
.global mmx_zoom

/* Disabled declaration kept from the original: .extern coeffs  (the transformation buffer) */

/* Source and destination pixel buffers. */
.extern expix1
.extern expix2

/* Sizes of the buffers. */
.extern mmx_zoom_size
.extern zoom_width

/* Remaining external symbols referenced by this file. */
.extern brutS
.extern brutD
.extern buffratio
.extern precalCoef
.extern prevX
.extern prevY

#define PERTEMASK 15
/* computing  a / sqrtperte  is done as  a >> PERTEDEC */
#define PERTEDEC 4

.align 16
/*----------------------------------------------------------------------------
 * mmx_zoom
 *
 * Syntax : GNU as, AT&T operand order, 32-bit x86 with MMX.
 * In     : no stack arguments are read.  All inputs are globals:
 *            prevX, prevY, mmx_zoom_size, zoom_width, buffratio,
 *            brutS, brutD (arrays of 8-byte entries: 32-bit px at +0,
 *            32-bit py at +4), expix1, expix2, precalCoef.
 * Out    : nothing is returned in a register on purpose.
 * Saves  : %ebp, %ebx, %esi, %edi (callee-saved registers are now preserved).
 * Clobb  : %eax, %ecx, %edx, %mm0-%mm7, flags.  emms is executed before
 *          returning.
 * Stack  : 12 bytes of locals; -4(%ebp) = x limit, -8(%ebp) = y limit.
 *
 * For each index i from mmx_zoom_size-1 down to 1 (index 0 is not visited):
 *     px  = S[i].px + (((D[i].px - S[i].px) * buffratio) >> 16)
 *     py  = S[i].py + (((D[i].py - S[i].py) * buffratio) >> 16)
 *     pos = (py >> 4) * zoom_width + (px >> 4), or 0 when px/py reach a limit
 * followed by an MMX blend of four pixels weighted by four 8-bit coefficients.
 *
 * NOTE(review): every pixel/coefficient load and the final store are
 * commented out in this version, so the MMX section works on whatever the
 * MMX registers already hold and no result is written to memory.  They are
 * left disabled here exactly as found -- confirm whether that is intended.
 *
 * NOTE(review): the limits are computed as (prevX-1) >> 4 and (prevY-1) >> 4
 * but are compared against px/py before those are shifted right by 4.  A
 * left shift may have been intended -- confirm before changing.
 *--------------------------------------------------------------------------*/
mmx_zoom:
	pushl	%ebp
	movl	%esp, %ebp
	subl	$12, %esp		/* locals at -4(%ebp) and -8(%ebp) */
	pushl	%ebx			/* callee-saved, used as scratch below */
	pushl	%esi
	pushl	%edi

	movl	prevX, %eax
	decl	%eax
	sarl	$4, %eax
	movl	%eax, -4(%ebp)		/* x limit = (prevX - 1) >> 4 */

	movl	prevY, %eax
	decl	%eax
	sarl	$4, %eax
	movl	%eax, -8(%ebp)		/* y limit = (prevY - 1) >> 4 */

	/* mm7 = 0, used as the zero operand of every unpack/pack below */
	movq	thezero, %mm7

	movl	mmx_zoom_size, %ecx
	decl	%ecx			/* ecx = index of the last entry */
	/* The loop only stops when ecx reaches exactly 0, so entering it with
	   ecx <= 0 would never terminate: skip it for sizes 0 and 1. */
	jle	.Lzoom_done

.Lzoom_next:
	/* edx = brutS[ecx].px, eax = brutS[ecx].py */
	movl	brutS, %eax
	movl	(%eax,%ecx,8), %edx
	movl	4(%eax,%ecx,8), %eax

	/* esi = px = S.px + (((D.px - S.px) * buffratio) >> 16) */
	movl	brutD, %ebx
	movl	(%ebx,%ecx,8), %esi
	subl	%edx, %esi
	imull	buffratio, %esi
	sarl	$16, %esi
	addl	%edx, %esi

	/* edi = py = S.py + (((D.py - S.py) * buffratio) >> 16) */
	movl	4(%ebx,%ecx,8), %edi
	subl	%eax, %edi
	imull	buffratio, %edi
	sarl	$16, %edi
	addl	%eax, %edi

	/* eax = (px & 15) * 64 + (py & 15) * 4 : byte offset of a 4-byte
	   entry selected by the low 4 bits of px (coefh) and py (coefv) */
	movl	%esi, %eax
	andl	$15, %eax		/* eax = coefh */
	movl	%edi, %ebx
	andl	$15, %ebx		/* ebx = coefv */
	leal	0(,%ebx,4), %ebx
	sall	$6, %eax
	addl	%ebx, %eax
	movl	$precalCoef, %ebx	/* ebx = address of precalCoef */
	/* DISABLED: movd (%eax,%ebx), %mm6      -- mm6 = coeffs */

	/* pos = 0 when py or px has reached its limit (signed compare) */
	cmpl	-8(%ebp), %edi
	jge	.Lzoom_outside
	cmpl	-4(%ebp), %esi
	jge	.Lzoom_outside

	sarl	$4, %esi		/* drop the 4 low bits of px */
	sarl	$4, %edi		/* drop the 4 low bits of py */
	imull	zoom_width, %edi
	addl	%edi, %esi		/* esi = pos = y * zoom_width + x */
	jmp	.Lzoom_pos_ready

.Lzoom_outside:
	xorl	%esi, %esi		/* esi = pos = 0 */

.Lzoom_pos_ready:
	/* From here on: esi = pos; mm6 is meant to hold the coefficients. */
	/* Debug trace (disabled): pushl %esi; pushl $chaine; call printf; addl $8,%esp */

	movl	expix1, %eax

	/* Fetch the first two pixels into mm0 and mm1. */
	/* DISABLED: movq (%eax,%esi,4), %mm0    -- b1-g1-r1-a1-b2-g2-r2-a2 */
	movq	%mm0, %mm1		/* b1-g1-r1-a1-b2-g2-r2-a2 */

	/* unpack the first pixel to 16-bit lanes */
	punpcklbw %mm7, %mm0		/* 00-b2-00-g2-00-r2-00-a2 */

	movq	%mm6, %mm5		/* ??-??-??-??-c4-c3-c2-c1 */
	/* unpack the second pixel to 16-bit lanes */
	punpckhbw %mm7, %mm1		/* 00-b1-00-g1-00-r1-00-a1 */

	/* spread each coefficient byte over four lanes */
	punpcklbw %mm5, %mm6		/* c4-c4-c3-c3-c2-c2-c1-c1 */
	movq	%mm6, %mm4		/* c4-c4-c3-c3-c2-c2-c1-c1 */
	movq	%mm6, %mm5		/* c4-c4-c3-c3-c2-c2-c1-c1 */

	punpcklbw %mm5, %mm6		/* c2-c2-c2-c2-c1-c1-c1-c1 */
	punpckhbw %mm5, %mm4		/* c4-c4-c4-c4-c3-c3-c3-c3 */

	movq	%mm6, %mm3		/* c2-c2-c2-c2-c1-c1-c1-c1 */
	punpcklbw %mm7, %mm6		/* 00-c1-00-c1-00-c1-00-c1 */
	punpckhbw %mm7, %mm3		/* 00-c2-00-c2-00-c2-00-c2 */

	/* multiply the pixels by their coefficients and accumulate */
	pmullw	%mm6, %mm0		/* c1*b2-c1*g2-c1*r2-c1*a2 */
	pmullw	%mm3, %mm1		/* c2*b1-c2*g1-c2*r1-c2*a1 */
	paddw	%mm1, %mm0

	/* widen the last two coefficients */
	movq	%mm4, %mm5		/* c4-c4-c4-c4-c3-c3-c3-c3 */
	punpcklbw %mm7, %mm4		/* 00-c3-00-c3-00-c3-00-c3 */
	punpckhbw %mm7, %mm5		/* 00-c4-00-c4-00-c4-00-c4 */

	/* advance pos by one line length.
	   NOTE(review): the stride added is prevX, while the row offset above
	   was computed with zoom_width -- presumably equal; confirm. */
	addl	prevX, %esi

	/* Fetch the last two pixels. */
	/* DISABLED: movq (%eax,%esi,4), %mm1 */
	movq	%mm1, %mm2

	/* unpack both pixels to 16-bit lanes */
	punpcklbw %mm7, %mm1
	punpckhbw %mm7, %mm2

	/* multiply by the coefficients */
	pmullw	%mm4, %mm1
	pmullw	%mm5, %mm2

	/* add the products to the running total */
	paddw	%mm1, %mm0
	paddw	%mm2, %mm0

	/* divide by 256 (original comment: 256 = 16+16+16+16), then repack
	   the 16-bit lanes to bytes with unsigned saturation */
	psrlw	$8, %mm0
	packuswb %mm7, %mm0

	/* Store the result. */
	movl	expix2, %eax
	/* DISABLED: movd %mm0, (%eax,%ecx,4) */

	/* next entry; decl sets ZF, the loop ends when ecx reaches 0 */
	decl	%ecx
	jnz	.Lzoom_next

.Lzoom_done:
	emms				/* leave MMX state so x87 code can run */

	popl	%edi
	popl	%esi
	popl	%ebx
	movl	%ebp, %esp
	popl	%ebp

	ret
201