xref: /reactos/dll/opengl/mesa/asm-386.S (revision d6d1efe7)
1/* $Id: asm-386.S,v 1.8 1997/12/17 00:50:51 brianp Exp $ */
2
3/*
4 * asm-386.S - special (hopefully faster) transformation functions for x86
5 *
6 * by Josh Vanderhoof
7 *
8 * This file is in the public domain.
9 */
10
11/*
12 * $Log: asm-386.S,v $
13 * Revision 1.8  1997/12/17 00:50:51  brianp
14 * applied Josh's patch to fix texture coordinate transformation bugs
15 *
16 * Revision 1.7  1997/12/17 00:27:11  brianp
17 * applied Josh's patch to fix bfris
18 *
19 * Revision 1.6  1997/12/01 01:02:41  brianp
20 * added FreeBSD patches (Daniel J. O'Connor)
21 *
22 * Revision 1.5  1997/11/19 23:52:17  brianp
23 * added missing "cld" instruction in asm_transform_points4_identity()
24 *
25 * Revision 1.4  1997/11/11 02:22:41  brianp
26 * small change per Josh to ensure U/V pairing
27 *
28 * Revision 1.3  1997/11/07 03:37:24  brianp
29 * added missing line from Stephane Rehel
30 *
31 * Revision 1.2  1997/11/07 03:30:37  brianp
32 * added Josh's 11-5-97 patches
33 *
34 * Revision 1.1  1997/10/30 06:00:33  brianp
35 * Initial revision
36 */
37
38#include <asm.inc>
39
/*
 * Operand shorthands used by the routines in this file:
 *
 *   S(x)    - dword number x counted from esi
 *   D(x)    - dword number x counted from edi
 *   M(x, y) - dword number 4*x + y counted from edx
 *
 * The arguments are parenthesised so that an expression argument is
 * scaled as a whole.
 */
#define S(x)    dword ptr [esi + 4*(x)]
#define D(x)    dword ptr [edi + 4*(x)]
#define M(x, y) dword ptr [edx + 16*(x) + 4*(y)]
43
44.code
45
/*
 * void asm_transform_points3_general( GLuint n, GLfloat d[][4],
 *                                     GLfloat m[16], GLfloat s[][4] );
 *
 * For each of the n points, reads S(0..2) and writes, for j = 0..3:
 *
 *   D(j) = S(0)*M(0,j) + S(1)*M(1,j) + S(2)*M(2,j) + M(3,j)
 *
 * Arguments are read from the stack.  esi and edi are saved and
 * restored; ecx and edx are used as scratch.  Each iteration holds up
 * to eight values on the x87 register stack and pops all of them again.
 *
 * The alignment directive precedes the entry label so that the entry
 * point itself is aligned and no padding is executed on a call.
 *
 * In the stack-shuffle comments below, the numbers list (top of stack
 * first) which slot of the preceding diagram each entry came from.
 */
.align 4
PUBLIC _asm_transform_points3_general
_asm_transform_points3_general:
	push esi
	push edi

	/* two registers pushed + return address: first argument is at esp + 12 */
	mov ecx, [esp + 12]	/* ecx = n */
	mov edi, [esp + 16]	/* edi = d */
	mov edx, [esp + 20]	/* edx = m */
	mov esi, [esp + 24]	/* esi = s */

	test ecx, ecx
	jz _asm_transform_points3_general_end	/* n == 0: nothing to do */

.align 4
_asm_transform_points3_general_loop:
	fld S(0)
	fmul M(0, 0)
	fld S(0)
	fmul M(0, 1)
	fld S(0)
	fmul M(0, 2)
	fld S(0)
	fmul M(0, 3)

	fld S(1)
	fmul M(1, 0)
	fld S(1)
	fmul M(1, 1)
	fld S(1)
	fmul M(1, 2)
	fld S(1)
	fmul M(1, 3)

	/*
	 * The FPU stack should now look like this:
	 *
	 * st(7) = S(0) * M(0, 0)
	 * st(6) = S(0) * M(0, 1)
	 * st(5) = S(0) * M(0, 2)
	 * st(4) = S(0) * M(0, 3)
	 * st(3) = S(1) * M(1, 0)
	 * st(2) = S(1) * M(1, 1)
	 * st(1) = S(1) * M(1, 2)
	 * st(0) = S(1) * M(1, 3)
	 */

	fxch st(3)		/* 3 1 2 0 4 5 6 7 */
	faddp st(7), st		/* 1 2 0 4 5 6 7 */
	fxch st(1)		/* 2 1 0 4 5 6 7 */
	faddp st(5), st		/* 1 0 4 5 6 7 */
	faddp st(3), st		/* 0 4 5 6 7 */
	faddp st(1), st		/* 4 5 6 7 */

	/*
	 * st(3) = S(0) * M(0, 0) + S(1) * M(1, 0)
	 * st(2) = S(0) * M(0, 1) + S(1) * M(1, 1)
	 * st(1) = S(0) * M(0, 2) + S(1) * M(1, 2)
	 * st(0) = S(0) * M(0, 3) + S(1) * M(1, 3)
	 */

	fld S(2)
	fmul M(2, 0)
	fld S(2)
	fmul M(2, 1)
	fld S(2)
	fmul M(2, 2)
	fld S(2)
	fmul M(2, 3)

	/*
	 * st(7) = S(0) * M(0, 0) + S(1) * M(1, 0)
	 * st(6) = S(0) * M(0, 1) + S(1) * M(1, 1)
	 * st(5) = S(0) * M(0, 2) + S(1) * M(1, 2)
	 * st(4) = S(0) * M(0, 3) + S(1) * M(1, 3)
	 * st(3) = S(2) * M(2, 0)
	 * st(2) = S(2) * M(2, 1)
	 * st(1) = S(2) * M(2, 2)
	 * st(0) = S(2) * M(2, 3)
	 */

	fxch st(3)		/* 3 1 2 0 4 5 6 7 */
	faddp st(7), st		/* 1 2 0 4 5 6 7 */
	fxch st(1)		/* 2 1 0 4 5 6 7 */
	faddp st(5), st		/* 1 0 4 5 6 7 */
	faddp st(3), st		/* 0 4 5 6 7 */
	faddp st(1), st		/* 4 5 6 7 */

	/*
	 * st(3) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0)
	 * st(2) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1)
	 * st(1) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2)
	 * st(0) = S(0) * M(0, 3) + S(1) * M(1, 3) + S(2) * M(2, 3)
	 */

	/* bring each sum to the top in turn and add its M(3, j) term */
	fxch st(3)		/* 3 1 2 0 */
	fadd M(3, 0)
	fxch st(2)		/* 2 1 3 0 */
	fadd M(3, 1)
	fxch st(1)		/* 1 2 3 0 */
	fadd M(3, 2)
	fxch st(3)		/* 0 2 3 1 */
	fadd M(3, 3)

	/*
	 * st(3) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2) + M(3, 2)
	 * st(2) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0) + M(3, 0)
	 * st(1) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1) + M(3, 1)
	 * st(0) = S(0) * M(0, 3) + S(1) * M(1, 3) + S(2) * M(2, 3) + M(3, 3)
	 */

	fxch st(3)		/* 3 1 2 0 */
	fstp D(2)		/* 1 2 0 */
	fxch st(1)		/* 2 1 0 */
	fstp D(0)		/* 1 0 */
	lea esi, S(4)		/* advance s to the next point */
	fstp D(1)		/* 0 */
	dec ecx			/* sets ZF for the jnz below; fstp and lea leave it alone */
	fstp D(3)		/* */

	lea edi, D(4)		/* advance d to the next point */

	jnz _asm_transform_points3_general_loop

_asm_transform_points3_general_end:
	pop edi
	pop esi
	ret
178
179
/*
 * void asm_transform_points3_identity( GLuint n, GLfloat d[][4],
 *                                      GLfloat s[][4] );
 *
 * For each of the n points, copies S(0..2) to D(0..2) as raw dwords and
 * stores the bit pattern 0x3f800000 (1.0 as an IEEE single) in D(3).
 *
 * Arguments are read from the stack.  esi, edi, ebx and ebp are saved
 * and restored; eax, ecx and edx are used as scratch.
 *
 * The alignment directive precedes the entry label so that the entry
 * point itself is aligned and no padding is executed on a call.
 */
.align 4
PUBLIC _asm_transform_points3_identity
_asm_transform_points3_identity:
	push esi
	push edi
	/* the arguments are fetched before ebx/ebp are pushed, so the offsets
	   account for two saved registers plus the return address */
	mov ecx, [esp + 12]	/* ecx = n */
	mov edi, [esp + 16]	/* edi = d */
	mov esi, [esp + 20]	/* esi = s */
	push ebx
	push ebp

	test ecx, ecx
	jz _asm_transform_points3_identity_end	/* n == 0: nothing to do */

	mov ebp, HEX(3f800000)	/* value written to every D(3) */

.align 4
_asm_transform_points3_identity_loop:
	mov eax, S(0)
	mov edx, S(1)
	mov ebx, S(2)
	lea esi, S(4)		/* advance s to the next point */
	mov D(0), eax
	mov D(1), edx
	mov D(2), ebx
	mov D(3), ebp
	dec ecx			/* sets ZF for the jnz below; lea leaves it alone */
	lea edi, D(4)		/* advance d to the next point */
	jnz _asm_transform_points3_identity_loop

_asm_transform_points3_identity_end:
	pop ebp
	pop ebx
	pop edi
	pop esi
	ret
220
221
/*
 * void asm_transform_points3_2d( GLuint n, GLfloat d[][4], GLfloat m[16],
 *                                GLfloat s[][4] );
 *
 * For each of the n points:
 *
 *   D(0) = S(0)*M(0,0) + S(1)*M(1,0) + M(3,0)
 *   D(1) = S(0)*M(0,1) + S(1)*M(1,1) + M(3,1)
 *   D(2) = S(2)              (copied as a raw dword)
 *   D(3) = 0x3f800000        (1.0 as an IEEE single)
 *
 * If n is odd, one point is handled on its own first; the main loop
 * then processes two points per iteration.
 *
 * Arguments are read from the stack.  esi, edi and ebp are saved and
 * restored; eax, ecx and edx are used as scratch.  The main loop holds
 * up to eight values on the x87 register stack.
 *
 * The alignment directive precedes the entry label so that the entry
 * point itself is aligned and no padding is executed on a call.
 *
 * In the stack-shuffle comments below, the numbers list (top of stack
 * first) which slot of the preceding diagram each entry came from.
 */
.align 4
PUBLIC _asm_transform_points3_2d
_asm_transform_points3_2d:
	push esi
	push edi
	mov ecx, [esp + 12]	/* ecx = n */
	mov edi, [esp + 16]	/* edi = d */
	mov edx, [esp + 20]	/* edx = m */
	mov esi, [esp + 24]	/* esi = s */
	push ebp

	mov ebp, HEX(3f800000)	/* value written to every D(3) */

	test cl, DEC(1)
	jz _asm_transform_points3_2d_step	/* even n: go straight to the paired loop */

	/* odd n: transform a single point so an even count remains */
	dec ecx

	fld S(0)
	fmul M(0, 0)
	fld S(0)
	fmul M(0, 1)
	fld S(1)
	fmul M(1, 0)
	fld S(1)
	fmul M(1, 1)

	/*
	 * st(3) = S(0) * M(0, 0)
	 * st(2) = S(0) * M(0, 1)
	 * st(1) = S(1) * M(1, 0)
	 * st(0) = S(1) * M(1, 1)
	 */

	fxch st(1)		/* 1 0 2 3 */
	fadd M(3, 0)
	fxch st(1)		/* 0 1 2 3 */
	fadd M(3, 1)
	fxch st(1)		/* 1 0 2 3 */
	faddp st(3), st		/* 0 2 3 */
	faddp st(1), st		/* 2 3 */
	fstp D(1)		/* 3 */
	fstp D(0)		/* */
	mov eax, S(2)
	lea esi, S(4)		/* advance s by one point */
	mov D(3), ebp
	mov D(2), eax
	lea edi, D(4)		/* advance d by one point */

_asm_transform_points3_2d_step:
	test ecx, ecx
	jz _asm_transform_points3_2d_end	/* no pairs left */

.align 4
_asm_transform_points3_2d_loop:
	/* S(0..2) is the first point of the pair, S(4..6) the second */
	fld S(0)
	fmul M(0, 0)
	fld S(0)
	fmul M(0, 1)
	fld S(4)
	fmul M(0, 0)
	fld S(4)
	fmul M(0, 1)
	fld S(1)
	fmul M(1, 0)
	fld S(1)
	fmul M(1, 1)
	fld S(5)
	fmul M(1, 0)
	fld S(5)
	fmul M(1, 1)

	/*
	 * st(7) = S(0) * M(0, 0)
	 * st(6) = S(0) * M(0, 1)
	 * st(5) = S(4) * M(0, 0)
	 * st(4) = S(4) * M(0, 1)
	 * st(3) = S(1) * M(1, 0)
	 * st(2) = S(1) * M(1, 1)
	 * st(1) = S(5) * M(1, 0)
	 * st(0) = S(5) * M(1, 1)
	 */

	fxch st(7)		/* 7 1 2 3 4 5 6 0 */
	fadd M(3, 0)
	fxch st(6)		/* 6 1 2 3 4 5 7 0 */
	fadd M(3, 1)
	fxch st(5)		/* 5 1 2 3 4 6 7 0 */
	fadd M(3, 0)
	fxch st(4)		/* 4 1 2 3 5 6 7 0 */
	fadd M(3, 1)

	/* integer work for both points, placed between the x87 operations */
	mov eax, S(2)
	mov D(3), ebp
	mov D(2), eax
	mov eax, S(6)
	mov D(7), ebp
	mov D(6), eax
	lea esi, S(8)		/* advance s by two points */
	sub ecx, DEC(2)		/* sets ZF for the jnz below; x87 ops and lea leave it alone */

	/*
	 * st(7) = S(5) * M(1, 1)
	 * st(6) = S(0) * M(0, 0) + M(3, 0)
	 * st(5) = S(0) * M(0, 1) + M(3, 1)
	 * st(4) = S(4) * M(0, 0) + M(3, 0)
	 * st(3) = S(1) * M(1, 0)
	 * st(2) = S(1) * M(1, 1)
	 * st(1) = S(5) * M(1, 0)
	 * st(0) = S(4) * M(0, 1) + M(3, 1)
	 */

	faddp st(7), st		/* 1 2 3 4 5 6 7 */
	faddp st(3), st		/* 2 3 4 5 6 7 */
	faddp st(3), st		/* 3 4 5 6 7 */
	faddp st(3), st		/* 4 5 6 7 */
	fxch st(3)		/* 7 5 6 4 */
	fstp D(5)		/* 5 6 4 */
	fstp D(1)		/* 6 4 */
	fstp D(0)		/* 4 */
	fstp D(4)		/* */

	lea edi, D(8)		/* advance d by two points */
	jnz _asm_transform_points3_2d_loop

_asm_transform_points3_2d_end:
	pop ebp
	pop edi
	pop esi
	ret
356
357
/*
 * void asm_transform_points3_2d_no_rot( GLuint n, GLfloat d[][4],
 *                                       GLfloat m[16], GLfloat s[][4] );
 *
 * For each of the n points:
 *
 *   D(0) = S(0)*M(0,0) + M(3,0)
 *   D(1) = S(1)*M(1,1) + M(3,1)
 *   D(2) = S(2)              (copied as a raw dword)
 *   D(3) = 0x3f800000        (1.0 as an IEEE single)
 *
 * Arguments are read from the stack.  esi, edi and ebp are saved and
 * restored; eax, ecx and edx are used as scratch.
 *
 * The alignment directive precedes the entry label so that the entry
 * point itself is aligned and no padding is executed on a call.
 */
.align 4
PUBLIC _asm_transform_points3_2d_no_rot
_asm_transform_points3_2d_no_rot:
	push esi
	push edi
	mov ecx, [esp + 12]	/* ecx = n */
	mov edi, [esp + 16]	/* edi = d */
	mov edx, [esp + 20]	/* edx = m */
	mov esi, [esp + 24]	/* esi = s */
	push ebp

	test ecx, ecx
	jz _asm_transform_points3_2d_no_rot_end	/* n == 0: nothing to do */

	mov ebp, HEX(3f800000)	/* value written to every D(3) */

.align 4
_asm_transform_points3_2d_no_rot_loop:
	fld S(0)
	fmul M(0, 0)
	fld S(1)
	fmul M(1, 1)		/* st(1) = S(0)*M(0,0), st(0) = S(1)*M(1,1) */
	fxch st(1)
	fadd M(3, 0)
	fxch st(1)
	fadd M(3, 1)
	fxch st(1)		/* D(0) value back on top */
	fstp D(0)
	fstp D(1)

	mov eax, S(2)   /* cycle 1: U pipe */
	mov D(3), ebp   /*          V pipe */
	mov D(2), eax   /* cycle 2: U pipe */

	dec ecx			/* sets ZF for the jnz below; lea leaves it alone */
	lea esi, S(4)		/* advance s to the next point */
	lea edi, D(4)		/* advance d to the next point */
	jnz _asm_transform_points3_2d_no_rot_loop

_asm_transform_points3_2d_no_rot_end:
	pop ebp
	pop edi
	pop esi
	ret
407
408
409
/*
 * void asm_transform_points3_3d( GLuint n, GLfloat d[][4], GLfloat m[16],
 *                                GLfloat s[][4] );
 *
 * For each of the n points, for j = 0..2:
 *
 *   D(j) = S(0)*M(0,j) + S(1)*M(1,j) + S(2)*M(2,j) + M(3,j)
 *
 * and D(3) = 0x3f800000 (1.0 as an IEEE single).
 *
 * Arguments are read from the stack.  esi and edi are saved and
 * restored; eax, ecx and edx are used as scratch.
 *
 * The alignment directive precedes the entry label so that the entry
 * point itself is aligned and no padding is executed on a call.
 *
 * In the stack-shuffle comments below, the numbers list (top of stack
 * first) which slot of the preceding diagram each entry came from.
 */
.align 4
PUBLIC _asm_transform_points3_3d
_asm_transform_points3_3d:
	push esi
	push edi
	mov ecx, [esp + 12]	/* ecx = n */
	mov edi, [esp + 16]	/* edi = d */
	mov edx, [esp + 20]	/* edx = m */
	mov esi, [esp + 24]	/* esi = s */

	test ecx, ecx
	jz _asm_transform_points3_3d_end	/* n == 0: nothing to do */

	mov eax, HEX(3f800000)	/* value written to every D(3) */

.align 4
_asm_transform_points3_3d_loop:
	fld S(0)
	fmul M(0, 0)
	fld S(0)
	fmul M(0, 1)
	fld S(0)
	fmul M(0, 2)

	fld S(1)
	fmul M(1, 0)
	fld S(1)
	fmul M(1, 1)
	fld S(1)
	fmul M(1, 2)

	/*
	 * st(5) = S(0) * M(0, 0)
	 * st(4) = S(0) * M(0, 1)
	 * st(3) = S(0) * M(0, 2)
	 * st(2) = S(1) * M(1, 0)
	 * st(1) = S(1) * M(1, 1)
	 * st(0) = S(1) * M(1, 2)
	 */

	fxch st(2)		/* 2 1 0 3 4 5 */
	faddp st(5), st		/* 1 0 3 4 5 */
	faddp st(3), st		/* 0 3 4 5 */
	faddp st(1), st		/* 3 4 5 */

	/*
	 * st(2) = S(0) * M(0, 0) + S(1) * M(1, 0)
	 * st(1) = S(0) * M(0, 1) + S(1) * M(1, 1)
	 * st(0) = S(0) * M(0, 2) + S(1) * M(1, 2)
	 */

	fld S(2)
	fmul M(2, 0)
	fld S(2)
	fmul M(2, 1)
	fld S(2)
	fmul M(2, 2)

	/*
	 * st(5) = S(0) * M(0, 0) + S(1) * M(1, 0)
	 * st(4) = S(0) * M(0, 1) + S(1) * M(1, 1)
	 * st(3) = S(0) * M(0, 2) + S(1) * M(1, 2)
	 * st(2) = S(2) * M(2, 0)
	 * st(1) = S(2) * M(2, 1)
	 * st(0) = S(2) * M(2, 2)
	 */

	fxch st(2)		/* 2 1 0 3 4 5 */
	faddp st(5), st		/* 1 0 3 4 5 */
	faddp st(3), st		/* 0 3 4 5 */
	faddp st(1), st		/* 3 4 5 */

	/*
	 * st(2) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0)
	 * st(1) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1)
	 * st(0) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2)
	 */

	/* bring each sum to the top in turn and add its M(3, j) term */
	fxch st(2)		/* 2 1 0 */
	fadd M(3, 0)
	fxch st(1)		/* 1 2 0 */
	fadd M(3, 1)
	fxch st(2)		/* 0 2 1 */
	fadd M(3, 2)

	fxch st(1)		/* 2 0 1 */
	fstp D(0)		/* 0 1 */
	fstp D(2)		/* 1 */
	fstp D(1)		/* */
	mov D(3), eax

	lea esi, S(4)		/* advance s to the next point */
	dec ecx			/* sets ZF for the jnz below; lea leaves it alone */

	lea edi, D(4)		/* advance d to the next point */

	jnz _asm_transform_points3_3d_loop

_asm_transform_points3_3d_end:
	pop edi
	pop esi
	ret
516
517
518
/*
 * void asm_transform_points4_general( GLuint n, GLfloat d[][4],
 *                                     GLfloat m[16], GLfloat s[][4] );
 *
 * For each of the n points, for j = 0..3:
 *
 *   D(j) = S(0)*M(0,j) + S(1)*M(1,j) + S(2)*M(2,j) + S(3)*M(3,j)
 *
 * Arguments are read from the stack.  esi and edi are saved and
 * restored; ecx and edx are used as scratch.  Each iteration holds up
 * to eight values on the x87 register stack and pops all of them again.
 *
 * The alignment directive precedes the entry label so that the entry
 * point itself is aligned and no padding is executed on a call.
 *
 * In the stack-shuffle comments below, the numbers list (top of stack
 * first) which slot of the preceding diagram each entry came from.
 */
.align 4
PUBLIC _asm_transform_points4_general
_asm_transform_points4_general:
	push esi
	push edi
	mov ecx, [esp + 12]	/* ecx = n */
	mov edi, [esp + 16]	/* edi = d */
	mov edx, [esp + 20]	/* edx = m */
	mov esi, [esp + 24]	/* esi = s */

	test ecx, ecx
	jz _asm_transform_points4_general_end	/* n == 0: nothing to do */

.align 4
_asm_transform_points4_general_loop:
	fld S(0)
	fmul M(0, 0)
	fld S(0)
	fmul M(0, 1)
	fld S(0)
	fmul M(0, 2)
	fld S(0)
	fmul M(0, 3)

	fld S(1)
	fmul M(1, 0)
	fld S(1)
	fmul M(1, 1)
	fld S(1)
	fmul M(1, 2)
	fld S(1)
	fmul M(1, 3)

	/*
	 * st(7) = S(0) * M(0, 0)
	 * st(6) = S(0) * M(0, 1)
	 * st(5) = S(0) * M(0, 2)
	 * st(4) = S(0) * M(0, 3)
	 * st(3) = S(1) * M(1, 0)
	 * st(2) = S(1) * M(1, 1)
	 * st(1) = S(1) * M(1, 2)
	 * st(0) = S(1) * M(1, 3)
	 */

	fxch st(3)		/* 3 1 2 0 4 5 6 7 */
	faddp st(7), st		/* 1 2 0 4 5 6 7 */
	fxch st(1)		/* 2 1 0 4 5 6 7 */
	faddp st(5), st		/* 1 0 4 5 6 7 */
	faddp st(3), st		/* 0 4 5 6 7 */
	faddp st(1), st		/* 4 5 6 7 */

	/*
	 * st(3) = S(0) * M(0, 0) + S(1) * M(1, 0)
	 * st(2) = S(0) * M(0, 1) + S(1) * M(1, 1)
	 * st(1) = S(0) * M(0, 2) + S(1) * M(1, 2)
	 * st(0) = S(0) * M(0, 3) + S(1) * M(1, 3)
	 */

	fld S(2)
	fmul M(2, 0)
	fld S(2)
	fmul M(2, 1)
	fld S(2)
	fmul M(2, 2)
	fld S(2)
	fmul M(2, 3)

	/*
	 * st(7) = S(0) * M(0, 0) + S(1) * M(1, 0)
	 * st(6) = S(0) * M(0, 1) + S(1) * M(1, 1)
	 * st(5) = S(0) * M(0, 2) + S(1) * M(1, 2)
	 * st(4) = S(0) * M(0, 3) + S(1) * M(1, 3)
	 * st(3) = S(2) * M(2, 0)
	 * st(2) = S(2) * M(2, 1)
	 * st(1) = S(2) * M(2, 2)
	 * st(0) = S(2) * M(2, 3)
	 */

	fxch st(3)		/* 3 1 2 0 4 5 6 7 */
	faddp st(7), st		/* 1 2 0 4 5 6 7 */
	fxch st(1)		/* 2 1 0 4 5 6 7 */
	faddp st(5), st		/* 1 0 4 5 6 7 */
	faddp st(3), st		/* 0 4 5 6 7 */
	faddp st(1), st		/* 4 5 6 7 */

	/*
	 * st(3) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0)
	 * st(2) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1)
	 * st(1) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2)
	 * st(0) = S(0) * M(0, 3) + S(1) * M(1, 3) + S(2) * M(2, 3)
	 */

	fld S(3)
	fmul M(3, 0)
	fld S(3)
	fmul M(3, 1)
	fld S(3)
	fmul M(3, 2)
	fld S(3)
	fmul M(3, 3)

	/*
	 * st(7) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0)
	 * st(6) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1)
	 * st(5) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2)
	 * st(4) = S(0) * M(0, 3) + S(1) * M(1, 3) + S(2) * M(2, 3)
	 * st(3) = S(3) * M(3, 0)
	 * st(2) = S(3) * M(3, 1)
	 * st(1) = S(3) * M(3, 2)
	 * st(0) = S(3) * M(3, 3)
	 */

	fxch st(3)		/* 3 1 2 0 4 5 6 7 */
	faddp st(7), st		/* 1 2 0 4 5 6 7 */
	fxch st(1)		/* 2 1 0 4 5 6 7 */
	faddp st(5), st		/* 1 0 4 5 6 7 */
	faddp st(3), st		/* 0 4 5 6 7 */

	lea esi, S(4)		/* advance s; every source load is already done */
	dec ecx			/* sets ZF for the jnz below; x87 ops and lea leave it alone */

	faddp st(1), st		/* 4 5 6 7 */

	/*
	 * st(3) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0) + S(3) * M(3, 0)
	 * st(2) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1) + S(3) * M(3, 1)
	 * st(1) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2) + S(3) * M(3, 2)
	 * st(0) = S(0) * M(0, 3) + S(1) * M(1, 3) + S(2) * M(2, 3) + S(3) * M(3, 3)
	 */

	fxch st(3)		/* 3 1 2 0 */
	fstp D(0)		/* 1 2 0 */
	fxch st(1)		/* 2 1 0 */
	fstp D(1)		/* 1 0 */
	fstp D(2)		/* 0 */
	fstp D(3)		/* */

	lea edi, D(4)		/* advance d to the next point */

	jnz _asm_transform_points4_general_loop

_asm_transform_points4_general_end:
	pop edi
	pop esi
	ret
668
669
670
/*
 * void asm_transform_points4_identity( GLuint n, GLfloat d[][4],
 *                                      GLfloat s[][4] );
 *
 * Copies 4*n dwords from s to d with a forward "rep movsd".
 *
 * Arguments are read from the stack.  esi and edi are saved and
 * restored; ecx is used as the copy counter and the direction flag is
 * cleared.
 *
 * The alignment directive precedes the entry label so that the entry
 * point itself is aligned and no padding is executed on a call.
 */
.align 4
PUBLIC _asm_transform_points4_identity
_asm_transform_points4_identity:
	push esi
	push edi
	mov ecx, [esp + 12]	/* ecx = n */
	mov edi, [esp + 16]	/* edi = d */
	mov esi, [esp + 20]	/* esi = s */

	lea ecx, [ecx * 4]	/* four dwords per point */

	cld			/* make movsd step forwards */
	rep movsd		/* copies nothing when ecx == 0 */

	pop edi
	pop esi
	ret
692
693
694
/*
 * void asm_transform_points4_2d( GLuint n, GLfloat d[][4], GLfloat m[16],
 *                                GLfloat s[][4] );
 *
 * For each of the n points:
 *
 *   D(0) = S(0)*M(0,0) + S(1)*M(1,0) + S(3)*M(3,0)
 *   D(1) = S(0)*M(0,1) + S(1)*M(1,1) + S(3)*M(3,1)
 *   D(2) = S(2)              (copied as a raw dword)
 *   D(3) = S(3)              (copied as a raw dword)
 *
 * Arguments are read from the stack.  esi, edi and ebx are saved and
 * restored (ebx only when n != 0, the only case that uses it); eax,
 * ecx and edx are used as scratch.
 *
 * The alignment directive precedes the entry label so that the entry
 * point itself is aligned and no padding is executed on a call.
 */
.align 4
PUBLIC _asm_transform_points4_2d
_asm_transform_points4_2d:
	push esi
	push edi
	mov ecx, [esp + 12]	/* ecx = n */
	mov edi, [esp + 16]	/* edi = d */
	mov edx, [esp + 20]	/* edx = m */
	mov esi, [esp + 24]	/* esi = s */

	test ecx, ecx
	jz _asm_transform_points4_2d_end	/* n == 0: ebx has not been pushed yet */

	push ebx

.align 4
_asm_transform_points4_2d_loop:
	fld S(0)
	fmul M(0, 0)
	fld S(0)
	fmul M(0, 1)
	fld S(1)
	fmul M(1, 0)
	fld S(1)
	fmul M(1, 1)
	fld S(3)
	fmul M(3, 0)
	fld S(3)
	fmul M(3, 1)

	/*
	 * st(5) = S(0) * M(0, 0)
	 * st(4) = S(0) * M(0, 1)
	 * st(3) = S(1) * M(1, 0)
	 * st(2) = S(1) * M(1, 1)
	 * st(1) = S(3) * M(3, 0)
	 * st(0) = S(3) * M(3, 1)
	 */

	mov eax, S(2)
	mov ebx, S(3)
	lea esi, S(4)		/* advance s; every source load is already done */
	dec ecx			/* sets ZF for the jnz below; mov, x87 ops and lea leave it alone */
	mov D(2), eax
	mov D(3), ebx
	faddp st(4), st		/* S(0)*M(0,1) += S(3)*M(3,1) */
	faddp st(4), st		/* S(0)*M(0,0) += S(3)*M(3,0) */
	faddp st(2), st		/* D(1) sum    += S(1)*M(1,1) */
	faddp st(2), st		/* D(0) sum    += S(1)*M(1,0) */
	fstp D(1)
	fstp D(0)
	lea edi, D(4)		/* advance d to the next point */
	jnz _asm_transform_points4_2d_loop

	pop ebx

_asm_transform_points4_2d_end:
	pop edi
	pop esi
	ret
759
760
761
/*
 * void asm_transform_points4_2d_no_rot( GLuint n, GLfloat d[][4],
 *                                       GLfloat m[16], GLfloat s[][4] );
 *
 * For each of the n points:
 *
 *   D(0) = S(0)*M(0,0) + S(3)*M(3,0)
 *   D(1) = S(1)*M(1,1) + S(3)*M(3,1)
 *   D(2) = S(2)              (copied as a raw dword)
 *   D(3) = S(3)              (copied as a raw dword)
 *
 * Arguments are read from the stack.  esi, edi and ebx are saved and
 * restored (ebx only when n != 0, the only case that uses it); eax,
 * ecx and edx are used as scratch.
 *
 * The alignment directive precedes the entry label so that the entry
 * point itself is aligned and no padding is executed on a call.
 */
.align 4
PUBLIC _asm_transform_points4_2d_no_rot
_asm_transform_points4_2d_no_rot:
	push esi
	push edi
	mov ecx, [esp + 12]	/* ecx = n */
	mov edi, [esp + 16]	/* edi = d */
	mov edx, [esp + 20]	/* edx = m */
	mov esi, [esp + 24]	/* esi = s */

	test ecx, ecx
	jz _asm_transform_points4_2d_no_rot_end	/* n == 0: ebx has not been pushed yet */
	push ebx

.align 4
_asm_transform_points4_2d_no_rot_loop:
	fld S(0)
	fmul M(0, 0)
	fld S(1)
	fmul M(1, 1)
	fld S(3)
	fmul M(3, 0)
	fld S(3)
	fmul M(3, 1)

	/*
	 * st(3) = S(0) * M(0, 0)
	 * st(2) = S(1) * M(1, 1)
	 * st(1) = S(3) * M(3, 0)
	 * st(0) = S(3) * M(3, 1)
	 */

	mov eax, S(2)
	mov ebx, S(3)
	lea esi, S(4)		/* advance s; every source load is already done */
	dec ecx			/* sets ZF for the jnz below; mov, x87 ops and lea leave it alone */
	mov D(2), eax
	mov D(3), ebx
	faddp st(2), st		/* S(1)*M(1,1) += S(3)*M(3,1) */
	faddp st(2), st		/* S(0)*M(0,0) += S(3)*M(3,0) */
	fstp D(1)
	fstp D(0)
	lea edi, D(4)		/* advance d to the next point */
	jnz _asm_transform_points4_2d_no_rot_loop

	pop ebx

_asm_transform_points4_2d_no_rot_end:
	pop edi
	pop esi
	ret
809
810
811
/*
 * void asm_transform_points4_3d( GLuint n, GLfloat d[][4], GLfloat m[16],
 *                                GLfloat s[][4] );
 *
 * For each of the n points, for j = 0..2:
 *
 *   D(j) = S(0)*M(0,j) + S(1)*M(1,j) + S(2)*M(2,j) + S(3)*M(3,j)
 *
 * and D(3) = S(3), carried through the x87 stack with fld/fstp.
 *
 * Arguments are read from the stack.  esi and edi are saved and
 * restored; ecx and edx are used as scratch.
 *
 * The alignment directive precedes the entry label so that the entry
 * point itself is aligned and no padding is executed on a call.
 *
 * In the stack-shuffle comments below, the numbers list (top of stack
 * first) which slot of the preceding diagram each entry came from.
 * The S(3) copy loaded at the top of the loop sits one slot below the
 * entries shown in every diagram and is not listed.
 */
.align 4
PUBLIC _asm_transform_points4_3d
_asm_transform_points4_3d:
	push esi
	push edi
	mov ecx, [esp + 12]	/* ecx = n */
	mov edi, [esp + 16]	/* edi = d */
	mov edx, [esp + 20]	/* edx = m */
	mov esi, [esp + 24]	/* esi = s */

	test ecx, ecx
	jz _asm_transform_points4_3d_end	/* n == 0: nothing to do */

.align 4
_asm_transform_points4_3d_loop:
	fld S(3)		/* stays at the bottom until the final fstp D(3) */

	fld S(0)
	fmul M(0, 0)
	fld S(0)
	fmul M(0, 1)
	fld S(0)
	fmul M(0, 2)

	fld S(1)
	fmul M(1, 0)
	fld S(1)
	fmul M(1, 1)
	fld S(1)
	fmul M(1, 2)

	/*
	 * st(5) = S(0) * M(0, 0)
	 * st(4) = S(0) * M(0, 1)
	 * st(3) = S(0) * M(0, 2)
	 * st(2) = S(1) * M(1, 0)
	 * st(1) = S(1) * M(1, 1)
	 * st(0) = S(1) * M(1, 2)
	 */

	fxch st(2)		/* 2 1 0 3 4 5 */
	faddp st(5), st		/* 1 0 3 4 5 */
	faddp st(3), st		/* 0 3 4 5 */
	faddp st(1), st		/* 3 4 5 */

	/*
	 * st(2) = S(0) * M(0, 0) + S(1) * M(1, 0)
	 * st(1) = S(0) * M(0, 1) + S(1) * M(1, 1)
	 * st(0) = S(0) * M(0, 2) + S(1) * M(1, 2)
	 */

	fld S(2)
	fmul M(2, 0)
	fld S(2)
	fmul M(2, 1)
	fld S(2)
	fmul M(2, 2)

	/*
	 * st(5) = S(0) * M(0, 0) + S(1) * M(1, 0)
	 * st(4) = S(0) * M(0, 1) + S(1) * M(1, 1)
	 * st(3) = S(0) * M(0, 2) + S(1) * M(1, 2)
	 * st(2) = S(2) * M(2, 0)
	 * st(1) = S(2) * M(2, 1)
	 * st(0) = S(2) * M(2, 2)
	 */

	fxch st(2)		/* 2 1 0 3 4 5 */
	faddp st(5), st		/* 1 0 3 4 5 */
	faddp st(3), st		/* 0 3 4 5 */
	faddp st(1), st		/* 3 4 5 */

	/*
	 * st(2) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0)
	 * st(1) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1)
	 * st(0) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2)
	 */

	fld S(3)
	fmul M(3, 0)
	fld S(3)
	fmul M(3, 1)
	fld S(3)
	fmul M(3, 2)

	/*
	 * st(5) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0)
	 * st(4) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1)
	 * st(3) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2)
	 * st(2) = S(3) * M(3, 0)
	 * st(1) = S(3) * M(3, 1)
	 * st(0) = S(3) * M(3, 2)
	 */

	fxch st(2)		/* 2 1 0 3 4 5 */
	faddp st(5), st		/* 1 0 3 4 5 */
	faddp st(3), st		/* 0 3 4 5 */

	lea esi, S(4)		/* advance s; every source load is already done */
	dec ecx			/* sets ZF for the jnz below; x87 ops and lea leave it alone */

	faddp st(1), st		/* 3 4 5 */

	/*
	 * st(2) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0) + S(3) * M(3, 0)
	 * st(1) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1) + S(3) * M(3, 1)
	 * st(0) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2) + S(3) * M(3, 2)
	 */

	fxch st(2)		/* 2 1 0 */
	fstp D(0)		/* 1 0 */
	fstp D(1)		/* 0 */
	fstp D(2)		/* only the S(3) copy is left */
	fstp D(3)		/* D(3) = S(3); stack is empty again */

	lea edi, D(4)		/* advance d to the next point */

	jnz _asm_transform_points4_3d_loop

_asm_transform_points4_3d_end:
	pop edi
	pop esi
	ret
939
/*
 * void asm_transform_points4_ortho( GLuint n, GLfloat d[][4],
 *                                   GLfloat m[16], GLfloat s[][4] );
 *
 * For each of the n points:
 *
 *   D(0) = S(0)*M(0,0) + S(3)*M(3,0)
 *   D(1) = S(1)*M(1,1) + S(3)*M(3,1)
 *   D(2) = S(2)*M(2,2) + S(3)*M(3,2)
 *   D(3) = S(3)              (copied as a raw dword)
 *
 * Arguments are read from the stack.  esi and edi are saved and
 * restored; eax, ecx and edx are used as scratch.
 *
 * The alignment directive precedes the entry label so that the entry
 * point itself is aligned and no padding is executed on a call.
 */
.align 4
PUBLIC _asm_transform_points4_ortho
_asm_transform_points4_ortho:
	push esi
	push edi
	mov ecx, [esp + 12]	/* ecx = n */
	mov edi, [esp + 16]	/* edi = d */
	mov edx, [esp + 20]	/* edx = m */
	mov esi, [esp + 24]	/* esi = s */

	test ecx, ecx
	jz _asm_transform_points4_ortho_end	/* n == 0: nothing to do */

.align 4
_asm_transform_points4_ortho_loop:
	fld S(0)
	fmul M(0, 0)
	fld S(1)
	fmul M(1, 1)
	fld S(2)
	fmul M(2, 2)

	fld S(3)
	fmul M(3, 0)
	fld S(3)
	fmul M(3, 1)
	fld S(3)
	fmul M(3, 2)

	/*
	 * st(5) = S(0) * M(0, 0)
	 * st(4) = S(1) * M(1, 1)
	 * st(3) = S(2) * M(2, 2)
	 * st(2) = S(3) * M(3, 0)
	 * st(1) = S(3) * M(3, 1)
	 * st(0) = S(3) * M(3, 2)
	 */

	mov eax, S(3)
	lea esi, S(4)		/* advance s; every source load is already done */
	dec ecx			/* sets ZF for the jnz below; mov, x87 ops and lea leave it alone */
	mov D(3), eax

	/* each pop shifts the stack by one, so the same st(3) target pairs
	   every S(3) term with the matching diagonal term */
	faddp st(3), st		/* S(2)*M(2,2) += S(3)*M(3,2) */
	faddp st(3), st		/* S(1)*M(1,1) += S(3)*M(3,1) */
	faddp st(3), st		/* S(0)*M(0,0) += S(3)*M(3,0) */

	fstp D(2)
	fstp D(1)
	fstp D(0)

	lea edi, D(4)		/* advance d to the next point */
	jnz _asm_transform_points4_ortho_loop

_asm_transform_points4_ortho_end:
	pop edi
	pop esi
	ret
993
/*
 * void asm_transform_points4_perspective( GLuint n, GLfloat d[][4],
 *                                         GLfloat m[16], GLfloat s[][4] );
 *
 * For each of the n points:
 *
 *   D(0) = S(0)*M(0,0) + S(2)*M(2,0)
 *   D(1) = S(1)*M(1,1) + S(2)*M(2,1)
 *   D(2) = S(2)*M(2,2) + S(3)*M(3,2)
 *   D(3) = S(2) with bit 31 inverted (the sign bit of an IEEE single)
 *
 * Arguments are read from the stack.  esi and edi are saved and
 * restored; eax, ecx and edx are used as scratch.
 *
 * The alignment directive precedes the entry label so that the entry
 * point itself is aligned and no padding is executed on a call.
 */
.align 4
PUBLIC _asm_transform_points4_perspective
_asm_transform_points4_perspective:
	push esi
	push edi
	mov ecx, [esp + 12]	/* ecx = n */
	mov edi, [esp + 16]	/* edi = d */
	mov edx, [esp + 20]	/* edx = m */
	mov esi, [esp + 24]	/* esi = s */

	test ecx, ecx
	jz _asm_transform_points4_perspective_end	/* n == 0: nothing to do */

.align 4
_asm_transform_points4_perspective_loop:
	fld S(0)
	fmul M(0, 0)
	fld S(1)
	fmul M(1, 1)
	fld S(2)
	fmul M(2, 2)

	fld S(2)
	fmul M(2, 0)
	fld S(2)
	fmul M(2, 1)
	fld S(3)
	fmul M(3, 2)

	/*
	 * st(5) = S(0) * M(0, 0)
	 * st(4) = S(1) * M(1, 1)
	 * st(3) = S(2) * M(2, 2)
	 * st(2) = S(2) * M(2, 0)
	 * st(1) = S(2) * M(2, 1)
	 * st(0) = S(3) * M(3, 2)
	 */

	mov eax, S(2)
	lea esi, S(4)		/* advance s; every source load is already done */
	xor eax, HEX(80000000)	/* invert bit 31 of the S(2) bit pattern */
	dec ecx			/* sets ZF for the jnz below; mov, x87 ops and lea leave it alone */

	/* each pop shifts the stack by one, so the same st(3) target pairs
	   every off-diagonal term with the matching diagonal term */
	faddp st(3), st		/* S(2)*M(2,2) += S(3)*M(3,2) */
	faddp st(3), st		/* S(1)*M(1,1) += S(2)*M(2,1) */
	faddp st(3), st		/* S(0)*M(0,0) += S(2)*M(2,0) */

	fstp D(2)
	fstp D(1)
	fstp D(0)

	mov D(3), eax
	lea edi, D(4)		/* advance d to the next point */
	jnz _asm_transform_points4_perspective_loop

_asm_transform_points4_perspective_end:
	pop edi
	pop esi
	ret
1048
1049
1050
/*
 * Lookup table for the clip test: 128 bytes, indexed by a 7-bit value
 * with one bit per condition on the point's four components.
 *
 * 	bit6 = S(3) < 0
 * 	bit5 = S(2) < 0
 * 	bit4 = abs(S(2)) > abs(S(3))
 * 	bit3 = S(1) < 0
 * 	bit2 = abs(S(1)) > abs(S(3))
 * 	bit1 = S(0) < 0
 * 	bit0 = abs(S(0)) > abs(S(3))
 *
 * Each entry is the OR of the CLIP_*_BIT flags for that combination.
 * The disabled "#if 0" branch keeps the macro form the values are
 * derived from; the active branch holds the same values written out.
 */

/* Vertex buffer clipping flags (from vb.h) */
#if 0

#define CLIP_RIGHT_BIT   0x01
#define CLIP_LEFT_BIT    0x02
#define CLIP_TOP_BIT     0x04
#define CLIP_BOTTOM_BIT  0x08
#define CLIP_NEAR_BIT    0x10
#define CLIP_FAR_BIT     0x20
#define CLIP_USER_BIT    0x40
#define CLIP_ALL_BITS    0x3f

#define MAGN_X(i) 	(~(((i) & 1) - 1))
#define SIGN_X(i) 	(~((((i) >> 1) & 1) - 1))
#define MAGN_Y(i) 	(~((((i) >> 2) & 1) - 1))
#define SIGN_Y(i) 	(~((((i) >> 3) & 1) - 1))
#define MAGN_Z(i) 	(~((((i) >> 4) & 1) - 1))
#define SIGN_Z(i) 	(~((((i) >> 5) & 1) - 1))
#define SIGN_W(i) 	(~((((i) >> 6) & 1) - 1))

#define CLIP_VALUE(i) 						\
	 (CLIP_RIGHT_BIT 					\
	  & ((~SIGN_X(i) & SIGN_W(i)) 				\
	     | (~SIGN_X(i) & ~SIGN_W(i) & MAGN_X(i)) 		\
	     | (SIGN_X(i) & SIGN_W(i) & ~MAGN_X(i)))) 		\
	 | (CLIP_LEFT_BIT 					\
	    & ((SIGN_X(i) & SIGN_W(i)) 				\
	       | (~SIGN_X(i) & SIGN_W(i) & ~MAGN_X(i)) 		\
	       | (SIGN_X(i) & ~SIGN_W(i) & MAGN_X(i)))) 	\
	 | (CLIP_TOP_BIT 					\
	    & ((~SIGN_Y(i) & SIGN_W(i)) 			\
	       | (~SIGN_Y(i) & ~SIGN_W(i) & MAGN_Y(i)) 		\
	       | (SIGN_Y(i) & SIGN_W(i) & ~MAGN_Y(i)))) 	\
	 | (CLIP_BOTTOM_BIT 					\
	    & ((SIGN_Y(i) & SIGN_W(i)) 				\
	       | (~SIGN_Y(i) & SIGN_W(i) & ~MAGN_Y(i)) 		\
	       | (SIGN_Y(i) & ~SIGN_W(i) & MAGN_Y(i)))) 	\
	 | (CLIP_FAR_BIT 					\
	    & ((~SIGN_Z(i) & SIGN_W(i)) 			\
	       | (~SIGN_Z(i) & ~SIGN_W(i) & MAGN_Z(i)) 		\
	       | (SIGN_Z(i) & SIGN_W(i) & ~MAGN_Z(i)))) 	\
	 | (CLIP_NEAR_BIT 					\
	    & ((SIGN_Z(i) & SIGN_W(i)) 				\
	       | (~SIGN_Z(i) & SIGN_W(i) & ~MAGN_Z(i)) 		\
	       | (SIGN_Z(i) & ~SIGN_W(i) & MAGN_Z(i))))

#define CLIP_VALUE8(i) \
	CLIP_VALUE(i + 0), CLIP_VALUE(i + 1), CLIP_VALUE(i + 2), CLIP_VALUE(i + 3), \
	CLIP_VALUE(i + 4), CLIP_VALUE(i + 5), CLIP_VALUE(i + 6), CLIP_VALUE(i + 7)

.rodata

clip_table:
	.byte CLIP_VALUE8(0x00)
	.byte CLIP_VALUE8(0x08)
	.byte CLIP_VALUE8(0x10)
	.byte CLIP_VALUE8(0x18)
	.byte CLIP_VALUE8(0x20)
	.byte CLIP_VALUE8(0x28)
	.byte CLIP_VALUE8(0x30)
	.byte CLIP_VALUE8(0x38)
	.byte CLIP_VALUE8(0x40)
	.byte CLIP_VALUE8(0x48)
	.byte CLIP_VALUE8(0x50)
	.byte CLIP_VALUE8(0x58)
	.byte CLIP_VALUE8(0x60)
	.byte CLIP_VALUE8(0x68)
	.byte CLIP_VALUE8(0x70)
	.byte CLIP_VALUE8(0x78)
#else

.const
ASSUME NOTHING

/* one row per eight consecutive indices; the first index is on the right */
clip_table:
	.byte HEX(00), HEX(01), HEX(00), HEX(02), HEX(04), HEX(05), HEX(04), HEX(06)	/* 0x00 */
	.byte HEX(00), HEX(01), HEX(00), HEX(02), HEX(08), HEX(09), HEX(08), HEX(0a)	/* 0x08 */
	.byte HEX(20), HEX(21), HEX(20), HEX(22), HEX(24), HEX(25), HEX(24), HEX(26)	/* 0x10 */
	.byte HEX(20), HEX(21), HEX(20), HEX(22), HEX(28), HEX(29), HEX(28), HEX(2a)	/* 0x18 */
	.byte HEX(00), HEX(01), HEX(00), HEX(02), HEX(04), HEX(05), HEX(04), HEX(06)	/* 0x20 */
	.byte HEX(00), HEX(01), HEX(00), HEX(02), HEX(08), HEX(09), HEX(08), HEX(0a)	/* 0x28 */
	.byte HEX(10), HEX(11), HEX(10), HEX(12), HEX(14), HEX(15), HEX(14), HEX(16)	/* 0x30 */
	.byte HEX(10), HEX(11), HEX(10), HEX(12), HEX(18), HEX(19), HEX(18), HEX(1a)	/* 0x38 */
	.byte HEX(3f), HEX(3d), HEX(3f), HEX(3e), HEX(37), HEX(35), HEX(37), HEX(36)	/* 0x40 */
	.byte HEX(3f), HEX(3d), HEX(3f), HEX(3e), HEX(3b), HEX(39), HEX(3b), HEX(3a)	/* 0x48 */
	.byte HEX(2f), HEX(2d), HEX(2f), HEX(2e), HEX(27), HEX(25), HEX(27), HEX(26)	/* 0x50 */
	.byte HEX(2f), HEX(2d), HEX(2f), HEX(2e), HEX(2b), HEX(29), HEX(2b), HEX(2a)	/* 0x58 */
	.byte HEX(3f), HEX(3d), HEX(3f), HEX(3e), HEX(37), HEX(35), HEX(37), HEX(36)	/* 0x60 */
	.byte HEX(3f), HEX(3d), HEX(3f), HEX(3e), HEX(3b), HEX(39), HEX(3b), HEX(3a)	/* 0x68 */
	.byte HEX(1f), HEX(1d), HEX(1f), HEX(1e), HEX(17), HEX(15), HEX(17), HEX(16)	/* 0x70 */
	.byte HEX(1f), HEX(1d), HEX(1f), HEX(1e), HEX(1b), HEX(19), HEX(1b), HEX(1a)	/* 0x78 */

#endif

/* back to the code section for the routines that follow */
.code
1158
/*
 * cliptest - file-local helper with a register-based interface
 *
 * inputs:
 * 	ecx = # points
 * 	esi = points (four dwords each)
 * 	edi = clipmask[] (one byte per point)
 *
 * inputs/outputs:
 * 	al = ormask
 * 	ah = andmask
 *
 * For every point a 7-bit index is assembled in edx, most significant
 * bit first, and looked up in clip_table.  The table byte is ORed into
 * the point's clipmask byte and into al, and ANDed into ah.
 *
 * ebx and ebp are saved and restored when ecx != 0 and left untouched
 * otherwise.  edx is clobbered; ecx, esi and edi are advanced past the
 * points that were processed.
 */

cliptest:
	test ecx, ecx
	jz cliptest_end		/* no points: return before anything is pushed */

	push ebp
	push ebx

.align 4
cliptest_loop:
	xor edx, edx		/* edx collects the table index */
	mov ebp, S(3)
	mov ebx, S(2)

	/*
	 * Doubling a dword moves its top bit into carry and leaves the
	 * remaining 31 bits shifted up by one.  "adc edx, edx" then shifts
	 * the carry into the low end of the index.  An unsigned compare of
	 * two doubled values orders them by those 31 bits.
	 */
	add ebp, ebp		/* ebp = abs(S(3))*2 ; carry = sign of S(3) */
	adc edx, edx

	add ebx, ebx		/* ebx = abs(S(2))*2 ; carry = sign of S(2) */
	adc edx, edx
	cmp ebp, ebx		/* carry = abs(S(2))*2 > abs(S(3))*2 */
	adc edx, edx

	mov ebx, S(1)
	add ebx, ebx		/* ebx = abs(S(1))*2 ; carry = sign of S(1) */
	adc edx, edx
	cmp ebp, ebx		/* carry = abs(S(1))*2 > abs(S(3))*2 */
	adc edx, edx

	mov ebx, S(0)
	add ebx, ebx		/* ebx = abs(S(0))*2 ; carry = sign of S(0) */
	adc edx, edx
	cmp ebp, ebx		/* carry = abs(S(0))*2 > abs(S(3))*2 */
	adc edx, edx

	lea esi, S(4)		/* advance to the next point */

	mov dl, byte ptr [clip_table + edx]	/* dl = flags for this point */

	or byte ptr [edi], dl	/* merge into this point's clipmask byte */
	or al, dl		/* ormask  |= flags */
	and ah, dl		/* andmask &= flags */

	inc edi			/* next clipmask byte */
	dec ecx

	jnz cliptest_loop

	pop ebx
	pop ebp
cliptest_end:
	ret
1231
/*
 * void asm_project_and_cliptest_general( GLuint n, GLfloat d[][4], GLfloat m[16],
 *                                        GLfloat s[][4], GLubyte clipmask[],
 *                                        GLubyte *ormask, GLubyte *andmask );
 *
 * Calls _asm_transform_points4_general( n, d, m, s ), then runs
 * cliptest over the n points in d.  cliptest is seeded with *ormask
 * and *andmask, updates clipmask[] in place, and its resulting masks
 * are written back through the two pointers.
 *
 * Arguments are read from the stack.  esi and edi are saved and
 * restored; eax, ecx and edx are used as scratch.
 *
 * The alignment directive precedes the entry label so that the entry
 * point itself is aligned and no padding is executed on a call.
 */
.align 4
PUBLIC _asm_project_and_cliptest_general
_asm_project_and_cliptest_general:
	push esi
	push edi
	mov ecx, [esp + 12]	/* ecx = n */
	mov edi, [esp + 16]	/* edi = d */
	mov edx, [esp + 20]	/* edx = m */
	mov esi, [esp + 24]	/* esi = s */

	/* stack-based call: push the arguments last to first, pop them after */
	push esi
	push edx
	push edi
	push ecx
	call _asm_transform_points4_general
	add esp, DEC(16)

	/* seed cliptest's al/ah with the caller's current masks */
	mov edi, [esp + 32]	/* ormask */
	mov esi, [esp + 36]	/* andmask */
	mov al, [edi]
	mov ah, [esi]

	/* the registers were reused above, so reload from the stack */
	mov ecx, [esp + 12]	/* ecx = n */
	mov edi, [esp + 28]	/* edi = clipmask */
	mov esi, [esp + 16]	/* esi = d */

	call cliptest

	/* write the updated masks back */
	mov edi, [esp + 32]	/* ormask */
	mov esi, [esp + 36]	/* andmask */
	mov [edi], al
	mov [esi], ah

	pop edi
	pop esi
	ret
1273
1274
/*
 * void asm_project_and_cliptest_identity( GLuint n, GLfloat d[][4],
 *                                         GLfloat s[][4], GLubyte clipmask[],
 *                                         GLubyte *ormask, GLubyte *andmask );
 *
 * Calls _asm_transform_points4_identity( n, d, s ), then runs cliptest
 * over the n points in d.  cliptest is seeded with *ormask and
 * *andmask, updates clipmask[] in place, and its resulting masks are
 * written back through the two pointers.
 *
 * There is no matrix argument here, so clipmask, ormask and andmask
 * sit one stack slot lower than in the sibling wrappers.
 *
 * Arguments are read from the stack.  esi and edi are saved and
 * restored; eax, ecx and edx are used as scratch.
 *
 * The alignment directive precedes the entry label so that the entry
 * point itself is aligned and no padding is executed on a call.
 */
.align 4
PUBLIC _asm_project_and_cliptest_identity
_asm_project_and_cliptest_identity:
	push esi
	push edi
	mov ecx, [esp + 12]	/* ecx = n */
	mov edi, [esp + 16]	/* edi = d */
	mov esi, [esp + 20]	/* esi = s */

	/* stack-based call: push the arguments last to first, pop them after */
	push esi
	push edi
	push ecx

	call _asm_transform_points4_identity

	add esp, DEC(12)

	/* seed cliptest's al/ah with the caller's current masks */
	mov edi, [esp + 28]	/* ormask */
	mov esi, [esp + 32]	/* andmask */
	mov al, [edi]
	mov ah, [esi]

	/* the registers were reused above, so reload from the stack */
	mov ecx, [esp + 12]	/* ecx = n */
	mov edi, [esp + 24]	/* edi = clipmask */
	mov esi, [esp + 16]	/* esi = d */

	call cliptest

	/* write the updated masks back */
	mov edi, [esp + 28]	/* ormask */
	mov esi, [esp + 32]	/* andmask */
	mov [edi], al
	mov [esi], ah

	pop edi
	pop esi
	ret
1316
/*
 * void asm_project_and_cliptest_ortho( GLuint n, GLfloat d[][4], GLfloat m[16],
 *                                      GLfloat s[][4], GLubyte clipmask[],
 *                                      GLubyte *ormask, GLubyte *andmask );
 *
 * Calls _asm_transform_points4_ortho( n, d, m, s ), then runs cliptest
 * over the n points in d.  cliptest is seeded with *ormask and
 * *andmask, updates clipmask[] in place, and its resulting masks are
 * written back through the two pointers.
 *
 * Arguments are read from the stack.  esi and edi are saved and
 * restored; eax, ecx and edx are used as scratch.
 *
 * The alignment directive precedes the entry label so that the entry
 * point itself is aligned and no padding is executed on a call.
 */
.align 4
PUBLIC _asm_project_and_cliptest_ortho
_asm_project_and_cliptest_ortho:
	push esi
	push edi
	mov ecx, [esp + 12]	/* ecx = n */
	mov edi, [esp + 16]	/* edi = d */
	mov edx, [esp + 20]	/* edx = m */
	mov esi, [esp + 24]	/* esi = s */

	/* stack-based call: push the arguments last to first, pop them after */
	push esi
	push edx
	push edi
	push ecx

	call _asm_transform_points4_ortho

	add esp, DEC(16)

	/* seed cliptest's al/ah with the caller's current masks */
	mov edi, [esp + 32]	/* ormask */
	mov esi, [esp + 36]	/* andmask */
	mov al, [edi]
	mov ah, [esi]

	/* the registers were reused above, so reload from the stack */
	mov ecx, [esp + 12]	/* ecx = n */
	mov edi, [esp + 28]	/* edi = clipmask */
	mov esi, [esp + 16]	/* esi = d */

	call cliptest

	/* write the updated masks back */
	mov edi, [esp + 32]	/* ormask */
	mov esi, [esp + 36]	/* andmask */
	mov [edi], al
	mov [esi], ah

	pop edi
	pop esi
	ret
1360
/*
 * void asm_project_and_cliptest_perspective( GLuint n, GLfloat d[][4], GLfloat m[16],
 *                                            GLfloat s[][4], GLubyte clipmask[],
 *                                            GLubyte *ormask, GLubyte *andmask );
 *
 * Calls asm_transform_points4_perspective( n, d, m, s ) and then runs
 * the file-local cliptest routine over d.  cliptest is entered with
 *     ecx = n, esi = d, edi = clipmask, al = *ormask, ah = *andmask
 * and the al / ah values it leaves behind are stored back through
 * ormask / andmask.
 *
 * The caller removes the arguments.  Stack layout after the two
 * register saves below:
 *     [esp +  0] saved edi     [esp + 20] m
 *     [esp +  4] saved esi     [esp + 24] s
 *     [esp +  8] return addr   [esp + 28] clipmask
 *     [esp + 12] n             [esp + 32] ormask
 *     [esp + 16] d             [esp + 36] andmask
 *
 * esi and edi are preserved; eax, ecx and edx are overwritten here.
 */
/* Align the entry point itself; with the directive after the label the
 * padding would be executed on every call. */
.align 4
PUBLIC _asm_project_and_cliptest_perspective
_asm_project_and_cliptest_perspective:
	push esi
	push edi
	mov ecx, [esp + 12] 	/* ecx = n */
	mov edi, [esp + 16] 	/* edi = d */
	mov edx, [esp + 20] 	/* edx = m */
	mov esi, [esp + 24] 	/* esi = s */

	/* asm_transform_points4_perspective( n, d, m, s ): pushed right to left */
	push esi
	push edx
	push edi
	push ecx

	call _asm_transform_points4_perspective

	add esp, DEC(16)		/* drop the four arguments again */

	/* seed al / ah with the masks accumulated so far */
	mov edi, [esp + 32] 	/* ormask */
	mov esi, [esp + 36] 	/* andmask */
	mov al, [edi]
	mov ah, [esi]

	/* registers were free for the callee to change, so reload them */
	mov ecx, [esp + 12] 	/* ecx = n */
	mov edi, [esp + 28] 	/* edi = clipmask */
	mov esi, [esp + 16] 	/* esi = d */

	call cliptest

	/* write the updated masks back to the caller's variables */
	mov edi, [esp + 32] 	/* ormask */
	mov esi, [esp + 36] 	/* andmask */
	mov [edi], al
	mov [esi], ah

	pop edi
	pop esi
	ret
1404
1405
/*
 * unsigned int inverse_nofp( float f );
 *
 * Reciprocal of a single-precision float computed with integer
 * instructions only (no FPU).  The argument is read as a raw 32-bit
 * pattern from [esp + 4] and the raw bit pattern of the result comes
 * back in eax, so from C its return type should be an integer type that
 * is then reinterpreted as float through a pointer or union.
 *
 * Method: divide 2^48 by the 24-bit significand, halve the quotient
 * with rounding, subtract the exponent field from 0x7E800000, and
 * renormalise if the quotient reached bit 24 (e.g. f = 1.0 gives a
 * quotient of 2^25).  Exponent fields of 0 and 255 get no special
 * treatment.
 *
 * Overwrites eax, ecx and edx.
 */
.align 4
inverse_nofp:

	/* ecx = significand: 23 fraction bits plus the implied leading 1 */
	mov ecx, [esp + 4]
	and ecx, HEX(7fffff)
	or ecx, HEX(800000)

	/* eax = (0x10000:0x00000000) / significand */
	mov edx, HEX(10000)
	xor eax, eax
	div ecx

	/* halve the quotient; the bit shifted out is added back to round */
	shr eax, DEC(1)
	adc eax, DEC(0)

	/* edx = 0x7E800000 - (exponent field of f): negated, decremented exponent */
	mov ecx, [esp + 4]
	mov edx, HEX(7E800000)
	and ecx, HEX(7f800000)
	sub edx, ecx

	/* quotient reached bit 24: shift it down and bump the exponent */
	test eax, HEX(1000000)
	jz inverse_nofp_assemble

	shr eax, DEC(1)
	add edx, HEX(800000)

	/* result = sign of f | new exponent | low 23 bits of the quotient */
inverse_nofp_assemble:
	mov ecx, [esp + 4]
	and ecx, HEX(80000000)
	and eax, HEX(7fffff)
	or eax, edx
	or eax, ecx

	ret
1459
1460
/*
 * void gl_xform_normals_3fv( GLuint n, GLfloat d[][3], GLfloat m[16],
 *                             GLfloat s[][3], GLboolean normalize );
 *
 * For each of the n source normals writes
 *     d[i][r] = s[i][0]*m[4r+0] + s[i][1]*m[4r+1] + s[i][2]*m[4r+2]
 * for r = 0..2.  If normalize is non-zero every d[i] is afterwards
 * scaled by 1/length, with 1/length obtained from inverse_nofp.
 *
 * NOTE(review): this header used to say d[][4] / s[][4], but the code
 * advances both pointers by three floats (12 bytes) per normal, so the
 * arrays are documented as [][3] here - confirm against the C prototype.
 *
 * Nothing is done when n is 0.  The caller removes the arguments.
 * esi and edi are preserved; eax, ecx and edx are not.  Every value
 * pushed on the x87 stack is popped again before returning.
 *
 * Stack layout after the two register saves below:
 *     [esp +  0] saved edi     [esp + 16] d
 *     [esp +  4] saved esi     [esp + 20] m
 *     [esp +  8] return addr   [esp + 24] s
 *     [esp + 12] n             [esp + 28] normalize
 */
/* Align the entry point itself; with the directive after the label the
 * padding would be executed on every call. */
.align 4
PUBLIC _gl_xform_normals_3fv
_gl_xform_normals_3fv:
	push esi
	push edi
	mov ecx, [esp + 12] 	/* ecx = n */
	mov edi, [esp + 16] 	/* edi = d */
	mov edx, [esp + 20] 	/* edx = m */
	mov esi, [esp + 24] 	/* esi = s */

	test ecx, ecx
	jz _gl_xform_normals_3fv_end

.align 4
_gl_xform_normals_3fv_loop:
	fld S(0)
	fmul M(0, 0)
	fld S(0)
	fmul M(1, 0)
	fld S(0)
	fmul M(2, 0)

	fld S(1)
	fmul M(0, 1)
	fld S(1)
	fmul M(1, 1)
	fld S(1)
	fmul M(2, 1)

	/*
	 * st(5) = S(0) * M(0, 0)
	 * st(4) = S(0) * M(1, 0)
	 * st(3) = S(0) * M(2, 0)
	 * st(2) = S(1) * M(0, 1)
	 * st(1) = S(1) * M(1, 1)
	 * st(0) = S(1) * M(2, 1)
	 */

	/* the digits below list the original st(i) slots, top of stack first */
	fxch st(2)			/* 2 1 0 3 4 5 */
	faddp st(5), st		/* 1 0 3 4 5 */
	faddp st(3), st		/* 0 3 4 5 */
	faddp st(1), st		/* 3 4 5 */

	/*
	 * st(2) = S(0) * M(0, 0) + S(1) * M(0, 1)
	 * st(1) = S(0) * M(1, 0) + S(1) * M(1, 1)
	 * st(0) = S(0) * M(2, 0) + S(1) * M(2, 1)
	 */

	fld S(2)
	fmul M(0, 2)
	fld S(2)
	fmul M(1, 2)
	fld S(2)
	fmul M(2, 2)

	/*
	 * st(5) = S(0) * M(0, 0) + S(1) * M(0, 1)
	 * st(4) = S(0) * M(1, 0) + S(1) * M(1, 1)
	 * st(3) = S(0) * M(2, 0) + S(1) * M(2, 1)
	 * st(2) = S(2) * M(0, 2)
	 * st(1) = S(2) * M(1, 2)
	 * st(0) = S(2) * M(2, 2)
	 */

	fxch st(2)			/* 2 1 0 3 4 5 */
	faddp st(5), st		/* 1 0 3 4 5 */
	faddp st(3), st		/* 0 3 4 5 */
	faddp st(1), st		/* 3 4 5 */

	/*
	 * st(2) = S(0) * M(0, 0) + S(1) * M(0, 1) + S(2) * M(0, 2)
	 * st(1) = S(0) * M(1, 0) + S(1) * M(1, 1) + S(2) * M(1, 2)
	 * st(0) = S(0) * M(2, 0) + S(1) * M(2, 1) + S(2) * M(2, 2)
	 */

	fxch st(2) 	/* 2 1 0 */
	fstp D(0) 	/* 1 0 */
	fstp D(1) 	/* 0 */
	fstp D(2) 	/* */

	/* step to the next normal: three floats in both arrays */
	lea esi, S(3)

	dec ecx
	lea edi, D(3)		/* lea leaves the flags from dec intact for jnz */

	jnz _gl_xform_normals_3fv_loop

	/*
	 * Skip normalize if it isn't needed.
	 * GLboolean is a byte-sized type, so only the low byte of the
	 * argument slot is tested; the upper 24 bits of the slot are not
	 * relied upon.
	 */
	cmp byte ptr [esp + 28], DEC(0)
	jz _gl_xform_normals_3fv_end

	/* Normalize required */

	mov esi, [esp + 12]		/* esi = n */
	mov edi, [esp + 16]		/* edi = d */

	sub esp, DEC(4)	/* temp var: length going in, 1.0 / len coming out */

	/*
	 * [esp] = length of first normal
	 */
	fld D(0)
	fmul D(0)
	fld D(1)
	fmul D(1)
	fld D(2)
	fmul D(2)
	fxch st(2)
	faddp st(1), st
	faddp st(1), st
	fsqrt
	fstp dword ptr [esp]

	/* enter at the loop test: the loop body runs n - 1 times and the
	 * last normal is completed after the loop */
	jmp _gl_xform_normals_3fv_loop2_end

.align 4
_gl_xform_normals_3fv_loop2:
	/* st(0) = length of next normal (the one at edi + 12) */
	fld D(3)
	fmul D(3)
	fld D(4)
	fmul D(4)
	fld D(5)
	fmul D(5)
	fxch st(2)
	faddp st(1), st
	faddp st(1), st
	fsqrt

	/*
	 * inverse the length of the current normal, which is
	 * already at [esp]; once the call has pushed its return
	 * address that slot is the [esp + 4] operand inverse_nofp
	 * reads.  This should overlap the prev fsqrt nicely.
	 */
	call inverse_nofp
	mov [esp], eax

	/* multiply normal by 1/len */
	fld D(0)
	fmul dword ptr [esp]
	fld D(1)
	fmul dword ptr [esp]
	fld D(2)
	fmul dword ptr [esp]
	/* stack is now: D(2)', D(1)', D(0)', length of next normal */
	fxch st(3)
	fstp dword ptr [esp] 	/* store length of next normal */
	fstp D(1)
	fstp D(0)
	fstp D(2)
	lea edi, D(3)

_gl_xform_normals_3fv_loop2_end:
	dec esi
	jnz _gl_xform_normals_3fv_loop2

	/* finish up the last normal: its length is already at [esp] */
	call inverse_nofp
	mov [esp], eax
	fld D(0)
	fmul dword ptr [esp]
	fld D(1)
	fmul dword ptr [esp]
	fld D(2)
	fmul dword ptr [esp]
	fxch st(2)
	fstp D(0)
	fstp D(1)
	fstp D(2)

	add esp, DEC(4)		/* release the temp var */

_gl_xform_normals_3fv_end:
	pop edi
	pop esi
	ret
1643
1644END
1645