//
// d_draw16.s
// x86 assembly-language horizontal 8-bpp span-drawing code, with 16-pixel
// subdivision.
//

#include "qasm.h"
#include "d_ifacea.h"

#if	id386

//----------------------------------------------------------------------
// 8-bpp horizontal span drawing code for polygons, with no transparency and
// 16-pixel subdivision.
//
// Assumes there is at least one span in pspans, and that every span
// contains at least one pixel
//----------------------------------------------------------------------
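
//
// Rough equivalent in C-like pseudocode (a sketch of the logic only; the
// names match the globals used below):
//
//	for (each span in pspans)
//	{
//		compute sdivz (s/z), tdivz (t/z), zi (1/z) at the span's left edge;
//		while (count > 0)
//		{
//			spancount = min(count, 16);
//			// one (expensive) divide per up-to-16-pixel segment; s and t
//			// are stepped linearly between the segment's two endpoints
//			sdivz += sdivz16stepu;  tdivz += tdivz16stepu;  zi += zi16stepu;
//			    // (or the spancountminus1-scaled steps for the last segment)
//			z     = 65536.0 / zi;
//			snext = (int)(sdivz * z) + sadjust, clamped against bbextents;
//			tnext = (int)(tdivz * z) + tadjust, clamped against bbextentt;
//			draw spancount texels, stepping s,t linearly to (snext, tnext);
//			s = snext;  t = tnext;
//			count -= spancount;
//		}
//	}
//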

	.data

	.text

// out-of-line, rarely-needed clamping code

LClampHigh0:
	movl	C(bbextents),%esi
	jmp		LClampReentry0
LClampHighOrLow0:
	jg		LClampHigh0
	xorl	%esi,%esi
	jmp		LClampReentry0

LClampHigh1:
	movl	C(bbextentt),%edx
	jmp		LClampReentry1
LClampHighOrLow1:
	jg		LClampHigh1
	xorl	%edx,%edx
	jmp		LClampReentry1

LClampLow2:
	movl	$4096,%ebp
	jmp		LClampReentry2
LClampHigh2:
	movl	C(bbextents),%ebp
	jmp		LClampReentry2

LClampLow3:
	movl	$4096,%ecx
	jmp		LClampReentry3
LClampHigh3:
	movl	C(bbextentt),%ecx
	jmp		LClampReentry3

LClampLow4:
	movl	$4096,%eax
	jmp		LClampReentry4
LClampHigh4:
	movl	C(bbextents),%eax
	jmp		LClampReentry4

LClampLow5:
	movl	$4096,%ebx
	jmp		LClampReentry5
LClampHigh5:
	movl	C(bbextentt),%ebx
	jmp		LClampReentry5

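// Stack offset of the pspans argument: 4 bytes of return address plus the
// 16 bytes of registers pushed at the top of D_DrawSpans16.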
#define pspans	4+16

	.align 4
.globl C(D_DrawSpans16)
C(D_DrawSpans16):
	pushl	%ebp				// preserve caller's stack frame
	pushl	%edi
	pushl	%esi				// preserve register variables
	pushl	%ebx

//
// set up scaled-by-16 steps, for 16-long segments; also set up cacheblock
// and span list pointers
//
// TODO: any overlap from rearranging?
	flds	C(d_sdivzstepu)
	fmuls	fp_16
	movl	C(cacheblock),%edx
	flds	C(d_tdivzstepu)
	fmuls	fp_16
	movl	pspans(%esp),%ebx	// point to the first span descriptor
	flds	C(d_zistepu)
	fmuls	fp_16
	movl	%edx,pbase			// pbase = cacheblock
	fstps	zi16stepu
	fstps	tdivz16stepu
	fstps	sdivz16stepu

LSpanLoop:
//
// set up the initial s/z, t/z, and 1/z on the FP stack, and generate the
// initial s and t values
//
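// In short:
//	sdivz = d_sdivzorigin + dv*d_sdivzstepv + du*d_sdivzstepu
//	tdivz = d_tdivzorigin + dv*d_tdivzstepv + du*d_tdivzstepu
//	zi    = d_ziorigin    + dv*d_zistepv    + du*d_zistepu
// with du = pspan->u and dv = pspan->v; the FP code below just interleaves
// the three sums on the x87 stack.
//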
// FIXME: pipeline FILD?
	fildl	espan_t_v(%ebx)
	fildl	espan_t_u(%ebx)

	fld		%st(1)			// dv | du | dv
	fmuls	C(d_sdivzstepv)	// dv*d_sdivzstepv | du | dv
	fld		%st(1)			// du | dv*d_sdivzstepv | du | dv
	fmuls	C(d_sdivzstepu)	// du*d_sdivzstepu | dv*d_sdivzstepv | du | dv
	fld		%st(2)			// du | du*d_sdivzstepu | dv*d_sdivzstepv | du | dv
	fmuls	C(d_tdivzstepu)	// du*d_tdivzstepu | du*d_sdivzstepu |
							//  dv*d_sdivzstepv | du | dv
	fxch	%st(1)			// du*d_sdivzstepu | du*d_tdivzstepu |
							//  dv*d_sdivzstepv | du | dv
	faddp	%st(0),%st(2)	// du*d_tdivzstepu |
							//  du*d_sdivzstepu + dv*d_sdivzstepv | du | dv
	fxch	%st(1)			// du*d_sdivzstepu + dv*d_sdivzstepv |
							//  du*d_tdivzstepu | du | dv
	fld		%st(3)			// dv | du*d_sdivzstepu + dv*d_sdivzstepv |
							//  du*d_tdivzstepu | du | dv
	fmuls	C(d_tdivzstepv)	// dv*d_tdivzstepv |
							//  du*d_sdivzstepu + dv*d_sdivzstepv |
							//  du*d_tdivzstepu | du | dv
	fxch	%st(1)			// du*d_sdivzstepu + dv*d_sdivzstepv |
							//  dv*d_tdivzstepv | du*d_tdivzstepu | du | dv
	fadds	C(d_sdivzorigin)	// sdivz = d_sdivzorigin + dv*d_sdivzstepv +
							//  du*d_sdivzstepu; stays in %st(2) at end
	fxch	%st(4)			// dv | dv*d_tdivzstepv | du*d_tdivzstepu | du |
							//  s/z
	fmuls	C(d_zistepv)		// dv*d_zistepv | dv*d_tdivzstepv |
							//  du*d_tdivzstepu | du | s/z
	fxch	%st(1)			// dv*d_tdivzstepv |  dv*d_zistepv |
							//  du*d_tdivzstepu | du | s/z
	faddp	%st(0),%st(2)	// dv*d_zistepv |
							//  dv*d_tdivzstepv + du*d_tdivzstepu | du | s/z
	fxch	%st(2)			// du | dv*d_tdivzstepv + du*d_tdivzstepu |
							//  dv*d_zistepv | s/z
	fmuls	C(d_zistepu)		// du*d_zistepu |
							//  dv*d_tdivzstepv + du*d_tdivzstepu |
							//  dv*d_zistepv | s/z
	fxch	%st(1)			// dv*d_tdivzstepv + du*d_tdivzstepu |
							//  du*d_zistepu | dv*d_zistepv | s/z
	fadds	C(d_tdivzorigin)	// tdivz = d_tdivzorigin + dv*d_tdivzstepv +
							//  du*d_tdivzstepu; stays in %st(1) at end
	fxch	%st(2)			// dv*d_zistepv | du*d_zistepu | t/z | s/z
	faddp	%st(0),%st(1)	// dv*d_zistepv + du*d_zistepu | t/z | s/z

	flds	fp_64k			// fp_64k | dv*d_zistepv + du*d_zistepu | t/z | s/z
	fxch	%st(1)			// dv*d_zistepv + du*d_zistepu | fp_64k | t/z | s/z
	fadds	C(d_ziorigin)		// zi = d_ziorigin + dv*d_zistepv +
							//  du*d_zistepu; stays in %st(0) at end
							// 1/z | fp_64k | t/z | s/z
//
// calculate and clamp s & t
//
	fdivr	%st(0),%st(1)	// 1/z | z*64k | t/z | s/z

//
// point %edi to the first pixel in the span
//
	movl	C(d_viewbuffer),%ecx
	movl	espan_t_v(%ebx),%eax
	movl	%ebx,pspantemp	// preserve spans pointer

	movl	C(tadjust),%edx
	movl	C(sadjust),%esi
	movl	C(d_scantable)(,%eax,4),%edi	// v * screenwidth
	addl	%ecx,%edi
	movl	espan_t_u(%ebx),%ecx
	addl	%ecx,%edi				// pdest = &pdestspan[scans->u];
	movl	espan_t_count(%ebx),%ecx

//
// now start the FDIV for the end of the span
//
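// The FDIV started below is deliberately left "in flight": on the Pentium it
// runs in the FPU for dozens of cycles while the integer instructions that
// follow (clamping, texture addressing, pixel stores) execute, so most of its
// latency is hidden.  That is what the "overlap" comments further down mean.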
	cmpl	$16,%ecx
	ja		LSetupNotLast1

	decl	%ecx
	jz		LCleanup1		// if only one pixel, no need to start an FDIV
	movl	%ecx,spancountminus1

// finish up the s and t calcs
	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z
	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z
	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z
	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z
	fxch	%st(1)			// s | t | 1/z | t/z | s/z
	fistpl	s				// t | 1/z | t/z | s/z
	fistpl	t				// 1/z | t/z | s/z

	fildl	spancountminus1

	flds	C(d_tdivzstepu)	// C(d_tdivzstepu) | spancountminus1
	flds	C(d_zistepu)		// C(d_zistepu) | C(d_tdivzstepu) | spancountminus1
	fmul	%st(2),%st(0)	// C(d_zistepu)*scm1 | C(d_tdivzstepu) | scm1
	fxch	%st(1)			// C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1
	fmul	%st(2),%st(0)	// C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1
	fxch	%st(2)			// scm1 | C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1
	fmuls	C(d_sdivzstepu)	// C(d_sdivzstepu)*scm1 | C(d_zistepu)*scm1 |
							//  C(d_tdivzstepu)*scm1
	fxch	%st(1)			// C(d_zistepu)*scm1 | C(d_sdivzstepu)*scm1 |
							//  C(d_tdivzstepu)*scm1
	faddp	%st(0),%st(3)	// C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1
	fxch	%st(1)			// C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1
	faddp	%st(0),%st(3)	// C(d_sdivzstepu)*scm1
	faddp	%st(0),%st(3)

	flds	fp_64k
	fdiv	%st(1),%st(0)	// this is what we've gone to all this trouble to
							//  overlap
	jmp		LFDIVInFlight1

LCleanup1:
// finish up the s and t calcs
	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z
	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z
	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z
	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z
	fxch	%st(1)			// s | t | 1/z | t/z | s/z
	fistpl	s				// t | 1/z | t/z | s/z
	fistpl	t				// 1/z | t/z | s/z
	jmp		LFDIVInFlight1

	.align	4
LSetupNotLast1:
// finish up the s and t calcs
	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z
	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z
	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z
	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z
	fxch	%st(1)			// s | t | 1/z | t/z | s/z
	fistpl	s				// t | 1/z | t/z | s/z
	fistpl	t				// 1/z | t/z | s/z

	fadds	zi16stepu
	fxch	%st(2)
	fadds	sdivz16stepu
	fxch	%st(2)
	flds	tdivz16stepu
	faddp	%st(0),%st(2)
	flds	fp_64k
	fdiv	%st(1),%st(0)	// z = 1/1/z
							// this is what we've gone to all this trouble to
							//  overlap
LFDIVInFlight1:

	addl	s,%esi
	addl	t,%edx
	movl	C(bbextents),%ebx
	movl	C(bbextentt),%ebp
	cmpl	%ebx,%esi
	ja		LClampHighOrLow0
LClampReentry0:
	movl	%esi,s
	movl	pbase,%ebx
	shll	$16,%esi
	cmpl	%ebp,%edx
	movl	%esi,sfracf
	ja		LClampHighOrLow1
LClampReentry1:
	movl	%edx,t
	movl	s,%esi					// sfrac = scans->sfrac;
	shll	$16,%edx
	movl	t,%eax					// tfrac = scans->tfrac;
	sarl	$16,%esi
	movl	%edx,tfracf

//
// calculate the texture starting address
//
	sarl	$16,%eax
	movl	C(cachewidth),%edx
	imull	%edx,%eax				// (tfrac >> 16) * cachewidth
	addl	%ebx,%esi
	addl	%eax,%esi				// psource = pbase + (sfrac >> 16) +
									//           ((tfrac >> 16) * cachewidth);
//
// determine whether last span or not
//
	cmpl	$16,%ecx
	jna		LLastSegment

//
// not the last segment; do full 16-wide segment
//
LNotLastSegment:

//
// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to
// get there
//

// pick up after the FDIV that was left in flight previously

	fld		%st(0)			// duplicate it
	fmul	%st(4),%st(0)	// s = s/z * z
	fxch	%st(1)
	fmul	%st(3),%st(0)	// t = t/z * z
	fxch	%st(1)
	fistpl	snext
	fistpl	tnext
	movl	snext,%eax
	movl	tnext,%edx

	movb	(%esi),%bl	// get first source texel
	subl	$16,%ecx		// count off this segment's pixels
	movl	C(sadjust),%ebp
	movl	%ecx,counttemp	// remember count of remaining pixels

	movl	C(tadjust),%ecx
	movb	%bl,(%edi)	// store first dest pixel

	addl	%eax,%ebp
	addl	%edx,%ecx

	movl	C(bbextents),%eax
	movl	C(bbextentt),%edx

	cmpl	$4096,%ebp
	jl		LClampLow2
	cmpl	%eax,%ebp
	ja		LClampHigh2
LClampReentry2:

	cmpl	$4096,%ecx
	jl		LClampLow3
	cmpl	%edx,%ecx
	ja		LClampHigh3
LClampReentry3:

	movl	%ebp,snext
	movl	%ecx,tnext

	subl	s,%ebp
	subl	t,%ecx

//
// set up advancetable
//
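// advancetable[] holds the two possible source-pointer advances for one pixel
// step: advancetable+4 is the "base" advance ((tstep >> 16)*cachewidth +
// (sstep >> 16)) and advancetable+0 is the same plus one extra row of
// cachewidth.  Each pixel below then does, in effect:
//
//	tfracf += tstepfrac;		// sbbl turns the carry into %ecx = -1 or 0
//	sfracf += sstepfrac;		// carry from this add feeds the adcl
//	psource += advancetable[tfrac carried ? 0 : 1] + (carry out of sfracf);
//
// so the whole texel step runs on adds, sbb/adc and no branches.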
	movl	%ecx,%eax
	movl	%ebp,%edx
	sarl	$20,%eax			// tstep >>= 16;
	jz		LZero
	sarl	$20,%edx			// sstep >>= 16;
	movl	C(cachewidth),%ebx
	imull	%ebx,%eax
	jmp		LSetUp1

LZero:
	sarl	$20,%edx			// sstep >>= 16;
	movl	C(cachewidth),%ebx

LSetUp1:

	addl	%edx,%eax			// add in sstep
								// (tstep >> 16) * cachewidth + (sstep >> 16);
	movl	tfracf,%edx
	movl	%eax,advancetable+4	// advance base in t
	addl	%ebx,%eax			// ((tstep >> 16) + 1) * cachewidth +
								//  (sstep >> 16);
	shll	$12,%ebp			// left-justify sstep fractional part
	movl	sfracf,%ebx
	shll	$12,%ecx			// left-justify tstep fractional part
	movl	%eax,advancetable	// advance extra in t

	movl	%ecx,tstep
	addl	%ecx,%edx			// advance tfrac fractional part by tstep frac

	sbbl	%ecx,%ecx			// turn tstep carry into -1 (0 if none)
	addl	%ebp,%ebx			// advance sfrac fractional part by sstep frac
	adcl	advancetable+4(,%ecx,4),%esi	// point to next source texel

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	(%esi),%al
	addl	%ebp,%ebx
	movb	%al,1(%edi)
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,2(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,3(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,4(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,5(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,6(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,7(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi


//
// start FDIV for end of next segment in flight, so it can overlap
//
	movl	counttemp,%ecx
	cmpl	$16,%ecx			// more than one segment after this?
	ja		LSetupNotLast2	// yes

	decl	%ecx
	jz		LFDIVInFlight2	// if only one pixel, no need to start an FDIV
	movl	%ecx,spancountminus1
	fildl	spancountminus1

	flds	C(d_zistepu)		// C(d_zistepu) | spancountminus1
	fmul	%st(1),%st(0)	// C(d_zistepu)*scm1 | scm1
	flds	C(d_tdivzstepu)	// C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1
	fmul	%st(2),%st(0)	// C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1
	fxch	%st(1)			// C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1 | scm1
	faddp	%st(0),%st(3)	// C(d_tdivzstepu)*scm1 | scm1
	fxch	%st(1)			// scm1 | C(d_tdivzstepu)*scm1
	fmuls	C(d_sdivzstepu)	// C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1
	fxch	%st(1)			// C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1
	faddp	%st(0),%st(3)	// C(d_sdivzstepu)*scm1
	flds	fp_64k			// 64k | C(d_sdivzstepu)*scm1
	fxch	%st(1)			// C(d_sdivzstepu)*scm1 | 64k
	faddp	%st(0),%st(4)	// 64k

	fdiv	%st(1),%st(0)	// this is what we've gone to all this trouble to
							//  overlap
	jmp		LFDIVInFlight2

	.align	4
LSetupNotLast2:
	fadds	zi16stepu
	fxch	%st(2)
	fadds	sdivz16stepu
	fxch	%st(2)
	flds	tdivz16stepu
	faddp	%st(0),%st(2)
	flds	fp_64k
	fdiv	%st(1),%st(0)	// z = 1/1/z
							// this is what we've gone to all this trouble to
							//  overlap
LFDIVInFlight2:
	movl	%ecx,counttemp

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,8(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,9(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,10(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,11(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,12(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,13(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,14(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	$16,%edi
	movl	%edx,tfracf
	movl	snext,%edx
	movl	%ebx,sfracf
	movl	tnext,%ebx
	movl	%edx,s
	movl	%ebx,t

	movl	counttemp,%ecx		// retrieve count

//
// determine whether last span or not
//
	cmpl	$16,%ecx				// are there multiple segments remaining?
	movb	%al,-1(%edi)
	ja		LNotLastSegment		// yes

//
// last segment of scan
//
LLastSegment:

//
// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to
// get there. The number of pixels left is variable, and we want to land on the
// last pixel, not step one past it, so we can't run into arithmetic problems
//
	testl	%ecx,%ecx
	jz		LNoSteps		// just draw the last pixel and we're done

// pick up after the FDIV that was left in flight previously


	fld		%st(0)			// duplicate it
	fmul	%st(4),%st(0)	// s = s/z * z
	fxch	%st(1)
	fmul	%st(3),%st(0)	// t = t/z * z
	fxch	%st(1)
	fistpl	snext
	fistpl	tnext

	movb	(%esi),%al		// load first texel in segment
	movl	C(tadjust),%ebx
	movb	%al,(%edi)		// store first pixel in segment
	movl	C(sadjust),%eax

	addl	snext,%eax
	addl	tnext,%ebx

	movl	C(bbextents),%ebp
	movl	C(bbextentt),%edx

	cmpl	$4096,%eax
	jl		LClampLow4
	cmpl	%ebp,%eax
	ja		LClampHigh4
LClampReentry4:
	movl	%eax,snext

	cmpl	$4096,%ebx
	jl		LClampLow5
	cmpl	%edx,%ebx
	ja		LClampHigh5
LClampReentry5:

	cmpl	$1,%ecx			// don't bother
	je		LOnlyOneStep	// if two pixels in segment, there's only one step,
							//  of the segment length
	subl	s,%eax
	subl	t,%ebx

	addl	%eax,%eax		// convert to 15.17 format so multiply by 1.31
	addl	%ebx,%ebx		//  reciprocal yields 16.48

	imull	reciprocal_table_16-8(,%ecx,4)	// sstep = (snext - s) /
											//  (spancount-1)
	movl	%edx,%ebp

	movl	%ebx,%eax
	imull	reciprocal_table_16-8(,%ecx,4)	// tstep = (tnext - t) /
											//  (spancount-1)
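// The doubling above makes the deltas 15.17 fixed point; multiplying by the
// 1.31 reciprocal of (spancount-1) from reciprocal_table_16 gives a 16.48
// result whose high dword, left in %edx by imull, is the 16.16 per-pixel
// step used below.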
LSetEntryvec:
//
// set up advancetable
//
	movl	entryvec_table_16(,%ecx,4),%ebx
	movl	%edx,%eax
	movl	%ebx,jumptemp		// save entry point into code, jumped to below
	movl	%ebp,%ecx
	sarl	$16,%edx			// tstep >>= 16;
	movl	C(cachewidth),%ebx
	sarl	$16,%ecx			// sstep >>= 16;
	imull	%ebx,%edx

	addl	%ecx,%edx			// add in sstep
								// (tstep >> 16) * cachewidth + (sstep >> 16);
	movl	tfracf,%ecx
	movl	%edx,advancetable+4	// advance base in t
	addl	%ebx,%edx			// ((tstep >> 16) + 1) * cachewidth +
								//  (sstep >> 16);
	shll	$16,%ebp			// left-justify sstep fractional part
	movl	sfracf,%ebx
	shll	$16,%eax			// left-justify tstep fractional part
	movl	%edx,advancetable	// advance extra in t

	movl	%eax,tstep
	movl	%ecx,%edx
	addl	%eax,%edx
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi

	jmp		*jumptemp			// jump to the number-of-pixels handler

//----------------------------------------

LNoSteps:
	movb	(%esi),%al		// load first texel in segment
	subl	$15,%edi			// adjust for hardwired offset
	jmp		LEndSpan


LOnlyOneStep:
	subl	s,%eax
	subl	t,%ebx
	movl	%eax,%ebp
	movl	%ebx,%edx
	jmp		LSetEntryvec

//----------------------------------------

.globl	Entry2_16, Entry3_16, Entry4_16, Entry5_16
.globl	Entry6_16, Entry7_16, Entry8_16, Entry9_16
.globl	Entry10_16, Entry11_16, Entry12_16, Entry13_16
.globl	Entry14_16, Entry15_16, Entry16_16

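// The final (1..16 pixel) segment reuses the fully unrolled store sequence
// below, Duff's-device style: an entry from entryvec_table_16 (selected by
// the segment's pixel count) points at EntryN_16, which first backs %edi up
// so the hardwired 1(%edi)..15(%edi) offsets still hit the right destination
// pixels, then jumps into the chain at LEntryN_16.
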
Entry2_16:
	subl	$14,%edi		// adjust for hardwired offsets
	movb	(%esi),%al
	jmp		LEntry2_16

//----------------------------------------

Entry3_16:
	subl	$13,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	jmp		LEntry3_16

//----------------------------------------

Entry4_16:
	subl	$12,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry4_16

//----------------------------------------

Entry5_16:
	subl	$11,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry5_16

//----------------------------------------

Entry6_16:
	subl	$10,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry6_16

//----------------------------------------

Entry7_16:
	subl	$9,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry7_16

//----------------------------------------

Entry8_16:
	subl	$8,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry8_16

//----------------------------------------

Entry9_16:
	subl	$7,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry9_16

//----------------------------------------

Entry10_16:
	subl	$6,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry10_16

//----------------------------------------

Entry11_16:
	subl	$5,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry11_16

//----------------------------------------

Entry12_16:
	subl	$4,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry12_16

//----------------------------------------

Entry13_16:
	subl	$3,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry13_16

//----------------------------------------

Entry14_16:
	subl	$2,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry14_16

//----------------------------------------

Entry15_16:
	decl	%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry15_16

//----------------------------------------

Entry16_16:
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,1(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry15_16:
	sbbl	%ecx,%ecx
	movb	%al,2(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry14_16:
	sbbl	%ecx,%ecx
	movb	%al,3(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry13_16:
	sbbl	%ecx,%ecx
	movb	%al,4(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry12_16:
	sbbl	%ecx,%ecx
	movb	%al,5(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry11_16:
	sbbl	%ecx,%ecx
	movb	%al,6(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry10_16:
	sbbl	%ecx,%ecx
	movb	%al,7(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry9_16:
	sbbl	%ecx,%ecx
	movb	%al,8(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry8_16:
	sbbl	%ecx,%ecx
	movb	%al,9(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry7_16:
	sbbl	%ecx,%ecx
	movb	%al,10(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry6_16:
	sbbl	%ecx,%ecx
	movb	%al,11(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry5_16:
	sbbl	%ecx,%ecx
	movb	%al,12(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry4_16:
	sbbl	%ecx,%ecx
	movb	%al,13(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
LEntry3_16:
	movb	%al,14(%edi)
	movb	(%esi),%al
LEntry2_16:

LEndSpan:

//
// clear s/z, t/z, 1/z from FP stack
//
	fstp %st(0)
	fstp %st(0)
	fstp %st(0)

	movl	pspantemp,%ebx				// restore spans pointer
	movl	espan_t_pnext(%ebx),%ebx	// point to next span
	testl	%ebx,%ebx			// any more spans?
	movb	%al,15(%edi)
	jnz		LSpanLoop			// more spans

	popl	%ebx				// restore register variables
	popl	%esi
	popl	%edi
	popl	%ebp				// restore the caller's stack frame
	ret

//----------------------------------------------------------------------
// Horizontal span z-buffer drawing code for polygons, with no transparency.
//
// Assumes there is at least one span in pzspans, and that every span
// contains at least one pixel
//----------------------------------------------------------------------
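
//
// Rough equivalent in C-like pseudocode (a sketch of the logic only):
//
//	for (each span in pzspans)
//	{
//		zi  = d_ziorigin + v*d_zistepv + u*d_zistepu;	// 1/z at left edge
//		izi = (int)(zi * 2^31);			// 1/z scaled to 32-bit fixed point
//		pdest = (short *)((byte *)d_pzbuffer + v*d_zrowbytes) + u;
//		for (i=0 ; i<count ; i++, izi += izistep)
//			pdest[i] = (short)(izi >> 16);	// 16-bit 1/z value
//	}
//
// The alignment prologue/epilogue and the paired-dword middle loops below
// exist only so that two 16-bit z values go out per 32-bit store.
//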

	.text

// z-clamp on a non-negative gradient span
LClamp:
	movl	$0x40000000,%edx
	xorl	%ebx,%ebx
	fstp	%st(0)
	jmp		LZDraw

// z-clamp on a negative gradient span
LClampNeg:
	movl	$0x40000000,%edx
	xorl	%ebx,%ebx
	fstp	%st(0)
	jmp		LZDrawNeg

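// Stack offset of the pzspans argument: 4 bytes of return address plus the
// 16 bytes of registers pushed at the top of D_DrawZSpans.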
#define pzspans	4+16

.globl C(D_DrawZSpans)
C(D_DrawZSpans):
	pushl	%ebp				// preserve caller's stack frame
	pushl	%edi
	pushl	%esi				// preserve register variables
	pushl	%ebx

	flds	C(d_zistepu)
	movl	C(d_zistepu),%eax
	movl	pzspans(%esp),%esi
	testl	%eax,%eax
	jz		LFNegSpan

	fmuls	Float2ToThe31nd
	fistpl	izistep		// note: we are relying on FP exceptions being turned
						// off here to avoid range problems
	movl	izistep,%ebx	// remains loaded for all spans
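// (Float2ToThe31nd is taken here to be 2^31 as a float: the multiply plus
// fistpl converts the 1/z step to 32-bit fixed point in one shot, relying on
// masked FP exceptions, as noted above, instead of explicit range checks.)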

LFSpanLoop:
// set up the initial 1/z value
	fildl	espan_t_v(%esi)
	fildl	espan_t_u(%esi)
	movl	espan_t_v(%esi),%ecx
	movl	C(d_pzbuffer),%edi
	fmuls	C(d_zistepu)
	fxch	%st(1)
	fmuls	C(d_zistepv)
	fxch	%st(1)
	fadds	C(d_ziorigin)
	imull	C(d_zrowbytes),%ecx
	faddp	%st(0),%st(1)

// clamp if z is nearer than 2 (1/z > 0.5)
	fcoms	float_point5
	addl	%ecx,%edi
	movl	espan_t_u(%esi),%edx
	addl	%edx,%edx				// u * 2: z-buffer entries are words
	movl	espan_t_count(%esi),%ecx
	addl	%edx,%edi				// pdest = &pdestspan[scans->u];
	pushl	%esi		// preserve spans pointer
	fnstsw	%ax
	testb	$0x45,%ah
	jz		LClamp

	fmuls	Float2ToThe31nd
	fistpl	izi			// note: we are relying on FP exceptions being turned
						// off here to avoid problems when the span is closer
						// than 1/(2**31)
	movl	izi,%edx

// at this point:
// %ebx = izistep
// %ecx = count
// %edx = izi
// %edi = pdest

LZDraw:

// do a single pixel up front, if necessary, to dword-align the destination
	testl	$2,%edi
	jz		LFMiddle
	movl	%edx,%eax
	addl	%ebx,%edx
	shrl	$16,%eax
	decl	%ecx
	movw	%ax,(%edi)
	addl	$2,%edi

// do the middle of the span, a pair of aligned dwords at a time
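// Two consecutive 16-bit z values are packed into one dword (low word = this
// pixel's izi >> 16, high word = the next izi's top 16 bits) so the middle of
// the span goes out as aligned 32-bit stores; the unrolled loop below writes
// two such dwords per iteration.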
LFMiddle:
	pushl	%ecx
	shrl	$1,%ecx				// count / 2
	jz		LFLast				// no aligned dwords to do
	shrl	$1,%ecx				// (count / 2) / 2
	jnc		LFMiddleLoop		// even number of aligned dwords to do

	movl	%edx,%eax
	addl	%ebx,%edx
	shrl	$16,%eax
	movl	%edx,%esi
	addl	%ebx,%edx
	andl	$0xFFFF0000,%esi
	orl		%esi,%eax
	movl	%eax,(%edi)
	addl	$4,%edi
	andl	%ecx,%ecx
	jz		LFLast

LFMiddleLoop:
	movl	%edx,%eax
	addl	%ebx,%edx
	shrl	$16,%eax
	movl	%edx,%esi
	addl	%ebx,%edx
	andl	$0xFFFF0000,%esi
	orl		%esi,%eax
	movl	%edx,%ebp
	movl	%eax,(%edi)
	addl	%ebx,%edx
	shrl	$16,%ebp
	movl	%edx,%esi
	addl	%ebx,%edx
	andl	$0xFFFF0000,%esi
	orl		%esi,%ebp
	movl	%ebp,4(%edi)	// FIXME: eliminate register contention
	addl	$8,%edi

	decl	%ecx
	jnz		LFMiddleLoop

LFLast:
	popl	%ecx			// retrieve count
	popl	%esi			// retrieve span pointer

// do the last, unaligned pixel, if there is one
	andl	$1,%ecx			// is there an odd pixel left to do?
	jz		LFSpanDone		// no
	shrl	$16,%edx
	movw	%dx,(%edi)		// do the final pixel's z

LFSpanDone:
	movl	espan_t_pnext(%esi),%esi
	testl	%esi,%esi
	jnz		LFSpanLoop

	jmp		LFDone

LFNegSpan:
	fmuls	FloatMinus2ToThe31nd
	fistpl	izistep		// note: we are relying on FP exceptions being turned
						// off here to avoid range problems
	movl	izistep,%ebx	// remains loaded for all spans

LFNegSpanLoop:
// set up the initial 1/z value
	fildl	espan_t_v(%esi)
	fildl	espan_t_u(%esi)
	movl	espan_t_v(%esi),%ecx
	movl	C(d_pzbuffer),%edi
	fmuls	C(d_zistepu)
	fxch	%st(1)
	fmuls	C(d_zistepv)
	fxch	%st(1)
	fadds	C(d_ziorigin)
	imull	C(d_zrowbytes),%ecx
	faddp	%st(0),%st(1)

// clamp if z is nearer than 2 (1/z > 0.5)
	fcoms	float_point5
	addl	%ecx,%edi
	movl	espan_t_u(%esi),%edx
	addl	%edx,%edx				// u * 2: z-buffer entries are words
	movl	espan_t_count(%esi),%ecx
	addl	%edx,%edi				// pdest = &pdestspan[scans->u];
	pushl	%esi		// preserve spans pointer
	fnstsw	%ax
	testb	$0x45,%ah
	jz		LClampNeg

	fmuls	Float2ToThe31nd
	fistpl	izi			// note: we are relying on FP exceptions being turned
						// off here to avoid problems when the span is closer
						// than 1/(2**31)
	movl	izi,%edx

// at this point:
// %ebx = izistep
// %ecx = count
// %edx = izi
// %edi = pdest

LZDrawNeg:

// do a single pixel up front, if necessary, to dword-align the destination
	testl	$2,%edi
	jz		LFNegMiddle
	movl	%edx,%eax
	subl	%ebx,%edx
	shrl	$16,%eax
	decl	%ecx
	movw	%ax,(%edi)
	addl	$2,%edi

// do the middle of the span, a pair of aligned dwords at a time
LFNegMiddle:
	pushl	%ecx
	shrl	$1,%ecx				// count / 2
	jz		LFNegLast			// no aligned dwords to do
	shrl	$1,%ecx				// (count / 2) / 2
	jnc		LFNegMiddleLoop		// even number of aligned dwords to do

	movl	%edx,%eax
	subl	%ebx,%edx
	shrl	$16,%eax
	movl	%edx,%esi
	subl	%ebx,%edx
	andl	$0xFFFF0000,%esi
	orl		%esi,%eax
	movl	%eax,(%edi)
	addl	$4,%edi
	andl	%ecx,%ecx
	jz		LFNegLast

LFNegMiddleLoop:
	movl	%edx,%eax
	subl	%ebx,%edx
	shrl	$16,%eax
	movl	%edx,%esi
	subl	%ebx,%edx
	andl	$0xFFFF0000,%esi
	orl		%esi,%eax
	movl	%edx,%ebp
	movl	%eax,(%edi)
	subl	%ebx,%edx
	shrl	$16,%ebp
	movl	%edx,%esi
	subl	%ebx,%edx
	andl	$0xFFFF0000,%esi
	orl		%esi,%ebp
	movl	%ebp,4(%edi)	// FIXME: eliminate register contention
	addl	$8,%edi

	decl	%ecx
	jnz		LFNegMiddleLoop

LFNegLast:
	popl	%ecx			// retrieve count
	popl	%esi			// retrieve span pointer

// do the last, unaligned pixel, if there is one
	andl	$1,%ecx			// is there an odd pixel left to do?
	jz		LFNegSpanDone	// no
	shrl	$16,%edx
	movw	%dx,(%edi)		// do the final pixel's z

LFNegSpanDone:
	movl	espan_t_pnext(%esi),%esi
	testl	%esi,%esi
	jnz		LFNegSpanLoop

LFDone:
	popl	%ebx				// restore register variables
	popl	%esi
	popl	%edi
	popl	%ebp				// restore the caller's stack frame
	ret

#endif	// id386
