//
// d_spr8.s
// x86 assembly-language horizontal 8-bpp transparent span-drawing code.
//

// Syntax: GNU as, AT&T operand order (source first, destination last), run
// through the C preprocessor first (#include / #if / #define are used).
// Shared definitions; the C() symbol-decoration macro, the sspan_t_* offsets,
// TRANSPARENT_COLOR and the scratch variables used in this file are
// presumably declared here -- confirm against the header.
#include "qasm.h"

// Everything down to the matching #endif is assembled only when id386 is set.
#if id386

//----------------------------------------------------------------------
// 8-bpp horizontal span drawing code for polygons, with transparency.
//----------------------------------------------------------------------

	.text

// out-of-line, rarely-needed clamping code
//
// Each stub is reached by a conditional jump from the span routine below when
// a texture coordinate is outside its legal range.  It loads the clamped
// value into the register that was tested and jumps back to the matching
// LClampReentryN label.
//
//   stub   register   coordinate
//   0      %esi       s at the start of the span
//   1      %edx       t at the start of the span
//   2      %ebp       s at the end of a full 8-pixel segment
//   3      %ecx       t at the end of a full 8-pixel segment
//   4      %eax       s at the end of the last segment
//   5      %ebx       t at the end of the last segment
//
// Stubs 0 and 1 are entered through an unsigned "above" test, which catches
// both too-large and negative values.  The flags of that compare are still
// intact on arrival, so a signed jg tells the two cases apart: greater means
// clamp to the extent, otherwise the value was negative and is clamped to 0.
//
// Stubs 2-5 have separate low and high entries.  The low clamp value is 2048
// rather than 0 -- presumably a small positive fixed-point guard against
// stepping off the low edge of the texture; confirm against the C reference.

LClampHigh0:
	movl	C(bbextents),%esi	// s = largest legal s
	jmp		LClampReentry0
LClampHighOrLow0:
	jg		LClampHigh0			// signed greater: really too large
	xorl	%esi,%esi			// otherwise negative: s = 0
	jmp		LClampReentry0

LClampHigh1:
	movl	C(bbextentt),%edx	// t = largest legal t
	jmp		LClampReentry1
LClampHighOrLow1:
	jg		LClampHigh1			// signed greater: really too large
	xorl	%edx,%edx			// otherwise negative: t = 0
	jmp		LClampReentry1

LClampLow2:
	movl	$2048,%ebp
	jmp		LClampReentry2
LClampHigh2:
	movl	C(bbextents),%ebp
	jmp		LClampReentry2

LClampLow3:
	movl	$2048,%ecx
	jmp		LClampReentry3
LClampHigh3:
	movl	C(bbextentt),%ecx
	jmp		LClampReentry3

LClampLow4:
	movl	$2048,%eax
	jmp		LClampReentry4
LClampHigh4:
	movl	C(bbextents),%eax
	jmp		LClampReentry4

LClampLow5:
	movl	$2048,%ebx
	jmp		LClampReentry5
LClampHigh5:
	movl	C(bbextentt),%ebx
	jmp		LClampReentry5


// Stack offset of the routine's single argument once the four register
// pushes below are done: 4 bytes of return address + 16 bytes of saved
// registers.
#define pspans	4+16

//----------------------------------------------------------------------
// D_SpriteDrawSpans
//
// Draws a list of horizontal, perspective-correct, z-buffered 8-bpp spans,
// skipping texels equal to TRANSPARENT_COLOR.
//
// In:     pspans(%esp) = pointer to the first span descriptor.  Descriptors
//         are read through sspan_t_u, sspan_t_v and sspan_t_count and lie
//         sspan_t_size bytes apart.  After the first span, a count of 0
//         marks an empty span that is skipped and a negative count ends the
//         list.  A first span with count <= 0 is simply skipped (it is not
//         treated as a terminator).
// Out:    %eax is not set to any meaningful value.
// Saves:  %ebp, %edi, %esi, %ebx (pushed here, popped before the final ret).
// Uses:   %eax, %ecx, %edx, the flags and the x87 stack as scratch; the x87
//         stack is back at its entry depth when the routine returns.
//----------------------------------------------------------------------
	.align 4
.globl C(D_SpriteDrawSpans)
C(D_SpriteDrawSpans):
	pushl	%ebp				// preserve caller's stack frame
	pushl	%edi
	pushl	%esi				// preserve register variables
	pushl	%ebx

//
// set up scaled-by-8 steps, for 8-long segments; also set up cacheblock
// and span list pointers, and 1/z step in 0.32 fixed-point
//
// The integer moves are interleaved with the FP work; the FP stack contents
// are shown to the right (top of stack first).
//
// FIXME: any overlap from rearranging?
	flds	C(d_sdivzstepu)
	fmuls	fp_8				// sdivz8
	movl	C(cacheblock),%edx
	flds	C(d_tdivzstepu)
	fmuls	fp_8				// tdivz8 | sdivz8
	movl	pspans(%esp),%ebx	// point to the first span descriptor
	flds	C(d_zistepu)
	fmuls	fp_8				// zi8 | tdivz8 | sdivz8
	movl	%edx,pbase			// pbase = cacheblock
	flds	C(d_zistepu)
	fmuls	fp_64kx64k			// izistep | zi8 | tdivz8 | sdivz8
	fxch	%st(3)				// sdivz8 | zi8 | tdivz8 | izistep
	fstps	sdivz8stepu
	fstps	zi8stepu
	fstps	tdivz8stepu
	fistpl	izistep
	movl	izistep,%eax
	rorl	$16,%eax		// put upper 16 bits in low word
	movl	sspan_t_count(%ebx),%ecx
	movl	%eax,izistep		// izistep is kept in this rotated form

	testl	%ecx,%ecx			// nothing to draw in the first span?
	jle		LNextSpan			// then step straight to the next one

// Top of the per-span loop.
// On entry: %ebx -> current span descriptor, %ecx = its pixel count (> 0).
LSpanLoop:

//
// set up the initial s/z, t/z, and 1/z on the FP stack, and generate the
// initial s and t values
//
// The FP stack is shown to the right of each instruction, top of stack
// first.  The block ends with  1/z | fp_64k | t/z | s/z.
//
// FIXME: pipeline FILD?
	fildl	sspan_t_v(%ebx)
	fildl	sspan_t_u(%ebx)

	fld		%st(1)			// dv | du | dv
	fmuls	C(d_sdivzstepv)	// dv*d_sdivzstepv | du | dv
	fld		%st(1)			// du | dv*d_sdivzstepv | du | dv
	fmuls	C(d_sdivzstepu)	// du*d_sdivzstepu | dv*d_sdivzstepv | du | dv
	fld		%st(2)			// du | du*d_sdivzstepu | dv*d_sdivzstepv | du | dv
	fmuls	C(d_tdivzstepu)	// du*d_tdivzstepu | du*d_sdivzstepu |
							//  dv*d_sdivzstepv | du | dv
	fxch	%st(1)			// du*d_sdivzstepu | du*d_tdivzstepu |
							//  dv*d_sdivzstepv | du | dv
	faddp	%st(0),%st(2)	// du*d_tdivzstepu |
							//  du*d_sdivzstepu + dv*d_sdivzstepv | du | dv
	fxch	%st(1)			// du*d_sdivzstepu + dv*d_sdivzstepv |
							//  du*d_tdivzstepu | du | dv
	fld		%st(3)			// dv | du*d_sdivzstepu + dv*d_sdivzstepv |
							//  du*d_tdivzstepu | du | dv
	fmuls	C(d_tdivzstepv)	// dv*d_tdivzstepv |
							//  du*d_sdivzstepu + dv*d_sdivzstepv |
							//  du*d_tdivzstepu | du | dv
	fxch	%st(1)			// du*d_sdivzstepu + dv*d_sdivzstepv |
							//  dv*d_tdivzstepv | du*d_tdivzstepu | du | dv
	fadds	C(d_sdivzorigin)	// sdivz = d_sdivzorigin + dv*d_sdivzstepv +
							//  du*d_sdivzstepu; stays in %st(2) at end
	fxch	%st(4)			// dv | dv*d_tdivzstepv | du*d_tdivzstepu | du |
							//  s/z
	fmuls	C(d_zistepv)		// dv*d_zistepv | dv*d_tdivzstepv |
							//  du*d_tdivzstepu | du | s/z
	fxch	%st(1)			// dv*d_tdivzstepv |  dv*d_zistepv |
							//  du*d_tdivzstepu | du | s/z
	faddp	%st(0),%st(2)	// dv*d_zistepv |
							//  dv*d_tdivzstepv + du*d_tdivzstepu | du | s/z
	fxch	%st(2)			// du | dv*d_tdivzstepv + du*d_tdivzstepu |
							//  dv*d_zistepv | s/z
	fmuls	C(d_zistepu)		// du*d_zistepu |
							//  dv*d_tdivzstepv + du*d_tdivzstepu |
							//  dv*d_zistepv | s/z
	fxch	%st(1)			// dv*d_tdivzstepv + du*d_tdivzstepu |
							//  du*d_zistepu | dv*d_zistepv | s/z
	fadds	C(d_tdivzorigin)	// tdivz = d_tdivzorigin + dv*d_tdivzstepv +
							//  du*d_tdivzstepu; stays in %st(1) at end
	fxch	%st(2)			// dv*d_zistepv | du*d_zistepu | t/z | s/z
	faddp	%st(0),%st(1)	// dv*d_zistepv + du*d_zistepu | t/z | s/z

	flds	fp_64k			// fp_64k | dv*d_zistepv + du*d_zistepu | t/z | s/z
	fxch	%st(1)			// dv*d_zistepv + du*d_zistepu | fp_64k | t/z | s/z
	fadds	C(d_ziorigin)		// zi = d_ziorigin + dv*d_zistepv +
							//  du*d_zistepu; stays in %st(0) at end
							// 1/z | fp_64k | t/z | s/z

	fld		%st(0)			// FIXME: get rid of stall on FMUL?
	fmuls	fp_64kx64k		// 1/z scaled for the integer z test |
							//  1/z | fp_64k | t/z | s/z
	fxch	%st(1)			// 1/z | scaled 1/z | fp_64k | t/z | s/z

//
// calculate and clamp s & t
//
// The divide turns the fp_64k entry in %st(2) into z*64k; the stack comment
// on the FDIVR line shows the state once the scaled 1/z has been popped
// into izi two instructions later.
	fdivr	%st(0),%st(2)	// 1/z | z*64k | t/z | s/z
	fxch	%st(1)

	fistpl	izi				// 0.32 fixed-point 1/z
	movl	izi,%ebp

//
// set pz to point to the first z-buffer pixel in the span
//
// pz = d_pzbuffer + v * d_zrowbytes + u * 2
	rorl	$16,%ebp		// put upper 16 bits in low word
	movl	sspan_t_v(%ebx),%eax
	movl	%ebp,izi			// izi is kept in this rotated form
	movl	sspan_t_u(%ebx),%ebp
	imull	C(d_zrowbytes)		// one-operand form: %edx:%eax = %eax * mem,
							//  so %edx is clobbered (reloaded below)
	shll	$1,%ebp					// a word per pixel
	addl	C(d_pzbuffer),%eax
	addl	%ebp,%eax
	movl	%eax,pz

//
// point %edi to the first pixel in the span
//
// Also preloads %esi/%edx with sadjust/tadjust for the start-of-span s and t,
// and saves the span pointer on the stack; it is popped again after the last
// pixel of the span has been drawn.
	movl	C(d_viewbuffer),%ebp
	movl	sspan_t_v(%ebx),%eax
	pushl	%ebx		// preserve spans pointer
	movl	C(tadjust),%edx
	movl	C(sadjust),%esi
	movl	C(d_scantable)(,%eax,4),%edi	// v * screenwidth
	addl	%ebp,%edi
	movl	sspan_t_u(%ebx),%ebp
	addl	%ebp,%edi				// pdest = &pdestspan[scans->u];

//
// now start the FDIV for the end of the span
//
// %ecx = pixel count of the span.  Three cases, all of which first convert
// the start-of-span s/z and t/z into integer s and t and then meet again at
// LFDIVInFlight1:
//
//   count > 8    LSetupNotLast1: advance s/z, t/z, 1/z by one 8-pixel
//                segment and start the divide for the segment end.
//                %ecx keeps the full count.
//   count 2..8   fall through: %ecx becomes count-1; advance by count-1
//                pixels and start the divide for the last pixel.
//   count == 1   LCleanup1: %ecx becomes 0; no divide is started.
//
// FP stack on entry:  1/z | z*64k | t/z | s/z
// FP stack on exit:   [z*64k at the end point |] 1/z | t/z | s/z
//                     (the bracketed entry is absent in the one-pixel case)
//
	cmpl	$8,%ecx
	ja		LSetupNotLast1

	decl	%ecx
	jz		LCleanup1		// if only one pixel, no need to start an FDIV
	movl	%ecx,spancountminus1

// finish up the s and t calcs
	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z
	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z
	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z
	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z
	fxch	%st(1)			// s | t | 1/z | t/z | s/z
	fistpl	s				// 1/z | t | t/z | s/z
	fistpl	t				// 1/z | t/z | s/z

// step s/z, t/z and 1/z forward by spancountminus1 pixels
// (scm1 = spancountminus1; the trailing 1/z | t/z | s/z entries are omitted
// from the stack comments below)
	fildl	spancountminus1

	flds	C(d_tdivzstepu)	// _d_tdivzstepu | spancountminus1
	flds	C(d_zistepu)		// _d_zistepu | _d_tdivzstepu | spancountminus1
	fmul	%st(2),%st(0)	// _d_zistepu*scm1 | _d_tdivzstepu | scm1
	fxch	%st(1)			// _d_tdivzstepu | _d_zistepu*scm1 | scm1
	fmul	%st(2),%st(0)	// _d_tdivzstepu*scm1 | _d_zistepu*scm1 | scm1
	fxch	%st(2)			// scm1 | _d_zistepu*scm1 | _d_tdivzstepu*scm1
	fmuls	C(d_sdivzstepu)	// _d_sdivzstepu*scm1 | _d_zistepu*scm1 |
							//  _d_tdivzstepu*scm1
	fxch	%st(1)			// _d_zistepu*scm1 | _d_sdivzstepu*scm1 |
							//  _d_tdivzstepu*scm1
	faddp	%st(0),%st(3)	// _d_sdivzstepu*scm1 | _d_tdivzstepu*scm1
							//  (1/z advanced)
	fxch	%st(1)			// _d_tdivzstepu*scm1 | _d_sdivzstepu*scm1
	faddp	%st(0),%st(3)	// _d_sdivzstepu*scm1   (t/z advanced)
	faddp	%st(0),%st(3)	// 1/z | t/z | s/z      (s/z advanced)

	flds	fp_64k
	fdiv	%st(1),%st(0)	// this is what we've gone to all this trouble to
							//  overlap
	jmp		LFDIVInFlight1

// one-pixel span: only the start-of-span s and t are needed
LCleanup1:
// finish up the s and t calcs
	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z
	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z
	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z
	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z
	fxch	%st(1)			// s | t | 1/z | t/z | s/z
	fistpl	s				// 1/z | t | t/z | s/z
	fistpl	t				// 1/z | t/z | s/z
	jmp		LFDIVInFlight1

// more than 8 pixels: the first segment is a full 8-pixel one
	.align	4
LSetupNotLast1:
// finish up the s and t calcs
	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z
	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z
	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z
	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z
	fxch	%st(1)			// s | t | 1/z | t/z | s/z
	fistpl	s				// 1/z | t | t/z | s/z
	fistpl	t				// 1/z | t/z | s/z

// step s/z, t/z and 1/z forward by 8 pixels using the precomputed *8 steps
	fadds	zi8stepu		// 1/z' | t/z | s/z
	fxch	%st(2)			// s/z | t/z | 1/z'
	fadds	sdivz8stepu		// s/z' | t/z | 1/z'
	fxch	%st(2)			// 1/z' | t/z | s/z'
	flds	tdivz8stepu		// tdivz8stepu | 1/z' | t/z | s/z'
	faddp	%st(0),%st(2)	// 1/z' | t/z' | s/z'
	flds	fp_64k
	fdiv	%st(1),%st(0)	// z = 1/1/z
							// this is what we've gone to all this trouble to
							//  overlap
// Integer work done after the divide for the end of the first segment has
// been issued (the surrounding comments describe it as overlapping the FDIV).
//
// On entry: %esi = sadjust, %edx = tadjust, s / t = start-of-span values
//           just stored from the FPU, %ecx = pixel count if it is above 8,
//           otherwise count-1.
// On exit:  s / t hold the adjusted, clamped start coordinates, sfracf /
//           tfracf their low 16 bits left-justified, %esi -> first source
//           texel.
// Note:     %ebx and %ebp are used as scratch for the extents here, so %ebp
//           does NOT hold the 1/z value after this block.
LFDIVInFlight1:

	addl	s,%esi
	addl	t,%edx
	movl	C(bbextents),%ebx
	movl	C(bbextentt),%ebp
	cmpl	%ebx,%esi
	ja		LClampHighOrLow0	// unsigned: too large or negative
LClampReentry0:
	movl	%esi,s
	movl	pbase,%ebx
	shll	$16,%esi			// keep only the fraction, left-justified
	cmpl	%ebp,%edx
	movl	%esi,sfracf			// (mov leaves the compare flags intact)
	ja		LClampHighOrLow1	// unsigned: too large or negative
LClampReentry1:
	movl	%edx,t
	movl	s,%esi					// sfrac = scans->sfrac;
	shll	$16,%edx			// keep only the fraction, left-justified
	movl	t,%eax					// tfrac = scans->tfrac;
	sarl	$16,%esi			// whole-texel part of s
	movl	%edx,tfracf

//
// calculate the texture starting address
//
	sarl	$16,%eax			// whole-texel part of t
	addl	%ebx,%esi
	imull	C(cachewidth),%eax		// (tfrac >> 16) * cachewidth
	addl	%eax,%esi				// psource = pbase + (sfrac >> 16) +
									//           ((tfrac >> 16) * cachewidth);

//
// determine whether last segment or not
//
	cmpl	$8,%ecx
	jna		LLastSegment

//
// not the last segment; do full 8-wide segment
//
// On entry: %ecx = pixels still to draw (more than 8), %esi -> current
//           source texel, FP stack = z*64k at segment end | 1/z | t/z | s/z.
// On exit:  remaining count (after this segment) is on the CPU stack,
//           snext / tnext = clamped end-of-segment coordinates,
//           sstep / tstep = per-pixel fractional steps (left-justified),
//           advancetable = per-pixel source pointer advances,
//           %ebx = sfracf, %edx = tfracf.
LNotLastSegment:

//
// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to
// get there
//

// pick up after the FDIV that was left in flight previously

	fld		%st(0)			// duplicate it
	fmul	%st(4),%st(0)	// s = s/z * z
	fxch	%st(1)
	fmul	%st(3),%st(0)	// t = t/z * z
	fxch	%st(1)
	fistpl	snext
	fistpl	tnext
	movl	snext,%eax
	movl	tnext,%edx

	subl	$8,%ecx		// count off this segment's pixels
	movl	C(sadjust),%ebp
	pushl	%ecx		// remember count of remaining pixels
	movl	C(tadjust),%ecx

	addl	%eax,%ebp		// %ebp = snext + sadjust
	addl	%edx,%ecx		// %ecx = tnext + tadjust

	movl	C(bbextents),%eax
	movl	C(bbextentt),%edx

	cmpl	$2048,%ebp
	jl		LClampLow2
	cmpl	%eax,%ebp
	ja		LClampHigh2
LClampReentry2:

	cmpl	$2048,%ecx
	jl		LClampLow3
	cmpl	%edx,%ecx
	ja		LClampHigh3
LClampReentry3:

	movl	%ebp,snext
	movl	%ecx,tnext

	subl	s,%ebp			// s delta across the 8-pixel segment
	subl	t,%ecx			// t delta across the 8-pixel segment

//
// set up advancetable
//
// The deltas cover 8 pixels, so the per-pixel step is delta/8: its whole
// part is delta >> 19 (>> 3 for the divide, >> 16 to drop the fraction) and
// its 16-bit fraction, left-justified in 32 bits, is delta << 13.
//
//   advancetable+4 = tstep_whole * cachewidth + sstep_whole
//   advancetable   = advancetable+4 + cachewidth   (used when the t
//                                                    fraction carries)
	movl	%ecx,%eax
	movl	%ebp,%edx
	sarl	$19,%edx			// whole part of the per-pixel s step
	movl	C(cachewidth),%ebx
	sarl	$19,%eax			// whole part of the per-pixel t step
	jz		LIsZero				// skip the multiply when it is zero
	imull	%ebx,%eax			// (tstep >> 16) * cachewidth;
LIsZero:
	addl	%edx,%eax			// add in sstep
								// (tstep >> 16) * cachewidth + (sstep >> 16);
	movl	tfracf,%edx
	movl	%eax,advancetable+4	// advance base in t
	addl	%ebx,%eax			// ((tstep >> 16) + 1) * cachewidth +
								//  (sstep >> 16);
	shll	$13,%ebp			// left-justify sstep fractional part
	movl	%ebp,sstep
	movl	sfracf,%ebx
	shll	$13,%ecx			// left-justify tstep fractional part
	movl	%eax,advancetable	// advance extra in t
	movl	%ecx,tstep

//
// draw pixels 1-5 of a full 8-pixel segment
//
// Register roles during every pixel step:
//   %ecx -> z-buffer words of the segment     %edi -> destination pixels
//   %esi -> current source texel              %eax  = scratch
//   %ebx  = s fraction, left-justified        %edx  = t fraction, likewise
//   %ebp  = 1/z in rotated form: the 16 bits that are z-tested sit in %bp
//
// Each step:
//   - skip the pixel if %bp is (signed) below the stored z word, or if the
//     texel equals TRANSPARENT_COLOR; otherwise store both z and the texel
//   - add the rotated izistep to %ebp; the carry out of the top (fraction)
//     half is folded back into the low (tested) half by the adc $0
//   - add tstep to the t fraction; sbb turns the carry into an index of -1
//     (carry) or 0 (none), which selects advancetable (base + cachewidth) or
//     advancetable+4 (base)
//   - add sstep to the s fraction; its carry rides along in the adc that
//     advances %esi, adding one more texel in s
//
	movl	pz,%ecx
	movl	izi,%ebp

// pixel 1
	cmpw	(%ecx),%bp
	jl		Lp1
	movb	(%esi),%al			// get first source texel
	cmpb	$(TRANSPARENT_COLOR),%al
	jz		Lp1
	movw	%bp,(%ecx)
	movb	%al,(%edi)			// store first dest pixel
Lp1:
	addl	izistep,%ebp
	adcl	$0,%ebp
	addl	tstep,%edx			// advance tfrac fractional part by tstep frac

	sbbl	%eax,%eax			// turn tstep carry into -1 (0 if none)
	addl	sstep,%ebx			// advance sfrac fractional part by sstep frac
	adcl	advancetable+4(,%eax,4),%esi	// point to next source texel

// pixel 2
	cmpw	2(%ecx),%bp
	jl		Lp2
	movb	(%esi),%al
	cmpb	$(TRANSPARENT_COLOR),%al
	jz		Lp2
	movw	%bp,2(%ecx)
	movb	%al,1(%edi)
Lp2:
	addl	izistep,%ebp
	adcl	$0,%ebp
	addl	tstep,%edx
	sbbl	%eax,%eax
	addl	sstep,%ebx
	adcl	advancetable+4(,%eax,4),%esi

// pixel 3
	cmpw	4(%ecx),%bp
	jl		Lp3
	movb	(%esi),%al
	cmpb	$(TRANSPARENT_COLOR),%al
	jz		Lp3
	movw	%bp,4(%ecx)
	movb	%al,2(%edi)
Lp3:
	addl	izistep,%ebp
	adcl	$0,%ebp
	addl	tstep,%edx
	sbbl	%eax,%eax
	addl	sstep,%ebx
	adcl	advancetable+4(,%eax,4),%esi

// pixel 4
	cmpw	6(%ecx),%bp
	jl		Lp4
	movb	(%esi),%al
	cmpb	$(TRANSPARENT_COLOR),%al
	jz		Lp4
	movw	%bp,6(%ecx)
	movb	%al,3(%edi)
Lp4:
	addl	izistep,%ebp
	adcl	$0,%ebp
	addl	tstep,%edx
	sbbl	%eax,%eax
	addl	sstep,%ebx
	adcl	advancetable+4(,%eax,4),%esi

// pixel 5
	cmpw	8(%ecx),%bp
	jl		Lp5
	movb	(%esi),%al
	cmpb	$(TRANSPARENT_COLOR),%al
	jz		Lp5
	movw	%bp,8(%ecx)
	movb	%al,4(%edi)
Lp5:
	addl	izistep,%ebp
	adcl	$0,%ebp
	addl	tstep,%edx
	sbbl	%eax,%eax
	addl	sstep,%ebx
	adcl	advancetable+4(,%eax,4),%esi

//
// start FDIV for end of next segment in flight, so it can overlap
//
// The count of pixels remaining after the current segment was pushed in
// LNotLastSegment; it is popped into %eax here.
//
//   remaining > 8    LSetupNotLast2: step s/z, t/z, 1/z by 8 pixels and
//                    start the divide; %eax keeps the remaining count.
//   remaining 2..8   %eax becomes remaining-1; step by that many pixels and
//                    start the divide for the span's last pixel.
//   remaining == 1   %eax becomes 0; nothing is started.
//
// FP stack on entry:  1/z | t/z | s/z
// FP stack on exit:   [z*64k at the next end point |] 1/z | t/z | s/z
//
	popl	%eax
	cmpl	$8,%eax			// more than one segment after this?
	ja		LSetupNotLast2	// yes

	decl	%eax
	jz		LFDIVInFlight2	// if only one pixel, no need to start an FDIV
	movl	%eax,spancountminus1
	fildl	spancountminus1

// (scm1 = spancountminus1; the trailing 1/z | t/z | s/z entries are omitted
// from the stack comments below)
	flds	C(d_zistepu)		// _d_zistepu | spancountminus1
	fmul	%st(1),%st(0)	// _d_zistepu*scm1 | scm1
	flds	C(d_tdivzstepu)	// _d_tdivzstepu | _d_zistepu*scm1 | scm1
	fmul	%st(2),%st(0)	// _d_tdivzstepu*scm1 | _d_zistepu*scm1 | scm1
	fxch	%st(1)			// _d_zistepu*scm1 | _d_tdivzstepu*scm1 | scm1
	faddp	%st(0),%st(3)	// _d_tdivzstepu*scm1 | scm1   (1/z advanced)
	fxch	%st(1)			// scm1 | _d_tdivzstepu*scm1
	fmuls	C(d_sdivzstepu)	// _d_sdivzstepu*scm1 | _d_tdivzstepu*scm1
	fxch	%st(1)			// _d_tdivzstepu*scm1 | _d_sdivzstepu*scm1
	faddp	%st(0),%st(3)	// _d_sdivzstepu*scm1          (t/z advanced)
	flds	fp_64k			// 64k | _d_sdivzstepu*scm1
	fxch	%st(1)			// _d_sdivzstepu*scm1 | 64k
	faddp	%st(0),%st(4)	// 64k                         (s/z advanced)

	fdiv	%st(1),%st(0)	// this is what we've gone to all this trouble to
							//  overlap
	jmp		LFDIVInFlight2

	.align	4
LSetupNotLast2:
	fadds	zi8stepu		// 1/z' | t/z | s/z
	fxch	%st(2)			// s/z | t/z | 1/z'
	fadds	sdivz8stepu		// s/z' | t/z | 1/z'
	fxch	%st(2)			// 1/z' | t/z | s/z'
	flds	tdivz8stepu		// tdivz8stepu | 1/z' | t/z | s/z'
	faddp	%st(0),%st(2)	// 1/z' | t/z' | s/z'
	flds	fp_64k
	fdiv	%st(1),%st(0)	// z = 1/1/z
							// this is what we've gone to all this trouble to
							//  overlap
// Finish the current 8-pixel segment: draw pixels 6-8, then roll the
// end-of-segment state over into the start-of-segment variables.
//
// On entry: %eax = count for the following code -- the pixels remaining if
//           more than 8 remain, otherwise remaining-1.
//           %ecx / %edi / %esi / %ebx / %edx / %ebp as in the pixel steps
//           above (z pointer, dest, source, s frac, t frac, rotated 1/z).
// Each pixel step z-tests %bp against the z word, skips texels equal to
// TRANSPARENT_COLOR, then advances 1/z and the source pointer.
LFDIVInFlight2:
	pushl	%eax			// %eax is scratch in the pixel steps

// pixel 6
	cmpw	10(%ecx),%bp
	jl		Lp6
	movb	(%esi),%al
	cmpb	$(TRANSPARENT_COLOR),%al
	jz		Lp6
	movw	%bp,10(%ecx)
	movb	%al,5(%edi)
Lp6:
	addl	izistep,%ebp
	adcl	$0,%ebp
	addl	tstep,%edx
	sbbl	%eax,%eax
	addl	sstep,%ebx
	adcl	advancetable+4(,%eax,4),%esi

// pixel 7
	cmpw	12(%ecx),%bp
	jl		Lp7
	movb	(%esi),%al
	cmpb	$(TRANSPARENT_COLOR),%al
	jz		Lp7
	movw	%bp,12(%ecx)
	movb	%al,6(%edi)
Lp7:
	addl	izistep,%ebp
	adcl	$0,%ebp
	addl	tstep,%edx
	sbbl	%eax,%eax
	addl	sstep,%ebx
	adcl	advancetable+4(,%eax,4),%esi

// pixel 8
	cmpw	14(%ecx),%bp
	jl		Lp8
	movb	(%esi),%al
	cmpb	$(TRANSPARENT_COLOR),%al
	jz		Lp8
	movw	%bp,14(%ecx)
	movb	%al,7(%edi)
Lp8:
	addl	izistep,%ebp
	adcl	$0,%ebp
	addl	tstep,%edx
	sbbl	%eax,%eax
	addl	sstep,%ebx
	adcl	advancetable+4(,%eax,4),%esi

// step the pointers past the 8 pixels just handled and make the end of this
// segment the start of the next one
	addl	$8,%edi
	addl	$16,%ecx			// 8 z-buffer words
	movl	%edx,tfracf
	movl	snext,%edx
	movl	%ebx,sfracf
	movl	tnext,%ebx
	movl	%edx,s
	movl	%ebx,t

	movl	%ecx,pz
	movl	%ebp,izi			// %ebp still equals izi after this store

	popl	%ecx				// retrieve count

//
// determine whether last segment or not
//
	cmpl	$8,%ecx				// are there multiple segments remaining?
	ja		LNotLastSegment		// yes

//
// last segment of scan
//
// On entry: %ecx = number of steps left, i.e. pixels in this segment minus
//           one (0..7); %esi -> current source texel; %edi -> destination;
//           FP stack = [z*64k at the last pixel |] 1/z | t/z | s/z, where
//           the bracketed entry is present only when %ecx is non-zero.
LLastSegment:

//
// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to
// get there. The number of pixels left is variable, and we want to land on the
// last pixel, not step one past it, so we can't run into arithmetic problems
//
	testl	%ecx,%ecx
	jz		LNoSteps		// just draw the last pixel and we're done

// pick up after the FDIV that was left in flight previously


	fld		%st(0)			// duplicate it
	fmul	%st(4),%st(0)	// s = s/z * z
	fxch	%st(1)
	fmul	%st(3),%st(0)	// t = t/z * z
	fxch	%st(1)
	fistpl	snext
	fistpl	tnext

	movl	C(tadjust),%ebx
	movl	C(sadjust),%eax

	addl	snext,%eax		// %eax = snext + sadjust
	addl	tnext,%ebx		// %ebx = tnext + tadjust

	movl	C(bbextents),%ebp
	movl	C(bbextentt),%edx

	cmpl	$2048,%eax
	jl		LClampLow4
	cmpl	%ebp,%eax
	ja		LClampHigh4
LClampReentry4:
	movl	%eax,snext

	cmpl	$2048,%ebx
	jl		LClampLow5
	cmpl	%edx,%ebx
	ja		LClampHigh5
LClampReentry5:

	cmpl	$1,%ecx			// a single step is simply the whole delta,
	je		LOnlyOneStep	//  so no reciprocal multiply is needed
							// (two pixels in the segment)
	subl	s,%eax			// s delta across the segment
	subl	t,%ebx			// t delta across the segment

// One-operand imull leaves the 64-bit product in %edx:%eax; the step is
// taken from the high half in %edx.  %ecx is 2..7 here, so the -8 bias makes
// the table index %ecx-2.
	addl	%eax,%eax		// convert to 15.17 format so multiply by 1.31
	addl	%ebx,%ebx		//  reciprocal yields 16.48
	imull	reciprocal_table-8(,%ecx,4) // sstep = (snext - s) / (spancount-1)
	movl	%edx,%ebp

	movl	%ebx,%eax
	imull	reciprocal_table-8(,%ecx,4) // tstep = (tnext - t) / (spancount-1)

// On entry here: %ebp = per-pixel s step, %edx = per-pixel t step (both with
// a 16-bit fraction), %ecx = number of steps (1..7).
LSetEntryvec:
//
// set up advancetable
//
//   advancetable+4 = tstep_whole * cachewidth + sstep_whole
//   advancetable   = advancetable+4 + cachewidth   (used when the t
//                                                    fraction carries)
//
// The handler address fetched from spr8entryvec_table is pushed so that the
// ret at the bottom jumps to it.  The table is defined outside this file;
// presumably entry N holds the handler that draws N+1 pixels -- confirm.
	movl	spr8entryvec_table(,%ecx,4),%ebx
	movl	%edx,%eax
	pushl	%ebx				// entry point into code for RET later
	movl	%ebp,%ecx
	sarl	$16,%ecx			// sstep >>= 16;
	movl	C(cachewidth),%ebx
	sarl	$16,%edx			// tstep >>= 16;
	jz		LIsZeroLast			// skip the multiply when it is zero
	imull	%ebx,%edx			// (tstep >> 16) * cachewidth;
LIsZeroLast:
	addl	%ecx,%edx			// add in sstep
								// (tstep >> 16) * cachewidth + (sstep >> 16);
	movl	tfracf,%ecx
	movl	%edx,advancetable+4	// advance base in t
	addl	%ebx,%edx			// ((tstep >> 16) + 1) * cachewidth +
								//  (sstep >> 16);
	shll	$16,%ebp			// left-justify sstep fractional part
	movl	sfracf,%ebx
	shll	$16,%eax			// left-justify tstep fractional part
	movl	%edx,advancetable	// advance extra in t

	movl	%eax,tstep
	movl	%ebp,sstep
	movl	%ecx,%edx			// %edx = tfracf

// load the registers the pixel handlers expect
	movl	pz,%ecx
	movl	izi,%ebp

	ret							// jump to the number-of-pixels handler

//----------------------------------------

// Exactly one pixel is left to draw.  LEndSpan addresses that pixel with
// hardwired offsets of 14 (z-buffer) and 7 (destination), so both pointers
// are biased back by those amounts.
//
// LEndSpan z-tests against %bp, so %ebp must hold the rotated 1/z.  That is
// already the case when control arrives after a full 8-pixel segment, but a
// span that is a single pixel long gets here straight from the start-of-span
// code, where %ebp was last loaded with bbextentt.  Reloading from izi makes
// both paths correct; on the multi-segment path izi was stored from %ebp just
// before, so the reload changes nothing there.
LNoSteps:
	movl	pz,%ecx
	movl	izi,%ebp		// 1/z for the z test in LEndSpan
	subl	$7,%edi			// adjust for hardwired offset
	subl	$14,%ecx
	jmp		LEndSpan


// Two pixels are left, hence a single step: the per-pixel steps are simply
// the full s and t deltas.
// On entry: %eax = clamped snext, %ebx = clamped tnext, %ecx = 1.
// On exit:  %ebp = s step, %edx = t step, as LSetEntryvec expects.
LOnlyOneStep:
	subl	s,%eax
	subl	t,%ebx
	movl	%eax,%ebp
	movl	%ebx,%edx
	jmp		LSetEntryvec

//----------------------------------------
//
// Entry points for a last segment of 2..7 pixels.  They are exported and are
// reached through the ret at the end of LSetEntryvec, which pops an address
// taken from spr8entryvec_table.
//
// The pixel chain that follows Spr8Entry8_8 uses fixed offsets 0..14 from
// %ecx (z-buffer words) and 0..7 from %edi (destination bytes).  To draw only
// the last N pixels of that chain, each entry biases %edi back by 8-N bytes
// and %ecx by 2*(8-N) bytes and then joins the chain at LLEntryN_8.
//
// On entry: %ecx -> z-buffer word of the first pixel to draw, %edi -> its
//           destination byte, %esi -> its source texel, %ebp = rotated 1/z,
//           %ebx / %edx = s / t fractions.

.globl	Spr8Entry2_8
Spr8Entry2_8:
	subl	$6,%edi		// adjust for hardwired offsets
	subl	$12,%ecx
	movb	(%esi),%al	// NOTE(review): looks redundant -- LLEntry2_8
						//  reloads %al before it is used; confirm
	jmp		LLEntry2_8

//----------------------------------------

.globl	Spr8Entry3_8
Spr8Entry3_8:
	subl	$5,%edi		// adjust for hardwired offsets
	subl	$10,%ecx
	jmp		LLEntry3_8

//----------------------------------------

.globl	Spr8Entry4_8
Spr8Entry4_8:
	subl	$4,%edi		// adjust for hardwired offsets
	subl	$8,%ecx
	jmp		LLEntry4_8

//----------------------------------------

.globl	Spr8Entry5_8
Spr8Entry5_8:
	subl	$3,%edi		// adjust for hardwired offsets
	subl	$6,%ecx
	jmp		LLEntry5_8

//----------------------------------------

.globl	Spr8Entry6_8
Spr8Entry6_8:
	subl	$2,%edi		// adjust for hardwired offsets
	subl	$4,%ecx
	jmp		LLEntry6_8

//----------------------------------------

.globl	Spr8Entry7_8
Spr8Entry7_8:
	decl	%edi		// adjust for hardwired offsets
	subl	$2,%ecx
	jmp		LLEntry7_8

//----------------------------------------

// Entry point for a last segment of 8 pixels, followed by the join points
// LLEntry7_8 .. LLEntry2_8 used by the shorter entries.  This chain draws
// every pixel of the last segment except the final one and then falls
// through into LEndSpan, which draws that final pixel without stepping.
//
// On entry: %ecx -> z-buffer words, %edi -> destination, %esi -> source
//           texel, %ebp = rotated 1/z (tested 16 bits in %bp), %ebx / %edx =
//           s / t fractions, %eax = scratch.
//
// Each step z-tests %bp against the z word (signed; lower means hidden),
// skips texels equal to TRANSPARENT_COLOR, otherwise stores z and texel;
// it then advances 1/z by izistep (the carry is folded back in by adc $0)
// and advances %esi by advancetable+4, or by advancetable when the t
// fraction carries, plus one when the s fraction carries.
.globl	Spr8Entry8_8
Spr8Entry8_8:
	cmpw	(%ecx),%bp
	jl		Lp9
	movb	(%esi),%al
	cmpb	$(TRANSPARENT_COLOR),%al
	jz		Lp9
	movw	%bp,(%ecx)
	movb	%al,(%edi)
Lp9:
	addl	izistep,%ebp
	adcl	$0,%ebp
	addl	tstep,%edx
	sbbl	%eax,%eax
	addl	sstep,%ebx
	adcl	advancetable+4(,%eax,4),%esi
LLEntry7_8:
	cmpw	2(%ecx),%bp
	jl		Lp10
	movb	(%esi),%al
	cmpb	$(TRANSPARENT_COLOR),%al
	jz		Lp10
	movw	%bp,2(%ecx)
	movb	%al,1(%edi)
Lp10:
	addl	izistep,%ebp
	adcl	$0,%ebp
	addl	tstep,%edx
	sbbl	%eax,%eax
	addl	sstep,%ebx
	adcl	advancetable+4(,%eax,4),%esi
LLEntry6_8:
	cmpw	4(%ecx),%bp
	jl		Lp11
	movb	(%esi),%al
	cmpb	$(TRANSPARENT_COLOR),%al
	jz		Lp11
	movw	%bp,4(%ecx)
	movb	%al,2(%edi)
Lp11:
	addl	izistep,%ebp
	adcl	$0,%ebp
	addl	tstep,%edx
	sbbl	%eax,%eax
	addl	sstep,%ebx
	adcl	advancetable+4(,%eax,4),%esi
LLEntry5_8:
	cmpw	6(%ecx),%bp
	jl		Lp12
	movb	(%esi),%al
	cmpb	$(TRANSPARENT_COLOR),%al
	jz		Lp12
	movw	%bp,6(%ecx)
	movb	%al,3(%edi)
Lp12:
	addl	izistep,%ebp
	adcl	$0,%ebp
	addl	tstep,%edx
	sbbl	%eax,%eax
	addl	sstep,%ebx
	adcl	advancetable+4(,%eax,4),%esi
LLEntry4_8:
	cmpw	8(%ecx),%bp
	jl		Lp13
	movb	(%esi),%al
	cmpb	$(TRANSPARENT_COLOR),%al
	jz		Lp13
	movw	%bp,8(%ecx)
	movb	%al,4(%edi)
Lp13:
	addl	izistep,%ebp
	adcl	$0,%ebp
	addl	tstep,%edx
	sbbl	%eax,%eax
	addl	sstep,%ebx
	adcl	advancetable+4(,%eax,4),%esi
LLEntry3_8:
	cmpw	10(%ecx),%bp
	jl		Lp14
	movb	(%esi),%al
	cmpb	$(TRANSPARENT_COLOR),%al
	jz		Lp14
	movw	%bp,10(%ecx)
	movb	%al,5(%edi)
Lp14:
	addl	izistep,%ebp
	adcl	$0,%ebp
	addl	tstep,%edx
	sbbl	%eax,%eax
	addl	sstep,%ebx
	adcl	advancetable+4(,%eax,4),%esi
LLEntry2_8:
	cmpw	12(%ecx),%bp
	jl		Lp15
	movb	(%esi),%al
	cmpb	$(TRANSPARENT_COLOR),%al
	jz		Lp15
	movw	%bp,12(%ecx)
	movb	%al,6(%edi)
Lp15:
	addl	izistep,%ebp
	adcl	$0,%ebp
	addl	tstep,%edx
	sbbl	%eax,%eax
	addl	sstep,%ebx
	adcl	advancetable+4(,%eax,4),%esi

// Draw the final pixel of the span (no stepping afterwards), then clean up
// and move on to the next span.
// On entry: 14(%ecx) is the pixel's z-buffer word, 7(%edi) its destination
//           byte, %esi -> its source texel, %bp = 1/z to test and store;
//           FP stack = 1/z | t/z | s/z; the span pointer is on the CPU stack.
LEndSpan:
	cmpw	14(%ecx),%bp
	jl		Lp16
	movb	(%esi),%al		// load the final texel of the span
	cmpb	$(TRANSPARENT_COLOR),%al
	jz		Lp16
	movw	%bp,14(%ecx)
	movb	%al,7(%edi)
Lp16:

//
// clear s/z, t/z, 1/z from FP stack
//
	fstp %st(0)
	fstp %st(0)
	fstp %st(0)

	popl	%ebx				// restore spans pointer
// Advance to the next descriptor.  A positive count is drawn, a zero count
// is skipped, and a negative count ends the list.
LNextSpan:
	addl	$(sspan_t_size),%ebx // point to next span
	movl	sspan_t_count(%ebx),%ecx
	testl	%ecx,%ecx			// any more spans?
	jg		LSpanLoop			// yes
	jz		LNextSpan			// yes, but this one's empty

	popl	%ebx				// restore register variables
	popl	%esi
	popl	%edi
	popl	%ebp				// restore the caller's stack frame
	ret

#endif	// id386
