1; $Id: tmap_per.asm,v 1.3 2003/02/18 20:15:48 btb Exp $
2;THE COMPUTER CODE CONTAINED HEREIN IS THE SOLE PROPERTY OF PARALLAX
3;SOFTWARE CORPORATION ("PARALLAX").  PARALLAX, IN DISTRIBUTING THE CODE TO
4;END-USERS, AND SUBJECT TO ALL OF THE TERMS AND CONDITIONS HEREIN, GRANTS A
5;ROYALTY-FREE, PERPETUAL LICENSE TO SUCH END-USERS FOR USE BY SUCH END-USERS
6;IN USING, DISPLAYING,  AND CREATING DERIVATIVE WORKS THEREOF, SO LONG AS
7;SUCH USE, DISPLAY OR CREATION IS FOR NON-COMMERCIAL, ROYALTY OR REVENUE
8;FREE PURPOSES.  IN NO EVENT SHALL THE END-USER USE THE COMPUTER CODE
9;CONTAINED HEREIN FOR REVENUE-BEARING PURPOSES.  THE END-USER UNDERSTANDS
10;AND AGREES TO THE TERMS HEREIN AND ACCEPTS THE SAME BY USE OF THIS FILE.
11;COPYRIGHT 1993-1998 PARALLAX SOFTWARE CORPORATION.  ALL RIGHTS RESERVED.
12;
13;
14; Perspective texture mapper inner loop.
15;
16; Old Log:
17; Revision 1.26  1995/02/20  18:22:55  john
18; Put all the externs in the assembly modules into tmap_inc.asm.
19; Also, moved all the C versions of the inner loops into a new module,
20; scanline.c.
21;
22; Revision 1.25  1995/02/20  17:09:08  john
23; Added code so that you can build the tmapper with no assembly!
24;
25; Revision 1.24  1995/01/10  09:32:07  mike
26; mostly fix garbage at end of scanline, but slow down by 1-4%.
27;
28; Revision 1.23  1994/12/02  23:29:57  mike
29; optimizations.
30;
31; Revision 1.22  1994/11/30  00:57:00  mike
32; optimization.
33;
34; Revision 1.21  1994/11/21  13:57:42  mike
35; fix right side shear bug
36;
37; Revision 1.20  1994/11/12  16:41:09  mike
38; jae -> ja.
39;
40; Revision 1.19  1994/10/27  19:40:00  john
41; Made lighting table lookup be _gr_fade_table[eax] instead
42; of fs:[eax], which gets rid of a segment override that
43; supposedly costs 1 clock on a 486.  Mainly, I wanted to verify
44; that the only reason we need selectors is for the source texture
45; data .
46;
47; Revision 1.18  1994/05/03  11:08:32  mike
48; Trap divide overflows.
49;
50; Revision 1.17  1994/04/21  15:03:41  mike
51; make faster.
52;
53; Revision 1.16  1994/04/08  16:46:57  john
54; Made 32 fade levels. Hacked.
55;
56; Revision 1.15  1994/03/31  08:35:18  mike
57; Fix quantized-by-4 bug in inner loop.
58;
59; Revision 1.14  1994/03/14  17:41:14  mike
60; Fix bug in unlighted version.
61;
62; Revision 1.13  1994/03/14  15:45:14  mike
63; streamline code.
64;
65; Revision 1.12  1994/01/14  14:01:58  mike
66; *** empty log message ***
67;
68; Revision 1.11  1993/12/18  14:43:44  john
69; Messed around with doing 1/z, the u*(1/z) and v*(1/z)
70; (Went from 23 fps to 21 fps... not good! )
71;
72; Revision 1.10  1993/12/17  16:14:17  john
73; Split lighted/nonlighted, so there is no cmp lighting
74; in the inner loop.
75;
76; Revision 1.9  1993/12/17  12:34:29  john
77; Made leftover bytes use linear approx instead of correct...
78; should save about 8 divides per scanline on average.
79; Also, took out anti-aliasing code and rearranged to
80; order of some instructions to help on 486 pipelining.
81; (The anti-aliasing code did *not* look good, so I
82; figure there was no reason to keep it in. )
83;
84; Revision 1.8  1993/12/16  18:37:52  mike
85; Align some stuff on 4 byte boundaries.
86;
87; Revision 1.7  1993/11/30  08:44:18  john
88; Made selector set check for < 64*64 bitmaps.
89;
90; Revision 1.6  1993/11/23  17:25:26  john
91; Added safety "and eax, 0fffh" in lighting lookup.
92;
93; Revision 1.5  1993/11/23  15:08:52  mike
94; Fixed lighting bug.
95;
96; Revision 1.4  1993/11/23  14:38:50  john
97; optimized NORMAL code by switching EBX and ESI, so BH can be used in
98; the lighting process.
99;
100; Revision 1.3  1993/11/23  14:30:53  john
101; Made the perspective tmapper do 1/8 divides; added lighting.
102;
103; Revision 1.2  1993/11/22  10:24:59  mike
104; *** empty log message ***
105;
106; Revision 1.1  1993/09/08  17:29:53  mike
107; Initial revision
108;
109;
110;
111
; Assembler mode, exported entry points, shared include and static data used
; by the perspective texture-mapper scanline code in this file.
112[BITS 32]
113
114global	_asm_tmap_scanline_per
115global	asm_tmap_scanline_per
116
117%include        "tmap_inc.asm"
118
119[SECTION .data]
120align 4
121    ;extern _per2_flag;:dword
122%ifdef __linux__
123; Cater for linux ELF compilers: alias the underscore-prefixed names used below to unprefixed symbols.
124global x
125%define _loop_count loop_count
126%define _new_end new_end
127%define _scan_doubling_flag scan_doubling_flag
128%define _linear_if_far_flag linear_if_far_flag
129%endif
130
131	global _x
132	global _loop_count
133        global _new_end
134	global _scan_doubling_flag
135	global _linear_if_far_flag
136
137;	 global _max_ecx
138;	 global _min_ecx
139
; mem_edx is only referenced from commented-out code in the part of the file visible here.
140    mem_edx dd 0
; x and _x label the same dword (x: carries no storage of its own).
141    x:
142    _x		dd	0
; _loop_count: remaining iterations; holds (pixels - 1) for the per-pixel loops
; and the number of 2^NBITS-pixel spans for the fast loops.
143    _loop_count dd	0
144
145;    _max_ecx	 dd	 0
146;    _min_ecx	 dd	 55555555h
147    _new_end     dd      1       ; if set, use new, but slower, way of finishing off extra pixels on scanline, 01/10/95 --MK
148
; The two flags below are exported but not read by any code visible in this part of the file.
149    _scan_doubling_flag dd 0
150    _linear_if_far_flag dd 0
151
152;---------- local variables
153align 4
; req_base, req_size, U1 and V1 are not referenced by the code visible in this part of the file.
; U0/V0: u/z and v/z at the start of the current span (16.16 after the PDIV sequence).
; num_left_over: pixels remaining after the whole 2^NBITS-pixel spans.
; DU1/DV1/DZ1: per-pixel deltas shifted left by NBITS, i.e. deltas per span.
154    req_base    dd	0
155    req_size    dd	0
156    U0          dd	0
157    U1          dd	0
158    V0          dd	0
159    V1          dd	0
160    num_left_over   dd	0
161    DU1         dd	0
162    DV1         dd	0
163    DZ1         dd	0
164
165[SECTION .text]
166
167; --------------------------------------------------------------------------------------------------
168; Enter:
169;	_xleft	fixed point left x coordinate
170;	_xright	fixed point right x coordinate
171;	_y	fixed point y coordinate
172;	_pixptr	address of source pixel map
173;	_u	fixed point initial u coordinate
174;	_v	fixed point initial v coordinate
175;	_z	fixed point initial z coordinate
176;	_du_dx	fixed point du/dx
177;	_dv_dx	fixed point dv/dx
178;	_dz_dx	fixed point dz/dx
179
180;   for (x = (int) xleft; x <= (int) xright; x++) {
181;      _setcolor(read_pixel_from_tmap(srcb,((int) (u/z)) & 63,((int) (v/z)) & 63));
182;      _setpixel(x,y);
183;
184;      u += du_dx;
185;      v += dv_dx;
186;      z += dz_dx;
187;   }
188
189
; Entry point: draw one perspective-correct texture-mapped scanline.
; All inputs come from globals (_fx_y, _fx_xleft, _fx_xright, _fx_u, _fx_v,
; _fx_z, _fx_l, the _fx_d*_dx deltas, _pixptr, _write_buffer, _y_pointers,
; _per2_flag, _Lighting_on); presumably these are declared via tmap_inc.asm.
; All general registers are preserved (pusha here, popa at _none_to_do).
; Side effects: _loop_count is written, and _fx_l / _fx_dl_dx are shifted
; right by 8 in place.
; On dispatch: ebx = u, ebp = v, ecx = z, edi = first destination byte.
190align	16
191_asm_tmap_scanline_per:
192asm_tmap_scanline_per:
193;        push    es
194	pusha
195
196;---------------------------- setup for loop ---------------------------------
197; Setup for loop:	_loop_count  iterations = (int) xright - (int) xleft  (i.e. pixel count - 1)
198;	esi	not loaded here; the loops add [_pixptr] to each texel offset instead
199;	edi	initial destination pointer = _y_pointers[y] + _write_buffer + x
200; NOTE: fx_xright and fx_xleft changed from fix to int by mk on 12/01/94.
201
202; (historical) set esi = pointer to start of texture map data -- no longer done, see [_pixptr] uses below
203
204; set edi = address of first pixel to modify
205	mov	edi,[_fx_y]
206;        mov     es,[_pixel_data_selector]       ; selector[0*2]
207
208	mov	edi,[_y_pointers+edi*4]	; row offset looked up from the y table
209
210	mov	ebx,[_fx_xleft]
211	test	ebx, ebx
212	jns	ebx_ok
213	xor	ebx, ebx	; clamp a negative left edge to column 0
214ebx_ok:	add	edi,[_write_buffer]
215	add	edi,ebx
216
217; set _loop_count = # of iterations
218	mov	eax,[_fx_xright]
219	sub	eax,ebx
220	js	near _none_to_do	; right edge is left of the (clamped) left edge: nothing to draw
221	mov	[_loop_count],eax
222
223; lighting values are passed in fixed point, but need to be in 8 bit integer, 8 bit fraction so we can easily
224; get the integer by reading %bh
225	sar	dword [_fx_l], 8
226	sar	dword [_fx_dl_dx],8
227	jns	dl_dx_ok	; sign flag comes from the shifted delta
228	inc	dword [_fx_dl_dx]	; round towards 0 for negative deltas
229dl_dx_ok:
230
231; set initial values
232	mov	ebx,[_fx_u]
233	mov	ebp,[_fx_v]
234	mov	ecx,[_fx_z]
235
; _per2_flag == 0 selects the divide-per-pixel loop; otherwise pick the fast
; span loop, lit or unlit according to _Lighting_on.
236	test	dword [_per2_flag],-1
237	je	tmap_loop
238
239	test	dword [_Lighting_on], -1
240        je     near _tmap_loop_fast_nolight
241        jmp     _tmap_loop_fast
242;tmap_loop_fast_nolight_jumper:
243;    jmp tmap_loop_fast_nolight
244
245;================ PERSPECTIVE TEXTURE MAP INNER LOOPS ========================
246;
247; Usage in loop:	eax	division, pixel value
248;	ebx	u
249;	ecx	z
250;	edx	division
251;	ebp	v
252;	esi	source pixel pointer
253;	edi	destination pixel pointer
254
255;-------------------- NORMAL PERSPECTIVE TEXTURE MAP LOOP -----------------
; Divide-per-pixel loop: two idivs per pixel give exact u/z and v/z.
; In:  ebx = u, ebp = v, ecx = z, edi = destination, [_loop_count] = pixels - 1.
; Lighting is applied only when [_Lighting_on] is non-zero (tested per pixel).
; A final colour of 255 is treated as transparent and not written.
; Exits through _none_to_do, which restores the registers saved by pusha.
256tmap_loop:
257	mov	esi, ebx	; esi becomes u coordinate
258
259	align	4
260tmap_loop0:
261
262; compute v coordinate
263	mov	eax, ebp	; get v
264	mov	edx, eax
265	sar	edx, 31	; sign-extend v into edx:eax for idiv
266	idiv	ecx	; eax = (v/z)
267
268	and	eax,3fh	; mask with height-1
269	mov	ebx,eax
270
271; compute u coordinate
272	mov	eax, esi	; get u
273	mov	edx, eax
274	sar	edx, 31	; sign-extend u into edx:eax for idiv
275	idiv	ecx	; eax = (u/z)
276
277	shl 	eax,26	; keep only the low 6 bits of u, moved to the top of eax
278	shld 	ebx,eax,6	; ebx = v*64+u
279
280; read 1 pixel
281        add     ebx, [_pixptr]
282	xor	eax, eax
283	test	dword [_Lighting_on], -1	; flags consumed by the je below (mov leaves them intact)
284        mov     al, [ebx]    ; get pixel from source bitmap
285	je	NoLight1
286
287; LIGHTING CODE
288	mov	ebx, [_fx_l]	; get temp copy of lighting value
289	mov	ah, bh	; get lighting level
290	add	ebx, [_fx_dl_dx]	; update lighting value
291	mov	al, [_gr_fade_table+eax]	; xlat pixel thru lighting tables
292	mov	[_fx_l], ebx	; save temp copy of lighting value
293
294; transparency check (on the lit colour when lighting is enabled)
295NoLight1:	cmp	al,255
296	je	skip1
297
298	mov	[edi],al
299skip1:	inc	edi
300
301; update deltas
302	add	ebp,[_fx_dv_dx]
303	add	esi,[_fx_du_dx]
304	add	ecx,[_fx_dz_dx]
305	je	_div_0_abort	; would be dividing by 0, so abort
306
307	dec	dword [_loop_count]
308	jns	tmap_loop0	; runs _loop_count + 1 times in total
309
; Common exit for every path in this routine: restore registers and return.
310_none_to_do:
311	popa
312;        pop     es
313	ret
314
315; We detected a z=0 condition, which seems pretty bogus, don't you think?
316; So, we abort, but maybe we want to know about it.
; Reached when stepping z produced exactly zero. The rest of the scanline is
; abandoned by jumping to the common exit; nothing is reported.
317_div_0_abort:
318	jmp	_none_to_do
319
320;-------------------------- PER/4 TMAPPER ----------------
321;
322;	x = x1
323;	U0 = u/w; V0 = v/w;
324;	while ( 1 )
325;		u += du_dx*4; v+= dv_dx*4
326;		U1 = u/w; V1 = v/w;
327;		DUDX = (U1-U0)/4; DVDX = (V1-V0)/4;
328;
329;	; Pixel 0
330;		pixels = texmap[V0*64+U0];
331;		U0 += DUDX; V0 += DVDX
332;	; Pixel 1
333;		pixels = (pixels<<8)+texmap[V0*64+U0];
334;		U0 += DUDX; V0 += DVDX
335;	; Pixel 2
336;		pixels = (pixels<<8)+texmap[V0*64+U0];
337;		U0 += DUDX; V0 += DVDX
338;	; Pixel 3
339;		pixels = (pixels<<8)+texmap[V0*64+U0];
340;
341;		screen[x] = pixel
342;		x += 4;
343;		U0 = U1; V0 = V1
344
345NBITS equ 4	; 2^NBITS pixels plotted per divide (16 with the value used here)
346ZSHIFT equ 4	; precision used in PDIV macro: dividend is pre-shifted left by ZSHIFT, quotient by 16-ZSHIFT
347
348
349;PDIV MACRO
350; Returns EAX/ECX in 16.16 format in EAX. Trashes EDX
351;          sig bits   6.3
352;	mov	edx,eax
353;	shl	eax,ZSHIFT
354;	sar	edx,32-ZSHIFT
355;	idiv	ecx	; eax = (v/z)
356;   shl	eax, 16-ZSHIFT
357;ENDM
358
359global _tmap_loop_fast
360
361; -------------------------------------- Start of Getting Dword Aligned ----------------------------------------------
362;	ebx	fx_u
363
; Lit fast path, phase 1: draw single pixels with an exact divide each until
; edi is a multiple of 4, then continue at DwordAligned1.
; In: ebx = u, ebp = v, ecx = z, edi = destination, [_loop_count] = pixels - 1.
; If the scanline ends before alignment is reached, leave via _none_to_do.
364_tmap_loop_fast:
365	mov	esi,ebx	; u lives in esi while ebx is used as scratch
366
367	align	4
368NotDwordAligned1:
369	test	edi, 11b
370	jz	DwordAligned1
371
372; compute v coordinate
373	mov	eax, ebp	; get v
374	mov	edx, eax
375	sar	edx, 31
376	idiv	ecx	; eax = (v/z)
377
378	and	eax,3fh	; mask with height-1
379	mov	ebx,eax
380
381; compute u coordinate
382	mov	eax, esi	; get u
383	mov	edx, eax
384	sar	edx, 31
385	idiv	ecx	; eax = (u/z)
386
387	shl 	eax,26
388	shld 	ebx,eax,6	; ebx = v*64+u
389
390; read 1  pixel
391        add     ebx,[_pixptr]
392	xor	eax, eax
393        mov     al, [ebx]    ; get pixel from source bitmap
394
395; lighting code (the lighting value advances on every pixel, transparent or not)
396	mov	ebx, [_fx_l]	; get temp copy of lighting value
397	mov	ah, bh	; get lighting level
398	add	ebx, [_fx_dl_dx]	; update lighting value
399	mov	[_fx_l], ebx	; save temp copy of lighting value
400
401; transparency check (on the raw texel, before the fade-table lookup)
402	cmp	al,255
403	je	skip2	; this pixel is transparent, so don't write it (or light it)
404
405	mov	al, [_gr_fade_table+eax]	; xlat pixel thru lighting tables
406
407; write 1 pixel
408	mov	[edi],al
409skip2:	inc	edi
410
411; update deltas
412	add	ebp,[_fx_dv_dx]
413	add	esi,[_fx_du_dx]
414	add	ecx,[_fx_dz_dx]
415	je	_div_0_abort	; would be dividing by 0, so abort
416
417	dec	dword [_loop_count]
418	jns	NotDwordAligned1
419
420	jmp	_none_to_do
421
422; -------------------------------------- End of Getting Dword Aligned ----------------------------------------------
423
; Lit fast path, phase 2 setup. edi is now dword aligned.
; Splits the remaining (_loop_count + 1) pixels into whole 2^NBITS-pixel spans
; (stored back into _loop_count) and a remainder (num_left_over). With no
; whole span, the rest of the line is drawn by tmap_loop instead.
; Otherwise computes U0/V0 for the first span and the per-span deltas
; DU1/DV1/DZ1, then falls through into TopOfLoop4.
424DwordAligned1:
425
426	mov	eax, [_loop_count]
427	mov	ebx, esi	; get fx_u [pentium pipelining]
428	inc	eax	; eax = pixels still to draw
429	mov	esi, eax
430	and	esi, (1 << NBITS) - 1	; esi = remainder after whole spans
431	sar	eax, NBITS	; eax = number of whole spans; sets ZF for the je below
432	mov	[num_left_over], esi	; mov does not disturb the flags from sar
433	je	near tmap_loop	; there are no 2^NBITS chunks, do divide/pixel for whole scanline
434	mov	[_loop_count], eax	; _loop_count = pixels / NPIXS
435
436; compute initial v coordinate
437	mov	eax,ebp	; get v
438	mov	edx,ebp
439	shl	eax,ZSHIFT	; edx:eax = v << ZSHIFT as a signed 64-bit dividend
440	sar	edx,32-ZSHIFT
441	idiv	ecx	; eax = (v/z)
442	shl	eax, 16-ZSHIFT	; scale quotient up to 16.16
443	mov	[V0], eax
444
445; compute initial u coordinate
446	mov	eax,ebx	; get u
447	mov	edx,ebx
448	shl	eax,ZSHIFT
449	sar	edx,32-ZSHIFT
450	idiv	ecx	; eax = (u/z)
451	shl	eax, 16-ZSHIFT
452	mov	[U0], eax
453
454; Set deltas to NPIXS pixel increments
455	mov	eax, [_fx_du_dx]
456	shl	eax, NBITS
457	mov	[DU1], eax
458	mov	eax, [_fx_dv_dx]
459	shl	eax, NBITS
460	mov	[DV1], eax
461	mov	eax, [_fx_dz_dx]
462	shl	eax, NBITS
463	mov	[DZ1], eax
464
; Lit fast path, span loop. Each iteration draws 2^NBITS pixels using one
; pair of divides: u/z and v/z are computed exactly at the far end of the
; span (U1,V1) and interpolated linearly from the saved start values (U0,V0).
; In:  ebx = u, ebp = v, ecx = z at span start, edi = destination (dword
;      aligned), [_loop_count] = spans left, [U0]/[V0] = start coordinates.
; Inside the span: esi = packed V:U position, edx = packed DV:DU step (each
;      half is 6 integer bits : 10 fraction bits), ebx = lighting value,
;      ebp = lighting delta. u/v/z are kept on the stack meanwhile.
; Out: falls into EndOfLoop4, which leaves via _none_to_do when there are no
;      left-over pixels and otherwise falls through to the code that follows.
;
; Fix: in the transparency-aware body (repproc1) the lighting value is now
; advanced for every pixel, including transparent ones, matching the other
; lit loops in this file. Previously it was advanced only for opaque texels,
; so lighting lagged after each transparent texel.
465	align	4
466TopOfLoop4:
467	add	ebx, [DU1]
468	add	ebp, [DV1]
469	add	ecx, [DZ1]
470	je	near _div_0_abort	; would be dividing by 0, so abort
471
472; Done with ebx, ebp, ecx until next iteration
473	push	ebx
474	push	ecx
475	push	ebp
476	push	edi
477
478; Find fixed U1
479	mov	eax, ebx
480	mov	edx,ebx
481	shl	eax,ZSHIFT
482	sar	edx,32-ZSHIFT
483	idiv	ecx	; eax = (u/z)
484	shl	eax, 16-ZSHIFT
485	mov	ebx, eax	; ebx = U1 until pop's
486
487; Find fixed V1
488	mov	eax, ebp
489	mov	edx, ebp
490	shl	eax,ZSHIFT
491	sar	edx,32-ZSHIFT
492	idiv	ecx	; eax = (v/z)
493
494	mov	ecx, [U0]	; ecx = U0 until pop's
495	mov	edi, [V0]	; edi = V0 until pop's
496
497	shl	eax, 16-ZSHIFT
498	mov	ebp, eax	; ebp = V1 until pop's
499
500; Make ESI =  V0:U0 in 6:10,6:10 format
501	mov	eax, ecx
502	shr	eax, 6
503	mov	esi, edi
504	shl	esi, 10
505	mov	si, ax
506
507; Make EDX = DV:DU in 6:10,6:10 format
508	mov	eax, ebx
509	sub	eax, ecx
510	sar	eax, NBITS+6
511	mov	edx, ebp
512	sub	edx, edi
513	shl	edx, 10-NBITS	; EDX = (V1-V0) / 2^NBITS in 6:10 int:frac
514	mov	dx, ax	; put delta u in low word
515
516; Save the U1 and V1 so we don't have to divide on the next iteration
517	mov	[U0], ebx
518	mov	[V0], ebp
519
520	pop	edi	; Restore EDI before using it
521
522; LIGHTING CODE
523	mov	ebx, [_fx_l]
524	mov	ebp, [_fx_dl_dx]
525
526	test	dword [_Transparency_on],-1
527	je	near no_trans1
528
; repproc1: two lit pixels with a per-pixel transparency test (texel 255 is
; skipped). Clobbers eax; advances esi, ebx and edi.
529%macro repproc1 0
530	mov	eax, esi	; get u,v
531	shr	eax, 26	; shift out all but int(v)
532	shld	ax,si,6	; shift in u, shifting up v
533	add	esi, edx	; inc u,v
534        add     eax, [_pixptr]
535        movzx   eax, byte [eax]    ; get pixel from source bitmap
536	mov	ah, bh	; form lighting table lookup value
537	add	ebx, ebp	; update lighting value (always, even for transparent texels)
538	cmp	al,255
539	je	%%skipa1
540	mov	al, [_gr_fade_table+eax]	; xlat thru lighting table into dest buffer
541	mov	[edi],al
542%%skipa1:
543	inc	edi
544
545; Do odd pixel
546	mov	eax, esi	; get u,v
547	shr	eax, 26	; shift out all but int(v)
548	shld	ax,si,6	; shift in u, shifting up v
549	add	esi, edx	; inc u,v
550        add     eax,[_pixptr]
551        movzx   eax, byte [eax]    ; get pixel from source bitmap
552	mov	ah, bh	; form lighting table lookup value
553	add	ebx, ebp	; update lighting value (always, even for transparent texels)
554	cmp	al,255
555	je	%%skipa2
556	mov	al, [_gr_fade_table+eax]	; xlat thru lighting table into dest buffer
557	mov	[edi],al
558%%skipa2:
559	inc	edi
560%endmacro
561
562
563%rep (2 << (NBITS-2))
564;	local	skip3,no_trans1
565;	local	skipa1,skipa2
566    repproc1
567%endrep
568
569jmp	cont1
570
571; -------------------------------------------------------
572no_trans1:
573
; repproc2: two lit pixels with no transparency test, collected in cl/ch and
; rotated so that two invocations fill ecx with four pixels in memory order.
; ecx is free here because z is saved on the stack.
574%macro repproc2 0
575	mov	eax, esi	; get u,v
576	shr	eax, 26	; shift out all but int(v)
577	shld	ax,si,6	; shift in u, shifting up v
578	add	esi, edx	; inc u,v
579        add     eax,[_pixptr]
580        movzx   eax, byte [eax]    ; get pixel from source bitmap
581	mov	ah, bh	; form lighting table lookup value
582	add	ebx, ebp	; update lighting value
583	mov	cl, [_gr_fade_table+eax]	; xlat thru lighting table into dest buffer
584
585; Do odd pixel
586	mov	eax, esi	; get u,v
587	shr	eax, 26	; shift out all but int(v)
588	shld	ax,si,6	; shift in u, shifting up v
589	add	esi, edx	; inc u,v
590        add     eax,[_pixptr]
591        movzx   eax, byte [eax]    ; get pixel from source bitmap
592	mov	ah, bh	; form lighting table lookup value
593	add	ebx, ebp	; update lighting value
594	mov	ch, [_gr_fade_table+eax]	; xlat thru lighting table into dest buffer
595
596; ----- This is about 1% faster than the above, and could probably be optimized more.
597; ----- Problem is, it gets the u,v coordinates backwards.  What you would need to do
598; ----- is switch the packing of the u,v coordinates above (about 95 lines up).
599;----------;	mov	eax, esi
600;----------;	shr	ax, 10
601;----------;	rol	eax, 6
602;----------;	mov	dx, ax
603;----------;	add	esi, mem_edx
604;----------;	mov	dl, es:[edx]
605;----------;	mov	dh, bh
606;----------;	add	ebx, ebp
607;----------;	mov	cl, _gr_fade_table[edx]
608;----------;
609;----------;	mov	eax, esi
610;----------;	shr	ax, 10
611;----------;	rol	eax, 6
612;----------;	mov	dx, ax
613;----------;	add	esi, mem_edx
614;----------;	mov	dl, es:[edx]
615;----------;	mov	dh, bh
616;----------;	add	ebx, ebp
617;----------;	mov	ch, _gr_fade_table[edx]
618
619	ror	ecx, 16	; move to next double dest pixel position
620%endmacro
621
622%rep (1 << (NBITS-2))
623
624    repproc2
625    repproc2
626
627	mov 	[edi],ecx	; Draw 4 pixels to display
628	add 	edi,4
629%endrep
630;; pop edx
631cont1:
632
633; -------------------------------------------------------
634
635; LIGHTING CODE
636	mov	[_fx_l], ebx
637	pop	ebp
638	pop	ecx
639	pop	ebx
640	dec	dword [_loop_count]
641	jnz	near TopOfLoop4
642
643EndOfLoop4:
644	test	dword [num_left_over], -1
645	je	near _none_to_do
646
647; ----------------------------------------- Start of LeftOver Pixels ------------------------------------------
; Lit fast path, tail: draw the num_left_over (1 .. 2^NBITS - 1) pixels that
; remain after the whole spans.
; In:  ebx = u, ebp = v, ecx = z, edi = destination, [U0]/[V0] = u/z and v/z
;      at the current pixel, [num_left_over] > 0.
; Path selection: with z1 = z + DZ1, the linear path below is taken when
; z1 >= 0 and 3*z < 4*z1; otherwise, if [_new_end] is set, the remaining
; pixels are drawn with exact divides by NewDoEndPixels.
; The linear path extrapolates one full span ahead, divides once, and
; interpolates; it leaves through _none_to_do from inside repproc3.
;
; Fixes:
;  * dep_loop used cl as the shift count while ecx held z, so setting the
;    count destroyed the low byte of z and updating z destroyed the count.
;    The count now lives in esi and z is parked in edx while cl is used
;    (both registers are dead until the packing code below).
;  * repproc3's odd pixel now adds the lighting delta from ebp like the even
;    pixel does; ebp holds [_fx_dl_dx], so the value is unchanged.
648DoEndPixels:
649	push	ecx
650
651	mov	eax, ecx
652	lea	eax, [eax*2+eax]	; eax = 3*z
653
654	add	ecx, [DZ1]
655	js	notokhere	; z would go negative one span ahead
656	shl	ecx,2	; ecx = 4*(z + DZ1)
657	cmp	eax, ecx
658	pop	ecx	; pop leaves the flags from cmp intact
659	jl	okhere
660	jmp	bah_bah
661notokhere:
662	pop	ecx
663bah_bah:
664        test    dword [_new_end],-1
665	jne	near NewDoEndPixels
666okhere:
667
668	add	ebx, [DU1]
669	add	ebp, [DV1]
670	add	ecx, [DZ1]
671	je	near _div_0_abort
672	jns	dep_cont
673
674; z went negative.
675; this can happen because we added DZ1 to the current z, but dz1 represents dz for perhaps 16 pixels
676; though we might only plot one more pixel.
; Back off by DU1/DV1/DZ1 shifted right by 1, 2, ... NBITS-1 until z is positive again.
677	mov	esi, 1	; esi = current back-off shift count
678
679dep_loop:	mov	edx, ecx	; park z: cl is needed as the shift count
	mov	ecx, esi
	mov	eax, [DU1]
680	sar	eax, cl
681	sub	ebx, eax
682
683	mov	eax, [DV1]
684	sar	eax, cl
685	sub	ebp, eax
686
687	mov	eax, [DZ1]
688	sar	eax, cl
	mov	ecx, edx	; z back in ecx before it is adjusted
689	sub	ecx, eax
690	je	near _div_0_abort
691	jns	dep_cont
692
693	inc	esi
694	cmp	esi, NBITS
695	jne	dep_loop
696
697dep_cont:
698	push	edi	; use edi as a temporary variable
699
; Clamp z to a small positive minimum before dividing.
700	cmp	ecx,1 << (ZSHIFT+1)
701	jg	ecx_ok
702	mov	ecx, 1 << (ZSHIFT+1)
703ecx_ok:
704
705; Find fixed U1
706	mov	eax, ebx
707	;PDIV
708	mov	edx,eax
709	shl	eax,ZSHIFT
710	sar	edx,32-ZSHIFT
711	idiv	ecx	; eax = (u/z)
712	shl	eax, 16-ZSHIFT
713
714	mov	ebx, eax	; ebx = U1 until pop's
715
716; Find fixed V1
717	mov	eax, ebp
718	;PDIV
719	mov	edx,eax
720	shl	eax,ZSHIFT
721	sar	edx,32-ZSHIFT
722	idiv	ecx	; eax = (v/z)
723	shl	eax, 16-ZSHIFT
724
725	mov	ebp, eax	; ebp = V1 until pop's
726
727	mov	ecx, [U0]	; ecx = U0 until pop's
728	mov	edi, [V0]	; edi = V0 until pop's
729
730; Make ESI =  V0:U0 in 6:10,6:10 format
731	mov	eax, ecx
732	shr	eax, 6
733	mov	esi, edi
734	shl	esi, 10
735	mov	si, ax
736
737; Make EDX = DV:DU in 6:10,6:10 format
738	mov	eax, ebx
739	sub	eax, ecx
740	sar	eax, NBITS+6
741	mov	edx, ebp
742	sub	edx, edi
743	shl	edx, 10-NBITS	; EDX = (V1-V0) / 2^NBITS in 6:10 int:frac
744	mov	dx, ax	; put delta u in low word
745
746	pop	edi	; Restore EDI before using it
747
748	mov	ecx, [num_left_over]	; ecx = pixels still to draw
749
750; LIGHTING CODE
751	mov	ebx, [_fx_l]
752	mov	ebp, [_fx_dl_dx]
753
; NOTE(review): ITERATION is introduced with equ but stepped with %assign in
; the %rep below; confirm the assembler in use resolves this as intended.
754    ITERATION equ 0
755
; repproc3: two lit, transparency-tested pixels written at fixed offsets from
; edi (edi itself is not advanced). Leaves via _none_to_do as soon as the
; pixel counter in ecx reaches zero.
756%macro repproc3 0
757; Do even pixel
758	mov	eax, esi	; get u,v
759	shr	eax, 26	; shift out all but int(v)
760	shld	ax,si,6	; shift in u, shifting up v
761        add     eax,[_pixptr]
762        movzx   eax, byte [eax]    ; get pixel from source bitmap
763	add	esi, edx	; inc u,v
764	mov	ah, bh	; form lighting table lookup value
765	add	ebx, ebp	; update lighting value
766	cmp	al,255
767	je	%%skip4
768	mov	al, [_gr_fade_table+eax]	; xlat thru lighting table into dest buffer
769	mov	[edi+ITERATION], al	; write pixel
770%%skip4:	dec	ecx
771	jz	near _none_to_do
772
773; Do odd pixel
774	mov	eax, esi	; get u,v
775	shr	eax, 26	; shift out all but int(v)
776        shld    ax,si,6 ; shift in u, shifting up v
777        add     eax,[_pixptr]
778        movzx   eax, byte [eax]    ; get pixel from source bitmap
779	add	esi, edx	; inc u,v
780	mov	ah, bh	; form lighting table lookup value
781	add	ebx, ebp	; update lighting value
782	cmp	al,255
783	je	%%skip5
784	mov	al, [_gr_fade_table+eax]	; xlat thru lighting table into dest buffer
785	mov	[edi+ITERATION+1], al	; write pixel
786%%skip5:	dec	ecx
787	jz	near _none_to_do
788%endmacro
789
790%rep (1 << (NBITS-1))
791	;local	skip4, skip5
792    repproc3
793%assign ITERATION  ITERATION + 2
794
795%endrep
796
797; Should never get here!!!!
798	int	3
799	jmp	_none_to_do
800
801; ----------------------------------------- End of LeftOver Pixels ------------------------------------------
802
803; --BUGGY NEW--NewDoEndPixels:
804; --BUGGY NEW--	mov	eax, num_left_over
805; --BUGGY NEW--	and	num_left_over, 3
806; --BUGGY NEW--	shr	eax, 2
807; --BUGGY NEW--	je	NDEP_1
808; --BUGGY NEW-- mov	_loop_count, eax
809; --BUGGY NEW--
810; --BUGGY NEW--; do 4 pixels per hunk, not 16, so div deltas by 4 (16/4=4)
811; --BUGGY NEW-- shr DU1,2
812; --BUGGY NEW-- shr DV1,2
813; --BUGGY NEW-- shr DZ1,2
814; --BUGGY NEW--
815; --BUGGY NEW--NDEP_TopOfLoop4:
816; --BUGGY NEW--	add	ebx, DU1
817; --BUGGY NEW--	add	ebp, DV1
818; --BUGGY NEW--	add	ecx, DZ1
819; --BUGGY NEW--	je	_div_0_abort	; would be dividing by 0, so abort
820; --BUGGY NEW--
821; --BUGGY NEW--; Done with ebx, ebp, ecx until next iteration
822; --BUGGY NEW--	push	ebx
823; --BUGGY NEW--	push	ecx
824; --BUGGY NEW--	push	ebp
825; --BUGGY NEW--	push	edi
826; --BUGGY NEW--
827; --BUGGY NEW--; Find fixed U1
828; --BUGGY NEW--	mov	eax, ebx
829; --BUGGY NEW--	mov	edx,ebx
830; --BUGGY NEW--	shl	eax,(ZSHIFT-2)
831; --BUGGY NEW--	sar	edx,32-(ZSHIFT-2)
832; --BUGGY NEW--	idiv	ecx	; eax = (v/z)
833; --BUGGY NEW--	shl	eax, 16-(ZSHIFT-2)
834; --BUGGY NEW--	mov	ebx, eax	; ebx = U1 until pop's
835; --BUGGY NEW--
836; --BUGGY NEW--; Find fixed V1
837; --BUGGY NEW--	mov	eax, ebp
838; --BUGGY NEW--	mov	edx, ebp
839; --BUGGY NEW--	shl	eax,(ZSHIFT-2)
840; --BUGGY NEW--	sar	edx,32-(ZSHIFT-2)
841; --BUGGY NEW--	idiv	ecx	; eax = (v/z)
842; --BUGGY NEW--
843; --BUGGY NEW--	mov	ecx, U0	; ecx = U0 until pop's
844; --BUGGY NEW--	mov	edi, V0	; edi = V0 until pop's
845; --BUGGY NEW--
846; --BUGGY NEW--	shl	eax, 16-(ZSHIFT-2)
847; --BUGGY NEW--	mov	ebp, eax	; ebp = V1 until pop's
848; --BUGGY NEW--
849; --BUGGY NEW--; Make ESI =  V0:U0 in 6:10,6:10 format
850; --BUGGY NEW--	mov	eax, ecx
851; --BUGGY NEW--	shr	eax, 6
852; --BUGGY NEW--	mov	esi, edi
853; --BUGGY NEW--	shl	esi, 10
854; --BUGGY NEW--	mov	si, ax
855; --BUGGY NEW--
856; --BUGGY NEW--; Make EDX = DV:DU in 6:10,6:10 format
857; --BUGGY NEW--	mov	eax, ebx
858; --BUGGY NEW--	sub	eax, ecx
859; --BUGGY NEW--	sar	eax, (NBITS-2)+6
860; --BUGGY NEW--	mov	edx, ebp
861; --BUGGY NEW--	sub	edx, edi
862; --BUGGY NEW--	shl	edx, 10-(NBITS-2)	; EDX = V1-V0/ 4 in 6:10 int:frac
863; --BUGGY NEW--	mov	dx, ax	; put delta u in low word
864; --BUGGY NEW--
865; --BUGGY NEW--; Save the U1 and V1 so we don't have to divide on the next iteration
866; --BUGGY NEW--	mov	U0, ebx
867; --BUGGY NEW--	mov	V0, ebp
868; --BUGGY NEW--
869; --BUGGY NEW--	pop	edi	; Restore EDI before using it
870; --BUGGY NEW--
871; --BUGGY NEW--; LIGHTING CODE
872; --BUGGY NEW--	mov	ebx, _fx_l
873; --BUGGY NEW--	mov	ebp, _fx_dl_dx
874; --BUGGY NEW--
875; --BUGGY NEW--;**	test	_Transparency_on,-1
876; --BUGGY NEW--;**	je	NDEP_no_trans1
877; --BUGGY NEW--
878; --BUGGY NEW--        REPT 2
879; --BUGGY NEW--	local	NDEP_skipa1, NDEP_skipa2
880; --BUGGY NEW--
881; --BUGGY NEW--	mov	eax, esi	; get u,v
882; --BUGGY NEW--	shr	eax, 26	; shift out all but int(v)
883; --BUGGY NEW--	shld	ax,si,6	; shift in u, shifting up v
884; --BUGGY NEW--	add	esi, edx	; inc u,v
885; --BUGGY NEW--	mov 	al, es:[eax]	; get pixel from source bitmap
886; --BUGGY NEW--	cmp	al,255
887; --BUGGY NEW--	je	NDEP_skipa1
888; --BUGGY NEW--	mov	ah, bh	; form lighting table lookup value
889; --BUGGY NEW--	add	ebx, ebp	; update lighting value
890; --BUGGY NEW--	mov	al, _gr_fade_table[eax]	; xlat thru lighting table into dest buffer
891; --BUGGY NEW--	mov	[edi],al
892; --BUGGY NEW--NDEP_skipa1:
893; --BUGGY NEW--	inc	edi
894; --BUGGY NEW--
895; --BUGGY NEW--; Do odd pixel
896; --BUGGY NEW--	mov	eax, esi	; get u,v
897; --BUGGY NEW--	shr	eax, 26	; shift out all but int(v)
898; --BUGGY NEW--	shld	ax,si,6	; shift in u, shifting up v
899; --BUGGY NEW--	add	esi, edx	; inc u,v
900; --BUGGY NEW--	mov 	al, es:[eax]	; get pixel from source bitmap
901; --BUGGY NEW--	cmp	al,255
902; --BUGGY NEW--	je	NDEP_skipa2
903; --BUGGY NEW--	mov	ah, bh	; form lighting table lookup value
904; --BUGGY NEW--	add	ebx, ebp	; update lighting value
905; --BUGGY NEW--	mov	al, _gr_fade_table[eax]	; xlat thru lighting table into dest buffer
906; --BUGGY NEW--	mov	[edi],al
907; --BUGGY NEW--NDEP_skipa2:
908; --BUGGY NEW--	inc	edi
909; --BUGGY NEW--
910; --BUGGY NEW--        ENDM
911; --BUGGY NEW--
912; --BUGGY NEW--	mov	_fx_l, ebx
913; --BUGGY NEW--	pop	ebp
914; --BUGGY NEW--	pop	ecx
915; --BUGGY NEW--	pop	ebx
916; --BUGGY NEW-- dec	_loop_count
917; --BUGGY NEW--	jnz	NDEP_TopOfLoop4
918; --BUGGY NEW--
919; --BUGGY NEW--	test	num_left_over, -1
920; --BUGGY NEW--	je	_none_to_do
921; --BUGGY NEW--
922; --BUGGY NEW--NDEP_1:
923; --BUGGY NEW--	mov	esi,ebx
924; --BUGGY NEW--
925; --BUGGY NEW--	align	4
926; --BUGGY NEW--NDEP_loop:
927; --BUGGY NEW--
928; --BUGGY NEW--; compute v coordinate
929; --BUGGY NEW--	mov	eax, ebp	; get v
930; --BUGGY NEW--	mov	edx, eax
931; --BUGGY NEW--	sar	edx, 31
932; --BUGGY NEW--	idiv	ecx	; eax = (v/z)
933; --BUGGY NEW--
934; --BUGGY NEW--	and	eax,3fh	; mask with height-1
935; --BUGGY NEW--	mov	ebx,eax
936; --BUGGY NEW--
937; --BUGGY NEW--; compute u coordinate
938; --BUGGY NEW--	mov	eax, 	esi	; get u
939; --BUGGY NEW--	mov	edx, eax
940; --BUGGY NEW--	sar	edx, 31
941; --BUGGY NEW--	idiv	ecx	; eax = (u/z)
942; --BUGGY NEW--
943; --BUGGY NEW--	shl 	eax,26
944; --BUGGY NEW--	shld 	ebx,eax,6	; esi = v*64+u
945; --BUGGY NEW--
946; --BUGGY NEW--; read 1  pixel
947; --BUGGY NEW--	xor	eax, eax
948; --BUGGY NEW--	mov	al, es:[ebx]	; get pixel from source bitmap
949; --BUGGY NEW--
950; --BUGGY NEW--; lighting code
951; --BUGGY NEW--	mov	ebx, _fx_l	; get temp copy of lighting value
952; --BUGGY NEW--	mov	ah, bh	; get lighting level
953; --BUGGY NEW--	add	ebx, _fx_dl_dx	; update lighting value
954; --BUGGY NEW--	mov	_fx_l, ebx	; save temp copy of lighting value
955; --BUGGY NEW--
956; --BUGGY NEW--; transparency check
957; --BUGGY NEW--	cmp	al,255
958; --BUGGY NEW--	je	NDEP_skip2	; this pixel is transparent, so don't write it (or light it)
959; --BUGGY NEW--
960; --BUGGY NEW--	mov	al, _gr_fade_table[eax]	; xlat pixel thru lighting tables
961; --BUGGY NEW--
962; --BUGGY NEW--; write 1 pixel
963; --BUGGY NEW--	mov	[edi],al
964; --BUGGY NEW--NDEP_skip2:	inc	edi
965; --BUGGY NEW--
966; --BUGGY NEW--; update deltas
967; --BUGGY NEW--	add	ebp,_fx_dv_dx
968; --BUGGY NEW--	add	esi,_fx_du_dx
969; --BUGGY NEW--	add	ecx,_fx_dz_dx
970; --BUGGY NEW--	je	_div_0_abort	; would be dividing by 0, so abort
971; --BUGGY NEW--
972; --BUGGY NEW--	dec	num_left_over
973; --BUGGY NEW--	jne	NDEP_loop
974; --BUGGY NEW--
975; --BUGGY NEW--	jmp	_none_to_do
976
; Slower tail for the lit fast path: draws the [num_left_over] remaining
; pixels with an exact divide per pixel.
; In:  ebx = u, ebp = v, ecx = z, edi = destination, [num_left_over] > 0
;      (the only visible caller, DoEndPixels, is reached after that test).
; Always applies lighting; a raw texel of 255 is transparent and not written.
; Leaves via _none_to_do (or _div_0_abort if z steps onto zero).
977NewDoEndPixels:
978	mov	esi,ebx	; u lives in esi while ebx is used as scratch
979
980	align	4
981NDEP_loop:
982
983; compute v coordinate
984	mov	eax, ebp	; get v
985	mov	edx, eax
986	sar	edx, 31
987	idiv	ecx	; eax = (v/z)
988
989	and	eax,3fh	; mask with height-1
990	mov	ebx,eax
991
992; compute u coordinate
993	mov	eax, 	esi	; get u
994	mov	edx, eax
995	sar	edx, 31
996	idiv	ecx	; eax = (u/z)
997
998	shl 	eax,26
999	shld 	ebx,eax,6	; ebx = v*64+u
1000
1001; read 1  pixel
1002        add     ebx,[_pixptr]
1003	xor	eax, eax
1004        mov     al, [ebx]    ; get pixel from source bitmap
1005
1006; lighting code (the lighting value advances on every pixel)
1007	mov	ebx, [_fx_l]	; get temp copy of lighting value
1008	mov	ah, bh	; get lighting level
1009	add	ebx, [_fx_dl_dx]	; update lighting value
1010	mov	[_fx_l], ebx	; save temp copy of lighting value
1011
1012; transparency check
1013	cmp	al,255
1014	je	NDEP_skip2	; this pixel is transparent, so don't write it (or light it)
1015
1016	mov	al, [_gr_fade_table+eax]	; xlat pixel thru lighting tables
1017
1018; write 1 pixel
1019	mov	[edi],al
1020NDEP_skip2:	inc	edi
1021
1022; update deltas
1023	add	ebp,[_fx_dv_dx]
1024	add	esi,[_fx_du_dx]
1025	add	ecx,[_fx_dz_dx]
1026        je      near _div_0_abort    ; would be dividing by 0, so abort
1027
1028	dec	dword [num_left_over]
1029	jne	NDEP_loop
1030
1031	jmp	_none_to_do
1032
1033; ==================================================== No Lighting Code ======================================================
; Unlit fast path, phase 1: draw single pixels with an exact divide each
; until edi is a multiple of 4, then continue at DwordAligned1_nolight.
; In: ebx = u, ebp = v, ecx = z, edi = destination, [_loop_count] = pixels - 1.
; Texel value 255 is transparent and not written.
; If the scanline ends before alignment is reached, leave via _none_to_do.
1034global _tmap_loop_fast_nolight
1035_tmap_loop_fast_nolight:
1036	mov	esi,ebx	; u lives in esi while ebx is used as scratch
1037
1038	align	4
1039NotDwordAligned1_nolight:
1040        test    edi, 11b
1041        jz      DwordAligned1_nolight
1042
1043; compute v coordinate
1044	mov	eax,ebp	; get v
1045	mov	edx, eax
1046	sar	edx, 31
1047	idiv	ecx	; eax = (v/z)
1048
1049	and	eax,3fh	; mask with height-1
1050	mov	ebx,eax
1051
1052; compute u coordinate
1053	mov	eax, esi	; get u
1054	mov	edx, eax
1055	sar	edx, 31
1056	idiv	ecx	; eax = (u/z)
1057
1058	shl 	eax,26
1059	shld 	ebx,eax,6	; ebx = v*64+u
1060
1061; read 1  pixel
1062        add     ebx,[_pixptr]
1063        mov     al,[ebx]     ; get pixel from source bitmap
1064
1065; write 1 pixel
1066	cmp	al,255
1067	je	skip6
1068	mov	[edi],al
1069skip6:	inc	edi
1070
1071; update deltas
1072	add	ebp,[_fx_dv_dx]
1073	add	esi,[_fx_du_dx]
1074	add	ecx,[_fx_dz_dx]
1075        je      near _div_0_abort    ; would be dividing by 0, so abort
1076
1077	dec	dword [_loop_count]
1078        jns     NotDwordAligned1_nolight
1079	jmp	_none_to_do
1080
; Unlit fast path, phase 2 setup (edi is dword aligned on entry).
; In:  esi = u, ebp = v, ecx = z, [_loop_count] = pixels - 1.
; Out: ebx = u; [num_left_over] = pixels beyond the whole 2^NBITS-pixel
;      spans; with at least one whole span, [_loop_count] = span count,
;      [V0]/[U0] = v/z and u/z at the span start, and [DU1]/[DV1]/[DZ1] =
;      the per-pixel deltas scaled to one span. With no whole span the rest
;      of the line is handed to tmap_loop and [_loop_count] is left as is.
; Clobbers eax, edx.
DwordAligned1_nolight:
	mov	ebx, esi			; u moves back into ebx

; Split the remaining pixel count into whole spans and a remainder.
	mov	eax, [_loop_count]
	inc	eax				; eax = pixels still to draw
	mov	edx, eax
	and	edx, (1 << NBITS) - 1		; edx = remainder after whole spans
	mov	[num_left_over], edx
	shr	eax, NBITS			; eax = whole spans, ZF set when there are none
	je	near tmap_loop			; no 2^NBITS chunks, do divide/pixel for whole scanline
	mov	[_loop_count], eax		; _loop_count = pixels / NPIXS

; V0 = ((v << ZSHIFT) / z) << (16 - ZSHIFT)
	mov	edx, ebp
	mov	eax, ebp
	sar	edx, 32-ZSHIFT			; edx:eax = v << ZSHIFT, sign-extended
	shl	eax, ZSHIFT
	idiv	ecx
	shl	eax, 16-ZSHIFT
	mov	[V0], eax

; U0 = ((u << ZSHIFT) / z) << (16 - ZSHIFT)
	mov	edx, ebx
	mov	eax, ebx
	sar	edx, 32-ZSHIFT			; edx:eax = u << ZSHIFT, sign-extended
	shl	eax, ZSHIFT
	idiv	ecx
	shl	eax, 16-ZSHIFT
	mov	[U0], eax

; Scale the per-pixel deltas up to one span each.
	mov	eax, [_fx_du_dx]
	shl	eax, NBITS
	mov	[DU1], eax

	mov	eax, [_fx_dv_dx]
	shl	eax, NBITS
	mov	[DV1], eax

	mov	eax, [_fx_dz_dx]
	shl	eax, NBITS
	mov	[DZ1], eax
1128
1129	align	4
TopOfLoop4_nolight:
; One iteration draws one chunk of 2^NBITS pixels.
;   ebx, ebp, ecx = u, v, z at the start of the chunk (preserved across the
;   body via the stack); edi = destination pointer (advanced by the body);
;   U0 / V0 = u/z and v/z at the start of the chunk.
; The perspective divide is done only for the chunk's far end (U1, V1);
; pixels in between are stepped linearly from (U0,V0) towards (U1,V1).

; Step u, v, z to the far end of the chunk.
	add	ebx, [DU1]
	add	ebp, [DV1]
	add	ecx, [DZ1]
	jz	near _div_0_abort		; z == 0: the divides below would fault

; ebx, ecx, ebp become scratch until the pops at the bottom; edi is
; borrowed briefly to hold V0.
	push	ebx
	push	ecx
	push	ebp
	push	edi

; ebx = U1 = u/z at the chunk end.
	mov	eax, ebx
	mov	edx, ebx
	shl	eax, ZSHIFT
	sar	edx, 32-ZSHIFT
	idiv	ecx				; eax = (u << ZSHIFT) / z
	shl	eax, 16-ZSHIFT
	mov	ebx, eax

; ebp = V1 = v/z at the chunk end.
	mov	eax, ebp
	mov	edx, ebp
	shl	eax, ZSHIFT
	sar	edx, 32-ZSHIFT
	idiv	ecx				; eax = (v << ZSHIFT) / z
	shl	eax, 16-ZSHIFT
	mov	ebp, eax

	mov	ecx, [U0]			; ecx = U0
	mov	edi, [V0]			; edi = V0

; esi = V0:U0 packed as 6:10 (high word) , 6:10 (low word).
	mov	esi, edi
	shl	esi, 10
	mov	eax, ecx
	shr	eax, 6
	mov	si, ax

; edx = per-pixel step DV:DU in the same packed layout:
;   high word from (V1 - V0) / 2^NBITS, low word from (U1 - U0) / 2^NBITS.
	mov	edx, ebp
	sub	edx, edi
	shl	edx, 10-NBITS
	mov	eax, ebx
	sub	eax, ecx
	sar	eax, NBITS+6
	mov	dx, ax

; This chunk's end point is the next chunk's start point, so the next
; iteration needs no extra divide.
	mov	[V0], ebp
	mov	[U0], ebx

	pop	edi				; destination pointer back in edi

; repproc4: fetch four texels and write them to [edi..edi+3], then edi += 4.
; For each texel, eax = (int(v) << 6) | int(u) is formed from the packed
; accumulator in esi and used as a byte offset from [_pixptr].  The four
; bytes are gathered in ecx so that ecx = p0 | p1<<8 | p2<<16 | p3<<24.
; When _Transparency_on is non-zero, texels equal to 255 are not written.
%macro repproc4 0
	; texel 0 -> cl
	mov	eax, esi
	shr	eax, 26				; eax = int(v)
	shld	ax, si, 6			; ax = (int(v) << 6) | int(u)
	add	esi, edx			; step u,v
	add	eax, [_pixptr]
	mov	cl, [eax]

	; texel 1 -> ch
	mov	eax, esi
	shr	eax, 26
	shld	ax, si, 6
	add	esi, edx
	add	eax, [_pixptr]
	mov	ch, [eax]
	ror	ecx, 16				; park texels 0,1 in the high half

	; texel 2 -> cl
	mov	eax, esi
	shr	eax, 26
	shld	ax, si, 6
	add	esi, edx
	add	eax, [_pixptr]
	mov	cl, [eax]

	; texel 3 -> ch
	mov	eax, esi
	shr	eax, 26
	shld	ax, si, 6
	add	esi, edx
	add	eax, [_pixptr]
	mov	ch, [eax]
	ror	ecx, 16				; texels 0,1 low again; 2,3 high

	cmp	dword [_Transparency_on], 0
	jne	%%keyed
	mov	[edi], ecx			; opaque: store all four pixels at once
	jmp	%%advance

%%keyed:
	cmp	ecx, -1				; all four texels are 255: nothing to draw
	je	%%advance

	cmp	cl, 255
	je	%%p1
	mov	[edi], cl
%%p1:
	cmp	ch, 255
	je	%%p2
	mov	[edi+1], ch
%%p2:
	ror	ecx, 16				; bring texels 2,3 into cl/ch
	cmp	cl, 255
	je	%%p3
	mov	[edi+2], cl
%%p3:
	cmp	ch, 255
	je	%%advance
	mov	[edi+3], ch

%%advance:
	add	edi, 4
%endmacro

; 2^NBITS pixels per chunk, four per expansion.
%rep (1 << (NBITS-2))
	repproc4
%endrep

; Recover the un-divided u, v, z for the next chunk.
	pop	ebp
	pop	ecx
	pop	ebx
	dec	dword [_loop_count]
	jnz	near TopOfLoop4_nolight
1265
EndOfLoop4_nolight:
; All whole chunks are drawn.  If no remainder pixels are pending the
; scanline is finished; otherwise fall through to draw them.
	cmp	dword [num_left_over], 0
	je	near _none_to_do
1270
DoEndPixels_nolight:
; Draw the final num_left_over pixels of the scanline (fewer than one chunk).
;   ebx, ebp, ecx = u, v, z at the end of the last whole chunk,
;   edi = destination pointer, U0 / V0 = u/z and v/z at that same point.
; The interpolation end point is still taken one FULL chunk ahead and the
; step is (end - start) / 2^NBITS; only num_left_over of those steps are
; actually drawn.
; Every exit from this block is a jump to _none_to_do or _div_0_abort.
	add	ebx, [DU1]
	add	ebp, [DV1]
	add	ecx, [DZ1]
	jz	near _div_0_abort		; z == 0: the divides below would fault
	push	edi				; edi is borrowed to hold V0

; ebx = U1 = u/z one chunk ahead.
	mov	eax, ebx
	mov	edx, ebx
	shl	eax, ZSHIFT
	sar	edx, 32-ZSHIFT
	idiv	ecx				; eax = (u << ZSHIFT) / z
	shl	eax, 16-ZSHIFT
	mov	ebx, eax

; ebp = V1 = v/z one chunk ahead.
	mov	eax, ebp
	mov	edx, ebp
	shl	eax, ZSHIFT
	sar	edx, 32-ZSHIFT
	idiv	ecx				; eax = (v << ZSHIFT) / z
	shl	eax, 16-ZSHIFT
	mov	ebp, eax

	mov	ecx, [U0]			; ecx = U0
	mov	edi, [V0]			; edi = V0

; esi = V0:U0 packed as 6:10 (high word) , 6:10 (low word).
	mov	esi, edi
	shl	esi, 10
	mov	eax, ecx
	shr	eax, 6
	mov	si, ax

; edx = per-pixel step DV:DU in the same packed layout.
	mov	edx, ebp
	sub	edx, edi
	shl	edx, 10-NBITS			; (V1 - V0) / 2^NBITS into the high word
	mov	eax, ebx
	sub	eax, ecx
	sar	eax, NBITS+6			; (U1 - U0) / 2^NBITS as 6:10
	mov	dx, ax

	pop	edi				; destination pointer back in edi

	mov	ecx, [num_left_over]		; ecx = pixels still to draw

; repproc5: draw one pixel at [edi+ITERATION] and leave as soon as the
; remaining-pixel count in ecx reaches zero.  A texel value of 255 is
; skipped rather than written.
; NOTE(review): this check is unconditional here, whereas the chunk loop
; only skips 255 when _Transparency_on is non-zero -- confirm intended.
%assign ITERATION 0
%macro repproc5 0
	mov	eax, esi
	shr	eax, 26				; eax = int(v)
	shld	ax, si, 6			; ax = (int(v) << 6) | int(u)
	add	esi, edx			; step u,v
	add	eax, [_pixptr]
	movzx	eax, byte [eax]			; eax = texel
	cmp	al, 255
	je	%%next
	mov	[edi+ITERATION], al
%%next:
	dec	ecx
	jz	near _none_to_do
%endmacro

; Unrolled for a whole chunk; the dec/jz inside the macro is the real exit.
%rep (1 << NBITS)
	repproc5
%assign ITERATION ITERATION + 1
%endrep

; Should never get here: num_left_over is less than 2^NBITS, so one of the
; expansions above must already have jumped out.
	int	3
	jmp	_none_to_do
1344
1345