1 /*
2 Copyright (C) 1997-2001 Id Software, Inc.
3 
4 This program is free software; you can redistribute it and/or
5 modify it under the terms of the GNU General Public License
6 as published by the Free Software Foundation; either version 2
7 of the License, or (at your option) any later version.
8 
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12 
13 See the GNU General Public License for more details.
14 
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
18 
19 */
20 #include "sw_local.h"
21 
22 vec3_t r_pright, r_pup, r_ppn;
23 
24 #define PARTICLE_33     0
25 #define PARTICLE_66     1
26 #define PARTICLE_OPAQUE 2
27 
28 typedef struct
29 {
30 	particle_t *particle;
31 	int         level;
32 	int         color;
33 } partparms_t;
34 
35 static partparms_t partparms;
36 
37 #if id386 && defined _MSC_VER
38 
/*
** Address the asm R_DrawParticle loads from while its 1/z divide is in
** flight.  R_DrawParticles sets it to the next particle in the list (or to
** the start of the list for the last particle) before each call.
*/
static unsigned int s_prefetch_address;
40 
41 /*
42 ** BlendParticleXX
43 **
44 ** Inputs:
45 ** EAX = color
46 ** EDI = pdest
47 **
48 ** Scratch:
49 ** EBX = scratch (dstcolor)
50 ** EBP = scratch
51 **
52 ** Outputs:
53 ** none
54 */
BlendParticle33(void)55 __declspec(naked) void BlendParticle33( void )
56 {
57 	//	return vid.alphamap[color + dstcolor*256];
58 	__asm mov ebp, vid.alphamap
59 	__asm xor ebx, ebx
60 
61 	__asm mov bl,  byte ptr [edi]
62 	__asm shl ebx, 8
63 
64 	__asm add ebp, ebx
65 	__asm add ebp, eax
66 
67 	__asm mov al,  byte ptr [ebp]
68 
69 	__asm mov byte ptr [edi], al
70 
71 	__asm ret
72 }
73 
BlendParticle66(void)74 __declspec(naked) void BlendParticle66( void )
75 {
76 	//	return vid.alphamap[pcolor*256 + dstcolor];
77 	__asm mov ebp, vid.alphamap
78 	__asm xor ebx, ebx
79 
80 	__asm shl eax,  8
81 	__asm mov bl,   byte ptr [edi]
82 
83 	__asm add ebp, ebx
84 	__asm add ebp, eax
85 
86 	__asm mov al,  byte ptr [ebp]
87 
88 	__asm mov byte ptr [edi], al
89 
90 	__asm ret
91 }
92 
BlendParticle100(void)93 __declspec(naked) void BlendParticle100( void )
94 {
95 	__asm mov byte ptr [edi], al
96 	__asm ret
97 }
98 
99 /*
100 ** R_DrawParticle (asm version)
101 **
102 ** Since we use __declspec( naked ) we don't have a stack frame
103 ** that we can use.  Since I want to reserve EBP anyway, I tossed
104 ** all the important variables into statics.  This routine isn't
105 ** meant to be re-entrant, so this shouldn't cause any problems
106 ** other than a slightly higher global memory footprint.
107 **
108 */
R_DrawParticle(void)109 __declspec(naked) void R_DrawParticle( void )
110 {
111 	static vec3_t	local, transformed;
112 	static float	zi;
113 	static int      u, v, tmp;
114 	static short    izi;
115 	static int      ebpsave;
116 
117 	static byte (*blendfunc)(void);
118 
119 	/*
120 	** must be memvars since x86 can't load constants
121 	** directly.  I guess I could use fld1, but that
122 	** actually costs one more clock than fld [one]!
123 	*/
124 	static float    particle_z_clip    = PARTICLE_Z_CLIP;
125 	static float    one                = 1.0F;
126 	static float    point_five         = 0.5F;
127 	static float    eight_thousand_hex = 0x8000;
128 
129 	/*
130 	** save trashed variables
131 	*/
132 	__asm mov  ebpsave, ebp
133 	__asm push esi
134 	__asm push edi
135 
136 	/*
137 	** transform the particle
138 	*/
139 	// VectorSubtract (pparticle->origin, r_origin, local);
140 	__asm mov  esi, partparms.particle
141 	__asm fld  dword ptr [esi+0]          ; p_o.x
142 	__asm fsub dword ptr [r_origin+0]     ; p_o.x-r_o.x
143 	__asm fld  dword ptr [esi+4]          ; p_o.y | p_o.x-r_o.x
144 	__asm fsub dword ptr [r_origin+4]     ; p_o.y-r_o.y | p_o.x-r_o.x
145 	__asm fld  dword ptr [esi+8]          ; p_o.z | p_o.y-r_o.y | p_o.x-r_o.x
146 	__asm fsub dword ptr [r_origin+8]     ; p_o.z-r_o.z | p_o.y-r_o.y | p_o.x-r_o.x
147 	__asm fxch st(2)                      ; p_o.x-r_o.x | p_o.y-r_o.y | p_o.z-r_o.z
148 	__asm fstp dword ptr [local+0]        ; p_o.y-r_o.y | p_o.z-r_o.z
149 	__asm fstp dword ptr [local+4]        ; p_o.z-r_o.z
150 	__asm fstp dword ptr [local+8]        ; (empty)
151 
152 	// transformed[0] = DotProduct(local, r_pright);
153 	// transformed[1] = DotProduct(local, r_pup);
154 	// transformed[2] = DotProduct(local, r_ppn);
155 	__asm fld  dword ptr [local+0]        ; l.x
156 	__asm fmul dword ptr [r_pright+0]     ; l.x*pr.x
157 	__asm fld  dword ptr [local+4]        ; l.y | l.x*pr.x
158 	__asm fmul dword ptr [r_pright+4]     ; l.y*pr.y | l.x*pr.x
159 	__asm fld  dword ptr [local+8]        ; l.z | l.y*pr.y | l.x*pr.x
160 	__asm fmul dword ptr [r_pright+8]     ; l.z*pr.z | l.y*pr.y | l.x*pr.x
161 	__asm fxch st(2)                      ; l.x*pr.x | l.y*pr.y | l.z*pr.z
162 	__asm faddp st(1), st                 ; l.x*pr.x + l.y*pr.y | l.z*pr.z
163 	__asm faddp st(1), st                 ; l.x*pr.x + l.y*pr.y + l.z*pr.z
164 	__asm fstp  dword ptr [transformed+0] ; (empty)
165 
166 	__asm fld  dword ptr [local+0]        ; l.x
167 	__asm fmul dword ptr [r_pup+0]        ; l.x*pr.x
168 	__asm fld  dword ptr [local+4]        ; l.y | l.x*pr.x
169 	__asm fmul dword ptr [r_pup+4]        ; l.y*pr.y | l.x*pr.x
170 	__asm fld  dword ptr [local+8]        ; l.z | l.y*pr.y | l.x*pr.x
171 	__asm fmul dword ptr [r_pup+8]        ; l.z*pr.z | l.y*pr.y | l.x*pr.x
172 	__asm fxch st(2)                      ; l.x*pr.x | l.y*pr.y | l.z*pr.z
173 	__asm faddp st(1), st                 ; l.x*pr.x + l.y*pr.y | l.z*pr.z
174 	__asm faddp st(1), st                 ; l.x*pr.x + l.y*pr.y + l.z*pr.z
175 	__asm fstp  dword ptr [transformed+4] ; (empty)
176 
177 	__asm fld  dword ptr [local+0]        ; l.x
178 	__asm fmul dword ptr [r_ppn+0]        ; l.x*pr.x
179 	__asm fld  dword ptr [local+4]        ; l.y | l.x*pr.x
180 	__asm fmul dword ptr [r_ppn+4]        ; l.y*pr.y | l.x*pr.x
181 	__asm fld  dword ptr [local+8]        ; l.z | l.y*pr.y | l.x*pr.x
182 	__asm fmul dword ptr [r_ppn+8]        ; l.z*pr.z | l.y*pr.y | l.x*pr.x
183 	__asm fxch st(2)                      ; l.x*pr.x | l.y*pr.y | l.z*pr.z
184 	__asm faddp st(1), st                 ; l.x*pr.x + l.y*pr.y | l.z*pr.z
185 	__asm faddp st(1), st                 ; l.x*pr.x + l.y*pr.y + l.z*pr.z
186 	__asm fstp  dword ptr [transformed+8] ; (empty)
187 
188 	/*
189 	** make sure that the transformed particle is not in front of
190 	** the particle Z clip plane.  We can do the comparison in
191 	** integer space since we know the sign of one of the inputs
192 	** and can figure out the sign of the other easily enough.
193 	*/
194 	//	if (transformed[2] < PARTICLE_Z_CLIP)
195 	//		return;
196 
197 	__asm mov  eax, dword ptr [transformed+8]
198 	__asm and  eax, eax
199 	__asm js   end
200 	__asm cmp  eax, particle_z_clip
201 	__asm jl   end
202 
203 	/*
204 	** project the point by initiating the 1/z calc
205 	*/
206 	//	zi = 1.0 / transformed[2];
207 	__asm fld   one
208 	__asm fdiv  dword ptr [transformed+8]
209 
210 	/*
211 	** bind the blend function pointer to the appropriate blender
212 	** while we're dividing
213 	*/
214 	//if ( level == PARTICLE_33 )
215 	//	blendparticle = BlendParticle33;
216 	//else if ( level == PARTICLE_66 )
217 	//	blendparticle = BlendParticle66;
218 	//else
219 	//	blendparticle = BlendParticle100;
220 
221 	__asm cmp partparms.level, PARTICLE_66
222 	__asm je  blendfunc_66
223 	__asm jl  blendfunc_33
224 	__asm lea ebx, BlendParticle100
225 	__asm jmp done_selecting_blend_func
226 blendfunc_33:
227 	__asm lea ebx, BlendParticle33
228 	__asm jmp done_selecting_blend_func
229 blendfunc_66:
230 	__asm lea ebx, BlendParticle66
231 done_selecting_blend_func:
232 	__asm mov blendfunc, ebx
233 
234 	// prefetch the next particle
235 	__asm mov ebp, s_prefetch_address
236 	__asm mov ebp, [ebp]
237 
238 	// finish the above divide
239 	__asm fstp  zi
240 
241 	// u = (int)(xcenter + zi * transformed[0] + 0.5);
242 	// v = (int)(ycenter - zi * transformed[1] + 0.5);
243 	__asm fld   zi                           ; zi
244 	__asm fmul  dword ptr [transformed+0]    ; zi * transformed[0]
245 	__asm fld   zi                           ; zi | zi * transformed[0]
246 	__asm fmul  dword ptr [transformed+4]    ; zi * transformed[1] | zi * transformed[0]
247 	__asm fxch  st(1)                        ; zi * transformed[0] | zi * transformed[1]
248 	__asm fadd  xcenter                      ; xcenter + zi * transformed[0] | zi * transformed[1]
249 	__asm fxch  st(1)                        ; zi * transformed[1] | xcenter + zi * transformed[0]
250 	__asm fld   ycenter                      ; ycenter | zi * transformed[1] | xcenter + zi * transformed[0]
251     __asm fsubrp st(1), st(0)                ; ycenter - zi * transformed[1] | xcenter + zi * transformed[0]
252   	__asm fxch  st(1)                        ; xcenter + zi * transformed[0] | ycenter + zi * transformed[1]
253   	__asm fadd  point_five                   ; xcenter + zi * transformed[0] + 0.5 | ycenter - zi * transformed[1]
254   	__asm fxch  st(1)                        ; ycenter - zi * transformed[1] | xcenter + zi * transformed[0] + 0.5
255   	__asm fadd  point_five                   ; ycenter - zi * transformed[1] + 0.5 | xcenter + zi * transformed[0] + 0.5
256   	__asm fxch  st(1)                        ; u | v
257   	__asm fistp dword ptr [u]                ; v
258   	__asm fistp dword ptr [v]                ; (empty)
259 
260 	/*
261 	** clip out the particle
262 	*/
263 
264 	//	if ((v > d_vrectbottom_particle) ||
265 	//		(u > d_vrectright_particle) ||
266 	//		(v < d_vrecty) ||
267 	//		(u < d_vrectx))
268 	//	{
269 	//		return;
270 	//	}
271 
272 	__asm mov ebx, u
273 	__asm mov ecx, v
274 	__asm cmp ecx, d_vrectbottom_particle
275 	__asm jg  end
276 	__asm cmp ecx, d_vrecty
277 	__asm jl  end
278 	__asm cmp ebx, d_vrectright_particle
279 	__asm jg  end
280 	__asm cmp ebx, d_vrectx
281 	__asm jl  end
282 
283 	/*
284 	** compute addresses of zbuffer, framebuffer, and
285 	** compute the Z-buffer reference value.
286 	**
287 	** EBX      = U
288 	** ECX      = V
289 	**
290 	** Outputs:
291 	** ESI = Z-buffer address
292 	** EDI = framebuffer address
293 	*/
294 	// ESI = d_pzbuffer + (d_zwidth * v) + u;
295 	__asm mov esi, d_pzbuffer             ; esi = d_pzbuffer
296 	__asm mov eax, d_zwidth               ; eax = d_zwidth
297 	__asm mul ecx                         ; eax = d_zwidth*v
298 	__asm add eax, ebx                    ; eax = d_zwidth*v+u
299 	__asm shl eax, 1                      ; eax = 2*(d_zwidth*v+u)
300 	__asm add esi, eax                    ; esi = ( short * ) ( d_pzbuffer + ( d_zwidth * v ) + u )
301 
302 	// initiate
303 	// izi = (int)(zi * 0x8000);
304 	__asm fld  zi
305 	__asm fmul eight_thousand_hex
306 
307 	// EDI = pdest = d_viewbuffer + d_scantable[v] + u;
308 	__asm lea edi, [d_scantable+ecx*4]
309 	__asm mov edi, [edi]
310 	__asm add edi, d_viewbuffer
311 	__asm add edi, ebx
312 
313 	// complete
314 	// izi = (int)(zi * 0x8000);
315 	__asm fistp tmp
316 	__asm mov   eax, tmp
317 	__asm mov   izi, ax
318 
319 	/*
320 	** determine the screen area covered by the particle,
321 	** which also means clamping to a min and max
322 	*/
323 	//	pix = izi >> d_pix_shift;
324 	__asm xor edx, edx
325 	__asm mov dx, izi
326 	__asm mov ecx, d_pix_shift
327 	__asm shr dx, cl
328 
329 	//	if (pix < d_pix_min)
330 	//		pix = d_pix_min;
331 	__asm cmp edx, d_pix_min
332 	__asm jge check_pix_max
333 	__asm mov edx, d_pix_min
334 	__asm jmp skip_pix_clamp
335 
336 	//	else if (pix > d_pix_max)
337 	//		pix = d_pix_max;
338 check_pix_max:
339 	__asm cmp edx, d_pix_max
340 	__asm jle skip_pix_clamp
341 	__asm mov edx, d_pix_max
342 
343 skip_pix_clamp:
344 
345 	/*
346 	** render the appropriate pixels
347 	**
348 	** ECX = count (used for inner loop)
349 	** EDX = count (used for outer loop)
350 	** ESI = zbuffer
351 	** EDI = framebuffer
352 	*/
353 	__asm mov ecx, edx
354 
355 	__asm cmp ecx, 1
356 	__asm ja  over
357 
358 over:
359 
360 	/*
361 	** at this point:
362 	**
363 	** ECX = count
364 	*/
365 	__asm push ecx
366 	__asm push edi
367 	__asm push esi
368 
369 top_of_pix_vert_loop:
370 
371 top_of_pix_horiz_loop:
372 
373 	//	for ( ; count ; count--, pz += d_zwidth, pdest += screenwidth)
374 	//	{
375 	//		for (i=0 ; i<pix ; i++)
376 	//		{
377 	//			if (pz[i] <= izi)
378 	//			{
379 	//				pdest[i] = blendparticle( color, pdest[i] );
380 	//			}
381 	//		}
382 	//	}
383 	__asm xor   eax, eax
384 
385 	__asm mov   ax, word ptr [esi]
386 
387 	__asm cmp   ax, izi
388 	__asm jg    end_of_horiz_loop
389 
390 #if ENABLE_ZWRITES_FOR_PARTICLES
391   	__asm mov   bp, izi
392   	__asm mov   word ptr [esi], bp
393 #endif
394 
395 	__asm mov   eax, partparms.color
396 
397 	__asm call  [blendfunc]
398 
399 	__asm add   edi, 1
400 	__asm add   esi, 2
401 
402 end_of_horiz_loop:
403 
404 	__asm dec   ecx
405 	__asm jnz   top_of_pix_horiz_loop
406 
407 	__asm pop   esi
408 	__asm pop   edi
409 
410 	__asm mov   ebp, d_zwidth
411 	__asm shl   ebp, 1
412 
413 	__asm add   esi, ebp
414 	__asm add   edi, [r_screenwidth]
415 
416 	__asm pop   ecx
417 	__asm push  ecx
418 
419 	__asm push  edi
420 	__asm push  esi
421 
422 	__asm dec   edx
423 	__asm jnz   top_of_pix_vert_loop
424 
425 	__asm pop   ecx
426 	__asm pop   ecx
427 	__asm pop   ecx
428 
429 end:
430 	__asm pop edi
431 	__asm pop esi
432 	__asm mov ebp, ebpsave
433 	__asm ret
434 }
435 
436 #else
437 
BlendParticle33(int pcolor,int dstcolor)438 static byte BlendParticle33( int pcolor, int dstcolor )
439 {
440 	return vid.alphamap[pcolor + dstcolor*256];
441 }
442 
BlendParticle66(int pcolor,int dstcolor)443 static byte BlendParticle66( int pcolor, int dstcolor )
444 {
445 	return vid.alphamap[pcolor*256+dstcolor];
446 }
447 
BlendParticle100(int pcolor,int dstcolor)448 static byte BlendParticle100( int pcolor, int dstcolor )
449 {
450 	dstcolor = dstcolor;
451 	return pcolor;
452 }
453 
454 /*
455 ** R_DrawParticle
456 **
457 ** Yes, this is amazingly slow, but it's the C reference
458 ** implementation and should be both robust and vaguely
459 ** understandable.  The only time this path should be
460 ** executed is if we're debugging on x86 or if we're
461 ** recompiling and deploying on a non-x86 platform.
462 **
463 ** To minimize error and improve readability I went the
464 ** function pointer route.  This exacts some overhead, but
465 ** it pays off in clean and easy to understand code.
466 */
R_DrawParticle(void)467 void R_DrawParticle( void )
468 {
469 	particle_t *pparticle = partparms.particle;
470 	int         level     = partparms.level;
471 	vec3_t	local, transformed;
472 	float	zi;
473 	byte	*pdest;
474 	short	*pz;
475 	int      color = pparticle->color;
476 	int		i, izi, pix, count, u, v;
477 	byte  (*blendparticle)( int, int );
478 
479 	/*
480 	** transform the particle
481 	*/
482 	VectorSubtract (pparticle->origin, r_origin, local);
483 
484 	transformed[0] = DotProduct(local, r_pright);
485 	transformed[1] = DotProduct(local, r_pup);
486 	transformed[2] = DotProduct(local, r_ppn);
487 
488 	if (transformed[2] < PARTICLE_Z_CLIP)
489 		return;
490 
491 	/*
492 	** bind the blend function pointer to the appropriate blender
493 	*/
494 	if ( level == PARTICLE_33 )
495 		blendparticle = BlendParticle33;
496 	else if ( level == PARTICLE_66 )
497 		blendparticle = BlendParticle66;
498 	else
499 		blendparticle = BlendParticle100;
500 
501 	/*
502 	** project the point
503 	*/
504 	// FIXME: preadjust xcenter and ycenter
505 	zi = 1.0 / transformed[2];
506 	u = (int)(xcenter + zi * transformed[0] + 0.5);
507 	v = (int)(ycenter - zi * transformed[1] + 0.5);
508 
509 	if ((v > d_vrectbottom_particle) ||
510 		(u > d_vrectright_particle) ||
511 		(v < d_vrecty) ||
512 		(u < d_vrectx))
513 	{
514 		return;
515 	}
516 
517 	/*
518 	** compute addresses of zbuffer, framebuffer, and
519 	** compute the Z-buffer reference value.
520 	*/
521 	pz = d_pzbuffer + (d_zwidth * v) + u;
522 	pdest = d_viewbuffer + d_scantable[v] + u;
523 	izi = (int)(zi * 0x8000);
524 
525 	/*
526 	** determine the screen area covered by the particle,
527 	** which also means clamping to a min and max
528 	*/
529 	pix = izi >> d_pix_shift;
530 	if (pix < d_pix_min)
531 		pix = d_pix_min;
532 	else if (pix > d_pix_max)
533 		pix = d_pix_max;
534 
535 	/*
536 	** render the appropriate pixels
537 	*/
538 	count = pix;
539 
540     switch (level) {
541     case PARTICLE_33 :
542         for ( ; count ; count--, pz += d_zwidth, pdest += r_screenwidth)
543         {
544 //FIXME--do it in blocks of 8?
545             for (i=0 ; i<pix ; i++)
546             {
547                 if (pz[i] <= izi)
548                 {
549                     pz[i]    = izi;
550                     pdest[i] = vid.alphamap[color + ((int)pdest[i]<<8)];
551                 }
552             }
553         }
554         break;
555 
556     case PARTICLE_66 :
557         for ( ; count ; count--, pz += d_zwidth, pdest += r_screenwidth)
558         {
559             for (i=0 ; i<pix ; i++)
560             {
561                 if (pz[i] <= izi)
562                 {
563                     pz[i]    = izi;
564                     pdest[i] = vid.alphamap[(color<<8) + (int)pdest[i]];
565                 }
566             }
567         }
568         break;
569 
570     default:  //100
571         for ( ; count ; count--, pz += d_zwidth, pdest += r_screenwidth)
572         {
573             for (i=0 ; i<pix ; i++)
574             {
575                 if (pz[i] <= izi)
576                 {
577                     pz[i]    = izi;
578                     pdest[i] = color;
579                 }
580             }
581         }
582         break;
583     }
584 }
585 
586 #endif	// !id386
587 
588 /*
589 ** R_DrawParticles
590 **
591 ** Responsible for drawing all of the particles in the particle list
592 ** throughout the world.  Doesn't care if we're using the C path or
593 ** if we're using the asm path, it simply assigns a function pointer
594 ** and goes.
595 */
R_DrawParticles(void)596 void R_DrawParticles (void)
597 {
598 	particle_t *p;
599 	int         i;
600 #if id386 && defined _MSC_VER
601 	extern unsigned long fpu_sp24_cw, fpu_chop_cw;
602 #endif
603 
604 	VectorScale( vright, xscaleshrink, r_pright );
605 	VectorScale( vup, yscaleshrink, r_pup );
606 	VectorCopy( vpn, r_ppn );
607 
608 #if id386 && defined _MSC_VER
609 	__asm fldcw word ptr [fpu_sp24_cw]
610 #endif
611 
612 	for (p=r_newrefdef.particles, i=0 ; i<r_newrefdef.num_particles ; i++,p++)
613 	{
614 
615 		if ( p->alpha > 0.66 )
616 			partparms.level = PARTICLE_OPAQUE;
617 		else if ( p->alpha > 0.33 )
618 			partparms.level = PARTICLE_66;
619 		else
620 			partparms.level = PARTICLE_33;
621 
622 		partparms.particle = p;
623 		partparms.color    = p->color;
624 
625 #if id386 && defined _MSC_VER
626 		if ( i < r_newrefdef.num_particles-1 )
627 			s_prefetch_address = ( unsigned int ) ( p + 1 );
628 		else
629 			s_prefetch_address = ( unsigned int ) r_newrefdef.particles;
630 #endif
631 
632 		R_DrawParticle();
633 	}
634 
635 #if id386 && defined _MSC_VER
636 	__asm fldcw word ptr [fpu_chop_cw]
637 #endif
638 
639 }
640 
641