1 /*
2 Copyright (C) 1997-2001 Id Software, Inc.
3
4 This program is free software; you can redistribute it and/or
5 modify it under the terms of the GNU General Public License
6 as published by the Free Software Foundation; either version 2
7 of the License, or (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12
13 See the GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18
19 */
20 #include "sw_local.h"
21
22 vec3_t r_pright, r_pup, r_ppn;
23
24 #define PARTICLE_33 0
25 #define PARTICLE_66 1
26 #define PARTICLE_OPAQUE 2
27
28 typedef struct
29 {
30 particle_t *particle;
31 int level;
32 int color;
33 } partparms_t;
34
35 static partparms_t partparms;
36
37 #if id386 && defined _MSC_VER
38
// Address the asm R_DrawParticle reads from to prefetch the next particle;
// R_DrawParticles stores it (as an integer) before every call.
static unsigned int s_prefetch_address;
40
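/*
** BlendParticleXX
**
** Register-based pixel blenders invoked through a function pointer
** by the asm R_DrawParticle.  They take no stack arguments.
**
** Inputs:
** EAX = color
** EDI = pdest
**
** Scratch:
** EBX = scratch (dstcolor)
** EBP = scratch
**
** Outputs:
** byte at [EDI] = resulting pixel (the same value is left in AL)
*/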
BlendParticle33(void)55 __declspec(naked) void BlendParticle33( void )
56 {
57 // return vid.alphamap[color + dstcolor*256];
58 __asm mov ebp, vid.alphamap
59 __asm xor ebx, ebx
60
61 __asm mov bl, byte ptr [edi]
62 __asm shl ebx, 8
63
64 __asm add ebp, ebx
65 __asm add ebp, eax
66
67 __asm mov al, byte ptr [ebp]
68
69 __asm mov byte ptr [edi], al
70
71 __asm ret
72 }
73
BlendParticle66(void)74 __declspec(naked) void BlendParticle66( void )
75 {
76 // return vid.alphamap[pcolor*256 + dstcolor];
77 __asm mov ebp, vid.alphamap
78 __asm xor ebx, ebx
79
80 __asm shl eax, 8
81 __asm mov bl, byte ptr [edi]
82
83 __asm add ebp, ebx
84 __asm add ebp, eax
85
86 __asm mov al, byte ptr [ebp]
87
88 __asm mov byte ptr [edi], al
89
90 __asm ret
91 }
92
BlendParticle100(void)93 __declspec(naked) void BlendParticle100( void )
94 {
95 __asm mov byte ptr [edi], al
96 __asm ret
97 }
98
99 /*
100 ** R_DrawParticle (asm version)
101 **
102 ** Since we use __declspec( naked ) we don't have a stack frame
103 ** that we can use. Since I want to reserve EBP anyway, I tossed
104 ** all the important variables into statics. This routine isn't
105 ** meant to be re-entrant, so this shouldn't cause any problems
106 ** other than a slightly higher global memory footprint.
107 **
108 */
R_DrawParticle(void)109 __declspec(naked) void R_DrawParticle( void )
110 {
111 static vec3_t local, transformed;
112 static float zi;
113 static int u, v, tmp;
114 static short izi;
115 static int ebpsave;
116
117 static byte (*blendfunc)(void);
118
119 /*
120 ** must be memvars since x86 can't load constants
121 ** directly. I guess I could use fld1, but that
122 ** actually costs one more clock than fld [one]!
123 */
124 static float particle_z_clip = PARTICLE_Z_CLIP;
125 static float one = 1.0F;
126 static float point_five = 0.5F;
127 static float eight_thousand_hex = 0x8000;
128
129 /*
130 ** save trashed variables
131 */
132 __asm mov ebpsave, ebp
133 __asm push esi
134 __asm push edi
135
136 /*
137 ** transform the particle
138 */
139 // VectorSubtract (pparticle->origin, r_origin, local);
140 __asm mov esi, partparms.particle
141 __asm fld dword ptr [esi+0] ; p_o.x
142 __asm fsub dword ptr [r_origin+0] ; p_o.x-r_o.x
143 __asm fld dword ptr [esi+4] ; p_o.y | p_o.x-r_o.x
144 __asm fsub dword ptr [r_origin+4] ; p_o.y-r_o.y | p_o.x-r_o.x
145 __asm fld dword ptr [esi+8] ; p_o.z | p_o.y-r_o.y | p_o.x-r_o.x
146 __asm fsub dword ptr [r_origin+8] ; p_o.z-r_o.z | p_o.y-r_o.y | p_o.x-r_o.x
147 __asm fxch st(2) ; p_o.x-r_o.x | p_o.y-r_o.y | p_o.z-r_o.z
148 __asm fstp dword ptr [local+0] ; p_o.y-r_o.y | p_o.z-r_o.z
149 __asm fstp dword ptr [local+4] ; p_o.z-r_o.z
150 __asm fstp dword ptr [local+8] ; (empty)
151
152 // transformed[0] = DotProduct(local, r_pright);
153 // transformed[1] = DotProduct(local, r_pup);
154 // transformed[2] = DotProduct(local, r_ppn);
155 __asm fld dword ptr [local+0] ; l.x
156 __asm fmul dword ptr [r_pright+0] ; l.x*pr.x
157 __asm fld dword ptr [local+4] ; l.y | l.x*pr.x
158 __asm fmul dword ptr [r_pright+4] ; l.y*pr.y | l.x*pr.x
159 __asm fld dword ptr [local+8] ; l.z | l.y*pr.y | l.x*pr.x
160 __asm fmul dword ptr [r_pright+8] ; l.z*pr.z | l.y*pr.y | l.x*pr.x
161 __asm fxch st(2) ; l.x*pr.x | l.y*pr.y | l.z*pr.z
162 __asm faddp st(1), st ; l.x*pr.x + l.y*pr.y | l.z*pr.z
163 __asm faddp st(1), st ; l.x*pr.x + l.y*pr.y + l.z*pr.z
164 __asm fstp dword ptr [transformed+0] ; (empty)
165
166 __asm fld dword ptr [local+0] ; l.x
167 __asm fmul dword ptr [r_pup+0] ; l.x*pr.x
168 __asm fld dword ptr [local+4] ; l.y | l.x*pr.x
169 __asm fmul dword ptr [r_pup+4] ; l.y*pr.y | l.x*pr.x
170 __asm fld dword ptr [local+8] ; l.z | l.y*pr.y | l.x*pr.x
171 __asm fmul dword ptr [r_pup+8] ; l.z*pr.z | l.y*pr.y | l.x*pr.x
172 __asm fxch st(2) ; l.x*pr.x | l.y*pr.y | l.z*pr.z
173 __asm faddp st(1), st ; l.x*pr.x + l.y*pr.y | l.z*pr.z
174 __asm faddp st(1), st ; l.x*pr.x + l.y*pr.y + l.z*pr.z
175 __asm fstp dword ptr [transformed+4] ; (empty)
176
177 __asm fld dword ptr [local+0] ; l.x
178 __asm fmul dword ptr [r_ppn+0] ; l.x*pr.x
179 __asm fld dword ptr [local+4] ; l.y | l.x*pr.x
180 __asm fmul dword ptr [r_ppn+4] ; l.y*pr.y | l.x*pr.x
181 __asm fld dword ptr [local+8] ; l.z | l.y*pr.y | l.x*pr.x
182 __asm fmul dword ptr [r_ppn+8] ; l.z*pr.z | l.y*pr.y | l.x*pr.x
183 __asm fxch st(2) ; l.x*pr.x | l.y*pr.y | l.z*pr.z
184 __asm faddp st(1), st ; l.x*pr.x + l.y*pr.y | l.z*pr.z
185 __asm faddp st(1), st ; l.x*pr.x + l.y*pr.y + l.z*pr.z
186 __asm fstp dword ptr [transformed+8] ; (empty)
187
188 /*
189 ** make sure that the transformed particle is not in front of
190 ** the particle Z clip plane. We can do the comparison in
191 ** integer space since we know the sign of one of the inputs
192 ** and can figure out the sign of the other easily enough.
193 */
194 // if (transformed[2] < PARTICLE_Z_CLIP)
195 // return;
196
197 __asm mov eax, dword ptr [transformed+8]
198 __asm and eax, eax
199 __asm js end
200 __asm cmp eax, particle_z_clip
201 __asm jl end
202
203 /*
204 ** project the point by initiating the 1/z calc
205 */
206 // zi = 1.0 / transformed[2];
207 __asm fld one
208 __asm fdiv dword ptr [transformed+8]
209
210 /*
211 ** bind the blend function pointer to the appropriate blender
212 ** while we're dividing
213 */
214 //if ( level == PARTICLE_33 )
215 // blendparticle = BlendParticle33;
216 //else if ( level == PARTICLE_66 )
217 // blendparticle = BlendParticle66;
218 //else
219 // blendparticle = BlendParticle100;
220
221 __asm cmp partparms.level, PARTICLE_66
222 __asm je blendfunc_66
223 __asm jl blendfunc_33
224 __asm lea ebx, BlendParticle100
225 __asm jmp done_selecting_blend_func
226 blendfunc_33:
227 __asm lea ebx, BlendParticle33
228 __asm jmp done_selecting_blend_func
229 blendfunc_66:
230 __asm lea ebx, BlendParticle66
231 done_selecting_blend_func:
232 __asm mov blendfunc, ebx
233
234 // prefetch the next particle
235 __asm mov ebp, s_prefetch_address
236 __asm mov ebp, [ebp]
237
238 // finish the above divide
239 __asm fstp zi
240
241 // u = (int)(xcenter + zi * transformed[0] + 0.5);
242 // v = (int)(ycenter - zi * transformed[1] + 0.5);
243 __asm fld zi ; zi
244 __asm fmul dword ptr [transformed+0] ; zi * transformed[0]
245 __asm fld zi ; zi | zi * transformed[0]
246 __asm fmul dword ptr [transformed+4] ; zi * transformed[1] | zi * transformed[0]
247 __asm fxch st(1) ; zi * transformed[0] | zi * transformed[1]
248 __asm fadd xcenter ; xcenter + zi * transformed[0] | zi * transformed[1]
249 __asm fxch st(1) ; zi * transformed[1] | xcenter + zi * transformed[0]
250 __asm fld ycenter ; ycenter | zi * transformed[1] | xcenter + zi * transformed[0]
251 __asm fsubrp st(1), st(0) ; ycenter - zi * transformed[1] | xcenter + zi * transformed[0]
252 __asm fxch st(1) ; xcenter + zi * transformed[0] | ycenter + zi * transformed[1]
253 __asm fadd point_five ; xcenter + zi * transformed[0] + 0.5 | ycenter - zi * transformed[1]
254 __asm fxch st(1) ; ycenter - zi * transformed[1] | xcenter + zi * transformed[0] + 0.5
255 __asm fadd point_five ; ycenter - zi * transformed[1] + 0.5 | xcenter + zi * transformed[0] + 0.5
256 __asm fxch st(1) ; u | v
257 __asm fistp dword ptr [u] ; v
258 __asm fistp dword ptr [v] ; (empty)
259
260 /*
261 ** clip out the particle
262 */
263
264 // if ((v > d_vrectbottom_particle) ||
265 // (u > d_vrectright_particle) ||
266 // (v < d_vrecty) ||
267 // (u < d_vrectx))
268 // {
269 // return;
270 // }
271
272 __asm mov ebx, u
273 __asm mov ecx, v
274 __asm cmp ecx, d_vrectbottom_particle
275 __asm jg end
276 __asm cmp ecx, d_vrecty
277 __asm jl end
278 __asm cmp ebx, d_vrectright_particle
279 __asm jg end
280 __asm cmp ebx, d_vrectx
281 __asm jl end
282
283 /*
284 ** compute addresses of zbuffer, framebuffer, and
285 ** compute the Z-buffer reference value.
286 **
287 ** EBX = U
288 ** ECX = V
289 **
290 ** Outputs:
291 ** ESI = Z-buffer address
292 ** EDI = framebuffer address
293 */
294 // ESI = d_pzbuffer + (d_zwidth * v) + u;
295 __asm mov esi, d_pzbuffer ; esi = d_pzbuffer
296 __asm mov eax, d_zwidth ; eax = d_zwidth
297 __asm mul ecx ; eax = d_zwidth*v
298 __asm add eax, ebx ; eax = d_zwidth*v+u
299 __asm shl eax, 1 ; eax = 2*(d_zwidth*v+u)
300 __asm add esi, eax ; esi = ( short * ) ( d_pzbuffer + ( d_zwidth * v ) + u )
301
302 // initiate
303 // izi = (int)(zi * 0x8000);
304 __asm fld zi
305 __asm fmul eight_thousand_hex
306
307 // EDI = pdest = d_viewbuffer + d_scantable[v] + u;
308 __asm lea edi, [d_scantable+ecx*4]
309 __asm mov edi, [edi]
310 __asm add edi, d_viewbuffer
311 __asm add edi, ebx
312
313 // complete
314 // izi = (int)(zi * 0x8000);
315 __asm fistp tmp
316 __asm mov eax, tmp
317 __asm mov izi, ax
318
319 /*
320 ** determine the screen area covered by the particle,
321 ** which also means clamping to a min and max
322 */
323 // pix = izi >> d_pix_shift;
324 __asm xor edx, edx
325 __asm mov dx, izi
326 __asm mov ecx, d_pix_shift
327 __asm shr dx, cl
328
329 // if (pix < d_pix_min)
330 // pix = d_pix_min;
331 __asm cmp edx, d_pix_min
332 __asm jge check_pix_max
333 __asm mov edx, d_pix_min
334 __asm jmp skip_pix_clamp
335
336 // else if (pix > d_pix_max)
337 // pix = d_pix_max;
338 check_pix_max:
339 __asm cmp edx, d_pix_max
340 __asm jle skip_pix_clamp
341 __asm mov edx, d_pix_max
342
343 skip_pix_clamp:
344
345 /*
346 ** render the appropriate pixels
347 **
348 ** ECX = count (used for inner loop)
349 ** EDX = count (used for outer loop)
350 ** ESI = zbuffer
351 ** EDI = framebuffer
352 */
353 __asm mov ecx, edx
354
355 __asm cmp ecx, 1
356 __asm ja over
357
358 over:
359
360 /*
361 ** at this point:
362 **
363 ** ECX = count
364 */
365 __asm push ecx
366 __asm push edi
367 __asm push esi
368
369 top_of_pix_vert_loop:
370
371 top_of_pix_horiz_loop:
372
373 // for ( ; count ; count--, pz += d_zwidth, pdest += screenwidth)
374 // {
375 // for (i=0 ; i<pix ; i++)
376 // {
377 // if (pz[i] <= izi)
378 // {
379 // pdest[i] = blendparticle( color, pdest[i] );
380 // }
381 // }
382 // }
383 __asm xor eax, eax
384
385 __asm mov ax, word ptr [esi]
386
387 __asm cmp ax, izi
388 __asm jg end_of_horiz_loop
389
390 #if ENABLE_ZWRITES_FOR_PARTICLES
391 __asm mov bp, izi
392 __asm mov word ptr [esi], bp
393 #endif
394
395 __asm mov eax, partparms.color
396
397 __asm call [blendfunc]
398
399 __asm add edi, 1
400 __asm add esi, 2
401
402 end_of_horiz_loop:
403
404 __asm dec ecx
405 __asm jnz top_of_pix_horiz_loop
406
407 __asm pop esi
408 __asm pop edi
409
410 __asm mov ebp, d_zwidth
411 __asm shl ebp, 1
412
413 __asm add esi, ebp
414 __asm add edi, [r_screenwidth]
415
416 __asm pop ecx
417 __asm push ecx
418
419 __asm push edi
420 __asm push esi
421
422 __asm dec edx
423 __asm jnz top_of_pix_vert_loop
424
425 __asm pop ecx
426 __asm pop ecx
427 __asm pop ecx
428
429 end:
430 __asm pop edi
431 __asm pop esi
432 __asm mov ebp, ebpsave
433 __asm ret
434 }
435
436 #else
437
BlendParticle33(int pcolor,int dstcolor)438 static byte BlendParticle33( int pcolor, int dstcolor )
439 {
440 return vid.alphamap[pcolor + dstcolor*256];
441 }
442
BlendParticle66(int pcolor,int dstcolor)443 static byte BlendParticle66( int pcolor, int dstcolor )
444 {
445 return vid.alphamap[pcolor*256+dstcolor];
446 }
447
BlendParticle100(int pcolor,int dstcolor)448 static byte BlendParticle100( int pcolor, int dstcolor )
449 {
450 dstcolor = dstcolor;
451 return pcolor;
452 }
453
/*
** R_DrawParticle
**
** Yes, this is amazingly slow, but it's the C reference
** implementation and should be both robust and vaguely
** understandable.  This path is compiled whenever the build
** is not an x86 MSVC build (id386 unset or a non-MSVC
** compiler), e.g. when debugging or when recompiling and
** deploying on another platform.
**
** A blend function pointer is still selected for each
** particle, but the pixel loops do not call through it:
** every blend level has its own loop with the blend
** written inline.
*/
R_DrawParticle(void)467 void R_DrawParticle( void )
468 {
469 particle_t *pparticle = partparms.particle;
470 int level = partparms.level;
471 vec3_t local, transformed;
472 float zi;
473 byte *pdest;
474 short *pz;
475 int color = pparticle->color;
476 int i, izi, pix, count, u, v;
477 byte (*blendparticle)( int, int );
478
479 /*
480 ** transform the particle
481 */
482 VectorSubtract (pparticle->origin, r_origin, local);
483
484 transformed[0] = DotProduct(local, r_pright);
485 transformed[1] = DotProduct(local, r_pup);
486 transformed[2] = DotProduct(local, r_ppn);
487
488 if (transformed[2] < PARTICLE_Z_CLIP)
489 return;
490
491 /*
492 ** bind the blend function pointer to the appropriate blender
493 */
494 if ( level == PARTICLE_33 )
495 blendparticle = BlendParticle33;
496 else if ( level == PARTICLE_66 )
497 blendparticle = BlendParticle66;
498 else
499 blendparticle = BlendParticle100;
500
501 /*
502 ** project the point
503 */
504 // FIXME: preadjust xcenter and ycenter
505 zi = 1.0 / transformed[2];
506 u = (int)(xcenter + zi * transformed[0] + 0.5);
507 v = (int)(ycenter - zi * transformed[1] + 0.5);
508
509 if ((v > d_vrectbottom_particle) ||
510 (u > d_vrectright_particle) ||
511 (v < d_vrecty) ||
512 (u < d_vrectx))
513 {
514 return;
515 }
516
517 /*
518 ** compute addresses of zbuffer, framebuffer, and
519 ** compute the Z-buffer reference value.
520 */
521 pz = d_pzbuffer + (d_zwidth * v) + u;
522 pdest = d_viewbuffer + d_scantable[v] + u;
523 izi = (int)(zi * 0x8000);
524
525 /*
526 ** determine the screen area covered by the particle,
527 ** which also means clamping to a min and max
528 */
529 pix = izi >> d_pix_shift;
530 if (pix < d_pix_min)
531 pix = d_pix_min;
532 else if (pix > d_pix_max)
533 pix = d_pix_max;
534
535 /*
536 ** render the appropriate pixels
537 */
538 count = pix;
539
540 switch (level) {
541 case PARTICLE_33 :
542 for ( ; count ; count--, pz += d_zwidth, pdest += r_screenwidth)
543 {
544 //FIXME--do it in blocks of 8?
545 for (i=0 ; i<pix ; i++)
546 {
547 if (pz[i] <= izi)
548 {
549 pz[i] = izi;
550 pdest[i] = vid.alphamap[color + ((int)pdest[i]<<8)];
551 }
552 }
553 }
554 break;
555
556 case PARTICLE_66 :
557 for ( ; count ; count--, pz += d_zwidth, pdest += r_screenwidth)
558 {
559 for (i=0 ; i<pix ; i++)
560 {
561 if (pz[i] <= izi)
562 {
563 pz[i] = izi;
564 pdest[i] = vid.alphamap[(color<<8) + (int)pdest[i]];
565 }
566 }
567 }
568 break;
569
570 default: //100
571 for ( ; count ; count--, pz += d_zwidth, pdest += r_screenwidth)
572 {
573 for (i=0 ; i<pix ; i++)
574 {
575 if (pz[i] <= izi)
576 {
577 pz[i] = izi;
578 pdest[i] = color;
579 }
580 }
581 }
582 break;
583 }
584 }
585
586 #endif // !id386
587
588 /*
589 ** R_DrawParticles
590 **
591 ** Responsible for drawing all of the particles in the particle list
592 ** throughout the world. Doesn't care if we're using the C path or
593 ** if we're using the asm path, it simply assigns a function pointer
594 ** and goes.
595 */
R_DrawParticles(void)596 void R_DrawParticles (void)
597 {
598 particle_t *p;
599 int i;
600 #if id386 && defined _MSC_VER
601 extern unsigned long fpu_sp24_cw, fpu_chop_cw;
602 #endif
603
604 VectorScale( vright, xscaleshrink, r_pright );
605 VectorScale( vup, yscaleshrink, r_pup );
606 VectorCopy( vpn, r_ppn );
607
608 #if id386 && defined _MSC_VER
609 __asm fldcw word ptr [fpu_sp24_cw]
610 #endif
611
612 for (p=r_newrefdef.particles, i=0 ; i<r_newrefdef.num_particles ; i++,p++)
613 {
614
615 if ( p->alpha > 0.66 )
616 partparms.level = PARTICLE_OPAQUE;
617 else if ( p->alpha > 0.33 )
618 partparms.level = PARTICLE_66;
619 else
620 partparms.level = PARTICLE_33;
621
622 partparms.particle = p;
623 partparms.color = p->color;
624
625 #if id386 && defined _MSC_VER
626 if ( i < r_newrefdef.num_particles-1 )
627 s_prefetch_address = ( unsigned int ) ( p + 1 );
628 else
629 s_prefetch_address = ( unsigned int ) r_newrefdef.particles;
630 #endif
631
632 R_DrawParticle();
633 }
634
635 #if id386 && defined _MSC_VER
636 __asm fldcw word ptr [fpu_chop_cw]
637 #endif
638
639 }
640
641