//
// d_draw16.s
// x86 assembly-language horizontal 8-bpp span-drawing code, with 16-pixel
// subdivision.
//

#include "qasm.h"
#include "d_ifacea.h"

#if	id386

//----------------------------------------------------------------------
// 8-bpp horizontal span drawing code for polygons, with no transparency and
// 16-pixel subdivision.
//
// Assumes there is at least one span in pspans, and that every span
// contains at least one pixel
//----------------------------------------------------------------------

	.data

	.text

// out-of-line, rarely-needed clamping code
// (kept out of the main path so the common case stays branch-fall-through)

// clamp s (%esi) to [0, bbextents]
LClampHigh0:
	movl	C(bbextents),%esi
	jmp	LClampReentry0
LClampHighOrLow0:
	jg	LClampHigh0
	xorl	%esi,%esi
	jmp	LClampReentry0

// clamp t (%edx) to [0, bbextentt]
LClampHigh1:
	movl	C(bbextentt),%edx
	jmp	LClampReentry1
LClampHighOrLow1:
	jg	LClampHigh1
	xorl	%edx,%edx
	jmp	LClampReentry1

// clamp snext (%ebp) to [4096, bbextents]
LClampLow2:
	movl	$4096,%ebp
	jmp	LClampReentry2
LClampHigh2:
	movl	C(bbextents),%ebp
	jmp	LClampReentry2

// clamp tnext (%ecx) to [4096, bbextentt]
LClampLow3:
	movl	$4096,%ecx
	jmp	LClampReentry3
LClampHigh3:
	movl	C(bbextentt),%ecx
	jmp	LClampReentry3

// clamp snext (%eax) to [4096, bbextents] (last-segment path)
LClampLow4:
	movl	$4096,%eax
	jmp	LClampReentry4
LClampHigh4:
	movl	C(bbextents),%eax
	jmp	LClampReentry4

// clamp tnext (%ebx) to [4096, bbextentt] (last-segment path)
LClampLow5:
	movl	$4096,%ebx
	jmp	LClampReentry5
LClampHigh5:
	movl	C(bbextentt),%ebx
	jmp	LClampReentry5


// stack offset of the argument: 4 bytes of return address + 16 bytes of
// saved registers (ebp/edi/esi/ebx pushed below)
#define pspans	4+16

//----------------------------------------------------------------------
// void D_DrawSpans16 (espan_t *pspans)
//
// Draws 8-bpp, non-transparent, perspective-textured spans.  The expensive
// perspective divide (FDIV) is done only once per 16-pixel segment; s and t
// are interpolated linearly inside each segment.  Each segment's FDIV for
// the *next* segment end is started as early as possible so it overlaps the
// integer pixel-copy code (FDIV runs in parallel on the FPU).
//
// In:    pspans(%esp) = linked list of espan_t spans (at least one span,
//        each with at least one pixel — see header comment above).
// Reads: cacheblock, cachewidth, d_viewbuffer, d_scantable, sadjust,
//        tadjust, bbextents, bbextentt, and the d_sdivz*/d_tdivz*/d_zi*
//        gradient globals.
// Saves/restores: %ebp, %edi, %esi, %ebx.  All FP stack entries pushed
//        here are popped before return.
//----------------------------------------------------------------------
	.align 4
.globl C(D_DrawSpans16)
C(D_DrawSpans16):
	pushl	%ebp				// preserve caller's stack frame
	pushl	%edi
	pushl	%esi				// preserve register variables
	pushl	%ebx

//
// set up scaled-by-16 steps, for 16-long segments; also set up cacheblock
// and span list pointers
//
// TODO: any overlap from rearranging?
	flds	C(d_sdivzstepu)
	fmuls	fp_16
	movl	C(cacheblock),%edx
	flds	C(d_tdivzstepu)
	fmuls	fp_16
	movl	pspans(%esp),%ebx	// point to the first span descriptor
	flds	C(d_zistepu)
	fmuls	fp_16
	movl	%edx,pbase		// pbase = cacheblock
	fstps	zi16stepu		// 1/z step per 16 pixels in u
	fstps	tdivz16stepu		// t/z step per 16 pixels in u
	fstps	sdivz16stepu		// s/z step per 16 pixels in u

LSpanLoop:
//
// set up the initial s/z, t/z, and 1/z on the FP stack, and generate the
// initial s and t values
//
// FIXME: pipeline FILD?
	fildl	espan_t_v(%ebx)
	fildl	espan_t_u(%ebx)

	fld	%st(1)			// dv | du | dv
	fmuls	C(d_sdivzstepv)		// dv*d_sdivzstepv | du | dv
	fld	%st(1)			// du | dv*d_sdivzstepv | du | dv
	fmuls	C(d_sdivzstepu)		// du*d_sdivzstepu | dv*d_sdivzstepv | du | dv
	fld	%st(2)			// du | du*d_sdivzstepu | dv*d_sdivzstepv | du | dv
	fmuls	C(d_tdivzstepu)		// du*d_tdivzstepu | du*d_sdivzstepu |
					//  dv*d_sdivzstepv | du | dv
	fxch	%st(1)			// du*d_sdivzstepu | du*d_tdivzstepu |
					//  dv*d_sdivzstepv | du | dv
	faddp	%st(0),%st(2)		// du*d_tdivzstepu |
					//  du*d_sdivzstepu + dv*d_sdivzstepv | du | dv
	fxch	%st(1)			// du*d_sdivzstepu + dv*d_sdivzstepv |
					//  du*d_tdivzstepu | du | dv
	fld	%st(3)			// dv | du*d_sdivzstepu + dv*d_sdivzstepv |
					//  du*d_tdivzstepu | du | dv
	fmuls	C(d_tdivzstepv)		// dv*d_tdivzstepv |
					//  du*d_sdivzstepu + dv*d_sdivzstepv |
					//  du*d_tdivzstepu | du | dv
	fxch	%st(1)			// du*d_sdivzstepu + dv*d_sdivzstepv |
					//  dv*d_tdivzstepv | du*d_tdivzstepu | du | dv
	fadds	C(d_sdivzorigin)	// sdivz = d_sdivzorigin + dv*d_sdivzstepv +
					//  du*d_sdivzstepu; stays in %st(2) at end
	fxch	%st(4)			// dv | dv*d_tdivzstepv | du*d_tdivzstepu | du |
					//  s/z
	fmuls	C(d_zistepv)		// dv*d_zistepv | dv*d_tdivzstepv |
					//  du*d_tdivzstepu | du | s/z
	fxch	%st(1)			// dv*d_tdivzstepv | dv*d_zistepv |
					//  du*d_tdivzstepu | du | s/z
	faddp	%st(0),%st(2)		// dv*d_zistepv |
					//  dv*d_tdivzstepv + du*d_tdivzstepu | du | s/z
	fxch	%st(2)			// du | dv*d_tdivzstepv + du*d_tdivzstepu |
					//  dv*d_zistepv | s/z
	fmuls	C(d_zistepu)		// du*d_zistepu |
					//  dv*d_tdivzstepv + du*d_tdivzstepu |
					//  dv*d_zistepv | s/z
	fxch	%st(1)			// dv*d_tdivzstepv + du*d_tdivzstepu |
					//  du*d_zistepu | dv*d_zistepv | s/z
	fadds	C(d_tdivzorigin)	// tdivz = d_tdivzorigin + dv*d_tdivzstepv +
					//  du*d_tdivzstepu; stays in %st(1) at end
	fxch	%st(2)			// dv*d_zistepv | du*d_zistepu | t/z | s/z
	faddp	%st(0),%st(1)		// dv*d_zistepv + du*d_zistepu | t/z | s/z

	flds	fp_64k			// fp_64k | dv*d_zistepv + du*d_zistepu | t/z | s/z
	fxch	%st(1)			// dv*d_zistepv + du*d_zistepu | fp_64k | t/z | s/z
	fadds	C(d_ziorigin)		// zi = d_ziorigin + dv*d_zistepv +
					//  du*d_zistepu; stays in %st(0) at end
					// 1/z | fp_64k | t/z | s/z
//
// calculate and clamp s & t
//
	fdivr	%st(0),%st(1)		// 1/z | z*64k | t/z | s/z

//
// point %edi to the first pixel in the span
//
	movl	C(d_viewbuffer),%ecx
	movl	espan_t_v(%ebx),%eax
	movl	%ebx,pspantemp		// preserve spans pointer

	movl	C(tadjust),%edx
	movl	C(sadjust),%esi
	movl	C(d_scantable)(,%eax,4),%edi	// v * screenwidth
	addl	%ecx,%edi
	movl	espan_t_u(%ebx),%ecx
	addl	%ecx,%edi		// pdest = &pdestspan[scans->u];
	movl	espan_t_count(%ebx),%ecx

//
// now start the FDIV for the end of the span
//
	cmpl	$16,%ecx
	ja	LSetupNotLast1

	decl	%ecx
	jz	LCleanup1		// if only one pixel, no need to start an FDIV
	movl	%ecx,spancountminus1

// finish up the s and t calcs
	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

	fld	%st(0)			// z*64k | z*64k | 1/z | t/z | s/z
	fmul	%st(4),%st(0)		// s | z*64k | 1/z | t/z | s/z
	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z
	fmul	%st(3),%st(0)		// t | s | 1/z | t/z | s/z
	fxch	%st(1)			// s | t | 1/z | t/z | s/z
	fistpl	s			// 1/z | t | t/z | s/z
	fistpl	t			// 1/z | t/z | s/z

	fildl	spancountminus1

	flds	C(d_tdivzstepu)		// C(d_tdivzstepu) | spancountminus1
	flds	C(d_zistepu)		// C(d_zistepu) | C(d_tdivzstepu) | spancountminus1
	fmul	%st(2),%st(0)		// C(d_zistepu)*scm1 | C(d_tdivzstepu) | scm1
	fxch	%st(1)			// C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1
	fmul	%st(2),%st(0)		// C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1
	fxch	%st(2)			// scm1 | C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1
	fmuls	C(d_sdivzstepu)		// C(d_sdivzstepu)*scm1 | C(d_zistepu)*scm1 |
					//  C(d_tdivzstepu)*scm1
	fxch	%st(1)			// C(d_zistepu)*scm1 | C(d_sdivzstepu)*scm1 |
					//  C(d_tdivzstepu)*scm1
	faddp	%st(0),%st(3)		// C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1
	fxch	%st(1)			// C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1
	faddp	%st(0),%st(3)		// C(d_sdivzstepu)*scm1
	faddp	%st(0),%st(3)

	flds	fp_64k
	fdiv	%st(1),%st(0)		// this is what we've gone to all this trouble to
					//  overlap
	jmp	LFDIVInFlight1

LCleanup1:
// finish up the s and t calcs
	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

	fld	%st(0)			// z*64k | z*64k | 1/z | t/z | s/z
	fmul	%st(4),%st(0)		// s | z*64k | 1/z | t/z | s/z
	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z
	fmul	%st(3),%st(0)		// t | s | 1/z | t/z | s/z
	fxch	%st(1)			// s | t | 1/z | t/z | s/z
	fistpl	s			// 1/z | t | t/z | s/z
	fistpl	t			// 1/z | t/z | s/z
	jmp	LFDIVInFlight1

	.align	4
LSetupNotLast1:
// finish up the s and t calcs
	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

	fld	%st(0)			// z*64k | z*64k | 1/z | t/z | s/z
	fmul	%st(4),%st(0)		// s | z*64k | 1/z | t/z | s/z
	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z
	fmul	%st(3),%st(0)		// t | s | 1/z | t/z | s/z
	fxch	%st(1)			// s | t | 1/z | t/z | s/z
	fistpl	s			// 1/z | t | t/z | s/z
	fistpl	t			// 1/z | t/z | s/z

// step s/z, t/z, 1/z ahead by 16 pixels in u and start the divide for the
// end of this segment
	fadds	zi16stepu
	fxch	%st(2)
	fadds	sdivz16stepu
	fxch	%st(2)
	flds	tdivz16stepu
	faddp	%st(0),%st(2)
	flds	fp_64k
	fdiv	%st(1),%st(0)		// z = 1/1/z
					// this is what we've gone to all this trouble to
					//  overlap
LFDIVInFlight1:

	addl	s,%esi
	addl	t,%edx
	movl	C(bbextents),%ebx
	movl	C(bbextentt),%ebp
	cmpl	%ebx,%esi
	ja	LClampHighOrLow0
LClampReentry0:
	movl	%esi,s
	movl	pbase,%ebx
	shll	$16,%esi
	cmpl	%ebp,%edx
	movl	%esi,sfracf
	ja	LClampHighOrLow1
LClampReentry1:
	movl	%edx,t
	movl	s,%esi			// sfrac = scans->sfrac;
	shll	$16,%edx
	movl	t,%eax			// tfrac = scans->tfrac;
	sarl	$16,%esi
	movl	%edx,tfracf

//
// calculate the texture starting address
//
	sarl	$16,%eax
	movl	C(cachewidth),%edx
	imull	%edx,%eax		// (tfrac >> 16) * cachewidth
	addl	%ebx,%esi
	addl	%eax,%esi		// psource = pbase + (sfrac >> 16) +
					//  ((tfrac >> 16) * cachewidth);
//
// determine whether last span or not
//
	cmpl	$16,%ecx
	jna	LLastSegment

//
// not the last segment; do full 16-wide segment
//
LNotLastSegment:

//
// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to
// get there
//

// pick up after the FDIV that was left in flight previously

	fld	%st(0)			// duplicate it
	fmul	%st(4),%st(0)		// s = s/z * z
	fxch	%st(1)
	fmul	%st(3),%st(0)		// t = t/z * z
	fxch	%st(1)
	fistpl	snext
	fistpl	tnext
	movl	snext,%eax
	movl	tnext,%edx

	movb	(%esi),%bl		// get first source texel
	subl	$16,%ecx		// count off this segment's pixels
	movl	C(sadjust),%ebp
	movl	%ecx,counttemp		// remember count of remaining pixels

	movl	C(tadjust),%ecx
	movb	%bl,(%edi)		// store first dest pixel

	addl	%eax,%ebp
	addl	%edx,%ecx

	movl	C(bbextents),%eax
	movl	C(bbextentt),%edx

	cmpl	$4096,%ebp
	jl	LClampLow2
	cmpl	%eax,%ebp
	ja	LClampHigh2
LClampReentry2:

	cmpl	$4096,%ecx
	jl	LClampLow3
	cmpl	%edx,%ecx
	ja	LClampHigh3
LClampReentry3:

	movl	%ebp,snext
	movl	%ecx,tnext

	subl	s,%ebp			// %ebp = s delta across the 16-pixel segment
	subl	t,%ecx			// %ecx = t delta across the 16-pixel segment

//
// set up advancetable
//
// advancetable[1] = (tstep >> 16)*cachewidth + (sstep >> 16)  (base step)
// advancetable[0] = advancetable[1] + cachewidth              (step when
//                                                              tfrac carries)
// The per-pixel loop turns the tfrac carry into %ecx = -1/0 with sbbl, so
// adcl advancetable+4(,%ecx,4) selects between the two entries (and the adc
// also folds in the sfrac carry).
//
	movl	%ecx,%eax
	movl	%ebp,%edx
	sarl	$20,%eax		// per-pixel tstep int part:
					//  (16.16 delta over 16 pixels) >> 20
	jz	LZero
	sarl	$20,%edx		// per-pixel sstep int part (>> 20, as above)
	movl	C(cachewidth),%ebx
	imull	%ebx,%eax
	jmp	LSetUp1

LZero:
// tstep integer part is zero; skip the imull
	sarl	$20,%edx		// per-pixel sstep int part (>> 20, as above)
	movl	C(cachewidth),%ebx

LSetUp1:

	addl	%edx,%eax		// add in sstep
					//  (tstep >> 16) * cachewidth + (sstep >> 16);
	movl	tfracf,%edx
	movl	%eax,advancetable+4	// advance base in t
	addl	%ebx,%eax		// ((tstep >> 16) + 1) * cachewidth +
					//  (sstep >> 16);
	shll	$12,%ebp		// left-justify per-pixel sstep fraction
					//  ((delta/16) << 16 == delta << 12)
	movl	sfracf,%ebx
	shll	$12,%ecx		// left-justify per-pixel tstep fraction
	movl	%eax,advancetable	// advance extra in t

	movl	%ecx,tstep
	addl	%ecx,%edx		// advance tfrac fractional part by tstep frac

	sbbl	%ecx,%ecx		// turn tstep carry into -1 (0 if none)
	addl	%ebp,%ebx		// advance sfrac fractional part by sstep frac
	adcl	advancetable+4(,%ecx,4),%esi	// point to next source texel

// pixels 1..7: same fetch/step pattern, store offset advances each time
	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	(%esi),%al
	addl	%ebp,%ebx
	movb	%al,1(%edi)
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,2(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,3(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,4(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,5(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,6(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,7(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi


//
// start FDIV for end of next segment in flight, so it can overlap
//
	movl	counttemp,%ecx
	cmpl	$16,%ecx		// more than one segment after this?
	ja	LSetupNotLast2		// yes

	decl	%ecx
	jz	LFDIVInFlight2		// if only one pixel, no need to start an FDIV
	movl	%ecx,spancountminus1
	fildl	spancountminus1

	flds	C(d_zistepu)		// C(d_zistepu) | spancountminus1
	fmul	%st(1),%st(0)		// C(d_zistepu)*scm1 | scm1
	flds	C(d_tdivzstepu)		// C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1
	fmul	%st(2),%st(0)		// C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1
	fxch	%st(1)			// C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1 | scm1
	faddp	%st(0),%st(3)		// C(d_tdivzstepu)*scm1 | scm1
	fxch	%st(1)			// scm1 | C(d_tdivzstepu)*scm1
	fmuls	C(d_sdivzstepu)		// C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1
	fxch	%st(1)			// C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1
	faddp	%st(0),%st(3)		// C(d_sdivzstepu)*scm1
	flds	fp_64k			// 64k | C(d_sdivzstepu)*scm1
	fxch	%st(1)			// C(d_sdivzstepu)*scm1 | 64k
	faddp	%st(0),%st(4)		// 64k

	fdiv	%st(1),%st(0)		// this is what we've gone to all this trouble to
					//  overlap
	jmp	LFDIVInFlight2

	.align	4
LSetupNotLast2:
	fadds	zi16stepu
	fxch	%st(2)
	fadds	sdivz16stepu
	fxch	%st(2)
	flds	tdivz16stepu
	faddp	%st(0),%st(2)
	flds	fp_64k
	fdiv	%st(1),%st(0)		// z = 1/1/z
					// this is what we've gone to all this trouble to
					//  overlap
LFDIVInFlight2:
	movl	%ecx,counttemp

// pixels 8..14 (the FDIV above proceeds in parallel with these stores)
	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,8(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,9(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,10(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,11(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,12(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,13(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,14(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	$16,%edi
	movl	%edx,tfracf
	movl	snext,%edx
	movl	%ebx,sfracf
	movl	tnext,%ebx
	movl	%edx,s
	movl	%ebx,t

	movl	counttemp,%ecx		// retrieve count

//
// determine whether last span or not
//
	cmpl	$16,%ecx		// are there multiple segments remaining?
	movb	%al,-1(%edi)		// store 16th pixel of this segment
	ja	LNotLastSegment		// yes

//
// last segment of scan
//
LLastSegment:

//
// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to
// get there.  The number of pixels left is variable, and we want to land on
// the last pixel, not step one past it, so we can't run into arithmetic
// problems
//
	testl	%ecx,%ecx
	jz	LNoSteps		// just draw the last pixel and we're done

// pick up after the FDIV that was left in flight previously


	fld	%st(0)			// duplicate it
	fmul	%st(4),%st(0)		// s = s/z * z
	fxch	%st(1)
	fmul	%st(3),%st(0)		// t = t/z * z
	fxch	%st(1)
	fistpl	snext
	fistpl	tnext

	movb	(%esi),%al		// load first texel in segment
	movl	C(tadjust),%ebx
	movb	%al,(%edi)		// store first pixel in segment
	movl	C(sadjust),%eax

	addl	snext,%eax
	addl	tnext,%ebx

	movl	C(bbextents),%ebp
	movl	C(bbextentt),%edx

	cmpl	$4096,%eax
	jl	LClampLow4
	cmpl	%ebp,%eax
	ja	LClampHigh4
LClampReentry4:
	movl	%eax,snext

	cmpl	$4096,%ebx
	jl	LClampLow5
	cmpl	%edx,%ebx
	ja	LClampHigh5
LClampReentry5:

	cmpl	$1,%ecx			// don't bother
	je	LOnlyOneStep		// if two pixels in segment, there's only one step,
					//  of the segment length
	subl	s,%eax
	subl	t,%ebx

	addl	%eax,%eax		// convert to 15.17 format so multiply by 1.31
	addl	%ebx,%ebx		//  reciprocal yields 16.48

	imull	reciprocal_table_16-8(,%ecx,4)	// sstep = (snext - s) /
						//  (spancount-1)
	movl	%edx,%ebp

	movl	%ebx,%eax
	imull	reciprocal_table_16-8(,%ecx,4)	// tstep = (tnext - t) /
						//  (spancount-1)
LSetEntryvec:
//
// set up advancetable (same two-entry carry-select scheme as the full
// segment above), then dispatch on the pixel count via entryvec_table_16
//
	movl	entryvec_table_16(,%ecx,4),%ebx
	movl	%edx,%eax
	movl	%ebx,jumptemp		// entry point into code for RET later
	movl	%ebp,%ecx
	sarl	$16,%edx		// tstep >>= 16;
	movl	C(cachewidth),%ebx
	sarl	$16,%ecx		// sstep >>= 16;
	imull	%ebx,%edx

	addl	%ecx,%edx		// add in sstep
					//  (tstep >> 16) * cachewidth + (sstep >> 16);
	movl	tfracf,%ecx
	movl	%edx,advancetable+4	// advance base in t
	addl	%ebx,%edx		// ((tstep >> 16) + 1) * cachewidth +
					//  (sstep >> 16);
	shll	$16,%ebp		// left-justify sstep fractional part
	movl	sfracf,%ebx
	shll	$16,%eax		// left-justify tstep fractional part
	movl	%edx,advancetable	// advance extra in t

	movl	%eax,tstep
	movl	%ecx,%edx
	addl	%eax,%edx
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi

	jmp	*jumptemp		// jump to the number-of-pixels handler

//----------------------------------------

LNoSteps:
	movb	(%esi),%al		// load first texel in segment
	subl	$15,%edi		// adjust for hardwired offset
	jmp	LEndSpan


LOnlyOneStep:
	subl	s,%eax
	subl	t,%ebx
	movl	%eax,%ebp
	movl	%ebx,%edx
	jmp	LSetEntryvec

//----------------------------------------
// Entry stubs for the last (1..16 pixel) segment.  entryvec_table_16
// dispatches here; each stub backs %edi up so the shared unrolled tail
// below (LEntry*_16) can keep its hardwired store offsets, then falls
// into the tail at the right depth.
//----------------------------------------

.globl	Entry2_16, Entry3_16, Entry4_16, Entry5_16
.globl	Entry6_16, Entry7_16, Entry8_16, Entry9_16
.globl	Entry10_16, Entry11_16, Entry12_16, Entry13_16
.globl	Entry14_16, Entry15_16, Entry16_16

Entry2_16:
	subl	$14,%edi		// adjust for hardwired offsets
	movb	(%esi),%al
	jmp	LEntry2_16

//----------------------------------------

Entry3_16:
	subl	$13,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	jmp	LEntry3_16

//----------------------------------------

Entry4_16:
	subl	$12,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp	LEntry4_16

//----------------------------------------

Entry5_16:
	subl	$11,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp	LEntry5_16

//----------------------------------------

Entry6_16:
	subl	$10,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp	LEntry6_16

//----------------------------------------

Entry7_16:
	subl	$9,%edi			// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp	LEntry7_16

//----------------------------------------

Entry8_16:
	subl	$8,%edi			// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp	LEntry8_16

//----------------------------------------

Entry9_16:
	subl	$7,%edi			// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp	LEntry9_16

//----------------------------------------

Entry10_16:
	subl	$6,%edi			// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp	LEntry10_16

//----------------------------------------

Entry11_16:
	subl	$5,%edi			// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp	LEntry11_16

//----------------------------------------

Entry12_16:
	subl	$4,%edi			// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp	LEntry12_16

//----------------------------------------

Entry13_16:
	subl	$3,%edi			// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp	LEntry13_16

//----------------------------------------

Entry14_16:
	subl	$2,%edi			// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp	LEntry14_16

//----------------------------------------

Entry15_16:
	decl	%edi			// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp	LEntry15_16

//----------------------------------------

Entry16_16:
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,1(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry15_16:
	sbbl	%ecx,%ecx
	movb	%al,2(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry14_16:
	sbbl	%ecx,%ecx
	movb	%al,3(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry13_16:
	sbbl	%ecx,%ecx
	movb	%al,4(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry12_16:
	sbbl	%ecx,%ecx
	movb	%al,5(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry11_16:
	sbbl	%ecx,%ecx
	movb	%al,6(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry10_16:
	sbbl	%ecx,%ecx
	movb	%al,7(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry9_16:
	sbbl	%ecx,%ecx
	movb	%al,8(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry8_16:
	sbbl	%ecx,%ecx
	movb	%al,9(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry7_16:
	sbbl	%ecx,%ecx
	movb	%al,10(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry6_16:
	sbbl	%ecx,%ecx
	movb	%al,11(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry5_16:
	sbbl	%ecx,%ecx
	movb	%al,12(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry4_16:
	sbbl	%ecx,%ecx
	movb	%al,13(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
LEntry3_16:
	movb	%al,14(%edi)
	movb	(%esi),%al
LEntry2_16:

LEndSpan:

//
// clear s/z, t/z, 1/z from FP stack
//
	fstp	%st(0)
	fstp	%st(0)
	fstp	%st(0)

	movl	pspantemp,%ebx		// restore spans pointer
	movl	espan_t_pnext(%ebx),%ebx	// point to next span
	testl	%ebx,%ebx		// any more spans?
	movb	%al,15(%edi)		// write the span's final pixel
	jnz	LSpanLoop		// more spans

	popl	%ebx			// restore register variables
	popl	%esi
	popl	%edi
	popl	%ebp			// restore the caller's stack frame
	ret

//----------------------------------------------------------------------
// 8-bpp horizontal span z drawing code for polygons, with no transparency.
955// 956// Assumes there is at least one span in pzspans, and that every span 957// contains at least one pixel 958//---------------------------------------------------------------------- 959 960 .text 961 962// z-clamp on a non-negative gradient span 963LClamp: 964 movl $0x40000000,%edx 965 xorl %ebx,%ebx 966 fstp %st(0) 967 jmp LZDraw 968 969// z-clamp on a negative gradient span 970LClampNeg: 971 movl $0x40000000,%edx 972 xorl %ebx,%ebx 973 fstp %st(0) 974 jmp LZDrawNeg 975 976 977#define pzspans 4+16 978 979.globl C(D_DrawZSpans) 980C(D_DrawZSpans): 981 pushl %ebp // preserve caller's stack frame 982 pushl %edi 983 pushl %esi // preserve register variables 984 pushl %ebx 985 986 flds C(d_zistepu) 987 movl C(d_zistepu),%eax 988 movl pzspans(%esp),%esi 989 testl %eax,%eax 990 jz LFNegSpan 991 992 fmuls Float2ToThe31nd 993 fistpl izistep // note: we are relying on FP exceptions being turned 994 // off here to avoid range problems 995 movl izistep,%ebx // remains loaded for all spans 996 997LFSpanLoop: 998// set up the initial 1/z value 999 fildl espan_t_v(%esi) 1000 fildl espan_t_u(%esi) 1001 movl espan_t_v(%esi),%ecx 1002 movl C(d_pzbuffer),%edi 1003 fmuls C(d_zistepu) 1004 fxch %st(1) 1005 fmuls C(d_zistepv) 1006 fxch %st(1) 1007 fadds C(d_ziorigin) 1008 imull C(d_zrowbytes),%ecx 1009 faddp %st(0),%st(1) 1010 1011// clamp if z is nearer than 2 (1/z > 0.5) 1012 fcoms float_point5 1013 addl %ecx,%edi 1014 movl espan_t_u(%esi),%edx 1015 addl %edx,%edx // word count 1016 movl espan_t_count(%esi),%ecx 1017 addl %edx,%edi // pdest = &pdestspan[scans->u]; 1018 pushl %esi // preserve spans pointer 1019 fnstsw %ax 1020 testb $0x45,%ah 1021 jz LClamp 1022 1023 fmuls Float2ToThe31nd 1024 fistpl izi // note: we are relying on FP exceptions being turned 1025 // off here to avoid problems when the span is closer 1026 // than 1/(2**31) 1027 movl izi,%edx 1028 1029// at this point: 1030// %ebx = izistep 1031// %ecx = count 1032// %edx = izi 1033// %edi = pdest 1034 1035LZDraw: 
1036 1037// do a single pixel up front, if necessary to dword align the destination 1038 testl $2,%edi 1039 jz LFMiddle 1040 movl %edx,%eax 1041 addl %ebx,%edx 1042 shrl $16,%eax 1043 decl %ecx 1044 movw %ax,(%edi) 1045 addl $2,%edi 1046 1047// do middle a pair of aligned dwords at a time 1048LFMiddle: 1049 pushl %ecx 1050 shrl $1,%ecx // count / 2 1051 jz LFLast // no aligned dwords to do 1052 shrl $1,%ecx // (count / 2) / 2 1053 jnc LFMiddleLoop // even number of aligned dwords to do 1054 1055 movl %edx,%eax 1056 addl %ebx,%edx 1057 shrl $16,%eax 1058 movl %edx,%esi 1059 addl %ebx,%edx 1060 andl $0xFFFF0000,%esi 1061 orl %esi,%eax 1062 movl %eax,(%edi) 1063 addl $4,%edi 1064 andl %ecx,%ecx 1065 jz LFLast 1066 1067LFMiddleLoop: 1068 movl %edx,%eax 1069 addl %ebx,%edx 1070 shrl $16,%eax 1071 movl %edx,%esi 1072 addl %ebx,%edx 1073 andl $0xFFFF0000,%esi 1074 orl %esi,%eax 1075 movl %edx,%ebp 1076 movl %eax,(%edi) 1077 addl %ebx,%edx 1078 shrl $16,%ebp 1079 movl %edx,%esi 1080 addl %ebx,%edx 1081 andl $0xFFFF0000,%esi 1082 orl %esi,%ebp 1083 movl %ebp,4(%edi) // FIXME: eliminate register contention 1084 addl $8,%edi 1085 1086 decl %ecx 1087 jnz LFMiddleLoop 1088 1089LFLast: 1090 popl %ecx // retrieve count 1091 popl %esi // retrieve span pointer 1092 1093// do the last, unaligned pixel, if there is one 1094 andl $1,%ecx // is there an odd pixel left to do? 
1095 jz LFSpanDone // no 1096 shrl $16,%edx 1097 movw %dx,(%edi) // do the final pixel's z 1098 1099LFSpanDone: 1100 movl espan_t_pnext(%esi),%esi 1101 testl %esi,%esi 1102 jnz LFSpanLoop 1103 1104 jmp LFDone 1105 1106LFNegSpan: 1107 fmuls FloatMinus2ToThe31nd 1108 fistpl izistep // note: we are relying on FP exceptions being turned 1109 // off here to avoid range problems 1110 movl izistep,%ebx // remains loaded for all spans 1111 1112LFNegSpanLoop: 1113// set up the initial 1/z value 1114 fildl espan_t_v(%esi) 1115 fildl espan_t_u(%esi) 1116 movl espan_t_v(%esi),%ecx 1117 movl C(d_pzbuffer),%edi 1118 fmuls C(d_zistepu) 1119 fxch %st(1) 1120 fmuls C(d_zistepv) 1121 fxch %st(1) 1122 fadds C(d_ziorigin) 1123 imull C(d_zrowbytes),%ecx 1124 faddp %st(0),%st(1) 1125 1126// clamp if z is nearer than 2 (1/z > 0.5) 1127 fcoms float_point5 1128 addl %ecx,%edi 1129 movl espan_t_u(%esi),%edx 1130 addl %edx,%edx // word count 1131 movl espan_t_count(%esi),%ecx 1132 addl %edx,%edi // pdest = &pdestspan[scans->u]; 1133 pushl %esi // preserve spans pointer 1134 fnstsw %ax 1135 testb $0x45,%ah 1136 jz LClampNeg 1137 1138 fmuls Float2ToThe31nd 1139 fistpl izi // note: we are relying on FP exceptions being turned 1140 // off here to avoid problems when the span is closer 1141 // than 1/(2**31) 1142 movl izi,%edx 1143 1144// at this point: 1145// %ebx = izistep 1146// %ecx = count 1147// %edx = izi 1148// %edi = pdest 1149 1150LZDrawNeg: 1151 1152// do a single pixel up front, if necessary to dword align the destination 1153 testl $2,%edi 1154 jz LFNegMiddle 1155 movl %edx,%eax 1156 subl %ebx,%edx 1157 shrl $16,%eax 1158 decl %ecx 1159 movw %ax,(%edi) 1160 addl $2,%edi 1161 1162// do middle a pair of aligned dwords at a time 1163LFNegMiddle: 1164 pushl %ecx 1165 shrl $1,%ecx // count / 2 1166 jz LFNegLast // no aligned dwords to do 1167 shrl $1,%ecx // (count / 2) / 2 1168 jnc LFNegMiddleLoop // even number of aligned dwords to do 1169 1170 movl %edx,%eax 1171 subl %ebx,%edx 1172 
shrl $16,%eax 1173 movl %edx,%esi 1174 subl %ebx,%edx 1175 andl $0xFFFF0000,%esi 1176 orl %esi,%eax 1177 movl %eax,(%edi) 1178 addl $4,%edi 1179 andl %ecx,%ecx 1180 jz LFNegLast 1181 1182LFNegMiddleLoop: 1183 movl %edx,%eax 1184 subl %ebx,%edx 1185 shrl $16,%eax 1186 movl %edx,%esi 1187 subl %ebx,%edx 1188 andl $0xFFFF0000,%esi 1189 orl %esi,%eax 1190 movl %edx,%ebp 1191 movl %eax,(%edi) 1192 subl %ebx,%edx 1193 shrl $16,%ebp 1194 movl %edx,%esi 1195 subl %ebx,%edx 1196 andl $0xFFFF0000,%esi 1197 orl %esi,%ebp 1198 movl %ebp,4(%edi) // FIXME: eliminate register contention 1199 addl $8,%edi 1200 1201 decl %ecx 1202 jnz LFNegMiddleLoop 1203 1204LFNegLast: 1205 popl %ecx // retrieve count 1206 popl %esi // retrieve span pointer 1207 1208// do the last, unaligned pixel, if there is one 1209 andl $1,%ecx // is there an odd pixel left to do? 1210 jz LFNegSpanDone // no 1211 shrl $16,%edx 1212 movw %dx,(%edi) // do the final pixel's z 1213 1214LFNegSpanDone: 1215 movl espan_t_pnext(%esi),%esi 1216 testl %esi,%esi 1217 jnz LFNegSpanLoop 1218 1219LFDone: 1220 popl %ebx // restore register variables 1221 popl %esi 1222 popl %edi 1223 popl %ebp // restore the caller's stack frame 1224 ret 1225 1226#endif // id386 1227 1228