1 /* 2 Copyright (c) 2013 yvt 3 4 This file is part of OpenSpades. 5 6 OpenSpades is free software: you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation, either version 3 of the License, or 9 (at your option) any later version. 10 11 OpenSpades is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with OpenSpades. If not, see <http://www.gnu.org/licenses/>. 18 19 */ 20 21 #include <array> 22 #include <cstdint> 23 #include <cstring> 24 25 #include "SWMapRenderer.h" 26 #include "SWRenderer.h" 27 #include "SWUtils.h" 28 #include <Client/GameMap.h> 29 #include <Core/Bitmap.h> 30 #include <Core/ConcurrentDispatch.h> 31 #include <Core/MiniHeap.h> 32 #include <Core/Settings.h> 33 #include <Core/Stopwatch.h> 34 35 using namespace std; 36 37 DEFINE_SPADES_SETTING(r_swUndersampling, "0"); 38 39 namespace spades { 40 namespace draw { 41 42 // special tan function whose value is finite. SpecialTan(float v)43 static inline float SpecialTan(float v) { 44 static const float pi = M_PI; 45 if (v <= -pi * 0.5f) { 46 return -2.f; 47 } else if (v < -pi * 0.25f) { 48 v = -2.f - 1.f / tanf(v); 49 } else if (v < pi * 0.25f) { 50 v = tanf(v); 51 } else if (v < pi * 0.5f) { 52 v = 2.f - 1.f / tanf(v); 53 } else { 54 return v = 2.f; 55 } 56 return v; 57 } 58 // convert from tan value to special tan value. ToSpecialTan(float v)59 static inline float ToSpecialTan(float v) { 60 if (v < -1.f) 61 return -2.f - fastRcp(v); 62 else if (v > 1.f) 63 return 2.f - fastRcp(v); 64 else 65 return v; 66 } 67 68 enum class Face : short { PosX, NegX, PosY, NegY, PosZ, NegZ }; 69 70 struct SWMapRenderer::LinePixel { 71 union { 72 struct { 73 uint32_t combined; 74 float depth; 75 }; 76 struct { 77 unsigned int color : 24; 78 // Face face: 7; 79 bool filled : 1; 80 }; 81 struct { 82 uint64_t allData; 83 }; 84 }; 85 86 // using "operator =" makes this struct non-POD Setspades::draw::SWMapRenderer::LinePixel87 void Set(const LinePixel &p) { allData = p.allData; } 88 Clearspades::draw::SWMapRenderer::LinePixel89 inline void Clear() { 90 combined = 0; 91 depth = 10000.f; 92 } 93 IsEmptyspades::draw::SWMapRenderer::LinePixel94 inline bool IsEmpty() const { return combined == 0; } 95 }; 96 97 // infinite length line from -z to +z 98 struct SWMapRenderer::Line { 99 std::vector<LinePixel> pixels; 100 Vector3 horizonDir; 101 float pitchTanMin; 102 float pitchScale; 103 int pitchTanMinI; 104 int pitchScaleI; 105 }; 106 SWMapRenderer(SWRenderer * r,client::GameMap * m,SWFeatureLevel level)107 SWMapRenderer::SWMapRenderer(SWRenderer *r, client::GameMap *m, SWFeatureLevel level) 108 : w(m->Width()), 109 h(m->Height()), 110 renderer(r), 111 level(level), 112 map(m), 113 frameBuf(nullptr), 114 depthBuf(nullptr), 115 rleHeap(m->Width() * m->Height() * 64) { 116 rle.resize(w * h); 117 rleLen.resize(w * h); 118 119 Stopwatch sw; 120 sw.Reset(); 121 SPLog("Building RLE map..."); 122 123 int idx = 0; 124 for (int y = 0; y < h; y++) 125 for (int x = 0; x < w; x++) { 126 BuildRle(x, y, rleBuf); 127 128 auto ref = rleHeap.Alloc(rleBuf.size() * sizeof(RleData)); 129 short *ptr = rleHeap.Dereference<short>(ref); 130 std::memcpy(ptr, rleBuf.data(), rleBuf.size() * sizeof(RleData)); 131 132 rle[idx] = ref; 133 rleLen[idx] = rleBuf.size() * sizeof(RleData); 134 135 idx++; 136 } 137 SPLog("RLE map created in %.6f seconds", sw.GetTime()); 138 } 139 ~SWMapRenderer()140 SWMapRenderer::~SWMapRenderer() {} 141 BuildRle(int x,int y,std::vector<RleData> & out)142 void SWMapRenderer::BuildRle(int x, int y, std::vector<RleData> &out) { 143 out.clear(); 144 145 out.push_back(0); // [0] = +Z face position address 146 out.push_back(0); 147 out.push_back(0); // [2] = +X face position address 148 out.push_back(0); 149 out.push_back(0); // [4] = -X face position address 150 out.push_back(0); 151 out.push_back(0); // [6] = +Y face position address 152 out.push_back(0); 153 out.push_back(0); // [8] = -Y face position address 154 out.push_back(0); 155 156 auto setHeader = [&](size_t idx, size_t val) { 157 reinterpret_cast<short *>(out.data())[idx] = static_cast<short>(val); 158 }; 159 160 uint64_t smap = map->GetSolidMapWrapped(x, y); 161 std::array<uint64_t, 4> adjs = { 162 map->GetSolidMapWrapped(x + 1, y), map->GetSolidMapWrapped(x - 1, y), 163 map->GetSolidMapWrapped(x, y + 1), map->GetSolidMapWrapped(x, y - 1)}; 164 bool old = false; 165 166 for (int z = 0; z < 64; z++) { 167 bool b = (smap >> z) & 1; 168 if (b && !old) { 169 out.push_back(static_cast<RleData>(z)); 170 } 171 old = b; 172 } 173 out.push_back(-1); 174 175 setHeader(0, out.size()); 176 177 old = true; 178 for (int z = 63; z >= 0; z--) { 179 bool b = (smap >> z) & 1; 180 if (b && !old) { 181 out.push_back(static_cast<RleData>(z)); 182 } 183 old = b; 184 } 185 out.push_back(-1); 186 187 for (int k = 0; k < 4; k++) { 188 setHeader(k + 1, out.size()); 189 for (int z = 0; z < 64; z++) { 190 if ((smap >> z) & 1) { 191 if (!((adjs[k] >> z) & 1)) { 192 out.push_back(static_cast<RleData>(z)); 193 } 194 } 195 } 196 out.push_back(-1); 197 } 198 199 // padding 200 while (out.size() & 3) { 201 out.push_back(42); 202 } 203 } 204 UpdateRle(int x,int y)205 void SWMapRenderer::UpdateRle(int x, int y) { 206 int idx = x + y * w; 207 BuildRle(x, y, rleBuf); 208 209 rleHeap.Free(rle[idx], rleLen[idx]); 210 211 auto ref = rleHeap.Alloc(rleBuf.size() * sizeof(RleData)); 212 short *ptr = rleHeap.Dereference<short>(ref); 213 std::memcpy(ptr, rleBuf.data(), rleBuf.size() * sizeof(RleData)); 214 215 rle[idx] = ref; 216 rleLen[idx] = rleBuf.size() * sizeof(RleData); 217 } 218 219 template <SWFeatureLevel flevel> BuildLine(Line & line,float minPitch,float maxPitch)220 void SWMapRenderer::BuildLine(Line &line, float minPitch, float maxPitch) { 221 222 // hard code for further optimization 223 enum { w = 512, h = 512 }; 224 SPAssert(map->Width() == 512); 225 SPAssert(map->Height() == 512); 226 227 const auto *rle = this->rle.data(); 228 auto &rleHeap = this->rleHeap; 229 client::GameMap *map = this->map; 230 231 // pitch culling 232 { 233 const auto &frustrum = renderer->frustrum; 234 static const float pi = M_PI; 235 const auto &horz = line.horizonDir; 236 minPitch = -pi * 0.4999f; 237 maxPitch = pi * 0.4999f; 238 239 auto cull = [&minPitch, &maxPitch]() { 240 minPitch = 2.f; 241 maxPitch = -2.f; 242 }; 243 auto clip = [&minPitch, &maxPitch, &horz, &cull](Vector3 plane) { 244 if (plane.x == 0.f && plane.y == 0.f) { 245 if (plane.z > 0.f) { 246 minPitch = std::max(minPitch, 0.f); 247 } else { 248 maxPitch = std::min(maxPitch, 0.f); 249 } 250 } else if (plane.z == 0.f) { 251 if (Vector3::Dot(plane, horz) < 0.f) { 252 cull(); 253 } 254 } else { 255 Vector3 prj = plane; 256 prj.z = 0.f; 257 prj = prj.Normalize(); 258 259 float zv = fabsf(plane.z); 260 float cs = Vector3::Dot(prj, horz); 261 262 float ang = zv * zv * (1.f - cs * cs) / (cs * cs); 263 ang = -cs * fastSqrt(1.f + ang); 264 ang = zv / ang; 265 if (std::isnan(ang) || std::isinf(ang) || ang == 0.f) 266 return; 267 268 // convert to tan 269 ang = fastSqrt(1.f - ang * ang) / ang; 270 271 // convert to angle 272 ang = atanf(ang); 273 274 if (std::isnan(ang) || std::isinf(ang)) 275 return; 276 277 if (plane.z > 0.f) { 278 minPitch = std::max(minPitch, ang - 0.01f); 279 } else { 280 maxPitch = std::min(maxPitch, -ang + 0.01f); 281 } 282 } 283 }; 284 285 clip(frustrum[2].n); 286 clip(frustrum[3].n); 287 clip(frustrum[4].n); 288 clip(frustrum[5].n); 289 } 290 291 float minTan = SpecialTan(minPitch); 292 float maxTan = SpecialTan(maxPitch); 293 294 { 295 float minDiff = lineResolution / 10000.f; 296 if (maxTan < minTan + minDiff) { 297 // too little difference; scale value might overflow. 298 maxTan = minTan + minDiff; 299 } 300 } 301 302 line.pitchTanMin = minTan; 303 line.pitchScale = lineResolution / (maxTan - minTan); 304 line.pitchTanMinI = static_cast<int>(minTan * 65536.f); 305 line.pitchScaleI = static_cast<int>(line.pitchScale * 65536.f); 306 307 // TODO: pitch culling 308 309 // ray direction 310 float dirX = line.horizonDir.x; 311 float dirY = line.horizonDir.y; 312 if (fabsf(dirY) < 1.e-4f) 313 dirY = 1.e-4f; 314 if (fabsf(dirX) < 1.e-4f) 315 dirX = 1.e-4f; 316 float invDirX = 1.f / dirX; 317 float invDirY = 1.f / dirY; 318 std::int_fast8_t signX = dirX > 0.f ? 1 : -1; 319 std::int_fast8_t signY = dirY > 0.f ? 1 : -1; 320 int invDirXI = static_cast<int>(invDirX * 256.f); 321 int invDirYI = static_cast<int>(invDirY * 256.f); 322 int dirXI = static_cast<int>(dirX * 512.f); 323 int dirYI = static_cast<int>(dirY * 512.f); 324 if (invDirXI < 0) 325 invDirXI = -invDirXI; 326 if (invDirYI < 0) 327 invDirYI = -invDirYI; 328 if (dirXI < 0) 329 dirXI = -dirXI; 330 if (dirYI < 0) 331 dirYI = -dirYI; 332 333 // camera position 334 float cx = sceneDef.viewOrigin.x; 335 float cy = sceneDef.viewOrigin.y; 336 float cz = sceneDef.viewOrigin.z; 337 338 int icz = static_cast<int>(floorf(cz)); 339 340 // ray position 341 // float rx = cx, ry = cy; 342 int rx = static_cast<int>(cx * 512.f); 343 int ry = static_cast<int>(cy * 512.f); 344 345 // ray position in integer 346 std::int_fast16_t irx = rx >> 9; // static_cast<int>(floorf(rx)); 347 std::int_fast16_t iry = ry >> 9; // static_cast<int>(floorf(ry)); 348 349 float fogDist = 128.f; 350 float distance = 1.e-20f; // traveled path 351 float invDist = 1.f / distance; 352 353 // auto& pixels = line.pixels; 354 355 line.pixels.resize(lineResolution); 356 auto *pixels = line.pixels.data(); // std::vector feels slow... 357 358 const float transScale = static_cast<float>(lineResolution) / (maxTan - minTan); 359 const float transOffset = -minTan * transScale; 360 361 #if ENABLE_SSE 362 if (lineResolution > 4) { 363 static_assert(sizeof(LinePixel) == 8, 364 "size of LinePixel has changed; needs code modification"); 365 union { 366 LinePixel pxs[2]; 367 __m128 m; 368 }; 369 pxs[0].Clear(); 370 pxs[1].Clear(); 371 auto *ptr = pixels; 372 for (auto *e = pixels + lineResolution; 373 (reinterpret_cast<size_t>(ptr) & 0xf) && (ptr < e); ptr++) { 374 ptr->Clear(); 375 } 376 for (auto *e = pixels + lineResolution - 2; ptr < e; ptr += 2) { 377 _mm_store_ps(reinterpret_cast<float *>(ptr), m); 378 } 379 for (auto *e = pixels + lineResolution; ptr < e; ptr++) { 380 ptr->Clear(); 381 } 382 } else 383 #endif 384 for (size_t i = 0; i < lineResolution; i++) 385 pixels[i].Clear(); 386 387 // if culled out, bail out now (pixels are filled) 388 if (minPitch >= maxPitch) 389 return; 390 391 std::array<float, 65> zval; // precompute (z - cz) * some 392 for (size_t i = 0; i < zval.size(); i++) 393 zval[i] = (static_cast<float>(i) - cz); 394 395 float vmax = lineResolution + 0.5f; 396 auto transform = [&zval, &transOffset, vmax, &transScale](float invDist, int z) { 397 float p = ToSpecialTan(invDist * zval[z]) * transScale + transOffset; 398 p = std::max(p, 0.f); 399 p = std::min(p, vmax); 400 return static_cast<std::uint_fast16_t>(p); 401 }; 402 403 float zscale; // travel distance -> view Z value factor 404 zscale = Vector3::Dot(line.horizonDir, sceneDef.viewAxis[2]); 405 406 float heightScale; // Z value -> view Z value factor 407 heightScale = sceneDef.viewAxis[2].z; 408 409 std::array<float, 65> heightScaleVal; // precompute (heightScale * z) 410 for (size_t i = 0; i < zval.size(); i++) 411 heightScaleVal[i] = (static_cast<float>(i) * heightScale); 412 413 float depthBias; 414 depthBias = -cz * heightScale; 415 416 RleData *lastRle; 417 { 418 auto ref = rle[(irx & w - 1) + ((iry & h - 1) * w)]; 419 lastRle = rleHeap.Dereference<RleData>(ref); 420 } 421 422 std::uint_fast16_t count = 1; 423 std::uint_fast16_t cnt2 = static_cast<int>(fogDist * 8.f); 424 425 while (distance < fogDist && (--cnt2) > 0) { 426 std::int_fast16_t nextIRX, nextIRY; 427 auto oirx = irx, oiry = iry; 428 429 // DDE 430 Face wallFace; 431 432 if (signX > 0) { 433 nextIRX = irx + 1; 434 if (signY > 0) { 435 nextIRY = iry + 1; 436 437 unsigned int timeToNextX = (512 - (rx & 511)) * invDirXI; 438 unsigned int timeToNextY = (512 - (ry & 511)) * invDirYI; 439 440 if (timeToNextX < timeToNextY) { 441 // go across x plane 442 irx = nextIRX; 443 rx = irx << 9; 444 ry += (dirYI * timeToNextX) >> 17; 445 distance += static_cast<float>(timeToNextX) * (1.f / 512.f / 256.f); 446 wallFace = Face::NegX; 447 } else { 448 // go across y plane 449 iry = nextIRY; 450 rx += (dirXI * timeToNextY) >> 17; 451 ry = iry << 9; 452 distance += static_cast<float>(timeToNextY) * (1.f / 512.f / 256.f); 453 wallFace = Face::NegY; 454 } 455 } else /* (signY < 0) */ { 456 nextIRY = iry - 1; 457 458 unsigned int timeToNextX = (512 - (rx & 511)) * invDirXI; 459 unsigned int timeToNextY = (ry & 511) * invDirYI; 460 461 if (timeToNextX < timeToNextY) { 462 // go across x plane 463 irx = nextIRX; 464 rx = irx << 9; 465 ry -= (dirYI * timeToNextX) >> 17; 466 distance += static_cast<float>(timeToNextX) * (1.f / 512.f / 256.f); 467 wallFace = Face::NegX; 468 } else { 469 // go across y plane 470 iry = nextIRY; 471 rx += (dirXI * timeToNextY) >> 17; 472 ry = (iry << 9) - 1; 473 distance += static_cast<float>(timeToNextY) * (1.f / 512.f / 256.f); 474 wallFace = Face::PosY; 475 } 476 } 477 } else /* signX < 0 */ { 478 nextIRX = irx - 1; 479 if (signY > 0) { 480 nextIRY = iry + 1; 481 482 unsigned int timeToNextX = (rx & 511) * invDirXI; 483 unsigned int timeToNextY = (512 - (ry & 511)) * invDirYI; 484 485 if (timeToNextX < timeToNextY) { 486 // go across x plane 487 irx = nextIRX; 488 rx = (irx << 9) - 1; 489 ry += (dirYI * timeToNextX) >> 17; 490 distance += static_cast<float>(timeToNextX) * (1.f / 512.f / 256.f); 491 wallFace = Face::PosX; 492 } else { 493 // go across y plane 494 iry = nextIRY; 495 rx -= (dirXI * timeToNextY) >> 17; 496 ry = iry << 9; 497 distance += static_cast<float>(timeToNextY) * (1.f / 512.f / 256.f); 498 wallFace = Face::NegY; 499 } 500 } else /* (signY < 0) */ { 501 nextIRY = iry - 1; 502 503 unsigned int timeToNextX = (rx & 511) * invDirXI; 504 unsigned int timeToNextY = (ry & 511) * invDirYI; 505 506 if (timeToNextX < timeToNextY) { 507 // go across x plane 508 irx = nextIRX; 509 rx = (irx << 9) - 1; 510 ry -= (dirYI * timeToNextX) >> 17; 511 distance += static_cast<float>(timeToNextX) * (1.f / 512.f / 256.f); 512 wallFace = Face::PosX; 513 } else { 514 // go across y plane 515 iry = nextIRY; 516 rx -= (dirXI * timeToNextY) >> 17; 517 ry = (iry << 9) - 1; 518 distance += static_cast<float>(timeToNextY) * (1.f / 512.f / 256.f); 519 wallFace = Face::PosY; 520 } 521 } 522 } 523 524 float oldInvDist = invDist; 525 526 invDist = fastRcp(distance); 527 528 float medDist = distance * zscale + depthBias; //(distance + oldDistance) * 0.5f; 529 530 // check for new spans 531 532 auto BuildLinePixel = [map](int x, int y, int z, Face face, float dist) { 533 LinePixel px; 534 px.depth = dist; 535 #if ENABLE_SSE 536 if (flevel == SWFeatureLevel::SSE2) { 537 __m128i m; 538 uint32_t col = map->GetColorWrapped(x, y, z); 539 m = _mm_setr_epi32(col, 0, 0, 0); 540 m = _mm_unpacklo_epi8(m, _mm_setzero_si128()); 541 m = _mm_shufflelo_epi16(m, 0xc6); 542 543 switch (face) { 544 case Face::PosZ: m = _mm_srli_epi16(m, 1); break; 545 case Face::PosX: 546 case Face::PosY: 547 case Face::NegX: 548 m = _mm_adds_epi16(_mm_srli_epi16(m, 1), _mm_srli_epi16(m, 2)); 549 break; 550 default: break; 551 } 552 if ((col >> 24) < 100) { 553 m = _mm_srli_epi16(m, 1); 554 } 555 m = _mm_packus_epi16(m, m); 556 _mm_store_ss(reinterpret_cast<float *>(&px.combined), _mm_castsi128_ps(m)); 557 px.filled = true; 558 } else 559 #endif 560 // non-optimized 561 { 562 uint32_t col; 563 col = map->GetColorWrapped(x, y, z); 564 col = (col & 0xff00) | ((col & 0xff) << 16) | ((col & 0xff0000) >> 16); 565 switch (face) { 566 case Face::PosZ: col = (col & 0xfcfcfc) >> 2; break; 567 case Face::PosX: 568 case Face::PosY: 569 case Face::NegX: col = (col & 0xfefefe) >> 1; break; 570 default: break; 571 } 572 px.combined = col; 573 px.filled = true; 574 } 575 return px; 576 }; 577 578 // floor/ceiling 579 { 580 581 // linear code 582 583 // RLE scan 584 RleData *rle = lastRle; 585 { 586 RleData *ptr = rle + 10; 587 while (*ptr != -1) { 588 std::int_fast8_t z = *ptr; 589 if (z > icz) { 590 std::uint_fast16_t p1 = transform(invDist, z); 591 std::uint_fast16_t p2 = transform(oldInvDist, z); 592 LinePixel pix = BuildLinePixel(oirx, oiry, z, Face::NegZ, 593 medDist + heightScaleVal[z]); 594 595 for (std::uint_fast16_t j = p1; j < p2; j++) { 596 auto &p = pixels[j]; 597 if (!p.IsEmpty()) 598 continue; 599 p.Set(pix); 600 } 601 } 602 ptr++; 603 } 604 ptr++; 605 while (*ptr != -1) { 606 std::int_fast8_t z = *ptr; 607 if (z < icz) { 608 std::uint_fast16_t p1 = transform(invDist, z + 1); 609 std::uint_fast16_t p2 = transform(oldInvDist, z + 1); 610 LinePixel pix = BuildLinePixel(oirx, oiry, z, Face::PosZ, 611 medDist + heightScaleVal[z + 1]); 612 613 for (std::uint_fast16_t j = p2; j < p1; j++) { 614 auto &p = pixels[j]; 615 if (!p.IsEmpty()) 616 continue; 617 p.Set(pix); 618 } 619 } 620 ptr++; 621 } 622 } 623 624 } // done: floor/ceiling 625 626 // add walls 627 { 628 // by RLE map 629 auto ref = rle[static_cast<std::uint_fast32_t>(irx & w - 1) + 630 static_cast<std::uint_fast32_t>(iry & h - 1) * w]; 631 RleData *rle = rleHeap.Dereference<RleData>(ref); 632 lastRle = rle; 633 auto *ptr = rle; 634 ptr += reinterpret_cast<unsigned short *>(rle)[1 + static_cast<int>(wallFace)]; 635 636 std::uint_fast16_t savedP = 0; 637 std::int_fast8_t savedZ = 127; 638 639 while (*ptr != -1) { 640 std::int_fast8_t z = *(ptr++); 641 642 std::uint_fast16_t p1 = savedZ == z ? savedP : transform(invDist, z); 643 std::uint_fast16_t p2 = transform(invDist, z + 1); 644 645 savedZ = z + 1; 646 savedP = p2; 647 648 LinePixel pix = 649 BuildLinePixel(irx, iry, z, wallFace, medDist + heightScaleVal[z]); 650 651 for (std::uint_fast16_t j = p1; j < p2; j++) { 652 auto &p = pixels[j]; 653 if (!p.IsEmpty()) 654 continue; 655 p.Set(pix); 656 } 657 } 658 659 } // add wall - end 660 661 // check pitch cull 662 if ((--count) == 0) { 663 if ((transform(invDist, 0) >= lineResolution - 1 && icz >= 0) || 664 transform(invDist, 63) <= 0) 665 break; 666 count = 4; 667 } 668 669 // let's go to next voxel! 670 } 671 } 672 673 struct AtanTable { 674 std::array<uint16_t, 5000> sm; 675 std::array<uint16_t, 5000> lg; 676 std::array<uint16_t, 5000> smN; 677 std::array<uint16_t, 5000> lgN; 678 679 // [0, 2pi] -> [0, 65536] ToFixedspades::draw::AtanTable680 static uint16_t ToFixed(float v) { 681 v /= (M_PI * 2.f); 682 v *= 65536.f; 683 int i = static_cast<int>(v); 684 return static_cast<uint16_t>(i & 65535); 685 } 686 AtanTablespades::draw::AtanTable687 AtanTable() { 688 for (int i = 0; i < 5000; i++) { 689 sm[i] = ToFixed(atanf(i / 4096.f)); 690 lg[i] = ToFixed(atanf(1.f / ((i + .5f) / 4096.f))); 691 smN[i] = ToFixed(-atanf(i / 4096.f)); 692 lgN[i] = ToFixed(-atanf(1.f / ((i + .5f) / 4096.f))); 693 } 694 } 695 }; 696 static AtanTable atanTable; fastATan(float v)697 static inline uint16_t fastATan(float v) { 698 if (v < 0.f) { 699 if (v > -1.f) { 700 v *= -4096.f; 701 int idx = static_cast<int>(v); 702 // v -= idx; 703 auto ret = atanTable.smN[idx]; 704 return ret; 705 } else { 706 v = fastDiv(-4096.f, v); 707 int idx = static_cast<int>(v); 708 // v -= idx; 709 auto ret = atanTable.lgN[idx]; 710 return ret; 711 } 712 } else { 713 if (v < 1.f) { 714 v *= 4096.f; 715 int idx = static_cast<int>(v); 716 // v -= idx; 717 auto ret = atanTable.sm[idx]; 718 return ret; 719 // ret += (atanTable.sm[idx + 1] - ret) * v; 720 // return ret; 721 } else { 722 v = fastDiv(4096.f, v); 723 int idx = static_cast<int>(v); 724 // v -= idx; 725 auto ret = atanTable.lg[idx]; 726 return ret; 727 // ret += (atanTable.lg[idx + 1] - ret) * v; 728 // return ret; 729 } 730 } 731 } 732 fastATan2(float y,float x)733 static inline uint16_t fastATan2(float y, float x) { 734 if (x == 0.f) { 735 return y > 0.f ? 16384 : -16384; 736 // y > 0.f ? (pi * 0.5f) : (-pi * 0.5f); 737 } else if (x > 0.f) { 738 return fastATan(fastDiv(y, x)); 739 } else { 740 return fastATan(fastDiv(y, x)) + 32768; 741 } 742 } 743 744 template <SWFeatureLevel flevel, int under> RenderFinal(float yawMin,float yawMax,unsigned int numLines,unsigned int threadId,unsigned int numThreads)745 void SWMapRenderer::RenderFinal(float yawMin, float yawMax, unsigned int numLines, 746 unsigned int threadId, unsigned int numThreads) { 747 float fovX = tanf(sceneDef.fovX * 0.5f); 748 float fovY = tanf(sceneDef.fovY * 0.5f); 749 Vector3 front = sceneDef.viewAxis[2]; 750 Vector3 right = sceneDef.viewAxis[0]; 751 Vector3 down = sceneDef.viewAxis[1]; 752 753 unsigned int fw = frameBuf->GetWidth(); 754 unsigned int fh = frameBuf->GetHeight(); 755 uint32_t *fb = frameBuf->GetPixels(); 756 float *depthBuf = this->depthBuf; 757 Vector3 v1 = front - right * fovX + down * fovY; 758 Vector3 deltaDown = -down * (fovY * 2.f / static_cast<float>(fh)); 759 Vector3 deltaRight = right * (fovX * 2.f / static_cast<float>(fw) * under); 760 761 Vector2 screenPos = {-fovX, -fovY}; 762 float deltaScreenPosRight = fovX * 2.f / static_cast<float>(fw); 763 float deltaScreenPosDown = fovY * 2.f / static_cast<float>(fh); 764 765 static const float pi = M_PI; 766 float yawScale = 65536.f / (pi * 2.f); 767 std::int32_t yawScale2 = 768 static_cast<std::int32_t>(pi * 2.f / (yawMax - yawMin) * 65536.f); 769 std::int32_t yawMin2 = static_cast<std::int32_t>(yawMin * yawScale); 770 auto &lineList = this->lines; 771 772 enum { blockSize = 8, hBlock = blockSize / under }; 773 774 Vector3 deltaDownLarge = deltaDown * blockSize; 775 Vector3 deltaRightLarge = deltaRight * hBlock; 776 777 unsigned int startX = threadId * fw / numThreads; 778 unsigned int endX = (threadId + 1) * fw / numThreads; 779 780 startX = (startX / blockSize) * blockSize; 781 endX = (endX / blockSize) * blockSize; 782 783 float deltaScreenPosRightSmall = deltaScreenPosRight * under; 784 float deltaScreenPosDownSmall = deltaScreenPosDown; 785 786 deltaScreenPosRight *= static_cast<float>(blockSize); 787 deltaScreenPosDown *= static_cast<float>(blockSize); 788 789 v1 += deltaRight * static_cast<float>(startX / under); 790 screenPos.x += deltaScreenPosRight * static_cast<float>(startX / blockSize); 791 792 for (unsigned int fx = startX; fx < endX; fx += blockSize) { 793 Vector3 v2 = v1; 794 screenPos.y = -fovY; 795 for (unsigned int fy = 0; fy < fh; fy += blockSize) { 796 797 uint32_t *fb2 = fb + fx + fy * fw; 798 float *db2 = depthBuf + fx + fy * fw; 799 800 if (v2.z > 0.99f || v2.z < -0.99f) { 801 // near to pole. cannot be approximated by piecewise 802 goto SlowBlockPath; 803 } 804 805 FastBlockPath : { 806 807 // Use bi-linear interpolation for faster yaw/pitch 808 // computation. 809 810 auto calcYawindex = [yawScale2, numLines, yawMin2](Vector3 v) { 811 std::int32_t yawIndex; 812 { 813 float x = v.x, y = v.y; 814 int yaw; 815 yaw = fastATan2(y, x); 816 yaw -= yawMin2; 817 yawIndex = static_cast<int>(yaw & 0xffff); 818 } 819 yawIndex <<= 8; 820 return yawIndex; 821 }; 822 auto calcPitch = [](Vector3 vv) { 823 float pitch; 824 pitch = vv.z * fastRSqrt(vv.x * vv.x + vv.y * vv.y); 825 pitch = ToSpecialTan(pitch); 826 return static_cast<int>(pitch * (65536.f * 8192.f)); 827 }; 828 std::int32_t yawIndex1 = calcYawindex(v2); 829 std::int32_t pitch1 = calcPitch(v2); 830 std::int32_t yawIndex2 = calcYawindex(v2 + deltaRightLarge); 831 std::int32_t pitch2 = calcPitch(v2 + deltaRightLarge); 832 std::int32_t yawIndex3 = calcYawindex(v2 + deltaDownLarge); 833 std::int32_t pitch3 = calcPitch(v2 + deltaDownLarge); 834 std::int32_t yawIndex4 = calcYawindex(v2 + deltaRightLarge + deltaDownLarge); 835 std::int32_t pitch4 = calcPitch(v2 + deltaRightLarge + deltaDownLarge); 836 837 // note: `<<8>>8` is phase unwrapping 838 std::int32_t yawDiff1 = ((yawIndex2 - yawIndex1) << 8 >> 8) / hBlock; 839 std::int32_t yawDiff2 = ((yawIndex4 - yawIndex3) << 8 >> 8) / hBlock; 840 std::int32_t pitchDiff1 = (pitch2 - pitch1) / hBlock; 841 std::int32_t pitchDiff2 = (pitch4 - pitch3) / hBlock; 842 843 std::int32_t yawIndexA = yawIndex1; 844 std::int32_t yawIndexB = yawIndex3; 845 std::int32_t pitchA = pitch1; 846 std::int32_t pitchB = pitch3; 847 848 for (unsigned int x = 0; x < blockSize; x += under) { 849 uint32_t *fb3 = fb2 + x; 850 auto *db3 = db2 + x; 851 852 std::int32_t yawIndexC = yawIndexA; 853 std::int32_t yawDelta = ((yawIndexB - yawIndexA) << 8 >> 8) / blockSize; 854 std::int32_t pitchC = pitchA; 855 std::int32_t pitchDelta = (pitchB - pitchA) / blockSize; 856 857 for (unsigned int y = 0; y < blockSize; y++) { 858 859 std::uint32_t yawIndex = 860 static_cast<unsigned int>(yawIndexC << 8 >> 16); 861 yawIndex = (yawIndex * yawScale2) >> 16; 862 yawIndex = (yawIndex * numLines) >> 16; 863 auto &line = lineList[yawIndex]; 864 auto *pixels = line.pixels.data(); 865 866 // solve pitch 867 std::int32_t pitchIndex; 868 869 { 870 pitchIndex = pitchC >> 13; 871 pitchIndex -= line.pitchTanMinI; 872 pitchIndex = 873 static_cast<int>((static_cast<int64_t>(pitchIndex) * 874 static_cast<int64_t>(line.pitchScaleI)) >> 875 32); 876 // pitch = (pitch - line.pitchTanMin) * line.pitchScale; 877 // pitchIndex = static_cast<int>(pitch); 878 pitchIndex &= lineResolution - 1; 879 // pitchIndex = std::max(pitchIndex, 0); 880 // pitchIndex = std::min(pitchIndex, lineResolution - 1); 881 } 882 883 auto &pix = pixels[pitchIndex]; 884 885 // write color. 886 // NOTE: combined contains both color and other information, 887 // though this isn't a problem as long as the color comes 888 // in the LSB's 889 #if ENABLE_SSE 890 if (flevel == SWFeatureLevel::SSE2) { 891 __m128i m; 892 893 if (under == 1) { 894 *fb3 = pix.combined; 895 *db3 = pix.depth; 896 } else if (under == 2) { 897 m = _mm_castpd_si128( 898 _mm_load_sd(reinterpret_cast<const double *>(&pix))); 899 _mm_store_sd(reinterpret_cast<double *>(fb3), 900 _mm_castsi128_pd(_mm_shuffle_epi32(m, 0x00))); 901 _mm_store_sd(reinterpret_cast<double *>(db3), 902 _mm_castsi128_pd(_mm_shuffle_epi32(m, 0x55))); 903 } else if (under == 4) { 904 m = _mm_castpd_si128( 905 _mm_load_sd(reinterpret_cast<const double *>(&pix))); 906 _mm_stream_si128(reinterpret_cast<__m128i *>(fb3), 907 _mm_shuffle_epi32(m, 0x00)); 908 _mm_stream_si128(reinterpret_cast<__m128i *>(db3), 909 _mm_shuffle_epi32(m, 0x55)); 910 } 911 912 } else 913 #endif 914 // non-optimized 915 { 916 uint32_t col = pix.combined; 917 float d = pix.depth; 918 919 for (int k = 0; k < under; k++) { 920 fb3[k] = col; 921 db3[k] = d; 922 } 923 } 924 925 fb3 += fw; 926 db3 += fw; 927 928 yawIndexC += yawDelta; 929 pitchC += pitchDelta; 930 } 931 932 yawIndexA += yawDiff1; 933 yawIndexB += yawDiff2; 934 pitchA += pitchDiff1; 935 pitchB += pitchDiff2; 936 } 937 } 938 goto Converge; 939 940 SlowBlockPath : { 941 Vector3 v3 = v2; 942 Vector2 screenPos2 = screenPos; 943 for (unsigned int x = 0; x < blockSize; x += under) { 944 Vector3 v4 = v3; 945 uint32_t *fb3 = fb2 + x; 946 auto *db3 = db2 + x; 947 screenPos2.y = screenPos.y; 948 949 for (unsigned int y = 0; y < blockSize; y++) { 950 Vector3 vv = v4; 951 952 // solve yaw 953 std::uint32_t yawIndex; 954 { 955 float x = vv.x, y = vv.y; 956 int yaw; 957 yaw = fastATan2(y, x); 958 yaw -= yawMin2; 959 yawIndex = static_cast<unsigned int>(yaw & 0xffff); 960 } 961 yawIndex = (yawIndex * yawScale2) >> 16; 962 yawIndex = (yawIndex * numLines) >> 16; 963 964 auto &line = lineList[yawIndex]; 965 auto *pixels = line.pixels.data(); 966 967 // solve pitch 968 std::int32_t pitchIndex; 969 970 { 971 float pitch; 972 pitch = vv.z * fastRSqrt(vv.x * vv.x + vv.y * vv.y); 973 pitch = ToSpecialTan(pitch); 974 pitch = (pitch - line.pitchTanMin) * line.pitchScale; 975 pitchIndex = static_cast<int>(pitch); 976 pitchIndex &= lineResolution - 1; 977 // pitchIndex = std::max(pitchIndex, 0); 978 // pitchIndex = std::min(pitchIndex, lineResolution - 1); 979 } 980 981 auto &pix = pixels[pitchIndex]; 982 983 // write color. 984 // NOTE: combined contains both color and other information, 985 // though this isn't a problem as long as the color comes 986 // in the LSB's 987 #if ENABLE_SSE 988 if (flevel == SWFeatureLevel::SSE2) { 989 __m128i m; 990 991 if (under == 1) { 992 *fb3 = pix.combined; 993 *db3 = pix.depth; 994 } else if (under == 2) { 995 m = _mm_castpd_si128( 996 _mm_load_sd(reinterpret_cast<const double *>(&pix))); 997 _mm_store_sd(reinterpret_cast<double *>(fb3), 998 _mm_castsi128_pd(_mm_shuffle_epi32(m, 0x00))); 999 _mm_store_sd(reinterpret_cast<double *>(db3), 1000 _mm_castsi128_pd(_mm_shuffle_epi32(m, 0x55))); 1001 } else if (under == 4) { 1002 m = _mm_castpd_si128( 1003 _mm_load_sd(reinterpret_cast<const double *>(&pix))); 1004 _mm_stream_si128(reinterpret_cast<__m128i *>(fb3), 1005 _mm_shuffle_epi32(m, 0x00)); 1006 _mm_stream_si128(reinterpret_cast<__m128i *>(db3), 1007 _mm_shuffle_epi32(m, 0x55)); 1008 } 1009 1010 } else 1011 #endif 1012 // non-optimized 1013 { 1014 uint32_t col = pix.combined; 1015 float d = pix.depth; 1016 1017 for (int k = 0; k < under; k++) { 1018 fb3[k] = col; 1019 db3[k] = d; 1020 } 1021 } 1022 1023 fb3 += fw; 1024 db3 += fw; 1025 1026 v4 += deltaDown; 1027 screenPos2.y += deltaScreenPosDownSmall; 1028 } // y 1029 v3 += deltaRight; 1030 screenPos2.x += deltaScreenPosRightSmall; 1031 } // x 1032 1033 } // end SlowBlockPath 1034 1035 Converge: 1036 1037 v2 += deltaDownLarge; 1038 screenPos.y += deltaScreenPosDown; 1039 } // fy 1040 v1 += deltaRightLarge; 1041 screenPos.x += deltaScreenPosRight; 1042 } // fx 1043 } 1044 1045 template <SWFeatureLevel flevel> RenderInner(const client::SceneDefinition & def,Bitmap * frame,float * depthBuffer)1046 void SWMapRenderer::RenderInner(const client::SceneDefinition &def, Bitmap *frame, 1047 float *depthBuffer) { 1048 1049 sceneDef = def; 1050 frameBuf = frame; 1051 depthBuf = depthBuffer; 1052 1053 // calculate line density. 1054 float yawMin, yawMax; 1055 float pitchMin, pitchMax; 1056 size_t numLines; 1057 { 1058 float fovX = tanf(def.fovX * 0.5f); 1059 float fovY = tanf(def.fovY * 0.5f); 1060 float fovDiag = sqrtf(fovX * fovX + fovY * fovY); 1061 float fovDiagAng = atanf(fovDiag); 1062 float pitch = asinf(def.viewAxis[2].z); 1063 static const float pi = M_PI; 1064 1065 // pitch = 0.f; 1066 1067 if (fabsf(pitch) >= pi * 0.49f - fovDiagAng) { 1068 // pole is visible 1069 yawMin = 0.f; 1070 yawMax = pi * 2.f; 1071 } else { 1072 float yaw = atan2l(def.viewAxis[2].y, def.viewAxis[2].x); 1073 // TODO: incorrect! 1074 yawMin = yaw - pi * .5f; // fovDiagAng; 1075 yawMax = yaw + pi * .5f; // fovDiagAng; 1076 } 1077 1078 pitchMin = pitch - fovDiagAng; 1079 pitchMax = pitch + fovDiagAng; 1080 if (pitchMin < -pi * 0.5f) { 1081 pitchMax = std::max(pitchMax, -pi - pitchMin); 1082 pitchMin = -pi * 0.5f; 1083 } 1084 if (pitchMax > pi * 0.5f) { 1085 pitchMin = std::min(pitchMin, pi - pitchMax); 1086 pitchMax = pi * 0.5f; 1087 } 1088 1089 // pitch of PI/2 will make tan(x) infinite 1090 pitchMin = std::max(pitchMin, -pi * 0.4999f); 1091 pitchMax = std::min(pitchMax, pi * 0.4999f); 1092 1093 float interval = static_cast<float>(frame->GetHeight()); 1094 interval = fovY * 2.f / interval; 1095 lineResolution = static_cast<int>((pitchMax - pitchMin) / interval * 1.5f); 1096 lineResolution = frame->GetHeight(); 1097 1098 for (int i = lineResolution, j = 1; j <= i; j <<= 1) { 1099 lineResolution = j; 1100 } 1101 1102 if (pitchMin > 0.f) { 1103 // interval /= cosf(pitchMin); 1104 } else if (pitchMax < 0.f) { 1105 // interval /= cosf(pitchMax); 1106 } 1107 1108 numLines = static_cast<size_t>((yawMax - yawMin) / interval); 1109 1110 int under = r_swUndersampling; 1111 under = std::max(std::min(under, 4), 1); 1112 numLines /= under; 1113 1114 if (numLines < 8) 1115 numLines = 8; 1116 if (numLines > 65536) { 1117 numLines = 1118 65536; // SPRaise("Too many lines emit: %d", static_cast<int>(numLines)); 1119 } 1120 lines.resize(std::max(numLines, lines.size())); 1121 /* 1122 SPLog("numlines: %d, each %f deg, and %d res", 1123 static_cast<int>(numLines), 1124 interval * 180.f / pi, 1125 static_cast<int>(lineResolution));*/ 1126 } 1127 1128 // calculate vector for each lines 1129 { 1130 float scl = (yawMax - yawMin) / numLines; 1131 Vector3 horiz = Vector3::Make(cosf(yawMin), sinf(yawMin), 0.f); 1132 float c = cosf(scl); 1133 float s = sinf(scl); 1134 for (size_t i = 0; i < numLines; i++) { 1135 Line &l = lines[i]; 1136 l.horizonDir = horiz; 1137 1138 float x = horiz.x * c - horiz.y * s; 1139 float y = horiz.x * s + horiz.y * c; 1140 horiz.x = x; 1141 horiz.y = y; 1142 } 1143 } 1144 1145 { 1146 unsigned int nlines = static_cast<unsigned int>(numLines); 1147 InvokeParallel2([&](unsigned int th, unsigned int numThreads) { 1148 unsigned int start = th * nlines / numThreads; 1149 unsigned int end = (th + 1) * nlines / numThreads; 1150 1151 for (size_t i = start; i < end; i++) { 1152 BuildLine<flevel>(lines[i], pitchMin, pitchMax); 1153 } 1154 }); 1155 } 1156 1157 int under = r_swUndersampling; 1158 1159 InvokeParallel2([&](unsigned int th, unsigned int numThreads) { 1160 1161 if (under <= 1) { 1162 RenderFinal<flevel, 1>(yawMin, yawMax, static_cast<unsigned int>(numLines), th, 1163 numThreads); 1164 } else if (under <= 2) { 1165 RenderFinal<flevel, 2>(yawMin, yawMax, static_cast<unsigned int>(numLines), th, 1166 numThreads); 1167 } else { 1168 RenderFinal<flevel, 4>(yawMin, yawMax, static_cast<unsigned int>(numLines), th, 1169 numThreads); 1170 } 1171 }); 1172 1173 frameBuf = nullptr; 1174 depthBuf = nullptr; 1175 } 1176 Render(const client::SceneDefinition & def,Bitmap * frame,float * depthBuffer)1177 void SWMapRenderer::Render(const client::SceneDefinition &def, Bitmap *frame, 1178 float *depthBuffer) { 1179 if (!frame) 1180 SPInvalidArgument("frame"); 1181 if (!depthBuffer) 1182 SPInvalidArgument("depthBuffer"); 1183 1184 auto p = def.viewOrigin.Floor(); 1185 if (map->IsSolidWrapped(p.x, p.y, p.z)) { 1186 return; 1187 } 1188 1189 #if ENABLE_SSE2 1190 if (static_cast<int>(level) >= static_cast<int>(SWFeatureLevel::SSE2)) { 1191 RenderInner<SWFeatureLevel::SSE2>(def, frame, depthBuffer); 1192 return; 1193 } 1194 #endif 1195 1196 RenderInner<SWFeatureLevel::None>(def, frame, depthBuffer); 1197 } 1198 } 1199 } 1200