1 /*
2  Copyright (c) 2013 yvt
3 
4  This file is part of OpenSpades.
5 
6  OpenSpades is free software: you can redistribute it and/or modify
7  it under the terms of the GNU General Public License as published by
8  the Free Software Foundation, either version 3 of the License, or
9  (at your option) any later version.
10 
11  OpenSpades is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  GNU General Public License for more details.
15 
16  You should have received a copy of the GNU General Public License
17  along with OpenSpades.  If not, see <http://www.gnu.org/licenses/>.
18 
19  */
20 
21 #include <array>
22 #include <cstdint>
23 #include <cstring>
24 
25 #include "SWMapRenderer.h"
26 #include "SWRenderer.h"
27 #include "SWUtils.h"
28 #include <Client/GameMap.h>
29 #include <Core/Bitmap.h>
30 #include <Core/ConcurrentDispatch.h>
31 #include <Core/MiniHeap.h>
32 #include <Core/Settings.h>
33 #include <Core/Stopwatch.h>
34 
35 using namespace std;
36 
37 DEFINE_SPADES_SETTING(r_swUndersampling, "0");
38 
39 namespace spades {
40 	namespace draw {
41 
42 		// special tan function whose value is finite.
SpecialTan(float v)43 		static inline float SpecialTan(float v) {
44 			static const float pi = M_PI;
45 			if (v <= -pi * 0.5f) {
46 				return -2.f;
47 			} else if (v < -pi * 0.25f) {
48 				v = -2.f - 1.f / tanf(v);
49 			} else if (v < pi * 0.25f) {
50 				v = tanf(v);
51 			} else if (v < pi * 0.5f) {
52 				v = 2.f - 1.f / tanf(v);
53 			} else {
54 				return v = 2.f;
55 			}
56 			return v;
57 		}
58 		// convert from tan value to special tan value.
ToSpecialTan(float v)59 		static inline float ToSpecialTan(float v) {
60 			if (v < -1.f)
61 				return -2.f - fastRcp(v);
62 			else if (v > 1.f)
63 				return 2.f - fastRcp(v);
64 			else
65 				return v;
66 		}
67 
68 		enum class Face : short { PosX, NegX, PosY, NegY, PosZ, NegZ };
69 
		// One sample of a rendered Line: a packed color/flag word plus a depth.
		//
		// The anonymous structs below are overlaid in a single union so that the
		// same storage can be accessed as (combined, depth), as bit-fields, or as
		// one 64-bit word. BuildLine static_asserts sizeof(LinePixel) == 8 when
		// ENABLE_SSE is set.
		// NOTE(review): the code relies on the bit-fields sharing storage with
		// `combined` (color in the low bits, `filled` above it); bit-field layout
		// is implementation-defined -- confirm for any new compiler/target.
		struct SWMapRenderer::LinePixel {
			union {
				struct {
					// color and flag bits as one word; 0 means "empty" (see IsEmpty)
					uint32_t combined;
					float depth;
				};
				struct {
					unsigned int color : 24;
					// Face face: 7;
					// set whenever a pixel is written, so `combined` becomes non-zero
					bool filled : 1;
				};
				struct {
					// whole pixel as one word; used by Set() to copy in one store
					uint64_t allData;
				};
			};

			// Copies every field of `p` into this pixel.
			// using "operator =" makes this struct non-POD
			void Set(const LinePixel &p) { allData = p.allData; }

			// Resets the pixel to the empty state with a depth of 10000.
			inline void Clear() {
				combined = 0;
				depth = 10000.f;
			}

			// Returns true if nothing has been written since the last Clear().
			inline bool IsEmpty() const { return combined == 0; }
		};
96 
		// infinite length line from -z to +z
		// (one vertical slice of the view, identified by its horizontal direction;
		// filled in by BuildLine and sampled by RenderFinal)
		struct SWMapRenderer::Line {
			// Rendered samples; BuildLine resizes this to lineResolution entries.
			std::vector<LinePixel> pixels;
			// Direction of the slice; BuildLine uses its x/y components as the ray
			// direction.
			Vector3 horizonDir;
			// SpecialTan() of the lower pitch bound, as computed by BuildLine.
			float pitchTanMin;
			// lineResolution / (maxTan - minTan): scales a special-tan value
			// (relative to pitchTanMin) to an index into `pixels`.
			float pitchScale;
			// pitchTanMin multiplied by 65536 and truncated to int.
			int pitchTanMinI;
			// pitchScale multiplied by 65536 and truncated to int.
			int pitchScaleI;
		};
106 
SWMapRenderer(SWRenderer * r,client::GameMap * m,SWFeatureLevel level)107 		SWMapRenderer::SWMapRenderer(SWRenderer *r, client::GameMap *m, SWFeatureLevel level)
108 		    : w(m->Width()),
109 		      h(m->Height()),
110 		      renderer(r),
111 		      level(level),
112 		      map(m),
113 		      frameBuf(nullptr),
114 		      depthBuf(nullptr),
115 		      rleHeap(m->Width() * m->Height() * 64) {
116 			rle.resize(w * h);
117 			rleLen.resize(w * h);
118 
119 			Stopwatch sw;
120 			sw.Reset();
121 			SPLog("Building RLE map...");
122 
123 			int idx = 0;
124 			for (int y = 0; y < h; y++)
125 				for (int x = 0; x < w; x++) {
126 					BuildRle(x, y, rleBuf);
127 
128 					auto ref = rleHeap.Alloc(rleBuf.size() * sizeof(RleData));
129 					short *ptr = rleHeap.Dereference<short>(ref);
130 					std::memcpy(ptr, rleBuf.data(), rleBuf.size() * sizeof(RleData));
131 
132 					rle[idx] = ref;
133 					rleLen[idx] = rleBuf.size() * sizeof(RleData);
134 
135 					idx++;
136 				}
137 			SPLog("RLE map created in %.6f seconds", sw.GetTime());
138 		}
139 
~SWMapRenderer()140 		SWMapRenderer::~SWMapRenderer() {}
141 
BuildRle(int x,int y,std::vector<RleData> & out)142 		void SWMapRenderer::BuildRle(int x, int y, std::vector<RleData> &out) {
143 			out.clear();
144 
145 			out.push_back(0); // [0] = +Z face position address
146 			out.push_back(0);
147 			out.push_back(0); // [2] = +X face position address
148 			out.push_back(0);
149 			out.push_back(0); // [4] = -X face position address
150 			out.push_back(0);
151 			out.push_back(0); // [6] = +Y face position address
152 			out.push_back(0);
153 			out.push_back(0); // [8] = -Y face position address
154 			out.push_back(0);
155 
156 			auto setHeader = [&](size_t idx, size_t val) {
157 				reinterpret_cast<short *>(out.data())[idx] = static_cast<short>(val);
158 			};
159 
160 			uint64_t smap = map->GetSolidMapWrapped(x, y);
161 			std::array<uint64_t, 4> adjs = {
162 			  map->GetSolidMapWrapped(x + 1, y), map->GetSolidMapWrapped(x - 1, y),
163 			  map->GetSolidMapWrapped(x, y + 1), map->GetSolidMapWrapped(x, y - 1)};
164 			bool old = false;
165 
166 			for (int z = 0; z < 64; z++) {
167 				bool b = (smap >> z) & 1;
168 				if (b && !old) {
169 					out.push_back(static_cast<RleData>(z));
170 				}
171 				old = b;
172 			}
173 			out.push_back(-1);
174 
175 			setHeader(0, out.size());
176 
177 			old = true;
178 			for (int z = 63; z >= 0; z--) {
179 				bool b = (smap >> z) & 1;
180 				if (b && !old) {
181 					out.push_back(static_cast<RleData>(z));
182 				}
183 				old = b;
184 			}
185 			out.push_back(-1);
186 
187 			for (int k = 0; k < 4; k++) {
188 				setHeader(k + 1, out.size());
189 				for (int z = 0; z < 64; z++) {
190 					if ((smap >> z) & 1) {
191 						if (!((adjs[k] >> z) & 1)) {
192 							out.push_back(static_cast<RleData>(z));
193 						}
194 					}
195 				}
196 				out.push_back(-1);
197 			}
198 
199 			// padding
200 			while (out.size() & 3) {
201 				out.push_back(42);
202 			}
203 		}
204 
UpdateRle(int x,int y)205 		void SWMapRenderer::UpdateRle(int x, int y) {
206 			int idx = x + y * w;
207 			BuildRle(x, y, rleBuf);
208 
209 			rleHeap.Free(rle[idx], rleLen[idx]);
210 
211 			auto ref = rleHeap.Alloc(rleBuf.size() * sizeof(RleData));
212 			short *ptr = rleHeap.Dereference<short>(ref);
213 			std::memcpy(ptr, rleBuf.data(), rleBuf.size() * sizeof(RleData));
214 
215 			rle[idx] = ref;
216 			rleLen[idx] = rleBuf.size() * sizeof(RleData);
217 		}
218 
		// Renders one vertical slice of the view into `line.pixels`.
		//
		// The slice is the set of view directions whose horizontal direction is
		// `line.horizonDir`. The function
		//   1. derives the visible pitch range by clipping against frustum planes
		//      2..5 (the incoming minPitch/maxPitch values are overwritten),
		//   2. stores the pitch -> pixel mapping in `line` (float and x65536 integer
		//      forms),
		//   3. clears all pixels, and
		//   4. walks the map columns along the ray using integer arithmetic,
		//      filling still-empty pixels with floor/ceiling and wall faces taken
		//      from the RLE data, until the fog distance or the step limit is
		//      reached, or the pitch check decides nothing more can be visible.
		//
		// flevel selects the SSE2 or the plain color computation in BuildLinePixel.
		template <SWFeatureLevel flevel>
		void SWMapRenderer::BuildLine(Line &line, float minPitch, float maxPitch) {

			// hard code for further optimization
			enum { w = 512, h = 512 };
			SPAssert(map->Width() == 512);
			SPAssert(map->Height() == 512);

			// local copies of members used in the loops below
			const auto *rle = this->rle.data();
			auto &rleHeap = this->rleHeap;
			client::GameMap *map = this->map;

			// pitch culling
			{
				const auto &frustrum = renderer->frustrum;
				static const float pi = M_PI;
				const auto &horz = line.horizonDir;
				// start from (almost) the full range; the arguments are discarded
				minPitch = -pi * 0.4999f;
				maxPitch = pi * 0.4999f;

				// makes the range empty (min > max) so the early-out below triggers
				auto cull = [&minPitch, &maxPitch]() {
					minPitch = 2.f;
					maxPitch = -2.f;
				};
				// narrows [minPitch, maxPitch] using one plane normal
				auto clip = [&minPitch, &maxPitch, &horz, &cull](Vector3 plane) {
					if (plane.x == 0.f && plane.y == 0.f) {
						// normal is purely vertical: limit the range to one side of 0
						if (plane.z > 0.f) {
							minPitch = std::max(minPitch, 0.f);
						} else {
							maxPitch = std::min(maxPitch, 0.f);
						}
					} else if (plane.z == 0.f) {
						// normal is purely horizontal: the slice is either fully kept
						// or fully culled
						if (Vector3::Dot(plane, horz) < 0.f) {
							cull();
						}
					} else {
						Vector3 prj = plane;
						prj.z = 0.f;
						prj = prj.Normalize();

						float zv = fabsf(plane.z);
						float cs = Vector3::Dot(prj, horz);

						float ang = zv * zv * (1.f - cs * cs) / (cs * cs);
						ang = -cs * fastSqrt(1.f + ang);
						ang = zv / ang;
						// degenerate result: leave the range untouched
						if (std::isnan(ang) || std::isinf(ang) || ang == 0.f)
							return;

						// convert to tan
						ang = fastSqrt(1.f - ang * ang) / ang;

						// convert to angle
						ang = atanf(ang);

						if (std::isnan(ang) || std::isinf(ang))
							return;

						// widen by 0.01 rad on the side being clipped
						if (plane.z > 0.f) {
							minPitch = std::max(minPitch, ang - 0.01f);
						} else {
							maxPitch = std::min(maxPitch, -ang + 0.01f);
						}
					}
				};

				clip(frustrum[2].n);
				clip(frustrum[3].n);
				clip(frustrum[4].n);
				clip(frustrum[5].n);
			}

			// pitch limits expressed as bounded "special tan" values
			float minTan = SpecialTan(minPitch);
			float maxTan = SpecialTan(maxPitch);

			{
				float minDiff = lineResolution / 10000.f;
				if (maxTan < minTan + minDiff) {
					// too little difference; scale value might overflow.
					maxTan = minTan + minDiff;
				}
			}

			// mapping from special-tan value to pixel index; the *I members are the
			// same values multiplied by 65536 (read by RenderFinal's integer path)
			line.pitchTanMin = minTan;
			line.pitchScale = lineResolution / (maxTan - minTan);
			line.pitchTanMinI = static_cast<int>(minTan * 65536.f);
			line.pitchScaleI = static_cast<int>(line.pitchScale * 65536.f);

			// TODO: pitch culling

			// ray direction
			float dirX = line.horizonDir.x;
			float dirY = line.horizonDir.y;
			// keep the components away from zero so the reciprocals below are finite
			// (note: a tiny negative component is replaced by +1e-4)
			if (fabsf(dirY) < 1.e-4f)
				dirY = 1.e-4f;
			if (fabsf(dirX) < 1.e-4f)
				dirX = 1.e-4f;
			float invDirX = 1.f / dirX;
			float invDirY = 1.f / dirY;
			std::int_fast8_t signX = dirX > 0.f ? 1 : -1;
			std::int_fast8_t signY = dirY > 0.f ? 1 : -1;
			// integer forms: reciprocal scaled by 256, direction scaled by 512;
			// only magnitudes are kept, the sign is handled by signX/signY
			int invDirXI = static_cast<int>(invDirX * 256.f);
			int invDirYI = static_cast<int>(invDirY * 256.f);
			int dirXI = static_cast<int>(dirX * 512.f);
			int dirYI = static_cast<int>(dirY * 512.f);
			if (invDirXI < 0)
				invDirXI = -invDirXI;
			if (invDirYI < 0)
				invDirYI = -invDirYI;
			if (dirXI < 0)
				dirXI = -dirXI;
			if (dirYI < 0)
				dirYI = -dirYI;

			// camera position
			float cx = sceneDef.viewOrigin.x;
			float cy = sceneDef.viewOrigin.y;
			float cz = sceneDef.viewOrigin.z;

			int icz = static_cast<int>(floorf(cz));

			// ray position
			// float rx = cx, ry = cy;
			// (in units of 1/512 voxel)
			int rx = static_cast<int>(cx * 512.f);
			int ry = static_cast<int>(cy * 512.f);

			// ray position in integer
			std::int_fast16_t irx = rx >> 9; // static_cast<int>(floorf(rx));
			std::int_fast16_t iry = ry >> 9; // static_cast<int>(floorf(ry));

			float fogDist = 128.f;
			// starts at a tiny non-zero value so that invDist is finite
			float distance = 1.e-20f; // traveled path
			float invDist = 1.f / distance;

			// auto& pixels = line.pixels;

			line.pixels.resize(lineResolution);
			auto *pixels = line.pixels.data(); // std::vector feels slow...

			// special-tan value -> pixel index: index = value * transScale + transOffset
			const float transScale = static_cast<float>(lineResolution) / (maxTan - minTan);
			const float transOffset = -minTan * transScale;

			// clear every pixel (two at a time with aligned 16-byte stores when SSE
			// is enabled)
#if ENABLE_SSE
			if (lineResolution > 4) {
				static_assert(sizeof(LinePixel) == 8,
				              "size of LinePixel has changed; needs code modification");
				union {
					LinePixel pxs[2];
					__m128 m;
				};
				pxs[0].Clear();
				pxs[1].Clear();
				auto *ptr = pixels;
				// leading pixels until the pointer is 16-byte aligned
				for (auto *e = pixels + lineResolution;
				     (reinterpret_cast<size_t>(ptr) & 0xf) && (ptr < e); ptr++) {
					ptr->Clear();
				}
				// aligned middle part, two pixels per store
				for (auto *e = pixels + lineResolution - 2; ptr < e; ptr += 2) {
					_mm_store_ps(reinterpret_cast<float *>(ptr), m);
				}
				// remaining tail
				for (auto *e = pixels + lineResolution; ptr < e; ptr++) {
					ptr->Clear();
				}
			} else
#endif
				for (size_t i = 0; i < lineResolution; i++)
					pixels[i].Clear();

			// if culled out, bail out now (pixels are filled)
			if (minPitch >= maxPitch)
				return;

			std::array<float, 65> zval; // precompute (z - cz) * some
			for (size_t i = 0; i < zval.size(); i++)
				zval[i] = (static_cast<float>(i) - cz);

			// Maps the height index z seen at inverse distance invDist to a pixel
			// index, clamped to [0, lineResolution].
			float vmax = lineResolution + 0.5f;
			auto transform = [&zval, &transOffset, vmax, &transScale](float invDist, int z) {
				float p = ToSpecialTan(invDist * zval[z]) * transScale + transOffset;
				p = std::max(p, 0.f);
				p = std::min(p, vmax);
				return static_cast<std::uint_fast16_t>(p);
			};

			float zscale; // travel distance -> view Z value factor
			zscale = Vector3::Dot(line.horizonDir, sceneDef.viewAxis[2]);

			float heightScale; // Z value -> view Z value factor
			heightScale = sceneDef.viewAxis[2].z;

			std::array<float, 65> heightScaleVal; // precompute (heightScale * z)
			for (size_t i = 0; i < zval.size(); i++)
				heightScaleVal[i] = (static_cast<float>(i) * heightScale);

			float depthBias;
			depthBias = -cz * heightScale;

			// RLE data of the column the ray is currently in (initially the camera's)
			RleData *lastRle;
			{
				auto ref = rle[(irx & w - 1) + ((iry & h - 1) * w)];
				lastRle = rleHeap.Dereference<RleData>(ref);
			}

			// count: iterations left until the next pitch-cull check
			// cnt2:  hard limit on the number of traversal steps
			std::uint_fast16_t count = 1;
			std::uint_fast16_t cnt2 = static_cast<int>(fogDist * 8.f);

			while (distance < fogDist && (--cnt2) > 0) {
				std::int_fast16_t nextIRX, nextIRY;
				// column being left in this step (used for its floor/ceiling)
				auto oirx = irx, oiry = iry;

				// DDE
				// Step to the next column boundary. timeToNextX/Y are the remaining
				// sub-voxel offsets (in 1/512 voxel) times the x256 reciprocals, so
				// they are in units of 1/(512*256); ">> 17" converts the product
				// with the x512 direction back to 1/512 voxel units.
				Face wallFace;

				if (signX > 0) {
					nextIRX = irx + 1;
					if (signY > 0) {
						nextIRY = iry + 1;

						unsigned int timeToNextX = (512 - (rx & 511)) * invDirXI;
						unsigned int timeToNextY = (512 - (ry & 511)) * invDirYI;

						if (timeToNextX < timeToNextY) {
							// go across x plane
							irx = nextIRX;
							rx = irx << 9;
							ry += (dirYI * timeToNextX) >> 17;
							distance += static_cast<float>(timeToNextX) * (1.f / 512.f / 256.f);
							wallFace = Face::NegX;
						} else {
							// go across y plane
							iry = nextIRY;
							rx += (dirXI * timeToNextY) >> 17;
							ry = iry << 9;
							distance += static_cast<float>(timeToNextY) * (1.f / 512.f / 256.f);
							wallFace = Face::NegY;
						}
					} else /* (signY < 0) */ {
						nextIRY = iry - 1;

						unsigned int timeToNextX = (512 - (rx & 511)) * invDirXI;
						unsigned int timeToNextY = (ry & 511) * invDirYI;

						if (timeToNextX < timeToNextY) {
							// go across x plane
							irx = nextIRX;
							rx = irx << 9;
							ry -= (dirYI * timeToNextX) >> 17;
							distance += static_cast<float>(timeToNextX) * (1.f / 512.f / 256.f);
							wallFace = Face::NegX;
						} else {
							// go across y plane
							iry = nextIRY;
							rx += (dirXI * timeToNextY) >> 17;
							ry = (iry << 9) - 1;
							distance += static_cast<float>(timeToNextY) * (1.f / 512.f / 256.f);
							wallFace = Face::PosY;
						}
					}
				} else /* signX < 0 */ {
					nextIRX = irx - 1;
					if (signY > 0) {
						nextIRY = iry + 1;

						unsigned int timeToNextX = (rx & 511) * invDirXI;
						unsigned int timeToNextY = (512 - (ry & 511)) * invDirYI;

						if (timeToNextX < timeToNextY) {
							// go across x plane
							irx = nextIRX;
							rx = (irx << 9) - 1;
							ry += (dirYI * timeToNextX) >> 17;
							distance += static_cast<float>(timeToNextX) * (1.f / 512.f / 256.f);
							wallFace = Face::PosX;
						} else {
							// go across y plane
							iry = nextIRY;
							rx -= (dirXI * timeToNextY) >> 17;
							ry = iry << 9;
							distance += static_cast<float>(timeToNextY) * (1.f / 512.f / 256.f);
							wallFace = Face::NegY;
						}
					} else /* (signY < 0) */ {
						nextIRY = iry - 1;

						unsigned int timeToNextX = (rx & 511) * invDirXI;
						unsigned int timeToNextY = (ry & 511) * invDirYI;

						if (timeToNextX < timeToNextY) {
							// go across x plane
							irx = nextIRX;
							rx = (irx << 9) - 1;
							ry -= (dirYI * timeToNextX) >> 17;
							distance += static_cast<float>(timeToNextX) * (1.f / 512.f / 256.f);
							wallFace = Face::PosX;
						} else {
							// go across y plane
							iry = nextIRY;
							rx -= (dirXI * timeToNextY) >> 17;
							ry = (iry << 9) - 1;
							distance += static_cast<float>(timeToNextY) * (1.f / 512.f / 256.f);
							wallFace = Face::PosY;
						}
					}
				}

				// inverse distance at the entry of the column that was just left
				float oldInvDist = invDist;

				invDist = fastRcp(distance);

				float medDist = distance * zscale + depthBias; //(distance + oldDistance) * 0.5f;

				// check for new spans

				// Builds the pixel for voxel (x, y, z): the map color with two of
				// its byte channels swapped, darkened according to the face, plus
				// the `filled` flag and the given depth.
				auto BuildLinePixel = [map](int x, int y, int z, Face face, float dist) {
					LinePixel px;
					px.depth = dist;
#if ENABLE_SSE
					if (flevel == SWFeatureLevel::SSE2) {
						__m128i m;
						uint32_t col = map->GetColorWrapped(x, y, z);
						m = _mm_setr_epi32(col, 0, 0, 0);
						m = _mm_unpacklo_epi8(m, _mm_setzero_si128());
						m = _mm_shufflelo_epi16(m, 0xc6);

						switch (face) {
							case Face::PosZ: m = _mm_srli_epi16(m, 1); break;
							case Face::PosX:
							case Face::PosY:
							case Face::NegX:
								m = _mm_adds_epi16(_mm_srli_epi16(m, 1), _mm_srli_epi16(m, 2));
								break;
							default: break;
						}
						// additionally halve the color when the top byte is below 100
						if ((col >> 24) < 100) {
							m = _mm_srli_epi16(m, 1);
						}
						m = _mm_packus_epi16(m, m);
						_mm_store_ss(reinterpret_cast<float *>(&px.combined), _mm_castsi128_ps(m));
						px.filled = true;
					} else
#endif
					// non-optimized
					{
						uint32_t col;
						col = map->GetColorWrapped(x, y, z);
						// swap the lowest and the third byte
						col = (col & 0xff00) | ((col & 0xff) << 16) | ((col & 0xff0000) >> 16);
						switch (face) {
							case Face::PosZ: col = (col & 0xfcfcfc) >> 2; break;
							case Face::PosX:
							case Face::PosY:
							case Face::NegX: col = (col & 0xfefefe) >> 1; break;
							default: break;
						}
						px.combined = col;
						px.filled = true;
					}
					return px;
				};

				// floor/ceiling
				// (of the column that was just left, between its entry and exit
				// distances)
				{

					// linear code

					// RLE scan
					RleData *rle = lastRle;
					{
						// first list starts right after the 10-entry header
						RleData *ptr = rle + 10;
						while (*ptr != -1) {
							std::int_fast8_t z = *ptr;
							if (z > icz) {
								std::uint_fast16_t p1 = transform(invDist, z);
								std::uint_fast16_t p2 = transform(oldInvDist, z);
								LinePixel pix = BuildLinePixel(oirx, oiry, z, Face::NegZ,
								                               medDist + heightScaleVal[z]);

								// only pixels nothing has been drawn to yet
								for (std::uint_fast16_t j = p1; j < p2; j++) {
									auto &p = pixels[j];
									if (!p.IsEmpty())
										continue;
									p.Set(pix);
								}
							}
							ptr++;
						}
						// skip the terminator; the second list follows immediately
						ptr++;
						while (*ptr != -1) {
							std::int_fast8_t z = *ptr;
							if (z < icz) {
								std::uint_fast16_t p1 = transform(invDist, z + 1);
								std::uint_fast16_t p2 = transform(oldInvDist, z + 1);
								LinePixel pix = BuildLinePixel(oirx, oiry, z, Face::PosZ,
								                               medDist + heightScaleVal[z + 1]);

								for (std::uint_fast16_t j = p2; j < p1; j++) {
									auto &p = pixels[j];
									if (!p.IsEmpty())
										continue;
									p.Set(pix);
								}
							}
							ptr++;
						}
					}

				} // done: floor/ceiling

				// add walls
				// (of the column that was just entered, on the face that was crossed)
				{
					// by RLE map
					auto ref = rle[static_cast<std::uint_fast32_t>(irx & w - 1) +
					               static_cast<std::uint_fast32_t>(iry & h - 1) * w];
					RleData *rle = rleHeap.Dereference<RleData>(ref);
					lastRle = rle;
					auto *ptr = rle;
					// header entry (1 + face) holds the offset of that face's wall list
					ptr += reinterpret_cast<unsigned short *>(rle)[1 + static_cast<int>(wallFace)];

					// lower edge of the previous voxel, reused as the upper edge of
					// the next one when the two are adjacent
					std::uint_fast16_t savedP = 0;
					std::int_fast8_t savedZ = 127;

					while (*ptr != -1) {
						std::int_fast8_t z = *(ptr++);

						std::uint_fast16_t p1 = savedZ == z ? savedP : transform(invDist, z);
						std::uint_fast16_t p2 = transform(invDist, z + 1);

						savedZ = z + 1;
						savedP = p2;

						LinePixel pix =
						  BuildLinePixel(irx, iry, z, wallFace, medDist + heightScaleVal[z]);

						for (std::uint_fast16_t j = p1; j < p2; j++) {
							auto &p = pixels[j];
							if (!p.IsEmpty())
								continue;
							p.Set(pix);
						}
					}

				} // add wall - end

				// check pitch cull
				// (done on the first iteration and then on every fourth one)
				if ((--count) == 0) {
					if ((transform(invDist, 0) >= lineResolution - 1 && icz >= 0) ||
					    transform(invDist, 63) <= 0)
						break;
					count = 4;
				}

				// let's go to next voxel!
			}
		}
672 
673 		struct AtanTable {
674 			std::array<uint16_t, 5000> sm;
675 			std::array<uint16_t, 5000> lg;
676 			std::array<uint16_t, 5000> smN;
677 			std::array<uint16_t, 5000> lgN;
678 
679 			// [0, 2pi] -> [0, 65536]
ToFixedspades::draw::AtanTable680 			static uint16_t ToFixed(float v) {
681 				v /= (M_PI * 2.f);
682 				v *= 65536.f;
683 				int i = static_cast<int>(v);
684 				return static_cast<uint16_t>(i & 65535);
685 			}
686 
AtanTablespades::draw::AtanTable687 			AtanTable() {
688 				for (int i = 0; i < 5000; i++) {
689 					sm[i] = ToFixed(atanf(i / 4096.f));
690 					lg[i] = ToFixed(atanf(1.f / ((i + .5f) / 4096.f)));
691 					smN[i] = ToFixed(-atanf(i / 4096.f));
692 					lgN[i] = ToFixed(-atanf(1.f / ((i + .5f) / 4096.f)));
693 				}
694 			}
695 		};
696 		static AtanTable atanTable;
fastATan(float v)697 		static inline uint16_t fastATan(float v) {
698 			if (v < 0.f) {
699 				if (v > -1.f) {
700 					v *= -4096.f;
701 					int idx = static_cast<int>(v);
702 					// v -= idx;
703 					auto ret = atanTable.smN[idx];
704 					return ret;
705 				} else {
706 					v = fastDiv(-4096.f, v);
707 					int idx = static_cast<int>(v);
708 					// v -= idx;
709 					auto ret = atanTable.lgN[idx];
710 					return ret;
711 				}
712 			} else {
713 				if (v < 1.f) {
714 					v *= 4096.f;
715 					int idx = static_cast<int>(v);
716 					// v -= idx;
717 					auto ret = atanTable.sm[idx];
718 					return ret;
719 					// ret += (atanTable.sm[idx + 1] - ret) * v;
720 					// return ret;
721 				} else {
722 					v = fastDiv(4096.f, v);
723 					int idx = static_cast<int>(v);
724 					// v -= idx;
725 					auto ret = atanTable.lg[idx];
726 					return ret;
727 					// ret += (atanTable.lg[idx + 1] - ret) * v;
728 					// return ret;
729 				}
730 			}
731 		}
732 
fastATan2(float y,float x)733 		static inline uint16_t fastATan2(float y, float x) {
734 			if (x == 0.f) {
735 				return y > 0.f ? 16384 : -16384;
736 				// y > 0.f ? (pi * 0.5f) : (-pi * 0.5f);
737 			} else if (x > 0.f) {
738 				return fastATan(fastDiv(y, x));
739 			} else {
740 				return fastATan(fastDiv(y, x)) + 32768;
741 			}
742 		}
743 
744 		template <SWFeatureLevel flevel, int under>
RenderFinal(float yawMin,float yawMax,unsigned int numLines,unsigned int threadId,unsigned int numThreads)745 		void SWMapRenderer::RenderFinal(float yawMin, float yawMax, unsigned int numLines,
746 		                                unsigned int threadId, unsigned int numThreads) {
747 			float fovX = tanf(sceneDef.fovX * 0.5f);
748 			float fovY = tanf(sceneDef.fovY * 0.5f);
749 			Vector3 front = sceneDef.viewAxis[2];
750 			Vector3 right = sceneDef.viewAxis[0];
751 			Vector3 down = sceneDef.viewAxis[1];
752 
753 			unsigned int fw = frameBuf->GetWidth();
754 			unsigned int fh = frameBuf->GetHeight();
755 			uint32_t *fb = frameBuf->GetPixels();
756 			float *depthBuf = this->depthBuf;
757 			Vector3 v1 = front - right * fovX + down * fovY;
758 			Vector3 deltaDown = -down * (fovY * 2.f / static_cast<float>(fh));
759 			Vector3 deltaRight = right * (fovX * 2.f / static_cast<float>(fw) * under);
760 
761 			Vector2 screenPos = {-fovX, -fovY};
762 			float deltaScreenPosRight = fovX * 2.f / static_cast<float>(fw);
763 			float deltaScreenPosDown = fovY * 2.f / static_cast<float>(fh);
764 
765 			static const float pi = M_PI;
766 			float yawScale = 65536.f / (pi * 2.f);
767 			std::int32_t yawScale2 =
768 			  static_cast<std::int32_t>(pi * 2.f / (yawMax - yawMin) * 65536.f);
769 			std::int32_t yawMin2 = static_cast<std::int32_t>(yawMin * yawScale);
770 			auto &lineList = this->lines;
771 
772 			enum { blockSize = 8, hBlock = blockSize / under };
773 
774 			Vector3 deltaDownLarge = deltaDown * blockSize;
775 			Vector3 deltaRightLarge = deltaRight * hBlock;
776 
777 			unsigned int startX = threadId * fw / numThreads;
778 			unsigned int endX = (threadId + 1) * fw / numThreads;
779 
780 			startX = (startX / blockSize) * blockSize;
781 			endX = (endX / blockSize) * blockSize;
782 
783 			float deltaScreenPosRightSmall = deltaScreenPosRight * under;
784 			float deltaScreenPosDownSmall = deltaScreenPosDown;
785 
786 			deltaScreenPosRight *= static_cast<float>(blockSize);
787 			deltaScreenPosDown *= static_cast<float>(blockSize);
788 
789 			v1 += deltaRight * static_cast<float>(startX / under);
790 			screenPos.x += deltaScreenPosRight * static_cast<float>(startX / blockSize);
791 
792 			for (unsigned int fx = startX; fx < endX; fx += blockSize) {
793 				Vector3 v2 = v1;
794 				screenPos.y = -fovY;
795 				for (unsigned int fy = 0; fy < fh; fy += blockSize) {
796 
797 					uint32_t *fb2 = fb + fx + fy * fw;
798 					float *db2 = depthBuf + fx + fy * fw;
799 
800 					if (v2.z > 0.99f || v2.z < -0.99f) {
801 						// near to pole. cannot be approximated by piecewise
802 						goto SlowBlockPath;
803 					}
804 
805 				FastBlockPath : {
806 
807 					// Use bi-linear interpolation for faster yaw/pitch
808 					// computation.
809 
810 					auto calcYawindex = [yawScale2, numLines, yawMin2](Vector3 v) {
811 						std::int32_t yawIndex;
812 						{
813 							float x = v.x, y = v.y;
814 							int yaw;
815 							yaw = fastATan2(y, x);
816 							yaw -= yawMin2;
817 							yawIndex = static_cast<int>(yaw & 0xffff);
818 						}
819 						yawIndex <<= 8;
820 						return yawIndex;
821 					};
822 					auto calcPitch = [](Vector3 vv) {
823 						float pitch;
824 						pitch = vv.z * fastRSqrt(vv.x * vv.x + vv.y * vv.y);
825 						pitch = ToSpecialTan(pitch);
826 						return static_cast<int>(pitch * (65536.f * 8192.f));
827 					};
828 					std::int32_t yawIndex1 = calcYawindex(v2);
829 					std::int32_t pitch1 = calcPitch(v2);
830 					std::int32_t yawIndex2 = calcYawindex(v2 + deltaRightLarge);
831 					std::int32_t pitch2 = calcPitch(v2 + deltaRightLarge);
832 					std::int32_t yawIndex3 = calcYawindex(v2 + deltaDownLarge);
833 					std::int32_t pitch3 = calcPitch(v2 + deltaDownLarge);
834 					std::int32_t yawIndex4 = calcYawindex(v2 + deltaRightLarge + deltaDownLarge);
835 					std::int32_t pitch4 = calcPitch(v2 + deltaRightLarge + deltaDownLarge);
836 
837 					// note: `<<8>>8` is phase unwrapping
838 					std::int32_t yawDiff1 = ((yawIndex2 - yawIndex1) << 8 >> 8) / hBlock;
839 					std::int32_t yawDiff2 = ((yawIndex4 - yawIndex3) << 8 >> 8) / hBlock;
840 					std::int32_t pitchDiff1 = (pitch2 - pitch1) / hBlock;
841 					std::int32_t pitchDiff2 = (pitch4 - pitch3) / hBlock;
842 
843 					std::int32_t yawIndexA = yawIndex1;
844 					std::int32_t yawIndexB = yawIndex3;
845 					std::int32_t pitchA = pitch1;
846 					std::int32_t pitchB = pitch3;
847 
848 					for (unsigned int x = 0; x < blockSize; x += under) {
849 						uint32_t *fb3 = fb2 + x;
850 						auto *db3 = db2 + x;
851 
852 						std::int32_t yawIndexC = yawIndexA;
853 						std::int32_t yawDelta = ((yawIndexB - yawIndexA) << 8 >> 8) / blockSize;
854 						std::int32_t pitchC = pitchA;
855 						std::int32_t pitchDelta = (pitchB - pitchA) / blockSize;
856 
857 						for (unsigned int y = 0; y < blockSize; y++) {
858 
859 							std::uint32_t yawIndex =
860 							  static_cast<unsigned int>(yawIndexC << 8 >> 16);
861 							yawIndex = (yawIndex * yawScale2) >> 16;
862 							yawIndex = (yawIndex * numLines) >> 16;
863 							auto &line = lineList[yawIndex];
864 							auto *pixels = line.pixels.data();
865 
866 							// solve pitch
867 							std::int32_t pitchIndex;
868 
869 							{
870 								pitchIndex = pitchC >> 13;
871 								pitchIndex -= line.pitchTanMinI;
872 								pitchIndex =
873 								  static_cast<int>((static_cast<int64_t>(pitchIndex) *
874 								                    static_cast<int64_t>(line.pitchScaleI)) >>
875 								                   32);
876 								// pitch = (pitch - line.pitchTanMin) * line.pitchScale;
877 								// pitchIndex = static_cast<int>(pitch);
878 								pitchIndex &= lineResolution - 1;
879 								// pitchIndex = std::max(pitchIndex, 0);
880 								// pitchIndex = std::min(pitchIndex, lineResolution - 1);
881 							}
882 
883 							auto &pix = pixels[pitchIndex];
884 
885 // write color.
886 // NOTE: combined contains both color and other information,
887 // though this isn't a problem as long as the color comes
888 // in the LSB's
889 #if ENABLE_SSE
890 							if (flevel == SWFeatureLevel::SSE2) {
891 								__m128i m;
892 
893 								if (under == 1) {
894 									*fb3 = pix.combined;
895 									*db3 = pix.depth;
896 								} else if (under == 2) {
897 									m = _mm_castpd_si128(
898 									  _mm_load_sd(reinterpret_cast<const double *>(&pix)));
899 									_mm_store_sd(reinterpret_cast<double *>(fb3),
900 									             _mm_castsi128_pd(_mm_shuffle_epi32(m, 0x00)));
901 									_mm_store_sd(reinterpret_cast<double *>(db3),
902 									             _mm_castsi128_pd(_mm_shuffle_epi32(m, 0x55)));
903 								} else if (under == 4) {
904 									m = _mm_castpd_si128(
905 									  _mm_load_sd(reinterpret_cast<const double *>(&pix)));
906 									_mm_stream_si128(reinterpret_cast<__m128i *>(fb3),
907 									                 _mm_shuffle_epi32(m, 0x00));
908 									_mm_stream_si128(reinterpret_cast<__m128i *>(db3),
909 									                 _mm_shuffle_epi32(m, 0x55));
910 								}
911 
912 							} else
913 #endif
914 							// non-optimized
915 							{
916 								uint32_t col = pix.combined;
917 								float d = pix.depth;
918 
919 								for (int k = 0; k < under; k++) {
920 									fb3[k] = col;
921 									db3[k] = d;
922 								}
923 							}
924 
925 							fb3 += fw;
926 							db3 += fw;
927 
928 							yawIndexC += yawDelta;
929 							pitchC += pitchDelta;
930 						}
931 
932 						yawIndexA += yawDiff1;
933 						yawIndexB += yawDiff2;
934 						pitchA += pitchDiff1;
935 						pitchB += pitchDiff2;
936 					}
937 				}
938 					goto Converge;
939 
940 				SlowBlockPath : {
941 					Vector3 v3 = v2;
942 					Vector2 screenPos2 = screenPos;
943 					for (unsigned int x = 0; x < blockSize; x += under) {
944 						Vector3 v4 = v3;
945 						uint32_t *fb3 = fb2 + x;
946 						auto *db3 = db2 + x;
947 						screenPos2.y = screenPos.y;
948 
949 						for (unsigned int y = 0; y < blockSize; y++) {
950 							Vector3 vv = v4;
951 
952 							// solve yaw
953 							std::uint32_t yawIndex;
954 							{
955 								float x = vv.x, y = vv.y;
956 								int yaw;
957 								yaw = fastATan2(y, x);
958 								yaw -= yawMin2;
959 								yawIndex = static_cast<unsigned int>(yaw & 0xffff);
960 							}
961 							yawIndex = (yawIndex * yawScale2) >> 16;
962 							yawIndex = (yawIndex * numLines) >> 16;
963 
964 							auto &line = lineList[yawIndex];
965 							auto *pixels = line.pixels.data();
966 
967 							// solve pitch
968 							std::int32_t pitchIndex;
969 
970 							{
971 								float pitch;
972 								pitch = vv.z * fastRSqrt(vv.x * vv.x + vv.y * vv.y);
973 								pitch = ToSpecialTan(pitch);
974 								pitch = (pitch - line.pitchTanMin) * line.pitchScale;
975 								pitchIndex = static_cast<int>(pitch);
976 								pitchIndex &= lineResolution - 1;
977 								// pitchIndex = std::max(pitchIndex, 0);
978 								// pitchIndex = std::min(pitchIndex, lineResolution - 1);
979 							}
980 
981 							auto &pix = pixels[pitchIndex];
982 
983 // write color.
984 // NOTE: combined contains both color and other information,
985 // though this isn't a problem as long as the color comes
986 // in the LSB's
987 #if ENABLE_SSE
988 							if (flevel == SWFeatureLevel::SSE2) {
989 								__m128i m;
990 
991 								if (under == 1) {
992 									*fb3 = pix.combined;
993 									*db3 = pix.depth;
994 								} else if (under == 2) {
995 									m = _mm_castpd_si128(
996 									  _mm_load_sd(reinterpret_cast<const double *>(&pix)));
997 									_mm_store_sd(reinterpret_cast<double *>(fb3),
998 									             _mm_castsi128_pd(_mm_shuffle_epi32(m, 0x00)));
999 									_mm_store_sd(reinterpret_cast<double *>(db3),
1000 									             _mm_castsi128_pd(_mm_shuffle_epi32(m, 0x55)));
1001 								} else if (under == 4) {
1002 									m = _mm_castpd_si128(
1003 									  _mm_load_sd(reinterpret_cast<const double *>(&pix)));
1004 									_mm_stream_si128(reinterpret_cast<__m128i *>(fb3),
1005 									                 _mm_shuffle_epi32(m, 0x00));
1006 									_mm_stream_si128(reinterpret_cast<__m128i *>(db3),
1007 									                 _mm_shuffle_epi32(m, 0x55));
1008 								}
1009 
1010 							} else
1011 #endif
1012 							// non-optimized
1013 							{
1014 								uint32_t col = pix.combined;
1015 								float d = pix.depth;
1016 
1017 								for (int k = 0; k < under; k++) {
1018 									fb3[k] = col;
1019 									db3[k] = d;
1020 								}
1021 							}
1022 
1023 							fb3 += fw;
1024 							db3 += fw;
1025 
1026 							v4 += deltaDown;
1027 							screenPos2.y += deltaScreenPosDownSmall;
1028 						} // y
1029 						v3 += deltaRight;
1030 						screenPos2.x += deltaScreenPosRightSmall;
1031 					} // x
1032 
1033 				} // end SlowBlockPath
1034 
1035 				Converge:
1036 
1037 					v2 += deltaDownLarge;
1038 					screenPos.y += deltaScreenPosDown;
1039 				} // fy
1040 				v1 += deltaRightLarge;
1041 				screenPos.x += deltaScreenPosRight;
1042 			} // fx
1043 		}
1044 
1045 		template <SWFeatureLevel flevel>
RenderInner(const client::SceneDefinition & def,Bitmap * frame,float * depthBuffer)1046 		void SWMapRenderer::RenderInner(const client::SceneDefinition &def, Bitmap *frame,
1047 		                                float *depthBuffer) {
1048 
1049 			sceneDef = def;
1050 			frameBuf = frame;
1051 			depthBuf = depthBuffer;
1052 
1053 			// calculate line density.
1054 			float yawMin, yawMax;
1055 			float pitchMin, pitchMax;
1056 			size_t numLines;
1057 			{
1058 				float fovX = tanf(def.fovX * 0.5f);
1059 				float fovY = tanf(def.fovY * 0.5f);
1060 				float fovDiag = sqrtf(fovX * fovX + fovY * fovY);
1061 				float fovDiagAng = atanf(fovDiag);
1062 				float pitch = asinf(def.viewAxis[2].z);
1063 				static const float pi = M_PI;
1064 
1065 				// pitch = 0.f;
1066 
1067 				if (fabsf(pitch) >= pi * 0.49f - fovDiagAng) {
1068 					// pole is visible
1069 					yawMin = 0.f;
1070 					yawMax = pi * 2.f;
1071 				} else {
1072 					float yaw = atan2l(def.viewAxis[2].y, def.viewAxis[2].x);
1073 					// TODO: incorrect!
1074 					yawMin = yaw - pi * .5f; // fovDiagAng;
1075 					yawMax = yaw + pi * .5f; // fovDiagAng;
1076 				}
1077 
1078 				pitchMin = pitch - fovDiagAng;
1079 				pitchMax = pitch + fovDiagAng;
1080 				if (pitchMin < -pi * 0.5f) {
1081 					pitchMax = std::max(pitchMax, -pi - pitchMin);
1082 					pitchMin = -pi * 0.5f;
1083 				}
1084 				if (pitchMax > pi * 0.5f) {
1085 					pitchMin = std::min(pitchMin, pi - pitchMax);
1086 					pitchMax = pi * 0.5f;
1087 				}
1088 
1089 				// pitch of PI/2 will make tan(x) infinite
1090 				pitchMin = std::max(pitchMin, -pi * 0.4999f);
1091 				pitchMax = std::min(pitchMax, pi * 0.4999f);
1092 
1093 				float interval = static_cast<float>(frame->GetHeight());
1094 				interval = fovY * 2.f / interval;
1095 				lineResolution = static_cast<int>((pitchMax - pitchMin) / interval * 1.5f);
1096 				lineResolution = frame->GetHeight();
1097 
1098 				for (int i = lineResolution, j = 1; j <= i; j <<= 1) {
1099 					lineResolution = j;
1100 				}
1101 
1102 				if (pitchMin > 0.f) {
1103 					// interval /= cosf(pitchMin);
1104 				} else if (pitchMax < 0.f) {
1105 					// interval /= cosf(pitchMax);
1106 				}
1107 
1108 				numLines = static_cast<size_t>((yawMax - yawMin) / interval);
1109 
1110 				int under = r_swUndersampling;
1111 				under = std::max(std::min(under, 4), 1);
1112 				numLines /= under;
1113 
1114 				if (numLines < 8)
1115 					numLines = 8;
1116 				if (numLines > 65536) {
1117 					numLines =
1118 					  65536; // SPRaise("Too many lines emit: %d", static_cast<int>(numLines));
1119 				}
1120 				lines.resize(std::max(numLines, lines.size()));
1121 				/*
1122 				SPLog("numlines: %d, each %f deg, and %d res",
1123 				      static_cast<int>(numLines),
1124 				      interval * 180.f / pi,
1125 				      static_cast<int>(lineResolution));*/
1126 			}
1127 
1128 			// calculate vector for each lines
1129 			{
1130 				float scl = (yawMax - yawMin) / numLines;
1131 				Vector3 horiz = Vector3::Make(cosf(yawMin), sinf(yawMin), 0.f);
1132 				float c = cosf(scl);
1133 				float s = sinf(scl);
1134 				for (size_t i = 0; i < numLines; i++) {
1135 					Line &l = lines[i];
1136 					l.horizonDir = horiz;
1137 
1138 					float x = horiz.x * c - horiz.y * s;
1139 					float y = horiz.x * s + horiz.y * c;
1140 					horiz.x = x;
1141 					horiz.y = y;
1142 				}
1143 			}
1144 
1145 			{
1146 				unsigned int nlines = static_cast<unsigned int>(numLines);
1147 				InvokeParallel2([&](unsigned int th, unsigned int numThreads) {
1148 					unsigned int start = th * nlines / numThreads;
1149 					unsigned int end = (th + 1) * nlines / numThreads;
1150 
1151 					for (size_t i = start; i < end; i++) {
1152 						BuildLine<flevel>(lines[i], pitchMin, pitchMax);
1153 					}
1154 				});
1155 			}
1156 
1157 			int under = r_swUndersampling;
1158 
1159 			InvokeParallel2([&](unsigned int th, unsigned int numThreads) {
1160 
1161 				if (under <= 1) {
1162 					RenderFinal<flevel, 1>(yawMin, yawMax, static_cast<unsigned int>(numLines), th,
1163 					                       numThreads);
1164 				} else if (under <= 2) {
1165 					RenderFinal<flevel, 2>(yawMin, yawMax, static_cast<unsigned int>(numLines), th,
1166 					                       numThreads);
1167 				} else {
1168 					RenderFinal<flevel, 4>(yawMin, yawMax, static_cast<unsigned int>(numLines), th,
1169 					                       numThreads);
1170 				}
1171 			});
1172 
1173 			frameBuf = nullptr;
1174 			depthBuf = nullptr;
1175 		}
1176 
Render(const client::SceneDefinition & def,Bitmap * frame,float * depthBuffer)1177 		void SWMapRenderer::Render(const client::SceneDefinition &def, Bitmap *frame,
1178 		                           float *depthBuffer) {
1179 			if (!frame)
1180 				SPInvalidArgument("frame");
1181 			if (!depthBuffer)
1182 				SPInvalidArgument("depthBuffer");
1183 
1184 			auto p = def.viewOrigin.Floor();
1185 			if (map->IsSolidWrapped(p.x, p.y, p.z)) {
1186 				return;
1187 			}
1188 
1189 #if ENABLE_SSE2
1190 			if (static_cast<int>(level) >= static_cast<int>(SWFeatureLevel::SSE2)) {
1191 				RenderInner<SWFeatureLevel::SSE2>(def, frame, depthBuffer);
1192 				return;
1193 			}
1194 #endif
1195 
1196 			RenderInner<SWFeatureLevel::None>(def, frame, depthBuffer);
1197 		}
1198 	}
1199 }
1200