1 /*
2 	Copyright (C) 2006-2007 shash
3 	Copyright (C) 2008-2015 DeSmuME team
4 
5 	This file is free software: you can redistribute it and/or modify
6 	it under the terms of the GNU General Public License as published by
7 	the Free Software Foundation, either version 2 of the License, or
8 	(at your option) any later version.
9 
10 	This file is distributed in the hope that it will be useful,
11 	but WITHOUT ANY WARRANTY; without even the implied warranty of
12 	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 	GNU General Public License for more details.
14 
15 	You should have received a copy of the GNU General Public License
	along with this software.  If not, see <http://www.gnu.org/licenses/>.
17 */
18 
19 #include "render3D.h"
20 
21 #include <string.h>
22 
23 #ifdef ENABLE_SSE2
24 #include <emmintrin.h>
25 #endif
26 
27 #ifdef ENABLE_SSSE3
28 #include <tmmintrin.h>
29 #endif
30 
31 #include "utils/bits.h"
32 #include "common.h"
33 #include "gfx3d.h"
34 #include "MMU.h"
35 #include "texcache.h"
36 
37 
// Lookup table with one entry per 15-bit depth value. It is filled once by the
// Render3D constructor with DS_DEPTH15TO24(i) and read by the ClearFramebuffer()
// implementations when converting clear-image depth words.
static CACHE_ALIGN u32 dsDepthToD24_LUT[32768] = {0};
// Index into core3DList of the active 3D core (see NDS_3D_ChangeCore()).
int cur3DCore = GPU3D_NULL;

// Interface record for the "None" core: it hands out and releases the shared
// BaseRenderer via Render3DBaseCreate() / Render3DBaseDestroy().
GPU3DInterface gpu3DNull = {
	"None",
	Render3DBaseCreate,
	Render3DBaseDestroy
};

// Interface of the active 3D core; starts out as the null core.
GPU3DInterface *gpu3D = &gpu3DNull;
// Null renderer instance, created by Render3D_Init() and deleted by Render3D_DeInit().
Render3D *BaseRenderer = NULL;
// Renderer that is currently in use; equals BaseRenderer while the null core is selected.
Render3D *CurrentRenderer = NULL;
50 
Render3D_Init()51 void Render3D_Init()
52 {
53 	if (BaseRenderer == NULL)
54 		BaseRenderer = new Render3D;
55 
56 	if (CurrentRenderer == NULL)
57 	{
58 		gpu3D = &gpu3DNull;
59 		cur3DCore = GPU3D_NULL;
60 		CurrentRenderer = BaseRenderer;
61 	}
62 }
63 
Render3D_DeInit()64 void Render3D_DeInit()
65 {
66 	gpu3D->NDS_3D_Close();
67 	delete BaseRenderer;
68 	BaseRenderer = NULL;
69 }
70 
NDS_3D_ChangeCore(int newCore)71 bool NDS_3D_ChangeCore(int newCore)
72 {
73 	bool result = false;
74 
75 	Render3DInterface *newRenderInterface = core3DList[newCore];
76 	if (newRenderInterface->NDS_3D_Init == NULL)
77 		return result;
78 
79 	// Some resources are shared between renderers, such as the texture cache,
80 	// so we need to shut down the current renderer now to ensure that any
81 	// shared resources aren't in use.
82 	CurrentRenderer->RenderFinish();
83 	gpu3D->NDS_3D_Close();
84 	gpu3D           = &gpu3DNull;
85 	cur3DCore       = GPU3D_NULL;
86 	CurrentRenderer = BaseRenderer;
87 
88 	Render3D *newRenderer = newRenderInterface->NDS_3D_Init();
89 	if (newRenderer == NULL)
90 		return result;
91 
92    Render3DError error = newRenderer->SetFramebufferSize(GPU->GetCustomFramebufferWidth(), GPU->GetCustomFramebufferHeight());
93 	if (error != RENDER3DERROR_NOERR)
94 		return result;
95 
96 	gpu3D = newRenderInterface;
97 	cur3DCore = newCore;
98 	CurrentRenderer = newRenderer;
99 
100 	result = true;
101 	return result;
102 }
103 
Render3DBaseCreate()104 Render3D* Render3DBaseCreate()
105 {
106 	BaseRenderer->Reset();
107 	return BaseRenderer;
108 }
109 
Render3DBaseDestroy()110 void Render3DBaseDestroy()
111 {
112 	if (CurrentRenderer != BaseRenderer)
113 	{
114 		delete CurrentRenderer;
115 		CurrentRenderer = BaseRenderer;
116 	}
117 }
118 
FragmentAttributesBuffer(size_t newCount)119 FragmentAttributesBuffer::FragmentAttributesBuffer(size_t newCount)
120 {
121 	count = newCount;
122 
123 	depth             = (u32 *)memalign_alloc_aligned(count * sizeof(u32));
124 	opaquePolyID      = (u8 *)memalign_alloc_aligned(count * sizeof(u8));
125 	translucentPolyID = (u8 *)memalign_alloc_aligned(count * sizeof(u8));
126 	stencil           = (u8 *)memalign_alloc_aligned(count * sizeof(u8));
127 	isFogged          = (u8 *)memalign_alloc_aligned(count * sizeof(u8));
128 	isTranslucentPoly = (u8 *)memalign_alloc_aligned(count * sizeof(u8));
129 }
130 
~FragmentAttributesBuffer()131 FragmentAttributesBuffer::~FragmentAttributesBuffer()
132 {
133 	memalign_free(depth);
134 	memalign_free(opaquePolyID);
135 	memalign_free(translucentPolyID);
136 	memalign_free(stencil);
137 	memalign_free(isFogged);
138 	memalign_free(isTranslucentPoly);
139 }
140 
SetAtIndex(const size_t index,const FragmentAttributes & attr)141 void FragmentAttributesBuffer::SetAtIndex(const size_t index, const FragmentAttributes &attr)
142 {
143 	this->depth[index]				   = attr.depth;
144 	this->opaquePolyID[index]		   = attr.opaquePolyID;
145 	this->translucentPolyID[index]	= attr.translucentPolyID;
146 	this->stencil[index]			      = attr.stencil;
147 	this->isFogged[index]			   = attr.isFogged;
148 	this->isTranslucentPoly[index]	= attr.isTranslucentPoly;
149 }
150 
SetAll(const FragmentAttributes & attr)151 void FragmentAttributesBuffer::SetAll(const FragmentAttributes &attr)
152 {
153 	size_t i = 0;
154 
155 #ifdef ENABLE_SSE2
156 	const __m128i attrDepth_vec128				   = _mm_set1_epi32(attr.depth);
157 	const __m128i attrOpaquePolyID_vec128		   = _mm_set1_epi8(attr.opaquePolyID);
158 	const __m128i attrTranslucentPolyID_vec128	= _mm_set1_epi8(attr.translucentPolyID);
159 	const __m128i attrStencil_vec128			      = _mm_set1_epi8(attr.stencil);
160 	const __m128i attrIsFogged_vec128			   = _mm_set1_epi8(attr.isFogged);
161 	const __m128i attrIsTranslucentPoly_vec128	= _mm_set1_epi8(attr.isTranslucentPoly);
162 
163 	const size_t sseCount = count - (count % 16);
164 	for (; i < sseCount; i += 16)
165 	{
166 		_mm_stream_si128((__m128i *)(this->depth +  0), attrDepth_vec128);
167 		_mm_stream_si128((__m128i *)(this->depth +  4), attrDepth_vec128);
168 		_mm_stream_si128((__m128i *)(this->depth +  8), attrDepth_vec128);
169 		_mm_stream_si128((__m128i *)(this->depth + 12), attrDepth_vec128);
170 
171 		_mm_stream_si128((__m128i *)this->opaquePolyID, attrOpaquePolyID_vec128);
172 		_mm_stream_si128((__m128i *)this->translucentPolyID, attrTranslucentPolyID_vec128);
173 		_mm_stream_si128((__m128i *)this->stencil, attrStencil_vec128);
174 		_mm_stream_si128((__m128i *)this->isFogged, attrIsFogged_vec128);
175 		_mm_stream_si128((__m128i *)this->isTranslucentPoly, attrIsTranslucentPoly_vec128);
176 	}
177 #endif
178 
179 	for (; i < count; i++)
180 		this->SetAtIndex(i, attr);
181 }
182 
Render3D()183 Render3D::Render3D()
184 {
185 	_renderID = RENDERID_NULL;
186 	_renderName = "None";
187 
188 	static bool needTableInit = true;
189 
190 	if (needTableInit)
191 	{
192       size_t i;
193 		for (i = 0; i < 32768; i++)
194 			dsDepthToD24_LUT[i] = (u32)DS_DEPTH15TO24(i);
195 
196 		needTableInit = false;
197 	}
198 
199 	_framebufferWidth = GPU_FRAMEBUFFER_NATIVE_WIDTH;
200 	_framebufferHeight = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
201 	_framebufferColorSizeBytes = 0;
202 	_framebufferColor = NULL;
203 
204 	Reset();
205 }
206 
~Render3D()207 Render3D::~Render3D()
208 {
209 	memalign_free(_framebufferColor);
210 }
211 
GetRenderID()212 RendererID Render3D::GetRenderID()
213 {
214 	return this->_renderID;
215 }
216 
GetName()217 std::string Render3D::GetName()
218 {
219 	return this->_renderName;
220 }
221 
GetFramebuffer()222 FragmentColor* Render3D::GetFramebuffer()
223 {
224 	return this->_framebufferColor;
225 }
226 
GetFramebufferWidth()227 size_t Render3D::GetFramebufferWidth()
228 {
229 	return this->_framebufferWidth;
230 }
231 
GetFramebufferHeight()232 size_t Render3D::GetFramebufferHeight()
233 {
234 	return this->_framebufferHeight;
235 }
236 
SetFramebufferSize(size_t w,size_t h)237 Render3DError Render3D::SetFramebufferSize(size_t w, size_t h)
238 {
239 	if (w < GPU_FRAMEBUFFER_NATIVE_WIDTH || h < GPU_FRAMEBUFFER_NATIVE_HEIGHT)
240 		return RENDER3DERROR_NOERR;
241 
242 	const size_t newFramebufferColorSizeBytes = w * h * sizeof(FragmentColor);
243 	FragmentColor *oldFramebufferColor        = this->_framebufferColor;
244 	FragmentColor *newFramebufferColor        = (FragmentColor *)memalign_alloc_aligned(newFramebufferColorSizeBytes);
245 
246 	this->_framebufferWidth                   = w;
247 	this->_framebufferHeight                  = h;
248 	this->_framebufferColorSizeBytes          = newFramebufferColorSizeBytes;
249 	this->_framebufferColor                   = newFramebufferColor;
250 
251 	memalign_free(oldFramebufferColor);
252 
253 	return RENDER3DERROR_NOERR;
254 }
255 
BeginRender(const GFX3D & engine)256 Render3DError Render3D::BeginRender(const GFX3D &engine)
257 {
258 	return RENDER3DERROR_NOERR;
259 }
260 
RenderGeometry(const GFX3D_State & renderState,const POLYLIST * polyList,const INDEXLIST * indexList)261 Render3DError Render3D::RenderGeometry(const GFX3D_State &renderState, const POLYLIST *polyList, const INDEXLIST *indexList)
262 {
263 	return RENDER3DERROR_NOERR;
264 }
265 
RenderEdgeMarking(const u16 * colorTable,const bool useAntialias)266 Render3DError Render3D::RenderEdgeMarking(const u16 *colorTable, const bool useAntialias)
267 {
268 	return RENDER3DERROR_NOERR;
269 }
270 
RenderFog(const u8 * densityTable,const u32 color,const u32 offset,const u8 shift,const bool alphaOnly)271 Render3DError Render3D::RenderFog(const u8 *densityTable, const u32 color, const u32 offset, const u8 shift, const bool alphaOnly)
272 {
273 	return RENDER3DERROR_NOERR;
274 }
275 
EndRender(const u64 frameCount)276 Render3DError Render3D::EndRender(const u64 frameCount)
277 {
278 	return RENDER3DERROR_NOERR;
279 }
280 
FlushFramebuffer(FragmentColor * __restrict dstRGBA6665,u16 * __restrict dstRGBA5551)281 Render3DError Render3D::FlushFramebuffer(FragmentColor *__restrict dstRGBA6665, u16 *__restrict dstRGBA5551)
282 {
283    size_t i;
284 	memcpy(dstRGBA6665, this->_framebufferColor, this->_framebufferColorSizeBytes);
285 
286 	/* Convert to RGBA5551 */
287 	for (i = 0; i < (this->_framebufferWidth * this->_framebufferHeight); i++)
288 	{
289 		dstRGBA5551[i] = R6G6B6TORGB15(
290             this->_framebufferColor[i].r,
291             this->_framebufferColor[i].g,
292             this->_framebufferColor[i].b) | ((this->_framebufferColor[i].a == 0) ? 0x0000 : 0x8000);
293 	}
294 
295 	return RENDER3DERROR_NOERR;
296 }
297 
UpdateToonTable(const u16 * toonTableBuffer)298 Render3DError Render3D::UpdateToonTable(const u16 *toonTableBuffer)
299 {
300 	return RENDER3DERROR_NOERR;
301 }
302 
ClearFramebuffer(const GFX3D_State & renderState)303 Render3DError Render3D::ClearFramebuffer(const GFX3D_State &renderState)
304 {
305 	Render3DError error = RENDER3DERROR_NOERR;
306 
307 	FragmentColor clearColor;
308 	clearColor.r =  renderState.clearColor & 0x1F;
309 	clearColor.g = (renderState.clearColor >> 5) & 0x1F;
310 	clearColor.b = (renderState.clearColor >> 10) & 0x1F;
311 	clearColor.a = (renderState.clearColor >> 16) & 0x1F;
312 
313 	FragmentAttributes clearFragment;
314 	clearFragment.opaquePolyID = (renderState.clearColor >> 24) & 0x3F;
315 	//special value for uninitialized translucent polyid. without this, fires in spiderman2 dont display
316 	//I am not sure whether it is right, though. previously this was cleared to 0, as a guess,
317 	//but in spiderman2 some fires with polyid 0 try to render on top of the background
318 	clearFragment.translucentPolyID = kUnsetTranslucentPolyID;
319 	clearFragment.depth = renderState.clearDepth;
320 	clearFragment.stencil = 0;
321 	clearFragment.isTranslucentPoly = 0;
322 	clearFragment.isFogged = BIT15(renderState.clearColor);
323 
324 	if (renderState.enableClearImage)
325 	{
326 		//the lion, the witch, and the wardrobe (thats book 1, suck it you new-school numberers)
327 		//uses the scroll registers in the main game engine
328 		const u16 *__restrict clearColorBuffer = (u16 *__restrict)MMU.texInfo.textureSlotAddr[2];
329 		const u16 *__restrict clearDepthBuffer = (u16 *__restrict)MMU.texInfo.textureSlotAddr[3];
330 		const u16 scrollBits = T1ReadWord(MMU.ARM9_REG, 0x356); //CLRIMAGE_OFFSET
331 		const u8 xScroll     = scrollBits & 0xFF;
332 		const u8 yScroll     = (scrollBits >> 8) & 0xFF;
333 
334 		if (xScroll == 0 && yScroll == 0)
335 		{
336          size_t i;
337 
338 			for (i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i++)
339 			{
340 				this->clearImageColor16Buffer[i] = clearColorBuffer[i];
341 				this->clearImageDepthBuffer[i]   = dsDepthToD24_LUT[clearDepthBuffer[i] & 0x7FFF];
342 				this->clearImageFogBuffer[i]     = BIT15(clearDepthBuffer[i]);
343 				this->clearImagePolyIDBuffer[i]  = clearFragment.opaquePolyID;
344 			}
345 		}
346 		else
347 		{
348          size_t dstIndex;
349          size_t iy = 0;
350 			for (dstIndex = 0; iy < GPU_FRAMEBUFFER_NATIVE_HEIGHT; iy++)
351 			{
352             size_t ix;
353 				const size_t y = ((iy + yScroll) & 0xFF) << 8;
354 
355 				for (ix = 0; ix < GPU_FRAMEBUFFER_NATIVE_WIDTH; dstIndex++, ix++)
356 				{
357 					const size_t x = (ix + xScroll) & 0xFF;
358 					const size_t srcIndex = y | x;
359 
360 					//this is tested by harry potter and the order of the phoenix.
361 					//TODO (optimization) dont do this if we are mapped to blank memory (such as in sonic chronicles)
362 					//(or use a special zero fill in the bulk clearing above)
363 					this->clearImageColor16Buffer[dstIndex] = clearColorBuffer[srcIndex];
364 
365 					//this is tested quite well in the sonic chronicles main map mode
366 					//where depth values are used for trees etc you can walk behind
367 					this->clearImageDepthBuffer[dstIndex]   = dsDepthToD24_LUT[clearDepthBuffer[srcIndex] & 0x7FFF];
368 
369 					this->clearImageFogBuffer[dstIndex]     = BIT15(clearDepthBuffer[srcIndex]);
370 					this->clearImagePolyIDBuffer[dstIndex]  = clearFragment.opaquePolyID;
371 				}
372 			}
373 		}
374 
375 		error = this->ClearUsingImage(
376             this->clearImageColor16Buffer,
377             this->clearImageDepthBuffer,
378             this->clearImageFogBuffer,
379             this->clearImagePolyIDBuffer);
380 		if (error != RENDER3DERROR_NOERR)
381 			error = this->ClearUsingValues(clearColor, clearFragment);
382 	}
383 	else
384 		error = this->ClearUsingValues(clearColor, clearFragment);
385 
386 	return error;
387 }
388 
ClearUsingImage(const u16 * __restrict colorBuffer,const u32 * __restrict depthBuffer,const u8 * __restrict fogBuffer,const u8 * __restrict polyIDBuffer)389 Render3DError Render3D::ClearUsingImage(
390       const u16 *__restrict colorBuffer,
391       const u32 *__restrict depthBuffer,
392       const u8 *__restrict fogBuffer,
393       const u8 *__restrict polyIDBuffer)
394 {
395 	return RENDER3DERROR_NOERR;
396 }
397 
ClearUsingValues(const FragmentColor & clearColor,const FragmentAttributes & clearAttributes) const398 Render3DError Render3D::ClearUsingValues(
399       const FragmentColor &clearColor,
400       const FragmentAttributes &clearAttributes) const
401 {
402 	return RENDER3DERROR_NOERR;
403 }
404 
SetupPolygon(const POLY & thePoly)405 Render3DError Render3D::SetupPolygon(const POLY &thePoly)
406 {
407 	return RENDER3DERROR_NOERR;
408 }
409 
SetupTexture(const POLY & thePoly,bool enableTexturing)410 Render3DError Render3D::SetupTexture(const POLY &thePoly, bool enableTexturing)
411 {
412 	return RENDER3DERROR_NOERR;
413 }
414 
SetupViewport(const u32 viewportValue)415 Render3DError Render3D::SetupViewport(const u32 viewportValue)
416 {
417 	return RENDER3DERROR_NOERR;
418 }
419 
Reset()420 Render3DError Render3D::Reset()
421 {
422 	if (this->_framebufferColor != NULL)
423 	{
424 		memset(this->_framebufferColor, 0, this->_framebufferColorSizeBytes);
425       this->FlushFramebuffer(GPU->GetEngineMain()->Get3DFramebufferRGBA6665(), GPU->GetEngineMain()->Get3DFramebufferRGBA5551());
426 	}
427 
428 	memset(this->clearImageColor16Buffer, 0, sizeof(this->clearImageColor16Buffer));
429 	memset(this->clearImageDepthBuffer,   0, sizeof(this->clearImageDepthBuffer));
430 	memset(this->clearImagePolyIDBuffer,  0, sizeof(this->clearImagePolyIDBuffer));
431 	memset(this->clearImageFogBuffer,     0, sizeof(this->clearImageFogBuffer));
432 
433 	TexCache_Reset();
434 
435 	return RENDER3DERROR_NOERR;
436 }
437 
Render(const GFX3D & engine)438 Render3DError Render3D::Render(const GFX3D &engine)
439 {
440 	Render3DError error = RENDER3DERROR_NOERR;
441 
442 	error = this->BeginRender(engine);
443 	if (error != RENDER3DERROR_NOERR)
444 	{
445 		return error;
446 	}
447 
448 	this->UpdateToonTable(engine.renderState.u16ToonTable);
449 	this->ClearFramebuffer(engine.renderState);
450 
451 	this->RenderGeometry(engine.renderState, engine.polylist, &engine.indexlist);
452 
453 	if (engine.renderState.enableEdgeMarking)
454 		this->RenderEdgeMarking(engine.renderState.edgeMarkColorTable,
455             engine.renderState.enableAntialiasing);
456 
457 	if (engine.renderState.enableFog)
458 	{
459 		this->RenderFog(
460             engine.renderState.fogDensityTable,
461             engine.renderState.fogColor,
462             engine.renderState.fogOffset,
463             engine.renderState.fogShift,
464             engine.renderState.enableFogAlphaOnly);
465 	}
466 
467 	this->EndRender(engine.frameCtr);
468 
469 	return error;
470 }
471 
// Called by NDS_3D_ChangeCore() on the current renderer before it is closed.
// This implementation does nothing and always returns RENDER3DERROR_NOERR.
Render3DError Render3D::RenderFinish()
{
	return RENDER3DERROR_NOERR;
}
476 
// Invalidates the texture cache via TexCache_Invalidate().
// NOTE(review): judging by the name, this is meant to run when VRAM is
// reconfigured; the caller is not visible in this file.
// Always returns RENDER3DERROR_NOERR.
Render3DError Render3D::VramReconfigureSignal()
{
	TexCache_Invalidate();
	return RENDER3DERROR_NOERR;
}
482 
483 #ifdef ENABLE_SSE2
484 
FlushFramebuffer(FragmentColor * __restrict dstRGBA6665,u16 * __restrict dstRGBA5551)485 Render3DError Render3D_SSE2::FlushFramebuffer(FragmentColor *__restrict dstRGBA6665, u16 *__restrict dstRGBA5551)
486 {
487 	size_t i;
488 	const __m128i zero_vec128 = _mm_setzero_si128();
489 	const size_t pixCount = this->_framebufferWidth * this->_framebufferHeight;
490 	const size_t ssePixCount = pixCount - (pixCount % 4);
491 
492 	for (i = 0; i < ssePixCount; i += 4)
493 	{
494 		// Copy the framebufferColor buffer
495 		__m128i color = _mm_load_si128((__m128i *)(this->_framebufferColor + i));
496 		_mm_store_si128((__m128i *)(dstRGBA6665 + i), color);
497 
498 		// Convert to RGBA5551
499 		__m128i r = _mm_and_si128(color, _mm_set1_epi32(0x0000003E));	// Read from R
500 		r = _mm_srli_epi32(r, 1);										// Shift to R
501 
502 		__m128i g = _mm_and_si128(color, _mm_set1_epi32(0x00003E00));	// Read from G
503 		g = _mm_srli_epi32(g, 4);										// Shift in G
504 
505 		__m128i b = _mm_and_si128(color, _mm_set1_epi32(0x003E0000));	// Read from B
506 		b = _mm_srli_epi32(b, 7);										// Shift to B
507 
508 		__m128i a = _mm_and_si128(color, _mm_set1_epi32(0xFF000000));	// Read from A
509 		a = _mm_cmpeq_epi32(a, zero_vec128);							// Determine A
510 
511 		// From here on, we're going to do an SSE2 trick to pack 32-bit down to unsigned
512 		// 16-bit. Since SSE2 only has packssdw (signed 16-bit pack), then the alpha bit
513 		// may be undefined. Now if we were using SSE4.1's packusdw (unsigned 16-bit pack),
514 		// we  wouldn't have to go through this hassle. But not everyone has an SSE4.1-capable
515 		// CPU, so doing this the SSE2 way is more guaranteed to work an everyone's CPU.
516 		//
517 		// To use packssdw, we take a bit one position lower for the alpha bit, run
518 		// packssdw, then shift the bit back to its original position. Then we por the
519 		// alpha vector with the post-packed color vector to get the final color.
520 
521 		a = _mm_andnot_si128(a, _mm_set1_epi32(0x00004000));			// Mask out the bit before A
522 		a = _mm_packs_epi32(a, zero_vec128);							// Pack 32-bit down to 16-bit
523 		a = _mm_slli_epi16(a, 1);										// Shift the A bit back to where it needs to be
524 
525 		// Assemble the RGB colors, pack the 32-bit color into a signed 16-bit color, then por the alpha bit back in.
526 		color = _mm_or_si128(_mm_or_si128(r, g), b);
527 		color = _mm_packs_epi32(color, zero_vec128);
528 		color = _mm_or_si128(color, a);
529 
530 		_mm_storel_epi64((__m128i *)(dstRGBA5551 + i), color);
531 	}
532 
533 	for (; i < pixCount; i++)
534 	{
535 		dstRGBA6665[i] = this->_framebufferColor[i];
536 		dstRGBA5551[i] = R6G6B6TORGB15(
537             this->_framebufferColor[i].r,
538             this->_framebufferColor[i].g,
539             this->_framebufferColor[i].b) | ((this->_framebufferColor[i].a == 0) ? 0x0000 : 0x8000);
540 	}
541 
542 	return RENDER3DERROR_NOERR;
543 }
544 
ClearFramebuffer(const GFX3D_State & renderState)545 Render3DError Render3D_SSE2::ClearFramebuffer(const GFX3D_State &renderState)
546 {
547 	Render3DError error = RENDER3DERROR_NOERR;
548 
549 	FragmentColor clearColor;
550 	clearColor.r =  renderState.clearColor & 0x1F;
551 	clearColor.g = (renderState.clearColor >> 5) & 0x1F;
552 	clearColor.b = (renderState.clearColor >> 10) & 0x1F;
553 	clearColor.a = (renderState.clearColor >> 16) & 0x1F;
554 
555 	FragmentAttributes clearFragment;
556 	clearFragment.opaquePolyID = (renderState.clearColor >> 24) & 0x3F;
557 	//special value for uninitialized translucent polyid. without this, fires in spiderman2 dont display
558 	//I am not sure whether it is right, though. previously this was cleared to 0, as a guess,
559 	//but in spiderman2 some fires with polyid 0 try to render on top of the background
560 	clearFragment.translucentPolyID = kUnsetTranslucentPolyID;
561 	clearFragment.depth = renderState.clearDepth;
562 	clearFragment.stencil = 0;
563 	clearFragment.isTranslucentPoly = 0;
564 	clearFragment.isFogged = BIT15(renderState.clearColor);
565 
566 	if (renderState.enableClearImage)
567 	{
568 		//the lion, the witch, and the wardrobe (thats book 1, suck it you new-school numberers)
569 		//uses the scroll registers in the main game engine
570 		const u16 *__restrict clearColorBuffer = (u16 *__restrict)MMU.texInfo.textureSlotAddr[2];
571 		const u16 *__restrict clearDepthBuffer = (u16 *__restrict)MMU.texInfo.textureSlotAddr[3];
572 		const u16 scrollBits = T1ReadWord(MMU.ARM9_REG, 0x356); //CLRIMAGE_OFFSET
573 		const u8 xScroll = scrollBits & 0xFF;
574 		const u8 yScroll = (scrollBits >> 8) & 0xFF;
575 
576 		if (xScroll == 0 && yScroll == 0)
577 		{
578          size_t i;
579 			const __m128i depthBitMask_vec128 = _mm_set1_epi16(0x7FFF);
580 			const __m128i fogBufferBitMask_vec128 = _mm_set1_epi16(BIT(15));
581 			const __m128i opaquePolyID_vec128 = _mm_set1_epi8(clearFragment.opaquePolyID);
582 
583 			for (i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i += 16)
584 			{
585 				// Copy the colors to the color buffer. Since we can only copy 8 elements at once,
586 				// we need to load-store twice.
587 				_mm_store_si128( (__m128i *)(this->clearImageColor16Buffer + i + 8), _mm_loadu_si128((__m128i *)(clearColorBuffer + i + 8)) );
588 				_mm_store_si128( (__m128i *)(this->clearImageColor16Buffer + i), _mm_loadu_si128((__m128i *)(clearColorBuffer + i)) );
589 
590 				// Write the depth values to the depth buffer.
591 				__m128i clearDepthHi_vec128 = _mm_loadu_si128((__m128i *)(clearDepthBuffer + i + 8));
592 				__m128i clearDepthLo_vec128 = _mm_loadu_si128((__m128i *)(clearDepthBuffer + i));
593 				clearDepthHi_vec128         = _mm_and_si128(clearDepthHi_vec128, depthBitMask_vec128);
594 				clearDepthLo_vec128         = _mm_and_si128(clearDepthLo_vec128, depthBitMask_vec128);
595 
596 				this->clearImageDepthBuffer[i+15] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 7)];
597 				this->clearImageDepthBuffer[i+14] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 6)];
598 				this->clearImageDepthBuffer[i+13] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 5)];
599 				this->clearImageDepthBuffer[i+12] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 4)];
600 				this->clearImageDepthBuffer[i+11] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 3)];
601 				this->clearImageDepthBuffer[i+10] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 2)];
602 				this->clearImageDepthBuffer[i+ 9] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 1)];
603 				this->clearImageDepthBuffer[i+ 8] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 0)];
604 				this->clearImageDepthBuffer[i+ 7] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 7)];
605 				this->clearImageDepthBuffer[i+ 6] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 6)];
606 				this->clearImageDepthBuffer[i+ 5] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 5)];
607 				this->clearImageDepthBuffer[i+ 4] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 4)];
608 				this->clearImageDepthBuffer[i+ 3] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 3)];
609 				this->clearImageDepthBuffer[i+ 2] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 2)];
610 				this->clearImageDepthBuffer[i+ 1] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 1)];
611 				this->clearImageDepthBuffer[i+ 0] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 0)];
612 
613 				// Write the fog flags to the fog flag buffer.
614 				clearDepthHi_vec128 = _mm_loadu_si128((__m128i *)(clearDepthBuffer + i + 8));
615 				clearDepthLo_vec128 = _mm_loadu_si128((__m128i *)(clearDepthBuffer + i));
616 				clearDepthHi_vec128 = _mm_and_si128(clearDepthHi_vec128, fogBufferBitMask_vec128);
617 				clearDepthLo_vec128 = _mm_and_si128(clearDepthLo_vec128, fogBufferBitMask_vec128);
618 				clearDepthHi_vec128 = _mm_srli_epi16(clearDepthHi_vec128, 15);
619 				clearDepthLo_vec128 = _mm_srli_epi16(clearDepthLo_vec128, 15);
620 
621 				_mm_store_si128((__m128i *)(this->clearImageFogBuffer + i), _mm_packus_epi16(clearDepthLo_vec128, clearDepthHi_vec128));
622 
623 				// The one is easy. Just set the values in the polygon ID buffer.
624 				_mm_store_si128((__m128i *)(this->clearImagePolyIDBuffer + i), opaquePolyID_vec128);
625 			}
626 		}
627 		else
628 		{
629          size_t dstIndex;
630 			const __m128i addrOffset          = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
631 			const __m128i addrRolloverMask    = _mm_set1_epi16(0x00FF);
632 			const __m128i opaquePolyID_vec128 = _mm_set1_epi8(clearFragment.opaquePolyID);
633          size_t iy = 0;
634 
635 			for (dstIndex = 0; iy < GPU_FRAMEBUFFER_NATIVE_HEIGHT; iy++)
636 			{
637             size_t ix;
638 				const size_t y   = ((iy + yScroll) & 0xFF) << 8;
639 				__m128i y_vec128 = _mm_set1_epi16(y);
640 
641 				for (ix = 0; ix < GPU_FRAMEBUFFER_NATIVE_WIDTH; dstIndex += 8, ix += 8)
642 				{
643 					__m128i addr_vec128 = _mm_set1_epi16(ix + xScroll);
644 					addr_vec128         = _mm_add_epi16(addr_vec128, addrOffset);
645 					addr_vec128         = _mm_and_si128(addr_vec128, addrRolloverMask);
646 					addr_vec128         = _mm_or_si128(addr_vec128, y_vec128);
647 
648 					this->clearImageColor16Buffer[dstIndex+7] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 7)];
649 					this->clearImageColor16Buffer[dstIndex+6] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 6)];
650 					this->clearImageColor16Buffer[dstIndex+5] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 5)];
651 					this->clearImageColor16Buffer[dstIndex+4] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 4)];
652 					this->clearImageColor16Buffer[dstIndex+3] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 3)];
653 					this->clearImageColor16Buffer[dstIndex+2] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 2)];
654 					this->clearImageColor16Buffer[dstIndex+1] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 1)];
655 					this->clearImageColor16Buffer[dstIndex+0] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 0)];
656 
657 					this->clearImageDepthBuffer[dstIndex+7]   = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 7)] & 0x7FFF];
658 					this->clearImageDepthBuffer[dstIndex+6]   = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 6)] & 0x7FFF];
659 					this->clearImageDepthBuffer[dstIndex+5]   = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 5)] & 0x7FFF];
660 					this->clearImageDepthBuffer[dstIndex+4]   = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 4)] & 0x7FFF];
661 					this->clearImageDepthBuffer[dstIndex+3]   = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 3)] & 0x7FFF];
662 					this->clearImageDepthBuffer[dstIndex+2]   = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 2)] & 0x7FFF];
663 					this->clearImageDepthBuffer[dstIndex+1]   = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 1)] & 0x7FFF];
664 					this->clearImageDepthBuffer[dstIndex+0]   = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 0)] & 0x7FFF];
665 
666 					this->clearImageFogBuffer[dstIndex+7]     = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 7)] );
667 					this->clearImageFogBuffer[dstIndex+6]     = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 6)] );
668 					this->clearImageFogBuffer[dstIndex+5]     = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 5)] );
669 					this->clearImageFogBuffer[dstIndex+4]     = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 4)] );
670 					this->clearImageFogBuffer[dstIndex+3]     = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 3)] );
671 					this->clearImageFogBuffer[dstIndex+2]     = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 2)] );
672 					this->clearImageFogBuffer[dstIndex+1]     = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 1)] );
673 					this->clearImageFogBuffer[dstIndex+0]     = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 0)] );
674 
675 					_mm_storel_epi64((__m128i *)(this->clearImagePolyIDBuffer + dstIndex), opaquePolyID_vec128);
676 				}
677 			}
678 		}
679 
680 		error = this->ClearUsingImage(this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer, this->clearImagePolyIDBuffer);
681 		if (error != RENDER3DERROR_NOERR)
682 			error = this->ClearUsingValues(clearColor, clearFragment);
683 	}
684 	else
685 		error = this->ClearUsingValues(clearColor, clearFragment);
686 
687 	return error;
688 }
689 
690 #endif // ENABLE_SSE2
691