1 /*
2 Copyright (C) 2006-2007 shash
3 Copyright (C) 2008-2015 DeSmuME team
4
5 This file is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 2 of the License, or
8 (at your option) any later version.
9
10 This file is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
along with this software. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19 #include "render3D.h"
20
21 #include <string.h>
22
23 #ifdef ENABLE_SSE2
24 #include <emmintrin.h>
25 #endif
26
27 #ifdef ENABLE_SSSE3
28 #include <tmmintrin.h>
29 #endif
30
31 #include "utils/bits.h"
32 #include "common.h"
33 #include "gfx3d.h"
34 #include "MMU.h"
35 #include "texcache.h"
36
37
// Lookup table indexed by a 15-bit depth value; each entry holds the result of
// DS_DEPTH15TO24() for that index. Filled in by the first Render3D constructor call.
static CACHE_ALIGN u32 dsDepthToD24_LUT[32768] = {0};
// Identifier of the currently selected 3D core; starts out as the null core.
int cur3DCore = GPU3D_NULL;

// Interface entry for the "None" core. Its create/destroy hooks are the
// Render3DBaseCreate / Render3DBaseDestroy functions defined in this file.
GPU3DInterface gpu3DNull = {
	"None",
	Render3DBaseCreate,
	Render3DBaseDestroy
};

// Interface of the active core; NDS_3D_ChangeCore() repoints it.
GPU3DInterface *gpu3D = &gpu3DNull;
// Shared fallback renderer, created by Render3D_Init() and freed by Render3D_DeInit().
Render3D *BaseRenderer = NULL;
// Renderer currently in use; equals BaseRenderer whenever the null core is selected.
Render3D *CurrentRenderer = NULL;
50
Render3D_Init()51 void Render3D_Init()
52 {
53 if (BaseRenderer == NULL)
54 BaseRenderer = new Render3D;
55
56 if (CurrentRenderer == NULL)
57 {
58 gpu3D = &gpu3DNull;
59 cur3DCore = GPU3D_NULL;
60 CurrentRenderer = BaseRenderer;
61 }
62 }
63
Render3D_DeInit()64 void Render3D_DeInit()
65 {
66 gpu3D->NDS_3D_Close();
67 delete BaseRenderer;
68 BaseRenderer = NULL;
69 }
70
NDS_3D_ChangeCore(int newCore)71 bool NDS_3D_ChangeCore(int newCore)
72 {
73 bool result = false;
74
75 Render3DInterface *newRenderInterface = core3DList[newCore];
76 if (newRenderInterface->NDS_3D_Init == NULL)
77 return result;
78
79 // Some resources are shared between renderers, such as the texture cache,
80 // so we need to shut down the current renderer now to ensure that any
81 // shared resources aren't in use.
82 CurrentRenderer->RenderFinish();
83 gpu3D->NDS_3D_Close();
84 gpu3D = &gpu3DNull;
85 cur3DCore = GPU3D_NULL;
86 CurrentRenderer = BaseRenderer;
87
88 Render3D *newRenderer = newRenderInterface->NDS_3D_Init();
89 if (newRenderer == NULL)
90 return result;
91
92 Render3DError error = newRenderer->SetFramebufferSize(GPU->GetCustomFramebufferWidth(), GPU->GetCustomFramebufferHeight());
93 if (error != RENDER3DERROR_NOERR)
94 return result;
95
96 gpu3D = newRenderInterface;
97 cur3DCore = newCore;
98 CurrentRenderer = newRenderer;
99
100 result = true;
101 return result;
102 }
103
Render3DBaseCreate()104 Render3D* Render3DBaseCreate()
105 {
106 BaseRenderer->Reset();
107 return BaseRenderer;
108 }
109
Render3DBaseDestroy()110 void Render3DBaseDestroy()
111 {
112 if (CurrentRenderer != BaseRenderer)
113 {
114 delete CurrentRenderer;
115 CurrentRenderer = BaseRenderer;
116 }
117 }
118
FragmentAttributesBuffer(size_t newCount)119 FragmentAttributesBuffer::FragmentAttributesBuffer(size_t newCount)
120 {
121 count = newCount;
122
123 depth = (u32 *)memalign_alloc_aligned(count * sizeof(u32));
124 opaquePolyID = (u8 *)memalign_alloc_aligned(count * sizeof(u8));
125 translucentPolyID = (u8 *)memalign_alloc_aligned(count * sizeof(u8));
126 stencil = (u8 *)memalign_alloc_aligned(count * sizeof(u8));
127 isFogged = (u8 *)memalign_alloc_aligned(count * sizeof(u8));
128 isTranslucentPoly = (u8 *)memalign_alloc_aligned(count * sizeof(u8));
129 }
130
~FragmentAttributesBuffer()131 FragmentAttributesBuffer::~FragmentAttributesBuffer()
132 {
133 memalign_free(depth);
134 memalign_free(opaquePolyID);
135 memalign_free(translucentPolyID);
136 memalign_free(stencil);
137 memalign_free(isFogged);
138 memalign_free(isTranslucentPoly);
139 }
140
SetAtIndex(const size_t index,const FragmentAttributes & attr)141 void FragmentAttributesBuffer::SetAtIndex(const size_t index, const FragmentAttributes &attr)
142 {
143 this->depth[index] = attr.depth;
144 this->opaquePolyID[index] = attr.opaquePolyID;
145 this->translucentPolyID[index] = attr.translucentPolyID;
146 this->stencil[index] = attr.stencil;
147 this->isFogged[index] = attr.isFogged;
148 this->isTranslucentPoly[index] = attr.isTranslucentPoly;
149 }
150
SetAll(const FragmentAttributes & attr)151 void FragmentAttributesBuffer::SetAll(const FragmentAttributes &attr)
152 {
153 size_t i = 0;
154
155 #ifdef ENABLE_SSE2
156 const __m128i attrDepth_vec128 = _mm_set1_epi32(attr.depth);
157 const __m128i attrOpaquePolyID_vec128 = _mm_set1_epi8(attr.opaquePolyID);
158 const __m128i attrTranslucentPolyID_vec128 = _mm_set1_epi8(attr.translucentPolyID);
159 const __m128i attrStencil_vec128 = _mm_set1_epi8(attr.stencil);
160 const __m128i attrIsFogged_vec128 = _mm_set1_epi8(attr.isFogged);
161 const __m128i attrIsTranslucentPoly_vec128 = _mm_set1_epi8(attr.isTranslucentPoly);
162
163 const size_t sseCount = count - (count % 16);
164 for (; i < sseCount; i += 16)
165 {
166 _mm_stream_si128((__m128i *)(this->depth + 0), attrDepth_vec128);
167 _mm_stream_si128((__m128i *)(this->depth + 4), attrDepth_vec128);
168 _mm_stream_si128((__m128i *)(this->depth + 8), attrDepth_vec128);
169 _mm_stream_si128((__m128i *)(this->depth + 12), attrDepth_vec128);
170
171 _mm_stream_si128((__m128i *)this->opaquePolyID, attrOpaquePolyID_vec128);
172 _mm_stream_si128((__m128i *)this->translucentPolyID, attrTranslucentPolyID_vec128);
173 _mm_stream_si128((__m128i *)this->stencil, attrStencil_vec128);
174 _mm_stream_si128((__m128i *)this->isFogged, attrIsFogged_vec128);
175 _mm_stream_si128((__m128i *)this->isTranslucentPoly, attrIsTranslucentPoly_vec128);
176 }
177 #endif
178
179 for (; i < count; i++)
180 this->SetAtIndex(i, attr);
181 }
182
Render3D()183 Render3D::Render3D()
184 {
185 _renderID = RENDERID_NULL;
186 _renderName = "None";
187
188 static bool needTableInit = true;
189
190 if (needTableInit)
191 {
192 size_t i;
193 for (i = 0; i < 32768; i++)
194 dsDepthToD24_LUT[i] = (u32)DS_DEPTH15TO24(i);
195
196 needTableInit = false;
197 }
198
199 _framebufferWidth = GPU_FRAMEBUFFER_NATIVE_WIDTH;
200 _framebufferHeight = GPU_FRAMEBUFFER_NATIVE_HEIGHT;
201 _framebufferColorSizeBytes = 0;
202 _framebufferColor = NULL;
203
204 Reset();
205 }
206
~Render3D()207 Render3D::~Render3D()
208 {
209 memalign_free(_framebufferColor);
210 }
211
GetRenderID()212 RendererID Render3D::GetRenderID()
213 {
214 return this->_renderID;
215 }
216
GetName()217 std::string Render3D::GetName()
218 {
219 return this->_renderName;
220 }
221
GetFramebuffer()222 FragmentColor* Render3D::GetFramebuffer()
223 {
224 return this->_framebufferColor;
225 }
226
GetFramebufferWidth()227 size_t Render3D::GetFramebufferWidth()
228 {
229 return this->_framebufferWidth;
230 }
231
GetFramebufferHeight()232 size_t Render3D::GetFramebufferHeight()
233 {
234 return this->_framebufferHeight;
235 }
236
SetFramebufferSize(size_t w,size_t h)237 Render3DError Render3D::SetFramebufferSize(size_t w, size_t h)
238 {
239 if (w < GPU_FRAMEBUFFER_NATIVE_WIDTH || h < GPU_FRAMEBUFFER_NATIVE_HEIGHT)
240 return RENDER3DERROR_NOERR;
241
242 const size_t newFramebufferColorSizeBytes = w * h * sizeof(FragmentColor);
243 FragmentColor *oldFramebufferColor = this->_framebufferColor;
244 FragmentColor *newFramebufferColor = (FragmentColor *)memalign_alloc_aligned(newFramebufferColorSizeBytes);
245
246 this->_framebufferWidth = w;
247 this->_framebufferHeight = h;
248 this->_framebufferColorSizeBytes = newFramebufferColorSizeBytes;
249 this->_framebufferColor = newFramebufferColor;
250
251 memalign_free(oldFramebufferColor);
252
253 return RENDER3DERROR_NOERR;
254 }
255
BeginRender(const GFX3D & engine)256 Render3DError Render3D::BeginRender(const GFX3D &engine)
257 {
258 return RENDER3DERROR_NOERR;
259 }
260
RenderGeometry(const GFX3D_State & renderState,const POLYLIST * polyList,const INDEXLIST * indexList)261 Render3DError Render3D::RenderGeometry(const GFX3D_State &renderState, const POLYLIST *polyList, const INDEXLIST *indexList)
262 {
263 return RENDER3DERROR_NOERR;
264 }
265
RenderEdgeMarking(const u16 * colorTable,const bool useAntialias)266 Render3DError Render3D::RenderEdgeMarking(const u16 *colorTable, const bool useAntialias)
267 {
268 return RENDER3DERROR_NOERR;
269 }
270
RenderFog(const u8 * densityTable,const u32 color,const u32 offset,const u8 shift,const bool alphaOnly)271 Render3DError Render3D::RenderFog(const u8 *densityTable, const u32 color, const u32 offset, const u8 shift, const bool alphaOnly)
272 {
273 return RENDER3DERROR_NOERR;
274 }
275
EndRender(const u64 frameCount)276 Render3DError Render3D::EndRender(const u64 frameCount)
277 {
278 return RENDER3DERROR_NOERR;
279 }
280
FlushFramebuffer(FragmentColor * __restrict dstRGBA6665,u16 * __restrict dstRGBA5551)281 Render3DError Render3D::FlushFramebuffer(FragmentColor *__restrict dstRGBA6665, u16 *__restrict dstRGBA5551)
282 {
283 size_t i;
284 memcpy(dstRGBA6665, this->_framebufferColor, this->_framebufferColorSizeBytes);
285
286 /* Convert to RGBA5551 */
287 for (i = 0; i < (this->_framebufferWidth * this->_framebufferHeight); i++)
288 {
289 dstRGBA5551[i] = R6G6B6TORGB15(
290 this->_framebufferColor[i].r,
291 this->_framebufferColor[i].g,
292 this->_framebufferColor[i].b) | ((this->_framebufferColor[i].a == 0) ? 0x0000 : 0x8000);
293 }
294
295 return RENDER3DERROR_NOERR;
296 }
297
UpdateToonTable(const u16 * toonTableBuffer)298 Render3DError Render3D::UpdateToonTable(const u16 *toonTableBuffer)
299 {
300 return RENDER3DERROR_NOERR;
301 }
302
ClearFramebuffer(const GFX3D_State & renderState)303 Render3DError Render3D::ClearFramebuffer(const GFX3D_State &renderState)
304 {
305 Render3DError error = RENDER3DERROR_NOERR;
306
307 FragmentColor clearColor;
308 clearColor.r = renderState.clearColor & 0x1F;
309 clearColor.g = (renderState.clearColor >> 5) & 0x1F;
310 clearColor.b = (renderState.clearColor >> 10) & 0x1F;
311 clearColor.a = (renderState.clearColor >> 16) & 0x1F;
312
313 FragmentAttributes clearFragment;
314 clearFragment.opaquePolyID = (renderState.clearColor >> 24) & 0x3F;
315 //special value for uninitialized translucent polyid. without this, fires in spiderman2 dont display
316 //I am not sure whether it is right, though. previously this was cleared to 0, as a guess,
317 //but in spiderman2 some fires with polyid 0 try to render on top of the background
318 clearFragment.translucentPolyID = kUnsetTranslucentPolyID;
319 clearFragment.depth = renderState.clearDepth;
320 clearFragment.stencil = 0;
321 clearFragment.isTranslucentPoly = 0;
322 clearFragment.isFogged = BIT15(renderState.clearColor);
323
324 if (renderState.enableClearImage)
325 {
326 //the lion, the witch, and the wardrobe (thats book 1, suck it you new-school numberers)
327 //uses the scroll registers in the main game engine
328 const u16 *__restrict clearColorBuffer = (u16 *__restrict)MMU.texInfo.textureSlotAddr[2];
329 const u16 *__restrict clearDepthBuffer = (u16 *__restrict)MMU.texInfo.textureSlotAddr[3];
330 const u16 scrollBits = T1ReadWord(MMU.ARM9_REG, 0x356); //CLRIMAGE_OFFSET
331 const u8 xScroll = scrollBits & 0xFF;
332 const u8 yScroll = (scrollBits >> 8) & 0xFF;
333
334 if (xScroll == 0 && yScroll == 0)
335 {
336 size_t i;
337
338 for (i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i++)
339 {
340 this->clearImageColor16Buffer[i] = clearColorBuffer[i];
341 this->clearImageDepthBuffer[i] = dsDepthToD24_LUT[clearDepthBuffer[i] & 0x7FFF];
342 this->clearImageFogBuffer[i] = BIT15(clearDepthBuffer[i]);
343 this->clearImagePolyIDBuffer[i] = clearFragment.opaquePolyID;
344 }
345 }
346 else
347 {
348 size_t dstIndex;
349 size_t iy = 0;
350 for (dstIndex = 0; iy < GPU_FRAMEBUFFER_NATIVE_HEIGHT; iy++)
351 {
352 size_t ix;
353 const size_t y = ((iy + yScroll) & 0xFF) << 8;
354
355 for (ix = 0; ix < GPU_FRAMEBUFFER_NATIVE_WIDTH; dstIndex++, ix++)
356 {
357 const size_t x = (ix + xScroll) & 0xFF;
358 const size_t srcIndex = y | x;
359
360 //this is tested by harry potter and the order of the phoenix.
361 //TODO (optimization) dont do this if we are mapped to blank memory (such as in sonic chronicles)
362 //(or use a special zero fill in the bulk clearing above)
363 this->clearImageColor16Buffer[dstIndex] = clearColorBuffer[srcIndex];
364
365 //this is tested quite well in the sonic chronicles main map mode
366 //where depth values are used for trees etc you can walk behind
367 this->clearImageDepthBuffer[dstIndex] = dsDepthToD24_LUT[clearDepthBuffer[srcIndex] & 0x7FFF];
368
369 this->clearImageFogBuffer[dstIndex] = BIT15(clearDepthBuffer[srcIndex]);
370 this->clearImagePolyIDBuffer[dstIndex] = clearFragment.opaquePolyID;
371 }
372 }
373 }
374
375 error = this->ClearUsingImage(
376 this->clearImageColor16Buffer,
377 this->clearImageDepthBuffer,
378 this->clearImageFogBuffer,
379 this->clearImagePolyIDBuffer);
380 if (error != RENDER3DERROR_NOERR)
381 error = this->ClearUsingValues(clearColor, clearFragment);
382 }
383 else
384 error = this->ClearUsingValues(clearColor, clearFragment);
385
386 return error;
387 }
388
ClearUsingImage(const u16 * __restrict colorBuffer,const u32 * __restrict depthBuffer,const u8 * __restrict fogBuffer,const u8 * __restrict polyIDBuffer)389 Render3DError Render3D::ClearUsingImage(
390 const u16 *__restrict colorBuffer,
391 const u32 *__restrict depthBuffer,
392 const u8 *__restrict fogBuffer,
393 const u8 *__restrict polyIDBuffer)
394 {
395 return RENDER3DERROR_NOERR;
396 }
397
ClearUsingValues(const FragmentColor & clearColor,const FragmentAttributes & clearAttributes) const398 Render3DError Render3D::ClearUsingValues(
399 const FragmentColor &clearColor,
400 const FragmentAttributes &clearAttributes) const
401 {
402 return RENDER3DERROR_NOERR;
403 }
404
SetupPolygon(const POLY & thePoly)405 Render3DError Render3D::SetupPolygon(const POLY &thePoly)
406 {
407 return RENDER3DERROR_NOERR;
408 }
409
SetupTexture(const POLY & thePoly,bool enableTexturing)410 Render3DError Render3D::SetupTexture(const POLY &thePoly, bool enableTexturing)
411 {
412 return RENDER3DERROR_NOERR;
413 }
414
SetupViewport(const u32 viewportValue)415 Render3DError Render3D::SetupViewport(const u32 viewportValue)
416 {
417 return RENDER3DERROR_NOERR;
418 }
419
Reset()420 Render3DError Render3D::Reset()
421 {
422 if (this->_framebufferColor != NULL)
423 {
424 memset(this->_framebufferColor, 0, this->_framebufferColorSizeBytes);
425 this->FlushFramebuffer(GPU->GetEngineMain()->Get3DFramebufferRGBA6665(), GPU->GetEngineMain()->Get3DFramebufferRGBA5551());
426 }
427
428 memset(this->clearImageColor16Buffer, 0, sizeof(this->clearImageColor16Buffer));
429 memset(this->clearImageDepthBuffer, 0, sizeof(this->clearImageDepthBuffer));
430 memset(this->clearImagePolyIDBuffer, 0, sizeof(this->clearImagePolyIDBuffer));
431 memset(this->clearImageFogBuffer, 0, sizeof(this->clearImageFogBuffer));
432
433 TexCache_Reset();
434
435 return RENDER3DERROR_NOERR;
436 }
437
Render(const GFX3D & engine)438 Render3DError Render3D::Render(const GFX3D &engine)
439 {
440 Render3DError error = RENDER3DERROR_NOERR;
441
442 error = this->BeginRender(engine);
443 if (error != RENDER3DERROR_NOERR)
444 {
445 return error;
446 }
447
448 this->UpdateToonTable(engine.renderState.u16ToonTable);
449 this->ClearFramebuffer(engine.renderState);
450
451 this->RenderGeometry(engine.renderState, engine.polylist, &engine.indexlist);
452
453 if (engine.renderState.enableEdgeMarking)
454 this->RenderEdgeMarking(engine.renderState.edgeMarkColorTable,
455 engine.renderState.enableAntialiasing);
456
457 if (engine.renderState.enableFog)
458 {
459 this->RenderFog(
460 engine.renderState.fogDensityTable,
461 engine.renderState.fogColor,
462 engine.renderState.fogOffset,
463 engine.renderState.fogShift,
464 engine.renderState.enableFogAlphaOnly);
465 }
466
467 this->EndRender(engine.frameCtr);
468
469 return error;
470 }
471
RenderFinish()472 Render3DError Render3D::RenderFinish()
473 {
474 return RENDER3DERROR_NOERR;
475 }
476
VramReconfigureSignal()477 Render3DError Render3D::VramReconfigureSignal()
478 {
479 TexCache_Invalidate();
480 return RENDER3DERROR_NOERR;
481 }
482
483 #ifdef ENABLE_SSE2
484
FlushFramebuffer(FragmentColor * __restrict dstRGBA6665,u16 * __restrict dstRGBA5551)485 Render3DError Render3D_SSE2::FlushFramebuffer(FragmentColor *__restrict dstRGBA6665, u16 *__restrict dstRGBA5551)
486 {
487 size_t i;
488 const __m128i zero_vec128 = _mm_setzero_si128();
489 const size_t pixCount = this->_framebufferWidth * this->_framebufferHeight;
490 const size_t ssePixCount = pixCount - (pixCount % 4);
491
492 for (i = 0; i < ssePixCount; i += 4)
493 {
494 // Copy the framebufferColor buffer
495 __m128i color = _mm_load_si128((__m128i *)(this->_framebufferColor + i));
496 _mm_store_si128((__m128i *)(dstRGBA6665 + i), color);
497
498 // Convert to RGBA5551
499 __m128i r = _mm_and_si128(color, _mm_set1_epi32(0x0000003E)); // Read from R
500 r = _mm_srli_epi32(r, 1); // Shift to R
501
502 __m128i g = _mm_and_si128(color, _mm_set1_epi32(0x00003E00)); // Read from G
503 g = _mm_srli_epi32(g, 4); // Shift in G
504
505 __m128i b = _mm_and_si128(color, _mm_set1_epi32(0x003E0000)); // Read from B
506 b = _mm_srli_epi32(b, 7); // Shift to B
507
508 __m128i a = _mm_and_si128(color, _mm_set1_epi32(0xFF000000)); // Read from A
509 a = _mm_cmpeq_epi32(a, zero_vec128); // Determine A
510
511 // From here on, we're going to do an SSE2 trick to pack 32-bit down to unsigned
512 // 16-bit. Since SSE2 only has packssdw (signed 16-bit pack), then the alpha bit
513 // may be undefined. Now if we were using SSE4.1's packusdw (unsigned 16-bit pack),
514 // we wouldn't have to go through this hassle. But not everyone has an SSE4.1-capable
515 // CPU, so doing this the SSE2 way is more guaranteed to work an everyone's CPU.
516 //
517 // To use packssdw, we take a bit one position lower for the alpha bit, run
518 // packssdw, then shift the bit back to its original position. Then we por the
519 // alpha vector with the post-packed color vector to get the final color.
520
521 a = _mm_andnot_si128(a, _mm_set1_epi32(0x00004000)); // Mask out the bit before A
522 a = _mm_packs_epi32(a, zero_vec128); // Pack 32-bit down to 16-bit
523 a = _mm_slli_epi16(a, 1); // Shift the A bit back to where it needs to be
524
525 // Assemble the RGB colors, pack the 32-bit color into a signed 16-bit color, then por the alpha bit back in.
526 color = _mm_or_si128(_mm_or_si128(r, g), b);
527 color = _mm_packs_epi32(color, zero_vec128);
528 color = _mm_or_si128(color, a);
529
530 _mm_storel_epi64((__m128i *)(dstRGBA5551 + i), color);
531 }
532
533 for (; i < pixCount; i++)
534 {
535 dstRGBA6665[i] = this->_framebufferColor[i];
536 dstRGBA5551[i] = R6G6B6TORGB15(
537 this->_framebufferColor[i].r,
538 this->_framebufferColor[i].g,
539 this->_framebufferColor[i].b) | ((this->_framebufferColor[i].a == 0) ? 0x0000 : 0x8000);
540 }
541
542 return RENDER3DERROR_NOERR;
543 }
544
ClearFramebuffer(const GFX3D_State & renderState)545 Render3DError Render3D_SSE2::ClearFramebuffer(const GFX3D_State &renderState)
546 {
547 Render3DError error = RENDER3DERROR_NOERR;
548
549 FragmentColor clearColor;
550 clearColor.r = renderState.clearColor & 0x1F;
551 clearColor.g = (renderState.clearColor >> 5) & 0x1F;
552 clearColor.b = (renderState.clearColor >> 10) & 0x1F;
553 clearColor.a = (renderState.clearColor >> 16) & 0x1F;
554
555 FragmentAttributes clearFragment;
556 clearFragment.opaquePolyID = (renderState.clearColor >> 24) & 0x3F;
557 //special value for uninitialized translucent polyid. without this, fires in spiderman2 dont display
558 //I am not sure whether it is right, though. previously this was cleared to 0, as a guess,
559 //but in spiderman2 some fires with polyid 0 try to render on top of the background
560 clearFragment.translucentPolyID = kUnsetTranslucentPolyID;
561 clearFragment.depth = renderState.clearDepth;
562 clearFragment.stencil = 0;
563 clearFragment.isTranslucentPoly = 0;
564 clearFragment.isFogged = BIT15(renderState.clearColor);
565
566 if (renderState.enableClearImage)
567 {
568 //the lion, the witch, and the wardrobe (thats book 1, suck it you new-school numberers)
569 //uses the scroll registers in the main game engine
570 const u16 *__restrict clearColorBuffer = (u16 *__restrict)MMU.texInfo.textureSlotAddr[2];
571 const u16 *__restrict clearDepthBuffer = (u16 *__restrict)MMU.texInfo.textureSlotAddr[3];
572 const u16 scrollBits = T1ReadWord(MMU.ARM9_REG, 0x356); //CLRIMAGE_OFFSET
573 const u8 xScroll = scrollBits & 0xFF;
574 const u8 yScroll = (scrollBits >> 8) & 0xFF;
575
576 if (xScroll == 0 && yScroll == 0)
577 {
578 size_t i;
579 const __m128i depthBitMask_vec128 = _mm_set1_epi16(0x7FFF);
580 const __m128i fogBufferBitMask_vec128 = _mm_set1_epi16(BIT(15));
581 const __m128i opaquePolyID_vec128 = _mm_set1_epi8(clearFragment.opaquePolyID);
582
583 for (i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i += 16)
584 {
585 // Copy the colors to the color buffer. Since we can only copy 8 elements at once,
586 // we need to load-store twice.
587 _mm_store_si128( (__m128i *)(this->clearImageColor16Buffer + i + 8), _mm_loadu_si128((__m128i *)(clearColorBuffer + i + 8)) );
588 _mm_store_si128( (__m128i *)(this->clearImageColor16Buffer + i), _mm_loadu_si128((__m128i *)(clearColorBuffer + i)) );
589
590 // Write the depth values to the depth buffer.
591 __m128i clearDepthHi_vec128 = _mm_loadu_si128((__m128i *)(clearDepthBuffer + i + 8));
592 __m128i clearDepthLo_vec128 = _mm_loadu_si128((__m128i *)(clearDepthBuffer + i));
593 clearDepthHi_vec128 = _mm_and_si128(clearDepthHi_vec128, depthBitMask_vec128);
594 clearDepthLo_vec128 = _mm_and_si128(clearDepthLo_vec128, depthBitMask_vec128);
595
596 this->clearImageDepthBuffer[i+15] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 7)];
597 this->clearImageDepthBuffer[i+14] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 6)];
598 this->clearImageDepthBuffer[i+13] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 5)];
599 this->clearImageDepthBuffer[i+12] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 4)];
600 this->clearImageDepthBuffer[i+11] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 3)];
601 this->clearImageDepthBuffer[i+10] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 2)];
602 this->clearImageDepthBuffer[i+ 9] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 1)];
603 this->clearImageDepthBuffer[i+ 8] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthHi_vec128, 0)];
604 this->clearImageDepthBuffer[i+ 7] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 7)];
605 this->clearImageDepthBuffer[i+ 6] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 6)];
606 this->clearImageDepthBuffer[i+ 5] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 5)];
607 this->clearImageDepthBuffer[i+ 4] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 4)];
608 this->clearImageDepthBuffer[i+ 3] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 3)];
609 this->clearImageDepthBuffer[i+ 2] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 2)];
610 this->clearImageDepthBuffer[i+ 1] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 1)];
611 this->clearImageDepthBuffer[i+ 0] = dsDepthToD24_LUT[_mm_extract_epi16(clearDepthLo_vec128, 0)];
612
613 // Write the fog flags to the fog flag buffer.
614 clearDepthHi_vec128 = _mm_loadu_si128((__m128i *)(clearDepthBuffer + i + 8));
615 clearDepthLo_vec128 = _mm_loadu_si128((__m128i *)(clearDepthBuffer + i));
616 clearDepthHi_vec128 = _mm_and_si128(clearDepthHi_vec128, fogBufferBitMask_vec128);
617 clearDepthLo_vec128 = _mm_and_si128(clearDepthLo_vec128, fogBufferBitMask_vec128);
618 clearDepthHi_vec128 = _mm_srli_epi16(clearDepthHi_vec128, 15);
619 clearDepthLo_vec128 = _mm_srli_epi16(clearDepthLo_vec128, 15);
620
621 _mm_store_si128((__m128i *)(this->clearImageFogBuffer + i), _mm_packus_epi16(clearDepthLo_vec128, clearDepthHi_vec128));
622
623 // The one is easy. Just set the values in the polygon ID buffer.
624 _mm_store_si128((__m128i *)(this->clearImagePolyIDBuffer + i), opaquePolyID_vec128);
625 }
626 }
627 else
628 {
629 size_t dstIndex;
630 const __m128i addrOffset = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
631 const __m128i addrRolloverMask = _mm_set1_epi16(0x00FF);
632 const __m128i opaquePolyID_vec128 = _mm_set1_epi8(clearFragment.opaquePolyID);
633 size_t iy = 0;
634
635 for (dstIndex = 0; iy < GPU_FRAMEBUFFER_NATIVE_HEIGHT; iy++)
636 {
637 size_t ix;
638 const size_t y = ((iy + yScroll) & 0xFF) << 8;
639 __m128i y_vec128 = _mm_set1_epi16(y);
640
641 for (ix = 0; ix < GPU_FRAMEBUFFER_NATIVE_WIDTH; dstIndex += 8, ix += 8)
642 {
643 __m128i addr_vec128 = _mm_set1_epi16(ix + xScroll);
644 addr_vec128 = _mm_add_epi16(addr_vec128, addrOffset);
645 addr_vec128 = _mm_and_si128(addr_vec128, addrRolloverMask);
646 addr_vec128 = _mm_or_si128(addr_vec128, y_vec128);
647
648 this->clearImageColor16Buffer[dstIndex+7] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 7)];
649 this->clearImageColor16Buffer[dstIndex+6] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 6)];
650 this->clearImageColor16Buffer[dstIndex+5] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 5)];
651 this->clearImageColor16Buffer[dstIndex+4] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 4)];
652 this->clearImageColor16Buffer[dstIndex+3] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 3)];
653 this->clearImageColor16Buffer[dstIndex+2] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 2)];
654 this->clearImageColor16Buffer[dstIndex+1] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 1)];
655 this->clearImageColor16Buffer[dstIndex+0] = clearColorBuffer[_mm_extract_epi16(addr_vec128, 0)];
656
657 this->clearImageDepthBuffer[dstIndex+7] = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 7)] & 0x7FFF];
658 this->clearImageDepthBuffer[dstIndex+6] = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 6)] & 0x7FFF];
659 this->clearImageDepthBuffer[dstIndex+5] = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 5)] & 0x7FFF];
660 this->clearImageDepthBuffer[dstIndex+4] = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 4)] & 0x7FFF];
661 this->clearImageDepthBuffer[dstIndex+3] = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 3)] & 0x7FFF];
662 this->clearImageDepthBuffer[dstIndex+2] = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 2)] & 0x7FFF];
663 this->clearImageDepthBuffer[dstIndex+1] = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 1)] & 0x7FFF];
664 this->clearImageDepthBuffer[dstIndex+0] = dsDepthToD24_LUT[clearDepthBuffer[_mm_extract_epi16(addr_vec128, 0)] & 0x7FFF];
665
666 this->clearImageFogBuffer[dstIndex+7] = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 7)] );
667 this->clearImageFogBuffer[dstIndex+6] = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 6)] );
668 this->clearImageFogBuffer[dstIndex+5] = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 5)] );
669 this->clearImageFogBuffer[dstIndex+4] = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 4)] );
670 this->clearImageFogBuffer[dstIndex+3] = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 3)] );
671 this->clearImageFogBuffer[dstIndex+2] = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 2)] );
672 this->clearImageFogBuffer[dstIndex+1] = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 1)] );
673 this->clearImageFogBuffer[dstIndex+0] = BIT15( clearDepthBuffer[_mm_extract_epi16(addr_vec128, 0)] );
674
675 _mm_storel_epi64((__m128i *)(this->clearImagePolyIDBuffer + dstIndex), opaquePolyID_vec128);
676 }
677 }
678 }
679
680 error = this->ClearUsingImage(this->clearImageColor16Buffer, this->clearImageDepthBuffer, this->clearImageFogBuffer, this->clearImagePolyIDBuffer);
681 if (error != RENDER3DERROR_NOERR)
682 error = this->ClearUsingValues(clearColor, clearFragment);
683 }
684 else
685 error = this->ClearUsingValues(clearColor, clearFragment);
686
687 return error;
688 }
689
690 #endif // ENABLE_SSE2
691