1 // Copyright 2009 Dolphin Emulator Project
2 // Licensed under GPLv2+
3 // Refer to the license.txt file included.
4
5 #include "VideoBackends/Software/Rasterizer.h"
6
7 #include <algorithm>
8 #include <cstring>
9
10 #include "Common/CommonTypes.h"
11 #include "VideoBackends/Software/EfbInterface.h"
12 #include "VideoBackends/Software/NativeVertexFormat.h"
13 #include "VideoBackends/Software/Tev.h"
14 #include "VideoCommon/PerfQueryBase.h"
15 #include "VideoCommon/Statistics.h"
16 #include "VideoCommon/VideoCommon.h"
17 #include "VideoCommon/VideoConfig.h"
18 #include "VideoCommon/XFMemory.h"
19
20 namespace Rasterizer
21 {
22 static constexpr int BLOCK_SIZE = 2;
23
24 static Slope ZSlope;
25 static Slope WSlope;
26 static Slope ColorSlopes[2][4];
27 static Slope TexSlopes[8][3];
28
29 static s32 vertex0X;
30 static s32 vertex0Y;
31 static float vertexOffsetX;
32 static float vertexOffsetY;
33
34 static Tev tev;
35 static RasterBlock rasterBlock;
36
Init()37 void Init()
38 {
39 tev.Init();
40
41 // Set initial z reference plane in the unlikely case that zfreeze is enabled when drawing the
42 // first primitive.
43 // TODO: This is just a guess!
44 ZSlope.dfdx = ZSlope.dfdy = 0.f;
45 ZSlope.f0 = 1.f;
46 }
47
// Approximates log2(f) as an s28.4 fixed-point number straight from the float's bit pattern.
// The exponent field supplies the integer part and the four most significant mantissa bits
// stand in for the fraction; the sign bit is masked away. Close enough to use for LOD.
static s32 FixedLog2(float f)
{
  u32 bits;
  std::memcpy(&bits, &f, sizeof(bits));

  // Exponent shifted so that four fractional bits sit below it, then un-biased
  // (2032 == 127 << 4).
  const s32 whole = ((bits & 0x7F800000) >> 19) - 2032;
  // Top four of the 23 mantissa bits.
  const s32 fraction = (bits & 0x007fffff) >> 19;

  return whole + fraction;
}
60
// Converts x to an int: the value is truncated toward zero and then bumped up by one when the
// remaining fraction is at least one half.
// Note that for negative inputs the remainder is never positive, so they are simply truncated
// toward zero (e.g. -1.5 yields -1).
static inline int iround(float x)
{
  const int truncated = static_cast<int>(x);
  const bool bump = (x - truncated) >= 0.5;
  return bump ? truncated + 1 : truncated;
}
69
SetTevReg(int reg,int comp,s16 color)70 void SetTevReg(int reg, int comp, s16 color)
71 {
72 tev.SetRegColor(reg, comp, color);
73 }
74
Draw(s32 x,s32 y,s32 xi,s32 yi)75 static void Draw(s32 x, s32 y, s32 xi, s32 yi)
76 {
77 INCSTAT(g_stats.this_frame.rasterized_pixels);
78
79 float dx = vertexOffsetX + (float)(x - vertex0X);
80 float dy = vertexOffsetY + (float)(y - vertex0Y);
81
82 s32 z = (s32)std::clamp<float>(ZSlope.GetValue(dx, dy), 0.0f, 16777215.0f);
83
84 if (bpmem.UseEarlyDepthTest() && g_ActiveConfig.bZComploc)
85 {
86 // TODO: Test if perf regs are incremented even if test is disabled
87 EfbInterface::IncPerfCounterQuadCount(PQ_ZCOMP_INPUT_ZCOMPLOC);
88 if (bpmem.zmode.testenable)
89 {
90 // early z
91 if (!EfbInterface::ZCompare(x, y, z))
92 return;
93 }
94 EfbInterface::IncPerfCounterQuadCount(PQ_ZCOMP_OUTPUT_ZCOMPLOC);
95 }
96
97 RasterBlockPixel& pixel = rasterBlock.Pixel[xi][yi];
98
99 tev.Position[0] = x;
100 tev.Position[1] = y;
101 tev.Position[2] = z;
102
103 // colors
104 for (unsigned int i = 0; i < bpmem.genMode.numcolchans; i++)
105 {
106 for (int comp = 0; comp < 4; comp++)
107 {
108 u16 color = (u16)ColorSlopes[i][comp].GetValue(dx, dy);
109
110 // clamp color value to 0
111 u16 mask = ~(color >> 8);
112
113 tev.Color[i][comp] = color & mask;
114 }
115 }
116
117 // tex coords
118 for (unsigned int i = 0; i < bpmem.genMode.numtexgens; i++)
119 {
120 // multiply by 128 because TEV stores UVs as s17.7
121 tev.Uv[i].s = (s32)(pixel.Uv[i][0] * 128);
122 tev.Uv[i].t = (s32)(pixel.Uv[i][1] * 128);
123 }
124
125 for (unsigned int i = 0; i < bpmem.genMode.numindstages; i++)
126 {
127 tev.IndirectLod[i] = rasterBlock.IndirectLod[i];
128 tev.IndirectLinear[i] = rasterBlock.IndirectLinear[i];
129 }
130
131 for (unsigned int i = 0; i <= bpmem.genMode.numtevstages; i++)
132 {
133 tev.TextureLod[i] = rasterBlock.TextureLod[i];
134 tev.TextureLinear[i] = rasterBlock.TextureLinear[i];
135 }
136
137 tev.Draw();
138 }
139
InitTriangle(float X1,float Y1,s32 xi,s32 yi)140 static void InitTriangle(float X1, float Y1, s32 xi, s32 yi)
141 {
142 vertex0X = xi;
143 vertex0Y = yi;
144
145 // adjust a little less than 0.5
146 const float adjust = 0.495f;
147
148 vertexOffsetX = ((float)xi - X1) + adjust;
149 vertexOffsetY = ((float)yi - Y1) + adjust;
150 }
151
InitSlope(Slope * slope,float f1,float f2,float f3,float DX31,float DX12,float DY12,float DY31)152 static void InitSlope(Slope* slope, float f1, float f2, float f3, float DX31, float DX12,
153 float DY12, float DY31)
154 {
155 float DF31 = f3 - f1;
156 float DF21 = f2 - f1;
157 float a = DF31 * -DY12 - DF21 * DY31;
158 float b = DX31 * DF21 + DX12 * DF31;
159 float c = -DX12 * DY31 - DX31 * -DY12;
160 slope->dfdx = -a / c;
161 slope->dfdy = -b / c;
162 slope->f0 = f1;
163 }
164
CalculateLOD(s32 * lodp,bool * linear,u32 texmap,u32 texcoord)165 static inline void CalculateLOD(s32* lodp, bool* linear, u32 texmap, u32 texcoord)
166 {
167 const FourTexUnits& texUnit = bpmem.tex[(texmap >> 2) & 1];
168 const u8 subTexmap = texmap & 3;
169
170 // LOD calculation requires data from the texture mode for bias, etc.
171 // it does not seem to use the actual texture size
172 const TexMode0& tm0 = texUnit.texMode0[subTexmap];
173 const TexMode1& tm1 = texUnit.texMode1[subTexmap];
174
175 float sDelta, tDelta;
176 if (tm0.diag_lod)
177 {
178 float* uv0 = rasterBlock.Pixel[0][0].Uv[texcoord];
179 float* uv1 = rasterBlock.Pixel[1][1].Uv[texcoord];
180
181 sDelta = fabsf(uv0[0] - uv1[0]);
182 tDelta = fabsf(uv0[1] - uv1[1]);
183 }
184 else
185 {
186 float* uv0 = rasterBlock.Pixel[0][0].Uv[texcoord];
187 float* uv1 = rasterBlock.Pixel[1][0].Uv[texcoord];
188 float* uv2 = rasterBlock.Pixel[0][1].Uv[texcoord];
189
190 sDelta = std::max(fabsf(uv0[0] - uv1[0]), fabsf(uv0[0] - uv2[0]));
191 tDelta = std::max(fabsf(uv0[1] - uv1[1]), fabsf(uv0[1] - uv2[1]));
192 }
193
194 // get LOD in s28.4
195 s32 lod = FixedLog2(std::max(sDelta, tDelta));
196
197 // bias is s2.5
198 int bias = tm0.lod_bias;
199 bias >>= 1;
200 lod += bias;
201
202 *linear = ((lod > 0 && (tm0.min_filter & 4)) || (lod <= 0 && tm0.mag_filter));
203
204 // NOTE: The order of comparisons for this clamp check matters.
205 if (lod > static_cast<s32>(tm1.max_lod))
206 lod = static_cast<s32>(tm1.max_lod);
207 else if (lod < static_cast<s32>(tm1.min_lod))
208 lod = static_cast<s32>(tm1.min_lod);
209
210 *lodp = lod;
211 }
212
BuildBlock(s32 blockX,s32 blockY)213 static void BuildBlock(s32 blockX, s32 blockY)
214 {
215 for (s32 yi = 0; yi < BLOCK_SIZE; yi++)
216 {
217 for (s32 xi = 0; xi < BLOCK_SIZE; xi++)
218 {
219 RasterBlockPixel& pixel = rasterBlock.Pixel[xi][yi];
220
221 float dx = vertexOffsetX + (float)(xi + blockX - vertex0X);
222 float dy = vertexOffsetY + (float)(yi + blockY - vertex0Y);
223
224 float invW = 1.0f / WSlope.GetValue(dx, dy);
225 pixel.InvW = invW;
226
227 // tex coords
228 for (unsigned int i = 0; i < bpmem.genMode.numtexgens; i++)
229 {
230 float projection = invW;
231 if (xfmem.texMtxInfo[i].projection)
232 {
233 float q = TexSlopes[i][2].GetValue(dx, dy) * invW;
234 if (q != 0.0f)
235 projection = invW / q;
236 }
237
238 pixel.Uv[i][0] = TexSlopes[i][0].GetValue(dx, dy) * projection;
239 pixel.Uv[i][1] = TexSlopes[i][1].GetValue(dx, dy) * projection;
240 }
241 }
242 }
243
244 u32 indref = bpmem.tevindref.hex;
245 for (unsigned int i = 0; i < bpmem.genMode.numindstages; i++)
246 {
247 u32 texmap = indref & 3;
248 indref >>= 3;
249 u32 texcoord = indref & 3;
250 indref >>= 3;
251
252 CalculateLOD(&rasterBlock.IndirectLod[i], &rasterBlock.IndirectLinear[i], texmap, texcoord);
253 }
254
255 for (unsigned int i = 0; i <= bpmem.genMode.numtevstages; i++)
256 {
257 int stageOdd = i & 1;
258 const TwoTevStageOrders& order = bpmem.tevorders[i >> 1];
259 if (order.getEnable(stageOdd))
260 {
261 u32 texmap = order.getTexMap(stageOdd);
262 u32 texcoord = order.getTexCoord(stageOdd);
263
264 CalculateLOD(&rasterBlock.TextureLod[i], &rasterBlock.TextureLinear[i], texmap, texcoord);
265 }
266 }
267 }
268
DrawTriangleFrontFace(const OutputVertexData * v0,const OutputVertexData * v1,const OutputVertexData * v2)269 void DrawTriangleFrontFace(const OutputVertexData* v0, const OutputVertexData* v1,
270 const OutputVertexData* v2)
271 {
272 INCSTAT(g_stats.this_frame.num_triangles_drawn);
273
274 // adapted from http://devmaster.net/posts/6145/advanced-rasterization
275
276 // 28.4 fixed-pou32 coordinates. rounded to nearest and adjusted to match hardware output
277 // could also take floor and adjust -8
278 const s32 Y1 = iround(16.0f * v0->screenPosition[1]) - 9;
279 const s32 Y2 = iround(16.0f * v1->screenPosition[1]) - 9;
280 const s32 Y3 = iround(16.0f * v2->screenPosition[1]) - 9;
281
282 const s32 X1 = iround(16.0f * v0->screenPosition[0]) - 9;
283 const s32 X2 = iround(16.0f * v1->screenPosition[0]) - 9;
284 const s32 X3 = iround(16.0f * v2->screenPosition[0]) - 9;
285
286 // Deltas
287 const s32 DX12 = X1 - X2;
288 const s32 DX23 = X2 - X3;
289 const s32 DX31 = X3 - X1;
290
291 const s32 DY12 = Y1 - Y2;
292 const s32 DY23 = Y2 - Y3;
293 const s32 DY31 = Y3 - Y1;
294
295 // Fixed-pos32 deltas
296 const s32 FDX12 = DX12 * 16;
297 const s32 FDX23 = DX23 * 16;
298 const s32 FDX31 = DX31 * 16;
299
300 const s32 FDY12 = DY12 * 16;
301 const s32 FDY23 = DY23 * 16;
302 const s32 FDY31 = DY31 * 16;
303
304 // Bounding rectangle
305 s32 minx = (std::min(std::min(X1, X2), X3) + 0xF) >> 4;
306 s32 maxx = (std::max(std::max(X1, X2), X3) + 0xF) >> 4;
307 s32 miny = (std::min(std::min(Y1, Y2), Y3) + 0xF) >> 4;
308 s32 maxy = (std::max(std::max(Y1, Y2), Y3) + 0xF) >> 4;
309
310 // scissor
311 int xoff = bpmem.scissorOffset.x * 2 - 342;
312 int yoff = bpmem.scissorOffset.y * 2 - 342;
313
314 s32 scissorLeft = bpmem.scissorTL.x - xoff - 342;
315 if (scissorLeft < 0)
316 scissorLeft = 0;
317
318 s32 scissorTop = bpmem.scissorTL.y - yoff - 342;
319 if (scissorTop < 0)
320 scissorTop = 0;
321
322 s32 scissorRight = bpmem.scissorBR.x - xoff - 341;
323 if (scissorRight > s32(EFB_WIDTH))
324 scissorRight = EFB_WIDTH;
325
326 s32 scissorBottom = bpmem.scissorBR.y - yoff - 341;
327 if (scissorBottom > s32(EFB_HEIGHT))
328 scissorBottom = EFB_HEIGHT;
329
330 minx = std::max(minx, scissorLeft);
331 maxx = std::min(maxx, scissorRight);
332 miny = std::max(miny, scissorTop);
333 maxy = std::min(maxy, scissorBottom);
334
335 if (minx >= maxx || miny >= maxy)
336 return;
337
338 // Setup slopes
339 float fltx1 = v0->screenPosition.x;
340 float flty1 = v0->screenPosition.y;
341 float fltdx31 = v2->screenPosition.x - fltx1;
342 float fltdx12 = fltx1 - v1->screenPosition.x;
343 float fltdy12 = flty1 - v1->screenPosition.y;
344 float fltdy31 = v2->screenPosition.y - flty1;
345
346 InitTriangle(fltx1, flty1, (X1 + 0xF) >> 4, (Y1 + 0xF) >> 4);
347
348 float w[3] = {1.0f / v0->projectedPosition.w, 1.0f / v1->projectedPosition.w,
349 1.0f / v2->projectedPosition.w};
350 InitSlope(&WSlope, w[0], w[1], w[2], fltdx31, fltdx12, fltdy12, fltdy31);
351
352 // TODO: The zfreeze emulation is not quite correct, yet!
353 // Many things might prevent us from reaching this line (culling, clipping, scissoring).
354 // However, the zslope is always guaranteed to be calculated unless all vertices are trivially
355 // rejected during clipping!
356 // We're currently sloppy at this since we abort early if any of the culling/clipping/scissoring
357 // tests fail.
358 if (!bpmem.genMode.zfreeze || !g_ActiveConfig.bZFreeze)
359 InitSlope(&ZSlope, v0->screenPosition[2], v1->screenPosition[2], v2->screenPosition[2], fltdx31,
360 fltdx12, fltdy12, fltdy31);
361
362 for (unsigned int i = 0; i < bpmem.genMode.numcolchans; i++)
363 {
364 for (int comp = 0; comp < 4; comp++)
365 InitSlope(&ColorSlopes[i][comp], v0->color[i][comp], v1->color[i][comp], v2->color[i][comp],
366 fltdx31, fltdx12, fltdy12, fltdy31);
367 }
368
369 for (unsigned int i = 0; i < bpmem.genMode.numtexgens; i++)
370 {
371 for (int comp = 0; comp < 3; comp++)
372 InitSlope(&TexSlopes[i][comp], v0->texCoords[i][comp] * w[0], v1->texCoords[i][comp] * w[1],
373 v2->texCoords[i][comp] * w[2], fltdx31, fltdx12, fltdy12, fltdy31);
374 }
375
376 // Half-edge constants
377 s32 C1 = DY12 * X1 - DX12 * Y1;
378 s32 C2 = DY23 * X2 - DX23 * Y2;
379 s32 C3 = DY31 * X3 - DX31 * Y3;
380
381 // Correct for fill convention
382 if (DY12 < 0 || (DY12 == 0 && DX12 > 0))
383 C1++;
384 if (DY23 < 0 || (DY23 == 0 && DX23 > 0))
385 C2++;
386 if (DY31 < 0 || (DY31 == 0 && DX31 > 0))
387 C3++;
388
389 // Start in corner of 8x8 block
390 minx &= ~(BLOCK_SIZE - 1);
391 miny &= ~(BLOCK_SIZE - 1);
392
393 // Loop through blocks
394 for (s32 y = miny; y < maxy; y += BLOCK_SIZE)
395 {
396 for (s32 x = minx; x < maxx; x += BLOCK_SIZE)
397 {
398 // Corners of block
399 s32 x0 = x << 4;
400 s32 x1 = (x + BLOCK_SIZE - 1) << 4;
401 s32 y0 = y << 4;
402 s32 y1 = (y + BLOCK_SIZE - 1) << 4;
403
404 // Evaluate half-space functions
405 bool a00 = C1 + DX12 * y0 - DY12 * x0 > 0;
406 bool a10 = C1 + DX12 * y0 - DY12 * x1 > 0;
407 bool a01 = C1 + DX12 * y1 - DY12 * x0 > 0;
408 bool a11 = C1 + DX12 * y1 - DY12 * x1 > 0;
409 int a = (a00 << 0) | (a10 << 1) | (a01 << 2) | (a11 << 3);
410
411 bool b00 = C2 + DX23 * y0 - DY23 * x0 > 0;
412 bool b10 = C2 + DX23 * y0 - DY23 * x1 > 0;
413 bool b01 = C2 + DX23 * y1 - DY23 * x0 > 0;
414 bool b11 = C2 + DX23 * y1 - DY23 * x1 > 0;
415 int b = (b00 << 0) | (b10 << 1) | (b01 << 2) | (b11 << 3);
416
417 bool c00 = C3 + DX31 * y0 - DY31 * x0 > 0;
418 bool c10 = C3 + DX31 * y0 - DY31 * x1 > 0;
419 bool c01 = C3 + DX31 * y1 - DY31 * x0 > 0;
420 bool c11 = C3 + DX31 * y1 - DY31 * x1 > 0;
421 int c = (c00 << 0) | (c10 << 1) | (c01 << 2) | (c11 << 3);
422
423 // Skip block when outside an edge
424 if (a == 0x0 || b == 0x0 || c == 0x0)
425 continue;
426
427 BuildBlock(x, y);
428
429 // Accept whole block when totally covered
430 if (a == 0xF && b == 0xF && c == 0xF)
431 {
432 for (s32 iy = 0; iy < BLOCK_SIZE; iy++)
433 {
434 for (s32 ix = 0; ix < BLOCK_SIZE; ix++)
435 {
436 Draw(x + ix, y + iy, ix, iy);
437 }
438 }
439 }
440 else // Partially covered block
441 {
442 s32 CY1 = C1 + DX12 * y0 - DY12 * x0;
443 s32 CY2 = C2 + DX23 * y0 - DY23 * x0;
444 s32 CY3 = C3 + DX31 * y0 - DY31 * x0;
445
446 for (s32 iy = 0; iy < BLOCK_SIZE; iy++)
447 {
448 s32 CX1 = CY1;
449 s32 CX2 = CY2;
450 s32 CX3 = CY3;
451
452 for (s32 ix = 0; ix < BLOCK_SIZE; ix++)
453 {
454 if (CX1 > 0 && CX2 > 0 && CX3 > 0)
455 {
456 Draw(x + ix, y + iy, ix, iy);
457 }
458
459 CX1 -= FDY12;
460 CX2 -= FDY23;
461 CX3 -= FDY31;
462 }
463
464 CY1 += FDX12;
465 CY2 += FDX23;
466 CY3 += FDX31;
467 }
468 }
469 }
470 }
471 }
472 } // namespace Rasterizer
473