1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "PixelPipeline.hpp"
16 #include "SamplerCore.hpp"
17 #include "Renderer/Renderer.hpp"
18 
19 namespace sw
20 {
21 	extern bool postBlendSRGB;
22 
setBuiltins(Int & x,Int & y,Float4 (& z)[4],Float4 & w)23 	void PixelPipeline::setBuiltins(Int &x, Int &y, Float4(&z)[4], Float4 &w)
24 	{
25 		if(state.color[0].component & 0x1) diffuse.x = convertFixed12(v[0].x); else diffuse.x = Short4(0x1000);
26 		if(state.color[0].component & 0x2) diffuse.y = convertFixed12(v[0].y); else diffuse.y = Short4(0x1000);
27 		if(state.color[0].component & 0x4) diffuse.z = convertFixed12(v[0].z); else diffuse.z = Short4(0x1000);
28 		if(state.color[0].component & 0x8) diffuse.w = convertFixed12(v[0].w); else diffuse.w = Short4(0x1000);
29 
30 		if(state.color[1].component & 0x1) specular.x = convertFixed12(v[1].x); else specular.x = Short4(0x0000);
31 		if(state.color[1].component & 0x2) specular.y = convertFixed12(v[1].y); else specular.y = Short4(0x0000);
32 		if(state.color[1].component & 0x4) specular.z = convertFixed12(v[1].z); else specular.z = Short4(0x0000);
33 		if(state.color[1].component & 0x8) specular.w = convertFixed12(v[1].w); else specular.w = Short4(0x0000);
34 	}
35 
fixedFunction()36 	void PixelPipeline::fixedFunction()
37 	{
38 		current = diffuse;
39 		Vector4s temp(0x0000, 0x0000, 0x0000, 0x0000);
40 
41 		for(int stage = 0; stage < 8; stage++)
42 		{
43 			if(state.textureStage[stage].stageOperation == TextureStage::STAGE_DISABLE)
44 			{
45 				break;
46 			}
47 
48 			Vector4s texture;
49 
50 			if(state.textureStage[stage].usesTexture)
51 			{
52 				texture = sampleTexture(stage, stage);
53 			}
54 
55 			blendTexture(temp, texture, stage);
56 		}
57 
58 		specularPixel(current, specular);
59 	}
60 
applyShader(Int cMask[4])61 	void PixelPipeline::applyShader(Int cMask[4])
62 	{
63 		if(!shader)
64 		{
65 			fixedFunction();
66 			return;
67 		}
68 
69 		int pad = 0;        // Count number of texm3x3pad instructions
70 		Vector4s dPairing;   // Destination for first pairing instruction
71 
72 		for(size_t i = 0; i < shader->getLength(); i++)
73 		{
74 			const Shader::Instruction *instruction = shader->getInstruction(i);
75 			Shader::Opcode opcode = instruction->opcode;
76 
77 			//	#ifndef NDEBUG   // FIXME: Centralize debug output control
78 			//		shader->printInstruction(i, "debug.txt");
79 			//	#endif
80 
81 			if(opcode == Shader::OPCODE_DCL || opcode == Shader::OPCODE_DEF || opcode == Shader::OPCODE_DEFI || opcode == Shader::OPCODE_DEFB)
82 			{
83 				continue;
84 			}
85 
86 			const Dst &dst = instruction->dst;
87 			const Src &src0 = instruction->src[0];
88 			const Src &src1 = instruction->src[1];
89 			const Src &src2 = instruction->src[2];
90 
91 			unsigned short shaderModel = shader->getShaderModel();
92 			bool pairing = i + 1 < shader->getLength() && shader->getInstruction(i + 1)->coissue;   // First instruction of pair
93 			bool coissue = instruction->coissue;                                                              // Second instruction of pair
94 
95 			Vector4s d;
96 			Vector4s s0;
97 			Vector4s s1;
98 			Vector4s s2;
99 
100 			if(src0.type != Shader::PARAMETER_VOID) s0 = fetchRegister(src0);
101 			if(src1.type != Shader::PARAMETER_VOID) s1 = fetchRegister(src1);
102 			if(src2.type != Shader::PARAMETER_VOID) s2 = fetchRegister(src2);
103 
104 			Float4 x = shaderModel < 0x0104 ? v[2 + dst.index].x : v[2 + src0.index].x;
105 			Float4 y = shaderModel < 0x0104 ? v[2 + dst.index].y : v[2 + src0.index].y;
106 			Float4 z = shaderModel < 0x0104 ? v[2 + dst.index].z : v[2 + src0.index].z;
107 			Float4 w = shaderModel < 0x0104 ? v[2 + dst.index].w : v[2 + src0.index].w;
108 
109 			switch(opcode)
110 			{
111 			case Shader::OPCODE_PS_1_0: break;
112 			case Shader::OPCODE_PS_1_1: break;
113 			case Shader::OPCODE_PS_1_2: break;
114 			case Shader::OPCODE_PS_1_3: break;
115 			case Shader::OPCODE_PS_1_4: break;
116 
117 			case Shader::OPCODE_DEF:    break;
118 
119 			case Shader::OPCODE_NOP:    break;
120 			case Shader::OPCODE_MOV: MOV(d, s0);         break;
121 			case Shader::OPCODE_ADD: ADD(d, s0, s1);     break;
122 			case Shader::OPCODE_SUB: SUB(d, s0, s1);     break;
123 			case Shader::OPCODE_MAD: MAD(d, s0, s1, s2); break;
124 			case Shader::OPCODE_MUL: MUL(d, s0, s1);     break;
125 			case Shader::OPCODE_DP3: DP3(d, s0, s1);     break;
126 			case Shader::OPCODE_DP4: DP4(d, s0, s1);     break;
127 			case Shader::OPCODE_LRP: LRP(d, s0, s1, s2); break;
128 			case Shader::OPCODE_TEXCOORD:
129 				if(shaderModel < 0x0104)
130 				{
131 					TEXCOORD(d, x, y, z, dst.index);
132 			}
133 				else
134 				{
135 					if((src0.swizzle & 0x30) == 0x20)   // .xyz
136 					{
137 						TEXCRD(d, x, y, z, src0.index, src0.modifier == Shader::MODIFIER_DZ || src0.modifier == Shader::MODIFIER_DW);
138 					}
139 					else   // .xwy
140 					{
141 						TEXCRD(d, x, y, w, src0.index, src0.modifier == Shader::MODIFIER_DZ || src0.modifier == Shader::MODIFIER_DW);
142 					}
143 				}
144 				break;
145 			case Shader::OPCODE_TEXKILL:
146 				if(shaderModel < 0x0104)
147 				{
148 					TEXKILL(cMask, x, y, z);
149 				}
150 				else if(shaderModel == 0x0104)
151 				{
152 					if(dst.type == Shader::PARAMETER_TEXTURE)
153 					{
154 						TEXKILL(cMask, x, y, z);
155 					}
156 					else
157 					{
158 						TEXKILL(cMask, rs[dst.index]);
159 					}
160 				}
161 				else ASSERT(false);
162 				break;
163 			case Shader::OPCODE_TEX:
164 				if(shaderModel < 0x0104)
165 				{
166 					TEX(d, x, y, z, dst.index, false);
167 				}
168 				else if(shaderModel == 0x0104)
169 				{
170 					if(src0.type == Shader::PARAMETER_TEXTURE)
171 					{
172 						if((src0.swizzle & 0x30) == 0x20)   // .xyz
173 						{
174 							TEX(d, x, y, z, dst.index, src0.modifier == Shader::MODIFIER_DZ || src0.modifier == Shader::MODIFIER_DW);
175 						}
176 						else   // .xyw
177 						{
178 							TEX(d, x, y, w, dst.index, src0.modifier == Shader::MODIFIER_DZ || src0.modifier == Shader::MODIFIER_DW);
179 						}
180 					}
181 					else
182 					{
183 						TEXLD(d, s0, dst.index, src0.modifier == Shader::MODIFIER_DZ || src0.modifier == Shader::MODIFIER_DW);
184 					}
185 				}
186 				else ASSERT(false);
187 				break;
188 			case Shader::OPCODE_TEXBEM:       TEXBEM(d, s0, x, y, z, dst.index);                                             break;
189 			case Shader::OPCODE_TEXBEML:      TEXBEML(d, s0, x, y, z, dst.index);                                            break;
190 			case Shader::OPCODE_TEXREG2AR:    TEXREG2AR(d, s0, dst.index);                                                   break;
191 			case Shader::OPCODE_TEXREG2GB:    TEXREG2GB(d, s0, dst.index);                                                   break;
192 			case Shader::OPCODE_TEXM3X2PAD:   TEXM3X2PAD(x, y, z, s0, 0, src0.modifier == Shader::MODIFIER_SIGN);            break;
193 			case Shader::OPCODE_TEXM3X2TEX:   TEXM3X2TEX(d, x, y, z, dst.index, s0, src0.modifier == Shader::MODIFIER_SIGN); break;
194 			case Shader::OPCODE_TEXM3X3PAD:   TEXM3X3PAD(x, y, z, s0, pad++ % 2, src0.modifier == Shader::MODIFIER_SIGN);    break;
195 			case Shader::OPCODE_TEXM3X3TEX:   TEXM3X3TEX(d, x, y, z, dst.index, s0, src0.modifier == Shader::MODIFIER_SIGN); break;
196 			case Shader::OPCODE_TEXM3X3SPEC:  TEXM3X3SPEC(d, x, y, z, dst.index, s0, s1);                                    break;
197 			case Shader::OPCODE_TEXM3X3VSPEC: TEXM3X3VSPEC(d, x, y, z, dst.index, s0);                                       break;
198 			case Shader::OPCODE_CND:          CND(d, s0, s1, s2);                                                            break;
199 			case Shader::OPCODE_TEXREG2RGB:   TEXREG2RGB(d, s0, dst.index);                                                  break;
200 			case Shader::OPCODE_TEXDP3TEX:    TEXDP3TEX(d, x, y, z, dst.index, s0);                                          break;
201 			case Shader::OPCODE_TEXM3X2DEPTH: TEXM3X2DEPTH(d, x, y, z, s0, src0.modifier == Shader::MODIFIER_SIGN);          break;
202 			case Shader::OPCODE_TEXDP3:       TEXDP3(d, x, y, z, s0);                                                        break;
203 			case Shader::OPCODE_TEXM3X3:      TEXM3X3(d, x, y, z, s0, src0.modifier == Shader::MODIFIER_SIGN);               break;
204 			case Shader::OPCODE_TEXDEPTH:     TEXDEPTH();                                                                    break;
205 			case Shader::OPCODE_CMP0:         CMP(d, s0, s1, s2);                                                            break;
206 			case Shader::OPCODE_BEM:          BEM(d, s0, s1, dst.index);                                                     break;
207 			case Shader::OPCODE_PHASE:                                                                                       break;
208 			case Shader::OPCODE_END:                                                                                         break;
209 			default:
210 				ASSERT(false);
211 			}
212 
213 			if(dst.type != Shader::PARAMETER_VOID && opcode != Shader::OPCODE_TEXKILL)
214 			{
215 				if(dst.shift > 0)
216 				{
217 					if(dst.mask & 0x1) { d.x = AddSat(d.x, d.x); if(dst.shift > 1) d.x = AddSat(d.x, d.x); if(dst.shift > 2) d.x = AddSat(d.x, d.x); }
218 					if(dst.mask & 0x2) { d.y = AddSat(d.y, d.y); if(dst.shift > 1) d.y = AddSat(d.y, d.y); if(dst.shift > 2) d.y = AddSat(d.y, d.y); }
219 					if(dst.mask & 0x4) { d.z = AddSat(d.z, d.z); if(dst.shift > 1) d.z = AddSat(d.z, d.z); if(dst.shift > 2) d.z = AddSat(d.z, d.z); }
220 					if(dst.mask & 0x8) { d.w = AddSat(d.w, d.w); if(dst.shift > 1) d.w = AddSat(d.w, d.w); if(dst.shift > 2) d.w = AddSat(d.w, d.w); }
221 				}
222 				else if(dst.shift < 0)
223 				{
224 					if(dst.mask & 0x1) d.x = d.x >> -dst.shift;
225 					if(dst.mask & 0x2) d.y = d.y >> -dst.shift;
226 					if(dst.mask & 0x4) d.z = d.z >> -dst.shift;
227 					if(dst.mask & 0x8) d.w = d.w >> -dst.shift;
228 				}
229 
230 				if(dst.saturate)
231 				{
232 					if(dst.mask & 0x1) { d.x = Min(d.x, Short4(0x1000)); d.x = Max(d.x, Short4(0x0000)); }
233 					if(dst.mask & 0x2) { d.y = Min(d.y, Short4(0x1000)); d.y = Max(d.y, Short4(0x0000)); }
234 					if(dst.mask & 0x4) { d.z = Min(d.z, Short4(0x1000)); d.z = Max(d.z, Short4(0x0000)); }
235 					if(dst.mask & 0x8) { d.w = Min(d.w, Short4(0x1000)); d.w = Max(d.w, Short4(0x0000)); }
236 				}
237 
238 				if(pairing)
239 				{
240 					if(dst.mask & 0x1) dPairing.x = d.x;
241 					if(dst.mask & 0x2) dPairing.y = d.y;
242 					if(dst.mask & 0x4) dPairing.z = d.z;
243 					if(dst.mask & 0x8) dPairing.w = d.w;
244 				}
245 
246 				if(coissue)
247 				{
248 					const Dst &dst = shader->getInstruction(i - 1)->dst;
249 
250 					writeDestination(dPairing, dst);
251 				}
252 
253 				if(!pairing)
254 				{
255 					writeDestination(d, dst);
256 				}
257 			}
258 		}
259 
260 		current.x = Min(current.x, Short4(0x0FFF)); current.x = Max(current.x, Short4(0x0000));
261 		current.y = Min(current.y, Short4(0x0FFF)); current.y = Max(current.y, Short4(0x0000));
262 		current.z = Min(current.z, Short4(0x0FFF)); current.z = Max(current.z, Short4(0x0000));
263 		current.w = Min(current.w, Short4(0x0FFF)); current.w = Max(current.w, Short4(0x0000));
264 	}
265 
alphaTest(Int cMask[4])266 	Bool PixelPipeline::alphaTest(Int cMask[4])
267 	{
268 		if(!state.alphaTestActive())
269 		{
270 			return true;
271 		}
272 
273 		Int aMask;
274 
275 		if(state.transparencyAntialiasing == TRANSPARENCY_NONE)
276 		{
277 			PixelRoutine::alphaTest(aMask, current.w);
278 
279 			for(unsigned int q = 0; q < state.multiSample; q++)
280 			{
281 				cMask[q] &= aMask;
282 			}
283 		}
284 		else if(state.transparencyAntialiasing == TRANSPARENCY_ALPHA_TO_COVERAGE)
285 		{
286 			Float4 alpha = Float4(current.w) * Float4(1.0f / 0x1000);
287 
288 			alphaToCoverage(cMask, alpha);
289 		}
290 		else ASSERT(false);
291 
292 		Int pass = cMask[0];
293 
294 		for(unsigned int q = 1; q < state.multiSample; q++)
295 		{
296 			pass = pass | cMask[q];
297 		}
298 
299 		return pass != 0x0;
300 	}
301 
rasterOperation(Float4 & fog,Pointer<Byte> cBuffer[4],Int & x,Int sMask[4],Int zMask[4],Int cMask[4])302 	void PixelPipeline::rasterOperation(Float4 &fog, Pointer<Byte> cBuffer[4], Int &x, Int sMask[4], Int zMask[4], Int cMask[4])
303 	{
304 		if(!state.colorWriteActive(0))
305 		{
306 			return;
307 		}
308 
309 		Vector4f oC;
310 
311 		switch(state.targetFormat[0])
312 		{
313 		case FORMAT_R5G6B5:
314 		case FORMAT_X8R8G8B8:
315 		case FORMAT_X8B8G8R8:
316 		case FORMAT_A8R8G8B8:
317 		case FORMAT_A8B8G8R8:
318 		case FORMAT_A8:
319 		case FORMAT_G16R16:
320 		case FORMAT_A16B16G16R16:
321 			if(!postBlendSRGB && state.writeSRGB)
322 			{
323 				linearToSRGB12_16(current);
324 			}
325 			else
326 			{
327 				current.x <<= 4;
328 				current.y <<= 4;
329 				current.z <<= 4;
330 				current.w <<= 4;
331 			}
332 
333 			if(state.targetFormat[0] == FORMAT_R5G6B5)
334 			{
335 				current.x &= Short4(0xF800u);
336 				current.y &= Short4(0xFC00u);
337 				current.z &= Short4(0xF800u);
338 			}
339 
340 			fogBlend(current, fog);
341 
342 			for(unsigned int q = 0; q < state.multiSample; q++)
343 			{
344 				Pointer<Byte> buffer = cBuffer[0] + q * *Pointer<Int>(data + OFFSET(DrawData, colorSliceB[0]));
345 				Vector4s color = current;
346 
347 				if(state.multiSampleMask & (1 << q))
348 				{
349 					alphaBlend(0, buffer, color, x);
350 					logicOperation(0, buffer, color, x);
351 					writeColor(0, buffer, x, color, sMask[q], zMask[q], cMask[q]);
352 				}
353 			}
354 			break;
355 		case FORMAT_R32F:
356 		case FORMAT_G32R32F:
357 		case FORMAT_X32B32G32R32F:
358 		case FORMAT_A32B32G32R32F:
359 	//	case FORMAT_X32B32G32R32F_UNSIGNED:   // Not renderable in any fixed-function API.
360 			convertSigned12(oC, current);
361 			PixelRoutine::fogBlend(oC, fog);
362 
363 			for(unsigned int q = 0; q < state.multiSample; q++)
364 			{
365 				Pointer<Byte> buffer = cBuffer[0] + q * *Pointer<Int>(data + OFFSET(DrawData, colorSliceB[0]));
366 				Vector4f color = oC;
367 
368 				if(state.multiSampleMask & (1 << q))
369 				{
370 					alphaBlend(0, buffer, color, x);
371 					writeColor(0, buffer, x, color, sMask[q], zMask[q], cMask[q]);
372 				}
373 			}
374 			break;
375 		default:
376 			ASSERT(false);
377 		}
378 	}
379 
blendTexture(Vector4s & temp,Vector4s & texture,int stage)380 	void PixelPipeline::blendTexture(Vector4s &temp, Vector4s &texture, int stage)
381 	{
382 		Vector4s *arg1 = nullptr;
383 		Vector4s *arg2 = nullptr;
384 		Vector4s *arg3 = nullptr;
385 		Vector4s res;
386 
387 		Vector4s constant;
388 		Vector4s tfactor;
389 
390 		const TextureStage::State &textureStage = state.textureStage[stage];
391 
392 		if(textureStage.firstArgument == TextureStage::SOURCE_CONSTANT ||
393 		   textureStage.firstArgumentAlpha == TextureStage::SOURCE_CONSTANT ||
394 		   textureStage.secondArgument == TextureStage::SOURCE_CONSTANT ||
395 		   textureStage.secondArgumentAlpha == TextureStage::SOURCE_CONSTANT ||
396 		   textureStage.thirdArgument == TextureStage::SOURCE_CONSTANT ||
397 		   textureStage.thirdArgumentAlpha == TextureStage::SOURCE_CONSTANT)
398 		{
399 			constant.x = *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].constantColor4[0]));
400 			constant.y = *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].constantColor4[1]));
401 			constant.z = *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].constantColor4[2]));
402 			constant.w = *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].constantColor4[3]));
403 		}
404 
405 		if(textureStage.firstArgument == TextureStage::SOURCE_TFACTOR ||
406 		   textureStage.firstArgumentAlpha == TextureStage::SOURCE_TFACTOR ||
407 		   textureStage.secondArgument == TextureStage::SOURCE_TFACTOR ||
408 		   textureStage.secondArgumentAlpha == TextureStage::SOURCE_TFACTOR ||
409 		   textureStage.thirdArgument == TextureStage::SOURCE_TFACTOR ||
410 		   textureStage.thirdArgumentAlpha == TextureStage::SOURCE_TFACTOR)
411 		{
412 			tfactor.x = *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[0]));
413 			tfactor.y = *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[1]));
414 			tfactor.z = *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[2]));
415 			tfactor.w = *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[3]));
416 		}
417 
418 		// Premodulate
419 		if(stage > 0 && textureStage.usesTexture)
420 		{
421 			if(state.textureStage[stage - 1].stageOperation == TextureStage::STAGE_PREMODULATE)
422 			{
423 				current.x = MulHigh(current.x, texture.x) << 4;
424 				current.y = MulHigh(current.y, texture.y) << 4;
425 				current.z = MulHigh(current.z, texture.z) << 4;
426 			}
427 
428 			if(state.textureStage[stage - 1].stageOperationAlpha == TextureStage::STAGE_PREMODULATE)
429 			{
430 				current.w = MulHigh(current.w, texture.w) << 4;
431 			}
432 		}
433 
434 		if(luminance)
435 		{
436 			texture.x = MulHigh(texture.x, L) << 4;
437 			texture.y = MulHigh(texture.y, L) << 4;
438 			texture.z = MulHigh(texture.z, L) << 4;
439 
440 			luminance = false;
441 		}
442 
443 		switch(textureStage.firstArgument)
444 		{
445 		case TextureStage::SOURCE_TEXTURE:	arg1 = &texture;    break;
446 		case TextureStage::SOURCE_CONSTANT:	arg1 = &constant;   break;
447 		case TextureStage::SOURCE_CURRENT:	arg1 = &current;  break;
448 		case TextureStage::SOURCE_DIFFUSE:	arg1 = &diffuse;  break;
449 		case TextureStage::SOURCE_SPECULAR:	arg1 = &specular; break;
450 		case TextureStage::SOURCE_TEMP:		arg1 = &temp;       break;
451 		case TextureStage::SOURCE_TFACTOR:	arg1 = &tfactor;    break;
452 		default:
453 			ASSERT(false);
454 		}
455 
456 		switch(textureStage.secondArgument)
457 		{
458 		case TextureStage::SOURCE_TEXTURE:	arg2 = &texture;    break;
459 		case TextureStage::SOURCE_CONSTANT:	arg2 = &constant;   break;
460 		case TextureStage::SOURCE_CURRENT:	arg2 = &current;  break;
461 		case TextureStage::SOURCE_DIFFUSE:	arg2 = &diffuse;  break;
462 		case TextureStage::SOURCE_SPECULAR:	arg2 = &specular; break;
463 		case TextureStage::SOURCE_TEMP:		arg2 = &temp;       break;
464 		case TextureStage::SOURCE_TFACTOR:	arg2 = &tfactor;    break;
465 		default:
466 			ASSERT(false);
467 		}
468 
469 		switch(textureStage.thirdArgument)
470 		{
471 		case TextureStage::SOURCE_TEXTURE:	arg3 = &texture;    break;
472 		case TextureStage::SOURCE_CONSTANT:	arg3 = &constant;   break;
473 		case TextureStage::SOURCE_CURRENT:	arg3 = &current;  break;
474 		case TextureStage::SOURCE_DIFFUSE:	arg3 = &diffuse;  break;
475 		case TextureStage::SOURCE_SPECULAR:	arg3 = &specular; break;
476 		case TextureStage::SOURCE_TEMP:		arg3 = &temp;       break;
477 		case TextureStage::SOURCE_TFACTOR:	arg3 = &tfactor;    break;
478 		default:
479 			ASSERT(false);
480 		}
481 
482 		Vector4s mod1;
483 		Vector4s mod2;
484 		Vector4s mod3;
485 
486 		switch(textureStage.firstModifier)
487 		{
488 		case TextureStage::MODIFIER_COLOR:
489 			break;
490 		case TextureStage::MODIFIER_INVCOLOR:
491 			mod1.x = SubSat(Short4(0x1000), arg1->x);
492 			mod1.y = SubSat(Short4(0x1000), arg1->y);
493 			mod1.z = SubSat(Short4(0x1000), arg1->z);
494 			mod1.w = SubSat(Short4(0x1000), arg1->w);
495 
496 			arg1 = &mod1;
497 			break;
498 		case TextureStage::MODIFIER_ALPHA:
499 			mod1.x = arg1->w;
500 			mod1.y = arg1->w;
501 			mod1.z = arg1->w;
502 			mod1.w = arg1->w;
503 
504 			arg1 = &mod1;
505 			break;
506 		case TextureStage::MODIFIER_INVALPHA:
507 			mod1.x = SubSat(Short4(0x1000), arg1->w);
508 			mod1.y = SubSat(Short4(0x1000), arg1->w);
509 			mod1.z = SubSat(Short4(0x1000), arg1->w);
510 			mod1.w = SubSat(Short4(0x1000), arg1->w);
511 
512 			arg1 = &mod1;
513 			break;
514 		default:
515 			ASSERT(false);
516 		}
517 
518 		switch(textureStage.secondModifier)
519 		{
520 		case TextureStage::MODIFIER_COLOR:
521 			break;
522 		case TextureStage::MODIFIER_INVCOLOR:
523 			mod2.x = SubSat(Short4(0x1000), arg2->x);
524 			mod2.y = SubSat(Short4(0x1000), arg2->y);
525 			mod2.z = SubSat(Short4(0x1000), arg2->z);
526 			mod2.w = SubSat(Short4(0x1000), arg2->w);
527 
528 			arg2 = &mod2;
529 			break;
530 		case TextureStage::MODIFIER_ALPHA:
531 			mod2.x = arg2->w;
532 			mod2.y = arg2->w;
533 			mod2.z = arg2->w;
534 			mod2.w = arg2->w;
535 
536 			arg2 = &mod2;
537 			break;
538 		case TextureStage::MODIFIER_INVALPHA:
539 			mod2.x = SubSat(Short4(0x1000), arg2->w);
540 			mod2.y = SubSat(Short4(0x1000), arg2->w);
541 			mod2.z = SubSat(Short4(0x1000), arg2->w);
542 			mod2.w = SubSat(Short4(0x1000), arg2->w);
543 
544 			arg2 = &mod2;
545 			break;
546 		default:
547 			ASSERT(false);
548 		}
549 
550 		switch(textureStage.thirdModifier)
551 		{
552 		case TextureStage::MODIFIER_COLOR:
553 			break;
554 		case TextureStage::MODIFIER_INVCOLOR:
555 			mod3.x = SubSat(Short4(0x1000), arg3->x);
556 			mod3.y = SubSat(Short4(0x1000), arg3->y);
557 			mod3.z = SubSat(Short4(0x1000), arg3->z);
558 			mod3.w = SubSat(Short4(0x1000), arg3->w);
559 
560 			arg3 = &mod3;
561 			break;
562 		case TextureStage::MODIFIER_ALPHA:
563 			mod3.x = arg3->w;
564 			mod3.y = arg3->w;
565 			mod3.z = arg3->w;
566 			mod3.w = arg3->w;
567 
568 			arg3 = &mod3;
569 			break;
570 		case TextureStage::MODIFIER_INVALPHA:
571 			mod3.x = SubSat(Short4(0x1000), arg3->w);
572 			mod3.y = SubSat(Short4(0x1000), arg3->w);
573 			mod3.z = SubSat(Short4(0x1000), arg3->w);
574 			mod3.w = SubSat(Short4(0x1000), arg3->w);
575 
576 			arg3 = &mod3;
577 			break;
578 		default:
579 			ASSERT(false);
580 		}
581 
582 		switch(textureStage.stageOperation)
583 		{
584 		case TextureStage::STAGE_DISABLE:
585 			break;
586 		case TextureStage::STAGE_SELECTARG1: // Arg1
587 			res.x = arg1->x;
588 			res.y = arg1->y;
589 			res.z = arg1->z;
590 			break;
591 		case TextureStage::STAGE_SELECTARG2: // Arg2
592 			res.x = arg2->x;
593 			res.y = arg2->y;
594 			res.z = arg2->z;
595 			break;
596 		case TextureStage::STAGE_SELECTARG3: // Arg3
597 			res.x = arg3->x;
598 			res.y = arg3->y;
599 			res.z = arg3->z;
600 			break;
601 		case TextureStage::STAGE_MODULATE: // Arg1 * Arg2
602 			res.x = MulHigh(arg1->x, arg2->x) << 4;
603 			res.y = MulHigh(arg1->y, arg2->y) << 4;
604 			res.z = MulHigh(arg1->z, arg2->z) << 4;
605 			break;
606 		case TextureStage::STAGE_MODULATE2X: // Arg1 * Arg2 * 2
607 			res.x = MulHigh(arg1->x, arg2->x) << 5;
608 			res.y = MulHigh(arg1->y, arg2->y) << 5;
609 			res.z = MulHigh(arg1->z, arg2->z) << 5;
610 			break;
611 		case TextureStage::STAGE_MODULATE4X: // Arg1 * Arg2 * 4
612 			res.x = MulHigh(arg1->x, arg2->x) << 6;
613 			res.y = MulHigh(arg1->y, arg2->y) << 6;
614 			res.z = MulHigh(arg1->z, arg2->z) << 6;
615 			break;
616 		case TextureStage::STAGE_ADD: // Arg1 + Arg2
617 			res.x = AddSat(arg1->x, arg2->x);
618 			res.y = AddSat(arg1->y, arg2->y);
619 			res.z = AddSat(arg1->z, arg2->z);
620 			break;
621 		case TextureStage::STAGE_ADDSIGNED: // Arg1 + Arg2 - 0.5
622 			res.x = AddSat(arg1->x, arg2->x);
623 			res.y = AddSat(arg1->y, arg2->y);
624 			res.z = AddSat(arg1->z, arg2->z);
625 
626 			res.x = SubSat(res.x, Short4(0x0800));
627 			res.y = SubSat(res.y, Short4(0x0800));
628 			res.z = SubSat(res.z, Short4(0x0800));
629 			break;
630 		case TextureStage::STAGE_ADDSIGNED2X: // (Arg1 + Arg2 - 0.5) << 1
631 			res.x = AddSat(arg1->x, arg2->x);
632 			res.y = AddSat(arg1->y, arg2->y);
633 			res.z = AddSat(arg1->z, arg2->z);
634 
635 			res.x = SubSat(res.x, Short4(0x0800));
636 			res.y = SubSat(res.y, Short4(0x0800));
637 			res.z = SubSat(res.z, Short4(0x0800));
638 
639 			res.x = AddSat(res.x, res.x);
640 			res.y = AddSat(res.y, res.y);
641 			res.z = AddSat(res.z, res.z);
642 			break;
643 		case TextureStage::STAGE_SUBTRACT: // Arg1 - Arg2
644 			res.x = SubSat(arg1->x, arg2->x);
645 			res.y = SubSat(arg1->y, arg2->y);
646 			res.z = SubSat(arg1->z, arg2->z);
647 			break;
648 		case TextureStage::STAGE_ADDSMOOTH: // Arg1 + Arg2 - Arg1 * Arg2
649 			{
650 				Short4 tmp;
651 
652 				tmp = MulHigh(arg1->x, arg2->x) << 4; res.x = AddSat(arg1->x, arg2->x); res.x = SubSat(res.x, tmp);
653 				tmp = MulHigh(arg1->y, arg2->y) << 4; res.y = AddSat(arg1->y, arg2->y); res.y = SubSat(res.y, tmp);
654 				tmp = MulHigh(arg1->z, arg2->z) << 4; res.z = AddSat(arg1->z, arg2->z); res.z = SubSat(res.z, tmp);
655 			}
656 			break;
657 		case TextureStage::STAGE_MULTIPLYADD: // Arg3 + Arg1 * Arg2
658 			res.x = MulHigh(arg1->x, arg2->x) << 4; res.x = AddSat(res.x, arg3->x);
659 			res.y = MulHigh(arg1->y, arg2->y) << 4; res.y = AddSat(res.y, arg3->y);
660 			res.z = MulHigh(arg1->z, arg2->z) << 4; res.z = AddSat(res.z, arg3->z);
661 			break;
662 		case TextureStage::STAGE_LERP: // Arg3 * (Arg1 - Arg2) + Arg2
663 			res.x = SubSat(arg1->x, arg2->x); res.x = MulHigh(res.x, arg3->x) << 4; res.x = AddSat(res.x, arg2->x);
664 			res.y = SubSat(arg1->y, arg2->y); res.y = MulHigh(res.y, arg3->y) << 4; res.y = AddSat(res.y, arg2->y);
665 			res.z = SubSat(arg1->z, arg2->z); res.z = MulHigh(res.z, arg3->z) << 4; res.z = AddSat(res.z, arg2->z);
666 			break;
667 		case TextureStage::STAGE_DOT3: // 2 * (Arg1.x - 0.5) * 2 * (Arg2.x - 0.5) + 2 * (Arg1.y - 0.5) * 2 * (Arg2.y - 0.5) + 2 * (Arg1.z - 0.5) * 2 * (Arg2.z - 0.5)
668 			{
669 				Short4 tmp;
670 
671 				res.x = SubSat(arg1->x, Short4(0x0800)); tmp = SubSat(arg2->x, Short4(0x0800)); res.x = MulHigh(res.x, tmp);
672 				res.y = SubSat(arg1->y, Short4(0x0800)); tmp = SubSat(arg2->y, Short4(0x0800)); res.y = MulHigh(res.y, tmp);
673 				res.z = SubSat(arg1->z, Short4(0x0800)); tmp = SubSat(arg2->z, Short4(0x0800)); res.z = MulHigh(res.z, tmp);
674 
675 				res.x = res.x << 6;
676 				res.y = res.y << 6;
677 				res.z = res.z << 6;
678 
679 				res.x = AddSat(res.x, res.y);
680 				res.x = AddSat(res.x, res.z);
681 
682 				// Clamp to [0, 1]
683 				res.x = Max(res.x, Short4(0x0000));
684 				res.x = Min(res.x, Short4(0x1000));
685 
686 				res.y = res.x;
687 				res.z = res.x;
688 				res.w = res.x;
689 			}
690 			break;
691 		case TextureStage::STAGE_BLENDCURRENTALPHA: // Alpha * (Arg1 - Arg2) + Arg2
692 			res.x = SubSat(arg1->x, arg2->x); res.x = MulHigh(res.x, current.w) << 4; res.x = AddSat(res.x, arg2->x);
693 			res.y = SubSat(arg1->y, arg2->y); res.y = MulHigh(res.y, current.w) << 4; res.y = AddSat(res.y, arg2->y);
694 			res.z = SubSat(arg1->z, arg2->z); res.z = MulHigh(res.z, current.w) << 4; res.z = AddSat(res.z, arg2->z);
695 			break;
696 		case TextureStage::STAGE_BLENDDIFFUSEALPHA: // Alpha * (Arg1 - Arg2) + Arg2
697 			res.x = SubSat(arg1->x, arg2->x); res.x = MulHigh(res.x, diffuse.w) << 4; res.x = AddSat(res.x, arg2->x);
698 			res.y = SubSat(arg1->y, arg2->y); res.y = MulHigh(res.y, diffuse.w) << 4; res.y = AddSat(res.y, arg2->y);
699 			res.z = SubSat(arg1->z, arg2->z); res.z = MulHigh(res.z, diffuse.w) << 4; res.z = AddSat(res.z, arg2->z);
700 			break;
701 		case TextureStage::STAGE_BLENDFACTORALPHA: // Alpha * (Arg1 - Arg2) + Arg2
702 			res.x = SubSat(arg1->x, arg2->x); res.x = MulHigh(res.x, *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[3]))) << 4; res.x = AddSat(res.x, arg2->x);
703 			res.y = SubSat(arg1->y, arg2->y); res.y = MulHigh(res.y, *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[3]))) << 4; res.y = AddSat(res.y, arg2->y);
704 			res.z = SubSat(arg1->z, arg2->z); res.z = MulHigh(res.z, *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[3]))) << 4; res.z = AddSat(res.z, arg2->z);
705 			break;
706 		case TextureStage::STAGE_BLENDTEXTUREALPHA: // Alpha * (Arg1 - Arg2) + Arg2
707 			res.x = SubSat(arg1->x, arg2->x); res.x = MulHigh(res.x, texture.w) << 4; res.x = AddSat(res.x, arg2->x);
708 			res.y = SubSat(arg1->y, arg2->y); res.y = MulHigh(res.y, texture.w) << 4; res.y = AddSat(res.y, arg2->y);
709 			res.z = SubSat(arg1->z, arg2->z); res.z = MulHigh(res.z, texture.w) << 4; res.z = AddSat(res.z, arg2->z);
710 			break;
711 		case TextureStage::STAGE_BLENDTEXTUREALPHAPM: // Arg1 + Arg2 * (1 - Alpha)
712 			res.x = SubSat(Short4(0x1000), texture.w); res.x = MulHigh(res.x, arg2->x) << 4; res.x = AddSat(res.x, arg1->x);
713 			res.y = SubSat(Short4(0x1000), texture.w); res.y = MulHigh(res.y, arg2->y) << 4; res.y = AddSat(res.y, arg1->y);
714 			res.z = SubSat(Short4(0x1000), texture.w); res.z = MulHigh(res.z, arg2->z) << 4; res.z = AddSat(res.z, arg1->z);
715 			break;
716 		case TextureStage::STAGE_PREMODULATE:
717 			res.x = arg1->x;
718 			res.y = arg1->y;
719 			res.z = arg1->z;
720 			break;
721 		case TextureStage::STAGE_MODULATEALPHA_ADDCOLOR: // Arg1 + Arg1.w * Arg2
722 			res.x = MulHigh(arg1->w, arg2->x) << 4; res.x = AddSat(res.x, arg1->x);
723 			res.y = MulHigh(arg1->w, arg2->y) << 4; res.y = AddSat(res.y, arg1->y);
724 			res.z = MulHigh(arg1->w, arg2->z) << 4; res.z = AddSat(res.z, arg1->z);
725 			break;
726 		case TextureStage::STAGE_MODULATECOLOR_ADDALPHA: // Arg1 * Arg2 + Arg1.w
727 			res.x = MulHigh(arg1->x, arg2->x) << 4; res.x = AddSat(res.x, arg1->w);
728 			res.y = MulHigh(arg1->y, arg2->y) << 4; res.y = AddSat(res.y, arg1->w);
729 			res.z = MulHigh(arg1->z, arg2->z) << 4; res.z = AddSat(res.z, arg1->w);
730 			break;
731 		case TextureStage::STAGE_MODULATEINVALPHA_ADDCOLOR: // (1 - Arg1.w) * Arg2 + Arg1
732 			{
733 				Short4 tmp;
734 
735 				res.x = AddSat(arg1->x, arg2->x); tmp = MulHigh(arg1->w, arg2->x) << 4; res.x = SubSat(res.x, tmp);
736 				res.y = AddSat(arg1->y, arg2->y); tmp = MulHigh(arg1->w, arg2->y) << 4; res.y = SubSat(res.y, tmp);
737 				res.z = AddSat(arg1->z, arg2->z); tmp = MulHigh(arg1->w, arg2->z) << 4; res.z = SubSat(res.z, tmp);
738 			}
739 			break;
740 		case TextureStage::STAGE_MODULATEINVCOLOR_ADDALPHA: // (1 - Arg1) * Arg2 + Arg1.w
741 			{
742 				Short4 tmp;
743 
744 				res.x = AddSat(arg1->w, arg2->x); tmp = MulHigh(arg1->x, arg2->x) << 4; res.x = SubSat(res.x, tmp);
745 				res.y = AddSat(arg1->w, arg2->y); tmp = MulHigh(arg1->y, arg2->y) << 4; res.y = SubSat(res.y, tmp);
746 				res.z = AddSat(arg1->w, arg2->z); tmp = MulHigh(arg1->z, arg2->z) << 4; res.z = SubSat(res.z, tmp);
747 			}
748 			break;
749 		case TextureStage::STAGE_BUMPENVMAP:
750 			{
751 				du = Float4(texture.x) * Float4(1.0f / 0x0FE0);
752 				dv = Float4(texture.y) * Float4(1.0f / 0x0FE0);
753 
754 				Float4 du2;
755 				Float4 dv2;
756 
757 				du2 = du;
758 				dv2 = dv;
759 				du *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][0]));
760 				dv2 *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][0]));
761 				du += dv2;
762 				dv *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][1]));
763 				du2 *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][1]));
764 				dv += du2;
765 
766 				perturbate = true;
767 
768 				res.x = current.x;
769 				res.y = current.y;
770 				res.z = current.z;
771 				res.w = current.w;
772 			}
773 			break;
774 		case TextureStage::STAGE_BUMPENVMAPLUMINANCE:
775 			{
776 				du = Float4(texture.x) * Float4(1.0f / 0x0FE0);
777 				dv = Float4(texture.y) * Float4(1.0f / 0x0FE0);
778 
779 				Float4 du2;
780 				Float4 dv2;
781 
782 				du2 = du;
783 				dv2 = dv;
784 
785 				du *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][0]));
786 				dv2 *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][0]));
787 				du += dv2;
788 				dv *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][1]));
789 				du2 *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][1]));
790 				dv += du2;
791 
792 				perturbate = true;
793 
794 				L = texture.z;
795 				L = MulHigh(L, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].luminanceScale4)));
796 				L = L << 4;
797 				L = AddSat(L, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].luminanceOffset4)));
798 				L = Max(L, Short4(0x0000));
799 				L = Min(L, Short4(0x1000));
800 
801 				luminance = true;
802 
803 				res.x = current.x;
804 				res.y = current.y;
805 				res.z = current.z;
806 				res.w = current.w;
807 			}
808 			break;
809 		default:
810 			ASSERT(false);
811 		}
812 
813 		if(textureStage.stageOperation != TextureStage::STAGE_DOT3)
814 		{
815 			switch(textureStage.firstArgumentAlpha)
816 			{
817 			case TextureStage::SOURCE_TEXTURE:	arg1 = &texture;		break;
818 			case TextureStage::SOURCE_CONSTANT:	arg1 = &constant;		break;
819 			case TextureStage::SOURCE_CURRENT:	arg1 = &current;		break;
820 			case TextureStage::SOURCE_DIFFUSE:	arg1 = &diffuse;		break;
821 			case TextureStage::SOURCE_SPECULAR:	arg1 = &specular;		break;
822 			case TextureStage::SOURCE_TEMP:		arg1 = &temp;			break;
823 			case TextureStage::SOURCE_TFACTOR:	arg1 = &tfactor;		break;
824 			default:
825 				ASSERT(false);
826 			}
827 
828 			switch(textureStage.secondArgumentAlpha)
829 			{
830 			case TextureStage::SOURCE_TEXTURE:	arg2 = &texture;		break;
831 			case TextureStage::SOURCE_CONSTANT:	arg2 = &constant;		break;
832 			case TextureStage::SOURCE_CURRENT:	arg2 = &current;		break;
833 			case TextureStage::SOURCE_DIFFUSE:	arg2 = &diffuse;		break;
834 			case TextureStage::SOURCE_SPECULAR:	arg2 = &specular;		break;
835 			case TextureStage::SOURCE_TEMP:		arg2 = &temp;			break;
836 			case TextureStage::SOURCE_TFACTOR:	arg2 = &tfactor;		break;
837 			default:
838 				ASSERT(false);
839 			}
840 
841 			switch(textureStage.thirdArgumentAlpha)
842 			{
843 			case TextureStage::SOURCE_TEXTURE:	arg3 = &texture;		break;
844 			case TextureStage::SOURCE_CONSTANT:	arg3 = &constant;		break;
845 			case TextureStage::SOURCE_CURRENT:	arg3 = &current;		break;
846 			case TextureStage::SOURCE_DIFFUSE:	arg3 = &diffuse;		break;
847 			case TextureStage::SOURCE_SPECULAR:	arg3 = &specular;		break;
848 			case TextureStage::SOURCE_TEMP:		arg3 = &temp;			break;
849 			case TextureStage::SOURCE_TFACTOR:	arg3 = &tfactor;		break;
850 			default:
851 				ASSERT(false);
852 			}
853 
854 			switch(textureStage.firstModifierAlpha)   // FIXME: Check if actually used
855 			{
856 			case TextureStage::MODIFIER_COLOR:
857 				break;
858 			case TextureStage::MODIFIER_INVCOLOR:
859 				mod1.w = SubSat(Short4(0x1000), arg1->w);
860 
861 				arg1 = &mod1;
862 				break;
863 			case TextureStage::MODIFIER_ALPHA:
864 				// Redudant
865 				break;
866 			case TextureStage::MODIFIER_INVALPHA:
867 				mod1.w = SubSat(Short4(0x1000), arg1->w);
868 
869 				arg1 = &mod1;
870 				break;
871 			default:
872 				ASSERT(false);
873 			}
874 
875 			switch(textureStage.secondModifierAlpha)   // FIXME: Check if actually used
876 			{
877 			case TextureStage::MODIFIER_COLOR:
878 				break;
879 			case TextureStage::MODIFIER_INVCOLOR:
880 				mod2.w = SubSat(Short4(0x1000), arg2->w);
881 
882 				arg2 = &mod2;
883 				break;
884 			case TextureStage::MODIFIER_ALPHA:
885 				// Redudant
886 				break;
887 			case TextureStage::MODIFIER_INVALPHA:
888 				mod2.w = SubSat(Short4(0x1000), arg2->w);
889 
890 				arg2 = &mod2;
891 				break;
892 			default:
893 				ASSERT(false);
894 			}
895 
896 			switch(textureStage.thirdModifierAlpha)   // FIXME: Check if actually used
897 			{
898 			case TextureStage::MODIFIER_COLOR:
899 				break;
900 			case TextureStage::MODIFIER_INVCOLOR:
901 				mod3.w = SubSat(Short4(0x1000), arg3->w);
902 
903 				arg3 = &mod3;
904 				break;
905 			case TextureStage::MODIFIER_ALPHA:
906 				// Redudant
907 				break;
908 			case TextureStage::MODIFIER_INVALPHA:
909 				mod3.w = SubSat(Short4(0x1000), arg3->w);
910 
911 				arg3 = &mod3;
912 				break;
913 			default:
914 				ASSERT(false);
915 			}
916 
917 			switch(textureStage.stageOperationAlpha)
918 			{
919 			case TextureStage::STAGE_DISABLE:
920 				break;
921 			case TextureStage::STAGE_SELECTARG1: // Arg1
922 				res.w = arg1->w;
923 				break;
924 			case TextureStage::STAGE_SELECTARG2: // Arg2
925 				res.w = arg2->w;
926 				break;
927 			case TextureStage::STAGE_SELECTARG3: // Arg3
928 				res.w = arg3->w;
929 				break;
930 			case TextureStage::STAGE_MODULATE: // Arg1 * Arg2
931 				res.w = MulHigh(arg1->w, arg2->w) << 4;
932 				break;
933 			case TextureStage::STAGE_MODULATE2X: // Arg1 * Arg2 * 2
934 				res.w = MulHigh(arg1->w, arg2->w) << 5;
935 				break;
936 			case TextureStage::STAGE_MODULATE4X: // Arg1 * Arg2 * 4
937 				res.w = MulHigh(arg1->w, arg2->w) << 6;
938 				break;
939 			case TextureStage::STAGE_ADD: // Arg1 + Arg2
940 				res.w = AddSat(arg1->w, arg2->w);
941 				break;
942 			case TextureStage::STAGE_ADDSIGNED: // Arg1 + Arg2 - 0.5
943 				res.w = AddSat(arg1->w, arg2->w);
944 				res.w = SubSat(res.w, Short4(0x0800));
945 				break;
946 			case TextureStage::STAGE_ADDSIGNED2X: // (Arg1 + Arg2 - 0.5) << 1
947 				res.w = AddSat(arg1->w, arg2->w);
948 				res.w = SubSat(res.w, Short4(0x0800));
949 				res.w = AddSat(res.w, res.w);
950 				break;
951 			case TextureStage::STAGE_SUBTRACT: // Arg1 - Arg2
952 				res.w = SubSat(arg1->w, arg2->w);
953 				break;
954 			case TextureStage::STAGE_ADDSMOOTH: // Arg1 + Arg2 - Arg1 * Arg2
955 				{
956 					Short4 tmp;
957 
958 					tmp = MulHigh(arg1->w, arg2->w) << 4; res.w = AddSat(arg1->w, arg2->w); res.w = SubSat(res.w, tmp);
959 				}
960 				break;
961 			case TextureStage::STAGE_MULTIPLYADD: // Arg3 + Arg1 * Arg2
962 				res.w = MulHigh(arg1->w, arg2->w) << 4; res.w = AddSat(res.w, arg3->w);
963 				break;
964 			case TextureStage::STAGE_LERP: // Arg3 * (Arg1 - Arg2) + Arg2
965 				res.w = SubSat(arg1->w, arg2->w); res.w = MulHigh(res.w, arg3->w) << 4; res.w = AddSat(res.w, arg2->w);
966 				break;
967 			case TextureStage::STAGE_DOT3:
968 				break;   // Already computed in color channel
969 			case TextureStage::STAGE_BLENDCURRENTALPHA: // Alpha * (Arg1 - Arg2) + Arg2
970 				res.w = SubSat(arg1->w, arg2->w); res.w = MulHigh(res.w, current.w) << 4; res.w = AddSat(res.w, arg2->w);
971 				break;
972 			case TextureStage::STAGE_BLENDDIFFUSEALPHA: // Arg1 * (Alpha) + Arg2 * (1 - Alpha)
973 				res.w = SubSat(arg1->w, arg2->w); res.w = MulHigh(res.w, diffuse.w) << 4; res.w = AddSat(res.w, arg2->w);
974 				break;
975 			case TextureStage::STAGE_BLENDFACTORALPHA:
976 				res.w = SubSat(arg1->w, arg2->w); res.w = MulHigh(res.w, *Pointer<Short4>(data + OFFSET(DrawData, factor.textureFactor4[3]))) << 4; res.w = AddSat(res.w, arg2->w);
977 				break;
978 			case TextureStage::STAGE_BLENDTEXTUREALPHA: // Arg1 * (Alpha) + Arg2 * (1 - Alpha)
979 				res.w = SubSat(arg1->w, arg2->w); res.w = MulHigh(res.w, texture.w) << 4; res.w = AddSat(res.w, arg2->w);
980 				break;
981 			case TextureStage::STAGE_BLENDTEXTUREALPHAPM: // Arg1 + Arg2 * (1 - Alpha)
982 				res.w = SubSat(Short4(0x1000), texture.w); res.w = MulHigh(res.w, arg2->w) << 4; res.w = AddSat(res.w, arg1->w);
983 				break;
984 			case TextureStage::STAGE_PREMODULATE:
985 				res.w = arg1->w;
986 				break;
987 			case TextureStage::STAGE_MODULATEALPHA_ADDCOLOR:
988 			case TextureStage::STAGE_MODULATECOLOR_ADDALPHA:
989 			case TextureStage::STAGE_MODULATEINVALPHA_ADDCOLOR:
990 			case TextureStage::STAGE_MODULATEINVCOLOR_ADDALPHA:
991 			case TextureStage::STAGE_BUMPENVMAP:
992 			case TextureStage::STAGE_BUMPENVMAPLUMINANCE:
993 				break;   // Invalid alpha operations
994 			default:
995 				ASSERT(false);
996 			}
997 		}
998 
999 		// Clamp result to [0, 1]
1000 
1001 		switch(textureStage.stageOperation)
1002 		{
1003 		case TextureStage::STAGE_DISABLE:
1004 		case TextureStage::STAGE_SELECTARG1:
1005 		case TextureStage::STAGE_SELECTARG2:
1006 		case TextureStage::STAGE_SELECTARG3:
1007 		case TextureStage::STAGE_MODULATE:
1008 		case TextureStage::STAGE_MODULATE2X:
1009 		case TextureStage::STAGE_MODULATE4X:
1010 		case TextureStage::STAGE_ADD:
1011 		case TextureStage::STAGE_MULTIPLYADD:
1012 		case TextureStage::STAGE_LERP:
1013 		case TextureStage::STAGE_BLENDCURRENTALPHA:
1014 		case TextureStage::STAGE_BLENDDIFFUSEALPHA:
1015 		case TextureStage::STAGE_BLENDFACTORALPHA:
1016 		case TextureStage::STAGE_BLENDTEXTUREALPHA:
1017 		case TextureStage::STAGE_BLENDTEXTUREALPHAPM:
1018 		case TextureStage::STAGE_DOT3:   // Already clamped
1019 		case TextureStage::STAGE_PREMODULATE:
1020 		case TextureStage::STAGE_MODULATEALPHA_ADDCOLOR:
1021 		case TextureStage::STAGE_MODULATECOLOR_ADDALPHA:
1022 		case TextureStage::STAGE_MODULATEINVALPHA_ADDCOLOR:
1023 		case TextureStage::STAGE_MODULATEINVCOLOR_ADDALPHA:
1024 		case TextureStage::STAGE_BUMPENVMAP:
1025 		case TextureStage::STAGE_BUMPENVMAPLUMINANCE:
1026 			if(state.textureStage[stage].cantUnderflow)
1027 			{
1028 				break;   // Can't go below zero
1029 			}
1030 		case TextureStage::STAGE_ADDSIGNED:
1031 		case TextureStage::STAGE_ADDSIGNED2X:
1032 		case TextureStage::STAGE_SUBTRACT:
1033 		case TextureStage::STAGE_ADDSMOOTH:
1034 			res.x = Max(res.x, Short4(0x0000));
1035 			res.y = Max(res.y, Short4(0x0000));
1036 			res.z = Max(res.z, Short4(0x0000));
1037 			break;
1038 		default:
1039 			ASSERT(false);
1040 		}
1041 
1042 		switch(textureStage.stageOperationAlpha)
1043 		{
1044 		case TextureStage::STAGE_DISABLE:
1045 		case TextureStage::STAGE_SELECTARG1:
1046 		case TextureStage::STAGE_SELECTARG2:
1047 		case TextureStage::STAGE_SELECTARG3:
1048 		case TextureStage::STAGE_MODULATE:
1049 		case TextureStage::STAGE_MODULATE2X:
1050 		case TextureStage::STAGE_MODULATE4X:
1051 		case TextureStage::STAGE_ADD:
1052 		case TextureStage::STAGE_MULTIPLYADD:
1053 		case TextureStage::STAGE_LERP:
1054 		case TextureStage::STAGE_BLENDCURRENTALPHA:
1055 		case TextureStage::STAGE_BLENDDIFFUSEALPHA:
1056 		case TextureStage::STAGE_BLENDFACTORALPHA:
1057 		case TextureStage::STAGE_BLENDTEXTUREALPHA:
1058 		case TextureStage::STAGE_BLENDTEXTUREALPHAPM:
1059 		case TextureStage::STAGE_DOT3:   // Already clamped
1060 		case TextureStage::STAGE_PREMODULATE:
1061 		case TextureStage::STAGE_MODULATEALPHA_ADDCOLOR:
1062 		case TextureStage::STAGE_MODULATECOLOR_ADDALPHA:
1063 		case TextureStage::STAGE_MODULATEINVALPHA_ADDCOLOR:
1064 		case TextureStage::STAGE_MODULATEINVCOLOR_ADDALPHA:
1065 		case TextureStage::STAGE_BUMPENVMAP:
1066 		case TextureStage::STAGE_BUMPENVMAPLUMINANCE:
1067 			if(state.textureStage[stage].cantUnderflow)
1068 			{
1069 				break;   // Can't go below zero
1070 			}
1071 		case TextureStage::STAGE_ADDSIGNED:
1072 		case TextureStage::STAGE_ADDSIGNED2X:
1073 		case TextureStage::STAGE_SUBTRACT:
1074 		case TextureStage::STAGE_ADDSMOOTH:
1075 			res.w = Max(res.w, Short4(0x0000));
1076 			break;
1077 		default:
1078 			ASSERT(false);
1079 		}
1080 
1081 		switch(textureStage.stageOperation)
1082 		{
1083 		case TextureStage::STAGE_DISABLE:
1084 		case TextureStage::STAGE_SELECTARG1:
1085 		case TextureStage::STAGE_SELECTARG2:
1086 		case TextureStage::STAGE_SELECTARG3:
1087 		case TextureStage::STAGE_MODULATE:
1088 		case TextureStage::STAGE_SUBTRACT:
1089 		case TextureStage::STAGE_ADDSMOOTH:
1090 		case TextureStage::STAGE_LERP:
1091 		case TextureStage::STAGE_BLENDCURRENTALPHA:
1092 		case TextureStage::STAGE_BLENDDIFFUSEALPHA:
1093 		case TextureStage::STAGE_BLENDFACTORALPHA:
1094 		case TextureStage::STAGE_BLENDTEXTUREALPHA:
1095 		case TextureStage::STAGE_DOT3:   // Already clamped
1096 		case TextureStage::STAGE_PREMODULATE:
1097 		case TextureStage::STAGE_MODULATEINVALPHA_ADDCOLOR:
1098 		case TextureStage::STAGE_MODULATEINVCOLOR_ADDALPHA:
1099 		case TextureStage::STAGE_BUMPENVMAP:
1100 		case TextureStage::STAGE_BUMPENVMAPLUMINANCE:
1101 			break;   // Can't go above one
1102 		case TextureStage::STAGE_MODULATE2X:
1103 		case TextureStage::STAGE_MODULATE4X:
1104 		case TextureStage::STAGE_ADD:
1105 		case TextureStage::STAGE_ADDSIGNED:
1106 		case TextureStage::STAGE_ADDSIGNED2X:
1107 		case TextureStage::STAGE_MULTIPLYADD:
1108 		case TextureStage::STAGE_BLENDTEXTUREALPHAPM:
1109 		case TextureStage::STAGE_MODULATEALPHA_ADDCOLOR:
1110 		case TextureStage::STAGE_MODULATECOLOR_ADDALPHA:
1111 			res.x = Min(res.x, Short4(0x1000));
1112 			res.y = Min(res.y, Short4(0x1000));
1113 			res.z = Min(res.z, Short4(0x1000));
1114 			break;
1115 		default:
1116 			ASSERT(false);
1117 		}
1118 
1119 		switch(textureStage.stageOperationAlpha)
1120 		{
1121 		case TextureStage::STAGE_DISABLE:
1122 		case TextureStage::STAGE_SELECTARG1:
1123 		case TextureStage::STAGE_SELECTARG2:
1124 		case TextureStage::STAGE_SELECTARG3:
1125 		case TextureStage::STAGE_MODULATE:
1126 		case TextureStage::STAGE_SUBTRACT:
1127 		case TextureStage::STAGE_ADDSMOOTH:
1128 		case TextureStage::STAGE_LERP:
1129 		case TextureStage::STAGE_BLENDCURRENTALPHA:
1130 		case TextureStage::STAGE_BLENDDIFFUSEALPHA:
1131 		case TextureStage::STAGE_BLENDFACTORALPHA:
1132 		case TextureStage::STAGE_BLENDTEXTUREALPHA:
1133 		case TextureStage::STAGE_DOT3:   // Already clamped
1134 		case TextureStage::STAGE_PREMODULATE:
1135 		case TextureStage::STAGE_MODULATEINVALPHA_ADDCOLOR:
1136 		case TextureStage::STAGE_MODULATEINVCOLOR_ADDALPHA:
1137 		case TextureStage::STAGE_BUMPENVMAP:
1138 		case TextureStage::STAGE_BUMPENVMAPLUMINANCE:
1139 			break;   // Can't go above one
1140 		case TextureStage::STAGE_MODULATE2X:
1141 		case TextureStage::STAGE_MODULATE4X:
1142 		case TextureStage::STAGE_ADD:
1143 		case TextureStage::STAGE_ADDSIGNED:
1144 		case TextureStage::STAGE_ADDSIGNED2X:
1145 		case TextureStage::STAGE_MULTIPLYADD:
1146 		case TextureStage::STAGE_BLENDTEXTUREALPHAPM:
1147 		case TextureStage::STAGE_MODULATEALPHA_ADDCOLOR:
1148 		case TextureStage::STAGE_MODULATECOLOR_ADDALPHA:
1149 			res.w = Min(res.w, Short4(0x1000));
1150 			break;
1151 		default:
1152 			ASSERT(false);
1153 		}
1154 
1155 		switch(textureStage.destinationArgument)
1156 		{
1157 		case TextureStage::DESTINATION_CURRENT:
1158 			current.x = res.x;
1159 			current.y = res.y;
1160 			current.z = res.z;
1161 			current.w = res.w;
1162 			break;
1163 		case TextureStage::DESTINATION_TEMP:
1164 			temp.x = res.x;
1165 			temp.y = res.y;
1166 			temp.z = res.z;
1167 			temp.w = res.w;
1168 			break;
1169 		default:
1170 			ASSERT(false);
1171 		}
1172 	}
1173 
fogBlend(Vector4s & current,Float4 & f)1174 	void PixelPipeline::fogBlend(Vector4s &current, Float4 &f)
1175 	{
1176 		if(!state.fogActive)
1177 		{
1178 			return;
1179 		}
1180 
1181 		if(state.pixelFogMode != FOG_NONE)
1182 		{
1183 			pixelFog(f);
1184 		}
1185 
1186 		UShort4 fog = convertFixed16(f, true);
1187 
1188 		current.x = As<Short4>(MulHigh(As<UShort4>(current.x), fog));
1189 		current.y = As<Short4>(MulHigh(As<UShort4>(current.y), fog));
1190 		current.z = As<Short4>(MulHigh(As<UShort4>(current.z), fog));
1191 
1192 		UShort4 invFog = UShort4(0xFFFFu) - fog;
1193 
1194 		current.x += As<Short4>(MulHigh(invFog, *Pointer<UShort4>(data + OFFSET(DrawData, fog.color4[0]))));
1195 		current.y += As<Short4>(MulHigh(invFog, *Pointer<UShort4>(data + OFFSET(DrawData, fog.color4[1]))));
1196 		current.z += As<Short4>(MulHigh(invFog, *Pointer<UShort4>(data + OFFSET(DrawData, fog.color4[2]))));
1197 	}
1198 
specularPixel(Vector4s & current,Vector4s & specular)1199 	void PixelPipeline::specularPixel(Vector4s &current, Vector4s &specular)
1200 	{
1201 		if(!state.specularAdd)
1202 		{
1203 			return;
1204 		}
1205 
1206 		current.x = AddSat(current.x, specular.x);
1207 		current.y = AddSat(current.y, specular.y);
1208 		current.z = AddSat(current.z, specular.z);
1209 	}
1210 
sampleTexture(int coordinates,int stage,bool project)1211 	Vector4s PixelPipeline::sampleTexture(int coordinates, int stage, bool project)
1212 	{
1213 		Float4 x = v[2 + coordinates].x;
1214 		Float4 y = v[2 + coordinates].y;
1215 		Float4 z = v[2 + coordinates].z;
1216 		Float4 w = v[2 + coordinates].w;
1217 
1218 		if(perturbate)
1219 		{
1220 			x += du;
1221 			y += dv;
1222 
1223 			perturbate = false;
1224 		}
1225 
1226 		return sampleTexture(stage, x, y, z, w, project);
1227 	}
1228 
sampleTexture(int stage,Float4 & u,Float4 & v,Float4 & w,Float4 & q,bool project)1229 	Vector4s PixelPipeline::sampleTexture(int stage, Float4 &u, Float4 &v, Float4 &w, Float4 &q, bool project)
1230 	{
1231 		Vector4s c;
1232 
1233 		#if PERF_PROFILE
1234 			Long texTime = Ticks();
1235 		#endif
1236 
1237 		Vector4f dsx;
1238 		Vector4f dsy;
1239 
1240 		Pointer<Byte> texture = data + OFFSET(DrawData, mipmap) + stage * sizeof(Texture);
1241 
1242 		if(!project)
1243 		{
1244 			c = SamplerCore(constants, state.sampler[stage]).sampleTexture(texture, u, v, w, q, q, dsx, dsy);
1245 		}
1246 		else
1247 		{
1248 			Float4 rq = reciprocal(q);
1249 
1250 			Float4 u_q = u * rq;
1251 			Float4 v_q = v * rq;
1252 			Float4 w_q = w * rq;
1253 
1254 			c = SamplerCore(constants, state.sampler[stage]).sampleTexture(texture, u_q, v_q, w_q, q, q, dsx, dsy);
1255 		}
1256 
1257 		#if PERF_PROFILE
1258 			cycles[PERF_TEX] += Ticks() - texTime;
1259 		#endif
1260 
1261 		return c;
1262 	}
1263 
convertFixed12(RValue<Float4> cf)1264 	Short4 PixelPipeline::convertFixed12(RValue<Float4> cf)
1265 	{
1266 		return RoundShort4(cf * Float4(0x1000));
1267 	}
1268 
convertFixed12(Vector4s & cs,Vector4f & cf)1269 	void PixelPipeline::convertFixed12(Vector4s &cs, Vector4f &cf)
1270 	{
1271 		cs.x = convertFixed12(cf.x);
1272 		cs.y = convertFixed12(cf.y);
1273 		cs.z = convertFixed12(cf.z);
1274 		cs.w = convertFixed12(cf.w);
1275 	}
1276 
convertSigned12(Short4 & cs)1277 	Float4 PixelPipeline::convertSigned12(Short4 &cs)
1278 	{
1279 		return Float4(cs) * Float4(1.0f / 0x0FFE);
1280 	}
1281 
convertSigned12(Vector4f & cf,Vector4s & cs)1282 	void PixelPipeline::convertSigned12(Vector4f &cf, Vector4s &cs)
1283 	{
1284 		cf.x = convertSigned12(cs.x);
1285 		cf.y = convertSigned12(cs.y);
1286 		cf.z = convertSigned12(cs.z);
1287 		cf.w = convertSigned12(cs.w);
1288 	}
1289 
writeDestination(Vector4s & d,const Dst & dst)1290 	void PixelPipeline::writeDestination(Vector4s &d, const Dst &dst)
1291 	{
1292 		switch(dst.type)
1293 		{
1294 		case Shader::PARAMETER_TEMP:
1295 			if(dst.mask & 0x1) rs[dst.index].x = d.x;
1296 			if(dst.mask & 0x2) rs[dst.index].y = d.y;
1297 			if(dst.mask & 0x4) rs[dst.index].z = d.z;
1298 			if(dst.mask & 0x8) rs[dst.index].w = d.w;
1299 			break;
1300 		case Shader::PARAMETER_INPUT:
1301 			if(dst.mask & 0x1) vs[dst.index].x = d.x;
1302 			if(dst.mask & 0x2) vs[dst.index].y = d.y;
1303 			if(dst.mask & 0x4) vs[dst.index].z = d.z;
1304 			if(dst.mask & 0x8) vs[dst.index].w = d.w;
1305 			break;
1306 		case Shader::PARAMETER_CONST: ASSERT(false); break;
1307 		case Shader::PARAMETER_TEXTURE:
1308 			if(dst.mask & 0x1) ts[dst.index].x = d.x;
1309 			if(dst.mask & 0x2) ts[dst.index].y = d.y;
1310 			if(dst.mask & 0x4) ts[dst.index].z = d.z;
1311 			if(dst.mask & 0x8) ts[dst.index].w = d.w;
1312 			break;
1313 		case Shader::PARAMETER_COLOROUT:
1314 			if(dst.mask & 0x1) vs[dst.index].x = d.x;
1315 			if(dst.mask & 0x2) vs[dst.index].y = d.y;
1316 			if(dst.mask & 0x4) vs[dst.index].z = d.z;
1317 			if(dst.mask & 0x8) vs[dst.index].w = d.w;
1318 			break;
1319 		default:
1320 			ASSERT(false);
1321 		}
1322 	}
1323 
fetchRegister(const Src & src)1324 	Vector4s PixelPipeline::fetchRegister(const Src &src)
1325 	{
1326 		Vector4s *reg;
1327 		int i = src.index;
1328 
1329 		Vector4s c;
1330 
1331 		if(src.type == Shader::PARAMETER_CONST)
1332 		{
1333 			c.x = *Pointer<Short4>(data + OFFSET(DrawData, ps.cW[i][0]));
1334 			c.y = *Pointer<Short4>(data + OFFSET(DrawData, ps.cW[i][1]));
1335 			c.z = *Pointer<Short4>(data + OFFSET(DrawData, ps.cW[i][2]));
1336 			c.w = *Pointer<Short4>(data + OFFSET(DrawData, ps.cW[i][3]));
1337 		}
1338 
1339 		switch(src.type)
1340 		{
1341 		case Shader::PARAMETER_TEMP:          reg = &rs[i]; break;
1342 		case Shader::PARAMETER_INPUT:         reg = &vs[i]; break;
1343 		case Shader::PARAMETER_CONST:         reg = &c;       break;
1344 		case Shader::PARAMETER_TEXTURE:       reg = &ts[i]; break;
1345 		case Shader::PARAMETER_VOID:          return rs[0]; // Dummy
1346 		case Shader::PARAMETER_FLOAT4LITERAL: return rs[0]; // Dummy
1347 		default: ASSERT(false); return rs[0];
1348 		}
1349 
1350 		const Short4 &x = (*reg)[(src.swizzle >> 0) & 0x3];
1351 		const Short4 &y = (*reg)[(src.swizzle >> 2) & 0x3];
1352 		const Short4 &z = (*reg)[(src.swizzle >> 4) & 0x3];
1353 		const Short4 &w = (*reg)[(src.swizzle >> 6) & 0x3];
1354 
1355 		Vector4s mod;
1356 
1357 		switch(src.modifier)
1358 		{
1359 		case Shader::MODIFIER_NONE:
1360 			mod.x = x;
1361 			mod.y = y;
1362 			mod.z = z;
1363 			mod.w = w;
1364 			break;
1365 		case Shader::MODIFIER_BIAS:
1366 			mod.x = SubSat(x, Short4(0x0800));
1367 			mod.y = SubSat(y, Short4(0x0800));
1368 			mod.z = SubSat(z, Short4(0x0800));
1369 			mod.w = SubSat(w, Short4(0x0800));
1370 			break;
1371 		case Shader::MODIFIER_BIAS_NEGATE:
1372 			mod.x = SubSat(Short4(0x0800), x);
1373 			mod.y = SubSat(Short4(0x0800), y);
1374 			mod.z = SubSat(Short4(0x0800), z);
1375 			mod.w = SubSat(Short4(0x0800), w);
1376 			break;
1377 		case Shader::MODIFIER_COMPLEMENT:
1378 			mod.x = SubSat(Short4(0x1000), x);
1379 			mod.y = SubSat(Short4(0x1000), y);
1380 			mod.z = SubSat(Short4(0x1000), z);
1381 			mod.w = SubSat(Short4(0x1000), w);
1382 			break;
1383 		case Shader::MODIFIER_NEGATE:
1384 			mod.x = -x;
1385 			mod.y = -y;
1386 			mod.z = -z;
1387 			mod.w = -w;
1388 			break;
1389 		case Shader::MODIFIER_X2:
1390 			mod.x = AddSat(x, x);
1391 			mod.y = AddSat(y, y);
1392 			mod.z = AddSat(z, z);
1393 			mod.w = AddSat(w, w);
1394 			break;
1395 		case Shader::MODIFIER_X2_NEGATE:
1396 			mod.x = -AddSat(x, x);
1397 			mod.y = -AddSat(y, y);
1398 			mod.z = -AddSat(z, z);
1399 			mod.w = -AddSat(w, w);
1400 			break;
1401 		case Shader::MODIFIER_SIGN:
1402 			mod.x = SubSat(x, Short4(0x0800));
1403 			mod.y = SubSat(y, Short4(0x0800));
1404 			mod.z = SubSat(z, Short4(0x0800));
1405 			mod.w = SubSat(w, Short4(0x0800));
1406 			mod.x = AddSat(mod.x, mod.x);
1407 			mod.y = AddSat(mod.y, mod.y);
1408 			mod.z = AddSat(mod.z, mod.z);
1409 			mod.w = AddSat(mod.w, mod.w);
1410 			break;
1411 		case Shader::MODIFIER_SIGN_NEGATE:
1412 			mod.x = SubSat(Short4(0x0800), x);
1413 			mod.y = SubSat(Short4(0x0800), y);
1414 			mod.z = SubSat(Short4(0x0800), z);
1415 			mod.w = SubSat(Short4(0x0800), w);
1416 			mod.x = AddSat(mod.x, mod.x);
1417 			mod.y = AddSat(mod.y, mod.y);
1418 			mod.z = AddSat(mod.z, mod.z);
1419 			mod.w = AddSat(mod.w, mod.w);
1420 			break;
1421 		case Shader::MODIFIER_DZ:
1422 			mod.x = x;
1423 			mod.y = y;
1424 			mod.z = z;
1425 			mod.w = w;
1426 			// Projection performed by texture sampler
1427 			break;
1428 		case Shader::MODIFIER_DW:
1429 			mod.x = x;
1430 			mod.y = y;
1431 			mod.z = z;
1432 			mod.w = w;
1433 			// Projection performed by texture sampler
1434 			break;
1435 		default:
1436 			ASSERT(false);
1437 		}
1438 
1439 		if(src.type == Shader::PARAMETER_CONST && (src.modifier == Shader::MODIFIER_X2 || src.modifier == Shader::MODIFIER_X2_NEGATE))
1440 		{
1441 			mod.x = Min(mod.x, Short4(0x1000)); mod.x = Max(mod.x, Short4(-0x1000));
1442 			mod.y = Min(mod.y, Short4(0x1000)); mod.y = Max(mod.y, Short4(-0x1000));
1443 			mod.z = Min(mod.z, Short4(0x1000)); mod.z = Max(mod.z, Short4(-0x1000));
1444 			mod.w = Min(mod.w, Short4(0x1000)); mod.w = Max(mod.w, Short4(-0x1000));
1445 		}
1446 
1447 		return mod;
1448 	}
1449 
MOV(Vector4s & dst,Vector4s & src0)1450 	void PixelPipeline::MOV(Vector4s &dst, Vector4s &src0)
1451 	{
1452 		dst.x = src0.x;
1453 		dst.y = src0.y;
1454 		dst.z = src0.z;
1455 		dst.w = src0.w;
1456 	}
1457 
ADD(Vector4s & dst,Vector4s & src0,Vector4s & src1)1458 	void PixelPipeline::ADD(Vector4s &dst, Vector4s &src0, Vector4s &src1)
1459 	{
1460 		dst.x = AddSat(src0.x, src1.x);
1461 		dst.y = AddSat(src0.y, src1.y);
1462 		dst.z = AddSat(src0.z, src1.z);
1463 		dst.w = AddSat(src0.w, src1.w);
1464 	}
1465 
SUB(Vector4s & dst,Vector4s & src0,Vector4s & src1)1466 	void PixelPipeline::SUB(Vector4s &dst, Vector4s &src0, Vector4s &src1)
1467 	{
1468 		dst.x = SubSat(src0.x, src1.x);
1469 		dst.y = SubSat(src0.y, src1.y);
1470 		dst.z = SubSat(src0.z, src1.z);
1471 		dst.w = SubSat(src0.w, src1.w);
1472 	}
1473 
MAD(Vector4s & dst,Vector4s & src0,Vector4s & src1,Vector4s & src2)1474 	void PixelPipeline::MAD(Vector4s &dst, Vector4s &src0, Vector4s &src1, Vector4s &src2)
1475 	{
1476 		// FIXME: Long fixed-point multiply fixup
1477 		{ dst.x = MulHigh(src0.x, src1.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, src2.x); }
1478 		{ dst.y = MulHigh(src0.y, src1.y); dst.y = AddSat(dst.y, dst.y); dst.y = AddSat(dst.y, dst.y); dst.y = AddSat(dst.y, dst.y); dst.y = AddSat(dst.y, dst.y); dst.y = AddSat(dst.y, src2.y); }
1479 		{ dst.z = MulHigh(src0.z, src1.z); dst.z = AddSat(dst.z, dst.z); dst.z = AddSat(dst.z, dst.z); dst.z = AddSat(dst.z, dst.z); dst.z = AddSat(dst.z, dst.z); dst.z = AddSat(dst.z, src2.z); }
1480 		{ dst.w = MulHigh(src0.w, src1.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, src2.w); }
1481 	}
1482 
MUL(Vector4s & dst,Vector4s & src0,Vector4s & src1)1483 	void PixelPipeline::MUL(Vector4s &dst, Vector4s &src0, Vector4s &src1)
1484 	{
1485 		// FIXME: Long fixed-point multiply fixup
1486 		{ dst.x = MulHigh(src0.x, src1.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); }
1487 		{ dst.y = MulHigh(src0.y, src1.y); dst.y = AddSat(dst.y, dst.y); dst.y = AddSat(dst.y, dst.y); dst.y = AddSat(dst.y, dst.y); dst.y = AddSat(dst.y, dst.y); }
1488 		{ dst.z = MulHigh(src0.z, src1.z); dst.z = AddSat(dst.z, dst.z); dst.z = AddSat(dst.z, dst.z); dst.z = AddSat(dst.z, dst.z); dst.z = AddSat(dst.z, dst.z); }
1489 		{ dst.w = MulHigh(src0.w, src1.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); }
1490 	}
1491 
DP3(Vector4s & dst,Vector4s & src0,Vector4s & src1)1492 	void PixelPipeline::DP3(Vector4s &dst, Vector4s &src0, Vector4s &src1)
1493 	{
1494 		Short4 t0;
1495 		Short4 t1;
1496 
1497 		// FIXME: Long fixed-point multiply fixup
1498 		t0 = MulHigh(src0.x, src1.x); t0 = AddSat(t0, t0); t0 = AddSat(t0, t0); t0 = AddSat(t0, t0); t0 = AddSat(t0, t0);
1499 		t1 = MulHigh(src0.y, src1.y); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1);
1500 		t0 = AddSat(t0, t1);
1501 		t1 = MulHigh(src0.z, src1.z); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1);
1502 		t0 = AddSat(t0, t1);
1503 
1504 		dst.x = t0;
1505 		dst.y = t0;
1506 		dst.z = t0;
1507 		dst.w = t0;
1508 	}
1509 
DP4(Vector4s & dst,Vector4s & src0,Vector4s & src1)1510 	void PixelPipeline::DP4(Vector4s &dst, Vector4s &src0, Vector4s &src1)
1511 	{
1512 		Short4 t0;
1513 		Short4 t1;
1514 
1515 		// FIXME: Long fixed-point multiply fixup
1516 		t0 = MulHigh(src0.x, src1.x); t0 = AddSat(t0, t0); t0 = AddSat(t0, t0); t0 = AddSat(t0, t0); t0 = AddSat(t0, t0);
1517 		t1 = MulHigh(src0.y, src1.y); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1);
1518 		t0 = AddSat(t0, t1);
1519 		t1 = MulHigh(src0.z, src1.z); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1);
1520 		t0 = AddSat(t0, t1);
1521 		t1 = MulHigh(src0.w, src1.w); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1); t1 = AddSat(t1, t1);
1522 		t0 = AddSat(t0, t1);
1523 
1524 		dst.x = t0;
1525 		dst.y = t0;
1526 		dst.z = t0;
1527 		dst.w = t0;
1528 	}
1529 
LRP(Vector4s & dst,Vector4s & src0,Vector4s & src1,Vector4s & src2)1530 	void PixelPipeline::LRP(Vector4s &dst, Vector4s &src0, Vector4s &src1, Vector4s &src2)
1531 	{
1532 		// FIXME: Long fixed-point multiply fixup
1533 		{ dst.x = SubSat(src1.x, src2.x); dst.x = MulHigh(dst.x, src0.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, dst.x); dst.x = AddSat(dst.x, src2.x); }
1534 		{
1535 		dst.y = SubSat(src1.y, src2.y); dst.y = MulHigh(dst.y, src0.y); dst.y = AddSat(dst.y, dst.y); dst.y = AddSat(dst.y, dst.y); dst.y = AddSat(dst.y, dst.y); dst.y = AddSat(dst.y, dst.y); dst.y = AddSat(dst.y, src2.y);
1536 	}
1537 		{dst.z = SubSat(src1.z, src2.z); dst.z = MulHigh(dst.z, src0.z); dst.z = AddSat(dst.z, dst.z); dst.z = AddSat(dst.z, dst.z); dst.z = AddSat(dst.z, dst.z); dst.z = AddSat(dst.z, dst.z); dst.z = AddSat(dst.z, src2.z); }
1538 		{dst.w = SubSat(src1.w, src2.w); dst.w = MulHigh(dst.w, src0.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, dst.w); dst.w = AddSat(dst.w, src2.w); }
1539 	}
1540 
TEXCOORD(Vector4s & dst,Float4 & u,Float4 & v,Float4 & s,int coordinate)1541 	void PixelPipeline::TEXCOORD(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int coordinate)
1542 	{
1543 		Float4 uw;
1544 		Float4 vw;
1545 		Float4 sw;
1546 
1547 		if(state.interpolant[2 + coordinate].component & 0x01)
1548 		{
1549 			uw = Max(u, Float4(0.0f));
1550 			uw = Min(uw, Float4(1.0f));
1551 			dst.x = convertFixed12(uw);
1552 		}
1553 		else
1554 		{
1555 			dst.x = Short4(0x0000);
1556 		}
1557 
1558 		if(state.interpolant[2 + coordinate].component & 0x02)
1559 		{
1560 			vw = Max(v, Float4(0.0f));
1561 			vw = Min(vw, Float4(1.0f));
1562 			dst.y = convertFixed12(vw);
1563 		}
1564 		else
1565 		{
1566 			dst.y = Short4(0x0000);
1567 		}
1568 
1569 		if(state.interpolant[2 + coordinate].component & 0x04)
1570 		{
1571 			sw = Max(s, Float4(0.0f));
1572 			sw = Min(sw, Float4(1.0f));
1573 			dst.z = convertFixed12(sw);
1574 		}
1575 		else
1576 		{
1577 			dst.z = Short4(0x0000);
1578 		}
1579 
1580 		dst.w = Short4(0x1000);
1581 	}
1582 
TEXCRD(Vector4s & dst,Float4 & u,Float4 & v,Float4 & s,int coordinate,bool project)1583 	void PixelPipeline::TEXCRD(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int coordinate, bool project)
1584 	{
1585 		Float4 uw = u;
1586 		Float4 vw = v;
1587 		Float4 sw = s;
1588 
1589 		if(project)
1590 		{
1591 			uw *= Rcp_pp(s);
1592 			vw *= Rcp_pp(s);
1593 		}
1594 
1595 		if(state.interpolant[2 + coordinate].component & 0x01)
1596 		{
1597 			uw *= Float4(0x1000);
1598 			uw = Max(uw, Float4(-0x8000));
1599 			uw = Min(uw, Float4(0x7FFF));
1600 			dst.x = RoundShort4(uw);
1601 		}
1602 		else
1603 		{
1604 			dst.x = Short4(0x0000);
1605 		}
1606 
1607 		if(state.interpolant[2 + coordinate].component & 0x02)
1608 		{
1609 			vw *= Float4(0x1000);
1610 			vw = Max(vw, Float4(-0x8000));
1611 			vw = Min(vw, Float4(0x7FFF));
1612 			dst.y = RoundShort4(vw);
1613 		}
1614 		else
1615 		{
1616 			dst.y = Short4(0x0000);
1617 		}
1618 
1619 		if(state.interpolant[2 + coordinate].component & 0x04)
1620 		{
1621 			sw *= Float4(0x1000);
1622 			sw = Max(sw, Float4(-0x8000));
1623 			sw = Min(sw, Float4(0x7FFF));
1624 			dst.z = RoundShort4(sw);
1625 		}
1626 		else
1627 		{
1628 			dst.z = Short4(0x0000);
1629 		}
1630 	}
1631 
TEXDP3(Vector4s & dst,Float4 & u,Float4 & v,Float4 & s,Vector4s & src)1632 	void PixelPipeline::TEXDP3(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, Vector4s &src)
1633 	{
1634 		TEXM3X3PAD(u, v, s, src, 0, false);
1635 
1636 		Short4 t0 = RoundShort4(u_ * Float4(0x1000));
1637 
1638 		dst.x = t0;
1639 		dst.y = t0;
1640 		dst.z = t0;
1641 		dst.w = t0;
1642 	}
1643 
TEXDP3TEX(Vector4s & dst,Float4 & u,Float4 & v,Float4 & s,int stage,Vector4s & src0)1644 	void PixelPipeline::TEXDP3TEX(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4s &src0)
1645 	{
1646 		TEXM3X3PAD(u, v, s, src0, 0, false);
1647 
1648 		v_ = Float4(0.0f);
1649 		w_ = Float4(0.0f);
1650 
1651 		dst = sampleTexture(stage, u_, v_, w_, w_);
1652 	}
1653 
TEXKILL(Int cMask[4],Float4 & u,Float4 & v,Float4 & s)1654 	void PixelPipeline::TEXKILL(Int cMask[4], Float4 &u, Float4 &v, Float4 &s)
1655 	{
1656 		Int kill = SignMask(CmpNLT(u, Float4(0.0f))) &
1657 			SignMask(CmpNLT(v, Float4(0.0f))) &
1658 			SignMask(CmpNLT(s, Float4(0.0f)));
1659 
1660 		for(unsigned int q = 0; q < state.multiSample; q++)
1661 		{
1662 			cMask[q] &= kill;
1663 		}
1664 	}
1665 
TEXKILL(Int cMask[4],Vector4s & src)1666 	void PixelPipeline::TEXKILL(Int cMask[4], Vector4s &src)
1667 	{
1668 		Short4 test = src.x | src.y | src.z;
1669 		Int kill = SignMask(PackSigned(test, test)) ^ 0x0000000F;
1670 
1671 		for(unsigned int q = 0; q < state.multiSample; q++)
1672 		{
1673 			cMask[q] &= kill;
1674 		}
1675 	}
1676 
	// tex: samples texture 'sampler' at the interpolated coordinates (u, v, s) and
	// stores the result in dst. 's' is passed for both of the last two coordinate
	// arguments of sampleTexture; 'project' is forwarded unchanged.
	void PixelPipeline::TEX(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int sampler, bool project)
	{
		dst = sampleTexture(sampler, u, v, s, s, project);
	}
1681 
TEXLD(Vector4s & dst,Vector4s & src,int sampler,bool project)1682 	void PixelPipeline::TEXLD(Vector4s &dst, Vector4s &src, int sampler, bool project)
1683 	{
1684 		Float4 u = Float4(src.x) * Float4(1.0f / 0x0FFE);
1685 		Float4 v = Float4(src.y) * Float4(1.0f / 0x0FFE);
1686 		Float4 s = Float4(src.z) * Float4(1.0f / 0x0FFE);
1687 
1688 		dst = sampleTexture(sampler, u, v, s, s, project);
1689 	}
1690 
TEXBEM(Vector4s & dst,Vector4s & src,Float4 & u,Float4 & v,Float4 & s,int stage)1691 	void PixelPipeline::TEXBEM(Vector4s &dst, Vector4s &src, Float4 &u, Float4 &v, Float4 &s, int stage)
1692 	{
1693 		Float4 du = Float4(src.x) * Float4(1.0f / 0x0FFE);
1694 		Float4 dv = Float4(src.y) * Float4(1.0f / 0x0FFE);
1695 
1696 		Float4 du2 = du;
1697 		Float4 dv2 = dv;
1698 
1699 		du *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][0]));
1700 		dv2 *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][0]));
1701 		du += dv2;
1702 		dv *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][1]));
1703 		du2 *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][1]));
1704 		dv += du2;
1705 
1706 		Float4 u_ = u + du;
1707 		Float4 v_ = v + dv;
1708 
1709 		dst = sampleTexture(stage, u_, v_, s, s);
1710 	}
1711 
TEXBEML(Vector4s & dst,Vector4s & src,Float4 & u,Float4 & v,Float4 & s,int stage)1712 	void PixelPipeline::TEXBEML(Vector4s &dst, Vector4s &src, Float4 &u, Float4 &v, Float4 &s, int stage)
1713 	{
1714 		Float4 du = Float4(src.x) * Float4(1.0f / 0x0FFE);
1715 		Float4 dv = Float4(src.y) * Float4(1.0f / 0x0FFE);
1716 
1717 		Float4 du2 = du;
1718 		Float4 dv2 = dv;
1719 
1720 		du *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][0]));
1721 		dv2 *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][0]));
1722 		du += dv2;
1723 		dv *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[1][1]));
1724 		du2 *= *Pointer<Float4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4F[0][1]));
1725 		dv += du2;
1726 
1727 		Float4 u_ = u + du;
1728 		Float4 v_ = v + dv;
1729 
1730 		dst = sampleTexture(stage, u_, v_, s, s);
1731 
1732 		Short4 L;
1733 
1734 		L = src.z;
1735 		L = MulHigh(L, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].luminanceScale4)));
1736 		L = L << 4;
1737 		L = AddSat(L, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].luminanceOffset4)));
1738 		L = Max(L, Short4(0x0000));
1739 		L = Min(L, Short4(0x1000));
1740 
1741 		dst.x = MulHigh(dst.x, L); dst.x = dst.x << 4;
1742 		dst.y = MulHigh(dst.y, L); dst.y = dst.y << 4;
1743 		dst.z = MulHigh(dst.z, L); dst.z = dst.z << 4;
1744 	}
1745 
TEXREG2AR(Vector4s & dst,Vector4s & src0,int stage)1746 	void PixelPipeline::TEXREG2AR(Vector4s &dst, Vector4s &src0, int stage)
1747 	{
1748 		Float4 u = Float4(src0.w) * Float4(1.0f / 0x0FFE);
1749 		Float4 v = Float4(src0.x) * Float4(1.0f / 0x0FFE);
1750 		Float4 s = Float4(src0.z) * Float4(1.0f / 0x0FFE);
1751 
1752 		dst = sampleTexture(stage, u, v, s, s);
1753 	}
1754 
TEXREG2GB(Vector4s & dst,Vector4s & src0,int stage)1755 	void PixelPipeline::TEXREG2GB(Vector4s &dst, Vector4s &src0, int stage)
1756 	{
1757 		Float4 u = Float4(src0.y) * Float4(1.0f / 0x0FFE);
1758 		Float4 v = Float4(src0.z) * Float4(1.0f / 0x0FFE);
1759 		Float4 s = v;
1760 
1761 		dst = sampleTexture(stage, u, v, s, s);
1762 	}
1763 
TEXREG2RGB(Vector4s & dst,Vector4s & src0,int stage)1764 	void PixelPipeline::TEXREG2RGB(Vector4s &dst, Vector4s &src0, int stage)
1765 	{
1766 		Float4 u = Float4(src0.x) * Float4(1.0f / 0x0FFE);
1767 		Float4 v = Float4(src0.y) * Float4(1.0f / 0x0FFE);
1768 		Float4 s = Float4(src0.z) * Float4(1.0f / 0x0FFE);
1769 
1770 		dst = sampleTexture(stage, u, v, s, s);
1771 	}
1772 
TEXM3X2DEPTH(Vector4s & dst,Float4 & u,Float4 & v,Float4 & s,Vector4s & src,bool signedScaling)1773 	void PixelPipeline::TEXM3X2DEPTH(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, Vector4s &src, bool signedScaling)
1774 	{
1775 		TEXM3X2PAD(u, v, s, src, 1, signedScaling);
1776 
1777 		// z / w
1778 		u_ *= Rcp_pp(v_);   // FIXME: Set result to 1.0 when division by zero
1779 
1780 		oDepth = u_;
1781 	}
1782 
	// texm3x2pad: one row of a 3x2 matrix multiply. The per-row computation is the
	// same as for the 3x3 case, so this simply forwards all arguments to TEXM3X3PAD.
	void PixelPipeline::TEXM3X2PAD(Float4 &u, Float4 &v, Float4 &s, Vector4s &src0, int component, bool signedScaling)
	{
		TEXM3X3PAD(u, v, s, src0, component, signedScaling);
	}
1787 
TEXM3X2TEX(Vector4s & dst,Float4 & u,Float4 & v,Float4 & s,int stage,Vector4s & src0,bool signedScaling)1788 	void PixelPipeline::TEXM3X2TEX(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4s &src0, bool signedScaling)
1789 	{
1790 		TEXM3X2PAD(u, v, s, src0, 1, signedScaling);
1791 
1792 		w_ = Float4(0.0f);
1793 
1794 		dst = sampleTexture(stage, u_, v_, w_, w_);
1795 	}
1796 
TEXM3X3(Vector4s & dst,Float4 & u,Float4 & v,Float4 & s,Vector4s & src0,bool signedScaling)1797 	void PixelPipeline::TEXM3X3(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, Vector4s &src0, bool signedScaling)
1798 	{
1799 		TEXM3X3PAD(u, v, s, src0, 2, signedScaling);
1800 
1801 		dst.x = RoundShort4(u_ * Float4(0x1000));
1802 		dst.y = RoundShort4(v_ * Float4(0x1000));
1803 		dst.z = RoundShort4(w_ * Float4(0x1000));
1804 		dst.w = Short4(0x1000);
1805 	}
1806 
TEXM3X3PAD(Float4 & u,Float4 & v,Float4 & s,Vector4s & src0,int component,bool signedScaling)1807 	void PixelPipeline::TEXM3X3PAD(Float4 &u, Float4 &v, Float4 &s, Vector4s &src0, int component, bool signedScaling)
1808 	{
1809 		if(component == 0 || previousScaling != signedScaling)   // FIXME: Other source modifiers?
1810 		{
1811 			U = Float4(src0.x);
1812 			V = Float4(src0.y);
1813 			W = Float4(src0.z);
1814 
1815 			previousScaling = signedScaling;
1816 		}
1817 
1818 		Float4 x = U * u + V * v + W * s;
1819 
1820 		x *= Float4(1.0f / 0x1000);
1821 
1822 		switch(component)
1823 		{
1824 		case 0:	u_ = x; break;
1825 		case 1:	v_ = x; break;
1826 		case 2: w_ = x; break;
1827 		default: ASSERT(false);
1828 		}
1829 	}
1830 
TEXM3X3SPEC(Vector4s & dst,Float4 & u,Float4 & v,Float4 & s,int stage,Vector4s & src0,Vector4s & src1)1831 	void PixelPipeline::TEXM3X3SPEC(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4s &src0, Vector4s &src1)
1832 	{
1833 		TEXM3X3PAD(u, v, s, src0, 2, false);
1834 
1835 		Float4 E[3];   // Eye vector
1836 
1837 		E[0] = Float4(src1.x) * Float4(1.0f / 0x0FFE);
1838 		E[1] = Float4(src1.y) * Float4(1.0f / 0x0FFE);
1839 		E[2] = Float4(src1.z) * Float4(1.0f / 0x0FFE);
1840 
1841 		// Reflection
1842 		Float4 u__;
1843 		Float4 v__;
1844 		Float4 w__;
1845 
1846 		// (u'', v'', w'') = 2 * (N . E) * N - E * (N . N)
1847 		u__ = u_ * E[0];
1848 		v__ = v_ * E[1];
1849 		w__ = w_ * E[2];
1850 		u__ += v__ + w__;
1851 		u__ += u__;
1852 		v__ = u__;
1853 		w__ = u__;
1854 		u__ *= u_;
1855 		v__ *= v_;
1856 		w__ *= w_;
1857 		u_ *= u_;
1858 		v_ *= v_;
1859 		w_ *= w_;
1860 		u_ += v_ + w_;
1861 		u__ -= E[0] * u_;
1862 		v__ -= E[1] * u_;
1863 		w__ -= E[2] * u_;
1864 
1865 		dst = sampleTexture(stage, u__, v__, w__, w__);
1866 	}
1867 
	// texm3x3tex: computes the third matrix row into w_ (component 2) and samples
	// texture 'stage' at (u_, v_, w_). u_ and v_ are presumably the results of the
	// preceding component-0 and component-1 pad instructions.
	void PixelPipeline::TEXM3X3TEX(Vector4s &dst, Float4 &u, Float4 &v, Float4 &s, int stage, Vector4s &src0, bool signedScaling)
	{
		TEXM3X3PAD(u, v, s, src0, 2, signedScaling);

		dst = sampleTexture(stage, u_, v_, w_, w_);
	}
1874 
TEXM3X3VSPEC(Vector4s & dst,Float4 & x,Float4 & y,Float4 & z,int stage,Vector4s & src0)1875 	void PixelPipeline::TEXM3X3VSPEC(Vector4s &dst, Float4 &x, Float4 &y, Float4 &z, int stage, Vector4s &src0)
1876 	{
1877 		TEXM3X3PAD(x, y, z, src0, 2, false);
1878 
1879 		Float4 E[3];   // Eye vector
1880 
1881 		E[0] = v[2 + stage - 2].w;
1882 		E[1] = v[2 + stage - 1].w;
1883 		E[2] = v[2 + stage - 0].w;
1884 
1885 		// Reflection
1886 		Float4 u__;
1887 		Float4 v__;
1888 		Float4 w__;
1889 
1890 		// (u'', v'', w'') = 2 * (N . E) * N - E * (N . N)
1891 		u__ = u_ * E[0];
1892 		v__ = v_ * E[1];
1893 		w__ = w_ * E[2];
1894 		u__ += v__ + w__;
1895 		u__ += u__;
1896 		v__ = u__;
1897 		w__ = u__;
1898 		u__ *= u_;
1899 		v__ *= v_;
1900 		w__ *= w_;
1901 		u_ *= u_;
1902 		v_ *= v_;
1903 		w_ *= w_;
1904 		u_ += v_ + w_;
1905 		u__ -= E[0] * u_;
1906 		v__ -= E[1] * u_;
1907 		w__ -= E[2] * u_;
1908 
1909 		dst = sampleTexture(stage, u__, v__, w__, w__);
1910 	}
1911 
TEXDEPTH()1912 	void PixelPipeline::TEXDEPTH()
1913 	{
1914 		u_ = Float4(rs[5].x);
1915 		v_ = Float4(rs[5].y);
1916 
1917 		// z / w
1918 		u_ *= Rcp_pp(v_);   // FIXME: Set result to 1.0 when division by zero
1919 
1920 		oDepth = u_;
1921 	}
1922 
CND(Vector4s & dst,Vector4s & src0,Vector4s & src1,Vector4s & src2)1923 	void PixelPipeline::CND(Vector4s &dst, Vector4s &src0, Vector4s &src1, Vector4s &src2)
1924 	{
1925 		{Short4 t0; t0 = src0.x; t0 = CmpGT(t0, Short4(0x0800)); Short4 t1; t1 = src1.x; t1 = t1 & t0; t0 = ~t0 & src2.x; t0 = t0 | t1; dst.x = t0; };
1926 		{Short4 t0; t0 = src0.y; t0 = CmpGT(t0, Short4(0x0800)); Short4 t1; t1 = src1.y; t1 = t1 & t0; t0 = ~t0 & src2.y; t0 = t0 | t1; dst.y = t0; };
1927 		{Short4 t0; t0 = src0.z; t0 = CmpGT(t0, Short4(0x0800)); Short4 t1; t1 = src1.z; t1 = t1 & t0; t0 = ~t0 & src2.z; t0 = t0 | t1; dst.z = t0; };
1928 		{Short4 t0; t0 = src0.w; t0 = CmpGT(t0, Short4(0x0800)); Short4 t1; t1 = src1.w; t1 = t1 & t0; t0 = ~t0 & src2.w; t0 = t0 | t1; dst.w = t0; };
1929 	}
1930 
CMP(Vector4s & dst,Vector4s & src0,Vector4s & src1,Vector4s & src2)1931 	void PixelPipeline::CMP(Vector4s &dst, Vector4s &src0, Vector4s &src1, Vector4s &src2)
1932 	{
1933 		{Short4 t0 = CmpGT(Short4(0x0000), src0.x); Short4 t1; t1 = src2.x; t1 &= t0; t0 = ~t0 & src1.x; t0 |= t1; dst.x = t0; };
1934 		{Short4 t0 = CmpGT(Short4(0x0000), src0.y); Short4 t1; t1 = src2.y; t1 &= t0; t0 = ~t0 & src1.y; t0 |= t1; dst.y = t0; };
1935 		{Short4 t0 = CmpGT(Short4(0x0000), src0.z); Short4 t1; t1 = src2.z; t1 &= t0; t0 = ~t0 & src1.z; t0 |= t1; dst.z = t0; };
1936 		{Short4 t0 = CmpGT(Short4(0x0000), src0.w); Short4 t1; t1 = src2.w; t1 &= t0; t0 = ~t0 & src1.w; t0 |= t1; dst.w = t0; };
1937 	}
1938 
BEM(Vector4s & dst,Vector4s & src0,Vector4s & src1,int stage)1939 	void PixelPipeline::BEM(Vector4s &dst, Vector4s &src0, Vector4s &src1, int stage)
1940 	{
1941 		Short4 t0;
1942 		Short4 t1;
1943 
1944 		// dst.x = src0.x + BUMPENVMAT00(stage) * src1.x + BUMPENVMAT10(stage) * src1.y
1945 		t0 = MulHigh(src1.x, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4W[0][0]))); t0 = t0 << 4;   // FIXME: Matrix components range? Overflow hazard.
1946 		t1 = MulHigh(src1.y, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4W[1][0]))); t1 = t1 << 4;   // FIXME: Matrix components range? Overflow hazard.
1947 		t0 = AddSat(t0, t1);
1948 		t0 = AddSat(t0, src0.x);
1949 		dst.x = t0;
1950 
1951 		// dst.y = src0.y + BUMPENVMAT01(stage) * src1.x + BUMPENVMAT11(stage) * src1.y
1952 		t0 = MulHigh(src1.x, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4W[0][1]))); t0 = t0 << 4;   // FIXME: Matrix components range? Overflow hazard.
1953 		t1 = MulHigh(src1.y, *Pointer<Short4>(data + OFFSET(DrawData, textureStage[stage].bumpmapMatrix4W[1][1]))); t1 = t1 << 4;   // FIXME: Matrix components range? Overflow hazard.
1954 		t0 = AddSat(t0, t1);
1955 		t0 = AddSat(t0, src0.y);
1956 		dst.y = t0;
1957 	}
1958 }
1959 
1960