1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "PixelRoutine.hpp"
16 
17 #include "SamplerCore.hpp"
18 #include "Constants.hpp"
19 #include "Renderer/Renderer.hpp"
20 #include "Renderer/QuadRasterizer.hpp"
21 #include "Renderer/Surface.hpp"
22 #include "Renderer/Primitive.hpp"
23 #include "Common/Debug.hpp"
24 
25 namespace sw
26 {
	// Global switches declared here and defined in another translation unit.
	extern bool complementaryDepthBuffer;   // When set, depthTest()/writeDepth()/pixelFog() in this file work with 1 - depth and swap the compare direction
	extern bool postBlendSRGB;
	extern bool exactColorRounding;
	extern bool forceClearRegisters;        // When set, the PixelRoutine constructor zeroes the input registers regardless of shader model
31 
PixelRoutine(const PixelProcessor::State & state,const PixelShader * shader)32 	PixelRoutine::PixelRoutine(const PixelProcessor::State &state, const PixelShader *shader)
33 		: QuadRasterizer(state, shader), v(shader && shader->indirectAddressableInput)
34 	{
35 		if(!shader || shader->getShaderModel() < 0x0200 || forceClearRegisters)
36 		{
37 			for(int i = 0; i < MAX_FRAGMENT_INPUTS; i++)
38 			{
39 				v[i].x = Float4(0.0f);
40 				v[i].y = Float4(0.0f);
41 				v[i].z = Float4(0.0f);
42 				v[i].w = Float4(0.0f);
43 			}
44 		}
45 	}
46 
	// Destructor; performs no explicit cleanup of its own.
	PixelRoutine::~PixelRoutine()
	{
	}
50 
	// Emits the processing of one quad of pixels: stencil test, depth interpolation,
	// depth test (before or after shading), input interpolation, shading, alpha test,
	// depth and color writes, and finally the stencil buffer update.
	//
	// cBuffer - render target pointers, forwarded to rasterOperation().
	// zBuffer - depth buffer pointer, forwarded to depthTest() and writeDepth().
	// sBuffer - stencil buffer pointer, forwarded to stencilTest() and writeStencil().
	// cMask   - one mask per sample; seeds zMask/sMask and is handed to applyShader()/alphaTest().
	// x       - horizontal position of the quad.
	void PixelRoutine::quad(Pointer<Byte> cBuffer[RENDERTARGETS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x)
	{
		#if PERF_PROFILE
			Long pipeTime = Ticks();
		#endif

		// The depth test can only run ahead of shading when the depth value is not
		// overridden and the alpha test is inactive.
		const bool earlyDepthTest = !state.depthOverride && !state.alphaTestActive();

		Int zMask[4];   // Depth mask
		Int sMask[4];   // Stencil mask

		// Both masks start out equal to the incoming per-sample mask.
		for(unsigned int q = 0; q < state.multiSample; q++)
		{
			zMask[q] = cMask[q];
			sMask[q] = cMask[q];
		}

		for(unsigned int q = 0; q < state.multiSample; q++)
		{
			stencilTest(sBuffer, q, x, sMask[q], cMask[q]);
		}

		Float4 f;
		Float4 rhwCentroid;

		Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive,xQuad), 16);

		if(interpolateZ())
		{
			for(unsigned int q = 0; q < state.multiSample; q++)
			{
				// Note: this local shadows the Int &x parameter within the loop body.
				Float4 x = xxxx;

				if(state.multiSample > 1)
				{
					// Per-sample horizontal offset taken from the constants table.
					x -= *Pointer<Float4>(constants + OFFSET(Constants,X) + q * sizeof(float4));
				}

				z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive,z), false, false, state.depthClamp);
			}
		}

		Bool depthPass = false;

		if(earlyDepthTest)
		{
			// The quad passes when any sample passes.
			for(unsigned int q = 0; q < state.multiSample; q++)
			{
				depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
			}
		}

		// Without an early depth test the shading block is always entered.
		If(depthPass || Bool(!earlyDepthTest))
		{
			#if PERF_PROFILE
				Long interpTime = Ticks();
			#endif

			Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive,yQuad), 16);

			// Centroid locations
			Float4 XXXX = Float4(0.0f);
			Float4 YYYY = Float4(0.0f);

			if(state.centroid)
			{
				// Starts slightly above zero, which keeps the reciprocal below finite
				// when the accumulated weight is zero.
				Float4 WWWW(1.0e-9f);

				// Accumulate sample offsets and weights, selected by each sample's mask.
				for(unsigned int q = 0; q < state.multiSample; q++)
				{
					XXXX += *Pointer<Float4>(constants + OFFSET(Constants,sampleX[q]) + 16 * cMask[q]);
					YYYY += *Pointer<Float4>(constants + OFFSET(Constants,sampleY[q]) + 16 * cMask[q]);
					WWWW += *Pointer<Float4>(constants + OFFSET(Constants,weight) + 16 * cMask[q]);
				}

				// Normalize the accumulated offsets by the total weight.
				WWWW = Rcp_pp(WWWW);
				XXXX *= WWWW;
				YYYY *= WWWW;

				XXXX += xxxx;
				YYYY += yyyy;
			}

			if(interpolateW())
			{
				w = interpolate(xxxx, Dw, rhw, primitive + OFFSET(Primitive,w), false, false, false);
				rhw = reciprocal(w, false, false, true);

				if(state.centroid)
				{
					rhwCentroid = reciprocal(interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive,w), false, false));
				}
			}

			for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++)
			{
				for(int component = 0; component < 4; component++)
				{
					// Only components flagged in the state's component mask are interpolated.
					if(state.interpolant[interpolant].component & (1 << component))
					{
						if(!state.interpolant[interpolant].centroid)
						{
							v[interpolant][component] = interpolate(xxxx, Dv[interpolant][component], rhw, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective, false);
						}
						else
						{
							v[interpolant][component] = interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
						}
					}
				}

				Float4 rcp;

				// Projection: divide the leading components by component y, z or w.
				switch(state.interpolant[interpolant].project)
				{
				case 0:
					break;
				case 1:
					rcp = reciprocal(v[interpolant].y);
					v[interpolant].x = v[interpolant].x * rcp;
					break;
				case 2:
					rcp = reciprocal(v[interpolant].z);
					v[interpolant].x = v[interpolant].x * rcp;
					v[interpolant].y = v[interpolant].y * rcp;
					break;
				case 3:
					rcp = reciprocal(v[interpolant].w);
					v[interpolant].x = v[interpolant].x * rcp;
					v[interpolant].y = v[interpolant].y * rcp;
					v[interpolant].z = v[interpolant].z * rcp;
					break;
				}
			}

			if(state.fog.component)
			{
				f = interpolate(xxxx, Df, rhw, primitive + OFFSET(Primitive,f), state.fog.flat & 0x01, state.perspective, false);
			}

			setBuiltins(x, y, z, w);

			#if PERF_PROFILE
				cycles[PERF_INTERP] += Ticks() - interpTime;
			#endif

			Bool alphaPass = true;

			if(colorUsed())
			{
				#if PERF_PROFILE
					Long shaderTime = Ticks();
				#endif

				applyShader(cMask);

				#if PERF_PROFILE
					cycles[PERF_SHADER] += Ticks() - shaderTime;
				#endif

				alphaPass = alphaTest(cMask);

				// Propagate the mask to depth and stencil when the shader contains a kill
				// or the alpha test is active.
				if((shader && shader->containsKill()) || state.alphaTestActive())
				{
					for(unsigned int q = 0; q < state.multiSample; q++)
					{
						zMask[q] &= cMask[q];
						sMask[q] &= cMask[q];
					}
				}
			}

			If(alphaPass)
			{
				// Late depth test, performed after shading.
				if(!earlyDepthTest)
				{
					for(unsigned int q = 0; q < state.multiSample; q++)
					{
						depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
					}
				}

				#if PERF_PROFILE
					Long ropTime = Ticks();
				#endif

				// With an early depth test this block was only reached after passing it.
				If(depthPass || Bool(earlyDepthTest))
				{
					for(unsigned int q = 0; q < state.multiSample; q++)
					{
						// Samples excluded by the multisample mask are neither written nor counted.
						if(state.multiSampleMask & (1 << q))
						{
							writeDepth(zBuffer, q, x, z[q], zMask[q]);

							if(state.occlusionEnabled)
							{
								// Table lookup indexed by the combined depth and stencil mask.
								occlusion += *Pointer<UInt>(constants + OFFSET(Constants,occlusionCount) + 4 * (zMask[q] & sMask[q]));
							}
						}
					}

					if(colorUsed())
					{
						#if PERF_PROFILE
							AddAtomic(Pointer<Long>(&profiler.ropOperations), 4);
						#endif

						rasterOperation(f, cBuffer, x, sMask, zMask, cMask);
					}
				}

				#if PERF_PROFILE
					cycles[PERF_ROP] += Ticks() - ropTime;
				#endif
			}
		}

		// The stencil update is emitted outside the depth/alpha conditionals, so it runs
		// for every quad regardless of their outcome.
		for(unsigned int q = 0; q < state.multiSample; q++)
		{
			if(state.multiSampleMask & (1 << q))
			{
				writeStencil(sBuffer, q, x, sMask[q], zMask[q], cMask[q]);
			}
		}

		#if PERF_PROFILE
			cycles[PERF_PIPE] += Ticks() - pipeTime;
		#endif
	}
280 
interpolateCentroid(Float4 & x,Float4 & y,Float4 & rhw,Pointer<Byte> planeEquation,bool flat,bool perspective)281 	Float4 PixelRoutine::interpolateCentroid(Float4 &x, Float4 &y, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective)
282 	{
283 		Float4 interpolant = *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,C), 16);
284 
285 		if(!flat)
286 		{
287 			interpolant += x * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,A), 16) +
288 			               y * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,B), 16);
289 
290 			if(perspective)
291 			{
292 				interpolant *= rhw;
293 			}
294 		}
295 
296 		return interpolant;
297 	}
298 
stencilTest(Pointer<Byte> & sBuffer,int q,Int & x,Int & sMask,Int & cMask)299 	void PixelRoutine::stencilTest(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &cMask)
300 	{
301 		if(!state.stencilActive)
302 		{
303 			return;
304 		}
305 
306 		// (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)
307 
308 		Pointer<Byte> buffer = sBuffer + 2 * x;
309 
310 		if(q > 0)
311 		{
312 			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
313 		}
314 
315 		Byte8 value = *Pointer<Byte8>(buffer);
316 		Byte8 valueCCW = value;
317 
318 		if(!state.noStencilMask)
319 		{
320 			value &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].testMaskQ));
321 		}
322 
323 		stencilTest(value, state.stencilCompareMode, false);
324 
325 		if(state.twoSidedStencil)
326 		{
327 			if(!state.noStencilMaskCCW)
328 			{
329 				valueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].testMaskQ));
330 			}
331 
332 			stencilTest(valueCCW, state.stencilCompareModeCCW, true);
333 
334 			value &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
335 			valueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
336 			value |= valueCCW;
337 		}
338 
339 		sMask = SignMask(value) & cMask;
340 	}
341 
	// Replaces 'value' (the masked stencil buffer contents) with a per-byte result of
	// comparing it against the stencil reference, 0xFF or 0x00 in each byte.
	// In the case comments below 'a' is the reference and 'b' is the buffer value.
	// CCW selects which stencil[] entry of DrawData provides the reference.
	//
	// The ordered comparisons add 0x80 to every byte before using the signed CmpGT,
	// and compare against referenceMaskedSignedQ (presumably the reference stored
	// with the same bias - TODO confirm against where DrawData is filled in).
	void PixelRoutine::stencilTest(Byte8 &value, StencilCompareMode stencilCompareMode, bool CCW)
	{
		Byte8 equal;

		switch(stencilCompareMode)
		{
		case STENCIL_ALWAYS:
			value = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
			break;
		case STENCIL_NEVER:
			value = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
			break;
		case STENCIL_LESS:			// a < b ~ b > a
			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
			value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
			break;
		case STENCIL_EQUAL:
			value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
			break;
		case STENCIL_NOTEQUAL:		// a != b ~ !(a == b)
			value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
			value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
			break;
		case STENCIL_LESSEQUAL:	// a <= b ~ (b > a) || (a == b)
			// The equality test must use the unbiased value, so it is taken first.
			equal = value;
			equal = CmpEQ(equal, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
			value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
			value |= equal;
			break;
		case STENCIL_GREATER:		// a > b
			// 'equal' is reused here as scratch storage for the reference and the result.
			equal = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ));
			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
			equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
			value = equal;
			break;
		case STENCIL_GREATEREQUAL:	// a >= b ~ !(a < b) ~ !(b > a)
			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
			value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
			value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
			break;
		default:
			ASSERT(false);
		}
	}
387 
depthTest(Pointer<Byte> & zBuffer,int q,Int & x,Float4 & z,Int & sMask,Int & zMask,Int & cMask)388 	Bool PixelRoutine::depthTest(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &sMask, Int &zMask, Int &cMask)
389 	{
390 		if(!state.depthTestActive)
391 		{
392 			return true;
393 		}
394 
395 		Float4 Z = z;
396 
397 		if(shader && shader->depthOverride())
398 		{
399 			if(complementaryDepthBuffer)
400 			{
401 				Z = Float4(1.0f) - oDepth;
402 			}
403 			else
404 			{
405 				Z = oDepth;
406 			}
407 		}
408 
409 		Pointer<Byte> buffer;
410 		Int pitch;
411 
412 		if(!state.quadLayoutDepthBuffer)
413 		{
414 			buffer = zBuffer + 4 * x;
415 			pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
416 		}
417 		else
418 		{
419 			buffer = zBuffer + 8 * x;
420 		}
421 
422 		if(q > 0)
423 		{
424 			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
425 		}
426 
427 		Float4 zValue;
428 
429 		if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
430 		{
431 			if(!state.quadLayoutDepthBuffer)
432 			{
433 				// FIXME: Properly optimizes?
434 				zValue.xy = *Pointer<Float4>(buffer);
435 				zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
436 			}
437 			else
438 			{
439 				zValue = *Pointer<Float4>(buffer, 16);
440 			}
441 		}
442 
443 		Int4 zTest;
444 
445 		switch(state.depthCompareMode)
446 		{
447 		case DEPTH_ALWAYS:
448 			// Optimized
449 			break;
450 		case DEPTH_NEVER:
451 			// Optimized
452 			break;
453 		case DEPTH_EQUAL:
454 			zTest = CmpEQ(zValue, Z);
455 			break;
456 		case DEPTH_NOTEQUAL:
457 			zTest = CmpNEQ(zValue, Z);
458 			break;
459 		case DEPTH_LESS:
460 			if(complementaryDepthBuffer)
461 			{
462 				zTest = CmpLT(zValue, Z);
463 			}
464 			else
465 			{
466 				zTest = CmpNLE(zValue, Z);
467 			}
468 			break;
469 		case DEPTH_GREATEREQUAL:
470 			if(complementaryDepthBuffer)
471 			{
472 				zTest = CmpNLT(zValue, Z);
473 			}
474 			else
475 			{
476 				zTest = CmpLE(zValue, Z);
477 			}
478 			break;
479 		case DEPTH_LESSEQUAL:
480 			if(complementaryDepthBuffer)
481 			{
482 				zTest = CmpLE(zValue, Z);
483 			}
484 			else
485 			{
486 				zTest = CmpNLT(zValue, Z);
487 			}
488 			break;
489 		case DEPTH_GREATER:
490 			if(complementaryDepthBuffer)
491 			{
492 				zTest = CmpNLE(zValue, Z);
493 			}
494 			else
495 			{
496 				zTest = CmpLT(zValue, Z);
497 			}
498 			break;
499 		default:
500 			ASSERT(false);
501 		}
502 
503 		switch(state.depthCompareMode)
504 		{
505 		case DEPTH_ALWAYS:
506 			zMask = cMask;
507 			break;
508 		case DEPTH_NEVER:
509 			zMask = 0x0;
510 			break;
511 		default:
512 			zMask = SignMask(zTest) & cMask;
513 			break;
514 		}
515 
516 		if(state.stencilActive)
517 		{
518 			zMask &= sMask;
519 		}
520 
521 		return zMask != 0;
522 	}
523 
alphaTest(Int & aMask,Short4 & alpha)524 	void PixelRoutine::alphaTest(Int &aMask, Short4 &alpha)
525 	{
526 		Short4 cmp;
527 		Short4 equal;
528 
529 		switch(state.alphaCompareMode)
530 		{
531 		case ALPHA_ALWAYS:
532 			aMask = 0xF;
533 			break;
534 		case ALPHA_NEVER:
535 			aMask = 0x0;
536 			break;
537 		case ALPHA_EQUAL:
538 			cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
539 			aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
540 			break;
541 		case ALPHA_NOTEQUAL:       // a != b ~ !(a == b)
542 			cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu);   // FIXME
543 			aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
544 			break;
545 		case ALPHA_LESS:           // a < b ~ b > a
546 			cmp = CmpGT(*Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)), alpha);
547 			aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
548 			break;
549 		case ALPHA_GREATEREQUAL:   // a >= b ~ (a > b) || (a == b) ~ !(b > a)   // TODO: Approximate
550 			equal = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
551 			cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
552 			cmp |= equal;
553 			aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
554 			break;
555 		case ALPHA_LESSEQUAL:      // a <= b ~ !(a > b)
556 			cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu);   // FIXME
557 			aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
558 			break;
559 		case ALPHA_GREATER:        // a > b
560 			cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
561 			aMask = SignMask(PackSigned(cmp, Short4(0x0000)));
562 			break;
563 		default:
564 			ASSERT(false);
565 		}
566 	}
567 
alphaToCoverage(Int cMask[4],Float4 & alpha)568 	void PixelRoutine::alphaToCoverage(Int cMask[4], Float4 &alpha)
569 	{
570 		Int4 coverage0 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c0)));
571 		Int4 coverage1 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c1)));
572 		Int4 coverage2 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c2)));
573 		Int4 coverage3 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c3)));
574 
575 		Int aMask0 = SignMask(coverage0);
576 		Int aMask1 = SignMask(coverage1);
577 		Int aMask2 = SignMask(coverage2);
578 		Int aMask3 = SignMask(coverage3);
579 
580 		cMask[0] &= aMask0;
581 		cMask[1] &= aMask1;
582 		cMask[2] &= aMask2;
583 		cMask[3] &= aMask3;
584 	}
585 
fogBlend(Vector4f & c0,Float4 & fog)586 	void PixelRoutine::fogBlend(Vector4f &c0, Float4 &fog)
587 	{
588 		if(!state.fogActive)
589 		{
590 			return;
591 		}
592 
593 		if(state.pixelFogMode != FOG_NONE)
594 		{
595 			pixelFog(fog);
596 
597 			fog = Min(fog, Float4(1.0f));
598 			fog = Max(fog, Float4(0.0f));
599 		}
600 
601 		c0.x -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0]));
602 		c0.y -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1]));
603 		c0.z -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2]));
604 
605 		c0.x *= fog;
606 		c0.y *= fog;
607 		c0.z *= fog;
608 
609 		c0.x += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0]));
610 		c0.y += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1]));
611 		c0.z += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2]));
612 	}
613 
pixelFog(Float4 & visibility)614 	void PixelRoutine::pixelFog(Float4 &visibility)
615 	{
616 		Float4 &zw = visibility;
617 
618 		if(state.pixelFogMode != FOG_NONE)
619 		{
620 			if(state.wBasedFog)
621 			{
622 				zw = rhw;
623 			}
624 			else
625 			{
626 				if(complementaryDepthBuffer)
627 				{
628 					zw = Float4(1.0f) - z[0];
629 				}
630 				else
631 				{
632 					zw = z[0];
633 				}
634 			}
635 		}
636 
637 		switch(state.pixelFogMode)
638 		{
639 		case FOG_NONE:
640 			break;
641 		case FOG_LINEAR:
642 			zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.scale));
643 			zw += *Pointer<Float4>(data + OFFSET(DrawData,fog.offset));
644 			break;
645 		case FOG_EXP:
646 			zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.densityE));
647 			zw = exponential2(zw, true);
648 			break;
649 		case FOG_EXP2:
650 			zw *= zw;
651 			zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.density2E));
652 			zw = exponential2(zw, true);
653 			break;
654 		default:
655 			ASSERT(false);
656 		}
657 	}
658 
writeDepth(Pointer<Byte> & zBuffer,int q,Int & x,Float4 & z,Int & zMask)659 	void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask)
660 	{
661 		if(!state.depthWriteEnable)
662 		{
663 			return;
664 		}
665 
666 		Float4 Z = z;
667 
668 		if(shader && shader->depthOverride())
669 		{
670 			if(complementaryDepthBuffer)
671 			{
672 				Z = Float4(1.0f) - oDepth;
673 			}
674 			else
675 			{
676 				Z = oDepth;
677 			}
678 		}
679 
680 		Pointer<Byte> buffer;
681 		Int pitch;
682 
683 		if(!state.quadLayoutDepthBuffer)
684 		{
685 			buffer = zBuffer + 4 * x;
686 			pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
687 		}
688 		else
689 		{
690 			buffer = zBuffer + 8 * x;
691 		}
692 
693 		if(q > 0)
694 		{
695 			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
696 		}
697 
698 		Float4 zValue;
699 
700 		if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
701 		{
702 			if(!state.quadLayoutDepthBuffer)
703 			{
704 				// FIXME: Properly optimizes?
705 				zValue.xy = *Pointer<Float4>(buffer);
706 				zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
707 			}
708 			else
709 			{
710 				zValue = *Pointer<Float4>(buffer, 16);
711 			}
712 		}
713 
714 		Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + zMask * 16, 16));
715 		zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + zMask * 16, 16));
716 		Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
717 
718 		if(!state.quadLayoutDepthBuffer)
719 		{
720 			// FIXME: Properly optimizes?
721 			*Pointer<Float2>(buffer) = Float2(Z.xy);
722 			*Pointer<Float2>(buffer + pitch) = Float2(Z.zw);
723 		}
724 		else
725 		{
726 			*Pointer<Float4>(buffer, 16) = Z;
727 		}
728 	}
729 
	// Updates the stencil buffer for sample q of the quad at x. The new value is
	// chosen per lane from the pass / depth-fail / stencil-fail operations using
	// zMask and sMask, combined with the write masks, and finally restricted to the
	// lanes selected by cMask.
	void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &zMask, Int &cMask)
	{
		if(!state.stencilActive)
		{
			return;
		}

		// Nothing to write when every operation of every face in use is KEEP.
		if(state.stencilPassOperation == OPERATION_KEEP && state.stencilZFailOperation == OPERATION_KEEP && state.stencilFailOperation == OPERATION_KEEP)
		{
			if(!state.twoSidedStencil || (state.stencilPassOperationCCW == OPERATION_KEEP && state.stencilZFailOperationCCW == OPERATION_KEEP && state.stencilFailOperationCCW == OPERATION_KEEP))
			{
				return;
			}
		}

		// Nothing to write when writes are masked out for every face in use.
		if(state.stencilWriteMasked && (!state.twoSidedStencil || state.stencilWriteMaskedCCW))
		{
			return;
		}

		Pointer<Byte> buffer = sBuffer + 2 * x;

		if(q > 0)
		{
			// Step to the slice that holds sample q.
			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
		}

		Byte8 bufferValue = *Pointer<Byte8>(buffer);

		Byte8 newValue;
		stencilOperation(newValue, bufferValue, state.stencilPassOperation, state.stencilZFailOperation, state.stencilFailOperation, false, zMask, sMask);

		if(!state.noStencilWriteMask)
		{
			// Written bits come from the new value, protected bits from the buffer.
			Byte8 maskedValue = bufferValue;
			newValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].writeMaskQ));
			maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].invWriteMaskQ));
			newValue |= maskedValue;
		}

		if(state.twoSidedStencil)
		{
			Byte8 newValueCCW;

			stencilOperation(newValueCCW, bufferValue, state.stencilPassOperationCCW, state.stencilZFailOperationCCW, state.stencilFailOperationCCW, true, zMask, sMask);

			if(!state.noStencilWriteMaskCCW)
			{
				Byte8 maskedValue = bufferValue;
				newValueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].writeMaskQ));
				maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].invWriteMaskQ));
				newValueCCW |= maskedValue;
			}

			// Select one of the two results using the primitive's winding masks.
			newValue &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
			newValueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
			newValue |= newValueCCW;
		}

		// Lanes not selected by cMask keep the original buffer contents.
		newValue &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * cMask);
		bufferValue &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * cMask);
		newValue |= bufferValue;

		// Only the low four bytes of the eight loaded above are stored back.
		*Pointer<Byte4>(buffer) = Byte4(newValue);
	}
795 
stencilOperation(Byte8 & newValue,Byte8 & bufferValue,StencilOperation stencilPassOperation,StencilOperation stencilZFailOperation,StencilOperation stencilFailOperation,bool CCW,Int & zMask,Int & sMask)796 	void PixelRoutine::stencilOperation(Byte8 &newValue, Byte8 &bufferValue, StencilOperation stencilPassOperation, StencilOperation stencilZFailOperation, StencilOperation stencilFailOperation, bool CCW, Int &zMask, Int &sMask)
797 	{
798 		Byte8 &pass = newValue;
799 		Byte8 fail;
800 		Byte8 zFail;
801 
802 		stencilOperation(pass, bufferValue, stencilPassOperation, CCW);
803 
804 		if(stencilZFailOperation != stencilPassOperation)
805 		{
806 			stencilOperation(zFail, bufferValue, stencilZFailOperation, CCW);
807 		}
808 
809 		if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
810 		{
811 			stencilOperation(fail, bufferValue, stencilFailOperation, CCW);
812 		}
813 
814 		if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
815 		{
816 			if(state.depthTestActive && stencilZFailOperation != stencilPassOperation)   // zMask valid and values not the same
817 			{
818 				pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * zMask);
819 				zFail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * zMask);
820 				pass |= zFail;
821 			}
822 
823 			pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * sMask);
824 			fail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * sMask);
825 			pass |= fail;
826 		}
827 	}
828 
	// Applies a single stencil operation to every byte of bufferValue and stores the
	// result in output. CCW selects which stencil[] entry of DrawData provides the
	// reference for OPERATION_REPLACE.
	void PixelRoutine::stencilOperation(Byte8 &output, Byte8 &bufferValue, StencilOperation operation, bool CCW)
	{
		switch(operation)
		{
		case OPERATION_KEEP:
			output = bufferValue;
			break;
		case OPERATION_ZERO:
			output = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
			break;
		case OPERATION_REPLACE:
			output = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceQ));
			break;
		case OPERATION_INCRSAT:
			// Saturating increment
			output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
			break;
		case OPERATION_DECRSAT:
			// Saturating decrement
			output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
			break;
		case OPERATION_INVERT:
			// Bitwise complement
			output = bufferValue ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
			break;
		case OPERATION_INCR:
			// Plain (non-saturating) byte addition
			output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
			break;
		case OPERATION_DECR:
			// Plain (non-saturating) byte subtraction
			output = bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);
			break;
		default:
			ASSERT(false);
		}
	}
861 
	// Fills the x, y and z channels of blendFactor for the given blend factor mode.
	// 'current' is the incoming color and 'pixel' the color read from the target.
	// The w channel is never written here (see blendFactorAlpha()).
	// For BLEND_ZERO and BLEND_ONE blendFactor is left untouched; presumably the
	// caller special-cases these modes - TODO confirm.
	// The inverse factors are formed as 0xFFFF minus the value.
	void PixelRoutine::blendFactor(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorActive)
	{
		switch(blendFactorActive)
		{
		case BLEND_ZERO:
			// Optimized
			break;
		case BLEND_ONE:
			// Optimized
			break;
		case BLEND_SOURCE:
			blendFactor.x = current.x;
			blendFactor.y = current.y;
			blendFactor.z = current.z;
			break;
		case BLEND_INVSOURCE:
			blendFactor.x = Short4(0xFFFFu) - current.x;
			blendFactor.y = Short4(0xFFFFu) - current.y;
			blendFactor.z = Short4(0xFFFFu) - current.z;
			break;
		case BLEND_DEST:
			blendFactor.x = pixel.x;
			blendFactor.y = pixel.y;
			blendFactor.z = pixel.z;
			break;
		case BLEND_INVDEST:
			blendFactor.x = Short4(0xFFFFu) - pixel.x;
			blendFactor.y = Short4(0xFFFFu) - pixel.y;
			blendFactor.z = Short4(0xFFFFu) - pixel.z;
			break;
		case BLEND_SOURCEALPHA:
			blendFactor.x = current.w;
			blendFactor.y = current.w;
			blendFactor.z = current.w;
			break;
		case BLEND_INVSOURCEALPHA:
			blendFactor.x = Short4(0xFFFFu) - current.w;
			blendFactor.y = Short4(0xFFFFu) - current.w;
			blendFactor.z = Short4(0xFFFFu) - current.w;
			break;
		case BLEND_DESTALPHA:
			blendFactor.x = pixel.w;
			blendFactor.y = pixel.w;
			blendFactor.z = pixel.w;
			break;
		case BLEND_INVDESTALPHA:
			blendFactor.x = Short4(0xFFFFu) - pixel.w;
			blendFactor.y = Short4(0xFFFFu) - pixel.w;
			blendFactor.z = Short4(0xFFFFu) - pixel.w;
			break;
		case BLEND_SRCALPHASAT:
			// min(source alpha, 0xFFFF - destination alpha), compared as unsigned
			blendFactor.x = Short4(0xFFFFu) - pixel.w;
			blendFactor.x = Min(As<UShort4>(blendFactor.x), As<UShort4>(current.w));
			blendFactor.y = blendFactor.x;
			blendFactor.z = blendFactor.x;
			break;
		case BLEND_CONSTANT:
			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[0]));
			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[1]));
			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[2]));
			break;
		case BLEND_INVCONSTANT:
			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[0]));
			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[1]));
			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[2]));
			break;
		case BLEND_CONSTANTALPHA:
			// Element 3 of the blend constant is replicated to all three channels.
			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
			break;
		case BLEND_INVCONSTANTALPHA:
			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
			break;
		default:
			ASSERT(false);
		}
	}
942 
blendFactorAlpha(Vector4s & blendFactor,const Vector4s & current,const Vector4s & pixel,BlendFactor blendFactorAlphaActive)943 	void PixelRoutine::blendFactorAlpha(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorAlphaActive)
944 	{
945 		switch(blendFactorAlphaActive)
946 		{
947 		case BLEND_ZERO:
948 			// Optimized
949 			break;
950 		case BLEND_ONE:
951 			// Optimized
952 			break;
953 		case BLEND_SOURCE:
954 			blendFactor.w = current.w;
955 			break;
956 		case BLEND_INVSOURCE:
957 			blendFactor.w = Short4(0xFFFFu) - current.w;
958 			break;
959 		case BLEND_DEST:
960 			blendFactor.w = pixel.w;
961 			break;
962 		case BLEND_INVDEST:
963 			blendFactor.w = Short4(0xFFFFu) - pixel.w;
964 			break;
965 		case BLEND_SOURCEALPHA:
966 			blendFactor.w = current.w;
967 			break;
968 		case BLEND_INVSOURCEALPHA:
969 			blendFactor.w = Short4(0xFFFFu) - current.w;
970 			break;
971 		case BLEND_DESTALPHA:
972 			blendFactor.w = pixel.w;
973 			break;
974 		case BLEND_INVDESTALPHA:
975 			blendFactor.w = Short4(0xFFFFu) - pixel.w;
976 			break;
977 		case BLEND_SRCALPHASAT:
978 			blendFactor.w = Short4(0xFFFFu);
979 			break;
980 		case BLEND_CONSTANT:
981 		case BLEND_CONSTANTALPHA:
982 			blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
983 			break;
984 		case BLEND_INVCONSTANT:
985 		case BLEND_INVCONSTANTALPHA:
986 			blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
987 			break;
988 		default:
989 			ASSERT(false);
990 		}
991 	}
992 
isSRGB(int index) const993 	bool PixelRoutine::isSRGB(int index) const
994 	{
995 		return Surface::isSRGBformat(state.targetFormat[index]);
996 	}
997 
readPixel(int index,Pointer<Byte> & cBuffer,Int & x,Vector4s & pixel)998 	void PixelRoutine::readPixel(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &pixel)
999 	{
1000 		Short4 c01;
1001 		Short4 c23;
1002 		Pointer<Byte> buffer;
1003 		Pointer<Byte> buffer2;
1004 
1005 		switch(state.targetFormat[index])
1006 		{
1007 		case FORMAT_R5G6B5:
1008 			buffer = cBuffer + 2 * x;
1009 			buffer2 = buffer + *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1010 			c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1011 
1012 			pixel.x = c01 & Short4(0xF800u);
1013 			pixel.y = (c01 & Short4(0x07E0u)) << 5;
1014 			pixel.z = (c01 & Short4(0x001Fu)) << 11;
1015 			pixel.w = Short4(0xFFFFu);
1016 			break;
1017 		case FORMAT_A8R8G8B8:
1018 			buffer = cBuffer + 4 * x;
1019 			c01 = *Pointer<Short4>(buffer);
1020 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1021 			c23 = *Pointer<Short4>(buffer);
1022 			pixel.z = c01;
1023 			pixel.y = c01;
1024 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1025 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1026 			pixel.x = pixel.z;
1027 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1028 			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1029 			pixel.y = pixel.z;
1030 			pixel.w = pixel.x;
1031 			pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1032 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1033 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1034 			pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1035 			break;
1036 		case FORMAT_A8B8G8R8:
1037 		case FORMAT_SRGB8_A8:
1038 			buffer = cBuffer + 4 * x;
1039 			c01 = *Pointer<Short4>(buffer);
1040 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1041 			c23 = *Pointer<Short4>(buffer);
1042 			pixel.z = c01;
1043 			pixel.y = c01;
1044 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1045 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1046 			pixel.x = pixel.z;
1047 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1048 			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1049 			pixel.y = pixel.z;
1050 			pixel.w = pixel.x;
1051 			pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1052 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1053 			pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1054 			pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1055 			break;
1056 		case FORMAT_A8:
1057 			buffer = cBuffer + 1 * x;
1058 			pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 0);
1059 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1060 			pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 1);
1061 			pixel.w = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1062 			pixel.x = Short4(0x0000);
1063 			pixel.y = Short4(0x0000);
1064 			pixel.z = Short4(0x0000);
1065 			break;
1066 		case FORMAT_R8:
1067 			buffer = cBuffer + 1 * x;
1068 			pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 0);
1069 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1070 			pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 1);
1071 			pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1072 			pixel.y = Short4(0x0000);
1073 			pixel.z = Short4(0x0000);
1074 			pixel.w = Short4(0xFFFFu);
1075 			break;
1076 		case FORMAT_X8R8G8B8:
1077 			buffer = cBuffer + 4 * x;
1078 			c01 = *Pointer<Short4>(buffer);
1079 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1080 			c23 = *Pointer<Short4>(buffer);
1081 			pixel.z = c01;
1082 			pixel.y = c01;
1083 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1084 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1085 			pixel.x = pixel.z;
1086 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1087 			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1088 			pixel.y = pixel.z;
1089 			pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1090 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1091 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1092 			pixel.w = Short4(0xFFFFu);
1093 			break;
1094 		case FORMAT_G8R8:
1095 			buffer = cBuffer + 2 * x;
1096 			c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 0));
1097 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1098 			c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 1));
1099 			pixel.x = (c01 & Short4(0x00FFu)) | (c01 << 8);
1100 			pixel.y = (c01 & Short4(0xFF00u)) | As<Short4>(As<UShort4>(c01) >> 8);
1101 			pixel.z = Short4(0x0000u);
1102 			pixel.w = Short4(0xFFFFu);
1103 			break;
1104 		case FORMAT_X8B8G8R8:
1105 		case FORMAT_SRGB8_X8:
1106 			buffer = cBuffer + 4 * x;
1107 			c01 = *Pointer<Short4>(buffer);
1108 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1109 			c23 = *Pointer<Short4>(buffer);
1110 			pixel.z = c01;
1111 			pixel.y = c01;
1112 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1113 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1114 			pixel.x = pixel.z;
1115 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1116 			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1117 			pixel.y = pixel.z;
1118 			pixel.w = pixel.x;
1119 			pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1120 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1121 			pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1122 			pixel.w = Short4(0xFFFFu);
1123 			break;
1124 		case FORMAT_A8G8R8B8Q:
1125 			UNIMPLEMENTED();
1126 		//	pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1127 		//	pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1128 		//	pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1129 		//	pixel.w = UnpackHigh(As<Byte8>(pixel.w), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1130 			break;
1131 		case FORMAT_X8G8R8B8Q:
1132 			UNIMPLEMENTED();
1133 		//	pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1134 		//	pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1135 		//	pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1136 		//	pixel.w = Short4(0xFFFFu);
1137 			break;
1138 		case FORMAT_A16B16G16R16:
1139 			buffer = cBuffer;
1140 			pixel.x = *Pointer<Short4>(buffer + 8 * x);
1141 			pixel.y = *Pointer<Short4>(buffer + 8 * x + 8);
1142 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1143 			pixel.z = *Pointer<Short4>(buffer + 8 * x);
1144 			pixel.w = *Pointer<Short4>(buffer + 8 * x + 8);
1145 			transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
1146 			break;
1147 		case FORMAT_G16R16:
1148 			buffer = cBuffer;
1149 			pixel.x = *Pointer<Short4>(buffer + 4 * x);
1150 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1151 			pixel.y = *Pointer<Short4>(buffer + 4 * x);
1152 			pixel.z = pixel.x;
1153 			pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.y));
1154 			pixel.z = As<Short4>(UnpackHigh(pixel.z, pixel.y));
1155 			pixel.y = pixel.z;
1156 			pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.z));
1157 			pixel.y = As<Short4>(UnpackHigh(pixel.y, pixel.z));
1158 			pixel.z = Short4(0xFFFFu);
1159 			pixel.w = Short4(0xFFFFu);
1160 			break;
1161 		default:
1162 			ASSERT(false);
1163 		}
1164 
1165 		if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
1166 		{
1167 			sRGBtoLinear16_12_16(pixel);
1168 		}
1169 	}
1170 
alphaBlend(int index,Pointer<Byte> & cBuffer,Vector4s & current,Int & x)1171 	void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
1172 	{
1173 		if(!state.alphaBlendActive)
1174 		{
1175 			return;
1176 		}
1177 
1178 		Vector4s pixel;
1179 		readPixel(index, cBuffer, x, pixel);
1180 
1181 		// Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
1182 		Vector4s sourceFactor;
1183 		Vector4s destFactor;
1184 
1185 		blendFactor(sourceFactor, current, pixel, state.sourceBlendFactor);
1186 		blendFactor(destFactor, current, pixel, state.destBlendFactor);
1187 
1188 		if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
1189 		{
1190 			current.x = MulHigh(As<UShort4>(current.x), As<UShort4>(sourceFactor.x));
1191 			current.y = MulHigh(As<UShort4>(current.y), As<UShort4>(sourceFactor.y));
1192 			current.z = MulHigh(As<UShort4>(current.z), As<UShort4>(sourceFactor.z));
1193 		}
1194 
1195 		if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
1196 		{
1197 			pixel.x = MulHigh(As<UShort4>(pixel.x), As<UShort4>(destFactor.x));
1198 			pixel.y = MulHigh(As<UShort4>(pixel.y), As<UShort4>(destFactor.y));
1199 			pixel.z = MulHigh(As<UShort4>(pixel.z), As<UShort4>(destFactor.z));
1200 		}
1201 
1202 		switch(state.blendOperation)
1203 		{
1204 		case BLENDOP_ADD:
1205 			current.x = AddSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1206 			current.y = AddSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1207 			current.z = AddSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
1208 			break;
1209 		case BLENDOP_SUB:
1210 			current.x = SubSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1211 			current.y = SubSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1212 			current.z = SubSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
1213 			break;
1214 		case BLENDOP_INVSUB:
1215 			current.x = SubSat(As<UShort4>(pixel.x), As<UShort4>(current.x));
1216 			current.y = SubSat(As<UShort4>(pixel.y), As<UShort4>(current.y));
1217 			current.z = SubSat(As<UShort4>(pixel.z), As<UShort4>(current.z));
1218 			break;
1219 		case BLENDOP_MIN:
1220 			current.x = Min(As<UShort4>(current.x), As<UShort4>(pixel.x));
1221 			current.y = Min(As<UShort4>(current.y), As<UShort4>(pixel.y));
1222 			current.z = Min(As<UShort4>(current.z), As<UShort4>(pixel.z));
1223 			break;
1224 		case BLENDOP_MAX:
1225 			current.x = Max(As<UShort4>(current.x), As<UShort4>(pixel.x));
1226 			current.y = Max(As<UShort4>(current.y), As<UShort4>(pixel.y));
1227 			current.z = Max(As<UShort4>(current.z), As<UShort4>(pixel.z));
1228 			break;
1229 		case BLENDOP_SOURCE:
1230 			// No operation
1231 			break;
1232 		case BLENDOP_DEST:
1233 			current.x = pixel.x;
1234 			current.y = pixel.y;
1235 			current.z = pixel.z;
1236 			break;
1237 		case BLENDOP_NULL:
1238 			current.x = Short4(0x0000);
1239 			current.y = Short4(0x0000);
1240 			current.z = Short4(0x0000);
1241 			break;
1242 		default:
1243 			ASSERT(false);
1244 		}
1245 
1246 		blendFactorAlpha(sourceFactor, current, pixel, state.sourceBlendFactorAlpha);
1247 		blendFactorAlpha(destFactor, current, pixel, state.destBlendFactorAlpha);
1248 
1249 		if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
1250 		{
1251 			current.w = MulHigh(As<UShort4>(current.w), As<UShort4>(sourceFactor.w));
1252 		}
1253 
1254 		if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
1255 		{
1256 			pixel.w = MulHigh(As<UShort4>(pixel.w), As<UShort4>(destFactor.w));
1257 		}
1258 
1259 		switch(state.blendOperationAlpha)
1260 		{
1261 		case BLENDOP_ADD:
1262 			current.w = AddSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1263 			break;
1264 		case BLENDOP_SUB:
1265 			current.w = SubSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1266 			break;
1267 		case BLENDOP_INVSUB:
1268 			current.w = SubSat(As<UShort4>(pixel.w), As<UShort4>(current.w));
1269 			break;
1270 		case BLENDOP_MIN:
1271 			current.w = Min(As<UShort4>(current.w), As<UShort4>(pixel.w));
1272 			break;
1273 		case BLENDOP_MAX:
1274 			current.w = Max(As<UShort4>(current.w), As<UShort4>(pixel.w));
1275 			break;
1276 		case BLENDOP_SOURCE:
1277 			// No operation
1278 			break;
1279 		case BLENDOP_DEST:
1280 			current.w = pixel.w;
1281 			break;
1282 		case BLENDOP_NULL:
1283 			current.w = Short4(0x0000);
1284 			break;
1285 		default:
1286 			ASSERT(false);
1287 		}
1288 	}
1289 
logicOperation(int index,Pointer<Byte> & cBuffer,Vector4s & current,Int & x)1290 	void PixelRoutine::logicOperation(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
1291 	{
1292 		if(state.logicalOperation == LOGICALOP_COPY)
1293 		{
1294 			return;
1295 		}
1296 
1297 		Vector4s pixel;
1298 		readPixel(index, cBuffer, x, pixel);
1299 
1300 		switch(state.logicalOperation)
1301 		{
1302 		case LOGICALOP_CLEAR:
1303 			current.x = UShort4(0);
1304 			current.y = UShort4(0);
1305 			current.z = UShort4(0);
1306 			break;
1307 		case LOGICALOP_SET:
1308 			current.x = UShort4(0xFFFFu);
1309 			current.y = UShort4(0xFFFFu);
1310 			current.z = UShort4(0xFFFFu);
1311 			break;
1312 		case LOGICALOP_COPY:
1313 			ASSERT(false);   // Optimized out
1314 			break;
1315 		case LOGICALOP_COPY_INVERTED:
1316 			current.x = ~current.x;
1317 			current.y = ~current.y;
1318 			current.z = ~current.z;
1319 			break;
1320 		case LOGICALOP_NOOP:
1321 			current.x = pixel.x;
1322 			current.y = pixel.y;
1323 			current.z = pixel.z;
1324 			break;
1325 		case LOGICALOP_INVERT:
1326 			current.x = ~pixel.x;
1327 			current.y = ~pixel.y;
1328 			current.z = ~pixel.z;
1329 			break;
1330 		case LOGICALOP_AND:
1331 			current.x = pixel.x & current.x;
1332 			current.y = pixel.y & current.y;
1333 			current.z = pixel.z & current.z;
1334 			break;
1335 		case LOGICALOP_NAND:
1336 			current.x = ~(pixel.x & current.x);
1337 			current.y = ~(pixel.y & current.y);
1338 			current.z = ~(pixel.z & current.z);
1339 			break;
1340 		case LOGICALOP_OR:
1341 			current.x = pixel.x | current.x;
1342 			current.y = pixel.y | current.y;
1343 			current.z = pixel.z | current.z;
1344 			break;
1345 		case LOGICALOP_NOR:
1346 			current.x = ~(pixel.x | current.x);
1347 			current.y = ~(pixel.y | current.y);
1348 			current.z = ~(pixel.z | current.z);
1349 			break;
1350 		case LOGICALOP_XOR:
1351 			current.x = pixel.x ^ current.x;
1352 			current.y = pixel.y ^ current.y;
1353 			current.z = pixel.z ^ current.z;
1354 			break;
1355 		case LOGICALOP_EQUIV:
1356 			current.x = ~(pixel.x ^ current.x);
1357 			current.y = ~(pixel.y ^ current.y);
1358 			current.z = ~(pixel.z ^ current.z);
1359 			break;
1360 		case LOGICALOP_AND_REVERSE:
1361 			current.x = ~pixel.x & current.x;
1362 			current.y = ~pixel.y & current.y;
1363 			current.z = ~pixel.z & current.z;
1364 			break;
1365 		case LOGICALOP_AND_INVERTED:
1366 			current.x = pixel.x & ~current.x;
1367 			current.y = pixel.y & ~current.y;
1368 			current.z = pixel.z & ~current.z;
1369 			break;
1370 		case LOGICALOP_OR_REVERSE:
1371 			current.x = ~pixel.x | current.x;
1372 			current.y = ~pixel.y | current.y;
1373 			current.z = ~pixel.z | current.z;
1374 			break;
1375 		case LOGICALOP_OR_INVERTED:
1376 			current.x = pixel.x | ~current.x;
1377 			current.y = pixel.y | ~current.y;
1378 			current.z = pixel.z | ~current.z;
1379 			break;
1380 		default:
1381 			ASSERT(false);
1382 		}
1383 	}
1384 
writeColor(int index,Pointer<Byte> & cBuffer,Int & x,Vector4s & current,Int & sMask,Int & zMask,Int & cMask)1385 	void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &current, Int &sMask, Int &zMask, Int &cMask)
1386 	{
1387 		if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
1388 		{
1389 			linearToSRGB16_12_16(current);
1390 		}
1391 
1392 		if(exactColorRounding)
1393 		{
1394 			switch(state.targetFormat[index])
1395 			{
1396 			case FORMAT_R5G6B5:
1397 				current.x = AddSat(As<UShort4>(current.x), UShort4(0x0400));
1398 				current.y = AddSat(As<UShort4>(current.y), UShort4(0x0200));
1399 				current.z = AddSat(As<UShort4>(current.z), UShort4(0x0400));
1400 				break;
1401 			case FORMAT_X8G8R8B8Q:
1402 			case FORMAT_A8G8R8B8Q:
1403 			case FORMAT_X8R8G8B8:
1404 			case FORMAT_X8B8G8R8:
1405 			case FORMAT_A8R8G8B8:
1406 			case FORMAT_A8B8G8R8:
1407 			case FORMAT_SRGB8_X8:
1408 			case FORMAT_SRGB8_A8:
1409 			case FORMAT_G8R8:
1410 			case FORMAT_R8:
1411 				current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080);
1412 				current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080);
1413 				current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080);
1414 				current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080);
1415 				break;
1416 			default:
1417 				break;
1418 			}
1419 		}
1420 
1421 		int rgbaWriteMask = state.colorWriteActive(index);
1422 		int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
1423 
1424 		switch(state.targetFormat[index])
1425 		{
1426 		case FORMAT_R5G6B5:
1427 			{
1428 				current.x = current.x & Short4(0xF800u);
1429 				current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
1430 				current.z = As<UShort4>(current.z) >> 11;
1431 
1432 				current.x = current.x | current.y | current.z;
1433 			}
1434 			break;
1435 		case FORMAT_X8G8R8B8Q:
1436 			UNIMPLEMENTED();
1437 		//	current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1438 		//	current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1439 		//	current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1440 
1441 		//	current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1442 		//	current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1443 			break;
1444 		case FORMAT_A8G8R8B8Q:
1445 			UNIMPLEMENTED();
1446 		//	current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1447 		//	current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1448 		//	current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1449 		//	current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1450 
1451 		//	current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1452 		//	current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1453 			break;
1454 		case FORMAT_X8R8G8B8:
1455 		case FORMAT_A8R8G8B8:
1456 			if(state.targetFormat[index] == FORMAT_X8R8G8B8 || rgbaWriteMask == 0x7)
1457 			{
1458 				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1459 				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1460 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1461 
1462 				current.z = As<Short4>(PackUnsigned(current.z, current.x));
1463 				current.y = As<Short4>(PackUnsigned(current.y, current.y));
1464 
1465 				current.x = current.z;
1466 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1467 				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1468 				current.y = current.z;
1469 				current.z = As<Short4>(UnpackLow(current.z, current.x));
1470 				current.y = As<Short4>(UnpackHigh(current.y, current.x));
1471 			}
1472 			else
1473 			{
1474 				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1475 				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1476 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1477 				current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1478 
1479 				current.z = As<Short4>(PackUnsigned(current.z, current.x));
1480 				current.y = As<Short4>(PackUnsigned(current.y, current.w));
1481 
1482 				current.x = current.z;
1483 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1484 				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1485 				current.y = current.z;
1486 				current.z = As<Short4>(UnpackLow(current.z, current.x));
1487 				current.y = As<Short4>(UnpackHigh(current.y, current.x));
1488 			}
1489 			break;
1490 		case FORMAT_X8B8G8R8:
1491 		case FORMAT_A8B8G8R8:
1492 		case FORMAT_SRGB8_X8:
1493 		case FORMAT_SRGB8_A8:
1494 			if(state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8 || rgbaWriteMask == 0x7)
1495 			{
1496 				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1497 				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1498 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1499 
1500 				current.z = As<Short4>(PackUnsigned(current.x, current.z));
1501 				current.y = As<Short4>(PackUnsigned(current.y, current.y));
1502 
1503 				current.x = current.z;
1504 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1505 				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1506 				current.y = current.z;
1507 				current.z = As<Short4>(UnpackLow(current.z, current.x));
1508 				current.y = As<Short4>(UnpackHigh(current.y, current.x));
1509 			}
1510 			else
1511 			{
1512 				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1513 				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1514 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1515 				current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1516 
1517 				current.z = As<Short4>(PackUnsigned(current.x, current.z));
1518 				current.y = As<Short4>(PackUnsigned(current.y, current.w));
1519 
1520 				current.x = current.z;
1521 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1522 				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1523 				current.y = current.z;
1524 				current.z = As<Short4>(UnpackLow(current.z, current.x));
1525 				current.y = As<Short4>(UnpackHigh(current.y, current.x));
1526 			}
1527 			break;
1528 		case FORMAT_G8R8:
1529 			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1530 			current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1531 			current.x = As<Short4>(PackUnsigned(current.x, current.x));
1532 			current.y = As<Short4>(PackUnsigned(current.y, current.y));
1533 			current.x = UnpackLow(As<Byte8>(current.x), As<Byte8>(current.y));
1534 			break;
1535 		case FORMAT_R8:
1536 			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1537 			current.x = As<Short4>(PackUnsigned(current.x, current.x));
1538 			break;
1539 		case FORMAT_A8:
1540 			current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1541 			current.w = As<Short4>(PackUnsigned(current.w, current.w));
1542 			break;
1543 		case FORMAT_G16R16:
1544 			current.z = current.x;
1545 			current.x = As<Short4>(UnpackLow(current.x, current.y));
1546 			current.z = As<Short4>(UnpackHigh(current.z, current.y));
1547 			current.y = current.z;
1548 			break;
1549 		case FORMAT_A16B16G16R16:
1550 			transpose4x4(current.x, current.y, current.z, current.w);
1551 			break;
1552 		default:
1553 			ASSERT(false);
1554 		}
1555 
1556 		Short4 c01 = current.z;
1557 		Short4 c23 = current.y;
1558 
1559 		Int xMask;   // Combination of all masks
1560 
1561 		if(state.depthTestActive)
1562 		{
1563 			xMask = zMask;
1564 		}
1565 		else
1566 		{
1567 			xMask = cMask;
1568 		}
1569 
1570 		if(state.stencilActive)
1571 		{
1572 			xMask &= sMask;
1573 		}
1574 
1575 		switch(state.targetFormat[index])
1576 		{
1577 		case FORMAT_R5G6B5:
1578 			{
1579 				Pointer<Byte> buffer = cBuffer + 2 * x;
1580 				Int value = *Pointer<Int>(buffer);
1581 
1582 				Int c01 = Extract(As<Int2>(current.x), 0);
1583 
1584 				if((bgraWriteMask & 0x00000007) != 0x00000007)
1585 				{
1586 					Int masked = value;
1587 					c01 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
1588 					masked &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[~bgraWriteMask & 0x7][0]));
1589 					c01 |= masked;
1590 				}
1591 
1592 				c01 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][0]) + xMask * 8);
1593 				value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][0]) + xMask * 8);
1594 				c01 |= value;
1595 				*Pointer<Int>(buffer) = c01;
1596 
1597 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1598 				value = *Pointer<Int>(buffer);
1599 
1600 				Int c23 = Extract(As<Int2>(current.x), 1);
1601 
1602 				if((bgraWriteMask & 0x00000007) != 0x00000007)
1603 				{
1604 					Int masked = value;
1605 					c23 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
1606 					masked &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[~bgraWriteMask & 0x7][0]));
1607 					c23 |= masked;
1608 				}
1609 
1610 				c23 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][2]) + xMask * 8);
1611 				value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][2]) + xMask * 8);
1612 				c23 |= value;
1613 				*Pointer<Int>(buffer) = c23;
1614 			}
1615 			break;
1616 		case FORMAT_A8G8R8B8Q:
1617 		case FORMAT_X8G8R8B8Q:   // FIXME: Don't touch alpha?
1618 			UNIMPLEMENTED();
1619 		//	value = *Pointer<Short4>(cBuffer + 8 * x + 0);
1620 
1621 		//	if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
1622 		//	   ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
1623 		//	    (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1624 		//	{
1625 		//		Short4 masked = value;
1626 		//		c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1627 		//		masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1628 		//		c01 |= masked;
1629 		//	}
1630 
1631 		//	c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1632 		//	value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1633 		//	c01 |= value;
1634 		//	*Pointer<Short4>(cBuffer + 8 * x + 0) = c01;
1635 
1636 		//	value = *Pointer<Short4>(cBuffer + 8 * x + 8);
1637 
1638 		//	if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
1639 		//	   ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
1640 		//	    (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1641 		//	{
1642 		//		Short4 masked = value;
1643 		//		c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1644 		//		masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1645 		//		c23 |= masked;
1646 		//	}
1647 
1648 		//	c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1649 		//	value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1650 		//	c23 |= value;
1651 		//	*Pointer<Short4>(cBuffer + 8 * x + 8) = c23;
1652 			break;
1653 		case FORMAT_A8R8G8B8:
1654 		case FORMAT_X8R8G8B8:   // FIXME: Don't touch alpha?
1655 			{
1656 				Pointer<Byte> buffer = cBuffer + x * 4;
1657 				Short4 value = *Pointer<Short4>(buffer);
1658 
1659 				if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
1660 				   ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
1661 					(state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1662 				{
1663 					Short4 masked = value;
1664 					c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1665 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1666 					c01 |= masked;
1667 				}
1668 
1669 				c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1670 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1671 				c01 |= value;
1672 				*Pointer<Short4>(buffer) = c01;
1673 
1674 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1675 				value = *Pointer<Short4>(buffer);
1676 
1677 				if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
1678 				   ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
1679 					(state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1680 				{
1681 					Short4 masked = value;
1682 					c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1683 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1684 					c23 |= masked;
1685 				}
1686 
1687 #ifdef __APPLE__
1688 				// On Mac we render directly to an IOSurface that isn't vertically padded. So we
1689 				// only render the bottom half of quads when it won't overflow the buffer.
1690 				If ((y + 1) < yMax)
1691 #endif
1692 				{
1693 					c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1694 					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1695 					c23 |= value;
1696 					*Pointer<Short4>(buffer) = c23;
1697 				}
1698 			}
1699 			break;
1700 		case FORMAT_A8B8G8R8:
1701 		case FORMAT_X8B8G8R8:   // FIXME: Don't touch alpha?
1702 		case FORMAT_SRGB8_X8:
1703 		case FORMAT_SRGB8_A8:
1704 			{
1705 				Pointer<Byte> buffer = cBuffer + x * 4;
1706 				Short4 value = *Pointer<Short4>(buffer);
1707 
1708 				bool masked = (((state.targetFormat[index] == FORMAT_A8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_A8) && rgbaWriteMask != 0x0000000F) ||
1709 				              (((state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8) && rgbaWriteMask != 0x00000007) &&
1710 				               ((state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8) && rgbaWriteMask != 0x0000000F))); // FIXME: Need for masking when XBGR && Fh?
1711 
1712 				if(masked)
1713 				{
1714 					Short4 masked = value;
1715 					c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
1716 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
1717 					c01 |= masked;
1718 				}
1719 
1720 				c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1721 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1722 				c01 |= value;
1723 				*Pointer<Short4>(buffer) = c01;
1724 
1725 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1726 				value = *Pointer<Short4>(buffer);
1727 
1728 				if(masked)
1729 				{
1730 					Short4 masked = value;
1731 					c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
1732 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
1733 					c23 |= masked;
1734 				}
1735 
1736 				c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1737 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1738 				c23 |= value;
1739 				*Pointer<Short4>(buffer) = c23;
1740 			}
1741 			break;
1742 		case FORMAT_G8R8:
1743 			if((rgbaWriteMask & 0x00000003) != 0x0)
1744 			{
1745 				Pointer<Byte> buffer = cBuffer + 2 * x;
1746 				Int2 value;
1747 				value = Insert(value, *Pointer<Int>(buffer), 0);
1748 				Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1749 				value = Insert(value, *Pointer<Int>(buffer + pitch), 1);
1750 
1751 				Int2 packedCol = As<Int2>(current.x);
1752 
1753 				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
1754 				if((rgbaWriteMask & 0x3) != 0x3)
1755 				{
1756 					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
1757 					UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
1758 					mergedMask &= rgbaMask;
1759 				}
1760 
1761 				packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask));
1762 
1763 				*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
1764 				*Pointer<UInt>(buffer + pitch) = As<UInt>(Extract(packedCol, 1));
1765 			}
1766 			break;
1767 		case FORMAT_R8:
1768 			if(rgbaWriteMask & 0x00000001)
1769 			{
1770 				Pointer<Byte> buffer = cBuffer + 1 * x;
1771 				Short4 value;
1772 				value = Insert(value, *Pointer<Short>(buffer), 0);
1773 				Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1774 				value = Insert(value, *Pointer<Short>(buffer + pitch), 1);
1775 
1776 				current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask);
1777 				value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask);
1778 				current.x |= value;
1779 
1780 				*Pointer<Short>(buffer) = Extract(current.x, 0);
1781 				*Pointer<Short>(buffer + pitch) = Extract(current.x, 1);
1782 			}
1783 			break;
1784 		case FORMAT_A8:
1785 			if(rgbaWriteMask & 0x00000008)
1786 			{
1787 				Pointer<Byte> buffer = cBuffer + 1 * x;
1788 				Short4 value;
1789 				value = Insert(value, *Pointer<Short>(buffer), 0);
1790 				Int pitch = *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1791 				value = Insert(value, *Pointer<Short>(buffer + pitch), 1);
1792 
1793 				current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q) + 8 * xMask);
1794 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q) + 8 * xMask);
1795 				current.w |= value;
1796 
1797 				*Pointer<Short>(buffer) = Extract(current.w, 0);
1798 				*Pointer<Short>(buffer + pitch) = Extract(current.w, 1);
1799 			}
1800 			break;
1801 		case FORMAT_G16R16:
1802 			{
1803 				Pointer<Byte> buffer = cBuffer + 4 * x;
1804 
1805 				Short4 value = *Pointer<Short4>(buffer);
1806 
1807 				if((rgbaWriteMask & 0x00000003) != 0x00000003)
1808 				{
1809 					Short4 masked = value;
1810 					current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1811 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0]));
1812 					current.x |= masked;
1813 				}
1814 
1815 				current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1816 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1817 				current.x |= value;
1818 				*Pointer<Short4>(buffer) = current.x;
1819 
1820 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1821 
1822 				value = *Pointer<Short4>(buffer);
1823 
1824 				if((rgbaWriteMask & 0x00000003) != 0x00000003)
1825 				{
1826 					Short4 masked = value;
1827 					current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1828 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0]));
1829 					current.y |= masked;
1830 				}
1831 
1832 				current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1833 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1834 				current.y |= value;
1835 				*Pointer<Short4>(buffer) = current.y;
1836 			}
1837 			break;
1838 		case FORMAT_A16B16G16R16:
1839 			{
1840 				Pointer<Byte> buffer = cBuffer + 8 * x;
1841 
1842 				{
1843 					Short4 value = *Pointer<Short4>(buffer);
1844 
1845 					if(rgbaWriteMask != 0x0000000F)
1846 					{
1847 						Short4 masked = value;
1848 						current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1849 						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1850 						current.x |= masked;
1851 					}
1852 
1853 					current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ0Q) + xMask * 8);
1854 					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ0Q) + xMask * 8);
1855 					current.x |= value;
1856 					*Pointer<Short4>(buffer) = current.x;
1857 				}
1858 
1859 				{
1860 					Short4 value = *Pointer<Short4>(buffer + 8);
1861 
1862 					if(rgbaWriteMask != 0x0000000F)
1863 					{
1864 						Short4 masked = value;
1865 						current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1866 						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1867 						current.y |= masked;
1868 					}
1869 
1870 					current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ1Q) + xMask * 8);
1871 					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ1Q) + xMask * 8);
1872 					current.y |= value;
1873 					*Pointer<Short4>(buffer + 8) = current.y;
1874 				}
1875 
1876 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1877 
1878 				{
1879 					Short4 value = *Pointer<Short4>(buffer);
1880 
1881 					if(rgbaWriteMask != 0x0000000F)
1882 					{
1883 						Short4 masked = value;
1884 						current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1885 						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1886 						current.z |= masked;
1887 					}
1888 
1889 					current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ2Q) + xMask * 8);
1890 					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ2Q) + xMask * 8);
1891 					current.z |= value;
1892 					*Pointer<Short4>(buffer) = current.z;
1893 				}
1894 
1895 				{
1896 					Short4 value = *Pointer<Short4>(buffer + 8);
1897 
1898 					if(rgbaWriteMask != 0x0000000F)
1899 					{
1900 						Short4 masked = value;
1901 						current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1902 						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1903 						current.w |= masked;
1904 					}
1905 
1906 					current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ3Q) + xMask * 8);
1907 					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ3Q) + xMask * 8);
1908 					current.w |= value;
1909 					*Pointer<Short4>(buffer + 8) = current.w;
1910 				}
1911 			}
1912 			break;
1913 		default:
1914 			ASSERT(false);
1915 		}
1916 	}
1917 
blendFactor(Vector4f & blendFactor,const Vector4f & oC,const Vector4f & pixel,BlendFactor blendFactorActive)1918 	void PixelRoutine::blendFactor(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorActive)
1919 	{
1920 		switch(blendFactorActive)
1921 		{
1922 		case BLEND_ZERO:
1923 			// Optimized
1924 			break;
1925 		case BLEND_ONE:
1926 			// Optimized
1927 			break;
1928 		case BLEND_SOURCE:
1929 			blendFactor.x = oC.x;
1930 			blendFactor.y = oC.y;
1931 			blendFactor.z = oC.z;
1932 			break;
1933 		case BLEND_INVSOURCE:
1934 			blendFactor.x = Float4(1.0f) - oC.x;
1935 			blendFactor.y = Float4(1.0f) - oC.y;
1936 			blendFactor.z = Float4(1.0f) - oC.z;
1937 			break;
1938 		case BLEND_DEST:
1939 			blendFactor.x = pixel.x;
1940 			blendFactor.y = pixel.y;
1941 			blendFactor.z = pixel.z;
1942 			break;
1943 		case BLEND_INVDEST:
1944 			blendFactor.x = Float4(1.0f) - pixel.x;
1945 			blendFactor.y = Float4(1.0f) - pixel.y;
1946 			blendFactor.z = Float4(1.0f) - pixel.z;
1947 			break;
1948 		case BLEND_SOURCEALPHA:
1949 			blendFactor.x = oC.w;
1950 			blendFactor.y = oC.w;
1951 			blendFactor.z = oC.w;
1952 			break;
1953 		case BLEND_INVSOURCEALPHA:
1954 			blendFactor.x = Float4(1.0f) - oC.w;
1955 			blendFactor.y = Float4(1.0f) - oC.w;
1956 			blendFactor.z = Float4(1.0f) - oC.w;
1957 			break;
1958 		case BLEND_DESTALPHA:
1959 			blendFactor.x = pixel.w;
1960 			blendFactor.y = pixel.w;
1961 			blendFactor.z = pixel.w;
1962 			break;
1963 		case BLEND_INVDESTALPHA:
1964 			blendFactor.x = Float4(1.0f) - pixel.w;
1965 			blendFactor.y = Float4(1.0f) - pixel.w;
1966 			blendFactor.z = Float4(1.0f) - pixel.w;
1967 			break;
1968 		case BLEND_SRCALPHASAT:
1969 			blendFactor.x = Float4(1.0f) - pixel.w;
1970 			blendFactor.x = Min(blendFactor.x, oC.w);
1971 			blendFactor.y = blendFactor.x;
1972 			blendFactor.z = blendFactor.x;
1973 			break;
1974 		case BLEND_CONSTANT:
1975 			blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[0]));
1976 			blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[1]));
1977 			blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[2]));
1978 			break;
1979 		case BLEND_INVCONSTANT:
1980 			blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[0]));
1981 			blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[1]));
1982 			blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[2]));
1983 			break;
1984 		default:
1985 			ASSERT(false);
1986 		}
1987 	}
1988 
blendFactorAlpha(Vector4f & blendFactor,const Vector4f & oC,const Vector4f & pixel,BlendFactor blendFactorAlphaActive)1989 	void PixelRoutine::blendFactorAlpha(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorAlphaActive)
1990 	{
1991 		switch(blendFactorAlphaActive)
1992 		{
1993 		case BLEND_ZERO:
1994 			// Optimized
1995 			break;
1996 		case BLEND_ONE:
1997 			// Optimized
1998 			break;
1999 		case BLEND_SOURCE:
2000 			blendFactor.w = oC.w;
2001 			break;
2002 		case BLEND_INVSOURCE:
2003 			blendFactor.w = Float4(1.0f) - oC.w;
2004 			break;
2005 		case BLEND_DEST:
2006 			blendFactor.w = pixel.w;
2007 			break;
2008 		case BLEND_INVDEST:
2009 			blendFactor.w = Float4(1.0f) - pixel.w;
2010 			break;
2011 		case BLEND_SOURCEALPHA:
2012 			blendFactor.w = oC.w;
2013 			break;
2014 		case BLEND_INVSOURCEALPHA:
2015 			blendFactor.w = Float4(1.0f) - oC.w;
2016 			break;
2017 		case BLEND_DESTALPHA:
2018 			blendFactor.w = pixel.w;
2019 			break;
2020 		case BLEND_INVDESTALPHA:
2021 			blendFactor.w = Float4(1.0f) - pixel.w;
2022 			break;
2023 		case BLEND_SRCALPHASAT:
2024 			blendFactor.w = Float4(1.0f);
2025 			break;
2026 		case BLEND_CONSTANT:
2027 			blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
2028 			break;
2029 		case BLEND_INVCONSTANT:
2030 			blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
2031 			break;
2032 		default:
2033 			ASSERT(false);
2034 		}
2035 	}
2036 
alphaBlend(int index,Pointer<Byte> & cBuffer,Vector4f & oC,Int & x)2037 	void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4f &oC, Int &x)
2038 	{
2039 		if(!state.alphaBlendActive)
2040 		{
2041 			return;
2042 		}
2043 
2044 		Pointer<Byte> buffer;
2045 		Vector4f pixel;
2046 
2047 		Vector4s color;
2048 		Short4 c01;
2049 		Short4 c23;
2050 
2051 		Float4 one;
2052 		if(Surface::isFloatFormat(state.targetFormat[index]))
2053 		{
2054 			one = Float4(1.0f);
2055 		}
2056 		else if(Surface::isNonNormalizedInteger(state.targetFormat[index]))
2057 		{
2058 			one = As<Float4>(Surface::isUnsignedComponent(state.targetFormat[index], 0) ? Int4(0xFFFFFFFF) : Int4(0x7FFFFFFF));
2059 		}
2060 
2061 		switch(state.targetFormat[index])
2062 		{
2063 		case FORMAT_R32I:
2064 		case FORMAT_R32UI:
2065 		case FORMAT_R32F:
2066 			buffer = cBuffer;
2067 			// FIXME: movlps
2068 			pixel.x.x = *Pointer<Float>(buffer + 4 * x + 0);
2069 			pixel.x.y = *Pointer<Float>(buffer + 4 * x + 4);
2070 			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2071 			// FIXME: movhps
2072 			pixel.x.z = *Pointer<Float>(buffer + 4 * x + 0);
2073 			pixel.x.w = *Pointer<Float>(buffer + 4 * x + 4);
2074 			pixel.y = pixel.z = pixel.w = one;
2075 			break;
2076 		case FORMAT_G32R32I:
2077 		case FORMAT_G32R32UI:
2078 		case FORMAT_G32R32F:
2079 			buffer = cBuffer;
2080 			pixel.x = *Pointer<Float4>(buffer + 8 * x, 16);
2081 			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2082 			pixel.y = *Pointer<Float4>(buffer + 8 * x, 16);
2083 			pixel.z = pixel.x;
2084 			pixel.x = ShuffleLowHigh(pixel.x, pixel.y, 0x0202);
2085 			pixel.z = ShuffleLowHigh(pixel.z, pixel.y, 0x1313);
2086 			pixel.y = pixel.z;
2087 			pixel.z = pixel.w = one;
2088 			break;
2089 		case FORMAT_X32B32G32R32F:
2090 		case FORMAT_A32B32G32R32F:
2091 		case FORMAT_X32B32G32R32F_UNSIGNED:
2092 		case FORMAT_A32B32G32R32I:
2093 		case FORMAT_A32B32G32R32UI:
2094 			buffer = cBuffer;
2095 			pixel.x = *Pointer<Float4>(buffer + 16 * x, 16);
2096 			pixel.y = *Pointer<Float4>(buffer + 16 * x + 16, 16);
2097 			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2098 			pixel.z = *Pointer<Float4>(buffer + 16 * x, 16);
2099 			pixel.w = *Pointer<Float4>(buffer + 16 * x + 16, 16);
2100 			transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
2101 			if(state.targetFormat[index] == FORMAT_X32B32G32R32F ||
2102 			   state.targetFormat[index] == FORMAT_X32B32G32R32F_UNSIGNED)
2103 			{
2104 				pixel.w = Float4(1.0f);
2105 			}
2106 			break;
2107 		default:
2108 			ASSERT(false);
2109 		}
2110 
2111 		if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
2112 		{
2113 			sRGBtoLinear(pixel.x);
2114 			sRGBtoLinear(pixel.y);
2115 			sRGBtoLinear(pixel.z);
2116 		}
2117 
2118 		// Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
2119 		Vector4f sourceFactor;
2120 		Vector4f destFactor;
2121 
2122 		blendFactor(sourceFactor, oC, pixel, state.sourceBlendFactor);
2123 		blendFactor(destFactor, oC, pixel, state.destBlendFactor);
2124 
2125 		if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
2126 		{
2127 			oC.x *= sourceFactor.x;
2128 			oC.y *= sourceFactor.y;
2129 			oC.z *= sourceFactor.z;
2130 		}
2131 
2132 		if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
2133 		{
2134 			pixel.x *= destFactor.x;
2135 			pixel.y *= destFactor.y;
2136 			pixel.z *= destFactor.z;
2137 		}
2138 
2139 		switch(state.blendOperation)
2140 		{
2141 		case BLENDOP_ADD:
2142 			oC.x += pixel.x;
2143 			oC.y += pixel.y;
2144 			oC.z += pixel.z;
2145 			break;
2146 		case BLENDOP_SUB:
2147 			oC.x -= pixel.x;
2148 			oC.y -= pixel.y;
2149 			oC.z -= pixel.z;
2150 			break;
2151 		case BLENDOP_INVSUB:
2152 			oC.x = pixel.x - oC.x;
2153 			oC.y = pixel.y - oC.y;
2154 			oC.z = pixel.z - oC.z;
2155 			break;
2156 		case BLENDOP_MIN:
2157 			oC.x = Min(oC.x, pixel.x);
2158 			oC.y = Min(oC.y, pixel.y);
2159 			oC.z = Min(oC.z, pixel.z);
2160 			break;
2161 		case BLENDOP_MAX:
2162 			oC.x = Max(oC.x, pixel.x);
2163 			oC.y = Max(oC.y, pixel.y);
2164 			oC.z = Max(oC.z, pixel.z);
2165 			break;
2166 		case BLENDOP_SOURCE:
2167 			// No operation
2168 			break;
2169 		case BLENDOP_DEST:
2170 			oC.x = pixel.x;
2171 			oC.y = pixel.y;
2172 			oC.z = pixel.z;
2173 			break;
2174 		case BLENDOP_NULL:
2175 			oC.x = Float4(0.0f);
2176 			oC.y = Float4(0.0f);
2177 			oC.z = Float4(0.0f);
2178 			break;
2179 		default:
2180 			ASSERT(false);
2181 		}
2182 
2183 		blendFactorAlpha(sourceFactor, oC, pixel, state.sourceBlendFactorAlpha);
2184 		blendFactorAlpha(destFactor, oC, pixel, state.destBlendFactorAlpha);
2185 
2186 		if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
2187 		{
2188 			oC.w *= sourceFactor.w;
2189 		}
2190 
2191 		if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
2192 		{
2193 			pixel.w *= destFactor.w;
2194 		}
2195 
2196 		switch(state.blendOperationAlpha)
2197 		{
2198 		case BLENDOP_ADD:
2199 			oC.w += pixel.w;
2200 			break;
2201 		case BLENDOP_SUB:
2202 			oC.w -= pixel.w;
2203 			break;
2204 		case BLENDOP_INVSUB:
2205 			pixel.w -= oC.w;
2206 			oC.w = pixel.w;
2207 			break;
2208 		case BLENDOP_MIN:
2209 			oC.w = Min(oC.w, pixel.w);
2210 			break;
2211 		case BLENDOP_MAX:
2212 			oC.w = Max(oC.w, pixel.w);
2213 			break;
2214 		case BLENDOP_SOURCE:
2215 			// No operation
2216 			break;
2217 		case BLENDOP_DEST:
2218 			oC.w = pixel.w;
2219 			break;
2220 		case BLENDOP_NULL:
2221 			oC.w = Float4(0.0f);
2222 			break;
2223 		default:
2224 			ASSERT(false);
2225 		}
2226 	}
2227 
	// Emits the masked write of the colour oC to render target `index` for the
	// float and non-normalized integer formats listed in the switches below.
	// Two adjacent pixels are written on the row at cBuffer and two on the
	// following row (cBuffer + DrawData::colorPitchB[index]).
	//
	// index   - render target slot; selects state.targetFormat[index], the colour
	//           write mask and the row pitch.
	// cBuffer - base pointer the per-format byte offset (bytes per pixel * x) is
	//           added to.
	// x       - pixel column used to compute that byte offset.
	// oC      - colour to write; it is rearranged and masked in place, so its
	//           contents are not preserved for the caller.
	// sMask, zMask, cMask - stencil, depth and coverage masks; combined into
	//           xMask, which indexes the per-pixel mask tables in `constants`.
	//
	// Every store is a read-modify-write: bits selected by the mask tables come
	// from oC, all other bits keep the value already in the buffer.
	// Formats not listed hit ASSERT(false).
	void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4f &oC, Int &sMask, Int &zMask, Int &cMask)
	{
		// Rearrange oC so that each register matches what one store below writes.
		switch(state.targetFormat[index])
		{
		case FORMAT_R32F:
		case FORMAT_R32I:
		case FORMAT_R32UI:
		case FORMAT_R16I:
		case FORMAT_R16UI:
		case FORMAT_R8I:
		case FORMAT_R8UI:
			// Single channel: oC.x is used as is.
			break;
		case FORMAT_G32R32F:
		case FORMAT_G32R32I:
		case FORMAT_G32R32UI:
		case FORMAT_G16R16I:
		case FORMAT_G16R16UI:
		case FORMAT_G8R8I:
		case FORMAT_G8R8UI:
			// Two channels: combine oC.x and oC.y with UnpackLow/UnpackHigh so
			// that oC.x feeds the first row's store and oC.y the second row's.
			oC.z = oC.x;
			oC.x = UnpackLow(oC.x, oC.y);
			oC.z = UnpackHigh(oC.z, oC.y);
			oC.y = oC.z;
			break;
		case FORMAT_X32B32G32R32F:
		case FORMAT_A32B32G32R32F:
		case FORMAT_X32B32G32R32F_UNSIGNED:
		case FORMAT_A32B32G32R32I:
		case FORMAT_A32B32G32R32UI:
		case FORMAT_A16B16G16R16I:
		case FORMAT_A16B16G16R16UI:
		case FORMAT_A8B8G8R8I:
		case FORMAT_A8B8G8R8UI:
			// Four channels: after the transpose oC.x/oC.y are stored to the
			// first row and oC.z/oC.w to the second row.
			transpose4x4(oC.x, oC.y, oC.z, oC.w);
			break;
		default:
			ASSERT(false);
		}

		// Known while generating the routine (plain int), so the channel-mask
		// branches below are resolved at generation time.
		int rgbaWriteMask = state.colorWriteActive(index);

		Int xMask;   // Combination of all masks

		// Start from the depth mask when depth testing is active, otherwise
		// from the coverage mask.
		if(state.depthTestActive)
		{
			xMask = zMask;
		}
		else
		{
			xMask = cMask;
		}

		if(state.stencilActive)
		{
			xMask &= sMask;
		}

		Pointer<Byte> buffer;
		Float4 value;   // Existing buffer contents, for the 32-bit-per-channel cases

		switch(state.targetFormat[index])
		{
		case FORMAT_R32F:
		case FORMAT_R32I:
		case FORMAT_R32UI:
			// Skipped entirely when bit 0 of the write mask is clear.
			if(rgbaWriteMask & 0x00000001)
			{
				buffer = cBuffer + 4 * x;

				// Gather the four existing values: two from this row...
				// FIXME: movlps
				value.x = *Pointer<Float>(buffer + 0);
				value.y = *Pointer<Float>(buffer + 4);

				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));

				// ...and two from the next row.
				// FIXME: movhps
				value.z = *Pointer<Float>(buffer + 0);
				value.w = *Pointer<Float>(buffer + 4);

				// Take oC.x where maskD4X[xMask] is set, the old value where
				// invMaskD4X[xMask] is set.
				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + xMask * 16, 16));
				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + xMask * 16, 16));
				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));

				// buffer still points at the second row, so it is written first.
				// FIXME: movhps
				*Pointer<Float>(buffer + 0) = oC.x.z;
				*Pointer<Float>(buffer + 4) = oC.x.w;

				buffer -= *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));

				// FIXME: movlps
				*Pointer<Float>(buffer + 0) = oC.x.x;
				*Pointer<Float>(buffer + 4) = oC.x.y;
			}
			break;
		case FORMAT_R16I:
		case FORMAT_R16UI:
			if(rgbaWriteMask & 0x00000001)
			{
				buffer = cBuffer + 2 * x;

				// Each 32-bit read covers two 16-bit pixels of one row.
				UShort4 xyzw;
				xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 0));

				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));

				xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 1));
				// Widen the old values to 32 bits per lane so they can be merged
				// with oC.x using the same masks as the 32-bit formats.
				value = As<Float4>(Int4(xyzw));

				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));

				// Narrow each lane back to 16 bits and store it; the second row
				// is written first because buffer still points at it.
				if(state.targetFormat[index] == FORMAT_R16I)
				{
					Float component = oC.x.z;
					*Pointer<Short>(buffer + 0) = Short(As<Int>(component));
					component = oC.x.w;
					*Pointer<Short>(buffer + 2) = Short(As<Int>(component));

					buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));

					component = oC.x.x;
					*Pointer<Short>(buffer + 0) = Short(As<Int>(component));
					component = oC.x.y;
					*Pointer<Short>(buffer + 2) = Short(As<Int>(component));
				}
				else // FORMAT_R16UI
				{
					Float component = oC.x.z;
					*Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
					component = oC.x.w;
					*Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));

					buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));

					component = oC.x.x;
					*Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
					component = oC.x.y;
					*Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
				}
			}
			break;
		case FORMAT_R8I:
		case FORMAT_R8UI:
			if(rgbaWriteMask & 0x00000001)
			{
				buffer = cBuffer + x;

				UInt xyzw, packedCol;

				// Old contents: first row's two bytes in the low half, second
				// row's two bytes in the high half of xyzw.
				xyzw = UInt(*Pointer<UShort>(buffer)) & 0xFFFF;
				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
				xyzw |= UInt(*Pointer<UShort>(buffer)) << 16;

				// Narrow the four 32-bit lanes to bytes, using the signed or
				// unsigned pack depending on the format.
				Short4 tmpCol = Short4(As<Int4>(oC.x));
				if(state.targetFormat[index] == FORMAT_R8I)
				{
					tmpCol = As<Short4>(PackSigned(tmpCol, tmpCol));
				}
				else
				{
					tmpCol = As<Short4>(PackUnsigned(tmpCol, tmpCol));
				}
				packedCol = Extract(As<Int2>(tmpCol), 0);

				// Only the first 32 bits of the selected 8-byte table entry are used.
				packedCol = (packedCol & *Pointer<UInt>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask)) |
				            (xyzw & *Pointer<UInt>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask));

				// High half goes to the second row, low half to the first.
				*Pointer<UShort>(buffer) = UShort(packedCol >> 16);
				buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
				*Pointer<UShort>(buffer) = UShort(packedCol);
			}
			break;
		case FORMAT_G32R32F:
		case FORMAT_G32R32I:
		case FORMAT_G32R32UI:
			buffer = cBuffer + 8 * x;

			// First row (two pixels, 16 bytes).
			value = *Pointer<Float4>(buffer);

			// Channel mask: keep the old contents of any disabled channel.
			// The complement of the write mask selects the matching inverse entry.
			if((rgbaWriteMask & 0x00000003) != 0x00000003)
			{
				Float4 masked = value;
				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0])));
				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
			}

			// Pixel mask for this row.
			oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ01X) + xMask * 16, 16));
			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ01X) + xMask * 16, 16));
			oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
			*Pointer<Float4>(buffer) = oC.x;

			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));

			// Second row, same sequence with oC.y and the Q23 tables.
			value = *Pointer<Float4>(buffer);

			if((rgbaWriteMask & 0x00000003) != 0x00000003)
			{
				Float4 masked;

				masked = value;
				oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0])));
				oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
			}

			oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ23X) + xMask * 16, 16));
			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ23X) + xMask * 16, 16));
			oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
			*Pointer<Float4>(buffer) = oC.y;
			break;
		case FORMAT_G16R16I:
		case FORMAT_G16R16UI:
			// Skipped when neither of the two low write-mask bits is set.
			if((rgbaWriteMask & 0x00000003) != 0x0)
			{
				buffer = cBuffer + 4 * x;

				// First row: narrow oC.x to 16 bits per lane and merge it with the
				// old contents through one combined (pixel AND channel) mask.
				// Note: this `value` shadows the outer Float4 of the same name.
				UInt2 rgbaMask;
				UShort4 packedCol = UShort4(As<Int4>(oC.x));
				UShort4 value = *Pointer<UShort4>(buffer);
				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
				if((rgbaWriteMask & 0x3) != 0x3)
				{
					// Replicate the 32-bit channel mask of one pixel to both pixels.
					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0]));
					rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
					mergedMask &= rgbaMask;
				}
				*Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);

				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));

				// Second row: same merge with oC.y; rgbaMask is reused under the
				// same generation-time condition that assigned it above.
				packedCol = UShort4(As<Int4>(oC.y));
				value = *Pointer<UShort4>(buffer);
				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
				if((rgbaWriteMask & 0x3) != 0x3)
				{
					mergedMask &= rgbaMask;
				}
				*Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
			}
			break;
		case FORMAT_G8R8I:
		case FORMAT_G8R8UI:
			if((rgbaWriteMask & 0x00000003) != 0x0)
			{
				buffer = cBuffer + 2 * x;

				Int2 xyzw, packedCol;

				// Old contents: element 0 from the first row, element 1 from the
				// second row.
				xyzw = Insert(xyzw, *Pointer<Int>(buffer), 0);
				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
				xyzw = Insert(xyzw, *Pointer<Int>(buffer), 1);

				// Narrow both rows (oC.x, oC.y) to bytes in one pack.
				if(state.targetFormat[index] == FORMAT_G8R8I)
				{
					packedCol = As<Int2>(PackSigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
				}
				else
				{
					packedCol = As<Int2>(PackUnsigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
				}

				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
				if((rgbaWriteMask & 0x3) != 0x3)
				{
					// NOTE(review): the 5 * mask index presumably duplicates the
					// two channel bits into a four-bit maskB4Q index so the byte
					// pattern repeats for both pixels - confirm against Constants.
					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
					UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
					mergedMask &= rgbaMask;
				}

				packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(xyzw) & ~mergedMask));

				// buffer still points at the second row, so element 1 goes first.
				*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 1));
				buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
				*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
			}
			break;
		case FORMAT_X32B32G32R32F:
		case FORMAT_A32B32G32R32F:
		case FORMAT_X32B32G32R32F_UNSIGNED:
		case FORMAT_A32B32G32R32I:
		case FORMAT_A32B32G32R32UI:
			buffer = cBuffer + 16 * x;

			// One 16-byte pixel per block below: channel mask first (only when
			// some channel is disabled), then the per-pixel mask, then the store.

			// First row, first pixel (oC.x).
			{
				value = *Pointer<Float4>(buffer, 16);

				if(rgbaWriteMask != 0x0000000F)
				{
					Float4 masked = value;
					oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
					oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
				}

				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskX0X) + xMask * 16, 16));
				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX0X) + xMask * 16, 16));
				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
				*Pointer<Float4>(buffer, 16) = oC.x;
			}

			// First row, second pixel (oC.y).
			{
				value = *Pointer<Float4>(buffer + 16, 16);

				if(rgbaWriteMask != 0x0000000F)
				{
					Float4 masked = value;
					oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
					oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
				}

				oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskX1X) + xMask * 16, 16));
				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX1X) + xMask * 16, 16));
				oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
				*Pointer<Float4>(buffer + 16, 16) = oC.y;
			}

			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));

			// Second row, first pixel (oC.z).
			{
				value = *Pointer<Float4>(buffer, 16);

				if(rgbaWriteMask != 0x0000000F)
				{
					Float4 masked = value;
					oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
					oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(masked));
				}

				oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskX2X) + xMask * 16, 16));
				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX2X) + xMask * 16, 16));
				oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(value));
				*Pointer<Float4>(buffer, 16) = oC.z;
			}

			// Second row, second pixel (oC.w).
			{
				value = *Pointer<Float4>(buffer + 16, 16);

				if(rgbaWriteMask != 0x0000000F)
				{
					Float4 masked = value;
					oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
					oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(masked));
				}

				oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskX3X) + xMask * 16, 16));
				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX3X) + xMask * 16, 16));
				oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(value));
				*Pointer<Float4>(buffer + 16, 16) = oC.w;
			}
			break;
		case FORMAT_A16B16G16R16I:
		case FORMAT_A16B16G16R16UI:
			// Skipped when no channel is enabled.
			if((rgbaWriteMask & 0x0000000F) != 0x0)
			{
				buffer = cBuffer + 8 * x;

				// First row: two 8-byte pixels, built by narrowing oC.x and oC.y
				// to 16 bits per lane. This `value` shadows the outer Float4.
				UInt4 rgbaMask;
				UShort8 value = *Pointer<UShort8>(buffer);
				UShort8 packedCol = UShort8(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y)));
				UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
				if((rgbaWriteMask & 0xF) != 0xF)
				{
					// Replicate the 8-byte channel mask of one pixel to both pixels.
					UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
					rgbaMask = UInt4(tmpMask, tmpMask);
					mergedMask &= rgbaMask;
				}
				*Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);

				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));

				// Second row: same merge with oC.z and oC.w; rgbaMask is reused
				// under the same generation-time condition that assigned it above.
				value = *Pointer<UShort8>(buffer);
				packedCol = UShort8(UShort4(As<Int4>(oC.z)), UShort4(As<Int4>(oC.w)));
				mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
				if((rgbaWriteMask & 0xF) != 0xF)
				{
					mergedMask &= rgbaMask;
				}
				*Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
			}
			break;
		case FORMAT_A8B8G8R8I:
		case FORMAT_A8B8G8R8UI:
			if((rgbaWriteMask & 0x0000000F) != 0x0)
			{
				// This `value` shadows the outer Float4 of the same name.
				UInt2 value, packedCol, mergedMask;

				buffer = cBuffer + 4 * x;

				// First row: narrow oC.x and oC.y to bytes (two 4-byte pixels).
				if(state.targetFormat[index] == FORMAT_A8B8G8R8I)
				{
					packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
				}
				else
				{
					packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
				}
				// NOTE(review): this load passes 16 as the second Pointer argument
				// while the store to the same address below passes none; presumably
				// it is an alignment hint - confirm it holds for cBuffer + 4 * x.
				value = *Pointer<UInt2>(buffer, 16);
				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
				if(rgbaWriteMask != 0xF)
				{
					mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
				}
				*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);

				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));

				// Second row: same sequence with oC.z, oC.w and the D23 table.
				if(state.targetFormat[index] == FORMAT_A8B8G8R8I)
				{
					packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
				}
				else
				{
					packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
				}
				value = *Pointer<UInt2>(buffer, 16);
				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
				if(rgbaWriteMask != 0xF)
				{
					mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
				}
				*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
			}
			break;
		default:
			ASSERT(false);
		}
	}
2660 
convertFixed16(Float4 & cf,bool saturate)2661 	UShort4 PixelRoutine::convertFixed16(Float4 &cf, bool saturate)
2662 	{
2663 		return UShort4(cf * Float4(0xFFFF), saturate);
2664 	}
2665 
sRGBtoLinear16_12_16(Vector4s & c)2666 	void PixelRoutine::sRGBtoLinear16_12_16(Vector4s &c)
2667 	{
2668 		Pointer<Byte> LUT = constants + OFFSET(Constants,sRGBtoLinear12_16);
2669 
2670 		c.x = As<UShort4>(c.x) >> 4;
2671 		c.y = As<UShort4>(c.y) >> 4;
2672 		c.z = As<UShort4>(c.z) >> 4;
2673 
2674 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2675 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2676 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2677 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2678 
2679 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2680 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2681 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2682 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2683 
2684 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2685 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2686 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2687 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2688 	}
2689 
linearToSRGB16_12_16(Vector4s & c)2690 	void PixelRoutine::linearToSRGB16_12_16(Vector4s &c)
2691 	{
2692 		c.x = As<UShort4>(c.x) >> 4;
2693 		c.y = As<UShort4>(c.y) >> 4;
2694 		c.z = As<UShort4>(c.z) >> 4;
2695 
2696 		linearToSRGB12_16(c);
2697 	}
2698 
linearToSRGB12_16(Vector4s & c)2699 	void PixelRoutine::linearToSRGB12_16(Vector4s &c)
2700 	{
2701 		Pointer<Byte> LUT = constants + OFFSET(Constants,linearToSRGB12_16);
2702 
2703 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2704 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2705 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2706 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2707 
2708 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2709 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2710 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2711 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2712 
2713 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2714 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2715 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2716 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2717 	}
2718 
// Approximates x^2.2 as a weighted sum of x^2 and x^3
// (0.73 * x^2 + 0.27 * x^3), then clamps the result to [0, 1].
Float4 PixelRoutine::sRGBtoLinear(const Float4 &x)
{
	Float4 squared = x * x;
	Float4 cubed = squared * x;

	Float4 approx = squared * Float4(0.73f) + cubed * Float4(0.27f);

	Float4 clampedBelow = Max(approx, Float4(0.0f));

	return Min(clampedBelow, Float4(1.0f));
}
2726 
colorUsed()2727 	bool PixelRoutine::colorUsed()
2728 	{
2729 		return state.colorWriteMask || state.alphaTestActive() || state.shaderContainsKill;
2730 	}
2731 }
2732