1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "Surface.hpp"
16 
17 #include "Color.hpp"
18 #include "Context.hpp"
19 #include "ETC_Decoder.hpp"
20 #include "Renderer.hpp"
21 #include "Common/Half.hpp"
22 #include "Common/Memory.hpp"
23 #include "Common/CPUID.hpp"
24 #include "Common/Resource.hpp"
25 #include "Common/Debug.hpp"
26 #include "Reactor/Reactor.hpp"
27 
28 #if defined(__i386__) || defined(__x86_64__)
29 	#include <xmmintrin.h>
30 	#include <emmintrin.h>
31 #endif
32 
33 #undef min
34 #undef max
35 
36 namespace sw
37 {
// Settings declared extern here; their definitions live outside this part of the file.
38 	extern bool quadLayoutEnabled;
39 	extern bool complementaryDepthBuffer;
40 	extern TranscendentalPrecision logPrecision;
41 
// Definitions of Surface's static members, both initialized to zero.
// Buffer::read() indexes `palette` for FORMAT_P8 / FORMAT_A8P8 and asserts it is non-null first.
42 	unsigned int *Surface::palette = 0;
43 	unsigned int Surface::paletteID = 0;
44 
write(int x,int y,int z,const Color<float> & color)45 	void Surface::Buffer::write(int x, int y, int z, const Color<float> &color)
46 	{
47 		ASSERT((x >= -border) && (x < (width + border)));
48 		ASSERT((y >= -border) && (y < (height + border)));
49 		ASSERT((z >= 0) && (z < depth));
50 
51 		byte *element = (byte*)buffer + (x + border) * bytes + (y + border) * pitchB + z * samples * sliceB;
52 
53 		for(int i = 0; i < samples; i++)
54 		{
55 			write(element, color);
56 			element += sliceB;
57 		}
58 	}
59 
write(int x,int y,const Color<float> & color)60 	void Surface::Buffer::write(int x, int y, const Color<float> &color)
61 	{
62 		ASSERT((x >= -border) && (x < (width + border)));
63 		ASSERT((y >= -border) && (y < (height + border)));
64 
65 		byte *element = (byte*)buffer + (x + border) * bytes + (y + border) * pitchB;
66 
67 		for(int i = 0; i < samples; i++)
68 		{
69 			write(element, color);
70 			element += sliceB;
71 		}
72 	}
73 
write(void * element,const Color<float> & color)74 	inline void Surface::Buffer::write(void *element, const Color<float> &color)
75 	{
76 		float r = color.r;
77 		float g = color.g;
78 		float b = color.b;
79 		float a = color.a;
80 
81 		if(isSRGBformat(format))
82 		{
83 			r = linearToSRGB(r);
84 			g = linearToSRGB(g);
85 			b = linearToSRGB(b);
86 		}
87 
88 		switch(format)
89 		{
90 		case FORMAT_A8:
91 			*(unsigned char*)element = unorm<8>(a);
92 			break;
93 		case FORMAT_R8_SNORM:
94 			*(char*)element = snorm<8>(r);
95 			break;
96 		case FORMAT_R8:
97 			*(unsigned char*)element = unorm<8>(r);
98 			break;
99 		case FORMAT_R8I:
100 			*(char*)element = scast<8>(r);
101 			break;
102 		case FORMAT_R8UI:
103 			*(unsigned char*)element = ucast<8>(r);
104 			break;
105 		case FORMAT_R16I:
106 			*(short*)element = scast<16>(r);
107 			break;
108 		case FORMAT_R16UI:
109 			*(unsigned short*)element = ucast<16>(r);
110 			break;
111 		case FORMAT_R32I:
112 			*(int*)element = static_cast<int>(r);
113 			break;
114 		case FORMAT_R32UI:
115 			*(unsigned int*)element = static_cast<unsigned int>(r);
116 			break;
117 		case FORMAT_R3G3B2:
118 			*(unsigned char*)element = (unorm<3>(r) << 5) | (unorm<3>(g) << 2) | (unorm<2>(b) << 0);
119 			break;
120 		case FORMAT_A8R3G3B2:
121 			*(unsigned short*)element = (unorm<8>(a) << 8) | (unorm<3>(r) << 5) | (unorm<3>(g) << 2) | (unorm<2>(b) << 0);
122 			break;
123 		case FORMAT_X4R4G4B4:
124 			*(unsigned short*)element = 0xF000 | (unorm<4>(r) << 8) | (unorm<4>(g) << 4) | (unorm<4>(b) << 0);
125 			break;
126 		case FORMAT_A4R4G4B4:
127 			*(unsigned short*)element = (unorm<4>(a) << 12) | (unorm<4>(r) << 8) | (unorm<4>(g) << 4) | (unorm<4>(b) << 0);
128 			break;
129 		case FORMAT_R4G4B4A4:
130 			*(unsigned short*)element = (unorm<4>(r) << 12) | (unorm<4>(g) << 8) | (unorm<4>(b) << 4) | (unorm<4>(a) << 0);
131 			break;
132 		case FORMAT_R5G6B5:
133 			*(unsigned short*)element = (unorm<5>(r) << 11) | (unorm<6>(g) << 5) | (unorm<5>(b) << 0);
134 			break;
135 		case FORMAT_A1R5G5B5:
136 			*(unsigned short*)element = (unorm<1>(a) << 15) | (unorm<5>(r) << 10) | (unorm<5>(g) << 5) | (unorm<5>(b) << 0);
137 			break;
138 		case FORMAT_R5G5B5A1:
139 			*(unsigned short*)element = (unorm<5>(r) << 11) | (unorm<5>(g) << 6) | (unorm<5>(b) << 1) | (unorm<5>(a) << 0);
140 			break;
141 		case FORMAT_X1R5G5B5:
142 			*(unsigned short*)element = 0x8000 | (unorm<5>(r) << 10) | (unorm<5>(g) << 5) | (unorm<5>(b) << 0);
143 			break;
144 		case FORMAT_A8R8G8B8:
145 			*(unsigned int*)element = (unorm<8>(a) << 24) | (unorm<8>(r) << 16) | (unorm<8>(g) << 8) | (unorm<8>(b) << 0);
146 			break;
147 		case FORMAT_X8R8G8B8:
148 			*(unsigned int*)element = 0xFF000000 | (unorm<8>(r) << 16) | (unorm<8>(g) << 8) | (unorm<8>(b) << 0);
149 			break;
150 		case FORMAT_A8B8G8R8_SNORM:
151 			*(unsigned int*)element = (static_cast<unsigned int>(snorm<8>(a)) << 24) |
152 			                          (static_cast<unsigned int>(snorm<8>(b)) << 16) |
153 			                          (static_cast<unsigned int>(snorm<8>(g)) << 8) |
154 			                          (static_cast<unsigned int>(snorm<8>(r)) << 0);
155 			break;
156 		case FORMAT_A8B8G8R8:
157 		case FORMAT_SRGB8_A8:
158 			*(unsigned int*)element = (unorm<8>(a) << 24) | (unorm<8>(b) << 16) | (unorm<8>(g) << 8) | (unorm<8>(r) << 0);
159 			break;
160 		case FORMAT_A8B8G8R8I:
161 			*(unsigned int*)element = (static_cast<unsigned int>(scast<8>(a)) << 24) |
162 			                          (static_cast<unsigned int>(scast<8>(b)) << 16) |
163 			                          (static_cast<unsigned int>(scast<8>(g)) << 8) |
164 			                          (static_cast<unsigned int>(scast<8>(r)) << 0);
165 			break;
166 		case FORMAT_A8B8G8R8UI:
167 			*(unsigned int*)element = (ucast<8>(a) << 24) | (ucast<8>(b) << 16) | (ucast<8>(g) << 8) | (ucast<8>(r) << 0);
168 			break;
169 		case FORMAT_X8B8G8R8_SNORM:
170 			*(unsigned int*)element = 0x7F000000 |
171 			                          (static_cast<unsigned int>(snorm<8>(b)) << 16) |
172 			                          (static_cast<unsigned int>(snorm<8>(g)) << 8) |
173 			                          (static_cast<unsigned int>(snorm<8>(r)) << 0);
174 			break;
175 		case FORMAT_X8B8G8R8:
176 		case FORMAT_SRGB8_X8:
177 			*(unsigned int*)element = 0xFF000000 | (unorm<8>(b) << 16) | (unorm<8>(g) << 8) | (unorm<8>(r) << 0);
178 			break;
179 		case FORMAT_X8B8G8R8I:
180 			*(unsigned int*)element = 0x7F000000 |
181 			                          (static_cast<unsigned int>(scast<8>(b)) << 16) |
182 			                          (static_cast<unsigned int>(scast<8>(g)) << 8) |
183 			                          (static_cast<unsigned int>(scast<8>(r)) << 0);
184 		case FORMAT_X8B8G8R8UI:
185 			*(unsigned int*)element = 0xFF000000 | (ucast<8>(b) << 16) | (ucast<8>(g) << 8) | (ucast<8>(r) << 0);
186 			break;
187 		case FORMAT_A2R10G10B10:
188 			*(unsigned int*)element = (unorm<2>(a) << 30) | (unorm<10>(r) << 20) | (unorm<10>(g) << 10) | (unorm<10>(b) << 0);
189 			break;
190 		case FORMAT_A2B10G10R10:
191 		case FORMAT_A2B10G10R10UI:
192 			*(unsigned int*)element = (unorm<2>(a) << 30) | (unorm<10>(b) << 20) | (unorm<10>(g) << 10) | (unorm<10>(r) << 0);
193 			break;
194 		case FORMAT_G8R8_SNORM:
195 			*(unsigned short*)element = (static_cast<unsigned short>(snorm<8>(g)) << 8) |
196 			                            (static_cast<unsigned short>(snorm<8>(r)) << 0);
197 			break;
198 		case FORMAT_G8R8:
199 			*(unsigned short*)element = (unorm<8>(g) << 8) | (unorm<8>(r) << 0);
200 			break;
201 		case FORMAT_G8R8I:
202 			*(unsigned short*)element = (static_cast<unsigned short>(scast<8>(g)) << 8) |
203 			                            (static_cast<unsigned short>(scast<8>(r)) << 0);
204 			break;
205 		case FORMAT_G8R8UI:
206 			*(unsigned short*)element = (ucast<8>(g) << 8) | (ucast<8>(r) << 0);
207 			break;
208 		case FORMAT_G16R16:
209 			*(unsigned int*)element = (unorm<16>(g) << 16) | (unorm<16>(r) << 0);
210 			break;
211 		case FORMAT_G16R16I:
212 			*(unsigned int*)element = (static_cast<unsigned int>(scast<16>(g)) << 16) |
213 			                          (static_cast<unsigned int>(scast<16>(r)) << 0);
214 			break;
215 		case FORMAT_G16R16UI:
216 			*(unsigned int*)element = (ucast<16>(g) << 16) | (ucast<16>(r) << 0);
217 			break;
218 		case FORMAT_G32R32I:
219 		case FORMAT_G32R32UI:
220 			((unsigned int*)element)[0] = static_cast<unsigned int>(r);
221 			((unsigned int*)element)[1] = static_cast<unsigned int>(g);
222 			break;
223 		case FORMAT_A16B16G16R16:
224 			((unsigned short*)element)[0] = unorm<16>(r);
225 			((unsigned short*)element)[1] = unorm<16>(g);
226 			((unsigned short*)element)[2] = unorm<16>(b);
227 			((unsigned short*)element)[3] = unorm<16>(a);
228 			break;
229 		case FORMAT_A16B16G16R16I:
230 			((unsigned short*)element)[0] = static_cast<unsigned short>(scast<16>(r));
231 			((unsigned short*)element)[1] = static_cast<unsigned short>(scast<16>(g));
232 			((unsigned short*)element)[2] = static_cast<unsigned short>(scast<16>(b));
233 			((unsigned short*)element)[3] = static_cast<unsigned short>(scast<16>(a));
234 			break;
235 		case FORMAT_A16B16G16R16UI:
236 			((unsigned short*)element)[0] = static_cast<unsigned short>(ucast<16>(r));
237 			((unsigned short*)element)[1] = static_cast<unsigned short>(ucast<16>(g));
238 			((unsigned short*)element)[2] = static_cast<unsigned short>(ucast<16>(b));
239 			((unsigned short*)element)[3] = static_cast<unsigned short>(ucast<16>(a));
240 			break;
241 		case FORMAT_X16B16G16R16I:
242 			((unsigned short*)element)[0] = static_cast<unsigned short>(scast<16>(r));
243 			((unsigned short*)element)[1] = static_cast<unsigned short>(scast<16>(g));
244 			((unsigned short*)element)[2] = static_cast<unsigned short>(scast<16>(b));
245 			break;
246 		case FORMAT_X16B16G16R16UI:
247 			((unsigned short*)element)[0] = static_cast<unsigned short>(ucast<16>(r));
248 			((unsigned short*)element)[1] = static_cast<unsigned short>(ucast<16>(g));
249 			((unsigned short*)element)[2] = static_cast<unsigned short>(ucast<16>(b));
250 			break;
251 		case FORMAT_A32B32G32R32I:
252 		case FORMAT_A32B32G32R32UI:
253 			((unsigned int*)element)[0] = static_cast<unsigned int>(r);
254 			((unsigned int*)element)[1] = static_cast<unsigned int>(g);
255 			((unsigned int*)element)[2] = static_cast<unsigned int>(b);
256 			((unsigned int*)element)[3] = static_cast<unsigned int>(a);
257 			break;
258 		case FORMAT_X32B32G32R32I:
259 		case FORMAT_X32B32G32R32UI:
260 			((unsigned int*)element)[0] = static_cast<unsigned int>(r);
261 			((unsigned int*)element)[1] = static_cast<unsigned int>(g);
262 			((unsigned int*)element)[2] = static_cast<unsigned int>(b);
263 			break;
264 		case FORMAT_V8U8:
265 			*(unsigned short*)element = (snorm<8>(g) << 8) | (snorm<8>(r) << 0);
266 			break;
267 		case FORMAT_L6V5U5:
268 			*(unsigned short*)element = (unorm<6>(b) << 10) | (snorm<5>(g) << 5) | (snorm<5>(r) << 0);
269 			break;
270 		case FORMAT_Q8W8V8U8:
271 			*(unsigned int*)element = (snorm<8>(a) << 24) | (snorm<8>(b) << 16) | (snorm<8>(g) << 8) | (snorm<8>(r) << 0);
272 			break;
273 		case FORMAT_X8L8V8U8:
274 			*(unsigned int*)element = 0xFF000000 | (unorm<8>(b) << 16) | (snorm<8>(g) << 8) | (snorm<8>(r) << 0);
275 			break;
276 		case FORMAT_V16U16:
277 			*(unsigned int*)element = (snorm<16>(g) << 16) | (snorm<16>(r) << 0);
278 			break;
279 		case FORMAT_A2W10V10U10:
280 			*(unsigned int*)element = (unorm<2>(a) << 30) | (snorm<10>(b) << 20) | (snorm<10>(g) << 10) | (snorm<10>(r) << 0);
281 			break;
282 		case FORMAT_A16W16V16U16:
283 			((unsigned short*)element)[0] = snorm<16>(r);
284 			((unsigned short*)element)[1] = snorm<16>(g);
285 			((unsigned short*)element)[2] = snorm<16>(b);
286 			((unsigned short*)element)[3] = unorm<16>(a);
287 			break;
288 		case FORMAT_Q16W16V16U16:
289 			((unsigned short*)element)[0] = snorm<16>(r);
290 			((unsigned short*)element)[1] = snorm<16>(g);
291 			((unsigned short*)element)[2] = snorm<16>(b);
292 			((unsigned short*)element)[3] = snorm<16>(a);
293 			break;
294 		case FORMAT_R8G8B8:
295 			((unsigned char*)element)[0] = unorm<8>(b);
296 			((unsigned char*)element)[1] = unorm<8>(g);
297 			((unsigned char*)element)[2] = unorm<8>(r);
298 			break;
299 		case FORMAT_B8G8R8:
300 			((unsigned char*)element)[0] = unorm<8>(r);
301 			((unsigned char*)element)[1] = unorm<8>(g);
302 			((unsigned char*)element)[2] = unorm<8>(b);
303 			break;
304 		case FORMAT_R16F:
305 			*(half*)element = (half)r;
306 			break;
307 		case FORMAT_A16F:
308 			*(half*)element = (half)a;
309 			break;
310 		case FORMAT_G16R16F:
311 			((half*)element)[0] = (half)r;
312 			((half*)element)[1] = (half)g;
313 			break;
314 		case FORMAT_X16B16G16R16F_UNSIGNED:
315 			r = max(r, 0.0f); g = max(g, 0.0f); b = max(b, 0.0f);
316 			// Fall through to FORMAT_X16B16G16R16F.
317 		case FORMAT_X16B16G16R16F:
318 			((half*)element)[3] = 1.0f;
319 			// Fall through to FORMAT_B16G16R16F.
320 		case FORMAT_B16G16R16F:
321 			((half*)element)[0] = (half)r;
322 			((half*)element)[1] = (half)g;
323 			((half*)element)[2] = (half)b;
324 			break;
325 		case FORMAT_A16B16G16R16F:
326 			((half*)element)[0] = (half)r;
327 			((half*)element)[1] = (half)g;
328 			((half*)element)[2] = (half)b;
329 			((half*)element)[3] = (half)a;
330 			break;
331 		case FORMAT_A32F:
332 			*(float*)element = a;
333 			break;
334 		case FORMAT_R32F:
335 			*(float*)element = r;
336 			break;
337 		case FORMAT_G32R32F:
338 			((float*)element)[0] = r;
339 			((float*)element)[1] = g;
340 			break;
341 		case FORMAT_X32B32G32R32F_UNSIGNED:
342 			r = max(r, 0.0f); g = max(g, 0.0f); b = max(b, 0.0f);
343 			// Fall through to FORMAT_X32B32G32R32F.
344 		case FORMAT_X32B32G32R32F:
345 			((float*)element)[3] = 1.0f;
346 			// Fall through to FORMAT_B32G32R32F.
347 		case FORMAT_B32G32R32F:
348 			((float*)element)[0] = r;
349 			((float*)element)[1] = g;
350 			((float*)element)[2] = b;
351 			break;
352 		case FORMAT_A32B32G32R32F:
353 			((float*)element)[0] = r;
354 			((float*)element)[1] = g;
355 			((float*)element)[2] = b;
356 			((float*)element)[3] = a;
357 			break;
358 		case FORMAT_D32F:
359 		case FORMAT_D32FS8:
360 		case FORMAT_D32F_LOCKABLE:
361 		case FORMAT_D32FS8_TEXTURE:
362 		case FORMAT_D32F_SHADOW:
363 		case FORMAT_D32FS8_SHADOW:
364 			*((float*)element) = r;
365 			break;
366 		case FORMAT_D32F_COMPLEMENTARY:
367 		case FORMAT_D32FS8_COMPLEMENTARY:
368 			*((float*)element) = 1 - r;
369 			break;
370 		case FORMAT_S8:
371 			*((unsigned char*)element) = unorm<8>(r);
372 			break;
373 		case FORMAT_L8:
374 			*(unsigned char*)element = unorm<8>(r);
375 			break;
376 		case FORMAT_A4L4:
377 			*(unsigned char*)element = (unorm<4>(a) << 4) | (unorm<4>(r) << 0);
378 			break;
379 		case FORMAT_L16:
380 			*(unsigned short*)element = unorm<16>(r);
381 			break;
382 		case FORMAT_A8L8:
383 			*(unsigned short*)element = (unorm<8>(a) << 8) | (unorm<8>(r) << 0);
384 			break;
385 		case FORMAT_L16F:
386 			*(half*)element = (half)r;
387 			break;
388 		case FORMAT_A16L16F:
389 			((half*)element)[0] = (half)r;
390 			((half*)element)[1] = (half)a;
391 			break;
392 		case FORMAT_L32F:
393 			*(float*)element = r;
394 			break;
395 		case FORMAT_A32L32F:
396 			((float*)element)[0] = r;
397 			((float*)element)[1] = a;
398 			break;
399 		default:
400 			ASSERT(false);
401 		}
402 	}
403 
read(int x,int y,int z) const404 	Color<float> Surface::Buffer::read(int x, int y, int z) const
405 	{
406 		ASSERT((x >= -border) && (x < (width + border)));
407 		ASSERT((y >= -border) && (y < (height + border)));
408 		ASSERT((z >= 0) && (z < depth));
409 
410 		void *element = (unsigned char*)buffer + (x + border) * bytes + (y + border) * pitchB + z * samples * sliceB;
411 
412 		return read(element);
413 	}
414 
read(int x,int y) const415 	Color<float> Surface::Buffer::read(int x, int y) const
416 	{
417 		ASSERT((x >= -border) && (x < (width + border)));
418 		ASSERT((y >= -border) && (y < (height + border)));
419 
420 		void *element = (unsigned char*)buffer + (x + border) * bytes + (y + border) * pitchB;
421 
422 		return read(element);
423 	}
424 
read(void * element) const425 	inline Color<float> Surface::Buffer::read(void *element) const
426 	{
427 		float r = 0.0f;
428 		float g = 0.0f;
429 		float b = 0.0f;
430 		float a = 1.0f;
431 
432 		switch(format)
433 		{
434 		case FORMAT_P8:
435 			{
436 				ASSERT(palette);
437 
438 				unsigned int abgr = palette[*(unsigned char*)element];
439 
440 				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
441 				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
442 				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
443 				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
444 			}
445 			break;
446 		case FORMAT_A8P8:
447 			{
448 				ASSERT(palette);
449 
450 				unsigned int bgr = palette[((unsigned char*)element)[0]];
451 
452 				r = (bgr & 0x000000FF) * (1.0f / 0x000000FF);
453 				g = (bgr & 0x0000FF00) * (1.0f / 0x0000FF00);
454 				b = (bgr & 0x00FF0000) * (1.0f / 0x00FF0000);
455 				a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
456 			}
457 			break;
458 		case FORMAT_A8:
459 			r = 0;
460 			g = 0;
461 			b = 0;
462 			a = *(unsigned char*)element * (1.0f / 0xFF);
463 			break;
464 		case FORMAT_R8_SNORM:
465 			r = max((*(signed char*)element) * (1.0f / 0x7F), -1.0f);
466 			break;
467 		case FORMAT_R8:
468 			r = *(unsigned char*)element * (1.0f / 0xFF);
469 			break;
470 		case FORMAT_R8I:
471 			r = *(signed char*)element;
472 			break;
473 		case FORMAT_R8UI:
474 			r = *(unsigned char*)element;
475 			break;
476 		case FORMAT_R3G3B2:
477 			{
478 				unsigned char rgb = *(unsigned char*)element;
479 
480 				r = (rgb & 0xE0) * (1.0f / 0xE0);
481 				g = (rgb & 0x1C) * (1.0f / 0x1C);
482 				b = (rgb & 0x03) * (1.0f / 0x03);
483 			}
484 			break;
485 		case FORMAT_A8R3G3B2:
486 			{
487 				unsigned short argb = *(unsigned short*)element;
488 
489 				a = (argb & 0xFF00) * (1.0f / 0xFF00);
490 				r = (argb & 0x00E0) * (1.0f / 0x00E0);
491 				g = (argb & 0x001C) * (1.0f / 0x001C);
492 				b = (argb & 0x0003) * (1.0f / 0x0003);
493 			}
494 			break;
495 		case FORMAT_X4R4G4B4:
496 			{
497 				unsigned short rgb = *(unsigned short*)element;
498 
499 				r = (rgb & 0x0F00) * (1.0f / 0x0F00);
500 				g = (rgb & 0x00F0) * (1.0f / 0x00F0);
501 				b = (rgb & 0x000F) * (1.0f / 0x000F);
502 			}
503 			break;
504 		case FORMAT_A4R4G4B4:
505 			{
506 				unsigned short argb = *(unsigned short*)element;
507 
508 				a = (argb & 0xF000) * (1.0f / 0xF000);
509 				r = (argb & 0x0F00) * (1.0f / 0x0F00);
510 				g = (argb & 0x00F0) * (1.0f / 0x00F0);
511 				b = (argb & 0x000F) * (1.0f / 0x000F);
512 			}
513 			break;
514 		case FORMAT_R4G4B4A4:
515 			{
516 				unsigned short rgba = *(unsigned short*)element;
517 
518 				r = (rgba & 0xF000) * (1.0f / 0xF000);
519 				g = (rgba & 0x0F00) * (1.0f / 0x0F00);
520 				b = (rgba & 0x00F0) * (1.0f / 0x00F0);
521 				a = (rgba & 0x000F) * (1.0f / 0x000F);
522 			}
523 			break;
524 		case FORMAT_R5G6B5:
525 			{
526 				unsigned short rgb = *(unsigned short*)element;
527 
528 				r = (rgb & 0xF800) * (1.0f / 0xF800);
529 				g = (rgb & 0x07E0) * (1.0f / 0x07E0);
530 				b = (rgb & 0x001F) * (1.0f / 0x001F);
531 			}
532 			break;
533 		case FORMAT_A1R5G5B5:
534 			{
535 				unsigned short argb = *(unsigned short*)element;
536 
537 				a = (argb & 0x8000) * (1.0f / 0x8000);
538 				r = (argb & 0x7C00) * (1.0f / 0x7C00);
539 				g = (argb & 0x03E0) * (1.0f / 0x03E0);
540 				b = (argb & 0x001F) * (1.0f / 0x001F);
541 			}
542 			break;
543 		case FORMAT_R5G5B5A1:
544 			{
545 				unsigned short rgba = *(unsigned short*)element;
546 
547 				r = (rgba & 0xF800) * (1.0f / 0xF800);
548 				g = (rgba & 0x07C0) * (1.0f / 0x07C0);
549 				b = (rgba & 0x003E) * (1.0f / 0x003E);
550 				a = (rgba & 0x0001) * (1.0f / 0x0001);
551 			}
552 			break;
553 		case FORMAT_X1R5G5B5:
554 			{
555 				unsigned short xrgb = *(unsigned short*)element;
556 
557 				r = (xrgb & 0x7C00) * (1.0f / 0x7C00);
558 				g = (xrgb & 0x03E0) * (1.0f / 0x03E0);
559 				b = (xrgb & 0x001F) * (1.0f / 0x001F);
560 			}
561 			break;
562 		case FORMAT_A8R8G8B8:
563 			{
564 				unsigned int argb = *(unsigned int*)element;
565 
566 				a = (argb & 0xFF000000) * (1.0f / 0xFF000000);
567 				r = (argb & 0x00FF0000) * (1.0f / 0x00FF0000);
568 				g = (argb & 0x0000FF00) * (1.0f / 0x0000FF00);
569 				b = (argb & 0x000000FF) * (1.0f / 0x000000FF);
570 			}
571 			break;
572 		case FORMAT_X8R8G8B8:
573 			{
574 				unsigned int xrgb = *(unsigned int*)element;
575 
576 				r = (xrgb & 0x00FF0000) * (1.0f / 0x00FF0000);
577 				g = (xrgb & 0x0000FF00) * (1.0f / 0x0000FF00);
578 				b = (xrgb & 0x000000FF) * (1.0f / 0x000000FF);
579 			}
580 			break;
581 		case FORMAT_A8B8G8R8_SNORM:
582 			{
583 				signed char* abgr = (signed char*)element;
584 
585 				r = max(abgr[0] * (1.0f / 0x7F), -1.0f);
586 				g = max(abgr[1] * (1.0f / 0x7F), -1.0f);
587 				b = max(abgr[2] * (1.0f / 0x7F), -1.0f);
588 				a = max(abgr[3] * (1.0f / 0x7F), -1.0f);
589 			}
590 			break;
591 		case FORMAT_A8B8G8R8:
592 		case FORMAT_SRGB8_A8:
593 			{
594 				unsigned int abgr = *(unsigned int*)element;
595 
596 				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
597 				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
598 				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
599 				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
600 			}
601 			break;
602 		case FORMAT_A8B8G8R8I:
603 			{
604 				signed char* abgr = (signed char*)element;
605 
606 				r = abgr[0];
607 				g = abgr[1];
608 				b = abgr[2];
609 				a = abgr[3];
610 			}
611 			break;
612 		case FORMAT_A8B8G8R8UI:
613 			{
614 				unsigned char* abgr = (unsigned char*)element;
615 
616 				r = abgr[0];
617 				g = abgr[1];
618 				b = abgr[2];
619 				a = abgr[3];
620 			}
621 			break;
622 		case FORMAT_X8B8G8R8_SNORM:
623 			{
624 				signed char* bgr = (signed char*)element;
625 
626 				r = max(bgr[0] * (1.0f / 0x7F), -1.0f);
627 				g = max(bgr[1] * (1.0f / 0x7F), -1.0f);
628 				b = max(bgr[2] * (1.0f / 0x7F), -1.0f);
629 			}
630 			break;
631 		case FORMAT_X8B8G8R8:
632 		case FORMAT_SRGB8_X8:
633 			{
634 				unsigned int xbgr = *(unsigned int*)element;
635 
636 				b = (xbgr & 0x00FF0000) * (1.0f / 0x00FF0000);
637 				g = (xbgr & 0x0000FF00) * (1.0f / 0x0000FF00);
638 				r = (xbgr & 0x000000FF) * (1.0f / 0x000000FF);
639 			}
640 			break;
641 		case FORMAT_X8B8G8R8I:
642 			{
643 				signed char* bgr = (signed char*)element;
644 
645 				r = bgr[0];
646 				g = bgr[1];
647 				b = bgr[2];
648 			}
649 			break;
650 		case FORMAT_X8B8G8R8UI:
651 			{
652 				unsigned char* bgr = (unsigned char*)element;
653 
654 				r = bgr[0];
655 				g = bgr[1];
656 				b = bgr[2];
657 			}
658 			break;
659 		case FORMAT_G8R8_SNORM:
660 			{
661 				signed char* gr = (signed char*)element;
662 
663 				r = (gr[0] & 0xFF00) * (1.0f / 0xFF00);
664 				g = (gr[1] & 0x00FF) * (1.0f / 0x00FF);
665 			}
666 			break;
667 		case FORMAT_G8R8:
668 			{
669 				unsigned short gr = *(unsigned short*)element;
670 
671 				g = (gr & 0xFF00) * (1.0f / 0xFF00);
672 				r = (gr & 0x00FF) * (1.0f / 0x00FF);
673 			}
674 			break;
675 		case FORMAT_G8R8I:
676 			{
677 				signed char* gr = (signed char*)element;
678 
679 				r = gr[0];
680 				g = gr[1];
681 			}
682 			break;
683 		case FORMAT_G8R8UI:
684 			{
685 				unsigned char* gr = (unsigned char*)element;
686 
687 				r = gr[0];
688 				g = gr[1];
689 			}
690 			break;
691 		case FORMAT_R16I:
692 			r = *((short*)element);
693 			break;
694 		case FORMAT_R16UI:
695 			r = *((unsigned short*)element);
696 			break;
697 		case FORMAT_G16R16I:
698 			{
699 				short* gr = (short*)element;
700 
701 				r = gr[0];
702 				g = gr[1];
703 			}
704 			break;
705 		case FORMAT_G16R16:
706 			{
707 				unsigned int gr = *(unsigned int*)element;
708 
709 				g = (gr & 0xFFFF0000) * (1.0f / 0xFFFF0000);
710 				r = (gr & 0x0000FFFF) * (1.0f / 0x0000FFFF);
711 			}
712 			break;
713 		case FORMAT_G16R16UI:
714 			{
715 				unsigned short* gr = (unsigned short*)element;
716 
717 				r = gr[0];
718 				g = gr[1];
719 			}
720 			break;
721 		case FORMAT_A2R10G10B10:
722 			{
723 				unsigned int argb = *(unsigned int*)element;
724 
725 				a = (argb & 0xC0000000) * (1.0f / 0xC0000000);
726 				r = (argb & 0x3FF00000) * (1.0f / 0x3FF00000);
727 				g = (argb & 0x000FFC00) * (1.0f / 0x000FFC00);
728 				b = (argb & 0x000003FF) * (1.0f / 0x000003FF);
729 			}
730 			break;
731 		case FORMAT_A2B10G10R10:
732 			{
733 				unsigned int abgr = *(unsigned int*)element;
734 
735 				a = (abgr & 0xC0000000) * (1.0f / 0xC0000000);
736 				b = (abgr & 0x3FF00000) * (1.0f / 0x3FF00000);
737 				g = (abgr & 0x000FFC00) * (1.0f / 0x000FFC00);
738 				r = (abgr & 0x000003FF) * (1.0f / 0x000003FF);
739 			}
740 			break;
741 		case FORMAT_A2B10G10R10UI:
742 			{
743 				unsigned int abgr = *(unsigned int*)element;
744 
745 				a = static_cast<float>((abgr & 0xC0000000) >> 30);
746 				b = static_cast<float>((abgr & 0x3FF00000) >> 20);
747 				g = static_cast<float>((abgr & 0x000FFC00) >> 10);
748 				r = static_cast<float>(abgr & 0x000003FF);
749 			}
750 			break;
751 		case FORMAT_A16B16G16R16I:
752 			{
753 				short* abgr = (short*)element;
754 
755 				r = abgr[0];
756 				g = abgr[1];
757 				b = abgr[2];
758 				a = abgr[3];
759 			}
760 			break;
761 		case FORMAT_A16B16G16R16:
762 			r = ((unsigned short*)element)[0] * (1.0f / 0xFFFF);
763 			g = ((unsigned short*)element)[1] * (1.0f / 0xFFFF);
764 			b = ((unsigned short*)element)[2] * (1.0f / 0xFFFF);
765 			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
766 			break;
767 		case FORMAT_A16B16G16R16UI:
768 			{
769 				unsigned short* abgr = (unsigned short*)element;
770 
771 				r = abgr[0];
772 				g = abgr[1];
773 				b = abgr[2];
774 				a = abgr[3];
775 			}
776 			break;
777 		case FORMAT_X16B16G16R16I:
778 			{
779 				short* bgr = (short*)element;
780 
781 				r = bgr[0];
782 				g = bgr[1];
783 				b = bgr[2];
784 			}
785 			break;
786 		case FORMAT_X16B16G16R16UI:
787 			{
788 				unsigned short* bgr = (unsigned short*)element;
789 
790 				r = bgr[0];
791 				g = bgr[1];
792 				b = bgr[2];
793 			}
794 			break;
795 		case FORMAT_A32B32G32R32I:
796 			{
797 				int* abgr = (int*)element;
798 
799 				r = static_cast<float>(abgr[0]);
800 				g = static_cast<float>(abgr[1]);
801 				b = static_cast<float>(abgr[2]);
802 				a = static_cast<float>(abgr[3]);
803 			}
804 			break;
805 		case FORMAT_A32B32G32R32UI:
806 			{
807 				unsigned int* abgr = (unsigned int*)element;
808 
809 				r = static_cast<float>(abgr[0]);
810 				g = static_cast<float>(abgr[1]);
811 				b = static_cast<float>(abgr[2]);
812 				a = static_cast<float>(abgr[3]);
813 			}
814 			break;
815 		case FORMAT_X32B32G32R32I:
816 			{
817 				int* bgr = (int*)element;
818 
819 				r = static_cast<float>(bgr[0]);
820 				g = static_cast<float>(bgr[1]);
821 				b = static_cast<float>(bgr[2]);
822 			}
823 			break;
824 		case FORMAT_X32B32G32R32UI:
825 			{
826 				unsigned int* bgr = (unsigned int*)element;
827 
828 				r = static_cast<float>(bgr[0]);
829 				g = static_cast<float>(bgr[1]);
830 				b = static_cast<float>(bgr[2]);
831 			}
832 			break;
833 		case FORMAT_G32R32I:
834 			{
835 				int* gr = (int*)element;
836 
837 				r = static_cast<float>(gr[0]);
838 				g = static_cast<float>(gr[1]);
839 			}
840 			break;
841 		case FORMAT_G32R32UI:
842 			{
843 				unsigned int* gr = (unsigned int*)element;
844 
845 				r = static_cast<float>(gr[0]);
846 				g = static_cast<float>(gr[1]);
847 			}
848 			break;
849 		case FORMAT_R32I:
850 			r = static_cast<float>(*((int*)element));
851 			break;
852 		case FORMAT_R32UI:
853 			r = static_cast<float>(*((unsigned int*)element));
854 			break;
855 		case FORMAT_V8U8:
856 			{
857 				unsigned short vu = *(unsigned short*)element;
858 
859 				r = ((int)(vu & 0x00FF) << 24) * (1.0f / 0x7F000000);
860 				g = ((int)(vu & 0xFF00) << 16) * (1.0f / 0x7F000000);
861 			}
862 			break;
863 		case FORMAT_L6V5U5:
864 			{
865 				unsigned short lvu = *(unsigned short*)element;
866 
867 				r = ((int)(lvu & 0x001F) << 27) * (1.0f / 0x78000000);
868 				g = ((int)(lvu & 0x03E0) << 22) * (1.0f / 0x78000000);
869 				b = (lvu & 0xFC00) * (1.0f / 0xFC00);
870 			}
871 			break;
872 		case FORMAT_Q8W8V8U8:
873 			{
874 				unsigned int qwvu = *(unsigned int*)element;
875 
876 				r = ((int)(qwvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
877 				g = ((int)(qwvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
878 				b = ((int)(qwvu & 0x00FF0000) << 8)  * (1.0f / 0x7F000000);
879 				a = ((int)(qwvu & 0xFF000000) << 0)  * (1.0f / 0x7F000000);
880 			}
881 			break;
882 		case FORMAT_X8L8V8U8:
883 			{
884 				unsigned int xlvu = *(unsigned int*)element;
885 
886 				r = ((int)(xlvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
887 				g = ((int)(xlvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
888 				b = (xlvu & 0x00FF0000) * (1.0f / 0x00FF0000);
889 			}
890 			break;
891 		case FORMAT_R8G8B8:
892 			r = ((unsigned char*)element)[2] * (1.0f / 0xFF);
893 			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
894 			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
895 			break;
896 		case FORMAT_B8G8R8:
897 			r = ((unsigned char*)element)[0] * (1.0f / 0xFF);
898 			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
899 			b = ((unsigned char*)element)[2] * (1.0f / 0xFF);
900 			break;
901 		case FORMAT_V16U16:
902 			{
903 				unsigned int vu = *(unsigned int*)element;
904 
905 				r = ((int)(vu & 0x0000FFFF) << 16) * (1.0f / 0x7FFF0000);
906 				g = ((int)(vu & 0xFFFF0000) << 0)  * (1.0f / 0x7FFF0000);
907 			}
908 			break;
909 		case FORMAT_A2W10V10U10:
910 			{
911 				unsigned int awvu = *(unsigned int*)element;
912 
913 				r = ((int)(awvu & 0x000003FF) << 22) * (1.0f / 0x7FC00000);
914 				g = ((int)(awvu & 0x000FFC00) << 12) * (1.0f / 0x7FC00000);
915 				b = ((int)(awvu & 0x3FF00000) << 2)  * (1.0f / 0x7FC00000);
916 				a = (awvu & 0xC0000000) * (1.0f / 0xC0000000);
917 			}
918 			break;
919 		case FORMAT_A16W16V16U16:
920 			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
921 			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
922 			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
923 			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
924 			break;
925 		case FORMAT_Q16W16V16U16:
926 			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
927 			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
928 			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
929 			a = ((signed short*)element)[3] * (1.0f / 0x7FFF);
930 			break;
931 		case FORMAT_L8:
932 			r =
933 			g =
934 			b = *(unsigned char*)element * (1.0f / 0xFF);
935 			break;
936 		case FORMAT_A4L4:
937 			{
938 				unsigned char al = *(unsigned char*)element;
939 
940 				r =
941 				g =
942 				b = (al & 0x0F) * (1.0f / 0x0F);
943 				a = (al & 0xF0) * (1.0f / 0xF0);
944 			}
945 			break;
946 		case FORMAT_L16:
947 			r =
948 			g =
949 			b = *(unsigned short*)element * (1.0f / 0xFFFF);
950 			break;
951 		case FORMAT_A8L8:
952 			r =
953 			g =
954 			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
955 			a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
956 			break;
957 		case FORMAT_L16F:
958 			r =
959 			g =
960 			b = *(half*)element;
961 			break;
962 		case FORMAT_A16L16F:
963 			r =
964 			g =
965 			b = ((half*)element)[0];
966 			a = ((half*)element)[1];
967 			break;
968 		case FORMAT_L32F:
969 			r =
970 			g =
971 			b = *(float*)element;
972 			break;
973 		case FORMAT_A32L32F:
974 			r =
975 			g =
976 			b = ((float*)element)[0];
977 			a = ((float*)element)[1];
978 			break;
979 		case FORMAT_A16F:
980 			a = *(half*)element;
981 			break;
982 		case FORMAT_R16F:
983 			r = *(half*)element;
984 			break;
985 		case FORMAT_G16R16F:
986 			r = ((half*)element)[0];
987 			g = ((half*)element)[1];
988 			break;
989 		case FORMAT_X16B16G16R16F:
990 		case FORMAT_X16B16G16R16F_UNSIGNED:
991 		case FORMAT_B16G16R16F:
992 			r = ((half*)element)[0];
993 			g = ((half*)element)[1];
994 			b = ((half*)element)[2];
995 			break;
996 		case FORMAT_A16B16G16R16F:
997 			r = ((half*)element)[0];
998 			g = ((half*)element)[1];
999 			b = ((half*)element)[2];
1000 			a = ((half*)element)[3];
1001 			break;
1002 		case FORMAT_A32F:
1003 			a = *(float*)element;
1004 			break;
1005 		case FORMAT_R32F:
1006 			r = *(float*)element;
1007 			break;
1008 		case FORMAT_G32R32F:
1009 			r = ((float*)element)[0];
1010 			g = ((float*)element)[1];
1011 			break;
1012 		case FORMAT_X32B32G32R32F:
1013 		case FORMAT_X32B32G32R32F_UNSIGNED:
1014 		case FORMAT_B32G32R32F:
1015 			r = ((float*)element)[0];
1016 			g = ((float*)element)[1];
1017 			b = ((float*)element)[2];
1018 			break;
1019 		case FORMAT_A32B32G32R32F:
1020 			r = ((float*)element)[0];
1021 			g = ((float*)element)[1];
1022 			b = ((float*)element)[2];
1023 			a = ((float*)element)[3];
1024 			break;
1025 		case FORMAT_D32F:
1026 		case FORMAT_D32FS8:
1027 		case FORMAT_D32F_LOCKABLE:
1028 		case FORMAT_D32FS8_TEXTURE:
1029 		case FORMAT_D32F_SHADOW:
1030 		case FORMAT_D32FS8_SHADOW:
1031 			r = *(float*)element;
1032 			g = r;
1033 			b = r;
1034 			a = r;
1035 			break;
1036 		case FORMAT_D32F_COMPLEMENTARY:
1037 		case FORMAT_D32FS8_COMPLEMENTARY:
1038 			r = 1.0f - *(float*)element;
1039 			g = r;
1040 			b = r;
1041 			a = r;
1042 			break;
1043 		case FORMAT_S8:
1044 			r = *(unsigned char*)element * (1.0f / 0xFF);
1045 			break;
1046 		default:
1047 			ASSERT(false);
1048 		}
1049 
1050 		if(isSRGBformat(format))
1051 		{
1052 			r = sRGBtoLinear(r);
1053 			g = sRGBtoLinear(g);
1054 			b = sRGBtoLinear(b);
1055 		}
1056 
1057 		return Color<float>(r, g, b, a);
1058 	}
1059 
sample(float x,float y,float z) const1060 	Color<float> Surface::Buffer::sample(float x, float y, float z) const
1061 	{
1062 		x -= 0.5f;
1063 		y -= 0.5f;
1064 		z -= 0.5f;
1065 
1066 		int x0 = clamp((int)x, 0, width - 1);
1067 		int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
1068 
1069 		int y0 = clamp((int)y, 0, height - 1);
1070 		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
1071 
1072 		int z0 = clamp((int)z, 0, depth - 1);
1073 		int z1 = (z0 + 1 >= depth) ? z0 : z0 + 1;
1074 
1075 		Color<float> c000 = read(x0, y0, z0);
1076 		Color<float> c100 = read(x1, y0, z0);
1077 		Color<float> c010 = read(x0, y1, z0);
1078 		Color<float> c110 = read(x1, y1, z0);
1079 		Color<float> c001 = read(x0, y0, z1);
1080 		Color<float> c101 = read(x1, y0, z1);
1081 		Color<float> c011 = read(x0, y1, z1);
1082 		Color<float> c111 = read(x1, y1, z1);
1083 
1084 		float fx = x - x0;
1085 		float fy = y - y0;
1086 		float fz = z - z0;
1087 
1088 		c000 *= (1 - fx) * (1 - fy) * (1 - fz);
1089 		c100 *= fx * (1 - fy) * (1 - fz);
1090 		c010 *= (1 - fx) * fy * (1 - fz);
1091 		c110 *= fx * fy * (1 - fz);
1092 		c001 *= (1 - fx) * (1 - fy) * fz;
1093 		c101 *= fx * (1 - fy) * fz;
1094 		c011 *= (1 - fx) * fy * fz;
1095 		c111 *= fx * fy * fz;
1096 
1097 		return c000 + c100 + c010 + c110 + c001 + c101 + c011 + c111;
1098 	}
1099 
sample(float x,float y,int layer) const1100 	Color<float> Surface::Buffer::sample(float x, float y, int layer) const
1101 	{
1102 		x -= 0.5f;
1103 		y -= 0.5f;
1104 
1105 		int x0 = clamp((int)x, 0, width - 1);
1106 		int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
1107 
1108 		int y0 = clamp((int)y, 0, height - 1);
1109 		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
1110 
1111 		Color<float> c00 = read(x0, y0, layer);
1112 		Color<float> c10 = read(x1, y0, layer);
1113 		Color<float> c01 = read(x0, y1, layer);
1114 		Color<float> c11 = read(x1, y1, layer);
1115 
1116 		float fx = x - x0;
1117 		float fy = y - y0;
1118 
1119 		c00 *= (1 - fx) * (1 - fy);
1120 		c10 *= fx * (1 - fy);
1121 		c01 *= (1 - fx) * fy;
1122 		c11 *= fx * fy;
1123 
1124 		return c00 + c10 + c01 + c11;
1125 	}
1126 
lockRect(int x,int y,int z,Lock lock)1127 	void *Surface::Buffer::lockRect(int x, int y, int z, Lock lock)
1128 	{
1129 		this->lock = lock;
1130 
1131 		switch(lock)
1132 		{
1133 		case LOCK_UNLOCKED:
1134 		case LOCK_READONLY:
1135 		case LOCK_UPDATE:
1136 			break;
1137 		case LOCK_WRITEONLY:
1138 		case LOCK_READWRITE:
1139 		case LOCK_DISCARD:
1140 			dirty = true;
1141 			break;
1142 		default:
1143 			ASSERT(false);
1144 		}
1145 
1146 		if(buffer)
1147 		{
1148 			x += border;
1149 			y += border;
1150 
1151 			switch(format)
1152 			{
1153 			case FORMAT_DXT1:
1154 			case FORMAT_ATI1:
1155 			case FORMAT_ETC1:
1156 			case FORMAT_R11_EAC:
1157 			case FORMAT_SIGNED_R11_EAC:
1158 			case FORMAT_RGB8_ETC2:
1159 			case FORMAT_SRGB8_ETC2:
1160 			case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1161 			case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1162 				return (unsigned char*)buffer + 8 * (x / 4) + (y / 4) * pitchB + z * sliceB;
1163 			case FORMAT_RG11_EAC:
1164 			case FORMAT_SIGNED_RG11_EAC:
1165 			case FORMAT_RGBA8_ETC2_EAC:
1166 			case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
1167 				return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
1168 			case FORMAT_DXT3:
1169 			case FORMAT_DXT5:
1170 			case FORMAT_ATI2:
1171 				return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
1172 			default:
1173 				return (unsigned char*)buffer + x * bytes + y * pitchB + z * samples * sliceB;
1174 			}
1175 		}
1176 
1177 		return nullptr;
1178 	}
1179 
unlockRect()1180 	void Surface::Buffer::unlockRect()
1181 	{
1182 		lock = LOCK_UNLOCKED;
1183 	}
1184 
1185 	class SurfaceImplementation : public Surface
1186 	{
1187 	public:
SurfaceImplementation(int width,int height,int depth,Format format,void * pixels,int pitch,int slice)1188 		SurfaceImplementation(int width, int height, int depth, Format format, void *pixels, int pitch, int slice)
1189 			: Surface(width, height, depth, format, pixels, pitch, slice) {}
SurfaceImplementation(Resource * texture,int width,int height,int depth,int border,int samples,Format format,bool lockable,bool renderTarget,int pitchP=0)1190 		SurfaceImplementation(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchP = 0)
1191 			: Surface(texture, width, height, depth, border, samples, format, lockable, renderTarget, pitchP) {}
~SurfaceImplementation()1192 		~SurfaceImplementation() override {}
1193 
lockInternal(int x,int y,int z,Lock lock,Accessor client)1194 		void *lockInternal(int x, int y, int z, Lock lock, Accessor client) override
1195 		{
1196 			return Surface::lockInternal(x, y, z, lock, client);
1197 		}
1198 
unlockInternal()1199 		void unlockInternal() override
1200 		{
1201 			Surface::unlockInternal();
1202 		}
1203 	};
1204 
create(int width,int height,int depth,Format format,void * pixels,int pitch,int slice)1205 	Surface *Surface::create(int width, int height, int depth, Format format, void *pixels, int pitch, int slice)
1206 	{
1207 		return new SurfaceImplementation(width, height, depth, format, pixels, pitch, slice);
1208 	}
1209 
create(Resource * texture,int width,int height,int depth,int border,int samples,Format format,bool lockable,bool renderTarget,int pitchPprovided)1210 	Surface *Surface::create(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchPprovided)
1211 	{
1212 		return new SurfaceImplementation(texture, width, height, depth, border, samples, format, lockable, renderTarget, pitchPprovided);
1213 	}
1214 
Surface(int width,int height,int depth,Format format,void * pixels,int pitch,int slice)1215 	Surface::Surface(int width, int height, int depth, Format format, void *pixels, int pitch, int slice) : lockable(true), renderTarget(false)
1216 	{
1217 		resource = new Resource(0);
1218 		hasParent = false;
1219 		ownExternal = false;
1220 		depth = max(1, depth);
1221 
1222 		external.buffer = pixels;
1223 		external.width = width;
1224 		external.height = height;
1225 		external.depth = depth;
1226 		external.samples = 1;
1227 		external.format = format;
1228 		external.bytes = bytes(external.format);
1229 		external.pitchB = pitch;
1230 		external.pitchP = external.bytes ? pitch / external.bytes : 0;
1231 		external.sliceB = slice;
1232 		external.sliceP = external.bytes ? slice / external.bytes : 0;
1233 		external.border = 0;
1234 		external.lock = LOCK_UNLOCKED;
1235 		external.dirty = true;
1236 
1237 		internal.buffer = nullptr;
1238 		internal.width = width;
1239 		internal.height = height;
1240 		internal.depth = depth;
1241 		internal.samples = 1;
1242 		internal.format = selectInternalFormat(format);
1243 		internal.bytes = bytes(internal.format);
1244 		internal.pitchB = pitchB(internal.width, 0, internal.format, false);
1245 		internal.pitchP = pitchP(internal.width, 0, internal.format, false);
1246 		internal.sliceB = sliceB(internal.width, internal.height, 0, internal.format, false);
1247 		internal.sliceP = sliceP(internal.width, internal.height, 0, internal.format, false);
1248 		internal.border = 0;
1249 		internal.lock = LOCK_UNLOCKED;
1250 		internal.dirty = false;
1251 
1252 		stencil.buffer = nullptr;
1253 		stencil.width = width;
1254 		stencil.height = height;
1255 		stencil.depth = depth;
1256 		stencil.samples = 1;
1257 		stencil.format = isStencil(format) ? FORMAT_S8 : FORMAT_NULL;
1258 		stencil.bytes = bytes(stencil.format);
1259 		stencil.pitchB = pitchB(stencil.width, 0, stencil.format, false);
1260 		stencil.pitchP = pitchP(stencil.width, 0, stencil.format, false);
1261 		stencil.sliceB = sliceB(stencil.width, stencil.height, 0, stencil.format, false);
1262 		stencil.sliceP = sliceP(stencil.width, stencil.height, 0, stencil.format, false);
1263 		stencil.border = 0;
1264 		stencil.lock = LOCK_UNLOCKED;
1265 		stencil.dirty = false;
1266 
1267 		dirtyContents = true;
1268 		paletteUsed = 0;
1269 	}
1270 
Surface(Resource * texture,int width,int height,int depth,int border,int samples,Format format,bool lockable,bool renderTarget,int pitchPprovided)1271 	Surface::Surface(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchPprovided) : lockable(lockable), renderTarget(renderTarget)
1272 	{
1273 		resource = texture ? texture : new Resource(0);
1274 		hasParent = texture != nullptr;
1275 		ownExternal = true;
1276 		depth = max(1, depth);
1277 		samples = max(1, samples);
1278 
1279 		external.buffer = nullptr;
1280 		external.width = width;
1281 		external.height = height;
1282 		external.depth = depth;
1283 		external.samples = (short)samples;
1284 		external.format = format;
1285 		external.bytes = bytes(external.format);
1286 		external.pitchB = !pitchPprovided ? pitchB(external.width, 0, external.format, renderTarget && !texture) : pitchPprovided * external.bytes;
1287 		external.pitchP = !pitchPprovided ? pitchP(external.width, 0, external.format, renderTarget && !texture) : pitchPprovided;
1288 		external.sliceB = sliceB(external.width, external.height, 0, external.format, renderTarget && !texture);
1289 		external.sliceP = sliceP(external.width, external.height, 0, external.format, renderTarget && !texture);
1290 		external.border = 0;
1291 		external.lock = LOCK_UNLOCKED;
1292 		external.dirty = false;
1293 
1294 		internal.buffer = nullptr;
1295 		internal.width = width;
1296 		internal.height = height;
1297 		internal.depth = depth;
1298 		internal.samples = (short)samples;
1299 		internal.format = selectInternalFormat(format);
1300 		internal.bytes = bytes(internal.format);
1301 		internal.pitchB = !pitchPprovided ? pitchB(internal.width, border, internal.format, renderTarget) : pitchPprovided * internal.bytes;
1302 		internal.pitchP = !pitchPprovided ? pitchP(internal.width, border, internal.format, renderTarget) : pitchPprovided;
1303 		internal.sliceB = sliceB(internal.width, internal.height, border, internal.format, renderTarget);
1304 		internal.sliceP = sliceP(internal.width, internal.height, border, internal.format, renderTarget);
1305 		internal.border = (short)border;
1306 		internal.lock = LOCK_UNLOCKED;
1307 		internal.dirty = false;
1308 
1309 		stencil.buffer = nullptr;
1310 		stencil.width = width;
1311 		stencil.height = height;
1312 		stencil.depth = depth;
1313 		stencil.samples = (short)samples;
1314 		stencil.format = isStencil(format) ? FORMAT_S8 : FORMAT_NULL;
1315 		stencil.bytes = bytes(stencil.format);
1316 		stencil.pitchB = pitchB(stencil.width, 0, stencil.format, renderTarget);
1317 		stencil.pitchP = pitchP(stencil.width, 0, stencil.format, renderTarget);
1318 		stencil.sliceB = sliceB(stencil.width, stencil.height, 0, stencil.format, renderTarget);
1319 		stencil.sliceP = sliceP(stencil.width, stencil.height, 0, stencil.format, renderTarget);
1320 		stencil.border = 0;
1321 		stencil.lock = LOCK_UNLOCKED;
1322 		stencil.dirty = false;
1323 
1324 		dirtyContents = true;
1325 		paletteUsed = 0;
1326 	}
1327 
~Surface()1328 	Surface::~Surface()
1329 	{
1330 		// sync() must be called before this destructor to ensure all locks have been released.
1331 		// We can't call it here because the parent resource may already have been destroyed.
1332 		ASSERT(isUnlocked());
1333 
1334 		if(!hasParent)
1335 		{
1336 			resource->destruct();
1337 		}
1338 
1339 		if(ownExternal)
1340 		{
1341 			deallocate(external.buffer);
1342 		}
1343 
1344 		if(internal.buffer != external.buffer)
1345 		{
1346 			deallocate(internal.buffer);
1347 		}
1348 
1349 		deallocate(stencil.buffer);
1350 
1351 		external.buffer = nullptr;
1352 		internal.buffer = nullptr;
1353 		stencil.buffer = nullptr;
1354 	}
1355 
lockExternal(int x,int y,int z,Lock lock,Accessor client)1356 	void *Surface::lockExternal(int x, int y, int z, Lock lock, Accessor client)
1357 	{
1358 		resource->lock(client);
1359 
1360 		if(!external.buffer)
1361 		{
1362 			if(internal.buffer && identicalBuffers())
1363 			{
1364 				external.buffer = internal.buffer;
1365 			}
1366 			else
1367 			{
1368 				external.buffer = allocateBuffer(external.width, external.height, external.depth, external.border, external.samples, external.format);
1369 			}
1370 		}
1371 
1372 		if(internal.dirty)
1373 		{
1374 			if(lock != LOCK_DISCARD)
1375 			{
1376 				update(external, internal);
1377 			}
1378 
1379 			internal.dirty = false;
1380 		}
1381 
1382 		switch(lock)
1383 		{
1384 		case LOCK_READONLY:
1385 			break;
1386 		case LOCK_WRITEONLY:
1387 		case LOCK_READWRITE:
1388 		case LOCK_DISCARD:
1389 			dirtyContents = true;
1390 			break;
1391 		default:
1392 			ASSERT(false);
1393 		}
1394 
1395 		return external.lockRect(x, y, z, lock);
1396 	}
1397 
unlockExternal()1398 	void Surface::unlockExternal()
1399 	{
1400 		external.unlockRect();
1401 
1402 		resource->unlock();
1403 	}
1404 
lockInternal(int x,int y,int z,Lock lock,Accessor client)1405 	void *Surface::lockInternal(int x, int y, int z, Lock lock, Accessor client)
1406 	{
1407 		if(lock != LOCK_UNLOCKED)
1408 		{
1409 			resource->lock(client);
1410 		}
1411 
1412 		if(!internal.buffer)
1413 		{
1414 			if(external.buffer && identicalBuffers())
1415 			{
1416 				internal.buffer = external.buffer;
1417 			}
1418 			else
1419 			{
1420 				internal.buffer = allocateBuffer(internal.width, internal.height, internal.depth, internal.border, internal.samples, internal.format);
1421 			}
1422 		}
1423 
1424 		// FIXME: WHQL requires conversion to lower external precision and back
1425 		if(logPrecision >= WHQL)
1426 		{
1427 			if(internal.dirty && renderTarget && internal.format != external.format)
1428 			{
1429 				if(lock != LOCK_DISCARD)
1430 				{
1431 					switch(external.format)
1432 					{
1433 					case FORMAT_R3G3B2:
1434 					case FORMAT_A8R3G3B2:
1435 					case FORMAT_A1R5G5B5:
1436 					case FORMAT_A2R10G10B10:
1437 					case FORMAT_A2B10G10R10:
1438 						lockExternal(0, 0, 0, LOCK_READWRITE, client);
1439 						unlockExternal();
1440 						break;
1441 					default:
1442 						// Difference passes WHQL
1443 						break;
1444 					}
1445 				}
1446 			}
1447 		}
1448 
1449 		if(external.dirty || (isPalette(external.format) && paletteUsed != Surface::paletteID))
1450 		{
1451 			if(lock != LOCK_DISCARD)
1452 			{
1453 				update(internal, external);
1454 			}
1455 
1456 			external.dirty = false;
1457 			paletteUsed = Surface::paletteID;
1458 		}
1459 
1460 		switch(lock)
1461 		{
1462 		case LOCK_UNLOCKED:
1463 		case LOCK_READONLY:
1464 			break;
1465 		case LOCK_WRITEONLY:
1466 		case LOCK_READWRITE:
1467 		case LOCK_DISCARD:
1468 			dirtyContents = true;
1469 			break;
1470 		default:
1471 			ASSERT(false);
1472 		}
1473 
1474 		if(lock == LOCK_READONLY && client == PUBLIC)
1475 		{
1476 			resolve();
1477 		}
1478 
1479 		return internal.lockRect(x, y, z, lock);
1480 	}
1481 
unlockInternal()1482 	void Surface::unlockInternal()
1483 	{
1484 		internal.unlockRect();
1485 
1486 		resource->unlock();
1487 	}
1488 
lockStencil(int x,int y,int front,Accessor client)1489 	void *Surface::lockStencil(int x, int y, int front, Accessor client)
1490 	{
1491 		resource->lock(client);
1492 
1493 		if(stencil.format == FORMAT_NULL)
1494 		{
1495 			return nullptr;
1496 		}
1497 
1498 		if(!stencil.buffer)
1499 		{
1500 			stencil.buffer = allocateBuffer(stencil.width, stencil.height, stencil.depth, stencil.border, stencil.samples, stencil.format);
1501 		}
1502 
1503 		return stencil.lockRect(x, y, front, LOCK_READWRITE);   // FIXME
1504 	}
1505 
unlockStencil()1506 	void Surface::unlockStencil()
1507 	{
1508 		stencil.unlockRect();
1509 
1510 		resource->unlock();
1511 	}
1512 
bytes(Format format)1513 	int Surface::bytes(Format format)
1514 	{
1515 		switch(format)
1516 		{
1517 		case FORMAT_NULL:				return 0;
1518 		case FORMAT_P8:					return 1;
1519 		case FORMAT_A8P8:				return 2;
1520 		case FORMAT_A8:					return 1;
1521 		case FORMAT_R8I:				return 1;
1522 		case FORMAT_R8:					return 1;
1523 		case FORMAT_R3G3B2:				return 1;
1524 		case FORMAT_R16I:				return 2;
1525 		case FORMAT_R16UI:				return 2;
1526 		case FORMAT_A8R3G3B2:			return 2;
1527 		case FORMAT_R5G6B5:				return 2;
1528 		case FORMAT_A1R5G5B5:			return 2;
1529 		case FORMAT_X1R5G5B5:			return 2;
1530 		case FORMAT_R5G5B5A1:           return 2;
1531 		case FORMAT_X4R4G4B4:			return 2;
1532 		case FORMAT_A4R4G4B4:			return 2;
1533 		case FORMAT_R4G4B4A4:           return 2;
1534 		case FORMAT_R8G8B8:				return 3;
1535 		case FORMAT_B8G8R8:             return 3;
1536 		case FORMAT_R32I:				return 4;
1537 		case FORMAT_R32UI:				return 4;
1538 		case FORMAT_X8R8G8B8:			return 4;
1539 	//	case FORMAT_X8G8R8B8Q:			return 4;
1540 		case FORMAT_A8R8G8B8:			return 4;
1541 	//	case FORMAT_A8G8R8B8Q:			return 4;
1542 		case FORMAT_X8B8G8R8I:			return 4;
1543 		case FORMAT_X8B8G8R8:			return 4;
1544 		case FORMAT_SRGB8_X8:			return 4;
1545 		case FORMAT_SRGB8_A8:			return 4;
1546 		case FORMAT_A8B8G8R8I:			return 4;
1547 		case FORMAT_R8UI:				return 1;
1548 		case FORMAT_G8R8UI:				return 2;
1549 		case FORMAT_X8B8G8R8UI:			return 4;
1550 		case FORMAT_A8B8G8R8UI:			return 4;
1551 		case FORMAT_A8B8G8R8:			return 4;
1552 		case FORMAT_R8_SNORM:			return 1;
1553 		case FORMAT_G8R8_SNORM:		return 2;
1554 		case FORMAT_X8B8G8R8_SNORM:	return 4;
1555 		case FORMAT_A8B8G8R8_SNORM:	return 4;
1556 		case FORMAT_A2R10G10B10:		return 4;
1557 		case FORMAT_A2B10G10R10:		return 4;
1558 		case FORMAT_A2B10G10R10UI:		return 4;
1559 		case FORMAT_G8R8I:				return 2;
1560 		case FORMAT_G8R8:				return 2;
1561 		case FORMAT_G16R16I:			return 4;
1562 		case FORMAT_G16R16UI:			return 4;
1563 		case FORMAT_G16R16:				return 4;
1564 		case FORMAT_G32R32I:			return 8;
1565 		case FORMAT_G32R32UI:			return 8;
1566 		case FORMAT_X16B16G16R16I:		return 8;
1567 		case FORMAT_X16B16G16R16UI:		return 8;
1568 		case FORMAT_A16B16G16R16I:		return 8;
1569 		case FORMAT_A16B16G16R16UI:		return 8;
1570 		case FORMAT_A16B16G16R16:		return 8;
1571 		case FORMAT_X32B32G32R32I:		return 16;
1572 		case FORMAT_X32B32G32R32UI:		return 16;
1573 		case FORMAT_A32B32G32R32I:		return 16;
1574 		case FORMAT_A32B32G32R32UI:		return 16;
1575 		// Compressed formats
1576 		case FORMAT_DXT1:				return 2;   // Column of four pixels
1577 		case FORMAT_DXT3:				return 4;   // Column of four pixels
1578 		case FORMAT_DXT5:				return 4;   // Column of four pixels
1579 		case FORMAT_ATI1:				return 2;   // Column of four pixels
1580 		case FORMAT_ATI2:				return 4;   // Column of four pixels
1581 		case FORMAT_ETC1:				return 2;   // Column of four pixels
1582 		case FORMAT_R11_EAC:			return 2;
1583 		case FORMAT_SIGNED_R11_EAC:		return 2;
1584 		case FORMAT_RG11_EAC:			return 4;
1585 		case FORMAT_SIGNED_RG11_EAC:	return 4;
1586 		case FORMAT_RGB8_ETC2:			return 2;
1587 		case FORMAT_SRGB8_ETC2:			return 2;
1588 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:	return 2;
1589 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:	return 2;
1590 		case FORMAT_RGBA8_ETC2_EAC:			return 4;
1591 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:	return 4;
1592 		// Bumpmap formats
1593 		case FORMAT_V8U8:				return 2;
1594 		case FORMAT_L6V5U5:				return 2;
1595 		case FORMAT_Q8W8V8U8:			return 4;
1596 		case FORMAT_X8L8V8U8:			return 4;
1597 		case FORMAT_A2W10V10U10:		return 4;
1598 		case FORMAT_V16U16:				return 4;
1599 		case FORMAT_A16W16V16U16:		return 8;
1600 		case FORMAT_Q16W16V16U16:		return 8;
1601 		// Luminance formats
1602 		case FORMAT_L8:					return 1;
1603 		case FORMAT_A4L4:				return 1;
1604 		case FORMAT_L16:				return 2;
1605 		case FORMAT_A8L8:				return 2;
1606 		case FORMAT_L16F:               return 2;
1607 		case FORMAT_A16L16F:            return 4;
1608 		case FORMAT_L32F:               return 4;
1609 		case FORMAT_A32L32F:            return 8;
1610 		// Floating-point formats
1611 		case FORMAT_A16F:				return 2;
1612 		case FORMAT_R16F:				return 2;
1613 		case FORMAT_G16R16F:			return 4;
1614 		case FORMAT_B16G16R16F:			return 6;
1615 		case FORMAT_X16B16G16R16F:		return 8;
1616 		case FORMAT_A16B16G16R16F:		return 8;
1617 		case FORMAT_X16B16G16R16F_UNSIGNED: return 8;
1618 		case FORMAT_A32F:				return 4;
1619 		case FORMAT_R32F:				return 4;
1620 		case FORMAT_G32R32F:			return 8;
1621 		case FORMAT_B32G32R32F:			return 12;
1622 		case FORMAT_X32B32G32R32F:		return 16;
1623 		case FORMAT_A32B32G32R32F:		return 16;
1624 		case FORMAT_X32B32G32R32F_UNSIGNED: return 16;
1625 		// Depth/stencil formats
1626 		case FORMAT_D16:				return 2;
1627 		case FORMAT_D32:				return 4;
1628 		case FORMAT_D24X8:				return 4;
1629 		case FORMAT_D24S8:				return 4;
1630 		case FORMAT_D24FS8:				return 4;
1631 		case FORMAT_D32F:				return 4;
1632 		case FORMAT_D32FS8:				return 4;
1633 		case FORMAT_D32F_COMPLEMENTARY:	return 4;
1634 		case FORMAT_D32FS8_COMPLEMENTARY: return 4;
1635 		case FORMAT_D32F_LOCKABLE:		return 4;
1636 		case FORMAT_D32FS8_TEXTURE:		return 4;
1637 		case FORMAT_D32F_SHADOW:		return 4;
1638 		case FORMAT_D32FS8_SHADOW:		return 4;
1639 		case FORMAT_DF24S8:				return 4;
1640 		case FORMAT_DF16S8:				return 2;
1641 		case FORMAT_INTZ:				return 4;
1642 		case FORMAT_S8:					return 1;
1643 		case FORMAT_YV12_BT601:         return 1;   // Y plane only
1644 		case FORMAT_YV12_BT709:         return 1;   // Y plane only
1645 		case FORMAT_YV12_JFIF:          return 1;   // Y plane only
1646 		default:
1647 			ASSERT(false);
1648 		}
1649 
1650 		return 0;
1651 	}
1652 
pitchB(int width,int border,Format format,bool target)1653 	int Surface::pitchB(int width, int border, Format format, bool target)
1654 	{
1655 		width += 2 * border;
1656 
1657 		// Render targets require 2x2 quads
1658 		if(target || isDepth(format) || isStencil(format))
1659 		{
1660 			width = align<2>(width);
1661 		}
1662 
1663 		switch(format)
1664 		{
1665 		case FORMAT_DXT1:
1666 		case FORMAT_ETC1:
1667 		case FORMAT_R11_EAC:
1668 		case FORMAT_SIGNED_R11_EAC:
1669 		case FORMAT_RGB8_ETC2:
1670 		case FORMAT_SRGB8_ETC2:
1671 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1672 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1673 			return 8 * ((width + 3) / 4);    // 64 bit per 4x4 block, computed per 4 rows
1674 		case FORMAT_RG11_EAC:
1675 		case FORMAT_SIGNED_RG11_EAC:
1676 		case FORMAT_RGBA8_ETC2_EAC:
1677 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
1678 			return 16 * ((width + 3) / 4);    // 128 bit per 4x4 block, computed per 4 rows
1679 		case FORMAT_DXT3:
1680 		case FORMAT_DXT5:
1681 			return 16 * ((width + 3) / 4);   // 128 bit per 4x4 block, computed per 4 rows
1682 		case FORMAT_ATI1:
1683 			return 2 * ((width + 3) / 4);    // 64 bit per 4x4 block, computed per row
1684 		case FORMAT_ATI2:
1685 			return 4 * ((width + 3) / 4);    // 128 bit per 4x4 block, computed per row
1686 		case FORMAT_YV12_BT601:
1687 		case FORMAT_YV12_BT709:
1688 		case FORMAT_YV12_JFIF:
1689 			return align<16>(width);
1690 		default:
1691 			return bytes(format) * width;
1692 		}
1693 	}
1694 
pitchP(int width,int border,Format format,bool target)1695 	int Surface::pitchP(int width, int border, Format format, bool target)
1696 	{
1697 		int B = bytes(format);
1698 
1699 		return B > 0 ? pitchB(width, border, format, target) / B : 0;
1700 	}
1701 
sliceB(int width,int height,int border,Format format,bool target)1702 	int Surface::sliceB(int width, int height, int border, Format format, bool target)
1703 	{
1704 		height += 2 * border;
1705 
1706 		// Render targets require 2x2 quads
1707 		if(target || isDepth(format) || isStencil(format))
1708 		{
1709 			height = align<2>(height);
1710 		}
1711 
1712 		switch(format)
1713 		{
1714 		case FORMAT_DXT1:
1715 		case FORMAT_DXT3:
1716 		case FORMAT_DXT5:
1717 		case FORMAT_ETC1:
1718 		case FORMAT_R11_EAC:
1719 		case FORMAT_SIGNED_R11_EAC:
1720 		case FORMAT_RG11_EAC:
1721 		case FORMAT_SIGNED_RG11_EAC:
1722 		case FORMAT_RGB8_ETC2:
1723 		case FORMAT_SRGB8_ETC2:
1724 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1725 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1726 		case FORMAT_RGBA8_ETC2_EAC:
1727 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
1728 			return pitchB(width, border, format, target) * ((height + 3) / 4);   // Pitch computed per 4 rows
1729 		case FORMAT_ATI1:
1730 		case FORMAT_ATI2:
1731 			return pitchB(width, border, format, target) * align<4>(height);   // Pitch computed per row
1732 		default:
1733 			return pitchB(width, border, format, target) * height;   // Pitch computed per row
1734 		}
1735 	}
1736 
sliceP(int width,int height,int border,Format format,bool target)1737 	int Surface::sliceP(int width, int height, int border, Format format, bool target)
1738 	{
1739 		int B = bytes(format);
1740 
1741 		return B > 0 ? sliceB(width, height, border, format, target) / B : 0;
1742 	}
1743 
update(Buffer & destination,Buffer & source)1744 	void Surface::update(Buffer &destination, Buffer &source)
1745 	{
1746 	//	ASSERT(source.lock != LOCK_UNLOCKED);
1747 	//	ASSERT(destination.lock != LOCK_UNLOCKED);
1748 
1749 		if(destination.buffer != source.buffer)
1750 		{
1751 			ASSERT(source.dirty && !destination.dirty);
1752 
1753 			switch(source.format)
1754 			{
1755 			case FORMAT_R8G8B8:		decodeR8G8B8(destination, source);		break;   // FIXME: Check destination format
1756 			case FORMAT_X1R5G5B5:	decodeX1R5G5B5(destination, source);	break;   // FIXME: Check destination format
1757 			case FORMAT_A1R5G5B5:	decodeA1R5G5B5(destination, source);	break;   // FIXME: Check destination format
1758 			case FORMAT_X4R4G4B4:	decodeX4R4G4B4(destination, source);	break;   // FIXME: Check destination format
1759 			case FORMAT_A4R4G4B4:	decodeA4R4G4B4(destination, source);	break;   // FIXME: Check destination format
1760 			case FORMAT_P8:			decodeP8(destination, source);			break;   // FIXME: Check destination format
1761 			case FORMAT_DXT1:		decodeDXT1(destination, source);		break;   // FIXME: Check destination format
1762 			case FORMAT_DXT3:		decodeDXT3(destination, source);		break;   // FIXME: Check destination format
1763 			case FORMAT_DXT5:		decodeDXT5(destination, source);		break;   // FIXME: Check destination format
1764 			case FORMAT_ATI1:		decodeATI1(destination, source);		break;   // FIXME: Check destination format
1765 			case FORMAT_ATI2:		decodeATI2(destination, source);		break;   // FIXME: Check destination format
1766 			case FORMAT_R11_EAC:         decodeEAC(destination, source, 1, false); break; // FIXME: Check destination format
1767 			case FORMAT_SIGNED_R11_EAC:  decodeEAC(destination, source, 1, true);  break; // FIXME: Check destination format
1768 			case FORMAT_RG11_EAC:        decodeEAC(destination, source, 2, false); break; // FIXME: Check destination format
1769 			case FORMAT_SIGNED_RG11_EAC: decodeEAC(destination, source, 2, true);  break; // FIXME: Check destination format
1770 			case FORMAT_ETC1:
1771 			case FORMAT_RGB8_ETC2:                      decodeETC2(destination, source, 0, false); break; // FIXME: Check destination format
1772 			case FORMAT_SRGB8_ETC2:                     decodeETC2(destination, source, 0, true);  break; // FIXME: Check destination format
1773 			case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:  decodeETC2(destination, source, 1, false); break; // FIXME: Check destination format
1774 			case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2: decodeETC2(destination, source, 1, true);  break; // FIXME: Check destination format
1775 			case FORMAT_RGBA8_ETC2_EAC:                 decodeETC2(destination, source, 8, false); break; // FIXME: Check destination format
1776 			case FORMAT_SRGB8_ALPHA8_ETC2_EAC:          decodeETC2(destination, source, 8, true);  break; // FIXME: Check destination format
1777 			default:				genericUpdate(destination, source);		break;
1778 			}
1779 		}
1780 	}
1781 
genericUpdate(Buffer & destination,Buffer & source)1782 	void Surface::genericUpdate(Buffer &destination, Buffer &source)
1783 	{
1784 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
1785 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
1786 
1787 		int depth = min(destination.depth, source.depth);
1788 		int height = min(destination.height, source.height);
1789 		int width = min(destination.width, source.width);
1790 		int rowBytes = width * source.bytes;
1791 
1792 		for(int z = 0; z < depth; z++)
1793 		{
1794 			unsigned char *sourceRow = sourceSlice;
1795 			unsigned char *destinationRow = destinationSlice;
1796 
1797 			for(int y = 0; y < height; y++)
1798 			{
1799 				if(source.format == destination.format)
1800 				{
1801 					memcpy(destinationRow, sourceRow, rowBytes);
1802 				}
1803 				else
1804 				{
1805 					unsigned char *sourceElement = sourceRow;
1806 					unsigned char *destinationElement = destinationRow;
1807 
1808 					for(int x = 0; x < width; x++)
1809 					{
1810 						Color<float> color = source.read(sourceElement);
1811 						destination.write(destinationElement, color);
1812 
1813 						sourceElement += source.bytes;
1814 						destinationElement += destination.bytes;
1815 					}
1816 				}
1817 
1818 				sourceRow += source.pitchB;
1819 				destinationRow += destination.pitchB;
1820 			}
1821 
1822 			sourceSlice += source.sliceB;
1823 			destinationSlice += destination.sliceB;
1824 		}
1825 
1826 		source.unlockRect();
1827 		destination.unlockRect();
1828 	}
1829 
decodeR8G8B8(Buffer & destination,Buffer & source)1830 	void Surface::decodeR8G8B8(Buffer &destination, Buffer &source)
1831 	{
1832 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
1833 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
1834 
1835 		int depth = min(destination.depth, source.depth);
1836 		int height = min(destination.height, source.height);
1837 		int width = min(destination.width, source.width);
1838 
1839 		for(int z = 0; z < depth; z++)
1840 		{
1841 			unsigned char *sourceRow = sourceSlice;
1842 			unsigned char *destinationRow = destinationSlice;
1843 
1844 			for(int y = 0; y < height; y++)
1845 			{
1846 				unsigned char *sourceElement = sourceRow;
1847 				unsigned char *destinationElement = destinationRow;
1848 
1849 				for(int x = 0; x < width; x++)
1850 				{
1851 					unsigned int b = sourceElement[0];
1852 					unsigned int g = sourceElement[1];
1853 					unsigned int r = sourceElement[2];
1854 
1855 					*(unsigned int*)destinationElement = 0xFF000000 | (r << 16) | (g << 8) | (b << 0);
1856 
1857 					sourceElement += source.bytes;
1858 					destinationElement += destination.bytes;
1859 				}
1860 
1861 				sourceRow += source.pitchB;
1862 				destinationRow += destination.pitchB;
1863 			}
1864 
1865 			sourceSlice += source.sliceB;
1866 			destinationSlice += destination.sliceB;
1867 		}
1868 
1869 		source.unlockRect();
1870 		destination.unlockRect();
1871 	}
1872 
decodeX1R5G5B5(Buffer & destination,Buffer & source)1873 	void Surface::decodeX1R5G5B5(Buffer &destination, Buffer &source)
1874 	{
1875 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
1876 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
1877 
1878 		int depth = min(destination.depth, source.depth);
1879 		int height = min(destination.height, source.height);
1880 		int width = min(destination.width, source.width);
1881 
1882 		for(int z = 0; z < depth; z++)
1883 		{
1884 			unsigned char *sourceRow = sourceSlice;
1885 			unsigned char *destinationRow = destinationSlice;
1886 
1887 			for(int y = 0; y < height; y++)
1888 			{
1889 				unsigned char *sourceElement = sourceRow;
1890 				unsigned char *destinationElement = destinationRow;
1891 
1892 				for(int x = 0; x < width; x++)
1893 				{
1894 					unsigned int xrgb = *(unsigned short*)sourceElement;
1895 
1896 					unsigned int r = (((xrgb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
1897 					unsigned int g = (((xrgb & 0x03E0) * 16846 + 0x8000) >> 8) & 0x0000FF00;
1898 					unsigned int b = (((xrgb & 0x001F) * 2106  + 0x80) >> 8);
1899 
1900 					*(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
1901 
1902 					sourceElement += source.bytes;
1903 					destinationElement += destination.bytes;
1904 				}
1905 
1906 				sourceRow += source.pitchB;
1907 				destinationRow += destination.pitchB;
1908 			}
1909 
1910 			sourceSlice += source.sliceB;
1911 			destinationSlice += destination.sliceB;
1912 		}
1913 
1914 		source.unlockRect();
1915 		destination.unlockRect();
1916 	}
1917 
decodeA1R5G5B5(Buffer & destination,Buffer & source)1918 	void Surface::decodeA1R5G5B5(Buffer &destination, Buffer &source)
1919 	{
1920 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
1921 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
1922 
1923 		int depth = min(destination.depth, source.depth);
1924 		int height = min(destination.height, source.height);
1925 		int width = min(destination.width, source.width);
1926 
1927 		for(int z = 0; z < depth; z++)
1928 		{
1929 			unsigned char *sourceRow = sourceSlice;
1930 			unsigned char *destinationRow = destinationSlice;
1931 
1932 			for(int y = 0; y < height; y++)
1933 			{
1934 				unsigned char *sourceElement = sourceRow;
1935 				unsigned char *destinationElement = destinationRow;
1936 
1937 				for(int x = 0; x < width; x++)
1938 				{
1939 					unsigned int argb = *(unsigned short*)sourceElement;
1940 
1941 					unsigned int a =   (argb & 0x8000) * 130560;
1942 					unsigned int r = (((argb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
1943 					unsigned int g = (((argb & 0x03E0) * 16846  + 0x8000) >> 8) & 0x0000FF00;
1944 					unsigned int b = (((argb & 0x001F) * 2106   + 0x80) >> 8);
1945 
1946 					*(unsigned int*)destinationElement = a | r | g | b;
1947 
1948 					sourceElement += source.bytes;
1949 					destinationElement += destination.bytes;
1950 				}
1951 
1952 				sourceRow += source.pitchB;
1953 				destinationRow += destination.pitchB;
1954 			}
1955 
1956 			sourceSlice += source.sliceB;
1957 			destinationSlice += destination.sliceB;
1958 		}
1959 
1960 		source.unlockRect();
1961 		destination.unlockRect();
1962 	}
1963 
decodeX4R4G4B4(Buffer & destination,Buffer & source)1964 	void Surface::decodeX4R4G4B4(Buffer &destination, Buffer &source)
1965 	{
1966 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
1967 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
1968 
1969 		int depth = min(destination.depth, source.depth);
1970 		int height = min(destination.height, source.height);
1971 		int width = min(destination.width, source.width);
1972 
1973 		for(int z = 0; z < depth; z++)
1974 		{
1975 			unsigned char *sourceRow = sourceSlice;
1976 			unsigned char *destinationRow = destinationSlice;
1977 
1978 			for(int y = 0; y < height; y++)
1979 			{
1980 				unsigned char *sourceElement = sourceRow;
1981 				unsigned char *destinationElement = destinationRow;
1982 
1983 				for(int x = 0; x < width; x++)
1984 				{
1985 					unsigned int xrgb = *(unsigned short*)sourceElement;
1986 
1987 					unsigned int r = ((xrgb & 0x0F00) * 0x00001100) & 0x00FF0000;
1988 					unsigned int g = ((xrgb & 0x00F0) * 0x00000110) & 0x0000FF00;
1989 					unsigned int b =  (xrgb & 0x000F) * 0x00000011;
1990 
1991 					*(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
1992 
1993 					sourceElement += source.bytes;
1994 					destinationElement += destination.bytes;
1995 				}
1996 
1997 				sourceRow += source.pitchB;
1998 				destinationRow += destination.pitchB;
1999 			}
2000 
2001 			sourceSlice += source.sliceB;
2002 			destinationSlice += destination.sliceB;
2003 		}
2004 
2005 		source.unlockRect();
2006 		destination.unlockRect();
2007 	}
2008 
decodeA4R4G4B4(Buffer & destination,Buffer & source)2009 	void Surface::decodeA4R4G4B4(Buffer &destination, Buffer &source)
2010 	{
2011 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
2012 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
2013 
2014 		int depth = min(destination.depth, source.depth);
2015 		int height = min(destination.height, source.height);
2016 		int width = min(destination.width, source.width);
2017 
2018 		for(int z = 0; z < depth; z++)
2019 		{
2020 			unsigned char *sourceRow = sourceSlice;
2021 			unsigned char *destinationRow = destinationSlice;
2022 
2023 			for(int y = 0; y < height; y++)
2024 			{
2025 				unsigned char *sourceElement = sourceRow;
2026 				unsigned char *destinationElement = destinationRow;
2027 
2028 				for(int x = 0; x < width; x++)
2029 				{
2030 					unsigned int argb = *(unsigned short*)sourceElement;
2031 
2032 					unsigned int a = ((argb & 0xF000) * 0x00011000) & 0xFF000000;
2033 					unsigned int r = ((argb & 0x0F00) * 0x00001100) & 0x00FF0000;
2034 					unsigned int g = ((argb & 0x00F0) * 0x00000110) & 0x0000FF00;
2035 					unsigned int b =  (argb & 0x000F) * 0x00000011;
2036 
2037 					*(unsigned int*)destinationElement = a | r | g | b;
2038 
2039 					sourceElement += source.bytes;
2040 					destinationElement += destination.bytes;
2041 				}
2042 
2043 				sourceRow += source.pitchB;
2044 				destinationRow += destination.pitchB;
2045 			}
2046 
2047 			sourceSlice += source.sliceB;
2048 			destinationSlice += destination.sliceB;
2049 		}
2050 
2051 		source.unlockRect();
2052 		destination.unlockRect();
2053 	}
2054 
decodeP8(Buffer & destination,Buffer & source)2055 	void Surface::decodeP8(Buffer &destination, Buffer &source)
2056 	{
2057 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
2058 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
2059 
2060 		int depth = min(destination.depth, source.depth);
2061 		int height = min(destination.height, source.height);
2062 		int width = min(destination.width, source.width);
2063 
2064 		for(int z = 0; z < depth; z++)
2065 		{
2066 			unsigned char *sourceRow = sourceSlice;
2067 			unsigned char *destinationRow = destinationSlice;
2068 
2069 			for(int y = 0; y < height; y++)
2070 			{
2071 				unsigned char *sourceElement = sourceRow;
2072 				unsigned char *destinationElement = destinationRow;
2073 
2074 				for(int x = 0; x < width; x++)
2075 				{
2076 					unsigned int abgr = palette[*(unsigned char*)sourceElement];
2077 
2078 					unsigned int r = (abgr & 0x000000FF) << 16;
2079 					unsigned int g = (abgr & 0x0000FF00) << 0;
2080 					unsigned int b = (abgr & 0x00FF0000) >> 16;
2081 					unsigned int a = (abgr & 0xFF000000) >> 0;
2082 
2083 					*(unsigned int*)destinationElement = a | r | g | b;
2084 
2085 					sourceElement += source.bytes;
2086 					destinationElement += destination.bytes;
2087 				}
2088 
2089 				sourceRow += source.pitchB;
2090 				destinationRow += destination.pitchB;
2091 			}
2092 
2093 			sourceSlice += source.sliceB;
2094 			destinationSlice += destination.sliceB;
2095 		}
2096 
2097 		source.unlockRect();
2098 		destination.unlockRect();
2099 	}
2100 
decodeDXT1(Buffer & internal,Buffer & external)2101 	void Surface::decodeDXT1(Buffer &internal, Buffer &external)
2102 	{
2103 		unsigned int *destSlice = (unsigned int*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
2104 		const DXT1 *source = (const DXT1*)external.lockRect(0, 0, 0, LOCK_READONLY);
2105 
2106 		for(int z = 0; z < external.depth; z++)
2107 		{
2108 			unsigned int *dest = destSlice;
2109 
2110 			for(int y = 0; y < external.height; y += 4)
2111 			{
2112 				for(int x = 0; x < external.width; x += 4)
2113 				{
2114 					Color<byte> c[4];
2115 
2116 					c[0] = source->c0;
2117 					c[1] = source->c1;
2118 
2119 					if(source->c0 > source->c1)   // No transparency
2120 					{
2121 						// c2 = 2 / 3 * c0 + 1 / 3 * c1
2122 						c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
2123 						c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
2124 						c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
2125 						c[2].a = 0xFF;
2126 
2127 						// c3 = 1 / 3 * c0 + 2 / 3 * c1
2128 						c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
2129 						c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
2130 						c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
2131 						c[3].a = 0xFF;
2132 					}
2133 					else   // c3 transparent
2134 					{
2135 						// c2 = 1 / 2 * c0 + 1 / 2 * c1
2136 						c[2].r = (byte)(((word)c[0].r + (word)c[1].r) / 2);
2137 						c[2].g = (byte)(((word)c[0].g + (word)c[1].g) / 2);
2138 						c[2].b = (byte)(((word)c[0].b + (word)c[1].b) / 2);
2139 						c[2].a = 0xFF;
2140 
2141 						c[3].r = 0;
2142 						c[3].g = 0;
2143 						c[3].b = 0;
2144 						c[3].a = 0;
2145 					}
2146 
2147 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2148 					{
2149 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2150 						{
2151 							dest[(x + i) + (y + j) * internal.pitchP] = c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4];
2152 						}
2153 					}
2154 
2155 					source++;
2156 				}
2157 			}
2158 
2159 			(byte*&)destSlice += internal.sliceB;
2160 		}
2161 
2162 		external.unlockRect();
2163 		internal.unlockRect();
2164 	}
2165 
decodeDXT3(Buffer & internal,Buffer & external)2166 	void Surface::decodeDXT3(Buffer &internal, Buffer &external)
2167 	{
2168 		unsigned int *destSlice = (unsigned int*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
2169 		const DXT3 *source = (const DXT3*)external.lockRect(0, 0, 0, LOCK_READONLY);
2170 
2171 		for(int z = 0; z < external.depth; z++)
2172 		{
2173 			unsigned int *dest = destSlice;
2174 
2175 			for(int y = 0; y < external.height; y += 4)
2176 			{
2177 				for(int x = 0; x < external.width; x += 4)
2178 				{
2179 					Color<byte> c[4];
2180 
2181 					c[0] = source->c0;
2182 					c[1] = source->c1;
2183 
2184 					// c2 = 2 / 3 * c0 + 1 / 3 * c1
2185 					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
2186 					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
2187 					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
2188 
2189 					// c3 = 1 / 3 * c0 + 2 / 3 * c1
2190 					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
2191 					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
2192 					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
2193 
2194 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2195 					{
2196 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2197 						{
2198 							unsigned int a = (unsigned int)(source->a >> 4 * (i + j * 4)) & 0x0F;
2199 							unsigned int color = (c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | ((a << 28) + (a << 24));
2200 
2201 							dest[(x + i) + (y + j) * internal.pitchP] = color;
2202 						}
2203 					}
2204 
2205 					source++;
2206 				}
2207 			}
2208 
2209 			(byte*&)destSlice += internal.sliceB;
2210 		}
2211 
2212 		external.unlockRect();
2213 		internal.unlockRect();
2214 	}
2215 
decodeDXT5(Buffer & internal,Buffer & external)2216 	void Surface::decodeDXT5(Buffer &internal, Buffer &external)
2217 	{
2218 		unsigned int *destSlice = (unsigned int*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
2219 		const DXT5 *source = (const DXT5*)external.lockRect(0, 0, 0, LOCK_READONLY);
2220 
2221 		for(int z = 0; z < external.depth; z++)
2222 		{
2223 			unsigned int *dest = destSlice;
2224 
2225 			for(int y = 0; y < external.height; y += 4)
2226 			{
2227 				for(int x = 0; x < external.width; x += 4)
2228 				{
2229 					Color<byte> c[4];
2230 
2231 					c[0] = source->c0;
2232 					c[1] = source->c1;
2233 
2234 					// c2 = 2 / 3 * c0 + 1 / 3 * c1
2235 					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
2236 					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
2237 					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
2238 
2239 					// c3 = 1 / 3 * c0 + 2 / 3 * c1
2240 					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
2241 					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
2242 					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
2243 
2244 					byte a[8];
2245 
2246 					a[0] = source->a0;
2247 					a[1] = source->a1;
2248 
2249 					if(a[0] > a[1])
2250 					{
2251 						a[2] = (byte)((6 * (word)a[0] + 1 * (word)a[1] + 3) / 7);
2252 						a[3] = (byte)((5 * (word)a[0] + 2 * (word)a[1] + 3) / 7);
2253 						a[4] = (byte)((4 * (word)a[0] + 3 * (word)a[1] + 3) / 7);
2254 						a[5] = (byte)((3 * (word)a[0] + 4 * (word)a[1] + 3) / 7);
2255 						a[6] = (byte)((2 * (word)a[0] + 5 * (word)a[1] + 3) / 7);
2256 						a[7] = (byte)((1 * (word)a[0] + 6 * (word)a[1] + 3) / 7);
2257 					}
2258 					else
2259 					{
2260 						a[2] = (byte)((4 * (word)a[0] + 1 * (word)a[1] + 2) / 5);
2261 						a[3] = (byte)((3 * (word)a[0] + 2 * (word)a[1] + 2) / 5);
2262 						a[4] = (byte)((2 * (word)a[0] + 3 * (word)a[1] + 2) / 5);
2263 						a[5] = (byte)((1 * (word)a[0] + 4 * (word)a[1] + 2) / 5);
2264 						a[6] = 0;
2265 						a[7] = 0xFF;
2266 					}
2267 
2268 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2269 					{
2270 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2271 						{
2272 							unsigned int alpha = (unsigned int)a[(unsigned int)(source->alut >> (16 + 3 * (i + j * 4))) % 8] << 24;
2273 							unsigned int color = (c[(source->clut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | alpha;
2274 
2275 							dest[(x + i) + (y + j) * internal.pitchP] = color;
2276 						}
2277 					}
2278 
2279 					source++;
2280 				}
2281 			}
2282 
2283 			(byte*&)destSlice += internal.sliceB;
2284 		}
2285 
2286 		external.unlockRect();
2287 		internal.unlockRect();
2288 	}
2289 
decodeATI1(Buffer & internal,Buffer & external)2290 	void Surface::decodeATI1(Buffer &internal, Buffer &external)
2291 	{
2292 		byte *destSlice = (byte*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
2293 		const ATI1 *source = (const ATI1*)external.lockRect(0, 0, 0, LOCK_READONLY);
2294 
2295 		for(int z = 0; z < external.depth; z++)
2296 		{
2297 			byte *dest = destSlice;
2298 
2299 			for(int y = 0; y < external.height; y += 4)
2300 			{
2301 				for(int x = 0; x < external.width; x += 4)
2302 				{
2303 					byte r[8];
2304 
2305 					r[0] = source->r0;
2306 					r[1] = source->r1;
2307 
2308 					if(r[0] > r[1])
2309 					{
2310 						r[2] = (byte)((6 * (word)r[0] + 1 * (word)r[1] + 3) / 7);
2311 						r[3] = (byte)((5 * (word)r[0] + 2 * (word)r[1] + 3) / 7);
2312 						r[4] = (byte)((4 * (word)r[0] + 3 * (word)r[1] + 3) / 7);
2313 						r[5] = (byte)((3 * (word)r[0] + 4 * (word)r[1] + 3) / 7);
2314 						r[6] = (byte)((2 * (word)r[0] + 5 * (word)r[1] + 3) / 7);
2315 						r[7] = (byte)((1 * (word)r[0] + 6 * (word)r[1] + 3) / 7);
2316 					}
2317 					else
2318 					{
2319 						r[2] = (byte)((4 * (word)r[0] + 1 * (word)r[1] + 2) / 5);
2320 						r[3] = (byte)((3 * (word)r[0] + 2 * (word)r[1] + 2) / 5);
2321 						r[4] = (byte)((2 * (word)r[0] + 3 * (word)r[1] + 2) / 5);
2322 						r[5] = (byte)((1 * (word)r[0] + 4 * (word)r[1] + 2) / 5);
2323 						r[6] = 0;
2324 						r[7] = 0xFF;
2325 					}
2326 
2327 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2328 					{
2329 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2330 						{
2331 							dest[(x + i) + (y + j) * internal.pitchP] = r[(unsigned int)(source->rlut >> (16 + 3 * (i + j * 4))) % 8];
2332 						}
2333 					}
2334 
2335 					source++;
2336 				}
2337 			}
2338 
2339 			destSlice += internal.sliceB;
2340 		}
2341 
2342 		external.unlockRect();
2343 		internal.unlockRect();
2344 	}
2345 
decodeATI2(Buffer & internal,Buffer & external)2346 	void Surface::decodeATI2(Buffer &internal, Buffer &external)
2347 	{
2348 		word *destSlice = (word*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
2349 		const ATI2 *source = (const ATI2*)external.lockRect(0, 0, 0, LOCK_READONLY);
2350 
2351 		for(int z = 0; z < external.depth; z++)
2352 		{
2353 			word *dest = destSlice;
2354 
2355 			for(int y = 0; y < external.height; y += 4)
2356 			{
2357 				for(int x = 0; x < external.width; x += 4)
2358 				{
2359 					byte X[8];
2360 
2361 					X[0] = source->x0;
2362 					X[1] = source->x1;
2363 
2364 					if(X[0] > X[1])
2365 					{
2366 						X[2] = (byte)((6 * (word)X[0] + 1 * (word)X[1] + 3) / 7);
2367 						X[3] = (byte)((5 * (word)X[0] + 2 * (word)X[1] + 3) / 7);
2368 						X[4] = (byte)((4 * (word)X[0] + 3 * (word)X[1] + 3) / 7);
2369 						X[5] = (byte)((3 * (word)X[0] + 4 * (word)X[1] + 3) / 7);
2370 						X[6] = (byte)((2 * (word)X[0] + 5 * (word)X[1] + 3) / 7);
2371 						X[7] = (byte)((1 * (word)X[0] + 6 * (word)X[1] + 3) / 7);
2372 					}
2373 					else
2374 					{
2375 						X[2] = (byte)((4 * (word)X[0] + 1 * (word)X[1] + 2) / 5);
2376 						X[3] = (byte)((3 * (word)X[0] + 2 * (word)X[1] + 2) / 5);
2377 						X[4] = (byte)((2 * (word)X[0] + 3 * (word)X[1] + 2) / 5);
2378 						X[5] = (byte)((1 * (word)X[0] + 4 * (word)X[1] + 2) / 5);
2379 						X[6] = 0;
2380 						X[7] = 0xFF;
2381 					}
2382 
2383 					byte Y[8];
2384 
2385 					Y[0] = source->y0;
2386 					Y[1] = source->y1;
2387 
2388 					if(Y[0] > Y[1])
2389 					{
2390 						Y[2] = (byte)((6 * (word)Y[0] + 1 * (word)Y[1] + 3) / 7);
2391 						Y[3] = (byte)((5 * (word)Y[0] + 2 * (word)Y[1] + 3) / 7);
2392 						Y[4] = (byte)((4 * (word)Y[0] + 3 * (word)Y[1] + 3) / 7);
2393 						Y[5] = (byte)((3 * (word)Y[0] + 4 * (word)Y[1] + 3) / 7);
2394 						Y[6] = (byte)((2 * (word)Y[0] + 5 * (word)Y[1] + 3) / 7);
2395 						Y[7] = (byte)((1 * (word)Y[0] + 6 * (word)Y[1] + 3) / 7);
2396 					}
2397 					else
2398 					{
2399 						Y[2] = (byte)((4 * (word)Y[0] + 1 * (word)Y[1] + 2) / 5);
2400 						Y[3] = (byte)((3 * (word)Y[0] + 2 * (word)Y[1] + 2) / 5);
2401 						Y[4] = (byte)((2 * (word)Y[0] + 3 * (word)Y[1] + 2) / 5);
2402 						Y[5] = (byte)((1 * (word)Y[0] + 4 * (word)Y[1] + 2) / 5);
2403 						Y[6] = 0;
2404 						Y[7] = 0xFF;
2405 					}
2406 
2407 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2408 					{
2409 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2410 						{
2411 							word r = X[(unsigned int)(source->xlut >> (16 + 3 * (i + j * 4))) % 8];
2412 							word g = Y[(unsigned int)(source->ylut >> (16 + 3 * (i + j * 4))) % 8];
2413 
2414 							dest[(x + i) + (y + j) * internal.pitchP] = (g << 8) + r;
2415 						}
2416 					}
2417 
2418 					source++;
2419 				}
2420 			}
2421 
2422 			(byte*&)destSlice += internal.sliceB;
2423 		}
2424 
2425 		external.unlockRect();
2426 		internal.unlockRect();
2427 	}
2428 
	// Decodes ETC2 data from 'external' into 'internal' via ETC_Decoder::Decode.
	//   nbAlphaBits - 8 selects ETC_RGBA, 1 selects ETC_RGB_PUNCHTHROUGH_ALPHA,
	//                 any other value selects ETC_RGB.
	//   isSRGB      - when true, the first three bytes of every decoded element
	//                 are afterwards run through an sRGB-to-linear lookup table.
	// Only the memory returned by lockRect(0, 0, 0, ...) is touched and the
	// sRGB pass walks height x width; there is no loop over depth here.
	void Surface::decodeETC2(Buffer &internal, Buffer &external, int nbAlphaBits, bool isSRGB)
	{
		// Both buffers are locked inside the argument list and unlocked right after the call.
		ETC_Decoder::Decode((const byte*)external.lockRect(0, 0, 0, LOCK_READONLY), (byte*)internal.lockRect(0, 0, 0, LOCK_UPDATE), external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
		                    (nbAlphaBits == 8) ? ETC_Decoder::ETC_RGBA : ((nbAlphaBits == 1) ? ETC_Decoder::ETC_RGB_PUNCHTHROUGH_ALPHA : ETC_Decoder::ETC_RGB));
		external.unlockRect();
		internal.unlockRect();

		if(isSRGB)
		{
			// 256-entry table mapping an 8-bit sRGB value to its rounded 8-bit linear value,
			// filled on first use.
			// NOTE(review): the dirty flag is a plain static bool with no synchronization
			// visible here - confirm callers cannot reach this concurrently.
			static byte sRGBtoLinearTable[256];
			static bool sRGBtoLinearTableDirty = true;
			if(sRGBtoLinearTableDirty)
			{
				for(int i = 0; i < 256; i++)
				{
					sRGBtoLinearTable[i] = static_cast<byte>(sRGBtoLinear(static_cast<float>(i) / 255.0f) * 255.0f + 0.5f);
				}
				sRGBtoLinearTableDirty = false;
			}

			// Perform sRGB conversion in place after decoding
			byte *src = (byte*)internal.lockRect(0, 0, 0, LOCK_READWRITE);
			for(int y = 0; y < internal.height; y++)
			{
				byte *srcRow = src + y * internal.pitchB;
				for(int x = 0; x <  internal.width; x++)
				{
					byte *srcPix = srcRow + x * internal.bytes;
					// Only bytes 0..2 are converted; any further bytes of the element are left as decoded.
					for(int i = 0; i < 3; i++)
					{
						srcPix[i] = sRGBtoLinearTable[srcPix[i]];
					}
				}
			}
			internal.unlockRect();
		}
	}
2466 
	// Decodes EAC data from 'external' into 'internal' via ETC_Decoder::Decode,
	// then converts the decoded integer channels to normalized floats in place.
	//   nbChannels - 1 or 2 (asserted); selects the R or RG decoder mode.
	//   isSigned   - selects the signed or unsigned decoder mode and the
	//                matching normalization factor.
	// Only the memory returned by lockRect(0, 0, 0, ...) is touched; there is
	// no loop over depth here.
	void Surface::decodeEAC(Buffer &internal, Buffer &external, int nbChannels, bool isSigned)
	{
		ASSERT(nbChannels == 1 || nbChannels == 2);

		// 'internal' stays locked across both the decode and the conversion pass below.
		byte *src = (byte*)internal.lockRect(0, 0, 0, LOCK_READWRITE);
		ETC_Decoder::Decode((const byte*)external.lockRect(0, 0, 0, LOCK_READONLY), src, external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
		                    (nbChannels == 1) ? (isSigned ? ETC_Decoder::ETC_R_SIGNED : ETC_Decoder::ETC_R_UNSIGNED) : (isSigned ? ETC_Decoder::ETC_RG_SIGNED : ETC_Decoder::ETC_RG_UNSIGNED));
		external.unlockRect();

		// FIXME: We convert EAC data to float, until signed short internal formats are supported
		//        This code can be removed if ETC2 images are decoded to internal 16 bit signed R/RG formats
		// 8 * 127.875 == 1023 and 8 * 255.875 == 2047, so the factor is 1/1023 (signed) or 1/2047 (unsigned).
		const float normalization = isSigned ? (1.0f / (8.0f * 127.875f)) : (1.0f / (8.0f * 255.875f));
		for(int y = 0; y < internal.height; y++)
		{
			byte* srcRow = src + y * internal.pitchB;
			// Elements and channels are visited from last to first.
			for(int x = internal.width - 1; x >= 0; x--)
			{
				// srcPix and dstPix alias the same storage: each channel is read as an int
				// and overwritten with its float equivalent.
				int* srcPix = reinterpret_cast<int*>(srcRow + x * internal.bytes);
				float* dstPix = reinterpret_cast<float*>(srcPix);
				for(int c = nbChannels - 1; c >= 0; c--)
				{
					dstPix[c] = clamp(static_cast<float>(srcPix[c]) * normalization, -1.0f, 1.0f);
				}
			}
		}

		internal.unlockRect();
	}
2495 
size(int width,int height,int depth,int border,int samples,Format format)2496 	size_t Surface::size(int width, int height, int depth, int border, int samples, Format format)
2497 	{
2498 		samples = max(1, samples);
2499 
2500 		switch(format)
2501 		{
2502 		default:
2503 			{
2504 				uint64_t size = (uint64_t)sliceB(width, height, border, format, true) * depth * samples;
2505 
2506 				// We can only sample buffers smaller than 2 GiB, due to signed 32-bit offset calculations.
2507 				// Force an out-of-memory if larger, or let the caller report an error.
2508 				if(size >= 0x80000000u)
2509 				{
2510 					return std::numeric_limits<size_t>::max();
2511 				}
2512 
2513 				// Unpacking byte4 to short4 in the sampler currently involves reading 8 bytes,
2514 				// and stencil operations also read 8 bytes per four 8-bit stencil values,
2515 				// so we have to allocate 4 extra bytes to avoid buffer overruns.
2516 			    // TODO(b/145229887): Eliminate if possible, or don't hard-code.
2517 				return static_cast<size_t>(size) + 4;
2518 			}
2519 		case FORMAT_YV12_BT601:
2520 		case FORMAT_YV12_BT709:
2521 		case FORMAT_YV12_JFIF:
2522 			{
2523 				width += 2 * border;
2524 				height += 2 * border;
2525 
2526 				size_t YStride = align<16>(width);
2527 				size_t YSize = YStride * height;
2528 				size_t CStride = align<16>(YStride / 2);
2529 				size_t CSize = CStride * height / 2;
2530 
2531 				return YSize + 2 * CSize;
2532 			}
2533 		}
2534 	}
2535 
isStencil(Format format)2536 	bool Surface::isStencil(Format format)
2537 	{
2538 		switch(format)
2539 		{
2540 		case FORMAT_D32:
2541 		case FORMAT_D16:
2542 		case FORMAT_D24X8:
2543 		case FORMAT_D32F:
2544 		case FORMAT_D32F_COMPLEMENTARY:
2545 		case FORMAT_D32F_LOCKABLE:
2546 		case FORMAT_D32F_SHADOW:
2547 			return false;
2548 		case FORMAT_D24S8:
2549 		case FORMAT_D24FS8:
2550 		case FORMAT_S8:
2551 		case FORMAT_DF24S8:
2552 		case FORMAT_DF16S8:
2553 		case FORMAT_D32FS8_TEXTURE:
2554 		case FORMAT_D32FS8_SHADOW:
2555 		case FORMAT_D32FS8:
2556 		case FORMAT_D32FS8_COMPLEMENTARY:
2557 		case FORMAT_INTZ:
2558 			return true;
2559 		default:
2560 			return false;
2561 		}
2562 	}
2563 
isDepth(Format format)2564 	bool Surface::isDepth(Format format)
2565 	{
2566 		switch(format)
2567 		{
2568 		case FORMAT_D32:
2569 		case FORMAT_D16:
2570 		case FORMAT_D24X8:
2571 		case FORMAT_D24S8:
2572 		case FORMAT_D24FS8:
2573 		case FORMAT_D32F:
2574 		case FORMAT_D32FS8:
2575 		case FORMAT_D32F_COMPLEMENTARY:
2576 		case FORMAT_D32FS8_COMPLEMENTARY:
2577 		case FORMAT_D32F_LOCKABLE:
2578 		case FORMAT_DF24S8:
2579 		case FORMAT_DF16S8:
2580 		case FORMAT_D32FS8_TEXTURE:
2581 		case FORMAT_D32F_SHADOW:
2582 		case FORMAT_D32FS8_SHADOW:
2583 		case FORMAT_INTZ:
2584 			return true;
2585 		case FORMAT_S8:
2586 			return false;
2587 		default:
2588 			return false;
2589 		}
2590 	}
2591 
hasQuadLayout(Format format)2592 	bool Surface::hasQuadLayout(Format format)
2593 	{
2594 		switch(format)
2595 		{
2596 		case FORMAT_D32:
2597 		case FORMAT_D16:
2598 		case FORMAT_D24X8:
2599 		case FORMAT_D24S8:
2600 		case FORMAT_D24FS8:
2601 		case FORMAT_D32F:
2602 		case FORMAT_D32FS8:
2603 		case FORMAT_D32F_COMPLEMENTARY:
2604 		case FORMAT_D32FS8_COMPLEMENTARY:
2605 		case FORMAT_DF24S8:
2606 		case FORMAT_DF16S8:
2607 		case FORMAT_INTZ:
2608 		case FORMAT_S8:
2609 		case FORMAT_A8G8R8B8Q:
2610 		case FORMAT_X8G8R8B8Q:
2611 			return true;
2612 		case FORMAT_D32F_LOCKABLE:
2613 		case FORMAT_D32FS8_TEXTURE:
2614 		case FORMAT_D32F_SHADOW:
2615 		case FORMAT_D32FS8_SHADOW:
2616 		default:
2617 			break;
2618 		}
2619 
2620 		return false;
2621 	}
2622 
isPalette(Format format)2623 	bool Surface::isPalette(Format format)
2624 	{
2625 		switch(format)
2626 		{
2627 		case FORMAT_P8:
2628 		case FORMAT_A8P8:
2629 			return true;
2630 		default:
2631 			return false;
2632 		}
2633 	}
2634 
isFloatFormat(Format format)2635 	bool Surface::isFloatFormat(Format format)
2636 	{
2637 		switch(format)
2638 		{
2639 		case FORMAT_R5G6B5:
2640 		case FORMAT_R8G8B8:
2641 		case FORMAT_B8G8R8:
2642 		case FORMAT_X8R8G8B8:
2643 		case FORMAT_X8B8G8R8I:
2644 		case FORMAT_X8B8G8R8:
2645 		case FORMAT_A8R8G8B8:
2646 		case FORMAT_SRGB8_X8:
2647 		case FORMAT_SRGB8_A8:
2648 		case FORMAT_A8B8G8R8I:
2649 		case FORMAT_R8UI:
2650 		case FORMAT_G8R8UI:
2651 		case FORMAT_X8B8G8R8UI:
2652 		case FORMAT_A8B8G8R8UI:
2653 		case FORMAT_A8B8G8R8:
2654 		case FORMAT_G8R8I:
2655 		case FORMAT_G8R8:
2656 		case FORMAT_A2B10G10R10:
2657 		case FORMAT_A2B10G10R10UI:
2658 		case FORMAT_R8_SNORM:
2659 		case FORMAT_G8R8_SNORM:
2660 		case FORMAT_X8B8G8R8_SNORM:
2661 		case FORMAT_A8B8G8R8_SNORM:
2662 		case FORMAT_R16I:
2663 		case FORMAT_R16UI:
2664 		case FORMAT_G16R16I:
2665 		case FORMAT_G16R16UI:
2666 		case FORMAT_G16R16:
2667 		case FORMAT_X16B16G16R16I:
2668 		case FORMAT_X16B16G16R16UI:
2669 		case FORMAT_A16B16G16R16I:
2670 		case FORMAT_A16B16G16R16UI:
2671 		case FORMAT_A16B16G16R16:
2672 		case FORMAT_V8U8:
2673 		case FORMAT_Q8W8V8U8:
2674 		case FORMAT_X8L8V8U8:
2675 		case FORMAT_V16U16:
2676 		case FORMAT_A16W16V16U16:
2677 		case FORMAT_Q16W16V16U16:
2678 		case FORMAT_A8:
2679 		case FORMAT_R8I:
2680 		case FORMAT_R8:
2681 		case FORMAT_S8:
2682 		case FORMAT_L8:
2683 		case FORMAT_L16:
2684 		case FORMAT_A8L8:
2685 		case FORMAT_YV12_BT601:
2686 		case FORMAT_YV12_BT709:
2687 		case FORMAT_YV12_JFIF:
2688 		case FORMAT_R32I:
2689 		case FORMAT_R32UI:
2690 		case FORMAT_G32R32I:
2691 		case FORMAT_G32R32UI:
2692 		case FORMAT_X32B32G32R32I:
2693 		case FORMAT_X32B32G32R32UI:
2694 		case FORMAT_A32B32G32R32I:
2695 		case FORMAT_A32B32G32R32UI:
2696 			return false;
2697 		case FORMAT_R16F:
2698 		case FORMAT_G16R16F:
2699 		case FORMAT_B16G16R16F:
2700 		case FORMAT_X16B16G16R16F:
2701 		case FORMAT_A16B16G16R16F:
2702 		case FORMAT_X16B16G16R16F_UNSIGNED:
2703 		case FORMAT_R32F:
2704 		case FORMAT_G32R32F:
2705 		case FORMAT_B32G32R32F:
2706 		case FORMAT_X32B32G32R32F:
2707 		case FORMAT_A32B32G32R32F:
2708 		case FORMAT_X32B32G32R32F_UNSIGNED:
2709 		case FORMAT_D32F:
2710 		case FORMAT_D32FS8:
2711 		case FORMAT_D32F_COMPLEMENTARY:
2712 		case FORMAT_D32FS8_COMPLEMENTARY:
2713 		case FORMAT_D32F_LOCKABLE:
2714 		case FORMAT_D32FS8_TEXTURE:
2715 		case FORMAT_D32F_SHADOW:
2716 		case FORMAT_D32FS8_SHADOW:
2717 		case FORMAT_L16F:
2718 		case FORMAT_A16L16F:
2719 		case FORMAT_L32F:
2720 		case FORMAT_A32L32F:
2721 			return true;
2722 		default:
2723 			ASSERT(false);
2724 		}
2725 
2726 		return false;
2727 	}
2728 
isUnsignedComponent(Format format,int component)2729 	bool Surface::isUnsignedComponent(Format format, int component)
2730 	{
2731 		switch(format)
2732 		{
2733 		case FORMAT_NULL:
2734 		case FORMAT_R5G6B5:
2735 		case FORMAT_R8G8B8:
2736 		case FORMAT_B8G8R8:
2737 		case FORMAT_X8R8G8B8:
2738 		case FORMAT_X8B8G8R8:
2739 		case FORMAT_A8R8G8B8:
2740 		case FORMAT_A8B8G8R8:
2741 		case FORMAT_SRGB8_X8:
2742 		case FORMAT_SRGB8_A8:
2743 		case FORMAT_G8R8:
2744 		case FORMAT_A2B10G10R10:
2745 		case FORMAT_A2B10G10R10UI:
2746 		case FORMAT_R16UI:
2747 		case FORMAT_G16R16:
2748 		case FORMAT_G16R16UI:
2749 		case FORMAT_X16B16G16R16UI:
2750 		case FORMAT_A16B16G16R16:
2751 		case FORMAT_A16B16G16R16UI:
2752 		case FORMAT_R32UI:
2753 		case FORMAT_G32R32UI:
2754 		case FORMAT_X32B32G32R32UI:
2755 		case FORMAT_A32B32G32R32UI:
2756 		case FORMAT_X32B32G32R32F_UNSIGNED:
2757 		case FORMAT_R8UI:
2758 		case FORMAT_G8R8UI:
2759 		case FORMAT_X8B8G8R8UI:
2760 		case FORMAT_A8B8G8R8UI:
2761 		case FORMAT_D32F:
2762 		case FORMAT_D32FS8:
2763 		case FORMAT_D32F_COMPLEMENTARY:
2764 		case FORMAT_D32FS8_COMPLEMENTARY:
2765 		case FORMAT_D32F_LOCKABLE:
2766 		case FORMAT_D32FS8_TEXTURE:
2767 		case FORMAT_D32F_SHADOW:
2768 		case FORMAT_D32FS8_SHADOW:
2769 		case FORMAT_A8:
2770 		case FORMAT_R8:
2771 		case FORMAT_L8:
2772 		case FORMAT_L16:
2773 		case FORMAT_A8L8:
2774 		case FORMAT_YV12_BT601:
2775 		case FORMAT_YV12_BT709:
2776 		case FORMAT_YV12_JFIF:
2777 			return true;
2778 		case FORMAT_A8B8G8R8I:
2779 		case FORMAT_A16B16G16R16I:
2780 		case FORMAT_A32B32G32R32I:
2781 		case FORMAT_A8B8G8R8_SNORM:
2782 		case FORMAT_Q8W8V8U8:
2783 		case FORMAT_Q16W16V16U16:
2784 		case FORMAT_A32B32G32R32F:
2785 			return false;
2786 		case FORMAT_R32F:
2787 		case FORMAT_R8I:
2788 		case FORMAT_R16I:
2789 		case FORMAT_R32I:
2790 		case FORMAT_R8_SNORM:
2791 			return component >= 1;
2792 		case FORMAT_V8U8:
2793 		case FORMAT_X8L8V8U8:
2794 		case FORMAT_V16U16:
2795 		case FORMAT_G32R32F:
2796 		case FORMAT_G8R8I:
2797 		case FORMAT_G16R16I:
2798 		case FORMAT_G32R32I:
2799 		case FORMAT_G8R8_SNORM:
2800 			return component >= 2;
2801 		case FORMAT_A16W16V16U16:
2802 		case FORMAT_B32G32R32F:
2803 		case FORMAT_X32B32G32R32F:
2804 		case FORMAT_X8B8G8R8I:
2805 		case FORMAT_X16B16G16R16I:
2806 		case FORMAT_X32B32G32R32I:
2807 		case FORMAT_X8B8G8R8_SNORM:
2808 			return component >= 3;
2809 		default:
2810 			ASSERT(false);
2811 		}
2812 
2813 		return false;
2814 	}
2815 
isSRGBreadable(Format format)2816 	bool Surface::isSRGBreadable(Format format)
2817 	{
2818 		// Keep in sync with Capabilities::isSRGBreadable
2819 		switch(format)
2820 		{
2821 		case FORMAT_L8:
2822 		case FORMAT_A8L8:
2823 		case FORMAT_R8G8B8:
2824 		case FORMAT_A8R8G8B8:
2825 		case FORMAT_X8R8G8B8:
2826 		case FORMAT_A8B8G8R8:
2827 		case FORMAT_X8B8G8R8:
2828 		case FORMAT_SRGB8_X8:
2829 		case FORMAT_SRGB8_A8:
2830 		case FORMAT_R5G6B5:
2831 		case FORMAT_X1R5G5B5:
2832 		case FORMAT_A1R5G5B5:
2833 		case FORMAT_A4R4G4B4:
2834 		case FORMAT_DXT1:
2835 		case FORMAT_DXT3:
2836 		case FORMAT_DXT5:
2837 		case FORMAT_ATI1:
2838 		case FORMAT_ATI2:
2839 			return true;
2840 		default:
2841 			return false;
2842 		}
2843 	}
2844 
isSRGBwritable(Format format)2845 	bool Surface::isSRGBwritable(Format format)
2846 	{
2847 		// Keep in sync with Capabilities::isSRGBwritable
2848 		switch(format)
2849 		{
2850 		case FORMAT_NULL:
2851 		case FORMAT_A8R8G8B8:
2852 		case FORMAT_X8R8G8B8:
2853 		case FORMAT_A8B8G8R8:
2854 		case FORMAT_X8B8G8R8:
2855 		case FORMAT_SRGB8_X8:
2856 		case FORMAT_SRGB8_A8:
2857 		case FORMAT_R5G6B5:
2858 			return true;
2859 		default:
2860 			return false;
2861 		}
2862 	}
2863 
isSRGBformat(Format format)2864 	bool Surface::isSRGBformat(Format format)
2865 	{
2866 		switch(format)
2867 		{
2868 		case FORMAT_SRGB8_X8:
2869 		case FORMAT_SRGB8_A8:
2870 			return true;
2871 		default:
2872 			return false;
2873 		}
2874 	}
2875 
isCompressed(Format format)2876 	bool Surface::isCompressed(Format format)
2877 	{
2878 		switch(format)
2879 		{
2880 		case FORMAT_DXT1:
2881 		case FORMAT_DXT3:
2882 		case FORMAT_DXT5:
2883 		case FORMAT_ATI1:
2884 		case FORMAT_ATI2:
2885 		case FORMAT_ETC1:
2886 		case FORMAT_R11_EAC:
2887 		case FORMAT_SIGNED_R11_EAC:
2888 		case FORMAT_RG11_EAC:
2889 		case FORMAT_SIGNED_RG11_EAC:
2890 		case FORMAT_RGB8_ETC2:
2891 		case FORMAT_SRGB8_ETC2:
2892 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
2893 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
2894 		case FORMAT_RGBA8_ETC2_EAC:
2895 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
2896 			return true;
2897 		default:
2898 			return false;
2899 		}
2900 	}
2901 
isSignedNonNormalizedInteger(Format format)2902 	bool Surface::isSignedNonNormalizedInteger(Format format)
2903 	{
2904 		switch(format)
2905 		{
2906 		case FORMAT_A8B8G8R8I:
2907 		case FORMAT_X8B8G8R8I:
2908 		case FORMAT_G8R8I:
2909 		case FORMAT_R8I:
2910 		case FORMAT_A16B16G16R16I:
2911 		case FORMAT_X16B16G16R16I:
2912 		case FORMAT_G16R16I:
2913 		case FORMAT_R16I:
2914 		case FORMAT_A32B32G32R32I:
2915 		case FORMAT_X32B32G32R32I:
2916 		case FORMAT_G32R32I:
2917 		case FORMAT_R32I:
2918 			return true;
2919 		default:
2920 			return false;
2921 		}
2922 	}
2923 
isUnsignedNonNormalizedInteger(Format format)2924 	bool Surface::isUnsignedNonNormalizedInteger(Format format)
2925 	{
2926 		switch(format)
2927 		{
2928 		case FORMAT_A8B8G8R8UI:
2929 		case FORMAT_X8B8G8R8UI:
2930 		case FORMAT_G8R8UI:
2931 		case FORMAT_R8UI:
2932 		case FORMAT_A16B16G16R16UI:
2933 		case FORMAT_X16B16G16R16UI:
2934 		case FORMAT_G16R16UI:
2935 		case FORMAT_R16UI:
2936 		case FORMAT_A32B32G32R32UI:
2937 		case FORMAT_X32B32G32R32UI:
2938 		case FORMAT_G32R32UI:
2939 		case FORMAT_R32UI:
2940 			return true;
2941 		default:
2942 			return false;
2943 		}
2944 	}
2945 
isNonNormalizedInteger(Format format)2946 	bool Surface::isNonNormalizedInteger(Format format)
2947 	{
2948 		return isSignedNonNormalizedInteger(format) ||
2949 		       isUnsignedNonNormalizedInteger(format);
2950 	}
2951 
isNormalizedInteger(Format format)2952 	bool Surface::isNormalizedInteger(Format format)
2953 	{
2954 		return !isFloatFormat(format) &&
2955 		       !isNonNormalizedInteger(format) &&
2956 		       !isCompressed(format) &&
2957 		       !isDepth(format) &&
2958 		       !isStencil(format);
2959 	}
2960 
componentCount(Format format)2961 	int Surface::componentCount(Format format)
2962 	{
2963 		switch(format)
2964 		{
2965 		case FORMAT_R5G6B5:         return 3;
2966 		case FORMAT_X8R8G8B8:       return 3;
2967 		case FORMAT_X8B8G8R8I:      return 3;
2968 		case FORMAT_X8B8G8R8:       return 3;
2969 		case FORMAT_A8R8G8B8:       return 4;
2970 		case FORMAT_SRGB8_X8:       return 3;
2971 		case FORMAT_SRGB8_A8:       return 4;
2972 		case FORMAT_A8B8G8R8I:      return 4;
2973 		case FORMAT_A8B8G8R8:       return 4;
2974 		case FORMAT_G8R8I:          return 2;
2975 		case FORMAT_G8R8:           return 2;
2976 		case FORMAT_R8_SNORM:      return 1;
2977 		case FORMAT_G8R8_SNORM:    return 2;
2978 		case FORMAT_X8B8G8R8_SNORM:return 3;
2979 		case FORMAT_A8B8G8R8_SNORM:return 4;
2980 		case FORMAT_R8UI:           return 1;
2981 		case FORMAT_G8R8UI:         return 2;
2982 		case FORMAT_X8B8G8R8UI:     return 3;
2983 		case FORMAT_A8B8G8R8UI:     return 4;
2984 		case FORMAT_A2B10G10R10:    return 4;
2985 		case FORMAT_A2B10G10R10UI:  return 4;
2986 		case FORMAT_G16R16I:        return 2;
2987 		case FORMAT_G16R16UI:       return 2;
2988 		case FORMAT_G16R16:         return 2;
2989 		case FORMAT_G32R32I:        return 2;
2990 		case FORMAT_G32R32UI:       return 2;
2991 		case FORMAT_X16B16G16R16I:  return 3;
2992 		case FORMAT_X16B16G16R16UI: return 3;
2993 		case FORMAT_A16B16G16R16I:  return 4;
2994 		case FORMAT_A16B16G16R16UI: return 4;
2995 		case FORMAT_A16B16G16R16:   return 4;
2996 		case FORMAT_X32B32G32R32I:  return 3;
2997 		case FORMAT_X32B32G32R32UI: return 3;
2998 		case FORMAT_A32B32G32R32I:  return 4;
2999 		case FORMAT_A32B32G32R32UI: return 4;
3000 		case FORMAT_V8U8:           return 2;
3001 		case FORMAT_Q8W8V8U8:       return 4;
3002 		case FORMAT_X8L8V8U8:       return 3;
3003 		case FORMAT_V16U16:         return 2;
3004 		case FORMAT_A16W16V16U16:   return 4;
3005 		case FORMAT_Q16W16V16U16:   return 4;
3006 		case FORMAT_R32F:           return 1;
3007 		case FORMAT_G32R32F:        return 2;
3008 		case FORMAT_X32B32G32R32F:  return 3;
3009 		case FORMAT_A32B32G32R32F:  return 4;
3010 		case FORMAT_X32B32G32R32F_UNSIGNED: return 3;
3011 		case FORMAT_D32F:           return 1;
3012 		case FORMAT_D32FS8:         return 1;
3013 		case FORMAT_D32F_LOCKABLE:  return 1;
3014 		case FORMAT_D32FS8_TEXTURE: return 1;
3015 		case FORMAT_D32F_SHADOW:    return 1;
3016 		case FORMAT_D32FS8_SHADOW:  return 1;
3017 		case FORMAT_A8:             return 1;
3018 		case FORMAT_R8I:            return 1;
3019 		case FORMAT_R8:             return 1;
3020 		case FORMAT_R16I:           return 1;
3021 		case FORMAT_R16UI:          return 1;
3022 		case FORMAT_R32I:           return 1;
3023 		case FORMAT_R32UI:          return 1;
3024 		case FORMAT_L8:             return 1;
3025 		case FORMAT_L16:            return 1;
3026 		case FORMAT_A8L8:           return 2;
3027 		case FORMAT_YV12_BT601:     return 3;
3028 		case FORMAT_YV12_BT709:     return 3;
3029 		case FORMAT_YV12_JFIF:      return 3;
3030 		default:
3031 			ASSERT(false);
3032 		}
3033 
3034 		return 1;
3035 	}
3036 
allocateBuffer(int width,int height,int depth,int border,int samples,Format format)3037 	void *Surface::allocateBuffer(int width, int height, int depth, int border, int samples, Format format)
3038 	{
3039 		return allocate(size(width, height, depth, border, samples, format));
3040 	}
3041 
memfill4(void * buffer,int pattern,int bytes)3042 	void Surface::memfill4(void *buffer, int pattern, int bytes)
3043 	{
3044 		while((size_t)buffer & 0x1 && bytes >= 1)
3045 		{
3046 			*(char*)buffer = (char)pattern;
3047 			(char*&)buffer += 1;
3048 			bytes -= 1;
3049 		}
3050 
3051 		while((size_t)buffer & 0x3 && bytes >= 2)
3052 		{
3053 			*(short*)buffer = (short)pattern;
3054 			(short*&)buffer += 1;
3055 			bytes -= 2;
3056 		}
3057 
3058 		#if defined(__i386__) || defined(__x86_64__)
3059 			if(CPUID::supportsSSE())
3060 			{
3061 				while((size_t)buffer & 0xF && bytes >= 4)
3062 				{
3063 					*(int*)buffer = pattern;
3064 					(int*&)buffer += 1;
3065 					bytes -= 4;
3066 				}
3067 
3068 				__m128 quad = _mm_set_ps1((float&)pattern);
3069 
3070 				float *pointer = (float*)buffer;
3071 				int qxwords = bytes / 64;
3072 				bytes -= qxwords * 64;
3073 
3074 				while(qxwords--)
3075 				{
3076 					_mm_stream_ps(pointer + 0, quad);
3077 					_mm_stream_ps(pointer + 4, quad);
3078 					_mm_stream_ps(pointer + 8, quad);
3079 					_mm_stream_ps(pointer + 12, quad);
3080 
3081 					pointer += 16;
3082 				}
3083 
3084 				buffer = pointer;
3085 			}
3086 		#endif
3087 
3088 		while(bytes >= 4)
3089 		{
3090 			*(int*)buffer = (int)pattern;
3091 			(int*&)buffer += 1;
3092 			bytes -= 4;
3093 		}
3094 
3095 		while(bytes >= 2)
3096 		{
3097 			*(short*)buffer = (short)pattern;
3098 			(short*&)buffer += 1;
3099 			bytes -= 2;
3100 		}
3101 
3102 		while(bytes >= 1)
3103 		{
3104 			*(char*)buffer = (char)pattern;
3105 			(char*&)buffer += 1;
3106 			bytes -= 1;
3107 		}
3108 	}
3109 
sync()3110 	void Surface::sync()
3111 	{
3112 		resource->lock(EXCLUSIVE);
3113 		resource->unlock();
3114 	}
3115 
isEntire(const Rect & rect) const3116 	bool Surface::isEntire(const Rect& rect) const
3117 	{
3118 		return (rect.x0 == 0 && rect.y0 == 0 && rect.x1 == internal.width && rect.y1 == internal.height && internal.depth == 1);
3119 	}
3120 
getRect() const3121 	Rect Surface::getRect() const
3122 	{
3123 		return Rect(0, 0, internal.width, internal.height);
3124 	}
3125 
clearDepth(float depth,int x0,int y0,int width,int height)3126 	void Surface::clearDepth(float depth, int x0, int y0, int width, int height)
3127 	{
3128 		if(width == 0 || height == 0)
3129 		{
3130 			return;
3131 		}
3132 
3133 		if(internal.format == FORMAT_NULL)
3134 		{
3135 			return;
3136 		}
3137 
3138 		// Not overlapping
3139 		if(x0 > internal.width) return;
3140 		if(y0 > internal.height) return;
3141 		if(x0 + width < 0) return;
3142 		if(y0 + height < 0) return;
3143 
3144 		// Clip against dimensions
3145 		if(x0 < 0) {width += x0; x0 = 0;}
3146 		if(x0 + width > internal.width) width = internal.width - x0;
3147 		if(y0 < 0) {height += y0; y0 = 0;}
3148 		if(y0 + height > internal.height) height = internal.height - y0;
3149 
3150 		const bool entire = x0 == 0 && y0 == 0 && width == internal.width && height == internal.height;
3151 		const Lock lock = entire ? LOCK_DISCARD : LOCK_WRITEONLY;
3152 
3153 		int x1 = x0 + width;
3154 		int y1 = y0 + height;
3155 
3156 		if(!hasQuadLayout(internal.format))
3157 		{
3158 			float *target = (float*)lockInternal(x0, y0, 0, lock, PUBLIC);
3159 
3160 			for(int z = 0; z < internal.samples; z++)
3161 			{
3162 				float *row = target;
3163 				for(int y = y0; y < y1; y++)
3164 				{
3165 					memfill4(row, (int&)depth, width * sizeof(float));
3166 					row += internal.pitchP;
3167 				}
3168 				target += internal.sliceP;
3169 			}
3170 
3171 			unlockInternal();
3172 		}
3173 		else   // Quad layout
3174 		{
3175 			if(complementaryDepthBuffer)
3176 			{
3177 				depth = 1 - depth;
3178 			}
3179 
3180 			float *buffer = (float*)lockInternal(0, 0, 0, lock, PUBLIC);
3181 
3182 			int oddX0 = (x0 & ~1) * 2 + (x0 & 1);
3183 			int oddX1 = (x1 & ~1) * 2;
3184 			int evenX0 = ((x0 + 1) & ~1) * 2;
3185 			int evenBytes = (oddX1 - evenX0) * sizeof(float);
3186 
3187 			for(int z = 0; z < internal.samples; z++)
3188 			{
3189 				for(int y = y0; y < y1; y++)
3190 				{
3191 					float *target = buffer + (y & ~1) * internal.pitchP + (y & 1) * 2;
3192 
3193 					if((y & 1) == 0 && y + 1 < y1)   // Fill quad line at once
3194 					{
3195 						if((x0 & 1) != 0)
3196 						{
3197 							target[oddX0 + 0] = depth;
3198 							target[oddX0 + 2] = depth;
3199 						}
3200 
3201 					//	for(int x2 = evenX0; x2 < x1 * 2; x2 += 4)
3202 					//	{
3203 					//		target[x2 + 0] = depth;
3204 					//		target[x2 + 1] = depth;
3205 					//		target[x2 + 2] = depth;
3206 					//		target[x2 + 3] = depth;
3207 					//	}
3208 
3209 					//	__asm
3210 					//	{
3211 					//		movss xmm0, depth
3212 					//		shufps xmm0, xmm0, 0x00
3213 					//
3214 					//		mov eax, x0
3215 					//		add eax, 1
3216 					//		and eax, 0xFFFFFFFE
3217 					//		cmp eax, x1
3218 					//		jge qEnd
3219 					//
3220 					//		mov edi, target
3221 					//
3222 					//	qLoop:
3223 					//		movntps [edi+8*eax], xmm0
3224 					//
3225 					//		add eax, 2
3226 					//		cmp eax, x1
3227 					//		jl qLoop
3228 					//	qEnd:
3229 					//	}
3230 
3231 						memfill4(&target[evenX0], (int&)depth, evenBytes);
3232 
3233 						if((x1 & 1) != 0)
3234 						{
3235 							target[oddX1 + 0] = depth;
3236 							target[oddX1 + 2] = depth;
3237 						}
3238 
3239 						y++;
3240 					}
3241 					else
3242 					{
3243 						for(int x = x0, i = oddX0; x < x1; x++, i = (x & ~1) * 2 + (x & 1))
3244 						{
3245 							target[i] = depth;
3246 						}
3247 					}
3248 				}
3249 
3250 				buffer += internal.sliceP;
3251 			}
3252 
3253 			unlockInternal();
3254 		}
3255 	}
3256 
clearStencil(unsigned char s,unsigned char mask,int x0,int y0,int width,int height)3257 	void Surface::clearStencil(unsigned char s, unsigned char mask, int x0, int y0, int width, int height)
3258 	{
3259 		if(mask == 0 || width == 0 || height == 0)
3260 		{
3261 			return;
3262 		}
3263 
3264 		if(stencil.format == FORMAT_NULL)
3265 		{
3266 			return;
3267 		}
3268 
3269 		// Not overlapping
3270 		if(x0 > internal.width) return;
3271 		if(y0 > internal.height) return;
3272 		if(x0 + width < 0) return;
3273 		if(y0 + height < 0) return;
3274 
3275 		// Clip against dimensions
3276 		if(x0 < 0) {width += x0; x0 = 0;}
3277 		if(x0 + width > internal.width) width = internal.width - x0;
3278 		if(y0 < 0) {height += y0; y0 = 0;}
3279 		if(y0 + height > internal.height) height = internal.height - y0;
3280 
3281 		int x1 = x0 + width;
3282 		int y1 = y0 + height;
3283 
3284 		int oddX0 = (x0 & ~1) * 2 + (x0 & 1);
3285 		int oddX1 = (x1 & ~1) * 2;
3286 		int evenX0 = ((x0 + 1) & ~1) * 2;
3287 		int evenBytes = oddX1 - evenX0;
3288 
3289 		unsigned char maskedS = s & mask;
3290 		unsigned char invMask = ~mask;
3291 		unsigned int fill = maskedS;
3292 		fill = fill | (fill << 8) | (fill << 16) | (fill << 24);
3293 
3294 		char *buffer = (char*)lockStencil(0, 0, 0, PUBLIC);
3295 
3296 		// Stencil buffers are assumed to use quad layout
3297 		for(int z = 0; z < stencil.samples; z++)
3298 		{
3299 			for(int y = y0; y < y1; y++)
3300 			{
3301 				char *target = buffer + (y & ~1) * stencil.pitchP + (y & 1) * 2;
3302 
3303 				if((y & 1) == 0 && y + 1 < y1 && mask == 0xFF)   // Fill quad line at once
3304 				{
3305 					if((x0 & 1) != 0)
3306 					{
3307 						target[oddX0 + 0] = fill;
3308 						target[oddX0 + 2] = fill;
3309 					}
3310 
3311 					memfill4(&target[evenX0], fill, evenBytes);
3312 
3313 					if((x1 & 1) != 0)
3314 					{
3315 						target[oddX1 + 0] = fill;
3316 						target[oddX1 + 2] = fill;
3317 					}
3318 
3319 					y++;
3320 				}
3321 				else
3322 				{
3323 					for(int x = x0; x < x1; x++)
3324 					{
3325 						int i = (x & ~1) * 2 + (x & 1);
3326 						target[i] = maskedS | (target[i] & invMask);
3327 					}
3328 				}
3329 			}
3330 
3331 			buffer += stencil.sliceP;
3332 		}
3333 
3334 		unlockStencil();
3335 	}
3336 
fill(const Color<float> & color,int x0,int y0,int width,int height)3337 	void Surface::fill(const Color<float> &color, int x0, int y0, int width, int height)
3338 	{
3339 		unsigned char *row;
3340 		Buffer *buffer;
3341 
3342 		if(internal.dirty)
3343 		{
3344 			row = (unsigned char*)lockInternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
3345 			buffer = &internal;
3346 		}
3347 		else
3348 		{
3349 			row = (unsigned char*)lockExternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
3350 			buffer = &external;
3351 		}
3352 
3353 		if(buffer->bytes <= 4)
3354 		{
3355 			int c;
3356 			buffer->write(&c, color);
3357 
3358 			if(buffer->bytes <= 1) c = (c << 8)  | c;
3359 			if(buffer->bytes <= 2) c = (c << 16) | c;
3360 
3361 			for(int y = 0; y < height; y++)
3362 			{
3363 				memfill4(row, c, width * buffer->bytes);
3364 
3365 				row += buffer->pitchB;
3366 			}
3367 		}
3368 		else   // Generic
3369 		{
3370 			for(int y = 0; y < height; y++)
3371 			{
3372 				unsigned char *element = row;
3373 
3374 				for(int x = 0; x < width; x++)
3375 				{
3376 					buffer->write(element, color);
3377 
3378 					element += buffer->bytes;
3379 				}
3380 
3381 				row += buffer->pitchB;
3382 			}
3383 		}
3384 
3385 		if(buffer == &internal)
3386 		{
3387 			unlockInternal();
3388 		}
3389 		else
3390 		{
3391 			unlockExternal();
3392 		}
3393 	}
3394 
copyInternal(const Surface * source,int x,int y,float srcX,float srcY,bool filter)3395 	void Surface::copyInternal(const Surface *source, int x, int y, float srcX, float srcY, bool filter)
3396 	{
3397 		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
3398 
3399 		sw::Color<float> color;
3400 
3401 		if(!filter)
3402 		{
3403 			color = source->internal.read((int)srcX, (int)srcY, 0);
3404 		}
3405 		else   // Bilinear filtering
3406 		{
3407 			color = source->internal.sample(srcX, srcY, 0);
3408 		}
3409 
3410 		internal.write(x, y, color);
3411 	}
3412 
copyInternal(const Surface * source,int x,int y,int z,float srcX,float srcY,float srcZ,bool filter)3413 	void Surface::copyInternal(const Surface *source, int x, int y, int z, float srcX, float srcY, float srcZ, bool filter)
3414 	{
3415 		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
3416 
3417 		sw::Color<float> color;
3418 
3419 		if(!filter)
3420 		{
3421 			color = source->internal.read((int)srcX, (int)srcY, int(srcZ));
3422 		}
3423 		else   // Bilinear filtering
3424 		{
3425 			color = source->internal.sample(srcX, srcY, srcZ);
3426 		}
3427 
3428 		internal.write(x, y, z, color);
3429 	}
3430 
copyCubeEdge(Edge dstEdge,Surface * src,Edge srcEdge)3431 	void Surface::copyCubeEdge(Edge dstEdge, Surface *src, Edge srcEdge)
3432 	{
3433 		Surface *dst = this;
3434 
3435 		// Figure out if the edges to be copied in reverse order respectively from one another
3436 		// The copy should be reversed whenever the same edges are contiguous or if we're
3437 		// copying top <-> right or bottom <-> left. This is explained by the layout, which is:
3438 		//
3439 		//      | +y |
3440 		// | -x | +z | +x | -z |
3441 		//      | -y |
3442 
3443 		bool reverse = (srcEdge == dstEdge) ||
3444 		               ((srcEdge == TOP) && (dstEdge == RIGHT)) ||
3445 		               ((srcEdge == RIGHT) && (dstEdge == TOP)) ||
3446 		               ((srcEdge == BOTTOM) && (dstEdge == LEFT)) ||
3447 		               ((srcEdge == LEFT) && (dstEdge == BOTTOM));
3448 
3449 		int srcBytes = src->bytes(src->Surface::getInternalFormat());
3450 		int srcPitch = src->getInternalPitchB();
3451 		int dstBytes = dst->bytes(dst->Surface::getInternalFormat());
3452 		int dstPitch = dst->getInternalPitchB();
3453 
3454 		int srcW = src->getWidth();
3455 		int srcH = src->getHeight();
3456 		int dstW = dst->getWidth();
3457 		int dstH = dst->getHeight();
3458 
3459 		ASSERT(srcW == srcH && dstW == dstH && srcW == dstW && srcBytes == dstBytes);
3460 
3461 		// Src is expressed in the regular [0, width-1], [0, height-1] space
3462 		int srcDelta = ((srcEdge == TOP) || (srcEdge == BOTTOM)) ? srcBytes : srcPitch;
3463 		int srcStart = ((srcEdge == BOTTOM) ? srcPitch * (srcH - 1) : ((srcEdge == RIGHT) ? srcBytes * (srcW - 1) : 0));
3464 
3465 		// Dst contains borders, so it is expressed in the [-1, width+1], [-1, height+1] space
3466 		int dstDelta = (((dstEdge == TOP) || (dstEdge == BOTTOM)) ? dstBytes : dstPitch) * (reverse ? -1 : 1);
3467 		int dstStart = ((dstEdge == BOTTOM) ? dstPitch * (dstH + 1) : ((dstEdge == RIGHT) ? dstBytes * (dstW + 1) : 0)) + (reverse ? dstW * -dstDelta : dstDelta);
3468 
3469 		char *srcBuf = (char*)src->lockInternal(0, 0, 0, sw::LOCK_READONLY, sw::PRIVATE) + srcStart;
3470 		char *dstBuf = (char*)dst->lockInternal(-1, -1, 0, sw::LOCK_READWRITE, sw::PRIVATE) + dstStart;
3471 
3472 		for(int i = 0; i < srcW; ++i, dstBuf += dstDelta, srcBuf += srcDelta)
3473 		{
3474 			memcpy(dstBuf, srcBuf, srcBytes);
3475 		}
3476 
3477 		if(dstEdge == LEFT || dstEdge == RIGHT)
3478 		{
3479 			// TOP and BOTTOM are already set, let's average out the corners
3480 			int x0 = (dstEdge == RIGHT) ? dstW : -1;
3481 			int y0 = -1;
3482 			int x1 = (dstEdge == RIGHT) ? dstW - 1 : 0;
3483 			int y1 = 0;
3484 			dst->computeCubeCorner(x0, y0, x1, y1);
3485 			y0 = dstH;
3486 			y1 = dstH - 1;
3487 			dst->computeCubeCorner(x0, y0, x1, y1);
3488 		}
3489 
3490 		src->unlockInternal();
3491 		dst->unlockInternal();
3492 	}
3493 
computeCubeCorner(int x0,int y0,int x1,int y1)3494 	void Surface::computeCubeCorner(int x0, int y0, int x1, int y1)
3495 	{
3496 		ASSERT(internal.lock != LOCK_UNLOCKED);
3497 
3498 		sw::Color<float> color = internal.read(x0, y1);
3499 		color += internal.read(x1, y0);
3500 		color += internal.read(x1, y1);
3501 		color *= (1.0f / 3.0f);
3502 
3503 		internal.write(x0, y0, color);
3504 	}
3505 
hasStencil() const3506 	bool Surface::hasStencil() const
3507 	{
3508 		return isStencil(external.format);
3509 	}
3510 
hasDepth() const3511 	bool Surface::hasDepth() const
3512 	{
3513 		return isDepth(external.format);
3514 	}
3515 
hasPalette() const3516 	bool Surface::hasPalette() const
3517 	{
3518 		return isPalette(external.format);
3519 	}
3520 
isRenderTarget() const3521 	bool Surface::isRenderTarget() const
3522 	{
3523 		return renderTarget;
3524 	}
3525 
hasDirtyContents() const3526 	bool Surface::hasDirtyContents() const
3527 	{
3528 		return dirtyContents;
3529 	}
3530 
markContentsClean()3531 	void Surface::markContentsClean()
3532 	{
3533 		dirtyContents = false;
3534 	}
3535 
getResource()3536 	Resource *Surface::getResource()
3537 	{
3538 		return resource;
3539 	}
3540 
identicalBuffers() const3541 	bool Surface::identicalBuffers() const
3542 	{
3543 		return external.format == internal.format &&
3544 		       external.width  == internal.width &&
3545 		       external.height == internal.height &&
3546 		       external.depth  == internal.depth &&
3547 		       external.pitchB == internal.pitchB &&
3548 		       external.sliceB == internal.sliceB &&
3549 		       external.border == internal.border &&
3550 		       external.samples == internal.samples;
3551 	}
3552 
selectInternalFormat(Format format) const3553 	Format Surface::selectInternalFormat(Format format) const
3554 	{
3555 		switch(format)
3556 		{
3557 		case FORMAT_NULL:
3558 			return FORMAT_NULL;
3559 		case FORMAT_P8:
3560 		case FORMAT_A8P8:
3561 		case FORMAT_A4R4G4B4:
3562 		case FORMAT_A1R5G5B5:
3563 		case FORMAT_A8R3G3B2:
3564 			return FORMAT_A8R8G8B8;
3565 		case FORMAT_A8:
3566 			return FORMAT_A8;
3567 		case FORMAT_R8I:
3568 			return FORMAT_R8I;
3569 		case FORMAT_R8UI:
3570 			return FORMAT_R8UI;
3571 		case FORMAT_R8_SNORM:
3572 			return FORMAT_R8_SNORM;
3573 		case FORMAT_R8:
3574 			return FORMAT_R8;
3575 		case FORMAT_R16I:
3576 			return FORMAT_R16I;
3577 		case FORMAT_R16UI:
3578 			return FORMAT_R16UI;
3579 		case FORMAT_R32I:
3580 			return FORMAT_R32I;
3581 		case FORMAT_R32UI:
3582 			return FORMAT_R32UI;
3583 		case FORMAT_X16B16G16R16I:
3584 			return FORMAT_X16B16G16R16I;
3585 		case FORMAT_A16B16G16R16I:
3586 			return FORMAT_A16B16G16R16I;
3587 		case FORMAT_X16B16G16R16UI:
3588 			return FORMAT_X16B16G16R16UI;
3589 		case FORMAT_A16B16G16R16UI:
3590 			return FORMAT_A16B16G16R16UI;
3591 		case FORMAT_A2R10G10B10:
3592 		case FORMAT_A2B10G10R10:
3593 		case FORMAT_A16B16G16R16:
3594 			return FORMAT_A16B16G16R16;
3595 		case FORMAT_A2B10G10R10UI:
3596 			return FORMAT_A16B16G16R16UI;
3597 		case FORMAT_X32B32G32R32I:
3598 			return FORMAT_X32B32G32R32I;
3599 		case FORMAT_A32B32G32R32I:
3600 			return FORMAT_A32B32G32R32I;
3601 		case FORMAT_X32B32G32R32UI:
3602 			return FORMAT_X32B32G32R32UI;
3603 		case FORMAT_A32B32G32R32UI:
3604 			return FORMAT_A32B32G32R32UI;
3605 		case FORMAT_G8R8I:
3606 			return FORMAT_G8R8I;
3607 		case FORMAT_G8R8UI:
3608 			return FORMAT_G8R8UI;
3609 		case FORMAT_G8R8_SNORM:
3610 			return FORMAT_G8R8_SNORM;
3611 		case FORMAT_G8R8:
3612 			return FORMAT_G8R8;
3613 		case FORMAT_G16R16I:
3614 			return FORMAT_G16R16I;
3615 		case FORMAT_G16R16UI:
3616 			return FORMAT_G16R16UI;
3617 		case FORMAT_G16R16:
3618 			return FORMAT_G16R16;
3619 		case FORMAT_G32R32I:
3620 			return FORMAT_G32R32I;
3621 		case FORMAT_G32R32UI:
3622 			return FORMAT_G32R32UI;
3623 		case FORMAT_A8R8G8B8:
3624 			if(lockable || !quadLayoutEnabled)
3625 			{
3626 				return FORMAT_A8R8G8B8;
3627 			}
3628 			else
3629 			{
3630 				return FORMAT_A8G8R8B8Q;
3631 			}
3632 		case FORMAT_A8B8G8R8I:
3633 			return FORMAT_A8B8G8R8I;
3634 		case FORMAT_A8B8G8R8UI:
3635 			return FORMAT_A8B8G8R8UI;
3636 		case FORMAT_A8B8G8R8_SNORM:
3637 			return FORMAT_A8B8G8R8_SNORM;
3638 		case FORMAT_R5G5B5A1:
3639 		case FORMAT_R4G4B4A4:
3640 		case FORMAT_A8B8G8R8:
3641 			return FORMAT_A8B8G8R8;
3642 		case FORMAT_R5G6B5:
3643 			return FORMAT_R5G6B5;
3644 		case FORMAT_R3G3B2:
3645 		case FORMAT_R8G8B8:
3646 		case FORMAT_X4R4G4B4:
3647 		case FORMAT_X1R5G5B5:
3648 		case FORMAT_X8R8G8B8:
3649 			if(lockable || !quadLayoutEnabled)
3650 			{
3651 				return FORMAT_X8R8G8B8;
3652 			}
3653 			else
3654 			{
3655 				return FORMAT_X8G8R8B8Q;
3656 			}
3657 		case FORMAT_X8B8G8R8I:
3658 			return FORMAT_X8B8G8R8I;
3659 		case FORMAT_X8B8G8R8UI:
3660 			return FORMAT_X8B8G8R8UI;
3661 		case FORMAT_X8B8G8R8_SNORM:
3662 			return FORMAT_X8B8G8R8_SNORM;
3663 		case FORMAT_B8G8R8:
3664 		case FORMAT_X8B8G8R8:
3665 			return FORMAT_X8B8G8R8;
3666 		case FORMAT_SRGB8_X8:
3667 			return FORMAT_SRGB8_X8;
3668 		case FORMAT_SRGB8_A8:
3669 			return FORMAT_SRGB8_A8;
3670 		// Compressed formats
3671 		case FORMAT_DXT1:
3672 		case FORMAT_DXT3:
3673 		case FORMAT_DXT5:
3674 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
3675 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
3676 		case FORMAT_RGBA8_ETC2_EAC:
3677 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
3678 			return FORMAT_A8R8G8B8;
3679 		case FORMAT_ATI1:
3680 			return FORMAT_R8;
3681 		case FORMAT_R11_EAC:
3682 		case FORMAT_SIGNED_R11_EAC:
3683 			return FORMAT_R32F; // FIXME: Signed 8bit format would be sufficient
3684 		case FORMAT_ATI2:
3685 			return FORMAT_G8R8;
3686 		case FORMAT_RG11_EAC:
3687 		case FORMAT_SIGNED_RG11_EAC:
3688 			return FORMAT_G32R32F; // FIXME: Signed 8bit format would be sufficient
3689 		case FORMAT_ETC1:
3690 		case FORMAT_RGB8_ETC2:
3691 		case FORMAT_SRGB8_ETC2:
3692 			return FORMAT_X8R8G8B8;
3693 		// Bumpmap formats
3694 		case FORMAT_V8U8:			return FORMAT_V8U8;
3695 		case FORMAT_L6V5U5:			return FORMAT_X8L8V8U8;
3696 		case FORMAT_Q8W8V8U8:		return FORMAT_Q8W8V8U8;
3697 		case FORMAT_X8L8V8U8:		return FORMAT_X8L8V8U8;
3698 		case FORMAT_V16U16:			return FORMAT_V16U16;
3699 		case FORMAT_A2W10V10U10:	return FORMAT_A16W16V16U16;
3700 		case FORMAT_Q16W16V16U16:	return FORMAT_Q16W16V16U16;
3701 		// Floating-point formats
3702 		case FORMAT_A16F:			return FORMAT_A32B32G32R32F;
3703 		case FORMAT_R16F:			return FORMAT_R32F;
3704 		case FORMAT_G16R16F:		return FORMAT_G32R32F;
3705 		case FORMAT_B16G16R16F:     return FORMAT_X32B32G32R32F;
3706 		case FORMAT_X16B16G16R16F:	return FORMAT_X32B32G32R32F;
3707 		case FORMAT_A16B16G16R16F:	return FORMAT_A32B32G32R32F;
3708 		case FORMAT_X16B16G16R16F_UNSIGNED: return FORMAT_X32B32G32R32F_UNSIGNED;
3709 		case FORMAT_A32F:			return FORMAT_A32B32G32R32F;
3710 		case FORMAT_R32F:			return FORMAT_R32F;
3711 		case FORMAT_G32R32F:		return FORMAT_G32R32F;
3712 		case FORMAT_B32G32R32F:     return FORMAT_X32B32G32R32F;
3713 		case FORMAT_X32B32G32R32F:  return FORMAT_X32B32G32R32F;
3714 		case FORMAT_A32B32G32R32F:	return FORMAT_A32B32G32R32F;
3715 		case FORMAT_X32B32G32R32F_UNSIGNED: return FORMAT_X32B32G32R32F_UNSIGNED;
3716 		// Luminance formats
3717 		case FORMAT_L8:				return FORMAT_L8;
3718 		case FORMAT_A4L4:			return FORMAT_A8L8;
3719 		case FORMAT_L16:			return FORMAT_L16;
3720 		case FORMAT_A8L8:			return FORMAT_A8L8;
3721 		case FORMAT_L16F:           return FORMAT_X32B32G32R32F;
3722 		case FORMAT_A16L16F:        return FORMAT_A32B32G32R32F;
3723 		case FORMAT_L32F:           return FORMAT_X32B32G32R32F;
3724 		case FORMAT_A32L32F:        return FORMAT_A32B32G32R32F;
3725 		// Depth/stencil formats
3726 		case FORMAT_D16:
3727 		case FORMAT_D32:
3728 		case FORMAT_D24X8:
3729 			if(hasParent)   // Texture
3730 			{
3731 				return FORMAT_D32F_SHADOW;
3732 			}
3733 			else if(complementaryDepthBuffer)
3734 			{
3735 				return FORMAT_D32F_COMPLEMENTARY;
3736 			}
3737 			else
3738 			{
3739 				return FORMAT_D32F;
3740 			}
3741 		case FORMAT_D24S8:
3742 		case FORMAT_D24FS8:
3743 			if(hasParent)   // Texture
3744 			{
3745 				return FORMAT_D32FS8_SHADOW;
3746 			}
3747 			else if(complementaryDepthBuffer)
3748 			{
3749 				return FORMAT_D32FS8_COMPLEMENTARY;
3750 			}
3751 			else
3752 			{
3753 				return FORMAT_D32FS8;
3754 			}
3755 		case FORMAT_D32F:           return FORMAT_D32F;
3756 		case FORMAT_D32FS8:         return FORMAT_D32FS8;
3757 		case FORMAT_D32F_LOCKABLE:  return FORMAT_D32F_LOCKABLE;
3758 		case FORMAT_D32FS8_TEXTURE: return FORMAT_D32FS8_TEXTURE;
3759 		case FORMAT_INTZ:           return FORMAT_D32FS8_TEXTURE;
3760 		case FORMAT_DF24S8:         return FORMAT_D32FS8_SHADOW;
3761 		case FORMAT_DF16S8:         return FORMAT_D32FS8_SHADOW;
3762 		case FORMAT_S8:             return FORMAT_S8;
3763 		// YUV formats
3764 		case FORMAT_YV12_BT601:     return FORMAT_YV12_BT601;
3765 		case FORMAT_YV12_BT709:     return FORMAT_YV12_BT709;
3766 		case FORMAT_YV12_JFIF:      return FORMAT_YV12_JFIF;
3767 		default:
3768 			ASSERT(false);
3769 		}
3770 
3771 		return FORMAT_NULL;
3772 	}
3773 
setTexturePalette(unsigned int * palette)3774 	void Surface::setTexturePalette(unsigned int *palette)
3775 	{
3776 		Surface::palette = palette;
3777 		Surface::paletteID++;
3778 	}
3779 
resolve()3780 	void Surface::resolve()
3781 	{
3782 		if(internal.samples <= 1 || !internal.dirty || !renderTarget || internal.format == FORMAT_NULL)
3783 		{
3784 			return;
3785 		}
3786 
3787 		ASSERT(internal.depth == 1);  // Unimplemented
3788 
3789 		void *source = internal.lockRect(0, 0, 0, LOCK_READWRITE);
3790 
3791 		int width = internal.width;
3792 		int height = internal.height;
3793 		int pitch = internal.pitchB;
3794 		int slice = internal.sliceB;
3795 
3796 		unsigned char *source0 = (unsigned char*)source;
3797 		unsigned char *source1 = source0 + slice;
3798 		unsigned char *source2 = source1 + slice;
3799 		unsigned char *source3 = source2 + slice;
3800 		unsigned char *source4 = source3 + slice;
3801 		unsigned char *source5 = source4 + slice;
3802 		unsigned char *source6 = source5 + slice;
3803 		unsigned char *source7 = source6 + slice;
3804 		unsigned char *source8 = source7 + slice;
3805 		unsigned char *source9 = source8 + slice;
3806 		unsigned char *sourceA = source9 + slice;
3807 		unsigned char *sourceB = sourceA + slice;
3808 		unsigned char *sourceC = sourceB + slice;
3809 		unsigned char *sourceD = sourceC + slice;
3810 		unsigned char *sourceE = sourceD + slice;
3811 		unsigned char *sourceF = sourceE + slice;
3812 
3813 		if(internal.format == FORMAT_X8R8G8B8 || internal.format == FORMAT_A8R8G8B8 ||
3814 		   internal.format == FORMAT_X8B8G8R8 || internal.format == FORMAT_A8B8G8R8 ||
3815 		   internal.format == FORMAT_SRGB8_X8 || internal.format == FORMAT_SRGB8_A8)
3816 		{
3817 			#if defined(__i386__) || defined(__x86_64__)
3818 				if(CPUID::supportsSSE2() && (width % 4) == 0)
3819 				{
3820 					if(internal.samples == 2)
3821 					{
3822 						for(int y = 0; y < height; y++)
3823 						{
3824 							for(int x = 0; x < width; x += 4)
3825 							{
3826 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3827 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3828 
3829 								c0 = _mm_avg_epu8(c0, c1);
3830 
3831 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3832 							}
3833 
3834 							source0 += pitch;
3835 							source1 += pitch;
3836 						}
3837 					}
3838 					else if(internal.samples == 4)
3839 					{
3840 						for(int y = 0; y < height; y++)
3841 						{
3842 							for(int x = 0; x < width; x += 4)
3843 							{
3844 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3845 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3846 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3847 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3848 
3849 								c0 = _mm_avg_epu8(c0, c1);
3850 								c2 = _mm_avg_epu8(c2, c3);
3851 								c0 = _mm_avg_epu8(c0, c2);
3852 
3853 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3854 							}
3855 
3856 							source0 += pitch;
3857 							source1 += pitch;
3858 							source2 += pitch;
3859 							source3 += pitch;
3860 						}
3861 					}
3862 					else if(internal.samples == 8)
3863 					{
3864 						for(int y = 0; y < height; y++)
3865 						{
3866 							for(int x = 0; x < width; x += 4)
3867 							{
3868 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3869 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3870 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3871 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3872 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
3873 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
3874 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
3875 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
3876 
3877 								c0 = _mm_avg_epu8(c0, c1);
3878 								c2 = _mm_avg_epu8(c2, c3);
3879 								c4 = _mm_avg_epu8(c4, c5);
3880 								c6 = _mm_avg_epu8(c6, c7);
3881 								c0 = _mm_avg_epu8(c0, c2);
3882 								c4 = _mm_avg_epu8(c4, c6);
3883 								c0 = _mm_avg_epu8(c0, c4);
3884 
3885 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3886 							}
3887 
3888 							source0 += pitch;
3889 							source1 += pitch;
3890 							source2 += pitch;
3891 							source3 += pitch;
3892 							source4 += pitch;
3893 							source5 += pitch;
3894 							source6 += pitch;
3895 							source7 += pitch;
3896 						}
3897 					}
3898 					else if(internal.samples == 16)
3899 					{
3900 						for(int y = 0; y < height; y++)
3901 						{
3902 							for(int x = 0; x < width; x += 4)
3903 							{
3904 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3905 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3906 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3907 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3908 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
3909 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
3910 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
3911 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
3912 								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
3913 								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
3914 								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
3915 								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
3916 								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
3917 								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
3918 								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
3919 								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
3920 
3921 								c0 = _mm_avg_epu8(c0, c1);
3922 								c2 = _mm_avg_epu8(c2, c3);
3923 								c4 = _mm_avg_epu8(c4, c5);
3924 								c6 = _mm_avg_epu8(c6, c7);
3925 								c8 = _mm_avg_epu8(c8, c9);
3926 								cA = _mm_avg_epu8(cA, cB);
3927 								cC = _mm_avg_epu8(cC, cD);
3928 								cE = _mm_avg_epu8(cE, cF);
3929 								c0 = _mm_avg_epu8(c0, c2);
3930 								c4 = _mm_avg_epu8(c4, c6);
3931 								c8 = _mm_avg_epu8(c8, cA);
3932 								cC = _mm_avg_epu8(cC, cE);
3933 								c0 = _mm_avg_epu8(c0, c4);
3934 								c8 = _mm_avg_epu8(c8, cC);
3935 								c0 = _mm_avg_epu8(c0, c8);
3936 
3937 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3938 							}
3939 
3940 							source0 += pitch;
3941 							source1 += pitch;
3942 							source2 += pitch;
3943 							source3 += pitch;
3944 							source4 += pitch;
3945 							source5 += pitch;
3946 							source6 += pitch;
3947 							source7 += pitch;
3948 							source8 += pitch;
3949 							source9 += pitch;
3950 							sourceA += pitch;
3951 							sourceB += pitch;
3952 							sourceC += pitch;
3953 							sourceD += pitch;
3954 							sourceE += pitch;
3955 							sourceF += pitch;
3956 						}
3957 					}
3958 					else ASSERT(false);
3959 				}
3960 				else
3961 			#endif
3962 			{
3963 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7F7F7F7F) + (((x) ^ (y)) & 0x01010101))
3964 
3965 				if(internal.samples == 2)
3966 				{
3967 					for(int y = 0; y < height; y++)
3968 					{
3969 						for(int x = 0; x < width; x++)
3970 						{
3971 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3972 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3973 
3974 							c0 = AVERAGE(c0, c1);
3975 
3976 							*(unsigned int*)(source0 + 4 * x) = c0;
3977 						}
3978 
3979 						source0 += pitch;
3980 						source1 += pitch;
3981 					}
3982 				}
3983 				else if(internal.samples == 4)
3984 				{
3985 					for(int y = 0; y < height; y++)
3986 					{
3987 						for(int x = 0; x < width; x++)
3988 						{
3989 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3990 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3991 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3992 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3993 
3994 							c0 = AVERAGE(c0, c1);
3995 							c2 = AVERAGE(c2, c3);
3996 							c0 = AVERAGE(c0, c2);
3997 
3998 							*(unsigned int*)(source0 + 4 * x) = c0;
3999 						}
4000 
4001 						source0 += pitch;
4002 						source1 += pitch;
4003 						source2 += pitch;
4004 						source3 += pitch;
4005 					}
4006 				}
4007 				else if(internal.samples == 8)
4008 				{
4009 					for(int y = 0; y < height; y++)
4010 					{
4011 						for(int x = 0; x < width; x++)
4012 						{
4013 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4014 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4015 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4016 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4017 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4018 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4019 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4020 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4021 
4022 							c0 = AVERAGE(c0, c1);
4023 							c2 = AVERAGE(c2, c3);
4024 							c4 = AVERAGE(c4, c5);
4025 							c6 = AVERAGE(c6, c7);
4026 							c0 = AVERAGE(c0, c2);
4027 							c4 = AVERAGE(c4, c6);
4028 							c0 = AVERAGE(c0, c4);
4029 
4030 							*(unsigned int*)(source0 + 4 * x) = c0;
4031 						}
4032 
4033 						source0 += pitch;
4034 						source1 += pitch;
4035 						source2 += pitch;
4036 						source3 += pitch;
4037 						source4 += pitch;
4038 						source5 += pitch;
4039 						source6 += pitch;
4040 						source7 += pitch;
4041 					}
4042 				}
4043 				else if(internal.samples == 16)
4044 				{
4045 					for(int y = 0; y < height; y++)
4046 					{
4047 						for(int x = 0; x < width; x++)
4048 						{
4049 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4050 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4051 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4052 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4053 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4054 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4055 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4056 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4057 							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
4058 							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
4059 							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
4060 							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
4061 							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
4062 							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
4063 							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
4064 							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
4065 
4066 							c0 = AVERAGE(c0, c1);
4067 							c2 = AVERAGE(c2, c3);
4068 							c4 = AVERAGE(c4, c5);
4069 							c6 = AVERAGE(c6, c7);
4070 							c8 = AVERAGE(c8, c9);
4071 							cA = AVERAGE(cA, cB);
4072 							cC = AVERAGE(cC, cD);
4073 							cE = AVERAGE(cE, cF);
4074 							c0 = AVERAGE(c0, c2);
4075 							c4 = AVERAGE(c4, c6);
4076 							c8 = AVERAGE(c8, cA);
4077 							cC = AVERAGE(cC, cE);
4078 							c0 = AVERAGE(c0, c4);
4079 							c8 = AVERAGE(c8, cC);
4080 							c0 = AVERAGE(c0, c8);
4081 
4082 							*(unsigned int*)(source0 + 4 * x) = c0;
4083 						}
4084 
4085 						source0 += pitch;
4086 						source1 += pitch;
4087 						source2 += pitch;
4088 						source3 += pitch;
4089 						source4 += pitch;
4090 						source5 += pitch;
4091 						source6 += pitch;
4092 						source7 += pitch;
4093 						source8 += pitch;
4094 						source9 += pitch;
4095 						sourceA += pitch;
4096 						sourceB += pitch;
4097 						sourceC += pitch;
4098 						sourceD += pitch;
4099 						sourceE += pitch;
4100 						sourceF += pitch;
4101 					}
4102 				}
4103 				else ASSERT(false);
4104 
4105 				#undef AVERAGE
4106 			}
4107 		}
4108 		else if(internal.format == FORMAT_G16R16)
4109 		{
4110 
4111 			#if defined(__i386__) || defined(__x86_64__)
4112 				if(CPUID::supportsSSE2() && (width % 4) == 0)
4113 				{
4114 					if(internal.samples == 2)
4115 					{
4116 						for(int y = 0; y < height; y++)
4117 						{
4118 							for(int x = 0; x < width; x += 4)
4119 							{
4120 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4121 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4122 
4123 								c0 = _mm_avg_epu16(c0, c1);
4124 
4125 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4126 							}
4127 
4128 							source0 += pitch;
4129 							source1 += pitch;
4130 						}
4131 					}
4132 					else if(internal.samples == 4)
4133 					{
4134 						for(int y = 0; y < height; y++)
4135 						{
4136 							for(int x = 0; x < width; x += 4)
4137 							{
4138 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4139 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4140 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4141 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4142 
4143 								c0 = _mm_avg_epu16(c0, c1);
4144 								c2 = _mm_avg_epu16(c2, c3);
4145 								c0 = _mm_avg_epu16(c0, c2);
4146 
4147 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4148 							}
4149 
4150 							source0 += pitch;
4151 							source1 += pitch;
4152 							source2 += pitch;
4153 							source3 += pitch;
4154 						}
4155 					}
4156 					else if(internal.samples == 8)
4157 					{
4158 						for(int y = 0; y < height; y++)
4159 						{
4160 							for(int x = 0; x < width; x += 4)
4161 							{
4162 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4163 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4164 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4165 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4166 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
4167 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
4168 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
4169 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
4170 
4171 								c0 = _mm_avg_epu16(c0, c1);
4172 								c2 = _mm_avg_epu16(c2, c3);
4173 								c4 = _mm_avg_epu16(c4, c5);
4174 								c6 = _mm_avg_epu16(c6, c7);
4175 								c0 = _mm_avg_epu16(c0, c2);
4176 								c4 = _mm_avg_epu16(c4, c6);
4177 								c0 = _mm_avg_epu16(c0, c4);
4178 
4179 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4180 							}
4181 
4182 							source0 += pitch;
4183 							source1 += pitch;
4184 							source2 += pitch;
4185 							source3 += pitch;
4186 							source4 += pitch;
4187 							source5 += pitch;
4188 							source6 += pitch;
4189 							source7 += pitch;
4190 						}
4191 					}
4192 					else if(internal.samples == 16)
4193 					{
4194 						for(int y = 0; y < height; y++)
4195 						{
4196 							for(int x = 0; x < width; x += 4)
4197 							{
4198 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4199 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4200 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4201 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4202 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
4203 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
4204 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
4205 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
4206 								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
4207 								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
4208 								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
4209 								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
4210 								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
4211 								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
4212 								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
4213 								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
4214 
4215 								c0 = _mm_avg_epu16(c0, c1);
4216 								c2 = _mm_avg_epu16(c2, c3);
4217 								c4 = _mm_avg_epu16(c4, c5);
4218 								c6 = _mm_avg_epu16(c6, c7);
4219 								c8 = _mm_avg_epu16(c8, c9);
4220 								cA = _mm_avg_epu16(cA, cB);
4221 								cC = _mm_avg_epu16(cC, cD);
4222 								cE = _mm_avg_epu16(cE, cF);
4223 								c0 = _mm_avg_epu16(c0, c2);
4224 								c4 = _mm_avg_epu16(c4, c6);
4225 								c8 = _mm_avg_epu16(c8, cA);
4226 								cC = _mm_avg_epu16(cC, cE);
4227 								c0 = _mm_avg_epu16(c0, c4);
4228 								c8 = _mm_avg_epu16(c8, cC);
4229 								c0 = _mm_avg_epu16(c0, c8);
4230 
4231 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4232 							}
4233 
4234 							source0 += pitch;
4235 							source1 += pitch;
4236 							source2 += pitch;
4237 							source3 += pitch;
4238 							source4 += pitch;
4239 							source5 += pitch;
4240 							source6 += pitch;
4241 							source7 += pitch;
4242 							source8 += pitch;
4243 							source9 += pitch;
4244 							sourceA += pitch;
4245 							sourceB += pitch;
4246 							sourceC += pitch;
4247 							sourceD += pitch;
4248 							sourceE += pitch;
4249 							sourceF += pitch;
4250 						}
4251 					}
4252 					else ASSERT(false);
4253 				}
4254 				else
4255 			#endif
4256 			{
4257 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
4258 
4259 				if(internal.samples == 2)
4260 				{
4261 					for(int y = 0; y < height; y++)
4262 					{
4263 						for(int x = 0; x < width; x++)
4264 						{
4265 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4266 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4267 
4268 							c0 = AVERAGE(c0, c1);
4269 
4270 							*(unsigned int*)(source0 + 4 * x) = c0;
4271 						}
4272 
4273 						source0 += pitch;
4274 						source1 += pitch;
4275 					}
4276 				}
4277 				else if(internal.samples == 4)
4278 				{
4279 					for(int y = 0; y < height; y++)
4280 					{
4281 						for(int x = 0; x < width; x++)
4282 						{
4283 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4284 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4285 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4286 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4287 
4288 							c0 = AVERAGE(c0, c1);
4289 							c2 = AVERAGE(c2, c3);
4290 							c0 = AVERAGE(c0, c2);
4291 
4292 							*(unsigned int*)(source0 + 4 * x) = c0;
4293 						}
4294 
4295 						source0 += pitch;
4296 						source1 += pitch;
4297 						source2 += pitch;
4298 						source3 += pitch;
4299 					}
4300 				}
4301 				else if(internal.samples == 8)
4302 				{
4303 					for(int y = 0; y < height; y++)
4304 					{
4305 						for(int x = 0; x < width; x++)
4306 						{
4307 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4308 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4309 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4310 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4311 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4312 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4313 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4314 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4315 
4316 							c0 = AVERAGE(c0, c1);
4317 							c2 = AVERAGE(c2, c3);
4318 							c4 = AVERAGE(c4, c5);
4319 							c6 = AVERAGE(c6, c7);
4320 							c0 = AVERAGE(c0, c2);
4321 							c4 = AVERAGE(c4, c6);
4322 							c0 = AVERAGE(c0, c4);
4323 
4324 							*(unsigned int*)(source0 + 4 * x) = c0;
4325 						}
4326 
4327 						source0 += pitch;
4328 						source1 += pitch;
4329 						source2 += pitch;
4330 						source3 += pitch;
4331 						source4 += pitch;
4332 						source5 += pitch;
4333 						source6 += pitch;
4334 						source7 += pitch;
4335 					}
4336 				}
4337 				else if(internal.samples == 16)
4338 				{
4339 					for(int y = 0; y < height; y++)
4340 					{
4341 						for(int x = 0; x < width; x++)
4342 						{
4343 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4344 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4345 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4346 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4347 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4348 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4349 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4350 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4351 							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
4352 							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
4353 							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
4354 							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
4355 							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
4356 							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
4357 							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
4358 							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
4359 
4360 							c0 = AVERAGE(c0, c1);
4361 							c2 = AVERAGE(c2, c3);
4362 							c4 = AVERAGE(c4, c5);
4363 							c6 = AVERAGE(c6, c7);
4364 							c8 = AVERAGE(c8, c9);
4365 							cA = AVERAGE(cA, cB);
4366 							cC = AVERAGE(cC, cD);
4367 							cE = AVERAGE(cE, cF);
4368 							c0 = AVERAGE(c0, c2);
4369 							c4 = AVERAGE(c4, c6);
4370 							c8 = AVERAGE(c8, cA);
4371 							cC = AVERAGE(cC, cE);
4372 							c0 = AVERAGE(c0, c4);
4373 							c8 = AVERAGE(c8, cC);
4374 							c0 = AVERAGE(c0, c8);
4375 
4376 							*(unsigned int*)(source0 + 4 * x) = c0;
4377 						}
4378 
4379 						source0 += pitch;
4380 						source1 += pitch;
4381 						source2 += pitch;
4382 						source3 += pitch;
4383 						source4 += pitch;
4384 						source5 += pitch;
4385 						source6 += pitch;
4386 						source7 += pitch;
4387 						source8 += pitch;
4388 						source9 += pitch;
4389 						sourceA += pitch;
4390 						sourceB += pitch;
4391 						sourceC += pitch;
4392 						sourceD += pitch;
4393 						sourceE += pitch;
4394 						sourceF += pitch;
4395 					}
4396 				}
4397 				else ASSERT(false);
4398 
4399 				#undef AVERAGE
4400 			}
4401 		}
4402 		else if(internal.format == FORMAT_A16B16G16R16)
4403 		{
4404 			#if defined(__i386__) || defined(__x86_64__)
4405 				if(CPUID::supportsSSE2() && (width % 2) == 0)
4406 				{
4407 					if(internal.samples == 2)
4408 					{
4409 						for(int y = 0; y < height; y++)
4410 						{
4411 							for(int x = 0; x < width; x += 2)
4412 							{
4413 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4414 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4415 
4416 								c0 = _mm_avg_epu16(c0, c1);
4417 
4418 								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4419 							}
4420 
4421 							source0 += pitch;
4422 							source1 += pitch;
4423 						}
4424 					}
4425 					else if(internal.samples == 4)
4426 					{
4427 						for(int y = 0; y < height; y++)
4428 						{
4429 							for(int x = 0; x < width; x += 2)
4430 							{
4431 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4432 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4433 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
4434 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
4435 
4436 								c0 = _mm_avg_epu16(c0, c1);
4437 								c2 = _mm_avg_epu16(c2, c3);
4438 								c0 = _mm_avg_epu16(c0, c2);
4439 
4440 								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4441 							}
4442 
4443 							source0 += pitch;
4444 							source1 += pitch;
4445 							source2 += pitch;
4446 							source3 += pitch;
4447 						}
4448 					}
4449 					else if(internal.samples == 8)
4450 					{
4451 						for(int y = 0; y < height; y++)
4452 						{
4453 							for(int x = 0; x < width; x += 2)
4454 							{
4455 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4456 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4457 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
4458 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
4459 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
4460 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
4461 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
4462 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
4463 
4464 								c0 = _mm_avg_epu16(c0, c1);
4465 								c2 = _mm_avg_epu16(c2, c3);
4466 								c4 = _mm_avg_epu16(c4, c5);
4467 								c6 = _mm_avg_epu16(c6, c7);
4468 								c0 = _mm_avg_epu16(c0, c2);
4469 								c4 = _mm_avg_epu16(c4, c6);
4470 								c0 = _mm_avg_epu16(c0, c4);
4471 
4472 								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4473 							}
4474 
4475 							source0 += pitch;
4476 							source1 += pitch;
4477 							source2 += pitch;
4478 							source3 += pitch;
4479 							source4 += pitch;
4480 							source5 += pitch;
4481 							source6 += pitch;
4482 							source7 += pitch;
4483 						}
4484 					}
4485 					else if(internal.samples == 16)
4486 					{
4487 						for(int y = 0; y < height; y++)
4488 						{
4489 							for(int x = 0; x < width; x += 2)
4490 							{
4491 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4492 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4493 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
4494 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
4495 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
4496 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
4497 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
4498 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
4499 								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 8 * x));
4500 								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 8 * x));
4501 								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 8 * x));
4502 								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 8 * x));
4503 								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 8 * x));
4504 								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 8 * x));
4505 								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 8 * x));
4506 								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 8 * x));
4507 
4508 								c0 = _mm_avg_epu16(c0, c1);
4509 								c2 = _mm_avg_epu16(c2, c3);
4510 								c4 = _mm_avg_epu16(c4, c5);
4511 								c6 = _mm_avg_epu16(c6, c7);
4512 								c8 = _mm_avg_epu16(c8, c9);
4513 								cA = _mm_avg_epu16(cA, cB);
4514 								cC = _mm_avg_epu16(cC, cD);
4515 								cE = _mm_avg_epu16(cE, cF);
4516 								c0 = _mm_avg_epu16(c0, c2);
4517 								c4 = _mm_avg_epu16(c4, c6);
4518 								c8 = _mm_avg_epu16(c8, cA);
4519 								cC = _mm_avg_epu16(cC, cE);
4520 								c0 = _mm_avg_epu16(c0, c4);
4521 								c8 = _mm_avg_epu16(c8, cC);
4522 								c0 = _mm_avg_epu16(c0, c8);
4523 
4524 								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4525 							}
4526 
4527 							source0 += pitch;
4528 							source1 += pitch;
4529 							source2 += pitch;
4530 							source3 += pitch;
4531 							source4 += pitch;
4532 							source5 += pitch;
4533 							source6 += pitch;
4534 							source7 += pitch;
4535 							source8 += pitch;
4536 							source9 += pitch;
4537 							sourceA += pitch;
4538 							sourceB += pitch;
4539 							sourceC += pitch;
4540 							sourceD += pitch;
4541 							sourceE += pitch;
4542 							sourceF += pitch;
4543 						}
4544 					}
4545 					else ASSERT(false);
4546 				}
4547 				else
4548 			#endif
4549 			{
4550 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
4551 
4552 				if(internal.samples == 2)
4553 				{
4554 					for(int y = 0; y < height; y++)
4555 					{
4556 						for(int x = 0; x < 2 * width; x++)
4557 						{
4558 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4559 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4560 
4561 							c0 = AVERAGE(c0, c1);
4562 
4563 							*(unsigned int*)(source0 + 4 * x) = c0;
4564 						}
4565 
4566 						source0 += pitch;
4567 						source1 += pitch;
4568 					}
4569 				}
4570 				else if(internal.samples == 4)
4571 				{
4572 					for(int y = 0; y < height; y++)
4573 					{
4574 						for(int x = 0; x < 2 * width; x++)
4575 						{
4576 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4577 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4578 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4579 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4580 
4581 							c0 = AVERAGE(c0, c1);
4582 							c2 = AVERAGE(c2, c3);
4583 							c0 = AVERAGE(c0, c2);
4584 
4585 							*(unsigned int*)(source0 + 4 * x) = c0;
4586 						}
4587 
4588 						source0 += pitch;
4589 						source1 += pitch;
4590 						source2 += pitch;
4591 						source3 += pitch;
4592 					}
4593 				}
4594 				else if(internal.samples == 8)
4595 				{
4596 					for(int y = 0; y < height; y++)
4597 					{
4598 						for(int x = 0; x < 2 * width; x++)
4599 						{
4600 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4601 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4602 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4603 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4604 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4605 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4606 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4607 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4608 
4609 							c0 = AVERAGE(c0, c1);
4610 							c2 = AVERAGE(c2, c3);
4611 							c4 = AVERAGE(c4, c5);
4612 							c6 = AVERAGE(c6, c7);
4613 							c0 = AVERAGE(c0, c2);
4614 							c4 = AVERAGE(c4, c6);
4615 							c0 = AVERAGE(c0, c4);
4616 
4617 							*(unsigned int*)(source0 + 4 * x) = c0;
4618 						}
4619 
4620 						source0 += pitch;
4621 						source1 += pitch;
4622 						source2 += pitch;
4623 						source3 += pitch;
4624 						source4 += pitch;
4625 						source5 += pitch;
4626 						source6 += pitch;
4627 						source7 += pitch;
4628 					}
4629 				}
4630 				else if(internal.samples == 16)
4631 				{
4632 					for(int y = 0; y < height; y++)
4633 					{
4634 						for(int x = 0; x < 2 * width; x++)
4635 						{
4636 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4637 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4638 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4639 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4640 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4641 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4642 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4643 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4644 							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
4645 							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
4646 							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
4647 							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
4648 							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
4649 							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
4650 							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
4651 							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
4652 
4653 							c0 = AVERAGE(c0, c1);
4654 							c2 = AVERAGE(c2, c3);
4655 							c4 = AVERAGE(c4, c5);
4656 							c6 = AVERAGE(c6, c7);
4657 							c8 = AVERAGE(c8, c9);
4658 							cA = AVERAGE(cA, cB);
4659 							cC = AVERAGE(cC, cD);
4660 							cE = AVERAGE(cE, cF);
4661 							c0 = AVERAGE(c0, c2);
4662 							c4 = AVERAGE(c4, c6);
4663 							c8 = AVERAGE(c8, cA);
4664 							cC = AVERAGE(cC, cE);
4665 							c0 = AVERAGE(c0, c4);
4666 							c8 = AVERAGE(c8, cC);
4667 							c0 = AVERAGE(c0, c8);
4668 
4669 							*(unsigned int*)(source0 + 4 * x) = c0;
4670 						}
4671 
4672 						source0 += pitch;
4673 						source1 += pitch;
4674 						source2 += pitch;
4675 						source3 += pitch;
4676 						source4 += pitch;
4677 						source5 += pitch;
4678 						source6 += pitch;
4679 						source7 += pitch;
4680 						source8 += pitch;
4681 						source9 += pitch;
4682 						sourceA += pitch;
4683 						sourceB += pitch;
4684 						sourceC += pitch;
4685 						sourceD += pitch;
4686 						sourceE += pitch;
4687 						sourceF += pitch;
4688 					}
4689 				}
4690 				else ASSERT(false);
4691 
4692 				#undef AVERAGE
4693 			}
4694 		}
		else if(internal.format == FORMAT_R32F)
		{
			// Single-channel 32-bit float: every pixel is one 4-byte float, so the
			// byte offset of pixel x within a row is 4 * x. For each pixel the
			// internal.samples values read through source0..sourceN are summed
			// pairwise, scaled by 1 / samples, and the mean is written back in
			// place through source0. All pointers advance by pitch after each row.
			// NOTE(review): source0..sourceF, pitch, width and height are set up
			// outside this excerpt; presumably one pointer per sample plane - confirm.
			#if defined(__i386__) || defined(__x86_64__)
				// SSE path: four floats (16 bytes) per iteration, hence the
				// width % 4 requirement. _mm_load_ps/_mm_store_ps are the aligned
				// variants, so this assumes every row start is 16-byte aligned -
				// TODO confirm against the buffer allocation and pitch.
				if(CPUID::supportsSSE() && (width % 4) == 0)
				{
					if(internal.samples == 2)
					{
						for(int y = 0; y < height; y++)
						{
							for(int x = 0; x < width; x += 4)
							{
								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));

								c0 = _mm_add_ps(c0, c1);
								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));

								_mm_store_ps((float*)(source0 + 4 * x), c0);
							}

							source0 += pitch;
							source1 += pitch;
						}
					}
					else if(internal.samples == 4)
					{
						for(int y = 0; y < height; y++)
						{
							for(int x = 0; x < width; x += 4)
							{
								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));

								c0 = _mm_add_ps(c0, c1);
								c2 = _mm_add_ps(c2, c3);
								c0 = _mm_add_ps(c0, c2);
								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));

								_mm_store_ps((float*)(source0 + 4 * x), c0);
							}

							source0 += pitch;
							source1 += pitch;
							source2 += pitch;
							source3 += pitch;
						}
					}
					else if(internal.samples == 8)
					{
						for(int y = 0; y < height; y++)
						{
							for(int x = 0; x < width; x += 4)
							{
								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
								__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
								__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
								__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
								__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));

								c0 = _mm_add_ps(c0, c1);
								c2 = _mm_add_ps(c2, c3);
								c4 = _mm_add_ps(c4, c5);
								c6 = _mm_add_ps(c6, c7);
								c0 = _mm_add_ps(c0, c2);
								c4 = _mm_add_ps(c4, c6);
								c0 = _mm_add_ps(c0, c4);
								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));

								_mm_store_ps((float*)(source0 + 4 * x), c0);
							}

							source0 += pitch;
							source1 += pitch;
							source2 += pitch;
							source3 += pitch;
							source4 += pitch;
							source5 += pitch;
							source6 += pitch;
							source7 += pitch;
						}
					}
					else if(internal.samples == 16)
					{
						for(int y = 0; y < height; y++)
						{
							for(int x = 0; x < width; x += 4)
							{
								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
								__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
								__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
								__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
								__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
								__m128 c8 = _mm_load_ps((float*)(source8 + 4 * x));
								__m128 c9 = _mm_load_ps((float*)(source9 + 4 * x));
								__m128 cA = _mm_load_ps((float*)(sourceA + 4 * x));
								__m128 cB = _mm_load_ps((float*)(sourceB + 4 * x));
								__m128 cC = _mm_load_ps((float*)(sourceC + 4 * x));
								__m128 cD = _mm_load_ps((float*)(sourceD + 4 * x));
								__m128 cE = _mm_load_ps((float*)(sourceE + 4 * x));
								__m128 cF = _mm_load_ps((float*)(sourceF + 4 * x));

								c0 = _mm_add_ps(c0, c1);
								c2 = _mm_add_ps(c2, c3);
								c4 = _mm_add_ps(c4, c5);
								c6 = _mm_add_ps(c6, c7);
								c8 = _mm_add_ps(c8, c9);
								cA = _mm_add_ps(cA, cB);
								cC = _mm_add_ps(cC, cD);
								cE = _mm_add_ps(cE, cF);
								c0 = _mm_add_ps(c0, c2);
								c4 = _mm_add_ps(c4, c6);
								c8 = _mm_add_ps(c8, cA);
								cC = _mm_add_ps(cC, cE);
								c0 = _mm_add_ps(c0, c4);
								c8 = _mm_add_ps(c8, cC);
								c0 = _mm_add_ps(c0, c8);
								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));

								_mm_store_ps((float*)(source0 + 4 * x), c0);
							}

							source0 += pitch;
							source1 += pitch;
							source2 += pitch;
							source3 += pitch;
							source4 += pitch;
							source5 += pitch;
							source6 += pitch;
							source7 += pitch;
							source8 += pitch;
							source9 += pitch;
							sourceA += pitch;
							sourceB += pitch;
							sourceC += pitch;
							sourceD += pitch;
							sourceE += pitch;
							sourceF += pitch;
						}
					}
					// Only 2, 4, 8 and 16 samples are handled.
					else ASSERT(false);
				}
				else
			#endif
			{
				// Scalar fallback: taken on non-x86 builds, when SSE is not
				// reported, or when width is not a multiple of 4. Processes one
				// float per iteration using the same pairwise summation order as
				// the SSE path.
				if(internal.samples == 2)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x++)
						{
							float c0 = *(float*)(source0 + 4 * x);
							float c1 = *(float*)(source1 + 4 * x);

							c0 = c0 + c1;
							c0 *= 1.0f / 2.0f;

							*(float*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
					}
				}
				else if(internal.samples == 4)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x++)
						{
							float c0 = *(float*)(source0 + 4 * x);
							float c1 = *(float*)(source1 + 4 * x);
							float c2 = *(float*)(source2 + 4 * x);
							float c3 = *(float*)(source3 + 4 * x);

							c0 = c0 + c1;
							c2 = c2 + c3;
							c0 = c0 + c2;
							c0 *= 1.0f / 4.0f;

							*(float*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
					}
				}
				else if(internal.samples == 8)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x++)
						{
							float c0 = *(float*)(source0 + 4 * x);
							float c1 = *(float*)(source1 + 4 * x);
							float c2 = *(float*)(source2 + 4 * x);
							float c3 = *(float*)(source3 + 4 * x);
							float c4 = *(float*)(source4 + 4 * x);
							float c5 = *(float*)(source5 + 4 * x);
							float c6 = *(float*)(source6 + 4 * x);
							float c7 = *(float*)(source7 + 4 * x);

							c0 = c0 + c1;
							c2 = c2 + c3;
							c4 = c4 + c5;
							c6 = c6 + c7;
							c0 = c0 + c2;
							c4 = c4 + c6;
							c0 = c0 + c4;
							c0 *= 1.0f / 8.0f;

							*(float*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
						source4 += pitch;
						source5 += pitch;
						source6 += pitch;
						source7 += pitch;
					}
				}
				else if(internal.samples == 16)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < width; x++)
						{
							float c0 = *(float*)(source0 + 4 * x);
							float c1 = *(float*)(source1 + 4 * x);
							float c2 = *(float*)(source2 + 4 * x);
							float c3 = *(float*)(source3 + 4 * x);
							float c4 = *(float*)(source4 + 4 * x);
							float c5 = *(float*)(source5 + 4 * x);
							float c6 = *(float*)(source6 + 4 * x);
							float c7 = *(float*)(source7 + 4 * x);
							float c8 = *(float*)(source8 + 4 * x);
							float c9 = *(float*)(source9 + 4 * x);
							float cA = *(float*)(sourceA + 4 * x);
							float cB = *(float*)(sourceB + 4 * x);
							float cC = *(float*)(sourceC + 4 * x);
							float cD = *(float*)(sourceD + 4 * x);
							float cE = *(float*)(sourceE + 4 * x);
							float cF = *(float*)(sourceF + 4 * x);

							c0 = c0 + c1;
							c2 = c2 + c3;
							c4 = c4 + c5;
							c6 = c6 + c7;
							c8 = c8 + c9;
							cA = cA + cB;
							cC = cC + cD;
							cE = cE + cF;
							c0 = c0 + c2;
							c4 = c4 + c6;
							c8 = c8 + cA;
							cC = cC + cE;
							c0 = c0 + c4;
							c8 = c8 + cC;
							c0 = c0 + c8;
							c0 *= 1.0f / 16.0f;

							*(float*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
						source4 += pitch;
						source5 += pitch;
						source6 += pitch;
						source7 += pitch;
						source8 += pitch;
						source9 += pitch;
						sourceA += pitch;
						sourceB += pitch;
						sourceC += pitch;
						sourceD += pitch;
						sourceE += pitch;
						sourceF += pitch;
					}
				}
				// Only 2, 4, 8 and 16 samples are handled.
				else ASSERT(false);
			}
		}
		else if(internal.format == FORMAT_G32R32F)
		{
			// Two-channel 32-bit float: every pixel is two 4-byte floats (8 bytes).
			// For each float the internal.samples values read through
			// source0..sourceN are summed pairwise, scaled by 1 / samples, and the
			// mean is written back in place through source0. All pointers advance
			// by pitch after each row.
			// NOTE(review): source0..sourceF, pitch, width and height are set up
			// outside this excerpt; presumably one pointer per sample plane - confirm.
			#if defined(__i386__) || defined(__x86_64__)
				// SSE path: one 16-byte vector holds two pixels, so x steps by 2
				// (byte offset 8 * x) and width must be even. _mm_load_ps and
				// _mm_store_ps are the aligned variants, so this assumes every row
				// start is 16-byte aligned - TODO confirm against the buffer
				// allocation and pitch.
				if(CPUID::supportsSSE() && (width % 2) == 0)
				{
					if(internal.samples == 2)
					{
						for(int y = 0; y < height; y++)
						{
							for(int x = 0; x < width; x += 2)
							{
								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));

								c0 = _mm_add_ps(c0, c1);
								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));

								_mm_store_ps((float*)(source0 + 8 * x), c0);
							}

							source0 += pitch;
							source1 += pitch;
						}
					}
					else if(internal.samples == 4)
					{
						for(int y = 0; y < height; y++)
						{
							for(int x = 0; x < width; x += 2)
							{
								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));

								c0 = _mm_add_ps(c0, c1);
								c2 = _mm_add_ps(c2, c3);
								c0 = _mm_add_ps(c0, c2);
								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));

								_mm_store_ps((float*)(source0 + 8 * x), c0);
							}

							source0 += pitch;
							source1 += pitch;
							source2 += pitch;
							source3 += pitch;
						}
					}
					else if(internal.samples == 8)
					{
						for(int y = 0; y < height; y++)
						{
							for(int x = 0; x < width; x += 2)
							{
								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
								__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
								__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
								__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
								__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));

								c0 = _mm_add_ps(c0, c1);
								c2 = _mm_add_ps(c2, c3);
								c4 = _mm_add_ps(c4, c5);
								c6 = _mm_add_ps(c6, c7);
								c0 = _mm_add_ps(c0, c2);
								c4 = _mm_add_ps(c4, c6);
								c0 = _mm_add_ps(c0, c4);
								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));

								_mm_store_ps((float*)(source0 + 8 * x), c0);
							}

							source0 += pitch;
							source1 += pitch;
							source2 += pitch;
							source3 += pitch;
							source4 += pitch;
							source5 += pitch;
							source6 += pitch;
							source7 += pitch;
						}
					}
					else if(internal.samples == 16)
					{
						for(int y = 0; y < height; y++)
						{
							for(int x = 0; x < width; x += 2)
							{
								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
								__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
								__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
								__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
								__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
								__m128 c8 = _mm_load_ps((float*)(source8 + 8 * x));
								__m128 c9 = _mm_load_ps((float*)(source9 + 8 * x));
								__m128 cA = _mm_load_ps((float*)(sourceA + 8 * x));
								__m128 cB = _mm_load_ps((float*)(sourceB + 8 * x));
								__m128 cC = _mm_load_ps((float*)(sourceC + 8 * x));
								__m128 cD = _mm_load_ps((float*)(sourceD + 8 * x));
								__m128 cE = _mm_load_ps((float*)(sourceE + 8 * x));
								__m128 cF = _mm_load_ps((float*)(sourceF + 8 * x));

								c0 = _mm_add_ps(c0, c1);
								c2 = _mm_add_ps(c2, c3);
								c4 = _mm_add_ps(c4, c5);
								c6 = _mm_add_ps(c6, c7);
								c8 = _mm_add_ps(c8, c9);
								cA = _mm_add_ps(cA, cB);
								cC = _mm_add_ps(cC, cD);
								cE = _mm_add_ps(cE, cF);
								c0 = _mm_add_ps(c0, c2);
								c4 = _mm_add_ps(c4, c6);
								c8 = _mm_add_ps(c8, cA);
								cC = _mm_add_ps(cC, cE);
								c0 = _mm_add_ps(c0, c4);
								c8 = _mm_add_ps(c8, cC);
								c0 = _mm_add_ps(c0, c8);
								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));

								_mm_store_ps((float*)(source0 + 8 * x), c0);
							}

							source0 += pitch;
							source1 += pitch;
							source2 += pitch;
							source3 += pitch;
							source4 += pitch;
							source5 += pitch;
							source6 += pitch;
							source7 += pitch;
							source8 += pitch;
							source9 += pitch;
							sourceA += pitch;
							sourceB += pitch;
							sourceC += pitch;
							sourceD += pitch;
							sourceE += pitch;
							sourceF += pitch;
						}
					}
					// Only 2, 4, 8 and 16 samples are handled.
					else ASSERT(false);
				}
				else
			#endif
			{
				// Scalar fallback: taken on non-x86 builds, when SSE is not
				// reported, or when width is odd. Here x indexes individual floats
				// rather than pixels, so each row covers 2 * width floats at byte
				// offset 4 * x. Summation order matches the SSE path.
				if(internal.samples == 2)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < 2 * width; x++)
						{
							float c0 = *(float*)(source0 + 4 * x);
							float c1 = *(float*)(source1 + 4 * x);

							c0 = c0 + c1;
							c0 *= 1.0f / 2.0f;

							*(float*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
					}
				}
				else if(internal.samples == 4)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < 2 * width; x++)
						{
							float c0 = *(float*)(source0 + 4 * x);
							float c1 = *(float*)(source1 + 4 * x);
							float c2 = *(float*)(source2 + 4 * x);
							float c3 = *(float*)(source3 + 4 * x);

							c0 = c0 + c1;
							c2 = c2 + c3;
							c0 = c0 + c2;
							c0 *= 1.0f / 4.0f;

							*(float*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
					}
				}
				else if(internal.samples == 8)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < 2 * width; x++)
						{
							float c0 = *(float*)(source0 + 4 * x);
							float c1 = *(float*)(source1 + 4 * x);
							float c2 = *(float*)(source2 + 4 * x);
							float c3 = *(float*)(source3 + 4 * x);
							float c4 = *(float*)(source4 + 4 * x);
							float c5 = *(float*)(source5 + 4 * x);
							float c6 = *(float*)(source6 + 4 * x);
							float c7 = *(float*)(source7 + 4 * x);

							c0 = c0 + c1;
							c2 = c2 + c3;
							c4 = c4 + c5;
							c6 = c6 + c7;
							c0 = c0 + c2;
							c4 = c4 + c6;
							c0 = c0 + c4;
							c0 *= 1.0f / 8.0f;

							*(float*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
						source4 += pitch;
						source5 += pitch;
						source6 += pitch;
						source7 += pitch;
					}
				}
				else if(internal.samples == 16)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < 2 * width; x++)
						{
							float c0 = *(float*)(source0 + 4 * x);
							float c1 = *(float*)(source1 + 4 * x);
							float c2 = *(float*)(source2 + 4 * x);
							float c3 = *(float*)(source3 + 4 * x);
							float c4 = *(float*)(source4 + 4 * x);
							float c5 = *(float*)(source5 + 4 * x);
							float c6 = *(float*)(source6 + 4 * x);
							float c7 = *(float*)(source7 + 4 * x);
							float c8 = *(float*)(source8 + 4 * x);
							float c9 = *(float*)(source9 + 4 * x);
							float cA = *(float*)(sourceA + 4 * x);
							float cB = *(float*)(sourceB + 4 * x);
							float cC = *(float*)(sourceC + 4 * x);
							float cD = *(float*)(sourceD + 4 * x);
							float cE = *(float*)(sourceE + 4 * x);
							float cF = *(float*)(sourceF + 4 * x);

							c0 = c0 + c1;
							c2 = c2 + c3;
							c4 = c4 + c5;
							c6 = c6 + c7;
							c8 = c8 + c9;
							cA = cA + cB;
							cC = cC + cD;
							cE = cE + cF;
							c0 = c0 + c2;
							c4 = c4 + c6;
							c8 = c8 + cA;
							cC = cC + cE;
							c0 = c0 + c4;
							c8 = c8 + cC;
							c0 = c0 + c8;
							c0 *= 1.0f / 16.0f;

							*(float*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
						source4 += pitch;
						source5 += pitch;
						source6 += pitch;
						source7 += pitch;
						source8 += pitch;
						source9 += pitch;
						sourceA += pitch;
						sourceB += pitch;
						sourceC += pitch;
						sourceD += pitch;
						sourceE += pitch;
						sourceF += pitch;
					}
				}
				// Only 2, 4, 8 and 16 samples are handled.
				else ASSERT(false);
			}
		}
		else if(internal.format == FORMAT_A32B32G32R32F ||
		        internal.format == FORMAT_X32B32G32R32F ||
		        internal.format == FORMAT_X32B32G32R32F_UNSIGNED)
		{
			// Four-channel 32-bit float: every pixel is four 4-byte floats
			// (16 bytes). For each float the internal.samples values read through
			// source0..sourceN are summed pairwise, scaled by 1 / samples, and the
			// mean is written back in place through source0. All pointers advance
			// by pitch after each row. All four channels are averaged the same
			// way, including the X/A channel.
			// NOTE(review): source0..sourceF, pitch, width and height are set up
			// outside this excerpt; presumably one pointer per sample plane - confirm.
			#if defined(__i386__) || defined(__x86_64__)
				// SSE path: one pixel fills exactly one 16-byte vector, so there is
				// no width constraint and x steps by one pixel (byte offset 16 * x).
				// _mm_load_ps/_mm_store_ps are the aligned variants, so this assumes
				// every row start is 16-byte aligned - TODO confirm against the
				// buffer allocation and pitch.
				if(CPUID::supportsSSE())
				{
					if(internal.samples == 2)
					{
						for(int y = 0; y < height; y++)
						{
							for(int x = 0; x < width; x++)
							{
								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));

								c0 = _mm_add_ps(c0, c1);
								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));

								_mm_store_ps((float*)(source0 + 16 * x), c0);
							}

							source0 += pitch;
							source1 += pitch;
						}
					}
					else if(internal.samples == 4)
					{
						for(int y = 0; y < height; y++)
						{
							for(int x = 0; x < width; x++)
							{
								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));

								c0 = _mm_add_ps(c0, c1);
								c2 = _mm_add_ps(c2, c3);
								c0 = _mm_add_ps(c0, c2);
								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));

								_mm_store_ps((float*)(source0 + 16 * x), c0);
							}

							source0 += pitch;
							source1 += pitch;
							source2 += pitch;
							source3 += pitch;
						}
					}
					else if(internal.samples == 8)
					{
						for(int y = 0; y < height; y++)
						{
							for(int x = 0; x < width; x++)
							{
								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
								__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
								__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
								__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
								__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));

								c0 = _mm_add_ps(c0, c1);
								c2 = _mm_add_ps(c2, c3);
								c4 = _mm_add_ps(c4, c5);
								c6 = _mm_add_ps(c6, c7);
								c0 = _mm_add_ps(c0, c2);
								c4 = _mm_add_ps(c4, c6);
								c0 = _mm_add_ps(c0, c4);
								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));

								_mm_store_ps((float*)(source0 + 16 * x), c0);
							}

							source0 += pitch;
							source1 += pitch;
							source2 += pitch;
							source3 += pitch;
							source4 += pitch;
							source5 += pitch;
							source6 += pitch;
							source7 += pitch;
						}
					}
					else if(internal.samples == 16)
					{
						for(int y = 0; y < height; y++)
						{
							for(int x = 0; x < width; x++)
							{
								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
								__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
								__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
								__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
								__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
								__m128 c8 = _mm_load_ps((float*)(source8 + 16 * x));
								__m128 c9 = _mm_load_ps((float*)(source9 + 16 * x));
								__m128 cA = _mm_load_ps((float*)(sourceA + 16 * x));
								__m128 cB = _mm_load_ps((float*)(sourceB + 16 * x));
								__m128 cC = _mm_load_ps((float*)(sourceC + 16 * x));
								__m128 cD = _mm_load_ps((float*)(sourceD + 16 * x));
								__m128 cE = _mm_load_ps((float*)(sourceE + 16 * x));
								__m128 cF = _mm_load_ps((float*)(sourceF + 16 * x));

								c0 = _mm_add_ps(c0, c1);
								c2 = _mm_add_ps(c2, c3);
								c4 = _mm_add_ps(c4, c5);
								c6 = _mm_add_ps(c6, c7);
								c8 = _mm_add_ps(c8, c9);
								cA = _mm_add_ps(cA, cB);
								cC = _mm_add_ps(cC, cD);
								cE = _mm_add_ps(cE, cF);
								c0 = _mm_add_ps(c0, c2);
								c4 = _mm_add_ps(c4, c6);
								c8 = _mm_add_ps(c8, cA);
								cC = _mm_add_ps(cC, cE);
								c0 = _mm_add_ps(c0, c4);
								c8 = _mm_add_ps(c8, cC);
								c0 = _mm_add_ps(c0, c8);
								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));

								_mm_store_ps((float*)(source0 + 16 * x), c0);
							}

							source0 += pitch;
							source1 += pitch;
							source2 += pitch;
							source3 += pitch;
							source4 += pitch;
							source5 += pitch;
							source6 += pitch;
							source7 += pitch;
							source8 += pitch;
							source9 += pitch;
							sourceA += pitch;
							sourceB += pitch;
							sourceC += pitch;
							sourceD += pitch;
							sourceE += pitch;
							sourceF += pitch;
						}
					}
					// Only 2, 4, 8 and 16 samples are handled.
					else ASSERT(false);
				}
				else
			#endif
			{
				// Scalar fallback: taken on non-x86 builds or when SSE is not
				// reported. Here x indexes individual floats rather than pixels, so
				// each row covers 4 * width floats at byte offset 4 * x. Summation
				// order matches the SSE path.
				if(internal.samples == 2)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < 4 * width; x++)
						{
							float c0 = *(float*)(source0 + 4 * x);
							float c1 = *(float*)(source1 + 4 * x);

							c0 = c0 + c1;
							c0 *= 1.0f / 2.0f;

							*(float*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
					}
				}
				else if(internal.samples == 4)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < 4 * width; x++)
						{
							float c0 = *(float*)(source0 + 4 * x);
							float c1 = *(float*)(source1 + 4 * x);
							float c2 = *(float*)(source2 + 4 * x);
							float c3 = *(float*)(source3 + 4 * x);

							c0 = c0 + c1;
							c2 = c2 + c3;
							c0 = c0 + c2;
							c0 *= 1.0f / 4.0f;

							*(float*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
					}
				}
				else if(internal.samples == 8)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < 4 * width; x++)
						{
							float c0 = *(float*)(source0 + 4 * x);
							float c1 = *(float*)(source1 + 4 * x);
							float c2 = *(float*)(source2 + 4 * x);
							float c3 = *(float*)(source3 + 4 * x);
							float c4 = *(float*)(source4 + 4 * x);
							float c5 = *(float*)(source5 + 4 * x);
							float c6 = *(float*)(source6 + 4 * x);
							float c7 = *(float*)(source7 + 4 * x);

							c0 = c0 + c1;
							c2 = c2 + c3;
							c4 = c4 + c5;
							c6 = c6 + c7;
							c0 = c0 + c2;
							c4 = c4 + c6;
							c0 = c0 + c4;
							c0 *= 1.0f / 8.0f;

							*(float*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
						source4 += pitch;
						source5 += pitch;
						source6 += pitch;
						source7 += pitch;
					}
				}
				else if(internal.samples == 16)
				{
					for(int y = 0; y < height; y++)
					{
						for(int x = 0; x < 4 * width; x++)
						{
							float c0 = *(float*)(source0 + 4 * x);
							float c1 = *(float*)(source1 + 4 * x);
							float c2 = *(float*)(source2 + 4 * x);
							float c3 = *(float*)(source3 + 4 * x);
							float c4 = *(float*)(source4 + 4 * x);
							float c5 = *(float*)(source5 + 4 * x);
							float c6 = *(float*)(source6 + 4 * x);
							float c7 = *(float*)(source7 + 4 * x);
							float c8 = *(float*)(source8 + 4 * x);
							float c9 = *(float*)(source9 + 4 * x);
							float cA = *(float*)(sourceA + 4 * x);
							float cB = *(float*)(sourceB + 4 * x);
							float cC = *(float*)(sourceC + 4 * x);
							float cD = *(float*)(sourceD + 4 * x);
							float cE = *(float*)(sourceE + 4 * x);
							float cF = *(float*)(sourceF + 4 * x);

							c0 = c0 + c1;
							c2 = c2 + c3;
							c4 = c4 + c5;
							c6 = c6 + c7;
							c8 = c8 + c9;
							cA = cA + cB;
							cC = cC + cD;
							cE = cE + cF;
							c0 = c0 + c2;
							c4 = c4 + c6;
							c8 = c8 + cA;
							cC = cC + cE;
							c0 = c0 + c4;
							c8 = c8 + cC;
							c0 = c0 + c8;
							c0 *= 1.0f / 16.0f;

							*(float*)(source0 + 4 * x) = c0;
						}

						source0 += pitch;
						source1 += pitch;
						source2 += pitch;
						source3 += pitch;
						source4 += pitch;
						source5 += pitch;
						source6 += pitch;
						source7 += pitch;
						source8 += pitch;
						source9 += pitch;
						sourceA += pitch;
						sourceB += pitch;
						sourceC += pitch;
						sourceD += pitch;
						sourceE += pitch;
						sourceF += pitch;
					}
				}
				// Only 2, 4, 8 and 16 samples are handled.
				else ASSERT(false);
			}
		}
5588 		else if(internal.format == FORMAT_R5G6B5)
5589 		{
5590 			#if defined(__i386__) || defined(__x86_64__)
5591 				if(CPUID::supportsSSE2() && (width % 8) == 0)
5592 				{
5593 					if(internal.samples == 2)
5594 					{
5595 						for(int y = 0; y < height; y++)
5596 						{
5597 							for(int x = 0; x < width; x += 8)
5598 							{
5599 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5600 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5601 
5602 								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5603 								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5604 								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5605 								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5606 								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5607 								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5608 
5609 								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5610 								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5611 								c1 = _mm_avg_epu16(c0__g_, c1__g_);
5612 								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5613 								c0 = _mm_or_si128(c0, c1);
5614 
5615 								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5616 							}
5617 
5618 							source0 += pitch;
5619 							source1 += pitch;
5620 						}
5621 					}
5622 					else if(internal.samples == 4)
5623 					{
5624 						for(int y = 0; y < height; y++)
5625 						{
5626 							for(int x = 0; x < width; x += 8)
5627 							{
5628 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5629 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5630 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
5631 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
5632 
5633 								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5634 								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5635 								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5636 								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5637 								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5638 								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5639 								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
5640 								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
5641 								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
5642 								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
5643 
5644 								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5645 								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
5646 								c0 = _mm_avg_epu8(c0, c2);
5647 								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5648 								c1 = _mm_avg_epu16(c0__g_, c1__g_);
5649 								c3 = _mm_avg_epu16(c2__g_, c3__g_);
5650 								c1 = _mm_avg_epu16(c1, c3);
5651 								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5652 								c0 = _mm_or_si128(c0, c1);
5653 
5654 								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5655 							}
5656 
5657 							source0 += pitch;
5658 							source1 += pitch;
5659 							source2 += pitch;
5660 							source3 += pitch;
5661 						}
5662 					}
5663 					else if(internal.samples == 8)
5664 					{
5665 						for(int y = 0; y < height; y++)
5666 						{
5667 							for(int x = 0; x < width; x += 8)
5668 							{
5669 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5670 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5671 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
5672 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
5673 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
5674 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
5675 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
5676 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
5677 
5678 								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5679 								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5680 								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5681 								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5682 								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5683 								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5684 								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
5685 								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
5686 								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
5687 								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
5688 								__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
5689 								__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
5690 								__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
5691 								__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
5692 								__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
5693 								__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
5694 								__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
5695 								__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
5696 
5697 								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5698 								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
5699 								c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
5700 								c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
5701 								c0 = _mm_avg_epu8(c0, c2);
5702 								c4 = _mm_avg_epu8(c4, c6);
5703 								c0 = _mm_avg_epu8(c0, c4);
5704 								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5705 								c1 = _mm_avg_epu16(c0__g_, c1__g_);
5706 								c3 = _mm_avg_epu16(c2__g_, c3__g_);
5707 								c5 = _mm_avg_epu16(c4__g_, c5__g_);
5708 								c7 = _mm_avg_epu16(c6__g_, c7__g_);
5709 								c1 = _mm_avg_epu16(c1, c3);
5710 								c5 = _mm_avg_epu16(c5, c7);
5711 								c1 = _mm_avg_epu16(c1, c5);
5712 								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5713 								c0 = _mm_or_si128(c0, c1);
5714 
5715 								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5716 							}
5717 
5718 							source0 += pitch;
5719 							source1 += pitch;
5720 							source2 += pitch;
5721 							source3 += pitch;
5722 							source4 += pitch;
5723 							source5 += pitch;
5724 							source6 += pitch;
5725 							source7 += pitch;
5726 						}
5727 					}
5728 					else if(internal.samples == 16)
5729 					{
5730 						for(int y = 0; y < height; y++)
5731 						{
5732 							for(int x = 0; x < width; x += 8)
5733 							{
5734 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5735 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5736 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
5737 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
5738 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
5739 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
5740 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
5741 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
5742 								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 2 * x));
5743 								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 2 * x));
5744 								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 2 * x));
5745 								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 2 * x));
5746 								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 2 * x));
5747 								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 2 * x));
5748 								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 2 * x));
5749 								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 2 * x));
5750 
5751 								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5752 								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5753 								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5754 								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5755 								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5756 								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5757 								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
5758 								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
5759 								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
5760 								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
5761 								__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
5762 								__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
5763 								__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
5764 								__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
5765 								__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
5766 								__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
5767 								__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
5768 								__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
5769 								__m128i c8_r_b = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(r_b));
5770 								__m128i c8__g_ = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(_g_));
5771 								__m128i c9_r_b = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(r_b));
5772 								__m128i c9__g_ = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(_g_));
5773 								__m128i cA_r_b = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(r_b));
5774 								__m128i cA__g_ = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(_g_));
5775 								__m128i cB_r_b = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(r_b));
5776 								__m128i cB__g_ = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(_g_));
5777 								__m128i cC_r_b = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(r_b));
5778 								__m128i cC__g_ = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(_g_));
5779 								__m128i cD_r_b = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(r_b));
5780 								__m128i cD__g_ = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(_g_));
5781 								__m128i cE_r_b = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(r_b));
5782 								__m128i cE__g_ = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(_g_));
5783 								__m128i cF_r_b = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(r_b));
5784 								__m128i cF__g_ = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(_g_));
5785 
5786 								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5787 								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
5788 								c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
5789 								c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
5790 								c8 = _mm_avg_epu8(c8_r_b, c9_r_b);
5791 								cA = _mm_avg_epu8(cA_r_b, cB_r_b);
5792 								cC = _mm_avg_epu8(cC_r_b, cD_r_b);
5793 								cE = _mm_avg_epu8(cE_r_b, cF_r_b);
5794 								c0 = _mm_avg_epu8(c0, c2);
5795 								c4 = _mm_avg_epu8(c4, c6);
5796 								c8 = _mm_avg_epu8(c8, cA);
5797 								cC = _mm_avg_epu8(cC, cE);
5798 								c0 = _mm_avg_epu8(c0, c4);
5799 								c8 = _mm_avg_epu8(c8, cC);
5800 								c0 = _mm_avg_epu8(c0, c8);
5801 								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5802 								c1 = _mm_avg_epu16(c0__g_, c1__g_);
5803 								c3 = _mm_avg_epu16(c2__g_, c3__g_);
5804 								c5 = _mm_avg_epu16(c4__g_, c5__g_);
5805 								c7 = _mm_avg_epu16(c6__g_, c7__g_);
5806 								c9 = _mm_avg_epu16(c8__g_, c9__g_);
5807 								cB = _mm_avg_epu16(cA__g_, cB__g_);
5808 								cD = _mm_avg_epu16(cC__g_, cD__g_);
5809 								cF = _mm_avg_epu16(cE__g_, cF__g_);
5810 								c1 = _mm_avg_epu8(c1, c3);
5811 								c5 = _mm_avg_epu8(c5, c7);
5812 								c9 = _mm_avg_epu8(c9, cB);
5813 								cD = _mm_avg_epu8(cD, cF);
5814 								c1 = _mm_avg_epu8(c1, c5);
5815 								c9 = _mm_avg_epu8(c9, cD);
5816 								c1 = _mm_avg_epu8(c1, c9);
5817 								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5818 								c0 = _mm_or_si128(c0, c1);
5819 
5820 								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5821 							}
5822 
5823 							source0 += pitch;
5824 							source1 += pitch;
5825 							source2 += pitch;
5826 							source3 += pitch;
5827 							source4 += pitch;
5828 							source5 += pitch;
5829 							source6 += pitch;
5830 							source7 += pitch;
5831 							source8 += pitch;
5832 							source9 += pitch;
5833 							sourceA += pitch;
5834 							sourceB += pitch;
5835 							sourceC += pitch;
5836 							sourceD += pitch;
5837 							sourceE += pitch;
5838 							sourceF += pitch;
5839 						}
5840 					}
5841 					else ASSERT(false);
5842 				}
5843 				else
5844 			#endif
5845 			{
5846 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7BEF) + (((x) ^ (y)) & 0x0821))
5847 
5848 				if(internal.samples == 2)
5849 				{
5850 					for(int y = 0; y < height; y++)
5851 					{
5852 						for(int x = 0; x < width; x++)
5853 						{
5854 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
5855 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
5856 
5857 							c0 = AVERAGE(c0, c1);
5858 
5859 							*(unsigned short*)(source0 + 2 * x) = c0;
5860 						}
5861 
5862 						source0 += pitch;
5863 						source1 += pitch;
5864 					}
5865 				}
5866 				else if(internal.samples == 4)
5867 				{
5868 					for(int y = 0; y < height; y++)
5869 					{
5870 						for(int x = 0; x < width; x++)
5871 						{
5872 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
5873 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
5874 							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
5875 							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
5876 
5877 							c0 = AVERAGE(c0, c1);
5878 							c2 = AVERAGE(c2, c3);
5879 							c0 = AVERAGE(c0, c2);
5880 
5881 							*(unsigned short*)(source0 + 2 * x) = c0;
5882 						}
5883 
5884 						source0 += pitch;
5885 						source1 += pitch;
5886 						source2 += pitch;
5887 						source3 += pitch;
5888 					}
5889 				}
5890 				else if(internal.samples == 8)
5891 				{
5892 					for(int y = 0; y < height; y++)
5893 					{
5894 						for(int x = 0; x < width; x++)
5895 						{
5896 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
5897 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
5898 							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
5899 							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
5900 							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
5901 							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
5902 							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
5903 							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
5904 
5905 							c0 = AVERAGE(c0, c1);
5906 							c2 = AVERAGE(c2, c3);
5907 							c4 = AVERAGE(c4, c5);
5908 							c6 = AVERAGE(c6, c7);
5909 							c0 = AVERAGE(c0, c2);
5910 							c4 = AVERAGE(c4, c6);
5911 							c0 = AVERAGE(c0, c4);
5912 
5913 							*(unsigned short*)(source0 + 2 * x) = c0;
5914 						}
5915 
5916 						source0 += pitch;
5917 						source1 += pitch;
5918 						source2 += pitch;
5919 						source3 += pitch;
5920 						source4 += pitch;
5921 						source5 += pitch;
5922 						source6 += pitch;
5923 						source7 += pitch;
5924 					}
5925 				}
5926 				else if(internal.samples == 16)
5927 				{
5928 					for(int y = 0; y < height; y++)
5929 					{
5930 						for(int x = 0; x < width; x++)
5931 						{
5932 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
5933 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
5934 							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
5935 							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
5936 							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
5937 							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
5938 							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
5939 							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
5940 							unsigned short c8 = *(unsigned short*)(source8 + 2 * x);
5941 							unsigned short c9 = *(unsigned short*)(source9 + 2 * x);
5942 							unsigned short cA = *(unsigned short*)(sourceA + 2 * x);
5943 							unsigned short cB = *(unsigned short*)(sourceB + 2 * x);
5944 							unsigned short cC = *(unsigned short*)(sourceC + 2 * x);
5945 							unsigned short cD = *(unsigned short*)(sourceD + 2 * x);
5946 							unsigned short cE = *(unsigned short*)(sourceE + 2 * x);
5947 							unsigned short cF = *(unsigned short*)(sourceF + 2 * x);
5948 
5949 							c0 = AVERAGE(c0, c1);
5950 							c2 = AVERAGE(c2, c3);
5951 							c4 = AVERAGE(c4, c5);
5952 							c6 = AVERAGE(c6, c7);
5953 							c8 = AVERAGE(c8, c9);
5954 							cA = AVERAGE(cA, cB);
5955 							cC = AVERAGE(cC, cD);
5956 							cE = AVERAGE(cE, cF);
5957 							c0 = AVERAGE(c0, c2);
5958 							c4 = AVERAGE(c4, c6);
5959 							c8 = AVERAGE(c8, cA);
5960 							cC = AVERAGE(cC, cE);
5961 							c0 = AVERAGE(c0, c4);
5962 							c8 = AVERAGE(c8, cC);
5963 							c0 = AVERAGE(c0, c8);
5964 
5965 							*(unsigned short*)(source0 + 2 * x) = c0;
5966 						}
5967 
5968 						source0 += pitch;
5969 						source1 += pitch;
5970 						source2 += pitch;
5971 						source3 += pitch;
5972 						source4 += pitch;
5973 						source5 += pitch;
5974 						source6 += pitch;
5975 						source7 += pitch;
5976 						source8 += pitch;
5977 						source9 += pitch;
5978 						sourceA += pitch;
5979 						sourceB += pitch;
5980 						sourceC += pitch;
5981 						sourceD += pitch;
5982 						sourceE += pitch;
5983 						sourceF += pitch;
5984 					}
5985 				}
5986 				else ASSERT(false);
5987 
5988 				#undef AVERAGE
5989 			}
5990 		}
5991 		else
5992 		{
5993 		//	UNIMPLEMENTED();
5994 		}
5995 	}
5996 }
5997