// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef sw_ShaderCore_hpp
#define sw_ShaderCore_hpp

#include "Reactor/Print.hpp"
#include "Reactor/Reactor.hpp"
#include "System/Debug.hpp"

#include <array>
#include <atomic>   // std::memory_order
#include <utility>  // std::pair

namespace sw {

using namespace rr;

class Vector4s
{
public:
	Vector4s();
	Vector4s(unsigned short x, unsigned short y, unsigned short z, unsigned short w);
	Vector4s(const Vector4s &rhs);

	Short4 &operator[](int i);
	Vector4s &operator=(const Vector4s &rhs);

	Short4 x;
	Short4 y;
	Short4 z;
	Short4 w;
};

class Vector4f
{
public:
	Vector4f();
	Vector4f(float x, float y, float z, float w);
	Vector4f(const Vector4f &rhs);

	Float4 &operator[](int i);
	Vector4f &operator=(const Vector4f &rhs);

	Float4 x;
	Float4 y;
	Float4 z;
	Float4 w;
};

enum class OutOfBoundsBehavior
{
	Nullify,             // Loads become zero, stores are elided.
	RobustBufferAccess,  // As defined by the Vulkan spec (in short: access anywhere within bounds, or zeroing).
	UndefinedValue,      // Only for load operations. Not secure. No program termination.
	UndefinedBehavior,   // Program may terminate.
};

// SIMD contains types that represent multiple scalars packed into a single
// vector data type. Types in the SIMD namespace provide a semantic hint
// that the data should be treated as a per-execution-lane scalar instead of
// a typical euclidean-style vector type.
namespace SIMD {

// Width is the number of per-lane scalars packed into each SIMD vector.
static constexpr int Width = 4;

using Float = rr::Float4;
using Int = rr::Int4;
using UInt = rr::UInt4;
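
// For example, a SIMD::Float holds one scalar per lane, and arithmetic on it
// is applied independently per lane (an illustrative sketch, not part of the
// interface below):
//
//   SIMD::Float a = ...;    // { a0, a1, a2, a3 }, one value per lane
//   SIMD::Float b = ...;    // { b0, b1, b2, b3 }
//   SIMD::Float c = a + b;  // { a0+b0, a1+b1, a2+b2, a3+b3 }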

struct Pointer
{
	Pointer(rr::Pointer<Byte> base, rr::Int limit);
	Pointer(rr::Pointer<Byte> base, unsigned int limit);
	Pointer(rr::Pointer<Byte> base, rr::Int limit, SIMD::Int offset);
	Pointer(rr::Pointer<Byte> base, unsigned int limit, SIMD::Int offset);

	Pointer &operator+=(Int i);
	Pointer &operator*=(Int i);

	Pointer operator+(SIMD::Int i);
	Pointer operator*(SIMD::Int i);

	Pointer &operator+=(int i);
	Pointer &operator*=(int i);

	Pointer operator+(int i);
	Pointer operator*(int i);

	SIMD::Int offsets() const;

	SIMD::Int isInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const;

	bool isStaticallyInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const;

	Int limit() const;

	// Returns true if all offsets are sequential
	// (N+0*step, N+1*step, N+2*step, N+3*step)
	rr::Bool hasSequentialOffsets(unsigned int step) const;
	// Returns true if all offsets are compile-time static and
	// sequential (N+0*step, N+1*step, N+2*step, N+3*step)
	bool hasStaticSequentialOffsets(unsigned int step) const;

	// Returns true if all offsets are equal (N, N, N, N)
	rr::Bool hasEqualOffsets() const;

	// Returns true if all offsets are compile-time static and are equal
	// (N, N, N, N)
	bool hasStaticEqualOffsets() const;

	template<typename T>
	inline T Load(OutOfBoundsBehavior robustness, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed, int alignment = sizeof(float));

	template<typename T>
	inline void Store(T val, OutOfBoundsBehavior robustness, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed);

	template<typename T>
	inline void Store(RValue<T> val, OutOfBoundsBehavior robustness, Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed);

	// Base address for the pointer, common across all lanes.
	rr::Pointer<rr::Byte> base;

	// Upper (non-inclusive) limit for offsets from base.
	rr::Int dynamicLimit;  // If hasDynamicLimit is false, dynamicLimit is zero.
	unsigned int staticLimit;

	// Per lane offsets from base.
	SIMD::Int dynamicOffsets;  // If hasDynamicOffsets is false, all dynamicOffsets are zero.
	std::array<int32_t, SIMD::Width> staticOffsets;

	bool hasDynamicLimit;    // True if dynamicLimit is non-zero.
	bool hasDynamicOffsets;  // True if any dynamicOffsets are non-zero.
};
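
// Illustrative use of SIMD::Pointer (a sketch; buffer, size, and laneOffsets
// are hypothetical caller-provided values):
//
//   SIMD::Pointer ptr(buffer, size, laneOffsets);  // per-lane offsets from base
//   SIMD::Int mask = ...;                          // ~0 in active lanes, 0 in inactive ones
//   SIMD::Float v = ptr.Load<SIMD::Float>(OutOfBoundsBehavior::Nullify, mask);
//   ptr.Store(v, OutOfBoundsBehavior::Nullify, mask);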

template<typename T>
struct Element
{};
template<>
struct Element<Float>
{
	using type = rr::Float;
};
template<>
struct Element<Int>
{
	using type = rr::Int;
};
template<>
struct Element<UInt>
{
	using type = rr::UInt;
};
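
// Element<T> maps a SIMD vector type to its per-lane scalar type, e.g.:
//
//   using EL = typename Element<Float>::type;  // rr::Float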

}  // namespace SIMD

Float4 exponential2(RValue<Float4> x, bool pp = false);
Float4 logarithm2(RValue<Float4> x, bool pp = false);
Float4 exponential(RValue<Float4> x, bool pp = false);
Float4 logarithm(RValue<Float4> x, bool pp = false);
Float4 power(RValue<Float4> x, RValue<Float4> y, bool pp = false);
Float4 reciprocal(RValue<Float4> x, bool pp = false, bool finite = false, bool exactAtPow2 = false);
Float4 reciprocalSquareRoot(RValue<Float4> x, bool abs, bool pp = false);
Float4 modulo(RValue<Float4> x, RValue<Float4> y);
Float4 sine_pi(RValue<Float4> x, bool pp = false);    // limited to [-pi, pi] range
Float4 cosine_pi(RValue<Float4> x, bool pp = false);  // limited to [-pi, pi] range
Float4 sine(RValue<Float4> x, bool pp = false);
Float4 cosine(RValue<Float4> x, bool pp = false);
Float4 tangent(RValue<Float4> x, bool pp = false);
Float4 arccos(RValue<Float4> x, bool pp = false);
Float4 arcsin(RValue<Float4> x, bool pp = false);
Float4 arctan(RValue<Float4> x, bool pp = false);
Float4 arctan(RValue<Float4> y, RValue<Float4> x, bool pp = false);
Float4 sineh(RValue<Float4> x, bool pp = false);
Float4 cosineh(RValue<Float4> x, bool pp = false);
Float4 tangenth(RValue<Float4> x, bool pp = false);
Float4 arccosh(RValue<Float4> x, bool pp = false);  // Limited to x >= 1
Float4 arcsinh(RValue<Float4> x, bool pp = false);
Float4 arctanh(RValue<Float4> x, bool pp = false);  // Limited to ]-1, 1[ range

Float4 dot2(const Vector4f &v0, const Vector4f &v1);
Float4 dot3(const Vector4f &v0, const Vector4f &v1);
Float4 dot4(const Vector4f &v0, const Vector4f &v1);

void transpose4x4(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3);
void transpose4x3(Short4 &row0, Short4 &row1, Short4 &row2, Short4 &row3);
void transpose4x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
void transpose4x3(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
void transpose4x2(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
void transpose4x1(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
void transpose2x4(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3);
void transpose4xN(Float4 &row0, Float4 &row1, Float4 &row2, Float4 &row3, int N);

sw::SIMD::UInt halfToFloatBits(sw::SIMD::UInt halfBits);
sw::SIMD::UInt floatToHalfBits(sw::SIMD::UInt floatBits, bool storeInUpperBits);
Float4 r11g11b10Unpack(UInt r11g11b10bits);
UInt r11g11b10Pack(const Float4 &value);
Vector4s a2b10g10r10Unpack(const Int4 &value);
Vector4s a2r10g10b10Unpack(const Int4 &value);

rr::RValue<rr::Bool> AnyTrue(rr::RValue<sw::SIMD::Int> const &ints);

rr::RValue<rr::Bool> AnyFalse(rr::RValue<sw::SIMD::Int> const &ints);

template<typename T>
inline rr::RValue<T> AndAll(rr::RValue<T> const &mask);

template<typename T>
inline rr::RValue<T> OrAll(rr::RValue<T> const &mask);
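
// For example, these reductions can gate work on a per-lane comparison mask
// (a sketch; x and y stand for pre-existing SIMD::Float values):
//
//   SIMD::Int mask = CmpLT(x, y);  // ~0 in lanes where x < y, 0 elsewhere
//   If(AnyTrue(mask))
//   {
//       // At least one lane passed the comparison.
//   }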

rr::RValue<sw::SIMD::Float> Sign(rr::RValue<sw::SIMD::Float> const &val);

// Returns the <whole, frac> of val.
// Both whole and frac will have the same sign as val.
std::pair<rr::RValue<sw::SIMD::Float>, rr::RValue<sw::SIMD::Float>>
Modf(rr::RValue<sw::SIMD::Float> const &val);
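
// For example, applied to -1.7 this yields <whole, frac> = <-1.0, -0.7>:
// the whole part is truncated toward zero, so both parts keep val's sign.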

// Returns the number of 1s in bits, per lane.
sw::SIMD::UInt CountBits(rr::RValue<sw::SIMD::UInt> const &bits);

// Returns 1 << bits.
// If the resulting bit overflows a 32 bit integer, 0 is returned.
rr::RValue<sw::SIMD::UInt> NthBit32(rr::RValue<sw::SIMD::UInt> const &bits);
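
// For example, NthBit32(5) yields 1 << 5 = 0x20 in each lane, while
// NthBit32(32) overflows 32 bits and yields 0.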

// Returns bitCount number of 1's starting from the LSB.
rr::RValue<sw::SIMD::UInt> Bitmask32(rr::RValue<sw::SIMD::UInt> const &bitCount);
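
// For example, Bitmask32(3) yields 0b111 = 7 in each lane.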

// Performs a fused multiply-add, returning a * b + c.
rr::RValue<sw::SIMD::Float> FMA(
    rr::RValue<sw::SIMD::Float> const &a,
    rr::RValue<sw::SIMD::Float> const &b,
    rr::RValue<sw::SIMD::Float> const &c);

// Returns the exponent of the floating point number f.
// Assumes IEEE 754.
rr::RValue<sw::SIMD::Int> Exponent(rr::RValue<sw::SIMD::Float> f);

// Returns y if y < x; otherwise result is x.
// If one operand is a NaN, the other operand is the result.
// If both operands are NaN, the result is a NaN.
rr::RValue<sw::SIMD::Float> NMin(rr::RValue<sw::SIMD::Float> const &x, rr::RValue<sw::SIMD::Float> const &y);

// Returns y if y > x; otherwise result is x.
// If one operand is a NaN, the other operand is the result.
// If both operands are NaN, the result is a NaN.
rr::RValue<sw::SIMD::Float> NMax(rr::RValue<sw::SIMD::Float> const &x, rr::RValue<sw::SIMD::Float> const &y);

// Returns the determinant of the 2x2 matrix
//   | a b |
//   | c d |
// i.e. a*d - b*c.
rr::RValue<sw::SIMD::Float> Determinant(
    rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b,
    rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d);

// Returns the determinant of a 3x3 matrix.
rr::RValue<sw::SIMD::Float> Determinant(
    rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c,
    rr::RValue<sw::SIMD::Float> const &d, rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f,
    rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h, rr::RValue<sw::SIMD::Float> const &i);

// Returns the determinant of a 4x4 matrix.
rr::RValue<sw::SIMD::Float> Determinant(
    rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d,
    rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f, rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h,
    rr::RValue<sw::SIMD::Float> const &i, rr::RValue<sw::SIMD::Float> const &j, rr::RValue<sw::SIMD::Float> const &k, rr::RValue<sw::SIMD::Float> const &l,
    rr::RValue<sw::SIMD::Float> const &m, rr::RValue<sw::SIMD::Float> const &n, rr::RValue<sw::SIMD::Float> const &o, rr::RValue<sw::SIMD::Float> const &p);

// Returns the inverse of a 2x2 matrix.
std::array<rr::RValue<sw::SIMD::Float>, 4> MatrixInverse(
    rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b,
    rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d);

// Returns the inverse of a 3x3 matrix.
std::array<rr::RValue<sw::SIMD::Float>, 9> MatrixInverse(
    rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c,
    rr::RValue<sw::SIMD::Float> const &d, rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f,
    rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h, rr::RValue<sw::SIMD::Float> const &i);

// Returns the inverse of a 4x4 matrix.
std::array<rr::RValue<sw::SIMD::Float>, 16> MatrixInverse(
    rr::RValue<sw::SIMD::Float> const &a, rr::RValue<sw::SIMD::Float> const &b, rr::RValue<sw::SIMD::Float> const &c, rr::RValue<sw::SIMD::Float> const &d,
    rr::RValue<sw::SIMD::Float> const &e, rr::RValue<sw::SIMD::Float> const &f, rr::RValue<sw::SIMD::Float> const &g, rr::RValue<sw::SIMD::Float> const &h,
    rr::RValue<sw::SIMD::Float> const &i, rr::RValue<sw::SIMD::Float> const &j, rr::RValue<sw::SIMD::Float> const &k, rr::RValue<sw::SIMD::Float> const &l,
    rr::RValue<sw::SIMD::Float> const &m, rr::RValue<sw::SIMD::Float> const &n, rr::RValue<sw::SIMD::Float> const &o, rr::RValue<sw::SIMD::Float> const &p);

////////////////////////////////////////////////////////////////////////////
// Inline functions
////////////////////////////////////////////////////////////////////////////

template<typename T>
inline T SIMD::Pointer::Load(OutOfBoundsBehavior robustness, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */, int alignment /* = sizeof(float) */)
{
	using EL = typename Element<T>::type;

	if(isStaticallyInBounds(sizeof(float), robustness))
	{
		// All elements are statically known to be in-bounds.
		// We can avoid the costly conditional checks on masks.

		if(hasStaticSequentialOffsets(sizeof(float)))
		{
			// Offsets are sequential. Perform regular load.
			return rr::Load(rr::Pointer<T>(base + staticOffsets[0]), alignment, atomic, order);
		}
		if(hasStaticEqualOffsets())
		{
			// Load one, replicate.
			return T(*rr::Pointer<EL>(base + staticOffsets[0], alignment));
		}
	}
	else
	{
		switch(robustness)
		{
			case OutOfBoundsBehavior::Nullify:
			case OutOfBoundsBehavior::RobustBufferAccess:
			case OutOfBoundsBehavior::UndefinedValue:
				mask &= isInBounds(sizeof(float), robustness);  // Disable out-of-bounds reads.
				break;
			case OutOfBoundsBehavior::UndefinedBehavior:
				// Nothing to do. Application/compiler must guarantee no out-of-bounds accesses.
				break;
		}
	}

	auto offs = offsets();

	if(!atomic && order == std::memory_order_relaxed)
	{
		if(hasStaticEqualOffsets())
		{
			// Load one, replicate.
			// Be careful of the case where the post-bounds-check mask
			// is 0, in which case we must not load.
			T out = T(0);
			If(AnyTrue(mask))
			{
				EL el = *rr::Pointer<EL>(base + staticOffsets[0], alignment);
				out = T(el);
			}
			return out;
		}

		bool zeroMaskedLanes = true;
		switch(robustness)
		{
			case OutOfBoundsBehavior::Nullify:
			case OutOfBoundsBehavior::RobustBufferAccess:  // Must either return an in-bounds value, or zero.
				zeroMaskedLanes = true;
				break;
			case OutOfBoundsBehavior::UndefinedValue:
			case OutOfBoundsBehavior::UndefinedBehavior:
				zeroMaskedLanes = false;
				break;
		}

		if(hasStaticSequentialOffsets(sizeof(float)))
		{
			return rr::MaskedLoad(rr::Pointer<T>(base + staticOffsets[0]), mask, alignment, zeroMaskedLanes);
		}

		return rr::Gather(rr::Pointer<EL>(base), offs, mask, alignment, zeroMaskedLanes);
	}
	else
	{
		T out;
		auto anyLanesDisabled = AnyFalse(mask);
		If(hasEqualOffsets() && !anyLanesDisabled)
		{
			// Load one, replicate.
			auto offset = Extract(offs, 0);
			out = T(rr::Load(rr::Pointer<EL>(&base[offset]), alignment, atomic, order));
		}
		Else If(hasSequentialOffsets(sizeof(float)) && !anyLanesDisabled)
		{
			// Load all elements in a single SIMD instruction.
			auto offset = Extract(offs, 0);
			out = rr::Load(rr::Pointer<T>(&base[offset]), alignment, atomic, order);
		}
		Else
		{
			// Divergent offsets or masked lanes.
			out = T(0);
			for(int i = 0; i < SIMD::Width; i++)
			{
				If(Extract(mask, i) != 0)
				{
					auto offset = Extract(offs, i);
					auto el = rr::Load(rr::Pointer<EL>(&base[offset]), alignment, atomic, order);
					out = Insert(out, el, i);
				}
			}
		}
		return out;
	}
}

template<typename T>
inline void SIMD::Pointer::Store(T val, OutOfBoundsBehavior robustness, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
{
	using EL = typename Element<T>::type;
	constexpr size_t alignment = sizeof(float);
	auto offs = offsets();

	switch(robustness)
	{
		case OutOfBoundsBehavior::Nullify:
		case OutOfBoundsBehavior::RobustBufferAccess:       // TODO: Allows writing anywhere within bounds. Could be faster than masking.
		case OutOfBoundsBehavior::UndefinedValue:           // Should not be used for store operations. Treat as robust buffer access.
			mask &= isInBounds(sizeof(float), robustness);  // Disable out-of-bounds writes.
			break;
		case OutOfBoundsBehavior::UndefinedBehavior:
			// Nothing to do. Application/compiler must guarantee no out-of-bounds accesses.
			break;
	}

	if(!atomic && order == std::memory_order_relaxed)
	{
		if(hasStaticEqualOffsets())
		{
			If(AnyTrue(mask))
			{
				// All equal. One of these writes will win -- elect the winning lane.
				auto v0111 = SIMD::Int(0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
				auto elect = mask & ~(v0111 & (mask.xxyz | mask.xxxy | mask.xxxx));
				auto maskedVal = As<SIMD::Int>(val) & elect;
				auto scalarVal = Extract(maskedVal, 0) |
				                 Extract(maskedVal, 1) |
				                 Extract(maskedVal, 2) |
				                 Extract(maskedVal, 3);
				*rr::Pointer<EL>(base + staticOffsets[0], alignment) = As<EL>(scalarVal);
			}
		}
		else if(hasStaticSequentialOffsets(sizeof(float)))
		{
			if(isStaticallyInBounds(sizeof(float), robustness))
			{
				// Pointer has no elements OOB, and the store is not atomic.
				// Perform an RMW.
				auto p = rr::Pointer<SIMD::Int>(base + staticOffsets[0], alignment);
				auto prev = *p;
				*p = (prev & ~mask) | (As<SIMD::Int>(val) & mask);
			}
			else
			{
				rr::MaskedStore(rr::Pointer<T>(base + staticOffsets[0]), val, mask, alignment);
			}
		}
		else
		{
			rr::Scatter(rr::Pointer<EL>(base), val, offs, mask, alignment);
		}
	}
	else
	{
		auto anyLanesDisabled = AnyFalse(mask);
		If(hasSequentialOffsets(sizeof(float)) && !anyLanesDisabled)
		{
			// Store all elements in a single SIMD instruction.
			auto offset = Extract(offs, 0);
			rr::Store(val, rr::Pointer<T>(&base[offset]), alignment, atomic, order);
		}
		Else
		{
			// Divergent offsets or masked lanes.
			for(int i = 0; i < SIMD::Width; i++)
			{
				If(Extract(mask, i) != 0)
				{
					auto offset = Extract(offs, i);
					rr::Store(Extract(val, i), rr::Pointer<EL>(&base[offset]), alignment, atomic, order);
				}
			}
		}
	}
}

template<typename T>
inline void SIMD::Pointer::Store(RValue<T> val, OutOfBoundsBehavior robustness, Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
{
	Store(T(val), robustness, mask, atomic, order);
}

template<typename T>
inline rr::RValue<T> AndAll(rr::RValue<T> const &mask)
{
	T v1 = mask;               // [x]    [y]    [z]    [w]
	T v2 = v1.xzxz & v1.ywyw;  // [xy]   [zw]   [xy]   [zw]
	return v2.xxxx & v2.yyyy;  // [xyzw] [xyzw] [xyzw] [xyzw]
}

template<typename T>
inline rr::RValue<T> OrAll(rr::RValue<T> const &mask)
{
	T v1 = mask;               // [x]    [y]    [z]    [w]
	T v2 = v1.xzxz | v1.ywyw;  // [xy]   [zw]   [xy]   [zw]
	return v2.xxxx | v2.yyyy;  // [xyzw] [xyzw] [xyzw] [xyzw]
}

}  // namespace sw

#ifdef ENABLE_RR_PRINT
namespace rr {
template<>
struct PrintValue::Ty<sw::Vector4f>
{
	static std::string fmt(const sw::Vector4f &v)
	{
		return "[x: " + PrintValue::fmt(v.x) +
		       ", y: " + PrintValue::fmt(v.y) +
		       ", z: " + PrintValue::fmt(v.z) +
		       ", w: " + PrintValue::fmt(v.w) + "]";
	}

	static std::vector<rr::Value *> val(const sw::Vector4f &v)
	{
		return PrintValue::vals(v.x, v.y, v.z, v.w);
	}
};
template<>
struct PrintValue::Ty<sw::Vector4s>
{
	static std::string fmt(const sw::Vector4s &v)
	{
		return "[x: " + PrintValue::fmt(v.x) +
		       ", y: " + PrintValue::fmt(v.y) +
		       ", z: " + PrintValue::fmt(v.z) +
		       ", w: " + PrintValue::fmt(v.w) + "]";
	}

	static std::vector<rr::Value *> val(const sw::Vector4s &v)
	{
		return PrintValue::vals(v.x, v.y, v.z, v.w);
	}
};

}  // namespace rr
#endif  // ENABLE_RR_PRINT

#endif  // sw_ShaderCore_hpp