/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

static ALWAYS_INLINE HalfRGBA8 packRGBA8(I32 a, I32 b) {
#if USE_SSE2
  return _mm_packs_epi32(a, b);
#elif USE_NEON
  return vcombine_u16(vqmovun_s32(a), vqmovun_s32(b));
#else
  return CONVERT(combine(a, b), HalfRGBA8);
#endif
}

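// Note: added description. Pack an SoA vec4 of four pixels into AoS WideRGBA8
// form; the interleaving below leaves each pixel's channels in z, y, x, w
// (BGRA) order, matching the BGRA layout noted for packed data further below.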
static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(const vec4& v,
                                                 float scale = 255.0f) {
  ivec4 i = round_pixel(v, scale);
  HalfRGBA8 xz = packRGBA8(i.z, i.x);
  HalfRGBA8 yw = packRGBA8(i.y, i.w);
  HalfRGBA8 xyzwl = zipLow(xz, yw);
  HalfRGBA8 xyzwh = zipHigh(xz, yw);
  HalfRGBA8 lo = zip2Low(xyzwl, xyzwh);
  HalfRGBA8 hi = zip2High(xyzwl, xyzwh);
  return combine(lo, hi);
}

static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(Float alpha,
                                                 float scale = 255.0f) {
  I32 i = round_pixel(alpha, scale);
  HalfRGBA8 c = packRGBA8(i, i);
  c = zipLow(c, c);
  return zip(c, c);
}

static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(float alpha,
                                                 float scale = 255.0f) {
  I32 i = round_pixel(alpha, scale);
  return repeat2(packRGBA8(i, i));
}

UNUSED static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(const vec4_scalar& v,
                                                        float scale = 255.0f) {
  I32 i = round_pixel((Float){v.z, v.y, v.x, v.w}, scale);
  return repeat2(packRGBA8(i, i));
}

static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8() {
  return pack_pixels_RGBA8(fragment_shader->gl_FragColor);
}

static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(WideRGBA32F v,
                                                 float scale = 255.0f) {
  ivec4 i = round_pixel(bit_cast<vec4>(v), scale);
  return combine(packRGBA8(i.x, i.y), packRGBA8(i.z, i.w));
}

static ALWAYS_INLINE WideR8 packR8(I32 a) {
#if USE_SSE2
  return lowHalf(bit_cast<V8<uint16_t>>(_mm_packs_epi32(a, a)));
#elif USE_NEON
  return vqmovun_s32(a);
#else
  return CONVERT(a, WideR8);
#endif
}

static ALWAYS_INLINE WideR8 pack_pixels_R8(Float c, float scale = 255.0f) {
  return packR8(round_pixel(c, scale));
}

static ALWAYS_INLINE WideR8 pack_pixels_R8() {
  return pack_pixels_R8(fragment_shader->gl_FragColor.x);
}

// Load a partial span > 0 and < 4 pixels.
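// For example, a span of 3 loads the first two pixels as a pair and the third
// pixel individually, leaving the unused fourth lane zeroed; a span of 1 loads
// only the first pixel.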
template <typename V, typename P>
static ALWAYS_INLINE V partial_load_span(const P* src, int span) {
  return bit_cast<V>(
      (span >= 2
           ? combine(unaligned_load<V2<P>>(src),
                     V2<P>{span > 2 ? unaligned_load<P>(src + 2) : P(0), 0})
           : V4<P>{unaligned_load<P>(src), 0, 0, 0}));
}

// Store a partial span > 0 and < 4 pixels.
template <typename V, typename P>
static ALWAYS_INLINE void partial_store_span(P* dst, V src, int span) {
  auto pixels = bit_cast<V4<P>>(src);
  if (span >= 2) {
    unaligned_store(dst, lowHalf(pixels));
    if (span > 2) {
      unaligned_store(dst + 2, pixels.z);
    }
  } else {
    unaligned_store(dst, pixels.x);
  }
}

// Dispatcher that chooses whether to load a full or partial span.
template <typename V, typename P>
static ALWAYS_INLINE V load_span(const P* src, int span) {
  if (span >= 4) {
    return unaligned_load<V, P>(src);
  } else {
    return partial_load_span<V, P>(src, span);
  }
}

// Dispatcher that chooses whether to store a full or partial span.
template <typename V, typename P>
static ALWAYS_INLINE void store_span(P* dst, V src, int span) {
  if (span >= 4) {
    unaligned_store<V, P>(dst, src);
  } else {
    partial_store_span<V, P>(dst, src, span);
  }
}

template <typename T>
static ALWAYS_INLINE T muldiv256(T x, T y) {
  return (x * y) >> 8;
}

// (x*y + x) >> 8, cheap approximation of (x*y) / 255
template <typename T>
static ALWAYS_INLINE T muldiv255(T x, T y) {
  return (x * y + x) >> 8;
}
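// For example, muldiv255(255, 255) = (255*255 + 255) >> 8 = 255 and
// muldiv255(255, 1) = (255 + 255) >> 8 = 1, so it behaves like an 8-bit
// multiply normalized by 255. muldiv256 instead divides exactly by 256 and is
// used with 0..256 weights (such as the AA coverage computed in DO_AA below),
// where a weight of 256 leaves the value unchanged.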

template <typename V>
static ALWAYS_INLINE WideRGBA8 pack_span(uint32_t*, const V& v,
                                         float scale = 255.0f) {
  return pack_pixels_RGBA8(v, scale);
}

template <typename C>
static ALWAYS_INLINE WideR8 pack_span(uint8_t*, C c, float scale = 255.0f) {
  return pack_pixels_R8(c, scale);
}

// Helper functions to apply a color modulus when available.
struct NoColor {};

template <typename P>
static ALWAYS_INLINE P applyColor(P src, NoColor) {
  return src;
}

struct InvertColor {};

template <typename P>
static ALWAYS_INLINE P applyColor(P src, InvertColor) {
  return 255 - src;
}

template <typename P>
static ALWAYS_INLINE P applyColor(P src, P color) {
  return muldiv255(color, src);
}

static ALWAYS_INLINE WideRGBA8 applyColor(PackedRGBA8 src, WideRGBA8 color) {
  return applyColor(unpack(src), color);
}

template <typename P, typename C>
static ALWAYS_INLINE auto packColor(P* buf, C color) {
  return pack_span(buf, color, 255.0f);
}

template <typename P>
static ALWAYS_INLINE NoColor packColor(UNUSED P* buf, NoColor noColor) {
  return noColor;
}

template <typename P>
static ALWAYS_INLINE InvertColor packColor(UNUSED P* buf,
                                           InvertColor invertColor) {
  return invertColor;
}

// Single argument variation that takes an explicit destination buffer type.
template <typename P, typename C>
static ALWAYS_INLINE auto packColor(C color) {
  // Just pass in a typed null pointer, as the pack routines never use the
  // pointer's value, just its type.
  return packColor((P*)0, color);
}

// Byte-wise addition for when x or y is a signed 8-bit value stored in the
// low byte of a larger type T only with zeroed-out high bits, where T is
// greater than 8 bits, i.e. uint16_t. This can result when muldiv255 is used
// upon signed operands, using up all the precision in a 16 bit integer, and
// potentially losing the sign bit in the last >> 8 shift. Due to the
// properties of two's complement arithmetic, even though we've discarded the
// sign bit, we can still represent a negative number under addition (without
// requiring any extra sign bits), just that any negative number will behave
// like a large unsigned number under addition, generating a single carry bit
// on overflow that we need to discard. Thus, just doing a byte-wise add will
// overflow without the troublesome carry, giving us only the remaining 8 low
// bits we actually need while keeping the high bits at zero.
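// For example, if muldiv255 on signed operands produced a lane holding 0x00F8
// (the truncated encoding of -8), then addlow with a lane holding 0x0010 (16)
// adds the bytes to give 0x0008, discarding the carry out of the low byte and
// keeping the high byte zero, which is the expected result of -8 + 16.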
template <typename T>
static ALWAYS_INLINE T addlow(T x, T y) {
  typedef VectorType<uint8_t, sizeof(T)> bytes;
  return bit_cast<T>(bit_cast<bytes>(x) + bit_cast<bytes>(y));
}

// Replace color components of each pixel with the pixel's alpha values.
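// For example, lanes laid out as [c0 c1 c2 a0 | c3 c4 c5 a1 | ...] become
// [a0 a0 a0 a0 | a1 a1 a1 a1 | ...].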
template <typename T>
static ALWAYS_INLINE T alphas(T c) {
  return SHUFFLE(c, c, 3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15);
}

// Replace the alpha values of the first vector with alpha values from the
// second, while leaving the color components unmodified.
template <typename T>
static ALWAYS_INLINE T set_alphas(T c, T a) {
  return SHUFFLE(c, a, 0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31);
}

// Miscellaneous helper functions for working with packed RGBA8 data.
static ALWAYS_INLINE HalfRGBA8 if_then_else(V8<int16_t> c, HalfRGBA8 t,
                                            HalfRGBA8 e) {
  return bit_cast<HalfRGBA8>((c & t) | (~c & e));
}

template <typename T, typename C, int N>
static ALWAYS_INLINE VectorType<T, N> if_then_else(VectorType<C, N> c,
                                                   VectorType<T, N> t,
                                                   VectorType<T, N> e) {
  return combine(if_then_else(lowHalf(c), lowHalf(t), lowHalf(e)),
                 if_then_else(highHalf(c), highHalf(t), highHalf(e)));
}

static ALWAYS_INLINE HalfRGBA8 min(HalfRGBA8 x, HalfRGBA8 y) {
#if USE_SSE2
  return bit_cast<HalfRGBA8>(
      _mm_min_epi16(bit_cast<V8<int16_t>>(x), bit_cast<V8<int16_t>>(y)));
#elif USE_NEON
  return vminq_u16(x, y);
#else
  return if_then_else(x < y, x, y);
#endif
}

template <typename T, int N>
static ALWAYS_INLINE VectorType<T, N> min(VectorType<T, N> x,
                                          VectorType<T, N> y) {
  return combine(min(lowHalf(x), lowHalf(y)), min(highHalf(x), highHalf(y)));
}

static ALWAYS_INLINE HalfRGBA8 max(HalfRGBA8 x, HalfRGBA8 y) {
#if USE_SSE2
  return bit_cast<HalfRGBA8>(
      _mm_max_epi16(bit_cast<V8<int16_t>>(x), bit_cast<V8<int16_t>>(y)));
#elif USE_NEON
  return vmaxq_u16(x, y);
#else
  return if_then_else(x > y, x, y);
#endif
}

template <typename T, int N>
static ALWAYS_INLINE VectorType<T, N> max(VectorType<T, N> x,
                                          VectorType<T, N> y) {
  return combine(max(lowHalf(x), lowHalf(y)), max(highHalf(x), highHalf(y)));
}

template <typename T, int N>
static ALWAYS_INLINE VectorType<T, N> recip(VectorType<T, N> v) {
  return combine(recip(lowHalf(v)), recip(highHalf(v)));
}

// Helper to get the reciprocal if the value is non-zero, or otherwise default
// to the supplied fallback value.
template <typename V>
static ALWAYS_INLINE V recip_or(V v, float f) {
  return if_then_else(v != V(0.0f), recip(v), V(f));
}

template <typename T, int N>
static ALWAYS_INLINE VectorType<T, N> inversesqrt(VectorType<T, N> v) {
  return combine(inversesqrt(lowHalf(v)), inversesqrt(highHalf(v)));
}

// Extract the alpha components so that we can cheaply calculate the reciprocal
// on a single SIMD register. Then multiply the duplicated alpha reciprocal with
// the pixel data. 0 alpha is treated as transparent black.
static ALWAYS_INLINE WideRGBA32F unpremultiply(WideRGBA32F v) {
  Float a = recip_or((Float){v[3], v[7], v[11], v[15]}, 0.0f);
  return v * a.xxxxyyyyzzzzwwww;
}

// Packed RGBA32F data is AoS in BGRA order. Transpose it to SoA and swizzle to
// RGBA to unpack.
static ALWAYS_INLINE vec4 unpack(PackedRGBA32F c) {
  return bit_cast<vec4>(
      SHUFFLE(c, c, 2, 6, 10, 14, 1, 5, 9, 13, 0, 4, 8, 12, 3, 7, 11, 15));
}

// The following lum/sat functions mostly follow the KHR_blend_equation_advanced
// specification but are rearranged to work on premultiplied data.
static ALWAYS_INLINE Float lumv3(vec3 v) {
  return v.x * 0.30f + v.y * 0.59f + v.z * 0.11f;
}

static ALWAYS_INLINE Float minv3(vec3 v) { return min(min(v.x, v.y), v.z); }

static ALWAYS_INLINE Float maxv3(vec3 v) { return max(max(v.x, v.y), v.z); }

static inline vec3 clip_color(vec3 v, Float lum, Float alpha) {
  Float mincol = max(-minv3(v), lum);
  Float maxcol = max(maxv3(v), alpha - lum);
  return lum + v * (lum * (alpha - lum) * recip_or(mincol * maxcol, 0.0f));
}

static inline vec3 set_lum(vec3 base, vec3 ref, Float alpha) {
  return clip_color(base - lumv3(base), lumv3(ref), alpha);
}

static inline vec3 set_lum_sat(vec3 base, vec3 sref, vec3 lref, Float alpha) {
  vec3 diff = base - minv3(base);
  Float sbase = maxv3(diff);
  Float ssat = maxv3(sref) - minv3(sref);
  // The sbase range is rescaled to ssat. If sbase has 0 extent, then rescale
  // to black, as per specification.
  return set_lum(diff * ssat * recip_or(sbase, 0.0f), lref, alpha);
}

// Flags that reflect the current blend-stage clipping to be applied.
enum SWGLClipFlag {
  SWGL_CLIP_FLAG_MASK = 1 << 0,
  SWGL_CLIP_FLAG_AA = 1 << 1,
  SWGL_CLIP_FLAG_BLEND_OVERRIDE = 1 << 2,
};
static int swgl_ClipFlags = 0;
static BlendKey swgl_BlendOverride = BLEND_KEY_NONE;
static WideRGBA8 swgl_BlendColorRGBA8 = {0};
static WideRGBA8 swgl_BlendAlphaRGBA8 = {0};

// A pointer into the color buffer for the start of the span.
static void* swgl_SpanBuf = nullptr;
// A pointer into the clip mask for the start of the span.
static uint8_t* swgl_ClipMaskBuf = nullptr;

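// Expand an 8-bit clip mask value for each pixel so that it covers every
// channel lane of the given destination pixel format.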
static ALWAYS_INLINE WideR8 expand_mask(UNUSED uint8_t* buf, WideR8 mask) {
  return mask;
}
static ALWAYS_INLINE WideRGBA8 expand_mask(UNUSED uint32_t* buf, WideR8 mask) {
  WideRG8 maskRG = zip(mask, mask);
  return zip(maskRG, maskRG);
}

// Loads a chunk of clip masks. The current pointer into the color buffer is
// used to reconstruct the relative position within the span. From there, the
// pointer into the clip mask can be generated from the start of the clip mask
// span.
template <typename P>
static ALWAYS_INLINE uint8_t* get_clip_mask(P* buf) {
  return &swgl_ClipMaskBuf[buf - (P*)swgl_SpanBuf];
}

template <typename P>
static ALWAYS_INLINE auto load_clip_mask(P* buf, int span)
    -> decltype(expand_mask(buf, 0)) {
  return expand_mask(buf,
                     unpack(load_span<PackedR8>(get_clip_mask(buf), span)));
}

// Temporarily removes masking from the blend stage, assuming the caller will
// handle it.
static ALWAYS_INLINE void override_clip_mask() {
  blend_key = BlendKey(blend_key - MASK_BLEND_KEY_NONE);
}

// Restores masking to the blend stage, assuming it was previously overridden.
static ALWAYS_INLINE void restore_clip_mask() {
  blend_key = BlendKey(MASK_BLEND_KEY_NONE + blend_key);
}

// A pointer to the start of the opaque destination region of the span for AA.
static const uint8_t* swgl_OpaqueStart = nullptr;
// The size, in bytes, of the opaque region.
static uint32_t swgl_OpaqueSize = 0;
// AA coverage distance offsets for the left and right edges.
static Float swgl_LeftAADist = 0.0f;
static Float swgl_RightAADist = 0.0f;
// AA coverage slope values used for accumulating coverage for each step.
static Float swgl_AASlope = 0.0f;

// Get the number of pixels we need to process before the start of the opaque
// region.
template <typename P>
static ALWAYS_INLINE int get_aa_opaque_start(P* buf) {
  return max(int((P*)swgl_OpaqueStart - buf), 0);
}

// Assuming we are already in the opaque part of the span, return the remaining
// size of the opaque part.
template <typename P>
static ALWAYS_INLINE int get_aa_opaque_size(P* buf) {
  return max(int((P*)&swgl_OpaqueStart[swgl_OpaqueSize] - buf), 0);
}

// Temporarily removes anti-aliasing from the blend stage, assuming the caller
// will handle it.
static ALWAYS_INLINE void override_aa() {
  blend_key = BlendKey(blend_key - AA_BLEND_KEY_NONE);
}

// Restores anti-aliasing to the blend stage, assuming it was previously
// overridden.
static ALWAYS_INLINE void restore_aa() {
  blend_key = BlendKey(AA_BLEND_KEY_NONE + blend_key);
}

static PREFER_INLINE WideRGBA8 blend_pixels(uint32_t* buf, PackedRGBA8 pdst,
                                            WideRGBA8 src, int span = 4) {
  WideRGBA8 dst = unpack(pdst);
  const WideRGBA8 RGB_MASK = {0xFFFF, 0xFFFF, 0xFFFF, 0,      0xFFFF, 0xFFFF,
                              0xFFFF, 0,      0xFFFF, 0xFFFF, 0xFFFF, 0,
                              0xFFFF, 0xFFFF, 0xFFFF, 0};
  const WideRGBA8 ALPHA_MASK = {0, 0, 0, 0xFFFF, 0, 0, 0, 0xFFFF,
                                0, 0, 0, 0xFFFF, 0, 0, 0, 0xFFFF};
  const WideRGBA8 ALPHA_OPAQUE = {0, 0, 0, 255, 0, 0, 0, 255,
                                  0, 0, 0, 255, 0, 0, 0, 255};

// clang-format off
  // Computes AA for the given pixel based on the offset of the pixel within
  // the destination row. Given the initial coverage offsets for the left and
  // right edges, the offset is scaled by the slope and accumulated to find the
  // minimum coverage value for the pixel. A final weight is generated that
  // can be used to scale the source pixel.
#define DO_AA(format, body)                                   \
  do {                                                        \
    int offset = int((const uint8_t*)buf - swgl_OpaqueStart); \
    if (uint32_t(offset) >= swgl_OpaqueSize) {                \
      Float delta = swgl_AASlope * float(offset);             \
      Float dist = clamp(min(swgl_LeftAADist + delta.x,       \
                             swgl_RightAADist + delta.y),     \
                         0.0f, 256.0f);                       \
      auto aa = pack_pixels_##format(dist, 1.0f);             \
      body;                                                   \
    }                                                         \
  } while (0)
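  // Note that dist is clamped to 256 rather than 255 and aa is packed with a
  // scale of 1, so at full coverage muldiv256(src, aa) == (src * 256) >> 8
  // leaves the source value unchanged.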

  // Each blend case is preceded by the MASK_ variant. The MASK_ case first
  // loads the mask values and multiplies the source value by them. After, it
  // falls through to the normal blending case using the masked source. The
  // AA_ variations may further precede the blend cases, in which case the
  // source value is further modified before use.
#define BLEND_CASE_KEY(key)                          \
  case AA_##key:                                     \
    DO_AA(RGBA8, src = muldiv256(src, aa));          \
    goto key;                                        \
  case AA_MASK_##key:                                \
    DO_AA(RGBA8, src = muldiv256(src, aa));          \
    FALLTHROUGH;                                     \
  case MASK_##key:                                   \
    src = muldiv255(src, load_clip_mask(buf, span)); \
    FALLTHROUGH;                                     \
  case key: key

#define BLEND_CASE(...) BLEND_CASE_KEY(BLEND_KEY(__VA_ARGS__))

  switch (blend_key) {
  BLEND_CASE(GL_ONE, GL_ZERO):
    return src;
  BLEND_CASE(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE,
                  GL_ONE_MINUS_SRC_ALPHA):
    // dst + src.a*(src.rgb1 - dst)
    // use addlow for signed overflow
    return addlow(dst, muldiv255(alphas(src), (src | ALPHA_OPAQUE) - dst));
  BLEND_CASE(GL_ONE, GL_ONE_MINUS_SRC_ALPHA):
    return src + dst - muldiv255(dst, alphas(src));
  BLEND_CASE(GL_ZERO, GL_ONE_MINUS_SRC_COLOR):
    return dst - muldiv255(dst, src);
  BLEND_CASE(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, GL_ZERO, GL_ONE):
    return dst - (muldiv255(dst, src) & RGB_MASK);
  BLEND_CASE(GL_ZERO, GL_ONE_MINUS_SRC_ALPHA):
    return dst - muldiv255(dst, alphas(src));
  BLEND_CASE(GL_ZERO, GL_SRC_COLOR):
    return muldiv255(src, dst);
  BLEND_CASE(GL_ONE, GL_ONE):
    return src + dst;
  BLEND_CASE(GL_ONE, GL_ONE, GL_ONE, GL_ONE_MINUS_SRC_ALPHA):
    return src + dst - (muldiv255(dst, src) & ALPHA_MASK);
  BLEND_CASE(GL_ONE_MINUS_DST_ALPHA, GL_ONE, GL_ZERO, GL_ONE):
    // src*(1-dst.a) + dst*1 = src - src*dst.a + dst
    return dst + ((src - muldiv255(src, alphas(dst))) & RGB_MASK);
  BLEND_CASE(GL_CONSTANT_COLOR, GL_ONE_MINUS_SRC_COLOR):
    // src*k + (1-src)*dst = src*k + dst - src*dst = dst + src*(k - dst)
    // use addlow for signed overflow
    return addlow(
        dst, muldiv255(src, repeat2(ctx->blendcolor) - dst));

  // We must explicitly handle the masked/anti-aliased secondary blend case.
  // The secondary color as well as the source must be multiplied by the
  // weights.
  case BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): {
    WideRGBA8 secondary =
        applyColor(dst,
            packColor<uint32_t>(fragment_shader->gl_SecondaryFragColor));
    return src + dst - secondary;
  }
  case MASK_BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): {
    WideRGBA8 secondary =
        applyColor(dst,
            packColor<uint32_t>(fragment_shader->gl_SecondaryFragColor));
    WideRGBA8 mask = load_clip_mask(buf, span);
    return muldiv255(src, mask) + dst - muldiv255(secondary, mask);
  }
  case AA_BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): {
    WideRGBA8 secondary =
        applyColor(dst,
            packColor<uint32_t>(fragment_shader->gl_SecondaryFragColor));
    DO_AA(RGBA8, {
      src = muldiv256(src, aa);
      secondary = muldiv256(secondary, aa);
    });
    return src + dst - secondary;
  }
  case AA_MASK_BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): {
    WideRGBA8 secondary =
        applyColor(dst,
            packColor<uint32_t>(fragment_shader->gl_SecondaryFragColor));
    WideRGBA8 mask = load_clip_mask(buf, span);
    DO_AA(RGBA8, mask = muldiv256(mask, aa));
    return muldiv255(src, mask) + dst - muldiv255(secondary, mask);
  }

  BLEND_CASE(GL_MIN):
    return min(src, dst);
  BLEND_CASE(GL_MAX):
    return max(src, dst);

  // The KHR_blend_equation_advanced spec describes the blend equations such
  // that the unpremultiplied values Cs, Cd, As, Ad and function f combine to
  // the result:
  //     Cr = f(Cs,Cd)*As*Ad + Cs*As*(1-Ad) + Cd*Ad*(1-As)
  //     Ar = As*Ad + As*(1-Ad) + Ad*(1-As)
  // However, working with unpremultiplied values requires expensive math to
  // unpremultiply and premultiply again during blending. We can use the fact
  // that premultiplied value P = C*A and simplify the equations such that no
  // unpremultiplied colors are necessary, allowing us to stay with integer
  // math that avoids floating-point conversions in the common case. Some of
  // the blend modes require division or sqrt, in which case we do convert
  // to (possibly transposed/unpacked) floating-point to implement the mode.
  // However, most common modes can still use cheaper premultiplied integer
  // math. As an example, the multiply mode f(Cs,Cd) = Cs*Cd is simplified
  // to:
  //     Cr = Cs*Cd*As*Ad + Cs*As*(1-Ad) + Cd*Ad*(1-As)
  //     .. Pr = Ps*Pd + Ps - Ps*Ad + Pd - Pd*As
  //     Ar = As*Ad + As - As*Ad + Ad - Ad*As
  //     .. Ar = As + Ad - As*Ad
  // Note that the alpha equation is the same for all blend equations, such
  // that so long as the implementation results in As + Ad - As*Ad, we can
  // avoid using separate instructions to compute the alpha result, which is
  // dependent on the math used to implement each blend mode. The exact
  // reductions used to get the final math for every blend mode are too
  // involved to show here in comments, but mostly follow from replacing
  // Cs*As and Cd*Ad with Ps and Pd while factoring out as many common terms
  // as possible.
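  // As another example, the screen mode f(Cs,Cd) = Cs + Cd - Cs*Cd reduces to
  //     Cr = (Cs + Cd - Cs*Cd)*As*Ad + Cs*As*(1-Ad) + Cd*Ad*(1-As)
  //     .. Pr = Ps + Pd - Ps*Pd
  // which matches the src + dst - muldiv255(src, dst) used for GL_SCREEN_KHR
  // below.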

  BLEND_CASE(GL_MULTIPLY_KHR): {
    WideRGBA8 diff = muldiv255(alphas(src) - (src & RGB_MASK),
                               alphas(dst) - (dst & RGB_MASK));
    return src + dst + (diff & RGB_MASK) - alphas(diff);
  }
  BLEND_CASE(GL_SCREEN_KHR):
    return src + dst - muldiv255(src, dst);
  BLEND_CASE(GL_OVERLAY_KHR): {
    WideRGBA8 srcA = alphas(src);
    WideRGBA8 dstA = alphas(dst);
    WideRGBA8 diff = muldiv255(src, dst) + muldiv255(srcA - src, dstA - dst);
    return src + dst +
           if_then_else(dst * 2 <= dstA, (diff & RGB_MASK) - alphas(diff),
                        -diff);
  }
  BLEND_CASE(GL_DARKEN_KHR):
    return src + dst -
           max(muldiv255(src, alphas(dst)), muldiv255(dst, alphas(src)));
  BLEND_CASE(GL_LIGHTEN_KHR):
    return src + dst -
           min(muldiv255(src, alphas(dst)), muldiv255(dst, alphas(src)));

  BLEND_CASE(GL_COLORDODGE_KHR): {
    // Color-dodge and color-burn require division, so we convert to FP math
    // here, but avoid transposing to a vec4.
    WideRGBA32F srcF = CONVERT(src, WideRGBA32F);
    WideRGBA32F srcA = alphas(srcF);
    WideRGBA32F dstF = CONVERT(dst, WideRGBA32F);
    WideRGBA32F dstA = alphas(dstF);
    return pack_pixels_RGBA8(
        srcA * set_alphas(
                   min(dstA, dstF * srcA * recip_or(srcA - srcF, 255.0f)),
                   dstF) +
            srcF * (255.0f - dstA) + dstF * (255.0f - srcA),
        1.0f / 255.0f);
  }
  BLEND_CASE(GL_COLORBURN_KHR): {
    WideRGBA32F srcF = CONVERT(src, WideRGBA32F);
    WideRGBA32F srcA = alphas(srcF);
    WideRGBA32F dstF = CONVERT(dst, WideRGBA32F);
    WideRGBA32F dstA = alphas(dstF);
    return pack_pixels_RGBA8(
        srcA * set_alphas((dstA - min(dstA, (dstA - dstF) * srcA *
                                                recip_or(srcF, 255.0f))),
                          dstF) +
            srcF * (255.0f - dstA) + dstF * (255.0f - srcA),
        1.0f / 255.0f);
  }
  BLEND_CASE(GL_HARDLIGHT_KHR): {
    WideRGBA8 srcA = alphas(src);
    WideRGBA8 dstA = alphas(dst);
    WideRGBA8 diff = muldiv255(src, dst) + muldiv255(srcA - src, dstA - dst);
    return src + dst +
           if_then_else(src * 2 <= srcA, (diff & RGB_MASK) - alphas(diff),
                        -diff);
  }

  BLEND_CASE(GL_SOFTLIGHT_KHR): {
    // Soft-light requires an unpremultiply that can't be factored out as
    // well as a sqrt, so we convert to FP math here, but avoid transposing
    // to a vec4.
    WideRGBA32F srcF = CONVERT(src, WideRGBA32F);
    WideRGBA32F srcA = alphas(srcF);
    WideRGBA32F dstF = CONVERT(dst, WideRGBA32F);
    WideRGBA32F dstA = alphas(dstF);
    WideRGBA32F dstU = unpremultiply(dstF);
    WideRGBA32F scale = srcF + srcF - srcA;
    return pack_pixels_RGBA8(
        dstF * (255.0f +
                set_alphas(
                    scale *
                        if_then_else(scale < 0.0f, 1.0f - dstU,
                                     min((16.0f * dstU - 12.0f) * dstU + 3.0f,
                                         inversesqrt(dstU) - 1.0f)),
                    WideRGBA32F(0.0f))) +
            srcF * (255.0f - dstA),
        1.0f / 255.0f);
  }
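  // For the difference mode f(Cs,Cd) = |Cs-Cd|, the premultiplied form reduces
  // to Pr = Ps + Pd - 2*min(Ps*Ad, Pd*As) for the color lanes, while the alpha
  // lane only subtracts min(As*Ad, Ad*As) = As*Ad once, which the RGB_MASK
  // below accounts for.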
  BLEND_CASE(GL_DIFFERENCE_KHR): {
    WideRGBA8 diff =
        min(muldiv255(dst, alphas(src)), muldiv255(src, alphas(dst)));
    return src + dst - diff - (diff & RGB_MASK);
  }
  BLEND_CASE(GL_EXCLUSION_KHR): {
    WideRGBA8 diff = muldiv255(src, dst);
    return src + dst - diff - (diff & RGB_MASK);
  }

  // The HSL blend modes are non-separable and require complicated use of
  // division. It is advantageous to convert to FP and transpose to vec4
  // math to more easily manipulate the individual color components.
#define DO_HSL(rgb)                                                            \
  do {                                                                         \
    vec4 srcV = unpack(CONVERT(src, PackedRGBA32F));                           \
    vec4 dstV = unpack(CONVERT(dst, PackedRGBA32F));                           \
    Float srcA = srcV.w * (1.0f / 255.0f);                                     \
    Float dstA = dstV.w * (1.0f / 255.0f);                                     \
    Float srcDstA = srcV.w * dstA;                                             \
    vec3 srcC = vec3(srcV) * dstA;                                             \
    vec3 dstC = vec3(dstV) * srcA;                                             \
    return pack_pixels_RGBA8(vec4(rgb + vec3(srcV) - srcC + vec3(dstV) - dstC, \
                                  srcV.w + dstV.w - srcDstA),                  \
                             1.0f);                                            \
  } while (0)

  BLEND_CASE(GL_HSL_HUE_KHR):
    DO_HSL(set_lum_sat(srcC, dstC, dstC, srcDstA));
  BLEND_CASE(GL_HSL_SATURATION_KHR):
    DO_HSL(set_lum_sat(dstC, srcC, dstC, srcDstA));
  BLEND_CASE(GL_HSL_COLOR_KHR):
    DO_HSL(set_lum(srcC, dstC, srcDstA));
  BLEND_CASE(GL_HSL_LUMINOSITY_KHR):
    DO_HSL(set_lum(dstC, srcC, srcDstA));

  // SWGL-specific extended blend modes.
  BLEND_CASE(SWGL_BLEND_DROP_SHADOW): {
    // Premultiplied alpha over blend, but with source color set to source alpha
    // modulated with a constant color.
    WideRGBA8 color = applyColor(alphas(src), swgl_BlendColorRGBA8);
    return color + dst - muldiv255(dst, alphas(color));
  }

  BLEND_CASE(SWGL_BLEND_SUBPIXEL_TEXT):
    // Premultiplied alpha over blend, but treats the source as a subpixel mask
    // modulated with a constant color.
    return applyColor(src, swgl_BlendColorRGBA8) + dst -
           muldiv255(dst, applyColor(src, swgl_BlendAlphaRGBA8));

  default:
    UNREACHABLE;
    // return src;
  }

#undef BLEND_CASE
#undef BLEND_CASE_KEY
  // clang-format on
}

static PREFER_INLINE WideR8 blend_pixels(uint8_t* buf, WideR8 dst, WideR8 src,
                                         int span = 4) {
// clang-format off
#define BLEND_CASE_KEY(key)                          \
  case AA_##key:                                     \
    DO_AA(R8, src = muldiv256(src, aa));             \
    goto key;                                        \
  case AA_MASK_##key:                                \
    DO_AA(R8, src = muldiv256(src, aa));             \
    FALLTHROUGH;                                     \
  case MASK_##key:                                   \
    src = muldiv255(src, load_clip_mask(buf, span)); \
    FALLTHROUGH;                                     \
  case key: key

#define BLEND_CASE(...) BLEND_CASE_KEY(BLEND_KEY(__VA_ARGS__))

  switch (blend_key) {
  BLEND_CASE(GL_ONE, GL_ZERO):
    return src;
  BLEND_CASE(GL_ZERO, GL_SRC_COLOR):
    return muldiv255(src, dst);
  BLEND_CASE(GL_ONE, GL_ONE):
    return src + dst;
  default:
    UNREACHABLE;
    // return src;
  }

#undef BLEND_CASE
#undef BLEND_CASE_KEY
  // clang-format on
}

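// Helpers for writing a chunk of up to 4 pixels to the destination buffer.
// commit_span stores an already computed result, while blend_span loads the
// existing destination pixels and blends them with the result via blend_pixels
// using the current blend_key. The variants that take a len parameter handle
// partial chunks of fewer than 4 pixels.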
static ALWAYS_INLINE void commit_span(uint32_t* buf, WideRGBA8 r) {
  unaligned_store(buf, pack(r));
}

static ALWAYS_INLINE void commit_span(uint32_t* buf, WideRGBA8 r, int len) {
  partial_store_span(buf, pack(r), len);
}

static ALWAYS_INLINE WideRGBA8 blend_span(uint32_t* buf, WideRGBA8 r) {
  return blend_pixels(buf, unaligned_load<PackedRGBA8>(buf), r);
}

static ALWAYS_INLINE WideRGBA8 blend_span(uint32_t* buf, WideRGBA8 r, int len) {
  return blend_pixels(buf, partial_load_span<PackedRGBA8>(buf, len), r, len);
}

static ALWAYS_INLINE void commit_span(uint32_t* buf, PackedRGBA8 r) {
  unaligned_store(buf, r);
}

static ALWAYS_INLINE void commit_span(uint32_t* buf, PackedRGBA8 r, int len) {
  partial_store_span(buf, r, len);
}

static ALWAYS_INLINE PackedRGBA8 blend_span(uint32_t* buf, PackedRGBA8 r) {
  return pack(blend_span(buf, unpack(r)));
}

static ALWAYS_INLINE PackedRGBA8 blend_span(uint32_t* buf, PackedRGBA8 r,
                                            int len) {
  return pack(blend_span(buf, unpack(r), len));
}

static ALWAYS_INLINE void commit_span(uint8_t* buf, WideR8 r) {
  unaligned_store(buf, pack(r));
}

static ALWAYS_INLINE void commit_span(uint8_t* buf, WideR8 r, int len) {
  partial_store_span(buf, pack(r), len);
}

static ALWAYS_INLINE WideR8 blend_span(uint8_t* buf, WideR8 r) {
  return blend_pixels(buf, unpack(unaligned_load<PackedR8>(buf)), r);
}

static ALWAYS_INLINE WideR8 blend_span(uint8_t* buf, WideR8 r, int len) {
  return blend_pixels(buf, unpack(partial_load_span<PackedR8>(buf, len)), r,
                      len);
}

static ALWAYS_INLINE void commit_span(uint8_t* buf, PackedR8 r) {
  unaligned_store(buf, r);
}

static ALWAYS_INLINE void commit_span(uint8_t* buf, PackedR8 r, int len) {
  partial_store_span(buf, r, len);
}

static ALWAYS_INLINE PackedR8 blend_span(uint8_t* buf, PackedR8 r) {
  return pack(blend_span(buf, unpack(r)));
}

static ALWAYS_INLINE PackedR8 blend_span(uint8_t* buf, PackedR8 r, int len) {
  return pack(blend_span(buf, unpack(r), len));
}

template <bool BLEND, typename P, typename R>
static ALWAYS_INLINE void commit_blend_span(P* buf, R r) {
  if (BLEND) {
    commit_span(buf, blend_span(buf, r));
  } else {
    commit_span(buf, r);
  }
}

template <bool BLEND, typename P, typename R>
static ALWAYS_INLINE void commit_blend_span(P* buf, R r, int len) {
  if (BLEND) {
    commit_span(buf, blend_span(buf, r, len), len);
  } else {
    commit_span(buf, r, len);
  }
}

template <typename P, typename R>
static ALWAYS_INLINE void commit_blend_solid_span(P* buf, R r, int len) {
  for (P* end = &buf[len & ~3]; buf < end; buf += 4) {
    commit_span(buf, blend_span(buf, r));
  }
  len &= 3;
  if (len > 0) {
    partial_store_span(buf, pack(blend_span(buf, r, len)), len);
  }
}

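// Commit an entire span of a solid color. When blending is enabled, this
// proceeds chunk by chunk; the non-blended specializations below can simply
// fill the row with the already packed color.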
template <bool BLEND>
static void commit_solid_span(uint32_t* buf, WideRGBA8 r, int len) {
  commit_blend_solid_span(buf, r, len);
}

template <>
ALWAYS_INLINE void commit_solid_span<false>(uint32_t* buf, WideRGBA8 r,
                                            int len) {
  fill_n(buf, len, bit_cast<U32>(pack(r)).x);
}

template <bool BLEND>
static void commit_solid_span(uint8_t* buf, WideR8 r, int len) {
  commit_blend_solid_span(buf, r, len);
}

template <>
ALWAYS_INLINE void commit_solid_span<false>(uint8_t* buf, WideR8 r, int len) {
  PackedR8 p = pack(r);
  if (uintptr_t(buf) & 3) {
    int align = 4 - (uintptr_t(buf) & 3);
    align = min(align, len);
    partial_store_span(buf, p, align);
    buf += align;
    len -= align;
  }
  fill_n((uint32_t*)buf, len / 4, bit_cast<uint32_t>(p));
  buf += len & ~3;
  len &= 3;
  if (len > 0) {
    partial_store_span(buf, p, len);
  }
}