1 /*
2  * Copyright 2018 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 // Intentionally NO #pragma once... included multiple times.
9 
10 // This file is included from skcms.cc in a namespace with some pre-defines:
11 //    - N:    depth of all vectors, 1,4,8, or 16 (preprocessor define)
12 //    - V<T>: a template to create a vector of N T's.
13 
14 using F   = V<Color>;   // Called F for historic reasons... maybe rename C?
15 using I32 = V<int32_t>;
16 using U64 = V<uint64_t>;
17 using U32 = V<uint32_t>;
18 using U16 = V<uint16_t>;
19 using U8  = V<uint8_t>;
20 
21 
22 #if defined(__GNUC__) && !defined(__clang__)
23     // Once again, GCC is kind of weird, not allowing vector = scalar directly.
24     static constexpr F F0 = F() + 0.0f,
25                        F1 = F() + 1.0f;
26 #else
27     static constexpr F F0 = 0.0f,
28                        F1 = 1.0f;
29 #endif
30 
31 // Instead of checking __AVX__ below, we'll check USING_AVX.
32 // This lets skcms.cc set USING_AVX to force us in even if the compiler's not set that way.
33 // Same deal for __F16C__ and __AVX2__ ~~~> USING_AVX_F16C, USING_AVX2.
34 
35 #if !defined(USING_AVX)      && N == 8 && defined(__AVX__)
36     #define  USING_AVX
37 #endif
38 #if !defined(USING_AVX_F16C) && defined(USING_AVX) && defined(__F16C__)
39     #define  USING_AVX_F16C
40 #endif
41 #if !defined(USING_AVX2)     && defined(USING_AVX) && defined(__AVX2__)
42     #define  USING_AVX2
43 #endif
44 #if !defined(USING_AVX512F)  && N == 16 && defined(__AVX512F__)
45     #define  USING_AVX512F
46 #endif
47 
48 // Similar to the AVX+ features, we define USING_NEON and USING_NEON_F16C.
49 // This is more for organizational clarity... skcms.cc doesn't force these.
50 #if N > 1 && defined(__ARM_NEON)
51     #define USING_NEON
52     #if __ARM_FP & 2
53         #define USING_NEON_F16C
54     #endif
55     #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(SKCMS_OPT_INTO_NEON_FP16)
56         #define USING_NEON_FP16
57     #endif
58 #endif
59 
60 // These -Wvector-conversion warnings seem to trigger in very bogus situations,
61 // like vst3q_f32() expecting a 16x char rather than a 4x float vector.  :/
62 #if defined(USING_NEON) && defined(__clang__)
63     #pragma clang diagnostic ignored "-Wvector-conversion"
64 #endif
65 
66 // GCC warns us about returning U64 on x86 because it's larger than a register.
67 // You'd see warnings like, "using AVX even though AVX is not enabled".
68 // We stifle these warnings... our helpers that return U64 are always inlined.
69 #if defined(__SSE__) && defined(__GNUC__) && !defined(__clang__)
70     #pragma GCC diagnostic ignored "-Wpsabi"
71 #endif
72 
73 #if defined(__clang__)
74     #define FALLTHROUGH [[clang::fallthrough]]
75 #else
76     #define FALLTHROUGH
77 #endif
78 
79 // We tag most helper functions as SI, to enforce good code generation
80 // but also work around what we think is a bug in GCC: when targeting 32-bit
81 // x86, GCC tends to pass U16 (4x uint16_t vector) function arguments in the
82 // MMX mm0 register, which seems to mess with unrelated code that later uses
83 // x87 FP instructions (MMX's mm0 is an alias for x87's st0 register).
84 //
85 // It helps codegen to call __builtin_memcpy() when we know the byte count at compile time.
86 #if defined(__clang__) || defined(__GNUC__)
87     #define SI static inline __attribute__((always_inline))
88 #else
89     #define SI static inline
90 #endif
91 
92 template <typename T, typename P>
93 SI T load(const P* ptr) {
94     T val;
95     small_memcpy(&val, ptr, sizeof(val));
96     return val;
97 }
98 template <typename T, typename P>
99 SI void store(P* ptr, const T& val) {
100     small_memcpy(ptr, &val, sizeof(val));
101 }
102 
103 // (T)v is a cast when N == 1 and a bit-pun when N>1,
104 // so we use cast<T>(v) to actually cast or bit_pun<T>(v) to bit-pun.
105 template <typename D, typename S>
106 SI D cast(const S& v) {
107 #if N == 1
108     return (D)v;
109 #elif defined(__clang__)
110     return __builtin_convertvector(v, D);
111 #else
112     D d;
113     for (int i = 0; i < N; i++) {
114         d[i] = v[i];
115     }
116     return d;
117 #endif
118 }
119 
120 template <typename D, typename S>
121 SI D bit_pun(const S& v) {
122     static_assert(sizeof(D) == sizeof(v), "");
123     return load<D>(&v);
124 }
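// Illustrative example: cast<F>(U32() + 1) gives lanes of 1.0f by value conversion,
// while bit_pun<F>(U32() + 0x3f800000) gives 1.0f by reinterpreting the IEEE 754 bits.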
125 
126 // When we convert from float to fixed point, it's very common to want to round,
127 // and for some reason compilers generate better code when converting to int32_t.
128 // To serve both those ends, we use this function to_fixed() instead of direct cast().
129 #if defined(USING_NEON_FP16)
130     // NEON's got an F16 -> U16 instruction, so this should be fine without going via I16.
131     SI U16 to_fixed(F f) {  return cast<U16>(f + 0.5f); }
132 #else
133     SI U32 to_fixed(F f) {  return (U32)cast<I32>(f + 0.5f); }
134 #endif
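// E.g. to_fixed(F0 + 126.6f) is 127: adding 0.5 before the truncating cast rounds to nearest.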
135 
136 
137 // Sometimes we do something crazy on one branch of a conditional,
138 // like divide by zero or convert a huge float to an integer,
139 // but then harmlessly select the other side.  That trips up N==1
140 // sanitizer builds, so we make if_then_else() a macro to avoid
141 // evaluating the unused side.
142 
143 #if N == 1
144     #define if_then_else(cond, t, e) ((cond) ? (t) : (e))
145 #else
146     template <typename C, typename T>
147     SI T if_then_else(C cond, T t, T e) {
148         return bit_pun<T>( ( cond & bit_pun<C>(t)) |
149                            (~cond & bit_pun<C>(e)) );
150     }
151 #endif
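// In the vector case cond is expected to be a comparison result, i.e. a lane mask of
// all-zero or all-one bits, so the AND/OR above selects whole lanes from t or e.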
152 
153 
154 SI F F_from_Half(U16 half) {
155 #if defined(USING_NEON_FP16)
156     return bit_pun<F>(half);
157 #elif defined(USING_NEON_F16C)
158     return vcvt_f32_f16((float16x4_t)half);
159 #elif defined(USING_AVX512F)
160     return (F)_mm512_cvtph_ps((__m256i)half);
161 #elif defined(USING_AVX_F16C)
162     typedef int16_t __attribute__((vector_size(16))) I16;
163     return __builtin_ia32_vcvtph2ps256((I16)half);
164 #else
165     U32 wide = cast<U32>(half);
166     // A half is 1-5-10 sign-exponent-mantissa, with 15 exponent bias.
167     U32 s  = wide & 0x8000,
168         em = wide ^ s;
169 
170     // Constructing the float is easy if the half is not denormalized.
171     F norm = bit_pun<F>( (s<<16) + (em<<13) + ((127-15)<<23) );
172 
173     // Simply flush all denorm half floats to zero.
174     return if_then_else(em < 0x0400, F0, norm);
175 #endif
176 }
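// Worked example for the portable path above: half 0x3c00 (1.0) has s == 0 and em == 0x3c00,
// so norm = bit_pun<F>((0x3c00<<13) + (112<<23)) = bit_pun<F>(0x3f800000) = 1.0f.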
177 
178 #if defined(__clang__)
179     // The -((127-15)<<10) underflows that side of the math when
180     // we pass a denorm half float.  It's harmless... we'll take the 0 side anyway.
181     __attribute__((no_sanitize("unsigned-integer-overflow")))
182 #endif
183 SI U16 Half_from_F(F f) {
184 #if defined(USING_NEON_FP16)
185     return bit_pun<U16>(f);
186 #elif defined(USING_NEON_F16C)
187     return (U16)vcvt_f16_f32(f);
188 #elif defined(USING_AVX512F)
189     return (U16)_mm512_cvtps_ph((__m512 )f, _MM_FROUND_CUR_DIRECTION );
190 #elif defined(USING_AVX_F16C)
191     return (U16)__builtin_ia32_vcvtps2ph256(f, 0x04/*_MM_FROUND_CUR_DIRECTION*/);
192 #else
193     // A float is 1-8-23 sign-exponent-mantissa, with 127 exponent bias.
194     U32 sem = bit_pun<U32>(f),
195         s   = sem & 0x80000000,
196          em = sem ^ s;
197 
198     // For simplicity we flush denorm half floats (including all denorm floats) to zero.
199     return cast<U16>(if_then_else(em < 0x38800000, (U32)F0
200                                                  , (s>>16) + (em>>13) - ((127-15)<<10)));
201 #endif
202 }
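// Worked example for the portable path above: 1.0f is 0x3f800000, so em >= 0x38800000 and
// (em>>13) - (112<<10) = 0x1fc00 - 0x1c000 = 0x3c00, the half-float bit pattern for 1.0.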
203 
204 // Swap high and low bytes of 16-bit lanes, converting between big-endian and little-endian.
205 #if defined(USING_NEON_FP16)
206     SI U16 swap_endian_16(U16 v) {
207         return (U16)vrev16q_u8((uint8x16_t) v);
208     }
209 #elif defined(USING_NEON)
210     SI U16 swap_endian_16(U16 v) {
211         return (U16)vrev16_u8((uint8x8_t) v);
212     }
213 #endif
214 
215 SI U64 swap_endian_16x4(const U64& rgba) {
216     return (rgba & 0x00ff00ff00ff00ff) << 8
217          | (rgba & 0xff00ff00ff00ff00) >> 8;
218 }
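// E.g. a 16-bit lane holding 0x1234 comes back as 0x3412; the two masks move the low and
// high byte of every lane in one shot across the whole 64-bit vector.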
219 
220 #if defined(USING_NEON_FP16)
221     SI F min_(F x, F y) { return (F)vminq_f16((float16x8_t)x, (float16x8_t)y); }
222     SI F max_(F x, F y) { return (F)vmaxq_f16((float16x8_t)x, (float16x8_t)y); }
223 #elif defined(USING_NEON)
224     SI F min_(F x, F y) { return (F)vminq_f32((float32x4_t)x, (float32x4_t)y); }
225     SI F max_(F x, F y) { return (F)vmaxq_f32((float32x4_t)x, (float32x4_t)y); }
226 #else
227     SI F min_(F x, F y) { return if_then_else(x > y, y, x); }
228     SI F max_(F x, F y) { return if_then_else(x < y, y, x); }
229 #endif
230 
231 SI F floor_(F x) {
232 #if N == 1
233     return floorf_(x);
234 #elif defined(USING_NEON_FP16)
235     return vrndmq_f16(x);
236 #elif defined(__aarch64__)
237     return vrndmq_f32(x);
238 #elif defined(USING_AVX512F)
239     // Clang's _mm512_floor_ps() passes its mask as -1, not (__mmask16)-1,
240     // and integer sanitizer catches that this implicit cast changes the
241     // value from -1 to 65535.  We'll cast manually to work around it.
242     // Read this as `return _mm512_floor_ps(x)`.
243     return _mm512_mask_floor_ps(x, (__mmask16)-1, x);
244 #elif defined(USING_AVX)
245     return __builtin_ia32_roundps256(x, 0x01/*_MM_FROUND_FLOOR*/);
246 #elif defined(__SSE4_1__)
247     return _mm_floor_ps(x);
248 #else
249     // Round trip through integers with a truncating cast.
250     F roundtrip = cast<F>(cast<I32>(x));
251     // If x is negative, truncating gives the ceiling instead of the floor.
252     return roundtrip - if_then_else(roundtrip > x, F1, F0);
253 
254     // This implementation fails for values of x that are outside
255     // the range an integer can represent.  We expect most x to be small.
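    // E.g. floor_(-1.5f): the truncating cast gives -1.0f, which is greater than -1.5f,
    // so we subtract 1 and return -2.0f as expected.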
256 #endif
257 }
258 
259 SI F approx_log2(F x) {
260 #if defined(USING_NEON_FP16)
261     // TODO(mtklein)
262     return x;
263 #else
264     // The first approximation of log2(x) is its exponent 'e', minus 127.
265     I32 bits = bit_pun<I32>(x);
266 
267     F e = cast<F>(bits) * (1.0f / (1<<23));
268 
269     // If we use the mantissa too we can refine the error significantly.
270     F m = bit_pun<F>( (bits & 0x007fffff) | 0x3f000000 );
271 
272     return e - 124.225514990f
273              -   1.498030302f*m
274              -   1.725879990f/(0.3520887068f + m);
275 #endif
276 }
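// Sanity check: for x = 1.0f (bits 0x3f800000), e is 127 and m is 0.5, so the expression
// above works out to roughly 127 - 124.226 - 0.749 - 2.025, about 0, matching log2(1) == 0.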
277 
278 SI F approx_log(F x) {
279     const float ln2 = 0.69314718f;
280     return ln2 * approx_log2(x);
281 }
282 
283 SI F approx_exp2(F x) {
284 #if defined(USING_NEON_FP16)
285     // TODO(mtklein)
286     return x;
287 #else
288     F fract = x - floor_(x);
289 
290     I32 bits = cast<I32>((1.0f * (1<<23)) * (x + 121.274057500f
291                                                -   1.490129070f*fract
292                                                +  27.728023300f/(4.84252568f - fract)));
293     return bit_pun<F>(bits);
294 #endif
295 }
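// Sanity check: for x = 0, fract is 0 and the sum in parentheses is about 127, so bits is
// roughly 127 * 2^23 = 0x3f800000 and we return approximately 1.0f, matching exp2(0) == 1.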
296 
297 SI F approx_pow(F x, float y) {
298     return if_then_else((x == F0) | (x == F1), x
299                                              , approx_exp2(approx_log2(x) * y));
300 }
301 
302 SI F approx_exp(F x) {
303     const float log2_e = 1.4426950408889634074f;
304     return approx_exp2(log2_e * x);
305 }
306 
307 // Return tf(x).
308 SI F apply_tf(const skcms_TransferFunction* tf, F x) {
309 #if defined(USING_NEON_FP16)
310     // TODO(mtklein)
311     (void)tf;
312     return x;
313 #else
314     // Peel off the sign bit and set x = |x|.
315     U32 bits = bit_pun<U32>(x),
316         sign = bits & 0x80000000;
317     x = bit_pun<F>(bits ^ sign);
318 
319     // The transfer function has a linear part up to d, exponential at d and after.
320     F v = if_then_else(x < tf->d,            tf->c*x + tf->f
321                                 , approx_pow(tf->a*x + tf->b, tf->g) + tf->e);
322 
323     // Tack the sign bit back on.
324     return bit_pun<F>(sign | bit_pun<U32>(v));
325 #endif
326 }
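// For reference (values from the sRGB spec, not from this file): an sRGB-style curve fits
// this form with roughly g=2.4, a=1/1.055, b=0.055/1.055, c=1/12.92, d=0.04045, e=f=0.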
327 
328 SI F apply_pq(const skcms_TransferFunction* tf, F x) {
329 #if defined(USING_NEON_FP16)
330     // TODO(mtklein)
331     (void)tf;
332     return x;
333 #else
334     U32 bits = bit_pun<U32>(x),
335         sign = bits & 0x80000000;
336     x = bit_pun<F>(bits ^ sign);
337 
338     F v = approx_pow(max_(tf->a + tf->b * approx_pow(x, tf->c), F0)
339                        / (tf->d + tf->e * approx_pow(x, tf->c)),
340                      tf->f);
341 
342     return bit_pun<F>(sign | bit_pun<U32>(v));
343 #endif
344 }
345 
346 SI F apply_hlg(const skcms_TransferFunction* tf, F x) {
347 #if defined(USING_NEON_FP16)
348     // TODO(mtklein)
349     (void)tf;
350     return x;
351 #else
352     const float R = tf->a, G = tf->b,
353                 a = tf->c, b = tf->d, c = tf->e;
354     U32 bits = bit_pun<U32>(x),
355         sign = bits & 0x80000000;
356     x = bit_pun<F>(bits ^ sign);
357 
358     F v = if_then_else(x*R <= 1, approx_pow(x*R, G)
359                                , approx_exp((x-c)*a) + b);
360 
361     return bit_pun<F>(sign | bit_pun<U32>(v));
362 #endif
363 }
364 
365 SI F apply_hlginv(const skcms_TransferFunction* tf, F x) {
366 #if defined(USING_NEON_FP16)
367     // TODO(mtklein)
368     (void)tf;
369     return x;
370 #else
371     const float R = tf->a, G = tf->b,
372                 a = tf->c, b = tf->d, c = tf->e;
373     U32 bits = bit_pun<U32>(x),
374         sign = bits & 0x80000000;
375     x = bit_pun<F>(bits ^ sign);
376 
377     F v = if_then_else(x <= 1, R * approx_pow(x, G)
378                              , a * approx_log(x - b) + c);
379 
380     return bit_pun<F>(sign | bit_pun<U32>(v));
381 #endif
382 }
383 
384 
385 // Strided loads and stores of N values, starting from p.
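// E.g. load_3<U32>(p) reads p[0], p[3], p[6], ... -- one channel of interleaved rgb --
// and load_4 does the same with stride 4 for interleaved rgba.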
386 template <typename T, typename P>
387 SI T load_3(const P* p) {
388 #if N == 1
389     return (T)p[0];
390 #elif N == 4
391     return T{p[ 0],p[ 3],p[ 6],p[ 9]};
392 #elif N == 8
393     return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21]};
394 #elif N == 16
395     return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21],
396              p[24],p[27],p[30],p[33], p[36],p[39],p[42],p[45]};
397 #endif
398 }
399 
400 template <typename T, typename P>
401 SI T load_4(const P* p) {
402 #if N == 1
403     return (T)p[0];
404 #elif N == 4
405     return T{p[ 0],p[ 4],p[ 8],p[12]};
406 #elif N == 8
407     return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28]};
408 #elif N == 16
409     return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28],
410              p[32],p[36],p[40],p[44], p[48],p[52],p[56],p[60]};
411 #endif
412 }
413 
414 template <typename T, typename P>
415 SI void store_3(P* p, const T& v) {
416 #if N == 1
417     p[0] = v;
418 #elif N == 4
419     p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
420 #elif N == 8
421     p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
422     p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7];
423 #elif N == 16
424     p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
425     p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7];
426     p[24] = v[ 8]; p[27] = v[ 9]; p[30] = v[10]; p[33] = v[11];
427     p[36] = v[12]; p[39] = v[13]; p[42] = v[14]; p[45] = v[15];
428 #endif
429 }
430 
431 template <typename T, typename P>
432 SI void store_4(P* p, const T& v) {
433 #if N == 1
434     p[0] = v;
435 #elif N == 4
436     p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
437 #elif N == 8
438     p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
439     p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7];
440 #elif N == 16
441     p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
442     p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7];
443     p[32] = v[ 8]; p[36] = v[ 9]; p[40] = v[10]; p[44] = v[11];
444     p[48] = v[12]; p[52] = v[13]; p[56] = v[14]; p[60] = v[15];
445 #endif
446 }
447 
448 
449 SI U8 gather_8(const uint8_t* p, I32 ix) {
450 #if N == 1
451     U8 v = p[ix];
452 #elif N == 4
453     U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]] };
454 #elif N == 8
455     U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
456              p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]] };
457 #elif N == 16
458     U8 v = { p[ix[ 0]], p[ix[ 1]], p[ix[ 2]], p[ix[ 3]],
459              p[ix[ 4]], p[ix[ 5]], p[ix[ 6]], p[ix[ 7]],
460              p[ix[ 8]], p[ix[ 9]], p[ix[10]], p[ix[11]],
461              p[ix[12]], p[ix[13]], p[ix[14]], p[ix[15]] };
462 #endif
463     return v;
464 }
465 
466 SI U16 gather_16(const uint8_t* p, I32 ix) {
467     // Load the i'th 16-bit value from p.
468     auto load_16 = [p](int i) {
469         return load<uint16_t>(p + 2*i);
470     };
471 #if N == 1
472     U16 v = load_16(ix);
473 #elif N == 4
474     U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]) };
475 #elif N == 8
476     U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]),
477               load_16(ix[4]), load_16(ix[5]), load_16(ix[6]), load_16(ix[7]) };
478 #elif N == 16
479     U16 v = { load_16(ix[ 0]), load_16(ix[ 1]), load_16(ix[ 2]), load_16(ix[ 3]),
480               load_16(ix[ 4]), load_16(ix[ 5]), load_16(ix[ 6]), load_16(ix[ 7]),
481               load_16(ix[ 8]), load_16(ix[ 9]), load_16(ix[10]), load_16(ix[11]),
482               load_16(ix[12]), load_16(ix[13]), load_16(ix[14]), load_16(ix[15]) };
483 #endif
484     return v;
485 }
486 
487 SI U32 gather_32(const uint8_t* p, I32 ix) {
488     // Load the i'th 32-bit value from p.
489     auto load_32 = [p](int i) {
490         return load<uint32_t>(p + 4*i);
491     };
492 #if N == 1
493     U32 v = load_32(ix);
494 #elif N == 4
495     U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]) };
496 #elif N == 8
497     U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]),
498               load_32(ix[4]), load_32(ix[5]), load_32(ix[6]), load_32(ix[7]) };
499 #elif N == 16
500     U32 v = { load_32(ix[ 0]), load_32(ix[ 1]), load_32(ix[ 2]), load_32(ix[ 3]),
501               load_32(ix[ 4]), load_32(ix[ 5]), load_32(ix[ 6]), load_32(ix[ 7]),
502               load_32(ix[ 8]), load_32(ix[ 9]), load_32(ix[10]), load_32(ix[11]),
503               load_32(ix[12]), load_32(ix[13]), load_32(ix[14]), load_32(ix[15]) };
504 #endif
505     // TODO: AVX2 and AVX-512 gathers (c.f. gather_24).
506     return v;
507 }
508 
509 SI U32 gather_24(const uint8_t* p, I32 ix) {
510     // First, back up a byte.  Any place we're gathering from has a safe junk byte to read
511     // in front of it, either a previous table value, or some tag metadata.
512     p -= 1;
513 
514     // Load the i'th 24-bit value from p, and 1 extra byte.
515     auto load_24_32 = [p](int i) {
516         return load<uint32_t>(p + 3*i);
517     };
518 
519     // Now load multiples of 4 bytes (a junk byte, then r,g,b).
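    // E.g. entry ix's r,g,b bytes originally sit at offsets 3*ix, 3*ix+1, 3*ix+2; after the
    // p -= 1 above, a 4-byte load at p + 3*ix picks up [junk, r, g, b], which on a
    // little-endian target is junk | r<<8 | g<<16 | b<<24.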
520 #if N == 1
521     U32 v = load_24_32(ix);
522 #elif N == 4
523     U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]) };
524 #elif N == 8 && !defined(USING_AVX2)
525     U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]),
526               load_24_32(ix[4]), load_24_32(ix[5]), load_24_32(ix[6]), load_24_32(ix[7]) };
527 #elif N == 8
528     (void)load_24_32;
529     // The gather instruction here doesn't need any particular alignment,
530     // but the intrinsic takes a const int*.
531     const int* p4 = bit_pun<const int*>(p);
532     I32 zero = { 0, 0, 0, 0,  0, 0, 0, 0},
533         mask = {-1,-1,-1,-1, -1,-1,-1,-1};
534     #if defined(__clang__)
535         U32 v = (U32)__builtin_ia32_gatherd_d256(zero, p4, 3*ix, mask, 1);
536     #elif defined(__GNUC__)
537         U32 v = (U32)__builtin_ia32_gathersiv8si(zero, p4, 3*ix, mask, 1);
538     #endif
539 #elif N == 16
540     (void)load_24_32;
541     // The intrinsic is supposed to take const void* now, but it takes const int*, just like AVX2.
542     // And AVX-512 swapped the order of arguments.  :/
543     const int* p4 = bit_pun<const int*>(p);
544     U32 v = (U32)_mm512_i32gather_epi32((__m512i)(3*ix), p4, 1);
545 #endif
546 
547     // Shift off the junk byte, leaving r,g,b in low 24 bits (and zero in the top 8).
548     return v >> 8;
549 }
550 
551 #if !defined(__arm__)
552     SI void gather_48(const uint8_t* p, I32 ix, U64* v) {
553         // As in gather_24(), with everything doubled.
554         p -= 2;
555 
556         // Load the i'th 48-bit value from p, and 2 extra bytes.
557         auto load_48_64 = [p](int i) {
558             return load<uint64_t>(p + 6*i);
559         };
560 
561     #if N == 1
562         *v = load_48_64(ix);
563     #elif N == 4
564         *v = U64{
565             load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
566         };
567     #elif N == 8 && !defined(USING_AVX2)
568         *v = U64{
569             load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
570             load_48_64(ix[4]), load_48_64(ix[5]), load_48_64(ix[6]), load_48_64(ix[7]),
571         };
572     #elif N == 8
573         (void)load_48_64;
574         typedef int32_t   __attribute__((vector_size(16))) Half_I32;
575         typedef long long __attribute__((vector_size(32))) Half_I64;
576 
577         // The gather instruction here doesn't need any particular alignment,
578         // but the intrinsic takes a const long long*.
579         const long long int* p8 = bit_pun<const long long int*>(p);
580 
581         Half_I64 zero = { 0, 0, 0, 0},
582                  mask = {-1,-1,-1,-1};
583 
584         ix *= 6;
585         Half_I32 ix_lo = { ix[0], ix[1], ix[2], ix[3] },
586                  ix_hi = { ix[4], ix[5], ix[6], ix[7] };
587 
588         #if defined(__clang__)
589             Half_I64 lo = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_lo, mask, 1),
590                      hi = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_hi, mask, 1);
591         #elif defined(__GNUC__)
592             Half_I64 lo = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_lo, mask, 1),
593                      hi = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_hi, mask, 1);
594         #endif
595         store((char*)v +  0, lo);
596         store((char*)v + 32, hi);
597     #elif N == 16
598         (void)load_48_64;
599         const long long int* p8 = bit_pun<const long long int*>(p);
600         __m512i lo = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 0), p8, 1),
601                 hi = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 1), p8, 1);
602         store((char*)v +  0, lo);
603         store((char*)v + 64, hi);
604     #endif
605 
606         *v >>= 16;
607     }
608 #endif
609 
610 SI F F_from_U8(U8 v) {
611     return cast<F>(v) * (1/255.0f);
612 }
613 
614 SI F F_from_U16_BE(U16 v) {
615     // All 16-bit ICC values are big-endian, so we byte swap before converting to float.
616     // MSVC catches the "loss" of data here in the portable path, so we also make sure to mask.
617     U16 lo = (v >> 8),
618         hi = (v << 8) & 0xffff;
619     return cast<F>(lo|hi) * (1/65535.0f);
620 }
621 
622 SI U16 U16_from_F(F v) {
623     // 65535 == inf in FP16, so promote to FP32 before converting.
624     return cast<U16>(cast<V<float>>(v) * 65535 + 0.5f);
625 }
626 
627 SI F minus_1_ulp(F v) {
628 #if defined(USING_NEON_FP16)
629     return bit_pun<F>( bit_pun<U16>(v) - 1 );
630 #else
631     return bit_pun<F>( bit_pun<U32>(v) - 1 );
632 #endif
633 }
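// Used below so that when a scaled index lands exactly on an integer (notably the very last
// table entry), the "hi" index truncates back to the same value as "lo" instead of running
// one past the end of the table.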
634 
635 SI F table(const skcms_Curve* curve, F v) {
636     // Clamp the input to [0,1], then scale to a table index.
637     F ix = max_(F0, min_(v, F1)) * (float)(curve->table_entries - 1);
638 
639     // We'll look up (equal or adjacent) entries at lo and hi, then lerp by t between the two.
640     I32 lo = cast<I32>(            ix      ),
641         hi = cast<I32>(minus_1_ulp(ix+1.0f));
642     F t = ix - cast<F>(lo);  // i.e. the fractional part of ix.
643 
644     // TODO: can we load l and h simultaneously?  Each entry in 'h' is either
645     // the same as in 'l' or adjacent.  We have a rough idea that it'd always be safe
646     // to read adjacent entries and perhaps underflow the table by a byte or two
647     // (it'd be junk, but always safe to read).  Not sure how to lerp yet.
648     F l,h;
649     if (curve->table_8) {
650         l = F_from_U8(gather_8(curve->table_8, lo));
651         h = F_from_U8(gather_8(curve->table_8, hi));
652     } else {
653         l = F_from_U16_BE(gather_16(curve->table_16, lo));
654         h = F_from_U16_BE(gather_16(curve->table_16, hi));
655     }
656     return l + (h-l)*t;
657 }
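// E.g. with a 3-entry table whose values convert to {0.0, 0.5, 1.0} and v = 0.75:
// ix = 1.5, lo = 1, hi = 2, t = 0.5, so we return 0.5 + (1.0 - 0.5)*0.5 = 0.75.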
658 
659 SI void sample_clut_8(const skcms_A2B* a2b, I32 ix, F* r, F* g, F* b) {
660     U32 rgb = gather_24(a2b->grid_8, ix);
661 
662     *r = cast<F>((rgb >>  0) & 0xff) * (1/255.0f);
663     *g = cast<F>((rgb >>  8) & 0xff) * (1/255.0f);
664     *b = cast<F>((rgb >> 16) & 0xff) * (1/255.0f);
665 }
666 
667 SI void sample_clut_16(const skcms_A2B* a2b, I32 ix, F* r, F* g, F* b) {
668 #if defined(__arm__)
669     // This is up to 2x faster on 32-bit ARM than the #else-case fast path.
670     *r = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+0));
671     *g = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+1));
672     *b = F_from_U16_BE(gather_16(a2b->grid_16, 3*ix+2));
673 #else
674     // This strategy is much faster for 64-bit builds, and fine for 32-bit x86 too.
675     U64 rgb;
676     gather_48(a2b->grid_16, ix, &rgb);
677     rgb = swap_endian_16x4(rgb);
678 
679     *r = cast<F>((rgb >>  0) & 0xffff) * (1/65535.0f);
680     *g = cast<F>((rgb >> 16) & 0xffff) * (1/65535.0f);
681     *b = cast<F>((rgb >> 32) & 0xffff) * (1/65535.0f);
682 #endif
683 }
684 
685 // GCC 7.2.0 hits an internal compiler error with -finline-functions (or -O3)
686 // when targeting MIPS 64, i386, or s390x,  I think attempting to inline clut() into exec_ops().
687 #if 1 && defined(__GNUC__) && !defined(__clang__) \
688       && (defined(__mips64) || defined(__i386) || defined(__s390x__))
689     #define MAYBE_NOINLINE __attribute__((noinline))
690 #else
691     #define MAYBE_NOINLINE
692 #endif
693 
694 MAYBE_NOINLINE
695 static void clut(const skcms_A2B* a2b, F* r, F* g, F* b, F a) {
696     const int dim = (int)a2b->input_channels;
697     assert (0 < dim && dim <= 4);
698 
699     // For each of these arrays, think foo[2*dim], but we use foo[8] since we know dim <= 4.
700     I32 index [8];  // Index contribution by dimension, first low from 0, then high from 4.
701     F   weight[8];  // Weight for each contribution, again first low, then high.
702 
703     // O(dim) work first: calculate index,weight from r,g,b,a.
704     const F inputs[] = { *r,*g,*b,a };
705     for (int i = dim-1, stride = 1; i >= 0; i--) {
706         // x is where we logically want to sample the grid in the i-th dimension.
707         F x = inputs[i] * (float)(a2b->grid_points[i] - 1);
708 
709         // But we can't index at floats.  lo and hi are the two integer grid points surrounding x.
710         I32 lo = cast<I32>(            x      ),   // i.e. trunc(x) == floor(x) here.
711             hi = cast<I32>(minus_1_ulp(x+1.0f));
712         // Notice how we fold in the accumulated stride across previous dimensions here.
713         index[i+0] = lo * stride;
714         index[i+4] = hi * stride;
715         stride *= a2b->grid_points[i];
716 
717         // We'll interpolate between those two integer grid points by t.
718         F t = x - cast<F>(lo);  // i.e. fract(x)
719         weight[i+0] = 1-t;
720         weight[i+4] = t;
721     }
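    // E.g. for dim == 3 with grid_points {Gr,Gg,Gb}, the low corner's flattened entry is
    // (lo_r*Gg + lo_g)*Gb + lo_b -- the last input channel varies fastest in the grid.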
722 
723     *r = *g = *b = F0;
724 
725     // We'll sample 2^dim == 1<<dim table entries per pixel,
726     // in all combinations of low and high in each dimension.
727     for (int combo = 0; combo < (1<<dim); combo++) {  // This loop can be done in any order.
728 
729         // Each of these upcoming (combo&N)*K expressions here evaluates to 0 or 4,
730         // where 0 selects the low index contribution and its weight 1-t,
731         // or 4 the high index contribution and its weight t.
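        // E.g. with dim == 3 and combo == 5 (binary 101), (combo&1)*4 and (combo&4)*1 are
        // both 4, so we take the high corner in dims 0 and 2 and the low corner in dim 1.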
732 
733         // Since 0<dim≤4, we can always just start off with the 0-th channel,
734         // then handle the others conditionally.
735         I32 ix = index [0 + (combo&1)*4];
736         F    w = weight[0 + (combo&1)*4];
737 
738         switch ((dim-1)&3) {  // This lets the compiler know there are no other cases to handle.
739             case 3: ix += index [3 + (combo&8)/2];
740                     w  *= weight[3 + (combo&8)/2];
741                     FALLTHROUGH;
742                     // fall through
743 
744             case 2: ix += index [2 + (combo&4)*1];
745                     w  *= weight[2 + (combo&4)*1];
746                     FALLTHROUGH;
747                     // fall through
748 
749             case 1: ix += index [1 + (combo&2)*2];
750                     w  *= weight[1 + (combo&2)*2];
751         }
752 
753         F R,G,B;
754         if (a2b->grid_8) {
755             sample_clut_8 (a2b,ix, &R,&G,&B);
756         } else {
757             sample_clut_16(a2b,ix, &R,&G,&B);
758         }
759 
760         *r += w*R;
761         *g += w*G;
762         *b += w*B;
763     }
764 }
765 
766 static void exec_ops(const Op* ops, const void** args,
767                      const char* src, char* dst, int i) {
768     F r = F0, g = F0, b = F0, a = F1;
769     while (true) {
770         switch (*ops++) {
771             case Op_load_a8:{
772                 a = F_from_U8(load<U8>(src + 1*i));
773             } break;
774 
775             case Op_load_g8:{
776                 r = g = b = F_from_U8(load<U8>(src + 1*i));
777             } break;
778 
779             case Op_load_4444:{
780                 U16 abgr = load<U16>(src + 2*i);
781 
782                 r = cast<F>((abgr >> 12) & 0xf) * (1/15.0f);
783                 g = cast<F>((abgr >>  8) & 0xf) * (1/15.0f);
784                 b = cast<F>((abgr >>  4) & 0xf) * (1/15.0f);
785                 a = cast<F>((abgr >>  0) & 0xf) * (1/15.0f);
786             } break;
787 
788             case Op_load_565:{
789                 U16 rgb = load<U16>(src + 2*i);
790 
791                 r = cast<F>(rgb & (uint16_t)(31<< 0)) * (1.0f / (31<< 0));
792                 g = cast<F>(rgb & (uint16_t)(63<< 5)) * (1.0f / (63<< 5));
793                 b = cast<F>(rgb & (uint16_t)(31<<11)) * (1.0f / (31<<11));
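                // Note the trick above: each field is masked in place rather than shifted
                // down first, then scaled by 1 over its in-place maximum, e.g. (63<<5) for green.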
794             } break;
795 
796             case Op_load_888:{
797                 const uint8_t* rgb = (const uint8_t*)(src + 3*i);
798             #if defined(USING_NEON_FP16)
799                 // See the explanation under USING_NEON below.  This is that doubled up.
800                 uint8x16x3_t v = {{ vdupq_n_u8(0), vdupq_n_u8(0), vdupq_n_u8(0) }};
801                 v = vld3q_lane_u8(rgb+ 0, v,  0);
802                 v = vld3q_lane_u8(rgb+ 3, v,  2);
803                 v = vld3q_lane_u8(rgb+ 6, v,  4);
804                 v = vld3q_lane_u8(rgb+ 9, v,  6);
805 
806                 v = vld3q_lane_u8(rgb+12, v,  8);
807                 v = vld3q_lane_u8(rgb+15, v, 10);
808                 v = vld3q_lane_u8(rgb+18, v, 12);
809                 v = vld3q_lane_u8(rgb+21, v, 14);
810 
811                 r = cast<F>((U16)v.val[0]) * (1/255.0f);
812                 g = cast<F>((U16)v.val[1]) * (1/255.0f);
813                 b = cast<F>((U16)v.val[2]) * (1/255.0f);
814             #elif defined(USING_NEON)
815                 // There's no uint8x4x3_t or vld3 load for it, so we'll load each rgb pixel one at
816                 // a time.  Since we're doing that, we might as well load them into 16-bit lanes.
817                 // (We'd even load into 32-bit lanes, but that's not possible on ARMv7.)
818                 uint8x8x3_t v = {{ vdup_n_u8(0), vdup_n_u8(0), vdup_n_u8(0) }};
819                 v = vld3_lane_u8(rgb+0, v, 0);
820                 v = vld3_lane_u8(rgb+3, v, 2);
821                 v = vld3_lane_u8(rgb+6, v, 4);
822                 v = vld3_lane_u8(rgb+9, v, 6);
823 
824                 // Now if we squint, those 3 uint8x8_t we constructed are really U16s, easy to
825                 // convert to F.  (Again, U32 would be even better here if we drop ARMv7 or split
826                 // ARMv7 and ARMv8 impls.)
827                 r = cast<F>((U16)v.val[0]) * (1/255.0f);
828                 g = cast<F>((U16)v.val[1]) * (1/255.0f);
829                 b = cast<F>((U16)v.val[2]) * (1/255.0f);
830             #else
831                 r = cast<F>(load_3<U32>(rgb+0) ) * (1/255.0f);
832                 g = cast<F>(load_3<U32>(rgb+1) ) * (1/255.0f);
833                 b = cast<F>(load_3<U32>(rgb+2) ) * (1/255.0f);
834             #endif
835             } break;
836 
837             case Op_load_8888:{
838                 U32 rgba = load<U32>(src + 4*i);
839 
840                 r = cast<F>((rgba >>  0) & 0xff) * (1/255.0f);
841                 g = cast<F>((rgba >>  8) & 0xff) * (1/255.0f);
842                 b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
843                 a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
844             } break;
845 
846             case Op_load_8888_palette8:{
847                 const uint8_t* palette = (const uint8_t*) *args++;
848                 I32 ix = cast<I32>(load<U8>(src + 1*i));
849                 U32 rgba = gather_32(palette, ix);
850 
851                 r = cast<F>((rgba >>  0) & 0xff) * (1/255.0f);
852                 g = cast<F>((rgba >>  8) & 0xff) * (1/255.0f);
853                 b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
854                 a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
855             } break;
856 
857             case Op_load_1010102:{
858                 U32 rgba = load<U32>(src + 4*i);
859 
860                 r = cast<F>((rgba >>  0) & 0x3ff) * (1/1023.0f);
861                 g = cast<F>((rgba >> 10) & 0x3ff) * (1/1023.0f);
862                 b = cast<F>((rgba >> 20) & 0x3ff) * (1/1023.0f);
863                 a = cast<F>((rgba >> 30) & 0x3  ) * (1/   3.0f);
864             } break;
865 
866             case Op_load_161616LE:{
867                 uintptr_t ptr = (uintptr_t)(src + 6*i);
868                 assert( (ptr & 1) == 0 );                   // src must be 2-byte aligned for this
869                 const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
870             #if defined(USING_NEON_FP16)
871                 uint16x8x3_t v = vld3q_u16(rgb);
872                 r = cast<F>((U16)v.val[0]) * (1/65535.0f);
873                 g = cast<F>((U16)v.val[1]) * (1/65535.0f);
874                 b = cast<F>((U16)v.val[2]) * (1/65535.0f);
875             #elif defined(USING_NEON)
876                 uint16x4x3_t v = vld3_u16(rgb);
877                 r = cast<F>((U16)v.val[0]) * (1/65535.0f);
878                 g = cast<F>((U16)v.val[1]) * (1/65535.0f);
879                 b = cast<F>((U16)v.val[2]) * (1/65535.0f);
880             #else
881                 r = cast<F>(load_3<U32>(rgb+0)) * (1/65535.0f);
882                 g = cast<F>(load_3<U32>(rgb+1)) * (1/65535.0f);
883                 b = cast<F>(load_3<U32>(rgb+2)) * (1/65535.0f);
884             #endif
885             } break;
886 
887             case Op_load_16161616LE:{
888                 uintptr_t ptr = (uintptr_t)(src + 8*i);
889                 assert( (ptr & 1) == 0 );                    // src must be 2-byte aligned for this
890                 const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
891             #if defined(USING_NEON_FP16)
892                 uint16x8x4_t v = vld4q_u16(rgba);
893                 r = cast<F>((U16)v.val[0]) * (1/65535.0f);
894                 g = cast<F>((U16)v.val[1]) * (1/65535.0f);
895                 b = cast<F>((U16)v.val[2]) * (1/65535.0f);
896                 a = cast<F>((U16)v.val[3]) * (1/65535.0f);
897             #elif defined(USING_NEON)
898                 uint16x4x4_t v = vld4_u16(rgba);
899                 r = cast<F>((U16)v.val[0]) * (1/65535.0f);
900                 g = cast<F>((U16)v.val[1]) * (1/65535.0f);
901                 b = cast<F>((U16)v.val[2]) * (1/65535.0f);
902                 a = cast<F>((U16)v.val[3]) * (1/65535.0f);
903             #else
904                 U64 px = load<U64>(rgba);
905 
906                 r = cast<F>((px >>  0) & 0xffff) * (1/65535.0f);
907                 g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
908                 b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
909                 a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
910             #endif
911             } break;
912 
913             case Op_load_161616BE:{
914                 uintptr_t ptr = (uintptr_t)(src + 6*i);
915                 assert( (ptr & 1) == 0 );                   // src must be 2-byte aligned for this
916                 const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
917             #if defined(USING_NEON_FP16)
918                 uint16x8x3_t v = vld3q_u16(rgb);
919                 r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
920                 g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
921                 b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
922             #elif defined(USING_NEON)
923                 uint16x4x3_t v = vld3_u16(rgb);
924                 r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
925                 g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
926                 b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
927             #else
928                 U32 R = load_3<U32>(rgb+0),
929                     G = load_3<U32>(rgb+1),
930                     B = load_3<U32>(rgb+2);
931                 // R,G,B are big-endian 16-bit, so byte swap them before converting to float.
932                 r = cast<F>((R & 0x00ff)<<8 | (R & 0xff00)>>8) * (1/65535.0f);
933                 g = cast<F>((G & 0x00ff)<<8 | (G & 0xff00)>>8) * (1/65535.0f);
934                 b = cast<F>((B & 0x00ff)<<8 | (B & 0xff00)>>8) * (1/65535.0f);
935             #endif
936             } break;
937 
938             case Op_load_16161616BE:{
939                 uintptr_t ptr = (uintptr_t)(src + 8*i);
940                 assert( (ptr & 1) == 0 );                    // src must be 2-byte aligned for this
941                 const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
942             #if defined(USING_NEON_FP16)
943                 uint16x8x4_t v = vld4q_u16(rgba);
944                 r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
945                 g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
946                 b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
947                 a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
948             #elif defined(USING_NEON)
949                 uint16x4x4_t v = vld4_u16(rgba);
950                 r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
951                 g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
952                 b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
953                 a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
954             #else
955                 U64 px = swap_endian_16x4(load<U64>(rgba));
956 
957                 r = cast<F>((px >>  0) & 0xffff) * (1/65535.0f);
958                 g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
959                 b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
960                 a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
961             #endif
962             } break;
963 
964             case Op_load_hhh:{
965                 uintptr_t ptr = (uintptr_t)(src + 6*i);
966                 assert( (ptr & 1) == 0 );                   // src must be 2-byte aligned for this
967                 const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
968             #if defined(USING_NEON_FP16)
969                 uint16x8x3_t v = vld3q_u16(rgb);
970                 U16 R = (U16)v.val[0],
971                     G = (U16)v.val[1],
972                     B = (U16)v.val[2];
973             #elif defined(USING_NEON)
974                 uint16x4x3_t v = vld3_u16(rgb);
975                 U16 R = (U16)v.val[0],
976                     G = (U16)v.val[1],
977                     B = (U16)v.val[2];
978             #else
979                 U16 R = load_3<U16>(rgb+0),
980                     G = load_3<U16>(rgb+1),
981                     B = load_3<U16>(rgb+2);
982             #endif
983                 r = F_from_Half(R);
984                 g = F_from_Half(G);
985                 b = F_from_Half(B);
986             } break;
987 
988             case Op_load_hhhh:{
989                 uintptr_t ptr = (uintptr_t)(src + 8*i);
990                 assert( (ptr & 1) == 0 );                    // src must be 2-byte aligned for this
991                 const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
992             #if defined(USING_NEON_FP16)
993                 uint16x8x4_t v = vld4q_u16(rgba);
994                 U16 R = (U16)v.val[0],
995                     G = (U16)v.val[1],
996                     B = (U16)v.val[2],
997                     A = (U16)v.val[3];
998             #elif defined(USING_NEON)
999                 uint16x4x4_t v = vld4_u16(rgba);
1000                 U16 R = (U16)v.val[0],
1001                     G = (U16)v.val[1],
1002                     B = (U16)v.val[2],
1003                     A = (U16)v.val[3];
1004             #else
1005                 U64 px = load<U64>(rgba);
1006                 U16 R = cast<U16>((px >>  0) & 0xffff),
1007                     G = cast<U16>((px >> 16) & 0xffff),
1008                     B = cast<U16>((px >> 32) & 0xffff),
1009                     A = cast<U16>((px >> 48) & 0xffff);
1010             #endif
1011                 r = F_from_Half(R);
1012                 g = F_from_Half(G);
1013                 b = F_from_Half(B);
1014                 a = F_from_Half(A);
1015             } break;
1016 
1017             case Op_load_fff:{
1018                 uintptr_t ptr = (uintptr_t)(src + 12*i);
1019                 assert( (ptr & 3) == 0 );                   // src must be 4-byte aligned for this
1020                 const float* rgb = (const float*)ptr;       // cast to const float* to be safe.
1021             #if defined(USING_NEON_FP16)
1022                 float32x4x3_t lo = vld3q_f32(rgb +  0),
1023                               hi = vld3q_f32(rgb + 12);
1024                 r = (F)vcombine_f16(vcvt_f16_f32(lo.val[0]), vcvt_f16_f32(hi.val[0]));
1025                 g = (F)vcombine_f16(vcvt_f16_f32(lo.val[1]), vcvt_f16_f32(hi.val[1]));
1026                 b = (F)vcombine_f16(vcvt_f16_f32(lo.val[2]), vcvt_f16_f32(hi.val[2]));
1027             #elif defined(USING_NEON)
1028                 float32x4x3_t v = vld3q_f32(rgb);
1029                 r = (F)v.val[0];
1030                 g = (F)v.val[1];
1031                 b = (F)v.val[2];
1032             #else
1033                 r = load_3<F>(rgb+0);
1034                 g = load_3<F>(rgb+1);
1035                 b = load_3<F>(rgb+2);
1036             #endif
1037             } break;
1038 
1039             case Op_load_ffff:{
1040                 uintptr_t ptr = (uintptr_t)(src + 16*i);
1041                 assert( (ptr & 3) == 0 );                   // src must be 4-byte aligned for this
1042                 const float* rgba = (const float*)ptr;      // cast to const float* to be safe.
1043             #if defined(USING_NEON_FP16)
1044                 float32x4x4_t lo = vld4q_f32(rgba +  0),
1045                               hi = vld4q_f32(rgba + 16);
1046                 r = (F)vcombine_f16(vcvt_f16_f32(lo.val[0]), vcvt_f16_f32(hi.val[0]));
1047                 g = (F)vcombine_f16(vcvt_f16_f32(lo.val[1]), vcvt_f16_f32(hi.val[1]));
1048                 b = (F)vcombine_f16(vcvt_f16_f32(lo.val[2]), vcvt_f16_f32(hi.val[2]));
1049                 a = (F)vcombine_f16(vcvt_f16_f32(lo.val[3]), vcvt_f16_f32(hi.val[3]));
1050             #elif defined(USING_NEON)
1051                 float32x4x4_t v = vld4q_f32(rgba);
1052                 r = (F)v.val[0];
1053                 g = (F)v.val[1];
1054                 b = (F)v.val[2];
1055                 a = (F)v.val[3];
1056             #else
1057                 r = load_4<F>(rgba+0);
1058                 g = load_4<F>(rgba+1);
1059                 b = load_4<F>(rgba+2);
1060                 a = load_4<F>(rgba+3);
1061             #endif
1062             } break;
1063 
1064             case Op_swap_rb:{
1065                 F t = r;
1066                 r = b;
1067                 b = t;
1068             } break;
1069 
1070             case Op_clamp:{
1071                 r = max_(F0, min_(r, F1));
1072                 g = max_(F0, min_(g, F1));
1073                 b = max_(F0, min_(b, F1));
1074                 a = max_(F0, min_(a, F1));
1075             } break;
1076 
1077             case Op_invert:{
1078                 r = F1 - r;
1079                 g = F1 - g;
1080                 b = F1 - b;
1081                 a = F1 - a;
1082             } break;
1083 
1084             case Op_force_opaque:{
1085                 a = F1;
1086             } break;
1087 
1088             case Op_premul:{
1089                 r *= a;
1090                 g *= a;
1091                 b *= a;
1092             } break;
1093 
1094             case Op_unpremul:{
1095                 F scale = if_then_else(F1 / a < INFINITY_, F1 / a, F0);
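                // When a == 0, F1/a is +infinity, so scale falls back to F0 and r,g,b zero out.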
1096                 r *= scale;
1097                 g *= scale;
1098                 b *= scale;
1099             } break;
1100 
1101             case Op_matrix_3x3:{
1102                 const skcms_Matrix3x3* matrix = (const skcms_Matrix3x3*) *args++;
1103                 const float* m = &matrix->vals[0][0];
1104 
1105                 F R = m[0]*r + m[1]*g + m[2]*b,
1106                   G = m[3]*r + m[4]*g + m[5]*b,
1107                   B = m[6]*r + m[7]*g + m[8]*b;
1108 
1109                 r = R;
1110                 g = G;
1111                 b = B;
1112             } break;
1113 
1114             case Op_matrix_3x4:{
1115                 const skcms_Matrix3x4* matrix = (const skcms_Matrix3x4*) *args++;
1116                 const float* m = &matrix->vals[0][0];
1117 
1118                 F R = m[0]*r + m[1]*g + m[ 2]*b + m[ 3],
1119                   G = m[4]*r + m[5]*g + m[ 6]*b + m[ 7],
1120                   B = m[8]*r + m[9]*g + m[10]*b + m[11];
1121 
1122                 r = R;
1123                 g = G;
1124                 b = B;
1125             } break;
1126 
1127             case Op_lab_to_xyz:{
1128                 // The L*a*b values are in r,g,b, but normalized to [0,1].  Reconstruct them:
1129                 F L = r * 100.0f,
1130                   A = g * 255.0f - 128.0f,
1131                   B = b * 255.0f - 128.0f;
1132 
1133                 // Convert to CIE XYZ.
1134                 F Y = (L + 16.0f) * (1/116.0f),
1135                   X = Y + A*(1/500.0f),
1136                   Z = Y - B*(1/200.0f);
1137 
1138                 X = if_then_else(X*X*X > 0.008856f, X*X*X, (X - (16/116.0f)) * (1/7.787f));
1139                 Y = if_then_else(Y*Y*Y > 0.008856f, Y*Y*Y, (Y - (16/116.0f)) * (1/7.787f));
1140                 Z = if_then_else(Z*Z*Z > 0.008856f, Z*Z*Z, (Z - (16/116.0f)) * (1/7.787f));
1141 
1142                 // Adjust to XYZD50 illuminant, and stuff back into r,g,b for the next op.
1143                 r = X * 0.9642f;
1144                 g = Y          ;
1145                 b = Z * 0.8249f;
1146             } break;
1147 
1148             case Op_tf_r:{ r = apply_tf((const skcms_TransferFunction*)*args++, r); } break;
1149             case Op_tf_g:{ g = apply_tf((const skcms_TransferFunction*)*args++, g); } break;
1150             case Op_tf_b:{ b = apply_tf((const skcms_TransferFunction*)*args++, b); } break;
1151             case Op_tf_a:{ a = apply_tf((const skcms_TransferFunction*)*args++, a); } break;
1152 
1153             case Op_pq_r:{ r = apply_pq((const skcms_TransferFunction*)*args++, r); } break;
1154             case Op_pq_g:{ g = apply_pq((const skcms_TransferFunction*)*args++, g); } break;
1155             case Op_pq_b:{ b = apply_pq((const skcms_TransferFunction*)*args++, b); } break;
1156             case Op_pq_a:{ a = apply_pq((const skcms_TransferFunction*)*args++, a); } break;
1157 
1158             case Op_hlg_r:{ r = apply_hlg((const skcms_TransferFunction*)*args++, r); } break;
1159             case Op_hlg_g:{ g = apply_hlg((const skcms_TransferFunction*)*args++, g); } break;
1160             case Op_hlg_b:{ b = apply_hlg((const skcms_TransferFunction*)*args++, b); } break;
1161             case Op_hlg_a:{ a = apply_hlg((const skcms_TransferFunction*)*args++, a); } break;
1162 
1163             case Op_hlginv_r:{ r = apply_hlginv((const skcms_TransferFunction*)*args++, r); } break;
1164             case Op_hlginv_g:{ g = apply_hlginv((const skcms_TransferFunction*)*args++, g); } break;
1165             case Op_hlginv_b:{ b = apply_hlginv((const skcms_TransferFunction*)*args++, b); } break;
1166             case Op_hlginv_a:{ a = apply_hlginv((const skcms_TransferFunction*)*args++, a); } break;
1167 
1168             case Op_table_r: { r = table((const skcms_Curve*)*args++, r); } break;
1169             case Op_table_g: { g = table((const skcms_Curve*)*args++, g); } break;
1170             case Op_table_b: { b = table((const skcms_Curve*)*args++, b); } break;
1171             case Op_table_a: { a = table((const skcms_Curve*)*args++, a); } break;
1172 
1173             case Op_clut: {
1174                 const skcms_A2B* a2b = (const skcms_A2B*) *args++;
1175                 clut(a2b, &r,&g,&b,a);
1176 
1177                 if (a2b->input_channels == 4) {
1178                     // CMYK is opaque.
1179                     a = F1;
1180                 }
1181             } break;
1182 
1183     // Notice, from here on down the store_ ops all return, ending the loop.
1184 
1185             case Op_store_a8: {
1186                 store(dst + 1*i, cast<U8>(to_fixed(a * 255)));
1187             } return;
1188 
1189             case Op_store_g8: {
1190                 // g should be holding luminance (Y) (r,g,b ~~~> X,Y,Z)
1191                 store(dst + 1*i, cast<U8>(to_fixed(g * 255)));
1192             } return;
1193 
1194             case Op_store_4444: {
1195                 store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 15) << 12)
1196                                     | cast<U16>(to_fixed(g * 15) <<  8)
1197                                     | cast<U16>(to_fixed(b * 15) <<  4)
1198                                     | cast<U16>(to_fixed(a * 15) <<  0));
1199             } return;
1200 
1201             case Op_store_565: {
1202                 store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 31) <<  0 )
1203                                     | cast<U16>(to_fixed(g * 63) <<  5 )
1204                                     | cast<U16>(to_fixed(b * 31) << 11 ));
1205             } return;
1206 
1207             case Op_store_888: {
1208                 uint8_t* rgb = (uint8_t*)dst + 3*i;
1209             #if defined(USING_NEON_FP16)
1210                 // See the explanation under USING_NEON below.  This is that doubled up.
1211                 U16 R = to_fixed(r * 255),
1212                     G = to_fixed(g * 255),
1213                     B = to_fixed(b * 255);
1214 
1215                 uint8x16x3_t v = {{ (uint8x16_t)R, (uint8x16_t)G, (uint8x16_t)B }};
1216                 vst3q_lane_u8(rgb+ 0, v,  0);
1217                 vst3q_lane_u8(rgb+ 3, v,  2);
1218                 vst3q_lane_u8(rgb+ 6, v,  4);
1219                 vst3q_lane_u8(rgb+ 9, v,  6);
1220 
1221                 vst3q_lane_u8(rgb+12, v,  8);
1222                 vst3q_lane_u8(rgb+15, v, 10);
1223                 vst3q_lane_u8(rgb+18, v, 12);
1224                 vst3q_lane_u8(rgb+21, v, 14);
1225             #elif defined(USING_NEON)
1226                 // Same deal as load_888 but in reverse... we'll store using uint8x8x3_t, but
1227                 // get there via U16 to save some instructions converting to float.  And just
1228                 // like load_888, we'd prefer to go via U32 but for ARMv7 support.
1229                 U16 R = cast<U16>(to_fixed(r * 255)),
1230                     G = cast<U16>(to_fixed(g * 255)),
1231                     B = cast<U16>(to_fixed(b * 255));
1232 
1233                 uint8x8x3_t v = {{ (uint8x8_t)R, (uint8x8_t)G, (uint8x8_t)B }};
1234                 vst3_lane_u8(rgb+0, v, 0);
1235                 vst3_lane_u8(rgb+3, v, 2);
1236                 vst3_lane_u8(rgb+6, v, 4);
1237                 vst3_lane_u8(rgb+9, v, 6);
1238             #else
1239                 store_3(rgb+0, cast<U8>(to_fixed(r * 255)) );
1240                 store_3(rgb+1, cast<U8>(to_fixed(g * 255)) );
1241                 store_3(rgb+2, cast<U8>(to_fixed(b * 255)) );
1242             #endif
1243             } return;
1244 
1245             case Op_store_8888: {
1246                 store(dst + 4*i, cast<U32>(to_fixed(r * 255)) <<  0
1247                                | cast<U32>(to_fixed(g * 255)) <<  8
1248                                | cast<U32>(to_fixed(b * 255)) << 16
1249                                | cast<U32>(to_fixed(a * 255)) << 24);
1250             } return;
1251 
1252             case Op_store_1010102: {
1253                 store(dst + 4*i, cast<U32>(to_fixed(r * 1023)) <<  0
1254                                | cast<U32>(to_fixed(g * 1023)) << 10
1255                                | cast<U32>(to_fixed(b * 1023)) << 20
1256                                | cast<U32>(to_fixed(a *    3)) << 30);
1257             } return;
1258 
1259             case Op_store_161616LE: {
1260                 uintptr_t ptr = (uintptr_t)(dst + 6*i);
1261                 assert( (ptr & 1) == 0 );                // The dst pointer must be 2-byte aligned
1262                 uint16_t* rgb = (uint16_t*)ptr;          // for this cast to uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x3_t v = {{
                    (uint16x8_t)U16_from_F(r),
                    (uint16x8_t)U16_from_F(g),
                    (uint16x8_t)U16_from_F(b),
                }};
                vst3q_u16(rgb, v);
            #elif defined(USING_NEON)
                uint16x4x3_t v = {{
                    (uint16x4_t)U16_from_F(r),
                    (uint16x4_t)U16_from_F(g),
                    (uint16x4_t)U16_from_F(b),
                }};
                vst3_u16(rgb, v);
            #else
                store_3(rgb+0, U16_from_F(r));
                store_3(rgb+1, U16_from_F(g));
                store_3(rgb+2, U16_from_F(b));
            #endif

            } return;

            case Op_store_16161616LE: {
                uintptr_t ptr = (uintptr_t)(dst + 8*i);
                assert( (ptr & 1) == 0 );               // The dst pointer must be 2-byte aligned
                uint16_t* rgba = (uint16_t*)ptr;        // for this cast to uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x4_t v = {{
                    (uint16x8_t)U16_from_F(r),
                    (uint16x8_t)U16_from_F(g),
                    (uint16x8_t)U16_from_F(b),
                    (uint16x8_t)U16_from_F(a),
                }};
                vst4q_u16(rgba, v);
            #elif defined(USING_NEON)
                uint16x4x4_t v = {{
                    (uint16x4_t)U16_from_F(r),
                    (uint16x4_t)U16_from_F(g),
                    (uint16x4_t)U16_from_F(b),
                    (uint16x4_t)U16_from_F(a),
                }};
                vst4_u16(rgba, v);
            #else
                U64 px = cast<U64>(to_fixed(r * 65535)) <<  0
                       | cast<U64>(to_fixed(g * 65535)) << 16
                       | cast<U64>(to_fixed(b * 65535)) << 32
                       | cast<U64>(to_fixed(a * 65535)) << 48;
                store(rgba, px);
            #endif
            } return;

            case Op_store_161616BE: {
                uintptr_t ptr = (uintptr_t)(dst + 6*i);
                assert( (ptr & 1) == 0 );                // The dst pointer must be 2-byte aligned
                uint16_t* rgb = (uint16_t*)ptr;          // for this cast to uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x3_t v = {{
                    (uint16x8_t)swap_endian_16(U16_from_F(r)),
                    (uint16x8_t)swap_endian_16(U16_from_F(g)),
                    (uint16x8_t)swap_endian_16(U16_from_F(b)),
                }};
                vst3q_u16(rgb, v);
            #elif defined(USING_NEON)
                uint16x4x3_t v = {{
                    (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
                    (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
                    (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
                }};
                vst3_u16(rgb, v);
            #else
                U32 R = to_fixed(r * 65535),
                    G = to_fixed(g * 65535),
                    B = to_fixed(b * 65535);
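                // Swap each 16-bit value to big-endian by exchanging its low and high bytes
                // before storing.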
                store_3(rgb+0, cast<U16>((R & 0x00ff) << 8 | (R & 0xff00) >> 8) );
                store_3(rgb+1, cast<U16>((G & 0x00ff) << 8 | (G & 0xff00) >> 8) );
                store_3(rgb+2, cast<U16>((B & 0x00ff) << 8 | (B & 0xff00) >> 8) );
            #endif

            } return;

            case Op_store_16161616BE: {
                uintptr_t ptr = (uintptr_t)(dst + 8*i);
                assert( (ptr & 1) == 0 );               // The dst pointer must be 2-byte aligned
                uint16_t* rgba = (uint16_t*)ptr;        // for this cast to uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x4_t v = {{
                    (uint16x8_t)swap_endian_16(U16_from_F(r)),
                    (uint16x8_t)swap_endian_16(U16_from_F(g)),
                    (uint16x8_t)swap_endian_16(U16_from_F(b)),
                    (uint16x8_t)swap_endian_16(U16_from_F(a)),
                }};
                vst4q_u16(rgba, v);
            #elif defined(USING_NEON)
                uint16x4x4_t v = {{
                    (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
                    (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
                    (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
                    (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(a))),
                }};
                vst4_u16(rgba, v);
            #else
                U64 px = cast<U64>(to_fixed(r * 65535)) <<  0
                       | cast<U64>(to_fixed(g * 65535)) << 16
                       | cast<U64>(to_fixed(b * 65535)) << 32
                       | cast<U64>(to_fixed(a * 65535)) << 48;
                store(rgba, swap_endian_16x4(px));
            #endif
            } return;

            case Op_store_hhh: {
                uintptr_t ptr = (uintptr_t)(dst + 6*i);
                assert( (ptr & 1) == 0 );                // The dst pointer must be 2-byte aligned
                uint16_t* rgb = (uint16_t*)ptr;          // for this cast to uint16_t* to be safe.

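                // Half_from_F converts each float lane to a half-precision value held in a
                // 16-bit lane, which we can then store like any other 16-bit channel.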
                U16 R = Half_from_F(r),
                    G = Half_from_F(g),
                    B = Half_from_F(b);
            #if defined(USING_NEON_FP16)
                uint16x8x3_t v = {{
                    (uint16x8_t)R,
                    (uint16x8_t)G,
                    (uint16x8_t)B,
                }};
                vst3q_u16(rgb, v);
            #elif defined(USING_NEON)
                uint16x4x3_t v = {{
                    (uint16x4_t)R,
                    (uint16x4_t)G,
                    (uint16x4_t)B,
                }};
                vst3_u16(rgb, v);
            #else
                store_3(rgb+0, R);
                store_3(rgb+1, G);
                store_3(rgb+2, B);
            #endif
            } return;

            case Op_store_hhhh: {
                uintptr_t ptr = (uintptr_t)(dst + 8*i);
                assert( (ptr & 1) == 0 );                // The dst pointer must be 2-byte aligned
                uint16_t* rgba = (uint16_t*)ptr;         // for this cast to uint16_t* to be safe.

                U16 R = Half_from_F(r),
                    G = Half_from_F(g),
                    B = Half_from_F(b),
                    A = Half_from_F(a);
            #if defined(USING_NEON_FP16)
                uint16x8x4_t v = {{
                    (uint16x8_t)R,
                    (uint16x8_t)G,
                    (uint16x8_t)B,
                    (uint16x8_t)A,
                }};
                vst4q_u16(rgba, v);
            #elif defined(USING_NEON)
                uint16x4x4_t v = {{
                    (uint16x4_t)R,
                    (uint16x4_t)G,
                    (uint16x4_t)B,
                    (uint16x4_t)A,
                }};
                vst4_u16(rgba, v);
            #else
                store(rgba, cast<U64>(R) <<  0
                          | cast<U64>(G) << 16
                          | cast<U64>(B) << 32
                          | cast<U64>(A) << 48);
            #endif

            } return;

            case Op_store_fff: {
                uintptr_t ptr = (uintptr_t)(dst + 12*i);
                assert( (ptr & 3) == 0 );                // The dst pointer must be 4-byte aligned
                float* rgb = (float*)ptr;                // for this cast to float* to be safe.
            #if defined(USING_NEON_FP16)
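                // In the FP16 path r,g,b are 8-wide half vectors: widen each to float in two
                // 4-lane halves, then store them as two interleaved vst3q_f32 writes.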
                float32x4x3_t lo = {{
                    vcvt_f32_f16(vget_low_f16(r)),
                    vcvt_f32_f16(vget_low_f16(g)),
                    vcvt_f32_f16(vget_low_f16(b)),
                }}, hi = {{
                    vcvt_f32_f16(vget_high_f16(r)),
                    vcvt_f32_f16(vget_high_f16(g)),
                    vcvt_f32_f16(vget_high_f16(b)),
                }};
                vst3q_f32(rgb +  0, lo);
                vst3q_f32(rgb + 12, hi);
            #elif defined(USING_NEON)
                float32x4x3_t v = {{
                    (float32x4_t)r,
                    (float32x4_t)g,
                    (float32x4_t)b,
                }};
                vst3q_f32(rgb, v);
            #else
                store_3(rgb+0, r);
                store_3(rgb+1, g);
                store_3(rgb+2, b);
            #endif
            } return;

            case Op_store_ffff: {
                uintptr_t ptr = (uintptr_t)(dst + 16*i);
                assert( (ptr & 3) == 0 );                // The dst pointer must be 4-byte aligned
                float* rgba = (float*)ptr;               // for this cast to float* to be safe.
            #if defined(USING_NEON_FP16)
                float32x4x4_t lo = {{
                    vcvt_f32_f16(vget_low_f16(r)),
                    vcvt_f32_f16(vget_low_f16(g)),
                    vcvt_f32_f16(vget_low_f16(b)),
                    vcvt_f32_f16(vget_low_f16(a)),
                }}, hi = {{
                    vcvt_f32_f16(vget_high_f16(r)),
                    vcvt_f32_f16(vget_high_f16(g)),
                    vcvt_f32_f16(vget_high_f16(b)),
                    vcvt_f32_f16(vget_high_f16(a)),
                }};
                vst4q_f32(rgba +  0, lo);
                vst4q_f32(rgba + 16, hi);
            #elif defined(USING_NEON)
                float32x4x4_t v = {{
                    (float32x4_t)r,
                    (float32x4_t)g,
                    (float32x4_t)b,
                    (float32x4_t)a,
                }};
                vst4q_f32(rgba, v);
            #else
                store_4(rgba+0, r);
                store_4(rgba+1, g);
                store_4(rgba+2, b);
                store_4(rgba+3, a);
            #endif
            } return;
        }
    }
}

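// run_program drives exec_ops over a whole buffer, N pixels at a time.  A hypothetical
// call might look like the sketch below -- the op list, context array, and bytes-per-pixel
// values are illustrative only, not taken from skcms.cc (which builds the program from the
// profiles being transformed):
//
//     Op          program[]  = { Op_load_8888, Op_store_ffff };
//     const void* contexts[] = { nullptr };
//     run_program(program, contexts, src_bytes, dst_bytes, npixels,
//                 /*src_bpp=*/4, /*dst_bpp=*/16);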
static void run_program(const Op* program, const void** arguments,
                        const char* src, char* dst, int n,
                        const size_t src_bpp, const size_t dst_bpp) {
    int i = 0;
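    // Transform as many full batches of N pixels as we can; i counts pixels already done.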
    while (n >= N) {
        exec_ops(program, arguments, src, dst, i);
        i += N;
        n -= N;
    }
    if (n > 0) {
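        // Fewer than N pixels remain.  Stage them in a zeroed scratch buffer big enough for
        // N pixels of the widest format we handle (4 channels of 4 bytes each), run one more
        // batch over it, then copy out only the n pixels we actually own.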
        char tmp[4*4*N] = {0};

        memcpy(tmp, (const char*)src + (size_t)i*src_bpp, (size_t)n*src_bpp);
        exec_ops(program, arguments, tmp, tmp, 0);
        memcpy((char*)dst + (size_t)i*dst_bpp, tmp, (size_t)n*dst_bpp);
    }
}

// Clean up any #defines we may have set so that we can be #included again.
#if defined(USING_AVX)
    #undef  USING_AVX
#endif
#if defined(USING_AVX_F16C)
    #undef  USING_AVX_F16C
#endif
#if defined(USING_AVX2)
    #undef  USING_AVX2
#endif
#if defined(USING_AVX512F)
    #undef  USING_AVX512F
#endif

#if defined(USING_NEON)
    #undef  USING_NEON
#endif
#if defined(USING_NEON_F16C)
    #undef  USING_NEON_F16C
#endif
#if defined(USING_NEON_FP16)
    #undef  USING_NEON_FP16
#endif

#undef FALLTHROUGH