1 /* graphene-simd4f.h: SIMD wrappers and operations
2  *
3  * SPDX-License-Identifier: MIT
4  *
5  * Copyright 2014  Emmanuele Bassi
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a copy
8  * of this software and associated documentation files (the "Software"), to deal
9  * in the Software without restriction, including without limitation the rights
10  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11  * copies of the Software, and to permit persons to whom the Software is
12  * furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in
15  * all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23  * THE SOFTWARE.
24  */
25 
26 #pragma once
27 
28 #if !defined(GRAPHENE_H_INSIDE) && !defined(GRAPHENE_COMPILATION)
29 #error "Only graphene.h can be included directly."
30 #endif
31 
32 /* needed for memcpy() */
33 #include <string.h>
34 #include <math.h>
35 #include <float.h>
36 
37 #include "graphene-config.h"
38 #include "graphene-macros.h"
39 #include "graphene-version-macros.h"
40 
41 GRAPHENE_BEGIN_DECLS
42 
43 /* Platform specific operations */
44 
/* Constructors: graphene_simd4f_init() builds a vector from four scalars and
 * graphene_simd4f_init_zero() returns the all-zero vector. The _4f, _3f and
 * _2f variants read that many floats from v; the implementations in this
 * header set the lanes that are not read to 0.f.
 */
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t       graphene_simd4f_init            (float                   x,
                                                         float                   y,
                                                         float                   z,
                                                         float                   w);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t       graphene_simd4f_init_zero       (void);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t       graphene_simd4f_init_4f         (const float            *v);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t       graphene_simd4f_init_3f         (const float            *v);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t       graphene_simd4f_init_2f         (const float            *v);

/* Copy the first 4, 3 or 2 lanes of s into the float array v. */
GRAPHENE_AVAILABLE_IN_1_0
void                    graphene_simd4f_dup_4f          (const graphene_simd4f_t s,
                                                         float                  *v);
GRAPHENE_AVAILABLE_IN_1_0
void                    graphene_simd4f_dup_3f          (const graphene_simd4f_t s,
                                                         float                  *v);
GRAPHENE_AVAILABLE_IN_1_0
void                    graphene_simd4f_dup_2f          (const graphene_simd4f_t s,
                                                         float                  *v);
68 
/* Read one lane of s as a scalar. graphene_simd4f_get() takes the lane index
 * i; the implementations in this header use it directly as an array index
 * and do not range-check it. The _x/_y/_z/_w variants read lanes 0 to 3.
 */
GRAPHENE_AVAILABLE_IN_1_2
float                   graphene_simd4f_get             (const graphene_simd4f_t s,
                                                         unsigned int            i);
GRAPHENE_AVAILABLE_IN_1_0
float                   graphene_simd4f_get_x           (const graphene_simd4f_t s);
GRAPHENE_AVAILABLE_IN_1_0
float                   graphene_simd4f_get_y           (const graphene_simd4f_t s);
GRAPHENE_AVAILABLE_IN_1_0
float                   graphene_simd4f_get_z           (const graphene_simd4f_t s);
GRAPHENE_AVAILABLE_IN_1_0
float                   graphene_simd4f_get_w           (const graphene_simd4f_t s);

/* graphene_simd4f_splat() replicates the scalar v into all four lanes; the
 * _x/_y/_z/_w variants replicate the corresponding lane of s.
 */
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t       graphene_simd4f_splat           (float                   v);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t       graphene_simd4f_splat_x         (const graphene_simd4f_t s);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t       graphene_simd4f_splat_y         (const graphene_simd4f_t s);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t       graphene_simd4f_splat_z         (const graphene_simd4f_t s);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t       graphene_simd4f_splat_w         (const graphene_simd4f_t s);
91 
/* Lane-wise arithmetic: a + b, a - b, a * b and a / b. */
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t       graphene_simd4f_add             (const graphene_simd4f_t a,
                                                         const graphene_simd4f_t b);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t       graphene_simd4f_sub             (const graphene_simd4f_t a,
                                                         const graphene_simd4f_t b);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t       graphene_simd4f_mul             (const graphene_simd4f_t a,
                                                         const graphene_simd4f_t b);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t       graphene_simd4f_div             (const graphene_simd4f_t a,
                                                         const graphene_simd4f_t b);

/* Lane-wise square root, reciprocal (1 / s) and reciprocal square root
 * (1 / sqrt (s)).
 *
 * NOTE: the back ends in this header differ for 0.f lanes. The SSE versions
 * refine the result of _mm_rcp_ps()/_mm_rsqrt_ps() with one extra
 * multiply/subtract step, while the GCC vector versions divide directly and
 * return 0.f for a lane that is 0.f.
 */
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t       graphene_simd4f_sqrt            (const graphene_simd4f_t s);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t       graphene_simd4f_reciprocal      (const graphene_simd4f_t s);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t       graphene_simd4f_rsqrt           (const graphene_simd4f_t s);
111 
/* Three-component products on the x, y and z lanes. cross3 returns the cross
 * product a x b. dot3 returns a.x * b.x + a.y * b.y + a.z * b.z as a vector
 * (the SSE implementations in this header replicate the sum in all four
 * lanes); dot3_scalar returns the same sum as a float.
 */
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t       graphene_simd4f_cross3          (const graphene_simd4f_t a,
                                                         const graphene_simd4f_t b);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t       graphene_simd4f_dot3            (const graphene_simd4f_t a,
                                                         const graphene_simd4f_t b);
GRAPHENE_AVAILABLE_IN_1_4
float                   graphene_simd4f_dot3_scalar     (const graphene_simd4f_t a,
                                                         const graphene_simd4f_t b);

/* Lane-wise minimum and maximum of a and b. */
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t       graphene_simd4f_min             (const graphene_simd4f_t a,
                                                         const graphene_simd4f_t b);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t       graphene_simd4f_max             (const graphene_simd4f_t a,
                                                         const graphene_simd4f_t b);
128 
/* Lane permutations. The suffix lists the source lane of each result lane in
 * x, y, z, w order, e.g. shuffle_wxyz returns (s.w, s.x, s.y, s.z).
 */
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t       graphene_simd4f_shuffle_wxyz    (const graphene_simd4f_t s);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t       graphene_simd4f_shuffle_zwxy    (const graphene_simd4f_t s);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t       graphene_simd4f_shuffle_yzwx    (const graphene_simd4f_t s);

/* Return s with the w lane (zero_w) or the z and w lanes (zero_zw) set to 0. */
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t       graphene_simd4f_zero_w          (const graphene_simd4f_t s);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t       graphene_simd4f_zero_zw         (const graphene_simd4f_t s);

/* Combine lanes from two sources:
 *   merge_high (a, b) -> (a.z, a.w, b.z, b.w)
 *   merge_low  (a, b) -> (a.x, a.y, b.x, b.y)
 *   merge_w    (s, v) -> (s.x, s.y, s.z, v)
 */
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t       graphene_simd4f_merge_high      (const graphene_simd4f_t a,
                                                         const graphene_simd4f_t b);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t       graphene_simd4f_merge_low       (const graphene_simd4f_t a,
                                                         const graphene_simd4f_t b);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t       graphene_simd4f_merge_w         (const graphene_simd4f_t s,
                                                         float                   v);

/* Flip the sign bit of alternating lanes: _0101 negates y and w, _1010
 * negates x and z.
 */
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t       graphene_simd4f_flip_sign_0101  (const graphene_simd4f_t s);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t       graphene_simd4f_flip_sign_1010  (const graphene_simd4f_t s);
155 
/* Whole-vector comparisons. cmp_eq is true when no lane differs and cmp_neq
 * when at least one lane differs; cmp_lt, cmp_le, cmp_ge and cmp_gt are true
 * only when the relation holds in all four lanes. graphene_simd4f_neg() flips
 * the sign of every lane.
 */
GRAPHENE_AVAILABLE_IN_1_0
bool                    graphene_simd4f_cmp_eq          (const graphene_simd4f_t a,
                                                         const graphene_simd4f_t b);
GRAPHENE_AVAILABLE_IN_1_0
bool                    graphene_simd4f_cmp_neq         (const graphene_simd4f_t a,
                                                         const graphene_simd4f_t b);
GRAPHENE_AVAILABLE_IN_1_2
bool                    graphene_simd4f_cmp_lt          (const graphene_simd4f_t a,
                                                         const graphene_simd4f_t b);
GRAPHENE_AVAILABLE_IN_1_2
bool                    graphene_simd4f_cmp_le          (const graphene_simd4f_t a,
                                                         const graphene_simd4f_t b);
GRAPHENE_AVAILABLE_IN_1_2
bool                    graphene_simd4f_cmp_ge          (const graphene_simd4f_t a,
                                                         const graphene_simd4f_t b);
GRAPHENE_AVAILABLE_IN_1_2
bool                    graphene_simd4f_cmp_gt          (const graphene_simd4f_t a,
                                                         const graphene_simd4f_t b);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t       graphene_simd4f_neg             (const graphene_simd4f_t s);
176 
177 #if !defined(__GI_SCANNER__) && defined(GRAPHENE_USE_SSE)
178 
179 /* SSE2 implementation of SIMD 4f */
180 
/* Union type used to do single lane reading without memcpy: a vector is
 * stored through `s` and its four lanes are read back through `f`.
 */
typedef union {
  graphene_simd4f_t s;
  float f[4];
} graphene_simd4f_union_t;
186 
187 /* On GCC, we use __extension__ macros to avoid a static inline */
188 # if defined(__GNUC__)
189 
190 /* Use GCC statement __extension__ to inline all these functions */
191 
/* Build a vector whose lanes are, in order, x, y, z and w. */
#  define graphene_simd4f_init(x,y,z,w) \
  (__extension__ ({ \
    (graphene_simd4f_t) { (x), (y), (z), (w) }; \
  }))

/* Return a vector with all four lanes set to zero. */
#  define graphene_simd4f_init_zero() \
  (__extension__ ({ \
    (graphene_simd4f_t) _mm_setzero_ps(); \
  }))

/* Load four consecutive floats starting at v (unaligned load). */
#  define graphene_simd4f_init_4f(v) \
  (__extension__ ({ \
    (graphene_simd4f_t) _mm_loadu_ps (v); \
  }))

/* Load v[0..2] into x, y and z and set w to 0.f. The pointer expression is
 * captured in a local so it is evaluated exactly once, as it would be for
 * the function declared in the prototype.
 */
#  define graphene_simd4f_init_3f(v) \
  (__extension__ ({ \
    const float *__init3f_v = (v); \
    (graphene_simd4f_t) { __init3f_v[0], __init3f_v[1], __init3f_v[2], 0.f }; \
  }))

/* Load v[0..1] into x and y and set z and w to 0.f; v is evaluated once. */
#  define graphene_simd4f_init_2f(v) \
  (__extension__ ({ \
    const float *__init2f_v = (v); \
    (graphene_simd4f_t) { __init2f_v[0], __init2f_v[1], 0.f, 0.f }; \
  }))
216 
/* Store all four lanes of s to v (unaligned store). */
#  define graphene_simd4f_dup_4f(s,v) \
  (__extension__ ({ \
    _mm_storeu_ps ((v), (s)); \
  }))

/* Copy the first three lanes of s to v. s is copied into a local first so
 * that any vector expression is accepted, not only an addressable lvalue;
 * the prototype takes s by value.
 */
#  define graphene_simd4f_dup_3f(s,v) \
  (__extension__ ({ \
    const graphene_simd4f_t __dup3f_s = (s); \
    memcpy ((v), &__dup3f_s, sizeof (float) * 3); \
  }))

/* Copy the first two lanes of s to v; s may be any vector expression. */
#  define graphene_simd4f_dup_2f(s,v) \
  (__extension__ ({ \
    const graphene_simd4f_t __dup2f_s = (s); \
    memcpy ((v), &__dup2f_s, sizeof (float) * 2); \
  }))
231 
232 #  define graphene_simd4f_get(s,i) \
233   (__extension__ ({ \
234     graphene_simd4f_union_t __u = { (s) }; \
235     (float) __u.f[(i)]; \
236   }))
237 
238 #  define graphene_simd4f_get_x(s)      graphene_simd4f_get (s, 0)
239 #  define graphene_simd4f_get_y(s)      graphene_simd4f_get (s, 1)
240 #  define graphene_simd4f_get_z(s)      graphene_simd4f_get (s, 2)
241 #  define graphene_simd4f_get_w(s)      graphene_simd4f_get (s, 3)
242 
/* Replicate the scalar v into all four lanes. */
#  define graphene_simd4f_splat(v) \
  (__extension__ ({ \
    (graphene_simd4f_t) _mm_set1_ps ((v)); \
  }))

/* Replicate one lane of v into all four lanes. The shuffle intrinsic needs
 * the source vector twice, so v is captured in a local and evaluated only
 * once.
 */
#  define graphene_simd4f_splat_x(v) \
  (__extension__ ({ \
    const graphene_simd4f_t __splat_x_v = (v); \
    (graphene_simd4f_t) _mm_shuffle_ps (__splat_x_v, __splat_x_v, _MM_SHUFFLE (0, 0, 0, 0)); \
  }))

#  define graphene_simd4f_splat_y(v) \
  (__extension__ ({ \
    const graphene_simd4f_t __splat_y_v = (v); \
    (graphene_simd4f_t) _mm_shuffle_ps (__splat_y_v, __splat_y_v, _MM_SHUFFLE (1, 1, 1, 1)); \
  }))

#  define graphene_simd4f_splat_z(v) \
  (__extension__ ({ \
    const graphene_simd4f_t __splat_z_v = (v); \
    (graphene_simd4f_t) _mm_shuffle_ps (__splat_z_v, __splat_z_v, _MM_SHUFFLE (2, 2, 2, 2)); \
  }))

#  define graphene_simd4f_splat_w(v) \
  (__extension__ ({ \
    const graphene_simd4f_t __splat_w_v = (v); \
    (graphene_simd4f_t) _mm_shuffle_ps (__splat_w_v, __splat_w_v, _MM_SHUFFLE (3, 3, 3, 3)); \
  }))
267 
/* Lane-wise add, subtract, multiply, divide and square root. Each macro is a
 * direct wrapper around the matching SSE intrinsic and expands every
 * argument once.
 */
#  define graphene_simd4f_add(a,b) \
  (__extension__ ({ \
    (graphene_simd4f_t) _mm_add_ps ((a), (b)); \
  }))

#  define graphene_simd4f_sub(a,b) \
  (__extension__ ({ \
    (graphene_simd4f_t) _mm_sub_ps ((a), (b)); \
  }))

#  define graphene_simd4f_mul(a,b) \
  (__extension__ ({ \
    (graphene_simd4f_t) _mm_mul_ps ((a), (b)); \
  }))

#  define graphene_simd4f_div(a,b) \
  (__extension__ ({ \
    (graphene_simd4f_t) _mm_div_ps ((a), (b)); \
  }))

#  define graphene_simd4f_sqrt(v) \
  (__extension__ ({ \
    (graphene_simd4f_t) _mm_sqrt_ps ((v)); \
  }))
292 
/* Lane-wise 1 / v. Starts from the result of _mm_rcp_ps() and refines it as
 * s * (2 - v * s). v is needed twice, so it is captured in a local and
 * evaluated only once.
 */
#  define graphene_simd4f_reciprocal(v) \
  (__extension__ ({ \
    const graphene_simd4f_t __rcp_v = (v); \
    const graphene_simd4f_t __two = graphene_simd4f_init (2.0f, 2.0f, 2.0f, 2.0f); \
    graphene_simd4f_t __s = _mm_rcp_ps (__rcp_v); \
    graphene_simd4f_mul (__s, graphene_simd4f_sub (__two, graphene_simd4f_mul (__rcp_v, __s))); \
  }))
299 
/* Lane-wise 1 / sqrt (v). Starts from the result of _mm_rsqrt_ps() and
 * refines it as (s * 0.5) * (3 - s * (v * s)). v is needed twice, so it is
 * captured in a local and evaluated only once.
 */
#  define graphene_simd4f_rsqrt(v) \
  (__extension__ ({ \
    const graphene_simd4f_t __rsqrt_v = (v); \
    const graphene_simd4f_t __half = graphene_simd4f_init (0.5f, 0.5f, 0.5f, 0.5f); \
    const graphene_simd4f_t __three = graphene_simd4f_init (3.0f, 3.0f, 3.0f, 3.0f); \
    graphene_simd4f_t __s = _mm_rsqrt_ps (__rsqrt_v); \
    graphene_simd4f_mul (graphene_simd4f_mul (__s, __half), \
                         graphene_simd4f_sub (__three, \
                                              graphene_simd4f_mul (__s, graphene_simd4f_mul (__rsqrt_v, __s)))); \
  }))
309 
/* Cross product of the x, y, z lanes of a and b, computed as
 * a.yzx * b.zxy - a.zxy * b.yzx. Each operand feeds two shuffles, so both
 * are captured in locals and evaluated only once.
 */
#  define graphene_simd4f_cross3(a,b) \
  (__extension__ ({ \
    const graphene_simd4f_t __cross_a = (a); \
    const graphene_simd4f_t __cross_b = (b); \
    const graphene_simd4f_t __a_yzx = _mm_shuffle_ps (__cross_a, __cross_a, _MM_SHUFFLE (3, 0, 2, 1)); \
    const graphene_simd4f_t __a_zxy = _mm_shuffle_ps (__cross_a, __cross_a, _MM_SHUFFLE (3, 1, 0, 2)); \
    const graphene_simd4f_t __b_yzx = _mm_shuffle_ps (__cross_b, __cross_b, _MM_SHUFFLE (3, 0, 2, 1)); \
    const graphene_simd4f_t __b_zxy = _mm_shuffle_ps (__cross_b, __cross_b, _MM_SHUFFLE (3, 1, 0, 2)); \
    (graphene_simd4f_t) _mm_sub_ps (_mm_mul_ps (__a_yzx, __b_zxy), _mm_mul_ps (__a_zxy, __b_yzx)); \
  }))
318 
/* Dot product of the x, y, z lanes of a and b, returned with the sum
 * replicated in all four lanes. With SSE4.1 a single _mm_dp_ps() is used;
 * otherwise the lane products are masked so that the w lane contributes 0,
 * added up into lane 0 and then broadcast with a shuffle.
 */
#  if defined(GRAPHENE_USE_SSE4_1)
#   define graphene_simd4f_dot3(a,b) \
  (__extension__ ({ \
    (graphene_simd4f_t) _mm_dp_ps ((a), (b), 0x7f); \
  }))
#  else
#   define graphene_simd4f_dot3(a,b) \
  (__extension__ ({ \
    const unsigned int __mask_bits[] GRAPHENE_ALIGN16 = { 0xffffffff, 0xffffffff, 0xffffffff, 0 }; \
    const graphene_simd4f_t __mask = _mm_load_ps ((const float *) __mask_bits); \
    const graphene_simd4f_t __m = _mm_mul_ps ((a), (b)); \
    const graphene_simd4f_t __s0 = _mm_and_ps (__m, __mask); \
    const graphene_simd4f_t __s1 = _mm_add_ps (__s0, _mm_movehl_ps (__s0, __s0)); \
    const graphene_simd4f_t __s2 = _mm_add_ss (__s1, _mm_shuffle_ps (__s1, __s1, 1)); \
    (graphene_simd4f_t) _mm_shuffle_ps (__s2, __s2, 0); \
  }))
#  endif
336 
/* Scalar form of graphene_simd4f_dot3(): return lane 0 of the dot product
 * vector as a float.
 */
#  define graphene_simd4f_dot3_scalar(a,b) \
  (__extension__ ({ \
    (float) _mm_cvtss_f32 (graphene_simd4f_dot3 ((a), (b))); \
  }))
343 
/* Lane-wise minimum and maximum of a and b, as computed by _mm_min_ps() and
 * _mm_max_ps().
 */
#  define graphene_simd4f_min(a,b) \
  (__extension__ ({ \
    (graphene_simd4f_t) _mm_min_ps ((a), (b)); \
  }))

#  define graphene_simd4f_max(a,b) \
  (__extension__ ({ \
    (graphene_simd4f_t) _mm_max_ps ((a), (b)); \
  }))
353 
/* Lane rotations; the suffix lists the source lane of each result lane in
 * x, y, z, w order. The shuffle intrinsic needs the source vector twice, so
 * v is captured in a local and evaluated only once.
 */
#  define graphene_simd4f_shuffle_wxyz(v) \
  (__extension__ ({ \
    const graphene_simd4f_t __shuf_wxyz_v = (v); \
    (graphene_simd4f_t) _mm_shuffle_ps (__shuf_wxyz_v, __shuf_wxyz_v, _MM_SHUFFLE (2, 1, 0, 3)); \
  }))

#  define graphene_simd4f_shuffle_zwxy(v) \
  (__extension__ ({ \
    const graphene_simd4f_t __shuf_zwxy_v = (v); \
    (graphene_simd4f_t) _mm_shuffle_ps (__shuf_zwxy_v, __shuf_zwxy_v, _MM_SHUFFLE (1, 0, 3, 2)); \
  }))

#  define graphene_simd4f_shuffle_yzwx(v) \
  (__extension__ ({ \
    const graphene_simd4f_t __shuf_yzwx_v = (v); \
    (graphene_simd4f_t) _mm_shuffle_ps (__shuf_yzwx_v, __shuf_yzwx_v, _MM_SHUFFLE (0, 3, 2, 1)); \
  }))
368 
/* Return v with the w lane replaced by 0.f. v is needed twice, so it is
 * captured in a local and evaluated only once.
 */
#  define graphene_simd4f_zero_w(v) \
  (__extension__ ({ \
    const graphene_simd4f_t __zero_w_v = (v); \
    const graphene_simd4f_t __zero_w_hi = _mm_unpackhi_ps (__zero_w_v, _mm_setzero_ps ()); \
    (graphene_simd4f_t) _mm_movelh_ps (__zero_w_v, __zero_w_hi); \
  }))

/* Return v with the z and w lanes replaced by 0.f. */
#  define graphene_simd4f_zero_zw(v) \
  (__extension__ ({ \
    (graphene_simd4f_t) _mm_movelh_ps ((v), _mm_setzero_ps ()); \
  }))

/* Return (s.x, s.y, s.z, v). s is needed twice, so it is captured in a local
 * and evaluated only once.
 */
#  define graphene_simd4f_merge_w(s,v) \
  (__extension__ ({ \
    const graphene_simd4f_t __merge_w_s = (s); \
    const graphene_simd4f_t __merge_w_hi = _mm_unpackhi_ps (__merge_w_s, _mm_set1_ps ((v))); \
    (graphene_simd4f_t) _mm_movelh_ps (__merge_w_s, __merge_w_hi); \
  }))

/* Return (a.z, a.w, b.z, b.w). */
#  define graphene_simd4f_merge_high(a,b) \
  (__extension__ ({ \
    (graphene_simd4f_t) _mm_movehl_ps ((b), (a)); \
  }))

/* Return (a.x, a.y, b.x, b.y). */
#  define graphene_simd4f_merge_low(a,b) \
  (__extension__ ({ \
    (graphene_simd4f_t) _mm_movelh_ps ((a), (b)); \
  }))
395 
/* Four 32-bit words that can be written as unsigned integers and read as
 * floats. The sign-mask constants in this section are initialized through
 * `ui` and loaded with _mm_load_ps() through `f`; the type is declared with
 * GRAPHENE_ALIGN16.
 */
typedef GRAPHENE_ALIGN16 union {
  unsigned int ui[4];
  float f[4];
} graphene_simd4f_uif_t;
400 
/* Negate the y and w lanes of v by XOR-ing their sign bits. */
#  define graphene_simd4f_flip_sign_0101(v) \
  (__extension__ ({ \
    const graphene_simd4f_uif_t __sign_yw = { { \
      0x00000000, 0x80000000, 0x00000000, 0x80000000 \
    } }; \
    const graphene_simd4f_t __flip_yw = _mm_load_ps (__sign_yw.f); \
    (graphene_simd4f_t) _mm_xor_ps ((v), __flip_yw); \
  }))

/* Negate the x and z lanes of v by XOR-ing their sign bits. */
#  define graphene_simd4f_flip_sign_1010(v) \
  (__extension__ ({ \
    const graphene_simd4f_uif_t __sign_xz = { { \
      0x80000000, 0x00000000, 0x80000000, 0x00000000 \
    } }; \
    const graphene_simd4f_t __flip_xz = _mm_load_ps (__sign_xz.f); \
    (graphene_simd4f_t) _mm_xor_ps ((v), __flip_xz); \
  }))
422 
/* Whole-vector comparisons. Each compare intrinsic yields an all-ones or
 * all-zero mask per lane; _mm_movemask_ps() packs the four lane sign bits
 * into the low bits of an int, so 0 means "no lane matched" and 0xf means
 * "every lane matched".
 */

/* True when no lane of a differs from the same lane of b. */
#  define graphene_simd4f_cmp_eq(a,b) \
  (__extension__ ({ \
    (bool) (_mm_movemask_ps (_mm_cmpneq_ps ((a), (b))) == 0); \
  }))

/* True when at least one lane of a differs from the same lane of b. */
#  define graphene_simd4f_cmp_neq(a,b) \
  (__extension__ ({ \
    (bool) (_mm_movemask_ps (_mm_cmpneq_ps ((a), (b))) != 0); \
  }))

/* True when a < b holds in all four lanes. */
#  define graphene_simd4f_cmp_lt(a,b) \
  (__extension__ ({ \
    (bool) (_mm_movemask_ps (_mm_cmplt_ps ((a), (b))) == 0xf); \
  }))

/* True when a <= b holds in all four lanes. */
#  define graphene_simd4f_cmp_le(a,b) \
  (__extension__ ({ \
    (bool) (_mm_movemask_ps (_mm_cmple_ps ((a), (b))) == 0xf); \
  }))

/* True when a >= b holds in all four lanes. */
#  define graphene_simd4f_cmp_ge(a,b) \
  (__extension__ ({ \
    (bool) (_mm_movemask_ps (_mm_cmpge_ps ((a), (b))) == 0xf); \
  }))

/* True when a > b holds in all four lanes. */
#  define graphene_simd4f_cmp_gt(a,b) \
  (__extension__ ({ \
    (bool) (_mm_movemask_ps (_mm_cmpgt_ps ((a), (b))) == 0xf); \
  }))
458 
/* Negate every lane of s by XOR-ing all four sign bits. */
#  define graphene_simd4f_neg(s) \
  (__extension__ ({ \
    const graphene_simd4f_uif_t __sign_all = { { \
      0x80000000, 0x80000000, 0x80000000, 0x80000000 \
    } }; \
    const graphene_simd4f_t __flip_all = _mm_load_ps (__sign_all.f); \
    (graphene_simd4f_t) _mm_xor_ps ((s), __flip_all); \
  }))
469 
470 /* On MSVC, we use static inlines */
471 # elif defined (_MSC_VER) /* Visual Studio SSE intrinsics */
472 
473 /* Use static inline to inline all these functions */
474 
475 #define graphene_simd4f_init(x,y,z,w) _simd4f_init(x,y,z,w)
476 
477 static inline graphene_simd4f_t
_simd4f_init(float x,float y,float z,float w)478 _simd4f_init (float x, float y, float z, float w)
479 {
480   graphene_simd4f_t __s = { x, y, z, w };
481   return __s;
482 }
483 
/* Return a vector with all four lanes set to zero. */
#define graphene_simd4f_init_zero() \
  _mm_setzero_ps()

/* Load four consecutive floats starting at v (unaligned load). */
#define graphene_simd4f_init_4f(v) \
  _mm_loadu_ps(v)

/* Load v[0..2] (or v[0..1]) and set the remaining lanes to 0.f. v is
 * parenthesized before it is subscripted so that pointer expressions such as
 * `p + 1` index the intended element. NOTE: v is still expanded once per
 * element read, so it must not have side effects.
 */
#define graphene_simd4f_init_3f(v) \
  graphene_simd4f_init ((v)[0], (v)[1], (v)[2], 0.f)

#define graphene_simd4f_init_2f(v) \
  graphene_simd4f_init ((v)[0], (v)[1], 0.f, 0.f)
495 
/* Copy the first 4, 3 or 2 lanes of s into the float array v. The 3- and
 * 2-lane forms take the address of s, so s must be an addressable object.
 */
#define graphene_simd4f_dup_4f(s,v) \
  _mm_storeu_ps ((v), (s))

#define graphene_simd4f_dup_3f(s,v) \
  memcpy ((v), &(s), sizeof (float) * 3)

#define graphene_simd4f_dup_2f(s,v) \
  memcpy ((v), &(s), sizeof (float) * 2)
504 
505 #define graphene_simd4f_get(s,i) _simd4f_get_xyzw(s, i)
506 #define graphene_simd4f_get_x(s) _simd4f_get_xyzw(s, 0)
507 #define graphene_simd4f_get_y(s) _simd4f_get_xyzw(s, 1)
508 #define graphene_simd4f_get_z(s) _simd4f_get_xyzw(s, 2)
509 #define graphene_simd4f_get_w(s) _simd4f_get_xyzw(s, 3)
510 
511 static inline float
_simd4f_get_xyzw(graphene_simd4f_t s,int mode)512 _simd4f_get_xyzw (graphene_simd4f_t s, int mode)
513 {
514   /* mode: get_x=0
515            get_y=1
516            get_z=2
517            get_w=3 */
518 
519   graphene_simd4f_union_t u;
520   u.s = s;
521   return u.f[mode];
522 }
523 
/* Replicate the scalar v into all four lanes. */
#define graphene_simd4f_splat(v) \
  _mm_set1_ps ((v))

/* Replicate one lane of v into all four lanes. NOTE: v is expanded twice, so
 * it must not have side effects.
 */
#define graphene_simd4f_splat_x(v) \
  _mm_shuffle_ps ((v), (v), _MM_SHUFFLE (0, 0, 0, 0))

#define graphene_simd4f_splat_y(v) \
  _mm_shuffle_ps ((v), (v), _MM_SHUFFLE (1, 1, 1, 1))

#define graphene_simd4f_splat_z(v) \
  _mm_shuffle_ps ((v), (v), _MM_SHUFFLE (2, 2, 2, 2))

#define graphene_simd4f_splat_w(v) \
  _mm_shuffle_ps ((v), (v), _MM_SHUFFLE (3, 3, 3, 3))

/* Lane-wise add, subtract, multiply, divide and square root. */
#define graphene_simd4f_add(a,b) \
  _mm_add_ps ((a), (b))

#define graphene_simd4f_sub(a,b) \
  _mm_sub_ps ((a), (b))

#define graphene_simd4f_mul(a,b) \
  _mm_mul_ps ((a), (b))

#define graphene_simd4f_div(a,b) \
  _mm_div_ps ((a), (b))

#define graphene_simd4f_sqrt(v) \
  _mm_sqrt_ps ((v))
553 
554 #define graphene_simd4f_reciprocal(v) _simd4f_reciprocal(v)
555 
556 static inline graphene_simd4f_t
_simd4f_reciprocal(const graphene_simd4f_t v)557 _simd4f_reciprocal(const graphene_simd4f_t v)
558 {
559   const graphene_simd4f_t __two = graphene_simd4f_init (2.0f, 2.0f, 2.0f, 2.0f);
560   graphene_simd4f_t __s = _mm_rcp_ps (v);
561   return graphene_simd4f_mul (__s,
562                               graphene_simd4f_sub (__two,
563                                                    graphene_simd4f_mul (v, __s)));
564 }
565 
566 #define graphene_simd4f_rsqrt(v) _simd4f_rsqrt(v)
567 
568 static inline graphene_simd4f_t
_simd4f_rsqrt(const graphene_simd4f_t v)569 _simd4f_rsqrt(const graphene_simd4f_t v)
570 {
571   const graphene_simd4f_t __half = graphene_simd4f_init (0.5f, 0.5f, 0.5f, 0.5f);
572   const graphene_simd4f_t __three = graphene_simd4f_init (3.0f, 3.0f, 3.0f, 3.0f);
573   graphene_simd4f_t __s = _mm_rsqrt_ps (v);
574   return graphene_simd4f_mul (graphene_simd4f_mul (__s, __half),
575                               graphene_simd4f_sub (__three,
576                                                    graphene_simd4f_mul (__s, graphene_simd4f_mul (v, __s))));
577 }
578 
579 #define graphene_simd4f_cross3(a,b) \
580   _simd4f_cross3(a,b)
581 
582 static inline graphene_simd4f_t
_simd4f_cross3(const graphene_simd4f_t a,const graphene_simd4f_t b)583 _simd4f_cross3 (const graphene_simd4f_t a,
584                 const graphene_simd4f_t b)
585 {
586   const graphene_simd4f_t __a_yzx = _mm_shuffle_ps (a, a, _MM_SHUFFLE (3, 0, 2, 1));
587   const graphene_simd4f_t __a_zxy = _mm_shuffle_ps (a, a, _MM_SHUFFLE (3, 1, 0, 2));
588   const graphene_simd4f_t __b_yzx = _mm_shuffle_ps (b, b, _MM_SHUFFLE (3, 0, 2, 1));
589   const graphene_simd4f_t __b_zxy = _mm_shuffle_ps (b, b, _MM_SHUFFLE (3, 1, 0, 2));
590 
591   return _mm_sub_ps (_mm_mul_ps (__a_yzx, __b_zxy), _mm_mul_ps (__a_zxy, __b_yzx));
592 }
593 
594 #define graphene_simd4f_dot3(a,b) \
595   _simd4f_dot3(a,b)
596 
597 static inline graphene_simd4f_t
_simd4f_dot3(const graphene_simd4f_t a,const graphene_simd4f_t b)598 _simd4f_dot3 (const graphene_simd4f_t a,
599               const graphene_simd4f_t b)
600 {
601 #if defined(GRAPHENE_USE_SSE4_1)
602   return _mm_dp_ps (a, b, 0x7f);
603 #else
604   GRAPHENE_ALIGN16 const unsigned int __mask_bits[] = { 0xffffffff, 0xffffffff, 0xffffffff, 0 };
605   const graphene_simd4f_t __mask = _mm_load_ps ((const float *) __mask_bits);
606   const graphene_simd4f_t __m = _mm_mul_ps ((a), (b));
607   const graphene_simd4f_t __s0 = _mm_and_ps (__m, __mask);
608   const graphene_simd4f_t __s1 = _mm_add_ps (__s0, _mm_movehl_ps (__s0, __s0));
609   const graphene_simd4f_t __s2 = _mm_add_ss (__s1, _mm_shuffle_ps (__s1, __s1, 1));
610 
611   return _mm_shuffle_ps (__s2, __s2, 0);
612 #endif
613 }
614 
615 #define graphene_simd4f_dot3_scalar(a,b) \
616   _simd4f_dot3_scalar(a,b)
617 
618 static inline float
_simd4f_dot3_scalar(const graphene_simd4f_t a,const graphene_simd4f_t b)619 _simd4f_dot3_scalar (const graphene_simd4f_t a,
620                      const graphene_simd4f_t b)
621 {
622   float __res;
623   _mm_store_ss (&__res, graphene_simd4f_dot3 (a, b));
624   return __res;
625 }
626 
/* Lane-wise minimum and maximum of a and b. */
#define graphene_simd4f_min(a,b) \
  _mm_min_ps ((a), (b))

#define graphene_simd4f_max(a,b) \
  _mm_max_ps ((a), (b))


/* Lane rotations; the suffix lists the source lane of each result lane in
 * x, y, z, w order. NOTE: v is expanded twice, so it must not have side
 * effects.
 */
#define graphene_simd4f_shuffle_wxyz(v) \
  _mm_shuffle_ps ((v), (v), _MM_SHUFFLE (2, 1, 0, 3))

#define graphene_simd4f_shuffle_zwxy(v) \
  _mm_shuffle_ps ((v), (v), _MM_SHUFFLE (1, 0, 3, 2))

#define graphene_simd4f_shuffle_yzwx(v) \
  _mm_shuffle_ps ((v), (v), _MM_SHUFFLE (0, 3, 2, 1))

/* Return v with w (zero_w) or z and w (zero_zw) replaced by 0.f. NOTE:
 * zero_w expands v twice.
 */
#define graphene_simd4f_zero_w(v) \
  _mm_movelh_ps ((v), _mm_unpackhi_ps ((v), _mm_setzero_ps ()))

#define graphene_simd4f_zero_zw(v) \
  _mm_movelh_ps ((v), _mm_setzero_ps ())

/* Return (s.x, s.y, s.z, v). NOTE: s is expanded twice. */
#define graphene_simd4f_merge_w(s,v) \
  _mm_movelh_ps ((s), _mm_unpackhi_ps ((s), _mm_set1_ps ((v))))

/* Return (a.z, a.w, b.z, b.w). */
#define graphene_simd4f_merge_high(a,b) \
  _mm_movehl_ps ((b), (a))

/* Return (a.x, a.y, b.x, b.y). */
#define graphene_simd4f_merge_low(a,b) \
  _mm_movelh_ps ((a), (b))
657 
/* Four 32-bit words that can be written as unsigned integers and read as
 * floats. The sign-mask constants in this section are initialized through
 * `ui` and loaded with _mm_load_ps() through `f`; the type is declared with
 * GRAPHENE_ALIGN16.
 */
typedef GRAPHENE_ALIGN16 union {
  unsigned int ui[4];
  float f[4];
} graphene_simd4f_uif_t;
662 
663 #define graphene_simd4f_flip_sign_0101(v) _simd4f_flip_sign_0101(v)
664 
665 static inline graphene_simd4f_t
_simd4f_flip_sign_0101(const graphene_simd4f_t v)666 _simd4f_flip_sign_0101 (const graphene_simd4f_t v)
667 {
668   const graphene_simd4f_uif_t __pnpn = { {
669     0x00000000,
670     0x80000000,
671     0x00000000,
672     0x80000000
673   } };
674 
675   return _mm_xor_ps (v, _mm_load_ps (__pnpn.f));
676 }
677 
678 #define graphene_simd4f_flip_sign_1010(v) _simd4f_flip_sign_1010(v)
679 
680 static inline graphene_simd4f_t
_simd4f_flip_sign_1010(const graphene_simd4f_t v)681 _simd4f_flip_sign_1010(const graphene_simd4f_t v)
682 {
683   const graphene_simd4f_uif_t __npnp = { {
684     0x80000000,
685     0x00000000,
686     0x80000000,
687     0x00000000,
688   } };
689 
690   return _mm_xor_ps (v, _mm_load_ps (__npnp.f));
691 }
692 
693 #define graphene_simd4f_cmp_eq(a,b) _simd4f_cmp_eq(a,b)
694 
695 static inline bool
_simd4f_cmp_eq(const graphene_simd4f_t a,const graphene_simd4f_t b)696 _simd4f_cmp_eq (const graphene_simd4f_t a,
697                         const graphene_simd4f_t b)
698 {
699   __m128i __res = _mm_castps_si128 (_mm_cmpneq_ps (a, b));
700   return (_mm_movemask_epi8 (__res) == 0);
701 }
702 
703 #define graphene_simd4f_cmp_neq(a,b) _simd4f_cmp_neq(a,b)
704 
705 static inline bool
_simd4f_cmp_neq(const graphene_simd4f_t a,const graphene_simd4f_t b)706 _simd4f_cmp_neq (const graphene_simd4f_t a,
707                          const graphene_simd4f_t b)
708 {
709   __m128i __res = _mm_castps_si128 (_mm_cmpneq_ps (a, b));
710   return (_mm_movemask_epi8 (__res) != 0);
711 }
712 
713 #define graphene_simd4f_cmp_lt(a,b) _simd4f_cmp_lt(a,b)
714 
715 static inline bool
_simd4f_cmp_lt(const graphene_simd4f_t a,const graphene_simd4f_t b)716 _simd4f_cmp_lt (const graphene_simd4f_t a,
717                 const graphene_simd4f_t b)
718 {
719   __m128i __res = _mm_castps_si128 (_mm_cmplt_ps (a, b));
720   return (_mm_movemask_epi8 (__res) == 0xffff);
721 }
722 
723 #define graphene_simd4f_cmp_le(a,b) _simd4f_cmp_le(a,b)
724 
725 static inline bool
_simd4f_cmp_le(const graphene_simd4f_t a,const graphene_simd4f_t b)726 _simd4f_cmp_le (const graphene_simd4f_t a,
727                 const graphene_simd4f_t b)
728 {
729   __m128i __res = _mm_castps_si128 (_mm_cmple_ps (a, b));
730   return (_mm_movemask_epi8 (__res) == 0xffff);
731 }
732 
733 #define graphene_simd4f_cmp_ge(a,b) _simd4f_cmp_ge(a,b)
734 
735 static inline bool
_simd4f_cmp_ge(const graphene_simd4f_t a,const graphene_simd4f_t b)736 _simd4f_cmp_ge (const graphene_simd4f_t a,
737                 const graphene_simd4f_t b)
738 {
739   __m128i __res = _mm_castps_si128 (_mm_cmpge_ps (a, b));
740   return (_mm_movemask_epi8 (__res) == 0xffff);
741 }
742 
743 #define graphene_simd4f_cmp_gt(a,b) _simd4f_cmp_gt(a,b)
744 
745 static inline bool
_simd4f_cmp_gt(const graphene_simd4f_t a,const graphene_simd4f_t b)746 _simd4f_cmp_gt (const graphene_simd4f_t a,
747                 const graphene_simd4f_t b)
748 {
749   __m128i __res = _mm_castps_si128 (_mm_cmpgt_ps (a, b));
750   return (_mm_movemask_epi8 (__res) == 0xffff);
751 }
752 
753 #define graphene_simd4f_neg(s) _simd4f_neg(s)
754 
755 static inline graphene_simd4f_t
_simd4f_neg(const graphene_simd4f_t s)756 _simd4f_neg (const graphene_simd4f_t s)
757 {
758   const graphene_simd4f_uif_t __mask = { {
759     0x80000000,
760     0x80000000,
761     0x80000000,
762     0x80000000,
763   } };
764 
765   return _mm_xor_ps (s, _mm_load_ps (__mask.f));
766 }
767 
768 #else /* SSE intrinsics-not GCC or Visual Studio */
769 
770 #  error "Need GCC-compatible or Visual Studio compiler for SSE extensions."
771 
772 /* Use static inline to inline all these functions */
773 
774 # endif /* !__GNUC__ && !_MSC_VER */
775 
776 #elif !defined(__GI_SCANNER__) && defined(GRAPHENE_USE_GCC)
777 
778 /* GCC vector intrinsic implementation of SIMD 4f */
779 
/* 16-byte vector of int, declared with the GCC vector_size attribute. */
typedef int graphene_simd4i_t __attribute__((vector_size (16)));
781 
/* Build a vector whose lanes are, in order, x, y, z and w. */
# define graphene_simd4f_init(x,y,z,w) \
  (__extension__ ({ \
    (graphene_simd4f_t) { (x), (y), (z), (w) }; \
  }))

/* Return a vector with all four lanes set to zero. */
# define graphene_simd4f_init_zero() \
  (__extension__ ({ \
    (graphene_simd4f_t) { 0.f, 0.f, 0.f, 0.f }; \
  }))

/* Load 4, 3 or 2 floats from v and set the remaining lanes to 0.f. The
 * pointer expression is captured in a local so it is evaluated exactly once,
 * as it would be for the function declared in the prototype.
 */
# define graphene_simd4f_init_4f(v) \
  (__extension__ ({ \
    const float *__init4f_v = (v); \
    (graphene_simd4f_t) { __init4f_v[0], __init4f_v[1], __init4f_v[2], __init4f_v[3] }; \
  }))

# define graphene_simd4f_init_3f(v) \
  (__extension__ ({ \
    const float *__init3f_v = (v); \
    (graphene_simd4f_t) { __init3f_v[0], __init3f_v[1], __init3f_v[2], 0.f }; \
  }))

# define graphene_simd4f_init_2f(v) \
  (__extension__ ({ \
    const float *__init2f_v = (v); \
    (graphene_simd4f_t) { __init2f_v[0], __init2f_v[1], 0.f, 0.f }; \
  }))
806 
/* Copy the first 4, 3 or 2 lanes of s into the float array v. s is copied
 * into a local first so that any vector expression is accepted, not only an
 * addressable lvalue; the prototype takes s by value.
 */
# define graphene_simd4f_dup_4f(s,v) \
  (__extension__ ({ \
    const graphene_simd4f_t __dup4f_s = (s); \
    memcpy ((v), &__dup4f_s, sizeof (float) * 4); \
  }))

# define graphene_simd4f_dup_3f(s,v) \
  (__extension__ ({ \
    const graphene_simd4f_t __dup3f_s = (s); \
    memcpy ((v), &__dup3f_s, sizeof (float) * 3); \
  }))

# define graphene_simd4f_dup_2f(s,v) \
  (__extension__ ({ \
    const graphene_simd4f_t __dup2f_s = (s); \
    memcpy ((v), &__dup2f_s, sizeof (float) * 2); \
  }))
821 
/* Read lane i of s as a float by subscripting the vector directly; i is not
 * range-checked. The _x/_y/_z/_w forms read lanes 0 to 3.
 */
# define graphene_simd4f_get(s,i)       (__extension__ ({ (float) (s)[(i)]; }))
# define graphene_simd4f_get_x(s)       graphene_simd4f_get ((s), 0)
# define graphene_simd4f_get_y(s)       graphene_simd4f_get ((s), 1)
# define graphene_simd4f_get_z(s)       graphene_simd4f_get ((s), 2)
# define graphene_simd4f_get_w(s)       graphene_simd4f_get ((s), 3)
827 
/* Replicate the scalar v into all four lanes. v is stored in a local first
 * so that it is evaluated once rather than once per lane.
 */
# define graphene_simd4f_splat(v) \
  (__extension__ ({ \
    const float __splat_f = (v); \
    (graphene_simd4f_t) { __splat_f, __splat_f, __splat_f, __splat_f }; \
  }))

/* Replicate one lane of v into all four lanes. */
# define graphene_simd4f_splat_x(v) \
  (__extension__ ({ \
    float __val = graphene_simd4f_get_x ((v)); \
    (graphene_simd4f_t) { __val, __val, __val, __val }; \
  }))

# define graphene_simd4f_splat_y(v) \
  (__extension__ ({ \
    float __val = graphene_simd4f_get_y ((v)); \
    (graphene_simd4f_t) { __val, __val, __val, __val }; \
  }))

# define graphene_simd4f_splat_z(v) \
  (__extension__ ({ \
    float __val = graphene_simd4f_get_z ((v)); \
    (graphene_simd4f_t) { __val, __val, __val, __val }; \
  }))

# define graphene_simd4f_splat_w(v) \
  (__extension__ ({ \
    float __val = graphene_simd4f_get_w ((v)); \
    (graphene_simd4f_t) { __val, __val, __val, __val }; \
  }))
856 
/* Lane-wise reciprocal: each lane becomes 1 / lane, except that lanes equal
 * to zero yield 0.f instead of dividing by zero.
 * v is evaluated exactly once.
 */
# define graphene_simd4f_reciprocal(v) \
  (__extension__ ({ \
    const graphene_simd4f_t __rcp_in = (v); \
    (graphene_simd4f_t) { \
      __rcp_in[0] != 0.f ? 1.f / __rcp_in[0] : 0.f, \
      __rcp_in[1] != 0.f ? 1.f / __rcp_in[1] : 0.f, \
      __rcp_in[2] != 0.f ? 1.f / __rcp_in[2] : 0.f, \
      __rcp_in[3] != 0.f ? 1.f / __rcp_in[3] : 0.f, \
    }; \
  }))

/* Lane-wise square root computed with sqrtf().
 * v is evaluated exactly once.
 */
# define graphene_simd4f_sqrt(v) \
  (__extension__ ({ \
    const graphene_simd4f_t __sqrt_in = (v); \
    (graphene_simd4f_t) { \
      sqrtf (__sqrt_in[0]), \
      sqrtf (__sqrt_in[1]), \
      sqrtf (__sqrt_in[2]), \
      sqrtf (__sqrt_in[3]), \
    }; \
  }))

/* Lane-wise reciprocal square root: 1 / sqrtf (lane), with lanes equal to
 * zero mapped to 0.f.
 * v is evaluated exactly once.
 */
# define graphene_simd4f_rsqrt(v) \
  (__extension__ ({ \
    const graphene_simd4f_t __rsqrt_in = (v); \
    (graphene_simd4f_t) { \
      __rsqrt_in[0] != 0.f ? 1.f / sqrtf (__rsqrt_in[0]) : 0.f, \
      __rsqrt_in[1] != 0.f ? 1.f / sqrtf (__rsqrt_in[1]) : 0.f, \
      __rsqrt_in[2] != 0.f ? 1.f / sqrtf (__rsqrt_in[2]) : 0.f, \
      __rsqrt_in[3] != 0.f ? 1.f / sqrtf (__rsqrt_in[3]) : 0.f, \
    }; \
  }))
886 
/* Lane-wise arithmetic, expressed with the GCC vector operators. */

/* a + b */
# define graphene_simd4f_add(a,b) \
  (__extension__ ({ \
    (graphene_simd4f_t) ((a) + (b)); \
  }))

/* a - b */
# define graphene_simd4f_sub(a,b) \
  (__extension__ ({ \
    (graphene_simd4f_t) ((a) - (b)); \
  }))

/* a * b */
# define graphene_simd4f_mul(a,b) \
  (__extension__ ({ \
    (graphene_simd4f_t) ((a) * (b)); \
  }))

/* a / b */
# define graphene_simd4f_div(a,b) \
  (__extension__ ({ \
    (graphene_simd4f_t) ((a) / (b)); \
  }))
891 
/* Cross product of the x, y, z lanes of a and b; the w lane of the result
 * is 0.f.
 */
# define graphene_simd4f_cross3(a,b) \
  (__extension__ ({ \
    const graphene_simd4f_t __lhs = (a); \
    const graphene_simd4f_t __rhs = (b); \
    graphene_simd4f_init (__lhs[1] * __rhs[2] - __lhs[2] * __rhs[1], \
                          __lhs[2] * __rhs[0] - __lhs[0] * __rhs[2], \
                          __lhs[0] * __rhs[1] - __lhs[1] * __rhs[0], \
                          0.f); \
  }))

/* Dot product of the x, y, z lanes of a and b, replicated in every lane of
 * the returned vector.
 */
# define graphene_simd4f_dot3(a,b) \
  (__extension__ ({ \
    const graphene_simd4f_t __lhs = (a); \
    const graphene_simd4f_t __rhs = (b); \
    const float __dot = __lhs[0] * __rhs[0] + __lhs[1] * __rhs[1] + __lhs[2] * __rhs[2]; \
    graphene_simd4f_init (__dot, __dot, __dot, __dot); \
  }))

/* Same as graphene_simd4f_dot3(), but returns the result as a float. */
# define graphene_simd4f_dot3_scalar(a,b) \
  (__extension__ ({ \
    const graphene_simd4f_t __dot3_vec = graphene_simd4f_dot3 (a, b); \
    graphene_simd4f_get_x (__dot3_vec); \
  }))
914 
/* Lane-wise minimum: each lane is a's value when it is strictly smaller
 * than b's, otherwise b's.
 */
# define graphene_simd4f_min(a,b) \
  (__extension__ ({ \
    const graphene_simd4f_t __lhs = (a); \
    const graphene_simd4f_t __rhs = (b); \
    graphene_simd4f_init (__lhs[0] < __rhs[0] ? __lhs[0] : __rhs[0], \
                          __lhs[1] < __rhs[1] ? __lhs[1] : __rhs[1], \
                          __lhs[2] < __rhs[2] ? __lhs[2] : __rhs[2], \
                          __lhs[3] < __rhs[3] ? __lhs[3] : __rhs[3]); \
  }))

/* Lane-wise maximum: each lane is a's value when it is strictly greater
 * than b's, otherwise b's.
 */
# define graphene_simd4f_max(a,b) \
  (__extension__ ({ \
    const graphene_simd4f_t __lhs = (a); \
    const graphene_simd4f_t __rhs = (b); \
    graphene_simd4f_init (__lhs[0] > __rhs[0] ? __lhs[0] : __rhs[0], \
                          __lhs[1] > __rhs[1] ? __lhs[1] : __rhs[1], \
                          __lhs[2] > __rhs[2] ? __lhs[2] : __rhs[2], \
                          __lhs[3] > __rhs[3] ? __lhs[3] : __rhs[3]); \
  }))
934 
/* Lane permutations. Each selector lists, per output lane, the index of the
 * input lane to copy.
 */

/* (x, y, z, w) -> (w, x, y, z) */
# define graphene_simd4f_shuffle_wxyz(v) \
  (__extension__ ({ \
    const graphene_simd4i_t __order = { 3, 0, 1, 2 }; \
    (graphene_simd4f_t) __builtin_shuffle ((v), __order); \
  }))

/* (x, y, z, w) -> (z, w, x, y) */
# define graphene_simd4f_shuffle_zwxy(v) \
  (__extension__ ({ \
    const graphene_simd4i_t __order = { 2, 3, 0, 1 }; \
    (graphene_simd4f_t) __builtin_shuffle ((v), __order); \
  }))

/* (x, y, z, w) -> (y, z, w, x) */
# define graphene_simd4f_shuffle_yzwx(v) \
  (__extension__ ({ \
    const graphene_simd4i_t __order = { 1, 2, 3, 0 }; \
    (graphene_simd4f_t) __builtin_shuffle ((v), __order); \
  }))
952 
/* Two-operand shuffles: selector values 0-3 pick a lane from the first
 * operand, values 4-7 pick lane (value - 4) from the second operand.
 */

/* Returns v with the w lane replaced by 0.f. */
# define graphene_simd4f_zero_w(v) \
  (__extension__ ({ \
    const graphene_simd4f_t __zeros = graphene_simd4f_init_zero (); \
    const graphene_simd4i_t __pick = { 0, 1, 2, 4 }; \
    (graphene_simd4f_t) __builtin_shuffle ((v), __zeros, __pick); \
  }))

/* Returns v with the z and w lanes replaced by 0.f. */
# define graphene_simd4f_zero_zw(v) \
  (__extension__ ({ \
    const graphene_simd4f_t __zeros = graphene_simd4f_init_zero (); \
    const graphene_simd4i_t __pick = { 0, 1, 4, 4 }; \
    (graphene_simd4f_t) __builtin_shuffle ((v), __zeros, __pick); \
  }))

/* Returns s with the w lane replaced by the scalar v. */
# define graphene_simd4f_merge_w(s,v) \
  (__extension__ ({ \
    const graphene_simd4i_t __pick = { 0, 1, 2, 4 }; \
    (graphene_simd4f_t) __builtin_shuffle ((s), graphene_simd4f_splat ((v)), __pick); \
  }))

/* Returns (a.z, a.w, b.z, b.w). */
# define graphene_simd4f_merge_high(a,b) \
  (__extension__ ({ \
    const graphene_simd4i_t __pick = { 2, 3, 6, 7 }; \
    (graphene_simd4f_t) __builtin_shuffle ((a), (b), __pick); \
  }))

/* Returns (a.x, a.y, b.x, b.y). */
# define graphene_simd4f_merge_low(a,b) \
  (__extension__ ({ \
    const graphene_simd4i_t __pick = { 0, 1, 4, 5 }; \
    (graphene_simd4f_t) __builtin_shuffle ((a), (b), __pick); \
  }))
982 
/* Negates the y and w lanes of v; x and z are left untouched. */
# define graphene_simd4f_flip_sign_0101(v) \
  (__extension__ ({ \
    const graphene_simd4f_t __src = (v); \
    graphene_simd4f_init (__src[0], -__src[1], __src[2], -__src[3]); \
  }))

/* Negates the x and z lanes of v; y and w are left untouched. */
# define graphene_simd4f_flip_sign_1010(v) \
  (__extension__ ({ \
    const graphene_simd4f_t __src = (v); \
    graphene_simd4f_init (-__src[0], __src[1], -__src[2], __src[3]); \
  }))
994 
/* Whole-vector comparisons.
 *
 * A GCC vector comparison yields, per lane, 0 when the relation is false and
 * -1 (all bits set) when it is true; AND-ing the four lanes together is
 * therefore non-zero only when the relation holds for every lane.
 */

/* true when every lane of a equals the matching lane of b. */
# define graphene_simd4f_cmp_eq(a,b) \
  (__extension__ ({ \
    const graphene_simd4i_t __lanes = (a) == (b); \
    (bool) ((__lanes[0] & __lanes[1] & __lanes[2] & __lanes[3]) != 0); \
  }))

/* true when at least one lane of a differs from the matching lane of b. */
# define graphene_simd4f_cmp_neq(a,b) (!graphene_simd4f_cmp_eq (a,b))

/* true when every lane of a is less than the matching lane of b. */
# define graphene_simd4f_cmp_lt(a,b) \
  (__extension__ ({ \
    const graphene_simd4i_t __lanes = (a) < (b); \
    (bool) ((__lanes[0] & __lanes[1] & __lanes[2] & __lanes[3]) != 0); \
  }))

/* true when every lane of a is less than or equal to the matching lane of b. */
# define graphene_simd4f_cmp_le(a,b) \
  (__extension__ ({ \
    const graphene_simd4i_t __lanes = (a) <= (b); \
    (bool) ((__lanes[0] & __lanes[1] & __lanes[2] & __lanes[3]) != 0); \
  }))

/* true when every lane of a is greater than or equal to the matching lane of b. */
# define graphene_simd4f_cmp_ge(a,b) \
  (__extension__ ({ \
    const graphene_simd4i_t __lanes = (a) >= (b); \
    (bool) ((__lanes[0] & __lanes[1] & __lanes[2] & __lanes[3]) != 0); \
  }))

/* true when every lane of a is greater than the matching lane of b. */
# define graphene_simd4f_cmp_gt(a,b) \
  (__extension__ ({ \
    const graphene_simd4i_t __lanes = (a) > (b); \
    (bool) ((__lanes[0] & __lanes[1] & __lanes[2] & __lanes[3]) != 0); \
  }))
1041 
/* Negates every lane of s by multiplying the vector by -1.f. */
# define graphene_simd4f_neg(s) \
  (__extension__ ({ \
    const graphene_simd4f_t __negate = graphene_simd4f_splat (-1.f); \
    const graphene_simd4f_t __operand = (s); \
    graphene_simd4f_mul (__operand, __negate); \
  }))
1048 
1049 #elif !defined(__GI_SCANNER__) && defined(GRAPHENE_USE_ARM_NEON)
1050 
1051 /* ARM Neon implementation of SIMD4f */
1052 
1053 /* Union type used for single lane reading without memcpy */
1054 typedef union {
1055   graphene_simd4f_t s;
1056   float f[4];
1057 } graphene_simd4f_union_t;
1058 
1059 /* NEON has optimised 2-lanes vectors we can use */
1060 typedef float32x2_t graphene_simd2f_t;
1061 
1062 #ifdef __GNUC__
/* Builds a vector from four scalars by loading them from a temporary array. */
# define graphene_simd4f_init(x,y,z,w) \
  (__extension__ ({ \
    const float32_t __init_lanes[4] = { (x), (y), (z), (w) }; \
    (graphene_simd4f_t) vld1q_f32 (__init_lanes); \
  }))

/* Builds a vector with all four lanes set to 0.f. */
# define graphene_simd4f_init_zero() \
  (__extension__ ({ \
    const float32_t __zero_lane = 0.f; \
    (graphene_simd4f_t) vdupq_n_f32 (__zero_lane); \
  }))
1073 
/* Loads the four floats at v into a vector. */
# define graphene_simd4f_init_4f(v) \
  (__extension__ ({ \
    const float32_t *__v32 = (const float32_t *) (v); \
    (graphene_simd4f_t) vld1q_f32 (__v32); \
  }))

/* Loads the three floats at v into the x, y and z lanes; w is set to 0.f.
 * The argument is captured once in a local pointer, so it may be any
 * pointer expression (e.g. `p + 1`) and is evaluated a single time.
 */
# define graphene_simd4f_init_3f(v) \
  (__extension__ ({ \
    const float *__init3_src = (v); \
    graphene_simd4f_init (__init3_src[0], __init3_src[1], __init3_src[2], 0.f); \
  }))

/* Loads the two floats at v into the x and y lanes; z and w are set to 0.f. */
# define graphene_simd4f_init_2f(v) \
  (__extension__ ({ \
    const float32_t *__v32 = (const float32_t *) (v); \
    const graphene_simd2f_t __low = vld1_f32 (__v32); \
    const float32_t __zero = 0; \
    const graphene_simd2f_t __high = vld1_dup_f32 (&__zero); \
    (graphene_simd4f_t) vcombine_f32 (__low, __high); \
  }))
1093 
/* Stores all four lanes of s into the float array v. */
# define graphene_simd4f_dup_4f(s,v) \
  (__extension__ ({ \
    vst1q_f32 ((float32_t *) (v), (s)); \
  }))

/* Stores the x, y and z lanes of s into the first three floats of v.
 * Both arguments are captured in locals so each is evaluated exactly once.
 */
# define graphene_simd4f_dup_3f(s,v) \
  (__extension__ ({ \
    float *__dup3_dst = (v); \
    const graphene_simd4f_t __dup3_src = (s); \
    vst1q_lane_f32 (__dup3_dst++, __dup3_src, 0); \
    vst1q_lane_f32 (__dup3_dst++, __dup3_src, 1); \
    vst1q_lane_f32 (__dup3_dst, __dup3_src, 2); \
  }))

/* Stores the x and y lanes of s into the first two floats of v. */
# define graphene_simd4f_dup_2f(s,v) \
  (__extension__ ({ \
    const graphene_simd2f_t __low = vget_low_f32 ((s)); \
    vst1_f32 ((float32_t *) (v), __low); \
  }))
1112 
/* Reads lane i of s as a float. */
# define graphene_simd4f_get(s,i) \
  (__extension__ ({ \
    const float __lane_val = (float) vgetq_lane_f32 ((s), (i)); \
    __lane_val; \
  }))

/* Builds a vector with all four lanes set to the scalar v. */
# define graphene_simd4f_splat(v) \
  (__extension__ ({ \
    (graphene_simd4f_t) vdupq_n_f32 ((v)); \
  }))

/* Broadcast a single lane of s to all four lanes. */
# define graphene_simd4f_splat_x(s) \
  (__extension__ ({ \
    const float __lane_x = graphene_simd4f_get_x ((s)); \
    graphene_simd4f_splat (__lane_x); \
  }))

# define graphene_simd4f_splat_y(s) \
  (__extension__ ({ \
    const float __lane_y = graphene_simd4f_get_y ((s)); \
    graphene_simd4f_splat (__lane_y); \
  }))

# define graphene_simd4f_splat_z(s) \
  (__extension__ ({ \
    const float __lane_z = graphene_simd4f_get_z ((s)); \
    graphene_simd4f_splat (__lane_z); \
  }))

# define graphene_simd4f_splat_w(s) \
  (__extension__ ({ \
    const float __lane_w = graphene_simd4f_get_w ((s)); \
    graphene_simd4f_splat (__lane_w); \
  }))
1142 
/* Lane-wise reciprocal: starts from the vrecpeq_f32() estimate and refines
 * it with two vrecpsq_f32() steps.
 * s is captured in a local so it is evaluated exactly once.
 */
# define graphene_simd4f_reciprocal(s) \
  (__extension__ ({ \
    const graphene_simd4f_t __rcp_in = (s); \
    graphene_simd4f_t __est = vrecpeq_f32 (__rcp_in); \
    __est = vmulq_f32 (vrecpsq_f32 (__est, __rcp_in), __est); \
    (graphene_simd4f_t) vmulq_f32 (vrecpsq_f32 (__est, __rcp_in), __est); \
  }))
1149 
/* Lane-wise a + b. */
# define graphene_simd4f_add(a,b) \
  (__extension__ ({ (graphene_simd4f_t) vaddq_f32 ((a), (b)); }))

/* Lane-wise a - b. */
# define graphene_simd4f_sub(a,b) \
  (__extension__ ({ (graphene_simd4f_t) vsubq_f32 ((a), (b)); }))

/* Lane-wise a * b. */
# define graphene_simd4f_mul(a,b) \
  (__extension__ ({ (graphene_simd4f_t) vmulq_f32 ((a), (b)); }))

/* Lane-wise a / b, computed as a multiplied by the reciprocal of b. */
# define graphene_simd4f_div(a,b) \
  (__extension__ ({ \
    const graphene_simd4f_t __inv_b = graphene_simd4f_reciprocal ((b)); \
    (graphene_simd4f_t) vmulq_f32 ((a), __inv_b); \
  }))
1170 
/* One refinement step of a reciprocal square root estimate of v, using
 * vrsqrtsq_f32(). Both arguments are captured in locals so each is
 * evaluated exactly once.
 */
# define _simd4f_rsqrt_iter(v,estimate) \
  (__extension__ ({ \
    const graphene_simd4f_t __iter_v = (v); \
    const graphene_simd4f_t __iter_est = (estimate); \
    const graphene_simd4f_t __est1 = vmulq_f32 (__iter_est, __iter_v); \
    (graphene_simd4f_t) vmulq_f32 (__iter_est, vrsqrtsq_f32 (__est1, __iter_est)); \
  }))

/* Lane-wise reciprocal square root: the vrsqrteq_f32() estimate refined by
 * three _simd4f_rsqrt_iter() steps. s is evaluated exactly once.
 */
# define graphene_simd4f_rsqrt(s) \
  (__extension__ ({ \
    const graphene_simd4f_t __rsqrt_in = (s); \
    graphene_simd4f_t __estimate = vrsqrteq_f32 (__rsqrt_in); \
    __estimate = _simd4f_rsqrt_iter (__rsqrt_in, __estimate); \
    __estimate = _simd4f_rsqrt_iter (__rsqrt_in, __estimate); \
    _simd4f_rsqrt_iter (__rsqrt_in, __estimate); \
  }))
1184 
/* Lane-wise square root, computed as the reciprocal of the reciprocal
 * square root. The result is AND-ed with a mask that is all-ones only for
 * lanes whose bit pattern is non-zero, so a +0.0 input lane yields 0.
 * s is captured in a local so it is evaluated exactly once.
 */
# define graphene_simd4f_sqrt(s) \
  (__extension__ ({ \
    const graphene_simd4f_t __sqrt_in = (s); \
    graphene_simd4f_t __rsq = graphene_simd4f_rsqrt (__sqrt_in); \
    graphene_simd4f_t __rrsq = graphene_simd4f_reciprocal (__rsq); \
    uint32x4_t __tmp = vreinterpretq_u32_f32 (__sqrt_in); \
    (graphene_simd4f_t) vreinterpretq_f32_u32 (vandq_u32 (vtstq_u32 (__tmp, __tmp), vreinterpretq_u32_f32 (__rrsq))); \
  }))
1192 
/* Cross product of the x, y, z lanes of a and b; w of the result is zero.
 *
 * The products are first formed in (z, x, y) order from the operands rotated
 * to (y, z, x), then rotated once more into (x, y, z) order; the final AND
 * clears the w lane.
 */
# define graphene_simd4f_cross3(a,b) \
  (__extension__ ({ \
    const graphene_simd4f_t __lhs = (a), __rhs = (b); \
    const uint32_t __keep_xyz[] = { 0xffffffff, 0xffffffff, 0xffffffff, 0 }; \
    const int32x4_t __xyz_mask = vld1q_s32 ((const int32_t *) __keep_xyz); \
    const graphene_simd2f_t __lhs_xy = vget_low_f32 (__lhs); \
    const graphene_simd2f_t __rhs_xy = vget_low_f32 (__rhs); \
    const graphene_simd4f_t __lhs_yzx = vcombine_f32 (vext_f32 (__lhs_xy, vget_high_f32 (__lhs), 1), __lhs_xy); \
    const graphene_simd4f_t __rhs_yzx = vcombine_f32 (vext_f32 (__rhs_xy, vget_high_f32 (__rhs), 1), __rhs_xy); \
    const graphene_simd4f_t __zxy = graphene_simd4f_sub (graphene_simd4f_mul (__rhs_yzx, __lhs), \
                                                         graphene_simd4f_mul (__lhs_yzx, __rhs)); \
    const graphene_simd2f_t __zxy_low = vget_low_f32 (__zxy); \
    const graphene_simd4f_t __xyz = vcombine_f32 (vext_f32 (__zxy_low, vget_high_f32 (__zxy), 1), __zxy_low); \
    (graphene_simd4f_t) vandq_s32 ((int32x4_t) __xyz, __xyz_mask); \
  }))
1208 
/* Dot product of the x, y, z lanes of a and b, replicated in every lane. */
# define graphene_simd4f_dot3(a,b) \
  (__extension__ ({ \
    const float __dot = graphene_simd4f_dot3_scalar (a, b); \
    graphene_simd4f_splat (__dot); \
  }))

/* Dot product of the x, y, z lanes of a and b, returned as a float.
 * The pairwise add produces x + y, to which the z product is added.
 */
# define graphene_simd4f_dot3_scalar(a,b) \
  (__extension__ ({ \
    const graphene_simd4f_t __prod = graphene_simd4f_mul (a, b); \
    const graphene_simd2f_t __prod_xy = vget_low_f32 (__prod); \
    const graphene_simd2f_t __sum_xy = vpadd_f32 (__prod_xy, __prod_xy); \
    const graphene_simd2f_t __sum_xyz = vadd_f32 (__sum_xy, vget_high_f32 (__prod)); \
    (float) vget_lane_f32 (__sum_xyz, 0); \
  }))
1220 
/* Lane-wise minimum of a and b. */
# define graphene_simd4f_min(a,b) \
  (__extension__ ({ (graphene_simd4f_t) vminq_f32 ((a), (b)); }))

/* Lane-wise maximum of a and b. */
# define graphene_simd4f_max(a,b) \
  (__extension__ ({ (graphene_simd4f_t) vmaxq_f32 ((a), (b)); }))
1230 
/* Lane permutations: the vector is viewed as a float array through the
 * union and rebuilt with the lanes in the requested order.
 */

/* (x, y, z, w) -> (w, x, y, z) */
# define graphene_simd4f_shuffle_wxyz(v) \
  (__extension__ ({ \
    graphene_simd4f_union_t __src; \
    __src.s = (v); \
    graphene_simd4f_init (__src.f[3], __src.f[0], __src.f[1], __src.f[2]); \
  }))

/* (x, y, z, w) -> (z, w, x, y) */
# define graphene_simd4f_shuffle_zwxy(v) \
  (__extension__ ({ \
    graphene_simd4f_union_t __src; \
    __src.s = (v); \
    graphene_simd4f_init (__src.f[2], __src.f[3], __src.f[0], __src.f[1]); \
  }))

/* (x, y, z, w) -> (y, z, w, x) */
# define graphene_simd4f_shuffle_yzwx(v) \
  (__extension__ ({ \
    graphene_simd4f_union_t __src; \
    __src.s = (v); \
    graphene_simd4f_init (__src.f[1], __src.f[2], __src.f[3], __src.f[0]); \
  }))
1248 
/* Returns v with the w lane replaced by 0.f. */
# define graphene_simd4f_zero_w(v) \
  (__extension__ ({ \
    graphene_simd4f_union_t __src; \
    __src.s = (v); \
    graphene_simd4f_init (__src.f[0], __src.f[1], __src.f[2], 0.f); \
  }))

/* Returns v with the z and w lanes replaced by 0.f. */
# define graphene_simd4f_zero_zw(v) \
  (__extension__ ({ \
    graphene_simd4f_union_t __src; \
    __src.s = (v); \
    graphene_simd4f_init (__src.f[0], __src.f[1], 0.f, 0.f); \
  }))

/* Returns s with the w lane replaced by the scalar v. */
# define graphene_simd4f_merge_w(s,v) \
  (__extension__ ({ \
    graphene_simd4f_union_t __src; \
    __src.s = (s); \
    graphene_simd4f_init (__src.f[0], __src.f[1], __src.f[2], (v)); \
  }))
1266 
/* Returns (a.z, a.w, b.z, b.w): the upper halves of a and b, joined. */
# define graphene_simd4f_merge_high(a,b) \
  (__extension__ ({ \
    const graphene_simd2f_t __a_zw = vget_high_f32 ((a)); \
    const graphene_simd2f_t __b_zw = vget_high_f32 ((b)); \
    (graphene_simd4f_t) vcombine_f32 (__a_zw, __b_zw); \
  }))

/* Returns (a.x, a.y, b.x, b.y): the lower halves of a and b, joined. */
# define graphene_simd4f_merge_low(a,b) \
  (__extension__ ({ \
    const graphene_simd2f_t __a_xy = vget_low_f32 ((a)); \
    const graphene_simd2f_t __b_xy = vget_low_f32 ((b)); \
    (graphene_simd4f_t) vcombine_f32 (__a_xy, __b_xy); \
  }))
1280 
/* Flips the sign bit of the y and w lanes of s by XOR-ing with a mask.
 * The mask array is declared as uint32_t so that it matches the pointer
 * type expected by vld1q_u32() on every target.
 */
# define graphene_simd4f_flip_sign_0101(s) \
  (__extension__ ({ \
    const uint32_t __upnpn[4] = { \
      0x00000000, \
      0x80000000, \
      0x00000000, \
      0x80000000 \
    }; \
    const uint32x4_t __pnpn = vld1q_u32 (__upnpn); \
    (graphene_simd4f_t) vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 ((s)), __pnpn)); \
  }))

/* Flips the sign bit of the x and z lanes of s by XOR-ing with a mask. */
# define graphene_simd4f_flip_sign_1010(s) \
  (__extension__ ({ \
    const uint32_t __unpnp[4] = { \
      0x80000000, \
      0x00000000, \
      0x80000000, \
      0x00000000 \
    }; \
    const uint32x4_t __npnp = vld1q_u32 (__unpnp); \
    (graphene_simd4f_t) vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 ((s)), __npnp)); \
  }))
1304 
/* true when every lane of the comparison mask is non-zero, i.e. when the
 * lane-wise relation that produced the mask holds for all four lanes.
 */
# define _simd4f_all_lanes_set(mask) \
  (__extension__ ({ \
    const uint32x4_t __cmp_lanes = (mask); \
    (bool) (vgetq_lane_u32 (__cmp_lanes, 0) != 0 && \
            vgetq_lane_u32 (__cmp_lanes, 1) != 0 && \
            vgetq_lane_u32 (__cmp_lanes, 2) != 0 && \
            vgetq_lane_u32 (__cmp_lanes, 3) != 0); \
  }))

/* true when every lane of a equals the matching lane of b. */
# define graphene_simd4f_cmp_eq(a,b) \
  _simd4f_all_lanes_set (vceqq_f32 ((a), (b)))

/* true when at least one lane of a differs from the matching lane of b. */
# define graphene_simd4f_cmp_neq(a,b) \
  (__extension__ ({ \
    (bool) !_simd4f_all_lanes_set (vceqq_f32 ((a), (b))); \
  }))

/* true when every lane of a is less than the matching lane of b. */
# define graphene_simd4f_cmp_lt(a,b) \
  _simd4f_all_lanes_set (vcltq_f32 ((a), (b)))

/* true when every lane of a is less than or equal to the matching lane of b. */
# define graphene_simd4f_cmp_le(a,b) \
  _simd4f_all_lanes_set (vcleq_f32 ((a), (b)))

/* true when every lane of a is greater than or equal to the matching lane of b. */
# define graphene_simd4f_cmp_ge(a,b) \
  _simd4f_all_lanes_set (vcgeq_f32 ((a), (b)))

/* true when every lane of a is greater than the matching lane of b. */
# define graphene_simd4f_cmp_gt(a,b) \
  _simd4f_all_lanes_set (vcgtq_f32 ((a), (b)))
1358 
/* Negates every lane of s by XOR-ing each lane with the sign bit.
 * The mask array is declared as uint32_t so that it matches the pointer
 * type expected by vld1q_u32() on every target.
 */
# define graphene_simd4f_neg(s) \
  (__extension__ ({ \
    const uint32_t __umask[4] = { \
      0x80000000, \
      0x80000000, \
      0x80000000, \
      0x80000000 \
    }; \
    const uint32x4_t __mask = vld1q_u32 (__umask); \
    (graphene_simd4f_t) vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 ((s)), __mask)); \
  }))
1370 
1371 #elif defined _MSC_VER /* Visual Studio ARM */
1372 
1373 # define graphene_simd4f_init(x,y,z,w) _simd4f_init(x,y,z,w)
1374 static inline graphene_simd4f_t
_simd4f_init(float x,float y,float z,float w)1375 _simd4f_init (float x, float y, float z, float w)
1376 {
1377   const float32_t __v[4] = { (x), (y), (z), (w) };
1378   return vld1q_f32 (__v);
1379 }
1380 
1381 # define graphene_simd4f_init_zero() vdupq_n_f32 (0.f)
1382 
1383 # define graphene_simd4f_init_4f(v) vld1q_f32 (v)
1384 
1385 # define graphene_simd4f_init_3f(v) graphene_simd4f_init (v[0], v[1], v[2], 0.f)
1386 
1387 # define graphene_simd4f_init_2f(v) _simd4f_init_2f(v)
1388 static inline graphene_simd4f_t
_simd4f_init_2f(const float * v)1389 _simd4f_init_2f (const float *v)
1390 {
1391   const float32_t *__v32 = (const float32_t *) (v);
1392   const graphene_simd2f_t __low = vld1_f32 (__v32);
1393   const float32_t __zero = 0;
1394   const graphene_simd2f_t __high = vld1_dup_f32 (&__zero);
1395   return vcombine_f32 (__low, __high);
1396 }
1397 
1398 # define graphene_simd4f_dup_4f(s,v) vst1q_f32 ((float32_t *) (v), (s))
1399 
1400 # define graphene_simd4f_dup_3f(s,v) _simd4f_dup_3f(s,v)
1401 static inline
_simd4f_dup_3f(const graphene_simd4f_t s,float * v)1402 void _simd4f_dup_3f (const graphene_simd4f_t s,
1403                      float *v)
1404 {
1405   float *__v = (v);
1406   vst1q_lane_f32 (__v++, (s), 0);
1407   vst1q_lane_f32 (__v++, (s), 1);
1408   vst1q_lane_f32 (__v, (s), 2);
1409 }
1410 
1411 # define graphene_simd4f_dup_2f(s,v) vst1_f32 (v, vget_low_f32 (s))
1412 
1413 # define graphene_simd4f_get(s,i) vgetq_lane_f32 ((s), (i))
1414 
1415 # define graphene_simd4f_splat(v) vdupq_n_f32 ((v))
1416 
1417 # define graphene_simd4f_splat_x(s) graphene_simd4f_splat (graphene_simd4f_get_x ((s)))
1418 
1419 # define graphene_simd4f_splat_y(s) graphene_simd4f_splat (graphene_simd4f_get_y ((s)))
1420 
1421 # define graphene_simd4f_splat_z(s) graphene_simd4f_splat (graphene_simd4f_get_z ((s)))
1422 
1423 # define graphene_simd4f_splat_w(s) graphene_simd4f_splat (graphene_simd4f_get_w ((s)))
1424 
1425 # define graphene_simd4f_reciprocal(s) _simd4f_reciprocal(s)
1426 static inline graphene_simd4f_t
_simd4f_reciprocal(const graphene_simd4f_t s)1427 _simd4f_reciprocal (const graphene_simd4f_t s)
1428 {
1429   graphene_simd4f_t __est = vrecpeq_f32 ((s));
1430   __est = vmulq_f32 (vrecpsq_f32 (__est, (s)), __est);
1431   return vmulq_f32 (vrecpsq_f32 (__est, (s)), __est);
1432 }
1433 
1434 # define graphene_simd4f_add(a,b) vaddq_f32 ((a), (b))
1435 
1436 # define graphene_simd4f_sub(a,b) vsubq_f32 ((a), (b))
1437 
1438 # define graphene_simd4f_mul(a,b) vmulq_f32 ((a), (b))
1439 
1440 # define graphene_simd4f_div(a,b) vmulq_f32 (a, graphene_simd4f_reciprocal (b))
1441 
1442 static inline graphene_simd4f_t
_simd4f_rsqrt_iter(const graphene_simd4f_t v,const graphene_simd4f_t estimate)1443 _simd4f_rsqrt_iter (const graphene_simd4f_t v,
1444                     const graphene_simd4f_t estimate)
1445 {
1446   const graphene_simd4f_t __est1 = vmulq_f32 ((estimate), (v));
1447   return vmulq_f32 ((estimate), vrsqrtsq_f32 (__est1, (estimate)));
1448 }
1449 
1450 # define graphene_simd4f_rsqrt(s) _simd4f_rsqrt(s)
1451 static inline graphene_simd4f_t
_simd4f_rsqrt(const graphene_simd4f_t s)1452 _simd4f_rsqrt (const graphene_simd4f_t s)
1453 {
1454   graphene_simd4f_t __estimate = vrsqrteq_f32 ((s));
1455   __estimate = _simd4f_rsqrt_iter ((s), __estimate);
1456   __estimate = _simd4f_rsqrt_iter ((s), __estimate);
1457   return _simd4f_rsqrt_iter ((s), __estimate);
1458 }
1459 
1460 # define graphene_simd4f_sqrt(s) _simd4f_sqrt(s)
1461 static inline graphene_simd4f_t
_simd4f_sqrt(const graphene_simd4f_t s)1462 _simd4f_sqrt (const graphene_simd4f_t s)
1463 {
1464   graphene_simd4f_t __rsq = graphene_simd4f_rsqrt ((s));
1465   graphene_simd4f_t __rrsq = graphene_simd4f_reciprocal (__rsq);
1466   uint32x4_t __tmp = vreinterpretq_u32_f32 ((s)); \
1467   return vreinterpretq_f32_u32 (vandq_u32 (vtstq_u32 (__tmp, __tmp), vreinterpretq_u32_f32 (__rrsq)));
1468 }
1469 
1470 # define graphene_simd4f_cross3(a,b) _simd4f_cross3(a,b)
1471 static inline graphene_simd4f_t
_simd4f_cross3(const graphene_simd4f_t a,const graphene_simd4f_t b)1472 _simd4f_cross3 (const graphene_simd4f_t a,
1473                 const graphene_simd4f_t b)
1474 {
1475   const uint32_t __mask_bits[] = { 0xffffffff, 0xffffffff, 0xffffffff, 0 };
1476   const int32x4_t __mask = vld1q_s32 ((const int32_t *) __mask_bits);
1477   const graphene_simd4f_t __a = (a), __b = (b);
1478   const graphene_simd2f_t __a_low = vget_low_f32 (__a);
1479   const graphene_simd2f_t __b_low = vget_low_f32 (__b);
1480   const graphene_simd4f_t __a_yzx = vcombine_f32 (vext_f32 (__a_low, vget_high_f32 (__a), 1), __a_low);
1481   const graphene_simd4f_t __b_yzx = vcombine_f32 (vext_f32 (__b_low, vget_high_f32 (__b), 1), __b_low);
1482   graphene_simd4f_t __s3 = graphene_simd4f_sub (graphene_simd4f_mul (__b_yzx, __a),
1483                                                 graphene_simd4f_mul (__a_yzx, __b));
1484   graphene_simd2f_t __s3_low = vget_low_f32 (__s3);
1485   __s3 = vcombine_f32 (vext_f32 (__s3_low, vget_high_f32 (__s3), 1), __s3_low);
1486   return vandq_s32 (__s3, __mask);
1487 }
1488 
1489 # define graphene_simd4f_dot3(a,b) graphene_simd4f_splat (graphene_simd4f_dot3_scalar (a, b))
1490 
1491 # define graphene_simd4f_dot3_scalar(a,b) _simd4f_dot3_scalar(a,b)
1492 static inline float
_simd4f_dot3_scalar(const graphene_simd4f_t a,const graphene_simd4f_t b)1493 _simd4f_dot3_scalar (const graphene_simd4f_t a,
1494                      const graphene_simd4f_t b)
1495 {
1496   const graphene_simd4f_t __m = graphene_simd4f_mul (a, b);
1497   const graphene_simd2f_t __s1 = vpadd_f32 (vget_low_f32 (__m), vget_low_f32 (__m));
1498   return vget_lane_f32 (vadd_f32 (__s1, vget_high_f32 (__m)), 0);
1499 }
1500 
1501 # define graphene_simd4f_min(a,b) vminq_f32 ((a), (b))
1502 
1503 # define graphene_simd4f_max(a,b) vmaxq_f32 (a, b)
1504 
1505 # define graphene_simd4f_shuffle_wxyz(v) _simd4f_shuffle_wxyz(v)
1506 static inline graphene_simd4f_t
_simd4f_shuffle_wxyz(const graphene_simd4f_t v)1507 _simd4f_shuffle_wxyz (const graphene_simd4f_t v)
1508 {
1509   graphene_simd4f_union_t __u = { (v) };
1510   return graphene_simd4f_init (__u.f[3], __u.f[0], __u.f[1], __u.f[2]);
1511 }
1512 
1513 # define graphene_simd4f_shuffle_zwxy(v) _simd4f_shuffle_zwxy(v)
1514 static inline graphene_simd4f_t
_simd4f_shuffle_zwxy(const graphene_simd4f_t v)1515 _simd4f_shuffle_zwxy (const graphene_simd4f_t v)
1516 {
1517   graphene_simd4f_union_t __u = { (v) };
1518   return graphene_simd4f_init (__u.f[2], __u.f[3], __u.f[0], __u.f[1]);
1519 }
1520 
1521 # define graphene_simd4f_shuffle_yzwx(v) _simd4f_shuffle_yzwx(v)
1522 static inline graphene_simd4f_t
_simd4f_shuffle_yzwx(const graphene_simd4f_t v)1523 _simd4f_shuffle_yzwx (const graphene_simd4f_t v)
1524 {
1525   graphene_simd4f_union_t __u = { (v) };
1526   return graphene_simd4f_init (__u.f[1], __u.f[2], __u.f[3], __u.f[0]);
1527 }
1528 
1529 # define graphene_simd4f_zero_w(v) _simd4f_zero_w(v)
1530 static inline graphene_simd4f_t
_simd4f_zero_w(const graphene_simd4f_t v)1531 _simd4f_zero_w (const graphene_simd4f_t v)
1532 {
1533   graphene_simd4f_union_t __u = { (v) };
1534   return graphene_simd4f_init (__u.f[0], __u.f[1], __u.f[2], 0.f);
1535 }
1536 
1537 # define graphene_simd4f_zero_zw(v) _simd4f_zero_zw(v)
1538 static inline graphene_simd4f_t
_simd4f_zero_zw(const graphene_simd4f_t v)1539 _simd4f_zero_zw (const graphene_simd4f_t v)
1540 {
1541   graphene_simd4f_union_t __u = { (v) };
1542   return graphene_simd4f_init (__u.f[0], __u.f[1], 0.f, 0.f);
1543 }
1544 
1545 # define graphene_simd4f_merge_w(s,v) _simd4f_merge_w(s,v)
1546 static inline graphene_simd4f_t
_simd4f_merge_w(const graphene_simd4f_t s,float v)1547 _simd4f_merge_w (const graphene_simd4f_t s,
1548                  float                   v)
1549 {
1550   graphene_simd4f_union_t __u = { (s) };
1551   return graphene_simd4f_init (__u.f[0], __u.f[1], __u.f[2], (v));
1552 }
1553 
1554 # define graphene_simd4f_merge_high(a,b) _simd4f_merge_high(a,b)
1555 static inline graphene_simd4f_t
_simd4f_merge_high(const graphene_simd4f_t a,const graphene_simd4f_t b)1556 _simd4f_merge_high (const graphene_simd4f_t a,
1557                     const graphene_simd4f_t b)
1558 {
1559   graphene_simd4f_union_t __u_a = { (a) };
1560   graphene_simd4f_union_t __u_b = { (b) };
1561   return graphene_simd4f_init (__u_a.f[2], __u_a.f[3], __u_b.f[2], __u_b.f[3]);
1562 }
1563 
1564 # define graphene_simd4f_merge_low(a,b) _simd4f_merge_low(a,b)
1565 static inline graphene_simd4f_t
_simd4f_merge_low(const graphene_simd4f_t a,const graphene_simd4f_t b)1566 _simd4f_merge_low (const graphene_simd4f_t a,
1567                    const graphene_simd4f_t b)
1568 {
1569   graphene_simd4f_union_t __u_a = { (a) };
1570   graphene_simd4f_union_t __u_b = { (b) };
1571   return graphene_simd4f_init (__u_a.f[0], __u_a.f[1], __u_b.f[0], __u_b.f[1]);
1572 }
1573 
1574 
1575 # define graphene_simd4f_flip_sign_0101(s) _simd4f_flip_sign_0101(s)
1576 static inline graphene_simd4f_t
_simd4f_flip_sign_0101(const graphene_simd4f_t s)1577 _simd4f_flip_sign_0101 (const graphene_simd4f_t s)
1578 {
1579   const unsigned int __upnpn[4] = {
1580     0x00000000,
1581     0x80000000,
1582     0x00000000,
1583     0x80000000
1584   };
1585   const uint32x4_t __pnpn = vld1q_u32 (__upnpn);
1586   return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 ((s)), __pnpn));
1587 }
1588 
1589 # define graphene_simd4f_flip_sign_1010(s) _simd4f_flip_sign_1010(s)
1590 static inline graphene_simd4f_t
_simd4f_flip_sign_1010(const graphene_simd4f_t s)1591 _simd4f_flip_sign_1010 (const graphene_simd4f_t s)
1592 {
1593   const unsigned int __unpnp[4] = {
1594     0x80000000,
1595     0x00000000,
1596     0x80000000,
1597     0x00000000
1598   };
1599 
1600   const uint32x4_t __npnp = vld1q_u32 (__unpnp);
1601   return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 ((s)), __npnp));
1602 }
1603 
1604 # define graphene_simd4f_cmp_eq(a,b) _simd4f_cmp_eq(a,b)
1605 static inline bool
_simd4f_cmp_eq(const graphene_simd4f_t a,const graphene_simd4f_t b)1606 _simd4f_cmp_eq (const graphene_simd4f_t a,
1607                 const graphene_simd4f_t b)
1608 {
1609   const uint32x4_t __mask = vceqq_f32 ((a), (b));
1610   return  (vgetq_lane_u32 (__mask, 0) != 0 &&
1611            vgetq_lane_u32 (__mask, 1) != 0 &&
1612            vgetq_lane_u32 (__mask, 2) != 0 &&
1613            vgetq_lane_u32 (__mask, 3) != 0);
1614 }
1615 
1616 # define graphene_simd4f_cmp_neq(a,b) _simd4f_cmp_neq(a,b)
1617 static inline bool
_simd4f_cmp_neq(const graphene_simd4f_t a,const graphene_simd4f_t b)1618 _simd4f_cmp_neq (const graphene_simd4f_t a,
1619                  const graphene_simd4f_t b)
1620 {
1621   const uint32x4_t __mask = vceqq_f32 ((a), (b));
1622   return (vgetq_lane_u32 (__mask, 0) == 0 ||
1623           vgetq_lane_u32 (__mask, 1) == 0 ||
1624           vgetq_lane_u32 (__mask, 2) == 0 ||
1625           vgetq_lane_u32 (__mask, 3) == 0);
1626 }
1627 
1628 # define graphene_simd4f_cmp_lt(a,b) _simd4f_cmp_lt(a,b)
1629 static inline bool
_simd4f_cmp_lt(const graphene_simd4f_t a,const graphene_simd4f_t b)1630 _simd4f_cmp_lt (const graphene_simd4f_t a,
1631                 const graphene_simd4f_t b)
1632 {
1633   const uint32x4_t __mask = vcltq_f32 ((a), (b));
1634   return (vgetq_lane_u32 (__mask, 0) != 0 &&
1635           vgetq_lane_u32 (__mask, 1) != 0 &&
1636           vgetq_lane_u32 (__mask, 2) != 0 &&
1637           vgetq_lane_u32 (__mask, 3) != 0);
1638 }
1639 
1640 # define graphene_simd4f_cmp_le(a,b) _simd4f_cmp_le(a,b)
1641 static inline bool
_simd4f_cmp_le(const graphene_simd4f_t a,const graphene_simd4f_t b)1642 _simd4f_cmp_le (const graphene_simd4f_t a,
1643                 const graphene_simd4f_t b)
1644 {
1645   const uint32x4_t __mask = vcleq_f32 ((a), (b));
1646   return (vgetq_lane_u32 (__mask, 0) != 0 &&
1647           vgetq_lane_u32 (__mask, 1) != 0 &&
1648           vgetq_lane_u32 (__mask, 2) != 0 &&
1649           vgetq_lane_u32 (__mask, 3) != 0);
1650 }
1651 
1652 # define graphene_simd4f_cmp_ge(a,b) _simd4f_cmp_ge(a,b)
1653 static inline bool
_simd4f_cmp_ge(const graphene_simd4f_t a,const graphene_simd4f_t b)1654 _simd4f_cmp_ge (const graphene_simd4f_t a,
1655                 const graphene_simd4f_t b)
1656 {
1657   const uint32x4_t __mask = vcgeq_f32 ((a), (b));
1658   return (vgetq_lane_u32 (__mask, 0) != 0 &&
1659           vgetq_lane_u32 (__mask, 1) != 0 &&
1660           vgetq_lane_u32 (__mask, 2) != 0 &&
1661           vgetq_lane_u32 (__mask, 3) != 0);
1662 }
1663 
1664 # define graphene_simd4f_cmp_gt(a,b) _simd4f_cmp_gt(a,b)
1665 static inline bool
_simd4f_cmp_gt(const graphene_simd4f_t a,const graphene_simd4f_t b)1666 _simd4f_cmp_gt (const graphene_simd4f_t a,
1667                 const graphene_simd4f_t b)
1668 {
1669   const uint32x4_t __mask = vcgtq_f32 ((a), (b));
1670   return (vgetq_lane_u32 (__mask, 0) != 0 &&
1671           vgetq_lane_u32 (__mask, 1) != 0 &&
1672           vgetq_lane_u32 (__mask, 2) != 0 &&
1673           vgetq_lane_u32 (__mask, 3) != 0);
1674 }
1675 
1676 # define graphene_simd4f_neg(s) _simd4f_neg(s)
1677 static inline graphene_simd4f_t
_simd4f_neg(const graphene_simd4f_t s)1678 _simd4f_neg (const graphene_simd4f_t s)
1679 {
1680   const unsigned int __umask[4] = {
1681     0x80000000,
1682     0x80000000,
1683     0x80000000,
1684     0x80000000
1685   };
1686   const uint32x4_t __mask = vld1q_u32 (__umask);
1687   return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 ((s)), __mask));
1688 }
1689 
1690 #else /* ARM NEON intrinsics-not GCC or Visual Studio */
1691 
1692 #  error "Need GCC-compatible or Visual Studio compiler for ARM NEON extensions."
1693 
1694 /* Use static inline to inline all these functions */
1695 
1696 # endif /* !__GNUC__ && !_MSC_VER */
1697 
/* Per-component accessors shared by every compiler variant of this
 * branch: each one forwards to graphene_simd4f_get() with a fixed
 * lane index (x = 0, y = 1, z = 2, w = 3).
 */
# define graphene_simd4f_get_x(s)       graphene_simd4f_get ((s), 0)
# define graphene_simd4f_get_y(s)       graphene_simd4f_get ((s), 1)
# define graphene_simd4f_get_z(s)       graphene_simd4f_get ((s), 2)
# define graphene_simd4f_get_w(s)       graphene_simd4f_get ((s), 3)
1703 
1704 #elif defined(__GI_SCANNER__) || defined(GRAPHENE_USE_SCALAR)
1705 
1706 /* Fallback implementation using scalar types */
1707 
/* Each macro below expands to a call to the function of the same name.
 * A function-like macro is not expanded again inside its own
 * replacement list, so the inner identifier refers to the real function
 * rather than recursing. The macros only add argument parenthesisation
 * and, for the pointer-taking entry points, a cast to the float pointer
 * type used in the call.
 */

/* initialisation from scalars and float arrays */
#define graphene_simd4f_init(x,y,z,w)      (graphene_simd4f_init ((x), (y), (z), (w)))
#define graphene_simd4f_init_zero()        (graphene_simd4f_init_zero ())
#define graphene_simd4f_init_4f(v)         (graphene_simd4f_init_4f ((const float *) (v)))
#define graphene_simd4f_init_3f(v)         (graphene_simd4f_init_3f ((const float *) (v)))
#define graphene_simd4f_init_2f(v)         (graphene_simd4f_init_2f ((const float *) (v)))

/* copying components out into float arrays */
#define graphene_simd4f_dup_4f(s,v)        (graphene_simd4f_dup_4f ((s), (float *) (v)))
#define graphene_simd4f_dup_3f(s,v)        (graphene_simd4f_dup_3f ((s), (float *) (v)))
#define graphene_simd4f_dup_2f(s,v)        (graphene_simd4f_dup_2f ((s), (float *) (v)))

/* component access */
#define graphene_simd4f_get(s,i)           (graphene_simd4f_get ((s), (i)))
#define graphene_simd4f_get_x(s)           (graphene_simd4f_get_x ((s)))
#define graphene_simd4f_get_y(s)           (graphene_simd4f_get_y ((s)))
#define graphene_simd4f_get_z(s)           (graphene_simd4f_get_z ((s)))
#define graphene_simd4f_get_w(s)           (graphene_simd4f_get_w ((s)))

/* splatting */
#define graphene_simd4f_splat(v)           (graphene_simd4f_splat ((v)))
#define graphene_simd4f_splat_x(s)         (graphene_simd4f_splat_x ((s)))
#define graphene_simd4f_splat_y(s)         (graphene_simd4f_splat_y ((s)))
#define graphene_simd4f_splat_z(s)         (graphene_simd4f_splat_z ((s)))
#define graphene_simd4f_splat_w(s)         (graphene_simd4f_splat_w ((s)))

/* arithmetic */
#define graphene_simd4f_add(a,b)           (graphene_simd4f_add ((a), (b)))
#define graphene_simd4f_sub(a,b)           (graphene_simd4f_sub ((a), (b)))
#define graphene_simd4f_mul(a,b)           (graphene_simd4f_mul ((a), (b)))
#define graphene_simd4f_div(a,b)           (graphene_simd4f_div ((a), (b)))
#define graphene_simd4f_sqrt(s)            (graphene_simd4f_sqrt ((s)))
#define graphene_simd4f_rsqrt(s)           (graphene_simd4f_rsqrt ((s)))
#define graphene_simd4f_reciprocal(s)      (graphene_simd4f_reciprocal ((s)))
#define graphene_simd4f_cross3(a,b)        (graphene_simd4f_cross3 ((a), (b)))
#define graphene_simd4f_dot3(a,b)          (graphene_simd4f_dot3 ((a), (b)))
#define graphene_simd4f_dot3_scalar(a,b)   (graphene_simd4f_dot3_scalar ((a), (b)))
#define graphene_simd4f_min(a,b)           (graphene_simd4f_min ((a), (b)))
#define graphene_simd4f_max(a,b)           (graphene_simd4f_max ((a), (b)))

/* shuffling, sign flipping, zeroing and merging */
#define graphene_simd4f_shuffle_wxyz(s)    (graphene_simd4f_shuffle_wxyz ((s)))
#define graphene_simd4f_shuffle_zwxy(s)    (graphene_simd4f_shuffle_zwxy ((s)))
#define graphene_simd4f_shuffle_yzwx(s)    (graphene_simd4f_shuffle_yzwx ((s)))
#define graphene_simd4f_flip_sign_0101(s)  (graphene_simd4f_flip_sign_0101 ((s)))
#define graphene_simd4f_flip_sign_1010(s)  (graphene_simd4f_flip_sign_1010 ((s)))
#define graphene_simd4f_zero_w(v)          (graphene_simd4f_zero_w ((v)))
#define graphene_simd4f_zero_zw(v)         (graphene_simd4f_zero_zw ((v)))
#define graphene_simd4f_merge_w(s,v)       (graphene_simd4f_merge_w ((s), (v)))
#define graphene_simd4f_merge_high(a,b)    (graphene_simd4f_merge_high ((a), (b)))
#define graphene_simd4f_merge_low(a,b)     (graphene_simd4f_merge_low ((a), (b)))

/* comparisons and negation */
#define graphene_simd4f_cmp_eq(a,b)        (graphene_simd4f_cmp_eq ((a), (b)))
#define graphene_simd4f_cmp_neq(a,b)       (graphene_simd4f_cmp_neq ((a), (b)))
#define graphene_simd4f_cmp_lt(a,b)        (graphene_simd4f_cmp_lt ((a), (b)))
#define graphene_simd4f_cmp_le(a,b)        (graphene_simd4f_cmp_le ((a), (b)))
#define graphene_simd4f_cmp_ge(a,b)        (graphene_simd4f_cmp_ge ((a), (b)))
#define graphene_simd4f_cmp_gt(a,b)        (graphene_simd4f_cmp_gt ((a), (b)))
#define graphene_simd4f_neg(s)             (graphene_simd4f_neg ((s)))
1802 
1803 #else
1804 # error "Unsupported simd4f implementation."
1805 #endif
1806 
1807 /* Generic operations, inlined */
1808 
1809 /**
1810  * graphene_simd4f_madd:
1811  * @m1: a #graphene_simd4f_t
1812  * @m2: a #graphene_simd4f_t
1813  * @a: a #graphene_simd4f_t
1814  *
1815  * Adds @a to the product of @m1 and @m2.
1816  *
1817  * Returns: the result vector
1818  *
1819  * Since: 1.0
1820  */
1821 static inline graphene_simd4f_t
graphene_simd4f_madd(const graphene_simd4f_t m1,const graphene_simd4f_t m2,const graphene_simd4f_t a)1822 graphene_simd4f_madd (const graphene_simd4f_t m1,
1823                       const graphene_simd4f_t m2,
1824                       const graphene_simd4f_t a)
1825 {
1826   return graphene_simd4f_add (graphene_simd4f_mul (m1, m2), a);
1827 }
1828 
1829 /**
1830  * graphene_simd4f_sum:
1831  * @v: a #graphene_simd4f_t
1832  *
1833  * Sums all components of the given vector.
1834  *
1835  * Returns: a vector with all components set to be the
1836  *   sum of the passed #graphene_simd4f_t
1837  *
1838  * Since: 1.0
1839  */
1840 static inline graphene_simd4f_t
graphene_simd4f_sum(const graphene_simd4f_t v)1841 graphene_simd4f_sum (const graphene_simd4f_t v)
1842 {
1843   const graphene_simd4f_t x = graphene_simd4f_splat_x (v);
1844   const graphene_simd4f_t y = graphene_simd4f_splat_y (v);
1845   const graphene_simd4f_t z = graphene_simd4f_splat_z (v);
1846   const graphene_simd4f_t w = graphene_simd4f_splat_w (v);
1847 
1848   return graphene_simd4f_add (graphene_simd4f_add (x, y),
1849                               graphene_simd4f_add (z, w));
1850 }
1851 
1852 /**
1853  * graphene_simd4f_sum_scalar:
1854  * @v: a #graphene_simd4f_t
1855  *
1856  * Sums all the components of the given vector.
1857  *
1858  * Returns: a scalar value with the sum of the components
1859  *   of the given #graphene_simd4f_t
1860  *
1861  * Since: 1.0
1862  */
1863 static inline float
graphene_simd4f_sum_scalar(const graphene_simd4f_t v)1864 graphene_simd4f_sum_scalar (const graphene_simd4f_t v)
1865 {
1866   return graphene_simd4f_get_x (graphene_simd4f_sum (v));
1867 }
1868 
1869 /**
1870  * graphene_simd4f_dot4:
1871  * @a: a #graphene_simd4f_t
1872  * @b: a #graphene_simd4f_t
1873  *
1874  * Computes the dot product of all the components of the two
1875  * given #graphene_simd4f_t.
1876  *
1877  * Returns: a vector whose components are all set to be the
1878  *   dot product of the components of the two operands
1879  *
1880  * Since: 1.0
1881  */
1882 static inline graphene_simd4f_t
graphene_simd4f_dot4(const graphene_simd4f_t a,const graphene_simd4f_t b)1883 graphene_simd4f_dot4 (const graphene_simd4f_t a,
1884                       const graphene_simd4f_t b)
1885 {
1886   return graphene_simd4f_sum (graphene_simd4f_mul (a, b));
1887 }
1888 
1889 /**
1890  * graphene_simd4f_dot2:
1891  * @a: a #graphene_simd4f_t
1892  * @b: a #graphene_simd4f_t
1893  *
1894  * Computes the dot product of the first two components of the
1895  * two given #graphene_simd4f_t.
1896  *
1897  * Returns: a vector whose components are all set to the
1898  *   dot product of the components of the two operands
1899  *
1900  * Since: 1.0
1901  */
1902 static inline graphene_simd4f_t
graphene_simd4f_dot2(const graphene_simd4f_t a,const graphene_simd4f_t b)1903 graphene_simd4f_dot2 (const graphene_simd4f_t a,
1904                       const graphene_simd4f_t b)
1905 {
1906   const graphene_simd4f_t m = graphene_simd4f_mul (a, b);
1907   const graphene_simd4f_t x = graphene_simd4f_splat_x (m);
1908   const graphene_simd4f_t y = graphene_simd4f_splat_y (m);
1909 
1910   return graphene_simd4f_add (x, y);
1911 }
1912 
1913 /**
1914  * graphene_simd4f_length4:
1915  * @v: a #graphene_simd4f_t
1916  *
1917  * Computes the length of the given #graphene_simd4f_t vector,
1918  * using all four of its components.
1919  *
1920  * Returns: the length vector
1921  *
1922  * Since: 1.0
1923  */
1924 static inline graphene_simd4f_t
graphene_simd4f_length4(const graphene_simd4f_t v)1925 graphene_simd4f_length4 (const graphene_simd4f_t v)
1926 {
1927   return graphene_simd4f_sqrt (graphene_simd4f_dot4 (v, v));
1928 }
1929 
1930 /**
1931  * graphene_simd4f_length3:
1932  * @v: a #graphene_simd4f_t
1933  *
1934  * Computes the length of the given #graphene_simd4f_t vector,
1935  * using the first three of its components.
1936  *
1937  * Returns: the length vector
1938  *
1939  * Since: 1.0
1940  */
1941 static inline graphene_simd4f_t
graphene_simd4f_length3(const graphene_simd4f_t v)1942 graphene_simd4f_length3 (const graphene_simd4f_t v)
1943 {
1944   return graphene_simd4f_sqrt (graphene_simd4f_dot3 (v, v));
1945 }
1946 
1947 /**
1948  * graphene_simd4f_length2:
1949  * @v: a #graphene_simd4f_t
1950  *
1951  * Computes the length of the given #graphene_simd4f_t vector,
1952  * using the first two of its components.
1953  *
1954  * Returns: the length vector
1955  *
1956  * Since: 1.0
1957  */
1958 static inline graphene_simd4f_t
graphene_simd4f_length2(const graphene_simd4f_t v)1959 graphene_simd4f_length2 (const graphene_simd4f_t v)
1960 {
1961   return graphene_simd4f_sqrt (graphene_simd4f_dot2 (v, v));
1962 }
1963 
1964 /**
1965  * graphene_simd4f_normalize4:
1966  * @v: a #graphene_simd4f_t
1967  *
1968  * Computes the normalization of the given #graphene_simd4f_t vector,
1969  * using all of its components.
1970  *
1971  * Returns: the normalized vector
1972  *
1973  * Since: 1.0
1974  */
1975 static inline graphene_simd4f_t
graphene_simd4f_normalize4(const graphene_simd4f_t v)1976 graphene_simd4f_normalize4 (const graphene_simd4f_t v)
1977 {
1978   graphene_simd4f_t invlen = graphene_simd4f_rsqrt (graphene_simd4f_dot4 (v, v));
1979   return graphene_simd4f_mul (v, invlen);
1980 }
1981 
1982 /**
1983  * graphene_simd4f_normalize3:
1984  * @v: a #graphene_simd4f_t
1985  *
1986  * Computes the normalization of the given #graphene_simd4f_t vector,
1987  * using the first three of its components.
1988  *
1989  * Returns: the normalized vector
1990  *
1991  * Since: 1.0
1992  */
1993 static inline graphene_simd4f_t
graphene_simd4f_normalize3(const graphene_simd4f_t v)1994 graphene_simd4f_normalize3 (const graphene_simd4f_t v)
1995 {
1996   graphene_simd4f_t invlen = graphene_simd4f_rsqrt (graphene_simd4f_dot3 (v, v));
1997   return graphene_simd4f_mul (v, invlen);
1998 }
1999 
2000 /**
2001  * graphene_simd4f_normalize2:
2002  * @v: a #graphene_simd4f_t
2003  *
2004  * Computes the normalization of the given #graphene_simd4f_t vector,
2005  * using the first two of its components.
2006  *
2007  * Returns: the normalized vector
2008  *
2009  * Since: 1.0
2010  */
2011 static inline graphene_simd4f_t
graphene_simd4f_normalize2(const graphene_simd4f_t v)2012 graphene_simd4f_normalize2 (const graphene_simd4f_t v)
2013 {
2014   graphene_simd4f_t invlen = graphene_simd4f_rsqrt (graphene_simd4f_dot2 (v, v));
2015   return graphene_simd4f_mul (v, invlen);
2016 }
2017 
2018 /**
2019  * graphene_simd4f_is_zero4:
2020  * @v: a #graphene_simd4f_t
2021  *
2022  * Checks whether the given #graphene_simd4f_t has all its components
2023  * set to 0.
2024  *
2025  * Returns: `true` if all the vector components are zero
2026  *
2027  * Since: 1.0
2028  */
2029 static inline bool
graphene_simd4f_is_zero4(const graphene_simd4f_t v)2030 graphene_simd4f_is_zero4 (const graphene_simd4f_t v)
2031 {
2032   graphene_simd4f_t zero = graphene_simd4f_init_zero ();
2033   return graphene_simd4f_cmp_eq (v, zero);
2034 }
2035 
2036 /**
2037  * graphene_simd4f_is_zero3:
2038  * @v: a #graphene_simd4f_t
2039  *
2040  * Checks whether the given #graphene_simd4f_t has the first three of
2041  * its components set to 0.
2042  *
2043  * Returns: `true` if the vector's components are zero
2044  *
2045  * Since: 1.0
2046  */
2047 static inline bool
graphene_simd4f_is_zero3(const graphene_simd4f_t v)2048 graphene_simd4f_is_zero3 (const graphene_simd4f_t v)
2049 {
2050   return fabsf (graphene_simd4f_get_x (v)) <= FLT_EPSILON &&
2051          fabsf (graphene_simd4f_get_y (v)) <= FLT_EPSILON &&
2052          fabsf (graphene_simd4f_get_z (v)) <= FLT_EPSILON;
2053 }
2054 
2055 /**
2056  * graphene_simd4f_is_zero2:
2057  * @v: a #graphene_simd4f_t
2058  *
2059  * Checks whether the given #graphene_simd4f_t has the first two of
2060  * its components set to 0.
2061  *
2062  * Returns: `true` if the vector's components are zero
2063  *
2064  * Since: 1.0
2065  */
2066 static inline bool
graphene_simd4f_is_zero2(const graphene_simd4f_t v)2067 graphene_simd4f_is_zero2 (const graphene_simd4f_t v)
2068 {
2069   return fabsf (graphene_simd4f_get_x (v)) <= FLT_EPSILON &&
2070          fabsf (graphene_simd4f_get_y (v)) <= FLT_EPSILON;
2071 }
2072 
2073 /**
2074  * graphene_simd4f_interpolate:
2075  * @a: a #graphene_simd4f_t
2076  * @b: a #graphene_simd4f_t
2077  * @f: the interpolation factor
2078  *
2079  * Linearly interpolates all components of the two given
2080  * #graphene_simd4f_t vectors using the given factor @f.
2081  *
2082  * Returns: the intrerpolated vector
2083  *
2084  * Since: 1.0
2085  */
2086 static inline graphene_simd4f_t
graphene_simd4f_interpolate(const graphene_simd4f_t a,const graphene_simd4f_t b,float f)2087 graphene_simd4f_interpolate (const graphene_simd4f_t a,
2088                              const graphene_simd4f_t b,
2089                              float                   f)
2090 {
2091   const graphene_simd4f_t one_minus_f = graphene_simd4f_sub (graphene_simd4f_splat (1.f),
2092                                                              graphene_simd4f_splat (f));
2093 
2094   return graphene_simd4f_add (graphene_simd4f_mul (one_minus_f, a),
2095                               graphene_simd4f_mul (graphene_simd4f_splat (f), b));
2096 }
2097 
2098 /**
2099  * graphene_simd4f_clamp:
2100  * @v: a #graphene_simd4f_t
2101  * @min: the lower boundary
2102  * @max: the upper boundary
2103  *
2104  * Ensures that all components of the vector @v are within
2105  * the components of the @lower and @upper boundaries.
2106  *
2107  * Returns: the clamped vector
2108  *
2109  * Since: 1.2
2110  */
2111 static inline graphene_simd4f_t
graphene_simd4f_clamp(const graphene_simd4f_t v,const graphene_simd4f_t min,const graphene_simd4f_t max)2112 graphene_simd4f_clamp (const graphene_simd4f_t v,
2113                        const graphene_simd4f_t min,
2114                        const graphene_simd4f_t max)
2115 {
2116   const graphene_simd4f_t tmp = graphene_simd4f_max (min, v);
2117 
2118   return graphene_simd4f_min (tmp, max);
2119 }
2120 
2121 /**
2122  * graphene_simd4f_clamp_scalar:
2123  * @v: a #graphene_simd4f_t
2124  * @min: the lower boundary
2125  * @max: the upper boundary
2126  *
2127  * Ensures that all components of the vector @v are within
2128  * the @lower and @upper boundary scalar values.
2129  *
2130  * Returns: the clamped vector
2131  *
2132  * Since: 1.2
2133  */
2134 static inline graphene_simd4f_t
graphene_simd4f_clamp_scalar(const graphene_simd4f_t v,float min,float max)2135 graphene_simd4f_clamp_scalar (const graphene_simd4f_t v,
2136                               float                   min,
2137                               float                   max)
2138 {
2139   return graphene_simd4f_clamp (v,
2140                                 graphene_simd4f_splat (min),
2141                                 graphene_simd4f_splat (max));
2142 }
2143 
2144 /**
2145  * graphene_simd4f_min_val:
2146  * @v: a #graphene_simd4f_t
2147  *
2148  * Computes the minimum value of all the channels in the given vector.
2149  *
2150  * Returns: a vector whose components are all set to the
2151  *   minimum value in the original vector
2152  *
2153  * Since: 1.4
2154  */
2155 static inline graphene_simd4f_t
graphene_simd4f_min_val(const graphene_simd4f_t v)2156 graphene_simd4f_min_val (const graphene_simd4f_t v)
2157 {
2158   graphene_simd4f_t s = v;
2159 
2160   s = graphene_simd4f_min (s, graphene_simd4f_shuffle_wxyz (s));
2161   s = graphene_simd4f_min (s, graphene_simd4f_shuffle_zwxy (s));
2162 
2163   return s;
2164 }
2165 
2166 /**
2167  * graphene_simd4f_max_val:
2168  * @v: a #graphene_simd4f_t
2169  *
2170  * Computes the maximum value of all the channels in the given vector.
2171  *
2172  * Returns: a vector whose components are all set to the
2173  *   maximum value in the original vector
2174  *
2175  * Since: 1.4
2176  */
2177 static inline graphene_simd4f_t
graphene_simd4f_max_val(const graphene_simd4f_t v)2178 graphene_simd4f_max_val (const graphene_simd4f_t v)
2179 {
2180   graphene_simd4f_t s = v;
2181 
2182   s = graphene_simd4f_max (s, graphene_simd4f_shuffle_wxyz (s));
2183   s = graphene_simd4f_max (s, graphene_simd4f_shuffle_zwxy (s));
2184 
2185   return s;
2186 }
2187 
2188 GRAPHENE_END_DECLS
2189