1 #include <immintrin.h>
2 #include <stdint.h>
3 
/*
 * 128-bit (SSE2) configuration: each vector holds two doubles or four floats.
 *
 * This section is skipped when ENABLE_AVX is defined. The ENABLE_AVX section
 * of this file defines the same macros, typedefs and helper functions, and
 * __SSE2__ is also defined in an AVX build, so compiling both sections would
 * yield conflicting definitions.
 */
#if defined(__SSE2__) && !defined(ENABLE_AVX)
#define VECTLENDP 2 /* number of doubles in a vdouble */
#define VECTLENSP 4 /* number of floats in a vfloat */

typedef __m128d vdouble;
typedef __m128i vint;

typedef __m128 vfloat;
typedef __m128i vint2;
typedef __m128i vmask;

/* Load VECTLENDP doubles starting at p; p need not be 16-byte aligned. */
static vdouble vloadu(const double *p)
{
    return _mm_loadu_pd(p);
}

/* Store the VECTLENDP doubles of v starting at p; p need not be aligned. */
static void vstoreu(double *p, vdouble v)
{
    _mm_storeu_pd(p, v);
}

/* Load VECTLENSP floats starting at p; p need not be 16-byte aligned. */
static vfloat vloaduf(const float *p)
{
    return _mm_loadu_ps(p);
}

/* Store the VECTLENSP floats of v starting at p; p need not be aligned. */
static void vstoreuf(float *p, vfloat v)
{
    _mm_storeu_ps(p, v);
}

/* Load four 32-bit integers starting at p; p need not be 16-byte aligned. */
static vint2 vloadui2(const int32_t *p)
{
    /* vint2 is __m128i in this configuration, so no conversion is needed. */
    return _mm_loadu_si128((const __m128i *)p);
}

/* Store the four 32-bit integers of v starting at p; p need not be aligned. */
static void vstoreui2(int32_t *p, vint2 v)
{
    _mm_storeu_si128((__m128i *)p, v);
}
#endif
42 
/*
 * 256-bit (AVX) configuration, selected by defining ENABLE_AVX: each vector
 * holds four doubles or eight floats.
 */
#ifdef ENABLE_AVX
#define VECTLENDP 4 /* number of doubles in a vdouble */
#define VECTLENSP 8 /* number of floats in a vfloat */

typedef __m256d vdouble;
typedef __m128i vint;


typedef __m256 vfloat;

/* Eight 32-bit integers kept as two 128-bit halves (x first, then y). */
typedef struct {
    vint x, y;
} vint2;

/* Load VECTLENDP doubles starting at p; p need not be 32-byte aligned. */
static vdouble vloadu(const double *p)
{
    return _mm256_loadu_pd(p);
}

/* Store the VECTLENDP doubles of v starting at p; p need not be aligned. */
static void vstoreu(double *p, vdouble v)
{
    /* No `return`: ISO C forbids returning an expression from a void function. */
    _mm256_storeu_pd(p, v);
}

/* Load VECTLENSP floats starting at p; p need not be 32-byte aligned. */
static vfloat vloaduf(const float *p)
{
    return _mm256_loadu_ps(p);
}

/* Store the VECTLENSP floats of v starting at p; p need not be aligned. */
static void vstoreuf(float *p, vfloat v)
{
    /* No `return`: ISO C forbids returning an expression from a void function. */
    _mm256_storeu_ps(p, v);
}

/* Load eight 32-bit integers: p[0..3] into .x and p[4..7] into .y. */
static vint2 vloadui2(const int32_t *p)
{
    vint2 r;
    r.x = _mm_loadu_si128((const __m128i *) p     );
    r.y = _mm_loadu_si128((const __m128i *)(p + 4));
    return r;
}

/* Store eight 32-bit integers: v.x to p[0..3] and v.y to p[4..7]. */
static void vstoreui2(int32_t *p, vint2 v)
{
    _mm_storeu_si128((__m128i *) p     , v.x);
    _mm_storeu_si128((__m128i *)(p + 4), v.y);
}
#endif
88 
89 typedef struct {
90     vdouble x, y;
91 } vdouble2;
92 
93 vdouble xldexp(vdouble x, vint q);
94 vint xilogb(vdouble d);
95 
96 vdouble xsin(vdouble d);
97 vdouble xcos(vdouble d);
98 vdouble2 xsincos(vdouble d);
99 vdouble xtan(vdouble d);
100 vdouble xasin(vdouble s);
101 vdouble xacos(vdouble s);
102 vdouble xatan(vdouble s);
103 vdouble xatan2(vdouble y, vdouble x);
104 vdouble xlog(vdouble d);
105 vdouble xexp(vdouble d);
106 vdouble xpow(vdouble x, vdouble y);
107 
108 vdouble xsinh(vdouble d);
109 vdouble xcosh(vdouble d);
110 vdouble xtanh(vdouble d);
111 vdouble xasinh(vdouble s);
112 vdouble xacosh(vdouble s);
113 vdouble xatanh(vdouble s);
114 
115 vdouble xcbrt(vdouble d);
116 
117 vdouble xexp2(vdouble a);
118 vdouble xexp10(vdouble a);
119 vdouble xexpm1(vdouble a);
120 vdouble xlog10(vdouble a);
121 vdouble xlog1p(vdouble a);
122 
123 //
124 
125 typedef struct {
126     vfloat x, y;
127 } vfloat2;
128 
129 vfloat xsinf(vfloat d);
130 vfloat xcosf(vfloat d);
131 vfloat2 xsincosf(vfloat d);
132 vfloat xtanf(vfloat d);
133 vfloat xasinf(vfloat s);
134 vfloat xacosf(vfloat s);
135 vfloat xatanf(vfloat s);
136 vfloat xatan2f(vfloat y, vfloat x);
137 vfloat xlogf(vfloat d);
138 vfloat xlogf0(vfloat d);
139 vfloat xexpf(vfloat d);
140 vfloat xcbrtf(vfloat s);
141