1 #include <immintrin.h>
2 #include <stdint.h>
3
4 #ifdef __SSE2__
5 #define VECTLENDP 2
6 #define VECTLENSP 4
7
8 typedef __m128d vdouble;
9 typedef __m128i vint;
10
11 typedef __m128 vfloat;
12 typedef __m128i vint2;
13 typedef __m128i vmask;
14
vloadu(double * p)15 static vdouble vloadu(double *p)
16 {
17 return _mm_loadu_pd(p);
18 }
/*
 * Store the two doubles held in v to p[0] and p[1].
 * Uses the unaligned-store intrinsic, so p needs no particular alignment.
 */
static inline void vstoreu(double *p, vdouble v)
{
    _mm_storeu_pd(p, v);
}
23
vloaduf(float * p)24 static vfloat vloaduf(float *p)
25 {
26 return _mm_loadu_ps(p);
27 }
/*
 * Store the four floats held in v to p[0..3].
 * Uses the unaligned-store intrinsic, so p needs no particular alignment.
 */
static inline void vstoreuf(float *p, vfloat v)
{
    _mm_storeu_ps(p, v);
}
32
vloadui2(int32_t * p)33 static vint2 vloadui2(int32_t *p)
34 {
35 return (vint2)_mm_loadu_si128((__m128i *)p);
36 }
/*
 * Store the four 32-bit integers held in v to p[0..3].
 * Uses the unaligned-store intrinsic, so p needs no particular alignment.
 * vint2 is __m128i in this configuration, so v is passed through as-is.
 */
static inline void vstoreui2(int32_t *p, vint2 v)
{
    _mm_storeu_si128((__m128i *)p, v);
}
41 #endif
42
43 #ifdef ENABLE_AVX
44 #define VECTLENDP 4
45 #define VECTLENSP 8
46
47 typedef __m256d vdouble;
48 typedef __m128i vint;
49
50
51 typedef __m256 vfloat;
52 typedef struct {
53 vint x, y;
54 } vint2;
55
vloadu(double * p)56 static vdouble vloadu(double *p)
57 {
58 return _mm256_loadu_pd(p);
59 }
/*
 * Store the four doubles held in v to p[0..3].
 * Uses the unaligned-store intrinsic, so p needs no particular alignment.
 */
static inline void vstoreu(double *p, vdouble v)
{
    /* The intrinsic returns void; it is called as a plain statement because
     * C does not allow "return expr;" inside a void function. */
    _mm256_storeu_pd(p, v);
}
64
vloaduf(float * p)65 static vfloat vloaduf(float *p)
66 {
67 return _mm256_loadu_ps(p);
68 }
/*
 * Store the eight floats held in v to p[0..7].
 * Uses the unaligned-store intrinsic, so p needs no particular alignment.
 */
static inline void vstoreuf(float *p, vfloat v)
{
    /* The intrinsic returns void; it is called as a plain statement because
     * C does not allow "return expr;" inside a void function. */
    _mm256_storeu_ps(p, v);
}
73
vloadui2(int32_t * p)74 static vint2 vloadui2(int32_t *p)
75 {
76 vint2 r;
77 r.x = _mm_loadu_si128((__m128i *) p );
78 r.y = _mm_loadu_si128((__m128i *)(p + 4));
79 return r;
80 }
81
/*
 * Store the eight 32-bit integers held in v to p[0..7].
 * The data is written as two unaligned 128-bit stores: the x half goes to
 * p[0..3] and the y half to p[4..7].
 */
static inline void vstoreui2(int32_t *p, vint2 v)
{
    _mm_storeu_si128((__m128i *)p, v.x);
    _mm_storeu_si128((__m128i *)(p + 4), v.y); /* 4 int32_t per __m128i */
}
87 #endif
88
89 typedef struct {
90 vdouble x, y;
91 } vdouble2;
92
93 vdouble xldexp(vdouble x, vint q);
94 vint xilogb(vdouble d);
95
96 vdouble xsin(vdouble d);
97 vdouble xcos(vdouble d);
98 vdouble2 xsincos(vdouble d);
99 vdouble xtan(vdouble d);
100 vdouble xasin(vdouble s);
101 vdouble xacos(vdouble s);
102 vdouble xatan(vdouble s);
103 vdouble xatan2(vdouble y, vdouble x);
104 vdouble xlog(vdouble d);
105 vdouble xexp(vdouble d);
106 vdouble xpow(vdouble x, vdouble y);
107
108 vdouble xsinh(vdouble d);
109 vdouble xcosh(vdouble d);
110 vdouble xtanh(vdouble d);
111 vdouble xasinh(vdouble s);
112 vdouble xacosh(vdouble s);
113 vdouble xatanh(vdouble s);
114
115 vdouble xcbrt(vdouble d);
116
117 vdouble xexp2(vdouble a);
118 vdouble xexp10(vdouble a);
119 vdouble xexpm1(vdouble a);
120 vdouble xlog10(vdouble a);
121 vdouble xlog1p(vdouble a);
122
123 //
124
125 typedef struct {
126 vfloat x, y;
127 } vfloat2;
128
129 vfloat xsinf(vfloat d);
130 vfloat xcosf(vfloat d);
131 vfloat2 xsincosf(vfloat d);
132 vfloat xtanf(vfloat d);
133 vfloat xasinf(vfloat s);
134 vfloat xacosf(vfloat s);
135 vfloat xatanf(vfloat s);
136 vfloat xatan2f(vfloat y, vfloat x);
137 vfloat xlogf(vfloat d);
138 vfloat xlogf0(vfloat d);
139 vfloat xexpf(vfloat d);
140 vfloat xcbrtf(vfloat s);
141