1 #ifndef CLUNK_SSE_FFT_CONTEXT_H__ 2 #define CLUNK_SSE_FFT_CONTEXT_H__ 3 4 #ifndef USE_SIMD 5 # error turn on SSE support with USE_SIMD macro 6 #endif 7 8 #ifndef _USE_MATH_DEFINES 9 #define _USE_MATH_DEFINES 10 #endif 11 #include <math.h> 12 #include <sys/types.h> 13 #include <xmmintrin.h> 14 #include <complex> 15 #include "clunk_assert.h" 16 17 namespace clunk { 18 19 struct aligned_allocator { 20 static void * allocate(size_t size, size_t alignment); 21 static void deallocate(void *ptr); 22 }; 23 24 template<typename T, int N, int ALIGNMENT = sizeof(T)> 25 class aligned_array { 26 T * data; 27 public: aligned_array()28 aligned_array() : data((T*)aligned_allocator::allocate(sizeof(T) * N, ALIGNMENT)) {} 29 operator T*() { return data; } 30 operator const T*() const { return data; } ~aligned_array()31 ~aligned_array() { aligned_allocator::deallocate(data); } 32 }; 33 34 template<int N, typename T> 35 struct sse_danielson_lanczos { 36 typedef __m128 sse_type; 37 enum { SSE_DIV = sizeof(sse_type) / sizeof(T) }; 38 39 typedef sse_danielson_lanczos<N / 2, T> next_type; 40 next_type next; 41 42 aligned_array<sse_type, N / 2> angle_re; 43 aligned_array<sse_type, N / 2> angle_im; 44 sse_danielson_lanczossse_danielson_lanczos45 sse_danielson_lanczos() { 46 T a = (T)(-2 * M_PI / N / SSE_DIV); 47 T wtemp = sin(a / 2); 48 49 std::complex<T> wp (-2 * wtemp * wtemp, sin(a)), w(1, 0); 50 for (unsigned i = 0; i < N / 2 ; ++i) { 51 T w_re_buf[SSE_DIV], w_im_buf[SSE_DIV]; 52 for (unsigned k = 0; k < SSE_DIV; ++k) { 53 w_re_buf[k] = w.real(); 54 w_im_buf[k] = w.imag(); 55 w += w * wp; 56 } 57 58 angle_re[i] = _mm_loadu_ps(w_re_buf); 59 angle_im[i] = _mm_loadu_ps(w_im_buf); 60 } 61 } 62 63 template<int SIGN> applysse_danielson_lanczos64 void apply(sse_type * data_re, sse_type * data_im) { 65 next.template apply<SIGN>(data_re, data_im); 66 next.template apply<SIGN>(data_re + N / 2, data_im + N / 2); 67 68 for (unsigned i = 0; i < N / 2 ; ++i) { 69 int j = i + N / 2; 70 71 sse_type w_re = angle_re[i], w_im = _mm_mul_ps(_mm_set_ps1(SIGN), angle_im[i]); 72 73 sse_type temp_re = _mm_sub_ps(_mm_mul_ps(data_re[j], w_re), _mm_mul_ps(data_im[j], w_im)), 74 temp_im = _mm_add_ps(_mm_mul_ps(data_im[j], w_re), _mm_mul_ps(data_re[j], w_im)); 75 76 data_re[j] = _mm_sub_ps(data_re[i], temp_re); 77 data_im[j] = _mm_sub_ps(data_im[i], temp_im); 78 data_re[i] = _mm_add_ps(data_re[i], temp_re); 79 data_im[i] = _mm_add_ps(data_im[i], temp_im); 80 } 81 } 82 }; 83 84 template<typename T> 85 struct sse_danielson_lanczos<1, T> { 86 typedef __m128 sse_type; 87 enum { SSE_DIV = sizeof(sse_type) / sizeof(T) }; 88 89 typedef danielson_lanczos<SSE_DIV, T> next_type; 90 91 template<int SIGN> 92 static void apply(sse_type * data_re, sse_type * data_im) { 93 T re[SSE_DIV], im[SSE_DIV]; 94 _mm_storeu_ps(re, *data_re); 95 _mm_storeu_ps(im, *data_im); 96 97 std::complex<T> data[SSE_DIV]; 98 for(unsigned i = 0; i < SSE_DIV; ++i) { 99 data[i] = std::complex<T>(re[i], im[i]); 100 } 101 102 next_type::template apply<SIGN>(data); 103 104 for(unsigned i = 0; i < SSE_DIV; ++i) { 105 re[i] = data[i].real(); 106 im[i] = data[i].imag(); 107 } 108 109 *data_re = _mm_loadu_ps(re); 110 *data_im = _mm_loadu_ps(im); 111 } 112 }; 113 114 template<int BITS> 115 class fft_context<BITS, float> { 116 public: 117 typedef __m128 sse_type; 118 enum { N = 1 << BITS }; 119 enum { SSE_DIV = sizeof(sse_type) / sizeof(float) }; 120 enum { SSE_N = (N - 1) / SSE_DIV + 1 }; 121 122 //clunk_static_assert(SSE_DIV == 4); 123 124 private: 125 aligned_array<sse_type, SSE_N> data_re; 126 aligned_array<sse_type, SSE_N> data_im; 127 128 public: 129 130 typedef std::complex<float> value_type; 131 value_type data[N]; 132 133 inline void fft() { 134 scramble(data); 135 load(); 136 next.template apply<1>(data_re, data_im); 137 save(); 138 } 139 140 inline void ifft() { 141 scramble(data); 142 load(); 143 next.template apply<-1>(data_re, data_im); 144 145 sse_type n = _mm_set_ps1(N); 146 for(unsigned i = 0; i < SSE_N; ++i) { 147 data_re[i] = _mm_div_ps(data_re[i], n); 148 data_im[i] = _mm_div_ps(data_im[i], n); 149 } 150 save(); 151 } 152 153 private: 154 sse_danielson_lanczos<SSE_N, float> next; 155 156 static void scramble(std::complex<float> * data) { 157 int j = 0; 158 for(int i = 0; i < N; ++i) { 159 if (i > j) { 160 std::swap(data[i], data[j]); 161 } 162 int m = N / 2; 163 while(j >= m && m >= 2) { 164 j -= m; 165 m >>= 1; 166 } 167 j += m; 168 } 169 } 170 171 void load() { 172 for(int i = 0; i < SSE_N; ++i) { 173 float buf_re[SSE_DIV], buf_im[SSE_DIV]; 174 for(int j = 0; j < SSE_DIV; ++j) { 175 int idx = i * SSE_DIV + j; 176 buf_re[j] = (idx < N)? data[idx].real(): 0; 177 buf_im[j] = (idx < N)? data[idx].imag(): 0; 178 } 179 data_re[i] = _mm_loadu_ps(buf_re); 180 data_im[i] = _mm_loadu_ps(buf_im); 181 } 182 } 183 184 void save() { 185 for(int i = 0; i < SSE_N; ++i) { 186 float buf_re[SSE_DIV], buf_im[SSE_DIV]; 187 _mm_storeu_ps(buf_re, data_re[i]); 188 _mm_storeu_ps(buf_im, data_im[i]); 189 190 for(int j = 0; j < SSE_DIV; ++j) { 191 int idx = i * SSE_DIV + j; 192 if (idx >= N) 193 break; 194 data[idx] = value_type(buf_re[j], buf_im[j]); 195 } 196 } 197 } 198 199 }; 200 201 } 202 203 #endif 204