#ifndef CLUNK_SSE_FFT_CONTEXT_H__
#define CLUNK_SSE_FFT_CONTEXT_H__

#ifndef USE_SIMD
#	error enable SSE support by defining the USE_SIMD macro
#endif

#ifndef _USE_MATH_DEFINES
#define _USE_MATH_DEFINES /* expose M_PI from <math.h> on MSVC */
#endif
#include <math.h>
#include <sys/types.h>
#include <xmmintrin.h>
#include <complex>
#include "clunk_assert.h"

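// Single-precision SSE specialization of clunk's FFT: data is processed as
// split re/im arrays of __m128 vectors, four float lanes per vector. This
// header is not self-contained; it relies on the scalar danielson_lanczos<>
// recursion and the fft_context<> primary template, both declared in clunk's
// fft_context.h, which is expected to include this file.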
namespace clunk {

// Minimal RAII helper for SIMD-aligned heap memory; implemented elsewhere in
// the library.
struct aligned_allocator {
	static void * allocate(size_t size, size_t alignment);
	static void deallocate(void *ptr);
};

template<typename T, int N, int ALIGNMENT = sizeof(T)>
class aligned_array {
	T * data;

	// owns a raw allocation, so copying would double-free; keep it non-copyable
	aligned_array(const aligned_array &);
	aligned_array& operator=(const aligned_array &);

public:
	aligned_array() : data((T*)aligned_allocator::allocate(sizeof(T) * N, ALIGNMENT)) {}
	operator T*() { return data; }
	operator const T*() const { return data; }
	~aligned_array() { aligned_allocator::deallocate(data); }
};

// Recursive Danielson-Lanczos step over arrays of __m128 vectors, templated
// on the vector count N at this level; each level halves N until the base
// case below unpacks single vectors.
template<int N, typename T>
struct sse_danielson_lanczos {
	typedef __m128 sse_type;
	enum { SSE_DIV = sizeof(sse_type) / sizeof(T) };

	typedef sse_danielson_lanczos<N / 2, T> next_type;
	next_type next;

	// twiddle factors for this stage, one complex value per scalar lane,
	// stored as split re/im vectors
	aligned_array<sse_type, N / 2> angle_re;
	aligned_array<sse_type, N / 2> angle_im;

	sse_danielson_lanczos() {
		// w steps by exp(i*a) per scalar lane, a = -2*pi / (N * SSE_DIV);
		// wp = exp(i*a) - 1, so w += w * wp multiplies w by exp(i*a)
		// (the classic numerically stable trigonometric recurrence)
		T a = (T)(-2 * M_PI / N / SSE_DIV);
		T wtemp = sin(a / 2);

		std::complex<T> wp(-2 * wtemp * wtemp, sin(a)), w(1, 0);
		for (unsigned i = 0; i < N / 2; ++i) {
			T w_re_buf[SSE_DIV], w_im_buf[SSE_DIV];
			for (unsigned k = 0; k < SSE_DIV; ++k) {
				w_re_buf[k] = w.real();
				w_im_buf[k] = w.imag();
				w += w * wp;
			}

			angle_re[i] = _mm_loadu_ps(w_re_buf);
			angle_im[i] = _mm_loadu_ps(w_im_buf);
		}
	}

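	// One merge step: the two half-size transforms in [0, N/2) and [N/2, N)
	// are combined with the butterfly (a, b) -> (a + w*b, a - w*b), four
	// lanes at a time. SIGN is +1 for the forward transform and -1 for the
	// inverse, which simply conjugates the twiddle factors.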
	template<int SIGN>
	void apply(sse_type * data_re, sse_type * data_im) {
		next.template apply<SIGN>(data_re, data_im);
		next.template apply<SIGN>(data_re + N / 2, data_im + N / 2);

		for (unsigned i = 0; i < N / 2; ++i) {
			int j = i + N / 2;

			sse_type w_re = angle_re[i], w_im = _mm_mul_ps(_mm_set_ps1((float)SIGN), angle_im[i]);

			// lane-wise complex multiply: temp = data[j] * w
			sse_type temp_re = _mm_sub_ps(_mm_mul_ps(data_re[j], w_re), _mm_mul_ps(data_im[j], w_im)),
					 temp_im = _mm_add_ps(_mm_mul_ps(data_im[j], w_re), _mm_mul_ps(data_re[j], w_im));

			data_re[j] = _mm_sub_ps(data_re[i], temp_re);
			data_im[j] = _mm_sub_ps(data_im[i], temp_im);
			data_re[i] = _mm_add_ps(data_re[i], temp_re);
			data_im[i] = _mm_add_ps(data_im[i], temp_im);
		}
	}
};

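// Recursion base case: a single __m128 still packs SSE_DIV complex values,
// so the remaining SSE_DIV-point transform is unpacked and handed to the
// scalar danielson_lanczos<> recursion.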
template<typename T>
struct sse_danielson_lanczos<1, T> {
	typedef __m128 sse_type;
	enum { SSE_DIV = sizeof(sse_type) / sizeof(T) };

	typedef danielson_lanczos<SSE_DIV, T> next_type;

	template<int SIGN>
	static void apply(sse_type * data_re, sse_type * data_im) {
		// unpack the split re/im vectors into an interleaved complex buffer
		T re[SSE_DIV], im[SSE_DIV];
		_mm_storeu_ps(re, *data_re);
		_mm_storeu_ps(im, *data_im);

		std::complex<T> data[SSE_DIV];
		for(unsigned i = 0; i < SSE_DIV; ++i) {
			data[i] = std::complex<T>(re[i], im[i]);
		}

		next_type::template apply<SIGN>(data);

		// pack the transformed values back into vector registers
		for(unsigned i = 0; i < SSE_DIV; ++i) {
			re[i] = data[i].real();
			im[i] = data[i].imag();
		}

		*data_re = _mm_loadu_ps(re);
		*data_im = _mm_loadu_ps(im);
	}
};

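// Single-precision specialization of fft_context<>: an in-place complex FFT
// over N = 2^BITS points, vectorized four lanes at a time.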
template<int BITS>
class fft_context<BITS, float> {
public:
	typedef __m128 sse_type;
	enum { N = 1 << BITS };
	enum { SSE_DIV = sizeof(sse_type) / sizeof(float) };
	enum { SSE_N = (N - 1) / SSE_DIV + 1 };	// vectors per component array: ceil(N / SSE_DIV)

	//clunk_static_assert(SSE_DIV == 4);

private:
	aligned_array<sse_type, SSE_N> data_re;
	aligned_array<sse_type, SSE_N> data_im;

public:

	typedef std::complex<float> value_type;
	value_type data[N];

	inline void fft() {
		scramble(data);
		load();
		next.template apply<1>(data_re, data_im);
		save();
	}

	inline void ifft() {
		scramble(data);
		load();
		next.template apply<-1>(data_re, data_im);

		// normalize by 1/N so that ifft(fft(x)) == x
		sse_type n = _mm_set_ps1((float)N);
		for(unsigned i = 0; i < SSE_N; ++i) {
			data_re[i] = _mm_div_ps(data_re[i], n);
			data_im[i] = _mm_div_ps(data_im[i], n);
		}
		save();
	}

private:
	sse_danielson_lanczos<SSE_N, float> next;

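	// In-place bit-reversal permutation: swaps data[i] with its bit-reversed
	// index so the recursive butterflies can read contiguous halves; the
	// i > j guard ensures each pair is swapped exactly once.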
	static void scramble(std::complex<float> * data) {
		int j = 0;
		for(int i = 0; i < N; ++i) {
			if (i > j) {
				std::swap(data[i], data[j]);
			}
			// advance j to the bit-reversal of i + 1
			int m = N / 2;
			while(j >= m && m >= 2) {
				j -= m;
				m >>= 1;
			}
			j += m;
		}
	}

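	// load()/save() convert between the interleaved complex<float> array the
	// caller sees and the split re/im vector arrays the butterflies work on;
	// lanes past N (possible only when BITS < 2) are zero-padded.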
	void load() {
		for(int i = 0; i < SSE_N; ++i) {
			float buf_re[SSE_DIV], buf_im[SSE_DIV];
			for(int j = 0; j < SSE_DIV; ++j) {
				int idx = i * SSE_DIV + j;
				buf_re[j] = (idx < N)? data[idx].real(): 0;
				buf_im[j] = (idx < N)? data[idx].imag(): 0;
			}
			data_re[i] = _mm_loadu_ps(buf_re);
			data_im[i] = _mm_loadu_ps(buf_im);
		}
	}

	void save() {
		for(int i = 0; i < SSE_N; ++i) {
			float buf_re[SSE_DIV], buf_im[SSE_DIV];
			_mm_storeu_ps(buf_re, data_re[i]);
			_mm_storeu_ps(buf_im, data_im[i]);

			for(int j = 0; j < SSE_DIV; ++j) {
				int idx = i * SSE_DIV + j;
				if (idx >= N)
					break;
				data[idx] = value_type(buf_re[j], buf_im[j]);
			}
		}
	}

};

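// Usage sketch (hypothetical caller; the size is illustrative only):
//
//   clunk::fft_context<9, float> ctx;   // 512-point complex FFT
//   // fill ctx.data[0..511] with samples, then:
//   ctx.fft();                          // forward transform, in place
//   ctx.ifft();                         // inverse, already scaled by 1/N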
}

#endif