1 /* -*- c++ -*- */
2 /*
3 * Copyright 2016 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
23 /*!
24 * \page volk_32fc_x2_divide_32fc
25 *
26 * \b Overview
27 *
28 * Divide first vector of complexes element-wise by second.
29 *
30 * <b>Dispatcher Prototype</b>
31 * \code
32 * void volk_32fc_x2_divide_32fc(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector, const lv_32fc_t* denumeratorVector, unsigned int num_points);
33 * \endcode
34 *
35 * \b Inputs
36 * \li numeratorVector: The numerator complex values.
37 * \li numeratorVector: The denumerator complex values.
38 * \li num_points: The number of data points.
39 *
40 * \b Outputs
41 * \li outputVector: The output vector complex floats.
42 *
43 * \b Example
44 * divide a complex vector by itself, demonstrating the result should be pretty close to 1+0j.
45 *
46 * \code
47 * int N = 10;
48 * unsigned int alignment = volk_get_alignment();
49 * lv_32fc_t* input_vector = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
50 * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
51 *
52 * float delta = 2.f*M_PI / (float)N;
53 * for(unsigned int ii = 0; ii < N; ++ii){
54 * float real_1 = std::cos(0.3f * (float)ii);
55 * float imag_1 = std::sin(0.3f * (float)ii);
56 * input_vector[ii] = lv_cmake(real_1, imag_1);
57 * }
58 *
59 * volk_32fc_x2_divide_32fc(out, input_vector, input_vector, N);
60 *
61 * for(unsigned int ii = 0; ii < N; ++ii){
62 * printf("%1.4f%+1.4fj,", lv_creal(out[ii]), lv_cimag(out[ii]));
63 * }
64 * printf("\n");
65 *
66 * volk_free(input_vector);
67 * volk_free(out);
68 * \endcode
69 */
70
71 #ifndef INCLUDED_volk_32fc_x2_divide_32fc_u_H
72 #define INCLUDED_volk_32fc_x2_divide_32fc_u_H
73
74 #include <inttypes.h>
75 #include <volk/volk_complex.h>
76 #include <float.h>
77
78 #ifdef LV_HAVE_SSE3
79 #include <pmmintrin.h>
80 #include <volk/volk_sse3_intrinsics.h>
81
82 static inline void
volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t * cVector,const lv_32fc_t * numeratorVector,const lv_32fc_t * denumeratorVector,unsigned int num_points)83 volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
84 const lv_32fc_t* denumeratorVector, unsigned int num_points)
85 {
86 /*
87 * we'll do the "classical"
88 * a a b*
89 * --- = -------
90 * b |b|^2
91 * */
92 unsigned int number = 0;
93 const unsigned int quarterPoints = num_points / 4;
94
95 __m128 num01, num23, den01, den23, norm, result;
96 lv_32fc_t* c = cVector;
97 const lv_32fc_t* a = numeratorVector;
98 const lv_32fc_t* b = denumeratorVector;
99
100 for(; number < quarterPoints; number++){
101 num01 = _mm_loadu_ps((float*) a); // first pair
102 den01 = _mm_loadu_ps((float*) b); // first pair
103 num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
104 a += 2;
105 b += 2;
106
107 num23 = _mm_loadu_ps((float*) a); // second pair
108 den23 = _mm_loadu_ps((float*) b); // second pair
109 num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
110 a += 2;
111 b += 2;
112
113 norm = _mm_magnitudesquared_ps_sse3(den01, den23);
114 den01 = _mm_unpacklo_ps(norm,norm);
115 den23 = _mm_unpackhi_ps(norm,norm);
116
117 result = _mm_div_ps(num01, den01);
118 _mm_storeu_ps((float*) c, result); // Store the results back into the C container
119 c += 2;
120 result = _mm_div_ps(num23, den23);
121 _mm_storeu_ps((float*) c, result); // Store the results back into the C container
122 c += 2;
123 }
124
125 number *= 4;
126 for(;number < num_points; number++){
127 *c = (*a) / (*b);
128 a++; b++; c++;
129 }
130 }
131 #endif /* LV_HAVE_SSE3 */
132
133
134 #ifdef LV_HAVE_AVX
135 #include <immintrin.h>
136 #include <volk/volk_avx_intrinsics.h>
137
138 static inline void
volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t * cVector,const lv_32fc_t * numeratorVector,const lv_32fc_t * denumeratorVector,unsigned int num_points)139 volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
140 const lv_32fc_t* denumeratorVector, unsigned int num_points)
141 {
142 /*
143 * we'll do the "classical"
144 * a a b*
145 * --- = -------
146 * b |b|^2
147 * */
148 unsigned int number = 0;
149 const unsigned int quarterPoints = num_points / 4;
150
151 __m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div;
152 lv_32fc_t* c = cVector;
153 const lv_32fc_t* a = numeratorVector;
154 const lv_32fc_t* b = denumeratorVector;
155
156 for(; number < quarterPoints; number++){
157 num = _mm256_loadu_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
158 denum = _mm256_loadu_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
159 mul_conj = _mm256_complexconjugatemul_ps(num, denum);
160 sq = _mm256_mul_ps(denum, denum); // Square the values
161 mag_sq_un = _mm256_hadd_ps(sq,sq); // obtain the actual squared magnitude, although out of order
162 mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // I order them
163 // best guide I found on using these functions: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
164 div = _mm256_div_ps(mul_conj,mag_sq);
165
166 _mm256_storeu_ps((float*) c, div); // Store the results back into the C container
167
168 a += 4;
169 b += 4;
170 c += 4;
171 }
172
173 number = quarterPoints * 4;
174
175 for(; number < num_points; number++){
176 *c++ = (*a++) / (*b++);
177 }
178
179 }
180 #endif /* LV_HAVE_AVX */
181
182
183 #ifdef LV_HAVE_GENERIC
184
185 static inline void
volk_32fc_x2_divide_32fc_generic(lv_32fc_t * cVector,const lv_32fc_t * aVector,const lv_32fc_t * bVector,unsigned int num_points)186 volk_32fc_x2_divide_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
187 const lv_32fc_t* bVector, unsigned int num_points)
188 {
189 lv_32fc_t* cPtr = cVector;
190 const lv_32fc_t* aPtr = aVector;
191 const lv_32fc_t* bPtr= bVector;
192 unsigned int number = 0;
193
194 for(number = 0; number < num_points; number++){
195 *cPtr++ = (*aPtr++) / (*bPtr++);
196 }
197 }
198 #endif /* LV_HAVE_GENERIC */
199
200
201
202 #endif /* INCLUDED_volk_32fc_x2_divide_32fc_u_H */
203
204
205 #ifndef INCLUDED_volk_32fc_x2_divide_32fc_a_H
206 #define INCLUDED_volk_32fc_x2_divide_32fc_a_H
207
208 #include <inttypes.h>
209 #include <stdio.h>
210 #include <volk/volk_complex.h>
211 #include <float.h>
212
213 #ifdef LV_HAVE_SSE3
214 #include <pmmintrin.h>
215 #include <volk/volk_sse3_intrinsics.h>
216
217 static inline void
volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t * cVector,const lv_32fc_t * numeratorVector,const lv_32fc_t * denumeratorVector,unsigned int num_points)218 volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
219 const lv_32fc_t* denumeratorVector, unsigned int num_points)
220 {
221 /*
222 * we'll do the "classical"
223 * a a b*
224 * --- = -------
225 * b |b|^2
226 * */
227 unsigned int number = 0;
228 const unsigned int quarterPoints = num_points / 4;
229
230 __m128 num01, num23, den01, den23, norm, result;
231 lv_32fc_t* c = cVector;
232 const lv_32fc_t* a = numeratorVector;
233 const lv_32fc_t* b = denumeratorVector;
234
235 for(; number < quarterPoints; number++){
236 num01 = _mm_load_ps((float*) a); // first pair
237 den01 = _mm_load_ps((float*) b); // first pair
238 num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
239 a += 2;
240 b += 2;
241
242 num23 = _mm_load_ps((float*) a); // second pair
243 den23 = _mm_load_ps((float*) b); // second pair
244 num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
245 a += 2;
246 b += 2;
247
248 norm = _mm_magnitudesquared_ps_sse3(den01, den23);
249
250 den01 = _mm_unpacklo_ps(norm,norm); // select the lower floats twice
251 den23 = _mm_unpackhi_ps(norm,norm); // select the upper floats twice
252
253 result = _mm_div_ps(num01, den01);
254 _mm_store_ps((float*) c, result); // Store the results back into the C container
255 c += 2;
256 result = _mm_div_ps(num23, den23);
257 _mm_store_ps((float*) c, result); // Store the results back into the C container
258 c += 2;
259 }
260
261 number *= 4;
262 for(;number < num_points; number++){
263 *c = (*a) / (*b);
264 a++; b++; c++;
265 }
266 }
267 #endif /* LV_HAVE_SSE */
268
269 #ifdef LV_HAVE_AVX
270 #include <immintrin.h>
271 #include <volk/volk_avx_intrinsics.h>
272
273 static inline void
volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t * cVector,const lv_32fc_t * numeratorVector,const lv_32fc_t * denumeratorVector,unsigned int num_points)274 volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
275 const lv_32fc_t* denumeratorVector, unsigned int num_points)
276 {
277 /*
278 * we'll do the "classical"
279 * a a b*
280 * --- = -------
281 * b |b|^2
282 * */
283 unsigned int number = 0;
284 const unsigned int quarterPoints = num_points / 4;
285
286 __m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div;
287 lv_32fc_t* c = cVector;
288 const lv_32fc_t* a = numeratorVector;
289 const lv_32fc_t* b = denumeratorVector;
290
291 for(; number < quarterPoints; number++){
292 num = _mm256_load_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
293 denum = _mm256_load_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
294 mul_conj = _mm256_complexconjugatemul_ps(num, denum);
295 sq = _mm256_mul_ps(denum, denum); // Square the values
296 mag_sq_un = _mm256_hadd_ps(sq,sq); // obtain the actual squared magnitude, although out of order
297 mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // I order them
298 // best guide I found on using these functions: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
299 div = _mm256_div_ps(mul_conj,mag_sq);
300
301 _mm256_store_ps((float*) c, div); // Store the results back into the C container
302
303 a += 4;
304 b += 4;
305 c += 4;
306 }
307
308 number = quarterPoints * 4;
309
310 for(; number < num_points; number++){
311 *c++ = (*a++) / (*b++);
312 }
313
314
315 }
316 #endif /* LV_HAVE_AVX */
317
318 #ifdef LV_HAVE_NEON
319 #include <arm_neon.h>
320
321 static inline void
volk_32fc_x2_divide_32fc_neon(lv_32fc_t * cVector,const lv_32fc_t * aVector,const lv_32fc_t * bVector,unsigned int num_points)322 volk_32fc_x2_divide_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector,
323 const lv_32fc_t* bVector, unsigned int num_points)
324 {
325 lv_32fc_t* cPtr = cVector;
326 const lv_32fc_t* aPtr = aVector;
327 const lv_32fc_t* bPtr = bVector;
328
329 float32x4x2_t aVal, bVal, cVal;
330 float32x4_t bAbs, bAbsInv;
331
332 const unsigned int quarterPoints = num_points / 4;
333 unsigned int number = 0;
334 for(; number < quarterPoints; number++){
335 aVal = vld2q_f32((const float*)(aPtr));
336 bVal = vld2q_f32((const float*)(bPtr));
337 aPtr += 4;
338 bPtr += 4;
339 __VOLK_PREFETCH(aPtr+4);
340 __VOLK_PREFETCH(bPtr+4);
341
342 bAbs = vmulq_f32( bVal.val[0], bVal.val[0]);
343 bAbs = vmlaq_f32(bAbs, bVal.val[1], bVal.val[1]);
344
345 bAbsInv = vrecpeq_f32(bAbs);
346 bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
347 bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
348
349 cVal.val[0] = vmulq_f32( aVal.val[0], bVal.val[0]);
350 cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]);
351 cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv);
352
353 cVal.val[1] = vmulq_f32( aVal.val[1], bVal.val[0]);
354 cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]);
355 cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv);
356
357 vst2q_f32((float*)(cPtr), cVal);
358 cPtr += 4;
359 }
360
361 for(number = quarterPoints * 4; number < num_points; number++){
362 *cPtr++ = (*aPtr++) / (*bPtr++);
363 }
364 }
365 #endif /* LV_HAVE_NEON */
366
367
368 #ifdef LV_HAVE_GENERIC
369
370 static inline void
volk_32fc_x2_divide_32fc_a_generic(lv_32fc_t * cVector,const lv_32fc_t * aVector,const lv_32fc_t * bVector,unsigned int num_points)371 volk_32fc_x2_divide_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
372 const lv_32fc_t* bVector, unsigned int num_points)
373 {
374 lv_32fc_t* cPtr = cVector;
375 const lv_32fc_t* aPtr = aVector;
376 const lv_32fc_t* bPtr= bVector;
377 unsigned int number = 0;
378
379 for(number = 0; number < num_points; number++){
380 *cPtr++ = (*aPtr++) / (*bPtr++);
381 }
382 }
383 #endif /* LV_HAVE_GENERIC */
384
385
386 #endif /* INCLUDED_volk_32fc_x2_divide_32fc_a_H */
387